-rw-r--r--.get_maintainer.ignore1
-rw-r--r--.mailmap5
-rw-r--r--CREDITS7
-rw-r--r--Documentation/ABI/stable/sysfs-block2
-rw-r--r--Documentation/ABI/testing/sysfs-bus-event_source-devices-vpa-dtl25
-rw-r--r--Documentation/ABI/testing/sysfs-devices-system-cpu1
-rw-r--r--Documentation/admin-guide/blockdev/zoned_loop.rst2
-rw-r--r--Documentation/admin-guide/cgroup-v2.rst37
-rw-r--r--Documentation/admin-guide/hw-vuln/attack_vector_controls.rst8
-rw-r--r--Documentation/admin-guide/hw-vuln/index.rst1
-rw-r--r--Documentation/admin-guide/hw-vuln/vmscape.rst110
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt38
-rw-r--r--Documentation/admin-guide/laptops/lg-laptop.rst4
-rw-r--r--Documentation/admin-guide/perf/dwc_pcie_pmu.rst4
-rw-r--r--Documentation/admin-guide/perf/fujitsu_uncore_pmu.rst110
-rw-r--r--Documentation/admin-guide/perf/hisi-pmu.rst49
-rw-r--r--Documentation/admin-guide/perf/index.rst1
-rw-r--r--Documentation/admin-guide/xfs.rst69
-rw-r--r--Documentation/arch/arm64/booting.rst11
-rw-r--r--Documentation/arch/arm64/elf_hwcaps.rst4
-rw-r--r--Documentation/arch/arm64/silicon-errata.rst2
-rw-r--r--Documentation/arch/arm64/sme.rst14
-rw-r--r--Documentation/arch/powerpc/index.rst1
-rw-r--r--Documentation/arch/powerpc/vpa-dtl.rst156
-rw-r--r--Documentation/arch/riscv/hwprobe.rst9
-rw-r--r--Documentation/arch/x86/topology.rst191
-rw-r--r--Documentation/core-api/symbol-namespaces.rst11
-rw-r--r--Documentation/devicetree/bindings/arm/cpus.yaml17
-rw-r--r--Documentation/devicetree/bindings/display/msm/qcom,mdp5.yaml1
-rw-r--r--Documentation/devicetree/bindings/dma/qcom,bam-dma.yaml4
-rw-r--r--Documentation/devicetree/bindings/i2c/spacemit,k1-i2c.yaml3
-rw-r--r--Documentation/devicetree/bindings/memory-controllers/xlnx,versal-net-ddrmc5.yaml41
-rw-r--r--Documentation/devicetree/bindings/net/thead,th1520-gmac.yaml6
-rw-r--r--Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml1
-rw-r--r--Documentation/devicetree/bindings/phy/marvell,comphy-cp110.yaml29
-rw-r--r--Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml4
-rw-r--r--Documentation/devicetree/bindings/regulator/infineon,ir38060.yaml2
-rw-r--r--Documentation/devicetree/bindings/riscv/extensions.yaml6
-rw-r--r--Documentation/devicetree/bindings/serial/8250.yaml56
-rw-r--r--Documentation/devicetree/bindings/serial/brcm,bcm7271-uart.yaml2
-rw-r--r--Documentation/devicetree/bindings/spi/spi-fsl-lpspi.yaml5
-rw-r--r--Documentation/devicetree/bindings/vendor-prefixes.yaml2
-rw-r--r--Documentation/filesystems/bcachefs/CodingStyle.rst186
-rw-r--r--Documentation/filesystems/bcachefs/SubmittingPatches.rst105
-rw-r--r--Documentation/filesystems/bcachefs/casefolding.rst108
-rw-r--r--Documentation/filesystems/bcachefs/errorcodes.rst30
-rw-r--r--Documentation/filesystems/bcachefs/future/idle_work.rst78
-rw-r--r--Documentation/filesystems/bcachefs/index.rst38
-rw-r--r--Documentation/filesystems/index.rst1
-rw-r--r--Documentation/filesystems/porting.rst16
-rw-r--r--Documentation/filesystems/proc.rst8
-rw-r--r--Documentation/filesystems/resctrl.rst325
-rw-r--r--Documentation/filesystems/vfs.rst31
-rw-r--r--Documentation/kbuild/kconfig-language.rst32
-rw-r--r--Documentation/netlink/specs/conntrack.yaml9
-rw-r--r--Documentation/netlink/specs/mptcp_pm.yaml6
-rw-r--r--Documentation/networking/can.rst2
-rw-r--r--Documentation/networking/ip-sysctl.rst2
-rw-r--r--Documentation/networking/mptcp-sysctl.rst2
-rw-r--r--Documentation/networking/mptcp.rst8
-rw-r--r--Documentation/networking/napi.rst5
-rw-r--r--Documentation/process/security-bugs.rst25
-rw-r--r--Documentation/sound/alsa-configuration.rst29
-rw-r--r--Documentation/staging/crc32.rst4
-rw-r--r--Documentation/userspace-api/ioctl/ioctl-number.rst2
-rw-r--r--Documentation/userspace-api/iommufd.rst4
-rw-r--r--Kbuild13
-rw-r--r--MAINTAINERS179
-rw-r--r--Makefile8
-rw-r--r--arch/Kconfig78
-rw-r--r--arch/alpha/include/asm/bitops.h14
-rw-r--r--arch/alpha/kernel/asm-offsets.c1
-rw-r--r--arch/alpha/kernel/process.c2
-rw-r--r--arch/arc/kernel/asm-offsets.c1
-rw-r--r--arch/arc/kernel/process.c2
-rw-r--r--arch/arm/Kconfig20
-rw-r--r--arch/arm/boot/dts/allwinner/sun4i-a10-olinuxino-lime.dts2
-rw-r--r--arch/arm/boot/dts/allwinner/sun8i-q8-common.dtsi2
-rw-r--r--arch/arm/boot/dts/allwinner/sun8i-r40.dtsi2
-rw-r--r--arch/arm/boot/dts/allwinner/sun8i-v3s-netcube-kumquat.dts2
-rw-r--r--arch/arm/boot/dts/intel/socfpga/socfpga_cyclone5_sodia.dts6
-rw-r--r--arch/arm/boot/dts/marvell/armada-370-db.dts2
-rw-r--r--arch/arm/boot/dts/marvell/kirkwood-openrd-client.dts2
-rw-r--r--arch/arm/boot/dts/microchip/at91-sama7d65_curiosity.dts2
-rw-r--r--arch/arm/boot/dts/rockchip/rk3128-xpi-3128.dts2
-rw-r--r--arch/arm/boot/dts/rockchip/rv1109-relfor-saib.dts6
-rw-r--r--arch/arm/configs/exynos_defconfig1
-rw-r--r--arch/arm/configs/milbeaut_m10v_defconfig1
-rw-r--r--arch/arm/configs/multi_v7_defconfig1
-rw-r--r--arch/arm/configs/omap2plus_defconfig1
-rw-r--r--arch/arm/crypto/Kconfig13
-rw-r--r--arch/arm/crypto/Makefile2
-rw-r--r--arch/arm/crypto/curve25519-glue.c137
-rw-r--r--arch/arm/include/asm/stacktrace.h3
-rw-r--r--arch/arm/kernel/asm-offsets.c2
-rw-r--r--arch/arm/kernel/hw_breakpoint.c2
-rw-r--r--arch/arm/kernel/module.c2
-rw-r--r--arch/arm/kernel/process.c2
-rw-r--r--arch/arm/mach-at91/Kconfig4
-rw-r--r--arch/arm/mach-imx/Kconfig2
-rw-r--r--arch/arm/mm/Makefile2
-rw-r--r--arch/arm/mm/cache-fa.S2
-rw-r--r--arch/arm/mm/cache-v4.S2
-rw-r--r--arch/arm/mm/cache-v4wb.S4
-rw-r--r--arch/arm/mm/cache-v4wt.S2
-rw-r--r--arch/arm/mm/cache-v6.S2
-rw-r--r--arch/arm/mm/cache-v7.S2
-rw-r--r--arch/arm/mm/cache-v7m.S2
-rw-r--r--arch/arm/mm/proc-arm1020.S2
-rw-r--r--arch/arm/mm/proc-arm1020e.S2
-rw-r--r--arch/arm/mm/proc-arm1022.S2
-rw-r--r--arch/arm/mm/proc-arm1026.S2
-rw-r--r--arch/arm/mm/proc-arm920.S2
-rw-r--r--arch/arm/mm/proc-arm922.S2
-rw-r--r--arch/arm/mm/proc-arm925.S2
-rw-r--r--arch/arm/mm/proc-arm926.S2
-rw-r--r--arch/arm/mm/proc-arm940.S2
-rw-r--r--arch/arm/mm/proc-arm946.S2
-rw-r--r--arch/arm/mm/proc-feroceon.S2
-rw-r--r--arch/arm/mm/proc-mohawk.S2
-rw-r--r--arch/arm/mm/proc-xsc3.S2
-rw-r--r--arch/arm/mm/tlb-v4.S2
-rw-r--r--arch/arm/probes/uprobes/core.c2
-rw-r--r--arch/arm64/Kconfig53
-rw-r--r--arch/arm64/boot/dts/axiado/ax3000-evk.dts3
-rw-r--r--arch/arm64/boot/dts/freescale/imx8mp-data-modul-edm-sbc.dts1
-rw-r--r--arch/arm64/boot/dts/freescale/imx8mp-dhcom-som.dtsi1
-rw-r--r--arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql-mba8mp-ras314.dts13
-rw-r--r--arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql-mba8mpxl.dts13
-rw-r--r--arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql.dtsi31
-rw-r--r--arch/arm64/boot/dts/freescale/imx8mp.dtsi4
-rw-r--r--arch/arm64/boot/dts/freescale/imx95-19x19-evk.dts10
-rw-r--r--arch/arm64/boot/dts/freescale/imx95.dtsi2
-rw-r--r--arch/arm64/boot/dts/marvell/armada-8040-mcbin.dtsi2
-rw-r--r--arch/arm64/boot/dts/marvell/cn9130-cf.dtsi7
-rw-r--r--arch/arm64/boot/dts/marvell/cn9131-cf-solidwan.dts6
-rw-r--r--arch/arm64/boot/dts/marvell/cn9132-clearfog.dts22
-rw-r--r--arch/arm64/boot/dts/marvell/cn9132-sr-cex7.dtsi8
-rw-r--r--arch/arm64/boot/dts/rockchip/px30-pp1516.dtsi8
-rw-r--r--arch/arm64/boot/dts/rockchip/px30-ringneck-haikou-video-demo.dtso6
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3308-sakurapi-rk3308b.dts2
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3368-lba3368.dts2
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3399-pinebook-pro.dts1
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3399-pinephone-pro.dts1
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3399-puma-haikou-video-demo.dtso6
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi4
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts5
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3582-radxa-e52c.dts1
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3588-nanopc-t6.dtsi1
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3588-orangepi-5-plus.dts2
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3588-orangepi-5.dtsi2
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3588-rock-5t.dts35
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3588j.dtsi4
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3588s-orangepi-5.dtsi3
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3588s-roc-pc.dts4
-rw-r--r--arch/arm64/include/asm/cpufeature.h2
-rw-r--r--arch/arm64/include/asm/cputype.h8
-rw-r--r--arch/arm64/include/asm/daifflags.h2
-rw-r--r--arch/arm64/include/asm/el2_setup.h28
-rw-r--r--arch/arm64/include/asm/entry-common.h57
-rw-r--r--arch/arm64/include/asm/exception.h1
-rw-r--r--arch/arm64/include/asm/gcs.h91
-rw-r--r--arch/arm64/include/asm/hwcap.h1
-rw-r--r--arch/arm64/include/asm/io.h6
-rw-r--r--arch/arm64/include/asm/kvm_host.h112
-rw-r--r--arch/arm64/include/asm/kvm_mmu.h1
-rw-r--r--arch/arm64/include/asm/kvm_ras.h25
-rw-r--r--arch/arm64/include/asm/mmu.h10
-rw-r--r--arch/arm64/include/asm/module.h1
-rw-r--r--arch/arm64/include/asm/module.lds.h1
-rw-r--r--arch/arm64/include/asm/pgtable.h5
-rw-r--r--arch/arm64/include/asm/preempt.h2
-rw-r--r--arch/arm64/include/asm/ptdump.h2
-rw-r--r--arch/arm64/include/asm/ptrace.h13
-rw-r--r--arch/arm64/include/asm/rsi.h2
-rw-r--r--arch/arm64/include/asm/setup.h4
-rw-r--r--arch/arm64/include/asm/sysreg.h14
-rw-r--r--arch/arm64/include/asm/uaccess.h40
-rw-r--r--arch/arm64/include/asm/vmalloc.h9
-rw-r--r--arch/arm64/include/asm/xen/events.h2
-rw-r--r--arch/arm64/include/uapi/asm/bitsperlong.h5
-rw-r--r--arch/arm64/include/uapi/asm/hwcap.h1
-rw-r--r--arch/arm64/kernel/acpi.c12
-rw-r--r--arch/arm64/kernel/asm-offsets.c1
-rw-r--r--arch/arm64/kernel/cpu_errata.c2
-rw-r--r--arch/arm64/kernel/cpufeature.c44
-rw-r--r--arch/arm64/kernel/cpuinfo.c1
-rw-r--r--arch/arm64/kernel/debug-monitors.c4
-rw-r--r--arch/arm64/kernel/entry-common.c423
-rw-r--r--arch/arm64/kernel/fpsimd.c5
-rw-r--r--arch/arm64/kernel/ftrace.c13
-rw-r--r--arch/arm64/kernel/machine_kexec_file.c2
-rw-r--r--arch/arm64/kernel/module-plts.c12
-rw-r--r--arch/arm64/kernel/module.c11
-rw-r--r--arch/arm64/kernel/pi/map_kernel.c47
-rw-r--r--arch/arm64/kernel/pi/map_range.c20
-rw-r--r--arch/arm64/kernel/pi/pi.h9
-rw-r--r--arch/arm64/kernel/probes/decode-insn.c7
-rw-r--r--arch/arm64/kernel/probes/simulate-insn.c50
-rw-r--r--arch/arm64/kernel/probes/simulate-insn.h3
-rw-r--r--arch/arm64/kernel/probes/uprobes.c33
-rw-r--r--arch/arm64/kernel/process.c2
-rw-r--r--arch/arm64/kernel/proton-pack.c1
-rw-r--r--arch/arm64/kernel/rsi.c26
-rw-r--r--arch/arm64/kernel/sdei.c2
-rw-r--r--arch/arm64/kernel/setup.c4
-rw-r--r--arch/arm64/kernel/signal.c3
-rw-r--r--arch/arm64/kernel/syscall.c2
-rw-r--r--arch/arm64/kernel/traps.c4
-rw-r--r--arch/arm64/kernel/vdso32/Makefile13
-rw-r--r--arch/arm64/kvm/arm.c12
-rw-r--r--arch/arm64/kvm/at.c6
-rw-r--r--arch/arm64/kvm/debug.c13
-rw-r--r--arch/arm64/kvm/emulate-nested.c2
-rw-r--r--arch/arm64/kvm/handle_exit.c2
-rw-r--r--arch/arm64/kvm/hyp/exception.c20
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/switch.h5
-rw-r--r--arch/arm64/kvm/hyp/nvhe/list_debug.c2
-rw-r--r--arch/arm64/kvm/hyp/nvhe/switch.c6
-rw-r--r--arch/arm64/kvm/hyp/nvhe/sys_regs.c5
-rw-r--r--arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c2
-rw-r--r--arch/arm64/kvm/hyp/vhe/switch.c5
-rw-r--r--arch/arm64/kvm/mmu.c38
-rw-r--r--arch/arm64/kvm/nested.c11
-rw-r--r--arch/arm64/kvm/sys_regs.c419
-rw-r--r--arch/arm64/kvm/vgic/vgic-debug.c2
-rw-r--r--arch/arm64/kvm/vgic/vgic-init.c6
-rw-r--r--arch/arm64/kvm/vgic/vgic-its.c15
-rw-r--r--arch/arm64/kvm/vgic/vgic-mmio-v3.c8
-rw-r--r--arch/arm64/kvm/vgic/vgic-mmio.c2
-rw-r--r--arch/arm64/kvm/vgic/vgic-v4.c2
-rw-r--r--arch/arm64/kvm/vgic/vgic.c80
-rw-r--r--arch/arm64/kvm/vgic/vgic.h18
-rw-r--r--arch/arm64/mm/init.c8
-rw-r--r--arch/arm64/mm/mmu.c499
-rw-r--r--arch/arm64/mm/pageattr.c125
-rw-r--r--arch/arm64/mm/proc.S27
-rw-r--r--arch/arm64/mm/ptdump.c11
-rw-r--r--arch/arm64/net/bpf_jit_comp.c2
-rw-r--r--arch/arm64/tools/cpucaps1
-rwxr-xr-xarch/arm64/tools/gen-sysreg.awk20
-rw-r--r--arch/arm64/tools/sysreg83
-rw-r--r--arch/csky/include/asm/bitops.h8
-rw-r--r--arch/csky/kernel/asm-offsets.c1
-rw-r--r--arch/csky/kernel/process.c2
-rw-r--r--arch/hexagon/include/asm/bitops.h10
-rw-r--r--arch/hexagon/kernel/asm-offsets.c1
-rw-r--r--arch/hexagon/kernel/process.c2
-rw-r--r--arch/loongarch/Kconfig32
-rw-r--r--arch/loongarch/Makefile11
-rw-r--r--arch/loongarch/include/asm/acenv.h7
-rw-r--r--arch/loongarch/include/asm/kvm_mmu.h20
-rw-r--r--arch/loongarch/include/asm/stackframe.h2
-rw-r--r--arch/loongarch/include/asm/thread_info.h76
-rw-r--r--arch/loongarch/include/uapi/asm/setup.h8
-rw-r--r--arch/loongarch/kernel/asm-offsets.c2
-rw-r--r--arch/loongarch/kernel/env.c4
-rw-r--r--arch/loongarch/kernel/module-sections.c36
-rw-r--r--arch/loongarch/kernel/process.c2
-rw-r--r--arch/loongarch/kernel/signal.c10
-rw-r--r--arch/loongarch/kernel/stacktrace.c3
-rw-r--r--arch/loongarch/kernel/time.c22
-rw-r--r--arch/loongarch/kernel/vdso.c3
-rw-r--r--arch/loongarch/kvm/exit.c6
-rw-r--r--arch/loongarch/kvm/intc/eiointc.c94
-rw-r--r--arch/loongarch/kvm/intc/ipi.c8
-rw-r--r--arch/loongarch/kvm/intc/pch_pic.c31
-rw-r--r--arch/loongarch/kvm/mmu.c8
-rw-r--r--arch/loongarch/kvm/vcpu.c8
-rw-r--r--arch/m68k/configs/amiga_defconfig11
-rw-r--r--arch/m68k/configs/apollo_defconfig11
-rw-r--r--arch/m68k/configs/atari_defconfig11
-rw-r--r--arch/m68k/configs/bvme6000_defconfig11
-rw-r--r--arch/m68k/configs/hp300_defconfig11
-rw-r--r--arch/m68k/configs/mac_defconfig11
-rw-r--r--arch/m68k/configs/multi_defconfig11
-rw-r--r--arch/m68k/configs/mvme147_defconfig11
-rw-r--r--arch/m68k/configs/mvme16x_defconfig11
-rw-r--r--arch/m68k/configs/q40_defconfig11
-rw-r--r--arch/m68k/configs/sun3_defconfig11
-rw-r--r--arch/m68k/configs/sun3x_defconfig11
-rw-r--r--arch/m68k/include/asm/bitops.h39
-rw-r--r--arch/m68k/kernel/asm-offsets.c1
-rw-r--r--arch/m68k/kernel/process.c2
-rw-r--r--arch/microblaze/Kconfig.platform10
-rw-r--r--arch/microblaze/include/asm/asm-compat.h2
-rw-r--r--arch/microblaze/include/asm/current.h4
-rw-r--r--arch/microblaze/include/asm/entry.h4
-rw-r--r--arch/microblaze/include/asm/exceptions.h4
-rw-r--r--arch/microblaze/include/asm/fixmap.h4
-rw-r--r--arch/microblaze/include/asm/ftrace.h2
-rw-r--r--arch/microblaze/include/asm/kgdb.h4
-rw-r--r--arch/microblaze/include/asm/mmu.h4
-rw-r--r--arch/microblaze/include/asm/page.h8
-rw-r--r--arch/microblaze/include/asm/pgtable.h18
-rw-r--r--arch/microblaze/include/asm/processor.h8
-rw-r--r--arch/microblaze/include/asm/ptrace.h4
-rw-r--r--arch/microblaze/include/asm/sections.h4
-rw-r--r--arch/microblaze/include/asm/setup.h4
-rw-r--r--arch/microblaze/include/asm/thread_info.h4
-rw-r--r--arch/microblaze/include/asm/unistd.h4
-rw-r--r--arch/microblaze/include/asm/xilinx_mb_manager.h4
-rw-r--r--arch/microblaze/include/uapi/asm/ptrace.h4
-rw-r--r--arch/microblaze/kernel/asm-offsets.c1
-rw-r--r--arch/microblaze/kernel/process.c2
-rw-r--r--arch/mips/Kconfig16
-rw-r--r--arch/mips/boot/dts/lantiq/danube_easy50712.dts5
-rw-r--r--arch/mips/cavium-octeon/Makefile2
-rw-r--r--arch/mips/cavium-octeon/crypto/Makefile8
-rw-r--r--arch/mips/cavium-octeon/crypto/octeon-md5.c214
-rw-r--r-- arch/mips/cavium-octeon/octeon-crypto.c (renamed from arch/mips/cavium-octeon/crypto/octeon-crypto.c) 0
-rw-r--r--arch/mips/configs/cavium_octeon_defconfig1
-rw-r--r--arch/mips/configs/mtx1_defconfig1
-rw-r--r--arch/mips/crypto/Kconfig10
-rw-r--r--arch/mips/include/asm/bitops.h8
-rw-r--r--arch/mips/kernel/asm-offsets.c2
-rw-r--r--arch/mips/kernel/process.c2
-rw-r--r--arch/mips/lantiq/xway/sysctrl.c10
-rw-r--r--arch/nios2/include/asm/entry.h4
-rw-r--r--arch/nios2/include/asm/page.h4
-rw-r--r--arch/nios2/include/asm/processor.h4
-rw-r--r--arch/nios2/include/asm/ptrace.h4
-rw-r--r--arch/nios2/include/asm/registers.h4
-rw-r--r--arch/nios2/include/asm/setup.h4
-rw-r--r--arch/nios2/include/asm/syscalls.h1
-rw-r--r--arch/nios2/include/asm/thread_info.h4
-rw-r--r--arch/nios2/include/asm/traps.h2
-rw-r--r--arch/nios2/include/asm/unistd.h2
-rw-r--r--arch/nios2/include/uapi/asm/ptrace.h4
-rw-r--r--arch/nios2/kernel/asm-offsets.c1
-rw-r--r--arch/nios2/kernel/entry.S6
-rw-r--r--arch/nios2/kernel/process.c2
-rw-r--r--arch/nios2/kernel/setup.c15
-rw-r--r--arch/nios2/kernel/syscall_table.c1
-rw-r--r--arch/openrisc/include/asm/bitops/__ffs.h2
-rw-r--r--arch/openrisc/include/asm/bitops/__fls.h2
-rw-r--r--arch/openrisc/include/asm/bitops/ffs.h2
-rw-r--r--arch/openrisc/include/asm/bitops/fls.h2
-rw-r--r--arch/openrisc/kernel/asm-offsets.c1
-rw-r--r--arch/openrisc/kernel/process.c2
-rw-r--r--arch/parisc/Kconfig9
-rw-r--r--arch/parisc/include/asm/bitops.h6
-rw-r--r--arch/parisc/kernel/asm-offsets.c1
-rw-r--r--arch/parisc/kernel/process.c2
-rw-r--r--arch/powerpc/Kconfig15
-rw-r--r--arch/powerpc/Makefile2
-rw-r--r--arch/powerpc/boot/Makefile6
-rwxr-xr-xarch/powerpc/boot/install.sh14
-rw-r--r--arch/powerpc/boot/page.h2
-rwxr-xr-xarch/powerpc/boot/wrapper6
-rw-r--r--arch/powerpc/configs/powernv_defconfig1
-rw-r--r--arch/powerpc/configs/ppc64_defconfig1
-rw-r--r--arch/powerpc/configs/ppc6xx_defconfig1
-rw-r--r--arch/powerpc/crypto/Kconfig21
-rw-r--r--arch/powerpc/crypto/Makefile4
-rw-r--r--arch/powerpc/crypto/md5-glue.c99
-rw-r--r--arch/powerpc/include/asm/asm-const.h2
-rw-r--r--arch/powerpc/include/asm/barrier.h2
-rw-r--r--arch/powerpc/include/asm/bitops.h4
-rw-r--r--arch/powerpc/include/asm/book3s/32/kup.h4
-rw-r--r--arch/powerpc/include/asm/book3s/32/mmu-hash.h8
-rw-r--r--arch/powerpc/include/asm/book3s/32/pgalloc.h10
-rw-r--r--arch/powerpc/include/asm/book3s/32/pgtable.h12
-rw-r--r--arch/powerpc/include/asm/book3s/64/hash-4k.h4
-rw-r--r--arch/powerpc/include/asm/book3s/64/hash-64k.h4
-rw-r--r--arch/powerpc/include/asm/book3s/64/hash.h4
-rw-r--r--arch/powerpc/include/asm/book3s/64/kup.h6
-rw-r--r--arch/powerpc/include/asm/book3s/64/mmu-hash.h12
-rw-r--r--arch/powerpc/include/asm/book3s/64/mmu.h8
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable-64k.h4
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable.h10
-rw-r--r--arch/powerpc/include/asm/book3s/64/radix.h8
-rw-r--r--arch/powerpc/include/asm/book3s/64/slice.h4
-rw-r--r--arch/powerpc/include/asm/bug.h14
-rw-r--r--arch/powerpc/include/asm/cache.h4
-rw-r--r--arch/powerpc/include/asm/cpu_has_feature.h4
-rw-r--r--arch/powerpc/include/asm/cpuidle.h2
-rw-r--r--arch/powerpc/include/asm/cputable.h8
-rw-r--r--arch/powerpc/include/asm/cputhreads.h4
-rw-r--r--arch/powerpc/include/asm/dbell.h18
-rw-r--r--arch/powerpc/include/asm/dcr-native.h4
-rw-r--r--arch/powerpc/include/asm/dcr.h4
-rw-r--r--arch/powerpc/include/asm/epapr_hcalls.h4
-rw-r--r--arch/powerpc/include/asm/exception-64e.h2
-rw-r--r--arch/powerpc/include/asm/exception-64s.h6
-rw-r--r--arch/powerpc/include/asm/extable.h2
-rw-r--r--arch/powerpc/include/asm/feature-fixups.h6
-rw-r--r--arch/powerpc/include/asm/firmware.h4
-rw-r--r--arch/powerpc/include/asm/fixmap.h4
-rw-r--r--arch/powerpc/include/asm/fprobe.h12
-rw-r--r--arch/powerpc/include/asm/ftrace.h23
-rw-r--r--arch/powerpc/include/asm/head-64.h4
-rw-r--r--arch/powerpc/include/asm/hvcall.h4
-rw-r--r--arch/powerpc/include/asm/hw_irq.h4
-rw-r--r--arch/powerpc/include/asm/interrupt.h4
-rw-r--r--arch/powerpc/include/asm/irqflags.h2
-rw-r--r--arch/powerpc/include/asm/jump_label.h2
-rw-r--r--arch/powerpc/include/asm/kasan.h4
-rw-r--r--arch/powerpc/include/asm/kdump.h4
-rw-r--r--arch/powerpc/include/asm/kexec.h4
-rw-r--r--arch/powerpc/include/asm/kgdb.h4
-rw-r--r--arch/powerpc/include/asm/kup.h8
-rw-r--r--arch/powerpc/include/asm/kvm_asm.h2
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_asm.h6
-rw-r--r--arch/powerpc/include/asm/kvm_booke_hv_asm.h4
-rw-r--r--arch/powerpc/include/asm/lv1call.h4
-rw-r--r--arch/powerpc/include/asm/mmu.h8
-rw-r--r--arch/powerpc/include/asm/module.h1
-rw-r--r--arch/powerpc/include/asm/mpc52xx.h12
-rw-r--r--arch/powerpc/include/asm/nohash/32/kup-8xx.h4
-rw-r--r--arch/powerpc/include/asm/nohash/32/mmu-44x.h4
-rw-r--r--arch/powerpc/include/asm/nohash/32/mmu-8xx.h4
-rw-r--r--arch/powerpc/include/asm/nohash/32/pgtable.h12
-rw-r--r--arch/powerpc/include/asm/nohash/32/pte-8xx.h2
-rw-r--r--arch/powerpc/include/asm/nohash/64/pgtable-4k.h8
-rw-r--r--arch/powerpc/include/asm/nohash/64/pgtable.h4
-rw-r--r--arch/powerpc/include/asm/nohash/kup-booke.h4
-rw-r--r--arch/powerpc/include/asm/nohash/mmu-e500.h4
-rw-r--r--arch/powerpc/include/asm/nohash/pgalloc.h2
-rw-r--r--arch/powerpc/include/asm/nohash/pgtable.h6
-rw-r--r--arch/powerpc/include/asm/nohash/pte-e500.h4
-rw-r--r--arch/powerpc/include/asm/opal-api.h4
-rw-r--r--arch/powerpc/include/asm/opal.h4
-rw-r--r--arch/powerpc/include/asm/page.h14
-rw-r--r--arch/powerpc/include/asm/page_32.h4
-rw-r--r--arch/powerpc/include/asm/page_64.h4
-rw-r--r--arch/powerpc/include/asm/papr-sysparm.h1
-rw-r--r--arch/powerpc/include/asm/pci-bridge.h2
-rw-r--r--arch/powerpc/include/asm/pgtable.h20
-rw-r--r--arch/powerpc/include/asm/ppc-opcode.h1
-rw-r--r--arch/powerpc/include/asm/ppc_asm.h4
-rw-r--r--arch/powerpc/include/asm/processor.h8
-rw-r--r--arch/powerpc/include/asm/ptrace.h6
-rw-r--r--arch/powerpc/include/asm/reg.h6
-rw-r--r--arch/powerpc/include/asm/reg_booke.h4
-rw-r--r--arch/powerpc/include/asm/reg_fsl_emb.h4
-rw-r--r--arch/powerpc/include/asm/rtas.h9
-rw-r--r--arch/powerpc/include/asm/setup.h4
-rw-r--r--arch/powerpc/include/asm/smp.h4
-rw-r--r--arch/powerpc/include/asm/spu_csa.h4
-rw-r--r--arch/powerpc/include/asm/synch.h4
-rw-r--r--arch/powerpc/include/asm/thread_info.h8
-rw-r--r--arch/powerpc/include/asm/time.h4
-rw-r--r--arch/powerpc/include/asm/tm.h4
-rw-r--r--arch/powerpc/include/asm/topology.h2
-rw-r--r--arch/powerpc/include/asm/types.h4
-rw-r--r--arch/powerpc/include/asm/unistd.h4
-rw-r--r--arch/powerpc/include/asm/vdso.h6
-rw-r--r--arch/powerpc/include/asm/vdso/getrandom.h4
-rw-r--r--arch/powerpc/include/asm/vdso/gettimeofday.h4
-rw-r--r--arch/powerpc/include/asm/vdso/processor.h4
-rw-r--r--arch/powerpc/include/asm/vdso/vsyscall.h4
-rw-r--r--arch/powerpc/include/asm/vdso_datapage.h6
-rw-r--r--arch/powerpc/include/asm/xive.h1
-rw-r--r--arch/powerpc/include/uapi/asm/opal-prd.h4
-rw-r--r--arch/powerpc/include/uapi/asm/papr-hvpipe.h33
-rw-r--r--arch/powerpc/include/uapi/asm/ptrace.h12
-rw-r--r--arch/powerpc/include/uapi/asm/types.h4
-rw-r--r--arch/powerpc/kernel/Makefile4
-rw-r--r--arch/powerpc/kernel/asm-offsets.c1
-rw-r--r--arch/powerpc/kernel/head_8xx.S25
-rw-r--r--arch/powerpc/kernel/head_booke.h4
-rw-r--r--arch/powerpc/kernel/kvm.c8
-rw-r--r--arch/powerpc/kernel/module_64.c26
-rw-r--r--arch/powerpc/kernel/process.c2
-rw-r--r--arch/powerpc/kernel/prom_init_check.sh16
-rw-r--r--arch/powerpc/kernel/rtas.c24
-rw-r--r--arch/powerpc/kernel/rtasd.c2
-rw-r--r--arch/powerpc/kernel/setup_64.c5
-rw-r--r--arch/powerpc/kernel/smp.c27
-rw-r--r--arch/powerpc/kernel/time.c8
-rw-r--r--arch/powerpc/kernel/trace/ftrace.c10
-rw-r--r--arch/powerpc/kernel/trace/ftrace_entry.S42
-rw-r--r--arch/powerpc/kernel/vdso.c3
-rw-r--r--arch/powerpc/kvm/powerpc.c2
-rw-r--r--arch/powerpc/lib/qspinlock.c19
-rw-r--r--arch/powerpc/mm/book3s32/mmu.c4
-rw-r--r--arch/powerpc/mm/nohash/mmu_context.c10
-rw-r--r--arch/powerpc/mm/pgtable_32.c2
-rw-r--r--arch/powerpc/net/bpf_jit.h8
-rw-r--r--arch/powerpc/net/bpf_jit_comp.c32
-rw-r--r--arch/powerpc/net/bpf_jit_comp32.c2
-rw-r--r--arch/powerpc/net/bpf_jit_comp64.c401
-rw-r--r--arch/powerpc/perf/Makefile2
-rw-r--r--arch/powerpc/perf/vpa-dtl.c596
-rw-r--r--arch/powerpc/platforms/44x/Kconfig1
-rw-r--r--arch/powerpc/platforms/44x/gpio.c108
-rw-r--r--arch/powerpc/platforms/8xx/Kconfig1
-rw-r--r--arch/powerpc/platforms/8xx/cpm1-ic.c3
-rw-r--r--arch/powerpc/platforms/Kconfig1
-rw-r--r--arch/powerpc/platforms/Kconfig.cputype13
-rw-r--r--arch/powerpc/platforms/cell/spufs/file.c2
-rw-r--r--arch/powerpc/platforms/cell/spufs/syscalls.c4
-rw-r--r--arch/powerpc/platforms/powernv/Kconfig1
-rw-r--r--arch/powerpc/platforms/powernv/pci-ioda.c98
-rw-r--r--arch/powerpc/platforms/powernv/subcore.h4
-rw-r--r--arch/powerpc/platforms/pseries/Kconfig1
-rw-r--r--arch/powerpc/platforms/pseries/Makefile1
-rw-r--r--arch/powerpc/platforms/pseries/mobility.c3
-rw-r--r--arch/powerpc/platforms/pseries/msi.c134
-rw-r--r--arch/powerpc/platforms/pseries/papr-hvpipe.c818
-rw-r--r--arch/powerpc/platforms/pseries/papr-hvpipe.h42
-rw-r--r--arch/powerpc/sysdev/cpm_common.c56
-rw-r--r--arch/powerpc/sysdev/fsl_msi.c5
-rw-r--r--arch/powerpc/sysdev/xive/common.c63
-rw-r--r--arch/powerpc/xmon/ppc-opc.c16
-rw-r--r--arch/powerpc/xmon/xmon_bpts.h4
-rw-r--r--arch/riscv/Kconfig20
-rw-r--r--arch/riscv/Kconfig.errata23
-rw-r--r--arch/riscv/Kconfig.vendor13
-rw-r--r--arch/riscv/boot/dts/allwinner/sun20i-d1-devterm-v3.14.dts2
-rw-r--r--arch/riscv/boot/dts/thead/th1520.dtsi10
-rw-r--r--arch/riscv/errata/Makefile1
-rw-r--r--arch/riscv/errata/mips/Makefile5
-rw-r--r--arch/riscv/errata/mips/errata.c67
-rw-r--r--arch/riscv/include/asm/alternative-macros.h12
-rw-r--r--arch/riscv/include/asm/alternative.h5
-rw-r--r--arch/riscv/include/asm/asm-extable.h6
-rw-r--r--arch/riscv/include/asm/asm.h12
-rw-r--r--arch/riscv/include/asm/assembler.h2
-rw-r--r--arch/riscv/include/asm/barrier.h4
-rw-r--r--arch/riscv/include/asm/bitops.h6
-rw-r--r--arch/riscv/include/asm/cache.h4
-rw-r--r--arch/riscv/include/asm/cfi.h4
-rw-r--r--arch/riscv/include/asm/cmpxchg.h3
-rw-r--r--arch/riscv/include/asm/cpu_ops_sbi.h2
-rw-r--r--arch/riscv/include/asm/csr.h4
-rw-r--r--arch/riscv/include/asm/current.h4
-rw-r--r--arch/riscv/include/asm/errata_list.h38
-rw-r--r--arch/riscv/include/asm/errata_list_vendors.h29
-rw-r--r--arch/riscv/include/asm/ftrace.h6
-rw-r--r--arch/riscv/include/asm/gpr-num.h6
-rw-r--r--arch/riscv/include/asm/hwprobe.h3
-rw-r--r--arch/riscv/include/asm/image.h4
-rw-r--r--arch/riscv/include/asm/insn-def.h8
-rw-r--r--arch/riscv/include/asm/insn.h216
-rw-r--r--arch/riscv/include/asm/io.h4
-rw-r--r--arch/riscv/include/asm/jump_label.h4
-rw-r--r--arch/riscv/include/asm/kasan.h2
-rw-r--r--arch/riscv/include/asm/kgdb.h4
-rw-r--r--arch/riscv/include/asm/mmu.h4
-rw-r--r--arch/riscv/include/asm/page.h4
-rw-r--r--arch/riscv/include/asm/pgtable.h22
-rw-r--r--arch/riscv/include/asm/processor.h4
-rw-r--r--arch/riscv/include/asm/ptrace.h4
-rw-r--r--arch/riscv/include/asm/scs.h4
-rw-r--r--arch/riscv/include/asm/set_memory.h4
-rw-r--r--arch/riscv/include/asm/swab.h87
-rw-r--r--arch/riscv/include/asm/thread_info.h35
-rw-r--r--arch/riscv/include/asm/uaccess.h8
-rw-r--r--arch/riscv/include/asm/vdso.h4
-rw-r--r--arch/riscv/include/asm/vdso/getrandom.h4
-rw-r--r--arch/riscv/include/asm/vdso/gettimeofday.h4
-rw-r--r--arch/riscv/include/asm/vdso/processor.h7
-rw-r--r--arch/riscv/include/asm/vdso/vsyscall.h4
-rw-r--r--arch/riscv/include/asm/vendor_extensions/mips.h37
-rw-r--r--arch/riscv/include/asm/vendor_extensions/mips_hwprobe.h22
-rw-r--r--arch/riscv/include/asm/vendorid_list.h1
-rw-r--r--arch/riscv/include/uapi/asm/hwprobe.h1
-rw-r--r--arch/riscv/include/uapi/asm/kvm.h2
-rw-r--r--arch/riscv/include/uapi/asm/ptrace.h4
-rw-r--r--arch/riscv/include/uapi/asm/sigcontext.h4
-rw-r--r--arch/riscv/include/uapi/asm/vendor/mips.h3
-rw-r--r--arch/riscv/kernel/Makefile2
-rw-r--r--arch/riscv/kernel/acpi.c3
-rw-r--r--arch/riscv/kernel/alternative.c5
-rw-r--r--arch/riscv/kernel/asm-offsets.c1
-rw-r--r--arch/riscv/kernel/cpufeature.c6
-rw-r--r--arch/riscv/kernel/entry.S3
-rw-r--r--arch/riscv/kernel/kexec_elf.c4
-rw-r--r--arch/riscv/kernel/kexec_image.c2
-rw-r--r--arch/riscv/kernel/machine_kexec_file.c4
-rw-r--r--arch/riscv/kernel/pi/Makefile2
-rw-r--r--arch/riscv/kernel/pi/cmdline_early.c4
-rw-r--r--arch/riscv/kernel/pi/fdt_early.c40
-rw-r--r--arch/riscv/kernel/pi/pi.h1
-rw-r--r--arch/riscv/kernel/probes/simulate-insn.c94
-rw-r--r--arch/riscv/kernel/process.c2
-rw-r--r--arch/riscv/kernel/sbi.c4
-rw-r--r--arch/riscv/kernel/sys_hwprobe.c18
-rw-r--r--arch/riscv/kernel/sys_riscv.c2
-rw-r--r--arch/riscv/kernel/tests/Kconfig.debug12
-rw-r--r--arch/riscv/kernel/tests/Makefile1
-rw-r--r--arch/riscv/kernel/tests/kprobes/Makefile1
-rw-r--r--arch/riscv/kernel/tests/kprobes/test-kprobes-asm.S229
-rw-r--r--arch/riscv/kernel/tests/kprobes/test-kprobes.c56
-rw-r--r--arch/riscv/kernel/tests/kprobes/test-kprobes.h24
-rw-r--r--arch/riscv/kernel/traps_misaligned.c144
-rw-r--r--arch/riscv/kernel/vector.c2
-rw-r--r--arch/riscv/kernel/vendor_extensions.c10
-rw-r--r--arch/riscv/kernel/vendor_extensions/Makefile2
-rw-r--r--arch/riscv/kernel/vendor_extensions/mips.c22
-rw-r--r--arch/riscv/kernel/vendor_extensions/mips_hwprobe.c23
-rw-r--r--arch/riscv/kvm/mmu.c5
-rw-r--r--arch/riscv/kvm/vcpu.c2
-rw-r--r--arch/riscv/kvm/vcpu_insn.c128
-rw-r--r--arch/riscv/kvm/vcpu_vector.c2
-rw-r--r--arch/riscv/mm/init.c12
-rw-r--r--arch/riscv/net/bpf_jit_comp64.c8
-rw-r--r--arch/riscv/purgatory/Makefile2
-rw-r--r--arch/s390/Kconfig16
-rw-r--r--arch/s390/boot/boot.h8
-rw-r--r--arch/s390/boot/decompressor.c4
-rw-r--r--arch/s390/boot/physmem_info.c4
-rw-r--r--arch/s390/boot/startup.c13
-rw-r--r--arch/s390/boot/vmem.c3
-rw-r--r--arch/s390/configs/debug_defconfig43
-rw-r--r--arch/s390/configs/defconfig44
-rw-r--r--arch/s390/configs/zfcpdump_defconfig3
-rw-r--r--arch/s390/hypfs/hypfs_dbfs.c19
-rw-r--r--arch/s390/include/asm/bitops.h88
-rw-r--r--arch/s390/include/asm/pci_insn.h10
-rw-r--r--arch/s390/include/asm/pgalloc.h30
-rw-r--r--arch/s390/include/asm/thread_info.h50
-rw-r--r--arch/s390/kernel/asm-offsets.c1
-rw-r--r--arch/s390/kernel/debug.c12
-rw-r--r--arch/s390/kernel/diag/diag324.c4
-rw-r--r--arch/s390/kernel/hiperdispatch.c2
-rw-r--r--arch/s390/kernel/kexec_elf.c2
-rw-r--r--arch/s390/kernel/kexec_image.c2
-rw-r--r--arch/s390/kernel/machine_kexec_file.c6
-rw-r--r--arch/s390/kernel/perf_cpum_cf.c4
-rw-r--r--arch/s390/kernel/perf_pai_crypto.c4
-rw-r--r--arch/s390/kernel/perf_pai_ext.c2
-rw-r--r--arch/s390/kernel/process.c2
-rw-r--r--arch/s390/kernel/topology.c20
-rw-r--r--arch/s390/kvm/interrupt.c15
-rw-r--r--arch/s390/kvm/kvm-s390.c24
-rw-r--r--arch/s390/kvm/pv.c16
-rw-r--r--arch/s390/mm/pgalloc.c19
-rw-r--r--arch/s390/mm/pgtable.c2
-rw-r--r--arch/sh/include/asm/bitops.h4
-rw-r--r--arch/sh/kernel/asm-offsets.c1
-rw-r--r--arch/sh/kernel/process_32.c2
-rw-r--r--arch/sparc/Kconfig20
-rw-r--r--arch/sparc/crypto/Kconfig10
-rw-r--r--arch/sparc/crypto/Makefile4
-rw-r--r--arch/sparc/crypto/md5_glue.c174
-rw-r--r--arch/sparc/include/asm/bitops_64.h8
-rw-r--r--arch/sparc/kernel/asm-offsets.c1
-rw-r--r--arch/sparc/kernel/process_32.c2
-rw-r--r--arch/sparc/kernel/process_64.c2
-rw-r--r--arch/um/drivers/virtio_uml.c6
-rw-r--r--arch/um/kernel/asm-offsets.c2
-rw-r--r--arch/um/kernel/process.c2
-rw-r--r--arch/um/os-Linux/file.c2
-rw-r--r--arch/um/os-Linux/util.c3
-rw-r--r--arch/x86/Kbuild2
-rw-r--r--arch/x86/Kconfig97
-rw-r--r--arch/x86/Makefile26
-rw-r--r--arch/x86/boot/compressed/Makefile2
-rw-r--r--arch/x86/boot/compressed/misc.c2
-rw-r--r--arch/x86/boot/compressed/sev-handle-vc.c3
-rw-r--r--arch/x86/boot/compressed/sev.c132
-rw-r--r--arch/x86/boot/startup/Makefile22
-rw-r--r--arch/x86/boot/startup/exports.h14
-rw-r--r--arch/x86/boot/startup/gdt_idt.c4
-rw-r--r--arch/x86/boot/startup/map_kernel.c4
-rw-r--r--arch/x86/boot/startup/sev-shared.c333
-rw-r--r--arch/x86/boot/startup/sev-startup.c210
-rw-r--r--arch/x86/boot/startup/sme.c30
-rw-r--r--arch/x86/coco/core.c3
-rw-r--r--arch/x86/coco/sev/Makefile8
-rw-r--r--arch/x86/coco/sev/core.c299
-rw-r--r-- arch/x86/coco/sev/noinstr.c (renamed from arch/x86/coco/sev/sev-nmi.c) 74
-rw-r--r--arch/x86/coco/sev/vc-handle.c53
-rw-r--r--arch/x86/coco/sev/vc-shared.c143
-rw-r--r--arch/x86/configs/xen.config1
-rw-r--r--arch/x86/crypto/Kconfig13
-rw-r--r--arch/x86/crypto/Makefile5
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl1
-rw-r--r--arch/x86/events/core.c16
-rw-r--r--arch/x86/events/intel/bts.c2
-rw-r--r--arch/x86/events/intel/core.c21
-rw-r--r--arch/x86/include/asm/apic.h11
-rw-r--r--arch/x86/include/asm/apicdef.h2
-rw-r--r--arch/x86/include/asm/bitops.h12
-rw-r--r--arch/x86/include/asm/boot.h2
-rw-r--r--arch/x86/include/asm/cfi.h4
-rw-r--r--arch/x86/include/asm/cpufeatures.h4
-rw-r--r--arch/x86/include/asm/cpuid.h8
-rw-r--r--arch/x86/include/asm/entry-common.h7
-rw-r--r--arch/x86/include/asm/fpu/sched.h2
-rw-r--r--arch/x86/include/asm/hypervisor.h2
-rw-r--r--arch/x86/include/asm/inat.h15
-rw-r--r--arch/x86/include/asm/init.h6
-rw-r--r--arch/x86/include/asm/insn.h51
-rw-r--r--arch/x86/include/asm/intel-family.h7
-rw-r--r--arch/x86/include/asm/mce.h11
-rw-r--r--arch/x86/include/asm/msr-index.h30
-rw-r--r--arch/x86/include/asm/nospec-branch.h3
-rw-r--r--arch/x86/include/asm/perf_event.h8
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h3
-rw-r--r--arch/x86/include/asm/resctrl.h16
-rw-r--r--arch/x86/include/asm/segment.h8
-rw-r--r--arch/x86/include/asm/setup.h1
-rw-r--r--arch/x86/include/asm/sev-common.h1
-rw-r--r--arch/x86/include/asm/sev-internal.h28
-rw-r--r--arch/x86/include/asm/sev.h60
-rw-r--r--arch/x86/include/asm/shstk.h8
-rw-r--r--arch/x86/include/asm/thread_info.h76
-rw-r--r--arch/x86/include/asm/topology.h10
-rw-r--r--arch/x86/include/asm/uprobes.h7
-rw-r--r--arch/x86/include/asm/xen/hypercall.h5
-rw-r--r--arch/x86/include/asm/xen/page.h14
-rw-r--r--arch/x86/include/uapi/asm/svm.h4
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/alternative.c4
-rw-r--r--arch/x86/kernel/apic/Makefile1
-rw-r--r--arch/x86/kernel/apic/apic.c85
-rw-r--r--arch/x86/kernel/apic/vector.c28
-rw-r--r--arch/x86/kernel/apic/x2apic_savic.c428
-rw-r--r--arch/x86/kernel/cpu/Makefile1
-rw-r--r--arch/x86/kernel/cpu/amd.c8
-rw-r--r--arch/x86/kernel/cpu/bhyve.c66
-rw-r--r--arch/x86/kernel/cpu/bugs.c711
-rw-r--r--arch/x86/kernel/cpu/cacheinfo.c48
-rw-r--r--arch/x86/kernel/cpu/common.c87
-rw-r--r--arch/x86/kernel/cpu/hygon.c3
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c3
-rw-r--r--arch/x86/kernel/cpu/intel.c2
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c163
-rw-r--r--arch/x86/kernel/cpu/mce/core.c315
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c18
-rw-r--r--arch/x86/kernel/cpu/mce/internal.h9
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c95
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c47
-rw-r--r--arch/x86/kernel/cpu/microcode/intel-ucode-defs.h86
-rw-r--r--arch/x86/kernel/cpu/microcode/internal.h9
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c81
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h56
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c248
-rw-r--r--arch/x86/kernel/cpu/scattered.c2
-rw-r--r--arch/x86/kernel/cpu/topology.c13
-rw-r--r--arch/x86/kernel/cpu/topology_amd.c60
-rw-r--r--arch/x86/kernel/fpu/core.c2
-rw-r--r--arch/x86/kernel/fpu/xstate.c19
-rw-r--r--arch/x86/kernel/head64.c5
-rw-r--r--arch/x86/kernel/head_32.S5
-rw-r--r--arch/x86/kernel/head_64.S10
-rw-r--r--arch/x86/kernel/kprobes/core.c2
-rw-r--r--arch/x86/kernel/process.c2
-rw-r--r--arch/x86/kernel/shstk.c42
-rw-r--r--arch/x86/kernel/smpboot.c8
-rw-r--r--arch/x86/kernel/umip.c15
-rw-r--r--arch/x86/kernel/uprobes.c635
-rw-r--r--arch/x86/kernel/vmlinux.lds.S9
-rw-r--r--arch/x86/kvm/lapic.c2
-rw-r--r--arch/x86/kvm/pmu.h2
-rw-r--r--arch/x86/kvm/svm/sev.c10
-rw-r--r--arch/x86/kvm/svm/svm.c3
-rw-r--r--arch/x86/kvm/x86.c16
-rw-r--r--arch/x86/lib/inat.c13
-rw-r--r--arch/x86/lib/insn.c35
-rw-r--r--arch/x86/lib/retpoline.S75
-rw-r--r--arch/x86/lib/x86-opcode-map.txt111
-rw-r--r--arch/x86/mm/init_64.c18
-rw-r--r--arch/x86/mm/mem_encrypt_amd.c6
-rw-r--r--arch/x86/mm/mem_encrypt_boot.S6
-rw-r--r--arch/x86/platform/pvh/head.S2
-rw-r--r--arch/x86/purgatory/Makefile2
-rw-r--r--arch/x86/tools/gen-insn-attr-x86.awk44
-rw-r--r--arch/x86/tools/relocs.c8
-rw-r--r--arch/x86/virt/svm/sev.c7
-rw-r--r--arch/x86/xen/Kconfig7
-rw-r--r--arch/x86/xen/enlighten_pv.c2
-rw-r--r--arch/x86/xen/mmu.c2
-rw-r--r--arch/x86/xen/p2m.c4
-rw-r--r--arch/xtensa/include/asm/bitops.h10
-rw-r--r--arch/xtensa/kernel/asm-offsets.c1
-rw-r--r--arch/xtensa/kernel/process.c2
-rw-r--r--block/bdev.c2
-rw-r--r--block/bfq-iosched.c3
-rw-r--r--block/blk-cgroup.c6
-rw-r--r--block/blk-core.c2
-rw-r--r--block/blk-integrity.c8
-rw-r--r--block/blk-ioc.c2
-rw-r--r--block/blk-mq-debugfs.c1
-rw-r--r--block/blk-mq.c13
-rw-r--r--block/blk-rq-qos.c8
-rw-r--r--block/blk-rq-qos.h51
-rw-r--r--block/blk-settings.c12
-rw-r--r--block/blk-sysfs.c14
-rw-r--r--block/blk-wbt.c15
-rw-r--r--block/blk-zoned.c11
-rw-r--r--block/blk.h1
-rw-r--r--block/fops.c13
-rw-r--r--block/genhd.c2
-rw-r--r--crypto/Kconfig14
-rw-r--r--crypto/Makefile1
-rw-r--r--crypto/af_alg.c10
-rw-r--r--crypto/chacha.c129
-rw-r--r--crypto/curve25519-generic.c91
-rw-r--r--crypto/md5.c398
-rw-r--r--crypto/sha1.c39
-rw-r--r--crypto/sha256.c71
-rw-r--r--crypto/sha512.c71
-rw-r--r--crypto/testmgr.c18
-rw-r--r--crypto/testmgr.h1225
-rw-r--r--drivers/accel/habanalabs/common/memory.c23
-rw-r--r--drivers/accel/habanalabs/gaudi2/gaudi2.c2
-rw-r--r--drivers/accel/ivpu/ivpu_drv.c2
-rw-r--r--drivers/accel/ivpu/ivpu_pm.c4
-rw-r--r--drivers/accel/ivpu/ivpu_pm.h2
-rw-r--r--drivers/acpi/apei/einj-core.c17
-rw-r--r--drivers/acpi/arm64/iort.c4
-rw-r--r--drivers/acpi/ec.c10
-rw-r--r--drivers/acpi/pfr_update.c2
-rw-r--r--drivers/acpi/processor_perflib.c5
-rw-r--r--drivers/acpi/riscv/cppc.c4
-rw-r--r--drivers/ata/ahci.c57
-rw-r--r--drivers/ata/ahci.h1
-rw-r--r--drivers/ata/ahci_xgene.c7
-rw-r--r--drivers/ata/libata-eh.c9
-rw-r--r--drivers/ata/libata-scsi.c11
-rw-r--r--drivers/atm/atmtcp.c17
-rw-r--r--drivers/base/cpu.c3
-rw-r--r--drivers/base/devtmpfs.c22
-rw-r--r--drivers/base/power/main.c4
-rw-r--r--drivers/block/drbd/drbd_int.h39
-rw-r--r--drivers/block/drbd/drbd_main.c59
-rw-r--r--drivers/block/drbd/drbd_nl.c1
-rw-r--r--drivers/block/drbd/drbd_receiver.c264
-rw-r--r--drivers/block/drbd/drbd_worker.c56
-rw-r--r--drivers/block/loop.c45
-rw-r--r--drivers/block/ublk_drv.c100
-rw-r--r--drivers/block/zram/zram_drv.c8
-rw-r--r--drivers/bluetooth/Kconfig6
-rw-r--r--drivers/bluetooth/btmtk.c7
-rw-r--r--drivers/bluetooth/btnxpuart.c8
-rw-r--r--drivers/bluetooth/hci_uart.h8
-rw-r--r--drivers/bluetooth/hci_vhci.c57
-rw-r--r--drivers/cdx/controller/cdx_controller.c2
-rw-r--r--drivers/cdx/controller/cdx_rpmsg.c5
-rw-r--r--drivers/cdx/controller/mcdi.c43
-rw-r--r--drivers/cdx/controller/mcdi_functions.c1
-rw-r--r--drivers/cdx/controller/mcdi_functions.h3
-rw-r--r--drivers/cdx/controller/mcdid.h63
-rw-r--r--drivers/clk/renesas/clk-mstp.c20
-rw-r--r--drivers/clk/sunxi-ng/ccu_mp.c2
-rw-r--r--drivers/comedi/comedi_fops.c5
-rw-r--r--drivers/comedi/drivers.c23
-rw-r--r--drivers/comedi/drivers/pcl726.c3
-rw-r--r--drivers/cpufreq/amd-pstate.c19
-rw-r--r--drivers/cpufreq/cpufreq.c20
-rw-r--r--drivers/cpufreq/intel_pstate.c5
-rw-r--r--drivers/cpuidle/governors/menu.c50
-rw-r--r--drivers/crypto/ccp/Makefile3
-rw-r--r--drivers/crypto/ccp/psp-dev.c20
-rw-r--r--drivers/crypto/ccp/psp-dev.h8
-rw-r--r--drivers/crypto/ccp/sev-dev.c184
-rw-r--r--drivers/crypto/ccp/sev-dev.h3
-rw-r--r--drivers/crypto/ccp/sfs.c311
-rw-r--r--drivers/crypto/ccp/sfs.h47
-rw-r--r--drivers/crypto/hisilicon/Kconfig1
-rw-r--r--drivers/crypto/hisilicon/hpre/hpre_crypto.c403
-rw-r--r--drivers/crypto/img-hash.c2
-rw-r--r--drivers/dax/super.c2
-rw-r--r--drivers/dma/dw/rzn1-dmamux.c15
-rw-r--r--drivers/dma/idxd/init.c39
-rw-r--r--drivers/dma/qcom/bam_dma.c8
-rw-r--r--drivers/dma/ti/edma.c4
-rw-r--r--drivers/dpll/dpll_netlink.c4
-rw-r--r--drivers/edac/Kconfig16
-rw-r--r--drivers/edac/Makefile2
-rw-r--r--drivers/edac/a72_edac.c225
-rw-r--r--drivers/edac/altera_edac.c5
-rw-r--r--drivers/edac/amd64_edac.c20
-rw-r--r--drivers/edac/amd64_edac.h2
-rw-r--r-- [-rwxr-xr-x] drivers/edac/ecs.c 0
-rw-r--r--drivers/edac/edac_mc_sysfs.c24
-rw-r--r--drivers/edac/i10nm_base.c27
-rw-r--r--drivers/edac/ie31200_edac.c4
-rw-r--r-- [-rwxr-xr-x] drivers/edac/mem_repair.c 0
-rw-r--r-- [-rwxr-xr-x] drivers/edac/scrub.c 0
-rw-r--r--drivers/edac/skx_base.c33
-rw-r--r--drivers/edac/skx_common.c54
-rw-r--r--drivers/edac/skx_common.h28
-rw-r--r--drivers/edac/versalnet_edac.c960
-rw-r--r--drivers/firewire/core-cdev.c2
-rw-r--r--drivers/firewire/core-transaction.c91
-rw-r--r--drivers/firmware/efi/libstub/x86-stub.c4
-rw-r--r--drivers/firmware/efi/stmm/tee_stmm_efi.c61
-rw-r--r--drivers/firmware/tegra/bpmp-tegra186.c5
-rw-r--r--drivers/fpga/zynq-fpga.c8
-rw-r--r--drivers/gpio/Kconfig7
-rw-r--r--drivers/gpio/gpio-mlxbf3.c54
-rw-r--r--drivers/gpio/gpio-mpc5200.c78
-rw-r--r--drivers/gpio/gpio-regmap.c2
-rw-r--r--drivers/gpio/gpio-timberdale.c2
-rw-r--r--drivers/gpio/gpiolib-acpi-core.c11
-rw-r--r--drivers/gpio/gpiolib-acpi-quirks.c26
-rw-r--r--drivers/gpio/gpiolib.c21
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c16
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h12
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c44
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c3
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c24
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c36
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c3
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c21
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c1
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c17
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c3
-rw-r--r--drivers/gpu/drm/amd/amdgpu/dce_v10_0.c5
-rw-r--r--drivers/gpu/drm/amd/amdgpu/dce_v11_0.c5
-rw-r--r--drivers/gpu/drm/amd/amdgpu/dce_v6_0.c5
-rw-r--r--drivers/gpu/drm/amd/amdgpu/dce_v8_0.c5
-rw-r--r--drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c29
-rw-r--r--drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c8
-rw-r--r--drivers/gpu/drm/amd/amdgpu/isp_v4_1_1.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/mes_v11_0.c5
-rw-r--r--drivers/gpu/drm/amd/amdgpu/psp_v11_0.c19
-rw-r--r--drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c6
-rw-r--r--drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c12
-rw-r--r--drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c64
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device.c36
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_topology.c3
-rw-r--r--drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c91
-rw-r--r--drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h7
-rw-r--r--drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c2
-rw-r--r--drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c19
-rw-r--r--drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c1
-rw-r--r--drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c2
-rw-r--r--drivers/gpu/drm/amd/display/dc/bios/bios_parser.c5
-rw-r--r--drivers/gpu/drm/amd/display/dc/bios/command_table.c2
-rw-r--r--drivers/gpu/drm/amd/display/dc/clk_mgr/dce100/dce_clk_mgr.c14
-rw-r--r--drivers/gpu/drm/amd/display/dc/clk_mgr/dce110/dce110_clk_mgr.c40
-rw-r--r--drivers/gpu/drm/amd/display/dc/clk_mgr/dce60/dce60_clk_mgr.c31
-rw-r--r--drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c121
-rw-r--r--drivers/gpu/drm/amd/display/dc/core/dc.c15
-rw-r--r--drivers/gpu/drm/amd/display/dc/dc.h2
-rw-r--r--drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c74
-rw-r--r--drivers/gpu/drm/amd/display/dc/dce/dce_link_encoder.c8
-rw-r--r--drivers/gpu/drm/amd/display/dc/dce/dmub_replay.c43
-rw-r--r--drivers/gpu/drm/amd/display/dc/dce/dmub_replay.h2
-rw-r--r--drivers/gpu/drm/amd/display/dc/dpp/dcn10/dcn10_dpp.c9
-rw-r--r--drivers/gpu/drm/amd/display/dc/dpp/dcn10/dcn10_dpp.h2
-rw-r--r--drivers/gpu/drm/amd/display/dc/dpp/dcn30/dcn30_dpp.c1
-rw-r--r--drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c8
-rw-r--r--drivers/gpu/drm/amd/display/dc/hwss/dcn314/dcn314_hwseq.c72
-rw-r--r--drivers/gpu/drm/amd/display/dc/hwss/dcn314/dcn314_hwseq.h2
-rw-r--r--drivers/gpu/drm/amd/display/dc/hwss/dcn314/dcn314_init.c1
-rw-r--r--drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c115
-rw-r--r--drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c3
-rw-r--r--drivers/gpu/drm/amd/display/dc/hwss/dcn351/dcn351_init.c3
-rw-r--r--drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c6
-rw-r--r--drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h3
-rw-r--r--drivers/gpu/drm/amd/display/dc/inc/hw/pg_cntl.h1
-rw-r--r--drivers/gpu/drm/amd/display/dc/link/protocols/link_edp_panel_control.c2
-rw-r--r--drivers/gpu/drm/amd/display/dc/pg/dcn35/dcn35_pg_cntl.c78
-rw-r--r--drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h20
-rw-r--r--drivers/gpu/drm/amd/display/modules/hdcp/hdcp_psp.c3
-rw-r--r--drivers/gpu/drm/amd/pm/amdgpu_pm.c18
-rw-r--r--drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c2
-rw-r--r--drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c30
-rw-r--r--drivers/gpu/drm/ast/ast_dp.c2
-rw-r--r--drivers/gpu/drm/bridge/analogix/analogix_dp_core.c4
-rw-r--r--drivers/gpu/drm/bridge/analogix/anx7625.c6
-rw-r--r--drivers/gpu/drm/bridge/aux-bridge.c2
-rw-r--r--drivers/gpu/drm/bridge/cadence/cdns-mhdp8546-core.c6
-rw-r--r--drivers/gpu/drm/bridge/ti-sn65dsi86.c11
-rw-r--r--drivers/gpu/drm/drm_bridge.c1
-rw-r--r--drivers/gpu/drm/drm_gpuvm.c80
-rw-r--r--drivers/gpu/drm/drm_panic_qr.rs22
-rw-r--r--drivers/gpu/drm/gma500/oaktrail_hdmi.c2
-rw-r--r--drivers/gpu/drm/hisilicon/hibmc/dp/dp_link.c14
-rw-r--r--drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.c22
-rw-r--r--drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.h1
-rw-r--r--drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_i2c.c5
-rw-r--r--drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_vdac.c11
-rw-r--r--drivers/gpu/drm/i915/display/intel_ddi.c5
-rw-r--r--drivers/gpu/drm/i915/display/intel_display_irq.c4
-rw-r--r--drivers/gpu/drm/i915/display/intel_display_power.c6
-rw-r--r--drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c2
-rw-r--r--drivers/gpu/drm/i915/display/intel_fbc.c8
-rw-r--r--drivers/gpu/drm/i915/display/intel_psr.c14
-rw-r--r--drivers/gpu/drm/i915/display/intel_tc.c93
-rw-r--r--drivers/gpu/drm/i915/gem/i915_gem_shmem.c7
-rw-r--r--drivers/gpu/drm/i915/gt/intel_workarounds.c20
-rw-r--r--drivers/gpu/drm/mediatek/mtk_drm_drv.c2
-rw-r--r--drivers/gpu/drm/mediatek/mtk_dsi.c6
-rw-r--r--drivers/gpu/drm/mediatek/mtk_hdmi.c8
-rw-r--r--drivers/gpu/drm/mediatek/mtk_plane.c3
-rw-r--r--drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c47
-rw-r--r--drivers/gpu/drm/msm/adreno/a6xx_gpu_state.h38
-rw-r--r--drivers/gpu/drm/msm/adreno/adreno_gen7_0_0_snapshot.h19
-rw-r--r--drivers/gpu/drm/msm/adreno/adreno_gen7_2_0_snapshot.h10
-rw-r--r--drivers/gpu/drm/msm/adreno/adreno_gen7_9_0_snapshot.h34
-rw-r--r--drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c2
-rw-r--r--drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c2
-rw-r--r--drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.c4
-rw-r--r--drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c4
-rw-r--r--drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c4
-rw-r--r--drivers/gpu/drm/msm/dsi/phy/dsi_phy.c59
-rw-r--r--drivers/gpu/drm/msm/dsi/phy/dsi_phy.h1
-rw-r--r--drivers/gpu/drm/msm/msm_debugfs.c11
-rw-r--r--drivers/gpu/drm/msm/msm_gem.c13
-rw-r--r--drivers/gpu/drm/msm/msm_gem.h2
-rw-r--r--drivers/gpu/drm/msm/msm_gem_submit.c72
-rw-r--r--drivers/gpu/drm/msm/msm_gem_vma.c60
-rw-r--r--drivers/gpu/drm/msm/msm_gpu.c20
-rw-r--r--drivers/gpu/drm/msm/msm_iommu.c16
-rw-r--r--drivers/gpu/drm/msm/msm_kms.c10
-rw-r--r--drivers/gpu/drm/msm/msm_mdss.c2
-rw-r--r--drivers/gpu/drm/msm/registers/adreno/a6xx.xml14
-rw-r--r--drivers/gpu/drm/msm/registers/display/dsi.xml28
-rw-r--r--drivers/gpu/drm/nouveau/dispnv50/wndw.c4
-rw-r--r--drivers/gpu/drm/nouveau/gv100_fence.c7
-rw-r--r--drivers/gpu/drm/nouveau/include/nvhw/class/clc36f.h85
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_display.c9
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_display.h3
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_exec.c6
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_fence.c15
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_fence.h1
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_sched.c35
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_sched.h9
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_uvmm.c8
-rw-r--r--drivers/gpu/drm/nouveau/nvif/vmm.c3
-rw-r--r--drivers/gpu/drm/nouveau/nvkm/engine/fifo/base.c2
-rw-r--r--drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga100.c23
-rw-r--r--drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga102.c1
-rw-r--r--drivers/gpu/drm/nouveau/nvkm/engine/fifo/priv.h2
-rw-r--r--drivers/gpu/drm/nouveau/nvkm/falcon/gm200.c15
-rw-r--r--drivers/gpu/drm/nouveau/nvkm/subdev/gsp/fwsec.c5
-rw-r--r--drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/fifo.c1
-rw-r--r--drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/rpc.c4
-rw-r--r--drivers/gpu/drm/nova/file.rs3
-rw-r--r--drivers/gpu/drm/omapdrm/omap_fb.c23
-rw-r--r--drivers/gpu/drm/omapdrm/omap_fb.h2
-rw-r--r--drivers/gpu/drm/omapdrm/omap_fbdev.c5
-rw-r--r--drivers/gpu/drm/panfrost/panfrost_gem.c2
-rw-r--r--drivers/gpu/drm/panthor/panthor_drv.c2
-rw-r--r--drivers/gpu/drm/panthor/panthor_sched.c8
-rw-r--r--drivers/gpu/drm/radeon/radeon_display.c5
-rw-r--r--drivers/gpu/drm/radeon/radeon_fbdev.c11
-rw-r--r--drivers/gpu/drm/radeon/radeon_mode.h2
-rw-r--r--drivers/gpu/drm/rockchip/Kconfig1
-rw-r--r--drivers/gpu/drm/rockchip/rockchip_drm_vop2.c9
-rw-r--r--drivers/gpu/drm/scheduler/sched_entity.c11
-rw-r--r--drivers/gpu/drm/tegra/gem.c2
-rw-r--r--drivers/gpu/drm/tests/drm_format_helper_test.c3
-rw-r--r--drivers/gpu/drm/xe/abi/guc_actions_abi.h1
-rw-r--r--drivers/gpu/drm/xe/abi/guc_klvs_abi.h25
-rw-r--r--drivers/gpu/drm/xe/regs/xe_bars.h1
-rw-r--r--drivers/gpu/drm/xe/tests/xe_bo.c2
-rw-r--r--drivers/gpu/drm/xe/tests/xe_dma_buf.c10
-rw-r--r--drivers/gpu/drm/xe/xe_bo.c27
-rw-r--r--drivers/gpu/drm/xe/xe_bo.h2
-rw-r--r--drivers/gpu/drm/xe/xe_bo_evict.c4
-rw-r--r--drivers/gpu/drm/xe/xe_configfs.c2
-rw-r--r--drivers/gpu/drm/xe/xe_device_sysfs.c10
-rw-r--r--drivers/gpu/drm/xe/xe_device_types.h6
-rw-r--r--drivers/gpu/drm/xe/xe_dma_buf.c2
-rw-r--r--drivers/gpu/drm/xe/xe_exec.c9
-rw-r--r--drivers/gpu/drm/xe/xe_exec_queue.c22
-rw-r--r--drivers/gpu/drm/xe/xe_exec_queue_types.h8
-rw-r--r--drivers/gpu/drm/xe/xe_execlist.c25
-rw-r--r--drivers/gpu/drm/xe/xe_execlist_types.h2
-rw-r--r--drivers/gpu/drm/xe/xe_gen_wa_oob.c10
-rw-r--r--drivers/gpu/drm/xe/xe_gt.c3
-rw-r--r--drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c1
-rw-r--r--drivers/gpu/drm/xe/xe_guc.c6
-rw-r--r--drivers/gpu/drm/xe/xe_guc_exec_queue_types.h4
-rw-r--r--drivers/gpu/drm/xe/xe_guc_submit.c118
-rw-r--r--drivers/gpu/drm/xe/xe_guc_submit.h2
-rw-r--r--drivers/gpu/drm/xe/xe_hwmon.c64
-rw-r--r--drivers/gpu/drm/xe/xe_migrate.c44
-rw-r--r--drivers/gpu/drm/xe/xe_nvm.c5
-rw-r--r--drivers/gpu/drm/xe/xe_pci_sriov.c22
-rw-r--r--drivers/gpu/drm/xe/xe_pm.c42
-rw-r--r--drivers/gpu/drm/xe/xe_pxp_submit.c2
-rw-r--r--drivers/gpu/drm/xe/xe_shrinker.c51
-rw-r--r--drivers/gpu/drm/xe/xe_survivability_mode.c3
-rw-r--r--drivers/gpu/drm/xe/xe_sync.c2
-rw-r--r--drivers/gpu/drm/xe/xe_tile_sysfs.c12
-rw-r--r--drivers/gpu/drm/xe/xe_vm.c102
-rw-r--r--drivers/gpu/drm/xe/xe_vm.h19
-rw-r--r--drivers/gpu/drm/xe/xe_vm_types.h5
-rw-r--r--drivers/gpu/drm/xe/xe_wa_oob.rules3
-rw-r--r--drivers/gpu/nova-core/Kconfig1
-rw-r--r--drivers/hid/Kconfig4
-rw-r--r--drivers/hid/amd-sfh-hid/amd_sfh_client.c12
-rw-r--r--drivers/hid/amd-sfh-hid/amd_sfh_common.h3
-rw-r--r--drivers/hid/amd-sfh-hid/amd_sfh_pcie.c4
-rw-r--r--drivers/hid/hid-asus.c11
-rw-r--r--drivers/hid/hid-cp2112.c10
-rw-r--r--drivers/hid/hid-elecom.c2
-rw-r--r--drivers/hid/hid-ids.h4
-rw-r--r--drivers/hid/hid-input-test.c10
-rw-r--r--drivers/hid/hid-input.c51
-rw-r--r--drivers/hid/hid-lenovo.c4
-rw-r--r--drivers/hid/hid-logitech-dj.c4
-rw-r--r--drivers/hid/hid-logitech-hidpp.c2
-rw-r--r--drivers/hid/hid-mcp2221.c4
-rw-r--r--drivers/hid/hid-multitouch.c8
-rw-r--r--drivers/hid/hid-ntrig.c3
-rw-r--r--drivers/hid/hid-quirks.c3
-rw-r--r--drivers/hid/intel-ish-hid/ipc/pci-ish.c3
-rw-r--r--drivers/hid/intel-ish-hid/ishtp-hid-client.c3
-rw-r--r--drivers/hid/intel-ish-hid/ishtp/bus.c3
-rw-r--r--drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h3
-rw-r--r--drivers/hid/intel-thc-hid/intel-quicki2c/pci-quicki2c.c3
-rw-r--r--drivers/hid/intel-thc-hid/intel-quicki2c/quicki2c-dev.h4
-rw-r--r--drivers/hid/intel-thc-hid/intel-quickspi/pci-quickspi.c2
-rw-r--r--drivers/hid/intel-thc-hid/intel-quickspi/quickspi-dev.h2
-rw-r--r--drivers/hid/intel-thc-hid/intel-thc/intel-thc-dev.c4
-rw-r--r--drivers/hid/wacom_wac.c1
-rw-r--r--drivers/hwmon/ina238.c9
-rw-r--r--drivers/hwmon/mlxreg-fan.c5
-rw-r--r--drivers/hwtracing/coresight/coresight-trbe.c3
-rw-r--r--drivers/i2c/busses/i2c-i801.c2
-rw-r--r--drivers/i2c/busses/i2c-riic.c2
-rw-r--r--drivers/i2c/busses/i2c-rtl9300.c41
-rw-r--r--drivers/idle/intel_idle.c2
-rw-r--r--drivers/iio/accel/sca3300.c2
-rw-r--r--drivers/iio/adc/Kconfig2
-rw-r--r--drivers/iio/adc/ad7124.c14
-rw-r--r--drivers/iio/adc/ad7173.c87
-rw-r--r--drivers/iio/adc/ad7380.c1
-rw-r--r--drivers/iio/adc/rzg2l_adc.c33
-rw-r--r--drivers/iio/imu/inv_icm42600/inv_icm42600_temp.c6
-rw-r--r--drivers/iio/light/as73211.c2
-rw-r--r--drivers/iio/pressure/bmp280-core.c9
-rw-r--r--drivers/iio/proximity/isl29501.c16
-rw-r--r--drivers/iio/temperature/maxim_thermocouple.c26
-rw-r--r--drivers/infiniband/core/umem_odp.c4
-rw-r--r--drivers/infiniband/hw/bnxt_re/ib_verbs.c8
-rw-r--r--drivers/infiniband/hw/bnxt_re/main.c23
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_fp.c30
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_fp.h2
-rw-r--r--drivers/infiniband/hw/bnxt_re/qplib_res.c2
-rw-r--r--drivers/infiniband/hw/erdma/erdma_verbs.c6
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_hw_v2.c6
-rw-r--r--drivers/infiniband/hw/hns/hns_roce_restrack.c9
-rw-r--r--drivers/infiniband/hw/mlx5/devx.c1
-rw-r--r--drivers/infiniband/sw/rxe/rxe_net.c29
-rw-r--r--drivers/infiniband/sw/rxe/rxe_qp.c2
-rw-r--r--drivers/input/joystick/xpad.c2
-rw-r--r--drivers/input/keyboard/mtk-pmic-keys.c5
-rw-r--r--drivers/input/misc/iqs7222.c3
-rw-r--r--drivers/input/serio/i8042-acpipnpio.h14
-rw-r--r--drivers/iommu/amd/amd_iommu_types.h1
-rw-r--r--drivers/iommu/amd/init.c13
-rw-r--r--drivers/iommu/amd/io_pgtable.c25
-rw-r--r--drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c2
-rw-r--r--drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c8
-rw-r--r--drivers/iommu/intel/iommu.c7
-rw-r--r--drivers/iommu/iommufd/device.c3
-rw-r--r--drivers/iommu/iommufd/eventq.c9
-rw-r--r--drivers/iommu/iommufd/iommufd_private.h3
-rw-r--r--drivers/iommu/iommufd/main.c59
-rw-r--r--drivers/iommu/iommufd/viommu.c4
-rw-r--r--drivers/iommu/riscv/iommu.c2
-rw-r--r--drivers/iommu/s390-iommu.c29
-rw-r--r--drivers/iommu/virtio-iommu.c15
-rw-r--r--drivers/irqchip/irq-atmel-aic.c2
-rw-r--r--drivers/irqchip/irq-atmel-aic5.c2
-rw-r--r--drivers/irqchip/irq-gic-v5-irs.c9
-rw-r--r--drivers/irqchip/irq-mvebu-gicp.c2
-rw-r--r--drivers/isdn/hardware/mISDN/hfcpci.c12
-rw-r--r--drivers/isdn/mISDN/dsp_hwec.c6
-rw-r--r--drivers/md/dm-integrity.c2
-rw-r--r--drivers/md/dm-raid.c6
-rw-r--r--drivers/md/dm-stripe.c10
-rw-r--r--drivers/md/md-cluster.c4
-rw-r--r--drivers/md/md-linear.c1
-rw-r--r--drivers/md/md.c127
-rw-r--r--drivers/md/raid0.c1
-rw-r--r--drivers/md/raid1.c3
-rw-r--r--drivers/md/raid10.c1
-rw-r--r--drivers/md/raid5.c1
-rw-r--r--drivers/media/i2c/alvium-csi2.c1
-rw-r--r--drivers/media/i2c/ccs/ccs-core.c7
-rw-r--r--drivers/media/i2c/dw9768.c1
-rw-r--r--drivers/media/i2c/gc0308.c3
-rw-r--r--drivers/media/i2c/gc2145.c3
-rw-r--r--drivers/media/i2c/imx219.c2
-rw-r--r--drivers/media/i2c/imx283.c3
-rw-r--r--drivers/media/i2c/imx290.c3
-rw-r--r--drivers/media/i2c/imx296.c1
-rw-r--r--drivers/media/i2c/imx415.c1
-rw-r--r--drivers/media/i2c/mt9m114.c6
-rw-r--r--drivers/media/i2c/ov4689.c3
-rw-r--r--drivers/media/i2c/ov5640.c4
-rw-r--r--drivers/media/i2c/ov5645.c3
-rw-r--r--drivers/media/i2c/ov64a40.c7
-rw-r--r--drivers/media/i2c/ov8858.c2
-rw-r--r--drivers/media/i2c/st-mipid02.c2
-rw-r--r--drivers/media/i2c/tc358746.c5
-rw-r--r--drivers/media/i2c/thp7312.c4
-rw-r--r--drivers/media/i2c/vd55g1.c4
-rw-r--r--drivers/media/i2c/vd56g3.c4
-rw-r--r--drivers/media/i2c/video-i2c.c4
-rw-r--r--drivers/media/platform/chips-media/wave5/wave5-vpu-dec.c4
-rw-r--r--drivers/media/platform/chips-media/wave5/wave5-vpu-enc.c5
-rw-r--r--drivers/media/platform/nvidia/tegra-vde/h264.c2
-rw-r--r--drivers/media/platform/qcom/iris/iris_hfi_queue.c1
-rw-r--r--drivers/media/platform/raspberrypi/pisp_be/pisp_be.c2
-rw-r--r--drivers/media/platform/rockchip/rkvdec/rkvdec.c17
-rw-r--r--drivers/media/platform/verisilicon/hantro_drv.c1
-rw-r--r--drivers/media/rc/gpio-ir-recv.c4
-rw-r--r--drivers/memstick/core/memstick.c1
-rw-r--r--drivers/memstick/host/rtsx_usb_ms.c1
-rw-r--r--drivers/misc/ibmasm/ibmasmfs.c2
-rw-r--r--drivers/misc/lkdtm/cfi.c2
-rw-r--r--drivers/misc/lkdtm/fortify.c6
-rw-r--r--drivers/mmc/host/mvsdio.c2
-rw-r--r--drivers/mmc/host/sdhci-of-arasan.c33
-rw-r--r--drivers/mmc/host/sdhci-pci-gli.c105
-rw-r--r--drivers/mmc/host/sdhci-uhs2.c3
-rw-r--r--drivers/mmc/host/sdhci.c34
-rw-r--r--drivers/mmc/host/sdhci_am654.c18
-rw-r--r--drivers/most/core.c2
-rw-r--r--drivers/mtd/devices/Kconfig4
-rw-r--r--drivers/mtd/nand/raw/atmel/nand-controller.c16
-rw-r--r--drivers/mtd/nand/raw/nuvoton-ma35d1-nand-controller.c4
-rw-r--r--drivers/mtd/nand/raw/stm32_fmc2_nand.c46
-rw-r--r--drivers/mtd/nand/spi/winbond.c37
-rw-r--r--drivers/net/Kconfig15
-rw-r--r--drivers/net/bonding/bond_3ad.c67
-rw-r--r--drivers/net/bonding/bond_main.c2
-rw-r--r--drivers/net/bonding/bond_options.c1
-rw-r--r--drivers/net/can/rcar/rcar_can.c8
-rw-r--r--drivers/net/can/rcar/rcar_canfd.c7
-rw-r--r--drivers/net/can/spi/hi311x.c34
-rw-r--r--drivers/net/can/sun4i_can.c1
-rw-r--r--drivers/net/can/usb/etas_es58x/es58x_core.c3
-rw-r--r--drivers/net/can/usb/mcba_usb.c1
-rw-r--r--drivers/net/can/usb/peak_usb/pcan_usb_core.c2
-rw-r--r--drivers/net/can/xilinx_can.c16
-rw-r--r--drivers/net/dsa/b53/b53_common.c19
-rw-r--r--drivers/net/dsa/lantiq_gswip.c21
-rw-r--r--drivers/net/dsa/microchip/ksz8.c20
-rw-r--r--drivers/net/dsa/microchip/ksz_common.c7
-rw-r--r--drivers/net/dsa/mv88e6xxx/leds.c17
-rw-r--r--drivers/net/ethernet/airoha/airoha_ppe.c4
-rw-r--r--drivers/net/ethernet/broadcom/bnxt/bnxt.c61
-rw-r--r--drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c2
-rw-r--r--drivers/net/ethernet/broadcom/cnic.c3
-rw-r--r--drivers/net/ethernet/cadence/macb_main.c42
-rw-r--r--drivers/net/ethernet/cavium/liquidio/request_manager.c2
-rw-r--r--drivers/net/ethernet/cavium/thunder/thunder_bgx.c20
-rw-r--r--drivers/net/ethernet/dlink/Kconfig20
-rw-r--r--drivers/net/ethernet/dlink/Makefile1
-rw-r--r--drivers/net/ethernet/dlink/dl2k.c2
-rw-r--r--drivers/net/ethernet/dlink/sundance.c1990
-rw-r--r--drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c2
-rw-r--r--drivers/net/ethernet/freescale/fec_main.c3
-rw-r--r--drivers/net/ethernet/google/gve/gve_main.c2
-rw-r--r--drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c14
-rw-r--r--drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c15
-rw-r--r--drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h7
-rw-r--r--drivers/net/ethernet/intel/e1000e/ethtool.c10
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e.h3
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h1
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_client.c4
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_common.c34
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_debugfs.c123
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_main.c44
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_prototype.h2
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_txrx.c3
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c110
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h3
-rw-r--r--drivers/net/ethernet/intel/ice/ice.h1
-rw-r--r--drivers/net/ethernet/intel/ice/ice_adapter.c49
-rw-r--r--drivers/net/ethernet/intel/ice/ice_adapter.h4
-rw-r--r--drivers/net/ethernet/intel/ice/ice_ddp.c44
-rw-r--r--drivers/net/ethernet/intel/ice/ice_idc.c10
-rw-r--r--drivers/net/ethernet/intel/ice/ice_main.c28
-rw-r--r--drivers/net/ethernet/intel/ice/ice_ptp.c13
-rw-r--r--drivers/net/ethernet/intel/ice/ice_txrx.c82
-rw-r--r--drivers/net/ethernet/intel/ice/ice_txrx.h1
-rw-r--r--drivers/net/ethernet/intel/idpf/idpf_idc.c4
-rw-r--r--drivers/net/ethernet/intel/idpf/idpf_lib.c9
-rw-r--r--drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c61
-rw-r--r--drivers/net/ethernet/intel/idpf/idpf_txrx.c723
-rw-r--r--drivers/net/ethernet/intel/idpf/idpf_txrx.h87
-rw-r--r--drivers/net/ethernet/intel/idpf/idpf_virtchnl.c12
-rw-r--r--drivers/net/ethernet/intel/igb/igb_ethtool.c5
-rw-r--r--drivers/net/ethernet/intel/igb/igb_main.c3
-rw-r--r--drivers/net/ethernet/intel/igc/igc.h1
-rw-r--r--drivers/net/ethernet/intel/igc/igc_main.c26
-rw-r--r--drivers/net/ethernet/intel/ixgbe/devlink/devlink.c1
-rw-r--r--drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c2
-rw-r--r--drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c4
-rw-r--r--drivers/net/ethernet/intel/ixgbe/ixgbe_main.c56
-rw-r--r--drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h2
-rw-r--r--drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c4
-rw-r--r--drivers/net/ethernet/intel/libie/adminq.c2
-rw-r--r--drivers/net/ethernet/marvell/octeon_ep/octep_main.c16
-rw-r--r--drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c3
-rw-r--r--drivers/net/ethernet/marvell/octeontx2/af/cgx.c10
-rw-r--r--drivers/net/ethernet/marvell/octeontx2/af/rvu.h14
-rw-r--r--drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c4
-rw-r--r--drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c4
-rw-r--r--drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h1
-rw-r--r--drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c3
-rw-r--r--drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c2
-rw-r--r--drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c2
-rw-r--r--drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c10
-rw-r--r--drivers/net/ethernet/marvell/octeontx2/nic/rep.c13
-rw-r--r--drivers/net/ethernet/marvell/octeontx2/nic/rep.h1
-rw-r--r--drivers/net/ethernet/mediatek/mtk_eth_soc.c10
-rw-r--r--drivers/net/ethernet/mediatek/mtk_ppe_offload.c2
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/en_rx.c4
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/devlink.c2
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h1
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/fs.h1
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c21
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h12
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_hmfs.c2
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h1
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c3
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c12
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_main.c17
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_rep.c27
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_stats.c1
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c4
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c184
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/eswitch.h5
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/fs_core.c21
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/fs_core.h1
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c25
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c126
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h1
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h15
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h2
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/port.c26
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c10
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h6
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c6
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c81
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c41
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c1
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h1
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c12
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.c8
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c5
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h4
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c6
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c1
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c1
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c13
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.h3
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/spectrum.c2
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/trap.h1
-rw-r--r--drivers/net/ethernet/meta/fbnic/fbnic_netdev.c4
-rw-r--r--drivers/net/ethernet/meta/fbnic/fbnic_pci.c15
-rw-r--r--drivers/net/ethernet/microchip/lan865x/lan865x.c28
-rw-r--r--drivers/net/ethernet/natsemi/ns83820.c13
-rw-r--r--drivers/net/ethernet/oa_tc6.c3
-rw-r--r--drivers/net/ethernet/qlogic/qed/qed_debug.c7
-rw-r--r--drivers/net/ethernet/realtek/rtase/rtase.h2
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c13
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c6
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c23
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c13
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c9
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/stmmac_main.c6
-rw-r--r--drivers/net/ethernet/ti/am65-cpsw-nuss.c2
-rw-r--r--drivers/net/ethernet/ti/icssg/icss_iep.c3
-rw-r--r--drivers/net/ethernet/ti/icssg/icssg_prueth.c98
-rw-r--r--drivers/net/ethernet/wangxun/libwx/wx_hw.c4
-rw-r--r--drivers/net/ethernet/wangxun/libwx/wx_vf_lib.c2
-rw-r--r--drivers/net/ethernet/xilinx/xilinx_axienet_main.c18
-rw-r--r--drivers/net/ethernet/xircom/xirc2ps_cs.c2
-rw-r--r--drivers/net/hamradio/bpqether.c2
-rw-r--r--drivers/net/hyperv/hyperv_net.h3
-rw-r--r--drivers/net/hyperv/netvsc.c17
-rw-r--r--drivers/net/hyperv/netvsc_drv.c29
-rw-r--r--drivers/net/hyperv/rndis_filter.c23
-rw-r--r--drivers/net/macsec.c9
-rw-r--r--drivers/net/mctp/mctp-usb.c1
-rw-r--r--drivers/net/netdevsim/netdev.c10
-rw-r--r--drivers/net/pcs/pcs-rzn1-miic.c2
-rw-r--r--drivers/net/phy/Kconfig2
-rw-r--r--drivers/net/phy/bcm-phy-ptp.c6
-rw-r--r--drivers/net/phy/mdio_bus.c1
-rw-r--r--drivers/net/phy/mdio_bus_provider.c3
-rw-r--r--drivers/net/phy/mscc/mscc.h16
-rw-r--r--drivers/net/phy/mscc/mscc_main.c10
-rw-r--r--drivers/net/phy/mscc/mscc_ptp.c101
-rw-r--r--drivers/net/phy/nxp-c45-tja11xx.c23
-rw-r--r--drivers/net/phy/phy.c12
-rw-r--r--drivers/net/phy/phy_device.c5
-rw-r--r--drivers/net/phy/phylink.c131
-rw-r--r--drivers/net/phy/sfp.c3
-rw-r--r--drivers/net/ppp/ppp_generic.c23
-rw-r--r--drivers/net/pse-pd/pd692x0.c63
-rw-r--r--drivers/net/tun.c3
-rw-r--r--drivers/net/usb/asix_devices.c1
-rw-r--r--drivers/net/usb/cdc_ncm.c7
-rw-r--r--drivers/net/usb/qmi_wwan.c4
-rw-r--r--drivers/net/virtio_net.c7
-rw-r--r--drivers/net/vxlan/vxlan_core.c18
-rw-r--r--drivers/net/vxlan/vxlan_private.h4
-rw-r--r--drivers/net/wan/lapbether.c2
-rw-r--r--drivers/net/wireless/ath/ath11k/core.h2
-rw-r--r--drivers/net/wireless/ath/ath11k/mac.c111
-rw-r--r--drivers/net/wireless/ath/ath12k/mac.c122
-rw-r--r--drivers/net/wireless/ath/ath12k/wmi.c3
-rw-r--r--drivers/net/wireless/broadcom/brcm80211/brcmfmac/btcoex.c6
-rw-r--r--drivers/net/wireless/intel/iwlwifi/fw/acpi.c25
-rw-r--r--drivers/net/wireless/intel/iwlwifi/fw/runtime.h8
-rw-r--r--drivers/net/wireless/intel/iwlwifi/fw/uefi.c6
-rw-r--r--drivers/net/wireless/intel/iwlwifi/pcie/drv.c48
-rw-r--r--drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/tx.c3
-rw-r--r--drivers/net/wireless/marvell/libertas/cfg.c9
-rw-r--r--drivers/net/wireless/marvell/mwifiex/cfg80211.c5
-rw-r--r--drivers/net/wireless/marvell/mwifiex/main.c4
-rw-r--r--drivers/net/wireless/mediatek/mt76/mac80211.c43
-rw-r--r--drivers/net/wireless/mediatek/mt76/mt76.h1
-rw-r--r--drivers/net/wireless/mediatek/mt76/mt7915/mac.c12
-rw-r--r--drivers/net/wireless/mediatek/mt76/mt7921/main.c5
-rw-r--r--drivers/net/wireless/mediatek/mt76/mt7925/mac.c2
-rw-r--r--drivers/net/wireless/mediatek/mt76/mt7925/main.c7
-rw-r--r--drivers/net/wireless/mediatek/mt76/mt7925/mcu.c12
-rw-r--r--drivers/net/wireless/mediatek/mt76/mt7996/mac.c60
-rw-r--r--drivers/net/wireless/mediatek/mt76/mt7996/main.c5
-rw-r--r--drivers/net/wireless/mediatek/mt76/mt7996/mcu.c15
-rw-r--r--drivers/net/wireless/mediatek/mt76/mt7996/mt7996.h1
-rw-r--r--drivers/net/wireless/mediatek/mt76/tx.c12
-rw-r--r--drivers/net/wireless/microchip/wilc1000/wlan_cfg.c37
-rw-r--r--drivers/net/wireless/microchip/wilc1000/wlan_cfg.h5
-rw-r--r--drivers/net/wireless/ralink/rt2x00/Kconfig4
-rw-r--r--drivers/net/wireless/st/cw1200/sta.c2
-rw-r--r--drivers/net/wireless/virtual/virt_wifi.c4
-rw-r--r--drivers/nvme/host/core.c18
-rw-r--r--drivers/of/device.c4
-rw-r--r--drivers/of/dynamic.c9
-rw-r--r--drivers/of/of_numa.c5
-rw-r--r--drivers/of/of_reserved_mem.c17
-rw-r--r--drivers/pci/controller/pci-mvebu.c21
-rw-r--r--drivers/pci/controller/pcie-xilinx.c2
-rw-r--r--drivers/pci/controller/vmd.c3
-rw-r--r--drivers/pci/pci.c6
-rw-r--r--drivers/pcmcia/Kconfig3
-rw-r--r--drivers/pcmcia/Makefile1
-rw-r--r--drivers/pcmcia/cs.c17
-rw-r--r--drivers/pcmcia/cs_internal.h1
-rw-r--r--drivers/pcmcia/ds.c2
-rw-r--r--drivers/pcmcia/omap_cf.c10
-rw-r--r--drivers/pcmcia/rsrc_iodyn.c168
-rw-r--r--drivers/pcmcia/rsrc_nonstatic.c4
-rw-r--r--drivers/pcmcia/socket_sysfs.c5
-rw-r--r--drivers/perf/Kconfig9
-rw-r--r--drivers/perf/Makefile1
-rw-r--r--drivers/perf/arm-ccn.c2
-rw-r--r--drivers/perf/arm-cmn.c9
-rw-r--r--drivers/perf/arm_pmuv3.c29
-rw-r--r--drivers/perf/arm_spe_pmu.c114
-rw-r--r--drivers/perf/dwc_pcie_pmu.c161
-rw-r--r--drivers/perf/fsl_imx9_ddr_perf.c6
-rw-r--r--drivers/perf/fujitsu_uncore_pmu.c613
-rw-r--r--drivers/perf/hisilicon/Makefile3
-rw-r--r--drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c528
-rw-r--r--drivers/perf/hisilicon/hisi_uncore_mn_pmu.c411
-rw-r--r--drivers/perf/hisilicon/hisi_uncore_noc_pmu.c443
-rw-r--r--drivers/perf/hisilicon/hisi_uncore_pmu.c5
-rw-r--r--drivers/perf/hisilicon/hisi_uncore_pmu.h6
-rw-r--r--drivers/perf/riscv_pmu_sbi.c8
-rw-r--r--drivers/phy/qualcomm/phy-qcom-eusb2-repeater.c4
-rw-r--r--drivers/phy/qualcomm/phy-qcom-qmp-pcie.c25
-rw-r--r--drivers/phy/tegra/xusb-tegra210.c6
-rw-r--r--drivers/phy/ti/phy-gmii-sel.c47
-rw-r--r--drivers/phy/ti/phy-omap-usb2.c13
-rw-r--r--drivers/phy/ti/phy-ti-pipe3.c13
-rw-r--r--drivers/pinctrl/Kconfig1
-rw-r--r--drivers/pinctrl/mediatek/pinctrl-airoha.c39
-rw-r--r--drivers/pinctrl/meson/pinctrl-amlogic-a4.c2
-rw-r--r--drivers/platform/x86/acer-wmi.c71
-rw-r--r--drivers/platform/x86/amd/hfi/hfi.c14
-rw-r--r--drivers/platform/x86/amd/hsmp/acpi.c2
-rw-r--r--drivers/platform/x86/amd/hsmp/hsmp.c5
-rw-r--r--drivers/platform/x86/amd/pmc/pmc-quirks.c83
-rw-r--r--drivers/platform/x86/amd/pmc/pmc.c13
-rw-r--r--drivers/platform/x86/amd/pmf/core.c1
-rw-r--r--drivers/platform/x86/asus-nb-wmi.c28
-rw-r--r--drivers/platform/x86/asus-wmi.c9
-rw-r--r--drivers/platform/x86/asus-wmi.h3
-rw-r--r--drivers/platform/x86/dell/dell-lis3lv02d.c1
-rw-r--r--drivers/platform/x86/dell/dell-pc.c9
-rw-r--r--drivers/platform/x86/dell/dell-smbios-base.c19
-rw-r--r--drivers/platform/x86/dell/dell-smbios-smm.c3
-rw-r--r--drivers/platform/x86/dell/dell-smbios-wmi.c4
-rw-r--r--drivers/platform/x86/dell/dell-smbios.h2
-rw-r--r--drivers/platform/x86/hp/hp-wmi.c8
-rw-r--r--drivers/platform/x86/intel/int3472/discrete.c6
-rw-r--r--drivers/platform/x86/intel/pmc/core.c1
-rw-r--r--drivers/platform/x86/intel/speed_select_if/isst_if_common.c2
-rw-r--r--drivers/platform/x86/intel/tpmi_power_domains.c4
-rw-r--r--drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c5
-rw-r--r--drivers/platform/x86/lg-laptop.c34
-rw-r--r--drivers/platform/x86/oxpec.c14
-rw-r--r--drivers/pmdomain/core.c20
-rw-r--r--drivers/pmdomain/mediatek/mt8195-pm-domains.h1
-rw-r--r--drivers/pmdomain/renesas/rcar-gen4-sysc.c1
-rw-r--r--drivers/pmdomain/renesas/rcar-sysc.c3
-rw-r--r--drivers/pmdomain/renesas/rmobile-sysc.c3
-rw-r--r--drivers/pmdomain/rockchip/pm-domains.c2
-rw-r--r--drivers/power/supply/bq27xxx_battery.c4
-rw-r--r--drivers/ps3/ps3stor_lib.c3
-rw-r--r--drivers/ptp/ptp_ocp.c3
-rw-r--r--drivers/ptp/ptp_private.h5
-rw-r--r--drivers/ptp/ptp_vclock.c7
-rw-r--r--drivers/ras/ras.c1
-rw-r--r--drivers/regulator/pca9450-regulator.c13
-rw-r--r--drivers/regulator/qcom-pm8008-regulator.c2
-rw-r--r--drivers/regulator/sy7636a-regulator.c7
-rw-r--r--drivers/regulator/tps65219-regulator.c12
-rw-r--r--drivers/reset/reset-eyeq.c11
-rw-r--r--drivers/s390/block/Kconfig12
-rw-r--r--drivers/s390/block/dcssblk.c35
-rw-r--r--drivers/s390/char/Makefile1
-rw-r--r--drivers/s390/char/hmcdrv_dev.c19
-rw-r--r--drivers/s390/char/sclp.c11
-rw-r--r--drivers/s390/char/sclp_cmd.c478
-rw-r--r--drivers/s390/char/sclp_mem.c399
-rw-r--r--drivers/s390/char/tape_3590.c2
-rw-r--r--drivers/s390/crypto/zcrypt_ep11misc.c4
-rw-r--r--drivers/scsi/fnic/fnic.h2
-rw-r--r--drivers/scsi/lpfc/lpfc_debugfs.c1
-rw-r--r--drivers/scsi/lpfc/lpfc_nvmet.c10
-rw-r--r--drivers/scsi/lpfc/lpfc_vport.c2
-rw-r--r--drivers/scsi/qla4xxx/ql4_os.c2
-rw-r--r--drivers/scsi/scsi_debug.c91
-rw-r--r--drivers/scsi/scsi_sysfs.c4
-rw-r--r--drivers/scsi/sr.c16
-rw-r--r--drivers/soc/qcom/mdt_loader.c12
-rw-r--r--drivers/soc/qcom/ubwc_config.c23
-rw-r--r--drivers/soc/tegra/pmc.c51
-rw-r--r--drivers/spi/spi-cadence-quadspi.c86
-rw-r--r--drivers/spi/spi-fsl-lpspi.c55
-rw-r--r--drivers/spi/spi-mem.c4
-rw-r--r--drivers/spi/spi-microchip-core-qspi.c12
-rw-r--r--drivers/spi/spi-omap2-mcspi.c1
-rw-r--r--drivers/spi/spi-qpic-snand.c28
-rw-r--r--drivers/spi/spi-st-ssc4.c10
-rw-r--r--drivers/tee/optee/ffa_abi.c4
-rw-r--r--drivers/tee/tee_shm.c14
-rw-r--r--drivers/tty/hvc/hvc_console.c6
-rw-r--r--drivers/tty/serial/8250/8250_rsa.c8
-rw-r--r--drivers/tty/serial/sc16is7xx.c14
-rw-r--r--drivers/tty/serial/xilinx_uartps.c10
-rw-r--r--drivers/ufs/core/ufs-mcq.c4
-rw-r--r--drivers/ufs/core/ufshcd.c88
-rw-r--r--drivers/ufs/host/ufs-mediatek.c2
-rw-r--r--drivers/ufs/host/ufs-qcom.c39
-rw-r--r--drivers/ufs/host/ufshcd-pci.c1
-rw-r--r--drivers/usb/chipidea/ci_hdrc_imx.c3
-rw-r--r--drivers/usb/chipidea/usbmisc_imx.c23
-rw-r--r--drivers/usb/core/driver.c4
-rw-r--r--drivers/usb/core/hcd.c28
-rw-r--r--drivers/usb/core/quirks.c1
-rw-r--r--drivers/usb/dwc3/dwc3-pci.c2
-rw-r--r--drivers/usb/dwc3/ep0.c20
-rw-r--r--drivers/usb/dwc3/gadget.c19
-rw-r--r--drivers/usb/gadget/function/f_fs.c2
-rw-r--r--drivers/usb/gadget/function/f_midi2.c11
-rw-r--r--drivers/usb/gadget/legacy/inode.c2
-rw-r--r--drivers/usb/gadget/udc/dummy_hcd.c8
-rw-r--r--drivers/usb/gadget/udc/tegra-xudc.c9
-rw-r--r--drivers/usb/host/xhci-dbgcap.c94
-rw-r--r--drivers/usb/host/xhci-hub.c3
-rw-r--r--drivers/usb/host/xhci-mem.c22
-rw-r--r--drivers/usb/host/xhci-pci-renesas.c7
-rw-r--r--drivers/usb/host/xhci-ring.c9
-rw-r--r--drivers/usb/host/xhci.c23
-rw-r--r--drivers/usb/host/xhci.h3
-rw-r--r--drivers/usb/serial/option.c17
-rw-r--r--drivers/usb/storage/realtek_cr.c2
-rw-r--r--drivers/usb/storage/unusual_devs.h29
-rw-r--r--drivers/usb/typec/tcpm/fusb302.c12
-rw-r--r--drivers/usb/typec/tcpm/maxim_contaminant.c58
-rw-r--r--drivers/usb/typec/tcpm/tcpci_maxim.h1
-rw-r--r--drivers/usb/typec/tcpm/tcpm.c12
-rw-r--r--drivers/vhost/net.c49
-rw-r--r--drivers/vhost/scsi.c2
-rw-r--r--drivers/video/fbdev/core/fbcon.c13
-rw-r--r--drivers/virt/coco/efi_secret/Kconfig2
-rw-r--r--drivers/virt/coco/sev-guest/sev-guest.c27
-rw-r--r--drivers/virtio/virtio_input.c4
-rw-r--r--drivers/virtio/virtio_pci_legacy_dev.c4
-rw-r--r--drivers/virtio/virtio_pci_modern_dev.c4
-rw-r--r--drivers/xen/balloon.c4
-rw-r--r--drivers/xen/events/events_base.c37
-rw-r--r--drivers/xen/gntdev-dmabuf.c7
-rw-r--r--drivers/xen/gntdev-dmabuf.h2
-rw-r--r--drivers/xen/gntdev.c33
-rw-r--r--drivers/xen/grant-table.c6
-rw-r--r--drivers/xen/manage.c14
-rw-r--r--drivers/xen/privcmd.c14
-rw-r--r--drivers/xen/unpopulated-alloc.c4
-rw-r--r--drivers/xen/xenbus/xenbus_client.c2
-rw-r--r--drivers/xen/xenbus/xenbus_xs.c23
-rw-r--r--drivers/zorro/names.c12
-rw-r--r--fs/9p/vfs_super.c2
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/Kconfig.binfmt9
-rw-r--r--fs/Makefile1
-rw-r--r--fs/afs/callback.c4
-rw-r--r--fs/afs/dir.c223
-rw-r--r--fs/afs/dir_edit.c18
-rw-r--r--fs/afs/dir_silly.c11
-rw-r--r--fs/afs/inode.c4
-rw-r--r--fs/afs/internal.h15
-rw-r--r--fs/afs/main.c4
-rw-r--r--fs/afs/misc.c1
-rw-r--r--fs/afs/protocol_yfs.h3
-rw-r--r--fs/afs/rotate.c17
-rw-r--r--fs/afs/server.c3
-rw-r--r--fs/afs/write.c2
-rw-r--r--fs/afs/yfsclient.c249
-rw-r--r--fs/aio.c2
-rw-r--r--fs/anon_inodes.c2
-rw-r--r--fs/bcachefs/Kconfig121
-rw-r--r--fs/bcachefs/Makefile107
-rw-r--r--fs/bcachefs/acl.c445
-rw-r--r--fs/bcachefs/acl.h60
-rw-r--r--fs/bcachefs/alloc_background.c2680
-rw-r--r--fs/bcachefs/alloc_background.h361
-rw-r--r--fs/bcachefs/alloc_background_format.h95
-rw-r--r--fs/bcachefs/alloc_foreground.c1683
-rw-r--r--fs/bcachefs/alloc_foreground.h318
-rw-r--r--fs/bcachefs/alloc_types.h121
-rw-r--r--fs/bcachefs/async_objs.c132
-rw-r--r--fs/bcachefs/async_objs.h44
-rw-r--r--fs/bcachefs/async_objs_types.h25
-rw-r--r--fs/bcachefs/backpointers.c1391
-rw-r--r--fs/bcachefs/backpointers.h200
-rw-r--r--fs/bcachefs/bbpos.h37
-rw-r--r--fs/bcachefs/bbpos_types.h18
-rw-r--r--fs/bcachefs/bcachefs.h1295
-rw-r--r--fs/bcachefs/bcachefs_format.h1545
-rw-r--r--fs/bcachefs/bcachefs_ioctl.h473
-rw-r--r--fs/bcachefs/bkey.c1112
-rw-r--r--fs/bcachefs/bkey.h605
-rw-r--r--fs/bcachefs/bkey_buf.h61
-rw-r--r--fs/bcachefs/bkey_cmp.h129
-rw-r--r--fs/bcachefs/bkey_methods.c497
-rw-r--r--fs/bcachefs/bkey_methods.h139
-rw-r--r--fs/bcachefs/bkey_sort.c214
-rw-r--r--fs/bcachefs/bkey_sort.h54
-rw-r--r--fs/bcachefs/bkey_types.h241
-rw-r--r--fs/bcachefs/bset.c1576
-rw-r--r--fs/bcachefs/bset.h536
-rw-r--r--fs/bcachefs/btree_cache.c1516
-rw-r--r--fs/bcachefs/btree_cache.h157
-rw-r--r--fs/bcachefs/btree_gc.c1308
-rw-r--r--fs/bcachefs/btree_gc.h88
-rw-r--r--fs/bcachefs/btree_gc_types.h34
-rw-r--r--fs/bcachefs/btree_io.c2742
-rw-r--r--fs/bcachefs/btree_io.h239
-rw-r--r--fs/bcachefs/btree_iter.c3804
-rw-r--r--fs/bcachefs/btree_iter.h1010
-rw-r--r--fs/bcachefs/btree_journal_iter.c830
-rw-r--r--fs/bcachefs/btree_journal_iter.h102
-rw-r--r--fs/bcachefs/btree_journal_iter_types.h37
-rw-r--r--fs/bcachefs/btree_key_cache.c880
-rw-r--r--fs/bcachefs/btree_key_cache.h59
-rw-r--r--fs/bcachefs/btree_key_cache_types.h34
-rw-r--r--fs/bcachefs/btree_locking.c936
-rw-r--r--fs/bcachefs/btree_locking.h466
-rw-r--r--fs/bcachefs/btree_node_scan.c611
-rw-r--r--fs/bcachefs/btree_node_scan.h11
-rw-r--r--fs/bcachefs/btree_node_scan_types.h31
-rw-r--r--fs/bcachefs/btree_trans_commit.c1121
-rw-r--r--fs/bcachefs/btree_types.h937
-rw-r--r--fs/bcachefs/btree_update.c916
-rw-r--r--fs/bcachefs/btree_update.h429
-rw-r--r--fs/bcachefs/btree_update_interior.c2854
-rw-r--r--fs/bcachefs/btree_update_interior.h364
-rw-r--r--fs/bcachefs/btree_write_buffer.c893
-rw-r--r--fs/bcachefs/btree_write_buffer.h113
-rw-r--r--fs/bcachefs/btree_write_buffer_types.h59
-rw-r--r--fs/bcachefs/buckets.c1395
-rw-r--r--fs/bcachefs/buckets.h369
-rw-r--r--fs/bcachefs/buckets_types.h100
-rw-r--r--fs/bcachefs/buckets_waiting_for_journal.c174
-rw-r--r--fs/bcachefs/buckets_waiting_for_journal.h15
-rw-r--r--fs/bcachefs/buckets_waiting_for_journal_types.h23
-rw-r--r--fs/bcachefs/chardev.c843
-rw-r--r--fs/bcachefs/chardev.h31
-rw-r--r--fs/bcachefs/checksum.c698
-rw-r--r--fs/bcachefs/checksum.h240
-rw-r--r--fs/bcachefs/clock.c181
-rw-r--r--fs/bcachefs/clock.h29
-rw-r--r--fs/bcachefs/clock_types.h38
-rw-r--r--fs/bcachefs/compress.c773
-rw-r--r--fs/bcachefs/compress.h73
-rw-r--r--fs/bcachefs/darray.c38
-rw-r--r--fs/bcachefs/darray.h158
-rw-r--r--fs/bcachefs/data_update.c1021
-rw-r--r--fs/bcachefs/data_update.h93
-rw-r--r--fs/bcachefs/debug.c996
-rw-r--r--fs/bcachefs/debug.h50
-rw-r--r--fs/bcachefs/dirent.c766
-rw-r--r--fs/bcachefs/dirent.h119
-rw-r--r--fs/bcachefs/dirent_format.h58
-rw-r--r--fs/bcachefs/disk_accounting.c1074
-rw-r--r--fs/bcachefs/disk_accounting.h301
-rw-r--r--fs/bcachefs/disk_accounting_format.h225
-rw-r--r--fs/bcachefs/disk_accounting_types.h19
-rw-r--r--fs/bcachefs/disk_groups.c591
-rw-r--r--fs/bcachefs/disk_groups.h111
-rw-r--r--fs/bcachefs/disk_groups_format.h21
-rw-r--r--fs/bcachefs/disk_groups_types.h18
-rw-r--r--fs/bcachefs/ec.c2405
-rw-r--r--fs/bcachefs/ec.h309
-rw-r--r--fs/bcachefs/ec_format.h43
-rw-r--r--fs/bcachefs/ec_types.h35
-rw-r--r--fs/bcachefs/enumerated_ref.c144
-rw-r--r--fs/bcachefs/enumerated_ref.h66
-rw-r--r--fs/bcachefs/enumerated_ref_types.h19
-rw-r--r--fs/bcachefs/errcode.c73
-rw-r--r--fs/bcachefs/errcode.h387
-rw-r--r--fs/bcachefs/error.c771
-rw-r--r--fs/bcachefs/error.h258
-rw-r--r--fs/bcachefs/extent_update.c155
-rw-r--r--fs/bcachefs/extent_update.h12
-rw-r--r--fs/bcachefs/extents.c1735
-rw-r--r--fs/bcachefs/extents.h768
-rw-r--r--fs/bcachefs/extents_format.h304
-rw-r--r--fs/bcachefs/extents_types.h42
-rw-r--r--fs/bcachefs/eytzinger.c315
-rw-r--r--fs/bcachefs/eytzinger.h300
-rw-r--r--fs/bcachefs/fast_list.c156
-rw-r--r--fs/bcachefs/fast_list.h41
-rw-r--r--fs/bcachefs/fifo.h127
-rw-r--r--fs/bcachefs/fs-io-buffered.c1109
-rw-r--r--fs/bcachefs/fs-io-buffered.h27
-rw-r--r--fs/bcachefs/fs-io-direct.c704
-rw-r--r--fs/bcachefs/fs-io-direct.h16
-rw-r--r--fs/bcachefs/fs-io-pagecache.c827
-rw-r--r--fs/bcachefs/fs-io-pagecache.h176
-rw-r--r--fs/bcachefs/fs-io.c1102
-rw-r--r--fs/bcachefs/fs-io.h184
-rw-r--r--fs/bcachefs/fs-ioctl.c442
-rw-r--r--fs/bcachefs/fs-ioctl.h8
-rw-r--r--fs/bcachefs/fs.c2768
-rw-r--r--fs/bcachefs/fs.h215
-rw-r--r--fs/bcachefs/fsck.c3363
-rw-r--r--fs/bcachefs/fsck.h34
-rw-r--r--fs/bcachefs/inode.c1566
-rw-r--r--fs/bcachefs/inode.h319
-rw-r--r--fs/bcachefs/inode_format.h185
-rw-r--r--fs/bcachefs/io_misc.c570
-rw-r--r--fs/bcachefs/io_misc.h36
-rw-r--r--fs/bcachefs/io_read.c1543
-rw-r--r--fs/bcachefs/io_read.h216
-rw-r--r--fs/bcachefs/io_write.c1780
-rw-r--r--fs/bcachefs/io_write.h77
-rw-r--r--fs/bcachefs/io_write_types.h129
-rw-r--r--fs/bcachefs/journal.c1832
-rw-r--r--fs/bcachefs/journal.h465
-rw-r--r--fs/bcachefs/journal_io.c2242
-rw-r--r--fs/bcachefs/journal_io.h94
-rw-r--r--fs/bcachefs/journal_reclaim.c1037
-rw-r--r--fs/bcachefs/journal_reclaim.h84
-rw-r--r--fs/bcachefs/journal_sb.c232
-rw-r--r--fs/bcachefs/journal_sb.h24
-rw-r--r--fs/bcachefs/journal_seq_blacklist.c264
-rw-r--r--fs/bcachefs/journal_seq_blacklist.h23
-rw-r--r--fs/bcachefs/journal_seq_blacklist_format.h15
-rw-r--r--fs/bcachefs/journal_types.h342
-rw-r--r--fs/bcachefs/keylist.c50
-rw-r--r--fs/bcachefs/keylist.h72
-rw-r--r--fs/bcachefs/keylist_types.h16
-rw-r--r--fs/bcachefs/logged_ops.c119
-rw-r--r--fs/bcachefs/logged_ops.h20
-rw-r--r--fs/bcachefs/logged_ops_format.h35
-rw-r--r--fs/bcachefs/lru.c223
-rw-r--r--fs/bcachefs/lru.h70
-rw-r--r--fs/bcachefs/lru_format.h27
-rw-r--r--fs/bcachefs/mean_and_variance.c173
-rw-r--r--fs/bcachefs/mean_and_variance.h203
-rw-r--r--fs/bcachefs/mean_and_variance_test.c221
-rw-r--r--fs/bcachefs/migrate.c277
-rw-r--r--fs/bcachefs/migrate.h8
-rw-r--r--fs/bcachefs/move.c1494
-rw-r--r--fs/bcachefs/move.h165
-rw-r--r--fs/bcachefs/move_types.h46
-rw-r--r--fs/bcachefs/movinggc.c476
-rw-r--r--fs/bcachefs/movinggc.h20
-rw-r--r--fs/bcachefs/namei.c1034
-rw-r--r--fs/bcachefs/namei.h79
-rw-r--r--fs/bcachefs/nocow_locking.c142
-rw-r--r--fs/bcachefs/nocow_locking.h50
-rw-r--r--fs/bcachefs/nocow_locking_types.h20
-rw-r--r--fs/bcachefs/opts.c844
-rw-r--r--fs/bcachefs/opts.h693
-rw-r--r--fs/bcachefs/printbuf.c528
-rw-r--r--fs/bcachefs/printbuf.h298
-rw-r--r--fs/bcachefs/progress.c61
-rw-r--r--fs/bcachefs/progress.h29
-rw-r--r--fs/bcachefs/quota.c892
-rw-r--r--fs/bcachefs/quota.h73
-rw-r--r--fs/bcachefs/quota_format.h47
-rw-r--r--fs/bcachefs/quota_types.h43
-rw-r--r--fs/bcachefs/rcu_pending.c666
-rw-r--r--fs/bcachefs/rcu_pending.h27
-rw-r--r--fs/bcachefs/rebalance.c889
-rw-r--r--fs/bcachefs/rebalance.h59
-rw-r--r--fs/bcachefs/rebalance_format.h53
-rw-r--r--fs/bcachefs/rebalance_types.h41
-rw-r--r--fs/bcachefs/recovery.c1306
-rw-r--r--fs/bcachefs/recovery.h13
-rw-r--r--fs/bcachefs/recovery_passes.c646
-rw-r--r--fs/bcachefs/recovery_passes.h48
-rw-r--r--fs/bcachefs/recovery_passes_format.h106
-rw-r--r--fs/bcachefs/recovery_passes_types.h27
-rw-r--r--fs/bcachefs/reflink.c865
-rw-r--r--fs/bcachefs/reflink.h87
-rw-r--r--fs/bcachefs/reflink_format.h38
-rw-r--r--fs/bcachefs/replicas.c918
-rw-r--r--fs/bcachefs/replicas.h83
-rw-r--r--fs/bcachefs/replicas_format.h36
-rw-r--r--fs/bcachefs/replicas_types.h11
-rw-r--r--fs/bcachefs/sb-clean.c340
-rw-r--r--fs/bcachefs/sb-clean.h16
-rw-r--r--fs/bcachefs/sb-counters.c147
-rw-r--r--fs/bcachefs/sb-counters.h20
-rw-r--r--fs/bcachefs/sb-counters_format.h117
-rw-r--r--fs/bcachefs/sb-downgrade.c457
-rw-r--r--fs/bcachefs/sb-downgrade.h12
-rw-r--r--fs/bcachefs/sb-downgrade_format.h17
-rw-r--r--fs/bcachefs/sb-errors.c198
-rw-r--r--fs/bcachefs/sb-errors.h22
-rw-r--r--fs/bcachefs/sb-errors_format.h353
-rw-r--r--fs/bcachefs/sb-errors_types.h15
-rw-r--r--fs/bcachefs/sb-members.c606
-rw-r--r--fs/bcachefs/sb-members.h377
-rw-r--r--fs/bcachefs/sb-members_format.h128
-rw-r--r--fs/bcachefs/sb-members_types.h22
-rw-r--r--fs/bcachefs/seqmutex.h45
-rw-r--r--fs/bcachefs/siphash.c173
-rw-r--r--fs/bcachefs/siphash.h87
-rw-r--r--fs/bcachefs/six.c878
-rw-r--r--fs/bcachefs/six.h388
-rw-r--r--fs/bcachefs/snapshot.c2043
-rw-r--r--fs/bcachefs/snapshot.h275
-rw-r--r--fs/bcachefs/snapshot_format.h36
-rw-r--r--fs/bcachefs/snapshot_types.h57
-rw-r--r--fs/bcachefs/str_hash.c400
-rw-r--r--fs/bcachefs/str_hash.h431
-rw-r--r--fs/bcachefs/subvolume.c752
-rw-r--r--fs/bcachefs/subvolume.h88
-rw-r--r--fs/bcachefs/subvolume_format.h35
-rw-r--r--fs/bcachefs/subvolume_types.h11
-rw-r--r--fs/bcachefs/super-io.c1562
-rw-r--r--fs/bcachefs/super-io.h119
-rw-r--r--fs/bcachefs/super.c2547
-rw-r--r--fs/bcachefs/super.h55
-rw-r--r--fs/bcachefs/super_types.h35
-rw-r--r--fs/bcachefs/sysfs.c914
-rw-r--r--fs/bcachefs/sysfs.h49
-rw-r--r--fs/bcachefs/tests.c891
-rw-r--r--fs/bcachefs/tests.h15
-rw-r--r--fs/bcachefs/thread_with_file.c494
-rw-r--r--fs/bcachefs/thread_with_file.h81
-rw-r--r--fs/bcachefs/thread_with_file_types.h20
-rw-r--r--fs/bcachefs/time_stats.c191
-rw-r--r--fs/bcachefs/time_stats.h161
-rw-r--r--fs/bcachefs/trace.c18
-rw-r--r--fs/bcachefs/trace.h1883
-rw-r--r--fs/bcachefs/two_state_shared_lock.c8
-rw-r--r--fs/bcachefs/two_state_shared_lock.h58
-rw-r--r--fs/bcachefs/util.c1047
-rw-r--r--fs/bcachefs/util.h782
-rw-r--r--fs/bcachefs/varint.c130
-rw-r--r--fs/bcachefs/varint.h11
-rw-r--r--fs/bcachefs/vstructs.h63
-rw-r--r--fs/bcachefs/xattr.c642
-rw-r--r--fs/bcachefs/xattr.h50
-rw-r--r--fs/bcachefs/xattr_format.h25
-rw-r--r--fs/binfmt_elf.c48
-rw-r--r--fs/btrfs/Kconfig12
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/accessors.c2
-rw-r--r--fs/btrfs/backref.c26
-rw-r--r--fs/btrfs/backref.h4
-rw-r--r--fs/btrfs/bio.c54
-rw-r--r--fs/btrfs/bio.h2
-rw-r--r--fs/btrfs/block-group.c41
-rw-r--r--fs/btrfs/block-group.h2
-rw-r--r--fs/btrfs/btrfs_inode.h23
-rw-r--r--fs/btrfs/compression.c265
-rw-r--r--fs/btrfs/compression.h61
-rw-r--r--fs/btrfs/ctree.c135
-rw-r--r--fs/btrfs/defrag.c4
-rw-r--r--fs/btrfs/delayed-inode.c189
-rw-r--r--fs/btrfs/delayed-inode.h93
-rw-r--r--fs/btrfs/delayed-ref.c13
-rw-r--r--fs/btrfs/delayed-ref.h9
-rw-r--r--fs/btrfs/dev-replace.c12
-rw-r--r--fs/btrfs/direct-io.c12
-rw-r--r--fs/btrfs/disk-io.c99
-rw-r--r--fs/btrfs/disk-io.h3
-rw-r--r--fs/btrfs/export.c2
-rw-r--r--fs/btrfs/extent-io-tree.c4
-rw-r--r--fs/btrfs/extent-io-tree.h2
-rw-r--r--fs/btrfs/extent-tree.c104
-rw-r--r--fs/btrfs/extent-tree.h7
-rw-r--r--fs/btrfs/extent_io.c202
-rw-r--r--fs/btrfs/extent_io.h3
-rw-r--r--fs/btrfs/extent_map.c24
-rw-r--r--fs/btrfs/fiemap.c2
-rw-r--r--fs/btrfs/file-item.c60
-rw-r--r--fs/btrfs/file.c49
-rw-r--r--fs/btrfs/free-space-cache.c6
-rw-r--r--fs/btrfs/free-space-tree.c60
-rw-r--r--fs/btrfs/fs.c48
-rw-r--r--fs/btrfs/fs.h41
-rw-r--r--fs/btrfs/inode-item.c10
-rw-r--r--fs/btrfs/inode.c615
-rw-r--r--fs/btrfs/ioctl.c69
-rw-r--r--fs/btrfs/locking.c2
-rw-r--r--fs/btrfs/locking.h2
-rw-r--r--fs/btrfs/lzo.c93
-rw-r--r--fs/btrfs/messages.c1
-rw-r--r--fs/btrfs/messages.h1
-rw-r--r--fs/btrfs/misc.h49
-rw-r--r--fs/btrfs/print-tree.c256
-rw-r--r--fs/btrfs/qgroup.c53
-rw-r--r--fs/btrfs/raid-stripe-tree.c17
-rw-r--r--fs/btrfs/raid56.c121
-rw-r--r--fs/btrfs/raid56.h4
-rw-r--r--fs/btrfs/ref-verify.c12
-rw-r--r--fs/btrfs/ref-verify.h4
-rw-r--r--fs/btrfs/reflink.c15
-rw-r--r--fs/btrfs/relocation.c100
-rw-r--r--fs/btrfs/root-tree.c66
-rw-r--r--fs/btrfs/scrub.c95
-rw-r--r--fs/btrfs/scrub.h2
-rw-r--r--fs/btrfs/send.c373
-rw-r--r--fs/btrfs/space-info.c8
-rw-r--r--fs/btrfs/subpage.c21
-rw-r--r--fs/btrfs/subpage.h2
-rw-r--r--fs/btrfs/super.c79
-rw-r--r--fs/btrfs/sysfs.c16
-rw-r--r--fs/btrfs/tests/delayed-refs-tests.c4
-rw-r--r--fs/btrfs/tests/extent-map-tests.c2
-rw-r--r--fs/btrfs/transaction.c49
-rw-r--r--fs/btrfs/tree-checker.c43
-rw-r--r--fs/btrfs/tree-log.c1959
-rw-r--r--fs/btrfs/verity.c10
-rw-r--r--fs/btrfs/volumes.c75
-rw-r--r--fs/btrfs/volumes.h4
-rw-r--r--fs/btrfs/zlib.c86
-rw-r--r--fs/btrfs/zoned.c209
-rw-r--r--fs/btrfs/zoned.h9
-rw-r--r--fs/btrfs/zstd.c198
-rw-r--r--fs/buffer.c2
-rw-r--r--fs/cachefiles/namei.c3
-rw-r--r--fs/ceph/addr.c9
-rw-r--r--fs/ceph/crypto.c2
-rw-r--r--fs/ceph/debugfs.c14
-rw-r--r--fs/ceph/dir.c17
-rw-r--r--fs/ceph/file.c24
-rw-r--r--fs/ceph/inode.c89
-rw-r--r--fs/ceph/mds_client.c174
-rw-r--r--fs/ceph/mds_client.h18
-rw-r--r--fs/ceph/super.c4
-rw-r--r--fs/ceph/super.h1
-rw-r--r--fs/configfs/mount.c2
-rw-r--r--fs/coredump.c8
-rw-r--r--fs/cramfs/inode.c11
-rw-r--r--fs/crypto/Kconfig5
-rw-r--r--fs/crypto/bio.c4
-rw-r--r--fs/crypto/crypto.c14
-rw-r--r--fs/crypto/fname.c12
-rw-r--r--fs/crypto/fscrypt_private.h30
-rw-r--r--fs/crypto/hkdf.c109
-rw-r--r--fs/crypto/hooks.c4
-rw-r--r--fs/crypto/inline_crypt.c12
-rw-r--r--fs/crypto/keyring.c30
-rw-r--r--fs/crypto/keysetup.c108
-rw-r--r--fs/crypto/policy.c11
-rw-r--r--fs/dax.c3
-rw-r--r--fs/dcache.c5
-rw-r--r--fs/debugfs/inode.c22
-rw-r--r--fs/dlm/config.c64
-rw-r--r--fs/dlm/config.h2
-rw-r--r--fs/dlm/lock.c2
-rw-r--r--fs/dlm/lockspace.c46
-rw-r--r--fs/dlm/lowcomms.c2
-rw-r--r--fs/dlm/main.c2
-rw-r--r--fs/dlm/member.c27
-rw-r--r--fs/dlm/recover.c2
-rw-r--r--fs/dlm/user.c6
-rw-r--r--fs/ecryptfs/inode.c3
-rw-r--r--fs/efivarfs/super.c6
-rw-r--r--fs/erofs/Kconfig20
-rw-r--r--fs/erofs/data.c4
-rw-r--r--fs/erofs/dir.c4
-rw-r--r--fs/erofs/erofs_fs.h8
-rw-r--r--fs/erofs/inode.c40
-rw-r--r--fs/erofs/internal.h6
-rw-r--r--fs/erofs/super.c48
-rw-r--r--fs/erofs/xattr.c13
-rw-r--r--fs/erofs/zdata.c20
-rw-r--r--fs/erofs/zmap.c71
-rw-r--r--fs/eventpoll.c139
-rw-r--r--fs/exec.c4
-rw-r--r--fs/ext4/crypto.c2
-rw-r--r--fs/ext4/ext4.h8
-rw-r--r--fs/ext4/fsmap.c23
-rw-r--r--fs/ext4/ialloc.c4
-rw-r--r--fs/ext4/indirect.c4
-rw-r--r--fs/ext4/inode.c4
-rw-r--r--fs/ext4/mballoc.c2
-rw-r--r--fs/ext4/namei.c4
-rw-r--r--fs/ext4/orphan.c5
-rw-r--r--fs/ext4/page-io.c2
-rw-r--r--fs/ext4/super.c20
-rw-r--r--fs/ext4/verity.c2
-rw-r--r--fs/f2fs/f2fs.h6
-rw-r--r--fs/f2fs/super.c16
-rw-r--r--fs/f2fs/verity.c2
-rw-r--r--fs/fcntl.c10
-rw-r--r--fs/fhandle.c16
-rw-r--r--fs/file.c5
-rw-r--r--fs/fs-writeback.c150
-rw-r--r--fs/fsopen.c70
-rw-r--r--fs/fuse/dev.c4
-rw-r--r--fs/fuse/dir.c3
-rw-r--r--fs/fuse/file.c5
-rw-r--r--fs/fuse/fuse_i.h14
-rw-r--r--fs/fuse/inode.c21
-rw-r--r--fs/fuse/passthrough.c5
-rw-r--r--fs/fuse/virtio_fs.c2
-rw-r--r--fs/gfs2/file.c23
-rw-r--r--fs/gfs2/glock.c185
-rw-r--r--fs/gfs2/glock.h4
-rw-r--r--fs/gfs2/incore.h5
-rw-r--r--fs/gfs2/lock_dlm.c104
-rw-r--r--fs/gfs2/main.c5
-rw-r--r--fs/gfs2/ops_fstype.c8
-rw-r--r--fs/gfs2/super.c2
-rw-r--r--fs/gfs2/trace_gfs2.h1
-rw-r--r--fs/gfs2/util.c38
-rw-r--r--fs/gfs2/util.h36
-rw-r--r--fs/hfs/bfind.c12
-rw-r--r--fs/hfs/bitmap.c4
-rw-r--r--fs/hfs/bnode.c28
-rw-r--r--fs/hfs/brec.c35
-rw-r--r--fs/hfs/btree.c2
-rw-r--r--fs/hfs/catalog.c129
-rw-r--r--fs/hfs/extent.c19
-rw-r--r--fs/hfs/hfs_fs.h39
-rw-r--r--fs/hfs/inode.c25
-rw-r--r--fs/hfs/mdb.c20
-rw-r--r--fs/hfs/super.c4
-rw-r--r--fs/hfsplus/attributes.c8
-rw-r--r--fs/hfsplus/bfind.c12
-rw-r--r--fs/hfsplus/bitmap.c10
-rw-r--r--fs/hfsplus/bnode.c69
-rw-r--r--fs/hfsplus/brec.c10
-rw-r--r--fs/hfsplus/btree.c10
-rw-r--r--fs/hfsplus/catalog.c6
-rw-r--r--fs/hfsplus/dir.c2
-rw-r--r--fs/hfsplus/extents.c27
-rw-r--r--fs/hfsplus/hfsplus_fs.h85
-rw-r--r--fs/hfsplus/super.c41
-rw-r--r--fs/hfsplus/unicode.c48
-rw-r--r--fs/hfsplus/xattr.c10
-rw-r--r--fs/hostfs/hostfs_kern.c2
-rw-r--r--fs/hpfs/inode.c2
-rw-r--r--fs/hugetlbfs/inode.c10
-rw-r--r--fs/init.c17
-rw-r--r--fs/inode.c118
-rw-r--r--fs/internal.h1
-rw-r--r--fs/ioctl.c5
-rw-r--r--fs/iomap/buffered-io.c18
-rw-r--r--fs/iomap/direct-io.c17
-rw-r--r--fs/iomap/trace.h1
-rw-r--r--fs/jbd2/checkpoint.c1
-rw-r--r--fs/kernfs/file.c58
-rw-r--r--fs/kernfs/inode.c4
-rw-r--r--fs/kernfs/mount.c2
-rw-r--r--fs/locks.c4
-rw-r--r--fs/minix/inode.c8
-rw-r--r--fs/mount.h12
-rw-r--r--fs/namei.c186
-rw-r--r--fs/namespace.c394
-rw-r--r--fs/netfs/buffered_read.c10
-rw-r--r--fs/netfs/buffered_write.c2
-rw-r--r--fs/netfs/direct_read.c7
-rw-r--r--fs/netfs/direct_write.c6
-rw-r--r--fs/netfs/internal.h1
-rw-r--r--fs/netfs/misc.c2
-rw-r--r--fs/netfs/objects.c32
-rw-r--r--fs/netfs/read_collect.c4
-rw-r--r--fs/netfs/read_pgpriv2.c2
-rw-r--r--fs/netfs/read_single.c2
-rw-r--r--fs/netfs/write_collect.c10
-rw-r--r--fs/netfs/write_issue.c7
-rw-r--r--fs/nfs/client.c2
-rw-r--r--fs/nfs/file.c40
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c21
-rw-r--r--fs/nfs/inode.c19
-rw-r--r--fs/nfs/internal.h12
-rw-r--r--fs/nfs/io.c13
-rw-r--r--fs/nfs/localio.c21
-rw-r--r--fs/nfs/namespace.c2
-rw-r--r--fs/nfs/nfs42proc.c35
-rw-r--r--fs/nfs/nfs4file.c2
-rw-r--r--fs/nfs/nfs4proc.c7
-rw-r--r--fs/nfs/nfs4renewd.c2
-rw-r--r--fs/nfs/nfstrace.h1
-rw-r--r--fs/nfs/pagelist.c9
-rw-r--r--fs/nfs/write.c82
-rw-r--r--fs/nfsd/filecache.c2
-rw-r--r--fs/nfsd/localio.c5
-rw-r--r--fs/nfsd/vfs.c13
-rw-r--r--fs/nilfs2/sysfs.c4
-rw-r--r--fs/nilfs2/sysfs.h8
-rw-r--r--fs/notify/fsnotify.c2
-rw-r--r--fs/notify/mark.c4
-rw-r--r--fs/nsfs.c211
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c3
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c5
-rw-r--r--fs/ocfs2/extent_map.c10
-rw-r--r--fs/ocfs2/inode.c3
-rw-r--r--fs/ocfs2/refcounttree.c4
-rw-r--r--fs/ocfs2/stack_user.c2
-rw-r--r--fs/orangefs/super.c2
-rw-r--r--fs/overlayfs/dir.c2
-rw-r--r--fs/overlayfs/overlayfs.h3
-rw-r--r--fs/overlayfs/readdir.c28
-rw-r--r--fs/overlayfs/super.c2
-rw-r--r--fs/overlayfs/util.c3
-rw-r--r--fs/pidfs.c6
-rw-r--r--fs/pipe.c6
-rw-r--r--fs/pnode.c10
-rw-r--r--fs/proc/array.c4
-rw-r--r--fs/proc/generic.c39
-rw-r--r--fs/proc/inode.c2
-rw-r--r--fs/proc/namespaces.c6
-rw-r--r--fs/proc/root.c98
-rw-r--r--fs/proc/task_mmu.c27
-rw-r--r--fs/pstore/inode.c2
-rw-r--r--fs/pstore/zone.c21
-rw-r--r--fs/quota/dquot.c2
-rw-r--r--fs/ramfs/inode.c2
-rw-r--r--fs/read_write.c14
-rw-r--r--fs/resctrl/ctrlmondata.c28
-rw-r--r--fs/resctrl/internal.h62
-rw-r--r--fs/resctrl/monitor.c1014
-rw-r--r--fs/resctrl/rdtgroup.c259
-rw-r--r--fs/smb/client/cifs_debug.c112
-rw-r--r--fs/smb/client/cifs_spnego.c47
-rw-r--r--fs/smb/client/cifs_unicode.c3
-rw-r--r--fs/smb/client/cifsfs.c32
-rw-r--r--fs/smb/client/cifsfs.h4
-rw-r--r--fs/smb/client/cifsglob.h43
-rw-r--r--fs/smb/client/cifsproto.h4
-rw-r--r--fs/smb/client/cifstransport.c19
-rw-r--r--fs/smb/client/compress.c71
-rw-r--r--fs/smb/client/connect.c9
-rw-r--r--fs/smb/client/file.c34
-rw-r--r--fs/smb/client/inode.c137
-rw-r--r--fs/smb/client/misc.c38
-rw-r--r--fs/smb/client/reparse.c2
-rw-r--r--fs/smb/client/smb1ops.c4
-rw-r--r--fs/smb/client/smb2glob.h3
-rw-r--r--fs/smb/client/smb2inode.c296
-rw-r--r--fs/smb/client/smb2misc.c19
-rw-r--r--fs/smb/client/smb2ops.c57
-rw-r--r--fs/smb/client/smb2pdu.c6
-rw-r--r--fs/smb/client/smb2proto.h3
-rw-r--r--fs/smb/client/smb2transport.c1
-rw-r--r--fs/smb/client/smbdirect.c1230
-rw-r--r--fs/smb/client/smbdirect.h102
-rw-r--r--fs/smb/client/trace.h61
-rw-r--r--fs/smb/client/transport.c7
-rw-r--r--fs/smb/common/smbdirect/smbdirect.h7
-rw-r--r--fs/smb/common/smbdirect/smbdirect_socket.h319
-rw-r--r--fs/smb/server/connection.c7
-rw-r--r--fs/smb/server/connection.h17
-rw-r--r--fs/smb/server/ksmbd_work.c2
-rw-r--r--fs/smb/server/oplock.c13
-rw-r--r--fs/smb/server/server.c1
-rw-r--r--fs/smb/server/smb2pdu.c48
-rw-r--r--fs/smb/server/smb2pdu.h6
-rw-r--r--fs/smb/server/transport_rdma.c1786
-rw-r--r--fs/smb/server/transport_rdma.h49
-rw-r--r--fs/smb/server/transport_tcp.c26
-rw-r--r--fs/smb/server/vfs.c11
-rw-r--r--fs/smb/server/vfs_cache.h2
-rw-r--r--fs/splice.c3
-rw-r--r--fs/squashfs/super.c14
-rw-r--r--fs/super.c74
-rw-r--r--fs/ubifs/crypto.c2
-rw-r--r--fs/ubifs/super.c4
-rw-r--r--fs/ubifs/ubifs.h4
-rw-r--r--fs/verity/enable.c18
-rw-r--r--fs/verity/fsverity_private.h11
-rw-r--r--fs/verity/hash_algs.c3
-rw-r--r--fs/verity/open.c23
-rw-r--r--fs/verity/verify.c177
-rw-r--r--fs/xfs/Kconfig23
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c7
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c5
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c25
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c7
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c31
-rw-r--r--fs/xfs/libxfs/xfs_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c8
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c2
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h114
-rw-r--r--fs/xfs/libxfs/xfs_exchmaps.c4
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c6
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c4
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c3
-rw-r--r--fs/xfs/libxfs/xfs_inode_util.c11
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h150
-rw-r--r--fs/xfs/libxfs/xfs_log_recover.h2
-rw-r--r--fs/xfs/libxfs/xfs_metafile.c2
-rw-r--r--fs/xfs/libxfs/xfs_ondisk.h2
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c7
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c2
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c2
-rw-r--r--fs/xfs/libxfs/xfs_sb.c9
-rw-r--r--fs/xfs/libxfs/xfs_zones.h7
-rw-r--r--fs/xfs/scrub/cow_repair.c4
-rw-r--r--fs/xfs/scrub/metapath.c12
-rw-r--r--fs/xfs/scrub/newbt.c9
-rw-r--r--fs/xfs/scrub/reap.c620
-rw-r--r--fs/xfs/scrub/repair.c2
-rw-r--r--fs/xfs/scrub/repair.h8
-rw-r--r--fs/xfs/scrub/symlink_repair.c2
-rw-r--r--fs/xfs/scrub/trace.c1
-rw-r--r--fs/xfs/scrub/trace.h47
-rw-r--r--fs/xfs/xfs_aops.c3
-rw-r--r--fs/xfs/xfs_attr_item.c2
-rw-r--r--fs/xfs/xfs_buf.c46
-rw-r--r--fs/xfs/xfs_buf.h4
-rw-r--r--fs/xfs/xfs_buf_item_recover.c10
-rw-r--r--fs/xfs/xfs_error.c216
-rw-r--r--fs/xfs/xfs_error.h47
-rw-r--r--fs/xfs/xfs_extfree_item.c4
-rw-r--r--fs/xfs/xfs_extfree_item.h4
-rw-r--r--fs/xfs/xfs_file.c81
-rw-r--r--fs/xfs/xfs_globals.c2
-rw-r--r--fs/xfs/xfs_icache.c6
-rw-r--r--fs/xfs/xfs_inode.c119
-rw-r--r--fs/xfs/xfs_inode.h11
-rw-r--r--fs/xfs/xfs_inode_item.c125
-rw-r--r--fs/xfs/xfs_inode_item.h10
-rw-r--r--fs/xfs/xfs_ioctl.c26
-rw-r--r--fs/xfs/xfs_iomap.c19
-rw-r--r--fs/xfs/xfs_iops.c19
-rw-r--r--fs/xfs/xfs_itable.c8
-rw-r--r--fs/xfs/xfs_itable.h10
-rw-r--r--fs/xfs/xfs_linux.h2
-rw-r--r--fs/xfs/xfs_log.c38
-rw-r--r--fs/xfs/xfs_log.h37
-rw-r--r--fs/xfs/xfs_log_priv.h4
-rw-r--r--fs/xfs/xfs_log_recover.c34
-rw-r--r--fs/xfs/xfs_mount.c32
-rw-r--r--fs/xfs/xfs_mount.h12
-rw-r--r--fs/xfs/xfs_mru_cache.c3
-rw-r--r--fs/xfs/xfs_notify_failure.c2
-rw-r--r--fs/xfs/xfs_super.c84
-rw-r--r--fs/xfs/xfs_sysctl.c29
-rw-r--r--fs/xfs/xfs_sysctl.h3
-rw-r--r--fs/xfs/xfs_trace.h3
-rw-r--r--fs/xfs/xfs_trans.c25
-rw-r--r--fs/xfs/xfs_trans_ail.c2
-rw-r--r--fs/xfs/xfs_zone_alloc.c207
-rw-r--r--fs/xfs/xfs_zone_space_resv.c6
-rw-r--r--include/asm-generic/bitops/__ffs.h2
-rw-r--r--include/asm-generic/bitops/__fls.h2
-rw-r--r--include/asm-generic/bitops/builtin-__ffs.h2
-rw-r--r--include/asm-generic/bitops/builtin-__fls.h2
-rw-r--r--include/asm-generic/bitops/builtin-fls.h2
-rw-r--r--include/asm-generic/bitops/ffs.h2
-rw-r--r--include/asm-generic/bitops/fls.h2
-rw-r--r--include/asm-generic/bitops/fls64.h4
-rw-r--r--include/asm-generic/thread_info_tif.h48
-rw-r--r--include/asm-generic/vmlinux.lds.h2
-rw-r--r--include/crypto/chacha.h37
-rw-r--r--include/crypto/curve25519.h40
-rw-r--r--include/crypto/if_alg.h10
-rw-r--r--include/crypto/internal/blake2s.h21
-rw-r--r--include/crypto/internal/poly1305.h16
-rw-r--r--include/crypto/md5.h181
-rw-r--r--include/crypto/poly1305.h9
-rw-r--r--include/crypto/sha2.h28
-rw-r--r--include/drm/drm_bridge.h48
-rw-r--r--include/drm/drm_gpuvm.h10
-rw-r--r--include/kunit/run-in-irq-context.h129
-rw-r--r--include/kvm/arm_vgic.h9
-rw-r--r--include/linux/atmdev.h1
-rw-r--r--include/linux/audit.h25
-rw-r--r--include/linux/backing-dev-defs.h4
-rw-r--r--include/linux/bitops.h2
-rw-r--r--include/linux/blkdev.h1
-rw-r--r--include/linux/cc_platform.h8
-rw-r--r--include/linux/cdx/bitfield.h (renamed from drivers/cdx/controller/bitfield.h)0
-rw-r--r--include/linux/cdx/edac_cdx_pcol.h28
-rw-r--r--include/linux/cdx/mcdi.h (renamed from drivers/cdx/controller/mcdi.h)47
-rw-r--r--include/linux/cfi.h6
-rw-r--r--include/linux/cfi_types.h8
-rw-r--r--include/linux/cgroup-defs.h43
-rw-r--r--include/linux/cgroup.h52
-rw-r--r--include/linux/cgroup_namespace.h58
-rw-r--r--include/linux/compiler-clang.h29
-rw-r--r--include/linux/compiler.h10
-rw-r--r--include/linux/compiler_types.h23
-rw-r--r--include/linux/cpu.h1
-rw-r--r--include/linux/cpuhotplug.h1
-rw-r--r--include/linux/cred.h2
-rw-r--r--include/linux/damon.h2
-rw-r--r--include/linux/dlm.h33
-rw-r--r--include/linux/dma-map-ops.h3
-rw-r--r--include/linux/energy_model.h10
-rw-r--r--include/linux/ethtool.h4
-rw-r--r--include/linux/export.h2
-rw-r--r--include/linux/exportfs.h6
-rw-r--r--include/linux/firewire.h4
-rw-r--r--include/linux/firmware/imx/sm.h47
-rw-r--r--include/linux/fs.h285
-rw-r--r--include/linux/fs_context.h18
-rw-r--r--include/linux/fscrypt.h40
-rw-r--r--include/linux/fsverity.h57
-rw-r--r--include/linux/hfs_common.h20
-rw-r--r--include/linux/io_uring_types.h3
-rw-r--r--include/linux/iocontext.h6
-rw-r--r--include/linux/iosys-map.h7
-rw-r--r--include/linux/iov_iter.h20
-rw-r--r--include/linux/ipc_namespace.h13
-rw-r--r--include/linux/kasan.h6
-rw-r--r--include/linux/kcov.h47
-rw-r--r--include/linux/kexec.h3
-rw-r--r--include/linux/lsm_hook_defs.h2
-rw-r--r--include/linux/lsm_hooks.h3
-rw-r--r--include/linux/memblock.h5
-rw-r--r--include/linux/migrate.h5
-rw-r--r--include/linux/mlx5/driver.h1
-rw-r--r--include/linux/mlx5/fs.h2
-rw-r--r--include/linux/mm_types.h10
-rw-r--r--include/linux/mnt_namespace.h4
-rw-r--r--include/linux/msi.h4
-rw-r--r--include/linux/namei.h21
-rw-r--r--include/linux/netdevice.h5
-rw-r--r--include/linux/netfs.h1
-rw-r--r--include/linux/nfs_page.h1
-rw-r--r--include/linux/ns_common.h139
-rw-r--r--include/linux/nsfs.h40
-rw-r--r--include/linux/nsproxy.h13
-rw-r--r--include/linux/nstree.h78
-rw-r--r--include/linux/pagewalk.h3
-rw-r--r--include/linux/perf_event.h4
-rw-r--r--include/linux/pgalloc.h29
-rw-r--r--include/linux/pgtable.h31
-rw-r--r--include/linux/phy.h5
-rw-r--r--include/linux/pid_namespace.h20
-rw-r--r--include/linux/platform_data/x86/int3472.h1
-rw-r--r--include/linux/pm_domain.h7
-rw-r--r--include/linux/preempt.h11
-rw-r--r--include/linux/proc_ns.h22
-rw-r--r--include/linux/psp-platform-access.h2
-rw-r--r--include/linux/rcupdate.h2
-rw-r--r--include/linux/resctrl.h148
-rw-r--r--include/linux/resctrl_types.h18
-rw-r--r--include/linux/rseq.h15
-rw-r--r--include/linux/rv.h6
-rw-r--r--include/linux/rw_hint.h1
-rw-r--r--include/linux/sched.h149
-rw-r--r--include/linux/sched/ext.h6
-rw-r--r--include/linux/sched/signal.h4
-rw-r--r--include/linux/sched/task.h2
-rw-r--r--include/linux/sched/topology.h29
-rw-r--r--include/linux/security.h10
-rw-r--r--include/linux/sem.h4
-rw-r--r--include/linux/skbuff.h2
-rw-r--r--include/linux/stddef.h24
-rw-r--r--include/linux/string_choices.h6
-rw-r--r--include/linux/swap.h10
-rw-r--r--include/linux/syscalls.h2
-rw-r--r--include/linux/time_namespace.h17
-rw-r--r--include/linux/timekeeper_internal.h9
-rw-r--r--include/linux/topology.h2
-rw-r--r--include/linux/uprobes.h24
-rw-r--r--include/linux/user_events.h4
-rw-r--r--include/linux/user_namespace.h9
-rw-r--r--include/linux/uts_namespace.h65
-rw-r--r--include/linux/utsname.h53
-rw-r--r--include/linux/virtio_config.h13
-rw-r--r--include/linux/vmalloc.h16
-rw-r--r--include/linux/workqueue.h32
-rw-r--r--include/linux/writeback.h2
-rw-r--r--include/net/bluetooth/bluetooth.h4
-rw-r--r--include/net/bluetooth/hci_core.h65
-rw-r--r--include/net/bluetooth/hci_sync.h2
-rw-r--r--include/net/bond_3ad.h1
-rw-r--r--include/net/devlink.h6
-rw-r--r--include/net/dst_metadata.h11
-rw-r--r--include/net/ip_vs.h13
-rw-r--r--include/net/kcm.h1
-rw-r--r--include/net/net_namespace.h17
-rw-r--r--include/net/netfilter/nf_tables.h1
-rw-r--r--include/net/netfilter/nf_tables_core.h10
-rw-r--r--include/net/netns/nftables.h1
-rw-r--r--include/net/page_pool/types.h2
-rw-r--r--include/net/rose.h18
-rw-r--r--include/net/sch_generic.h11
-rw-r--r--include/net/sock.h20
-rw-r--r--include/pcmcia/ss.h8
-rw-r--r--include/rv/ltl_monitor.h2
-rw-r--r--include/sound/cs35l56.h5
-rw-r--r--include/sound/sdca.h1
-rw-r--r--include/sound/sdca_function.h21
-rw-r--r--include/sound/tas2781-tlv.h6
-rw-r--r--include/trace/events/afs.h6
-rw-r--r--include/trace/events/filelock.h5
-rw-r--r--include/trace/events/task.h6
-rw-r--r--include/trace/events/writeback.h29
-rw-r--r--include/uapi/linux/audit.h2
-rw-r--r--include/uapi/linux/fcntl.h1
-rw-r--r--include/uapi/linux/fs.h5
-rw-r--r--include/uapi/linux/mptcp.h2
-rw-r--r--include/uapi/linux/mptcp_pm.h4
-rw-r--r--include/uapi/linux/netfilter/nf_tables.h2
-rw-r--r--include/uapi/linux/nsfs.h18
-rw-r--r--include/uapi/linux/pfrut.h1
-rw-r--r--include/uapi/linux/psp-sfs.h87
-rw-r--r--include/uapi/linux/ptp_clock.h3
-rw-r--r--include/uapi/linux/raid/md_p.h2
-rw-r--r--include/uapi/linux/vduse.h2
-rw-r--r--include/uapi/linux/vhost.h4
-rw-r--r--include/xen/grant_table.h4
-rw-r--r--include/xen/mem-reservation.h4
-rw-r--r--include/xen/xen-ops.h7
-rw-r--r--include/xen/xen.h9
-rw-r--r--init/Kconfig30
-rw-r--r--init/do_mounts_rd.c14
-rw-r--r--init/init_task.c3
-rw-r--r--init/initramfs.c5
-rw-r--r--init/main.c4
-rw-r--r--init/version-timestamp.c5
-rw-r--r--io_uring/futex.c3
-rw-r--r--io_uring/io-wq.c10
-rw-r--r--io_uring/io_uring.c11
-rw-r--r--io_uring/io_uring.h4
-rw-r--r--io_uring/kbuf.c20
-rw-r--r--io_uring/msg_ring.c24
-rw-r--r--io_uring/net.c27
-rw-r--r--io_uring/notif.c2
-rw-r--r--io_uring/poll.c2
-rw-r--r--io_uring/rw.c3
-rw-r--r--io_uring/timeout.c2
-rw-r--r--io_uring/uring_cmd.c2
-rw-r--r--ipc/msgutil.c6
-rw-r--r--ipc/namespace.c21
-rw-r--r--ipc/sem.c2
-rw-r--r--ipc/shm.c2
-rw-r--r--kernel/Kconfig.kexec1
-rw-r--r--kernel/Kconfig.preempt13
-rw-r--r--kernel/Makefile4
-rw-r--r--kernel/audit.c278
-rw-r--r--kernel/audit.h13
-rw-r--r--kernel/audit_fsnotify.c11
-rw-r--r--kernel/audit_tree.c6
-rw-r--r--kernel/audit_watch.c3
-rw-r--r--kernel/auditfilter.c4
-rw-r--r--kernel/auditsc.c63
-rw-r--r--kernel/bpf/Makefile1
-rw-r--r--kernel/bpf/core.c21
-rw-r--r--kernel/bpf/cpumap.c4
-rw-r--r--kernel/bpf/crypto.c2
-rw-r--r--kernel/bpf/helpers.c16
-rw-r--r--kernel/bpf/inode.c6
-rw-r--r--kernel/bpf/rqspinlock.c2
-rw-r--r--kernel/bpf/stackmap.c4
-rw-r--r--kernel/bpf/verifier.c7
-rw-r--r--kernel/cgroup/cgroup-internal.h11
-rw-r--r--kernel/cgroup/cgroup-v1.c19
-rw-r--r--kernel/cgroup/cgroup.c244
-rw-r--r--kernel/cgroup/cpuset-internal.h5
-rw-r--r--kernel/cgroup/cpuset-v1.c12
-rw-r--r--kernel/cgroup/cpuset.c763
-rw-r--r--kernel/cgroup/debug.c4
-rw-r--r--kernel/cgroup/freezer.c16
-rw-r--r--kernel/cgroup/namespace.c29
-rw-r--r--kernel/cgroup/rstat.c3
-rw-r--r--kernel/configs/hardening.config4
-rw-r--r--kernel/cred.c2
-rw-r--r--kernel/dma/contiguous.c2
-rw-r--r--kernel/dma/debug.c48
-rw-r--r--kernel/dma/debug.h20
-rw-r--r--kernel/dma/mapping.c4
-rw-r--r--kernel/dma/pool.c4
-rw-r--r--kernel/entry/common.c16
-rw-r--r--kernel/events/callchain.c40
-rw-r--r--kernel/events/core.c376
-rw-r--r--kernel/events/internal.h4
-rw-r--r--kernel/events/ring_buffer.c2
-rw-r--r--kernel/events/uprobes.c106
-rw-r--r--kernel/fork.c19
-rw-r--r--kernel/futex/core.c16
-rw-r--r--kernel/futex/futex.h6
-rw-r--r--kernel/futex/requeue.c6
-rw-r--r--kernel/irq/Kconfig2
-rw-r--r--kernel/irq/devres.c127
-rw-r--r--kernel/irq/handle.c49
-rw-r--r--kernel/irq/irq_test.c55
-rw-r--r--kernel/irq/msi.c3
-rw-r--r--kernel/kexec_handover.c29
-rw-r--r--kernel/kthread.c1
-rw-r--r--kernel/locking/ww_mutex.h6
-rw-r--r--kernel/module/Kconfig2
-rw-r--r--kernel/module/tree_lookup.c2
-rw-r--r--kernel/nscommon.c77
-rw-r--r--kernel/nsproxy.c8
-rw-r--r--kernel/nstree.c247
-rw-r--r--kernel/params.c7
-rw-r--r--kernel/pid.c12
-rw-r--r--kernel/pid_namespace.c47
-rw-r--r--kernel/power/energy_model.c29
-rw-r--r--kernel/power/hibernate.c1
-rw-r--r--kernel/rcu/tree.c2
-rw-r--r--kernel/rcu/tree.h1
-rw-r--r--kernel/rcu/tree_plugin.h8
-rw-r--r--kernel/rseq.c10
-rw-r--r--kernel/sched/build_policy.c1
-rw-r--r--kernel/sched/core.c74
-rw-r--r--kernel/sched/deadline.c124
-rw-r--r--kernel/sched/debug.c6
-rw-r--r--kernel/sched/ext.c1566
-rw-r--r--kernel/sched/ext.h25
-rw-r--r--kernel/sched/ext_idle.c174
-rw-r--r--kernel/sched/ext_internal.h1078
-rw-r--r--kernel/sched/fair.c498
-rw-r--r--kernel/sched/pelt.h4
-rw-r--r--kernel/sched/rq-offsets.c12
-rw-r--r--kernel/sched/sched.h48
-rw-r--r--kernel/sched/topology.c75
-rw-r--r--kernel/seccomp.c44
-rw-r--r--kernel/signal.c6
-rw-r--r--kernel/softirq.c145
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--kernel/time/hrtimer.c11
-rw-r--r--kernel/time/namespace.c34
-rw-r--r--kernel/time/timekeeping.c10
-rw-r--r--kernel/time/vsyscall.c4
-rw-r--r--kernel/trace/fgraph.c14
-rw-r--r--kernel/trace/fprobe.c7
-rw-r--r--kernel/trace/ftrace.c19
-rw-r--r--kernel/trace/ring_buffer.c2
-rw-r--r--kernel/trace/rv/monitors/sleep/sleep.c4
-rw-r--r--kernel/trace/rv/rv.c4
-rw-r--r--kernel/trace/trace.c32
-rw-r--r--kernel/trace/trace.h10
-rw-r--r--kernel/trace/trace_dynevent.c4
-rw-r--r--kernel/trace/trace_events_user.c2
-rw-r--r--kernel/trace/trace_functions_graph.c22
-rw-r--r--kernel/trace/trace_kprobe.c2
-rw-r--r--kernel/trace/trace_osnoise.c6
-rw-r--r--kernel/tsacct.c3
-rw-r--r--kernel/user.c5
-rw-r--r--kernel/user_namespace.c24
-rw-r--r--kernel/utsname.c33
-rw-r--r--kernel/vhost_task.c3
-rw-r--r--kernel/workqueue.c80
-rw-r--r--lib/Kconfig.debug16
-rw-r--r--lib/clz_ctz.c8
-rw-r--r--lib/crc/arm/crc-t10dif.h8
-rw-r--r--lib/crc/arm/crc32.h8
-rw-r--r--lib/crc/arm64/crc-t10dif.h8
-rw-r--r--lib/crc/arm64/crc32.h11
-rw-r--r--lib/crc/loongarch/crc32.h2
-rw-r--r--lib/crc/mips/crc32.h2
-rw-r--r--lib/crc/powerpc/crc-t10dif.h7
-rw-r--r--lib/crc/powerpc/crc32.h7
-rw-r--r--lib/crc/sparc/crc32.h2
-rw-r--r--lib/crc/tests/crc_kunit.c62
-rw-r--r--lib/crc/x86/crc-pclmul-template.h3
-rw-r--r--lib/crc/x86/crc-t10dif.h2
-rw-r--r--lib/crc/x86/crc32.h4
-rw-r--r--lib/crc/x86/crc64.h2
-rw-r--r--lib/crypto/Kconfig179
-rw-r--r--lib/crypto/Makefile175
-rw-r--r--lib/crypto/arm/Kconfig24
-rw-r--r--lib/crypto/arm/Makefile26
-rw-r--r--lib/crypto/arm/blake2s-core.S5
-rw-r--r--lib/crypto/arm/blake2s-glue.c7
-rw-r--r--lib/crypto/arm/blake2s.h5
-rw-r--r--lib/crypto/arm/chacha.h (renamed from lib/crypto/arm/chacha-glue.c)35
-rw-r--r--lib/crypto/arm/curve25519-core.S (renamed from arch/arm/crypto/curve25519-core.S)0
-rw-r--r--lib/crypto/arm/curve25519.h47
-rw-r--r--lib/crypto/arm/poly1305-armv4.pl3
-rw-r--r--lib/crypto/arm/poly1305-glue.c76
-rw-r--r--lib/crypto/arm/poly1305.h53
-rw-r--r--lib/crypto/arm/sha1.h2
-rw-r--r--lib/crypto/arm/sha256.h12
-rw-r--r--lib/crypto/arm/sha512.h7
-rw-r--r--lib/crypto/arm64/Kconfig14
-rw-r--r--lib/crypto/arm64/Makefile17
-rw-r--r--lib/crypto/arm64/chacha.h (renamed from lib/crypto/arm64/chacha-neon-glue.c)32
-rw-r--r--lib/crypto/arm64/poly1305-armv8.pl3
-rw-r--r--lib/crypto/arm64/poly1305-glue.c74
-rw-r--r--lib/crypto/arm64/poly1305.h50
-rw-r--r--lib/crypto/arm64/sha1.h2
-rw-r--r--lib/crypto/arm64/sha256-ce.S284
-rw-r--r--lib/crypto/arm64/sha256.h49
-rw-r--r--lib/crypto/arm64/sha512.h7
-rw-r--r--lib/crypto/blake2s-generic.c111
-rw-r--r--lib/crypto/blake2s-selftest.c651
-rw-r--r--lib/crypto/blake2s.c105
-rw-r--r--lib/crypto/chacha-block-generic.c114
-rw-r--r--lib/crypto/chacha.c142
-rw-r--r--lib/crypto/curve25519-generic.c25
-rw-r--r--lib/crypto/curve25519.c69
-rw-r--r--lib/crypto/libchacha.c35
-rw-r--r--lib/crypto/md5.c322
-rw-r--r--lib/crypto/mips/Kconfig12
-rw-r--r--lib/crypto/mips/Makefile19
-rw-r--r--lib/crypto/mips/chacha-glue.c29
-rw-r--r--lib/crypto/mips/chacha.h14
-rw-r--r--lib/crypto/mips/md5.h65
-rw-r--r--lib/crypto/mips/poly1305-glue.c33
-rw-r--r--lib/crypto/mips/poly1305-mips.pl8
-rw-r--r--lib/crypto/mips/poly1305.h14
-rw-r--r--lib/crypto/poly1305-generic.c25
-rw-r--r--lib/crypto/poly1305.c81
-rw-r--r--lib/crypto/powerpc/Kconfig16
-rw-r--r--lib/crypto/powerpc/Makefile7
-rw-r--r--lib/crypto/powerpc/chacha.h (renamed from lib/crypto/powerpc/chacha-p10-glue.c)36
-rw-r--r--lib/crypto/powerpc/curve25519-ppc64le_asm.S (renamed from arch/powerpc/crypto/curve25519-ppc64le_asm.S)0
-rw-r--r--lib/crypto/powerpc/curve25519.h (renamed from arch/powerpc/crypto/curve25519-ppc64le-core.c)124
-rw-r--r--lib/crypto/powerpc/md5-asm.S (renamed from arch/powerpc/crypto/md5-asm.S)0
-rw-r--r--lib/crypto/powerpc/md5.h12
-rw-r--r--lib/crypto/powerpc/poly1305.h (renamed from lib/crypto/powerpc/poly1305-p10-glue.c)40
-rw-r--r--lib/crypto/riscv/Kconfig8
-rw-r--r--lib/crypto/riscv/Makefile4
-rw-r--r--lib/crypto/riscv/chacha.h (renamed from lib/crypto/riscv/chacha-riscv64-glue.c)36
-rw-r--r--lib/crypto/riscv/poly1305-riscv.pl847
-rw-r--r--lib/crypto/riscv/poly1305.h14
-rw-r--r--lib/crypto/riscv/sha256.h10
-rw-r--r--lib/crypto/riscv/sha512.h6
-rw-r--r--lib/crypto/s390/Kconfig7
-rw-r--r--lib/crypto/s390/Makefile4
-rw-r--r--lib/crypto/s390/chacha.h (renamed from lib/crypto/s390/chacha-glue.c)29
-rw-r--r--lib/crypto/s390/sha1.h2
-rw-r--r--lib/crypto/s390/sha256.h2
-rw-r--r--lib/crypto/s390/sha512.h2
-rw-r--r--lib/crypto/sha256.c71
-rw-r--r--lib/crypto/sparc/md5.h48
-rw-r--r--lib/crypto/sparc/md5_asm.S (renamed from arch/sparc/crypto/md5_asm.S)0
-rw-r--r--lib/crypto/sparc/sha1.h2
-rw-r--r--lib/crypto/sparc/sha256.h2
-rw-r--r--lib/crypto/sparc/sha512.h2
-rw-r--r--lib/crypto/tests/Kconfig29
-rw-r--r--lib/crypto/tests/Makefile3
-rw-r--r--lib/crypto/tests/blake2s-testvecs.h238
-rw-r--r--lib/crypto/tests/blake2s_kunit.c134
-rw-r--r--lib/crypto/tests/curve25519_kunit.c (renamed from lib/crypto/curve25519-selftest.c)102
-rw-r--r--lib/crypto/tests/hash-test-template.h123
-rw-r--r--lib/crypto/tests/md5-testvecs.h186
-rw-r--r--lib/crypto/tests/md5_kunit.c39
-rw-r--r--lib/crypto/tests/sha256_kunit.c184
-rw-r--r--lib/crypto/x86/Kconfig26
-rw-r--r--lib/crypto/x86/Makefile17
-rw-r--r--lib/crypto/x86/blake2s-core.S28
-rw-r--r--lib/crypto/x86/blake2s.h (renamed from lib/crypto/x86/blake2s-glue.c)16
-rw-r--r--lib/crypto/x86/chacha.h (renamed from lib/crypto/x86/chacha_glue.c)36
-rw-r--r--lib/crypto/x86/curve25519.h (renamed from arch/x86/crypto/curve25519-x86_64.c)127
-rw-r--r--lib/crypto/x86/poly1305-x86_64-cryptogams.pl33
-rw-r--r--lib/crypto/x86/poly1305.h (renamed from lib/crypto/x86/poly1305_glue.c)47
-rw-r--r--lib/crypto/x86/sha1.h2
-rw-r--r--lib/crypto/x86/sha256-ni-asm.S368
-rw-r--r--lib/crypto/x86/sha256.h44
-rw-r--r--lib/crypto/x86/sha512.h6
-rw-r--r--lib/raid6/recov_rvv.c2
-rw-r--r--lib/raid6/rvv.c63
-rw-r--r--lib/ref_tracker.c2
-rw-r--r--lib/tests/Makefile1
-rw-r--r--lib/tests/ffs_kunit.c566
-rw-r--r--lib/ubsan.c6
-rw-r--r--mm/backing-dev.c5
-rw-r--r--mm/balloon_compaction.c6
-rw-r--r--mm/damon/core.c31
-rw-r--r--mm/damon/lru_sort.c5
-rw-r--r--mm/damon/reclaim.c5
-rw-r--r--mm/damon/sysfs-schemes.c2
-rw-r--r--mm/damon/sysfs.c41
-rw-r--r--mm/debug_vm_pgtable.c9
-rw-r--r--mm/gup.c14
-rw-r--r--mm/hugetlb.c24
-rw-r--r--mm/kasan/init.c12
-rw-r--r--mm/kasan/kasan_test_c.c4
-rw-r--r--mm/kasan/shadow.c53
-rw-r--r--mm/khugepaged.c6
-rw-r--r--mm/kmemleak.c37
-rw-r--r--mm/kmsan/core.c10
-rw-r--r--mm/kmsan/kmsan_test.c16
-rw-r--r--mm/memblock.c19
-rw-r--r--mm/memory-failure.c28
-rw-r--r--mm/memory_hotplug.c10
-rw-r--r--mm/migrate.c38
-rw-r--r--mm/mlock.c6
-rw-r--r--mm/mprotect.c23
-rw-r--r--mm/mremap.c95
-rw-r--r--mm/numa_emulation.c4
-rw-r--r--mm/numa_memblks.c6
-rw-r--r--mm/pagewalk.c36
-rw-r--r--mm/percpu.c26
-rw-r--r--mm/shmem.c2
-rw-r--r--mm/slub.c37
-rw-r--r--mm/sparse-vmemmap.c11
-rw-r--r--mm/sparse.c15
-rw-r--r--mm/swap.c50
-rw-r--r--mm/userfaultfd.c26
-rw-r--r--mm/vmalloc.c8
-rw-r--r--mm/vmscan.c6
-rw-r--r--mm/zsmalloc.c10
-rw-r--r--net/atm/common.c15
-rw-r--r--net/atm/resources.c6
-rw-r--r--net/ax25/ax25_in.c4
-rw-r--r--net/batman-adv/network-coding.c7
-rw-r--r--net/bluetooth/hci_conn.c75
-rw-r--r--net/bluetooth/hci_event.c70
-rw-r--r--net/bluetooth/hci_sync.c38
-rw-r--r--net/bluetooth/iso.c16
-rw-r--r--net/bluetooth/l2cap_sock.c3
-rw-r--r--net/bluetooth/mgmt.c276
-rw-r--r--net/bluetooth/mgmt_util.c46
-rw-r--r--net/bluetooth/mgmt_util.h3
-rw-r--r--net/bridge/br.c7
-rw-r--r--net/bridge/br_multicast.c16
-rw-r--r--net/bridge/br_netfilter_hooks.c3
-rw-r--r--net/bridge/br_private.h2
-rw-r--r--net/bridge/netfilter/Kconfig1
-rw-r--r--net/can/j1939/bus.c5
-rw-r--r--net/can/j1939/j1939-priv.h1
-rw-r--r--net/can/j1939/main.c3
-rw-r--r--net/can/j1939/socket.c52
-rw-r--r--net/ceph/messenger.c7
-rw-r--r--net/core/datagram.c14
-rw-r--r--net/core/dev.c26
-rw-r--r--net/core/dev.h8
-rw-r--r--net/core/dev_ioctl.c22
-rw-r--r--net/core/gen_estimator.c2
-rw-r--r--net/core/net-sysfs.c6
-rw-r--r--net/core/net_namespace.c60
-rw-r--r--net/core/page_pool.c35
-rw-r--r--net/core/skbuff.c2
-rw-r--r--net/core/sock.c22
-rw-r--r--net/devlink/port.c2
-rw-r--r--net/devlink/rate.c4
-rw-r--r--net/ethtool/common.c4
-rw-r--r--net/hsr/hsr_device.c28
-rw-r--r--net/hsr/hsr_main.c4
-rw-r--r--net/hsr/hsr_main.h3
-rw-r--r--net/hsr/hsr_slave.c8
-rw-r--r--net/ipv4/devinet.c7
-rw-r--r--net/ipv4/icmp.c6
-rw-r--r--net/ipv4/inet_timewait_sock.c4
-rw-r--r--net/ipv4/ip_tunnel_core.c6
-rw-r--r--net/ipv4/netfilter/Kconfig3
-rw-r--r--net/ipv4/netfilter/nf_reject_ipv4.c6
-rw-r--r--net/ipv4/nexthop.c7
-rw-r--r--net/ipv4/route.c10
-rw-r--r--net/ipv4/tcp.c5
-rw-r--r--net/ipv4/tcp_ao.c4
-rw-r--r--net/ipv4/tcp_bpf.c5
-rw-r--r--net/ipv4/tcp_metrics.c2
-rw-r--r--net/ipv4/udp_offload.c2
-rw-r--r--net/ipv6/exthdrs.c6
-rw-r--r--net/ipv6/ip6_icmp.c6
-rw-r--r--net/ipv6/netfilter/Kconfig1
-rw-r--r--net/ipv6/netfilter/nf_reject_ipv6.c5
-rw-r--r--net/ipv6/seg6_hmac.c6
-rw-r--r--net/ipv6/tcp_ipv6.c32
-rw-r--r--net/ipv6/xfrm6_tunnel.c2
-rw-r--r--net/kcm/kcmsock.c10
-rw-r--r--net/l2tp/l2tp_ppp.c25
-rw-r--r--net/mac80211/driver-ops.h2
-rw-r--r--net/mac80211/main.c7
-rw-r--r--net/mac80211/mlme.c8
-rw-r--r--net/mac80211/tests/chan-mode.c30
-rw-r--r--net/mctp/af_mctp.c2
-rw-r--r--net/mctp/route.c35
-rw-r--r--net/mctp/test/route-test.c1
-rw-r--r--net/mptcp/options.c12
-rw-r--r--net/mptcp/pm.c18
-rw-r--r--net/mptcp/pm_kernel.c1
-rw-r--r--net/mptcp/pm_netlink.c7
-rw-r--r--net/mptcp/protocol.c17
-rw-r--r--net/mptcp/sockopt.c11
-rw-r--r--net/mptcp/subflow.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_est.c3
-rw-r--r--net/netfilter/nf_conntrack_helper.c4
-rw-r--r--net/netfilter/nf_conntrack_netlink.c65
-rw-r--r--net/netfilter/nf_conntrack_standalone.c6
-rw-r--r--net/netfilter/nf_tables_api.c138
-rw-r--r--net/netfilter/nft_lookup.c46
-rw-r--r--net/netfilter/nft_set_bitmap.c3
-rw-r--r--net/netfilter/nft_set_pipapo.c25
-rw-r--r--net/netfilter/nft_set_pipapo_avx2.c12
-rw-r--r--net/netfilter/nft_set_rbtree.c6
-rw-r--r--net/netfilter/nft_socket.c2
-rw-r--r--net/netlabel/netlabel_user.c8
-rw-r--r--net/netlink/diag.c2
-rw-r--r--net/netlink/genetlink.c3
-rw-r--r--net/rds/ib_frmr.c20
-rw-r--r--net/rfkill/rfkill-gpio.c4
-rw-r--r--net/rose/af_rose.c13
-rw-r--r--net/rose/rose_in.c12
-rw-r--r--net/rose/rose_route.c62
-rw-r--r--net/rose/rose_timer.c2
-rw-r--r--net/rxrpc/rxgk.c18
-rw-r--r--net/rxrpc/rxgk_app.c29
-rw-r--r--net/rxrpc/rxgk_common.h14
-rw-r--r--net/sched/sch_cake.c14
-rw-r--r--net/sched/sch_codel.c12
-rw-r--r--net/sched/sch_dualpi2.c5
-rw-r--r--net/sched/sch_ets.c11
-rw-r--r--net/sched/sch_fq.c12
-rw-r--r--net/sched/sch_fq_codel.c12
-rw-r--r--net/sched/sch_fq_pie.c12
-rw-r--r--net/sched/sch_hhf.c12
-rw-r--r--net/sched/sch_htb.c2
-rw-r--r--net/sched/sch_pie.c12
-rw-r--r--net/sctp/input.c2
-rw-r--r--net/sctp/ipv6.c2
-rw-r--r--net/smc/af_smc.c3
-rw-r--r--net/smc/smc_clc.c2
-rw-r--r--net/smc/smc_ib.c3
-rw-r--r--net/smc/smc_loopback.c14
-rw-r--r--net/socket.c3
-rw-r--r--net/sunrpc/sched.c2
-rw-r--r--net/sunrpc/svcsock.c43
-rw-r--r--net/sunrpc/xprtsock.c6
-rw-r--r--net/tls/tls.h3
-rw-r--r--net/tls/tls_strp.c25
-rw-r--r--net/tls/tls_sw.c13
-rw-r--r--net/unix/af_unix.c6
-rw-r--r--net/vmw_vsock/af_vsock.c3
-rw-r--r--net/vmw_vsock/virtio_transport_common.c8
-rw-r--r--net/wireless/nl80211.c13
-rw-r--r--net/wireless/scan.c3
-rw-r--r--net/wireless/sme.c5
-rw-r--r--net/xdp/xsk.c113
-rw-r--r--net/xdp/xsk_queue.h12
-rw-r--r--net/xfrm/xfrm_device.c14
-rw-r--r--net/xfrm/xfrm_state.c5
-rw-r--r--rust/Makefile16
-rw-r--r--rust/helpers/atomic.c1040
-rw-r--r--rust/helpers/barrier.c18
-rw-r--r--rust/helpers/helpers.c2
-rw-r--r--rust/helpers/refcount.c10
-rw-r--r--rust/kernel/alloc/allocator.rs30
-rw-r--r--rust/kernel/alloc/allocator_test.rs11
-rw-r--r--rust/kernel/block/mq/operations.rs7
-rw-r--r--rust/kernel/block/mq/request.rs73
-rw-r--r--rust/kernel/cred.rs6
-rw-r--r--rust/kernel/device.rs209
-rw-r--r--rust/kernel/devres.rs27
-rw-r--r--rust/kernel/driver.rs89
-rw-r--r--rust/kernel/drm/device.rs32
-rw-r--r--rust/kernel/faux.rs2
-rw-r--r--rust/kernel/fs/file.rs10
-rw-r--r--rust/kernel/lib.rs15
-rw-r--r--rust/kernel/mm/virt.rs1
-rw-r--r--rust/kernel/pid_namespace.rs5
-rw-r--r--rust/kernel/sync.rs4
-rw-r--r--rust/kernel/sync/arc.rs55
-rw-r--r--rust/kernel/sync/atomic.rs551
-rw-r--r--rust/kernel/sync/atomic/internal.rs265
-rw-r--r--rust/kernel/sync/atomic/ordering.rs104
-rw-r--r--rust/kernel/sync/atomic/predefine.rs169
-rw-r--r--rust/kernel/sync/barrier.rs61
-rw-r--r--rust/kernel/sync/refcount.rs113
-rw-r--r--samples/damon/mtier.c3
-rw-r--r--samples/damon/prcl.c3
-rw-r--r--samples/damon/wsse.c3
-rw-r--r--samples/ftrace/ftrace-direct-modify.c2
-rw-r--r--scripts/Makefile.kasan12
-rwxr-xr-xscripts/atomic/gen-atomics.sh1
-rwxr-xr-xscripts/atomic/gen-rust-atomic-helpers.sh67
-rwxr-xr-xscripts/crypto/gen-hash-testvecs.py27
-rw-r--r--scripts/gcc-plugins/gcc-common.h7
-rw-r--r--scripts/generate_rust_target.rs12
-rw-r--r--scripts/kconfig/expr.h1
-rw-r--r--scripts/kconfig/lexer.l1
-rw-r--r--scripts/kconfig/parser.y47
-rw-r--r--scripts/kconfig/symbol.c7
-rw-r--r--scripts/kconfig/tests/conftest.py17
-rw-r--r--scripts/kconfig/tests/err_transitional/Kconfig52
-rw-r--r--scripts/kconfig/tests/err_transitional/__init__.py14
-rw-r--r--scripts/kconfig/tests/err_transitional/expected_stderr7
-rw-r--r--scripts/kconfig/tests/transitional/Kconfig100
-rw-r--r--scripts/kconfig/tests/transitional/__init__.py18
-rw-r--r--scripts/kconfig/tests/transitional/expected_config12
-rw-r--r--scripts/kconfig/tests/transitional/initial_config16
-rw-r--r--security/Kconfig1
-rw-r--r--security/apparmor/lsm.c5
-rw-r--r--security/landlock/fs.c2
-rw-r--r--security/min_addr.c6
-rw-r--r--security/security.c114
-rw-r--r--security/selinux/avc.c13
-rw-r--r--security/selinux/hooks.c73
-rw-r--r--security/selinux/include/objsec.h20
-rw-r--r--security/selinux/include/policycap.h1
-rw-r--r--security/selinux/include/policycap_names.h1
-rw-r--r--security/selinux/include/security.h4
-rw-r--r--security/selinux/selinuxfs.c18
-rw-r--r--security/smack/smack_lsm.c5
-rw-r--r--security/tomoyo/tomoyo.c2
-rw-r--r--sound/core/timer.c4
-rw-r--r--sound/firewire/motu/motu-hwdep.c2
-rw-r--r--sound/hda/codecs/hdmi/hdmi.c1
-rw-r--r--sound/hda/codecs/hdmi/nvhdmi.c17
-rw-r--r--sound/hda/codecs/hdmi/tegrahdmi.c2
-rw-r--r--sound/hda/codecs/realtek/alc269.c53
-rw-r--r--sound/hda/codecs/side-codecs/cs35l41_hda_property.c4
-rw-r--r--sound/hda/codecs/side-codecs/tas2781_hda.c30
-rw-r--r--sound/hda/codecs/side-codecs/tas2781_hda_i2c.c20
-rw-r--r--sound/hda/codecs/side-codecs/tas2781_hda_spi.c6
-rw-r--r--sound/hda/controllers/intel.c1
-rw-r--r--sound/hda/core/intel-dsp-config.c28
-rw-r--r--sound/pci/azt3328.c8
-rw-r--r--sound/soc/Kconfig4
-rw-r--r--sound/soc/amd/acp/acp-i2s.c11
-rw-r--r--sound/soc/amd/acp/acp-sdw-legacy-mach.c16
-rw-r--r--sound/soc/amd/acp/amd.h2
-rw-r--r--sound/soc/codecs/aw87390.c8
-rw-r--r--sound/soc/codecs/aw88081.c5
-rw-r--r--sound/soc/codecs/aw88166.c8
-rw-r--r--sound/soc/codecs/aw88261.c8
-rw-r--r--sound/soc/codecs/aw88395/aw88395.c8
-rw-r--r--sound/soc/codecs/aw88399.c8
-rw-r--r--sound/soc/codecs/cs35l56-sdw.c69
-rw-r--r--sound/soc/codecs/cs35l56-shared.c29
-rw-r--r--sound/soc/codecs/cs35l56.c2
-rw-r--r--sound/soc/codecs/cs35l56.h3
-rw-r--r--sound/soc/codecs/es8389.c2
-rw-r--r--sound/soc/codecs/idt821034.c2
-rw-r--r--sound/soc/codecs/lpass-rx-macro.c22
-rw-r--r--sound/soc/codecs/lpass-tx-macro.c2
-rw-r--r--sound/soc/codecs/lpass-wsa-macro.c22
-rw-r--r--sound/soc/codecs/rt1320-sdw.c3
-rw-r--r--sound/soc/codecs/rt5682s.c17
-rw-r--r--sound/soc/codecs/rt712-sdca.c6
-rw-r--r--sound/soc/codecs/rt721-sdca.c2
-rw-r--r--sound/soc/codecs/rt721-sdca.h4
-rw-r--r--sound/soc/codecs/sma1307.c9
-rw-r--r--sound/soc/codecs/tas2781-i2c.c10
-rw-r--r--sound/soc/codecs/wm8940.c9
-rw-r--r--sound/soc/codecs/wm8974.c8
-rw-r--r--sound/soc/fsl/fsl_sai.c20
-rw-r--r--sound/soc/intel/boards/sof_sdw.c2
-rw-r--r--sound/soc/intel/boards/sof_ssp_amp.c6
-rw-r--r--sound/soc/intel/catpt/pcm.c23
-rw-r--r--sound/soc/intel/common/soc-acpi-intel-ptl-match.c6
-rw-r--r--sound/soc/qcom/qdsp6/audioreach.c1
-rw-r--r--sound/soc/qcom/qdsp6/q6apm-lpass-dais.c7
-rw-r--r--sound/soc/qcom/sc8280xp.c6
-rw-r--r--sound/soc/renesas/rcar/core.c2
-rw-r--r--sound/soc/sdca/sdca_device.c20
-rw-r--r--sound/soc/sdca/sdca_functions.c13
-rw-r--r--sound/soc/sdca/sdca_interrupts.c2
-rw-r--r--sound/soc/sdca/sdca_regmap.c2
-rw-r--r--sound/soc/soc-core.c25
-rw-r--r--sound/soc/sof/imx/imx-common.c4
-rw-r--r--sound/soc/sof/intel/hda-stream.c2
-rw-r--r--sound/soc/sof/intel/ptl.c1
-rw-r--r--sound/soc/stm/stm32_i2s.c7
-rw-r--r--sound/usb/format.c12
-rw-r--r--sound/usb/mixer_quirks.c8
-rw-r--r--sound/usb/qcom/qc_audio_offload.c92
-rw-r--r--sound/usb/quirks.c22
-rw-r--r--sound/usb/stream.c25
-rw-r--r--sound/usb/usbaudio.h4
-rw-r--r--sound/usb/validate.c14
-rw-r--r--tools/arch/arm64/include/asm/cputype.h28
-rw-r--r--tools/arch/arm64/include/asm/sysreg.h3
-rw-r--r--tools/arch/loongarch/include/asm/inst.h12
-rw-r--r--tools/arch/powerpc/include/uapi/asm/kvm.h13
-rw-r--r--tools/arch/riscv/include/asm/csr.h6
-rw-r--r--tools/arch/riscv/include/asm/vdso/processor.h4
-rw-r--r--tools/arch/x86/include/asm/cpufeatures.h10
-rw-r--r--tools/arch/x86/include/asm/inat.h15
-rw-r--r--tools/arch/x86/include/asm/insn.h51
-rw-r--r--tools/arch/x86/include/asm/msr-index.h21
-rw-r--r--tools/arch/x86/include/uapi/asm/kvm.h8
-rw-r--r--tools/arch/x86/lib/inat.c13
-rw-r--r--tools/arch/x86/lib/insn.c35
-rw-r--r--tools/arch/x86/lib/x86-opcode-map.txt111
-rw-r--r--tools/arch/x86/tools/gen-insn-attr-x86.awk44
-rw-r--r--tools/bootconfig/main.c4
-rw-r--r--tools/gpio/Makefile2
-rw-r--r--tools/include/linux/args.h28
-rw-r--r--tools/include/linux/bits.h29
-rw-r--r--tools/include/linux/cfi_types.h29
-rw-r--r--tools/include/uapi/asm-generic/unistd.h8
-rw-r--r--tools/include/uapi/linux/kvm.h27
-rw-r--r--tools/include/uapi/linux/nsfs.h17
-rw-r--r--tools/lib/subcmd/help.c3
-rwxr-xr-xtools/net/ynl/pyynl/ynl_gen_c.py2
-rw-r--r--tools/objtool/arch/loongarch/decode.c33
-rw-r--r--tools/objtool/arch/loongarch/special.c23
-rw-r--r--tools/objtool/arch/x86/decode.c12
-rw-r--r--tools/objtool/builtin-check.c2
-rw-r--r--tools/objtool/check.c48
-rw-r--r--tools/objtool/include/objtool/arch.h1
-rw-r--r--tools/objtool/include/objtool/builtin.h1
-rw-r--r--tools/objtool/noreturns.h1
-rw-r--r--tools/perf/arch/arm/entry/syscalls/syscall.tbl2
-rw-r--r--tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl2
-rw-r--r--tools/perf/arch/powerpc/entry/syscalls/syscall.tbl2
-rw-r--r--tools/perf/arch/s390/entry/syscalls/syscall.tbl2
-rw-r--r--tools/perf/arch/sh/entry/syscalls/syscall.tbl2
-rw-r--r--tools/perf/arch/sparc/entry/syscalls/syscall.tbl2
-rw-r--r--tools/perf/arch/x86/entry/syscalls/syscall_32.tbl2
-rw-r--r--tools/perf/arch/x86/entry/syscalls/syscall_64.tbl2
-rw-r--r--tools/perf/arch/x86/tests/topdown.c1
-rw-r--r--tools/perf/arch/xtensa/entry/syscalls/syscall.tbl2
-rw-r--r--tools/perf/bench/inject-buildid.c2
-rw-r--r--tools/perf/builtin-buildid-cache.c8
-rw-r--r--tools/perf/builtin-inject.c4
-rw-r--r--tools/perf/builtin-lock.c7
-rw-r--r--tools/perf/tests/pe-file-parsing.c4
-rw-r--r--tools/perf/tests/sdt.c2
-rwxr-xr-xtools/perf/tests/shell/test_bpf_metadata.sh2
-rw-r--r--tools/perf/trace/beauty/include/uapi/linux/fcntl.h18
-rw-r--r--tools/perf/trace/beauty/include/uapi/linux/fs.h88
-rw-r--r--tools/perf/trace/beauty/include/uapi/linux/prctl.h9
-rw-r--r--tools/perf/trace/beauty/include/uapi/linux/vhost.h35
-rw-r--r--tools/perf/util/bpf-event.c39
-rw-r--r--tools/perf/util/bpf-utils.c61
-rw-r--r--tools/perf/util/build-id.c4
-rw-r--r--tools/perf/util/debuginfo.c8
-rw-r--r--tools/perf/util/dsos.c4
-rw-r--r--tools/perf/util/include/linux/linkage.h2
-rw-r--r--tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c2
-rw-r--r--tools/perf/util/maps.c9
-rw-r--r--tools/perf/util/symbol-elf.c19
-rw-r--r--tools/perf/util/symbol-minimal.c59
-rw-r--r--tools/perf/util/symbol.c8
-rw-r--r--tools/perf/util/symbol.h2
-rw-r--r--tools/perf/util/synthetic-events.c2
-rw-r--r--tools/power/cpupower/man/cpupower-set.17
-rw-r--r--tools/power/cpupower/utils/cpufreq-info.c16
-rw-r--r--tools/power/cpupower/utils/cpupower-set.c5
-rw-r--r--tools/power/cpupower/utils/helpers/helpers.h14
-rw-r--r--tools/power/cpupower/utils/helpers/misc.c76
-rw-r--r--tools/power/x86/turbostat/turbostat.c2
-rw-r--r--tools/sched_ext/include/scx/bpf_arena_common.bpf.h175
-rw-r--r--tools/sched_ext/include/scx/bpf_arena_common.h33
-rw-r--r--tools/sched_ext/include/scx/common.bpf.h104
-rw-r--r--tools/sched_ext/include/scx/common.h5
-rw-r--r--tools/sched_ext/include/scx/compat.bpf.h22
-rw-r--r--tools/sched_ext/include/scx/user_exit_info.bpf.h40
-rw-r--r--tools/sched_ext/include/scx/user_exit_info.h49
-rw-r--r--tools/sched_ext/include/scx/user_exit_info_common.h30
-rw-r--r--tools/sched_ext/scx_central.bpf.c2
-rw-r--r--tools/sched_ext/scx_central.c1
-rw-r--r--tools/sched_ext/scx_flatcg.bpf.c2
-rw-r--r--tools/sched_ext/scx_flatcg.c2
-rw-r--r--tools/sched_ext/scx_qmap.bpf.c98
-rw-r--r--tools/sched_ext/scx_qmap.c12
-rw-r--r--tools/sched_ext/scx_simple.c2
-rw-r--r--tools/scripts/syscall.tbl2
-rw-r--r--tools/testing/selftests/arm64/abi/hwcap.c22
-rw-r--r--tools/testing/selftests/arm64/abi/tpidr2.c8
-rw-r--r--tools/testing/selftests/arm64/bti/assembler.h1
-rw-r--r--tools/testing/selftests/arm64/fp/fp-ptrace.c6
-rw-r--r--tools/testing/selftests/arm64/fp/fp-stress.c6
-rw-r--r--tools/testing/selftests/arm64/fp/kernel-test.c4
-rw-r--r--tools/testing/selftests/arm64/fp/sve-ptrace.c104
-rw-r--r--tools/testing/selftests/arm64/fp/vec-syscfg.c1
-rw-r--r--tools/testing/selftests/arm64/fp/zt-ptrace.c1
-rw-r--r--tools/testing/selftests/arm64/gcs/Makefile6
-rw-r--r--tools/testing/selftests/arm64/gcs/basic-gcs.c12
-rw-r--r--tools/testing/selftests/arm64/gcs/gcs-locking.c1
-rw-r--r--tools/testing/selftests/arm64/gcs/gcs-stress.c2
-rw-r--r--tools/testing/selftests/arm64/pauth/exec_target.c7
-rw-r--r--tools/testing/selftests/bpf/prog_tests/free_timer.c4
-rw-r--r--tools/testing/selftests/bpf/prog_tests/timer.c4
-rw-r--r--tools/testing/selftests/bpf/prog_tests/timer_crash.c4
-rw-r--r--tools/testing/selftests/bpf/prog_tests/timer_lockup.c4
-rw-r--r--tools/testing/selftests/bpf/prog_tests/timer_mim.c4
-rw-r--r--tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c484
-rw-r--r--tools/testing/selftests/bpf/prog_tests/usdt.c38
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h4
-rw-r--r--tools/testing/selftests/bpf/progs/crypto_sanity.c46
-rw-r--r--tools/testing/selftests/bpf/progs/linked_list_fail.c5
-rw-r--r--tools/testing/selftests/bpf/progs/string_kfuncs_success.c8
-rw-r--r--tools/testing/selftests/bpf/progs/uprobe_syscall.c4
-rw-r--r--tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c60
-rw-r--r--tools/testing/selftests/bpf/test_kmods/bpf_testmod.c11
-rw-r--r--tools/testing/selftests/bpf/usdt.h545
-rw-r--r--tools/testing/selftests/cgroup/lib/cgroup_util.c12
-rw-r--r--tools/testing/selftests/cgroup/lib/include/cgroup_util.h1
-rw-r--r--tools/testing/selftests/cgroup/test_freezer.c663
-rw-r--r--tools/testing/selftests/cgroup/test_pids.c3
-rw-r--r--tools/testing/selftests/coredump/stackdump_test.c3
-rw-r--r--tools/testing/selftests/damon/Makefile1
-rw-r--r--tools/testing/selftests/drivers/net/bonding/Makefile3
-rwxr-xr-xtools/testing/selftests/drivers/net/bonding/bond_options.sh197
-rwxr-xr-xtools/testing/selftests/drivers/net/bonding/bond_passive_lacp.sh105
-rw-r--r--tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh3
-rw-r--r--tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh2
-rw-r--r--tools/testing/selftests/drivers/net/bonding/config2
-rwxr-xr-xtools/testing/selftests/drivers/net/hw/csum.py4
-rwxr-xr-xtools/testing/selftests/drivers/net/napi_threaded.py10
-rw-r--r--tools/testing/selftests/filesystems/.gitignore1
-rw-r--r--tools/testing/selftests/filesystems/Makefile2
-rw-r--r--tools/testing/selftests/filesystems/fclog.c130
-rw-r--r--tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c17
-rw-r--r--tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c18
-rw-r--r--tools/testing/selftests/iommu/iommufd_fail_nth.c2
-rw-r--r--tools/testing/selftests/kselftest_harness.h4
-rw-r--r--tools/testing/selftests/kvm/Makefile.kvm1
-rw-r--r--tools/testing/selftests/kvm/arm64/aarch32_id_regs.c2
-rw-r--r--tools/testing/selftests/kvm/arm64/debug-exceptions.c12
-rw-r--r--tools/testing/selftests/kvm/arm64/kvm-uuid.c70
-rw-r--r--tools/testing/selftests/kvm/arm64/no-vgic-v3.c4
-rw-r--r--tools/testing/selftests/kvm/arm64/page_fault_test.c6
-rw-r--r--tools/testing/selftests/kvm/arm64/set_id_regs.c9
-rw-r--r--tools/testing/selftests/kvm/arm64/vpmu_counter_access.c2
-rw-r--r--tools/testing/selftests/kvm/lib/arm64/processor.c6
-rw-r--r--tools/testing/selftests/mm/cow.c4
-rw-r--r--tools/testing/selftests/mm/guard-regions.c2
-rw-r--r--tools/testing/selftests/mm/hugetlb-madvise.c4
-rw-r--r--tools/testing/selftests/mm/migration.c2
-rw-r--r--tools/testing/selftests/mm/mremap_test.c264
-rw-r--r--tools/testing/selftests/mm/pagemap_ioctl.c2
-rw-r--r--tools/testing/selftests/mm/split_huge_page_test.c7
-rw-r--r--tools/testing/selftests/mm/vm_util.h2
-rw-r--r--tools/testing/selftests/mount_setattr/mount_setattr_test.c77
-rw-r--r--tools/testing/selftests/namespaces/.gitignore3
-rw-r--r--tools/testing/selftests/namespaces/Makefile7
-rw-r--r--tools/testing/selftests/namespaces/config7
-rw-r--r--tools/testing/selftests/namespaces/file_handle_test.c1429
-rw-r--r--tools/testing/selftests/namespaces/init_ino_test.c61
-rw-r--r--tools/testing/selftests/namespaces/nsid_test.c986
-rw-r--r--tools/testing/selftests/net/Makefile2
-rw-r--r--tools/testing/selftests/net/bind_bhash.c4
-rwxr-xr-xtools/testing/selftests/net/broadcast_ether_dst.sh83
-rw-r--r--tools/testing/selftests/net/can/config3
-rwxr-xr-xtools/testing/selftests/net/fib_nexthops.sh52
-rwxr-xr-xtools/testing/selftests/net/forwarding/router.sh29
-rwxr-xr-xtools/testing/selftests/net/forwarding/sch_ets.sh1
-rw-r--r--tools/testing/selftests/net/forwarding/sch_ets_tests.sh8
-rwxr-xr-xtools/testing/selftests/net/mptcp/diag.sh2
-rw-r--r--tools/testing/selftests/net/mptcp/mptcp_connect.c16
-rwxr-xr-xtools/testing/selftests/net/mptcp/mptcp_connect.sh8
-rw-r--r--tools/testing/selftests/net/mptcp/mptcp_inq.c5
-rwxr-xr-xtools/testing/selftests/net/mptcp/mptcp_join.sh3
-rw-r--r--tools/testing/selftests/net/mptcp/mptcp_lib.sh2
-rw-r--r--tools/testing/selftests/net/mptcp/mptcp_sockopt.c21
-rwxr-xr-xtools/testing/selftests/net/mptcp/mptcp_sockopt.sh2
-rwxr-xr-xtools/testing/selftests/net/mptcp/pm_netlink.sh6
-rw-r--r--tools/testing/selftests/net/mptcp/pm_nl_ctl.c7
-rwxr-xr-xtools/testing/selftests/net/mptcp/simult_flows.sh2
-rwxr-xr-xtools/testing/selftests/net/mptcp/userspace_pm.sh16
-rwxr-xr-xtools/testing/selftests/net/netfilter/conntrack_clash.sh2
-rwxr-xr-xtools/testing/selftests/net/netfilter/conntrack_resize.sh5
-rwxr-xr-xtools/testing/selftests/net/netfilter/nft_flowtable.sh113
-rw-r--r--tools/testing/selftests/net/netfilter/udpclash.c2
-rwxr-xr-xtools/testing/selftests/net/openvswitch/openvswitch.sh88
-rw-r--r--tools/testing/selftests/net/packetdrill/tcp_fastopen_server_reset-after-disconnect.pkt26
-rwxr-xr-xtools/testing/selftests/net/test_vxlan_nh.sh223
-rw-r--r--tools/testing/selftests/net/tls.c377
-rw-r--r--tools/testing/selftests/powerpc/include/instructions.h2
-rw-r--r--tools/testing/selftests/proc/.gitignore1
-rw-r--r--tools/testing/selftests/proc/Makefile1
-rw-r--r--tools/testing/selftests/proc/proc-maps-race.c6
-rw-r--r--tools/testing/selftests/proc/proc-pidns.c211
-rw-r--r--tools/testing/selftests/riscv/README24
-rw-r--r--tools/testing/selftests/rseq/rseq.c8
-rw-r--r--tools/testing/selftests/sched_ext/hotplug.c1
-rw-r--r--tools/testing/selftests/seccomp/seccomp_bpf.c238
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json198
-rw-r--r--tools/testing/selftests/ublk/file_backed.c10
-rw-r--r--tools/testing/selftests/ublk/kublk.c42
-rw-r--r--tools/testing/selftests/ublk/kublk.h45
-rw-r--r--tools/testing/selftests/ublk/null.c4
-rw-r--r--tools/testing/selftests/ublk/stripe.c4
-rwxr-xr-xtools/testing/selftests/ublk/test_stress_04.sh6
-rw-r--r--tools/testing/shared/linux/idr.h4
-rw-r--r--tools/tracing/latency/Makefile.config8
-rw-r--r--tools/tracing/rtla/Makefile.config8
-rw-r--r--tools/tracing/rtla/src/actions.c4
3192 files changed, 65149 insertions, 146739 deletions
diff --git a/.get_maintainer.ignore b/.get_maintainer.ignore
index b458815f1d1b..e8d2269bad9d 100644
--- a/.get_maintainer.ignore
+++ b/.get_maintainer.ignore
@@ -1,5 +1,6 @@
Alan Cox <alan@lxorguk.ukuu.org.uk>
Alan Cox <root@hraefn.swansea.linux.org.uk>
+Alyssa Rosenzweig <alyssa@rosenzweig.io>
Christoph Hellwig <hch@lst.de>
Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Marc Gonzalez <marc.w.gonzalez@free.fr>
diff --git a/.mailmap b/.mailmap
index d9fa1b555116..8db24be50158 100644
--- a/.mailmap
+++ b/.mailmap
@@ -134,6 +134,7 @@ Ben M Cahill <ben.m.cahill@intel.com>
Ben Widawsky <bwidawsk@kernel.org> <ben@bwidawsk.net>
Ben Widawsky <bwidawsk@kernel.org> <ben.widawsky@intel.com>
Ben Widawsky <bwidawsk@kernel.org> <benjamin.widawsky@intel.com>
+Bence Csókás <bence98@sch.bme.hu> <csokas.bence@prolan.hu>
Benjamin Poirier <benjamin.poirier@gmail.com> <bpoirier@suse.de>
Benjamin Tissoires <bentiss@kernel.org> <benjamin.tissoires@gmail.com>
Benjamin Tissoires <bentiss@kernel.org> <benjamin.tissoires@redhat.com>
@@ -226,6 +227,8 @@ Domen Puncer <domen@coderock.org>
Douglas Gilbert <dougg@torque.net>
Drew Fustini <fustini@kernel.org> <drew@pdp7.com>
<duje@dujemihanovic.xyz> <duje.mihanovic@skole.hr>
+Easwar Hariharan <easwar.hariharan@linux.microsoft.com> <easwar.hariharan@intel.com>
+Easwar Hariharan <easwar.hariharan@linux.microsoft.com> <eahariha@linux.microsoft.com>
Ed L. Cashin <ecashin@coraid.com>
Elliot Berman <quic_eberman@quicinc.com> <eberman@codeaurora.org>
Enric Balletbo i Serra <eballetbo@kernel.org> <enric.balletbo@collabora.com>
@@ -587,6 +590,7 @@ Nikolay Aleksandrov <razor@blackwall.org> <nikolay@redhat.com>
Nikolay Aleksandrov <razor@blackwall.org> <nikolay@cumulusnetworks.com>
Nikolay Aleksandrov <razor@blackwall.org> <nikolay@nvidia.com>
Nikolay Aleksandrov <razor@blackwall.org> <nikolay@isovalent.com>
+Nobuhiro Iwamatsu <nobuhiro.iwamatsu.x90@mail.toshiba> <nobuhiro1.iwamatsu@toshiba.co.jp>
Odelu Kukatla <quic_okukatla@quicinc.com> <okukatla@codeaurora.org>
Oleksandr Natalenko <oleksandr@natalenko.name> <oleksandr@redhat.com>
Oleksij Rempel <linux@rempel-privat.de> <bug-track@fisher-privat.net>
@@ -620,6 +624,7 @@ Paulo Alcantara <pc@manguebit.org> <palcantara@suse.com>
Paulo Alcantara <pc@manguebit.org> <pc@manguebit.com>
Pavankumar Kondeti <quic_pkondeti@quicinc.com> <pkondeti@codeaurora.org>
Peter A Jonsson <pj@ludd.ltu.se>
+Peter Hilber <peter.hilber@oss.qualcomm.com> <quic_philber@quicinc.com>
Peter Oruba <peter.oruba@amd.com>
Peter Oruba <peter@oruba.de>
Pierre-Louis Bossart <pierre-louis.bossart@linux.dev> <pierre-louis.bossart@linux.intel.com>
diff --git a/CREDITS b/CREDITS
index a357f9cbb05d..a687c3c35c4c 100644
--- a/CREDITS
+++ b/CREDITS
@@ -3222,6 +3222,10 @@ D: AIC5800 IEEE 1394, RAW I/O on 1394
D: Starter of Linux1394 effort
S: ask per mail for current address
+N: Boris Pismenny
+E: borisp@mellanox.com
+D: Kernel TLS implementation and offload support.
+
N: Nicolas Pitre
E: nico@fluxnic.net
D: StrongARM SA1100 support integrator & hacker
@@ -4168,6 +4172,9 @@ S: 1513 Brewster Dr.
S: Carrollton, TX 75010
S: USA
+N: Dave Watson
+D: Kernel TLS implementation.
+
N: Tim Waugh
E: tim@cyberelk.net
D: Co-architect of the parallel-port sharing system
diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index 803f578dc023..0ddffc9133d0 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -731,7 +731,7 @@ Contact: linux-block@vger.kernel.org
Description:
[RW] If the device is registered for writeback throttling, then
this file shows the target minimum read latency. If this latency
- is exceeded in a given window of time (see wb_window_usec), then
+ is exceeded in a given window of time (see curr_win_nsec), then
the writeback throttling will start scaling back writes. Writing
a value of '0' to this file disables the feature. Writing a
value of '-1' to this file resets the value to the default
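For illustration, assuming the attribute lives under the block device's queue
directory (the device name below is hypothetical), disabling the feature and
resetting it to the default could look like::

    # echo 0 > /sys/block/sda/queue/wbt_lat_usec
    # echo -1 > /sys/block/sda/queue/wbt_lat_usec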
diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-vpa-dtl b/Documentation/ABI/testing/sysfs-bus-event_source-devices-vpa-dtl
new file mode 100644
index 000000000000..7b7c789a5cf5
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-vpa-dtl
@@ -0,0 +1,25 @@
+What: /sys/bus/event_source/devices/vpa_dtl/format
+Date: February 2025
+Contact: Linux on PowerPC Developer List <linuxppc-dev at lists.ozlabs.org>
+Description: Read-only. Attribute group to describe the magic bits
+ that go into perf_event_attr.config for a particular pmu.
+ (See ABI/testing/sysfs-bus-event_source-devices-format).
+
+ Each attribute under this group defines a bit range of the
+		perf_event_attr.config. Supported attributes are listed
+ below::
+
+ event = "config:0-7" - event ID
+
+ For example::
+
+ dtl_cede = "event=0x1"
+
+What: /sys/bus/event_source/devices/vpa_dtl/events
+Date: February 2025
+Contact: Linux on PowerPC Developer List <linuxppc-dev at lists.ozlabs.org>
+Description: (RO) Attribute group to describe performance monitoring events
+ for the Virtual Processor Dispatch Trace Log. Each attribute in
+ this group describes a single performance monitoring event
+ supported by vpa_dtl pmu. The name of the file is the name of
+ the event (See ABI/testing/sysfs-bus-event_source-devices-events).
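As a quick sketch of how the attributes above can be inspected (dtl_cede is
the example event named in this entry)::

    $ cat /sys/bus/event_source/devices/vpa_dtl/events/dtl_cede
    event=0x1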
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index ab8cd337f43a..8aed6d94c4cd 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -586,6 +586,7 @@ What: /sys/devices/system/cpu/vulnerabilities
/sys/devices/system/cpu/vulnerabilities/srbds
/sys/devices/system/cpu/vulnerabilities/tsa
/sys/devices/system/cpu/vulnerabilities/tsx_async_abort
+ /sys/devices/system/cpu/vulnerabilities/vmscape
Date: January 2018
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
Description: Information about CPU vulnerabilities
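For example, on a processor that is not vulnerable, reading the new entry
reports one of the status strings documented for it::

    $ cat /sys/devices/system/cpu/vulnerabilities/vmscape
    Not affected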
diff --git a/Documentation/admin-guide/blockdev/zoned_loop.rst b/Documentation/admin-guide/blockdev/zoned_loop.rst
index 9c7aa3b482f3..64dcfde7450a 100644
--- a/Documentation/admin-guide/blockdev/zoned_loop.rst
+++ b/Documentation/admin-guide/blockdev/zoned_loop.rst
@@ -79,7 +79,7 @@ zone_capacity_mb Device zone capacity (must always be equal to or lower than
the zone size. Default: zone size.
conv_zones Total number of conventioanl zones starting from sector 0.
Default: 8.
-base_dir Path to the base directoy where to create the directory
+base_dir Path to the base directory where to create the directory
containing the zone files of the device.
Default=/var/local/zloop.
The device directory containing the zone files is always
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index d9d3cc7df348..0e6c67ac585a 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -15,6 +15,9 @@ v1 is available under :ref:`Documentation/admin-guide/cgroup-v1/index.rst <cgrou
.. CONTENTS
+ [Whenever any new section is added to this document, please also add
+ an entry here.]
+
1. Introduction
1-1. Terminology
1-2. What is cgroup?
@@ -25,9 +28,10 @@ v1 is available under :ref:`Documentation/admin-guide/cgroup-v1/index.rst <cgrou
2-2-2. Threads
2-3. [Un]populated Notification
2-4. Controlling Controllers
- 2-4-1. Enabling and Disabling
- 2-4-2. Top-down Constraint
- 2-4-3. No Internal Process Constraint
+ 2-4-1. Availability
+ 2-4-2. Enabling and Disabling
+ 2-4-3. Top-down Constraint
+ 2-4-4. No Internal Process Constraint
2-5. Delegation
2-5-1. Model of Delegation
2-5-2. Delegation Containment
@@ -61,14 +65,15 @@ v1 is available under :ref:`Documentation/admin-guide/cgroup-v1/index.rst <cgrou
5-4-1. PID Interface Files
5-5. Cpuset
5.5-1. Cpuset Interface Files
- 5-6. Device
+ 5-6. Device controller
5-7. RDMA
5-7-1. RDMA Interface Files
5-8. DMEM
+ 5-8-1. DMEM Interface Files
5-9. HugeTLB
5.9-1. HugeTLB Interface Files
5-10. Misc
- 5.10-1 Miscellaneous cgroup Interface Files
+ 5.10-1 Misc Interface Files
5.10-2 Migration and Ownership
5-11. Others
5-11-1. perf_event
@@ -435,8 +440,8 @@ both cgroups.
Controlling Controllers
-----------------------
-Availablity
-~~~~~~~~~~~
+Availability
+~~~~~~~~~~~~
A controller is available in a cgroup when it is supported by the kernel (i.e.,
compiled in, not disabled and not attached to a v1 hierarchy) and listed in the
@@ -1001,6 +1006,24 @@ All cgroup core files are prefixed with "cgroup."
Total number of dying cgroup subsystems (e.g. memory
cgroup) at and beneath the current cgroup.
+ cgroup.stat.local
+ A read-only flat-keyed file which exists in non-root cgroups.
+ The following entry is defined:
+
+ frozen_usec
+ Cumulative time that this cgroup has spent between freezing and
+ thawing, regardless of whether by self or ancestor groups.
+ NB: (not) reaching "frozen" state is not accounted here.
+
+ Using the following ASCII representation of a cgroup's freezer
+ state, ::
+
+ 1 _____
+ frozen 0 __/ \__
+ ab cd
+
+ the duration being measured is the span between a and c.
+
cgroup.freeze
A read-write single value file which exists on non-root cgroups.
Allowed values are "0" and "1". The default is "0".
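As a sketch, assuming a child cgroup named 'test' under the usual v2 mount
point, the new statistic can be read directly::

    $ cat /sys/fs/cgroup/test/cgroup.stat.local
    frozen_usec 0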
diff --git a/Documentation/admin-guide/hw-vuln/attack_vector_controls.rst b/Documentation/admin-guide/hw-vuln/attack_vector_controls.rst
index b4de16f5ec44..d0bdbd81dcf9 100644
--- a/Documentation/admin-guide/hw-vuln/attack_vector_controls.rst
+++ b/Documentation/admin-guide/hw-vuln/attack_vector_controls.rst
@@ -214,10 +214,11 @@ Spectre_v1 X
Spectre_v2 X X
Spectre_v2_user X X * (Note 1)
SRBDS X X X X
-SRSO X X
-SSB (Note 4)
+SRSO X X X X
+SSB X
TAA X X X X * (Note 2)
TSA X X X X
+VMSCAPE X
=============== ============== ============ ============= ============== ============ ========
Notes:
@@ -229,9 +230,6 @@ Notes:
3 -- Disables SMT if cross-thread mitigations are fully enabled, the CPU is
vulnerable, and STIBP is not supported
- 4 -- Speculative store bypass is always enabled by default (no kernel
- mitigation applied) unless overridden with spec_store_bypass_disable option
-
When an attack-vector is disabled, all mitigations for the vulnerabilities
listed in the above table are disabled, unless mitigation is required for a
different enabled attack-vector or a mitigation is explicitly selected via a
diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst
index 89ca636081b7..55d747511f83 100644
--- a/Documentation/admin-guide/hw-vuln/index.rst
+++ b/Documentation/admin-guide/hw-vuln/index.rst
@@ -26,3 +26,4 @@ are configurable at compile, boot or run time.
rsb
old_microcode
indirect-target-selection
+ vmscape
diff --git a/Documentation/admin-guide/hw-vuln/vmscape.rst b/Documentation/admin-guide/hw-vuln/vmscape.rst
new file mode 100644
index 000000000000..d9b9a2b6c114
--- /dev/null
+++ b/Documentation/admin-guide/hw-vuln/vmscape.rst
@@ -0,0 +1,110 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+VMSCAPE
+=======
+
+VMSCAPE is a vulnerability that may allow a guest to influence the branch
+prediction in host userspace. It particularly affects hypervisors like QEMU.
+
+Even if a hypervisor does not hold sensitive data such as disk encryption
+keys, guest userspace may be able to attack the guest kernel using the
+hypervisor as a confused deputy.
+
+Affected processors
+-------------------
+
+The following CPU families are affected by VMSCAPE:
+
+**Intel processors:**
+ - Skylake generation (Parts without Enhanced-IBRS)
+ - Cascade Lake generation - (Parts affected by ITS guest/host separation)
+ - Alder Lake and newer (Parts affected by BHI)
+
+Note that BHI-affected parts that use the BHB clearing software mitigation,
+e.g. Icelake, are not vulnerable to VMSCAPE.
+
+**AMD processors:**
+ - Zen series (families 0x17, 0x19, 0x1a)
+
+**Hygon processors:**
+ - Family 0x18
+
+Mitigation
+----------
+
+Conditional IBPB
+----------------
+
+The kernel tracks when a CPU has run a potentially malicious guest and issues an
+IBPB before the first exit to userspace after VM-exit. If userspace did not run
+between VM-exit and the next VM-entry, no IBPB is issued.
+
+Note that the existing userspace mitigations against Spectre-v2 are effective
+in protecting userspace. They are insufficient to protect the userspace VMMs
+from a malicious guest. This is because Spectre-v2 mitigations are applied at
+context switch time, while the userspace VMM can run after a VM-exit without a
+context switch.
+
+Vulnerability enumeration and mitigation is not applied inside a guest. This is
+because nested hypervisors should already be deploying IBPB to isolate
+themselves from nested guests.
+
+SMT considerations
+------------------
+
+When Simultaneous Multi-Threading (SMT) is enabled, hypervisors can be
+vulnerable to cross-thread attacks. For complete protection against VMSCAPE
+attacks in SMT environments, STIBP should be enabled.
+
+The kernel will issue a warning if SMT is enabled without adequate STIBP
+protection. The warning is not issued when:
+
+- SMT is disabled
+- STIBP is enabled system-wide
+- Intel eIBRS is enabled (which implies STIBP protection)
+
+System information and options
+------------------------------
+
+The sysfs file showing VMSCAPE mitigation status is:
+
+ /sys/devices/system/cpu/vulnerabilities/vmscape
+
+The possible values in this file are:
+
+ * 'Not affected':
+
+ The processor is not vulnerable to VMSCAPE attacks.
+
+ * 'Vulnerable':
+
+ The processor is vulnerable and no mitigation has been applied.
+
+ * 'Mitigation: IBPB before exit to userspace':
+
+ Conditional IBPB mitigation is enabled. The kernel tracks when a CPU has
+ run a potentially malicious guest and issues an IBPB before the first
+ exit to userspace after VM-exit.
+
+ * 'Mitigation: IBPB on VMEXIT':
+
+ IBPB is issued on every VM-exit. This occurs when other mitigations like
+ RETBLEED or SRSO are already issuing IBPB on VM-exit.
+
+Mitigation control on the kernel command line
+----------------------------------------------
+
+The mitigation can be controlled via the ``vmscape=`` command line parameter:
+
+ * ``vmscape=off``:
+
+ Disable the VMSCAPE mitigation.
+
+ * ``vmscape=ibpb``:
+
+ Enable conditional IBPB mitigation (default when CONFIG_MITIGATION_VMSCAPE=y).
+
+ * ``vmscape=force``:
+
+ Force vulnerability detection and mitigation even on processors that are
+ not known to be affected.
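As an illustration, a host booted with ``vmscape=ibpb`` (the documented
default) should report the conditional mitigation in sysfs::

    $ cat /sys/devices/system/cpu/vulnerabilities/vmscape
    Mitigation: IBPB before exit to userspace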
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 747a55abf494..e019db1633fd 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2606,6 +2606,11 @@
for it. Intended to get systems with badly broken
firmware running.
+ irqhandler.duration_warn_us= [KNL]
+ Warn if an IRQ handler exceeds the specified duration
+ threshold in microseconds. Useful for identifying
+ long-running IRQs in the system.
+
irqpoll [HW]
When an interrupt is not handled search all handlers
for it. Also check all handlers each timer
@@ -3767,8 +3772,16 @@
mga= [HW,DRM]
- microcode.force_minrev= [X86]
- Format: <bool>
+ microcode= [X86] Control the behavior of the microcode loader.
+ Available options, comma separated:
+
+			base_rev=X - where <X> has format: <u32>
+ Set the base microcode revision of each thread when in
+ debug mode.
+
+ dis_ucode_ldr: disable the microcode loader
+
+ force_minrev:
Enable or disable the microcode minimal revision
enforcement for the runtime microcode loader.
@@ -3829,6 +3842,7 @@
srbds=off [X86,INTEL]
ssbd=force-off [ARM64]
tsx_async_abort=off [X86]
+ vmscape=off [X86]
Exceptions:
This does not have any effect on
@@ -6154,7 +6168,7 @@
rdt= [HW,X86,RDT]
Turn on/off individual RDT features. List is:
cmt, mbmtotal, mbmlocal, l3cat, l3cdp, l2cat, l2cdp,
- mba, smba, bmec.
+ mba, smba, bmec, abmc.
E.g. to turn on cmt and turn off mba use:
rdt=cmt,!mba
@@ -6405,8 +6419,9 @@
rodata= [KNL,EARLY]
on Mark read-only kernel memory as read-only (default).
off Leave read-only kernel memory writable for debugging.
- full Mark read-only kernel memory and aliases as read-only
- [arm64]
+ noalias Mark read-only kernel memory as read-only but retain
+ writable aliases in the direct map for regions outside
+ of the kernel image. [arm64]
rockchip.usb_uart
[EARLY]
@@ -6428,6 +6443,9 @@
rootflags= [KNL] Set root filesystem mount option string
+ initramfs_options= [KNL]
+			Specify mount options for the initramfs mount.
+
rootfstype= [KNL] Set root filesystem type
rootwait [KNL] Wait (indefinitely) for root device to show up.
@@ -8041,6 +8059,16 @@
vmpoff= [KNL,S390] Perform z/VM CP command after power off.
Format: <command>
+ vmscape= [X86] Controls mitigation for VMscape attacks.
+ VMscape attacks can leak information from a userspace
+ hypervisor to a guest via speculative side-channels.
+
+ off - disable the mitigation
+ ibpb - use Indirect Branch Prediction Barrier
+ (IBPB) mitigation (default)
+ force - force vulnerability detection even on
+ unaffected processors
+
vsyscall= [X86-64,EARLY]
Controls the behavior of vsyscalls (i.e. calls to
fixed addresses of 0xffffffffff600x00 from legacy
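As an illustrative, non-authoritative example combining two of the parameters
added above (the 5000 microsecond threshold is an arbitrary value), a kernel
command line could include::

    irqhandler.duration_warn_us=5000 vmscape=off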
diff --git a/Documentation/admin-guide/laptops/lg-laptop.rst b/Documentation/admin-guide/laptops/lg-laptop.rst
index 67fd6932cef4..c4dd534f91ed 100644
--- a/Documentation/admin-guide/laptops/lg-laptop.rst
+++ b/Documentation/admin-guide/laptops/lg-laptop.rst
@@ -48,8 +48,8 @@ This value is reset to 100 when the kernel boots.
Fan mode
--------
-Writing 1/0 to /sys/devices/platform/lg-laptop/fan_mode disables/enables
-the fan silent mode.
+Writing 0/1/2 to /sys/devices/platform/lg-laptop/fan_mode sets fan mode to
+Optimal/Silent/Performance respectively.
USB charge
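For example, selecting the Performance mode described above (value 2 per the
mapping in the text)::

    # echo 2 > /sys/devices/platform/lg-laptop/fan_mode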
diff --git a/Documentation/admin-guide/perf/dwc_pcie_pmu.rst b/Documentation/admin-guide/perf/dwc_pcie_pmu.rst
index cb376f335f40..167f9281fbf5 100644
--- a/Documentation/admin-guide/perf/dwc_pcie_pmu.rst
+++ b/Documentation/admin-guide/perf/dwc_pcie_pmu.rst
@@ -16,8 +16,8 @@ provides the following two features:
- one 64-bit counter for Time Based Analysis (RX/TX data throughput and
time spent in each low-power LTSSM state) and
-- one 32-bit counter for Event Counting (error and non-error events for
- a specified lane)
+- one 32-bit counter per event for Event Counting (error and non-error
+ events for a specified lane)
Note: There is no interrupt for counter overflow.
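One way to see which event counters a given root port exposes (the instance
name below is a placeholder for whatever appears on the system) is to list its
sysfs events directory::

    $ ls /sys/bus/event_source/devices/dwc_rootport_3018/events/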
diff --git a/Documentation/admin-guide/perf/fujitsu_uncore_pmu.rst b/Documentation/admin-guide/perf/fujitsu_uncore_pmu.rst
new file mode 100644
index 000000000000..46595b788d3a
--- /dev/null
+++ b/Documentation/admin-guide/perf/fujitsu_uncore_pmu.rst
@@ -0,0 +1,110 @@
+.. SPDX-License-Identifier: GPL-2.0-only
+
+================================================
+Fujitsu Uncore Performance Monitoring Unit (PMU)
+================================================
+
+This driver supports the Uncore MAC PMUs and the Uncore PCI PMUs found
+in Fujitsu chips.
+Each MAC PMU on these chips is exposed as an uncore perf PMU with device name
+mac_iod<iod>_mac<mac>_ch<ch>, and each PCI PMU is exposed as an uncore perf
+PMU with device name pci_iod<iod>_pci<pci>.
+
+The driver provides a description of its available events and configuration
+options in sysfs, see /sys/bus/event_source/devices/mac_iod<iod>_mac<mac>_ch<ch>/
+and /sys/bus/event_source/devices/pci_iod<iod>_pci<pci>/.
+This driver exports:
+- formats, used by perf user space and other tools to configure events
+- events, used by perf user space and other tools to create events
+ symbolically, e.g.:
+ perf stat -a -e mac_iod0_mac0_ch0/event=0x21/ ls
+ perf stat -a -e pci_iod0_pci0/event=0x24/ ls
+- cpumask, used by perf user space and other tools to know on which CPUs
+ to open the events
+
+This driver supports the following events for MAC:
+- cycles
+ This event counts MAC cycles at MAC frequency.
+- read-count
+ This event counts the number of read requests to MAC.
+- read-count-request
+ This event counts the number of read requests including retry to MAC.
+- read-count-return
+ This event counts the number of responses to read requests to MAC.
+- read-count-request-pftgt
+ This event counts the number of read requests including retry with PFTGT
+ flag.
+- read-count-request-normal
+ This event counts the number of read requests including retry without PFTGT
+ flag.
+- read-count-return-pftgt-hit
+ This event counts the number of responses to read requests which hit the
+ PFTGT buffer.
+- read-count-return-pftgt-miss
+ This event counts the number of responses to read requests which miss the
+ PFTGT buffer.
+- read-wait
+ This event counts outstanding read requests issued by DDR memory controller
+ per cycle.
+- write-count
+ This event counts the number of write requests to MAC (including zero write,
+ full write, partial write, write cancel).
+- write-count-write
+ This event counts the number of full write requests to MAC (not including
+ zero write).
+- write-count-pwrite
+ This event counts the number of partial write requests to MAC.
+- memory-read-count
+ This event counts the number of read requests from MAC to memory.
+- memory-write-count
+ This event counts the number of full write requests from MAC to memory.
+- memory-pwrite-count
+ This event counts the number of partial write requests from MAC to memory.
+- ea-mac
+ This event counts energy consumption of MAC.
+- ea-memory
+ This event counts energy consumption of memory.
+- ea-memory-mac-write
+ This event counts the number of write requests from MAC to memory.
+- ea-ha
+ This event counts energy consumption of HA.
+
+ 'ea' is the abbreviation for 'Energy Analyzer'.
+
+Examples for use with perf::
+
+ perf stat -e mac_iod0_mac0_ch0/ea-mac/ ls
+
+And, this driver supports the following events for PCI:
+- pci-port0-cycles
+ This event counts PCI cycles at PCI frequency in port0.
+- pci-port0-read-count
+ This event counts read transactions for data transfer in port0.
+- pci-port0-read-count-bus
+ This event counts read transactions for bus usage in port0.
+- pci-port0-write-count
+ This event counts write transactions for data transfer in port0.
+- pci-port0-write-count-bus
+ This event counts write transactions for bus usage in port0.
+- pci-port1-cycles
+ This event counts PCI cycles at PCI frequency in port1.
+- pci-port1-read-count
+ This event counts read transactions for data transfer in port1.
+- pci-port1-read-count-bus
+ This event counts read transactions for bus usage in port1.
+- pci-port1-write-count
+ This event counts write transactions for data transfer in port1.
+- pci-port1-write-count-bus
+ This event counts write transactions for bus usage in port1.
+- ea-pci
+ This event counts energy consumption of PCI.
+
+ 'ea' is the abbreviation for 'Energy Analyzer'.
+
+Examples for use with perf::
+
+ perf stat -e pci_iod0_pci0/ea-pci/ ls
+
+Given that these are uncore PMUs, the driver does not support sampling;
+therefore "perf record" will not work. Per-task perf sessions are not supported.
diff --git a/Documentation/admin-guide/perf/hisi-pmu.rst b/Documentation/admin-guide/perf/hisi-pmu.rst
index 48992a0b8e94..c4c2cbbf88cb 100644
--- a/Documentation/admin-guide/perf/hisi-pmu.rst
+++ b/Documentation/admin-guide/perf/hisi-pmu.rst
@@ -18,9 +18,10 @@ HiSilicon SoC uncore PMU driver
Each device PMU has separate registers for event counting, control and
interrupt, and the PMU driver shall register perf PMU drivers like L3C,
HHA and DDRC etc. The available events and configuration options shall
-be described in the sysfs, see:
+be described in the sysfs, see::
+
+/sys/bus/event_source/devices/hisi_sccl{X}_<l3c{Y}/hha{Y}/ddrc{Y}>
-/sys/bus/event_source/devices/hisi_sccl{X}_<l3c{Y}/hha{Y}/ddrc{Y}>.
The "perf list" command shall list the available events from sysfs.
Each L3C, HHA and DDRC is registered as a separate PMU with perf. The PMU
@@ -112,6 +113,50 @@ uring channel. It is 2 bits. Some important codes are as follows:
- 2'b00: default value, count the events which sent to the both uring and
uring_ext channel;
+6. ch: The NoC PMU supports filtering the event counts of a certain transaction
+channel with this option. The currently supported channels are as follows:
+
+- 3'b010: Request channel
+- 3'b100: Snoop channel
+- 3'b110: Response channel
+- 3'b111: Data channel
+
+7. tt_en: NoC PMU supports counting only transactions that have tracetag set
+if this option is set. See the 2nd list for more information about tracetag.
+
+For HiSilicon uncore PMU v3, whose identifier is 0x40, some uncore PMUs are
+further divided into parts for finer-grained tracing; each part has its own
+dedicated PMU, and all such PMUs together cover the monitoring of events on a
+particular uncore device. Such PMUs are described in sysfs with a slightly
+changed name format::
+
+/sys/bus/event_source/devices/hisi_sccl{X}_<l3c{Y}_{Z}/ddrc{Y}_{Z}/noc{Y}_{Z}>
+
+Z is the sub-id, indicating different PMUs for part of hardware device.
+
+Usage of most PMUs with different sub-ids is identical. Specifically, the L3C
+PMU provides an ``ext`` option to allow exploration of even finer-grained L3C
+statistics. The L3C PMU driver uses that as a hint of termination when
+delivering the perf command to hardware:
+
+- ext=0: Default, could be used with event names.
+- ext=1 and ext=2: Must be used with event codes; event names are not supported.
+
+An example of perf command could be::
+
+ $# perf stat -a -e hisi_sccl0_l3c1_0/rd_spipe/ sleep 5
+
+or::
+
+ $# perf stat -a -e hisi_sccl0_l3c1_0/event=0x1,ext=1/ sleep 5
+
+As above, ``hisi_sccl0_l3c1_0`` locates PMU of Super CPU CLuster 0, L3 cache 1
+pipe0.
+
+The first command locates the first part of the L3C since ``ext=0`` is implied
+by default. The second command issues the counting on another part of the L3C
+with event ``0x1``.
+
Users could configure IDs to count data come from specific CCL/ICL, by setting
srcid_cmd & srcid_msk, and data desitined for specific CCL/ICL by setting
tgtid_cmd & tgtid_msk. A set bit in srcid_msk/tgtid_msk means the PMU will not
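As a sketch built from the options described above (the event code and filter
values are illustrative only), counting only request-channel transactions that
have the tracetag set on a NoC PMU could look like::

    $ perf stat -a -e hisi_sccl0_noc0_0/event=0x1,ch=0x2,tt_en=1/ sleep 5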
diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst
index 072b510385c4..47d9a3df6329 100644
--- a/Documentation/admin-guide/perf/index.rst
+++ b/Documentation/admin-guide/perf/index.rst
@@ -29,3 +29,4 @@ Performance monitor support
cxl
ampere_cspmu
mrvl-pem-pmu
+ fujitsu_uncore_pmu
diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst
index a18328a5fb93..c85cd327af28 100644
--- a/Documentation/admin-guide/xfs.rst
+++ b/Documentation/admin-guide/xfs.rst
@@ -34,22 +34,6 @@ When mounting an XFS filesystem, the following options are accepted.
to the file. Specifying a fixed ``allocsize`` value turns off
the dynamic behaviour.
- attr2 or noattr2
- The options enable/disable an "opportunistic" improvement to
- be made in the way inline extended attributes are stored
- on-disk. When the new form is used for the first time when
- ``attr2`` is selected (either when setting or removing extended
- attributes) the on-disk superblock feature bit field will be
- updated to reflect this format being in use.
-
- The default behaviour is determined by the on-disk feature
- bit indicating that ``attr2`` behaviour is active. If either
- mount option is set, then that becomes the new default used
- by the filesystem.
-
- CRC enabled filesystems always use the ``attr2`` format, and so
- will reject the ``noattr2`` mount option if it is set.
-
discard or nodiscard (default)
Enable/disable the issuing of commands to let the block
device reclaim space freed by the filesystem. This is
@@ -75,12 +59,6 @@ When mounting an XFS filesystem, the following options are accepted.
across the entire filesystem rather than just on directories
configured to use it.
- ikeep or noikeep (default)
- When ``ikeep`` is specified, XFS does not delete empty inode
- clusters and keeps them around on disk. When ``noikeep`` is
- specified, empty inode clusters are returned to the free
- space pool.
-
inode32 or inode64 (default)
When ``inode32`` is specified, it indicates that XFS limits
inode creation to locations which will not result in inode
@@ -253,9 +231,8 @@ latest version and try again.
The deprecation will take place in two parts. Support for mounting V4
filesystems can now be disabled at kernel build time via Kconfig option.
-The option will default to yes until September 2025, at which time it
-will be changed to default to no. In September 2030, support will be
-removed from the codebase entirely.
+These options were changed to default to no in September 2025. In
+September 2030, support will be removed from the codebase entirely.
Note: Distributors may choose to withdraw V4 format support earlier than
the dates listed above.
@@ -268,8 +245,6 @@ Deprecated Mount Options
============================ ================
Mounting with V4 filesystem September 2030
Mounting ascii-ci filesystem September 2030
-ikeep/noikeep September 2025
-attr2/noattr2 September 2025
============================ ================
@@ -285,6 +260,8 @@ Removed Mount Options
osyncisdsync/osyncisosync v4.0
barrier v4.19
nobarrier v4.19
+ ikeep/noikeep v6.18
+ attr2/noattr2 v6.18
=========================== =======
sysctls
@@ -312,9 +289,6 @@ The following sysctls are available for the XFS filesystem:
removes unused preallocation from clean inodes and releases
the unused space back to the free pool.
- fs.xfs.speculative_cow_prealloc_lifetime
- This is an alias for speculative_prealloc_lifetime.
-
fs.xfs.error_level (Min: 0 Default: 3 Max: 11)
A volume knob for error reporting when internal errors occur.
This will generate detailed messages & backtraces for filesystem
@@ -341,17 +315,6 @@ The following sysctls are available for the XFS filesystem:
This option is intended for debugging only.
- fs.xfs.irix_symlink_mode (Min: 0 Default: 0 Max: 1)
- Controls whether symlinks are created with mode 0777 (default)
- or whether their mode is affected by the umask (irix mode).
-
- fs.xfs.irix_sgid_inherit (Min: 0 Default: 0 Max: 1)
- Controls files created in SGID directories.
- If the group ID of the new file does not match the effective group
- ID or one of the supplementary group IDs of the parent dir, the
- ISGID bit is cleared if the irix_sgid_inherit compatibility sysctl
- is set.
-
fs.xfs.inherit_sync (Min: 0 Default: 1 Max: 1)
Setting this to "1" will cause the "sync" flag set
by the **xfs_io(8)** chattr command on a directory to be
@@ -387,24 +350,20 @@ The following sysctls are available for the XFS filesystem:
Deprecated Sysctls
==================
-=========================================== ================
- Name Removal Schedule
-=========================================== ================
-fs.xfs.irix_sgid_inherit September 2025
-fs.xfs.irix_symlink_mode September 2025
-fs.xfs.speculative_cow_prealloc_lifetime September 2025
-=========================================== ================
-
+None currently.
Removed Sysctls
===============
-============================= =======
- Name Removed
-============================= =======
- fs.xfs.xfsbufd_centisec v4.0
- fs.xfs.age_buffer_centisecs v4.0
-============================= =======
+========================================== =======
+ Name Removed
+========================================== =======
+ fs.xfs.xfsbufd_centisec v4.0
+ fs.xfs.age_buffer_centisecs v4.0
+ fs.xfs.irix_symlink_mode v6.18
+ fs.xfs.irix_sgid_inherit v6.18
+ fs.xfs.speculative_cow_prealloc_lifetime v6.18
+========================================== =======
Error handling
==============
diff --git a/Documentation/arch/arm64/booting.rst b/Documentation/arch/arm64/booting.rst
index 2f666a7c303c..e4f953839f71 100644
--- a/Documentation/arch/arm64/booting.rst
+++ b/Documentation/arch/arm64/booting.rst
@@ -466,6 +466,17 @@ Before jumping into the kernel, the following conditions must be met:
- HDFGWTR2_EL2.nPMICFILTR_EL0 (bit 3) must be initialised to 0b1.
- HDFGWTR2_EL2.nPMUACR_EL1 (bit 4) must be initialised to 0b1.
+ For CPUs with SPE data source filtering (FEAT_SPE_FDS):
+
+ - If EL3 is present:
+
+ - MDCR_EL3.EnPMS3 (bit 42) must be initialised to 0b1.
+
+ - If the kernel is entered at EL1 and EL2 is present:
+
+ - HDFGRTR2_EL2.nPMSDSFR_EL1 (bit 19) must be initialised to 0b1.
+ - HDFGWTR2_EL2.nPMSDSFR_EL1 (bit 19) must be initialised to 0b1.
+
For CPUs with Memory Copy and Memory Set instructions (FEAT_MOPS):
- If the kernel is entered at EL1 and EL2 is present:
diff --git a/Documentation/arch/arm64/elf_hwcaps.rst b/Documentation/arch/arm64/elf_hwcaps.rst
index f58ada4d6cb2..a15df4956849 100644
--- a/Documentation/arch/arm64/elf_hwcaps.rst
+++ b/Documentation/arch/arm64/elf_hwcaps.rst
@@ -441,6 +441,10 @@ HWCAP3_MTE_FAR
HWCAP3_MTE_STORE_ONLY
Functionality implied by ID_AA64PFR2_EL1.MTESTOREONLY == 0b0001.
+HWCAP3_LSFE
+ Functionality implied by ID_AA64ISAR3_EL1.LSFE == 0b0001
+
+
4. Unused AT_HWCAP bits
-----------------------
diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst
index b18ef4064bc0..a7ec57060f64 100644
--- a/Documentation/arch/arm64/silicon-errata.rst
+++ b/Documentation/arch/arm64/silicon-errata.rst
@@ -200,6 +200,8 @@ stable kernels.
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Neoverse-V3 | #3312417 | ARM64_ERRATUM_3194386 |
+----------------+-----------------+-----------------+-----------------------------+
+| ARM | Neoverse-V3AE | #3312417 | ARM64_ERRATUM_3194386 |
++----------------+-----------------+-----------------+-----------------------------+
| ARM | MMU-500 | #841119,826419 | ARM_SMMU_MMU_500_CPRE_ERRATA|
| | | #562869,1047329 | |
+----------------+-----------------+-----------------+-----------------------------+
diff --git a/Documentation/arch/arm64/sme.rst b/Documentation/arch/arm64/sme.rst
index 4cb38330e704..583f2ee9cb97 100644
--- a/Documentation/arch/arm64/sme.rst
+++ b/Documentation/arch/arm64/sme.rst
@@ -81,17 +81,7 @@ The ZA matrix is square with each side having as many bytes as a streaming
mode SVE vector.
-3. Sharing of streaming and non-streaming mode SVE state
----------------------------------------------------------
-
-It is implementation defined which if any parts of the SVE state are shared
-between streaming and non-streaming modes. When switching between modes
-via software interfaces such as ptrace if no register content is provided as
-part of switching no state will be assumed to be shared and everything will
-be zeroed.
-
-
-4. System call behaviour
+3. System call behaviour
-------------------------
* On syscall PSTATE.ZA is preserved, if PSTATE.ZA==1 then the contents of the
@@ -112,7 +102,7 @@ be zeroed.
exceptions for execve() described in section 6.
-5. Signal handling
+4. Signal handling
-------------------
* Signal handlers are invoked with PSTATE.SM=0, PSTATE.ZA=0, and TPIDR2_EL0=0.
diff --git a/Documentation/arch/powerpc/index.rst b/Documentation/arch/powerpc/index.rst
index 53fc9f89f3e4..1be2ee3f0361 100644
--- a/Documentation/arch/powerpc/index.rst
+++ b/Documentation/arch/powerpc/index.rst
@@ -37,6 +37,7 @@ powerpc
vas-api
vcpudispatch_stats
vmemmap_dedup
+ vpa-dtl
features
diff --git a/Documentation/arch/powerpc/vpa-dtl.rst b/Documentation/arch/powerpc/vpa-dtl.rst
new file mode 100644
index 000000000000..58d0022f993a
--- /dev/null
+++ b/Documentation/arch/powerpc/vpa-dtl.rst
@@ -0,0 +1,156 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. _vpa-dtl:
+
+===================================
+DTL (Dispatch Trace Log)
+===================================
+
+Athira Rajeev, 19 April 2025
+
+.. contents::
+ :depth: 3
+
+
+Basic overview
+==============
+
+The pseries Shared Processor Logical Partition (SPLPAR) machines can
+retrieve a log of dispatch and preempt events from the hypervisor
+using data from the Dispatch Trace Log (DTL) buffer. With this information,
+the user can see when and why each dispatch and preempt occurred.
+The vpa-dtl PMU exposes the Virtual Processor Area (VPA) DTL counters
+via perf.
+
+Infrastructure used
+===================
+
+The VPA DTL PMU counters do not interrupt on overflow or generate any
+PMI interrupts. Therefore, an hrtimer is used to poll the DTL data. The
+timer interval can be provided by the user via the sample_period field,
+in nanoseconds. The vpa-dtl PMU adds one hrtimer per vpa-dtl PMU thread.
+The DTL (Dispatch Trace Log) contains information about dispatch/preempt
+events, enqueue time, etc. The DTL buffer data is copied directly into the
+auxiliary buffer and processed later; this avoids the time taken to create
+samples in kernel space. The PMU driver collecting Dispatch Trace Log (DTL)
+entries makes use of the AUX support in the perf infrastructure. On the tools
+side, this data is made available as PERF_RECORD_AUXTRACE records.
+
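+A minimal sketch of this polling scheme is shown below. It is illustrative
+only and not the driver's actual code; ``vpa_dtl_copy_to_aux()`` and
+``sample_period_ns`` are hypothetical names standing in for the driver's
+AUX-copy helper and the user-supplied period.
+
+.. code-block:: c
+
+  #include <linux/hrtimer.h>
+  #include <linux/ktime.h>
+
+  static u64 sample_period_ns;  /* hypothetical: user-supplied sample_period */
+
+  static void vpa_dtl_copy_to_aux(struct hrtimer *timer);  /* hypothetical */
+
+  static enum hrtimer_restart vpa_dtl_poll(struct hrtimer *timer)
+  {
+          /* copy newly written DTL entries into the perf AUX buffer */
+          vpa_dtl_copy_to_aux(timer);
+
+          /* re-arm the timer for the next polling interval */
+          hrtimer_forward_now(timer, ns_to_ktime(sample_period_ns));
+          return HRTIMER_RESTART;
+  }
+
+  static void vpa_dtl_start_polling(struct hrtimer *timer)
+  {
+          hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+          timer->function = vpa_dtl_poll;
+          hrtimer_start(timer, ns_to_ktime(sample_period_ns), HRTIMER_MODE_REL);
+  }
+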
+To correlate each DTL entry with other events across CPUs, an auxtrace_queue
+is created for each CPU. Each auxtrace queue has an array/list of auxtrace
+buffers, and all auxtrace queues are maintained in an auxtrace heap sorted by
+timestamp. When the different PERF_RECORD_XX records are processed, the
+timestamp of the perf record is compared with the timestamp of the top element
+of the auxtrace heap so that DTL events can be correlated with other events.
+The auxtrace queue is processed if the timestamp of the element from the heap
+is lower than the timestamp of the entry in the perf record. Sometimes a
+buffer is only partially processed: if the timestamp of another event is
+greater than that of the element currently being processed, processing moves
+on to the next perf record. The position within the buffer is therefore
+tracked so that processing can continue next time, and the timestamp of the
+auxtrace heap is updated with the timestamp of the last processed entry from
+the auxtrace buffer.
+
+This infrastructure ensures dispatch trace log entries can be correlated
+and presented along with other events like sched.
+
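+The ordering described above can be summarised with the pseudocode sketch
+below. It is not the actual perf tool code; all helper names
+(``heap_top_timestamp()``, ``heap_pop()``, ``process_entries_up_to()``,
+``heap_push()``) and types are hypothetical.
+
+.. code-block:: c
+
+  /* Illustrative only: drain auxtrace queues whose next DTL entry is older
+   * than the perf record currently being processed.
+   */
+  static void process_queues_before(struct auxtrace_heap *heap, u64 record_ts)
+  {
+          while (!heap_empty(heap) &&
+                 heap_top_timestamp(heap) < record_ts) {
+                  struct vpa_dtl_queue *q = heap_pop(heap);
+
+                  /* resume from the last partially processed buffer position */
+                  process_entries_up_to(q, record_ts);
+
+                  /* re-queue with the timestamp of the last processed entry */
+                  heap_push(heap, q, q->last_timestamp);
+          }
+  }
+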
+vpa-dtl PMU example usage
+=========================
+
+.. code-block:: sh
+
+ # ls /sys/devices/vpa_dtl/
+ events format perf_event_mux_interval_ms power subsystem type uevent
+
+
+To capture the DTL data using perf record:
+
+.. code-block:: sh
+
+ # ./perf record -a -e sched:\*,vpa_dtl/dtl_all/ -c 1000000000 sleep 1
+
+The result can be interpreted using perf report. The raw records can be
+dumped with:
+
+.. code-block:: sh
+
+ # ./perf report -D
+
+There are different PERF_RECORD_XX records. Among them, the records
+corresponding to auxtrace buffers include:
+
+1. PERF_RECORD_AUX
+ Conveys that new data is available in AUX area
+
+2. PERF_RECORD_AUXTRACE_INFO
+ Describes offset and size of auxtrace data in the buffers
+
+3. PERF_RECORD_AUXTRACE
+ This is the record that carries the auxtrace data itself, which for the
+ vpa-dtl PMU is the dispatch trace log data.
+
+Snippet from perf report -D showing the PERF_RECORD_AUXTRACE dump
+
+.. code-block:: sh
+
+0 0 0x39b10 [0x30]: PERF_RECORD_AUXTRACE size: 0x690 offset: 0 ref: 0 idx: 0 tid: -1 cpu: 0
+.
+. ... VPA DTL PMU data: size 1680 bytes, entries is 35
+. 00000000: boot_tb: 21349649546353231, tb_freq: 512000000
+. 00000030: dispatch_reason:decrementer interrupt, preempt_reason:H_CEDE, enqueue_to_dispatch_time:7064, ready_to_enqueue_time:187, waiting_to_ready_time:6611773
+. 00000060: dispatch_reason:priv doorbell, preempt_reason:H_CEDE, enqueue_to_dispatch_time:146, ready_to_enqueue_time:0, waiting_to_ready_time:15359437
+. 00000090: dispatch_reason:decrementer interrupt, preempt_reason:H_CEDE, enqueue_to_dispatch_time:4868, ready_to_enqueue_time:232, waiting_to_ready_time:5100709
+. 000000c0: dispatch_reason:priv doorbell, preempt_reason:H_CEDE, enqueue_to_dispatch_time:179, ready_to_enqueue_time:0, waiting_to_ready_time:30714243
+. 000000f0: dispatch_reason:priv doorbell, preempt_reason:H_CEDE, enqueue_to_dispatch_time:197, ready_to_enqueue_time:0, waiting_to_ready_time:15350648
+. 00000120: dispatch_reason:priv doorbell, preempt_reason:H_CEDE, enqueue_to_dispatch_time:213, ready_to_enqueue_time:0, waiting_to_ready_time:15353446
+. 00000150: dispatch_reason:priv doorbell, preempt_reason:H_CEDE, enqueue_to_dispatch_time:212, ready_to_enqueue_time:0, waiting_to_ready_time:15355126
+. 00000180: dispatch_reason:decrementer interrupt, preempt_reason:H_CEDE, enqueue_to_dispatch_time:6368, ready_to_enqueue_time:164, waiting_to_ready_time:5104665
+
+The above is a representation of a DTL entry, which has the following format:
+
+.. code-block:: c
+
+  struct dtl_entry {
+          u8      dispatch_reason;
+          u8      preempt_reason;
+          u16     processor_id;
+          u32     enqueue_to_dispatch_time;
+          u32     ready_to_enqueue_time;
+          u32     waiting_to_ready_time;
+          u64     timebase;
+          u64     fault_addr;
+          u64     srr0;
+          u64     srr1;
+  };
+
+The first two fields represent the dispatch reason and the preempt reason. The
+post-processing of PERF_RECORD_AUXTRACE records translates them into
+meaningful data for the user to consume.
+
+Visualize the dispatch trace log entries with perf report
+=========================================================
+
+.. code-block:: sh
+
+ # ./perf record -a -e sched:*,vpa_dtl/dtl_all/ -c 1000000000 sleep 1
+ [ perf record: Woken up 1 times to write data ]
+ [ perf record: Captured and wrote 0.300 MB perf.data ]
+
+ # ./perf report
+ # Samples: 321 of event 'vpa-dtl'
+ # Event count (approx.): 321
+ #
+ # Children Self Command Shared Object Symbol
+ # ........ ........ ....... ................. ..............................
+ #
+ 100.00% 100.00% swapper [kernel.kallsyms] [k] plpar_hcall_norets_notrace
+
+Visualize the dispatch trace log entries with perf script
+=========================================================
+
+.. code-block:: sh
+
+ # ./perf script
+ migration/9 67 [009] 105373.359903: sched:sched_waking: comm=perf pid=13418 prio=120 target_cpu=009
+ migration/9 67 [009] 105373.359904: sched:sched_migrate_task: comm=perf pid=13418 prio=120 orig_cpu=9 dest_cpu=10
+ migration/9 67 [009] 105373.359907: sched:sched_stat_runtime: comm=migration/9 pid=67 runtime=4050 [ns]
+ migration/9 67 [009] 105373.359908: sched:sched_switch: prev_comm=migration/9 prev_pid=67 prev_prio=0 prev_state=S ==> next_comm=swapper/9 next_pid=0 next_prio=120
+ :256 256 [016] 105373.359913: vpa-dtl: timebase: 21403600706628832 dispatch_reason:decrementer interrupt, preempt_reason:H_CEDE, enqueue_to_dispatch_time:4854, ready_to_enqueue_time:139, waiting_to_ready_time:511842115 c0000000000fcd28 plpar_hcall_norets_notrace+0x18 ([kernel.kallsyms])
+ :256 256 [017] 105373.360012: vpa-dtl: timebase: 21403600706679454 dispatch_reason:priv doorbell, preempt_reason:H_CEDE, enqueue_to_dispatch_time:236, ready_to_enqueue_time:0, waiting_to_ready_time:133864583 c0000000000fcd28 plpar_hcall_norets_notrace+0x18 ([kernel.kallsyms])
+ perf 13418 [010] 105373.360048: sched:sched_stat_runtime: comm=perf pid=13418 runtime=139748 [ns]
+ perf 13418 [010] 105373.360052: sched:sched_waking: comm=migration/10 pid=72 prio=0 target_cpu=010
diff --git a/Documentation/arch/riscv/hwprobe.rst b/Documentation/arch/riscv/hwprobe.rst
index 2aa9be272d5d..2f449c9b15bd 100644
--- a/Documentation/arch/riscv/hwprobe.rst
+++ b/Documentation/arch/riscv/hwprobe.rst
@@ -327,6 +327,15 @@ The following keys are defined:
* :c:macro:`RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED`: Misaligned vector accesses are
not supported at all and will generate a misaligned address fault.
+* :c:macro:`RISCV_HWPROBE_KEY_VENDOR_EXT_MIPS_0`: A bitmask containing the
+ mips vendor extensions that are compatible with the
+ :c:macro:`RISCV_HWPROBE_BASE_BEHAVIOR_IMA`: base system behavior.
+
+ * MIPS
+
+ * :c:macro:`RISCV_HWPROBE_VENDOR_EXT_XMIPSEXECTL`: The xmipsexectl vendor
+ extension is supported in the MIPS ISA extensions spec.
+
* :c:macro:`RISCV_HWPROBE_KEY_VENDOR_EXT_THEAD_0`: A bitmask containing the
thead vendor extensions that are compatible with the
:c:macro:`RISCV_HWPROBE_BASE_BEHAVIOR_IMA`: base system behavior.
diff --git a/Documentation/arch/x86/topology.rst b/Documentation/arch/x86/topology.rst
index c12837e61bda..86bec8ac2c4d 100644
--- a/Documentation/arch/x86/topology.rst
+++ b/Documentation/arch/x86/topology.rst
@@ -141,6 +141,197 @@ Thread-related topology information in the kernel:
+System topology enumeration
+===========================
+
+The topology on x86 systems can be discovered using a combination of vendor
+specific CPUID leaves which enumerate the processor topology and the cache
+hierarchy.
+
+The CPUID leaves, in their preferred order of parsing for each x86 vendor, are
+as follows:
+
+1) AMD
+
+ 1) CPUID leaf 0x80000026 [Extended CPU Topology] (Core::X86::Cpuid::ExCpuTopology)
+
+ The extended CPUID leaf 0x80000026 is the extension of the CPUID leaf 0xB
+ and provides the topology information of Core, Complex, CCD (Die), and
+ Socket in each level.
+
+ Support for the leaf is discovered by checking if the maximum extended
+ CPUID level is >= 0x80000026 and then checking if `LogProcAtThisLevel`
+ in `EBX[15:0]` at a particular level (starting from 0) is non-zero.
+
+ The `LevelType` in `ECX[15:8]` at the level provides the topology domain
+ the level describes - Core, Complex, CCD (Die), or the Socket.
+
+ The kernel uses the `CoreMaskWidth` from `EAX[4:0]` to discover the
+ number of bits that need to be right-shifted from `ExtendedLocalApicId`
+ in `EDX[31:0]` in order to get a unique Topology ID for the topology
+ level. CPUs with the same Topology ID share the resources at that level.
+
+ CPUID leaf 0x80000026 also provides more information regarding the power
+ and efficiency rankings, and about the core type on AMD processors with
+ heterogeneous characteristics.
+
+ If CPUID leaf 0x80000026 is supported, further parsing is not required.
+
+ 2) CPUID leaf 0x0000000B [Extended Topology Enumeration] (Core::X86::Cpuid::ExtTopEnum)
+
+ The extended CPUID leaf 0x0000000B is the predecessor of the extended
+ CPUID leaf 0x80000026 and only describes the core and the socket domains
+ of the processor topology.
+
+ The support for the leaf is discovered by checking if the maximum supported
+ CPUID level is >= 0xB and then if `EBX[31:0]` at a particular level
+ (starting from 0) is non-zero.
+
+ The `LevelType` in `ECX[15:8]` at the level provides the topology domain
+ that the level describes - Thread, or Processor (Socket).
+
+ The kernel uses the `CoreMaskWidth` from `EAX[4:0]` to discover the
+ number of bits that need to be right-shifted from the `ExtendedLocalApicId`
+ in `EDX[31:0]` to get a unique Topology ID for that topology level. CPUs
+ sharing the Topology ID share the resources at that level.
+
+ If CPUID leaf 0xB is supported, further parsing is not required.
+
+
+ 3) CPUID leaf 0x80000008 ECX [Size Identifiers] (Core::X86::Cpuid::SizeId)
+
+ If neither the CPUID leaf 0x80000026 nor 0xB is supported, the number of
+ CPUs on the package is detected using the Size Identifier leaf
+ 0x80000008 ECX.
+
+ The support for the leaf is discovered by checking if the supported
+ extended CPUID level is >= 0x80000008.
+
+ The shift from the APIC ID for the Socket ID is calculated from the
+ `ApicIdSize` field in `ECX[15:12]` if it is non-zero.
+
+ If `ApicIdSize` is reported to be zero, the shift is calculated as the
+ order of the `number of threads`, derived from the `NC` field in
+ `ECX[7:0]`, which describes the `number of threads - 1` on the package.
+
+ Unless Extended APIC ID is supported, the APIC ID used to find the
+ Socket ID is from the `LocalApicId` field of CPUID leaf 0x00000001
+ `EBX[31:24]`.
+
+ The topology parsing continues to detect if Extended APIC ID is
+ supported or not.
+
+
+ 4) CPUID leaf 0x8000001E [Extended APIC ID, Core Identifiers, Node Identifiers]
+ (Core::X86::Cpuid::{ExtApicId,CoreId,NodeId})
+
+ The support for Extended APIC ID can be detected by checking for the
+ presence of `TopologyExtensions` in `ECX[22]` of CPUID leaf 0x80000001
+ [Feature Identifiers] (Core::X86::Cpuid::FeatureExtIdEcx).
+
+ If Topology Extensions is supported, the APIC ID from `ExtendedApicId`
+ from CPUID leaf 0x8000001E `EAX[31:0]` should be preferred over that from
+ `LocalApicId` field of CPUID leaf 0x00000001 `EBX[31:24]` for topology
+ enumeration.
+
+ On processors of Family 0x17 and above that do not support CPUID leaf
+ 0x80000026 or CPUID leaf 0xB, the shift from the APIC ID for the Core
+ ID is calculated as the order of the `number of threads per core`,
+ derived from the `ThreadsPerCore` field in `EBX[15:8]`, which
+ describes `number of threads per core - 1`.
+
+ On processors of Family 0x15, the Core ID from `EBX[7:0]` is used as the
+ `cu_id` (Compute Unit ID) to detect CPUs that share the compute units.
+
+
+ All AMD processors that support the `TopologyExtensions` feature store the
+ `NodeId` from the `ECX[7:0]` of CPUID leaf 0x8000001E
+ (Core::X86::Cpuid::NodeId) as the per-CPU `node_id`. On older processors,
+ the `node_id` was discovered using the MSR_FAM10H_NODE_ID MSR (MSR
+ 0xc001_100c). The presence of the NODE_ID MSR was detected by checking
+ `ECX[19]` of CPUID leaf 0x80000001 [Feature Identifiers]
+ (Core::X86::Cpuid::FeatureExtIdEcx).
+
+
+2) Intel
+
+ On Intel platforms, the CPUID leaves that enumerate the processor
+ topology are as follows:
+
+ 1) CPUID leaf 0x1F (V2 Extended Topology Enumeration Leaf)
+
+ The CPUID leaf 0x1F is the extension of the CPUID leaf 0xB and provides
+ the topology information of Core, Module, Tile, Die, DieGrp, and Socket
+ in each level.
+
+ The support for the leaf is discovered by checking if the supported
+ CPUID level is >= 0x1F and then `EBX[31:0]` at a particular level
+ (starting from 0) is non-zero.
+
+ The `Domain Type` in `ECX[15:8]` of the sub-leaf provides the topology
+ domain that the level describes - Core, Module, Tile, Die, DieGrp, and
+ Socket.
+
+ The kernel uses the value from `EAX[4:0]` to discover the number of
+ bits that need to be right shifted from the `x2APIC ID` in `EDX[31:0]`
+ to get a unique Topology ID for the topology level. CPUs with the same
+ Topology ID share the resources at that level.
+
+ If CPUID leaf 0x1F is supported, further parsing is not required.
+
+
+ 2) CPUID leaf 0x0000000B (Extended Topology Enumeration Leaf)
+
+ The extended CPUID leaf 0x0000000B is the predecessor of the V2 Extended
+ Topology Enumeration Leaf 0x1F and only describes the core and the
+ socket domains of the processor topology.
+
+ The support for the leaf is discovered by checking if the supported CPUID
+ level is >= 0xB and then checking if `EBX[31:0]` at a particular level
+ (starting from 0) is non-zero.
+
+ CPUID leaf 0x0000000B shares the same layout as CPUID leaf 0x1F and
+ should be enumerated in a similar manner.
+
+ If CPUID leaf 0xB is supported, further parsing is not required.
+
+
+ 3) CPUID leaf 0x00000004 (Deterministic Cache Parameters Leaf)
+
+ On Intel processors that support neither CPUID leaf 0x1F nor CPUID leaf
+ 0xB, the shift for the SMT domain is calculated using the number of
+ CPUs sharing the L1 cache.
+
+ Whether a processor features Hyper-Threading is detected using `EDX[28]` of
+ CPUID leaf 0x1 (Basic CPUID Information).
+
+ The order of `Maximum number of addressable IDs for logical processors
+ sharing this cache` from `EAX[25:14]` of level-0 of CPUID 0x4 provides
+ the shift from the APIC ID required to compute the Core ID.
+
+ The APIC ID and Package information is computed using the data from
+ CPUID leaf 0x1.
+
+
+ 4) CPUID leaf 0x00000001 (Basic CPUID Information)
+
+ The mask and shift to derive the Physical Package (socket) ID are
+ computed using the `Maximum number of addressable IDs for logical
+ processors in this physical package` from `EBX[23:16]` of CPUID leaf
+ 0x1.
+
+ The APIC ID on the legacy platforms is derived from the `Initial APIC
+ ID` field from `EBX[31:24]` of CPUID leaf 0x1.
+
+
+3) Centaur and Zhaoxin
+
+ Similar to Intel, Centaur and Zhaoxin use a combination of CPUID leaf
+ 0x00000004 (Deterministic Cache Parameters Leaf) and CPUID leaf 0x00000001
+ (Basic CPUID Information) to derive the topology information.
+
+
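+The shift-based derivation used by the extended topology leaves (0x1F/0xB on
+Intel, 0x80000026/0xB on AMD) can be illustrated with a small user-space
+sketch. This is illustrative only; it assumes a GCC/Clang toolchain providing
+``<cpuid.h>`` and a CPU exposing leaf 0x1F, and is not how the kernel itself
+structures the parsing:
+
+.. code-block:: c
+
+  #include <cpuid.h>
+  #include <stdio.h>
+
+  int main(void)
+  {
+          unsigned int eax, ebx, ecx, edx, level = 0;
+
+          for (;;) {
+                  /* walk leaf 0x1f; leaf 0xb can be walked the same way */
+                  __cpuid_count(0x1f, level, eax, ebx, ecx, edx);
+                  if (!(ebx & 0xffff))    /* no CPUs reported at this level */
+                          break;
+
+                  unsigned int shift  = eax & 0x1f;         /* EAX[4:0]  */
+                  unsigned int domain = (ecx >> 8) & 0xff;  /* ECX[15:8] */
+
+                  /* CPUs with the same topology ID share this level */
+                  printf("level %u: domain type %u, topology id %u\n",
+                         level, domain, edx >> shift);
+                  level++;
+          }
+          return 0;
+  }
+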
+
System topology examples
========================
diff --git a/Documentation/core-api/symbol-namespaces.rst b/Documentation/core-api/symbol-namespaces.rst
index 32fc73dc5529..034898e81ba2 100644
--- a/Documentation/core-api/symbol-namespaces.rst
+++ b/Documentation/core-api/symbol-namespaces.rst
@@ -76,20 +76,21 @@ unit as preprocessor statement. The above example would then read::
within the corresponding compilation unit before the #include for
<linux/export.h>. Typically it's placed before the first #include statement.
-Using the EXPORT_SYMBOL_GPL_FOR_MODULES() macro
------------------------------------------------
+Using the EXPORT_SYMBOL_FOR_MODULES() macro
+-------------------------------------------
Symbols exported using this macro are put into a module namespace. This
-namespace cannot be imported.
+namespace cannot be imported. These exports are GPL-only as they are only
+intended for in-tree modules.
The macro takes a comma separated list of module names, allowing only those
modules to access this symbol. Simple tail-globs are supported.
For example::
- EXPORT_SYMBOL_GPL_FOR_MODULES(preempt_notifier_inc, "kvm,kvm-*")
+ EXPORT_SYMBOL_FOR_MODULES(preempt_notifier_inc, "kvm,kvm-*")
-will limit usage of this symbol to modules whoes name matches the given
+will limit usage of this symbol to modules whose name matches the given
patterns.
How to use Symbols exported in Namespaces
diff --git a/Documentation/devicetree/bindings/arm/cpus.yaml b/Documentation/devicetree/bindings/arm/cpus.yaml
index 5bd517befb68..4accf4cbc6c7 100644
--- a/Documentation/devicetree/bindings/arm/cpus.yaml
+++ b/Documentation/devicetree/bindings/arm/cpus.yaml
@@ -353,6 +353,12 @@ properties:
$ref: /schemas/types.yaml#/definitions/phandle
description: Link to Mediatek Cache Coherent Interconnect
+ edac-enabled:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ A72 CPUs support Error Detection And Correction (EDAC) on their L1 and
+ L2 caches. This flag marks this function as usable.
+
qcom,saw:
$ref: /schemas/types.yaml#/definitions/phandle
description:
@@ -400,6 +406,17 @@ allOf:
- $ref: /schemas/cpu.yaml#
- $ref: /schemas/opp/opp-v1.yaml#
- if:
+ not:
+ properties:
+ compatible:
+ contains:
+ const: arm,cortex-a72
+ then:
+ # Allow edac-enabled only for Cortex A72
+ properties:
+ edac-enabled: false
+
+ - if:
# If the enable-method property contains one of those values
properties:
enable-method:
diff --git a/Documentation/devicetree/bindings/display/msm/qcom,mdp5.yaml b/Documentation/devicetree/bindings/display/msm/qcom,mdp5.yaml
index e153f8d26e7a..2735c78b0b67 100644
--- a/Documentation/devicetree/bindings/display/msm/qcom,mdp5.yaml
+++ b/Documentation/devicetree/bindings/display/msm/qcom,mdp5.yaml
@@ -60,7 +60,6 @@ properties:
- const: bus
- const: core
- const: vsync
- - const: lut
- const: tbu
- const: tbu_rt
# MSM8996 has additional iommu clock
diff --git a/Documentation/devicetree/bindings/dma/qcom,bam-dma.yaml b/Documentation/devicetree/bindings/dma/qcom,bam-dma.yaml
index f2f87f0f545b..6493a6968bb4 100644
--- a/Documentation/devicetree/bindings/dma/qcom,bam-dma.yaml
+++ b/Documentation/devicetree/bindings/dma/qcom,bam-dma.yaml
@@ -92,8 +92,12 @@ required:
anyOf:
- required:
- qcom,powered-remotely
+ - num-channels
+ - qcom,num-ees
- required:
- qcom,controlled-remotely
+ - num-channels
+ - qcom,num-ees
- required:
- clocks
- clock-names
diff --git a/Documentation/devicetree/bindings/i2c/spacemit,k1-i2c.yaml b/Documentation/devicetree/bindings/i2c/spacemit,k1-i2c.yaml
index 3d6aefb0d0f1..226c600deae1 100644
--- a/Documentation/devicetree/bindings/i2c/spacemit,k1-i2c.yaml
+++ b/Documentation/devicetree/bindings/i2c/spacemit,k1-i2c.yaml
@@ -9,6 +9,9 @@ title: I2C controller embedded in SpacemiT's K1 SoC
maintainers:
- Troy Mitchell <troymitchell988@gmail.com>
+allOf:
+ - $ref: /schemas/i2c/i2c-controller.yaml#
+
properties:
compatible:
const: spacemit,k1-i2c
diff --git a/Documentation/devicetree/bindings/memory-controllers/xlnx,versal-net-ddrmc5.yaml b/Documentation/devicetree/bindings/memory-controllers/xlnx,versal-net-ddrmc5.yaml
new file mode 100644
index 000000000000..479288567d0b
--- /dev/null
+++ b/Documentation/devicetree/bindings/memory-controllers/xlnx,versal-net-ddrmc5.yaml
@@ -0,0 +1,41 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/memory-controllers/xlnx,versal-net-ddrmc5.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Xilinx Versal NET Memory Controller
+
+maintainers:
+ - Shubhrajyoti Datta <shubhrajyoti.datta@amd.com>
+
+description:
+ The integrated DDR Memory Controllers (DDRMCs) support both DDR5 and LPDDR5
+ compact and extended memory interfaces. The Versal NET DDR memory controller
+ has optional ECC support, which corrects single-bit ECC errors and detects
+ double-bit ECC errors. It also supports reporting other errors such as
+ MMCM (Mixed-Mode Clock Manager) errors and general software errors.
+
+properties:
+ compatible:
+ const: xlnx,versal-net-ddrmc5
+
+ amd,rproc:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ phandle to the remoteproc_r5 rproc node through which the APU interacts
+ with the remote processor. The APU primarily communicates with the RPU to
+ access the DDRMC address space and to receive error notifications.
+
+required:
+ - compatible
+ - amd,rproc
+
+additionalProperties: false
+
+examples:
+ - |
+ memory-controller {
+ compatible = "xlnx,versal-net-ddrmc5";
+ amd,rproc = <&remoteproc_r5>;
+ };
diff --git a/Documentation/devicetree/bindings/net/thead,th1520-gmac.yaml b/Documentation/devicetree/bindings/net/thead,th1520-gmac.yaml
index 6d9de3303762..b3492a9aa4ef 100644
--- a/Documentation/devicetree/bindings/net/thead,th1520-gmac.yaml
+++ b/Documentation/devicetree/bindings/net/thead,th1520-gmac.yaml
@@ -62,11 +62,13 @@ properties:
items:
- description: GMAC main clock
- description: Peripheral registers interface clock
+ - description: APB glue registers interface clock
clock-names:
items:
- const: stmmaceth
- const: pclk
+ - const: apb
interrupts:
items:
@@ -88,8 +90,8 @@ examples:
compatible = "thead,th1520-gmac", "snps,dwmac-3.70a";
reg = <0xe7070000 0x2000>, <0xec003000 0x1000>;
reg-names = "dwmac", "apb";
- clocks = <&clk 1>, <&clk 2>;
- clock-names = "stmmaceth", "pclk";
+ clocks = <&clk 1>, <&clk 2>, <&clk 3>;
+ clock-names = "stmmaceth", "pclk", "apb";
interrupts = <66>;
interrupt-names = "macirq";
phy-mode = "rgmii-id";
diff --git a/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml b/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml
index 8597ea625edb..d2e578d6b83b 100644
--- a/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml
+++ b/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml
@@ -33,6 +33,7 @@ properties:
- items:
- enum:
- fsl,imx91-ddr-pmu
+ - fsl,imx94-ddr-pmu
- fsl,imx95-ddr-pmu
- const: fsl,imx93-ddr-pmu
diff --git a/Documentation/devicetree/bindings/phy/marvell,comphy-cp110.yaml b/Documentation/devicetree/bindings/phy/marvell,comphy-cp110.yaml
index d9501df42886..c35d31642805 100644
--- a/Documentation/devicetree/bindings/phy/marvell,comphy-cp110.yaml
+++ b/Documentation/devicetree/bindings/phy/marvell,comphy-cp110.yaml
@@ -47,21 +47,19 @@ properties:
const: 0
clocks:
+ minItems: 1
maxItems: 3
- description: Reference clocks for CP110; MG clock, MG Core clock, AXI clock
clock-names:
- items:
- - const: mg_clk
- - const: mg_core_clk
- - const: axi_clk
+ minItems: 1
+ maxItems: 3
marvell,system-controller:
description: Phandle to the Marvell system controller (CP110 only)
$ref: /schemas/types.yaml#/definitions/phandle
patternProperties:
- '^phy@[0-2]$':
+ '^phy@[0-5]$':
description: A COMPHY lane child node
type: object
additionalProperties: false
@@ -69,10 +67,14 @@ patternProperties:
properties:
reg:
description: COMPHY lane number
+ maximum: 5
'#phy-cells':
const: 1
+ connector:
+ type: object
+
required:
- reg
- '#phy-cells'
@@ -91,13 +93,24 @@ allOf:
then:
properties:
- clocks: false
- clock-names: false
+ clocks:
+ maxItems: 1
+ clock-names:
+ const: xtal
required:
- reg-names
else:
+ properties:
+ clocks:
+ minItems: 3
+ clock-names:
+ items:
+ - const: mg_clk
+ - const: mg_core_clk
+ - const: axi_clk
+
required:
- marvell,system-controller
diff --git a/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml b/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml
index a1ae8c7988c8..b6f140bf5b3b 100644
--- a/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml
+++ b/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml
@@ -176,6 +176,8 @@ allOf:
compatible:
contains:
enum:
+ - qcom,sa8775p-qmp-gen4x2-pcie-phy
+ - qcom,sa8775p-qmp-gen4x4-pcie-phy
- qcom,sc8280xp-qmp-gen3x1-pcie-phy
- qcom,sc8280xp-qmp-gen3x2-pcie-phy
- qcom,sc8280xp-qmp-gen3x4-pcie-phy
@@ -197,8 +199,6 @@ allOf:
contains:
enum:
- qcom,qcs8300-qmp-gen4x2-pcie-phy
- - qcom,sa8775p-qmp-gen4x2-pcie-phy
- - qcom,sa8775p-qmp-gen4x4-pcie-phy
then:
properties:
clocks:
diff --git a/Documentation/devicetree/bindings/regulator/infineon,ir38060.yaml b/Documentation/devicetree/bindings/regulator/infineon,ir38060.yaml
index e6ffbc2a2298..57ff6bf1e188 100644
--- a/Documentation/devicetree/bindings/regulator/infineon,ir38060.yaml
+++ b/Documentation/devicetree/bindings/regulator/infineon,ir38060.yaml
@@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: Infineon Buck Regulators with PMBUS interfaces
maintainers:
- - Not Me.
+ - Guenter Roeck <linux@roeck-us.net>
allOf:
- $ref: regulator.yaml#
diff --git a/Documentation/devicetree/bindings/riscv/extensions.yaml b/Documentation/devicetree/bindings/riscv/extensions.yaml
index ede6a58ccf53..de41a6f074d3 100644
--- a/Documentation/devicetree/bindings/riscv/extensions.yaml
+++ b/Documentation/devicetree/bindings/riscv/extensions.yaml
@@ -662,6 +662,12 @@ properties:
Registers in the AX45MP datasheet.
https://www.andestech.com/wp-content/uploads/AX45MP-1C-Rev.-5.0.0-Datasheet.pdf
+ # MIPS
+ - const: xmipsexectl
+ description:
+ The MIPS extension for execution control as documented in
+ https://mips.com/wp-content/uploads/2025/06/P8700_Programmers_Reference_Manual_Rev1.84_5-31-2025.pdf
+
# SiFive
- const: xsfvqmaccdod
description:
diff --git a/Documentation/devicetree/bindings/serial/8250.yaml b/Documentation/devicetree/bindings/serial/8250.yaml
index e46bee8d25bf..b243afa69a1a 100644
--- a/Documentation/devicetree/bindings/serial/8250.yaml
+++ b/Documentation/devicetree/bindings/serial/8250.yaml
@@ -48,7 +48,6 @@ allOf:
oneOf:
- required: [ clock-frequency ]
- required: [ clocks ]
-
- if:
properties:
compatible:
@@ -60,12 +59,39 @@ allOf:
items:
- const: uartclk
- const: reg
- else:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: spacemit,k1-uart
+ then:
properties:
clock-names:
items:
- const: core
- const: bus
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - spacemit,k1-uart
+ - nxp,lpc1850-uart
+ then:
+ required:
+ - clocks
+ - clock-names
+ properties:
+ clocks:
+ minItems: 2
+ clock-names:
+ minItems: 2
+ else:
+ properties:
+ clocks:
+ maxItems: 1
+ clock-names:
+ maxItems: 1
properties:
compatible:
@@ -162,6 +188,9 @@ properties:
minItems: 1
maxItems: 2
oneOf:
+ - enum:
+ - main
+ - uart
- items:
- const: core
- const: bus
@@ -264,29 +293,6 @@ required:
- reg
- interrupts
-if:
- properties:
- compatible:
- contains:
- enum:
- - spacemit,k1-uart
- - nxp,lpc1850-uart
-then:
- required:
- - clocks
- - clock-names
- properties:
- clocks:
- minItems: 2
- clock-names:
- minItems: 2
-else:
- properties:
- clocks:
- maxItems: 1
- clock-names:
- maxItems: 1
-
unevaluatedProperties: false
examples:
diff --git a/Documentation/devicetree/bindings/serial/brcm,bcm7271-uart.yaml b/Documentation/devicetree/bindings/serial/brcm,bcm7271-uart.yaml
index 89c462653e2d..8cc848ae11cb 100644
--- a/Documentation/devicetree/bindings/serial/brcm,bcm7271-uart.yaml
+++ b/Documentation/devicetree/bindings/serial/brcm,bcm7271-uart.yaml
@@ -41,7 +41,7 @@ properties:
- const: dma_intr2
clocks:
- minItems: 1
+ maxItems: 1
clock-names:
const: sw_baud
diff --git a/Documentation/devicetree/bindings/spi/spi-fsl-lpspi.yaml b/Documentation/devicetree/bindings/spi/spi-fsl-lpspi.yaml
index a65a42ccaafe..a82360bed188 100644
--- a/Documentation/devicetree/bindings/spi/spi-fsl-lpspi.yaml
+++ b/Documentation/devicetree/bindings/spi/spi-fsl-lpspi.yaml
@@ -20,6 +20,7 @@ properties:
- enum:
- fsl,imx7ulp-spi
- fsl,imx8qxp-spi
+ - nxp,s32g2-lpspi
- items:
- enum:
- fsl,imx8ulp-spi
@@ -27,6 +28,10 @@ properties:
- fsl,imx94-spi
- fsl,imx95-spi
- const: fsl,imx7ulp-spi
+ - items:
+ - const: nxp,s32g3-lpspi
+ - const: nxp,s32g2-lpspi
+
reg:
maxItems: 1
diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml
index 77160cd47f54..9ec8947dfcad 100644
--- a/Documentation/devicetree/bindings/vendor-prefixes.yaml
+++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml
@@ -507,6 +507,8 @@ patternProperties:
description: Espressif Systems Co. Ltd.
"^est,.*":
description: ESTeem Wireless Modems
+ "^eswin,.*":
+ description: Beijing ESWIN Technology Group Co. Ltd.
"^ettus,.*":
description: NI Ettus Research
"^eukrea,.*":
diff --git a/Documentation/filesystems/bcachefs/CodingStyle.rst b/Documentation/filesystems/bcachefs/CodingStyle.rst
deleted file mode 100644
index b29562a6bf55..000000000000
--- a/Documentation/filesystems/bcachefs/CodingStyle.rst
+++ /dev/null
@@ -1,186 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-bcachefs coding style
-=====================
-
-Good development is like gardening, and codebases are our gardens. Tend to them
-every day; look for little things that are out of place or in need of tidying.
-A little weeding here and there goes a long way; don't wait until things have
-spiraled out of control.
-
-Things don't always have to be perfect - nitpicking often does more harm than
-good. But appreciate beauty when you see it - and let people know.
-
-The code that you are afraid to touch is the code most in need of refactoring.
-
-A little organizing here and there goes a long way.
-
-Put real thought into how you organize things.
-
-Good code is readable code, where the structure is simple and leaves nowhere
-for bugs to hide.
-
-Assertions are one of our most important tools for writing reliable code. If in
-the course of writing a patchset you encounter a condition that shouldn't
-happen (and will have unpredictable or undefined behaviour if it does), or
-you're not sure if it can happen and not sure how to handle it yet - make it a
-BUG_ON(). Don't leave undefined or unspecified behavior lurking in the codebase.
-
-By the time you finish the patchset, you should understand better which
-assertions need to be handled and turned into checks with error paths, and
-which should be logically impossible. Leave the BUG_ON()s in for the ones which
-are logically impossible. (Or, make them debug mode assertions if they're
-expensive - but don't turn everything into a debug mode assertion, so that
-we're not stuck debugging undefined behaviour should it turn out that you were
-wrong).
-
-Assertions are documentation that can't go out of date. Good assertions are
-wonderful.
-
-Good assertions drastically and dramatically reduce the amount of testing
-required to shake out bugs.
-
-Good assertions are based on state, not logic. To write good assertions, you
-have to think about what the invariants on your state are.
-
-Good invariants and assertions will hold everywhere in your codebase. This
-means that you can run them in only a few places in the checked in version, but
-should you need to debug something that caused the assertion to fail, you can
-quickly shotgun them everywhere to find the codepath that broke the invariant.
-
-A good assertion checks something that the compiler could check for us, and
-elide - if we were working in a language with embedded correctness proofs that
-the compiler could check. This is something that exists today, but it'll likely
-still be a few decades before it comes to systems programming languages. But we
-can still incorporate that kind of thinking into our code and document the
-invariants with runtime checks - much like the way people working in
-dynamically typed languages may add type annotations, gradually making their
-code statically typed.
-
-Looking for ways to make your assertions simpler - and higher level - will
-often nudge you towards making the entire system simpler and more robust.
-
-Good code is code where you can poke around and see what it's doing -
-introspection. We can't debug anything if we can't see what's going on.
-
-Whenever we're debugging, and the solution isn't immediately obvious, if the
-issue is that we don't know where the issue is because we can't see what's
-going on - fix that first.
-
-We have the tools to make anything visible at runtime, efficiently - RCU and
-percpu data structures among them. Don't let things stay hidden.
-
-The most important tool for introspection is the humble pretty printer - in
-bcachefs, this means `*_to_text()` functions, which output to printbufs.
-
-Pretty printers are wonderful, because they compose and you can use them
-everywhere. Having functions to print whatever object you're working with will
-make your error messages much easier to write (therefore they will actually
-exist) and much more informative. And they can be used from sysfs/debugfs, as
-well as tracepoints.
-
-Runtime info and debugging tools should come with clear descriptions and
-labels, and good structure - we don't want files with a list of bare integers,
-like in procfs. Part of the job of the debugging tools is to educate users and
-new developers as to how the system works.
-
-Error messages should, whenever possible, tell you everything you need to debug
-the issue. It's worth putting effort into them.
-
-Tracepoints shouldn't be the first thing you reach for. They're an important
-tool, but always look for more immediate ways to make things visible. When we
-have to rely on tracing, we have to know which tracepoints we're looking for,
-and then we have to run the troublesome workload, and then we have to sift
-through logs. This is a lot of steps to go through when a user is hitting
-something, and if it's intermittent it may not even be possible.
-
-The humble counter is an incredibly useful tool. They're cheap and simple to
-use, and many complicated internal operations with lots of things that can
-behave weirdly (anything involving memory reclaim, for example) become
-shockingly easy to debug once you have counters on every distinct codepath.
-
-Persistent counters are even better.
-
-When debugging, try to get the most out of every bug you come across; don't
-rush to fix the initial issue. Look for things that will make related bugs
-easier the next time around - introspection, new assertions, better error
-messages, new debug tools, and do those first. Look for ways to make the system
-better behaved; often one bug will uncover several other bugs through
-downstream effects.
-
-Fix all that first, and then the original bug last - even if that means keeping
-a user waiting. They'll thank you in the long run, and when they understand
-what you're doing you'll be amazed at how patient they're happy to be. Users
-like to help - otherwise they wouldn't be reporting the bug in the first place.
-
-Talk to your users. Don't isolate yourself.
-
-Users notice all sorts of interesting things, and by just talking to them and
-interacting with them you can benefit from their experience.
-
-Spend time doing support and helpdesk stuff. Don't just write code - code isn't
-finished until it's being used trouble free.
-
-This will also motivate you to make your debugging tools as good as possible,
-and perhaps even your documentation, too. Like anything else in life, the more
-time you spend at it the better you'll get, and you the developer are the
-person most able to improve the tools to make debugging quick and easy.
-
-Be wary of how you take on and commit to big projects. Don't let development
-become product-manager focused. Often time an idea is a good one but needs to
-wait for its proper time - but you won't know if it's the proper time for an
-idea until you start writing code.
-
-Expect to throw a lot of things away, or leave them half finished for later.
-Nobody writes all perfect code that all gets shipped, and you'll be much more
-productive in the long run if you notice this early and shift to something
-else. The experience gained and lessons learned will be valuable for all the
-other work you do.
-
-But don't be afraid to tackle projects that require significant rework of
-existing code. Sometimes these can be the best projects, because they can lead
-us to make existing code more general, more flexible, more multipurpose and
-perhaps more robust. Just don't hesitate to abandon the idea if it looks like
-it's going to make a mess of things.
-
-Complicated features can often be done as a series of refactorings, with the
-final change that actually implements the feature as a quite small patch at the
-end. It's wonderful when this happens, especially when those refactorings are
-things that improve the codebase in their own right. When that happens there's
-much less risk of wasted effort if the feature you were going for doesn't work
-out.
-
-Always strive to work incrementally. Always strive to turn the big projects
-into little bite sized projects that can prove their own merits.
-
-Instead of always tackling those big projects, look for little things that
-will be useful, and make the big projects easier.
-
-The question of what's likely to be useful is where junior developers most
-often go astray - doing something because it seems like it'll be useful often
-leads to overengineering. Knowing what's useful comes from many years of
-experience, or talking with people who have that experience - or from simply
-reading lots of code and looking for common patterns and issues. Don't be
-afraid to throw things away and do something simpler.
-
-Talk about your ideas with your fellow developers; often times the best things
-come from relaxed conversations where people aren't afraid to say "what if?".
-
-Don't neglect your tools.
-
-The most important tools (besides the compiler and our text editor) are the
-tools we use for testing. The shortest possible edit/test/debug cycle is
-essential for working productively. We learn, gain experience, and discover the
-errors in our thinking by running our code and seeing what happens. If your
-time is being wasted because your tools are bad or too slow - don't accept it,
-fix it.
-
-Put effort into your documentation, commit messages, and code comments - but
-don't go overboard. A good commit message is wonderful - but if the information
-was important enough to go in a commit message, ask yourself if it would be
-even better as a code comment.
-
-A good code comment is wonderful, but even better is the comment that didn't
-need to exist because the code was so straightforward as to be obvious;
-organized into small clean and tidy modules, with clear and descriptive names
-for functions and variables, where every line of code has a clear purpose.
diff --git a/Documentation/filesystems/bcachefs/SubmittingPatches.rst b/Documentation/filesystems/bcachefs/SubmittingPatches.rst
deleted file mode 100644
index 18c79d548391..000000000000
--- a/Documentation/filesystems/bcachefs/SubmittingPatches.rst
+++ /dev/null
@@ -1,105 +0,0 @@
-Submitting patches to bcachefs
-==============================
-
-Here are suggestions for submitting patches to bcachefs subsystem.
-
-Submission checklist
---------------------
-
-Patches must be tested before being submitted, either with the xfstests suite
-[0]_, or the full bcachefs test suite in ktest [1]_, depending on what's being
-touched. Note that ktest wraps xfstests and will be an easier method to running
-it for most users; it includes single-command wrappers for all the mainstream
-in-kernel local filesystems.
-
-Patches will undergo more testing after being merged (including
-lockdep/kasan/preempt/etc. variants), these are not generally required to be
-run by the submitter - but do put some thought into what you're changing and
-which tests might be relevant, e.g. are you dealing with tricky memory layout
-work? kasan, are you doing locking work? then lockdep; and ktest includes
-single-command variants for the debug build types you'll most likely need.
-
-The exception to this rule is incomplete WIP/RFC patches: if you're working on
-something nontrivial, it's encouraged to send out a WIP patch to let people
-know what you're doing and make sure you're on the right track. Just make sure
-it includes a brief note as to what's done and what's incomplete, to avoid
-confusion.
-
-Rigorous checkpatch.pl adherence is not required (many of its warnings are
-considered out of date), but try not to deviate too much without reason.
-
-Focus on writing code that reads well and is organized well; code should be
-aesthetically pleasing.
-
-CI
---
-
-Instead of running your tests locally, when running the full test suite it's
-preferable to let a server farm do it in parallel, and then have the results
-in a nice test dashboard (which can tell you which failures are new, and
-presents results in a git log view, avoiding the need for most bisecting).
-
-That exists [2]_, and community members may request an account. If you work for
-a big tech company, you'll need to help out with server costs to get access -
-but the CI is not restricted to running bcachefs tests: it runs any ktest test
-(which generally makes it easy to wrap other tests that can run in qemu).
-
-Other things to think about
----------------------------
-
-- How will we debug this code? Is there sufficient introspection to diagnose
- when something starts acting wonky on a user machine?
-
- We don't necessarily need every single field of every data structure visible
- with introspection, but having the important fields of all the core data
- types wired up makes debugging drastically easier - a bit of thoughtful
- foresight greatly reduces the need to have people build custom kernels with
- debug patches.
-
- More broadly, think about all the debug tooling that might be needed.
-
-- Does it make the codebase more or less of a mess? Can we also try to do some
- organizing, too?
-
-- Do new tests need to be written? New assertions? How do we know and verify
- that the code is correct, and what happens if something goes wrong?
-
- We don't yet have automated code coverage analysis or easy fault injection -
- but for now, pretend we did and ask what they might tell us.
-
- Assertions are hugely important, given that we don't yet have a systems
- language that can do ergonomic embedded correctness proofs. Hitting an assert
- in testing is much better than wandering off into undefined behaviour la-la
- land - use them. Use them judiciously, and not as a replacement for proper
- error handling, but use them.
-
-- Does it need to be performance tested? Should we add new performance counters?
-
- bcachefs has a set of persistent runtime counters which can be viewed with
- the 'bcachefs fs top' command; this should give users a basic idea of what
- their filesystem is currently doing. If you're doing a new feature or looking
- at old code, think if anything should be added.
-
-- If it's a new on disk format feature - have upgrades and downgrades been
- tested? (Automated tests exists but aren't in the CI, due to the hassle of
- disk image management; coordinate to have them run.)
-
-Mailing list, IRC
------------------
-
-Patches should hit the list [3]_, but much discussion and code review happens
-on IRC as well [4]_; many people appreciate the more conversational approach
-and quicker feedback.
-
-Additionally, we have a lively user community doing excellent QA work, which
-exists primarily on IRC. Please make use of that resource; user feedback is
-important for any nontrivial feature, and documenting it in commit messages
-would be a good idea.
-
-.. rubric:: References
-
-.. [0] git://git.kernel.org/pub/scm/fs/xfs/xfstests-dev.git
-.. [1] https://evilpiepirate.org/git/ktest.git/
-.. [2] https://evilpiepirate.org/~testdashboard/ci/
-.. [3] linux-bcachefs@vger.kernel.org
-.. [4] irc.oftc.net#bcache, #bcachefs-dev
diff --git a/Documentation/filesystems/bcachefs/casefolding.rst b/Documentation/filesystems/bcachefs/casefolding.rst
deleted file mode 100644
index 871a38f557e8..000000000000
--- a/Documentation/filesystems/bcachefs/casefolding.rst
+++ /dev/null
@@ -1,108 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-Casefolding
-===========
-
-bcachefs has support for case-insensitive file and directory
-lookups using the regular `chattr +F` (`S_CASEFOLD`, `FS_CASEFOLD_FL`)
-casefolding attributes.
-
-The main usecase for casefolding is compatibility with software written
-against other filesystems that rely on casefolded lookups
-(eg. NTFS and Wine/Proton).
-Taking advantage of file-system level casefolding can lead to great
-loading time gains in many applications and games.
-
-Casefolding support requires a kernel with the `CONFIG_UNICODE` enabled.
-Once a directory has been flagged for casefolding, a feature bit
-is enabled on the superblock which marks the filesystem as using
-casefolding.
-When the feature bit for casefolding is enabled, it is no longer possible
-to mount that filesystem on kernels without `CONFIG_UNICODE` enabled.
-
-On the lookup/query side: casefolding is implemented by allocating a new
-string of `BCH_NAME_MAX` length using the `utf8_casefold` function to
-casefold the query string.
-
-On the dirent side: casefolding is implemented by ensuring the `bkey`'s
-hash is made from the casefolded string and storing the cached casefolded
-name with the regular name in the dirent.
-
-The structure looks like this:
-
-* Regular: [dirent data][regular name][nul][nul]...
-* Casefolded: [dirent data][reg len][cf len][regular name][casefolded name][nul][nul]...
-
-(Do note, the number of NULs here is merely for illustration; their count can
-vary per-key, and they may not even be present if the key is aligned to
-`sizeof(u64)`.)
-
-This is efficient as it means that for all file lookups that require casefolding,
-it has identical performance to a regular lookup:
-a hash comparison and a `memcmp` of the name.
-
-Rationale
----------
-
-Several designs were considered for this system:
-One was to introduce a dirent_v2, however that would be painful especially as
-the hash system only has support for a single key type. This would also need
-`BCH_NAME_MAX` to change between versions, and a new feature bit.
-
-Another option was to store without the two lengths, and just take the length of
-the regular name and casefolded name contiguously / 2 as the length. This would
-assume that the regular length == casefolded length, but that could potentially
-not be true, if the uppercase unicode glyph had a different UTF-8 encoding than
-the lowercase unicode glyph.
-It would be possible to disregard the casefold cache for those cases, but it was
-decided to simply encode the two string lengths in the key to avoid random
-performance issues if this edgecase was ever hit.
-
-The option settled on was to use a free-bit in d_type to mark a dirent as having
-a casefold cache, and then treat the first 4 bytes the name block as lengths.
-You can see this in the `d_cf_name_block` member of union in `bch_dirent`.
-
-The feature bit was used to allow casefolding support to be enabled for the majority
-of users, but some allow users who have no need for the feature to still use bcachefs as
-`CONFIG_UNICODE` can increase the kernel side a significant amount due to the tables used,
-which may be decider between using bcachefs for eg. embedded platforms.
-
-Other filesystems like ext4 and f2fs have a super-block level option for casefolding
-encoding, but bcachefs currently does not provide this. ext4 and f2fs do not expose
-any encodings than a single UTF-8 version. When future encodings are desirable,
-they will be added trivially using the opts mechanism.
-
-dentry/dcache considerations
-----------------------------
-
-Currently, in casefolded directories, bcachefs (like other filesystems) will not cache
-negative dentry's.
-
-This is because currently doing so presents a problem in the following scenario:
-
- - Lookup file "blAH" in a casefolded directory
- - Creation of file "BLAH" in a casefolded directory
- - Lookup file "blAH" in a casefolded directory
-
-This would fail if negative dentry's were cached.
-
-This is slightly suboptimal, but could be fixed in future with some vfs work.
-
-
-References
-----------
-
-(from Peter Anvin, on the list)
-
-It is worth noting that Microsoft has basically declared their
-"recommended" case folding (upcase) table to be permanently frozen (for
-new filesystem instances in the case where they use an on-disk
-translation table created at format time.) As far as I know they have
-never supported anything other than 1:1 conversion of BMP code points,
-nor normalization.
-
-The exFAT specification enumerates the full recommended upcase table,
-although in a somewhat annoying format (basically a hex dump of
-compressed data):
-
-https://learn.microsoft.com/en-us/windows/win32/fileio/exfat-specification
diff --git a/Documentation/filesystems/bcachefs/errorcodes.rst b/Documentation/filesystems/bcachefs/errorcodes.rst
deleted file mode 100644
index 2cccaa0ba7cd..000000000000
--- a/Documentation/filesystems/bcachefs/errorcodes.rst
+++ /dev/null
@@ -1,30 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-bcachefs private error codes
-----------------------------
-
-In bcachefs, as a hard rule we do not throw or directly use standard error
-codes (-EINVAL, -EBUSY, etc.). Instead, we define private error codes as needed
-in fs/bcachefs/errcode.h.
-
-This gives us much better error messages and makes debugging much easier. Any
-direct uses of standard error codes you see in the source code are simply old
-code that has yet to be converted - feel free to clean it up!
-
-Private error codes may subtype another error code, this allows for grouping of
-related errors that should be handled similarly (e.g. transaction restart
-errors), as well as specifying which standard error code should be returned at
-the bcachefs module boundary.
-
-At the module boundary, we use bch2_err_class() to convert to a standard error
-code; this also emits a trace event so that the original error code be
-recovered even if it wasn't logged.
-
-Do not reuse error codes! Generally speaking, a private error code should only
-be thrown in one place. That means that when we see it in a log message we can
-see, unambiguously, exactly which file and line number it was returned from.
-
-Try to give error codes names that are as reasonably descriptive of the error
-as possible. Frequently, the error will be logged at a place far removed from
-where the error was generated; good names for error codes mean much more
-descriptive and useful error messages.
diff --git a/Documentation/filesystems/bcachefs/future/idle_work.rst b/Documentation/filesystems/bcachefs/future/idle_work.rst
deleted file mode 100644
index 59a332509dcd..000000000000
--- a/Documentation/filesystems/bcachefs/future/idle_work.rst
+++ /dev/null
@@ -1,78 +0,0 @@
-Idle/background work classes design doc:
-
-Right now, our behaviour at idle isn't ideal, it was designed for servers that
-would be under sustained load, to keep pending work at a "medium" level, to
-let work build up so we can process it in more efficient batches, while also
-giving headroom for bursts in load.
-
-But for desktops or mobile - scenarios where work is less sustained and power
-usage is more important - we want to operate differently, with a "rush to
-idle" so the system can go to sleep. We don't want to be dribbling out
-background work while the system should be idle.
-
-The complicating factor is that there are a number of background tasks, which
-form a heirarchy (or a digraph, depending on how you divide it up) - one
-background task may generate work for another.
-
-Thus proper idle detection needs to model this heirarchy.
-
-- Foreground writes
-- Page cache writeback
-- Copygc, rebalance
-- Journal reclaim
-
-When we implement idle detection and rush to idle, we need to be careful not
-to disturb too much the existing behaviour that works reasonably well when the
-system is under sustained load (or perhaps improve it in the case of
-rebalance, which currently does not actively attempt to let work batch up).
-
-SUSTAINED LOAD REGIME
----------------------
-
-When the system is under continuous load, we want these jobs to run
-continuously - this is perhaps best modelled with a P/D controller, where
-they'll be trying to keep a target value (i.e. fragmented disk space,
-available journal space) roughly in the middle of some range.
-
-The goal under sustained load is to balance our ability to handle load spikes
-without running out of x resource (free disk space, free space in the
-journal), while also letting some work accumululate to be batched (or become
-unnecessary).
-
-For example, we don't want to run copygc too aggressively, because then it
-will be evacuating buckets that would have become empty (been overwritten or
-deleted) anyways, and we don't want to wait until we're almost out of free
-space because then the system will behave unpredicably - suddenly we're doing
-a lot more work to service each write and the system becomes much slower.
-
-IDLE REGIME
------------
-
-When the system becomes idle, we should start flushing our pending work
-quicker so the system can go to sleep.
-
-Note that the definition of "idle" depends on where in the heirarchy a task
-is - a task should start flushing work more quickly when the task above it has
-stopped generating new work.
-
-e.g. rebalance should start flushing more quickly when page cache writeback is
-idle, and journal reclaim should only start flushing more quickly when both
-copygc and rebalance are idle.
-
-It's important to let work accumulate when more work is still incoming and we
-still have room, because flushing is always more efficient if we let it batch
-up. New writes may overwrite data before rebalance moves it, and tasks may be
-generating more updates for the btree nodes that journal reclaim needs to flush.
-
-On idle, how much work we do at each interval should be proportional to the
-length of time we have been idle for. If we're idle only for a short duration,
-we shouldn't flush everything right away; the system might wake up and start
-generating new work soon, and flushing immediately might end up doing a lot of
-work that would have been unnecessary if we'd allowed things to batch more.
-
-To summarize, we will need:
-
- - A list of classes for background tasks that generate work, which will
- include one "foreground" class.
- - Tracking for each class - "Am I doing work, or have I gone to sleep?"
- - And each class should check the class above it when deciding how much work to issue.
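To make the hierarchy and "rush to idle" ideas above concrete, here is a purely
illustrative C sketch (not bcachefs code - the class names, time units and the
scaling policy are all assumptions): each class records when it last issued
work, and a class flushes more of its pending work the longer everything above
it in the hierarchy has been quiet::

	#include <stdbool.h>
	#include <stdint.h>

	enum work_class {
		WC_FOREGROUND,		/* foreground writes */
		WC_WRITEBACK,		/* page cache writeback */
		WC_COPYGC_REBALANCE,	/* copygc, rebalance */
		WC_JOURNAL_RECLAIM,	/* journal reclaim */
		WC_NR,
	};

	struct class_state {
		bool		busy;		/* currently issuing work? */
		uint64_t	last_active;	/* when it last issued work, in ms */
	};

	static struct class_state classes[WC_NR];

	/* ms for which everything above @c has been idle; 0 if any of it is busy */
	static uint64_t upstream_idle_ms(enum work_class c, uint64_t now_ms)
	{
		uint64_t newest = 0;

		for (int i = 0; i < (int) c; i++) {
			if (classes[i].busy)
				return 0;
			if (classes[i].last_active > newest)
				newest = classes[i].last_active;
		}
		return now_ms - newest;
	}

	/* how much of @pending work class @c should flush in this interval */
	static uint64_t work_to_issue(enum work_class c, uint64_t pending,
				      uint64_t now_ms)
	{
		uint64_t idle = upstream_idle_ms(c, now_ms);

		if (!idle)			/* sustained load: medium, batched pace */
			return pending / 16;	/* placeholder rate, not a tuned value */
		if (idle >= 1000)		/* upstream quiet for ~1s: rush to idle */
			return pending;
		return pending * idle / 1000;	/* scale with how long we've been idle */
	}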
diff --git a/Documentation/filesystems/bcachefs/index.rst b/Documentation/filesystems/bcachefs/index.rst
deleted file mode 100644
index e5c4c2120b93..000000000000
--- a/Documentation/filesystems/bcachefs/index.rst
+++ /dev/null
@@ -1,38 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-======================
-bcachefs Documentation
-======================
-
-Subsystem-specific development process notes
---------------------------------------------
-
-Development notes specific to bcachefs. These are intended to supplement
-:doc:`general kernel development handbook </process/index>`.
-
-.. toctree::
- :maxdepth: 1
- :numbered:
-
- CodingStyle
- SubmittingPatches
-
-Filesystem implementation
--------------------------
-
-Documentation for filesystem features and their implementation details.
-At this moment, only a few of these are described here.
-
-.. toctree::
- :maxdepth: 1
- :numbered:
-
- casefolding
- errorcodes
-
-Future design
--------------
-.. toctree::
- :maxdepth: 1
-
- future/idle_work
diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst
index 11a599387266..622187a96bdc 100644
--- a/Documentation/filesystems/index.rst
+++ b/Documentation/filesystems/index.rst
@@ -72,7 +72,6 @@ Documentation for filesystem implementations.
afs
autofs
autofs-mount-control
- bcachefs/index
befs
bfs
btrfs
diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 85f590254f07..78c3d07c0c08 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -340,8 +340,8 @@ of those. Caller makes sure async writeback cannot be running for the inode whil
->drop_inode() returns int now; it's called on final iput() with
inode->i_lock held and it returns true if filesystems wants the inode to be
-dropped. As before, generic_drop_inode() is still the default and it's been
-updated appropriately. generic_delete_inode() is also alive and it consists
+dropped. As before, inode_generic_drop() is still the default and it's been
+updated appropriately. inode_just_drop() is also alive and it consists
simply of return 1. Note that all actual eviction work is done by caller after
->drop_inode() returns.
@@ -1285,3 +1285,15 @@ rather than a VMA, as the VMA at this stage is not yet valid.
The vm_area_desc provides the minimum required information for a filesystem
to initialise state upon memory mapping of a file-backed region, and output
parameters for the file system to set this state.
+
+---
+
+**mandatory**
+
+Several functions are renamed:
+
+- kern_path_locked -> start_removing_path
+- kern_path_create -> start_creating_path
+- user_path_create -> start_creating_user_path
+- user_path_locked_at -> start_removing_user_path_at
+- done_path_create -> end_creating_path
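
A hedged caller-side sketch of the renamed path helpers, assuming they keep the
old kern_path_create()/done_path_create() signatures - that is an assumption,
not something this patch states, so check the actual prototypes in fs/namei.c::

	struct path parent;
	struct dentry *dentry;

	/* hypothetical example path; lookup flags assumed unchanged (0 here) */
	dentry = start_creating_path(AT_FDCWD, "/tmp/example-node", &parent, 0);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	/* ... create the object under parent (vfs_mknod(), vfs_mkdir(), ...) ... */

	end_creating_path(&parent, dentry);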
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 2971551b7235..b7e3147ba3d4 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -2362,6 +2362,7 @@ The following mount options are supported:
hidepid= Set /proc/<pid>/ access mode.
gid= Set the group authorized to learn processes information.
subset= Show only the specified subset of procfs.
+ pidns=	Specify the pid namespace used by this procfs.
========= ========================================================
hidepid=off or hidepid=0 means classic mode - everybody may access all
@@ -2394,6 +2395,13 @@ information about processes information, just add identd to this group.
subset=pid hides all top level files and directories in the procfs that
are not related to tasks.
+pidns= specifies a pid namespace (either as a string path to something like
+`/proc/$pid/ns/pid`, or a file descriptor when using `FSCONFIG_SET_FD`) that
+will be used by the procfs instance when translating pids. By default, procfs
+will use the calling process's active pid namespace. Note that the pid
+namespace of an existing procfs instance cannot be modified (attempting to do
+so will give an `-EBUSY` error).
+
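For illustration only, a userspace sketch of setting pidns= through the new
mount API (the pid, the mount point and the omitted error handling are made up
for the example; the constants come from `<linux/mount.h>`)::

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/mount.h>

	int main(void)
	{
		/* Open a new "proc" filesystem context. */
		int fsfd = syscall(SYS_fsopen, "proc", FSOPEN_CLOEXEC);

		/* Pin it to a pid namespace by path (FSCONFIG_SET_STRING)... */
		syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "pidns",
			"/proc/1234/ns/pid", 0);

		/*
		 * ...or by file descriptor (FSCONFIG_SET_FD):
		 *   int nsfd = open("/proc/1234/ns/pid", O_RDONLY);
		 *   syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_FD, "pidns", NULL, nsfd);
		 */

		syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
		int mfd = syscall(SYS_fsmount, fsfd, FSMOUNT_CLOEXEC, 0);

		/* Attach the new instance at a made-up mount point. */
		syscall(SYS_move_mount, mfd, "", AT_FDCWD, "/tmp/proc-1234",
			MOVE_MOUNT_F_EMPTY_PATH);
		return 0;
	}

Once attached, pid directories under /tmp/proc-1234 are translated in the pid
namespace the instance was pinned to rather than the mounter's.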
Chapter 5: Filesystem behavior
==============================
diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst
index c7949dd44f2f..006d23af66e1 100644
--- a/Documentation/filesystems/resctrl.rst
+++ b/Documentation/filesystems/resctrl.rst
@@ -26,6 +26,7 @@ MBM (Memory Bandwidth Monitoring) "cqm_mbm_total", "cqm_mbm_local"
MBA (Memory Bandwidth Allocation) "mba"
SMBA (Slow Memory Bandwidth Allocation) ""
BMEC (Bandwidth Monitoring Event Configuration) ""
+ABMC (Assignable Bandwidth Monitoring Counters) ""
=============================================== ================================
Historically, new features were made visible by default in /proc/cpuinfo. This
@@ -256,6 +257,144 @@ with the following files:
# cat /sys/fs/resctrl/info/L3_MON/mbm_local_bytes_config
0=0x30;1=0x30;3=0x15;4=0x15
+"mbm_assign_mode":
+	The supported counter assignment modes. The mode enclosed in brackets is the
+	one currently enabled. The MBM events associated with counters may reset when
+	"mbm_assign_mode" is changed.
+ ::
+
+ # cat /sys/fs/resctrl/info/L3_MON/mbm_assign_mode
+ [mbm_event]
+ default
+
+ "mbm_event":
+
+ mbm_event mode allows users to assign a hardware counter to an RMID, event
+ pair and monitor the bandwidth usage as long as it is assigned. The hardware
+ continues to track the assigned counter until it is explicitly unassigned by
+ the user. Each event within a resctrl group can be assigned independently.
+
+ In this mode, a monitoring event can only accumulate data while it is backed
+ by a hardware counter. Use "mbm_L3_assignments" found in each CTRL_MON and MON
+ group to specify which of the events should have a counter assigned. The number
+ of counters available is described in the "num_mbm_cntrs" file. Changing the
+ mode may cause all counters on the resource to reset.
+
+ Moving to mbm_event counter assignment mode requires users to assign the counters
+ to the events. Otherwise, the MBM event counters will return 'Unassigned' when read.
+
+ The mode is beneficial for AMD platforms that support more CTRL_MON
+ and MON groups than available hardware counters. By default, this
+ feature is enabled on AMD platforms with the ABMC (Assignable Bandwidth
+ Monitoring Counters) capability, ensuring counters remain assigned even
+ when the corresponding RMID is not actively used by any processor.
+
+ "default":
+
+ In default mode, resctrl assumes there is a hardware counter for each
+		event within every CTRL_MON and MON group. On AMD platforms, it is
+		recommended to use the mbm_event mode, if supported, because in default
+		mode the hardware may re-allocate counters between reads and reset the
+		MBM events, which can result in misleading values or an "Unavailable"
+		reading when no counter is assigned to the event.
+
+ * To enable "mbm_event" counter assignment mode:
+ ::
+
+ # echo "mbm_event" > /sys/fs/resctrl/info/L3_MON/mbm_assign_mode
+
+ * To enable "default" monitoring mode:
+ ::
+
+ # echo "default" > /sys/fs/resctrl/info/L3_MON/mbm_assign_mode
+
+"num_mbm_cntrs":
+ The maximum number of counters (total of available and assigned counters) in
+ each domain when the system supports mbm_event mode.
+
+	For example, on a system with a maximum of 32 memory bandwidth monitoring
+ counters in each of its L3 domains:
+ ::
+
+ # cat /sys/fs/resctrl/info/L3_MON/num_mbm_cntrs
+ 0=32;1=32
+
+"available_mbm_cntrs":
+ The number of counters available for assignment in each domain when mbm_event
+ mode is enabled on the system.
+
+ For example, on a system with 30 available [hardware] assignable counters
+ in each of its L3 domains:
+ ::
+
+ # cat /sys/fs/resctrl/info/L3_MON/available_mbm_cntrs
+ 0=30;1=30
+
+"event_configs":
+ Directory that exists when "mbm_event" counter assignment mode is supported.
+ Contains a sub-directory for each MBM event that can be assigned to a counter.
+
+ Two MBM events are supported by default: mbm_local_bytes and mbm_total_bytes.
+ Each MBM event's sub-directory contains a file named "event_filter" that is
+ used to view and modify which memory transactions the MBM event is configured
+ with. The file is accessible only when "mbm_event" counter assignment mode is
+ enabled.
+
+ List of memory transaction types supported:
+
+ ========================== ========================================================
+ Name Description
+ ========================== ========================================================
+ dirty_victim_writes_all Dirty Victims from the QOS domain to all types of memory
+ remote_reads_slow_memory Reads to slow memory in the non-local NUMA domain
+ local_reads_slow_memory Reads to slow memory in the local NUMA domain
+ remote_non_temporal_writes Non-temporal writes to non-local NUMA domain
+ local_non_temporal_writes Non-temporal writes to local NUMA domain
+ remote_reads Reads to memory in the non-local NUMA domain
+ local_reads Reads to memory in the local NUMA domain
+ ========================== ========================================================
+
+ For example::
+
+ # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_total_bytes/event_filter
+ local_reads,remote_reads,local_non_temporal_writes,remote_non_temporal_writes,
+ local_reads_slow_memory,remote_reads_slow_memory,dirty_victim_writes_all
+
+ # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_local_bytes/event_filter
+ local_reads,local_non_temporal_writes,local_reads_slow_memory
+
+ Modify the event configuration by writing to the "event_filter" file within
+	the "event_configs" directory. The read/write "event_filter" file shows the
+	event's current configuration, i.e. which memory transactions it counts.
+
+ For example::
+
+ # echo "local_reads, local_non_temporal_writes" >
+ /sys/fs/resctrl/info/L3_MON/event_configs/mbm_total_bytes/event_filter
+
+ # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_total_bytes/event_filter
+ local_reads,local_non_temporal_writes
+
+"mbm_assign_on_mkdir":
+ Exists when "mbm_event" counter assignment mode is supported. Accessible
+ only when "mbm_event" counter assignment mode is enabled.
+
+ Determines if a counter will automatically be assigned to an RMID, MBM event
+	pair when its associated monitor group is created via mkdir. It is enabled by
+	default at boot and whenever the mode is switched from "default" to "mbm_event"
+	counter assignment mode. Users can disable it by writing to the interface.
+
+ "0":
+ Auto assignment is disabled.
+ "1":
+ Auto assignment is enabled.
+
+ Example::
+
+ # echo 0 > /sys/fs/resctrl/info/L3_MON/mbm_assign_on_mkdir
+ # cat /sys/fs/resctrl/info/L3_MON/mbm_assign_on_mkdir
+ 0
+
"max_threshold_occupancy":
Read/write file provides the largest value (in
bytes) at which a previously used LLC_occupancy
@@ -380,10 +519,77 @@ When monitoring is enabled all MON groups will also contain:
for the L3 cache they occupy). These are named "mon_sub_L3_YY"
where "YY" is the node number.
+ When the 'mbm_event' counter assignment mode is enabled, reading
+ an MBM event of a MON group returns 'Unassigned' if no hardware
+ counter is assigned to it. For CTRL_MON groups, 'Unassigned' is
+	returned if the MBM event has no counter assigned in either the
+	CTRL_MON group or any of its associated MON groups.
+
"mon_hw_id":
Available only with debug option. The identifier used by hardware
for the monitor group. On x86 this is the RMID.
+When monitoring is enabled all MON groups may also contain:
+
+"mbm_L3_assignments":
+ Exists when "mbm_event" counter assignment mode is supported and lists the
+ counter assignment states of the group.
+
+ The assignment list is displayed in the following format:
+
+ <Event>:<Domain ID>=<Assignment state>;<Domain ID>=<Assignment state>
+
+ Event: A valid MBM event in the
+ /sys/fs/resctrl/info/L3_MON/event_configs directory.
+
+ Domain ID: A valid domain ID. When writing, '*' applies the changes
+ to all the domains.
+
+ Assignment states:
+
+ _ : No counter assigned.
+
+ e : Counter assigned exclusively.
+
+ Example:
+
+ To display the counter assignment states for the default group.
+ ::
+
+ # cd /sys/fs/resctrl
+ # cat /sys/fs/resctrl/mbm_L3_assignments
+ mbm_total_bytes:0=e;1=e
+ mbm_local_bytes:0=e;1=e
+
+ Assignments can be modified by writing to the interface.
+
+ Examples:
+
+ To unassign the counter associated with the mbm_total_bytes event on domain 0:
+ ::
+
+ # echo "mbm_total_bytes:0=_" > /sys/fs/resctrl/mbm_L3_assignments
+ # cat /sys/fs/resctrl/mbm_L3_assignments
+ mbm_total_bytes:0=_;1=e
+ mbm_local_bytes:0=e;1=e
+
+ To unassign the counter associated with the mbm_total_bytes event on all the domains:
+ ::
+
+ # echo "mbm_total_bytes:*=_" > /sys/fs/resctrl/mbm_L3_assignments
+ # cat /sys/fs/resctrl/mbm_L3_assignments
+ mbm_total_bytes:0=_;1=_
+ mbm_local_bytes:0=e;1=e
+
+ To assign a counter associated with the mbm_total_bytes event on all domains in
+ exclusive mode:
+ ::
+
+ # echo "mbm_total_bytes:*=e" > /sys/fs/resctrl/mbm_L3_assignments
+ # cat /sys/fs/resctrl/mbm_L3_assignments
+ mbm_total_bytes:0=e;1=e
+ mbm_local_bytes:0=e;1=e
+
When the "mba_MBps" mount option is used all CTRL_MON groups will also contain:
"mba_MBps_event":
@@ -1429,6 +1635,125 @@ View the llc occupancy snapshot::
# cat /sys/fs/resctrl/p1/mon_data/mon_L3_00/llc_occupancy
11234000
+
+Examples on working with mbm_assign_mode
+========================================
+
+a. Check if MBM counter assignment mode is supported.
+::
+
+ # mount -t resctrl resctrl /sys/fs/resctrl/
+
+ # cat /sys/fs/resctrl/info/L3_MON/mbm_assign_mode
+ [mbm_event]
+ default
+
+The "mbm_event" mode is detected and enabled.
+
+b. Check how many assignable counters are supported.
+::
+
+ # cat /sys/fs/resctrl/info/L3_MON/num_mbm_cntrs
+ 0=32;1=32
+
+c. Check how many assignable counters are available for assignment in each domain.
+::
+
+ # cat /sys/fs/resctrl/info/L3_MON/available_mbm_cntrs
+ 0=30;1=30
+
+d. To list the default group's assignment states.
+::
+
+ # cat /sys/fs/resctrl/mbm_L3_assignments
+ mbm_total_bytes:0=e;1=e
+ mbm_local_bytes:0=e;1=e
+
+e. To unassign the counter associated with the mbm_total_bytes event on domain 0.
+::
+
+ # echo "mbm_total_bytes:0=_" > /sys/fs/resctrl/mbm_L3_assignments
+ # cat /sys/fs/resctrl/mbm_L3_assignments
+ mbm_total_bytes:0=_;1=e
+ mbm_local_bytes:0=e;1=e
+
+f. To unassign the counter associated with the mbm_total_bytes event on all domains.
+::
+
+ # echo "mbm_total_bytes:*=_" > /sys/fs/resctrl/mbm_L3_assignments
+ # cat /sys/fs/resctrl/mbm_L3_assignments
+ mbm_total_bytes:0=_;1=_
+ mbm_local_bytes:0=e;1=e
+
+g. To assign a counter associated with the mbm_total_bytes event on all domains in
+exclusive mode.
+::
+
+ # echo "mbm_total_bytes:*=e" > /sys/fs/resctrl/mbm_L3_assignments
+ # cat /sys/fs/resctrl/mbm_L3_assignments
+ mbm_total_bytes:0=e;1=e
+ mbm_local_bytes:0=e;1=e
+
+h. Read the events mbm_total_bytes and mbm_local_bytes of the default group. Reading
+the events works the same way once counters are assigned.
+::
+
+ # cat /sys/fs/resctrl/mon_data/mon_L3_00/mbm_total_bytes
+ 779247936
+ # cat /sys/fs/resctrl/mon_data/mon_L3_01/mbm_total_bytes
+ 562324232
+ # cat /sys/fs/resctrl/mon_data/mon_L3_00/mbm_local_bytes
+ 212122123
+ # cat /sys/fs/resctrl/mon_data/mon_L3_01/mbm_local_bytes
+ 121212144
+
+i. Check the event configurations.
+::
+
+ # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_total_bytes/event_filter
+ local_reads,remote_reads,local_non_temporal_writes,remote_non_temporal_writes,
+ local_reads_slow_memory,remote_reads_slow_memory,dirty_victim_writes_all
+
+ # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_local_bytes/event_filter
+ local_reads,local_non_temporal_writes,local_reads_slow_memory
+
+j. Change the event configuration for mbm_local_bytes.
+::
+
+ # echo "local_reads, local_non_temporal_writes, local_reads_slow_memory, remote_reads" >
+ /sys/fs/resctrl/info/L3_MON/event_configs/mbm_local_bytes/event_filter
+
+ # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_local_bytes/event_filter
+ local_reads,local_non_temporal_writes,local_reads_slow_memory,remote_reads
+
+k. Now read the local events again. The first read may come back with "Unavailable"
+status. The subsequent read of mbm_local_bytes will display the current value.
+::
+
+ # cat /sys/fs/resctrl/mon_data/mon_L3_00/mbm_local_bytes
+ Unavailable
+ # cat /sys/fs/resctrl/mon_data/mon_L3_00/mbm_local_bytes
+ 2252323
+ # cat /sys/fs/resctrl/mon_data/mon_L3_01/mbm_local_bytes
+ Unavailable
+ # cat /sys/fs/resctrl/mon_data/mon_L3_01/mbm_local_bytes
+ 1566565
+
+l. Users have the option to go back to 'default' mbm_assign_mode if required. This can be
+done using the following command. Note that switching the mbm_assign_mode may reset all
+the MBM counters (and thus all MBM events) of all the resctrl groups.
+::
+
+ # echo "default" > /sys/fs/resctrl/info/L3_MON/mbm_assign_mode
+ # cat /sys/fs/resctrl/info/L3_MON/mbm_assign_mode
+ mbm_event
+ [default]
+
+m. Unmount the resctrl filesystem.
+::
+
+ # umount /sys/fs/resctrl/
+
Intel RDT Errata
================
diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index 486a91633474..4f13b01e42eb 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -209,31 +209,8 @@ method fills in is the "s_op" field. This is a pointer to a "struct
super_operations" which describes the next level of the filesystem
implementation.
-Usually, a filesystem uses one of the generic mount() implementations
-and provides a fill_super() callback instead. The generic variants are:
-
-``mount_bdev``
- mount a filesystem residing on a block device
-
-``mount_nodev``
- mount a filesystem that is not backed by a device
-
-``mount_single``
- mount a filesystem which shares the instance between all mounts
-
-A fill_super() callback implementation has the following arguments:
-
-``struct super_block *sb``
- the superblock structure. The callback must initialize this
- properly.
-
-``void *data``
- arbitrary mount options, usually comes as an ASCII string (see
- "Mount Options" section)
-
-``int silent``
- whether or not to be silent on error
-
+For more information on mounting (and the new mount API), see
+Documentation/filesystems/mount_api.rst.
The Superblock Object
=====================
@@ -327,11 +304,11 @@ or bottom half).
inode->i_lock spinlock held.
This method should be either NULL (normal UNIX filesystem
- semantics) or "generic_delete_inode" (for filesystems that do
+ semantics) or "inode_just_drop" (for filesystems that do
not want to cache inodes - causing "delete_inode" to always be
called regardless of the value of i_nlink)
- The "generic_delete_inode()" behavior is equivalent to the old
+ The "inode_just_drop()" behavior is equivalent to the old
practice of using "force_delete" in the put_inode() case, but
does not have the races that the "force_delete()" approach had.
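
As a minimal sketch (not taken from any in-tree filesystem; the structure name
is made up), a filesystem that never wants to cache inodes would simply point
->drop_inode at the helper::

	static const struct super_operations examplefs_sops = {
		.drop_inode	= inode_just_drop,	/* always returns 1: evict on final iput() */
		/* other methods omitted */
	};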
diff --git a/Documentation/kbuild/kconfig-language.rst b/Documentation/kbuild/kconfig-language.rst
index a91abb8f6840..abce88f15d7c 100644
--- a/Documentation/kbuild/kconfig-language.rst
+++ b/Documentation/kbuild/kconfig-language.rst
@@ -232,6 +232,38 @@ applicable everywhere (see syntax).
enables the third modular state for all config symbols.
At most one symbol may have the "modules" option set.
+- transitional attribute: "transitional"
+ This declares the symbol as transitional, meaning it should be processed
+ during configuration but omitted from newly written .config files.
+ Transitional symbols are useful for backward compatibility during config
+ option migrations - they allow olddefconfig to process existing .config
+ files while ensuring the old option doesn't appear in new configurations.
+
+ A transitional symbol:
+ - Has no prompt (is not visible to users in menus)
+ - Is processed normally during configuration (values are read and used)
+ - Can be referenced in default expressions of other symbols
+ - Is not written to new .config files
+ - Cannot have any other properties (it is a pass-through option)
+
+ Example migration from OLD_NAME to NEW_NAME::
+
+ config NEW_NAME
+ bool "New option name"
+ default OLD_NAME
+ help
+ This replaces the old CONFIG_OLD_NAME option.
+
+ config OLD_NAME
+ bool
+ transitional
+ help
+ Transitional config for OLD_NAME to NEW_NAME migration.
+
+ With this setup, existing .config files with "CONFIG_OLD_NAME=y" will
+ result in "CONFIG_NEW_NAME=y" being set, while CONFIG_OLD_NAME will be
+ omitted from newly written .config files.
+
Menu dependencies
-----------------
diff --git a/Documentation/netlink/specs/conntrack.yaml b/Documentation/netlink/specs/conntrack.yaml
index c6832633ab7b..591e22a2ee43 100644
--- a/Documentation/netlink/specs/conntrack.yaml
+++ b/Documentation/netlink/specs/conntrack.yaml
@@ -575,8 +575,8 @@ operations:
- nat-dst
- timeout
- mark
- - counter-orig
- - counter-reply
+ - counters-orig
+ - counters-reply
- use
- id
- nat-dst
@@ -591,7 +591,6 @@ operations:
request:
value: 0x101
attributes:
- - nfgen-family
- mark
- filter
- status
@@ -608,8 +607,8 @@ operations:
- nat-dst
- timeout
- mark
- - counter-orig
- - counter-reply
+ - counters-orig
+ - counters-reply
- use
- id
- nat-dst
diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml
index 02f1ddcfbf1c..d1b4829b580a 100644
--- a/Documentation/netlink/specs/mptcp_pm.yaml
+++ b/Documentation/netlink/specs/mptcp_pm.yaml
@@ -28,13 +28,13 @@ definitions:
traffic-patterns it can take a long time until the
MPTCP_EVENT_ESTABLISHED is sent.
Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport,
- dport, server-side.
+ dport, server-side, [flags].
-
name: established
doc: >-
A MPTCP connection is established (can start new subflows).
Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport,
- dport, server-side.
+ dport, server-side, [flags].
-
name: closed
doc: >-
@@ -256,7 +256,7 @@ attribute-sets:
type: u32
-
name: if-idx
- type: u32
+ type: s32
-
name: reset-reason
type: u32
diff --git a/Documentation/networking/can.rst b/Documentation/networking/can.rst
index bc1b585355f7..7650c4b5be5f 100644
--- a/Documentation/networking/can.rst
+++ b/Documentation/networking/can.rst
@@ -742,7 +742,7 @@ The broadcast manager sends responses to user space in the same form:
struct timeval ival1, ival2; /* count and subsequent interval */
canid_t can_id; /* unique can_id for task */
__u32 nframes; /* number of can_frames following */
- struct can_frame frames[0];
+ struct can_frame frames[];
};
The aligned payload 'frames' uses the same basic CAN frame structure defined
diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index bb620f554598..9756d16e3df1 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -1420,7 +1420,7 @@ udp_hash_entries - INTEGER
A negative value means the networking namespace does not own its
hash buckets and shares the initial networking namespace's one.
-udp_child_ehash_entries - INTEGER
+udp_child_hash_entries - INTEGER
Control the number of hash buckets for UDP sockets in the child
networking namespace, which must be set before clone() or unshare().
diff --git a/Documentation/networking/mptcp-sysctl.rst b/Documentation/networking/mptcp-sysctl.rst
index 5bfab01eff5a..1683c139821e 100644
--- a/Documentation/networking/mptcp-sysctl.rst
+++ b/Documentation/networking/mptcp-sysctl.rst
@@ -12,6 +12,8 @@ add_addr_timeout - INTEGER (seconds)
resent to an MPTCP peer that has not acknowledged a previous
ADD_ADDR message.
+ Do not retransmit if set to 0.
+
The default value matches TCP_RTO_MAX. This is a per-namespace
sysctl.
diff --git a/Documentation/networking/mptcp.rst b/Documentation/networking/mptcp.rst
index 17f2bab61164..2e31038d6462 100644
--- a/Documentation/networking/mptcp.rst
+++ b/Documentation/networking/mptcp.rst
@@ -60,10 +60,10 @@ address announcements. Typically, it is the client side that initiates subflows,
and the server side that announces additional addresses via the ``ADD_ADDR`` and
``REMOVE_ADDR`` options.
-Path managers are controlled by the ``net.mptcp.pm_type`` sysctl knob -- see
-mptcp-sysctl.rst. There are two types: the in-kernel one (type ``0``) where the
-same rules are applied for all the connections (see: ``ip mptcp``) ; and the
-userspace one (type ``1``), controlled by a userspace daemon (i.e. `mptcpd
+Path managers are controlled by the ``net.mptcp.path_manager`` sysctl knob --
+see mptcp-sysctl.rst. There are two types: the in-kernel one (``kernel``) where
+the same rules are applied for all the connections (see: ``ip mptcp``) ; and the
+userspace one (``userspace``), controlled by a userspace daemon (i.e. `mptcpd
<https://mptcpd.mptcp.dev/>`_) where different rules can be applied for each
connection. The path managers can be controlled via a Netlink API; see
netlink_spec/mptcp_pm.rst.
diff --git a/Documentation/networking/napi.rst b/Documentation/networking/napi.rst
index a15754adb041..7dd60366f4ff 100644
--- a/Documentation/networking/napi.rst
+++ b/Documentation/networking/napi.rst
@@ -433,9 +433,8 @@ Threaded NAPI
Threaded NAPI is an operating mode that uses dedicated kernel
threads rather than software IRQ context for NAPI processing.
-The configuration is per netdevice and will affect all
-NAPI instances of that device. Each NAPI instance will spawn a separate
-thread (called ``napi/${ifc-name}-${napi-id}``).
+Each threaded NAPI instance will spawn a separate thread
+(called ``napi/${ifc-name}-${napi-id}``).
It is recommended to pin each kernel thread to a single CPU, the same
CPU as the CPU which services the interrupt. Note that the mapping
diff --git a/Documentation/process/security-bugs.rst b/Documentation/process/security-bugs.rst
index 56c560a00b37..84657e7d2e5b 100644
--- a/Documentation/process/security-bugs.rst
+++ b/Documentation/process/security-bugs.rst
@@ -8,8 +8,22 @@ like to know when a security bug is found so that it can be fixed and
disclosed as quickly as possible. Please report security bugs to the
Linux kernel security team.
-Contact
--------
+The security team and maintainers almost always require additional
+information beyond what was initially provided in a report and rely on
+active and efficient collaboration with the reporter to perform further
+testing (e.g., verifying versions, configuration options, mitigations, or
+patches). Before contacting the security team, the reporter must ensure
+they are available to explain their findings, engage in discussions, and
+run additional tests. Reports where the reporter does not respond promptly
+or cannot effectively discuss their findings may be abandoned if the
+communication does not quickly improve.
+
+As it is with any bug, the more information provided the easier it
+will be to diagnose and fix. Please review the procedure outlined in
+'Documentation/admin-guide/reporting-issues.rst' if you are unclear about what
+information is helpful. Any exploit code is very helpful and will not
+be released without consent from the reporter unless it has already been
+made public.
The Linux kernel security team can be contacted by email at
<security@kernel.org>. This is a private list of security officers
@@ -19,13 +33,6 @@ that can speed up the process considerably. It is possible that the
security team will bring in extra help from area maintainers to
understand and fix the security vulnerability.
-As it is with any bug, the more information provided the easier it
-will be to diagnose and fix. Please review the procedure outlined in
-'Documentation/admin-guide/reporting-issues.rst' if you are unclear about what
-information is helpful. Any exploit code is very helpful and will not
-be released without consent from the reporter unless it has already been
-made public.
-
Please send plain text emails without attachments where possible.
It is much harder to have a context-quoted discussion about a complex
issue if all the details are hidden away in attachments. Think of it like a
diff --git a/Documentation/sound/alsa-configuration.rst b/Documentation/sound/alsa-configuration.rst
index a45174d165eb..accaebbdd642 100644
--- a/Documentation/sound/alsa-configuration.rst
+++ b/Documentation/sound/alsa-configuration.rst
@@ -2253,8 +2253,15 @@ device_setup
Default: 0x0000
ignore_ctl_error
Ignore any USB-controller regarding mixer interface (default: no)
+    ``ignore_ctl_error=1`` may help if you get an error when accessing
+    a mixer element, such as URB error -22. This happens with some buggy
+    USB devices or controllers. This workaround corresponds to the
+    ``quirk_flags`` bit 14, too.
autoclock
Enable auto-clock selection for UAC2 devices (default: yes)
+lowlatency
+ Enable low latency playback mode (default: yes).
+    It can be disabled to switch back to the old mode if you face a regression.
quirk_alias
Quirk alias list, pass strings like ``0123abcd:5678beef``, which
applies the existing quirk for the device 5678:beef to a new
@@ -2284,6 +2291,11 @@ delayed_register
The driver prints a message like "Found post-registration device
assignment: 1234abcd:04" for such a device, so that user can
notice the need.
+skip_validation
+ Skip unit descriptor validation (default: no).
+    The option makes the driver ignore unit descriptor validation errors
+    and print a hexdump of the descriptor instead of failing the probe,
+    so that its details can be inspected.
quirk_flags
Contains the bit flags for various device specific workarounds.
Applied to the corresponding card index.
@@ -2307,6 +2319,16 @@ quirk_flags
* bit 16: Set up the interface at first like UAC1
* bit 17: Apply the generic implicit feedback sync mode
* bit 18: Don't apply implicit feedback sync mode
+    * bit 19: Don't close the interface when setting the sample rate
+ * bit 20: Force an interface reset whenever stopping & restarting
+ a stream
+ * bit 21: Do not set PCM rate (frequency) when only one rate is
+ available for the given endpoint.
+ * bit 22: Set the fixed resolution 16 for Mic Capture Volume
+ * bit 23: Set the fixed resolution 384 for Mic Capture Volume
+ * bit 24: Set minimum volume control value as mute for devices
+ where the lowest playback value represents muted state instead
+ of minimum audible volume
This module supports multiple devices, autoprobe and hotplugging.
@@ -2314,10 +2336,9 @@ NB: ``nrpacks`` parameter can be modified dynamically via sysfs.
Don't put the value over 20. Changing via sysfs has no sanity
check.
-NB: ``ignore_ctl_error=1`` may help when you get an error at accessing
-the mixer element such as URB error -22. This happens on some
-buggy USB device or the controller. This workaround corresponds to
-the ``quirk_flags`` bit 14, too.
+NB: ``ignore_ctl_error=1`` just provides a quick way to work around the
+issues. If you have a buggy device that requires these quirks, please
+report it upstream.
NB: ``quirk_alias`` option is provided only for testing / development.
If you want to have a proper support, contact to upstream for
diff --git a/Documentation/staging/crc32.rst b/Documentation/staging/crc32.rst
index 7542220967cb..64f3dd430a6c 100644
--- a/Documentation/staging/crc32.rst
+++ b/Documentation/staging/crc32.rst
@@ -34,7 +34,7 @@ do it in the right order, matching the endianness.
Just like with ordinary division, you proceed one digit (bit) at a time.
Each step of the division you take one more digit (bit) of the dividend
and append it to the current remainder. Then you figure out the
-appropriate multiple of the divisor to subtract to being the remainder
+appropriate multiple of the divisor to subtract to bring the remainder
back into range. In binary, this is easy - it has to be either 0 or 1,
and to make the XOR cancel, it's just a copy of bit 32 of the remainder.
@@ -116,7 +116,7 @@ for any fractional bytes at the end.
To reduce the number of conditional branches, software commonly uses
the byte-at-a-time table method, popularized by Dilip V. Sarwate,
"Computation of Cyclic Redundancy Checks via Table Look-Up", Comm. ACM
-v.31 no.8 (August 1998) p. 1008-1013.
+v.31 no.8 (August 1988) p. 1008-1013.
Here, rather than just shifting one bit of the remainder to decide
in the correct multiple to subtract, we can shift a byte at a time.
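
As a hedged aside (illustrative userspace C, assuming the common reflected
polynomial 0xEDB88320; this is not the kernel's own implementation): the
bitwise loop below makes one divisor decision per bit exactly as described
above, and the Sarwate variant folds those eight decisions into a single
256-entry table lookup per byte::

	#include <stddef.h>
	#include <stdint.h>

	static uint32_t crc32_bitwise(uint32_t crc, const uint8_t *p, size_t len)
	{
		crc = ~crc;
		while (len--) {
			crc ^= *p++;
			for (int i = 0; i < 8; i++)	/* one decision per bit */
				crc = (crc >> 1) ^ (0xEDB88320u & -(crc & 1));
		}
		return ~crc;
	}

	static uint32_t crc32_table[256];

	static void crc32_init_table(void)
	{
		for (uint32_t n = 0; n < 256; n++) {
			uint32_t c = n;
			for (int k = 0; k < 8; k++)
				c = (c >> 1) ^ (0xEDB88320u & -(c & 1));
			crc32_table[n] = c;
		}
	}

	/* byte-at-a-time: the low byte of the remainder indexes the table */
	static uint32_t crc32_sarwate(uint32_t crc, const uint8_t *p, size_t len)
	{
		crc = ~crc;
		while (len--)
			crc = (crc >> 8) ^ crc32_table[(crc ^ *p++) & 0xff];
		return ~crc;
	}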
diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index 406a9f4d0869..7c527a01d1cf 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -374,6 +374,8 @@ Code Seq# Include File Comments
<mailto:linuxppc-dev@lists.ozlabs.org>
0xB2 08 arch/powerpc/include/uapi/asm/papr-physical-attestation.h powerpc/pseries Physical Attestation API
<mailto:linuxppc-dev@lists.ozlabs.org>
+0xB2 09 arch/powerpc/include/uapi/asm/papr-hvpipe.h powerpc/pseries HVPIPE API
+ <mailto:linuxppc-dev@lists.ozlabs.org>
0xB3 00 linux/mmc/ioctl.h
0xB4 00-0F linux/gpio.h <mailto:linux-gpio@vger.kernel.org>
0xB5 00-0F uapi/linux/rpmsg.h <mailto:linux-remoteproc@vger.kernel.org>
diff --git a/Documentation/userspace-api/iommufd.rst b/Documentation/userspace-api/iommufd.rst
index 03f7510384d2..f1c4d21e5c5e 100644
--- a/Documentation/userspace-api/iommufd.rst
+++ b/Documentation/userspace-api/iommufd.rst
@@ -43,7 +43,7 @@ Following IOMMUFD objects are exposed to userspace:
- IOMMUFD_OBJ_HWPT_PAGING, representing an actual hardware I/O page table
(i.e. a single struct iommu_domain) managed by the iommu driver. "PAGING"
- primarly indicates this type of HWPT should be linked to an IOAS. It also
+ primarily indicates this type of HWPT should be linked to an IOAS. It also
indicates that it is backed by an iommu_domain with __IOMMU_DOMAIN_PAGING
feature flag. This can be either an UNMANAGED stage-1 domain for a device
running in the user space, or a nesting parent stage-2 domain for mappings
@@ -76,7 +76,7 @@ Following IOMMUFD objects are exposed to userspace:
* Security namespace for guest owned ID, e.g. guest-controlled cache tags
* Non-device-affiliated event reporting, e.g. invalidation queue errors
- * Access to a sharable nesting parent pagetable across physical IOMMUs
+ * Access to a shareable nesting parent pagetable across physical IOMMUs
* Virtualization of various platforms IDs, e.g. RIDs and others
* Delivery of paravirtualized invalidation
* Direct assigned invalidation queues
diff --git a/Kbuild b/Kbuild
index f327ca86990c..13324b4bbe23 100644
--- a/Kbuild
+++ b/Kbuild
@@ -34,13 +34,24 @@ arch/$(SRCARCH)/kernel/asm-offsets.s: $(timeconst-file) $(bounds-file)
$(offsets-file): arch/$(SRCARCH)/kernel/asm-offsets.s FORCE
$(call filechk,offsets,__ASM_OFFSETS_H__)
+# Generate rq-offsets.h
+
+rq-offsets-file := include/generated/rq-offsets.h
+
+targets += kernel/sched/rq-offsets.s
+
+kernel/sched/rq-offsets.s: $(offsets-file)
+
+$(rq-offsets-file): kernel/sched/rq-offsets.s FORCE
+ $(call filechk,offsets,__RQ_OFFSETS_H__)
+
# Check for missing system calls
quiet_cmd_syscalls = CALL $<
cmd_syscalls = $(CONFIG_SHELL) $< $(CC) $(c_flags) $(missing_syscalls_flags)
PHONY += missing-syscalls
-missing-syscalls: scripts/checksyscalls.sh $(offsets-file)
+missing-syscalls: scripts/checksyscalls.sh $(rq-offsets-file)
$(call cmd,syscalls)
# Check the manual modification of atomic headers
diff --git a/MAINTAINERS b/MAINTAINERS
index fe168477caa4..62d16c20a888 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -931,13 +931,13 @@ F: Documentation/devicetree/bindings/dma/altr,msgdma.yaml
F: drivers/dma/altera-msgdma.c
ALTERA PIO DRIVER
-M: Mun Yew Tham <mun.yew.tham@intel.com>
+M: Adrian Ng <adrianhoyin.ng@altera.com>
L: linux-gpio@vger.kernel.org
S: Maintained
F: drivers/gpio/gpio-altera.c
ALTERA TRIPLE SPEED ETHERNET DRIVER
-M: Joyce Ooi <joyce.ooi@intel.com>
+M: Boon Khai Ng <boon.khai.ng@altera.com>
L: netdev@vger.kernel.org
S: Maintained
F: drivers/net/ethernet/altera/
@@ -1772,7 +1772,7 @@ F: drivers/staging/iio/*/ad*
X: drivers/iio/*/adjd*
ANALOGBITS PLL LIBRARIES
-M: Paul Walmsley <paul.walmsley@sifive.com>
+M: Paul Walmsley <pjw@kernel.org>
M: Samuel Holland <samuel.holland@sifive.com>
S: Supported
F: drivers/clk/analogbits/*
@@ -1845,7 +1845,6 @@ S: Odd fixes
F: drivers/input/mouse/bcm5974.c
APPLE PCIE CONTROLLER DRIVER
-M: Alyssa Rosenzweig <alyssa@rosenzweig.io>
M: Marc Zyngier <maz@kernel.org>
L: linux-pci@vger.kernel.org
S: Maintained
@@ -2364,7 +2363,6 @@ F: sound/soc/codecs/ssm3515.c
ARM/APPLE MACHINE SUPPORT
M: Sven Peter <sven@kernel.org>
M: Janne Grunau <j@jannau.net>
-R: Alyssa Rosenzweig <alyssa@rosenzweig.io>
R: Neal Gompa <neal@gompa.dev>
L: asahi@lists.linux.dev
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
@@ -3526,7 +3524,7 @@ F: Documentation/devicetree/bindings/arm/ti/nspire.yaml
F: arch/arm/boot/dts/nspire/
ARM/TOSHIBA VISCONTI ARCHITECTURE
-M: Nobuhiro Iwamatsu <nobuhiro1.iwamatsu@toshiba.co.jp>
+M: Nobuhiro Iwamatsu <nobuhiro.iwamatsu.x90@mail.toshiba>
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
S: Supported
T: git git://git.kernel.org/pub/scm/linux/kernel/git/iwamatsu/linux-visconti.git
@@ -3667,6 +3665,7 @@ F: drivers/virt/coco/arm-cca-guest/
F: drivers/virt/coco/pkvm-guest/
F: tools/testing/selftests/arm64/
X: arch/arm64/boot/dts/
+X: arch/arm64/configs/defconfig
ARROW SPEEDCHIPS XRS7000 SERIES ETHERNET SWITCH DRIVER
M: George McCollister <george.mccollister@gmail.com>
@@ -3989,8 +3988,9 @@ F: drivers/input/touchscreen/atmel_mxt_ts.c
ATOMIC INFRASTRUCTURE
M: Will Deacon <will@kernel.org>
M: Peter Zijlstra <peterz@infradead.org>
-R: Boqun Feng <boqun.feng@gmail.com>
+M: Boqun Feng <boqun.feng@gmail.com>
R: Mark Rutland <mark.rutland@arm.com>
+R: Gary Guo <gary@garyguo.net>
L: linux-kernel@vger.kernel.org
S: Maintained
F: Documentation/atomic_*.txt
@@ -3998,6 +3998,9 @@ F: arch/*/include/asm/atomic*.h
F: include/*/atomic*.h
F: include/linux/refcount.h
F: scripts/atomic/
+F: rust/kernel/sync/atomic.rs
+F: rust/kernel/sync/atomic/
+F: rust/kernel/sync/refcount.rs
ATTO EXPRESSSAS SAS/SATA RAID SCSI DRIVER
M: Bradley Grove <linuxdrivers@attotech.com>
@@ -4205,7 +4208,7 @@ W: http://www.baycom.org/~tom/ham/ham.html
F: drivers/net/hamradio/baycom*
BCACHE (BLOCK LAYER CACHE)
-M: Coly Li <colyli@kernel.org>
+M: Coly Li <colyli@fnnas.com>
M: Kent Overstreet <kent.overstreet@linux.dev>
L: linux-bcache@vger.kernel.org
S: Maintained
@@ -4216,12 +4219,9 @@ F: drivers/md/bcache/
BCACHEFS
M: Kent Overstreet <kent.overstreet@linux.dev>
L: linux-bcachefs@vger.kernel.org
-S: Supported
+S: Externally maintained
C: irc://irc.oftc.net/bcache
-P: Documentation/filesystems/bcachefs/SubmittingPatches.rst
T: git https://evilpiepirate.org/git/bcachefs.git
-F: fs/bcachefs/
-F: Documentation/filesystems/bcachefs/
BDISP ST MEDIA DRIVER
M: Fabien Dessenne <fabien.dessenne@foss.st.com>
@@ -4682,7 +4682,6 @@ F: security/bpf/
BPF [SELFTESTS] (Test Runners & Infrastructure)
M: Andrii Nakryiko <andrii@kernel.org>
M: Eduard Zingerman <eddyz87@gmail.com>
-R: Mykola Lysenko <mykolal@fb.com>
L: bpf@vger.kernel.org
S: Maintained
F: tools/testing/selftests/bpf/
@@ -5258,7 +5257,6 @@ F: drivers/gpio/gpio-bt8xx.c
BTRFS FILE SYSTEM
M: Chris Mason <clm@fb.com>
-M: Josef Bacik <josef@toxicpanda.com>
M: David Sterba <dsterba@suse.com>
L: linux-btrfs@vger.kernel.org
S: Maintained
@@ -6224,7 +6222,7 @@ M: Josef Bacik <josef@toxicpanda.com>
M: Jens Axboe <axboe@kernel.dk>
L: cgroups@vger.kernel.org
L: linux-block@vger.kernel.org
-T: git git://git.kernel.dk/linux-block
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux.git
F: Documentation/admin-guide/cgroup-v1/blkio-controller.rst
F: block/bfq-cgroup.c
F: block/blk-cgroup.c
@@ -6484,6 +6482,7 @@ S: Supported
T: git https://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/lsm.git
F: include/linux/cred.h
F: kernel/cred.c
+F: rust/kernel/cred.rs
F: Documentation/security/credentials.rst
INTEL CRPS COMMON REDUNDANT PSU DRIVER
@@ -7239,15 +7238,15 @@ F: include/linux/swiotlb.h
F: kernel/dma/
DMA MAPPING HELPERS DEVICE DRIVER API [RUST]
-M: Abdiel Janulgue <abdiel.janulgue@gmail.com>
M: Danilo Krummrich <dakr@kernel.org>
+R: Abdiel Janulgue <abdiel.janulgue@gmail.com>
R: Daniel Almeida <daniel.almeida@collabora.com>
R: Robin Murphy <robin.murphy@arm.com>
R: Andreas Hindborg <a.hindborg@kernel.org>
L: rust-for-linux@vger.kernel.org
S: Supported
W: https://rust-for-linux.com
-T: git https://github.com/Rust-for-Linux/linux.git alloc-next
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/driver-core/driver-core.git
F: rust/helpers/dma.c
F: rust/kernel/dma.rs
F: samples/rust/rust_dma.rs
@@ -7431,7 +7430,7 @@ S: Supported
F: Documentation/devicetree/bindings/dpll/dpll-device.yaml
F: Documentation/devicetree/bindings/dpll/dpll-pin.yaml
F: Documentation/driver-api/dpll.rst
-F: drivers/dpll/*
+F: drivers/dpll/
F: include/linux/dpll.h
F: include/uapi/linux/dpll.h
@@ -7820,7 +7819,7 @@ Q: https://patchwork.freedesktop.org/project/nouveau/
Q: https://gitlab.freedesktop.org/drm/nouveau/-/merge_requests
B: https://gitlab.freedesktop.org/drm/nouveau/-/issues
C: irc://irc.oftc.net/nouveau
-T: git https://gitlab.freedesktop.org/drm/nouveau.git
+T: git https://gitlab.freedesktop.org/drm/misc/kernel.git
F: drivers/gpu/drm/nouveau/
F: include/uapi/drm/nouveau_drm.h
@@ -8079,7 +8078,6 @@ F: Documentation/devicetree/bindings/gpu/
F: Documentation/gpu/
F: drivers/gpu/drm/
F: drivers/gpu/vga/
-F: rust/kernel/drm/
F: include/drm/drm
F: include/linux/vga*
F: include/uapi/drm/
@@ -8091,11 +8089,21 @@ X: drivers/gpu/drm/i915/
X: drivers/gpu/drm/kmb/
X: drivers/gpu/drm/mediatek/
X: drivers/gpu/drm/msm/
-X: drivers/gpu/drm/nouveau/
+X: drivers/gpu/drm/nova/
X: drivers/gpu/drm/radeon/
X: drivers/gpu/drm/tegra/
X: drivers/gpu/drm/xe/
+DRM DRIVERS AND COMMON INFRASTRUCTURE [RUST]
+M: Danilo Krummrich <dakr@kernel.org>
+M: Alice Ryhl <aliceryhl@google.com>
+S: Supported
+W: https://drm.pages.freedesktop.org/maintainer-tools/drm-rust.html
+T: git https://gitlab.freedesktop.org/drm/rust/kernel.git
+F: drivers/gpu/drm/nova/
+F: drivers/gpu/nova-core/
+F: rust/kernel/drm/
+
DRM DRIVERS FOR ALLWINNER A10
M: Maxime Ripard <mripard@kernel.org>
M: Chen-Yu Tsai <wens@csie.org>
@@ -8426,6 +8434,17 @@ T: git https://gitlab.freedesktop.org/drm/misc/kernel.git
F: drivers/gpu/drm/scheduler/
F: include/drm/gpu_scheduler.h
+DRM GPUVM
+M: Danilo Krummrich <dakr@kernel.org>
+R: Matthew Brost <matthew.brost@intel.com>
+R: Thomas Hellström <thomas.hellstrom@linux.intel.com>
+R: Alice Ryhl <aliceryhl@google.com>
+L: dri-devel@lists.freedesktop.org
+S: Supported
+T: git https://gitlab.freedesktop.org/drm/misc/kernel.git
+F: drivers/gpu/drm/drm_gpuvm.c
+F: include/drm/drm_gpuvm.h
+
DRM LOG
M: Jocelyn Falempe <jfalempe@redhat.com>
M: Javier Martinez Canillas <javierm@redhat.com>
@@ -8726,9 +8745,6 @@ F: drivers/edac/thunderx_edac*
EDAC-CORE
M: Borislav Petkov <bp@alien8.de>
M: Tony Luck <tony.luck@intel.com>
-R: James Morse <james.morse@arm.com>
-R: Mauro Carvalho Chehab <mchehab@kernel.org>
-R: Robert Richter <rric@kernel.org>
L: linux-edac@vger.kernel.org
S: Supported
T: git git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git edac-for-next
@@ -8736,6 +8752,13 @@ F: Documentation/driver-api/edac.rst
F: drivers/edac/
F: include/linux/edac.h
+EDAC-A72
+M: Vijay Balakrishna <vijayb@linux.microsoft.com>
+M: Tyler Hicks <code@tyhicks.com>
+L: linux-edac@vger.kernel.org
+S: Supported
+F: drivers/edac/a72_edac.c
+
EDAC-DMC520
M: Lei Wang <lewan@microsoft.com>
L: linux-edac@vger.kernel.org
@@ -9744,11 +9767,14 @@ F: drivers/video/fbdev/imxfb.c
FREESCALE IMX DDR PMU DRIVER
M: Frank Li <Frank.li@nxp.com>
+M: Xu Yang <xu.yang_2@nxp.com>
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
S: Maintained
F: Documentation/admin-guide/perf/imx-ddr.rst
F: Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml
F: drivers/perf/fsl_imx8_ddr_perf.c
+F: drivers/perf/fsl_imx9_ddr_perf.c
+F: tools/perf/pmu-events/arch/arm64/freescale/
FREESCALE IMX I2C DRIVER
M: Oleksij Rempel <o.rempel@pengutronix.de>
@@ -10377,7 +10403,7 @@ S: Maintained
F: drivers/input/touchscreen/goodix*
GOOGLE ETHERNET DRIVERS
-M: Jeroen de Borst <jeroendb@google.com>
+M: Joshua Washington <joshwash@google.com>
M: Harshitha Ramamurthy <hramamurthy@google.com>
L: netdev@vger.kernel.org
S: Maintained
@@ -10655,7 +10681,8 @@ S: Maintained
F: block/partitions/efi.*
HABANALABS PCI DRIVER
-M: Yaron Avizrat <yaron.avizrat@intel.com>
+M: Koby Elbaz <koby.elbaz@intel.com>
+M: Konstantin Sinyuk <konstantin.sinyuk@intel.com>
L: dri-devel@lists.freedesktop.org
S: Supported
C: irc://irc.oftc.net/dri-devel
@@ -10791,8 +10818,10 @@ M: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
M: Yangtao Li <frank.li@vivo.com>
L: linux-fsdevel@vger.kernel.org
S: Maintained
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/vdubeyko/hfs.git
F: Documentation/filesystems/hfs.rst
F: fs/hfs/
+F: include/linux/hfs_common.h
HFSPLUS FILESYSTEM
M: Viacheslav Dubeyko <slava@dubeyko.com>
@@ -10800,8 +10829,10 @@ M: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
M: Yangtao Li <frank.li@vivo.com>
L: linux-fsdevel@vger.kernel.org
S: Maintained
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/vdubeyko/hfs.git
F: Documentation/filesystems/hfsplus.rst
F: fs/hfsplus/
+F: include/linux/hfs_common.h
HGA FRAMEBUFFER DRIVER
M: Ferenc Bakonyi <fero@drama.obuda.kando.hu>
@@ -11013,7 +11044,7 @@ F: Documentation/admin-guide/perf/hns3-pmu.rst
F: drivers/perf/hisilicon/hns3_pmu.c
HISILICON I2C CONTROLLER DRIVER
-M: Yicong Yang <yangyicong@hisilicon.com>
+M: Devyn Liu <liudingyuan@h-partners.com>
L: linux-i2c@vger.kernel.org
S: Maintained
W: https://www.hisilicon.com
@@ -11059,7 +11090,6 @@ F: Documentation/devicetree/bindings/net/hisilicon*.txt
F: drivers/net/ethernet/hisilicon/
HISILICON PMU DRIVER
-M: Yicong Yang <yangyicong@hisilicon.com>
M: Jonathan Cameron <jonathan.cameron@huawei.com>
S: Supported
W: http://www.hisilicon.com
@@ -11438,6 +11468,7 @@ F: drivers/tty/hvc/
HUNG TASK DETECTOR
M: Andrew Morton <akpm@linux-foundation.org>
R: Lance Yang <lance.yang@linux.dev>
+R: Masami Hiramatsu <mhiramat@kernel.org>
L: linux-kernel@vger.kernel.org
S: Maintained
F: include/linux/hung_task.h
@@ -12280,7 +12311,6 @@ F: include/linux/avf/virtchnl.h
F: include/linux/net/intel/*/
INTEL ETHERNET PROTOCOL DRIVER FOR RDMA
-M: Mustafa Ismail <mustafa.ismail@intel.com>
M: Tatyana Nikolova <tatyana.e.nikolova@intel.com>
L: linux-rdma@vger.kernel.org
S: Supported
@@ -12583,10 +12613,9 @@ S: Supported
F: drivers/cpufreq/intel_pstate.c
INTEL PTP DFL ToD DRIVER
-M: Tianfei Zhang <tianfei.zhang@intel.com>
L: linux-fpga@vger.kernel.org
L: netdev@vger.kernel.org
-S: Maintained
+S: Orphan
F: drivers/ptp/ptp_dfl_tod.c
INTEL QUADRATURE ENCODER PERIPHERAL DRIVER
@@ -12724,9 +12753,8 @@ S: Maintained
F: drivers/platform/x86/intel/wmi/thunderbolt.c
INTEL WWAN IOSM DRIVER
-M: M Chetan Kumar <m.chetan.kumar@intel.com>
L: netdev@vger.kernel.org
-S: Maintained
+S: Orphan
F: drivers/net/wwan/iosm/
INTEL(R) FLEXIBLE RETURN AND EVENT DELIVERY
@@ -12860,8 +12888,8 @@ IO_URING
M: Jens Axboe <axboe@kernel.dk>
L: io-uring@vger.kernel.org
S: Maintained
-T: git git://git.kernel.dk/linux-block
-T: git git://git.kernel.dk/liburing
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux.git
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/liburing.git
F: include/linux/io_uring/
F: include/linux/io_uring.h
F: include/linux/io_uring_types.h
@@ -13686,7 +13714,6 @@ F: scripts/Makefile.kmsan
KPROBES
M: Naveen N Rao <naveen@kernel.org>
-M: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
M: "David S. Miller" <davem@davemloft.net>
M: Masami Hiramatsu <mhiramat@kernel.org>
L: linux-kernel@vger.kernel.org
@@ -15674,7 +15701,6 @@ MEDIATEK T7XX 5G WWAN MODEM DRIVER
M: Chandrashekar Devegowda <chandrashekar.devegowda@intel.com>
R: Chiranjeevi Rapolu <chiranjeevi.rapolu@linux.intel.com>
R: Liu Haijun <haijun.liu@mediatek.com>
-R: M Chetan Kumar <m.chetan.kumar@linux.intel.com>
R: Ricardo Martinez <ricardo.martinez@linux.intel.com>
L: netdev@vger.kernel.org
S: Supported
@@ -15732,13 +15758,6 @@ S: Supported
W: http://www.melexis.com
F: drivers/iio/temperature/mlx90635.c
-MELFAS MIP4 TOUCHSCREEN DRIVER
-M: Sangwon Jee <jeesw@melfas.com>
-S: Supported
-W: http://www.melfas.com
-F: Documentation/devicetree/bindings/input/touchscreen/melfas_mip4.txt
-F: drivers/input/touchscreen/melfas_mip4.c
-
MELLANOX BLUEFIELD I2C DRIVER
M: Khalil Blaiech <kblaiech@nvidia.com>
M: Asmaa Mnebhi <asmaa@nvidia.com>
@@ -16061,6 +16080,23 @@ F: mm/mempolicy.c
F: mm/migrate.c
F: mm/migrate_device.c
+MEMORY MANAGEMENT - MGLRU (MULTI-GEN LRU)
+M: Andrew Morton <akpm@linux-foundation.org>
+M: Axel Rasmussen <axelrasmussen@google.com>
+M: Yuanchu Xie <yuanchu@google.com>
+R: Wei Xu <weixugc@google.com>
+L: linux-mm@kvack.org
+S: Maintained
+W: http://www.linux-mm.org
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
+F: Documentation/admin-guide/mm/multigen_lru.rst
+F: Documentation/mm/multigen_lru.rst
+F: include/linux/mm_inline.h
+F: include/linux/mmzone.h
+F: mm/swap.c
+F: mm/vmscan.c
+F: mm/workingset.c
+
MEMORY MANAGEMENT - MISC
M: Andrew Morton <akpm@linux-foundation.org>
M: David Hildenbrand <david@redhat.com>
@@ -16102,6 +16138,7 @@ M: Andrew Morton <akpm@linux-foundation.org>
M: Mike Rapoport <rppt@kernel.org>
L: linux-mm@kvack.org
S: Maintained
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/rppt/memblock.git
F: include/linux/numa_memblks.h
F: mm/numa.c
F: mm/numa_emulation.c
@@ -16169,6 +16206,7 @@ R: Rik van Riel <riel@surriel.com>
R: Liam R. Howlett <Liam.Howlett@oracle.com>
R: Vlastimil Babka <vbabka@suse.cz>
R: Harry Yoo <harry.yoo@oracle.com>
+R: Jann Horn <jannh@google.com>
L: linux-mm@kvack.org
S: Maintained
F: include/linux/rmap.h
@@ -16213,6 +16251,7 @@ R: Nico Pache <npache@redhat.com>
R: Ryan Roberts <ryan.roberts@arm.com>
R: Dev Jain <dev.jain@arm.com>
R: Barry Song <baohua@kernel.org>
+R: Lance Yang <lance.yang@linux.dev>
L: linux-mm@kvack.org
S: Maintained
W: http://www.linux-mm.org
@@ -16251,8 +16290,10 @@ S: Maintained
W: http://www.linux-mm.org
T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
F: rust/helpers/mm.c
+F: rust/helpers/page.c
F: rust/kernel/mm.rs
F: rust/kernel/mm/
+F: rust/kernel/page.rs
MEMORY MAPPING
M: Andrew Morton <akpm@linux-foundation.org>
@@ -16661,7 +16702,6 @@ F: drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gpio.c
F: drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_otpe2p.c
MICROCHIP PCI1XXXX I2C DRIVER
-M: Tharun Kumar P <tharunkumar.pasumarthi@microchip.com>
M: Kumaravel Thiagarajan <kumaravel.thiagarajan@microchip.com>
M: Microchip Linux Driver Support <UNGLinuxDriver@microchip.com>
L: linux-i2c@vger.kernel.org
@@ -16670,7 +16710,6 @@ F: drivers/i2c/busses/i2c-mchp-pci1xxxx.c
MICROCHIP PCIe UART DRIVER
M: Kumaravel Thiagarajan <kumaravel.thiagarajan@microchip.com>
-M: Tharun Kumar P <tharunkumar.pasumarthi@microchip.com>
L: linux-serial@vger.kernel.org
S: Maintained
F: drivers/tty/serial/8250/8250_pci1xxxx.c
@@ -17451,6 +17490,8 @@ F: drivers/net/ethernet/neterion/
NETFILTER
M: Pablo Neira Ayuso <pablo@netfilter.org>
M: Jozsef Kadlecsik <kadlec@netfilter.org>
+M: Florian Westphal <fw@strlen.de>
+R: Phil Sutter <phil@nwl.cc>
L: netfilter-devel@vger.kernel.org
L: coreteam@netfilter.org
S: Maintained
@@ -17820,9 +17861,9 @@ F: net/ipv6/syncookies.c
F: net/ipv6/tcp*.c
NETWORKING [TLS]
-M: Boris Pismenny <borisp@nvidia.com>
M: John Fastabend <john.fastabend@gmail.com>
M: Jakub Kicinski <kuba@kernel.org>
+M: Sabrina Dubroca <sd@queasysnail.net>
L: netdev@vger.kernel.org
S: Maintained
F: include/net/tls.h
@@ -19265,7 +19306,7 @@ S: Maintained
F: drivers/pci/controller/dwc/*layerscape*
PCI DRIVER FOR FU740
-M: Paul Walmsley <paul.walmsley@sifive.com>
+M: Paul Walmsley <pjw@kernel.org>
M: Greentime Hu <greentime.hu@sifive.com>
M: Samuel Holland <samuel.holland@sifive.com>
L: linux-pci@vger.kernel.org
@@ -19834,6 +19875,7 @@ M: Christian Brauner <christian@brauner.io>
L: linux-kernel@vger.kernel.org
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git
+F: rust/kernel/pid_namespace.rs
F: samples/pidfd/
F: tools/testing/selftests/clone3/
F: tools/testing/selftests/pid_namespace/
@@ -20745,8 +20787,8 @@ S: Supported
F: drivers/dma/qcom/hidma*
QUALCOMM I2C QCOM GENI DRIVER
-M: Mukesh Kumar Savaliya <quic_msavaliy@quicinc.com>
-M: Viken Dadhaniya <quic_vdadhani@quicinc.com>
+M: Mukesh Kumar Savaliya <mukesh.savaliya@oss.qualcomm.com>
+M: Viken Dadhaniya <viken.dadhaniya@oss.qualcomm.com>
L: linux-i2c@vger.kernel.org
L: linux-arm-msm@vger.kernel.org
S: Maintained
@@ -20850,8 +20892,8 @@ S: Maintained
F: drivers/firmware/qcom/qcom_qseecom_uefisecapp.c
QUALCOMM RMNET DRIVER
-M: Subash Abhinov Kasiviswanathan <quic_subashab@quicinc.com>
-M: Sean Tranchetti <quic_stranche@quicinc.com>
+M: Subash Abhinov Kasiviswanathan <subash.a.kasiviswanathan@oss.qualcomm.com>
+M: Sean Tranchetti <sean.tranchetti@oss.qualcomm.com>
L: netdev@vger.kernel.org
S: Maintained
F: Documentation/networking/device_drivers/cellular/qualcomm/rmnet.rst
@@ -21144,6 +21186,7 @@ M: Tony Luck <tony.luck@intel.com>
M: Reinette Chatre <reinette.chatre@intel.com>
R: Dave Martin <Dave.Martin@arm.com>
R: James Morse <james.morse@arm.com>
+R: Babu Moger <babu.moger@amd.com>
L: linux-kernel@vger.kernel.org
S: Supported
F: Documentation/filesystems/resctrl.rst
@@ -21625,7 +21668,7 @@ F: Documentation/devicetree/bindings/timer/andestech,plmt0.yaml
F: arch/riscv/boot/dts/andes/
RISC-V ARCHITECTURE
-M: Paul Walmsley <paul.walmsley@sifive.com>
+M: Paul Walmsley <pjw@kernel.org>
M: Palmer Dabbelt <palmer@dabbelt.com>
M: Albert Ou <aou@eecs.berkeley.edu>
R: Alexandre Ghiti <alex@ghiti.fr>
@@ -22019,6 +22062,7 @@ F: drivers/infiniband/ulp/rtrs/
RUNTIME VERIFICATION (RV)
M: Steven Rostedt <rostedt@goodmis.org>
+M: Gabriele Monaco <gmonaco@redhat.com>
L: linux-trace-kernel@vger.kernel.org
S: Maintained
F: Documentation/trace/rv/
@@ -22176,7 +22220,7 @@ F: arch/s390/mm
S390 NETWORK DRIVERS
M: Alexandra Winter <wintera@linux.ibm.com>
-M: Thorsten Winkler <twinkler@linux.ibm.com>
+R: Aswin Karuvally <aswin@linux.ibm.com>
L: linux-s390@vger.kernel.org
L: netdev@vger.kernel.org
S: Supported
@@ -22790,6 +22834,7 @@ F: include/linux/security.h
F: include/uapi/linux/lsm.h
F: security/
F: tools/testing/selftests/lsm/
+F: rust/kernel/security.rs
X: security/selinux/
K: \bsecurity_[a-z_0-9]\+\b
@@ -23076,7 +23121,7 @@ S: Maintained
F: drivers/watchdog/simatic-ipc-wdt.c
SIFIVE DRIVERS
-M: Paul Walmsley <paul.walmsley@sifive.com>
+M: Paul Walmsley <pjw@kernel.org>
M: Samuel Holland <samuel.holland@sifive.com>
L: linux-riscv@lists.infradead.org
S: Supported
@@ -23680,6 +23725,12 @@ W: https://linuxtv.org
Q: http://patchwork.linuxtv.org/project/linux-media/list/
F: drivers/media/dvb-frontends/sp2*
+SPACEMIT K1 I2C DRIVER
+M: Troy Mitchell <troy.mitchell@linux.spacemit.com>
+S: Maintained
+F: Documentation/devicetree/bindings/i2c/spacemit,k1-i2c.yaml
+F: drivers/i2c/busses/i2c-k1.c
+
SPANISH DOCUMENTATION
M: Carlos Bilbao <carlos.bilbao@kernel.org>
R: Avadhut Naik <avadhut.naik@amd.com>
@@ -24225,6 +24276,12 @@ S: Maintained
F: Documentation/devicetree/bindings/input/allwinner,sun4i-a10-lradc-keys.yaml
F: drivers/input/keyboard/sun4i-lradc-keys.c
+SUNDANCE NETWORK DRIVER
+M: Denis Kirjanov <kirjanov@gmail.com>
+L: netdev@vger.kernel.org
+S: Maintained
+F: drivers/net/ethernet/dlink/sundance.c
+
SUNPLUS ETHERNET DRIVER
M: Wells Lu <wellslutw@gmail.com>
L: netdev@vger.kernel.org
@@ -24446,9 +24503,8 @@ F: Documentation/devicetree/bindings/media/snps,dw-hdmi-rx.yaml
F: drivers/media/platform/synopsys/hdmirx/*
SYNOPSYS DESIGNWARE I2C DRIVER
-M: Jarkko Nikula <jarkko.nikula@linux.intel.com>
+M: Mika Westerberg <mika.westerberg@linux.intel.com>
R: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
-R: Mika Westerberg <mika.westerberg@linux.intel.com>
R: Jan Dabros <jsd@semihalf.com>
L: linux-i2c@vger.kernel.org
S: Supported
@@ -26747,7 +26803,7 @@ F: drivers/nvdimm/nd_virtio.c
F: drivers/nvdimm/virtio_pmem.c
VIRTIO RTC DRIVER
-M: Peter Hilber <quic_philber@quicinc.com>
+M: Peter Hilber <peter.hilber@oss.qualcomm.com>
L: virtualization@lists.linux.dev
S: Maintained
F: drivers/virtio/virtio_rtc_*
@@ -27624,6 +27680,13 @@ S: Maintained
F: Documentation/devicetree/bindings/memory-controllers/xlnx,versal-ddrmc-edac.yaml
F: drivers/edac/versal_edac.c
+XILINX VERSALNET EDAC DRIVER
+M: Shubhrajyoti Datta <shubhrajyoti.datta@amd.com>
+S: Maintained
+F: Documentation/devicetree/bindings/memory-controllers/xlnx,versal-net-ddrmc5.yaml
+F: drivers/edac/versalnet_edac.c
+F: include/linux/cdx/edac_cdx_pcol.h
+
XILINX WATCHDOG DRIVER
M: Srinivas Neeli <srinivas.neeli@amd.com>
R: Shubhrajyoti Datta <shubhrajyoti.datta@amd.com>
diff --git a/Makefile b/Makefile
index 6bfe776bf3c5..d426446aeef5 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
VERSION = 6
PATCHLEVEL = 17
SUBLEVEL = 0
-EXTRAVERSION = -rc1
+EXTRAVERSION =
NAME = Baby Opossum Posse
# *DOCUMENTATION*
@@ -1020,7 +1020,7 @@ KBUILD_AFLAGS += -fno-lto
export CC_FLAGS_LTO
endif
-ifdef CONFIG_CFI_CLANG
+ifdef CONFIG_CFI
CC_FLAGS_CFI := -fsanitize=kcfi
ifdef CONFIG_CFI_ICALL_NORMALIZE_INTEGERS
CC_FLAGS_CFI += -fsanitize-cfi-icall-experimental-normalize-integers
@@ -1444,11 +1444,11 @@ endif
tools/: FORCE
$(Q)mkdir -p $(objtree)/tools
- $(Q)$(MAKE) LDFLAGS= O=$(abspath $(objtree)) subdir=tools -C $(srctree)/tools/
+ $(Q)$(MAKE) O=$(abspath $(objtree)) subdir=tools -C $(srctree)/tools/
tools/%: FORCE
$(Q)mkdir -p $(objtree)/tools
- $(Q)$(MAKE) LDFLAGS= O=$(abspath $(objtree)) subdir=tools -C $(srctree)/tools/ $*
+ $(Q)$(MAKE) O=$(abspath $(objtree)) subdir=tools -C $(srctree)/tools/ $*
# ---------------------------------------------------------------------------
# Kernel selftest
diff --git a/arch/Kconfig b/arch/Kconfig
index d1b4ffd6e085..c10a301950ff 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -41,6 +41,44 @@ config HOTPLUG_SMT
config SMT_NUM_THREADS_DYNAMIC
bool
+config ARCH_SUPPORTS_SCHED_SMT
+ bool
+
+config ARCH_SUPPORTS_SCHED_CLUSTER
+ bool
+
+config ARCH_SUPPORTS_SCHED_MC
+ bool
+
+config SCHED_SMT
+ bool "SMT (Hyperthreading) scheduler support"
+ depends on ARCH_SUPPORTS_SCHED_SMT
+ default y
+ help
+ Improves the CPU scheduler's decision making when dealing with
+ MultiThreading at a cost of slightly increased overhead in some
+ places. If unsure say N here.
+
+config SCHED_CLUSTER
+ bool "Cluster scheduler support"
+ depends on ARCH_SUPPORTS_SCHED_CLUSTER
+ default y
+ help
+ Cluster scheduler support improves the CPU scheduler's decision
+ making when dealing with machines that have clusters of CPUs.
+ Cluster usually means a couple of CPUs which are placed closely
+ by sharing mid-level caches, last-level cache tags or internal
+ busses.
+
+config SCHED_MC
+ bool "Multi-Core Cache (MC) scheduler support"
+ depends on ARCH_SUPPORTS_SCHED_MC
+ default y
+ help
+ Multi-core scheduler support improves the CPU scheduler's decision
+ making when dealing with multi-core CPU chips at a cost of slightly
+ increased overhead in some places. If unsure say N here.
+
# Selected by HOTPLUG_CORE_SYNC_DEAD or HOTPLUG_CORE_SYNC_FULL
config HOTPLUG_CORE_SYNC
bool
@@ -867,22 +905,26 @@ config PROPELLER_CLANG
If unsure, say N.
-config ARCH_SUPPORTS_CFI_CLANG
+config ARCH_SUPPORTS_CFI
bool
help
- An architecture should select this option if it can support Clang's
- Control-Flow Integrity (CFI) checking.
+ An architecture should select this option if it can support Kernel
+ Control-Flow Integrity (CFI) checking (-fsanitize=kcfi).
config ARCH_USES_CFI_TRAPS
bool
+ help
+ An architecture should select this option if it requires the
+ .kcfi_traps section for KCFI trap handling.
-config CFI_CLANG
- bool "Use Clang's Control Flow Integrity (CFI)"
- depends on ARCH_SUPPORTS_CFI_CLANG
+config CFI
+ bool "Use Kernel Control Flow Integrity (kCFI)"
+ default CFI_CLANG
+ depends on ARCH_SUPPORTS_CFI
depends on $(cc-option,-fsanitize=kcfi)
help
- This option enables Clang's forward-edge Control Flow Integrity
- (CFI) checking, where the compiler injects a runtime check to each
+ This option enables forward-edge Control Flow Integrity (CFI)
+ checking, where the compiler injects a runtime check to each
indirect function call to ensure the target is a valid function with
the correct static type. This restricts possible call targets and
makes it more difficult for an attacker to exploit bugs that allow
@@ -891,10 +933,16 @@ config CFI_CLANG
https://clang.llvm.org/docs/ControlFlowIntegrity.html
+config CFI_CLANG
+ bool
+ transitional
+ help
+ Transitional config for CFI_CLANG to CFI migration.
+
config CFI_ICALL_NORMALIZE_INTEGERS
bool "Normalize CFI tags for integers"
- depends on CFI_CLANG
- depends on HAVE_CFI_ICALL_NORMALIZE_INTEGERS_CLANG
+ depends on CFI
+ depends on HAVE_CFI_ICALL_NORMALIZE_INTEGERS
help
This option normalizes the CFI tags for integer types so that all
integer types of the same size and signedness receive the same CFI
@@ -907,7 +955,7 @@ config CFI_ICALL_NORMALIZE_INTEGERS
This option is necessary for using CFI with Rust. If unsure, say N.
-config HAVE_CFI_ICALL_NORMALIZE_INTEGERS_CLANG
+config HAVE_CFI_ICALL_NORMALIZE_INTEGERS
def_bool y
depends on $(cc-option,-fsanitize=kcfi -fsanitize-cfi-icall-experimental-normalize-integers)
# With GCOV/KASAN we need this fix: https://github.com/llvm/llvm-project/pull/104826
@@ -915,7 +963,7 @@ config HAVE_CFI_ICALL_NORMALIZE_INTEGERS_CLANG
config HAVE_CFI_ICALL_NORMALIZE_INTEGERS_RUSTC
def_bool y
- depends on HAVE_CFI_ICALL_NORMALIZE_INTEGERS_CLANG
+ depends on HAVE_CFI_ICALL_NORMALIZE_INTEGERS
depends on RUSTC_VERSION >= 107900
# With GCOV/KASAN we need this fix: https://github.com/rust-lang/rust/pull/129373
depends on (RUSTC_LLVM_VERSION >= 190103 && RUSTC_VERSION >= 108200) || \
@@ -923,7 +971,7 @@ config HAVE_CFI_ICALL_NORMALIZE_INTEGERS_RUSTC
config CFI_PERMISSIVE
bool "Use CFI in permissive mode"
- depends on CFI_CLANG
+ depends on CFI
help
When selected, Control Flow Integrity (CFI) violations result in a
warning instead of a kernel panic. This option should only be used
@@ -1730,6 +1778,10 @@ config ARCH_VMLINUX_NEEDS_RELOCS
relocations preserved. This is used by some architectures to
construct bespoke relocation tables for KASLR.
+# Select if architecture uses the common generic TIF bits
+config HAVE_GENERIC_TIF_BITS
+ bool
+
source "kernel/gcov/Kconfig"
source "scripts/gcc-plugins/Kconfig"
diff --git a/arch/alpha/include/asm/bitops.h b/arch/alpha/include/asm/bitops.h
index 3e33621922c3..76e4343c090f 100644
--- a/arch/alpha/include/asm/bitops.h
+++ b/arch/alpha/include/asm/bitops.h
@@ -328,7 +328,7 @@ static inline unsigned long ffz_b(unsigned long x)
return sum;
}
-static inline unsigned long ffz(unsigned long word)
+static inline unsigned long __attribute_const__ ffz(unsigned long word)
{
#if defined(CONFIG_ALPHA_EV6) && defined(CONFIG_ALPHA_EV67)
/* Whee. EV67 can calculate it directly. */
@@ -348,7 +348,7 @@ static inline unsigned long ffz(unsigned long word)
/*
* __ffs = Find First set bit in word. Undefined if no set bit exists.
*/
-static inline unsigned long __ffs(unsigned long word)
+static inline __attribute_const__ unsigned long __ffs(unsigned long word)
{
#if defined(CONFIG_ALPHA_EV6) && defined(CONFIG_ALPHA_EV67)
/* Whee. EV67 can calculate it directly. */
@@ -373,7 +373,7 @@ static inline unsigned long __ffs(unsigned long word)
* differs in spirit from the above __ffs.
*/
-static inline int ffs(int word)
+static inline __attribute_const__ int ffs(int word)
{
int result = __ffs(word) + 1;
return word ? result : 0;
@@ -383,14 +383,14 @@ static inline int ffs(int word)
* fls: find last bit set.
*/
#if defined(CONFIG_ALPHA_EV6) && defined(CONFIG_ALPHA_EV67)
-static inline int fls64(unsigned long word)
+static inline __attribute_const__ int fls64(unsigned long word)
{
return 64 - __kernel_ctlz(word);
}
#else
extern const unsigned char __flsm1_tab[256];
-static inline int fls64(unsigned long x)
+static inline __attribute_const__ int fls64(unsigned long x)
{
unsigned long t, a, r;
@@ -403,12 +403,12 @@ static inline int fls64(unsigned long x)
}
#endif
-static inline unsigned long __fls(unsigned long x)
+static inline __attribute_const__ unsigned long __fls(unsigned long x)
{
return fls64(x) - 1;
}
-static inline int fls(unsigned int x)
+static inline __attribute_const__ int fls(unsigned int x)
{
return fls64(x);
}
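
The alpha bitops hunks above add __attribute_const__ to ffz/__ffs/ffs/fls/fls64. As a short aside (not from the patch), the kernel's __attribute_const__ expands to GCC's __attribute__((const)), which promises the result depends only on the arguments; the compiler may then fold repeated calls with the same input, as in this minimal sketch:

/*
 * Illustration only: __attribute_const__ lets the compiler assume the
 * function reads no global state, so identical calls can be merged (CSE).
 */
#include <stdio.h>

#define __attribute_const__ __attribute__((__const__))

static inline int __attribute_const__ my_fls(unsigned int x)
{
	return x ? 32 - __builtin_clz(x) : 0;	/* index of the most significant set bit */
}

int main(void)
{
	unsigned int v = 0x90;

	/* With the const attribute the compiler is free to compute my_fls(v)
	 * once and reuse the value for both operands below.
	 */
	printf("%d %d\n", my_fls(v), my_fls(v));
	return 0;
}
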
diff --git a/arch/alpha/kernel/asm-offsets.c b/arch/alpha/kernel/asm-offsets.c
index e9dad60b147f..1ebb05890499 100644
--- a/arch/alpha/kernel/asm-offsets.c
+++ b/arch/alpha/kernel/asm-offsets.c
@@ -4,6 +4,7 @@
* This code generates raw asm output which is post-processed to extract
* and format the required data.
*/
+#define COMPILE_OFFSETS
#include <linux/types.h>
#include <linux/stddef.h>
diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index 582d96548385..06522451f018 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -231,7 +231,7 @@ flush_thread(void)
*/
int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
- unsigned long clone_flags = args->flags;
+ u64 clone_flags = args->flags;
unsigned long usp = args->stack;
unsigned long tls = args->tls;
extern void ret_from_fork(void);
diff --git a/arch/arc/kernel/asm-offsets.c b/arch/arc/kernel/asm-offsets.c
index f77deb799175..2978da85fcb6 100644
--- a/arch/arc/kernel/asm-offsets.c
+++ b/arch/arc/kernel/asm-offsets.c
@@ -2,6 +2,7 @@
/*
* Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
*/
+#define COMPILE_OFFSETS
#include <linux/sched.h>
#include <linux/mm.h>
diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c
index 186ceab661eb..8166d0908713 100644
--- a/arch/arc/kernel/process.c
+++ b/arch/arc/kernel/process.c
@@ -166,7 +166,7 @@ asmlinkage void ret_from_fork(void);
*/
int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
- unsigned long clone_flags = args->flags;
+ u64 clone_flags = args->flags;
unsigned long usp = args->stack;
unsigned long tls = args->tls;
struct pt_regs *c_regs; /* child's pt_regs */
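
The copy_thread hunks in this series widen clone_flags from unsigned long to u64, presumably because clone3 passes 64-bit flags and some of them (for example CLONE_INTO_CGROUP, 0x200000000ULL) sit above bit 31, which a 32-bit unsigned long would silently drop. A hypothetical userspace sketch of that truncation, not taken from the patch:

/*
 * Illustration only: storing 64-bit clone flags in a 32-bit integer loses
 * everything above bit 31, such as CLONE_INTO_CGROUP (0x200000000ULL).
 */
#include <stdint.h>
#include <stdio.h>

#define CLONE_VM		0x00000100ULL
#define CLONE_INTO_CGROUP	0x200000000ULL

int main(void)
{
	uint64_t flags = CLONE_INTO_CGROUP | CLONE_VM;
	uint32_t truncated = (uint32_t)flags;	/* what a 32-bit unsigned long keeps */

	printf("full:      %#llx\n", (unsigned long long)flags);
	printf("truncated: %#x (CLONE_INTO_CGROUP lost)\n", truncated);
	return 0;
}
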
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index b1f3df39ed40..358057001859 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -38,7 +38,7 @@ config ARM
select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7
select ARCH_NEED_CMPXCHG_1_EMU if CPU_V6
select ARCH_SUPPORTS_ATOMIC_RMW
- select ARCH_SUPPORTS_CFI_CLANG
+ select ARCH_SUPPORTS_CFI
select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE
select ARCH_SUPPORTS_PER_VMA_LOCK
select ARCH_USE_BUILTIN_BSWAP
@@ -941,28 +941,14 @@ config IRQSTACKS
config ARM_CPU_TOPOLOGY
bool "Support cpu topology definition"
depends on SMP && CPU_V7
+ select ARCH_SUPPORTS_SCHED_MC
+ select ARCH_SUPPORTS_SCHED_SMT
default y
help
Support ARM cpu topology definition. The MPIDR register defines
affinity between processors which is then used to describe the cpu
topology of an ARM System.
-config SCHED_MC
- bool "Multi-core scheduler support"
- depends on ARM_CPU_TOPOLOGY
- help
- Multi-core scheduler support improves the CPU scheduler's decision
- making when dealing with multi-core CPU chips at a cost of slightly
- increased overhead in some places. If unsure say N here.
-
-config SCHED_SMT
- bool "SMT scheduler support"
- depends on ARM_CPU_TOPOLOGY
- help
- Improves the CPU scheduler's decision making when dealing with
- MultiThreading at a cost of slightly increased overhead in some
- places. If unsure say N here.
-
config HAVE_ARM_SCU
bool
help
diff --git a/arch/arm/boot/dts/allwinner/sun4i-a10-olinuxino-lime.dts b/arch/arm/boot/dts/allwinner/sun4i-a10-olinuxino-lime.dts
index 83d283cf6633..d425d9ee83db 100644
--- a/arch/arm/boot/dts/allwinner/sun4i-a10-olinuxino-lime.dts
+++ b/arch/arm/boot/dts/allwinner/sun4i-a10-olinuxino-lime.dts
@@ -218,7 +218,7 @@
&usbphy {
usb0_id_det-gpios = <&pio 7 4 (GPIO_ACTIVE_HIGH | GPIO_PULL_UP)>; /* PH4 */
usb0_vbus_det-gpios = <&pio 7 5 (GPIO_ACTIVE_HIGH | GPIO_PULL_UP)>; /* PH5 */
- usb0_vbus-supply = <&reg_usb0_vbus>;
+ usb0_vbus-supply = <&reg_usb0_vbus>;
usb1_vbus-supply = <&reg_usb1_vbus>;
usb2_vbus-supply = <&reg_usb2_vbus>;
status = "okay";
diff --git a/arch/arm/boot/dts/allwinner/sun8i-q8-common.dtsi b/arch/arm/boot/dts/allwinner/sun8i-q8-common.dtsi
index 272584881bb2..a0f787581dd9 100644
--- a/arch/arm/boot/dts/allwinner/sun8i-q8-common.dtsi
+++ b/arch/arm/boot/dts/allwinner/sun8i-q8-common.dtsi
@@ -82,7 +82,7 @@
};
&ehci0 {
- status = "okay";
+ status = "okay";
};
&mmc1 {
diff --git a/arch/arm/boot/dts/allwinner/sun8i-r40.dtsi b/arch/arm/boot/dts/allwinner/sun8i-r40.dtsi
index fa162f7fa9f0..f0ed802a9d08 100644
--- a/arch/arm/boot/dts/allwinner/sun8i-r40.dtsi
+++ b/arch/arm/boot/dts/allwinner/sun8i-r40.dtsi
@@ -705,7 +705,7 @@
};
/omit-if-no-ref/
- uart2_rts_cts_pi_pins: uart2-rts-cts-pi-pins{
+ uart2_rts_cts_pi_pins: uart2-rts-cts-pi-pins {
pins = "PI16", "PI17";
function = "uart2";
};
diff --git a/arch/arm/boot/dts/allwinner/sun8i-v3s-netcube-kumquat.dts b/arch/arm/boot/dts/allwinner/sun8i-v3s-netcube-kumquat.dts
index 5143cb4e7b78..cb6292319f39 100644
--- a/arch/arm/boot/dts/allwinner/sun8i-v3s-netcube-kumquat.dts
+++ b/arch/arm/boot/dts/allwinner/sun8i-v3s-netcube-kumquat.dts
@@ -29,7 +29,7 @@
clk_can0: clock-can0 {
compatible = "fixed-clock";
#clock-cells = <0>;
- clock-frequency = <40000000>;
+ clock-frequency = <40000000>;
};
gpio-keys {
diff --git a/arch/arm/boot/dts/intel/socfpga/socfpga_cyclone5_sodia.dts b/arch/arm/boot/dts/intel/socfpga/socfpga_cyclone5_sodia.dts
index ce0d6514eeb5..e4794ccb8e41 100644
--- a/arch/arm/boot/dts/intel/socfpga/socfpga_cyclone5_sodia.dts
+++ b/arch/arm/boot/dts/intel/socfpga/socfpga_cyclone5_sodia.dts
@@ -66,8 +66,10 @@
mdio0 {
#address-cells = <1>;
#size-cells = <0>;
- phy0: ethernet-phy@0 {
- reg = <0>;
+ compatible = "snps,dwmac-mdio";
+
+ phy0: ethernet-phy@4 {
+ reg = <4>;
rxd0-skew-ps = <0>;
rxd1-skew-ps = <0>;
rxd2-skew-ps = <0>;
diff --git a/arch/arm/boot/dts/marvell/armada-370-db.dts b/arch/arm/boot/dts/marvell/armada-370-db.dts
index a7dc4c04d10b..a9a05d826f22 100644
--- a/arch/arm/boot/dts/marvell/armada-370-db.dts
+++ b/arch/arm/boot/dts/marvell/armada-370-db.dts
@@ -119,7 +119,7 @@
"Out Jack", "HPL",
"Out Jack", "HPR",
"AIN1L", "In Jack",
- "AIN1L", "In Jack";
+ "AIN1R", "In Jack";
status = "okay";
simple-audio-card,dai-link@0 {
diff --git a/arch/arm/boot/dts/marvell/kirkwood-openrd-client.dts b/arch/arm/boot/dts/marvell/kirkwood-openrd-client.dts
index d4e0b8150a84..cf26e2ceaaa0 100644
--- a/arch/arm/boot/dts/marvell/kirkwood-openrd-client.dts
+++ b/arch/arm/boot/dts/marvell/kirkwood-openrd-client.dts
@@ -38,7 +38,7 @@
simple-audio-card,mclk-fs = <256>;
simple-audio-card,cpu {
- sound-dai = <&audio0 0>;
+ sound-dai = <&audio0>;
};
simple-audio-card,codec {
diff --git a/arch/arm/boot/dts/microchip/at91-sama7d65_curiosity.dts b/arch/arm/boot/dts/microchip/at91-sama7d65_curiosity.dts
index 7eaf6ca233ec..d086437f5e6f 100644
--- a/arch/arm/boot/dts/microchip/at91-sama7d65_curiosity.dts
+++ b/arch/arm/boot/dts/microchip/at91-sama7d65_curiosity.dts
@@ -387,6 +387,8 @@
&sdmmc1 {
bus-width = <4>;
+ no-1-8-v;
+ sdhci-caps-mask = <0x0 0x00200000>;
pinctrl-names = "default";
pinctrl-0 = <&pinctrl_sdmmc1_default>;
status = "okay";
diff --git a/arch/arm/boot/dts/rockchip/rk3128-xpi-3128.dts b/arch/arm/boot/dts/rockchip/rk3128-xpi-3128.dts
index 21f824b09191..decbf2726ec4 100644
--- a/arch/arm/boot/dts/rockchip/rk3128-xpi-3128.dts
+++ b/arch/arm/boot/dts/rockchip/rk3128-xpi-3128.dts
@@ -272,7 +272,7 @@
phy-mode = "rmii";
phy-handle = <&phy0>;
assigned-clocks = <&cru SCLK_MAC_SRC>;
- assigned-clock-rates= <50000000>;
+ assigned-clock-rates = <50000000>;
pinctrl-names = "default";
pinctrl-0 = <&rmii_pins>;
status = "okay";
diff --git a/arch/arm/boot/dts/rockchip/rv1109-relfor-saib.dts b/arch/arm/boot/dts/rockchip/rv1109-relfor-saib.dts
index c13829d32c32..8a92700349b4 100644
--- a/arch/arm/boot/dts/rockchip/rv1109-relfor-saib.dts
+++ b/arch/arm/boot/dts/rockchip/rv1109-relfor-saib.dts
@@ -250,9 +250,9 @@
&i2s0 {
/delete-property/ pinctrl-0;
rockchip,trcm-sync-rx-only;
- pinctrl-0 = <&i2s0m0_sclk_rx>,
- <&i2s0m0_lrck_rx>,
- <&i2s0m0_sdi0>;
+ pinctrl-0 = <&i2s0m0_sclk_rx>,
+ <&i2s0m0_lrck_rx>,
+ <&i2s0m0_sdi0>;
pinctrl-names = "default";
status = "okay";
};
diff --git a/arch/arm/configs/exynos_defconfig b/arch/arm/configs/exynos_defconfig
index 6915c766923a..84070e9698e8 100644
--- a/arch/arm/configs/exynos_defconfig
+++ b/arch/arm/configs/exynos_defconfig
@@ -364,7 +364,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
CONFIG_CRYPTO_AES_ARM_BS=m
-CONFIG_CRYPTO_CHACHA20_NEON=m
CONFIG_CRYPTO_DEV_EXYNOS_RNG=y
CONFIG_CRYPTO_DEV_S5P=y
CONFIG_DMA_CMA=y
diff --git a/arch/arm/configs/milbeaut_m10v_defconfig b/arch/arm/configs/milbeaut_m10v_defconfig
index a3be0b2ede09..a2995eb390c6 100644
--- a/arch/arm/configs/milbeaut_m10v_defconfig
+++ b/arch/arm/configs/milbeaut_m10v_defconfig
@@ -101,7 +101,6 @@ CONFIG_CRYPTO_GHASH_ARM_CE=m
CONFIG_CRYPTO_AES_ARM=m
CONFIG_CRYPTO_AES_ARM_BS=m
CONFIG_CRYPTO_AES_ARM_CE=m
-CONFIG_CRYPTO_CHACHA20_NEON=m
# CONFIG_CRYPTO_HW is not set
CONFIG_DMA_CMA=y
CONFIG_CMA_SIZE_MBYTES=64
diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig
index f2822eeefb95..cc0e0e4a879c 100644
--- a/arch/arm/configs/multi_v7_defconfig
+++ b/arch/arm/configs/multi_v7_defconfig
@@ -1291,7 +1291,6 @@ CONFIG_CRYPTO_GHASH_ARM_CE=m
CONFIG_CRYPTO_AES_ARM=m
CONFIG_CRYPTO_AES_ARM_BS=m
CONFIG_CRYPTO_AES_ARM_CE=m
-CONFIG_CRYPTO_CHACHA20_NEON=m
CONFIG_CRYPTO_DEV_SUN4I_SS=m
CONFIG_CRYPTO_DEV_FSL_CAAM=m
CONFIG_CRYPTO_DEV_EXYNOS_RNG=m
diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig
index 939913ed9a73..1d5f75241739 100644
--- a/arch/arm/configs/omap2plus_defconfig
+++ b/arch/arm/configs/omap2plus_defconfig
@@ -708,7 +708,6 @@ CONFIG_CRYPTO_MICHAEL_MIC=y
CONFIG_CRYPTO_GHASH_ARM_CE=m
CONFIG_CRYPTO_AES_ARM=m
CONFIG_CRYPTO_AES_ARM_BS=m
-CONFIG_CRYPTO_CHACHA20_NEON=m
CONFIG_CRYPTO_DEV_OMAP=m
CONFIG_CRYPTO_DEV_OMAP_SHAM=m
CONFIG_CRYPTO_DEV_OMAP_AES=m
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index 1e5f3cdf691c..c436eec22d86 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -2,19 +2,6 @@
menu "Accelerated Cryptographic Algorithms for CPU (arm)"
-config CRYPTO_CURVE25519_NEON
- tristate
- depends on KERNEL_MODE_NEON
- select CRYPTO_KPP
- select CRYPTO_LIB_CURVE25519_GENERIC
- select CRYPTO_ARCH_HAVE_LIB_CURVE25519
- default CRYPTO_LIB_CURVE25519_INTERNAL
- help
- Curve25519 algorithm
-
- Architecture: arm with
- - NEON (Advanced SIMD) extensions
-
config CRYPTO_GHASH_ARM_CE
tristate "Hash functions: GHASH (PMULL/NEON/ARMv8 Crypto Extensions)"
depends on KERNEL_MODE_NEON
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index 4f23999ae17d..6346a73effc0 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -7,7 +7,6 @@ obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o
obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
-obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o
obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o
@@ -18,4 +17,3 @@ blake2b-neon-y := blake2b-neon-core.o blake2b-neon-glue.o
aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
-curve25519-neon-y := curve25519-core.o curve25519-glue.o
diff --git a/arch/arm/crypto/curve25519-glue.c b/arch/arm/crypto/curve25519-glue.c
deleted file mode 100644
index e7b87e09dd99..000000000000
--- a/arch/arm/crypto/curve25519-glue.c
+++ /dev/null
@@ -1,137 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- *
- * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
- * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
- * manually reworked for use in kernel space.
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-#include <crypto/internal/kpp.h>
-#include <crypto/internal/simd.h>
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/jump_label.h>
-#include <linux/scatterlist.h>
-#include <crypto/curve25519.h>
-
-asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE],
- const u8 secret[CURVE25519_KEY_SIZE],
- const u8 basepoint[CURVE25519_KEY_SIZE]);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-
-void curve25519_arch(u8 out[CURVE25519_KEY_SIZE],
- const u8 scalar[CURVE25519_KEY_SIZE],
- const u8 point[CURVE25519_KEY_SIZE])
-{
- if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
- kernel_neon_begin();
- curve25519_neon(out, scalar, point);
- kernel_neon_end();
- } else {
- curve25519_generic(out, scalar, point);
- }
-}
-EXPORT_SYMBOL(curve25519_arch);
-
-void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
- const u8 secret[CURVE25519_KEY_SIZE])
-{
- return curve25519_arch(pub, secret, curve25519_base_point);
-}
-EXPORT_SYMBOL(curve25519_base_arch);
-
-static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
- unsigned int len)
-{
- u8 *secret = kpp_tfm_ctx(tfm);
-
- if (!len)
- curve25519_generate_secret(secret);
- else if (len == CURVE25519_KEY_SIZE &&
- crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
- memcpy(secret, buf, CURVE25519_KEY_SIZE);
- else
- return -EINVAL;
- return 0;
-}
-
-static int curve25519_compute_value(struct kpp_request *req)
-{
- struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
- const u8 *secret = kpp_tfm_ctx(tfm);
- u8 public_key[CURVE25519_KEY_SIZE];
- u8 buf[CURVE25519_KEY_SIZE];
- int copied, nbytes;
- u8 const *bp;
-
- if (req->src) {
- copied = sg_copy_to_buffer(req->src,
- sg_nents_for_len(req->src,
- CURVE25519_KEY_SIZE),
- public_key, CURVE25519_KEY_SIZE);
- if (copied != CURVE25519_KEY_SIZE)
- return -EINVAL;
- bp = public_key;
- } else {
- bp = curve25519_base_point;
- }
-
- curve25519_arch(buf, secret, bp);
-
- /* might want less than we've got */
- nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
- copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
- nbytes),
- buf, nbytes);
- if (copied != nbytes)
- return -EINVAL;
- return 0;
-}
-
-static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
-{
- return CURVE25519_KEY_SIZE;
-}
-
-static struct kpp_alg curve25519_alg = {
- .base.cra_name = "curve25519",
- .base.cra_driver_name = "curve25519-neon",
- .base.cra_priority = 200,
- .base.cra_module = THIS_MODULE,
- .base.cra_ctxsize = CURVE25519_KEY_SIZE,
-
- .set_secret = curve25519_set_secret,
- .generate_public_key = curve25519_compute_value,
- .compute_shared_secret = curve25519_compute_value,
- .max_size = curve25519_max_size,
-};
-
-static int __init arm_curve25519_init(void)
-{
- if (elf_hwcap & HWCAP_NEON) {
- static_branch_enable(&have_neon);
- return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
- crypto_register_kpp(&curve25519_alg) : 0;
- }
- return 0;
-}
-
-static void __exit arm_curve25519_exit(void)
-{
- if (IS_REACHABLE(CONFIG_CRYPTO_KPP) && elf_hwcap & HWCAP_NEON)
- crypto_unregister_kpp(&curve25519_alg);
-}
-
-module_init(arm_curve25519_init);
-module_exit(arm_curve25519_exit);
-
-MODULE_ALIAS_CRYPTO("curve25519");
-MODULE_ALIAS_CRYPTO("curve25519-neon");
-MODULE_DESCRIPTION("Public key crypto: Curve25519 (NEON-accelerated)");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/arm/include/asm/stacktrace.h b/arch/arm/include/asm/stacktrace.h
index f80a85b091d6..ba2f771cca23 100644
--- a/arch/arm/include/asm/stacktrace.h
+++ b/arch/arm/include/asm/stacktrace.h
@@ -2,8 +2,9 @@
#ifndef __ASM_STACKTRACE_H
#define __ASM_STACKTRACE_H
-#include <asm/ptrace.h>
#include <linux/llist.h>
+#include <asm/ptrace.h>
+#include <asm/sections.h>
struct stackframe {
/*
diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
index 123f4a8ef446..2101938d27fc 100644
--- a/arch/arm/kernel/asm-offsets.c
+++ b/arch/arm/kernel/asm-offsets.c
@@ -7,6 +7,8 @@
* This code generates raw asm output which is post-processed to extract
* and format the required data.
*/
+#define COMPILE_OFFSETS
+
#include <linux/compiler.h>
#include <linux/sched.h>
#include <linux/mm.h>
diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c
index a12efd0f43e8..cd4b34c96e35 100644
--- a/arch/arm/kernel/hw_breakpoint.c
+++ b/arch/arm/kernel/hw_breakpoint.c
@@ -904,7 +904,7 @@ unlock:
watchpoint_single_step_handler(addr);
}
-#ifdef CONFIG_CFI_CLANG
+#ifdef CONFIG_CFI
static void hw_breakpoint_cfi_handler(struct pt_regs *regs)
{
/*
diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
index da488d92e7a0..55ca3fcd37e8 100644
--- a/arch/arm/kernel/module.c
+++ b/arch/arm/kernel/module.c
@@ -484,7 +484,7 @@ module_arch_cleanup(struct module *mod)
#endif
}
-void __weak module_arch_freeing_init(struct module *mod)
+void module_arch_freeing_init(struct module *mod)
{
#ifdef CONFIG_ARM_UNWIND
struct unwind_table *init = mod->arch.init_table;
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index e16ed102960c..d7aa95225c70 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -234,7 +234,7 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
- unsigned long clone_flags = args->flags;
+ u64 clone_flags = args->flags;
unsigned long stack_start = args->stack;
unsigned long tls = args->tls;
struct thread_info *thread = task_thread_info(p);
diff --git a/arch/arm/mach-at91/Kconfig b/arch/arm/mach-at91/Kconfig
index 04bd91c72521..c5ef27e3cd8f 100644
--- a/arch/arm/mach-at91/Kconfig
+++ b/arch/arm/mach-at91/Kconfig
@@ -1,4 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
+config ARCH_MICROCHIP
+ bool
+
menuconfig ARCH_AT91
bool "AT91/Microchip SoCs"
depends on (CPU_LITTLE_ENDIAN && (ARCH_MULTI_V4T || ARCH_MULTI_V5)) || \
@@ -8,6 +11,7 @@ menuconfig ARCH_AT91
select GPIOLIB
select PINCTRL
select SOC_BUS
+ select ARCH_MICROCHIP
if ARCH_AT91
config SOC_SAMV7
diff --git a/arch/arm/mach-imx/Kconfig b/arch/arm/mach-imx/Kconfig
index dc47b2312127..6ea1bd55acf8 100644
--- a/arch/arm/mach-imx/Kconfig
+++ b/arch/arm/mach-imx/Kconfig
@@ -242,7 +242,7 @@ choice
config VF_USE_PIT_TIMER
bool "Use PIT timer"
- select VF_PIT_TIMER
+ select NXP_PIT_TIMER
help
Use SoC Periodic Interrupt Timer (PIT) as clocksource
diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile
index a195cd1d3e6d..1e2201013371 100644
--- a/arch/arm/mm/Makefile
+++ b/arch/arm/mm/Makefile
@@ -89,7 +89,7 @@ obj-$(CONFIG_CPU_V6) += proc-v6.o
obj-$(CONFIG_CPU_V6K) += proc-v6.o
obj-$(CONFIG_CPU_V7) += proc-v7.o proc-v7-bugs.o
obj-$(CONFIG_CPU_V7M) += proc-v7m.o
-obj-$(CONFIG_CFI_CLANG) += proc.o
+obj-$(CONFIG_CFI) += proc.o
obj-$(CONFIG_OUTER_CACHE) += l2c-common.o
obj-$(CONFIG_CACHE_B15_RAC) += cache-b15-rac.o
diff --git a/arch/arm/mm/cache-fa.S b/arch/arm/mm/cache-fa.S
index 4a3668b52a2d..e1641799569b 100644
--- a/arch/arm/mm/cache-fa.S
+++ b/arch/arm/mm/cache-fa.S
@@ -112,7 +112,7 @@ SYM_FUNC_END(fa_flush_user_cache_range)
* - end - virtual end address
*/
SYM_TYPED_FUNC_START(fa_coherent_kern_range)
-#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */
+#ifdef CONFIG_CFI /* Fallthrough if !CFI */
b fa_coherent_user_range
#endif
SYM_FUNC_END(fa_coherent_kern_range)
diff --git a/arch/arm/mm/cache-v4.S b/arch/arm/mm/cache-v4.S
index 0e94e5193dbd..001d7042bd46 100644
--- a/arch/arm/mm/cache-v4.S
+++ b/arch/arm/mm/cache-v4.S
@@ -104,7 +104,7 @@ SYM_FUNC_END(v4_coherent_user_range)
* - size - region size
*/
SYM_TYPED_FUNC_START(v4_flush_kern_dcache_area)
-#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */
+#ifdef CONFIG_CFI /* Fallthrough if !CFI */
b v4_dma_flush_range
#endif
SYM_FUNC_END(v4_flush_kern_dcache_area)
diff --git a/arch/arm/mm/cache-v4wb.S b/arch/arm/mm/cache-v4wb.S
index ce55a2eef5da..874fe5310f9a 100644
--- a/arch/arm/mm/cache-v4wb.S
+++ b/arch/arm/mm/cache-v4wb.S
@@ -136,7 +136,7 @@ SYM_FUNC_END(v4wb_flush_user_cache_range)
*/
SYM_TYPED_FUNC_START(v4wb_flush_kern_dcache_area)
add r1, r0, r1
-#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */
+#ifdef CONFIG_CFI /* Fallthrough if !CFI */
b v4wb_coherent_user_range
#endif
SYM_FUNC_END(v4wb_flush_kern_dcache_area)
@@ -152,7 +152,7 @@ SYM_FUNC_END(v4wb_flush_kern_dcache_area)
* - end - virtual end address
*/
SYM_TYPED_FUNC_START(v4wb_coherent_kern_range)
-#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */
+#ifdef CONFIG_CFI /* Fallthrough if !CFI */
b v4wb_coherent_user_range
#endif
SYM_FUNC_END(v4wb_coherent_kern_range)
diff --git a/arch/arm/mm/cache-v4wt.S b/arch/arm/mm/cache-v4wt.S
index a97dc267b3b0..2ee62e4b2b07 100644
--- a/arch/arm/mm/cache-v4wt.S
+++ b/arch/arm/mm/cache-v4wt.S
@@ -108,7 +108,7 @@ SYM_FUNC_END(v4wt_flush_user_cache_range)
* - end - virtual end address
*/
SYM_TYPED_FUNC_START(v4wt_coherent_kern_range)
-#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */
+#ifdef CONFIG_CFI /* Fallthrough if !CFI */
b v4wt_coherent_user_range
#endif
SYM_FUNC_END(v4wt_coherent_kern_range)
diff --git a/arch/arm/mm/cache-v6.S b/arch/arm/mm/cache-v6.S
index 9f415476e218..5ceea8965ea1 100644
--- a/arch/arm/mm/cache-v6.S
+++ b/arch/arm/mm/cache-v6.S
@@ -117,7 +117,7 @@ SYM_FUNC_END(v6_flush_user_cache_range)
* - the Icache does not read data from the write buffer
*/
SYM_TYPED_FUNC_START(v6_coherent_kern_range)
-#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */
+#ifdef CONFIG_CFI /* Fallthrough if !CFI */
b v6_coherent_user_range
#endif
SYM_FUNC_END(v6_coherent_kern_range)
diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index 201ca05436fa..726681fb7d4d 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -261,7 +261,7 @@ SYM_FUNC_END(v7_flush_user_cache_range)
* - the Icache does not read data from the write buffer
*/
SYM_TYPED_FUNC_START(v7_coherent_kern_range)
-#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */
+#ifdef CONFIG_CFI /* Fallthrough if !CFI */
b v7_coherent_user_range
#endif
SYM_FUNC_END(v7_coherent_kern_range)
diff --git a/arch/arm/mm/cache-v7m.S b/arch/arm/mm/cache-v7m.S
index 14d719eba729..7f9cfad2ea21 100644
--- a/arch/arm/mm/cache-v7m.S
+++ b/arch/arm/mm/cache-v7m.S
@@ -286,7 +286,7 @@ SYM_FUNC_END(v7m_flush_user_cache_range)
* - the Icache does not read data from the write buffer
*/
SYM_TYPED_FUNC_START(v7m_coherent_kern_range)
-#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */
+#ifdef CONFIG_CFI /* Fallthrough if !CFI */
b v7m_coherent_user_range
#endif
SYM_FUNC_END(v7m_coherent_kern_range)
diff --git a/arch/arm/mm/proc-arm1020.S b/arch/arm/mm/proc-arm1020.S
index d0ce3414a13e..4612a4961e81 100644
--- a/arch/arm/mm/proc-arm1020.S
+++ b/arch/arm/mm/proc-arm1020.S
@@ -203,7 +203,7 @@ SYM_FUNC_END(arm1020_flush_user_cache_range)
* - end - virtual end address
*/
SYM_TYPED_FUNC_START(arm1020_coherent_kern_range)
-#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */
+#ifdef CONFIG_CFI /* Fallthrough if !CFI */
b arm1020_coherent_user_range
#endif
SYM_FUNC_END(arm1020_coherent_kern_range)
diff --git a/arch/arm/mm/proc-arm1020e.S b/arch/arm/mm/proc-arm1020e.S
index 64f031bf6eff..b4a8a3a8eda3 100644
--- a/arch/arm/mm/proc-arm1020e.S
+++ b/arch/arm/mm/proc-arm1020e.S
@@ -200,7 +200,7 @@ SYM_FUNC_END(arm1020e_flush_user_cache_range)
* - end - virtual end address
*/
SYM_TYPED_FUNC_START(arm1020e_coherent_kern_range)
-#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */
+#ifdef CONFIG_CFI /* Fallthrough if !CFI */
b arm1020e_coherent_user_range
#endif
SYM_FUNC_END(arm1020e_coherent_kern_range)
diff --git a/arch/arm/mm/proc-arm1022.S b/arch/arm/mm/proc-arm1022.S
index 42ed5ed07252..709870e99e19 100644
--- a/arch/arm/mm/proc-arm1022.S
+++ b/arch/arm/mm/proc-arm1022.S
@@ -199,7 +199,7 @@ SYM_FUNC_END(arm1022_flush_user_cache_range)
* - end - virtual end address
*/
SYM_TYPED_FUNC_START(arm1022_coherent_kern_range)
-#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */
+#ifdef CONFIG_CFI /* Fallthrough if !CFI */
b arm1022_coherent_user_range
#endif
SYM_FUNC_END(arm1022_coherent_kern_range)
diff --git a/arch/arm/mm/proc-arm1026.S b/arch/arm/mm/proc-arm1026.S
index b3ae62cd553a..02f7370a8c5c 100644
--- a/arch/arm/mm/proc-arm1026.S
+++ b/arch/arm/mm/proc-arm1026.S
@@ -194,7 +194,7 @@ SYM_FUNC_END(arm1026_flush_user_cache_range)
* - end - virtual end address
*/
SYM_TYPED_FUNC_START(arm1026_coherent_kern_range)
-#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */
+#ifdef CONFIG_CFI /* Fallthrough if !CFI */
b arm1026_coherent_user_range
#endif
SYM_FUNC_END(arm1026_coherent_kern_range)
diff --git a/arch/arm/mm/proc-arm920.S b/arch/arm/mm/proc-arm920.S
index a30df54ad5fa..4727f4b5b6e8 100644
--- a/arch/arm/mm/proc-arm920.S
+++ b/arch/arm/mm/proc-arm920.S
@@ -180,7 +180,7 @@ SYM_FUNC_END(arm920_flush_user_cache_range)
* - end - virtual end address
*/
SYM_TYPED_FUNC_START(arm920_coherent_kern_range)
-#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */
+#ifdef CONFIG_CFI /* Fallthrough if !CFI */
b arm920_coherent_user_range
#endif
SYM_FUNC_END(arm920_coherent_kern_range)
diff --git a/arch/arm/mm/proc-arm922.S b/arch/arm/mm/proc-arm922.S
index aac4e048100d..5a4a3f4f2683 100644
--- a/arch/arm/mm/proc-arm922.S
+++ b/arch/arm/mm/proc-arm922.S
@@ -182,7 +182,7 @@ SYM_FUNC_END(arm922_flush_user_cache_range)
* - end - virtual end address
*/
SYM_TYPED_FUNC_START(arm922_coherent_kern_range)
-#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */
+#ifdef CONFIG_CFI /* Fallthrough if !CFI */
b arm922_coherent_user_range
#endif
SYM_FUNC_END(arm922_coherent_kern_range)
diff --git a/arch/arm/mm/proc-arm925.S b/arch/arm/mm/proc-arm925.S
index 035941faeb2e..1c4830afe1d3 100644
--- a/arch/arm/mm/proc-arm925.S
+++ b/arch/arm/mm/proc-arm925.S
@@ -229,7 +229,7 @@ SYM_FUNC_END(arm925_flush_user_cache_range)
* - end - virtual end address
*/
SYM_TYPED_FUNC_START(arm925_coherent_kern_range)
-#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */
+#ifdef CONFIG_CFI /* Fallthrough if !CFI */
b arm925_coherent_user_range
#endif
SYM_FUNC_END(arm925_coherent_kern_range)
diff --git a/arch/arm/mm/proc-arm926.S b/arch/arm/mm/proc-arm926.S
index 6f43d6af2d9a..a09cc3e02efd 100644
--- a/arch/arm/mm/proc-arm926.S
+++ b/arch/arm/mm/proc-arm926.S
@@ -192,7 +192,7 @@ SYM_FUNC_END(arm926_flush_user_cache_range)
* - end - virtual end address
*/
SYM_TYPED_FUNC_START(arm926_coherent_kern_range)
-#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */
+#ifdef CONFIG_CFI /* Fallthrough if !CFI */
b arm926_coherent_user_range
#endif
SYM_FUNC_END(arm926_coherent_kern_range)
diff --git a/arch/arm/mm/proc-arm940.S b/arch/arm/mm/proc-arm940.S
index 0d30bb25c42b..545c076c36d2 100644
--- a/arch/arm/mm/proc-arm940.S
+++ b/arch/arm/mm/proc-arm940.S
@@ -153,7 +153,7 @@ SYM_FUNC_END(arm940_coherent_kern_range)
* - end - virtual end address
*/
SYM_TYPED_FUNC_START(arm940_coherent_user_range)
-#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */
+#ifdef CONFIG_CFI /* Fallthrough if !CFI */
b arm940_flush_kern_dcache_area
#endif
SYM_FUNC_END(arm940_coherent_user_range)
diff --git a/arch/arm/mm/proc-arm946.S b/arch/arm/mm/proc-arm946.S
index 27750ace2ced..f3d4e18c3fba 100644
--- a/arch/arm/mm/proc-arm946.S
+++ b/arch/arm/mm/proc-arm946.S
@@ -173,7 +173,7 @@ SYM_FUNC_END(arm946_flush_user_cache_range)
* - end - virtual end address
*/
SYM_TYPED_FUNC_START(arm946_coherent_kern_range)
-#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */
+#ifdef CONFIG_CFI /* Fallthrough if !CFI */
b arm946_coherent_user_range
#endif
SYM_FUNC_END(arm946_coherent_kern_range)
diff --git a/arch/arm/mm/proc-feroceon.S b/arch/arm/mm/proc-feroceon.S
index f67b2ffac854..7f08d06c9625 100644
--- a/arch/arm/mm/proc-feroceon.S
+++ b/arch/arm/mm/proc-feroceon.S
@@ -208,7 +208,7 @@ SYM_FUNC_END(feroceon_flush_user_cache_range)
*/
.align 5
SYM_TYPED_FUNC_START(feroceon_coherent_kern_range)
-#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */
+#ifdef CONFIG_CFI /* Fallthrough if !CFI */
b feroceon_coherent_user_range
#endif
SYM_FUNC_END(feroceon_coherent_kern_range)
diff --git a/arch/arm/mm/proc-mohawk.S b/arch/arm/mm/proc-mohawk.S
index 8e9f38da863a..4669c63e3121 100644
--- a/arch/arm/mm/proc-mohawk.S
+++ b/arch/arm/mm/proc-mohawk.S
@@ -163,7 +163,7 @@ SYM_FUNC_END(mohawk_flush_user_cache_range)
* - end - virtual end address
*/
SYM_TYPED_FUNC_START(mohawk_coherent_kern_range)
-#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */
+#ifdef CONFIG_CFI /* Fallthrough if !CFI */
b mohawk_coherent_user_range
#endif
SYM_FUNC_END(mohawk_coherent_kern_range)
diff --git a/arch/arm/mm/proc-xsc3.S b/arch/arm/mm/proc-xsc3.S
index 14927b380452..fd25634a2ed5 100644
--- a/arch/arm/mm/proc-xsc3.S
+++ b/arch/arm/mm/proc-xsc3.S
@@ -223,7 +223,7 @@ SYM_FUNC_END(xsc3_flush_user_cache_range)
* it also trashes the mini I-cache used by JTAG debuggers.
*/
SYM_TYPED_FUNC_START(xsc3_coherent_kern_range)
-#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */
+#ifdef CONFIG_CFI /* Fallthrough if !CFI */
b xsc3_coherent_user_range
#endif
SYM_FUNC_END(xsc3_coherent_kern_range)
diff --git a/arch/arm/mm/tlb-v4.S b/arch/arm/mm/tlb-v4.S
index 09ff69008d94..079774a02be6 100644
--- a/arch/arm/mm/tlb-v4.S
+++ b/arch/arm/mm/tlb-v4.S
@@ -52,7 +52,7 @@ SYM_FUNC_END(v4_flush_user_tlb_range)
* - start - virtual address (may not be aligned)
* - end - virtual address (may not be aligned)
*/
-#ifdef CONFIG_CFI_CLANG
+#ifdef CONFIG_CFI
SYM_TYPED_FUNC_START(v4_flush_kern_tlb_range)
b .v4_flush_kern_tlb_range
SYM_FUNC_END(v4_flush_kern_tlb_range)
diff --git a/arch/arm/probes/uprobes/core.c b/arch/arm/probes/uprobes/core.c
index 885e0c5e8c20..3d96fb41d624 100644
--- a/arch/arm/probes/uprobes/core.c
+++ b/arch/arm/probes/uprobes/core.c
@@ -30,7 +30,7 @@ int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
unsigned long vaddr)
{
return uprobe_write_opcode(auprobe, vma, vaddr,
- __opcode_to_mem_arm(auprobe->bpinsn));
+ __opcode_to_mem_arm(auprobe->bpinsn), true);
}
bool arch_uprobe_ignore(struct arch_uprobe *auprobe, struct pt_regs *regs)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index e9bbfacc35a6..616b702f10d4 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -100,7 +100,7 @@ config ARM64
select ARCH_SUPPORTS_SHADOW_CALL_STACK if CC_HAVE_SHADOW_CALL_STACK
select ARCH_SUPPORTS_LTO_CLANG if CPU_LITTLE_ENDIAN
select ARCH_SUPPORTS_LTO_CLANG_THIN
- select ARCH_SUPPORTS_CFI_CLANG
+ select ARCH_SUPPORTS_CFI
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
select ARCH_SUPPORTS_NUMA_BALANCING
@@ -108,6 +108,9 @@ config ARM64
select ARCH_SUPPORTS_PER_VMA_LOCK
select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE
select ARCH_SUPPORTS_RT
+ select ARCH_SUPPORTS_SCHED_SMT
+ select ARCH_SUPPORTS_SCHED_CLUSTER
+ select ARCH_SUPPORTS_SCHED_MC
select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
select ARCH_WANT_DEFAULT_BPF_JIT
@@ -151,6 +154,7 @@ config ARM64
select GENERIC_EARLY_IOREMAP
select GENERIC_IDLE_POLL_SETUP
select GENERIC_IOREMAP
+ select GENERIC_IRQ_ENTRY
select GENERIC_IRQ_IPI
select GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD
select GENERIC_IRQ_PROBE
@@ -212,7 +216,7 @@ config ARM64
select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS \
if DYNAMIC_FTRACE_WITH_ARGS && DYNAMIC_FTRACE_WITH_CALL_OPS
select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS \
- if (DYNAMIC_FTRACE_WITH_ARGS && !CFI_CLANG && \
+ if (DYNAMIC_FTRACE_WITH_ARGS && !CFI && \
(CC_IS_CLANG || !CC_OPTIMIZE_FOR_SIZE))
select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY \
if DYNAMIC_FTRACE_WITH_ARGS
@@ -1138,6 +1142,7 @@ config ARM64_ERRATUM_3194386
* ARM Neoverse-V1 erratum 3324341
* ARM Neoverse V2 erratum 3324336
* ARM Neoverse-V3 erratum 3312417
+ * ARM Neoverse-V3AE erratum 3312417
On affected cores "MSR SSBS, #0" instructions may not affect
subsequent speculative instructions, which may permit unexepected
@@ -1493,7 +1498,7 @@ choice
config CPU_BIG_ENDIAN
bool "Build big-endian kernel"
# https://github.com/llvm/llvm-project/commit/1379b150991f70a5782e9a143c2ba5308da1161c
- depends on AS_IS_GNU || AS_VERSION >= 150000
+ depends on (AS_IS_GNU || AS_VERSION >= 150000) && BROKEN
help
Say Y if you plan on running a kernel with a big-endian userspace.
@@ -1505,29 +1510,6 @@ config CPU_LITTLE_ENDIAN
endchoice
-config SCHED_MC
- bool "Multi-core scheduler support"
- help
- Multi-core scheduler support improves the CPU scheduler's decision
- making when dealing with multi-core CPU chips at a cost of slightly
- increased overhead in some places. If unsure say N here.
-
-config SCHED_CLUSTER
- bool "Cluster scheduler support"
- help
- Cluster scheduler support improves the CPU scheduler's decision
- making when dealing with machines that have clusters of CPUs.
- Cluster usually means a couple of CPUs which are placed closely
- by sharing mid-level caches, last-level cache tags or internal
- busses.
-
-config SCHED_SMT
- bool "SMT scheduler support"
- help
- Improves the CPU scheduler's decision making when dealing with
- MultiThreading at a cost of slightly increased overhead in some
- places. If unsure say N here.
-
config NR_CPUS
int "Maximum number of CPUs (2-4096)"
range 2 4096
@@ -1698,20 +1680,6 @@ config MITIGATE_SPECTRE_BRANCH_HISTORY
When taking an exception from user-space, a sequence of branches
or a firmware call overwrites the branch history.
-config RODATA_FULL_DEFAULT_ENABLED
- bool "Apply r/o permissions of VM areas also to their linear aliases"
- default y
- help
- Apply read-only attributes of VM areas to the linear alias of
- the backing pages as well. This prevents code or read-only data
- from being modified (inadvertently or intentionally) via another
- mapping of the same memory page. This additional enhancement can
- be turned off at runtime by passing rodata=[off|on] (and turned on
- with rodata=full if this option is set to 'n')
-
- This requires the linear region to be mapped down to pages,
- which may adversely affect performance in some cases.
-
config ARM64_SW_TTBR0_PAN
bool "Emulate Privileged Access Never using TTBR0_EL1 switching"
depends on !KCSAN
@@ -2218,14 +2186,13 @@ config ARM64_HAFT
endmenu # "ARMv8.9 architectural features"
-menu "v9.4 architectural features"
+menu "ARMv9.4 architectural features"
config ARM64_GCS
bool "Enable support for Guarded Control Stack (GCS)"
default y
select ARCH_HAS_USER_SHADOW_STACK
select ARCH_USES_HIGH_VMA_FLAGS
- depends on !UPROBES
help
Guarded Control Stack (GCS) provides support for a separate
stack with restricted access which contains only return
@@ -2237,7 +2204,7 @@ config ARM64_GCS
The feature is detected at runtime, and will remain disabled
if the system does not implement the feature.
-endmenu # "v9.4 architectural features"
+endmenu # "ARMv9.4 architectural features"
config ARM64_SVE
bool "ARM Scalable Vector Extension support"
diff --git a/arch/arm64/boot/dts/axiado/ax3000-evk.dts b/arch/arm64/boot/dts/axiado/ax3000-evk.dts
index 92101c5b534b..b86e96962557 100644
--- a/arch/arm64/boot/dts/axiado/ax3000-evk.dts
+++ b/arch/arm64/boot/dts/axiado/ax3000-evk.dts
@@ -14,6 +14,9 @@
#size-cells = <2>;
aliases {
+ serial0 = &uart0;
+ serial1 = &uart1;
+ serial2 = &uart2;
serial3 = &uart3;
};
diff --git a/arch/arm64/boot/dts/freescale/imx8mp-data-modul-edm-sbc.dts b/arch/arm64/boot/dts/freescale/imx8mp-data-modul-edm-sbc.dts
index d0fc5977258f..16078ff60ef0 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp-data-modul-edm-sbc.dts
+++ b/arch/arm64/boot/dts/freescale/imx8mp-data-modul-edm-sbc.dts
@@ -555,6 +555,7 @@
pinctrl-2 = <&pinctrl_usdhc2_200mhz>, <&pinctrl_usdhc2_gpio>;
cd-gpios = <&gpio2 12 GPIO_ACTIVE_LOW>;
vmmc-supply = <&reg_usdhc2_vmmc>;
+ vqmmc-supply = <&ldo5>;
bus-width = <4>;
status = "okay";
};
diff --git a/arch/arm64/boot/dts/freescale/imx8mp-dhcom-som.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-dhcom-som.dtsi
index 7f754e0a5d69..68c2e0156a5c 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp-dhcom-som.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mp-dhcom-som.dtsi
@@ -609,6 +609,7 @@
pinctrl-2 = <&pinctrl_usdhc2_200mhz>, <&pinctrl_usdhc2_gpio>;
cd-gpios = <&gpio2 12 GPIO_ACTIVE_LOW>;
vmmc-supply = <&reg_usdhc2_vmmc>;
+ vqmmc-supply = <&ldo5>;
bus-width = <4>;
status = "okay";
};
diff --git a/arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql-mba8mp-ras314.dts b/arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql-mba8mp-ras314.dts
index d7fd9d36f824..f7346b3d35fe 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql-mba8mp-ras314.dts
+++ b/arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql-mba8mp-ras314.dts
@@ -467,6 +467,10 @@
status = "okay";
};
+&reg_usdhc2_vqmmc {
+ status = "okay";
+};
+
&sai5 {
pinctrl-names = "default";
pinctrl-0 = <&pinctrl_sai5>;
@@ -876,8 +880,7 @@
<MX8MP_IOMUXC_SD2_DATA0__USDHC2_DATA0 0x1d2>,
<MX8MP_IOMUXC_SD2_DATA1__USDHC2_DATA1 0x1d2>,
<MX8MP_IOMUXC_SD2_DATA2__USDHC2_DATA2 0x1d2>,
- <MX8MP_IOMUXC_SD2_DATA3__USDHC2_DATA3 0x1d2>,
- <MX8MP_IOMUXC_GPIO1_IO04__USDHC2_VSELECT 0xc0>;
+ <MX8MP_IOMUXC_SD2_DATA3__USDHC2_DATA3 0x1d2>;
};
pinctrl_usdhc2_100mhz: usdhc2-100mhzgrp {
@@ -886,8 +889,7 @@
<MX8MP_IOMUXC_SD2_DATA0__USDHC2_DATA0 0x1d4>,
<MX8MP_IOMUXC_SD2_DATA1__USDHC2_DATA1 0x1d4>,
<MX8MP_IOMUXC_SD2_DATA2__USDHC2_DATA2 0x1d4>,
- <MX8MP_IOMUXC_SD2_DATA3__USDHC2_DATA3 0x1d4>,
- <MX8MP_IOMUXC_GPIO1_IO04__USDHC2_VSELECT 0xc0>;
+ <MX8MP_IOMUXC_SD2_DATA3__USDHC2_DATA3 0x1d4>;
};
pinctrl_usdhc2_200mhz: usdhc2-200mhzgrp {
@@ -896,8 +898,7 @@
<MX8MP_IOMUXC_SD2_DATA0__USDHC2_DATA0 0x1d4>,
<MX8MP_IOMUXC_SD2_DATA1__USDHC2_DATA1 0x1d4>,
<MX8MP_IOMUXC_SD2_DATA2__USDHC2_DATA2 0x1d4>,
- <MX8MP_IOMUXC_SD2_DATA3__USDHC2_DATA3 0x1d4>,
- <MX8MP_IOMUXC_GPIO1_IO04__USDHC2_VSELECT 0xc0>;
+ <MX8MP_IOMUXC_SD2_DATA3__USDHC2_DATA3 0x1d4>;
};
pinctrl_usdhc2_gpio: usdhc2-gpiogrp {
diff --git a/arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql-mba8mpxl.dts b/arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql-mba8mpxl.dts
index 33cd92e63c5d..4eedd00d83b9 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql-mba8mpxl.dts
+++ b/arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql-mba8mpxl.dts
@@ -604,6 +604,10 @@
status = "okay";
};
+&reg_usdhc2_vqmmc {
+ status = "okay";
+};
+
&sai3 {
pinctrl-names = "default";
pinctrl-0 = <&pinctrl_sai3>;
@@ -983,8 +987,7 @@
<MX8MP_IOMUXC_SD2_DATA0__USDHC2_DATA0 0x1d2>,
<MX8MP_IOMUXC_SD2_DATA1__USDHC2_DATA1 0x1d2>,
<MX8MP_IOMUXC_SD2_DATA2__USDHC2_DATA2 0x1d2>,
- <MX8MP_IOMUXC_SD2_DATA3__USDHC2_DATA3 0x1d2>,
- <MX8MP_IOMUXC_GPIO1_IO04__USDHC2_VSELECT 0xc0>;
+ <MX8MP_IOMUXC_SD2_DATA3__USDHC2_DATA3 0x1d2>;
};
pinctrl_usdhc2_100mhz: usdhc2-100mhzgrp {
@@ -993,8 +996,7 @@
<MX8MP_IOMUXC_SD2_DATA0__USDHC2_DATA0 0x1d4>,
<MX8MP_IOMUXC_SD2_DATA1__USDHC2_DATA1 0x1d4>,
<MX8MP_IOMUXC_SD2_DATA2__USDHC2_DATA2 0x1d4>,
- <MX8MP_IOMUXC_SD2_DATA3__USDHC2_DATA3 0x1d4>,
- <MX8MP_IOMUXC_GPIO1_IO04__USDHC2_VSELECT 0xc0>;
+ <MX8MP_IOMUXC_SD2_DATA3__USDHC2_DATA3 0x1d4>;
};
pinctrl_usdhc2_200mhz: usdhc2-200mhzgrp {
@@ -1003,8 +1005,7 @@
<MX8MP_IOMUXC_SD2_DATA0__USDHC2_DATA0 0x1d4>,
<MX8MP_IOMUXC_SD2_DATA1__USDHC2_DATA1 0x1d4>,
<MX8MP_IOMUXC_SD2_DATA2__USDHC2_DATA2 0x1d4>,
- <MX8MP_IOMUXC_SD2_DATA3__USDHC2_DATA3 0x1d4>,
- <MX8MP_IOMUXC_GPIO1_IO04__USDHC2_VSELECT 0xc0>;
+ <MX8MP_IOMUXC_SD2_DATA3__USDHC2_DATA3 0x1d4>;
};
pinctrl_usdhc2_gpio: usdhc2-gpiogrp {
diff --git a/arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql.dtsi
index fd70b686e7ef..9716f24f7c6e 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql.dtsi
@@ -16,13 +16,18 @@
reg = <0x0 0x40000000 0 0x80000000>;
};
- /* identical to buck4_reg, but should never change */
- reg_vcc3v3: regulator-vcc3v3 {
- compatible = "regulator-fixed";
- regulator-name = "VCC3V3";
- regulator-min-microvolt = <3300000>;
+ reg_usdhc2_vqmmc: regulator-usdhc2-vqmmc {
+ compatible = "regulator-gpio";
+ pinctrl-names = "default";
+ pinctrl-0 = <&pinctrl_reg_usdhc2_vqmmc>;
+ regulator-name = "V_SD2";
+ regulator-min-microvolt = <1800000>;
regulator-max-microvolt = <3300000>;
- regulator-always-on;
+ gpios = <&gpio1 4 GPIO_ACTIVE_HIGH>;
+ states = <1800000 0x1>,
+ <3300000 0x0>;
+ vin-supply = <&ldo5_reg>;
+ status = "disabled";
};
};
@@ -173,17 +178,21 @@
read-only;
reg = <0x53>;
pagesize = <16>;
- vcc-supply = <&reg_vcc3v3>;
+ vcc-supply = <&buck4_reg>;
};
m24c64: eeprom@57 {
compatible = "atmel,24c64";
reg = <0x57>;
pagesize = <32>;
- vcc-supply = <&reg_vcc3v3>;
+ vcc-supply = <&buck4_reg>;
};
};
+&usdhc2 {
+ vqmmc-supply = <&reg_usdhc2_vqmmc>;
+};
+
&usdhc3 {
pinctrl-names = "default", "state_100mhz", "state_200mhz";
pinctrl-0 = <&pinctrl_usdhc3>;
@@ -193,7 +202,7 @@
non-removable;
no-sd;
no-sdio;
- vmmc-supply = <&reg_vcc3v3>;
+ vmmc-supply = <&buck4_reg>;
vqmmc-supply = <&buck5_reg>;
status = "okay";
};
@@ -233,6 +242,10 @@
fsl,pins = <MX8MP_IOMUXC_SD2_RESET_B__GPIO2_IO19 0x10>;
};
+ pinctrl_reg_usdhc2_vqmmc: regusdhc2vqmmcgrp {
+ fsl,pins = <MX8MP_IOMUXC_GPIO1_IO04__GPIO1_IO04 0xc0>;
+ };
+
pinctrl_usdhc3: usdhc3grp {
fsl,pins = <MX8MP_IOMUXC_NAND_WE_B__USDHC3_CLK 0x194>,
<MX8MP_IOMUXC_NAND_WP_B__USDHC3_CMD 0x1d4>,
diff --git a/arch/arm64/boot/dts/freescale/imx8mp.dtsi b/arch/arm64/boot/dts/freescale/imx8mp.dtsi
index bb24dba7338e..d6d21e8498dc 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mp.dtsi
@@ -298,7 +298,7 @@
cpu-thermal {
polling-delay-passive = <250>;
polling-delay = <2000>;
- thermal-sensors = <&tmu 0>;
+ thermal-sensors = <&tmu 1>;
trips {
cpu_alert0: trip0 {
temperature = <85000>;
@@ -331,7 +331,7 @@
soc-thermal {
polling-delay-passive = <250>;
polling-delay = <2000>;
- thermal-sensors = <&tmu 1>;
+ thermal-sensors = <&tmu 0>;
trips {
soc_alert0: trip0 {
temperature = <85000>;
diff --git a/arch/arm64/boot/dts/freescale/imx95-19x19-evk.dts b/arch/arm64/boot/dts/freescale/imx95-19x19-evk.dts
index 2f949a0d48d2..9d034275c847 100644
--- a/arch/arm64/boot/dts/freescale/imx95-19x19-evk.dts
+++ b/arch/arm64/boot/dts/freescale/imx95-19x19-evk.dts
@@ -80,17 +80,17 @@
flexcan1_phy: can-phy0 {
compatible = "nxp,tjr1443";
#phy-cells = <0>;
- max-bitrate = <1000000>;
+ max-bitrate = <8000000>;
enable-gpios = <&i2c6_pcal6416 6 GPIO_ACTIVE_HIGH>;
- standby-gpios = <&i2c6_pcal6416 5 GPIO_ACTIVE_HIGH>;
+ standby-gpios = <&i2c6_pcal6416 5 GPIO_ACTIVE_LOW>;
};
flexcan2_phy: can-phy1 {
compatible = "nxp,tjr1443";
#phy-cells = <0>;
- max-bitrate = <1000000>;
- enable-gpios = <&i2c6_pcal6416 4 GPIO_ACTIVE_HIGH>;
- standby-gpios = <&i2c6_pcal6416 3 GPIO_ACTIVE_HIGH>;
+ max-bitrate = <8000000>;
+ enable-gpios = <&i2c4_gpio_expander_21 4 GPIO_ACTIVE_HIGH>;
+ standby-gpios = <&i2c4_gpio_expander_21 3 GPIO_ACTIVE_LOW>;
};
reg_vref_1v8: regulator-1p8v {
diff --git a/arch/arm64/boot/dts/freescale/imx95.dtsi b/arch/arm64/boot/dts/freescale/imx95.dtsi
index 4ca6a7ea586e..8296888bce59 100644
--- a/arch/arm64/boot/dts/freescale/imx95.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx95.dtsi
@@ -1843,7 +1843,7 @@
<GIC_SPI 294 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&scmi_clk IMX95_CLK_VPU>,
<&vpu_blk_ctrl IMX95_CLK_VPUBLK_JPEG_ENC>;
- assigned-clocks = <&vpu_blk_ctrl IMX95_CLK_VPUBLK_JPEG_DEC>;
+ assigned-clocks = <&vpu_blk_ctrl IMX95_CLK_VPUBLK_JPEG_ENC>;
assigned-clock-parents = <&scmi_clk IMX95_CLK_VPUJPEG>;
power-domains = <&scmi_devpd IMX95_PD_VPU>;
};
diff --git a/arch/arm64/boot/dts/marvell/armada-8040-mcbin.dtsi b/arch/arm64/boot/dts/marvell/armada-8040-mcbin.dtsi
index 0d4a5fd9503f..f2d278d171eb 100644
--- a/arch/arm64/boot/dts/marvell/armada-8040-mcbin.dtsi
+++ b/arch/arm64/boot/dts/marvell/armada-8040-mcbin.dtsi
@@ -345,11 +345,13 @@
/* CPS Lane 1 - U32 */
sata-port@0 {
phys = <&cp1_comphy1 0>;
+ status = "okay";
};
/* CPS Lane 3 - U31 */
sata-port@1 {
phys = <&cp1_comphy3 1>;
+ status = "okay";
};
};
diff --git a/arch/arm64/boot/dts/marvell/cn9130-cf.dtsi b/arch/arm64/boot/dts/marvell/cn9130-cf.dtsi
index ad0ab34b6602..bd42bfbe408b 100644
--- a/arch/arm64/boot/dts/marvell/cn9130-cf.dtsi
+++ b/arch/arm64/boot/dts/marvell/cn9130-cf.dtsi
@@ -152,11 +152,12 @@
/* SRDS #0 - SATA on M.2 connector */
&cp0_sata0 {
- phys = <&cp0_comphy0 1>;
status = "okay";
- /* only port 1 is available */
- /delete-node/ sata-port@0;
+ sata-port@1 {
+ phys = <&cp0_comphy0 1>;
+ status = "okay";
+ };
};
/* microSD */
diff --git a/arch/arm64/boot/dts/marvell/cn9131-cf-solidwan.dts b/arch/arm64/boot/dts/marvell/cn9131-cf-solidwan.dts
index 47234d0858dd..338853d3b179 100644
--- a/arch/arm64/boot/dts/marvell/cn9131-cf-solidwan.dts
+++ b/arch/arm64/boot/dts/marvell/cn9131-cf-solidwan.dts
@@ -563,11 +563,13 @@
/* SRDS #1 - SATA on M.2 (J44) */
&cp1_sata0 {
- phys = <&cp1_comphy1 0>;
status = "okay";
/* only port 0 is available */
- /delete-node/ sata-port@1;
+ sata-port@0 {
+ phys = <&cp1_comphy1 0>;
+ status = "okay";
+ };
};
&cp1_syscon0 {
diff --git a/arch/arm64/boot/dts/marvell/cn9132-clearfog.dts b/arch/arm64/boot/dts/marvell/cn9132-clearfog.dts
index 0f53745a6fa0..6f237d3542b9 100644
--- a/arch/arm64/boot/dts/marvell/cn9132-clearfog.dts
+++ b/arch/arm64/boot/dts/marvell/cn9132-clearfog.dts
@@ -413,7 +413,13 @@
/* SRDS #0,#1,#2,#3 - PCIe */
&cp0_pcie0 {
num-lanes = <4>;
- phys = <&cp0_comphy0 0>, <&cp0_comphy1 0>, <&cp0_comphy2 0>, <&cp0_comphy3 0>;
+ /*
+ * The mvebu-comphy driver does not currently know how to pass correct
+ * lane-count to ATF while configuring the serdes lanes.
+ * Rely on bootloader configuration only.
+ *
+ * phys = <&cp0_comphy0 0>, <&cp0_comphy1 0>, <&cp0_comphy2 0>, <&cp0_comphy3 0>;
+ */
status = "okay";
};
@@ -475,7 +481,13 @@
/* SRDS #0,#1 - PCIe */
&cp1_pcie0 {
num-lanes = <2>;
- phys = <&cp1_comphy0 0>, <&cp1_comphy1 0>;
+ /*
+ * The mvebu-comphy driver does not currently know how to pass correct
+ * lane-count to ATF while configuring the serdes lanes.
+ * Rely on bootloader configuration only.
+ *
+ * phys = <&cp1_comphy0 0>, <&cp1_comphy1 0>;
+ */
status = "okay";
};
@@ -512,10 +524,9 @@
status = "okay";
/* only port 1 is available */
- /delete-node/ sata-port@0;
-
sata-port@1 {
phys = <&cp1_comphy3 1>;
+ status = "okay";
};
};
@@ -631,9 +642,8 @@
status = "okay";
/* only port 1 is available */
- /delete-node/ sata-port@0;
-
sata-port@1 {
+ status = "okay";
phys = <&cp2_comphy3 1>;
};
};
diff --git a/arch/arm64/boot/dts/marvell/cn9132-sr-cex7.dtsi b/arch/arm64/boot/dts/marvell/cn9132-sr-cex7.dtsi
index afc041c1c448..bb2bb47fd77c 100644
--- a/arch/arm64/boot/dts/marvell/cn9132-sr-cex7.dtsi
+++ b/arch/arm64/boot/dts/marvell/cn9132-sr-cex7.dtsi
@@ -137,6 +137,14 @@
pinctrl-0 = <&ap_mmc0_pins>;
pinctrl-names = "default";
vqmmc-supply = <&v_1_8>;
+ /*
+ * Not stable in HS modes - phy needs "more calibration", so disable
+ * UHS (by preventing voltage switch), SDR104, SDR50 and DDR50 modes.
+ */
+ no-1-8-v;
+ no-sd;
+ no-sdio;
+ non-removable;
status = "okay";
};
diff --git a/arch/arm64/boot/dts/rockchip/px30-pp1516.dtsi b/arch/arm64/boot/dts/rockchip/px30-pp1516.dtsi
index b4bd4e34747c..192791993f05 100644
--- a/arch/arm64/boot/dts/rockchip/px30-pp1516.dtsi
+++ b/arch/arm64/boot/dts/rockchip/px30-pp1516.dtsi
@@ -72,7 +72,7 @@
};
vcc_cam_avdd: regulator-vcc-cam-avdd {
- compatible = "regulator-fixed";
+ compatible = "regulator-fixed";
regulator-name = "vcc_cam_avdd";
gpio = <&gpio3 RK_PC0 GPIO_ACTIVE_LOW>;
pinctrl-names = "default";
@@ -83,7 +83,7 @@
};
vcc_cam_dovdd: regulator-vcc-cam-dovdd {
- compatible = "regulator-fixed";
+ compatible = "regulator-fixed";
regulator-name = "vcc_cam_dovdd";
gpio = <&gpio3 RK_PC1 GPIO_ACTIVE_LOW>;
pinctrl-names = "default";
@@ -94,7 +94,7 @@
};
vcc_cam_dvdd: regulator-vcc-cam-dvdd {
- compatible = "regulator-fixed";
+ compatible = "regulator-fixed";
regulator-name = "vcc_cam_dvdd";
gpio = <&gpio3 RK_PC5 GPIO_ACTIVE_HIGH>;
enable-active-high;
@@ -106,7 +106,7 @@
};
vcc_lens_afvdd: regulator-vcc-lens-afvdd {
- compatible = "regulator-fixed";
+ compatible = "regulator-fixed";
regulator-name = "vcc_lens_afvdd";
gpio = <&gpio3 RK_PB2 GPIO_ACTIVE_LOW>;
pinctrl-names = "default";
diff --git a/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou-video-demo.dtso b/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou-video-demo.dtso
index ea5ce919984f..760d5139f95d 100644
--- a/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou-video-demo.dtso
+++ b/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou-video-demo.dtso
@@ -26,7 +26,7 @@
};
cam_afvdd_2v8: regulator-cam-afvdd-2v8 {
- compatible = "regulator-fixed";
+ compatible = "regulator-fixed";
gpio = <&pca9670 2 GPIO_ACTIVE_LOW>;
regulator-max-microvolt = <2800000>;
regulator-min-microvolt = <2800000>;
@@ -35,7 +35,7 @@
};
cam_avdd_2v8: regulator-cam-avdd-2v8 {
- compatible = "regulator-fixed";
+ compatible = "regulator-fixed";
gpio = <&pca9670 4 GPIO_ACTIVE_LOW>;
regulator-max-microvolt = <2800000>;
regulator-min-microvolt = <2800000>;
@@ -44,7 +44,7 @@
};
cam_dovdd_1v8: regulator-cam-dovdd-1v8 {
- compatible = "regulator-fixed";
+ compatible = "regulator-fixed";
gpio = <&pca9670 3 GPIO_ACTIVE_LOW>;
regulator-max-microvolt = <1800000>;
regulator-min-microvolt = <1800000>;
diff --git a/arch/arm64/boot/dts/rockchip/rk3308-sakurapi-rk3308b.dts b/arch/arm64/boot/dts/rockchip/rk3308-sakurapi-rk3308b.dts
index f9f633aebb64..e5e6b800c2d1 100644
--- a/arch/arm64/boot/dts/rockchip/rk3308-sakurapi-rk3308b.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3308-sakurapi-rk3308b.dts
@@ -260,6 +260,6 @@
status = "okay";
};
-&usb_host_ohci{
+&usb_host_ohci {
status = "okay";
};
diff --git a/arch/arm64/boot/dts/rockchip/rk3368-lba3368.dts b/arch/arm64/boot/dts/rockchip/rk3368-lba3368.dts
index b99bb0a5f900..b9801a691b48 100644
--- a/arch/arm64/boot/dts/rockchip/rk3368-lba3368.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3368-lba3368.dts
@@ -609,7 +609,7 @@
bluetooth {
compatible = "brcm,bcm4345c5";
- interrupts-extended = <&gpio3 RK_PA7 GPIO_ACTIVE_HIGH>;
+ interrupts-extended = <&gpio3 RK_PA7 IRQ_TYPE_LEVEL_HIGH>;
interrupt-names = "host-wakeup";
clocks = <&rk808 RK808_CLKOUT1>;
clock-names = "lpo";
diff --git a/arch/arm64/boot/dts/rockchip/rk3399-pinebook-pro.dts b/arch/arm64/boot/dts/rockchip/rk3399-pinebook-pro.dts
index 5a8551d9ffe4..b33a1509a8e9 100644
--- a/arch/arm64/boot/dts/rockchip/rk3399-pinebook-pro.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3399-pinebook-pro.dts
@@ -959,6 +959,7 @@
reg = <0>;
m25p,fast-read;
spi-max-frequency = <10000000>;
+ vcc-supply = <&vcc_3v0>;
};
};
diff --git a/arch/arm64/boot/dts/rockchip/rk3399-pinephone-pro.dts b/arch/arm64/boot/dts/rockchip/rk3399-pinephone-pro.dts
index 585ef0fd88ef..6f97e57f36f5 100644
--- a/arch/arm64/boot/dts/rockchip/rk3399-pinephone-pro.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3399-pinephone-pro.dts
@@ -754,6 +754,7 @@
compatible = "jedec,spi-nor";
reg = <0>;
spi-max-frequency = <10000000>;
+ vcc-supply = <&vcc_1v8>;
};
};
diff --git a/arch/arm64/boot/dts/rockchip/rk3399-puma-haikou-video-demo.dtso b/arch/arm64/boot/dts/rockchip/rk3399-puma-haikou-video-demo.dtso
index d28880b8dd44..5e8f729c2cf2 100644
--- a/arch/arm64/boot/dts/rockchip/rk3399-puma-haikou-video-demo.dtso
+++ b/arch/arm64/boot/dts/rockchip/rk3399-puma-haikou-video-demo.dtso
@@ -26,7 +26,7 @@
};
cam_afvdd_2v8: regulator-cam-afvdd-2v8 {
- compatible = "regulator-fixed";
+ compatible = "regulator-fixed";
gpio = <&pca9670 2 GPIO_ACTIVE_LOW>;
regulator-max-microvolt = <2800000>;
regulator-min-microvolt = <2800000>;
@@ -35,7 +35,7 @@
};
cam_avdd_2v8: regulator-cam-avdd-2v8 {
- compatible = "regulator-fixed";
+ compatible = "regulator-fixed";
gpio = <&pca9670 4 GPIO_ACTIVE_LOW>;
regulator-max-microvolt = <2800000>;
regulator-min-microvolt = <2800000>;
@@ -44,7 +44,7 @@
};
cam_dovdd_1v8: regulator-cam-dovdd-1v8 {
- compatible = "regulator-fixed";
+ compatible = "regulator-fixed";
gpio = <&pca9670 3 GPIO_ACTIVE_LOW>;
regulator-max-microvolt = <1800000>;
regulator-min-microvolt = <1800000>;
diff --git a/arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi b/arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi
index e7ba477e75f9..7f578c50b4ad 100644
--- a/arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi
@@ -53,7 +53,7 @@
gpios = <&gpio4 RK_PA1 GPIO_ACTIVE_LOW>;
linux,default-trigger = "default-on";
pinctrl-names = "default";
- pinctrl-0 =<&blue_led>;
+ pinctrl-0 = <&blue_led>;
};
led-1 {
@@ -62,7 +62,7 @@
gpios = <&gpio0 RK_PB7 GPIO_ACTIVE_LOW>;
linux,default-trigger = "heartbeat";
pinctrl-names = "default";
- pinctrl-0 =<&heartbeat_led>;
+ pinctrl-0 = <&heartbeat_led>;
};
};
diff --git a/arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts b/arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts
index 101e2ee9766d..3386084f6318 100644
--- a/arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3576-armsom-sige5.dts
@@ -302,8 +302,7 @@
&eth1m0_tx_bus2
&eth1m0_rx_bus2
&eth1m0_rgmii_clk
- &eth1m0_rgmii_bus
- &ethm0_clk1_25m_out>;
+ &eth1m0_rgmii_bus>;
status = "okay";
};
@@ -784,7 +783,6 @@
rgmii_phy0: phy@1 {
compatible = "ethernet-phy-ieee802.3-c22";
reg = <0x1>;
- clocks = <&cru REFCLKO25M_GMAC0_OUT>;
pinctrl-names = "default";
pinctrl-0 = <&gmac0_rst>;
reset-assert-us = <20000>;
@@ -797,7 +795,6 @@
rgmii_phy1: phy@1 {
compatible = "ethernet-phy-ieee802.3-c22";
reg = <0x1>;
- clocks = <&cru REFCLKO25M_GMAC1_OUT>;
pinctrl-names = "default";
pinctrl-0 = <&gmac1_rst>;
reset-assert-us = <20000>;
diff --git a/arch/arm64/boot/dts/rockchip/rk3582-radxa-e52c.dts b/arch/arm64/boot/dts/rockchip/rk3582-radxa-e52c.dts
index e04f21d8c831..431ff77d4518 100644
--- a/arch/arm64/boot/dts/rockchip/rk3582-radxa-e52c.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3582-radxa-e52c.dts
@@ -250,6 +250,7 @@
compatible = "belling,bl24c16a", "atmel,24c16";
reg = <0x50>;
pagesize = <16>;
+ read-only;
vcc-supply = <&vcc_3v3_pmu>;
};
};
diff --git a/arch/arm64/boot/dts/rockchip/rk3588-nanopc-t6.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-nanopc-t6.dtsi
index 3d8b6f0c5541..69833a0a94d0 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588-nanopc-t6.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3588-nanopc-t6.dtsi
@@ -731,6 +731,7 @@
spi-max-frequency = <104000000>;
spi-rx-bus-width = <4>;
spi-tx-bus-width = <1>;
+ vcc-supply = <&vcc_1v8_s3>;
};
};
diff --git a/arch/arm64/boot/dts/rockchip/rk3588-orangepi-5-plus.dts b/arch/arm64/boot/dts/rockchip/rk3588-orangepi-5-plus.dts
index 121e4d1c3fa5..8222f1fae8fa 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588-orangepi-5-plus.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3588-orangepi-5-plus.dts
@@ -77,7 +77,7 @@
pinctrl-names = "default";
pinctrl-0 = <&hp_detect>;
simple-audio-card,aux-devs = <&speaker_amp>, <&headphone_amp>;
- simple-audio-card,hp-det-gpios = <&gpio1 RK_PD3 GPIO_ACTIVE_LOW>;
+ simple-audio-card,hp-det-gpios = <&gpio1 RK_PD3 GPIO_ACTIVE_HIGH>;
simple-audio-card,widgets =
"Microphone", "Onboard Microphone",
"Microphone", "Microphone Jack",
diff --git a/arch/arm64/boot/dts/rockchip/rk3588-orangepi-5.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-orangepi-5.dtsi
index 91d56c34a1e4..8a8f3b26754d 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588-orangepi-5.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3588-orangepi-5.dtsi
@@ -365,6 +365,8 @@
max-frequency = <200000000>;
mmc-hs400-1_8v;
mmc-hs400-enhanced-strobe;
+ vmmc-supply = <&vcc_3v3_s3>;
+ vqmmc-supply = <&vcc_1v8_s3>;
status = "okay";
};
diff --git a/arch/arm64/boot/dts/rockchip/rk3588-rock-5t.dts b/arch/arm64/boot/dts/rockchip/rk3588-rock-5t.dts
index 258c7400301d..f16ff0064309 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588-rock-5t.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3588-rock-5t.dts
@@ -68,6 +68,22 @@
status = "okay";
};
+&pcie30phy {
+ data-lanes = <1 1 2 2>;
+};
+
+&pcie3x2 {
+ pinctrl-names = "default";
+ pinctrl-0 = <&pcie3x2_rst>;
+ reset-gpios = <&gpio4 RK_PB0 GPIO_ACTIVE_HIGH>;
+ vpcie3v3-supply = <&vcc3v3_pcie30>;
+ status = "okay";
+};
+
+&pcie3x4 {
+ num-lanes = <2>;
+};
+
&pinctrl {
hdmirx {
hdmirx_hpd: hdmirx-5v-detection {
@@ -90,11 +106,23 @@
};
};
+ pcie3 {
+ pcie3x2_rst: pcie3x2-rst {
+ rockchip,pins = <4 RK_PB0 RK_FUNC_GPIO &pcfg_pull_none>;
+ };
+ };
+
sound {
hp_detect: hp-detect {
rockchip,pins = <4 RK_PC3 RK_FUNC_GPIO &pcfg_pull_none>;
};
};
+
+ usb {
+ vcc5v0_host_en: vcc5v0-host-en {
+ rockchip,pins = <1 RK_PA1 RK_FUNC_GPIO &pcfg_pull_none>;
+ };
+ };
};
&vcc3v3_pcie2x1l0 {
@@ -103,3 +131,10 @@
pinctrl-0 = <&pcie2_0_vcc3v3_en>;
status = "okay";
};
+
+&vcc5v0_host {
+ enable-active-high;
+ gpio = <&gpio1 RK_PA1 GPIO_ACTIVE_HIGH>;
+ pinctrl-names = "default";
+ pinctrl-0 = <&vcc5v0_host_en>;
+};
diff --git a/arch/arm64/boot/dts/rockchip/rk3588j.dtsi b/arch/arm64/boot/dts/rockchip/rk3588j.dtsi
index 3045cb3bd68c..9884a5df47df 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588j.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3588j.dtsi
@@ -28,7 +28,7 @@
compatible = "operating-points-v2";
opp-shared;
- opp-1200000000{
+ opp-1200000000 {
opp-hz = /bits/ 64 <1200000000>;
opp-microvolt = <750000 750000 950000>;
clock-latency-ns = <40000>;
@@ -49,7 +49,7 @@
compatible = "operating-points-v2";
opp-shared;
- opp-1200000000{
+ opp-1200000000 {
opp-hz = /bits/ 64 <1200000000>;
opp-microvolt = <750000 750000 950000>;
clock-latency-ns = <40000>;
diff --git a/arch/arm64/boot/dts/rockchip/rk3588s-orangepi-5.dtsi b/arch/arm64/boot/dts/rockchip/rk3588s-orangepi-5.dtsi
index 4fedc50cce8c..11940c77f2bd 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588s-orangepi-5.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3588s-orangepi-5.dtsi
@@ -42,9 +42,8 @@
simple-audio-card,bitclock-master = <&masterdai>;
simple-audio-card,format = "i2s";
simple-audio-card,frame-master = <&masterdai>;
- simple-audio-card,hp-det-gpios = <&gpio1 RK_PD5 GPIO_ACTIVE_LOW>;
+ simple-audio-card,hp-det-gpios = <&gpio1 RK_PD5 GPIO_ACTIVE_HIGH>;
simple-audio-card,mclk-fs = <256>;
- simple-audio-card,pin-switches = "Headphones";
simple-audio-card,routing =
"Headphones", "LOUT1",
"Headphones", "ROUT1",
diff --git a/arch/arm64/boot/dts/rockchip/rk3588s-roc-pc.dts b/arch/arm64/boot/dts/rockchip/rk3588s-roc-pc.dts
index 7434ac39246f..7e179862da6e 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588s-roc-pc.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3588s-roc-pc.dts
@@ -320,9 +320,9 @@
&i2c3 {
status = "okay";
- es8388: audio-codec@10 {
+ es8388: audio-codec@11 {
compatible = "everest,es8388", "everest,es8328";
- reg = <0x10>;
+ reg = <0x11>;
clocks = <&cru I2S1_8CH_MCLKOUT>;
AVDD-supply = <&vcc_3v3_s0>;
DVDD-supply = <&vcc_1v8_s0>;
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index bf13d676aae2..e223cbf350e4 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -871,6 +871,8 @@ static inline bool system_supports_pmuv3(void)
return cpus_have_final_cap(ARM64_HAS_PMUV3);
}
+bool cpu_supports_bbml2_noabort(void);
+
static inline bool system_supports_bbml2_noabort(void)
{
return alternative_has_cap_unlikely(ARM64_HAS_BBML2_NOABORT);
diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index 661735616787..9b00b75acbf2 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -81,7 +81,6 @@
#define ARM_CPU_PART_CORTEX_A78AE 0xD42
#define ARM_CPU_PART_CORTEX_X1 0xD44
#define ARM_CPU_PART_CORTEX_A510 0xD46
-#define ARM_CPU_PART_CORTEX_X1C 0xD4C
#define ARM_CPU_PART_CORTEX_A520 0xD80
#define ARM_CPU_PART_CORTEX_A710 0xD47
#define ARM_CPU_PART_CORTEX_A715 0xD4D
@@ -93,9 +92,11 @@
#define ARM_CPU_PART_NEOVERSE_V2 0xD4F
#define ARM_CPU_PART_CORTEX_A720 0xD81
#define ARM_CPU_PART_CORTEX_X4 0xD82
+#define ARM_CPU_PART_NEOVERSE_V3AE 0xD83
#define ARM_CPU_PART_NEOVERSE_V3 0xD84
#define ARM_CPU_PART_CORTEX_X925 0xD85
#define ARM_CPU_PART_CORTEX_A725 0xD87
+#define ARM_CPU_PART_CORTEX_A720AE 0xD89
#define ARM_CPU_PART_NEOVERSE_N3 0xD8E
#define APM_CPU_PART_XGENE 0x000
@@ -129,6 +130,7 @@
#define NVIDIA_CPU_PART_DENVER 0x003
#define NVIDIA_CPU_PART_CARMEL 0x004
+#define NVIDIA_CPU_PART_OLYMPUS 0x010
#define FUJITSU_CPU_PART_A64FX 0x001
@@ -170,7 +172,6 @@
#define MIDR_CORTEX_A78AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78AE)
#define MIDR_CORTEX_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1)
#define MIDR_CORTEX_A510 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A510)
-#define MIDR_CORTEX_X1C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1C)
#define MIDR_CORTEX_A520 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A520)
#define MIDR_CORTEX_A710 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A710)
#define MIDR_CORTEX_A715 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A715)
@@ -182,9 +183,11 @@
#define MIDR_NEOVERSE_V2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V2)
#define MIDR_CORTEX_A720 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A720)
#define MIDR_CORTEX_X4 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X4)
+#define MIDR_NEOVERSE_V3AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V3AE)
#define MIDR_NEOVERSE_V3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V3)
#define MIDR_CORTEX_X925 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X925)
#define MIDR_CORTEX_A725 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A725)
+#define MIDR_CORTEX_A720AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A720AE)
#define MIDR_NEOVERSE_N3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N3)
#define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX)
#define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX)
@@ -220,6 +223,7 @@
#define MIDR_NVIDIA_DENVER MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_DENVER)
#define MIDR_NVIDIA_CARMEL MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_CARMEL)
+#define MIDR_NVIDIA_OLYMPUS MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_OLYMPUS)
#define MIDR_FUJITSU_A64FX MIDR_CPU_MODEL(ARM_CPU_IMP_FUJITSU, FUJITSU_CPU_PART_A64FX)
#define MIDR_HISI_TSV110 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_TSV110)
#define MIDR_HISI_HIP09 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_HIP09)
diff --git a/arch/arm64/include/asm/daifflags.h b/arch/arm64/include/asm/daifflags.h
index fbb5c99eb2f9..5fca48009043 100644
--- a/arch/arm64/include/asm/daifflags.h
+++ b/arch/arm64/include/asm/daifflags.h
@@ -128,7 +128,7 @@ static inline void local_daif_inherit(struct pt_regs *regs)
{
unsigned long flags = regs->pstate & DAIF_MASK;
- if (interrupts_enabled(regs))
+ if (!regs_irqs_disabled(regs))
trace_hardirqs_on();
if (system_uses_irq_prio_masking())
diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h
index 46033027510c..b37da3ee8529 100644
--- a/arch/arm64/include/asm/el2_setup.h
+++ b/arch/arm64/include/asm/el2_setup.h
@@ -91,6 +91,14 @@
msr cntvoff_el2, xzr // Clear virtual offset
.endm
+/* Branch to skip_label if SPE version is less than given version */
+.macro __spe_vers_imp skip_label, version, tmp
+ mrs \tmp, id_aa64dfr0_el1
+ ubfx \tmp, \tmp, #ID_AA64DFR0_EL1_PMSVer_SHIFT, #4
+ cmp \tmp, \version
+ b.lt \skip_label
+.endm
+
.macro __init_el2_debug
mrs x1, id_aa64dfr0_el1
ubfx x0, x1, #ID_AA64DFR0_EL1_PMUVer_SHIFT, #4
@@ -103,8 +111,7 @@
csel x2, xzr, x0, eq // all PMU counters from EL1
/* Statistical profiling */
- ubfx x0, x1, #ID_AA64DFR0_EL1_PMSVer_SHIFT, #4
- cbz x0, .Lskip_spe_\@ // Skip if SPE not present
+ __spe_vers_imp .Lskip_spe_\@, ID_AA64DFR0_EL1_PMSVer_IMP, x0 // Skip if SPE not present
mrs_s x0, SYS_PMBIDR_EL1 // If SPE available at EL2,
and x0, x0, #(1 << PMBIDR_EL1_P_SHIFT)
@@ -263,10 +270,8 @@
mov x0, xzr
mov x2, xzr
- mrs x1, id_aa64dfr0_el1
- ubfx x1, x1, #ID_AA64DFR0_EL1_PMSVer_SHIFT, #4
- cmp x1, #3
- b.lt .Lskip_spe_fgt_\@
+ /* If SPEv1p2 is implemented, */
+ __spe_vers_imp .Lskip_spe_fgt_\@, #ID_AA64DFR0_EL1_PMSVer_V1P2, x1
/* Disable PMSNEVFR_EL1 read and write traps */
orr x0, x0, #HDFGRTR_EL2_nPMSNEVFR_EL1_MASK
orr x2, x2, #HDFGWTR_EL2_nPMSNEVFR_EL1_MASK
@@ -387,6 +392,17 @@
orr x0, x0, #HDFGRTR2_EL2_nPMICFILTR_EL0
orr x0, x0, #HDFGRTR2_EL2_nPMUACR_EL1
.Lskip_pmuv3p9_\@:
+ /* If SPE is implemented, */
+ __spe_vers_imp .Lskip_spefds_\@, ID_AA64DFR0_EL1_PMSVer_IMP, x1
+ /* we can read PMSIDR and */
+ mrs_s x1, SYS_PMSIDR_EL1
+ and x1, x1, #PMSIDR_EL1_FDS
+ /* if FEAT_SPE_FDS is implemented, */
+ cbz x1, .Lskip_spefds_\@
+ /* disable traps of PMSDSFR to EL2. */
+ orr x0, x0, #HDFGRTR2_EL2_nPMSDSFR_EL1
+
+.Lskip_spefds_\@:
msr_s SYS_HDFGRTR2_EL2, x0
msr_s SYS_HDFGWTR2_EL2, x0
msr_s SYS_HFGRTR2_EL2, xzr
diff --git a/arch/arm64/include/asm/entry-common.h b/arch/arm64/include/asm/entry-common.h
new file mode 100644
index 000000000000..cab8cd78f693
--- /dev/null
+++ b/arch/arm64/include/asm/entry-common.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _ASM_ARM64_ENTRY_COMMON_H
+#define _ASM_ARM64_ENTRY_COMMON_H
+
+#include <linux/thread_info.h>
+
+#include <asm/cpufeature.h>
+#include <asm/daifflags.h>
+#include <asm/fpsimd.h>
+#include <asm/mte.h>
+#include <asm/stacktrace.h>
+
+#define ARCH_EXIT_TO_USER_MODE_WORK (_TIF_MTE_ASYNC_FAULT | _TIF_FOREIGN_FPSTATE)
+
+static __always_inline void arch_exit_to_user_mode_work(struct pt_regs *regs,
+ unsigned long ti_work)
+{
+ if (ti_work & _TIF_MTE_ASYNC_FAULT) {
+ clear_thread_flag(TIF_MTE_ASYNC_FAULT);
+ send_sig_fault(SIGSEGV, SEGV_MTEAERR, (void __user *)NULL, current);
+ }
+
+ if (ti_work & _TIF_FOREIGN_FPSTATE)
+ fpsimd_restore_current_state();
+}
+
+#define arch_exit_to_user_mode_work arch_exit_to_user_mode_work
+
+static inline bool arch_irqentry_exit_need_resched(void)
+{
+ /*
+ * DAIF.DA are cleared at the start of IRQ/FIQ handling, and when GIC
+ * priority masking is used the GIC irqchip driver will clear DAIF.IF
+ * using gic_arch_enable_irqs() for normal IRQs. If anything is set in
+ * DAIF we must have handled an NMI, so skip preemption.
+ */
+ if (system_uses_irq_prio_masking() && read_sysreg(daif))
+ return false;
+
+ /*
+ * Preempting a task from an IRQ means we leave copies of PSTATE
+ * on the stack. cpufeature's enable calls may modify PSTATE, but
+ * resuming one of these preempted tasks would undo those changes.
+ *
+ * Only allow a task to be preempted once cpufeatures have been
+ * enabled.
+ */
+ if (!system_capabilities_finalized())
+ return false;
+
+ return true;
+}
+
+#define arch_irqentry_exit_need_resched arch_irqentry_exit_need_resched
+
+#endif /* _ASM_ARM64_ENTRY_COMMON_H */
diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index e3874c4fc399..a2da3cb21c24 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -89,7 +89,6 @@ void do_el1_fpac(struct pt_regs *regs, unsigned long esr);
void do_el0_mops(struct pt_regs *regs, unsigned long esr);
void do_el1_mops(struct pt_regs *regs, unsigned long esr);
void do_serror(struct pt_regs *regs, unsigned long esr);
-void do_signal(struct pt_regs *regs);
void __noreturn panic_bad_stack(struct pt_regs *regs, unsigned long esr, unsigned long far);
#endif /* __ASM_EXCEPTION_H */
diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h
index 5bc432234d3a..8fa0707069e8 100644
--- a/arch/arm64/include/asm/gcs.h
+++ b/arch/arm64/include/asm/gcs.h
@@ -21,7 +21,7 @@ static inline void gcsstr(u64 *addr, u64 val)
register u64 *_addr __asm__ ("x0") = addr;
register long _val __asm__ ("x1") = val;
- /* GCSSTTR x1, x0 */
+ /* GCSSTTR x1, [x0] */
asm volatile(
".inst 0xd91f1c01\n"
:
@@ -81,6 +81,82 @@ static inline int gcs_check_locked(struct task_struct *task,
return 0;
}
+static inline int gcssttr(unsigned long __user *addr, unsigned long val)
+{
+ register unsigned long __user *_addr __asm__ ("x0") = addr;
+ register unsigned long _val __asm__ ("x1") = val;
+ int err = 0;
+
+ /* GCSSTTR x1, [x0] */
+ asm volatile(
+ "1: .inst 0xd91f1c01\n"
+ "2: \n"
+ _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w0)
+ : "+r" (err)
+ : "rZ" (_val), "r" (_addr)
+ : "memory");
+
+ return err;
+}
+
+static inline void put_user_gcs(unsigned long val, unsigned long __user *addr,
+ int *err)
+{
+ int ret;
+
+ if (!access_ok((char __user *)addr, sizeof(u64))) {
+ *err = -EFAULT;
+ return;
+ }
+
+ uaccess_ttbr0_enable();
+ ret = gcssttr(addr, val);
+ if (ret != 0)
+ *err = ret;
+ uaccess_ttbr0_disable();
+}
+
+static inline void push_user_gcs(unsigned long val, int *err)
+{
+ u64 gcspr = read_sysreg_s(SYS_GCSPR_EL0);
+
+ gcspr -= sizeof(u64);
+ put_user_gcs(val, (unsigned long __user *)gcspr, err);
+ if (!*err)
+ write_sysreg_s(gcspr, SYS_GCSPR_EL0);
+}
+
+/*
+ * Unlike put/push_user_gcs() above, get/pop_user_gcs() doesn't
+ * validate the GCS permission is set on the page being read. This
+ * differs from how the hardware works when it consumes data stored at
+ * GCSPR. Callers should ensure this is acceptable.
+ */
+static inline u64 get_user_gcs(unsigned long __user *addr, int *err)
+{
+ unsigned long ret;
+ u64 load = 0;
+
+ /* Ensure previous GCS operations are visible before we read the page */
+ gcsb_dsync();
+ ret = copy_from_user(&load, addr, sizeof(load));
+ if (ret != 0)
+ *err = ret;
+ return load;
+}
+
+static inline u64 pop_user_gcs(int *err)
+{
+ u64 gcspr = read_sysreg_s(SYS_GCSPR_EL0);
+ u64 read_val;
+
+ read_val = get_user_gcs((__force unsigned long __user *)gcspr, err);
+ if (!*err)
+ write_sysreg_s(gcspr + sizeof(u64), SYS_GCSPR_EL0);
+
+ return read_val;
+}
+
#else
static inline bool task_gcs_el0_enabled(struct task_struct *task)
@@ -91,6 +167,10 @@ static inline bool task_gcs_el0_enabled(struct task_struct *task)
static inline void gcs_set_el0_mode(struct task_struct *task) { }
static inline void gcs_free(struct task_struct *task) { }
static inline void gcs_preserve_current_state(void) { }
+static inline void put_user_gcs(unsigned long val, unsigned long __user *addr,
+ int *err) { }
+static inline void push_user_gcs(unsigned long val, int *err) { }
+
static inline unsigned long gcs_alloc_thread_stack(struct task_struct *tsk,
const struct kernel_clone_args *args)
{
@@ -101,6 +181,15 @@ static inline int gcs_check_locked(struct task_struct *task,
{
return 0;
}
+static inline u64 get_user_gcs(unsigned long __user *addr, int *err)
+{
+ *err = -EFAULT;
+ return 0;
+}
+static inline u64 pop_user_gcs(int *err)
+{
+ return 0;
+}
#endif
diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h
index 13f94c8ddfc0..6d567265467c 100644
--- a/arch/arm64/include/asm/hwcap.h
+++ b/arch/arm64/include/asm/hwcap.h
@@ -178,6 +178,7 @@
#define __khwcap3_feature(x) (const_ilog2(HWCAP3_ ## x) + 128)
#define KERNEL_HWCAP_MTE_FAR __khwcap3_feature(MTE_FAR)
#define KERNEL_HWCAP_MTE_STORE_ONLY __khwcap3_feature(MTE_STORE_ONLY)
+#define KERNEL_HWCAP_LSFE __khwcap3_feature(LSFE)
/*
* This yields a mask that user programs can use to figure out what
diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h
index 9b96840fb979..83e03abbb2ca 100644
--- a/arch/arm64/include/asm/io.h
+++ b/arch/arm64/include/asm/io.h
@@ -274,6 +274,10 @@ int arm64_ioremap_prot_hook_register(const ioremap_prot_hook_t hook);
#define ioremap_np(addr, size) \
ioremap_prot((addr), (size), __pgprot(PROT_DEVICE_nGnRnE))
+
+#define ioremap_encrypted(addr, size) \
+ ioremap_prot((addr), (size), PAGE_KERNEL)
+
/*
* io{read,write}{16,32,64}be() macros
*/
@@ -311,7 +315,7 @@ extern bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size,
static inline bool arm64_is_protected_mmio(phys_addr_t phys_addr, size_t size)
{
if (unlikely(is_realm_world()))
- return __arm64_is_protected_mmio(phys_addr, size);
+ return arm64_rsi_is_protected(phys_addr, size);
return false;
}
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 2f2394cce24e..0ee4f6fa3a17 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -1160,115 +1160,8 @@ u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *, enum vcpu_sysreg, u64);
__v; \
})
-u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg);
-void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg);
-
-static inline bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val)
-{
- /*
- * *** VHE ONLY ***
- *
- * System registers listed in the switch are not saved on every
- * exit from the guest but are only saved on vcpu_put.
- *
- * SYSREGS_ON_CPU *MUST* be checked before using this helper.
- *
- * Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but
- * should never be listed below, because the guest cannot modify its
- * own MPIDR_EL1 and MPIDR_EL1 is accessed for VCPU A from VCPU B's
- * thread when emulating cross-VCPU communication.
- */
- if (!has_vhe())
- return false;
-
- switch (reg) {
- case SCTLR_EL1: *val = read_sysreg_s(SYS_SCTLR_EL12); break;
- case CPACR_EL1: *val = read_sysreg_s(SYS_CPACR_EL12); break;
- case TTBR0_EL1: *val = read_sysreg_s(SYS_TTBR0_EL12); break;
- case TTBR1_EL1: *val = read_sysreg_s(SYS_TTBR1_EL12); break;
- case TCR_EL1: *val = read_sysreg_s(SYS_TCR_EL12); break;
- case TCR2_EL1: *val = read_sysreg_s(SYS_TCR2_EL12); break;
- case PIR_EL1: *val = read_sysreg_s(SYS_PIR_EL12); break;
- case PIRE0_EL1: *val = read_sysreg_s(SYS_PIRE0_EL12); break;
- case POR_EL1: *val = read_sysreg_s(SYS_POR_EL12); break;
- case ESR_EL1: *val = read_sysreg_s(SYS_ESR_EL12); break;
- case AFSR0_EL1: *val = read_sysreg_s(SYS_AFSR0_EL12); break;
- case AFSR1_EL1: *val = read_sysreg_s(SYS_AFSR1_EL12); break;
- case FAR_EL1: *val = read_sysreg_s(SYS_FAR_EL12); break;
- case MAIR_EL1: *val = read_sysreg_s(SYS_MAIR_EL12); break;
- case VBAR_EL1: *val = read_sysreg_s(SYS_VBAR_EL12); break;
- case CONTEXTIDR_EL1: *val = read_sysreg_s(SYS_CONTEXTIDR_EL12);break;
- case TPIDR_EL0: *val = read_sysreg_s(SYS_TPIDR_EL0); break;
- case TPIDRRO_EL0: *val = read_sysreg_s(SYS_TPIDRRO_EL0); break;
- case TPIDR_EL1: *val = read_sysreg_s(SYS_TPIDR_EL1); break;
- case AMAIR_EL1: *val = read_sysreg_s(SYS_AMAIR_EL12); break;
- case CNTKCTL_EL1: *val = read_sysreg_s(SYS_CNTKCTL_EL12); break;
- case ELR_EL1: *val = read_sysreg_s(SYS_ELR_EL12); break;
- case SPSR_EL1: *val = read_sysreg_s(SYS_SPSR_EL12); break;
- case PAR_EL1: *val = read_sysreg_par(); break;
- case DACR32_EL2: *val = read_sysreg_s(SYS_DACR32_EL2); break;
- case IFSR32_EL2: *val = read_sysreg_s(SYS_IFSR32_EL2); break;
- case DBGVCR32_EL2: *val = read_sysreg_s(SYS_DBGVCR32_EL2); break;
- case ZCR_EL1: *val = read_sysreg_s(SYS_ZCR_EL12); break;
- case SCTLR2_EL1: *val = read_sysreg_s(SYS_SCTLR2_EL12); break;
- default: return false;
- }
-
- return true;
-}
-
-static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg)
-{
- /*
- * *** VHE ONLY ***
- *
- * System registers listed in the switch are not restored on every
- * entry to the guest but are only restored on vcpu_load.
- *
- * SYSREGS_ON_CPU *MUST* be checked before using this helper.
- *
- * Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but
- * should never be listed below, because the MPIDR should only be set
- * once, before running the VCPU, and never changed later.
- */
- if (!has_vhe())
- return false;
-
- switch (reg) {
- case SCTLR_EL1: write_sysreg_s(val, SYS_SCTLR_EL12); break;
- case CPACR_EL1: write_sysreg_s(val, SYS_CPACR_EL12); break;
- case TTBR0_EL1: write_sysreg_s(val, SYS_TTBR0_EL12); break;
- case TTBR1_EL1: write_sysreg_s(val, SYS_TTBR1_EL12); break;
- case TCR_EL1: write_sysreg_s(val, SYS_TCR_EL12); break;
- case TCR2_EL1: write_sysreg_s(val, SYS_TCR2_EL12); break;
- case PIR_EL1: write_sysreg_s(val, SYS_PIR_EL12); break;
- case PIRE0_EL1: write_sysreg_s(val, SYS_PIRE0_EL12); break;
- case POR_EL1: write_sysreg_s(val, SYS_POR_EL12); break;
- case ESR_EL1: write_sysreg_s(val, SYS_ESR_EL12); break;
- case AFSR0_EL1: write_sysreg_s(val, SYS_AFSR0_EL12); break;
- case AFSR1_EL1: write_sysreg_s(val, SYS_AFSR1_EL12); break;
- case FAR_EL1: write_sysreg_s(val, SYS_FAR_EL12); break;
- case MAIR_EL1: write_sysreg_s(val, SYS_MAIR_EL12); break;
- case VBAR_EL1: write_sysreg_s(val, SYS_VBAR_EL12); break;
- case CONTEXTIDR_EL1: write_sysreg_s(val, SYS_CONTEXTIDR_EL12);break;
- case TPIDR_EL0: write_sysreg_s(val, SYS_TPIDR_EL0); break;
- case TPIDRRO_EL0: write_sysreg_s(val, SYS_TPIDRRO_EL0); break;
- case TPIDR_EL1: write_sysreg_s(val, SYS_TPIDR_EL1); break;
- case AMAIR_EL1: write_sysreg_s(val, SYS_AMAIR_EL12); break;
- case CNTKCTL_EL1: write_sysreg_s(val, SYS_CNTKCTL_EL12); break;
- case ELR_EL1: write_sysreg_s(val, SYS_ELR_EL12); break;
- case SPSR_EL1: write_sysreg_s(val, SYS_SPSR_EL12); break;
- case PAR_EL1: write_sysreg_s(val, SYS_PAR_EL1); break;
- case DACR32_EL2: write_sysreg_s(val, SYS_DACR32_EL2); break;
- case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); break;
- case DBGVCR32_EL2: write_sysreg_s(val, SYS_DBGVCR32_EL2); break;
- case ZCR_EL1: write_sysreg_s(val, SYS_ZCR_EL12); break;
- case SCTLR2_EL1: write_sysreg_s(val, SYS_SCTLR2_EL12); break;
- default: return false;
- }
-
- return true;
-}
+u64 vcpu_read_sys_reg(const struct kvm_vcpu *, enum vcpu_sysreg);
+void vcpu_write_sys_reg(struct kvm_vcpu *, u64, enum vcpu_sysreg);
struct kvm_vm_stat {
struct kvm_vm_stat_generic generic;
@@ -1476,6 +1369,7 @@ static inline bool kvm_system_needs_idmapped_vectors(void)
}
void kvm_init_host_debug_data(void);
+void kvm_debug_init_vhe(void);
void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu);
void kvm_vcpu_put_debug(struct kvm_vcpu *vcpu);
void kvm_debug_set_guest_ownership(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index ae563ebd6aee..e4069f2ce642 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -180,6 +180,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu);
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
phys_addr_t pa, unsigned long size, bool writable);
+int kvm_handle_guest_sea(struct kvm_vcpu *vcpu);
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu);
phys_addr_t kvm_mmu_get_httbr(void);
diff --git a/arch/arm64/include/asm/kvm_ras.h b/arch/arm64/include/asm/kvm_ras.h
deleted file mode 100644
index 9398ade632aa..000000000000
--- a/arch/arm64/include/asm/kvm_ras.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2018 - Arm Ltd */
-
-#ifndef __ARM64_KVM_RAS_H__
-#define __ARM64_KVM_RAS_H__
-
-#include <linux/acpi.h>
-#include <linux/errno.h>
-#include <linux/types.h>
-
-#include <asm/acpi.h>
-
-/*
- * Was this synchronous external abort a RAS notification?
- * Returns '0' for errors handled by some RAS subsystem, or -ENOENT.
- */
-static inline int kvm_handle_guest_sea(void)
-{
- /* apei_claim_sea(NULL) expects to mask interrupts itself */
- lockdep_assert_irqs_enabled();
-
- return apei_claim_sea(NULL);
-}
-
-#endif /* __ARM64_KVM_RAS_H__ */
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 6e8aa8e72601..ff6fd0bbd7d2 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -17,6 +17,13 @@
#include <linux/refcount.h>
#include <asm/cpufeature.h>
+enum pgtable_type {
+ TABLE_PTE,
+ TABLE_PMD,
+ TABLE_PUD,
+ TABLE_P4D,
+};
+
typedef struct {
atomic64_t id;
#ifdef CONFIG_COMPAT
@@ -71,6 +78,9 @@ extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
pgprot_t prot, bool page_mappings_only);
extern void *fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot);
extern void mark_linear_text_alias_ro(void);
+extern int split_kernel_leaf_mapping(unsigned long start, unsigned long end);
+extern void init_idmap_kpti_bbml2_flag(void);
+extern void linear_map_maybe_split_to_ptes(void);
/*
* This check is triggered during the early boot before the cpufeature
diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h
index 79550b22ba19..fb9b88eebeb1 100644
--- a/arch/arm64/include/asm/module.h
+++ b/arch/arm64/include/asm/module.h
@@ -19,6 +19,7 @@ struct mod_arch_specific {
/* for CONFIG_DYNAMIC_FTRACE */
struct plt_entry *ftrace_trampolines;
+ struct plt_entry *init_ftrace_trampolines;
};
u64 module_emit_plt_entry(struct module *mod, Elf64_Shdr *sechdrs,
diff --git a/arch/arm64/include/asm/module.lds.h b/arch/arm64/include/asm/module.lds.h
index b9ae8349e35d..fb944b46846d 100644
--- a/arch/arm64/include/asm/module.lds.h
+++ b/arch/arm64/include/asm/module.lds.h
@@ -2,6 +2,7 @@ SECTIONS {
.plt 0 : { BYTE(0) }
.init.plt 0 : { BYTE(0) }
.text.ftrace_trampoline 0 : { BYTE(0) }
+ .init.text.ftrace_trampoline 0 : { BYTE(0) }
#ifdef CONFIG_KASAN_SW_TAGS
/*
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index abd2dee416b3..aa89c2e67ebc 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -371,6 +371,11 @@ static inline pmd_t pmd_mkcont(pmd_t pmd)
return __pmd(pmd_val(pmd) | PMD_SECT_CONT);
}
+static inline pmd_t pmd_mknoncont(pmd_t pmd)
+{
+ return __pmd(pmd_val(pmd) & ~PMD_SECT_CONT);
+}
+
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
static inline int pte_uffd_wp(pte_t pte)
{
diff --git a/arch/arm64/include/asm/preempt.h b/arch/arm64/include/asm/preempt.h
index 0159b625cc7f..932ea4b62042 100644
--- a/arch/arm64/include/asm/preempt.h
+++ b/arch/arm64/include/asm/preempt.h
@@ -2,7 +2,6 @@
#ifndef __ASM_PREEMPT_H
#define __ASM_PREEMPT_H
-#include <linux/jump_label.h>
#include <linux/thread_info.h>
#define PREEMPT_NEED_RESCHED BIT(32)
@@ -87,7 +86,6 @@ void preempt_schedule_notrace(void);
#ifdef CONFIG_PREEMPT_DYNAMIC
-DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
void dynamic_preempt_schedule(void);
#define __preempt_schedule() dynamic_preempt_schedule()
void dynamic_preempt_schedule_notrace(void);
diff --git a/arch/arm64/include/asm/ptdump.h b/arch/arm64/include/asm/ptdump.h
index fded5358641f..baff24004459 100644
--- a/arch/arm64/include/asm/ptdump.h
+++ b/arch/arm64/include/asm/ptdump.h
@@ -7,6 +7,8 @@
#include <linux/ptdump.h>
+DECLARE_STATIC_KEY_FALSE(arm64_ptdump_lock_key);
+
#ifdef CONFIG_PTDUMP
#include <linux/mm_types.h>
diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h
index 47ff8654c5ec..65b053a24d82 100644
--- a/arch/arm64/include/asm/ptrace.h
+++ b/arch/arm64/include/asm/ptrace.h
@@ -169,10 +169,6 @@ struct pt_regs {
u64 sdei_ttbr1;
struct frame_record_meta stackframe;
-
- /* Only valid for some EL1 exceptions. */
- u64 lockdep_hardirqs;
- u64 exit_rcu;
};
/* For correct stack alignment, pt_regs has to be a multiple of 16 bytes. */
@@ -214,11 +210,12 @@ static inline void forget_syscall(struct pt_regs *regs)
(regs)->pmr == GIC_PRIO_IRQON : \
true)
-#define interrupts_enabled(regs) \
- (!((regs)->pstate & PSR_I_BIT) && irqs_priority_unmasked(regs))
+static __always_inline bool regs_irqs_disabled(const struct pt_regs *regs)
+{
+ return (regs->pstate & PSR_I_BIT) || !irqs_priority_unmasked(regs);
+}
-#define fast_interrupts_enabled(regs) \
- (!((regs)->pstate & PSR_F_BIT))
+#define interrupts_enabled(regs) (!regs_irqs_disabled(regs))
static inline unsigned long user_stack_pointer(struct pt_regs *regs)
{
diff --git a/arch/arm64/include/asm/rsi.h b/arch/arm64/include/asm/rsi.h
index b42aeac05340..88b50d660e85 100644
--- a/arch/arm64/include/asm/rsi.h
+++ b/arch/arm64/include/asm/rsi.h
@@ -16,7 +16,7 @@ DECLARE_STATIC_KEY_FALSE(rsi_present);
void __init arm64_rsi_init(void);
-bool __arm64_is_protected_mmio(phys_addr_t base, size_t size);
+bool arm64_rsi_is_protected(phys_addr_t base, size_t size);
static inline bool is_realm_world(void)
{
diff --git a/arch/arm64/include/asm/setup.h b/arch/arm64/include/asm/setup.h
index ba269a7a3201..3d96dde4d214 100644
--- a/arch/arm64/include/asm/setup.h
+++ b/arch/arm64/include/asm/setup.h
@@ -21,7 +21,7 @@ static inline bool arch_parse_debug_rodata(char *arg)
if (!arg)
return false;
- if (!strcmp(arg, "full")) {
+ if (!strcmp(arg, "on")) {
rodata_enabled = rodata_full = true;
return true;
}
@@ -31,7 +31,7 @@ static inline bool arch_parse_debug_rodata(char *arg)
return true;
}
- if (!strcmp(arg, "on")) {
+ if (!strcmp(arg, "noalias")) {
rodata_enabled = true;
rodata_full = false;
return true;
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index d5b5f2ae1afa..6455db1b54fd 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -281,8 +281,6 @@
#define SYS_RGSR_EL1 sys_reg(3, 0, 1, 0, 5)
#define SYS_GCR_EL1 sys_reg(3, 0, 1, 0, 6)
-#define SYS_TCR_EL1 sys_reg(3, 0, 2, 0, 2)
-
#define SYS_APIAKEYLO_EL1 sys_reg(3, 0, 2, 1, 0)
#define SYS_APIAKEYHI_EL1 sys_reg(3, 0, 2, 1, 1)
#define SYS_APIBKEYLO_EL1 sys_reg(3, 0, 2, 1, 2)
@@ -344,15 +342,6 @@
#define SYS_PAR_EL1_ATTR GENMASK_ULL(63, 56)
#define SYS_PAR_EL1_F0_RES0 (GENMASK_ULL(6, 1) | GENMASK_ULL(55, 52))
-/*** Statistical Profiling Extension ***/
-#define PMSEVFR_EL1_RES0_IMP \
- (GENMASK_ULL(47, 32) | GENMASK_ULL(23, 16) | GENMASK_ULL(11, 8) |\
- BIT_ULL(6) | BIT_ULL(4) | BIT_ULL(2) | BIT_ULL(0))
-#define PMSEVFR_EL1_RES0_V1P1 \
- (PMSEVFR_EL1_RES0_IMP & ~(BIT_ULL(18) | BIT_ULL(17) | BIT_ULL(11)))
-#define PMSEVFR_EL1_RES0_V1P2 \
- (PMSEVFR_EL1_RES0_V1P1 & ~BIT_ULL(6))
-
/* Buffer error reporting */
#define PMBSR_EL1_FAULT_FSC_SHIFT PMBSR_EL1_MSS_SHIFT
#define PMBSR_EL1_FAULT_FSC_MASK PMBSR_EL1_MSS_MASK
@@ -1142,9 +1131,6 @@
#define ARM64_FEATURE_FIELD_BITS 4
-/* Defined for compatibility only, do not add new users. */
-#define ARM64_FEATURE_MASK(x) (x##_MASK)
-
#ifdef __ASSEMBLY__
.macro mrs_s, rt, sreg
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 5b91803201ef..1aa4ecb73429 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -502,44 +502,4 @@ static inline size_t probe_subpage_writeable(const char __user *uaddr,
#endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */
-#ifdef CONFIG_ARM64_GCS
-
-static inline int gcssttr(unsigned long __user *addr, unsigned long val)
-{
- register unsigned long __user *_addr __asm__ ("x0") = addr;
- register unsigned long _val __asm__ ("x1") = val;
- int err = 0;
-
- /* GCSSTTR x1, x0 */
- asm volatile(
- "1: .inst 0xd91f1c01\n"
- "2: \n"
- _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w0)
- : "+r" (err)
- : "rZ" (_val), "r" (_addr)
- : "memory");
-
- return err;
-}
-
-static inline void put_user_gcs(unsigned long val, unsigned long __user *addr,
- int *err)
-{
- int ret;
-
- if (!access_ok((char __user *)addr, sizeof(u64))) {
- *err = -EFAULT;
- return;
- }
-
- uaccess_ttbr0_enable();
- ret = gcssttr(addr, val);
- if (ret != 0)
- *err = ret;
- uaccess_ttbr0_disable();
-}
-
-
-#endif /* CONFIG_ARM64_GCS */
-
#endif /* __ASM_UACCESS_H */
diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h
index 12f534e8f3ed..4ec1acd3c1b3 100644
--- a/arch/arm64/include/asm/vmalloc.h
+++ b/arch/arm64/include/asm/vmalloc.h
@@ -9,18 +9,13 @@
#define arch_vmap_pud_supported arch_vmap_pud_supported
static inline bool arch_vmap_pud_supported(pgprot_t prot)
{
- /*
- * SW table walks can't handle removal of intermediate entries.
- */
- return pud_sect_supported() &&
- !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
+ return pud_sect_supported();
}
#define arch_vmap_pmd_supported arch_vmap_pmd_supported
static inline bool arch_vmap_pmd_supported(pgprot_t prot)
{
- /* See arch_vmap_pud_supported() */
- return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
+ return true;
}
#define arch_vmap_pte_range_map_size arch_vmap_pte_range_map_size
diff --git a/arch/arm64/include/asm/xen/events.h b/arch/arm64/include/asm/xen/events.h
index 2788e95d0ff0..2977b5fe068d 100644
--- a/arch/arm64/include/asm/xen/events.h
+++ b/arch/arm64/include/asm/xen/events.h
@@ -14,7 +14,7 @@ enum ipi_vector {
static inline int xen_irqs_disabled(struct pt_regs *regs)
{
- return !interrupts_enabled(regs);
+ return regs_irqs_disabled(regs);
}
#define xchg_xen_ulong(ptr, val) xchg((ptr), (val))
diff --git a/arch/arm64/include/uapi/asm/bitsperlong.h b/arch/arm64/include/uapi/asm/bitsperlong.h
index 485d60bee26c..d59730975f30 100644
--- a/arch/arm64/include/uapi/asm/bitsperlong.h
+++ b/arch/arm64/include/uapi/asm/bitsperlong.h
@@ -17,7 +17,12 @@
#ifndef __ASM_BITSPERLONG_H
#define __ASM_BITSPERLONG_H
+#if defined(__KERNEL__) && !defined(__aarch64__)
+/* Used by the compat vDSO */
+#define __BITS_PER_LONG 32
+#else
#define __BITS_PER_LONG 64
+#endif
#include <asm-generic/bitsperlong.h>
diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h
index 72c78468b806..575564ecdb0b 100644
--- a/arch/arm64/include/uapi/asm/hwcap.h
+++ b/arch/arm64/include/uapi/asm/hwcap.h
@@ -145,5 +145,6 @@
*/
#define HWCAP3_MTE_FAR (1UL << 0)
#define HWCAP3_MTE_STORE_ONLY (1UL << 1)
+#define HWCAP3_LSFE (1UL << 2)
#endif /* _UAPI__ASM_HWCAP_H */
diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c
index 4d529ff7ba51..7aca29e1d30b 100644
--- a/arch/arm64/kernel/acpi.c
+++ b/arch/arm64/kernel/acpi.c
@@ -357,6 +357,16 @@ void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size)
* as long as we take care not to create a writable
* mapping for executable code.
*/
+ fallthrough;
+
+ case EFI_ACPI_MEMORY_NVS:
+ /*
+ * ACPI NVS marks an area reserved for use by the
+ * firmware, even after exiting boot services.
+ * This may be used by the firmware for sharing dynamic
+ * tables/data (e.g., ACPI CCEL) with the OS. Map it
+ * as read-only.
+ */
prot = PAGE_KERNEL_RO;
break;
@@ -407,7 +417,7 @@ int apei_claim_sea(struct pt_regs *regs)
return_to_irqs_enabled = !irqs_disabled_flags(arch_local_save_flags());
if (regs)
- return_to_irqs_enabled = interrupts_enabled(regs);
+ return_to_irqs_enabled = !regs_irqs_disabled(regs);
/*
* SEA can interrupt SError, mask it and describe this as an NMI so
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 30d4bbe68661..b6367ff3a49c 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -6,6 +6,7 @@
* 2001-2002 Keith Owens
* Copyright (C) 2012 ARM Ltd.
*/
+#define COMPILE_OFFSETS
#include <linux/arm_sdei.h>
#include <linux/sched.h>
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 59d723c9ab8f..8cb3b575a031 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -531,6 +531,7 @@ static const struct midr_range erratum_spec_ssbs_list[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A710),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A715),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A720),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A720AE),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A725),
MIDR_ALL_VERSIONS(MIDR_CORTEX_X1),
MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C),
@@ -545,6 +546,7 @@ static const struct midr_range erratum_spec_ssbs_list[] = {
MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1),
MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2),
MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V3),
+ MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V3AE),
{}
};
#endif
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 9ad065f15f1d..63cd05e6973d 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -84,6 +84,7 @@
#include <asm/hwcap.h>
#include <asm/insn.h>
#include <asm/kvm_host.h>
+#include <asm/mmu.h>
#include <asm/mmu_context.h>
#include <asm/mte.h>
#include <asm/hypervisor.h>
@@ -278,6 +279,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar2[] = {
static const struct arm64_ftr_bits ftr_id_aa64isar3[] = {
ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FPRCVT_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_LSFE_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FAMINMAX_SHIFT, 4, 0),
ARM64_FTR_END,
};
@@ -1945,11 +1947,11 @@ static bool has_pmuv3(const struct arm64_cpu_capabilities *entry, int scope)
extern
void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
phys_addr_t size, pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int), int flags);
+ phys_addr_t (*pgtable_alloc)(enum pgtable_type), int flags);
static phys_addr_t __initdata kpti_ng_temp_alloc;
-static phys_addr_t __init kpti_ng_pgd_alloc(int shift)
+static phys_addr_t __init kpti_ng_pgd_alloc(enum pgtable_type type)
{
kpti_ng_temp_alloc -= PAGE_SIZE;
return kpti_ng_temp_alloc;
@@ -2027,6 +2029,7 @@ static void __init kpti_install_ng_mappings(void)
if (arm64_use_ng_mappings)
return;
+ init_idmap_kpti_bbml2_flag();
stop_machine(__kpti_install_ng_mappings, NULL, cpu_online_mask);
}
@@ -2217,7 +2220,7 @@ static bool hvhe_possible(const struct arm64_cpu_capabilities *entry,
return arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_HVHE);
}
-static bool has_bbml2_noabort(const struct arm64_cpu_capabilities *caps, int scope)
+bool cpu_supports_bbml2_noabort(void)
{
/*
* We want to allow usage of BBML2 in as wide a range of kernel contexts
@@ -2234,6 +2237,10 @@ static bool has_bbml2_noabort(const struct arm64_cpu_capabilities *caps, int sco
static const struct midr_range supports_bbml2_noabort_list[] = {
MIDR_REV_RANGE(MIDR_CORTEX_X4, 0, 3, 0xf),
MIDR_REV_RANGE(MIDR_NEOVERSE_V3, 0, 2, 0xf),
+ MIDR_REV_RANGE(MIDR_NEOVERSE_V3AE, 0, 2, 0xf),
+ MIDR_ALL_VERSIONS(MIDR_NVIDIA_OLYMPUS),
+ MIDR_ALL_VERSIONS(MIDR_AMPERE1),
+ MIDR_ALL_VERSIONS(MIDR_AMPERE1A),
{}
};
@@ -2249,6 +2256,11 @@ static bool has_bbml2_noabort(const struct arm64_cpu_capabilities *caps, int sco
return true;
}
+static bool has_bbml2_noabort(const struct arm64_cpu_capabilities *caps, int scope)
+{
+ return cpu_supports_bbml2_noabort();
+}
+
#ifdef CONFIG_ARM64_PAN
static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused)
{
@@ -2269,6 +2281,24 @@ static void cpu_clear_disr(const struct arm64_cpu_capabilities *__unused)
/* Firmware may have left a deferred SError in this register. */
write_sysreg_s(0, SYS_DISR_EL1);
}
+static bool has_rasv1p1(const struct arm64_cpu_capabilities *__unused, int scope)
+{
+ const struct arm64_cpu_capabilities rasv1p1_caps[] = {
+ {
+ ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, RAS, V1P1)
+ },
+ {
+ ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, RAS, IMP)
+ },
+ {
+ ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, RAS_frac, RASv1p1)
+ },
+ };
+
+ return (has_cpuid_feature(&rasv1p1_caps[0], scope) ||
+ (has_cpuid_feature(&rasv1p1_caps[1], scope) &&
+ has_cpuid_feature(&rasv1p1_caps[2], scope)));
+}
#endif /* CONFIG_ARM64_RAS_EXTN */
#ifdef CONFIG_ARM64_PTR_AUTH
@@ -2687,6 +2717,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.cpu_enable = cpu_clear_disr,
ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, RAS, IMP)
},
+ {
+ .desc = "RASv1p1 Extension Support",
+ .capability = ARM64_HAS_RASV1P1_EXTN,
+ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+ .matches = has_rasv1p1,
+ },
#endif /* CONFIG_ARM64_RAS_EXTN */
#ifdef CONFIG_ARM64_AMU_EXTN
{
@@ -3252,6 +3288,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
HWCAP_CAP(ID_AA64ISAR1_EL1, I8MM, IMP, CAP_HWCAP, KERNEL_HWCAP_I8MM),
HWCAP_CAP(ID_AA64ISAR2_EL1, LUT, IMP, CAP_HWCAP, KERNEL_HWCAP_LUT),
HWCAP_CAP(ID_AA64ISAR3_EL1, FAMINMAX, IMP, CAP_HWCAP, KERNEL_HWCAP_FAMINMAX),
+ HWCAP_CAP(ID_AA64ISAR3_EL1, LSFE, IMP, CAP_HWCAP, KERNEL_HWCAP_LSFE),
HWCAP_CAP(ID_AA64MMFR2_EL1, AT, IMP, CAP_HWCAP, KERNEL_HWCAP_USCAT),
#ifdef CONFIG_ARM64_SVE
HWCAP_CAP(ID_AA64PFR0_EL1, SVE, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE),
@@ -3923,6 +3960,7 @@ void __init setup_system_features(void)
{
setup_system_capabilities();
+ linear_map_maybe_split_to_ptes();
kpti_install_ng_mappings();
sve_setup();
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index ba834909a28b..c44e6d94f5de 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -162,6 +162,7 @@ static const char *const hwcap_str[] = {
[KERNEL_HWCAP_SME_SMOP4] = "smesmop4",
[KERNEL_HWCAP_MTE_FAR] = "mtefar",
[KERNEL_HWCAP_MTE_STORE_ONLY] = "mtestoreonly",
+ [KERNEL_HWCAP_LSFE] = "lsfe",
};
#ifdef CONFIG_COMPAT
diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c
index 110d9ff54174..29307642f4c9 100644
--- a/arch/arm64/kernel/debug-monitors.c
+++ b/arch/arm64/kernel/debug-monitors.c
@@ -167,7 +167,7 @@ static void send_user_sigtrap(int si_code)
if (WARN_ON(!user_mode(regs)))
return;
- if (interrupts_enabled(regs))
+ if (!regs_irqs_disabled(regs))
local_irq_enable();
arm64_force_sig_fault(SIGTRAP, si_code, instruction_pointer(regs),
@@ -212,7 +212,7 @@ static int call_el1_break_hook(struct pt_regs *regs, unsigned long esr)
if (esr_brk_comment(esr) == BUG_BRK_IMM)
return bug_brk_handler(regs, esr);
- if (IS_ENABLED(CONFIG_CFI_CLANG) && esr_is_cfi_brk(esr))
+ if (IS_ENABLED(CONFIG_CFI) && esr_is_cfi_brk(esr))
return cfi_brk_handler(regs, esr);
if (esr_brk_comment(esr) == FAULT_BRK_IMM)
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index 2b0c5925502e..f546a914f041 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -6,6 +6,7 @@
*/
#include <linux/context_tracking.h>
+#include <linux/irq-entry-common.h>
#include <linux/kasan.h>
#include <linux/linkage.h>
#include <linux/livepatch.h>
@@ -37,29 +38,20 @@
* This is intended to match the logic in irqentry_enter(), handling the kernel
* mode transitions only.
*/
-static __always_inline void __enter_from_kernel_mode(struct pt_regs *regs)
+static __always_inline irqentry_state_t __enter_from_kernel_mode(struct pt_regs *regs)
{
- regs->exit_rcu = false;
-
- if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
- lockdep_hardirqs_off(CALLER_ADDR0);
- ct_irq_enter();
- trace_hardirqs_off_finish();
-
- regs->exit_rcu = true;
- return;
- }
-
- lockdep_hardirqs_off(CALLER_ADDR0);
- rcu_irq_enter_check_tick();
- trace_hardirqs_off_finish();
+ return irqentry_enter(regs);
}
-static void noinstr enter_from_kernel_mode(struct pt_regs *regs)
+static noinstr irqentry_state_t enter_from_kernel_mode(struct pt_regs *regs)
{
- __enter_from_kernel_mode(regs);
+ irqentry_state_t state;
+
+ state = __enter_from_kernel_mode(regs);
mte_check_tfsr_entry();
mte_disable_tco_entry(current);
+
+ return state;
}
/*
@@ -70,30 +62,17 @@ static void noinstr enter_from_kernel_mode(struct pt_regs *regs)
* This is intended to match the logic in irqentry_exit(), handling the kernel
* mode transitions only, and with preemption handled elsewhere.
*/
-static __always_inline void __exit_to_kernel_mode(struct pt_regs *regs)
-{
- lockdep_assert_irqs_disabled();
-
- if (interrupts_enabled(regs)) {
- if (regs->exit_rcu) {
- trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare();
- ct_irq_exit();
- lockdep_hardirqs_on(CALLER_ADDR0);
- return;
- }
-
- trace_hardirqs_on();
- } else {
- if (regs->exit_rcu)
- ct_irq_exit();
- }
+static __always_inline void __exit_to_kernel_mode(struct pt_regs *regs,
+ irqentry_state_t state)
+{
+ irqentry_exit(regs, state);
}
-static void noinstr exit_to_kernel_mode(struct pt_regs *regs)
+static void noinstr exit_to_kernel_mode(struct pt_regs *regs,
+ irqentry_state_t state)
{
mte_check_tfsr_exit();
- __exit_to_kernel_mode(regs);
+ __exit_to_kernel_mode(regs, state);
}
/*
@@ -101,18 +80,15 @@ static void noinstr exit_to_kernel_mode(struct pt_regs *regs)
* Before this function is called it is not safe to call regular kernel code,
* instrumentable code, or any code which may trigger an exception.
*/
-static __always_inline void __enter_from_user_mode(void)
+static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
{
- lockdep_hardirqs_off(CALLER_ADDR0);
- CT_WARN_ON(ct_state() != CT_STATE_USER);
- user_exit_irqoff();
- trace_hardirqs_off_finish();
+ enter_from_user_mode(regs);
mte_disable_tco_entry(current);
}
-static __always_inline void enter_from_user_mode(struct pt_regs *regs)
+static __always_inline void arm64_enter_from_user_mode(struct pt_regs *regs)
{
- __enter_from_user_mode();
+ __enter_from_user_mode(regs);
}
/*
@@ -120,113 +96,19 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)
* After this function returns it is not safe to call regular kernel code,
* instrumentable code, or any code which may trigger an exception.
*/
-static __always_inline void __exit_to_user_mode(void)
-{
- trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare();
- user_enter_irqoff();
- lockdep_hardirqs_on(CALLER_ADDR0);
-}
-
-static void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags)
-{
- do {
- local_irq_enable();
-
- if (thread_flags & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
- schedule();
-
- if (thread_flags & _TIF_UPROBE)
- uprobe_notify_resume(regs);
-
- if (thread_flags & _TIF_MTE_ASYNC_FAULT) {
- clear_thread_flag(TIF_MTE_ASYNC_FAULT);
- send_sig_fault(SIGSEGV, SEGV_MTEAERR,
- (void __user *)NULL, current);
- }
-
- if (thread_flags & _TIF_PATCH_PENDING)
- klp_update_patch_state(current);
- if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
- do_signal(regs);
-
- if (thread_flags & _TIF_NOTIFY_RESUME)
- resume_user_mode_work(regs);
-
- if (thread_flags & _TIF_FOREIGN_FPSTATE)
- fpsimd_restore_current_state();
-
- local_irq_disable();
- thread_flags = read_thread_flags();
- } while (thread_flags & _TIF_WORK_MASK);
-}
-
-static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
+static __always_inline void arm64_exit_to_user_mode(struct pt_regs *regs)
{
- unsigned long flags;
-
local_irq_disable();
-
- flags = read_thread_flags();
- if (unlikely(flags & _TIF_WORK_MASK))
- do_notify_resume(regs, flags);
-
- local_daif_mask();
-
- lockdep_sys_exit();
-}
-
-static __always_inline void exit_to_user_mode(struct pt_regs *regs)
-{
exit_to_user_mode_prepare(regs);
+ local_daif_mask();
mte_check_tfsr_exit();
- __exit_to_user_mode();
+ exit_to_user_mode();
}
asmlinkage void noinstr asm_exit_to_user_mode(struct pt_regs *regs)
{
- exit_to_user_mode(regs);
-}
-
-/*
- * Handle IRQ/context state management when entering an NMI from user/kernel
- * mode. Before this function is called it is not safe to call regular kernel
- * code, instrumentable code, or any code which may trigger an exception.
- */
-static void noinstr arm64_enter_nmi(struct pt_regs *regs)
-{
- regs->lockdep_hardirqs = lockdep_hardirqs_enabled();
-
- __nmi_enter();
- lockdep_hardirqs_off(CALLER_ADDR0);
- lockdep_hardirq_enter();
- ct_nmi_enter();
-
- trace_hardirqs_off_finish();
- ftrace_nmi_enter();
-}
-
-/*
- * Handle IRQ/context state management when exiting an NMI from user/kernel
- * mode. After this function returns it is not safe to call regular kernel
- * code, instrumentable code, or any code which may trigger an exception.
- */
-static void noinstr arm64_exit_nmi(struct pt_regs *regs)
-{
- bool restore = regs->lockdep_hardirqs;
-
- ftrace_nmi_exit();
- if (restore) {
- trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare();
- }
-
- ct_nmi_exit();
- lockdep_hardirq_exit();
- if (restore)
- lockdep_hardirqs_on(CALLER_ADDR0);
- __nmi_exit();
+ arm64_exit_to_user_mode(regs);
}
/*
@@ -234,14 +116,18 @@ static void noinstr arm64_exit_nmi(struct pt_regs *regs)
* kernel mode. Before this function is called it is not safe to call regular
* kernel code, instrumentable code, or any code which may trigger an exception.
*/
-static void noinstr arm64_enter_el1_dbg(struct pt_regs *regs)
+static noinstr irqentry_state_t arm64_enter_el1_dbg(struct pt_regs *regs)
{
- regs->lockdep_hardirqs = lockdep_hardirqs_enabled();
+ irqentry_state_t state;
+
+ state.lockdep = lockdep_hardirqs_enabled();
lockdep_hardirqs_off(CALLER_ADDR0);
ct_nmi_enter();
trace_hardirqs_off_finish();
+
+ return state;
}
/*
@@ -249,62 +135,19 @@ static void noinstr arm64_enter_el1_dbg(struct pt_regs *regs)
* kernel mode. After this function returns it is not safe to call regular
* kernel code, instrumentable code, or any code which may trigger an exception.
*/
-static void noinstr arm64_exit_el1_dbg(struct pt_regs *regs)
+static void noinstr arm64_exit_el1_dbg(struct pt_regs *regs,
+ irqentry_state_t state)
{
- bool restore = regs->lockdep_hardirqs;
-
- if (restore) {
+ if (state.lockdep) {
trace_hardirqs_on_prepare();
lockdep_hardirqs_on_prepare();
}
ct_nmi_exit();
- if (restore)
+ if (state.lockdep)
lockdep_hardirqs_on(CALLER_ADDR0);
}
-#ifdef CONFIG_PREEMPT_DYNAMIC
-DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
-#define need_irq_preemption() \
- (static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
-#else
-#define need_irq_preemption() (IS_ENABLED(CONFIG_PREEMPTION))
-#endif
-
-static void __sched arm64_preempt_schedule_irq(void)
-{
- if (!need_irq_preemption())
- return;
-
- /*
- * Note: thread_info::preempt_count includes both thread_info::count
- * and thread_info::need_resched, and is not equivalent to
- * preempt_count().
- */
- if (READ_ONCE(current_thread_info()->preempt_count) != 0)
- return;
-
- /*
- * DAIF.DA are cleared at the start of IRQ/FIQ handling, and when GIC
- * priority masking is used the GIC irqchip driver will clear DAIF.IF
- * using gic_arch_enable_irqs() for normal IRQs. If anything is set in
- * DAIF we must have handled an NMI, so skip preemption.
- */
- if (system_uses_irq_prio_masking() && read_sysreg(daif))
- return;
-
- /*
- * Preempting a task from an IRQ means we leave copies of PSTATE
- * on the stack. cpufeature's enable calls may modify PSTATE, but
- * resuming one of these preempted tasks would undo those changes.
- *
- * Only allow a task to be preempted once cpufeatures have been
- * enabled.
- */
- if (system_capabilities_finalized())
- preempt_schedule_irq();
-}
-
static void do_interrupt_handler(struct pt_regs *regs,
void (*handler)(struct pt_regs *))
{
@@ -324,7 +167,7 @@ extern void (*handle_arch_fiq)(struct pt_regs *);
static void noinstr __panic_unhandled(struct pt_regs *regs, const char *vector,
unsigned long esr)
{
- arm64_enter_nmi(regs);
+ irqentry_nmi_enter(regs);
console_verbose();
@@ -475,73 +318,87 @@ UNHANDLED(el1t, 64, error)
static void noinstr el1_abort(struct pt_regs *regs, unsigned long esr)
{
unsigned long far = read_sysreg(far_el1);
+ irqentry_state_t state;
- enter_from_kernel_mode(regs);
+ state = enter_from_kernel_mode(regs);
local_daif_inherit(regs);
do_mem_abort(far, esr, regs);
local_daif_mask();
- exit_to_kernel_mode(regs);
+ exit_to_kernel_mode(regs, state);
}
static void noinstr el1_pc(struct pt_regs *regs, unsigned long esr)
{
unsigned long far = read_sysreg(far_el1);
+ irqentry_state_t state;
- enter_from_kernel_mode(regs);
+ state = enter_from_kernel_mode(regs);
local_daif_inherit(regs);
do_sp_pc_abort(far, esr, regs);
local_daif_mask();
- exit_to_kernel_mode(regs);
+ exit_to_kernel_mode(regs, state);
}
static void noinstr el1_undef(struct pt_regs *regs, unsigned long esr)
{
- enter_from_kernel_mode(regs);
+ irqentry_state_t state;
+
+ state = enter_from_kernel_mode(regs);
local_daif_inherit(regs);
do_el1_undef(regs, esr);
local_daif_mask();
- exit_to_kernel_mode(regs);
+ exit_to_kernel_mode(regs, state);
}
static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr)
{
- enter_from_kernel_mode(regs);
+ irqentry_state_t state;
+
+ state = enter_from_kernel_mode(regs);
local_daif_inherit(regs);
do_el1_bti(regs, esr);
local_daif_mask();
- exit_to_kernel_mode(regs);
+ exit_to_kernel_mode(regs, state);
}
static void noinstr el1_gcs(struct pt_regs *regs, unsigned long esr)
{
- enter_from_kernel_mode(regs);
+ irqentry_state_t state;
+
+ state = enter_from_kernel_mode(regs);
local_daif_inherit(regs);
do_el1_gcs(regs, esr);
local_daif_mask();
- exit_to_kernel_mode(regs);
+ exit_to_kernel_mode(regs, state);
}
static void noinstr el1_mops(struct pt_regs *regs, unsigned long esr)
{
- enter_from_kernel_mode(regs);
+ irqentry_state_t state;
+
+ state = enter_from_kernel_mode(regs);
local_daif_inherit(regs);
do_el1_mops(regs, esr);
local_daif_mask();
- exit_to_kernel_mode(regs);
+ exit_to_kernel_mode(regs, state);
}
static void noinstr el1_breakpt(struct pt_regs *regs, unsigned long esr)
{
- arm64_enter_el1_dbg(regs);
+ irqentry_state_t state;
+
+ state = arm64_enter_el1_dbg(regs);
debug_exception_enter(regs);
do_breakpoint(esr, regs);
debug_exception_exit(regs);
- arm64_exit_el1_dbg(regs);
+ arm64_exit_el1_dbg(regs, state);
}
static void noinstr el1_softstp(struct pt_regs *regs, unsigned long esr)
{
- arm64_enter_el1_dbg(regs);
+ irqentry_state_t state;
+
+ state = arm64_enter_el1_dbg(regs);
if (!cortex_a76_erratum_1463225_debug_handler(regs)) {
debug_exception_enter(regs);
/*
@@ -554,37 +411,42 @@ static void noinstr el1_softstp(struct pt_regs *regs, unsigned long esr)
do_el1_softstep(esr, regs);
debug_exception_exit(regs);
}
- arm64_exit_el1_dbg(regs);
+ arm64_exit_el1_dbg(regs, state);
}
static void noinstr el1_watchpt(struct pt_regs *regs, unsigned long esr)
{
/* Watchpoints are the only debug exception to write FAR_EL1 */
unsigned long far = read_sysreg(far_el1);
+ irqentry_state_t state;
- arm64_enter_el1_dbg(regs);
+ state = arm64_enter_el1_dbg(regs);
debug_exception_enter(regs);
do_watchpoint(far, esr, regs);
debug_exception_exit(regs);
- arm64_exit_el1_dbg(regs);
+ arm64_exit_el1_dbg(regs, state);
}
static void noinstr el1_brk64(struct pt_regs *regs, unsigned long esr)
{
- arm64_enter_el1_dbg(regs);
+ irqentry_state_t state;
+
+ state = arm64_enter_el1_dbg(regs);
debug_exception_enter(regs);
do_el1_brk64(esr, regs);
debug_exception_exit(regs);
- arm64_exit_el1_dbg(regs);
+ arm64_exit_el1_dbg(regs, state);
}
static void noinstr el1_fpac(struct pt_regs *regs, unsigned long esr)
{
- enter_from_kernel_mode(regs);
+ irqentry_state_t state;
+
+ state = enter_from_kernel_mode(regs);
local_daif_inherit(regs);
do_el1_fpac(regs, esr);
local_daif_mask();
- exit_to_kernel_mode(regs);
+ exit_to_kernel_mode(regs, state);
}
asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs)
@@ -639,30 +501,32 @@ asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs)
static __always_inline void __el1_pnmi(struct pt_regs *regs,
void (*handler)(struct pt_regs *))
{
- arm64_enter_nmi(regs);
+ irqentry_state_t state;
+
+ state = irqentry_nmi_enter(regs);
do_interrupt_handler(regs, handler);
- arm64_exit_nmi(regs);
+ irqentry_nmi_exit(regs, state);
}
static __always_inline void __el1_irq(struct pt_regs *regs,
void (*handler)(struct pt_regs *))
{
- enter_from_kernel_mode(regs);
+ irqentry_state_t state;
+
+ state = enter_from_kernel_mode(regs);
irq_enter_rcu();
do_interrupt_handler(regs, handler);
irq_exit_rcu();
- arm64_preempt_schedule_irq();
-
- exit_to_kernel_mode(regs);
+ exit_to_kernel_mode(regs, state);
}
static void noinstr el1_interrupt(struct pt_regs *regs,
void (*handler)(struct pt_regs *))
{
write_sysreg(DAIF_PROCCTX_NOIRQ, daif);
- if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && !interrupts_enabled(regs))
+ if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && regs_irqs_disabled(regs))
__el1_pnmi(regs, handler);
else
__el1_irq(regs, handler);
@@ -681,21 +545,22 @@ asmlinkage void noinstr el1h_64_fiq_handler(struct pt_regs *regs)
asmlinkage void noinstr el1h_64_error_handler(struct pt_regs *regs)
{
unsigned long esr = read_sysreg(esr_el1);
+ irqentry_state_t state;
local_daif_restore(DAIF_ERRCTX);
- arm64_enter_nmi(regs);
+ state = irqentry_nmi_enter(regs);
do_serror(regs, esr);
- arm64_exit_nmi(regs);
+ irqentry_nmi_exit(regs, state);
}
static void noinstr el0_da(struct pt_regs *regs, unsigned long esr)
{
unsigned long far = read_sysreg(far_el1);
- enter_from_user_mode(regs);
+ arm64_enter_from_user_mode(regs);
local_daif_restore(DAIF_PROCCTX);
do_mem_abort(far, esr, regs);
- exit_to_user_mode(regs);
+ arm64_exit_to_user_mode(regs);
}
static void noinstr el0_ia(struct pt_regs *regs, unsigned long esr)
@@ -710,50 +575,50 @@ static void noinstr el0_ia(struct pt_regs *regs, unsigned long esr)
if (!is_ttbr0_addr(far))
arm64_apply_bp_hardening();
- enter_from_user_mode(regs);
+ arm64_enter_from_user_mode(regs);
local_daif_restore(DAIF_PROCCTX);
do_mem_abort(far, esr, regs);
- exit_to_user_mode(regs);
+ arm64_exit_to_user_mode(regs);
}
static void noinstr el0_fpsimd_acc(struct pt_regs *regs, unsigned long esr)
{
- enter_from_user_mode(regs);
+ arm64_enter_from_user_mode(regs);
local_daif_restore(DAIF_PROCCTX);
do_fpsimd_acc(esr, regs);
- exit_to_user_mode(regs);
+ arm64_exit_to_user_mode(regs);
}
static void noinstr el0_sve_acc(struct pt_regs *regs, unsigned long esr)
{
- enter_from_user_mode(regs);
+ arm64_enter_from_user_mode(regs);
local_daif_restore(DAIF_PROCCTX);
do_sve_acc(esr, regs);
- exit_to_user_mode(regs);
+ arm64_exit_to_user_mode(regs);
}
static void noinstr el0_sme_acc(struct pt_regs *regs, unsigned long esr)
{
- enter_from_user_mode(regs);
+ arm64_enter_from_user_mode(regs);
local_daif_restore(DAIF_PROCCTX);
do_sme_acc(esr, regs);
- exit_to_user_mode(regs);
+ arm64_exit_to_user_mode(regs);
}
static void noinstr el0_fpsimd_exc(struct pt_regs *regs, unsigned long esr)
{
- enter_from_user_mode(regs);
+ arm64_enter_from_user_mode(regs);
local_daif_restore(DAIF_PROCCTX);
do_fpsimd_exc(esr, regs);
- exit_to_user_mode(regs);
+ arm64_exit_to_user_mode(regs);
}
static void noinstr el0_sys(struct pt_regs *regs, unsigned long esr)
{
- enter_from_user_mode(regs);
+ arm64_enter_from_user_mode(regs);
local_daif_restore(DAIF_PROCCTX);
do_el0_sys(esr, regs);
- exit_to_user_mode(regs);
+ arm64_exit_to_user_mode(regs);
}
static void noinstr el0_pc(struct pt_regs *regs, unsigned long esr)
@@ -763,58 +628,58 @@ static void noinstr el0_pc(struct pt_regs *regs, unsigned long esr)
if (!is_ttbr0_addr(instruction_pointer(regs)))
arm64_apply_bp_hardening();
- enter_from_user_mode(regs);
+ arm64_enter_from_user_mode(regs);
local_daif_restore(DAIF_PROCCTX);
do_sp_pc_abort(far, esr, regs);
- exit_to_user_mode(regs);
+ arm64_exit_to_user_mode(regs);
}
static void noinstr el0_sp(struct pt_regs *regs, unsigned long esr)
{
- enter_from_user_mode(regs);
+ arm64_enter_from_user_mode(regs);
local_daif_restore(DAIF_PROCCTX);
do_sp_pc_abort(regs->sp, esr, regs);
- exit_to_user_mode(regs);
+ arm64_exit_to_user_mode(regs);
}
static void noinstr el0_undef(struct pt_regs *regs, unsigned long esr)
{
- enter_from_user_mode(regs);
+ arm64_enter_from_user_mode(regs);
local_daif_restore(DAIF_PROCCTX);
do_el0_undef(regs, esr);
- exit_to_user_mode(regs);
+ arm64_exit_to_user_mode(regs);
}
static void noinstr el0_bti(struct pt_regs *regs)
{
- enter_from_user_mode(regs);
+ arm64_enter_from_user_mode(regs);
local_daif_restore(DAIF_PROCCTX);
do_el0_bti(regs);
- exit_to_user_mode(regs);
+ arm64_exit_to_user_mode(regs);
}
static void noinstr el0_mops(struct pt_regs *regs, unsigned long esr)
{
- enter_from_user_mode(regs);
+ arm64_enter_from_user_mode(regs);
local_daif_restore(DAIF_PROCCTX);
do_el0_mops(regs, esr);
- exit_to_user_mode(regs);
+ arm64_exit_to_user_mode(regs);
}
static void noinstr el0_gcs(struct pt_regs *regs, unsigned long esr)
{
- enter_from_user_mode(regs);
+ arm64_enter_from_user_mode(regs);
local_daif_restore(DAIF_PROCCTX);
do_el0_gcs(regs, esr);
- exit_to_user_mode(regs);
+ arm64_exit_to_user_mode(regs);
}
static void noinstr el0_inv(struct pt_regs *regs, unsigned long esr)
{
- enter_from_user_mode(regs);
+ arm64_enter_from_user_mode(regs);
local_daif_restore(DAIF_PROCCTX);
bad_el0_sync(regs, 0, esr);
- exit_to_user_mode(regs);
+ arm64_exit_to_user_mode(regs);
}
static void noinstr el0_breakpt(struct pt_regs *regs, unsigned long esr)
@@ -822,12 +687,12 @@ static void noinstr el0_breakpt(struct pt_regs *regs, unsigned long esr)
if (!is_ttbr0_addr(regs->pc))
arm64_apply_bp_hardening();
- enter_from_user_mode(regs);
+ arm64_enter_from_user_mode(regs);
debug_exception_enter(regs);
do_breakpoint(esr, regs);
debug_exception_exit(regs);
local_daif_restore(DAIF_PROCCTX);
- exit_to_user_mode(regs);
+ arm64_exit_to_user_mode(regs);
}
static void noinstr el0_softstp(struct pt_regs *regs, unsigned long esr)
@@ -835,7 +700,7 @@ static void noinstr el0_softstp(struct pt_regs *regs, unsigned long esr)
if (!is_ttbr0_addr(regs->pc))
arm64_apply_bp_hardening();
- enter_from_user_mode(regs);
+ arm64_enter_from_user_mode(regs);
/*
* After handling a breakpoint, we suspend the breakpoint
* and use single-step to move to the next instruction.
@@ -846,7 +711,7 @@ static void noinstr el0_softstp(struct pt_regs *regs, unsigned long esr)
local_daif_restore(DAIF_PROCCTX);
do_el0_softstep(esr, regs);
}
- exit_to_user_mode(regs);
+ arm64_exit_to_user_mode(regs);
}
static void noinstr el0_watchpt(struct pt_regs *regs, unsigned long esr)
@@ -854,39 +719,39 @@ static void noinstr el0_watchpt(struct pt_regs *regs, unsigned long esr)
/* Watchpoints are the only debug exception to write FAR_EL1 */
unsigned long far = read_sysreg(far_el1);
- enter_from_user_mode(regs);
+ arm64_enter_from_user_mode(regs);
debug_exception_enter(regs);
do_watchpoint(far, esr, regs);
debug_exception_exit(regs);
local_daif_restore(DAIF_PROCCTX);
- exit_to_user_mode(regs);
+ arm64_exit_to_user_mode(regs);
}
static void noinstr el0_brk64(struct pt_regs *regs, unsigned long esr)
{
- enter_from_user_mode(regs);
+ arm64_enter_from_user_mode(regs);
local_daif_restore(DAIF_PROCCTX);
do_el0_brk64(esr, regs);
- exit_to_user_mode(regs);
+ arm64_exit_to_user_mode(regs);
}
static void noinstr el0_svc(struct pt_regs *regs)
{
- enter_from_user_mode(regs);
+ arm64_enter_from_user_mode(regs);
cortex_a76_erratum_1463225_svc_handler();
fpsimd_syscall_enter();
local_daif_restore(DAIF_PROCCTX);
do_el0_svc(regs);
- exit_to_user_mode(regs);
+ arm64_exit_to_user_mode(regs);
fpsimd_syscall_exit();
}
static void noinstr el0_fpac(struct pt_regs *regs, unsigned long esr)
{
- enter_from_user_mode(regs);
+ arm64_enter_from_user_mode(regs);
local_daif_restore(DAIF_PROCCTX);
do_el0_fpac(regs, esr);
- exit_to_user_mode(regs);
+ arm64_exit_to_user_mode(regs);
}
asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs)
@@ -960,7 +825,7 @@ asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs)
static void noinstr el0_interrupt(struct pt_regs *regs,
void (*handler)(struct pt_regs *))
{
- enter_from_user_mode(regs);
+ arm64_enter_from_user_mode(regs);
write_sysreg(DAIF_PROCCTX_NOIRQ, daif);
@@ -971,7 +836,7 @@ static void noinstr el0_interrupt(struct pt_regs *regs,
do_interrupt_handler(regs, handler);
irq_exit_rcu();
- exit_to_user_mode(regs);
+ arm64_exit_to_user_mode(regs);
}
static void noinstr __el0_irq_handler_common(struct pt_regs *regs)
@@ -997,14 +862,15 @@ asmlinkage void noinstr el0t_64_fiq_handler(struct pt_regs *regs)
static void noinstr __el0_error_handler_common(struct pt_regs *regs)
{
unsigned long esr = read_sysreg(esr_el1);
+ irqentry_state_t state;
- enter_from_user_mode(regs);
+ arm64_enter_from_user_mode(regs);
local_daif_restore(DAIF_ERRCTX);
- arm64_enter_nmi(regs);
+ state = irqentry_nmi_enter(regs);
do_serror(regs, esr);
- arm64_exit_nmi(regs);
+ irqentry_nmi_exit(regs, state);
local_daif_restore(DAIF_PROCCTX);
- exit_to_user_mode(regs);
+ arm64_exit_to_user_mode(regs);
}
asmlinkage void noinstr el0t_64_error_handler(struct pt_regs *regs)
@@ -1015,27 +881,27 @@ asmlinkage void noinstr el0t_64_error_handler(struct pt_regs *regs)
#ifdef CONFIG_COMPAT
static void noinstr el0_cp15(struct pt_regs *regs, unsigned long esr)
{
- enter_from_user_mode(regs);
+ arm64_enter_from_user_mode(regs);
local_daif_restore(DAIF_PROCCTX);
do_el0_cp15(esr, regs);
- exit_to_user_mode(regs);
+ arm64_exit_to_user_mode(regs);
}
static void noinstr el0_svc_compat(struct pt_regs *regs)
{
- enter_from_user_mode(regs);
+ arm64_enter_from_user_mode(regs);
cortex_a76_erratum_1463225_svc_handler();
local_daif_restore(DAIF_PROCCTX);
do_el0_svc_compat(regs);
- exit_to_user_mode(regs);
+ arm64_exit_to_user_mode(regs);
}
static void noinstr el0_bkpt32(struct pt_regs *regs, unsigned long esr)
{
- enter_from_user_mode(regs);
+ arm64_enter_from_user_mode(regs);
local_daif_restore(DAIF_PROCCTX);
do_bkpt32(esr, regs);
- exit_to_user_mode(regs);
+ arm64_exit_to_user_mode(regs);
}
asmlinkage void noinstr el0t_32_sync_handler(struct pt_regs *regs)
@@ -1114,7 +980,7 @@ asmlinkage void noinstr __noreturn handle_bad_stack(struct pt_regs *regs)
unsigned long esr = read_sysreg(esr_el1);
unsigned long far = read_sysreg(far_el1);
- arm64_enter_nmi(regs);
+ irqentry_nmi_enter(regs);
panic_bad_stack(regs, esr, far);
}
@@ -1122,6 +988,7 @@ asmlinkage void noinstr __noreturn handle_bad_stack(struct pt_regs *regs)
asmlinkage noinstr unsigned long
__sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg)
{
+ irqentry_state_t state;
unsigned long ret;
/*
@@ -1146,9 +1013,9 @@ __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg)
else if (cpu_has_pan())
set_pstate_pan(0);
- arm64_enter_nmi(regs);
+ state = irqentry_nmi_enter(regs);
ret = do_sdei_event(regs, arg);
- arm64_exit_nmi(regs);
+ irqentry_nmi_exit(regs, state);
return ret;
}
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index c37f02d7194e..e3f8f51748bc 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -1265,6 +1265,8 @@ void __init sme_setup(void)
if (!system_supports_sme())
return;
+ min_bit = find_last_bit(info->vq_map, SVE_VQ_MAX);
+
/*
* SME doesn't require any particular vector length be
* supported but it does require at least one. We should have
@@ -1272,9 +1274,8 @@ void __init sme_setup(void)
* let's double check here. The bitmap is SVE_VQ_MAP sized for
* sharing with SVE.
*/
- WARN_ON(bitmap_empty(info->vq_map, SVE_VQ_MAX));
+ WARN_ON(min_bit >= SVE_VQ_MAX);
- min_bit = find_last_bit(info->vq_map, SVE_VQ_MAX);
info->min_vl = sve_vl_from_vq(__bit_to_vq(min_bit));
max_bit = find_first_bit(info->vq_map, SVE_VQ_MAX);
diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
index 5a890714ee2e..5adad37ab4fa 100644
--- a/arch/arm64/kernel/ftrace.c
+++ b/arch/arm64/kernel/ftrace.c
@@ -258,10 +258,17 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
return ftrace_modify_code(pc, 0, new, false);
}
-static struct plt_entry *get_ftrace_plt(struct module *mod)
+static struct plt_entry *get_ftrace_plt(struct module *mod, unsigned long addr)
{
#ifdef CONFIG_MODULES
- struct plt_entry *plt = mod->arch.ftrace_trampolines;
+ struct plt_entry *plt = NULL;
+
+ if (within_module_mem_type(addr, mod, MOD_INIT_TEXT))
+ plt = mod->arch.init_ftrace_trampolines;
+ else if (within_module_mem_type(addr, mod, MOD_TEXT))
+ plt = mod->arch.ftrace_trampolines;
+ else
+ return NULL;
return &plt[FTRACE_PLT_IDX];
#else
@@ -332,7 +339,7 @@ static bool ftrace_find_callable_addr(struct dyn_ftrace *rec,
if (WARN_ON(!mod))
return false;
- plt = get_ftrace_plt(mod);
+ plt = get_ftrace_plt(mod, pc);
if (!plt) {
pr_err("ftrace: no module PLT for %ps\n", (void *)*addr);
return false;
diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c
index af1ca875c52c..410060ebd86d 100644
--- a/arch/arm64/kernel/machine_kexec_file.c
+++ b/arch/arm64/kernel/machine_kexec_file.c
@@ -94,7 +94,7 @@ int load_other_segments(struct kimage *image,
char *initrd, unsigned long initrd_len,
char *cmdline)
{
- struct kexec_buf kbuf;
+ struct kexec_buf kbuf = {};
void *dtb = NULL;
unsigned long initrd_load_addr = 0, dtb_len,
orig_segments = image->nr_segments;
diff --git a/arch/arm64/kernel/module-plts.c b/arch/arm64/kernel/module-plts.c
index bde32979c06a..7afd370da9f4 100644
--- a/arch/arm64/kernel/module-plts.c
+++ b/arch/arm64/kernel/module-plts.c
@@ -283,7 +283,7 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
unsigned long core_plts = 0;
unsigned long init_plts = 0;
Elf64_Sym *syms = NULL;
- Elf_Shdr *pltsec, *tramp = NULL;
+ Elf_Shdr *pltsec, *tramp = NULL, *init_tramp = NULL;
int i;
/*
@@ -298,6 +298,9 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
else if (!strcmp(secstrings + sechdrs[i].sh_name,
".text.ftrace_trampoline"))
tramp = sechdrs + i;
+ else if (!strcmp(secstrings + sechdrs[i].sh_name,
+ ".init.text.ftrace_trampoline"))
+ init_tramp = sechdrs + i;
else if (sechdrs[i].sh_type == SHT_SYMTAB)
syms = (Elf64_Sym *)sechdrs[i].sh_addr;
}
@@ -363,5 +366,12 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
tramp->sh_size = NR_FTRACE_PLTS * sizeof(struct plt_entry);
}
+ if (init_tramp) {
+ init_tramp->sh_type = SHT_NOBITS;
+ init_tramp->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
+ init_tramp->sh_addralign = __alignof__(struct plt_entry);
+ init_tramp->sh_size = NR_FTRACE_PLTS * sizeof(struct plt_entry);
+ }
+
return 0;
}
diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
index 40148d2725ce..d6d443c4a01a 100644
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -466,6 +466,17 @@ static int module_init_ftrace_plt(const Elf_Ehdr *hdr,
__init_plt(&plts[FTRACE_PLT_IDX], FTRACE_ADDR);
mod->arch.ftrace_trampolines = plts;
+
+ s = find_section(hdr, sechdrs, ".init.text.ftrace_trampoline");
+ if (!s)
+ return -ENOEXEC;
+
+ plts = (void *)s->sh_addr;
+
+ __init_plt(&plts[FTRACE_PLT_IDX], FTRACE_ADDR);
+
+ mod->arch.init_ftrace_trampolines = plts;
+
#endif
return 0;
}
diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c
index 0f4bd7771859..e8ddbde31a83 100644
--- a/arch/arm64/kernel/pi/map_kernel.c
+++ b/arch/arm64/kernel/pi/map_kernel.c
@@ -18,9 +18,9 @@
extern const u8 __eh_frame_start[], __eh_frame_end[];
-extern void idmap_cpu_replace_ttbr1(void *pgdir);
+extern void idmap_cpu_replace_ttbr1(phys_addr_t pgdir);
-static void __init map_segment(pgd_t *pg_dir, u64 *pgd, u64 va_offset,
+static void __init map_segment(pgd_t *pg_dir, phys_addr_t *pgd, u64 va_offset,
void *start, void *end, pgprot_t prot,
bool may_use_cont, int root_level)
{
@@ -40,7 +40,7 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level)
{
bool enable_scs = IS_ENABLED(CONFIG_UNWIND_PATCH_PAC_INTO_SCS);
bool twopass = IS_ENABLED(CONFIG_RELOCATABLE);
- u64 pgdp = (u64)init_pg_dir + PAGE_SIZE;
+ phys_addr_t pgdp = (phys_addr_t)init_pg_dir + PAGE_SIZE;
pgprot_t text_prot = PAGE_KERNEL_ROX;
pgprot_t data_prot = PAGE_KERNEL;
pgprot_t prot;
@@ -78,6 +78,12 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level)
twopass |= enable_scs;
prot = twopass ? data_prot : text_prot;
+ /*
+ * [_text, _stext) isn't executed after boot and contains some
+ * non-executable, unpredictable data, so map it non-executable.
+ */
+ map_segment(init_pg_dir, &pgdp, va_offset, _text, _stext, data_prot,
+ false, root_level);
map_segment(init_pg_dir, &pgdp, va_offset, _stext, _etext, prot,
!twopass, root_level);
map_segment(init_pg_dir, &pgdp, va_offset, __start_rodata,
@@ -90,7 +96,7 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level)
true, root_level);
dsb(ishst);
- idmap_cpu_replace_ttbr1(init_pg_dir);
+ idmap_cpu_replace_ttbr1((phys_addr_t)init_pg_dir);
if (twopass) {
if (IS_ENABLED(CONFIG_RELOCATABLE))
@@ -129,10 +135,10 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level)
/* Copy the root page table to its final location */
memcpy((void *)swapper_pg_dir + va_offset, init_pg_dir, PAGE_SIZE);
dsb(ishst);
- idmap_cpu_replace_ttbr1(swapper_pg_dir);
+ idmap_cpu_replace_ttbr1((phys_addr_t)swapper_pg_dir);
}
-static void noinline __section(".idmap.text") set_ttbr0_for_lpa2(u64 ttbr)
+static void noinline __section(".idmap.text") set_ttbr0_for_lpa2(phys_addr_t ttbr)
{
u64 sctlr = read_sysreg(sctlr_el1);
u64 tcr = read_sysreg(tcr_el1) | TCR_DS;
@@ -172,30 +178,30 @@ static void __init remap_idmap_for_lpa2(void)
*/
create_init_idmap(init_pg_dir, mask);
dsb(ishst);
- set_ttbr0_for_lpa2((u64)init_pg_dir);
+ set_ttbr0_for_lpa2((phys_addr_t)init_pg_dir);
/*
* Recreate the initial ID map with the same granularity as before.
* Don't bother with the FDT, we no longer need it after this.
*/
memset(init_idmap_pg_dir, 0,
- (u64)init_idmap_pg_end - (u64)init_idmap_pg_dir);
+ (char *)init_idmap_pg_end - (char *)init_idmap_pg_dir);
create_init_idmap(init_idmap_pg_dir, mask);
dsb(ishst);
/* switch back to the updated initial ID map */
- set_ttbr0_for_lpa2((u64)init_idmap_pg_dir);
+ set_ttbr0_for_lpa2((phys_addr_t)init_idmap_pg_dir);
/* wipe the temporary ID map from memory */
- memset(init_pg_dir, 0, (u64)init_pg_end - (u64)init_pg_dir);
+ memset(init_pg_dir, 0, (char *)init_pg_end - (char *)init_pg_dir);
}
-static void __init map_fdt(u64 fdt)
+static void *__init map_fdt(phys_addr_t fdt)
{
static u8 ptes[INIT_IDMAP_FDT_SIZE] __initdata __aligned(PAGE_SIZE);
- u64 efdt = fdt + MAX_FDT_SIZE;
- u64 ptep = (u64)ptes;
+ phys_addr_t efdt = fdt + MAX_FDT_SIZE;
+ phys_addr_t ptep = (phys_addr_t)ptes; /* We're idmapped when called */
/*
* Map up to MAX_FDT_SIZE bytes, but avoid overlap with
@@ -205,6 +211,8 @@ static void __init map_fdt(u64 fdt)
fdt, PAGE_KERNEL, IDMAP_ROOT_LEVEL,
(pte_t *)init_idmap_pg_dir, false, 0);
dsb(ishst);
+
+ return (void *)fdt;
}
/*
@@ -230,7 +238,7 @@ static bool __init ng_mappings_allowed(void)
return true;
}
-asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt)
+asmlinkage void __init early_map_kernel(u64 boot_status, phys_addr_t fdt)
{
static char const chosen_str[] __initconst = "/chosen";
u64 va_base, pa_base = (u64)&_text;
@@ -238,15 +246,14 @@ asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt)
int root_level = 4 - CONFIG_PGTABLE_LEVELS;
int va_bits = VA_BITS;
int chosen;
-
- map_fdt((u64)fdt);
+ void *fdt_mapped = map_fdt(fdt);
/* Clear BSS and the initial page tables */
- memset(__bss_start, 0, (u64)init_pg_end - (u64)__bss_start);
+ memset(__bss_start, 0, (char *)init_pg_end - (char *)__bss_start);
/* Parse the command line for CPU feature overrides */
- chosen = fdt_path_offset(fdt, chosen_str);
- init_feature_override(boot_status, fdt, chosen);
+ chosen = fdt_path_offset(fdt_mapped, chosen_str);
+ init_feature_override(boot_status, fdt_mapped, chosen);
if (IS_ENABLED(CONFIG_ARM64_64K_PAGES) && !cpu_has_lva()) {
va_bits = VA_BITS_MIN;
@@ -266,7 +273,7 @@ asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt)
* fill in the high bits from the seed.
*/
if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
- u64 kaslr_seed = kaslr_early_init(fdt, chosen);
+ u64 kaslr_seed = kaslr_early_init(fdt_mapped, chosen);
if (kaslr_seed && kaslr_requires_kpti())
arm64_use_ng_mappings = ng_mappings_allowed();
diff --git a/arch/arm64/kernel/pi/map_range.c b/arch/arm64/kernel/pi/map_range.c
index 7982788e7b9a..de52cd85c691 100644
--- a/arch/arm64/kernel/pi/map_range.c
+++ b/arch/arm64/kernel/pi/map_range.c
@@ -26,8 +26,9 @@
* @va_offset: Offset between a physical page and its current mapping
* in the VA space
*/
-void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot,
- int level, pte_t *tbl, bool may_use_cont, u64 va_offset)
+void __init map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa,
+ pgprot_t prot, int level, pte_t *tbl, bool may_use_cont,
+ u64 va_offset)
{
u64 cmask = (level == 3) ? CONT_PTE_SIZE - 1 : U64_MAX;
ptdesc_t protval = pgprot_val(prot) & ~PTE_TYPE_MASK;
@@ -87,19 +88,22 @@ void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot,
}
}
-asmlinkage u64 __init create_init_idmap(pgd_t *pg_dir, ptdesc_t clrmask)
+asmlinkage phys_addr_t __init create_init_idmap(pgd_t *pg_dir, ptdesc_t clrmask)
{
- u64 ptep = (u64)pg_dir + PAGE_SIZE;
+ phys_addr_t ptep = (phys_addr_t)pg_dir + PAGE_SIZE; /* MMU is off */
pgprot_t text_prot = PAGE_KERNEL_ROX;
pgprot_t data_prot = PAGE_KERNEL;
pgprot_val(text_prot) &= ~clrmask;
pgprot_val(data_prot) &= ~clrmask;
- map_range(&ptep, (u64)_stext, (u64)__initdata_begin, (u64)_stext,
- text_prot, IDMAP_ROOT_LEVEL, (pte_t *)pg_dir, false, 0);
- map_range(&ptep, (u64)__initdata_begin, (u64)_end, (u64)__initdata_begin,
- data_prot, IDMAP_ROOT_LEVEL, (pte_t *)pg_dir, false, 0);
+ /* MMU is off; pointer casts to phys_addr_t are safe */
+ map_range(&ptep, (u64)_stext, (u64)__initdata_begin,
+ (phys_addr_t)_stext, text_prot, IDMAP_ROOT_LEVEL,
+ (pte_t *)pg_dir, false, 0);
+ map_range(&ptep, (u64)__initdata_begin, (u64)_end,
+ (phys_addr_t)__initdata_begin, data_prot, IDMAP_ROOT_LEVEL,
+ (pte_t *)pg_dir, false, 0);
return ptep;
}
diff --git a/arch/arm64/kernel/pi/pi.h b/arch/arm64/kernel/pi/pi.h
index 46cafee7829f..08ef9f80456b 100644
--- a/arch/arm64/kernel/pi/pi.h
+++ b/arch/arm64/kernel/pi/pi.h
@@ -29,9 +29,10 @@ u64 kaslr_early_init(void *fdt, int chosen);
void relocate_kernel(u64 offset);
int scs_patch(const u8 eh_frame[], int size);
-void map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot,
- int level, pte_t *tbl, bool may_use_cont, u64 va_offset);
+void map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa,
+ pgprot_t prot, int level, pte_t *tbl, bool may_use_cont,
+ u64 va_offset);
-asmlinkage void early_map_kernel(u64 boot_status, void *fdt);
+asmlinkage void early_map_kernel(u64 boot_status, phys_addr_t fdt);
-asmlinkage u64 create_init_idmap(pgd_t *pgd, ptdesc_t clrmask);
+asmlinkage phys_addr_t create_init_idmap(pgd_t *pgd, ptdesc_t clrmask);
diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c
index 6438bf62e753..4137cc5ef031 100644
--- a/arch/arm64/kernel/probes/decode-insn.c
+++ b/arch/arm64/kernel/probes/decode-insn.c
@@ -108,9 +108,10 @@ arm_probe_decode_insn(u32 insn, struct arch_probe_insn *api)
aarch64_insn_is_bl(insn)) {
api->handler = simulate_b_bl;
} else if (aarch64_insn_is_br(insn) ||
- aarch64_insn_is_blr(insn) ||
- aarch64_insn_is_ret(insn)) {
- api->handler = simulate_br_blr_ret;
+ aarch64_insn_is_blr(insn)) {
+ api->handler = simulate_br_blr;
+ } else if (aarch64_insn_is_ret(insn)) {
+ api->handler = simulate_ret;
} else {
/*
* Instruction cannot be stepped out-of-line and we don't
diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c
index 4c6d2d712fbd..89fbeb32107e 100644
--- a/arch/arm64/kernel/probes/simulate-insn.c
+++ b/arch/arm64/kernel/probes/simulate-insn.c
@@ -13,6 +13,7 @@
#include <asm/traps.h>
#include "simulate-insn.h"
+#include "asm/gcs.h"
#define bbl_displacement(insn) \
sign_extend32(((insn) & 0x3ffffff) << 2, 27)
@@ -49,6 +50,21 @@ static inline u32 get_w_reg(struct pt_regs *regs, int reg)
return lower_32_bits(pt_regs_read_reg(regs, reg));
}
+static inline int update_lr(struct pt_regs *regs, long addr)
+{
+ int err = 0;
+
+ if (user_mode(regs) && task_gcs_el0_enabled(current)) {
+ push_user_gcs(addr, &err);
+ if (err) {
+ force_sig(SIGSEGV);
+ return err;
+ }
+ }
+ procedure_link_pointer_set(regs, addr);
+ return err;
+}
+
static bool __kprobes check_cbz(u32 opcode, struct pt_regs *regs)
{
int xn = opcode & 0x1f;
@@ -107,9 +123,9 @@ simulate_b_bl(u32 opcode, long addr, struct pt_regs *regs)
{
int disp = bbl_displacement(opcode);
- /* Link register is x30 */
if (opcode & (1 << 31))
- set_x_reg(regs, 30, addr + 4);
+ if (update_lr(regs, addr + 4))
+ return;
instruction_pointer_set(regs, addr + disp);
}
@@ -126,16 +142,34 @@ simulate_b_cond(u32 opcode, long addr, struct pt_regs *regs)
}
void __kprobes
-simulate_br_blr_ret(u32 opcode, long addr, struct pt_regs *regs)
+simulate_br_blr(u32 opcode, long addr, struct pt_regs *regs)
{
int xn = (opcode >> 5) & 0x1f;
+ u64 b_target = get_x_reg(regs, xn);
- /* update pc first in case we're doing a "blr lr" */
- instruction_pointer_set(regs, get_x_reg(regs, xn));
-
- /* Link register is x30 */
if (((opcode >> 21) & 0x3) == 1)
- set_x_reg(regs, 30, addr + 4);
+ if (update_lr(regs, addr + 4))
+ return;
+
+ instruction_pointer_set(regs, b_target);
+}
+
+void __kprobes
+simulate_ret(u32 opcode, long addr, struct pt_regs *regs)
+{
+ u64 ret_addr;
+ int err = 0;
+ int xn = (opcode >> 5) & 0x1f;
+ u64 r_target = get_x_reg(regs, xn);
+
+ if (user_mode(regs) && task_gcs_el0_enabled(current)) {
+ ret_addr = pop_user_gcs(&err);
+ if (err || ret_addr != r_target) {
+ force_sig(SIGSEGV);
+ return;
+ }
+ }
+ instruction_pointer_set(regs, r_target);
}
void __kprobes
diff --git a/arch/arm64/kernel/probes/simulate-insn.h b/arch/arm64/kernel/probes/simulate-insn.h
index efb2803ec943..9e772a292d56 100644
--- a/arch/arm64/kernel/probes/simulate-insn.h
+++ b/arch/arm64/kernel/probes/simulate-insn.h
@@ -11,7 +11,8 @@
void simulate_adr_adrp(u32 opcode, long addr, struct pt_regs *regs);
void simulate_b_bl(u32 opcode, long addr, struct pt_regs *regs);
void simulate_b_cond(u32 opcode, long addr, struct pt_regs *regs);
-void simulate_br_blr_ret(u32 opcode, long addr, struct pt_regs *regs);
+void simulate_br_blr(u32 opcode, long addr, struct pt_regs *regs);
+void simulate_ret(u32 opcode, long addr, struct pt_regs *regs);
void simulate_cbz_cbnz(u32 opcode, long addr, struct pt_regs *regs);
void simulate_tbz_tbnz(u32 opcode, long addr, struct pt_regs *regs);
void simulate_ldr_literal(u32 opcode, long addr, struct pt_regs *regs);
diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c
index 1f91fd2a8187..2799bdb2fb82 100644
--- a/arch/arm64/kernel/probes/uprobes.c
+++ b/arch/arm64/kernel/probes/uprobes.c
@@ -6,6 +6,7 @@
#include <linux/ptrace.h>
#include <linux/uprobes.h>
#include <asm/cacheflush.h>
+#include <asm/gcs.h>
#include "decode-insn.h"
@@ -159,11 +160,43 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr,
struct pt_regs *regs)
{
unsigned long orig_ret_vaddr;
+ unsigned long gcs_ret_vaddr;
+ int err = 0;
+ u64 gcspr;
orig_ret_vaddr = procedure_link_pointer(regs);
+
+ if (task_gcs_el0_enabled(current)) {
+ gcspr = read_sysreg_s(SYS_GCSPR_EL0);
+ gcs_ret_vaddr = get_user_gcs((__force unsigned long __user *)gcspr, &err);
+ if (err) {
+ force_sig(SIGSEGV);
+ goto out;
+ }
+
+ /*
+ * If the LR and GCS return addr don't match, then some kind of PAC
+ * signing or control flow occurred since entering the probed function.
+ * Likely because the user is attempting to retprobe on an instruction
+ * that isn't a function boundary or inside a leaf function. Explicitly
+ * abort this retprobe because it will generate a GCS exception.
+ */
+ if (gcs_ret_vaddr != orig_ret_vaddr) {
+ orig_ret_vaddr = -1;
+ goto out;
+ }
+
+ put_user_gcs(trampoline_vaddr, (__force unsigned long __user *)gcspr, &err);
+ if (err) {
+ force_sig(SIGSEGV);
+ goto out;
+ }
+ }
+
/* Replace the return addr with trampoline addr */
procedure_link_pointer_set(regs, trampoline_vaddr);
+out:
return orig_ret_vaddr;
}
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 96482a1412c6..fba7ca102a8c 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -409,7 +409,7 @@ asmlinkage void ret_from_fork(void) asm("ret_from_fork");
int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
- unsigned long clone_flags = args->flags;
+ u64 clone_flags = args->flags;
unsigned long stack_start = args->stack;
unsigned long tls = args->tls;
struct pt_regs *childregs = task_pt_regs(p);
diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c
index edf1783ffc81..f9a32dfde006 100644
--- a/arch/arm64/kernel/proton-pack.c
+++ b/arch/arm64/kernel/proton-pack.c
@@ -884,6 +884,7 @@ static u8 spectre_bhb_loop_affected(void)
static const struct midr_range spectre_bhb_k38_list[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A715),
MIDR_ALL_VERSIONS(MIDR_CORTEX_A720),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A720AE),
{},
};
static const struct midr_range spectre_bhb_k32_list[] = {
diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c
index ce4778141ec7..c64a06f58c0b 100644
--- a/arch/arm64/kernel/rsi.c
+++ b/arch/arm64/kernel/rsi.c
@@ -84,7 +84,25 @@ static void __init arm64_rsi_setup_memory(void)
}
}
-bool __arm64_is_protected_mmio(phys_addr_t base, size_t size)
+/*
+ * Check if a given PA range is Trusted (e.g., Protected memory, a Trusted Device
+ * mapping, or an MMIO emulated in the Realm world).
+ *
+ * We can rely on the RIPAS value of the region to detect if a given region is
+ * protected.
+ *
+ * RIPAS_DEV - A trusted device memory or a trusted emulated MMIO (in the Realm
+ * world)
+ * RIPAS_RAM - Memory (RAM), protected by the RMM guarantees. (e.g., Firmware
+ * reserved regions for data sharing).
+ *
+ * RIPAS_DESTROYED is a special case of one of the above, where the host did
+ * something without our permission and as such we can't do anything about it.
+ *
+ * The only case where something is emulated by the untrusted hypervisor or is
+ * backed by shared memory is indicated by RSI_RIPAS_EMPTY.
+ */
+bool arm64_rsi_is_protected(phys_addr_t base, size_t size)
{
enum ripas ripas;
phys_addr_t end, top;
@@ -101,18 +119,18 @@ bool __arm64_is_protected_mmio(phys_addr_t base, size_t size)
break;
if (WARN_ON(top <= base))
break;
- if (ripas != RSI_RIPAS_DEV)
+ if (ripas == RSI_RIPAS_EMPTY)
break;
base = top;
}
return base >= end;
}
-EXPORT_SYMBOL(__arm64_is_protected_mmio);
+EXPORT_SYMBOL(arm64_rsi_is_protected);
static int realm_ioremap_hook(phys_addr_t phys, size_t size, pgprot_t *prot)
{
- if (__arm64_is_protected_mmio(phys, size))
+ if (arm64_rsi_is_protected(phys, size))
*prot = pgprot_encrypted(*prot);
else
*prot = pgprot_decrypted(*prot);
diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c
index 6f24a0251e18..95169f7b6531 100644
--- a/arch/arm64/kernel/sdei.c
+++ b/arch/arm64/kernel/sdei.c
@@ -243,7 +243,7 @@ unsigned long __kprobes do_sdei_event(struct pt_regs *regs,
* If we interrupted the kernel with interrupts masked, we always go
* back to wherever we came from.
*/
- if (mode == kernel_mode && !interrupts_enabled(regs))
+ if (mode == kernel_mode && regs_irqs_disabled(regs))
return SDEI_EV_HANDLED;
/*
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 77c7926a4df6..23c05dc7a8f2 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -214,7 +214,7 @@ static void __init request_standard_resources(void)
unsigned long i = 0;
size_t res_size;
- kernel_code.start = __pa_symbol(_stext);
+ kernel_code.start = __pa_symbol(_text);
kernel_code.end = __pa_symbol(__init_begin - 1);
kernel_data.start = __pa_symbol(_sdata);
kernel_data.end = __pa_symbol(_end - 1);
@@ -280,7 +280,7 @@ u64 cpu_logical_map(unsigned int cpu)
void __init __no_sanitize_address setup_arch(char **cmdline_p)
{
- setup_initial_init_mm(_stext, _etext, _edata, _end);
+ setup_initial_init_mm(_text, _etext, _edata, _end);
*cmdline_p = boot_command_line;
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index db3f972f8cd9..1110eeb21f57 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -9,6 +9,7 @@
#include <linux/cache.h>
#include <linux/compat.h>
#include <linux/errno.h>
+#include <linux/irq-entry-common.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/freezer.h>
@@ -1576,7 +1577,7 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs)
* the kernel can handle, and then we build all the user-level signal handling
* stack-frames in one go after that.
*/
-void do_signal(struct pt_regs *regs)
+void arch_do_signal_or_restart(struct pt_regs *regs)
{
unsigned long continue_addr = 0, restart_addr = 0;
int retval = 0;
diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c
index c442fcec6b9e..aba7ca6bca2d 100644
--- a/arch/arm64/kernel/syscall.c
+++ b/arch/arm64/kernel/syscall.c
@@ -43,7 +43,7 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno,
add_random_kstack_offset();
- if (scno < sc_nr) {
+ if (likely(scno < sc_nr)) {
syscall_fn_t syscall_fn;
syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)];
ret = __invoke_syscall(regs, syscall_fn);
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index f528b6041f6a..5041817af267 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -1015,7 +1015,7 @@ int bug_brk_handler(struct pt_regs *regs, unsigned long esr)
return DBG_HOOK_HANDLED;
}
-#ifdef CONFIG_CFI_CLANG
+#ifdef CONFIG_CFI
int cfi_brk_handler(struct pt_regs *regs, unsigned long esr)
{
unsigned long target;
@@ -1039,7 +1039,7 @@ int cfi_brk_handler(struct pt_regs *regs, unsigned long esr)
arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE);
return DBG_HOOK_HANDLED;
}
-#endif /* CONFIG_CFI_CLANG */
+#endif /* CONFIG_CFI */
int reserved_fault_brk_handler(struct pt_regs *regs, unsigned long esr)
{
diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile
index f2dfdc7dc818..5de4deaf4299 100644
--- a/arch/arm64/kernel/vdso32/Makefile
+++ b/arch/arm64/kernel/vdso32/Makefile
@@ -21,8 +21,6 @@ endif
cc32-option = $(call try-run,\
$(CC_COMPAT) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2))
-cc32-disable-warning = $(call try-run,\
- $(CC_COMPAT) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1)))
# We cannot use the global flags to compile the vDSO files, the main reason
# being that the 32-bit compiler may be older than the main (64-bit) compiler
@@ -63,6 +61,7 @@ VDSO_CFLAGS += -DENABLE_COMPAT_VDSO=1
# KBUILD_CFLAGS from top-level Makefile
VDSO_CFLAGS += -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
-fno-strict-aliasing -fno-common \
+ $(filter -Werror,$(KBUILD_CPPFLAGS)) \
-Werror-implicit-function-declaration \
-Wno-format-security \
-std=gnu11
@@ -74,16 +73,6 @@ VDSO_CFLAGS += $(call cc32-option,-Werror=strict-prototypes)
VDSO_CFLAGS += -Werror=date-time
VDSO_CFLAGS += $(call cc32-option,-Werror=incompatible-pointer-types)
-# The 32-bit compiler does not provide 128-bit integers, which are used in
-# some headers that are indirectly included from the vDSO code.
-# This hack makes the compiler happy and should trigger a warning/error if
-# variables of such type are referenced.
-VDSO_CFLAGS += -D__uint128_t='void*'
-# Silence some warnings coming from headers that operate on long's
-# (on GCC 4.8 or older, there is unfortunately no way to silence this warning)
-VDSO_CFLAGS += $(call cc32-disable-warning,shift-count-overflow)
-VDSO_CFLAGS += -Wno-int-to-pointer-cast
-
# Compile as THUMB2 or ARM. Unwinding via frame-pointers in THUMB2 is
# unreliable.
ifeq ($(CONFIG_THUMB2_COMPAT_VDSO), y)
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 888f7c7abf54..bd6b6a620a09 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -2113,8 +2113,10 @@ static void cpu_hyp_init_features(void)
{
cpu_set_hyp_vector();
- if (is_kernel_in_hyp_mode())
+ if (is_kernel_in_hyp_mode()) {
kvm_timer_init_vhe();
+ kvm_debug_init_vhe();
+ }
if (vgic_present)
kvm_vgic_init_cpu_hardware();
@@ -2408,12 +2410,12 @@ static u64 get_hyp_id_aa64pfr0_el1(void)
*/
u64 val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
- val &= ~(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) |
- ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3));
+ val &= ~(ID_AA64PFR0_EL1_CSV2 |
+ ID_AA64PFR0_EL1_CSV3);
- val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2),
+ val |= FIELD_PREP(ID_AA64PFR0_EL1_CSV2,
arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED);
- val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3),
+ val |= FIELD_PREP(ID_AA64PFR0_EL1_CSV3,
arm64_get_meltdown_state() == SPECTRE_UNAFFECTED);
return val;
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
index 0e5610533949..d71ca4ddc9d1 100644
--- a/arch/arm64/kvm/at.c
+++ b/arch/arm64/kvm/at.c
@@ -1420,10 +1420,10 @@ void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
return;
/*
- * If we only have a single stage of translation (E2H=0 or
- * TGE=1), exit early. Same thing if {VM,DC}=={0,0}.
+ * If we only have a single stage of translation (EL2&0), exit
+ * early. Same thing if {VM,DC}=={0,0}.
*/
- if (!vcpu_el2_e2h_is_set(vcpu) || vcpu_el2_tge_is_set(vcpu) ||
+ if (compute_translation_regime(vcpu, op) == TR_EL20 ||
!(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC)))
return;
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
index 381382c19fe4..e027d9c32b0d 100644
--- a/arch/arm64/kvm/debug.c
+++ b/arch/arm64/kvm/debug.c
@@ -96,6 +96,13 @@ void kvm_init_host_debug_data(void)
}
}
+void kvm_debug_init_vhe(void)
+{
+ /* Clear PMSCR_EL1.E{0,1}SPE which reset to UNKNOWN values. */
+ if (SYS_FIELD_GET(ID_AA64DFR0_EL1, PMSVer, read_sysreg(id_aa64dfr0_el1)))
+ write_sysreg_el1(0, SYS_PMSCR);
+}
+
/*
* Configures the 'external' MDSCR_EL1 value for the guest, i.e. when the host
* has taken over MDSCR_EL1.
@@ -138,6 +145,9 @@ void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu)
/* Must be called before kvm_vcpu_load_vhe() */
KVM_BUG_ON(vcpu_get_flag(vcpu, SYSREGS_ON_CPU), vcpu->kvm);
+ if (has_vhe())
+ *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2);
+
/*
* Determine which of the possible debug states we're in:
*
@@ -184,6 +194,9 @@ void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu)
void kvm_vcpu_put_debug(struct kvm_vcpu *vcpu)
{
+ if (has_vhe())
+ write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2);
+
if (likely(!(vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
return;
diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c
index 90cb4b7ae0ff..af69c897c2c3 100644
--- a/arch/arm64/kvm/emulate-nested.c
+++ b/arch/arm64/kvm/emulate-nested.c
@@ -2833,7 +2833,7 @@ int kvm_inject_nested_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr)
iabt ? ESR_ELx_EC_IABT_LOW : ESR_ELx_EC_DABT_LOW);
esr |= ESR_ELx_FSC_EXTABT | ESR_ELx_IL;
- vcpu_write_sys_reg(vcpu, FAR_EL2, addr);
+ vcpu_write_sys_reg(vcpu, addr, FAR_EL2);
if (__vcpu_sys_reg(vcpu, SCTLR2_EL2) & SCTLR2_EL1_EASE)
return kvm_inject_nested(vcpu, esr, except_type_serror);
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index a598072f36d2..8bdb1eed090a 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -545,7 +545,7 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr,
kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line);
else
print_nvhe_hyp_panic("BUG", panic_addr);
- } else if (IS_ENABLED(CONFIG_CFI_CLANG) && esr_is_cfi_brk(esr)) {
+ } else if (IS_ENABLED(CONFIG_CFI) && esr_is_cfi_brk(esr)) {
kvm_nvhe_report_cfi_failure(panic_addr);
} else if (IS_ENABLED(CONFIG_UBSAN_KVM_EL2) &&
ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 &&
diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c
index 95d186e0bf54..bef40ddb16db 100644
--- a/arch/arm64/kvm/hyp/exception.c
+++ b/arch/arm64/kvm/hyp/exception.c
@@ -22,36 +22,28 @@
static inline u64 __vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
{
- u64 val;
-
- if (unlikely(vcpu_has_nv(vcpu)))
+ if (has_vhe())
return vcpu_read_sys_reg(vcpu, reg);
- else if (vcpu_get_flag(vcpu, SYSREGS_ON_CPU) &&
- __vcpu_read_sys_reg_from_cpu(reg, &val))
- return val;
return __vcpu_sys_reg(vcpu, reg);
}
static inline void __vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
{
- if (unlikely(vcpu_has_nv(vcpu)))
+ if (has_vhe())
vcpu_write_sys_reg(vcpu, val, reg);
- else if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU) ||
- !__vcpu_write_sys_reg_to_cpu(val, reg))
+ else
__vcpu_assign_sys_reg(vcpu, reg, val);
}
static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long target_mode,
u64 val)
{
- if (unlikely(vcpu_has_nv(vcpu))) {
+ if (has_vhe()) {
if (target_mode == PSR_MODE_EL1h)
vcpu_write_sys_reg(vcpu, val, SPSR_EL1);
else
vcpu_write_sys_reg(vcpu, val, SPSR_EL2);
- } else if (has_vhe()) {
- write_sysreg_el1(val, SYS_SPSR);
} else {
__vcpu_assign_sys_reg(vcpu, SPSR_EL1, val);
}
@@ -59,7 +51,7 @@ static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long target_mode,
static void __vcpu_write_spsr_abt(struct kvm_vcpu *vcpu, u64 val)
{
- if (has_vhe())
+ if (has_vhe() && vcpu_get_flag(vcpu, SYSREGS_ON_CPU))
write_sysreg(val, spsr_abt);
else
vcpu->arch.ctxt.spsr_abt = val;
@@ -67,7 +59,7 @@ static void __vcpu_write_spsr_abt(struct kvm_vcpu *vcpu, u64 val)
static void __vcpu_write_spsr_und(struct kvm_vcpu *vcpu, u64 val)
{
- if (has_vhe())
+ if (has_vhe() && vcpu_get_flag(vcpu, SYSREGS_ON_CPU))
write_sysreg(val, spsr_und);
else
vcpu->arch.ctxt.spsr_und = val;
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 84ec4e100fbb..b6682202edf3 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -431,9 +431,6 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
vcpu_set_flag(vcpu, PMUSERENR_ON_CPU);
}
- *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2);
- write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
-
if (cpus_have_final_cap(ARM64_HAS_HCX)) {
u64 hcrx = vcpu->arch.hcrx_el2;
if (is_nested_ctxt(vcpu)) {
@@ -454,8 +451,6 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
{
struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt);
- write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2);
-
write_sysreg(0, hstr_el2);
if (system_supports_pmuv3()) {
write_sysreg(ctxt_sys_reg(hctxt, PMUSERENR_EL0), pmuserenr_el0);
diff --git a/arch/arm64/kvm/hyp/nvhe/list_debug.c b/arch/arm64/kvm/hyp/nvhe/list_debug.c
index 46a2d4f2b3c6..baa6260f88dc 100644
--- a/arch/arm64/kvm/hyp/nvhe/list_debug.c
+++ b/arch/arm64/kvm/hyp/nvhe/list_debug.c
@@ -17,7 +17,7 @@ static inline __must_check bool nvhe_check_data_corruption(bool v)
bool corruption = unlikely(condition); \
if (corruption) { \
if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \
- BUG_ON(1); \
+ BUG(); \
} else \
WARN_ON(1); \
} \
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index ccd575d5f6de..d3b9ec8a7c28 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -50,6 +50,10 @@ extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc);
static void __activate_traps(struct kvm_vcpu *vcpu)
{
___activate_traps(vcpu, vcpu->arch.hcr_el2);
+
+ *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2);
+ write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
+
__activate_traps_common(vcpu);
__activate_cptr_traps(vcpu);
@@ -93,6 +97,8 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
isb();
}
+ write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2);
+
__deactivate_traps_common(vcpu);
write_sysreg_hcr(this_cpu_ptr(&kvm_init_params)->hcr_el2);
diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
index 1ddd9ed3cbb3..82da9b03692d 100644
--- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c
+++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
@@ -253,6 +253,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu)
*vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR);
*vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR);
+ __vcpu_assign_sys_reg(vcpu, VBAR_EL1, read_sysreg_el1(SYS_VBAR));
kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);
@@ -372,6 +373,9 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = {
/* Debug and Trace Registers are restricted. */
+ /* Group 1 ID registers */
+ HOST_HANDLED(SYS_REVIDR_EL1),
+
/* AArch64 mappings of the AArch32 ID registers */
/* CRm=1 */
AARCH32(SYS_ID_PFR0_EL1),
@@ -460,6 +464,7 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = {
HOST_HANDLED(SYS_CCSIDR_EL1),
HOST_HANDLED(SYS_CLIDR_EL1),
+ HOST_HANDLED(SYS_AIDR_EL1),
HOST_HANDLED(SYS_CSSELR_EL1),
HOST_HANDLED(SYS_CTR_EL0),
diff --git a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
index 87a54375bd6e..78579b31a420 100644
--- a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
+++ b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
@@ -20,7 +20,7 @@ static bool __is_be(struct kvm_vcpu *vcpu)
if (vcpu_mode_is_32bit(vcpu))
return !!(read_sysreg_el2(SYS_SPSR) & PSR_AA32_E_BIT);
- return !!(read_sysreg(SCTLR_EL1) & SCTLR_ELx_EE);
+ return !!(read_sysreg_el1(SYS_SCTLR) & SCTLR_ELx_EE);
}
/*
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index e482181c6632..0998ad4a2552 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -43,8 +43,11 @@ DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);
*
* - API/APK: they are already accounted for by vcpu_load(), and can
* only take effect across a load/put cycle (such as ERET)
+ *
+ * - FIEN: no way we let a guest have access to the RAS "Common Fault
+ * Injection" thing, whatever that does
*/
-#define NV_HCR_GUEST_EXCLUDE (HCR_TGE | HCR_API | HCR_APK)
+#define NV_HCR_GUEST_EXCLUDE (HCR_TGE | HCR_API | HCR_APK | HCR_FIEN)
static u64 __compute_hcr(struct kvm_vcpu *vcpu)
{
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 1c78864767c5..736394292503 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -4,19 +4,20 @@
* Author: Christoffer Dall <c.dall@virtualopensystems.com>
*/
+#include <linux/acpi.h>
#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/sched/signal.h>
#include <trace/events/kvm.h>
+#include <asm/acpi.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
#include <asm/kvm_pkvm.h>
-#include <asm/kvm_ras.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/virt.h>
@@ -1073,6 +1074,10 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
mmu->pgt = NULL;
free_percpu(mmu->last_vcpu_ran);
}
+
+ if (kvm_is_nested_s2_mmu(kvm, mmu))
+ kvm_init_nested_s2_mmu(mmu);
+
write_unlock(&kvm->mmu_lock);
if (pgt) {
@@ -1508,11 +1513,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
VM_BUG_ON(write_fault && exec_fault);
- if (fault_is_perm && !write_fault && !exec_fault) {
- kvm_err("Unexpected L2 read permission error\n");
- return -EFAULT;
- }
-
if (!is_protected_kvm_enabled())
memcache = &vcpu->arch.mmu_page_cache;
else
@@ -1811,6 +1811,19 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
read_unlock(&vcpu->kvm->mmu_lock);
}
+int kvm_handle_guest_sea(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Give APEI the opportunity to claim the abort before handling it
+ * within KVM. apei_claim_sea() expects to be called with IRQs enabled.
+ */
+ lockdep_assert_irqs_enabled();
+ if (apei_claim_sea(NULL) == 0)
+ return 1;
+
+ return kvm_inject_serror(vcpu);
+}
+
/**
* kvm_handle_guest_abort - handles all 2nd stage aborts
* @vcpu: the VCPU pointer
@@ -1834,17 +1847,8 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
gfn_t gfn;
int ret, idx;
- /* Synchronous External Abort? */
- if (kvm_vcpu_abt_issea(vcpu)) {
- /*
- * For RAS the host kernel may handle this abort.
- * There is no need to pass the error into the guest.
- */
- if (kvm_handle_guest_sea())
- return kvm_inject_serror(vcpu);
-
- return 1;
- }
+ if (kvm_vcpu_abt_issea(vcpu))
+ return kvm_handle_guest_sea(vcpu);
esr = kvm_vcpu_get_esr(vcpu);
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index 153b3e11b115..50d559248a1f 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -847,7 +847,7 @@ static void kvm_invalidate_vncr_ipa(struct kvm *kvm, u64 start, u64 end)
ipa_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift,
vt->wr.level));
- ipa_start = vt->wr.pa & (ipa_size - 1);
+ ipa_start = vt->wr.pa & ~(ipa_size - 1);
ipa_end = ipa_start + ipa_size;
if (ipa_end <= start || ipa_start >= end)
@@ -887,7 +887,7 @@ static void invalidate_vncr_va(struct kvm *kvm,
va_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift,
vt->wr.level));
- va_start = vt->gva & (va_size - 1);
+ va_start = vt->gva & ~(va_size - 1);
va_end = va_start + va_size;
switch (scope->type) {
@@ -1276,7 +1276,7 @@ static bool kvm_vncr_tlb_lookup(struct kvm_vcpu *vcpu)
!(tcr & TCR_ASID16))
asid &= GENMASK(7, 0);
- return asid != vt->wr.asid;
+ return asid == vt->wr.asid;
}
return true;
@@ -1287,7 +1287,10 @@ int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu)
struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
u64 esr = kvm_vcpu_get_esr(vcpu);
- BUG_ON(!(esr & ESR_ELx_VNCR_SHIFT));
+ WARN_ON_ONCE(!(esr & ESR_ELx_VNCR));
+
+ if (kvm_vcpu_abt_issea(vcpu))
+ return kvm_handle_guest_sea(vcpu);
if (esr_fsc_is_permission_fault(esr)) {
inject_vncr_perm(vcpu);
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 82ffb3b3b3cf..b29f72478a50 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -82,43 +82,105 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu,
"sys_reg write to read-only register");
}
-#define PURE_EL2_SYSREG(el2) \
- case el2: { \
- *el1r = el2; \
- return true; \
+enum sr_loc_attr {
+ SR_LOC_MEMORY = 0, /* Register definitely in memory */
+ SR_LOC_LOADED = BIT(0), /* Register on CPU, unless it cannot */
+ SR_LOC_MAPPED = BIT(1), /* Register in a different CPU register */
+ SR_LOC_XLATED = BIT(2), /* Register translated to fit another reg */
+ SR_LOC_SPECIAL = BIT(3), /* Demanding register, implies loaded */
+};
+
+struct sr_loc {
+ enum sr_loc_attr loc;
+ enum vcpu_sysreg map_reg;
+ u64 (*xlate)(u64);
+};
+
+static enum sr_loc_attr locate_direct_register(const struct kvm_vcpu *vcpu,
+ enum vcpu_sysreg reg)
+{
+ switch (reg) {
+ case SCTLR_EL1:
+ case CPACR_EL1:
+ case TTBR0_EL1:
+ case TTBR1_EL1:
+ case TCR_EL1:
+ case TCR2_EL1:
+ case PIR_EL1:
+ case PIRE0_EL1:
+ case POR_EL1:
+ case ESR_EL1:
+ case AFSR0_EL1:
+ case AFSR1_EL1:
+ case FAR_EL1:
+ case MAIR_EL1:
+ case VBAR_EL1:
+ case CONTEXTIDR_EL1:
+ case AMAIR_EL1:
+ case CNTKCTL_EL1:
+ case ELR_EL1:
+ case SPSR_EL1:
+ case ZCR_EL1:
+ case SCTLR2_EL1:
+ /*
+ * EL1 registers which have an ELx2 mapping are loaded if
+ * we're not in hypervisor context.
+ */
+ return is_hyp_ctxt(vcpu) ? SR_LOC_MEMORY : SR_LOC_LOADED;
+
+ case TPIDR_EL0:
+ case TPIDRRO_EL0:
+ case TPIDR_EL1:
+ case PAR_EL1:
+ case DACR32_EL2:
+ case IFSR32_EL2:
+ case DBGVCR32_EL2:
+ /* These registers are always loaded, no matter what */
+ return SR_LOC_LOADED;
+
+ default:
+ /* Non-mapped EL2 registers are by definition in memory. */
+ return SR_LOC_MEMORY;
}
+}
-#define MAPPED_EL2_SYSREG(el2, el1, fn) \
- case el2: { \
- *xlate = fn; \
- *el1r = el1; \
- return true; \
+static void locate_mapped_el2_register(const struct kvm_vcpu *vcpu,
+ enum vcpu_sysreg reg,
+ enum vcpu_sysreg map_reg,
+ u64 (*xlate)(u64),
+ struct sr_loc *loc)
+{
+ if (!is_hyp_ctxt(vcpu)) {
+ loc->loc = SR_LOC_MEMORY;
+ return;
+ }
+
+ loc->loc = SR_LOC_LOADED | SR_LOC_MAPPED;
+ loc->map_reg = map_reg;
+
+ WARN_ON(locate_direct_register(vcpu, map_reg) != SR_LOC_MEMORY);
+
+ if (xlate != NULL && !vcpu_el2_e2h_is_set(vcpu)) {
+ loc->loc |= SR_LOC_XLATED;
+ loc->xlate = xlate;
}
+}
-static bool get_el2_to_el1_mapping(unsigned int reg,
- unsigned int *el1r, u64 (**xlate)(u64))
+#define MAPPED_EL2_SYSREG(r, m, t) \
+ case r: { \
+ locate_mapped_el2_register(vcpu, r, m, t, loc); \
+ break; \
+ }
+
+static void locate_register(const struct kvm_vcpu *vcpu, enum vcpu_sysreg reg,
+ struct sr_loc *loc)
{
+ if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU)) {
+ loc->loc = SR_LOC_MEMORY;
+ return;
+ }
+
switch (reg) {
- PURE_EL2_SYSREG( VPIDR_EL2 );
- PURE_EL2_SYSREG( VMPIDR_EL2 );
- PURE_EL2_SYSREG( ACTLR_EL2 );
- PURE_EL2_SYSREG( HCR_EL2 );
- PURE_EL2_SYSREG( MDCR_EL2 );
- PURE_EL2_SYSREG( HSTR_EL2 );
- PURE_EL2_SYSREG( HACR_EL2 );
- PURE_EL2_SYSREG( VTTBR_EL2 );
- PURE_EL2_SYSREG( VTCR_EL2 );
- PURE_EL2_SYSREG( TPIDR_EL2 );
- PURE_EL2_SYSREG( HPFAR_EL2 );
- PURE_EL2_SYSREG( HCRX_EL2 );
- PURE_EL2_SYSREG( HFGRTR_EL2 );
- PURE_EL2_SYSREG( HFGWTR_EL2 );
- PURE_EL2_SYSREG( HFGITR_EL2 );
- PURE_EL2_SYSREG( HDFGRTR_EL2 );
- PURE_EL2_SYSREG( HDFGWTR_EL2 );
- PURE_EL2_SYSREG( HAFGRTR_EL2 );
- PURE_EL2_SYSREG( CNTVOFF_EL2 );
- PURE_EL2_SYSREG( CNTHCTL_EL2 );
MAPPED_EL2_SYSREG(SCTLR_EL2, SCTLR_EL1,
translate_sctlr_el2_to_sctlr_el1 );
MAPPED_EL2_SYSREG(CPTR_EL2, CPACR_EL1,
@@ -144,125 +206,189 @@ static bool get_el2_to_el1_mapping(unsigned int reg,
MAPPED_EL2_SYSREG(ZCR_EL2, ZCR_EL1, NULL );
MAPPED_EL2_SYSREG(CONTEXTIDR_EL2, CONTEXTIDR_EL1, NULL );
MAPPED_EL2_SYSREG(SCTLR2_EL2, SCTLR2_EL1, NULL );
+ case CNTHCTL_EL2:
+ /* CNTHCTL_EL2 is super special, until we support NV2.1 */
+ loc->loc = ((is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) ?
+ SR_LOC_SPECIAL : SR_LOC_MEMORY);
+ break;
default:
- return false;
+ loc->loc = locate_direct_register(vcpu, reg);
}
}
-u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
+static u64 read_sr_from_cpu(enum vcpu_sysreg reg)
{
u64 val = 0x8badf00d8badf00d;
- u64 (*xlate)(u64) = NULL;
- unsigned int el1r;
- if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU))
- goto memory_read;
+ switch (reg) {
+ case SCTLR_EL1: val = read_sysreg_s(SYS_SCTLR_EL12); break;
+ case CPACR_EL1: val = read_sysreg_s(SYS_CPACR_EL12); break;
+ case TTBR0_EL1: val = read_sysreg_s(SYS_TTBR0_EL12); break;
+ case TTBR1_EL1: val = read_sysreg_s(SYS_TTBR1_EL12); break;
+ case TCR_EL1: val = read_sysreg_s(SYS_TCR_EL12); break;
+ case TCR2_EL1: val = read_sysreg_s(SYS_TCR2_EL12); break;
+ case PIR_EL1: val = read_sysreg_s(SYS_PIR_EL12); break;
+ case PIRE0_EL1: val = read_sysreg_s(SYS_PIRE0_EL12); break;
+ case POR_EL1: val = read_sysreg_s(SYS_POR_EL12); break;
+ case ESR_EL1: val = read_sysreg_s(SYS_ESR_EL12); break;
+ case AFSR0_EL1: val = read_sysreg_s(SYS_AFSR0_EL12); break;
+ case AFSR1_EL1: val = read_sysreg_s(SYS_AFSR1_EL12); break;
+ case FAR_EL1: val = read_sysreg_s(SYS_FAR_EL12); break;
+ case MAIR_EL1: val = read_sysreg_s(SYS_MAIR_EL12); break;
+ case VBAR_EL1: val = read_sysreg_s(SYS_VBAR_EL12); break;
+ case CONTEXTIDR_EL1: val = read_sysreg_s(SYS_CONTEXTIDR_EL12);break;
+ case AMAIR_EL1: val = read_sysreg_s(SYS_AMAIR_EL12); break;
+ case CNTKCTL_EL1: val = read_sysreg_s(SYS_CNTKCTL_EL12); break;
+ case ELR_EL1: val = read_sysreg_s(SYS_ELR_EL12); break;
+ case SPSR_EL1: val = read_sysreg_s(SYS_SPSR_EL12); break;
+ case ZCR_EL1: val = read_sysreg_s(SYS_ZCR_EL12); break;
+ case SCTLR2_EL1: val = read_sysreg_s(SYS_SCTLR2_EL12); break;
+ case TPIDR_EL0: val = read_sysreg_s(SYS_TPIDR_EL0); break;
+ case TPIDRRO_EL0: val = read_sysreg_s(SYS_TPIDRRO_EL0); break;
+ case TPIDR_EL1: val = read_sysreg_s(SYS_TPIDR_EL1); break;
+ case PAR_EL1: val = read_sysreg_par(); break;
+ case DACR32_EL2: val = read_sysreg_s(SYS_DACR32_EL2); break;
+ case IFSR32_EL2: val = read_sysreg_s(SYS_IFSR32_EL2); break;
+ case DBGVCR32_EL2: val = read_sysreg_s(SYS_DBGVCR32_EL2); break;
+ default: WARN_ON_ONCE(1);
+ }
- if (unlikely(get_el2_to_el1_mapping(reg, &el1r, &xlate))) {
- if (!is_hyp_ctxt(vcpu))
- goto memory_read;
+ return val;
+}
+
+static void write_sr_to_cpu(enum vcpu_sysreg reg, u64 val)
+{
+ switch (reg) {
+ case SCTLR_EL1: write_sysreg_s(val, SYS_SCTLR_EL12); break;
+ case CPACR_EL1: write_sysreg_s(val, SYS_CPACR_EL12); break;
+ case TTBR0_EL1: write_sysreg_s(val, SYS_TTBR0_EL12); break;
+ case TTBR1_EL1: write_sysreg_s(val, SYS_TTBR1_EL12); break;
+ case TCR_EL1: write_sysreg_s(val, SYS_TCR_EL12); break;
+ case TCR2_EL1: write_sysreg_s(val, SYS_TCR2_EL12); break;
+ case PIR_EL1: write_sysreg_s(val, SYS_PIR_EL12); break;
+ case PIRE0_EL1: write_sysreg_s(val, SYS_PIRE0_EL12); break;
+ case POR_EL1: write_sysreg_s(val, SYS_POR_EL12); break;
+ case ESR_EL1: write_sysreg_s(val, SYS_ESR_EL12); break;
+ case AFSR0_EL1: write_sysreg_s(val, SYS_AFSR0_EL12); break;
+ case AFSR1_EL1: write_sysreg_s(val, SYS_AFSR1_EL12); break;
+ case FAR_EL1: write_sysreg_s(val, SYS_FAR_EL12); break;
+ case MAIR_EL1: write_sysreg_s(val, SYS_MAIR_EL12); break;
+ case VBAR_EL1: write_sysreg_s(val, SYS_VBAR_EL12); break;
+ case CONTEXTIDR_EL1: write_sysreg_s(val, SYS_CONTEXTIDR_EL12);break;
+ case AMAIR_EL1: write_sysreg_s(val, SYS_AMAIR_EL12); break;
+ case CNTKCTL_EL1: write_sysreg_s(val, SYS_CNTKCTL_EL12); break;
+ case ELR_EL1: write_sysreg_s(val, SYS_ELR_EL12); break;
+ case SPSR_EL1: write_sysreg_s(val, SYS_SPSR_EL12); break;
+ case ZCR_EL1: write_sysreg_s(val, SYS_ZCR_EL12); break;
+ case SCTLR2_EL1: write_sysreg_s(val, SYS_SCTLR2_EL12); break;
+ case TPIDR_EL0: write_sysreg_s(val, SYS_TPIDR_EL0); break;
+ case TPIDRRO_EL0: write_sysreg_s(val, SYS_TPIDRRO_EL0); break;
+ case TPIDR_EL1: write_sysreg_s(val, SYS_TPIDR_EL1); break;
+ case PAR_EL1: write_sysreg_s(val, SYS_PAR_EL1); break;
+ case DACR32_EL2: write_sysreg_s(val, SYS_DACR32_EL2); break;
+ case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); break;
+ case DBGVCR32_EL2: write_sysreg_s(val, SYS_DBGVCR32_EL2); break;
+ default: WARN_ON_ONCE(1);
+ }
+}
+
+u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, enum vcpu_sysreg reg)
+{
+ struct sr_loc loc = {};
+
+ locate_register(vcpu, reg, &loc);
+
+ WARN_ON_ONCE(!has_vhe() && loc.loc != SR_LOC_MEMORY);
+
+ if (loc.loc & SR_LOC_SPECIAL) {
+ u64 val;
+
+ WARN_ON_ONCE(loc.loc & ~SR_LOC_SPECIAL);
/*
- * CNTHCTL_EL2 requires some special treatment to
- * account for the bits that can be set via CNTKCTL_EL1.
+ * CNTHCTL_EL2 requires some special treatment to account
+ * for the bits that can be set via CNTKCTL_EL1 when E2H==1.
*/
switch (reg) {
case CNTHCTL_EL2:
- if (vcpu_el2_e2h_is_set(vcpu)) {
- val = read_sysreg_el1(SYS_CNTKCTL);
- val &= CNTKCTL_VALID_BITS;
- val |= __vcpu_sys_reg(vcpu, reg) & ~CNTKCTL_VALID_BITS;
- return val;
- }
- break;
+ val = read_sysreg_el1(SYS_CNTKCTL);
+ val &= CNTKCTL_VALID_BITS;
+ val |= __vcpu_sys_reg(vcpu, reg) & ~CNTKCTL_VALID_BITS;
+ return val;
+ default:
+ WARN_ON_ONCE(1);
}
+ }
- /*
- * If this register does not have an EL1 counterpart,
- * then read the stored EL2 version.
- */
- if (reg == el1r)
- goto memory_read;
+ if (loc.loc & SR_LOC_LOADED) {
+ enum vcpu_sysreg map_reg = reg;
- /*
- * If we have a non-VHE guest and that the sysreg
- * requires translation to be used at EL1, use the
- * in-memory copy instead.
- */
- if (!vcpu_el2_e2h_is_set(vcpu) && xlate)
- goto memory_read;
+ if (loc.loc & SR_LOC_MAPPED)
+ map_reg = loc.map_reg;
- /* Get the current version of the EL1 counterpart. */
- WARN_ON(!__vcpu_read_sys_reg_from_cpu(el1r, &val));
- if (reg >= __SANITISED_REG_START__)
- val = kvm_vcpu_apply_reg_masks(vcpu, reg, val);
-
- return val;
- }
+ if (!(loc.loc & SR_LOC_XLATED)) {
+ u64 val = read_sr_from_cpu(map_reg);
- /* EL1 register can't be on the CPU if the guest is in vEL2. */
- if (unlikely(is_hyp_ctxt(vcpu)))
- goto memory_read;
+ if (reg >= __SANITISED_REG_START__)
+ val = kvm_vcpu_apply_reg_masks(vcpu, reg, val);
- if (__vcpu_read_sys_reg_from_cpu(reg, &val))
- return val;
+ return val;
+ }
+ }
-memory_read:
return __vcpu_sys_reg(vcpu, reg);
}
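
The CNTHCTL_EL2 special case above merges hardware-held and memory-held bits. A hedged arithmetic illustration follows; the mask value and the two input values are invented, only the merge expression mirrors the code.

	#include <stdint.h>
	#include <stdio.h>

	#define CNTKCTL_VALID_BITS_EXAMPLE 0x3ffULL	/* placeholder mask, not the real one */

	int main(void)
	{
		uint64_t hw  = 0x0000000000000205ULL;	/* pretend value read from CNTKCTL_EL1 */
		uint64_t mem = 0x00000000000c0000ULL;	/* pretend in-memory CNTHCTL_EL2 shadow */
		uint64_t val;

		/* bits covered by the mask come from hardware, the rest from memory */
		val = (hw & CNTKCTL_VALID_BITS_EXAMPLE) |
		      (mem & ~CNTKCTL_VALID_BITS_EXAMPLE);

		printf("merged CNTHCTL_EL2 view: %#llx\n", (unsigned long long)val);
		return 0;
	}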
-void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
+void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, enum vcpu_sysreg reg)
{
- u64 (*xlate)(u64) = NULL;
- unsigned int el1r;
+ struct sr_loc loc = {};
- if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU))
- goto memory_write;
+ locate_register(vcpu, reg, &loc);
- if (unlikely(get_el2_to_el1_mapping(reg, &el1r, &xlate))) {
- if (!is_hyp_ctxt(vcpu))
- goto memory_write;
+ WARN_ON_ONCE(!has_vhe() && loc.loc != SR_LOC_MEMORY);
- /*
- * Always store a copy of the write to memory to avoid having
- * to reverse-translate virtual EL2 system registers for a
- * non-VHE guest hypervisor.
- */
- __vcpu_assign_sys_reg(vcpu, reg, val);
+ if (loc.loc & SR_LOC_SPECIAL) {
+
+ WARN_ON_ONCE(loc.loc & ~SR_LOC_SPECIAL);
switch (reg) {
case CNTHCTL_EL2:
/*
- * If E2H=0, CNHTCTL_EL2 is a pure shadow register.
- * Otherwise, some of the bits are backed by
+ * If E2H=1, some of the bits are backed by
* CNTKCTL_EL1, while the rest is kept in memory.
* Yes, this is fun stuff.
*/
- if (vcpu_el2_e2h_is_set(vcpu))
- write_sysreg_el1(val, SYS_CNTKCTL);
- return;
+ write_sysreg_el1(val, SYS_CNTKCTL);
+ break;
+ default:
+ WARN_ON_ONCE(1);
}
+ }
- /* No EL1 counterpart? We're done here.? */
- if (reg == el1r)
- return;
+ if (loc.loc & SR_LOC_LOADED) {
+ enum vcpu_sysreg map_reg = reg;
+ u64 xlated_val;
- if (!vcpu_el2_e2h_is_set(vcpu) && xlate)
- val = xlate(val);
+ if (reg >= __SANITISED_REG_START__)
+ val = kvm_vcpu_apply_reg_masks(vcpu, reg, val);
- /* Redirect this to the EL1 version of the register. */
- WARN_ON(!__vcpu_write_sys_reg_to_cpu(val, el1r));
- return;
- }
+ if (loc.loc & SR_LOC_MAPPED)
+ map_reg = loc.map_reg;
- /* EL1 register can't be on the CPU if the guest is in vEL2. */
- if (unlikely(is_hyp_ctxt(vcpu)))
- goto memory_write;
+ if (loc.loc & SR_LOC_XLATED)
+ xlated_val = loc.xlate(val);
+ else
+ xlated_val = val;
- if (__vcpu_write_sys_reg_to_cpu(val, reg))
- return;
+ write_sr_to_cpu(map_reg, xlated_val);
+
+ /*
+ * Fall through to write the backing store anyway, which
+ * allows translated registers to be directly read without a
+ * reverse translation.
+ */
+ }
-memory_write:
__vcpu_assign_sys_reg(vcpu, reg, val);
}
@@ -1584,6 +1710,7 @@ static u8 pmuver_to_perfmon(u8 pmuver)
}
static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val);
+static u64 sanitise_id_aa64pfr1_el1(const struct kvm_vcpu *vcpu, u64 val);
static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val);
/* Read a sanitised cpufeature ID register by sys_reg_desc */
@@ -1606,19 +1733,7 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
val = sanitise_id_aa64pfr0_el1(vcpu, val);
break;
case SYS_ID_AA64PFR1_EL1:
- if (!kvm_has_mte(vcpu->kvm)) {
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE);
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE_frac);
- }
-
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME);
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_RNDR_trap);
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_NMI);
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_GCS);
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_THE);
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTEX);
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_PFAR);
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MPAM_frac);
+ val = sanitise_id_aa64pfr1_el1(vcpu, val);
break;
case SYS_ID_AA64PFR2_EL1:
val &= ID_AA64PFR2_EL1_FPMR |
@@ -1628,18 +1743,18 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
break;
case SYS_ID_AA64ISAR1_EL1:
if (!vcpu_has_ptrauth(vcpu))
- val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) |
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) |
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) |
- ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI));
+ val &= ~(ID_AA64ISAR1_EL1_APA |
+ ID_AA64ISAR1_EL1_API |
+ ID_AA64ISAR1_EL1_GPA |
+ ID_AA64ISAR1_EL1_GPI);
break;
case SYS_ID_AA64ISAR2_EL1:
if (!vcpu_has_ptrauth(vcpu))
- val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) |
- ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3));
+ val &= ~(ID_AA64ISAR2_EL1_APA3 |
+ ID_AA64ISAR2_EL1_GPA3);
if (!cpus_have_final_cap(ARM64_HAS_WFXT) ||
has_broken_cntvoff())
- val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_WFxT);
+ val &= ~ID_AA64ISAR2_EL1_WFxT;
break;
case SYS_ID_AA64ISAR3_EL1:
val &= ID_AA64ISAR3_EL1_FPRCVT | ID_AA64ISAR3_EL1_FAMINMAX;
@@ -1655,7 +1770,7 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
ID_AA64MMFR3_EL1_S1PIE;
break;
case SYS_ID_MMFR4_EL1:
- val &= ~ARM64_FEATURE_MASK(ID_MMFR4_EL1_CCIDX);
+ val &= ~ID_MMFR4_EL1_CCIDX;
break;
}
@@ -1836,6 +1951,31 @@ static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val)
return val;
}
+static u64 sanitise_id_aa64pfr1_el1(const struct kvm_vcpu *vcpu, u64 val)
+{
+ u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
+
+ if (!kvm_has_mte(vcpu->kvm)) {
+ val &= ~ID_AA64PFR1_EL1_MTE;
+ val &= ~ID_AA64PFR1_EL1_MTE_frac;
+ }
+
+ if (!(cpus_have_final_cap(ARM64_HAS_RASV1P1_EXTN) &&
+ SYS_FIELD_GET(ID_AA64PFR0_EL1, RAS, pfr0) == ID_AA64PFR0_EL1_RAS_IMP))
+ val &= ~ID_AA64PFR1_EL1_RAS_frac;
+
+ val &= ~ID_AA64PFR1_EL1_SME;
+ val &= ~ID_AA64PFR1_EL1_RNDR_trap;
+ val &= ~ID_AA64PFR1_EL1_NMI;
+ val &= ~ID_AA64PFR1_EL1_GCS;
+ val &= ~ID_AA64PFR1_EL1_THE;
+ val &= ~ID_AA64PFR1_EL1_MTEX;
+ val &= ~ID_AA64PFR1_EL1_PFAR;
+ val &= ~ID_AA64PFR1_EL1_MPAM_frac;
+
+ return val;
+}
+
static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val)
{
val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8);
@@ -2697,6 +2837,18 @@ static bool access_ras(struct kvm_vcpu *vcpu,
struct kvm *kvm = vcpu->kvm;
switch(reg_to_encoding(r)) {
+ case SYS_ERXPFGCDN_EL1:
+ case SYS_ERXPFGCTL_EL1:
+ case SYS_ERXPFGF_EL1:
+ case SYS_ERXMISC2_EL1:
+ case SYS_ERXMISC3_EL1:
+ if (!(kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, V1P1) ||
+ (kvm_has_feat_enum(kvm, ID_AA64PFR0_EL1, RAS, IMP) &&
+ kvm_has_feat(kvm, ID_AA64PFR1_EL1, RAS_frac, RASv1p1)))) {
+ kvm_inject_undefined(vcpu);
+ return false;
+ }
+ break;
default:
if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, IMP)) {
kvm_inject_undefined(vcpu);
@@ -2929,7 +3081,6 @@ static const struct sys_reg_desc sys_reg_descs[] = {
~(ID_AA64PFR0_EL1_AMU |
ID_AA64PFR0_EL1_MPAM |
ID_AA64PFR0_EL1_SVE |
- ID_AA64PFR0_EL1_RAS |
ID_AA64PFR0_EL1_AdvSIMD |
ID_AA64PFR0_EL1_FP)),
ID_FILTERED(ID_AA64PFR1_EL1, id_aa64pfr1_el1,
@@ -2943,7 +3094,6 @@ static const struct sys_reg_desc sys_reg_descs[] = {
ID_AA64PFR1_EL1_SME |
ID_AA64PFR1_EL1_RES0 |
ID_AA64PFR1_EL1_MPAM_frac |
- ID_AA64PFR1_EL1_RAS_frac |
ID_AA64PFR1_EL1_MTE)),
ID_WRITABLE(ID_AA64PFR2_EL1,
ID_AA64PFR2_EL1_FPMR |
@@ -3063,8 +3213,13 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_ERXCTLR_EL1), access_ras },
{ SYS_DESC(SYS_ERXSTATUS_EL1), access_ras },
{ SYS_DESC(SYS_ERXADDR_EL1), access_ras },
+ { SYS_DESC(SYS_ERXPFGF_EL1), access_ras },
+ { SYS_DESC(SYS_ERXPFGCTL_EL1), access_ras },
+ { SYS_DESC(SYS_ERXPFGCDN_EL1), access_ras },
{ SYS_DESC(SYS_ERXMISC0_EL1), access_ras },
{ SYS_DESC(SYS_ERXMISC1_EL1), access_ras },
+ { SYS_DESC(SYS_ERXMISC2_EL1), access_ras },
+ { SYS_DESC(SYS_ERXMISC3_EL1), access_ras },
MTE_REG(TFSR_EL1),
MTE_REG(TFSRE0_EL1),
diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c
index 2684f273d9e1..4c1209261b65 100644
--- a/arch/arm64/kvm/vgic/vgic-debug.c
+++ b/arch/arm64/kvm/vgic/vgic-debug.c
@@ -69,7 +69,7 @@ static int iter_mark_lpis(struct kvm *kvm)
int nr_lpis = 0;
xa_for_each(&dist->lpi_xa, intid, irq) {
- if (!vgic_try_get_irq_kref(irq))
+ if (!vgic_try_get_irq_ref(irq))
continue;
xa_set_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER);
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index 1e680ad6e863..4c3c0d82e476 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -53,7 +53,7 @@ void kvm_vgic_early_init(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
- xa_init_flags(&dist->lpi_xa, XA_FLAGS_LOCK_IRQ);
+ xa_init(&dist->lpi_xa);
}
/* CREATION */
@@ -208,7 +208,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
raw_spin_lock_init(&irq->irq_lock);
irq->vcpu = NULL;
irq->target_vcpu = vcpu0;
- kref_init(&irq->refcount);
+ refcount_set(&irq->refcount, 0);
switch (dist->vgic_model) {
case KVM_DEV_TYPE_ARM_VGIC_V2:
irq->targets = 0;
@@ -277,7 +277,7 @@ static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type)
irq->intid = i;
irq->vcpu = NULL;
irq->target_vcpu = vcpu;
- kref_init(&irq->refcount);
+ refcount_set(&irq->refcount, 0);
if (vgic_irq_is_sgi(i)) {
/* SGIs */
irq->enabled = 1;
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index 7368c13f16b7..ce3e3ed3f29f 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -78,7 +78,6 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
{
struct vgic_dist *dist = &kvm->arch.vgic;
struct vgic_irq *irq = vgic_get_irq(kvm, intid), *oldirq;
- unsigned long flags;
int ret;
/* In this case there is no put, since we keep the reference. */
@@ -89,7 +88,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
if (!irq)
return ERR_PTR(-ENOMEM);
- ret = xa_reserve_irq(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT);
+ ret = xa_reserve(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT);
if (ret) {
kfree(irq);
return ERR_PTR(ret);
@@ -99,19 +98,19 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
raw_spin_lock_init(&irq->irq_lock);
irq->config = VGIC_CONFIG_EDGE;
- kref_init(&irq->refcount);
+ refcount_set(&irq->refcount, 1);
irq->intid = intid;
irq->target_vcpu = vcpu;
irq->group = 1;
- xa_lock_irqsave(&dist->lpi_xa, flags);
+ xa_lock(&dist->lpi_xa);
/*
* There could be a race with another vgic_add_lpi(), so we need to
* check that we don't add a second list entry with the same LPI.
*/
oldirq = xa_load(&dist->lpi_xa, intid);
- if (vgic_try_get_irq_kref(oldirq)) {
+ if (vgic_try_get_irq_ref(oldirq)) {
/* Someone was faster with adding this LPI, lets use that. */
kfree(irq);
irq = oldirq;
@@ -126,7 +125,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
}
out_unlock:
- xa_unlock_irqrestore(&dist->lpi_xa, flags);
+ xa_unlock(&dist->lpi_xa);
if (ret)
return ERR_PTR(ret);
@@ -547,7 +546,7 @@ static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db,
rcu_read_lock();
irq = xa_load(&its->translation_cache, cache_key);
- if (!vgic_try_get_irq_kref(irq))
+ if (!vgic_try_get_irq_ref(irq))
irq = NULL;
rcu_read_unlock();
@@ -571,7 +570,7 @@ static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
* its_lock, as the ITE (and the reference it holds) cannot be freed.
*/
lockdep_assert_held(&its->its_lock);
- vgic_get_irq_kref(irq);
+ vgic_get_irq_ref(irq);
old = xa_store(&its->translation_cache, cache_key, irq, GFP_KERNEL_ACCOUNT);
diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
index a3ef185209e9..70d50c77e5dc 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
@@ -50,6 +50,14 @@ bool vgic_has_its(struct kvm *kvm)
bool vgic_supports_direct_msis(struct kvm *kvm)
{
+ /*
+ * Deliberately conflate vLPI and vSGI support on GICv4.1 hardware,
+ * indirectly allowing userspace to control whether or not vPEs are
+ * allocated for the VM.
+ */
+ if (system_supports_direct_sgis() && !vgic_supports_direct_sgis(kvm))
+ return false;
+
return kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm);
}
diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c
index e416e433baff..a573b1f0c6cb 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio.c
@@ -1091,7 +1091,7 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
len = vgic_v3_init_dist_iodev(io_device);
break;
default:
- BUG_ON(1);
+ BUG();
}
io_device->base_addr = dist_base_address;
diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index 4d9343d2b0b1..548aec9d5a72 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -518,7 +518,7 @@ static struct vgic_irq *__vgic_host_irq_get_vlpi(struct kvm *kvm, int host_irq)
if (!irq->hw || irq->host_irq != host_irq)
continue;
- if (!vgic_try_get_irq_kref(irq))
+ if (!vgic_try_get_irq_ref(irq))
return NULL;
return irq;
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index f5148b38120a..6dd5a10081e2 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -28,8 +28,8 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = {
* kvm->arch.config_lock (mutex)
* its->cmd_lock (mutex)
* its->its_lock (mutex)
- * vgic_cpu->ap_list_lock must be taken with IRQs disabled
- * vgic_dist->lpi_xa.xa_lock must be taken with IRQs disabled
+ * vgic_dist->lpi_xa.xa_lock
+ * vgic_cpu->ap_list_lock must be taken with IRQs disabled
* vgic_irq->irq_lock must be taken with IRQs disabled
*
* As the ap_list_lock might be taken from the timer interrupt handler,
@@ -71,7 +71,7 @@ static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid)
rcu_read_lock();
irq = xa_load(&dist->lpi_xa, intid);
- if (!vgic_try_get_irq_kref(irq))
+ if (!vgic_try_get_irq_ref(irq))
irq = NULL;
rcu_read_unlock();
@@ -114,37 +114,66 @@ struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid)
return vgic_get_irq(vcpu->kvm, intid);
}
-/*
- * We can't do anything in here, because we lack the kvm pointer to
- * lock and remove the item from the lpi_list. So we keep this function
- * empty and use the return value of kref_put() to trigger the freeing.
- */
-static void vgic_irq_release(struct kref *ref)
+static void vgic_release_lpi_locked(struct vgic_dist *dist, struct vgic_irq *irq)
+{
+ lockdep_assert_held(&dist->lpi_xa.xa_lock);
+ __xa_erase(&dist->lpi_xa, irq->intid);
+ kfree_rcu(irq, rcu);
+}
+
+static __must_check bool __vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
+{
+ if (irq->intid < VGIC_MIN_LPI)
+ return false;
+
+ return refcount_dec_and_test(&irq->refcount);
+}
+
+static __must_check bool vgic_put_irq_norelease(struct kvm *kvm, struct vgic_irq *irq)
{
+ if (!__vgic_put_irq(kvm, irq))
+ return false;
+
+ irq->pending_release = true;
+ return true;
}
void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
{
struct vgic_dist *dist = &kvm->arch.vgic;
- unsigned long flags;
- if (irq->intid < VGIC_MIN_LPI)
- return;
+ if (irq->intid >= VGIC_MIN_LPI)
+ might_lock(&dist->lpi_xa.xa_lock);
- if (!kref_put(&irq->refcount, vgic_irq_release))
+ if (!__vgic_put_irq(kvm, irq))
return;
- xa_lock_irqsave(&dist->lpi_xa, flags);
- __xa_erase(&dist->lpi_xa, irq->intid);
- xa_unlock_irqrestore(&dist->lpi_xa, flags);
+ xa_lock(&dist->lpi_xa);
+ vgic_release_lpi_locked(dist, irq);
+ xa_unlock(&dist->lpi_xa);
+}
- kfree_rcu(irq, rcu);
+static void vgic_release_deleted_lpis(struct kvm *kvm)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ unsigned long intid;
+ struct vgic_irq *irq;
+
+ xa_lock(&dist->lpi_xa);
+
+ xa_for_each(&dist->lpi_xa, intid, irq) {
+ if (irq->pending_release)
+ vgic_release_lpi_locked(dist, irq);
+ }
+
+ xa_unlock(&dist->lpi_xa);
}
void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
struct vgic_irq *irq, *tmp;
+ bool deleted = false;
unsigned long flags;
raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
@@ -155,11 +184,14 @@ void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu)
list_del(&irq->ap_list);
irq->vcpu = NULL;
raw_spin_unlock(&irq->irq_lock);
- vgic_put_irq(vcpu->kvm, irq);
+ deleted |= vgic_put_irq_norelease(vcpu->kvm, irq);
}
}
raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
+
+ if (deleted)
+ vgic_release_deleted_lpis(vcpu->kvm);
}
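
A user-space approximation of the two-phase put used above, with C11 atomics standing in for refcount_t. It only illustrates the shape of the pattern (drop the reference while the IRQ-disabled lock is held, defer the erase/free to a later reap pass); the types and names are invented for the sketch.

	#include <stdatomic.h>
	#include <stdbool.h>

	struct fake_lpi {
		atomic_int refcount;
		bool pending_release;
	};

	/* Phase 1: safe with IRQs disabled; never frees, only marks. */
	static bool fake_put_norelease(struct fake_lpi *irq)
	{
		if (atomic_fetch_sub(&irq->refcount, 1) != 1)
			return false;

		irq->pending_release = true;
		return true;
	}

	/* Phase 2: run later, outside the IRQ-disabled region, to reap
	 * everything that phase 1 marked (erase + kfree_rcu() in the real code). */
	static void fake_release_deleted(struct fake_lpi *table, int n)
	{
		for (int i = 0; i < n; i++) {
			if (table[i].pending_release)
				table[i].pending_release = false;
		}
	}

	int main(void)
	{
		struct fake_lpi lpi = { .refcount = 1 };

		if (fake_put_norelease(&lpi))
			fake_release_deleted(&lpi, 1);
		return 0;
	}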
void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending)
@@ -399,7 +431,7 @@ retry:
* now in the ap_list. This is safe as the caller must already hold a
* reference on the irq.
*/
- vgic_get_irq_kref(irq);
+ vgic_get_irq_ref(irq);
list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
irq->vcpu = vcpu;
@@ -630,6 +662,7 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
struct vgic_irq *irq, *tmp;
+ bool deleted_lpis = false;
DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
@@ -657,12 +690,12 @@ retry:
/*
* This vgic_put_irq call matches the
- * vgic_get_irq_kref in vgic_queue_irq_unlock,
+ * vgic_get_irq_ref in vgic_queue_irq_unlock,
* where we added the LPI to the ap_list. As
 * we remove the irq from the list, we also
 * drop the refcount.
*/
- vgic_put_irq(vcpu->kvm, irq);
+ deleted_lpis |= vgic_put_irq_norelease(vcpu->kvm, irq);
continue;
}
@@ -725,6 +758,9 @@ retry:
}
raw_spin_unlock(&vgic_cpu->ap_list_lock);
+
+ if (unlikely(deleted_lpis))
+ vgic_release_deleted_lpis(vcpu->kvm);
}
static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu)
@@ -818,7 +854,7 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu)
* the AP list has been sorted already.
*/
if (multi_sgi && irq->priority > prio) {
- _raw_spin_unlock(&irq->irq_lock);
+ raw_spin_unlock(&irq->irq_lock);
break;
}
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index 1384a04c0784..ac5f9c5d2b98 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -267,7 +267,7 @@ void vgic_v2_put(struct kvm_vcpu *vcpu);
void vgic_v2_save_state(struct kvm_vcpu *vcpu);
void vgic_v2_restore_state(struct kvm_vcpu *vcpu);
-static inline bool vgic_try_get_irq_kref(struct vgic_irq *irq)
+static inline bool vgic_try_get_irq_ref(struct vgic_irq *irq)
{
if (!irq)
return false;
@@ -275,12 +275,12 @@ static inline bool vgic_try_get_irq_kref(struct vgic_irq *irq)
if (irq->intid < VGIC_MIN_LPI)
return true;
- return kref_get_unless_zero(&irq->refcount);
+ return refcount_inc_not_zero(&irq->refcount);
}
-static inline void vgic_get_irq_kref(struct vgic_irq *irq)
+static inline void vgic_get_irq_ref(struct vgic_irq *irq)
{
- WARN_ON_ONCE(!vgic_try_get_irq_kref(irq));
+ WARN_ON_ONCE(!vgic_try_get_irq_ref(irq));
}
void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
@@ -396,15 +396,7 @@ bool vgic_supports_direct_sgis(struct kvm *kvm);
static inline bool vgic_supports_direct_irqs(struct kvm *kvm)
{
- /*
- * Deliberately conflate vLPI and vSGI support on GICv4.1 hardware,
- * indirectly allowing userspace to control whether or not vPEs are
- * allocated for the VM.
- */
- if (system_supports_direct_sgis())
- return vgic_supports_direct_sgis(kvm);
-
- return vgic_supports_direct_msis(kvm);
+ return vgic_supports_direct_msis(kvm) || vgic_supports_direct_sgis(kvm);
}
int vgic_v4_init(struct kvm *kvm);
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index ea84a61ed508..524d34a0e921 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -243,7 +243,7 @@ void __init arm64_memblock_init(void)
*/
if (memory_limit != PHYS_ADDR_MAX) {
memblock_mem_limit_remove_map(memory_limit);
- memblock_add(__pa_symbol(_text), (u64)(_end - _text));
+ memblock_add(__pa_symbol(_text), (resource_size_t)(_end - _text));
}
if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
@@ -252,8 +252,8 @@ void __init arm64_memblock_init(void)
* initrd to become inaccessible via the linear mapping.
* Otherwise, this is a no-op
*/
- u64 base = phys_initrd_start & PAGE_MASK;
- u64 size = PAGE_ALIGN(phys_initrd_start + phys_initrd_size) - base;
+ phys_addr_t base = phys_initrd_start & PAGE_MASK;
+ resource_size_t size = PAGE_ALIGN(phys_initrd_start + phys_initrd_size) - base;
/*
* We can only add back the initrd memory if we don't end up
@@ -279,7 +279,7 @@ void __init arm64_memblock_init(void)
* Register the kernel text, kernel data, initrd, and initial
* pagetables with memblock.
*/
- memblock_reserve(__pa_symbol(_stext), _end - _stext);
+ memblock_reserve(__pa_symbol(_text), _end - _text);
if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
/* the generic initrd code expects virtual addresses */
initrd_start = __phys_to_virt(phys_initrd_start);
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 34e5d78af076..3a444a5fe469 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -27,6 +27,8 @@
#include <linux/kfence.h>
#include <linux/pkeys.h>
#include <linux/mm_inline.h>
+#include <linux/pagewalk.h>
+#include <linux/stop_machine.h>
#include <asm/barrier.h>
#include <asm/cputype.h>
@@ -47,12 +49,7 @@
#define NO_CONT_MAPPINGS BIT(1)
#define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */
-enum pgtable_type {
- TABLE_PTE,
- TABLE_PMD,
- TABLE_PUD,
- TABLE_P4D,
-};
+DEFINE_STATIC_KEY_FALSE(arm64_ptdump_lock_key);
u64 kimage_voffset __ro_after_init;
EXPORT_SYMBOL(kimage_voffset);
@@ -481,14 +478,18 @@ void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
int flags);
#endif
-static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm,
+#define INVALID_PHYS_ADDR (-1ULL)
+
+static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp,
enum pgtable_type pgtable_type)
{
/* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */
- struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0);
+ struct ptdesc *ptdesc = pagetable_alloc(gfp & ~__GFP_ZERO, 0);
phys_addr_t pa;
- BUG_ON(!ptdesc);
+ if (!ptdesc)
+ return INVALID_PHYS_ADDR;
+
pa = page_to_phys(ptdesc_page(ptdesc));
switch (pgtable_type) {
@@ -509,16 +510,392 @@ static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm,
return pa;
}
+static phys_addr_t
+try_pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type, gfp_t gfp)
+{
+ return __pgd_pgtable_alloc(&init_mm, gfp, pgtable_type);
+}
+
static phys_addr_t __maybe_unused
pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type)
{
- return __pgd_pgtable_alloc(&init_mm, pgtable_type);
+ phys_addr_t pa;
+
+ pa = __pgd_pgtable_alloc(&init_mm, GFP_PGTABLE_KERNEL, pgtable_type);
+ BUG_ON(pa == INVALID_PHYS_ADDR);
+ return pa;
}
static phys_addr_t
pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type)
{
- return __pgd_pgtable_alloc(NULL, pgtable_type);
+ phys_addr_t pa;
+
+ pa = __pgd_pgtable_alloc(NULL, GFP_PGTABLE_KERNEL, pgtable_type);
+ BUG_ON(pa == INVALID_PHYS_ADDR);
+ return pa;
+}
+
+static void split_contpte(pte_t *ptep)
+{
+ int i;
+
+ ptep = PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES);
+ for (i = 0; i < CONT_PTES; i++, ptep++)
+ __set_pte(ptep, pte_mknoncont(__ptep_get(ptep)));
+}
+
+static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
+{
+ pmdval_t tableprot = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF;
+ unsigned long pfn = pmd_pfn(pmd);
+ pgprot_t prot = pmd_pgprot(pmd);
+ phys_addr_t pte_phys;
+ pte_t *ptep;
+ int i;
+
+ pte_phys = try_pgd_pgtable_alloc_init_mm(TABLE_PTE, gfp);
+ if (pte_phys == INVALID_PHYS_ADDR)
+ return -ENOMEM;
+ ptep = (pte_t *)phys_to_virt(pte_phys);
+
+ if (pgprot_val(prot) & PMD_SECT_PXN)
+ tableprot |= PMD_TABLE_PXN;
+
+ prot = __pgprot((pgprot_val(prot) & ~PTE_TYPE_MASK) | PTE_TYPE_PAGE);
+ prot = __pgprot(pgprot_val(prot) & ~PTE_CONT);
+ if (to_cont)
+ prot = __pgprot(pgprot_val(prot) | PTE_CONT);
+
+ for (i = 0; i < PTRS_PER_PTE; i++, ptep++, pfn++)
+ __set_pte(ptep, pfn_pte(pfn, prot));
+
+ /*
+ * Ensure the pte entries are visible to the table walker by the time
+ * the pmd entry that points to the ptes is visible.
+ */
+ dsb(ishst);
+ __pmd_populate(pmdp, pte_phys, tableprot);
+
+ return 0;
+}
+
+static void split_contpmd(pmd_t *pmdp)
+{
+ int i;
+
+ pmdp = PTR_ALIGN_DOWN(pmdp, sizeof(*pmdp) * CONT_PMDS);
+ for (i = 0; i < CONT_PMDS; i++, pmdp++)
+ set_pmd(pmdp, pmd_mknoncont(pmdp_get(pmdp)));
+}
+
+static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont)
+{
+ pudval_t tableprot = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF;
+ unsigned int step = PMD_SIZE >> PAGE_SHIFT;
+ unsigned long pfn = pud_pfn(pud);
+ pgprot_t prot = pud_pgprot(pud);
+ phys_addr_t pmd_phys;
+ pmd_t *pmdp;
+ int i;
+
+ pmd_phys = try_pgd_pgtable_alloc_init_mm(TABLE_PMD, gfp);
+ if (pmd_phys == INVALID_PHYS_ADDR)
+ return -ENOMEM;
+ pmdp = (pmd_t *)phys_to_virt(pmd_phys);
+
+ if (pgprot_val(prot) & PMD_SECT_PXN)
+ tableprot |= PUD_TABLE_PXN;
+
+ prot = __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) | PMD_TYPE_SECT);
+ prot = __pgprot(pgprot_val(prot) & ~PTE_CONT);
+ if (to_cont)
+ prot = __pgprot(pgprot_val(prot) | PTE_CONT);
+
+ for (i = 0; i < PTRS_PER_PMD; i++, pmdp++, pfn += step)
+ set_pmd(pmdp, pfn_pmd(pfn, prot));
+
+ /*
+ * Ensure the pmd entries are visible to the table walker by the time
+ * the pud entry that points to the pmds is visible.
+ */
+ dsb(ishst);
+ __pud_populate(pudp, pmd_phys, tableprot);
+
+ return 0;
+}
+
+static int split_kernel_leaf_mapping_locked(unsigned long addr)
+{
+ pgd_t *pgdp, pgd;
+ p4d_t *p4dp, p4d;
+ pud_t *pudp, pud;
+ pmd_t *pmdp, pmd;
+ pte_t *ptep, pte;
+ int ret = 0;
+
+ /*
+ * PGD: If addr is PGD aligned then addr already describes a leaf
+ * boundary. If not present then there is nothing to split.
+ */
+ if (ALIGN_DOWN(addr, PGDIR_SIZE) == addr)
+ goto out;
+ pgdp = pgd_offset_k(addr);
+ pgd = pgdp_get(pgdp);
+ if (!pgd_present(pgd))
+ goto out;
+
+ /*
+ * P4D: If addr is P4D aligned then addr already describes a leaf
+ * boundary. If not present then there is nothing to split.
+ */
+ if (ALIGN_DOWN(addr, P4D_SIZE) == addr)
+ goto out;
+ p4dp = p4d_offset(pgdp, addr);
+ p4d = p4dp_get(p4dp);
+ if (!p4d_present(p4d))
+ goto out;
+
+ /*
+ * PUD: If addr is PUD aligned then addr already describes a leaf
+ * boundary. If not present then there is nothing to split. Otherwise,
+ * if we have a pud leaf, split to contpmd.
+ */
+ if (ALIGN_DOWN(addr, PUD_SIZE) == addr)
+ goto out;
+ pudp = pud_offset(p4dp, addr);
+ pud = pudp_get(pudp);
+ if (!pud_present(pud))
+ goto out;
+ if (pud_leaf(pud)) {
+ ret = split_pud(pudp, pud, GFP_PGTABLE_KERNEL, true);
+ if (ret)
+ goto out;
+ }
+
+ /*
+ * CONTPMD: If addr is CONTPMD aligned then addr already describes a
+ * leaf boundary. If not present then there is nothing to split.
+ * Otherwise, if we have a contpmd leaf, split to pmd.
+ */
+ if (ALIGN_DOWN(addr, CONT_PMD_SIZE) == addr)
+ goto out;
+ pmdp = pmd_offset(pudp, addr);
+ pmd = pmdp_get(pmdp);
+ if (!pmd_present(pmd))
+ goto out;
+ if (pmd_leaf(pmd)) {
+ if (pmd_cont(pmd))
+ split_contpmd(pmdp);
+ /*
+ * PMD: If addr is PMD aligned then addr already describes a
+ * leaf boundary. Otherwise, split to contpte.
+ */
+ if (ALIGN_DOWN(addr, PMD_SIZE) == addr)
+ goto out;
+ ret = split_pmd(pmdp, pmd, GFP_PGTABLE_KERNEL, true);
+ if (ret)
+ goto out;
+ }
+
+ /*
+ * CONTPTE: If addr is CONTPTE aligned then addr already describes a
+ * leaf boundary. If not present then there is nothing to split.
+ * Otherwise, if we have a contpte leaf, split to pte.
+ */
+ if (ALIGN_DOWN(addr, CONT_PTE_SIZE) == addr)
+ goto out;
+ ptep = pte_offset_kernel(pmdp, addr);
+ pte = __ptep_get(ptep);
+ if (!pte_present(pte))
+ goto out;
+ if (pte_cont(pte))
+ split_contpte(ptep);
+
+out:
+ return ret;
+}
+
+static DEFINE_MUTEX(pgtable_split_lock);
+
+int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
+{
+ int ret;
+
+ /*
+ * !BBML2_NOABORT systems should not be trying to change permissions on
+ * anything that is not pte-mapped in the first place. Just return early
+ * and let the permission change code raise a warning if not already
+ * pte-mapped.
+ */
+ if (!system_supports_bbml2_noabort())
+ return 0;
+
+ /*
+ * Ensure start and end are at least page-aligned since this is the
+ * finest granularity we can split to.
+ */
+ if (start != PAGE_ALIGN(start) || end != PAGE_ALIGN(end))
+ return -EINVAL;
+
+ mutex_lock(&pgtable_split_lock);
+ arch_enter_lazy_mmu_mode();
+
+ /*
+	 * split_kernel_leaf_mapping_locked() may sleep; this is not a
+	 * problem on arm64 since its lazy MMU implementation allows
+	 * sleeping.
+ *
+ * Optimize for the common case of splitting out a single page from a
+ * larger mapping. Here we can just split on the "least aligned" of
+ * start and end and this will guarantee that there must also be a split
+	 * on the more aligned address since both addresses must be in the
+ * same contpte block and it must have been split to ptes.
+ */
+ if (end - start == PAGE_SIZE) {
+ start = __ffs(start) < __ffs(end) ? start : end;
+ ret = split_kernel_leaf_mapping_locked(start);
+ } else {
+ ret = split_kernel_leaf_mapping_locked(start);
+ if (!ret)
+ ret = split_kernel_leaf_mapping_locked(end);
+ }
+
+ arch_leave_lazy_mmu_mode();
+ mutex_unlock(&pgtable_split_lock);
+ return ret;
+}
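
A worked stand-alone example of the single-page fast path above, using __builtin_ctzl() as a stand-in for the kernel's __ffs(); the addresses are invented.

	#include <stdio.h>

	int main(void)
	{
		unsigned long start = 0x40201000UL;	/* 4K-aligned: lowest set bit is 12 */
		unsigned long end   = 0x40202000UL;	/* 8K-aligned: lowest set bit is 13 */
		unsigned long split;

		/*
		 * Split on the less aligned of the two addresses; the more aligned
		 * one then necessarily lands on a boundary of the freshly split
		 * (cont)pte block, so a single split suffices.
		 */
		split = __builtin_ctzl(start) < __builtin_ctzl(end) ? start : end;

		printf("split the leaf mapping at %#lx\n", split);
		return 0;
	}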
+
+static int __init split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr,
+ unsigned long next,
+ struct mm_walk *walk)
+{
+ pud_t pud = pudp_get(pudp);
+ int ret = 0;
+
+ if (pud_leaf(pud))
+ ret = split_pud(pudp, pud, GFP_ATOMIC, false);
+
+ return ret;
+}
+
+static int __init split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr,
+ unsigned long next,
+ struct mm_walk *walk)
+{
+ pmd_t pmd = pmdp_get(pmdp);
+ int ret = 0;
+
+ if (pmd_leaf(pmd)) {
+ if (pmd_cont(pmd))
+ split_contpmd(pmdp);
+ ret = split_pmd(pmdp, pmd, GFP_ATOMIC, false);
+
+ /*
+ * We have split the pmd directly to ptes so there is no need to
+ * visit each pte to check if they are contpte.
+ */
+ walk->action = ACTION_CONTINUE;
+ }
+
+ return ret;
+}
+
+static int __init split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr,
+ unsigned long next,
+ struct mm_walk *walk)
+{
+ pte_t pte = __ptep_get(ptep);
+
+ if (pte_cont(pte))
+ split_contpte(ptep);
+
+ return 0;
+}
+
+static const struct mm_walk_ops split_to_ptes_ops __initconst = {
+ .pud_entry = split_to_ptes_pud_entry,
+ .pmd_entry = split_to_ptes_pmd_entry,
+ .pte_entry = split_to_ptes_pte_entry,
+};
+
+static bool linear_map_requires_bbml2 __initdata;
+
+u32 idmap_kpti_bbml2_flag;
+
+void __init init_idmap_kpti_bbml2_flag(void)
+{
+ WRITE_ONCE(idmap_kpti_bbml2_flag, 1);
+ /* Must be visible to other CPUs before stop_machine() is called. */
+ smp_mb();
+}
+
+static int __init linear_map_split_to_ptes(void *__unused)
+{
+ /*
+ * Repainting the linear map must be done by CPU0 (the boot CPU) because
+ * that's the only CPU that we know supports BBML2. The other CPUs will
+ * be held in a waiting area with the idmap active.
+ */
+ if (!smp_processor_id()) {
+ unsigned long lstart = _PAGE_OFFSET(vabits_actual);
+ unsigned long lend = PAGE_END;
+ unsigned long kstart = (unsigned long)lm_alias(_stext);
+ unsigned long kend = (unsigned long)lm_alias(__init_begin);
+ int ret;
+
+ /*
+ * Wait for all secondary CPUs to be put into the waiting area.
+ */
+ smp_cond_load_acquire(&idmap_kpti_bbml2_flag, VAL == num_online_cpus());
+
+ /*
+ * Walk all of the linear map [lstart, lend), except the kernel
+ * linear map alias [kstart, kend), and split all mappings to
+ * PTE. The kernel alias remains static throughout runtime so
+ * can continue to be safely mapped with large mappings.
+ */
+ ret = walk_kernel_page_table_range_lockless(lstart, kstart,
+ &split_to_ptes_ops, NULL, NULL);
+ if (!ret)
+ ret = walk_kernel_page_table_range_lockless(kend, lend,
+ &split_to_ptes_ops, NULL, NULL);
+ if (ret)
+ panic("Failed to split linear map\n");
+ flush_tlb_kernel_range(lstart, lend);
+
+ /*
+ * Relies on dsb in flush_tlb_kernel_range() to avoid reordering
+ * before any page table split operations.
+ */
+ WRITE_ONCE(idmap_kpti_bbml2_flag, 0);
+ } else {
+ typedef void (wait_split_fn)(void);
+ extern wait_split_fn wait_linear_map_split_to_ptes;
+ wait_split_fn *wait_fn;
+
+ wait_fn = (void *)__pa_symbol(wait_linear_map_split_to_ptes);
+
+ /*
+ * At least one secondary CPU doesn't support BBML2 so cannot
+ * tolerate the size of the live mappings changing. So have the
+ * secondary CPUs wait for the boot CPU to make the changes
+ * with the idmap active and init_mm inactive.
+ */
+ cpu_install_idmap();
+ wait_fn();
+ cpu_uninstall_idmap();
+ }
+
+ return 0;
+}
+
+void __init linear_map_maybe_split_to_ptes(void)
+{
+ if (linear_map_requires_bbml2 && !system_supports_bbml2_noabort()) {
+ init_idmap_kpti_bbml2_flag();
+ stop_machine(linear_map_split_to_ptes, NULL, cpu_online_mask);
+ }
}
/*
@@ -581,8 +958,8 @@ void __init mark_linear_text_alias_ro(void)
/*
* Remove the write permissions from the linear alias of .text/.rodata
*/
- update_mapping_prot(__pa_symbol(_stext), (unsigned long)lm_alias(_stext),
- (unsigned long)__init_begin - (unsigned long)_stext,
+ update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text),
+ (unsigned long)__init_begin - (unsigned long)_text,
PAGE_KERNEL_RO);
}
@@ -640,10 +1017,20 @@ static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) {
#endif /* CONFIG_KFENCE */
+static inline bool force_pte_mapping(void)
+{
+ bool bbml2 = system_capabilities_finalized() ?
+ system_supports_bbml2_noabort() : cpu_supports_bbml2_noabort();
+
+ return (!bbml2 && (rodata_full || arm64_kfence_can_set_direct_map() ||
+ is_realm_world())) ||
+ debug_pagealloc_enabled();
+}
+
static void __init map_mem(pgd_t *pgdp)
{
static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
- phys_addr_t kernel_start = __pa_symbol(_stext);
+ phys_addr_t kernel_start = __pa_symbol(_text);
phys_addr_t kernel_end = __pa_symbol(__init_begin);
phys_addr_t start, end;
phys_addr_t early_kfence_pool;
@@ -665,7 +1052,9 @@ static void __init map_mem(pgd_t *pgdp)
early_kfence_pool = arm64_kfence_alloc_pool();
- if (can_set_direct_map())
+ linear_map_requires_bbml2 = !force_pte_mapping() && can_set_direct_map();
+
+ if (force_pte_mapping())
flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
/*
@@ -690,7 +1079,7 @@ static void __init map_mem(pgd_t *pgdp)
}
/*
- * Map the linear alias of the [_stext, __init_begin) interval
+ * Map the linear alias of the [_text, __init_begin) interval
* as non-executable now, and remove the write permission in
* mark_linear_text_alias_ro() below (which will be called after
* alternative patching has completed). This makes the contents
@@ -717,6 +1106,10 @@ void mark_rodata_ro(void)
WRITE_ONCE(rodata_is_rw, false);
update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata,
section_size, PAGE_KERNEL_RO);
+ /* mark the range between _text and _stext as read only. */
+ update_mapping_prot(__pa_symbol(_text), (unsigned long)_text,
+ (unsigned long)_stext - (unsigned long)_text,
+ PAGE_KERNEL_RO);
}
static void __init declare_vma(struct vm_struct *vma,
@@ -787,38 +1180,41 @@ static void __init declare_kernel_vmas(void)
{
static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT];
- declare_vma(&vmlinux_seg[0], _stext, _etext, VM_NO_GUARD);
+ declare_vma(&vmlinux_seg[0], _text, _etext, VM_NO_GUARD);
declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD);
declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD);
declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD);
declare_vma(&vmlinux_seg[4], _data, _end, 0);
}
-void __pi_map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot,
- int level, pte_t *tbl, bool may_use_cont, u64 va_offset);
+void __pi_map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa,
+ pgprot_t prot, int level, pte_t *tbl, bool may_use_cont,
+ u64 va_offset);
static u8 idmap_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init,
- kpti_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init;
+ kpti_bbml2_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init;
static void __init create_idmap(void)
{
- u64 start = __pa_symbol(__idmap_text_start);
- u64 end = __pa_symbol(__idmap_text_end);
- u64 ptep = __pa_symbol(idmap_ptes);
+ phys_addr_t start = __pa_symbol(__idmap_text_start);
+ phys_addr_t end = __pa_symbol(__idmap_text_end);
+ phys_addr_t ptep = __pa_symbol(idmap_ptes);
__pi_map_range(&ptep, start, end, start, PAGE_KERNEL_ROX,
IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false,
__phys_to_virt(ptep) - ptep);
- if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && !arm64_use_ng_mappings) {
- extern u32 __idmap_kpti_flag;
- u64 pa = __pa_symbol(&__idmap_kpti_flag);
+ if (linear_map_requires_bbml2 ||
+ (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && !arm64_use_ng_mappings)) {
+ phys_addr_t pa = __pa_symbol(&idmap_kpti_bbml2_flag);
/*
* The KPTI G-to-nG conversion code needs a read-write mapping
- * of its synchronization flag in the ID map.
+ * of its synchronization flag in the ID map. This is also used
+ * when splitting the linear map to ptes if a secondary CPU
+ * doesn't support bbml2.
*/
- ptep = __pa_symbol(kpti_ptes);
+ ptep = __pa_symbol(kpti_bbml2_ptes);
__pi_map_range(&ptep, pa, pa + sizeof(u32), pa, PAGE_KERNEL,
IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false,
__phys_to_virt(ptep) - ptep);
@@ -1268,7 +1664,8 @@ int pmd_clear_huge(pmd_t *pmdp)
return 1;
}
-int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
+static int __pmd_free_pte_page(pmd_t *pmdp, unsigned long addr,
+ bool acquire_mmap_lock)
{
pte_t *table;
pmd_t pmd;
@@ -1280,13 +1677,25 @@ int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
return 1;
}
+ /* See comment in pud_free_pmd_page for static key logic */
table = pte_offset_kernel(pmdp, addr);
pmd_clear(pmdp);
__flush_tlb_kernel_pgtable(addr);
+ if (static_branch_unlikely(&arm64_ptdump_lock_key) && acquire_mmap_lock) {
+ mmap_read_lock(&init_mm);
+ mmap_read_unlock(&init_mm);
+ }
+
pte_free_kernel(NULL, table);
return 1;
}
+int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
+{
+ /* If ptdump is walking the pagetables, acquire init_mm.mmap_lock */
+ return __pmd_free_pte_page(pmdp, addr, /* acquire_mmap_lock = */ true);
+}
+
int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
{
pmd_t *table;
@@ -1302,16 +1711,36 @@ int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
}
table = pmd_offset(pudp, addr);
+
+ /*
+ * Our objective is to prevent ptdump from reading a PMD table which has
+ * been freed. In this race, if pud_free_pmd_page observes the key on
+ * (which got flipped by ptdump) then the mmap lock sequence here will,
+ * as a result of the mmap write lock/unlock sequence in ptdump, give
+	 * us the correct synchronization. If not, this means that ptdump has
+	 * not yet started walking the pagetables - the sequence of barriers
+ * issued by __flush_tlb_kernel_pgtable() guarantees that ptdump will
+ * observe an empty PUD.
+ */
+ pud_clear(pudp);
+ __flush_tlb_kernel_pgtable(addr);
+ if (static_branch_unlikely(&arm64_ptdump_lock_key)) {
+ mmap_read_lock(&init_mm);
+ mmap_read_unlock(&init_mm);
+ }
+
pmdp = table;
next = addr;
end = addr + PUD_SIZE;
do {
if (pmd_present(pmdp_get(pmdp)))
- pmd_free_pte_page(pmdp, next);
+ /*
+ * PMD has been isolated, so ptdump won't see it. No
+ * need to acquire init_mm.mmap_lock.
+ */
+ __pmd_free_pte_page(pmdp, next, /* acquire_mmap_lock = */ false);
} while (pmdp++, next += PMD_SIZE, next != end);
- pud_clear(pudp);
- __flush_tlb_kernel_pgtable(addr);
pmd_free(NULL, table);
return 1;
}
@@ -1331,8 +1760,8 @@ static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
struct range arch_get_mappable_range(void)
{
struct range mhp_range;
- u64 start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual));
- u64 end_linear_pa = __pa(PAGE_END - 1);
+ phys_addr_t start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual));
+ phys_addr_t end_linear_pa = __pa(PAGE_END - 1);
if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
/*
@@ -1367,7 +1796,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
VM_BUG_ON(!mhp_range_allowed(start, size, true));
- if (can_set_direct_map())
+ if (force_pte_mapping())
flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index 04d4a8f676db..5135f2d66958 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -8,6 +8,7 @@
#include <linux/mem_encrypt.h>
#include <linux/sched.h>
#include <linux/vmalloc.h>
+#include <linux/pagewalk.h>
#include <asm/cacheflush.h>
#include <asm/pgtable-prot.h>
@@ -20,7 +21,66 @@ struct page_change_data {
pgprot_t clear_mask;
};
-bool rodata_full __ro_after_init = IS_ENABLED(CONFIG_RODATA_FULL_DEFAULT_ENABLED);
+static ptdesc_t set_pageattr_masks(ptdesc_t val, struct mm_walk *walk)
+{
+ struct page_change_data *masks = walk->private;
+
+ val &= ~(pgprot_val(masks->clear_mask));
+ val |= (pgprot_val(masks->set_mask));
+
+ return val;
+}
+
+static int pageattr_pud_entry(pud_t *pud, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pud_t val = pudp_get(pud);
+
+ if (pud_sect(val)) {
+ if (WARN_ON_ONCE((next - addr) != PUD_SIZE))
+ return -EINVAL;
+ val = __pud(set_pageattr_masks(pud_val(val), walk));
+ set_pud(pud, val);
+ walk->action = ACTION_CONTINUE;
+ }
+
+ return 0;
+}
+
+static int pageattr_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pmd_t val = pmdp_get(pmd);
+
+ if (pmd_sect(val)) {
+ if (WARN_ON_ONCE((next - addr) != PMD_SIZE))
+ return -EINVAL;
+ val = __pmd(set_pageattr_masks(pmd_val(val), walk));
+ set_pmd(pmd, val);
+ walk->action = ACTION_CONTINUE;
+ }
+
+ return 0;
+}
+
+static int pageattr_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pte_t val = __ptep_get(pte);
+
+ val = __pte(set_pageattr_masks(pte_val(val), walk));
+ __set_pte(pte, val);
+
+ return 0;
+}
+
+static const struct mm_walk_ops pageattr_ops = {
+ .pud_entry = pageattr_pud_entry,
+ .pmd_entry = pageattr_pmd_entry,
+ .pte_entry = pageattr_pte_entry,
+};
+
+bool rodata_full __ro_after_init = true;
bool can_set_direct_map(void)
{
@@ -37,32 +97,39 @@ bool can_set_direct_map(void)
arm64_kfence_can_set_direct_map() || is_realm_world();
}
-static int change_page_range(pte_t *ptep, unsigned long addr, void *data)
+static int update_range_prot(unsigned long start, unsigned long size,
+ pgprot_t set_mask, pgprot_t clear_mask)
{
- struct page_change_data *cdata = data;
- pte_t pte = __ptep_get(ptep);
+ struct page_change_data data;
+ int ret;
- pte = clear_pte_bit(pte, cdata->clear_mask);
- pte = set_pte_bit(pte, cdata->set_mask);
+ data.set_mask = set_mask;
+ data.clear_mask = clear_mask;
- __set_pte(ptep, pte);
- return 0;
+ ret = split_kernel_leaf_mapping(start, start + size);
+ if (WARN_ON_ONCE(ret))
+ return ret;
+
+ arch_enter_lazy_mmu_mode();
+
+ /*
+ * The caller must ensure that the range we are operating on does not
+ * partially overlap a block mapping, or a cont mapping. Any such case
+ * must be eliminated by splitting the mapping.
+ */
+ ret = walk_kernel_page_table_range_lockless(start, start + size,
+ &pageattr_ops, NULL, &data);
+ arch_leave_lazy_mmu_mode();
+
+ return ret;
}
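
For orientation, a hedged kernel-style sketch of how a caller composes the set/clear masks for the walker-based update_range_prot() above; the helper name set_range_ro() is invented, the real callers are the existing __change_memory_common() users, and this fragment assumes the usual arm64 pgtable-prot headers.

	/* Illustrative only: make [addr, addr + size) read-only via the new helper. */
	static int set_range_ro(unsigned long addr, unsigned long size)
	{
		pgprot_t set_mask   = __pgprot(PTE_RDONLY);
		pgprot_t clear_mask = __pgprot(PTE_WRITE);

		return update_range_prot(addr, size, set_mask, clear_mask);
	}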
-/*
- * This function assumes that the range is mapped with PAGE_SIZE pages.
- */
static int __change_memory_common(unsigned long start, unsigned long size,
- pgprot_t set_mask, pgprot_t clear_mask)
+ pgprot_t set_mask, pgprot_t clear_mask)
{
- struct page_change_data data;
int ret;
- data.set_mask = set_mask;
- data.clear_mask = clear_mask;
-
- ret = apply_to_page_range(&init_mm, start, size, change_page_range,
- &data);
+ ret = update_range_prot(start, size, set_mask, clear_mask);
/*
* If the memory is being made valid without changing any other bits
@@ -174,32 +241,26 @@ int set_memory_valid(unsigned long addr, int numpages, int enable)
int set_direct_map_invalid_noflush(struct page *page)
{
- struct page_change_data data = {
- .set_mask = __pgprot(0),
- .clear_mask = __pgprot(PTE_VALID),
- };
+ pgprot_t clear_mask = __pgprot(PTE_VALID);
+ pgprot_t set_mask = __pgprot(0);
if (!can_set_direct_map())
return 0;
- return apply_to_page_range(&init_mm,
- (unsigned long)page_address(page),
- PAGE_SIZE, change_page_range, &data);
+ return update_range_prot((unsigned long)page_address(page),
+ PAGE_SIZE, set_mask, clear_mask);
}
int set_direct_map_default_noflush(struct page *page)
{
- struct page_change_data data = {
- .set_mask = __pgprot(PTE_VALID | PTE_WRITE),
- .clear_mask = __pgprot(PTE_RDONLY),
- };
+ pgprot_t set_mask = __pgprot(PTE_VALID | PTE_WRITE);
+ pgprot_t clear_mask = __pgprot(PTE_RDONLY);
if (!can_set_direct_map())
return 0;
- return apply_to_page_range(&init_mm,
- (unsigned long)page_address(page),
- PAGE_SIZE, change_page_range, &data);
+ return update_range_prot((unsigned long)page_address(page),
+ PAGE_SIZE, set_mask, clear_mask);
}
static int __set_memory_enc_dec(unsigned long addr,
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 8c75965afc9e..86818511962b 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -245,10 +245,6 @@ SYM_FUNC_ALIAS(__pi_idmap_cpu_replace_ttbr1, idmap_cpu_replace_ttbr1)
*
* Called exactly once from stop_machine context by each CPU found during boot.
*/
- .pushsection ".data", "aw", %progbits
-SYM_DATA(__idmap_kpti_flag, .long 1)
- .popsection
-
SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings)
cpu .req w0
temp_pte .req x0
@@ -273,7 +269,7 @@ SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings)
mov x5, x3 // preserve temp_pte arg
mrs swapper_ttb, ttbr1_el1
- adr_l flag_ptr, __idmap_kpti_flag
+ adr_l flag_ptr, idmap_kpti_bbml2_flag
cbnz cpu, __idmap_kpti_secondary
@@ -416,7 +412,25 @@ alternative_else_nop_endif
__idmap_kpti_secondary:
/* Uninstall swapper before surgery begins */
__idmap_cpu_set_reserved_ttbr1 x16, x17
+	b	secondary_cpu_wait
+
+ .unreq swapper_ttb
+ .unreq flag_ptr
+SYM_FUNC_END(idmap_kpti_install_ng_mappings)
+ .popsection
+#endif
+
+ .pushsection ".idmap.text", "a"
+SYM_TYPED_FUNC_START(wait_linear_map_split_to_ptes)
+ /* Must be same registers as in idmap_kpti_install_ng_mappings */
+ swapper_ttb .req x3
+ flag_ptr .req x4
+
+ mrs swapper_ttb, ttbr1_el1
+ adr_l flag_ptr, idmap_kpti_bbml2_flag
+ __idmap_cpu_set_reserved_ttbr1 x16, x17
+secondary_cpu_wait:
	/* Increment the flag to let the boot CPU know we're ready */
1: ldxr w16, [flag_ptr]
add w16, w16, #1
@@ -436,9 +450,8 @@ __idmap_kpti_secondary:
.unreq swapper_ttb
.unreq flag_ptr
-SYM_FUNC_END(idmap_kpti_install_ng_mappings)
+SYM_FUNC_END(wait_linear_map_split_to_ptes)
.popsection
-#endif
/*
* __cpu_setup
diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c
index 421a5de806c6..ab9899ca1e5f 100644
--- a/arch/arm64/mm/ptdump.c
+++ b/arch/arm64/mm/ptdump.c
@@ -283,6 +283,13 @@ void note_page_flush(struct ptdump_state *pt_st)
note_page(pt_st, 0, -1, pte_val(pte_zero));
}
+static void arm64_ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm)
+{
+ static_branch_inc(&arm64_ptdump_lock_key);
+ ptdump_walk_pgd(st, mm, NULL);
+ static_branch_dec(&arm64_ptdump_lock_key);
+}
+
void ptdump_walk(struct seq_file *s, struct ptdump_info *info)
{
unsigned long end = ~0UL;
@@ -311,7 +318,7 @@ void ptdump_walk(struct seq_file *s, struct ptdump_info *info)
}
};
- ptdump_walk_pgd(&st.ptdump, info->mm, NULL);
+ arm64_ptdump_walk_pgd(&st.ptdump, info->mm);
}
static void __init ptdump_initialize(void)
@@ -353,7 +360,7 @@ bool ptdump_check_wx(void)
}
};
- ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
+ arm64_ptdump_walk_pgd(&st.ptdump, &init_mm);
if (st.wx_pages || st.uxn_pages) {
pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n",
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 52ffe115a8c4..28996e0a9b00 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -185,7 +185,7 @@ static inline void emit_bti(u32 insn, struct jit_ctx *ctx)
static inline void emit_kcfi(u32 hash, struct jit_ctx *ctx)
{
- if (IS_ENABLED(CONFIG_CFI_CLANG))
+ if (IS_ENABLED(CONFIG_CFI))
emit_u32_data(hash, ctx);
}
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index ef0b7946f5a4..9ff5cdbd2759 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -53,6 +53,7 @@ HAS_S1PIE
HAS_S1POE
HAS_SCTLR2
HAS_RAS_EXTN
+HAS_RASV1P1_EXTN
HAS_RNG
HAS_SB
HAS_STAGE2_FWB
diff --git a/arch/arm64/tools/gen-sysreg.awk b/arch/arm64/tools/gen-sysreg.awk
index f2a1732cb1f6..bbbb812603e8 100755
--- a/arch/arm64/tools/gen-sysreg.awk
+++ b/arch/arm64/tools/gen-sysreg.awk
@@ -122,6 +122,10 @@ $1 == "SysregFields" && block_current() == "Root" {
res1 = "UL(0)"
unkn = "UL(0)"
+ if (reg in defined_fields)
+ fatal("Duplicate SysregFields definition for " reg)
+ defined_fields[reg] = 1
+
next_bit = 63
next
@@ -162,6 +166,10 @@ $1 == "Sysreg" && block_current() == "Root" {
res1 = "UL(0)"
unkn = "UL(0)"
+ if (reg in defined_regs)
+ fatal("Duplicate Sysreg definition for " reg)
+ defined_regs[reg] = 1
+
define("REG_" reg, "S" op0 "_" op1 "_C" crn "_C" crm "_" op2)
define("SYS_" reg, "sys_reg(" op0 ", " op1 ", " crn ", " crm ", " op2 ")")
@@ -284,6 +292,8 @@ $1 == "SignedEnum" && (block_current() == "Sysreg" || block_current() == "Sysreg
define_field(reg, field, msb, lsb)
define_field_sign(reg, field, "true")
+ delete seen_enum_vals
+
next
}
@@ -297,6 +307,8 @@ $1 == "UnsignedEnum" && (block_current() == "Sysreg" || block_current() == "Sysr
define_field(reg, field, msb, lsb)
define_field_sign(reg, field, "false")
+ delete seen_enum_vals
+
next
}
@@ -309,6 +321,8 @@ $1 == "Enum" && (block_current() == "Sysreg" || block_current() == "SysregFields
define_field(reg, field, msb, lsb)
+ delete seen_enum_vals
+
next
}
@@ -320,6 +334,8 @@ $1 == "EndEnum" && block_current() == "Enum" {
lsb = null
print ""
+ delete seen_enum_vals
+
block_pop()
next
}
@@ -329,6 +345,10 @@ $1 == "EndEnum" && block_current() == "Enum" {
val = $1
name = $2
+ if (val in seen_enum_vals)
+ fatal("Duplicate Enum value " val " for " name)
+ seen_enum_vals[val] = 1
+
define(reg "_" field "_" name, "UL(" val ")")
next
}
diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg
index 696ab1f32a67..1c6cdf9d54bb 100644
--- a/arch/arm64/tools/sysreg
+++ b/arch/arm64/tools/sysreg
@@ -31,7 +31,7 @@
# Mapping <name_EL1>
# EndSysreg
-# Where multiple system regsiters are not VHE aliases but share a
+# Where multiple system registers are not VHE aliases but share a
# common layout, a SysregFields block can be used to describe the
# shared layout:
@@ -54,7 +54,7 @@
#
# In general it is recommended that new enumeration items be named for the
# feature that introduces them (eg, FEAT_LS64_ACCDATA introduces enumeration
-# item ACCDATA) though it may be more taseful to do something else.
+# item ACCDATA) though it may be more tasteful to do something else.
Sysreg OSDTRRX_EL1 2 0 0 0 2
Res0 63:32
@@ -474,7 +474,7 @@ EndEnum
Enum 7:4 Security
0b0000 NI
0b0001 EL3
- 0b0001 NSACR_RFR
+ 0b0010 NSACR_RFR
EndEnum
UnsignedEnum 3:0 ProgMod
0b0000 NI
@@ -1693,7 +1693,7 @@ UnsignedEnum 43:40 TraceFilt
0b0000 NI
0b0001 IMP
EndEnum
-UnsignedEnum 39:36 DoubleLock
+SignedEnum 39:36 DoubleLock
0b0000 IMP
0b1111 NI
EndEnum
@@ -2409,7 +2409,7 @@ UnsignedEnum 11:8 ASID2
0b0000 NI
0b0001 IMP
EndEnum
-SignedEnum 7:4 EIESB
+UnsignedEnum 7:4 EIESB
0b0000 NI
0b0001 ToEL3
0b0010 ToELx
@@ -2528,10 +2528,6 @@ Field 17:16 ZEN
Res0 15:0
EndSysreg
-Sysreg CPACR_EL12 3 5 1 0 2
-Mapping CPACR_EL1
-EndSysreg
-
Sysreg CPACRALIAS_EL1 3 0 1 4 4
Mapping CPACR_EL1
EndSysreg
@@ -2576,10 +2572,6 @@ Sysreg PFAR_EL12 3 5 6 0 5
Mapping PFAR_EL1
EndSysreg
-Sysreg RCWSMASK_EL1 3 0 13 0 3
-Field 63:0 RCWSMASK
-EndSysreg
-
Sysreg SCTLR2_EL1 3 0 1 0 3
Res0 63:13
Field 12 CPTM0
@@ -2994,11 +2986,20 @@ Field 0 RND
EndSysreg
Sysreg PMSFCR_EL1 3 0 9 9 4
-Res0 63:19
+Res0 63:53
+Field 52 SIMDm
+Field 51 FPm
+Field 50 STm
+Field 49 LDm
+Field 48 Bm
+Res0 47:21
+Field 20 SIMD
+Field 19 FP
Field 18 ST
Field 17 LD
Field 16 B
-Res0 15:4
+Res0 15:5
+Field 4 FDS
Field 3 FnE
Field 2 FL
Field 1 FT
@@ -4756,17 +4757,53 @@ Field 37 TBI0
Field 36 AS
Res0 35
Field 34:32 IPS
-Field 31:30 TG1
-Field 29:28 SH1
-Field 27:26 ORGN1
-Field 25:24 IRGN1
+Enum 31:30 TG1
+ 0b01 16K
+ 0b10 4K
+ 0b11 64K
+EndEnum
+Enum 29:28 SH1
+ 0b00 NONE
+ 0b10 OUTER
+ 0b11 INNER
+EndEnum
+Enum 27:26 ORGN1
+ 0b00 NC
+ 0b01 WBWA
+ 0b10 WT
+ 0b11 WBnWA
+EndEnum
+Enum 25:24 IRGN1
+ 0b00 NC
+ 0b01 WBWA
+ 0b10 WT
+ 0b11 WBnWA
+EndEnum
Field 23 EPD1
Field 22 A1
Field 21:16 T1SZ
-Field 15:14 TG0
-Field 13:12 SH0
-Field 11:10 ORGN0
-Field 9:8 IRGN0
+Enum 15:14 TG0
+ 0b00 4K
+ 0b01 64K
+ 0b10 16K
+EndEnum
+Enum 13:12 SH0
+ 0b00 NONE
+ 0b10 OUTER
+ 0b11 INNER
+EndEnum
+Enum 11:10 ORGN0
+ 0b00 NC
+ 0b01 WBWA
+ 0b10 WT
+ 0b11 WBnWA
+EndEnum
+Enum 9:8 IRGN0
+ 0b00 NC
+ 0b01 WBWA
+ 0b10 WT
+ 0b11 WBnWA
+EndEnum
Field 7 EPD0
Res0 6
Field 5:0 T0SZ
diff --git a/arch/csky/include/asm/bitops.h b/arch/csky/include/asm/bitops.h
index 72e1b2aa29a0..80d67eee6e86 100644
--- a/arch/csky/include/asm/bitops.h
+++ b/arch/csky/include/asm/bitops.h
@@ -9,7 +9,7 @@
/*
* asm-generic/bitops/ffs.h
*/
-static inline int ffs(int x)
+static inline __attribute_const__ int ffs(int x)
{
if (!x)
return 0;
@@ -26,7 +26,7 @@ static inline int ffs(int x)
/*
* asm-generic/bitops/__ffs.h
*/
-static __always_inline unsigned long __ffs(unsigned long x)
+static __always_inline __attribute_const__ unsigned long __ffs(unsigned long x)
{
asm volatile (
"brev %0\n"
@@ -39,7 +39,7 @@ static __always_inline unsigned long __ffs(unsigned long x)
/*
* asm-generic/bitops/fls.h
*/
-static __always_inline int fls(unsigned int x)
+static __always_inline __attribute_const__ int fls(unsigned int x)
{
asm volatile(
"ff1 %0\n"
@@ -52,7 +52,7 @@ static __always_inline int fls(unsigned int x)
/*
* asm-generic/bitops/__fls.h
*/
-static __always_inline unsigned long __fls(unsigned long x)
+static __always_inline __attribute_const__ unsigned long __fls(unsigned long x)
{
return fls(x) - 1;
}
diff --git a/arch/csky/kernel/asm-offsets.c b/arch/csky/kernel/asm-offsets.c
index d1e903579473..5525c8e7e1d9 100644
--- a/arch/csky/kernel/asm-offsets.c
+++ b/arch/csky/kernel/asm-offsets.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
+#define COMPILE_OFFSETS
#include <linux/sched.h>
#include <linux/kernel_stat.h>
diff --git a/arch/csky/kernel/process.c b/arch/csky/kernel/process.c
index 0c6e4b17fe00..a7a90340042a 100644
--- a/arch/csky/kernel/process.c
+++ b/arch/csky/kernel/process.c
@@ -32,7 +32,7 @@ void flush_thread(void){}
int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
- unsigned long clone_flags = args->flags;
+ u64 clone_flags = args->flags;
unsigned long usp = args->stack;
unsigned long tls = args->tls;
struct switch_stack *childstack;
diff --git a/arch/hexagon/include/asm/bitops.h b/arch/hexagon/include/asm/bitops.h
index 160d8f37fa1a..b23cb13833af 100644
--- a/arch/hexagon/include/asm/bitops.h
+++ b/arch/hexagon/include/asm/bitops.h
@@ -200,7 +200,7 @@ arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
*
* Undefined if no zero exists, so code should check against ~0UL first.
*/
-static inline long ffz(int x)
+static inline long __attribute_const__ ffz(int x)
{
int r;
@@ -217,7 +217,7 @@ static inline long ffz(int x)
* This is defined the same way as ffs.
* Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32.
*/
-static inline int fls(unsigned int x)
+static inline __attribute_const__ int fls(unsigned int x)
{
int r;
@@ -238,7 +238,7 @@ static inline int fls(unsigned int x)
* the libc and compiler builtin ffs routines, therefore
* differs in spirit from the above ffz (man ffs).
*/
-static inline int ffs(int x)
+static inline __attribute_const__ int ffs(int x)
{
int r;
@@ -260,7 +260,7 @@ static inline int ffs(int x)
* bits_per_long assumed to be 32
* numbering starts at 0 I think (instead of 1 like ffs)
*/
-static inline unsigned long __ffs(unsigned long word)
+static inline __attribute_const__ unsigned long __ffs(unsigned long word)
{
int num;
@@ -278,7 +278,7 @@ static inline unsigned long __ffs(unsigned long word)
* Undefined if no set bit exists, so code should check against 0 first.
* bits_per_long assumed to be 32
*/
-static inline unsigned long __fls(unsigned long word)
+static inline __attribute_const__ unsigned long __fls(unsigned long word)
{
int num;
diff --git a/arch/hexagon/kernel/asm-offsets.c b/arch/hexagon/kernel/asm-offsets.c
index 03a7063f9456..50eea9fa6f13 100644
--- a/arch/hexagon/kernel/asm-offsets.c
+++ b/arch/hexagon/kernel/asm-offsets.c
@@ -8,6 +8,7 @@
*
* Copyright (c) 2010-2012, The Linux Foundation. All rights reserved.
*/
+#define COMPILE_OFFSETS
#include <linux/compat.h>
#include <linux/types.h>
diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c
index 2a77bfd75694..15b4992bfa29 100644
--- a/arch/hexagon/kernel/process.c
+++ b/arch/hexagon/kernel/process.c
@@ -52,7 +52,7 @@ void arch_cpu_idle(void)
*/
int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
- unsigned long clone_flags = args->flags;
+ u64 clone_flags = args->flags;
unsigned long usp = args->stack;
unsigned long tls = args->tls;
struct thread_info *ti = task_thread_info(p);
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index f0abc38c40ac..8a39cd53297a 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -70,6 +70,8 @@ config LOONGARCH
select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS
select ARCH_SUPPORTS_NUMA_BALANCING
select ARCH_SUPPORTS_RT
+ select ARCH_SUPPORTS_SCHED_SMT if SMP
+ select ARCH_SUPPORTS_SCHED_MC if SMP
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_CMPXCHG_LOCKREF
select ARCH_USE_MEMTEST
@@ -140,6 +142,7 @@ config LOONGARCH
select HAVE_EBPF_JIT
select HAVE_EFFICIENT_UNALIGNED_ACCESS if !ARCH_STRICT_ALIGN
select HAVE_EXIT_THREAD
+ select HAVE_GENERIC_TIF_BITS
select HAVE_GUP_FAST
select HAVE_FTRACE_GRAPH_FUNC
select HAVE_FUNCTION_ARG_ACCESS_API
@@ -298,6 +301,10 @@ config AS_HAS_LVZ_EXTENSION
config CC_HAS_ANNOTATE_TABLEJUMP
def_bool $(cc-option,-mannotate-tablejump)
+config RUSTC_HAS_ANNOTATE_TABLEJUMP
+ depends on RUST
+ def_bool $(rustc-option,-Cllvm-args=--loongarch-annotate-tablejump)
+
menu "Kernel type and options"
source "kernel/Kconfig.hz"
@@ -448,23 +455,6 @@ config EFI_STUB
This kernel feature allows the kernel to be loaded directly by
EFI firmware without the use of a bootloader.
-config SCHED_SMT
- bool "SMT scheduler support"
- depends on SMP
- default y
- help
- Improves scheduler's performance when there are multiple
- threads in one physical core.
-
-config SCHED_MC
- bool "Multi-core scheduler support"
- depends on SMP
- default y
- help
- Multi-core scheduler support improves the CPU scheduler's decision
- making when dealing with multi-core CPU chips at a cost of slightly
- increased overhead in some places.
-
config SMP
bool "Multi-Processing support"
help
@@ -563,10 +553,14 @@ config ARCH_STRICT_ALIGN
-mstrict-align build parameter to prevent unaligned accesses.
CPUs with h/w unaligned access support:
- Loongson-2K2000/2K3000/3A5000/3C5000/3D5000.
+	  Loongson-2K2000/2K3000 and all Loongson-3 series processors
+	  based on LoongArch.
CPUs without h/w unaligned access support:
- Loongson-2K500/2K1000.
+ Loongson-2K0300/2K0500/2K1000.
+
+	  To check whether your hardware supports unaligned memory access,
+	  read bit 20 (UAL) of the CPUCFG1 register.
This option is enabled by default to make the kernel be able to run
on all LoongArch systems. But you can disable it manually if you want
diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile
index b0703a4e02a2..ae419e32f22e 100644
--- a/arch/loongarch/Makefile
+++ b/arch/loongarch/Makefile
@@ -106,6 +106,17 @@ KBUILD_CFLAGS += -mannotate-tablejump
else
KBUILD_CFLAGS += -fno-jump-tables # keep compatibility with older compilers
endif
+ifdef CONFIG_RUSTC_HAS_ANNOTATE_TABLEJUMP
+KBUILD_RUSTFLAGS += -Cllvm-args=--loongarch-annotate-tablejump
+else
+KBUILD_RUSTFLAGS += -Zno-jump-tables # keep compatibility with older compilers
+endif
+ifdef CONFIG_LTO_CLANG
+# The annotate-tablejump option cannot be passed to the LLVM backend when LTO is enabled.
+# To make the LTO link step aware of it, '--loongarch-annotate-tablejump' also needs to
+# be passed to ld.lld via '-mllvm'.
+KBUILD_LDFLAGS += -mllvm --loongarch-annotate-tablejump
+endif
endif
KBUILD_RUSTFLAGS += --target=loongarch64-unknown-none-softfloat -Ccode-model=small
diff --git a/arch/loongarch/include/asm/acenv.h b/arch/loongarch/include/asm/acenv.h
index 52f298f7293b..483c955f2ae5 100644
--- a/arch/loongarch/include/asm/acenv.h
+++ b/arch/loongarch/include/asm/acenv.h
@@ -10,9 +10,8 @@
#ifndef _ASM_LOONGARCH_ACENV_H
#define _ASM_LOONGARCH_ACENV_H
-/*
- * This header is required by ACPI core, but we have nothing to fill in
- * right now. Will be updated later when needed.
- */
+#ifdef CONFIG_ARCH_STRICT_ALIGN
+#define ACPI_MISALIGNMENT_NOT_SUPPORTED
+#endif /* CONFIG_ARCH_STRICT_ALIGN */
#endif /* _ASM_LOONGARCH_ACENV_H */
diff --git a/arch/loongarch/include/asm/kvm_mmu.h b/arch/loongarch/include/asm/kvm_mmu.h
index 099bafc6f797..e36cc7e8ed20 100644
--- a/arch/loongarch/include/asm/kvm_mmu.h
+++ b/arch/loongarch/include/asm/kvm_mmu.h
@@ -16,6 +16,13 @@
*/
#define KVM_MMU_CACHE_MIN_PAGES (CONFIG_PGTABLE_LEVELS - 1)
+/*
+ * _PAGE_MODIFIED is a SW pte bit: on the host kernel it records that the
+ * page has ever been written, while on the secondary MMU it records the
+ * page writeable attribute, to allow fast-path handling.
+ */
+#define KVM_PAGE_WRITEABLE _PAGE_MODIFIED
+
#define _KVM_FLUSH_PGTABLE 0x1
#define _KVM_HAS_PGMASK 0x2
#define kvm_pfn_pte(pfn, prot) (((pfn) << PFN_PTE_SHIFT) | pgprot_val(prot))
@@ -52,10 +59,10 @@ static inline void kvm_set_pte(kvm_pte_t *ptep, kvm_pte_t val)
WRITE_ONCE(*ptep, val);
}
-static inline int kvm_pte_write(kvm_pte_t pte) { return pte & _PAGE_WRITE; }
-static inline int kvm_pte_dirty(kvm_pte_t pte) { return pte & _PAGE_DIRTY; }
static inline int kvm_pte_young(kvm_pte_t pte) { return pte & _PAGE_ACCESSED; }
static inline int kvm_pte_huge(kvm_pte_t pte) { return pte & _PAGE_HUGE; }
+static inline int kvm_pte_dirty(kvm_pte_t pte) { return pte & __WRITEABLE; }
+static inline int kvm_pte_writeable(kvm_pte_t pte) { return pte & KVM_PAGE_WRITEABLE; }
static inline kvm_pte_t kvm_pte_mkyoung(kvm_pte_t pte)
{
@@ -69,12 +76,12 @@ static inline kvm_pte_t kvm_pte_mkold(kvm_pte_t pte)
static inline kvm_pte_t kvm_pte_mkdirty(kvm_pte_t pte)
{
- return pte | _PAGE_DIRTY;
+ return pte | __WRITEABLE;
}
static inline kvm_pte_t kvm_pte_mkclean(kvm_pte_t pte)
{
- return pte & ~_PAGE_DIRTY;
+ return pte & ~__WRITEABLE;
}
static inline kvm_pte_t kvm_pte_mkhuge(kvm_pte_t pte)
@@ -87,6 +94,11 @@ static inline kvm_pte_t kvm_pte_mksmall(kvm_pte_t pte)
return pte & ~_PAGE_HUGE;
}
+static inline kvm_pte_t kvm_pte_mkwriteable(kvm_pte_t pte)
+{
+ return pte | KVM_PAGE_WRITEABLE;
+}
+
static inline int kvm_need_flush(kvm_ptw_ctx *ctx)
{
return ctx->flag & _KVM_FLUSH_PGTABLE;
diff --git a/arch/loongarch/include/asm/stackframe.h b/arch/loongarch/include/asm/stackframe.h
index 3eda298702b1..5cb568a60cf8 100644
--- a/arch/loongarch/include/asm/stackframe.h
+++ b/arch/loongarch/include/asm/stackframe.h
@@ -58,7 +58,7 @@
.endm
.macro STACKLEAK_ERASE
-#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+#ifdef CONFIG_KSTACK_ERASE
bl stackleak_erase_on_task_stack
#endif
.endm
diff --git a/arch/loongarch/include/asm/thread_info.h b/arch/loongarch/include/asm/thread_info.h
index 9dfa2ef00816..4d7117fcdc78 100644
--- a/arch/loongarch/include/asm/thread_info.h
+++ b/arch/loongarch/include/asm/thread_info.h
@@ -65,50 +65,42 @@ register unsigned long current_stack_pointer __asm__("$sp");
* access
* - pending work-to-be-done flags are in LSW
* - other flags in MSW
+ *
+ * Tell the generic TIF infrastructure which special TIF bits LoongArch supports.
*/
-#define TIF_NEED_RESCHED 0 /* rescheduling necessary */
-#define TIF_NEED_RESCHED_LAZY 1 /* lazy rescheduling necessary */
-#define TIF_SIGPENDING 2 /* signal pending */
-#define TIF_NOTIFY_RESUME 3 /* callback before returning to user */
-#define TIF_NOTIFY_SIGNAL 4 /* signal notifications exist */
-#define TIF_RESTORE_SIGMASK 5 /* restore signal mask in do_signal() */
-#define TIF_NOHZ 6 /* in adaptive nohz mode */
-#define TIF_UPROBE 7 /* breakpointed or singlestepping */
-#define TIF_USEDFPU 8 /* FPU was used by this task this quantum (SMP) */
-#define TIF_USEDSIMD 9 /* SIMD has been used this quantum */
-#define TIF_MEMDIE 10 /* is terminating due to OOM killer */
-#define TIF_FIXADE 11 /* Fix address errors in software */
-#define TIF_LOGADE 12 /* Log address errors to syslog */
-#define TIF_32BIT_REGS 13 /* 32-bit general purpose registers */
-#define TIF_32BIT_ADDR 14 /* 32-bit address space */
-#define TIF_LOAD_WATCH 15 /* If set, load watch registers */
-#define TIF_SINGLESTEP 16 /* Single Step */
-#define TIF_LSX_CTX_LIVE 17 /* LSX context must be preserved */
-#define TIF_LASX_CTX_LIVE 18 /* LASX context must be preserved */
-#define TIF_USEDLBT 19 /* LBT was used by this task this quantum (SMP) */
-#define TIF_LBT_CTX_LIVE 20 /* LBT context must be preserved */
-#define TIF_PATCH_PENDING 21 /* pending live patching update */
+#define HAVE_TIF_NEED_RESCHED_LAZY
+#define HAVE_TIF_RESTORE_SIGMASK
+
+#include <asm-generic/thread_info_tif.h>
+
+/* Architecture specific bits */
+#define TIF_NOHZ 16 /* in adaptive nohz mode */
+#define TIF_USEDFPU 17 /* FPU was used by this task this quantum (SMP) */
+#define TIF_USEDSIMD 18 /* SIMD has been used this quantum */
+#define TIF_FIXADE 19 /* Fix address errors in software */
+#define TIF_LOGADE 20 /* Log address errors to syslog */
+#define TIF_32BIT_REGS 21 /* 32-bit general purpose registers */
+#define TIF_32BIT_ADDR 22 /* 32-bit address space */
+#define TIF_LOAD_WATCH 23 /* If set, load watch registers */
+#define TIF_SINGLESTEP 24 /* Single Step */
+#define TIF_LSX_CTX_LIVE 25 /* LSX context must be preserved */
+#define TIF_LASX_CTX_LIVE 26 /* LASX context must be preserved */
+#define TIF_USEDLBT 27 /* LBT was used by this task this quantum (SMP) */
+#define TIF_LBT_CTX_LIVE 28 /* LBT context must be preserved */
-#define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED)
-#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY)
-#define _TIF_SIGPENDING (1<<TIF_SIGPENDING)
-#define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME)
-#define _TIF_NOTIFY_SIGNAL (1<<TIF_NOTIFY_SIGNAL)
-#define _TIF_NOHZ (1<<TIF_NOHZ)
-#define _TIF_UPROBE (1<<TIF_UPROBE)
-#define _TIF_USEDFPU (1<<TIF_USEDFPU)
-#define _TIF_USEDSIMD (1<<TIF_USEDSIMD)
-#define _TIF_FIXADE (1<<TIF_FIXADE)
-#define _TIF_LOGADE (1<<TIF_LOGADE)
-#define _TIF_32BIT_REGS (1<<TIF_32BIT_REGS)
-#define _TIF_32BIT_ADDR (1<<TIF_32BIT_ADDR)
-#define _TIF_LOAD_WATCH (1<<TIF_LOAD_WATCH)
-#define _TIF_SINGLESTEP (1<<TIF_SINGLESTEP)
-#define _TIF_LSX_CTX_LIVE (1<<TIF_LSX_CTX_LIVE)
-#define _TIF_LASX_CTX_LIVE (1<<TIF_LASX_CTX_LIVE)
-#define _TIF_USEDLBT (1<<TIF_USEDLBT)
-#define _TIF_LBT_CTX_LIVE (1<<TIF_LBT_CTX_LIVE)
-#define _TIF_PATCH_PENDING (1<<TIF_PATCH_PENDING)
+#define _TIF_NOHZ BIT(TIF_NOHZ)
+#define _TIF_USEDFPU BIT(TIF_USEDFPU)
+#define _TIF_USEDSIMD BIT(TIF_USEDSIMD)
+#define _TIF_FIXADE BIT(TIF_FIXADE)
+#define _TIF_LOGADE BIT(TIF_LOGADE)
+#define _TIF_32BIT_REGS BIT(TIF_32BIT_REGS)
+#define _TIF_32BIT_ADDR BIT(TIF_32BIT_ADDR)
+#define _TIF_LOAD_WATCH BIT(TIF_LOAD_WATCH)
+#define _TIF_SINGLESTEP BIT(TIF_SINGLESTEP)
+#define _TIF_LSX_CTX_LIVE BIT(TIF_LSX_CTX_LIVE)
+#define _TIF_LASX_CTX_LIVE BIT(TIF_LASX_CTX_LIVE)
+#define _TIF_USEDLBT BIT(TIF_USEDLBT)
+#define _TIF_LBT_CTX_LIVE BIT(TIF_LBT_CTX_LIVE)
#endif /* __KERNEL__ */
#endif /* _ASM_THREAD_INFO_H */
diff --git a/arch/loongarch/include/uapi/asm/setup.h b/arch/loongarch/include/uapi/asm/setup.h
new file mode 100644
index 000000000000..d46363ce3e02
--- /dev/null
+++ b/arch/loongarch/include/uapi/asm/setup.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+
+#ifndef _UAPI_ASM_LOONGARCH_SETUP_H
+#define _UAPI_ASM_LOONGARCH_SETUP_H
+
+#define COMMAND_LINE_SIZE 4096
+
+#endif /* _UAPI_ASM_LOONGARCH_SETUP_H */
diff --git a/arch/loongarch/kernel/asm-offsets.c b/arch/loongarch/kernel/asm-offsets.c
index db1e4bb26b6a..3017c7157600 100644
--- a/arch/loongarch/kernel/asm-offsets.c
+++ b/arch/loongarch/kernel/asm-offsets.c
@@ -4,6 +4,8 @@
*
* Copyright (C) 2020-2022 Loongson Technology Corporation Limited
*/
+#define COMPILE_OFFSETS
+
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/mm.h>
diff --git a/arch/loongarch/kernel/env.c b/arch/loongarch/kernel/env.c
index c0a5dc9aeae2..23bd5ae2212c 100644
--- a/arch/loongarch/kernel/env.c
+++ b/arch/loongarch/kernel/env.c
@@ -86,7 +86,7 @@ late_initcall(fdt_cpu_clk_init);
static ssize_t boardinfo_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf,
+ return sysfs_emit(buf,
"BIOS Information\n"
"Vendor\t\t\t: %s\n"
"Version\t\t\t: %s\n"
@@ -109,6 +109,8 @@ static int __init boardinfo_init(void)
struct kobject *loongson_kobj;
loongson_kobj = kobject_create_and_add("loongson", firmware_kobj);
+ if (!loongson_kobj)
+ return -ENOMEM;
return sysfs_create_file(loongson_kobj, &boardinfo_attr.attr);
}
diff --git a/arch/loongarch/kernel/module-sections.c b/arch/loongarch/kernel/module-sections.c
index e2f30ff9afde..a43ba7f9f987 100644
--- a/arch/loongarch/kernel/module-sections.c
+++ b/arch/loongarch/kernel/module-sections.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/moduleloader.h>
#include <linux/ftrace.h>
+#include <linux/sort.h>
Elf_Addr module_emit_got_entry(struct module *mod, Elf_Shdr *sechdrs, Elf_Addr val)
{
@@ -61,39 +62,38 @@ Elf_Addr module_emit_plt_entry(struct module *mod, Elf_Shdr *sechdrs, Elf_Addr v
return (Elf_Addr)&plt[nr];
}
-static int is_rela_equal(const Elf_Rela *x, const Elf_Rela *y)
-{
- return x->r_info == y->r_info && x->r_addend == y->r_addend;
-}
+#define cmp_3way(a, b) ((a) < (b) ? -1 : (a) > (b))
-static bool duplicate_rela(const Elf_Rela *rela, int idx)
+static int compare_rela(const void *x, const void *y)
{
- int i;
+ int ret;
+ const Elf_Rela *rela_x = x, *rela_y = y;
- for (i = 0; i < idx; i++) {
- if (is_rela_equal(&rela[i], &rela[idx]))
- return true;
- }
+ ret = cmp_3way(rela_x->r_info, rela_y->r_info);
+ if (ret == 0)
+ ret = cmp_3way(rela_x->r_addend, rela_y->r_addend);
- return false;
+ return ret;
}
static void count_max_entries(Elf_Rela *relas, int num,
unsigned int *plts, unsigned int *gots)
{
- unsigned int i, type;
+ unsigned int i;
+
+ sort(relas, num, sizeof(Elf_Rela), compare_rela, NULL);
for (i = 0; i < num; i++) {
- type = ELF_R_TYPE(relas[i].r_info);
- switch (type) {
+ if (i && !compare_rela(&relas[i-1], &relas[i]))
+ continue;
+
+ switch (ELF_R_TYPE(relas[i].r_info)) {
case R_LARCH_SOP_PUSH_PLT_PCREL:
case R_LARCH_B26:
- if (!duplicate_rela(relas, i))
- (*plts)++;
+ (*plts)++;
break;
case R_LARCH_GOT_PC_HI20:
- if (!duplicate_rela(relas, i))
- (*gots)++;
+ (*gots)++;
break;
default:
break; /* Do nothing. */
diff --git a/arch/loongarch/kernel/process.c b/arch/loongarch/kernel/process.c
index 3582f591bab2..efd9edf65603 100644
--- a/arch/loongarch/kernel/process.c
+++ b/arch/loongarch/kernel/process.c
@@ -167,7 +167,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
unsigned long childksp;
unsigned long tls = args->tls;
unsigned long usp = args->stack;
- unsigned long clone_flags = args->flags;
+ u64 clone_flags = args->flags;
struct pt_regs *childregs, *regs = current_pt_regs();
childksp = (unsigned long)task_stack_page(p) + THREAD_SIZE;
diff --git a/arch/loongarch/kernel/signal.c b/arch/loongarch/kernel/signal.c
index 4740cb5b2388..c9f7ca778364 100644
--- a/arch/loongarch/kernel/signal.c
+++ b/arch/loongarch/kernel/signal.c
@@ -677,6 +677,11 @@ static int setup_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
for (i = 1; i < 32; i++)
err |= __put_user(regs->regs[i], &sc->sc_regs[i]);
+#ifdef CONFIG_CPU_HAS_LBT
+ if (extctx->lbt.addr)
+ err |= protected_save_lbt_context(extctx);
+#endif
+
if (extctx->lasx.addr)
err |= protected_save_lasx_context(extctx);
else if (extctx->lsx.addr)
@@ -684,11 +689,6 @@ static int setup_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
else if (extctx->fpu.addr)
err |= protected_save_fpu_context(extctx);
-#ifdef CONFIG_CPU_HAS_LBT
- if (extctx->lbt.addr)
- err |= protected_save_lbt_context(extctx);
-#endif
-
/* Set the "end" magic */
info = (struct sctx_info *)extctx->end.addr;
err |= __put_user(0, &info->magic);
diff --git a/arch/loongarch/kernel/stacktrace.c b/arch/loongarch/kernel/stacktrace.c
index 9a038d1070d7..387dc4d3c486 100644
--- a/arch/loongarch/kernel/stacktrace.c
+++ b/arch/loongarch/kernel/stacktrace.c
@@ -51,12 +51,13 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry,
if (task == current) {
regs->regs[3] = (unsigned long)__builtin_frame_address(0);
regs->csr_era = (unsigned long)__builtin_return_address(0);
+ regs->regs[22] = 0;
} else {
regs->regs[3] = thread_saved_fp(task);
regs->csr_era = thread_saved_ra(task);
+ regs->regs[22] = task->thread.reg22;
}
regs->regs[1] = 0;
- regs->regs[22] = 0;
for (unwind_start(&state, task, regs);
!unwind_done(&state) && !unwind_error(&state); unwind_next_frame(&state)) {
diff --git a/arch/loongarch/kernel/time.c b/arch/loongarch/kernel/time.c
index 367906b10f81..f3092f2de8b5 100644
--- a/arch/loongarch/kernel/time.c
+++ b/arch/loongarch/kernel/time.c
@@ -5,6 +5,7 @@
* Copyright (C) 2020-2022 Loongson Technology Corporation Limited
*/
#include <linux/clockchips.h>
+#include <linux/cpuhotplug.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/init.h>
@@ -102,6 +103,23 @@ static int constant_timer_next_event(unsigned long delta, struct clock_event_dev
return 0;
}
+static int arch_timer_starting(unsigned int cpu)
+{
+ set_csr_ecfg(ECFGF_TIMER);
+
+ return 0;
+}
+
+static int arch_timer_dying(unsigned int cpu)
+{
+ constant_set_state_shutdown(this_cpu_ptr(&constant_clockevent_device));
+
+ /* Clear Timer Interrupt */
+ write_csr_tintclear(CSR_TINTCLR_TI);
+
+ return 0;
+}
+
static unsigned long get_loops_per_jiffy(void)
{
unsigned long lpj = (unsigned long)const_clock_freq;
@@ -172,6 +190,10 @@ int constant_clockevent_init(void)
lpj_fine = get_loops_per_jiffy();
pr_info("Constant clock event device register\n");
+ cpuhp_setup_state(CPUHP_AP_LOONGARCH_ARCH_TIMER_STARTING,
+ "clockevents/loongarch/timer:starting",
+ arch_timer_starting, arch_timer_dying);
+
return 0;
}
diff --git a/arch/loongarch/kernel/vdso.c b/arch/loongarch/kernel/vdso.c
index 7b888d9085a0..dee1a15d7f4c 100644
--- a/arch/loongarch/kernel/vdso.c
+++ b/arch/loongarch/kernel/vdso.c
@@ -54,6 +54,9 @@ static int __init init_vdso(void)
vdso_info.code_mapping.pages =
kcalloc(vdso_info.size / PAGE_SIZE, sizeof(struct page *), GFP_KERNEL);
+ if (!vdso_info.code_mapping.pages)
+ return -ENOMEM;
+
pfn = __phys_to_pfn(__pa_symbol(vdso_info.vdso));
for (i = 0; i < vdso_info.size / PAGE_SIZE; i++)
vdso_info.code_mapping.pages[i] = pfn_to_page(pfn + i);
diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c
index 2ce41f93b2a4..6c9c7de7226b 100644
--- a/arch/loongarch/kvm/exit.c
+++ b/arch/loongarch/kvm/exit.c
@@ -778,10 +778,8 @@ static long kvm_save_notify(struct kvm_vcpu *vcpu)
return 0;
default:
return KVM_HCALL_INVALID_CODE;
- };
-
- return KVM_HCALL_INVALID_CODE;
-};
+ }
+}
/*
* kvm_handle_lsx_disabled() - Guest used LSX while disabled in root.
diff --git a/arch/loongarch/kvm/intc/eiointc.c b/arch/loongarch/kvm/intc/eiointc.c
index a3a12af9ecbf..c32333695381 100644
--- a/arch/loongarch/kvm/intc/eiointc.c
+++ b/arch/loongarch/kvm/intc/eiointc.c
@@ -45,7 +45,12 @@ static void eiointc_update_irq(struct loongarch_eiointc *s, int irq, int level)
}
cpu = s->sw_coremap[irq];
- vcpu = kvm_get_vcpu(s->kvm, cpu);
+ vcpu = kvm_get_vcpu_by_id(s->kvm, cpu);
+ if (unlikely(vcpu == NULL)) {
+ kvm_err("%s: invalid target cpu: %d\n", __func__, cpu);
+ return;
+ }
+
if (level) {
/* if not enable return false */
if (!test_bit(irq, (unsigned long *)s->enable.reg_u32))
@@ -421,21 +426,26 @@ static int kvm_eiointc_ctrl_access(struct kvm_device *dev,
struct loongarch_eiointc *s = dev->kvm->arch.eiointc;
data = (void __user *)attr->addr;
- spin_lock_irqsave(&s->lock, flags);
switch (type) {
case KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_NUM_CPU:
+ case KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_FEATURE:
if (copy_from_user(&val, data, 4))
- ret = -EFAULT;
- else {
- if (val >= EIOINTC_ROUTE_MAX_VCPUS)
- ret = -EINVAL;
- else
- s->num_cpu = val;
- }
+ return -EFAULT;
+ break;
+ default:
+ break;
+ }
+
+ spin_lock_irqsave(&s->lock, flags);
+ switch (type) {
+ case KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_NUM_CPU:
+ if (val >= EIOINTC_ROUTE_MAX_VCPUS)
+ ret = -EINVAL;
+ else
+ s->num_cpu = val;
break;
case KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_FEATURE:
- if (copy_from_user(&s->features, data, 4))
- ret = -EFAULT;
+ s->features = val;
if (!(s->features & BIT(EIOINTC_HAS_VIRT_EXTENSION)))
s->status |= BIT(EIOINTC_ENABLE);
break;
@@ -457,19 +467,17 @@ static int kvm_eiointc_ctrl_access(struct kvm_device *dev,
static int kvm_eiointc_regs_access(struct kvm_device *dev,
struct kvm_device_attr *attr,
- bool is_write)
+ bool is_write, int *data)
{
int addr, cpu, offset, ret = 0;
unsigned long flags;
void *p = NULL;
- void __user *data;
struct loongarch_eiointc *s;
s = dev->kvm->arch.eiointc;
addr = attr->attr;
cpu = addr >> 16;
addr &= 0xffff;
- data = (void __user *)attr->addr;
switch (addr) {
case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END:
offset = (addr - EIOINTC_NODETYPE_START) / 4;
@@ -508,13 +516,10 @@ static int kvm_eiointc_regs_access(struct kvm_device *dev,
}
spin_lock_irqsave(&s->lock, flags);
- if (is_write) {
- if (copy_from_user(p, data, 4))
- ret = -EFAULT;
- } else {
- if (copy_to_user(data, p, 4))
- ret = -EFAULT;
- }
+ if (is_write)
+ memcpy(p, data, 4);
+ else
+ memcpy(data, p, 4);
spin_unlock_irqrestore(&s->lock, flags);
return ret;
@@ -522,19 +527,17 @@ static int kvm_eiointc_regs_access(struct kvm_device *dev,
static int kvm_eiointc_sw_status_access(struct kvm_device *dev,
struct kvm_device_attr *attr,
- bool is_write)
+ bool is_write, int *data)
{
int addr, ret = 0;
unsigned long flags;
void *p = NULL;
- void __user *data;
struct loongarch_eiointc *s;
s = dev->kvm->arch.eiointc;
addr = attr->attr;
addr &= 0xffff;
- data = (void __user *)attr->addr;
switch (addr) {
case KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_NUM_CPU:
if (is_write)
@@ -556,13 +559,10 @@ static int kvm_eiointc_sw_status_access(struct kvm_device *dev,
return -EINVAL;
}
spin_lock_irqsave(&s->lock, flags);
- if (is_write) {
- if (copy_from_user(p, data, 4))
- ret = -EFAULT;
- } else {
- if (copy_to_user(data, p, 4))
- ret = -EFAULT;
- }
+ if (is_write)
+ memcpy(p, data, 4);
+ else
+ memcpy(data, p, 4);
spin_unlock_irqrestore(&s->lock, flags);
return ret;
@@ -571,11 +571,27 @@ static int kvm_eiointc_sw_status_access(struct kvm_device *dev,
static int kvm_eiointc_get_attr(struct kvm_device *dev,
struct kvm_device_attr *attr)
{
+ int ret, data;
+
switch (attr->group) {
case KVM_DEV_LOONGARCH_EXTIOI_GRP_REGS:
- return kvm_eiointc_regs_access(dev, attr, false);
+ ret = kvm_eiointc_regs_access(dev, attr, false, &data);
+ if (ret)
+ return ret;
+
+ if (copy_to_user((void __user *)attr->addr, &data, 4))
+ ret = -EFAULT;
+
+ return ret;
case KVM_DEV_LOONGARCH_EXTIOI_GRP_SW_STATUS:
- return kvm_eiointc_sw_status_access(dev, attr, false);
+ ret = kvm_eiointc_sw_status_access(dev, attr, false, &data);
+ if (ret)
+ return ret;
+
+ if (copy_to_user((void __user *)attr->addr, &data, 4))
+ ret = -EFAULT;
+
+ return ret;
default:
return -EINVAL;
}
@@ -584,13 +600,21 @@ static int kvm_eiointc_get_attr(struct kvm_device *dev,
static int kvm_eiointc_set_attr(struct kvm_device *dev,
struct kvm_device_attr *attr)
{
+ int data;
+
switch (attr->group) {
case KVM_DEV_LOONGARCH_EXTIOI_GRP_CTRL:
return kvm_eiointc_ctrl_access(dev, attr);
case KVM_DEV_LOONGARCH_EXTIOI_GRP_REGS:
- return kvm_eiointc_regs_access(dev, attr, true);
+ if (copy_from_user(&data, (void __user *)attr->addr, 4))
+ return -EFAULT;
+
+ return kvm_eiointc_regs_access(dev, attr, true, &data);
case KVM_DEV_LOONGARCH_EXTIOI_GRP_SW_STATUS:
- return kvm_eiointc_sw_status_access(dev, attr, true);
+ if (copy_from_user(&data, (void __user *)attr->addr, 4))
+ return -EFAULT;
+
+ return kvm_eiointc_sw_status_access(dev, attr, true, &data);
default:
return -EINVAL;
}
diff --git a/arch/loongarch/kvm/intc/ipi.c b/arch/loongarch/kvm/intc/ipi.c
index e658d5b37c04..5a8481dda052 100644
--- a/arch/loongarch/kvm/intc/ipi.c
+++ b/arch/loongarch/kvm/intc/ipi.c
@@ -99,7 +99,7 @@ static void write_mailbox(struct kvm_vcpu *vcpu, int offset, uint64_t data, int
static int send_ipi_data(struct kvm_vcpu *vcpu, gpa_t addr, uint64_t data)
{
int i, idx, ret;
- uint32_t val = 0, mask = 0;
+ uint64_t val = 0, mask = 0;
/*
* Bit 27-30 is mask for byte writing.
@@ -108,7 +108,7 @@ static int send_ipi_data(struct kvm_vcpu *vcpu, gpa_t addr, uint64_t data)
if ((data >> 27) & 0xf) {
/* Read the old val */
idx = srcu_read_lock(&vcpu->kvm->srcu);
- ret = kvm_io_bus_read(vcpu, KVM_IOCSR_BUS, addr, sizeof(val), &val);
+ ret = kvm_io_bus_read(vcpu, KVM_IOCSR_BUS, addr, 4, &val);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
if (unlikely(ret)) {
kvm_err("%s: : read data from addr %llx failed\n", __func__, addr);
@@ -124,7 +124,7 @@ static int send_ipi_data(struct kvm_vcpu *vcpu, gpa_t addr, uint64_t data)
}
val |= ((uint32_t)(data >> 32) & ~mask);
idx = srcu_read_lock(&vcpu->kvm->srcu);
- ret = kvm_io_bus_write(vcpu, KVM_IOCSR_BUS, addr, sizeof(val), &val);
+ ret = kvm_io_bus_write(vcpu, KVM_IOCSR_BUS, addr, 4, &val);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
if (unlikely(ret))
kvm_err("%s: : write data to addr %llx failed\n", __func__, addr);
@@ -298,7 +298,7 @@ static int kvm_ipi_regs_access(struct kvm_device *dev,
cpu = (attr->attr >> 16) & 0x3ff;
addr = attr->attr & 0xff;
- vcpu = kvm_get_vcpu(dev->kvm, cpu);
+ vcpu = kvm_get_vcpu_by_id(dev->kvm, cpu);
if (unlikely(vcpu == NULL)) {
kvm_err("%s: invalid target cpu: %d\n", __func__, cpu);
return -EINVAL;
diff --git a/arch/loongarch/kvm/intc/pch_pic.c b/arch/loongarch/kvm/intc/pch_pic.c
index 6f00ffe05c54..baf3b4faf7ea 100644
--- a/arch/loongarch/kvm/intc/pch_pic.c
+++ b/arch/loongarch/kvm/intc/pch_pic.c
@@ -195,6 +195,11 @@ static int kvm_pch_pic_read(struct kvm_vcpu *vcpu,
return -EINVAL;
}
+ if (addr & (len - 1)) {
+ kvm_err("%s: pch pic not aligned addr %llx len %d\n", __func__, addr, len);
+ return -EINVAL;
+ }
+
/* statistics of pch pic reading */
vcpu->stat.pch_pic_read_exits++;
ret = loongarch_pch_pic_read(s, addr, len, val);
@@ -302,6 +307,11 @@ static int kvm_pch_pic_write(struct kvm_vcpu *vcpu,
return -EINVAL;
}
+ if (addr & (len - 1)) {
+ kvm_err("%s: pch pic not aligned addr %llx len %d\n", __func__, addr, len);
+ return -EINVAL;
+ }
+
/* statistics of pch pic writing */
vcpu->stat.pch_pic_write_exits++;
ret = loongarch_pch_pic_write(s, addr, len, val);
@@ -338,6 +348,7 @@ static int kvm_pch_pic_regs_access(struct kvm_device *dev,
struct kvm_device_attr *attr,
bool is_write)
{
+ char buf[8];
int addr, offset, len = 8, ret = 0;
void __user *data;
void *p = NULL;
@@ -387,17 +398,23 @@ static int kvm_pch_pic_regs_access(struct kvm_device *dev,
return -EINVAL;
}
- spin_lock(&s->lock);
- /* write or read value according to is_write */
if (is_write) {
- if (copy_from_user(p, data, len))
- ret = -EFAULT;
- } else {
- if (copy_to_user(data, p, len))
- ret = -EFAULT;
+ if (copy_from_user(buf, data, len))
+ return -EFAULT;
}
+
+ spin_lock(&s->lock);
+ if (is_write)
+ memcpy(p, buf, len);
+ else
+ memcpy(buf, p, len);
spin_unlock(&s->lock);
+ if (!is_write) {
+ if (copy_to_user(data, buf, len))
+ return -EFAULT;
+ }
+
return ret;
}
diff --git a/arch/loongarch/kvm/mmu.c b/arch/loongarch/kvm/mmu.c
index ed956c5cf2cc..7c8143e79c12 100644
--- a/arch/loongarch/kvm/mmu.c
+++ b/arch/loongarch/kvm/mmu.c
@@ -569,7 +569,7 @@ static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool writ
/* Track access to pages marked old */
new = kvm_pte_mkyoung(*ptep);
if (write && !kvm_pte_dirty(new)) {
- if (!kvm_pte_write(new)) {
+ if (!kvm_pte_writeable(new)) {
ret = -EFAULT;
goto out;
}
@@ -856,9 +856,9 @@ retry:
prot_bits |= _CACHE_SUC;
if (writeable) {
- prot_bits |= _PAGE_WRITE;
+ prot_bits = kvm_pte_mkwriteable(prot_bits);
if (write)
- prot_bits |= __WRITEABLE;
+ prot_bits = kvm_pte_mkdirty(prot_bits);
}
/* Disable dirty logging on HugePages */
@@ -904,7 +904,7 @@ retry:
kvm_release_faultin_page(kvm, page, false, writeable);
spin_unlock(&kvm->mmu_lock);
- if (prot_bits & _PAGE_DIRTY)
+ if (kvm_pte_dirty(prot_bits))
mark_page_dirty_in_slot(kvm, memslot, gfn);
out:
diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c
index d1b8c50941ca..ce478151466c 100644
--- a/arch/loongarch/kvm/vcpu.c
+++ b/arch/loongarch/kvm/vcpu.c
@@ -1283,9 +1283,11 @@ int kvm_own_lbt(struct kvm_vcpu *vcpu)
return -EINVAL;
preempt_disable();
- set_csr_euen(CSR_EUEN_LBTEN);
- _restore_lbt(&vcpu->arch.lbt);
- vcpu->arch.aux_inuse |= KVM_LARCH_LBT;
+ if (!(vcpu->arch.aux_inuse & KVM_LARCH_LBT)) {
+ set_csr_euen(CSR_EUEN_LBTEN);
+ _restore_lbt(&vcpu->arch.lbt);
+ vcpu->arch.aux_inuse |= KVM_LARCH_LBT;
+ }
preempt_enable();
return 0;
diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig
index 5171bb183967..fba8089c9fb3 100644
--- a/arch/m68k/configs/amiga_defconfig
+++ b/arch/m68k/configs/amiga_defconfig
@@ -125,6 +125,7 @@ CONFIG_NFT_FIB_NETDEV=m
CONFIG_NFT_REJECT_NETDEV=m
CONFIG_NF_FLOW_TABLE_INET=m
CONFIG_NF_FLOW_TABLE=m
+CONFIG_NETFILTER_XTABLES_LEGACY=y
CONFIG_NETFILTER_XT_SET=m
CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
@@ -206,14 +207,12 @@ CONFIG_IP_NF_MATCH_AH=m
CONFIG_IP_NF_MATCH_ECN=m
CONFIG_IP_NF_MATCH_RPFILTER=m
CONFIG_IP_NF_MATCH_TTL=m
-CONFIG_IP_NF_FILTER=m
CONFIG_IP_NF_TARGET_REJECT=m
CONFIG_IP_NF_TARGET_SYNPROXY=m
CONFIG_IP_NF_NAT=m
CONFIG_IP_NF_TARGET_MASQUERADE=m
CONFIG_IP_NF_TARGET_NETMAP=m
CONFIG_IP_NF_TARGET_REDIRECT=m
-CONFIG_IP_NF_MANGLE=m
CONFIG_IP_NF_TARGET_ECN=m
CONFIG_IP_NF_TARGET_TTL=m
CONFIG_IP_NF_RAW=m
@@ -233,10 +232,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m
CONFIG_IP6_NF_MATCH_RT=m
CONFIG_IP6_NF_MATCH_SRH=m
CONFIG_IP6_NF_TARGET_HL=m
-CONFIG_IP6_NF_FILTER=m
CONFIG_IP6_NF_TARGET_REJECT=m
CONFIG_IP6_NF_TARGET_SYNPROXY=m
-CONFIG_IP6_NF_MANGLE=m
CONFIG_IP6_NF_RAW=m
CONFIG_IP6_NF_NAT=m
CONFIG_IP6_NF_TARGET_MASQUERADE=m
@@ -245,6 +242,7 @@ CONFIG_NF_TABLES_BRIDGE=m
CONFIG_NFT_BRIDGE_META=m
CONFIG_NFT_BRIDGE_REJECT=m
CONFIG_NF_CONNTRACK_BRIDGE=m
+CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m
CONFIG_BRIDGE_NF_EBTABLES=m
CONFIG_BRIDGE_EBT_BROUTE=m
CONFIG_BRIDGE_EBT_T_FILTER=m
@@ -309,7 +307,6 @@ CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=y
-CONFIG_CDROM_PKTCDVD=m
CONFIG_ATA_OVER_ETH=m
CONFIG_DUMMY_IRQ=m
CONFIG_RAID_ATTRS=m
@@ -454,7 +451,6 @@ CONFIG_XFS_FS=m
CONFIG_OCFS2_FS=m
# CONFIG_OCFS2_DEBUG_MASKLOG is not set
CONFIG_BTRFS_FS=m
-CONFIG_BCACHEFS_FS=m
CONFIG_FANOTIFY=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
CONFIG_AUTOFS_FS=m
@@ -560,7 +556,6 @@ CONFIG_CRYPTO_DH=m
CONFIG_CRYPTO_ECDH=m
CONFIG_CRYPTO_ECDSA=m
CONFIG_CRYPTO_ECRDSA=m
-CONFIG_CRYPTO_CURVE25519=m
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_AES_TI=m
CONFIG_CRYPTO_ANUBIS=m
@@ -605,6 +600,7 @@ CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
# CONFIG_CRYPTO_HW is not set
CONFIG_PRIME_NUMBERS=m
+CONFIG_CRC_BENCHMARK=y
CONFIG_XZ_DEC_TEST=m
CONFIG_GLOB_SELFTEST=m
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
@@ -636,7 +632,6 @@ CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
CONFIG_LINEAR_RANGES_TEST=m
-CONFIG_CRC_BENCHMARK=y
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig
index 16f343ae48c6..6af37716384c 100644
--- a/arch/m68k/configs/apollo_defconfig
+++ b/arch/m68k/configs/apollo_defconfig
@@ -121,6 +121,7 @@ CONFIG_NFT_FIB_NETDEV=m
CONFIG_NFT_REJECT_NETDEV=m
CONFIG_NF_FLOW_TABLE_INET=m
CONFIG_NF_FLOW_TABLE=m
+CONFIG_NETFILTER_XTABLES_LEGACY=y
CONFIG_NETFILTER_XT_SET=m
CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
@@ -202,14 +203,12 @@ CONFIG_IP_NF_MATCH_AH=m
CONFIG_IP_NF_MATCH_ECN=m
CONFIG_IP_NF_MATCH_RPFILTER=m
CONFIG_IP_NF_MATCH_TTL=m
-CONFIG_IP_NF_FILTER=m
CONFIG_IP_NF_TARGET_REJECT=m
CONFIG_IP_NF_TARGET_SYNPROXY=m
CONFIG_IP_NF_NAT=m
CONFIG_IP_NF_TARGET_MASQUERADE=m
CONFIG_IP_NF_TARGET_NETMAP=m
CONFIG_IP_NF_TARGET_REDIRECT=m
-CONFIG_IP_NF_MANGLE=m
CONFIG_IP_NF_TARGET_ECN=m
CONFIG_IP_NF_TARGET_TTL=m
CONFIG_IP_NF_RAW=m
@@ -229,10 +228,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m
CONFIG_IP6_NF_MATCH_RT=m
CONFIG_IP6_NF_MATCH_SRH=m
CONFIG_IP6_NF_TARGET_HL=m
-CONFIG_IP6_NF_FILTER=m
CONFIG_IP6_NF_TARGET_REJECT=m
CONFIG_IP6_NF_TARGET_SYNPROXY=m
-CONFIG_IP6_NF_MANGLE=m
CONFIG_IP6_NF_RAW=m
CONFIG_IP6_NF_NAT=m
CONFIG_IP6_NF_TARGET_MASQUERADE=m
@@ -241,6 +238,7 @@ CONFIG_NF_TABLES_BRIDGE=m
CONFIG_NFT_BRIDGE_META=m
CONFIG_NFT_BRIDGE_REJECT=m
CONFIG_NF_CONNTRACK_BRIDGE=m
+CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m
CONFIG_BRIDGE_NF_EBTABLES=m
CONFIG_BRIDGE_EBT_BROUTE=m
CONFIG_BRIDGE_EBT_T_FILTER=m
@@ -299,7 +297,6 @@ CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=y
-CONFIG_CDROM_PKTCDVD=m
CONFIG_ATA_OVER_ETH=m
CONFIG_DUMMY_IRQ=m
CONFIG_RAID_ATTRS=m
@@ -411,7 +408,6 @@ CONFIG_XFS_FS=m
CONFIG_OCFS2_FS=m
# CONFIG_OCFS2_DEBUG_MASKLOG is not set
CONFIG_BTRFS_FS=m
-CONFIG_BCACHEFS_FS=m
CONFIG_FANOTIFY=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
CONFIG_AUTOFS_FS=m
@@ -517,7 +513,6 @@ CONFIG_CRYPTO_DH=m
CONFIG_CRYPTO_ECDH=m
CONFIG_CRYPTO_ECDSA=m
CONFIG_CRYPTO_ECRDSA=m
-CONFIG_CRYPTO_CURVE25519=m
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_AES_TI=m
CONFIG_CRYPTO_ANUBIS=m
@@ -562,6 +557,7 @@ CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
# CONFIG_CRYPTO_HW is not set
CONFIG_PRIME_NUMBERS=m
+CONFIG_CRC_BENCHMARK=y
CONFIG_XZ_DEC_TEST=m
CONFIG_GLOB_SELFTEST=m
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
@@ -593,7 +589,6 @@ CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
CONFIG_LINEAR_RANGES_TEST=m
-CONFIG_CRC_BENCHMARK=y
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig
index c08788728ea9..471f4ec3730d 100644
--- a/arch/m68k/configs/atari_defconfig
+++ b/arch/m68k/configs/atari_defconfig
@@ -128,6 +128,7 @@ CONFIG_NFT_FIB_NETDEV=m
CONFIG_NFT_REJECT_NETDEV=m
CONFIG_NF_FLOW_TABLE_INET=m
CONFIG_NF_FLOW_TABLE=m
+CONFIG_NETFILTER_XTABLES_LEGACY=y
CONFIG_NETFILTER_XT_SET=m
CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
@@ -209,14 +210,12 @@ CONFIG_IP_NF_MATCH_AH=m
CONFIG_IP_NF_MATCH_ECN=m
CONFIG_IP_NF_MATCH_RPFILTER=m
CONFIG_IP_NF_MATCH_TTL=m
-CONFIG_IP_NF_FILTER=m
CONFIG_IP_NF_TARGET_REJECT=m
CONFIG_IP_NF_TARGET_SYNPROXY=m
CONFIG_IP_NF_NAT=m
CONFIG_IP_NF_TARGET_MASQUERADE=m
CONFIG_IP_NF_TARGET_NETMAP=m
CONFIG_IP_NF_TARGET_REDIRECT=m
-CONFIG_IP_NF_MANGLE=m
CONFIG_IP_NF_TARGET_ECN=m
CONFIG_IP_NF_TARGET_TTL=m
CONFIG_IP_NF_RAW=m
@@ -236,10 +235,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m
CONFIG_IP6_NF_MATCH_RT=m
CONFIG_IP6_NF_MATCH_SRH=m
CONFIG_IP6_NF_TARGET_HL=m
-CONFIG_IP6_NF_FILTER=m
CONFIG_IP6_NF_TARGET_REJECT=m
CONFIG_IP6_NF_TARGET_SYNPROXY=m
-CONFIG_IP6_NF_MANGLE=m
CONFIG_IP6_NF_RAW=m
CONFIG_IP6_NF_NAT=m
CONFIG_IP6_NF_TARGET_MASQUERADE=m
@@ -248,6 +245,7 @@ CONFIG_NF_TABLES_BRIDGE=m
CONFIG_NFT_BRIDGE_META=m
CONFIG_NFT_BRIDGE_REJECT=m
CONFIG_NF_CONNTRACK_BRIDGE=m
+CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m
CONFIG_BRIDGE_NF_EBTABLES=m
CONFIG_BRIDGE_EBT_BROUTE=m
CONFIG_BRIDGE_EBT_T_FILTER=m
@@ -310,7 +308,6 @@ CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=y
-CONFIG_CDROM_PKTCDVD=m
CONFIG_ATA_OVER_ETH=m
CONFIG_DUMMY_IRQ=m
CONFIG_RAID_ATTRS=m
@@ -431,7 +428,6 @@ CONFIG_XFS_FS=m
CONFIG_OCFS2_FS=m
# CONFIG_OCFS2_DEBUG_MASKLOG is not set
CONFIG_BTRFS_FS=m
-CONFIG_BCACHEFS_FS=m
CONFIG_FANOTIFY=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
CONFIG_AUTOFS_FS=m
@@ -537,7 +533,6 @@ CONFIG_CRYPTO_DH=m
CONFIG_CRYPTO_ECDH=m
CONFIG_CRYPTO_ECDSA=m
CONFIG_CRYPTO_ECRDSA=m
-CONFIG_CRYPTO_CURVE25519=m
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_AES_TI=m
CONFIG_CRYPTO_ANUBIS=m
@@ -582,6 +577,7 @@ CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
# CONFIG_CRYPTO_HW is not set
CONFIG_PRIME_NUMBERS=m
+CONFIG_CRC_BENCHMARK=y
CONFIG_XZ_DEC_TEST=m
CONFIG_GLOB_SELFTEST=m
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
@@ -613,7 +609,6 @@ CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
CONFIG_LINEAR_RANGES_TEST=m
-CONFIG_CRC_BENCHMARK=y
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig
index 962497e7c53f..28492ef51457 100644
--- a/arch/m68k/configs/bvme6000_defconfig
+++ b/arch/m68k/configs/bvme6000_defconfig
@@ -118,6 +118,7 @@ CONFIG_NFT_FIB_NETDEV=m
CONFIG_NFT_REJECT_NETDEV=m
CONFIG_NF_FLOW_TABLE_INET=m
CONFIG_NF_FLOW_TABLE=m
+CONFIG_NETFILTER_XTABLES_LEGACY=y
CONFIG_NETFILTER_XT_SET=m
CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
@@ -199,14 +200,12 @@ CONFIG_IP_NF_MATCH_AH=m
CONFIG_IP_NF_MATCH_ECN=m
CONFIG_IP_NF_MATCH_RPFILTER=m
CONFIG_IP_NF_MATCH_TTL=m
-CONFIG_IP_NF_FILTER=m
CONFIG_IP_NF_TARGET_REJECT=m
CONFIG_IP_NF_TARGET_SYNPROXY=m
CONFIG_IP_NF_NAT=m
CONFIG_IP_NF_TARGET_MASQUERADE=m
CONFIG_IP_NF_TARGET_NETMAP=m
CONFIG_IP_NF_TARGET_REDIRECT=m
-CONFIG_IP_NF_MANGLE=m
CONFIG_IP_NF_TARGET_ECN=m
CONFIG_IP_NF_TARGET_TTL=m
CONFIG_IP_NF_RAW=m
@@ -226,10 +225,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m
CONFIG_IP6_NF_MATCH_RT=m
CONFIG_IP6_NF_MATCH_SRH=m
CONFIG_IP6_NF_TARGET_HL=m
-CONFIG_IP6_NF_FILTER=m
CONFIG_IP6_NF_TARGET_REJECT=m
CONFIG_IP6_NF_TARGET_SYNPROXY=m
-CONFIG_IP6_NF_MANGLE=m
CONFIG_IP6_NF_RAW=m
CONFIG_IP6_NF_NAT=m
CONFIG_IP6_NF_TARGET_MASQUERADE=m
@@ -238,6 +235,7 @@ CONFIG_NF_TABLES_BRIDGE=m
CONFIG_NFT_BRIDGE_META=m
CONFIG_NFT_BRIDGE_REJECT=m
CONFIG_NF_CONNTRACK_BRIDGE=m
+CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m
CONFIG_BRIDGE_NF_EBTABLES=m
CONFIG_BRIDGE_EBT_BROUTE=m
CONFIG_BRIDGE_EBT_T_FILTER=m
@@ -296,7 +294,6 @@ CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=y
-CONFIG_CDROM_PKTCDVD=m
CONFIG_ATA_OVER_ETH=m
CONFIG_DUMMY_IRQ=m
CONFIG_RAID_ATTRS=m
@@ -403,7 +400,6 @@ CONFIG_XFS_FS=m
CONFIG_OCFS2_FS=m
# CONFIG_OCFS2_DEBUG_MASKLOG is not set
CONFIG_BTRFS_FS=m
-CONFIG_BCACHEFS_FS=m
CONFIG_FANOTIFY=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
CONFIG_AUTOFS_FS=m
@@ -509,7 +505,6 @@ CONFIG_CRYPTO_DH=m
CONFIG_CRYPTO_ECDH=m
CONFIG_CRYPTO_ECDSA=m
CONFIG_CRYPTO_ECRDSA=m
-CONFIG_CRYPTO_CURVE25519=m
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_AES_TI=m
CONFIG_CRYPTO_ANUBIS=m
@@ -554,6 +549,7 @@ CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
# CONFIG_CRYPTO_HW is not set
CONFIG_PRIME_NUMBERS=m
+CONFIG_CRC_BENCHMARK=y
CONFIG_XZ_DEC_TEST=m
CONFIG_GLOB_SELFTEST=m
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
@@ -585,7 +581,6 @@ CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
CONFIG_LINEAR_RANGES_TEST=m
-CONFIG_CRC_BENCHMARK=y
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig
index ec28650189e4..2fbefb16b72e 100644
--- a/arch/m68k/configs/hp300_defconfig
+++ b/arch/m68k/configs/hp300_defconfig
@@ -120,6 +120,7 @@ CONFIG_NFT_FIB_NETDEV=m
CONFIG_NFT_REJECT_NETDEV=m
CONFIG_NF_FLOW_TABLE_INET=m
CONFIG_NF_FLOW_TABLE=m
+CONFIG_NETFILTER_XTABLES_LEGACY=y
CONFIG_NETFILTER_XT_SET=m
CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
@@ -201,14 +202,12 @@ CONFIG_IP_NF_MATCH_AH=m
CONFIG_IP_NF_MATCH_ECN=m
CONFIG_IP_NF_MATCH_RPFILTER=m
CONFIG_IP_NF_MATCH_TTL=m
-CONFIG_IP_NF_FILTER=m
CONFIG_IP_NF_TARGET_REJECT=m
CONFIG_IP_NF_TARGET_SYNPROXY=m
CONFIG_IP_NF_NAT=m
CONFIG_IP_NF_TARGET_MASQUERADE=m
CONFIG_IP_NF_TARGET_NETMAP=m
CONFIG_IP_NF_TARGET_REDIRECT=m
-CONFIG_IP_NF_MANGLE=m
CONFIG_IP_NF_TARGET_ECN=m
CONFIG_IP_NF_TARGET_TTL=m
CONFIG_IP_NF_RAW=m
@@ -228,10 +227,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m
CONFIG_IP6_NF_MATCH_RT=m
CONFIG_IP6_NF_MATCH_SRH=m
CONFIG_IP6_NF_TARGET_HL=m
-CONFIG_IP6_NF_FILTER=m
CONFIG_IP6_NF_TARGET_REJECT=m
CONFIG_IP6_NF_TARGET_SYNPROXY=m
-CONFIG_IP6_NF_MANGLE=m
CONFIG_IP6_NF_RAW=m
CONFIG_IP6_NF_NAT=m
CONFIG_IP6_NF_TARGET_MASQUERADE=m
@@ -240,6 +237,7 @@ CONFIG_NF_TABLES_BRIDGE=m
CONFIG_NFT_BRIDGE_META=m
CONFIG_NFT_BRIDGE_REJECT=m
CONFIG_NF_CONNTRACK_BRIDGE=m
+CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m
CONFIG_BRIDGE_NF_EBTABLES=m
CONFIG_BRIDGE_EBT_BROUTE=m
CONFIG_BRIDGE_EBT_T_FILTER=m
@@ -298,7 +296,6 @@ CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=y
-CONFIG_CDROM_PKTCDVD=m
CONFIG_ATA_OVER_ETH=m
CONFIG_DUMMY_IRQ=m
CONFIG_RAID_ATTRS=m
@@ -413,7 +410,6 @@ CONFIG_XFS_FS=m
CONFIG_OCFS2_FS=m
# CONFIG_OCFS2_DEBUG_MASKLOG is not set
CONFIG_BTRFS_FS=m
-CONFIG_BCACHEFS_FS=m
CONFIG_FANOTIFY=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
CONFIG_AUTOFS_FS=m
@@ -519,7 +515,6 @@ CONFIG_CRYPTO_DH=m
CONFIG_CRYPTO_ECDH=m
CONFIG_CRYPTO_ECDSA=m
CONFIG_CRYPTO_ECRDSA=m
-CONFIG_CRYPTO_CURVE25519=m
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_AES_TI=m
CONFIG_CRYPTO_ANUBIS=m
@@ -564,6 +559,7 @@ CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
# CONFIG_CRYPTO_HW is not set
CONFIG_PRIME_NUMBERS=m
+CONFIG_CRC_BENCHMARK=y
CONFIG_XZ_DEC_TEST=m
CONFIG_GLOB_SELFTEST=m
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
@@ -595,7 +591,6 @@ CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
CONFIG_LINEAR_RANGES_TEST=m
-CONFIG_CRC_BENCHMARK=y
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig
index 0afb3ad180de..deec5df3f35a 100644
--- a/arch/m68k/configs/mac_defconfig
+++ b/arch/m68k/configs/mac_defconfig
@@ -119,6 +119,7 @@ CONFIG_NFT_FIB_NETDEV=m
CONFIG_NFT_REJECT_NETDEV=m
CONFIG_NF_FLOW_TABLE_INET=m
CONFIG_NF_FLOW_TABLE=m
+CONFIG_NETFILTER_XTABLES_LEGACY=y
CONFIG_NETFILTER_XT_SET=m
CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
@@ -200,14 +201,12 @@ CONFIG_IP_NF_MATCH_AH=m
CONFIG_IP_NF_MATCH_ECN=m
CONFIG_IP_NF_MATCH_RPFILTER=m
CONFIG_IP_NF_MATCH_TTL=m
-CONFIG_IP_NF_FILTER=m
CONFIG_IP_NF_TARGET_REJECT=m
CONFIG_IP_NF_TARGET_SYNPROXY=m
CONFIG_IP_NF_NAT=m
CONFIG_IP_NF_TARGET_MASQUERADE=m
CONFIG_IP_NF_TARGET_NETMAP=m
CONFIG_IP_NF_TARGET_REDIRECT=m
-CONFIG_IP_NF_MANGLE=m
CONFIG_IP_NF_TARGET_ECN=m
CONFIG_IP_NF_TARGET_TTL=m
CONFIG_IP_NF_RAW=m
@@ -227,10 +226,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m
CONFIG_IP6_NF_MATCH_RT=m
CONFIG_IP6_NF_MATCH_SRH=m
CONFIG_IP6_NF_TARGET_HL=m
-CONFIG_IP6_NF_FILTER=m
CONFIG_IP6_NF_TARGET_REJECT=m
CONFIG_IP6_NF_TARGET_SYNPROXY=m
-CONFIG_IP6_NF_MANGLE=m
CONFIG_IP6_NF_RAW=m
CONFIG_IP6_NF_NAT=m
CONFIG_IP6_NF_TARGET_MASQUERADE=m
@@ -239,6 +236,7 @@ CONFIG_NF_TABLES_BRIDGE=m
CONFIG_NFT_BRIDGE_META=m
CONFIG_NFT_BRIDGE_REJECT=m
CONFIG_NF_CONNTRACK_BRIDGE=m
+CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m
CONFIG_BRIDGE_NF_EBTABLES=m
CONFIG_BRIDGE_EBT_BROUTE=m
CONFIG_BRIDGE_EBT_T_FILTER=m
@@ -298,7 +296,6 @@ CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=y
-CONFIG_CDROM_PKTCDVD=m
CONFIG_ATA_OVER_ETH=m
CONFIG_DUMMY_IRQ=m
CONFIG_RAID_ATTRS=m
@@ -430,7 +427,6 @@ CONFIG_XFS_FS=m
CONFIG_OCFS2_FS=m
# CONFIG_OCFS2_DEBUG_MASKLOG is not set
CONFIG_BTRFS_FS=m
-CONFIG_BCACHEFS_FS=m
CONFIG_FANOTIFY=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
CONFIG_AUTOFS_FS=m
@@ -536,7 +532,6 @@ CONFIG_CRYPTO_DH=m
CONFIG_CRYPTO_ECDH=m
CONFIG_CRYPTO_ECDSA=m
CONFIG_CRYPTO_ECRDSA=m
-CONFIG_CRYPTO_CURVE25519=m
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_AES_TI=m
CONFIG_CRYPTO_ANUBIS=m
@@ -581,6 +576,7 @@ CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
# CONFIG_CRYPTO_HW is not set
CONFIG_PRIME_NUMBERS=m
+CONFIG_CRC_BENCHMARK=y
CONFIG_XZ_DEC_TEST=m
CONFIG_GLOB_SELFTEST=m
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
@@ -612,7 +608,6 @@ CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
CONFIG_LINEAR_RANGES_TEST=m
-CONFIG_CRC_BENCHMARK=y
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig
index b311e953995d..301a05c12577 100644
--- a/arch/m68k/configs/multi_defconfig
+++ b/arch/m68k/configs/multi_defconfig
@@ -139,6 +139,7 @@ CONFIG_NFT_FIB_NETDEV=m
CONFIG_NFT_REJECT_NETDEV=m
CONFIG_NF_FLOW_TABLE_INET=m
CONFIG_NF_FLOW_TABLE=m
+CONFIG_NETFILTER_XTABLES_LEGACY=y
CONFIG_NETFILTER_XT_SET=m
CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
@@ -220,14 +221,12 @@ CONFIG_IP_NF_MATCH_AH=m
CONFIG_IP_NF_MATCH_ECN=m
CONFIG_IP_NF_MATCH_RPFILTER=m
CONFIG_IP_NF_MATCH_TTL=m
-CONFIG_IP_NF_FILTER=m
CONFIG_IP_NF_TARGET_REJECT=m
CONFIG_IP_NF_TARGET_SYNPROXY=m
CONFIG_IP_NF_NAT=m
CONFIG_IP_NF_TARGET_MASQUERADE=m
CONFIG_IP_NF_TARGET_NETMAP=m
CONFIG_IP_NF_TARGET_REDIRECT=m
-CONFIG_IP_NF_MANGLE=m
CONFIG_IP_NF_TARGET_ECN=m
CONFIG_IP_NF_TARGET_TTL=m
CONFIG_IP_NF_RAW=m
@@ -247,10 +246,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m
CONFIG_IP6_NF_MATCH_RT=m
CONFIG_IP6_NF_MATCH_SRH=m
CONFIG_IP6_NF_TARGET_HL=m
-CONFIG_IP6_NF_FILTER=m
CONFIG_IP6_NF_TARGET_REJECT=m
CONFIG_IP6_NF_TARGET_SYNPROXY=m
-CONFIG_IP6_NF_MANGLE=m
CONFIG_IP6_NF_RAW=m
CONFIG_IP6_NF_NAT=m
CONFIG_IP6_NF_TARGET_MASQUERADE=m
@@ -259,6 +256,7 @@ CONFIG_NF_TABLES_BRIDGE=m
CONFIG_NFT_BRIDGE_META=m
CONFIG_NFT_BRIDGE_REJECT=m
CONFIG_NF_CONNTRACK_BRIDGE=m
+CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m
CONFIG_BRIDGE_NF_EBTABLES=m
CONFIG_BRIDGE_EBT_BROUTE=m
CONFIG_BRIDGE_EBT_T_FILTER=m
@@ -327,7 +325,6 @@ CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=y
-CONFIG_CDROM_PKTCDVD=m
CONFIG_ATA_OVER_ETH=m
CONFIG_DUMMY_IRQ=m
CONFIG_RAID_ATTRS=m
@@ -517,7 +514,6 @@ CONFIG_XFS_FS=m
CONFIG_OCFS2_FS=m
# CONFIG_OCFS2_DEBUG_MASKLOG is not set
CONFIG_BTRFS_FS=m
-CONFIG_BCACHEFS_FS=m
CONFIG_FANOTIFY=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
CONFIG_AUTOFS_FS=m
@@ -623,7 +619,6 @@ CONFIG_CRYPTO_DH=m
CONFIG_CRYPTO_ECDH=m
CONFIG_CRYPTO_ECDSA=m
CONFIG_CRYPTO_ECRDSA=m
-CONFIG_CRYPTO_CURVE25519=m
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_AES_TI=m
CONFIG_CRYPTO_ANUBIS=m
@@ -668,6 +663,7 @@ CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
# CONFIG_CRYPTO_HW is not set
CONFIG_PRIME_NUMBERS=m
+CONFIG_CRC_BENCHMARK=y
CONFIG_XZ_DEC_TEST=m
CONFIG_GLOB_SELFTEST=m
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
@@ -699,7 +695,6 @@ CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
CONFIG_LINEAR_RANGES_TEST=m
-CONFIG_CRC_BENCHMARK=y
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig
index f4e6224f137f..0d401db0e8f8 100644
--- a/arch/m68k/configs/mvme147_defconfig
+++ b/arch/m68k/configs/mvme147_defconfig
@@ -117,6 +117,7 @@ CONFIG_NFT_FIB_NETDEV=m
CONFIG_NFT_REJECT_NETDEV=m
CONFIG_NF_FLOW_TABLE_INET=m
CONFIG_NF_FLOW_TABLE=m
+CONFIG_NETFILTER_XTABLES_LEGACY=y
CONFIG_NETFILTER_XT_SET=m
CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
@@ -198,14 +199,12 @@ CONFIG_IP_NF_MATCH_AH=m
CONFIG_IP_NF_MATCH_ECN=m
CONFIG_IP_NF_MATCH_RPFILTER=m
CONFIG_IP_NF_MATCH_TTL=m
-CONFIG_IP_NF_FILTER=m
CONFIG_IP_NF_TARGET_REJECT=m
CONFIG_IP_NF_TARGET_SYNPROXY=m
CONFIG_IP_NF_NAT=m
CONFIG_IP_NF_TARGET_MASQUERADE=m
CONFIG_IP_NF_TARGET_NETMAP=m
CONFIG_IP_NF_TARGET_REDIRECT=m
-CONFIG_IP_NF_MANGLE=m
CONFIG_IP_NF_TARGET_ECN=m
CONFIG_IP_NF_TARGET_TTL=m
CONFIG_IP_NF_RAW=m
@@ -225,10 +224,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m
CONFIG_IP6_NF_MATCH_RT=m
CONFIG_IP6_NF_MATCH_SRH=m
CONFIG_IP6_NF_TARGET_HL=m
-CONFIG_IP6_NF_FILTER=m
CONFIG_IP6_NF_TARGET_REJECT=m
CONFIG_IP6_NF_TARGET_SYNPROXY=m
-CONFIG_IP6_NF_MANGLE=m
CONFIG_IP6_NF_RAW=m
CONFIG_IP6_NF_NAT=m
CONFIG_IP6_NF_TARGET_MASQUERADE=m
@@ -237,6 +234,7 @@ CONFIG_NF_TABLES_BRIDGE=m
CONFIG_NFT_BRIDGE_META=m
CONFIG_NFT_BRIDGE_REJECT=m
CONFIG_NF_CONNTRACK_BRIDGE=m
+CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m
CONFIG_BRIDGE_NF_EBTABLES=m
CONFIG_BRIDGE_EBT_BROUTE=m
CONFIG_BRIDGE_EBT_T_FILTER=m
@@ -295,7 +293,6 @@ CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=y
-CONFIG_CDROM_PKTCDVD=m
CONFIG_ATA_OVER_ETH=m
CONFIG_DUMMY_IRQ=m
CONFIG_RAID_ATTRS=m
@@ -403,7 +400,6 @@ CONFIG_XFS_FS=m
CONFIG_OCFS2_FS=m
# CONFIG_OCFS2_DEBUG_MASKLOG is not set
CONFIG_BTRFS_FS=m
-CONFIG_BCACHEFS_FS=m
CONFIG_FANOTIFY=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
CONFIG_AUTOFS_FS=m
@@ -509,7 +505,6 @@ CONFIG_CRYPTO_DH=m
CONFIG_CRYPTO_ECDH=m
CONFIG_CRYPTO_ECDSA=m
CONFIG_CRYPTO_ECRDSA=m
-CONFIG_CRYPTO_CURVE25519=m
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_AES_TI=m
CONFIG_CRYPTO_ANUBIS=m
@@ -554,6 +549,7 @@ CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
# CONFIG_CRYPTO_HW is not set
CONFIG_PRIME_NUMBERS=m
+CONFIG_CRC_BENCHMARK=y
CONFIG_XZ_DEC_TEST=m
CONFIG_GLOB_SELFTEST=m
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
@@ -585,7 +581,6 @@ CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
CONFIG_LINEAR_RANGES_TEST=m
-CONFIG_CRC_BENCHMARK=y
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig
index 498e167222f1..90fb5b6bcf83 100644
--- a/arch/m68k/configs/mvme16x_defconfig
+++ b/arch/m68k/configs/mvme16x_defconfig
@@ -118,6 +118,7 @@ CONFIG_NFT_FIB_NETDEV=m
CONFIG_NFT_REJECT_NETDEV=m
CONFIG_NF_FLOW_TABLE_INET=m
CONFIG_NF_FLOW_TABLE=m
+CONFIG_NETFILTER_XTABLES_LEGACY=y
CONFIG_NETFILTER_XT_SET=m
CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
@@ -199,14 +200,12 @@ CONFIG_IP_NF_MATCH_AH=m
CONFIG_IP_NF_MATCH_ECN=m
CONFIG_IP_NF_MATCH_RPFILTER=m
CONFIG_IP_NF_MATCH_TTL=m
-CONFIG_IP_NF_FILTER=m
CONFIG_IP_NF_TARGET_REJECT=m
CONFIG_IP_NF_TARGET_SYNPROXY=m
CONFIG_IP_NF_NAT=m
CONFIG_IP_NF_TARGET_MASQUERADE=m
CONFIG_IP_NF_TARGET_NETMAP=m
CONFIG_IP_NF_TARGET_REDIRECT=m
-CONFIG_IP_NF_MANGLE=m
CONFIG_IP_NF_TARGET_ECN=m
CONFIG_IP_NF_TARGET_TTL=m
CONFIG_IP_NF_RAW=m
@@ -226,10 +225,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m
CONFIG_IP6_NF_MATCH_RT=m
CONFIG_IP6_NF_MATCH_SRH=m
CONFIG_IP6_NF_TARGET_HL=m
-CONFIG_IP6_NF_FILTER=m
CONFIG_IP6_NF_TARGET_REJECT=m
CONFIG_IP6_NF_TARGET_SYNPROXY=m
-CONFIG_IP6_NF_MANGLE=m
CONFIG_IP6_NF_RAW=m
CONFIG_IP6_NF_NAT=m
CONFIG_IP6_NF_TARGET_MASQUERADE=m
@@ -238,6 +235,7 @@ CONFIG_NF_TABLES_BRIDGE=m
CONFIG_NFT_BRIDGE_META=m
CONFIG_NFT_BRIDGE_REJECT=m
CONFIG_NF_CONNTRACK_BRIDGE=m
+CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m
CONFIG_BRIDGE_NF_EBTABLES=m
CONFIG_BRIDGE_EBT_BROUTE=m
CONFIG_BRIDGE_EBT_T_FILTER=m
@@ -296,7 +294,6 @@ CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=y
-CONFIG_CDROM_PKTCDVD=m
CONFIG_ATA_OVER_ETH=m
CONFIG_DUMMY_IRQ=m
CONFIG_RAID_ATTRS=m
@@ -404,7 +401,6 @@ CONFIG_XFS_FS=m
CONFIG_OCFS2_FS=m
# CONFIG_OCFS2_DEBUG_MASKLOG is not set
CONFIG_BTRFS_FS=m
-CONFIG_BCACHEFS_FS=m
CONFIG_FANOTIFY=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
CONFIG_AUTOFS_FS=m
@@ -510,7 +506,6 @@ CONFIG_CRYPTO_DH=m
CONFIG_CRYPTO_ECDH=m
CONFIG_CRYPTO_ECDSA=m
CONFIG_CRYPTO_ECRDSA=m
-CONFIG_CRYPTO_CURVE25519=m
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_AES_TI=m
CONFIG_CRYPTO_ANUBIS=m
@@ -555,6 +550,7 @@ CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
# CONFIG_CRYPTO_HW is not set
CONFIG_PRIME_NUMBERS=m
+CONFIG_CRC_BENCHMARK=y
CONFIG_XZ_DEC_TEST=m
CONFIG_GLOB_SELFTEST=m
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
@@ -586,7 +582,6 @@ CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
CONFIG_LINEAR_RANGES_TEST=m
-CONFIG_CRC_BENCHMARK=y
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig
index 8c6b1eef8534..b89b0f7fe2da 100644
--- a/arch/m68k/configs/q40_defconfig
+++ b/arch/m68k/configs/q40_defconfig
@@ -119,6 +119,7 @@ CONFIG_NFT_FIB_NETDEV=m
CONFIG_NFT_REJECT_NETDEV=m
CONFIG_NF_FLOW_TABLE_INET=m
CONFIG_NF_FLOW_TABLE=m
+CONFIG_NETFILTER_XTABLES_LEGACY=y
CONFIG_NETFILTER_XT_SET=m
CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
@@ -200,14 +201,12 @@ CONFIG_IP_NF_MATCH_AH=m
CONFIG_IP_NF_MATCH_ECN=m
CONFIG_IP_NF_MATCH_RPFILTER=m
CONFIG_IP_NF_MATCH_TTL=m
-CONFIG_IP_NF_FILTER=m
CONFIG_IP_NF_TARGET_REJECT=m
CONFIG_IP_NF_TARGET_SYNPROXY=m
CONFIG_IP_NF_NAT=m
CONFIG_IP_NF_TARGET_MASQUERADE=m
CONFIG_IP_NF_TARGET_NETMAP=m
CONFIG_IP_NF_TARGET_REDIRECT=m
-CONFIG_IP_NF_MANGLE=m
CONFIG_IP_NF_TARGET_ECN=m
CONFIG_IP_NF_TARGET_TTL=m
CONFIG_IP_NF_RAW=m
@@ -227,10 +226,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m
CONFIG_IP6_NF_MATCH_RT=m
CONFIG_IP6_NF_MATCH_SRH=m
CONFIG_IP6_NF_TARGET_HL=m
-CONFIG_IP6_NF_FILTER=m
CONFIG_IP6_NF_TARGET_REJECT=m
CONFIG_IP6_NF_TARGET_SYNPROXY=m
-CONFIG_IP6_NF_MANGLE=m
CONFIG_IP6_NF_RAW=m
CONFIG_IP6_NF_NAT=m
CONFIG_IP6_NF_TARGET_MASQUERADE=m
@@ -239,6 +236,7 @@ CONFIG_NF_TABLES_BRIDGE=m
CONFIG_NFT_BRIDGE_META=m
CONFIG_NFT_BRIDGE_REJECT=m
CONFIG_NF_CONNTRACK_BRIDGE=m
+CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m
CONFIG_BRIDGE_NF_EBTABLES=m
CONFIG_BRIDGE_EBT_BROUTE=m
CONFIG_BRIDGE_EBT_T_FILTER=m
@@ -300,7 +298,6 @@ CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=y
-CONFIG_CDROM_PKTCDVD=m
CONFIG_ATA_OVER_ETH=m
CONFIG_DUMMY_IRQ=m
CONFIG_RAID_ATTRS=m
@@ -420,7 +417,6 @@ CONFIG_XFS_FS=m
CONFIG_OCFS2_FS=m
# CONFIG_OCFS2_DEBUG_MASKLOG is not set
CONFIG_BTRFS_FS=m
-CONFIG_BCACHEFS_FS=m
CONFIG_FANOTIFY=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
CONFIG_AUTOFS_FS=m
@@ -526,7 +522,6 @@ CONFIG_CRYPTO_DH=m
CONFIG_CRYPTO_ECDH=m
CONFIG_CRYPTO_ECDSA=m
CONFIG_CRYPTO_ECRDSA=m
-CONFIG_CRYPTO_CURVE25519=m
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_AES_TI=m
CONFIG_CRYPTO_ANUBIS=m
@@ -571,6 +566,7 @@ CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
# CONFIG_CRYPTO_HW is not set
CONFIG_PRIME_NUMBERS=m
+CONFIG_CRC_BENCHMARK=y
CONFIG_XZ_DEC_TEST=m
CONFIG_GLOB_SELFTEST=m
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
@@ -602,7 +598,6 @@ CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
CONFIG_LINEAR_RANGES_TEST=m
-CONFIG_CRC_BENCHMARK=y
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig
index c34648f299ef..8cc372c4df72 100644
--- a/arch/m68k/configs/sun3_defconfig
+++ b/arch/m68k/configs/sun3_defconfig
@@ -114,6 +114,7 @@ CONFIG_NFT_FIB_NETDEV=m
CONFIG_NFT_REJECT_NETDEV=m
CONFIG_NF_FLOW_TABLE_INET=m
CONFIG_NF_FLOW_TABLE=m
+CONFIG_NETFILTER_XTABLES_LEGACY=y
CONFIG_NETFILTER_XT_SET=m
CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
@@ -195,14 +196,12 @@ CONFIG_IP_NF_MATCH_AH=m
CONFIG_IP_NF_MATCH_ECN=m
CONFIG_IP_NF_MATCH_RPFILTER=m
CONFIG_IP_NF_MATCH_TTL=m
-CONFIG_IP_NF_FILTER=m
CONFIG_IP_NF_TARGET_REJECT=m
CONFIG_IP_NF_TARGET_SYNPROXY=m
CONFIG_IP_NF_NAT=m
CONFIG_IP_NF_TARGET_MASQUERADE=m
CONFIG_IP_NF_TARGET_NETMAP=m
CONFIG_IP_NF_TARGET_REDIRECT=m
-CONFIG_IP_NF_MANGLE=m
CONFIG_IP_NF_TARGET_ECN=m
CONFIG_IP_NF_TARGET_TTL=m
CONFIG_IP_NF_RAW=m
@@ -222,10 +221,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m
CONFIG_IP6_NF_MATCH_RT=m
CONFIG_IP6_NF_MATCH_SRH=m
CONFIG_IP6_NF_TARGET_HL=m
-CONFIG_IP6_NF_FILTER=m
CONFIG_IP6_NF_TARGET_REJECT=m
CONFIG_IP6_NF_TARGET_SYNPROXY=m
-CONFIG_IP6_NF_MANGLE=m
CONFIG_IP6_NF_RAW=m
CONFIG_IP6_NF_NAT=m
CONFIG_IP6_NF_TARGET_MASQUERADE=m
@@ -234,6 +231,7 @@ CONFIG_NF_TABLES_BRIDGE=m
CONFIG_NFT_BRIDGE_META=m
CONFIG_NFT_BRIDGE_REJECT=m
CONFIG_NF_CONNTRACK_BRIDGE=m
+CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m
CONFIG_BRIDGE_NF_EBTABLES=m
CONFIG_BRIDGE_EBT_BROUTE=m
CONFIG_BRIDGE_EBT_T_FILTER=m
@@ -292,7 +290,6 @@ CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=y
-CONFIG_CDROM_PKTCDVD=m
CONFIG_ATA_OVER_ETH=m
CONFIG_DUMMY_IRQ=m
CONFIG_RAID_ATTRS=m
@@ -401,7 +398,6 @@ CONFIG_XFS_FS=m
CONFIG_OCFS2_FS=m
# CONFIG_OCFS2_DEBUG_MASKLOG is not set
CONFIG_BTRFS_FS=m
-CONFIG_BCACHEFS_FS=m
CONFIG_FANOTIFY=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
CONFIG_AUTOFS_FS=m
@@ -507,7 +503,6 @@ CONFIG_CRYPTO_DH=m
CONFIG_CRYPTO_ECDH=m
CONFIG_CRYPTO_ECDSA=m
CONFIG_CRYPTO_ECRDSA=m
-CONFIG_CRYPTO_CURVE25519=m
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_AES_TI=m
CONFIG_CRYPTO_ANUBIS=m
@@ -552,6 +547,7 @@ CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
# CONFIG_CRYPTO_HW is not set
CONFIG_PRIME_NUMBERS=m
+CONFIG_CRC_BENCHMARK=y
CONFIG_XZ_DEC_TEST=m
CONFIG_GLOB_SELFTEST=m
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
@@ -582,7 +578,6 @@ CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
CONFIG_LINEAR_RANGES_TEST=m
-CONFIG_CRC_BENCHMARK=y
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig
index 73810d14660f..f4569f64c6e4 100644
--- a/arch/m68k/configs/sun3x_defconfig
+++ b/arch/m68k/configs/sun3x_defconfig
@@ -115,6 +115,7 @@ CONFIG_NFT_FIB_NETDEV=m
CONFIG_NFT_REJECT_NETDEV=m
CONFIG_NF_FLOW_TABLE_INET=m
CONFIG_NF_FLOW_TABLE=m
+CONFIG_NETFILTER_XTABLES_LEGACY=y
CONFIG_NETFILTER_XT_SET=m
CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
@@ -196,14 +197,12 @@ CONFIG_IP_NF_MATCH_AH=m
CONFIG_IP_NF_MATCH_ECN=m
CONFIG_IP_NF_MATCH_RPFILTER=m
CONFIG_IP_NF_MATCH_TTL=m
-CONFIG_IP_NF_FILTER=m
CONFIG_IP_NF_TARGET_REJECT=m
CONFIG_IP_NF_TARGET_SYNPROXY=m
CONFIG_IP_NF_NAT=m
CONFIG_IP_NF_TARGET_MASQUERADE=m
CONFIG_IP_NF_TARGET_NETMAP=m
CONFIG_IP_NF_TARGET_REDIRECT=m
-CONFIG_IP_NF_MANGLE=m
CONFIG_IP_NF_TARGET_ECN=m
CONFIG_IP_NF_TARGET_TTL=m
CONFIG_IP_NF_RAW=m
@@ -223,10 +222,8 @@ CONFIG_IP6_NF_MATCH_RPFILTER=m
CONFIG_IP6_NF_MATCH_RT=m
CONFIG_IP6_NF_MATCH_SRH=m
CONFIG_IP6_NF_TARGET_HL=m
-CONFIG_IP6_NF_FILTER=m
CONFIG_IP6_NF_TARGET_REJECT=m
CONFIG_IP6_NF_TARGET_SYNPROXY=m
-CONFIG_IP6_NF_MANGLE=m
CONFIG_IP6_NF_RAW=m
CONFIG_IP6_NF_NAT=m
CONFIG_IP6_NF_TARGET_MASQUERADE=m
@@ -235,6 +232,7 @@ CONFIG_NF_TABLES_BRIDGE=m
CONFIG_NFT_BRIDGE_META=m
CONFIG_NFT_BRIDGE_REJECT=m
CONFIG_NF_CONNTRACK_BRIDGE=m
+CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m
CONFIG_BRIDGE_NF_EBTABLES=m
CONFIG_BRIDGE_EBT_BROUTE=m
CONFIG_BRIDGE_EBT_T_FILTER=m
@@ -293,7 +291,6 @@ CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=y
-CONFIG_CDROM_PKTCDVD=m
CONFIG_ATA_OVER_ETH=m
CONFIG_DUMMY_IRQ=m
CONFIG_RAID_ATTRS=m
@@ -401,7 +398,6 @@ CONFIG_XFS_FS=m
CONFIG_OCFS2_FS=m
# CONFIG_OCFS2_DEBUG_MASKLOG is not set
CONFIG_BTRFS_FS=m
-CONFIG_BCACHEFS_FS=m
CONFIG_FANOTIFY=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
CONFIG_AUTOFS_FS=m
@@ -507,7 +503,6 @@ CONFIG_CRYPTO_DH=m
CONFIG_CRYPTO_ECDH=m
CONFIG_CRYPTO_ECDSA=m
CONFIG_CRYPTO_ECRDSA=m
-CONFIG_CRYPTO_CURVE25519=m
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_AES_TI=m
CONFIG_CRYPTO_ANUBIS=m
@@ -552,6 +547,7 @@ CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
# CONFIG_CRYPTO_HW is not set
CONFIG_PRIME_NUMBERS=m
+CONFIG_CRC_BENCHMARK=y
CONFIG_XZ_DEC_TEST=m
CONFIG_GLOB_SELFTEST=m
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
@@ -583,7 +579,6 @@ CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
CONFIG_LINEAR_RANGES_TEST=m
-CONFIG_CRC_BENCHMARK=y
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/include/asm/bitops.h b/arch/m68k/include/asm/bitops.h
index 14c64a6f1217..e9639e48c6c3 100644
--- a/arch/m68k/include/asm/bitops.h
+++ b/arch/m68k/include/asm/bitops.h
@@ -350,12 +350,12 @@ static inline bool xor_unlock_is_negative_byte(unsigned long mask,
#include <asm-generic/bitops/ffz.h>
#else
-static inline int find_first_zero_bit(const unsigned long *vaddr,
- unsigned size)
+static inline unsigned long find_first_zero_bit(const unsigned long *vaddr,
+ unsigned long size)
{
const unsigned long *p = vaddr;
- int res = 32;
- unsigned int words;
+ unsigned long res = 32;
+ unsigned long words;
unsigned long num;
if (!size)
@@ -376,8 +376,9 @@ out:
}
#define find_first_zero_bit find_first_zero_bit
-static inline int find_next_zero_bit(const unsigned long *vaddr, int size,
- int offset)
+static inline unsigned long find_next_zero_bit(const unsigned long *vaddr,
+ unsigned long size,
+ unsigned long offset)
{
const unsigned long *p = vaddr + (offset >> 5);
int bit = offset & 31UL, res;
@@ -406,11 +407,12 @@ static inline int find_next_zero_bit(const unsigned long *vaddr, int size,
}
#define find_next_zero_bit find_next_zero_bit
-static inline int find_first_bit(const unsigned long *vaddr, unsigned size)
+static inline unsigned long find_first_bit(const unsigned long *vaddr,
+ unsigned long size)
{
const unsigned long *p = vaddr;
- int res = 32;
- unsigned int words;
+ unsigned long res = 32;
+ unsigned long words;
unsigned long num;
if (!size)
@@ -431,8 +433,9 @@ out:
}
#define find_first_bit find_first_bit
-static inline int find_next_bit(const unsigned long *vaddr, int size,
- int offset)
+static inline unsigned long find_next_bit(const unsigned long *vaddr,
+ unsigned long size,
+ unsigned long offset)
{
const unsigned long *p = vaddr + (offset >> 5);
int bit = offset & 31UL, res;
@@ -465,7 +468,7 @@ static inline int find_next_bit(const unsigned long *vaddr, int size,
* ffz = Find First Zero in word. Undefined if no zero exists,
* so code should check against ~0UL first..
*/
-static inline unsigned long ffz(unsigned long word)
+static inline unsigned long __attribute_const__ ffz(unsigned long word)
{
int res;
@@ -488,7 +491,7 @@ static inline unsigned long ffz(unsigned long word)
*/
#if (defined(__mcfisaaplus__) || defined(__mcfisac__)) && \
!defined(CONFIG_M68000)
-static inline unsigned long __ffs(unsigned long x)
+static inline __attribute_const__ unsigned long __ffs(unsigned long x)
{
__asm__ __volatile__ ("bitrev %0; ff1 %0"
: "=d" (x)
@@ -496,7 +499,7 @@ static inline unsigned long __ffs(unsigned long x)
return x;
}
-static inline int ffs(int x)
+static inline __attribute_const__ int ffs(int x)
{
if (!x)
return 0;
@@ -518,7 +521,7 @@ static inline int ffs(int x)
* the libc and compiler builtin ffs routines, therefore
* differs in spirit from the above ffz (man ffs).
*/
-static inline int ffs(int x)
+static inline __attribute_const__ int ffs(int x)
{
int cnt;
@@ -528,7 +531,7 @@ static inline int ffs(int x)
return 32 - cnt;
}
-static inline unsigned long __ffs(unsigned long x)
+static inline __attribute_const__ unsigned long __ffs(unsigned long x)
{
return ffs(x) - 1;
}
@@ -536,7 +539,7 @@ static inline unsigned long __ffs(unsigned long x)
/*
* fls: find last bit set.
*/
-static inline int fls(unsigned int x)
+static inline __attribute_const__ int fls(unsigned int x)
{
int cnt;
@@ -546,7 +549,7 @@ static inline int fls(unsigned int x)
return 32 - cnt;
}
-static inline unsigned long __fls(unsigned long x)
+static inline __attribute_const__ unsigned long __fls(unsigned long x)
{
return fls(x) - 1;
}
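
The m68k bitops hunks above widen the local find_*_bit helpers to take and return unsigned long, matching the generic bitops prototypes, and mark the pure bit-scan helpers (ffz, __ffs, ffs, fls, __fls) __attribute_const__ so the compiler may reuse a previous result instead of re-evaluating a call with the same argument. A minimal userspace sketch of what the attribute promises (my_ffz is a hypothetical stand-in, not the kernel helper):

/* Minimal userspace sketch (not kernel code): __attribute__((const)) promises
 * the result depends only on the argument, so the compiler may fold repeated
 * calls with the same value into a single evaluation. */
#include <stdio.h>

__attribute__((const))
static unsigned long my_ffz(unsigned long word)
{
    unsigned long bit = 0;

    /* Find the first zero bit; callers check word != ~0UL first. */
    while (word & 1UL) {
        word >>= 1;
        bit++;
    }
    return bit;
}

int main(void)
{
    unsigned long x = 0x7UL;    /* 0b0111 -> first zero bit is bit 3 */

    printf("ffz(%#lx) = %lu\n", x, my_ffz(x));
    return 0;
}

The attribute is only safe on functions like these whose result depends solely on their arguments, which is exactly what the bit-scan helpers guarantee.
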
diff --git a/arch/m68k/kernel/asm-offsets.c b/arch/m68k/kernel/asm-offsets.c
index 906d73230537..67a1990f9d74 100644
--- a/arch/m68k/kernel/asm-offsets.c
+++ b/arch/m68k/kernel/asm-offsets.c
@@ -9,6 +9,7 @@
* #defines from the assembly-language output.
*/
+#define COMPILE_OFFSETS
#define ASM_OFFSETS_C
#include <linux/stddef.h>
diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
index fda7eac23f87..f5a07a70e938 100644
--- a/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@ -141,7 +141,7 @@ asmlinkage int m68k_clone3(struct pt_regs *regs)
int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
- unsigned long clone_flags = args->flags;
+ u64 clone_flags = args->flags;
unsigned long usp = args->stack;
unsigned long tls = args->tls;
struct fork_frame {
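
copy_thread() now receives the clone flags as u64 rather than unsigned long; the same change recurs for microblaze, MIPS, nios2, openrisc and parisc further down. clone3() carries 64-bit flags, and some of them sit above bit 31 (for example CLONE_INTO_CGROUP), so a 32-bit unsigned long would silently drop them. A hedged userspace sketch of that truncation (the constant value is taken from include/uapi/linux/sched.h):

/* Userspace sketch: a 32-bit "unsigned long" keeps only the low half of the
 * 64-bit clone3 flags, losing e.g. CLONE_INTO_CGROUP entirely. */
#include <stdio.h>
#include <stdint.h>

#define CLONE_INTO_CGROUP 0x200000000ULL    /* bit 33, from uapi sched.h */

int main(void)
{
    uint64_t flags = CLONE_INTO_CGROUP;
    uint32_t as_32bit_long = (uint32_t)flags;    /* what a 32-bit long keeps */

    printf("full=%#llx truncated=%#x\n",
           (unsigned long long)flags, as_32bit_long);    /* truncated == 0 */
    return 0;
}
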
diff --git a/arch/microblaze/Kconfig.platform b/arch/microblaze/Kconfig.platform
index 7795f90dad86..9cf9007ed69a 100644
--- a/arch/microblaze/Kconfig.platform
+++ b/arch/microblaze/Kconfig.platform
@@ -8,10 +8,10 @@
menu "Platform options"
config OPT_LIB_FUNCTION
- bool "Optimalized lib function"
+ bool "Optimized lib function"
default y
help
- Allows turn on optimalized library function (memcpy and memmove).
+ Turns on optimized library functions (memcpy and memmove).
They are optimized by using word alignment. This will work
fine if both source and destination are aligned on the same
boundary. However, if they are aligned on different boundaries
@@ -19,13 +19,13 @@ config OPT_LIB_FUNCTION
on MicroBlaze systems without a barrel shifter.
config OPT_LIB_ASM
- bool "Optimalized lib function ASM"
+ bool "Optimized lib function ASM"
depends on OPT_LIB_FUNCTION && (XILINX_MICROBLAZE0_USE_BARREL = 1)
depends on CPU_BIG_ENDIAN
default n
help
- Allows turn on optimalized library function (memcpy and memmove).
- Function are written in asm code.
+ Turns on optimized library functions (memcpy and memmove).
+ They are written in assembly.
# Definitions for MICROBLAZE0
comment "Definitions for MICROBLAZE0"
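
Beyond the spelling fix, the help text above describes memcpy/memmove variants that copy a word at a time when source and destination share the same alignment, and that need a barrel shifter when they do not. A rough userspace sketch of the co-aligned fast path (word_memcpy is illustrative only, not the microblaze implementation):

/* Illustrative sketch of word-aligned copying: use 32-bit loads/stores when
 * src and dst are co-aligned, otherwise fall back to plain byte copies. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static void *word_memcpy(void *dst, const void *src, size_t n)
{
    unsigned char *d = dst;
    const unsigned char *s = src;

    /* Only the co-aligned case can use whole-word accesses. */
    if (((uintptr_t)d & 3) == ((uintptr_t)s & 3)) {
        while (((uintptr_t)d & 3) && n) {    /* step up to a word boundary */
            *d++ = *s++;
            n--;
        }
        while (n >= 4) {    /* both pointers are word-aligned here */
            *(uint32_t *)d = *(const uint32_t *)s;
            d += 4;
            s += 4;
            n -= 4;
        }
    }
    while (n--)    /* tail bytes, or the misaligned fallback */
        *d++ = *s++;
    return dst;
}

int main(void)
{
    char src[32] = "word-aligned copy example";
    char dst[32];

    word_memcpy(dst, src, sizeof(src));
    puts(dst);
    return 0;
}
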
diff --git a/arch/microblaze/include/asm/asm-compat.h b/arch/microblaze/include/asm/asm-compat.h
index c05259ce2d2c..9f0461476231 100644
--- a/arch/microblaze/include/asm/asm-compat.h
+++ b/arch/microblaze/include/asm/asm-compat.h
@@ -4,7 +4,7 @@
#include <asm/types.h>
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
# define stringify_in_c(...) __VA_ARGS__
# define ASM_CONST(x) x
#else
diff --git a/arch/microblaze/include/asm/current.h b/arch/microblaze/include/asm/current.h
index a4bb45be30e6..099e69f32bf9 100644
--- a/arch/microblaze/include/asm/current.h
+++ b/arch/microblaze/include/asm/current.h
@@ -14,13 +14,13 @@
* but check asm/microblaze/kernel/entry.S to be sure.
*/
#define CURRENT_TASK r31
-# ifndef __ASSEMBLY__
+# ifndef __ASSEMBLER__
/*
* Dedicate r31 to keeping the current task pointer
*/
register struct task_struct *current asm("r31");
# define get_current() current
-# endif /* __ASSEMBLY__ */
+# endif /* __ASSEMBLER__ */
#endif /* _ASM_MICROBLAZE_CURRENT_H */
diff --git a/arch/microblaze/include/asm/entry.h b/arch/microblaze/include/asm/entry.h
index 6c42bed41166..9efadf12397c 100644
--- a/arch/microblaze/include/asm/entry.h
+++ b/arch/microblaze/include/asm/entry.h
@@ -21,7 +21,7 @@
#define PER_CPU(var) var
-# ifndef __ASSEMBLY__
+# ifndef __ASSEMBLER__
DECLARE_PER_CPU(unsigned int, KSP); /* Saved kernel stack pointer */
DECLARE_PER_CPU(unsigned int, KM); /* Kernel/user mode */
DECLARE_PER_CPU(unsigned int, ENTRY_SP); /* Saved SP on kernel entry */
@@ -29,6 +29,6 @@ DECLARE_PER_CPU(unsigned int, R11_SAVE); /* Temp variable for entry */
DECLARE_PER_CPU(unsigned int, CURRENT_SAVE); /* Saved current pointer */
extern asmlinkage void do_notify_resume(struct pt_regs *regs, int in_syscall);
-# endif /* __ASSEMBLY__ */
+# endif /* __ASSEMBLER__ */
#endif /* _ASM_MICROBLAZE_ENTRY_H */
diff --git a/arch/microblaze/include/asm/exceptions.h b/arch/microblaze/include/asm/exceptions.h
index 967f175173e1..c4591e4f7175 100644
--- a/arch/microblaze/include/asm/exceptions.h
+++ b/arch/microblaze/include/asm/exceptions.h
@@ -11,7 +11,7 @@
#define _ASM_MICROBLAZE_EXCEPTIONS_H
#ifdef __KERNEL__
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/* Macros to enable and disable HW exceptions in the MSR */
/* Define MSR enable bit for HW exceptions */
@@ -64,6 +64,6 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig);
void die(const char *str, struct pt_regs *fp, long err);
void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr);
-#endif /*__ASSEMBLY__ */
+#endif /*__ASSEMBLER__ */
#endif /* __KERNEL__ */
#endif /* _ASM_MICROBLAZE_EXCEPTIONS_H */
diff --git a/arch/microblaze/include/asm/fixmap.h b/arch/microblaze/include/asm/fixmap.h
index e6e9288bff76..f9797849e4d4 100644
--- a/arch/microblaze/include/asm/fixmap.h
+++ b/arch/microblaze/include/asm/fixmap.h
@@ -15,7 +15,7 @@
#ifndef _ASM_FIXMAP_H
#define _ASM_FIXMAP_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/kernel.h>
#include <asm/page.h>
#ifdef CONFIG_HIGHMEM
@@ -62,5 +62,5 @@ extern void __set_fixmap(enum fixed_addresses idx,
#include <asm-generic/fixmap.h>
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif
diff --git a/arch/microblaze/include/asm/ftrace.h b/arch/microblaze/include/asm/ftrace.h
index 4ca38b92a3a2..27c1bafb669c 100644
--- a/arch/microblaze/include/asm/ftrace.h
+++ b/arch/microblaze/include/asm/ftrace.h
@@ -7,7 +7,7 @@
#define MCOUNT_ADDR ((unsigned long)(_mcount))
#define MCOUNT_INSN_SIZE 8 /* sizeof mcount call */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
extern void _mcount(void);
extern void ftrace_call_graph(void);
void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr);
diff --git a/arch/microblaze/include/asm/kgdb.h b/arch/microblaze/include/asm/kgdb.h
index 8dc5ebb07fd5..321c3c8bfcf2 100644
--- a/arch/microblaze/include/asm/kgdb.h
+++ b/arch/microblaze/include/asm/kgdb.h
@@ -3,7 +3,7 @@
#ifndef __MICROBLAZE_KGDB_H__
#define __MICROBLAZE_KGDB_H__
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define CACHE_FLUSH_IS_SAFE 1
#define BUFMAX 2048
@@ -27,6 +27,6 @@ static inline void arch_kgdb_breakpoint(void)
struct pt_regs;
asmlinkage void microblaze_kgdb_break(struct pt_regs *regs);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __MICROBLAZE_KGDB_H__ */
#endif /* __KERNEL__ */
diff --git a/arch/microblaze/include/asm/mmu.h b/arch/microblaze/include/asm/mmu.h
index b928a87c0076..7262dc4da338 100644
--- a/arch/microblaze/include/asm/mmu.h
+++ b/arch/microblaze/include/asm/mmu.h
@@ -9,7 +9,7 @@
#define _ASM_MICROBLAZE_MMU_H
# ifdef __KERNEL__
-# ifndef __ASSEMBLY__
+# ifndef __ASSEMBLER__
/* Default "unsigned long" context */
typedef unsigned long mm_context_t;
@@ -56,7 +56,7 @@ extern void _tlbia(void); /* invalidate all TLB entries */
* mapping has to increase tlb_skip size.
*/
extern u32 tlb_skip;
-# endif /* __ASSEMBLY__ */
+# endif /* __ASSEMBLER__ */
/*
* The MicroBlaze processor has a TLB architecture identical to PPC-40x. The
diff --git a/arch/microblaze/include/asm/page.h b/arch/microblaze/include/asm/page.h
index 90fc9c81debd..90ac9f34b4b4 100644
--- a/arch/microblaze/include/asm/page.h
+++ b/arch/microblaze/include/asm/page.h
@@ -25,7 +25,7 @@
#define PTE_SHIFT (PAGE_SHIFT - 2) /* 1024 ptes per page */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* PAGE_OFFSET -- the first address of the first page of memory. With MMU
@@ -100,7 +100,7 @@ extern int page_is_ram(unsigned long pfn);
# define page_to_virt(page) __va(page_to_pfn(page) << PAGE_SHIFT)
# define ARCH_PFN_OFFSET (memory_start >> PAGE_SHIFT)
-# endif /* __ASSEMBLY__ */
+# endif /* __ASSEMBLER__ */
/* Convert between virtual and physical address for MMU. */
/* Handle MicroBlaze processor with virtual memory. */
@@ -113,7 +113,7 @@ extern int page_is_ram(unsigned long pfn);
#define tovirt(rd, rs) \
addik rd, rs, (CONFIG_KERNEL_START - CONFIG_KERNEL_BASE_ADDR)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
# define __pa(x) __virt_to_phys((unsigned long)(x))
# define __va(x) ((void *)__phys_to_virt((unsigned long)(x)))
@@ -130,7 +130,7 @@ static inline const void *pfn_to_virt(unsigned long pfn)
#define virt_addr_valid(vaddr) (pfn_valid(virt_to_pfn(vaddr)))
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#define TOPHYS(addr) __virt_to_phys(addr)
diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h
index bae1abfa6f6b..a60e8d895102 100644
--- a/arch/microblaze/include/asm/pgtable.h
+++ b/arch/microblaze/include/asm/pgtable.h
@@ -10,14 +10,14 @@
#include <asm/setup.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
extern int mem_init_done;
#endif
#include <asm-generic/pgtable-nopmd.h>
#ifdef __KERNEL__
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/sched.h>
#include <linux/threads.h>
@@ -39,7 +39,7 @@ extern pte_t *va_to_pte(unsigned long address);
#define VMALLOC_START (CONFIG_KERNEL_START + CONFIG_LOWMEM_SIZE)
#define VMALLOC_END ioremap_bot
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
/*
* Macro to mark a page protection value as "uncacheable".
@@ -208,7 +208,7 @@ extern pte_t *va_to_pte(unsigned long address);
* Also, write permissions imply read permissions.
*/
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* ZERO_PAGE is a global shared page that is always zero: used
* for zero-mapped memory areas etc..
@@ -216,7 +216,7 @@ extern pte_t *va_to_pte(unsigned long address);
extern unsigned long empty_zero_page[1024];
#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#define pte_none(pte) ((pte_val(pte) & ~_PTE_NONE_MASK) == 0)
#define pte_present(pte) (pte_val(pte) & _PAGE_PRESENT)
@@ -237,7 +237,7 @@ extern unsigned long empty_zero_page[1024];
#define pfn_pte(pfn, prot) \
__pte(((pte_basic_t)(pfn) << PFN_PTE_SHIFT) | pgprot_val(prot))
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* The following only work if pte_present() is true.
* Undefined behaviour if not..
@@ -436,13 +436,13 @@ extern int mem_init_done;
asmlinkage void __init mmu_init(void);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __KERNEL__ */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
extern unsigned long ioremap_bot, ioremap_base;
void setup_memory(void);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_MICROBLAZE_PGTABLE_H */
diff --git a/arch/microblaze/include/asm/processor.h b/arch/microblaze/include/asm/processor.h
index 4e193c7550df..d59bdfffca7c 100644
--- a/arch/microblaze/include/asm/processor.h
+++ b/arch/microblaze/include/asm/processor.h
@@ -14,7 +14,7 @@
#include <asm/entry.h>
#include <asm/current.h>
-# ifndef __ASSEMBLY__
+# ifndef __ASSEMBLER__
/* from kernel/cpu/mb.c */
extern const struct seq_operations cpuinfo_op;
@@ -29,7 +29,7 @@ void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long usp);
extern void ret_from_fork(void);
extern void ret_from_kernel_thread(void);
-# endif /* __ASSEMBLY__ */
+# endif /* __ASSEMBLER__ */
/*
* This is used to define STACK_TOP, and with MMU it must be below
@@ -45,7 +45,7 @@ extern void ret_from_kernel_thread(void);
# define THREAD_KSP 0
-# ifndef __ASSEMBLY__
+# ifndef __ASSEMBLER__
/* If you change this, you must change the associated assembly-languages
* constants defined below, THREAD_*.
@@ -88,5 +88,5 @@ unsigned long __get_wchan(struct task_struct *p);
extern struct dentry *of_debugfs_root;
#endif
-# endif /* __ASSEMBLY__ */
+# endif /* __ASSEMBLER__ */
#endif /* _ASM_MICROBLAZE_PROCESSOR_H */
diff --git a/arch/microblaze/include/asm/ptrace.h b/arch/microblaze/include/asm/ptrace.h
index bfcb89df5e26..17982292a64f 100644
--- a/arch/microblaze/include/asm/ptrace.h
+++ b/arch/microblaze/include/asm/ptrace.h
@@ -7,7 +7,7 @@
#include <uapi/asm/ptrace.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define kernel_mode(regs) ((regs)->pt_mode)
#define user_mode(regs) (!kernel_mode(regs))
@@ -20,5 +20,5 @@ static inline long regs_return_value(struct pt_regs *regs)
return regs->r3;
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_MICROBLAZE_PTRACE_H */
diff --git a/arch/microblaze/include/asm/sections.h b/arch/microblaze/include/asm/sections.h
index a9311ad84a67..f5008f5e7a5c 100644
--- a/arch/microblaze/include/asm/sections.h
+++ b/arch/microblaze/include/asm/sections.h
@@ -10,11 +10,11 @@
#include <asm-generic/sections.h>
-# ifndef __ASSEMBLY__
+# ifndef __ASSEMBLER__
extern char _ssbss[], _esbss[];
extern unsigned long __ivt_start[], __ivt_end[];
extern u32 _fdt_start[], _fdt_end[];
-# endif /* !__ASSEMBLY__ */
+# endif /* !__ASSEMBLER__ */
#endif /* _ASM_MICROBLAZE_SECTIONS_H */
diff --git a/arch/microblaze/include/asm/setup.h b/arch/microblaze/include/asm/setup.h
index bf2600f75959..837ed0bbae4b 100644
--- a/arch/microblaze/include/asm/setup.h
+++ b/arch/microblaze/include/asm/setup.h
@@ -9,7 +9,7 @@
#include <uapi/asm/setup.h>
-# ifndef __ASSEMBLY__
+# ifndef __ASSEMBLER__
extern char cmd_line[COMMAND_LINE_SIZE];
extern char *klimit;
@@ -25,5 +25,5 @@ void machine_shutdown(void);
void machine_halt(void);
void machine_power_off(void);
-# endif /* __ASSEMBLY__ */
+# endif /* __ASSEMBLER__ */
#endif /* _ASM_MICROBLAZE_SETUP_H */
diff --git a/arch/microblaze/include/asm/thread_info.h b/arch/microblaze/include/asm/thread_info.h
index a0ddd2a36fb9..0153f7c2717c 100644
--- a/arch/microblaze/include/asm/thread_info.h
+++ b/arch/microblaze/include/asm/thread_info.h
@@ -13,7 +13,7 @@
#define THREAD_SIZE (1 << THREAD_SHIFT)
#define THREAD_SIZE_ORDER 1
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
# include <linux/types.h>
# include <asm/processor.h>
@@ -86,7 +86,7 @@ static inline struct thread_info *current_thread_info(void)
}
/* thread information allocation */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
/*
* thread information flags
diff --git a/arch/microblaze/include/asm/unistd.h b/arch/microblaze/include/asm/unistd.h
index cfe3f888b432..fedda9908aa9 100644
--- a/arch/microblaze/include/asm/unistd.h
+++ b/arch/microblaze/include/asm/unistd.h
@@ -8,7 +8,7 @@
#include <uapi/asm/unistd.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/* #define __ARCH_WANT_OLD_READDIR */
/* #define __ARCH_WANT_OLD_STAT */
@@ -33,6 +33,6 @@
#define __ARCH_WANT_SYS_VFORK
#define __ARCH_WANT_SYS_FORK
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_MICROBLAZE_UNISTD_H */
diff --git a/arch/microblaze/include/asm/xilinx_mb_manager.h b/arch/microblaze/include/asm/xilinx_mb_manager.h
index 7b6995722b0c..121a3224882b 100644
--- a/arch/microblaze/include/asm/xilinx_mb_manager.h
+++ b/arch/microblaze/include/asm/xilinx_mb_manager.h
@@ -5,7 +5,7 @@
#ifndef _XILINX_MB_MANAGER_H
#define _XILINX_MB_MANAGER_H
-# ifndef __ASSEMBLY__
+# ifndef __ASSEMBLER__
#include <linux/of_address.h>
@@ -21,7 +21,7 @@ void xmb_manager_register(uintptr_t phys_baseaddr, u32 cr_val,
void *priv, void (*reset_callback)(void *data));
asmlinkage void xmb_inject_err(void);
-# endif /* __ASSEMBLY__ */
+# endif /* __ASSEMBLER__ */
/* Error injection offset */
#define XMB_INJECT_ERR_OFFSET 0x200
diff --git a/arch/microblaze/include/uapi/asm/ptrace.h b/arch/microblaze/include/uapi/asm/ptrace.h
index 46dd94cb7802..8039957a1a9c 100644
--- a/arch/microblaze/include/uapi/asm/ptrace.h
+++ b/arch/microblaze/include/uapi/asm/ptrace.h
@@ -10,7 +10,7 @@
#ifndef _UAPI_ASM_MICROBLAZE_PTRACE_H
#define _UAPI_ASM_MICROBLAZE_PTRACE_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
typedef unsigned long microblaze_reg_t;
@@ -68,6 +68,6 @@ struct pt_regs {
#endif /* __KERNEL */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _UAPI_ASM_MICROBLAZE_PTRACE_H */
diff --git a/arch/microblaze/kernel/asm-offsets.c b/arch/microblaze/kernel/asm-offsets.c
index 104c3ac5f30c..b4b67d58e7f6 100644
--- a/arch/microblaze/kernel/asm-offsets.c
+++ b/arch/microblaze/kernel/asm-offsets.c
@@ -7,6 +7,7 @@
* License. See the file "COPYING" in the main directory of this archive
* for more details.
*/
+#define COMPILE_OFFSETS
#include <linux/init.h>
#include <linux/stddef.h>
diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c
index 56342e11442d..6cbf642d7b80 100644
--- a/arch/microblaze/kernel/process.c
+++ b/arch/microblaze/kernel/process.c
@@ -54,7 +54,7 @@ void flush_thread(void)
int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
- unsigned long clone_flags = args->flags;
+ u64 clone_flags = args->flags;
unsigned long usp = args->stack;
unsigned long tls = args->tls;
struct pt_regs *childregs = task_pt_regs(p);
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index caf508f6e9ec..447b2fcaa316 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -2223,7 +2223,7 @@ config MIPS_MT_SMP
select SMP
select SMP_UP
select SYS_SUPPORTS_SMP
- select SYS_SUPPORTS_SCHED_SMT
+ select ARCH_SUPPORTS_SCHED_SMT
select MIPS_PERF_SHARED_TC_COUNTERS
help
This is a kernel model which is known as SMVP. This is supported
@@ -2235,18 +2235,6 @@ config MIPS_MT_SMP
config MIPS_MT
bool
-config SCHED_SMT
- bool "SMT (multithreading) scheduler support"
- depends on SYS_SUPPORTS_SCHED_SMT
- default n
- help
- SMT scheduler support improves the CPU scheduler's decision making
- when dealing with MIPS MT enabled cores at a cost of slightly
- increased overhead in some places. If unsure say N here.
-
-config SYS_SUPPORTS_SCHED_SMT
- bool
-
config SYS_SUPPORTS_MULTITHREADING
bool
@@ -2318,7 +2306,7 @@ config MIPS_CPS
select HOTPLUG_CORE_SYNC_DEAD if HOTPLUG_CPU
select SYNC_R4K if (CEVT_R4K || CSRC_R4K)
select SYS_SUPPORTS_HOTPLUG_CPU
- select SYS_SUPPORTS_SCHED_SMT if CPU_MIPSR6
+ select ARCH_SUPPORTS_SCHED_SMT if CPU_MIPSR6
select SYS_SUPPORTS_SMP
select WEAK_ORDERING
select GENERIC_IRQ_MIGRATION if HOTPLUG_CPU
diff --git a/arch/mips/boot/dts/lantiq/danube_easy50712.dts b/arch/mips/boot/dts/lantiq/danube_easy50712.dts
index 1ce20b7d05cb..c4d7aa5753b0 100644
--- a/arch/mips/boot/dts/lantiq/danube_easy50712.dts
+++ b/arch/mips/boot/dts/lantiq/danube_easy50712.dts
@@ -82,13 +82,16 @@
};
};
- etop@e180000 {
+ ethernet@e180000 {
compatible = "lantiq,etop-xway";
reg = <0xe180000 0x40000>;
interrupt-parent = <&icu0>;
interrupts = <73 78>;
+ interrupt-names = "tx", "rx";
phy-mode = "rmii";
mac-address = [ 00 11 22 33 44 55 ];
+ lantiq,rx-burst-length = <4>;
+ lantiq,tx-burst-length = <4>;
};
stp0: stp@e100bb0 {
diff --git a/arch/mips/cavium-octeon/Makefile b/arch/mips/cavium-octeon/Makefile
index 2a5926578841..ab84ede0cbe0 100644
--- a/arch/mips/cavium-octeon/Makefile
+++ b/arch/mips/cavium-octeon/Makefile
@@ -11,9 +11,9 @@
obj-y := cpu.o setup.o octeon-platform.o octeon-irq.o csrc-octeon.o
obj-y += dma-octeon.o
+obj-y += octeon-crypto.o
obj-y += octeon-memcpy.o
obj-y += executive/
-obj-y += crypto/
obj-$(CONFIG_MTD) += flash_setup.o
obj-$(CONFIG_SMP) += smp.o
diff --git a/arch/mips/cavium-octeon/crypto/Makefile b/arch/mips/cavium-octeon/crypto/Makefile
deleted file mode 100644
index 83f2f5dd93cc..000000000000
--- a/arch/mips/cavium-octeon/crypto/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# OCTEON-specific crypto modules.
-#
-
-obj-y += octeon-crypto.o
-
-obj-$(CONFIG_CRYPTO_MD5_OCTEON) += octeon-md5.o
diff --git a/arch/mips/cavium-octeon/crypto/octeon-md5.c b/arch/mips/cavium-octeon/crypto/octeon-md5.c
deleted file mode 100644
index a8ce831e2ceb..000000000000
--- a/arch/mips/cavium-octeon/crypto/octeon-md5.c
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- * Cryptographic API.
- *
- * MD5 Message Digest Algorithm (RFC1321).
- *
- * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>.
- *
- * Based on crypto/md5.c, which is:
- *
- * Derived from cryptoapi implementation, originally based on the
- * public domain implementation written by Colin Plumb in 1993.
- *
- * Copyright (c) Cryptoapi developers.
- * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- */
-
-#include <asm/octeon/crypto.h>
-#include <asm/octeon/octeon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/md5.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/unaligned.h>
-
-struct octeon_md5_state {
- __le32 hash[MD5_HASH_WORDS];
- u64 byte_count;
-};
-
-/*
- * We pass everything as 64-bit. OCTEON can handle misaligned data.
- */
-
-static void octeon_md5_store_hash(struct octeon_md5_state *ctx)
-{
- u64 *hash = (u64 *)ctx->hash;
-
- write_octeon_64bit_hash_dword(hash[0], 0);
- write_octeon_64bit_hash_dword(hash[1], 1);
-}
-
-static void octeon_md5_read_hash(struct octeon_md5_state *ctx)
-{
- u64 *hash = (u64 *)ctx->hash;
-
- hash[0] = read_octeon_64bit_hash_dword(0);
- hash[1] = read_octeon_64bit_hash_dword(1);
-}
-
-static void octeon_md5_transform(const void *_block)
-{
- const u64 *block = _block;
-
- write_octeon_64bit_block_dword(block[0], 0);
- write_octeon_64bit_block_dword(block[1], 1);
- write_octeon_64bit_block_dword(block[2], 2);
- write_octeon_64bit_block_dword(block[3], 3);
- write_octeon_64bit_block_dword(block[4], 4);
- write_octeon_64bit_block_dword(block[5], 5);
- write_octeon_64bit_block_dword(block[6], 6);
- octeon_md5_start(block[7]);
-}
-
-static int octeon_md5_init(struct shash_desc *desc)
-{
- struct octeon_md5_state *mctx = shash_desc_ctx(desc);
-
- mctx->hash[0] = cpu_to_le32(MD5_H0);
- mctx->hash[1] = cpu_to_le32(MD5_H1);
- mctx->hash[2] = cpu_to_le32(MD5_H2);
- mctx->hash[3] = cpu_to_le32(MD5_H3);
- mctx->byte_count = 0;
-
- return 0;
-}
-
-static int octeon_md5_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- struct octeon_md5_state *mctx = shash_desc_ctx(desc);
- struct octeon_cop2_state state;
- unsigned long flags;
-
- mctx->byte_count += len;
- flags = octeon_crypto_enable(&state);
- octeon_md5_store_hash(mctx);
-
- do {
- octeon_md5_transform(data);
- data += MD5_HMAC_BLOCK_SIZE;
- len -= MD5_HMAC_BLOCK_SIZE;
- } while (len >= MD5_HMAC_BLOCK_SIZE);
-
- octeon_md5_read_hash(mctx);
- octeon_crypto_disable(&state, flags);
- mctx->byte_count -= len;
- return len;
-}
-
-static int octeon_md5_finup(struct shash_desc *desc, const u8 *src,
- unsigned int offset, u8 *out)
-{
- struct octeon_md5_state *mctx = shash_desc_ctx(desc);
- int padding = 56 - (offset + 1);
- struct octeon_cop2_state state;
- u32 block[MD5_BLOCK_WORDS];
- unsigned long flags;
- char *p;
-
- p = memcpy(block, src, offset);
- p += offset;
- *p++ = 0x80;
-
- flags = octeon_crypto_enable(&state);
- octeon_md5_store_hash(mctx);
-
- if (padding < 0) {
- memset(p, 0x00, padding + sizeof(u64));
- octeon_md5_transform(block);
- p = (char *)block;
- padding = 56;
- }
-
- memset(p, 0, padding);
- mctx->byte_count += offset;
- block[14] = mctx->byte_count << 3;
- block[15] = mctx->byte_count >> 29;
- cpu_to_le32_array(block + 14, 2);
- octeon_md5_transform(block);
-
- octeon_md5_read_hash(mctx);
- octeon_crypto_disable(&state, flags);
-
- memzero_explicit(block, sizeof(block));
- memcpy(out, mctx->hash, sizeof(mctx->hash));
-
- return 0;
-}
-
-static int octeon_md5_export(struct shash_desc *desc, void *out)
-{
- struct octeon_md5_state *ctx = shash_desc_ctx(desc);
- union {
- u8 *u8;
- u32 *u32;
- u64 *u64;
- } p = { .u8 = out };
- int i;
-
- for (i = 0; i < MD5_HASH_WORDS; i++)
- put_unaligned(le32_to_cpu(ctx->hash[i]), p.u32++);
- put_unaligned(ctx->byte_count, p.u64);
- return 0;
-}
-
-static int octeon_md5_import(struct shash_desc *desc, const void *in)
-{
- struct octeon_md5_state *ctx = shash_desc_ctx(desc);
- union {
- const u8 *u8;
- const u32 *u32;
- const u64 *u64;
- } p = { .u8 = in };
- int i;
-
- for (i = 0; i < MD5_HASH_WORDS; i++)
- ctx->hash[i] = cpu_to_le32(get_unaligned(p.u32++));
- ctx->byte_count = get_unaligned(p.u64);
- return 0;
-}
-
-static struct shash_alg alg = {
- .digestsize = MD5_DIGEST_SIZE,
- .init = octeon_md5_init,
- .update = octeon_md5_update,
- .finup = octeon_md5_finup,
- .export = octeon_md5_export,
- .import = octeon_md5_import,
- .statesize = MD5_STATE_SIZE,
- .descsize = sizeof(struct octeon_md5_state),
- .base = {
- .cra_name = "md5",
- .cra_driver_name= "octeon-md5",
- .cra_priority = OCTEON_CR_OPCODE_PRIORITY,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = MD5_HMAC_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static int __init md5_mod_init(void)
-{
- if (!octeon_has_crypto())
- return -ENOTSUPP;
- return crypto_register_shash(&alg);
-}
-
-static void __exit md5_mod_fini(void)
-{
- crypto_unregister_shash(&alg);
-}
-
-module_init(md5_mod_init);
-module_exit(md5_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("MD5 Message Digest Algorithm (OCTEON)");
-MODULE_AUTHOR("Aaro Koskinen <aaro.koskinen@iki.fi>");
diff --git a/arch/mips/cavium-octeon/crypto/octeon-crypto.c b/arch/mips/cavium-octeon/octeon-crypto.c
index 0ff8559391f5..0ff8559391f5 100644
--- a/arch/mips/cavium-octeon/crypto/octeon-crypto.c
+++ b/arch/mips/cavium-octeon/octeon-crypto.c
diff --git a/arch/mips/configs/cavium_octeon_defconfig b/arch/mips/configs/cavium_octeon_defconfig
index 3f50e1d78894..68c363366bce 100644
--- a/arch/mips/configs/cavium_octeon_defconfig
+++ b/arch/mips/configs/cavium_octeon_defconfig
@@ -155,7 +155,6 @@ CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
CONFIG_CRYPTO_CBC=y
CONFIG_CRYPTO_HMAC=y
-CONFIG_CRYPTO_MD5_OCTEON=y
CONFIG_CRYPTO_DES=y
CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
CONFIG_DEBUG_FS=y
diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig
index e4bcdb64df6c..2707ab134639 100644
--- a/arch/mips/configs/mtx1_defconfig
+++ b/arch/mips/configs/mtx1_defconfig
@@ -273,6 +273,7 @@ CONFIG_DM9102=m
CONFIG_ULI526X=m
CONFIG_PCMCIA_XIRCOM=m
CONFIG_DL2K=m
+CONFIG_SUNDANCE=m
CONFIG_PCMCIA_FMVJ18X=m
CONFIG_E100=m
CONFIG_E1000=m
diff --git a/arch/mips/crypto/Kconfig b/arch/mips/crypto/Kconfig
index 7b91f4ec65bf..6a5bd5074867 100644
--- a/arch/mips/crypto/Kconfig
+++ b/arch/mips/crypto/Kconfig
@@ -2,14 +2,4 @@
menu "Accelerated Cryptographic Algorithms for CPU (mips)"
-config CRYPTO_MD5_OCTEON
- tristate "Digests: MD5 (OCTEON)"
- depends on CPU_CAVIUM_OCTEON
- select CRYPTO_MD5
- select CRYPTO_HASH
- help
- MD5 message digest algorithm (RFC1321)
-
- Architecture: mips OCTEON using crypto instructions, when available
-
endmenu
diff --git a/arch/mips/include/asm/bitops.h b/arch/mips/include/asm/bitops.h
index 89f73d1a4ea4..42f88452c920 100644
--- a/arch/mips/include/asm/bitops.h
+++ b/arch/mips/include/asm/bitops.h
@@ -327,7 +327,7 @@ static inline void __clear_bit_unlock(unsigned long nr, volatile unsigned long *
* Return the bit position (0..63) of the most significant 1 bit in a word
* Returns -1 if no 1 bit exists
*/
-static __always_inline unsigned long __fls(unsigned long word)
+static __always_inline __attribute_const__ unsigned long __fls(unsigned long word)
{
int num;
@@ -393,7 +393,7 @@ static __always_inline unsigned long __fls(unsigned long word)
* Returns 0..SZLONG-1
* Undefined if no bit exists, so code should check against 0 first.
*/
-static __always_inline unsigned long __ffs(unsigned long word)
+static __always_inline __attribute_const__ unsigned long __ffs(unsigned long word)
{
return __fls(word & -word);
}
@@ -405,7 +405,7 @@ static __always_inline unsigned long __ffs(unsigned long word)
* This is defined the same way as ffs.
* Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32.
*/
-static inline int fls(unsigned int x)
+static inline __attribute_const__ int fls(unsigned int x)
{
int r;
@@ -458,7 +458,7 @@ static inline int fls(unsigned int x)
* the libc and compiler builtin ffs routines, therefore
* differs in spirit from the below ffz (man ffs).
*/
-static inline int ffs(int word)
+static inline __attribute_const__ int ffs(int word)
{
if (!word)
return 0;
diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c
index 1e29efcba46e..5debd9a3854a 100644
--- a/arch/mips/kernel/asm-offsets.c
+++ b/arch/mips/kernel/asm-offsets.c
@@ -9,6 +9,8 @@
* Kevin Kissell, kevink@mips.com and Carsten Langgaard, carstenl@mips.com
* Copyright (C) 2000 MIPS Technologies, Inc.
*/
+#define COMPILE_OFFSETS
+
#include <linux/compat.h>
#include <linux/types.h>
#include <linux/sched.h>
diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index 02aa6a04a21d..29191fa1801e 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -107,7 +107,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
*/
int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
- unsigned long clone_flags = args->flags;
+ u64 clone_flags = args->flags;
unsigned long usp = args->stack;
unsigned long tls = args->tls;
struct thread_info *ti = task_thread_info(p);
diff --git a/arch/mips/lantiq/xway/sysctrl.c b/arch/mips/lantiq/xway/sysctrl.c
index 5a75283d17f1..6031a0272d87 100644
--- a/arch/mips/lantiq/xway/sysctrl.c
+++ b/arch/mips/lantiq/xway/sysctrl.c
@@ -497,7 +497,7 @@ void __init ltq_soc_init(void)
ifccr = CGU_IFCCR_VR9;
pcicr = CGU_PCICR_VR9;
} else {
- clkdev_add_pmu("1e180000.etop", NULL, 1, 0, PMU_PPE);
+ clkdev_add_pmu("1e180000.ethernet", NULL, 1, 0, PMU_PPE);
}
if (!of_machine_is_compatible("lantiq,ase"))
@@ -531,9 +531,9 @@ void __init ltq_soc_init(void)
CLOCK_133M, CLOCK_133M);
clkdev_add_pmu("1e101000.usb", "otg", 1, 0, PMU_USB0);
clkdev_add_pmu("1f203018.usb2-phy", "phy", 1, 0, PMU_USB0_P);
- clkdev_add_pmu("1e180000.etop", "ppe", 1, 0, PMU_PPE);
- clkdev_add_cgu("1e180000.etop", "ephycgu", CGU_EPHY);
- clkdev_add_pmu("1e180000.etop", "ephy", 1, 0, PMU_EPHY);
+ clkdev_add_pmu("1e180000.ethernet", "ppe", 1, 0, PMU_PPE);
+ clkdev_add_cgu("1e180000.ethernet", "ephycgu", CGU_EPHY);
+ clkdev_add_pmu("1e180000.ethernet", "ephy", 1, 0, PMU_EPHY);
clkdev_add_pmu("1e103000.sdio", NULL, 1, 0, PMU_ASE_SDIO);
clkdev_add_pmu("1e116000.mei", "dfe", 1, 0, PMU_DFE);
} else if (of_machine_is_compatible("lantiq,grx390")) {
@@ -592,7 +592,7 @@ void __init ltq_soc_init(void)
clkdev_add_pmu("1e101000.usb", "otg", 1, 0, PMU_USB0 | PMU_AHBM);
clkdev_add_pmu("1f203034.usb2-phy", "phy", 1, 0, PMU_USB1_P);
clkdev_add_pmu("1e106000.usb", "otg", 1, 0, PMU_USB1 | PMU_AHBM);
- clkdev_add_pmu("1e180000.etop", "switch", 1, 0, PMU_SWITCH);
+ clkdev_add_pmu("1e180000.ethernet", "switch", 1, 0, PMU_SWITCH);
clkdev_add_pmu("1e103000.sdio", NULL, 1, 0, PMU_SDIO);
clkdev_add_pmu("1e103100.deu", NULL, 1, 0, PMU_DEU);
clkdev_add_pmu("1e116000.mei", "dfe", 1, 0, PMU_DFE);
diff --git a/arch/nios2/include/asm/entry.h b/arch/nios2/include/asm/entry.h
index bafb7b2ca59f..cb25ed56450a 100644
--- a/arch/nios2/include/asm/entry.h
+++ b/arch/nios2/include/asm/entry.h
@@ -10,7 +10,7 @@
#ifndef _ASM_NIOS2_ENTRY_H
#define _ASM_NIOS2_ENTRY_H
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#include <asm/processor.h>
#include <asm/registers.h>
@@ -117,5 +117,5 @@
addi sp, sp, SWITCH_STACK_SIZE
.endm
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_NIOS2_ENTRY_H */
diff --git a/arch/nios2/include/asm/page.h b/arch/nios2/include/asm/page.h
index 2897ec1b74f6..00a51623d38a 100644
--- a/arch/nios2/include/asm/page.h
+++ b/arch/nios2/include/asm/page.h
@@ -26,7 +26,7 @@
#define PAGE_OFFSET \
(CONFIG_NIOS2_MEM_BASE + CONFIG_NIOS2_KERNEL_REGION_BASE)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* This gives the physical RAM offset.
@@ -90,6 +90,6 @@ extern struct page *mem_map;
#include <asm-generic/getorder.h>
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_NIOS2_PAGE_H */
diff --git a/arch/nios2/include/asm/processor.h b/arch/nios2/include/asm/processor.h
index eb44130364a9..d9521c3c2df9 100644
--- a/arch/nios2/include/asm/processor.h
+++ b/arch/nios2/include/asm/processor.h
@@ -36,7 +36,7 @@
/* Kuser helpers is mapped to this user space address */
#define KUSER_BASE 0x1000
#define KUSER_SIZE (PAGE_SIZE)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
# define TASK_SIZE 0x7FFF0000UL
# define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
@@ -72,6 +72,6 @@ extern unsigned long __get_wchan(struct task_struct *p);
#define cpu_relax() barrier()
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_NIOS2_PROCESSOR_H */
diff --git a/arch/nios2/include/asm/ptrace.h b/arch/nios2/include/asm/ptrace.h
index 9da34c3022a2..96cbcd40c7ce 100644
--- a/arch/nios2/include/asm/ptrace.h
+++ b/arch/nios2/include/asm/ptrace.h
@@ -18,7 +18,7 @@
/* This struct defines the way the registers are stored on the
stack during a system call. */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct pt_regs {
unsigned long r8; /* r8-r15 Caller-saved GP registers */
unsigned long r9;
@@ -78,5 +78,5 @@ extern void show_regs(struct pt_regs *);
int do_syscall_trace_enter(void);
void do_syscall_trace_exit(void);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_NIOS2_PTRACE_H */
diff --git a/arch/nios2/include/asm/registers.h b/arch/nios2/include/asm/registers.h
index 95b67dd16f81..165dab26221f 100644
--- a/arch/nios2/include/asm/registers.h
+++ b/arch/nios2/include/asm/registers.h
@@ -6,7 +6,7 @@
#ifndef _ASM_NIOS2_REGISTERS_H
#define _ASM_NIOS2_REGISTERS_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/cpuinfo.h>
#endif
@@ -44,7 +44,7 @@
/* tlbmisc register bits */
#define TLBMISC_PID_SHIFT 4
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define TLBMISC_PID_MASK ((1UL << cpuinfo.tlb_pid_num_bits) - 1)
#endif
#define TLBMISC_WAY_MASK 0xf
diff --git a/arch/nios2/include/asm/setup.h b/arch/nios2/include/asm/setup.h
index 908a1526d1bd..6d3f26a71cb5 100644
--- a/arch/nios2/include/asm/setup.h
+++ b/arch/nios2/include/asm/setup.h
@@ -8,7 +8,7 @@
#include <asm-generic/setup.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef __KERNEL__
extern char exception_handler_hook[];
@@ -18,6 +18,6 @@ extern char fast_handler_end[];
extern void pagetable_init(void);
#endif/* __KERNEL__ */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_NIOS2_SETUP_H */
diff --git a/arch/nios2/include/asm/syscalls.h b/arch/nios2/include/asm/syscalls.h
index b4d4ed3bf9c8..0e214b0a0ac8 100644
--- a/arch/nios2/include/asm/syscalls.h
+++ b/arch/nios2/include/asm/syscalls.h
@@ -7,6 +7,7 @@
int sys_cacheflush(unsigned long addr, unsigned long len,
unsigned int op);
+asmlinkage long __sys_clone3(struct clone_args __user *uargs, size_t size);
#include <asm-generic/syscalls.h>
diff --git a/arch/nios2/include/asm/thread_info.h b/arch/nios2/include/asm/thread_info.h
index 5abac9893b32..83df79286d62 100644
--- a/arch/nios2/include/asm/thread_info.h
+++ b/arch/nios2/include/asm/thread_info.h
@@ -24,7 +24,7 @@
#define THREAD_SIZE_ORDER 1
#define THREAD_SIZE 8192 /* 2 * PAGE_SIZE */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* low level task data that entry.S needs immediate access to
@@ -61,7 +61,7 @@ static inline struct thread_info *current_thread_info(void)
return (struct thread_info *)(sp & ~(THREAD_SIZE - 1));
}
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
/*
* thread information flags
diff --git a/arch/nios2/include/asm/traps.h b/arch/nios2/include/asm/traps.h
index afd77bef01c6..133a3dedbc3e 100644
--- a/arch/nios2/include/asm/traps.h
+++ b/arch/nios2/include/asm/traps.h
@@ -12,7 +12,7 @@
#define TRAP_ID_SYSCALL 0
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
void _exception(int signo, struct pt_regs *regs, int code, unsigned long addr);
void do_page_fault(struct pt_regs *regs, unsigned long cause,
unsigned long address);
diff --git a/arch/nios2/include/asm/unistd.h b/arch/nios2/include/asm/unistd.h
index 1146e56473c5..213f6de3cf7b 100644
--- a/arch/nios2/include/asm/unistd.h
+++ b/arch/nios2/include/asm/unistd.h
@@ -7,6 +7,4 @@
#define __ARCH_WANT_STAT64
#define __ARCH_WANT_SET_GET_RLIMIT
-#define __ARCH_BROKEN_SYS_CLONE3
-
#endif
diff --git a/arch/nios2/include/uapi/asm/ptrace.h b/arch/nios2/include/uapi/asm/ptrace.h
index 2b91dbe5bcfe..1298db9f0fc9 100644
--- a/arch/nios2/include/uapi/asm/ptrace.h
+++ b/arch/nios2/include/uapi/asm/ptrace.h
@@ -13,7 +13,7 @@
#ifndef _UAPI_ASM_NIOS2_PTRACE_H
#define _UAPI_ASM_NIOS2_PTRACE_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
@@ -80,5 +80,5 @@ struct user_pt_regs {
__u32 regs[49];
};
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _UAPI_ASM_NIOS2_PTRACE_H */
diff --git a/arch/nios2/kernel/asm-offsets.c b/arch/nios2/kernel/asm-offsets.c
index e3d9b7b6fb48..88190b503ce5 100644
--- a/arch/nios2/kernel/asm-offsets.c
+++ b/arch/nios2/kernel/asm-offsets.c
@@ -2,6 +2,7 @@
/*
* Copyright (C) 2011 Tobias Klauser <tklauser@distanz.ch>
*/
+#define COMPILE_OFFSETS
#include <linux/stddef.h>
#include <linux/sched.h>
diff --git a/arch/nios2/kernel/entry.S b/arch/nios2/kernel/entry.S
index 99f0a65e6234..dd40dfd908e5 100644
--- a/arch/nios2/kernel/entry.S
+++ b/arch/nios2/kernel/entry.S
@@ -403,6 +403,12 @@ ENTRY(sys_clone)
addi sp, sp, 4
RESTORE_SWITCH_STACK
ret
+/* long syscall(SYS_clone3, struct clone_args *cl_args, size_t size); */
+ENTRY(__sys_clone3)
+ SAVE_SWITCH_STACK
+ call sys_clone3
+ RESTORE_SWITCH_STACK
+ ret
ENTRY(sys_rt_sigreturn)
SAVE_SWITCH_STACK
diff --git a/arch/nios2/kernel/process.c b/arch/nios2/kernel/process.c
index f84021303f6a..151404139085 100644
--- a/arch/nios2/kernel/process.c
+++ b/arch/nios2/kernel/process.c
@@ -101,7 +101,7 @@ void flush_thread(void)
int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
- unsigned long clone_flags = args->flags;
+ u64 clone_flags = args->flags;
unsigned long usp = args->stack;
unsigned long tls = args->tls;
struct pt_regs *childregs = task_pt_regs(p);
diff --git a/arch/nios2/kernel/setup.c b/arch/nios2/kernel/setup.c
index 2a40150142c3..f43f01c4ab93 100644
--- a/arch/nios2/kernel/setup.c
+++ b/arch/nios2/kernel/setup.c
@@ -142,6 +142,20 @@ static void __init find_limits(unsigned long *min, unsigned long *max_low,
*max_high = PFN_DOWN(memblock_end_of_DRAM());
}
+static void __init adjust_lowmem_bounds(void)
+{
+ phys_addr_t block_start, block_end;
+ u64 i;
+ phys_addr_t memblock_limit = 0;
+
+ for_each_mem_range(i, &block_start, &block_end) {
+ if (block_end > memblock_limit)
+ memblock_limit = block_end;
+ }
+
+ memblock_set_current_limit(memblock_limit);
+}
+
void __init setup_arch(char **cmdline_p)
{
console_verbose();
@@ -157,6 +171,7 @@ void __init setup_arch(char **cmdline_p)
/* Keep a copy of command line */
*cmdline_p = boot_command_line;
+ adjust_lowmem_bounds();
find_limits(&min_low_pfn, &max_low_pfn, &max_pfn);
memblock_reserve(__pa_symbol(_stext), _end - _stext);
diff --git a/arch/nios2/kernel/syscall_table.c b/arch/nios2/kernel/syscall_table.c
index 434694067d8f..c99818aac9e1 100644
--- a/arch/nios2/kernel/syscall_table.c
+++ b/arch/nios2/kernel/syscall_table.c
@@ -13,6 +13,7 @@
#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native)
#define sys_mmap2 sys_mmap_pgoff
+#define sys_clone3 __sys_clone3
void *sys_call_table[__NR_syscalls] = {
[0 ... __NR_syscalls-1] = sys_ni_syscall,
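
nios2 previously opted out of clone3 via __ARCH_BROKEN_SYS_CLONE3; the hunks above drop that define, add an assembly shim __sys_clone3 that saves and restores the switch stack around sys_clone3, and alias sys_clone3 to the shim in the syscall table. The new entry.S comment documents the calling convention; below is a hedged userspace sketch of invoking it through syscall(2) (minimal error handling; glibc provides no clone3 wrapper, so the raw syscall number is used):

/* Userspace sketch of the convention noted in the new entry.S comment:
 * long syscall(SYS_clone3, struct clone_args *cl_args, size_t size); */
#define _GNU_SOURCE
#include <linux/sched.h>    /* struct clone_args */
#include <sys/syscall.h>
#include <unistd.h>
#include <signal.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
    struct clone_args args;
    long pid;

    memset(&args, 0, sizeof(args));
    args.exit_signal = SIGCHLD;    /* fork-like child, no extra flags */

    pid = syscall(SYS_clone3, &args, sizeof(args));
    if (pid < 0) {
        perror("clone3");
        return 1;
    }
    if (pid == 0)
        _exit(0);    /* child */
    printf("spawned child %ld\n", pid);
    return 0;
}
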
diff --git a/arch/openrisc/include/asm/bitops/__ffs.h b/arch/openrisc/include/asm/bitops/__ffs.h
index 1e224b616fdf..4827b66530b2 100644
--- a/arch/openrisc/include/asm/bitops/__ffs.h
+++ b/arch/openrisc/include/asm/bitops/__ffs.h
@@ -11,7 +11,7 @@
#ifdef CONFIG_OPENRISC_HAVE_INST_FF1
-static inline unsigned long __ffs(unsigned long x)
+static inline __attribute_const__ unsigned long __ffs(unsigned long x)
{
int ret;
diff --git a/arch/openrisc/include/asm/bitops/__fls.h b/arch/openrisc/include/asm/bitops/__fls.h
index 9658446ad141..637cc76fe4b7 100644
--- a/arch/openrisc/include/asm/bitops/__fls.h
+++ b/arch/openrisc/include/asm/bitops/__fls.h
@@ -11,7 +11,7 @@
#ifdef CONFIG_OPENRISC_HAVE_INST_FL1
-static inline unsigned long __fls(unsigned long x)
+static inline __attribute_const__ unsigned long __fls(unsigned long x)
{
int ret;
diff --git a/arch/openrisc/include/asm/bitops/ffs.h b/arch/openrisc/include/asm/bitops/ffs.h
index b4c835d6bc84..536a60ab9cc3 100644
--- a/arch/openrisc/include/asm/bitops/ffs.h
+++ b/arch/openrisc/include/asm/bitops/ffs.h
@@ -10,7 +10,7 @@
#ifdef CONFIG_OPENRISC_HAVE_INST_FF1
-static inline int ffs(int x)
+static inline __attribute_const__ int ffs(int x)
{
int ret;
diff --git a/arch/openrisc/include/asm/bitops/fls.h b/arch/openrisc/include/asm/bitops/fls.h
index 6b77f6556fb9..77da7639bb3e 100644
--- a/arch/openrisc/include/asm/bitops/fls.h
+++ b/arch/openrisc/include/asm/bitops/fls.h
@@ -11,7 +11,7 @@
#ifdef CONFIG_OPENRISC_HAVE_INST_FL1
-static inline int fls(unsigned int x)
+static inline __attribute_const__ int fls(unsigned int x)
{
int ret;
diff --git a/arch/openrisc/kernel/asm-offsets.c b/arch/openrisc/kernel/asm-offsets.c
index 710651d5aaae..3cc826f2216b 100644
--- a/arch/openrisc/kernel/asm-offsets.c
+++ b/arch/openrisc/kernel/asm-offsets.c
@@ -18,6 +18,7 @@
* compile this file to assembler, and then extract the
* #defines from the assembly-language output.
*/
+#define COMPILE_OFFSETS
#include <linux/signal.h>
#include <linux/sched.h>
diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c
index eef99fee2110..73ffb9fa3118 100644
--- a/arch/openrisc/kernel/process.c
+++ b/arch/openrisc/kernel/process.c
@@ -165,7 +165,7 @@ extern asmlinkage void ret_from_fork(void);
int
copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
- unsigned long clone_flags = args->flags;
+ u64 clone_flags = args->flags;
unsigned long usp = args->stack;
unsigned long tls = args->tls;
struct pt_regs *userregs;
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 2efa4b08b7b8..0940c162f1f7 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -44,6 +44,7 @@ config PARISC
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select GENERIC_SMP_IDLE_THREAD
select GENERIC_ARCH_TOPOLOGY if SMP
+ select ARCH_SUPPORTS_SCHED_MC if SMP && PA8X00
select GENERIC_CPU_DEVICES if !SMP
select GENERIC_LIB_DEVMEM_IS_ALLOWED
select SYSCTL_ARCH_UNALIGN_ALLOW
@@ -319,14 +320,6 @@ config SMP
If you don't know what to do here, say N.
-config SCHED_MC
- bool "Multi-core scheduler support"
- depends on GENERIC_ARCH_TOPOLOGY && PA8X00
- help
- Multi-core scheduler support improves the CPU scheduler's decision
- making when dealing with multi-core CPU chips at a cost of slightly
- increased overhead in some places. If unsure say N here.
-
config IRQSTACKS
bool "Use separate kernel stacks when processing interrupts"
default y
diff --git a/arch/parisc/include/asm/bitops.h b/arch/parisc/include/asm/bitops.h
index 0ec9cfc5131f..bd1280a8a5ec 100644
--- a/arch/parisc/include/asm/bitops.h
+++ b/arch/parisc/include/asm/bitops.h
@@ -123,7 +123,7 @@ static __inline__ int test_and_change_bit(int nr, volatile unsigned long * addr)
* cycles for each mispredicted branch.
*/
-static __inline__ unsigned long __ffs(unsigned long x)
+static __inline__ __attribute_const__ unsigned long __ffs(unsigned long x)
{
unsigned long ret;
@@ -161,7 +161,7 @@ static __inline__ unsigned long __ffs(unsigned long x)
* This is defined the same way as the libc and compiler builtin
* ffs routines, therefore differs in spirit from the above ffz (man ffs).
*/
-static __inline__ int ffs(int x)
+static __inline__ __attribute_const__ int ffs(int x)
{
return x ? (__ffs((unsigned long)x) + 1) : 0;
}
@@ -171,7 +171,7 @@ static __inline__ int ffs(int x)
* fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32.
*/
-static __inline__ int fls(unsigned int x)
+static __inline__ __attribute_const__ int fls(unsigned int x)
{
int ret;
if (!x)
diff --git a/arch/parisc/kernel/asm-offsets.c b/arch/parisc/kernel/asm-offsets.c
index 757816a7bd4b..9abfe65492c6 100644
--- a/arch/parisc/kernel/asm-offsets.c
+++ b/arch/parisc/kernel/asm-offsets.c
@@ -13,6 +13,7 @@
* Copyright (C) 2002 Randolph Chung <tausq with parisc-linux.org>
* Copyright (C) 2003 James Bottomley <jejb at parisc-linux.org>
*/
+#define COMPILE_OFFSETS
#include <linux/types.h>
#include <linux/sched.h>
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index ed93bd8c1545..e64ab5d2a40d 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -201,7 +201,7 @@ arch_initcall(parisc_idle_init);
int
copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
- unsigned long clone_flags = args->flags;
+ u64 clone_flags = args->flags;
unsigned long usp = args->stack;
unsigned long tls = args->tls;
struct pt_regs *cregs = &(p->thread.regs);
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 93402a1d9c9f..12fb3da59700 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -170,6 +170,9 @@ config PPC
select ARCH_STACKWALK
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx
+ select ARCH_SUPPORTS_SCHED_MC if SMP
+ select ARCH_SUPPORTS_SCHED_SMT if PPC64 && SMP
+ select SCHED_MC if ARCH_SUPPORTS_SCHED_MC
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_CMPXCHG_LOCKREF if PPC64
select ARCH_USE_MEMTEST
@@ -243,12 +246,14 @@ config PPC
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select HAVE_GUP_FAST
select HAVE_FTRACE_GRAPH_FUNC
+ select HAVE_FTRACE_REGS_HAVING_PT_REGS
select HAVE_FUNCTION_ARG_ACCESS_API
select HAVE_FUNCTION_DESCRIPTORS if PPC64_ELF_ABI_V1
select HAVE_FUNCTION_ERROR_INJECTION
+ select HAVE_FUNCTION_GRAPH_FREGS
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER if !COMPILE_TEST && (PPC64 || (PPC32 && CC_IS_GCC))
- select HAVE_GCC_PLUGINS if GCC_VERSION >= 50200 # plugin support on gcc <= 5.1 is buggy on PPC
+ select HAVE_GCC_PLUGINS
select HAVE_GENERIC_VDSO
select HAVE_HARDLOCKUP_DETECTOR_ARCH if PPC_BOOK3S_64 && SMP
select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI
@@ -963,14 +968,6 @@ config PPC_PROT_SAO_LPAR
config PPC_COPRO_BASE
bool
-config SCHED_SMT
- bool "SMT (Hyperthreading) scheduler support"
- depends on PPC64 && SMP
- help
- SMT scheduler support improves the CPU scheduler's decision making
- when dealing with POWER5 cpus at a cost of slightly increased
- overhead in some places. If unsure say N here.
-
config PPC_DENORMALISATION
bool "PowerPC denormalisation exception handling"
depends on PPC_BOOK3S_64
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 9753fb87217c..a58b1029592c 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -58,7 +58,7 @@ ifeq ($(CONFIG_PPC64)$(CONFIG_LD_IS_BFD),yy)
# There is a corresponding test in arch/powerpc/lib/Makefile
KBUILD_LDFLAGS_MODULE += --save-restore-funcs
else
-KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o
+KBUILD_LDFLAGS_MODULE += $(objtree)/arch/powerpc/lib/crtsavres.o
endif
ifdef CONFIG_CPU_LITTLE_ENDIAN
diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index a7ab087d412c..c47b78c1d3e7 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -243,13 +243,13 @@ $(obj)/wrapper.a: $(obj-wlib) FORCE
hostprogs := addnote hack-coff mktree
targets += $(patsubst $(obj)/%,%,$(obj-boot) wrapper.a) zImage.lds
-extra-y := $(obj)/wrapper.a $(obj-plat) $(obj)/empty.o \
+always-y := $(obj)/wrapper.a $(obj-plat) $(obj)/empty.o \
$(obj)/zImage.lds $(obj)/zImage.coff.lds $(obj)/zImage.ps3.lds
dtstree := $(src)/dts
wrapper := $(src)/wrapper
-wrapperbits := $(extra-y) $(addprefix $(obj)/,addnote hack-coff mktree) \
+wrapperbits := $(always-y) $(addprefix $(obj)/,addnote hack-coff mktree) \
$(wrapper) FORCE
#############
@@ -456,7 +456,7 @@ WRAPPER_DTSDIR := /usr/lib/kernel-wrapper/dts
WRAPPER_BINDIR := /usr/sbin
INSTALL := install
-extra-installed := $(patsubst $(obj)/%, $(DESTDIR)$(WRAPPER_OBJDIR)/%, $(extra-y))
+extra-installed := $(patsubst $(obj)/%, $(DESTDIR)$(WRAPPER_OBJDIR)/%, $(always-y))
hostprogs-installed := $(patsubst %, $(DESTDIR)$(WRAPPER_BINDIR)/%, $(hostprogs))
wrapper-installed := $(DESTDIR)$(WRAPPER_BINDIR)/wrapper
dts-installed := $(patsubst $(dtstree)/%, $(DESTDIR)$(WRAPPER_DTSDIR)/%, $(wildcard $(dtstree)/*.dts))
diff --git a/arch/powerpc/boot/install.sh b/arch/powerpc/boot/install.sh
index 101fcb397a0f..c3df6c27ce75 100755
--- a/arch/powerpc/boot/install.sh
+++ b/arch/powerpc/boot/install.sh
@@ -19,19 +19,19 @@
set -e
# this should work for both the pSeries zImage and the iSeries vmlinux.sm
-image_name=`basename $2`
+image_name=$(basename "$2")
echo "Warning: '${INSTALLKERNEL}' command not available... Copying" \
"directly to $4/$image_name-$1" >&2
-if [ -f $4/$image_name-$1 ]; then
- mv $4/$image_name-$1 $4/$image_name-$1.old
+if [ -f "$4"/"$image_name"-"$1" ]; then
+ mv "$4"/"$image_name"-"$1" "$4"/"$image_name"-"$1".old
fi
-if [ -f $4/System.map-$1 ]; then
- mv $4/System.map-$1 $4/System-$1.old
+if [ -f "$4"/System.map-"$1" ]; then
+ mv "$4"/System.map-"$1" "$4"/System-"$1".old
fi
-cat $2 > $4/$image_name-$1
-cp $3 $4/System.map-$1
+cat "$2" > "$4"/"$image_name"-"$1"
+cp "$3" "$4"/System.map-"$1"
diff --git a/arch/powerpc/boot/page.h b/arch/powerpc/boot/page.h
index c3d55fc8f34c..e44a3119720d 100644
--- a/arch/powerpc/boot/page.h
+++ b/arch/powerpc/boot/page.h
@@ -5,7 +5,7 @@
* Copyright (C) 2001 PPC64 Team, IBM Corp
*/
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#define ASM_CONST(x) x
#else
#define __ASM_CONST(x) x##UL
diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper
index 3d8dc822282a..a75baefd1cff 100755
--- a/arch/powerpc/boot/wrapper
+++ b/arch/powerpc/boot/wrapper
@@ -226,11 +226,7 @@ ld_is_lld()
# Do not include PT_INTERP segment when linking pie. Non-pie linking
# just ignores this option.
-LD_VERSION=$(${CROSS}ld --version | ld_version)
-LD_NO_DL_MIN_VERSION=$(echo 2.26 | ld_version)
-if [ "$LD_VERSION" -ge "$LD_NO_DL_MIN_VERSION" ] ; then
- nodl="--no-dynamic-linker"
-fi
+nodl="--no-dynamic-linker"
# suppress some warnings in recent ld versions
nowarn="-z noexecstack"
diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig
index d06388b0f66e..bd4685612de6 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -320,7 +320,6 @@ CONFIG_XMON=y
CONFIG_CRYPTO_BENCHMARK=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_HMAC=y
-CONFIG_CRYPTO_MD5_PPC=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_SHA256=y
CONFIG_CRYPTO_WP512=m
diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
index ce34597e9f3e..2d92c11eea7e 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -387,7 +387,6 @@ CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_SHA256=y
CONFIG_CRYPTO_WP512=m
CONFIG_CRYPTO_LZO=m
-CONFIG_CRYPTO_MD5_PPC=m
CONFIG_CRYPTO_AES_GCM_P10=m
CONFIG_CRYPTO_DEV_NX=y
CONFIG_CRYPTO_DEV_NX_ENCRYPT=m
diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig
index bb359643ddc1..b082c1fae13c 100644
--- a/arch/powerpc/configs/ppc6xx_defconfig
+++ b/arch/powerpc/configs/ppc6xx_defconfig
@@ -433,6 +433,7 @@ CONFIG_DM9102=m
CONFIG_ULI526X=m
CONFIG_PCMCIA_XIRCOM=m
CONFIG_DL2K=m
+CONFIG_SUNDANCE=m
CONFIG_S2IO=m
CONFIG_FEC_MPC52xx=m
CONFIG_GIANFAR=m
diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig
index cfe39fc221cf..662aed46f9c7 100644
--- a/arch/powerpc/crypto/Kconfig
+++ b/arch/powerpc/crypto/Kconfig
@@ -2,27 +2,6 @@
menu "Accelerated Cryptographic Algorithms for CPU (powerpc)"
-config CRYPTO_CURVE25519_PPC64
- tristate
- depends on PPC64 && CPU_LITTLE_ENDIAN
- select CRYPTO_KPP
- select CRYPTO_LIB_CURVE25519_GENERIC
- select CRYPTO_ARCH_HAVE_LIB_CURVE25519
- default CRYPTO_LIB_CURVE25519_INTERNAL
- help
- Curve25519 algorithm
-
- Architecture: PowerPC64
- - Little-endian
-
-config CRYPTO_MD5_PPC
- tristate "Digests: MD5"
- select CRYPTO_HASH
- help
- MD5 message digest algorithm (RFC1321)
-
- Architecture: powerpc
-
config CRYPTO_AES_PPC_SPE
tristate "Ciphers: AES, modes: ECB/CBC/CTR/XTS (SPE)"
depends on SPE
diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile
index bc8fd27344b8..5960e5300db7 100644
--- a/arch/powerpc/crypto/Makefile
+++ b/arch/powerpc/crypto/Makefile
@@ -6,16 +6,12 @@
#
obj-$(CONFIG_CRYPTO_AES_PPC_SPE) += aes-ppc-spe.o
-obj-$(CONFIG_CRYPTO_MD5_PPC) += md5-ppc.o
obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o
obj-$(CONFIG_CRYPTO_DEV_VMX_ENCRYPT) += vmx-crypto.o
-obj-$(CONFIG_CRYPTO_CURVE25519_PPC64) += curve25519-ppc64le.o
aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o
-md5-ppc-y := md5-asm.o md5-glue.o
aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp10-ppc.o aesp10-ppc.o
vmx-crypto-objs := vmx.o aesp8-ppc.o ghashp8-ppc.o aes.o aes_cbc.o aes_ctr.o aes_xts.o ghash.o
-curve25519-ppc64le-y := curve25519-ppc64le-core.o curve25519-ppc64le_asm.o
ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y)
override flavour := linux-ppc64le
diff --git a/arch/powerpc/crypto/md5-glue.c b/arch/powerpc/crypto/md5-glue.c
deleted file mode 100644
index 204440a90cd8..000000000000
--- a/arch/powerpc/crypto/md5-glue.c
+++ /dev/null
@@ -1,99 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Glue code for MD5 implementation for PPC assembler
- *
- * Based on generic implementation.
- *
- * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
- */
-
-#include <crypto/internal/hash.h>
-#include <crypto/md5.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-extern void ppc_md5_transform(u32 *state, const u8 *src, u32 blocks);
-
-static int ppc_md5_init(struct shash_desc *desc)
-{
- struct md5_state *sctx = shash_desc_ctx(desc);
-
- sctx->hash[0] = MD5_H0;
- sctx->hash[1] = MD5_H1;
- sctx->hash[2] = MD5_H2;
- sctx->hash[3] = MD5_H3;
- sctx->byte_count = 0;
-
- return 0;
-}
-
-static int ppc_md5_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- struct md5_state *sctx = shash_desc_ctx(desc);
-
- sctx->byte_count += round_down(len, MD5_HMAC_BLOCK_SIZE);
- ppc_md5_transform(sctx->hash, data, len >> 6);
- return len - round_down(len, MD5_HMAC_BLOCK_SIZE);
-}
-
-static int ppc_md5_finup(struct shash_desc *desc, const u8 *src,
- unsigned int offset, u8 *out)
-{
- struct md5_state *sctx = shash_desc_ctx(desc);
- __le64 block[MD5_BLOCK_WORDS] = {};
- u8 *p = memcpy(block, src, offset);
- __le32 *dst = (__le32 *)out;
- __le64 *pbits;
-
- src = p;
- p += offset;
- *p++ = 0x80;
- sctx->byte_count += offset;
- pbits = &block[(MD5_BLOCK_WORDS / (offset > 55 ? 1 : 2)) - 1];
- *pbits = cpu_to_le64(sctx->byte_count << 3);
- ppc_md5_transform(sctx->hash, src, (pbits - block + 1) / 8);
- memzero_explicit(block, sizeof(block));
-
- dst[0] = cpu_to_le32(sctx->hash[0]);
- dst[1] = cpu_to_le32(sctx->hash[1]);
- dst[2] = cpu_to_le32(sctx->hash[2]);
- dst[3] = cpu_to_le32(sctx->hash[3]);
- return 0;
-}
-
-static struct shash_alg alg = {
- .digestsize = MD5_DIGEST_SIZE,
- .init = ppc_md5_init,
- .update = ppc_md5_update,
- .finup = ppc_md5_finup,
- .descsize = MD5_STATE_SIZE,
- .base = {
- .cra_name = "md5",
- .cra_driver_name= "md5-ppc",
- .cra_priority = 200,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = MD5_HMAC_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static int __init ppc_md5_mod_init(void)
-{
- return crypto_register_shash(&alg);
-}
-
-static void __exit ppc_md5_mod_fini(void)
-{
- crypto_unregister_shash(&alg);
-}
-
-module_init(ppc_md5_mod_init);
-module_exit(ppc_md5_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("MD5 Secure Hash Algorithm, PPC assembler");
-
-MODULE_ALIAS_CRYPTO("md5");
-MODULE_ALIAS_CRYPTO("md5-ppc");
diff --git a/arch/powerpc/include/asm/asm-const.h b/arch/powerpc/include/asm/asm-const.h
index bfb3c3534877..392bdb1f104f 100644
--- a/arch/powerpc/include/asm/asm-const.h
+++ b/arch/powerpc/include/asm/asm-const.h
@@ -1,7 +1,7 @@
#ifndef _ASM_POWERPC_ASM_CONST_H
#define _ASM_POWERPC_ASM_CONST_H
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
# define stringify_in_c(...) __VA_ARGS__
# define ASM_CONST(x) x
#else
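
The mechanical replacements in this and the following headers rely on __ASSEMBLER__, which GCC and Clang predefine whenever they preprocess assembly input, unlike the kernel's historical hand-maintained __ASSEMBLY__ macro. A self-contained sketch of the dual-use header pattern, with one file standing in for a header plus its C user:

/*
 * Dual-use definitions: the assembler sees only the bare constant,
 * C code also gets the typed helper. __ASSEMBLER__ is predefined by
 * the compiler when preprocessing .S files, so no Makefile plumbing
 * is needed to set it.
 */
#define EXAMPLE_MAGIC 0x2a

#ifndef __ASSEMBLER__
#include <stdio.h>

static inline int example_magic_plus(int x)
{
	return EXAMPLE_MAGIC + x;
}

int main(void)
{
	printf("%d\n", example_magic_plus(0));
	return 0;
}
#endif /* __ASSEMBLER__ */
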
diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h
index b95b666f0374..9e9833faa4af 100644
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -7,7 +7,7 @@
#include <asm/asm-const.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/ppc-opcode.h>
#endif
diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h
index 671ecc6711e3..0d0470cd5ac3 100644
--- a/arch/powerpc/include/asm/bitops.h
+++ b/arch/powerpc/include/asm/bitops.h
@@ -276,7 +276,7 @@ static inline void arch___clear_bit_unlock(int nr, volatile unsigned long *addr)
* fls: find last (most-significant) bit set.
* Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32.
*/
-static __always_inline int fls(unsigned int x)
+static __always_inline __attribute_const__ int fls(unsigned int x)
{
int lz;
@@ -294,7 +294,7 @@ static __always_inline int fls(unsigned int x)
* 32-bit fls calls.
*/
#ifdef CONFIG_PPC64
-static __always_inline int fls64(__u64 x)
+static __always_inline __attribute_const__ int fls64(__u64 x)
{
int lz;
diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h
index 4e14a5427a63..873c5146e326 100644
--- a/arch/powerpc/include/asm/book3s/32/kup.h
+++ b/arch/powerpc/include/asm/book3s/32/kup.h
@@ -7,7 +7,7 @@
#include <asm/mmu.h>
#include <asm/synch.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef CONFIG_PPC_KUAP
@@ -170,6 +170,6 @@ __bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write)
#endif /* CONFIG_PPC_KUAP */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_POWERPC_BOOK3S_32_KUP_H */
diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h
index 78c6a5fde1d6..8435bf3cdabf 100644
--- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h
@@ -29,7 +29,7 @@
#define BPP_RX 0x01 /* Read only */
#define BPP_RW 0x02 /* Read/write */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/* Contort a phys_addr_t into the right format/bits for a BAT */
#ifdef CONFIG_PHYS_64BIT
#define BAT_PHYS_ADDR(x) ((u32)((x & 0x00000000fffe0000ULL) | \
@@ -47,7 +47,7 @@ struct ppc_bat {
u32 batu;
u32 batl;
};
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
/*
* Hash table
@@ -64,7 +64,7 @@ struct ppc_bat {
#define SR_KP 0x20000000 /* User key */
#define SR_KS 0x40000000 /* Supervisor key */
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#include <asm/asm-offsets.h>
@@ -225,7 +225,7 @@ static __always_inline void update_user_segments(u32 val)
int __init find_free_bat(void);
unsigned int bat_block_size(unsigned long base, unsigned long top);
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
/* We happily ignore the smaller BATs on 601, we don't actually use
* those definitions on hash32 at the moment anyway
diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h
index dd4eb3063175..f4390704d5ba 100644
--- a/arch/powerpc/include/asm/book3s/32/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h
@@ -7,8 +7,14 @@
static inline pgd_t *pgd_alloc(struct mm_struct *mm)
{
- return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE),
- pgtable_gfp_flags(mm, GFP_KERNEL));
+ pgd_t *pgd = kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE),
+ pgtable_gfp_flags(mm, GFP_KERNEL));
+
+#ifdef CONFIG_PPC_BOOK3S_603
+ memcpy(pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD,
+ (MAX_PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
+#endif
+ return pgd;
}
static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
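
The pgalloc change above seeds each newly allocated PGD with the kernel's upper entries on 603 (the matching removal from the nohash variant appears later in this diff). Stripped of the MMU details, the pattern is "copy the shared kernel half from the master table at allocation time"; a rough userspace sketch with invented sizes:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PTRS_TOTAL	1024	/* invented: total top-level entries */
#define PTRS_USER	768	/* invented: entries covering user space */

static unsigned long master_table[PTRS_TOTAL];	/* stands in for swapper_pg_dir */

static unsigned long *table_alloc(void)
{
	unsigned long *t = calloc(PTRS_TOTAL, sizeof(*t));

	if (!t)
		return NULL;
	/* Share the kernel mappings: copy entries above the user range. */
	memcpy(t + PTRS_USER, master_table + PTRS_USER,
	       (PTRS_TOTAL - PTRS_USER) * sizeof(*t));
	return t;
}

int main(void)
{
	unsigned long *t;

	master_table[PTRS_USER] = 0xdeadbeef;	/* pretend kernel entry */
	t = table_alloc();
	if (!t)
		return 1;
	printf("copied kernel entry: %#lx\n", t[PTRS_USER]);
	free(t);
	return 0;
}
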
diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 92d21c6faf1e..87dcca962be7 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -102,7 +102,7 @@
#define PMD_CACHE_INDEX PMD_INDEX_SIZE
#define PUD_CACHE_INDEX PUD_INDEX_SIZE
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define PTE_TABLE_SIZE (sizeof(pte_t) << PTE_INDEX_SIZE)
#define PMD_TABLE_SIZE 0
#define PUD_TABLE_SIZE 0
@@ -110,7 +110,7 @@
/* Bits to mask out from a PMD to get to the PTE page */
#define PMD_MASKED_BITS (PTE_TABLE_SIZE - 1)
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#define PTRS_PER_PTE (1 << PTE_INDEX_SIZE)
#define PTRS_PER_PGD (1 << PGD_INDEX_SIZE)
@@ -132,12 +132,12 @@
#define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
void unmap_kernel_page(unsigned long va);
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
/*
* This is the bottom of the PKMAP area with HIGHMEM or an arbitrary
@@ -199,7 +199,7 @@ void unmap_kernel_page(unsigned long va);
#define MODULES_SIZE (CONFIG_MODULES_SIZE * SZ_1M)
#define MODULES_VADDR (MODULES_END - MODULES_SIZE)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/sched.h>
#include <linux/threads.h>
@@ -602,6 +602,6 @@ static inline pgprot_t pgprot_writecombine(pgprot_t prot)
return pgprot_noncached_wc(prot);
}
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_POWERPC_BOOK3S_32_PGTABLE_H */
diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index 7132392fa7cd..8e5bd9902bed 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -32,7 +32,7 @@
*/
#define H_KERN_VIRT_START ASM_CONST(0xc0003d0000000000)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define H_PTE_TABLE_SIZE (sizeof(pte_t) << H_PTE_INDEX_SIZE)
#define H_PMD_TABLE_SIZE (sizeof(pmd_t) << H_PMD_INDEX_SIZE)
#define H_PUD_TABLE_SIZE (sizeof(pud_t) << H_PUD_INDEX_SIZE)
@@ -168,6 +168,6 @@ extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
extern int hash__has_transparent_hugepage(void);
#endif
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_POWERPC_BOOK3S_64_HASH_4K_H */
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index 0fb5b7da9478..7deb3a66890b 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -79,7 +79,7 @@
#endif
#define H_PMD_FRAG_NR (PAGE_SIZE >> H_PMD_FRAG_SIZE_SHIFT)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/errno.h>
/*
@@ -281,6 +281,6 @@ extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
extern int hash__has_transparent_hugepage(void);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_POWERPC_BOOK3S_64_HASH_64K_H */
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index 0755f2567021..5a8cbd496731 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -112,7 +112,7 @@
#define H_PMD_BAD_BITS (PTE_TABLE_SIZE-1)
#define H_PUD_BAD_BITS (PMD_TABLE_SIZE-1)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
static inline int get_region_id(unsigned long ea)
{
int region_id;
@@ -295,6 +295,6 @@ int hash__create_section_mapping(unsigned long start, unsigned long end,
int nid, pgprot_t prot);
int hash__remove_section_mapping(unsigned long start, unsigned long end);
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* __KERNEL__ */
#endif /* _ASM_POWERPC_BOOK3S_64_HASH_H */
diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h
index 497a7bd31ecc..03aec3c6c851 100644
--- a/arch/powerpc/include/asm/book3s/64/kup.h
+++ b/arch/powerpc/include/asm/book3s/64/kup.h
@@ -10,7 +10,7 @@
#define AMR_KUEP_BLOCKED UL(0x5455555555555555)
#define AMR_KUAP_BLOCKED (AMR_KUAP_BLOCK_READ | AMR_KUAP_BLOCK_WRITE)
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
.macro kuap_user_restore gpr1, gpr2
#if defined(CONFIG_PPC_PKEY)
@@ -191,7 +191,7 @@
#endif
.endm
-#else /* !__ASSEMBLY__ */
+#else /* !__ASSEMBLER__ */
#include <linux/jump_label.h>
#include <linux/sched.h>
@@ -413,6 +413,6 @@ static __always_inline void restore_user_access(unsigned long flags)
if (static_branch_unlikely(&uaccess_flush_key) && flags == AMR_KUAP_BLOCKED)
do_uaccess_flush();
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_POWERPC_BOOK3S_64_KUP_H */
diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 1c4eebbc69c9..346351423207 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -130,7 +130,7 @@
#define POWER9_TLB_SETS_HASH 256 /* # sets in POWER9 TLB Hash mode */
#define POWER9_TLB_SETS_RADIX 128 /* # sets in POWER9 TLB Radix mode */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct mmu_hash_ops {
void (*hpte_invalidate)(unsigned long slot,
@@ -220,7 +220,7 @@ static inline unsigned long get_sllp_encoding(int psize)
return sllp;
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
/*
* Segment sizes.
@@ -248,7 +248,7 @@ static inline unsigned long get_sllp_encoding(int psize)
#define LP_BITS 8
#define LP_MASK(i) ((0xFF >> (i)) << LP_SHIFT)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
static inline int slb_vsid_shift(int ssize)
{
@@ -532,7 +532,7 @@ void slb_set_size(u16 size);
static inline void slb_set_size(u16 size) { }
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
/*
* VSID allocation (256MB segment)
@@ -668,7 +668,7 @@ static inline void slb_set_size(u16 size) { }
#define SLICE_ARRAY_SIZE (H_PGTABLE_RANGE >> 41)
#define LOW_SLICE_ARRAY_SZ (BITS_PER_LONG / BITS_PER_BYTE)
#define TASK_SLICE_ARRAY_SZ(x) ((x)->hash_context->slb_addr_limit >> 41)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef CONFIG_PPC_SUBPAGE_PROT
/*
@@ -881,5 +881,5 @@ static inline unsigned long mk_vsid_data(unsigned long ea, int ssize,
return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags);
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_POWERPC_BOOK3S_64_MMU_HASH_H_ */
diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index fedbc5d38191..48631365b48c 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -4,7 +4,7 @@
#include <asm/page.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* Page size definition
*
@@ -26,12 +26,12 @@ struct mmu_psize_def {
};
};
extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
/* 64-bit classic hash table MMU */
#include <asm/book3s/64/mmu-hash.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* ISA 3.0 partition and process table entry format
*/
@@ -288,5 +288,5 @@ static inline unsigned long get_user_vsid(mm_context_t *ctx,
}
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_POWERPC_BOOK3S_64_MMU_H_ */
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
index 4d8d7b4ea16b..004a03e97e58 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
@@ -2,7 +2,7 @@
#ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H
#define _ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef CONFIG_HUGETLB_PAGE
#endif /* CONFIG_HUGETLB_PAGE */
@@ -14,5 +14,5 @@ static inline int remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr,
BUG();
return hash__remap_4k_pfn(vma, addr, pfn, prot);
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /*_ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H */
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index c19800365315..aac8ce30cd3b 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -4,7 +4,7 @@
#include <asm-generic/pgtable-nop4d.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/mmdebug.h>
#include <linux/bug.h>
#include <linux/sizes.h>
@@ -143,7 +143,7 @@
#define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO)
#define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* page table defines
*/
@@ -291,7 +291,7 @@ static inline unsigned long pud_leaf_size(pud_t pud)
else
return PUD_SIZE;
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#include <asm/book3s/64/hash.h>
#include <asm/book3s/64/radix.h>
@@ -327,7 +327,7 @@ static inline unsigned long pud_leaf_size(pud_t pud)
#define FIXADDR_SIZE SZ_32M
#define FIXADDR_TOP (IOREMAP_END + FIXADDR_SIZE)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
static inline unsigned long pte_update(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned long clr,
@@ -1381,5 +1381,5 @@ static inline bool is_pte_rw_upgrade(unsigned long old_val, unsigned long new_va
return false;
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index df23a8267e4d..da954e779744 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -4,7 +4,7 @@
#include <asm/asm-const.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/cmpxchg.h>
#endif
@@ -14,7 +14,7 @@
#include <asm/book3s/64/radix-4k.h>
#endif
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/book3s/64/tlbflush-radix.h>
#include <asm/cpu_has_feature.h>
#endif
@@ -132,7 +132,7 @@
#define RADIX_VMEMMAP_SIZE RADIX_KERN_MAP_SIZE
#define RADIX_VMEMMAP_END (RADIX_VMEMMAP_START + RADIX_VMEMMAP_SIZE)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define RADIX_PTE_TABLE_SIZE (sizeof(pte_t) << RADIX_PTE_INDEX_SIZE)
#define RADIX_PMD_TABLE_SIZE (sizeof(pmd_t) << RADIX_PMD_INDEX_SIZE)
#define RADIX_PUD_TABLE_SIZE (sizeof(pud_t) << RADIX_PUD_INDEX_SIZE)
@@ -362,5 +362,5 @@ int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
unsigned long start,
unsigned long end, int node,
struct dev_pagemap *pgmap);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif
diff --git a/arch/powerpc/include/asm/book3s/64/slice.h b/arch/powerpc/include/asm/book3s/64/slice.h
index 5fbe18544cbd..6e2f7a74cd75 100644
--- a/arch/powerpc/include/asm/book3s/64/slice.h
+++ b/arch/powerpc/include/asm/book3s/64/slice.h
@@ -2,7 +2,7 @@
#ifndef _ASM_POWERPC_BOOK3S_64_SLICE_H
#define _ASM_POWERPC_BOOK3S_64_SLICE_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef CONFIG_PPC_64S_HASH_MMU
#ifdef CONFIG_HUGETLB_PAGE
@@ -37,6 +37,6 @@ void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
void slice_init_new_context_exec(struct mm_struct *mm);
void slice_setup_new_exec(void);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_POWERPC_BOOK3S_64_SLICE_H */
diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h
index 1db485aacbd9..bbaa7e81f821 100644
--- a/arch/powerpc/include/asm/bug.h
+++ b/arch/powerpc/include/asm/bug.h
@@ -7,7 +7,7 @@
#ifdef CONFIG_BUG
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#include <asm/asm-offsets.h>
#ifdef CONFIG_DEBUG_BUGVERBOSE
.macro EMIT_BUG_ENTRY addr,file,line,flags
@@ -31,7 +31,7 @@
.endm
#endif /* verbose */
-#else /* !__ASSEMBLY__ */
+#else /* !__ASSEMBLER__ */
/* _EMIT_BUG_ENTRY expects args %0,%1,%2,%3 to be FILE, LINE, flags and
sizeof(struct bug_entry), respectively */
#ifdef CONFIG_DEBUG_BUGVERBOSE
@@ -101,12 +101,12 @@
#define HAVE_ARCH_WARN_ON
#endif
-#endif /* __ASSEMBLY __ */
+#endif /* __ASSEMBLER__ */
#else
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
.macro EMIT_BUG_ENTRY addr,file,line,flags
.endm
-#else /* !__ASSEMBLY__ */
+#else /* !__ASSEMBLER__ */
#define _EMIT_BUG_ENTRY
#endif
#endif /* CONFIG_BUG */
@@ -115,7 +115,7 @@
#include <asm-generic/bug.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct pt_regs;
void hash__do_page_fault(struct pt_regs *);
@@ -128,7 +128,7 @@ void die_mce(const char *str, struct pt_regs *regs, long err);
extern bool die_will_crash(void);
extern void panic_flush_kmsg_start(void);
extern void panic_flush_kmsg_end(void);
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* __KERNEL__ */
#endif /* _ASM_POWERPC_BUG_H */
diff --git a/arch/powerpc/include/asm/cache.h b/arch/powerpc/include/asm/cache.h
index 69232231d270..6796babc4d31 100644
--- a/arch/powerpc/include/asm/cache.h
+++ b/arch/powerpc/include/asm/cache.h
@@ -37,7 +37,7 @@
#define ARCH_DMA_MINALIGN L1_CACHE_BYTES
#endif
-#if !defined(__ASSEMBLY__)
+#if !defined(__ASSEMBLER__)
#ifdef CONFIG_PPC64
struct ppc_cache_info {
@@ -145,6 +145,6 @@ static inline void iccci(void *addr)
asm volatile ("iccci 0, %0" : : "r"(addr) : "memory");
}
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* __KERNEL__ */
#endif /* _ASM_POWERPC_CACHE_H */
diff --git a/arch/powerpc/include/asm/cpu_has_feature.h b/arch/powerpc/include/asm/cpu_has_feature.h
index bf8a228229fa..604fa3b6c33d 100644
--- a/arch/powerpc/include/asm/cpu_has_feature.h
+++ b/arch/powerpc/include/asm/cpu_has_feature.h
@@ -2,7 +2,7 @@
#ifndef __ASM_POWERPC_CPU_HAS_FEATURE_H
#define __ASM_POWERPC_CPU_HAS_FEATURE_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/bug.h>
#include <asm/cputable.h>
@@ -51,5 +51,5 @@ static __always_inline bool cpu_has_feature(unsigned long feature)
}
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __ASM_POWERPC_CPU_HAS_FEATURE_H */
diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h
index 0cce5dc7fb1c..054cd2fcfd55 100644
--- a/arch/powerpc/include/asm/cpuidle.h
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -68,7 +68,7 @@
#define ERR_EC_ESL_MISMATCH -1
#define ERR_DEEP_STATE_ESL_MISMATCH -2
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define PNV_IDLE_NAME_LEN 16
struct pnv_idle_states_t {
diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index 29a529d2ab8b..ec16c12296da 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -7,7 +7,7 @@
#include <uapi/asm/cputable.h>
#include <asm/asm-const.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/* This structure can grow, it's real size is used by head.S code
* via the mkdefs mechanism.
@@ -103,7 +103,7 @@ extern void cpu_feature_keys_init(void);
static inline void cpu_feature_keys_init(void) { }
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
/* CPU kernel features */
@@ -195,7 +195,7 @@ static inline void cpu_feature_keys_init(void) { }
#define CPU_FTR_DEXCR_NPHIE LONG_ASM_CONST(0x0010000000000000)
#define CPU_FTR_P11_PVR LONG_ASM_CONST(0x0020000000000000)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define CPU_FTR_PPCAS_ARCH_V2 (CPU_FTR_NOEXECUTE)
@@ -602,6 +602,6 @@ enum {
*/
#define HBP_NUM_MAX 2
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* __ASM_POWERPC_CPUTABLE_H */
diff --git a/arch/powerpc/include/asm/cputhreads.h b/arch/powerpc/include/asm/cputhreads.h
index f26c430f3982..d06f2b20b810 100644
--- a/arch/powerpc/include/asm/cputhreads.h
+++ b/arch/powerpc/include/asm/cputhreads.h
@@ -2,7 +2,7 @@
#ifndef _ASM_POWERPC_CPUTHREADS_H
#define _ASM_POWERPC_CPUTHREADS_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/cpumask.h>
#include <asm/cpu_has_feature.h>
@@ -107,7 +107,7 @@ static inline u32 get_tensr(void)
void book3e_start_thread(int thread, unsigned long addr);
void book3e_stop_thread(int thread);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#define INVALID_THREAD_HWID 0x0fff
diff --git a/arch/powerpc/include/asm/dbell.h b/arch/powerpc/include/asm/dbell.h
index 3e9da22a2779..0b9ef726f92c 100644
--- a/arch/powerpc/include/asm/dbell.h
+++ b/arch/powerpc/include/asm/dbell.h
@@ -40,12 +40,6 @@ static inline void _ppc_msgsnd(u32 msg)
: : "i" (CPU_FTR_HVMODE), "r" (msg));
}
-/* sync before sending message */
-static inline void ppc_msgsnd_sync(void)
-{
- __asm__ __volatile__ ("sync" : : : "memory");
-}
-
/* sync after taking message interrupt */
static inline void ppc_msgsync(void)
{
@@ -76,12 +70,6 @@ static inline void _ppc_msgsnd(u32 msg)
__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
}
-/* sync before sending message */
-static inline void ppc_msgsnd_sync(void)
-{
- __asm__ __volatile__ ("sync" : : : "memory");
-}
-
/* sync after taking message interrupt */
static inline void ppc_msgsync(void)
{
@@ -91,6 +79,12 @@ static inline void ppc_msgsync(void)
extern void doorbell_exception(struct pt_regs *regs);
+/* sync before sending message */
+static inline void ppc_msgsnd_sync(void)
+{
+ __asm__ __volatile__ ("sync" : : : "memory");
+}
+
static inline void ppc_msgsnd(enum ppc_dbell type, u32 flags, u32 tag)
{
u32 msg = PPC_DBELL_TYPE(type) | (flags & PPC_DBELL_MSG_BRDCAST) |
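
The dbell.h hunks delete two identical copies of ppc_msgsnd_sync() from the configuration-dependent branches and re-add a single shared definition after them; behaviour is unchanged, only the duplication goes away. Shown abstractly below, with a compiler barrier standing in for the real 'sync' instruction and toy names throughout:

#include <stdio.h>

/*
 * Before: the same helper was defined once per configuration branch.
 * After: one definition lives outside the #if/#else, as in the patch.
 */
#ifdef TOY_CONFIG_A
static inline void toy_send(void) { puts("send (variant A)"); }
#else
static inline void toy_send(void) { puts("send (variant B)"); }
#endif

/* Shared by both variants, so defined exactly once. */
static inline void toy_sync_before_send(void)
{
	__asm__ __volatile__("" ::: "memory");	/* stand-in for the real 'sync' */
}

int main(void)
{
	toy_sync_before_send();
	toy_send();
	return 0;
}
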
diff --git a/arch/powerpc/include/asm/dcr-native.h b/arch/powerpc/include/asm/dcr-native.h
index a92059964579..65b3fc2dc404 100644
--- a/arch/powerpc/include/asm/dcr-native.h
+++ b/arch/powerpc/include/asm/dcr-native.h
@@ -7,7 +7,7 @@
#ifndef _ASM_POWERPC_DCR_NATIVE_H
#define _ASM_POWERPC_DCR_NATIVE_H
#ifdef __KERNEL__
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/spinlock.h>
#include <asm/cputable.h>
@@ -139,6 +139,6 @@ static inline void __dcri_clrset(int base_addr, int base_data, int reg,
DCRN_ ## base ## _CONFIG_DATA, \
reg, clr, set)
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __KERNEL__ */
#endif /* _ASM_POWERPC_DCR_NATIVE_H */
diff --git a/arch/powerpc/include/asm/dcr.h b/arch/powerpc/include/asm/dcr.h
index 180021cd0b30..3c0fac2cc2b2 100644
--- a/arch/powerpc/include/asm/dcr.h
+++ b/arch/powerpc/include/asm/dcr.h
@@ -7,7 +7,7 @@
#ifndef _ASM_POWERPC_DCR_H
#define _ASM_POWERPC_DCR_H
#ifdef __KERNEL__
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef CONFIG_PPC_DCR
#include <asm/dcr-native.h>
@@ -28,6 +28,6 @@ extern unsigned int dcr_resource_start(const struct device_node *np,
extern unsigned int dcr_resource_len(const struct device_node *np,
unsigned int index);
#endif /* CONFIG_PPC_DCR */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __KERNEL__ */
#endif /* _ASM_POWERPC_DCR_H */
diff --git a/arch/powerpc/include/asm/epapr_hcalls.h b/arch/powerpc/include/asm/epapr_hcalls.h
index cdf3c6df5123..8fc5aaa4bbba 100644
--- a/arch/powerpc/include/asm/epapr_hcalls.h
+++ b/arch/powerpc/include/asm/epapr_hcalls.h
@@ -52,7 +52,7 @@
#include <uapi/asm/epapr_hcalls.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
#include <linux/errno.h>
#include <asm/byteorder.h>
@@ -571,5 +571,5 @@ static inline long epapr_hypercall4(unsigned int nr, unsigned long p1,
in[3] = p4;
return epapr_hypercall(in, out, nr);
}
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _EPAPR_HCALLS_H */
diff --git a/arch/powerpc/include/asm/exception-64e.h b/arch/powerpc/include/asm/exception-64e.h
index b1ef1e92c34a..1a83b1ff3578 100644
--- a/arch/powerpc/include/asm/exception-64e.h
+++ b/arch/powerpc/include/asm/exception-64e.h
@@ -149,7 +149,7 @@ exc_##label##_book3e:
addi r11,r13,PACA_EXTLB; \
TLB_MISS_RESTORE(r11)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
extern unsigned int interrupt_base_book3e;
#endif
diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index bb6f78fcf981..a9437e89f69f 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -53,7 +53,7 @@
*/
#define MAX_MCE_DEPTH 4
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#define STF_ENTRY_BARRIER_SLOT \
STF_ENTRY_BARRIER_FIXUP_SECTION; \
@@ -170,9 +170,9 @@
RFSCV; \
b rfscv_flush_fallback
-#else /* __ASSEMBLY__ */
+#else /* __ASSEMBLER__ */
/* Prototype for function defined in exceptions-64s.S */
void do_uaccess_flush(void);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_POWERPC_EXCEPTION_H */
diff --git a/arch/powerpc/include/asm/extable.h b/arch/powerpc/include/asm/extable.h
index 26ce2e5c0fa8..d483a9c24ba9 100644
--- a/arch/powerpc/include/asm/extable.h
+++ b/arch/powerpc/include/asm/extable.h
@@ -17,7 +17,7 @@
#define ARCH_HAS_RELATIVE_EXTABLE
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct exception_table_entry {
int insn;
diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h
index 17d168dd8b49..756a6c694018 100644
--- a/arch/powerpc/include/asm/feature-fixups.h
+++ b/arch/powerpc/include/asm/feature-fixups.h
@@ -168,7 +168,7 @@ label##5: \
#define ALT_FW_FTR_SECTION_END_IFCLR(msk) \
ALT_FW_FTR_SECTION_END_NESTED_IFCLR(msk, 97)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define ASM_FTR_IF(section_if, section_else, msk, val) \
stringify_in_c(BEGIN_FTR_SECTION) \
@@ -196,7 +196,7 @@ label##5: \
#define ASM_MMU_FTR_IFCLR(section_if, section_else, msk) \
ASM_MMU_FTR_IF(section_if, section_else, (msk), 0)
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
/* LWSYNC feature sections */
#define START_LWSYNC_SECTION(label) label##1:
@@ -276,7 +276,7 @@ label##3: \
FTR_ENTRY_OFFSET 956b-957b; \
.popsection;
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
extern long stf_barrier_fallback;
diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h
index 69ae9cf57d50..abd7c56f4d55 100644
--- a/arch/powerpc/include/asm/firmware.h
+++ b/arch/powerpc/include/asm/firmware.h
@@ -58,7 +58,7 @@
#define FW_FEATURE_WATCHDOG ASM_CONST(0x0000080000000000)
#define FW_FEATURE_PLPKS ASM_CONST(0x0000100000000000)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
enum {
#ifdef CONFIG_PPC64
@@ -146,6 +146,6 @@ void pseries_probe_fw_features(void);
static inline void pseries_probe_fw_features(void) { }
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __KERNEL__ */
#endif /* __ASM_POWERPC_FIRMWARE_H */
diff --git a/arch/powerpc/include/asm/fixmap.h b/arch/powerpc/include/asm/fixmap.h
index f9068dd8dfce..bc5109eab5b7 100644
--- a/arch/powerpc/include/asm/fixmap.h
+++ b/arch/powerpc/include/asm/fixmap.h
@@ -14,7 +14,7 @@
#ifndef _ASM_FIXMAP_H
#define _ASM_FIXMAP_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/sizes.h>
#include <linux/pgtable.h>
#include <asm/page.h>
@@ -111,5 +111,5 @@ static inline void __set_fixmap(enum fixed_addresses idx,
#define VIRT_IMMR_BASE (__fix_to_virt(FIX_IMMR_BASE))
#endif
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif
diff --git a/arch/powerpc/include/asm/fprobe.h b/arch/powerpc/include/asm/fprobe.h
new file mode 100644
index 000000000000..d64bc28fb3d3
--- /dev/null
+++ b/arch/powerpc/include/asm/fprobe.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_PPC_FPROBE_H
+#define _ASM_PPC_FPROBE_H
+
+#include <asm-generic/fprobe.h>
+
+#ifdef CONFIG_64BIT
+#undef FPROBE_HEADER_MSB_PATTERN
+#define FPROBE_HEADER_MSB_PATTERN (PAGE_OFFSET & ~FPROBE_HEADER_MSB_MASK)
+#endif
+
+#endif /* _ASM_PPC_FPROBE_H */
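
The new header overrides FPROBE_HEADER_MSB_PATTERN so the bits fprobe reserves for its header match the fixed upper bits of powerpc kernel addresses, derived here from PAGE_OFFSET. The underlying trick is generic: if every valid address shares a known MSB pattern, those bits can be masked off for storage and restored on the way back. A hedged sketch with made-up constants:

#include <stdint.h>
#include <stdio.h>

/* Invented values; the kernel derives these from PAGE_OFFSET and the MSB mask. */
#define MSB_MASK	0xf000000000000000ULL	/* bits reserved for the header       */
#define MSB_PATTERN	0xc000000000000000ULL	/* fixed top bits of real addresses   */

static uint64_t pack(uint64_t addr)
{
	/* Drop the fixed top bits so they can carry other data. */
	return addr & ~MSB_MASK;
}

static uint64_t unpack(uint64_t stored)
{
	/* Restore the known pattern to recover the original address. */
	return stored | MSB_PATTERN;
}

int main(void)
{
	uint64_t addr = 0xc00000000123abcdULL;

	printf("round trip ok: %d\n", unpack(pack(addr)) == addr);
	return 0;
}
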
diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h
index 82da7c7a1d12..5984eaa75ce8 100644
--- a/arch/powerpc/include/asm/ftrace.h
+++ b/arch/powerpc/include/asm/ftrace.h
@@ -15,7 +15,7 @@
#define FTRACE_MCOUNT_MAX_OFFSET 8
#endif
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
extern void _mcount(void);
unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip,
@@ -50,6 +50,21 @@ static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *
asm volatile("mfmsr %0" : "=r" ((_regs)->msr)); \
} while (0)
+#undef ftrace_regs_get_return_value
+static __always_inline unsigned long
+ftrace_regs_get_return_value(const struct ftrace_regs *fregs)
+{
+ return arch_ftrace_regs(fregs)->regs.gpr[3];
+}
+#define ftrace_regs_get_return_value ftrace_regs_get_return_value
+
+#undef ftrace_regs_get_frame_pointer
+static __always_inline unsigned long
+ftrace_regs_get_frame_pointer(const struct ftrace_regs *fregs)
+{
+ return arch_ftrace_regs(fregs)->regs.gpr[1];
+}
+
static __always_inline void
ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs,
unsigned long ip)
@@ -69,14 +84,14 @@ struct ftrace_ops;
void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs);
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
#define ARCH_SUPPORTS_FTRACE_OPS 1
#endif
#endif /* CONFIG_FUNCTION_TRACER */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef CONFIG_FTRACE_SYSCALLS
/*
* Some syscall entry functions on powerpc start with "ppc_" (fork and clone,
@@ -160,6 +175,6 @@ static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, unsi
static inline void ftrace_free_init_tramp(void) { }
static inline unsigned long ftrace_call_adjust(unsigned long addr) { return addr; }
#endif
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_POWERPC_FTRACE */
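
The new ftrace_regs helpers above simply read the powerpc ABI registers out of the saved register file: r3 carries the function return value and r1 is the stack pointer used as the frame pointer. A compact sketch of the accessor pattern over a plain struct (userspace, names invented):

#include <stdio.h>

/* Toy stand-in for the saved register file. */
struct toy_regs {
	unsigned long gpr[32];
};

/* Same shape as the patched helpers: thin inline accessors. */
static inline unsigned long toy_return_value(const struct toy_regs *r)
{
	return r->gpr[3];	/* powerpc ABI: return value in r3 */
}

static inline unsigned long toy_frame_pointer(const struct toy_regs *r)
{
	return r->gpr[1];	/* powerpc ABI: stack pointer in r1 */
}

int main(void)
{
	struct toy_regs r = { .gpr = { [1] = 0x7fff0000, [3] = 42 } };

	printf("ret=%lu fp=%#lx\n", toy_return_value(&r), toy_frame_pointer(&r));
	return 0;
}
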
diff --git a/arch/powerpc/include/asm/head-64.h b/arch/powerpc/include/asm/head-64.h
index d73153b0275d..3966bd5810cb 100644
--- a/arch/powerpc/include/asm/head-64.h
+++ b/arch/powerpc/include/asm/head-64.h
@@ -4,7 +4,7 @@
#include <asm/cache.h>
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
/*
* We can't do CPP stringification and concatination directly into the section
* name for some reason, so these macros can do it for us.
@@ -167,6 +167,6 @@ name:
// find label from _within_ sname
#define ABS_ADDR(label, sname) (label - start_ ## sname + sname ## _start)
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_POWERPC_HEAD_64_H */
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index ea6c8dc400d2..9aef16149d92 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -534,7 +534,7 @@
#define H_HTM_TARGET_NODAL_CHIP_INDEX(x) ((unsigned long)(x)<<(63-31))
#define H_HTM_TARGET_CORE_INDEX_ON_CHIP(x) ((unsigned long)(x)<<(63-47))
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
/**
@@ -735,6 +735,6 @@ struct hv_gpci_request_buffer {
uint8_t bytes[HGPCI_MAX_DATA_BYTES];
} __packed;
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __KERNEL__ */
#endif /* _ASM_POWERPC_HVCALL_H */
diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index 569ac1165b06..1078ba88efaf 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -59,7 +59,7 @@
#define IRQS_PMI_DISABLED 2
#define IRQS_ALL_DISABLED (IRQS_DISABLED | IRQS_PMI_DISABLED)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
static inline void __hard_irq_enable(void)
{
@@ -516,6 +516,6 @@ static inline unsigned long mtmsr_isync_irqsafe(unsigned long msr)
#define ARCH_IRQ_INIT_FLAGS IRQ_NOREQUEST
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __KERNEL__ */
#endif /* _ASM_POWERPC_HW_IRQ_H */
diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
index 23638d4e73ac..eb0e4a20b818 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -64,7 +64,7 @@
#define INTERRUPT_DATA_LOAD_TLB_MISS_603 0x1100
#define INTERRUPT_DATA_STORE_TLB_MISS_603 0x1200
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/context_tracking.h>
#include <linux/hardirq.h>
@@ -675,6 +675,6 @@ unsigned long interrupt_exit_user_restart(struct pt_regs *regs);
unsigned long interrupt_exit_kernel_restart(struct pt_regs *regs);
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_POWERPC_INTERRUPT_H */
diff --git a/arch/powerpc/include/asm/irqflags.h b/arch/powerpc/include/asm/irqflags.h
index 47d46712928a..1351fb40fe74 100644
--- a/arch/powerpc/include/asm/irqflags.h
+++ b/arch/powerpc/include/asm/irqflags.h
@@ -5,7 +5,7 @@
#ifndef _ASM_IRQFLAGS_H
#define _ASM_IRQFLAGS_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* Get definitions for arch_local_save_flags(x), etc.
*/
diff --git a/arch/powerpc/include/asm/jump_label.h b/arch/powerpc/include/asm/jump_label.h
index 2f2a86ed2280..d4eaba459a0e 100644
--- a/arch/powerpc/include/asm/jump_label.h
+++ b/arch/powerpc/include/asm/jump_label.h
@@ -6,7 +6,7 @@
* Copyright 2010 Michael Ellerman, IBM Corp.
*/
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
#include <asm/feature-fixups.h>
diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h
index b5bbb94c51f6..db1214944622 100644
--- a/arch/powerpc/include/asm/kasan.h
+++ b/arch/powerpc/include/asm/kasan.h
@@ -12,7 +12,7 @@
#define EXPORT_SYMBOL_KASAN(fn)
#endif
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/page.h>
#include <linux/sizes.h>
@@ -80,5 +80,5 @@ void kasan_update_early_region(unsigned long k_start, unsigned long k_end, pte_t
int kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_end);
int kasan_init_region(void *start, size_t size);
-#endif /* __ASSEMBLY */
+#endif /* __ASSEMBLER__ */
#endif
diff --git a/arch/powerpc/include/asm/kdump.h b/arch/powerpc/include/asm/kdump.h
index fd128d1e52b3..802644178f43 100644
--- a/arch/powerpc/include/asm/kdump.h
+++ b/arch/powerpc/include/asm/kdump.h
@@ -31,7 +31,7 @@
#endif /* CONFIG_CRASH_DUMP */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#if defined(CONFIG_CRASH_DUMP) && !defined(CONFIG_NONSTATIC_KERNEL)
extern void reserve_kdump_trampoline(void);
@@ -42,6 +42,6 @@ static inline void reserve_kdump_trampoline(void) { ; }
static inline void setup_kdump_trampoline(void) { ; }
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __PPC64_KDUMP_H */
diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index 70f2f0517509..4bbf9f699aaa 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -49,7 +49,7 @@
#define KEXEC_STATE_IRQS_OFF 1
#define KEXEC_STATE_REAL_MODE 2
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/reg.h>
typedef void (*crash_shutdown_t)(void);
@@ -210,6 +210,6 @@ static inline void reset_sprs(void)
}
#endif
-#endif /* ! __ASSEMBLY__ */
+#endif /* ! __ASSEMBLER__ */
#endif /* __KERNEL__ */
#endif /* _ASM_POWERPC_KEXEC_H */
diff --git a/arch/powerpc/include/asm/kgdb.h b/arch/powerpc/include/asm/kgdb.h
index 715c18b75334..f39531903325 100644
--- a/arch/powerpc/include/asm/kgdb.h
+++ b/arch/powerpc/include/asm/kgdb.h
@@ -21,7 +21,7 @@
#ifndef __POWERPC_KGDB_H__
#define __POWERPC_KGDB_H__
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define BREAK_INSTR_SIZE 4
#define BUFMAX ((NUMREGBYTES * 2) + 512)
@@ -62,6 +62,6 @@ static inline void arch_kgdb_breakpoint(void)
/* CR/LR, R1, R2, R13-R31 inclusive. */
#define NUMCRITREGBYTES (23 * sizeof(int))
#endif /* 32/64 */
-#endif /* !(__ASSEMBLY__) */
+#endif /* !(__ASSEMBLER__) */
#endif /* !__POWERPC_KGDB_H__ */
#endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h
index 2bb03d941e3e..dab63b82a8d4 100644
--- a/arch/powerpc/include/asm/kup.h
+++ b/arch/powerpc/include/asm/kup.h
@@ -6,7 +6,7 @@
#define KUAP_WRITE 2
#define KUAP_READ_WRITE (KUAP_READ | KUAP_WRITE)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
static __always_inline bool kuap_is_disabled(void);
@@ -28,14 +28,14 @@ static __always_inline bool kuap_is_disabled(void);
#include <asm/book3s/32/kup.h>
#endif
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#ifndef CONFIG_PPC_KUAP
.macro kuap_check_amr gpr1, gpr2
.endm
#endif
-#else /* !__ASSEMBLY__ */
+#else /* !__ASSEMBLER__ */
extern bool disable_kuep;
extern bool disable_kuap;
@@ -181,6 +181,6 @@ static __always_inline void prevent_current_write_to_user(void)
prevent_user_access(KUAP_WRITE);
}
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_POWERPC_KUAP_H_ */
diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index d68d71987d5c..f9af8df09077 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -9,7 +9,7 @@
#ifndef __POWERPC_KVM_ASM_H__
#define __POWERPC_KVM_ASM_H__
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#ifdef CONFIG_64BIT
#define PPC_STD(sreg, offset, areg) std sreg, (offset)(areg)
#define PPC_LD(treg, offset, areg) ld treg, (offset)(areg)
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index a36797938620..3435fe144908 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -20,7 +20,7 @@
/* Maximum number of subcores per physical core */
#define MAX_SUBCORES 4
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#ifdef CONFIG_KVM_BOOK3S_HANDLER
@@ -58,7 +58,7 @@ kvmppc_resume_\intno:
#endif /* CONFIG_KVM_BOOK3S_HANDLER */
-#else /*__ASSEMBLY__ */
+#else /*__ASSEMBLER__ */
struct kvmppc_vcore;
@@ -150,7 +150,7 @@ struct kvmppc_book3s_shadow_vcpu {
#endif
};
-#endif /*__ASSEMBLY__ */
+#endif /*__ASSEMBLER__ */
/* Values for kvm_state */
#define KVM_HWTHREAD_IN_KERNEL 0
diff --git a/arch/powerpc/include/asm/kvm_booke_hv_asm.h b/arch/powerpc/include/asm/kvm_booke_hv_asm.h
index 7487ef582121..3acf2995d364 100644
--- a/arch/powerpc/include/asm/kvm_booke_hv_asm.h
+++ b/arch/powerpc/include/asm/kvm_booke_hv_asm.h
@@ -8,7 +8,7 @@
#include <asm/feature-fixups.h>
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
/*
* All exceptions from guest state must go through KVM
@@ -64,5 +64,5 @@ END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
#endif
.endm
-#endif /*__ASSEMBLY__ */
+#endif /*__ASSEMBLER__ */
#endif /* ASM_KVM_BOOKE_HV_ASM_H */
diff --git a/arch/powerpc/include/asm/lv1call.h b/arch/powerpc/include/asm/lv1call.h
index b11501b30193..ae70120953a8 100644
--- a/arch/powerpc/include/asm/lv1call.h
+++ b/arch/powerpc/include/asm/lv1call.h
@@ -10,7 +10,7 @@
#if !defined(_ASM_POWERPC_LV1CALL_H)
#define _ASM_POWERPC_LV1CALL_H
-#if !defined(__ASSEMBLY__)
+#if !defined(__ASSEMBLER__)
#include <linux/types.h>
#include <linux/export.h>
@@ -211,7 +211,7 @@
{return _lv1_##name(LV1_##in##_IN_##out##_OUT_ARGS);}
#endif
-#endif /* !defined(__ASSEMBLY__) */
+#endif /* !defined(__ASSEMBLER__) */
/* lv1 call table */
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index 4182d68d9cd1..5f9c5d436e17 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -137,7 +137,7 @@
MMU_FTR_CI_LARGE_PAGE
#define MMU_FTRS_PA6T MMU_FTRS_DEFAULT_HPTE_ARCH_V2 | \
MMU_FTR_CI_LARGE_PAGE | MMU_FTR_NO_SLBIE_B
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/bug.h>
#include <asm/cputable.h>
#include <asm/page.h>
@@ -332,7 +332,7 @@ static inline bool strict_module_rwx_enabled(void)
{
return IS_ENABLED(CONFIG_STRICT_MODULE_RWX) && strict_kernel_rwx_enabled();
}
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
/* The kernel use the constants below to index in the page sizes array.
* The use of fixed constants for this purpose is better for performances
@@ -377,7 +377,7 @@ static inline bool strict_module_rwx_enabled(void)
#include <asm/book3s/64/mmu.h>
#else /* CONFIG_PPC_BOOK3S_64 */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/* MMU initialization */
extern void early_init_mmu(void);
extern void early_init_mmu_secondary(void);
@@ -388,7 +388,7 @@ static inline void mmu_early_init_devtree(void) { }
static inline void pkey_early_init_devtree(void) {}
extern void *abatron_pteptrs[2];
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif
#if defined(CONFIG_PPC_BOOK3S_32)
diff --git a/arch/powerpc/include/asm/module.h b/arch/powerpc/include/asm/module.h
index e1ee5026ac4a..864e22deaa2c 100644
--- a/arch/powerpc/include/asm/module.h
+++ b/arch/powerpc/include/asm/module.h
@@ -27,6 +27,7 @@ struct ppc_plt_entry {
struct mod_arch_specific {
#ifdef __powerpc64__
unsigned int stubs_section; /* Index of stubs section in module */
+ unsigned int stub_count; /* Number of stubs used */
#ifdef CONFIG_PPC_KERNEL_PCREL
unsigned int got_section; /* What section is the GOT? */
unsigned int pcpu_section; /* .data..percpu section */
diff --git a/arch/powerpc/include/asm/mpc52xx.h b/arch/powerpc/include/asm/mpc52xx.h
index 01ae6c351e50..d7ffbd06797d 100644
--- a/arch/powerpc/include/asm/mpc52xx.h
+++ b/arch/powerpc/include/asm/mpc52xx.h
@@ -13,10 +13,10 @@
#ifndef __ASM_POWERPC_MPC52xx_H__
#define __ASM_POWERPC_MPC52xx_H__
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/types.h>
#include <asm/mpc5xxx.h>
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#include <linux/suspend.h>
@@ -30,7 +30,7 @@
/* Structures mapping of some unit register set */
/* ======================================================================== */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/* Memory Mapping Control */
struct mpc52xx_mmap_ctl {
@@ -258,14 +258,14 @@ struct mpc52xx_intr {
u32 per_error; /* INTR + 0x38 */
};
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
/* ========================================================================= */
/* Prototypes for MPC52xx sysdev */
/* ========================================================================= */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct device_node;
@@ -297,7 +297,7 @@ extern void __init mpc52xx_setup_pci(void);
static inline void mpc52xx_setup_pci(void) { }
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#ifdef CONFIG_PM
struct mpc52xx_suspend {
diff --git a/arch/powerpc/include/asm/nohash/32/kup-8xx.h b/arch/powerpc/include/asm/nohash/32/kup-8xx.h
index 46bc5925e5fd..08486b15b207 100644
--- a/arch/powerpc/include/asm/nohash/32/kup-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/kup-8xx.h
@@ -7,7 +7,7 @@
#ifdef CONFIG_PPC_KUAP
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/reg.h>
@@ -82,7 +82,7 @@ __bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write)
return !((regs->kuap ^ MD_APG_KUAP) & 0xff000000);
}
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* CONFIG_PPC_KUAP */
diff --git a/arch/powerpc/include/asm/nohash/32/mmu-44x.h b/arch/powerpc/include/asm/nohash/32/mmu-44x.h
index 2d92a39d8f2e..c3d192194324 100644
--- a/arch/powerpc/include/asm/nohash/32/mmu-44x.h
+++ b/arch/powerpc/include/asm/nohash/32/mmu-44x.h
@@ -100,7 +100,7 @@
#define PPC47x_TLB2_S_RW (PPC47x_TLB2_SW | PPC47x_TLB2_SR)
#define PPC47x_TLB2_IMG (PPC47x_TLB2_I | PPC47x_TLB2_M | PPC47x_TLB2_G)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
extern unsigned int tlb_44x_hwater;
extern unsigned int tlb_44x_index;
@@ -114,7 +114,7 @@ typedef struct {
/* patch sites */
extern s32 patch__tlb_44x_hwater_D, patch__tlb_44x_hwater_I;
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#ifndef CONFIG_PPC_EARLY_DEBUG_44x
#define PPC44x_EARLY_TLBS 1
diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
index 2986f9ba40b8..f19115db8072 100644
--- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
@@ -174,7 +174,7 @@
#define MODULES_SIZE (CONFIG_MODULES_SIZE * SZ_1M)
#define MODULES_VADDR (MODULES_END - MODULES_SIZE)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/mmdebug.h>
#include <linux/sizes.h>
@@ -265,6 +265,6 @@ static inline int arch_vmap_pte_supported_shift(unsigned long size)
extern s32 patch__itlbmiss_exit_1, patch__dtlbmiss_exit_1;
extern s32 patch__itlbmiss_perf, patch__dtlbmiss_perf;
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_POWERPC_MMU_8XX_H_ */
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
index b481738c4bb5..2d71e4b7cd09 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -4,12 +4,12 @@
#include <asm-generic/pgtable-nopmd.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/sched.h>
#include <linux/threads.h>
#include <asm/mmu.h> /* For sub-arch specific PPC_PIN_SIZE */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#define PTE_INDEX_SIZE PTE_SHIFT
#define PMD_INDEX_SIZE 0
@@ -19,14 +19,14 @@
#define PMD_CACHE_INDEX PMD_INDEX_SIZE
#define PUD_CACHE_INDEX PUD_INDEX_SIZE
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define PTE_TABLE_SIZE (sizeof(pte_t) << PTE_INDEX_SIZE)
#define PMD_TABLE_SIZE 0
#define PUD_TABLE_SIZE 0
#define PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE)
#define PMD_MASKED_BITS (PTE_TABLE_SIZE - 1)
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#define PTRS_PER_PTE (1 << PTE_INDEX_SIZE)
#define PTRS_PER_PGD (1 << PGD_INDEX_SIZE)
@@ -149,7 +149,7 @@
#define MAX_POSSIBLE_PHYSMEM_BITS 32
#endif
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define pmd_none(pmd) (!pmd_val(pmd))
#define pmd_bad(pmd) (pmd_val(pmd) & _PMD_BAD)
@@ -199,6 +199,6 @@ static inline void pmd_clear(pmd_t *pmdp)
/* We borrow LSB 2 to store the exclusive marker in swap PTEs. */
#define _PAGE_SWP_EXCLUSIVE 0x000004
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* __ASM_POWERPC_NOHASH_32_PGTABLE_H */
diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
index 54ebb91dbdcf..e2ea8ba9f8ca 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
@@ -83,7 +83,7 @@
#include <asm/pgtable-masks.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
static inline pte_t pte_wrprotect(pte_t pte)
{
return __pte(pte_val(pte) | _PAGE_RO);
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable-4k.h b/arch/powerpc/include/asm/nohash/64/pgtable-4k.h
index 10f5cf444d72..fb6fa1d4e074 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable-4k.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable-4k.h
@@ -14,12 +14,12 @@
#define PUD_INDEX_SIZE 9
#define PGD_INDEX_SIZE 9
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define PTE_TABLE_SIZE (sizeof(pte_t) << PTE_INDEX_SIZE)
#define PMD_TABLE_SIZE (sizeof(pmd_t) << PMD_INDEX_SIZE)
#define PUD_TABLE_SIZE (sizeof(pud_t) << PUD_INDEX_SIZE)
#define PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE)
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#define PTRS_PER_PTE (1 << PTE_INDEX_SIZE)
#define PTRS_PER_PMD (1 << PMD_INDEX_SIZE)
@@ -57,7 +57,7 @@
#define p4d_bad(p4d) (p4d_val(p4d) == 0)
#define p4d_present(p4d) (p4d_val(p4d) != 0)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
static inline pud_t *p4d_pgtable(p4d_t p4d)
{
@@ -80,7 +80,7 @@ static inline p4d_t pte_p4d(pte_t pte)
}
extern struct page *p4d_page(p4d_t p4d);
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#define pud_ERROR(e) \
pr_err("%s:%d: bad pud %08lx.\n", __FILE__, __LINE__, pud_val(e))
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index 2202c78730e8..2deb955b7bc8 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -77,7 +77,7 @@
#define H_PAGE_4K_PFN 0
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/* pte_clear moved to later in this file */
#define PMD_BAD_BITS (PTE_TABLE_SIZE-1)
@@ -209,6 +209,6 @@ void __patch_exception(int exc, unsigned long addr);
__patch_exception((exc), (unsigned long)&name); \
} while (0)
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_POWERPC_NOHASH_64_PGTABLE_H */
diff --git a/arch/powerpc/include/asm/nohash/kup-booke.h b/arch/powerpc/include/asm/nohash/kup-booke.h
index 0c7c3258134c..d6bbb6d78bbe 100644
--- a/arch/powerpc/include/asm/nohash/kup-booke.h
+++ b/arch/powerpc/include/asm/nohash/kup-booke.h
@@ -7,7 +7,7 @@
#ifdef CONFIG_PPC_KUAP
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
.macro kuap_check_amr gpr1, gpr2
.endm
@@ -105,7 +105,7 @@ __bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write)
return !regs->kuap;
}
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* CONFIG_PPC_KUAP */
diff --git a/arch/powerpc/include/asm/nohash/mmu-e500.h b/arch/powerpc/include/asm/nohash/mmu-e500.h
index b281d9eeaf1e..2fad5ff426a0 100644
--- a/arch/powerpc/include/asm/nohash/mmu-e500.h
+++ b/arch/powerpc/include/asm/nohash/mmu-e500.h
@@ -230,7 +230,7 @@
#define MAS2_M_IF_NEEDED 0
#endif
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/bug.h>
extern unsigned int tlbcam_index;
@@ -318,6 +318,6 @@ extern int book3e_htw_mode;
#include <asm/percpu.h>
DECLARE_PER_CPU(int, next_tlbcam_idx);
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_POWERPC_MMU_BOOK3E_H_ */
diff --git a/arch/powerpc/include/asm/nohash/pgalloc.h b/arch/powerpc/include/asm/nohash/pgalloc.h
index bb5f3e8ea912..4ef780b291bc 100644
--- a/arch/powerpc/include/asm/nohash/pgalloc.h
+++ b/arch/powerpc/include/asm/nohash/pgalloc.h
@@ -22,7 +22,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
pgd_t *pgd = kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE),
pgtable_gfp_flags(mm, GFP_KERNEL));
-#if defined(CONFIG_PPC_8xx) || defined(CONFIG_PPC_BOOK3S_603)
+#ifdef CONFIG_PPC_8xx
memcpy(pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD,
(MAX_PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
#endif
diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h
index 7d6b9e5b286e..5af168b7f292 100644
--- a/arch/powerpc/include/asm/nohash/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/pgtable.h
@@ -2,7 +2,7 @@
#ifndef _ASM_POWERPC_NOHASH_PGTABLE_H
#define _ASM_POWERPC_NOHASH_PGTABLE_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, pte_t *p,
unsigned long clr, unsigned long set, int huge);
#endif
@@ -27,7 +27,7 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p
#define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO)
#define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
extern int icache_44x_need_flush;
@@ -373,5 +373,5 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
void unmap_kernel_page(unsigned long va);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif
diff --git a/arch/powerpc/include/asm/nohash/pte-e500.h b/arch/powerpc/include/asm/nohash/pte-e500.h
index cb78392494da..b61efc3ee904 100644
--- a/arch/powerpc/include/asm/nohash/pte-e500.h
+++ b/arch/powerpc/include/asm/nohash/pte-e500.h
@@ -86,7 +86,7 @@
#include <asm/pgtable-masks.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
static inline pte_t pte_mkexec(pte_t pte)
{
return __pte((pte_val(pte) & ~_PAGE_BAP_SX) | _PAGE_BAP_UX);
@@ -134,7 +134,7 @@ static inline unsigned long pud_leaf_size(pud_t pud)
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __KERNEL__ */
#endif /* _ASM_POWERPC_NOHASH_PTE_E500_H */
diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
index 8c9d4b26bf57..d3eaa3425797 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -246,7 +246,7 @@
#define OPAL_CONFIG_IDLE_UNDO 0
#define OPAL_CONFIG_IDLE_APPLY 1
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/* Other enums */
enum OpalFreezeState {
@@ -1183,6 +1183,6 @@ struct opal_mpipl_fadump {
struct opal_mpipl_region region[];
} __packed;
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __OPAL_API_H */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index af304e6cb486..0a398265ba04 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -10,7 +10,7 @@
#include <asm/opal-api.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/notifier.h>
@@ -390,6 +390,6 @@ void opal_powercap_init(void);
void opal_psr_init(void);
void opal_sensor_groups_init(void);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_POWERPC_OPAL_H */
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index af9a2628d1df..b28fbb1d57eb 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -6,7 +6,7 @@
* Copyright (C) 2001,2005 IBM Corporation.
*/
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/bug.h>
@@ -23,7 +23,7 @@
*/
#include <vdso/page.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifndef CONFIG_HUGETLB_PAGE
#define HPAGE_SHIFT PAGE_SHIFT
#elif defined(CONFIG_PPC_BOOK3S_64)
@@ -75,7 +75,7 @@ extern unsigned int hpage_shift;
#define LOAD_OFFSET ASM_CONST((CONFIG_KERNEL_START-CONFIG_PHYSICAL_START))
#if defined(CONFIG_NONSTATIC_KERNEL)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
extern phys_addr_t memstart_addr;
extern phys_addr_t kernstart_addr;
@@ -84,7 +84,7 @@ extern phys_addr_t kernstart_addr;
extern long long virt_phys_offset;
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#define PHYSICAL_START kernstart_addr
#else /* !CONFIG_NONSTATIC_KERNEL */
@@ -216,7 +216,7 @@ extern long long virt_phys_offset;
#endif
#endif
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
static inline unsigned long virt_to_pfn(const void *kaddr)
{
return __pa(kaddr) >> PAGE_SHIFT;
@@ -261,7 +261,7 @@ static inline const void *pfn_to_kaddr(unsigned long pfn)
#define is_kernel_addr(x) ((x) >= TASK_SIZE)
#endif
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef CONFIG_PPC_BOOK3S_64
#include <asm/pgtable-be-types.h>
@@ -290,6 +290,6 @@ static inline unsigned long kaslr_offset(void)
}
#include <asm-generic/memory_model.h>
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_POWERPC_PAGE_H */
diff --git a/arch/powerpc/include/asm/page_32.h b/arch/powerpc/include/asm/page_32.h
index b9ac9e3a771c..25482405a811 100644
--- a/arch/powerpc/include/asm/page_32.h
+++ b/arch/powerpc/include/asm/page_32.h
@@ -19,7 +19,7 @@
#define PTE_SHIFT (PAGE_SHIFT - PTE_T_LOG2) /* full page */
#endif
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* The basic type of a PTE - 64 bits for those CPUs with > 32 bit
* physical addressing.
@@ -53,6 +53,6 @@ extern void copy_page(void *to, void *from);
#define PGD_T_LOG2 (__builtin_ffs(sizeof(pgd_t)) - 1)
#define PTE_T_LOG2 (__builtin_ffs(sizeof(pte_t)) - 1)
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_POWERPC_PAGE_32_H */
diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h
index 79a9b7c6a132..0f564a06bf68 100644
--- a/arch/powerpc/include/asm/page_64.h
+++ b/arch/powerpc/include/asm/page_64.h
@@ -35,7 +35,7 @@
#define ESID_MASK_1T 0xffffff0000000000UL
#define GET_ESID_1T(x) (((x) >> SID_SHIFT_1T) & SID_MASK_1T)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/cache.h>
typedef unsigned long pte_basic_t;
@@ -82,7 +82,7 @@ extern void copy_page(void *to, void *from);
/* Log 2 of page table size */
extern u64 ppc64_pft_size;
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#define VM_DATA_DEFAULT_FLAGS \
(is_32bit_task() ? \
diff --git a/arch/powerpc/include/asm/papr-sysparm.h b/arch/powerpc/include/asm/papr-sysparm.h
index c3cd5b131033..a3b5a0d05db6 100644
--- a/arch/powerpc/include/asm/papr-sysparm.h
+++ b/arch/powerpc/include/asm/papr-sysparm.h
@@ -21,6 +21,7 @@ typedef struct {
#define PAPR_SYSPARM_COOP_MEM_OVERCOMMIT_ATTRS mk_papr_sysparm(44)
#define PAPR_SYSPARM_TLB_BLOCK_INVALIDATE_ATTRS mk_papr_sysparm(50)
#define PAPR_SYSPARM_LPAR_NAME mk_papr_sysparm(55)
+#define PAPR_SYSPARM_HVPIPE_ENABLE mk_papr_sysparm(64)
/**
* struct papr_sysparm_buf - RTAS work area layout for system parameter functions.
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 2aa3a091ef20..1dae53130782 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -133,8 +133,6 @@ struct pci_controller {
/* IRQ domain hierarchy */
struct irq_domain *dev_domain;
- struct irq_domain *msi_domain;
- struct fwnode_handle *fwnode;
/* iommu_ops support */
struct iommu_device iommu;
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 93d77ad5a92f..17fd7ff6e535 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -2,7 +2,7 @@
#ifndef _ASM_POWERPC_PGTABLE_H
#define _ASM_POWERPC_PGTABLE_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/mmdebug.h>
#include <linux/mmzone.h>
#include <asm/processor.h> /* For TASK_SIZE */
@@ -12,7 +12,7 @@
struct mm_struct;
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#ifdef CONFIG_PPC_BOOK3S
#include <asm/book3s/pgtable.h>
@@ -20,18 +20,6 @@ struct mm_struct;
#include <asm/nohash/pgtable.h>
#endif /* !CONFIG_PPC_BOOK3S */
-/*
- * Protection used for kernel text. We want the debuggers to be able to
- * set breakpoints anywhere, so don't write protect the kernel text
- * on platforms where such control is possible.
- */
-#if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) || \
- defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE)
-#define PAGE_KERNEL_TEXT PAGE_KERNEL_X
-#else
-#define PAGE_KERNEL_TEXT PAGE_KERNEL_ROX
-#endif
-
/* Make modules code happy. We don't set RO yet */
#define PAGE_KERNEL_EXEC PAGE_KERNEL_X
@@ -39,7 +27,7 @@ struct mm_struct;
#define PAGE_AGP (PAGE_KERNEL_NC)
#define HAVE_PAGE_AGP
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define PFN_PTE_SHIFT PTE_RPN_SHIFT
@@ -214,6 +202,6 @@ static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size)
#endif /* CONFIG_PPC64 */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_POWERPC_PGTABLE_H */
diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index 8053b24afc39..55ca49d18319 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -571,6 +571,7 @@
(0x54000001 | ___PPC_RA(d) | ___PPC_RS(a) | __PPC_SH(i) | __PPC_MB(mb) | __PPC_ME(me))
#define PPC_RAW_RLWIMI(d, a, i, mb, me) (0x50000000 | ___PPC_RA(d) | ___PPC_RS(a) | __PPC_SH(i) | __PPC_MB(mb) | __PPC_ME(me))
#define PPC_RAW_RLDICL(d, a, i, mb) (0x78000000 | ___PPC_RA(d) | ___PPC_RS(a) | __PPC_SH64(i) | __PPC_MB64(mb))
+#define PPC_RAW_RLDICL_DOT(d, a, i, mb) (0x78000000 | ___PPC_RA(d) | ___PPC_RS(a) | __PPC_SH64(i) | __PPC_MB64(mb) | 0x1)
#define PPC_RAW_RLDICR(d, a, i, me) (0x78000004 | ___PPC_RA(d) | ___PPC_RS(a) | __PPC_SH64(i) | __PPC_ME64(me))
/* slwi = rlwinm Rx, Ry, n, 0, 31-n */
diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index b891910fce8a..46947c82a712 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -12,7 +12,7 @@
#include <asm/feature-fixups.h>
#include <asm/extable.h>
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#define SZL (BITS_PER_LONG/8)
@@ -868,7 +868,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_CELL_TB_BUG, CPU_FTR_CELL_TB_BUG, 96)
#endif /* !CONFIG_PPC_BOOK3E_64 */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#define SOFT_MASK_TABLE(_start, _end) \
stringify_in_c(.section __soft_mask_table,"a";)\
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index 6b94de17201c..f156bdb43e2b 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -29,14 +29,14 @@
#ifdef CONFIG_PPC64
/* Default SMT priority is set to 3. Use 11- 13bits to save priority. */
#define PPR_PRIORITY 3
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#define DEFAULT_PPR (PPR_PRIORITY << 50)
#else
#define DEFAULT_PPR ((u64)PPR_PRIORITY << 50)
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* CONFIG_PPC64 */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
#include <linux/thread_info.h>
#include <asm/ptrace.h>
@@ -460,5 +460,5 @@ int enter_vmx_ops(void);
void *exit_vmx_ops(void *dest);
#endif /* __KERNEL__ */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_POWERPC_PROCESSOR_H */
diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h
index 7b9350756875..94aa1de2b06e 100644
--- a/arch/powerpc/include/asm/ptrace.h
+++ b/arch/powerpc/include/asm/ptrace.h
@@ -24,7 +24,7 @@
#include <asm/asm-const.h>
#include <asm/reg.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct pt_regs
{
union {
@@ -165,7 +165,7 @@ struct pt_regs
#define STACK_INT_FRAME_SIZE (KERNEL_REDZONE_SIZE + STACK_USER_INT_FRAME_SIZE)
#define STACK_INT_FRAME_MARKER_LONGS (STACK_INT_FRAME_MARKER/sizeof(long))
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/paca.h>
#ifdef CONFIG_SMP
@@ -414,7 +414,7 @@ static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs, unsig
return 0;
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#ifndef __powerpc64__
/* We need PT_SOFTE defined at all time to avoid #ifdefs */
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 0228c90bbcc7..3fe186635432 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -60,7 +60,7 @@
#define MSR_RI_LG 1 /* Recoverable Exception */
#define MSR_LE_LG 0 /* Little Endian */
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#define __MASK(X) (1<<(X))
#else
#define __MASK(X) (1UL<<(X))
@@ -1358,7 +1358,7 @@
#define PVR_ARCH_31_P11 0x0f000007
/* Macros for setting and retrieving special purpose registers */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#if defined(CONFIG_PPC64) || defined(__CHECKER__)
typedef struct {
@@ -1450,6 +1450,6 @@ extern void scom970_write(unsigned int address, unsigned long value);
struct pt_regs;
extern void ppc_save_regs(struct pt_regs *regs);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __KERNEL__ */
#endif /* _ASM_POWERPC_REG_H */
diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h
index 656bfaf91526..56f9d3b1de85 100644
--- a/arch/powerpc/include/asm/reg_booke.h
+++ b/arch/powerpc/include/asm/reg_booke.h
@@ -576,7 +576,7 @@
#define TEN_THREAD(x) (1 << (x))
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define mftmr(rn) ({unsigned long rval; \
asm volatile(MFTMR(rn, %0) : "=r" (rval)); rval;})
#define mttmr(rn, v) asm volatile(MTTMR(rn, %0) : \
@@ -585,7 +585,7 @@
extern unsigned long global_dbcr0[];
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* __ASM_POWERPC_REG_BOOKE_H__ */
#endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/reg_fsl_emb.h b/arch/powerpc/include/asm/reg_fsl_emb.h
index 9893d2001b68..ec459c3d9498 100644
--- a/arch/powerpc/include/asm/reg_fsl_emb.h
+++ b/arch/powerpc/include/asm/reg_fsl_emb.h
@@ -9,7 +9,7 @@
#include <linux/stringify.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/* Performance Monitor Registers */
static __always_inline unsigned int mfpmr(unsigned int rn)
{
@@ -32,7 +32,7 @@ static __always_inline void mtpmr(unsigned int rn, unsigned int val)
".machine pop;"
: [val] "=r" (val) : [rn] "i" (rn));
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
/* Freescale Book E Performance Monitor APU Registers */
#define PMRN_PMC0 0x010 /* Performance Monitor Counter 0 */
diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 75fa0293c508..d046bbd5017d 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -68,9 +68,11 @@ enum rtas_function_index {
RTAS_FNIDX__IBM_READ_PCI_CONFIG,
RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE,
RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE2,
+ RTAS_FNIDX__IBM_RECEIVE_HVPIPE_MSG,
RTAS_FNIDX__IBM_REMOVE_PE_DMA_WINDOW,
RTAS_FNIDX__IBM_RESET_PE_DMA_WINDOW,
RTAS_FNIDX__IBM_SCAN_LOG_DUMP,
+ RTAS_FNIDX__IBM_SEND_HVPIPE_MSG,
RTAS_FNIDX__IBM_SET_DYNAMIC_INDICATOR,
RTAS_FNIDX__IBM_SET_EEH_OPTION,
RTAS_FNIDX__IBM_SET_SLOT_RESET,
@@ -163,9 +165,11 @@ typedef struct {
#define RTAS_FN_IBM_READ_PCI_CONFIG rtas_fn_handle(RTAS_FNIDX__IBM_READ_PCI_CONFIG)
#define RTAS_FN_IBM_READ_SLOT_RESET_STATE rtas_fn_handle(RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE)
#define RTAS_FN_IBM_READ_SLOT_RESET_STATE2 rtas_fn_handle(RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE2)
+#define RTAS_FN_IBM_RECEIVE_HVPIPE_MSG rtas_fn_handle(RTAS_FNIDX__IBM_RECEIVE_HVPIPE_MSG)
#define RTAS_FN_IBM_REMOVE_PE_DMA_WINDOW rtas_fn_handle(RTAS_FNIDX__IBM_REMOVE_PE_DMA_WINDOW)
#define RTAS_FN_IBM_RESET_PE_DMA_WINDOW rtas_fn_handle(RTAS_FNIDX__IBM_RESET_PE_DMA_WINDOW)
#define RTAS_FN_IBM_SCAN_LOG_DUMP rtas_fn_handle(RTAS_FNIDX__IBM_SCAN_LOG_DUMP)
+#define RTAS_FN_IBM_SEND_HVPIPE_MSG rtas_fn_handle(RTAS_FNIDX__IBM_SEND_HVPIPE_MSG)
#define RTAS_FN_IBM_SET_DYNAMIC_INDICATOR rtas_fn_handle(RTAS_FNIDX__IBM_SET_DYNAMIC_INDICATOR)
#define RTAS_FN_IBM_SET_EEH_OPTION rtas_fn_handle(RTAS_FNIDX__IBM_SET_EEH_OPTION)
#define RTAS_FN_IBM_SET_SLOT_RESET rtas_fn_handle(RTAS_FNIDX__IBM_SET_SLOT_RESET)
@@ -217,6 +221,7 @@ typedef struct {
#define RTAS_HARDWARE_ERROR -1 /* Hardware or other unspecified error. */
#define RTAS_BUSY -2 /* Retry immediately. */
#define RTAS_INVALID_PARAMETER -3 /* Invalid indicator/domain/sensor etc. */
+#define RTAS_FUNC_NOT_SUPPORTED -5 /* Function not supported */
#define RTAS_UNEXPECTED_STATE_CHANGE -7 /* Seems limited to EEH and slot reset. */
#define RTAS_EXTENDED_DELAY_MIN 9900 /* Retry after delaying for ~1ms. */
#define RTAS_EXTENDED_DELAY_MAX 9905 /* Retry after delaying for ~100s. */
@@ -233,6 +238,7 @@ typedef struct {
#define RTAS_EPOW_WARNING 0x40000000 /* set bit 1 */
#define RTAS_HOTPLUG_EVENTS 0x10000000 /* set bit 3 */
#define RTAS_IO_EVENTS 0x08000000 /* set bit 4 */
+#define RTAS_HVPIPE_MSG_EVENTS 0x04000000 /* set bit 5 */
#define RTAS_EVENT_SCAN_ALL_EVENTS 0xffffffff
/* RTAS event severity */
@@ -282,6 +288,7 @@ typedef struct {
#define RTAS_TYPE_DEALLOC 0xE3
#define RTAS_TYPE_DUMP 0xE4
#define RTAS_TYPE_HOTPLUG 0xE5
+#define RTAS_TYPE_HVPIPE 0xE6
/* I don't add PowerMGM events right now, this is a different topic */
#define RTAS_TYPE_PMGM_POWER_SW_ON 0x60
#define RTAS_TYPE_PMGM_POWER_SW_OFF 0x61
@@ -374,6 +381,7 @@ inline uint32_t rtas_ext_event_company_id(struct rtas_ext_event_log_v6 *ext_log)
#define PSERIES_ELOG_SECT_ID_HMC_ID (('H' << 8) | 'M')
#define PSERIES_ELOG_SECT_ID_EPOW (('E' << 8) | 'P')
#define PSERIES_ELOG_SECT_ID_IO_EVENT (('I' << 8) | 'E')
+#define PSERIES_ELOG_SECT_ID_HVPIPE_EVENT (('P' << 8) | 'E')
#define PSERIES_ELOG_SECT_ID_MANUFACT_INFO (('M' << 8) | 'I')
#define PSERIES_ELOG_SECT_ID_CALL_HOME (('C' << 8) | 'H')
#define PSERIES_ELOG_SECT_ID_USER_DEF (('U' << 8) | 'D')
@@ -519,6 +527,7 @@ extern struct mutex rtas_ibm_get_indices_lock;
extern struct mutex rtas_ibm_set_dynamic_indicator_lock;
extern struct mutex rtas_ibm_get_dynamic_sensor_state_lock;
extern struct mutex rtas_ibm_physical_attestation_lock;
+extern struct mutex rtas_ibm_send_hvpipe_msg_lock;
#define GLOBAL_INTERRUPT_QUEUE 9005
diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h
index eed74c1fb832..50a92b24628d 100644
--- a/arch/powerpc/include/asm/setup.h
+++ b/arch/powerpc/include/asm/setup.h
@@ -4,7 +4,7 @@
#include <uapi/asm/setup.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
extern void ppc_printk_progress(char *s, unsigned short hex);
extern unsigned long long memory_limit;
@@ -89,7 +89,7 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4,
extern struct seq_buf ppc_hw_desc;
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_POWERPC_SETUP_H */
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index b77927ccb0ab..e41b9ea42122 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -18,7 +18,7 @@
#include <linux/kernel.h>
#include <linux/irqreturn.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef CONFIG_PPC64
#include <asm/paca.h>
@@ -266,7 +266,7 @@ extern char __secondary_hold;
extern unsigned int booting_thread_hwid;
extern void __early_start(void);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __KERNEL__ */
#endif /* _ASM_POWERPC_SMP_H */
diff --git a/arch/powerpc/include/asm/spu_csa.h b/arch/powerpc/include/asm/spu_csa.h
index c33df961c045..1b3271a03392 100644
--- a/arch/powerpc/include/asm/spu_csa.h
+++ b/arch/powerpc/include/asm/spu_csa.h
@@ -43,7 +43,7 @@
#define SPU_DECR_STATUS_RUNNING 0x1
#define SPU_DECR_STATUS_WRAPPED 0x2
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/**
* spu_reg128 - generic 128-bit register definition.
*/
@@ -243,5 +243,5 @@ struct spu_state {
#endif /* !__SPU__ */
#endif /* __KERNEL__ */
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _SPU_CSA_H_ */
diff --git a/arch/powerpc/include/asm/synch.h b/arch/powerpc/include/asm/synch.h
index b0b4c64870d7..0d3ccb34adfb 100644
--- a/arch/powerpc/include/asm/synch.h
+++ b/arch/powerpc/include/asm/synch.h
@@ -7,7 +7,7 @@
#include <asm/feature-fixups.h>
#include <asm/ppc-opcode.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
extern unsigned int __start___lwsync_fixup, __stop___lwsync_fixup;
extern void do_lwsync_fixups(unsigned long value, void *fixup_start,
void *fixup_end);
@@ -40,7 +40,7 @@ static inline void ppc_after_tlbiel_barrier(void)
*/
asm volatile(ASM_FTR_IFSET(PPC_CP_ABORT, "", %0) : : "i" (CPU_FTR_ARCH_31) : "memory");
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#if defined(__powerpc64__)
# define LWSYNC lwsync
diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index 2785c7462ebf..b0f200aba2b3 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -41,7 +41,7 @@
#define THREAD_ALIGN (1 << THREAD_ALIGN_SHIFT)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/cache.h>
#include <asm/processor.h>
#include <asm/accounting.h>
@@ -89,7 +89,7 @@ extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src
void arch_setup_new_exec(void);
#define arch_setup_new_exec arch_setup_new_exec
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
/*
* thread information flag bit numbers
@@ -162,7 +162,7 @@ void arch_setup_new_exec(void);
#define _TLF_LAZY_MMU (1 << TLF_LAZY_MMU)
#define _TLF_RUNLATCH (1 << TLF_RUNLATCH)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
static inline void clear_thread_local_flags(unsigned int flags)
{
@@ -233,7 +233,7 @@ static inline int arch_within_stack_frames(const void * const stack,
extern void *emergency_ctx[];
#endif
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index f8885586efaf..7991ab1d4cb8 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -29,6 +29,10 @@ extern u64 decrementer_max;
extern void generic_calibrate_decr(void);
+#ifdef CONFIG_PPC_SPLPAR
+extern u64 get_boot_tb(void);
+#endif
+
/* Some sane defaults: 125 MHz timebase, 1GHz processor */
extern unsigned long ppc_proc_freq;
#define DEFAULT_PROC_FREQ (DEFAULT_TB_FREQ * 8)
diff --git a/arch/powerpc/include/asm/tm.h b/arch/powerpc/include/asm/tm.h
index e94f6db5e367..d700affba448 100644
--- a/arch/powerpc/include/asm/tm.h
+++ b/arch/powerpc/include/asm/tm.h
@@ -8,7 +8,7 @@
#include <uapi/asm/tm.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
extern void tm_reclaim(struct thread_struct *thread,
uint8_t cause);
@@ -19,4 +19,4 @@ extern void tm_restore_sprs(struct thread_struct *thread);
extern bool tm_suspend_disabled;
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index da15b5efe807..f19ca44512d1 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -131,6 +131,8 @@ static inline int cpu_to_coregroup_id(int cpu)
#ifdef CONFIG_SMP
#include <asm/cputable.h>
+struct cpumask *cpu_coregroup_mask(int cpu);
+
#ifdef CONFIG_PPC64
#include <asm/smp.h>
diff --git a/arch/powerpc/include/asm/types.h b/arch/powerpc/include/asm/types.h
index 93157a661dcc..55d7ba6d910b 100644
--- a/arch/powerpc/include/asm/types.h
+++ b/arch/powerpc/include/asm/types.h
@@ -11,10 +11,10 @@
#include <uapi/asm/types.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
typedef __vector128 vector128;
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_POWERPC_TYPES_H */
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index 027ef94a12fb..b873fbb6d712 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -9,7 +9,7 @@
#define NR_syscalls __NR_syscalls
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
#include <linux/compiler.h>
@@ -52,5 +52,5 @@
#define __ARCH_WANT_SYS_VFORK
#define __ARCH_WANT_SYS_CLONE
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_POWERPC_UNISTD_H_ */
diff --git a/arch/powerpc/include/asm/vdso.h b/arch/powerpc/include/asm/vdso.h
index 1ca23fbfe087..07af32576072 100644
--- a/arch/powerpc/include/asm/vdso.h
+++ b/arch/powerpc/include/asm/vdso.h
@@ -5,7 +5,7 @@
#define VDSO_VERSION_STRING LINUX_2.6.15
#define __VDSO_PAGES 4
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef CONFIG_PPC64
#include <generated/vdso64-offsets.h>
@@ -21,7 +21,7 @@
int vdso_getcpu_init(void);
-#else /* __ASSEMBLY__ */
+#else /* __ASSEMBLER__ */
#ifdef __VDSO64__
#define V_FUNCTION_BEGIN(name) \
@@ -49,6 +49,6 @@ int vdso_getcpu_init(void);
#endif /* __VDSO32__ */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_POWERPC_VDSO_H */
diff --git a/arch/powerpc/include/asm/vdso/getrandom.h b/arch/powerpc/include/asm/vdso/getrandom.h
index 067a5396aac6..4c24976061f4 100644
--- a/arch/powerpc/include/asm/vdso/getrandom.h
+++ b/arch/powerpc/include/asm/vdso/getrandom.h
@@ -5,7 +5,7 @@
#ifndef _ASM_POWERPC_VDSO_GETRANDOM_H
#define _ASM_POWERPC_VDSO_GETRANDOM_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/vdso_datapage.h>
@@ -62,6 +62,6 @@ static __always_inline const struct vdso_rng_data *__arch_get_vdso_u_rng_data(vo
ssize_t __c_kernel_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state,
size_t opaque_len);
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_POWERPC_VDSO_GETRANDOM_H */
diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h b/arch/powerpc/include/asm/vdso/gettimeofday.h
index 99c9d6f43fde..ab3df12c8d94 100644
--- a/arch/powerpc/include/asm/vdso/gettimeofday.h
+++ b/arch/powerpc/include/asm/vdso/gettimeofday.h
@@ -2,7 +2,7 @@
#ifndef _ASM_POWERPC_VDSO_GETTIMEOFDAY_H
#define _ASM_POWERPC_VDSO_GETTIMEOFDAY_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/vdso/timebase.h>
#include <asm/barrier.h>
@@ -141,6 +141,6 @@ int __c_kernel_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz
__kernel_old_time_t __c_kernel_time(__kernel_old_time_t *time,
const struct vdso_time_data *vd);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_POWERPC_VDSO_GETTIMEOFDAY_H */
diff --git a/arch/powerpc/include/asm/vdso/processor.h b/arch/powerpc/include/asm/vdso/processor.h
index 80d13207c568..c1f3d7aaf3ee 100644
--- a/arch/powerpc/include/asm/vdso/processor.h
+++ b/arch/powerpc/include/asm/vdso/processor.h
@@ -2,7 +2,7 @@
#ifndef _ASM_POWERPC_VDSO_PROCESSOR_H
#define _ASM_POWERPC_VDSO_PROCESSOR_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/* Macros for adjusting thread priority (hardware multi-threading) */
#ifdef CONFIG_PPC64
@@ -33,6 +33,6 @@
#define cpu_relax() barrier()
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_POWERPC_VDSO_PROCESSOR_H */
diff --git a/arch/powerpc/include/asm/vdso/vsyscall.h b/arch/powerpc/include/asm/vdso/vsyscall.h
index c2c9ae1b22e7..bee18e8660a0 100644
--- a/arch/powerpc/include/asm/vdso/vsyscall.h
+++ b/arch/powerpc/include/asm/vdso/vsyscall.h
@@ -2,13 +2,13 @@
#ifndef _ASM_POWERPC_VDSO_VSYSCALL_H
#define _ASM_POWERPC_VDSO_VSYSCALL_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/vdso_datapage.h>
/* The asm-generic header needs to be included after the definitions above */
#include <asm-generic/vdso/vsyscall.h>
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_POWERPC_VDSO_VSYSCALL_H */
diff --git a/arch/powerpc/include/asm/vdso_datapage.h b/arch/powerpc/include/asm/vdso_datapage.h
index 95d45a50355d..441264af0e36 100644
--- a/arch/powerpc/include/asm/vdso_datapage.h
+++ b/arch/powerpc/include/asm/vdso_datapage.h
@@ -9,11 +9,11 @@
* IBM Corp.
*/
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <vdso/datapage.h>
-#else /* __ASSEMBLY__ */
+#else /* __ASSEMBLER__ */
.macro get_datapage ptr symbol
bcl 20, 31, .+4
@@ -23,7 +23,7 @@
addi \ptr, \ptr, (\symbol - 999b)@l
.endm
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __KERNEL__ */
#endif /* _SYSTEMCFG_H */
diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
index 92930b0b5d0e..efb0f5effcc6 100644
--- a/arch/powerpc/include/asm/xive.h
+++ b/arch/powerpc/include/asm/xive.h
@@ -111,7 +111,6 @@ void xive_native_free_vp_block(u32 vp_base);
int xive_native_populate_irq_data(u32 hw_irq,
struct xive_irq_data *data);
void xive_cleanup_irq_data(struct xive_irq_data *xd);
-void xive_irq_free_data(unsigned int virq);
void xive_native_free_irq(u32 irq);
int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq);
diff --git a/arch/powerpc/include/uapi/asm/opal-prd.h b/arch/powerpc/include/uapi/asm/opal-prd.h
index 1869cf83a870..11abcf0192ca 100644
--- a/arch/powerpc/include/uapi/asm/opal-prd.h
+++ b/arch/powerpc/include/uapi/asm/opal-prd.h
@@ -40,7 +40,7 @@
#define OPAL_PRD_SCOM_READ _IOR('o', 0x02, struct opal_prd_scom)
#define OPAL_PRD_SCOM_WRITE _IOW('o', 0x03, struct opal_prd_scom)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct opal_prd_info {
__u64 version;
@@ -54,6 +54,6 @@ struct opal_prd_scom {
__s64 rc;
};
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _UAPI_ASM_POWERPC_OPAL_PRD_H */
diff --git a/arch/powerpc/include/uapi/asm/papr-hvpipe.h b/arch/powerpc/include/uapi/asm/papr-hvpipe.h
new file mode 100644
index 000000000000..f8794139d06a
--- /dev/null
+++ b/arch/powerpc/include/uapi/asm/papr-hvpipe.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_PAPR_HVPIPE_H_
+#define _UAPI_PAPR_HVPIPE_H_
+
+#include <linux/types.h>
+#include <asm/ioctl.h>
+#include <asm/papr-miscdev.h>
+
+/*
+ * This header is included in the payload exchanged between the OS
+ * and user space.
+ * flags: the OS notifies user space whether the hvpipe is
+ * closed or the buffer contains a payload.
+ */
+struct papr_hvpipe_hdr {
+ __u8 version;
+ __u8 reserved[3];
+ __u32 flags;
+ __u8 reserved2[40];
+};
+
+/*
+ * ioctl for /dev/papr-hvpipe
+ */
+#define PAPR_HVPIPE_IOC_CREATE_HANDLE _IOW(PAPR_MISCDEV_IOC_ID, 9, __u32)
+
+/*
+ * hvpipe_hdr flags used for read()
+ */
+#define HVPIPE_MSG_AVAILABLE 0x01 /* Payload is available */
+#define HVPIPE_LOST_CONNECTION 0x02 /* Pipe connection is closed/unavailable */
+
+#endif /* _UAPI_PAPR_HVPIPE_H_ */
diff --git a/arch/powerpc/include/uapi/asm/ptrace.h b/arch/powerpc/include/uapi/asm/ptrace.h
index 7004cfea3f5f..01e630149d48 100644
--- a/arch/powerpc/include/uapi/asm/ptrace.h
+++ b/arch/powerpc/include/uapi/asm/ptrace.h
@@ -27,7 +27,7 @@
#include <linux/types.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef __KERNEL__
struct user_pt_regs
@@ -57,7 +57,7 @@ struct pt_regs
unsigned long result; /* Result of a system call */
};
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
/*
@@ -200,7 +200,7 @@ struct pt_regs
#define PPC_PTRACE_SETHWDEBUG 0x88
#define PPC_PTRACE_DELHWDEBUG 0x87
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct ppc_debug_info {
__u32 version; /* Only version 1 exists to date */
@@ -212,7 +212,7 @@ struct ppc_debug_info {
__u64 features;
};
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
/*
* features will have bits indication whether there is support for:
@@ -224,7 +224,7 @@ struct ppc_debug_info {
#define PPC_DEBUG_FEATURE_DATA_BP_DAWR 0x0000000000000010
#define PPC_DEBUG_FEATURE_DATA_BP_ARCH_31 0x0000000000000020
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct ppc_hw_breakpoint {
__u32 version; /* currently, version must be 1 */
@@ -236,7 +236,7 @@ struct ppc_hw_breakpoint {
__u64 condition_value; /* contents of the DVC register */
};
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
/*
* Trigger Type
diff --git a/arch/powerpc/include/uapi/asm/types.h b/arch/powerpc/include/uapi/asm/types.h
index 327616fb70e4..9dbf55e38ea5 100644
--- a/arch/powerpc/include/uapi/asm/types.h
+++ b/arch/powerpc/include/uapi/asm/types.h
@@ -28,14 +28,14 @@
# include <asm-generic/int-ll64.h>
#endif
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
typedef struct {
__u32 u[4];
} __attribute__((aligned(16))) __vector128;
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _UAPI_ASM_POWERPC_TYPES_H */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index fb2b95267022..2f0a2e69c607 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -199,7 +199,9 @@ obj-$(CONFIG_ALTIVEC) += vector.o
obj-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE) += prom_init.o
obj64-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE) += prom_entry_64.o
-extra-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE) += prom_init_check
+ifdef KBUILD_BUILTIN
+always-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE) += prom_init_check
+endif
obj-$(CONFIG_PPC64) += $(obj64-y)
obj-$(CONFIG_PPC32) += $(obj32-y)
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index b3048f6d3822..a4bc80b30410 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -8,6 +8,7 @@
* compile this file to assembler, and then extract the
* #defines from the assembly-language output.
*/
+#define COMPILE_OFFSETS
#include <linux/compat.h>
#include <linux/signal.h>
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 56c5ebe21b99..393e19ee1322 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -162,7 +162,7 @@ instruction_counter:
* For the MPC8xx, this is a software tablewalk to load the instruction
* TLB. The task switch loads the M_TWB register with the pointer to the first
* level table.
- * If we discover there is no second level table (value is zero) or if there
+ * If there is no second level table (value is zero) or if there
* is an invalid pte, we load that into the TLB, which causes another fault
* into the TLB Error interrupt where we can handle such problems.
* We have to use the MD_xxx registers for the tablewalk because the
@@ -183,14 +183,11 @@ instruction_counter:
mtspr SPRN_SPRG_SCRATCH2, r10
mtspr SPRN_M_TW, r11
- /* If we are faulting a kernel address, we have to use the
- * kernel page tables.
- */
mfspr r10, SPRN_SRR0 /* Get effective address of fault */
INVALIDATE_ADJACENT_PAGES_CPU15(r10, r11)
mtspr SPRN_MD_EPN, r10
mfspr r10, SPRN_M_TWB /* Get level 1 table */
- lwz r11, (swapper_pg_dir-PAGE_OFFSET)@l(r10) /* Get level 1 entry */
+ lwz r11, 0(r10) /* Get level 1 entry */
mtspr SPRN_MD_TWC, r11
mfspr r10, SPRN_MD_TWC
lwz r10, 0(r10) /* Get the pte */
@@ -228,12 +225,8 @@ instruction_counter:
mtspr SPRN_SPRG_SCRATCH2, r10
mtspr SPRN_M_TW, r11
- /* If we are faulting a kernel address, we have to use the
- * kernel page tables.
- */
- mfspr r10, SPRN_MD_EPN
mfspr r10, SPRN_M_TWB /* Get level 1 table */
- lwz r11, (swapper_pg_dir-PAGE_OFFSET)@l(r10) /* Get level 1 entry */
+ lwz r11, 0(r10) /* Get level 1 entry */
mtspr SPRN_MD_TWC, r11
mfspr r10, SPRN_MD_TWC
@@ -375,7 +368,7 @@ FixupPGD:
mfspr r10, SPRN_DAR
mtspr SPRN_MD_EPN, r10
mfspr r11, SPRN_M_TWB /* Get level 1 table */
- lwz r10, (swapper_pg_dir - PAGE_OFFSET)@l(r11) /* Get the level 1 entry */
+ lwz r10, 0(r11) /* Get the level 1 entry */
cmpwi cr1, r10, 0
bne cr1, 1f
@@ -384,7 +377,7 @@ FixupPGD:
lwz r10, (swapper_pg_dir - PAGE_OFFSET)@l(r10) /* Get the level 1 entry */
cmpwi cr1, r10, 0
beq cr1, 1f
- stw r10, (swapper_pg_dir - PAGE_OFFSET)@l(r11) /* Set the level 1 entry */
+ stw r10, 0(r11) /* Set the level 1 entry */
mfspr r10, SPRN_M_TW
mtcr r10
mfspr r10, SPRN_SPRG_SCRATCH0
@@ -412,9 +405,10 @@ FixupDAR:/* Entry point for dcbx workaround. */
tophys(r11, r10)
mfspr r11, SPRN_M_TWB /* Get level 1 table */
rlwinm r11, r11, 0, 20, 31
- oris r11, r11, (swapper_pg_dir - PAGE_OFFSET)@ha
+ oris r11, r11, (swapper_pg_dir - PAGE_OFFSET)@h
+ ori r11, r11, (swapper_pg_dir - PAGE_OFFSET)@l
3:
- lwz r11, (swapper_pg_dir-PAGE_OFFSET)@l(r11) /* Get the level 1 entry */
+ lwz r11, 0(r11) /* Get the level 1 entry */
rlwinm r11, r11, 0, ~_PMD_PAGE_8M
mtspr SPRN_MD_TWC, r11
mfspr r11, SPRN_MD_TWC
@@ -535,7 +529,8 @@ start_here:
li r0,0
stwu r0,THREAD_SIZE-STACK_FRAME_MIN_SIZE(r1)
- lis r6, swapper_pg_dir@ha
+ lis r6, swapper_pg_dir@h
+ ori r6, r6, swapper_pg_dir@l
tophys(r6,r6)
mtspr SPRN_M_TWB, r6
diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h
index 0b5c1993809e..75471fb6fb10 100644
--- a/arch/powerpc/kernel/head_booke.h
+++ b/arch/powerpc/kernel/head_booke.h
@@ -7,7 +7,7 @@
#include <asm/kvm_booke_hv_asm.h>
#include <asm/thread_info.h> /* for THREAD_SHIFT */
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
/*
* Macros used for common Book-e exception handling
@@ -522,5 +522,5 @@ label:
bl kernel_fp_unavailable_exception; \
b interrupt_return
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __HEAD_BOOKE_H__ */
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 5b3c093611ba..7209d00a9c25 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -632,19 +632,19 @@ static void __init kvm_check_ins(u32 *inst, u32 features)
#endif
}
- switch (inst_no_rt & ~KVM_MASK_RB) {
#ifdef CONFIG_PPC_BOOK3S_32
+ switch (inst_no_rt & ~KVM_MASK_RB) {
case KVM_INST_MTSRIN:
if (features & KVM_MAGIC_FEAT_SR) {
u32 inst_rb = _inst & KVM_MASK_RB;
kvm_patch_ins_mtsrin(inst, inst_rt, inst_rb);
}
break;
-#endif
}
+#endif
- switch (_inst) {
#ifdef CONFIG_BOOKE
+ switch (_inst) {
case KVM_INST_WRTEEI_0:
kvm_patch_ins_wrteei_0(inst);
break;
@@ -652,8 +652,8 @@ static void __init kvm_check_ins(u32 *inst, u32 features)
case KVM_INST_WRTEEI_1:
kvm_patch_ins_wrtee(inst, 0, 1);
break;
-#endif
}
+#endif
}
extern u32 kvm_template_start[];
diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index 126bf3b06ab7..2a44bc8e2439 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -209,8 +209,7 @@ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr,
char *secstrings,
struct module *me)
{
- /* One extra reloc so it's always 0-addr terminated */
- unsigned long relocs = 1;
+ unsigned long relocs = 0;
unsigned i;
/* Every relocated section... */
@@ -705,7 +704,7 @@ static unsigned long stub_for_addr(const Elf64_Shdr *sechdrs,
/* Find this stub, or if that fails, the next avail. entry */
stubs = (void *)sechdrs[me->arch.stubs_section].sh_addr;
- for (i = 0; stub_func_addr(stubs[i].funcdata); i++) {
+ for (i = 0; i < me->arch.stub_count; i++) {
if (WARN_ON(i >= num_stubs))
return 0;
@@ -716,6 +715,7 @@ static unsigned long stub_for_addr(const Elf64_Shdr *sechdrs,
if (!create_stub(sechdrs, &stubs[i], addr, me, name))
return 0;
+ me->arch.stub_count++;
return (unsigned long)&stubs[i];
}
@@ -1118,29 +1118,19 @@ int module_trampoline_target(struct module *mod, unsigned long addr,
static int setup_ftrace_ool_stubs(const Elf64_Shdr *sechdrs, unsigned long addr, struct module *me)
{
#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE
- unsigned int i, total_stubs, num_stubs;
+ unsigned int total_stubs, num_stubs;
struct ppc64_stub_entry *stub;
total_stubs = sechdrs[me->arch.stubs_section].sh_size / sizeof(*stub);
num_stubs = roundup(me->arch.ool_stub_count * sizeof(struct ftrace_ool_stub),
sizeof(struct ppc64_stub_entry)) / sizeof(struct ppc64_stub_entry);
- /* Find the next available entry */
- stub = (void *)sechdrs[me->arch.stubs_section].sh_addr;
- for (i = 0; stub_func_addr(stub[i].funcdata); i++)
- if (WARN_ON(i >= total_stubs))
- return -1;
-
- if (WARN_ON(i + num_stubs > total_stubs))
+ if (WARN_ON(me->arch.stub_count + num_stubs > total_stubs))
return -1;
- stub += i;
- me->arch.ool_stubs = (struct ftrace_ool_stub *)stub;
-
- /* reserve stubs */
- for (i = 0; i < num_stubs; i++)
- if (patch_u32((void *)&stub->funcdata, PPC_RAW_NOP()))
- return -1;
+ stub = (void *)sechdrs[me->arch.stubs_section].sh_addr;
+ me->arch.ool_stubs = (struct ftrace_ool_stub *)(stub + me->arch.stub_count);
+ me->arch.stub_count += num_stubs;
#endif
return 0;
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 855e09886503..eb23966ac0a9 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1805,7 +1805,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
f = ret_from_kernel_user_thread;
} else {
struct pt_regs *regs = current_pt_regs();
- unsigned long clone_flags = args->flags;
+ u64 clone_flags = args->flags;
unsigned long usp = args->stack;
/* Copy registers */
diff --git a/arch/powerpc/kernel/prom_init_check.sh b/arch/powerpc/kernel/prom_init_check.sh
index 69623b9045d5..3090b97258ae 100644
--- a/arch/powerpc/kernel/prom_init_check.sh
+++ b/arch/powerpc/kernel/prom_init_check.sh
@@ -15,8 +15,8 @@
has_renamed_memintrinsics()
{
- grep -q "^CONFIG_KASAN=y$" ${KCONFIG_CONFIG} && \
- ! grep -q "^CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX=y" ${KCONFIG_CONFIG}
+ grep -q "^CONFIG_KASAN=y$" "${KCONFIG_CONFIG}" && \
+ ! grep -q "^CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX=y" "${KCONFIG_CONFIG}"
}
if has_renamed_memintrinsics
@@ -42,15 +42,15 @@ check_section()
{
file=$1
section=$2
- size=$(objdump -h -j $section $file 2>/dev/null | awk "\$2 == \"$section\" {print \$3}")
+ size=$(objdump -h -j "$section" "$file" 2>/dev/null | awk "\$2 == \"$section\" {print \$3}")
size=${size:-0}
- if [ $size -ne 0 ]; then
+ if [ "$size" -ne 0 ]; then
ERROR=1
echo "Error: Section $section not empty in prom_init.c" >&2
fi
}
-for UNDEF in $($NM -u $OBJ | awk '{print $2}')
+for UNDEF in $($NM -u "$OBJ" | awk '{print $2}')
do
# On 64-bit nm gives us the function descriptors, which have
# a leading . on the name, so strip it off here.
@@ -87,8 +87,8 @@ do
fi
done
-check_section $OBJ .data
-check_section $OBJ .bss
-check_section $OBJ .init.data
+check_section "$OBJ" .data
+check_section "$OBJ" .bss
+check_section "$OBJ" .init.data
exit $ERROR
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index e61245c4468e..8d81c1e7a8db 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -98,6 +98,8 @@ DEFINE_MUTEX(rtas_ibm_get_vpd_lock);
DEFINE_MUTEX(rtas_ibm_get_indices_lock);
DEFINE_MUTEX(rtas_ibm_set_dynamic_indicator_lock);
DEFINE_MUTEX(rtas_ibm_get_dynamic_sensor_state_lock);
+DEFINE_MUTEX(rtas_ibm_receive_hvpipe_msg_lock);
+DEFINE_MUTEX(rtas_ibm_send_hvpipe_msg_lock);
static struct rtas_function rtas_function_table[] __ro_after_init = {
[RTAS_FNIDX__CHECK_EXCEPTION] = {
@@ -373,6 +375,17 @@ static struct rtas_function rtas_function_table[] __ro_after_init = {
[RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE2] = {
.name = "ibm,read-slot-reset-state2",
},
+ [RTAS_FNIDX__IBM_RECEIVE_HVPIPE_MSG] = {
+ .name = "ibm,receive-hvpipe-msg",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = 0, .size_idx1 = 1,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ },
+ /*
+ * PAPR+ v2.13 R1–7.3.32.1
+ */
+ .lock = &rtas_ibm_receive_hvpipe_msg_lock,
+ },
[RTAS_FNIDX__IBM_REMOVE_PE_DMA_WINDOW] = {
.name = "ibm,remove-pe-dma-window",
},
@@ -391,6 +404,17 @@ static struct rtas_function rtas_function_table[] __ro_after_init = {
.buf_idx2 = -1, .size_idx2 = -1,
},
},
+ [RTAS_FNIDX__IBM_SEND_HVPIPE_MSG] = {
+ .name = "ibm,send-hvpipe-msg",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = 1, .size_idx1 = -1,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ },
+ /*
+ * PAPR+ v2.13 R1–7.3.32.2
+ */
+ .lock = &rtas_ibm_send_hvpipe_msg_lock,
+ },
[RTAS_FNIDX__IBM_SET_DYNAMIC_INDICATOR] = {
.name = "ibm,set-dynamic-indicator",
.filter = &(const struct rtas_filter) {
diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c
index 9bba469239fc..6336ec9aedd0 100644
--- a/arch/powerpc/kernel/rtasd.c
+++ b/arch/powerpc/kernel/rtasd.c
@@ -89,6 +89,8 @@ static char *rtas_event_type(int type)
return "Platform Resource Reassignment Event";
case RTAS_TYPE_HOTPLUG:
return "Hotplug Event";
+ case RTAS_TYPE_HVPIPE:
+ return "Hypervisor Pipe Notification event";
}
return rtas_type[0];
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 7284c8021eeb..8fd7cbf3bd04 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -141,10 +141,7 @@ void __init check_smt_enabled(void)
smt_enabled_at_boot = 0;
else {
int smt;
- int rc;
-
- rc = kstrtoint(smt_enabled_cmdline, 10, &smt);
- if (!rc)
+ if (!kstrtoint(smt_enabled_cmdline, 10, &smt))
smt_enabled_at_boot =
min(threads_per_core, smt);
}
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index f59e4b9cc207..68edb66c2964 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1028,19 +1028,19 @@ static int powerpc_shared_proc_flags(void)
 * We can't just pass cpu_l2_cache_mask() directly because it
 * returns a non-const pointer and the compiler barfs on that.
*/
-static const struct cpumask *shared_cache_mask(int cpu)
+static const struct cpumask *tl_cache_mask(struct sched_domain_topology_level *tl, int cpu)
{
return per_cpu(cpu_l2_cache_map, cpu);
}
#ifdef CONFIG_SCHED_SMT
-static const struct cpumask *smallcore_smt_mask(int cpu)
+static const struct cpumask *tl_smallcore_smt_mask(struct sched_domain_topology_level *tl, int cpu)
{
return cpu_smallcore_mask(cpu);
}
#endif
-static struct cpumask *cpu_coregroup_mask(int cpu)
+struct cpumask *cpu_coregroup_mask(int cpu)
{
return per_cpu(cpu_coregroup_map, cpu);
}
@@ -1054,11 +1054,6 @@ static bool has_coregroup_support(void)
return coregroup_enabled;
}
-static const struct cpumask *cpu_mc_mask(int cpu)
-{
- return cpu_coregroup_mask(cpu);
-}
-
static int __init init_big_cores(void)
{
int cpu;
@@ -1448,7 +1443,7 @@ static bool update_mask_by_l2(int cpu, cpumask_var_t *mask)
return false;
}
- cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu));
+ cpumask_and(*mask, cpu_online_mask, cpu_node_mask(cpu));
/* Update l2-cache mask with all the CPUs that are part of submask */
or_cpumasks_related(cpu, cpu, submask_fn, cpu_l2_cache_mask);
@@ -1538,7 +1533,7 @@ static void update_coregroup_mask(int cpu, cpumask_var_t *mask)
return;
}
- cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu));
+ cpumask_and(*mask, cpu_online_mask, cpu_node_mask(cpu));
/* Update coregroup mask with all the CPUs that are part of submask */
or_cpumasks_related(cpu, cpu, submask_fn, cpu_coregroup_mask);
@@ -1601,7 +1596,7 @@ static void add_cpu_to_masks(int cpu)
/* If chip_id is -1; limit the cpu_core_mask to within PKG */
if (chip_id == -1)
- cpumask_and(mask, mask, cpu_cpu_mask(cpu));
+ cpumask_and(mask, mask, cpu_node_mask(cpu));
for_each_cpu(i, mask) {
if (chip_id == cpu_to_chip_id(i)) {
@@ -1701,22 +1696,22 @@ static void __init build_sched_topology(void)
if (has_big_cores) {
pr_info("Big cores detected but using small core scheduling\n");
powerpc_topology[i++] =
- SDTL_INIT(smallcore_smt_mask, powerpc_smt_flags, SMT);
+ SDTL_INIT(tl_smallcore_smt_mask, powerpc_smt_flags, SMT);
} else {
- powerpc_topology[i++] = SDTL_INIT(cpu_smt_mask, powerpc_smt_flags, SMT);
+ powerpc_topology[i++] = SDTL_INIT(tl_smt_mask, powerpc_smt_flags, SMT);
}
#endif
if (shared_caches) {
powerpc_topology[i++] =
- SDTL_INIT(shared_cache_mask, powerpc_shared_cache_flags, CACHE);
+ SDTL_INIT(tl_cache_mask, powerpc_shared_cache_flags, CACHE);
}
if (has_coregroup_support()) {
powerpc_topology[i++] =
- SDTL_INIT(cpu_mc_mask, powerpc_shared_proc_flags, MC);
+ SDTL_INIT(tl_mc_mask, powerpc_shared_proc_flags, MC);
}
- powerpc_topology[i++] = SDTL_INIT(cpu_cpu_mask, powerpc_shared_proc_flags, PKG);
+ powerpc_topology[i++] = SDTL_INIT(tl_pkg_mask, powerpc_shared_proc_flags, PKG);
/* There must be one trailing NULL entry left. */
BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1);
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 8224381c1dba..4bbeb8644d3d 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -137,7 +137,7 @@ EXPORT_SYMBOL_GPL(rtc_lock);
static u64 tb_to_ns_scale __read_mostly;
static unsigned tb_to_ns_shift __read_mostly;
-static u64 boot_tb __read_mostly;
+static u64 boot_tb __ro_after_init;
extern struct timezone sys_tz;
static long timezone_offset;
@@ -639,6 +639,12 @@ notrace unsigned long long sched_clock(void)
return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
}
+#ifdef CONFIG_PPC_SPLPAR
+u64 get_boot_tb(void)
+{
+ return boot_tb;
+}
+#endif
#ifdef CONFIG_PPC_PSERIES
diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c
index 6dca92d5a6e8..841d077e2825 100644
--- a/arch/powerpc/kernel/trace/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -488,8 +488,10 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec)
return ret;
/* Set up out-of-line stub */
- if (IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE))
- return ftrace_init_ool_stub(mod, rec);
+ if (IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE)) {
+ ret = ftrace_init_ool_stub(mod, rec);
+ goto out;
+ }
/* Nop-out the ftrace location */
new = ppc_inst(PPC_RAW_NOP());
@@ -520,6 +522,10 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec)
return -EINVAL;
}
+out:
+ if (!ret)
+ ret = ftrace_rec_set_nop_ops(rec);
+
return ret;
}
diff --git a/arch/powerpc/kernel/trace/ftrace_entry.S b/arch/powerpc/kernel/trace/ftrace_entry.S
index 3565c67fc638..6599fe3c6234 100644
--- a/arch/powerpc/kernel/trace/ftrace_entry.S
+++ b/arch/powerpc/kernel/trace/ftrace_entry.S
@@ -409,23 +409,31 @@ EXPORT_SYMBOL(_mcount)
_GLOBAL(return_to_handler)
/* need to save return values */
#ifdef CONFIG_PPC64
- std r4, -32(r1)
- std r3, -24(r1)
+ stdu r1, -SWITCH_FRAME_SIZE(r1)
+ std r4, GPR4(r1)
+ std r3, GPR3(r1)
+ /* Save previous stack pointer (r1) */
+ addi r3, r1, SWITCH_FRAME_SIZE
+ std r3, GPR1(r1)
/* save TOC */
- std r2, -16(r1)
- std r31, -8(r1)
+ std r2, 24(r1)
+ std r31, 32(r1)
mr r31, r1
- stdu r1, -112(r1)
-
+ /* pass ftrace_regs/pt_regs to ftrace_return_to_handler */
+ addi r3, r1, STACK_INT_FRAME_REGS
/*
* We might be called from a module.
* Switch to our TOC to run inside the core kernel.
*/
LOAD_PACA_TOC()
#else
- stwu r1, -16(r1)
- stw r3, 8(r1)
- stw r4, 12(r1)
+ stwu r1, -SWITCH_FRAME_SIZE(r1)
+ stw r4, GPR4(r1)
+ stw r3, GPR3(r1)
+ addi r3, r1, SWITCH_FRAME_SIZE
+ stw r3, GPR1(r1)
+ /* pass ftrace_regs/pt_regs to ftrace_return_to_handler */
+ addi r3, r1, STACK_INT_FRAME_REGS
#endif
bl ftrace_return_to_handler
@@ -435,15 +443,15 @@ _GLOBAL(return_to_handler)
mtlr r3
#ifdef CONFIG_PPC64
- ld r1, 0(r1)
- ld r4, -32(r1)
- ld r3, -24(r1)
- ld r2, -16(r1)
- ld r31, -8(r1)
+ ld r4, GPR4(r1)
+ ld r3, GPR3(r1)
+ ld r2, 24(r1)
+ ld r31, 32(r1)
+ ld r1, 0(r1)
#else
- lwz r3, 8(r1)
- lwz r4, 12(r1)
- addi r1, r1, 16
+ lwz r3, GPR3(r1)
+ lwz r4, GPR4(r1)
+ addi r1, r1, SWITCH_FRAME_SIZE
#endif
/* Jump back to real return address */
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 219d67bcf747..ab7c4cc80943 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -21,6 +21,7 @@
#include <vdso/datapage.h>
#include <asm/syscall.h>
+#include <asm/syscalls.h>
#include <asm/processor.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
@@ -40,8 +41,6 @@ static_assert(__VDSO_PAGES == VDSO_NR_PAGES);
extern char vdso32_start, vdso32_end;
extern char vdso64_start, vdso64_end;
-long sys_ni_syscall(void);
-
static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma,
unsigned long text_size)
{
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 153587741864..2ba057171ebe 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -69,7 +69,7 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
/*
* Common checks before entering the guest world. Call with interrupts
- * disabled.
+ * enabled.
*
* returns:
*
diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
index bcc7e4dff8c3..95ab4cdf582e 100644
--- a/arch/powerpc/lib/qspinlock.c
+++ b/arch/powerpc/lib/qspinlock.c
@@ -9,6 +9,7 @@
#include <linux/sched/clock.h>
#include <asm/qspinlock.h>
#include <asm/paravirt.h>
+#include <trace/events/lock.h>
#define MAX_NODES 4
@@ -708,26 +709,26 @@ release:
qnodesp->count--;
}
-void queued_spin_lock_slowpath(struct qspinlock *lock)
+void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock)
{
+ trace_contention_begin(lock, LCB_F_SPIN);
/*
* This looks funny, but it induces the compiler to inline both
* sides of the branch rather than share code as when the condition
* is passed as the paravirt argument to the functions.
*/
if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) && is_shared_processor()) {
- if (try_to_steal_lock(lock, true)) {
+ if (try_to_steal_lock(lock, true))
spec_barrier();
- return;
- }
- queued_spin_lock_mcs_queue(lock, true);
+ else
+ queued_spin_lock_mcs_queue(lock, true);
} else {
- if (try_to_steal_lock(lock, false)) {
+ if (try_to_steal_lock(lock, false))
spec_barrier();
- return;
- }
- queued_spin_lock_mcs_queue(lock, false);
+ else
+ queued_spin_lock_mcs_queue(lock, false);
}
+ trace_contention_end(lock, 0);
}
EXPORT_SYMBOL(queued_spin_lock_slowpath);
diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index be9c4106e22f..c42ecdf94e48 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -204,7 +204,7 @@ int mmu_mark_initmem_nx(void)
for (i = 0; i < nb - 1 && base < top;) {
size = bat_block_size(base, top);
- setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT);
+ setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_X);
base += size;
}
if (base < top) {
@@ -215,7 +215,7 @@ int mmu_mark_initmem_nx(void)
pr_warn("Some RW data is getting mapped X. "
"Adjust CONFIG_DATA_SHIFT to avoid that.\n");
}
- setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT);
+ setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_X);
base += size;
}
for (; i < nb; i++)
diff --git a/arch/powerpc/mm/nohash/mmu_context.c b/arch/powerpc/mm/nohash/mmu_context.c
index a1a4e697251a..28a96a10c907 100644
--- a/arch/powerpc/mm/nohash/mmu_context.c
+++ b/arch/powerpc/mm/nohash/mmu_context.c
@@ -203,15 +203,7 @@ static unsigned int steal_context_up(unsigned int id)
static void set_context(unsigned long id, pgd_t *pgd)
{
if (IS_ENABLED(CONFIG_PPC_8xx)) {
- s16 offset = (s16)(__pa(swapper_pg_dir));
-
- /*
- * Register M_TWB will contain base address of level 1 table minus the
- * lower part of the kernel PGDIR base address, so that all accesses to
- * level 1 table are done relative to lower part of kernel PGDIR base
- * address.
- */
- mtspr(SPRN_M_TWB, __pa(pgd) - offset);
+ mtspr(SPRN_M_TWB, __pa(pgd));
/* Update context */
mtspr(SPRN_M_CASID, id - 1);
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 15276068f657..0c9ef705803e 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -104,7 +104,7 @@ static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top)
p = memstart_addr + s;
for (; s < top; s += PAGE_SIZE) {
ktext = core_kernel_text(v);
- map_kernel_page(v, p, ktext ? PAGE_KERNEL_TEXT : PAGE_KERNEL);
+ map_kernel_page(v, p, ktext ? PAGE_KERNEL_X : PAGE_KERNEL);
v += PAGE_SIZE;
p += PAGE_SIZE;
}
diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
index 4c26912c2e3c..8334cd667bba 100644
--- a/arch/powerpc/net/bpf_jit.h
+++ b/arch/powerpc/net/bpf_jit.h
@@ -8,7 +8,7 @@
#ifndef _BPF_JIT_H
#define _BPF_JIT_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/types.h>
#include <asm/ppc-opcode.h>
@@ -161,9 +161,11 @@ struct codegen_context {
unsigned int seen;
unsigned int idx;
unsigned int stack_size;
- int b2p[MAX_BPF_JIT_REG + 2];
+ int b2p[MAX_BPF_JIT_REG + 3];
unsigned int exentry_idx;
unsigned int alt_exit_addr;
+ u64 arena_vm_start;
+ u64 user_vm_start;
};
#define bpf_to_ppc(r) (ctx->b2p[r])
@@ -201,7 +203,7 @@ int bpf_jit_emit_exit_insn(u32 *image, struct codegen_context *ctx, int tmp_reg,
int bpf_add_extable_entry(struct bpf_prog *fp, u32 *image, u32 *fimage, int pass,
struct codegen_context *ctx, int insn_idx,
- int jmp_off, int dst_reg);
+ int jmp_off, int dst_reg, u32 code);
#endif
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index c0684733e9d6..88ad5ba7b87f 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -204,6 +204,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
/* Make sure that the stack is quadword aligned. */
cgctx.stack_size = round_up(fp->aux->stack_depth, 16);
+ cgctx.arena_vm_start = bpf_arena_get_kern_vm_start(fp->aux->arena);
+ cgctx.user_vm_start = bpf_arena_get_user_vm_start(fp->aux->arena);
/* Scouting faux-generate pass 0 */
if (bpf_jit_build_body(fp, NULL, NULL, &cgctx, addrs, 0, false)) {
@@ -326,7 +328,7 @@ out:
*/
int bpf_add_extable_entry(struct bpf_prog *fp, u32 *image, u32 *fimage, int pass,
struct codegen_context *ctx, int insn_idx, int jmp_off,
- int dst_reg)
+ int dst_reg, u32 code)
{
off_t offset;
unsigned long pc;
@@ -355,6 +357,9 @@ int bpf_add_extable_entry(struct bpf_prog *fp, u32 *image, u32 *fimage, int pass
(ctx->exentry_idx * BPF_FIXUP_LEN * 4);
fixup[0] = PPC_RAW_LI(dst_reg, 0);
+ if (BPF_CLASS(code) == BPF_ST || BPF_CLASS(code) == BPF_STX)
+ fixup[0] = PPC_RAW_NOP();
+
if (IS_ENABLED(CONFIG_PPC32))
fixup[1] = PPC_RAW_LI(dst_reg - 1, 0); /* clear higher 32-bit register too */
@@ -435,11 +440,32 @@ bool bpf_jit_supports_kfunc_call(void)
return true;
}
+bool bpf_jit_supports_arena(void)
+{
+ return IS_ENABLED(CONFIG_PPC64);
+}
+
bool bpf_jit_supports_far_kfunc_call(void)
{
return IS_ENABLED(CONFIG_PPC64);
}
+bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena)
+{
+ if (!in_arena)
+ return true;
+ switch (insn->code) {
+ case BPF_STX | BPF_ATOMIC | BPF_H:
+ case BPF_STX | BPF_ATOMIC | BPF_B:
+ case BPF_STX | BPF_ATOMIC | BPF_W:
+ case BPF_STX | BPF_ATOMIC | BPF_DW:
+ if (bpf_atomic_is_load_store(insn))
+ return false;
+ return IS_ENABLED(CONFIG_PPC64);
+ }
+ return true;
+}
+
void *arch_alloc_bpf_trampoline(unsigned int size)
{
return bpf_prog_pack_alloc(size, bpf_jit_fill_ill_insns);
@@ -579,7 +605,7 @@ static void bpf_trampoline_setup_tail_call_cnt(u32 *image, struct codegen_contex
{
if (IS_ENABLED(CONFIG_PPC64)) {
/* See bpf_jit_stack_tailcallcnt() */
- int tailcallcnt_offset = 6 * 8;
+ int tailcallcnt_offset = 7 * 8;
EMIT(PPC_RAW_LL(_R3, _R1, func_frame_offset - tailcallcnt_offset));
EMIT(PPC_RAW_STL(_R3, _R1, -tailcallcnt_offset));
@@ -594,7 +620,7 @@ static void bpf_trampoline_restore_tail_call_cnt(u32 *image, struct codegen_cont
{
if (IS_ENABLED(CONFIG_PPC64)) {
/* See bpf_jit_stack_tailcallcnt() */
- int tailcallcnt_offset = 6 * 8;
+ int tailcallcnt_offset = 7 * 8;
EMIT(PPC_RAW_LL(_R3, _R1, -tailcallcnt_offset));
EMIT(PPC_RAW_STL(_R3, _R1, func_frame_offset - tailcallcnt_offset));
diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c
index 0aace304dfe1..3087e744fb25 100644
--- a/arch/powerpc/net/bpf_jit_comp32.c
+++ b/arch/powerpc/net/bpf_jit_comp32.c
@@ -1087,7 +1087,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code
}
ret = bpf_add_extable_entry(fp, image, fimage, pass, ctx, insn_idx,
- jmp_off, dst_reg);
+ jmp_off, dst_reg, code);
if (ret)
return ret;
}
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 025524378443..1fe37128c876 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -25,18 +25,18 @@
* with our redzone usage.
*
* [ prev sp ] <-------------
- * [ nv gpr save area ] 5*8 |
+ * [ nv gpr save area ] 6*8 |
* [ tail_call_cnt ] 8 |
- * [ local_tmp_var ] 16 |
+ * [ local_tmp_var ] 24 |
* fp (r31) --> [ ebpf stack space ] upto 512 |
* [ frame header ] 32/112 |
* sp (r1) ---> [ stack pointer ] --------------
*/
/* for gpr non volatile registers BPG_REG_6 to 10 */
-#define BPF_PPC_STACK_SAVE (5*8)
+#define BPF_PPC_STACK_SAVE (6*8)
/* for bpf JIT code internal usage */
-#define BPF_PPC_STACK_LOCALS 24
+#define BPF_PPC_STACK_LOCALS 32
/* stack frame excluding BPF stack, ensure this is quadword aligned */
#define BPF_PPC_STACKFRAME (STACK_FRAME_MIN_SIZE + \
BPF_PPC_STACK_LOCALS + BPF_PPC_STACK_SAVE)
@@ -44,6 +44,7 @@
/* BPF register usage */
#define TMP_REG_1 (MAX_BPF_JIT_REG + 0)
#define TMP_REG_2 (MAX_BPF_JIT_REG + 1)
+#define ARENA_VM_START (MAX_BPF_JIT_REG + 2)
/* BPF to ppc register mappings */
void bpf_jit_init_reg_mapping(struct codegen_context *ctx)
@@ -67,10 +68,12 @@ void bpf_jit_init_reg_mapping(struct codegen_context *ctx)
ctx->b2p[BPF_REG_AX] = _R12;
ctx->b2p[TMP_REG_1] = _R9;
ctx->b2p[TMP_REG_2] = _R10;
+ /* non-volatile register for the kern_vm_start address */
+ ctx->b2p[ARENA_VM_START] = _R26;
}
-/* PPC NVR range -- update this if we ever use NVRs below r27 */
-#define BPF_PPC_NVR_MIN _R27
+/* PPC NVR range -- update this if we ever use NVRs below r26 */
+#define BPF_PPC_NVR_MIN _R26
static inline bool bpf_has_stack_frame(struct codegen_context *ctx)
{
@@ -89,9 +92,9 @@ static inline bool bpf_has_stack_frame(struct codegen_context *ctx)
* [ prev sp ] <-------------
* [ ... ] |
* sp (r1) ---> [ stack pointer ] --------------
- * [ nv gpr save area ] 5*8
+ * [ nv gpr save area ] 6*8
* [ tail_call_cnt ] 8
- * [ local_tmp_var ] 16
+ * [ local_tmp_var ] 24
* [ unused red zone ] 224
*/
static int bpf_jit_stack_local(struct codegen_context *ctx)
@@ -99,12 +102,12 @@ static int bpf_jit_stack_local(struct codegen_context *ctx)
if (bpf_has_stack_frame(ctx))
return STACK_FRAME_MIN_SIZE + ctx->stack_size;
else
- return -(BPF_PPC_STACK_SAVE + 24);
+ return -(BPF_PPC_STACK_SAVE + 32);
}
static int bpf_jit_stack_tailcallcnt(struct codegen_context *ctx)
{
- return bpf_jit_stack_local(ctx) + 16;
+ return bpf_jit_stack_local(ctx) + 24;
}
static int bpf_jit_stack_offsetof(struct codegen_context *ctx, int reg)
@@ -170,10 +173,17 @@ void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx)
if (bpf_is_seen_register(ctx, bpf_to_ppc(i)))
EMIT(PPC_RAW_STD(bpf_to_ppc(i), _R1, bpf_jit_stack_offsetof(ctx, bpf_to_ppc(i))));
+ if (ctx->arena_vm_start)
+ EMIT(PPC_RAW_STD(bpf_to_ppc(ARENA_VM_START), _R1,
+ bpf_jit_stack_offsetof(ctx, bpf_to_ppc(ARENA_VM_START))));
+
/* Setup frame pointer to point to the bpf stack area */
if (bpf_is_seen_register(ctx, bpf_to_ppc(BPF_REG_FP)))
EMIT(PPC_RAW_ADDI(bpf_to_ppc(BPF_REG_FP), _R1,
STACK_FRAME_MIN_SIZE + ctx->stack_size));
+
+ if (ctx->arena_vm_start)
+ PPC_LI64(bpf_to_ppc(ARENA_VM_START), ctx->arena_vm_start);
}
static void bpf_jit_emit_common_epilogue(u32 *image, struct codegen_context *ctx)
@@ -185,6 +195,10 @@ static void bpf_jit_emit_common_epilogue(u32 *image, struct codegen_context *ctx
if (bpf_is_seen_register(ctx, bpf_to_ppc(i)))
EMIT(PPC_RAW_LD(bpf_to_ppc(i), _R1, bpf_jit_stack_offsetof(ctx, bpf_to_ppc(i))));
+ if (ctx->arena_vm_start)
+ EMIT(PPC_RAW_LD(bpf_to_ppc(ARENA_VM_START), _R1,
+ bpf_jit_stack_offsetof(ctx, bpf_to_ppc(ARENA_VM_START))));
+
/* Tear down our stack frame */
if (bpf_has_stack_frame(ctx)) {
EMIT(PPC_RAW_ADDI(_R1, _R1, BPF_PPC_STACKFRAME + ctx->stack_size));
@@ -396,11 +410,11 @@ void bpf_stf_barrier(void);
asm (
" .global bpf_stf_barrier ;"
" bpf_stf_barrier: ;"
-" std 21,-64(1) ;"
-" std 22,-56(1) ;"
+" std 21,-80(1) ;"
+" std 22,-72(1) ;"
" sync ;"
-" ld 21,-64(1) ;"
-" ld 22,-56(1) ;"
+" ld 21,-80(1) ;"
+" ld 22,-72(1) ;"
" ori 31,31,0 ;"
" .rept 14 ;"
" b 1f ;"
@@ -409,6 +423,141 @@ asm (
" blr ;"
);
+static int bpf_jit_emit_atomic_ops(u32 *image, struct codegen_context *ctx,
+ const struct bpf_insn *insn, u32 *jmp_off,
+ u32 *tmp_idx, u32 *addrp)
+{
+ u32 tmp1_reg = bpf_to_ppc(TMP_REG_1);
+ u32 tmp2_reg = bpf_to_ppc(TMP_REG_2);
+ u32 size = BPF_SIZE(insn->code);
+ u32 src_reg = bpf_to_ppc(insn->src_reg);
+ u32 dst_reg = bpf_to_ppc(insn->dst_reg);
+ s32 imm = insn->imm;
+
+ u32 save_reg = tmp2_reg;
+ u32 ret_reg = src_reg;
+ u32 fixup_idx;
+
+ /* Get offset into TMP_REG_1 */
+ EMIT(PPC_RAW_LI(tmp1_reg, insn->off));
+ /*
+ * Enforce full ordering for operations with BPF_FETCH by emitting a 'sync'
+ * before and after the operation.
+ *
+ * This is a requirement in the Linux Kernel Memory Model.
+ * See __cmpxchg_u64() in asm/cmpxchg.h as an example.
+ */
+ if ((imm & BPF_FETCH) && IS_ENABLED(CONFIG_SMP))
+ EMIT(PPC_RAW_SYNC());
+
+ *tmp_idx = ctx->idx;
+
+ /* load value from memory into TMP_REG_2 */
+ if (size == BPF_DW)
+ EMIT(PPC_RAW_LDARX(tmp2_reg, tmp1_reg, dst_reg, 0));
+ else
+ EMIT(PPC_RAW_LWARX(tmp2_reg, tmp1_reg, dst_reg, 0));
+ /* Save old value in _R0 */
+ if (imm & BPF_FETCH)
+ EMIT(PPC_RAW_MR(_R0, tmp2_reg));
+
+ switch (imm) {
+ case BPF_ADD:
+ case BPF_ADD | BPF_FETCH:
+ EMIT(PPC_RAW_ADD(tmp2_reg, tmp2_reg, src_reg));
+ break;
+ case BPF_AND:
+ case BPF_AND | BPF_FETCH:
+ EMIT(PPC_RAW_AND(tmp2_reg, tmp2_reg, src_reg));
+ break;
+ case BPF_OR:
+ case BPF_OR | BPF_FETCH:
+ EMIT(PPC_RAW_OR(tmp2_reg, tmp2_reg, src_reg));
+ break;
+ case BPF_XOR:
+ case BPF_XOR | BPF_FETCH:
+ EMIT(PPC_RAW_XOR(tmp2_reg, tmp2_reg, src_reg));
+ break;
+ case BPF_CMPXCHG:
+ /*
+ * Return old value in BPF_REG_0 for BPF_CMPXCHG &
+ * in src_reg for other cases.
+ */
+ ret_reg = bpf_to_ppc(BPF_REG_0);
+
+ /* Compare with old value in BPF_R0 */
+ if (size == BPF_DW)
+ EMIT(PPC_RAW_CMPD(bpf_to_ppc(BPF_REG_0), tmp2_reg));
+ else
+ EMIT(PPC_RAW_CMPW(bpf_to_ppc(BPF_REG_0), tmp2_reg));
+ /* Don't set if different from old value */
+ PPC_BCC_SHORT(COND_NE, (ctx->idx + 3) * 4);
+ fallthrough;
+ case BPF_XCHG:
+ save_reg = src_reg;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ /* store new value */
+ if (size == BPF_DW)
+ EMIT(PPC_RAW_STDCX(save_reg, tmp1_reg, dst_reg));
+ else
+ EMIT(PPC_RAW_STWCX(save_reg, tmp1_reg, dst_reg));
+ /* we're done if this succeeded */
+ PPC_BCC_SHORT(COND_NE, *tmp_idx * 4);
+ fixup_idx = ctx->idx;
+
+ if (imm & BPF_FETCH) {
+ /* Emit 'sync' to enforce full ordering */
+ if (IS_ENABLED(CONFIG_SMP))
+ EMIT(PPC_RAW_SYNC());
+ EMIT(PPC_RAW_MR(ret_reg, _R0));
+ /*
+ * Skip unnecessary zero-extension for 32-bit cmpxchg.
+ * For context, see commit 39491867ace5.
+ */
+ if (size != BPF_DW && imm == BPF_CMPXCHG &&
+ insn_is_zext(insn + 1))
+ *addrp = ctx->idx * 4;
+ }
+
+ *jmp_off = (fixup_idx - *tmp_idx) * 4;
+
+ return 0;
+}
+
+static int bpf_jit_emit_probe_mem_store(struct codegen_context *ctx, u32 src_reg, s16 off,
+ u32 code, u32 *image)
+{
+ u32 tmp1_reg = bpf_to_ppc(TMP_REG_1);
+ u32 tmp2_reg = bpf_to_ppc(TMP_REG_2);
+
+ switch (BPF_SIZE(code)) {
+ case BPF_B:
+ EMIT(PPC_RAW_STB(src_reg, tmp1_reg, off));
+ break;
+ case BPF_H:
+ EMIT(PPC_RAW_STH(src_reg, tmp1_reg, off));
+ break;
+ case BPF_W:
+ EMIT(PPC_RAW_STW(src_reg, tmp1_reg, off));
+ break;
+ case BPF_DW:
+ if (off % 4) {
+ EMIT(PPC_RAW_LI(tmp2_reg, off));
+ EMIT(PPC_RAW_STDX(src_reg, tmp1_reg, tmp2_reg));
+ } else {
+ EMIT(PPC_RAW_STD(src_reg, tmp1_reg, off));
+ }
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
static int emit_atomic_ld_st(const struct bpf_insn insn, struct codegen_context *ctx, u32 *image)
{
u32 code = insn.code;
@@ -494,7 +643,6 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code
u32 size = BPF_SIZE(code);
u32 tmp1_reg = bpf_to_ppc(TMP_REG_1);
u32 tmp2_reg = bpf_to_ppc(TMP_REG_2);
- u32 save_reg, ret_reg;
s16 off = insn[i].off;
s32 imm = insn[i].imm;
bool func_addr_fixed;
@@ -502,6 +650,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code
u64 imm64;
u32 true_cond;
u32 tmp_idx;
+ u32 jmp_off;
/*
* addrs[] maps a BPF bytecode address into a real offset from
@@ -768,6 +917,16 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code
*/
case BPF_ALU | BPF_MOV | BPF_X: /* (u32) dst = src */
case BPF_ALU64 | BPF_MOV | BPF_X: /* dst = src */
+
+ if (insn_is_cast_user(&insn[i])) {
+ EMIT(PPC_RAW_RLDICL_DOT(tmp1_reg, src_reg, 0, 32));
+ PPC_LI64(dst_reg, (ctx->user_vm_start & 0xffffffff00000000UL));
+ PPC_BCC_SHORT(COND_EQ, (ctx->idx + 2) * 4);
+ EMIT(PPC_RAW_OR(tmp1_reg, dst_reg, tmp1_reg));
+ EMIT(PPC_RAW_MR(dst_reg, tmp1_reg));
+ break;
+ }
+
if (imm == 1) {
/* special mov32 for zext */
EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 0, 0, 31));
@@ -960,6 +1119,76 @@ emit_clear:
}
break;
+ case BPF_STX | BPF_PROBE_MEM32 | BPF_B:
+ case BPF_STX | BPF_PROBE_MEM32 | BPF_H:
+ case BPF_STX | BPF_PROBE_MEM32 | BPF_W:
+ case BPF_STX | BPF_PROBE_MEM32 | BPF_DW:
+
+ EMIT(PPC_RAW_ADD(tmp1_reg, dst_reg, bpf_to_ppc(ARENA_VM_START)));
+
+ ret = bpf_jit_emit_probe_mem_store(ctx, src_reg, off, code, image);
+ if (ret)
+ return ret;
+
+ ret = bpf_add_extable_entry(fp, image, fimage, pass, ctx,
+ ctx->idx - 1, 4, -1, code);
+ if (ret)
+ return ret;
+
+ break;
+
+ case BPF_ST | BPF_PROBE_MEM32 | BPF_B:
+ case BPF_ST | BPF_PROBE_MEM32 | BPF_H:
+ case BPF_ST | BPF_PROBE_MEM32 | BPF_W:
+ case BPF_ST | BPF_PROBE_MEM32 | BPF_DW:
+
+ EMIT(PPC_RAW_ADD(tmp1_reg, dst_reg, bpf_to_ppc(ARENA_VM_START)));
+
+ if (BPF_SIZE(code) == BPF_W || BPF_SIZE(code) == BPF_DW) {
+ PPC_LI32(tmp2_reg, imm);
+ src_reg = tmp2_reg;
+ } else {
+ EMIT(PPC_RAW_LI(tmp2_reg, imm));
+ src_reg = tmp2_reg;
+ }
+
+ ret = bpf_jit_emit_probe_mem_store(ctx, src_reg, off, code, image);
+ if (ret)
+ return ret;
+
+ ret = bpf_add_extable_entry(fp, image, fimage, pass, ctx,
+ ctx->idx - 1, 4, -1, code);
+ if (ret)
+ return ret;
+
+ break;
+
+ /*
+ * BPF_STX PROBE_ATOMIC (arena atomic ops)
+ */
+ case BPF_STX | BPF_PROBE_ATOMIC | BPF_W:
+ case BPF_STX | BPF_PROBE_ATOMIC | BPF_DW:
+ EMIT(PPC_RAW_ADD(dst_reg, dst_reg, bpf_to_ppc(ARENA_VM_START)));
+ ret = bpf_jit_emit_atomic_ops(image, ctx, &insn[i],
+ &jmp_off, &tmp_idx, &addrs[i + 1]);
+ if (ret) {
+ if (ret == -EOPNOTSUPP) {
+ pr_err_ratelimited(
+ "eBPF filter atomic op code %02x (@%d) unsupported\n",
+ code, i);
+ }
+ return ret;
+ }
+ /* LDARX/LWARX should land here on exception. */
+ ret = bpf_add_extable_entry(fp, image, fimage, pass, ctx,
+ tmp_idx, jmp_off, dst_reg, code);
+ if (ret)
+ return ret;
+
+ /* Retrieve the dst_reg */
+ EMIT(PPC_RAW_SUB(dst_reg, dst_reg, bpf_to_ppc(ARENA_VM_START)));
+ break;
+
/*
* BPF_STX ATOMIC (atomic ops)
*/
@@ -982,93 +1211,15 @@ emit_clear:
return -EOPNOTSUPP;
}
- save_reg = tmp2_reg;
- ret_reg = src_reg;
-
- /* Get offset into TMP_REG_1 */
- EMIT(PPC_RAW_LI(tmp1_reg, off));
- /*
- * Enforce full ordering for operations with BPF_FETCH by emitting a 'sync'
- * before and after the operation.
- *
- * This is a requirement in the Linux Kernel Memory Model.
- * See __cmpxchg_u64() in asm/cmpxchg.h as an example.
- */
- if ((imm & BPF_FETCH) && IS_ENABLED(CONFIG_SMP))
- EMIT(PPC_RAW_SYNC());
- tmp_idx = ctx->idx * 4;
- /* load value from memory into TMP_REG_2 */
- if (size == BPF_DW)
- EMIT(PPC_RAW_LDARX(tmp2_reg, tmp1_reg, dst_reg, 0));
- else
- EMIT(PPC_RAW_LWARX(tmp2_reg, tmp1_reg, dst_reg, 0));
-
- /* Save old value in _R0 */
- if (imm & BPF_FETCH)
- EMIT(PPC_RAW_MR(_R0, tmp2_reg));
-
- switch (imm) {
- case BPF_ADD:
- case BPF_ADD | BPF_FETCH:
- EMIT(PPC_RAW_ADD(tmp2_reg, tmp2_reg, src_reg));
- break;
- case BPF_AND:
- case BPF_AND | BPF_FETCH:
- EMIT(PPC_RAW_AND(tmp2_reg, tmp2_reg, src_reg));
- break;
- case BPF_OR:
- case BPF_OR | BPF_FETCH:
- EMIT(PPC_RAW_OR(tmp2_reg, tmp2_reg, src_reg));
- break;
- case BPF_XOR:
- case BPF_XOR | BPF_FETCH:
- EMIT(PPC_RAW_XOR(tmp2_reg, tmp2_reg, src_reg));
- break;
- case BPF_CMPXCHG:
- /*
- * Return old value in BPF_REG_0 for BPF_CMPXCHG &
- * in src_reg for other cases.
- */
- ret_reg = bpf_to_ppc(BPF_REG_0);
-
- /* Compare with old value in BPF_R0 */
- if (size == BPF_DW)
- EMIT(PPC_RAW_CMPD(bpf_to_ppc(BPF_REG_0), tmp2_reg));
- else
- EMIT(PPC_RAW_CMPW(bpf_to_ppc(BPF_REG_0), tmp2_reg));
- /* Don't set if different from old value */
- PPC_BCC_SHORT(COND_NE, (ctx->idx + 3) * 4);
- fallthrough;
- case BPF_XCHG:
- save_reg = src_reg;
- break;
- default:
- pr_err_ratelimited(
- "eBPF filter atomic op code %02x (@%d) unsupported\n",
- code, i);
- return -EOPNOTSUPP;
- }
-
- /* store new value */
- if (size == BPF_DW)
- EMIT(PPC_RAW_STDCX(save_reg, tmp1_reg, dst_reg));
- else
- EMIT(PPC_RAW_STWCX(save_reg, tmp1_reg, dst_reg));
- /* we're done if this succeeded */
- PPC_BCC_SHORT(COND_NE, tmp_idx);
-
- if (imm & BPF_FETCH) {
- /* Emit 'sync' to enforce full ordering */
- if (IS_ENABLED(CONFIG_SMP))
- EMIT(PPC_RAW_SYNC());
- EMIT(PPC_RAW_MR(ret_reg, _R0));
- /*
- * Skip unnecessary zero-extension for 32-bit cmpxchg.
- * For context, see commit 39491867ace5.
- */
- if (size != BPF_DW && imm == BPF_CMPXCHG &&
- insn_is_zext(&insn[i + 1]))
- addrs[++i] = ctx->idx * 4;
+ ret = bpf_jit_emit_atomic_ops(image, ctx, &insn[i],
+ &jmp_off, &tmp_idx, &addrs[i + 1]);
+ if (ret) {
+ if (ret == -EOPNOTSUPP) {
+ pr_err_ratelimited(
+ "eBPF filter atomic op code %02x (@%d) unsupported\n",
+ code, i);
+ }
+ return ret;
}
break;
@@ -1112,9 +1263,10 @@ emit_clear:
* Check if 'off' is word aligned for BPF_DW, because
* we might generate two instructions.
*/
- if ((BPF_SIZE(code) == BPF_DW ||
- (BPF_SIZE(code) == BPF_B && BPF_MODE(code) == BPF_PROBE_MEMSX)) &&
- (off & 3))
+ if ((BPF_SIZE(code) == BPF_DW && (off & 3)) ||
+ (BPF_SIZE(code) == BPF_B &&
+ BPF_MODE(code) == BPF_PROBE_MEMSX) ||
+ (BPF_SIZE(code) == BPF_B && BPF_MODE(code) == BPF_MEMSX))
PPC_JMP((ctx->idx + 3) * 4);
else
PPC_JMP((ctx->idx + 2) * 4);
@@ -1160,12 +1312,49 @@ emit_clear:
if (BPF_MODE(code) == BPF_PROBE_MEM) {
ret = bpf_add_extable_entry(fp, image, fimage, pass, ctx,
- ctx->idx - 1, 4, dst_reg);
+ ctx->idx - 1, 4, dst_reg, code);
if (ret)
return ret;
}
break;
+ /* dst = *(u64 *)(ul) (src + ARENA_VM_START + off) */
+ case BPF_LDX | BPF_PROBE_MEM32 | BPF_B:
+ case BPF_LDX | BPF_PROBE_MEM32 | BPF_H:
+ case BPF_LDX | BPF_PROBE_MEM32 | BPF_W:
+ case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW:
+
+ EMIT(PPC_RAW_ADD(tmp1_reg, src_reg, bpf_to_ppc(ARENA_VM_START)));
+
+ switch (size) {
+ case BPF_B:
+ EMIT(PPC_RAW_LBZ(dst_reg, tmp1_reg, off));
+ break;
+ case BPF_H:
+ EMIT(PPC_RAW_LHZ(dst_reg, tmp1_reg, off));
+ break;
+ case BPF_W:
+ EMIT(PPC_RAW_LWZ(dst_reg, tmp1_reg, off));
+ break;
+ case BPF_DW:
+ if (off % 4) {
+ EMIT(PPC_RAW_LI(tmp2_reg, off));
+ EMIT(PPC_RAW_LDX(dst_reg, tmp1_reg, tmp2_reg));
+ } else {
+ EMIT(PPC_RAW_LD(dst_reg, tmp1_reg, off));
+ }
+ break;
+ }
+
+ if (size != BPF_DW && insn_is_zext(&insn[i + 1]))
+ addrs[++i] = ctx->idx * 4;
+
+ ret = bpf_add_extable_entry(fp, image, fimage, pass, ctx,
+ ctx->idx - 1, 4, dst_reg, code);
+ if (ret)
+ return ret;
+ break;
+
/*
* Doubleword load
* 16 byte instruction that uses two 'struct bpf_insn'
diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile
index 7f53fcb7495a..78dd7e25219e 100644
--- a/arch/powerpc/perf/Makefile
+++ b/arch/powerpc/perf/Makefile
@@ -14,7 +14,7 @@ obj-$(CONFIG_PPC_POWERNV) += imc-pmu.o
obj-$(CONFIG_FSL_EMB_PERF_EVENT) += core-fsl-emb.o
obj-$(CONFIG_FSL_EMB_PERF_EVENT_E500) += e500-pmu.o e6500-pmu.o
-obj-$(CONFIG_HV_PERF_CTRS) += hv-24x7.o hv-gpci.o hv-common.o
+obj-$(CONFIG_HV_PERF_CTRS) += hv-24x7.o hv-gpci.o hv-common.o vpa-dtl.o
obj-$(CONFIG_VPA_PMU) += vpa-pmu.o
diff --git a/arch/powerpc/perf/vpa-dtl.c b/arch/powerpc/perf/vpa-dtl.c
new file mode 100644
index 000000000000..3c1d1c28deb9
--- /dev/null
+++ b/arch/powerpc/perf/vpa-dtl.c
@@ -0,0 +1,596 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Perf interface to expose Dispatch Trace Log counters.
+ *
+ * Copyright (C) 2024 Kajol Jain, IBM Corporation
+ */
+
+#ifdef CONFIG_PPC_SPLPAR
+#define pr_fmt(fmt) "vpa_dtl: " fmt
+
+#include <asm/dtl.h>
+#include <linux/perf_event.h>
+#include <asm/plpar_wrappers.h>
+#include <linux/vmalloc.h>
+
+#define EVENT(_name, _code) enum{_name = _code}
+
+/*
+ * Based on the Power Architecture Platform Reference (PAPR) documentation,
+ * Table 14.14 "Per Virtual Processor Area", the Dispatch Trace Log (DTL)
+ * Enable Mask below is used to select the virtual processor dispatch and
+ * preempt traces to capture:
+ * DTL_CEDE(0x1): Trace voluntary (OS initiated) virtual
+ * processor waits
+ * DTL_PREEMPT(0x2): Trace time slice preempts
+ * DTL_FAULT(0x4): Trace virtual partition memory page
+ * faults.
+ * DTL_ALL(0x7): Trace all (DTL_CEDE | DTL_PREEMPT | DTL_FAULT)
+ *
+ * Event codes based on Dispatch Trace Log Enable Mask.
+ */
+EVENT(DTL_CEDE, 0x1);
+EVENT(DTL_PREEMPT, 0x2);
+EVENT(DTL_FAULT, 0x4);
+EVENT(DTL_ALL, 0x7);
+
+GENERIC_EVENT_ATTR(dtl_cede, DTL_CEDE);
+GENERIC_EVENT_ATTR(dtl_preempt, DTL_PREEMPT);
+GENERIC_EVENT_ATTR(dtl_fault, DTL_FAULT);
+GENERIC_EVENT_ATTR(dtl_all, DTL_ALL);
+
+PMU_FORMAT_ATTR(event, "config:0-7");
+
+static struct attribute *events_attr[] = {
+ GENERIC_EVENT_PTR(DTL_CEDE),
+ GENERIC_EVENT_PTR(DTL_PREEMPT),
+ GENERIC_EVENT_PTR(DTL_FAULT),
+ GENERIC_EVENT_PTR(DTL_ALL),
+ NULL
+};
+
+static struct attribute_group event_group = {
+ .name = "events",
+ .attrs = events_attr,
+};
+
+static struct attribute *format_attrs[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static const struct attribute_group format_group = {
+ .name = "format",
+ .attrs = format_attrs,
+};
+
+static const struct attribute_group *attr_groups[] = {
+ &format_group,
+ &event_group,
+ NULL,
+};
+
+struct vpa_dtl {
+ struct dtl_entry *buf;
+ u64 last_idx;
+};
+
+struct vpa_pmu_ctx {
+ struct perf_output_handle handle;
+};
+
+struct vpa_pmu_buf {
+ int nr_pages;
+ bool snapshot;
+ u64 *base;
+ u64 size;
+ u64 head;
+ u64 head_size;
+ /* boot timebase and frequency need to be saved only once */
+ int boottb_freq_saved;
+ u64 threshold;
+ bool full;
+};
+
+/*
+ * To correlate each DTL entry with other events across CPUs,
+ * we need to relate the timebase from "struct dtl_entry", which phyp
+ * provides, to the boot timebase. This also needs the timebase frequency.
+ * The formula is: ((timebase from DTL entry - boot timebase) / frequency)
+ *
+ * To match the size of "struct dtl_entry" and ease post-processing,
+ * the structure is padded by 24 bytes.
+ */
+struct boottb_freq {
+ u64 boot_tb;
+ u64 tb_freq;
+ u64 timebase;
+ u64 padded[3];
+};
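/*
 * A minimal sketch (not part of the patch) of how a post-processing tool
 * could apply the formula above to turn a raw DTL timebase into nanoseconds
 * since boot using the boottb_freq record. The helper name is hypothetical
 * and overflow handling is omitted for clarity.
 */
static inline u64 dtl_tb_to_ns(u64 entry_tb, const struct boottb_freq *bf)
{
	/* (timebase from DTL entry - boot timebase), scaled by the timebase frequency */
	return div64_u64((entry_tb - bf->boot_tb) * NSEC_PER_SEC, bf->tb_freq);
}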
+
+static DEFINE_PER_CPU(struct vpa_pmu_ctx, vpa_pmu_ctx);
+static DEFINE_PER_CPU(struct vpa_dtl, vpa_dtl_cpu);
+
+/* variable to capture reference count for the active dtl threads */
+static int dtl_global_refc;
+static spinlock_t dtl_global_lock = __SPIN_LOCK_UNLOCKED(dtl_global_lock);
+
+/*
+ * Capture DTL data in AUX buffer
+ */
+static void vpa_dtl_capture_aux(long *n_entries, struct vpa_pmu_buf *buf,
+ struct vpa_dtl *dtl, int index)
+{
+ struct dtl_entry *aux_copy_buf = (struct dtl_entry *)buf->base;
+
+ /*
+ * Check if there is enough space to contain the
+ * DTL data. If not, copy only as much data as fits
+ * in the remaining space and set full to true.
+ */
+ if (buf->head + *n_entries >= buf->threshold) {
+ *n_entries = buf->threshold - buf->head;
+ buf->full = 1;
+ }
+
+ /*
+ * Copy to AUX buffer from per-thread address
+ */
+ memcpy(aux_copy_buf + buf->head, &dtl->buf[index], *n_entries * sizeof(struct dtl_entry));
+
+ if (buf->full) {
+ /*
+ * Set head of the private aux to zero when the buffer is full
+ * so that the next data will be copied to the beginning of the
+ * buffer
+ */
+ buf->head = 0;
+ return;
+ }
+
+ buf->head += *n_entries;
+
+ return;
+}
+
+/*
+ * Function to dump the dispatch trace log buffer data to the
+ * perf data.
+ *
+ * perf_aux_output_begin: This function is called before writing
+ * to AUX area. This returns the pointer to aux area private structure,
+ * ie "struct vpa_pmu_buf" here which is set in setup_aux() function.
+ * The function obtains the output handle (used in perf_aux_output_end).
+ * When capture completes in vpa_dtl_capture_aux(), perf_aux_output_end()
+ * is called to commit the recorded data.
+ *
+ * perf_aux_output_end: This function commits data by adjusting the
+ * aux_head of "struct perf_buffer". aux_tail will be moved in perf tools
+ * side when writing the data from aux buffer to perf.data file in disk.
+ *
+ * In the private aux structure, we maintain a head to know where to
+ * copy data next time in the PMU driver. vpa_pmu_buf->head is moved to
+ * track the aux head for the PMU driver. It is the responsibility of the
+ * PMU driver to make sure data is copied between perf_aux_output_begin
+ * and perf_aux_output_end.
+ *
+ * After data is copied in vpa_dtl_capture_aux() function, perf_aux_output_end()
+ * is called to move the aux->head of "struct perf_buffer" to indicate size of
+ * data in aux buffer. This will post a PERF_RECORD_AUX into the perf buffer.
+ * Data will be written to disk only when the allocated buffer is full.
+ *
+ * By this approach, all the DTL data will be present as-is in the
+ * perf.data. The data will be pre-processed in perf tools side when doing
+ * perf report/perf script and this will avoid time taken to create samples
+ * in the kernel space.
+ */
+static void vpa_dtl_dump_sample_data(struct perf_event *event)
+{
+ u64 cur_idx, last_idx, i;
+ u64 boot_tb;
+ struct boottb_freq boottb_freq;
+
+ /* actual number of entries read */
+ long n_read = 0, read_size = 0;
+
+ /* number of entries added to dtl buffer */
+ long n_req;
+
+ struct vpa_pmu_ctx *vpa_ctx = this_cpu_ptr(&vpa_pmu_ctx);
+
+ struct vpa_pmu_buf *aux_buf;
+
+ struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, event->cpu);
+ u64 size;
+
+ cur_idx = be64_to_cpu(lppaca_of(event->cpu).dtl_idx);
+ last_idx = dtl->last_idx;
+
+ if (last_idx + N_DISPATCH_LOG <= cur_idx)
+ last_idx = cur_idx - N_DISPATCH_LOG + 1;
+
+ n_req = cur_idx - last_idx;
+
+ /* no new entry added to the buffer, return */
+ if (n_req <= 0)
+ return;
+
+ dtl->last_idx = last_idx + n_req;
+ boot_tb = get_boot_tb();
+
+ i = last_idx % N_DISPATCH_LOG;
+
+ aux_buf = perf_aux_output_begin(&vpa_ctx->handle, event);
+ if (!aux_buf) {
+ pr_debug("returning. no aux\n");
+ return;
+ }
+
+ if (!aux_buf->boottb_freq_saved) {
+ pr_debug("Copying boot tb to aux buffer: %lld\n", boot_tb);
+ /* Save boot_tb to convert the raw timebase to time relative to system boot */
+ boottb_freq.boot_tb = boot_tb;
+ /* Save tb_ticks_per_sec to convert timebase to sec */
+ boottb_freq.tb_freq = tb_ticks_per_sec;
+ boottb_freq.timebase = 0;
+ memcpy(aux_buf->base, &boottb_freq, sizeof(boottb_freq));
+ aux_buf->head += 1;
+ aux_buf->boottb_freq_saved = 1;
+ n_read += 1;
+ }
+
+ /* read the tail of the buffer if we've wrapped */
+ if (i + n_req > N_DISPATCH_LOG) {
+ read_size = N_DISPATCH_LOG - i;
+ vpa_dtl_capture_aux(&read_size, aux_buf, dtl, i);
+ n_req -= read_size;
+ n_read += read_size;
+ i = 0;
+ if (aux_buf->full) {
+ size = (n_read * sizeof(struct dtl_entry));
+ if ((size + aux_buf->head_size) > aux_buf->size) {
+ size = aux_buf->size - aux_buf->head_size;
+ perf_aux_output_end(&vpa_ctx->handle, size);
+ aux_buf->head = 0;
+ aux_buf->head_size = 0;
+ } else {
+ aux_buf->head_size += (n_read * sizeof(struct dtl_entry));
+ perf_aux_output_end(&vpa_ctx->handle, n_read * sizeof(struct dtl_entry));
+ }
+ goto out;
+ }
+ }
+
+ /* .. and now the head */
+ vpa_dtl_capture_aux(&n_req, aux_buf, dtl, i);
+
+ size = ((n_req + n_read) * sizeof(struct dtl_entry));
+ if ((size + aux_buf->head_size) > aux_buf->size) {
+ size = aux_buf->size - aux_buf->head_size;
+ perf_aux_output_end(&vpa_ctx->handle, size);
+ aux_buf->head = 0;
+ aux_buf->head_size = 0;
+ } else {
+ aux_buf->head_size += ((n_req + n_read) * sizeof(struct dtl_entry));
+ /* Move the aux->head to indicate size of data in aux buffer */
+ perf_aux_output_end(&vpa_ctx->handle, (n_req + n_read) * sizeof(struct dtl_entry));
+ }
+out:
+ aux_buf->full = 0;
+}
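/*
 * Minimal sketch of the begin/copy/end AUX flow implemented above, assuming
 * a single contiguous copy with no buffer wrap (the boot-timebase record and
 * error paths are omitted):
 *
 *	aux_buf = perf_aux_output_begin(&vpa_ctx->handle, event);
 *	if (aux_buf) {
 *		vpa_dtl_capture_aux(&n_req, aux_buf, dtl, i);
 *		perf_aux_output_end(&vpa_ctx->handle,
 *				    n_req * sizeof(struct dtl_entry));
 *	}
 */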
+
+/*
+ * The VPA Dispatch Trace log counters do not interrupt on overflow.
+ * Therefore, the kernel polls the counters with an hrtimer to avoid
+ * missing an overflow. The timer interval is based on the sample_period
+ * provided by the user, with a minimum interval of 1 millisecond.
+ */
+static enum hrtimer_restart vpa_dtl_hrtimer_handle(struct hrtimer *hrtimer)
+{
+ struct perf_event *event;
+ u64 period;
+
+ event = container_of(hrtimer, struct perf_event, hw.hrtimer);
+
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return HRTIMER_NORESTART;
+
+ vpa_dtl_dump_sample_data(event);
+ period = max_t(u64, NSEC_PER_MSEC, event->hw.sample_period);
+ hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+
+ return HRTIMER_RESTART;
+}
+
+static void vpa_dtl_start_hrtimer(struct perf_event *event)
+{
+ u64 period;
+ struct hw_perf_event *hwc = &event->hw;
+
+ period = max_t(u64, NSEC_PER_MSEC, hwc->sample_period);
+ hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), HRTIMER_MODE_REL_PINNED);
+}
+
+static void vpa_dtl_stop_hrtimer(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ hrtimer_cancel(&hwc->hrtimer);
+}
+
+static void vpa_dtl_reset_global_refc(struct perf_event *event)
+{
+ spin_lock(&dtl_global_lock);
+ dtl_global_refc--;
+ if (dtl_global_refc <= 0) {
+ dtl_global_refc = 0;
+ up_write(&dtl_access_lock);
+ }
+ spin_unlock(&dtl_global_lock);
+}
+
+static int vpa_dtl_mem_alloc(int cpu)
+{
+ struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, cpu);
+ struct dtl_entry *buf = NULL;
+
+ /* Check for dispatch trace log buffer cache */
+ if (!dtl_cache)
+ return -ENOMEM;
+
+ buf = kmem_cache_alloc_node(dtl_cache, GFP_KERNEL | GFP_ATOMIC, cpu_to_node(cpu));
+ if (!buf) {
+ pr_warn("buffer allocation failed for cpu %d\n", cpu);
+ return -ENOMEM;
+ }
+ dtl->buf = buf;
+ return 0;
+}
+
+static int vpa_dtl_event_init(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ /* test the event attr type for PMU enumeration */
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ if (!perfmon_capable())
+ return -EACCES;
+
+ /* Return if this is a counting event */
+ if (!is_sampling_event(event))
+ return -EOPNOTSUPP;
+
+ /* no branch sampling */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ /* Invalid eventcode */
+ switch (event->attr.config) {
+ case DTL_LOG_CEDE:
+ case DTL_LOG_PREEMPT:
+ case DTL_LOG_FAULT:
+ case DTL_LOG_ALL:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ spin_lock(&dtl_global_lock);
+
+ /*
+ * To ensure there are no other conflicting dtl users
+ * (example: /proc/powerpc/vcpudispatch_stats or debugfs dtl),
+ * the code below tries to take the dtl_access_lock.
+ * The dtl_access_lock is defined in dtl.h and is used
+ * to ensure there are no conflicting dtl users.
+ * The vpa_dtl pmu tries to take the write lock and also checks
+ * dtl_global_refc to make sure that the dtl_access_lock is
+ * held by the vpa_dtl pmu interface.
+ */
+ if (dtl_global_refc == 0 && !down_write_trylock(&dtl_access_lock)) {
+ spin_unlock(&dtl_global_lock);
+ return -EBUSY;
+ }
+
+ /* Allocate dtl buffer memory */
+ if (vpa_dtl_mem_alloc(event->cpu)) {
+ spin_unlock(&dtl_global_lock);
+ return -ENOMEM;
+ }
+
+ /*
+ * Increment the number of active vpa_dtl pmu threads. The
+ * dtl_global_refc is used to keep count of the cpu threads that
+ * are currently capturing dtl data using the vpa_dtl pmu interface.
+ */
+ dtl_global_refc++;
+
+ spin_unlock(&dtl_global_lock);
+
+ hrtimer_setup(&hwc->hrtimer, vpa_dtl_hrtimer_handle, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+
+ /*
+ * Since hrtimers have a fixed rate, we can do a static freq->period
+ * mapping and avoid the whole period adjust feedback stuff.
+ */
+ if (event->attr.freq) {
+ long freq = event->attr.sample_freq;
+
+ event->attr.sample_period = NSEC_PER_SEC / freq;
+ hwc->sample_period = event->attr.sample_period;
+ local64_set(&hwc->period_left, hwc->sample_period);
+ hwc->last_period = hwc->sample_period;
+ event->attr.freq = 0;
+ }
+
+ event->destroy = vpa_dtl_reset_global_refc;
+ return 0;
+}
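/*
 * Worked example of the static freq->period mapping above: with
 * event->attr.freq set and sample_freq = 1000, the period becomes
 * NSEC_PER_SEC / 1000 = 1,000,000 ns, so the hrtimer fires every
 * millisecond, which is also the minimum interval enforced by
 * vpa_dtl_start_hrtimer().
 */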
+
+static int vpa_dtl_event_add(struct perf_event *event, int flags)
+{
+ int ret, hwcpu;
+ unsigned long addr;
+ struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, event->cpu);
+
+ /*
+ * Register our dtl buffer with the hypervisor. The
+ * HV expects the buffer size to be passed in the second
+ * word of the buffer. Refer to section '14.11.3.2. H_REGISTER_VPA'
+ * of PAPR for more information.
+ */
+ ((u32 *)dtl->buf)[1] = cpu_to_be32(DISPATCH_LOG_BYTES);
+ dtl->last_idx = 0;
+
+ hwcpu = get_hard_smp_processor_id(event->cpu);
+ addr = __pa(dtl->buf);
+
+ ret = register_dtl(hwcpu, addr);
+ if (ret) {
+ pr_warn("DTL registration for cpu %d (hw %d) failed with %d\n",
+ event->cpu, hwcpu, ret);
+ return ret;
+ }
+
+ /* set our initial buffer indices */
+ lppaca_of(event->cpu).dtl_idx = 0;
+
+ /*
+ * Ensure that our updates to the lppaca fields have
+ * occurred before we actually enable the logging
+ */
+ smp_wmb();
+
+ /* enable event logging */
+ lppaca_of(event->cpu).dtl_enable_mask = event->attr.config;
+
+ vpa_dtl_start_hrtimer(event);
+
+ return 0;
+}
+
+static void vpa_dtl_event_del(struct perf_event *event, int flags)
+{
+ int hwcpu = get_hard_smp_processor_id(event->cpu);
+ struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, event->cpu);
+
+ vpa_dtl_stop_hrtimer(event);
+ unregister_dtl(hwcpu);
+ kmem_cache_free(dtl_cache, dtl->buf);
+ dtl->buf = NULL;
+ lppaca_of(event->cpu).dtl_enable_mask = 0x0;
+}
+
+/*
+ * This function definition is empty as vpa_dtl_dump_sample_data
+ * is used to parse and dump the dispatch trace log data
+ * to perf data.
+ */
+static void vpa_dtl_event_read(struct perf_event *event)
+{
+}
+
+/*
+ * Set up pmu-private data structures for an AUX area.
+ * **pages contains the aux buffer pages allocated for this event
+ * on the corresponding cpu. rb_alloc_aux uses "alloc_pages_node"
+ * and returns a pointer to each page. Map these pages to a
+ * contiguous space using vmap and use that as the base address.
+ *
+ * The aux private data structure, i.e. "struct vpa_pmu_buf", mainly
+ * saves:
+ * - buf->base: aux buffer base address
+ * - buf->head: offset from base address where data will be written to.
+ * - buf->size: Size of allocated memory
+ */
+static void *vpa_dtl_setup_aux(struct perf_event *event, void **pages,
+ int nr_pages, bool snapshot)
+{
+ int i, cpu = event->cpu;
+ struct vpa_pmu_buf *buf __free(kfree) = NULL;
+ struct page **pglist __free(kfree) = NULL;
+
+ /* We need at least one page for this to work. */
+ if (!nr_pages)
+ return NULL;
+
+ if (cpu == -1)
+ cpu = raw_smp_processor_id();
+
+ buf = kzalloc_node(sizeof(*buf), GFP_KERNEL, cpu_to_node(cpu));
+ if (!buf)
+ return NULL;
+
+ pglist = kcalloc(nr_pages, sizeof(*pglist), GFP_KERNEL);
+ if (!pglist)
+ return NULL;
+
+ for (i = 0; i < nr_pages; ++i)
+ pglist[i] = virt_to_page(pages[i]);
+
+ buf->base = vmap(pglist, nr_pages, VM_MAP, PAGE_KERNEL);
+ if (!buf->base)
+ return NULL;
+
+ buf->nr_pages = nr_pages;
+ buf->snapshot = false;
+
+ buf->size = nr_pages << PAGE_SHIFT;
+ buf->head = 0;
+ buf->head_size = 0;
+ buf->boottb_freq_saved = 0;
+ buf->threshold = ((buf->size - 32) / sizeof(struct dtl_entry));
+ return no_free_ptr(buf);
+}
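/*
 * Example of the threshold computed above, assuming 4 KiB pages and
 * sizeof(struct dtl_entry) == 48 bytes: with a 16-page (64 KiB) AUX area,
 * threshold = (65536 - 32) / 48 = 1364 entries, after which
 * vpa_dtl_capture_aux() wraps back to the start of the buffer.
 */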
+
+/*
+ * free pmu-private AUX data structures
+ */
+static void vpa_dtl_free_aux(void *aux)
+{
+ struct vpa_pmu_buf *buf = aux;
+
+ vunmap(buf->base);
+ kfree(buf);
+}
+
+static struct pmu vpa_dtl_pmu = {
+ .task_ctx_nr = perf_invalid_context,
+
+ .name = "vpa_dtl",
+ .attr_groups = attr_groups,
+ .event_init = vpa_dtl_event_init,
+ .add = vpa_dtl_event_add,
+ .del = vpa_dtl_event_del,
+ .read = vpa_dtl_event_read,
+ .setup_aux = vpa_dtl_setup_aux,
+ .free_aux = vpa_dtl_free_aux,
+ .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_EXCLUSIVE,
+};
+
+static int vpa_dtl_init(void)
+{
+ int r;
+
+ if (!firmware_has_feature(FW_FEATURE_SPLPAR)) {
+ pr_debug("not a shared virtualized system, not enabling\n");
+ return -ENODEV;
+ }
+
+ /* This driver is intended only for L1 host. */
+ if (is_kvm_guest()) {
+ pr_debug("Only supported for L1 host system\n");
+ return -ENODEV;
+ }
+
+ r = perf_pmu_register(&vpa_dtl_pmu, vpa_dtl_pmu.name, -1);
+ if (r)
+ return r;
+
+ return 0;
+}
+
+device_initcall(vpa_dtl_init);
+#endif //CONFIG_PPC_SPLPAR
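/*
 * Possible usage sketch, assuming matching support in the perf tool
 * (the exact options are illustrative):
 *
 *	perf record -e vpa_dtl/dtl_all/ -C 0 -- sleep 1
 *	perf report
 */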
diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig
index 35a1f4b9f827..fc79f8466933 100644
--- a/arch/powerpc/platforms/44x/Kconfig
+++ b/arch/powerpc/platforms/44x/Kconfig
@@ -231,7 +231,6 @@ config PPC4xx_GPIO
bool "PPC4xx GPIO support"
depends on 44x
select GPIOLIB
- select OF_GPIO_MM_GPIOCHIP
help
Enable gpiolib support for ppc440 based boards
diff --git a/arch/powerpc/platforms/44x/gpio.c b/arch/powerpc/platforms/44x/gpio.c
index 08ab76582568..aea0d913b59d 100644
--- a/arch/powerpc/platforms/44x/gpio.c
+++ b/arch/powerpc/platforms/44x/gpio.c
@@ -14,10 +14,10 @@
#include <linux/spinlock.h>
#include <linux/io.h>
#include <linux/of.h>
-#include <linux/gpio/legacy-of-mm-gpiochip.h>
#include <linux/gpio/driver.h>
#include <linux/types.h>
#include <linux/slab.h>
+#include <linux/platform_device.h>
#define GPIO_MASK(gpio) (0x80000000 >> (gpio))
#define GPIO_MASK2(gpio) (0xc0000000 >> ((gpio) * 2))
@@ -45,7 +45,8 @@ struct ppc4xx_gpio {
};
struct ppc4xx_gpio_chip {
- struct of_mm_gpio_chip mm_gc;
+ struct gpio_chip gc;
+ void __iomem *regs;
spinlock_t lock;
};
@@ -57,8 +58,8 @@ struct ppc4xx_gpio_chip {
static int ppc4xx_gpio_get(struct gpio_chip *gc, unsigned int gpio)
{
- struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc);
- struct ppc4xx_gpio __iomem *regs = mm_gc->regs;
+ struct ppc4xx_gpio_chip *chip = gpiochip_get_data(gc);
+ struct ppc4xx_gpio __iomem *regs = chip->regs;
return !!(in_be32(&regs->ir) & GPIO_MASK(gpio));
}
@@ -66,8 +67,8 @@ static int ppc4xx_gpio_get(struct gpio_chip *gc, unsigned int gpio)
static inline void
__ppc4xx_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val)
{
- struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc);
- struct ppc4xx_gpio __iomem *regs = mm_gc->regs;
+ struct ppc4xx_gpio_chip *chip = gpiochip_get_data(gc);
+ struct ppc4xx_gpio __iomem *regs = chip->regs;
if (val)
setbits32(&regs->or, GPIO_MASK(gpio));
@@ -93,9 +94,8 @@ static int ppc4xx_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val)
static int ppc4xx_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio)
{
- struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc);
struct ppc4xx_gpio_chip *chip = gpiochip_get_data(gc);
- struct ppc4xx_gpio __iomem *regs = mm_gc->regs;
+ struct ppc4xx_gpio __iomem *regs = chip->regs;
unsigned long flags;
spin_lock_irqsave(&chip->lock, flags);
@@ -123,9 +123,8 @@ static int ppc4xx_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio)
static int
ppc4xx_gpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val)
{
- struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc);
struct ppc4xx_gpio_chip *chip = gpiochip_get_data(gc);
- struct ppc4xx_gpio __iomem *regs = mm_gc->regs;
+ struct ppc4xx_gpio __iomem *regs = chip->regs;
unsigned long flags;
spin_lock_irqsave(&chip->lock, flags);
@@ -155,42 +154,57 @@ ppc4xx_gpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val)
return 0;
}
-static int __init ppc4xx_add_gpiochips(void)
+static int ppc4xx_gpio_probe(struct platform_device *ofdev)
{
- struct device_node *np;
-
- for_each_compatible_node(np, NULL, "ibm,ppc4xx-gpio") {
- int ret;
- struct ppc4xx_gpio_chip *ppc4xx_gc;
- struct of_mm_gpio_chip *mm_gc;
- struct gpio_chip *gc;
-
- ppc4xx_gc = kzalloc(sizeof(*ppc4xx_gc), GFP_KERNEL);
- if (!ppc4xx_gc) {
- ret = -ENOMEM;
- goto err;
- }
-
- spin_lock_init(&ppc4xx_gc->lock);
-
- mm_gc = &ppc4xx_gc->mm_gc;
- gc = &mm_gc->gc;
-
- gc->ngpio = 32;
- gc->direction_input = ppc4xx_gpio_dir_in;
- gc->direction_output = ppc4xx_gpio_dir_out;
- gc->get = ppc4xx_gpio_get;
- gc->set = ppc4xx_gpio_set;
-
- ret = of_mm_gpiochip_add_data(np, mm_gc, ppc4xx_gc);
- if (ret)
- goto err;
- continue;
-err:
- pr_err("%pOF: registration failed with status %d\n", np, ret);
- kfree(ppc4xx_gc);
- /* try others anyway */
- }
- return 0;
+ struct device *dev = &ofdev->dev;
+ struct device_node *np = dev->of_node;
+ struct ppc4xx_gpio_chip *chip;
+ struct gpio_chip *gc;
+
+ chip = devm_kzalloc(dev, sizeof(*chip), GFP_KERNEL);
+ if (!chip)
+ return -ENOMEM;
+
+ spin_lock_init(&chip->lock);
+
+ gc = &chip->gc;
+
+ gc->base = -1;
+ gc->ngpio = 32;
+ gc->direction_input = ppc4xx_gpio_dir_in;
+ gc->direction_output = ppc4xx_gpio_dir_out;
+ gc->get = ppc4xx_gpio_get;
+ gc->set = ppc4xx_gpio_set;
+
+ gc->label = devm_kasprintf(dev, GFP_KERNEL, "%pOF", np);
+ if (!gc->label)
+ return -ENOMEM;
+
+ chip->regs = devm_of_iomap(dev, np, 0, NULL);
+ if (IS_ERR(chip->regs))
+ return PTR_ERR(chip->regs);
+
+ return devm_gpiochip_add_data(dev, gc, chip);
+}
+
+static const struct of_device_id ppc4xx_gpio_match[] = {
+ {
+ .compatible = "ibm,ppc4xx-gpio",
+ },
+ {},
+};
+MODULE_DEVICE_TABLE(of, ppc4xx_gpio_match);
+
+static struct platform_driver ppc4xx_gpio_driver = {
+ .probe = ppc4xx_gpio_probe,
+ .driver = {
+ .name = "ppc4xx-gpio",
+ .of_match_table = ppc4xx_gpio_match,
+ },
+};
+
+static int __init ppc4xx_gpio_init(void)
+{
+ return platform_driver_register(&ppc4xx_gpio_driver);
}
-arch_initcall(ppc4xx_add_gpiochips);
+arch_initcall(ppc4xx_gpio_init);
diff --git a/arch/powerpc/platforms/8xx/Kconfig b/arch/powerpc/platforms/8xx/Kconfig
index 8623aebfac48..abb2b45b2789 100644
--- a/arch/powerpc/platforms/8xx/Kconfig
+++ b/arch/powerpc/platforms/8xx/Kconfig
@@ -101,7 +101,6 @@ comment "Generic MPC8xx Options"
config 8xx_GPIO
bool "GPIO API Support"
select GPIOLIB
- select OF_GPIO_MM_GPIOCHIP
help
Saying Y here will cause the ports on an MPC8xx processor to be used
with the GPIO API. If you say N here, the kernel needs less memory.
diff --git a/arch/powerpc/platforms/8xx/cpm1-ic.c b/arch/powerpc/platforms/8xx/cpm1-ic.c
index a49d4a9ab3bc..3292071e4da3 100644
--- a/arch/powerpc/platforms/8xx/cpm1-ic.c
+++ b/arch/powerpc/platforms/8xx/cpm1-ic.c
@@ -110,8 +110,7 @@ static int cpm_pic_probe(struct platform_device *pdev)
out_be32(&data->reg->cpic_cimr, 0);
- data->host = irq_domain_create_linear(of_fwnode_handle(dev->of_node),
- 64, &cpm_pic_host_ops, data);
+ data->host = irq_domain_create_linear(dev_fwnode(dev), 64, &cpm_pic_host_ops, data);
if (!data->host)
return -ENODEV;
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index fea3766eac0f..364eef32ddcc 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -243,7 +243,6 @@ config CPM2
select CPM
select HAVE_PCI
select GPIOLIB
- select OF_GPIO_MM_GPIOCHIP
help
The CPM2 (Communications Processor Module) is a coprocessor on
embedded CPUs made by Freescale. Selecting this option means that
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 613b383ed8b3..7b527d18aa5e 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -122,16 +122,11 @@ choice
If unsure, select Generic.
config POWERPC64_CPU
- bool "Generic (POWER5 and PowerPC 970 and above)"
- depends on PPC_BOOK3S_64 && !CPU_LITTLE_ENDIAN
- select PPC_64S_HASH_MMU
-
-config POWERPC64_CPU
- bool "Generic (POWER8 and above)"
- depends on PPC_BOOK3S_64 && CPU_LITTLE_ENDIAN
- select ARCH_HAS_FAST_MULTIPLIER
+ bool "Generic 64 bits powerpc"
+ depends on PPC_BOOK3S_64
+ select ARCH_HAS_FAST_MULTIPLIER if CPU_LITTLE_ENDIAN
select PPC_64S_HASH_MMU
- select PPC_HAS_LBARX_LHARX
+ select PPC_HAS_LBARX_LHARX if CPU_LITTLE_ENDIAN
config POWERPC_CPU
bool "Generic 32 bits powerpc"
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index d5a2c77bc908..ce839783c0df 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -1430,7 +1430,7 @@ static int spufs_mfc_open(struct inode *inode, struct file *file)
if (ctx->owner != current->mm)
return -EINVAL;
- if (atomic_read(&inode->i_count) != 1)
+ if (icount_read(inode) != 1)
return -EBUSY;
mutex_lock(&ctx->mapping_lock);
diff --git a/arch/powerpc/platforms/cell/spufs/syscalls.c b/arch/powerpc/platforms/cell/spufs/syscalls.c
index 157e046e6e93..ea4ba1b6ce6a 100644
--- a/arch/powerpc/platforms/cell/spufs/syscalls.c
+++ b/arch/powerpc/platforms/cell/spufs/syscalls.c
@@ -67,11 +67,11 @@ static long do_spu_create(const char __user *pathname, unsigned int flags,
struct dentry *dentry;
int ret;
- dentry = user_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY);
+ dentry = start_creating_user_path(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY);
ret = PTR_ERR(dentry);
if (!IS_ERR(dentry)) {
ret = spufs_create(&path, dentry, flags, mode, neighbor);
- done_path_create(&path, dentry);
+ end_creating_path(&path, dentry);
}
return ret;
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index 95d7ba73d43d..b5ad7c173ef0 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -9,6 +9,7 @@ config PPC_POWERNV
select PPC_P7_NAP
select FORCE_PCI
select PCI_MSI
+ select IRQ_MSI_LIB
select EPAPR_BOOT
select PPC_INDIRECT_PIO
select PPC_UDBG_16550
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index d8ccf2c9b98a..b0c1d9d16fb5 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -15,6 +15,7 @@
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/irq.h>
+#include <linux/irqchip/irq-msi-lib.h>
#include <linux/io.h>
#include <linux/msi.h>
#include <linux/iommu.h>
@@ -37,7 +38,6 @@
#include <asm/firmware.h>
#include <asm/pnv-pci.h>
#include <asm/mmzone.h>
-#include <asm/xive.h>
#include "powernv.h"
#include "pci.h"
@@ -1707,23 +1707,6 @@ static int __pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
return 0;
}
-/*
- * The msi_free() op is called before irq_domain_free_irqs_top() when
- * the handler data is still available. Use that to clear the XIVE
- * controller.
- */
-static void pnv_msi_ops_msi_free(struct irq_domain *domain,
- struct msi_domain_info *info,
- unsigned int irq)
-{
- if (xive_enabled())
- xive_irq_free_data(irq);
-}
-
-static struct msi_domain_ops pnv_pci_msi_domain_ops = {
- .msi_free = pnv_msi_ops_msi_free,
-};
-
static void pnv_msi_shutdown(struct irq_data *d)
{
d = d->parent_data;
@@ -1731,31 +1714,33 @@ static void pnv_msi_shutdown(struct irq_data *d)
d->chip->irq_shutdown(d);
}
-static void pnv_msi_mask(struct irq_data *d)
+static bool pnv_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
+ struct irq_domain *real_parent, struct msi_domain_info *info)
{
- pci_msi_mask_irq(d);
- irq_chip_mask_parent(d);
-}
+ struct irq_chip *chip = info->chip;
-static void pnv_msi_unmask(struct irq_data *d)
-{
- pci_msi_unmask_irq(d);
- irq_chip_unmask_parent(d);
-}
+ if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info))
+ return false;
-static struct irq_chip pnv_pci_msi_irq_chip = {
- .name = "PNV-PCI-MSI",
- .irq_shutdown = pnv_msi_shutdown,
- .irq_mask = pnv_msi_mask,
- .irq_unmask = pnv_msi_unmask,
- .irq_eoi = irq_chip_eoi_parent,
-};
+ chip->irq_shutdown = pnv_msi_shutdown;
+ return true;
+}
-static struct msi_domain_info pnv_msi_domain_info = {
- .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
- MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX),
- .ops = &pnv_pci_msi_domain_ops,
- .chip = &pnv_pci_msi_irq_chip,
+#define PNV_PCI_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \
+ MSI_FLAG_USE_DEF_CHIP_OPS | \
+ MSI_FLAG_PCI_MSI_MASK_PARENT)
+#define PNV_PCI_MSI_FLAGS_SUPPORTED (MSI_GENERIC_FLAGS_MASK | \
+ MSI_FLAG_PCI_MSIX | \
+ MSI_FLAG_MULTI_PCI_MSI)
+
+static const struct msi_parent_ops pnv_msi_parent_ops = {
+ .required_flags = PNV_PCI_MSI_FLAGS_REQUIRED,
+ .supported_flags = PNV_PCI_MSI_FLAGS_SUPPORTED,
+ .chip_flags = MSI_CHIP_FLAG_SET_EOI,
+ .bus_select_token = DOMAIN_BUS_NEXUS,
+ .bus_select_mask = MATCH_PCI_MSI,
+ .prefix = "PNV-",
+ .init_dev_msi_info = pnv_init_dev_msi_info,
};
static void pnv_msi_compose_msg(struct irq_data *d, struct msi_msg *msg)
@@ -1854,7 +1839,7 @@ static int pnv_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
return 0;
out:
- irq_domain_free_irqs_parent(domain, virq, i - 1);
+ irq_domain_free_irqs_parent(domain, virq, i);
msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, nr_irqs);
return ret;
}
@@ -1870,41 +1855,30 @@ static void pnv_irq_domain_free(struct irq_domain *domain, unsigned int virq,
virq, d->hwirq, nr_irqs);
msi_bitmap_free_hwirqs(&phb->msi_bmp, d->hwirq, nr_irqs);
- /* XIVE domain is cleared through ->msi_free() */
+ irq_domain_free_irqs_parent(domain, virq, nr_irqs);
}
static const struct irq_domain_ops pnv_irq_domain_ops = {
+ .select = msi_lib_irq_domain_select,
.alloc = pnv_irq_domain_alloc,
.free = pnv_irq_domain_free,
};
static int __init pnv_msi_allocate_domains(struct pci_controller *hose, unsigned int count)
{
- struct pnv_phb *phb = hose->private_data;
struct irq_domain *parent = irq_get_default_domain();
-
- hose->fwnode = irq_domain_alloc_named_id_fwnode("PNV-MSI", phb->opal_id);
- if (!hose->fwnode)
- return -ENOMEM;
-
- hose->dev_domain = irq_domain_create_hierarchy(parent, 0, count,
- hose->fwnode,
- &pnv_irq_domain_ops, hose);
+ struct irq_domain_info info = {
+ .fwnode = of_fwnode_handle(hose->dn),
+ .ops = &pnv_irq_domain_ops,
+ .host_data = hose,
+ .size = count,
+ .parent = parent,
+ };
+
+ hose->dev_domain = msi_create_parent_irq_domain(&info, &pnv_msi_parent_ops);
if (!hose->dev_domain) {
- pr_err("PCI: failed to create IRQ domain bridge %pOF (domain %d)\n",
- hose->dn, hose->global_number);
- irq_domain_free_fwnode(hose->fwnode);
- return -ENOMEM;
- }
-
- hose->msi_domain = pci_msi_create_irq_domain(of_fwnode_handle(hose->dn),
- &pnv_msi_domain_info,
- hose->dev_domain);
- if (!hose->msi_domain) {
pr_err("PCI: failed to create MSI IRQ domain bridge %pOF (domain %d)\n",
hose->dn, hose->global_number);
- irq_domain_free_fwnode(hose->fwnode);
- irq_domain_remove(hose->dev_domain);
return -ENOMEM;
}
diff --git a/arch/powerpc/platforms/powernv/subcore.h b/arch/powerpc/platforms/powernv/subcore.h
index 77feee8436d4..413fd85d9bc2 100644
--- a/arch/powerpc/platforms/powernv/subcore.h
+++ b/arch/powerpc/platforms/powernv/subcore.h
@@ -9,7 +9,7 @@
#define SYNC_STEP_REAL_MODE 2 /* Set by secondary when in real mode */
#define SYNC_STEP_FINISHED 3 /* Set by secondary when split/unsplit is done */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef CONFIG_SMP
void split_core_secondary_loop(u8 *state);
@@ -18,4 +18,4 @@ extern void update_subcore_sibling_mask(void);
static inline void update_subcore_sibling_mask(void) { }
#endif /* CONFIG_SMP */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index fa3c2fff082a..3e042218d6cd 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -7,6 +7,7 @@ config PPC_PSERIES
select OF_DYNAMIC
select FORCE_PCI
select PCI_MSI
+ select IRQ_MSI_LIB
select GENERIC_ALLOCATOR
select PPC_XICS
select PPC_XIVE_SPAPR
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
index 57222678bb3f..931ebaa474c8 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -5,6 +5,7 @@ obj-y := lpar.o hvCall.o nvram.o reconfig.o \
of_helpers.o rtas-work-area.o papr-sysparm.o \
papr-rtas-common.o papr-vpd.o papr-indices.o \
papr-platform-dump.o papr-phy-attest.o \
+ papr-hvpipe.o \
setup.o iommu.o event_sources.o ras.o \
firmware.o power.o dlpar.o mobility.o rng.o \
pci.o pci_dlpar.o eeh_pseries.o msi.o \
diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c
index 62bd8e2d5d4c..95fe802ccdfd 100644
--- a/arch/powerpc/platforms/pseries/mobility.c
+++ b/arch/powerpc/platforms/pseries/mobility.c
@@ -28,6 +28,7 @@
#include <asm/rtas.h>
#include "pseries.h"
#include "vas.h" /* vas_migration_handler() */
+#include "papr-hvpipe.h" /* hvpipe_migration_handler() */
#include "../../kernel/cacheinfo.h"
static struct kobject *mobility_kobj;
@@ -744,6 +745,7 @@ static int pseries_migrate_partition(u64 handle)
* by closing VAS windows at the beginning of this function.
*/
vas_migration_handler(VAS_SUSPEND);
+ hvpipe_migration_handler(HVPIPE_SUSPEND);
ret = wait_for_vasi_session_suspending(handle);
if (ret)
@@ -770,6 +772,7 @@ static int pseries_migrate_partition(u64 handle)
out:
vas_migration_handler(VAS_RESUME);
+ hvpipe_migration_handler(HVPIPE_RESUME);
return ret;
}
diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
index ee1c8c6898a3..825f9432e03d 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -7,6 +7,7 @@
#include <linux/crash_dump.h>
#include <linux/device.h>
#include <linux/irq.h>
+#include <linux/irqchip/irq-msi-lib.h>
#include <linux/irqdomain.h>
#include <linux/msi.h>
#include <linux/seq_file.h>
@@ -15,7 +16,6 @@
#include <asm/hw_irq.h>
#include <asm/ppc-pci.h>
#include <asm/machdep.h>
-#include <asm/xive.h>
#include "pseries.h"
@@ -430,43 +430,25 @@ again:
static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev,
int nvec, msi_alloc_info_t *arg)
{
+ struct msi_domain_info *info = domain->host_data;
struct pci_dev *pdev = to_pci_dev(dev);
- int type = pdev->msix_enabled ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI;
+ int type = (info->flags & MSI_FLAG_PCI_MSIX) ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI;
return rtas_prepare_msi_irqs(pdev, nvec, type, arg);
}
/*
- * ->msi_free() is called before irq_domain_free_irqs_top() when the
- * handler data is still available. Use that to clear the XIVE
- * controller data.
- */
-static void pseries_msi_ops_msi_free(struct irq_domain *domain,
- struct msi_domain_info *info,
- unsigned int irq)
-{
- if (xive_enabled())
- xive_irq_free_data(irq);
-}
-
-/*
* RTAS can not disable one MSI at a time. It's all or nothing. Do it
* at the end after all IRQs have been freed.
*/
-static void pseries_msi_post_free(struct irq_domain *domain, struct device *dev)
+static void pseries_msi_ops_teardown(struct irq_domain *domain, msi_alloc_info_t *arg)
{
- if (WARN_ON_ONCE(!dev_is_pci(dev)))
- return;
+ struct msi_desc *desc = arg->desc;
+ struct pci_dev *pdev = msi_desc_to_pci_dev(desc);
- rtas_disable_msi(to_pci_dev(dev));
+ rtas_disable_msi(pdev);
}
-static struct msi_domain_ops pseries_pci_msi_domain_ops = {
- .msi_prepare = pseries_msi_ops_prepare,
- .msi_free = pseries_msi_ops_msi_free,
- .msi_post_free = pseries_msi_post_free,
-};
-
static void pseries_msi_shutdown(struct irq_data *d)
{
d = d->parent_data;
@@ -474,18 +456,6 @@ static void pseries_msi_shutdown(struct irq_data *d)
d->chip->irq_shutdown(d);
}
-static void pseries_msi_mask(struct irq_data *d)
-{
- pci_msi_mask_irq(d);
- irq_chip_mask_parent(d);
-}
-
-static void pseries_msi_unmask(struct irq_data *d)
-{
- pci_msi_unmask_irq(d);
- irq_chip_unmask_parent(d);
-}
-
static void pseries_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
{
struct msi_desc *entry = irq_data_get_msi_desc(data);
@@ -500,27 +470,39 @@ static void pseries_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
entry->msg = *msg;
}
-static struct irq_chip pseries_pci_msi_irq_chip = {
- .name = "pSeries-PCI-MSI",
- .irq_shutdown = pseries_msi_shutdown,
- .irq_mask = pseries_msi_mask,
- .irq_unmask = pseries_msi_unmask,
- .irq_eoi = irq_chip_eoi_parent,
- .irq_write_msi_msg = pseries_msi_write_msg,
-};
+static bool pseries_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
+ struct irq_domain *real_parent, struct msi_domain_info *info)
+{
+ struct irq_chip *chip = info->chip;
+ if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info))
+ return false;
-/*
- * Set MSI_FLAG_MSIX_CONTIGUOUS as there is no way to express to
- * firmware to request a discontiguous or non-zero based range of
- * MSI-X entries. Core code will reject such setup attempts.
- */
-static struct msi_domain_info pseries_msi_domain_info = {
- .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
- MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX |
- MSI_FLAG_MSIX_CONTIGUOUS),
- .ops = &pseries_pci_msi_domain_ops,
- .chip = &pseries_pci_msi_irq_chip,
+ chip->irq_shutdown = pseries_msi_shutdown;
+ chip->irq_write_msi_msg = pseries_msi_write_msg;
+
+ info->ops->msi_prepare = pseries_msi_ops_prepare;
+ info->ops->msi_teardown = pseries_msi_ops_teardown;
+
+ return true;
+}
+
+#define PSERIES_PCI_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \
+ MSI_FLAG_USE_DEF_CHIP_OPS | \
+ MSI_FLAG_PCI_MSI_MASK_PARENT)
+#define PSERIES_PCI_MSI_FLAGS_SUPPORTED (MSI_GENERIC_FLAGS_MASK | \
+ MSI_FLAG_PCI_MSIX | \
+ MSI_FLAG_MSIX_CONTIGUOUS | \
+ MSI_FLAG_MULTI_PCI_MSI)
+
+static const struct msi_parent_ops pseries_msi_parent_ops = {
+ .required_flags = PSERIES_PCI_MSI_FLAGS_REQUIRED,
+ .supported_flags = PSERIES_PCI_MSI_FLAGS_SUPPORTED,
+ .chip_flags = MSI_CHIP_FLAG_SET_EOI,
+ .bus_select_token = DOMAIN_BUS_NEXUS,
+ .bus_select_mask = MATCH_PCI_MSI,
+ .prefix = "pSeries-",
+ .init_dev_msi_info = pseries_init_dev_msi_info,
};
static void pseries_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
@@ -593,7 +575,7 @@ static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq
out:
/* TODO: handle RTAS cleanup in ->msi_finish() ? */
- irq_domain_free_irqs_parent(domain, virq, i - 1);
+ irq_domain_free_irqs_parent(domain, virq, i);
return ret;
}
@@ -604,11 +586,11 @@ static void pseries_irq_domain_free(struct irq_domain *domain, unsigned int virq
struct pci_controller *phb = irq_data_get_irq_chip_data(d);
pr_debug("%s bridge %pOF %d #%d\n", __func__, phb->dn, virq, nr_irqs);
-
- /* XIVE domain data is cleared through ->msi_free() */
+ irq_domain_free_irqs_parent(domain, virq, nr_irqs);
}
static const struct irq_domain_ops pseries_irq_domain_ops = {
+ .select = msi_lib_irq_domain_select,
.alloc = pseries_irq_domain_alloc,
.free = pseries_irq_domain_free,
};
@@ -617,30 +599,18 @@ static int __pseries_msi_allocate_domains(struct pci_controller *phb,
unsigned int count)
{
struct irq_domain *parent = irq_get_default_domain();
-
- phb->fwnode = irq_domain_alloc_named_id_fwnode("pSeries-MSI",
- phb->global_number);
- if (!phb->fwnode)
- return -ENOMEM;
-
- phb->dev_domain = irq_domain_create_hierarchy(parent, 0, count,
- phb->fwnode,
- &pseries_irq_domain_ops, phb);
+ struct irq_domain_info info = {
+ .fwnode = of_fwnode_handle(phb->dn),
+ .ops = &pseries_irq_domain_ops,
+ .host_data = phb,
+ .size = count,
+ .parent = parent,
+ };
+
+ phb->dev_domain = msi_create_parent_irq_domain(&info, &pseries_msi_parent_ops);
if (!phb->dev_domain) {
- pr_err("PCI: failed to create IRQ domain bridge %pOF (domain %d)\n",
- phb->dn, phb->global_number);
- irq_domain_free_fwnode(phb->fwnode);
- return -ENOMEM;
- }
-
- phb->msi_domain = pci_msi_create_irq_domain(of_fwnode_handle(phb->dn),
- &pseries_msi_domain_info,
- phb->dev_domain);
- if (!phb->msi_domain) {
pr_err("PCI: failed to create MSI IRQ domain bridge %pOF (domain %d)\n",
phb->dn, phb->global_number);
- irq_domain_free_fwnode(phb->fwnode);
- irq_domain_remove(phb->dev_domain);
return -ENOMEM;
}
@@ -662,12 +632,8 @@ int pseries_msi_allocate_domains(struct pci_controller *phb)
void pseries_msi_free_domains(struct pci_controller *phb)
{
- if (phb->msi_domain)
- irq_domain_remove(phb->msi_domain);
if (phb->dev_domain)
irq_domain_remove(phb->dev_domain);
- if (phb->fwnode)
- irq_domain_free_fwnode(phb->fwnode);
}
static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev)
diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.c b/arch/powerpc/platforms/pseries/papr-hvpipe.c
new file mode 100644
index 000000000000..21a2f447c43f
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/papr-hvpipe.c
@@ -0,0 +1,818 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define pr_fmt(fmt) "papr-hvpipe: " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/delay.h>
+#include <linux/anon_inodes.h>
+#include <linux/miscdevice.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/of.h>
+#include <asm/machdep.h>
+#include <asm/rtas.h>
+#include <asm/rtas-work-area.h>
+#include <asm/papr-sysparm.h>
+#include <uapi/asm/papr-hvpipe.h>
+#include "pseries.h"
+#include "papr-hvpipe.h"
+
+static DEFINE_SPINLOCK(hvpipe_src_list_lock);
+static LIST_HEAD(hvpipe_src_list);
+
+static unsigned char hvpipe_ras_buf[RTAS_ERROR_LOG_MAX];
+static struct workqueue_struct *papr_hvpipe_wq;
+static struct work_struct *papr_hvpipe_work;
+static int hvpipe_check_exception_token;
+static bool hvpipe_feature;
+
+/*
+ * Newer PowerPC firmware provides support for partitions and
+ * various sources (ex: a remote hardware management console
+ * (HMC)) to exchange information through an inband hypervisor
+ * channel called HVPIPE. Only HMCs are supported right now; a
+ * partition can communicate with multiple HMCs, and each source
+ * is represented by a source ID.
+ *
+ * FW introduces send HVPIPE and recv HVPIPE RTAS calls for
+ * partitions to send and receive payloads respectively.
+ *
+ * These RTAS functions have the following requirements /
+ * limitations:
+ * - One hvpipe per partition for all sources.
+ * - The return status of send HVPIPE is assumed to mean the
+ *   payload was delivered to the source.
+ * - The return status of recv HVPIPE is assumed to act as an
+ *   ACK to the source.
+ * - An HVPIPE event message is generated when a payload is
+ *   ready for the partition. The hypervisor will not deliver
+ *   another event until the partition reads the previous
+ *   payload, which means the pipe is blocked for all sources.
+ *
+ * Linux implementation:
+ * Follow interfaces similar to the ones the OS provides for
+ * other RTAS calls, ex: /dev/papr-indices, /dev/papr-vpd, etc.
+ * - /dev/papr-hvpipe is available to user space.
+ * - devfd = open("/dev/papr-hvpipe", ..)
+ * - fd = ioctl(devfd, PAPR_HVPIPE_IOC_CREATE_HANDLE, &srcID)
+ *   - one FD per source
+ * - write(fd, buf, size) --> Issues the send HVPIPE RTAS call
+ *   and returns size on success, or the error corresponding to
+ *   the RTAS return code on failure.
+ * - poll(fd, ..) --> wakes up the FD when a payload is available
+ *   to read. The HVPIPE event message handler wakes up the FD
+ *   based on the source ID in the event message.
+ * - read(fd, buf, size) --> Issues the recv HVPIPE RTAS call
+ *   and returns size on success, or the error corresponding to
+ *   the RTAS return code on failure.
+ */
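An illustrative user-space sketch of the flow above (not part of this patch), assuming the uapi header is installed as <asm/papr-hvpipe.h> and exports PAPR_HVPIPE_IOC_CREATE_HANDLE and struct papr_hvpipe_hdr as used by the driver below; the source ID, payload size and error handling are placeholders:

#include <fcntl.h>
#include <poll.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <asm/papr-hvpipe.h>

int hvpipe_example(void)
{
	uint32_t src_id = 0x02000001;	/* hypothetical HMC source ID */
	char buf[4096] = { 0 };
	struct pollfd pfd;
	int devfd, fd;

	devfd = open("/dev/papr-hvpipe", O_RDWR);
	if (devfd < 0)
		return -1;

	/* One FD per source: ask the driver for a handle to this source */
	fd = ioctl(devfd, PAPR_HVPIPE_IOC_CREATE_HANDLE, &src_id);
	if (fd < 0)
		return -1;

	/* write() issues the send HVPIPE RTAS call (header + payload) */
	write(fd, buf, sizeof(struct papr_hvpipe_hdr) + 64);

	/* poll() blocks until the event handler reports a payload */
	pfd.fd = fd;
	pfd.events = POLLIN;
	poll(&pfd, 1, -1);

	/* read() issues the recv HVPIPE RTAS call and copies the payload */
	return read(fd, buf, sizeof(buf));
}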
+
+/*
+ * ibm,receive-hvpipe-msg RTAS call.
+ * @area: Caller-provided work area buffer for results.
+ * @srcID: Source ID returned by the RTAS call.
+ * @bytesw: Bytes written by RTAS call to @area.
+ */
+static int rtas_ibm_receive_hvpipe_msg(struct rtas_work_area *area,
+ u32 *srcID, u32 *bytesw)
+{
+ const s32 token = rtas_function_token(RTAS_FN_IBM_RECEIVE_HVPIPE_MSG);
+ u32 rets[2];
+ s32 fwrc;
+ int ret;
+
+ if (token == RTAS_UNKNOWN_SERVICE)
+ return -ENOENT;
+
+ do {
+ fwrc = rtas_call(token, 2, 3, rets,
+ rtas_work_area_phys(area),
+ rtas_work_area_size(area));
+
+ } while (rtas_busy_delay(fwrc));
+
+ switch (fwrc) {
+ case RTAS_SUCCESS:
+ *srcID = rets[0];
+ *bytesw = rets[1];
+ ret = 0;
+ break;
+ case RTAS_HARDWARE_ERROR:
+ ret = -EIO;
+ break;
+ case RTAS_INVALID_PARAMETER:
+ ret = -EINVAL;
+ break;
+ case RTAS_FUNC_NOT_SUPPORTED:
+ ret = -EOPNOTSUPP;
+ break;
+ default:
+ ret = -EIO;
+ pr_err_ratelimited("unexpected ibm,receive-hvpipe-msg status %d\n", fwrc);
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * ibm,send-hvpipe-msg RTAS call
+ * @area: Caller-provided work area buffer to send.
+ * @srcID: Target source for the send pipe message.
+ */
+static int rtas_ibm_send_hvpipe_msg(struct rtas_work_area *area, u32 srcID)
+{
+ const s32 token = rtas_function_token(RTAS_FN_IBM_SEND_HVPIPE_MSG);
+ s32 fwrc;
+ int ret;
+
+ if (token == RTAS_UNKNOWN_SERVICE)
+ return -ENOENT;
+
+ do {
+ fwrc = rtas_call(token, 2, 1, NULL, srcID,
+ rtas_work_area_phys(area));
+
+ } while (rtas_busy_delay(fwrc));
+
+ switch (fwrc) {
+ case RTAS_SUCCESS:
+ ret = 0;
+ break;
+ case RTAS_HARDWARE_ERROR:
+ ret = -EIO;
+ break;
+ case RTAS_INVALID_PARAMETER:
+ ret = -EINVAL;
+ break;
+ case RTAS_HVPIPE_CLOSED:
+ ret = -EPIPE;
+ break;
+ case RTAS_FUNC_NOT_SUPPORTED:
+ ret = -EOPNOTSUPP;
+ break;
+ default:
+ ret = -EIO;
+ pr_err_ratelimited("unexpected ibm,receive-hvpipe-msg status %d\n", fwrc);
+ break;
+ }
+
+ return ret;
+}
+
+static struct hvpipe_source_info *hvpipe_find_source(u32 srcID)
+{
+ struct hvpipe_source_info *src_info;
+
+ list_for_each_entry(src_info, &hvpipe_src_list, list)
+ if (src_info->srcID == srcID)
+ return src_info;
+
+ return NULL;
+}
+
+/*
+ * Collects the receive buffer with the recv HVPIPE RTAS call.
+ * Called from read(), and with a NULL @buf from release() and
+ * the workqueue handler to drain the pipe.
+ * @buf: User specified buffer to copy the payload returned by
+ * the recv HVPIPE RTAS call.
+ * @size: Size of the buffer the user passed.
+ */
+static int hvpipe_rtas_recv_msg(char __user *buf, int size)
+{
+ struct rtas_work_area *work_area;
+ u32 srcID, bytes_written;
+ int ret;
+
+ work_area = rtas_work_area_alloc(SZ_4K);
+ if (!work_area) {
+ pr_err("Could not allocate RTAS buffer for recv pipe\n");
+ return -ENOMEM;
+ }
+
+ ret = rtas_ibm_receive_hvpipe_msg(work_area, &srcID,
+ &bytes_written);
+ if (!ret) {
+ /*
+ * Recv HVPIPE RTAS was successful.
+ * When the FD is being released or no one is waiting on
+ * the specific source, this function is called with a
+ * NULL buf just to issue the recv HVPIPE RTAS call so
+ * that the pipe is not blocked.
+ */
+ if (buf) {
+ if (size < bytes_written) {
+ pr_err("Received the payload size = %d, but the buffer size = %d\n",
+ bytes_written, size);
+ bytes_written = size;
+ }
+ ret = copy_to_user(buf,
+ rtas_work_area_raw_buf(work_area),
+ bytes_written);
+ if (!ret)
+ ret = bytes_written;
+ }
+ } else {
+ pr_err("ibm,receive-hvpipe-msg failed with %d\n",
+ ret);
+ }
+
+ rtas_work_area_free(work_area);
+ return ret;
+}
+
+/*
+ * papr_hvpipe_handle_write - Issues the send HVPIPE RTAS call and
+ * returns size (payload + HVPIPE_HDR_LEN) on RTAS success.
+ * Otherwise returns the RTAS status to user space.
+ */
+static ssize_t papr_hvpipe_handle_write(struct file *file,
+ const char __user *buf, size_t size, loff_t *off)
+{
+ struct hvpipe_source_info *src_info = file->private_data;
+ struct rtas_work_area *work_area, *work_buf;
+ unsigned long ret, len;
+ __be64 *area_be;
+
+ /*
+ * Return -ENXIO during migration
+ */
+ if (!hvpipe_feature)
+ return -ENXIO;
+
+ if (!src_info)
+ return -EIO;
+
+ /*
+ * Send HVPIPE RTAS is used to send payload to the specific
+ * source with the input parameters source ID and the payload
+ * as buffer list. Each entry in the buffer list contains
+ * address/length pair of the buffer.
+ *
+ * The buffer list format is as follows:
+ *
+ * Header (length of address/length pairs and the header length)
+ * Address of 4K buffer 1
+ * Length of 4K buffer 1 used
+ * ...
+ * Address of 4K buffer n
+ * Length of 4K buffer n used
+ *
+ * See PAPR 7.3.32.2 ibm,send-hvpipe-msg
+ *
+ * Even though the interface can support a payload of up to
+ * 1MB, the hypervisor supports only 4048 bytes of payload at
+ * present and also just one address/length entry.
+ *
+ * A writev() interface can be added in the future when the
+ * hypervisor supports multiple buffer list entries.
+ */
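For illustration (not part of the patch), a hypothetical 100-byte payload is described by a work area whose first three 64-bit big-endian cells are laid out as below; the 24-byte header value counts the 8-byte header itself plus one 16-byte address/length pair, matching the len computation a few lines further down:

/*
 * area_be[0] = cpu_to_be64(24);    header + one addr/len pair
 * area_be[1] = cpu_to_be64(rtas_work_area_phys(work_buf));
 * area_be[2] = cpu_to_be64(100);   bytes of the 4K buffer actually used
 */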
+ /* HVPIPE_MAX_WRITE_BUFFER_SIZE = 4048 bytes */
+ if ((size > (HVPIPE_HDR_LEN + HVPIPE_MAX_WRITE_BUFFER_SIZE)) ||
+ (size <= HVPIPE_HDR_LEN))
+ return -EINVAL;
+
+ /*
+ * The length of (address + length) pair + the length of header
+ */
+ len = (2 * sizeof(u64)) + sizeof(u64);
+ size -= HVPIPE_HDR_LEN;
+ buf += HVPIPE_HDR_LEN;
+ mutex_lock(&rtas_ibm_send_hvpipe_msg_lock);
+ work_area = rtas_work_area_alloc(SZ_4K);
+ if (!work_area) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ area_be = (__be64 *)rtas_work_area_raw_buf(work_area);
+ /* header */
+ area_be[0] = cpu_to_be64(len);
+
+ work_buf = rtas_work_area_alloc(SZ_4K);
+ if (!work_buf) {
+ ret = -ENOMEM;
+ goto out_work;
+ }
+ /* First buffer address */
+ area_be[1] = cpu_to_be64(rtas_work_area_phys(work_buf));
+ /* First buffer address length */
+ area_be[2] = cpu_to_be64(size);
+
+ if (!copy_from_user(rtas_work_area_raw_buf(work_buf), buf, size)) {
+ ret = rtas_ibm_send_hvpipe_msg(work_area, src_info->srcID);
+ if (!ret)
+ ret = size + HVPIPE_HDR_LEN;
+ } else
+ ret = -EPERM;
+
+ rtas_work_area_free(work_buf);
+out_work:
+ rtas_work_area_free(work_area);
+out:
+ mutex_unlock(&rtas_ibm_send_hvpipe_msg_lock);
+ return ret;
+}
+
+/*
+ * papr_hvpipe_handle_read - If the payload for the specific
+ * source is pending in the hypervisor, issue recv HVPIPE RTAS
+ * and return the payload to the user space.
+ *
+ * When the payload is available for the partition, the
+ * hypervisor notifies HVPIPE event with the source ID
+ * and the event handler wakeup FD(s) that are waiting.
+ */
+static ssize_t papr_hvpipe_handle_read(struct file *file,
+ char __user *buf, size_t size, loff_t *off)
+{
+
+ struct hvpipe_source_info *src_info = file->private_data;
+ struct papr_hvpipe_hdr hdr;
+ long ret;
+
+ /*
+ * Return -ENXIO during migration
+ */
+ if (!hvpipe_feature)
+ return -ENXIO;
+
+ if (!src_info)
+ return -EIO;
+
+ /*
+ * Max payload is 4048 (HVPIPE_MAX_WRITE_BUFFER_SIZE)
+ */
+ if ((size > (HVPIPE_HDR_LEN + HVPIPE_MAX_WRITE_BUFFER_SIZE)) ||
+ (size < HVPIPE_HDR_LEN))
+ return -EINVAL;
+
+ /*
+ * No payload is available to receive and the source pipe
+ * has not been closed, so there is nothing to report.
+ */
+ if (!src_info->hvpipe_status)
+ return 0;
+
+ hdr.version = 0;
+ hdr.flags = 0;
+
+ /*
+ * If the hvpipe has a payload and the hypervisor has also
+ * closed the pipe to the source, retrieve the payload and
+ * return it to user space first, and then notify user space
+ * about the hvpipe close in the next read().
+ */
+ if (src_info->hvpipe_status & HVPIPE_MSG_AVAILABLE)
+ hdr.flags = HVPIPE_MSG_AVAILABLE;
+ else if (src_info->hvpipe_status & HVPIPE_LOST_CONNECTION)
+ hdr.flags = HVPIPE_LOST_CONNECTION;
+ else
+ /*
+ * Should not be here without one of the above
+ * flags set
+ */
+ return -EIO;
+
+ ret = copy_to_user(buf, &hdr, HVPIPE_HDR_LEN);
+ if (ret)
+ return ret;
+
+ /*
+ * Message event has payload, so get the payload with
+ * recv HVPIPE RTAS.
+ */
+ if (hdr.flags & HVPIPE_MSG_AVAILABLE) {
+ ret = hvpipe_rtas_recv_msg(buf + HVPIPE_HDR_LEN,
+ size - HVPIPE_HDR_LEN);
+ if (ret > 0) {
+ src_info->hvpipe_status &= ~HVPIPE_MSG_AVAILABLE;
+ ret += HVPIPE_HDR_LEN;
+ }
+ } else if (hdr.flags & HVPIPE_LOST_CONNECTION) {
+ /*
+ * Hypervisor is closing the pipe for the specific
+ * source. So notify user space.
+ */
+ src_info->hvpipe_status &= ~HVPIPE_LOST_CONNECTION;
+ ret = HVPIPE_HDR_LEN;
+ }
+
+ return ret;
+}
+
+/*
+ * User space waits here for a payload to arrive. The
+ * hypervisor sends an HVPIPE event message to the partition
+ * when a payload is available, and the event handler wakes up
+ * the FD that matches the source ID in the event message.
+ */
+static __poll_t papr_hvpipe_handle_poll(struct file *filp,
+ struct poll_table_struct *wait)
+{
+ struct hvpipe_source_info *src_info = filp->private_data;
+
+ /*
+ * HVPIPE is disabled during SUSPEND and enabled after migration.
+ * So return POLLRDHUP during migration
+ */
+ if (!hvpipe_feature)
+ return POLLRDHUP;
+
+ if (!src_info)
+ return POLLNVAL;
+
+ /*
+ * If hvpipe already has pending payload, return so that
+ * the user space can issue read().
+ */
+ if (src_info->hvpipe_status)
+ return POLLIN | POLLRDNORM;
+
+ /*
+ * Wait for the message event
+ * hvpipe_event_interrupt() wakes up this wait_queue
+ */
+ poll_wait(filp, &src_info->recv_wqh, wait);
+ if (src_info->hvpipe_status)
+ return POLLIN | POLLRDNORM;
+
+ return 0;
+}
+
+static int papr_hvpipe_handle_release(struct inode *inode,
+ struct file *file)
+{
+ struct hvpipe_source_info *src_info;
+
+ /*
+ * Hold the lock, remove source from src_list, reset the
+ * hvpipe status and release the lock to prevent any race
+ * with message event IRQ.
+ */
+ spin_lock(&hvpipe_src_list_lock);
+ src_info = file->private_data;
+ list_del(&src_info->list);
+ file->private_data = NULL;
+ /*
+ * If the pipe for this specific source has any pending
+ * payload, issue recv HVPIPE RTAS so that pipe will not
+ * be blocked.
+ */
+ if (src_info->hvpipe_status & HVPIPE_MSG_AVAILABLE) {
+ src_info->hvpipe_status = 0;
+ spin_unlock(&hvpipe_src_list_lock);
+ hvpipe_rtas_recv_msg(NULL, 0);
+ } else
+ spin_unlock(&hvpipe_src_list_lock);
+
+ kfree(src_info);
+ return 0;
+}
+
+static const struct file_operations papr_hvpipe_handle_ops = {
+ .read = papr_hvpipe_handle_read,
+ .write = papr_hvpipe_handle_write,
+ .release = papr_hvpipe_handle_release,
+ .poll = papr_hvpipe_handle_poll,
+};
+
+static int papr_hvpipe_dev_create_handle(u32 srcID)
+{
+ struct hvpipe_source_info *src_info;
+ struct file *file;
+ long err;
+ int fd;
+
+ spin_lock(&hvpipe_src_list_lock);
+ /*
+ * Do not allow more than one process to communicate with
+ * each source.
+ */
+ src_info = hvpipe_find_source(srcID);
+ if (src_info) {
+ spin_unlock(&hvpipe_src_list_lock);
+ pr_err("pid(%d) is already using the source(%d)\n",
+ src_info->tsk->pid, srcID);
+ return -EALREADY;
+ }
+ spin_unlock(&hvpipe_src_list_lock);
+
+ src_info = kzalloc(sizeof(*src_info), GFP_KERNEL_ACCOUNT);
+ if (!src_info)
+ return -ENOMEM;
+
+ src_info->srcID = srcID;
+ src_info->tsk = current;
+ init_waitqueue_head(&src_info->recv_wqh);
+
+ fd = get_unused_fd_flags(O_RDONLY | O_CLOEXEC);
+ if (fd < 0) {
+ err = fd;
+ goto free_buf;
+ }
+
+ file = anon_inode_getfile("[papr-hvpipe]",
+ &papr_hvpipe_handle_ops, (void *)src_info,
+ O_RDWR);
+ if (IS_ERR(file)) {
+ err = PTR_ERR(file);
+ goto free_fd;
+ }
+
+ spin_lock(&hvpipe_src_list_lock);
+ /*
+ * If two processes are executing ioctl() for the same
+ * source ID concurrently, prevent the second process from
+ * acquiring an FD.
+ */
+ if (hvpipe_find_source(srcID)) {
+ spin_unlock(&hvpipe_src_list_lock);
+ err = -EALREADY;
+ goto free_file;
+ }
+ list_add(&src_info->list, &hvpipe_src_list);
+ spin_unlock(&hvpipe_src_list_lock);
+
+ fd_install(fd, file);
+ return fd;
+
+free_file:
+ fput(file);
+free_fd:
+ put_unused_fd(fd);
+free_buf:
+ kfree(src_info);
+ return err;
+}
+
+/*
+ * Top-level ioctl handler for /dev/papr-hvpipe
+ *
+ * A separate FD is used for each source (ex: HMC), so the ioctl
+ * is called with a source ID and returns an FD.
+ */
+static long papr_hvpipe_dev_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
+{
+ u32 __user *argp = (void __user *)arg;
+ u32 srcID;
+ long ret;
+
+ /*
+ * Return -ENXIO during migration
+ */
+ if (!hvpipe_feature)
+ return -ENXIO;
+
+ if (get_user(srcID, argp))
+ return -EFAULT;
+
+ /*
+ * Support only HMC source right now
+ */
+ if (!(srcID & HVPIPE_HMC_ID_MASK))
+ return -EINVAL;
+
+ switch (ioctl) {
+ case PAPR_HVPIPE_IOC_CREATE_HANDLE:
+ ret = papr_hvpipe_dev_create_handle(srcID);
+ break;
+ default:
+ ret = -ENOIOCTLCMD;
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * papr_hvpipe_work_fn - called to issue recv HVPIPE RTAS for
+ * sources that are not monitored by user space so that the
+ * pipe will not be blocked.
+ */
+static void papr_hvpipe_work_fn(struct work_struct *work)
+{
+ hvpipe_rtas_recv_msg(NULL, 0);
+}
+
+/*
+ * HVPIPE event message IRQ handler.
+ * The hypervisor sends event IRQ if the partition has payload
+ * and generates another event only after payload is read with
+ * recv HVPIPE RTAS.
+ */
+static irqreturn_t hvpipe_event_interrupt(int irq, void *dev_id)
+{
+ struct hvpipe_event_buf *hvpipe_event;
+ struct pseries_errorlog *pseries_log;
+ struct hvpipe_source_info *src_info;
+ struct rtas_error_log *elog;
+ int rc;
+
+ rc = rtas_call(hvpipe_check_exception_token, 6, 1, NULL,
+ RTAS_VECTOR_EXTERNAL_INTERRUPT, virq_to_hw(irq),
+ RTAS_HVPIPE_MSG_EVENTS, 1, __pa(&hvpipe_ras_buf),
+ rtas_get_error_log_max());
+
+ if (rc != 0) {
+ pr_err_ratelimited("unexpected hvpipe-event-notification failed %d\n", rc);
+ return IRQ_HANDLED;
+ }
+
+ elog = (struct rtas_error_log *)hvpipe_ras_buf;
+ if (unlikely(rtas_error_type(elog) != RTAS_TYPE_HVPIPE)) {
+ pr_warn_ratelimited("Unexpected event type %d\n",
+ rtas_error_type(elog));
+ return IRQ_HANDLED;
+ }
+
+ pseries_log = get_pseries_errorlog(elog,
+ PSERIES_ELOG_SECT_ID_HVPIPE_EVENT);
+ hvpipe_event = (struct hvpipe_event_buf *)pseries_log->data;
+
+ /*
+ * The hypervisor notifies the partition when a payload is
+ * available to read with recv HVPIPE RTAS, and it will not
+ * generate another event for any source until the previous
+ * payload is read. This means the pipe is blocked in the
+ * hypervisor until the payload is read.
+ *
+ * If a user space process has this source open, update
+ * hvpipe_status and wake up the corresponding FD. The lock
+ * is held in case the user space process is releasing the FD
+ * instead of sitting in poll(), so that release() reads the
+ * payload to unblock the pipe before closing the FD.
+ *
+ * Otherwise (no user process is waiting for the payload),
+ * issue recv HVPIPE RTAS (papr_hvpipe_work_fn()) to unblock
+ * the pipe.
+ */
+ spin_lock(&hvpipe_src_list_lock);
+ src_info = hvpipe_find_source(be32_to_cpu(hvpipe_event->srcID));
+ if (src_info) {
+ u32 flags = 0;
+
+ if (hvpipe_event->event_type & HVPIPE_LOST_CONNECTION)
+ flags = HVPIPE_LOST_CONNECTION;
+ else if (hvpipe_event->event_type & HVPIPE_MSG_AVAILABLE)
+ flags = HVPIPE_MSG_AVAILABLE;
+
+ src_info->hvpipe_status |= flags;
+ wake_up(&src_info->recv_wqh);
+ spin_unlock(&hvpipe_src_list_lock);
+ } else {
+ spin_unlock(&hvpipe_src_list_lock);
+ /*
+ * User space is not waiting on this source, so execute
+ * the receive pipe RTAS call so that the pipe will not
+ * be blocked.
+ */
+ if (hvpipe_event->event_type & HVPIPE_MSG_AVAILABLE)
+ queue_work(papr_hvpipe_wq, papr_hvpipe_work);
+ }
+
+ return IRQ_HANDLED;
+}
+
+/*
+ * Enable or disable hvpipe via the system parameter set call
+ * with parameter token = 64 and 1 byte of buffer data:
+ * 0 = hvpipe not in use/disable
+ * 1 = hvpipe in use/enable
+ */
+static int set_hvpipe_sys_param(u8 val)
+{
+ struct papr_sysparm_buf *buf;
+ int ret;
+
+ buf = papr_sysparm_buf_alloc();
+ if (!buf)
+ return -ENOMEM;
+
+ buf->len = cpu_to_be16(1);
+ buf->val[0] = val;
+ ret = papr_sysparm_set(PAPR_SYSPARM_HVPIPE_ENABLE, buf);
+ if (ret)
+ pr_err("Can not enable hvpipe %d\n", ret);
+
+ papr_sysparm_buf_free(buf);
+
+ return ret;
+}
+
+static int __init enable_hvpipe_IRQ(void)
+{
+ struct device_node *np;
+
+ hvpipe_check_exception_token = rtas_function_token(RTAS_FN_CHECK_EXCEPTION);
+ if (hvpipe_check_exception_token == RTAS_UNKNOWN_SERVICE)
+ return -ENODEV;
+
+ /* hvpipe events */
+ np = of_find_node_by_path("/event-sources/ibm,hvpipe-msg-events");
+ if (np != NULL) {
+ request_event_sources_irqs(np, hvpipe_event_interrupt,
+ "HPIPE_EVENT");
+ of_node_put(np);
+ } else {
+ pr_err("Can not enable hvpipe event IRQ\n");
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+void hvpipe_migration_handler(int action)
+{
+ pr_info("hvpipe migration event %d\n", action);
+
+ /*
+ * HVPIPE is not used (Failed to create /dev/papr-hvpipe).
+ * So nothing to do for migration.
+ */
+ if (!papr_hvpipe_work)
+ return;
+
+ switch (action) {
+ case HVPIPE_SUSPEND:
+ if (hvpipe_feature) {
+ /*
+ * Disable hvpipe_feature for user space.
+ * It will be enabled again with the RESUME event.
+ */
+ hvpipe_feature = false;
+ /*
+ * set system parameter hvpipe 'disable'
+ */
+ set_hvpipe_sys_param(0);
+ }
+ break;
+ case HVPIPE_RESUME:
+ /*
+ * set system parameter hvpipe 'enable'
+ */
+ if (!set_hvpipe_sys_param(1))
+ hvpipe_feature = true;
+ else
+ pr_err("hvpipe is not enabled after migration\n");
+
+ break;
+ }
+}
+
+static const struct file_operations papr_hvpipe_ops = {
+ .unlocked_ioctl = papr_hvpipe_dev_ioctl,
+};
+
+static struct miscdevice papr_hvpipe_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "papr-hvpipe",
+ .fops = &papr_hvpipe_ops,
+};
+
+static int __init papr_hvpipe_init(void)
+{
+ int ret;
+
+ if (!of_find_property(rtas.dev, "ibm,hypervisor-pipe-capable",
+ NULL))
+ return -ENODEV;
+
+ if (!rtas_function_implemented(RTAS_FN_IBM_SEND_HVPIPE_MSG) ||
+ !rtas_function_implemented(RTAS_FN_IBM_RECEIVE_HVPIPE_MSG))
+ return -ENODEV;
+
+ papr_hvpipe_work = kzalloc(sizeof(struct work_struct), GFP_ATOMIC);
+ if (!papr_hvpipe_work)
+ return -ENOMEM;
+
+ INIT_WORK(papr_hvpipe_work, papr_hvpipe_work_fn);
+
+ papr_hvpipe_wq = alloc_ordered_workqueue("papr hvpipe workqueue", 0);
+ if (!papr_hvpipe_wq) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = enable_hvpipe_IRQ();
+ if (!ret) {
+ ret = set_hvpipe_sys_param(1);
+ if (!ret)
+ ret = misc_register(&papr_hvpipe_dev);
+ }
+
+ if (!ret) {
+ pr_info("hvpipe feature is enabled\n");
+ hvpipe_feature = true;
+ return 0;
+ }
+
+ pr_err("hvpipe feature is not enabled %d\n", ret);
+ destroy_workqueue(papr_hvpipe_wq);
+out:
+ kfree(papr_hvpipe_work);
+ papr_hvpipe_work = NULL;
+ return ret;
+}
+machine_device_initcall(pseries, papr_hvpipe_init);
diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.h b/arch/powerpc/platforms/pseries/papr-hvpipe.h
new file mode 100644
index 000000000000..c343f4230865
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/papr-hvpipe.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _PAPR_HVPIPE_H
+#define _PAPR_HVPIPE_H
+
+#define HVPIPE_HMC_ID_MASK 0x02000000 /* 02 - HMC, 00 - reserved, followed by HMC ID */
+#define HVPIPE_MAX_WRITE_BUFFER_SIZE 4048
+/*
+ * hvpipe specific RTAS return values
+ */
+#define RTAS_HVPIPE_CLOSED -4
+
+#define HVPIPE_HDR_LEN sizeof(struct papr_hvpipe_hdr)
+
+enum hvpipe_migrate_action {
+ HVPIPE_SUSPEND,
+ HVPIPE_RESUME,
+};
+
+struct hvpipe_source_info {
+ struct list_head list; /* list of sources */
+ u32 srcID;
+ u32 hvpipe_status;
+ wait_queue_head_t recv_wqh; /* wake up poll() waitq */
+ struct task_struct *tsk;
+};
+
+/*
+ * Source ID Format 0xCCRRQQQQ
+ * CC = source type (ex: 0x02 for HMC)
+ * RR = 0x00 (reserved)
+ * QQQQ = 0x0000 - 0xFFFF, the source index identifier
+ */
+struct hvpipe_event_buf {
+ __be32 srcID; /* Source ID */
+ u8 event_type; /* 0x01 for hvpipe message available */
+ /* from specified src ID */
+ /* 0x02 for loss of pipe connection */
+ /* with specified src ID */
+};
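To make the format above concrete (illustrative only, not part of the patch), a hypothetical HMC source ID of 0x02000003 decomposes as shown below; this is also why the ioctl handler in papr-hvpipe.c accepts an ID only when it matches HVPIPE_HMC_ID_MASK:

/* srcID = 0x02000003 (hypothetical)
 *   (srcID >> 24) & 0xff   = 0x02    source type: HMC
 *   (srcID >> 16) & 0xff   = 0x00    reserved
 *    srcID        & 0xffff = 0x0003  source index identifier
 *   srcID & HVPIPE_HMC_ID_MASK is non-zero, so the ioctl accepts it
 */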
+
+void hvpipe_migration_handler(int action);
+#endif /* _PAPR_HVPIPE_H */
diff --git a/arch/powerpc/sysdev/cpm_common.c b/arch/powerpc/sysdev/cpm_common.c
index f469f6a9f6e0..07ea605ab0e6 100644
--- a/arch/powerpc/sysdev/cpm_common.c
+++ b/arch/powerpc/sysdev/cpm_common.c
@@ -28,10 +28,6 @@
#include <mm/mmu_decl.h>
-#if defined(CONFIG_CPM2) || defined(CONFIG_8xx_GPIO)
-#include <linux/gpio/legacy-of-mm-gpiochip.h>
-#endif
-
static int __init cpm_init(void)
{
struct device_node *np;
@@ -91,32 +87,33 @@ void __init udbg_init_cpm(void)
#if defined(CONFIG_CPM2) || defined(CONFIG_8xx_GPIO)
+#include <linux/gpio/driver.h>
+
struct cpm2_ioports {
u32 dir, par, sor, odr, dat;
u32 res[3];
};
struct cpm2_gpio32_chip {
- struct of_mm_gpio_chip mm_gc;
+ struct gpio_chip gc;
+ void __iomem *regs;
spinlock_t lock;
/* shadowed data register to clear/set bits safely */
u32 cpdata;
};
-static void cpm2_gpio32_save_regs(struct of_mm_gpio_chip *mm_gc)
+static void cpm2_gpio32_save_regs(struct cpm2_gpio32_chip *cpm2_gc)
{
- struct cpm2_gpio32_chip *cpm2_gc =
- container_of(mm_gc, struct cpm2_gpio32_chip, mm_gc);
- struct cpm2_ioports __iomem *iop = mm_gc->regs;
+ struct cpm2_ioports __iomem *iop = cpm2_gc->regs;
cpm2_gc->cpdata = in_be32(&iop->dat);
}
static int cpm2_gpio32_get(struct gpio_chip *gc, unsigned int gpio)
{
- struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc);
- struct cpm2_ioports __iomem *iop = mm_gc->regs;
+ struct cpm2_gpio32_chip *cpm2_gc = gpiochip_get_data(gc);
+ struct cpm2_ioports __iomem *iop = cpm2_gc->regs;
u32 pin_mask;
pin_mask = 1 << (31 - gpio);
@@ -124,11 +121,9 @@ static int cpm2_gpio32_get(struct gpio_chip *gc, unsigned int gpio)
return !!(in_be32(&iop->dat) & pin_mask);
}
-static void __cpm2_gpio32_set(struct of_mm_gpio_chip *mm_gc, u32 pin_mask,
- int value)
+static void __cpm2_gpio32_set(struct cpm2_gpio32_chip *cpm2_gc, u32 pin_mask, int value)
{
- struct cpm2_gpio32_chip *cpm2_gc = gpiochip_get_data(&mm_gc->gc);
- struct cpm2_ioports __iomem *iop = mm_gc->regs;
+ struct cpm2_ioports __iomem *iop = cpm2_gc->regs;
if (value)
cpm2_gc->cpdata |= pin_mask;
@@ -140,14 +135,13 @@ static void __cpm2_gpio32_set(struct of_mm_gpio_chip *mm_gc, u32 pin_mask,
static int cpm2_gpio32_set(struct gpio_chip *gc, unsigned int gpio, int value)
{
- struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc);
struct cpm2_gpio32_chip *cpm2_gc = gpiochip_get_data(gc);
unsigned long flags;
u32 pin_mask = 1 << (31 - gpio);
spin_lock_irqsave(&cpm2_gc->lock, flags);
- __cpm2_gpio32_set(mm_gc, pin_mask, value);
+ __cpm2_gpio32_set(cpm2_gc, pin_mask, value);
spin_unlock_irqrestore(&cpm2_gc->lock, flags);
@@ -156,16 +150,15 @@ static int cpm2_gpio32_set(struct gpio_chip *gc, unsigned int gpio, int value)
static int cpm2_gpio32_dir_out(struct gpio_chip *gc, unsigned int gpio, int val)
{
- struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc);
struct cpm2_gpio32_chip *cpm2_gc = gpiochip_get_data(gc);
- struct cpm2_ioports __iomem *iop = mm_gc->regs;
+ struct cpm2_ioports __iomem *iop = cpm2_gc->regs;
unsigned long flags;
u32 pin_mask = 1 << (31 - gpio);
spin_lock_irqsave(&cpm2_gc->lock, flags);
setbits32(&iop->dir, pin_mask);
- __cpm2_gpio32_set(mm_gc, pin_mask, val);
+ __cpm2_gpio32_set(cpm2_gc, pin_mask, val);
spin_unlock_irqrestore(&cpm2_gc->lock, flags);
@@ -174,9 +167,8 @@ static int cpm2_gpio32_dir_out(struct gpio_chip *gc, unsigned int gpio, int val)
static int cpm2_gpio32_dir_in(struct gpio_chip *gc, unsigned int gpio)
{
- struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc);
struct cpm2_gpio32_chip *cpm2_gc = gpiochip_get_data(gc);
- struct cpm2_ioports __iomem *iop = mm_gc->regs;
+ struct cpm2_ioports __iomem *iop = cpm2_gc->regs;
unsigned long flags;
u32 pin_mask = 1 << (31 - gpio);
@@ -193,19 +185,17 @@ int cpm2_gpiochip_add32(struct device *dev)
{
struct device_node *np = dev->of_node;
struct cpm2_gpio32_chip *cpm2_gc;
- struct of_mm_gpio_chip *mm_gc;
struct gpio_chip *gc;
- cpm2_gc = kzalloc(sizeof(*cpm2_gc), GFP_KERNEL);
+ cpm2_gc = devm_kzalloc(dev, sizeof(*cpm2_gc), GFP_KERNEL);
if (!cpm2_gc)
return -ENOMEM;
spin_lock_init(&cpm2_gc->lock);
- mm_gc = &cpm2_gc->mm_gc;
- gc = &mm_gc->gc;
+ gc = &cpm2_gc->gc;
- mm_gc->save_regs = cpm2_gpio32_save_regs;
+ gc->base = -1;
gc->ngpio = 32;
gc->direction_input = cpm2_gpio32_dir_in;
gc->direction_output = cpm2_gpio32_dir_out;
@@ -214,6 +204,16 @@ int cpm2_gpiochip_add32(struct device *dev)
gc->parent = dev;
gc->owner = THIS_MODULE;
- return of_mm_gpiochip_add_data(np, mm_gc, cpm2_gc);
+ gc->label = devm_kasprintf(dev, GFP_KERNEL, "%pOF", np);
+ if (!gc->label)
+ return -ENOMEM;
+
+ cpm2_gc->regs = devm_of_iomap(dev, np, 0, NULL);
+ if (IS_ERR(cpm2_gc->regs))
+ return PTR_ERR(cpm2_gc->regs);
+
+ cpm2_gpio32_save_regs(cpm2_gc);
+
+ return devm_gpiochip_add_data(dev, gc, cpm2_gc);
}
#endif /* CONFIG_CPM2 || CONFIG_8xx_GPIO */
diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c
index 4fe8a7b1b288..2a007bfb038d 100644
--- a/arch/powerpc/sysdev/fsl_msi.c
+++ b/arch/powerpc/sysdev/fsl_msi.c
@@ -412,9 +412,8 @@ static int fsl_of_msi_probe(struct platform_device *dev)
}
platform_set_drvdata(dev, msi);
- msi->irqhost = irq_domain_create_linear(of_fwnode_handle(dev->dev.of_node),
- NR_MSI_IRQS_MAX, &fsl_msi_host_ops, msi);
-
+ msi->irqhost = irq_domain_create_linear(dev_fwnode(&dev->dev), NR_MSI_IRQS_MAX,
+ &fsl_msi_host_ops, msi);
if (msi->irqhost == NULL) {
dev_err(&dev->dev, "No memory for MSI irqhost\n");
err = -ENOMEM;
diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
index f10592405024..625361a15424 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -317,7 +317,7 @@ int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d)
if (d) {
char buffer[128];
- xive_irq_data_dump(irq_data_get_irq_handler_data(d),
+ xive_irq_data_dump(irq_data_get_irq_chip_data(d),
buffer, sizeof(buffer));
xmon_printf("%s", buffer);
}
@@ -437,7 +437,7 @@ static void xive_do_source_eoi(struct xive_irq_data *xd)
/* irq_chip eoi callback, called with irq descriptor lock held */
static void xive_irq_eoi(struct irq_data *d)
{
- struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+ struct xive_irq_data *xd = irq_data_get_irq_chip_data(d);
struct xive_cpu *xc = __this_cpu_read(xive_cpu);
DBG_VERBOSE("eoi_irq: irq=%d [0x%lx] pending=%02x\n",
@@ -595,7 +595,7 @@ static int xive_pick_irq_target(struct irq_data *d,
const struct cpumask *affinity)
{
static unsigned int fuzz;
- struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+ struct xive_irq_data *xd = irq_data_get_irq_chip_data(d);
cpumask_var_t mask;
int cpu = -1;
@@ -628,7 +628,7 @@ static int xive_pick_irq_target(struct irq_data *d,
static unsigned int xive_irq_startup(struct irq_data *d)
{
- struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+ struct xive_irq_data *xd = irq_data_get_irq_chip_data(d);
unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
int target, rc;
@@ -673,7 +673,7 @@ static unsigned int xive_irq_startup(struct irq_data *d)
/* called with irq descriptor lock held */
static void xive_irq_shutdown(struct irq_data *d)
{
- struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+ struct xive_irq_data *xd = irq_data_get_irq_chip_data(d);
unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
pr_debug("%s: irq %d [0x%x] data @%p\n", __func__, d->irq, hw_irq, d);
@@ -698,7 +698,7 @@ static void xive_irq_shutdown(struct irq_data *d)
static void xive_irq_unmask(struct irq_data *d)
{
- struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+ struct xive_irq_data *xd = irq_data_get_irq_chip_data(d);
pr_debug("%s: irq %d data @%p\n", __func__, d->irq, xd);
@@ -707,7 +707,7 @@ static void xive_irq_unmask(struct irq_data *d)
static void xive_irq_mask(struct irq_data *d)
{
- struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+ struct xive_irq_data *xd = irq_data_get_irq_chip_data(d);
pr_debug("%s: irq %d data @%p\n", __func__, d->irq, xd);
@@ -718,7 +718,7 @@ static int xive_irq_set_affinity(struct irq_data *d,
const struct cpumask *cpumask,
bool force)
{
- struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+ struct xive_irq_data *xd = irq_data_get_irq_chip_data(d);
unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
u32 target, old_target;
int rc = 0;
@@ -776,7 +776,7 @@ static int xive_irq_set_affinity(struct irq_data *d,
static int xive_irq_set_type(struct irq_data *d, unsigned int flow_type)
{
- struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+ struct xive_irq_data *xd = irq_data_get_irq_chip_data(d);
/*
* We only support these. This has really no effect other than setting
@@ -815,7 +815,7 @@ static int xive_irq_set_type(struct irq_data *d, unsigned int flow_type)
static int xive_irq_retrigger(struct irq_data *d)
{
- struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+ struct xive_irq_data *xd = irq_data_get_irq_chip_data(d);
/* This should be only for MSIs */
if (WARN_ON(xd->flags & XIVE_IRQ_FLAG_LSI))
@@ -837,7 +837,7 @@ static int xive_irq_retrigger(struct irq_data *d)
*/
static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
{
- struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+ struct xive_irq_data *xd = irq_data_get_irq_chip_data(d);
unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
int rc;
u8 pq;
@@ -951,7 +951,7 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
static int xive_get_irqchip_state(struct irq_data *data,
enum irqchip_irq_state which, bool *state)
{
- struct xive_irq_data *xd = irq_data_get_irq_handler_data(data);
+ struct xive_irq_data *xd = irq_data_get_irq_chip_data(data);
u8 pq;
switch (which) {
@@ -1011,21 +1011,20 @@ void xive_cleanup_irq_data(struct xive_irq_data *xd)
}
EXPORT_SYMBOL_GPL(xive_cleanup_irq_data);
-static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw)
+static struct xive_irq_data *xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw)
{
struct xive_irq_data *xd;
int rc;
xd = kzalloc(sizeof(struct xive_irq_data), GFP_KERNEL);
if (!xd)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
rc = xive_ops->populate_irq_data(hw, xd);
if (rc) {
kfree(xd);
- return rc;
+ return ERR_PTR(rc);
}
xd->target = XIVE_INVALID_TARGET;
- irq_set_handler_data(virq, xd);
/*
* Turn OFF by default the interrupt being mapped. A side
@@ -1036,20 +1035,19 @@ static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw)
*/
xive_esb_read(xd, XIVE_ESB_SET_PQ_01);
- return 0;
+ return xd;
}
-void xive_irq_free_data(unsigned int virq)
+static void xive_irq_free_data(unsigned int virq)
{
- struct xive_irq_data *xd = irq_get_handler_data(virq);
+ struct xive_irq_data *xd = irq_get_chip_data(virq);
if (!xd)
return;
- irq_set_handler_data(virq, NULL);
+ irq_set_chip_data(virq, NULL);
xive_cleanup_irq_data(xd);
kfree(xd);
}
-EXPORT_SYMBOL_GPL(xive_irq_free_data);
#ifdef CONFIG_SMP
@@ -1286,7 +1284,7 @@ void __init xive_smp_probe(void)
static int xive_irq_domain_map(struct irq_domain *h, unsigned int virq,
irq_hw_number_t hw)
{
- int rc;
+ struct xive_irq_data *xd;
/*
* Mark interrupts as edge sensitive by default so that resend
@@ -1294,11 +1292,12 @@ static int xive_irq_domain_map(struct irq_domain *h, unsigned int virq,
*/
irq_clear_status_flags(virq, IRQ_LEVEL);
- rc = xive_irq_alloc_data(virq, hw);
- if (rc)
- return rc;
+ xd = xive_irq_alloc_data(virq, hw);
+ if (IS_ERR(xd))
+ return PTR_ERR(xd);
irq_set_chip_and_handler(virq, &xive_irq_chip, handle_fasteoi_irq);
+ irq_set_chip_data(virq, xd);
return 0;
}
@@ -1366,7 +1365,7 @@ static void xive_irq_domain_debug_show(struct seq_file *m, struct irq_domain *d,
seq_printf(m, "%*sXIVE:\n", ind, "");
ind++;
- xd = irq_data_get_irq_handler_data(irqd);
+ xd = irq_data_get_irq_chip_data(irqd);
if (!xd) {
seq_printf(m, "%*snot assigned\n", ind, "");
return;
@@ -1403,6 +1402,7 @@ static int xive_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
unsigned int nr_irqs, void *arg)
{
struct irq_fwspec *fwspec = arg;
+ struct xive_irq_data *xd;
irq_hw_number_t hwirq;
unsigned int type = IRQ_TYPE_NONE;
int i, rc;
@@ -1423,12 +1423,11 @@ static int xive_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
irq_clear_status_flags(virq, IRQ_LEVEL);
/* allocates and sets handler data */
- rc = xive_irq_alloc_data(virq + i, hwirq + i);
- if (rc)
- return rc;
+ xd = xive_irq_alloc_data(virq + i, hwirq + i);
+ if (IS_ERR(xd))
+ return PTR_ERR(xd);
- irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
- &xive_irq_chip, domain->host_data);
+ irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i, &xive_irq_chip, xd);
irq_set_handler(virq + i, handle_fasteoi_irq);
}
@@ -1764,7 +1763,7 @@ static void xive_debug_show_irq(struct seq_file *m, struct irq_data *d)
seq_printf(m, "IRQ 0x%08x : target=0x%x prio=%02x lirq=0x%x ",
hw_irq, target, prio, lirq);
- xive_irq_data_dump(irq_data_get_irq_handler_data(d), buffer, sizeof(buffer));
+ xive_irq_data_dump(irq_data_get_irq_chip_data(d), buffer, sizeof(buffer));
seq_puts(m, buffer);
seq_puts(m, "\n");
}
diff --git a/arch/powerpc/xmon/ppc-opc.c b/arch/powerpc/xmon/ppc-opc.c
index 0774d711453e..de9b4236728c 100644
--- a/arch/powerpc/xmon/ppc-opc.c
+++ b/arch/powerpc/xmon/ppc-opc.c
@@ -954,8 +954,7 @@ const struct powerpc_operand powerpc_operands[] =
{ 0xff, 11, NULL, NULL, PPC_OPERAND_SIGNOPT },
};
-const unsigned int num_powerpc_operands = (sizeof (powerpc_operands)
- / sizeof (powerpc_operands[0]));
+const unsigned int num_powerpc_operands = ARRAY_SIZE(powerpc_operands);
/* The functions used to insert and extract complicated operands. */
@@ -6968,9 +6967,8 @@ const struct powerpc_opcode powerpc_opcodes[] = {
{"fcfidu.", XRC(63,974,1), XRA_MASK, POWER7|PPCA2, PPCVLE, {FRT, FRB}},
};
-const int powerpc_num_opcodes =
- sizeof (powerpc_opcodes) / sizeof (powerpc_opcodes[0]);
-
+const int powerpc_num_opcodes = ARRAY_SIZE(powerpc_opcodes);
+
/* The VLE opcode table.
The format of this opcode table is the same as the main opcode table. */
@@ -7207,9 +7205,8 @@ const struct powerpc_opcode vle_opcodes[] = {
{"se_bl", BD8(58,0,1), BD8_MASK, PPCVLE, 0, {B8}},
};
-const int vle_num_opcodes =
- sizeof (vle_opcodes) / sizeof (vle_opcodes[0]);
-
+const int vle_num_opcodes = ARRAY_SIZE(vle_opcodes);
+
/* The macro table. This is only used by the assembler. */
/* The expressions of the form (-x ! 31) & (x | 31) have the value 0
@@ -7276,5 +7273,4 @@ const struct powerpc_macro powerpc_macros[] = {
{"e_clrlslwi",4, PPCVLE, "e_rlwinm %0,%1,%3,(%2)-(%3),31-(%3)"},
};
-const int powerpc_num_macros =
- sizeof (powerpc_macros) / sizeof (powerpc_macros[0]);
+const int powerpc_num_macros = ARRAY_SIZE(powerpc_macros);
diff --git a/arch/powerpc/xmon/xmon_bpts.h b/arch/powerpc/xmon/xmon_bpts.h
index 377068f52edb..e14e4fb862e0 100644
--- a/arch/powerpc/xmon/xmon_bpts.h
+++ b/arch/powerpc/xmon/xmon_bpts.h
@@ -3,12 +3,12 @@
#define XMON_BPTS_H
#define NBPTS 256
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/inst.h>
#define BPT_SIZE (sizeof(ppc_inst_t) * 2)
#define BPT_WORDS (BPT_SIZE / sizeof(ppc_inst_t))
extern unsigned int bpt_table[NBPTS * BPT_WORDS];
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* XMON_BPTS_H */
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index a4b233a0659e..1972910777c7 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -28,6 +28,7 @@ config RISCV
select ARCH_HAS_DEBUG_VIRTUAL if MMU
select ARCH_HAS_DEBUG_VM_PGTABLE
select ARCH_HAS_DEBUG_WX
+ select ARCH_HAS_ELF_CORE_EFLAGS
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
@@ -53,6 +54,7 @@ config RISCV
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAS_UBSAN
select ARCH_HAS_VDSO_ARCH_DATA if GENERIC_VDSO_DATA_STORE
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_KEEP_MEMBLOCK if ACPI
select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE if 64BIT && MMU
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
@@ -60,23 +62,24 @@ config RISCV
select ARCH_STACKWALK
select ARCH_SUPPORTS_ATOMIC_RMW
# clang >= 17: https://github.com/llvm/llvm-project/commit/62fa708ceb027713b386c7e0efda994f8bdc27e2
- select ARCH_SUPPORTS_CFI_CLANG if CLANG_VERSION >= 170000
+ select ARCH_SUPPORTS_CFI if (!CC_IS_CLANG || CLANG_VERSION >= 170000)
select ARCH_SUPPORTS_DEBUG_PAGEALLOC if MMU
select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE
select ARCH_SUPPORTS_HUGETLBFS if MMU
# LLD >= 14: https://github.com/llvm/llvm-project/issues/50505
- select ARCH_SUPPORTS_LTO_CLANG if LLD_VERSION >= 140000
+ select ARCH_SUPPORTS_LTO_CLANG if LLD_VERSION >= 140000 && CMODEL_MEDANY
select ARCH_SUPPORTS_LTO_CLANG_THIN if LLD_VERSION >= 140000
select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS if 64BIT && MMU
select ARCH_SUPPORTS_PAGE_TABLE_CHECK if MMU
select ARCH_SUPPORTS_PER_VMA_LOCK if MMU
select ARCH_SUPPORTS_RT
select ARCH_SUPPORTS_SHADOW_CALL_STACK if HAVE_SHADOW_CALL_STACK
+ select ARCH_SUPPORTS_SCHED_MC if SMP
select ARCH_USE_CMPXCHG_LOCKREF if 64BIT
select ARCH_USE_MEMTEST
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_SYM_ANNOTATIONS
- select ARCH_USES_CFI_TRAPS if CFI_CLANG
+ select ARCH_USES_CFI_TRAPS if CFI
select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if MMU
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
select ARCH_WANT_FRAME_POINTERS
@@ -154,13 +157,14 @@ config RISCV
select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && MMU && (CLANG_SUPPORTS_DYNAMIC_FTRACE || GCC_SUPPORTS_DYNAMIC_FTRACE)
select FUNCTION_ALIGNMENT_4B if HAVE_DYNAMIC_FTRACE && RISCV_ISA_C
select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS if HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS
- select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if (DYNAMIC_FTRACE_WITH_ARGS && !CFI_CLANG)
+ select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if (DYNAMIC_FTRACE_WITH_ARGS && !CFI)
select HAVE_DYNAMIC_FTRACE_WITH_ARGS if HAVE_DYNAMIC_FTRACE
select HAVE_FTRACE_GRAPH_FUNC
select HAVE_FUNCTION_GRAPH_TRACER if HAVE_DYNAMIC_FTRACE_WITH_ARGS
select HAVE_FUNCTION_GRAPH_FREGS
select HAVE_FUNCTION_TRACER if !XIP_KERNEL && HAVE_DYNAMIC_FTRACE
select HAVE_EBPF_JIT if MMU
+ select HAVE_GENERIC_TIF_BITS
select HAVE_GUP_FAST if MMU
select HAVE_FUNCTION_ARG_ACCESS_API
select HAVE_FUNCTION_ERROR_INJECTION
@@ -453,14 +457,6 @@ config SMP
If you don't know what to do here, say N.
-config SCHED_MC
- bool "Multi-core scheduler support"
- depends on SMP
- help
- Multi-core scheduler support improves the CPU scheduler's decision
- making when dealing with multi-core CPU chips at a cost of slightly
- increased overhead in some places. If unsure say N here.
-
config NR_CPUS
int "Maximum number of CPUs (2-512)"
depends on SMP
diff --git a/arch/riscv/Kconfig.errata b/arch/riscv/Kconfig.errata
index e318119d570d..aca9b0cfcfec 100644
--- a/arch/riscv/Kconfig.errata
+++ b/arch/riscv/Kconfig.errata
@@ -21,6 +21,29 @@ config ERRATA_ANDES_CMO
If you don't know what to do here, say "Y".
+config ERRATA_MIPS
+ bool "MIPS errata"
+ depends on RISCV_ALTERNATIVE
+ help
+ All MIPS errata Kconfig depend on this Kconfig. Disabling
+ this Kconfig will disable all MIPS errata. Please say "Y"
+ here if your platform uses MIPS CPU cores.
+
+ Otherwise, please say "N" here to avoid unnecessary overhead.
+
+config ERRATA_MIPS_P8700_PAUSE_OPCODE
+ bool "Fix the PAUSE Opcode for MIPS P8700"
+ depends on ERRATA_MIPS && 64BIT
+ default n
+ help
+ The RISCV MIPS P8700 uses a different opcode for PAUSE.
+ It is a 'hint' encoding of the SLLI instruction,
+ with rd=0, rs1=0 and imm=5. It will behave as a NOP
+ instruction if no additional behavior beyond that of
+ SLLI is implemented.
+
+ If you are not using the P8700 processor, say n.
+
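As a worked example of the encoding this help text describes (derived here from the standard RISC-V I-type instruction format, not taken from the patch): SLLI uses major opcode OP-IMM (0x13) with funct3 = 0x1, so rd = 0, rs1 = 0 and imm = 5 encode to (5 << 20) | (0x1 << 12) | 0x13 = 0x00501013 for the P8700 pause hint, whereas the standard Zihintpause PAUSE (a FENCE with pred = W, succ = 0) encodes to 0x0100000F.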
config ERRATA_SIFIVE
bool "SiFive errata"
depends on RISCV_ALTERNATIVE
diff --git a/arch/riscv/Kconfig.vendor b/arch/riscv/Kconfig.vendor
index e14f26368963..3c1f92e406c3 100644
--- a/arch/riscv/Kconfig.vendor
+++ b/arch/riscv/Kconfig.vendor
@@ -16,6 +16,19 @@ config RISCV_ISA_VENDOR_EXT_ANDES
If you don't know what to do here, say Y.
endmenu
+menu "MIPS"
+config RISCV_ISA_VENDOR_EXT_MIPS
+ bool "MIPS vendor extension support"
+ select RISCV_ISA_VENDOR_EXT
+ default y
+ help
+ Say N here to disable detection of and support for all MIPS vendor
+ extensions. Without this option enabled, MIPS vendor extensions will
+ not be detected at boot and their presence not reported to userspace.
+
+ If you don't know what to do here, say Y.
+endmenu
+
menu "SiFive"
config RISCV_ISA_VENDOR_EXT_SIFIVE
bool "SiFive vendor extension support"
diff --git a/arch/riscv/boot/dts/allwinner/sun20i-d1-devterm-v3.14.dts b/arch/riscv/boot/dts/allwinner/sun20i-d1-devterm-v3.14.dts
index bc5c84f22762..5f2e5cc3e3d5 100644
--- a/arch/riscv/boot/dts/allwinner/sun20i-d1-devterm-v3.14.dts
+++ b/arch/riscv/boot/dts/allwinner/sun20i-d1-devterm-v3.14.dts
@@ -17,7 +17,7 @@
#cooling-cells = <2>;
};
- i2c-gpio-0 {
+ i2c-0 {
compatible = "i2c-gpio";
sda-gpios = <&pio 3 14 (GPIO_ACTIVE_HIGH|GPIO_OPEN_DRAIN)>; /* PD14/GPIO44 */
scl-gpios = <&pio 3 15 (GPIO_ACTIVE_HIGH|GPIO_OPEN_DRAIN)>; /* PD15/GPIO45 */
diff --git a/arch/riscv/boot/dts/thead/th1520.dtsi b/arch/riscv/boot/dts/thead/th1520.dtsi
index 42724bf7e90e..03f1d7319049 100644
--- a/arch/riscv/boot/dts/thead/th1520.dtsi
+++ b/arch/riscv/boot/dts/thead/th1520.dtsi
@@ -297,8 +297,9 @@
reg-names = "dwmac", "apb";
interrupts = <67 IRQ_TYPE_LEVEL_HIGH>;
interrupt-names = "macirq";
- clocks = <&clk CLK_GMAC_AXI>, <&clk CLK_GMAC1>;
- clock-names = "stmmaceth", "pclk";
+ clocks = <&clk CLK_GMAC_AXI>, <&clk CLK_GMAC1>,
+ <&clk CLK_PERISYS_APB4_HCLK>;
+ clock-names = "stmmaceth", "pclk", "apb";
snps,pbl = <32>;
snps,fixed-burst;
snps,multicast-filter-bins = <64>;
@@ -319,8 +320,9 @@
reg-names = "dwmac", "apb";
interrupts = <66 IRQ_TYPE_LEVEL_HIGH>;
interrupt-names = "macirq";
- clocks = <&clk CLK_GMAC_AXI>, <&clk CLK_GMAC0>;
- clock-names = "stmmaceth", "pclk";
+ clocks = <&clk CLK_GMAC_AXI>, <&clk CLK_GMAC0>,
+ <&clk CLK_PERISYS_APB4_HCLK>;
+ clock-names = "stmmaceth", "pclk", "apb";
snps,pbl = <32>;
snps,fixed-burst;
snps,multicast-filter-bins = <64>;
diff --git a/arch/riscv/errata/Makefile b/arch/riscv/errata/Makefile
index bc6c77ba837d..02a7a3335b1d 100644
--- a/arch/riscv/errata/Makefile
+++ b/arch/riscv/errata/Makefile
@@ -13,5 +13,6 @@ endif
endif
obj-$(CONFIG_ERRATA_ANDES) += andes/
+obj-$(CONFIG_ERRATA_MIPS) += mips/
obj-$(CONFIG_ERRATA_SIFIVE) += sifive/
obj-$(CONFIG_ERRATA_THEAD) += thead/
diff --git a/arch/riscv/errata/mips/Makefile b/arch/riscv/errata/mips/Makefile
new file mode 100644
index 000000000000..6278c389b801
--- /dev/null
+++ b/arch/riscv/errata/mips/Makefile
@@ -0,0 +1,5 @@
+ifdef CONFIG_RISCV_ALTERNATIVE_EARLY
+CFLAGS_errata.o := -mcmodel=medany
+endif
+
+obj-y += errata.o
diff --git a/arch/riscv/errata/mips/errata.c b/arch/riscv/errata/mips/errata.c
new file mode 100644
index 000000000000..e984a8152208
--- /dev/null
+++ b/arch/riscv/errata/mips/errata.c
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 MIPS.
+ */
+
+#include <linux/memory.h>
+#include <linux/module.h>
+#include <asm/text-patching.h>
+#include <asm/alternative.h>
+#include <asm/errata_list.h>
+#include <asm/vendorid_list.h>
+#include <asm/vendor_extensions.h>
+#include <asm/vendor_extensions/mips.h>
+
+static inline bool errata_probe_pause(void)
+{
+ if (!IS_ENABLED(CONFIG_ERRATA_MIPS_P8700_PAUSE_OPCODE))
+ return false;
+
+ if (!riscv_isa_vendor_extension_available(MIPS_VENDOR_ID, XMIPSEXECTL))
+ return false;
+
+ return true;
+}
+
+static u32 mips_errata_probe(void)
+{
+ u32 cpu_req_errata = 0;
+
+ if (errata_probe_pause())
+ cpu_req_errata |= BIT(ERRATA_MIPS_P8700_PAUSE_OPCODE);
+
+ return cpu_req_errata;
+}
+
+void mips_errata_patch_func(struct alt_entry *begin, struct alt_entry *end,
+ unsigned long archid, unsigned long impid,
+ unsigned int stage)
+{
+ struct alt_entry *alt;
+ u32 cpu_req_errata = mips_errata_probe();
+ u32 tmp;
+
+ BUILD_BUG_ON(ERRATA_MIPS_NUMBER >= RISCV_VENDOR_EXT_ALTERNATIVES_BASE);
+
+ if (stage == RISCV_ALTERNATIVES_EARLY_BOOT)
+ return;
+
+ for (alt = begin; alt < end; alt++) {
+ if (alt->vendor_id != MIPS_VENDOR_ID)
+ continue;
+
+ if (alt->patch_id >= ERRATA_MIPS_NUMBER) {
+ WARN(1, "MIPS errata id:%d not in kernel errata list\n",
+ alt->patch_id);
+ continue;
+ }
+
+ tmp = (1U << alt->patch_id);
+ if (cpu_req_errata && tmp) {
+ mutex_lock(&text_mutex);
+ patch_text_nosync(ALT_OLD_PTR(alt), ALT_ALT_PTR(alt),
+ alt->alt_len);
+ mutex_unlock(&text_mutex);
+ }
+ }
+}
diff --git a/arch/riscv/include/asm/alternative-macros.h b/arch/riscv/include/asm/alternative-macros.h
index 231d777d936c..9619bd5c8eba 100644
--- a/arch/riscv/include/asm/alternative-macros.h
+++ b/arch/riscv/include/asm/alternative-macros.h
@@ -4,7 +4,7 @@
#ifdef CONFIG_RISCV_ALTERNATIVE
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
.macro ALT_ENTRY oldptr newptr vendor_id patch_id new_len
.4byte \oldptr - .
@@ -53,7 +53,7 @@
#define __ALTERNATIVE_CFG(...) ALTERNATIVE_CFG __VA_ARGS__
#define __ALTERNATIVE_CFG_2(...) ALTERNATIVE_CFG_2 __VA_ARGS__
-#else /* !__ASSEMBLY__ */
+#else /* !__ASSEMBLER__ */
#include <asm/asm.h>
#include <linux/stringify.h>
@@ -98,7 +98,7 @@
__ALTERNATIVE_CFG(old_c, new_c_1, vendor_id_1, patch_id_1, enable_1) \
ALT_NEW_CONTENT(vendor_id_2, patch_id_2, enable_2, new_c_2)
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#define _ALTERNATIVE_CFG(old_c, new_c, vendor_id, patch_id, CONFIG_k) \
__ALTERNATIVE_CFG(old_c, new_c, vendor_id, patch_id, IS_ENABLED(CONFIG_k))
@@ -109,7 +109,7 @@
new_c_2, vendor_id_2, patch_id_2, IS_ENABLED(CONFIG_k_2))
#else /* CONFIG_RISCV_ALTERNATIVE */
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
.macro ALTERNATIVE_CFG old_c
\old_c
@@ -118,12 +118,12 @@
#define __ALTERNATIVE_CFG(old_c, ...) ALTERNATIVE_CFG old_c
#define __ALTERNATIVE_CFG_2(old_c, ...) ALTERNATIVE_CFG old_c
-#else /* !__ASSEMBLY__ */
+#else /* !__ASSEMBLER__ */
#define __ALTERNATIVE_CFG(old_c, ...) old_c "\n"
#define __ALTERNATIVE_CFG_2(old_c, ...) old_c "\n"
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#define _ALTERNATIVE_CFG(old_c, ...) __ALTERNATIVE_CFG(old_c)
#define _ALTERNATIVE_CFG_2(old_c, ...) __ALTERNATIVE_CFG_2(old_c)
diff --git a/arch/riscv/include/asm/alternative.h b/arch/riscv/include/asm/alternative.h
index 3c2b59b25017..8407d1d535b8 100644
--- a/arch/riscv/include/asm/alternative.h
+++ b/arch/riscv/include/asm/alternative.h
@@ -8,7 +8,7 @@
#include <asm/alternative-macros.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef CONFIG_RISCV_ALTERNATIVE
@@ -48,6 +48,9 @@ struct alt_entry {
void andes_errata_patch_func(struct alt_entry *begin, struct alt_entry *end,
unsigned long archid, unsigned long impid,
unsigned int stage);
+void mips_errata_patch_func(struct alt_entry *begin, struct alt_entry *end,
+ unsigned long archid, unsigned long impid,
+ unsigned int stage);
void sifive_errata_patch_func(struct alt_entry *begin, struct alt_entry *end,
unsigned long archid, unsigned long impid,
unsigned int stage);
diff --git a/arch/riscv/include/asm/asm-extable.h b/arch/riscv/include/asm/asm-extable.h
index 0c8bfd54fc4e..37d425d7a762 100644
--- a/arch/riscv/include/asm/asm-extable.h
+++ b/arch/riscv/include/asm/asm-extable.h
@@ -10,7 +10,7 @@
#ifdef CONFIG_MMU
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#define __ASM_EXTABLE_RAW(insn, fixup, type, data) \
.pushsection __ex_table, "a"; \
@@ -25,7 +25,7 @@
__ASM_EXTABLE_RAW(\insn, \fixup, EX_TYPE_FIXUP, 0)
.endm
-#else /* __ASSEMBLY__ */
+#else /* __ASSEMBLER__ */
#include <linux/bits.h>
#include <linux/stringify.h>
@@ -77,7 +77,7 @@
EX_DATA_REG(ADDR, addr) \
")")
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#else /* CONFIG_MMU */
#define _ASM_EXTABLE_UACCESS_ERR(insn, fixup, err)
diff --git a/arch/riscv/include/asm/asm.h b/arch/riscv/include/asm/asm.h
index a8a2af6dfe9d..8bd2a11382a3 100644
--- a/arch/riscv/include/asm/asm.h
+++ b/arch/riscv/include/asm/asm.h
@@ -6,7 +6,7 @@
#ifndef _ASM_RISCV_ASM_H
#define _ASM_RISCV_ASM_H
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#define __ASM_STR(x) x
#else
#define __ASM_STR(x) #x
@@ -30,7 +30,7 @@
#define SRLI __REG_SEL(srliw, srli)
#if __SIZEOF_POINTER__ == 8
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#define RISCV_PTR .dword
#define RISCV_SZPTR 8
#define RISCV_LGPTR 3
@@ -40,7 +40,7 @@
#define RISCV_LGPTR "3"
#endif
#elif __SIZEOF_POINTER__ == 4
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#define RISCV_PTR .word
#define RISCV_SZPTR 4
#define RISCV_LGPTR 2
@@ -69,7 +69,7 @@
#error "Unexpected __SIZEOF_SHORT__"
#endif
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#include <asm/asm-offsets.h>
/* Common assembly source macros */
@@ -91,7 +91,7 @@
#endif
.macro asm_per_cpu dst sym tmp
- REG_L \tmp, TASK_TI_CPU_NUM(tp)
+ lw \tmp, TASK_TI_CPU_NUM(tp)
slli \tmp, \tmp, PER_CPU_OFFSET_SHIFT
la \dst, __per_cpu_offset
add \dst, \dst, \tmp
@@ -194,6 +194,6 @@
#define ASM_NOKPROBE(name)
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_RISCV_ASM_H */
diff --git a/arch/riscv/include/asm/assembler.h b/arch/riscv/include/asm/assembler.h
index 44b1457d3e95..16931712beab 100644
--- a/arch/riscv/include/asm/assembler.h
+++ b/arch/riscv/include/asm/assembler.h
@@ -5,7 +5,7 @@
* Author: Jee Heng Sia <jeeheng.sia@starfivetech.com>
*/
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#error "Only include this from assembly code"
#endif
diff --git a/arch/riscv/include/asm/barrier.h b/arch/riscv/include/asm/barrier.h
index b8c5726d86ac..700ba3f922cb 100644
--- a/arch/riscv/include/asm/barrier.h
+++ b/arch/riscv/include/asm/barrier.h
@@ -10,7 +10,7 @@
#ifndef _ASM_RISCV_BARRIER_H
#define _ASM_RISCV_BARRIER_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/cmpxchg.h>
#include <asm/fence.h>
@@ -82,6 +82,6 @@ do { \
#include <asm-generic/barrier.h>
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_RISCV_BARRIER_H */
diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h
index d59310f74c2b..77880677b06e 100644
--- a/arch/riscv/include/asm/bitops.h
+++ b/arch/riscv/include/asm/bitops.h
@@ -45,7 +45,7 @@
#error "Unexpected BITS_PER_LONG"
#endif
-static __always_inline unsigned long variable__ffs(unsigned long word)
+static __always_inline __attribute_const__ unsigned long variable__ffs(unsigned long word)
{
asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
RISCV_ISA_EXT_ZBB, 1)
@@ -74,7 +74,7 @@ legacy:
(unsigned long)__builtin_ctzl(word) : \
variable__ffs(word))
-static __always_inline unsigned long variable__fls(unsigned long word)
+static __always_inline __attribute_const__ unsigned long variable__fls(unsigned long word)
{
asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
RISCV_ISA_EXT_ZBB, 1)
@@ -103,7 +103,7 @@ legacy:
(unsigned long)(BITS_PER_LONG - 1 - __builtin_clzl(word)) : \
variable__fls(word))
-static __always_inline int variable_ffs(int x)
+static __always_inline __attribute_const__ int variable_ffs(int x)
{
asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
RISCV_ISA_EXT_ZBB, 1)
diff --git a/arch/riscv/include/asm/cache.h b/arch/riscv/include/asm/cache.h
index 570e9d8acad1..eb42b739d132 100644
--- a/arch/riscv/include/asm/cache.h
+++ b/arch/riscv/include/asm/cache.h
@@ -24,7 +24,7 @@
#define ARCH_SLAB_MINALIGN 16
#endif
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
extern int dma_cache_alignment;
#ifdef CONFIG_RISCV_DMA_NONCOHERENT
@@ -35,6 +35,6 @@ static inline int dma_get_cache_alignment(void)
}
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_RISCV_CACHE_H */
diff --git a/arch/riscv/include/asm/cfi.h b/arch/riscv/include/asm/cfi.h
index 4508aaa7a2fd..710aa8192edd 100644
--- a/arch/riscv/include/asm/cfi.h
+++ b/arch/riscv/include/asm/cfi.h
@@ -11,7 +11,7 @@
struct pt_regs;
-#ifdef CONFIG_CFI_CLANG
+#ifdef CONFIG_CFI
enum bug_trap_type handle_cfi_failure(struct pt_regs *regs);
#define __bpfcall
#else
@@ -19,6 +19,6 @@ static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
{
return BUG_TRAP_TYPE_NONE;
}
-#endif /* CONFIG_CFI_CLANG */
+#endif /* CONFIG_CFI */
#endif /* _ASM_RISCV_CFI_H */
diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
index 0b749e710216..80bd52363c68 100644
--- a/arch/riscv/include/asm/cmpxchg.h
+++ b/arch/riscv/include/asm/cmpxchg.h
@@ -14,6 +14,7 @@
#include <asm/insn-def.h>
#include <asm/cpufeature-macros.h>
#include <asm/processor.h>
+#include <asm/errata_list.h>
#define __arch_xchg_masked(sc_sfx, swap_sfx, prepend, sc_append, \
swap_append, r, p, n) \
@@ -438,7 +439,7 @@ static __always_inline void __cmpwait(volatile void *ptr,
return;
no_zawrs:
- asm volatile(RISCV_PAUSE : : : "memory");
+ ALT_RISCV_PAUSE();
}
#define __cmpwait_relaxed(ptr, val) \
diff --git a/arch/riscv/include/asm/cpu_ops_sbi.h b/arch/riscv/include/asm/cpu_ops_sbi.h
index d6e4665b3195..776fa55fbaa4 100644
--- a/arch/riscv/include/asm/cpu_ops_sbi.h
+++ b/arch/riscv/include/asm/cpu_ops_sbi.h
@@ -5,7 +5,7 @@
#ifndef __ASM_CPU_OPS_SBI_H
#define __ASM_CPU_OPS_SBI_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/threads.h>
diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h
index 6fed42e37705..4a37a98398ad 100644
--- a/arch/riscv/include/asm/csr.h
+++ b/arch/riscv/include/asm/csr.h
@@ -513,7 +513,7 @@
#define IE_TIE (_AC(0x1, UL) << RV_IRQ_TIMER)
#define IE_EIE (_AC(0x1, UL) << RV_IRQ_EXT)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define csr_swap(csr, val) \
({ \
@@ -575,6 +575,6 @@
: "memory"); \
})
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_RISCV_CSR_H */
diff --git a/arch/riscv/include/asm/current.h b/arch/riscv/include/asm/current.h
index 21774d868c65..ba5aa72aff63 100644
--- a/arch/riscv/include/asm/current.h
+++ b/arch/riscv/include/asm/current.h
@@ -13,7 +13,7 @@
#include <linux/bug.h>
#include <linux/compiler.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct task_struct;
@@ -35,6 +35,6 @@ static __always_inline struct task_struct *get_current(void)
register unsigned long current_stack_pointer __asm__("sp");
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_RISCV_CURRENT_H */
diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h
index 6e426ed7919a..6694b5ccdcf8 100644
--- a/arch/riscv/include/asm/errata_list.h
+++ b/arch/riscv/include/asm/errata_list.h
@@ -5,31 +5,14 @@
#ifndef ASM_ERRATA_LIST_H
#define ASM_ERRATA_LIST_H
-#include <asm/alternative.h>
#include <asm/csr.h>
#include <asm/insn-def.h>
#include <asm/hwcap.h>
#include <asm/vendorid_list.h>
+#include <asm/errata_list_vendors.h>
+#include <asm/vendor_extensions/mips.h>
-#ifdef CONFIG_ERRATA_ANDES
-#define ERRATA_ANDES_NO_IOCP 0
-#define ERRATA_ANDES_NUMBER 1
-#endif
-
-#ifdef CONFIG_ERRATA_SIFIVE
-#define ERRATA_SIFIVE_CIP_453 0
-#define ERRATA_SIFIVE_CIP_1200 1
-#define ERRATA_SIFIVE_NUMBER 2
-#endif
-
-#ifdef CONFIG_ERRATA_THEAD
-#define ERRATA_THEAD_MAE 0
-#define ERRATA_THEAD_PMU 1
-#define ERRATA_THEAD_GHOSTWRITE 2
-#define ERRATA_THEAD_NUMBER 3
-#endif
-
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#define ALT_INSN_FAULT(x) \
ALTERNATIVE(__stringify(RISCV_PTR do_trap_insn_fault), \
@@ -42,7 +25,7 @@ ALTERNATIVE(__stringify(RISCV_PTR do_page_fault), \
__stringify(RISCV_PTR sifive_cip_453_page_fault_trp), \
SIFIVE_VENDOR_ID, ERRATA_SIFIVE_CIP_453, \
CONFIG_ERRATA_SIFIVE_CIP_453)
-#else /* !__ASSEMBLY__ */
+#else /* !__ASSEMBLER__ */
#define ALT_SFENCE_VMA_ASID(asid) \
asm(ALTERNATIVE("sfence.vma x0, %0", "sfence.vma", SIFIVE_VENDOR_ID, \
@@ -59,6 +42,17 @@ asm(ALTERNATIVE("sfence.vma %0, %1", "sfence.vma", SIFIVE_VENDOR_ID, \
ERRATA_SIFIVE_CIP_1200, CONFIG_ERRATA_SIFIVE_CIP_1200) \
: : "r" (addr), "r" (asid) : "memory")
+#define ALT_RISCV_PAUSE() \
+asm(ALTERNATIVE( \
+ RISCV_PAUSE, /* Original RISC‑V pause insn */ \
+ MIPS_PAUSE, /* Replacement for MIPS P8700 */ \
+ MIPS_VENDOR_ID, /* Vendor ID to match */ \
+ ERRATA_MIPS_P8700_PAUSE_OPCODE, /* patch_id */ \
+ CONFIG_ERRATA_MIPS_P8700_PAUSE_OPCODE) \
+ : /* no outputs */ \
+ : /* no inputs */ \
+ : "memory")
+
/*
* _val is marked as "will be overwritten", so need to set it to 0
* in the default case.
@@ -123,6 +117,6 @@ asm volatile(ALTERNATIVE( \
#define THEAD_C9XX_RV_IRQ_PMU 17
#define THEAD_C9XX_CSR_SCOUNTEROF 0x5c5
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif
diff --git a/arch/riscv/include/asm/errata_list_vendors.h b/arch/riscv/include/asm/errata_list_vendors.h
new file mode 100644
index 000000000000..ec7eba373437
--- /dev/null
+++ b/arch/riscv/include/asm/errata_list_vendors.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef ASM_ERRATA_LIST_VENDORS_H
+#define ASM_ERRATA_LIST_VENDORS_H
+
+#ifdef CONFIG_ERRATA_ANDES
+#define ERRATA_ANDES_NO_IOCP 0
+#define ERRATA_ANDES_NUMBER 1
+#endif
+
+#ifdef CONFIG_ERRATA_SIFIVE
+#define ERRATA_SIFIVE_CIP_453 0
+#define ERRATA_SIFIVE_CIP_1200 1
+#define ERRATA_SIFIVE_NUMBER 2
+#endif
+
+#ifdef CONFIG_ERRATA_THEAD
+#define ERRATA_THEAD_MAE 0
+#define ERRATA_THEAD_PMU 1
+#define ERRATA_THEAD_GHOSTWRITE 2
+#define ERRATA_THEAD_NUMBER 3
+#endif
+
+#ifdef CONFIG_ERRATA_MIPS
+#define ERRATA_MIPS_P8700_PAUSE_OPCODE 0
+#define ERRATA_MIPS_NUMBER 1
+#endif
+
+#endif /* ASM_ERRATA_LIST_VENDORS_H */
diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h
index 22ebea3c2b26..e5026cd8f022 100644
--- a/arch/riscv/include/asm/ftrace.h
+++ b/arch/riscv/include/asm/ftrace.h
@@ -13,7 +13,7 @@
#endif
#define ARCH_SUPPORTS_FTRACE_OPS 1
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
extern void *return_address(unsigned int level);
@@ -112,7 +112,7 @@ do { \
#define MCOUNT_JALR_SIZE 4
#define MCOUNT_NOP4_SIZE 4
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct dyn_ftrace;
int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec);
#define ftrace_init_nop ftrace_init_nop
@@ -235,7 +235,7 @@ static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, unsi
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* CONFIG_DYNAMIC_FTRACE */
diff --git a/arch/riscv/include/asm/gpr-num.h b/arch/riscv/include/asm/gpr-num.h
index efeb5edf8a3a..b499cf832734 100644
--- a/arch/riscv/include/asm/gpr-num.h
+++ b/arch/riscv/include/asm/gpr-num.h
@@ -2,7 +2,7 @@
#ifndef __ASM_GPR_NUM_H
#define __ASM_GPR_NUM_H
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
.irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
.equ .L__gpr_num_x\num, \num
@@ -41,7 +41,7 @@
.equ .L__gpr_num_t5, 30
.equ .L__gpr_num_t6, 31
-#else /* __ASSEMBLY__ */
+#else /* __ASSEMBLER__ */
#define __DEFINE_ASM_GPR_NUMS \
" .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n" \
@@ -80,6 +80,6 @@
" .equ .L__gpr_num_t5, 30\n" \
" .equ .L__gpr_num_t6, 31\n"
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __ASM_GPR_NUM_H */
diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h
index 7fe0a379474a..948d2b34e94e 100644
--- a/arch/riscv/include/asm/hwprobe.h
+++ b/arch/riscv/include/asm/hwprobe.h
@@ -8,7 +8,7 @@
#include <uapi/asm/hwprobe.h>
-#define RISCV_HWPROBE_MAX_KEY 13
+#define RISCV_HWPROBE_MAX_KEY 14
static inline bool riscv_hwprobe_key_is_valid(__s64 key)
{
@@ -22,6 +22,7 @@ static inline bool hwprobe_key_is_bitmask(__s64 key)
case RISCV_HWPROBE_KEY_IMA_EXT_0:
case RISCV_HWPROBE_KEY_CPUPERF_0:
case RISCV_HWPROBE_KEY_VENDOR_EXT_THEAD_0:
+ case RISCV_HWPROBE_KEY_VENDOR_EXT_MIPS_0:
case RISCV_HWPROBE_KEY_VENDOR_EXT_SIFIVE_0:
return true;
}
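
The new vendor key is a bitmask key like the existing T-Head and SiFive ones, so userspace consumes it through the ordinary hwprobe syscall. A minimal caller sketch (not part of the patch; assumes recent installed uapi headers and tests bit 0, which this series names RISCV_HWPROBE_VENDOR_EXT_XMIPSEXECTL):

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/hwprobe.h>	/* struct riscv_hwprobe and key constants */

int main(void)
{
	struct riscv_hwprobe pair = {
		.key = RISCV_HWPROBE_KEY_VENDOR_EXT_MIPS_0,
	};

	/* cpusetsize = 0, cpus = NULL: probe behaviour common to all CPUs. */
	if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0))
		return 1;

	/* The kernel rewrites unknown keys to -1. */
	if (pair.key >= 0 && (pair.value & 1))	/* RISCV_HWPROBE_VENDOR_EXT_XMIPSEXECTL */
		printf("xmipsexectl supported on all probed harts\n");

	return 0;
}
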
diff --git a/arch/riscv/include/asm/image.h b/arch/riscv/include/asm/image.h
index 8927a6ea1127..899254966e85 100644
--- a/arch/riscv/include/asm/image.h
+++ b/arch/riscv/include/asm/image.h
@@ -29,7 +29,7 @@
#define RISCV_HEADER_VERSION (RISCV_HEADER_VERSION_MAJOR << 16 | \
RISCV_HEADER_VERSION_MINOR)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define riscv_image_flag_field(flags, field)\
(((flags) >> field##_SHIFT) & field##_MASK)
/**
@@ -63,5 +63,5 @@ struct riscv_image_header {
u32 magic2;
u32 res3;
};
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_RISCV_IMAGE_H */
diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
index d5adbaec1d01..c9cfcea52cbb 100644
--- a/arch/riscv/include/asm/insn-def.h
+++ b/arch/riscv/include/asm/insn-def.h
@@ -25,7 +25,7 @@
#define INSN_S_SIMM5_SHIFT 7
#define INSN_S_OPCODE_SHIFT 0
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#ifdef CONFIG_AS_HAS_INSN
@@ -77,7 +77,7 @@
#define __INSN_I(...) insn_i __VA_ARGS__
#define __INSN_S(...) insn_s __VA_ARGS__
-#else /* ! __ASSEMBLY__ */
+#else /* ! __ASSEMBLER__ */
#ifdef CONFIG_AS_HAS_INSN
@@ -153,7 +153,7 @@
#endif
-#endif /* ! __ASSEMBLY__ */
+#endif /* ! __ASSEMBLER__ */
#define INSN_R(opcode, func3, func7, rd, rs1, rs2) \
__INSN_R(RV_##opcode, RV_##func3, RV_##func7, \
@@ -263,7 +263,7 @@
#define RISCV_INSN_NOP4 _AC(0x00000013, U)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define nop() __asm__ __volatile__ ("nop")
#define __nops(n) ".rept " #n "\nnop\n.endr\n"
#define nops(n) __asm__ __volatile__ (__nops(n))
diff --git a/arch/riscv/include/asm/insn.h b/arch/riscv/include/asm/insn.h
index 09fde95a5e8f..c3005573e8c9 100644
--- a/arch/riscv/include/asm/insn.h
+++ b/arch/riscv/include/asm/insn.h
@@ -64,6 +64,7 @@
#define RVG_RS2_OPOFF 20
#define RVG_RD_OPOFF 7
#define RVG_RS1_MASK GENMASK(4, 0)
+#define RVG_RS2_MASK GENMASK(4, 0)
#define RVG_RD_MASK GENMASK(4, 0)
/* The bit field of immediate value in RVC J instruction */
@@ -286,45 +287,216 @@ static __always_inline bool riscv_insn_is_c_jalr(u32 code)
(code & RVC_INSN_J_RS1_MASK) != 0;
}
-#define RV_IMM_SIGN(x) (-(((x) >> 31) & 1))
-#define RVC_IMM_SIGN(x) (-(((x) >> 12) & 1))
-#define RV_X(X, s, mask) (((X) >> (s)) & (mask))
-#define RVC_X(X, s, mask) RV_X(X, s, mask)
+#define INSN_MATCH_LB 0x3
+#define INSN_MASK_LB 0x707f
+#define INSN_MATCH_LH 0x1003
+#define INSN_MASK_LH 0x707f
+#define INSN_MATCH_LW 0x2003
+#define INSN_MASK_LW 0x707f
+#define INSN_MATCH_LD 0x3003
+#define INSN_MASK_LD 0x707f
+#define INSN_MATCH_LBU 0x4003
+#define INSN_MASK_LBU 0x707f
+#define INSN_MATCH_LHU 0x5003
+#define INSN_MASK_LHU 0x707f
+#define INSN_MATCH_LWU 0x6003
+#define INSN_MASK_LWU 0x707f
+#define INSN_MATCH_SB 0x23
+#define INSN_MASK_SB 0x707f
+#define INSN_MATCH_SH 0x1023
+#define INSN_MASK_SH 0x707f
+#define INSN_MATCH_SW 0x2023
+#define INSN_MASK_SW 0x707f
+#define INSN_MATCH_SD 0x3023
+#define INSN_MASK_SD 0x707f
+
+#define INSN_MATCH_C_LD 0x6000
+#define INSN_MASK_C_LD 0xe003
+#define INSN_MATCH_C_SD 0xe000
+#define INSN_MASK_C_SD 0xe003
+#define INSN_MATCH_C_LW 0x4000
+#define INSN_MASK_C_LW 0xe003
+#define INSN_MATCH_C_SW 0xc000
+#define INSN_MASK_C_SW 0xe003
+#define INSN_MATCH_C_LDSP 0x6002
+#define INSN_MASK_C_LDSP 0xe003
+#define INSN_MATCH_C_SDSP 0xe002
+#define INSN_MASK_C_SDSP 0xe003
+#define INSN_MATCH_C_LWSP 0x4002
+#define INSN_MASK_C_LWSP 0xe003
+#define INSN_MATCH_C_SWSP 0xc002
+#define INSN_MASK_C_SWSP 0xe003
+
+#define INSN_OPCODE_MASK 0x007c
+#define INSN_OPCODE_SHIFT 2
+#define INSN_OPCODE_SYSTEM 28
+
+#define INSN_MASK_WFI 0xffffffff
+#define INSN_MATCH_WFI 0x10500073
+
+#define INSN_MASK_WRS 0xffffffff
+#define INSN_MATCH_WRS 0x00d00073
+
+#define INSN_MATCH_CSRRW 0x1073
+#define INSN_MASK_CSRRW 0x707f
+#define INSN_MATCH_CSRRS 0x2073
+#define INSN_MASK_CSRRS 0x707f
+#define INSN_MATCH_CSRRC 0x3073
+#define INSN_MASK_CSRRC 0x707f
+#define INSN_MATCH_CSRRWI 0x5073
+#define INSN_MASK_CSRRWI 0x707f
+#define INSN_MATCH_CSRRSI 0x6073
+#define INSN_MASK_CSRRSI 0x707f
+#define INSN_MATCH_CSRRCI 0x7073
+#define INSN_MASK_CSRRCI 0x707f
+
+#define INSN_MATCH_FLW 0x2007
+#define INSN_MASK_FLW 0x707f
+#define INSN_MATCH_FLD 0x3007
+#define INSN_MASK_FLD 0x707f
+#define INSN_MATCH_FLQ 0x4007
+#define INSN_MASK_FLQ 0x707f
+#define INSN_MATCH_FSW 0x2027
+#define INSN_MASK_FSW 0x707f
+#define INSN_MATCH_FSD 0x3027
+#define INSN_MASK_FSD 0x707f
+#define INSN_MATCH_FSQ 0x4027
+#define INSN_MASK_FSQ 0x707f
+
+#define INSN_MATCH_C_FLD 0x2000
+#define INSN_MASK_C_FLD 0xe003
+#define INSN_MATCH_C_FLW 0x6000
+#define INSN_MASK_C_FLW 0xe003
+#define INSN_MATCH_C_FSD 0xa000
+#define INSN_MASK_C_FSD 0xe003
+#define INSN_MATCH_C_FSW 0xe000
+#define INSN_MASK_C_FSW 0xe003
+#define INSN_MATCH_C_FLDSP 0x2002
+#define INSN_MASK_C_FLDSP 0xe003
+#define INSN_MATCH_C_FSDSP 0xa002
+#define INSN_MASK_C_FSDSP 0xe003
+#define INSN_MATCH_C_FLWSP 0x6002
+#define INSN_MASK_C_FLWSP 0xe003
+#define INSN_MATCH_C_FSWSP 0xe002
+#define INSN_MASK_C_FSWSP 0xe003
+
+#define INSN_MATCH_C_LHU 0x8400
+#define INSN_MASK_C_LHU 0xfc43
+#define INSN_MATCH_C_LH 0x8440
+#define INSN_MASK_C_LH 0xfc43
+#define INSN_MATCH_C_SH 0x8c00
+#define INSN_MASK_C_SH 0xfc43
+
+#define INSN_16BIT_MASK 0x3
+#define INSN_IS_16BIT(insn) (((insn) & INSN_16BIT_MASK) != INSN_16BIT_MASK)
+#define INSN_LEN(insn) (INSN_IS_16BIT(insn) ? 2 : 4)
+
+#define SHIFT_RIGHT(x, y) \
+ ((y) < 0 ? ((x) << -(y)) : ((x) >> (y)))
+
+#define REG_MASK \
+ ((1 << (5 + LOG_REGBYTES)) - (1 << LOG_REGBYTES))
+
+#define REG_OFFSET(insn, pos) \
+ (SHIFT_RIGHT((insn), (pos) - LOG_REGBYTES) & REG_MASK)
+
+#define REG_PTR(insn, pos, regs) \
+ ((ulong *)((ulong)(regs) + REG_OFFSET(insn, pos)))
+
+#define GET_RS1(insn, regs) (*REG_PTR(insn, SH_RS1, regs))
+#define GET_RS2(insn, regs) (*REG_PTR(insn, SH_RS2, regs))
+#define GET_RS1S(insn, regs) (*REG_PTR(RVC_RS1S(insn), 0, regs))
+#define GET_RS2S(insn, regs) (*REG_PTR(RVC_RS2S(insn), 0, regs))
+#define GET_RS2C(insn, regs) (*REG_PTR(insn, SH_RS2C, regs))
+#define GET_SP(regs) (*REG_PTR(2, 0, regs))
+#define SET_RD(insn, regs, val) (*REG_PTR(insn, SH_RD, regs) = (val))
+#define IMM_I(insn) ((s32)(insn) >> 20)
+#define IMM_S(insn) (((s32)(insn) >> 25 << 5) | \
+ (s32)(((insn) >> 7) & 0x1f))
+
+#define SH_RD 7
+#define SH_RS1 15
+#define SH_RS2 20
+#define SH_RS2C 2
+#define MASK_RX 0x1f
+
+#if defined(CONFIG_64BIT)
+#define LOG_REGBYTES 3
+#else
+#define LOG_REGBYTES 2
+#endif
+
+#define MASK_FUNCT3 0x7000
+
+#define GET_FUNCT3(insn) (((insn) >> 12) & 7)
+
+#define RV_IMM_SIGN(x) (-(((x) >> 31) & 1))
+#define RVC_IMM_SIGN(x) (-(((x) >> 12) & 1))
+#define RV_X_MASK(X, s, mask) (((X) >> (s)) & (mask))
+#define RV_X(X, s, n) RV_X_MASK(X, s, ((1 << (n)) - 1))
+#define RVC_LW_IMM(x) ((RV_X(x, 6, 1) << 2) | \
+ (RV_X(x, 10, 3) << 3) | \
+ (RV_X(x, 5, 1) << 6))
+#define RVC_LD_IMM(x) ((RV_X(x, 10, 3) << 3) | \
+ (RV_X(x, 5, 2) << 6))
+#define RVC_LWSP_IMM(x) ((RV_X(x, 4, 3) << 2) | \
+ (RV_X(x, 12, 1) << 5) | \
+ (RV_X(x, 2, 2) << 6))
+#define RVC_LDSP_IMM(x) ((RV_X(x, 5, 2) << 3) | \
+ (RV_X(x, 12, 1) << 5) | \
+ (RV_X(x, 2, 3) << 6))
+#define RVC_SWSP_IMM(x) ((RV_X(x, 9, 4) << 2) | \
+ (RV_X(x, 7, 2) << 6))
+#define RVC_SDSP_IMM(x) ((RV_X(x, 10, 3) << 3) | \
+ (RV_X(x, 7, 3) << 6))
+#define RVC_RS1S(insn) (8 + RV_X(insn, SH_RD, 3))
+#define RVC_RS2S(insn) (8 + RV_X(insn, SH_RS2C, 3))
+#define RVC_RS2(insn) RV_X(insn, SH_RS2C, 5)
+#define RVC_X(X, s, mask) RV_X_MASK(X, s, mask)
+
+#define RV_EXTRACT_FUNCT3(x) \
+ ({typeof(x) x_ = (x); \
+ (RV_X_MASK(x_, RV_INSN_FUNCT3_OPOFF, \
+ RV_INSN_FUNCT3_MASK >> RV_INSN_FUNCT3_OPOFF)); })
#define RV_EXTRACT_RS1_REG(x) \
({typeof(x) x_ = (x); \
- (RV_X(x_, RVG_RS1_OPOFF, RVG_RS1_MASK)); })
+ (RV_X_MASK(x_, RVG_RS1_OPOFF, RVG_RS1_MASK)); })
+
+#define RV_EXTRACT_RS2_REG(x) \
+ ({typeof(x) x_ = (x); \
+ (RV_X_MASK(x_, RVG_RS2_OPOFF, RVG_RS2_MASK)); })
#define RV_EXTRACT_RD_REG(x) \
({typeof(x) x_ = (x); \
- (RV_X(x_, RVG_RD_OPOFF, RVG_RD_MASK)); })
+ (RV_X_MASK(x_, RVG_RD_OPOFF, RVG_RD_MASK)); })
#define RV_EXTRACT_UTYPE_IMM(x) \
({typeof(x) x_ = (x); \
- (RV_X(x_, RV_U_IMM_31_12_OPOFF, RV_U_IMM_31_12_MASK)); })
+ (RV_X_MASK(x_, RV_U_IMM_31_12_OPOFF, RV_U_IMM_31_12_MASK)); })
#define RV_EXTRACT_JTYPE_IMM(x) \
({typeof(x) x_ = (x); \
- (RV_X(x_, RV_J_IMM_10_1_OPOFF, RV_J_IMM_10_1_MASK) << RV_J_IMM_10_1_OFF) | \
- (RV_X(x_, RV_J_IMM_11_OPOFF, RV_J_IMM_11_MASK) << RV_J_IMM_11_OFF) | \
- (RV_X(x_, RV_J_IMM_19_12_OPOFF, RV_J_IMM_19_12_MASK) << RV_J_IMM_19_12_OFF) | \
+ (RV_X_MASK(x_, RV_J_IMM_10_1_OPOFF, RV_J_IMM_10_1_MASK) << RV_J_IMM_10_1_OFF) | \
+ (RV_X_MASK(x_, RV_J_IMM_11_OPOFF, RV_J_IMM_11_MASK) << RV_J_IMM_11_OFF) | \
+ (RV_X_MASK(x_, RV_J_IMM_19_12_OPOFF, RV_J_IMM_19_12_MASK) << RV_J_IMM_19_12_OFF) | \
(RV_IMM_SIGN(x_) << RV_J_IMM_SIGN_OFF); })
#define RV_EXTRACT_ITYPE_IMM(x) \
({typeof(x) x_ = (x); \
- (RV_X(x_, RV_I_IMM_11_0_OPOFF, RV_I_IMM_11_0_MASK)) | \
+ (RV_X_MASK(x_, RV_I_IMM_11_0_OPOFF, RV_I_IMM_11_0_MASK)) | \
(RV_IMM_SIGN(x_) << RV_I_IMM_SIGN_OFF); })
#define RV_EXTRACT_BTYPE_IMM(x) \
({typeof(x) x_ = (x); \
- (RV_X(x_, RV_B_IMM_4_1_OPOFF, RV_B_IMM_4_1_MASK) << RV_B_IMM_4_1_OFF) | \
- (RV_X(x_, RV_B_IMM_10_5_OPOFF, RV_B_IMM_10_5_MASK) << RV_B_IMM_10_5_OFF) | \
- (RV_X(x_, RV_B_IMM_11_OPOFF, RV_B_IMM_11_MASK) << RV_B_IMM_11_OFF) | \
+ (RV_X_MASK(x_, RV_B_IMM_4_1_OPOFF, RV_B_IMM_4_1_MASK) << RV_B_IMM_4_1_OFF) | \
+ (RV_X_MASK(x_, RV_B_IMM_10_5_OPOFF, RV_B_IMM_10_5_MASK) << RV_B_IMM_10_5_OFF) | \
+ (RV_X_MASK(x_, RV_B_IMM_11_OPOFF, RV_B_IMM_11_MASK) << RV_B_IMM_11_OFF) | \
(RV_IMM_SIGN(x_) << RV_B_IMM_SIGN_OFF); })
#define RVC_EXTRACT_C2_RS1_REG(x) \
({typeof(x) x_ = (x); \
- (RV_X(x_, RVC_C2_RS1_OPOFF, RVC_C2_RS1_MASK)); })
+ (RV_X_MASK(x_, RVC_C2_RS1_OPOFF, RVC_C2_RS1_MASK)); })
#define RVC_EXTRACT_JTYPE_IMM(x) \
({typeof(x) x_ = (x); \
@@ -346,13 +518,13 @@ static __always_inline bool riscv_insn_is_c_jalr(u32 code)
(RVC_IMM_SIGN(x_) << RVC_B_IMM_SIGN_OFF); })
#define RVG_EXTRACT_SYSTEM_CSR(x) \
- ({typeof(x) x_ = (x); RV_X(x_, RVG_SYSTEM_CSR_OFF, RVG_SYSTEM_CSR_MASK); })
+ ({typeof(x) x_ = (x); RV_X_MASK(x_, RVG_SYSTEM_CSR_OFF, RVG_SYSTEM_CSR_MASK); })
#define RVFDQ_EXTRACT_FL_FS_WIDTH(x) \
- ({typeof(x) x_ = (x); RV_X(x_, RVFDQ_FL_FS_WIDTH_OFF, \
+ ({typeof(x) x_ = (x); RV_X_MASK(x_, RVFDQ_FL_FS_WIDTH_OFF, \
RVFDQ_FL_FS_WIDTH_MASK); })
-#define RVV_EXRACT_VL_VS_WIDTH(x) RVFDQ_EXTRACT_FL_FS_WIDTH(x)
+#define RVV_EXTRACT_VL_VS_WIDTH(x) RVFDQ_EXTRACT_FL_FS_WIDTH(x)
/*
* Get the immediate from a J-type instruction.
@@ -375,10 +547,10 @@ static inline void riscv_insn_insert_jtype_imm(u32 *insn, s32 imm)
{
/* drop the old IMMs, all jal IMM bits sit at 31:12 */
*insn &= ~GENMASK(31, 12);
- *insn |= (RV_X(imm, RV_J_IMM_10_1_OFF, RV_J_IMM_10_1_MASK) << RV_J_IMM_10_1_OPOFF) |
- (RV_X(imm, RV_J_IMM_11_OFF, RV_J_IMM_11_MASK) << RV_J_IMM_11_OPOFF) |
- (RV_X(imm, RV_J_IMM_19_12_OFF, RV_J_IMM_19_12_MASK) << RV_J_IMM_19_12_OPOFF) |
- (RV_X(imm, RV_J_IMM_SIGN_OFF, 1) << RV_J_IMM_SIGN_OPOFF);
+ *insn |= (RV_X_MASK(imm, RV_J_IMM_10_1_OFF, RV_J_IMM_10_1_MASK) << RV_J_IMM_10_1_OPOFF) |
+ (RV_X_MASK(imm, RV_J_IMM_11_OFF, RV_J_IMM_11_MASK) << RV_J_IMM_11_OPOFF) |
+ (RV_X_MASK(imm, RV_J_IMM_19_12_OFF, RV_J_IMM_19_12_MASK) << RV_J_IMM_19_12_OPOFF) |
+ (RV_X_MASK(imm, RV_J_IMM_SIGN_OFF, 1) << RV_J_IMM_SIGN_OPOFF);
}
/*
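
The INSN_MATCH_*/INSN_MASK_* pairs and register/immediate helpers consolidated into this header follow the usual fixed-bits convention: an instruction is of a given kind when (insn & MASK) == MATCH, and GET_RS1()/IMM_I()/RV_EXTRACT_*() then pull out its fields. A small illustrative decoder in that style (hypothetical function, not taken from the patch):

#include <linux/types.h>
#include <asm/insn.h>
#include <asm/ptrace.h>

/*
 * Illustrative only: recognise a scalar LW/LD and compute the effective
 * address it targets, roughly the pattern the misaligned-access and KVM
 * emulation code follows with these macros.
 */
static bool decode_load(u32 insn, struct pt_regs *regs, unsigned long *addr)
{
	if ((insn & INSN_MASK_LW) != INSN_MATCH_LW &&
	    (insn & INSN_MASK_LD) != INSN_MATCH_LD)
		return false;

	/* rs1 holds the base register, IMM_I() the sign-extended offset. */
	*addr = GET_RS1(insn, regs) + (long)IMM_I(insn);
	return true;
}
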
diff --git a/arch/riscv/include/asm/io.h b/arch/riscv/include/asm/io.h
index a0e51840b9db..09bb5f57a9d3 100644
--- a/arch/riscv/include/asm/io.h
+++ b/arch/riscv/include/asm/io.h
@@ -28,6 +28,10 @@
#ifdef CONFIG_MMU
#define IO_SPACE_LIMIT (PCI_IO_SIZE - 1)
#define PCI_IOBASE ((void __iomem *)PCI_IO_START)
+
+#define ioremap_wc(addr, size) \
+ ioremap_prot((addr), (size), __pgprot(_PAGE_KERNEL_NC))
+
#endif /* CONFIG_MMU */
/*
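
With _PAGE_KERNEL_NC in place, ioremap_wc() now maps with the non-cacheable memory type instead of the I/O type that plain ioremap() uses. Consumers keep the usual pattern; a hypothetical driver fragment (illustrative only, made-up resource):

#include <linux/io.h>

/* Hypothetical driver fragment: map a framebuffer aperture write-combined. */
static void __iomem *map_fb_wc(phys_addr_t base, resource_size_t len)
{
	void __iomem *fb = ioremap_wc(base, len);

	if (!fb)
		return NULL;

	/* Stores through 'fb' use the NC attribute; unmap with iounmap(). */
	return fb;
}
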
diff --git a/arch/riscv/include/asm/jump_label.h b/arch/riscv/include/asm/jump_label.h
index 87a71cc6d146..3ab5f2e3212b 100644
--- a/arch/riscv/include/asm/jump_label.h
+++ b/arch/riscv/include/asm/jump_label.h
@@ -7,7 +7,7 @@
#ifndef __ASM_JUMP_LABEL_H
#define __ASM_JUMP_LABEL_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
#include <asm/asm.h>
@@ -66,5 +66,5 @@ label:
return true;
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __ASM_JUMP_LABEL_H */
diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h
index e6a0071bdb56..60af6691f903 100644
--- a/arch/riscv/include/asm/kasan.h
+++ b/arch/riscv/include/asm/kasan.h
@@ -4,7 +4,7 @@
#ifndef __ASM_KASAN_H
#define __ASM_KASAN_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* The following comment was copied from arm64:
diff --git a/arch/riscv/include/asm/kgdb.h b/arch/riscv/include/asm/kgdb.h
index cc11c4544cff..7559d728c5ff 100644
--- a/arch/riscv/include/asm/kgdb.h
+++ b/arch/riscv/include/asm/kgdb.h
@@ -17,12 +17,12 @@
#define BREAK_INSTR_SIZE 4
#endif
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
void arch_kgdb_breakpoint(void);
extern unsigned long kgdb_compiled_break;
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#define DBG_REG_ZERO "zero"
#define DBG_REG_RA "ra"
diff --git a/arch/riscv/include/asm/mmu.h b/arch/riscv/include/asm/mmu.h
index 1cc90465d75b..cf8e6eac77d5 100644
--- a/arch/riscv/include/asm/mmu.h
+++ b/arch/riscv/include/asm/mmu.h
@@ -7,7 +7,7 @@
#ifndef _ASM_RISCV_MMU_H
#define _ASM_RISCV_MMU_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
typedef struct {
#ifndef CONFIG_MMU
@@ -40,6 +40,6 @@ typedef struct {
void __meminit create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa, phys_addr_t sz,
pgprot_t prot);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_RISCV_MMU_H */
diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
index 572a141ddecd..ffe213ad65a4 100644
--- a/arch/riscv/include/asm/page.h
+++ b/arch/riscv/include/asm/page.h
@@ -41,7 +41,7 @@
#define PAGE_OFFSET ((unsigned long)phys_ram_base)
#endif /* CONFIG_MMU */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef CONFIG_RISCV_ISA_ZICBOZ
void clear_page(void *page);
@@ -199,7 +199,7 @@ static __always_inline void *pfn_to_kaddr(unsigned long pfn)
return __va(pfn << PAGE_SHIFT);
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#define virt_addr_valid(vaddr) ({ \
unsigned long _addr = (unsigned long)vaddr; \
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 91697fbf1f90..29e994a9afb6 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -111,7 +111,7 @@
#endif
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/page.h>
#include <asm/tlbflush.h>
@@ -203,6 +203,7 @@ extern struct pt_alloc_ops pt_ops __meminitdata;
#define PAGE_TABLE __pgprot(_PAGE_TABLE)
+#define _PAGE_KERNEL_NC ((_PAGE_KERNEL & ~_PAGE_MTMASK) | _PAGE_NOCACHE)
#define _PAGE_IOREMAP ((_PAGE_KERNEL & ~_PAGE_MTMASK) | _PAGE_IO)
#define PAGE_KERNEL_IO __pgprot(_PAGE_IOREMAP)
@@ -942,6 +943,23 @@ static inline int pudp_test_and_clear_young(struct vm_area_struct *vma,
return ptep_test_and_clear_young(vma, address, (pte_t *)pudp);
}
+#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
+static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long address, pud_t *pudp)
+{
+#ifdef CONFIG_SMP
+ pud_t pud = __pud(xchg(&pudp->pud, 0));
+#else
+ pud_t pud = *pudp;
+
+ pud_clear(pudp);
+#endif
+
+ page_table_check_pud_clear(mm, pud);
+
+ return pud;
+}
+
static inline int pud_young(pud_t pud)
{
return pte_young(pud_pte(pud));
@@ -1118,6 +1136,6 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \
set_pgd(pgdp, pgd); \
})
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_RISCV_PGTABLE_H */
diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
index 24d3af4d3807..da5426122d28 100644
--- a/arch/riscv/include/asm/processor.h
+++ b/arch/riscv/include/asm/processor.h
@@ -54,7 +54,7 @@
#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE / 3)
#endif
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct task_struct;
struct pt_regs;
@@ -215,6 +215,6 @@ long get_tagged_addr_ctrl(struct task_struct *task);
#define GET_TAGGED_ADDR_CTRL() get_tagged_addr_ctrl(current)
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_RISCV_PROCESSOR_H */
diff --git a/arch/riscv/include/asm/ptrace.h b/arch/riscv/include/asm/ptrace.h
index a7dc0e330757..addc8188152f 100644
--- a/arch/riscv/include/asm/ptrace.h
+++ b/arch/riscv/include/asm/ptrace.h
@@ -10,7 +10,7 @@
#include <asm/csr.h>
#include <linux/compiler.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct pt_regs {
unsigned long epc;
@@ -180,6 +180,6 @@ static __always_inline bool regs_irqs_disabled(struct pt_regs *regs)
return !(regs->status & SR_PIE);
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_RISCV_PTRACE_H */
diff --git a/arch/riscv/include/asm/scs.h b/arch/riscv/include/asm/scs.h
index 0e45db78b24b..ab7714aa93bd 100644
--- a/arch/riscv/include/asm/scs.h
+++ b/arch/riscv/include/asm/scs.h
@@ -2,7 +2,7 @@
#ifndef _ASM_SCS_H
#define _ASM_SCS_H
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#include <asm/asm-offsets.h>
#ifdef CONFIG_SHADOW_CALL_STACK
@@ -49,6 +49,6 @@ _skip_scs:
.endm
#endif /* CONFIG_SHADOW_CALL_STACK */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_SCS_H */
diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h
index ea263d3683ef..87389e93325a 100644
--- a/arch/riscv/include/asm/set_memory.h
+++ b/arch/riscv/include/asm/set_memory.h
@@ -6,7 +6,7 @@
#ifndef _ASM_RISCV_SET_MEMORY_H
#define _ASM_RISCV_SET_MEMORY_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* Functions to change memory attributes.
*/
@@ -45,7 +45,7 @@ int set_direct_map_default_noflush(struct page *page);
int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid);
bool kernel_page_present(struct page *page);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_XIP_KERNEL)
#ifdef CONFIG_64BIT
diff --git a/arch/riscv/include/asm/swab.h b/arch/riscv/include/asm/swab.h
new file mode 100644
index 000000000000..c1da22aa1326
--- /dev/null
+++ b/arch/riscv/include/asm/swab.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_RISCV_SWAB_H
+#define _ASM_RISCV_SWAB_H
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <asm/cpufeature-macros.h>
+#include <asm/hwcap.h>
+#include <asm-generic/swab.h>
+
+#if defined(CONFIG_TOOLCHAIN_HAS_ZBB) && defined(CONFIG_RISCV_ISA_ZBB) && !defined(NO_ALTERNATIVE)
+
+// Duplicated from include/uapi/linux/swab.h
+#define ___constant_swab16(x) ((__u16)( \
+ (((__u16)(x) & (__u16)0x00ffU) << 8) | \
+ (((__u16)(x) & (__u16)0xff00U) >> 8)))
+
+#define ___constant_swab32(x) ((__u32)( \
+ (((__u32)(x) & (__u32)0x000000ffUL) << 24) | \
+ (((__u32)(x) & (__u32)0x0000ff00UL) << 8) | \
+ (((__u32)(x) & (__u32)0x00ff0000UL) >> 8) | \
+ (((__u32)(x) & (__u32)0xff000000UL) >> 24)))
+
+#define ___constant_swab64(x) ((__u64)( \
+ (((__u64)(x) & (__u64)0x00000000000000ffULL) << 56) | \
+ (((__u64)(x) & (__u64)0x000000000000ff00ULL) << 40) | \
+ (((__u64)(x) & (__u64)0x0000000000ff0000ULL) << 24) | \
+ (((__u64)(x) & (__u64)0x00000000ff000000ULL) << 8) | \
+ (((__u64)(x) & (__u64)0x000000ff00000000ULL) >> 8) | \
+ (((__u64)(x) & (__u64)0x0000ff0000000000ULL) >> 24) | \
+ (((__u64)(x) & (__u64)0x00ff000000000000ULL) >> 40) | \
+ (((__u64)(x) & (__u64)0xff00000000000000ULL) >> 56)))
+
+#define ARCH_SWAB(size, value) \
+({ \
+ unsigned long x = value; \
+ \
+ if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBB)) { \
+ asm volatile (".option push\n" \
+ ".option arch,+zbb\n" \
+ "rev8 %0, %1\n" \
+ ".option pop\n" \
+ : "=r" (x) : "r" (x)); \
+ x = x >> (BITS_PER_LONG - size); \
+ } else { \
+ x = ___constant_swab##size(value); \
+ } \
+ x; \
+})
+
+static __always_inline __u16 __arch_swab16(__u16 value)
+{
+ return ARCH_SWAB(16, value);
+}
+
+static __always_inline __u32 __arch_swab32(__u32 value)
+{
+ return ARCH_SWAB(32, value);
+}
+
+#ifdef CONFIG_64BIT
+static __always_inline __u64 __arch_swab64(__u64 value)
+{
+ return ARCH_SWAB(64, value);
+}
+#else
+static __always_inline __u64 __arch_swab64(__u64 value)
+{
+ __u32 h = value >> 32;
+ __u32 l = value & ((1ULL << 32) - 1);
+
+ return ((__u64)(__arch_swab32(l)) << 32) | ((__u64)(__arch_swab32(h)));
+}
+#endif
+
+#define __arch_swab64 __arch_swab64
+#define __arch_swab32 __arch_swab32
+#define __arch_swab16 __arch_swab16
+
+#undef ___constant_swab16
+#undef ___constant_swab32
+#undef ___constant_swab64
+
+#undef ARCH_SWAB
+
+#endif /* defined(CONFIG_TOOLCHAIN_HAS_ZBB) && defined(CONFIG_RISCV_ISA_ZBB) && !defined(NO_ALTERNATIVE) */
+#endif /* _ASM_RISCV_SWAB_H */
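
The Zbb path relies on rev8 byte-reversing the whole XLEN-wide register, which is why 16- and 32-bit values must be shifted down by BITS_PER_LONG - size afterwards. A small host-side model of that logic (illustrative only, not kernel code):

#include <stdint.h>
#include <stdio.h>

/* Model of rev8 on a 64-bit register. */
static uint64_t rev8(uint64_t x)
{
	uint64_t r = 0;

	for (int i = 0; i < 8; i++)
		r |= ((x >> (8 * i)) & 0xff) << (8 * (7 - i));
	return r;
}

int main(void)
{
	uint32_t v = 0x11223344;

	/* rev8 of the zero-extended value, then shift down by 64 - 32. */
	uint32_t swapped = (uint32_t)(rev8(v) >> 32);

	printf("%08x -> %08x\n", v, swapped);	/* 11223344 -> 44332211 */
	return 0;
}
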
diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
index f5916a70879a..836d80dd2921 100644
--- a/arch/riscv/include/asm/thread_info.h
+++ b/arch/riscv/include/asm/thread_info.h
@@ -37,7 +37,7 @@
#define IRQ_STACK_SIZE THREAD_SIZE
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/processor.h>
#include <asm/csr.h>
@@ -98,7 +98,7 @@ struct thread_info {
void arch_release_task_struct(struct task_struct *tsk);
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
/*
* thread information flags
@@ -107,23 +107,18 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
* - pending work-to-be-done flags are in lowest half-word
* - other flags in upper half-word(s)
*/
-#define TIF_NEED_RESCHED 0 /* rescheduling necessary */
-#define TIF_NEED_RESCHED_LAZY 1 /* Lazy rescheduling needed */
-#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */
-#define TIF_SIGPENDING 3 /* signal pending */
-#define TIF_RESTORE_SIGMASK 4 /* restore signal mask in do_signal() */
-#define TIF_MEMDIE 5 /* is terminating due to OOM killer */
-#define TIF_NOTIFY_SIGNAL 9 /* signal notifications exist */
-#define TIF_UPROBE 10 /* uprobe breakpoint or singlestep */
-#define TIF_32BIT 11 /* compat-mode 32bit process */
-#define TIF_RISCV_V_DEFER_RESTORE 12 /* restore Vector before returing to user */
-
-#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
-#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
-#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
-#define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
-#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL)
-#define _TIF_UPROBE (1 << TIF_UPROBE)
-#define _TIF_RISCV_V_DEFER_RESTORE (1 << TIF_RISCV_V_DEFER_RESTORE)
+
+/*
+ * Tell the generic TIF infrastructure which bits riscv supports
+ */
+#define HAVE_TIF_NEED_RESCHED_LAZY
+#define HAVE_TIF_RESTORE_SIGMASK
+
+#include <asm-generic/thread_info_tif.h>
+
+#define TIF_32BIT 16 /* compat-mode 32bit process */
+#define TIF_RISCV_V_DEFER_RESTORE 17 /* restore Vector before returning to user */
+
+#define _TIF_RISCV_V_DEFER_RESTORE BIT(TIF_RISCV_V_DEFER_RESTORE)
#endif /* _ASM_RISCV_THREAD_INFO_H */
diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h
index b88a6218b7f2..f5f4f7f85543 100644
--- a/arch/riscv/include/asm/uaccess.h
+++ b/arch/riscv/include/asm/uaccess.h
@@ -209,7 +209,7 @@ do { \
err = 0; \
break; \
__gu_failed: \
- x = 0; \
+ x = (__typeof__(x))0; \
err = -EFAULT; \
} while (0)
@@ -311,7 +311,7 @@ do { \
do { \
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && \
!IS_ALIGNED((uintptr_t)__gu_ptr, sizeof(*__gu_ptr))) { \
- __inttype(x) ___val = (__inttype(x))x; \
+ __typeof__(*(__gu_ptr)) ___val = (x); \
if (__asm_copy_to_user_sum_enabled(__gu_ptr, &(___val), sizeof(*__gu_ptr))) \
goto label; \
break; \
@@ -438,10 +438,10 @@ unsigned long __must_check clear_user(void __user *to, unsigned long n)
}
#define __get_kernel_nofault(dst, src, type, err_label) \
- __get_user_nocheck(*((type *)(dst)), (type *)(src), err_label)
+ __get_user_nocheck(*((type *)(dst)), (__force __user type *)(src), err_label)
#define __put_kernel_nofault(dst, src, type, err_label) \
- __put_user_nocheck(*((type *)(src)), (type *)(dst), err_label)
+ __put_user_nocheck(*((type *)(src)), (__force __user type *)(dst), err_label)
static __must_check __always_inline bool user_access_begin(const void __user *ptr, size_t len)
{
diff --git a/arch/riscv/include/asm/vdso.h b/arch/riscv/include/asm/vdso.h
index c130d8100232..f80357fe24d1 100644
--- a/arch/riscv/include/asm/vdso.h
+++ b/arch/riscv/include/asm/vdso.h
@@ -16,7 +16,7 @@
#define __VDSO_PAGES 4
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <generated/vdso-offsets.h>
#define VDSO_SYMBOL(base, name) \
@@ -34,7 +34,7 @@ extern char compat_vdso_start[], compat_vdso_end[];
extern char vdso_start[], vdso_end[];
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* CONFIG_MMU */
diff --git a/arch/riscv/include/asm/vdso/getrandom.h b/arch/riscv/include/asm/vdso/getrandom.h
index c6d66895c1f5..ab4aef955099 100644
--- a/arch/riscv/include/asm/vdso/getrandom.h
+++ b/arch/riscv/include/asm/vdso/getrandom.h
@@ -5,7 +5,7 @@
#ifndef __ASM_VDSO_GETRANDOM_H
#define __ASM_VDSO_GETRANDOM_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/unistd.h>
@@ -25,6 +25,6 @@ static __always_inline ssize_t getrandom_syscall(void *_buffer, size_t _len, uns
return ret;
}
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* __ASM_VDSO_GETRANDOM_H */
diff --git a/arch/riscv/include/asm/vdso/gettimeofday.h b/arch/riscv/include/asm/vdso/gettimeofday.h
index 29164f84f93c..9ec08fa04d35 100644
--- a/arch/riscv/include/asm/vdso/gettimeofday.h
+++ b/arch/riscv/include/asm/vdso/gettimeofday.h
@@ -2,7 +2,7 @@
#ifndef __ASM_VDSO_GETTIMEOFDAY_H
#define __ASM_VDSO_GETTIMEOFDAY_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/barrier.h>
#include <asm/unistd.h>
@@ -79,6 +79,6 @@ static __always_inline u64 __arch_get_hw_counter(s32 clock_mode,
return csr_read(CSR_TIME);
}
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* __ASM_VDSO_GETTIMEOFDAY_H */
diff --git a/arch/riscv/include/asm/vdso/processor.h b/arch/riscv/include/asm/vdso/processor.h
index 8f383f05a290..c42f95dc8811 100644
--- a/arch/riscv/include/asm/vdso/processor.h
+++ b/arch/riscv/include/asm/vdso/processor.h
@@ -2,9 +2,10 @@
#ifndef __ASM_VDSO_PROCESSOR_H
#define __ASM_VDSO_PROCESSOR_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/barrier.h>
+#include <asm/errata_list.h>
#include <asm/insn-def.h>
static inline void cpu_relax(void)
@@ -19,10 +20,10 @@ static inline void cpu_relax(void)
* Reduce instruction retirement.
* This assumes the PC changes.
*/
- __asm__ __volatile__ (RISCV_PAUSE);
+ ALT_RISCV_PAUSE();
barrier();
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __ASM_VDSO_PROCESSOR_H */
diff --git a/arch/riscv/include/asm/vdso/vsyscall.h b/arch/riscv/include/asm/vdso/vsyscall.h
index 1140b54b4bc8..558eb9dfda52 100644
--- a/arch/riscv/include/asm/vdso/vsyscall.h
+++ b/arch/riscv/include/asm/vdso/vsyscall.h
@@ -2,13 +2,13 @@
#ifndef __ASM_VDSO_VSYSCALL_H
#define __ASM_VDSO_VSYSCALL_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <vdso/datapage.h>
/* The asm-generic header needs to be included after the definitions above */
#include <asm-generic/vdso/vsyscall.h>
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* __ASM_VDSO_VSYSCALL_H */
diff --git a/arch/riscv/include/asm/vendor_extensions/mips.h b/arch/riscv/include/asm/vendor_extensions/mips.h
new file mode 100644
index 000000000000..ea8ca747d691
--- /dev/null
+++ b/arch/riscv/include/asm/vendor_extensions/mips.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2025 MIPS.
+ */
+
+#ifndef _ASM_RISCV_VENDOR_EXTENSIONS_MIPS_H
+#define _ASM_RISCV_VENDOR_EXTENSIONS_MIPS_H
+
+#include <linux/types.h>
+
+#define RISCV_ISA_VENDOR_EXT_XMIPSEXECTL 0
+
+#ifndef __ASSEMBLER__
+struct riscv_isa_vendor_ext_data_list;
+extern struct riscv_isa_vendor_ext_data_list riscv_isa_vendor_ext_list_mips;
+#endif
+
+/* Extension specific instructions */
+
+/*
+ * All of the xmipsexectl extension instructions are
+ * 'hint' encodings of the SLLI instruction,
+ * with rd = 0, rs1 = 0 and imm = 1 for IHB, imm = 3 for EHB,
+ * and imm = 5 for PAUSE.
+ * MIPS.PAUSE is an alternative opcode implemented to have the
+ * same behavior as PAUSE on some MIPS RISC-V cores.
+ * MIPS.EHB clears all execution hazards before allowing
+ * any subsequent instructions to execute.
+ * MIPS.IHB clears all instruction hazards before
+ * allowing any subsequent instructions to be fetched.
+ */
+
+#define MIPS_PAUSE ".4byte 0x00501013\n\t"
+#define MIPS_EHB ".4byte 0x00301013\n\t"
+#define MIPS_IHB ".4byte 0x00101013\n\t"
+
+#endif // _ASM_RISCV_VENDOR_EXTENSIONS_MIPS_H
diff --git a/arch/riscv/include/asm/vendor_extensions/mips_hwprobe.h b/arch/riscv/include/asm/vendor_extensions/mips_hwprobe.h
new file mode 100644
index 000000000000..e63f664b6b17
--- /dev/null
+++ b/arch/riscv/include/asm/vendor_extensions/mips_hwprobe.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2025 MIPS.
+ */
+
+#ifndef _ASM_RISCV_VENDOR_EXTENSIONS_MIPS_HWPROBE_H_
+#define _ASM_RISCV_VENDOR_EXTENSIONS_MIPS_HWPROBE_H_
+
+#include <linux/cpumask.h>
+#include <uapi/asm/hwprobe.h>
+
+#ifdef CONFIG_RISCV_ISA_VENDOR_EXT_MIPS
+void hwprobe_isa_vendor_ext_mips_0(struct riscv_hwprobe *pair, const struct cpumask *cpus);
+#else
+static inline void hwprobe_isa_vendor_ext_mips_0(struct riscv_hwprobe *pair,
+ const struct cpumask *cpus)
+{
+ pair->value = 0;
+}
+#endif
+
+#endif // _ASM_RISCV_VENDOR_EXTENSIONS_MIPS_HWPROBE_H_
diff --git a/arch/riscv/include/asm/vendorid_list.h b/arch/riscv/include/asm/vendorid_list.h
index a5150cdf34d8..3b09874d7a6d 100644
--- a/arch/riscv/include/asm/vendorid_list.h
+++ b/arch/riscv/include/asm/vendorid_list.h
@@ -9,5 +9,6 @@
#define MICROCHIP_VENDOR_ID 0x029
#define SIFIVE_VENDOR_ID 0x489
#define THEAD_VENDOR_ID 0x5b7
+#define MIPS_VENDOR_ID 0x722
#endif
diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h
index aaf6ad970499..5d30a4fae37a 100644
--- a/arch/riscv/include/uapi/asm/hwprobe.h
+++ b/arch/riscv/include/uapi/asm/hwprobe.h
@@ -106,6 +106,7 @@ struct riscv_hwprobe {
#define RISCV_HWPROBE_KEY_VENDOR_EXT_THEAD_0 11
#define RISCV_HWPROBE_KEY_ZICBOM_BLOCK_SIZE 12
#define RISCV_HWPROBE_KEY_VENDOR_EXT_SIFIVE_0 13
+#define RISCV_HWPROBE_KEY_VENDOR_EXT_MIPS_0 14
/* Increase RISCV_HWPROBE_MAX_KEY when adding items. */
/* Flags */
diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h
index ef27d4289da1..251099d860aa 100644
--- a/arch/riscv/include/uapi/asm/kvm.h
+++ b/arch/riscv/include/uapi/asm/kvm.h
@@ -9,7 +9,7 @@
#ifndef __LINUX_KVM_RISCV_H
#define __LINUX_KVM_RISCV_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
#include <asm/bitsperlong.h>
diff --git a/arch/riscv/include/uapi/asm/ptrace.h b/arch/riscv/include/uapi/asm/ptrace.h
index a38268b19c3d..beff8df80ac9 100644
--- a/arch/riscv/include/uapi/asm/ptrace.h
+++ b/arch/riscv/include/uapi/asm/ptrace.h
@@ -6,7 +6,7 @@
#ifndef _UAPI_ASM_RISCV_PTRACE_H
#define _UAPI_ASM_RISCV_PTRACE_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
@@ -127,6 +127,6 @@ struct __riscv_v_regset_state {
*/
#define RISCV_MAX_VLENB (8192)
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _UAPI_ASM_RISCV_PTRACE_H */
diff --git a/arch/riscv/include/uapi/asm/sigcontext.h b/arch/riscv/include/uapi/asm/sigcontext.h
index cd4f175dc837..748dffc9ae19 100644
--- a/arch/riscv/include/uapi/asm/sigcontext.h
+++ b/arch/riscv/include/uapi/asm/sigcontext.h
@@ -15,7 +15,7 @@
/* The size of END signal context header. */
#define END_HDR_SIZE 0x0
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct __sc_riscv_v_state {
struct __riscv_v_ext_state v_state;
@@ -35,6 +35,6 @@ struct sigcontext {
};
};
-#endif /*!__ASSEMBLY__*/
+#endif /*!__ASSEMBLER__*/
#endif /* _UAPI_ASM_RISCV_SIGCONTEXT_H */
diff --git a/arch/riscv/include/uapi/asm/vendor/mips.h b/arch/riscv/include/uapi/asm/vendor/mips.h
new file mode 100644
index 000000000000..e65ab268b265
--- /dev/null
+++ b/arch/riscv/include/uapi/asm/vendor/mips.h
@@ -0,0 +1,3 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+
+#define RISCV_HWPROBE_VENDOR_EXT_XMIPSEXECTL BIT(0)
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index c7b542573407..f60fce69b725 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -113,7 +113,7 @@ obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
-obj-$(CONFIG_CFI_CLANG) += cfi.o
+obj-$(CONFIG_CFI) += cfi.o
obj-$(CONFIG_EFI) += efi.o
obj-$(CONFIG_COMPAT) += compat_syscall_table.o
diff --git a/arch/riscv/kernel/acpi.c b/arch/riscv/kernel/acpi.c
index 3f6d5a6789e8..71698ee11621 100644
--- a/arch/riscv/kernel/acpi.c
+++ b/arch/riscv/kernel/acpi.c
@@ -14,6 +14,7 @@
*/
#include <linux/acpi.h>
+#include <linux/efi-bgrt.h>
#include <linux/efi.h>
#include <linux/io.h>
#include <linux/memblock.h>
@@ -160,6 +161,8 @@ done:
early_init_dt_scan_chosen_stdout();
} else {
acpi_parse_spcr(earlycon_acpi_spcr_enable, true);
+ if (IS_ENABLED(CONFIG_ACPI_BGRT))
+ acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt);
}
}
diff --git a/arch/riscv/kernel/alternative.c b/arch/riscv/kernel/alternative.c
index 7eb3cb1215c6..7642704c7f18 100644
--- a/arch/riscv/kernel/alternative.c
+++ b/arch/riscv/kernel/alternative.c
@@ -47,6 +47,11 @@ static void riscv_fill_cpu_mfr_info(struct cpu_manufacturer_info_t *cpu_mfr_info
cpu_mfr_info->patch_func = andes_errata_patch_func;
break;
#endif
+#ifdef CONFIG_ERRATA_MIPS
+ case MIPS_VENDOR_ID:
+ cpu_mfr_info->patch_func = mips_errata_patch_func;
+ break;
+#endif
#ifdef CONFIG_ERRATA_SIFIVE
case SIFIVE_VENDOR_ID:
cpu_mfr_info->patch_func = sifive_errata_patch_func;
diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
index 6e8c0d6feae9..7d42d3b8a32a 100644
--- a/arch/riscv/kernel/asm-offsets.c
+++ b/arch/riscv/kernel/asm-offsets.c
@@ -3,6 +3,7 @@
* Copyright (C) 2012 Regents of the University of California
* Copyright (C) 2017 SiFive
*/
+#define COMPILE_OFFSETS
#include <linux/kbuild.h>
#include <linux/mm.h>
diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
index 743d53415572..67b59699357d 100644
--- a/arch/riscv/kernel/cpufeature.c
+++ b/arch/riscv/kernel/cpufeature.c
@@ -474,10 +474,10 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
__RISCV_ISA_EXT_DATA(zacas, RISCV_ISA_EXT_ZACAS),
__RISCV_ISA_EXT_DATA(zalrsc, RISCV_ISA_EXT_ZALRSC),
__RISCV_ISA_EXT_DATA(zawrs, RISCV_ISA_EXT_ZAWRS),
- __RISCV_ISA_EXT_DATA(zfa, RISCV_ISA_EXT_ZFA),
+ __RISCV_ISA_EXT_DATA_VALIDATE(zfa, RISCV_ISA_EXT_ZFA, riscv_ext_f_depends),
__RISCV_ISA_EXT_DATA_VALIDATE(zfbfmin, RISCV_ISA_EXT_ZFBFMIN, riscv_ext_f_depends),
- __RISCV_ISA_EXT_DATA(zfh, RISCV_ISA_EXT_ZFH),
- __RISCV_ISA_EXT_DATA(zfhmin, RISCV_ISA_EXT_ZFHMIN),
+ __RISCV_ISA_EXT_DATA_VALIDATE(zfh, RISCV_ISA_EXT_ZFH, riscv_ext_f_depends),
+ __RISCV_ISA_EXT_DATA_VALIDATE(zfhmin, RISCV_ISA_EXT_ZFHMIN, riscv_ext_f_depends),
__RISCV_ISA_EXT_DATA(zca, RISCV_ISA_EXT_ZCA),
__RISCV_ISA_EXT_DATA_VALIDATE(zcb, RISCV_ISA_EXT_ZCB, riscv_ext_zca_depends),
__RISCV_ISA_EXT_DATA_VALIDATE(zcd, RISCV_ISA_EXT_ZCD, riscv_ext_zcd_validate),
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index 3a0ec6fd5956..d3d92a4becc7 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -7,6 +7,7 @@
#include <linux/init.h>
#include <linux/linkage.h>
+#include <asm/alternative-macros.h>
#include <asm/asm.h>
#include <asm/csr.h>
#include <asm/scs.h>
@@ -46,7 +47,7 @@
* a0 = &new_vmalloc[BIT_WORD(cpu)]
* a1 = BIT_MASK(cpu)
*/
- REG_L a2, TASK_TI_CPU(tp)
+ lw a2, TASK_TI_CPU(tp)
/*
* Compute the new_vmalloc element position:
* (cpu / 64) * 8 = (cpu >> 6) << 3
diff --git a/arch/riscv/kernel/kexec_elf.c b/arch/riscv/kernel/kexec_elf.c
index 56444c7bd34e..531d348db84d 100644
--- a/arch/riscv/kernel/kexec_elf.c
+++ b/arch/riscv/kernel/kexec_elf.c
@@ -28,7 +28,7 @@ static int riscv_kexec_elf_load(struct kimage *image, struct elfhdr *ehdr,
int i;
int ret = 0;
size_t size;
- struct kexec_buf kbuf;
+ struct kexec_buf kbuf = {};
const struct elf_phdr *phdr;
kbuf.image = image;
@@ -66,7 +66,7 @@ static int elf_find_pbase(struct kimage *image, unsigned long kernel_len,
{
int i;
int ret;
- struct kexec_buf kbuf;
+ struct kexec_buf kbuf = {};
const struct elf_phdr *phdr;
unsigned long lowest_paddr = ULONG_MAX;
unsigned long lowest_vaddr = ULONG_MAX;
diff --git a/arch/riscv/kernel/kexec_image.c b/arch/riscv/kernel/kexec_image.c
index 26a81774a78a..8f2eb900910b 100644
--- a/arch/riscv/kernel/kexec_image.c
+++ b/arch/riscv/kernel/kexec_image.c
@@ -41,7 +41,7 @@ static void *image_load(struct kimage *image,
struct riscv_image_header *h;
u64 flags;
bool be_image, be_kernel;
- struct kexec_buf kbuf;
+ struct kexec_buf kbuf = {};
int ret;
/* Check Image header */
diff --git a/arch/riscv/kernel/machine_kexec_file.c b/arch/riscv/kernel/machine_kexec_file.c
index e36104af2e24..dd9d92a96517 100644
--- a/arch/riscv/kernel/machine_kexec_file.c
+++ b/arch/riscv/kernel/machine_kexec_file.c
@@ -15,6 +15,7 @@
#include <linux/memblock.h>
#include <linux/vmalloc.h>
#include <asm/setup.h>
+#include <asm/insn.h>
const struct kexec_file_ops * const kexec_file_loaders[] = {
&elf_kexec_ops,
@@ -109,7 +110,6 @@ static char *setup_kdump_cmdline(struct kimage *image, char *cmdline,
}
#endif
-#define RV_X(x, s, n) (((x) >> (s)) & ((1 << (n)) - 1))
#define RISCV_IMM_BITS 12
#define RISCV_IMM_REACH (1LL << RISCV_IMM_BITS)
#define RISCV_CONST_HIGH_PART(x) \
@@ -261,7 +261,7 @@ int load_extra_segments(struct kimage *image, unsigned long kernel_start,
int ret;
void *fdt;
unsigned long initrd_pbase = 0UL;
- struct kexec_buf kbuf;
+ struct kexec_buf kbuf = {};
char *modified_cmdline = NULL;
kbuf.image = image;
diff --git a/arch/riscv/kernel/pi/Makefile b/arch/riscv/kernel/pi/Makefile
index 7dd15be69c90..bc098edac898 100644
--- a/arch/riscv/kernel/pi/Makefile
+++ b/arch/riscv/kernel/pi/Makefile
@@ -39,4 +39,4 @@ $(obj)/ctype.o: $(srctree)/lib/ctype.c FORCE
$(call if_changed_rule,cc_o_c)
obj-y := cmdline_early.pi.o fdt_early.pi.o string.pi.o ctype.pi.o lib-fdt.pi.o lib-fdt_ro.pi.o archrandom_early.pi.o
-extra-y := $(patsubst %.pi.o,%.o,$(obj-y))
+targets := $(patsubst %.pi.o,%.o,$(obj-y))
diff --git a/arch/riscv/kernel/pi/cmdline_early.c b/arch/riscv/kernel/pi/cmdline_early.c
index fbcdc9e4e143..389d086a0718 100644
--- a/arch/riscv/kernel/pi/cmdline_early.c
+++ b/arch/riscv/kernel/pi/cmdline_early.c
@@ -41,9 +41,9 @@ static char *get_early_cmdline(uintptr_t dtb_pa)
static u64 match_noXlvl(char *cmdline)
{
if (strstr(cmdline, "no4lvl"))
- return SATP_MODE_48;
+ return SATP_MODE_39;
else if (strstr(cmdline, "no5lvl"))
- return SATP_MODE_57;
+ return SATP_MODE_48;
return 0;
}
diff --git a/arch/riscv/kernel/pi/fdt_early.c b/arch/riscv/kernel/pi/fdt_early.c
index 9bdee2fafe47..a12ff8090f19 100644
--- a/arch/riscv/kernel/pi/fdt_early.c
+++ b/arch/riscv/kernel/pi/fdt_early.c
@@ -3,6 +3,7 @@
#include <linux/init.h>
#include <linux/libfdt.h>
#include <linux/ctype.h>
+#include <asm/csr.h>
#include "pi.h"
@@ -183,3 +184,42 @@ bool fdt_early_match_extension_isa(const void *fdt, const char *ext_name)
return ret;
}
+
+/**
+ * set_satp_mode_from_fdt - determine SATP mode based on the MMU type in fdt
+ *
+ * @dtb_pa: physical address of the device tree blob
+ *
+ * Returns the SATP mode corresponding to the MMU type of the first enabled CPU,
+ * 0 otherwise
+ */
+u64 set_satp_mode_from_fdt(uintptr_t dtb_pa)
+{
+ const void *fdt = (const void *)dtb_pa;
+ const char *mmu_type;
+ int node, parent;
+
+ parent = fdt_path_offset(fdt, "/cpus");
+ if (parent < 0)
+ return 0;
+
+ fdt_for_each_subnode(node, fdt, parent) {
+ if (!fdt_node_name_eq(fdt, node, "cpu"))
+ continue;
+
+ if (!fdt_device_is_available(fdt, node))
+ continue;
+
+ mmu_type = fdt_getprop(fdt, node, "mmu-type", NULL);
+ if (!mmu_type)
+ break;
+
+ if (!strcmp(mmu_type, "riscv,sv39"))
+ return SATP_MODE_39;
+ else if (!strcmp(mmu_type, "riscv,sv48"))
+ return SATP_MODE_48;
+ break;
+ }
+
+ return 0;
+}
diff --git a/arch/riscv/kernel/pi/pi.h b/arch/riscv/kernel/pi/pi.h
index 21141d84fea6..3fee2cfddf7c 100644
--- a/arch/riscv/kernel/pi/pi.h
+++ b/arch/riscv/kernel/pi/pi.h
@@ -14,6 +14,7 @@ u64 get_kaslr_seed(uintptr_t dtb_pa);
u64 get_kaslr_seed_zkr(const uintptr_t dtb_pa);
bool set_nokaslr_from_cmdline(uintptr_t dtb_pa);
u64 set_satp_mode_from_cmdline(uintptr_t dtb_pa);
+u64 set_satp_mode_from_fdt(uintptr_t dtb_pa);
bool fdt_early_match_extension_isa(const void *fdt, const char *ext_name);
diff --git a/arch/riscv/kernel/probes/simulate-insn.c b/arch/riscv/kernel/probes/simulate-insn.c
index 6c166029079c..fa581590c1f8 100644
--- a/arch/riscv/kernel/probes/simulate-insn.c
+++ b/arch/riscv/kernel/probes/simulate-insn.c
@@ -41,19 +41,16 @@ bool __kprobes simulate_jal(u32 opcode, unsigned long addr, struct pt_regs *regs
* 1 10 1 8 5 JAL/J
*/
bool ret;
- u32 imm;
- u32 index = (opcode >> 7) & 0x1f;
+ s32 imm;
+ u32 index = RV_EXTRACT_RD_REG(opcode);
ret = rv_insn_reg_set_val(regs, index, addr + 4);
if (!ret)
return ret;
- imm = ((opcode >> 21) & 0x3ff) << 1;
- imm |= ((opcode >> 20) & 0x1) << 11;
- imm |= ((opcode >> 12) & 0xff) << 12;
- imm |= ((opcode >> 31) & 0x1) << 20;
+ imm = RV_EXTRACT_JTYPE_IMM(opcode);
- instruction_pointer_set(regs, addr + sign_extend32((imm), 20));
+ instruction_pointer_set(regs, addr + imm);
return ret;
}
@@ -67,9 +64,9 @@ bool __kprobes simulate_jalr(u32 opcode, unsigned long addr, struct pt_regs *reg
*/
bool ret;
unsigned long base_addr;
- u32 imm = (opcode >> 20) & 0xfff;
- u32 rd_index = (opcode >> 7) & 0x1f;
- u32 rs1_index = (opcode >> 15) & 0x1f;
+ u32 imm = RV_EXTRACT_ITYPE_IMM(opcode);
+ u32 rd_index = RV_EXTRACT_RD_REG(opcode);
+ u32 rs1_index = RV_EXTRACT_RS1_REG(opcode);
ret = rv_insn_reg_get_val(regs, rs1_index, &base_addr);
if (!ret)
@@ -84,20 +81,6 @@ bool __kprobes simulate_jalr(u32 opcode, unsigned long addr, struct pt_regs *reg
return ret;
}
-#define auipc_rd_idx(opcode) \
- ((opcode >> 7) & 0x1f)
-
-#define auipc_imm(opcode) \
- ((((opcode) >> 12) & 0xfffff) << 12)
-
-#if __riscv_xlen == 64
-#define auipc_offset(opcode) sign_extend64(auipc_imm(opcode), 31)
-#elif __riscv_xlen == 32
-#define auipc_offset(opcode) auipc_imm(opcode)
-#else
-#error "Unexpected __riscv_xlen"
-#endif
-
bool __kprobes simulate_auipc(u32 opcode, unsigned long addr, struct pt_regs *regs)
{
/*
@@ -107,8 +90,8 @@ bool __kprobes simulate_auipc(u32 opcode, unsigned long addr, struct pt_regs *re
* 20 5 7
*/
- u32 rd_idx = auipc_rd_idx(opcode);
- unsigned long rd_val = addr + auipc_offset(opcode);
+ u32 rd_idx = RV_EXTRACT_RD_REG(opcode);
+ unsigned long rd_val = addr + (s32)RV_EXTRACT_UTYPE_IMM(opcode);
if (!rv_insn_reg_set_val(regs, rd_idx, rd_val))
return false;
@@ -118,24 +101,6 @@ bool __kprobes simulate_auipc(u32 opcode, unsigned long addr, struct pt_regs *re
return true;
}
-#define branch_rs1_idx(opcode) \
- (((opcode) >> 15) & 0x1f)
-
-#define branch_rs2_idx(opcode) \
- (((opcode) >> 20) & 0x1f)
-
-#define branch_funct3(opcode) \
- (((opcode) >> 12) & 0x7)
-
-#define branch_imm(opcode) \
- (((((opcode) >> 8) & 0xf ) << 1) | \
- ((((opcode) >> 25) & 0x3f) << 5) | \
- ((((opcode) >> 7) & 0x1 ) << 11) | \
- ((((opcode) >> 31) & 0x1 ) << 12))
-
-#define branch_offset(opcode) \
- sign_extend32((branch_imm(opcode)), 12)
-
bool __kprobes simulate_branch(u32 opcode, unsigned long addr, struct pt_regs *regs)
{
/*
@@ -156,12 +121,12 @@ bool __kprobes simulate_branch(u32 opcode, unsigned long addr, struct pt_regs *r
unsigned long rs1_val;
unsigned long rs2_val;
- if (!rv_insn_reg_get_val(regs, branch_rs1_idx(opcode), &rs1_val) ||
- !rv_insn_reg_get_val(regs, branch_rs2_idx(opcode), &rs2_val))
+ if (!rv_insn_reg_get_val(regs, RV_EXTRACT_RS1_REG(opcode), &rs1_val) ||
+ !rv_insn_reg_get_val(regs, RV_EXTRACT_RS2_REG(opcode), &rs2_val))
return false;
- offset_tmp = branch_offset(opcode);
- switch (branch_funct3(opcode)) {
+ offset_tmp = RV_EXTRACT_BTYPE_IMM(opcode);
+ switch (RV_EXTRACT_FUNCT3(opcode)) {
case RVG_FUNCT3_BEQ:
offset = (rs1_val == rs2_val) ? offset_tmp : 4;
break;
@@ -191,24 +156,9 @@ bool __kprobes simulate_branch(u32 opcode, unsigned long addr, struct pt_regs *r
bool __kprobes simulate_c_j(u32 opcode, unsigned long addr, struct pt_regs *regs)
{
- /*
- * 15 13 12 2 1 0
- * | funct3 | offset[11|4|9:8|10|6|7|3:1|5] | opcode |
- * 3 11 2
- */
-
- s32 offset;
+ s32 offset = RVC_EXTRACT_JTYPE_IMM(opcode);
- offset = ((opcode >> 3) & 0x7) << 1;
- offset |= ((opcode >> 11) & 0x1) << 4;
- offset |= ((opcode >> 2) & 0x1) << 5;
- offset |= ((opcode >> 7) & 0x1) << 6;
- offset |= ((opcode >> 6) & 0x1) << 7;
- offset |= ((opcode >> 9) & 0x3) << 8;
- offset |= ((opcode >> 8) & 0x1) << 10;
- offset |= ((opcode >> 12) & 0x1) << 11;
-
- instruction_pointer_set(regs, addr + sign_extend32(offset, 11));
+ instruction_pointer_set(regs, addr + offset);
return true;
}
@@ -224,7 +174,7 @@ static bool __kprobes simulate_c_jr_jalr(u32 opcode, unsigned long addr, struct
unsigned long jump_addr;
- u32 rs1 = (opcode >> 7) & 0x1f;
+ u32 rs1 = RVC_EXTRACT_C2_RS1_REG(opcode);
if (rs1 == 0) /* C.JR is only valid when rs1 != x0 */
return false;
@@ -268,16 +218,10 @@ static bool __kprobes simulate_c_bnez_beqz(u32 opcode, unsigned long addr, struc
if (!rv_insn_reg_get_val(regs, rs1, &rs1_val))
return false;
- if ((rs1_val != 0 && is_bnez) || (rs1_val == 0 && !is_bnez)) {
- offset = ((opcode >> 3) & 0x3) << 1;
- offset |= ((opcode >> 10) & 0x3) << 3;
- offset |= ((opcode >> 2) & 0x1) << 5;
- offset |= ((opcode >> 5) & 0x3) << 6;
- offset |= ((opcode >> 12) & 0x1) << 8;
- offset = sign_extend32(offset, 8);
- } else {
+ if ((rs1_val != 0 && is_bnez) || (rs1_val == 0 && !is_bnez))
+ offset = RVC_EXTRACT_BTYPE_IMM(opcode);
+ else
offset = 2;
- }
instruction_pointer_set(regs, addr + offset);
diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c
index a0a40889d79a..31a392993cb4 100644
--- a/arch/riscv/kernel/process.c
+++ b/arch/riscv/kernel/process.c
@@ -223,7 +223,7 @@ asmlinkage void ret_from_fork_user(struct pt_regs *regs)
int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
- unsigned long clone_flags = args->flags;
+ u64 clone_flags = args->flags;
unsigned long usp = args->stack;
unsigned long tls = args->tls;
struct pt_regs *childregs = task_pt_regs(p);
diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c
index 53836a9235e3..5e8cde055264 100644
--- a/arch/riscv/kernel/sbi.c
+++ b/arch/riscv/kernel/sbi.c
@@ -148,7 +148,7 @@ static int __sbi_rfence_v01(int fid, const struct cpumask *cpu_mask,
static void sbi_set_power_off(void)
{
- pm_power_off = sbi_shutdown;
+ register_platform_power_off(sbi_shutdown);
}
#else
static void __sbi_set_timer_v01(uint64_t stime_value)
@@ -682,7 +682,7 @@ void __init sbi_init(void)
if (sbi_spec_version >= sbi_mk_version(0, 3) &&
sbi_probe_extension(SBI_EXT_SRST)) {
pr_info("SBI SRST extension detected\n");
- pm_power_off = sbi_srst_power_off;
+ register_platform_power_off(sbi_srst_power_off);
sbi_srst_reboot_nb.notifier_call = sbi_srst_reboot;
sbi_srst_reboot_nb.priority = 192;
register_restart_handler(&sbi_srst_reboot_nb);
diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
index 0b170e18a2be..000f4451a9d8 100644
--- a/arch/riscv/kernel/sys_hwprobe.c
+++ b/arch/riscv/kernel/sys_hwprobe.c
@@ -15,6 +15,7 @@
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include <asm/vector.h>
+#include <asm/vendor_extensions/mips_hwprobe.h>
#include <asm/vendor_extensions/sifive_hwprobe.h>
#include <asm/vendor_extensions/thead_hwprobe.h>
#include <vdso/vsyscall.h>
@@ -153,14 +154,12 @@ static void hwprobe_isa_ext0(struct riscv_hwprobe *pair,
EXT_KEY(ZVKT);
}
- if (has_fpu()) {
- EXT_KEY(ZCD);
- EXT_KEY(ZCF);
- EXT_KEY(ZFA);
- EXT_KEY(ZFBFMIN);
- EXT_KEY(ZFH);
- EXT_KEY(ZFHMIN);
- }
+ EXT_KEY(ZCD);
+ EXT_KEY(ZCF);
+ EXT_KEY(ZFA);
+ EXT_KEY(ZFBFMIN);
+ EXT_KEY(ZFH);
+ EXT_KEY(ZFHMIN);
if (IS_ENABLED(CONFIG_RISCV_ISA_SUPM))
EXT_KEY(SUPM);
@@ -309,6 +308,9 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair,
case RISCV_HWPROBE_KEY_VENDOR_EXT_THEAD_0:
hwprobe_isa_vendor_ext_thead_0(pair, cpus);
break;
+ case RISCV_HWPROBE_KEY_VENDOR_EXT_MIPS_0:
+ hwprobe_isa_vendor_ext_mips_0(pair, cpus);
+ break;
/*
* For forward compatibility, unknown keys don't fail the whole
diff --git a/arch/riscv/kernel/sys_riscv.c b/arch/riscv/kernel/sys_riscv.c
index d77afe05578f..795b2e815ac9 100644
--- a/arch/riscv/kernel/sys_riscv.c
+++ b/arch/riscv/kernel/sys_riscv.c
@@ -10,7 +10,7 @@
static long riscv_sys_mmap(unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags,
- unsigned long fd, off_t offset,
+ unsigned long fd, unsigned long offset,
unsigned long page_shift_offset)
{
if (unlikely(offset & (~PAGE_MASK >> page_shift_offset)))
diff --git a/arch/riscv/kernel/tests/Kconfig.debug b/arch/riscv/kernel/tests/Kconfig.debug
index 78cea5d2c270..5db4df44279e 100644
--- a/arch/riscv/kernel/tests/Kconfig.debug
+++ b/arch/riscv/kernel/tests/Kconfig.debug
@@ -30,6 +30,18 @@ config RISCV_MODULE_LINKING_KUNIT
If unsure, say N.
+config RISCV_KPROBES_KUNIT
+ bool "KUnit test for riscv kprobes" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ depends on KPROBES
+ default KUNIT_ALL_TESTS
+ help
+ Enable testing for riscv kprobes. Useful for riscv and/or kprobes
+ development. The test verifies that kprobes do not change the behaviour
+ of some sample functions.
+
+ If unsure, say N.
+
endif # RUNTIME_TESTING_MENU
endmenu # "arch/riscv/kernel runtime Testing"
diff --git a/arch/riscv/kernel/tests/Makefile b/arch/riscv/kernel/tests/Makefile
index 7d6c76cffe20..407e7e6c28dc 100644
--- a/arch/riscv/kernel/tests/Makefile
+++ b/arch/riscv/kernel/tests/Makefile
@@ -1 +1,2 @@
obj-$(CONFIG_RISCV_MODULE_LINKING_KUNIT) += module_test/
+obj-$(CONFIG_RISCV_KPROBES_KUNIT) += kprobes/
diff --git a/arch/riscv/kernel/tests/kprobes/Makefile b/arch/riscv/kernel/tests/kprobes/Makefile
new file mode 100644
index 000000000000..4cb6c66a98e8
--- /dev/null
+++ b/arch/riscv/kernel/tests/kprobes/Makefile
@@ -0,0 +1 @@
+obj-y += test-kprobes.o test-kprobes-asm.o
diff --git a/arch/riscv/kernel/tests/kprobes/test-kprobes-asm.S b/arch/riscv/kernel/tests/kprobes/test-kprobes-asm.S
new file mode 100644
index 000000000000..b951d0f12482
--- /dev/null
+++ b/arch/riscv/kernel/tests/kprobes/test-kprobes-asm.S
@@ -0,0 +1,229 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include "test-kprobes.h"
+
+SYM_FUNC_START(test_kprobes_add)
+ li a1, KPROBE_TEST_MAGIC_UPPER
+ li a2, KPROBE_TEST_MAGIC_LOWER
+test_kprobes_add_addr1:
+ add a1, a1, a2
+test_kprobes_add_addr2:
+ add a0, a1, x0
+ ret
+SYM_FUNC_END(test_kprobes_add)
+
+SYM_FUNC_START(test_kprobes_jal)
+ li a0, 0
+ mv a1, ra
+ .option push
+ .option norvc
+test_kprobes_jal_addr1:
+ jal x0, 2f
+ ret
+ .option pop
+1: li a0, KPROBE_TEST_MAGIC_UPPER
+ ret
+ .option push
+ .option norvc
+test_kprobes_jal_addr2:
+2: jal 1b
+ .option pop
+ li a2, KPROBE_TEST_MAGIC_LOWER
+ add a0, a0, a2
+ jr a1
+SYM_FUNC_END(test_kprobes_jal)
+
+SYM_FUNC_START(test_kprobes_jalr)
+ la a0, 1f
+ mv a1, ra
+ .option push
+ .option norvc
+test_kprobes_jalr_addr:
+ jalr a0
+ .option pop
+ li t0, KPROBE_TEST_MAGIC_UPPER
+ add a0, a0, t0
+ jr a1
+1: li a0, KPROBE_TEST_MAGIC_LOWER
+ ret
+SYM_FUNC_END(test_kprobes_jalr)
+
+SYM_FUNC_START(test_kprobes_auipc)
+test_kprobes_auipc_addr:
+ auipc a0, KPROBE_TEST_MAGIC_LOWER
+ la a1, test_kprobes_auipc_addr
+ sub a0, a0, a1
+ srli a0, a0, 12
+ li a1, KPROBE_TEST_MAGIC_UPPER
+ add a0, a0, a1
+ ret
+SYM_FUNC_END(test_kprobes_auipc)
+
+SYM_FUNC_START(test_kprobes_branch)
+ .option push
+ .option norvc
+ li a0, 0
+ li a1, 1
+ li a2, 2
+test_kprobes_branch_addr1:
+ beqz a0, 1f
+ ret
+1:
+test_kprobes_branch_addr2:
+ beqz a1, 3f
+test_kprobes_branch_addr3:
+ bnez a0, 3f
+test_kprobes_branch_addr4:
+ bnez a2, 1f
+ ret
+1:
+test_kprobes_branch_addr5:
+ bge a1, a2, 3f
+test_kprobes_branch_addr6:
+ bge a2, a1, 2f
+ ret
+1:
+ li t0, KPROBE_TEST_MAGIC_UPPER
+ add a0, a0, t0
+ ret
+2:
+test_kprobes_branch_addr7:
+ blt a2, a1, 3f
+ li a0, KPROBE_TEST_MAGIC_LOWER
+test_kprobes_branch_addr8:
+ blt a1, a2, 1b
+3:
+ li a0, 0
+ ret
+ .option pop
+SYM_FUNC_END(test_kprobes_branch)
+
+#ifdef CONFIG_RISCV_ISA_C
+
+SYM_FUNC_START(test_kprobes_c_j)
+ li a0, 0
+test_kprobes_branch_c_j_addr1:
+ c.j 2f
+1:
+ li a1, KPROBE_TEST_MAGIC_UPPER
+ add a0, a0, a1
+ ret
+2: li a0, KPROBE_TEST_MAGIC_LOWER
+test_kprobes_branch_c_j_addr2:
+ c.j 1b
+SYM_FUNC_END(test_kprobes_c_j)
+
+SYM_FUNC_START(test_kprobes_c_jr)
+ la a0, 2f
+test_kprobes_c_jr_addr1:
+ c.jr a0
+ ret
+1: li a1, KPROBE_TEST_MAGIC_LOWER
+ add a0, a0, a1
+ ret
+2:
+ li a0, KPROBE_TEST_MAGIC_UPPER
+ la a1, 1b
+test_kprobes_c_jr_addr2:
+ c.jr a1
+SYM_FUNC_END(test_kprobes_c_jr)
+
+SYM_FUNC_START(test_kprobes_c_jalr)
+ mv a1, ra
+ la a0, 1f
+test_kprobes_c_jalr_addr:
+ c.jalr a0
+ li a2, KPROBE_TEST_MAGIC_UPPER
+ add a0, a0, a2
+ jr a1
+1: li a0, KPROBE_TEST_MAGIC_LOWER
+ ret
+SYM_FUNC_END(test_kprobes_c_jalr)
+
+SYM_FUNC_START(test_kprobes_c_beqz)
+ li a0, 0
+ li a1, 1
+test_kprobes_c_beqz_addr1:
+ c.beqz a0, 2f
+ ret
+1: li a1, KPROBE_TEST_MAGIC_UPPER
+ add a0, a0, a1
+ ret
+test_kprobes_c_beqz_addr2:
+2: c.beqz a1, 3f
+ li a0, KPROBE_TEST_MAGIC_LOWER
+ mv a1, x0
+test_kprobes_c_beqz_addr3:
+ c.beqz a1, 1b
+3: li a0, 0
+ ret
+SYM_FUNC_END(test_kprobes_c_beqz)
+
+SYM_FUNC_START(test_kprobes_c_bnez)
+ li a0, 0
+ li a1, 1
+test_kprobes_c_bnez_addr1:
+ c.bnez a1, 2f
+ ret
+1: li a1, KPROBE_TEST_MAGIC_UPPER
+ add a0, a0, a1
+ ret
+test_kprobes_c_bnez_addr2:
+2: c.bnez a0, 3f
+ li a0, KPROBE_TEST_MAGIC_LOWER
+test_kprobes_c_bnez_addr3:
+ c.bnez a0, 1b
+3: li a0, 0
+ ret
+SYM_FUNC_END(test_kprobes_c_bnez)
+
+#endif /* CONFIG_RISCV_ISA_C */
+
+SYM_DATA_START(test_kprobes_addresses)
+ RISCV_PTR test_kprobes_add_addr1
+ RISCV_PTR test_kprobes_add_addr2
+ RISCV_PTR test_kprobes_jal_addr1
+ RISCV_PTR test_kprobes_jal_addr2
+ RISCV_PTR test_kprobes_jalr_addr
+ RISCV_PTR test_kprobes_auipc_addr
+ RISCV_PTR test_kprobes_branch_addr1
+ RISCV_PTR test_kprobes_branch_addr2
+ RISCV_PTR test_kprobes_branch_addr3
+ RISCV_PTR test_kprobes_branch_addr4
+ RISCV_PTR test_kprobes_branch_addr5
+ RISCV_PTR test_kprobes_branch_addr6
+ RISCV_PTR test_kprobes_branch_addr7
+ RISCV_PTR test_kprobes_branch_addr8
+#ifdef CONFIG_RISCV_ISA_C
+ RISCV_PTR test_kprobes_branch_c_j_addr1
+ RISCV_PTR test_kprobes_branch_c_j_addr2
+ RISCV_PTR test_kprobes_c_jr_addr1
+ RISCV_PTR test_kprobes_c_jr_addr2
+ RISCV_PTR test_kprobes_c_jalr_addr
+ RISCV_PTR test_kprobes_c_beqz_addr1
+ RISCV_PTR test_kprobes_c_beqz_addr2
+ RISCV_PTR test_kprobes_c_beqz_addr3
+ RISCV_PTR test_kprobes_c_bnez_addr1
+ RISCV_PTR test_kprobes_c_bnez_addr2
+ RISCV_PTR test_kprobes_c_bnez_addr3
+#endif /* CONFIG_RISCV_ISA_C */
+ RISCV_PTR 0
+SYM_DATA_END(test_kprobes_addresses)
+
+SYM_DATA_START(test_kprobes_functions)
+ RISCV_PTR test_kprobes_add
+ RISCV_PTR test_kprobes_jal
+ RISCV_PTR test_kprobes_jalr
+ RISCV_PTR test_kprobes_auipc
+ RISCV_PTR test_kprobes_branch
+#ifdef CONFIG_RISCV_ISA_C
+ RISCV_PTR test_kprobes_c_j
+ RISCV_PTR test_kprobes_c_jr
+ RISCV_PTR test_kprobes_c_jalr
+ RISCV_PTR test_kprobes_c_beqz
+ RISCV_PTR test_kprobes_c_bnez
+#endif /* CONFIG_RISCV_ISA_C */
+ RISCV_PTR 0
+SYM_DATA_END(test_kprobes_functions)
diff --git a/arch/riscv/kernel/tests/kprobes/test-kprobes.c b/arch/riscv/kernel/tests/kprobes/test-kprobes.c
new file mode 100644
index 000000000000..6f6cdfbf5a95
--- /dev/null
+++ b/arch/riscv/kernel/tests/kprobes/test-kprobes.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <kunit/test.h>
+#include "test-kprobes.h"
+
+static int kprobe_dummy_handler(struct kprobe *kp, struct pt_regs *regs)
+{
+ return 0;
+}
+
+static void test_kprobe_riscv(struct kunit *test)
+{
+ unsigned int num_kprobe = 0;
+ long (*func)(void);
+ struct kprobe *kp;
+ int i;
+
+ while (test_kprobes_addresses[num_kprobe])
+ num_kprobe++;
+
+ kp = kcalloc(num_kprobe, sizeof(*kp), GFP_KERNEL);
+ KUNIT_EXPECT_TRUE(test, kp);
+ if (!kp)
+ return;
+
+ for (i = 0; i < num_kprobe; ++i) {
+ kp[i].addr = test_kprobes_addresses[i];
+ kp[i].pre_handler = kprobe_dummy_handler;
+ KUNIT_EXPECT_EQ(test, 0, register_kprobe(&kp[i]));
+ }
+
+ for (i = 0;; ++i) {
+ func = test_kprobes_functions[i];
+ if (!func)
+ break;
+ KUNIT_EXPECT_EQ_MSG(test, KPROBE_TEST_MAGIC, func(), "function %d broken", i);
+ }
+
+ for (i = 0; i < num_kprobe; ++i)
+ unregister_kprobe(&kp[i]);
+ kfree(kp);
+}
+
+static struct kunit_case kprobes_testcases[] = {
+ KUNIT_CASE(test_kprobe_riscv),
+ {}
+};
+
+static struct kunit_suite kprobes_test_suite = {
+ .name = "kprobes_test_riscv",
+ .test_cases = kprobes_testcases,
+};
+
+kunit_test_suites(&kprobes_test_suite);
diff --git a/arch/riscv/kernel/tests/kprobes/test-kprobes.h b/arch/riscv/kernel/tests/kprobes/test-kprobes.h
new file mode 100644
index 000000000000..3886ab491ecb
--- /dev/null
+++ b/arch/riscv/kernel/tests/kprobes/test-kprobes.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+#ifndef TEST_KPROBES_H
+#define TEST_KPROBES_H
+
+/*
+ * The magic value that all the functions in the test_kprobes_functions array return. The test
+ * installs kprobes into these functions, and verifies that the functions still correctly return this
+ * value.
+ */
+#define KPROBE_TEST_MAGIC 0xcafebabe
+#define KPROBE_TEST_MAGIC_LOWER 0x0000babe
+#define KPROBE_TEST_MAGIC_UPPER 0xcafe0000
+
+#ifndef __ASSEMBLY__
+
+/* array of addresses to install kprobes */
+extern void *test_kprobes_addresses[];
+
+/* array of functions that return KPROBE_TEST_MAGIC */
+extern long (*test_kprobes_functions[])(void);
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* TEST_KPROBES_H */
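The magic constant is deliberately split so that every test function has to recombine the two halves across the instructions that carry the probes; if the kprobe machinery mis-emulates one of them, the function no longer returns KPROBE_TEST_MAGIC and the KUnit assertion fails. A minimal standalone check of that relationship (illustrative only, not part of the patch):

/* Illustrative only: how the halves from test-kprobes.h compose. */
#include <assert.h>
#include <stdint.h>

#define KPROBE_TEST_MAGIC       0xcafebabe
#define KPROBE_TEST_MAGIC_LOWER 0x0000babe
#define KPROBE_TEST_MAGIC_UPPER 0xcafe0000

int main(void)
{
	/* test_kprobes_add materializes the halves with li and adds them */
	uint32_t upper = KPROBE_TEST_MAGIC_UPPER;
	uint32_t lower = KPROBE_TEST_MAGIC_LOWER;

	assert(upper + lower == KPROBE_TEST_MAGIC);
	return 0;
}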
diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
index f760e4fcc052..2a27d3ff4ac6 100644
--- a/arch/riscv/kernel/traps_misaligned.c
+++ b/arch/riscv/kernel/traps_misaligned.c
@@ -18,149 +18,7 @@
#include <asm/cpufeature.h>
#include <asm/sbi.h>
#include <asm/vector.h>
-
-#define INSN_MATCH_LB 0x3
-#define INSN_MASK_LB 0x707f
-#define INSN_MATCH_LH 0x1003
-#define INSN_MASK_LH 0x707f
-#define INSN_MATCH_LW 0x2003
-#define INSN_MASK_LW 0x707f
-#define INSN_MATCH_LD 0x3003
-#define INSN_MASK_LD 0x707f
-#define INSN_MATCH_LBU 0x4003
-#define INSN_MASK_LBU 0x707f
-#define INSN_MATCH_LHU 0x5003
-#define INSN_MASK_LHU 0x707f
-#define INSN_MATCH_LWU 0x6003
-#define INSN_MASK_LWU 0x707f
-#define INSN_MATCH_SB 0x23
-#define INSN_MASK_SB 0x707f
-#define INSN_MATCH_SH 0x1023
-#define INSN_MASK_SH 0x707f
-#define INSN_MATCH_SW 0x2023
-#define INSN_MASK_SW 0x707f
-#define INSN_MATCH_SD 0x3023
-#define INSN_MASK_SD 0x707f
-
-#define INSN_MATCH_FLW 0x2007
-#define INSN_MASK_FLW 0x707f
-#define INSN_MATCH_FLD 0x3007
-#define INSN_MASK_FLD 0x707f
-#define INSN_MATCH_FLQ 0x4007
-#define INSN_MASK_FLQ 0x707f
-#define INSN_MATCH_FSW 0x2027
-#define INSN_MASK_FSW 0x707f
-#define INSN_MATCH_FSD 0x3027
-#define INSN_MASK_FSD 0x707f
-#define INSN_MATCH_FSQ 0x4027
-#define INSN_MASK_FSQ 0x707f
-
-#define INSN_MATCH_C_LD 0x6000
-#define INSN_MASK_C_LD 0xe003
-#define INSN_MATCH_C_SD 0xe000
-#define INSN_MASK_C_SD 0xe003
-#define INSN_MATCH_C_LW 0x4000
-#define INSN_MASK_C_LW 0xe003
-#define INSN_MATCH_C_SW 0xc000
-#define INSN_MASK_C_SW 0xe003
-#define INSN_MATCH_C_LDSP 0x6002
-#define INSN_MASK_C_LDSP 0xe003
-#define INSN_MATCH_C_SDSP 0xe002
-#define INSN_MASK_C_SDSP 0xe003
-#define INSN_MATCH_C_LWSP 0x4002
-#define INSN_MASK_C_LWSP 0xe003
-#define INSN_MATCH_C_SWSP 0xc002
-#define INSN_MASK_C_SWSP 0xe003
-
-#define INSN_MATCH_C_FLD 0x2000
-#define INSN_MASK_C_FLD 0xe003
-#define INSN_MATCH_C_FLW 0x6000
-#define INSN_MASK_C_FLW 0xe003
-#define INSN_MATCH_C_FSD 0xa000
-#define INSN_MASK_C_FSD 0xe003
-#define INSN_MATCH_C_FSW 0xe000
-#define INSN_MASK_C_FSW 0xe003
-#define INSN_MATCH_C_FLDSP 0x2002
-#define INSN_MASK_C_FLDSP 0xe003
-#define INSN_MATCH_C_FSDSP 0xa002
-#define INSN_MASK_C_FSDSP 0xe003
-#define INSN_MATCH_C_FLWSP 0x6002
-#define INSN_MASK_C_FLWSP 0xe003
-#define INSN_MATCH_C_FSWSP 0xe002
-#define INSN_MASK_C_FSWSP 0xe003
-
-#define INSN_MATCH_C_LHU 0x8400
-#define INSN_MASK_C_LHU 0xfc43
-#define INSN_MATCH_C_LH 0x8440
-#define INSN_MASK_C_LH 0xfc43
-#define INSN_MATCH_C_SH 0x8c00
-#define INSN_MASK_C_SH 0xfc43
-
-#define INSN_LEN(insn) ((((insn) & 0x3) < 0x3) ? 2 : 4)
-
-#if defined(CONFIG_64BIT)
-#define LOG_REGBYTES 3
-#define XLEN 64
-#else
-#define LOG_REGBYTES 2
-#define XLEN 32
-#endif
-#define REGBYTES (1 << LOG_REGBYTES)
-#define XLEN_MINUS_16 ((XLEN) - 16)
-
-#define SH_RD 7
-#define SH_RS1 15
-#define SH_RS2 20
-#define SH_RS2C 2
-
-#define RV_X(x, s, n) (((x) >> (s)) & ((1 << (n)) - 1))
-#define RVC_LW_IMM(x) ((RV_X(x, 6, 1) << 2) | \
- (RV_X(x, 10, 3) << 3) | \
- (RV_X(x, 5, 1) << 6))
-#define RVC_LD_IMM(x) ((RV_X(x, 10, 3) << 3) | \
- (RV_X(x, 5, 2) << 6))
-#define RVC_LWSP_IMM(x) ((RV_X(x, 4, 3) << 2) | \
- (RV_X(x, 12, 1) << 5) | \
- (RV_X(x, 2, 2) << 6))
-#define RVC_LDSP_IMM(x) ((RV_X(x, 5, 2) << 3) | \
- (RV_X(x, 12, 1) << 5) | \
- (RV_X(x, 2, 3) << 6))
-#define RVC_SWSP_IMM(x) ((RV_X(x, 9, 4) << 2) | \
- (RV_X(x, 7, 2) << 6))
-#define RVC_SDSP_IMM(x) ((RV_X(x, 10, 3) << 3) | \
- (RV_X(x, 7, 3) << 6))
-#define RVC_RS1S(insn) (8 + RV_X(insn, SH_RD, 3))
-#define RVC_RS2S(insn) (8 + RV_X(insn, SH_RS2C, 3))
-#define RVC_RS2(insn) RV_X(insn, SH_RS2C, 5)
-
-#define SHIFT_RIGHT(x, y) \
- ((y) < 0 ? ((x) << -(y)) : ((x) >> (y)))
-
-#define REG_MASK \
- ((1 << (5 + LOG_REGBYTES)) - (1 << LOG_REGBYTES))
-
-#define REG_OFFSET(insn, pos) \
- (SHIFT_RIGHT((insn), (pos) - LOG_REGBYTES) & REG_MASK)
-
-#define REG_PTR(insn, pos, regs) \
- (ulong *)((ulong)(regs) + REG_OFFSET(insn, pos))
-
-#define GET_RS1(insn, regs) (*REG_PTR(insn, SH_RS1, regs))
-#define GET_RS2(insn, regs) (*REG_PTR(insn, SH_RS2, regs))
-#define GET_RS1S(insn, regs) (*REG_PTR(RVC_RS1S(insn), 0, regs))
-#define GET_RS2S(insn, regs) (*REG_PTR(RVC_RS2S(insn), 0, regs))
-#define GET_RS2C(insn, regs) (*REG_PTR(insn, SH_RS2C, regs))
-#define GET_SP(regs) (*REG_PTR(2, 0, regs))
-#define SET_RD(insn, regs, val) (*REG_PTR(insn, SH_RD, regs) = (val))
-#define IMM_I(insn) ((s32)(insn) >> 20)
-#define IMM_S(insn) (((s32)(insn) >> 25 << 5) | \
- (s32)(((insn) >> 7) & 0x1f))
-#define MASK_FUNCT3 0x7000
-
-#define GET_PRECISION(insn) (((insn) >> 25) & 3)
-#define GET_RM(insn) (((insn) >> 12) & 7)
-#define PRECISION_S 0
-#define PRECISION_D 1
+#include <asm/insn.h>
#ifdef CONFIG_FPU
diff --git a/arch/riscv/kernel/vector.c b/arch/riscv/kernel/vector.c
index 184f780c932d..901e67adf576 100644
--- a/arch/riscv/kernel/vector.c
+++ b/arch/riscv/kernel/vector.c
@@ -93,7 +93,7 @@ bool insn_is_vector(u32 insn_buf)
return true;
case RVV_OPCODE_VL:
case RVV_OPCODE_VS:
- width = RVV_EXRACT_VL_VS_WIDTH(insn_buf);
+ width = RVV_EXTRACT_VL_VS_WIDTH(insn_buf);
if (width == RVV_VL_VS_WIDTH_8 || width == RVV_VL_VS_WIDTH_16 ||
width == RVV_VL_VS_WIDTH_32 || width == RVV_VL_VS_WIDTH_64)
return true;
diff --git a/arch/riscv/kernel/vendor_extensions.c b/arch/riscv/kernel/vendor_extensions.c
index 92d8ff81f42c..bb4a75923685 100644
--- a/arch/riscv/kernel/vendor_extensions.c
+++ b/arch/riscv/kernel/vendor_extensions.c
@@ -6,6 +6,7 @@
#include <asm/vendorid_list.h>
#include <asm/vendor_extensions.h>
#include <asm/vendor_extensions/andes.h>
+#include <asm/vendor_extensions/mips.h>
#include <asm/vendor_extensions/sifive.h>
#include <asm/vendor_extensions/thead.h>
@@ -16,6 +17,9 @@ struct riscv_isa_vendor_ext_data_list *riscv_isa_vendor_ext_list[] = {
#ifdef CONFIG_RISCV_ISA_VENDOR_EXT_ANDES
&riscv_isa_vendor_ext_list_andes,
#endif
+#ifdef CONFIG_RISCV_ISA_VENDOR_EXT_MIPS
+ &riscv_isa_vendor_ext_list_mips,
+#endif
#ifdef CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE
&riscv_isa_vendor_ext_list_sifive,
#endif
@@ -49,6 +53,12 @@ bool __riscv_isa_vendor_extension_available(int cpu, unsigned long vendor, unsig
cpu_bmap = riscv_isa_vendor_ext_list_andes.per_hart_isa_bitmap;
break;
#endif
+ #ifdef CONFIG_RISCV_ISA_VENDOR_EXT_MIPS
+ case MIPS_VENDOR_ID:
+ bmap = &riscv_isa_vendor_ext_list_mips.all_harts_isa_bitmap;
+ cpu_bmap = riscv_isa_vendor_ext_list_mips.per_hart_isa_bitmap;
+ break;
+ #endif
#ifdef CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE
case SIFIVE_VENDOR_ID:
bmap = &riscv_isa_vendor_ext_list_sifive.all_harts_isa_bitmap;
diff --git a/arch/riscv/kernel/vendor_extensions/Makefile b/arch/riscv/kernel/vendor_extensions/Makefile
index a4eca96d1c8a..bf116c82b6bd 100644
--- a/arch/riscv/kernel/vendor_extensions/Makefile
+++ b/arch/riscv/kernel/vendor_extensions/Makefile
@@ -1,6 +1,8 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_ANDES) += andes.o
+obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_MIPS) += mips.o
+obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_MIPS) += mips_hwprobe.o
obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE) += sifive.o
obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE) += sifive_hwprobe.o
obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_THEAD) += thead.o
diff --git a/arch/riscv/kernel/vendor_extensions/mips.c b/arch/riscv/kernel/vendor_extensions/mips.c
new file mode 100644
index 000000000000..f691129f96c2
--- /dev/null
+++ b/arch/riscv/kernel/vendor_extensions/mips.c
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 MIPS.
+ */
+
+#include <asm/cpufeature.h>
+#include <asm/vendor_extensions.h>
+#include <asm/vendor_extensions/mips.h>
+
+#include <linux/array_size.h>
+#include <linux/cpumask.h>
+#include <linux/types.h>
+
+/* All MIPS vendor extensions supported in Linux */
+static const struct riscv_isa_ext_data riscv_isa_vendor_ext_mips[] = {
+ __RISCV_ISA_EXT_DATA(xmipsexectl, RISCV_ISA_VENDOR_EXT_XMIPSEXECTL),
+};
+
+struct riscv_isa_vendor_ext_data_list riscv_isa_vendor_ext_list_mips = {
+ .ext_data_count = ARRAY_SIZE(riscv_isa_vendor_ext_mips),
+ .ext_data = riscv_isa_vendor_ext_mips,
+};
diff --git a/arch/riscv/kernel/vendor_extensions/mips_hwprobe.c b/arch/riscv/kernel/vendor_extensions/mips_hwprobe.c
new file mode 100644
index 000000000000..dc213a2ca70d
--- /dev/null
+++ b/arch/riscv/kernel/vendor_extensions/mips_hwprobe.c
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 MIPS.
+ */
+
+#include <asm/vendor_extensions.h>
+#include <asm/vendor_extensions/mips.h>
+#include <asm/vendor_extensions/mips_hwprobe.h>
+#include <asm/vendor_extensions/vendor_hwprobe.h>
+
+#include <linux/cpumask.h>
+#include <linux/types.h>
+
+#include <uapi/asm/hwprobe.h>
+#include <uapi/asm/vendor/mips.h>
+
+void hwprobe_isa_vendor_ext_mips_0(struct riscv_hwprobe *pair,
+ const struct cpumask *cpus)
+{
+ VENDOR_EXTENSION_SUPPORTED(pair, cpus,
+ riscv_isa_vendor_ext_list_mips.per_hart_isa_bitmap,
+ { VENDOR_EXT_KEY(XMIPSEXECTL); });
+}
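For context, a userspace caller could query the new key through the riscv_hwprobe() syscall in the usual way. The sketch below is illustrative only: the struct and calling convention come from the existing <asm/hwprobe.h> uAPI, while the RISCV_HWPROBE_VENDOR_EXT_XMIPSEXECTL bit name and the <asm/vendor/mips.h> header are assumptions based on the naming pattern of the SiFive/T-Head vendor keys.

/* Hypothetical userspace probe for the new MIPS vendor key (sketch only). */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/hwprobe.h>        /* struct riscv_hwprobe, RISCV_HWPROBE_KEY_* */
#include <asm/vendor/mips.h>    /* assumed: RISCV_HWPROBE_VENDOR_EXT_XMIPSEXECTL */

int main(void)
{
	struct riscv_hwprobe pair = {
		.key = RISCV_HWPROBE_KEY_VENDOR_EXT_MIPS_0,
	};

	/* No cpumask: ask for the extensions common to all harts. */
	if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0))
		return 1;

	printf("xmipsexectl: %s\n",
	       (pair.value & RISCV_HWPROBE_VENDOR_EXT_XMIPSEXECTL) ? "yes" : "no");
	return 0;
}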
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index a1c3b2ec1dde..525fb5a330c0 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -39,6 +39,7 @@ int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
unsigned long size, bool writable, bool in_atomic)
{
int ret = 0;
+ pgprot_t prot;
unsigned long pfn;
phys_addr_t addr, end;
struct kvm_mmu_memory_cache pcache = {
@@ -55,10 +56,12 @@ int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
end = (gpa + size + PAGE_SIZE - 1) & PAGE_MASK;
pfn = __phys_to_pfn(hpa);
+ prot = pgprot_noncached(PAGE_WRITE);
for (addr = gpa; addr < end; addr += PAGE_SIZE) {
map.addr = addr;
- map.pte = pfn_pte(pfn, PAGE_KERNEL_IO);
+ map.pte = pfn_pte(pfn, prot);
+ map.pte = pte_mkdirty(map.pte);
map.level = 0;
if (!writable)
diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c
index f001e56403f9..3ebcfffaa978 100644
--- a/arch/riscv/kvm/vcpu.c
+++ b/arch/riscv/kvm/vcpu.c
@@ -683,7 +683,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
}
/**
- * check_vcpu_requests - check and handle pending vCPU requests
+ * kvm_riscv_check_vcpu_requests - check and handle pending vCPU requests
* @vcpu: the VCPU pointer
*
* Return: 1 if we should enter the guest
diff --git a/arch/riscv/kvm/vcpu_insn.c b/arch/riscv/kvm/vcpu_insn.c
index 97dec18e6989..de1f96ea6225 100644
--- a/arch/riscv/kvm/vcpu_insn.c
+++ b/arch/riscv/kvm/vcpu_insn.c
@@ -8,133 +8,7 @@
#include <linux/kvm_host.h>
#include <asm/cpufeature.h>
-
-#define INSN_OPCODE_MASK 0x007c
-#define INSN_OPCODE_SHIFT 2
-#define INSN_OPCODE_SYSTEM 28
-
-#define INSN_MASK_WFI 0xffffffff
-#define INSN_MATCH_WFI 0x10500073
-
-#define INSN_MASK_WRS 0xffffffff
-#define INSN_MATCH_WRS 0x00d00073
-
-#define INSN_MATCH_CSRRW 0x1073
-#define INSN_MASK_CSRRW 0x707f
-#define INSN_MATCH_CSRRS 0x2073
-#define INSN_MASK_CSRRS 0x707f
-#define INSN_MATCH_CSRRC 0x3073
-#define INSN_MASK_CSRRC 0x707f
-#define INSN_MATCH_CSRRWI 0x5073
-#define INSN_MASK_CSRRWI 0x707f
-#define INSN_MATCH_CSRRSI 0x6073
-#define INSN_MASK_CSRRSI 0x707f
-#define INSN_MATCH_CSRRCI 0x7073
-#define INSN_MASK_CSRRCI 0x707f
-
-#define INSN_MATCH_LB 0x3
-#define INSN_MASK_LB 0x707f
-#define INSN_MATCH_LH 0x1003
-#define INSN_MASK_LH 0x707f
-#define INSN_MATCH_LW 0x2003
-#define INSN_MASK_LW 0x707f
-#define INSN_MATCH_LD 0x3003
-#define INSN_MASK_LD 0x707f
-#define INSN_MATCH_LBU 0x4003
-#define INSN_MASK_LBU 0x707f
-#define INSN_MATCH_LHU 0x5003
-#define INSN_MASK_LHU 0x707f
-#define INSN_MATCH_LWU 0x6003
-#define INSN_MASK_LWU 0x707f
-#define INSN_MATCH_SB 0x23
-#define INSN_MASK_SB 0x707f
-#define INSN_MATCH_SH 0x1023
-#define INSN_MASK_SH 0x707f
-#define INSN_MATCH_SW 0x2023
-#define INSN_MASK_SW 0x707f
-#define INSN_MATCH_SD 0x3023
-#define INSN_MASK_SD 0x707f
-
-#define INSN_MATCH_C_LD 0x6000
-#define INSN_MASK_C_LD 0xe003
-#define INSN_MATCH_C_SD 0xe000
-#define INSN_MASK_C_SD 0xe003
-#define INSN_MATCH_C_LW 0x4000
-#define INSN_MASK_C_LW 0xe003
-#define INSN_MATCH_C_SW 0xc000
-#define INSN_MASK_C_SW 0xe003
-#define INSN_MATCH_C_LDSP 0x6002
-#define INSN_MASK_C_LDSP 0xe003
-#define INSN_MATCH_C_SDSP 0xe002
-#define INSN_MASK_C_SDSP 0xe003
-#define INSN_MATCH_C_LWSP 0x4002
-#define INSN_MASK_C_LWSP 0xe003
-#define INSN_MATCH_C_SWSP 0xc002
-#define INSN_MASK_C_SWSP 0xe003
-
-#define INSN_16BIT_MASK 0x3
-
-#define INSN_IS_16BIT(insn) (((insn) & INSN_16BIT_MASK) != INSN_16BIT_MASK)
-
-#define INSN_LEN(insn) (INSN_IS_16BIT(insn) ? 2 : 4)
-
-#ifdef CONFIG_64BIT
-#define LOG_REGBYTES 3
-#else
-#define LOG_REGBYTES 2
-#endif
-#define REGBYTES (1 << LOG_REGBYTES)
-
-#define SH_RD 7
-#define SH_RS1 15
-#define SH_RS2 20
-#define SH_RS2C 2
-#define MASK_RX 0x1f
-
-#define RV_X(x, s, n) (((x) >> (s)) & ((1 << (n)) - 1))
-#define RVC_LW_IMM(x) ((RV_X(x, 6, 1) << 2) | \
- (RV_X(x, 10, 3) << 3) | \
- (RV_X(x, 5, 1) << 6))
-#define RVC_LD_IMM(x) ((RV_X(x, 10, 3) << 3) | \
- (RV_X(x, 5, 2) << 6))
-#define RVC_LWSP_IMM(x) ((RV_X(x, 4, 3) << 2) | \
- (RV_X(x, 12, 1) << 5) | \
- (RV_X(x, 2, 2) << 6))
-#define RVC_LDSP_IMM(x) ((RV_X(x, 5, 2) << 3) | \
- (RV_X(x, 12, 1) << 5) | \
- (RV_X(x, 2, 3) << 6))
-#define RVC_SWSP_IMM(x) ((RV_X(x, 9, 4) << 2) | \
- (RV_X(x, 7, 2) << 6))
-#define RVC_SDSP_IMM(x) ((RV_X(x, 10, 3) << 3) | \
- (RV_X(x, 7, 3) << 6))
-#define RVC_RS1S(insn) (8 + RV_X(insn, SH_RD, 3))
-#define RVC_RS2S(insn) (8 + RV_X(insn, SH_RS2C, 3))
-#define RVC_RS2(insn) RV_X(insn, SH_RS2C, 5)
-
-#define SHIFT_RIGHT(x, y) \
- ((y) < 0 ? ((x) << -(y)) : ((x) >> (y)))
-
-#define REG_MASK \
- ((1 << (5 + LOG_REGBYTES)) - (1 << LOG_REGBYTES))
-
-#define REG_OFFSET(insn, pos) \
- (SHIFT_RIGHT((insn), (pos) - LOG_REGBYTES) & REG_MASK)
-
-#define REG_PTR(insn, pos, regs) \
- ((ulong *)((ulong)(regs) + REG_OFFSET(insn, pos)))
-
-#define GET_FUNCT3(insn) (((insn) >> 12) & 7)
-
-#define GET_RS1(insn, regs) (*REG_PTR(insn, SH_RS1, regs))
-#define GET_RS2(insn, regs) (*REG_PTR(insn, SH_RS2, regs))
-#define GET_RS1S(insn, regs) (*REG_PTR(RVC_RS1S(insn), 0, regs))
-#define GET_RS2S(insn, regs) (*REG_PTR(RVC_RS2S(insn), 0, regs))
-#define GET_RS2C(insn, regs) (*REG_PTR(insn, SH_RS2C, regs))
-#define GET_SP(regs) (*REG_PTR(2, 0, regs))
-#define SET_RD(insn, regs, val) (*REG_PTR(insn, SH_RD, regs) = (val))
-#define IMM_I(insn) ((s32)(insn) >> 20)
-#define IMM_S(insn) (((s32)(insn) >> 25 << 5) | \
- (s32)(((insn) >> 7) & 0x1f))
+#include <asm/insn.h>
struct insn_func {
unsigned long mask;
diff --git a/arch/riscv/kvm/vcpu_vector.c b/arch/riscv/kvm/vcpu_vector.c
index a5f88cb717f3..05f3cc2d8e31 100644
--- a/arch/riscv/kvm/vcpu_vector.c
+++ b/arch/riscv/kvm/vcpu_vector.c
@@ -182,6 +182,8 @@ int kvm_riscv_vcpu_set_reg_vector(struct kvm_vcpu *vcpu,
struct kvm_cpu_context *cntx = &vcpu->arch.guest_context;
unsigned long reg_val;
+ if (reg_size != sizeof(reg_val))
+ return -EINVAL;
if (copy_from_user(&reg_val, uaddr, reg_size))
return -EFAULT;
if (reg_val != cntx->vector.vlenb)
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 15683ae13fa5..6091f3f06fa3 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -23,6 +23,7 @@
#include <linux/kfence.h>
#include <linux/execmem.h>
+#include <asm/alternative.h>
#include <asm/fixmap.h>
#include <asm/io.h>
#include <asm/kasan.h>
@@ -816,6 +817,7 @@ static __meminit pgprot_t pgprot_from_va(uintptr_t va)
#if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL)
u64 __pi_set_satp_mode_from_cmdline(uintptr_t dtb_pa);
+u64 __pi_set_satp_mode_from_fdt(uintptr_t dtb_pa);
static void __init disable_pgtable_l5(void)
{
@@ -855,18 +857,22 @@ static void __init set_mmap_rnd_bits_max(void)
* underlying hardware: establish 1:1 mapping in 4-level page table mode
* then read SATP to see if the configuration was taken into account
* meaning sv48 is supported.
+ * The maximum SATP mode is limited by both the command line and the "mmu-type"
+ * property in the device tree, since some platforms may hang if an unsupported
+ * SATP mode is attempted.
*/
static __init void set_satp_mode(uintptr_t dtb_pa)
{
u64 identity_satp, hw_satp;
uintptr_t set_satp_mode_pmd = ((unsigned long)set_satp_mode) & PMD_MASK;
- u64 satp_mode_cmdline = __pi_set_satp_mode_from_cmdline(dtb_pa);
+ u64 satp_mode_limit = min_not_zero(__pi_set_satp_mode_from_cmdline(dtb_pa),
+ __pi_set_satp_mode_from_fdt(dtb_pa));
kernel_map.page_offset = PAGE_OFFSET_L5;
- if (satp_mode_cmdline == SATP_MODE_57) {
+ if (satp_mode_limit == SATP_MODE_48) {
disable_pgtable_l5();
- } else if (satp_mode_cmdline == SATP_MODE_48) {
+ } else if (satp_mode_limit == SATP_MODE_39) {
disable_pgtable_l5();
disable_pgtable_l4();
return;
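The new limit is the stricter of the command-line and device-tree caps; min_not_zero() from <linux/minmax.h> treats a zero as "no limit expressed". A simplified sketch of the same selection, assuming only that the SATP_MODE_* encodings grow with the number of page-table levels (Sv39 < Sv48 < Sv57):

/* Sketch of the clamping semantics, not the kernel helper itself. */
static unsigned long satp_mode_limit_sketch(unsigned long cmdline, unsigned long fdt)
{
	if (!cmdline)			/* no command-line restriction */
		return fdt;
	if (!fdt)			/* no "mmu-type" restriction */
		return cmdline;
	return cmdline < fdt ? cmdline : fdt;	/* stricter (shallower) mode wins */
}

With no command-line restriction and an "mmu-type" of "riscv,sv48", the limit becomes SATP_MODE_48 and only the fifth level is disabled, matching the first branch in the hunk above.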
diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
index 10e01ff06312..206b2b8552a7 100644
--- a/arch/riscv/net/bpf_jit_comp64.c
+++ b/arch/riscv/net/bpf_jit_comp64.c
@@ -18,7 +18,7 @@
#define RV_MAX_REG_ARGS 8
#define RV_FENTRY_NINSNS 2
#define RV_FENTRY_NBYTES (RV_FENTRY_NINSNS * 4)
-#define RV_KCFI_NINSNS (IS_ENABLED(CONFIG_CFI_CLANG) ? 1 : 0)
+#define RV_KCFI_NINSNS (IS_ENABLED(CONFIG_CFI) ? 1 : 0)
/* imm that allows emit_imm to emit max count insns */
#define RV_MAX_COUNT_IMM 0x7FFF7FF7FF7FF7FF
@@ -469,7 +469,7 @@ static int emit_call(u64 addr, bool fixed_addr, struct rv_jit_context *ctx)
static inline void emit_kcfi(u32 hash, struct rv_jit_context *ctx)
{
- if (IS_ENABLED(CONFIG_CFI_CLANG))
+ if (IS_ENABLED(CONFIG_CFI))
emit(hash, ctx);
}
@@ -1356,7 +1356,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
emit_mv(rd, rs, ctx);
#ifdef CONFIG_SMP
/* Load current CPU number in T1 */
- emit_ld(RV_REG_T1, offsetof(struct thread_info, cpu),
+ emit_lw(RV_REG_T1, offsetof(struct thread_info, cpu),
RV_REG_TP, ctx);
/* Load address of __per_cpu_offset array in T2 */
emit_addr(RV_REG_T2, (u64)&__per_cpu_offset, extra_pass, ctx);
@@ -1763,7 +1763,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
*/
if (insn->src_reg == 0 && insn->imm == BPF_FUNC_get_smp_processor_id) {
/* Load current CPU number in R0 */
- emit_ld(bpf_to_rv_reg(BPF_REG_0, ctx), offsetof(struct thread_info, cpu),
+ emit_lw(bpf_to_rv_reg(BPF_REG_0, ctx), offsetof(struct thread_info, cpu),
RV_REG_TP, ctx);
break;
}
diff --git a/arch/riscv/purgatory/Makefile b/arch/riscv/purgatory/Makefile
index 240592e3f5c2..530e497ca2f9 100644
--- a/arch/riscv/purgatory/Makefile
+++ b/arch/riscv/purgatory/Makefile
@@ -71,7 +71,7 @@ ifdef CONFIG_STACKPROTECTOR_STRONG
PURGATORY_CFLAGS_REMOVE += -fstack-protector-strong
endif
-ifdef CONFIG_CFI_CLANG
+ifdef CONFIG_CFI
PURGATORY_CFLAGS_REMOVE += $(CC_FLAGS_CFI)
endif
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index bf680c26a33c..d4c4dd4c4653 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -49,6 +49,13 @@ config KASAN_SHADOW_OFFSET
depends on KASAN
default 0x1C000000000000
+config CC_HAS_BUILTIN_FFS
+ def_bool !(CC_IS_GCC && GCC_VERSION < 160000)
+ help
+ GCC versions before 16.0.0 generate library calls to ffs()
+ for __builtin_ffs() even when __has_builtin(__builtin_ffs)
+ is true.
+
config CC_ASM_FLAG_OUTPUT_BROKEN
def_bool CC_IS_GCC && GCC_VERSION < 140200
help
@@ -199,6 +206,7 @@ config S390
select HAVE_DYNAMIC_FTRACE_WITH_REGS
select HAVE_EBPF_JIT if HAVE_MARCH_Z196_FEATURES
select HAVE_EFFICIENT_UNALIGNED_ACCESS
+ select HAVE_GENERIC_TIF_BITS
select HAVE_GUP_FAST
select HAVE_FENTRY
select HAVE_FTRACE_GRAPH_FUNC
@@ -547,15 +555,11 @@ config NODES_SHIFT
depends on NUMA
default "1"
-config SCHED_SMT
- def_bool n
-
-config SCHED_MC
- def_bool n
-
config SCHED_TOPOLOGY
def_bool y
prompt "Topology scheduler support"
+ select ARCH_SUPPORTS_SCHED_SMT
+ select ARCH_SUPPORTS_SCHED_MC
select SCHED_SMT
select SCHED_MC
help
diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h
index c0152db285f0..37d5b097ede5 100644
--- a/arch/s390/boot/boot.h
+++ b/arch/s390/boot/boot.h
@@ -10,6 +10,7 @@
#include <linux/printk.h>
#include <asm/physmem_info.h>
+#include <asm/stacktrace.h>
struct vmlinux_info {
unsigned long entry;
@@ -89,6 +90,13 @@ void __noreturn jump_to_kernel(psw_t *psw);
#define boot_info(fmt, ...) boot_printk(KERN_INFO boot_fmt(fmt), ##__VA_ARGS__)
#define boot_debug(fmt, ...) boot_printk(KERN_DEBUG boot_fmt(fmt), ##__VA_ARGS__)
+#define boot_panic(...) do { \
+ boot_emerg(__VA_ARGS__); \
+ print_stacktrace(current_frame_address()); \
+ boot_emerg(" -- System halted\n"); \
+ disabled_wait(); \
+} while (0)
+
extern struct machine_info machine;
extern int boot_console_loglevel;
extern bool boot_ignore_loglevel;
diff --git a/arch/s390/boot/decompressor.c b/arch/s390/boot/decompressor.c
index 03500b9d9fb9..8d1bc25a6bf4 100644
--- a/arch/s390/boot/decompressor.c
+++ b/arch/s390/boot/decompressor.c
@@ -68,9 +68,7 @@ static void decompress_error(char *m)
{
if (bootdebug)
boot_rb_dump();
- boot_emerg("Decompression error: %s\n", m);
- boot_emerg(" -- System halted\n");
- disabled_wait();
+ boot_panic("Decompression error: %s\n", m);
}
unsigned long mem_safe_offset(void)
diff --git a/arch/s390/boot/physmem_info.c b/arch/s390/boot/physmem_info.c
index 45e3d057cfaa..1f2ca5435838 100644
--- a/arch/s390/boot/physmem_info.c
+++ b/arch/s390/boot/physmem_info.c
@@ -228,9 +228,7 @@ static void die_oom(unsigned long size, unsigned long align, unsigned long min,
boot_emerg("Usable online memory total: %lu Reserved: %lu Free: %lu\n",
total_mem, total_reserved_mem,
total_mem > total_reserved_mem ? total_mem - total_reserved_mem : 0);
- print_stacktrace(current_frame_address());
- boot_emerg(" -- System halted\n");
- disabled_wait();
+ boot_panic("Oom\n");
}
static void _physmem_reserve(enum reserved_range_type type, unsigned long addr, unsigned long size)
diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c
index 93684a775716..3fbd25b9498f 100644
--- a/arch/s390/boot/startup.c
+++ b/arch/s390/boot/startup.c
@@ -44,13 +44,6 @@ u64 __bootdata_preserved(clock_comparator_max) = -1UL;
u64 __bootdata_preserved(stfle_fac_list[16]);
struct oldmem_data __bootdata_preserved(oldmem_data);
-void error(char *x)
-{
- boot_emerg("%s\n", x);
- boot_emerg(" -- System halted\n");
- disabled_wait();
-}
-
static char sysinfo_page[PAGE_SIZE] __aligned(PAGE_SIZE);
static void detect_machine_type(void)
@@ -220,10 +213,10 @@ static void rescue_initrd(unsigned long min, unsigned long max)
static void copy_bootdata(void)
{
if (__boot_data_end - __boot_data_start != vmlinux.bootdata_size)
- error(".boot.data section size mismatch");
+ boot_panic(".boot.data section size mismatch\n");
memcpy((void *)vmlinux.bootdata_off, __boot_data_start, vmlinux.bootdata_size);
if (__boot_data_preserved_end - __boot_data_preserved_start != vmlinux.bootdata_preserved_size)
- error(".boot.preserved.data section size mismatch");
+ boot_panic(".boot.preserved.data section size mismatch\n");
memcpy((void *)vmlinux.bootdata_preserved_off, __boot_data_preserved_start, vmlinux.bootdata_preserved_size);
}
@@ -237,7 +230,7 @@ static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr,
for (reloc = (int *)__vmlinux_relocs_64_start; reloc < (int *)__vmlinux_relocs_64_end; reloc++) {
loc = (long)*reloc + phys_offset;
if (loc < min_addr || loc > max_addr)
- error("64-bit relocation outside of kernel!\n");
+ boot_panic("64-bit relocation outside of kernel!\n");
*(u64 *)loc += offset;
}
}
diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c
index 1d073acd05a7..cea3de4dce8c 100644
--- a/arch/s390/boot/vmem.c
+++ b/arch/s390/boot/vmem.c
@@ -530,6 +530,9 @@ void setup_vmem(unsigned long kernel_start, unsigned long kernel_end, unsigned l
lowcore_address + sizeof(struct lowcore),
POPULATE_LOWCORE);
for_each_physmem_usable_range(i, &start, &end) {
+ /* Do not map lowcore with identity mapping */
+ if (!start)
+ start = sizeof(struct lowcore);
pgtable_populate((unsigned long)__identity_va(start),
(unsigned long)__identity_va(end),
POPULATE_IDENTITY);
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index 6b33429f1c4d..99467f2dc018 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -5,6 +5,7 @@ CONFIG_WATCH_QUEUE=y
CONFIG_AUDIT=y
CONFIG_NO_HZ_IDLE=y
CONFIG_HIGH_RES_TIMERS=y
+CONFIG_POSIX_AUX_CLOCKS=y
CONFIG_BPF_SYSCALL=y
CONFIG_BPF_JIT=y
CONFIG_BPF_JIT_ALWAYS_ON=y
@@ -19,6 +20,7 @@ CONFIG_TASK_XACCT=y
CONFIG_TASK_IO_ACCOUNTING=y
CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
+CONFIG_SCHED_PROXY_EXEC=y
CONFIG_NUMA_BALANCING=y
CONFIG_MEMCG=y
CONFIG_BLK_CGROUP=y
@@ -42,6 +44,7 @@ CONFIG_PROFILING=y
CONFIG_KEXEC=y
CONFIG_KEXEC_FILE=y
CONFIG_KEXEC_SIG=y
+CONFIG_CRASH_DM_CRYPT=y
CONFIG_LIVEPATCH=y
CONFIG_MARCH_Z13=y
CONFIG_NR_CPUS=512
@@ -105,6 +108,7 @@ CONFIG_CMA_AREAS=7
CONFIG_MEM_SOFT_DIRTY=y
CONFIG_DEFERRED_STRUCT_PAGE_INIT=y
CONFIG_IDLE_PAGE_TRACKING=y
+CONFIG_ZONE_DEVICE=y
CONFIG_PERCPU_STATS=y
CONFIG_GUP_TEST=y
CONFIG_ANON_VMA_NAME=y
@@ -114,8 +118,13 @@ CONFIG_PACKET=y
CONFIG_PACKET_DIAG=m
CONFIG_UNIX=y
CONFIG_UNIX_DIAG=m
+CONFIG_TLS=m
+CONFIG_TLS_DEVICE=y
+CONFIG_TLS_TOE=y
CONFIG_XFRM_USER=m
CONFIG_NET_KEY=m
+CONFIG_XDP_SOCKETS=y
+CONFIG_XDP_SOCKETS_DIAG=m
CONFIG_SMC_DIAG=m
CONFIG_SMC_LO=y
CONFIG_INET=y
@@ -223,17 +232,19 @@ CONFIG_NETFILTER_XT_TARGET_CONNMARK=m
CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m
CONFIG_NETFILTER_XT_TARGET_CT=m
CONFIG_NETFILTER_XT_TARGET_DSCP=m
+CONFIG_NETFILTER_XT_TARGET_HL=m
CONFIG_NETFILTER_XT_TARGET_HMARK=m
CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m
CONFIG_NETFILTER_XT_TARGET_LOG=m
CONFIG_NETFILTER_XT_TARGET_MARK=m
+CONFIG_NETFILTER_XT_NAT=m
CONFIG_NETFILTER_XT_TARGET_NETMAP=m
CONFIG_NETFILTER_XT_TARGET_NFLOG=m
CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
CONFIG_NETFILTER_XT_TARGET_REDIRECT=m
+CONFIG_NETFILTER_XT_TARGET_MASQUERADE=m
CONFIG_NETFILTER_XT_TARGET_TEE=m
CONFIG_NETFILTER_XT_TARGET_TPROXY=m
-CONFIG_NETFILTER_XT_TARGET_TRACE=m
CONFIG_NETFILTER_XT_TARGET_SECMARK=m
CONFIG_NETFILTER_XT_TARGET_TCPMSS=m
CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m
@@ -248,6 +259,7 @@ CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
CONFIG_NETFILTER_XT_MATCH_CPU=m
+CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m
CONFIG_NETFILTER_XT_MATCH_DSCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
@@ -318,16 +330,8 @@ CONFIG_IP_NF_MATCH_AH=m
CONFIG_IP_NF_MATCH_ECN=m
CONFIG_IP_NF_MATCH_RPFILTER=m
CONFIG_IP_NF_MATCH_TTL=m
-CONFIG_IP_NF_FILTER=m
CONFIG_IP_NF_TARGET_REJECT=m
-CONFIG_IP_NF_NAT=m
-CONFIG_IP_NF_TARGET_MASQUERADE=m
-CONFIG_IP_NF_MANGLE=m
CONFIG_IP_NF_TARGET_ECN=m
-CONFIG_IP_NF_TARGET_TTL=m
-CONFIG_IP_NF_RAW=m
-CONFIG_IP_NF_SECURITY=m
-CONFIG_IP_NF_ARPFILTER=m
CONFIG_IP_NF_ARP_MANGLE=m
CONFIG_NFT_FIB_IPV6=m
CONFIG_IP6_NF_IPTABLES=m
@@ -340,15 +344,9 @@ CONFIG_IP6_NF_MATCH_IPV6HEADER=m
CONFIG_IP6_NF_MATCH_MH=m
CONFIG_IP6_NF_MATCH_RPFILTER=m
CONFIG_IP6_NF_MATCH_RT=m
-CONFIG_IP6_NF_TARGET_HL=m
-CONFIG_IP6_NF_FILTER=m
CONFIG_IP6_NF_TARGET_REJECT=m
-CONFIG_IP6_NF_MANGLE=m
-CONFIG_IP6_NF_RAW=m
-CONFIG_IP6_NF_SECURITY=m
-CONFIG_IP6_NF_NAT=m
-CONFIG_IP6_NF_TARGET_MASQUERADE=m
CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_IP_SCTP=m
CONFIG_RDS=m
CONFIG_RDS_RDMA=m
CONFIG_RDS_TCP=m
@@ -383,6 +381,7 @@ CONFIG_NET_SCH_FQ_CODEL=m
CONFIG_NET_SCH_INGRESS=m
CONFIG_NET_SCH_PLUG=m
CONFIG_NET_SCH_ETS=m
+CONFIG_NET_SCH_DUALPI2=m
CONFIG_NET_CLS_BASIC=m
CONFIG_NET_CLS_ROUTE4=m
CONFIG_NET_CLS_FW=m
@@ -504,6 +503,7 @@ CONFIG_DM_VDO=m
CONFIG_NETDEVICES=y
CONFIG_BONDING=m
CONFIG_DUMMY=m
+CONFIG_OVPN=m
CONFIG_EQUALIZER=m
CONFIG_IFB=m
CONFIG_MACVLAN=m
@@ -547,6 +547,7 @@ CONFIG_NLMON=m
CONFIG_MLX4_EN=m
CONFIG_MLX5_CORE=m
CONFIG_MLX5_CORE_EN=y
+CONFIG_MLX5_SF=y
# CONFIG_NET_VENDOR_META is not set
# CONFIG_NET_VENDOR_MICREL is not set
# CONFIG_NET_VENDOR_MICROCHIP is not set
@@ -641,6 +642,7 @@ CONFIG_VP_VDPA=m
CONFIG_VHOST_NET=m
CONFIG_VHOST_VSOCK=m
CONFIG_VHOST_VDPA=m
+CONFIG_DEV_DAX=m
CONFIG_EXT4_FS=y
CONFIG_EXT4_FS_POSIX_ACL=y
CONFIG_EXT4_FS_SECURITY=y
@@ -662,9 +664,7 @@ CONFIG_BTRFS_FS_POSIX_ACL=y
CONFIG_BTRFS_DEBUG=y
CONFIG_BTRFS_ASSERT=y
CONFIG_NILFS2_FS=m
-CONFIG_BCACHEFS_FS=y
-CONFIG_BCACHEFS_QUOTA=y
-CONFIG_BCACHEFS_POSIX_ACL=y
+CONFIG_FS_DAX=y
CONFIG_EXPORTFS_BLOCK_OPS=y
CONFIG_FS_ENCRYPTION=y
CONFIG_FS_VERITY=y
@@ -755,6 +755,8 @@ CONFIG_HARDENED_USERCOPY=y
CONFIG_BUG_ON_DATA_CORRUPTION=y
CONFIG_CRYPTO_USER=m
CONFIG_CRYPTO_SELFTESTS=y
+CONFIG_CRYPTO_SELFTESTS_FULL=y
+CONFIG_CRYPTO_NULL=y
CONFIG_CRYPTO_PCRYPT=m
CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_BENCHMARK=m
@@ -762,7 +764,6 @@ CONFIG_CRYPTO_DH=m
CONFIG_CRYPTO_ECDH=m
CONFIG_CRYPTO_ECDSA=m
CONFIG_CRYPTO_ECRDSA=m
-CONFIG_CRYPTO_CURVE25519=m
CONFIG_CRYPTO_AES_TI=m
CONFIG_CRYPTO_ANUBIS=m
CONFIG_CRYPTO_ARIA=m
@@ -783,7 +784,6 @@ CONFIG_CRYPTO_HCTR2=m
CONFIG_CRYPTO_LRW=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_AEGIS128=m
-CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_GCM=y
CONFIG_CRYPTO_SEQIV=y
CONFIG_CRYPTO_MD4=m
@@ -822,6 +822,7 @@ CONFIG_SYSTEM_BLACKLIST_KEYRING=y
CONFIG_CRYPTO_KRB5=m
CONFIG_CRYPTO_KRB5_SELFTESTS=y
CONFIG_CORDIC=m
+CONFIG_TRACE_MMIO_ACCESS=y
CONFIG_RANDOM32_SELFTEST=y
CONFIG_XZ_DEC_MICROLZMA=y
CONFIG_DMA_CMA=y
diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig
index b75eb2775850..a8573807e0c0 100644
--- a/arch/s390/configs/defconfig
+++ b/arch/s390/configs/defconfig
@@ -4,6 +4,7 @@ CONFIG_WATCH_QUEUE=y
CONFIG_AUDIT=y
CONFIG_NO_HZ_IDLE=y
CONFIG_HIGH_RES_TIMERS=y
+CONFIG_POSIX_AUX_CLOCKS=y
CONFIG_BPF_SYSCALL=y
CONFIG_BPF_JIT=y
CONFIG_BPF_JIT_ALWAYS_ON=y
@@ -17,6 +18,7 @@ CONFIG_TASK_XACCT=y
CONFIG_TASK_IO_ACCOUNTING=y
CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
+CONFIG_SCHED_PROXY_EXEC=y
CONFIG_NUMA_BALANCING=y
CONFIG_MEMCG=y
CONFIG_BLK_CGROUP=y
@@ -40,11 +42,12 @@ CONFIG_PROFILING=y
CONFIG_KEXEC=y
CONFIG_KEXEC_FILE=y
CONFIG_KEXEC_SIG=y
+CONFIG_CRASH_DM_CRYPT=y
CONFIG_LIVEPATCH=y
CONFIG_MARCH_Z13=y
CONFIG_NR_CPUS=512
CONFIG_NUMA=y
-CONFIG_HZ_100=y
+CONFIG_HZ_1000=y
CONFIG_CERT_STORE=y
CONFIG_EXPOLINE=y
CONFIG_EXPOLINE_AUTO=y
@@ -97,6 +100,7 @@ CONFIG_CMA_AREAS=7
CONFIG_MEM_SOFT_DIRTY=y
CONFIG_DEFERRED_STRUCT_PAGE_INIT=y
CONFIG_IDLE_PAGE_TRACKING=y
+CONFIG_ZONE_DEVICE=y
CONFIG_PERCPU_STATS=y
CONFIG_ANON_VMA_NAME=y
CONFIG_USERFAULTFD=y
@@ -105,8 +109,13 @@ CONFIG_PACKET=y
CONFIG_PACKET_DIAG=m
CONFIG_UNIX=y
CONFIG_UNIX_DIAG=m
+CONFIG_TLS=m
+CONFIG_TLS_DEVICE=y
+CONFIG_TLS_TOE=y
CONFIG_XFRM_USER=m
CONFIG_NET_KEY=m
+CONFIG_XDP_SOCKETS=y
+CONFIG_XDP_SOCKETS_DIAG=m
CONFIG_SMC_DIAG=m
CONFIG_SMC_LO=y
CONFIG_INET=y
@@ -214,17 +223,19 @@ CONFIG_NETFILTER_XT_TARGET_CONNMARK=m
CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m
CONFIG_NETFILTER_XT_TARGET_CT=m
CONFIG_NETFILTER_XT_TARGET_DSCP=m
+CONFIG_NETFILTER_XT_TARGET_HL=m
CONFIG_NETFILTER_XT_TARGET_HMARK=m
CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m
CONFIG_NETFILTER_XT_TARGET_LOG=m
CONFIG_NETFILTER_XT_TARGET_MARK=m
+CONFIG_NETFILTER_XT_NAT=m
CONFIG_NETFILTER_XT_TARGET_NETMAP=m
CONFIG_NETFILTER_XT_TARGET_NFLOG=m
CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
CONFIG_NETFILTER_XT_TARGET_REDIRECT=m
+CONFIG_NETFILTER_XT_TARGET_MASQUERADE=m
CONFIG_NETFILTER_XT_TARGET_TEE=m
CONFIG_NETFILTER_XT_TARGET_TPROXY=m
-CONFIG_NETFILTER_XT_TARGET_TRACE=m
CONFIG_NETFILTER_XT_TARGET_SECMARK=m
CONFIG_NETFILTER_XT_TARGET_TCPMSS=m
CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m
@@ -239,6 +250,7 @@ CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
CONFIG_NETFILTER_XT_MATCH_CPU=m
+CONFIG_NETFILTER_XT_MATCH_DCCP=m
CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m
CONFIG_NETFILTER_XT_MATCH_DSCP=m
CONFIG_NETFILTER_XT_MATCH_ESP=m
@@ -309,16 +321,8 @@ CONFIG_IP_NF_MATCH_AH=m
CONFIG_IP_NF_MATCH_ECN=m
CONFIG_IP_NF_MATCH_RPFILTER=m
CONFIG_IP_NF_MATCH_TTL=m
-CONFIG_IP_NF_FILTER=m
CONFIG_IP_NF_TARGET_REJECT=m
-CONFIG_IP_NF_NAT=m
-CONFIG_IP_NF_TARGET_MASQUERADE=m
-CONFIG_IP_NF_MANGLE=m
CONFIG_IP_NF_TARGET_ECN=m
-CONFIG_IP_NF_TARGET_TTL=m
-CONFIG_IP_NF_RAW=m
-CONFIG_IP_NF_SECURITY=m
-CONFIG_IP_NF_ARPFILTER=m
CONFIG_IP_NF_ARP_MANGLE=m
CONFIG_NFT_FIB_IPV6=m
CONFIG_IP6_NF_IPTABLES=m
@@ -331,15 +335,9 @@ CONFIG_IP6_NF_MATCH_IPV6HEADER=m
CONFIG_IP6_NF_MATCH_MH=m
CONFIG_IP6_NF_MATCH_RPFILTER=m
CONFIG_IP6_NF_MATCH_RT=m
-CONFIG_IP6_NF_TARGET_HL=m
-CONFIG_IP6_NF_FILTER=m
CONFIG_IP6_NF_TARGET_REJECT=m
-CONFIG_IP6_NF_MANGLE=m
-CONFIG_IP6_NF_RAW=m
-CONFIG_IP6_NF_SECURITY=m
-CONFIG_IP6_NF_NAT=m
-CONFIG_IP6_NF_TARGET_MASQUERADE=m
CONFIG_NF_TABLES_BRIDGE=m
+CONFIG_IP_SCTP=m
CONFIG_RDS=m
CONFIG_RDS_RDMA=m
CONFIG_RDS_TCP=m
@@ -373,6 +371,7 @@ CONFIG_NET_SCH_FQ_CODEL=m
CONFIG_NET_SCH_INGRESS=m
CONFIG_NET_SCH_PLUG=m
CONFIG_NET_SCH_ETS=m
+CONFIG_NET_SCH_DUALPI2=m
CONFIG_NET_CLS_BASIC=m
CONFIG_NET_CLS_ROUTE4=m
CONFIG_NET_CLS_FW=m
@@ -494,6 +493,7 @@ CONFIG_DM_VDO=m
CONFIG_NETDEVICES=y
CONFIG_BONDING=m
CONFIG_DUMMY=m
+CONFIG_OVPN=m
CONFIG_EQUALIZER=m
CONFIG_IFB=m
CONFIG_MACVLAN=m
@@ -537,6 +537,7 @@ CONFIG_NLMON=m
CONFIG_MLX4_EN=m
CONFIG_MLX5_CORE=m
CONFIG_MLX5_CORE_EN=y
+CONFIG_MLX5_SF=y
# CONFIG_NET_VENDOR_META is not set
# CONFIG_NET_VENDOR_MICREL is not set
# CONFIG_NET_VENDOR_MICROCHIP is not set
@@ -631,6 +632,7 @@ CONFIG_VP_VDPA=m
CONFIG_VHOST_NET=m
CONFIG_VHOST_VSOCK=m
CONFIG_VHOST_VDPA=m
+CONFIG_DEV_DAX=m
CONFIG_EXT4_FS=y
CONFIG_EXT4_FS_POSIX_ACL=y
CONFIG_EXT4_FS_SECURITY=y
@@ -649,9 +651,7 @@ CONFIG_OCFS2_FS=m
CONFIG_BTRFS_FS=y
CONFIG_BTRFS_FS_POSIX_ACL=y
CONFIG_NILFS2_FS=m
-CONFIG_BCACHEFS_FS=m
-CONFIG_BCACHEFS_QUOTA=y
-CONFIG_BCACHEFS_POSIX_ACL=y
+CONFIG_FS_DAX=y
CONFIG_EXPORTFS_BLOCK_OPS=y
CONFIG_FS_ENCRYPTION=y
CONFIG_FS_VERITY=y
@@ -683,7 +683,6 @@ CONFIG_TMPFS_POSIX_ACL=y
CONFIG_TMPFS_INODE64=y
CONFIG_TMPFS_QUOTA=y
CONFIG_HUGETLBFS=y
-CONFIG_CONFIGFS_FS=m
CONFIG_ECRYPT_FS=m
CONFIG_CRAMFS=m
CONFIG_SQUASHFS=m
@@ -741,6 +740,7 @@ CONFIG_BUG_ON_DATA_CORRUPTION=y
CONFIG_CRYPTO_FIPS=y
CONFIG_CRYPTO_USER=m
CONFIG_CRYPTO_SELFTESTS=y
+CONFIG_CRYPTO_NULL=y
CONFIG_CRYPTO_PCRYPT=m
CONFIG_CRYPTO_CRYPTD=m
CONFIG_CRYPTO_BENCHMARK=m
@@ -748,7 +748,6 @@ CONFIG_CRYPTO_DH=m
CONFIG_CRYPTO_ECDH=m
CONFIG_CRYPTO_ECDSA=m
CONFIG_CRYPTO_ECRDSA=m
-CONFIG_CRYPTO_CURVE25519=m
CONFIG_CRYPTO_AES_TI=m
CONFIG_CRYPTO_ANUBIS=m
CONFIG_CRYPTO_ARIA=m
@@ -769,7 +768,6 @@ CONFIG_CRYPTO_HCTR2=m
CONFIG_CRYPTO_LRW=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_AEGIS128=m
-CONFIG_CRYPTO_CHACHA20POLY1305=m
CONFIG_CRYPTO_GCM=y
CONFIG_CRYPTO_SEQIV=y
CONFIG_CRYPTO_MD4=m
diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig
index 8163c1702720..ed0b137353ad 100644
--- a/arch/s390/configs/zfcpdump_defconfig
+++ b/arch/s390/configs/zfcpdump_defconfig
@@ -1,5 +1,6 @@
CONFIG_NO_HZ_IDLE=y
CONFIG_HIGH_RES_TIMERS=y
+CONFIG_POSIX_AUX_CLOCKS=y
CONFIG_BPF_SYSCALL=y
# CONFIG_CPU_ISOLATION is not set
# CONFIG_UTS_NS is not set
@@ -11,7 +12,7 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y
CONFIG_KEXEC=y
CONFIG_MARCH_Z13=y
CONFIG_NR_CPUS=2
-CONFIG_HZ_100=y
+CONFIG_HZ_1000=y
# CONFIG_CHSC_SCH is not set
# CONFIG_SCM_BUS is not set
# CONFIG_AP is not set
diff --git a/arch/s390/hypfs/hypfs_dbfs.c b/arch/s390/hypfs/hypfs_dbfs.c
index 5d9effb0867c..41a0d2066fa0 100644
--- a/arch/s390/hypfs/hypfs_dbfs.c
+++ b/arch/s390/hypfs/hypfs_dbfs.c
@@ -6,6 +6,7 @@
* Author(s): Michael Holzheu <holzheu@linux.vnet.ibm.com>
*/
+#include <linux/security.h>
#include <linux/slab.h>
#include "hypfs.h"
@@ -66,23 +67,27 @@ static long dbfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
long rc;
mutex_lock(&df->lock);
- if (df->unlocked_ioctl)
- rc = df->unlocked_ioctl(file, cmd, arg);
- else
- rc = -ENOTTY;
+ rc = df->unlocked_ioctl(file, cmd, arg);
mutex_unlock(&df->lock);
return rc;
}
-static const struct file_operations dbfs_ops = {
+static const struct file_operations dbfs_ops_ioctl = {
.read = dbfs_read,
.unlocked_ioctl = dbfs_ioctl,
};
+static const struct file_operations dbfs_ops = {
+ .read = dbfs_read,
+};
+
void hypfs_dbfs_create_file(struct hypfs_dbfs_file *df)
{
- df->dentry = debugfs_create_file(df->name, 0400, dbfs_dir, df,
- &dbfs_ops);
+ const struct file_operations *fops = &dbfs_ops;
+
+ if (df->unlocked_ioctl && !security_locked_down(LOCKDOWN_DEBUGFS))
+ fops = &dbfs_ops_ioctl;
+ df->dentry = debugfs_create_file(df->name, 0400, dbfs_dir, df, fops);
mutex_init(&df->lock);
}
diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h
index a5ca0a947691..ec945fb60c02 100644
--- a/arch/s390/include/asm/bitops.h
+++ b/arch/s390/include/asm/bitops.h
@@ -122,6 +122,8 @@ static inline bool test_bit_inv(unsigned long nr,
return test_bit(nr ^ (BITS_PER_LONG - 1), ptr);
}
+#ifndef CONFIG_CC_HAS_BUILTIN_FFS
+
/**
* __flogr - find leftmost one
* @word - The word to search
@@ -130,11 +132,12 @@ static inline bool test_bit_inv(unsigned long nr,
* where the most significant bit has bit number 0.
* If no bit is set this function returns 64.
*/
-static inline unsigned char __flogr(unsigned long word)
+static __always_inline __attribute_const__ unsigned long __flogr(unsigned long word)
{
- if (__builtin_constant_p(word)) {
- unsigned long bit = 0;
+ unsigned long bit;
+ if (__builtin_constant_p(word)) {
+ bit = 0;
if (!word)
return 64;
if (!(word & 0xffffffff00000000UL)) {
@@ -163,86 +166,49 @@ static inline unsigned char __flogr(unsigned long word)
}
return bit;
} else {
- union register_pair rp;
+ union register_pair rp __uninitialized;
rp.even = word;
- asm volatile(
- " flogr %[rp],%[rp]\n"
- : [rp] "+d" (rp.pair) : : "cc");
- return rp.even;
+ asm("flogr %[rp],%[rp]"
+ : [rp] "+d" (rp.pair) : : "cc");
+ bit = rp.even;
+ /*
+ * The result of the flogr instruction is a value in the range
+ * of 0..64. Let the compiler know that the AND operation can
+ * be optimized away.
+ */
+ __assume(bit <= 64);
+ return bit & 127;
}
}
/**
- * __ffs - find first bit in word.
- * @word: The word to search
- *
- * Undefined if no bit exists, so code should check against 0 first.
- */
-static inline unsigned long __ffs(unsigned long word)
-{
- return __flogr(-word & word) ^ (BITS_PER_LONG - 1);
-}
-
-/**
* ffs - find first bit set
* @word: the word to search
*
* This is defined the same way as the libc and
* compiler builtin ffs routines (man ffs).
*/
-static inline int ffs(int word)
+static __always_inline __flatten __attribute_const__ int ffs(int word)
{
- unsigned long mask = 2 * BITS_PER_LONG - 1;
unsigned int val = (unsigned int)word;
- return (1 + (__flogr(-val & val) ^ (BITS_PER_LONG - 1))) & mask;
-}
-
-/**
- * __fls - find last (most-significant) set bit in a long word
- * @word: the word to search
- *
- * Undefined if no set bit exists, so code should check against 0 first.
- */
-static inline unsigned long __fls(unsigned long word)
-{
- return __flogr(word) ^ (BITS_PER_LONG - 1);
+ return BITS_PER_LONG - __flogr(-val & val);
}
-/**
- * fls64 - find last set bit in a 64-bit word
- * @word: the word to search
- *
- * This is defined in a similar way as the libc and compiler builtin
- * ffsll, but returns the position of the most significant set bit.
- *
- * fls64(value) returns 0 if value is 0 or the position of the last
- * set bit if value is nonzero. The last (most significant) bit is
- * at position 64.
- */
-static inline int fls64(unsigned long word)
-{
- unsigned long mask = 2 * BITS_PER_LONG - 1;
+#else /* CONFIG_CC_HAS_BUILTIN_FFS */
- return (1 + (__flogr(word) ^ (BITS_PER_LONG - 1))) & mask;
-}
+#include <asm-generic/bitops/builtin-ffs.h>
-/**
- * fls - find last (most-significant) bit set
- * @word: the word to search
- *
- * This is defined the same way as ffs.
- * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32.
- */
-static inline int fls(unsigned int word)
-{
- return fls64(word);
-}
+#endif /* CONFIG_CC_HAS_BUILTIN_FFS */
+#include <asm-generic/bitops/builtin-__ffs.h>
+#include <asm-generic/bitops/ffz.h>
+#include <asm-generic/bitops/builtin-__fls.h>
+#include <asm-generic/bitops/builtin-fls.h>
+#include <asm-generic/bitops/fls64.h>
#include <asm/arch_hweight.h>
#include <asm-generic/bitops/const_hweight.h>
-#include <asm-generic/bitops/ffz.h>
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/le.h>
#include <asm-generic/bitops/ext2-atomic-setbit.h>
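The reworked ffs() combines the classic isolate-lowest-set-bit trick with flogr, which numbers the most significant bit as 0 and yields 64 for a zero input. A portable sketch (illustrative only; __builtin_clzll stands in for the flogr instruction) shows why BITS_PER_LONG - __flogr(-val & val) gives the 1-based position of the lowest set bit:

/* Portable sketch only: __builtin_clzll stands in for s390 flogr. */
#include <assert.h>

static int sketch_flogr(unsigned long word)
{
	return word ? __builtin_clzll(word) : 64;
}

static int sketch_ffs(int word)
{
	unsigned int val = (unsigned int)word;

	/* -val & val keeps only the lowest set bit, e.g. 0b10100 -> 0b00100 */
	return 64 - sketch_flogr((unsigned long)(-val & val));
}

int main(void)
{
	assert(sketch_ffs(0) == 0);		/* no bit set */
	assert(sketch_ffs(1) == 1);		/* bit 0 -> position 1 */
	assert(sketch_ffs(0x80000000) == 32);	/* matches man ffs */
	return 0;
}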
diff --git a/arch/s390/include/asm/pci_insn.h b/arch/s390/include/asm/pci_insn.h
index e5f57cfe1d45..025c6dcbf893 100644
--- a/arch/s390/include/asm/pci_insn.h
+++ b/arch/s390/include/asm/pci_insn.h
@@ -16,11 +16,11 @@
#define ZPCI_PCI_ST_FUNC_NOT_AVAIL 40
#define ZPCI_PCI_ST_ALREADY_IN_RQ_STATE 44
-/* Load/Store return codes */
-#define ZPCI_PCI_LS_OK 0
-#define ZPCI_PCI_LS_ERR 1
-#define ZPCI_PCI_LS_BUSY 2
-#define ZPCI_PCI_LS_INVAL_HANDLE 3
+/* PCI instruction condition codes */
+#define ZPCI_CC_OK 0
+#define ZPCI_CC_ERR 1
+#define ZPCI_CC_BUSY 2
+#define ZPCI_CC_INVAL_HANDLE 3
/* Load/Store address space identifiers */
#define ZPCI_PCIAS_MEMIO_0 0
diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index 5345398df653..a16e65072371 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -19,12 +19,16 @@
#define CRST_ALLOC_ORDER 2
-unsigned long *crst_table_alloc(struct mm_struct *);
+unsigned long *crst_table_alloc_noprof(struct mm_struct *);
+#define crst_table_alloc(...) alloc_hooks(crst_table_alloc_noprof(__VA_ARGS__))
void crst_table_free(struct mm_struct *, unsigned long *);
-unsigned long *page_table_alloc(struct mm_struct *);
-struct ptdesc *page_table_alloc_pgste(struct mm_struct *mm);
+unsigned long *page_table_alloc_noprof(struct mm_struct *);
+#define page_table_alloc(...) alloc_hooks(page_table_alloc_noprof(__VA_ARGS__))
void page_table_free(struct mm_struct *, unsigned long *);
+
+struct ptdesc *page_table_alloc_pgste_noprof(struct mm_struct *mm);
+#define page_table_alloc_pgste(...) alloc_hooks(page_table_alloc_pgste_noprof(__VA_ARGS__))
void page_table_free_pgste(struct ptdesc *ptdesc);
static inline void crst_table_init(unsigned long *crst, unsigned long entry)
@@ -48,9 +52,9 @@ static inline unsigned long check_asce_limit(struct mm_struct *mm, unsigned long
return addr;
}
-static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long address)
+static inline p4d_t *p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long address)
{
- unsigned long *table = crst_table_alloc(mm);
+ unsigned long *table = crst_table_alloc_noprof(mm);
if (!table)
return NULL;
@@ -59,6 +63,7 @@ static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long address)
return (p4d_t *) table;
}
+#define p4d_alloc_one(...) alloc_hooks(p4d_alloc_one_noprof(__VA_ARGS__))
static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d)
{
@@ -69,9 +74,9 @@ static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d)
crst_table_free(mm, (unsigned long *) p4d);
}
-static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address)
+static inline pud_t *pud_alloc_one_noprof(struct mm_struct *mm, unsigned long address)
{
- unsigned long *table = crst_table_alloc(mm);
+ unsigned long *table = crst_table_alloc_noprof(mm);
if (!table)
return NULL;
@@ -80,6 +85,7 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address)
return (pud_t *) table;
}
+#define pud_alloc_one(...) alloc_hooks(pud_alloc_one_noprof(__VA_ARGS__))
static inline void pud_free(struct mm_struct *mm, pud_t *pud)
{
@@ -90,9 +96,9 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud)
crst_table_free(mm, (unsigned long *) pud);
}
-static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr)
+static inline pmd_t *pmd_alloc_one_noprof(struct mm_struct *mm, unsigned long vmaddr)
{
- unsigned long *table = crst_table_alloc(mm);
+ unsigned long *table = crst_table_alloc_noprof(mm);
if (!table)
return NULL;
@@ -103,6 +109,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr)
}
return (pmd_t *) table;
}
+#define pmd_alloc_one(...) alloc_hooks(pmd_alloc_one_noprof(__VA_ARGS__))
static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
{
@@ -127,9 +134,9 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
set_pud(pud, __pud(_REGION3_ENTRY | __pa(pmd)));
}
-static inline pgd_t *pgd_alloc(struct mm_struct *mm)
+static inline pgd_t *pgd_alloc_noprof(struct mm_struct *mm)
{
- unsigned long *table = crst_table_alloc(mm);
+ unsigned long *table = crst_table_alloc_noprof(mm);
if (!table)
return NULL;
@@ -137,6 +144,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
return (pgd_t *) table;
}
+#define pgd_alloc(...) alloc_hooks(pgd_alloc_noprof(__VA_ARGS__))
static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
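The *_noprof renames follow the memory-allocation-profiling convention: the real work moves into a _noprof function and the old name becomes an alloc_hooks() wrapper, so each call site gets its own allocation tag when profiling is enabled. A heavily simplified sketch of the wrapper shape (the real macro lives in include/linux/alloc_tag.h and does more than this):

/* Sketch only; not the real alloc_hooks() implementation. */
#define alloc_hooks_sketch(call)					\
({									\
	/* imagine: make a per-call-site allocation tag current */	\
	typeof(call) _res = call;					\
	/* imagine: restore the previous tag */				\
	_res;								\
})

/* the shape used throughout this hunk */
unsigned long *crst_table_alloc_noprof(struct mm_struct *mm);
#define crst_table_alloc(...) alloc_hooks_sketch(crst_table_alloc_noprof(__VA_ARGS__))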
diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h
index f6ed2c8192c8..7878e9bfbf07 100644
--- a/arch/s390/include/asm/thread_info.h
+++ b/arch/s390/include/asm/thread_info.h
@@ -56,49 +56,31 @@ void arch_setup_new_exec(void);
/*
* thread information flags bit numbers
+ *
+ * Tell the generic TIF infrastructure which special bits s390 supports
*/
-#define TIF_NOTIFY_RESUME 0 /* callback before returning to user */
-#define TIF_SIGPENDING 1 /* signal pending */
-#define TIF_NEED_RESCHED 2 /* rescheduling necessary */
-#define TIF_NEED_RESCHED_LAZY 3 /* lazy rescheduling needed */
-#define TIF_UPROBE 4 /* breakpointed or single-stepping */
-#define TIF_PATCH_PENDING 5 /* pending live patching update */
-#define TIF_ASCE_PRIMARY 6 /* primary asce is kernel asce */
-#define TIF_NOTIFY_SIGNAL 7 /* signal notifications exist */
-#define TIF_GUARDED_STORAGE 8 /* load guarded storage control block */
-#define TIF_ISOLATE_BP_GUEST 9 /* Run KVM guests with isolated BP */
-#define TIF_PER_TRAP 10 /* Need to handle PER trap on exit to usermode */
-#define TIF_31BIT 16 /* 32bit process */
-#define TIF_MEMDIE 17 /* is terminating due to OOM killer */
-#define TIF_RESTORE_SIGMASK 18 /* restore signal mask in do_signal() */
-#define TIF_SINGLE_STEP 19 /* This task is single stepped */
-#define TIF_BLOCK_STEP 20 /* This task is block stepped */
-#define TIF_UPROBE_SINGLESTEP 21 /* This task is uprobe single stepped */
-#define TIF_SYSCALL_TRACE 24 /* syscall trace active */
-#define TIF_SYSCALL_AUDIT 25 /* syscall auditing active */
-#define TIF_SECCOMP 26 /* secure computing */
-#define TIF_SYSCALL_TRACEPOINT 27 /* syscall tracepoint instrumentation */
+#define HAVE_TIF_NEED_RESCHED_LAZY
+#define HAVE_TIF_RESTORE_SIGMASK
+
+#include <asm-generic/thread_info_tif.h>
+
+/* Architecture specific bits */
+#define TIF_ASCE_PRIMARY 16 /* primary asce is kernel asce */
+#define TIF_GUARDED_STORAGE 17 /* load guarded storage control block */
+#define TIF_ISOLATE_BP_GUEST 18 /* Run KVM guests with isolated BP */
+#define TIF_PER_TRAP 19 /* Need to handle PER trap on exit to usermode */
+#define TIF_31BIT 20 /* 32bit process */
+#define TIF_SINGLE_STEP 21 /* This task is single stepped */
+#define TIF_BLOCK_STEP 22 /* This task is block stepped */
+#define TIF_UPROBE_SINGLESTEP 23 /* This task is uprobe single stepped */
-#define _TIF_NOTIFY_RESUME BIT(TIF_NOTIFY_RESUME)
-#define _TIF_SIGPENDING BIT(TIF_SIGPENDING)
-#define _TIF_NEED_RESCHED BIT(TIF_NEED_RESCHED)
-#define _TIF_NEED_RESCHED_LAZY BIT(TIF_NEED_RESCHED_LAZY)
-#define _TIF_UPROBE BIT(TIF_UPROBE)
-#define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING)
#define _TIF_ASCE_PRIMARY BIT(TIF_ASCE_PRIMARY)
-#define _TIF_NOTIFY_SIGNAL BIT(TIF_NOTIFY_SIGNAL)
#define _TIF_GUARDED_STORAGE BIT(TIF_GUARDED_STORAGE)
#define _TIF_ISOLATE_BP_GUEST BIT(TIF_ISOLATE_BP_GUEST)
#define _TIF_PER_TRAP BIT(TIF_PER_TRAP)
#define _TIF_31BIT BIT(TIF_31BIT)
-#define _TIF_MEMDIE BIT(TIF_MEMDIE)
-#define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK)
#define _TIF_SINGLE_STEP BIT(TIF_SINGLE_STEP)
#define _TIF_BLOCK_STEP BIT(TIF_BLOCK_STEP)
#define _TIF_UPROBE_SINGLESTEP BIT(TIF_UPROBE_SINGLESTEP)
-#define _TIF_SYSCALL_TRACE BIT(TIF_SYSCALL_TRACE)
-#define _TIF_SYSCALL_AUDIT BIT(TIF_SYSCALL_AUDIT)
-#define _TIF_SECCOMP BIT(TIF_SECCOMP)
-#define _TIF_SYSCALL_TRACEPOINT BIT(TIF_SYSCALL_TRACEPOINT)
#endif /* _ASM_THREAD_INFO_H */
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index 95ecad9c7d7d..a8915663e917 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -4,6 +4,7 @@
* This code generates raw asm output which is post-processed to extract
* and format the required data.
*/
+#define COMPILE_OFFSETS
#include <linux/kbuild.h>
#include <linux/sched.h>
diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c
index c62100dc62c8..6a26f202441d 100644
--- a/arch/s390/kernel/debug.c
+++ b/arch/s390/kernel/debug.c
@@ -1416,18 +1416,12 @@ static inline char *debug_get_user_string(const char __user *user_buf,
{
char *buffer;
- buffer = kmalloc(user_len + 1, GFP_KERNEL);
- if (!buffer)
- return ERR_PTR(-ENOMEM);
- if (copy_from_user(buffer, user_buf, user_len) != 0) {
- kfree(buffer);
- return ERR_PTR(-EFAULT);
- }
+ buffer = memdup_user_nul(user_buf, user_len);
+ if (IS_ERR(buffer))
+ return buffer;
/* got the string, now strip linefeed. */
if (buffer[user_len - 1] == '\n')
buffer[user_len - 1] = 0;
- else
- buffer[user_len] = 0;
return buffer;
}
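The change above replaces an open-coded kmalloc()/copy_from_user() pair with memdup_user_nul(), which allocates, copies and NUL-terminates in one call and returns an ERR_PTR() on failure. A minimal sketch of the resulting pattern (the helper name is illustrative):

```c
#include <linux/err.h>
#include <linux/string.h>
#include <linux/uaccess.h>

/*
 * memdup_user_nul() allocates len + 1 bytes, copies from user space and
 * NUL-terminates, so the caller no longer needs the manual
 * kmalloc()/copy_from_user()/kfree() dance.
 */
static char *copy_user_string(const char __user *ubuf, size_t len)
{
	char *buf = memdup_user_nul(ubuf, len);

	if (IS_ERR(buf))
		return buf;	/* -ENOMEM or -EFAULT, as before */

	/* Strip a trailing newline, mirroring debug_get_user_string(). */
	if (len && buf[len - 1] == '\n')
		buf[len - 1] = '\0';
	return buf;
}
```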
diff --git a/arch/s390/kernel/diag/diag324.c b/arch/s390/kernel/diag/diag324.c
index 7fa4c0b7eb6c..f0a8b4841fb9 100644
--- a/arch/s390/kernel/diag/diag324.c
+++ b/arch/s390/kernel/diag/diag324.c
@@ -116,7 +116,7 @@ static void pibwork_handler(struct work_struct *work)
mutex_lock(&pibmutex);
timedout = ktime_add_ns(data->expire, PIBWORK_DELAY);
if (ktime_before(ktime_get(), timedout)) {
- mod_delayed_work(system_wq, &pibwork, nsecs_to_jiffies(PIBWORK_DELAY));
+ mod_delayed_work(system_percpu_wq, &pibwork, nsecs_to_jiffies(PIBWORK_DELAY));
goto out;
}
vfree(data->pib);
@@ -174,7 +174,7 @@ long diag324_pibbuf(unsigned long arg)
pib_update(data);
data->sequence++;
data->expire = ktime_add_ns(ktime_get(), tod_to_ns(data->pib->intv));
- mod_delayed_work(system_wq, &pibwork, nsecs_to_jiffies(PIBWORK_DELAY));
+ mod_delayed_work(system_percpu_wq, &pibwork, nsecs_to_jiffies(PIBWORK_DELAY));
first = false;
}
rc = data->rc;
diff --git a/arch/s390/kernel/hiperdispatch.c b/arch/s390/kernel/hiperdispatch.c
index e7b66d046e8d..2507bc3f7757 100644
--- a/arch/s390/kernel/hiperdispatch.c
+++ b/arch/s390/kernel/hiperdispatch.c
@@ -191,7 +191,7 @@ int hd_enable_hiperdispatch(void)
return 0;
if (hd_online_cores <= hd_entitled_cores)
return 0;
- mod_delayed_work(system_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor);
+ mod_delayed_work(system_dfl_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor);
hd_update_capacities();
return 1;
}
diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c
index 4d364de43799..143e34a4eca5 100644
--- a/arch/s390/kernel/kexec_elf.c
+++ b/arch/s390/kernel/kexec_elf.c
@@ -16,7 +16,7 @@
static int kexec_file_add_kernel_elf(struct kimage *image,
struct s390_load_data *data)
{
- struct kexec_buf buf;
+ struct kexec_buf buf = {};
const Elf_Ehdr *ehdr;
const Elf_Phdr *phdr;
Elf_Addr entry;
diff --git a/arch/s390/kernel/kexec_image.c b/arch/s390/kernel/kexec_image.c
index a32ce8bea745..9a439175723c 100644
--- a/arch/s390/kernel/kexec_image.c
+++ b/arch/s390/kernel/kexec_image.c
@@ -16,7 +16,7 @@
static int kexec_file_add_kernel_image(struct kimage *image,
struct s390_load_data *data)
{
- struct kexec_buf buf;
+ struct kexec_buf buf = {};
buf.image = image;
diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c
index c2bac14dd668..a36d7311c668 100644
--- a/arch/s390/kernel/machine_kexec_file.c
+++ b/arch/s390/kernel/machine_kexec_file.c
@@ -129,7 +129,7 @@ static int kexec_file_update_purgatory(struct kimage *image,
static int kexec_file_add_purgatory(struct kimage *image,
struct s390_load_data *data)
{
- struct kexec_buf buf;
+ struct kexec_buf buf = {};
int ret;
buf.image = image;
@@ -152,7 +152,7 @@ static int kexec_file_add_purgatory(struct kimage *image,
static int kexec_file_add_initrd(struct kimage *image,
struct s390_load_data *data)
{
- struct kexec_buf buf;
+ struct kexec_buf buf = {};
int ret;
buf.image = image;
@@ -184,7 +184,7 @@ static int kexec_file_add_ipl_report(struct kimage *image,
{
__u32 *lc_ipl_parmblock_ptr;
unsigned int len, ncerts;
- struct kexec_buf buf;
+ struct kexec_buf buf = {};
unsigned long addr;
void *ptr, *end;
int ret;
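The kexec_buf hunks above replace an uninitialised on-stack struct with an empty initialiser, so members the function never assigns are guaranteed to be zero rather than stack garbage. A small sketch of the idiom (the structure shown is a simplified stand-in, not the real struct kexec_buf):

```c
#include <linux/types.h>

/* Simplified stand-in for struct kexec_buf, for illustration only. */
struct example_buf {
	void *image;
	unsigned long mem;
	unsigned long memsz;
	bool top_down;
};

static void setup_buf(void)
{
	/*
	 * An empty initializer zeroes every member, so fields the function
	 * never touches (including ones added to the struct later) are in a
	 * well-defined state instead of containing stack garbage.
	 */
	struct example_buf buf = {};

	buf.memsz = 4096;
	(void)buf;
}
```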
diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
index 4d09954ebf49..04457d88e589 100644
--- a/arch/s390/kernel/perf_cpum_cf.c
+++ b/arch/s390/kernel/perf_cpum_cf.c
@@ -760,8 +760,6 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type)
break;
case PERF_TYPE_HARDWARE:
- if (is_sampling_event(event)) /* No sampling support */
- return -ENOENT;
ev = attr->config;
if (!attr->exclude_user && attr->exclude_kernel) {
/*
@@ -859,6 +857,8 @@ static int cpumf_pmu_event_init(struct perf_event *event)
unsigned int type = event->attr.type;
int err = -ENOENT;
+ if (is_sampling_event(event)) /* No sampling support */
+ return err;
if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_RAW)
err = __hw_perf_event_init(event, type);
else if (event->pmu->type == type)
diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c
index f373a1009c45..9455f213dc20 100644
--- a/arch/s390/kernel/perf_pai_crypto.c
+++ b/arch/s390/kernel/perf_pai_crypto.c
@@ -285,10 +285,10 @@ static int paicrypt_event_init(struct perf_event *event)
/* PAI crypto PMU registered as PERF_TYPE_RAW, check event type */
if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type)
return -ENOENT;
- /* PAI crypto event must be in valid range */
+ /* PAI crypto event must be in valid range, try others if not */
if (a->config < PAI_CRYPTO_BASE ||
a->config > PAI_CRYPTO_BASE + paicrypt_cnt)
- return -EINVAL;
+ return -ENOENT;
/* Allow only CRYPTO_ALL for sampling */
if (a->sample_period && a->config != PAI_CRYPTO_BASE)
return -EINVAL;
diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c
index d827473e7f87..7b32935273ce 100644
--- a/arch/s390/kernel/perf_pai_ext.c
+++ b/arch/s390/kernel/perf_pai_ext.c
@@ -265,7 +265,7 @@ static int paiext_event_valid(struct perf_event *event)
event->hw.config_base = offsetof(struct paiext_cb, acc);
return 0;
}
- return -EINVAL;
+ return -ENOENT;
}
/* Might be called on different CPU than the one the event is intended for. */
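The perf changes above switch the "event out of range" checks from -EINVAL to -ENOENT: -ENOENT tells the perf core that the event does not belong to this PMU, so other PMUs are still tried, whereas -EINVAL fails the perf_event_open() call outright. A hedged sketch of an event_init callback following that convention (names and the config range are hypothetical):

```c
#include <linux/errno.h>
#include <linux/perf_event.h>

/* Hypothetical driver-private event range, for illustration only. */
#define MY_PMU_BASE	0x1000
#define MY_PMU_MAX	0x10ff

static int my_pmu_event_init(struct perf_event *event)
{
	u64 cfg = event->attr.config;

	/*
	 * -ENOENT means "not my event, try the next PMU"; the perf core
	 * keeps probing other PMUs.  -EINVAL means "my event, but the
	 * attributes are invalid" and fails the syscall outright, which is
	 * why the range checks above were relaxed from -EINVAL to -ENOENT.
	 */
	if (cfg < MY_PMU_BASE || cfg > MY_PMU_MAX)
		return -ENOENT;

	if (event->attr.sample_period)	/* sampling unsupported here */
		return -EINVAL;

	return 0;
}
```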
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index f55f09cda6f8..b107dbca4ed7 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -106,7 +106,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
- unsigned long clone_flags = args->flags;
+ u64 clone_flags = args->flags;
unsigned long new_stackp = args->stack;
unsigned long tls = args->tls;
struct fake_frame
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 46569b8e47dd..1594c80e9bc4 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -509,33 +509,27 @@ int topology_cpu_init(struct cpu *cpu)
return rc;
}
-static const struct cpumask *cpu_thread_mask(int cpu)
-{
- return &cpu_topology[cpu].thread_mask;
-}
-
-
const struct cpumask *cpu_coregroup_mask(int cpu)
{
return &cpu_topology[cpu].core_mask;
}
-static const struct cpumask *cpu_book_mask(int cpu)
+static const struct cpumask *tl_book_mask(struct sched_domain_topology_level *tl, int cpu)
{
return &cpu_topology[cpu].book_mask;
}
-static const struct cpumask *cpu_drawer_mask(int cpu)
+static const struct cpumask *tl_drawer_mask(struct sched_domain_topology_level *tl, int cpu)
{
return &cpu_topology[cpu].drawer_mask;
}
static struct sched_domain_topology_level s390_topology[] = {
- SDTL_INIT(cpu_thread_mask, cpu_smt_flags, SMT),
- SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC),
- SDTL_INIT(cpu_book_mask, NULL, BOOK),
- SDTL_INIT(cpu_drawer_mask, NULL, DRAWER),
- SDTL_INIT(cpu_cpu_mask, NULL, PKG),
+ SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT),
+ SDTL_INIT(tl_mc_mask, cpu_core_flags, MC),
+ SDTL_INIT(tl_book_mask, NULL, BOOK),
+ SDTL_INIT(tl_drawer_mask, NULL, DRAWER),
+ SDTL_INIT(tl_pkg_mask, NULL, PKG),
{ NULL, },
};
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 2a92a8b9e4c2..9384572ffa7b 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -2778,12 +2778,19 @@ static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap)
static struct page *get_map_page(struct kvm *kvm, u64 uaddr)
{
+ struct mm_struct *mm = kvm->mm;
struct page *page = NULL;
+ int locked = 1;
+
+ if (mmget_not_zero(mm)) {
+ mmap_read_lock(mm);
+ get_user_pages_remote(mm, uaddr, 1, FOLL_WRITE,
+ &page, &locked);
+ if (locked)
+ mmap_read_unlock(mm);
+ mmput(mm);
+ }
- mmap_read_lock(kvm->mm);
- get_user_pages_remote(kvm->mm, uaddr, 1, FOLL_WRITE,
- &page, NULL);
- mmap_read_unlock(kvm->mm);
return page;
}
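The get_map_page() change above pins the mm with mmget_not_zero() before taking mmap_lock and honours the "locked" output of get_user_pages_remote(), which can drop the lock internally. A minimal sketch of that pinning pattern, assuming the same GUP signature used in the hunk:

```c
#include <linux/mm.h>
#include <linux/sched/mm.h>

/*
 * Take a reference on the mm so it cannot go away, and only unlock
 * mmap_lock ourselves if get_user_pages_remote() did not already drop it
 * (signalled via *locked).
 */
static struct page *pin_one_page(struct mm_struct *mm, unsigned long uaddr)
{
	struct page *page = NULL;
	int locked = 1;

	if (!mmget_not_zero(mm))
		return NULL;

	mmap_read_lock(mm);
	get_user_pages_remote(mm, uaddr, 1, FOLL_WRITE, &page, &locked);
	if (locked)
		mmap_read_unlock(mm);
	mmput(mm);

	return page;
}
```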
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index bf6fa8b9ca73..6d51aa5f66be 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -4864,12 +4864,12 @@ static void kvm_s390_assert_primary_as(struct kvm_vcpu *vcpu)
* @vcpu: the vCPU whose gmap is to be fixed up
* @gfn: the guest frame number used for memslots (including fake memslots)
* @gaddr: the gmap address, does not have to match @gfn for ucontrol gmaps
- * @flags: FOLL_* flags
+ * @foll: FOLL_* flags
*
* Return: 0 on success, < 0 in case of error.
* Context: The mm lock must not be held before calling. May sleep.
*/
-int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int flags)
+int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int foll)
{
struct kvm_memory_slot *slot;
unsigned int fault_flags;
@@ -4883,13 +4883,13 @@ int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, u
if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
return vcpu_post_run_addressing_exception(vcpu);
- fault_flags = flags & FOLL_WRITE ? FAULT_FLAG_WRITE : 0;
+ fault_flags = foll & FOLL_WRITE ? FAULT_FLAG_WRITE : 0;
if (vcpu->arch.gmap->pfault_enabled)
- flags |= FOLL_NOWAIT;
+ foll |= FOLL_NOWAIT;
vmaddr = __gfn_to_hva_memslot(slot, gfn);
try_again:
- pfn = __kvm_faultin_pfn(slot, gfn, flags, &writable, &page);
+ pfn = __kvm_faultin_pfn(slot, gfn, foll, &writable, &page);
/* Access outside memory, inject addressing exception */
if (is_noslot_pfn(pfn))
@@ -4905,7 +4905,7 @@ try_again:
return 0;
vcpu->stat.pfault_sync++;
/* Could not setup async pfault, try again synchronously */
- flags &= ~FOLL_NOWAIT;
+ foll &= ~FOLL_NOWAIT;
goto try_again;
}
/* Any other error */
@@ -4925,7 +4925,7 @@ try_again:
return rc;
}
-static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, unsigned int flags)
+static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, unsigned int foll)
{
unsigned long gaddr_tmp;
gfn_t gfn;
@@ -4950,18 +4950,18 @@ static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, un
}
gfn = gpa_to_gfn(gaddr_tmp);
}
- return __kvm_s390_handle_dat_fault(vcpu, gfn, gaddr, flags);
+ return __kvm_s390_handle_dat_fault(vcpu, gfn, gaddr, foll);
}
static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu)
{
- unsigned int flags = 0;
+ unsigned int foll = 0;
unsigned long gaddr;
int rc;
gaddr = current->thread.gmap_teid.addr * PAGE_SIZE;
if (kvm_s390_cur_gmap_fault_is_write())
- flags = FAULT_FLAG_WRITE;
+ foll = FOLL_WRITE;
switch (current->thread.gmap_int_code & PGM_INT_CODE_MASK) {
case 0:
@@ -5003,7 +5003,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu)
send_sig(SIGSEGV, current, 0);
if (rc != -ENXIO)
break;
- flags = FAULT_FLAG_WRITE;
+ foll = FOLL_WRITE;
fallthrough;
case PGM_PROTECTION:
case PGM_SEGMENT_TRANSLATION:
@@ -5013,7 +5013,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu)
case PGM_REGION_SECOND_TRANS:
case PGM_REGION_THIRD_TRANS:
kvm_s390_assert_primary_as(vcpu);
- return vcpu_dat_fault_handler(vcpu, gaddr, flags);
+ return vcpu_dat_fault_handler(vcpu, gaddr, foll);
default:
KVM_BUG(1, vcpu->kvm, "Unexpected program interrupt 0x%x, TEID 0x%016lx",
current->thread.gmap_int_code, current->thread.gmap_teid.val);
diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
index 25ede8354514..6ba5a0305e25 100644
--- a/arch/s390/kvm/pv.c
+++ b/arch/s390/kvm/pv.c
@@ -624,6 +624,17 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
int cc, ret;
u16 dummy;
+ /* Add the notifier only once. No races because we hold kvm->lock */
+ if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) {
+ /* The notifier will be unregistered when the VM is destroyed */
+ kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops;
+ ret = mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm);
+ if (ret) {
+ kvm->arch.pv.mmu_notifier.ops = NULL;
+ return ret;
+ }
+ }
+
ret = kvm_s390_pv_alloc_vm(kvm);
if (ret)
return ret;
@@ -659,11 +670,6 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
return -EIO;
}
kvm->arch.gmap->guest_handle = uvcb.guest_handle;
- /* Add the notifier only once. No races because we hold kvm->lock */
- if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) {
- kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops;
- mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm);
- }
return 0;
}
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index d2f6f1f6d2fc..36700384fe6b 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -14,11 +14,15 @@
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
-unsigned long *crst_table_alloc(struct mm_struct *mm)
+unsigned long *crst_table_alloc_noprof(struct mm_struct *mm)
{
- struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER);
+ gfp_t gfp = GFP_KERNEL_ACCOUNT;
+ struct ptdesc *ptdesc;
unsigned long *table;
+ if (mm == &init_mm)
+ gfp &= ~__GFP_ACCOUNT;
+ ptdesc = pagetable_alloc_noprof(gfp, CRST_ALLOC_ORDER);
if (!ptdesc)
return NULL;
table = ptdesc_to_virt(ptdesc);
@@ -112,12 +116,12 @@ err_p4d:
#ifdef CONFIG_PGSTE
-struct ptdesc *page_table_alloc_pgste(struct mm_struct *mm)
+struct ptdesc *page_table_alloc_pgste_noprof(struct mm_struct *mm)
{
struct ptdesc *ptdesc;
u64 *table;
- ptdesc = pagetable_alloc(GFP_KERNEL, 0);
+ ptdesc = pagetable_alloc_noprof(GFP_KERNEL_ACCOUNT, 0);
if (ptdesc) {
table = (u64 *)ptdesc_to_virt(ptdesc);
__arch_set_page_dat(table, 1);
@@ -134,12 +138,15 @@ void page_table_free_pgste(struct ptdesc *ptdesc)
#endif /* CONFIG_PGSTE */
-unsigned long *page_table_alloc(struct mm_struct *mm)
+unsigned long *page_table_alloc_noprof(struct mm_struct *mm)
{
+ gfp_t gfp = GFP_KERNEL_ACCOUNT;
struct ptdesc *ptdesc;
unsigned long *table;
- ptdesc = pagetable_alloc(GFP_KERNEL, 0);
+ if (mm == &init_mm)
+ gfp &= ~__GFP_ACCOUNT;
+ ptdesc = pagetable_alloc_noprof(gfp, 0);
if (!ptdesc)
return NULL;
if (!pagetable_pte_ctor(mm, ptdesc)) {
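The pgalloc hunks above switch page-table allocations to GFP_KERNEL_ACCOUNT so that user page tables are charged to the owning memory cgroup, while clearing __GFP_ACCOUNT for init_mm, since kernel page tables have no cgroup to charge. A small sketch of that gfp selection:

```c
#include <linux/gfp.h>
#include <linux/mm_types.h>

/*
 * User page tables are charged to the allocating cgroup via __GFP_ACCOUNT;
 * kernel (init_mm) page tables are not, as there is no task to charge.
 */
static gfp_t pgtable_gfp(struct mm_struct *mm)
{
	gfp_t gfp = GFP_KERNEL_ACCOUNT;

	if (mm == &init_mm)
		gfp &= ~__GFP_ACCOUNT;

	return gfp;
}
```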
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 60688be4e876..50eb57c976bc 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -335,7 +335,6 @@ pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
int nodat;
struct mm_struct *mm = vma->vm_mm;
- preempt_disable();
pgste = ptep_xchg_start(mm, addr, ptep);
nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
old = ptep_flush_lazy(mm, addr, ptep, nodat);
@@ -360,7 +359,6 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
} else {
set_pte(ptep, pte);
}
- preempt_enable();
}
static inline void pmdp_idte_local(struct mm_struct *mm,
diff --git a/arch/sh/include/asm/bitops.h b/arch/sh/include/asm/bitops.h
index 10ceb0d6b5a9..aba3aa96a50e 100644
--- a/arch/sh/include/asm/bitops.h
+++ b/arch/sh/include/asm/bitops.h
@@ -24,7 +24,7 @@
#include <asm-generic/bitops/non-atomic.h>
#endif
-static inline unsigned long ffz(unsigned long word)
+static inline unsigned long __attribute_const__ ffz(unsigned long word)
{
unsigned long result;
@@ -44,7 +44,7 @@ static inline unsigned long ffz(unsigned long word)
*
* Undefined if no bit exists, so code should check against 0 first.
*/
-static inline unsigned long __ffs(unsigned long word)
+static inline __attribute_const__ unsigned long __ffs(unsigned long word)
{
unsigned long result;
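The bitops hunk above marks ffz() and __ffs() with __attribute_const__, telling the compiler the result depends only on the arguments (no memory reads, no side effects), so repeated calls with the same value can be folded. An illustrative, non-asm sketch of the annotation:

```c
#include <linux/compiler.h>

/*
 * Returns the index of the lowest clear bit, like ffz(); purely a function
 * of its argument, hence safe to mark __attribute_const__.  The real
 * implementations use inline assembly.
 */
static inline unsigned long __attribute_const__ low_zero_bit(unsigned long word)
{
	unsigned long bit = 0;

	while (word & 1) {
		word >>= 1;
		bit++;
	}
	return bit;
}
```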
diff --git a/arch/sh/kernel/asm-offsets.c b/arch/sh/kernel/asm-offsets.c
index a0322e832845..429b6a763146 100644
--- a/arch/sh/kernel/asm-offsets.c
+++ b/arch/sh/kernel/asm-offsets.c
@@ -8,6 +8,7 @@
* compile this file to assembler, and then extract the
* #defines from the assembly-language output.
*/
+#define COMPILE_OFFSETS
#include <linux/stddef.h>
#include <linux/types.h>
diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c
index 92b6649d4929..62f753a85b89 100644
--- a/arch/sh/kernel/process_32.c
+++ b/arch/sh/kernel/process_32.c
@@ -89,7 +89,7 @@ asmlinkage void ret_from_kernel_thread(void);
int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
- unsigned long clone_flags = args->flags;
+ u64 clone_flags = args->flags;
unsigned long usp = args->stack;
unsigned long tls = args->tls;
struct thread_info *ti = task_thread_info(p);
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 7b595092cbfb..a630d373e645 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -110,6 +110,8 @@ config SPARC64
select HAVE_SETUP_PER_CPU_AREA
select NEED_PER_CPU_EMBED_FIRST_CHUNK
select NEED_PER_CPU_PAGE_FIRST_CHUNK
+ select ARCH_SUPPORTS_SCHED_SMT if SMP
+ select ARCH_SUPPORTS_SCHED_MC if SMP
config ARCH_PROC_KCORE_TEXT
def_bool y
@@ -288,24 +290,6 @@ if SPARC64 || COMPILE_TEST
source "kernel/power/Kconfig"
endif
-config SCHED_SMT
- bool "SMT (Hyperthreading) scheduler support"
- depends on SPARC64 && SMP
- default y
- help
- SMT scheduler support improves the CPU scheduler's decision making
- when dealing with SPARC cpus at a cost of slightly increased overhead
- in some places. If unsure say N here.
-
-config SCHED_MC
- bool "Multi-core scheduler support"
- depends on SPARC64 && SMP
- default y
- help
- Multi-core scheduler support improves the CPU scheduler's decision
- making when dealing with multi-core CPU chips at a cost of slightly
- increased overhead in some places. If unsure say N here.
-
config CMDLINE_BOOL
bool "Default bootloader kernel arguments"
depends on SPARC64
diff --git a/arch/sparc/crypto/Kconfig b/arch/sparc/crypto/Kconfig
index f5b2e720fec3..f755da979534 100644
--- a/arch/sparc/crypto/Kconfig
+++ b/arch/sparc/crypto/Kconfig
@@ -16,16 +16,6 @@ config CRYPTO_DES_SPARC64
Architecture: sparc64
-config CRYPTO_MD5_SPARC64
- tristate "Digests: MD5"
- depends on SPARC64
- select CRYPTO_MD5
- select CRYPTO_HASH
- help
- MD5 message digest algorithm (RFC1321)
-
- Architecture: sparc64 using crypto instructions, when available
-
config CRYPTO_AES_SPARC64
tristate "Ciphers: AES, modes: ECB, CBC, CTR"
depends on SPARC64
diff --git a/arch/sparc/crypto/Makefile b/arch/sparc/crypto/Makefile
index 0d05a17988c4..7b4796842ddd 100644
--- a/arch/sparc/crypto/Makefile
+++ b/arch/sparc/crypto/Makefile
@@ -3,14 +3,10 @@
# Arch-specific CryptoAPI modules.
#
-obj-$(CONFIG_CRYPTO_MD5_SPARC64) += md5-sparc64.o
-
obj-$(CONFIG_CRYPTO_AES_SPARC64) += aes-sparc64.o
obj-$(CONFIG_CRYPTO_DES_SPARC64) += des-sparc64.o
obj-$(CONFIG_CRYPTO_CAMELLIA_SPARC64) += camellia-sparc64.o
-md5-sparc64-y := md5_asm.o md5_glue.o
-
aes-sparc64-y := aes_asm.o aes_glue.o
des-sparc64-y := des_asm.o des_glue.o
camellia-sparc64-y := camellia_asm.o camellia_glue.o
diff --git a/arch/sparc/crypto/md5_glue.c b/arch/sparc/crypto/md5_glue.c
deleted file mode 100644
index b3615f0cdf62..000000000000
--- a/arch/sparc/crypto/md5_glue.c
+++ /dev/null
@@ -1,174 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/* Glue code for MD5 hashing optimized for sparc64 crypto opcodes.
- *
- * This is based largely upon arch/x86/crypto/sha1_ssse3_glue.c
- * and crypto/md5.c which are:
- *
- * Copyright (c) Alan Smithee.
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
- * Copyright (c) Mathias Krause <minipli@googlemail.com>
- * Copyright (c) Cryptoapi developers.
- * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <asm/elf.h>
-#include <asm/opcodes.h>
-#include <asm/pstate.h>
-#include <crypto/internal/hash.h>
-#include <crypto/md5.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/unaligned.h>
-
-struct sparc_md5_state {
- __le32 hash[MD5_HASH_WORDS];
- u64 byte_count;
-};
-
-asmlinkage void md5_sparc64_transform(__le32 *digest, const char *data,
- unsigned int rounds);
-
-static int md5_sparc64_init(struct shash_desc *desc)
-{
- struct sparc_md5_state *mctx = shash_desc_ctx(desc);
-
- mctx->hash[0] = cpu_to_le32(MD5_H0);
- mctx->hash[1] = cpu_to_le32(MD5_H1);
- mctx->hash[2] = cpu_to_le32(MD5_H2);
- mctx->hash[3] = cpu_to_le32(MD5_H3);
- mctx->byte_count = 0;
-
- return 0;
-}
-
-static int md5_sparc64_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- struct sparc_md5_state *sctx = shash_desc_ctx(desc);
-
- sctx->byte_count += round_down(len, MD5_HMAC_BLOCK_SIZE);
- md5_sparc64_transform(sctx->hash, data, len / MD5_HMAC_BLOCK_SIZE);
- return len - round_down(len, MD5_HMAC_BLOCK_SIZE);
-}
-
-/* Add padding and return the message digest. */
-static int md5_sparc64_finup(struct shash_desc *desc, const u8 *src,
- unsigned int offset, u8 *out)
-{
- struct sparc_md5_state *sctx = shash_desc_ctx(desc);
- __le64 block[MD5_BLOCK_WORDS] = {};
- u8 *p = memcpy(block, src, offset);
- __le32 *dst = (__le32 *)out;
- __le64 *pbits;
- int i;
-
- src = p;
- p += offset;
- *p++ = 0x80;
- sctx->byte_count += offset;
- pbits = &block[(MD5_BLOCK_WORDS / (offset > 55 ? 1 : 2)) - 1];
- *pbits = cpu_to_le64(sctx->byte_count << 3);
- md5_sparc64_transform(sctx->hash, src, (pbits - block + 1) / 8);
- memzero_explicit(block, sizeof(block));
-
- /* Store state in digest */
- for (i = 0; i < MD5_HASH_WORDS; i++)
- dst[i] = sctx->hash[i];
-
- return 0;
-}
-
-static int md5_sparc64_export(struct shash_desc *desc, void *out)
-{
- struct sparc_md5_state *sctx = shash_desc_ctx(desc);
- union {
- u8 *u8;
- u32 *u32;
- u64 *u64;
- } p = { .u8 = out };
- int i;
-
- for (i = 0; i < MD5_HASH_WORDS; i++)
- put_unaligned(le32_to_cpu(sctx->hash[i]), p.u32++);
- put_unaligned(sctx->byte_count, p.u64);
- return 0;
-}
-
-static int md5_sparc64_import(struct shash_desc *desc, const void *in)
-{
- struct sparc_md5_state *sctx = shash_desc_ctx(desc);
- union {
- const u8 *u8;
- const u32 *u32;
- const u64 *u64;
- } p = { .u8 = in };
- int i;
-
- for (i = 0; i < MD5_HASH_WORDS; i++)
- sctx->hash[i] = cpu_to_le32(get_unaligned(p.u32++));
- sctx->byte_count = get_unaligned(p.u64);
- return 0;
-}
-
-static struct shash_alg alg = {
- .digestsize = MD5_DIGEST_SIZE,
- .init = md5_sparc64_init,
- .update = md5_sparc64_update,
- .finup = md5_sparc64_finup,
- .export = md5_sparc64_export,
- .import = md5_sparc64_import,
- .descsize = sizeof(struct sparc_md5_state),
- .statesize = sizeof(struct sparc_md5_state),
- .base = {
- .cra_name = "md5",
- .cra_driver_name= "md5-sparc64",
- .cra_priority = SPARC_CR_OPCODE_PRIORITY,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = MD5_HMAC_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
-
-static bool __init sparc64_has_md5_opcode(void)
-{
- unsigned long cfr;
-
- if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
- return false;
-
- __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
- if (!(cfr & CFR_MD5))
- return false;
-
- return true;
-}
-
-static int __init md5_sparc64_mod_init(void)
-{
- if (sparc64_has_md5_opcode()) {
- pr_info("Using sparc64 md5 opcode optimized MD5 implementation\n");
- return crypto_register_shash(&alg);
- }
- pr_info("sparc64 md5 opcode not available.\n");
- return -ENODEV;
-}
-
-static void __exit md5_sparc64_mod_fini(void)
-{
- crypto_unregister_shash(&alg);
-}
-
-module_init(md5_sparc64_mod_init);
-module_exit(md5_sparc64_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("MD5 Message Digest Algorithm, sparc64 md5 opcode accelerated");
-
-MODULE_ALIAS_CRYPTO("md5");
-
-#include "crop_devid.c"
diff --git a/arch/sparc/include/asm/bitops_64.h b/arch/sparc/include/asm/bitops_64.h
index 005a8ae858f1..2c7d33b3ec2e 100644
--- a/arch/sparc/include/asm/bitops_64.h
+++ b/arch/sparc/include/asm/bitops_64.h
@@ -23,8 +23,8 @@ void set_bit(unsigned long nr, volatile unsigned long *addr);
void clear_bit(unsigned long nr, volatile unsigned long *addr);
void change_bit(unsigned long nr, volatile unsigned long *addr);
-int fls(unsigned int word);
-int __fls(unsigned long word);
+int __attribute_const__ fls(unsigned int word);
+int __attribute_const__ __fls(unsigned long word);
#include <asm-generic/bitops/non-atomic.h>
@@ -32,8 +32,8 @@ int __fls(unsigned long word);
#ifdef __KERNEL__
-int ffs(int x);
-unsigned long __ffs(unsigned long);
+int __attribute_const__ ffs(int x);
+unsigned long __attribute_const__ __ffs(unsigned long);
#include <asm-generic/bitops/ffz.h>
#include <asm-generic/bitops/sched.h>
diff --git a/arch/sparc/kernel/asm-offsets.c b/arch/sparc/kernel/asm-offsets.c
index 3d9b9855dce9..6e660bde48dd 100644
--- a/arch/sparc/kernel/asm-offsets.c
+++ b/arch/sparc/kernel/asm-offsets.c
@@ -10,6 +10,7 @@
*
* On sparc, thread_info data is static and TI_XXX offsets are computed by hand.
*/
+#define COMPILE_OFFSETS
#include <linux/sched.h>
#include <linux/mm_types.h>
diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c
index 9c7c662cb565..5a28c0e91bf1 100644
--- a/arch/sparc/kernel/process_32.c
+++ b/arch/sparc/kernel/process_32.c
@@ -260,7 +260,7 @@ extern void ret_from_kernel_thread(void);
int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
- unsigned long clone_flags = args->flags;
+ u64 clone_flags = args->flags;
unsigned long sp = args->stack;
unsigned long tls = args->tls;
struct thread_info *ti = task_thread_info(p);
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index 529adfecd58c..25781923788a 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -567,7 +567,7 @@ barf:
*/
int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
- unsigned long clone_flags = args->flags;
+ u64 clone_flags = args->flags;
unsigned long sp = args->stack;
unsigned long tls = args->tls;
struct thread_info *t = task_thread_info(p);
diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c
index ad8d78fb1d9a..de7867ae220d 100644
--- a/arch/um/drivers/virtio_uml.c
+++ b/arch/um/drivers/virtio_uml.c
@@ -1250,10 +1250,12 @@ static int virtio_uml_probe(struct platform_device *pdev)
device_set_wakeup_capable(&vu_dev->vdev.dev, true);
rc = register_virtio_device(&vu_dev->vdev);
- if (rc)
+ if (rc) {
put_device(&vu_dev->vdev.dev);
+ return rc;
+ }
vu_dev->registered = 1;
- return rc;
+ return 0;
error_init:
os_close_file(vu_dev->sock);
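The virtio_uml hunk above makes register_virtio_device() failure a hard error: the reference taken on the embedded device is dropped with put_device() and the error is returned immediately, so the device is never marked registered. A hedged sketch of that error path (the function and flag names are illustrative):

```c
#include <linux/device.h>
#include <linux/virtio.h>

static int example_register(struct virtio_device *vdev, bool *registered)
{
	int rc = register_virtio_device(vdev);

	if (rc) {
		/* Drop the device reference; do not fall through. */
		put_device(&vdev->dev);
		return rc;
	}

	*registered = true;
	return 0;
}
```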
diff --git a/arch/um/kernel/asm-offsets.c b/arch/um/kernel/asm-offsets.c
index 1fb12235ab9c..a69873aa697f 100644
--- a/arch/um/kernel/asm-offsets.c
+++ b/arch/um/kernel/asm-offsets.c
@@ -1 +1,3 @@
+#define COMPILE_OFFSETS
+
#include <sysdep/kernel-offsets.h>
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index 1be644de9e41..9c9c66dc45f0 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -143,7 +143,7 @@ static void fork_handler(void)
int copy_thread(struct task_struct * p, const struct kernel_clone_args *args)
{
- unsigned long clone_flags = args->flags;
+ u64 clone_flags = args->flags;
unsigned long sp = args->stack;
unsigned long tls = args->tls;
void (*handler)(void);
diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c
index 617886d1fb1e..21f0e50fb1df 100644
--- a/arch/um/os-Linux/file.c
+++ b/arch/um/os-Linux/file.c
@@ -535,7 +535,7 @@ ssize_t os_rcv_fd_msg(int fd, int *fds, unsigned int n_fds,
cmsg->cmsg_type != SCM_RIGHTS)
return n;
- memcpy(fds, CMSG_DATA(cmsg), cmsg->cmsg_len);
+ memcpy(fds, CMSG_DATA(cmsg), cmsg->cmsg_len - CMSG_LEN(0));
return n;
}
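The os_rcv_fd_msg() fix above subtracts CMSG_LEN(0) because cmsg_len covers the control-message header as well as the payload; copying cmsg_len bytes would over-read past the received file descriptors. A small userspace sketch of the arithmetic (helper names are illustrative):

```c
#include <string.h>
#include <sys/socket.h>

/*
 * cmsg_len = CMSG_LEN(payload_bytes), i.e. header plus payload, so the
 * number of SCM_RIGHTS file descriptors carried in a control message is
 * (cmsg_len - CMSG_LEN(0)) / sizeof(int).
 */
static size_t scm_rights_count(const struct cmsghdr *cmsg)
{
	return (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
}

static void copy_fds(int *fds, struct cmsghdr *cmsg)
{
	/* Copy only the payload, not the trailing header-sized slack. */
	memcpy(fds, CMSG_DATA(cmsg), cmsg->cmsg_len - CMSG_LEN(0));
}
```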
diff --git a/arch/um/os-Linux/util.c b/arch/um/os-Linux/util.c
index 4193e04d7e4a..e3ad71a0d13c 100644
--- a/arch/um/os-Linux/util.c
+++ b/arch/um/os-Linux/util.c
@@ -20,8 +20,7 @@
void stack_protections(unsigned long address)
{
- if (mprotect((void *) address, UM_THREAD_SIZE,
- PROT_READ | PROT_WRITE | PROT_EXEC) < 0)
+ if (mprotect((void *) address, UM_THREAD_SIZE, PROT_READ | PROT_WRITE) < 0)
panic("protecting stack failed, errno = %d", errno);
}
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index f7fb3d88c57b..36b985d0e7bf 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -3,6 +3,8 @@
# Branch profiling isn't noinstr-safe. Disable it for arch/x86/*
subdir-ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING
+obj-y += boot/startup/
+
obj-$(CONFIG_ARCH_HAS_CC_PLATFORM) += coco/
obj-y += entry/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 58d890fe2100..a11af013cb1e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -26,7 +26,6 @@ config X86_64
depends on 64BIT
# Options that are inherently 64-bit kernel only:
select ARCH_HAS_GIGANTIC_PAGE
- select ARCH_HAS_PTDUMP
select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS
select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
select ARCH_SUPPORTS_PER_VMA_LOCK
@@ -99,6 +98,7 @@ config X86
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
select ARCH_HAS_PMEM_API if X86_64
select ARCH_HAS_PREEMPT_LAZY
+ select ARCH_HAS_PTDUMP
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_HW_PTE_YOUNG
select ARCH_HAS_NONLEAF_PMD_YOUNG if PGTABLE_LEVELS > 2
@@ -127,8 +127,8 @@ config X86
select ARCH_SUPPORTS_PAGE_TABLE_CHECK if X86_64
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096
- select ARCH_SUPPORTS_CFI_CLANG if X86_64
- select ARCH_USES_CFI_TRAPS if X86_64 && CFI_CLANG
+ select ARCH_SUPPORTS_CFI if X86_64
+ select ARCH_USES_CFI_TRAPS if X86_64 && CFI
select ARCH_SUPPORTS_LTO_CLANG
select ARCH_SUPPORTS_LTO_CLANG_THIN
select ARCH_SUPPORTS_RT
@@ -239,6 +239,7 @@ config X86
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select HAVE_EISA if X86_32
select HAVE_EXIT_THREAD
+ select HAVE_GENERIC_TIF_BITS
select HAVE_GUP_FAST
select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE
select HAVE_FTRACE_GRAPH_FUNC if HAVE_FUNCTION_GRAPH_TRACER
@@ -330,6 +331,10 @@ config X86
imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI
select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
select ARCH_SUPPORTS_PT_RECLAIM if X86_64
+ select ARCH_SUPPORTS_SCHED_SMT if SMP
+ select SCHED_SMT if SMP
+ select ARCH_SUPPORTS_SCHED_CLUSTER if SMP
+ select ARCH_SUPPORTS_SCHED_MC if SMP
config INSTRUCTION_DECODER
def_bool y
@@ -483,6 +488,19 @@ config X86_X2APIC
If in doubt, say Y.
+config AMD_SECURE_AVIC
+ bool "AMD Secure AVIC"
+ depends on AMD_MEM_ENCRYPT && X86_X2APIC
+ help
+ Enable this to get AMD Secure AVIC support on guests that have this feature.
+
+ AMD Secure AVIC provides hardware acceleration for performance sensitive
+ APIC accesses and support for managing guest owned APIC state for SEV-SNP
+ guests. Secure AVIC does not support xAPIC mode. It has functional
+ dependency on x2apic being enabled in the guest.
+
+ If you don't know what to do here, say N.
+
config X86_POSTED_MSI
bool "Enable MSI and MSI-x delivery by posted interrupts"
depends on X86_64 && IRQ_REMAP
@@ -879,6 +897,15 @@ config ACRN_GUEST
IOT with small footprint and real-time features. More details can be
found in https://projectacrn.org/.
+config BHYVE_GUEST
+ bool "Bhyve (BSD Hypervisor) Guest support"
+ depends on X86_64
+ help
+ This option allows Linux to recognise when it is running as a
+ guest in the Bhyve hypervisor, and to support more than 255 vCPUs
+ when doing so. More details about Bhyve can be found at https://bhyve.org
+ and https://wiki.freebsd.org/bhyve/.
+
config INTEL_TDX_GUEST
bool "Intel TDX (Trust Domain Extensions) - Guest Support"
depends on X86_64 && CPU_SUP_INTEL
@@ -1031,29 +1058,6 @@ config NR_CPUS
This is purely to save memory: each supported CPU adds about 8KB
to the kernel image.
-config SCHED_CLUSTER
- bool "Cluster scheduler support"
- depends on SMP
- default y
- help
- Cluster scheduler support improves the CPU scheduler's decision
- making when dealing with machines that have clusters of CPUs.
- Cluster usually means a couple of CPUs which are placed closely
- by sharing mid-level caches, last-level cache tags or internal
- busses.
-
-config SCHED_SMT
- def_bool y if SMP
-
-config SCHED_MC
- def_bool y
- prompt "Multi-core scheduler support"
- depends on SMP
- help
- Multi-core scheduler support improves the CPU scheduler's decision
- making when dealing with multi-core CPU chips at a cost of slightly
- increased overhead in some places. If unsure say N here.
-
config SCHED_MC_PRIO
bool "CPU core priorities scheduler support"
depends on SCHED_MC
@@ -1340,7 +1344,7 @@ config MICROCODE_LATE_LOADING
use this at your own risk. Late loading taints the kernel unless the
microcode header indicates that it is safe for late loading via the
minimal revision check. This minimal revision check can be enforced on
- the kernel command line with "microcode.minrev=Y".
+ the kernel command line with "microcode=force_minrev".
config MICROCODE_LATE_FORCE_MINREV
bool "Enforce late microcode loading minimal revision check"
@@ -1356,10 +1360,22 @@ config MICROCODE_LATE_FORCE_MINREV
revision check fails.
This minimal revision check can also be controlled via the
- "microcode.minrev" parameter on the kernel command line.
+ "microcode=force_minrev" parameter on the kernel command line.
If unsure say Y.
+config MICROCODE_DBG
+ bool "Enable microcode loader debugging"
+ default n
+ depends on MICROCODE
+ help
+ Enable code which allows for debugging the microcode loader in
+ a guest. Meaning the patch loading is simulated but everything else
+ related to patch parsing and handling is done as on baremetal with
+ the purpose of debugging solely the software side of things.
+
+ You almost certainly want to say n here.
+
config X86_MSR
tristate "/dev/cpu/*/msr - Model-specific register support"
help
@@ -1753,11 +1769,7 @@ config X86_UMIP
config CC_HAS_IBT
# GCC >= 9 and binutils >= 2.29
# Retpoline check to work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93654
- # Clang/LLVM >= 14
- # https://github.com/llvm/llvm-project/commit/e0b89df2e0f0130881bf6c39bf31d7f6aac00e0f
- # https://github.com/llvm/llvm-project/commit/dfcf69770bc522b9e411c66454934a37c1f35332
- def_bool ((CC_IS_GCC && $(cc-option, -fcf-protection=branch -mindirect-branch-register)) || \
- (CC_IS_CLANG && CLANG_VERSION >= 140000)) && \
+ def_bool ((CC_IS_GCC && $(cc-option, -fcf-protection=branch -mindirect-branch-register)) || CC_IS_CLANG) && \
$(as-instr,endbr64)
config X86_CET
@@ -1769,8 +1781,6 @@ config X86_KERNEL_IBT
prompt "Indirect Branch Tracking"
def_bool y
depends on X86_64 && CC_HAS_IBT && HAVE_OBJTOOL
- # https://github.com/llvm/llvm-project/commit/9d7001eba9c4cb311e03cd8cdc231f9e579f2d0f
- depends on !LD_IS_LLD || LLD_VERSION >= 140000
select OBJTOOL
select X86_CET
help
@@ -2396,11 +2406,11 @@ config FUNCTION_PADDING_CFI
default 3 if FUNCTION_ALIGNMENT_8B
default 0
-# Basically: FUNCTION_ALIGNMENT - 5*CFI_CLANG
+# Basically: FUNCTION_ALIGNMENT - 5*CFI
# except Kconfig can't do arithmetic :/
config FUNCTION_PADDING_BYTES
int
- default FUNCTION_PADDING_CFI if CFI_CLANG
+ default FUNCTION_PADDING_CFI if CFI
default FUNCTION_ALIGNMENT
config CALL_PADDING
@@ -2410,7 +2420,7 @@ config CALL_PADDING
config FINEIBT
def_bool y
- depends on X86_KERNEL_IBT && CFI_CLANG && MITIGATION_RETPOLINE
+ depends on X86_KERNEL_IBT && CFI && MITIGATION_RETPOLINE
select CALL_PADDING
config FINEIBT_BHI
@@ -2427,7 +2437,7 @@ config CALL_THUNKS
config PREFIX_SYMBOLS
def_bool y
- depends on CALL_PADDING && !CFI_CLANG
+ depends on CALL_PADDING && !CFI
menuconfig CPU_MITIGATIONS
bool "Mitigations for CPU vulnerabilities"
@@ -2701,6 +2711,15 @@ config MITIGATION_TSA
security vulnerability on AMD CPUs which can lead to forwarding of
invalid info to subsequent instructions and thus can affect their
timing and thereby cause a leakage.
+
+config MITIGATION_VMSCAPE
+ bool "Mitigate VMSCAPE"
+ depends on KVM
+ default y
+ help
+ Enable mitigation for VMSCAPE attacks. VMSCAPE is a hardware security
+ vulnerability on Intel and AMD CPUs that may allow a guest to do
+ Spectre v2 style attacks on the userspace hypervisor.
endif
config ARCH_HAS_ADD_PAGES
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 1913d342969b..4db7e4bf69f5 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -13,8 +13,8 @@ else
endif
ifdef CONFIG_CC_IS_GCC
-RETPOLINE_CFLAGS := $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register)
-RETPOLINE_VDSO_CFLAGS := $(call cc-option,-mindirect-branch=thunk-inline -mindirect-branch-register)
+RETPOLINE_CFLAGS := -mindirect-branch=thunk-extern -mindirect-branch-register
+RETPOLINE_VDSO_CFLAGS := -mindirect-branch=thunk-inline -mindirect-branch-register
endif
ifdef CONFIG_CC_IS_CLANG
RETPOLINE_CFLAGS := -mretpoline-external-thunk
@@ -37,10 +37,11 @@ export RETPOLINE_VDSO_CFLAGS
# For gcc stack alignment is specified with -mpreferred-stack-boundary,
# clang has the option -mstack-alignment for that purpose.
-ifneq ($(call cc-option, -mpreferred-stack-boundary=4),)
+ifdef CONFIG_CC_IS_GCC
cc_stack_align4 := -mpreferred-stack-boundary=2
cc_stack_align8 := -mpreferred-stack-boundary=3
-else ifneq ($(call cc-option, -mstack-alignment=16),)
+endif
+ifdef CONFIG_CC_IS_CLANG
cc_stack_align4 := -mstack-alignment=4
cc_stack_align8 := -mstack-alignment=8
endif
@@ -83,19 +84,7 @@ KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-av
#
CC_FLAGS_FPU := -msse -msse2
ifdef CONFIG_CC_IS_GCC
-# Stack alignment mismatch, proceed with caution.
-# GCC < 7.1 cannot compile code using `double` and -mpreferred-stack-boundary=3
-# (8B stack alignment).
-# See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
-#
-# The "-msse" in the first argument is there so that the
-# -mpreferred-stack-boundary=3 build error:
-#
-# -mpreferred-stack-boundary=3 is not between 4 and 12
-#
-# can be triggered. Otherwise gcc doesn't complain.
CC_FLAGS_FPU += -mhard-float
-CC_FLAGS_FPU += $(call cc-option,-msse -mpreferred-stack-boundary=3,-mpreferred-stack-boundary=4)
endif
ifeq ($(CONFIG_X86_KERNEL_IBT),y)
@@ -159,7 +148,7 @@ else
# Don't autogenerate traditional x87 instructions
KBUILD_CFLAGS += -mno-80387
- KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387)
+ KBUILD_CFLAGS += -mno-fp-ret-in-387
# By default gcc and clang use a stack alignment of 16 bytes for x86.
# However the standard kernel entry on x86-64 leaves the stack on an
@@ -171,7 +160,7 @@ else
KBUILD_CFLAGS += $(cc_stack_align8)
# Use -mskip-rax-setup if supported.
- KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup)
+ KBUILD_CFLAGS += -mskip-rax-setup
ifdef CONFIG_X86_NATIVE_CPU
KBUILD_CFLAGS += -march=native
@@ -286,7 +275,6 @@ archprepare: $(cpufeaturemasks.hdr)
###
# Kernel objects
-core-y += arch/x86/boot/startup/
libs-y += arch/x86/lib/
# drivers-y are linked after core-y
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 3a38fdcdb9bd..74657589264d 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -73,7 +73,7 @@ LDFLAGS_vmlinux += -T
hostprogs := mkpiggy
HOST_EXTRACFLAGS += -I$(srctree)/tools/include
-sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABbCDGRSTtVW] \(_text\|__start_rodata\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p'
+sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABbCDGRSTtVW] \(_text\|__start_rodata\|_sinittext\|__inittext_end\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p'
quiet_cmd_voffset = VOFFSET $@
cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 94b5991da001..0f41ca0e52c0 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -332,6 +332,8 @@ static size_t parse_elf(void *output)
}
const unsigned long kernel_text_size = VO___start_rodata - VO__text;
+const unsigned long kernel_inittext_offset = VO__sinittext - VO__text;
+const unsigned long kernel_inittext_size = VO___inittext_end - VO__sinittext;
const unsigned long kernel_total_size = VO__end - VO__text;
static u8 boot_heap[BOOT_HEAP_SIZE] __aligned(4);
diff --git a/arch/x86/boot/compressed/sev-handle-vc.c b/arch/x86/boot/compressed/sev-handle-vc.c
index 89dd02de2a0f..7530ad8b768b 100644
--- a/arch/x86/boot/compressed/sev-handle-vc.c
+++ b/arch/x86/boot/compressed/sev-handle-vc.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "misc.h"
+#include "error.h"
#include "sev.h"
#include <linux/kernel.h>
@@ -14,6 +15,8 @@
#include <asm/fpu/xcr.h>
#define __BOOT_COMPRESSED
+#undef __init
+#define __init
/* Basic instruction decoding support needed */
#include "../../lib/inat.c"
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index fd1b67dfea22..6e5c32a53d03 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -32,102 +32,47 @@ struct ghcb *boot_ghcb;
#undef __init
#define __init
-#undef __head
-#define __head
-
#define __BOOT_COMPRESSED
-extern struct svsm_ca *boot_svsm_caa;
-extern u64 boot_svsm_caa_pa;
-
-struct svsm_ca *svsm_get_caa(void)
-{
- return boot_svsm_caa;
-}
-
-u64 svsm_get_caa_pa(void)
-{
- return boot_svsm_caa_pa;
-}
-
-int svsm_perform_call_protocol(struct svsm_call *call);
-
u8 snp_vmpl;
+u16 ghcb_version;
+
+u64 boot_svsm_caa_pa;
/* Include code for early handlers */
#include "../../boot/startup/sev-shared.c"
-int svsm_perform_call_protocol(struct svsm_call *call)
-{
- struct ghcb *ghcb;
- int ret;
-
- if (boot_ghcb)
- ghcb = boot_ghcb;
- else
- ghcb = NULL;
-
- do {
- ret = ghcb ? svsm_perform_ghcb_protocol(ghcb, call)
- : svsm_perform_msr_protocol(call);
- } while (ret == -EAGAIN);
-
- return ret;
-}
-
static bool sev_snp_enabled(void)
{
return sev_status & MSR_AMD64_SEV_SNP_ENABLED;
}
-static void __page_state_change(unsigned long paddr, enum psc_op op)
-{
- u64 val, msr;
-
- /*
- * If private -> shared then invalidate the page before requesting the
- * state change in the RMP table.
- */
- if (op == SNP_PAGE_STATE_SHARED)
- pvalidate_4k_page(paddr, paddr, false);
-
- /* Save the current GHCB MSR value */
- msr = sev_es_rd_ghcb_msr();
-
- /* Issue VMGEXIT to change the page state in RMP table. */
- sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op));
- VMGEXIT();
-
- /* Read the response of the VMGEXIT. */
- val = sev_es_rd_ghcb_msr();
- if ((GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) || GHCB_MSR_PSC_RESP_VAL(val))
- sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
-
- /* Restore the GHCB MSR value */
- sev_es_wr_ghcb_msr(msr);
-
- /*
- * Now that page state is changed in the RMP table, validate it so that it is
- * consistent with the RMP entry.
- */
- if (op == SNP_PAGE_STATE_PRIVATE)
- pvalidate_4k_page(paddr, paddr, true);
-}
-
void snp_set_page_private(unsigned long paddr)
{
+ struct psc_desc d = {
+ SNP_PAGE_STATE_PRIVATE,
+ (struct svsm_ca *)boot_svsm_caa_pa,
+ boot_svsm_caa_pa
+ };
+
if (!sev_snp_enabled())
return;
- __page_state_change(paddr, SNP_PAGE_STATE_PRIVATE);
+ __page_state_change(paddr, paddr, &d);
}
void snp_set_page_shared(unsigned long paddr)
{
+ struct psc_desc d = {
+ SNP_PAGE_STATE_SHARED,
+ (struct svsm_ca *)boot_svsm_caa_pa,
+ boot_svsm_caa_pa
+ };
+
if (!sev_snp_enabled())
return;
- __page_state_change(paddr, SNP_PAGE_STATE_SHARED);
+ __page_state_change(paddr, paddr, &d);
}
bool early_setup_ghcb(void)
@@ -152,8 +97,14 @@ bool early_setup_ghcb(void)
void snp_accept_memory(phys_addr_t start, phys_addr_t end)
{
+ struct psc_desc d = {
+ SNP_PAGE_STATE_PRIVATE,
+ (struct svsm_ca *)boot_svsm_caa_pa,
+ boot_svsm_caa_pa
+ };
+
for (phys_addr_t pa = start; pa < end; pa += PAGE_SIZE)
- __page_state_change(pa, SNP_PAGE_STATE_PRIVATE);
+ __page_state_change(pa, pa, &d);
}
void sev_es_shutdown_ghcb(void)
@@ -235,15 +186,23 @@ bool sev_es_check_ghcb_fault(unsigned long address)
MSR_AMD64_SNP_VMSA_REG_PROT | \
MSR_AMD64_SNP_RESERVED_BIT13 | \
MSR_AMD64_SNP_RESERVED_BIT15 | \
+ MSR_AMD64_SNP_SECURE_AVIC | \
MSR_AMD64_SNP_RESERVED_MASK)
+#ifdef CONFIG_AMD_SECURE_AVIC
+#define SNP_FEATURE_SECURE_AVIC MSR_AMD64_SNP_SECURE_AVIC
+#else
+#define SNP_FEATURE_SECURE_AVIC 0
+#endif
+
/*
* SNP_FEATURES_PRESENT is the mask of SNP features that are implemented
* by the guest kernel. As and when a new feature is implemented in the
* guest kernel, a corresponding bit should be added to the mask.
*/
#define SNP_FEATURES_PRESENT (MSR_AMD64_SNP_DEBUG_SWAP | \
- MSR_AMD64_SNP_SECURE_TSC)
+ MSR_AMD64_SNP_SECURE_TSC | \
+ SNP_FEATURE_SECURE_AVIC)
u64 snp_get_unsupported_features(u64 status)
{
@@ -347,7 +306,7 @@ static bool early_snp_init(struct boot_params *bp)
* running at VMPL0. The CA will be used to communicate with the
* SVSM and request its services.
*/
- svsm_setup_ca(cc_info);
+ svsm_setup_ca(cc_info, rip_rel_ptr(&boot_ghcb_page));
/*
* Pass run-time kernel a pointer to CC info via boot_params so EFI
@@ -391,6 +350,8 @@ static int sev_check_cpu_support(void)
if (!(eax & BIT(1)))
return -ENODEV;
+ sev_snp_needs_sfw = !(ebx & BIT(31));
+
return ebx & 0x3f;
}
@@ -453,30 +414,16 @@ void sev_enable(struct boot_params *bp)
*/
if (sev_status & MSR_AMD64_SEV_SNP_ENABLED) {
u64 hv_features;
- int ret;
hv_features = get_hv_features();
if (!(hv_features & GHCB_HV_FT_SNP))
sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
/*
- * Enforce running at VMPL0 or with an SVSM.
- *
- * Use RMPADJUST (see the rmpadjust() function for a description of
- * what the instruction does) to update the VMPL1 permissions of a
- * page. If the guest is running at VMPL0, this will succeed. If the
- * guest is running at any other VMPL, this will fail. Linux SNP guests
- * only ever run at a single VMPL level so permission mask changes of a
- * lesser-privileged VMPL are a don't-care.
- */
- ret = rmpadjust((unsigned long)&boot_ghcb_page, RMP_PG_SIZE_4K, 1);
-
- /*
- * Running at VMPL0 is not required if an SVSM is present and the hypervisor
- * supports the required SVSM GHCB events.
+ * Running at VMPL0 is required unless an SVSM is present and
+ * the hypervisor supports the required SVSM GHCB events.
*/
- if (ret &&
- !(snp_vmpl && (hv_features & GHCB_HV_FT_SNP_MULTI_VMPL)))
+ if (snp_vmpl && !(hv_features & GHCB_HV_FT_SNP_MULTI_VMPL))
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NOT_VMPL0);
}
@@ -550,7 +497,6 @@ bool early_is_sevsnp_guest(void)
/* Obtain the address of the calling area to use */
boot_rdmsr(MSR_SVSM_CAA, &m);
- boot_svsm_caa = (void *)m.q;
boot_svsm_caa_pa = m.q;
/*
diff --git a/arch/x86/boot/startup/Makefile b/arch/x86/boot/startup/Makefile
index b514f7e81332..e8fdf020b422 100644
--- a/arch/x86/boot/startup/Makefile
+++ b/arch/x86/boot/startup/Makefile
@@ -4,6 +4,7 @@ KBUILD_AFLAGS += -D__DISABLE_EXPORTS
KBUILD_CFLAGS += -D__DISABLE_EXPORTS -mcmodel=small -fPIC \
-Os -DDISABLE_BRANCH_PROFILING \
$(DISABLE_STACKLEAK_PLUGIN) \
+ $(DISABLE_LATENT_ENTROPY_PLUGIN) \
-fno-stack-protector -D__NO_FORTIFY \
-fno-jump-tables \
-include $(srctree)/include/linux/hidden.h
@@ -19,6 +20,7 @@ KCOV_INSTRUMENT := n
obj-$(CONFIG_X86_64) += gdt_idt.o map_kernel.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += sme.o sev-startup.o
+pi-objs := $(patsubst %.o,$(obj)/%.o,$(obj-y))
lib-$(CONFIG_X86_64) += la57toggle.o
lib-$(CONFIG_EFI_MIXED) += efi-mixed.o
@@ -28,3 +30,23 @@ lib-$(CONFIG_EFI_MIXED) += efi-mixed.o
# to be linked into the decompressor or the EFI stub but not vmlinux
#
$(patsubst %.o,$(obj)/%.o,$(lib-y)): OBJECT_FILES_NON_STANDARD := y
+
+#
+# Invoke objtool for each object individually to check for absolute
+# relocations, even if other objtool actions are being deferred.
+#
+$(pi-objs): objtool-enabled = 1
+$(pi-objs): objtool-args = $(if $(delay-objtool),,$(objtool-args-y)) --noabs
+
+#
+# Confine the startup code by prefixing all symbols with __pi_ (for position
+# independent). This ensures that startup code can only call other startup
+# code, or code that has explicitly been made accessible to it via a symbol
+# alias.
+#
+$(obj)/%.pi.o: OBJCOPYFLAGS := --prefix-symbols=__pi_
+$(obj)/%.pi.o: $(obj)/%.o FORCE
+ $(call if_changed,objcopy)
+
+targets += $(obj-y)
+obj-y := $(patsubst %.o,%.pi.o,$(obj-y))
diff --git a/arch/x86/boot/startup/exports.h b/arch/x86/boot/startup/exports.h
new file mode 100644
index 000000000000..01d2363dc445
--- /dev/null
+++ b/arch/x86/boot/startup/exports.h
@@ -0,0 +1,14 @@
+
+/*
+ * The symbols below are functions that are implemented by the startup code,
+ * but called at runtime by the SEV code residing in the core kernel.
+ */
+PROVIDE(early_set_pages_state = __pi_early_set_pages_state);
+PROVIDE(early_snp_set_memory_private = __pi_early_snp_set_memory_private);
+PROVIDE(early_snp_set_memory_shared = __pi_early_snp_set_memory_shared);
+PROVIDE(get_hv_features = __pi_get_hv_features);
+PROVIDE(sev_es_terminate = __pi_sev_es_terminate);
+PROVIDE(snp_cpuid = __pi_snp_cpuid);
+PROVIDE(snp_cpuid_get_table = __pi_snp_cpuid_get_table);
+PROVIDE(svsm_issue_call = __pi_svsm_issue_call);
+PROVIDE(svsm_process_result_codes = __pi_svsm_process_result_codes);
diff --git a/arch/x86/boot/startup/gdt_idt.c b/arch/x86/boot/startup/gdt_idt.c
index a3112a69b06a..d16102abdaec 100644
--- a/arch/x86/boot/startup/gdt_idt.c
+++ b/arch/x86/boot/startup/gdt_idt.c
@@ -24,7 +24,7 @@
static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data;
/* This may run while still in the direct mapping */
-void __head startup_64_load_idt(void *vc_handler)
+void startup_64_load_idt(void *vc_handler)
{
struct desc_ptr desc = {
.address = (unsigned long)rip_rel_ptr(bringup_idt_table),
@@ -46,7 +46,7 @@ void __head startup_64_load_idt(void *vc_handler)
/*
* Setup boot CPU state needed before kernel switches to virtual addresses.
*/
-void __head startup_64_setup_gdt_idt(void)
+void __init startup_64_setup_gdt_idt(void)
{
struct gdt_page *gp = rip_rel_ptr((void *)(__force unsigned long)&gdt_page);
void *handler = NULL;
diff --git a/arch/x86/boot/startup/map_kernel.c b/arch/x86/boot/startup/map_kernel.c
index 332dbe6688c4..83ba98d61572 100644
--- a/arch/x86/boot/startup/map_kernel.c
+++ b/arch/x86/boot/startup/map_kernel.c
@@ -30,7 +30,7 @@ static inline bool check_la57_support(void)
return true;
}
-static unsigned long __head sme_postprocess_startup(struct boot_params *bp,
+static unsigned long __init sme_postprocess_startup(struct boot_params *bp,
pmdval_t *pmd,
unsigned long p2v_offset)
{
@@ -84,7 +84,7 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp,
* the 1:1 mapping of memory. Kernel virtual addresses can be determined by
* subtracting p2v_offset from the RIP-relative address.
*/
-unsigned long __head __startup_64(unsigned long p2v_offset,
+unsigned long __init __startup_64(unsigned long p2v_offset,
struct boot_params *bp)
{
pmd_t (*early_pgts)[PTRS_PER_PMD] = rip_rel_ptr(early_dynamic_pgts);
diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c
index 7a706db87b93..4e22ffd73516 100644
--- a/arch/x86/boot/startup/sev-shared.c
+++ b/arch/x86/boot/startup/sev-shared.c
@@ -12,35 +12,12 @@
#include <asm/setup_data.h>
#ifndef __BOOT_COMPRESSED
-#define error(v) pr_err(v)
#define has_cpuflag(f) boot_cpu_has(f)
#else
#undef WARN
#define WARN(condition, format...) (!!(condition))
-#undef vc_forward_exception
-#define vc_forward_exception(c) panic("SNP: Hypervisor requested exception\n")
#endif
-/*
- * SVSM related information:
- * During boot, the page tables are set up as identity mapped and later
- * changed to use kernel virtual addresses. Maintain separate virtual and
- * physical addresses for the CAA to allow SVSM functions to be used during
- * early boot, both with identity mapped virtual addresses and proper kernel
- * virtual addresses.
- */
-struct svsm_ca *boot_svsm_caa __ro_after_init;
-u64 boot_svsm_caa_pa __ro_after_init;
-
-/*
- * Since feature negotiation related variables are set early in the boot
- * process they must reside in the .data section so as not to be zeroed
- * out when the .bss section is later cleared.
- *
- * GHCB protocol version negotiated with the hypervisor.
- */
-static u16 ghcb_version __ro_after_init;
-
/* Copy of the SNP firmware's CPUID page. */
static struct snp_cpuid_table cpuid_table_copy __ro_after_init;
@@ -54,17 +31,9 @@ static u32 cpuid_std_range_max __ro_after_init;
static u32 cpuid_hyp_range_max __ro_after_init;
static u32 cpuid_ext_range_max __ro_after_init;
-bool __init sev_es_check_cpu_features(void)
-{
- if (!has_cpuflag(X86_FEATURE_RDRAND)) {
- error("RDRAND instruction not supported - no trusted source of randomness available\n");
- return false;
- }
-
- return true;
-}
+bool sev_snp_needs_sfw;
-void __head __noreturn
+void __noreturn
sev_es_terminate(unsigned int set, unsigned int reason)
{
u64 val = GHCB_MSR_TERM_REQ;
@@ -83,7 +52,7 @@ sev_es_terminate(unsigned int set, unsigned int reason)
/*
* The hypervisor features are available from GHCB version 2 onward.
*/
-u64 get_hv_features(void)
+u64 __init get_hv_features(void)
{
u64 val;
@@ -100,72 +69,7 @@ u64 get_hv_features(void)
return GHCB_MSR_HV_FT_RESP_VAL(val);
}
-void snp_register_ghcb_early(unsigned long paddr)
-{
- unsigned long pfn = paddr >> PAGE_SHIFT;
- u64 val;
-
- sev_es_wr_ghcb_msr(GHCB_MSR_REG_GPA_REQ_VAL(pfn));
- VMGEXIT();
-
- val = sev_es_rd_ghcb_msr();
-
- /* If the response GPA is not ours then abort the guest */
- if ((GHCB_RESP_CODE(val) != GHCB_MSR_REG_GPA_RESP) ||
- (GHCB_MSR_REG_GPA_RESP_VAL(val) != pfn))
- sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER);
-}
-
-bool sev_es_negotiate_protocol(void)
-{
- u64 val;
-
- /* Do the GHCB protocol version negotiation */
- sev_es_wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ);
- VMGEXIT();
- val = sev_es_rd_ghcb_msr();
-
- if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP)
- return false;
-
- if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN ||
- GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX)
- return false;
-
- ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val), GHCB_PROTOCOL_MAX);
-
- return true;
-}
-
-static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
-{
- u32 ret;
-
- ret = ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0);
- if (!ret)
- return ES_OK;
-
- if (ret == 1) {
- u64 info = ghcb->save.sw_exit_info_2;
- unsigned long v = info & SVM_EVTINJ_VEC_MASK;
-
- /* Check if exception information from hypervisor is sane. */
- if ((info & SVM_EVTINJ_VALID) &&
- ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) &&
- ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) {
- ctxt->fi.vector = v;
-
- if (info & SVM_EVTINJ_VALID_ERR)
- ctxt->fi.error_code = info >> 32;
-
- return ES_EXCEPTION;
- }
- }
-
- return ES_VMM_ERROR;
-}
-
-static inline int svsm_process_result_codes(struct svsm_call *call)
+int svsm_process_result_codes(struct svsm_call *call)
{
switch (call->rax_out) {
case SVSM_SUCCESS:
@@ -193,7 +97,7 @@ static inline int svsm_process_result_codes(struct svsm_call *call)
* - RAX specifies the SVSM protocol/callid as input and the return code
* as output.
*/
-static __always_inline void svsm_issue_call(struct svsm_call *call, u8 *pending)
+void svsm_issue_call(struct svsm_call *call, u8 *pending)
{
register unsigned long rax asm("rax") = call->rax;
register unsigned long rcx asm("rcx") = call->rcx;
@@ -216,7 +120,7 @@ static __always_inline void svsm_issue_call(struct svsm_call *call, u8 *pending)
call->r9_out = r9;
}
-static int svsm_perform_msr_protocol(struct svsm_call *call)
+int svsm_perform_msr_protocol(struct svsm_call *call)
{
u8 pending = 0;
u64 val, resp;
@@ -247,63 +151,6 @@ static int svsm_perform_msr_protocol(struct svsm_call *call)
return svsm_process_result_codes(call);
}
-static int svsm_perform_ghcb_protocol(struct ghcb *ghcb, struct svsm_call *call)
-{
- struct es_em_ctxt ctxt;
- u8 pending = 0;
-
- vc_ghcb_invalidate(ghcb);
-
- /*
- * Fill in protocol and format specifiers. This can be called very early
- * in the boot, so use rip-relative references as needed.
- */
- ghcb->protocol_version = ghcb_version;
- ghcb->ghcb_usage = GHCB_DEFAULT_USAGE;
-
- ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_SNP_RUN_VMPL);
- ghcb_set_sw_exit_info_1(ghcb, 0);
- ghcb_set_sw_exit_info_2(ghcb, 0);
-
- sev_es_wr_ghcb_msr(__pa(ghcb));
-
- svsm_issue_call(call, &pending);
-
- if (pending)
- return -EINVAL;
-
- switch (verify_exception_info(ghcb, &ctxt)) {
- case ES_OK:
- break;
- case ES_EXCEPTION:
- vc_forward_exception(&ctxt);
- fallthrough;
- default:
- return -EINVAL;
- }
-
- return svsm_process_result_codes(call);
-}
-
-enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt,
- u64 exit_code, u64 exit_info_1,
- u64 exit_info_2)
-{
- /* Fill in protocol and format specifiers */
- ghcb->protocol_version = ghcb_version;
- ghcb->ghcb_usage = GHCB_DEFAULT_USAGE;
-
- ghcb_set_sw_exit_code(ghcb, exit_code);
- ghcb_set_sw_exit_info_1(ghcb, exit_info_1);
- ghcb_set_sw_exit_info_2(ghcb, exit_info_2);
-
- sev_es_wr_ghcb_msr(__pa(ghcb));
- VMGEXIT();
-
- return verify_exception_info(ghcb, ctxt);
-}
-
static int __sev_cpuid_hv(u32 fn, int reg_idx, u32 *reg)
{
u64 val;
@@ -342,44 +189,7 @@ static int __sev_cpuid_hv_msr(struct cpuid_leaf *leaf)
return ret;
}
-static int __sev_cpuid_hv_ghcb(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
-{
- u32 cr4 = native_read_cr4();
- int ret;
-
- ghcb_set_rax(ghcb, leaf->fn);
- ghcb_set_rcx(ghcb, leaf->subfn);
-
- if (cr4 & X86_CR4_OSXSAVE)
- /* Safe to read xcr0 */
- ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK));
- else
- /* xgetbv will cause #UD - use reset value for xcr0 */
- ghcb_set_xcr0(ghcb, 1);
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0);
- if (ret != ES_OK)
- return ret;
-
- if (!(ghcb_rax_is_valid(ghcb) &&
- ghcb_rbx_is_valid(ghcb) &&
- ghcb_rcx_is_valid(ghcb) &&
- ghcb_rdx_is_valid(ghcb)))
- return ES_VMM_ERROR;
-
- leaf->eax = ghcb->save.rax;
- leaf->ebx = ghcb->save.rbx;
- leaf->ecx = ghcb->save.rcx;
- leaf->edx = ghcb->save.rdx;
-
- return ES_OK;
-}
-
-static int sev_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
-{
- return ghcb ? __sev_cpuid_hv_ghcb(ghcb, ctxt, leaf)
- : __sev_cpuid_hv_msr(leaf);
-}
/*
* This may be called early while still running on the initial identity
@@ -412,7 +222,7 @@ const struct snp_cpuid_table *snp_cpuid_get_table(void)
*
* Return: XSAVE area size on success, 0 otherwise.
*/
-static u32 __head snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted)
+static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted)
{
const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
u64 xfeatures_found = 0;
@@ -448,7 +258,7 @@ static u32 __head snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted)
return xsave_size;
}
-static bool __head
+static bool
snp_cpuid_get_validated_func(struct cpuid_leaf *leaf)
{
const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
@@ -484,21 +294,21 @@ snp_cpuid_get_validated_func(struct cpuid_leaf *leaf)
return false;
}
-static void snp_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
+static void snp_cpuid_hv_msr(void *ctx, struct cpuid_leaf *leaf)
{
- if (sev_cpuid_hv(ghcb, ctxt, leaf))
+ if (__sev_cpuid_hv_msr(leaf))
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV);
}
-static int __head
-snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
- struct cpuid_leaf *leaf)
+static int
+snp_cpuid_postprocess(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf),
+ void *ctx, struct cpuid_leaf *leaf)
{
struct cpuid_leaf leaf_hv = *leaf;
switch (leaf->fn) {
case 0x1:
- snp_cpuid_hv(ghcb, ctxt, &leaf_hv);
+ cpuid_fn(ctx, &leaf_hv);
/* initial APIC ID */
leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0));
@@ -517,7 +327,7 @@ snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
break;
case 0xB:
leaf_hv.subfn = 0;
- snp_cpuid_hv(ghcb, ctxt, &leaf_hv);
+ cpuid_fn(ctx, &leaf_hv);
/* extended APIC ID */
leaf->edx = leaf_hv.edx;
@@ -565,7 +375,7 @@ snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
}
break;
case 0x8000001E:
- snp_cpuid_hv(ghcb, ctxt, &leaf_hv);
+ cpuid_fn(ctx, &leaf_hv);
/* extended APIC ID */
leaf->eax = leaf_hv.eax;
@@ -586,8 +396,8 @@ snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
* Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value
* should be treated as fatal by caller.
*/
-int __head
-snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
+int snp_cpuid(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf),
+ void *ctx, struct cpuid_leaf *leaf)
{
const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
@@ -621,7 +431,7 @@ snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
return 0;
}
- return snp_cpuid_postprocess(ghcb, ctxt, leaf);
+ return snp_cpuid_postprocess(cpuid_fn, ctx, leaf);
}
/*
@@ -629,7 +439,7 @@ snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
* page yet, so it only supports the MSR based communication with the
* hypervisor and only the CPUID exit-code.
*/
-void __head do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
+void do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
{
unsigned int subfn = lower_bits(regs->cx, 32);
unsigned int fn = lower_bits(regs->ax, 32);
@@ -648,13 +458,24 @@ void __head do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
leaf.fn = fn;
leaf.subfn = subfn;
- ret = snp_cpuid(NULL, NULL, &leaf);
+ /*
+ * If SNP is active, then snp_cpuid() uses the CPUID table to obtain the
+ * CPUID values (with possible HV interaction during post-processing of
+ * the values). But if SNP is not active (no CPUID table present), then
+ * snp_cpuid() returns -EOPNOTSUPP so that an SEV-ES guest can call the
+ * HV to obtain the CPUID information.
+ */
+ ret = snp_cpuid(snp_cpuid_hv_msr, NULL, &leaf);
if (!ret)
goto cpuid_done;
if (ret != -EOPNOTSUPP)
goto fail;
+ /*
+ * This is reached by an SEV-ES guest and needs to invoke the HV for
+ * the CPUID data.
+ */
if (__sev_cpuid_hv_msr(&leaf))
goto fail;
@@ -705,7 +526,7 @@ struct cc_setup_data {
* Search for a Confidential Computing blob passed in as a setup_data entry
* via the Linux Boot Protocol.
*/
-static __head
+static __init
struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp)
{
struct cc_setup_data *sd = NULL;
@@ -733,7 +554,7 @@ struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp)
* mapping needs to be updated in sync with all the changes to virtual memory
* layout and related mapping facilities throughout the boot process.
*/
-static void __head setup_cpuid_table(const struct cc_blob_sev_info *cc_info)
+static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info)
{
const struct snp_cpuid_table *cpuid_table_fw, *cpuid_table;
int i;
@@ -761,13 +582,24 @@ static void __head setup_cpuid_table(const struct cc_blob_sev_info *cc_info)
}
}
-static void __head svsm_pval_4k_page(unsigned long paddr, bool validate)
+static int svsm_call_msr_protocol(struct svsm_call *call)
+{
+ int ret;
+
+ do {
+ ret = svsm_perform_msr_protocol(call);
+ } while (ret == -EAGAIN);
+
+ return ret;
+}
+
+static void svsm_pval_4k_page(unsigned long paddr, bool validate,
+ struct svsm_ca *caa, u64 caa_pa)
{
struct svsm_pvalidate_call *pc;
struct svsm_call call = {};
unsigned long flags;
u64 pc_pa;
- int ret;
/*
* This can be called very early in the boot, use native functions in
@@ -775,48 +607,96 @@ static void __head svsm_pval_4k_page(unsigned long paddr, bool validate)
*/
flags = native_local_irq_save();
- call.caa = svsm_get_caa();
+ call.caa = caa;
pc = (struct svsm_pvalidate_call *)call.caa->svsm_buffer;
- pc_pa = svsm_get_caa_pa() + offsetof(struct svsm_ca, svsm_buffer);
+ pc_pa = caa_pa + offsetof(struct svsm_ca, svsm_buffer);
pc->num_entries = 1;
pc->cur_index = 0;
pc->entry[0].page_size = RMP_PG_SIZE_4K;
pc->entry[0].action = validate;
pc->entry[0].ignore_cf = 0;
+ pc->entry[0].rsvd = 0;
pc->entry[0].pfn = paddr >> PAGE_SHIFT;
/* Protocol 0, Call ID 1 */
call.rax = SVSM_CORE_CALL(SVSM_CORE_PVALIDATE);
call.rcx = pc_pa;
- ret = svsm_perform_call_protocol(&call);
- if (ret)
+ /*
+ * Use the MSR protocol exclusively, so that this code is usable in
+ * startup code where VA/PA translations of the GHCB page's address may
+ * be problematic.
+ */
+ if (svsm_call_msr_protocol(&call))
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
native_local_irq_restore(flags);
}
-static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr,
- bool validate)
+static void pvalidate_4k_page(unsigned long vaddr, unsigned long paddr,
+ bool validate, struct svsm_ca *caa, u64 caa_pa)
{
int ret;
if (snp_vmpl) {
- svsm_pval_4k_page(paddr, validate);
+ svsm_pval_4k_page(paddr, validate, caa, caa_pa);
} else {
ret = pvalidate(vaddr, RMP_PG_SIZE_4K, validate);
if (ret)
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
}
+
+ /*
+ * If validating memory (making it private) and affected by the
+ * cache-coherency vulnerability, perform the cache eviction mitigation.
+ */
+ if (validate && sev_snp_needs_sfw)
+ sev_evict_cache((void *)vaddr, 1);
+}
+
+static void __page_state_change(unsigned long vaddr, unsigned long paddr,
+ const struct psc_desc *desc)
+{
+ u64 val, msr;
+
+ /*
+ * If private -> shared then invalidate the page before requesting the
+ * state change in the RMP table.
+ */
+ if (desc->op == SNP_PAGE_STATE_SHARED)
+ pvalidate_4k_page(vaddr, paddr, false, desc->ca, desc->caa_pa);
+
+ /* Save the current GHCB MSR value */
+ msr = sev_es_rd_ghcb_msr();
+
+ /* Issue VMGEXIT to change the page state in RMP table. */
+ sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, desc->op));
+ VMGEXIT();
+
+ /* Read the response of the VMGEXIT. */
+ val = sev_es_rd_ghcb_msr();
+ if ((GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) || GHCB_MSR_PSC_RESP_VAL(val))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
+
+ /* Restore the GHCB MSR value */
+ sev_es_wr_ghcb_msr(msr);
+
+ /*
+ * Now that the page state has been changed in the RMP table, validate it so
+ * that it is consistent with the RMP entry.
+ */
+ if (desc->op == SNP_PAGE_STATE_PRIVATE)
+ pvalidate_4k_page(vaddr, paddr, true, desc->ca, desc->caa_pa);
}
/*
* Maintain the GPA of the SVSM Calling Area (CA) in order to utilize the SVSM
* services needed when not running in VMPL0.
*/
-static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info)
+static bool __init svsm_setup_ca(const struct cc_blob_sev_info *cc_info,
+ void *page)
{
struct snp_secrets_page *secrets_page;
struct snp_cpuid_table *cpuid_table;
@@ -839,7 +719,7 @@ static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info)
* routine is running identity mapped when called, both by the decompressor
* code and the early kernel code.
*/
- if (!rmpadjust((unsigned long)rip_rel_ptr(&boot_ghcb_page), RMP_PG_SIZE_4K, 1))
+ if (!rmpadjust((unsigned long)page, RMP_PG_SIZE_4K, 1))
return false;
/*
@@ -867,11 +747,6 @@ static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info)
if (caa & (PAGE_SIZE - 1))
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CAA);
- /*
- * The CA is identity mapped when this routine is called, both by the
- * decompressor code and the early kernel code.
- */
- boot_svsm_caa = (struct svsm_ca *)caa;
boot_svsm_caa_pa = caa;
/* Advertise the SVSM presence via CPUID. */
diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c
index 0b7e3b950183..09725428d3e6 100644
--- a/arch/x86/boot/startup/sev-startup.c
+++ b/arch/x86/boot/startup/sev-startup.c
@@ -41,143 +41,14 @@
#include <asm/cpuid/api.h>
#include <asm/cmdline.h>
-/* For early boot hypervisor communication in SEV-ES enabled guests */
-struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);
-
-/*
- * Needs to be in the .data section because we need it NULL before bss is
- * cleared
- */
-struct ghcb *boot_ghcb __section(".data");
-
-/* Bitmap of SEV features supported by the hypervisor */
-u64 sev_hv_features __ro_after_init;
-
-/* Secrets page physical address from the CC blob */
-u64 sev_secrets_pa __ro_after_init;
-
-/* For early boot SVSM communication */
-struct svsm_ca boot_svsm_ca_page __aligned(PAGE_SIZE);
-
-DEFINE_PER_CPU(struct svsm_ca *, svsm_caa);
-DEFINE_PER_CPU(u64, svsm_caa_pa);
-
-/*
- * Nothing shall interrupt this code path while holding the per-CPU
- * GHCB. The backup GHCB is only for NMIs interrupting this path.
- *
- * Callers must disable local interrupts around it.
- */
-noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state)
-{
- struct sev_es_runtime_data *data;
- struct ghcb *ghcb;
-
- WARN_ON(!irqs_disabled());
-
- data = this_cpu_read(runtime_data);
- ghcb = &data->ghcb_page;
-
- if (unlikely(data->ghcb_active)) {
- /* GHCB is already in use - save its contents */
-
- if (unlikely(data->backup_ghcb_active)) {
- /*
- * Backup-GHCB is also already in use. There is no way
- * to continue here so just kill the machine. To make
- * panic() work, mark GHCBs inactive so that messages
- * can be printed out.
- */
- data->ghcb_active = false;
- data->backup_ghcb_active = false;
-
- instrumentation_begin();
- panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
- instrumentation_end();
- }
-
- /* Mark backup_ghcb active before writing to it */
- data->backup_ghcb_active = true;
-
- state->ghcb = &data->backup_ghcb;
-
- /* Backup GHCB content */
- *state->ghcb = *ghcb;
- } else {
- state->ghcb = NULL;
- data->ghcb_active = true;
- }
-
- return ghcb;
-}
-
/* Include code shared with pre-decompression boot stage */
#include "sev-shared.c"
-noinstr void __sev_put_ghcb(struct ghcb_state *state)
-{
- struct sev_es_runtime_data *data;
- struct ghcb *ghcb;
-
- WARN_ON(!irqs_disabled());
-
- data = this_cpu_read(runtime_data);
- ghcb = &data->ghcb_page;
-
- if (state->ghcb) {
- /* Restore GHCB from Backup */
- *ghcb = *state->ghcb;
- data->backup_ghcb_active = false;
- state->ghcb = NULL;
- } else {
- /*
- * Invalidate the GHCB so a VMGEXIT instruction issued
- * from userspace won't appear to be valid.
- */
- vc_ghcb_invalidate(ghcb);
- data->ghcb_active = false;
- }
-}
-
-int svsm_perform_call_protocol(struct svsm_call *call)
-{
- struct ghcb_state state;
- unsigned long flags;
- struct ghcb *ghcb;
- int ret;
-
- /*
- * This can be called very early in the boot, use native functions in
- * order to avoid paravirt issues.
- */
- flags = native_local_irq_save();
-
- if (sev_cfg.ghcbs_initialized)
- ghcb = __sev_get_ghcb(&state);
- else if (boot_ghcb)
- ghcb = boot_ghcb;
- else
- ghcb = NULL;
-
- do {
- ret = ghcb ? svsm_perform_ghcb_protocol(ghcb, call)
- : svsm_perform_msr_protocol(call);
- } while (ret == -EAGAIN);
-
- if (sev_cfg.ghcbs_initialized)
- __sev_put_ghcb(&state);
-
- native_local_irq_restore(flags);
-
- return ret;
-}
-
-void __head
+void
early_set_pages_state(unsigned long vaddr, unsigned long paddr,
- unsigned long npages, enum psc_op op)
+ unsigned long npages, const struct psc_desc *desc)
{
unsigned long paddr_end;
- u64 val;
vaddr = vaddr & PAGE_MASK;
@@ -185,42 +56,22 @@ early_set_pages_state(unsigned long vaddr, unsigned long paddr,
paddr_end = paddr + (npages << PAGE_SHIFT);
while (paddr < paddr_end) {
- /* Page validation must be rescinded before changing to shared */
- if (op == SNP_PAGE_STATE_SHARED)
- pvalidate_4k_page(vaddr, paddr, false);
-
- /*
- * Use the MSR protocol because this function can be called before
- * the GHCB is established.
- */
- sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op));
- VMGEXIT();
-
- val = sev_es_rd_ghcb_msr();
-
- if (GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP)
- goto e_term;
-
- if (GHCB_MSR_PSC_RESP_VAL(val))
- goto e_term;
-
- /* Page validation must be performed after changing to private */
- if (op == SNP_PAGE_STATE_PRIVATE)
- pvalidate_4k_page(vaddr, paddr, true);
+ __page_state_change(vaddr, paddr, desc);
vaddr += PAGE_SIZE;
paddr += PAGE_SIZE;
}
-
- return;
-
-e_term:
- sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
}
-void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
+void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
unsigned long npages)
{
+ struct psc_desc d = {
+ SNP_PAGE_STATE_PRIVATE,
+ rip_rel_ptr(&boot_svsm_ca_page),
+ boot_svsm_caa_pa
+ };
+
/*
* This can be invoked in early boot while running identity mapped, so
* use an open coded check for SNP instead of using cc_platform_has().
@@ -234,12 +85,18 @@ void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long padd
* Ask the hypervisor to mark the memory pages as private in the RMP
* table.
*/
- early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE);
+ early_set_pages_state(vaddr, paddr, npages, &d);
}
-void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
+void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
unsigned long npages)
{
+ struct psc_desc d = {
+ SNP_PAGE_STATE_SHARED,
+ rip_rel_ptr(&boot_svsm_ca_page),
+ boot_svsm_caa_pa
+ };
+
/*
* This can be invoked in early boot while running identity mapped, so
* use an open coded check for SNP instead of using cc_platform_has().
@@ -250,7 +107,7 @@ void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr
return;
/* Ask hypervisor to mark the memory pages shared in the RMP table. */
- early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED);
+ early_set_pages_state(vaddr, paddr, npages, &d);
}
/*
@@ -266,7 +123,7 @@ void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr
*
* Scan for the blob in that order.
*/
-static __head struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
+static struct cc_blob_sev_info *__init find_cc_blob(struct boot_params *bp)
{
struct cc_blob_sev_info *cc_info;
@@ -287,15 +144,15 @@ static __head struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
found_cc_info:
if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC)
- snp_abort();
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
return cc_info;
}
-static __head void svsm_setup(struct cc_blob_sev_info *cc_info)
+static void __init svsm_setup(struct cc_blob_sev_info *cc_info)
{
+ struct snp_secrets_page *secrets = (void *)cc_info->secrets_phys;
struct svsm_call call = {};
- int ret;
u64 pa;
/*
@@ -303,7 +160,7 @@ static __head void svsm_setup(struct cc_blob_sev_info *cc_info)
* running at VMPL0. The CA will be used to communicate with the
* SVSM to perform the SVSM services.
*/
- if (!svsm_setup_ca(cc_info))
+ if (!svsm_setup_ca(cc_info, rip_rel_ptr(&boot_svsm_ca_page)))
return;
/*
@@ -315,25 +172,25 @@ static __head void svsm_setup(struct cc_blob_sev_info *cc_info)
pa = (u64)rip_rel_ptr(&boot_svsm_ca_page);
/*
- * Switch over to the boot SVSM CA while the current CA is still
- * addressable. There is no GHCB at this point so use the MSR protocol.
+ * Switch over to the boot SVSM CA while the current CA is still 1:1
+ * mapped and thus addressable with VA == PA. There is no GHCB at this
+ * point so use the MSR protocol.
*
* SVSM_CORE_REMAP_CA call:
* RAX = 0 (Protocol=0, CallID=0)
* RCX = New CA GPA
*/
- call.caa = svsm_get_caa();
+ call.caa = (struct svsm_ca *)secrets->svsm_caa;
call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA);
call.rcx = pa;
- ret = svsm_perform_call_protocol(&call);
- if (ret)
+
+ if (svsm_call_msr_protocol(&call))
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CA_REMAP_FAIL);
- boot_svsm_caa = (struct svsm_ca *)pa;
boot_svsm_caa_pa = pa;
}
-bool __head snp_init(struct boot_params *bp)
+bool __init snp_init(struct boot_params *bp)
{
struct cc_blob_sev_info *cc_info;
@@ -361,8 +218,3 @@ bool __head snp_init(struct boot_params *bp)
return true;
}
-
-void __head __noreturn snp_abort(void)
-{
- sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
-}
diff --git a/arch/x86/boot/startup/sme.c b/arch/x86/boot/startup/sme.c
index 70ea1748c0a7..e7ea65f3f1d6 100644
--- a/arch/x86/boot/startup/sme.c
+++ b/arch/x86/boot/startup/sme.c
@@ -91,7 +91,7 @@ struct sme_populate_pgd_data {
*/
static char sme_workarea[2 * PMD_SIZE] __section(".init.scratch");
-static void __head sme_clear_pgd(struct sme_populate_pgd_data *ppd)
+static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
{
unsigned long pgd_start, pgd_end, pgd_size;
pgd_t *pgd_p;
@@ -106,7 +106,7 @@ static void __head sme_clear_pgd(struct sme_populate_pgd_data *ppd)
memset(pgd_p, 0, pgd_size);
}
-static pud_t __head *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
+static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -143,7 +143,7 @@ static pud_t __head *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
return pud;
}
-static void __head sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
+static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
{
pud_t *pud;
pmd_t *pmd;
@@ -159,7 +159,7 @@ static void __head sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags));
}
-static void __head sme_populate_pgd(struct sme_populate_pgd_data *ppd)
+static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd)
{
pud_t *pud;
pmd_t *pmd;
@@ -185,7 +185,7 @@ static void __head sme_populate_pgd(struct sme_populate_pgd_data *ppd)
set_pte(pte, __pte(ppd->paddr | ppd->pte_flags));
}
-static void __head __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
+static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
{
while (ppd->vaddr < ppd->vaddr_end) {
sme_populate_pgd_large(ppd);
@@ -195,7 +195,7 @@ static void __head __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
}
}
-static void __head __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
+static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
{
while (ppd->vaddr < ppd->vaddr_end) {
sme_populate_pgd(ppd);
@@ -205,7 +205,7 @@ static void __head __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
}
}
-static void __head __sme_map_range(struct sme_populate_pgd_data *ppd,
+static void __init __sme_map_range(struct sme_populate_pgd_data *ppd,
pmdval_t pmd_flags, pteval_t pte_flags)
{
unsigned long vaddr_end;
@@ -229,22 +229,22 @@ static void __head __sme_map_range(struct sme_populate_pgd_data *ppd,
__sme_map_range_pte(ppd);
}
-static void __head sme_map_range_encrypted(struct sme_populate_pgd_data *ppd)
+static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd)
{
__sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC);
}
-static void __head sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
+static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
{
__sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC);
}
-static void __head sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd)
+static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd)
{
__sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP);
}
-static unsigned long __head sme_pgtable_calc(unsigned long len)
+static unsigned long __init sme_pgtable_calc(unsigned long len)
{
unsigned long entries = 0, tables = 0;
@@ -281,7 +281,7 @@ static unsigned long __head sme_pgtable_calc(unsigned long len)
return entries + tables;
}
-void __head sme_encrypt_kernel(struct boot_params *bp)
+void __init sme_encrypt_kernel(struct boot_params *bp)
{
unsigned long workarea_start, workarea_end, workarea_len;
unsigned long execute_start, execute_end, execute_len;
@@ -485,7 +485,7 @@ void __head sme_encrypt_kernel(struct boot_params *bp)
native_write_cr3(__native_read_cr3());
}
-void __head sme_enable(struct boot_params *bp)
+void __init sme_enable(struct boot_params *bp)
{
unsigned int eax, ebx, ecx, edx;
unsigned long feature_mask;
@@ -521,6 +521,7 @@ void __head sme_enable(struct boot_params *bp)
return;
me_mask = 1UL << (ebx & 0x3f);
+ sev_snp_needs_sfw = !(ebx & BIT(31));
/* Check the SEV MSR whether SEV or SME is enabled */
sev_status = msr = native_rdmsrq(MSR_AMD64_SEV);
@@ -531,7 +532,7 @@ void __head sme_enable(struct boot_params *bp)
* enablement abort the guest.
*/
if (snp_en ^ !!(msr & MSR_AMD64_SEV_SNP_ENABLED))
- snp_abort();
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
/* Check if memory encryption is enabled */
if (feature_mask == AMD_SME_BIT) {
@@ -567,7 +568,6 @@ void __head sme_enable(struct boot_params *bp)
#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
/* Local version for startup code, which never operates on user page tables */
-__weak
pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
{
return pgd;
diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c
index d4610af68114..989ca9f72ba3 100644
--- a/arch/x86/coco/core.c
+++ b/arch/x86/coco/core.c
@@ -104,6 +104,9 @@ static bool noinstr amd_cc_platform_has(enum cc_attr attr)
case CC_ATTR_HOST_SEV_SNP:
return cc_flags.host_sev_snp;
+ case CC_ATTR_SNP_SECURE_AVIC:
+ return sev_status & MSR_AMD64_SNP_SECURE_AVIC;
+
default:
return false;
}
diff --git a/arch/x86/coco/sev/Makefile b/arch/x86/coco/sev/Makefile
index 342d79f0ab6a..3b8ae214a6a6 100644
--- a/arch/x86/coco/sev/Makefile
+++ b/arch/x86/coco/sev/Makefile
@@ -1,10 +1,10 @@
# SPDX-License-Identifier: GPL-2.0
-obj-y += core.o sev-nmi.o vc-handle.o
+obj-y += core.o noinstr.o vc-handle.o
# Clang 14 and older may fail to respect __no_sanitize_undefined when inlining
-UBSAN_SANITIZE_sev-nmi.o := n
+UBSAN_SANITIZE_noinstr.o := n
# GCC may fail to respect __no_sanitize_address or __no_kcsan when inlining
-KASAN_SANITIZE_sev-nmi.o := n
-KCSAN_SANITIZE_sev-nmi.o := n
+KASAN_SANITIZE_noinstr.o := n
+KCSAN_SANITIZE_noinstr.o := n
diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index fc59ce78c477..9ae3b11754e6 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -46,6 +46,48 @@
#include <asm/cmdline.h>
#include <asm/msr.h>
+/* Bitmap of SEV features supported by the hypervisor */
+u64 sev_hv_features __ro_after_init;
+SYM_PIC_ALIAS(sev_hv_features);
+
+/* Secrets page physical address from the CC blob */
+u64 sev_secrets_pa __ro_after_init;
+SYM_PIC_ALIAS(sev_secrets_pa);
+
+/* For early boot SVSM communication */
+struct svsm_ca boot_svsm_ca_page __aligned(PAGE_SIZE);
+SYM_PIC_ALIAS(boot_svsm_ca_page);
+
+/*
+ * SVSM related information:
+ * During boot, the page tables are set up as identity mapped and later
+ * changed to use kernel virtual addresses. Maintain separate virtual and
+ * physical addresses for the CAA to allow SVSM functions to be used during
+ * early boot, both with identity mapped virtual addresses and proper kernel
+ * virtual addresses.
+ */
+u64 boot_svsm_caa_pa __ro_after_init;
+SYM_PIC_ALIAS(boot_svsm_caa_pa);
+
+DEFINE_PER_CPU(struct svsm_ca *, svsm_caa);
+DEFINE_PER_CPU(u64, svsm_caa_pa);
+
+static inline struct svsm_ca *svsm_get_caa(void)
+{
+ if (sev_cfg.use_cas)
+ return this_cpu_read(svsm_caa);
+ else
+ return rip_rel_ptr(&boot_svsm_ca_page);
+}
+
+static inline u64 svsm_get_caa_pa(void)
+{
+ if (sev_cfg.use_cas)
+ return this_cpu_read(svsm_caa_pa);
+ else
+ return boot_svsm_caa_pa;
+}
+
/* AP INIT values as documented in the APM2 section "Processor Initialization State" */
#define AP_INIT_CS_LIMIT 0xffff
#define AP_INIT_DS_LIMIT 0xffff
@@ -79,6 +121,7 @@ static const char * const sev_status_feat_names[] = {
[MSR_AMD64_SNP_IBS_VIRT_BIT] = "IBSVirt",
[MSR_AMD64_SNP_VMSA_REG_PROT_BIT] = "VMSARegProt",
[MSR_AMD64_SNP_SMT_PROT_BIT] = "SMTProt",
+ [MSR_AMD64_SNP_SECURE_AVIC_BIT] = "SecureAVIC",
};
/*
@@ -100,6 +143,26 @@ DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa);
*/
u8 snp_vmpl __ro_after_init;
EXPORT_SYMBOL_GPL(snp_vmpl);
+SYM_PIC_ALIAS(snp_vmpl);
+
+/*
+ * Since feature-negotiation-related variables are set early in the boot
+ * process, they must reside in the .data section so as not to be zeroed
+ * out when the .bss section is later cleared.
+ *
+ * GHCB protocol version negotiated with the hypervisor.
+ */
+u16 ghcb_version __ro_after_init;
+SYM_PIC_ALIAS(ghcb_version);
+
+/* For early boot hypervisor communication in SEV-ES enabled guests */
+static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);
+
+/*
+ * Needs to be in the .data section because we need it NULL before bss is
+ * cleared
+ */
+struct ghcb *boot_ghcb __section(".data");
static u64 __init get_snp_jump_table_addr(void)
{
@@ -154,6 +217,73 @@ static u64 __init get_jump_table_addr(void)
return ret;
}
+static int svsm_perform_ghcb_protocol(struct ghcb *ghcb, struct svsm_call *call)
+{
+ struct es_em_ctxt ctxt;
+ u8 pending = 0;
+
+ vc_ghcb_invalidate(ghcb);
+
+ /*
+ * Fill in protocol and format specifiers. This can be called very early
+ * in the boot, so use rip-relative references as needed.
+ */
+ ghcb->protocol_version = ghcb_version;
+ ghcb->ghcb_usage = GHCB_DEFAULT_USAGE;
+
+ ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_SNP_RUN_VMPL);
+ ghcb_set_sw_exit_info_1(ghcb, 0);
+ ghcb_set_sw_exit_info_2(ghcb, 0);
+
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+
+ svsm_issue_call(call, &pending);
+
+ if (pending)
+ return -EINVAL;
+
+ switch (verify_exception_info(ghcb, &ctxt)) {
+ case ES_OK:
+ break;
+ case ES_EXCEPTION:
+ vc_forward_exception(&ctxt);
+ fallthrough;
+ default:
+ return -EINVAL;
+ }
+
+ return svsm_process_result_codes(call);
+}
+
+static int svsm_perform_call_protocol(struct svsm_call *call)
+{
+ struct ghcb_state state;
+ unsigned long flags;
+ struct ghcb *ghcb;
+ int ret;
+
+ flags = native_local_irq_save();
+
+ if (sev_cfg.ghcbs_initialized)
+ ghcb = __sev_get_ghcb(&state);
+ else if (boot_ghcb)
+ ghcb = boot_ghcb;
+ else
+ ghcb = NULL;
+
+ do {
+ ret = ghcb ? svsm_perform_ghcb_protocol(ghcb, call)
+ : __pi_svsm_perform_msr_protocol(call);
+ } while (ret == -EAGAIN);
+
+ if (sev_cfg.ghcbs_initialized)
+ __sev_put_ghcb(&state);
+
+ native_local_irq_restore(flags);
+
+ return ret;
+}
+
static inline void __pval_terminate(u64 pfn, bool action, unsigned int page_size,
int ret, u64 svsm_ret)
{
@@ -227,6 +357,7 @@ static u64 svsm_build_ca_from_pfn_range(u64 pfn, u64 pfn_end, bool action,
pe->page_size = RMP_PG_SIZE_4K;
pe->action = action;
pe->ignore_cf = 0;
+ pe->rsvd = 0;
pe->pfn = pfn;
pe++;
@@ -257,6 +388,7 @@ static int svsm_build_ca_from_psc_desc(struct snp_psc_desc *desc, unsigned int d
pe->page_size = e->pagesize ? RMP_PG_SIZE_2M : RMP_PG_SIZE_4K;
pe->action = e->operation == SNP_PAGE_STATE_PRIVATE;
pe->ignore_cf = 0;
+ pe->rsvd = 0;
pe->pfn = e->gfn;
pe++;
@@ -358,10 +490,31 @@ static void svsm_pval_pages(struct snp_psc_desc *desc)
static void pvalidate_pages(struct snp_psc_desc *desc)
{
+ struct psc_entry *e;
+ unsigned int i;
+
if (snp_vmpl)
svsm_pval_pages(desc);
else
pval_pages(desc);
+
+ /*
+ * If not affected by the cache-coherency vulnerability there is no need
+ * to perform the cache eviction mitigation.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_COHERENCY_SFW_NO))
+ return;
+
+ for (i = 0; i <= desc->hdr.end_entry; i++) {
+ e = &desc->entries[i];
+
+ /*
+ * If validating memory (making it private) perform the cache
+ * eviction mitigation.
+ */
+ if (e->operation == SNP_PAGE_STATE_PRIVATE)
+ sev_evict_cache(pfn_to_kaddr(e->gfn), e->pagesize ? 512 : 1);
+ }
}
static int vmgexit_psc(struct ghcb *ghcb, struct snp_psc_desc *desc)
@@ -508,8 +661,11 @@ static void set_pages_state(unsigned long vaddr, unsigned long npages, int op)
unsigned long vaddr_end;
/* Use the MSR protocol when a GHCB is not available. */
- if (!boot_ghcb)
- return early_set_pages_state(vaddr, __pa(vaddr), npages, op);
+ if (!boot_ghcb) {
+ struct psc_desc d = { op, svsm_get_caa(), svsm_get_caa_pa() };
+
+ return early_set_pages_state(vaddr, __pa(vaddr), npages, &d);
+ }
vaddr = vaddr & PAGE_MASK;
vaddr_end = vaddr + (npages << PAGE_SHIFT);
@@ -950,6 +1106,9 @@ static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip, unsigned
vmsa->x87_ftw = AP_INIT_X87_FTW_DEFAULT;
vmsa->x87_fcw = AP_INIT_X87_FCW_DEFAULT;
+ if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC))
+ vmsa->vintr_ctrl |= V_GIF_MASK | V_NMI_ENABLE_MASK;
+
/* SVME must be set. */
vmsa->efer = EFER_SVME;
@@ -1084,6 +1243,105 @@ int __init sev_es_efi_map_ghcbs_cas(pgd_t *pgd)
return 0;
}
+u64 savic_ghcb_msr_read(u32 reg)
+{
+ u64 msr = APIC_BASE_MSR + (reg >> 4);
+ struct pt_regs regs = { .cx = msr };
+ struct es_em_ctxt ctxt = { .regs = &regs };
+ struct ghcb_state state;
+ enum es_result res;
+ struct ghcb *ghcb;
+
+ guard(irqsave)();
+
+ ghcb = __sev_get_ghcb(&state);
+ vc_ghcb_invalidate(ghcb);
+
+ res = sev_es_ghcb_handle_msr(ghcb, &ctxt, false);
+ if (res != ES_OK) {
+ pr_err("Secure AVIC MSR (0x%llx) read returned error (%d)\n", msr, res);
+ /* MSR read failures are treated as fatal errors */
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL);
+ }
+
+ __sev_put_ghcb(&state);
+
+ return regs.ax | regs.dx << 32;
+}
+
+void savic_ghcb_msr_write(u32 reg, u64 value)
+{
+ u64 msr = APIC_BASE_MSR + (reg >> 4);
+ struct pt_regs regs = {
+ .cx = msr,
+ .ax = lower_32_bits(value),
+ .dx = upper_32_bits(value)
+ };
+ struct es_em_ctxt ctxt = { .regs = &regs };
+ struct ghcb_state state;
+ enum es_result res;
+ struct ghcb *ghcb;
+
+ guard(irqsave)();
+
+ ghcb = __sev_get_ghcb(&state);
+ vc_ghcb_invalidate(ghcb);
+
+ res = sev_es_ghcb_handle_msr(ghcb, &ctxt, true);
+ if (res != ES_OK) {
+ pr_err("Secure AVIC MSR (0x%llx) write returned error (%d)\n", msr, res);
+ /* MSR writes should never fail. Any failure is a fatal error for the SNP guest */
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL);
+ }
+
+ __sev_put_ghcb(&state);
+}
+
+enum es_result savic_register_gpa(u64 gpa)
+{
+ struct ghcb_state state;
+ struct es_em_ctxt ctxt;
+ enum es_result res;
+ struct ghcb *ghcb;
+
+ guard(irqsave)();
+
+ ghcb = __sev_get_ghcb(&state);
+ vc_ghcb_invalidate(ghcb);
+
+ ghcb_set_rax(ghcb, SVM_VMGEXIT_SAVIC_SELF_GPA);
+ ghcb_set_rbx(ghcb, gpa);
+ res = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_SAVIC,
+ SVM_VMGEXIT_SAVIC_REGISTER_GPA, 0);
+
+ __sev_put_ghcb(&state);
+
+ return res;
+}
+
+enum es_result savic_unregister_gpa(u64 *gpa)
+{
+ struct ghcb_state state;
+ struct es_em_ctxt ctxt;
+ enum es_result res;
+ struct ghcb *ghcb;
+
+ guard(irqsave)();
+
+ ghcb = __sev_get_ghcb(&state);
+ vc_ghcb_invalidate(ghcb);
+
+ ghcb_set_rax(ghcb, SVM_VMGEXIT_SAVIC_SELF_GPA);
+ res = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_SAVIC,
+ SVM_VMGEXIT_SAVIC_UNREGISTER_GPA, 0);
+ if (gpa && res == ES_OK)
+ *gpa = ghcb->save.rbx;
+
+ __sev_put_ghcb(&state);
+
+ return res;
+}
+
static void snp_register_per_cpu_ghcb(void)
{
struct sev_es_runtime_data *data;
@@ -1210,7 +1468,8 @@ static void __init alloc_runtime_data(int cpu)
struct svsm_ca *caa;
/* Allocate the SVSM CA page if an SVSM is present */
- caa = memblock_alloc_or_panic(sizeof(*caa), PAGE_SIZE);
+ caa = cpu ? memblock_alloc_or_panic(sizeof(*caa), PAGE_SIZE)
+ : &boot_svsm_ca_page;
per_cpu(svsm_caa, cpu) = caa;
per_cpu(svsm_caa_pa, cpu) = __pa(caa);
@@ -1264,32 +1523,9 @@ void __init sev_es_init_vc_handling(void)
init_ghcb(cpu);
}
- /* If running under an SVSM, switch to the per-cpu CA */
- if (snp_vmpl) {
- struct svsm_call call = {};
- unsigned long flags;
- int ret;
-
- local_irq_save(flags);
-
- /*
- * SVSM_CORE_REMAP_CA call:
- * RAX = 0 (Protocol=0, CallID=0)
- * RCX = New CA GPA
- */
- call.caa = svsm_get_caa();
- call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA);
- call.rcx = this_cpu_read(svsm_caa_pa);
- ret = svsm_perform_call_protocol(&call);
- if (ret)
- panic("Can't remap the SVSM CA, ret=%d, rax_out=0x%llx\n",
- ret, call.rax_out);
-
+ if (snp_vmpl)
sev_cfg.use_cas = true;
- local_irq_restore(flags);
- }
-
sev_es_setup_play_dead();
/* Secondary CPUs use the runtime #VC handler */
@@ -1567,15 +1803,6 @@ void sev_show_status(void)
pr_cont("\n");
}
-void __init snp_update_svsm_ca(void)
-{
- if (!snp_vmpl)
- return;
-
- /* Update the CAA to a proper kernel address */
- boot_svsm_caa = &boot_svsm_ca_page;
-}
-
#ifdef CONFIG_SYSFS
static ssize_t vmpl_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
diff --git a/arch/x86/coco/sev/sev-nmi.c b/arch/x86/coco/sev/noinstr.c
index d8dfaddfb367..b527eafb6312 100644
--- a/arch/x86/coco/sev/sev-nmi.c
+++ b/arch/x86/coco/sev/noinstr.c
@@ -106,3 +106,77 @@ void noinstr __sev_es_nmi_complete(void)
__sev_put_ghcb(&state);
}
+
+/*
+ * Nothing shall interrupt this code path while holding the per-CPU
+ * GHCB. The backup GHCB is only for NMIs interrupting this path.
+ *
+ * Callers must disable local interrupts around it.
+ */
+noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state)
+{
+ struct sev_es_runtime_data *data;
+ struct ghcb *ghcb;
+
+ WARN_ON(!irqs_disabled());
+
+ data = this_cpu_read(runtime_data);
+ ghcb = &data->ghcb_page;
+
+ if (unlikely(data->ghcb_active)) {
+ /* GHCB is already in use - save its contents */
+
+ if (unlikely(data->backup_ghcb_active)) {
+ /*
+ * Backup-GHCB is also already in use. There is no way
+ * to continue here so just kill the machine. To make
+ * panic() work, mark GHCBs inactive so that messages
+ * can be printed out.
+ */
+ data->ghcb_active = false;
+ data->backup_ghcb_active = false;
+
+ instrumentation_begin();
+ panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
+ instrumentation_end();
+ }
+
+ /* Mark backup_ghcb active before writing to it */
+ data->backup_ghcb_active = true;
+
+ state->ghcb = &data->backup_ghcb;
+
+ /* Backup GHCB content */
+ *state->ghcb = *ghcb;
+ } else {
+ state->ghcb = NULL;
+ data->ghcb_active = true;
+ }
+
+ return ghcb;
+}
+
+noinstr void __sev_put_ghcb(struct ghcb_state *state)
+{
+ struct sev_es_runtime_data *data;
+ struct ghcb *ghcb;
+
+ WARN_ON(!irqs_disabled());
+
+ data = this_cpu_read(runtime_data);
+ ghcb = &data->ghcb_page;
+
+ if (state->ghcb) {
+ /* Restore GHCB from Backup */
+ *ghcb = *state->ghcb;
+ data->backup_ghcb_active = false;
+ state->ghcb = NULL;
+ } else {
+ /*
+ * Invalidate the GHCB so a VMGEXIT instruction issued
+ * from userspace won't appear to be valid.
+ */
+ vc_ghcb_invalidate(ghcb);
+ data->ghcb_active = false;
+ }
+}
diff --git a/arch/x86/coco/sev/vc-handle.c b/arch/x86/coco/sev/vc-handle.c
index faf1fce89ed4..7fc136a35334 100644
--- a/arch/x86/coco/sev/vc-handle.c
+++ b/arch/x86/coco/sev/vc-handle.c
@@ -351,6 +351,8 @@ fault:
}
#define sev_printk(fmt, ...) printk(fmt, ##__VA_ARGS__)
+#define error(v)
+#define has_cpuflag(f) boot_cpu_has(f)
#include "vc-shared.c"
@@ -371,29 +373,30 @@ static enum es_result __vc_handle_msr_caa(struct pt_regs *regs, bool write)
* executing with Secure TSC enabled, so special handling is required for
* accesses of MSR_IA32_TSC and MSR_AMD64_GUEST_TSC_FREQ.
*/
-static enum es_result __vc_handle_secure_tsc_msrs(struct pt_regs *regs, bool write)
+static enum es_result __vc_handle_secure_tsc_msrs(struct es_em_ctxt *ctxt, bool write)
{
+ struct pt_regs *regs = ctxt->regs;
u64 tsc;
/*
- * GUEST_TSC_FREQ should not be intercepted when Secure TSC is enabled.
- * Terminate the SNP guest when the interception is enabled.
+ * Writing to MSR_IA32_TSC can cause subsequent reads of the TSC to
+ * return undefined values, and GUEST_TSC_FREQ is read-only. Generate
+ * a #GP on all writes.
*/
- if (regs->cx == MSR_AMD64_GUEST_TSC_FREQ)
- return ES_VMM_ERROR;
+ if (write) {
+ ctxt->fi.vector = X86_TRAP_GP;
+ ctxt->fi.error_code = 0;
+ return ES_EXCEPTION;
+ }
/*
- * Writes: Writing to MSR_IA32_TSC can cause subsequent reads of the TSC
- * to return undefined values, so ignore all writes.
- *
- * Reads: Reads of MSR_IA32_TSC should return the current TSC value, use
- * the value returned by rdtsc_ordered().
+ * GUEST_TSC_FREQ read should not be intercepted when Secure TSC is
+ * enabled. Terminate the guest if a read is attempted.
*/
- if (write) {
- WARN_ONCE(1, "TSC MSR writes are verboten!\n");
- return ES_OK;
- }
+ if (regs->cx == MSR_AMD64_GUEST_TSC_FREQ)
+ return ES_VMM_ERROR;
+ /* Reads of MSR_IA32_TSC should return the current TSC value. */
tsc = rdtsc_ordered();
regs->ax = lower_32_bits(tsc);
regs->dx = upper_32_bits(tsc);
@@ -401,14 +404,10 @@ static enum es_result __vc_handle_secure_tsc_msrs(struct pt_regs *regs, bool wri
return ES_OK;
}
-static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
+enum es_result sev_es_ghcb_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt, bool write)
{
struct pt_regs *regs = ctxt->regs;
enum es_result ret;
- bool write;
-
- /* Is it a WRMSR? */
- write = ctxt->insn.opcode.bytes[1] == 0x30;
switch (regs->cx) {
case MSR_SVSM_CAA:
@@ -416,7 +415,16 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
case MSR_IA32_TSC:
case MSR_AMD64_GUEST_TSC_FREQ:
if (sev_status & MSR_AMD64_SNP_SECURE_TSC)
- return __vc_handle_secure_tsc_msrs(regs, write);
+ return __vc_handle_secure_tsc_msrs(ctxt, write);
+ break;
+ case MSR_AMD64_SAVIC_CONTROL:
+ /*
+ * AMD64_SAVIC_CONTROL should not be intercepted when
+ * Secure AVIC is enabled. Terminate the Secure AVIC guest
+ * if the interception is enabled.
+ */
+ if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC))
+ return ES_VMM_ERROR;
break;
default:
break;
@@ -438,6 +446,11 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
return ret;
}
+static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
+{
+ return sev_es_ghcb_handle_msr(ghcb, ctxt, ctxt->insn.opcode.bytes[1] == 0x30);
+}
+
static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt)
{
int trapnr = ctxt->fi.vector;
diff --git a/arch/x86/coco/sev/vc-shared.c b/arch/x86/coco/sev/vc-shared.c
index 2c0ab0fdc060..9b01c9ad81be 100644
--- a/arch/x86/coco/sev/vc-shared.c
+++ b/arch/x86/coco/sev/vc-shared.c
@@ -409,15 +409,109 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
return ret;
}
+enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
+{
+ u32 ret;
+
+ ret = ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0);
+ if (!ret)
+ return ES_OK;
+
+ if (ret == 1) {
+ u64 info = ghcb->save.sw_exit_info_2;
+ unsigned long v = info & SVM_EVTINJ_VEC_MASK;
+
+ /* Check if exception information from hypervisor is sane. */
+ if ((info & SVM_EVTINJ_VALID) &&
+ ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) &&
+ ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) {
+ ctxt->fi.vector = v;
+
+ if (info & SVM_EVTINJ_VALID_ERR)
+ ctxt->fi.error_code = info >> 32;
+
+ return ES_EXCEPTION;
+ }
+ }
+
+ return ES_VMM_ERROR;
+}
+
+enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt,
+ u64 exit_code, u64 exit_info_1,
+ u64 exit_info_2)
+{
+ /* Fill in protocol and format specifiers */
+ ghcb->protocol_version = ghcb_version;
+ ghcb->ghcb_usage = GHCB_DEFAULT_USAGE;
+
+ ghcb_set_sw_exit_code(ghcb, exit_code);
+ ghcb_set_sw_exit_info_1(ghcb, exit_info_1);
+ ghcb_set_sw_exit_info_2(ghcb, exit_info_2);
+
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+ VMGEXIT();
+
+ return verify_exception_info(ghcb, ctxt);
+}
+
+static int __sev_cpuid_hv_ghcb(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
+{
+ u32 cr4 = native_read_cr4();
+ int ret;
+
+ ghcb_set_rax(ghcb, leaf->fn);
+ ghcb_set_rcx(ghcb, leaf->subfn);
+
+ if (cr4 & X86_CR4_OSXSAVE)
+ /* Safe to read xcr0 */
+ ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK));
+ else
+ /* xgetbv will cause #UD - use reset value for xcr0 */
+ ghcb_set_xcr0(ghcb, 1);
+
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (!(ghcb_rax_is_valid(ghcb) &&
+ ghcb_rbx_is_valid(ghcb) &&
+ ghcb_rcx_is_valid(ghcb) &&
+ ghcb_rdx_is_valid(ghcb)))
+ return ES_VMM_ERROR;
+
+ leaf->eax = ghcb->save.rax;
+ leaf->ebx = ghcb->save.rbx;
+ leaf->ecx = ghcb->save.rcx;
+ leaf->edx = ghcb->save.rdx;
+
+ return ES_OK;
+}
+
+struct cpuid_ctx {
+ struct ghcb *ghcb;
+ struct es_em_ctxt *ctxt;
+};
+
+static void snp_cpuid_hv_ghcb(void *p, struct cpuid_leaf *leaf)
+{
+ struct cpuid_ctx *ctx = p;
+
+ if (__sev_cpuid_hv_ghcb(ctx->ghcb, ctx->ctxt, leaf))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV);
+}
+
static int vc_handle_cpuid_snp(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
{
+ struct cpuid_ctx ctx = { ghcb, ctxt };
struct pt_regs *regs = ctxt->regs;
struct cpuid_leaf leaf;
int ret;
leaf.fn = regs->ax;
leaf.subfn = regs->cx;
- ret = snp_cpuid(ghcb, ctxt, &leaf);
+ ret = snp_cpuid(snp_cpuid_hv_ghcb, &ctx, &leaf);
if (!ret) {
regs->ax = leaf.eax;
regs->bx = leaf.ebx;
@@ -502,3 +596,50 @@ static enum es_result vc_handle_rdtsc(struct ghcb *ghcb,
return ES_OK;
}
+
+void snp_register_ghcb_early(unsigned long paddr)
+{
+ unsigned long pfn = paddr >> PAGE_SHIFT;
+ u64 val;
+
+ sev_es_wr_ghcb_msr(GHCB_MSR_REG_GPA_REQ_VAL(pfn));
+ VMGEXIT();
+
+ val = sev_es_rd_ghcb_msr();
+
+ /* If the response GPA is not ours then abort the guest */
+ if ((GHCB_RESP_CODE(val) != GHCB_MSR_REG_GPA_RESP) ||
+ (GHCB_MSR_REG_GPA_RESP_VAL(val) != pfn))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER);
+}
+
+bool __init sev_es_check_cpu_features(void)
+{
+ if (!has_cpuflag(X86_FEATURE_RDRAND)) {
+ error("RDRAND instruction not supported - no trusted source of randomness available\n");
+ return false;
+ }
+
+ return true;
+}
+
+bool sev_es_negotiate_protocol(void)
+{
+ u64 val;
+
+ /* Do the GHCB protocol version negotiation */
+ sev_es_wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ);
+ VMGEXIT();
+ val = sev_es_rd_ghcb_msr();
+
+ if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP)
+ return false;
+
+ if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN ||
+ GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX)
+ return false;
+
+ ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val), GHCB_PROTOCOL_MAX);
+
+ return true;
+}
diff --git a/arch/x86/configs/xen.config b/arch/x86/configs/xen.config
index d5d091e03bd3..98b6952ba9d2 100644
--- a/arch/x86/configs/xen.config
+++ b/arch/x86/configs/xen.config
@@ -12,7 +12,6 @@ CONFIG_CPU_FREQ=y
# x86 xen specific config options
CONFIG_XEN_PVH=y
-CONFIG_XEN_SAVE_RESTORE=y
# CONFIG_XEN_DEBUG_FS is not set
CONFIG_XEN_MCE_LOG=y
CONFIG_XEN_ACPI_PROCESSOR=m
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
index 94016c60561e..d9c6fc78cf33 100644
--- a/arch/x86/crypto/Kconfig
+++ b/arch/x86/crypto/Kconfig
@@ -2,19 +2,6 @@
menu "Accelerated Cryptographic Algorithms for CPU (x86)"
-config CRYPTO_CURVE25519_X86
- tristate
- depends on 64BIT
- select CRYPTO_KPP
- select CRYPTO_LIB_CURVE25519_GENERIC
- select CRYPTO_ARCH_HAVE_LIB_CURVE25519
- default CRYPTO_LIB_CURVE25519_INTERNAL
- help
- Curve25519 algorithm
-
- Architecture: x86_64 using:
- - ADX (large integer arithmetic)
-
config CRYPTO_AES_NI_INTEL
tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XCTR, XTS, GCM (AES-NI/VAES)"
select CRYPTO_AEAD
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index d402963d6b57..dfba7e5e88ea 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -62,8 +62,6 @@ nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o
-obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o
-
obj-$(CONFIG_CRYPTO_SM3_AVX_X86_64) += sm3-avx-x86_64.o
sm3-avx-x86_64-y := sm3-avx-asm_64.o sm3_avx_glue.o
@@ -81,6 +79,3 @@ aria-aesni-avx2-x86_64-y := aria-aesni-avx2-asm_64.o aria_aesni_avx2_glue.o
obj-$(CONFIG_CRYPTO_ARIA_GFNI_AVX512_X86_64) += aria-gfni-avx512-x86_64.o
aria-gfni-avx512-x86_64-y := aria-gfni-avx512-asm_64.o aria_gfni_avx512_glue.o
-
-# Disable GCOV in odd or sensitive code
-GCOV_PROFILE_curve25519-x86_64.o := n
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 92cf0fe2291e..ced2a1deecd7 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -345,6 +345,7 @@
333 common io_pgetevents sys_io_pgetevents
334 common rseq sys_rseq
335 common uretprobe sys_uretprobe
+336 common uprobe sys_uprobe
# don't use numbers 387 through 423, add new calls after the last
# 'common' entry
424 common pidfd_send_signal sys_pidfd_send_signal
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 7610f26dfbd9..745caa6c15a3 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2069,13 +2069,15 @@ static void _x86_pmu_read(struct perf_event *event)
void x86_pmu_show_pmu_cap(struct pmu *pmu)
{
- pr_info("... version: %d\n", x86_pmu.version);
- pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
- pr_info("... generic registers: %d\n", x86_pmu_num_counters(pmu));
- pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
- pr_info("... max period: %016Lx\n", x86_pmu.max_period);
- pr_info("... fixed-purpose events: %d\n", x86_pmu_num_counters_fixed(pmu));
- pr_info("... event mask: %016Lx\n", hybrid(pmu, intel_ctrl));
+ pr_info("... version: %d\n", x86_pmu.version);
+ pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
+ pr_info("... generic counters: %d\n", x86_pmu_num_counters(pmu));
+ pr_info("... generic bitmap: %016llx\n", hybrid(pmu, cntr_mask64));
+ pr_info("... fixed-purpose counters: %d\n", x86_pmu_num_counters_fixed(pmu));
+ pr_info("... fixed-purpose bitmap: %016llx\n", hybrid(pmu, fixed_cntr_mask64));
+ pr_info("... value mask: %016llx\n", x86_pmu.cntval_mask);
+ pr_info("... max period: %016llx\n", x86_pmu.max_period);
+ pr_info("... global_ctrl mask: %016llx\n", hybrid(pmu, intel_ctrl));
}
static int __init init_hw_perf_events(void)
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 61da6b8a3d51..cbac54cb3a9e 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -643,4 +643,4 @@ static __init int bts_init(void)
return perf_pmu_register(&bts_pmu, "intel_bts", -1);
}
-arch_initcall(bts_init);
+early_initcall(bts_init);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index c2fb729c270e..28f5468a6ea3 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2845,8 +2845,8 @@ static void intel_pmu_enable_fixed(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
- u64 mask, bits = 0;
int idx = hwc->idx;
+ u64 bits = 0;
if (is_topdown_idx(idx)) {
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -2885,14 +2885,10 @@ static void intel_pmu_enable_fixed(struct perf_event *event)
idx -= INTEL_PMC_IDX_FIXED;
bits = intel_fixed_bits_by_idx(idx, bits);
- mask = intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK);
-
- if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) {
+ if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip)
bits |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE);
- mask |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE);
- }
- cpuc->fixed_ctrl_val &= ~mask;
+ cpuc->fixed_ctrl_val &= ~intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK);
cpuc->fixed_ctrl_val |= bits;
}
@@ -2997,7 +2993,8 @@ static void intel_pmu_acr_late_setup(struct cpu_hw_events *cpuc)
if (event->group_leader != leader->group_leader)
break;
for_each_set_bit(idx, (unsigned long *)&event->attr.config2, X86_PMC_IDX_MAX) {
- if (WARN_ON_ONCE(i + idx > cpuc->n_events))
+ if (i + idx >= cpuc->n_events ||
+ !is_acr_event_group(cpuc->event_list[i + idx]))
return;
__set_bit(cpuc->assign[i + idx], (unsigned long *)&event->hw.config1);
}
@@ -5318,9 +5315,9 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu)
0, x86_pmu_num_counters(&pmu->pmu), 0, 0);
if (pmu->intel_cap.perf_metrics)
- pmu->intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS;
+ pmu->intel_ctrl |= GLOBAL_CTRL_EN_PERF_METRICS;
else
- pmu->intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS);
+ pmu->intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS;
intel_pmu_check_event_constraints(pmu->event_constraints,
pmu->cntr_mask64,
@@ -5455,7 +5452,7 @@ static void intel_pmu_cpu_starting(int cpu)
rdmsrq(MSR_IA32_PERF_CAPABILITIES, perf_cap.capabilities);
if (!perf_cap.perf_metrics) {
x86_pmu.intel_cap.perf_metrics = 0;
- x86_pmu.intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS);
+ x86_pmu.intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS;
}
}
@@ -7789,7 +7786,7 @@ __init int intel_pmu_init(void)
}
if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics)
- x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS;
+ x86_pmu.intel_ctrl |= GLOBAL_CTRL_EN_PERF_METRICS;
if (x86_pmu.intel_cap.pebs_timing_info)
x86_pmu.flags |= PMU_FL_RETIRE_LATENCY;
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 07ba4935e873..a26e66d66444 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -305,6 +305,8 @@ struct apic {
/* Probe, setup and smpboot functions */
int (*probe)(void);
+ void (*setup)(void);
+ void (*teardown)(void);
int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
void (*init_apic_ldr)(void);
@@ -317,6 +319,8 @@ struct apic {
/* wakeup secondary CPU using 64-bit wakeup point */
int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip, unsigned int cpu);
+ void (*update_vector)(unsigned int cpu, unsigned int vector, bool set);
+
char *name;
};
@@ -470,6 +474,12 @@ static __always_inline bool apic_id_valid(u32 apic_id)
return apic_id <= apic->max_apic_id;
}
+static __always_inline void apic_update_vector(unsigned int cpu, unsigned int vector, bool set)
+{
+ if (apic->update_vector)
+ apic->update_vector(cpu, vector, set);
+}
+
#else /* CONFIG_X86_LOCAL_APIC */
static inline u32 apic_read(u32 reg) { return 0; }
@@ -481,6 +491,7 @@ static inline void apic_wait_icr_idle(void) { }
static inline u32 safe_apic_wait_icr_idle(void) { return 0; }
static inline void apic_native_eoi(void) { WARN_ON_ONCE(1); }
static inline void apic_setup_apic_calls(void) { }
+static inline void apic_update_vector(unsigned int cpu, unsigned int vector, bool set) { }
#define apic_update_callback(_callback, _fn) do { } while (0)
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 094106b6a538..be39a543fbe5 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -135,6 +135,8 @@
#define APIC_TDR_DIV_128 0xA
#define APIC_EFEAT 0x400
#define APIC_ECTRL 0x410
+#define APIC_SEOI 0x420
+#define APIC_IER 0x480
#define APIC_EILVTn(n) (0x500 + 0x10 * n)
#define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */
#define APIC_EILVT_NR_AMD_10H 4
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index eebbc8889e70..a835f891164d 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -246,7 +246,7 @@ arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
variable_test_bit(nr, addr);
}
-static __always_inline unsigned long variable__ffs(unsigned long word)
+static __always_inline __attribute_const__ unsigned long variable__ffs(unsigned long word)
{
asm("tzcnt %1,%0"
: "=r" (word)
@@ -265,7 +265,7 @@ static __always_inline unsigned long variable__ffs(unsigned long word)
(unsigned long)__builtin_ctzl(word) : \
variable__ffs(word))
-static __always_inline unsigned long variable_ffz(unsigned long word)
+static __always_inline __attribute_const__ unsigned long variable_ffz(unsigned long word)
{
return variable__ffs(~word);
}
@@ -287,7 +287,7 @@ static __always_inline unsigned long variable_ffz(unsigned long word)
*
* Undefined if no set bit exists, so code should check against 0 first.
*/
-static __always_inline unsigned long __fls(unsigned long word)
+static __always_inline __attribute_const__ unsigned long __fls(unsigned long word)
{
if (__builtin_constant_p(word))
return BITS_PER_LONG - 1 - __builtin_clzl(word);
@@ -301,7 +301,7 @@ static __always_inline unsigned long __fls(unsigned long word)
#undef ADDR
#ifdef __KERNEL__
-static __always_inline int variable_ffs(int x)
+static __always_inline __attribute_const__ int variable_ffs(int x)
{
int r;
@@ -355,7 +355,7 @@ static __always_inline int variable_ffs(int x)
* set bit if value is nonzero. The last (most significant) bit is
* at position 32.
*/
-static __always_inline int fls(unsigned int x)
+static __always_inline __attribute_const__ int fls(unsigned int x)
{
int r;
@@ -400,7 +400,7 @@ static __always_inline int fls(unsigned int x)
* at position 64.
*/
#ifdef CONFIG_X86_64
-static __always_inline int fls64(__u64 x)
+static __always_inline __attribute_const__ int fls64(__u64 x)
{
int bitpos = -1;
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 02b23aa78955..f7b67cb73915 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -82,6 +82,8 @@
#ifndef __ASSEMBLER__
extern unsigned int output_len;
extern const unsigned long kernel_text_size;
+extern const unsigned long kernel_inittext_offset;
+extern const unsigned long kernel_inittext_size;
extern const unsigned long kernel_total_size;
unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr,
diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h
index 1751f1eb95ef..976b90a3d190 100644
--- a/arch/x86/include/asm/cfi.h
+++ b/arch/x86/include/asm/cfi.h
@@ -113,7 +113,7 @@ extern bhi_thunk __bhi_args_end[];
struct pt_regs;
-#ifdef CONFIG_CFI_CLANG
+#ifdef CONFIG_CFI
enum bug_trap_type handle_cfi_failure(struct pt_regs *regs);
#define __bpfcall
@@ -157,7 +157,7 @@ static inline int cfi_get_func_arity(void *func)
{
return 0;
}
-#endif /* CONFIG_CFI_CLANG */
+#endif /* CONFIG_CFI */
#if HAS_KERNEL_IBT == 1
#define CFI_NOSEAL(x) asm(IBT_NOSEAL(__stringify(x)))
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 602957dd2609..b2a562217d3f 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -218,6 +218,7 @@
#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 1) /* "flexpriority" Intel FlexPriority */
#define X86_FEATURE_EPT ( 8*32+ 2) /* "ept" Intel Extended Page Table */
#define X86_FEATURE_VPID ( 8*32+ 3) /* "vpid" Intel Virtual Processor ID */
+#define X86_FEATURE_COHERENCY_SFW_NO ( 8*32+ 4) /* SNP cache coherency software workaround not needed */
#define X86_FEATURE_VMMCALL ( 8*32+15) /* "vmmcall" Prefer VMMCALL to VMCALL */
#define X86_FEATURE_XENPV ( 8*32+16) /* Xen paravirtual guest */
@@ -494,6 +495,8 @@
#define X86_FEATURE_TSA_SQ_NO (21*32+11) /* AMD CPU not vulnerable to TSA-SQ */
#define X86_FEATURE_TSA_L1_NO (21*32+12) /* AMD CPU not vulnerable to TSA-L1 */
#define X86_FEATURE_CLEAR_CPU_BUF_VM (21*32+13) /* Clear CPU buffers using VERW before VMRUN */
+#define X86_FEATURE_IBPB_EXIT_TO_USER (21*32+14) /* Use IBPB on exit-to-userspace, see VMSCAPE bug */
+#define X86_FEATURE_ABMC (21*32+15) /* Assignable Bandwidth Monitoring Counters */
/*
* BUG word(s)
@@ -550,4 +553,5 @@
#define X86_BUG_ITS X86_BUG( 1*32+ 7) /* "its" CPU is affected by Indirect Target Selection */
#define X86_BUG_ITS_NATIVE_ONLY X86_BUG( 1*32+ 8) /* "its_native_only" CPU is affected by ITS, VMX is not affected */
#define X86_BUG_TSA X86_BUG( 1*32+ 9) /* "tsa" CPU is affected by Transient Scheduler Attacks */
+#define X86_BUG_VMSCAPE X86_BUG( 1*32+10) /* "vmscape" CPU is affected by VMSCAPE attacks from guests */
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h
deleted file mode 100644
index d5749b25fa10..000000000000
--- a/arch/x86/include/asm/cpuid.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _ASM_X86_CPUID_H
-#define _ASM_X86_CPUID_H
-
-#include <asm/cpuid/api.h>
-
-#endif /* _ASM_X86_CPUID_H */
diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
index d535a97c7284..ce3eb6d5fdf9 100644
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -93,6 +93,13 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
* 8 (ia32) bits.
*/
choose_random_kstack_offset(rdtsc());
+
+ /* Avoid unnecessary reads of 'x86_ibpb_exit_to_user' */
+ if (cpu_feature_enabled(X86_FEATURE_IBPB_EXIT_TO_USER) &&
+ this_cpu_read(x86_ibpb_exit_to_user)) {
+ indirect_branch_prediction_barrier();
+ this_cpu_write(x86_ibpb_exit_to_user, false);
+ }
}
#define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare
diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h
index c060549c6c94..89004f4ca208 100644
--- a/arch/x86/include/asm/fpu/sched.h
+++ b/arch/x86/include/asm/fpu/sched.h
@@ -11,7 +11,7 @@
extern void save_fpregs_to_fpstate(struct fpu *fpu);
extern void fpu__drop(struct task_struct *tsk);
-extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal,
+extern int fpu_clone(struct task_struct *dst, u64 clone_flags, bool minimal,
unsigned long shstk_addr);
extern void fpu_flush_thread(void);
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index e41cbf2ec41d..9ad86a7d13f6 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -30,6 +30,7 @@ enum x86_hypervisor_type {
X86_HYPER_KVM,
X86_HYPER_JAILHOUSE,
X86_HYPER_ACRN,
+ X86_HYPER_BHYVE,
};
#ifdef CONFIG_HYPERVISOR_GUEST
@@ -64,6 +65,7 @@ extern const struct hypervisor_x86 x86_hyper_xen_pv;
extern const struct hypervisor_x86 x86_hyper_kvm;
extern const struct hypervisor_x86 x86_hyper_jailhouse;
extern const struct hypervisor_x86 x86_hyper_acrn;
+extern const struct hypervisor_x86 x86_hyper_bhyve;
extern struct hypervisor_x86 x86_hyper_xen_hvm;
extern bool nopv;
diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h
index 97f341777db5..1b3060a3425c 100644
--- a/arch/x86/include/asm/inat.h
+++ b/arch/x86/include/asm/inat.h
@@ -37,6 +37,8 @@
#define INAT_PFX_EVEX 15 /* EVEX prefix */
/* x86-64 REX2 prefix */
#define INAT_PFX_REX2 16 /* 0xD5 */
+/* AMD XOP prefix */
+#define INAT_PFX_XOP 17 /* 0x8F */
#define INAT_LSTPFX_MAX 3
#define INAT_LGCPFX_MAX 11
@@ -77,6 +79,7 @@
#define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 3))
#define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 4))
#define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5))
+#define INAT_XOPOK INAT_VEXOK
#define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6))
#define INAT_EVEXONLY (1 << (INAT_FLAG_OFFS + 7))
#define INAT_NO_REX2 (1 << (INAT_FLAG_OFFS + 8))
@@ -111,6 +114,8 @@ extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm,
extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode,
insn_byte_t vex_m,
insn_byte_t vex_pp);
+extern insn_attr_t inat_get_xop_attribute(insn_byte_t opcode,
+ insn_byte_t map_select);
/* Attribute checking functions */
static inline int inat_is_legacy_prefix(insn_attr_t attr)
@@ -164,6 +169,11 @@ static inline int inat_is_vex3_prefix(insn_attr_t attr)
return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3;
}
+static inline int inat_is_xop_prefix(insn_attr_t attr)
+{
+ return (attr & INAT_PFX_MASK) == INAT_PFX_XOP;
+}
+
static inline int inat_is_escape(insn_attr_t attr)
{
return attr & INAT_ESC_MASK;
@@ -229,6 +239,11 @@ static inline int inat_accept_vex(insn_attr_t attr)
return attr & INAT_VEXOK;
}
+static inline int inat_accept_xop(insn_attr_t attr)
+{
+ return attr & INAT_XOPOK;
+}
+
static inline int inat_must_vex(insn_attr_t attr)
{
return attr & (INAT_VEXONLY | INAT_EVEXONLY);
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 5a68e9db6518..01ccdd168df0 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -2,12 +2,6 @@
#ifndef _ASM_X86_INIT_H
#define _ASM_X86_INIT_H
-#if defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 170000
-#define __head __section(".head.text") __no_sanitize_undefined __no_stack_protector
-#else
-#define __head __section(".head.text") __no_sanitize_undefined __no_kstack_erase
-#endif
-
struct x86_mapping_info {
void *(*alloc_pgt_page)(void *); /* allocate buf for page table */
void (*free_pgt_page)(void *, void *); /* free buf for page table */
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index 7152ea809e6a..091f88c8254d 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -71,7 +71,10 @@ struct insn {
* prefixes.bytes[3]: last prefix
*/
struct insn_field rex_prefix; /* REX prefix */
- struct insn_field vex_prefix; /* VEX prefix */
+ union {
+ struct insn_field vex_prefix; /* VEX prefix */
+ struct insn_field xop_prefix; /* XOP prefix */
+ };
struct insn_field opcode; /*
* opcode.bytes[0]: opcode1
* opcode.bytes[1]: opcode2
@@ -135,6 +138,17 @@ struct insn {
#define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */
#define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */
#define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */
+/* XOP bit fields */
+#define X86_XOP_R(xop) ((xop) & 0x80) /* XOP Byte2 */
+#define X86_XOP_X(xop) ((xop) & 0x40) /* XOP Byte2 */
+#define X86_XOP_B(xop) ((xop) & 0x20) /* XOP Byte2 */
+#define X86_XOP_M(xop) ((xop) & 0x1f) /* XOP Byte2 */
+#define X86_XOP_W(xop) ((xop) & 0x80) /* XOP Byte3 */
+#define X86_XOP_V(xop) ((xop) & 0x78) /* XOP Byte3 */
+#define X86_XOP_L(xop) ((xop) & 0x04) /* XOP Byte3 */
+#define X86_XOP_P(xop) ((xop) & 0x03) /* XOP Byte3 */
+#define X86_XOP_M_MIN 0x08 /* Min of XOP.M */
+#define X86_XOP_M_MAX 0x1f /* Max of XOP.M */
extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64);
extern int insn_get_prefixes(struct insn *insn);
@@ -178,7 +192,7 @@ static inline insn_byte_t insn_rex2_m_bit(struct insn *insn)
return X86_REX2_M(insn->rex_prefix.bytes[1]);
}
-static inline int insn_is_avx(struct insn *insn)
+static inline int insn_is_avx_or_xop(struct insn *insn)
{
if (!insn->prefixes.got)
insn_get_prefixes(insn);
@@ -192,6 +206,22 @@ static inline int insn_is_evex(struct insn *insn)
return (insn->vex_prefix.nbytes == 4);
}
+/* If we already know this is AVX/XOP encoded */
+static inline int avx_insn_is_xop(struct insn *insn)
+{
+ insn_attr_t attr = inat_get_opcode_attribute(insn->vex_prefix.bytes[0]);
+
+ return inat_is_xop_prefix(attr);
+}
+
+static inline int insn_is_xop(struct insn *insn)
+{
+ if (!insn_is_avx_or_xop(insn))
+ return 0;
+
+ return avx_insn_is_xop(insn);
+}
+
static inline int insn_has_emulate_prefix(struct insn *insn)
{
return !!insn->emulate_prefix_size;
@@ -222,11 +252,26 @@ static inline insn_byte_t insn_vex_w_bit(struct insn *insn)
return X86_VEX_W(insn->vex_prefix.bytes[2]);
}
+static inline insn_byte_t insn_xop_map_bits(struct insn *insn)
+{
+ if (insn->xop_prefix.nbytes < 3) /* XOP is 3 bytes */
+ return 0;
+ return X86_XOP_M(insn->xop_prefix.bytes[1]);
+}
+
+static inline insn_byte_t insn_xop_p_bits(struct insn *insn)
+{
+ return X86_XOP_P(insn->vex_prefix.bytes[2]);
+}
+
/* Get the last prefix id from last prefix or VEX prefix */
static inline int insn_last_prefix_id(struct insn *insn)
{
- if (insn_is_avx(insn))
+ if (insn_is_avx_or_xop(insn)) {
+ if (avx_insn_is_xop(insn))
+ return insn_xop_p_bits(insn);
return insn_vex_p_bits(insn); /* VEX_p is a SIMD prefix id */
+ }
if (insn->prefixes.bytes[3])
return inat_get_last_prefix_id(insn->prefixes.bytes[3]);
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index e345dbdf933e..f32a0eca2ae5 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -51,7 +51,7 @@
#define INTEL_PENTIUM_MMX IFM(5, 0x04) /* P55C */
#define INTEL_QUARK_X1000 IFM(5, 0x09) /* Quark X1000 SoC */
-/* Family 6 */
+/* Family 6, 18, 19 */
#define INTEL_PENTIUM_PRO IFM(6, 0x01)
#define INTEL_PENTIUM_II_KLAMATH IFM(6, 0x03)
#define INTEL_PENTIUM_III_DESCHUTES IFM(6, 0x05)
@@ -126,6 +126,8 @@
#define INTEL_GRANITERAPIDS_X IFM(6, 0xAD) /* Redwood Cove */
#define INTEL_GRANITERAPIDS_D IFM(6, 0xAE)
+#define INTEL_DIAMONDRAPIDS_X IFM(19, 0x01) /* Panther Cove */
+
#define INTEL_BARTLETTLAKE IFM(6, 0xD7) /* Raptor Cove */
/* "Hybrid" Processors (P-Core/E-Core) */
@@ -203,9 +205,6 @@
#define INTEL_P4_PRESCOTT_2M IFM(15, 0x04)
#define INTEL_P4_CEDARMILL IFM(15, 0x06) /* Also Xeon Dempsey */
-/* Family 19 */
-#define INTEL_PANTHERCOVE_X IFM(19, 0x01) /* Diamond Rapids */
-
/*
* Intel CPU core types
*
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 6c77c03139f7..31e3cb550fb3 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -241,12 +241,14 @@ struct cper_ia_proc_ctx;
#ifdef CONFIG_X86_MCE
int mcheck_init(void);
+void mca_bsp_init(struct cpuinfo_x86 *c);
void mcheck_cpu_init(struct cpuinfo_x86 *c);
void mcheck_cpu_clear(struct cpuinfo_x86 *c);
int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info,
u64 lapic_id);
#else
static inline int mcheck_init(void) { return 0; }
+static inline void mca_bsp_init(struct cpuinfo_x86 *c) {}
static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {}
static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info,
@@ -290,8 +292,7 @@ DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
enum mcp_flags {
MCP_TIMESTAMP = BIT(0), /* log time stamp */
MCP_UC = BIT(1), /* log uncorrected errors */
- MCP_DONTLOG = BIT(2), /* only clear, don't log */
- MCP_QUEUE_LOG = BIT(3), /* only queue to genpool */
+ MCP_QUEUE_LOG = BIT(2), /* only queue to genpool */
};
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
@@ -371,15 +372,9 @@ enum smca_bank_types {
extern bool amd_mce_is_memory_error(struct mce *m);
-extern int mce_threshold_create_device(unsigned int cpu);
-extern int mce_threshold_remove_device(unsigned int cpu);
-
void mce_amd_feature_init(struct cpuinfo_x86 *c);
enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank);
#else
-
-static inline int mce_threshold_create_device(unsigned int cpu) { return 0; };
-static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; };
static inline bool amd_mce_is_memory_error(struct mce *m) { return false; };
static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
#endif
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index b65c3ba5fa14..718a55d82fe4 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -315,12 +315,14 @@
#define PERF_CAP_PT_IDX 16
#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6
-#define PERF_CAP_PEBS_TRAP BIT_ULL(6)
-#define PERF_CAP_ARCH_REG BIT_ULL(7)
-#define PERF_CAP_PEBS_FORMAT 0xf00
-#define PERF_CAP_PEBS_BASELINE BIT_ULL(14)
-#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \
- PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE)
+#define PERF_CAP_PEBS_TRAP BIT_ULL(6)
+#define PERF_CAP_ARCH_REG BIT_ULL(7)
+#define PERF_CAP_PEBS_FORMAT 0xf00
+#define PERF_CAP_PEBS_BASELINE BIT_ULL(14)
+#define PERF_CAP_PEBS_TIMING_INFO BIT_ULL(17)
+#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \
+ PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \
+ PERF_CAP_PEBS_TIMING_INFO)
#define MSR_IA32_RTIT_CTL 0x00000570
#define RTIT_CTL_TRACEEN BIT(0)
@@ -631,6 +633,11 @@
#define MSR_AMD_PPIN 0xc00102f1
#define MSR_AMD64_CPUID_FN_7 0xc0011002
#define MSR_AMD64_CPUID_FN_1 0xc0011004
+
+#define MSR_AMD64_CPUID_EXT_FEAT 0xc0011005
+#define MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT 54
+#define MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT BIT_ULL(MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT)
+
#define MSR_AMD64_LS_CFG 0xc0011020
#define MSR_AMD64_DC_CFG 0xc0011022
#define MSR_AMD64_TW_CFG 0xc0011023
@@ -699,8 +706,15 @@
#define MSR_AMD64_SNP_VMSA_REG_PROT BIT_ULL(MSR_AMD64_SNP_VMSA_REG_PROT_BIT)
#define MSR_AMD64_SNP_SMT_PROT_BIT 17
#define MSR_AMD64_SNP_SMT_PROT BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT)
-#define MSR_AMD64_SNP_RESV_BIT 18
+#define MSR_AMD64_SNP_SECURE_AVIC_BIT 18
+#define MSR_AMD64_SNP_SECURE_AVIC BIT_ULL(MSR_AMD64_SNP_SECURE_AVIC_BIT)
+#define MSR_AMD64_SNP_RESV_BIT 19
#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT)
+#define MSR_AMD64_SAVIC_CONTROL 0xc0010138
+#define MSR_AMD64_SAVIC_EN_BIT 0
+#define MSR_AMD64_SAVIC_EN BIT_ULL(MSR_AMD64_SAVIC_EN_BIT)
+#define MSR_AMD64_SAVIC_ALLOWEDNMI_BIT 1
+#define MSR_AMD64_SAVIC_ALLOWEDNMI BIT_ULL(MSR_AMD64_SAVIC_ALLOWEDNMI_BIT)
#define MSR_AMD64_RMP_BASE 0xc0010132
#define MSR_AMD64_RMP_END 0xc0010133
#define MSR_AMD64_RMP_CFG 0xc0010136
@@ -1223,6 +1237,8 @@
/* - AMD: */
#define MSR_IA32_MBA_BW_BASE 0xc0000200
#define MSR_IA32_SMBA_BW_BASE 0xc0000280
+#define MSR_IA32_L3_QOS_ABMC_CFG 0xc00003fd
+#define MSR_IA32_L3_QOS_EXT_CFG 0xc00003ff
#define MSR_IA32_EVT_CFG_BASE 0xc0000400
/* AMD-V MSRs */
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 10f261678749..08ed5a2e46a5 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -514,6 +514,7 @@ enum spectre_v2_user_mitigation {
/* The Speculative Store Bypass disable variants */
enum ssb_mitigation {
SPEC_STORE_BYPASS_NONE,
+ SPEC_STORE_BYPASS_AUTO,
SPEC_STORE_BYPASS_DISABLE,
SPEC_STORE_BYPASS_PRCTL,
SPEC_STORE_BYPASS_SECCOMP,
@@ -530,6 +531,8 @@ void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature)
: "memory");
}
+DECLARE_PER_CPU(bool, x86_ibpb_exit_to_user);
+
static inline void indirect_branch_prediction_barrier(void)
{
asm_inline volatile(ALTERNATIVE("", "call write_ibpb", X86_FEATURE_IBPB)
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 70d1d94aca7e..49a4d442f3fc 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -35,7 +35,6 @@
#define ARCH_PERFMON_EVENTSEL_EQ (1ULL << 36)
#define ARCH_PERFMON_EVENTSEL_UMASK2 (0xFFULL << 40)
-#define INTEL_FIXED_BITS_MASK 0xFULL
#define INTEL_FIXED_BITS_STRIDE 4
#define INTEL_FIXED_0_KERNEL (1ULL << 0)
#define INTEL_FIXED_0_USER (1ULL << 1)
@@ -48,6 +47,11 @@
#define ICL_EVENTSEL_ADAPTIVE (1ULL << 34)
#define ICL_FIXED_0_ADAPTIVE (1ULL << 32)
+#define INTEL_FIXED_BITS_MASK \
+ (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER | \
+ INTEL_FIXED_0_ANYTHREAD | INTEL_FIXED_0_ENABLE_PMI | \
+ ICL_FIXED_0_ADAPTIVE)
+
#define intel_fixed_bits_by_idx(_idx, _bits) \
((_bits) << ((_idx) * INTEL_FIXED_BITS_STRIDE))
@@ -430,7 +434,7 @@ static inline bool is_topdown_idx(int idx)
#define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(GLOBAL_STATUS_TRACE_TOPAPMI_BIT)
#define GLOBAL_STATUS_PERF_METRICS_OVF_BIT 48
-#define GLOBAL_CTRL_EN_PERF_METRICS 48
+#define GLOBAL_CTRL_EN_PERF_METRICS BIT_ULL(48)
/*
* We model guest LBR event tracing as another fixed-mode PMC like BTS.
*
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 4604f924d8b8..7eb61ef6a185 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -36,6 +36,9 @@ static inline bool pgtable_l5_enabled(void)
#define pgtable_l5_enabled() cpu_feature_enabled(X86_FEATURE_LA57)
#endif /* USE_EARLY_PGTABLE_L5 */
+#define ARCH_PAGE_TABLE_SYNC_MASK \
+ (pgtable_l5_enabled() ? PGTBL_PGD_MODIFIED : PGTBL_P4D_MODIFIED)
+
extern unsigned int pgdir_shift;
extern unsigned int ptrs_per_p4d;
diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h
index feb93b50e990..575f8408a9e7 100644
--- a/arch/x86/include/asm/resctrl.h
+++ b/arch/x86/include/asm/resctrl.h
@@ -44,7 +44,6 @@ DECLARE_PER_CPU(struct resctrl_pqr_state, pqr_state);
extern bool rdt_alloc_capable;
extern bool rdt_mon_capable;
-extern unsigned int rdt_mon_features;
DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
@@ -84,21 +83,6 @@ static inline void resctrl_arch_disable_mon(void)
static_branch_dec_cpuslocked(&rdt_enable_key);
}
-static inline bool resctrl_arch_is_llc_occupancy_enabled(void)
-{
- return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID));
-}
-
-static inline bool resctrl_arch_is_mbm_total_enabled(void)
-{
- return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID));
-}
-
-static inline bool resctrl_arch_is_mbm_local_enabled(void)
-{
- return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID));
-}
-
/*
* __resctrl_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR
*
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 77d8f49b92bd..f59ae7186940 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -244,7 +244,7 @@ static inline unsigned long vdso_encode_cpunode(int cpu, unsigned long node)
static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node)
{
- unsigned int p;
+ unsigned long p;
/*
* Load CPU and node number from the GDT. LSL is faster than RDTSCP
@@ -254,10 +254,10 @@ static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node)
*
* If RDPID is available, use it.
*/
- alternative_io ("lsl %[seg],%[p]",
- ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */
+ alternative_io ("lsl %[seg],%k[p]",
+ "rdpid %[p]",
X86_FEATURE_RDPID,
- [p] "=a" (p), [seg] "r" (__CPUNODE_SEG));
+ [p] "=r" (p), [seg] "r" (__CPUNODE_SEG));
if (cpu)
*cpu = (p & VDSO_CPUNODE_MASK);
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 692af46603a1..914eb32581c7 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -53,6 +53,7 @@ extern void i386_reserve_resources(void);
extern unsigned long __startup_64(unsigned long p2v_offset, struct boot_params *bp);
extern void startup_64_setup_gdt_idt(void);
extern void startup_64_load_idt(void *vc_handler);
+extern void __pi_startup_64_load_idt(void *vc_handler);
extern void early_setup_idt(void);
extern void __init do_early_exception(struct pt_regs *regs, int trapnr);
diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
index 0020d77a0800..01a6e4dbe423 100644
--- a/arch/x86/include/asm/sev-common.h
+++ b/arch/x86/include/asm/sev-common.h
@@ -208,6 +208,7 @@ struct snp_psc_desc {
#define GHCB_TERM_SVSM_CAA 9 /* SVSM is present but CAA is not page aligned */
#define GHCB_TERM_SECURE_TSC 10 /* Secure TSC initialization failed */
#define GHCB_TERM_SVSM_CA_REMAP_FAIL 11 /* SVSM is present but CA could not be remapped */
+#define GHCB_TERM_SAVIC_FAIL 12 /* Secure AVIC-specific failure */
#define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK)
diff --git a/arch/x86/include/asm/sev-internal.h b/arch/x86/include/asm/sev-internal.h
index 3dfd306d1c9e..c58c47c68ab6 100644
--- a/arch/x86/include/asm/sev-internal.h
+++ b/arch/x86/include/asm/sev-internal.h
@@ -2,7 +2,6 @@
#define DR7_RESET_VALUE 0x400
-extern struct ghcb boot_ghcb_page;
extern u64 sev_hv_features;
extern u64 sev_secrets_pa;
@@ -56,31 +55,15 @@ DECLARE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
DECLARE_PER_CPU(struct sev_es_save_area *, sev_vmsa);
void early_set_pages_state(unsigned long vaddr, unsigned long paddr,
- unsigned long npages, enum psc_op op);
+ unsigned long npages, const struct psc_desc *desc);
DECLARE_PER_CPU(struct svsm_ca *, svsm_caa);
DECLARE_PER_CPU(u64, svsm_caa_pa);
-extern struct svsm_ca *boot_svsm_caa;
extern u64 boot_svsm_caa_pa;
-static __always_inline struct svsm_ca *svsm_get_caa(void)
-{
- if (sev_cfg.use_cas)
- return this_cpu_read(svsm_caa);
- else
- return boot_svsm_caa;
-}
-
-static __always_inline u64 svsm_get_caa_pa(void)
-{
- if (sev_cfg.use_cas)
- return this_cpu_read(svsm_caa_pa);
- else
- return boot_svsm_caa_pa;
-}
-
-int svsm_perform_call_protocol(struct svsm_call *call);
+enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt);
+void vc_forward_exception(struct es_em_ctxt *ctxt);
static inline u64 sev_es_rd_ghcb_msr(void)
{
@@ -97,9 +80,8 @@ static __always_inline void sev_es_wr_ghcb_msr(u64 val)
native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high);
}
-void snp_register_ghcb_early(unsigned long paddr);
-bool sev_es_negotiate_protocol(void);
-bool sev_es_check_cpu_features(void);
+enum es_result sev_es_ghcb_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt, bool write);
+
u64 get_hv_features(void);
const struct snp_cpuid_table *snp_cpuid_get_table(void);
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index 89075ff19afa..f9046c4b9a2b 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -503,6 +503,7 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate)
}
void setup_ghcb(void);
+void snp_register_ghcb_early(unsigned long paddr);
void early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
unsigned long npages);
void early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
@@ -511,14 +512,12 @@ void snp_set_memory_shared(unsigned long vaddr, unsigned long npages);
void snp_set_memory_private(unsigned long vaddr, unsigned long npages);
void snp_set_wakeup_secondary_cpu(void);
bool snp_init(struct boot_params *bp);
-void __noreturn snp_abort(void);
void snp_dmi_setup(void);
int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, struct svsm_attest_call *input);
void snp_accept_memory(phys_addr_t start, phys_addr_t end);
u64 snp_get_unsupported_features(u64 status);
u64 sev_get_status(void);
void sev_show_status(void);
-void snp_update_svsm_ca(void);
int prepare_pte_enc(struct pte_enc_desc *d);
void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot);
void snp_kexec_finish(void);
@@ -533,6 +532,10 @@ int snp_svsm_vtpm_send_command(u8 *buffer);
void __init snp_secure_tsc_prepare(void);
void __init snp_secure_tsc_init(void);
+enum es_result savic_register_gpa(u64 gpa);
+enum es_result savic_unregister_gpa(u64 *gpa);
+u64 savic_ghcb_msr_read(u32 reg);
+void savic_ghcb_msr_write(u32 reg, u64 value);
static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb)
{
@@ -540,8 +543,6 @@ static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb)
__builtin_memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
}
-void vc_forward_exception(struct es_em_ctxt *ctxt);
-
/* I/O parameters for CPUID-related helpers */
struct cpuid_leaf {
u32 fn;
@@ -552,7 +553,13 @@ struct cpuid_leaf {
u32 edx;
};
-int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf);
+int svsm_perform_msr_protocol(struct svsm_call *call);
+int __pi_svsm_perform_msr_protocol(struct svsm_call *call);
+int snp_cpuid(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf),
+ void *ctx, struct cpuid_leaf *leaf);
+
+void svsm_issue_call(struct svsm_call *call, u8 *pending);
+int svsm_process_result_codes(struct svsm_call *call);
void __noreturn sev_es_terminate(unsigned int set, unsigned int reason);
enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb,
@@ -560,7 +567,36 @@ enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb,
u64 exit_code, u64 exit_info_1,
u64 exit_info_2);
+bool sev_es_negotiate_protocol(void);
+bool sev_es_check_cpu_features(void);
+
+extern u16 ghcb_version;
extern struct ghcb *boot_ghcb;
+extern bool sev_snp_needs_sfw;
+
+struct psc_desc {
+ enum psc_op op;
+ struct svsm_ca *ca;
+ u64 caa_pa;
+};
+
+static inline void sev_evict_cache(void *va, int npages)
+{
+ volatile u8 val __always_unused;
+ u8 *bytes = va;
+ int page_idx;
+
+ /*
+ * For SEV guests, a read from the first/last cache-lines of a 4K page
+ * using the guest key is sufficient to cause a flush of all cache-lines
+ * associated with that 4K page without incurring all the overhead of a
+ * full CLFLUSH sequence.
+ */
+ for (page_idx = 0; page_idx < npages; page_idx++) {
+ val = bytes[page_idx * PAGE_SIZE];
+ val = bytes[page_idx * PAGE_SIZE + PAGE_SIZE - 1];
+ }
+}
#else /* !CONFIG_AMD_MEM_ENCRYPT */
@@ -582,7 +618,6 @@ static inline void snp_set_memory_shared(unsigned long vaddr, unsigned long npag
static inline void snp_set_memory_private(unsigned long vaddr, unsigned long npages) { }
static inline void snp_set_wakeup_secondary_cpu(void) { }
static inline bool snp_init(struct boot_params *bp) { return false; }
-static inline void snp_abort(void) { }
static inline void snp_dmi_setup(void) { }
static inline int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, struct svsm_attest_call *input)
{
@@ -592,7 +627,6 @@ static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { }
static inline u64 snp_get_unsupported_features(u64 status) { return 0; }
static inline u64 sev_get_status(void) { return 0; }
static inline void sev_show_status(void) { }
-static inline void snp_update_svsm_ca(void) { }
static inline int prepare_pte_enc(struct pte_enc_desc *d) { return 0; }
static inline void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot) { }
static inline void snp_kexec_finish(void) { }
@@ -605,6 +639,11 @@ static inline int snp_send_guest_request(struct snp_msg_desc *mdesc,
static inline int snp_svsm_vtpm_send_command(u8 *buffer) { return -ENODEV; }
static inline void __init snp_secure_tsc_prepare(void) { }
static inline void __init snp_secure_tsc_init(void) { }
+static inline void sev_evict_cache(void *va, int npages) {}
+static inline enum es_result savic_register_gpa(u64 gpa) { return ES_UNSUPPORTED; }
+static inline enum es_result savic_unregister_gpa(u64 *gpa) { return ES_UNSUPPORTED; }
+static inline void savic_ghcb_msr_write(u32 reg, u64 value) { }
+static inline u64 savic_ghcb_msr_read(u32 reg) { return 0; }
#endif /* CONFIG_AMD_MEM_ENCRYPT */
@@ -616,9 +655,13 @@ void snp_dump_hva_rmpentry(unsigned long address);
int psmash(u64 pfn);
int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable);
int rmp_make_shared(u64 pfn, enum pg_level level);
-void snp_leak_pages(u64 pfn, unsigned int npages);
+void __snp_leak_pages(u64 pfn, unsigned int npages, bool dump_rmp);
void kdump_sev_callback(void);
void snp_fixup_e820_tables(void);
+static inline void snp_leak_pages(u64 pfn, unsigned int pages)
+{
+ __snp_leak_pages(pfn, pages, true);
+}
#else
static inline bool snp_probe_rmptable_info(void) { return false; }
static inline int snp_rmptable_init(void) { return -ENOSYS; }
@@ -631,6 +674,7 @@ static inline int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 as
return -ENODEV;
}
static inline int rmp_make_shared(u64 pfn, enum pg_level level) { return -ENODEV; }
+static inline void __snp_leak_pages(u64 pfn, unsigned int npages, bool dump_rmp) {}
static inline void snp_leak_pages(u64 pfn, unsigned int npages) {}
static inline void kdump_sev_callback(void) { }
static inline void snp_fixup_e820_tables(void) {}
diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h
index ba6f2fe43848..fc7dcec58fd4 100644
--- a/arch/x86/include/asm/shstk.h
+++ b/arch/x86/include/asm/shstk.h
@@ -16,25 +16,29 @@ struct thread_shstk {
long shstk_prctl(struct task_struct *task, int option, unsigned long arg2);
void reset_thread_features(void);
-unsigned long shstk_alloc_thread_stack(struct task_struct *p, unsigned long clone_flags,
+unsigned long shstk_alloc_thread_stack(struct task_struct *p, u64 clone_flags,
unsigned long stack_size);
void shstk_free(struct task_struct *p);
int setup_signal_shadow_stack(struct ksignal *ksig);
int restore_signal_shadow_stack(void);
int shstk_update_last_frame(unsigned long val);
bool shstk_is_enabled(void);
+int shstk_pop(u64 *val);
+int shstk_push(u64 val);
#else
static inline long shstk_prctl(struct task_struct *task, int option,
unsigned long arg2) { return -EINVAL; }
static inline void reset_thread_features(void) {}
static inline unsigned long shstk_alloc_thread_stack(struct task_struct *p,
- unsigned long clone_flags,
+ u64 clone_flags,
unsigned long stack_size) { return 0; }
static inline void shstk_free(struct task_struct *p) {}
static inline int setup_signal_shadow_stack(struct ksignal *ksig) { return 0; }
static inline int restore_signal_shadow_stack(void) { return 0; }
static inline int shstk_update_last_frame(unsigned long val) { return 0; }
static inline bool shstk_is_enabled(void) { return false; }
+static inline int shstk_pop(u64 *val) { return -ENOTSUPP; }
+static inline int shstk_push(u64 val) { return -ENOTSUPP; }
#endif /* CONFIG_X86_USER_SHADOW_STACK */
#endif /* __ASSEMBLER__ */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 9282465eea21..e71e0e8362ed 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -80,56 +80,42 @@ struct thread_info {
#endif
/*
- * thread information flags
- * - these are process state flags that various assembly files
- * may need to access
+ * Tell the generic TIF infrastructure which bits x86 supports
*/
-#define TIF_NOTIFY_RESUME 1 /* callback before returning to user */
-#define TIF_SIGPENDING 2 /* signal pending */
-#define TIF_NEED_RESCHED 3 /* rescheduling necessary */
-#define TIF_NEED_RESCHED_LAZY 4 /* Lazy rescheduling needed */
-#define TIF_SINGLESTEP 5 /* reenable singlestep on user return*/
-#define TIF_SSBD 6 /* Speculative store bypass disable */
-#define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */
-#define TIF_SPEC_L1D_FLUSH 10 /* Flush L1D on mm switches (processes) */
-#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
-#define TIF_UPROBE 12 /* breakpointed or singlestepping */
-#define TIF_PATCH_PENDING 13 /* pending live patching update */
-#define TIF_NEED_FPU_LOAD 14 /* load FPU on return to userspace */
-#define TIF_NOCPUID 15 /* CPUID is not accessible in userland */
-#define TIF_NOTSC 16 /* TSC is not accessible in userland */
-#define TIF_NOTIFY_SIGNAL 17 /* signal notifications exist */
-#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
-#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */
+#define HAVE_TIF_NEED_RESCHED_LAZY
+#define HAVE_TIF_POLLING_NRFLAG
+#define HAVE_TIF_SINGLESTEP
+
+#include <asm-generic/thread_info_tif.h>
+
+/* Architecture specific TIF space starts at 16 */
+#define TIF_SSBD 16 /* Speculative store bypass disable */
+#define TIF_SPEC_IB 17 /* Indirect branch speculation mitigation */
+#define TIF_SPEC_L1D_FLUSH 18 /* Flush L1D on mm switches (processes) */
+#define TIF_NEED_FPU_LOAD 19 /* load FPU on return to userspace */
+#define TIF_NOCPUID 20 /* CPUID is not accessible in userland */
+#define TIF_NOTSC 21 /* TSC is not accessible in userland */
#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
#define TIF_SPEC_FORCE_UPDATE 23 /* Force speculation MSR update in context switch */
#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
-#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
+#define TIF_SINGLESTEP 25 /* reenable singlestep on user return */
+#define TIF_BLOCKSTEP 26 /* set when we want DEBUGCTLMSR_BTF */
#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
-#define TIF_ADDR32 29 /* 32-bit address space on 64 bits */
-
-#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
-#define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
-#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
-#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
-#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP)
-#define _TIF_SSBD (1 << TIF_SSBD)
-#define _TIF_SPEC_IB (1 << TIF_SPEC_IB)
-#define _TIF_SPEC_L1D_FLUSH (1 << TIF_SPEC_L1D_FLUSH)
-#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
-#define _TIF_UPROBE (1 << TIF_UPROBE)
-#define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING)
-#define _TIF_NEED_FPU_LOAD (1 << TIF_NEED_FPU_LOAD)
-#define _TIF_NOCPUID (1 << TIF_NOCPUID)
-#define _TIF_NOTSC (1 << TIF_NOTSC)
-#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL)
-#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG)
-#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
-#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE)
-#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
-#define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP)
-#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
-#define _TIF_ADDR32 (1 << TIF_ADDR32)
+#define TIF_ADDR32 28 /* 32-bit address space on 64 bits */
+
+#define _TIF_SSBD BIT(TIF_SSBD)
+#define _TIF_SPEC_IB BIT(TIF_SPEC_IB)
+#define _TIF_SPEC_L1D_FLUSH BIT(TIF_SPEC_L1D_FLUSH)
+#define _TIF_NEED_FPU_LOAD BIT(TIF_NEED_FPU_LOAD)
+#define _TIF_NOCPUID BIT(TIF_NOCPUID)
+#define _TIF_NOTSC BIT(TIF_NOTSC)
+#define _TIF_IO_BITMAP BIT(TIF_IO_BITMAP)
+#define _TIF_SPEC_FORCE_UPDATE BIT(TIF_SPEC_FORCE_UPDATE)
+#define _TIF_FORCED_TF BIT(TIF_FORCED_TF)
+#define _TIF_BLOCKSTEP BIT(TIF_BLOCKSTEP)
+#define _TIF_SINGLESTEP BIT(TIF_SINGLESTEP)
+#define _TIF_LAZY_MMU_UPDATES BIT(TIF_LAZY_MMU_UPDATES)
+#define _TIF_ADDR32 BIT(TIF_ADDR32)
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW_BASE \
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 6c79ee7c0957..21041898157a 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -231,6 +231,16 @@ static inline bool topology_is_primary_thread(unsigned int cpu)
}
#define topology_is_primary_thread topology_is_primary_thread
+int topology_get_primary_thread(unsigned int cpu);
+
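+/* A core is considered online when its primary SMT thread is online. */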
+static inline bool topology_is_core_online(unsigned int cpu)
+{
+ int pcpu = topology_get_primary_thread(cpu);
+
+ return pcpu >= 0 ? cpu_online(pcpu) : false;
+}
+#define topology_is_core_online topology_is_core_online
+
#else /* CONFIG_SMP */
static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; }
static inline int topology_max_smt_threads(void) { return 1; }
diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index 678fb546f0a7..1ee2e5115955 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -20,6 +20,11 @@ typedef u8 uprobe_opcode_t;
#define UPROBE_SWBP_INSN 0xcc
#define UPROBE_SWBP_INSN_SIZE 1
+enum {
+ ARCH_UPROBE_FLAG_CAN_OPTIMIZE = 0,
+ ARCH_UPROBE_FLAG_OPTIMIZE_FAIL = 1,
+};
+
struct uprobe_xol_ops;
struct arch_uprobe {
@@ -45,6 +50,8 @@ struct arch_uprobe {
u8 ilen;
} push;
};
+
+ unsigned long flags;
};
struct arch_uprobe_task {
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 59a62c3780a2..a16d4631547c 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -94,12 +94,13 @@ DECLARE_STATIC_CALL(xen_hypercall, xen_hypercall_func);
#ifdef MODULE
#define __ADDRESSABLE_xen_hypercall
#else
-#define __ADDRESSABLE_xen_hypercall __ADDRESSABLE_ASM_STR(__SCK__xen_hypercall)
+#define __ADDRESSABLE_xen_hypercall \
+ __stringify(.global STATIC_CALL_KEY(xen_hypercall);)
#endif
#define __HYPERCALL \
__ADDRESSABLE_xen_hypercall \
- "call __SCT__xen_hypercall"
+ __stringify(call STATIC_CALL_TRAMP(xen_hypercall))
#define __HYPERCALL_ENTRY(x) "a" (x)
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 85e63d58c074..59f642a94b9d 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -12,9 +12,9 @@
#include <asm/extable.h>
#include <asm/page.h>
+#include <xen/xen.h>
#include <xen/interface/xen.h>
#include <xen/interface/grant_table.h>
-#include <xen/features.h>
/* Xen machine address */
typedef struct xmaddr {
@@ -162,7 +162,7 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn)
* pfn_to_mfn. This will have to be removed when we figured
* out which call.
*/
- if (xen_feature(XENFEAT_auto_translated_physmap))
+ if (!xen_pv_domain())
return pfn;
mfn = __pfn_to_mfn(pfn);
@@ -175,7 +175,7 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn)
static inline int phys_to_machine_mapping_valid(unsigned long pfn)
{
- if (xen_feature(XENFEAT_auto_translated_physmap))
+ if (!xen_pv_domain())
return 1;
return __pfn_to_mfn(pfn) != INVALID_P2M_ENTRY;
@@ -210,7 +210,7 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
* gfn_to_pfn. This will have to be removed when we figure
* out which call.
*/
- if (xen_feature(XENFEAT_auto_translated_physmap))
+ if (!xen_pv_domain())
return mfn;
pfn = mfn_to_pfn_no_overrides(mfn);
@@ -242,7 +242,7 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine)
/* Pseudo-physical <-> Guest conversion */
static inline unsigned long pfn_to_gfn(unsigned long pfn)
{
- if (xen_feature(XENFEAT_auto_translated_physmap))
+ if (!xen_pv_domain())
return pfn;
else
return pfn_to_mfn(pfn);
@@ -250,7 +250,7 @@ static inline unsigned long pfn_to_gfn(unsigned long pfn)
static inline unsigned long gfn_to_pfn(unsigned long gfn)
{
- if (xen_feature(XENFEAT_auto_translated_physmap))
+ if (!xen_pv_domain())
return gfn;
else
return mfn_to_pfn(gfn);
@@ -284,7 +284,7 @@ static inline unsigned long bfn_to_local_pfn(unsigned long mfn)
{
unsigned long pfn;
- if (xen_feature(XENFEAT_auto_translated_physmap))
+ if (!xen_pv_domain())
return mfn;
pfn = mfn_to_pfn(mfn);
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index 9c640a521a67..650e3256ea7d 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -118,6 +118,10 @@
#define SVM_VMGEXIT_AP_CREATE 1
#define SVM_VMGEXIT_AP_DESTROY 2
#define SVM_VMGEXIT_SNP_RUN_VMPL 0x80000018
+#define SVM_VMGEXIT_SAVIC 0x8000001a
+#define SVM_VMGEXIT_SAVIC_REGISTER_GPA 0
+#define SVM_VMGEXIT_SAVIC_UNREGISTER_GPA 1
+#define SVM_VMGEXIT_SAVIC_SELF_GPA ~0ULL
#define SVM_VMGEXIT_HV_FEATURES 0x8000fffd
#define SVM_VMGEXIT_TERM_REQUEST 0x8000fffe
#define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code) \
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0d2a6d953be9..bc184dd38d99 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -148,7 +148,7 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
-obj-$(CONFIG_CFI_CLANG) += cfi.o
+obj-$(CONFIG_CFI) += cfi.o
obj-$(CONFIG_CALL_THUNKS) += callthunks.o
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 7bde68247b5f..79ae9cb50019 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1170,7 +1170,7 @@ void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { }
#ifdef CONFIG_CFI_AUTO_DEFAULT
# define __CFI_DEFAULT CFI_AUTO
-#elif defined(CONFIG_CFI_CLANG)
+#elif defined(CONFIG_CFI)
# define __CFI_DEFAULT CFI_KCFI
#else
# define __CFI_DEFAULT CFI_OFF
@@ -1182,7 +1182,7 @@ enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT;
bool cfi_bhi __ro_after_init = false;
#endif
-#ifdef CONFIG_CFI_CLANG
+#ifdef CONFIG_CFI
u32 cfi_get_func_hash(void *func)
{
u32 hash;
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 52d1808ee360..581db89477f9 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -18,6 +18,7 @@ ifeq ($(CONFIG_X86_64),y)
# APIC probe will depend on the listing order here
obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o
obj-$(CONFIG_X86_UV) += x2apic_uv_x.o
+obj-$(CONFIG_AMD_SECURE_AVIC) += x2apic_savic.o
obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
obj-y += apic_flat_64.o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index d73ba5a7b623..680d305589a3 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -592,6 +592,8 @@ static void setup_APIC_timer(void)
0xF, ~0UL);
} else
clockevents_register_device(levt);
+
+ apic_update_vector(smp_processor_id(), LOCAL_TIMER_VECTOR, true);
}
/*
@@ -1168,6 +1170,9 @@ void disable_local_APIC(void)
if (!apic_accessible())
return;
+ if (apic->teardown)
+ apic->teardown();
+
apic_soft_disable();
#ifdef CONFIG_X86_32
@@ -1428,63 +1433,61 @@ union apic_ir {
u32 regs[APIC_IR_REGS];
};
-static bool apic_check_and_ack(union apic_ir *irr, union apic_ir *isr)
+static bool apic_check_and_eoi_isr(union apic_ir *isr)
{
int i, bit;
- /* Read the IRRs */
- for (i = 0; i < APIC_IR_REGS; i++)
- irr->regs[i] = apic_read(APIC_IRR + i * 0x10);
-
/* Read the ISRs */
for (i = 0; i < APIC_IR_REGS; i++)
isr->regs[i] = apic_read(APIC_ISR + i * 0x10);
+	/* If the ISR map is empty, nothing to do here. */
+ if (bitmap_empty(isr->map, APIC_IR_BITS))
+ return true;
+
/*
- * If the ISR map is not empty. ACK the APIC and run another round
- * to verify whether a pending IRR has been unblocked and turned
- * into a ISR.
+ * There can be multiple ISR bits set when a high priority
+ * interrupt preempted a lower priority one. Issue an EOI for each
+ * set bit. The priority traversal order does not matter as there
+ * can't be new ISR bits raised at this point. What matters is that
+ * an EOI is issued for each ISR bit.
*/
- if (!bitmap_empty(isr->map, APIC_IR_BITS)) {
- /*
- * There can be multiple ISR bits set when a high priority
- * interrupt preempted a lower priority one. Issue an ACK
- * per set bit.
- */
- for_each_set_bit(bit, isr->map, APIC_IR_BITS)
- apic_eoi();
- return true;
- }
+ for_each_set_bit(bit, isr->map, APIC_IR_BITS)
+ apic_eoi();
- return !bitmap_empty(irr->map, APIC_IR_BITS);
+ /* Reread the ISRs, they should be empty now */
+ for (i = 0; i < APIC_IR_REGS; i++)
+ isr->regs[i] = apic_read(APIC_ISR + i * 0x10);
+
+ return bitmap_empty(isr->map, APIC_IR_BITS);
}
/*
- * After a crash, we no longer service the interrupts and a pending
- * interrupt from previous kernel might still have ISR bit set.
+ * If a CPU services an interrupt and crashes before issuing EOI to the
+ * local APIC, the corresponding ISR bit is still set when the crashing CPU
+ * jumps into a crash kernel. Read the ISR and issue an EOI for each set
+ * bit to acknowledge it, as otherwise these slots would be locked forever
+ * waiting for an EOI.
*
- * Most probably by now the CPU has serviced that pending interrupt and it
- * might not have done the apic_eoi() because it thought, interrupt
- * came from i8259 as ExtInt. LAPIC did not get EOI so it does not clear
- * the ISR bit and cpu thinks it has already serviced the interrupt. Hence
- * a vector might get locked. It was noticed for timer irq (vector
- * 0x31). Issue an extra EOI to clear ISR.
+ * If there are pending bits in the IRR, then they won't be converted into
+ * ISR bits as the CPU has interrupts disabled. They will be delivered once
+ * the CPU enables interrupts and there is nothing which can prevent that.
*
- * If there are pending IRR bits they turn into ISR bits after a higher
- * priority ISR bit has been acked.
+ * In the worst case this results in spurious interrupt warnings.
*/
-static void apic_pending_intr_clear(void)
+static void apic_clear_isr(void)
{
- union apic_ir irr, isr;
+ union apic_ir ir;
unsigned int i;
- /* 512 loops are way oversized and give the APIC a chance to obey. */
- for (i = 0; i < 512; i++) {
- if (!apic_check_and_ack(&irr, &isr))
- return;
- }
- /* Dump the IRR/ISR content if that failed */
- pr_warn("APIC: Stale IRR: %256pb ISR: %256pb\n", irr.map, isr.map);
+ if (!apic_check_and_eoi_isr(&ir))
+ pr_warn("APIC: Stale ISR: %256pb\n", ir.map);
+
+ for (i = 0; i < APIC_IR_REGS; i++)
+ ir.regs[i] = apic_read(APIC_IRR + i * 0x10);
+
+ if (!bitmap_empty(ir.map, APIC_IR_BITS))
+ pr_warn("APIC: Stale IRR: %256pb\n", ir.map);
}
/**
@@ -1503,6 +1506,9 @@ static void setup_local_APIC(void)
return;
}
+ if (apic->setup)
+ apic->setup();
+
/*
* If this comes from kexec/kcrash the APIC might be enabled in
* SPIV. Soft disable it before doing further initialization.
@@ -1541,8 +1547,7 @@ static void setup_local_APIC(void)
value |= 0x10;
apic_write(APIC_TASKPRI, value);
- /* Clear eventually stale ISR/IRR bits */
- apic_pending_intr_clear();
+ apic_clear_isr();
/*
* Now that we are all set up, enable the APIC
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index a947b46a8b64..bddc54465399 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -134,13 +134,20 @@ static void apic_update_irq_cfg(struct irq_data *irqd, unsigned int vector,
apicd->hw_irq_cfg.vector = vector;
apicd->hw_irq_cfg.dest_apicid = apic->calc_dest_apicid(cpu);
+
+ apic_update_vector(cpu, vector, true);
+
irq_data_update_effective_affinity(irqd, cpumask_of(cpu));
- trace_vector_config(irqd->irq, vector, cpu,
- apicd->hw_irq_cfg.dest_apicid);
+ trace_vector_config(irqd->irq, vector, cpu, apicd->hw_irq_cfg.dest_apicid);
}
-static void apic_update_vector(struct irq_data *irqd, unsigned int newvec,
- unsigned int newcpu)
+static void apic_free_vector(unsigned int cpu, unsigned int vector, bool managed)
+{
+ apic_update_vector(cpu, vector, false);
+ irq_matrix_free(vector_matrix, cpu, vector, managed);
+}
+
+static void chip_data_update(struct irq_data *irqd, unsigned int newvec, unsigned int newcpu)
{
struct apic_chip_data *apicd = apic_chip_data(irqd);
struct irq_desc *desc = irq_data_to_desc(irqd);
@@ -174,8 +181,7 @@ static void apic_update_vector(struct irq_data *irqd, unsigned int newvec,
apicd->prev_cpu = apicd->cpu;
WARN_ON_ONCE(apicd->cpu == newcpu);
} else {
- irq_matrix_free(vector_matrix, apicd->cpu, apicd->vector,
- managed);
+ apic_free_vector(apicd->cpu, apicd->vector, managed);
}
setnew:
@@ -261,7 +267,7 @@ assign_vector_locked(struct irq_data *irqd, const struct cpumask *dest)
trace_vector_alloc(irqd->irq, vector, resvd, vector);
if (vector < 0)
return vector;
- apic_update_vector(irqd, vector, cpu);
+ chip_data_update(irqd, vector, cpu);
return 0;
}
@@ -337,7 +343,7 @@ assign_managed_vector(struct irq_data *irqd, const struct cpumask *dest)
trace_vector_alloc_managed(irqd->irq, vector, vector);
if (vector < 0)
return vector;
- apic_update_vector(irqd, vector, cpu);
+ chip_data_update(irqd, vector, cpu);
return 0;
}
@@ -357,7 +363,7 @@ static void clear_irq_vector(struct irq_data *irqd)
apicd->prev_cpu);
per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_SHUTDOWN;
- irq_matrix_free(vector_matrix, apicd->cpu, vector, managed);
+ apic_free_vector(apicd->cpu, vector, managed);
apicd->vector = 0;
/* Clean up move in progress */
@@ -366,7 +372,7 @@ static void clear_irq_vector(struct irq_data *irqd)
return;
per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_SHUTDOWN;
- irq_matrix_free(vector_matrix, apicd->prev_cpu, vector, managed);
+ apic_free_vector(apicd->prev_cpu, vector, managed);
apicd->prev_vector = 0;
apicd->move_in_progress = 0;
hlist_del_init(&apicd->clist);
@@ -905,7 +911,7 @@ static void free_moved_vector(struct apic_chip_data *apicd)
* affinity mask comes online.
*/
trace_vector_free_moved(apicd->irq, cpu, vector, managed);
- irq_matrix_free(vector_matrix, cpu, vector, managed);
+ apic_free_vector(cpu, vector, managed);
per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
hlist_del_init(&apicd->clist);
apicd->prev_vector = 0;
diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c
new file mode 100644
index 000000000000..dbc5678bc3b6
--- /dev/null
+++ b/arch/x86/kernel/apic/x2apic_savic.c
@@ -0,0 +1,428 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD Secure AVIC Support (SEV-SNP Guests)
+ *
+ * Copyright (C) 2024 Advanced Micro Devices, Inc.
+ *
+ * Author: Neeraj Upadhyay <Neeraj.Upadhyay@amd.com>
+ */
+
+#include <linux/cc_platform.h>
+#include <linux/cpumask.h>
+#include <linux/percpu-defs.h>
+#include <linux/align.h>
+
+#include <asm/apic.h>
+#include <asm/sev.h>
+
+#include "local.h"
+
+struct secure_avic_page {
+ u8 regs[PAGE_SIZE];
+} __aligned(PAGE_SIZE);
+
+static struct secure_avic_page __percpu *savic_page __ro_after_init;
+
+static int savic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ return x2apic_enabled() && cc_platform_has(CC_ATTR_SNP_SECURE_AVIC);
+}
+
+static inline void *get_reg_bitmap(unsigned int cpu, unsigned int offset)
+{
+ return &per_cpu_ptr(savic_page, cpu)->regs[offset];
+}
+
+static inline void update_vector(unsigned int cpu, unsigned int offset,
+ unsigned int vector, bool set)
+{
+ void *bitmap = get_reg_bitmap(cpu, offset);
+
+ if (set)
+ apic_set_vector(vector, bitmap);
+ else
+ apic_clear_vector(vector, bitmap);
+}
+
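+/*
+ * Offset of the Secure AVIC "allowed IRR" registers in the APIC backing
+ * page. Bits here are set/cleared via the .update_vector() callback as
+ * vectors are allocated and freed on a CPU.
+ */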
+#define SAVIC_ALLOWED_IRR 0x204
+
+/*
+ * When Secure AVIC is enabled, RDMSR/WRMSR of the APIC registers
+ * result in a #VC exception (for non-accelerated register accesses)
+ * with VMEXIT_AVIC_NOACCEL error code. The #VC exception handler
+ * can read/write the x2APIC register in the guest APIC backing page.
+ *
+ * Since doing this would increase the latency of accessing x2APIC
+ * registers, instead of doing RDMSR/WRMSR based accesses and
+ * handling the APIC register reads/writes in the #VC exception handler,
+ * the read() and write() callbacks directly read/write the APIC register
+ * from/to the vCPU's APIC backing page.
+ */
+static u32 savic_read(u32 reg)
+{
+ void *ap = this_cpu_ptr(savic_page);
+
+ switch (reg) {
+ case APIC_LVTT:
+ case APIC_TMICT:
+ case APIC_TMCCT:
+ case APIC_TDCR:
+ case APIC_LVTTHMR:
+ case APIC_LVTPC:
+ case APIC_LVT0:
+ case APIC_LVT1:
+ case APIC_LVTERR:
+ return savic_ghcb_msr_read(reg);
+ case APIC_ID:
+ case APIC_LVR:
+ case APIC_TASKPRI:
+ case APIC_ARBPRI:
+ case APIC_PROCPRI:
+ case APIC_LDR:
+ case APIC_SPIV:
+ case APIC_ESR:
+ case APIC_EFEAT:
+ case APIC_ECTRL:
+ case APIC_SEOI:
+ case APIC_IER:
+ case APIC_EILVTn(0) ... APIC_EILVTn(3):
+ return apic_get_reg(ap, reg);
+ case APIC_ICR:
+ return (u32)apic_get_reg64(ap, reg);
+ case APIC_ISR ... APIC_ISR + 0x70:
+ case APIC_TMR ... APIC_TMR + 0x70:
+ if (WARN_ONCE(!IS_ALIGNED(reg, 16),
+ "APIC register read offset 0x%x not aligned at 16 bytes", reg))
+ return 0;
+ return apic_get_reg(ap, reg);
+ /* IRR and ALLOWED_IRR offset range */
+ case APIC_IRR ... APIC_IRR + 0x74:
+ /*
+ * Valid APIC_IRR/SAVIC_ALLOWED_IRR registers are at 16 bytes strides from
+ * their respective base offset. APIC_IRRs are in the range
+ *
+ * (0x200, 0x210, ..., 0x270)
+ *
+ * while the SAVIC_ALLOWED_IRR range starts 4 bytes later, in the range
+ *
+ * (0x204, 0x214, ..., 0x274).
+ *
+ * Filter out everything else.
+ */
+ if (WARN_ONCE(!(IS_ALIGNED(reg, 16) ||
+ IS_ALIGNED(reg - 4, 16)),
+ "Misaligned APIC_IRR/ALLOWED_IRR APIC register read offset 0x%x", reg))
+ return 0;
+ return apic_get_reg(ap, reg);
+ default:
+ pr_err("Error reading unknown Secure AVIC reg offset 0x%x\n", reg);
+ return 0;
+ }
+}
+
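+/* Offset of the Secure AVIC NMI request field in the APIC backing page. */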
+#define SAVIC_NMI_REQ 0x278
+
+/*
+ * On WRMSR to APIC_SELF_IPI register by the guest, Secure AVIC hardware
+ * updates the APIC_IRR in the APIC backing page of the vCPU. In addition,
+ * hardware evaluates the new APIC_IRR update for interrupt injection to
+ * the vCPU. So, self IPIs are hardware-accelerated.
+ */
+static inline void self_ipi_reg_write(unsigned int vector)
+{
+ native_apic_msr_write(APIC_SELF_IPI, vector);
+}
+
+static void send_ipi_dest(unsigned int cpu, unsigned int vector, bool nmi)
+{
+ if (nmi)
+ apic_set_reg(per_cpu_ptr(savic_page, cpu), SAVIC_NMI_REQ, 1);
+ else
+ update_vector(cpu, APIC_IRR, vector, true);
+}
+
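+/*
+ * Set the IRR (or NMI request) bit in the APIC backing page of every
+ * online CPU except the sending one.
+ */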
+static void send_ipi_allbut(unsigned int vector, bool nmi)
+{
+ unsigned int cpu, src_cpu;
+
+ guard(irqsave)();
+
+ src_cpu = raw_smp_processor_id();
+
+ for_each_cpu(cpu, cpu_online_mask) {
+ if (cpu == src_cpu)
+ continue;
+ send_ipi_dest(cpu, vector, nmi);
+ }
+}
+
+static inline void self_ipi(unsigned int vector, bool nmi)
+{
+ u32 icr_low = APIC_SELF_IPI | vector;
+
+ if (nmi)
+ icr_low |= APIC_DM_NMI;
+
+ native_x2apic_icr_write(icr_low, 0);
+}
+
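+/*
+ * Emulate an ICR write: decode the destination shorthand, set the vector
+ * (or NMI request) in the target vCPUs' APIC backing pages, notify the
+ * hypervisor of the request (except for self-IPIs) and mirror the ICR
+ * value into the local backing page.
+ */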
+static void savic_icr_write(u32 icr_low, u32 icr_high)
+{
+ unsigned int dsh, vector;
+ u64 icr_data;
+ bool nmi;
+
+ dsh = icr_low & APIC_DEST_ALLBUT;
+ vector = icr_low & APIC_VECTOR_MASK;
+ nmi = ((icr_low & APIC_DM_FIXED_MASK) == APIC_DM_NMI);
+
+ switch (dsh) {
+ case APIC_DEST_SELF:
+ self_ipi(vector, nmi);
+ break;
+ case APIC_DEST_ALLINC:
+ self_ipi(vector, nmi);
+ fallthrough;
+ case APIC_DEST_ALLBUT:
+ send_ipi_allbut(vector, nmi);
+ break;
+ default:
+ send_ipi_dest(icr_high, vector, nmi);
+ break;
+ }
+
+ icr_data = ((u64)icr_high) << 32 | icr_low;
+ if (dsh != APIC_DEST_SELF)
+ savic_ghcb_msr_write(APIC_ICR, icr_data);
+ apic_set_reg64(this_cpu_ptr(savic_page), APIC_ICR, icr_data);
+}
+
+static void savic_write(u32 reg, u32 data)
+{
+ void *ap = this_cpu_ptr(savic_page);
+
+ switch (reg) {
+ case APIC_LVTT:
+ case APIC_TMICT:
+ case APIC_TDCR:
+ case APIC_LVT0:
+ case APIC_LVT1:
+ case APIC_LVTTHMR:
+ case APIC_LVTPC:
+ case APIC_LVTERR:
+ savic_ghcb_msr_write(reg, data);
+ break;
+ case APIC_TASKPRI:
+ case APIC_EOI:
+ case APIC_SPIV:
+ case SAVIC_NMI_REQ:
+ case APIC_ESR:
+ case APIC_ECTRL:
+ case APIC_SEOI:
+ case APIC_IER:
+ case APIC_EILVTn(0) ... APIC_EILVTn(3):
+ apic_set_reg(ap, reg, data);
+ break;
+ case APIC_ICR:
+ savic_icr_write(data, 0);
+ break;
+ case APIC_SELF_IPI:
+ self_ipi_reg_write(data);
+ break;
+ /* ALLOWED_IRR offsets are writable */
+ case SAVIC_ALLOWED_IRR ... SAVIC_ALLOWED_IRR + 0x70:
+ if (IS_ALIGNED(reg - 4, 16)) {
+ apic_set_reg(ap, reg, data);
+ break;
+ }
+ fallthrough;
+ default:
+ pr_err("Error writing unknown Secure AVIC reg offset 0x%x\n", reg);
+ }
+}
+
+static void send_ipi(u32 dest, unsigned int vector, unsigned int dsh)
+{
+ unsigned int icr_low;
+
+ icr_low = __prepare_ICR(dsh, vector, APIC_DEST_PHYSICAL);
+ savic_icr_write(icr_low, dest);
+}
+
+static void savic_send_ipi(int cpu, int vector)
+{
+ u32 dest = per_cpu(x86_cpu_to_apicid, cpu);
+
+ send_ipi(dest, vector, 0);
+}
+
+static void send_ipi_mask(const struct cpumask *mask, unsigned int vector, bool excl_self)
+{
+ unsigned int cpu, this_cpu;
+
+ guard(irqsave)();
+
+ this_cpu = raw_smp_processor_id();
+
+ for_each_cpu(cpu, mask) {
+ if (excl_self && cpu == this_cpu)
+ continue;
+ send_ipi(per_cpu(x86_cpu_to_apicid, cpu), vector, 0);
+ }
+}
+
+static void savic_send_ipi_mask(const struct cpumask *mask, int vector)
+{
+ send_ipi_mask(mask, vector, false);
+}
+
+static void savic_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
+{
+ send_ipi_mask(mask, vector, true);
+}
+
+static void savic_send_ipi_allbutself(int vector)
+{
+ send_ipi(0, vector, APIC_DEST_ALLBUT);
+}
+
+static void savic_send_ipi_all(int vector)
+{
+ send_ipi(0, vector, APIC_DEST_ALLINC);
+}
+
+static void savic_send_ipi_self(int vector)
+{
+ self_ipi_reg_write(vector);
+}
+
+static void savic_update_vector(unsigned int cpu, unsigned int vector, bool set)
+{
+ update_vector(cpu, SAVIC_ALLOWED_IRR, vector, set);
+}
+
+static void savic_eoi(void)
+{
+ unsigned int cpu;
+ int vec;
+
+ cpu = raw_smp_processor_id();
+ vec = apic_find_highest_vector(get_reg_bitmap(cpu, APIC_ISR));
+ if (WARN_ONCE(vec == -1, "EOI write while no active interrupt in APIC_ISR"))
+ return;
+
+	/* Is this a level-triggered interrupt? */
+ if (apic_test_vector(vec, get_reg_bitmap(cpu, APIC_TMR))) {
+ update_vector(cpu, APIC_ISR, vec, false);
+ /*
+ * Propagate the EOI write to the hypervisor for level-triggered
+ * interrupts. Return to the guest from GHCB protocol event takes
+ * care of re-evaluating interrupt state.
+ */
+ savic_ghcb_msr_write(APIC_EOI, 0);
+ } else {
+ /*
+ * Hardware clears APIC_ISR and re-evaluates the interrupt state
+ * to determine if there is any pending interrupt which can be
+ * delivered to CPU.
+ */
+ native_apic_msr_eoi();
+ }
+}
+
+static void savic_teardown(void)
+{
+ /* Disable Secure AVIC */
+ native_wrmsrq(MSR_AMD64_SAVIC_CONTROL, 0);
+ savic_unregister_gpa(NULL);
+}
+
+static void savic_setup(void)
+{
+ void *ap = this_cpu_ptr(savic_page);
+ enum es_result res;
+ unsigned long gpa;
+
+ /*
+ * Before Secure AVIC is enabled, APIC MSR reads are intercepted.
+ * APIC_ID MSR read returns the value from the hypervisor.
+ */
+ apic_set_reg(ap, APIC_ID, native_apic_msr_read(APIC_ID));
+
+ gpa = __pa(ap);
+
+ /*
+ * The NPT entry for a vCPU's APIC backing page must always be
+ * present when the vCPU is running in order for Secure AVIC to
+ * function. A VMEXIT_BUSY is returned on VMRUN and the vCPU cannot
+ * be resumed if the NPT entry for the APIC backing page is not
+	 * present. Notify the hypervisor of the GPA of the vCPU's APIC
+	 * backing page by calling savic_register_gpa(). Before executing
+ * VMRUN, the hypervisor makes use of this information to make sure
+ * the APIC backing page is mapped in NPT.
+ */
+ res = savic_register_gpa(gpa);
+ if (res != ES_OK)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL);
+
+ native_wrmsrq(MSR_AMD64_SAVIC_CONTROL,
+ gpa | MSR_AMD64_SAVIC_EN | MSR_AMD64_SAVIC_ALLOWEDNMI);
+}
+
+static int savic_probe(void)
+{
+ if (!cc_platform_has(CC_ATTR_SNP_SECURE_AVIC))
+ return 0;
+
+ if (!x2apic_mode) {
+ pr_err("Secure AVIC enabled in non x2APIC mode\n");
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL);
+ /* unreachable */
+ }
+
+ savic_page = alloc_percpu(struct secure_avic_page);
+ if (!savic_page)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL);
+
+ return 1;
+}
+
+static struct apic apic_x2apic_savic __ro_after_init = {
+
+ .name = "secure avic x2apic",
+ .probe = savic_probe,
+ .acpi_madt_oem_check = savic_acpi_madt_oem_check,
+ .setup = savic_setup,
+ .teardown = savic_teardown,
+
+ .dest_mode_logical = false,
+
+ .disable_esr = 0,
+
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+
+ .max_apic_id = UINT_MAX,
+ .x2apic_set_max_apicid = true,
+ .get_apic_id = x2apic_get_apic_id,
+
+ .calc_dest_apicid = apic_default_calc_apicid,
+
+ .send_IPI = savic_send_ipi,
+ .send_IPI_mask = savic_send_ipi_mask,
+ .send_IPI_mask_allbutself = savic_send_ipi_mask_allbutself,
+ .send_IPI_allbutself = savic_send_ipi_allbutself,
+ .send_IPI_all = savic_send_ipi_all,
+ .send_IPI_self = savic_send_ipi_self,
+
+ .nmi_to_offline_cpu = true,
+
+ .read = savic_read,
+ .write = savic_write,
+ .eoi = savic_eoi,
+ .icr_read = native_x2apic_icr_read,
+ .icr_write = savic_icr_write,
+
+ .update_vector = savic_update_vector,
+};
+
+apic_driver(apic_x2apic_savic);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 1e26179ff18c..2f8a58ef690e 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -58,6 +58,7 @@ obj-$(CONFIG_X86_SGX) += sgx/
obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o
+obj-$(CONFIG_BHYVE_GUEST) += bhyve.o
obj-$(CONFIG_ACRN_GUEST) += acrn.o
obj-$(CONFIG_DEBUG_FS) += debugfs.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a5ece6ebe8a7..a6f88ca1a6b4 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1326,8 +1326,8 @@ static const char * const s5_reset_reason_txt[] = {
static __init int print_s5_reset_status_mmio(void)
{
- unsigned long value;
void __iomem *addr;
+ u32 value;
int i;
if (!cpu_feature_enabled(X86_FEATURE_ZEN))
@@ -1340,12 +1340,16 @@ static __init int print_s5_reset_status_mmio(void)
value = ioread32(addr);
iounmap(addr);
+ /* Value with "all bits set" is an error response and should be ignored. */
+ if (value == U32_MAX)
+ return 0;
+
for (i = 0; i < ARRAY_SIZE(s5_reset_reason_txt); i++) {
if (!(value & BIT(i)))
continue;
if (s5_reset_reason_txt[i]) {
- pr_info("x86/amd: Previous system reset reason [0x%08lx]: %s\n",
+ pr_info("x86/amd: Previous system reset reason [0x%08x]: %s\n",
value, s5_reset_reason_txt[i]);
}
}
diff --git a/arch/x86/kernel/cpu/bhyve.c b/arch/x86/kernel/cpu/bhyve.c
new file mode 100644
index 000000000000..f1a8ca3dd1ed
--- /dev/null
+++ b/arch/x86/kernel/cpu/bhyve.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * FreeBSD Bhyve guest enlightenments
+ *
+ * Copyright © 2025 Amazon.com, Inc. or its affiliates.
+ *
+ * Author: David Woodhouse <dwmw2@infradead.org>
+ */
+
+#include <linux/init.h>
+#include <linux/export.h>
+#include <asm/processor.h>
+#include <asm/hypervisor.h>
+
+static uint32_t bhyve_cpuid_base;
+static uint32_t bhyve_cpuid_max;
+
+#define BHYVE_SIGNATURE "bhyve bhyve "
+
+#define CPUID_BHYVE_FEATURES 0x40000001
+
+/* Features advertised in CPUID_BHYVE_FEATURES %eax */
+
+/* MSI Extended Dest ID */
+#define CPUID_BHYVE_FEAT_EXT_DEST_ID (1UL << 0)
+
+static uint32_t __init bhyve_detect(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
+ return 0;
+
+ bhyve_cpuid_base = cpuid_base_hypervisor(BHYVE_SIGNATURE, 0);
+ if (!bhyve_cpuid_base)
+ return 0;
+
+ bhyve_cpuid_max = cpuid_eax(bhyve_cpuid_base);
+ return bhyve_cpuid_max;
+}
+
+static uint32_t bhyve_features(void)
+{
+ unsigned int cpuid_leaf = bhyve_cpuid_base | CPUID_BHYVE_FEATURES;
+
+ if (bhyve_cpuid_max < cpuid_leaf)
+ return 0;
+
+ return cpuid_eax(cpuid_leaf);
+}
+
+static bool __init bhyve_ext_dest_id(void)
+{
+ return !!(bhyve_features() & CPUID_BHYVE_FEAT_EXT_DEST_ID);
+}
+
+static bool __init bhyve_x2apic_available(void)
+{
+ return true;
+}
+
+const struct hypervisor_x86 x86_hyper_bhyve __refconst = {
+ .name = "Bhyve",
+ .detect = bhyve_detect,
+ .init.init_platform = x86_init_noop,
+ .init.x2apic_available = bhyve_x2apic_available,
+ .init.msi_ext_dest_id = bhyve_ext_dest_id,
+};
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index b74bf937cd9f..6a526ae1fe99 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -96,6 +96,9 @@ static void __init its_update_mitigation(void);
static void __init its_apply_mitigation(void);
static void __init tsa_select_mitigation(void);
static void __init tsa_apply_mitigation(void);
+static void __init vmscape_select_mitigation(void);
+static void __init vmscape_update_mitigation(void);
+static void __init vmscape_apply_mitigation(void);
/* The base value of the SPEC_CTRL MSR without task-specific bits set */
u64 x86_spec_ctrl_base;
@@ -105,6 +108,14 @@ EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
DEFINE_PER_CPU(u64, x86_spec_ctrl_current);
EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current);
+/*
+ * Set when the CPU has run a potentially malicious guest. An IBPB will
+ * be needed before running userspace. That IBPB will flush the branch
+ * predictor content.
+ */
+DEFINE_PER_CPU(bool, x86_ibpb_exit_to_user);
+EXPORT_PER_CPU_SYMBOL_GPL(x86_ibpb_exit_to_user);
+
u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB;
static u64 __ro_after_init x86_arch_cap_msr;
@@ -262,6 +273,7 @@ void __init cpu_select_mitigations(void)
its_select_mitigation();
bhi_select_mitigation();
tsa_select_mitigation();
+ vmscape_select_mitigation();
/*
* After mitigations are selected, some may need to update their
@@ -293,6 +305,7 @@ void __init cpu_select_mitigations(void)
bhi_update_mitigation();
/* srso_update_mitigation() depends on retbleed_update_mitigation(). */
srso_update_mitigation();
+ vmscape_update_mitigation();
spectre_v1_apply_mitigation();
spectre_v2_apply_mitigation();
@@ -310,6 +323,7 @@ void __init cpu_select_mitigations(void)
its_apply_mitigation();
bhi_apply_mitigation();
tsa_apply_mitigation();
+ vmscape_apply_mitigation();
}
/*
@@ -386,7 +400,6 @@ static bool __init should_mitigate_vuln(unsigned int bug)
case X86_BUG_SPECTRE_V2:
case X86_BUG_RETBLEED:
- case X86_BUG_SRSO:
case X86_BUG_L1TF:
case X86_BUG_ITS:
return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) ||
@@ -417,6 +430,13 @@ static bool __init should_mitigate_vuln(unsigned int bug)
cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) ||
cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST) ||
(smt_mitigations != SMT_MITIGATIONS_OFF);
+
+ case X86_BUG_SPEC_STORE_BYPASS:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER);
+
+ case X86_BUG_VMSCAPE:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST);
+
default:
WARN(1, "Unknown bug %x\n", bug);
return false;
@@ -667,8 +687,7 @@ static const char * const mmio_strings[] = {
static void __init mmio_select_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) ||
- cpu_mitigations_off()) {
+ if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) {
mmio_mitigation = MMIO_MITIGATION_OFF;
return;
}
@@ -1069,10 +1088,8 @@ static void __init gds_select_mitigation(void)
if (gds_mitigation == GDS_MITIGATION_AUTO) {
if (should_mitigate_vuln(X86_BUG_GDS))
gds_mitigation = GDS_MITIGATION_FULL;
- else {
+ else
gds_mitigation = GDS_MITIGATION_OFF;
- return;
- }
}
/* No microcode */
@@ -1445,8 +1462,10 @@ static void __init retbleed_update_mitigation(void)
retbleed_mitigation = RETBLEED_MITIGATION_EIBRS;
break;
default:
- if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF)
+ if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) {
pr_err(RETBLEED_INTEL_MSG);
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ }
}
}
@@ -1827,9 +1846,10 @@ enum spectre_v2_mitigation_cmd {
SPECTRE_V2_CMD_IBRS,
};
-static enum spectre_v2_mitigation_cmd spectre_v2_cmd __ro_after_init = SPECTRE_V2_CMD_AUTO;
+static enum spectre_v2_mitigation_cmd spectre_v2_cmd __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE;
-enum spectre_v2_user_cmd {
+enum spectre_v2_user_mitigation_cmd {
SPECTRE_V2_USER_CMD_NONE,
SPECTRE_V2_USER_CMD_AUTO,
SPECTRE_V2_USER_CMD_FORCE,
@@ -1839,6 +1859,9 @@ enum spectre_v2_user_cmd {
SPECTRE_V2_USER_CMD_SECCOMP_IBPB,
};
+static enum spectre_v2_user_mitigation_cmd spectre_v2_user_cmd __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_USER_CMD_AUTO : SPECTRE_V2_USER_CMD_NONE;
+
static const char * const spectre_v2_user_strings[] = {
[SPECTRE_V2_USER_NONE] = "User space: Vulnerable",
[SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection",
@@ -1847,50 +1870,31 @@ static const char * const spectre_v2_user_strings[] = {
[SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl",
};
-static const struct {
- const char *option;
- enum spectre_v2_user_cmd cmd;
- bool secure;
-} v2_user_options[] __initconst = {
- { "auto", SPECTRE_V2_USER_CMD_AUTO, false },
- { "off", SPECTRE_V2_USER_CMD_NONE, false },
- { "on", SPECTRE_V2_USER_CMD_FORCE, true },
- { "prctl", SPECTRE_V2_USER_CMD_PRCTL, false },
- { "prctl,ibpb", SPECTRE_V2_USER_CMD_PRCTL_IBPB, false },
- { "seccomp", SPECTRE_V2_USER_CMD_SECCOMP, false },
- { "seccomp,ibpb", SPECTRE_V2_USER_CMD_SECCOMP_IBPB, false },
-};
-
-static void __init spec_v2_user_print_cond(const char *reason, bool secure)
-{
- if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure)
- pr_info("spectre_v2_user=%s forced on command line.\n", reason);
-}
-
-static enum spectre_v2_user_cmd __init spectre_v2_parse_user_cmdline(void)
+static int __init spectre_v2_user_parse_cmdline(char *str)
{
- char arg[20];
- int ret, i;
-
- if (!IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2))
- return SPECTRE_V2_USER_CMD_NONE;
-
- ret = cmdline_find_option(boot_command_line, "spectre_v2_user",
- arg, sizeof(arg));
- if (ret < 0)
- return SPECTRE_V2_USER_CMD_AUTO;
+ if (!str)
+ return -EINVAL;
- for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) {
- if (match_option(arg, ret, v2_user_options[i].option)) {
- spec_v2_user_print_cond(v2_user_options[i].option,
- v2_user_options[i].secure);
- return v2_user_options[i].cmd;
- }
- }
+ if (!strcmp(str, "auto"))
+ spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_AUTO;
+ else if (!strcmp(str, "off"))
+ spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_NONE;
+ else if (!strcmp(str, "on"))
+ spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_FORCE;
+ else if (!strcmp(str, "prctl"))
+ spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_PRCTL;
+ else if (!strcmp(str, "prctl,ibpb"))
+ spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_PRCTL_IBPB;
+ else if (!strcmp(str, "seccomp"))
+ spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_SECCOMP;
+ else if (!strcmp(str, "seccomp,ibpb"))
+ spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_SECCOMP_IBPB;
+ else
+ pr_err("Ignoring unknown spectre_v2_user option (%s).", str);
- pr_err("Unknown user space protection option (%s). Switching to default\n", arg);
- return SPECTRE_V2_USER_CMD_AUTO;
+ return 0;
}
+early_param("spectre_v2_user", spectre_v2_user_parse_cmdline);
static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode)
{
@@ -1902,7 +1906,7 @@ static void __init spectre_v2_user_select_mitigation(void)
if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP))
return;
- switch (spectre_v2_parse_user_cmdline()) {
+ switch (spectre_v2_user_cmd) {
case SPECTRE_V2_USER_CMD_NONE:
return;
case SPECTRE_V2_USER_CMD_FORCE:
@@ -2030,119 +2034,61 @@ static void __init spectre_v2_user_apply_mitigation(void)
static const char * const spectre_v2_strings[] = {
[SPECTRE_V2_NONE] = "Vulnerable",
[SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines",
- [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE",
+ [SPECTRE_V2_LFENCE] = "Vulnerable: LFENCE",
[SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS",
[SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE",
[SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines",
[SPECTRE_V2_IBRS] = "Mitigation: IBRS",
};
-static const struct {
- const char *option;
- enum spectre_v2_mitigation_cmd cmd;
- bool secure;
-} mitigation_options[] __initconst = {
- { "off", SPECTRE_V2_CMD_NONE, false },
- { "on", SPECTRE_V2_CMD_FORCE, true },
- { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false },
- { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false },
- { "retpoline,lfence", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false },
- { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false },
- { "eibrs", SPECTRE_V2_CMD_EIBRS, false },
- { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false },
- { "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false },
- { "auto", SPECTRE_V2_CMD_AUTO, false },
- { "ibrs", SPECTRE_V2_CMD_IBRS, false },
-};
+static bool nospectre_v2 __ro_after_init;
-static void __init spec_v2_print_cond(const char *reason, bool secure)
+static int __init nospectre_v2_parse_cmdline(char *str)
{
- if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure)
- pr_info("%s selected on command line.\n", reason);
+ nospectre_v2 = true;
+ spectre_v2_cmd = SPECTRE_V2_CMD_NONE;
+ return 0;
}
+early_param("nospectre_v2", nospectre_v2_parse_cmdline);
-static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
+static int __init spectre_v2_parse_cmdline(char *str)
{
- enum spectre_v2_mitigation_cmd cmd;
- char arg[20];
- int ret, i;
-
- cmd = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE;
- if (cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
- return SPECTRE_V2_CMD_NONE;
-
- ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg));
- if (ret < 0)
- return cmd;
-
- for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) {
- if (!match_option(arg, ret, mitigation_options[i].option))
- continue;
- cmd = mitigation_options[i].cmd;
- break;
- }
-
- if (i >= ARRAY_SIZE(mitigation_options)) {
- pr_err("unknown option (%s). Switching to default mode\n", arg);
- return cmd;
- }
-
- if ((cmd == SPECTRE_V2_CMD_RETPOLINE ||
- cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE ||
- cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC ||
- cmd == SPECTRE_V2_CMD_EIBRS_LFENCE ||
- cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) &&
- !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) {
- pr_err("%s selected but not compiled in. Switching to AUTO select\n",
- mitigation_options[i].option);
- return SPECTRE_V2_CMD_AUTO;
- }
-
- if ((cmd == SPECTRE_V2_CMD_EIBRS ||
- cmd == SPECTRE_V2_CMD_EIBRS_LFENCE ||
- cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) &&
- !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
- pr_err("%s selected but CPU doesn't have Enhanced or Automatic IBRS. Switching to AUTO select\n",
- mitigation_options[i].option);
- return SPECTRE_V2_CMD_AUTO;
- }
-
- if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE ||
- cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) &&
- !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) {
- pr_err("%s selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n",
- mitigation_options[i].option);
- return SPECTRE_V2_CMD_AUTO;
- }
-
- if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY)) {
- pr_err("%s selected but not compiled in. Switching to AUTO select\n",
- mitigation_options[i].option);
- return SPECTRE_V2_CMD_AUTO;
- }
-
- if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
- pr_err("%s selected but not Intel CPU. Switching to AUTO select\n",
- mitigation_options[i].option);
- return SPECTRE_V2_CMD_AUTO;
- }
+ if (!str)
+ return -EINVAL;
- if (cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) {
- pr_err("%s selected but CPU doesn't have IBRS. Switching to AUTO select\n",
- mitigation_options[i].option);
- return SPECTRE_V2_CMD_AUTO;
- }
+ if (nospectre_v2)
+ return 0;
- if (cmd == SPECTRE_V2_CMD_IBRS && cpu_feature_enabled(X86_FEATURE_XENPV)) {
- pr_err("%s selected but running as XenPV guest. Switching to AUTO select\n",
- mitigation_options[i].option);
- return SPECTRE_V2_CMD_AUTO;
+ if (!strcmp(str, "off")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_NONE;
+ } else if (!strcmp(str, "on")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_FORCE;
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V2_USER);
+ } else if (!strcmp(str, "retpoline")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE;
+ } else if (!strcmp(str, "retpoline,amd") ||
+ !strcmp(str, "retpoline,lfence")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE_LFENCE;
+ } else if (!strcmp(str, "retpoline,generic")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE_GENERIC;
+ } else if (!strcmp(str, "eibrs")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS;
+ } else if (!strcmp(str, "eibrs,lfence")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS_LFENCE;
+ } else if (!strcmp(str, "eibrs,retpoline")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS_RETPOLINE;
+ } else if (!strcmp(str, "auto")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ } else if (!strcmp(str, "ibrs")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_IBRS;
+ } else {
+ pr_err("Ignoring unknown spectre_v2 option (%s).", str);
}
- spec_v2_print_cond(mitigation_options[i].option,
- mitigation_options[i].secure);
- return cmd;
+ return 0;
}
+early_param("spectre_v2", spectre_v2_parse_cmdline);
static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void)
{
@@ -2291,10 +2237,6 @@ static void __init bhi_update_mitigation(void)
{
if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE)
bhi_mitigation = BHI_MITIGATION_OFF;
-
- if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) &&
- spectre_v2_cmd == SPECTRE_V2_CMD_AUTO)
- bhi_mitigation = BHI_MITIGATION_OFF;
}
static void __init bhi_apply_mitigation(void)
@@ -2330,11 +2272,55 @@ static void __init bhi_apply_mitigation(void)
static void __init spectre_v2_select_mitigation(void)
{
- spectre_v2_cmd = spectre_v2_parse_cmdline();
+ if ((spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE ||
+ spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE ||
+ spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC ||
+ spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE ||
+ spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) &&
+ !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) {
+ pr_err("RETPOLINE selected but not compiled in. Switching to AUTO select\n");
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ }
+
+ if ((spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS ||
+ spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE ||
+ spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) &&
+ !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
+ pr_err("EIBRS selected but CPU doesn't have Enhanced or Automatic IBRS. Switching to AUTO select\n");
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ }
+
+ if ((spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE ||
+ spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) &&
+ !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) {
+ pr_err("LFENCE selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n");
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ }
+
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY)) {
+ pr_err("IBRS selected but not compiled in. Switching to AUTO select\n");
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ }
- if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) &&
- (spectre_v2_cmd == SPECTRE_V2_CMD_NONE || spectre_v2_cmd == SPECTRE_V2_CMD_AUTO))
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
+ pr_err("IBRS selected but not Intel CPU. Switching to AUTO select\n");
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ }
+
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) {
+ pr_err("IBRS selected but CPU doesn't have IBRS. Switching to AUTO select\n");
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ }
+
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && cpu_feature_enabled(X86_FEATURE_XENPV)) {
+ pr_err("IBRS selected but running as XenPV guest. Switching to AUTO select\n");
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ }
+
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_NONE;
return;
+ }
switch (spectre_v2_cmd) {
case SPECTRE_V2_CMD_NONE:
@@ -2537,101 +2523,11 @@ static void update_mds_branch_idle(void)
}
}
-#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n"
-#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n"
-#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n"
-
-void cpu_bugs_smt_update(void)
-{
- mutex_lock(&spec_ctrl_mutex);
-
- if (sched_smt_active() && unprivileged_ebpf_enabled() &&
- spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE)
- pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG);
-
- switch (spectre_v2_user_stibp) {
- case SPECTRE_V2_USER_NONE:
- break;
- case SPECTRE_V2_USER_STRICT:
- case SPECTRE_V2_USER_STRICT_PREFERRED:
- update_stibp_strict();
- break;
- case SPECTRE_V2_USER_PRCTL:
- case SPECTRE_V2_USER_SECCOMP:
- update_indir_branch_cond();
- break;
- }
-
- switch (mds_mitigation) {
- case MDS_MITIGATION_FULL:
- case MDS_MITIGATION_AUTO:
- case MDS_MITIGATION_VMWERV:
- if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY))
- pr_warn_once(MDS_MSG_SMT);
- update_mds_branch_idle();
- break;
- case MDS_MITIGATION_OFF:
- break;
- }
-
- switch (taa_mitigation) {
- case TAA_MITIGATION_VERW:
- case TAA_MITIGATION_AUTO:
- case TAA_MITIGATION_UCODE_NEEDED:
- if (sched_smt_active())
- pr_warn_once(TAA_MSG_SMT);
- break;
- case TAA_MITIGATION_TSX_DISABLED:
- case TAA_MITIGATION_OFF:
- break;
- }
-
- switch (mmio_mitigation) {
- case MMIO_MITIGATION_VERW:
- case MMIO_MITIGATION_AUTO:
- case MMIO_MITIGATION_UCODE_NEEDED:
- if (sched_smt_active())
- pr_warn_once(MMIO_MSG_SMT);
- break;
- case MMIO_MITIGATION_OFF:
- break;
- }
-
- switch (tsa_mitigation) {
- case TSA_MITIGATION_USER_KERNEL:
- case TSA_MITIGATION_VM:
- case TSA_MITIGATION_AUTO:
- case TSA_MITIGATION_FULL:
- /*
- * TSA-SQ can potentially lead to info leakage between
- * SMT threads.
- */
- if (sched_smt_active())
- static_branch_enable(&cpu_buf_idle_clear);
- else
- static_branch_disable(&cpu_buf_idle_clear);
- break;
- case TSA_MITIGATION_NONE:
- case TSA_MITIGATION_UCODE_NEEDED:
- break;
- }
-
- mutex_unlock(&spec_ctrl_mutex);
-}
-
#undef pr_fmt
#define pr_fmt(fmt) "Speculative Store Bypass: " fmt
-static enum ssb_mitigation ssb_mode __ro_after_init = SPEC_STORE_BYPASS_NONE;
-
-/* The kernel command line selection */
-enum ssb_mitigation_cmd {
- SPEC_STORE_BYPASS_CMD_NONE,
- SPEC_STORE_BYPASS_CMD_AUTO,
- SPEC_STORE_BYPASS_CMD_ON,
- SPEC_STORE_BYPASS_CMD_PRCTL,
- SPEC_STORE_BYPASS_CMD_SECCOMP,
-};
+static enum ssb_mitigation ssb_mode __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SSB) ? SPEC_STORE_BYPASS_AUTO : SPEC_STORE_BYPASS_NONE;
static const char * const ssb_strings[] = {
[SPEC_STORE_BYPASS_NONE] = "Vulnerable",
@@ -2640,89 +2536,61 @@ static const char * const ssb_strings[] = {
[SPEC_STORE_BYPASS_SECCOMP] = "Mitigation: Speculative Store Bypass disabled via prctl and seccomp",
};
-static const struct {
- const char *option;
- enum ssb_mitigation_cmd cmd;
-} ssb_mitigation_options[] __initconst = {
- { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */
- { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */
- { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */
- { "prctl", SPEC_STORE_BYPASS_CMD_PRCTL }, /* Disable Speculative Store Bypass via prctl */
- { "seccomp", SPEC_STORE_BYPASS_CMD_SECCOMP }, /* Disable Speculative Store Bypass via prctl and seccomp */
-};
+static bool nossb __ro_after_init;
-static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
+static int __init nossb_parse_cmdline(char *str)
{
- enum ssb_mitigation_cmd cmd;
- char arg[20];
- int ret, i;
-
- cmd = IS_ENABLED(CONFIG_MITIGATION_SSB) ?
- SPEC_STORE_BYPASS_CMD_AUTO : SPEC_STORE_BYPASS_CMD_NONE;
- if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") ||
- cpu_mitigations_off()) {
- return SPEC_STORE_BYPASS_CMD_NONE;
- } else {
- ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable",
- arg, sizeof(arg));
- if (ret < 0)
- return cmd;
+ nossb = true;
+ ssb_mode = SPEC_STORE_BYPASS_NONE;
+ return 0;
+}
+early_param("nospec_store_bypass_disable", nossb_parse_cmdline);
- for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) {
- if (!match_option(arg, ret, ssb_mitigation_options[i].option))
- continue;
+static int __init ssb_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
- cmd = ssb_mitigation_options[i].cmd;
- break;
- }
+ if (nossb)
+ return 0;
- if (i >= ARRAY_SIZE(ssb_mitigation_options)) {
- pr_err("unknown option (%s). Switching to default mode\n", arg);
- return cmd;
- }
- }
+ if (!strcmp(str, "auto"))
+ ssb_mode = SPEC_STORE_BYPASS_AUTO;
+ else if (!strcmp(str, "on"))
+ ssb_mode = SPEC_STORE_BYPASS_DISABLE;
+ else if (!strcmp(str, "off"))
+ ssb_mode = SPEC_STORE_BYPASS_NONE;
+ else if (!strcmp(str, "prctl"))
+ ssb_mode = SPEC_STORE_BYPASS_PRCTL;
+ else if (!strcmp(str, "seccomp"))
+ ssb_mode = IS_ENABLED(CONFIG_SECCOMP) ?
+ SPEC_STORE_BYPASS_SECCOMP : SPEC_STORE_BYPASS_PRCTL;
+ else
+ pr_err("Ignoring unknown spec_store_bypass_disable option (%s).\n",
+ str);
- return cmd;
+ return 0;
}
+early_param("spec_store_bypass_disable", ssb_parse_cmdline);
static void __init ssb_select_mitigation(void)
{
- enum ssb_mitigation_cmd cmd;
-
- if (!boot_cpu_has(X86_FEATURE_SSBD))
- goto out;
-
- cmd = ssb_parse_cmdline();
- if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) &&
- (cmd == SPEC_STORE_BYPASS_CMD_NONE ||
- cmd == SPEC_STORE_BYPASS_CMD_AUTO))
+ if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) {
+ ssb_mode = SPEC_STORE_BYPASS_NONE;
return;
+ }
- switch (cmd) {
- case SPEC_STORE_BYPASS_CMD_SECCOMP:
- /*
- * Choose prctl+seccomp as the default mode if seccomp is
- * enabled.
- */
- if (IS_ENABLED(CONFIG_SECCOMP))
- ssb_mode = SPEC_STORE_BYPASS_SECCOMP;
- else
+ if (ssb_mode == SPEC_STORE_BYPASS_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_SPEC_STORE_BYPASS))
ssb_mode = SPEC_STORE_BYPASS_PRCTL;
- break;
- case SPEC_STORE_BYPASS_CMD_ON:
- ssb_mode = SPEC_STORE_BYPASS_DISABLE;
- break;
- case SPEC_STORE_BYPASS_CMD_AUTO:
- case SPEC_STORE_BYPASS_CMD_PRCTL:
- ssb_mode = SPEC_STORE_BYPASS_PRCTL;
- break;
- case SPEC_STORE_BYPASS_CMD_NONE:
- break;
+ else
+ ssb_mode = SPEC_STORE_BYPASS_NONE;
}
-out:
- if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
- pr_info("%s\n", ssb_strings[ssb_mode]);
+ if (!boot_cpu_has(X86_FEATURE_SSBD))
+ ssb_mode = SPEC_STORE_BYPASS_NONE;
+
+ pr_info("%s\n", ssb_strings[ssb_mode]);
}
static void __init ssb_apply_mitigation(void)
@@ -2938,6 +2806,7 @@ static int ssb_prctl_get(struct task_struct *task)
return PR_SPEC_DISABLE;
case SPEC_STORE_BYPASS_SECCOMP:
case SPEC_STORE_BYPASS_PRCTL:
+ case SPEC_STORE_BYPASS_AUTO:
if (task_spec_ssb_force_disable(task))
return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
if (task_spec_ssb_noexec(task))
@@ -3184,8 +3053,18 @@ static void __init srso_select_mitigation(void)
}
if (srso_mitigation == SRSO_MITIGATION_AUTO) {
- if (should_mitigate_vuln(X86_BUG_SRSO)) {
+ /*
+ * Use safe-RET if user->kernel or guest->host protection is
+ * required. Otherwise the 'microcode' mitigation is sufficient
+ * to protect the user->user and guest->guest vectors.
+ */
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) ||
+ (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) &&
+ !boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO))) {
srso_mitigation = SRSO_MITIGATION_SAFE_RET;
+ } else if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST)) {
+ srso_mitigation = SRSO_MITIGATION_MICROCODE;
} else {
srso_mitigation = SRSO_MITIGATION_NONE;
return;
@@ -3247,14 +3126,15 @@ ibpb_on_vmexit:
static void __init srso_update_mitigation(void)
{
+ if (!boot_cpu_has_bug(X86_BUG_SRSO))
+ return;
+
/* If retbleed is using IBPB, that works for SRSO as well */
if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB &&
boot_cpu_has(X86_FEATURE_IBPB_BRTYPE))
srso_mitigation = SRSO_MITIGATION_IBPB;
- if (boot_cpu_has_bug(X86_BUG_SRSO) &&
- !cpu_mitigations_off())
- pr_info("%s\n", srso_strings[srso_mitigation]);
+ pr_info("%s\n", srso_strings[srso_mitigation]);
}
static void __init srso_apply_mitigation(void)
@@ -3315,8 +3195,187 @@ static void __init srso_apply_mitigation(void)
}
#undef pr_fmt
+#define pr_fmt(fmt) "VMSCAPE: " fmt
+
+enum vmscape_mitigations {
+ VMSCAPE_MITIGATION_NONE,
+ VMSCAPE_MITIGATION_AUTO,
+ VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER,
+ VMSCAPE_MITIGATION_IBPB_ON_VMEXIT,
+};
+
+static const char * const vmscape_strings[] = {
+ [VMSCAPE_MITIGATION_NONE] = "Vulnerable",
+ /* [VMSCAPE_MITIGATION_AUTO] */
+ [VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER] = "Mitigation: IBPB before exit to userspace",
+ [VMSCAPE_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT",
+};
+
+static enum vmscape_mitigations vmscape_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_VMSCAPE) ? VMSCAPE_MITIGATION_AUTO : VMSCAPE_MITIGATION_NONE;
+
+static int __init vmscape_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off")) {
+ vmscape_mitigation = VMSCAPE_MITIGATION_NONE;
+ } else if (!strcmp(str, "ibpb")) {
+ vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER;
+ } else if (!strcmp(str, "force")) {
+ setup_force_cpu_bug(X86_BUG_VMSCAPE);
+ vmscape_mitigation = VMSCAPE_MITIGATION_AUTO;
+ } else {
+ pr_err("Ignoring unknown vmscape=%s option.\n", str);
+ }
+
+ return 0;
+}
+early_param("vmscape", vmscape_parse_cmdline);
+
+static void __init vmscape_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_VMSCAPE) ||
+ !boot_cpu_has(X86_FEATURE_IBPB)) {
+ vmscape_mitigation = VMSCAPE_MITIGATION_NONE;
+ return;
+ }
+
+ if (vmscape_mitigation == VMSCAPE_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_VMSCAPE))
+ vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER;
+ else
+ vmscape_mitigation = VMSCAPE_MITIGATION_NONE;
+ }
+}
+
+static void __init vmscape_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_VMSCAPE))
+ return;
+
+ if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB ||
+ srso_mitigation == SRSO_MITIGATION_IBPB_ON_VMEXIT)
+ vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_ON_VMEXIT;
+
+ pr_info("%s\n", vmscape_strings[vmscape_mitigation]);
+}
+
+static void __init vmscape_apply_mitigation(void)
+{
+ if (vmscape_mitigation == VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER)
+ setup_force_cpu_cap(X86_FEATURE_IBPB_EXIT_TO_USER);
+}
+
+#undef pr_fmt
#define pr_fmt(fmt) fmt
+#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n"
+#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n"
+#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n"
+#define VMSCAPE_MSG_SMT "VMSCAPE: SMT on, STIBP is required for full protection. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/vmscape.html for more details.\n"
+
+void cpu_bugs_smt_update(void)
+{
+ mutex_lock(&spec_ctrl_mutex);
+
+ if (sched_smt_active() && unprivileged_ebpf_enabled() &&
+ spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE)
+ pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG);
+
+ switch (spectre_v2_user_stibp) {
+ case SPECTRE_V2_USER_NONE:
+ break;
+ case SPECTRE_V2_USER_STRICT:
+ case SPECTRE_V2_USER_STRICT_PREFERRED:
+ update_stibp_strict();
+ break;
+ case SPECTRE_V2_USER_PRCTL:
+ case SPECTRE_V2_USER_SECCOMP:
+ update_indir_branch_cond();
+ break;
+ }
+
+ switch (mds_mitigation) {
+ case MDS_MITIGATION_FULL:
+ case MDS_MITIGATION_AUTO:
+ case MDS_MITIGATION_VMWERV:
+ if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY))
+ pr_warn_once(MDS_MSG_SMT);
+ update_mds_branch_idle();
+ break;
+ case MDS_MITIGATION_OFF:
+ break;
+ }
+
+ switch (taa_mitigation) {
+ case TAA_MITIGATION_VERW:
+ case TAA_MITIGATION_AUTO:
+ case TAA_MITIGATION_UCODE_NEEDED:
+ if (sched_smt_active())
+ pr_warn_once(TAA_MSG_SMT);
+ break;
+ case TAA_MITIGATION_TSX_DISABLED:
+ case TAA_MITIGATION_OFF:
+ break;
+ }
+
+ switch (mmio_mitigation) {
+ case MMIO_MITIGATION_VERW:
+ case MMIO_MITIGATION_AUTO:
+ case MMIO_MITIGATION_UCODE_NEEDED:
+ if (sched_smt_active())
+ pr_warn_once(MMIO_MSG_SMT);
+ break;
+ case MMIO_MITIGATION_OFF:
+ break;
+ }
+
+ switch (tsa_mitigation) {
+ case TSA_MITIGATION_USER_KERNEL:
+ case TSA_MITIGATION_VM:
+ case TSA_MITIGATION_AUTO:
+ case TSA_MITIGATION_FULL:
+ /*
+ * TSA-SQ can potentially lead to info leakage between
+ * SMT threads.
+ */
+ if (sched_smt_active())
+ static_branch_enable(&cpu_buf_idle_clear);
+ else
+ static_branch_disable(&cpu_buf_idle_clear);
+ break;
+ case TSA_MITIGATION_NONE:
+ case TSA_MITIGATION_UCODE_NEEDED:
+ break;
+ }
+
+ switch (vmscape_mitigation) {
+ case VMSCAPE_MITIGATION_NONE:
+ case VMSCAPE_MITIGATION_AUTO:
+ break;
+ case VMSCAPE_MITIGATION_IBPB_ON_VMEXIT:
+ case VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER:
+ /*
+ * Hypervisors can be attacked across threads; warn for SMT when
+ * STIBP is not already enabled system-wide.
+ *
+ * Intel eIBRS (!AUTOIBRS) implies STIBP on.
+ */
+ if (!sched_smt_active() ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ||
+ (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
+ !boot_cpu_has(X86_FEATURE_AUTOIBRS)))
+ break;
+ pr_warn_once(VMSCAPE_MSG_SMT);
+ break;
+ }
+
+ mutex_unlock(&spec_ctrl_mutex);
+}
+
#ifdef CONFIG_SYSFS
#define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion"
@@ -3502,9 +3561,6 @@ static const char *spectre_bhi_state(void)
static ssize_t spectre_v2_show_state(char *buf)
{
- if (spectre_v2_enabled == SPECTRE_V2_LFENCE)
- return sysfs_emit(buf, "Vulnerable: LFENCE\n");
-
if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n");
@@ -3562,6 +3618,11 @@ static ssize_t tsa_show_state(char *buf)
return sysfs_emit(buf, "%s\n", tsa_strings[tsa_mitigation]);
}
+static ssize_t vmscape_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", vmscape_strings[vmscape_mitigation]);
+}
+
static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
char *buf, unsigned int bug)
{
@@ -3628,6 +3689,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
case X86_BUG_TSA:
return tsa_show_state(buf);
+ case X86_BUG_VMSCAPE:
+ return vmscape_show_state(buf);
+
default:
break;
}
@@ -3719,6 +3783,11 @@ ssize_t cpu_show_tsa(struct device *dev, struct device_attribute *attr, char *bu
{
return cpu_show_common(dev, attr, buf, X86_BUG_TSA);
}
+
+ssize_t cpu_show_vmscape(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_VMSCAPE);
+}
#endif
void __warn_thunk(void)
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index adfa7e8bb865..51a95b07831f 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -290,6 +290,22 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c)
}
/*
+ * The max shared threads number comes from CPUID(0x4) EAX[25-14] with input
+ * ECX as cache index. Then right shift apicid by the number's order to get
+ * cache id for this cache node.
+ */
+static unsigned int get_cache_id(u32 apicid, const struct _cpuid4_info *id4)
+{
+ unsigned long num_threads_sharing;
+ int index_msb;
+
+ num_threads_sharing = 1 + id4->eax.split.num_threads_sharing;
+ index_msb = get_count_order(num_threads_sharing);
+
+ return apicid >> index_msb;
+}
+
+/*
* AMD/Hygon CPUs may have multiple LLCs if L3 caches exist.
*/
@@ -312,18 +328,11 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id)
* Newer families: LLC ID is calculated from the number
* of threads sharing the L3 cache.
*/
- u32 eax, ebx, ecx, edx, num_sharing_cache = 0;
u32 llc_index = find_num_cache_leaves(c) - 1;
+ struct _cpuid4_info id4 = {};
- cpuid_count(0x8000001d, llc_index, &eax, &ebx, &ecx, &edx);
- if (eax)
- num_sharing_cache = ((eax >> 14) & 0xfff) + 1;
-
- if (num_sharing_cache) {
- int index_msb = get_count_order(num_sharing_cache);
-
- c->topo.llc_id = c->topo.apicid >> index_msb;
- }
+ if (!amd_fill_cpuid4_info(llc_index, &id4))
+ c->topo.llc_id = get_cache_id(c->topo.apicid, &id4);
}
}
@@ -598,27 +607,12 @@ int init_cache_level(unsigned int cpu)
return 0;
}
-/*
- * The max shared threads number comes from CPUID(0x4) EAX[25-14] with input
- * ECX as cache index. Then right shift apicid by the number's order to get
- * cache id for this cache node.
- */
-static void get_cache_id(int cpu, struct _cpuid4_info *id4)
-{
- struct cpuinfo_x86 *c = &cpu_data(cpu);
- unsigned long num_threads_sharing;
- int index_msb;
-
- num_threads_sharing = 1 + id4->eax.split.num_threads_sharing;
- index_msb = get_count_order(num_threads_sharing);
- id4->id = c->topo.apicid >> index_msb;
-}
-
int populate_cache_leaves(unsigned int cpu)
{
struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
struct cacheinfo *ci = this_cpu_ci->info_list;
u8 cpu_vendor = boot_cpu_data.x86_vendor;
+ u32 apicid = cpu_data(cpu).topo.apicid;
struct amd_northbridge *nb = NULL;
struct _cpuid4_info id4 = {};
int idx, ret;
@@ -628,7 +622,7 @@ int populate_cache_leaves(unsigned int cpu)
if (ret)
return ret;
- get_cache_id(cpu, &id4);
+ id4.id = get_cache_id(apicid, &id4);
if (cpu_vendor == X86_VENDOR_AMD || cpu_vendor == X86_VENDOR_HYGON)
nb = amd_init_l3_cache(idx);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 34a054181c4d..c7d3512914ca 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1236,55 +1236,71 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
#define ITS_NATIVE_ONLY BIT(9)
/* CPU is affected by Transient Scheduler Attacks */
#define TSA BIT(10)
+/* CPU is affected by VMSCAPE */
+#define VMSCAPE BIT(11)
static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
- VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS),
- VULNBL_INTEL_STEPS(INTEL_HASWELL, X86_STEP_MAX, SRBDS),
- VULNBL_INTEL_STEPS(INTEL_HASWELL_L, X86_STEP_MAX, SRBDS),
- VULNBL_INTEL_STEPS(INTEL_HASWELL_G, X86_STEP_MAX, SRBDS),
- VULNBL_INTEL_STEPS(INTEL_HASWELL_X, X86_STEP_MAX, MMIO),
- VULNBL_INTEL_STEPS(INTEL_BROADWELL_D, X86_STEP_MAX, MMIO),
- VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS),
- VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO),
- VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS),
- VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, 0x5, MMIO | RETBLEED | GDS),
- VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS),
- VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, 0xb, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS),
- VULNBL_INTEL_STEPS(INTEL_KABYLAKE, 0xc, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS),
- VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED),
+ VULNBL_INTEL_STEPS(INTEL_SANDYBRIDGE_X, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_SANDYBRIDGE, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE_X, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_HASWELL, X86_STEP_MAX, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_HASWELL_L, X86_STEP_MAX, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_HASWELL_G, X86_STEP_MAX, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_HASWELL_X, X86_STEP_MAX, MMIO | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_BROADWELL_D, X86_STEP_MAX, MMIO | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, 0x5, MMIO | RETBLEED | GDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, 0xb, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_KABYLAKE, 0xc, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED | VMSCAPE),
VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY),
VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY),
VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY),
- VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS),
- VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED | ITS),
- VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS),
+ VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | VMSCAPE),
VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY),
VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY),
VULNBL_INTEL_STEPS(INTEL_LAKEFIELD, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED),
VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY),
- VULNBL_INTEL_TYPE(INTEL_ALDERLAKE, ATOM, RFDS),
- VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS),
- VULNBL_INTEL_TYPE(INTEL_RAPTORLAKE, ATOM, RFDS),
- VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_P, X86_STEP_MAX, RFDS),
- VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_S, X86_STEP_MAX, RFDS),
- VULNBL_INTEL_STEPS(INTEL_ATOM_GRACEMONT, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_TYPE(INTEL_ALDERLAKE, ATOM, RFDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ALDERLAKE, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS | VMSCAPE),
+ VULNBL_INTEL_TYPE(INTEL_RAPTORLAKE, ATOM, RFDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_P, X86_STEP_MAX, RFDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_S, X86_STEP_MAX, RFDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_METEORLAKE_L, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ARROWLAKE_H, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ARROWLAKE, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ARROWLAKE_U, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_LUNARLAKE_M, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_SAPPHIRERAPIDS_X, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_GRANITERAPIDS_X, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_EMERALDRAPIDS_X, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_GRACEMONT, X86_STEP_MAX, RFDS | VMSCAPE),
VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_D, X86_STEP_MAX, MMIO | RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_D, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_CRESTMONT_X, X86_STEP_MAX, VMSCAPE),
VULNBL_AMD(0x15, RETBLEED),
VULNBL_AMD(0x16, RETBLEED),
- VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO),
- VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO),
- VULNBL_AMD(0x19, SRSO | TSA),
- VULNBL_AMD(0x1a, SRSO),
+ VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO | VMSCAPE),
+ VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO | VMSCAPE),
+ VULNBL_AMD(0x19, SRSO | TSA | VMSCAPE),
+ VULNBL_AMD(0x1a, SRSO | VMSCAPE),
{}
};
@@ -1543,6 +1559,14 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
}
}
+ /*
+ * Set the bug only on bare metal. A nested hypervisor should already be
+ * deploying IBPB to isolate itself from nested guests.
+ */
+ if (cpu_matches(cpu_vuln_blacklist, VMSCAPE) &&
+ !boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ setup_force_cpu_bug(X86_BUG_VMSCAPE);
+
if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
return;
@@ -1784,6 +1808,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
setup_clear_cpu_cap(X86_FEATURE_LA57);
detect_nopl();
+ mca_bsp_init(c);
}
void __init init_cpu_devs(void)
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index 2154f12766fb..1fda6c3a2b65 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -16,6 +16,7 @@
#include <asm/spec-ctrl.h>
#include <asm/delay.h>
#include <asm/msr.h>
+#include <asm/resctrl.h>
#include "cpu.h"
@@ -117,6 +118,8 @@ static void bsp_init_hygon(struct cpuinfo_x86 *c)
x86_amd_ls_cfg_ssbd_mask = 1ULL << 10;
}
}
+
+ resctrl_cpu_detect(c);
}
static void early_init_hygon(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 553bfbfc3a1b..f3e9219845e8 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -45,6 +45,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
#ifdef CONFIG_ACRN_GUEST
&x86_hyper_acrn,
#endif
+#ifdef CONFIG_BHYVE_GUEST
+ &x86_hyper_bhyve,
+#endif
};
enum x86_hypervisor_type x86_hyper_type;
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 076eaa41b8c8..98ae4c37c93e 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -262,7 +262,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
- } else if ((c->x86_vfm >= INTEL_P4_PRESCOTT && c->x86_vfm <= INTEL_P4_WILLAMETTE) ||
+ } else if ((c->x86_vfm >= INTEL_P4_PRESCOTT && c->x86_vfm <= INTEL_P4_CEDARMILL) ||
(c->x86_vfm >= INTEL_CORE_YONAH && c->x86_vfm <= INTEL_IVYBRIDGE)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
}
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 5c4eb28c3ac9..d6906442f49b 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -241,7 +241,8 @@ struct threshold_block {
struct threshold_bank {
struct kobject *kobj;
- struct threshold_block *blocks;
+ /* List of threshold blocks within this MCA bank. */
+ struct list_head miscj;
};
static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
@@ -252,9 +253,6 @@ static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
*/
static DEFINE_PER_CPU(u64, bank_map);
-/* Map of banks that have more than MCA_MISC0 available. */
-static DEFINE_PER_CPU(u64, smca_misc_banks_map);
-
static void amd_threshold_interrupt(void);
static void amd_deferred_error_interrupt(void);
@@ -264,28 +262,6 @@ static void default_deferred_error_interrupt(void)
}
void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
-static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu)
-{
- u32 low, high;
-
- /*
- * For SMCA enabled processors, BLKPTR field of the first MISC register
- * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4).
- */
- if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
- return;
-
- if (!(low & MCI_CONFIG_MCAX))
- return;
-
- if (rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high))
- return;
-
- if (low & MASK_BLKPTR_LO)
- per_cpu(smca_misc_banks_map, cpu) |= BIT_ULL(bank);
-
-}
-
static void smca_configure(unsigned int bank, unsigned int cpu)
{
u8 *bank_counts = this_cpu_ptr(smca_bank_counts);
@@ -326,8 +302,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
wrmsr(smca_config, low, high);
}
- smca_set_misc_banks_map(bank, cpu);
-
if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
return;
@@ -419,8 +393,8 @@ static bool lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
return true;
};
-/* Reprogram MCx_MISC MSR behind this threshold bank. */
-static void threshold_restart_bank(void *_tr)
+/* Reprogram MCx_MISC MSR behind this threshold block. */
+static void threshold_restart_block(void *_tr)
{
struct thresh_restart *tr = _tr;
u32 hi, lo;
@@ -478,7 +452,7 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset)
};
b->threshold_limit = THRESHOLD_MAX;
- threshold_restart_bank(&tr);
+ threshold_restart_block(&tr);
};
static int setup_APIC_mce_threshold(int reserved, int new)
@@ -525,18 +499,6 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
wrmsr(MSR_CU_DEF_ERR, low, high);
}
-static u32 smca_get_block_address(unsigned int bank, unsigned int block,
- unsigned int cpu)
-{
- if (!block)
- return MSR_AMD64_SMCA_MCx_MISC(bank);
-
- if (!(per_cpu(smca_misc_banks_map, cpu) & BIT_ULL(bank)))
- return 0;
-
- return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
-}
-
static u32 get_block_address(u32 current_addr, u32 low, u32 high,
unsigned int bank, unsigned int block,
unsigned int cpu)
@@ -546,8 +508,15 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high,
if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS))
return addr;
- if (mce_flags.smca)
- return smca_get_block_address(bank, block, cpu);
+ if (mce_flags.smca) {
+ if (!block)
+ return MSR_AMD64_SMCA_MCx_MISC(bank);
+
+ if (!(low & MASK_BLKPTR_LO))
+ return 0;
+
+ return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
+ }
/* Fall back to method we used for older processors: */
switch (block) {
@@ -677,6 +646,28 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
wrmsrq(MSR_K7_HWCR, hwcr);
}
+static void amd_apply_cpu_quirks(struct cpuinfo_x86 *c)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+
+ /* This should be disabled by the BIOS, but isn't always */
+ if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
+ /*
+ * disable GART TBL walk error reporting, which
+ * trips off incorrectly with the IOMMU & 3ware
+ * & Cerberus:
+ */
+ clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
+ }
+
+ /*
+ * Various K7s with broken bank 0 around. Always disable
+ * by default.
+ */
+ if (c->x86 == 6 && this_cpu_read(mce_num_banks))
+ mce_banks[0].ctl = 0;
+}
+
/* cpu init entry point, called from mce.c with preempt off */
void mce_amd_feature_init(struct cpuinfo_x86 *c)
{
@@ -684,6 +675,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
u32 low = 0, high = 0, address = 0;
int offset = -1;
+ amd_apply_cpu_quirks(c);
+
+ mce_flags.amd_threshold = 1;
for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
if (mce_flags.smca)
@@ -714,6 +708,12 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
deferred_error_interrupt_enable(c);
}
+void smca_bsp_init(void)
+{
+ mce_threshold_vector = amd_threshold_interrupt;
+ deferred_error_int_vector = amd_deferred_error_interrupt;
+}
+
/*
* DRAM ECC errors are reported in the Northbridge (bank 4) with
* Extended Error Code 8.
@@ -921,7 +921,7 @@ static void log_and_reset_block(struct threshold_block *block)
/* Reset threshold block after logging error. */
memset(&tr, 0, sizeof(tr));
tr.b = block;
- threshold_restart_bank(&tr);
+ threshold_restart_block(&tr);
}
/*
@@ -930,9 +930,9 @@ static void log_and_reset_block(struct threshold_block *block)
*/
static void amd_threshold_interrupt(void)
{
- struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL;
- struct threshold_bank **bp = this_cpu_read(threshold_banks);
+ struct threshold_bank **bp = this_cpu_read(threshold_banks), *thr_bank;
unsigned int bank, cpu = smp_processor_id();
+ struct threshold_block *block, *tmp;
/*
* Validate that the threshold bank has been initialized already. The
@@ -946,20 +946,20 @@ static void amd_threshold_interrupt(void)
if (!(per_cpu(bank_map, cpu) & BIT_ULL(bank)))
continue;
- first_block = bp[bank]->blocks;
- if (!first_block)
+ thr_bank = bp[bank];
+ if (!thr_bank)
continue;
- /*
- * The first block is also the head of the list. Check it first
- * before iterating over the rest.
- */
- log_and_reset_block(first_block);
- list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj)
+ list_for_each_entry_safe(block, tmp, &thr_bank->miscj, miscj)
log_and_reset_block(block);
}
}
+void amd_clear_bank(struct mce *m)
+{
+ mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0);
+}
+
/*
* Sysfs Interface
*/
@@ -995,7 +995,7 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
memset(&tr, 0, sizeof(tr));
tr.b = b;
- if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1))
+ if (smp_call_function_single(b->cpu, threshold_restart_block, &tr, 1))
return -ENODEV;
return size;
@@ -1020,7 +1020,7 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
b->threshold_limit = new;
tr.b = b;
- if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1))
+ if (smp_call_function_single(b->cpu, threshold_restart_block, &tr, 1))
return -ENODEV;
return size;
@@ -1181,13 +1181,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb
default_attrs[2] = NULL;
}
- INIT_LIST_HEAD(&b->miscj);
-
- /* This is safe as @tb is not visible yet */
- if (tb->blocks)
- list_add(&b->miscj, &tb->blocks->miscj);
- else
- tb->blocks = b;
+ list_add(&b->miscj, &tb->miscj);
err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b));
if (err)
@@ -1238,6 +1232,8 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
goto out_free;
}
+ INIT_LIST_HEAD(&b->miscj);
+
err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC));
if (err)
goto out_kobj;
@@ -1258,26 +1254,15 @@ static void threshold_block_release(struct kobject *kobj)
kfree(to_block(kobj));
}
-static void deallocate_threshold_blocks(struct threshold_bank *bank)
+static void threshold_remove_bank(struct threshold_bank *bank)
{
struct threshold_block *pos, *tmp;
- list_for_each_entry_safe(pos, tmp, &bank->blocks->miscj, miscj) {
+ list_for_each_entry_safe(pos, tmp, &bank->miscj, miscj) {
list_del(&pos->miscj);
kobject_put(&pos->kobj);
}
- kobject_put(&bank->blocks->kobj);
-}
-
-static void threshold_remove_bank(struct threshold_bank *bank)
-{
- if (!bank->blocks)
- goto out_free;
-
- deallocate_threshold_blocks(bank);
-
-out_free:
kobject_put(bank->kobj);
kfree(bank);
}
@@ -1296,12 +1281,12 @@ static void __threshold_remove_device(struct threshold_bank **bp)
kfree(bp);
}
-int mce_threshold_remove_device(unsigned int cpu)
+void mce_threshold_remove_device(unsigned int cpu)
{
struct threshold_bank **bp = this_cpu_read(threshold_banks);
if (!bp)
- return 0;
+ return;
/*
* Clear the pointer before cleaning up, so that the interrupt won't
@@ -1310,7 +1295,7 @@ int mce_threshold_remove_device(unsigned int cpu)
this_cpu_write(threshold_banks, NULL);
__threshold_remove_device(bp);
- return 0;
+ return;
}
/**
@@ -1324,36 +1309,34 @@ int mce_threshold_remove_device(unsigned int cpu)
* thread running on @cpu. The callback is invoked on all CPUs which are
* online when the callback is installed or during a real hotplug event.
*/
-int mce_threshold_create_device(unsigned int cpu)
+void mce_threshold_create_device(unsigned int cpu)
{
unsigned int numbanks, bank;
struct threshold_bank **bp;
- int err;
if (!mce_flags.amd_threshold)
- return 0;
+ return;
bp = this_cpu_read(threshold_banks);
if (bp)
- return 0;
+ return;
numbanks = this_cpu_read(mce_num_banks);
bp = kcalloc(numbanks, sizeof(*bp), GFP_KERNEL);
if (!bp)
- return -ENOMEM;
+ return;
for (bank = 0; bank < numbanks; ++bank) {
if (!(this_cpu_read(bank_map) & BIT_ULL(bank)))
continue;
- err = threshold_create_bank(bp, cpu, bank);
- if (err) {
+ if (threshold_create_bank(bp, cpu, bank)) {
__threshold_remove_device(bp);
- return err;
+ return;
}
}
this_cpu_write(threshold_banks, bp);
if (thresholding_irq_en)
mce_threshold_vector = amd_threshold_interrupt;
- return 0;
+ return;
}
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 4da4eab56c81..460e90a1a0b1 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -423,7 +423,7 @@ noinstr u64 mce_rdmsrq(u32 msr)
return EAX_EDX_VAL(val, low, high);
}
-static noinstr void mce_wrmsrq(u32 msr, u64 v)
+noinstr void mce_wrmsrq(u32 msr, u64 v)
{
u32 low, high;
@@ -715,6 +715,60 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
DEFINE_PER_CPU(unsigned, mce_poll_count);
/*
+ * Newer Intel systems that support software error
+ * recovery need to make additional checks. Other
+ * CPUs should skip over uncorrected errors, but log
+ * everything else.
+ */
+static bool ser_should_log_poll_error(struct mce *m)
+{
+ /* Log "not enabled" (speculative) errors */
+ if (!(m->status & MCI_STATUS_EN))
+ return true;
+
+ /*
+ * Log UCNA (SDM: 15.6.3 "UCR Error Classification")
+ * UC == 1 && PCC == 0 && S == 0
+ */
+ if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S))
+ return true;
+
+ return false;
+}
+
+static bool should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *err)
+{
+ struct mce *m = &err->m;
+
+ /* If this entry is not valid, ignore it. */
+ if (!(m->status & MCI_STATUS_VAL))
+ return false;
+
+ /*
+ * If we are logging everything (at CPU online) or this
+ * is a corrected error, then we must log it.
+ */
+ if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC))
+ return true;
+
+ if (mca_cfg.ser)
+ return ser_should_log_poll_error(m);
+
+ if (m->status & MCI_STATUS_UC)
+ return false;
+
+ return true;
+}
+
+static void clear_bank(struct mce *m)
+{
+ if (m->cpuvendor == X86_VENDOR_AMD)
+ return amd_clear_bank(m);
+
+ mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0);
+}
+
+/*
* Poll for corrected events or events that happened before reset.
* Those are just logged through /dev/mcelog.
*
@@ -765,51 +819,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
if (!mca_cfg.cmci_disabled)
mce_track_storm(m);
- /* If this entry is not valid, ignore it */
- if (!(m->status & MCI_STATUS_VAL))
+ /* Verify that the error should be logged based on hardware conditions. */
+ if (!should_log_poll_error(flags, &err))
continue;
- /*
- * If we are logging everything (at CPU online) or this
- * is a corrected error, then we must log it.
- */
- if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC))
- goto log_it;
-
- /*
- * Newer Intel systems that support software error
- * recovery need to make additional checks. Other
- * CPUs should skip over uncorrected errors, but log
- * everything else.
- */
- if (!mca_cfg.ser) {
- if (m->status & MCI_STATUS_UC)
- continue;
- goto log_it;
- }
-
- /* Log "not enabled" (speculative) errors */
- if (!(m->status & MCI_STATUS_EN))
- goto log_it;
-
- /*
- * Log UCNA (SDM: 15.6.3 "UCR Error Classification")
- * UC == 1 && PCC == 0 && S == 0
- */
- if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S))
- goto log_it;
-
- /*
- * Skip anything else. Presumption is that our read of this
- * bank is racing with a machine check. Leave the log alone
- * for do_machine_check() to deal with it.
- */
- continue;
-
-log_it:
- if (flags & MCP_DONTLOG)
- goto clear_it;
-
mce_read_aux(&err, i);
m->severity = mce_severity(m, NULL, NULL, false);
/*
@@ -826,10 +839,7 @@ log_it:
mce_log(&err);
clear_it:
- /*
- * Clear state for this bank.
- */
- mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0);
+ clear_bank(m);
}
/*
@@ -1810,9 +1820,10 @@ static void __mcheck_cpu_mce_banks_init(void)
struct mce_bank *b = &mce_banks[i];
/*
- * Init them all, __mcheck_cpu_apply_quirks() is going to apply
- * the required vendor quirks before
- * __mcheck_cpu_init_clear_banks() does the final bank setup.
+ * Init them all by default.
+ *
+ * The required vendor quirks will be applied before
+ * __mcheck_cpu_init_prepare_banks() does the final bank setup.
*/
b->ctl = -1ULL;
b->init = true;
@@ -1840,69 +1851,34 @@ static void __mcheck_cpu_cap_init(void)
this_cpu_write(mce_num_banks, b);
__mcheck_cpu_mce_banks_init();
-
- /* Use accurate RIP reporting if available. */
- if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
- mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
-
- if (cap & MCG_SER_P)
- mca_cfg.ser = 1;
}
static void __mcheck_cpu_init_generic(void)
{
- enum mcp_flags m_fl = 0;
- mce_banks_t all_banks;
u64 cap;
- if (!mca_cfg.bootlog)
- m_fl = MCP_DONTLOG;
-
- /*
- * Log the machine checks left over from the previous reset. Log them
- * only, do not start processing them. That will happen in mcheck_late_init()
- * when all consumers have been registered on the notifier chain.
- */
- bitmap_fill(all_banks, MAX_NR_BANKS);
- machine_check_poll(MCP_UC | MCP_QUEUE_LOG | m_fl, &all_banks);
-
- cr4_set_bits(X86_CR4_MCE);
-
rdmsrq(MSR_IA32_MCG_CAP, cap);
if (cap & MCG_CTL_P)
wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
}
-static void __mcheck_cpu_init_clear_banks(void)
+static void __mcheck_cpu_init_prepare_banks(void)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+ u64 msrval;
int i;
- for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
- struct mce_bank *b = &mce_banks[i];
+ /*
+ * Log the machine checks left over from the previous reset. Log them
+ * only, do not start processing them. That will happen in mcheck_late_init()
+ * when all consumers have been registered on the notifier chain.
+ */
+ if (mca_cfg.bootlog) {
+ mce_banks_t all_banks;
- if (!b->init)
- continue;
- wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl);
- wrmsrq(mca_msr_reg(i, MCA_STATUS), 0);
+ bitmap_fill(all_banks, MAX_NR_BANKS);
+ machine_check_poll(MCP_UC | MCP_QUEUE_LOG, &all_banks);
}
-}
-
-/*
- * Do a final check to see if there are any unused/RAZ banks.
- *
- * This must be done after the banks have been initialized and any quirks have
- * been applied.
- *
- * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs.
- * Otherwise, a user who disables a bank will not be able to re-enable it
- * without a system reboot.
- */
-static void __mcheck_cpu_check_banks(void)
-{
- struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
- u64 msrval;
- int i;
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
struct mce_bank *b = &mce_banks[i];
@@ -1910,25 +1886,16 @@ static void __mcheck_cpu_check_banks(void)
if (!b->init)
continue;
+ wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl);
+ wrmsrq(mca_msr_reg(i, MCA_STATUS), 0);
+
rdmsrq(mca_msr_reg(i, MCA_CTL), msrval);
b->init = !!msrval;
}
}
-static void apply_quirks_amd(struct cpuinfo_x86 *c)
+static void amd_apply_global_quirks(struct cpuinfo_x86 *c)
{
- struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
-
- /* This should be disabled by the BIOS, but isn't always */
- if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
- /*
- * disable GART TBL walk error reporting, which
- * trips off incorrectly with the IOMMU & 3ware
- * & Cerberus:
- */
- clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
- }
-
if (c->x86 < 0x11 && mca_cfg.bootlog < 0) {
/*
* Lots of broken BIOS around that don't clear them
@@ -1938,13 +1905,6 @@ static void apply_quirks_amd(struct cpuinfo_x86 *c)
}
/*
- * Various K7s with broken bank 0 around. Always disable
- * by default.
- */
- if (c->x86 == 6 && this_cpu_read(mce_num_banks))
- mce_banks[0].ctl = 0;
-
- /*
* overflow_recov is supported for F15h Models 00h-0fh
* even though we don't have a CPUID bit for it.
*/
@@ -1955,26 +1915,13 @@ static void apply_quirks_amd(struct cpuinfo_x86 *c)
mce_flags.zen_ifu_quirk = 1;
}
-static void apply_quirks_intel(struct cpuinfo_x86 *c)
+static void intel_apply_global_quirks(struct cpuinfo_x86 *c)
{
- struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
-
/* Older CPUs (prior to family 6) don't need quirks. */
if (c->x86_vfm < INTEL_PENTIUM_PRO)
return;
/*
- * SDM documents that on family 6 bank 0 should not be written
- * because it aliases to another special BIOS controlled
- * register.
- * But it's not aliased anymore on model 0x1a+
- * Don't ignore bank 0 completely because there could be a
- * valid event later, merely don't write CTL0.
- */
- if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks))
- mce_banks[0].init = false;
-
- /*
* All newer Intel systems support MCE broadcasting. Enable
* synchronization with a one second timeout.
*/
@@ -1999,7 +1946,7 @@ static void apply_quirks_intel(struct cpuinfo_x86 *c)
mce_flags.skx_repmov_quirk = 1;
}
-static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c)
+static void zhaoxin_apply_global_quirks(struct cpuinfo_x86 *c)
{
/*
* All newer Zhaoxin CPUs support MCE broadcasting. Enable
@@ -2011,34 +1958,6 @@ static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c)
}
}
-/* Add per CPU specific workarounds here */
-static bool __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
-{
- struct mca_config *cfg = &mca_cfg;
-
- switch (c->x86_vendor) {
- case X86_VENDOR_UNKNOWN:
- pr_info("unknown CPU type - not enabling MCE support\n");
- return false;
- case X86_VENDOR_AMD:
- apply_quirks_amd(c);
- break;
- case X86_VENDOR_INTEL:
- apply_quirks_intel(c);
- break;
- case X86_VENDOR_ZHAOXIN:
- apply_quirks_zhaoxin(c);
- break;
- }
-
- if (cfg->monarch_timeout < 0)
- cfg->monarch_timeout = 0;
- if (cfg->bootlog != 0)
- cfg->panic_timeout = 30;
-
- return true;
-}
-
static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
if (c->x86 != 5)
@@ -2060,19 +1979,6 @@ static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
return false;
}
-/*
- * Init basic CPU features needed for early decoding of MCEs.
- */
-static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
-{
- if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) {
- mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
- mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR);
- mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA);
- mce_flags.amd_threshold = 1;
- }
-}
-
static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
{
struct mca_config *cfg = &mca_cfg;
@@ -2281,6 +2187,53 @@ DEFINE_IDTENTRY_RAW(exc_machine_check)
}
#endif
+void mca_bsp_init(struct cpuinfo_x86 *c)
+{
+ u64 cap;
+
+ if (!mce_available(c))
+ return;
+
+ if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
+ mca_cfg.disabled = 1;
+ pr_info("unknown CPU type - not enabling MCE support\n");
+ return;
+ }
+
+ mce_flags.overflow_recov = cpu_feature_enabled(X86_FEATURE_OVERFLOW_RECOV);
+ mce_flags.succor = cpu_feature_enabled(X86_FEATURE_SUCCOR);
+ mce_flags.smca = cpu_feature_enabled(X86_FEATURE_SMCA);
+
+ if (mce_flags.smca)
+ smca_bsp_init();
+
+ rdmsrq(MSR_IA32_MCG_CAP, cap);
+
+ /* Use accurate RIP reporting if available. */
+ if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
+ mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
+
+ if (cap & MCG_SER_P)
+ mca_cfg.ser = 1;
+
+ switch (c->x86_vendor) {
+ case X86_VENDOR_AMD:
+ amd_apply_global_quirks(c);
+ break;
+ case X86_VENDOR_INTEL:
+ intel_apply_global_quirks(c);
+ break;
+ case X86_VENDOR_ZHAOXIN:
+ zhaoxin_apply_global_quirks(c);
+ break;
+ }
+
+ if (mca_cfg.monarch_timeout < 0)
+ mca_cfg.monarch_timeout = 0;
+ if (mca_cfg.bootlog != 0)
+ mca_cfg.panic_timeout = 30;
+}
+
/*
* Called for each booted CPU to set up machine checks.
* Must be called with preempt off:
@@ -2298,11 +2251,6 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
__mcheck_cpu_cap_init();
- if (!__mcheck_cpu_apply_quirks(c)) {
- mca_cfg.disabled = 1;
- return;
- }
-
if (!mce_gen_pool_init()) {
mca_cfg.disabled = 1;
pr_emerg("Couldn't allocate MCE records pool!\n");
@@ -2311,12 +2259,11 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
mca_cfg.initialized = 1;
- __mcheck_cpu_init_early(c);
__mcheck_cpu_init_generic();
__mcheck_cpu_init_vendor(c);
- __mcheck_cpu_init_clear_banks();
- __mcheck_cpu_check_banks();
+ __mcheck_cpu_init_prepare_banks();
__mcheck_cpu_setup_timer();
+ cr4_set_bits(X86_CR4_MCE);
}
/*
@@ -2483,7 +2430,8 @@ static void mce_syscore_resume(void)
{
__mcheck_cpu_init_generic();
__mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
- __mcheck_cpu_init_clear_banks();
+ __mcheck_cpu_init_prepare_banks();
+ cr4_set_bits(X86_CR4_MCE);
}
static struct syscore_ops mce_syscore_ops = {
@@ -2501,8 +2449,9 @@ static void mce_cpu_restart(void *data)
if (!mce_available(raw_cpu_ptr(&cpu_info)))
return;
__mcheck_cpu_init_generic();
- __mcheck_cpu_init_clear_banks();
+ __mcheck_cpu_init_prepare_banks();
__mcheck_cpu_init_timer();
+ cr4_set_bits(X86_CR4_MCE);
}
/* Reinit MCEs after user configuration changes */
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index 9b149b9c4109..4655223ba560 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -468,8 +468,26 @@ static void intel_imc_init(struct cpuinfo_x86 *c)
}
}
+static void intel_apply_cpu_quirks(struct cpuinfo_x86 *c)
+{
+ /*
+ * SDM documents that on family 6 bank 0 should not be written
+ * because it aliases to another special BIOS controlled
+ * register.
+ * But it's not aliased anymore on model 0x1a+
+ * Don't ignore bank 0 completely because there could be a
+ * valid event later, merely don't write CTL0.
+ *
+ * Older CPUs (prior to family 6) can't reach this point and already
+ * return early due to the check in __mcheck_cpu_ancient_init().
+ */
+ if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks))
+ this_cpu_ptr(mce_banks_array)[0].init = false;
+}
+
void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
+ intel_apply_cpu_quirks(c);
intel_init_cmci();
intel_init_lmce();
intel_imc_init(c);
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index b5ba598e54cb..b0e00ec5cc8c 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -265,8 +265,11 @@ void mce_prep_record_common(struct mce *m);
void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m);
#ifdef CONFIG_X86_MCE_AMD
+void mce_threshold_create_device(unsigned int cpu);
+void mce_threshold_remove_device(unsigned int cpu);
extern bool amd_filter_mce(struct mce *m);
bool amd_mce_usable_address(struct mce *m);
+void amd_clear_bank(struct mce *m);
/*
* If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits
@@ -292,10 +295,15 @@ static __always_inline void smca_extract_err_addr(struct mce *m)
m->addr &= GENMASK_ULL(55, lsb);
}
+void smca_bsp_init(void);
#else
+static inline void mce_threshold_create_device(unsigned int cpu) { }
+static inline void mce_threshold_remove_device(unsigned int cpu) { }
static inline bool amd_filter_mce(struct mce *m) { return false; }
static inline bool amd_mce_usable_address(struct mce *m) { return false; }
+static inline void amd_clear_bank(struct mce *m) { }
static inline void smca_extract_err_addr(struct mce *m) { }
+static inline void smca_bsp_init(void) { }
#endif
#ifdef CONFIG_X86_ANCIENT_MCE
@@ -313,6 +321,7 @@ static __always_inline void winchip_machine_check(struct pt_regs *regs) {}
#endif
noinstr u64 mce_rdmsrq(u32 msr);
+noinstr void mce_wrmsrq(u32 msr, u64 v);
static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg)
{
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 097e39327942..cdce885e2fd5 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -171,8 +171,28 @@ static int cmp_id(const void *key, const void *elem)
return 1;
}
+static u32 cpuid_to_ucode_rev(unsigned int val)
+{
+ union zen_patch_rev p = {};
+ union cpuid_1_eax c;
+
+ c.full = val;
+
+ p.stepping = c.stepping;
+ p.model = c.model;
+ p.ext_model = c.ext_model;
+ p.ext_fam = c.ext_fam;
+
+ return p.ucode_rev;
+}
+
static bool need_sha_check(u32 cur_rev)
{
+ if (!cur_rev) {
+ cur_rev = cpuid_to_ucode_rev(bsp_cpuid_1_eax);
+ pr_info_once("No current revision, generating the lowest one: 0x%x\n", cur_rev);
+ }
+
switch (cur_rev >> 8) {
case 0x80012: return cur_rev <= 0x800126f; break;
case 0x80082: return cur_rev <= 0x800820f; break;
@@ -249,15 +269,6 @@ static bool verify_sha256_digest(u32 patch_id, u32 cur_rev, const u8 *data, unsi
return true;
}
-static u32 get_patch_level(void)
-{
- u32 rev, dummy __always_unused;
-
- native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-
- return rev;
-}
-
static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val)
{
union zen_patch_rev p;
@@ -275,6 +286,30 @@ static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val)
return c;
}
+static u32 get_patch_level(void)
+{
+ u32 rev, dummy __always_unused;
+
+ if (IS_ENABLED(CONFIG_MICROCODE_DBG)) {
+ int cpu = smp_processor_id();
+
+ if (!microcode_rev[cpu]) {
+ if (!base_rev)
+ base_rev = cpuid_to_ucode_rev(bsp_cpuid_1_eax);
+
+ microcode_rev[cpu] = base_rev;
+
+ ucode_dbg("CPU%d, base_rev: 0x%x\n", cpu, base_rev);
+ }
+
+ return microcode_rev[cpu];
+ }
+
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+
+ return rev;
+}
+
static u16 find_equiv_id(struct equiv_cpu_table *et, u32 sig)
{
unsigned int i;
@@ -304,13 +339,13 @@ static bool verify_container(const u8 *buf, size_t buf_size)
u32 cont_magic;
if (buf_size <= CONTAINER_HDR_SZ) {
- pr_debug("Truncated microcode container header.\n");
+ ucode_dbg("Truncated microcode container header.\n");
return false;
}
cont_magic = *(const u32 *)buf;
if (cont_magic != UCODE_MAGIC) {
- pr_debug("Invalid magic value (0x%08x).\n", cont_magic);
+ ucode_dbg("Invalid magic value (0x%08x).\n", cont_magic);
return false;
}
@@ -335,8 +370,8 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size)
cont_type = hdr[1];
if (cont_type != UCODE_EQUIV_CPU_TABLE_TYPE) {
- pr_debug("Wrong microcode container equivalence table type: %u.\n",
- cont_type);
+ ucode_dbg("Wrong microcode container equivalence table type: %u.\n",
+ cont_type);
return false;
}
@@ -345,7 +380,7 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size)
equiv_tbl_len = hdr[2];
if (equiv_tbl_len < sizeof(struct equiv_cpu_entry) ||
buf_size < equiv_tbl_len) {
- pr_debug("Truncated equivalence table.\n");
+ ucode_dbg("Truncated equivalence table.\n");
return false;
}
@@ -365,7 +400,7 @@ static bool __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize
const u32 *hdr;
if (buf_size < SECTION_HDR_SIZE) {
- pr_debug("Truncated patch section.\n");
+ ucode_dbg("Truncated patch section.\n");
return false;
}
@@ -374,13 +409,13 @@ static bool __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize
p_size = hdr[1];
if (p_type != UCODE_UCODE_TYPE) {
- pr_debug("Invalid type field (0x%x) in container file section header.\n",
- p_type);
+ ucode_dbg("Invalid type field (0x%x) in container file section header.\n",
+ p_type);
return false;
}
if (p_size < sizeof(struct microcode_header_amd)) {
- pr_debug("Patch of size %u too short.\n", p_size);
+ ucode_dbg("Patch of size %u too short.\n", p_size);
return false;
}
@@ -457,12 +492,12 @@ static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size)
* size sh_psize, as the section claims.
*/
if (buf_size < sh_psize) {
- pr_debug("Patch of size %u truncated.\n", sh_psize);
+ ucode_dbg("Patch of size %u truncated.\n", sh_psize);
return -1;
}
if (!__verify_patch_size(sh_psize, buf_size)) {
- pr_debug("Per-family patch size mismatch.\n");
+ ucode_dbg("Per-family patch size mismatch.\n");
return -1;
}
@@ -476,6 +511,9 @@ static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size)
proc_id = mc_hdr->processor_rev_id;
patch_fam = 0xf + (proc_id >> 12);
+
+ ucode_dbg("Patch-ID 0x%08x: family: 0x%x\n", mc_hdr->patch_id, patch_fam);
+
if (patch_fam != family)
return 1;
@@ -546,9 +584,14 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc)
}
mc = (struct microcode_amd *)(buf + SECTION_HDR_SIZE);
+
+ ucode_dbg("patch_id: 0x%x\n", mc->hdr.patch_id);
+
if (mc_patch_matches(mc, eq_id)) {
desc->psize = patch_size;
desc->mc = mc;
+
+ ucode_dbg(" match: size: %d\n", patch_size);
}
skip:
@@ -619,8 +662,14 @@ static bool __apply_microcode_amd(struct microcode_amd *mc, u32 *cur_rev,
invlpg(p_addr_end);
}
+ if (IS_ENABLED(CONFIG_MICROCODE_DBG))
+ microcode_rev[smp_processor_id()] = mc->hdr.patch_id;
+
/* verify patch application was successful */
*cur_rev = get_patch_level();
+
+ ucode_dbg("updated rev: 0x%x\n", *cur_rev);
+
if (*cur_rev != mc->hdr.patch_id)
return false;
@@ -749,8 +798,6 @@ static struct ucode_patch *cache_find_patch(struct ucode_cpu_info *uci, u16 equi
n.equiv_cpu = equiv_cpu;
n.patch_id = uci->cpu_sig.rev;
- WARN_ON_ONCE(!n.patch_id);
-
list_for_each_entry(p, &microcode_cache, plist)
if (patch_cpus_equivalent(p, &n, false))
return p;
@@ -1008,7 +1055,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover,
patch->patch_id = mc_hdr->patch_id;
patch->equiv_cpu = proc_id;
- pr_debug("%s: Adding patch_id: 0x%08x, proc_id: 0x%04x\n",
+ ucode_dbg("%s: Adding patch_id: 0x%08x, proc_id: 0x%04x\n",
__func__, patch->patch_id, proc_id);
/* ... and add to cache. */
@@ -1151,7 +1198,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device)
snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
if (request_firmware_direct(&fw, (const char *)fw_name, device)) {
- pr_debug("failed to load file %s\n", fw_name);
+ ucode_dbg("failed to load file %s\n", fw_name);
goto out;
}
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index b92e09a87c69..f75c140906d0 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -43,10 +43,19 @@
#include "internal.h"
static struct microcode_ops *microcode_ops;
-static bool dis_ucode_ldr = false;
+static bool dis_ucode_ldr;
bool force_minrev = IS_ENABLED(CONFIG_MICROCODE_LATE_FORCE_MINREV);
-module_param(force_minrev, bool, S_IRUSR | S_IWUSR);
+
+/*
+ * These should be behind CONFIG_MICROCODE_DBG ifdeffery but, in order
+ * to not uglify the code with ifdeffery and to use IS_ENABLED()
+ * instead, they are left in. When microcode debugging is not enabled,
+ * they are meaningless anyway.
+ */
+/* base microcode revision for debugging */
+u32 base_rev;
+u32 microcode_rev[NR_CPUS] = {};
/*
* Synchronization.
@@ -119,20 +128,48 @@ bool __init microcode_loader_disabled(void)
* overwritten.
*/
if (!cpuid_feature() ||
- native_cpuid_ecx(1) & BIT(31) ||
+ ((native_cpuid_ecx(1) & BIT(31)) &&
+ !IS_ENABLED(CONFIG_MICROCODE_DBG)) ||
amd_check_current_patch_level())
dis_ucode_ldr = true;
return dis_ucode_ldr;
}
+static void early_parse_cmdline(void)
+{
+ char cmd_buf[64] = {};
+ char *s, *p = cmd_buf;
+
+ if (cmdline_find_option(boot_command_line, "microcode", cmd_buf, sizeof(cmd_buf)) > 0) {
+ while ((s = strsep(&p, ","))) {
+ if (IS_ENABLED(CONFIG_MICROCODE_DBG)) {
+ if (strstr(s, "base_rev=")) {
+ /* advance to the option arg */
+ strsep(&s, "=");
+ if (kstrtouint(s, 16, &base_rev)) { ; }
+ }
+ }
+
+ if (!strcmp("force_minrev", s))
+ force_minrev = true;
+
+ if (!strcmp(s, "dis_ucode_ldr"))
+ dis_ucode_ldr = true;
+ }
+ }
+
+ /* old, compat option */
+ if (cmdline_find_option_bool(boot_command_line, "dis_ucode_ldr") > 0)
+ dis_ucode_ldr = true;
+}
+
void __init load_ucode_bsp(void)
{
unsigned int cpuid_1_eax;
bool intel = true;
- if (cmdline_find_option_bool(boot_command_line, "dis_ucode_ldr") > 0)
- dis_ucode_ldr = true;
+ early_parse_cmdline();
if (microcode_loader_disabled())
return;
diff --git a/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h
index cb6e601701ab..2d48e6593540 100644
--- a/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h
+++ b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h
@@ -67,9 +67,8 @@
{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0008, .driver_data = 0x1000191 },
{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0010, .driver_data = 0x2007006 },
{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0020, .driver_data = 0x3000010 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0040, .driver_data = 0x4003605 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0080, .driver_data = 0x5003707 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0800, .driver_data = 0x7002904 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0080, .driver_data = 0x5003901 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0800, .driver_data = 0x7002b01 },
{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0004, .driver_data = 0x1c },
{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0008, .driver_data = 0x700001c },
{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0010, .driver_data = 0xf00001a },
@@ -81,51 +80,62 @@
{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5f, .steppings = 0x0002, .driver_data = 0x3e },
{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x66, .steppings = 0x0008, .driver_data = 0x2a },
{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0020, .driver_data = 0xc0002f0 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0040, .driver_data = 0xd0003e7 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6c, .steppings = 0x0002, .driver_data = 0x10002b0 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0040, .driver_data = 0xd000404 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6c, .steppings = 0x0002, .driver_data = 0x10002d0 },
{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0002, .driver_data = 0x42 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0100, .driver_data = 0x24 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7e, .steppings = 0x0020, .driver_data = 0xc6 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0100, .driver_data = 0x26 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7e, .steppings = 0x0020, .driver_data = 0xca },
{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8a, .steppings = 0x0002, .driver_data = 0x33 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0002, .driver_data = 0xb8 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0004, .driver_data = 0x38 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8d, .steppings = 0x0002, .driver_data = 0x52 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0002, .driver_data = 0xbc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0004, .driver_data = 0x3c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8d, .steppings = 0x0002, .driver_data = 0x56 },
{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0200, .driver_data = 0xf6 },
{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0400, .driver_data = 0xf6 },
{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0800, .driver_data = 0xf6 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x1000, .driver_data = 0xfc },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0100, .driver_data = 0x2c000390 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0080, .driver_data = 0x2b000603 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0040, .driver_data = 0x2c000390 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0020, .driver_data = 0x2c000390 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0010, .driver_data = 0x2c000390 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x1000, .driver_data = 0x100 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0010, .driver_data = 0x2c0003f7 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0020, .driver_data = 0x2c0003f7 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0040, .driver_data = 0x2c0003f7 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0080, .driver_data = 0x2b000639 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0100, .driver_data = 0x2c0003f7 },
{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x96, .steppings = 0x0002, .driver_data = 0x1a },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0004, .driver_data = 0x37 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0020, .driver_data = 0x37 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0004, .driver_data = 0x37 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0020, .driver_data = 0x37 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0008, .driver_data = 0x435 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0010, .driver_data = 0x435 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0004, .driver_data = 0x3a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0020, .driver_data = 0x3a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0008, .driver_data = 0x437 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0010, .driver_data = 0x437 },
{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9c, .steppings = 0x0001, .driver_data = 0x24000026 },
{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0200, .driver_data = 0xf8 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0400, .driver_data = 0xf8 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0400, .driver_data = 0xfa },
{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0800, .driver_data = 0xf6 },
{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x1000, .driver_data = 0xf8 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x2000, .driver_data = 0x100 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0004, .driver_data = 0xfc },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0008, .driver_data = 0xfc },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0020, .driver_data = 0xfc },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0001, .driver_data = 0xfe },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0002, .driver_data = 0xfc },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa7, .steppings = 0x0002, .driver_data = 0x62 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaa, .steppings = 0x0010, .driver_data = 0x20 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0002, .driver_data = 0x12b },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0004, .driver_data = 0x4123 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0008, .driver_data = 0x4123 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0100, .driver_data = 0x4123 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbe, .steppings = 0x0001, .driver_data = 0x1a },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0004, .driver_data = 0x21000283 },
-{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0002, .driver_data = 0x21000283 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x2000, .driver_data = 0x104 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0004, .driver_data = 0x100 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0008, .driver_data = 0x100 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0020, .driver_data = 0x100 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0001, .driver_data = 0x102 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0002, .driver_data = 0x100 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa7, .steppings = 0x0002, .driver_data = 0x64 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaa, .steppings = 0x0010, .driver_data = 0x24 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xad, .steppings = 0x0002, .driver_data = 0xa0000d1 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaf, .steppings = 0x0008, .driver_data = 0x3000341 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb5, .steppings = 0x0001, .driver_data = 0xa },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0002, .driver_data = 0x12f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0010, .driver_data = 0x12f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0004, .driver_data = 0x4128 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0008, .driver_data = 0x4128 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0100, .driver_data = 0x4128 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbd, .steppings = 0x0002, .driver_data = 0x11f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbe, .steppings = 0x0001, .driver_data = 0x1d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0004, .driver_data = 0x3a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0020, .driver_data = 0x3a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0040, .driver_data = 0x3a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0080, .driver_data = 0x3a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc5, .steppings = 0x0004, .driver_data = 0x118 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc6, .steppings = 0x0004, .driver_data = 0x118 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc6, .steppings = 0x0010, .driver_data = 0x118 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xca, .steppings = 0x0004, .driver_data = 0x118 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0002, .driver_data = 0x210002a9 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0004, .driver_data = 0x210002a9 },
{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0080, .driver_data = 0x12 },
{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0400, .driver_data = 0x15 },
{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x01, .steppings = 0x0004, .driver_data = 0x2e },
diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h
index 50a9702ae4e2..ae8dbc2b908d 100644
--- a/arch/x86/kernel/cpu/microcode/internal.h
+++ b/arch/x86/kernel/cpu/microcode/internal.h
@@ -44,6 +44,9 @@ struct early_load_data {
extern struct early_load_data early_data;
extern struct ucode_cpu_info ucode_cpu_info[];
+extern u32 microcode_rev[NR_CPUS];
+extern u32 base_rev;
+
struct cpio_data find_microcode_in_initrd(const char *path);
#define MAX_UCODE_COUNT 128
@@ -122,4 +125,10 @@ static inline void reload_ucode_intel(void) { }
static inline struct microcode_ops *init_intel_microcode(void) { return NULL; }
#endif /* !CONFIG_CPU_SUP_INTEL */
+#define ucode_dbg(fmt, ...) \
+({ \
+ if (IS_ENABLED(CONFIG_MICROCODE_DBG)) \
+ pr_info(fmt, ##__VA_ARGS__); \
+})
+
#endif /* _X86_MICROCODE_INTERNAL_H */
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 187d527ef73b..06ca5a30140c 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -107,7 +107,7 @@ u32 resctrl_arch_system_num_rmid_idx(void)
struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
/* RMID are independent numbers for x86. num_rmid_idx == num_rmid */
- return r->num_rmid;
+ return r->mon.num_rmid;
}
struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l)
@@ -365,8 +365,10 @@ static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom)
static void mon_domain_free(struct rdt_hw_mon_domain *hw_dom)
{
- kfree(hw_dom->arch_mbm_total);
- kfree(hw_dom->arch_mbm_local);
+ int idx;
+
+ for_each_mbm_idx(idx)
+ kfree(hw_dom->arch_mbm_states[idx]);
kfree(hw_dom);
}
@@ -400,25 +402,27 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain *
*/
static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom)
{
- size_t tsize;
-
- if (resctrl_arch_is_mbm_total_enabled()) {
- tsize = sizeof(*hw_dom->arch_mbm_total);
- hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL);
- if (!hw_dom->arch_mbm_total)
- return -ENOMEM;
- }
- if (resctrl_arch_is_mbm_local_enabled()) {
- tsize = sizeof(*hw_dom->arch_mbm_local);
- hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL);
- if (!hw_dom->arch_mbm_local) {
- kfree(hw_dom->arch_mbm_total);
- hw_dom->arch_mbm_total = NULL;
- return -ENOMEM;
- }
+ size_t tsize = sizeof(*hw_dom->arch_mbm_states[0]);
+ enum resctrl_event_id eventid;
+ int idx;
+
+ for_each_mbm_event_id(eventid) {
+ if (!resctrl_is_mon_event_enabled(eventid))
+ continue;
+ idx = MBM_STATE_IDX(eventid);
+ hw_dom->arch_mbm_states[idx] = kcalloc(num_rmid, tsize, GFP_KERNEL);
+ if (!hw_dom->arch_mbm_states[idx])
+ goto cleanup;
}
return 0;
+cleanup:
+ for_each_mbm_idx(idx) {
+ kfree(hw_dom->arch_mbm_states[idx]);
+ hw_dom->arch_mbm_states[idx] = NULL;
+ }
+
+ return -ENOMEM;
}
static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope)
@@ -516,6 +520,9 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
d = container_of(hdr, struct rdt_mon_domain, hdr);
cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
+ /* Update the mbm_assign_mode state for the CPU if supported */
+ if (r->mon.mbm_cntr_assignable)
+ resctrl_arch_mbm_cntr_assign_set_one(r);
return;
}
@@ -535,9 +542,13 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
d->ci_id = ci->id;
cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
+ /* Update the mbm_assign_mode state for the CPU if supported */
+ if (r->mon.mbm_cntr_assignable)
+ resctrl_arch_mbm_cntr_assign_set_one(r);
+
arch_mon_domain_online(r, d);
- if (arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
+ if (arch_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) {
mon_domain_free(hw_dom);
return;
}
@@ -707,6 +718,7 @@ enum {
RDT_FLAG_MBA,
RDT_FLAG_SMBA,
RDT_FLAG_BMEC,
+ RDT_FLAG_ABMC,
};
#define RDT_OPT(idx, n, f) \
@@ -732,6 +744,7 @@ static struct rdt_options rdt_options[] __ro_after_init = {
RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA),
RDT_OPT(RDT_FLAG_SMBA, "smba", X86_FEATURE_SMBA),
RDT_OPT(RDT_FLAG_BMEC, "bmec", X86_FEATURE_BMEC),
+ RDT_OPT(RDT_FLAG_ABMC, "abmc", X86_FEATURE_ABMC),
};
#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options)
@@ -863,15 +876,24 @@ static __init bool get_rdt_alloc_resources(void)
static __init bool get_rdt_mon_resources(void)
{
struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ bool ret = false;
- if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC))
- rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID);
- if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL))
- rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID);
- if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL))
- rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID);
+ if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) {
+ resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID);
+ ret = true;
+ }
+ if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) {
+ resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID);
+ ret = true;
+ }
+ if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) {
+ resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID);
+ ret = true;
+ }
+ if (rdt_cpu_has(X86_FEATURE_ABMC))
+ ret = true;
- if (!rdt_mon_features)
+ if (!ret)
return false;
return !rdt_get_mon_l3_config(r);
@@ -965,7 +987,7 @@ static enum cpuhp_state rdt_online;
/* Runs once on the BSP during boot. */
void resctrl_cpu_detect(struct cpuinfo_x86 *c)
{
- if (!cpu_has(c, X86_FEATURE_CQM_LLC)) {
+ if (!cpu_has(c, X86_FEATURE_CQM_LLC) && !cpu_has(c, X86_FEATURE_ABMC)) {
c->x86_cache_max_rmid = -1;
c->x86_cache_occ_scale = -1;
c->x86_cache_mbm_width_offset = -1;
@@ -977,7 +999,8 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) ||
cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) ||
- cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) {
+ cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL) ||
+ cpu_has(c, X86_FEATURE_ABMC)) {
u32 eax, ebx, ecx, edx;
/* QoS sub-leaf, EAX=0Fh, ECX=1 */
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 5e3c41b36437..9f4c2f0aaf5c 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -37,6 +37,15 @@ struct arch_mbm_state {
u64 prev_msr;
};
+/* Setting bit 0 in L3_QOS_EXT_CFG enables the ABMC feature. */
+#define ABMC_ENABLE_BIT 0
+
+/*
+ * QoS Event Identifiers.
+ */
+#define ABMC_EXTENDED_EVT_ID BIT(31)
+#define ABMC_EVT_ID BIT(0)
+
/**
* struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share
* a resource for a control function
@@ -54,15 +63,15 @@ struct rdt_hw_ctrl_domain {
* struct rdt_hw_mon_domain - Arch private attributes of a set of CPUs that share
* a resource for a monitor function
* @d_resctrl: Properties exposed to the resctrl file system
- * @arch_mbm_total: arch private state for MBM total bandwidth
- * @arch_mbm_local: arch private state for MBM local bandwidth
+ * @arch_mbm_states: Per-event pointer to the MBM event's saved state.
+ * An MBM event's state is an array of struct arch_mbm_state
+ * indexed by RMID on x86.
*
* Members of this structure are accessed via helpers that provide abstraction.
*/
struct rdt_hw_mon_domain {
struct rdt_mon_domain d_resctrl;
- struct arch_mbm_state *arch_mbm_total;
- struct arch_mbm_state *arch_mbm_local;
+ struct arch_mbm_state *arch_mbm_states[QOS_NUM_L3_MBM_EVENTS];
};
static inline struct rdt_hw_ctrl_domain *resctrl_to_arch_ctrl_dom(struct rdt_ctrl_domain *r)
@@ -102,6 +111,7 @@ struct msr_param {
* @mon_scale: cqm counter * mon_scale = occupancy in bytes
* @mbm_width: Monitor width, to detect and correct for overflow.
* @cdp_enabled: CDP state of this resource
+ * @mbm_cntr_assign_enabled: ABMC feature is enabled
*
* Members of this structure are either private to the architecture
* e.g. mbm_width, or accessed via helpers that provide abstraction. e.g.
@@ -115,6 +125,7 @@ struct rdt_hw_resource {
unsigned int mon_scale;
unsigned int mbm_width;
bool cdp_enabled;
+ bool mbm_cntr_assign_enabled;
};
static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r)
@@ -159,6 +170,42 @@ union cpuid_0x10_x_edx {
unsigned int full;
};
+/*
+ * ABMC counters are configured by writing to MSR_IA32_L3_QOS_ABMC_CFG.
+ *
+ * @bw_type : Event configuration that represents the memory
+ * transactions being tracked by the @cntr_id.
+ * @bw_src : Bandwidth source (RMID or CLOSID).
+ * @reserved1 : Reserved.
+ * @is_clos : @bw_src field is a CLOSID (not an RMID).
+ * @cntr_id : Counter identifier.
+ * @reserved : Reserved.
+ * @cntr_en : Counting enable bit.
+ * @cfg_en : Configuration enable bit.
+ *
+ * Configuration and counting:
+ * A counter can be configured across multiple writes to the MSR. The
+ * configuration is applied only when @cfg_en = 1. Counter @cntr_id is
+ * reset when the configuration is applied.
+ * @cfg_en = 1, @cntr_en = 0 : Apply @cntr_id configuration but do not
+ * count events.
+ * @cfg_en = 1, @cntr_en = 1 : Apply @cntr_id configuration and start
+ * counting events.
+ */
+union l3_qos_abmc_cfg {
+ struct {
+ unsigned long bw_type :32,
+ bw_src :12,
+ reserved1: 3,
+ is_clos : 1,
+ cntr_id : 5,
+ reserved : 9,
+ cntr_en : 1,
+ cfg_en : 1;
+ } split;
+ unsigned long full;
+};
+
void rdt_ctrl_update(void *arg);
int rdt_get_mon_l3_config(struct rdt_resource *r);
@@ -168,5 +215,6 @@ bool rdt_cpu_has(int flag);
void __init intel_rdt_mbm_apply_quirk(void);
void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
+void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r);
#endif /* _ASM_X86_RESCTRL_INTERNAL_H */
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index c261558276cd..c8945610d455 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -31,11 +31,6 @@
*/
bool rdt_mon_capable;
-/*
- * Global to indicate which monitoring events are enabled.
- */
-unsigned int rdt_mon_features;
-
#define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5))
static int snc_nodes_per_l3_cache = 1;
@@ -135,7 +130,7 @@ static int logical_rmid_to_physical_rmid(int cpu, int lrmid)
if (snc_nodes_per_l3_cache == 1)
return lrmid;
- return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid;
+ return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->mon.num_rmid;
}
static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val)
@@ -166,18 +161,14 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_do
u32 rmid,
enum resctrl_event_id eventid)
{
- switch (eventid) {
- case QOS_L3_OCCUP_EVENT_ID:
- return NULL;
- case QOS_L3_MBM_TOTAL_EVENT_ID:
- return &hw_dom->arch_mbm_total[rmid];
- case QOS_L3_MBM_LOCAL_EVENT_ID:
- return &hw_dom->arch_mbm_local[rmid];
- default:
- /* Never expect to get here */
- WARN_ON_ONCE(1);
+ struct arch_mbm_state *state;
+
+ if (!resctrl_is_mbm_event(eventid))
return NULL;
- }
+
+ state = hw_dom->arch_mbm_states[MBM_STATE_IDX(eventid)];
+
+ return state ? &state[rmid] : NULL;
}
void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
@@ -206,14 +197,16 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
{
struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
-
- if (resctrl_arch_is_mbm_total_enabled())
- memset(hw_dom->arch_mbm_total, 0,
- sizeof(*hw_dom->arch_mbm_total) * r->num_rmid);
-
- if (resctrl_arch_is_mbm_local_enabled())
- memset(hw_dom->arch_mbm_local, 0,
- sizeof(*hw_dom->arch_mbm_local) * r->num_rmid);
+ enum resctrl_event_id eventid;
+ int idx;
+
+ for_each_mbm_event_id(eventid) {
+ if (!resctrl_is_mon_event_enabled(eventid))
+ continue;
+ idx = MBM_STATE_IDX(eventid);
+ memset(hw_dom->arch_mbm_states[idx], 0,
+ sizeof(*hw_dom->arch_mbm_states[0]) * r->mon.num_rmid);
+ }
}
static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
@@ -224,15 +217,33 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
return chunks >> shift;
}
+static u64 get_corrected_val(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 rmid, enum resctrl_event_id eventid, u64 msr_val)
+{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ struct arch_mbm_state *am;
+ u64 chunks;
+
+ am = get_arch_mbm_state(hw_dom, rmid, eventid);
+ if (am) {
+ am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
+ hw_res->mbm_width);
+ chunks = get_corrected_mbm_count(rmid, am->chunks);
+ am->prev_msr = msr_val;
+ } else {
+ chunks = msr_val;
+ }
+
+ return chunks * hw_res->mon_scale;
+}
+
int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
u32 unused, u32 rmid, enum resctrl_event_id eventid,
u64 *val, void *ignored)
{
- struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
int cpu = cpumask_any(&d->hdr.cpu_mask);
- struct arch_mbm_state *am;
- u64 msr_val, chunks;
+ u64 msr_val;
u32 prmid;
int ret;
@@ -243,17 +254,76 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
if (ret)
return ret;
+ *val = get_corrected_val(r, d, rmid, eventid, msr_val);
+
+ return 0;
+}
+
+static int __cntr_id_read(u32 cntr_id, u64 *val)
+{
+ u64 msr_val;
+
+ /*
+ * QM_EVTSEL Register definition:
+ * =======================================================
+ * Bits Mnemonic Description
+ * =======================================================
+ * 63:44 -- Reserved
+ * 43:32 RMID RMID or counter ID in ABMC mode
+ * when reading an MBM event
+ * 31 ExtendedEvtID Extended Event Identifier
+ * 30:8 -- Reserved
+ * 7:0 EvtID Event Identifier
+ * =======================================================
+ * The contents of a specific counter can be read by setting
+ * QM_EVTSEL.ExtendedEvtID (=1) and QM_EVTSEL.EvtID = L3CacheABMC (=1),
+ * and by setting QM_EVTSEL.RMID to the desired counter ID. Reading
+ * QM_CTR then returns the
+ * contents of the specified counter. The RMID_VAL_ERROR bit is set
+ * if the counter configuration is invalid, or if an invalid counter
+ * ID is set in the QM_EVTSEL.RMID field. The RMID_VAL_UNAVAIL bit
+ * is set if the counter data is unavailable.
+ */
+ wrmsr(MSR_IA32_QM_EVTSEL, ABMC_EXTENDED_EVT_ID | ABMC_EVT_ID, cntr_id);
+ rdmsrl(MSR_IA32_QM_CTR, msr_val);
+
+ if (msr_val & RMID_VAL_ERROR)
+ return -EIO;
+ if (msr_val & RMID_VAL_UNAVAIL)
+ return -EINVAL;
+
+ *val = msr_val;
+ return 0;
+}
+
+void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 unused, u32 rmid, int cntr_id,
+ enum resctrl_event_id eventid)
+{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ struct arch_mbm_state *am;
+
am = get_arch_mbm_state(hw_dom, rmid, eventid);
if (am) {
- am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
- hw_res->mbm_width);
- chunks = get_corrected_mbm_count(rmid, am->chunks);
- am->prev_msr = msr_val;
- } else {
- chunks = msr_val;
+ memset(am, 0, sizeof(*am));
+
+ /* Record any initial, non-zero count value. */
+ __cntr_id_read(cntr_id, &am->prev_msr);
}
+}
+
+int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 unused, u32 rmid, int cntr_id,
+ enum resctrl_event_id eventid, u64 *val)
+{
+ u64 msr_val;
+ int ret;
+
+ ret = __cntr_id_read(cntr_id, &msr_val);
+ if (ret)
+ return ret;
- *val = chunks * hw_res->mon_scale;
+ *val = get_corrected_val(r, d, rmid, eventid, msr_val);
return 0;
}
@@ -346,12 +416,13 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)
unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
unsigned int threshold;
+ u32 eax, ebx, ecx, edx;
snc_nodes_per_l3_cache = snc_get_config();
resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache;
- r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache;
+ r->mon.num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache;
hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;
if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
@@ -366,7 +437,7 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)
*
* For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
*/
- threshold = resctrl_rmid_realloc_limit / r->num_rmid;
+ threshold = resctrl_rmid_realloc_limit / r->mon.num_rmid;
/*
* Because num_rmid may not be a power of two, round the value
@@ -375,12 +446,17 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)
*/
resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);
- if (rdt_cpu_has(X86_FEATURE_BMEC)) {
- u32 eax, ebx, ecx, edx;
-
+ if (rdt_cpu_has(X86_FEATURE_BMEC) || rdt_cpu_has(X86_FEATURE_ABMC)) {
/* Detect list of bandwidth sources that can be tracked */
cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
- r->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
+ r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
+ }
+
+ if (rdt_cpu_has(X86_FEATURE_ABMC)) {
+ r->mon.mbm_cntr_assignable = true;
+ cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx);
+ r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1;
+ hw_res->mbm_cntr_assign_enabled = true;
}
r->mon_capable = true;
@@ -401,3 +477,91 @@ void __init intel_rdt_mbm_apply_quirk(void)
mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold;
mbm_cf = mbm_cf_table[cf_index].cf;
}
+
+static void resctrl_abmc_set_one_amd(void *arg)
+{
+ bool *enable = arg;
+
+ if (*enable)
+ msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT);
+ else
+ msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT);
+}
+
+/*
+ * ABMC enable/disable requires an update of the L3_QOS_EXT_CFG MSR on all the CPUs
+ * associated with all monitor domains.
+ */
+static void _resctrl_abmc_enable(struct rdt_resource *r, bool enable)
+{
+ struct rdt_mon_domain *d;
+
+ lockdep_assert_cpus_held();
+
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
+ on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_abmc_set_one_amd,
+ &enable, 1);
+ resctrl_arch_reset_rmid_all(r, d);
+ }
+}
+
+int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+
+ if (r->mon.mbm_cntr_assignable &&
+ hw_res->mbm_cntr_assign_enabled != enable) {
+ _resctrl_abmc_enable(r, enable);
+ hw_res->mbm_cntr_assign_enabled = enable;
+ }
+
+ return 0;
+}
+
+bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r)
+{
+ return resctrl_to_arch_res(r)->mbm_cntr_assign_enabled;
+}
+
+static void resctrl_abmc_config_one_amd(void *info)
+{
+ union l3_qos_abmc_cfg *abmc_cfg = info;
+
+ wrmsrl(MSR_IA32_L3_QOS_ABMC_CFG, abmc_cfg->full);
+}
+
+/*
+ * Send an IPI to the domain to assign the counter to the (RMID, event) pair.
+ */
+void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
+ enum resctrl_event_id evtid, u32 rmid, u32 closid,
+ u32 cntr_id, bool assign)
+{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ union l3_qos_abmc_cfg abmc_cfg = { 0 };
+ struct arch_mbm_state *am;
+
+ abmc_cfg.split.cfg_en = 1;
+ abmc_cfg.split.cntr_en = assign ? 1 : 0;
+ abmc_cfg.split.cntr_id = cntr_id;
+ abmc_cfg.split.bw_src = rmid;
+ if (assign)
+ abmc_cfg.split.bw_type = resctrl_get_mon_evt_cfg(evtid);
+
+ smp_call_function_any(&d->hdr.cpu_mask, resctrl_abmc_config_one_amd, &abmc_cfg, 1);
+
+ /*
+ * The hardware counter is reset (because cfg_en == 1) so there is no
+ * need to record initial non-zero counts.
+ */
+ am = get_arch_mbm_state(hw_dom, rmid, evtid);
+ if (am)
+ memset(am, 0, sizeof(*am));
+}
+
+void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+
+ resctrl_abmc_set_one_amd(&hw_res->mbm_cntr_assign_enabled);
+}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index b4a1f6732a3a..4cee6213d667 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -48,8 +48,10 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
{ X86_FEATURE_AMD_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 },
{ X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
+ { X86_FEATURE_COHERENCY_SFW_NO, CPUID_EBX, 31, 0x8000001f, 0 },
{ X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 },
{ X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 },
+ { X86_FEATURE_ABMC, CPUID_EBX, 5, 0x80000020, 0 },
{ X86_FEATURE_TSA_SQ_NO, CPUID_ECX, 1, 0x80000021, 0 },
{ X86_FEATURE_TSA_L1_NO, CPUID_ECX, 2, 0x80000021, 0 },
{ X86_FEATURE_AMD_WORKLOAD_CLASS, CPUID_EAX, 22, 0x80000021, 0 },
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index e35ccdc84910..6073a16628f9 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -372,6 +372,19 @@ unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_uni
return topo_unit_count(lvlid, at_level, apic_maps[which_units].map);
}
+#ifdef CONFIG_SMP
+int topology_get_primary_thread(unsigned int cpu)
+{
+ u32 apic_id = cpuid_to_apicid[cpu];
+
+ /*
+ * Get the core domain level APIC ID, which identifies the primary
+ * thread, and return the CPU number assigned to it.
+ */
+ return topo_lookup_cpuid(topo_apicid(apic_id, TOPO_CORE_DOMAIN));
+}
+#endif
+
#ifdef CONFIG_ACPI_HOTPLUG_CPU
/**
* topology_hotplug_apic - Handle a physical hotplugged APIC after boot
diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c
index 843b1655ab45..6ac097e13106 100644
--- a/arch/x86/kernel/cpu/topology_amd.c
+++ b/arch/x86/kernel/cpu/topology_amd.c
@@ -59,7 +59,7 @@ static void store_node(struct topo_scan *tscan, u16 nr_nodes, u16 node_id)
tscan->amd_node_id = node_id;
}
-static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext)
+static bool parse_8000_001e(struct topo_scan *tscan)
{
struct {
// eax
@@ -81,20 +81,25 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext)
cpuid_leaf(0x8000001e, &leaf);
- tscan->c->topo.initial_apicid = leaf.ext_apic_id;
-
/*
- * If leaf 0xb is available, then the domain shifts are set
- * already and nothing to do here. Only valid for family >= 0x17.
+ * If leaf 0xb/0x26 is available, then the APIC ID and the domain
+ * shifts are set already.
*/
- if (!has_topoext && tscan->c->x86 >= 0x17) {
+ if (!cpu_feature_enabled(X86_FEATURE_XTOPOLOGY)) {
+ tscan->c->topo.initial_apicid = leaf.ext_apic_id;
+
/*
- * Leaf 0x80000008 set the CORE domain shift already.
- * Update the SMT domain, but do not propagate it.
+ * Leaf 0x80000008 sets the CORE domain shift but not the
+ * SMT domain shift. On CPUs with family >= 0x17, there
+ * might be hyperthreads.
*/
- unsigned int nthreads = leaf.core_nthreads + 1;
+ if (tscan->c->x86 >= 0x17) {
+ /* Update the SMT domain, but do not propagate it. */
+ unsigned int nthreads = leaf.core_nthreads + 1;
- topology_update_dom(tscan, TOPO_SMT_DOMAIN, get_count_order(nthreads), nthreads);
+ topology_update_dom(tscan, TOPO_SMT_DOMAIN,
+ get_count_order(nthreads), nthreads);
+ }
}
store_node(tscan, leaf.nnodes_per_socket + 1, leaf.node_id);
@@ -158,11 +163,12 @@ static void topoext_fixup(struct topo_scan *tscan)
c->x86 != 0x15 || c->x86_model < 0x10 || c->x86_model > 0x6f)
return;
- if (msr_set_bit(0xc0011005, 54) <= 0)
+ if (msr_set_bit(MSR_AMD64_CPUID_EXT_FEAT,
+ MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT) <= 0)
return;
- rdmsrq(0xc0011005, msrval);
- if (msrval & BIT_64(54)) {
+ rdmsrq(MSR_AMD64_CPUID_EXT_FEAT, msrval);
+ if (msrval & MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT) {
set_cpu_cap(c, X86_FEATURE_TOPOEXT);
pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
}
@@ -170,27 +176,27 @@ static void topoext_fixup(struct topo_scan *tscan)
static void parse_topology_amd(struct topo_scan *tscan)
{
- bool has_topoext = false;
+ if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES))
+ tscan->c->topo.cpu_type = cpuid_ebx(0x80000026);
/*
- * If the extended topology leaf 0x8000_001e is available
- * try to get SMT, CORE, TILE, and DIE shifts from extended
+ * Try to get SMT, CORE, TILE, and DIE shifts from extended
* CPUID leaf 0x8000_0026 on supported processors first. If
* extended CPUID leaf 0x8000_0026 is not supported, try to
- * get SMT and CORE shift from leaf 0xb first, then try to
- * get the CORE shift from leaf 0x8000_0008.
+ * get SMT and CORE shift from leaf 0xb. If either leaf is
+ * available, cpu_parse_topology_ext() will return true.
+ *
+ * If XTOPOLOGY leaves (0x26/0xb) are not available, try to
+ * get the CORE shift from leaf 0x8000_0008 first.
*/
- if (cpu_feature_enabled(X86_FEATURE_TOPOEXT))
- has_topoext = cpu_parse_topology_ext(tscan);
-
- if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES))
- tscan->c->topo.cpu_type = cpuid_ebx(0x80000026);
-
- if (!has_topoext && !parse_8000_0008(tscan))
+ if (!cpu_parse_topology_ext(tscan) && !parse_8000_0008(tscan))
return;
- /* Prefer leaf 0x8000001e if available */
- if (parse_8000_001e(tscan, has_topoext))
+ /*
+ * Prefer leaf 0x8000001e if available to get the SMT shift and
+ * the initial APIC ID if XTOPOLOGY leaves are not available.
+ */
+ if (parse_8000_001e(tscan))
return;
/* Try the NODEID MSR */
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index aefd412a23dc..1f71cc135e9a 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -631,7 +631,7 @@ static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp)
}
/* Clone current's FPU state on fork */
-int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal,
+int fpu_clone(struct task_struct *dst, u64 clone_flags, bool minimal,
unsigned long ssp)
{
/*
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 12ed75c1b567..28e4fd65c9da 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1881,19 +1881,20 @@ long fpu_xstate_prctl(int option, unsigned long arg2)
#ifdef CONFIG_PROC_PID_ARCH_STATUS
/*
* Report the amount of time elapsed in millisecond since last AVX512
- * use in the task.
+ * use in the task. Report -1 if no AVX-512 usage.
*/
static void avx512_status(struct seq_file *m, struct task_struct *task)
{
- unsigned long timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp);
- long delta;
+ unsigned long timestamp;
+ long delta = -1;
- if (!timestamp) {
- /*
- * Report -1 if no AVX512 usage
- */
- delta = -1;
- } else {
+ /* AVX-512 usage is not tracked for kernel threads. Don't report anything. */
+ if (task->flags & (PF_KTHREAD | PF_USER_WORKER))
+ return;
+
+ timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp);
+
+ if (timestamp) {
delta = (long)(jiffies - timestamp);
/*
* Cap to LONG_MAX if time difference > LONG_MAX
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 533fcf5636fc..fd28b53dbac5 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -52,10 +52,13 @@ SYM_PIC_ALIAS(next_early_pgt);
pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
unsigned int __pgtable_l5_enabled __ro_after_init;
+SYM_PIC_ALIAS(__pgtable_l5_enabled);
unsigned int pgdir_shift __ro_after_init = 39;
EXPORT_SYMBOL(pgdir_shift);
+SYM_PIC_ALIAS(pgdir_shift);
unsigned int ptrs_per_p4d __ro_after_init = 1;
EXPORT_SYMBOL(ptrs_per_p4d);
+SYM_PIC_ALIAS(ptrs_per_p4d);
unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4;
EXPORT_SYMBOL(page_offset_base);
@@ -316,5 +319,5 @@ void early_setup_idt(void)
handler = vc_boot_ghcb;
}
- startup_64_load_idt(handler);
+ __pi_startup_64_load_idt(handler);
}
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 76743dfad6ab..80ef5d386b03 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -61,7 +61,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE)
* any particular GDT layout, because we load our own as soon as we
* can.
*/
-__HEAD
+ __INIT
SYM_CODE_START(startup_32)
movl pa(initial_stack),%ecx
@@ -136,6 +136,9 @@ SYM_CODE_END(startup_32)
* If cpu hotplug is not supported then this code can go in init section
* which will be freed later
*/
+#ifdef CONFIG_HOTPLUG_CPU
+ .text
+#endif
SYM_FUNC_START(startup_32_smp)
cld
movl $(__BOOT_DS),%eax
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 3e9b3a3bd039..21816b48537c 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -33,7 +33,7 @@
* because we need identity-mapped pages.
*/
- __HEAD
+ __INIT
.code64
SYM_CODE_START_NOALIGN(startup_64)
UNWIND_HINT_END_OF_STACK
@@ -71,7 +71,7 @@ SYM_CODE_START_NOALIGN(startup_64)
xorl %edx, %edx
wrmsr
- call startup_64_setup_gdt_idt
+ call __pi_startup_64_setup_gdt_idt
/* Now switch to __KERNEL_CS so IRET works reliably */
pushq $__KERNEL_CS
@@ -91,7 +91,7 @@ SYM_CODE_START_NOALIGN(startup_64)
* subsequent code. Pass the boot_params pointer as the first argument.
*/
movq %r15, %rdi
- call sme_enable
+ call __pi_sme_enable
#endif
/* Sanitize CPU configuration */
@@ -111,7 +111,7 @@ SYM_CODE_START_NOALIGN(startup_64)
* programmed into CR3.
*/
movq %r15, %rsi
- call __startup_64
+ call __pi___startup_64
/* Form the CR3 value being sure to include the CR3 modifier */
leaq early_top_pgt(%rip), %rcx
@@ -562,7 +562,7 @@ SYM_CODE_START_NOALIGN(vc_no_ghcb)
/* Call C handler */
movq %rsp, %rdi
movq ORIG_RAX(%rsp), %rsi
- call do_vc_no_ghcb
+ call __pi_do_vc_no_ghcb
/* Unwind pt_regs */
POP_REGS
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 6079d15dab8c..3863d7709386 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -339,7 +339,7 @@ static bool can_probe(unsigned long paddr)
if (is_exception_insn(&insn))
return false;
- if (IS_ENABLED(CONFIG_CFI_CLANG)) {
+ if (IS_ENABLED(CONFIG_CFI)) {
/*
* The compiler generates the following instruction sequence
* for indirect call checks and cfi.c decodes this;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 1b7960cf6eb0..e3a3987b0c4f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -159,7 +159,7 @@ __visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
- unsigned long clone_flags = args->flags;
+ u64 clone_flags = args->flags;
unsigned long sp = args->stack;
unsigned long tls = args->tls;
struct inactive_task_frame *frame;
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index 2ddf23387c7e..978232b6d48d 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -191,7 +191,7 @@ void reset_thread_features(void)
current->thread.features_locked = 0;
}
-unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags,
+unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, u64 clone_flags,
unsigned long stack_size)
{
struct thread_shstk *shstk = &tsk->thread.shstk;
@@ -246,6 +246,46 @@ static unsigned long get_user_shstk_addr(void)
return ssp;
}
+int shstk_pop(u64 *val)
+{
+ int ret = 0;
+ u64 ssp;
+
+ if (!features_enabled(ARCH_SHSTK_SHSTK))
+ return -ENOTSUPP;
+
+ fpregs_lock_and_load();
+
+ rdmsrq(MSR_IA32_PL3_SSP, ssp);
+ if (val && get_user(*val, (u64 __user *)ssp))
+ ret = -EFAULT;
+ else
+ wrmsrq(MSR_IA32_PL3_SSP, ssp + SS_FRAME_SIZE);
+ fpregs_unlock();
+
+ return ret;
+}
+
+int shstk_push(u64 val)
+{
+ u64 ssp;
+ int ret;
+
+ if (!features_enabled(ARCH_SHSTK_SHSTK))
+ return -ENOTSUPP;
+
+ fpregs_lock_and_load();
+
+ rdmsrq(MSR_IA32_PL3_SSP, ssp);
+ ssp -= SS_FRAME_SIZE;
+ ret = write_user_shstk_64((void __user *)ssp, val);
+ if (!ret)
+ wrmsrq(MSR_IA32_PL3_SSP, ssp);
+ fpregs_unlock();
+
+ return ret;
+}
+
#define SHSTK_DATA_BIT BIT(63)
static int put_shstk_data(u64 __user *addr, u64 data)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 33e166f6ab12..eb289abece23 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -479,14 +479,14 @@ static int x86_cluster_flags(void)
static bool x86_has_numa_in_package;
static struct sched_domain_topology_level x86_topology[] = {
- SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT),
+ SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT),
#ifdef CONFIG_SCHED_CLUSTER
- SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS),
+ SDTL_INIT(tl_cls_mask, x86_cluster_flags, CLS),
#endif
#ifdef CONFIG_SCHED_MC
- SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC),
+ SDTL_INIT(tl_mc_mask, x86_core_flags, MC),
#endif
- SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG),
+ SDTL_INIT(tl_pkg_mask, x86_sched_itmt_flags, PKG),
{ NULL },
};
diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c
index 5a4b21389b1d..d432f3824f0c 100644
--- a/arch/x86/kernel/umip.c
+++ b/arch/x86/kernel/umip.c
@@ -156,15 +156,26 @@ static int identify_insn(struct insn *insn)
if (!insn->modrm.nbytes)
return -EINVAL;
- /* All the instructions of interest start with 0x0f. */
- if (insn->opcode.bytes[0] != 0xf)
+ /* The instructions of interest have 2-byte opcodes: 0F 00 or 0F 01. */
+ if (insn->opcode.nbytes < 2 || insn->opcode.bytes[0] != 0xf)
return -EINVAL;
if (insn->opcode.bytes[1] == 0x1) {
switch (X86_MODRM_REG(insn->modrm.value)) {
case 0:
+ /* The reg form of 0F 01 /0 encodes VMX instructions. */
+ if (X86_MODRM_MOD(insn->modrm.value) == 3)
+ return -EINVAL;
+
return UMIP_INST_SGDT;
case 1:
+ /*
+ * The reg form of 0F 01 /1 encodes MONITOR/MWAIT,
+ * STAC/CLAC, and ENCLS.
+ */
+ if (X86_MODRM_MOD(insn->modrm.value) == 3)
+ return -EINVAL;
+
return UMIP_INST_SIDT;
case 4:
return UMIP_INST_SMSW;
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 6d383839e839..845aeaf36b8d 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -18,6 +18,7 @@
#include <asm/processor.h>
#include <asm/insn.h>
#include <asm/mmu_context.h>
+#include <asm/nops.h>
/* Post-execution fixups. */
@@ -310,25 +311,32 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool
#ifdef CONFIG_X86_64
+struct uretprobe_syscall_args {
+ unsigned long r11;
+ unsigned long cx;
+ unsigned long ax;
+};
+
asm (
".pushsection .rodata\n"
".global uretprobe_trampoline_entry\n"
"uretprobe_trampoline_entry:\n"
- "pushq %rax\n"
- "pushq %rcx\n"
- "pushq %r11\n"
- "movq $" __stringify(__NR_uretprobe) ", %rax\n"
+ "push %rax\n"
+ "push %rcx\n"
+ "push %r11\n"
+ "mov $" __stringify(__NR_uretprobe) ", %rax\n"
"syscall\n"
".global uretprobe_syscall_check\n"
"uretprobe_syscall_check:\n"
- "popq %r11\n"
- "popq %rcx\n"
-
- /* The uretprobe syscall replaces stored %rax value with final
+ "pop %r11\n"
+ "pop %rcx\n"
+ /*
+ * The uretprobe syscall replaces the stored %rax value with the final
* return address, so we don't restore %rax in here and just
* call ret.
*/
- "retq\n"
+ "ret\n"
+ "int3\n"
".global uretprobe_trampoline_end\n"
"uretprobe_trampoline_end:\n"
".popsection\n"
@@ -338,7 +346,7 @@ extern u8 uretprobe_trampoline_entry[];
extern u8 uretprobe_trampoline_end[];
extern u8 uretprobe_syscall_check[];
-void *arch_uprobe_trampoline(unsigned long *psize)
+void *arch_uretprobe_trampoline(unsigned long *psize)
{
static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
struct pt_regs *regs = task_pt_regs(current);
@@ -365,7 +373,8 @@ static unsigned long trampoline_check_ip(unsigned long tramp)
SYSCALL_DEFINE0(uretprobe)
{
struct pt_regs *regs = task_pt_regs(current);
- unsigned long err, ip, sp, r11_cx_ax[3], tramp;
+ struct uretprobe_syscall_args args;
+ unsigned long err, ip, sp, tramp;
/* If there's no trampoline, we are called from wrong place. */
tramp = uprobe_get_trampoline_vaddr();
@@ -376,15 +385,15 @@ SYSCALL_DEFINE0(uretprobe)
if (unlikely(regs->ip != trampoline_check_ip(tramp)))
goto sigill;
- err = copy_from_user(r11_cx_ax, (void __user *)regs->sp, sizeof(r11_cx_ax));
+ err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
if (err)
goto sigill;
/* expose the "right" values of r11/cx/ax/sp to uprobe_consumer/s */
- regs->r11 = r11_cx_ax[0];
- regs->cx = r11_cx_ax[1];
- regs->ax = r11_cx_ax[2];
- regs->sp += sizeof(r11_cx_ax);
+ regs->r11 = args.r11;
+ regs->cx = args.cx;
+ regs->ax = args.ax;
+ regs->sp += sizeof(args);
regs->orig_ax = -1;
ip = regs->ip;
@@ -400,21 +409,21 @@ SYSCALL_DEFINE0(uretprobe)
*/
if (regs->sp != sp || shstk_is_enabled())
return regs->ax;
- regs->sp -= sizeof(r11_cx_ax);
+ regs->sp -= sizeof(args);
/* for the case uprobe_consumer has changed r11/cx */
- r11_cx_ax[0] = regs->r11;
- r11_cx_ax[1] = regs->cx;
+ args.r11 = regs->r11;
+ args.cx = regs->cx;
/*
* ax register is passed through as return value, so we can use
* its space on stack for ip value and jump to it through the
* trampoline's ret instruction
*/
- r11_cx_ax[2] = regs->ip;
+ args.ax = regs->ip;
regs->ip = ip;
- err = copy_to_user((void __user *)regs->sp, r11_cx_ax, sizeof(r11_cx_ax));
+ err = copy_to_user((void __user *)regs->sp, &args, sizeof(args));
if (err)
goto sigill;
@@ -608,6 +617,581 @@ static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
*sr = utask->autask.saved_scratch_register;
}
}
+
+static int tramp_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
+{
+ return -EPERM;
+}
+
+static struct page *tramp_mapping_pages[2] __ro_after_init;
+
+static struct vm_special_mapping tramp_mapping = {
+ .name = "[uprobes-trampoline]",
+ .mremap = tramp_mremap,
+ .pages = tramp_mapping_pages,
+};
+
+struct uprobe_trampoline {
+ struct hlist_node node;
+ unsigned long vaddr;
+};
+
+static bool is_reachable_by_call(unsigned long vtramp, unsigned long vaddr)
+{
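+ /*
+ * A near call is 5 bytes long and its rel32 displacement is relative
+ * to the end of the call instruction, hence vaddr + 5.
+ */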
+ long delta = (long)(vaddr + 5 - vtramp);
+
+ return delta >= INT_MIN && delta <= INT_MAX;
+}
+
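+/*
+ * Find an unmapped page for the trampoline as close as possible to the
+ * probed address while staying within rel32 range of a call placed at
+ * @vaddr, searching both above and below the caller.
+ */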
+static unsigned long find_nearest_trampoline(unsigned long vaddr)
+{
+ struct vm_unmapped_area_info info = {
+ .length = PAGE_SIZE,
+ .align_mask = ~PAGE_MASK,
+ };
+ unsigned long low_limit, high_limit;
+ unsigned long low_tramp, high_tramp;
+ unsigned long call_end = vaddr + 5;
+
+ if (check_add_overflow(call_end, INT_MIN, &low_limit))
+ low_limit = PAGE_SIZE;
+
+ high_limit = call_end + INT_MAX;
+
+ /* Search up from the caller address. */
+ info.low_limit = call_end;
+ info.high_limit = min(high_limit, TASK_SIZE);
+ high_tramp = vm_unmapped_area(&info);
+
+ /* Search down from the caller address. */
+ info.low_limit = max(low_limit, PAGE_SIZE);
+ info.high_limit = call_end;
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ low_tramp = vm_unmapped_area(&info);
+
+ if (IS_ERR_VALUE(high_tramp) && IS_ERR_VALUE(low_tramp))
+ return -ENOMEM;
+ if (IS_ERR_VALUE(high_tramp))
+ return low_tramp;
+ if (IS_ERR_VALUE(low_tramp))
+ return high_tramp;
+
+ /* Return the address that's closest to the caller address. */
+ if (call_end - low_tramp < high_tramp - call_end)
+ return low_tramp;
+ return high_tramp;
+}
+
+static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ struct mm_struct *mm = current->mm;
+ struct uprobe_trampoline *tramp;
+ struct vm_area_struct *vma;
+
+ if (!user_64bit_mode(regs))
+ return NULL;
+
+ vaddr = find_nearest_trampoline(vaddr);
+ if (IS_ERR_VALUE(vaddr))
+ return NULL;
+
+ tramp = kzalloc(sizeof(*tramp), GFP_KERNEL);
+ if (unlikely(!tramp))
+ return NULL;
+
+ tramp->vaddr = vaddr;
+ vma = _install_special_mapping(mm, tramp->vaddr, PAGE_SIZE,
+ VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO,
+ &tramp_mapping);
+ if (IS_ERR(vma)) {
+ kfree(tramp);
+ return NULL;
+ }
+ return tramp;
+}
+
+static struct uprobe_trampoline *get_uprobe_trampoline(unsigned long vaddr, bool *new)
+{
+ struct uprobes_state *state = &current->mm->uprobes_state;
+ struct uprobe_trampoline *tramp = NULL;
+
+ if (vaddr > TASK_SIZE || vaddr < PAGE_SIZE)
+ return NULL;
+
+ hlist_for_each_entry(tramp, &state->head_tramps, node) {
+ if (is_reachable_by_call(tramp->vaddr, vaddr)) {
+ *new = false;
+ return tramp;
+ }
+ }
+
+ tramp = create_uprobe_trampoline(vaddr);
+ if (!tramp)
+ return NULL;
+
+ *new = true;
+ hlist_add_head(&tramp->node, &state->head_tramps);
+ return tramp;
+}
+
+static void destroy_uprobe_trampoline(struct uprobe_trampoline *tramp)
+{
+ /*
+ * We do not unmap and release the uprobe trampoline page itself,
+ * because there's no easy way to make sure none of the threads
+ * is still inside the trampoline.
+ */
+ hlist_del(&tramp->node);
+ kfree(tramp);
+}
+
+void arch_uprobe_init_state(struct mm_struct *mm)
+{
+ INIT_HLIST_HEAD(&mm->uprobes_state.head_tramps);
+}
+
+void arch_uprobe_clear_state(struct mm_struct *mm)
+{
+ struct uprobes_state *state = &mm->uprobes_state;
+ struct uprobe_trampoline *tramp;
+ struct hlist_node *n;
+
+ hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node)
+ destroy_uprobe_trampoline(tramp);
+}
+
+static bool __in_uprobe_trampoline(unsigned long ip)
+{
+ struct vm_area_struct *vma = vma_lookup(current->mm, ip);
+
+ return vma && vma_is_special_mapping(vma, &tramp_mapping);
+}
+
+static bool in_uprobe_trampoline(unsigned long ip)
+{
+ struct mm_struct *mm = current->mm;
+ bool found, retry = true;
+ unsigned int seq;
+
+ rcu_read_lock();
+ if (mmap_lock_speculate_try_begin(mm, &seq)) {
+ found = __in_uprobe_trampoline(ip);
+ retry = mmap_lock_speculate_retry(mm, seq);
+ }
+ rcu_read_unlock();
+
+ if (retry) {
+ mmap_read_lock(mm);
+ found = __in_uprobe_trampoline(ip);
+ mmap_read_unlock(mm);
+ }
+ return found;
+}
+
+/*
+ * See uprobe syscall trampoline; the call to the trampoline will push
+ * the return address on the stack, the trampoline itself then pushes
+ * cx, r11 and ax.
+ */
+struct uprobe_syscall_args {
+ unsigned long ax;
+ unsigned long r11;
+ unsigned long cx;
+ unsigned long retaddr;
+};
+
+SYSCALL_DEFINE0(uprobe)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ struct uprobe_syscall_args args;
+ unsigned long ip, sp, sret;
+ int err;
+
+ /* Allow execution only from uprobe trampolines. */
+ if (!in_uprobe_trampoline(regs->ip))
+ return -ENXIO;
+
+ err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
+ if (err)
+ goto sigill;
+
+ ip = regs->ip;
+
+ /*
+ * expose the "right" values of ax/r11/cx/ip/sp to uprobe_consumer/s, plus:
+ * - adjust ip to the probe address (the call saved the next instruction address)
+ * - adjust sp to the probe's stack frame (check trampoline code)
+ */
+ regs->ax = args.ax;
+ regs->r11 = args.r11;
+ regs->cx = args.cx;
+ regs->ip = args.retaddr - 5;
+ regs->sp += sizeof(args);
+ regs->orig_ax = -1;
+
+ sp = regs->sp;
+
+ err = shstk_pop((u64 *)&sret);
+ if (err == -EFAULT || (!err && sret != args.retaddr))
+ goto sigill;
+
+ handle_syscall_uprobe(regs, regs->ip);
+
+ /*
+ * Some of the uprobe consumers have changed sp; we can do nothing but
+ * return via iret.
+ */
+ if (regs->sp != sp) {
+ /* skip the trampoline call */
+ if (args.retaddr - 5 == regs->ip)
+ regs->ip += 5;
+ return regs->ax;
+ }
+
+ regs->sp -= sizeof(args);
+
+ /* for the case uprobe_consumer has changed ax/r11/cx */
+ args.ax = regs->ax;
+ args.r11 = regs->r11;
+ args.cx = regs->cx;
+
+ /* keep return address unless we are instructed otherwise */
+ if (args.retaddr - 5 != regs->ip)
+ args.retaddr = regs->ip;
+
+ if (shstk_push(args.retaddr) == -EFAULT)
+ goto sigill;
+
+ regs->ip = ip;
+
+ err = copy_to_user((void __user *)regs->sp, &args, sizeof(args));
+ if (err)
+ goto sigill;
+
+ /* ensure sysret, see do_syscall_64() */
+ regs->r11 = regs->flags;
+ regs->cx = regs->ip;
+ return 0;
+
+sigill:
+ force_sig(SIGILL);
+ return -1;
+}
+
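+/*
+ * Uprobe trampoline: save the registers clobbered by the syscall (cx,
+ * r11) plus ax, enter the uprobe syscall, restore them and return via
+ * the address on the stack, which the syscall may have updated.
+ */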
+asm (
+ ".pushsection .rodata\n"
+ ".balign " __stringify(PAGE_SIZE) "\n"
+ "uprobe_trampoline_entry:\n"
+ "push %rcx\n"
+ "push %r11\n"
+ "push %rax\n"
+ "mov $" __stringify(__NR_uprobe) ", %rax\n"
+ "syscall\n"
+ "pop %rax\n"
+ "pop %r11\n"
+ "pop %rcx\n"
+ "ret\n"
+ "int3\n"
+ ".balign " __stringify(PAGE_SIZE) "\n"
+ ".popsection\n"
+);
+
+extern u8 uprobe_trampoline_entry[];
+
+static int __init arch_uprobes_init(void)
+{
+ tramp_mapping_pages[0] = virt_to_page(uprobe_trampoline_entry);
+ return 0;
+}
+
+late_initcall(arch_uprobes_init);
+
+enum {
+ EXPECT_SWBP,
+ EXPECT_CALL,
+};
+
+struct write_opcode_ctx {
+ unsigned long base;
+ int expect;
+};
+
+static int is_call_insn(uprobe_opcode_t *insn)
+{
+ return *insn == CALL_INSN_OPCODE;
+}
+
+/*
+ * Verification callback used by int3_update uprobe_write calls to make sure
+ * the underlying instruction is as expected - either int3 or call.
+ */
+static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode,
+ int nbytes, void *data)
+{
+ struct write_opcode_ctx *ctx = data;
+ uprobe_opcode_t old_opcode[5];
+
+ uprobe_copy_from_page(page, ctx->base, (uprobe_opcode_t *) &old_opcode, 5);
+
+ switch (ctx->expect) {
+ case EXPECT_SWBP:
+ if (is_swbp_insn(&old_opcode[0]))
+ return 1;
+ break;
+ case EXPECT_CALL:
+ if (is_call_insn(&old_opcode[0]))
+ return 1;
+ break;
+ }
+
+ return -1;
+}
+
+/*
+ * Modify multi-byte instructions by using INT3 breakpoints on SMP.
+ * We completely avoid using stop_machine() here, and achieve the
+ * synchronization using INT3 breakpoints and SMP cross-calls.
+ * (borrowed comment from smp_text_poke_batch_finish)
+ *
+ * The way it is done:
+ * - Add an INT3 trap to the address that will be patched
+ * - SMP sync all CPUs
+ * - Update all but the first byte of the patched range
+ * - SMP sync all CPUs
+ * - Replace the first byte (INT3) by the first byte of the replacing opcode
+ * - SMP sync all CPUs
+ */
+static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr, char *insn, bool optimize)
+{
+ uprobe_opcode_t int3 = UPROBE_SWBP_INSN;
+ struct write_opcode_ctx ctx = {
+ .base = vaddr,
+ };
+ int err;
+
+ /*
+ * Write int3 trap.
+ *
+ * The swbp_optimize path comes with the breakpoint already installed,
+ * so we can skip this step for optimize == true.
+ */
+ if (!optimize) {
+ ctx.expect = EXPECT_CALL;
+ err = uprobe_write(auprobe, vma, vaddr, &int3, 1, verify_insn,
+ true /* is_register */, false /* do_update_ref_ctr */,
+ &ctx);
+ if (err)
+ return err;
+ }
+
+ smp_text_poke_sync_each_cpu();
+
+ /* Write all but the first byte of the patched range. */
+ ctx.expect = EXPECT_SWBP;
+ err = uprobe_write(auprobe, vma, vaddr + 1, insn + 1, 4, verify_insn,
+ true /* is_register */, false /* do_update_ref_ctr */,
+ &ctx);
+ if (err)
+ return err;
+
+ smp_text_poke_sync_each_cpu();
+
+ /*
+ * Write first byte.
+ *
+ * The swbp_unoptimize path needs to finish uprobe removal together
+ * with the ref_ctr update, using uprobe_write with the proper flags.
+ */
+ err = uprobe_write(auprobe, vma, vaddr, insn, 1, verify_insn,
+ optimize /* is_register */, !optimize /* do_update_ref_ctr */,
+ &ctx);
+ if (err)
+ return err;
+
+ smp_text_poke_sync_each_cpu();
+ return 0;
+}
+
+static int swbp_optimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr, unsigned long tramp)
+{
+ u8 call[5];
+
+ __text_gen_insn(call, CALL_INSN_OPCODE, (const void *) vaddr,
+ (const void *) tramp, CALL_INSN_SIZE);
+ return int3_update(auprobe, vma, vaddr, call, true /* optimize */);
+}
+
+static int swbp_unoptimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
+{
+ return int3_update(auprobe, vma, vaddr, auprobe->insn, false /* optimize */);
+}
+
+static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst, int len)
+{
+ unsigned int gup_flags = FOLL_FORCE|FOLL_SPLIT_PMD;
+ struct vm_area_struct *vma;
+ struct page *page;
+
+ page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+ uprobe_copy_from_page(page, vaddr, dst, len);
+ put_page(page);
+ return 0;
+}
+
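+/*
+ * Check whether the 5 bytes at @vaddr decode to a call into the uprobe
+ * trampoline, i.e. the probe has already been optimized.
+ */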
+static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr)
+{
+ struct __packed __arch_relative_insn {
+ u8 op;
+ s32 raddr;
+ } *call = (struct __arch_relative_insn *) insn;
+
+ if (!is_call_insn(insn))
+ return false;
+ return __in_uprobe_trampoline(vaddr + 5 + call->raddr);
+}
+
+static int is_optimized(struct mm_struct *mm, unsigned long vaddr)
+{
+ uprobe_opcode_t insn[5];
+ int err;
+
+ err = copy_from_vaddr(mm, vaddr, &insn, 5);
+ if (err)
+ return err;
+ return __is_optimized((uprobe_opcode_t *)&insn, vaddr);
+}
+
+static bool should_optimize(struct arch_uprobe *auprobe)
+{
+ return !test_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags) &&
+ test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
+}
+
+int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
+{
+ if (should_optimize(auprobe)) {
+ /*
+ * We could race with another thread that already optimized the probe,
+ * so let's not overwrite it with int3 again in this case.
+ */
+ int ret = is_optimized(vma->vm_mm, vaddr);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ return 0;
+ }
+ return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN,
+ true /* is_register */);
+}
+
+int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
+{
+ if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) {
+ int ret = is_optimized(vma->vm_mm, vaddr);
+ if (ret < 0)
+ return ret;
+ if (ret) {
+ ret = swbp_unoptimize(auprobe, vma, vaddr);
+ WARN_ON_ONCE(ret);
+ return ret;
+ }
+ }
+ return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn,
+ false /* is_register */);
+}
+
+static int __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct *mm,
+ unsigned long vaddr)
+{
+ struct uprobe_trampoline *tramp;
+ struct vm_area_struct *vma;
+ bool new = false;
+ int err = 0;
+
+ vma = find_vma(mm, vaddr);
+ if (!vma)
+ return -EINVAL;
+ tramp = get_uprobe_trampoline(vaddr, &new);
+ if (!tramp)
+ return -EINVAL;
+ err = swbp_optimize(auprobe, vma, vaddr, tramp->vaddr);
+ if (WARN_ON_ONCE(err) && new)
+ destroy_uprobe_trampoline(tramp);
+ return err;
+}
+
+void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+ struct mm_struct *mm = current->mm;
+ uprobe_opcode_t insn[5];
+
+ if (!should_optimize(auprobe))
+ return;
+
+ mmap_write_lock(mm);
+
+ /*
+ * Check if some other thread already optimized the uprobe for us;
+ * if so, just go away silently.
+ */
+ if (copy_from_vaddr(mm, vaddr, &insn, 5))
+ goto unlock;
+ if (!is_swbp_insn((uprobe_opcode_t*) &insn))
+ goto unlock;
+
+ /*
+ * If we fail to optimize the uprobe, set the fail bit so that
+ * should_optimize() above fails from now on.
+ */
+ if (__arch_uprobe_optimize(auprobe, mm, vaddr))
+ set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags);
+
+unlock:
+ mmap_write_unlock(mm);
+}
+
+static bool insn_is_nop(struct insn *insn)
+{
+ return insn->opcode.nbytes == 1 && insn->opcode.bytes[0] == 0x90;
+}
+
+static bool insn_is_nopl(struct insn *insn)
+{
+ if (insn->opcode.nbytes != 2)
+ return false;
+
+ if (insn->opcode.bytes[0] != 0x0f || insn->opcode.bytes[1] != 0x1f)
+ return false;
+
+ if (!insn->modrm.nbytes)
+ return false;
+
+ if (X86_MODRM_REG(insn->modrm.bytes[0]) != 0)
+ return false;
+
+ /* 0f 1f /0 - NOPL */
+ return true;
+}
+
+static bool can_optimize(struct insn *insn, unsigned long vaddr)
+{
+ if (!insn->x86_64 || insn->length != 5)
+ return false;
+
+ if (!insn_is_nop(insn) && !insn_is_nopl(insn))
+ return false;
+
+ /* We can't do cross-page atomic writes yet. */
+ return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5;
+}
#else /* 32-bit: */
/*
* No RIP-relative addressing on 32-bit
@@ -621,6 +1205,10 @@ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
}
+static bool can_optimize(struct insn *insn, unsigned long vaddr)
+{
+ return false;
+}
#endif /* CONFIG_X86_64 */
struct uprobe_xol_ops {
@@ -979,14 +1567,17 @@ static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
*/
int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
{
- struct insn insn;
u8 fix_ip_or_call = UPROBE_FIX_IP;
+ struct insn insn;
int ret;
ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm));
if (ret)
return ret;
+ if (can_optimize(&insn, addr))
+ set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
+
ret = branch_setup_xol_ops(auprobe, &insn);
if (ret != -ENOSYS)
return ret;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 4fa0be732af1..d7af4a64c211 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -160,11 +160,6 @@ SECTIONS
} :text = 0xcccccccc
- /* bootstrapping code */
- .head.text : AT(ADDR(.head.text) - LOAD_OFFSET) {
- HEAD_TEXT
- } :text = 0xcccccccc
-
/* End of text section, which should occupy whole number of pages */
_etext = .;
. = ALIGN(PAGE_SIZE);
@@ -227,6 +222,8 @@ SECTIONS
*/
.altinstr_aux : AT(ADDR(.altinstr_aux) - LOAD_OFFSET) {
*(.altinstr_aux)
+ . = ALIGN(PAGE_SIZE);
+ __inittext_end = .;
}
INIT_DATA_SECTION(16)
@@ -535,3 +532,5 @@ xen_elfnote_entry_value =
xen_elfnote_phys32_entry_value =
ABSOLUTE(xen_elfnote_phys32_entry) + ABSOLUTE(pvh_start_xen - LOAD_OFFSET);
#endif
+
+#include "../boot/startup/exports.h"
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 8172c2042dd6..5fc437341e03 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -810,6 +810,8 @@ static int __pv_send_ipi(unsigned long *ipi_bitmap, struct kvm_apic_map *map,
if (min > map->max_apic_id)
return 0;
+ min = array_index_nospec(min, map->max_apic_id + 1);
+
for_each_set_bit(i, ipi_bitmap,
min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) {
if (map->phys_map[min + i]) {
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index ad89d0bd6005..103604c4b33b 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -13,7 +13,7 @@
#define MSR_IA32_MISC_ENABLE_PMU_RO_MASK (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | \
MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)
-/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
+/* retrieve a fixed counter's bits out of IA32_FIXED_CTR_CTRL */
#define fixed_ctrl_field(ctrl_reg, idx) \
(((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK)
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 2fbdebf79fbb..0635bd71c10e 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -719,13 +719,6 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages)
static void sev_writeback_caches(struct kvm *kvm)
{
/*
- * Note, the caller is responsible for ensuring correctness if the mask
- * can be modified, e.g. if a CPU could be doing VMRUN.
- */
- if (cpumask_empty(to_kvm_sev_info(kvm)->have_run_cpus))
- return;
-
- /*
* Ensure that all dirty guest tagged cache entries are written back
* before releasing the pages back to the system for use. CLFLUSH will
* not do this without SME_COHERENT, and flushing many cache lines
@@ -739,6 +732,9 @@ static void sev_writeback_caches(struct kvm *kvm)
* serializing multiple calls and having responding CPUs (to the IPI)
* mark themselves as still running if they are running (or about to
* run) a vCPU for the VM.
+ *
+ * Note, the caller is responsible for ensuring correctness if the mask
+ * can be modified, e.g. if a CPU could be doing VMRUN.
*/
wbnoinvd_on_cpus_mask(to_kvm_sev_info(kvm)->have_run_cpus);
}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index d9931c6c4bc6..1bfebe40854f 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4046,8 +4046,7 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
u64 cr8;
- if (nested_svm_virtualize_tpr(vcpu) ||
- kvm_vcpu_apicv_active(vcpu))
+ if (nested_svm_virtualize_tpr(vcpu))
return;
cr8 = kvm_get_cr8(vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a1c49bc681c4..706b6fd56d3c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9908,8 +9908,11 @@ static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
rcu_read_lock();
map = rcu_dereference(vcpu->kvm->arch.apic_map);
- if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
- target = map->phys_map[dest_id]->vcpu;
+ if (likely(map) && dest_id <= map->max_apic_id) {
+ dest_id = array_index_nospec(dest_id, map->max_apic_id + 1);
+ if (map->phys_map[dest_id])
+ target = map->phys_map[dest_id]->vcpu;
+ }
rcu_read_unlock();
@@ -11008,6 +11011,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
wrmsrq(MSR_IA32_XFD_ERR, 0);
/*
+ * Mark this CPU as needing a branch predictor flush before running
+ * userspace. Must be done before enabling preemption to ensure it gets
+ * set for the CPU that actually ran the guest, and not the CPU that it
+ * may migrate to.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_IBPB_EXIT_TO_USER))
+ this_cpu_write(x86_ibpb_exit_to_user, true);
+
+ /*
* Consume any pending interrupts, including the possible source of
* VM-Exit on SVM and any ticks that occur between VM-Exit and now.
* An instruction is required after local_irq_enable() to fully unblock
diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c
index b0f3b2a62ae2..a5cafd402cfd 100644
--- a/arch/x86/lib/inat.c
+++ b/arch/x86/lib/inat.c
@@ -81,3 +81,16 @@ insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m,
return table[opcode];
}
+insn_attr_t inat_get_xop_attribute(insn_byte_t opcode, insn_byte_t map_select)
+{
+ const insn_attr_t *table;
+
+ if (map_select < X86_XOP_M_MIN || map_select > X86_XOP_M_MAX)
+ return 0;
+ map_select -= X86_XOP_M_MIN;
+ /* Look up the opcode in the table for this XOP map */
+ table = inat_xop_tables[map_select];
+ if (!table)
+ return 0;
+ return table[opcode];
+}
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index 149a57e334ab..225af1399c9d 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -200,12 +200,15 @@ found:
}
insn->rex_prefix.got = 1;
- /* Decode VEX prefix */
+ /* Decode VEX/XOP prefix */
b = peek_next(insn_byte_t, insn);
- attr = inat_get_opcode_attribute(b);
- if (inat_is_vex_prefix(attr)) {
+ if (inat_is_vex_prefix(attr) || inat_is_xop_prefix(attr)) {
insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1);
- if (!insn->x86_64) {
+
+ if (inat_is_xop_prefix(attr) && X86_MODRM_REG(b2) == 0) {
+ /* Grp1A.0 is always POP Ev */
+ goto vex_end;
+ } else if (!insn->x86_64) {
/*
* In 32-bits mode, if the [7:6] bits (mod bits of
* ModRM) on the second byte are not 11b, it is
@@ -226,13 +229,13 @@ found:
if (insn->x86_64 && X86_VEX_W(b2))
/* VEX.W overrides opnd_size */
insn->opnd_bytes = 8;
- } else if (inat_is_vex3_prefix(attr)) {
+ } else if (inat_is_vex3_prefix(attr) || inat_is_xop_prefix(attr)) {
b2 = peek_nbyte_next(insn_byte_t, insn, 2);
insn_set_byte(&insn->vex_prefix, 2, b2);
insn->vex_prefix.nbytes = 3;
insn->next_byte += 3;
if (insn->x86_64 && X86_VEX_W(b2))
- /* VEX.W overrides opnd_size */
+ /* VEX.W/XOP.W overrides opnd_size */
insn->opnd_bytes = 8;
} else {
/*
@@ -288,9 +291,22 @@ int insn_get_opcode(struct insn *insn)
insn_set_byte(opcode, 0, op);
opcode->nbytes = 1;
- /* Check if there is VEX prefix or not */
- if (insn_is_avx(insn)) {
+ /* Check if there is VEX/XOP prefix or not */
+ if (insn_is_avx_or_xop(insn)) {
insn_byte_t m, p;
+
+ /* XOP prefix has different encoding */
+ if (unlikely(avx_insn_is_xop(insn))) {
+ m = insn_xop_map_bits(insn);
+ insn->attr = inat_get_xop_attribute(op, m);
+ if (!inat_accept_xop(insn->attr)) {
+ insn->attr = 0;
+ return -EINVAL;
+ }
+ /* XOP has only 1 byte for opcode */
+ goto end;
+ }
+
m = insn_vex_m_bits(insn);
p = insn_vex_p_bits(insn);
insn->attr = inat_get_avx_attribute(op, m, p);
@@ -383,7 +399,8 @@ int insn_get_modrm(struct insn *insn)
pfx_id = insn_last_prefix_id(insn);
insn->attr = inat_get_group_attribute(mod, pfx_id,
insn->attr);
- if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) {
+ if (insn_is_avx_or_xop(insn) && !inat_accept_vex(insn->attr) &&
+ !inat_accept_xop(insn->attr)) {
/* Bad insn */
insn->attr = 0;
return -EINVAL;
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index d78d769a02bd..f513d33b6d37 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -15,7 +15,6 @@
.section .text..__x86.indirect_thunk
-
.macro POLINE reg
ANNOTATE_INTRA_FUNCTION_CALL
call .Ldo_rop_\@
@@ -73,6 +72,7 @@ SYM_CODE_END(__x86_indirect_thunk_array)
#undef GEN
#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING
+
.macro CALL_THUNK reg
.align RETPOLINE_THUNK_SIZE
@@ -126,7 +126,45 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array)
#define GEN(reg) __EXPORT_THUNK(__x86_indirect_jump_thunk_ ## reg)
#include <asm/GEN-for-each-reg.h>
#undef GEN
-#endif
+
+#endif /* CONFIG_MITIGATION_CALL_DEPTH_TRACKING */
+
+#ifdef CONFIG_MITIGATION_ITS
+
+.macro ITS_THUNK reg
+
+/*
+ * If CFI paranoid is used then the ITS thunk starts with opcodes (0xea; jne 1b)
+ * that complete the fineibt_paranoid caller sequence.
+ */
+1: .byte 0xea
+SYM_INNER_LABEL(__x86_indirect_paranoid_thunk_\reg, SYM_L_GLOBAL)
+ UNWIND_HINT_UNDEFINED
+ ANNOTATE_NOENDBR
+ jne 1b
+SYM_INNER_LABEL(__x86_indirect_its_thunk_\reg, SYM_L_GLOBAL)
+ UNWIND_HINT_UNDEFINED
+ ANNOTATE_NOENDBR
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%\reg
+ int3
+ .align 32, 0xcc /* fill to the end of the line */
+ .skip 32 - (__x86_indirect_its_thunk_\reg - 1b), 0xcc /* skip to the next upper half */
+.endm
+
+/* ITS mitigation requires thunks be aligned to upper half of cacheline */
+.align 64, 0xcc
+.skip 29, 0xcc
+
+#define GEN(reg) ITS_THUNK reg
+#include <asm/GEN-for-each-reg.h>
+#undef GEN
+
+ .align 64, 0xcc
+SYM_FUNC_ALIAS(__x86_indirect_its_thunk_array, __x86_indirect_its_thunk_rax)
+SYM_CODE_END(__x86_indirect_its_thunk_array)
+
+#endif /* CONFIG_MITIGATION_ITS */
#ifdef CONFIG_MITIGATION_RETHUNK
@@ -370,39 +408,6 @@ SYM_FUNC_END(call_depth_return_thunk)
#ifdef CONFIG_MITIGATION_ITS
-.macro ITS_THUNK reg
-
-/*
- * If CFI paranoid is used then the ITS thunk starts with opcodes (0xea; jne 1b)
- * that complete the fineibt_paranoid caller sequence.
- */
-1: .byte 0xea
-SYM_INNER_LABEL(__x86_indirect_paranoid_thunk_\reg, SYM_L_GLOBAL)
- UNWIND_HINT_UNDEFINED
- ANNOTATE_NOENDBR
- jne 1b
-SYM_INNER_LABEL(__x86_indirect_its_thunk_\reg, SYM_L_GLOBAL)
- UNWIND_HINT_UNDEFINED
- ANNOTATE_NOENDBR
- ANNOTATE_RETPOLINE_SAFE
- jmp *%\reg
- int3
- .align 32, 0xcc /* fill to the end of the line */
- .skip 32 - (__x86_indirect_its_thunk_\reg - 1b), 0xcc /* skip to the next upper half */
-.endm
-
-/* ITS mitigation requires thunks be aligned to upper half of cacheline */
-.align 64, 0xcc
-.skip 29, 0xcc
-
-#define GEN(reg) ITS_THUNK reg
-#include <asm/GEN-for-each-reg.h>
-#undef GEN
-
- .align 64, 0xcc
-SYM_FUNC_ALIAS(__x86_indirect_its_thunk_array, __x86_indirect_its_thunk_rax)
-SYM_CODE_END(__x86_indirect_its_thunk_array)
-
.align 64, 0xcc
.skip 32, 0xcc
SYM_CODE_START(its_return_thunk)
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 262f7ca1fb95..2a4e69ecc2de 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -27,6 +27,11 @@
# (evo): this opcode is changed by EVEX prefix (EVEX opcode)
# (v): this opcode requires VEX prefix.
# (v1): this opcode only supports 128bit VEX.
+# (xop): this opcode accepts XOP prefix.
+#
+# XOP Superscripts
+# (W=0): this opcode requires XOP.W == 0
+# (W=1): this opcode requires XOP.W == 1
#
# Last Prefix Superscripts
# - (66): the last prefix is 0x66
@@ -194,7 +199,7 @@ AVXcode:
8c: MOV Ev,Sw
8d: LEA Gv,M
8e: MOV Sw,Ew
-8f: Grp1A (1A) | POP Ev (d64)
+8f: Grp1A (1A) | POP Ev (d64) | XOP (Prefix)
# 0x90 - 0x9f
90: NOP | PAUSE (F3) | XCHG r8,rAX
91: XCHG rCX/r9,rAX
@@ -1106,6 +1111,84 @@ AVXcode: 7
f8: URDMSR Rq,Id (F2),(v1),(11B) | UWRMSR Id,Rq (F3),(v1),(11B)
EndTable
+# From AMD64 Architecture Programmer's Manual Vol3, Appendix A.1.5
+Table: XOP map 8h
+Referrer:
+XOPcode: 0
+85: VPMACSSWW Vo,Ho,Wo,Lo
+86: VPMACSSWD Vo,Ho,Wo,Lo
+87: VPMACSSDQL Vo,Ho,Wo,Lo
+8e: VPMACSSDD Vo,Ho,Wo,Lo
+8f: VPMACSSDQH Vo,Ho,Wo,Lo
+95: VPMACSWW Vo,Ho,Wo,Lo
+96: VPMACSWD Vo,Ho,Wo,Lo
+97: VPMACSDQL Vo,Ho,Wo,Lo
+9e: VPMACSDD Vo,Ho,Wo,Lo
+9f: VPMACSDQH Vo,Ho,Wo,Lo
+a2: VPCMOV Vx,Hx,Wx,Lx (W=0) | VPCMOV Vx,Hx,Lx,Wx (W=1)
+a3: VPPERM Vo,Ho,Wo,Lo (W=0) | VPPERM Vo,Ho,Lo,Wo (W=1)
+a6: VPMADCSSWD Vo,Ho,Wo,Lo
+b6: VPMADCSWD Vo,Ho,Wo,Lo
+c0: VPROTB Vo,Wo,Ib
+c1: VPROTW Vo,Wo,Ib
+c2: VPROTD Vo,Wo,Ib
+c3: VPROTQ Vo,Wo,Ib
+cc: VPCOMccB Vo,Ho,Wo,Ib
+cd: VPCOMccW Vo,Ho,Wo,Ib
+ce: VPCOMccD Vo,Ho,Wo,Ib
+cf: VPCOMccQ Vo,Ho,Wo,Ib
+ec: VPCOMccUB Vo,Ho,Wo,Ib
+ed: VPCOMccUW Vo,Ho,Wo,Ib
+ee: VPCOMccUD Vo,Ho,Wo,Ib
+ef: VPCOMccUQ Vo,Ho,Wo,Ib
+EndTable
+
+Table: XOP map 9h
+Referrer:
+XOPcode: 1
+01: GrpXOP1
+02: GrpXOP2
+12: GrpXOP3
+80: VFRCZPS Vx,Wx
+81: VFRCZPD Vx,Wx
+82: VFRCZSS Vq,Wss
+83: VFRCZSD Vq,Wsd
+90: VPROTB Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1)
+91: VPROTW Vo,Wo,Ho (W=0) | VPROTW Vo,Ho,Wo (W=1)
+92: VPROTD Vo,Wo,Ho (W=0) | VPROTD Vo,Ho,Wo (W=1)
+93: VPROTQ Vo,Wo,Ho (W=0) | VPROTQ Vo,Ho,Wo (W=1)
+94: VPSHLB Vo,Wo,Ho (W=0) | VPSHLB Vo,Ho,Wo (W=1)
+95: VPSHLW Vo,Wo,Ho (W=0) | VPSHLW Vo,Ho,Wo (W=1)
+96: VPSHLD Vo,Wo,Ho (W=0) | VPSHLD Vo,Ho,Wo (W=1)
+97: VPSHLQ Vo,Wo,Ho (W=0) | VPSHLQ Vo,Ho,Wo (W=1)
+98: VPSHAB Vo,Wo,Ho (W=0) | VPSHAB Vo,Ho,Wo (W=1)
+99: VPSHAW Vo,Wo,Ho (W=0) | VPSHAW Vo,Ho,Wo (W=1)
+9a: VPSHAD Vo,Wo,Ho (W=0) | VPSHAD Vo,Ho,Wo (W=1)
+9b: VPSHAQ Vo,Wo,Ho (W=0) | VPSHAQ Vo,Ho,Wo (W=1)
+c1: VPHADDBW Vo,Wo
+c2: VPHADDBD Vo,Wo
+c3: VPHADDBQ Vo,Wo
+c6: VPHADDWD Vo,Wo
+c7: VPHADDWQ Vo,Wo
+cb: VPHADDDQ Vo,Wo
+d1: VPHADDUBWD Vo,Wo
+d2: VPHADDUBD Vo,Wo
+d3: VPHADDUBQ Vo,Wo
+d6: VPHADDUWD Vo,Wo
+d7: VPHADDUWQ Vo,Wo
+db: VPHADDUDQ Vo,Wo
+e1: VPHSUBBW Vo,Wo
+e2: VPHSUBWD Vo,Wo
+e3: VPHSUBDQ Vo,Wo
+EndTable
+
+Table: XOP map Ah
+Referrer:
+XOPcode: 2
+10: BEXTR Gy,Ey,Id
+12: GrpXOP4
+EndTable
+
GrpTable: Grp1
0: ADD
1: OR
@@ -1320,3 +1403,29 @@ GrpTable: GrpRNG
4: xcrypt-cfb
5: xcrypt-ofb
EndTable
+
+# GrpXOP1-4 are shown in AMD APM Vol.3 Appendix A as XOP group #1-4
+GrpTable: GrpXOP1
+1: BLCFILL By,Ey (xop)
+2: BLSFILL By,Ey (xop)
+3: BLCS By,Ey (xop)
+4: TZMSK By,Ey (xop)
+5: BLCIC By,Ey (xop)
+6: BLSIC By,Ey (xop)
+7: T1MSKC By,Ey (xop)
+EndTable
+
+GrpTable: GrpXOP2
+1: BLCMSK By,Ey (xop)
+6: BLCI By,Ey (xop)
+EndTable
+
+GrpTable: GrpXOP3
+0: LLWPCB Ry (xop)
+1: SLWPCB Ry (xop)
+EndTable
+
+GrpTable: GrpXOP4
+0: LWPINS By,Ed,Id (xop)
+1: LWPVAL By,Ed,Id (xop)
+EndTable
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 76e33bd7c556..b9426fce5f3e 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -224,6 +224,24 @@ static void sync_global_pgds(unsigned long start, unsigned long end)
}
/*
+ * Make kernel mappings visible in all page tables in the system.
+ * This is necessary except when the init task populates kernel mappings
+ * during the boot process. In that case, all processes originating from
+ * the init task copy the kernel mappings, so there is no issue.
+ * Otherwise, missing synchronization could lead to kernel crashes due
+ * to missing page table entries for certain kernel mappings.
+ *
+ * Synchronization is performed at the top level, which is the PGD in
+ * 5-level paging systems. In 4-level paging systems, however,
+ * pgd_populate() is a no-op, so synchronization is done at the P4D level.
+ * sync_global_pgds() handles this difference between paging levels.
+ */
+void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
+{
+ sync_global_pgds(start, end);
+}
+
+/*
* NOTE: This function is marked __ref because it calls __init function
* (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
*/
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
index faf3a13fb6ba..2f8c32173972 100644
--- a/arch/x86/mm/mem_encrypt_amd.c
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -536,12 +536,6 @@ void __init sme_early_init(void)
x86_init.resources.dmi_setup = snp_dmi_setup;
}
- /*
- * Switch the SVSM CA mapping (if active) from identity mapped to
- * kernel mapped.
- */
- snp_update_svsm_ca();
-
if (sev_status & MSR_AMD64_SNP_SECURE_TSC)
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
}
diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S
index f8a33b25ae86..edbf9c998848 100644
--- a/arch/x86/mm/mem_encrypt_boot.S
+++ b/arch/x86/mm/mem_encrypt_boot.S
@@ -16,7 +16,7 @@
.text
.code64
-SYM_FUNC_START(sme_encrypt_execute)
+SYM_FUNC_START(__pi_sme_encrypt_execute)
/*
* Entry parameters:
@@ -69,9 +69,9 @@ SYM_FUNC_START(sme_encrypt_execute)
ANNOTATE_UNRET_SAFE
ret
int3
-SYM_FUNC_END(sme_encrypt_execute)
+SYM_FUNC_END(__pi_sme_encrypt_execute)
-SYM_FUNC_START(__enc_copy)
+SYM_FUNC_START_LOCAL(__enc_copy)
ANNOTATE_NOENDBR
/*
* Routine used to encrypt memory in place.
diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S
index 1d78e5631bb8..344030c1a81d 100644
--- a/arch/x86/platform/pvh/head.S
+++ b/arch/x86/platform/pvh/head.S
@@ -24,7 +24,7 @@
#include <asm/nospec-branch.h>
#include <xen/interface/elfnote.h>
- __HEAD
+ __INIT
/*
* Entry point for PVH guests.
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index e0a607a14e7e..5ce1d4263000 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -57,7 +57,7 @@ ifdef CONFIG_MITIGATION_RETPOLINE
PURGATORY_CFLAGS_REMOVE += $(RETPOLINE_CFLAGS)
endif
-ifdef CONFIG_CFI_CLANG
+ifdef CONFIG_CFI
PURGATORY_CFLAGS_REMOVE += $(CC_FLAGS_CFI)
endif
diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk
index 2c19d7fc8a85..7ea1b75e59b7 100644
--- a/arch/x86/tools/gen-insn-attr-x86.awk
+++ b/arch/x86/tools/gen-insn-attr-x86.awk
@@ -21,6 +21,7 @@ function clear_vars() {
eid = -1 # escape id
gid = -1 # group id
aid = -1 # AVX id
+ xopid = -1 # XOP id
tname = ""
}
@@ -39,9 +40,11 @@ BEGIN {
ggid = 1
geid = 1
gaid = 0
+ gxopid = 0
delete etable
delete gtable
delete atable
+ delete xoptable
opnd_expr = "^[A-Za-z/]"
ext_expr = "^\\("
@@ -61,6 +64,7 @@ BEGIN {
imm_flag["Ob"] = "INAT_MOFFSET"
imm_flag["Ov"] = "INAT_MOFFSET"
imm_flag["Lx"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
+ imm_flag["Lo"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])"
force64_expr = "\\([df]64\\)"
@@ -87,6 +91,8 @@ BEGIN {
evexonly_expr = "\\(ev\\)"
# (es) is the same as (ev) but also "SCALABLE" i.e. W and pp determine operand size
evex_scalable_expr = "\\(es\\)"
+ # All opcodes in an XOP table or with the (xop) superscript accept the XOP prefix
+ xopok_expr = "\\(xop\\)"
prefix_expr = "\\(Prefix\\)"
prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ"
@@ -106,6 +112,7 @@ BEGIN {
prefix_num["VEX+2byte"] = "INAT_PFX_VEX3"
prefix_num["EVEX"] = "INAT_PFX_EVEX"
prefix_num["REX2"] = "INAT_PFX_REX2"
+ prefix_num["XOP"] = "INAT_PFX_XOP"
clear_vars()
}
@@ -147,6 +154,7 @@ function array_size(arr, i,c) {
if (NF != 1) {
# AVX/escape opcode table
aid = $2
+ xopid = -1
if (gaid <= aid)
gaid = aid + 1
if (tname == "") # AVX only opcode table
@@ -156,6 +164,20 @@ function array_size(arr, i,c) {
tname = "inat_primary_table"
}
+/^XOPcode:/ {
+ if (NF != 1) {
+ # XOP opcode table
+ xopid = $2
+ aid = -1
+ if (gxopid <= xopid)
+ gxopid = xopid + 1
+ if (tname == "") # XOP only opcode table
+ tname = sprintf("inat_xop_table_%d", $2)
+ }
+ if (xopid == -1 && eid == -1) # primary opcode table
+ tname = "inat_primary_table"
+}
+
/^GrpTable:/ {
print "/* " $0 " */"
if (!($2 in group))
@@ -206,6 +228,8 @@ function print_table(tbl,name,fmt,n)
etable[eid,0] = tname
if (aid >= 0)
atable[aid,0] = tname
+ else if (xopid >= 0)
+ xoptable[xopid] = tname
}
if (array_size(lptable1) != 0) {
print_table(lptable1,tname "_1[INAT_OPCODE_TABLE_SIZE]",
@@ -347,6 +371,8 @@ function convert_operands(count,opnd, i,j,imm,mod)
flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY")
else if (match(ext, vexok_expr) || match(opcode, vexok_opcode_expr))
flags = add_flags(flags, "INAT_VEXOK")
+ else if (match(ext, xopok_expr) || xopid >= 0)
+ flags = add_flags(flags, "INAT_XOPOK")
# check prefixes
if (match(ext, prefix_expr)) {
@@ -413,6 +439,14 @@ END {
print " ["i"]["j"] = "atable[i,j]","
print "};\n"
+ print "/* XOP opcode map array */"
+ print "const insn_attr_t * const inat_xop_tables[X86_XOP_M_MAX - X86_XOP_M_MIN + 1]" \
+ " = {"
+ for (i = 0; i < gxopid; i++)
+ if (xoptable[i])
+ print " ["i"] = "xoptable[i]","
+ print "};"
+
print "#else /* !__BOOT_COMPRESSED */\n"
print "/* Escape opcode map array */"
@@ -430,6 +464,10 @@ END {
"[INAT_LSTPFX_MAX + 1];"
print ""
+ print "/* XOP opcode map array */"
+ print "static const insn_attr_t *inat_xop_tables[X86_XOP_M_MAX - X86_XOP_M_MIN + 1];"
+ print ""
+
print "static void inat_init_tables(void)"
print "{"
@@ -455,6 +493,12 @@ END {
if (atable[i,j])
print "\tinat_avx_tables["i"]["j"] = "atable[i,j]";"
+ print ""
+ print "\t/* Print XOP opcode map array */"
+ for (i = 0; i < gxopid; i++)
+ if (xoptable[i])
+ print "\tinat_xop_tables["i"] = "xoptable[i]";"
+
print "}"
print "#endif"
}
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index 5778bc498415..e5a2b9a912d1 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -740,10 +740,10 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel,
static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
const char *symname)
{
- int headtext = !strcmp(sec_name(sec->shdr.sh_info), ".head.text");
unsigned r_type = ELF64_R_TYPE(rel->r_info);
ElfW(Addr) offset = rel->r_offset;
int shn_abs = (sym->st_shndx == SHN_ABS) && !is_reloc(S_REL, symname);
+
if (sym->st_shndx == SHN_UNDEF)
return 0;
@@ -783,12 +783,6 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
break;
}
- if (headtext) {
- die("Absolute reference to symbol '%s' not permitted in .head.text\n",
- symname);
- break;
- }
-
/*
* Relocation offsets for 64 bit kernels are output
* as 32 bits and sign extended back to 64 bits when
diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c
index 942372e69b4d..ee643a6cd691 100644
--- a/arch/x86/virt/svm/sev.c
+++ b/arch/x86/virt/svm/sev.c
@@ -1029,7 +1029,7 @@ int rmp_make_shared(u64 pfn, enum pg_level level)
}
EXPORT_SYMBOL_GPL(rmp_make_shared);
-void snp_leak_pages(u64 pfn, unsigned int npages)
+void __snp_leak_pages(u64 pfn, unsigned int npages, bool dump_rmp)
{
struct page *page = pfn_to_page(pfn);
@@ -1052,14 +1052,15 @@ void snp_leak_pages(u64 pfn, unsigned int npages)
(PageHead(page) && compound_nr(page) <= npages))
list_add_tail(&page->buddy_list, &snp_leaked_pages_list);
- dump_rmpentry(pfn);
+ if (dump_rmp)
+ dump_rmpentry(pfn);
snp_nr_leaked_pages++;
pfn++;
page++;
}
spin_unlock(&snp_leaked_pages_list_lock);
}
-EXPORT_SYMBOL_GPL(snp_leak_pages);
+EXPORT_SYMBOL_GPL(__snp_leak_pages);
void kdump_sev_callback(void)
{
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 98d8a50d2aed..aa4040fd9215 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -8,6 +8,7 @@ config XEN
depends on PARAVIRT
select PARAVIRT_CLOCK
select X86_HV_CALLBACK_VECTOR
+ select HIBERNATE_CALLBACKS
depends on X86_64 || (X86_32 && X86_PAE)
depends on X86_64 || (X86_GENERIC || MPENTIUM4 || MATOM)
depends on X86_LOCAL_APIC && X86_TSC
@@ -64,12 +65,6 @@ config XEN_PVHVM_GUEST
help
Support running as a Xen PVHVM guest.
-config XEN_SAVE_RESTORE
- bool
- depends on XEN
- select HIBERNATE_CALLBACKS
- default y
-
config XEN_DEBUG_FS
bool "Enable Xen debug and tuning parameters in debugfs"
depends on XEN && DEBUG_FS
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 26bbaf4b7330..4806cc28d7ca 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -382,7 +382,6 @@ static bool __init xen_check_xsave(void)
static void __init xen_init_capabilities(void)
{
- setup_force_cpu_cap(X86_FEATURE_XENPV);
setup_clear_cpu_cap(X86_FEATURE_DCA);
setup_clear_cpu_cap(X86_FEATURE_APERFMPERF);
setup_clear_cpu_cap(X86_FEATURE_MTRR);
@@ -1402,6 +1401,7 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si)
JMP32_INSN_SIZE);
xen_domain_type = XEN_PV_DOMAIN;
+ setup_force_cpu_cap(X86_FEATURE_XENPV);
xen_start_flags = xen_start_info->flags;
/* Interrupts are guaranteed to be off initially. */
early_boot_irqs_disabled = true;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index c4c479373249..3be45bf4bc79 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -41,7 +41,7 @@ EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
int xen_unmap_domain_gfn_range(struct vm_area_struct *vma,
int nr, struct page **pages)
{
- if (xen_feature(XENFEAT_auto_translated_physmap))
+ if (!xen_pv_domain())
return xen_xlate_unmap_gfn_range(vma, nr, pages);
if (!pages)
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 56914e21e303..2dd12b61a230 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -686,7 +686,7 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
int i, ret = 0;
pte_t *pte;
- if (xen_feature(XENFEAT_auto_translated_physmap))
+ if (!xen_pv_domain())
return 0;
if (kmap_ops) {
@@ -769,7 +769,7 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
{
int i, ret = 0;
- if (xen_feature(XENFEAT_auto_translated_physmap))
+ if (!xen_pv_domain())
return 0;
for (i = 0; i < count; i++) {
diff --git a/arch/xtensa/include/asm/bitops.h b/arch/xtensa/include/asm/bitops.h
index e02ec5833389..f7390b6761e1 100644
--- a/arch/xtensa/include/asm/bitops.h
+++ b/arch/xtensa/include/asm/bitops.h
@@ -37,7 +37,7 @@ static inline unsigned long __cntlz (unsigned long x)
* bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
*/
-static inline int ffz(unsigned long x)
+static inline int __attribute_const__ ffz(unsigned long x)
{
return 31 - __cntlz(~x & -~x);
}
@@ -46,7 +46,7 @@ static inline int ffz(unsigned long x)
* __ffs: Find first bit set in word. Return 0 for bit 0
*/
-static inline unsigned long __ffs(unsigned long x)
+static inline __attribute_const__ unsigned long __ffs(unsigned long x)
{
return 31 - __cntlz(x & -x);
}
@@ -57,7 +57,7 @@ static inline unsigned long __ffs(unsigned long x)
* differs in spirit from the above ffz (man ffs).
*/
-static inline int ffs(unsigned long x)
+static inline __attribute_const__ int ffs(unsigned long x)
{
return 32 - __cntlz(x & -x);
}
@@ -67,7 +67,7 @@ static inline int ffs(unsigned long x)
* Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32.
*/
-static inline int fls (unsigned int x)
+static inline __attribute_const__ int fls (unsigned int x)
{
return 32 - __cntlz(x);
}
@@ -78,7 +78,7 @@ static inline int fls (unsigned int x)
*
* Undefined if no set bit exists, so code should check against 0 first.
*/
-static inline unsigned long __fls(unsigned long word)
+static inline __attribute_const__ unsigned long __fls(unsigned long word)
{
return 31 - __cntlz(word);
}
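
Illustrative aside, not part of the diff: the xtensa hunks above only add a const-function attribute to pure bit helpers. A minimal standalone sketch of the effect (GCC/Clang builtins, not the xtensa __cntlz implementation; all names are local to the sketch) is:

#include <stdio.h>

/* Marking a pure-computation helper __attribute__((const)) tells the
 * compiler the result depends only on the arguments, so repeated calls
 * with the same input can be folded or CSE'd away. */
static inline int __attribute__((const)) model_fls(unsigned int x)
{
        return x ? 32 - __builtin_clz(x) : 0;   /* fls(1) = 1, fls(0x80000000) = 32 */
}

int main(void)
{
        /* the optimizer may evaluate this pair as a single call */
        printf("%d %d\n", model_fls(0x80000000u), model_fls(1u));
        return 0;
}
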
diff --git a/arch/xtensa/kernel/asm-offsets.c b/arch/xtensa/kernel/asm-offsets.c
index da38de20ae59..cfbced95e944 100644
--- a/arch/xtensa/kernel/asm-offsets.c
+++ b/arch/xtensa/kernel/asm-offsets.c
@@ -11,6 +11,7 @@
*
* Chris Zankel <chris@zankel.net>
*/
+#define COMPILE_OFFSETS
#include <asm/processor.h>
#include <asm/coprocessor.h>
diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c
index 7bd66677f7b6..94d43f44be13 100644
--- a/arch/xtensa/kernel/process.c
+++ b/arch/xtensa/kernel/process.c
@@ -267,7 +267,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
- unsigned long clone_flags = args->flags;
+ u64 clone_flags = args->flags;
unsigned long usp_thread_fn = args->stack;
unsigned long tls = args->tls;
struct pt_regs *childregs = task_pt_regs(p);
diff --git a/block/bdev.c b/block/bdev.c
index b77ddd12dc06..810707cca970 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -412,7 +412,7 @@ static const struct super_operations bdev_sops = {
.statfs = simple_statfs,
.alloc_inode = bdev_alloc_inode,
.free_inode = bdev_free_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = bdev_evict_inode,
};
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 3bf76902f07f..50e51047e1fe 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -5847,8 +5847,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
goto out;
}
- bfqq = kmem_cache_alloc_node(bfq_pool,
- GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
+ bfqq = kmem_cache_alloc_node(bfq_pool, GFP_NOWAIT | __GFP_ZERO,
bfqd->queue->node);
if (bfqq) {
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 5936db7f8475..fe9ebd6a2e14 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -394,7 +394,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
/* allocate */
if (!new_blkg) {
- new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT | __GFP_NOWARN);
+ new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT);
if (unlikely(!new_blkg)) {
ret = -ENOMEM;
goto err_put_css;
@@ -1467,7 +1467,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
spin_lock_init(&blkcg->lock);
refcount_set(&blkcg->online_pin, 1);
- INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
+ INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT);
INIT_HLIST_HEAD(&blkcg->blkg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&blkcg->cgwb_list);
@@ -1630,7 +1630,7 @@ retry:
pd_prealloc = NULL;
} else {
pd = pol->pd_alloc_fn(disk, blkg->blkcg,
- GFP_NOWAIT | __GFP_NOWARN);
+ GFP_NOWAIT);
}
if (!pd) {
diff --git a/block/blk-core.c b/block/blk-core.c
index fdac48aec5ef..a27185cd8ede 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -560,6 +560,8 @@ static inline int bio_check_eod(struct bio *bio)
if (nr_sectors &&
(nr_sectors > maxsector ||
bio->bi_iter.bi_sector > maxsector - nr_sectors)) {
+ if (!maxsector)
+ return -EIO;
pr_info_ratelimited("%s: attempt to access beyond end of device\n"
"%pg: rw=%d, sector=%llu, nr_sectors = %u limit=%llu\n",
current->comm, bio->bi_bdev, bio->bi_opf,
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 056b8948369d..ce08ad4565e2 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -58,16 +58,14 @@ new_segment:
int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd,
struct logical_block_metadata_cap __user *argp)
{
- struct blk_integrity *bi = blk_get_integrity(bdev->bd_disk);
+ struct blk_integrity *bi;
struct logical_block_metadata_cap meta_cap = {};
size_t usize = _IOC_SIZE(cmd);
- if (_IOC_DIR(cmd) != _IOC_DIR(FS_IOC_GETLBMD_CAP) ||
- _IOC_TYPE(cmd) != _IOC_TYPE(FS_IOC_GETLBMD_CAP) ||
- _IOC_NR(cmd) != _IOC_NR(FS_IOC_GETLBMD_CAP) ||
- _IOC_SIZE(cmd) < LBMD_SIZE_VER0)
+ if (!extensible_ioctl_valid(cmd, FS_IOC_GETLBMD_CAP, LBMD_SIZE_VER0))
return -ENOIOCTLCMD;
+ bi = blk_get_integrity(bdev->bd_disk);
if (!bi)
goto out;
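
Illustrative aside, not part of the diff: the blk-integrity hunk folds the open-coded _IOC_* comparisons into extensible_ioctl_valid(). A hedged userspace model of that check, assuming the helper keeps the semantics of the test it replaces (the name and signature below are invented for the sketch), is:

#include <linux/ioctl.h>
#include <stdbool.h>
#include <stddef.h>

/* Valid when direction, type and number match the reference command and the
 * caller's structure is at least the v0 size, mirroring the removed test. */
static bool model_extensible_ioctl_valid(unsigned int cmd, unsigned int ref,
                                         size_t min_size)
{
        return _IOC_DIR(cmd)  == _IOC_DIR(ref)  &&
               _IOC_TYPE(cmd) == _IOC_TYPE(ref) &&
               _IOC_NR(cmd)   == _IOC_NR(ref)   &&
               _IOC_SIZE(cmd) >= min_size;
}
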
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 9fda3906e5f5..d15918d7fabb 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -286,7 +286,7 @@ out:
}
EXPORT_SYMBOL_GPL(set_task_ioprio);
-int __copy_io(unsigned long clone_flags, struct task_struct *tsk)
+int __copy_io(u64 clone_flags, struct task_struct *tsk)
{
struct io_context *ioc = current->io_context;
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 7ed3e71f2fc0..32c65efdda46 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -95,6 +95,7 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(SQ_SCHED),
QUEUE_FLAG_NAME(DISABLE_WBT_DEF),
QUEUE_FLAG_NAME(NO_ELV_SWITCH),
+ QUEUE_FLAG_NAME(QOS_ENABLED),
};
#undef QUEUE_FLAG_NAME
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b67d6c02eceb..ba3a4b77f578 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -5033,6 +5033,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
unsigned int memflags;
int i;
struct xarray elv_tbl, et_tbl;
+ bool queues_frozen = false;
lockdep_assert_held(&set->tag_list_lock);
@@ -5056,9 +5057,6 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
blk_mq_sysfs_unregister_hctxs(q);
}
- list_for_each_entry(q, &set->tag_list, tag_set_list)
- blk_mq_freeze_queue_nomemsave(q);
-
/*
* Switch IO scheduler to 'none', cleaning up the data associated
* with the previous scheduler. We will switch back once we are done
@@ -5068,6 +5066,9 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
if (blk_mq_elv_switch_none(q, &elv_tbl))
goto switch_back;
+ list_for_each_entry(q, &set->tag_list, tag_set_list)
+ blk_mq_freeze_queue_nomemsave(q);
+ queues_frozen = true;
if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0)
goto switch_back;
@@ -5091,8 +5092,12 @@ fallback:
}
switch_back:
/* The blk_mq_elv_switch_back unfreezes queue for us. */
- list_for_each_entry(q, &set->tag_list, tag_set_list)
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ /* switch_back expects queue to be frozen */
+ if (!queues_frozen)
+ blk_mq_freeze_queue_nomemsave(q);
blk_mq_elv_switch_back(q, &elv_tbl, &et_tbl);
+ }
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_sysfs_register_hctxs(q);
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index 848591fb3c57..654478dfbc20 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -2,8 +2,6 @@
#include "blk-rq-qos.h"
-__read_mostly DEFINE_STATIC_KEY_FALSE(block_rq_qos);
-
/*
* Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
* false if 'v' + 1 would be bigger than 'below'.
@@ -319,8 +317,8 @@ void rq_qos_exit(struct request_queue *q)
struct rq_qos *rqos = q->rq_qos;
q->rq_qos = rqos->next;
rqos->ops->exit(rqos);
- static_branch_dec(&block_rq_qos);
}
+ blk_queue_flag_clear(QUEUE_FLAG_QOS_ENABLED, q);
mutex_unlock(&q->rq_qos_mutex);
}
@@ -346,7 +344,7 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
goto ebusy;
rqos->next = q->rq_qos;
q->rq_qos = rqos;
- static_branch_inc(&block_rq_qos);
+ blk_queue_flag_set(QUEUE_FLAG_QOS_ENABLED, q);
blk_mq_unfreeze_queue(q, memflags);
@@ -377,6 +375,8 @@ void rq_qos_del(struct rq_qos *rqos)
break;
}
}
+ if (!q->rq_qos)
+ blk_queue_flag_clear(QUEUE_FLAG_QOS_ENABLED, q);
blk_mq_unfreeze_queue(q, memflags);
mutex_lock(&q->debugfs_mutex);
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 39749f4066fb..b538f2c0febc 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -12,7 +12,6 @@
#include "blk-mq-debugfs.h"
struct blk_mq_debugfs_attr;
-extern struct static_key_false block_rq_qos;
enum rq_qos_id {
RQ_QOS_WBT,
@@ -113,43 +112,58 @@ void __rq_qos_queue_depth_changed(struct rq_qos *rqos);
static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
{
- if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
+ if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
+ q->rq_qos)
__rq_qos_cleanup(q->rq_qos, bio);
}
static inline void rq_qos_done(struct request_queue *q, struct request *rq)
{
- if (static_branch_unlikely(&block_rq_qos) && q->rq_qos &&
- !blk_rq_is_passthrough(rq))
+ if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
+ q->rq_qos && !blk_rq_is_passthrough(rq))
__rq_qos_done(q->rq_qos, rq);
}
static inline void rq_qos_issue(struct request_queue *q, struct request *rq)
{
- if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
+ if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
+ q->rq_qos)
__rq_qos_issue(q->rq_qos, rq);
}
static inline void rq_qos_requeue(struct request_queue *q, struct request *rq)
{
- if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
+ if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
+ q->rq_qos)
__rq_qos_requeue(q->rq_qos, rq);
}
static inline void rq_qos_done_bio(struct bio *bio)
{
- if (static_branch_unlikely(&block_rq_qos) &&
- bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) ||
- bio_flagged(bio, BIO_QOS_MERGED))) {
- struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- if (q->rq_qos)
- __rq_qos_done_bio(q->rq_qos, bio);
- }
+ struct request_queue *q;
+
+ if (!bio->bi_bdev || (!bio_flagged(bio, BIO_QOS_THROTTLED) &&
+ !bio_flagged(bio, BIO_QOS_MERGED)))
+ return;
+
+ q = bdev_get_queue(bio->bi_bdev);
+
+ /*
+ * A BIO may carry BIO_QOS_* flags even if the associated request_queue
+ * does not have rq_qos enabled. This can happen with stacked block
+ * devices — for example, NVMe multipath, where it's possible that the
+ * bottom device has QoS enabled but the top device does not. Therefore,
+ * always verify that q->rq_qos is present and QoS is enabled before
+ * calling __rq_qos_done_bio().
+ */
+ if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) && q->rq_qos)
+ __rq_qos_done_bio(q->rq_qos, bio);
}
static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
{
- if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) {
+ if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
+ q->rq_qos) {
bio_set_flag(bio, BIO_QOS_THROTTLED);
__rq_qos_throttle(q->rq_qos, bio);
}
@@ -158,14 +172,16 @@ static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
static inline void rq_qos_track(struct request_queue *q, struct request *rq,
struct bio *bio)
{
- if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
+ if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
+ q->rq_qos)
__rq_qos_track(q->rq_qos, rq, bio);
}
static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
struct bio *bio)
{
- if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) {
+ if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
+ q->rq_qos) {
bio_set_flag(bio, BIO_QOS_MERGED);
__rq_qos_merge(q->rq_qos, rq, bio);
}
@@ -173,7 +189,8 @@ static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
static inline void rq_qos_queue_depth_changed(struct request_queue *q)
{
- if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
+ if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
+ q->rq_qos)
__rq_qos_queue_depth_changed(q->rq_qos);
}
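
Illustrative aside, not part of the diff: every rq-qos hook above replaces the global block_rq_qos static key with a per-queue flag test. A minimal userspace model of the gating pattern (names are local to the sketch, not kernel types) is:

#include <stdbool.h>

struct rq_qos;                          /* opaque policy chain */

struct request_queue_model {
        unsigned long queue_flags;
        struct rq_qos *rq_qos;
};

#define MODEL_QOS_ENABLED_BIT 0

static inline bool model_qos_active(const struct request_queue_model *q)
{
        /* the flag is a cheap fast-path hint set/cleared by rq_qos_add(),
         * rq_qos_del() and rq_qos_exit(); q->rq_qos remains the
         * authoritative check, as in the stacked-device case described in
         * rq_qos_done_bio() above */
        return (q->queue_flags & (1UL << MODEL_QOS_ENABLED_BIT)) && q->rq_qos;
}

A per-queue flag, unlike the global static key, lets one queue keep QoS enabled while another on the same system has none.
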
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 07874e9b609f..d6438e6c276d 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -157,16 +157,14 @@ static int blk_validate_integrity_limits(struct queue_limits *lim)
switch (bi->csum_type) {
case BLK_INTEGRITY_CSUM_NONE:
if (bi->pi_tuple_size) {
- pr_warn("pi_tuple_size must be 0 when checksum type \
- is none\n");
+ pr_warn("pi_tuple_size must be 0 when checksum type is none\n");
return -EINVAL;
}
break;
case BLK_INTEGRITY_CSUM_CRC:
case BLK_INTEGRITY_CSUM_IP:
if (bi->pi_tuple_size != sizeof(struct t10_pi_tuple)) {
- pr_warn("pi_tuple_size mismatch for T10 PI: expected \
- %zu, got %u\n",
+ pr_warn("pi_tuple_size mismatch for T10 PI: expected %zu, got %u\n",
sizeof(struct t10_pi_tuple),
bi->pi_tuple_size);
return -EINVAL;
@@ -174,8 +172,7 @@ static int blk_validate_integrity_limits(struct queue_limits *lim)
break;
case BLK_INTEGRITY_CSUM_CRC64:
if (bi->pi_tuple_size != sizeof(struct crc64_pi_tuple)) {
- pr_warn("pi_tuple_size mismatch for CRC64 PI: \
- expected %zu, got %u\n",
+ pr_warn("pi_tuple_size mismatch for CRC64 PI: expected %zu, got %u\n",
sizeof(struct crc64_pi_tuple),
bi->pi_tuple_size);
return -EINVAL;
@@ -972,6 +969,8 @@ bool queue_limits_stack_integrity(struct queue_limits *t,
goto incompatible;
if (ti->csum_type != bi->csum_type)
goto incompatible;
+ if (ti->pi_tuple_size != bi->pi_tuple_size)
+ goto incompatible;
if ((ti->flags & BLK_INTEGRITY_REF_TAG) !=
(bi->flags & BLK_INTEGRITY_REF_TAG))
goto incompatible;
@@ -980,6 +979,7 @@ bool queue_limits_stack_integrity(struct queue_limits *t,
ti->flags |= (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) |
(bi->flags & BLK_INTEGRITY_REF_TAG);
ti->csum_type = bi->csum_type;
+ ti->pi_tuple_size = bi->pi_tuple_size;
ti->metadata_size = bi->metadata_size;
ti->pi_offset = bi->pi_offset;
ti->interval_exp = bi->interval_exp;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 396cded255ea..4a7f1a349998 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -847,7 +847,7 @@ static void blk_queue_release(struct kobject *kobj)
/* nothing to do here, all data is associated with the parent gendisk */
}
-static const struct kobj_type blk_queue_ktype = {
+const struct kobj_type blk_queue_ktype = {
.default_groups = blk_queue_attr_groups,
.sysfs_ops = &queue_sysfs_ops,
.release = blk_queue_release,
@@ -875,15 +875,14 @@ int blk_register_queue(struct gendisk *disk)
struct request_queue *q = disk->queue;
int ret;
- kobject_init(&disk->queue_kobj, &blk_queue_ktype);
ret = kobject_add(&disk->queue_kobj, &disk_to_dev(disk)->kobj, "queue");
if (ret < 0)
- goto out_put_queue_kobj;
+ return ret;
if (queue_is_mq(q)) {
ret = blk_mq_sysfs_register(disk);
if (ret)
- goto out_put_queue_kobj;
+ goto out_del_queue_kobj;
}
mutex_lock(&q->sysfs_lock);
@@ -903,9 +902,9 @@ int blk_register_queue(struct gendisk *disk)
if (queue_is_mq(q))
elevator_set_default(q);
- wbt_enable_default(disk);
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
+ wbt_enable_default(disk);
/* Now everything is ready and send out KOBJ_ADD uevent */
kobject_uevent(&disk->queue_kobj, KOBJ_ADD);
@@ -934,8 +933,8 @@ out_debugfs_remove:
mutex_unlock(&q->sysfs_lock);
if (queue_is_mq(q))
blk_mq_sysfs_unregister(disk);
-out_put_queue_kobj:
- kobject_put(&disk->queue_kobj);
+out_del_queue_kobj:
+ kobject_del(&disk->queue_kobj);
return ret;
}
@@ -986,5 +985,4 @@ void blk_unregister_queue(struct gendisk *disk)
elevator_set_none(q);
blk_debugfs_remove(disk);
- kobject_put(&disk->queue_kobj);
}
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index a50d4cd55f41..eb8037bae0bd 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -85,8 +85,8 @@ struct rq_wb {
u64 sync_issue;
void *sync_cookie;
- unsigned long last_issue; /* last non-throttled issue */
- unsigned long last_comp; /* last non-throttled comp */
+ unsigned long last_issue; /* issue time of last read rq */
+ unsigned long last_comp; /* completion time of last read rq */
unsigned long min_lat_nsec;
struct rq_qos rqos;
struct rq_wait rq_wait[WBT_NUM_RWQ];
@@ -248,13 +248,14 @@ static void wbt_done(struct rq_qos *rqos, struct request *rq)
struct rq_wb *rwb = RQWB(rqos);
if (!wbt_is_tracked(rq)) {
- if (rwb->sync_cookie == rq) {
- rwb->sync_issue = 0;
- rwb->sync_cookie = NULL;
- }
+ if (wbt_is_read(rq)) {
+ if (rwb->sync_cookie == rq) {
+ rwb->sync_issue = 0;
+ rwb->sync_cookie = NULL;
+ }
- if (wbt_is_read(rq))
wb_timestamp(rwb, &rwb->last_comp);
+ }
} else {
WARN_ON_ONCE(rq == rwb->sync_cookie);
__wbt_done(rqos, wbt_flags(rq));
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index ef43aaca49f4..5e2a5788dc3b 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -1286,14 +1286,14 @@ static void blk_zone_wplug_bio_work(struct work_struct *work)
struct block_device *bdev;
unsigned long flags;
struct bio *bio;
+ bool prepared;
/*
* Submit the next plugged BIO. If we do not have any, clear
* the plugged flag.
*/
- spin_lock_irqsave(&zwplug->lock, flags);
-
again:
+ spin_lock_irqsave(&zwplug->lock, flags);
bio = bio_list_pop(&zwplug->bio_list);
if (!bio) {
zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
@@ -1304,13 +1304,14 @@ again:
trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no,
bio->bi_iter.bi_sector, bio_sectors(bio));
- if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
+ prepared = blk_zone_wplug_prepare_bio(zwplug, bio);
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+
+ if (!prepared) {
blk_zone_wplug_bio_io_error(zwplug, bio);
goto again;
}
- spin_unlock_irqrestore(&zwplug->lock, flags);
-
bdev = bio->bi_bdev;
/*
diff --git a/block/blk.h b/block/blk.h
index 0a2eccf28ca4..46f566f9b126 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -29,6 +29,7 @@ struct elevator_tags;
/* Max future timer expiry for timeouts */
#define BLK_MAX_TIMEOUT (5 * HZ)
+extern const struct kobj_type blk_queue_ktype;
extern struct dentry *blk_debugfs_root;
struct blk_flush_queue {
diff --git a/block/fops.c b/block/fops.c
index 82451ac8ff25..ddbc69c0922b 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -7,6 +7,7 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
#include <linux/uio.h>
@@ -54,7 +55,6 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
struct bio bio;
ssize_t ret;
- WARN_ON_ONCE(iocb->ki_flags & IOCB_HAS_METADATA);
if (nr_pages <= DIO_INLINE_BIO_VECS)
vecs = inline_vecs;
else {
@@ -131,7 +131,7 @@ static void blkdev_bio_end_io(struct bio *bio)
if (bio->bi_status && !dio->bio.bi_status)
dio->bio.bi_status = bio->bi_status;
- if (!is_sync && (dio->iocb->ki_flags & IOCB_HAS_METADATA))
+ if (bio_integrity(bio))
bio_integrity_unmap_user(bio);
if (atomic_dec_and_test(&dio->ref)) {
@@ -233,7 +233,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
}
bio->bi_opf |= REQ_NOWAIT;
}
- if (!is_sync && (iocb->ki_flags & IOCB_HAS_METADATA)) {
+ if (iocb->ki_flags & IOCB_HAS_METADATA) {
ret = bio_integrity_map_iter(bio, iocb->private);
if (unlikely(ret))
goto fail;
@@ -301,7 +301,7 @@ static void blkdev_bio_end_io_async(struct bio *bio)
ret = blk_status_to_errno(bio->bi_status);
}
- if (iocb->ki_flags & IOCB_HAS_METADATA)
+ if (bio_integrity(bio))
bio_integrity_unmap_user(bio);
iocb->ki_complete(iocb, ret);
@@ -422,7 +422,8 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
}
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
- if (likely(nr_pages <= BIO_MAX_VECS)) {
+ if (likely(nr_pages <= BIO_MAX_VECS &&
+ !(iocb->ki_flags & IOCB_HAS_METADATA))) {
if (is_sync_kiocb(iocb))
return __blkdev_direct_IO_simple(iocb, iter, bdev,
nr_pages);
@@ -687,6 +688,8 @@ static int blkdev_open(struct inode *inode, struct file *filp)
if (bdev_can_atomic_write(bdev))
filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;
+ if (blk_get_integrity(bdev->bd_disk))
+ filp->f_mode |= FMODE_HAS_METADATA;
ret = bdev_open(bdev, mode, filp->private_data, NULL, filp);
if (ret)
diff --git a/block/genhd.c b/block/genhd.c
index c26733f6324b..9bbc38d12792 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1303,6 +1303,7 @@ static void disk_release(struct device *dev)
disk_free_zone_resources(disk);
xa_destroy(&disk->part_tbl);
+ kobject_put(&disk->queue_kobj);
disk->queue->disk = NULL;
blk_put_queue(disk->queue);
@@ -1486,6 +1487,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
INIT_LIST_HEAD(&disk->slave_bdevs);
#endif
mutex_init(&disk->rqos_state_mutex);
+ kobject_init(&disk->queue_kobj, &blk_queue_ktype);
return disk;
out_erase_part0:
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 23bd98981ae8..a04595f9d0ca 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -344,14 +344,6 @@ config CRYPTO_ECRDSA
One of the Russian cryptographic standard algorithms (called GOST
algorithms). Only signature verification is implemented.
-config CRYPTO_CURVE25519
- tristate "Curve25519"
- select CRYPTO_KPP
- select CRYPTO_LIB_CURVE25519_GENERIC
- select CRYPTO_LIB_CURVE25519_INTERNAL
- help
- Curve25519 elliptic curve (RFC7748)
-
endmenu
menu "Block ciphers"
@@ -609,6 +601,7 @@ menu "Length-preserving ciphers and modes"
config CRYPTO_ADIANTUM
tristate "Adiantum"
select CRYPTO_CHACHA20
+ select CRYPTO_LIB_POLY1305
select CRYPTO_LIB_POLY1305_GENERIC
select CRYPTO_NHPOLY1305
select CRYPTO_MANAGER
@@ -647,7 +640,6 @@ config CRYPTO_ARC4
config CRYPTO_CHACHA20
tristate "ChaCha"
select CRYPTO_LIB_CHACHA
- select CRYPTO_LIB_CHACHA_GENERIC
select CRYPTO_SKCIPHER
help
The ChaCha20, XChaCha20, and XChaCha12 stream cipher algorithms
@@ -770,6 +762,7 @@ config CRYPTO_XTS
config CRYPTO_NHPOLY1305
tristate
select CRYPTO_HASH
+ select CRYPTO_LIB_POLY1305
select CRYPTO_LIB_POLY1305_GENERIC
endmenu
@@ -938,8 +931,9 @@ config CRYPTO_MD4
config CRYPTO_MD5
tristate "MD5"
select CRYPTO_HASH
+ select CRYPTO_LIB_MD5
help
- MD5 message digest algorithm (RFC1321)
+ MD5 message digest algorithm (RFC1321), including HMAC support.
config CRYPTO_MICHAEL_MIC
tristate "Michael MIC"
diff --git a/crypto/Makefile b/crypto/Makefile
index 6c5d59369dac..e430e6e99b6a 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -182,7 +182,6 @@ obj-$(CONFIG_CRYPTO_USER_API_AEAD) += algif_aead.o
obj-$(CONFIG_CRYPTO_ZSTD) += zstd.o
obj-$(CONFIG_CRYPTO_ECC) += ecc.o
obj-$(CONFIG_CRYPTO_ESSIV) += essiv.o
-obj-$(CONFIG_CRYPTO_CURVE25519) += curve25519-generic.o
ecdh_generic-y += ecdh.o
ecdh_generic-y += ecdh_helper.o
diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index 0da7c1ac778a..ca6fdcc6c54a 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -970,6 +970,12 @@ int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size,
}
lock_sock(sk);
+ if (ctx->write) {
+ release_sock(sk);
+ return -EBUSY;
+ }
+ ctx->write = true;
+
if (ctx->init && !ctx->more) {
if (ctx->used) {
err = -EINVAL;
@@ -1019,6 +1025,8 @@ int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size,
continue;
}
+ ctx->merge = 0;
+
if (!af_alg_writable(sk)) {
err = af_alg_wait_for_wmem(sk, msg->msg_flags);
if (err)
@@ -1058,7 +1066,6 @@ int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size,
ctx->used += plen;
copied += plen;
size -= plen;
- ctx->merge = 0;
} else {
do {
struct page *pg;
@@ -1104,6 +1111,7 @@ int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size,
unlock:
af_alg_data_wakeup(sk);
+ ctx->write = false;
release_sock(sk);
return copied ?: err;
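
Illustrative aside, not part of the diff: the af_alg hunk adds a ctx->write guard so only one sender can be inside af_alg_sendmsg() at a time. A self-contained POSIX-threads model of that guard pattern (not the kernel socket-lock code; names are invented for the sketch) is:

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>

struct model_ctx {
        pthread_mutex_t lock;
        bool write;              /* a sendmsg-style writer is active */
};

static int model_sendmsg(struct model_ctx *ctx)
{
        pthread_mutex_lock(&ctx->lock);
        if (ctx->write) {
                /* another writer already owns the context */
                pthread_mutex_unlock(&ctx->lock);
                return -EBUSY;
        }
        ctx->write = true;

        /* ... copy data, possibly dropping and re-taking the lock ... */

        ctx->write = false;      /* always cleared before returning */
        pthread_mutex_unlock(&ctx->lock);
        return 0;
}
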
diff --git a/crypto/chacha.c b/crypto/chacha.c
index c3a11f4e2d13..ec16d5a33f3c 100644
--- a/crypto/chacha.c
+++ b/crypto/chacha.c
@@ -47,7 +47,7 @@ static int chacha12_setkey(struct crypto_skcipher *tfm,
static int chacha_stream_xor(struct skcipher_request *req,
const struct chacha_ctx *ctx,
- const u8 iv[CHACHA_IV_SIZE], bool arch)
+ const u8 iv[CHACHA_IV_SIZE])
{
struct skcipher_walk walk;
struct chacha_state state;
@@ -63,36 +63,23 @@ static int chacha_stream_xor(struct skcipher_request *req,
if (nbytes < walk.total)
nbytes = round_down(nbytes, CHACHA_BLOCK_SIZE);
- if (arch)
- chacha_crypt(&state, walk.dst.virt.addr,
- walk.src.virt.addr, nbytes, ctx->nrounds);
- else
- chacha_crypt_generic(&state, walk.dst.virt.addr,
- walk.src.virt.addr, nbytes,
- ctx->nrounds);
+ chacha_crypt(&state, walk.dst.virt.addr, walk.src.virt.addr,
+ nbytes, ctx->nrounds);
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
return err;
}
-static int crypto_chacha_crypt_generic(struct skcipher_request *req)
+static int crypto_chacha_crypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
const struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
- return chacha_stream_xor(req, ctx, req->iv, false);
+ return chacha_stream_xor(req, ctx, req->iv);
}
-static int crypto_chacha_crypt_arch(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- const struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return chacha_stream_xor(req, ctx, req->iv, true);
-}
-
-static int crypto_xchacha_crypt(struct skcipher_request *req, bool arch)
+static int crypto_xchacha_crypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
const struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
@@ -102,10 +89,7 @@ static int crypto_xchacha_crypt(struct skcipher_request *req, bool arch)
/* Compute the subkey given the original key and first 128 nonce bits */
chacha_init(&state, ctx->key, req->iv);
- if (arch)
- hchacha_block(&state, subctx.key, ctx->nrounds);
- else
- hchacha_block_generic(&state, subctx.key, ctx->nrounds);
+ hchacha_block(&state, subctx.key, ctx->nrounds);
subctx.nrounds = ctx->nrounds;
/* Build the real IV */
@@ -113,71 +97,13 @@ static int crypto_xchacha_crypt(struct skcipher_request *req, bool arch)
memcpy(&real_iv[8], req->iv + 16, 8); /* remaining 64 nonce bits */
/* Generate the stream and XOR it with the data */
- return chacha_stream_xor(req, &subctx, real_iv, arch);
-}
-
-static int crypto_xchacha_crypt_generic(struct skcipher_request *req)
-{
- return crypto_xchacha_crypt(req, false);
-}
-
-static int crypto_xchacha_crypt_arch(struct skcipher_request *req)
-{
- return crypto_xchacha_crypt(req, true);
+ return chacha_stream_xor(req, &subctx, real_iv);
}
static struct skcipher_alg algs[] = {
{
.base.cra_name = "chacha20",
- .base.cra_driver_name = "chacha20-generic",
- .base.cra_priority = 100,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct chacha_ctx),
- .base.cra_module = THIS_MODULE,
-
- .min_keysize = CHACHA_KEY_SIZE,
- .max_keysize = CHACHA_KEY_SIZE,
- .ivsize = CHACHA_IV_SIZE,
- .chunksize = CHACHA_BLOCK_SIZE,
- .setkey = chacha20_setkey,
- .encrypt = crypto_chacha_crypt_generic,
- .decrypt = crypto_chacha_crypt_generic,
- },
- {
- .base.cra_name = "xchacha20",
- .base.cra_driver_name = "xchacha20-generic",
- .base.cra_priority = 100,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct chacha_ctx),
- .base.cra_module = THIS_MODULE,
-
- .min_keysize = CHACHA_KEY_SIZE,
- .max_keysize = CHACHA_KEY_SIZE,
- .ivsize = XCHACHA_IV_SIZE,
- .chunksize = CHACHA_BLOCK_SIZE,
- .setkey = chacha20_setkey,
- .encrypt = crypto_xchacha_crypt_generic,
- .decrypt = crypto_xchacha_crypt_generic,
- },
- {
- .base.cra_name = "xchacha12",
- .base.cra_driver_name = "xchacha12-generic",
- .base.cra_priority = 100,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct chacha_ctx),
- .base.cra_module = THIS_MODULE,
-
- .min_keysize = CHACHA_KEY_SIZE,
- .max_keysize = CHACHA_KEY_SIZE,
- .ivsize = XCHACHA_IV_SIZE,
- .chunksize = CHACHA_BLOCK_SIZE,
- .setkey = chacha12_setkey,
- .encrypt = crypto_xchacha_crypt_generic,
- .decrypt = crypto_xchacha_crypt_generic,
- },
- {
- .base.cra_name = "chacha20",
- .base.cra_driver_name = "chacha20-" __stringify(ARCH),
+ .base.cra_driver_name = "chacha20-lib",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
@@ -188,12 +114,12 @@ static struct skcipher_alg algs[] = {
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = chacha20_setkey,
- .encrypt = crypto_chacha_crypt_arch,
- .decrypt = crypto_chacha_crypt_arch,
+ .encrypt = crypto_chacha_crypt,
+ .decrypt = crypto_chacha_crypt,
},
{
.base.cra_name = "xchacha20",
- .base.cra_driver_name = "xchacha20-" __stringify(ARCH),
+ .base.cra_driver_name = "xchacha20-lib",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
@@ -204,12 +130,12 @@ static struct skcipher_alg algs[] = {
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = chacha20_setkey,
- .encrypt = crypto_xchacha_crypt_arch,
- .decrypt = crypto_xchacha_crypt_arch,
+ .encrypt = crypto_xchacha_crypt,
+ .decrypt = crypto_xchacha_crypt,
},
{
.base.cra_name = "xchacha12",
- .base.cra_driver_name = "xchacha12-" __stringify(ARCH),
+ .base.cra_driver_name = "xchacha12-lib",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
@@ -220,27 +146,19 @@ static struct skcipher_alg algs[] = {
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = chacha12_setkey,
- .encrypt = crypto_xchacha_crypt_arch,
- .decrypt = crypto_xchacha_crypt_arch,
+ .encrypt = crypto_xchacha_crypt,
+ .decrypt = crypto_xchacha_crypt,
}
};
-static unsigned int num_algs;
-
static int __init crypto_chacha_mod_init(void)
{
- /* register the arch flavours only if they differ from generic */
- num_algs = ARRAY_SIZE(algs);
- BUILD_BUG_ON(ARRAY_SIZE(algs) % 2 != 0);
- if (!chacha_is_arch_optimized())
- num_algs /= 2;
-
- return crypto_register_skciphers(algs, num_algs);
+ return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
}
static void __exit crypto_chacha_mod_fini(void)
{
- crypto_unregister_skciphers(algs, num_algs);
+ crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}
module_init(crypto_chacha_mod_init);
@@ -250,11 +168,8 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
MODULE_DESCRIPTION("Crypto API wrappers for the ChaCha20, XChaCha20, and XChaCha12 stream ciphers");
MODULE_ALIAS_CRYPTO("chacha20");
-MODULE_ALIAS_CRYPTO("chacha20-generic");
-MODULE_ALIAS_CRYPTO("chacha20-" __stringify(ARCH));
+MODULE_ALIAS_CRYPTO("chacha20-lib");
MODULE_ALIAS_CRYPTO("xchacha20");
-MODULE_ALIAS_CRYPTO("xchacha20-generic");
-MODULE_ALIAS_CRYPTO("xchacha20-" __stringify(ARCH));
+MODULE_ALIAS_CRYPTO("xchacha20-lib");
MODULE_ALIAS_CRYPTO("xchacha12");
-MODULE_ALIAS_CRYPTO("xchacha12-generic");
-MODULE_ALIAS_CRYPTO("xchacha12-" __stringify(ARCH));
+MODULE_ALIAS_CRYPTO("xchacha12-lib");
diff --git a/crypto/curve25519-generic.c b/crypto/curve25519-generic.c
deleted file mode 100644
index f3e56e73c66c..000000000000
--- a/crypto/curve25519-generic.c
+++ /dev/null
@@ -1,91 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#include <crypto/curve25519.h>
-#include <crypto/internal/kpp.h>
-#include <crypto/kpp.h>
-#include <linux/module.h>
-#include <linux/scatterlist.h>
-
-static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
- unsigned int len)
-{
- u8 *secret = kpp_tfm_ctx(tfm);
-
- if (!len)
- curve25519_generate_secret(secret);
- else if (len == CURVE25519_KEY_SIZE &&
- crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
- memcpy(secret, buf, CURVE25519_KEY_SIZE);
- else
- return -EINVAL;
- return 0;
-}
-
-static int curve25519_compute_value(struct kpp_request *req)
-{
- struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
- const u8 *secret = kpp_tfm_ctx(tfm);
- u8 public_key[CURVE25519_KEY_SIZE];
- u8 buf[CURVE25519_KEY_SIZE];
- int copied, nbytes;
- u8 const *bp;
-
- if (req->src) {
- copied = sg_copy_to_buffer(req->src,
- sg_nents_for_len(req->src,
- CURVE25519_KEY_SIZE),
- public_key, CURVE25519_KEY_SIZE);
- if (copied != CURVE25519_KEY_SIZE)
- return -EINVAL;
- bp = public_key;
- } else {
- bp = curve25519_base_point;
- }
-
- curve25519_generic(buf, secret, bp);
-
- /* might want less than we've got */
- nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
- copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
- nbytes),
- buf, nbytes);
- if (copied != nbytes)
- return -EINVAL;
- return 0;
-}
-
-static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
-{
- return CURVE25519_KEY_SIZE;
-}
-
-static struct kpp_alg curve25519_alg = {
- .base.cra_name = "curve25519",
- .base.cra_driver_name = "curve25519-generic",
- .base.cra_priority = 100,
- .base.cra_module = THIS_MODULE,
- .base.cra_ctxsize = CURVE25519_KEY_SIZE,
-
- .set_secret = curve25519_set_secret,
- .generate_public_key = curve25519_compute_value,
- .compute_shared_secret = curve25519_compute_value,
- .max_size = curve25519_max_size,
-};
-
-static int __init curve25519_init(void)
-{
- return crypto_register_kpp(&curve25519_alg);
-}
-
-static void __exit curve25519_exit(void)
-{
- crypto_unregister_kpp(&curve25519_alg);
-}
-
-module_init(curve25519_init);
-module_exit(curve25519_exit);
-
-MODULE_ALIAS_CRYPTO("curve25519");
-MODULE_ALIAS_CRYPTO("curve25519-generic");
-MODULE_DESCRIPTION("Curve25519 elliptic curve (RFC7748)");
-MODULE_LICENSE("GPL");
diff --git a/crypto/md5.c b/crypto/md5.c
index 32c0819f5118..c167d203c710 100644
--- a/crypto/md5.c
+++ b/crypto/md5.c
@@ -1,25 +1,62 @@
-/*
- * Cryptographic API.
- *
- * MD5 Message Digest Algorithm (RFC1321).
- *
- * Derived from cryptoapi implementation, originally based on the
- * public domain implementation written by Colin Plumb in 1993.
- *
- * Copyright (c) Cryptoapi developers.
- * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Crypto API support for MD5 and HMAC-MD5
*
+ * Copyright 2025 Google LLC
*/
#include <crypto/internal/hash.h>
#include <crypto/md5.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/string.h>
+
+/*
+ * Export and import functions. crypto_shash wants a particular format that
+ * matches that used by some legacy drivers. It currently is the same as the
+ * library MD5 context, except the value in bytecount must be block-aligned and
+ * the remainder must be stored in an extra u8 appended to the struct.
+ */
+
+#define MD5_SHASH_STATE_SIZE (sizeof(struct md5_ctx) + 1)
+static_assert(sizeof(struct md5_ctx) == sizeof(struct md5_state));
+static_assert(offsetof(struct md5_ctx, state) == offsetof(struct md5_state, hash));
+static_assert(offsetof(struct md5_ctx, bytecount) == offsetof(struct md5_state, byte_count));
+static_assert(offsetof(struct md5_ctx, buf) == offsetof(struct md5_state, block));
+
+static int __crypto_md5_export(const struct md5_ctx *ctx0, void *out)
+{
+ struct md5_ctx ctx = *ctx0;
+ unsigned int partial;
+ u8 *p = out;
+
+ partial = ctx.bytecount % MD5_BLOCK_SIZE;
+ ctx.bytecount -= partial;
+ memcpy(p, &ctx, sizeof(ctx));
+ p += sizeof(ctx);
+ *p = partial;
+ return 0;
+}
+
+static int __crypto_md5_import(struct md5_ctx *ctx, const void *in)
+{
+ const u8 *p = in;
+
+ memcpy(ctx, p, sizeof(*ctx));
+ p += sizeof(*ctx);
+ ctx->bytecount += *p;
+ return 0;
+}
+
+static int __crypto_md5_export_core(const struct md5_ctx *ctx, void *out)
+{
+ memcpy(out, ctx, offsetof(struct md5_ctx, buf));
+ return 0;
+}
+
+static int __crypto_md5_import_core(struct md5_ctx *ctx, const void *in)
+{
+ memcpy(ctx, in, offsetof(struct md5_ctx, buf));
+ return 0;
+}
const u8 md5_zero_message_hash[MD5_DIGEST_SIZE] = {
0xd4, 0x1d, 0x8c, 0xd9, 0x8f, 0x00, 0xb2, 0x04,
@@ -27,198 +64,173 @@ const u8 md5_zero_message_hash[MD5_DIGEST_SIZE] = {
};
EXPORT_SYMBOL_GPL(md5_zero_message_hash);
-#define F1(x, y, z) (z ^ (x & (y ^ z)))
-#define F2(x, y, z) F1(z, x, y)
-#define F3(x, y, z) (x ^ y ^ z)
-#define F4(x, y, z) (y ^ (x | ~z))
-
-#define MD5STEP(f, w, x, y, z, in, s) \
- (w += f(x, y, z) + in, w = (w<<s | w>>(32-s)) + x)
-
-static void md5_transform(__u32 *hash, __u32 const *in)
-{
- u32 a, b, c, d;
-
- a = hash[0];
- b = hash[1];
- c = hash[2];
- d = hash[3];
-
- MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
- MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
- MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
- MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
- MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
- MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
- MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
- MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
- MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
- MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
- MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
- MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
- MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
- MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
- MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
- MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
-
- MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
- MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
- MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
- MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
- MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
- MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
- MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
- MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
- MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
- MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
- MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
- MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
- MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
- MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
- MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
- MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
-
- MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
- MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
- MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
- MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
- MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
- MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
- MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
- MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
- MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
- MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
- MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
- MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
- MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
- MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
- MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
- MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
-
- MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
- MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
- MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
- MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
- MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
- MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
- MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
- MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
- MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
- MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
- MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
- MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
- MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
- MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
- MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
- MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
-
- hash[0] += a;
- hash[1] += b;
- hash[2] += c;
- hash[3] += d;
-}
-
-static inline void md5_transform_helper(struct md5_state *ctx,
- u32 block[MD5_BLOCK_WORDS])
-{
- le32_to_cpu_array(block, MD5_BLOCK_WORDS);
- md5_transform(ctx->hash, block);
-}
-
-static int md5_init(struct shash_desc *desc)
-{
- struct md5_state *mctx = shash_desc_ctx(desc);
-
- mctx->hash[0] = MD5_H0;
- mctx->hash[1] = MD5_H1;
- mctx->hash[2] = MD5_H2;
- mctx->hash[3] = MD5_H3;
- mctx->byte_count = 0;
+#define MD5_CTX(desc) ((struct md5_ctx *)shash_desc_ctx(desc))
+static int crypto_md5_init(struct shash_desc *desc)
+{
+ md5_init(MD5_CTX(desc));
return 0;
}
-static int md5_update(struct shash_desc *desc, const u8 *data, unsigned int len)
-{
- struct md5_state *mctx = shash_desc_ctx(desc);
- u32 block[MD5_BLOCK_WORDS];
-
- mctx->byte_count += len;
- do {
- memcpy(block, data, sizeof(block));
- md5_transform_helper(mctx, block);
- data += sizeof(block);
- len -= sizeof(block);
- } while (len >= sizeof(block));
- memzero_explicit(block, sizeof(block));
- mctx->byte_count -= len;
- return len;
-}
-
-static int md5_finup(struct shash_desc *desc, const u8 *data, unsigned int len,
- u8 *out)
-{
- struct md5_state *mctx = shash_desc_ctx(desc);
- u32 block[MD5_BLOCK_WORDS];
- unsigned int offset;
- int padding;
- char *p;
-
- memcpy(block, data, len);
-
- offset = len;
- p = (char *)block + offset;
- padding = 56 - (offset + 1);
-
- *p++ = 0x80;
- if (padding < 0) {
- memset(p, 0x00, padding + sizeof (u64));
- md5_transform_helper(mctx, block);
- p = (char *)block;
- padding = 56;
- }
-
- memset(p, 0, padding);
- mctx->byte_count += len;
- block[14] = mctx->byte_count << 3;
- block[15] = mctx->byte_count >> 29;
- le32_to_cpu_array(block, (sizeof(block) - sizeof(u64)) / sizeof(u32));
- md5_transform(mctx->hash, block);
- memzero_explicit(block, sizeof(block));
- cpu_to_le32_array(mctx->hash, sizeof(mctx->hash) / sizeof(u32));
- memcpy(out, mctx->hash, sizeof(mctx->hash));
+static int crypto_md5_update(struct shash_desc *desc,
+ const u8 *data, unsigned int len)
+{
+ md5_update(MD5_CTX(desc), data, len);
+ return 0;
+}
+static int crypto_md5_final(struct shash_desc *desc, u8 *out)
+{
+ md5_final(MD5_CTX(desc), out);
return 0;
}
-static struct shash_alg alg = {
- .digestsize = MD5_DIGEST_SIZE,
- .init = md5_init,
- .update = md5_update,
- .finup = md5_finup,
- .descsize = MD5_STATE_SIZE,
- .base = {
- .cra_name = "md5",
- .cra_driver_name = "md5-generic",
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = MD5_HMAC_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-};
+static int crypto_md5_digest(struct shash_desc *desc,
+ const u8 *data, unsigned int len, u8 *out)
+{
+ md5(data, len, out);
+ return 0;
+}
+
+static int crypto_md5_export(struct shash_desc *desc, void *out)
+{
+ return __crypto_md5_export(MD5_CTX(desc), out);
+}
+
+static int crypto_md5_import(struct shash_desc *desc, const void *in)
+{
+ return __crypto_md5_import(MD5_CTX(desc), in);
+}
-static int __init md5_mod_init(void)
+static int crypto_md5_export_core(struct shash_desc *desc, void *out)
{
- return crypto_register_shash(&alg);
+ return __crypto_md5_export_core(MD5_CTX(desc), out);
}
-static void __exit md5_mod_fini(void)
+static int crypto_md5_import_core(struct shash_desc *desc, const void *in)
{
- crypto_unregister_shash(&alg);
+ return __crypto_md5_import_core(MD5_CTX(desc), in);
}
-module_init(md5_mod_init);
-module_exit(md5_mod_fini);
+#define HMAC_MD5_KEY(tfm) ((struct hmac_md5_key *)crypto_shash_ctx(tfm))
+#define HMAC_MD5_CTX(desc) ((struct hmac_md5_ctx *)shash_desc_ctx(desc))
+
+static int crypto_hmac_md5_setkey(struct crypto_shash *tfm,
+ const u8 *raw_key, unsigned int keylen)
+{
+ hmac_md5_preparekey(HMAC_MD5_KEY(tfm), raw_key, keylen);
+ return 0;
+}
+
+static int crypto_hmac_md5_init(struct shash_desc *desc)
+{
+ hmac_md5_init(HMAC_MD5_CTX(desc), HMAC_MD5_KEY(desc->tfm));
+ return 0;
+}
+
+static int crypto_hmac_md5_update(struct shash_desc *desc,
+ const u8 *data, unsigned int len)
+{
+ hmac_md5_update(HMAC_MD5_CTX(desc), data, len);
+ return 0;
+}
+
+static int crypto_hmac_md5_final(struct shash_desc *desc, u8 *out)
+{
+ hmac_md5_final(HMAC_MD5_CTX(desc), out);
+ return 0;
+}
+
+static int crypto_hmac_md5_digest(struct shash_desc *desc,
+ const u8 *data, unsigned int len, u8 *out)
+{
+ hmac_md5(HMAC_MD5_KEY(desc->tfm), data, len, out);
+ return 0;
+}
+
+static int crypto_hmac_md5_export(struct shash_desc *desc, void *out)
+{
+ return __crypto_md5_export(&HMAC_MD5_CTX(desc)->hash_ctx, out);
+}
+
+static int crypto_hmac_md5_import(struct shash_desc *desc, const void *in)
+{
+ struct hmac_md5_ctx *ctx = HMAC_MD5_CTX(desc);
+
+ ctx->ostate = HMAC_MD5_KEY(desc->tfm)->ostate;
+ return __crypto_md5_import(&ctx->hash_ctx, in);
+}
+
+static int crypto_hmac_md5_export_core(struct shash_desc *desc, void *out)
+{
+ return __crypto_md5_export_core(&HMAC_MD5_CTX(desc)->hash_ctx, out);
+}
+
+static int crypto_hmac_md5_import_core(struct shash_desc *desc, const void *in)
+{
+ struct hmac_md5_ctx *ctx = HMAC_MD5_CTX(desc);
+
+ ctx->ostate = HMAC_MD5_KEY(desc->tfm)->ostate;
+ return __crypto_md5_import_core(&ctx->hash_ctx, in);
+}
+
+static struct shash_alg algs[] = {
+ {
+ .base.cra_name = "md5",
+ .base.cra_driver_name = "md5-lib",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = MD5_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+ .digestsize = MD5_DIGEST_SIZE,
+ .init = crypto_md5_init,
+ .update = crypto_md5_update,
+ .final = crypto_md5_final,
+ .digest = crypto_md5_digest,
+ .export = crypto_md5_export,
+ .import = crypto_md5_import,
+ .export_core = crypto_md5_export_core,
+ .import_core = crypto_md5_import_core,
+ .descsize = sizeof(struct md5_ctx),
+ .statesize = MD5_SHASH_STATE_SIZE,
+ },
+ {
+ .base.cra_name = "hmac(md5)",
+ .base.cra_driver_name = "hmac-md5-lib",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = MD5_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct hmac_md5_key),
+ .base.cra_module = THIS_MODULE,
+ .digestsize = MD5_DIGEST_SIZE,
+ .setkey = crypto_hmac_md5_setkey,
+ .init = crypto_hmac_md5_init,
+ .update = crypto_hmac_md5_update,
+ .final = crypto_hmac_md5_final,
+ .digest = crypto_hmac_md5_digest,
+ .export = crypto_hmac_md5_export,
+ .import = crypto_hmac_md5_import,
+ .export_core = crypto_hmac_md5_export_core,
+ .import_core = crypto_hmac_md5_import_core,
+ .descsize = sizeof(struct hmac_md5_ctx),
+ .statesize = MD5_SHASH_STATE_SIZE,
+ },
+};
+
+static int __init crypto_md5_mod_init(void)
+{
+ return crypto_register_shashes(algs, ARRAY_SIZE(algs));
+}
+module_init(crypto_md5_mod_init);
+
+static void __exit crypto_md5_mod_exit(void)
+{
+ crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+}
+module_exit(crypto_md5_mod_exit);
MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("MD5 Message Digest Algorithm");
+MODULE_DESCRIPTION("Crypto API support for MD5 and HMAC-MD5");
+
MODULE_ALIAS_CRYPTO("md5");
+MODULE_ALIAS_CRYPTO("md5-lib");
+MODULE_ALIAS_CRYPTO("hmac(md5)");
+MODULE_ALIAS_CRYPTO("hmac-md5-lib");
diff --git a/crypto/sha1.c b/crypto/sha1.c
index ecef4bf2d9c0..4fbf61cf0370 100644
--- a/crypto/sha1.c
+++ b/crypto/sha1.c
@@ -49,6 +49,18 @@ static int __crypto_sha1_import(struct sha1_ctx *ctx, const void *in)
return 0;
}
+static int __crypto_sha1_export_core(const struct sha1_ctx *ctx, void *out)
+{
+ memcpy(out, ctx, offsetof(struct sha1_ctx, buf));
+ return 0;
+}
+
+static int __crypto_sha1_import_core(struct sha1_ctx *ctx, const void *in)
+{
+ memcpy(ctx, in, offsetof(struct sha1_ctx, buf));
+ return 0;
+}
+
const u8 sha1_zero_message_hash[SHA1_DIGEST_SIZE] = {
0xda, 0x39, 0xa3, 0xee, 0x5e, 0x6b, 0x4b, 0x0d,
0x32, 0x55, 0xbf, 0xef, 0x95, 0x60, 0x18, 0x90,
@@ -94,6 +106,16 @@ static int crypto_sha1_import(struct shash_desc *desc, const void *in)
return __crypto_sha1_import(SHA1_CTX(desc), in);
}
+static int crypto_sha1_export_core(struct shash_desc *desc, void *out)
+{
+ return __crypto_sha1_export_core(SHA1_CTX(desc), out);
+}
+
+static int crypto_sha1_import_core(struct shash_desc *desc, const void *in)
+{
+ return __crypto_sha1_import_core(SHA1_CTX(desc), in);
+}
+
#define HMAC_SHA1_KEY(tfm) ((struct hmac_sha1_key *)crypto_shash_ctx(tfm))
#define HMAC_SHA1_CTX(desc) ((struct hmac_sha1_ctx *)shash_desc_ctx(desc))
@@ -143,6 +165,19 @@ static int crypto_hmac_sha1_import(struct shash_desc *desc, const void *in)
return __crypto_sha1_import(&ctx->sha_ctx, in);
}
+static int crypto_hmac_sha1_export_core(struct shash_desc *desc, void *out)
+{
+ return __crypto_sha1_export_core(&HMAC_SHA1_CTX(desc)->sha_ctx, out);
+}
+
+static int crypto_hmac_sha1_import_core(struct shash_desc *desc, const void *in)
+{
+ struct hmac_sha1_ctx *ctx = HMAC_SHA1_CTX(desc);
+
+ ctx->ostate = HMAC_SHA1_KEY(desc->tfm)->ostate;
+ return __crypto_sha1_import_core(&ctx->sha_ctx, in);
+}
+
static struct shash_alg algs[] = {
{
.base.cra_name = "sha1",
@@ -157,6 +192,8 @@ static struct shash_alg algs[] = {
.digest = crypto_sha1_digest,
.export = crypto_sha1_export,
.import = crypto_sha1_import,
+ .export_core = crypto_sha1_export_core,
+ .import_core = crypto_sha1_import_core,
.descsize = sizeof(struct sha1_ctx),
.statesize = SHA1_SHASH_STATE_SIZE,
},
@@ -175,6 +212,8 @@ static struct shash_alg algs[] = {
.digest = crypto_hmac_sha1_digest,
.export = crypto_hmac_sha1_export,
.import = crypto_hmac_sha1_import,
+ .export_core = crypto_hmac_sha1_export_core,
+ .import_core = crypto_hmac_sha1_import_core,
.descsize = sizeof(struct hmac_sha1_ctx),
.statesize = SHA1_SHASH_STATE_SIZE,
},
diff --git a/crypto/sha256.c b/crypto/sha256.c
index 052806559f06..fb81defe084c 100644
--- a/crypto/sha256.c
+++ b/crypto/sha256.c
@@ -50,6 +50,19 @@ static int __crypto_sha256_import(struct __sha256_ctx *ctx, const void *in)
return 0;
}
+static int __crypto_sha256_export_core(const struct __sha256_ctx *ctx,
+ void *out)
+{
+ memcpy(out, ctx, offsetof(struct __sha256_ctx, buf));
+ return 0;
+}
+
+static int __crypto_sha256_import_core(struct __sha256_ctx *ctx, const void *in)
+{
+ memcpy(ctx, in, offsetof(struct __sha256_ctx, buf));
+ return 0;
+}
+
/* SHA-224 */
const u8 sha224_zero_message_hash[SHA224_DIGEST_SIZE] = {
@@ -98,6 +111,16 @@ static int crypto_sha224_import(struct shash_desc *desc, const void *in)
return __crypto_sha256_import(&SHA224_CTX(desc)->ctx, in);
}
+static int crypto_sha224_export_core(struct shash_desc *desc, void *out)
+{
+ return __crypto_sha256_export_core(&SHA224_CTX(desc)->ctx, out);
+}
+
+static int crypto_sha224_import_core(struct shash_desc *desc, const void *in)
+{
+ return __crypto_sha256_import_core(&SHA224_CTX(desc)->ctx, in);
+}
+
/* SHA-256 */
const u8 sha256_zero_message_hash[SHA256_DIGEST_SIZE] = {
@@ -146,6 +169,16 @@ static int crypto_sha256_import(struct shash_desc *desc, const void *in)
return __crypto_sha256_import(&SHA256_CTX(desc)->ctx, in);
}
+static int crypto_sha256_export_core(struct shash_desc *desc, void *out)
+{
+ return __crypto_sha256_export_core(&SHA256_CTX(desc)->ctx, out);
+}
+
+static int crypto_sha256_import_core(struct shash_desc *desc, const void *in)
+{
+ return __crypto_sha256_import_core(&SHA256_CTX(desc)->ctx, in);
+}
+
/* HMAC-SHA224 */
#define HMAC_SHA224_KEY(tfm) ((struct hmac_sha224_key *)crypto_shash_ctx(tfm))
@@ -198,6 +231,21 @@ static int crypto_hmac_sha224_import(struct shash_desc *desc, const void *in)
return __crypto_sha256_import(&ctx->ctx.sha_ctx, in);
}
+static int crypto_hmac_sha224_export_core(struct shash_desc *desc, void *out)
+{
+ return __crypto_sha256_export_core(&HMAC_SHA224_CTX(desc)->ctx.sha_ctx,
+ out);
+}
+
+static int crypto_hmac_sha224_import_core(struct shash_desc *desc,
+ const void *in)
+{
+ struct hmac_sha224_ctx *ctx = HMAC_SHA224_CTX(desc);
+
+ ctx->ctx.ostate = HMAC_SHA224_KEY(desc->tfm)->key.ostate;
+ return __crypto_sha256_import_core(&ctx->ctx.sha_ctx, in);
+}
+
/* HMAC-SHA256 */
#define HMAC_SHA256_KEY(tfm) ((struct hmac_sha256_key *)crypto_shash_ctx(tfm))
@@ -250,6 +298,21 @@ static int crypto_hmac_sha256_import(struct shash_desc *desc, const void *in)
return __crypto_sha256_import(&ctx->ctx.sha_ctx, in);
}
+static int crypto_hmac_sha256_export_core(struct shash_desc *desc, void *out)
+{
+ return __crypto_sha256_export_core(&HMAC_SHA256_CTX(desc)->ctx.sha_ctx,
+ out);
+}
+
+static int crypto_hmac_sha256_import_core(struct shash_desc *desc,
+ const void *in)
+{
+ struct hmac_sha256_ctx *ctx = HMAC_SHA256_CTX(desc);
+
+ ctx->ctx.ostate = HMAC_SHA256_KEY(desc->tfm)->key.ostate;
+ return __crypto_sha256_import_core(&ctx->ctx.sha_ctx, in);
+}
+
/* Algorithm definitions */
static struct shash_alg algs[] = {
@@ -266,6 +329,8 @@ static struct shash_alg algs[] = {
.digest = crypto_sha224_digest,
.export = crypto_sha224_export,
.import = crypto_sha224_import,
+ .export_core = crypto_sha224_export_core,
+ .import_core = crypto_sha224_import_core,
.descsize = sizeof(struct sha224_ctx),
.statesize = SHA256_SHASH_STATE_SIZE,
},
@@ -282,6 +347,8 @@ static struct shash_alg algs[] = {
.digest = crypto_sha256_digest,
.export = crypto_sha256_export,
.import = crypto_sha256_import,
+ .export_core = crypto_sha256_export_core,
+ .import_core = crypto_sha256_import_core,
.descsize = sizeof(struct sha256_ctx),
.statesize = SHA256_SHASH_STATE_SIZE,
},
@@ -300,6 +367,8 @@ static struct shash_alg algs[] = {
.digest = crypto_hmac_sha224_digest,
.export = crypto_hmac_sha224_export,
.import = crypto_hmac_sha224_import,
+ .export_core = crypto_hmac_sha224_export_core,
+ .import_core = crypto_hmac_sha224_import_core,
.descsize = sizeof(struct hmac_sha224_ctx),
.statesize = SHA256_SHASH_STATE_SIZE,
},
@@ -318,6 +387,8 @@ static struct shash_alg algs[] = {
.digest = crypto_hmac_sha256_digest,
.export = crypto_hmac_sha256_export,
.import = crypto_hmac_sha256_import,
+ .export_core = crypto_hmac_sha256_export_core,
+ .import_core = crypto_hmac_sha256_import_core,
.descsize = sizeof(struct hmac_sha256_ctx),
.statesize = SHA256_SHASH_STATE_SIZE,
},
diff --git a/crypto/sha512.c b/crypto/sha512.c
index fb1c520978ef..d320fe53913f 100644
--- a/crypto/sha512.c
+++ b/crypto/sha512.c
@@ -50,6 +50,19 @@ static int __crypto_sha512_import(struct __sha512_ctx *ctx, const void *in)
return 0;
}
+static int __crypto_sha512_export_core(const struct __sha512_ctx *ctx,
+ void *out)
+{
+ memcpy(out, ctx, offsetof(struct __sha512_ctx, buf));
+ return 0;
+}
+
+static int __crypto_sha512_import_core(struct __sha512_ctx *ctx, const void *in)
+{
+ memcpy(ctx, in, offsetof(struct __sha512_ctx, buf));
+ return 0;
+}
+
/* SHA-384 */
const u8 sha384_zero_message_hash[SHA384_DIGEST_SIZE] = {
@@ -100,6 +113,16 @@ static int crypto_sha384_import(struct shash_desc *desc, const void *in)
return __crypto_sha512_import(&SHA384_CTX(desc)->ctx, in);
}
+static int crypto_sha384_export_core(struct shash_desc *desc, void *out)
+{
+ return __crypto_sha512_export_core(&SHA384_CTX(desc)->ctx, out);
+}
+
+static int crypto_sha384_import_core(struct shash_desc *desc, const void *in)
+{
+ return __crypto_sha512_import_core(&SHA384_CTX(desc)->ctx, in);
+}
+
/* SHA-512 */
const u8 sha512_zero_message_hash[SHA512_DIGEST_SIZE] = {
@@ -152,6 +175,16 @@ static int crypto_sha512_import(struct shash_desc *desc, const void *in)
return __crypto_sha512_import(&SHA512_CTX(desc)->ctx, in);
}
+static int crypto_sha512_export_core(struct shash_desc *desc, void *out)
+{
+ return __crypto_sha512_export_core(&SHA512_CTX(desc)->ctx, out);
+}
+
+static int crypto_sha512_import_core(struct shash_desc *desc, const void *in)
+{
+ return __crypto_sha512_import_core(&SHA512_CTX(desc)->ctx, in);
+}
+
/* HMAC-SHA384 */
#define HMAC_SHA384_KEY(tfm) ((struct hmac_sha384_key *)crypto_shash_ctx(tfm))
@@ -204,6 +237,21 @@ static int crypto_hmac_sha384_import(struct shash_desc *desc, const void *in)
return __crypto_sha512_import(&ctx->ctx.sha_ctx, in);
}
+static int crypto_hmac_sha384_export_core(struct shash_desc *desc, void *out)
+{
+ return __crypto_sha512_export_core(&HMAC_SHA384_CTX(desc)->ctx.sha_ctx,
+ out);
+}
+
+static int crypto_hmac_sha384_import_core(struct shash_desc *desc,
+ const void *in)
+{
+ struct hmac_sha384_ctx *ctx = HMAC_SHA384_CTX(desc);
+
+ ctx->ctx.ostate = HMAC_SHA384_KEY(desc->tfm)->key.ostate;
+ return __crypto_sha512_import_core(&ctx->ctx.sha_ctx, in);
+}
+
/* HMAC-SHA512 */
#define HMAC_SHA512_KEY(tfm) ((struct hmac_sha512_key *)crypto_shash_ctx(tfm))
@@ -256,6 +304,21 @@ static int crypto_hmac_sha512_import(struct shash_desc *desc, const void *in)
return __crypto_sha512_import(&ctx->ctx.sha_ctx, in);
}
+static int crypto_hmac_sha512_export_core(struct shash_desc *desc, void *out)
+{
+ return __crypto_sha512_export_core(&HMAC_SHA512_CTX(desc)->ctx.sha_ctx,
+ out);
+}
+
+static int crypto_hmac_sha512_import_core(struct shash_desc *desc,
+ const void *in)
+{
+ struct hmac_sha512_ctx *ctx = HMAC_SHA512_CTX(desc);
+
+ ctx->ctx.ostate = HMAC_SHA512_KEY(desc->tfm)->key.ostate;
+ return __crypto_sha512_import_core(&ctx->ctx.sha_ctx, in);
+}
+
/* Algorithm definitions */
static struct shash_alg algs[] = {
@@ -272,6 +335,8 @@ static struct shash_alg algs[] = {
.digest = crypto_sha384_digest,
.export = crypto_sha384_export,
.import = crypto_sha384_import,
+ .export_core = crypto_sha384_export_core,
+ .import_core = crypto_sha384_import_core,
.descsize = sizeof(struct sha384_ctx),
.statesize = SHA512_SHASH_STATE_SIZE,
},
@@ -288,6 +353,8 @@ static struct shash_alg algs[] = {
.digest = crypto_sha512_digest,
.export = crypto_sha512_export,
.import = crypto_sha512_import,
+ .export_core = crypto_sha512_export_core,
+ .import_core = crypto_sha512_import_core,
.descsize = sizeof(struct sha512_ctx),
.statesize = SHA512_SHASH_STATE_SIZE,
},
@@ -306,6 +373,8 @@ static struct shash_alg algs[] = {
.digest = crypto_hmac_sha384_digest,
.export = crypto_hmac_sha384_export,
.import = crypto_hmac_sha384_import,
+ .export_core = crypto_hmac_sha384_export_core,
+ .import_core = crypto_hmac_sha384_import_core,
.descsize = sizeof(struct hmac_sha384_ctx),
.statesize = SHA512_SHASH_STATE_SIZE,
},
@@ -324,6 +393,8 @@ static struct shash_alg algs[] = {
.digest = crypto_hmac_sha512_digest,
.export = crypto_hmac_sha512_export,
.import = crypto_hmac_sha512_import,
+ .export_core = crypto_hmac_sha512_export_core,
+ .import_core = crypto_hmac_sha512_import_core,
.descsize = sizeof(struct hmac_sha512_ctx),
.statesize = SHA512_SHASH_STATE_SIZE,
},
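The export_core/import_core helpers added above serialize only the running hash state: the memcpy() stops at offsetof(..., buf), so the partial input block that follows is left out of the exported blob. A minimal sketch of that offsetof() technique, using a hypothetical context layout rather than the kernel's real structs:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical context: everything before 'buf' is the "core" state. */
struct sha_like_ctx {
	uint64_t state[8];	/* running digest state */
	uint64_t bytecount;	/* total bytes processed so far */
	uint8_t buf[128];	/* buffered partial block -- not exported */
};

static void export_core(const struct sha_like_ctx *ctx, void *out)
{
	/* Copy only the members that precede 'buf'. */
	memcpy(out, ctx, offsetof(struct sha_like_ctx, buf));
}

static void import_core(struct sha_like_ctx *ctx, const void *in)
{
	memcpy(ctx, in, offsetof(struct sha_like_ctx, buf));
}

The HMAC variants follow the same pattern but additionally refill ctx->ctx.ostate from the tfm's key on import_core, since the outer-hash state is not part of the exported core blob.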
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index ee33ba21ae2b..9dca41e7ee73 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -4152,14 +4152,14 @@ static int alg_test_null(const struct alg_test_desc *desc,
static const struct alg_test_desc alg_test_descs[] = {
{
.alg = "adiantum(xchacha12,aes)",
- .generic_driver = "adiantum(xchacha12-generic,aes-generic,nhpoly1305-generic)",
+ .generic_driver = "adiantum(xchacha12-lib,aes-generic,nhpoly1305-generic)",
.test = alg_test_skcipher,
.suite = {
.cipher = __VECS(adiantum_xchacha12_aes_tv_template)
},
}, {
.alg = "adiantum(xchacha20,aes)",
- .generic_driver = "adiantum(xchacha20-generic,aes-generic,nhpoly1305-generic)",
+ .generic_driver = "adiantum(xchacha20-lib,aes-generic,nhpoly1305-generic)",
.test = alg_test_skcipher,
.suite = {
.cipher = __VECS(adiantum_xchacha20_aes_tv_template)
@@ -4178,6 +4178,7 @@ static const struct alg_test_desc alg_test_descs[] = {
}
}, {
.alg = "authenc(hmac(md5),ecb(cipher_null))",
+ .generic_driver = "authenc(hmac-md5-lib,ecb-cipher_null)",
.test = alg_test_aead,
.suite = {
.aead = __VECS(hmac_md5_ecb_cipher_null_tv_template)
@@ -4484,6 +4485,7 @@ static const struct alg_test_desc alg_test_descs[] = {
}
}, {
.alg = "chacha20",
+ .generic_driver = "chacha20-lib",
.test = alg_test_skcipher,
.suite = {
.cipher = __VECS(chacha20_tv_template)
@@ -4640,12 +4642,6 @@ static const struct alg_test_desc alg_test_descs[] = {
.cipher = __VECS(sm4_cts_tv_template)
}
}, {
- .alg = "curve25519",
- .test = alg_test_kpp,
- .suite = {
- .kpp = __VECS(curve25519_tv_template)
- }
- }, {
.alg = "deflate",
.test = alg_test_comp,
.fips_allowed = 1,
@@ -5064,6 +5060,7 @@ static const struct alg_test_desc alg_test_descs[] = {
}
}, {
.alg = "hmac(md5)",
+ .generic_driver = "hmac-md5-lib",
.test = alg_test_hash,
.suite = {
.hash = __VECS(hmac_md5_tv_template)
@@ -5250,6 +5247,7 @@ static const struct alg_test_desc alg_test_descs[] = {
}
}, {
.alg = "md5",
+ .generic_driver = "md5-lib",
.test = alg_test_hash,
.suite = {
.hash = __VECS(md5_tv_template)
@@ -5417,12 +5415,14 @@ static const struct alg_test_desc alg_test_descs[] = {
}
}, {
.alg = "rfc7539(chacha20,poly1305)",
+ .generic_driver = "rfc7539(chacha20-lib,poly1305-generic)",
.test = alg_test_aead,
.suite = {
.aead = __VECS(rfc7539_tv_template)
}
}, {
.alg = "rfc7539esp(chacha20,poly1305)",
+ .generic_driver = "rfc7539esp(chacha20-lib,poly1305-generic)",
.test = alg_test_aead,
.suite = {
.aead = {
@@ -5588,12 +5588,14 @@ static const struct alg_test_desc alg_test_descs[] = {
}
}, {
.alg = "xchacha12",
+ .generic_driver = "xchacha12-lib",
.test = alg_test_skcipher,
.suite = {
.cipher = __VECS(xchacha12_tv_template)
},
}, {
.alg = "xchacha20",
+ .generic_driver = "xchacha20-lib",
.test = alg_test_skcipher,
.suite = {
.cipher = __VECS(xchacha20_tv_template)
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 32d099ac9e73..268231227282 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -3798,1231 +3798,6 @@ static const struct kpp_testvec ffdhe8192_dh_tv_template[] __maybe_unused = {
},
};
-static const struct kpp_testvec curve25519_tv_template[] = {
-{
- .secret = (u8[32]){ 0x77, 0x07, 0x6d, 0x0a, 0x73, 0x18, 0xa5, 0x7d,
- 0x3c, 0x16, 0xc1, 0x72, 0x51, 0xb2, 0x66, 0x45,
- 0xdf, 0x4c, 0x2f, 0x87, 0xeb, 0xc0, 0x99, 0x2a,
- 0xb1, 0x77, 0xfb, 0xa5, 0x1d, 0xb9, 0x2c, 0x2a },
- .b_public = (u8[32]){ 0xde, 0x9e, 0xdb, 0x7d, 0x7b, 0x7d, 0xc1, 0xb4,
- 0xd3, 0x5b, 0x61, 0xc2, 0xec, 0xe4, 0x35, 0x37,
- 0x3f, 0x83, 0x43, 0xc8, 0x5b, 0x78, 0x67, 0x4d,
- 0xad, 0xfc, 0x7e, 0x14, 0x6f, 0x88, 0x2b, 0x4f },
- .expected_ss = (u8[32]){ 0x4a, 0x5d, 0x9d, 0x5b, 0xa4, 0xce, 0x2d, 0xe1,
- 0x72, 0x8e, 0x3b, 0xf4, 0x80, 0x35, 0x0f, 0x25,
- 0xe0, 0x7e, 0x21, 0xc9, 0x47, 0xd1, 0x9e, 0x33,
- 0x76, 0xf0, 0x9b, 0x3c, 0x1e, 0x16, 0x17, 0x42 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-{
- .secret = (u8[32]){ 0x5d, 0xab, 0x08, 0x7e, 0x62, 0x4a, 0x8a, 0x4b,
- 0x79, 0xe1, 0x7f, 0x8b, 0x83, 0x80, 0x0e, 0xe6,
- 0x6f, 0x3b, 0xb1, 0x29, 0x26, 0x18, 0xb6, 0xfd,
- 0x1c, 0x2f, 0x8b, 0x27, 0xff, 0x88, 0xe0, 0xeb },
- .b_public = (u8[32]){ 0x85, 0x20, 0xf0, 0x09, 0x89, 0x30, 0xa7, 0x54,
- 0x74, 0x8b, 0x7d, 0xdc, 0xb4, 0x3e, 0xf7, 0x5a,
- 0x0d, 0xbf, 0x3a, 0x0d, 0x26, 0x38, 0x1a, 0xf4,
- 0xeb, 0xa4, 0xa9, 0x8e, 0xaa, 0x9b, 0x4e, 0x6a },
- .expected_ss = (u8[32]){ 0x4a, 0x5d, 0x9d, 0x5b, 0xa4, 0xce, 0x2d, 0xe1,
- 0x72, 0x8e, 0x3b, 0xf4, 0x80, 0x35, 0x0f, 0x25,
- 0xe0, 0x7e, 0x21, 0xc9, 0x47, 0xd1, 0x9e, 0x33,
- 0x76, 0xf0, 0x9b, 0x3c, 0x1e, 0x16, 0x17, 0x42 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-{
- .secret = (u8[32]){ 1 },
- .b_public = (u8[32]){ 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- .expected_ss = (u8[32]){ 0x3c, 0x77, 0x77, 0xca, 0xf9, 0x97, 0xb2, 0x64,
- 0x41, 0x60, 0x77, 0x66, 0x5b, 0x4e, 0x22, 0x9d,
- 0x0b, 0x95, 0x48, 0xdc, 0x0c, 0xd8, 0x19, 0x98,
- 0xdd, 0xcd, 0xc5, 0xc8, 0x53, 0x3c, 0x79, 0x7f },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-{
- .secret = (u8[32]){ 1 },
- .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
- .expected_ss = (u8[32]){ 0xb3, 0x2d, 0x13, 0x62, 0xc2, 0x48, 0xd6, 0x2f,
- 0xe6, 0x26, 0x19, 0xcf, 0xf0, 0x4d, 0xd4, 0x3d,
- 0xb7, 0x3f, 0xfc, 0x1b, 0x63, 0x08, 0xed, 0xe3,
- 0x0b, 0x78, 0xd8, 0x73, 0x80, 0xf1, 0xe8, 0x34 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-{
- .secret = (u8[32]){ 0xa5, 0x46, 0xe3, 0x6b, 0xf0, 0x52, 0x7c, 0x9d,
- 0x3b, 0x16, 0x15, 0x4b, 0x82, 0x46, 0x5e, 0xdd,
- 0x62, 0x14, 0x4c, 0x0a, 0xc1, 0xfc, 0x5a, 0x18,
- 0x50, 0x6a, 0x22, 0x44, 0xba, 0x44, 0x9a, 0xc4 },
- .b_public = (u8[32]){ 0xe6, 0xdb, 0x68, 0x67, 0x58, 0x30, 0x30, 0xdb,
- 0x35, 0x94, 0xc1, 0xa4, 0x24, 0xb1, 0x5f, 0x7c,
- 0x72, 0x66, 0x24, 0xec, 0x26, 0xb3, 0x35, 0x3b,
- 0x10, 0xa9, 0x03, 0xa6, 0xd0, 0xab, 0x1c, 0x4c },
- .expected_ss = (u8[32]){ 0xc3, 0xda, 0x55, 0x37, 0x9d, 0xe9, 0xc6, 0x90,
- 0x8e, 0x94, 0xea, 0x4d, 0xf2, 0x8d, 0x08, 0x4f,
- 0x32, 0xec, 0xcf, 0x03, 0x49, 0x1c, 0x71, 0xf7,
- 0x54, 0xb4, 0x07, 0x55, 0x77, 0xa2, 0x85, 0x52 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-{
- .secret = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0x0a, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
- .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0x0a, 0x00, 0xfb, 0x9f },
- .expected_ss = (u8[32]){ 0x77, 0x52, 0xb6, 0x18, 0xc1, 0x2d, 0x48, 0xd2,
- 0xc6, 0x93, 0x46, 0x83, 0x81, 0x7c, 0xc6, 0x57,
- 0xf3, 0x31, 0x03, 0x19, 0x49, 0x48, 0x20, 0x05,
- 0x42, 0x2b, 0x4e, 0xae, 0x8d, 0x1d, 0x43, 0x23 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-{
- .secret = (u8[32]){ 0x8e, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- .b_public = (u8[32]){ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x06 },
- .expected_ss = (u8[32]){ 0x5a, 0xdf, 0xaa, 0x25, 0x86, 0x8e, 0x32, 0x3d,
- 0xae, 0x49, 0x62, 0xc1, 0x01, 0x5c, 0xb3, 0x12,
- 0xe1, 0xc5, 0xc7, 0x9e, 0x95, 0x3f, 0x03, 0x99,
- 0xb0, 0xba, 0x16, 0x22, 0xf3, 0xb6, 0xf7, 0x0c },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - normal case */
-{
- .secret = (u8[32]){ 0x48, 0x52, 0x83, 0x4d, 0x9d, 0x6b, 0x77, 0xda,
- 0xde, 0xab, 0xaa, 0xf2, 0xe1, 0x1d, 0xca, 0x66,
- 0xd1, 0x9f, 0xe7, 0x49, 0x93, 0xa7, 0xbe, 0xc3,
- 0x6c, 0x6e, 0x16, 0xa0, 0x98, 0x3f, 0xea, 0xba },
- .b_public = (u8[32]){ 0x9c, 0x64, 0x7d, 0x9a, 0xe5, 0x89, 0xb9, 0xf5,
- 0x8f, 0xdc, 0x3c, 0xa4, 0x94, 0x7e, 0xfb, 0xc9,
- 0x15, 0xc4, 0xb2, 0xe0, 0x8e, 0x74, 0x4a, 0x0e,
- 0xdf, 0x46, 0x9d, 0xac, 0x59, 0xc8, 0xf8, 0x5a },
- .expected_ss = (u8[32]){ 0x87, 0xb7, 0xf2, 0x12, 0xb6, 0x27, 0xf7, 0xa5,
- 0x4c, 0xa5, 0xe0, 0xbc, 0xda, 0xdd, 0xd5, 0x38,
- 0x9d, 0x9d, 0xe6, 0x15, 0x6c, 0xdb, 0xcf, 0x8e,
- 0xbe, 0x14, 0xff, 0xbc, 0xfb, 0x43, 0x65, 0x51 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - public key on twist */
-{
- .secret = (u8[32]){ 0x58, 0x8c, 0x06, 0x1a, 0x50, 0x80, 0x4a, 0xc4,
- 0x88, 0xad, 0x77, 0x4a, 0xc7, 0x16, 0xc3, 0xf5,
- 0xba, 0x71, 0x4b, 0x27, 0x12, 0xe0, 0x48, 0x49,
- 0x13, 0x79, 0xa5, 0x00, 0x21, 0x19, 0x98, 0xa8 },
- .b_public = (u8[32]){ 0x63, 0xaa, 0x40, 0xc6, 0xe3, 0x83, 0x46, 0xc5,
- 0xca, 0xf2, 0x3a, 0x6d, 0xf0, 0xa5, 0xe6, 0xc8,
- 0x08, 0x89, 0xa0, 0x86, 0x47, 0xe5, 0x51, 0xb3,
- 0x56, 0x34, 0x49, 0xbe, 0xfc, 0xfc, 0x97, 0x33 },
- .expected_ss = (u8[32]){ 0xb1, 0xa7, 0x07, 0x51, 0x94, 0x95, 0xff, 0xff,
- 0xb2, 0x98, 0xff, 0x94, 0x17, 0x16, 0xb0, 0x6d,
- 0xfa, 0xb8, 0x7c, 0xf8, 0xd9, 0x11, 0x23, 0xfe,
- 0x2b, 0xe9, 0xa2, 0x33, 0xdd, 0xa2, 0x22, 0x12 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - public key on twist */
-{
- .secret = (u8[32]){ 0xb0, 0x5b, 0xfd, 0x32, 0xe5, 0x53, 0x25, 0xd9,
- 0xfd, 0x64, 0x8c, 0xb3, 0x02, 0x84, 0x80, 0x39,
- 0x00, 0x0b, 0x39, 0x0e, 0x44, 0xd5, 0x21, 0xe5,
- 0x8a, 0xab, 0x3b, 0x29, 0xa6, 0x96, 0x0b, 0xa8 },
- .b_public = (u8[32]){ 0x0f, 0x83, 0xc3, 0x6f, 0xde, 0xd9, 0xd3, 0x2f,
- 0xad, 0xf4, 0xef, 0xa3, 0xae, 0x93, 0xa9, 0x0b,
- 0xb5, 0xcf, 0xa6, 0x68, 0x93, 0xbc, 0x41, 0x2c,
- 0x43, 0xfa, 0x72, 0x87, 0xdb, 0xb9, 0x97, 0x79 },
- .expected_ss = (u8[32]){ 0x67, 0xdd, 0x4a, 0x6e, 0x16, 0x55, 0x33, 0x53,
- 0x4c, 0x0e, 0x3f, 0x17, 0x2e, 0x4a, 0xb8, 0x57,
- 0x6b, 0xca, 0x92, 0x3a, 0x5f, 0x07, 0xb2, 0xc0,
- 0x69, 0xb4, 0xc3, 0x10, 0xff, 0x2e, 0x93, 0x5b },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - public key on twist */
-{
- .secret = (u8[32]){ 0x70, 0xe3, 0x4b, 0xcb, 0xe1, 0xf4, 0x7f, 0xbc,
- 0x0f, 0xdd, 0xfd, 0x7c, 0x1e, 0x1a, 0xa5, 0x3d,
- 0x57, 0xbf, 0xe0, 0xf6, 0x6d, 0x24, 0x30, 0x67,
- 0xb4, 0x24, 0xbb, 0x62, 0x10, 0xbe, 0xd1, 0x9c },
- .b_public = (u8[32]){ 0x0b, 0x82, 0x11, 0xa2, 0xb6, 0x04, 0x90, 0x97,
- 0xf6, 0x87, 0x1c, 0x6c, 0x05, 0x2d, 0x3c, 0x5f,
- 0xc1, 0xba, 0x17, 0xda, 0x9e, 0x32, 0xae, 0x45,
- 0x84, 0x03, 0xb0, 0x5b, 0xb2, 0x83, 0x09, 0x2a },
- .expected_ss = (u8[32]){ 0x4a, 0x06, 0x38, 0xcf, 0xaa, 0x9e, 0xf1, 0x93,
- 0x3b, 0x47, 0xf8, 0x93, 0x92, 0x96, 0xa6, 0xb2,
- 0x5b, 0xe5, 0x41, 0xef, 0x7f, 0x70, 0xe8, 0x44,
- 0xc0, 0xbc, 0xc0, 0x0b, 0x13, 0x4d, 0xe6, 0x4a },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - public key on twist */
-{
- .secret = (u8[32]){ 0x68, 0xc1, 0xf3, 0xa6, 0x53, 0xa4, 0xcd, 0xb1,
- 0xd3, 0x7b, 0xba, 0x94, 0x73, 0x8f, 0x8b, 0x95,
- 0x7a, 0x57, 0xbe, 0xb2, 0x4d, 0x64, 0x6e, 0x99,
- 0x4d, 0xc2, 0x9a, 0x27, 0x6a, 0xad, 0x45, 0x8d },
- .b_public = (u8[32]){ 0x34, 0x3a, 0xc2, 0x0a, 0x3b, 0x9c, 0x6a, 0x27,
- 0xb1, 0x00, 0x81, 0x76, 0x50, 0x9a, 0xd3, 0x07,
- 0x35, 0x85, 0x6e, 0xc1, 0xc8, 0xd8, 0xfc, 0xae,
- 0x13, 0x91, 0x2d, 0x08, 0xd1, 0x52, 0xf4, 0x6c },
- .expected_ss = (u8[32]){ 0x39, 0x94, 0x91, 0xfc, 0xe8, 0xdf, 0xab, 0x73,
- 0xb4, 0xf9, 0xf6, 0x11, 0xde, 0x8e, 0xa0, 0xb2,
- 0x7b, 0x28, 0xf8, 0x59, 0x94, 0x25, 0x0b, 0x0f,
- 0x47, 0x5d, 0x58, 0x5d, 0x04, 0x2a, 0xc2, 0x07 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - public key on twist */
-{
- .secret = (u8[32]){ 0xd8, 0x77, 0xb2, 0x6d, 0x06, 0xdf, 0xf9, 0xd9,
- 0xf7, 0xfd, 0x4c, 0x5b, 0x37, 0x69, 0xf8, 0xcd,
- 0xd5, 0xb3, 0x05, 0x16, 0xa5, 0xab, 0x80, 0x6b,
- 0xe3, 0x24, 0xff, 0x3e, 0xb6, 0x9e, 0xa0, 0xb2 },
- .b_public = (u8[32]){ 0xfa, 0x69, 0x5f, 0xc7, 0xbe, 0x8d, 0x1b, 0xe5,
- 0xbf, 0x70, 0x48, 0x98, 0xf3, 0x88, 0xc4, 0x52,
- 0xba, 0xfd, 0xd3, 0xb8, 0xea, 0xe8, 0x05, 0xf8,
- 0x68, 0x1a, 0x8d, 0x15, 0xc2, 0xd4, 0xe1, 0x42 },
- .expected_ss = (u8[32]){ 0x2c, 0x4f, 0xe1, 0x1d, 0x49, 0x0a, 0x53, 0x86,
- 0x17, 0x76, 0xb1, 0x3b, 0x43, 0x54, 0xab, 0xd4,
- 0xcf, 0x5a, 0x97, 0x69, 0x9d, 0xb6, 0xe6, 0xc6,
- 0x8c, 0x16, 0x26, 0xd0, 0x76, 0x62, 0xf7, 0x58 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - edge case on twist */
-{
- .secret = (u8[32]){ 0x38, 0xdd, 0xe9, 0xf3, 0xe7, 0xb7, 0x99, 0x04,
- 0x5f, 0x9a, 0xc3, 0x79, 0x3d, 0x4a, 0x92, 0x77,
- 0xda, 0xde, 0xad, 0xc4, 0x1b, 0xec, 0x02, 0x90,
- 0xf8, 0x1f, 0x74, 0x4f, 0x73, 0x77, 0x5f, 0x84 },
- .b_public = (u8[32]){ 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- .expected_ss = (u8[32]){ 0x9a, 0x2c, 0xfe, 0x84, 0xff, 0x9c, 0x4a, 0x97,
- 0x39, 0x62, 0x5c, 0xae, 0x4a, 0x3b, 0x82, 0xa9,
- 0x06, 0x87, 0x7a, 0x44, 0x19, 0x46, 0xf8, 0xd7,
- 0xb3, 0xd7, 0x95, 0xfe, 0x8f, 0x5d, 0x16, 0x39 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - edge case on twist */
-{
- .secret = (u8[32]){ 0x98, 0x57, 0xa9, 0x14, 0xe3, 0xc2, 0x90, 0x36,
- 0xfd, 0x9a, 0x44, 0x2b, 0xa5, 0x26, 0xb5, 0xcd,
- 0xcd, 0xf2, 0x82, 0x16, 0x15, 0x3e, 0x63, 0x6c,
- 0x10, 0x67, 0x7a, 0xca, 0xb6, 0xbd, 0x6a, 0xa5 },
- .b_public = (u8[32]){ 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- .expected_ss = (u8[32]){ 0x4d, 0xa4, 0xe0, 0xaa, 0x07, 0x2c, 0x23, 0x2e,
- 0xe2, 0xf0, 0xfa, 0x4e, 0x51, 0x9a, 0xe5, 0x0b,
- 0x52, 0xc1, 0xed, 0xd0, 0x8a, 0x53, 0x4d, 0x4e,
- 0xf3, 0x46, 0xc2, 0xe1, 0x06, 0xd2, 0x1d, 0x60 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - edge case on twist */
-{
- .secret = (u8[32]){ 0x48, 0xe2, 0x13, 0x0d, 0x72, 0x33, 0x05, 0xed,
- 0x05, 0xe6, 0xe5, 0x89, 0x4d, 0x39, 0x8a, 0x5e,
- 0x33, 0x36, 0x7a, 0x8c, 0x6a, 0xac, 0x8f, 0xcd,
- 0xf0, 0xa8, 0x8e, 0x4b, 0x42, 0x82, 0x0d, 0xb7 },
- .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0x03, 0x00, 0x00, 0xf8, 0xff,
- 0xff, 0x1f, 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff,
- 0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0x07, 0x00,
- 0x00, 0xf0, 0xff, 0xff, 0x3f, 0x00, 0x00, 0x00 },
- .expected_ss = (u8[32]){ 0x9e, 0xd1, 0x0c, 0x53, 0x74, 0x7f, 0x64, 0x7f,
- 0x82, 0xf4, 0x51, 0x25, 0xd3, 0xde, 0x15, 0xa1,
- 0xe6, 0xb8, 0x24, 0x49, 0x6a, 0xb4, 0x04, 0x10,
- 0xff, 0xcc, 0x3c, 0xfe, 0x95, 0x76, 0x0f, 0x3b },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - edge case on twist */
-{
- .secret = (u8[32]){ 0x28, 0xf4, 0x10, 0x11, 0x69, 0x18, 0x51, 0xb3,
- 0xa6, 0x2b, 0x64, 0x15, 0x53, 0xb3, 0x0d, 0x0d,
- 0xfd, 0xdc, 0xb8, 0xff, 0xfc, 0xf5, 0x37, 0x00,
- 0xa7, 0xbe, 0x2f, 0x6a, 0x87, 0x2e, 0x9f, 0xb0 },
- .b_public = (u8[32]){ 0x00, 0x00, 0x00, 0xfc, 0xff, 0xff, 0x07, 0x00,
- 0x00, 0xe0, 0xff, 0xff, 0x3f, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0x01, 0x00, 0x00, 0xf8, 0xff,
- 0xff, 0x0f, 0x00, 0x00, 0xc0, 0xff, 0xff, 0x7f },
- .expected_ss = (u8[32]){ 0xcf, 0x72, 0xb4, 0xaa, 0x6a, 0xa1, 0xc9, 0xf8,
- 0x94, 0xf4, 0x16, 0x5b, 0x86, 0x10, 0x9a, 0xa4,
- 0x68, 0x51, 0x76, 0x48, 0xe1, 0xf0, 0xcc, 0x70,
- 0xe1, 0xab, 0x08, 0x46, 0x01, 0x76, 0x50, 0x6b },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - edge case on twist */
-{
- .secret = (u8[32]){ 0x18, 0xa9, 0x3b, 0x64, 0x99, 0xb9, 0xf6, 0xb3,
- 0x22, 0x5c, 0xa0, 0x2f, 0xef, 0x41, 0x0e, 0x0a,
- 0xde, 0xc2, 0x35, 0x32, 0x32, 0x1d, 0x2d, 0x8e,
- 0xf1, 0xa6, 0xd6, 0x02, 0xa8, 0xc6, 0x5b, 0x83 },
- .b_public = (u8[32]){ 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
- 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
- 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
- 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0x7f },
- .expected_ss = (u8[32]){ 0x5d, 0x50, 0xb6, 0x28, 0x36, 0xbb, 0x69, 0x57,
- 0x94, 0x10, 0x38, 0x6c, 0xf7, 0xbb, 0x81, 0x1c,
- 0x14, 0xbf, 0x85, 0xb1, 0xc7, 0xb1, 0x7e, 0x59,
- 0x24, 0xc7, 0xff, 0xea, 0x91, 0xef, 0x9e, 0x12 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - edge case on twist */
-{
- .secret = (u8[32]){ 0xc0, 0x1d, 0x13, 0x05, 0xa1, 0x33, 0x8a, 0x1f,
- 0xca, 0xc2, 0xba, 0x7e, 0x2e, 0x03, 0x2b, 0x42,
- 0x7e, 0x0b, 0x04, 0x90, 0x31, 0x65, 0xac, 0xa9,
- 0x57, 0xd8, 0xd0, 0x55, 0x3d, 0x87, 0x17, 0xb0 },
- .b_public = (u8[32]){ 0xea, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
- .expected_ss = (u8[32]){ 0x19, 0x23, 0x0e, 0xb1, 0x48, 0xd5, 0xd6, 0x7c,
- 0x3c, 0x22, 0xab, 0x1d, 0xae, 0xff, 0x80, 0xa5,
- 0x7e, 0xae, 0x42, 0x65, 0xce, 0x28, 0x72, 0x65,
- 0x7b, 0x2c, 0x80, 0x99, 0xfc, 0x69, 0x8e, 0x50 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - edge case for public key */
-{
- .secret = (u8[32]){ 0x38, 0x6f, 0x7f, 0x16, 0xc5, 0x07, 0x31, 0xd6,
- 0x4f, 0x82, 0xe6, 0xa1, 0x70, 0xb1, 0x42, 0xa4,
- 0xe3, 0x4f, 0x31, 0xfd, 0x77, 0x68, 0xfc, 0xb8,
- 0x90, 0x29, 0x25, 0xe7, 0xd1, 0xe2, 0x1a, 0xbe },
- .b_public = (u8[32]){ 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- .expected_ss = (u8[32]){ 0x0f, 0xca, 0xb5, 0xd8, 0x42, 0xa0, 0x78, 0xd7,
- 0xa7, 0x1f, 0xc5, 0x9b, 0x57, 0xbf, 0xb4, 0xca,
- 0x0b, 0xe6, 0x87, 0x3b, 0x49, 0xdc, 0xdb, 0x9f,
- 0x44, 0xe1, 0x4a, 0xe8, 0xfb, 0xdf, 0xa5, 0x42 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - edge case for public key */
-{
- .secret = (u8[32]){ 0xe0, 0x23, 0xa2, 0x89, 0xbd, 0x5e, 0x90, 0xfa,
- 0x28, 0x04, 0xdd, 0xc0, 0x19, 0xa0, 0x5e, 0xf3,
- 0xe7, 0x9d, 0x43, 0x4b, 0xb6, 0xea, 0x2f, 0x52,
- 0x2e, 0xcb, 0x64, 0x3a, 0x75, 0x29, 0x6e, 0x95 },
- .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
- 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 },
- .expected_ss = (u8[32]){ 0x54, 0xce, 0x8f, 0x22, 0x75, 0xc0, 0x77, 0xe3,
- 0xb1, 0x30, 0x6a, 0x39, 0x39, 0xc5, 0xe0, 0x3e,
- 0xef, 0x6b, 0xbb, 0x88, 0x06, 0x05, 0x44, 0x75,
- 0x8d, 0x9f, 0xef, 0x59, 0xb0, 0xbc, 0x3e, 0x4f },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - edge case for public key */
-{
- .secret = (u8[32]){ 0x68, 0xf0, 0x10, 0xd6, 0x2e, 0xe8, 0xd9, 0x26,
- 0x05, 0x3a, 0x36, 0x1c, 0x3a, 0x75, 0xc6, 0xea,
- 0x4e, 0xbd, 0xc8, 0x60, 0x6a, 0xb2, 0x85, 0x00,
- 0x3a, 0x6f, 0x8f, 0x40, 0x76, 0xb0, 0x1e, 0x83 },
- .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x03 },
- .expected_ss = (u8[32]){ 0xf1, 0x36, 0x77, 0x5c, 0x5b, 0xeb, 0x0a, 0xf8,
- 0x11, 0x0a, 0xf1, 0x0b, 0x20, 0x37, 0x23, 0x32,
- 0x04, 0x3c, 0xab, 0x75, 0x24, 0x19, 0x67, 0x87,
- 0x75, 0xa2, 0x23, 0xdf, 0x57, 0xc9, 0xd3, 0x0d },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - edge case for public key */
-{
- .secret = (u8[32]){ 0x58, 0xeb, 0xcb, 0x35, 0xb0, 0xf8, 0x84, 0x5c,
- 0xaf, 0x1e, 0xc6, 0x30, 0xf9, 0x65, 0x76, 0xb6,
- 0x2c, 0x4b, 0x7b, 0x6c, 0x36, 0xb2, 0x9d, 0xeb,
- 0x2c, 0xb0, 0x08, 0x46, 0x51, 0x75, 0x5c, 0x96 },
- .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xfb, 0xff, 0xff, 0xfb, 0xff,
- 0xff, 0xdf, 0xff, 0xff, 0xdf, 0xff, 0xff, 0xff,
- 0xfe, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xf7, 0xff,
- 0xff, 0xf7, 0xff, 0xff, 0xbf, 0xff, 0xff, 0x3f },
- .expected_ss = (u8[32]){ 0xbf, 0x9a, 0xff, 0xd0, 0x6b, 0x84, 0x40, 0x85,
- 0x58, 0x64, 0x60, 0x96, 0x2e, 0xf2, 0x14, 0x6f,
- 0xf3, 0xd4, 0x53, 0x3d, 0x94, 0x44, 0xaa, 0xb0,
- 0x06, 0xeb, 0x88, 0xcc, 0x30, 0x54, 0x40, 0x7d },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - edge case for public key */
-{
- .secret = (u8[32]){ 0x18, 0x8c, 0x4b, 0xc5, 0xb9, 0xc4, 0x4b, 0x38,
- 0xbb, 0x65, 0x8b, 0x9b, 0x2a, 0xe8, 0x2d, 0x5b,
- 0x01, 0x01, 0x5e, 0x09, 0x31, 0x84, 0xb1, 0x7c,
- 0xb7, 0x86, 0x35, 0x03, 0xa7, 0x83, 0xe1, 0xbb },
- .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f },
- .expected_ss = (u8[32]){ 0xd4, 0x80, 0xde, 0x04, 0xf6, 0x99, 0xcb, 0x3b,
- 0xe0, 0x68, 0x4a, 0x9c, 0xc2, 0xe3, 0x12, 0x81,
- 0xea, 0x0b, 0xc5, 0xa9, 0xdc, 0xc1, 0x57, 0xd3,
- 0xd2, 0x01, 0x58, 0xd4, 0x6c, 0xa5, 0x24, 0x6d },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - edge case for public key */
-{
- .secret = (u8[32]){ 0xe0, 0x6c, 0x11, 0xbb, 0x2e, 0x13, 0xce, 0x3d,
- 0xc7, 0x67, 0x3f, 0x67, 0xf5, 0x48, 0x22, 0x42,
- 0x90, 0x94, 0x23, 0xa9, 0xae, 0x95, 0xee, 0x98,
- 0x6a, 0x98, 0x8d, 0x98, 0xfa, 0xee, 0x23, 0xa2 },
- .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f,
- 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f,
- 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f,
- 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f },
- .expected_ss = (u8[32]){ 0x4c, 0x44, 0x01, 0xcc, 0xe6, 0xb5, 0x1e, 0x4c,
- 0xb1, 0x8f, 0x27, 0x90, 0x24, 0x6c, 0x9b, 0xf9,
- 0x14, 0xdb, 0x66, 0x77, 0x50, 0xa1, 0xcb, 0x89,
- 0x06, 0x90, 0x92, 0xaf, 0x07, 0x29, 0x22, 0x76 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - edge case for public key */
-{
- .secret = (u8[32]){ 0xc0, 0x65, 0x8c, 0x46, 0xdd, 0xe1, 0x81, 0x29,
- 0x29, 0x38, 0x77, 0x53, 0x5b, 0x11, 0x62, 0xb6,
- 0xf9, 0xf5, 0x41, 0x4a, 0x23, 0xcf, 0x4d, 0x2c,
- 0xbc, 0x14, 0x0a, 0x4d, 0x99, 0xda, 0x2b, 0x8f },
- .b_public = (u8[32]){ 0xeb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
- .expected_ss = (u8[32]){ 0x57, 0x8b, 0xa8, 0xcc, 0x2d, 0xbd, 0xc5, 0x75,
- 0xaf, 0xcf, 0x9d, 0xf2, 0xb3, 0xee, 0x61, 0x89,
- 0xf5, 0x33, 0x7d, 0x68, 0x54, 0xc7, 0x9b, 0x4c,
- 0xe1, 0x65, 0xea, 0x12, 0x29, 0x3b, 0x3a, 0x0f },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - public key >= p */
-{
- .secret = (u8[32]){ 0xf0, 0x1e, 0x48, 0xda, 0xfa, 0xc9, 0xd7, 0xbc,
- 0xf5, 0x89, 0xcb, 0xc3, 0x82, 0xc8, 0x78, 0xd1,
- 0x8b, 0xda, 0x35, 0x50, 0x58, 0x9f, 0xfb, 0x5d,
- 0x50, 0xb5, 0x23, 0xbe, 0xbe, 0x32, 0x9d, 0xae },
- .b_public = (u8[32]){ 0xef, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
- .expected_ss = (u8[32]){ 0xbd, 0x36, 0xa0, 0x79, 0x0e, 0xb8, 0x83, 0x09,
- 0x8c, 0x98, 0x8b, 0x21, 0x78, 0x67, 0x73, 0xde,
- 0x0b, 0x3a, 0x4d, 0xf1, 0x62, 0x28, 0x2c, 0xf1,
- 0x10, 0xde, 0x18, 0xdd, 0x48, 0x4c, 0xe7, 0x4b },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - public key >= p */
-{
- .secret = (u8[32]){ 0x28, 0x87, 0x96, 0xbc, 0x5a, 0xff, 0x4b, 0x81,
- 0xa3, 0x75, 0x01, 0x75, 0x7b, 0xc0, 0x75, 0x3a,
- 0x3c, 0x21, 0x96, 0x47, 0x90, 0xd3, 0x86, 0x99,
- 0x30, 0x8d, 0xeb, 0xc1, 0x7a, 0x6e, 0xaf, 0x8d },
- .b_public = (u8[32]){ 0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
- .expected_ss = (u8[32]){ 0xb4, 0xe0, 0xdd, 0x76, 0xda, 0x7b, 0x07, 0x17,
- 0x28, 0xb6, 0x1f, 0x85, 0x67, 0x71, 0xaa, 0x35,
- 0x6e, 0x57, 0xed, 0xa7, 0x8a, 0x5b, 0x16, 0x55,
- 0xcc, 0x38, 0x20, 0xfb, 0x5f, 0x85, 0x4c, 0x5c },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - public key >= p */
-{
- .secret = (u8[32]){ 0x98, 0xdf, 0x84, 0x5f, 0x66, 0x51, 0xbf, 0x11,
- 0x38, 0x22, 0x1f, 0x11, 0x90, 0x41, 0xf7, 0x2b,
- 0x6d, 0xbc, 0x3c, 0x4a, 0xce, 0x71, 0x43, 0xd9,
- 0x9f, 0xd5, 0x5a, 0xd8, 0x67, 0x48, 0x0d, 0xa8 },
- .b_public = (u8[32]){ 0xf1, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
- .expected_ss = (u8[32]){ 0x6f, 0xdf, 0x6c, 0x37, 0x61, 0x1d, 0xbd, 0x53,
- 0x04, 0xdc, 0x0f, 0x2e, 0xb7, 0xc9, 0x51, 0x7e,
- 0xb3, 0xc5, 0x0e, 0x12, 0xfd, 0x05, 0x0a, 0xc6,
- 0xde, 0xc2, 0x70, 0x71, 0xd4, 0xbf, 0xc0, 0x34 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - public key >= p */
-{
- .secret = (u8[32]){ 0xf0, 0x94, 0x98, 0xe4, 0x6f, 0x02, 0xf8, 0x78,
- 0x82, 0x9e, 0x78, 0xb8, 0x03, 0xd3, 0x16, 0xa2,
- 0xed, 0x69, 0x5d, 0x04, 0x98, 0xa0, 0x8a, 0xbd,
- 0xf8, 0x27, 0x69, 0x30, 0xe2, 0x4e, 0xdc, 0xb0 },
- .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
- .expected_ss = (u8[32]){ 0x4c, 0x8f, 0xc4, 0xb1, 0xc6, 0xab, 0x88, 0xfb,
- 0x21, 0xf1, 0x8f, 0x6d, 0x4c, 0x81, 0x02, 0x40,
- 0xd4, 0xe9, 0x46, 0x51, 0xba, 0x44, 0xf7, 0xa2,
- 0xc8, 0x63, 0xce, 0xc7, 0xdc, 0x56, 0x60, 0x2d },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - public key >= p */
-{
- .secret = (u8[32]){ 0x18, 0x13, 0xc1, 0x0a, 0x5c, 0x7f, 0x21, 0xf9,
- 0x6e, 0x17, 0xf2, 0x88, 0xc0, 0xcc, 0x37, 0x60,
- 0x7c, 0x04, 0xc5, 0xf5, 0xae, 0xa2, 0xdb, 0x13,
- 0x4f, 0x9e, 0x2f, 0xfc, 0x66, 0xbd, 0x9d, 0xb8 },
- .b_public = (u8[32]){ 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 },
- .expected_ss = (u8[32]){ 0x1c, 0xd0, 0xb2, 0x82, 0x67, 0xdc, 0x54, 0x1c,
- 0x64, 0x2d, 0x6d, 0x7d, 0xca, 0x44, 0xa8, 0xb3,
- 0x8a, 0x63, 0x73, 0x6e, 0xef, 0x5c, 0x4e, 0x65,
- 0x01, 0xff, 0xbb, 0xb1, 0x78, 0x0c, 0x03, 0x3c },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - public key >= p */
-{
- .secret = (u8[32]){ 0x78, 0x57, 0xfb, 0x80, 0x86, 0x53, 0x64, 0x5a,
- 0x0b, 0xeb, 0x13, 0x8a, 0x64, 0xf5, 0xf4, 0xd7,
- 0x33, 0xa4, 0x5e, 0xa8, 0x4c, 0x3c, 0xda, 0x11,
- 0xa9, 0xc0, 0x6f, 0x7e, 0x71, 0x39, 0x14, 0x9e },
- .b_public = (u8[32]){ 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 },
- .expected_ss = (u8[32]){ 0x87, 0x55, 0xbe, 0x01, 0xc6, 0x0a, 0x7e, 0x82,
- 0x5c, 0xff, 0x3e, 0x0e, 0x78, 0xcb, 0x3a, 0xa4,
- 0x33, 0x38, 0x61, 0x51, 0x6a, 0xa5, 0x9b, 0x1c,
- 0x51, 0xa8, 0xb2, 0xa5, 0x43, 0xdf, 0xa8, 0x22 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - public key >= p */
-{
- .secret = (u8[32]){ 0xe0, 0x3a, 0xa8, 0x42, 0xe2, 0xab, 0xc5, 0x6e,
- 0x81, 0xe8, 0x7b, 0x8b, 0x9f, 0x41, 0x7b, 0x2a,
- 0x1e, 0x59, 0x13, 0xc7, 0x23, 0xee, 0xd2, 0x8d,
- 0x75, 0x2f, 0x8d, 0x47, 0xa5, 0x9f, 0x49, 0x8f },
- .b_public = (u8[32]){ 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 },
- .expected_ss = (u8[32]){ 0x54, 0xc9, 0xa1, 0xed, 0x95, 0xe5, 0x46, 0xd2,
- 0x78, 0x22, 0xa3, 0x60, 0x93, 0x1d, 0xda, 0x60,
- 0xa1, 0xdf, 0x04, 0x9d, 0xa6, 0xf9, 0x04, 0x25,
- 0x3c, 0x06, 0x12, 0xbb, 0xdc, 0x08, 0x74, 0x76 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - public key >= p */
-{
- .secret = (u8[32]){ 0xf8, 0xf7, 0x07, 0xb7, 0x99, 0x9b, 0x18, 0xcb,
- 0x0d, 0x6b, 0x96, 0x12, 0x4f, 0x20, 0x45, 0x97,
- 0x2c, 0xa2, 0x74, 0xbf, 0xc1, 0x54, 0xad, 0x0c,
- 0x87, 0x03, 0x8c, 0x24, 0xc6, 0xd0, 0xd4, 0xb2 },
- .b_public = (u8[32]){ 0xda, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
- .expected_ss = (u8[32]){ 0xcc, 0x1f, 0x40, 0xd7, 0x43, 0xcd, 0xc2, 0x23,
- 0x0e, 0x10, 0x43, 0xda, 0xba, 0x8b, 0x75, 0xe8,
- 0x10, 0xf1, 0xfb, 0xab, 0x7f, 0x25, 0x52, 0x69,
- 0xbd, 0x9e, 0xbb, 0x29, 0xe6, 0xbf, 0x49, 0x4f },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - public key >= p */
-{
- .secret = (u8[32]){ 0xa0, 0x34, 0xf6, 0x84, 0xfa, 0x63, 0x1e, 0x1a,
- 0x34, 0x81, 0x18, 0xc1, 0xce, 0x4c, 0x98, 0x23,
- 0x1f, 0x2d, 0x9e, 0xec, 0x9b, 0xa5, 0x36, 0x5b,
- 0x4a, 0x05, 0xd6, 0x9a, 0x78, 0x5b, 0x07, 0x96 },
- .b_public = (u8[32]){ 0xdb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
- .expected_ss = (u8[32]){ 0x54, 0x99, 0x8e, 0xe4, 0x3a, 0x5b, 0x00, 0x7b,
- 0xf4, 0x99, 0xf0, 0x78, 0xe7, 0x36, 0x52, 0x44,
- 0x00, 0xa8, 0xb5, 0xc7, 0xe9, 0xb9, 0xb4, 0x37,
- 0x71, 0x74, 0x8c, 0x7c, 0xdf, 0x88, 0x04, 0x12 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - public key >= p */
-{
- .secret = (u8[32]){ 0x30, 0xb6, 0xc6, 0xa0, 0xf2, 0xff, 0xa6, 0x80,
- 0x76, 0x8f, 0x99, 0x2b, 0xa8, 0x9e, 0x15, 0x2d,
- 0x5b, 0xc9, 0x89, 0x3d, 0x38, 0xc9, 0x11, 0x9b,
- 0xe4, 0xf7, 0x67, 0xbf, 0xab, 0x6e, 0x0c, 0xa5 },
- .b_public = (u8[32]){ 0xdc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
- .expected_ss = (u8[32]){ 0xea, 0xd9, 0xb3, 0x8e, 0xfd, 0xd7, 0x23, 0x63,
- 0x79, 0x34, 0xe5, 0x5a, 0xb7, 0x17, 0xa7, 0xae,
- 0x09, 0xeb, 0x86, 0xa2, 0x1d, 0xc3, 0x6a, 0x3f,
- 0xee, 0xb8, 0x8b, 0x75, 0x9e, 0x39, 0x1e, 0x09 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - public key >= p */
-{
- .secret = (u8[32]){ 0x90, 0x1b, 0x9d, 0xcf, 0x88, 0x1e, 0x01, 0xe0,
- 0x27, 0x57, 0x50, 0x35, 0xd4, 0x0b, 0x43, 0xbd,
- 0xc1, 0xc5, 0x24, 0x2e, 0x03, 0x08, 0x47, 0x49,
- 0x5b, 0x0c, 0x72, 0x86, 0x46, 0x9b, 0x65, 0x91 },
- .b_public = (u8[32]){ 0xea, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
- .expected_ss = (u8[32]){ 0x60, 0x2f, 0xf4, 0x07, 0x89, 0xb5, 0x4b, 0x41,
- 0x80, 0x59, 0x15, 0xfe, 0x2a, 0x62, 0x21, 0xf0,
- 0x7a, 0x50, 0xff, 0xc2, 0xc3, 0xfc, 0x94, 0xcf,
- 0x61, 0xf1, 0x3d, 0x79, 0x04, 0xe8, 0x8e, 0x0e },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - public key >= p */
-{
- .secret = (u8[32]){ 0x80, 0x46, 0x67, 0x7c, 0x28, 0xfd, 0x82, 0xc9,
- 0xa1, 0xbd, 0xb7, 0x1a, 0x1a, 0x1a, 0x34, 0xfa,
- 0xba, 0x12, 0x25, 0xe2, 0x50, 0x7f, 0xe3, 0xf5,
- 0x4d, 0x10, 0xbd, 0x5b, 0x0d, 0x86, 0x5f, 0x8e },
- .b_public = (u8[32]){ 0xeb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
- .expected_ss = (u8[32]){ 0xe0, 0x0a, 0xe8, 0xb1, 0x43, 0x47, 0x12, 0x47,
- 0xba, 0x24, 0xf1, 0x2c, 0x88, 0x55, 0x36, 0xc3,
- 0xcb, 0x98, 0x1b, 0x58, 0xe1, 0xe5, 0x6b, 0x2b,
- 0xaf, 0x35, 0xc1, 0x2a, 0xe1, 0xf7, 0x9c, 0x26 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - public key >= p */
-{
- .secret = (u8[32]){ 0x60, 0x2f, 0x7e, 0x2f, 0x68, 0xa8, 0x46, 0xb8,
- 0x2c, 0xc2, 0x69, 0xb1, 0xd4, 0x8e, 0x93, 0x98,
- 0x86, 0xae, 0x54, 0xfd, 0x63, 0x6c, 0x1f, 0xe0,
- 0x74, 0xd7, 0x10, 0x12, 0x7d, 0x47, 0x24, 0x91 },
- .b_public = (u8[32]){ 0xef, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
- .expected_ss = (u8[32]){ 0x98, 0xcb, 0x9b, 0x50, 0xdd, 0x3f, 0xc2, 0xb0,
- 0xd4, 0xf2, 0xd2, 0xbf, 0x7c, 0x5c, 0xfd, 0xd1,
- 0x0c, 0x8f, 0xcd, 0x31, 0xfc, 0x40, 0xaf, 0x1a,
- 0xd4, 0x4f, 0x47, 0xc1, 0x31, 0x37, 0x63, 0x62 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - public key >= p */
-{
- .secret = (u8[32]){ 0x60, 0x88, 0x7b, 0x3d, 0xc7, 0x24, 0x43, 0x02,
- 0x6e, 0xbe, 0xdb, 0xbb, 0xb7, 0x06, 0x65, 0xf4,
- 0x2b, 0x87, 0xad, 0xd1, 0x44, 0x0e, 0x77, 0x68,
- 0xfb, 0xd7, 0xe8, 0xe2, 0xce, 0x5f, 0x63, 0x9d },
- .b_public = (u8[32]){ 0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
- .expected_ss = (u8[32]){ 0x38, 0xd6, 0x30, 0x4c, 0x4a, 0x7e, 0x6d, 0x9f,
- 0x79, 0x59, 0x33, 0x4f, 0xb5, 0x24, 0x5b, 0xd2,
- 0xc7, 0x54, 0x52, 0x5d, 0x4c, 0x91, 0xdb, 0x95,
- 0x02, 0x06, 0x92, 0x62, 0x34, 0xc1, 0xf6, 0x33 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - public key >= p */
-{
- .secret = (u8[32]){ 0x78, 0xd3, 0x1d, 0xfa, 0x85, 0x44, 0x97, 0xd7,
- 0x2d, 0x8d, 0xef, 0x8a, 0x1b, 0x7f, 0xb0, 0x06,
- 0xce, 0xc2, 0xd8, 0xc4, 0x92, 0x46, 0x47, 0xc9,
- 0x38, 0x14, 0xae, 0x56, 0xfa, 0xed, 0xa4, 0x95 },
- .b_public = (u8[32]){ 0xf1, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
- .expected_ss = (u8[32]){ 0x78, 0x6c, 0xd5, 0x49, 0x96, 0xf0, 0x14, 0xa5,
- 0xa0, 0x31, 0xec, 0x14, 0xdb, 0x81, 0x2e, 0xd0,
- 0x83, 0x55, 0x06, 0x1f, 0xdb, 0x5d, 0xe6, 0x80,
- 0xa8, 0x00, 0xac, 0x52, 0x1f, 0x31, 0x8e, 0x23 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - public key >= p */
-{
- .secret = (u8[32]){ 0xc0, 0x4c, 0x5b, 0xae, 0xfa, 0x83, 0x02, 0xdd,
- 0xde, 0xd6, 0xa4, 0xbb, 0x95, 0x77, 0x61, 0xb4,
- 0xeb, 0x97, 0xae, 0xfa, 0x4f, 0xc3, 0xb8, 0x04,
- 0x30, 0x85, 0xf9, 0x6a, 0x56, 0x59, 0xb3, 0xa5 },
- .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
- .expected_ss = (u8[32]){ 0x29, 0xae, 0x8b, 0xc7, 0x3e, 0x9b, 0x10, 0xa0,
- 0x8b, 0x4f, 0x68, 0x1c, 0x43, 0xc3, 0xe0, 0xac,
- 0x1a, 0x17, 0x1d, 0x31, 0xb3, 0x8f, 0x1a, 0x48,
- 0xef, 0xba, 0x29, 0xae, 0x63, 0x9e, 0xa1, 0x34 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - RFC 7748 */
-{
- .secret = (u8[32]){ 0xa0, 0x46, 0xe3, 0x6b, 0xf0, 0x52, 0x7c, 0x9d,
- 0x3b, 0x16, 0x15, 0x4b, 0x82, 0x46, 0x5e, 0xdd,
- 0x62, 0x14, 0x4c, 0x0a, 0xc1, 0xfc, 0x5a, 0x18,
- 0x50, 0x6a, 0x22, 0x44, 0xba, 0x44, 0x9a, 0x44 },
- .b_public = (u8[32]){ 0xe6, 0xdb, 0x68, 0x67, 0x58, 0x30, 0x30, 0xdb,
- 0x35, 0x94, 0xc1, 0xa4, 0x24, 0xb1, 0x5f, 0x7c,
- 0x72, 0x66, 0x24, 0xec, 0x26, 0xb3, 0x35, 0x3b,
- 0x10, 0xa9, 0x03, 0xa6, 0xd0, 0xab, 0x1c, 0x4c },
- .expected_ss = (u8[32]){ 0xc3, 0xda, 0x55, 0x37, 0x9d, 0xe9, 0xc6, 0x90,
- 0x8e, 0x94, 0xea, 0x4d, 0xf2, 0x8d, 0x08, 0x4f,
- 0x32, 0xec, 0xcf, 0x03, 0x49, 0x1c, 0x71, 0xf7,
- 0x54, 0xb4, 0x07, 0x55, 0x77, 0xa2, 0x85, 0x52 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - RFC 7748 */
-{
- .secret = (u8[32]){ 0x48, 0x66, 0xe9, 0xd4, 0xd1, 0xb4, 0x67, 0x3c,
- 0x5a, 0xd2, 0x26, 0x91, 0x95, 0x7d, 0x6a, 0xf5,
- 0xc1, 0x1b, 0x64, 0x21, 0xe0, 0xea, 0x01, 0xd4,
- 0x2c, 0xa4, 0x16, 0x9e, 0x79, 0x18, 0xba, 0x4d },
- .b_public = (u8[32]){ 0xe5, 0x21, 0x0f, 0x12, 0x78, 0x68, 0x11, 0xd3,
- 0xf4, 0xb7, 0x95, 0x9d, 0x05, 0x38, 0xae, 0x2c,
- 0x31, 0xdb, 0xe7, 0x10, 0x6f, 0xc0, 0x3c, 0x3e,
- 0xfc, 0x4c, 0xd5, 0x49, 0xc7, 0x15, 0xa4, 0x13 },
- .expected_ss = (u8[32]){ 0x95, 0xcb, 0xde, 0x94, 0x76, 0xe8, 0x90, 0x7d,
- 0x7a, 0xad, 0xe4, 0x5c, 0xb4, 0xb8, 0x73, 0xf8,
- 0x8b, 0x59, 0x5a, 0x68, 0x79, 0x9f, 0xa1, 0x52,
- 0xe6, 0xf8, 0xf7, 0x64, 0x7a, 0xac, 0x79, 0x57 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - edge case for shared secret */
-{
- .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
- 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
- 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
- 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
- .b_public = (u8[32]){ 0x0a, 0xb4, 0xe7, 0x63, 0x80, 0xd8, 0x4d, 0xde,
- 0x4f, 0x68, 0x33, 0xc5, 0x8f, 0x2a, 0x9f, 0xb8,
- 0xf8, 0x3b, 0xb0, 0x16, 0x9b, 0x17, 0x2b, 0xe4,
- 0xb6, 0xe0, 0x59, 0x28, 0x87, 0x74, 0x1a, 0x36 },
- .expected_ss = (u8[32]){ 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - edge case for shared secret */
-{
- .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
- 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
- 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
- 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
- .b_public = (u8[32]){ 0x89, 0xe1, 0x0d, 0x57, 0x01, 0xb4, 0x33, 0x7d,
- 0x2d, 0x03, 0x21, 0x81, 0x53, 0x8b, 0x10, 0x64,
- 0xbd, 0x40, 0x84, 0x40, 0x1c, 0xec, 0xa1, 0xfd,
- 0x12, 0x66, 0x3a, 0x19, 0x59, 0x38, 0x80, 0x00 },
- .expected_ss = (u8[32]){ 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - edge case for shared secret */
-{
- .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
- 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
- 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
- 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
- .b_public = (u8[32]){ 0x2b, 0x55, 0xd3, 0xaa, 0x4a, 0x8f, 0x80, 0xc8,
- 0xc0, 0xb2, 0xae, 0x5f, 0x93, 0x3e, 0x85, 0xaf,
- 0x49, 0xbe, 0xac, 0x36, 0xc2, 0xfa, 0x73, 0x94,
- 0xba, 0xb7, 0x6c, 0x89, 0x33, 0xf8, 0xf8, 0x1d },
- .expected_ss = (u8[32]){ 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - edge case for shared secret */
-{
- .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
- 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
- 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
- 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
- .b_public = (u8[32]){ 0x63, 0xe5, 0xb1, 0xfe, 0x96, 0x01, 0xfe, 0x84,
- 0x38, 0x5d, 0x88, 0x66, 0xb0, 0x42, 0x12, 0x62,
- 0xf7, 0x8f, 0xbf, 0xa5, 0xaf, 0xf9, 0x58, 0x5e,
- 0x62, 0x66, 0x79, 0xb1, 0x85, 0x47, 0xd9, 0x59 },
- .expected_ss = (u8[32]){ 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - edge case for shared secret */
-{
- .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
- 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
- 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
- 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
- .b_public = (u8[32]){ 0xe4, 0x28, 0xf3, 0xda, 0xc1, 0x78, 0x09, 0xf8,
- 0x27, 0xa5, 0x22, 0xce, 0x32, 0x35, 0x50, 0x58,
- 0xd0, 0x73, 0x69, 0x36, 0x4a, 0xa7, 0x89, 0x02,
- 0xee, 0x10, 0x13, 0x9b, 0x9f, 0x9d, 0xd6, 0x53 },
- .expected_ss = (u8[32]){ 0xfc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - edge case for shared secret */
-{
- .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
- 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
- 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
- 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
- .b_public = (u8[32]){ 0xb3, 0xb5, 0x0e, 0x3e, 0xd3, 0xa4, 0x07, 0xb9,
- 0x5d, 0xe9, 0x42, 0xef, 0x74, 0x57, 0x5b, 0x5a,
- 0xb8, 0xa1, 0x0c, 0x09, 0xee, 0x10, 0x35, 0x44,
- 0xd6, 0x0b, 0xdf, 0xed, 0x81, 0x38, 0xab, 0x2b },
- .expected_ss = (u8[32]){ 0xf9, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - edge case for shared secret */
-{
- .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
- 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
- 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
- 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
- .b_public = (u8[32]){ 0x21, 0x3f, 0xff, 0xe9, 0x3d, 0x5e, 0xa8, 0xcd,
- 0x24, 0x2e, 0x46, 0x28, 0x44, 0x02, 0x99, 0x22,
- 0xc4, 0x3c, 0x77, 0xc9, 0xe3, 0xe4, 0x2f, 0x56,
- 0x2f, 0x48, 0x5d, 0x24, 0xc5, 0x01, 0xa2, 0x0b },
- .expected_ss = (u8[32]){ 0xf3, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - edge case for shared secret */
-{
- .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
- 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
- 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
- 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
- .b_public = (u8[32]){ 0x91, 0xb2, 0x32, 0xa1, 0x78, 0xb3, 0xcd, 0x53,
- 0x09, 0x32, 0x44, 0x1e, 0x61, 0x39, 0x41, 0x8f,
- 0x72, 0x17, 0x22, 0x92, 0xf1, 0xda, 0x4c, 0x18,
- 0x34, 0xfc, 0x5e, 0xbf, 0xef, 0xb5, 0x1e, 0x3f },
- .expected_ss = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x03 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - edge case for shared secret */
-{
- .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
- 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
- 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
- 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
- .b_public = (u8[32]){ 0x04, 0x5c, 0x6e, 0x11, 0xc5, 0xd3, 0x32, 0x55,
- 0x6c, 0x78, 0x22, 0xfe, 0x94, 0xeb, 0xf8, 0x9b,
- 0x56, 0xa3, 0x87, 0x8d, 0xc2, 0x7c, 0xa0, 0x79,
- 0x10, 0x30, 0x58, 0x84, 0x9f, 0xab, 0xcb, 0x4f },
- .expected_ss = (u8[32]){ 0xe5, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - edge case for shared secret */
-{
- .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
- 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
- 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
- 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
- .b_public = (u8[32]){ 0x1c, 0xa2, 0x19, 0x0b, 0x71, 0x16, 0x35, 0x39,
- 0x06, 0x3c, 0x35, 0x77, 0x3b, 0xda, 0x0c, 0x9c,
- 0x92, 0x8e, 0x91, 0x36, 0xf0, 0x62, 0x0a, 0xeb,
- 0x09, 0x3f, 0x09, 0x91, 0x97, 0xb7, 0xf7, 0x4e },
- .expected_ss = (u8[32]){ 0xe3, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - edge case for shared secret */
-{
- .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
- 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
- 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
- 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
- .b_public = (u8[32]){ 0xf7, 0x6e, 0x90, 0x10, 0xac, 0x33, 0xc5, 0x04,
- 0x3b, 0x2d, 0x3b, 0x76, 0xa8, 0x42, 0x17, 0x10,
- 0x00, 0xc4, 0x91, 0x62, 0x22, 0xe9, 0xe8, 0x58,
- 0x97, 0xa0, 0xae, 0xc7, 0xf6, 0x35, 0x0b, 0x3c },
- .expected_ss = (u8[32]){ 0xdd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - edge case for shared secret */
-{
- .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
- 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
- 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
- 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
- .b_public = (u8[32]){ 0xbb, 0x72, 0x68, 0x8d, 0x8f, 0x8a, 0xa7, 0xa3,
- 0x9c, 0xd6, 0x06, 0x0c, 0xd5, 0xc8, 0x09, 0x3c,
- 0xde, 0xc6, 0xfe, 0x34, 0x19, 0x37, 0xc3, 0x88,
- 0x6a, 0x99, 0x34, 0x6c, 0xd0, 0x7f, 0xaa, 0x55 },
- .expected_ss = (u8[32]){ 0xdb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - edge case for shared secret */
-{
- .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
- 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
- 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
- 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
- .b_public = (u8[32]){ 0x88, 0xfd, 0xde, 0xa1, 0x93, 0x39, 0x1c, 0x6a,
- 0x59, 0x33, 0xef, 0x9b, 0x71, 0x90, 0x15, 0x49,
- 0x44, 0x72, 0x05, 0xaa, 0xe9, 0xda, 0x92, 0x8a,
- 0x6b, 0x91, 0xa3, 0x52, 0xba, 0x10, 0xf4, 0x1f },
- .expected_ss = (u8[32]){ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - edge case for shared secret */
-{
- .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
- 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
- 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
- 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
- .b_public = (u8[32]){ 0x30, 0x3b, 0x39, 0x2f, 0x15, 0x31, 0x16, 0xca,
- 0xd9, 0xcc, 0x68, 0x2a, 0x00, 0xcc, 0xc4, 0x4c,
- 0x95, 0xff, 0x0d, 0x3b, 0xbe, 0x56, 0x8b, 0xeb,
- 0x6c, 0x4e, 0x73, 0x9b, 0xaf, 0xdc, 0x2c, 0x68 },
- .expected_ss = (u8[32]){ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - checking for overflow */
-{
- .secret = (u8[32]){ 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d,
- 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d,
- 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c,
- 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 },
- .b_public = (u8[32]){ 0xfd, 0x30, 0x0a, 0xeb, 0x40, 0xe1, 0xfa, 0x58,
- 0x25, 0x18, 0x41, 0x2b, 0x49, 0xb2, 0x08, 0xa7,
- 0x84, 0x2b, 0x1e, 0x1f, 0x05, 0x6a, 0x04, 0x01,
- 0x78, 0xea, 0x41, 0x41, 0x53, 0x4f, 0x65, 0x2d },
- .expected_ss = (u8[32]){ 0xb7, 0x34, 0x10, 0x5d, 0xc2, 0x57, 0x58, 0x5d,
- 0x73, 0xb5, 0x66, 0xcc, 0xb7, 0x6f, 0x06, 0x27,
- 0x95, 0xcc, 0xbe, 0xc8, 0x91, 0x28, 0xe5, 0x2b,
- 0x02, 0xf3, 0xe5, 0x96, 0x39, 0xf1, 0x3c, 0x46 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - checking for overflow */
-{
- .secret = (u8[32]){ 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d,
- 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d,
- 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c,
- 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 },
- .b_public = (u8[32]){ 0xc8, 0xef, 0x79, 0xb5, 0x14, 0xd7, 0x68, 0x26,
- 0x77, 0xbc, 0x79, 0x31, 0xe0, 0x6e, 0xe5, 0xc2,
- 0x7c, 0x9b, 0x39, 0x2b, 0x4a, 0xe9, 0x48, 0x44,
- 0x73, 0xf5, 0x54, 0xe6, 0x67, 0x8e, 0xcc, 0x2e },
- .expected_ss = (u8[32]){ 0x64, 0x7a, 0x46, 0xb6, 0xfc, 0x3f, 0x40, 0xd6,
- 0x21, 0x41, 0xee, 0x3c, 0xee, 0x70, 0x6b, 0x4d,
- 0x7a, 0x92, 0x71, 0x59, 0x3a, 0x7b, 0x14, 0x3e,
- 0x8e, 0x2e, 0x22, 0x79, 0x88, 0x3e, 0x45, 0x50 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - checking for overflow */
-{
- .secret = (u8[32]){ 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d,
- 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d,
- 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c,
- 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 },
- .b_public = (u8[32]){ 0x64, 0xae, 0xac, 0x25, 0x04, 0x14, 0x48, 0x61,
- 0x53, 0x2b, 0x7b, 0xbc, 0xb6, 0xc8, 0x7d, 0x67,
- 0xdd, 0x4c, 0x1f, 0x07, 0xeb, 0xc2, 0xe0, 0x6e,
- 0xff, 0xb9, 0x5a, 0xec, 0xc6, 0x17, 0x0b, 0x2c },
- .expected_ss = (u8[32]){ 0x4f, 0xf0, 0x3d, 0x5f, 0xb4, 0x3c, 0xd8, 0x65,
- 0x7a, 0x3c, 0xf3, 0x7c, 0x13, 0x8c, 0xad, 0xce,
- 0xcc, 0xe5, 0x09, 0xe4, 0xeb, 0xa0, 0x89, 0xd0,
- 0xef, 0x40, 0xb4, 0xe4, 0xfb, 0x94, 0x61, 0x55 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - checking for overflow */
-{
- .secret = (u8[32]){ 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d,
- 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d,
- 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c,
- 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 },
- .b_public = (u8[32]){ 0xbf, 0x68, 0xe3, 0x5e, 0x9b, 0xdb, 0x7e, 0xee,
- 0x1b, 0x50, 0x57, 0x02, 0x21, 0x86, 0x0f, 0x5d,
- 0xcd, 0xad, 0x8a, 0xcb, 0xab, 0x03, 0x1b, 0x14,
- 0x97, 0x4c, 0xc4, 0x90, 0x13, 0xc4, 0x98, 0x31 },
- .expected_ss = (u8[32]){ 0x21, 0xce, 0xe5, 0x2e, 0xfd, 0xbc, 0x81, 0x2e,
- 0x1d, 0x02, 0x1a, 0x4a, 0xf1, 0xe1, 0xd8, 0xbc,
- 0x4d, 0xb3, 0xc4, 0x00, 0xe4, 0xd2, 0xa2, 0xc5,
- 0x6a, 0x39, 0x26, 0xdb, 0x4d, 0x99, 0xc6, 0x5b },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - checking for overflow */
-{
- .secret = (u8[32]){ 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d,
- 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d,
- 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c,
- 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 },
- .b_public = (u8[32]){ 0x53, 0x47, 0xc4, 0x91, 0x33, 0x1a, 0x64, 0xb4,
- 0x3d, 0xdc, 0x68, 0x30, 0x34, 0xe6, 0x77, 0xf5,
- 0x3d, 0xc3, 0x2b, 0x52, 0xa5, 0x2a, 0x57, 0x7c,
- 0x15, 0xa8, 0x3b, 0xf2, 0x98, 0xe9, 0x9f, 0x19 },
- .expected_ss = (u8[32]){ 0x18, 0xcb, 0x89, 0xe4, 0xe2, 0x0c, 0x0c, 0x2b,
- 0xd3, 0x24, 0x30, 0x52, 0x45, 0x26, 0x6c, 0x93,
- 0x27, 0x69, 0x0b, 0xbe, 0x79, 0xac, 0xb8, 0x8f,
- 0x5b, 0x8f, 0xb3, 0xf7, 0x4e, 0xca, 0x3e, 0x52 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - private key == -1 (mod order) */
-{
- .secret = (u8[32]){ 0xa0, 0x23, 0xcd, 0xd0, 0x83, 0xef, 0x5b, 0xb8,
- 0x2f, 0x10, 0xd6, 0x2e, 0x59, 0xe1, 0x5a, 0x68,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x50 },
- .b_public = (u8[32]){ 0x25, 0x8e, 0x04, 0x52, 0x3b, 0x8d, 0x25, 0x3e,
- 0xe6, 0x57, 0x19, 0xfc, 0x69, 0x06, 0xc6, 0x57,
- 0x19, 0x2d, 0x80, 0x71, 0x7e, 0xdc, 0x82, 0x8f,
- 0xa0, 0xaf, 0x21, 0x68, 0x6e, 0x2f, 0xaa, 0x75 },
- .expected_ss = (u8[32]){ 0x25, 0x8e, 0x04, 0x52, 0x3b, 0x8d, 0x25, 0x3e,
- 0xe6, 0x57, 0x19, 0xfc, 0x69, 0x06, 0xc6, 0x57,
- 0x19, 0x2d, 0x80, 0x71, 0x7e, 0xdc, 0x82, 0x8f,
- 0xa0, 0xaf, 0x21, 0x68, 0x6e, 0x2f, 0xaa, 0x75 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-},
-/* wycheproof - private key == 1 (mod order) on twist */
-{
- .secret = (u8[32]){ 0x58, 0x08, 0x3d, 0xd2, 0x61, 0xad, 0x91, 0xef,
- 0xf9, 0x52, 0x32, 0x2e, 0xc8, 0x24, 0xc6, 0x82,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x5f },
- .b_public = (u8[32]){ 0x2e, 0xae, 0x5e, 0xc3, 0xdd, 0x49, 0x4e, 0x9f,
- 0x2d, 0x37, 0xd2, 0x58, 0xf8, 0x73, 0xa8, 0xe6,
- 0xe9, 0xd0, 0xdb, 0xd1, 0xe3, 0x83, 0xef, 0x64,
- 0xd9, 0x8b, 0xb9, 0x1b, 0x3e, 0x0b, 0xe0, 0x35 },
- .expected_ss = (u8[32]){ 0x2e, 0xae, 0x5e, 0xc3, 0xdd, 0x49, 0x4e, 0x9f,
- 0x2d, 0x37, 0xd2, 0x58, 0xf8, 0x73, 0xa8, 0xe6,
- 0xe9, 0xd0, 0xdb, 0xd1, 0xe3, 0x83, 0xef, 0x64,
- 0xd9, 0x8b, 0xb9, 0x1b, 0x3e, 0x0b, 0xe0, 0x35 },
- .secret_size = 32,
- .b_public_size = 32,
- .expected_ss_size = 32,
-
-}
-};
-
static const struct kpp_testvec ecdh_p192_tv_template[] = {
{
.secret =
diff --git a/drivers/accel/habanalabs/common/memory.c b/drivers/accel/habanalabs/common/memory.c
index 601fdbe70179..61472a381904 100644
--- a/drivers/accel/habanalabs/common/memory.c
+++ b/drivers/accel/habanalabs/common/memory.c
@@ -1829,9 +1829,6 @@ static void hl_release_dmabuf(struct dma_buf *dmabuf)
struct hl_dmabuf_priv *hl_dmabuf = dmabuf->priv;
struct hl_ctx *ctx;
- if (!hl_dmabuf)
- return;
-
ctx = hl_dmabuf->ctx;
if (hl_dmabuf->memhash_hnode)
@@ -1859,7 +1856,12 @@ static int export_dmabuf(struct hl_ctx *ctx,
{
DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
struct hl_device *hdev = ctx->hdev;
- int rc, fd;
+ CLASS(get_unused_fd, fd)(flags);
+
+ if (fd < 0) {
+ dev_err(hdev->dev, "failed to get a file descriptor for a dma-buf, %d\n", fd);
+ return fd;
+ }
exp_info.ops = &habanalabs_dmabuf_ops;
exp_info.size = total_size;
@@ -1872,13 +1874,6 @@ static int export_dmabuf(struct hl_ctx *ctx,
return PTR_ERR(hl_dmabuf->dmabuf);
}
- fd = dma_buf_fd(hl_dmabuf->dmabuf, flags);
- if (fd < 0) {
- dev_err(hdev->dev, "failed to get a file descriptor for a dma-buf, %d\n", fd);
- rc = fd;
- goto err_dma_buf_put;
- }
-
hl_dmabuf->ctx = ctx;
hl_ctx_get(hl_dmabuf->ctx);
atomic_inc(&ctx->hdev->dmabuf_export_cnt);
@@ -1890,13 +1885,9 @@ static int export_dmabuf(struct hl_ctx *ctx,
get_file(ctx->hpriv->file_priv->filp);
*dmabuf_fd = fd;
+ fd_install(take_fd(fd), hl_dmabuf->dmabuf->file);
return 0;
-
-err_dma_buf_put:
- hl_dmabuf->dmabuf->priv = NULL;
- dma_buf_put(hl_dmabuf->dmabuf);
- return rc;
}
static int validate_export_params_common(struct hl_device *hdev, u64 addr, u64 size, u64 offset)
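The export_dmabuf() rework above switches to scope-based cleanup: the file descriptor is reserved up front with CLASS(get_unused_fd, ...), error paths give it back automatically, and only the success path publishes it with fd_install(take_fd(fd), ...). A rough user-space analogue of that reserve-early/disarm-on-success shape, assuming only the GCC/Clang cleanup attribute (all names below are illustrative, not part of the driver):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Close the descriptor on scope exit unless it was handed out (set to -1). */
static void put_fd(int *fd)
{
	if (*fd >= 0)
		close(*fd);
}

static int export_object(int *out_fd)
{
	__attribute__((cleanup(put_fd))) int fd = open("/dev/null", O_RDWR);

	if (fd < 0)
		return -1;

	/*
	 * ... build the object the descriptor will refer to; any early
	 * return from here on closes fd automatically ...
	 */

	*out_fd = fd;	/* success: publish the descriptor ... */
	fd = -1;	/* ... and disarm the cleanup */
	return 0;
}

int main(void)
{
	int fd;

	if (export_object(&fd) == 0) {
		printf("exported as fd %d\n", fd);
		close(fd);
	}
	return 0;
}

The point of the ordering is that nothing user-visible happens until the object behind the descriptor actually exists, which is exactly why the driver now installs the dma-buf file last.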
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index a38b88baadf2..5722e4128d3c 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -10437,7 +10437,7 @@ end:
(u64 *)(lin_dma_pkts_arr), DEBUGFS_WRITE64);
WREG32(sob_addr, 0);
- kfree(lin_dma_pkts_arr);
+ kvfree(lin_dma_pkts_arr);
return rc;
}
diff --git a/drivers/accel/ivpu/ivpu_drv.c b/drivers/accel/ivpu/ivpu_drv.c
index 3d6d52492536..3289751b4757 100644
--- a/drivers/accel/ivpu/ivpu_drv.c
+++ b/drivers/accel/ivpu/ivpu_drv.c
@@ -677,7 +677,7 @@ static void ivpu_bo_unbind_all_user_contexts(struct ivpu_device *vdev)
static void ivpu_dev_fini(struct ivpu_device *vdev)
{
ivpu_jobs_abort_all(vdev);
- ivpu_pm_cancel_recovery(vdev);
+ ivpu_pm_disable_recovery(vdev);
ivpu_pm_disable(vdev);
ivpu_prepare_for_reset(vdev);
ivpu_shutdown(vdev);
diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c
index eacda1dbe840..475ddc94f1cf 100644
--- a/drivers/accel/ivpu/ivpu_pm.c
+++ b/drivers/accel/ivpu/ivpu_pm.c
@@ -417,10 +417,10 @@ void ivpu_pm_init(struct ivpu_device *vdev)
ivpu_dbg(vdev, PM, "Autosuspend delay = %d\n", delay);
}
-void ivpu_pm_cancel_recovery(struct ivpu_device *vdev)
+void ivpu_pm_disable_recovery(struct ivpu_device *vdev)
{
drm_WARN_ON(&vdev->drm, delayed_work_pending(&vdev->pm->job_timeout_work));
- cancel_work_sync(&vdev->pm->recovery_work);
+ disable_work_sync(&vdev->pm->recovery_work);
}
void ivpu_pm_enable(struct ivpu_device *vdev)
diff --git a/drivers/accel/ivpu/ivpu_pm.h b/drivers/accel/ivpu/ivpu_pm.h
index 89b264cc0e3e..a2aa7a27f32e 100644
--- a/drivers/accel/ivpu/ivpu_pm.h
+++ b/drivers/accel/ivpu/ivpu_pm.h
@@ -25,7 +25,7 @@ struct ivpu_pm_info {
void ivpu_pm_init(struct ivpu_device *vdev);
void ivpu_pm_enable(struct ivpu_device *vdev);
void ivpu_pm_disable(struct ivpu_device *vdev);
-void ivpu_pm_cancel_recovery(struct ivpu_device *vdev);
+void ivpu_pm_disable_recovery(struct ivpu_device *vdev);
int ivpu_pm_suspend_cb(struct device *dev);
int ivpu_pm_resume_cb(struct device *dev);
diff --git a/drivers/acpi/apei/einj-core.c b/drivers/acpi/apei/einj-core.c
index bf8dc92a373a..2561b045acc7 100644
--- a/drivers/acpi/apei/einj-core.c
+++ b/drivers/acpi/apei/einj-core.c
@@ -315,7 +315,7 @@ static void __iomem *einj_get_parameter_address(void)
memcpy_fromio(&v5param, p, v5param_size);
acpi5 = 1;
check_vendor_extension(pa_v5, &v5param);
- if (available_error_type & ACPI65_EINJV2_SUPP) {
+ if (is_v2 && available_error_type & ACPI65_EINJV2_SUPP) {
len = v5param.einjv2_struct.length;
offset = offsetof(struct einjv2_extension_struct, component_arr);
max_nr_components = (len - offset) /
@@ -540,6 +540,9 @@ static int __einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
struct set_error_type_with_address *v5param;
v5param = kmalloc(v5param_size, GFP_KERNEL);
+ if (!v5param)
+ return -ENOMEM;
+
memcpy_fromio(v5param, einj_param, v5param_size);
v5param->type = type;
if (type & ACPI5_VENDOR_BIT) {
@@ -1091,7 +1094,7 @@ err_put_table:
return rc;
}
-static void __exit einj_remove(struct faux_device *fdev)
+static void einj_remove(struct faux_device *fdev)
{
struct apei_exec_context ctx;
@@ -1114,15 +1117,9 @@ static void __exit einj_remove(struct faux_device *fdev)
}
static struct faux_device *einj_dev;
-/*
- * einj_remove() lives in .exit.text. For drivers registered via
- * platform_driver_probe() this is ok because they cannot get unbound at
- * runtime. So mark the driver struct with __refdata to prevent modpost
- * triggering a section mismatch warning.
- */
-static struct faux_device_ops einj_device_ops __refdata = {
+static struct faux_device_ops einj_device_ops = {
.probe = einj_probe,
- .remove = __exit_p(einj_remove),
+ .remove = einj_remove,
};
static int __init einj_init(void)
diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index 98759d6199d3..65f0f56ad753 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -937,8 +937,10 @@ static u32 *iort_rmr_alloc_sids(u32 *sids, u32 count, u32 id_start,
new_sids = krealloc_array(sids, count + new_count,
sizeof(*new_sids), GFP_KERNEL);
- if (!new_sids)
+ if (!new_sids) {
+ kfree(sids);
return NULL;
+ }
for (i = count; i < total_count; i++)
new_sids[i] = id_start++;
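The iort_rmr_alloc_sids() hunk fixes the usual grow-an-array pitfall: krealloc_array() returns NULL on failure but does not free the original buffer, so the caller has to. A minimal user-space sketch of the same rule with plain realloc() (hypothetical helper name):

#include <stdlib.h>
#include <string.h>

/* Grow ids[] from count to count + extra entries; frees the old array on failure. */
static unsigned int *grow_ids(unsigned int *ids, size_t count, size_t extra)
{
	unsigned int *tmp = realloc(ids, (count + extra) * sizeof(*ids));

	if (!tmp) {
		free(ids);	/* realloc() left the old block allocated */
		return NULL;
	}
	memset(tmp + count, 0, extra * sizeof(*tmp));
	return tmp;
}

int main(void)
{
	unsigned int *ids = grow_ids(NULL, 0, 4);

	free(ids);
	return 0;
}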
diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c
index 75c7db8b156a..7855bbf752b1 100644
--- a/drivers/acpi/ec.c
+++ b/drivers/acpi/ec.c
@@ -2033,7 +2033,7 @@ void __init acpi_ec_ecdt_probe(void)
goto out;
}
- if (!strstarts(ecdt_ptr->id, "\\")) {
+ if (!strlen(ecdt_ptr->id)) {
/*
* The ECDT table on some MSI notebooks contains invalid data, together
* with an empty ID string ("").
@@ -2042,9 +2042,13 @@ void __init acpi_ec_ecdt_probe(void)
* a "fully qualified reference to the (...) embedded controller device",
* so this string always has to start with a backslash.
*
- * By verifying this we can avoid such faulty ECDT tables in a safe way.
+ * However, some ThinkBook machines have an ECDT table with a valid EC
+ * description but an invalid ID string ("_SB.PC00.LPCB.EC0").
+ *
+ * Because of this we only check if the ID string is empty in order to
+ * avoid the obvious cases.
*/
- pr_err(FW_BUG "Ignoring ECDT due to invalid ID string \"%s\"\n", ecdt_ptr->id);
+ pr_err(FW_BUG "Ignoring ECDT due to empty ID string\n");
goto out;
}
diff --git a/drivers/acpi/pfr_update.c b/drivers/acpi/pfr_update.c
index 318683744ed1..11b1c2828005 100644
--- a/drivers/acpi/pfr_update.c
+++ b/drivers/acpi/pfr_update.c
@@ -329,7 +329,7 @@ static bool applicable_image(const void *data, struct pfru_update_cap_info *cap,
if (type == PFRU_CODE_INJECT_TYPE)
return payload_hdr->rt_ver >= cap->code_rt_version;
- return payload_hdr->rt_ver >= cap->drv_rt_version;
+ return payload_hdr->svn_ver >= cap->drv_svn;
}
static void print_update_debug_info(struct pfru_updated_result *result,
diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c
index 755003bf3a45..8972446b7162 100644
--- a/drivers/acpi/processor_perflib.c
+++ b/drivers/acpi/processor_perflib.c
@@ -180,7 +180,7 @@ void acpi_processor_ppc_init(struct cpufreq_policy *policy)
struct acpi_processor *pr = per_cpu(processors, cpu);
int ret;
- if (!pr || !pr->performance)
+ if (!pr)
continue;
/*
@@ -197,6 +197,9 @@ void acpi_processor_ppc_init(struct cpufreq_policy *policy)
pr_err("Failed to add freq constraint for CPU%d (%d)\n",
cpu, ret);
+ if (!pr->performance)
+ continue;
+
ret = acpi_processor_get_platform_limit(pr);
if (ret)
pr_err("Failed to update freq constraint for CPU%d (%d)\n",
diff --git a/drivers/acpi/riscv/cppc.c b/drivers/acpi/riscv/cppc.c
index 440cf9fb91aa..42c1a9052470 100644
--- a/drivers/acpi/riscv/cppc.c
+++ b/drivers/acpi/riscv/cppc.c
@@ -119,7 +119,7 @@ int cpc_read_ffh(int cpu, struct cpc_reg *reg, u64 *val)
*val = data.ret.value;
- return (data.ret.error) ? sbi_err_map_linux_errno(data.ret.error) : 0;
+ return data.ret.error;
}
return -EINVAL;
@@ -148,7 +148,7 @@ int cpc_write_ffh(int cpu, struct cpc_reg *reg, u64 val)
smp_call_function_single(cpu, cppc_ffh_csr_write, &data, 1);
- return (data.ret.error) ? sbi_err_map_linux_errno(data.ret.error) : 0;
+ return data.ret.error;
}
return -EINVAL;
diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index e1c24bbacf64..7a7f88b3fa2b 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -689,40 +689,50 @@ MODULE_PARM_DESC(mask_port_map,
"where <pci_dev> is the PCI ID of an AHCI controller in the "
"form \"domain:bus:dev.func\"");
-static void ahci_apply_port_map_mask(struct device *dev,
- struct ahci_host_priv *hpriv, char *mask_s)
+static char *ahci_mask_port_ext;
+module_param_named(mask_port_ext, ahci_mask_port_ext, charp, 0444);
+MODULE_PARM_DESC(mask_port_ext,
+ "32-bits mask to ignore the external/hotplug capability of ports. "
+ "Valid values are: "
+ "\"<mask>\" to apply the same mask to all AHCI controller "
+ "devices, and \"<pci_dev>=<mask>,<pci_dev>=<mask>,...\" to "
+ "specify different masks for the controllers specified, "
+ "where <pci_dev> is the PCI ID of an AHCI controller in the "
+ "form \"domain:bus:dev.func\"");
+
+static u32 ahci_port_mask(struct device *dev, char *mask_s)
{
unsigned int mask;
if (kstrtouint(mask_s, 0, &mask)) {
dev_err(dev, "Invalid port map mask\n");
- return;
+ return 0;
}
- hpriv->mask_port_map = mask;
+ return mask;
}
-static void ahci_get_port_map_mask(struct device *dev,
- struct ahci_host_priv *hpriv)
+static u32 ahci_get_port_mask(struct device *dev, char *mask_p)
{
char *param, *end, *str, *mask_s;
char *name;
+ u32 mask = 0;
- if (!strlen(ahci_mask_port_map))
- return;
+ if (!mask_p || !strlen(mask_p))
+ return 0;
- str = kstrdup(ahci_mask_port_map, GFP_KERNEL);
+ str = kstrdup(mask_p, GFP_KERNEL);
if (!str)
- return;
+ return 0;
/* Handle single mask case */
if (!strchr(str, '=')) {
- ahci_apply_port_map_mask(dev, hpriv, str);
+ mask = ahci_port_mask(dev, str);
goto free;
}
/*
- * Mask list case: parse the parameter to apply the mask only if
+ * Mask list case: parse the parameter to get the mask only if
* the device name matches.
*/
param = str;
@@ -752,11 +762,13 @@ static void ahci_get_port_map_mask(struct device *dev,
param++;
}
- ahci_apply_port_map_mask(dev, hpriv, mask_s);
+ mask = ahci_port_mask(dev, mask_s);
}
free:
kfree(str);
+
+ return mask;
}
static void ahci_pci_save_initial_config(struct pci_dev *pdev,
@@ -782,8 +794,10 @@ static void ahci_pci_save_initial_config(struct pci_dev *pdev,
}
/* Handle port map masks passed as module parameter. */
- if (ahci_mask_port_map)
- ahci_get_port_map_mask(&pdev->dev, hpriv);
+ hpriv->mask_port_map =
+ ahci_get_port_mask(&pdev->dev, ahci_mask_port_map);
+ hpriv->mask_port_ext =
+ ahci_get_port_mask(&pdev->dev, ahci_mask_port_ext);
ahci_save_initial_config(&pdev->dev, hpriv);
}
@@ -1757,11 +1771,20 @@ static void ahci_mark_external_port(struct ata_port *ap)
void __iomem *port_mmio = ahci_port_base(ap);
u32 tmp;
- /* mark external ports (hotplug-capable, eSATA) */
+ /*
+ * Mark external ports (hotplug-capable, eSATA), unless we were asked to
+ * ignore this feature.
+ */
tmp = readl(port_mmio + PORT_CMD);
if (((tmp & PORT_CMD_ESP) && (hpriv->cap & HOST_CAP_SXS)) ||
- (tmp & PORT_CMD_HPCP))
+ (tmp & PORT_CMD_HPCP)) {
+ if (hpriv->mask_port_ext & (1U << ap->port_no)) {
+ ata_port_info(ap,
+ "Ignoring external/hotplug capability\n");
+ return;
+ }
ap->pflags |= ATA_PFLAG_EXTERNAL;
+ }
}
static void ahci_update_initial_lpm_policy(struct ata_port *ap)
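The new mask_port_ext parameter reuses the mask_port_map syntax described above: either one mask applied to every controller, or a comma-separated list of "<pci_dev>=<mask>" pairs matched against the controller's PCI name. A rough user-space sketch of that lookup, using a hypothetical helper and making no attempt to mirror the driver's exact parsing:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Look up the mask for dev_name in "<dev>=<mask>,<dev>=<mask>,...", or return 0. */
static unsigned long mask_for_device(const char *param, const char *dev_name)
{
	unsigned long mask = 0;
	char *str, *entry;

	if (!param || !*param)
		return 0;

	str = strdup(param);
	if (!str)
		return 0;

	/* Single-mask form: no '=' anywhere, the mask applies to every device. */
	if (!strchr(str, '=')) {
		mask = strtoul(str, NULL, 0);
		goto out;
	}

	for (entry = strtok(str, ","); entry; entry = strtok(NULL, ",")) {
		char *eq = strchr(entry, '=');

		if (!eq)
			continue;
		*eq = '\0';
		if (strcmp(entry, dev_name) == 0) {
			mask = strtoul(eq + 1, NULL, 0);
			break;
		}
	}
out:
	free(str);
	return mask;
}

int main(void)
{
	printf("0x%lx\n", mask_for_device("0000:00:17.0=0x3,0000:01:00.0=0x1",
					 "0000:00:17.0"));
	return 0;
}

An unlisted controller simply gets a mask of 0, which matches how the driver treats "no mask given".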
diff --git a/drivers/ata/ahci.h b/drivers/ata/ahci.h
index 2c10c8f440d1..293b7fb216b5 100644
--- a/drivers/ata/ahci.h
+++ b/drivers/ata/ahci.h
@@ -330,6 +330,7 @@ struct ahci_host_priv {
/* Input fields */
unsigned int flags; /* AHCI_HFLAG_* */
u32 mask_port_map; /* Mask of valid ports */
+ u32 mask_port_ext; /* Mask of ports ext capability */
void __iomem * mmio; /* bus-independent mem map */
u32 cap; /* cap to use */
diff --git a/drivers/ata/ahci_xgene.c b/drivers/ata/ahci_xgene.c
index 5d5a51a77f5d..6b8844646fcd 100644
--- a/drivers/ata/ahci_xgene.c
+++ b/drivers/ata/ahci_xgene.c
@@ -450,7 +450,6 @@ static int xgene_ahci_pmp_softreset(struct ata_link *link, unsigned int *class,
{
int pmp = sata_srst_pmp(link);
struct ata_port *ap = link->ap;
- u32 rc;
void __iomem *port_mmio = ahci_port_base(ap);
u32 port_fbs;
@@ -463,9 +462,7 @@ static int xgene_ahci_pmp_softreset(struct ata_link *link, unsigned int *class,
port_fbs |= pmp << PORT_FBS_DEV_OFFSET;
writel(port_fbs, port_mmio + PORT_FBS);
- rc = ahci_do_softreset(link, class, pmp, deadline, ahci_check_ready);
-
- return rc;
+ return ahci_do_softreset(link, class, pmp, deadline, ahci_check_ready);
}
/**
@@ -500,7 +497,7 @@ static int xgene_ahci_softreset(struct ata_link *link, unsigned int *class,
u32 port_fbs;
u32 port_fbs_save;
u32 retry = 1;
- u32 rc;
+ int rc;
port_fbs_save = readl(port_mmio + PORT_FBS);
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 2946ae6d4b2c..2586e77ebf45 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -2075,7 +2075,7 @@ out:
* Check if a link is established. This is a relaxed version of
* ata_phys_link_online() which accounts for the fact that this is potentially
* called after changing the link power management policy, which may not be
- * reflected immediately in the SSTAUS register (e.g., we may still be seeing
+ * reflected immediately in the SStatus register (e.g., we may still be seeing
 * the PHY in the partial, slumber or devsleep power management states).
* So check that:
* - A device is still present, that is, DET is 1h (Device presence detected
@@ -2089,8 +2089,13 @@ static bool ata_eh_link_established(struct ata_link *link)
u32 sstatus;
u8 det, ipm;
+ /*
+ * For old IDE/PATA adapters that do not have a valid scr_read method,
+ * or if reading the SStatus register fails, assume that the device is
+ * present. Device probe will determine if that is really the case.
+ */
if (sata_scr_read(link, SCR_STATUS, &sstatus))
- return false;
+ return true;
det = sstatus & 0x0f;
ipm = (sstatus >> 8) & 0x0f;
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 57f674f51b0c..2ded5e476d6e 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -3904,21 +3904,16 @@ static int ata_mselect_control_ata_feature(struct ata_queued_cmd *qc,
/* Check cdl_ctrl */
switch (buf[0] & 0x03) {
case 0:
- /* Disable CDL if it is enabled */
- if (!(dev->flags & ATA_DFLAG_CDL_ENABLED))
- return 0;
+ /* Disable CDL */
ata_dev_dbg(dev, "Disabling CDL\n");
cdl_action = 0;
dev->flags &= ~ATA_DFLAG_CDL_ENABLED;
break;
case 0x02:
/*
- * Enable CDL if not already enabled. Since this is mutually
- * exclusive with NCQ priority, allow this only if NCQ priority
- * is disabled.
+ * Enable CDL. Since CDL is mutually exclusive with NCQ
+ * priority, allow this only if NCQ priority is disabled.
*/
- if (dev->flags & ATA_DFLAG_CDL_ENABLED)
- return 0;
if (dev->flags & ATA_DFLAG_NCQ_PRIO_ENABLED) {
ata_dev_err(dev,
"NCQ priority must be disabled to enable CDL\n");
diff --git a/drivers/atm/atmtcp.c b/drivers/atm/atmtcp.c
index eeae160c898d..fa3c76a2b49d 100644
--- a/drivers/atm/atmtcp.c
+++ b/drivers/atm/atmtcp.c
@@ -279,6 +279,19 @@ static struct atm_vcc *find_vcc(struct atm_dev *dev, short vpi, int vci)
return NULL;
}
+static int atmtcp_c_pre_send(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+ struct atmtcp_hdr *hdr;
+
+ if (skb->len < sizeof(struct atmtcp_hdr))
+ return -EINVAL;
+
+ hdr = (struct atmtcp_hdr *)skb->data;
+ if (hdr->length == ATMTCP_HDR_MAGIC)
+ return -EINVAL;
+
+ return 0;
+}
static int atmtcp_c_send(struct atm_vcc *vcc,struct sk_buff *skb)
{
@@ -288,9 +301,6 @@ static int atmtcp_c_send(struct atm_vcc *vcc,struct sk_buff *skb)
struct sk_buff *new_skb;
int result = 0;
- if (skb->len < sizeof(struct atmtcp_hdr))
- goto done;
-
dev = vcc->dev_data;
hdr = (struct atmtcp_hdr *) skb->data;
if (hdr->length == ATMTCP_HDR_MAGIC) {
@@ -347,6 +357,7 @@ static const struct atmdev_ops atmtcp_v_dev_ops = {
static const struct atmdev_ops atmtcp_c_dev_ops = {
.close = atmtcp_c_close,
+ .pre_send = atmtcp_c_pre_send,
.send = atmtcp_c_send
};
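The new pre_send hook moves validation ahead of the send path: a buffer shorter than the protocol header, or one carrying the control magic in its length field, is rejected before anything interprets it. The same check-length-before-parsing pattern in plain C; the header layout and magic value below are placeholders, not the real atmtcp definitions:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define HDR_MAGIC 0xffffffffu		/* stand-in for the protocol's control magic */

struct proto_hdr {			/* illustrative layout, not the real atmtcp_hdr */
	uint16_t vpi;
	uint16_t vci;
	uint32_t length;
};

/* Return 0 if the buffer may be forwarded, -1 if it must be rejected early. */
static int pre_send_check(const unsigned char *buf, size_t len)
{
	struct proto_hdr hdr;

	if (len < sizeof(hdr))
		return -1;		/* too short to even carry a header */

	memcpy(&hdr, buf, sizeof(hdr));	/* copy out instead of casting the raw buffer */
	if (hdr.length == HDR_MAGIC)
		return -1;		/* control frames take a different path */

	return 0;
}

int main(void)
{
	unsigned char runt[4] = { 0 };
	unsigned char ok[8] = { 0 };

	printf("%d %d\n", pre_send_check(runt, sizeof(runt)),
	       pre_send_check(ok, sizeof(ok)));
	return 0;
}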
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index efc575a00edd..008da0354fba 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -603,6 +603,7 @@ CPU_SHOW_VULN_FALLBACK(ghostwrite);
CPU_SHOW_VULN_FALLBACK(old_microcode);
CPU_SHOW_VULN_FALLBACK(indirect_target_selection);
CPU_SHOW_VULN_FALLBACK(tsa);
+CPU_SHOW_VULN_FALLBACK(vmscape);
static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
@@ -622,6 +623,7 @@ static DEVICE_ATTR(ghostwrite, 0444, cpu_show_ghostwrite, NULL);
static DEVICE_ATTR(old_microcode, 0444, cpu_show_old_microcode, NULL);
static DEVICE_ATTR(indirect_target_selection, 0444, cpu_show_indirect_target_selection, NULL);
static DEVICE_ATTR(tsa, 0444, cpu_show_tsa, NULL);
+static DEVICE_ATTR(vmscape, 0444, cpu_show_vmscape, NULL);
static struct attribute *cpu_root_vulnerabilities_attrs[] = {
&dev_attr_meltdown.attr,
@@ -642,6 +644,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = {
&dev_attr_old_microcode.attr,
&dev_attr_indirect_target_selection.attr,
&dev_attr_tsa.attr,
+ &dev_attr_vmscape.attr,
NULL
};
diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index 31bfb3194b4c..9d4e46ad8352 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -176,7 +176,7 @@ static int dev_mkdir(const char *name, umode_t mode)
struct dentry *dentry;
struct path path;
- dentry = kern_path_create(AT_FDCWD, name, &path, LOOKUP_DIRECTORY);
+ dentry = start_creating_path(AT_FDCWD, name, &path, LOOKUP_DIRECTORY);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
@@ -184,7 +184,7 @@ static int dev_mkdir(const char *name, umode_t mode)
if (!IS_ERR(dentry))
/* mark as kernel-created inode */
d_inode(dentry)->i_private = &thread;
- done_path_create(&path, dentry);
+ end_creating_path(&path, dentry);
return PTR_ERR_OR_ZERO(dentry);
}
@@ -222,10 +222,10 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid,
struct path path;
int err;
- dentry = kern_path_create(AT_FDCWD, nodename, &path, 0);
+ dentry = start_creating_path(AT_FDCWD, nodename, &path, 0);
if (dentry == ERR_PTR(-ENOENT)) {
create_path(nodename);
- dentry = kern_path_create(AT_FDCWD, nodename, &path, 0);
+ dentry = start_creating_path(AT_FDCWD, nodename, &path, 0);
}
if (IS_ERR(dentry))
return PTR_ERR(dentry);
@@ -246,7 +246,7 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid,
/* mark as kernel-created inode */
d_inode(dentry)->i_private = &thread;
}
- done_path_create(&path, dentry);
+ end_creating_path(&path, dentry);
return err;
}
@@ -256,7 +256,7 @@ static int dev_rmdir(const char *name)
struct dentry *dentry;
int err;
- dentry = kern_path_locked(name, &parent);
+ dentry = start_removing_path(name, &parent);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
if (d_inode(dentry)->i_private == &thread)
@@ -265,9 +265,7 @@ static int dev_rmdir(const char *name)
else
err = -EPERM;
- dput(dentry);
- inode_unlock(d_inode(parent.dentry));
- path_put(&parent);
+ end_removing_path(&parent, dentry);
return err;
}
@@ -325,7 +323,7 @@ static int handle_remove(const char *nodename, struct device *dev)
int deleted = 0;
int err = 0;
- dentry = kern_path_locked(nodename, &parent);
+ dentry = start_removing_path(nodename, &parent);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
@@ -349,10 +347,8 @@ static int handle_remove(const char *nodename, struct device *dev)
if (!err || err == -ENOENT)
deleted = 1;
}
- dput(dentry);
- inode_unlock(d_inode(parent.dentry));
+ end_removing_path(&parent, dentry);
- path_put(&parent);
if (deleted && strchr(nodename, '/'))
delete_path(nodename);
return err;
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index dbf5456cd891..2ea6e05e6ec9 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -675,7 +675,7 @@ static void dpm_async_resume_subordinate(struct device *dev, async_func_t func)
idx = device_links_read_lock();
/* Start processing the device's "async" consumers. */
- list_for_each_entry_rcu(link, &dev->links.consumers, s_node)
+ list_for_each_entry_rcu_locked(link, &dev->links.consumers, s_node)
if (READ_ONCE(link->status) != DL_STATE_DORMANT)
dpm_async_with_cleanup(link->consumer, func);
@@ -1330,7 +1330,7 @@ static void dpm_async_suspend_superior(struct device *dev, async_func_t func)
idx = device_links_read_lock();
/* Start processing the device's "async" suppliers. */
- list_for_each_entry_rcu(link, &dev->links.suppliers, c_node)
+ list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node)
if (READ_ONCE(link->status) != DL_STATE_DORMANT)
dpm_async_with_cleanup(link->supplier, func);
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index e21492981f7d..f6d6276974ee 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -380,6 +380,9 @@ enum {
/* this is/was a write request */
__EE_WRITE,
+ /* hand back using mempool_free(e, drbd_buffer_page_pool) */
+ __EE_RELEASE_TO_MEMPOOL,
+
/* this is/was a write same request */
__EE_WRITE_SAME,
@@ -402,6 +405,7 @@ enum {
#define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE)
#define EE_SUBMITTED (1<<__EE_SUBMITTED)
#define EE_WRITE (1<<__EE_WRITE)
+#define EE_RELEASE_TO_MEMPOOL (1<<__EE_RELEASE_TO_MEMPOOL)
#define EE_WRITE_SAME (1<<__EE_WRITE_SAME)
#define EE_APPLICATION (1<<__EE_APPLICATION)
#define EE_RS_THIN_REQ (1<<__EE_RS_THIN_REQ)
@@ -858,7 +862,6 @@ struct drbd_device {
struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */
struct list_head done_ee; /* need to send P_WRITE_ACK */
struct list_head read_ee; /* [RS]P_DATA_REQUEST being read */
- struct list_head net_ee; /* zero-copy network send in progress */
struct list_head resync_reads;
atomic_t pp_in_use; /* allocated from page pool */
@@ -1329,24 +1332,6 @@ extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
extern mempool_t drbd_request_mempool;
extern mempool_t drbd_ee_mempool;
-/* drbd's page pool, used to buffer data received from the peer,
- * or data requested by the peer.
- *
- * This does not have an emergency reserve.
- *
- * When allocating from this pool, it first takes pages from the pool.
- * Only if the pool is depleted will try to allocate from the system.
- *
- * The assumption is that pages taken from this pool will be processed,
- * and given back, "quickly", and then can be recycled, so we can avoid
- * frequent calls to alloc_page(), and still will be able to make progress even
- * under memory pressure.
- */
-extern struct page *drbd_pp_pool;
-extern spinlock_t drbd_pp_lock;
-extern int drbd_pp_vacant;
-extern wait_queue_head_t drbd_pp_wait;
-
/* We also need a standard (emergency-reserve backed) page pool
* for meta data IO (activity log, bitmap).
* We can keep it global, as long as it is used as "N pages at a time".
@@ -1354,6 +1339,7 @@ extern wait_queue_head_t drbd_pp_wait;
*/
#define DRBD_MIN_POOL_PAGES 128
extern mempool_t drbd_md_io_page_pool;
+extern mempool_t drbd_buffer_page_pool;
/* We also need to make sure we get a bio
* when we need it for housekeeping purposes */
@@ -1488,10 +1474,7 @@ extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *,
sector_t, unsigned int,
unsigned int,
gfp_t) __must_hold(local);
-extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *,
- int);
-#define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0)
-#define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1)
+extern void drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *req);
extern struct page *drbd_alloc_pages(struct drbd_peer_device *, unsigned int, bool);
extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed);
extern int drbd_connected(struct drbd_peer_device *);
@@ -1610,16 +1593,6 @@ static inline struct page *page_chain_next(struct page *page)
for (; page && ({ n = page_chain_next(page); 1; }); page = n)
-static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_req)
-{
- struct page *page = peer_req->pages;
- page_chain_for_each(page) {
- if (page_count(page) > 1)
- return 1;
- }
- return 0;
-}
-
static inline union drbd_state drbd_read_state(struct drbd_device *device)
{
struct drbd_resource *resource = device->resource;
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 52724b79be30..c73376886e7a 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -114,20 +114,10 @@ struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
mempool_t drbd_request_mempool;
mempool_t drbd_ee_mempool;
mempool_t drbd_md_io_page_pool;
+mempool_t drbd_buffer_page_pool;
struct bio_set drbd_md_io_bio_set;
struct bio_set drbd_io_bio_set;
-/* I do not use a standard mempool, because:
- 1) I want to hand out the pre-allocated objects first.
- 2) I want to be able to interrupt sleeping allocation with a signal.
- Note: This is a single linked list, the next pointer is the private
- member of struct page.
- */
-struct page *drbd_pp_pool;
-DEFINE_SPINLOCK(drbd_pp_lock);
-int drbd_pp_vacant;
-wait_queue_head_t drbd_pp_wait;
-
DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
static const struct block_device_operations drbd_ops = {
@@ -1611,6 +1601,7 @@ static int _drbd_send_zc_bio(struct drbd_peer_device *peer_device, struct bio *b
static int _drbd_send_zc_ee(struct drbd_peer_device *peer_device,
struct drbd_peer_request *peer_req)
{
+ bool use_sendpage = !(peer_req->flags & EE_RELEASE_TO_MEMPOOL);
struct page *page = peer_req->pages;
unsigned len = peer_req->i.size;
int err;
@@ -1619,8 +1610,13 @@ static int _drbd_send_zc_ee(struct drbd_peer_device *peer_device,
page_chain_for_each(page) {
unsigned l = min_t(unsigned, len, PAGE_SIZE);
- err = _drbd_send_page(peer_device, page, 0, l,
- page_chain_next(page) ? MSG_MORE : 0);
+ if (likely(use_sendpage))
+ err = _drbd_send_page(peer_device, page, 0, l,
+ page_chain_next(page) ? MSG_MORE : 0);
+ else
+ err = _drbd_no_send_page(peer_device, page, 0, l,
+ page_chain_next(page) ? MSG_MORE : 0);
+
if (err)
return err;
len -= l;
@@ -1962,7 +1958,6 @@ void drbd_init_set_defaults(struct drbd_device *device)
INIT_LIST_HEAD(&device->sync_ee);
INIT_LIST_HEAD(&device->done_ee);
INIT_LIST_HEAD(&device->read_ee);
- INIT_LIST_HEAD(&device->net_ee);
INIT_LIST_HEAD(&device->resync_reads);
INIT_LIST_HEAD(&device->resync_work.list);
INIT_LIST_HEAD(&device->unplug_work.list);
@@ -2043,7 +2038,6 @@ void drbd_device_cleanup(struct drbd_device *device)
D_ASSERT(device, list_empty(&device->sync_ee));
D_ASSERT(device, list_empty(&device->done_ee));
D_ASSERT(device, list_empty(&device->read_ee));
- D_ASSERT(device, list_empty(&device->net_ee));
D_ASSERT(device, list_empty(&device->resync_reads));
D_ASSERT(device, list_empty(&first_peer_device(device)->connection->sender_work.q));
D_ASSERT(device, list_empty(&device->resync_work.list));
@@ -2055,19 +2049,11 @@ void drbd_device_cleanup(struct drbd_device *device)
static void drbd_destroy_mempools(void)
{
- struct page *page;
-
- while (drbd_pp_pool) {
- page = drbd_pp_pool;
- drbd_pp_pool = (struct page *)page_private(page);
- __free_page(page);
- drbd_pp_vacant--;
- }
-
/* D_ASSERT(device, atomic_read(&drbd_pp_vacant)==0); */
bioset_exit(&drbd_io_bio_set);
bioset_exit(&drbd_md_io_bio_set);
+ mempool_exit(&drbd_buffer_page_pool);
mempool_exit(&drbd_md_io_page_pool);
mempool_exit(&drbd_ee_mempool);
mempool_exit(&drbd_request_mempool);
@@ -2086,9 +2072,8 @@ static void drbd_destroy_mempools(void)
static int drbd_create_mempools(void)
{
- struct page *page;
const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * drbd_minor_count;
- int i, ret;
+ int ret;
/* caches */
drbd_request_cache = kmem_cache_create(
@@ -2125,6 +2110,10 @@ static int drbd_create_mempools(void)
if (ret)
goto Enomem;
+ ret = mempool_init_page_pool(&drbd_buffer_page_pool, number, 0);
+ if (ret)
+ goto Enomem;
+
ret = mempool_init_slab_pool(&drbd_request_mempool, number,
drbd_request_cache);
if (ret)
@@ -2134,15 +2123,6 @@ static int drbd_create_mempools(void)
if (ret)
goto Enomem;
- for (i = 0; i < number; i++) {
- page = alloc_page(GFP_HIGHUSER);
- if (!page)
- goto Enomem;
- set_page_private(page, (unsigned long)drbd_pp_pool);
- drbd_pp_pool = page;
- }
- drbd_pp_vacant = number;
-
return 0;
Enomem:
@@ -2169,10 +2149,6 @@ static void drbd_release_all_peer_reqs(struct drbd_device *device)
rr = drbd_free_peer_reqs(device, &device->done_ee);
if (rr)
drbd_err(device, "%d EEs in done list found!\n", rr);
-
- rr = drbd_free_peer_reqs(device, &device->net_ee);
- if (rr)
- drbd_err(device, "%d EEs in net list found!\n", rr);
}
/* caution. no locking. */
@@ -2863,11 +2839,6 @@ static int __init drbd_init(void)
return err;
}
- /*
- * allocate all necessary structs
- */
- init_waitqueue_head(&drbd_pp_wait);
-
drbd_proc = NULL; /* play safe for drbd_cleanup */
idr_init(&drbd_devices);
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index e09930c2b226..91f3b8afb63c 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1330,6 +1330,7 @@ void drbd_reconsider_queue_parameters(struct drbd_device *device,
lim.max_write_zeroes_sectors = DRBD_MAX_BBIO_SECTORS;
else
lim.max_write_zeroes_sectors = 0;
+ lim.max_hw_wzeroes_unmap_sectors = 0;
if ((lim.discard_granularity >> SECTOR_SHIFT) >
lim.max_hw_discard_sectors) {
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 975024cf03c5..caaf2781136d 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -33,6 +33,7 @@
#include <linux/string.h>
#include <linux/scatterlist.h>
#include <linux/part_stat.h>
+#include <linux/mempool.h>
#include "drbd_int.h"
#include "drbd_protocol.h"
#include "drbd_req.h"
@@ -63,182 +64,31 @@ static int e_end_block(struct drbd_work *, int);
#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
-/*
- * some helper functions to deal with single linked page lists,
- * page->private being our "next" pointer.
- */
-
-/* If at least n pages are linked at head, get n pages off.
- * Otherwise, don't modify head, and return NULL.
- * Locking is the responsibility of the caller.
- */
-static struct page *page_chain_del(struct page **head, int n)
-{
- struct page *page;
- struct page *tmp;
-
- BUG_ON(!n);
- BUG_ON(!head);
-
- page = *head;
-
- if (!page)
- return NULL;
-
- while (page) {
- tmp = page_chain_next(page);
- if (--n == 0)
- break; /* found sufficient pages */
- if (tmp == NULL)
- /* insufficient pages, don't use any of them. */
- return NULL;
- page = tmp;
- }
-
- /* add end of list marker for the returned list */
- set_page_private(page, 0);
- /* actual return value, and adjustment of head */
- page = *head;
- *head = tmp;
- return page;
-}
-
-/* may be used outside of locks to find the tail of a (usually short)
- * "private" page chain, before adding it back to a global chain head
- * with page_chain_add() under a spinlock. */
-static struct page *page_chain_tail(struct page *page, int *len)
-{
- struct page *tmp;
- int i = 1;
- while ((tmp = page_chain_next(page))) {
- ++i;
- page = tmp;
- }
- if (len)
- *len = i;
- return page;
-}
-
-static int page_chain_free(struct page *page)
-{
- struct page *tmp;
- int i = 0;
- page_chain_for_each_safe(page, tmp) {
- put_page(page);
- ++i;
- }
- return i;
-}
-
-static void page_chain_add(struct page **head,
- struct page *chain_first, struct page *chain_last)
-{
-#if 1
- struct page *tmp;
- tmp = page_chain_tail(chain_first, NULL);
- BUG_ON(tmp != chain_last);
-#endif
-
- /* add chain to head */
- set_page_private(chain_last, (unsigned long)*head);
- *head = chain_first;
-}
-
-static struct page *__drbd_alloc_pages(struct drbd_device *device,
- unsigned int number)
+static struct page *__drbd_alloc_pages(unsigned int number)
{
struct page *page = NULL;
struct page *tmp = NULL;
unsigned int i = 0;
- /* Yes, testing drbd_pp_vacant outside the lock is racy.
- * So what. It saves a spin_lock. */
- if (drbd_pp_vacant >= number) {
- spin_lock(&drbd_pp_lock);
- page = page_chain_del(&drbd_pp_pool, number);
- if (page)
- drbd_pp_vacant -= number;
- spin_unlock(&drbd_pp_lock);
- if (page)
- return page;
- }
-
/* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
* "criss-cross" setup, that might cause write-out on some other DRBD,
* which in turn might block on the other node at this very place. */
for (i = 0; i < number; i++) {
- tmp = alloc_page(GFP_TRY);
+ tmp = mempool_alloc(&drbd_buffer_page_pool, GFP_TRY);
if (!tmp)
- break;
+ goto fail;
set_page_private(tmp, (unsigned long)page);
page = tmp;
}
-
- if (i == number)
- return page;
-
- /* Not enough pages immediately available this time.
- * No need to jump around here, drbd_alloc_pages will retry this
- * function "soon". */
- if (page) {
- tmp = page_chain_tail(page, NULL);
- spin_lock(&drbd_pp_lock);
- page_chain_add(&drbd_pp_pool, page, tmp);
- drbd_pp_vacant += i;
- spin_unlock(&drbd_pp_lock);
+ return page;
+fail:
+ page_chain_for_each_safe(page, tmp) {
+ set_page_private(page, 0);
+ mempool_free(page, &drbd_buffer_page_pool);
}
return NULL;
}
-static void reclaim_finished_net_peer_reqs(struct drbd_device *device,
- struct list_head *to_be_freed)
-{
- struct drbd_peer_request *peer_req, *tmp;
-
- /* The EEs are always appended to the end of the list. Since
- they are sent in order over the wire, they have to finish
- in order. As soon as we see the first not finished we can
- stop to examine the list... */
-
- list_for_each_entry_safe(peer_req, tmp, &device->net_ee, w.list) {
- if (drbd_peer_req_has_active_page(peer_req))
- break;
- list_move(&peer_req->w.list, to_be_freed);
- }
-}
-
-static void drbd_reclaim_net_peer_reqs(struct drbd_device *device)
-{
- LIST_HEAD(reclaimed);
- struct drbd_peer_request *peer_req, *t;
-
- spin_lock_irq(&device->resource->req_lock);
- reclaim_finished_net_peer_reqs(device, &reclaimed);
- spin_unlock_irq(&device->resource->req_lock);
- list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
- drbd_free_net_peer_req(device, peer_req);
-}
-
-static void conn_reclaim_net_peer_reqs(struct drbd_connection *connection)
-{
- struct drbd_peer_device *peer_device;
- int vnr;
-
- rcu_read_lock();
- idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
- struct drbd_device *device = peer_device->device;
- if (!atomic_read(&device->pp_in_use_by_net))
- continue;
-
- kref_get(&device->kref);
- rcu_read_unlock();
- drbd_reclaim_net_peer_reqs(device);
- kref_put(&device->kref, drbd_destroy_device);
- rcu_read_lock();
- }
- rcu_read_unlock();
-}
-
/**
* drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
* @peer_device: DRBD device.
@@ -263,9 +113,8 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int
bool retry)
{
struct drbd_device *device = peer_device->device;
- struct page *page = NULL;
+ struct page *page;
struct net_conf *nc;
- DEFINE_WAIT(wait);
unsigned int mxb;
rcu_read_lock();
@@ -273,37 +122,9 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int
mxb = nc ? nc->max_buffers : 1000000;
rcu_read_unlock();
- if (atomic_read(&device->pp_in_use) < mxb)
- page = __drbd_alloc_pages(device, number);
-
- /* Try to keep the fast path fast, but occasionally we need
- * to reclaim the pages we lended to the network stack. */
- if (page && atomic_read(&device->pp_in_use_by_net) > 512)
- drbd_reclaim_net_peer_reqs(device);
-
- while (page == NULL) {
- prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
-
- drbd_reclaim_net_peer_reqs(device);
-
- if (atomic_read(&device->pp_in_use) < mxb) {
- page = __drbd_alloc_pages(device, number);
- if (page)
- break;
- }
-
- if (!retry)
- break;
-
- if (signal_pending(current)) {
- drbd_warn(device, "drbd_alloc_pages interrupted!\n");
- break;
- }
-
- if (schedule_timeout(HZ/10) == 0)
- mxb = UINT_MAX;
- }
- finish_wait(&drbd_pp_wait, &wait);
+ if (atomic_read(&device->pp_in_use) >= mxb)
+ schedule_timeout_interruptible(HZ / 10);
+ page = __drbd_alloc_pages(number);
if (page)
atomic_add(number, &device->pp_in_use);
@@ -314,29 +135,25 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int
* Is also used from inside an other spin_lock_irq(&resource->req_lock);
* Either links the page chain back to the global pool,
* or returns all pages to the system. */
-static void drbd_free_pages(struct drbd_device *device, struct page *page, int is_net)
+static void drbd_free_pages(struct drbd_device *device, struct page *page)
{
- atomic_t *a = is_net ? &device->pp_in_use_by_net : &device->pp_in_use;
- int i;
+ struct page *tmp;
+ int i = 0;
if (page == NULL)
return;
- if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * drbd_minor_count)
- i = page_chain_free(page);
- else {
- struct page *tmp;
- tmp = page_chain_tail(page, &i);
- spin_lock(&drbd_pp_lock);
- page_chain_add(&drbd_pp_pool, page, tmp);
- drbd_pp_vacant += i;
- spin_unlock(&drbd_pp_lock);
- }
- i = atomic_sub_return(i, a);
+ page_chain_for_each_safe(page, tmp) {
+ set_page_private(page, 0);
+ if (page_count(page) == 1)
+ mempool_free(page, &drbd_buffer_page_pool);
+ else
+ put_page(page);
+ i++;
+ }
+ i = atomic_sub_return(i, &device->pp_in_use);
if (i < 0)
- drbd_warn(device, "ASSERTION FAILED: %s: %d < 0\n",
- is_net ? "pp_in_use_by_net" : "pp_in_use", i);
- wake_up(&drbd_pp_wait);
+ drbd_warn(device, "ASSERTION FAILED: pp_in_use: %d < 0\n", i);
}
/*
@@ -380,6 +197,8 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
gfpflags_allow_blocking(gfp_mask));
if (!page)
goto fail;
+ if (!mempool_is_saturated(&drbd_buffer_page_pool))
+ peer_req->flags |= EE_RELEASE_TO_MEMPOOL;
}
memset(peer_req, 0, sizeof(*peer_req));
@@ -403,13 +222,12 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
return NULL;
}
-void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req,
- int is_net)
+void drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req)
{
might_sleep();
if (peer_req->flags & EE_HAS_DIGEST)
kfree(peer_req->digest);
- drbd_free_pages(device, peer_req->pages, is_net);
+ drbd_free_pages(device, peer_req->pages);
D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0);
D_ASSERT(device, drbd_interval_empty(&peer_req->i));
if (!expect(device, !(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) {
@@ -424,14 +242,13 @@ int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list)
LIST_HEAD(work_list);
struct drbd_peer_request *peer_req, *t;
int count = 0;
- int is_net = list == &device->net_ee;
spin_lock_irq(&device->resource->req_lock);
list_splice_init(list, &work_list);
spin_unlock_irq(&device->resource->req_lock);
list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
- __drbd_free_peer_req(device, peer_req, is_net);
+ drbd_free_peer_req(device, peer_req);
count++;
}
return count;
@@ -443,18 +260,13 @@ int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list)
static int drbd_finish_peer_reqs(struct drbd_device *device)
{
LIST_HEAD(work_list);
- LIST_HEAD(reclaimed);
struct drbd_peer_request *peer_req, *t;
int err = 0;
spin_lock_irq(&device->resource->req_lock);
- reclaim_finished_net_peer_reqs(device, &reclaimed);
list_splice_init(&device->done_ee, &work_list);
spin_unlock_irq(&device->resource->req_lock);
- list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
- drbd_free_net_peer_req(device, peer_req);
-
/* possible callbacks here:
* e_end_block, and e_end_resync_block, e_send_superseded.
* all ignore the last argument.
@@ -1975,7 +1787,7 @@ static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size)
data_size -= len;
}
kunmap(page);
- drbd_free_pages(peer_device->device, page, 0);
+ drbd_free_pages(peer_device->device, page);
return err;
}
@@ -5224,16 +5036,6 @@ static int drbd_disconnected(struct drbd_peer_device *peer_device)
put_ldev(device);
}
- /* tcp_close and release of sendpage pages can be deferred. I don't
- * want to use SO_LINGER, because apparently it can be deferred for
- * more than 20 seconds (longest time I checked).
- *
- * Actually we don't care for exactly when the network stack does its
- * put_page(), but release our reference on these pages right here.
- */
- i = drbd_free_peer_reqs(device, &device->net_ee);
- if (i)
- drbd_info(device, "net_ee not empty, killed %u entries\n", i);
i = atomic_read(&device->pp_in_use_by_net);
if (i)
drbd_info(device, "pp_in_use_by_net = %d, expected 0\n", i);
@@ -5980,8 +5782,6 @@ int drbd_ack_receiver(struct drbd_thread *thi)
while (get_t_state(thi) == RUNNING) {
drbd_thread_current_set_cpu(thi);
- conn_reclaim_net_peer_reqs(connection);
-
if (test_and_clear_bit(SEND_PING, &connection->flags)) {
if (drbd_send_ping(connection)) {
drbd_err(connection, "drbd_send_ping has failed\n");
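Even after the mempool conversion, __drbd_alloc_pages() still threads the pages it obtains into a singly linked chain through the page's private word, and unwinds the partial chain on allocation failure. A small user-space analogue of building and unwinding such an intrusive chain (struct buf here is only a stand-in for struct page):

#include <stdlib.h>

/* Stand-in for struct page: the chain pointer lives inside the object itself. */
struct buf {
	struct buf *next;
	unsigned char data[4096];
};

static void free_chain(struct buf *head)
{
	while (head) {
		struct buf *tmp = head->next;

		free(head);
		head = tmp;
	}
}

static struct buf *alloc_chain(unsigned int n)
{
	struct buf *head = NULL;

	while (n--) {
		struct buf *b = calloc(1, sizeof(*b));

		if (!b) {		/* unwind everything allocated so far */
			free_chain(head);
			return NULL;
		}
		b->next = head;		/* push onto the front of the chain */
		head = b;
	}
	return head;
}

int main(void)
{
	struct buf *chain = alloc_chain(8);

	free_chain(chain);
	return 0;
}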
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index a6ea737b3b71..dea3e79d044f 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1030,22 +1030,6 @@ out:
return 1;
}
-/* helper */
-static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_request *peer_req)
-{
- if (drbd_peer_req_has_active_page(peer_req)) {
- /* This might happen if sendpage() has not finished */
- int i = PFN_UP(peer_req->i.size);
- atomic_add(i, &device->pp_in_use_by_net);
- atomic_sub(i, &device->pp_in_use);
- spin_lock_irq(&device->resource->req_lock);
- list_add_tail(&peer_req->w.list, &device->net_ee);
- spin_unlock_irq(&device->resource->req_lock);
- wake_up(&drbd_pp_wait);
- } else
- drbd_free_peer_req(device, peer_req);
-}
-
/**
* w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
* @w: work object.
@@ -1059,9 +1043,8 @@ int w_e_end_data_req(struct drbd_work *w, int cancel)
int err;
if (unlikely(cancel)) {
- drbd_free_peer_req(device, peer_req);
- dec_unacked(device);
- return 0;
+ err = 0;
+ goto out;
}
if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
@@ -1074,12 +1057,12 @@ int w_e_end_data_req(struct drbd_work *w, int cancel)
err = drbd_send_ack(peer_device, P_NEG_DREPLY, peer_req);
}
- dec_unacked(device);
-
- move_to_net_ee_or_free(device, peer_req);
-
if (unlikely(err))
drbd_err(device, "drbd_send_block() failed\n");
+out:
+ dec_unacked(device);
+ drbd_free_peer_req(device, peer_req);
+
return err;
}
@@ -1120,9 +1103,8 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
int err;
if (unlikely(cancel)) {
- drbd_free_peer_req(device, peer_req);
- dec_unacked(device);
- return 0;
+ err = 0;
+ goto out;
}
if (get_ldev_if_state(device, D_FAILED)) {
@@ -1155,13 +1137,12 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
/* update resync data with failure */
drbd_rs_failed_io(peer_device, peer_req->i.sector, peer_req->i.size);
}
-
- dec_unacked(device);
-
- move_to_net_ee_or_free(device, peer_req);
-
if (unlikely(err))
drbd_err(device, "drbd_send_block() failed\n");
+out:
+ dec_unacked(device);
+ drbd_free_peer_req(device, peer_req);
+
return err;
}
@@ -1176,9 +1157,8 @@ int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
int err, eq = 0;
if (unlikely(cancel)) {
- drbd_free_peer_req(device, peer_req);
- dec_unacked(device);
- return 0;
+ err = 0;
+ goto out;
}
if (get_ldev(device)) {
@@ -1220,12 +1200,12 @@ int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
if (drbd_ratelimit())
drbd_err(device, "Sending NegDReply. I guess it gets messy.\n");
}
-
- dec_unacked(device);
- move_to_net_ee_or_free(device, peer_req);
-
if (unlikely(err))
drbd_err(device, "drbd_send_block/ack() failed\n");
+out:
+ dec_unacked(device);
+ drbd_free_peer_req(device, peer_req);
+
return err;
}
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 1b6ee91f8eb9..053a086d547e 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -137,20 +137,35 @@ static void loop_global_unlock(struct loop_device *lo, bool global)
static int max_part;
static int part_shift;
-static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file)
+static loff_t lo_calculate_size(struct loop_device *lo, struct file *file)
{
loff_t loopsize;
+ int ret;
+
+ if (S_ISBLK(file_inode(file)->i_mode)) {
+ loopsize = i_size_read(file->f_mapping->host);
+ } else {
+ struct kstat stat;
+
+ /*
+ * Get the accurate file size. This provides better results than
+ * cached inode data, particularly for network filesystems where
+ * metadata may be stale.
+ */
+ ret = vfs_getattr_nosec(&file->f_path, &stat, STATX_SIZE, 0);
+ if (ret)
+ return 0;
- /* Compute loopsize in bytes */
- loopsize = i_size_read(file->f_mapping->host);
- if (offset > 0)
- loopsize -= offset;
+ loopsize = stat.size;
+ }
+
+ if (lo->lo_offset > 0)
+ loopsize -= lo->lo_offset;
/* offset is beyond i_size, weird but possible */
if (loopsize < 0)
return 0;
-
- if (sizelimit > 0 && sizelimit < loopsize)
- loopsize = sizelimit;
+ if (lo->lo_sizelimit > 0 && lo->lo_sizelimit < loopsize)
+ loopsize = lo->lo_sizelimit;
/*
* Unfortunately, if we want to do I/O on the device,
* the number of 512-byte sectors has to fit into a sector_t.
@@ -158,11 +173,6 @@ static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file)
return loopsize >> 9;
}
-static loff_t get_loop_size(struct loop_device *lo, struct file *file)
-{
- return get_size(lo->lo_offset, lo->lo_sizelimit, file);
-}
-
/*
* We support direct I/O only if lo_offset is aligned with the logical I/O size
* of backing device, and the logical block size of loop is bigger than that of
@@ -569,7 +579,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
error = -EINVAL;
/* size of the new backing store needs to be the same */
- if (get_loop_size(lo, file) != get_loop_size(lo, old_file))
+ if (lo_calculate_size(lo, file) != lo_calculate_size(lo, old_file))
goto out_err;
/*
@@ -1063,7 +1073,7 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode,
loop_update_dio(lo);
loop_sysfs_init(lo);
- size = get_loop_size(lo, file);
+ size = lo_calculate_size(lo, file);
loop_set_size(lo, size);
/* Order wrt reading lo_state in loop_validate_file(). */
@@ -1255,8 +1265,7 @@ out_unfreeze:
if (partscan)
clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state);
if (!err && size_changed) {
- loff_t new_size = get_size(lo->lo_offset, lo->lo_sizelimit,
- lo->lo_backing_file);
+ loff_t new_size = lo_calculate_size(lo, lo->lo_backing_file);
loop_set_size(lo, new_size);
}
out_unlock:
@@ -1399,7 +1408,7 @@ static int loop_set_capacity(struct loop_device *lo)
if (unlikely(lo->lo_state != Lo_bound))
return -ENXIO;
- size = get_loop_size(lo, lo->lo_backing_file);
+ size = lo_calculate_size(lo, lo->lo_backing_file);
loop_set_size(lo, size);
return 0;
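lo_calculate_size() now asks the filesystem for a fresh size via vfs_getattr_nosec() rather than trusting the cached inode size, then applies the offset and sizelimit clamps and converts bytes to 512-byte sectors. The same arithmetic in a small user-space sketch built on stat(2); it deliberately ignores the block-device special case:

#include <stdio.h>
#include <sys/stat.h>

/* Size of the backing file in 512-byte sectors, after offset/limit clamping. */
static long long backing_sectors(const char *path, long long offset,
				 long long sizelimit)
{
	struct stat st;
	long long size;

	if (stat(path, &st) != 0)
		return 0;

	size = st.st_size;
	if (offset > 0)
		size -= offset;
	if (size < 0)			/* offset beyond EOF: treat as empty */
		return 0;
	if (sizelimit > 0 && sizelimit < size)
		size = sizelimit;

	return size >> 9;		/* bytes -> 512-byte sectors */
}

int main(int argc, char **argv)
{
	if (argc > 1)
		printf("%lld sectors\n", backing_sectors(argv[1], 0, 0));
	return 0;
}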
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 6561d2a561fa..67d4a867aec4 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -235,10 +235,11 @@ struct ublk_device {
struct completion completion;
unsigned int nr_queues_ready;
- unsigned int nr_privileged_daemon;
+ bool unprivileged_daemons;
struct mutex cancel_mutex;
bool canceling;
pid_t ublksrv_tgid;
+ struct delayed_work exit_work;
};
/* header of ublk_params */
@@ -1389,7 +1390,7 @@ static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
{
blk_status_t res;
- if (unlikely(ubq->fail_io))
+ if (unlikely(READ_ONCE(ubq->fail_io)))
return BLK_STS_TARGET;
/* With recovery feature enabled, force_abort is set in
@@ -1401,7 +1402,8 @@ static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
* Note: force_abort is guaranteed to be seen because it is set
 * before the request queue is unquiesced.
*/
- if (ublk_nosrv_should_queue_io(ubq) && unlikely(ubq->force_abort))
+ if (ublk_nosrv_should_queue_io(ubq) &&
+ unlikely(READ_ONCE(ubq->force_abort)))
return BLK_STS_IOERR;
if (check_cancel && unlikely(ubq->canceling))
@@ -1550,7 +1552,7 @@ static void ublk_reset_ch_dev(struct ublk_device *ub)
/* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */
ub->mm = NULL;
ub->nr_queues_ready = 0;
- ub->nr_privileged_daemon = 0;
+ ub->unprivileged_daemons = false;
ub->ublksrv_tgid = -1;
}
@@ -1594,13 +1596,63 @@ static void ublk_set_canceling(struct ublk_device *ub, bool canceling)
ublk_get_queue(ub, i)->canceling = canceling;
}
-static int ublk_ch_release(struct inode *inode, struct file *filp)
+static bool ublk_check_and_reset_active_ref(struct ublk_device *ub)
{
- struct ublk_device *ub = filp->private_data;
+ int i, j;
+
+ if (!(ub->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY |
+ UBLK_F_AUTO_BUF_REG)))
+ return false;
+
+ for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
+ struct ublk_queue *ubq = ublk_get_queue(ub, i);
+
+ for (j = 0; j < ubq->q_depth; j++) {
+ struct ublk_io *io = &ubq->ios[j];
+ unsigned int refs = refcount_read(&io->ref) +
+ io->task_registered_buffers;
+
+ /*
+ * UBLK_REFCOUNT_INIT or zero means no active
+ * reference
+ */
+ if (refs != UBLK_REFCOUNT_INIT && refs != 0)
+ return true;
+
+ /* reset to zero if the io has no active references */
+ refcount_set(&io->ref, 0);
+ io->task_registered_buffers = 0;
+ }
+ }
+ return false;
+}
+
+static void ublk_ch_release_work_fn(struct work_struct *work)
+{
+ struct ublk_device *ub =
+ container_of(work, struct ublk_device, exit_work.work);
struct gendisk *disk;
int i;
/*
+ * For zero-copy and auto buffer register modes, I/O references
+ * might not be dropped naturally when the daemon is killed, but
+ * io_uring guarantees that registered bvec kernel buffers are
+ * unregistered once the io_uring context is freed, at which point
+ * the active references are dropped.
+ *
+ * Wait until the active references are dropped to avoid a
+ * use-after-free.
+ *
+ * A registered buffer may be unregistered in io_uring's release
+ * handler, so wait by rescheduling this work function to avoid a
+ * release dependency between the two files.
+ */
+ if (ublk_check_and_reset_active_ref(ub)) {
+ schedule_delayed_work(&ub->exit_work, 1);
+ return;
+ }
+
+ /*
* disk isn't attached yet, either device isn't live, or it has
 * been removed already, so we needn't do anything
*/
@@ -1644,7 +1696,6 @@ static int ublk_ch_release(struct inode *inode, struct file *filp)
* Transition the device to the nosrv state. What exactly this
* means depends on the recovery flags
*/
- blk_mq_quiesce_queue(disk->queue);
if (ublk_nosrv_should_stop_dev(ub)) {
/*
* Allow any pending/future I/O to pass through quickly
@@ -1652,8 +1703,7 @@ static int ublk_ch_release(struct inode *inode, struct file *filp)
* waits for all pending I/O to complete
*/
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
- ublk_get_queue(ub, i)->force_abort = true;
- blk_mq_unquiesce_queue(disk->queue);
+ WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true);
ublk_stop_dev_unlocked(ub);
} else {
@@ -1663,9 +1713,8 @@ static int ublk_ch_release(struct inode *inode, struct file *filp)
} else {
ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
- ublk_get_queue(ub, i)->fail_io = true;
+ WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true);
}
- blk_mq_unquiesce_queue(disk->queue);
}
unlock:
mutex_unlock(&ub->mutex);
@@ -1675,6 +1724,23 @@ unlock:
ublk_reset_ch_dev(ub);
out:
clear_bit(UB_STATE_OPEN, &ub->state);
+
+ /* put the reference grabbed in ublk_ch_release() */
+ ublk_put_device(ub);
+}
+
+static int ublk_ch_release(struct inode *inode, struct file *filp)
+{
+ struct ublk_device *ub = filp->private_data;
+
+ /*
+ * Grab a ublk device reference so the device cannot go away until
+ * the release work function has actually finished.
+ */
+ ublk_get_device(ub);
+
+ INIT_DELAYED_WORK(&ub->exit_work, ublk_ch_release_work_fn);
+ schedule_delayed_work(&ub->exit_work, 0);
return 0;
}
@@ -1980,12 +2046,10 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
__must_hold(&ub->mutex)
{
ubq->nr_io_ready++;
- if (ublk_queue_ready(ubq)) {
+ if (ublk_queue_ready(ubq))
ub->nr_queues_ready++;
-
- if (capable(CAP_SYS_ADMIN))
- ub->nr_privileged_daemon++;
- }
+ if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN))
+ ub->unprivileged_daemons = true;
if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues) {
/* now we are ready for handling ublk io request */
@@ -2880,8 +2944,8 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub,
ublk_apply_params(ub);
- /* don't probe partitions if any one ubq daemon is un-trusted */
- if (ub->nr_privileged_daemon != ub->nr_queues_ready)
+ /* don't probe partitions if any daemon task is un-trusted */
+ if (ub->unprivileged_daemons)
set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
ublk_get_device(ub);
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 8acad3cc6e6e..f31652085adc 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1795,6 +1795,7 @@ static int write_same_filled_page(struct zram *zram, unsigned long fill,
u32 index)
{
zram_slot_lock(zram, index);
+ zram_free_page(zram, index);
zram_set_flag(zram, index, ZRAM_SAME);
zram_set_handle(zram, index, fill);
zram_slot_unlock(zram, index);
@@ -1832,6 +1833,7 @@ static int write_incompressible_page(struct zram *zram, struct page *page,
kunmap_local(src);
zram_slot_lock(zram, index);
+ zram_free_page(zram, index);
zram_set_flag(zram, index, ZRAM_HUGE);
zram_set_handle(zram, index, handle);
zram_set_obj_size(zram, index, PAGE_SIZE);
@@ -1855,11 +1857,6 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index)
unsigned long element;
bool same_filled;
- /* First, free memory allocated to this slot (if any) */
- zram_slot_lock(zram, index);
- zram_free_page(zram, index);
- zram_slot_unlock(zram, index);
-
mem = kmap_local_page(page);
same_filled = page_same_filled(mem, &element);
kunmap_local(mem);
@@ -1901,6 +1898,7 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index)
zcomp_stream_put(zstrm);
zram_slot_lock(zram, index);
+ zram_free_page(zram, index);
zram_set_handle(zram, index, handle);
zram_set_obj_size(zram, index, comp_len);
zram_slot_unlock(zram, index);
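The zram hunks fold "free whatever the slot held before" into the same slot-locked section that installs the new handle and flags, instead of doing it in a separate lock/free/unlock pass at the top of zram_write_page(). Sketched below with a pthread mutex and a hypothetical slot structure, the general shape is free-then-publish inside one critical section:

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct slot {
	pthread_mutex_t lock;
	void *obj;		/* currently published object, may be NULL */
};

/*
 * Free the slot's previous object and publish the new one inside the same
 * critical section, so no other thread can observe the slot between
 * "old freed" and "new installed".
 */
static void slot_replace(struct slot *s, void *new_obj)
{
	pthread_mutex_lock(&s->lock);
	free(s->obj);
	s->obj = new_obj;
	pthread_mutex_unlock(&s->lock);
}

int main(void)
{
	struct slot s = { .lock = PTHREAD_MUTEX_INITIALIZER, .obj = NULL };

	slot_replace(&s, strdup("first"));
	slot_replace(&s, strdup("second"));
	free(s.obj);
	return 0;
}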
diff --git a/drivers/bluetooth/Kconfig b/drivers/bluetooth/Kconfig
index 4ab32abf0f48..7df69ccb6600 100644
--- a/drivers/bluetooth/Kconfig
+++ b/drivers/bluetooth/Kconfig
@@ -312,7 +312,9 @@ config BT_HCIBCM4377
config BT_HCIBPA10X
tristate "HCI BPA10x USB driver"
+ depends on BT_HCIUART
depends on USB
+ select BT_HCIUART_H4
help
Bluetooth HCI BPA10x USB driver.
This driver provides support for the Digianswer BPA 100/105 Bluetooth
@@ -437,8 +439,10 @@ config BT_MTKSDIO
config BT_MTKUART
tristate "MediaTek HCI UART driver"
+ depends on BT_HCIUART
depends on SERIAL_DEV_BUS
depends on USB || !BT_HCIBTUSB_MTK
+ select BT_HCIUART_H4
select BT_MTK
help
MediaTek Bluetooth HCI UART driver.
@@ -483,7 +487,9 @@ config BT_VIRTIO
config BT_NXPUART
tristate "NXP protocol support"
+ depends on BT_HCIUART
depends on SERIAL_DEV_BUS
+ select BT_HCIUART_H4
select CRC32
select CRC8
help
diff --git a/drivers/bluetooth/btmtk.c b/drivers/bluetooth/btmtk.c
index 4390fd571dbd..a8c520dc09e1 100644
--- a/drivers/bluetooth/btmtk.c
+++ b/drivers/bluetooth/btmtk.c
@@ -642,12 +642,7 @@ static int btmtk_usb_hci_wmt_sync(struct hci_dev *hdev,
* WMT command.
*/
err = wait_on_bit_timeout(&data->flags, BTMTK_TX_WAIT_VND_EVT,
- TASK_INTERRUPTIBLE, HCI_INIT_TIMEOUT);
- if (err == -EINTR) {
- bt_dev_err(hdev, "Execution of wmt command interrupted");
- clear_bit(BTMTK_TX_WAIT_VND_EVT, &data->flags);
- goto err_free_wc;
- }
+ TASK_UNINTERRUPTIBLE, HCI_INIT_TIMEOUT);
if (err) {
bt_dev_err(hdev, "Execution of wmt command timed out");
diff --git a/drivers/bluetooth/btnxpuart.c b/drivers/bluetooth/btnxpuart.c
index 73a4a325c867..76e7f857fb7d 100644
--- a/drivers/bluetooth/btnxpuart.c
+++ b/drivers/bluetooth/btnxpuart.c
@@ -543,10 +543,10 @@ static int ps_setup(struct hci_dev *hdev)
}
if (psdata->wakeup_source) {
- ret = devm_request_irq(&serdev->dev, psdata->irq_handler,
- ps_host_wakeup_irq_handler,
- IRQF_ONESHOT | IRQF_TRIGGER_FALLING,
- dev_name(&serdev->dev), nxpdev);
+ ret = devm_request_threaded_irq(&serdev->dev, psdata->irq_handler,
+ NULL, ps_host_wakeup_irq_handler,
+ IRQF_ONESHOT,
+ dev_name(&serdev->dev), nxpdev);
if (ret)
bt_dev_info(hdev, "error setting wakeup IRQ handler, ignoring\n");
disable_irq(psdata->irq_handler);
diff --git a/drivers/bluetooth/hci_uart.h b/drivers/bluetooth/hci_uart.h
index 5ea5dd80e297..cbbe79b241ce 100644
--- a/drivers/bluetooth/hci_uart.h
+++ b/drivers/bluetooth/hci_uart.h
@@ -121,10 +121,6 @@ void hci_uart_set_flow_control(struct hci_uart *hu, bool enable);
void hci_uart_set_speeds(struct hci_uart *hu, unsigned int init_speed,
unsigned int oper_speed);
-#ifdef CONFIG_BT_HCIUART_H4
-int h4_init(void);
-int h4_deinit(void);
-
struct h4_recv_pkt {
u8 type; /* Packet type */
u8 hlen; /* Header length */
@@ -162,6 +158,10 @@ struct h4_recv_pkt {
.lsize = 2, \
.maxlen = HCI_MAX_FRAME_SIZE \
+#ifdef CONFIG_BT_HCIUART_H4
+int h4_init(void);
+int h4_deinit(void);
+
struct sk_buff *h4_recv_buf(struct hci_dev *hdev, struct sk_buff *skb,
const unsigned char *buffer, int count,
const struct h4_recv_pkt *pkts, int pkts_count);
diff --git a/drivers/bluetooth/hci_vhci.c b/drivers/bluetooth/hci_vhci.c
index f7d8c3c00655..2fef08254d78 100644
--- a/drivers/bluetooth/hci_vhci.c
+++ b/drivers/bluetooth/hci_vhci.c
@@ -380,6 +380,28 @@ static const struct file_operations force_devcoredump_fops = {
.write = force_devcd_write,
};
+static void vhci_debugfs_init(struct vhci_data *data)
+{
+ struct hci_dev *hdev = data->hdev;
+
+ debugfs_create_file("force_suspend", 0644, hdev->debugfs, data,
+ &force_suspend_fops);
+
+ debugfs_create_file("force_wakeup", 0644, hdev->debugfs, data,
+ &force_wakeup_fops);
+
+ if (IS_ENABLED(CONFIG_BT_MSFTEXT))
+ debugfs_create_file("msft_opcode", 0644, hdev->debugfs, data,
+ &msft_opcode_fops);
+
+ if (IS_ENABLED(CONFIG_BT_AOSPEXT))
+ debugfs_create_file("aosp_capable", 0644, hdev->debugfs, data,
+ &aosp_capable_fops);
+
+ debugfs_create_file("force_devcoredump", 0644, hdev->debugfs, data,
+ &force_devcoredump_fops);
+}
+
static int __vhci_create_device(struct vhci_data *data, __u8 opcode)
{
struct hci_dev *hdev;
@@ -434,22 +456,8 @@ static int __vhci_create_device(struct vhci_data *data, __u8 opcode)
return -EBUSY;
}
- debugfs_create_file("force_suspend", 0644, hdev->debugfs, data,
- &force_suspend_fops);
-
- debugfs_create_file("force_wakeup", 0644, hdev->debugfs, data,
- &force_wakeup_fops);
-
- if (IS_ENABLED(CONFIG_BT_MSFTEXT))
- debugfs_create_file("msft_opcode", 0644, hdev->debugfs, data,
- &msft_opcode_fops);
-
- if (IS_ENABLED(CONFIG_BT_AOSPEXT))
- debugfs_create_file("aosp_capable", 0644, hdev->debugfs, data,
- &aosp_capable_fops);
-
- debugfs_create_file("force_devcoredump", 0644, hdev->debugfs, data,
- &force_devcoredump_fops);
+ if (!IS_ERR_OR_NULL(hdev->debugfs))
+ vhci_debugfs_init(data);
hci_skb_pkt_type(skb) = HCI_VENDOR_PKT;
@@ -651,6 +659,21 @@ static int vhci_open(struct inode *inode, struct file *file)
return 0;
}
+static void vhci_debugfs_remove(struct hci_dev *hdev)
+{
+ debugfs_lookup_and_remove("force_suspend", hdev->debugfs);
+
+ debugfs_lookup_and_remove("force_wakeup", hdev->debugfs);
+
+ if (IS_ENABLED(CONFIG_BT_MSFTEXT))
+ debugfs_lookup_and_remove("msft_opcode", hdev->debugfs);
+
+ if (IS_ENABLED(CONFIG_BT_AOSPEXT))
+ debugfs_lookup_and_remove("aosp_capable", hdev->debugfs);
+
+ debugfs_lookup_and_remove("force_devcoredump", hdev->debugfs);
+}
+
static int vhci_release(struct inode *inode, struct file *file)
{
struct vhci_data *data = file->private_data;
@@ -662,6 +685,8 @@ static int vhci_release(struct inode *inode, struct file *file)
hdev = data->hdev;
if (hdev) {
+ if (!IS_ERR_OR_NULL(hdev->debugfs))
+ vhci_debugfs_remove(hdev);
hci_unregister_dev(hdev);
hci_free_dev(hdev);
}
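
The refactor above centralizes the debugfs entry creation and removal and guards both against a missing or failed parent dentry. A minimal sketch of that guard pattern, with hypothetical "foo" names:

#include <linux/debugfs.h>

static void foo_debugfs_init(struct dentry *parent, void *priv,
			     const struct file_operations *fops)
{
	if (IS_ERR_OR_NULL(parent))
		return;		/* debugfs disabled or parent creation failed */

	debugfs_create_file("state", 0644, parent, priv, fops);
}

static void foo_debugfs_remove(struct dentry *parent)
{
	if (IS_ERR_OR_NULL(parent))
		return;

	debugfs_lookup_and_remove("state", parent);
}
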
diff --git a/drivers/cdx/controller/cdx_controller.c b/drivers/cdx/controller/cdx_controller.c
index fca83141e3e6..3f8b9041babf 100644
--- a/drivers/cdx/controller/cdx_controller.c
+++ b/drivers/cdx/controller/cdx_controller.c
@@ -14,7 +14,7 @@
#include "cdx_controller.h"
#include "../cdx.h"
#include "mcdi_functions.h"
-#include "mcdi.h"
+#include "mcdid.h"
static unsigned int cdx_mcdi_rpc_timeout(struct cdx_mcdi *cdx, unsigned int cmd)
{
diff --git a/drivers/cdx/controller/cdx_rpmsg.c b/drivers/cdx/controller/cdx_rpmsg.c
index 04b578a0be17..59aabd99fa8f 100644
--- a/drivers/cdx/controller/cdx_rpmsg.c
+++ b/drivers/cdx/controller/cdx_rpmsg.c
@@ -15,7 +15,7 @@
#include "../cdx.h"
#include "cdx_controller.h"
#include "mcdi_functions.h"
-#include "mcdi.h"
+#include "mcdid.h"
static struct rpmsg_device_id cdx_rpmsg_id_table[] = {
{ .name = "mcdi_ipc" },
@@ -129,8 +129,7 @@ static int cdx_rpmsg_probe(struct rpmsg_device *rpdev)
chinfo.src = RPMSG_ADDR_ANY;
chinfo.dst = rpdev->dst;
- strscpy(chinfo.name, cdx_rpmsg_id_table[0].name,
- strlen(cdx_rpmsg_id_table[0].name));
+ strscpy(chinfo.name, cdx_rpmsg_id_table[0].name, sizeof(chinfo.name));
cdx_mcdi->ept = rpmsg_create_ept(rpdev, cdx_rpmsg_cb, NULL, chinfo);
if (!cdx_mcdi->ept) {
diff --git a/drivers/cdx/controller/mcdi.c b/drivers/cdx/controller/mcdi.c
index e760f8d347cc..2e82ffc18d89 100644
--- a/drivers/cdx/controller/mcdi.c
+++ b/drivers/cdx/controller/mcdi.c
@@ -23,9 +23,10 @@
#include <linux/log2.h>
#include <linux/net_tstamp.h>
#include <linux/wait.h>
+#include <linux/cdx/bitfield.h>
-#include "bitfield.h"
-#include "mcdi.h"
+#include <linux/cdx/mcdi.h>
+#include "mcdid.h"
static void cdx_mcdi_cancel_cmd(struct cdx_mcdi *cdx, struct cdx_mcdi_cmd *cmd);
static void cdx_mcdi_wait_for_cleanup(struct cdx_mcdi *cdx);
@@ -99,6 +100,19 @@ static unsigned long cdx_mcdi_rpc_timeout(struct cdx_mcdi *cdx, unsigned int cmd
return cdx->mcdi_ops->mcdi_rpc_timeout(cdx, cmd);
}
+/**
+ * cdx_mcdi_init - Initialize MCDI (Management Controller Driver Interface) state
+ * @cdx: Handle to the CDX MCDI structure
+ *
+ * This function allocates and initializes internal MCDI structures and resources
+ * for the CDX device, including the workqueue, locking primitives, and command
+ * tracking mechanisms. It sets the initial operating mode and prepares the device
+ * for MCDI operations.
+ *
+ * Return:
+ * * 0 - on success
+ * * -ENOMEM - if memory allocation or workqueue creation fails
+ */
int cdx_mcdi_init(struct cdx_mcdi *cdx)
{
struct cdx_mcdi_iface *mcdi;
@@ -128,7 +142,16 @@ fail2:
fail:
return rc;
}
+EXPORT_SYMBOL_GPL(cdx_mcdi_init);
+/**
+ * cdx_mcdi_finish - Cleanup MCDI (Management Controller Driver Interface) state
+ * @cdx: Handle to the CDX MCDI structure
+ *
+ * This function cleans up the MCDI resources associated with a cdx_mcdi
+ * structure and destroys the MCDI workqueue.
+ *
+ */
void cdx_mcdi_finish(struct cdx_mcdi *cdx)
{
struct cdx_mcdi_iface *mcdi;
@@ -143,6 +166,7 @@ void cdx_mcdi_finish(struct cdx_mcdi *cdx)
kfree(cdx->mcdi);
cdx->mcdi = NULL;
}
+EXPORT_SYMBOL_GPL(cdx_mcdi_finish);
static bool cdx_mcdi_flushed(struct cdx_mcdi_iface *mcdi, bool ignore_cleanups)
{
@@ -553,6 +577,19 @@ static void cdx_mcdi_start_or_queue(struct cdx_mcdi_iface *mcdi,
cdx_mcdi_cmd_start_or_queue(mcdi, cmd);
}
+/**
+ * cdx_mcdi_process_cmd - Process an incoming MCDI response
+ * @cdx: Handle to the CDX MCDI structure
+ * @outbuf: Pointer to the response buffer received from the management controller
+ * @len: Length of the response buffer in bytes
+ *
+ * This function handles a response from the management controller. It locates the
+ * corresponding command using the sequence number embedded in the header,
+ * completes the command if it is still pending, and initiates any necessary cleanup.
+ *
+ * The function assumes that the response buffer is well-formed and at least one
+ * dword in size.
+ */
void cdx_mcdi_process_cmd(struct cdx_mcdi *cdx, struct cdx_dword *outbuf, int len)
{
struct cdx_mcdi_iface *mcdi;
@@ -590,6 +627,7 @@ void cdx_mcdi_process_cmd(struct cdx_mcdi *cdx, struct cdx_dword *outbuf, int le
cdx_mcdi_process_cleanup_list(mcdi->cdx, &cleanup_list);
}
+EXPORT_SYMBOL_GPL(cdx_mcdi_process_cmd);
static void cdx_mcdi_cmd_work(struct work_struct *context)
{
@@ -757,6 +795,7 @@ int cdx_mcdi_rpc(struct cdx_mcdi *cdx, unsigned int cmd,
return cdx_mcdi_rpc_sync(cdx, cmd, inbuf, inlen, outbuf, outlen,
outlen_actual, false);
}
+EXPORT_SYMBOL_GPL(cdx_mcdi_rpc);
/**
* cdx_mcdi_rpc_async - Schedule an MCDI command to run asynchronously
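
Since cdx_mcdi_init(), cdx_mcdi_finish(), cdx_mcdi_process_cmd() and cdx_mcdi_rpc() are now exported for use outside this file, a hedged usage sketch may help; the caller below is illustrative and not part of the patch:

#include <linux/cdx/mcdi.h>

static int example_mcdi_setup(struct cdx_mcdi *cdx)
{
	int ret;

	ret = cdx_mcdi_init(cdx);	/* workqueue, locks, command tracking */
	if (ret)
		return ret;		/* -ENOMEM on allocation failure */

	/* ... issue commands via cdx_mcdi_rpc() while the interface is up ... */

	cdx_mcdi_finish(cdx);		/* tear the MCDI state back down */
	return 0;
}
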
diff --git a/drivers/cdx/controller/mcdi_functions.c b/drivers/cdx/controller/mcdi_functions.c
index 885c69e6ebe5..8ae2d99be81e 100644
--- a/drivers/cdx/controller/mcdi_functions.c
+++ b/drivers/cdx/controller/mcdi_functions.c
@@ -5,7 +5,6 @@
#include <linux/module.h>
-#include "mcdi.h"
#include "mcdi_functions.h"
int cdx_mcdi_get_num_buses(struct cdx_mcdi *cdx)
diff --git a/drivers/cdx/controller/mcdi_functions.h b/drivers/cdx/controller/mcdi_functions.h
index b9942affdc6b..57fd1bae706b 100644
--- a/drivers/cdx/controller/mcdi_functions.h
+++ b/drivers/cdx/controller/mcdi_functions.h
@@ -8,7 +8,8 @@
#ifndef CDX_MCDI_FUNCTIONS_H
#define CDX_MCDI_FUNCTIONS_H
-#include "mcdi.h"
+#include <linux/cdx/mcdi.h>
+#include "mcdid.h"
#include "../cdx.h"
/**
diff --git a/drivers/cdx/controller/mcdid.h b/drivers/cdx/controller/mcdid.h
new file mode 100644
index 000000000000..7fc29f099265
--- /dev/null
+++ b/drivers/cdx/controller/mcdid.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright 2008-2013 Solarflare Communications Inc.
+ * Copyright (C) 2022-2025, Advanced Micro Devices, Inc.
+ */
+
+#ifndef CDX_MCDID_H
+#define CDX_MCDID_H
+
+#include <linux/mutex.h>
+#include <linux/kref.h>
+#include <linux/rpmsg.h>
+
+#include "mc_cdx_pcol.h"
+
+#ifdef DEBUG
+#define CDX_WARN_ON_ONCE_PARANOID(x) WARN_ON_ONCE(x)
+#define CDX_WARN_ON_PARANOID(x) WARN_ON(x)
+#else
+#define CDX_WARN_ON_ONCE_PARANOID(x) do {} while (0)
+#define CDX_WARN_ON_PARANOID(x) do {} while (0)
+#endif
+
+#define MCDI_BUF_LEN (8 + MCDI_CTL_SDU_LEN_MAX)
+
+static inline struct cdx_mcdi_iface *cdx_mcdi_if(struct cdx_mcdi *cdx)
+{
+ return cdx->mcdi ? &cdx->mcdi->iface : NULL;
+}
+
+int cdx_mcdi_rpc_async(struct cdx_mcdi *cdx, unsigned int cmd,
+ const struct cdx_dword *inbuf, size_t inlen,
+ cdx_mcdi_async_completer *complete,
+ unsigned long cookie);
+int cdx_mcdi_wait_for_quiescence(struct cdx_mcdi *cdx,
+ unsigned int timeout_jiffies);
+
+/*
+ * We expect that 16- and 32-bit fields in MCDI requests and responses
+ * are appropriately aligned, but 64-bit fields are only
+ * 32-bit-aligned.
+ */
+#define MCDI_BYTE(_buf, _field) \
+ ((void)BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 1), \
+ *MCDI_PTR(_buf, _field))
+#define MCDI_WORD(_buf, _field) \
+ ((void)BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 2), \
+ le16_to_cpu(*(__force const __le16 *)MCDI_PTR(_buf, _field)))
+#define MCDI_POPULATE_DWORD_1(_buf, _field, _name1, _value1) \
+ CDX_POPULATE_DWORD_1(*_MCDI_DWORD(_buf, _field), \
+ MC_CMD_ ## _name1, _value1)
+#define MCDI_SET_QWORD(_buf, _field, _value) \
+ do { \
+ CDX_POPULATE_DWORD_1(_MCDI_DWORD(_buf, _field)[0], \
+ CDX_DWORD, (u32)(_value)); \
+ CDX_POPULATE_DWORD_1(_MCDI_DWORD(_buf, _field)[1], \
+ CDX_DWORD, (u64)(_value) >> 32); \
+ } while (0)
+#define MCDI_QWORD(_buf, _field) \
+ (CDX_DWORD_FIELD(_MCDI_DWORD(_buf, _field)[0], CDX_DWORD) | \
+ (u64)CDX_DWORD_FIELD(_MCDI_DWORD(_buf, _field)[1], CDX_DWORD) << 32)
+
+#endif /* CDX_MCDID_H */
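
The qword macros above carry a 64-bit field as two 32-bit dwords because MCDI buffers only guarantee 32-bit alignment. A plain-C analogue of their effect (an illustration and an assumption about the macro expansion, not the real CDX_POPULATE_DWORD internals):

#include <linux/types.h>

static inline void example_set_qword(__le32 dw[2], u64 value)
{
	dw[0] = cpu_to_le32((u32)value);		/* low dword first */
	dw[1] = cpu_to_le32((u32)(value >> 32));	/* then the high dword */
}

static inline u64 example_get_qword(const __le32 dw[2])
{
	return le32_to_cpu(dw[0]) | ((u64)le32_to_cpu(dw[1]) << 32);
}
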
diff --git a/drivers/clk/renesas/clk-mstp.c b/drivers/clk/renesas/clk-mstp.c
index 5bc473c2adb3..2f65fe2c6bdf 100644
--- a/drivers/clk/renesas/clk-mstp.c
+++ b/drivers/clk/renesas/clk-mstp.c
@@ -303,6 +303,9 @@ void cpg_mstp_detach_dev(struct generic_pm_domain *unused, struct device *dev)
pm_clk_destroy(dev);
}
+static struct device_node *cpg_mstp_pd_np __initdata = NULL;
+static struct generic_pm_domain *cpg_mstp_pd_genpd __initdata = NULL;
+
void __init cpg_mstp_add_clk_domain(struct device_node *np)
{
struct generic_pm_domain *pd;
@@ -324,5 +327,20 @@ void __init cpg_mstp_add_clk_domain(struct device_node *np)
pd->detach_dev = cpg_mstp_detach_dev;
pm_genpd_init(pd, &pm_domain_always_on_gov, false);
- of_genpd_add_provider_simple(np, pd);
+ cpg_mstp_pd_np = of_node_get(np);
+ cpg_mstp_pd_genpd = pd;
+}
+
+static int __init cpg_mstp_pd_init_provider(void)
+{
+ int error;
+
+ if (!cpg_mstp_pd_np)
+ return -ENODEV;
+
+ error = of_genpd_add_provider_simple(cpg_mstp_pd_np, cpg_mstp_pd_genpd);
+
+ of_node_put(cpg_mstp_pd_np);
+ return error;
}
+postcore_initcall(cpg_mstp_pd_init_provider);
diff --git a/drivers/clk/sunxi-ng/ccu_mp.c b/drivers/clk/sunxi-ng/ccu_mp.c
index 354c981943b6..4221b1888b38 100644
--- a/drivers/clk/sunxi-ng/ccu_mp.c
+++ b/drivers/clk/sunxi-ng/ccu_mp.c
@@ -185,7 +185,7 @@ static unsigned long ccu_mp_recalc_rate(struct clk_hw *hw,
p &= (1 << cmp->p.width) - 1;
if (cmp->common.features & CCU_FEATURE_DUAL_DIV)
- rate = (parent_rate / p) / m;
+ rate = (parent_rate / (p + cmp->p.offset)) / m;
else
rate = (parent_rate >> p) / m;
diff --git a/drivers/comedi/comedi_fops.c b/drivers/comedi/comedi_fops.c
index 23b7178522ae..7e2f2b1a1c36 100644
--- a/drivers/comedi/comedi_fops.c
+++ b/drivers/comedi/comedi_fops.c
@@ -1587,6 +1587,9 @@ static int do_insnlist_ioctl(struct comedi_device *dev,
memset(&data[n], 0, (MIN_SAMPLES - n) *
sizeof(unsigned int));
}
+ } else {
+ memset(data, 0, max_t(unsigned int, n, MIN_SAMPLES) *
+ sizeof(unsigned int));
}
ret = parse_insn(dev, insns + i, data, file);
if (ret < 0)
@@ -1670,6 +1673,8 @@ static int do_insn_ioctl(struct comedi_device *dev,
memset(&data[insn->n], 0,
(MIN_SAMPLES - insn->n) * sizeof(unsigned int));
}
+ } else {
+ memset(data, 0, n_data * sizeof(unsigned int));
}
ret = parse_insn(dev, insn, data, file);
if (ret < 0)
diff --git a/drivers/comedi/drivers.c b/drivers/comedi/drivers.c
index f1dc854928c1..c9ebaadc5e82 100644
--- a/drivers/comedi/drivers.c
+++ b/drivers/comedi/drivers.c
@@ -620,11 +620,9 @@ static int insn_rw_emulate_bits(struct comedi_device *dev,
unsigned int chan = CR_CHAN(insn->chanspec);
unsigned int base_chan = (chan < 32) ? 0 : chan;
unsigned int _data[2];
+ unsigned int i;
int ret;
- if (insn->n == 0)
- return 0;
-
memset(_data, 0, sizeof(_data));
memset(&_insn, 0, sizeof(_insn));
_insn.insn = INSN_BITS;
@@ -635,18 +633,21 @@ static int insn_rw_emulate_bits(struct comedi_device *dev,
if (insn->insn == INSN_WRITE) {
if (!(s->subdev_flags & SDF_WRITABLE))
return -EINVAL;
- _data[0] = 1U << (chan - base_chan); /* mask */
- _data[1] = data[0] ? (1U << (chan - base_chan)) : 0; /* bits */
+ _data[0] = 1U << (chan - base_chan); /* mask */
}
+ for (i = 0; i < insn->n; i++) {
+ if (insn->insn == INSN_WRITE)
+ _data[1] = data[i] ? _data[0] : 0; /* bits */
- ret = s->insn_bits(dev, s, &_insn, _data);
- if (ret < 0)
- return ret;
+ ret = s->insn_bits(dev, s, &_insn, _data);
+ if (ret < 0)
+ return ret;
- if (insn->insn == INSN_READ)
- data[0] = (_data[1] >> (chan - base_chan)) & 1;
+ if (insn->insn == INSN_READ)
+ data[i] = (_data[1] >> (chan - base_chan)) & 1;
+ }
- return 1;
+ return insn->n;
}
static int __comedi_device_postconfig_async(struct comedi_device *dev,
diff --git a/drivers/comedi/drivers/pcl726.c b/drivers/comedi/drivers/pcl726.c
index 0430630e6ebb..b542896fa0e4 100644
--- a/drivers/comedi/drivers/pcl726.c
+++ b/drivers/comedi/drivers/pcl726.c
@@ -328,7 +328,8 @@ static int pcl726_attach(struct comedi_device *dev,
* Hook up the external trigger source interrupt only if the
* user config option is valid and the board supports interrupts.
*/
- if (it->options[1] && (board->irq_mask & (1 << it->options[1]))) {
+ if (it->options[1] > 0 && it->options[1] < 16 &&
+ (board->irq_mask & (1U << it->options[1]))) {
ret = request_irq(it->options[1], pcl726_interrupt, 0,
dev->board_name, dev);
if (ret == 0) {
diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index bbc27ef9edf7..b4c79fde1979 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1554,13 +1554,15 @@ static void amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy)
pr_debug("CPU %d exiting\n", policy->cpu);
}
-static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy)
+static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy, bool policy_change)
{
struct amd_cpudata *cpudata = policy->driver_data;
union perf_cached perf;
u8 epp;
- if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq)
+ if (policy_change ||
+ policy->min != cpudata->min_limit_freq ||
+ policy->max != cpudata->max_limit_freq)
amd_pstate_update_min_max_limit(policy);
if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
@@ -1584,7 +1586,7 @@ static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy)
cpudata->policy = policy->policy;
- ret = amd_pstate_epp_update_limit(policy);
+ ret = amd_pstate_epp_update_limit(policy, true);
if (ret)
return ret;
@@ -1626,13 +1628,14 @@ static int amd_pstate_suspend(struct cpufreq_policy *policy)
* min_perf value across kexec reboots. If this CPU is just resumed back without kexec,
* the limits, epp and desired perf will get reset to the cached values in cpudata struct
*/
- ret = amd_pstate_update_perf(policy, perf.bios_min_perf, 0U, 0U, 0U, false);
+ ret = amd_pstate_update_perf(policy, perf.bios_min_perf,
+ FIELD_GET(AMD_CPPC_DES_PERF_MASK, cpudata->cppc_req_cached),
+ FIELD_GET(AMD_CPPC_MAX_PERF_MASK, cpudata->cppc_req_cached),
+ FIELD_GET(AMD_CPPC_EPP_PERF_MASK, cpudata->cppc_req_cached),
+ false);
if (ret)
return ret;
- /* invalidate to ensure it's rewritten during resume */
- cpudata->cppc_req_cached = 0;
-
/* set this flag to avoid setting core offline*/
cpudata->suspended = true;
@@ -1658,7 +1661,7 @@ static int amd_pstate_epp_resume(struct cpufreq_policy *policy)
int ret;
/* enable amd pstate from suspend state*/
- ret = amd_pstate_epp_update_limit(policy);
+ ret = amd_pstate_epp_update_limit(policy, false);
if (ret)
return ret;
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index fc7eace8b65b..58e3839a2140 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2953,6 +2953,15 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
goto err_null_driver;
}
+ /*
+ * Mark support for the scheduler's frequency invariance engine for
+ * drivers that implement target(), target_index() or fast_switch().
+ */
+ if (!cpufreq_driver->setpolicy) {
+ static_branch_enable_cpuslocked(&cpufreq_freq_invariance);
+ pr_debug("cpufreq: supports frequency invariance\n");
+ }
+
ret = subsys_interface_register(&cpufreq_interface);
if (ret)
goto err_boost_unreg;
@@ -2974,21 +2983,14 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
hp_online = ret;
ret = 0;
- /*
- * Mark support for the scheduler's frequency invariance engine for
- * drivers that implement target(), target_index() or fast_switch().
- */
- if (!cpufreq_driver->setpolicy) {
- static_branch_enable_cpuslocked(&cpufreq_freq_invariance);
- pr_debug("supports frequency invariance");
- }
-
pr_debug("driver %s up and running\n", driver_data->name);
goto out;
err_if_unreg:
subsys_interface_unregister(&cpufreq_interface);
err_boost_unreg:
+ if (!cpufreq_driver->setpolicy)
+ static_branch_disable_cpuslocked(&cpufreq_freq_invariance);
remove_boost_sysfs_file();
err_null_driver:
write_lock_irqsave(&cpufreq_driver_lock, flags);
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 06a1c7dd081f..0d5d283a5429 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1034,8 +1034,8 @@ static bool hybrid_register_perf_domain(unsigned int cpu)
if (!cpu_dev)
return false;
- if (em_dev_register_perf_domain(cpu_dev, HYBRID_EM_STATE_COUNT, &cb,
- cpumask_of(cpu), false))
+ if (em_dev_register_pd_no_update(cpu_dev, HYBRID_EM_STATE_COUNT, &cb,
+ cpumask_of(cpu), false))
return false;
cpudata->pd_registered = true;
@@ -2793,6 +2793,7 @@ static const struct x86_cpu_id intel_pstate_cpu_oob_ids[] __initconst = {
X86_MATCH(INTEL_GRANITERAPIDS_X, core_funcs),
X86_MATCH(INTEL_ATOM_CRESTMONT, core_funcs),
X86_MATCH(INTEL_ATOM_CRESTMONT_X, core_funcs),
+ X86_MATCH(INTEL_ATOM_DARKMONT_X, core_funcs),
{}
};
#endif
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 52d5d26fc7c6..b2e3d0b0a116 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -97,6 +97,14 @@ static inline int which_bucket(u64 duration_ns)
static DEFINE_PER_CPU(struct menu_device, menu_devices);
+static void menu_update_intervals(struct menu_device *data, unsigned int interval_us)
+{
+ /* Update the repeating-pattern data. */
+ data->intervals[data->interval_ptr++] = interval_us;
+ if (data->interval_ptr >= INTERVALS)
+ data->interval_ptr = 0;
+}
+
static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
/*
@@ -222,6 +230,14 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
if (data->needs_update) {
menu_update(drv, dev);
data->needs_update = 0;
+ } else if (!dev->last_residency_ns) {
+ /*
+ * This happens when the driver rejects the previously selected
+ * idle state and returns an error, so update the recent
+ * intervals table to prevent invalid information from being
+ * used going forward.
+ */
+ menu_update_intervals(data, UINT_MAX);
}
/* Find the shortest expected idle interval. */
@@ -271,20 +287,15 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
return 0;
}
- if (tick_nohz_tick_stopped()) {
- /*
- * If the tick is already stopped, the cost of possible short
- * idle duration misprediction is much higher, because the CPU
- * may be stuck in a shallow idle state for a long time as a
- * result of it. In that case say we might mispredict and use
- * the known time till the closest timer event for the idle
- * state selection.
- */
- if (predicted_ns < TICK_NSEC)
- predicted_ns = data->next_timer_ns;
- } else if (latency_req > predicted_ns) {
- latency_req = predicted_ns;
- }
+ /*
+ * If the tick is already stopped, the cost of possible short idle
+ * duration misprediction is much higher, because the CPU may be stuck
+ * in a shallow idle state for a long time as a result of it. In that
+ * case, say we might mispredict and use the known time till the closest
+ * timer event for the idle state selection.
+ */
+ if (tick_nohz_tick_stopped() && predicted_ns < TICK_NSEC)
+ predicted_ns = data->next_timer_ns;
/*
* Find the idle state with the lowest power while satisfying
@@ -300,13 +311,15 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
if (idx == -1)
idx = i; /* first enabled state */
+ if (s->exit_latency_ns > latency_req)
+ break;
+
if (s->target_residency_ns > predicted_ns) {
/*
* Use a physical idle state, not busy polling, unless
* a timer is going to trigger soon enough.
*/
if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) &&
- s->exit_latency_ns <= latency_req &&
s->target_residency_ns <= data->next_timer_ns) {
predicted_ns = s->target_residency_ns;
idx = i;
@@ -338,8 +351,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
return idx;
}
- if (s->exit_latency_ns > latency_req)
- break;
idx = i;
}
@@ -482,10 +493,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
data->correction_factor[data->bucket] = new_factor;
- /* update the repeating-pattern data */
- data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns);
- if (data->interval_ptr >= INTERVALS)
- data->interval_ptr = 0;
+ menu_update_intervals(data, ktime_to_us(measured_ns));
}
/**
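
menu_update_intervals() introduced above maintains a small ring of recent idle intervals, and the new branch records UINT_MAX when the driver rejected the previously selected state so a bogus zero residency is not treated as a real measurement. A minimal sketch of such a ring; the field names mirror the governor, while the constant and struct here are illustrative only:

#define EXAMPLE_INTERVALS 8

struct example_menu {
	unsigned int intervals[EXAMPLE_INTERVALS];
	unsigned int interval_ptr;
};

static void example_update_intervals(struct example_menu *m, unsigned int interval_us)
{
	m->intervals[m->interval_ptr++] = interval_us;	/* record newest sample */
	if (m->interval_ptr >= EXAMPLE_INTERVALS)
		m->interval_ptr = 0;			/* wrap the ring pointer */
}
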
diff --git a/drivers/crypto/ccp/Makefile b/drivers/crypto/ccp/Makefile
index 394484929dae..a9626b30044a 100644
--- a/drivers/crypto/ccp/Makefile
+++ b/drivers/crypto/ccp/Makefile
@@ -13,7 +13,8 @@ ccp-$(CONFIG_CRYPTO_DEV_SP_PSP) += psp-dev.o \
tee-dev.o \
platform-access.o \
dbc.o \
- hsti.o
+ hsti.o \
+ sfs.o
obj-$(CONFIG_CRYPTO_DEV_CCP_CRYPTO) += ccp-crypto.o
ccp-crypto-objs := ccp-crypto-main.o \
diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c
index 1c5a7189631e..9e21da0e298a 100644
--- a/drivers/crypto/ccp/psp-dev.c
+++ b/drivers/crypto/ccp/psp-dev.c
@@ -17,6 +17,7 @@
#include "psp-dev.h"
#include "sev-dev.h"
#include "tee-dev.h"
+#include "sfs.h"
#include "platform-access.h"
#include "dbc.h"
#include "hsti.h"
@@ -182,6 +183,17 @@ static int psp_check_tee_support(struct psp_device *psp)
return 0;
}
+static int psp_check_sfs_support(struct psp_device *psp)
+{
+ /* Check if device supports SFS feature */
+ if (!psp->capability.sfs) {
+ dev_dbg(psp->dev, "psp does not support SFS\n");
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
static int psp_init(struct psp_device *psp)
{
int ret;
@@ -198,6 +210,12 @@ static int psp_init(struct psp_device *psp)
return ret;
}
+ if (!psp_check_sfs_support(psp)) {
+ ret = sfs_dev_init(psp);
+ if (ret)
+ return ret;
+ }
+
if (psp->vdata->platform_access) {
ret = platform_access_dev_init(psp);
if (ret)
@@ -302,6 +320,8 @@ void psp_dev_destroy(struct sp_device *sp)
tee_dev_destroy(psp);
+ sfs_dev_destroy(psp);
+
dbc_dev_destroy(psp);
platform_access_dev_destroy(psp);
diff --git a/drivers/crypto/ccp/psp-dev.h b/drivers/crypto/ccp/psp-dev.h
index e43ce87ede76..268c83f298cb 100644
--- a/drivers/crypto/ccp/psp-dev.h
+++ b/drivers/crypto/ccp/psp-dev.h
@@ -32,7 +32,8 @@ union psp_cap_register {
unsigned int sev :1,
tee :1,
dbc_thru_ext :1,
- rsvd1 :4,
+ sfs :1,
+ rsvd1 :3,
security_reporting :1,
fused_part :1,
rsvd2 :1,
@@ -68,6 +69,7 @@ struct psp_device {
void *tee_data;
void *platform_access_data;
void *dbc_data;
+ void *sfs_data;
union psp_cap_register capability;
};
@@ -118,12 +120,16 @@ struct psp_ext_request {
* @PSP_SUB_CMD_DBC_SET_UID: Set UID for DBC
* @PSP_SUB_CMD_DBC_GET_PARAMETER: Get parameter from DBC
* @PSP_SUB_CMD_DBC_SET_PARAMETER: Set parameter for DBC
+ * @PSP_SUB_CMD_SFS_GET_FW_VERS: Get firmware versions for ASP and other MP
+ * @PSP_SUB_CMD_SFS_UPDATE: Command to load, verify and execute SFS package
*/
enum psp_sub_cmd {
PSP_SUB_CMD_DBC_GET_NONCE = PSP_DYNAMIC_BOOST_GET_NONCE,
PSP_SUB_CMD_DBC_SET_UID = PSP_DYNAMIC_BOOST_SET_UID,
PSP_SUB_CMD_DBC_GET_PARAMETER = PSP_DYNAMIC_BOOST_GET_PARAMETER,
PSP_SUB_CMD_DBC_SET_PARAMETER = PSP_DYNAMIC_BOOST_SET_PARAMETER,
+ PSP_SUB_CMD_SFS_GET_FW_VERS = PSP_SFS_GET_FW_VERSIONS,
+ PSP_SUB_CMD_SFS_UPDATE = PSP_SFS_UPDATE,
};
int psp_extended_mailbox_cmd(struct psp_device *psp, unsigned int timeout_msecs,
diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
index e058ba027792..65d6d0af140a 100644
--- a/drivers/crypto/ccp/sev-dev.c
+++ b/drivers/crypto/ccp/sev-dev.c
@@ -82,6 +82,21 @@ MODULE_FIRMWARE("amd/amd_sev_fam19h_model1xh.sbin"); /* 4th gen EPYC */
static bool psp_dead;
static int psp_timeout;
+enum snp_hv_fixed_pages_state {
+ ALLOCATED,
+ HV_FIXED,
+};
+
+struct snp_hv_fixed_pages_entry {
+ struct list_head list;
+ struct page *page;
+ unsigned int order;
+ bool free;
+ enum snp_hv_fixed_pages_state page_state;
+};
+
+static LIST_HEAD(snp_hv_fixed_pages);
+
/* Trusted Memory Region (TMR):
* The TMR is a 1MB area that must be 1MB aligned. Use the page allocator
* to allocate the memory, which will return aligned memory for the specified
@@ -1073,6 +1088,165 @@ static void snp_set_hsave_pa(void *arg)
wrmsrq(MSR_VM_HSAVE_PA, 0);
}
+/* Hypervisor Fixed pages API interface */
+static void snp_hv_fixed_pages_state_update(struct sev_device *sev,
+ enum snp_hv_fixed_pages_state page_state)
+{
+ struct snp_hv_fixed_pages_entry *entry;
+
+ /* List is protected by sev_cmd_mutex */
+ lockdep_assert_held(&sev_cmd_mutex);
+
+ if (list_empty(&snp_hv_fixed_pages))
+ return;
+
+ list_for_each_entry(entry, &snp_hv_fixed_pages, list)
+ entry->page_state = page_state;
+}
+
+/*
+ * Allocate HV_FIXED pages in 2MB-aligned sizes so that whole 2MB pages
+ * are marked as HV_FIXED.
+ */
+struct page *snp_alloc_hv_fixed_pages(unsigned int num_2mb_pages)
+{
+ struct psp_device *psp_master = psp_get_master_device();
+ struct snp_hv_fixed_pages_entry *entry;
+ struct sev_device *sev;
+ unsigned int order;
+ struct page *page;
+
+ if (!psp_master || !psp_master->sev_data)
+ return NULL;
+
+ sev = psp_master->sev_data;
+
+ order = get_order(PMD_SIZE * num_2mb_pages);
+
+ /*
+ * SNP_INIT_EX is protected by sev_cmd_mutex, therefore this list
+ * also needs to be protected using the same mutex.
+ */
+ guard(mutex)(&sev_cmd_mutex);
+
+ /*
+ * This API uses SNP_INIT_EX to transition allocated pages to HV_Fixed
+	 * page state, so fail if SNP is already initialized.
+ */
+ if (sev->snp_initialized)
+ return NULL;
+
+ /* Re-use freed pages that match the request */
+ list_for_each_entry(entry, &snp_hv_fixed_pages, list) {
+ /* Hypervisor fixed page allocator implements exact fit policy */
+ if (entry->order == order && entry->free) {
+ entry->free = false;
+ memset(page_address(entry->page), 0,
+ (1 << entry->order) * PAGE_SIZE);
+ return entry->page;
+ }
+ }
+
+ page = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
+ if (!page)
+ return NULL;
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry) {
+ __free_pages(page, order);
+ return NULL;
+ }
+
+ entry->page = page;
+ entry->order = order;
+ list_add_tail(&entry->list, &snp_hv_fixed_pages);
+
+ return page;
+}
+
+void snp_free_hv_fixed_pages(struct page *page)
+{
+ struct psp_device *psp_master = psp_get_master_device();
+ struct snp_hv_fixed_pages_entry *entry, *nentry;
+
+ if (!psp_master || !psp_master->sev_data)
+ return;
+
+ /*
+ * SNP_INIT_EX is protected by sev_cmd_mutex, therefore this list
+ * also needs to be protected using the same mutex.
+ */
+ guard(mutex)(&sev_cmd_mutex);
+
+ list_for_each_entry_safe(entry, nentry, &snp_hv_fixed_pages, list) {
+ if (entry->page != page)
+ continue;
+
+ /*
+		 * The HV_FIXED page state cannot be changed until reboot, and such
+		 * pages cannot be used by an SNP guest, so they cannot be returned
+		 * to the page allocator.
+		 * Mark the pages as free internally to allow possible re-use.
+ */
+ if (entry->page_state == HV_FIXED) {
+ entry->free = true;
+ } else {
+ __free_pages(page, entry->order);
+ list_del(&entry->list);
+ kfree(entry);
+ }
+ return;
+ }
+}
+
+static void snp_add_hv_fixed_pages(struct sev_device *sev, struct sev_data_range_list *range_list)
+{
+ struct snp_hv_fixed_pages_entry *entry;
+ struct sev_data_range *range;
+ int num_elements;
+
+ lockdep_assert_held(&sev_cmd_mutex);
+
+ if (list_empty(&snp_hv_fixed_pages))
+ return;
+
+ num_elements = list_count_nodes(&snp_hv_fixed_pages) +
+ range_list->num_elements;
+
+ /*
+ * Ensure the list of HV_FIXED pages that will be passed to firmware
+	 * does not exceed the page-sized argument buffer.
+ */
+ if (num_elements * sizeof(*range) + sizeof(*range_list) > PAGE_SIZE) {
+ dev_warn(sev->dev, "Additional HV_Fixed pages cannot be accommodated, omitting\n");
+ return;
+ }
+
+ range = &range_list->ranges[range_list->num_elements];
+ list_for_each_entry(entry, &snp_hv_fixed_pages, list) {
+ range->base = page_to_pfn(entry->page) << PAGE_SHIFT;
+ range->page_count = 1 << entry->order;
+ range++;
+ }
+ range_list->num_elements = num_elements;
+}
+
+static void snp_leak_hv_fixed_pages(void)
+{
+ struct snp_hv_fixed_pages_entry *entry;
+
+ /* List is protected by sev_cmd_mutex */
+ lockdep_assert_held(&sev_cmd_mutex);
+
+ if (list_empty(&snp_hv_fixed_pages))
+ return;
+
+ list_for_each_entry(entry, &snp_hv_fixed_pages, list)
+ if (entry->page_state == HV_FIXED)
+ __snp_leak_pages(page_to_pfn(entry->page),
+ 1 << entry->order, false);
+}
+
static int snp_filter_reserved_mem_regions(struct resource *rs, void *arg)
{
struct sev_data_range_list *range_list = arg;
@@ -1163,6 +1337,12 @@ static int __sev_snp_init_locked(int *error)
return rc;
}
+ /*
+	 * Add HV_Fixed pages from other PSP sub-devices, such as SFS, to the
+ * HV_Fixed page list.
+ */
+ snp_add_hv_fixed_pages(sev, snp_range_list);
+
memset(&data, 0, sizeof(data));
data.init_rmp = 1;
data.list_paddr_en = 1;
@@ -1202,6 +1382,7 @@ static int __sev_snp_init_locked(int *error)
return rc;
}
+ snp_hv_fixed_pages_state_update(sev, HV_FIXED);
sev->snp_initialized = true;
dev_dbg(sev->dev, "SEV-SNP firmware initialized\n");
@@ -1784,6 +1965,7 @@ static int __sev_snp_shutdown_locked(int *error, bool panic)
return ret;
}
+ snp_leak_hv_fixed_pages();
sev->snp_initialized = false;
dev_dbg(sev->dev, "SEV-SNP firmware shutdown\n");
@@ -2430,7 +2612,7 @@ static void __sev_firmware_shutdown(struct sev_device *sev, bool panic)
{
int error;
- __sev_platform_shutdown_locked(NULL);
+ __sev_platform_shutdown_locked(&error);
if (sev_es_tmr) {
/*
diff --git a/drivers/crypto/ccp/sev-dev.h b/drivers/crypto/ccp/sev-dev.h
index 3e4e5574e88a..28021abc85ad 100644
--- a/drivers/crypto/ccp/sev-dev.h
+++ b/drivers/crypto/ccp/sev-dev.h
@@ -65,4 +65,7 @@ void sev_dev_destroy(struct psp_device *psp);
void sev_pci_init(void);
void sev_pci_exit(void);
+struct page *snp_alloc_hv_fixed_pages(unsigned int num_2mb_pages);
+void snp_free_hv_fixed_pages(struct page *page);
+
#endif /* __SEV_DEV_H */
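
A hedged usage sketch for the two helpers declared above; the caller is illustrative, and, as snp_alloc_hv_fixed_pages() enforces, the allocation must happen before SNP is initialized:

#include <linux/errno.h>
#include <linux/mm.h>

static struct page *example_page;

static int example_alloc_hv_fixed(void)
{
	/* one 2MB-aligned chunk; must run before SNP_INIT_EX is issued */
	example_page = snp_alloc_hv_fixed_pages(1);
	return example_page ? 0 : -ENOMEM;
}

static void example_free_hv_fixed(void)
{
	/* pages already in HV_Fixed state stay on an internal reuse list */
	snp_free_hv_fixed_pages(example_page);
}
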
diff --git a/drivers/crypto/ccp/sfs.c b/drivers/crypto/ccp/sfs.c
new file mode 100644
index 000000000000..2f4beaafe7ec
--- /dev/null
+++ b/drivers/crypto/ccp/sfs.c
@@ -0,0 +1,311 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD Secure Processor Seamless Firmware Servicing support.
+ *
+ * Copyright (C) 2025 Advanced Micro Devices, Inc.
+ *
+ * Author: Ashish Kalra <ashish.kalra@amd.com>
+ */
+
+#include <linux/firmware.h>
+
+#include "sfs.h"
+#include "sev-dev.h"
+
+#define SFS_DEFAULT_TIMEOUT (10 * MSEC_PER_SEC)
+#define SFS_MAX_PAYLOAD_SIZE (2 * 1024 * 1024)
+#define SFS_NUM_2MB_PAGES_CMDBUF (SFS_MAX_PAYLOAD_SIZE / PMD_SIZE)
+#define SFS_NUM_PAGES_CMDBUF (SFS_MAX_PAYLOAD_SIZE / PAGE_SIZE)
+
+static DEFINE_MUTEX(sfs_ioctl_mutex);
+
+static struct sfs_misc_dev *misc_dev;
+
+static int send_sfs_cmd(struct sfs_device *sfs_dev, int msg)
+{
+ int ret;
+
+ sfs_dev->command_buf->hdr.status = 0;
+ sfs_dev->command_buf->hdr.sub_cmd_id = msg;
+
+ ret = psp_extended_mailbox_cmd(sfs_dev->psp,
+ SFS_DEFAULT_TIMEOUT,
+ (struct psp_ext_request *)sfs_dev->command_buf);
+ if (ret == -EIO) {
+ dev_dbg(sfs_dev->dev,
+ "msg 0x%x failed with PSP error: 0x%x, extended status: 0x%x\n",
+ msg, sfs_dev->command_buf->hdr.status,
+ *(u32 *)sfs_dev->command_buf->buf);
+ }
+
+ return ret;
+}
+
+static int send_sfs_get_fw_versions(struct sfs_device *sfs_dev)
+{
+ /*
+ * SFS_GET_FW_VERSIONS command needs the output buffer to be
+ * initialized to 0xC7 in every byte.
+ */
+ memset(sfs_dev->command_buf->sfs_buffer, 0xc7, PAGE_SIZE);
+ sfs_dev->command_buf->hdr.payload_size = 2 * PAGE_SIZE;
+
+ return send_sfs_cmd(sfs_dev, PSP_SFS_GET_FW_VERSIONS);
+}
+
+static int send_sfs_update_package(struct sfs_device *sfs_dev, const char *payload_name)
+{
+ char payload_path[PAYLOAD_NAME_SIZE + sizeof("amd/")];
+ const struct firmware *firmware;
+ unsigned long package_size;
+ int ret;
+
+ /* Sanitize userspace provided payload name */
+ if (!strnchr(payload_name, PAYLOAD_NAME_SIZE, '\0'))
+ return -EINVAL;
+
+ snprintf(payload_path, sizeof(payload_path), "amd/%s", payload_name);
+
+ ret = firmware_request_nowarn(&firmware, payload_path, sfs_dev->dev);
+ if (ret < 0) {
+ dev_warn_ratelimited(sfs_dev->dev, "firmware request failed for %s (%d)\n",
+ payload_path, ret);
+ return -ENOENT;
+ }
+
+ /*
+ * SFS Update Package command's input buffer contains TEE_EXT_CMD_BUFFER
+	 * followed by the Update Package, and it should be 64KB aligned.
+ */
+ package_size = ALIGN(firmware->size + PAGE_SIZE, 0x10000U);
+
+ /*
+	 * The SFS command buffer is a pre-allocated 2MB buffer; fail the update
+	 * package if the SFS payload is larger than that buffer.
+ */
+ if (package_size > SFS_MAX_PAYLOAD_SIZE) {
+ dev_warn_ratelimited(sfs_dev->dev,
+ "SFS payload size %ld larger than maximum supported payload size of %u\n",
+ package_size, SFS_MAX_PAYLOAD_SIZE);
+ release_firmware(firmware);
+ return -E2BIG;
+ }
+
+ /*
+ * Copy firmware data to a HV_Fixed memory region.
+ */
+ memcpy(sfs_dev->command_buf->sfs_buffer, firmware->data, firmware->size);
+ sfs_dev->command_buf->hdr.payload_size = package_size;
+
+ release_firmware(firmware);
+
+ return send_sfs_cmd(sfs_dev, PSP_SFS_UPDATE);
+}
+
+static long sfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ struct sfs_user_get_fw_versions __user *sfs_get_fw_versions;
+ struct sfs_user_update_package __user *sfs_update_package;
+ struct psp_device *psp_master = psp_get_master_device();
+ char payload_name[PAYLOAD_NAME_SIZE];
+ struct sfs_device *sfs_dev;
+ int ret = 0;
+
+ if (!psp_master || !psp_master->sfs_data)
+ return -ENODEV;
+
+ sfs_dev = psp_master->sfs_data;
+
+ guard(mutex)(&sfs_ioctl_mutex);
+
+ switch (cmd) {
+ case SFSIOCFWVERS:
+ dev_dbg(sfs_dev->dev, "in SFSIOCFWVERS\n");
+
+ sfs_get_fw_versions = (struct sfs_user_get_fw_versions __user *)arg;
+
+ ret = send_sfs_get_fw_versions(sfs_dev);
+ if (ret && ret != -EIO)
+ return ret;
+
+ /*
+ * Return SFS status and extended status back to userspace
+ * if PSP status indicated success or command error.
+ */
+ if (copy_to_user(&sfs_get_fw_versions->blob, sfs_dev->command_buf->sfs_buffer,
+ PAGE_SIZE))
+ return -EFAULT;
+ if (copy_to_user(&sfs_get_fw_versions->sfs_status,
+ &sfs_dev->command_buf->hdr.status,
+ sizeof(sfs_get_fw_versions->sfs_status)))
+ return -EFAULT;
+ if (copy_to_user(&sfs_get_fw_versions->sfs_extended_status,
+ &sfs_dev->command_buf->buf,
+ sizeof(sfs_get_fw_versions->sfs_extended_status)))
+ return -EFAULT;
+ break;
+ case SFSIOCUPDATEPKG:
+ dev_dbg(sfs_dev->dev, "in SFSIOCUPDATEPKG\n");
+
+ sfs_update_package = (struct sfs_user_update_package __user *)arg;
+
+ if (copy_from_user(payload_name, sfs_update_package->payload_name,
+ PAYLOAD_NAME_SIZE))
+ return -EFAULT;
+
+ ret = send_sfs_update_package(sfs_dev, payload_name);
+ if (ret && ret != -EIO)
+ return ret;
+
+ /*
+ * Return SFS status and extended status back to userspace
+ * if PSP status indicated success or command error.
+ */
+ if (copy_to_user(&sfs_update_package->sfs_status,
+ &sfs_dev->command_buf->hdr.status,
+ sizeof(sfs_update_package->sfs_status)))
+ return -EFAULT;
+ if (copy_to_user(&sfs_update_package->sfs_extended_status,
+ &sfs_dev->command_buf->buf,
+ sizeof(sfs_update_package->sfs_extended_status)))
+ return -EFAULT;
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static const struct file_operations sfs_fops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = sfs_ioctl,
+};
+
+static void sfs_exit(struct kref *ref)
+{
+ misc_deregister(&misc_dev->misc);
+ kfree(misc_dev);
+ misc_dev = NULL;
+}
+
+void sfs_dev_destroy(struct psp_device *psp)
+{
+ struct sfs_device *sfs_dev = psp->sfs_data;
+
+ if (!sfs_dev)
+ return;
+
+ /*
+ * Change SFS command buffer back to the default "Write-Back" type.
+ */
+ set_memory_wb((unsigned long)sfs_dev->command_buf, SFS_NUM_PAGES_CMDBUF);
+
+ snp_free_hv_fixed_pages(sfs_dev->page);
+
+ if (sfs_dev->misc)
+ kref_put(&misc_dev->refcount, sfs_exit);
+
+ psp->sfs_data = NULL;
+}
+
+/* Based on sev_misc_init() */
+static int sfs_misc_init(struct sfs_device *sfs)
+{
+ struct device *dev = sfs->dev;
+ int ret;
+
+ /*
+	 * SFS feature support can be detected on multiple devices, but the SFS
+	 * FW commands must be issued on the master. During probe we do not yet
+	 * know the master, hence /dev/sfs is created on the first device probe.
+ */
+ if (!misc_dev) {
+ struct miscdevice *misc;
+
+ misc_dev = kzalloc(sizeof(*misc_dev), GFP_KERNEL);
+ if (!misc_dev)
+ return -ENOMEM;
+
+ misc = &misc_dev->misc;
+ misc->minor = MISC_DYNAMIC_MINOR;
+ misc->name = "sfs";
+ misc->fops = &sfs_fops;
+ misc->mode = 0600;
+
+ ret = misc_register(misc);
+ if (ret)
+ return ret;
+
+ kref_init(&misc_dev->refcount);
+ } else {
+ kref_get(&misc_dev->refcount);
+ }
+
+ sfs->misc = misc_dev;
+ dev_dbg(dev, "registered SFS device\n");
+
+ return 0;
+}
+
+int sfs_dev_init(struct psp_device *psp)
+{
+ struct device *dev = psp->dev;
+ struct sfs_device *sfs_dev;
+ struct page *page;
+ int ret = -ENOMEM;
+
+ sfs_dev = devm_kzalloc(dev, sizeof(*sfs_dev), GFP_KERNEL);
+ if (!sfs_dev)
+ return -ENOMEM;
+
+ /*
+	 * Pre-allocate a 2MB command buffer for all SFS commands using the
+	 * SNP HV_Fixed page allocator, which also transitions the SFS command
+	 * buffer to the HV_Fixed page state if SNP is enabled.
+ */
+ page = snp_alloc_hv_fixed_pages(SFS_NUM_2MB_PAGES_CMDBUF);
+ if (!page) {
+ dev_dbg(dev, "Command Buffer HV-Fixed page allocation failed\n");
+ goto cleanup_dev;
+ }
+ sfs_dev->page = page;
+ sfs_dev->command_buf = page_address(page);
+
+ dev_dbg(dev, "Command buffer 0x%px to be marked as HV_Fixed\n", sfs_dev->command_buf);
+
+ /*
+ * SFS command buffer must be mapped as non-cacheable.
+ */
+ ret = set_memory_uc((unsigned long)sfs_dev->command_buf, SFS_NUM_PAGES_CMDBUF);
+ if (ret) {
+ dev_dbg(dev, "Set memory uc failed\n");
+ goto cleanup_cmd_buf;
+ }
+
+ dev_dbg(dev, "Command buffer 0x%px marked uncacheable\n", sfs_dev->command_buf);
+
+ psp->sfs_data = sfs_dev;
+ sfs_dev->dev = dev;
+ sfs_dev->psp = psp;
+
+ ret = sfs_misc_init(sfs_dev);
+ if (ret)
+ goto cleanup_mem_attr;
+
+ dev_notice(sfs_dev->dev, "SFS support is available\n");
+
+ return 0;
+
+cleanup_mem_attr:
+ set_memory_wb((unsigned long)sfs_dev->command_buf, SFS_NUM_PAGES_CMDBUF);
+
+cleanup_cmd_buf:
+ snp_free_hv_fixed_pages(page);
+
+cleanup_dev:
+ psp->sfs_data = NULL;
+ devm_kfree(dev, sfs_dev);
+
+ return ret;
+}
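
A hedged userspace sketch of how the two new /dev/sfs ioctls might be exercised; the struct layout and ioctl names are assumed to come from the uapi/linux/psp-sfs.h header this driver includes, which is not part of this hunk:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/psp-sfs.h>

int main(void)
{
	struct sfs_user_get_fw_versions vers;
	int fd = open("/dev/sfs", O_RDWR);	/* requires root: device mode is 0600 */

	if (fd < 0)
		return 1;

	memset(&vers, 0, sizeof(vers));
	if (ioctl(fd, SFSIOCFWVERS, &vers) == 0)
		printf("sfs_status=0x%x\n", vers.sfs_status);

	close(fd);
	return 0;
}
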
diff --git a/drivers/crypto/ccp/sfs.h b/drivers/crypto/ccp/sfs.h
new file mode 100644
index 000000000000..97704c210efd
--- /dev/null
+++ b/drivers/crypto/ccp/sfs.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * AMD Platform Security Processor (PSP) Seamless Firmware (SFS) Support.
+ *
+ * Copyright (C) 2025 Advanced Micro Devices, Inc.
+ *
+ * Author: Ashish Kalra <ashish.kalra@amd.com>
+ */
+
+#ifndef __SFS_H__
+#define __SFS_H__
+
+#include <uapi/linux/psp-sfs.h>
+
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/psp-sev.h>
+#include <linux/psp-platform-access.h>
+#include <linux/set_memory.h>
+
+#include "psp-dev.h"
+
+struct sfs_misc_dev {
+ struct kref refcount;
+ struct miscdevice misc;
+};
+
+struct sfs_command {
+ struct psp_ext_req_buffer_hdr hdr;
+ u8 buf[PAGE_SIZE - sizeof(struct psp_ext_req_buffer_hdr)];
+ u8 sfs_buffer[];
+} __packed;
+
+struct sfs_device {
+ struct device *dev;
+ struct psp_device *psp;
+
+ struct page *page;
+ struct sfs_command *command_buf;
+
+ struct sfs_misc_dev *misc;
+};
+
+void sfs_dev_destroy(struct psp_device *psp);
+int sfs_dev_init(struct psp_device *psp);
+
+#endif /* __SFS_H__ */
diff --git a/drivers/crypto/hisilicon/Kconfig b/drivers/crypto/hisilicon/Kconfig
index 4137a8bf131f..4835bdebdbb3 100644
--- a/drivers/crypto/hisilicon/Kconfig
+++ b/drivers/crypto/hisilicon/Kconfig
@@ -69,7 +69,6 @@ config CRYPTO_DEV_HISI_HPRE
select CRYPTO_DEV_HISI_QM
select CRYPTO_DH
select CRYPTO_RSA
- select CRYPTO_CURVE25519
select CRYPTO_ECDH
help
Support for HiSilicon HPRE(High Performance RSA Engine)
diff --git a/drivers/crypto/hisilicon/hpre/hpre_crypto.c b/drivers/crypto/hisilicon/hpre/hpre_crypto.c
index 1550c3818383..21ccf879f70c 100644
--- a/drivers/crypto/hisilicon/hpre/hpre_crypto.c
+++ b/drivers/crypto/hisilicon/hpre/hpre_crypto.c
@@ -1,7 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019 HiSilicon Limited. */
#include <crypto/akcipher.h>
-#include <crypto/curve25519.h>
#include <crypto/dh.h>
#include <crypto/ecc_curve.h>
#include <crypto/ecdh.h>
@@ -106,16 +105,6 @@ struct hpre_ecdh_ctx {
dma_addr_t dma_g;
};
-struct hpre_curve25519_ctx {
- /* low address: p->a->k */
- unsigned char *p;
- dma_addr_t dma_p;
-
- /* gx coordinate */
- unsigned char *g;
- dma_addr_t dma_g;
-};
-
struct hpre_ctx {
struct hisi_qp *qp;
struct device *dev;
@@ -129,7 +118,6 @@ struct hpre_ctx {
struct hpre_rsa_ctx rsa;
struct hpre_dh_ctx dh;
struct hpre_ecdh_ctx ecdh;
- struct hpre_curve25519_ctx curve25519;
};
/* for ecc algorithms */
unsigned int curve_id;
@@ -146,7 +134,6 @@ struct hpre_asym_request {
struct akcipher_request *rsa;
struct kpp_request *dh;
struct kpp_request *ecdh;
- struct kpp_request *curve25519;
} areq;
int err;
int req_id;
@@ -1214,8 +1201,7 @@ static void hpre_key_to_big_end(u8 *data, int len)
}
}
-static void hpre_ecc_clear_ctx(struct hpre_ctx *ctx, bool is_clear_all,
- bool is_ecdh)
+static void hpre_ecc_clear_ctx(struct hpre_ctx *ctx, bool is_clear_all)
{
struct device *dev = ctx->dev;
unsigned int sz = ctx->key_sz;
@@ -1224,17 +1210,11 @@ static void hpre_ecc_clear_ctx(struct hpre_ctx *ctx, bool is_clear_all,
if (is_clear_all)
hisi_qm_stop_qp(ctx->qp);
- if (is_ecdh && ctx->ecdh.p) {
+ if (ctx->ecdh.p) {
/* ecdh: p->a->k->b */
memzero_explicit(ctx->ecdh.p + shift, sz);
dma_free_coherent(dev, sz << 3, ctx->ecdh.p, ctx->ecdh.dma_p);
ctx->ecdh.p = NULL;
- } else if (!is_ecdh && ctx->curve25519.p) {
- /* curve25519: p->a->k */
- memzero_explicit(ctx->curve25519.p + shift, sz);
- dma_free_coherent(dev, sz << 2, ctx->curve25519.p,
- ctx->curve25519.dma_p);
- ctx->curve25519.p = NULL;
}
hpre_ctx_clear(ctx, is_clear_all);
@@ -1432,7 +1412,7 @@ static int hpre_ecdh_set_secret(struct crypto_kpp *tfm, const void *buf,
return -EINVAL;
}
- hpre_ecc_clear_ctx(ctx, false, true);
+ hpre_ecc_clear_ctx(ctx, false);
ret = hpre_ecdh_set_param(ctx, &params);
if (ret < 0) {
@@ -1683,337 +1663,7 @@ static void hpre_ecdh_exit_tfm(struct crypto_kpp *tfm)
{
struct hpre_ctx *ctx = kpp_tfm_ctx(tfm);
- hpre_ecc_clear_ctx(ctx, true, true);
-}
-
-static void hpre_curve25519_fill_curve(struct hpre_ctx *ctx, const void *buf,
- unsigned int len)
-{
- u8 secret[CURVE25519_KEY_SIZE] = { 0 };
- unsigned int sz = ctx->key_sz;
- const struct ecc_curve *curve;
- unsigned int shift = sz << 1;
- void *p;
-
- /*
- * The key from 'buf' is in little-endian, we should preprocess it as
- * the description in rfc7748: "k[0] &= 248, k[31] &= 127, k[31] |= 64",
- * then convert it to big endian. Only in this way, the result can be
- * the same as the software curve-25519 that exists in crypto.
- */
- memcpy(secret, buf, len);
- curve25519_clamp_secret(secret);
- hpre_key_to_big_end(secret, CURVE25519_KEY_SIZE);
-
- p = ctx->curve25519.p + sz - len;
-
- curve = ecc_get_curve25519();
-
- /* fill curve parameters */
- fill_curve_param(p, curve->p, len, curve->g.ndigits);
- fill_curve_param(p + sz, curve->a, len, curve->g.ndigits);
- memcpy(p + shift, secret, len);
- fill_curve_param(p + shift + sz, curve->g.x, len, curve->g.ndigits);
- memzero_explicit(secret, CURVE25519_KEY_SIZE);
-}
-
-static int hpre_curve25519_set_param(struct hpre_ctx *ctx, const void *buf,
- unsigned int len)
-{
- struct device *dev = ctx->dev;
- unsigned int sz = ctx->key_sz;
- unsigned int shift = sz << 1;
-
- /* p->a->k->gx */
- if (!ctx->curve25519.p) {
- ctx->curve25519.p = dma_alloc_coherent(dev, sz << 2,
- &ctx->curve25519.dma_p,
- GFP_KERNEL);
- if (!ctx->curve25519.p)
- return -ENOMEM;
- }
-
- ctx->curve25519.g = ctx->curve25519.p + shift + sz;
- ctx->curve25519.dma_g = ctx->curve25519.dma_p + shift + sz;
-
- hpre_curve25519_fill_curve(ctx, buf, len);
-
- return 0;
-}
-
-static int hpre_curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
- unsigned int len)
-{
- struct hpre_ctx *ctx = kpp_tfm_ctx(tfm);
- struct device *dev = ctx->dev;
- int ret = -EINVAL;
-
- if (len != CURVE25519_KEY_SIZE ||
- !crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE)) {
- dev_err(dev, "key is null or key len is not 32bytes!\n");
- return ret;
- }
-
- /* Free old secret if any */
- hpre_ecc_clear_ctx(ctx, false, false);
-
- ctx->key_sz = CURVE25519_KEY_SIZE;
- ret = hpre_curve25519_set_param(ctx, buf, CURVE25519_KEY_SIZE);
- if (ret) {
- dev_err(dev, "failed to set curve25519 param, ret = %d!\n", ret);
- hpre_ecc_clear_ctx(ctx, false, false);
- return ret;
- }
-
- return 0;
-}
-
-static void hpre_curve25519_hw_data_clr_all(struct hpre_ctx *ctx,
- struct hpre_asym_request *req,
- struct scatterlist *dst,
- struct scatterlist *src)
-{
- struct device *dev = ctx->dev;
- struct hpre_sqe *sqe = &req->req;
- dma_addr_t dma;
-
- dma = le64_to_cpu(sqe->in);
- if (unlikely(dma_mapping_error(dev, dma)))
- return;
-
- if (src && req->src)
- dma_free_coherent(dev, ctx->key_sz, req->src, dma);
-
- dma = le64_to_cpu(sqe->out);
- if (unlikely(dma_mapping_error(dev, dma)))
- return;
-
- if (req->dst)
- dma_free_coherent(dev, ctx->key_sz, req->dst, dma);
- if (dst)
- dma_unmap_single(dev, dma, ctx->key_sz, DMA_FROM_DEVICE);
-}
-
-static void hpre_curve25519_cb(struct hpre_ctx *ctx, void *resp)
-{
- struct hpre_dfx *dfx = ctx->hpre->debug.dfx;
- struct hpre_asym_request *req = NULL;
- struct kpp_request *areq;
- u64 overtime_thrhld;
- int ret;
-
- ret = hpre_alg_res_post_hf(ctx, resp, (void **)&req);
- areq = req->areq.curve25519;
- areq->dst_len = ctx->key_sz;
-
- overtime_thrhld = atomic64_read(&dfx[HPRE_OVERTIME_THRHLD].value);
- if (overtime_thrhld && hpre_is_bd_timeout(req, overtime_thrhld))
- atomic64_inc(&dfx[HPRE_OVER_THRHLD_CNT].value);
-
- /* Do unmap before data processing */
- hpre_curve25519_hw_data_clr_all(ctx, req, areq->dst, areq->src);
-
- hpre_key_to_big_end(sg_virt(areq->dst), CURVE25519_KEY_SIZE);
-
- kpp_request_complete(areq, ret);
-
- atomic64_inc(&dfx[HPRE_RECV_CNT].value);
-}
-
-static int hpre_curve25519_msg_request_set(struct hpre_ctx *ctx,
- struct kpp_request *req)
-{
- struct hpre_asym_request *h_req;
- struct hpre_sqe *msg;
- int req_id;
- void *tmp;
-
- if (unlikely(req->dst_len < ctx->key_sz)) {
- req->dst_len = ctx->key_sz;
- return -EINVAL;
- }
-
- tmp = kpp_request_ctx(req);
- h_req = PTR_ALIGN(tmp, hpre_align_sz());
- h_req->cb = hpre_curve25519_cb;
- h_req->areq.curve25519 = req;
- msg = &h_req->req;
- memset(msg, 0, sizeof(*msg));
- msg->in = cpu_to_le64(DMA_MAPPING_ERROR);
- msg->out = cpu_to_le64(DMA_MAPPING_ERROR);
- msg->key = cpu_to_le64(ctx->curve25519.dma_p);
-
- msg->dw0 |= cpu_to_le32(0x1U << HPRE_SQE_DONE_SHIFT);
- msg->task_len1 = (ctx->key_sz >> HPRE_BITS_2_BYTES_SHIFT) - 1;
- h_req->ctx = ctx;
-
- req_id = hpre_add_req_to_ctx(h_req);
- if (req_id < 0)
- return -EBUSY;
-
- msg->tag = cpu_to_le16((u16)req_id);
- return 0;
-}
-
-static void hpre_curve25519_src_modulo_p(u8 *ptr)
-{
- int i;
-
- for (i = 0; i < CURVE25519_KEY_SIZE - 1; i++)
- ptr[i] = 0;
-
- /* The modulus is ptr's last byte minus '0xed'(last byte of p) */
- ptr[i] -= 0xed;
-}
-
-static int hpre_curve25519_src_init(struct hpre_asym_request *hpre_req,
- struct scatterlist *data, unsigned int len)
-{
- struct hpre_sqe *msg = &hpre_req->req;
- struct hpre_ctx *ctx = hpre_req->ctx;
- struct device *dev = ctx->dev;
- u8 p[CURVE25519_KEY_SIZE] = { 0 };
- const struct ecc_curve *curve;
- dma_addr_t dma = 0;
- u8 *ptr;
-
- if (len != CURVE25519_KEY_SIZE) {
- dev_err(dev, "sourc_data len is not 32bytes, len = %u!\n", len);
- return -EINVAL;
- }
-
- ptr = dma_alloc_coherent(dev, ctx->key_sz, &dma, GFP_KERNEL);
- if (unlikely(!ptr))
- return -ENOMEM;
-
- scatterwalk_map_and_copy(ptr, data, 0, len, 0);
-
- if (!crypto_memneq(ptr, curve25519_null_point, CURVE25519_KEY_SIZE)) {
- dev_err(dev, "gx is null!\n");
- goto err;
- }
-
- /*
- * Src_data(gx) is in little-endian order, MSB in the final byte should
- * be masked as described in RFC7748, then transform it to big-endian
- * form, then hisi_hpre can use the data.
- */
- ptr[31] &= 0x7f;
- hpre_key_to_big_end(ptr, CURVE25519_KEY_SIZE);
-
- curve = ecc_get_curve25519();
-
- fill_curve_param(p, curve->p, CURVE25519_KEY_SIZE, curve->g.ndigits);
-
- /*
- * When src_data equals (2^255 - 19) ~ (2^255 - 1), it is out of p,
- * we get its modulus to p, and then use it.
- */
- if (memcmp(ptr, p, ctx->key_sz) == 0) {
- dev_err(dev, "gx is p!\n");
- goto err;
- } else if (memcmp(ptr, p, ctx->key_sz) > 0) {
- hpre_curve25519_src_modulo_p(ptr);
- }
-
- hpre_req->src = ptr;
- msg->in = cpu_to_le64(dma);
- return 0;
-
-err:
- dma_free_coherent(dev, ctx->key_sz, ptr, dma);
- return -EINVAL;
-}
-
-static int hpre_curve25519_dst_init(struct hpre_asym_request *hpre_req,
- struct scatterlist *data, unsigned int len)
-{
- struct hpre_sqe *msg = &hpre_req->req;
- struct hpre_ctx *ctx = hpre_req->ctx;
- struct device *dev = ctx->dev;
- dma_addr_t dma;
-
- if (!data || !sg_is_last(data) || len != ctx->key_sz) {
- dev_err(dev, "data or data length is illegal!\n");
- return -EINVAL;
- }
-
- hpre_req->dst = NULL;
- dma = dma_map_single(dev, sg_virt(data), len, DMA_FROM_DEVICE);
- if (unlikely(dma_mapping_error(dev, dma))) {
- dev_err(dev, "dma map data err!\n");
- return -ENOMEM;
- }
-
- msg->out = cpu_to_le64(dma);
- return 0;
-}
-
-static int hpre_curve25519_compute_value(struct kpp_request *req)
-{
- struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
- struct hpre_ctx *ctx = kpp_tfm_ctx(tfm);
- struct device *dev = ctx->dev;
- void *tmp = kpp_request_ctx(req);
- struct hpre_asym_request *hpre_req = PTR_ALIGN(tmp, hpre_align_sz());
- struct hpre_sqe *msg = &hpre_req->req;
- int ret;
-
- ret = hpre_curve25519_msg_request_set(ctx, req);
- if (unlikely(ret)) {
- dev_err(dev, "failed to set curve25519 request, ret = %d!\n", ret);
- return ret;
- }
-
- if (req->src) {
- ret = hpre_curve25519_src_init(hpre_req, req->src, req->src_len);
- if (unlikely(ret)) {
- dev_err(dev, "failed to init src data, ret = %d!\n",
- ret);
- goto clear_all;
- }
- } else {
- msg->in = cpu_to_le64(ctx->curve25519.dma_g);
- }
-
- ret = hpre_curve25519_dst_init(hpre_req, req->dst, req->dst_len);
- if (unlikely(ret)) {
- dev_err(dev, "failed to init dst data, ret = %d!\n", ret);
- goto clear_all;
- }
-
- msg->dw0 = cpu_to_le32(le32_to_cpu(msg->dw0) | HPRE_ALG_CURVE25519_MUL);
- ret = hpre_send(ctx, msg);
- if (likely(!ret))
- return -EINPROGRESS;
-
-clear_all:
- hpre_rm_req_from_ctx(hpre_req);
- hpre_curve25519_hw_data_clr_all(ctx, hpre_req, req->dst, req->src);
- return ret;
-}
-
-static unsigned int hpre_curve25519_max_size(struct crypto_kpp *tfm)
-{
- struct hpre_ctx *ctx = kpp_tfm_ctx(tfm);
-
- return ctx->key_sz;
-}
-
-static int hpre_curve25519_init_tfm(struct crypto_kpp *tfm)
-{
- struct hpre_ctx *ctx = kpp_tfm_ctx(tfm);
-
- kpp_set_reqsize(tfm, sizeof(struct hpre_asym_request) + hpre_align_pd());
-
- return hpre_ctx_init(ctx, HPRE_V3_ECC_ALG_TYPE);
-}
-
-static void hpre_curve25519_exit_tfm(struct crypto_kpp *tfm)
-{
- struct hpre_ctx *ctx = kpp_tfm_ctx(tfm);
-
- hpre_ecc_clear_ctx(ctx, true, false);
+ hpre_ecc_clear_ctx(ctx, true);
}
static struct akcipher_alg rsa = {
@@ -2095,22 +1745,6 @@ static struct kpp_alg ecdh_curves[] = {
}
};
-static struct kpp_alg curve25519_alg = {
- .set_secret = hpre_curve25519_set_secret,
- .generate_public_key = hpre_curve25519_compute_value,
- .compute_shared_secret = hpre_curve25519_compute_value,
- .max_size = hpre_curve25519_max_size,
- .init = hpre_curve25519_init_tfm,
- .exit = hpre_curve25519_exit_tfm,
- .base = {
- .cra_ctxsize = sizeof(struct hpre_ctx),
- .cra_priority = HPRE_CRYPTO_ALG_PRI,
- .cra_name = "curve25519",
- .cra_driver_name = "hpre-curve25519",
- .cra_module = THIS_MODULE,
- },
-};
-
static int hpre_register_rsa(struct hisi_qm *qm)
{
int ret;
@@ -2192,28 +1826,6 @@ static void hpre_unregister_ecdh(struct hisi_qm *qm)
crypto_unregister_kpp(&ecdh_curves[i]);
}
-static int hpre_register_x25519(struct hisi_qm *qm)
-{
- int ret;
-
- if (!hpre_check_alg_support(qm, HPRE_DRV_X25519_MASK_CAP))
- return 0;
-
- ret = crypto_register_kpp(&curve25519_alg);
- if (ret)
- dev_err(&qm->pdev->dev, "failed to register x25519 (%d)!\n", ret);
-
- return ret;
-}
-
-static void hpre_unregister_x25519(struct hisi_qm *qm)
-{
- if (!hpre_check_alg_support(qm, HPRE_DRV_X25519_MASK_CAP))
- return;
-
- crypto_unregister_kpp(&curve25519_alg);
-}
-
int hpre_algs_register(struct hisi_qm *qm)
{
int ret = 0;
@@ -2236,17 +1848,11 @@ int hpre_algs_register(struct hisi_qm *qm)
if (ret)
goto unreg_dh;
- ret = hpre_register_x25519(qm);
- if (ret)
- goto unreg_ecdh;
-
hpre_available_devs++;
mutex_unlock(&hpre_algs_lock);
return ret;
-unreg_ecdh:
- hpre_unregister_ecdh(qm);
unreg_dh:
hpre_unregister_dh(qm);
unreg_rsa:
@@ -2262,7 +1868,6 @@ void hpre_algs_unregister(struct hisi_qm *qm)
if (--hpre_available_devs)
goto unlock;
- hpre_unregister_x25519(qm);
hpre_unregister_ecdh(qm);
hpre_unregister_dh(qm);
hpre_unregister_rsa(qm);
diff --git a/drivers/crypto/img-hash.c b/drivers/crypto/img-hash.c
index 76b7ecb5624b..f22c12e36b56 100644
--- a/drivers/crypto/img-hash.c
+++ b/drivers/crypto/img-hash.c
@@ -700,7 +700,7 @@ static int img_hash_cra_init(struct crypto_tfm *tfm, const char *alg_name)
static int img_hash_cra_md5_init(struct crypto_tfm *tfm)
{
- return img_hash_cra_init(tfm, "md5-generic");
+ return img_hash_cra_init(tfm, "md5-lib");
}
static int img_hash_cra_sha1_init(struct crypto_tfm *tfm)
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 54c480e874cb..d7714d8afb0f 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -388,7 +388,7 @@ static const struct super_operations dax_sops = {
.alloc_inode = dax_alloc_inode,
.destroy_inode = dax_destroy_inode,
.free_inode = dax_free_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
};
static int dax_init_fs_context(struct fs_context *fc)
diff --git a/drivers/dma/dw/rzn1-dmamux.c b/drivers/dma/dw/rzn1-dmamux.c
index 4fb8508419db..deadf135681b 100644
--- a/drivers/dma/dw/rzn1-dmamux.c
+++ b/drivers/dma/dw/rzn1-dmamux.c
@@ -48,12 +48,16 @@ static void *rzn1_dmamux_route_allocate(struct of_phandle_args *dma_spec,
u32 mask;
int ret;
- if (dma_spec->args_count != RNZ1_DMAMUX_NCELLS)
- return ERR_PTR(-EINVAL);
+ if (dma_spec->args_count != RNZ1_DMAMUX_NCELLS) {
+ ret = -EINVAL;
+ goto put_device;
+ }
map = kzalloc(sizeof(*map), GFP_KERNEL);
- if (!map)
- return ERR_PTR(-ENOMEM);
+ if (!map) {
+ ret = -ENOMEM;
+ goto put_device;
+ }
chan = dma_spec->args[0];
map->req_idx = dma_spec->args[4];
@@ -94,12 +98,15 @@ static void *rzn1_dmamux_route_allocate(struct of_phandle_args *dma_spec,
if (ret)
goto clear_bitmap;
+ put_device(&pdev->dev);
return map;
clear_bitmap:
clear_bit(map->req_idx, dmamux->used_chans);
free_map:
kfree(map);
+put_device:
+ put_device(&pdev->dev);
return ERR_PTR(ret);
}
diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index 35bdefd3728b..8c4725ad1f64 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -189,27 +189,30 @@ static int idxd_setup_wqs(struct idxd_device *idxd)
idxd->wq_enable_map = bitmap_zalloc_node(idxd->max_wqs, GFP_KERNEL, dev_to_node(dev));
if (!idxd->wq_enable_map) {
rc = -ENOMEM;
- goto err_bitmap;
+ goto err_free_wqs;
}
for (i = 0; i < idxd->max_wqs; i++) {
wq = kzalloc_node(sizeof(*wq), GFP_KERNEL, dev_to_node(dev));
if (!wq) {
rc = -ENOMEM;
- goto err;
+ goto err_unwind;
}
idxd_dev_set_type(&wq->idxd_dev, IDXD_DEV_WQ);
conf_dev = wq_confdev(wq);
wq->id = i;
wq->idxd = idxd;
- device_initialize(wq_confdev(wq));
+ device_initialize(conf_dev);
conf_dev->parent = idxd_confdev(idxd);
conf_dev->bus = &dsa_bus_type;
conf_dev->type = &idxd_wq_device_type;
rc = dev_set_name(conf_dev, "wq%d.%d", idxd->id, wq->id);
- if (rc < 0)
- goto err;
+ if (rc < 0) {
+ put_device(conf_dev);
+ kfree(wq);
+ goto err_unwind;
+ }
mutex_init(&wq->wq_lock);
init_waitqueue_head(&wq->err_queue);
@@ -220,15 +223,20 @@ static int idxd_setup_wqs(struct idxd_device *idxd)
wq->enqcmds_retries = IDXD_ENQCMDS_RETRIES;
wq->wqcfg = kzalloc_node(idxd->wqcfg_size, GFP_KERNEL, dev_to_node(dev));
if (!wq->wqcfg) {
+ put_device(conf_dev);
+ kfree(wq);
rc = -ENOMEM;
- goto err;
+ goto err_unwind;
}
if (idxd->hw.wq_cap.op_config) {
wq->opcap_bmap = bitmap_zalloc(IDXD_MAX_OPCAP_BITS, GFP_KERNEL);
if (!wq->opcap_bmap) {
+ kfree(wq->wqcfg);
+ put_device(conf_dev);
+ kfree(wq);
rc = -ENOMEM;
- goto err_opcap_bmap;
+ goto err_unwind;
}
bitmap_copy(wq->opcap_bmap, idxd->opcap_bmap, IDXD_MAX_OPCAP_BITS);
}
@@ -239,13 +247,7 @@ static int idxd_setup_wqs(struct idxd_device *idxd)
return 0;
-err_opcap_bmap:
- kfree(wq->wqcfg);
-
-err:
- put_device(conf_dev);
- kfree(wq);
-
+err_unwind:
while (--i >= 0) {
wq = idxd->wqs[i];
if (idxd->hw.wq_cap.op_config)
@@ -254,11 +256,10 @@ err:
conf_dev = wq_confdev(wq);
put_device(conf_dev);
kfree(wq);
-
}
bitmap_free(idxd->wq_enable_map);
-err_bitmap:
+err_free_wqs:
kfree(idxd->wqs);
return rc;
@@ -1291,10 +1292,12 @@ static void idxd_remove(struct pci_dev *pdev)
device_unregister(idxd_confdev(idxd));
idxd_shutdown(pdev);
idxd_device_remove_debugfs(idxd);
- idxd_cleanup(idxd);
+ perfmon_pmu_remove(idxd);
+ idxd_cleanup_interrupts(idxd);
+ if (device_pasid_enabled(idxd))
+ idxd_disable_system_pasid(idxd);
pci_iounmap(pdev, idxd->reg_base);
put_device(idxd_confdev(idxd));
- idxd_free(idxd);
pci_disable_device(pdev);
}
diff --git a/drivers/dma/qcom/bam_dma.c b/drivers/dma/qcom/bam_dma.c
index bbc3276992bb..2cf060174795 100644
--- a/drivers/dma/qcom/bam_dma.c
+++ b/drivers/dma/qcom/bam_dma.c
@@ -1283,13 +1283,17 @@ static int bam_dma_probe(struct platform_device *pdev)
if (!bdev->bamclk) {
ret = of_property_read_u32(pdev->dev.of_node, "num-channels",
&bdev->num_channels);
- if (ret)
+ if (ret) {
dev_err(bdev->dev, "num-channels unspecified in dt\n");
+ return ret;
+ }
ret = of_property_read_u32(pdev->dev.of_node, "qcom,num-ees",
&bdev->num_ees);
- if (ret)
+ if (ret) {
dev_err(bdev->dev, "num-ees unspecified in dt\n");
+ return ret;
+ }
}
ret = clk_prepare_enable(bdev->bamclk);
diff --git a/drivers/dma/ti/edma.c b/drivers/dma/ti/edma.c
index 3ed406f08c44..552be71db6c4 100644
--- a/drivers/dma/ti/edma.c
+++ b/drivers/dma/ti/edma.c
@@ -2064,8 +2064,8 @@ static int edma_setup_from_hw(struct device *dev, struct edma_soc_info *pdata,
* priority. So Q0 is the highest priority queue and the last queue has
* the lowest priority.
*/
- queue_priority_map = devm_kcalloc(dev, ecc->num_tc + 1, sizeof(s8),
- GFP_KERNEL);
+ queue_priority_map = devm_kcalloc(dev, ecc->num_tc + 1,
+ sizeof(*queue_priority_map), GFP_KERNEL);
if (!queue_priority_map)
return -ENOMEM;
diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c
index 036f21cac0a9..0a852011653c 100644
--- a/drivers/dpll/dpll_netlink.c
+++ b/drivers/dpll/dpll_netlink.c
@@ -211,8 +211,8 @@ static int
dpll_msg_add_clock_quality_level(struct sk_buff *msg, struct dpll_device *dpll,
struct netlink_ext_ack *extack)
{
+ DECLARE_BITMAP(qls, DPLL_CLOCK_QUALITY_LEVEL_MAX + 1) = { 0 };
const struct dpll_device_ops *ops = dpll_device_ops(dpll);
- DECLARE_BITMAP(qls, DPLL_CLOCK_QUALITY_LEVEL_MAX) = { 0 };
enum dpll_clock_quality_level ql;
int ret;
@@ -221,7 +221,7 @@ dpll_msg_add_clock_quality_level(struct sk_buff *msg, struct dpll_device *dpll,
ret = ops->clock_quality_level_get(dpll, dpll_priv(dpll), qls, extack);
if (ret)
return ret;
- for_each_set_bit(ql, qls, DPLL_CLOCK_QUALITY_LEVEL_MAX)
+ for_each_set_bit(ql, qls, DPLL_CLOCK_QUALITY_LEVEL_MAX + 1)
if (nla_put_u32(msg, DPLL_A_CLOCK_QUALITY_LEVEL, ql))
return -EMSGSIZE;
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 19ad3c3b675d..39352b9b7a7e 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -576,4 +576,20 @@ config EDAC_LOONGSON
errors (CE) only. Loongson-3A5000/3C5000/3D5000/3A6000/3C6000
are compatible.
+config EDAC_CORTEX_A72
+ tristate "ARM Cortex A72"
+ depends on ARM64
+ help
+ Support for L1/L2 cache error detection on the ARM Cortex A72 processor.
+ Errors are detected and reported by reading the CPU and L2 memory error
+ syndrome registers.
+
+config EDAC_VERSALNET
+ tristate "AMD VersalNET DDR Controller"
+ depends on CDX_CONTROLLER && ARCH_ZYNQMP
+ help
+ Support for single-bit error correction and double-bit error detection
+ on the AMD Versal NET DDR memory controller, as well as reporting of
+ system errors from IP subsystems such as the RPU, NoCs, HNICX and PL.
+
endif # EDAC
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index a8f2d8f6c894..1c14796410a3 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -88,3 +88,5 @@ obj-$(CONFIG_EDAC_NPCM) += npcm_edac.o
obj-$(CONFIG_EDAC_ZYNQMP) += zynqmp_edac.o
obj-$(CONFIG_EDAC_VERSAL) += versal_edac.o
obj-$(CONFIG_EDAC_LOONGSON) += loongson_edac.o
+obj-$(CONFIG_EDAC_VERSALNET) += versalnet_edac.o
+obj-$(CONFIG_EDAC_CORTEX_A72) += a72_edac.o
diff --git a/drivers/edac/a72_edac.c b/drivers/edac/a72_edac.c
new file mode 100644
index 000000000000..9262d75c3855
--- /dev/null
+++ b/drivers/edac/a72_edac.c
@@ -0,0 +1,225 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Cortex A72 EDAC L1 and L2 cache error detection
+ *
+ * Copyright (c) 2020 Pengutronix, Sascha Hauer <s.hauer@pengutronix.de>
+ * Copyright (c) 2025 Microsoft Corporation, <vijayb@linux.microsoft.com>
+ *
+ * Based on code from:
+ * Copyright (c) 2018, NXP Semiconductor
+ * Author: York Sun <york.sun@nxp.com>
+ */
+
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/bitfield.h>
+#include <asm/smp_plat.h>
+
+#include "edac_module.h"
+
+#define DRVNAME "a72-edac"
+
+#define SYS_CPUMERRSR_EL1 sys_reg(3, 1, 15, 2, 2)
+#define SYS_L2MERRSR_EL1 sys_reg(3, 1, 15, 2, 3)
+
+#define CPUMERRSR_EL1_RAMID GENMASK(30, 24)
+#define L2MERRSR_EL1_CPUID_WAY GENMASK(21, 18)
+
+#define CPUMERRSR_EL1_VALID BIT(31)
+#define CPUMERRSR_EL1_FATAL BIT(63)
+#define L2MERRSR_EL1_VALID BIT(31)
+#define L2MERRSR_EL1_FATAL BIT(63)
+
+#define L1_I_TAG_RAM 0x00
+#define L1_I_DATA_RAM 0x01
+#define L1_D_TAG_RAM 0x08
+#define L1_D_DATA_RAM 0x09
+#define TLB_RAM 0x18
+
+#define MESSAGE_SIZE 64
+
+struct mem_err_synd_reg {
+ u64 cpu_mesr;
+ u64 l2_mesr;
+};
+
+static struct cpumask compat_mask;
+
+static void report_errors(struct edac_device_ctl_info *edac_ctl, int cpu,
+ struct mem_err_synd_reg *mesr)
+{
+ u64 cpu_mesr = mesr->cpu_mesr;
+ u64 l2_mesr = mesr->l2_mesr;
+ char msg[MESSAGE_SIZE];
+
+ if (cpu_mesr & CPUMERRSR_EL1_VALID) {
+ const char *str;
+ bool fatal = cpu_mesr & CPUMERRSR_EL1_FATAL;
+
+ switch (FIELD_GET(CPUMERRSR_EL1_RAMID, cpu_mesr)) {
+ case L1_I_TAG_RAM:
+ str = "L1-I Tag RAM";
+ break;
+ case L1_I_DATA_RAM:
+ str = "L1-I Data RAM";
+ break;
+ case L1_D_TAG_RAM:
+ str = "L1-D Tag RAM";
+ break;
+ case L1_D_DATA_RAM:
+ str = "L1-D Data RAM";
+ break;
+ case TLB_RAM:
+ str = "TLB RAM";
+ break;
+ default:
+ str = "Unspecified";
+ break;
+ }
+
+ snprintf(msg, MESSAGE_SIZE, "%s %s error(s) on CPU %d",
+ str, fatal ? "fatal" : "correctable", cpu);
+
+ if (fatal)
+ edac_device_handle_ue(edac_ctl, cpu, 0, msg);
+ else
+ edac_device_handle_ce(edac_ctl, cpu, 0, msg);
+ }
+
+ if (l2_mesr & L2MERRSR_EL1_VALID) {
+ bool fatal = l2_mesr & L2MERRSR_EL1_FATAL;
+
+ snprintf(msg, MESSAGE_SIZE, "L2 %s error(s) on CPU %d CPUID/WAY 0x%lx",
+ fatal ? "fatal" : "correctable", cpu,
+ FIELD_GET(L2MERRSR_EL1_CPUID_WAY, l2_mesr));
+ if (fatal)
+ edac_device_handle_ue(edac_ctl, cpu, 1, msg);
+ else
+ edac_device_handle_ce(edac_ctl, cpu, 1, msg);
+ }
+}
+
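+/*
+ * Runs on the target CPU via smp_call_function_single(): read the CPU and
+ * L2 memory error syndrome registers and clear any valid record so that
+ * the next poll only reports new errors.
+ */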
+static void read_errors(void *data)
+{
+ struct mem_err_synd_reg *mesr = data;
+
+ mesr->cpu_mesr = read_sysreg_s(SYS_CPUMERRSR_EL1);
+ if (mesr->cpu_mesr & CPUMERRSR_EL1_VALID) {
+ write_sysreg_s(0, SYS_CPUMERRSR_EL1);
+ isb();
+ }
+ mesr->l2_mesr = read_sysreg_s(SYS_L2MERRSR_EL1);
+ if (mesr->l2_mesr & L2MERRSR_EL1_VALID) {
+ write_sysreg_s(0, SYS_L2MERRSR_EL1);
+ isb();
+ }
+}
+
+static void a72_edac_check(struct edac_device_ctl_info *edac_ctl)
+{
+ struct mem_err_synd_reg mesr;
+ int cpu;
+
+ cpus_read_lock();
+ for_each_cpu_and(cpu, cpu_online_mask, &compat_mask) {
+ smp_call_function_single(cpu, read_errors, &mesr, true);
+ report_errors(edac_ctl, cpu, &mesr);
+ }
+ cpus_read_unlock();
+}
+
+static int a72_edac_probe(struct platform_device *pdev)
+{
+ struct edac_device_ctl_info *edac_ctl;
+ struct device *dev = &pdev->dev;
+ int rc;
+
+ edac_ctl = edac_device_alloc_ctl_info(0, "cpu",
+ num_possible_cpus(), "L", 2, 1,
+ edac_device_alloc_index());
+ if (!edac_ctl)
+ return -ENOMEM;
+
+ edac_ctl->edac_check = a72_edac_check;
+ edac_ctl->dev = dev;
+ edac_ctl->mod_name = dev_name(dev);
+ edac_ctl->dev_name = dev_name(dev);
+ edac_ctl->ctl_name = DRVNAME;
+ dev_set_drvdata(dev, edac_ctl);
+
+ rc = edac_device_add_device(edac_ctl);
+ if (rc)
+ goto out_dev;
+
+ return 0;
+
+out_dev:
+ edac_device_free_ctl_info(edac_ctl);
+
+ return rc;
+}
+
+static void a72_edac_remove(struct platform_device *pdev)
+{
+ struct edac_device_ctl_info *edac_ctl = dev_get_drvdata(&pdev->dev);
+
+ edac_device_del_device(edac_ctl->dev);
+ edac_device_free_ctl_info(edac_ctl);
+}
+
+static const struct of_device_id cortex_arm64_edac_of_match[] = {
+ { .compatible = "arm,cortex-a72" },
+ {}
+};
+MODULE_DEVICE_TABLE(of, cortex_arm64_edac_of_match);
+
+static struct platform_driver a72_edac_driver = {
+ .probe = a72_edac_probe,
+ .remove = a72_edac_remove,
+ .driver = {
+ .name = DRVNAME,
+ },
+};
+
+static struct platform_device *a72_pdev;
+
+static int __init a72_edac_driver_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct device_node *np __free(device_node) = of_cpu_device_node_get(cpu);
+ if (np) {
+ if (of_match_node(cortex_arm64_edac_of_match, np) &&
+ of_property_read_bool(np, "edac-enabled")) {
+ cpumask_set_cpu(cpu, &compat_mask);
+ }
+ } else {
+ pr_warn("failed to find device node for CPU %d\n", cpu);
+ }
+ }
+
+ if (cpumask_empty(&compat_mask))
+ return 0;
+
+ a72_pdev = platform_device_register_simple(DRVNAME, -1, NULL, 0);
+ if (IS_ERR(a72_pdev)) {
+ pr_err("failed to register A72 EDAC device\n");
+ return PTR_ERR(a72_pdev);
+ }
+
+ return platform_driver_register(&a72_edac_driver);
+}
+
+static void __exit a72_edac_driver_exit(void)
+{
+ platform_device_unregister(a72_pdev);
+ platform_driver_unregister(&a72_edac_driver);
+}
+
+module_init(a72_edac_driver_init);
+module_exit(a72_edac_driver_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Sascha Hauer <s.hauer@pengutronix.de>");
+MODULE_DESCRIPTION("Cortex A72 L1 and L2 cache EDAC driver");
diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c
index cae52c654a15..103b2c2eba2a 100644
--- a/drivers/edac/altera_edac.c
+++ b/drivers/edac/altera_edac.c
@@ -128,7 +128,6 @@ static ssize_t altr_sdr_mc_err_inject_write(struct file *file,
ptemp = dma_alloc_coherent(mci->pdev, 16, &dma_handle, GFP_KERNEL);
if (!ptemp) {
- dma_free_coherent(mci->pdev, 16, ptemp, dma_handle);
edac_printk(KERN_ERR, EDAC_MC,
"Inject: Buffer Allocation error\n");
return -ENOMEM;
@@ -2131,8 +2130,8 @@ static int altr_edac_a10_probe(struct platform_device *pdev)
edac->irq_chip.name = pdev->dev.of_node->name;
edac->irq_chip.irq_mask = a10_eccmgr_irq_mask;
edac->irq_chip.irq_unmask = a10_eccmgr_irq_unmask;
- edac->domain = irq_domain_create_linear(of_fwnode_handle(pdev->dev.of_node),
- 64, &a10_eccmgr_ic_ops, edac);
+ edac->domain = irq_domain_create_linear(dev_fwnode(&pdev->dev), 64, &a10_eccmgr_ic_ops,
+ edac);
if (!edac->domain) {
dev_err(&pdev->dev, "Error adding IRQ domain\n");
return -ENOMEM;
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 07f1e9dc1ca7..2f6ab783bf20 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -3923,6 +3923,26 @@ static int per_family_init(struct amd64_pvt *pvt)
pvt->ctl_name = "F1Ah_M40h";
pvt->flags.zn_regs_v2 = 1;
break;
+ case 0x50 ... 0x57:
+ pvt->ctl_name = "F1Ah_M50h";
+ pvt->max_mcs = 16;
+ pvt->flags.zn_regs_v2 = 1;
+ break;
+ case 0x90 ... 0x9f:
+ pvt->ctl_name = "F1Ah_M90h";
+ pvt->max_mcs = 8;
+ pvt->flags.zn_regs_v2 = 1;
+ break;
+ case 0xa0 ... 0xaf:
+ pvt->ctl_name = "F1Ah_MA0h";
+ pvt->max_mcs = 8;
+ pvt->flags.zn_regs_v2 = 1;
+ break;
+ case 0xc0 ... 0xc7:
+ pvt->ctl_name = "F1Ah_MC0h";
+ pvt->max_mcs = 16;
+ pvt->flags.zn_regs_v2 = 1;
+ break;
}
break;
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index 17228d07de4c..d70b8a8d0b09 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -96,7 +96,7 @@
/* Hardware limit on ChipSelect rows per MC and processors per system */
#define NUM_CHIPSELECTS 8
#define DRAM_RANGES 8
-#define NUM_CONTROLLERS 12
+#define NUM_CONTROLLERS 16
#define ON true
#define OFF false
diff --git a/drivers/edac/ecs.c b/drivers/edac/ecs.c
index 51c451c7f0f0..51c451c7f0f0 100755..100644
--- a/drivers/edac/ecs.c
+++ b/drivers/edac/ecs.c
diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
index 0f338adf7d93..8689631f1905 100644
--- a/drivers/edac/edac_mc_sysfs.c
+++ b/drivers/edac/edac_mc_sysfs.c
@@ -305,6 +305,14 @@ DEVICE_CHANNEL(ch10_dimm_label, S_IRUGO | S_IWUSR,
channel_dimm_label_show, channel_dimm_label_store, 10);
DEVICE_CHANNEL(ch11_dimm_label, S_IRUGO | S_IWUSR,
channel_dimm_label_show, channel_dimm_label_store, 11);
+DEVICE_CHANNEL(ch12_dimm_label, S_IRUGO | S_IWUSR,
+ channel_dimm_label_show, channel_dimm_label_store, 12);
+DEVICE_CHANNEL(ch13_dimm_label, S_IRUGO | S_IWUSR,
+ channel_dimm_label_show, channel_dimm_label_store, 13);
+DEVICE_CHANNEL(ch14_dimm_label, S_IRUGO | S_IWUSR,
+ channel_dimm_label_show, channel_dimm_label_store, 14);
+DEVICE_CHANNEL(ch15_dimm_label, S_IRUGO | S_IWUSR,
+ channel_dimm_label_show, channel_dimm_label_store, 15);
/* Total possible dynamic DIMM Label attribute file table */
static struct attribute *dynamic_csrow_dimm_attr[] = {
@@ -320,6 +328,10 @@ static struct attribute *dynamic_csrow_dimm_attr[] = {
&dev_attr_legacy_ch9_dimm_label.attr.attr,
&dev_attr_legacy_ch10_dimm_label.attr.attr,
&dev_attr_legacy_ch11_dimm_label.attr.attr,
+ &dev_attr_legacy_ch12_dimm_label.attr.attr,
+ &dev_attr_legacy_ch13_dimm_label.attr.attr,
+ &dev_attr_legacy_ch14_dimm_label.attr.attr,
+ &dev_attr_legacy_ch15_dimm_label.attr.attr,
NULL
};
@@ -348,6 +360,14 @@ DEVICE_CHANNEL(ch10_ce_count, S_IRUGO,
channel_ce_count_show, NULL, 10);
DEVICE_CHANNEL(ch11_ce_count, S_IRUGO,
channel_ce_count_show, NULL, 11);
+DEVICE_CHANNEL(ch12_ce_count, S_IRUGO,
+ channel_ce_count_show, NULL, 12);
+DEVICE_CHANNEL(ch13_ce_count, S_IRUGO,
+ channel_ce_count_show, NULL, 13);
+DEVICE_CHANNEL(ch14_ce_count, S_IRUGO,
+ channel_ce_count_show, NULL, 14);
+DEVICE_CHANNEL(ch15_ce_count, S_IRUGO,
+ channel_ce_count_show, NULL, 15);
/* Total possible dynamic ce_count attribute file table */
static struct attribute *dynamic_csrow_ce_count_attr[] = {
@@ -363,6 +383,10 @@ static struct attribute *dynamic_csrow_ce_count_attr[] = {
&dev_attr_legacy_ch9_ce_count.attr.attr,
&dev_attr_legacy_ch10_ce_count.attr.attr,
&dev_attr_legacy_ch11_ce_count.attr.attr,
+ &dev_attr_legacy_ch12_ce_count.attr.attr,
+ &dev_attr_legacy_ch13_ce_count.attr.attr,
+ &dev_attr_legacy_ch14_ce_count.attr.attr,
+ &dev_attr_legacy_ch15_ce_count.attr.attr,
NULL
};
diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
index bf4171ac191d..2010a47149f4 100644
--- a/drivers/edac/i10nm_base.c
+++ b/drivers/edac/i10nm_base.c
@@ -468,17 +468,18 @@ static int i10nm_get_imc_num(struct res_config *cfg)
return -ENODEV;
}
- if (imc_num > I10NM_NUM_DDR_IMC) {
- i10nm_printk(KERN_ERR, "Need to make I10NM_NUM_DDR_IMC >= %d\n", imc_num);
- return -EINVAL;
- }
-
if (cfg->ddr_imc_num != imc_num) {
/*
- * Store the number of present DDR memory controllers.
+ * Update the configuration data to reflect the number of
+ * present DDR memory controllers.
*/
cfg->ddr_imc_num = imc_num;
edac_dbg(2, "Set DDR MC number: %d", imc_num);
+
+ /* Release and reallocate skx_dev list with the updated number. */
+ skx_remove();
+ if (skx_get_all_bus_mappings(cfg, &i10nm_edac_list) <= 0)
+ return -ENODEV;
}
return 0;
@@ -1057,6 +1058,15 @@ static bool i10nm_check_ecc(struct skx_imc *imc, int chan)
return !!GET_BITFIELD(mcmtr, 2, 2);
}
+static bool i10nm_channel_disabled(struct skx_imc *imc, int chan)
+{
+ u32 mcmtr = I10NM_GET_MCMTR(imc, chan);
+
+ edac_dbg(1, "mc%d ch%d mcmtr reg %x\n", imc->mc, chan, mcmtr);
+
+ return (mcmtr == ~0 || GET_BITFIELD(mcmtr, 18, 18));
+}
+
static int i10nm_get_dimm_config(struct mem_ctl_info *mci,
struct res_config *cfg)
{
@@ -1070,6 +1080,11 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci,
if (!imc->mbase)
continue;
+ if (i10nm_channel_disabled(imc, i)) {
+ edac_dbg(1, "mc%d ch%d is disabled.\n", imc->mc, i);
+ continue;
+ }
+
ndimms = 0;
if (res_cfg->type != GNR)
diff --git a/drivers/edac/ie31200_edac.c b/drivers/edac/ie31200_edac.c
index 5c1fa1c0d12e..5a080ab65476 100644
--- a/drivers/edac/ie31200_edac.c
+++ b/drivers/edac/ie31200_edac.c
@@ -99,6 +99,8 @@
/* Alder Lake-S */
#define PCI_DEVICE_ID_INTEL_IE31200_ADL_S_1 0x4660
+#define PCI_DEVICE_ID_INTEL_IE31200_ADL_S_2 0x4668 /* 8P+4E, e.g. i7-12700K */
+#define PCI_DEVICE_ID_INTEL_IE31200_ADL_S_3 0x4648 /* 6P+4E, e.g. i5-12600K */
/* Bartlett Lake-S */
#define PCI_DEVICE_ID_INTEL_IE31200_BTL_S_1 0x4639
@@ -761,6 +763,8 @@ static const struct pci_device_id ie31200_pci_tbl[] = {
{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_RPL_S_6), (kernel_ulong_t)&rpl_s_cfg},
{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_RPL_HX_1), (kernel_ulong_t)&rpl_s_cfg},
{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_ADL_S_1), (kernel_ulong_t)&rpl_s_cfg},
+ { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_ADL_S_2), (kernel_ulong_t)&rpl_s_cfg},
+ { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_ADL_S_3), (kernel_ulong_t)&rpl_s_cfg},
{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_1), (kernel_ulong_t)&rpl_s_cfg},
{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_2), (kernel_ulong_t)&rpl_s_cfg},
{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_3), (kernel_ulong_t)&rpl_s_cfg},
diff --git a/drivers/edac/mem_repair.c b/drivers/edac/mem_repair.c
index 108d69209146..108d69209146 100755..100644
--- a/drivers/edac/mem_repair.c
+++ b/drivers/edac/mem_repair.c
diff --git a/drivers/edac/scrub.c b/drivers/edac/scrub.c
index f9d02af2fc3a..f9d02af2fc3a 100755..100644
--- a/drivers/edac/scrub.c
+++ b/drivers/edac/scrub.c
diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c
index 29897b21fb8e..078ddf95cc6e 100644
--- a/drivers/edac/skx_base.c
+++ b/drivers/edac/skx_base.c
@@ -33,6 +33,15 @@ static unsigned int nvdimm_count;
#define MASK26 0x3FFFFFF /* Mask for 2^26 */
#define MASK29 0x1FFFFFFF /* Mask for 2^29 */
+static struct res_config skx_cfg = {
+ .type = SKX,
+ .decs_did = 0x2016,
+ .busno_cfg_offset = 0xcc,
+ .ddr_imc_num = 2,
+ .ddr_chan_num = 3,
+ .ddr_dimm_num = 2,
+};
+
static struct skx_dev *get_skx_dev(struct pci_bus *bus, u8 idx)
{
struct skx_dev *d;
@@ -52,7 +61,7 @@ enum munittype {
struct munit {
u16 did;
- u16 devfn[SKX_NUM_IMC];
+ u16 devfn[2];
u8 busidx;
u8 per_socket;
enum munittype mtype;
@@ -89,11 +98,11 @@ static int get_all_munits(const struct munit *m)
if (!pdev)
break;
ndev++;
- if (m->per_socket == SKX_NUM_IMC) {
- for (i = 0; i < SKX_NUM_IMC; i++)
+ if (m->per_socket == skx_cfg.ddr_imc_num) {
+ for (i = 0; i < skx_cfg.ddr_imc_num; i++)
if (m->devfn[i] == pdev->devfn)
break;
- if (i == SKX_NUM_IMC)
+ if (i == skx_cfg.ddr_imc_num)
goto fail;
}
d = get_skx_dev(pdev->bus, m->busidx);
@@ -157,12 +166,6 @@ fail:
return -ENODEV;
}
-static struct res_config skx_cfg = {
- .type = SKX,
- .decs_did = 0x2016,
- .busno_cfg_offset = 0xcc,
-};
-
static const struct x86_cpu_id skx_cpuids[] = {
X86_MATCH_VFM(INTEL_SKYLAKE_X, &skx_cfg),
{ }
@@ -186,11 +189,11 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci, struct res_config *cfg)
/* Only the mcmtr on the first channel is effective */
pci_read_config_dword(imc->chan[0].cdev, 0x87c, &mcmtr);
- for (i = 0; i < SKX_NUM_CHANNELS; i++) {
+ for (i = 0; i < cfg->ddr_chan_num; i++) {
ndimms = 0;
pci_read_config_dword(imc->chan[i].cdev, 0x8C, &amap);
pci_read_config_dword(imc->chan[i].cdev, 0x400, &mcddrtcfg);
- for (j = 0; j < SKX_NUM_DIMMS; j++) {
+ for (j = 0; j < cfg->ddr_dimm_num; j++) {
dimm = edac_get_dimm(mci, i, j, 0);
pci_read_config_dword(imc->chan[i].cdev,
0x80 + 4 * j, &mtr);
@@ -620,6 +623,7 @@ static int __init skx_init(void)
return -ENODEV;
cfg = (struct res_config *)id->driver_data;
+ skx_set_res_cfg(cfg);
rc = skx_get_hi_lo(0x2034, off, &skx_tolm, &skx_tohm);
if (rc)
@@ -652,10 +656,13 @@ static int __init skx_init(void)
goto fail;
edac_dbg(2, "src_id = %d\n", src_id);
- for (i = 0; i < SKX_NUM_IMC; i++) {
+ for (i = 0; i < cfg->ddr_imc_num; i++) {
d->imc[i].mc = mc++;
d->imc[i].lmc = i;
d->imc[i].src_id = src_id;
+ d->imc[i].num_channels = cfg->ddr_chan_num;
+ d->imc[i].num_dimms = cfg->ddr_dimm_num;
+
rc = skx_register_mci(&d->imc[i], d->imc[i].chan[0].cdev,
"Skylake Socket", EDAC_MOD_STR,
skx_get_dimm_config, cfg);
diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c
index 39c733dbc5b9..724842f512ac 100644
--- a/drivers/edac/skx_common.c
+++ b/drivers/edac/skx_common.c
@@ -14,9 +14,11 @@
* Copyright (c) 2018, Intel Corporation.
*/
+#include <linux/topology.h>
#include <linux/acpi.h>
#include <linux/dmi.h>
#include <linux/adxl.h>
+#include <linux/overflow.h>
#include <acpi/nfit.h>
#include <asm/mce.h>
#include <asm/uv/uv.h>
@@ -130,8 +132,8 @@ static void skx_init_mc_mapping(struct skx_dev *d)
* the logical indices of the memory controllers enumerated by the
* EDAC driver.
*/
- for (int i = 0; i < NUM_IMC; i++)
- d->mc_mapping[i] = i;
+ for (int i = 0; i < d->num_imc; i++)
+ d->imc[i].mc_mapping = i;
}
void skx_set_mc_mapping(struct skx_dev *d, u8 pmc, u8 lmc)
@@ -139,22 +141,28 @@ void skx_set_mc_mapping(struct skx_dev *d, u8 pmc, u8 lmc)
edac_dbg(0, "Set the mapping of mc phy idx to logical idx: %02d -> %02d\n",
pmc, lmc);
- d->mc_mapping[pmc] = lmc;
+ d->imc[lmc].mc_mapping = pmc;
}
EXPORT_SYMBOL_GPL(skx_set_mc_mapping);
-static u8 skx_get_mc_mapping(struct skx_dev *d, u8 pmc)
+static int skx_get_mc_mapping(struct skx_dev *d, u8 pmc)
{
- edac_dbg(0, "Get the mapping of mc phy idx to logical idx: %02d -> %02d\n",
- pmc, d->mc_mapping[pmc]);
+ for (int lmc = 0; lmc < d->num_imc; lmc++) {
+ if (d->imc[lmc].mc_mapping == pmc) {
+ edac_dbg(0, "Get the mapping of mc phy idx to logical idx: %02d -> %02d\n",
+ pmc, lmc);
- return d->mc_mapping[pmc];
+ return lmc;
+ }
+ }
+
+ return -1;
}
static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src)
{
+ int i, lmc, len = 0;
struct skx_dev *d;
- int i, len = 0;
if (res->addr >= skx_tohm || (res->addr >= skx_tolm &&
res->addr < BIT_ULL(32))) {
@@ -200,7 +208,7 @@ static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src)
res->cs = (int)adxl_values[component_indices[INDEX_CS]];
}
- if (res->imc > NUM_IMC - 1 || res->imc < 0) {
+ if (res->imc < 0) {
skx_printk(KERN_ERR, "Bad imc %d\n", res->imc);
return false;
}
@@ -218,7 +226,13 @@ static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src)
return false;
}
- res->imc = skx_get_mc_mapping(d, res->imc);
+ lmc = skx_get_mc_mapping(d, res->imc);
+ if (lmc < 0) {
+ skx_printk(KERN_ERR, "No lmc for imc %d\n", res->imc);
+ return false;
+ }
+
+ res->imc = lmc;
for (i = 0; i < adxl_component_count; i++) {
if (adxl_values[i] == ~0x0ull)
@@ -265,7 +279,7 @@ static int skx_get_pkg_id(struct skx_dev *d, u8 *id)
struct cpuinfo_x86 *c = &cpu_data(cpu);
if (c->initialized && cpu_to_node(cpu) == node) {
- *id = c->topo.pkg_id;
+ *id = topology_physical_package_id(cpu);
return 0;
}
}
@@ -320,10 +334,10 @@ static int get_width(u32 mtr)
*/
int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list)
{
+ int ndev = 0, imc_num = cfg->ddr_imc_num + cfg->hbm_imc_num;
struct pci_dev *pdev, *prev;
struct skx_dev *d;
u32 reg;
- int ndev = 0;
prev = NULL;
for (;;) {
@@ -331,7 +345,7 @@ int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list)
if (!pdev)
break;
ndev++;
- d = kzalloc(sizeof(*d), GFP_KERNEL);
+ d = kzalloc(struct_size(d, imc, imc_num), GFP_KERNEL);
if (!d) {
pci_dev_put(pdev);
return -ENOMEM;
@@ -354,8 +368,10 @@ int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list)
d->seg = GET_BITFIELD(reg, 16, 23);
}
- edac_dbg(2, "busses: 0x%x, 0x%x, 0x%x, 0x%x\n",
- d->bus[0], d->bus[1], d->bus[2], d->bus[3]);
+ d->num_imc = imc_num;
+
+ edac_dbg(2, "busses: 0x%x, 0x%x, 0x%x, 0x%x, imcs %d\n",
+ d->bus[0], d->bus[1], d->bus[2], d->bus[3], imc_num);
list_add_tail(&d->list, &dev_edac_list);
prev = pdev;
@@ -541,10 +557,10 @@ int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev,
/* Allocate a new MC control structure */
layers[0].type = EDAC_MC_LAYER_CHANNEL;
- layers[0].size = NUM_CHANNELS;
+ layers[0].size = imc->num_channels;
layers[0].is_virt_csrow = false;
layers[1].type = EDAC_MC_LAYER_SLOT;
- layers[1].size = NUM_DIMMS;
+ layers[1].size = imc->num_dimms;
layers[1].is_virt_csrow = true;
mci = edac_mc_alloc(imc->mc, ARRAY_SIZE(layers), layers,
sizeof(struct skx_pvt));
@@ -784,7 +800,7 @@ void skx_remove(void)
list_for_each_entry_safe(d, tmp, &dev_edac_list, list) {
list_del(&d->list);
- for (i = 0; i < NUM_IMC; i++) {
+ for (i = 0; i < d->num_imc; i++) {
if (d->imc[i].mci)
skx_unregister_mci(&d->imc[i]);
@@ -794,7 +810,7 @@ void skx_remove(void)
if (d->imc[i].mbase)
iounmap(d->imc[i].mbase);
- for (j = 0; j < NUM_CHANNELS; j++) {
+ for (j = 0; j < d->imc[i].num_channels; j++) {
if (d->imc[i].chan[j].cdev)
pci_dev_put(d->imc[i].chan[j].cdev);
}
diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h
index ec4966f7ea40..73ba89786cdf 100644
--- a/drivers/edac/skx_common.h
+++ b/drivers/edac/skx_common.h
@@ -29,23 +29,18 @@
#define GET_BITFIELD(v, lo, hi) \
(((v) & GENMASK_ULL((hi), (lo))) >> (lo))
-#define SKX_NUM_IMC 2 /* Memory controllers per socket */
#define SKX_NUM_CHANNELS 3 /* Channels per memory controller */
#define SKX_NUM_DIMMS 2 /* Max DIMMS per channel */
-#define I10NM_NUM_DDR_IMC 12
#define I10NM_NUM_DDR_CHANNELS 2
#define I10NM_NUM_DDR_DIMMS 2
-#define I10NM_NUM_HBM_IMC 16
#define I10NM_NUM_HBM_CHANNELS 2
#define I10NM_NUM_HBM_DIMMS 1
-#define I10NM_NUM_IMC (I10NM_NUM_DDR_IMC + I10NM_NUM_HBM_IMC)
#define I10NM_NUM_CHANNELS MAX(I10NM_NUM_DDR_CHANNELS, I10NM_NUM_HBM_CHANNELS)
#define I10NM_NUM_DIMMS MAX(I10NM_NUM_DDR_DIMMS, I10NM_NUM_HBM_DIMMS)
-#define NUM_IMC MAX(SKX_NUM_IMC, I10NM_NUM_IMC)
#define NUM_CHANNELS MAX(SKX_NUM_CHANNELS, I10NM_NUM_CHANNELS)
#define NUM_DIMMS MAX(SKX_NUM_DIMMS, I10NM_NUM_DIMMS)
@@ -134,16 +129,7 @@ struct skx_dev {
struct pci_dev *uracu; /* for i10nm CPU */
struct pci_dev *pcu_cr3; /* for HBM memory detection */
u32 mcroute;
- /*
- * Some server BIOS may hide certain memory controllers, and the
- * EDAC driver skips those hidden memory controllers. However, the
- * ADXL still decodes memory error address using physical memory
- * controller indices. The mapping table is used to convert the
- * physical indices (reported by ADXL) to the logical indices
- * (used the EDAC driver) of present memory controllers during the
- * error handling process.
- */
- u8 mc_mapping[NUM_IMC];
+ int num_imc;
struct skx_imc {
struct mem_ctl_info *mci;
struct pci_dev *mdev; /* for i10nm CPU */
@@ -155,6 +141,16 @@ struct skx_dev {
u8 mc; /* system wide mc# */
u8 lmc; /* socket relative mc# */
u8 src_id;
+ /*
+ * Some server BIOS may hide certain memory controllers, and the
+ * EDAC driver skips those hidden memory controllers. However, the
+ * ADXL still decodes memory error address using physical memory
+ * controller indices. The mapping table is used to convert the
+ * physical indices (reported by ADXL) to the logical indices
+ * (used by the EDAC driver) of present memory controllers during the
+ * error handling process.
+ */
+ u8 mc_mapping;
struct skx_channel {
struct pci_dev *cdev;
struct pci_dev *edev;
@@ -171,7 +167,7 @@ struct skx_dev {
u8 colbits;
} dimms[NUM_DIMMS];
} chan[NUM_CHANNELS];
- } imc[NUM_IMC];
+ } imc[];
};
struct skx_pvt {
diff --git a/drivers/edac/versalnet_edac.c b/drivers/edac/versalnet_edac.c
new file mode 100644
index 000000000000..7c5db8bf0595
--- /dev/null
+++ b/drivers/edac/versalnet_edac.c
@@ -0,0 +1,960 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * AMD Versal NET memory controller driver
+ * Copyright (C) 2025 Advanced Micro Devices, Inc.
+ */
+
+#include <linux/cdx/edac_cdx_pcol.h>
+#include <linux/edac.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/ras.h>
+#include <linux/remoteproc.h>
+#include <linux/rpmsg.h>
+#include <linux/sizes.h>
+#include <ras/ras_event.h>
+
+#include "edac_module.h"
+
+/* Granularity of reported error in bytes */
+#define MC5_ERR_GRAIN 1
+#define MC_GET_DDR_CONFIG_IN_LEN 4
+
+#define MC5_IRQ_CE_MASK GENMASK(18, 15)
+#define MC5_IRQ_UE_MASK GENMASK(14, 11)
+
+#define MC5_RANK_1_MASK GENMASK(11, 6)
+#define MASK_24 GENMASK(29, 24)
+#define MASK_0 GENMASK(5, 0)
+
+#define MC5_LRANK_1_MASK GENMASK(11, 6)
+#define MC5_LRANK_2_MASK GENMASK(17, 12)
+#define MC5_BANK1_MASK GENMASK(11, 6)
+#define MC5_GRP_0_MASK GENMASK(17, 12)
+#define MC5_GRP_1_MASK GENMASK(23, 18)
+
+#define MC5_REGHI_ROW 7
+#define MC5_EACHBIT 1
+#define MC5_ERR_TYPE_CE 0
+#define MC5_ERR_TYPE_UE 1
+#define MC5_HIGH_MEM_EN BIT(20)
+#define MC5_MEM_MASK GENMASK(19, 0)
+#define MC5_X16_BASE 256
+#define MC5_X16_ECC 32
+#define MC5_X16_SIZE (MC5_X16_BASE + MC5_X16_ECC)
+#define MC5_X32_SIZE 576
+#define MC5_HIMEM_BASE (256 * SZ_1M)
+#define MC5_ILC_HIMEM_EN BIT(28)
+#define MC5_ILC_MEM GENMASK(27, 0)
+#define MC5_INTERLEAVE_SEL GENMASK(3, 0)
+#define MC5_BUS_WIDTH_MASK GENMASK(19, 18)
+#define MC5_NUM_CHANS_MASK BIT(17)
+#define MC5_RANK_MASK GENMASK(15, 14)
+
+#define ERROR_LEVEL 2
+#define ERROR_ID 3
+#define TOTAL_ERR_LENGTH 5
+#define MSG_ERR_OFFSET 8
+#define MSG_ERR_LENGTH 9
+#define ERROR_DATA 10
+#define MCDI_RESPONSE 0xFF
+
+#define REG_MAX 152
+#define ADEC_MAX 152
+#define NUM_CONTROLLERS 8
+#define REGS_PER_CONTROLLER 19
+#define ADEC_NUM 19
+#define BUFFER_SZ 80
+
+#define XDDR5_BUS_WIDTH_64 0
+#define XDDR5_BUS_WIDTH_32 1
+#define XDDR5_BUS_WIDTH_16 2
+
+/**
+ * union ecc_error_info - ECC error log information.
+ * @burstpos: Burst position.
+ * @lrank: Logical Rank number.
+ * @rank: Rank number.
+ * @group: Group number.
+ * @bank: Bank number.
+ * @col: Column number.
+ * @row: Row number.
+ * @rowhi: Row number higher bits.
+ * @i: Combined ECC error vector containing encoded values of burst position,
+ * rank, bank, column, and row information.
+ */
+union ecc_error_info {
+ struct {
+ u32 burstpos:3;
+ u32 lrank:4;
+ u32 rank:2;
+ u32 group:3;
+ u32 bank:2;
+ u32 col:11;
+ u32 row:7;
+ u32 rowhi;
+ };
+ u64 i;
+} __packed;
+
+/* Row and column bit positions in the address decoder (ADEC) registers. */
+union row_col_mapping {
+ struct {
+ u32 row0:6;
+ u32 row1:6;
+ u32 row2:6;
+ u32 row3:6;
+ u32 row4:6;
+ u32 reserved:2;
+ };
+ struct {
+ u32 col1:6;
+ u32 col2:6;
+ u32 col3:6;
+ u32 col4:6;
+ u32 col5:6;
+ u32 reservedcol:2;
+ };
+ u32 i;
+} __packed;
+
+/**
+ * struct ecc_status - ECC status information to report.
+ * @ceinfo: Correctable errors.
+ * @ueinfo: Uncorrected errors.
+ * @channel: Channel number.
+ * @error_type: Error type.
+ */
+struct ecc_status {
+ union ecc_error_info ceinfo[2];
+ union ecc_error_info ueinfo[2];
+ u8 channel;
+ u8 error_type;
+};
+
+/**
+ * struct mc_priv - DDR memory controller private instance data.
+ * @message: Buffer for framing the event specific info.
+ * @stat: ECC status information.
+ * @error_id: The error id.
+ * @error_level: The error level.
+ * @dwidth: Width of data bus excluding ECC bits.
+ * @part_len: Length of the error data received so far, for messages that arrive in parts.
+ * @regs: The registers sent on the rpmsg.
+ * @adec: Address decode registers.
+ * @mci: Memory controller interface.
+ * @ept: rpmsg endpoint.
+ * @mcdi: The mcdi handle.
+ */
+struct mc_priv {
+ char message[256];
+ struct ecc_status stat;
+ u32 error_id;
+ u32 error_level;
+ u32 dwidth;
+ u32 part_len;
+ u32 regs[REG_MAX];
+ u32 adec[ADEC_MAX];
+ struct mem_ctl_info *mci[NUM_CONTROLLERS];
+ struct rpmsg_endpoint *ept;
+ struct cdx_mcdi *mcdi;
+};
+
+/*
+ * Address decoder (ADEC) registers to match the order in which the register
+ * information is received from the firmware.
+ */
+enum adec_info {
+ CONF = 0,
+ ADEC0,
+ ADEC1,
+ ADEC2,
+ ADEC3,
+ ADEC4,
+ ADEC5,
+ ADEC6,
+ ADEC7,
+ ADEC8,
+ ADEC9,
+ ADEC10,
+ ADEC11,
+ ADEC12,
+ ADEC13,
+ ADEC14,
+ ADEC15,
+ ADEC16,
+ ADECILC,
+};
+
+enum reg_info {
+ ISR = 0,
+ IMR,
+ ECCR0_ERR_STATUS,
+ ECCR0_ADDR_LO,
+ ECCR0_ADDR_HI,
+ ECCR0_DATA_LO,
+ ECCR0_DATA_HI,
+ ECCR0_PAR,
+ ECCR1_ERR_STATUS,
+ ECCR1_ADDR_LO,
+ ECCR1_ADDR_HI,
+ ECCR1_DATA_LO,
+ ECCR1_DATA_HI,
+ ECCR1_PAR,
+ XMPU_ERR,
+ XMPU_ERR_ADDR_L0,
+ XMPU_ERR_ADDR_HI,
+ XMPU_ERR_AXI_ID,
+ ADEC_CHK_ERR_LOG,
+};
+
+static bool get_ddr_info(u32 *error_data, struct mc_priv *priv)
+{
+ u32 reglo, reghi, parity, eccr0_val, eccr1_val, isr;
+ struct ecc_status *p;
+
+ isr = error_data[ISR];
+
+ if (!(isr & (MC5_IRQ_UE_MASK | MC5_IRQ_CE_MASK)))
+ return false;
+
+ eccr0_val = error_data[ECCR0_ERR_STATUS];
+ eccr1_val = error_data[ECCR1_ERR_STATUS];
+
+ if (!eccr0_val && !eccr1_val)
+ return false;
+
+ p = &priv->stat;
+
+ if (!eccr0_val)
+ p->channel = 1;
+ else
+ p->channel = 0;
+
+ reglo = error_data[ECCR0_ADDR_LO];
+ reghi = error_data[ECCR0_ADDR_HI];
+ if (isr & MC5_IRQ_CE_MASK)
+ p->ceinfo[0].i = reglo | (u64)reghi << 32;
+ else if (isr & MC5_IRQ_UE_MASK)
+ p->ueinfo[0].i = reglo | (u64)reghi << 32;
+
+ parity = error_data[ECCR0_PAR];
+ edac_dbg(2, "ERR DATA: 0x%08X%08X PARITY: 0x%08X\n",
+ reghi, reglo, parity);
+
+ reglo = error_data[ECCR1_ADDR_LO];
+ reghi = error_data[ECCR1_ADDR_HI];
+ if (isr & MC5_IRQ_CE_MASK)
+ p->ceinfo[1].i = reglo | (u64)reghi << 32;
+ else if (isr & MC5_IRQ_UE_MASK)
+ p->ueinfo[1].i = reglo | (u64)reghi << 32;
+
+ parity = error_data[ECCR1_PAR];
+ edac_dbg(2, "ERR DATA: 0x%08X%08X PARITY: 0x%08X\n",
+ reghi, reglo, parity);
+
+ return true;
+}
+
+/**
+ * convert_to_physical - Convert @error_data to a physical address.
+ * @priv: DDR memory controller private instance data.
+ * @pinf: ECC error info structure.
+ * @controller: Controller number of the MC5
+ * @error_data: the DDRMC5 ADEC address decoder register data
+ *
+ * Return: physical address of the DDR memory.
+ */
+static unsigned long convert_to_physical(struct mc_priv *priv,
+ union ecc_error_info pinf,
+ int controller, int *error_data)
+{
+ u32 row, blk, rsh_req_addr, interleave, ilc_base_ctrl_add, ilc_himem_en, reg, offset;
+ u64 high_mem_base, high_mem_offset, low_mem_offset, ilcmem_base;
+ unsigned long err_addr = 0, addr;
+ union row_col_mapping cols;
+ union row_col_mapping rows;
+ u32 col_bit_0;
+
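+ /*
+ * Scatter each row bit of the error record into the linear address at
+ * the bit position programmed in the ADEC row-mapping registers
+ * (ADEC6..ADEC9).
+ */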
+ row = pinf.rowhi << MC5_REGHI_ROW | pinf.row;
+ offset = controller * ADEC_NUM;
+
+ reg = error_data[ADEC6];
+ rows.i = reg;
+ err_addr |= (row & BIT(0)) << rows.row0;
+ row >>= MC5_EACHBIT;
+ err_addr |= (row & BIT(0)) << rows.row1;
+ row >>= MC5_EACHBIT;
+ err_addr |= (row & BIT(0)) << rows.row2;
+ row >>= MC5_EACHBIT;
+ err_addr |= (row & BIT(0)) << rows.row3;
+ row >>= MC5_EACHBIT;
+ err_addr |= (row & BIT(0)) << rows.row4;
+ row >>= MC5_EACHBIT;
+
+ reg = error_data[ADEC7];
+ rows.i = reg;
+ err_addr |= (row & BIT(0)) << rows.row0;
+ row >>= MC5_EACHBIT;
+ err_addr |= (row & BIT(0)) << rows.row1;
+ row >>= MC5_EACHBIT;
+ err_addr |= (row & BIT(0)) << rows.row2;
+ row >>= MC5_EACHBIT;
+ err_addr |= (row & BIT(0)) << rows.row3;
+ row >>= MC5_EACHBIT;
+ err_addr |= (row & BIT(0)) << rows.row4;
+ row >>= MC5_EACHBIT;
+
+ reg = error_data[ADEC8];
+ rows.i = reg;
+ err_addr |= (row & BIT(0)) << rows.row0;
+ row >>= MC5_EACHBIT;
+ err_addr |= (row & BIT(0)) << rows.row1;
+ row >>= MC5_EACHBIT;
+ err_addr |= (row & BIT(0)) << rows.row2;
+ row >>= MC5_EACHBIT;
+ err_addr |= (row & BIT(0)) << rows.row3;
+ row >>= MC5_EACHBIT;
+ err_addr |= (row & BIT(0)) << rows.row4;
+
+ reg = error_data[ADEC9];
+ rows.i = reg;
+
+ err_addr |= (row & BIT(0)) << rows.row0;
+ row >>= MC5_EACHBIT;
+ err_addr |= (row & BIT(0)) << rows.row1;
+ row >>= MC5_EACHBIT;
+ err_addr |= (row & BIT(0)) << rows.row2;
+ row >>= MC5_EACHBIT;
+
+ col_bit_0 = FIELD_GET(MASK_24, error_data[ADEC9]);
+ pinf.col >>= 1;
+ err_addr |= (pinf.col & 1) << col_bit_0;
+
+ cols.i = error_data[ADEC10];
+ err_addr |= (pinf.col & 1) << cols.col1;
+ pinf.col >>= 1;
+ err_addr |= (pinf.col & 1) << cols.col2;
+ pinf.col >>= 1;
+ err_addr |= (pinf.col & 1) << cols.col3;
+ pinf.col >>= 1;
+ err_addr |= (pinf.col & 1) << cols.col4;
+ pinf.col >>= 1;
+ err_addr |= (pinf.col & 1) << cols.col5;
+ pinf.col >>= 1;
+
+ cols.i = error_data[ADEC11];
+ err_addr |= (pinf.col & 1) << cols.col1;
+ pinf.col >>= 1;
+ err_addr |= (pinf.col & 1) << cols.col2;
+ pinf.col >>= 1;
+ err_addr |= (pinf.col & 1) << cols.col3;
+ pinf.col >>= 1;
+ err_addr |= (pinf.col & 1) << cols.col4;
+ pinf.col >>= 1;
+ err_addr |= (pinf.col & 1) << cols.col5;
+ pinf.col >>= 1;
+
+ reg = error_data[ADEC12];
+ err_addr |= (pinf.bank & BIT(0)) << (reg & MASK_0);
+ pinf.bank >>= MC5_EACHBIT;
+ err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_BANK1_MASK, reg);
+ pinf.bank >>= MC5_EACHBIT;
+
+ err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_GRP_0_MASK, reg);
+ pinf.group >>= MC5_EACHBIT;
+ err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_GRP_1_MASK, reg);
+ pinf.group >>= MC5_EACHBIT;
+ err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MASK_24, reg);
+ pinf.group >>= MC5_EACHBIT;
+
+ reg = error_data[ADEC4];
+ err_addr |= (pinf.rank & BIT(0)) << (reg & MASK_0);
+ pinf.rank >>= MC5_EACHBIT;
+ err_addr |= (pinf.rank & BIT(0)) << FIELD_GET(MC5_RANK_1_MASK, reg);
+ pinf.rank >>= MC5_EACHBIT;
+
+ reg = error_data[ADEC5];
+ err_addr |= (pinf.lrank & BIT(0)) << (reg & MASK_0);
+ pinf.lrank >>= MC5_EACHBIT;
+ err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MC5_LRANK_1_MASK, reg);
+ pinf.lrank >>= MC5_EACHBIT;
+ err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MC5_LRANK_2_MASK, reg);
+ pinf.lrank >>= MC5_EACHBIT;
+ err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MASK_24, reg);
+ pinf.lrank >>= MC5_EACHBIT;
+
+ high_mem_base = (priv->adec[ADEC2 + offset] & MC5_MEM_MASK) * MC5_HIMEM_BASE;
+ interleave = priv->adec[ADEC13 + offset] & MC5_INTERLEAVE_SEL;
+
+ high_mem_offset = priv->adec[ADEC3 + offset] & MC5_MEM_MASK;
+ low_mem_offset = priv->adec[ADEC1 + offset] & MC5_MEM_MASK;
+ reg = priv->adec[ADEC14 + offset];
+ ilc_himem_en = !!(reg & MC5_ILC_HIMEM_EN);
+ ilcmem_base = (reg & MC5_ILC_MEM) * SZ_1M;
+ if (ilc_himem_en)
+ ilc_base_ctrl_add = ilcmem_base - high_mem_offset;
+ else
+ ilc_base_ctrl_add = ilcmem_base - low_mem_offset;
+
+ if (priv->dwidth == DEV_X16) {
+ blk = err_addr / MC5_X16_SIZE;
+ rsh_req_addr = (blk << 8) + ilc_base_ctrl_add;
+ err_addr = rsh_req_addr * interleave * 2;
+ } else {
+ blk = err_addr / MC5_X32_SIZE;
+ rsh_req_addr = (blk << 9) + ilc_base_ctrl_add;
+ err_addr = rsh_req_addr * interleave * 2;
+ }
+
+ if ((priv->adec[ADEC2 + offset] & MC5_HIGH_MEM_EN) && err_addr >= high_mem_base)
+ addr = err_addr - high_mem_offset;
+ else
+ addr = err_addr - low_mem_offset;
+
+ return addr;
+}
+
+/**
+ * handle_error - Handle errors.
+ * @priv: DDR memory controller private instance data.
+ * @stat: ECC status structure.
+ * @ctl_num: Controller number of the MC5
+ * @error_data: the MC5 ADEC address decoder register data
+ *
+ * Handles ECC correctable and uncorrectable errors.
+ */
+static void handle_error(struct mc_priv *priv, struct ecc_status *stat,
+ int ctl_num, int *error_data)
+{
+ union ecc_error_info pinf;
+ struct mem_ctl_info *mci;
+ unsigned long pa;
+ phys_addr_t pfn;
+ int err;
+
+ if (WARN_ON_ONCE(ctl_num >= NUM_CONTROLLERS))
+ return;
+
+ mci = priv->mci[ctl_num];
+
+ if (stat->error_type == MC5_ERR_TYPE_CE) {
+ pinf = stat->ceinfo[stat->channel];
+ snprintf(priv->message, sizeof(priv->message),
+ "Error type:%s Controller %d Addr at %lx\n",
+ "CE", ctl_num, convert_to_physical(priv, pinf, ctl_num, error_data));
+
+ edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
+ 1, 0, 0, 0, 0, 0, -1,
+ priv->message, "");
+ }
+
+ if (stat->error_type == MC5_ERR_TYPE_UE) {
+ pinf = stat->ueinfo[stat->channel];
+ snprintf(priv->message, sizeof(priv->message),
+ "Error type:%s controller %d Addr at %lx\n",
+ "UE", ctl_num, convert_to_physical(priv, pinf, ctl_num, error_data));
+
+ edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
+ 1, 0, 0, 0, 0, 0, -1,
+ priv->message, "");
+ pa = convert_to_physical(priv, pinf, ctl_num, error_data);
+ pfn = PHYS_PFN(pa);
+
+ if (IS_ENABLED(CONFIG_MEMORY_FAILURE)) {
+ err = memory_failure(pfn, MF_ACTION_REQUIRED);
+ if (err)
+ edac_dbg(2, "memory_failure() error: %d", err);
+ else
+ edac_dbg(2, "Poison page at PA 0x%lx\n", pa);
+ }
+ }
+}
+
+static void mc_init(struct mem_ctl_info *mci, struct device *dev)
+{
+ struct mc_priv *priv = mci->pvt_info;
+ struct csrow_info *csi;
+ struct dimm_info *dimm;
+ u32 row;
+ int ch;
+
+ /* Initialize controller capabilities and configuration */
+ mci->mtype_cap = MEM_FLAG_DDR5;
+ mci->edac_ctl_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED;
+ mci->scrub_cap = SCRUB_HW_SRC;
+ mci->scrub_mode = SCRUB_NONE;
+
+ mci->edac_cap = EDAC_FLAG_SECDED;
+ mci->ctl_name = "VersalNET DDR5";
+ mci->dev_name = dev_name(dev);
+ mci->mod_name = "versalnet_edac";
+
+ edac_op_state = EDAC_OPSTATE_INT;
+
+ for (row = 0; row < mci->nr_csrows; row++) {
+ csi = mci->csrows[row];
+ for (ch = 0; ch < csi->nr_channels; ch++) {
+ dimm = csi->channels[ch]->dimm;
+ dimm->edac_mode = EDAC_SECDED;
+ dimm->mtype = MEM_DDR5;
+ dimm->grain = MC5_ERR_GRAIN;
+ dimm->dtype = priv->dwidth;
+ }
+ }
+}
+
+#define to_mci(k) container_of(k, struct mem_ctl_info, dev)
+
+static unsigned int mcdi_rpc_timeout(struct cdx_mcdi *cdx, unsigned int cmd)
+{
+ return MCDI_RPC_TIMEOUT;
+}
+
+static void mcdi_request(struct cdx_mcdi *cdx,
+ const struct cdx_dword *hdr, size_t hdr_len,
+ const struct cdx_dword *sdu, size_t sdu_len)
+{
+ void *send_buf;
+ int ret;
+
+ send_buf = kzalloc(hdr_len + sdu_len, GFP_KERNEL);
+ if (!send_buf)
+ return;
+
+ memcpy(send_buf, hdr, hdr_len);
+ memcpy(send_buf + hdr_len, sdu, sdu_len);
+
+ ret = rpmsg_send(cdx->ept, send_buf, hdr_len + sdu_len);
+ if (ret)
+ dev_err(&cdx->rpdev->dev, "Failed to send rpmsg data: %d\n", ret);
+
+ kfree(send_buf);
+}
+
+static const struct cdx_mcdi_ops mcdi_ops = {
+ .mcdi_rpc_timeout = mcdi_rpc_timeout,
+ .mcdi_request = mcdi_request,
+};
+
+static void get_ddr_config(u32 index, u32 *buffer, struct cdx_mcdi *amd_mcdi)
+{
+ size_t outlen;
+ int ret;
+
+ MCDI_DECLARE_BUF(inbuf, MC_GET_DDR_CONFIG_IN_LEN);
+ MCDI_DECLARE_BUF(outbuf, BUFFER_SZ);
+
+ MCDI_SET_DWORD(inbuf, EDAC_GET_DDR_CONFIG_IN_CONTROLLER_INDEX, index);
+
+ ret = cdx_mcdi_rpc(amd_mcdi, MC_CMD_EDAC_GET_DDR_CONFIG, inbuf, sizeof(inbuf),
+ outbuf, sizeof(outbuf), &outlen);
+ if (!ret)
+ memcpy(buffer, MCDI_PTR(outbuf, GET_DDR_CONFIG),
+ (ADEC_NUM * 4));
+}
+
+static int setup_mcdi(struct mc_priv *mc_priv)
+{
+ struct cdx_mcdi *amd_mcdi;
+ int ret, i;
+
+ amd_mcdi = kzalloc(sizeof(*amd_mcdi), GFP_KERNEL);
+ if (!amd_mcdi)
+ return -ENOMEM;
+
+ amd_mcdi->mcdi_ops = &mcdi_ops;
+ ret = cdx_mcdi_init(amd_mcdi);
+ if (ret) {
+ kfree(amd_mcdi);
+ return ret;
+ }
+
+ amd_mcdi->ept = mc_priv->ept;
+ mc_priv->mcdi = amd_mcdi;
+
+ for (i = 0; i < NUM_CONTROLLERS; i++)
+ get_ddr_config(i, &mc_priv->adec[ADEC_NUM * i], amd_mcdi);
+
+ return 0;
+}
+
+static const guid_t amd_versalnet_guid = GUID_INIT(0x82678888, 0xa556, 0x44f2,
+ 0xb8, 0xb4, 0x45, 0x56, 0x2e,
+ 0x8c, 0x5b, 0xec);
+
+static int rpmsg_cb(struct rpmsg_device *rpdev, void *data,
+ int len, void *priv, u32 src)
+{
+ struct mc_priv *mc_priv = dev_get_drvdata(&rpdev->dev);
+ const guid_t *sec_type = &guid_null;
+ u32 length, offset, error_id;
+ u32 *result = (u32 *)data;
+ struct ecc_status *p;
+ int i, j, k, sec_sev;
+ const char *err_str;
+ u32 *adec_data;
+
+ if (*(u8 *)data == MCDI_RESPONSE) {
+ cdx_mcdi_process_cmd(mc_priv->mcdi, (struct cdx_dword *)data, len);
+ return 0;
+ }
+
+ sec_sev = result[ERROR_LEVEL];
+ error_id = result[ERROR_ID];
+ length = result[MSG_ERR_LENGTH];
+ offset = result[MSG_ERR_OFFSET];
+
+ if (result[TOTAL_ERR_LENGTH] > length) {
+ if (!mc_priv->part_len)
+ mc_priv->part_len = length;
+ else
+ mc_priv->part_len += length;
+ /*
+ * The data can arrive in two stretches. Reassemble the regs[] array
+ * from the two messages; the offset indicates where in regs[] this
+ * chunk of data belongs.
+ */
+ for (i = 0 ; i < length; i++) {
+ k = offset + i;
+ j = ERROR_DATA + i;
+ mc_priv->regs[k] = result[j];
+ }
+ if (mc_priv->part_len < result[TOTAL_ERR_LENGTH])
+ return 0;
+ mc_priv->part_len = 0;
+ }
+
+ mc_priv->error_id = error_id;
+ mc_priv->error_level = result[ERROR_LEVEL];
+
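+ /*
+ * Error IDs 18 and 19 carry DDR MC5 ECC data and are decoded and
+ * reported per memory controller; all other IDs are logged as
+ * non-standard RAS events below.
+ */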
+ switch (error_id) {
+ case 5: err_str = "General Software Non-Correctable error"; break;
+ case 6: err_str = "CFU error"; break;
+ case 7: err_str = "CFRAME error"; break;
+ case 10: err_str = "DDRMC Microblaze Correctable ECC error"; break;
+ case 11: err_str = "DDRMC Microblaze Non-Correctable ECC error"; break;
+ case 15: err_str = "MMCM error"; break;
+ case 16: err_str = "HNICX Correctable error"; break;
+ case 17: err_str = "HNICX Non-Correctable error"; break;
+
+ case 18:
+ p = &mc_priv->stat;
+ memset(p, 0, sizeof(struct ecc_status));
+ p->error_type = MC5_ERR_TYPE_CE;
+ for (i = 0 ; i < NUM_CONTROLLERS; i++) {
+ if (get_ddr_info(&mc_priv->regs[i * REGS_PER_CONTROLLER], mc_priv)) {
+ adec_data = mc_priv->adec + ADEC_NUM * i;
+ handle_error(mc_priv, &mc_priv->stat, i, adec_data);
+ }
+ }
+ return 0;
+ case 19:
+ p = &mc_priv->stat;
+ memset(p, 0, sizeof(struct ecc_status));
+ p->error_type = MC5_ERR_TYPE_UE;
+ for (i = 0 ; i < NUM_CONTROLLERS; i++) {
+ if (get_ddr_info(&mc_priv->regs[i * REGS_PER_CONTROLLER], mc_priv)) {
+ adec_data = mc_priv->adec + ADEC_NUM * i;
+ handle_error(mc_priv, &mc_priv->stat, i, adec_data);
+ }
+ }
+ return 0;
+
+ case 21: err_str = "GT Non-Correctable error"; break;
+ case 22: err_str = "PL Sysmon Correctable error"; break;
+ case 23: err_str = "PL Sysmon Non-Correctable error"; break;
+ case 111: err_str = "LPX unexpected dfx activation error"; break;
+ case 114: err_str = "INT_LPD Non-Correctable error"; break;
+ case 116: err_str = "INT_OCM Non-Correctable error"; break;
+ case 117: err_str = "INT_FPD Correctable error"; break;
+ case 118: err_str = "INT_FPD Non-Correctable error"; break;
+ case 120: err_str = "INT_IOU Non-Correctable error"; break;
+ case 123: err_str = "err_int_irq from APU GIC Distributor"; break;
+ case 124: err_str = "fault_int_irq from APU GIC Distributor"; break;
+ case 132 ... 139: err_str = "FPX SPLITTER error"; break;
+ case 140: err_str = "APU Cluster 0 error"; break;
+ case 141: err_str = "APU Cluster 1 error"; break;
+ case 142: err_str = "APU Cluster 2 error"; break;
+ case 143: err_str = "APU Cluster 3 error"; break;
+ case 145: err_str = "WWDT1 LPX error"; break;
+ case 147: err_str = "IPI error"; break;
+ case 152 ... 153: err_str = "AFIFS error"; break;
+ case 154 ... 155: err_str = "LPX glitch error"; break;
+ case 185 ... 186: err_str = "FPX AFIFS error"; break;
+ case 195 ... 199: err_str = "AFIFM error"; break;
+ case 108: err_str = "PSM Correctable error"; break;
+ case 59: err_str = "PMC correctable error"; break;
+ case 60: err_str = "PMC Uncorrectable error"; break;
+ case 43 ... 47: err_str = "PMC Sysmon error"; break;
+ case 163 ... 184: err_str = "RPU error"; break;
+ case 148: err_str = "OCM0 correctable error"; break;
+ case 149: err_str = "OCM1 correctable error"; break;
+ case 150: err_str = "OCM0 Un-correctable error"; break;
+ case 151: err_str = "OCM1 Un-correctable error"; break;
+ case 189: err_str = "PSX_CMN_3 PD block consolidated error"; break;
+ case 191: err_str = "FPD_INT_WRAP PD block consolidated error"; break;
+ case 232: err_str = "CRAM Un-Correctable error"; break;
+ default: err_str = "VERSAL_EDAC_ERR_ID: %d"; break;
+ }
+
+ snprintf(mc_priv->message,
+ sizeof(mc_priv->message),
+ "[VERSAL_EDAC_ERR_ID: %d] Error type: %s", error_id, err_str);
+
+ /* Convert to bytes */
+ length = result[TOTAL_ERR_LENGTH] * 4;
+ log_non_standard_event(sec_type, &amd_versalnet_guid, mc_priv->message,
+ sec_sev, (void *)&result[ERROR_DATA], length);
+
+ return 0;
+}
+
+static struct rpmsg_device_id amd_rpmsg_id_table[] = {
+ { .name = "error_ipc" },
+ { },
+};
+MODULE_DEVICE_TABLE(rpmsg, amd_rpmsg_id_table);
+
+static int rpmsg_probe(struct rpmsg_device *rpdev)
+{
+ struct rpmsg_channel_info chinfo;
+ struct mc_priv *pg;
+
+ pg = (struct mc_priv *)amd_rpmsg_id_table[0].driver_data;
+ chinfo.src = RPMSG_ADDR_ANY;
+ chinfo.dst = rpdev->dst;
+ strscpy(chinfo.name, amd_rpmsg_id_table[0].name,
+ strlen(amd_rpmsg_id_table[0].name));
+
+ pg->ept = rpmsg_create_ept(rpdev, rpmsg_cb, NULL, chinfo);
+ if (!pg->ept)
+ return dev_err_probe(&rpdev->dev, -ENXIO, "Failed to create ept for channel %s\n",
+ chinfo.name);
+
+ dev_set_drvdata(&rpdev->dev, pg);
+
+ return 0;
+}
+
+static void rpmsg_remove(struct rpmsg_device *rpdev)
+{
+ struct mc_priv *mc_priv = dev_get_drvdata(&rpdev->dev);
+
+ rpmsg_destroy_ept(mc_priv->ept);
+ dev_set_drvdata(&rpdev->dev, NULL);
+}
+
+static struct rpmsg_driver amd_rpmsg_driver = {
+ .drv.name = KBUILD_MODNAME,
+ .probe = rpmsg_probe,
+ .remove = rpmsg_remove,
+ .callback = rpmsg_cb,
+ .id_table = amd_rpmsg_id_table,
+};
+
+static void versal_edac_release(struct device *dev)
+{
+ kfree(dev);
+}
+
+static int init_versalnet(struct mc_priv *priv, struct platform_device *pdev)
+{
+ u32 num_chans, rank, dwidth, config;
+ struct edac_mc_layer layers[2];
+ struct mem_ctl_info *mci;
+ struct device *dev;
+ enum dev_type dt;
+ char *name;
+ int rc, i;
+
+ for (i = 0; i < NUM_CONTROLLERS; i++) {
+ config = priv->adec[CONF + i * ADEC_NUM];
+ num_chans = FIELD_GET(MC5_NUM_CHANS_MASK, config);
+ rank = 1 << FIELD_GET(MC5_RANK_MASK, config);
+ dwidth = FIELD_GET(MC5_BUS_WIDTH_MASK, config);
+
+ switch (dwidth) {
+ case XDDR5_BUS_WIDTH_16:
+ dt = DEV_X16;
+ break;
+ case XDDR5_BUS_WIDTH_32:
+ dt = DEV_X32;
+ break;
+ case XDDR5_BUS_WIDTH_64:
+ dt = DEV_X64;
+ break;
+ default:
+ dt = DEV_UNKNOWN;
+ }
+
+ if (dt == DEV_UNKNOWN)
+ continue;
+
+ /* Find the first enabled device and register that one. */
+ layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
+ layers[0].size = rank;
+ layers[0].is_virt_csrow = true;
+ layers[1].type = EDAC_MC_LAYER_CHANNEL;
+ layers[1].size = num_chans;
+ layers[1].is_virt_csrow = false;
+
+ rc = -ENOMEM;
+ mci = edac_mc_alloc(i, ARRAY_SIZE(layers), layers,
+ sizeof(struct mc_priv));
+ if (!mci) {
+ edac_printk(KERN_ERR, EDAC_MC, "Failed memory allocation for MC%d\n", i);
+ goto err_alloc;
+ }
+
+ priv->mci[i] = mci;
+ priv->dwidth = dt;
+
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ dev->release = versal_edac_release;
+ name = kmalloc(32, GFP_KERNEL);
+ sprintf(name, "versal-net-ddrmc5-edac-%d", i);
+ dev->init_name = name;
+ rc = device_register(dev);
+ if (rc)
+ goto err_alloc;
+
+ mci->pdev = dev;
+
+ platform_set_drvdata(pdev, priv);
+
+ mc_init(mci, dev);
+ rc = edac_mc_add_mc(mci);
+ if (rc) {
+ edac_printk(KERN_ERR, EDAC_MC, "Failed to register MC%d with EDAC core\n", i);
+ goto err_alloc;
+ }
+ }
+ return 0;
+
+err_alloc:
+ while (i--) {
+ mci = priv->mci[i];
+ if (!mci)
+ continue;
+
+ if (mci->pdev) {
+ device_unregister(mci->pdev);
+ edac_mc_del_mc(mci->pdev);
+ }
+
+ edac_mc_free(mci);
+ }
+
+ return rc;
+}
+
+static void remove_versalnet(struct mc_priv *priv)
+{
+ struct mem_ctl_info *mci;
+ int i;
+
+ for (i = 0; i < NUM_CONTROLLERS; i++) {
+ device_unregister(priv->mci[i]->pdev);
+ mci = edac_mc_del_mc(priv->mci[i]->pdev);
+ if (!mci)
+ return;
+
+ edac_mc_free(mci);
+ }
+}
+
+static int mc_probe(struct platform_device *pdev)
+{
+ struct device_node *r5_core_node;
+ struct mc_priv *priv;
+ struct rproc *rp;
+ int rc;
+
+ r5_core_node = of_parse_phandle(pdev->dev.of_node, "amd,rproc", 0);
+ if (!r5_core_node) {
+ dev_err(&pdev->dev, "amd,rproc: invalid phandle\n");
+ return -EINVAL;
+ }
+
+ rp = rproc_get_by_phandle(r5_core_node->phandle);
+ if (!rp)
+ return -EPROBE_DEFER;
+
+ rc = rproc_boot(rp);
+ if (rc) {
+ dev_err(&pdev->dev, "Failed to attach to remote processor\n");
+ goto err_rproc_boot;
+ }
+
+ priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
+ if (!priv) {
+ rc = -ENOMEM;
+ goto err_alloc;
+ }
+
+ amd_rpmsg_id_table[0].driver_data = (kernel_ulong_t)priv;
+
+ rc = register_rpmsg_driver(&amd_rpmsg_driver);
+ if (rc) {
+ edac_printk(KERN_ERR, EDAC_MC, "Failed to register RPMsg driver: %d\n", rc);
+ goto err_alloc;
+ }
+
+ rc = setup_mcdi(priv);
+ if (rc)
+ goto err_unreg;
+
+ priv->mcdi->r5_rproc = rp;
+
+ rc = init_versalnet(priv, pdev);
+ if (rc)
+ goto err_init;
+
+ return 0;
+
+err_init:
+ cdx_mcdi_finish(priv->mcdi);
+
+err_unreg:
+ unregister_rpmsg_driver(&amd_rpmsg_driver);
+
+err_alloc:
+ rproc_shutdown(rp);
+
+err_rproc_boot:
+ rproc_put(rp);
+
+ return rc;
+}
+
+static void mc_remove(struct platform_device *pdev)
+{
+ struct mc_priv *priv = platform_get_drvdata(pdev);
+
+ unregister_rpmsg_driver(&amd_rpmsg_driver);
+ remove_versalnet(priv);
+ rproc_shutdown(priv->mcdi->r5_rproc);
+ cdx_mcdi_finish(priv->mcdi);
+}
+
+static const struct of_device_id amd_edac_match[] = {
+ { .compatible = "xlnx,versal-net-ddrmc5", },
+ {}
+};
+MODULE_DEVICE_TABLE(of, amd_edac_match);
+
+static struct platform_driver amd_ddr_edac_mc_driver = {
+ .driver = {
+ .name = "versal-net-edac",
+ .of_match_table = amd_edac_match,
+ },
+ .probe = mc_probe,
+ .remove = mc_remove,
+};
+
+module_platform_driver(amd_ddr_edac_mc_driver);
+
+MODULE_AUTHOR("AMD Inc");
+MODULE_DESCRIPTION("Versal NET EDAC driver");
+MODULE_LICENSE("GPL");
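The init_versalnet() path above registers one bare struct device per memory controller to act as the EDAC parent. A minimal sketch of that pattern with hypothetical names (not part of the driver): a dynamically allocated device needs a release() callback, and once device_register() has been called it must be torn down via device_unregister()/put_device(), never kfree()d directly.

#include <linux/device.h>
#include <linux/slab.h>

static void example_dev_release(struct device *dev)
{
	kfree(dev);
}

static struct device *example_dev_create(int index)
{
	struct device *dev;
	int rc;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return NULL;

	dev->release = example_dev_release;
	dev_set_name(dev, "example-dev-%d", index);

	rc = device_register(dev);
	if (rc) {
		/* On failure the reference must still be dropped. */
		put_device(dev);
		return NULL;
	}

	return dev;
}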
diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c
index 78b10c6ef7fe..2e93189d7142 100644
--- a/drivers/firewire/core-cdev.c
+++ b/drivers/firewire/core-cdev.c
@@ -41,7 +41,7 @@
/*
* ABI version history is documented in linux/firewire-cdev.h.
*/
-#define FW_CDEV_KERNEL_VERSION 5
+#define FW_CDEV_KERNEL_VERSION 6
#define FW_CDEV_VERSION_EVENT_REQUEST2 4
#define FW_CDEV_VERSION_ALLOCATE_REGION_END 4
#define FW_CDEV_VERSION_AUTO_FLUSH_ISO_OVERFLOW 5
diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c
index d28477d84697..1d1c2d8f85ae 100644
--- a/drivers/firewire/core-transaction.c
+++ b/drivers/firewire/core-transaction.c
@@ -550,6 +550,23 @@ const struct fw_address_region fw_unit_space_region =
{ .start = 0xfffff0000900ULL, .end = 0x1000000000000ULL, };
#endif /* 0 */
+static void complete_address_handler(struct kref *kref)
+{
+ struct fw_address_handler *handler = container_of(kref, struct fw_address_handler, kref);
+
+ complete(&handler->done);
+}
+
+static void get_address_handler(struct fw_address_handler *handler)
+{
+ kref_get(&handler->kref);
+}
+
+static int put_address_handler(struct fw_address_handler *handler)
+{
+ return kref_put(&handler->kref, complete_address_handler);
+}
+
/**
* fw_core_add_address_handler() - register for incoming requests
* @handler: callback
@@ -596,6 +613,8 @@ int fw_core_add_address_handler(struct fw_address_handler *handler,
if (other != NULL) {
handler->offset += other->length;
} else {
+ init_completion(&handler->done);
+ kref_init(&handler->kref);
list_add_tail_rcu(&handler->link, &address_handler_list);
ret = 0;
break;
@@ -621,6 +640,9 @@ void fw_core_remove_address_handler(struct fw_address_handler *handler)
list_del_rcu(&handler->link);
synchronize_rcu();
+
+ if (!put_address_handler(handler))
+ wait_for_completion(&handler->done);
}
EXPORT_SYMBOL(fw_core_remove_address_handler);
@@ -914,22 +936,31 @@ static void handle_exclusive_region_request(struct fw_card *card,
handler = lookup_enclosing_address_handler(&address_handler_list, offset,
request->length);
if (handler)
- handler->address_callback(card, request, tcode, destination, source,
- p->generation, offset, request->data,
- request->length, handler->callback_data);
+ get_address_handler(handler);
}
- if (!handler)
+ if (!handler) {
fw_send_response(card, request, RCODE_ADDRESS_ERROR);
+ return;
+ }
+
+ // Outside the RCU read-side critical section. Without spinlock. With reference count.
+ handler->address_callback(card, request, tcode, destination, source, p->generation, offset,
+ request->data, request->length, handler->callback_data);
+ put_address_handler(handler);
}
+// To use the kmalloc allocator efficiently, this should be a power of two.
+#define BUFFER_ON_KERNEL_STACK_SIZE 4
+
static void handle_fcp_region_request(struct fw_card *card,
struct fw_packet *p,
struct fw_request *request,
unsigned long long offset)
{
- struct fw_address_handler *handler;
- int tcode, destination, source;
+ struct fw_address_handler *buffer_on_kernel_stack[BUFFER_ON_KERNEL_STACK_SIZE];
+ struct fw_address_handler *handler, **handlers;
+ int tcode, destination, source, i, count, buffer_size;
if ((offset != (CSR_REGISTER_BASE | CSR_FCP_COMMAND) &&
offset != (CSR_REGISTER_BASE | CSR_FCP_RESPONSE)) ||
@@ -950,15 +981,55 @@ static void handle_fcp_region_request(struct fw_card *card,
return;
}
+ count = 0;
+ handlers = buffer_on_kernel_stack;
+ buffer_size = ARRAY_SIZE(buffer_on_kernel_stack);
scoped_guard(rcu) {
list_for_each_entry_rcu(handler, &address_handler_list, link) {
- if (is_enclosing_handler(handler, offset, request->length))
- handler->address_callback(card, request, tcode, destination, source,
- p->generation, offset, request->data,
- request->length, handler->callback_data);
+ if (is_enclosing_handler(handler, offset, request->length)) {
+ if (count >= buffer_size) {
+ int next_size = buffer_size * 2;
+ struct fw_address_handler **buffer_on_kernel_heap;
+
+ if (handlers == buffer_on_kernel_stack)
+ buffer_on_kernel_heap = NULL;
+ else
+ buffer_on_kernel_heap = handlers;
+
+ buffer_on_kernel_heap =
+ krealloc_array(buffer_on_kernel_heap, next_size,
+ sizeof(*buffer_on_kernel_heap), GFP_ATOMIC);
+ // FCP is used for purposes unrelated to significant system
+ // resources (e.g. storage or networking), so allocation
+ // failures are not considered critical.
+ if (!buffer_on_kernel_heap)
+ break;
+
+ if (handlers == buffer_on_kernel_stack) {
+ memcpy(buffer_on_kernel_heap, buffer_on_kernel_stack,
+ sizeof(buffer_on_kernel_stack));
+ }
+
+ handlers = buffer_on_kernel_heap;
+ buffer_size = next_size;
+ }
+ get_address_handler(handler);
+ handlers[count++] = handler;
+ }
}
}
+ for (i = 0; i < count; ++i) {
+ handler = handlers[i];
+ handler->address_callback(card, request, tcode, destination, source,
+ p->generation, offset, request->data,
+ request->length, handler->callback_data);
+ put_address_handler(handler);
+ }
+
+ if (handlers != buffer_on_kernel_stack)
+ kfree(handlers);
+
fw_send_response(card, request, RCODE_COMPLETE);
}
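The core-transaction.c changes above protect each address handler with a reference count so its callback can run outside the RCU read-side critical section, and removal waits for in-flight callbacks. A minimal sketch of the same kref + completion teardown pattern, using hypothetical names (my_handler etc.) and assuming lookups take their reference while still under rcu_read_lock():

#include <linux/kref.h>
#include <linux/completion.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>

struct my_handler {
	struct list_head link;
	struct kref kref;
	struct completion done;
	void (*callback)(void *data);
	void *data;
};

static void my_handler_release(struct kref *kref)
{
	struct my_handler *h = container_of(kref, struct my_handler, kref);

	/* Signal the remover that the last reference is gone. */
	complete(&h->done);
}

static void my_handler_invoke(struct my_handler *h)
{
	/* Reference was taken under rcu_read_lock() by the lookup path. */
	h->callback(h->data);
	kref_put(&h->kref, my_handler_release);
}

static void my_handler_remove(struct my_handler *h)
{
	list_del_rcu(&h->link);
	synchronize_rcu();	/* no new lookups can find the handler */

	/* Drop the list's reference; wait out any in-flight callbacks. */
	if (!kref_put(&h->kref, my_handler_release))
		wait_for_completion(&h->done);
}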
diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c
index cafc90d4caaf..0d05eac7c72b 100644
--- a/drivers/firmware/efi/libstub/x86-stub.c
+++ b/drivers/firmware/efi/libstub/x86-stub.c
@@ -788,7 +788,9 @@ static efi_status_t efi_decompress_kernel(unsigned long *kernel_entry,
*kernel_entry = addr + entry;
- return efi_adjust_memory_range_protection(addr, kernel_text_size);
+ return efi_adjust_memory_range_protection(addr, kernel_text_size) ?:
+ efi_adjust_memory_range_protection(addr + kernel_inittext_offset,
+ kernel_inittext_size);
}
static void __noreturn enter_kernel(unsigned long kernel_addr,
diff --git a/drivers/firmware/efi/stmm/tee_stmm_efi.c b/drivers/firmware/efi/stmm/tee_stmm_efi.c
index f741ca279052..65c0fe1ba275 100644
--- a/drivers/firmware/efi/stmm/tee_stmm_efi.c
+++ b/drivers/firmware/efi/stmm/tee_stmm_efi.c
@@ -143,6 +143,10 @@ static efi_status_t mm_communicate(u8 *comm_buf, size_t payload_size)
return var_hdr->ret_status;
}
+#define COMM_BUF_SIZE(__payload_size) (MM_COMMUNICATE_HEADER_SIZE + \
+ MM_VARIABLE_COMMUNICATE_SIZE + \
+ (__payload_size))
+
/**
* setup_mm_hdr() - Allocate a buffer for StandAloneMM and initialize the
* header data.
@@ -150,11 +154,9 @@ static efi_status_t mm_communicate(u8 *comm_buf, size_t payload_size)
* @dptr: pointer address to store allocated buffer
* @payload_size: payload size
* @func: standAloneMM function number
- * @ret: EFI return code
* Return: pointer to corresponding StandAloneMM function buffer or NULL
*/
-static void *setup_mm_hdr(u8 **dptr, size_t payload_size, size_t func,
- efi_status_t *ret)
+static void *setup_mm_hdr(u8 **dptr, size_t payload_size, size_t func)
{
const efi_guid_t mm_var_guid = EFI_MM_VARIABLE_GUID;
struct efi_mm_communicate_header *mm_hdr;
@@ -169,17 +171,13 @@ static void *setup_mm_hdr(u8 **dptr, size_t payload_size, size_t func,
if (max_buffer_size &&
max_buffer_size < (MM_COMMUNICATE_HEADER_SIZE +
MM_VARIABLE_COMMUNICATE_SIZE + payload_size)) {
- *ret = EFI_INVALID_PARAMETER;
return NULL;
}
- comm_buf = kzalloc(MM_COMMUNICATE_HEADER_SIZE +
- MM_VARIABLE_COMMUNICATE_SIZE + payload_size,
- GFP_KERNEL);
- if (!comm_buf) {
- *ret = EFI_OUT_OF_RESOURCES;
+ comm_buf = alloc_pages_exact(COMM_BUF_SIZE(payload_size),
+ GFP_KERNEL | __GFP_ZERO);
+ if (!comm_buf)
return NULL;
- }
mm_hdr = (struct efi_mm_communicate_header *)comm_buf;
memcpy(&mm_hdr->header_guid, &mm_var_guid, sizeof(mm_hdr->header_guid));
@@ -187,9 +185,7 @@ static void *setup_mm_hdr(u8 **dptr, size_t payload_size, size_t func,
var_hdr = (struct smm_variable_communicate_header *)mm_hdr->data;
var_hdr->function = func;
- if (dptr)
- *dptr = comm_buf;
- *ret = EFI_SUCCESS;
+ *dptr = comm_buf;
return var_hdr->data;
}
@@ -212,10 +208,9 @@ static efi_status_t get_max_payload(size_t *size)
payload_size = sizeof(*var_payload);
var_payload = setup_mm_hdr(&comm_buf, payload_size,
- SMM_VARIABLE_FUNCTION_GET_PAYLOAD_SIZE,
- &ret);
+ SMM_VARIABLE_FUNCTION_GET_PAYLOAD_SIZE);
if (!var_payload)
- return EFI_OUT_OF_RESOURCES;
+ return EFI_DEVICE_ERROR;
ret = mm_communicate(comm_buf, payload_size);
if (ret != EFI_SUCCESS)
@@ -239,7 +234,7 @@ static efi_status_t get_max_payload(size_t *size)
*/
*size -= 2;
out:
- kfree(comm_buf);
+ free_pages_exact(comm_buf, COMM_BUF_SIZE(payload_size));
return ret;
}
@@ -259,9 +254,9 @@ static efi_status_t get_property_int(u16 *name, size_t name_size,
smm_property = setup_mm_hdr(
&comm_buf, payload_size,
- SMM_VARIABLE_FUNCTION_VAR_CHECK_VARIABLE_PROPERTY_GET, &ret);
+ SMM_VARIABLE_FUNCTION_VAR_CHECK_VARIABLE_PROPERTY_GET);
if (!smm_property)
- return EFI_OUT_OF_RESOURCES;
+ return EFI_DEVICE_ERROR;
memcpy(&smm_property->guid, vendor, sizeof(smm_property->guid));
smm_property->name_size = name_size;
@@ -282,7 +277,7 @@ static efi_status_t get_property_int(u16 *name, size_t name_size,
memcpy(var_property, &smm_property->property, sizeof(*var_property));
out:
- kfree(comm_buf);
+ free_pages_exact(comm_buf, COMM_BUF_SIZE(payload_size));
return ret;
}
@@ -315,9 +310,9 @@ static efi_status_t tee_get_variable(u16 *name, efi_guid_t *vendor,
payload_size = MM_VARIABLE_ACCESS_HEADER_SIZE + name_size + tmp_dsize;
var_acc = setup_mm_hdr(&comm_buf, payload_size,
- SMM_VARIABLE_FUNCTION_GET_VARIABLE, &ret);
+ SMM_VARIABLE_FUNCTION_GET_VARIABLE);
if (!var_acc)
- return EFI_OUT_OF_RESOURCES;
+ return EFI_DEVICE_ERROR;
/* Fill in contents */
memcpy(&var_acc->guid, vendor, sizeof(var_acc->guid));
@@ -347,7 +342,7 @@ static efi_status_t tee_get_variable(u16 *name, efi_guid_t *vendor,
memcpy(data, (u8 *)var_acc->name + var_acc->name_size,
var_acc->data_size);
out:
- kfree(comm_buf);
+ free_pages_exact(comm_buf, COMM_BUF_SIZE(payload_size));
return ret;
}
@@ -380,10 +375,9 @@ static efi_status_t tee_get_next_variable(unsigned long *name_size,
payload_size = MM_VARIABLE_GET_NEXT_HEADER_SIZE + out_name_size;
var_getnext = setup_mm_hdr(&comm_buf, payload_size,
- SMM_VARIABLE_FUNCTION_GET_NEXT_VARIABLE_NAME,
- &ret);
+ SMM_VARIABLE_FUNCTION_GET_NEXT_VARIABLE_NAME);
if (!var_getnext)
- return EFI_OUT_OF_RESOURCES;
+ return EFI_DEVICE_ERROR;
/* Fill in contents */
memcpy(&var_getnext->guid, guid, sizeof(var_getnext->guid));
@@ -404,7 +398,7 @@ static efi_status_t tee_get_next_variable(unsigned long *name_size,
memcpy(name, var_getnext->name, var_getnext->name_size);
out:
- kfree(comm_buf);
+ free_pages_exact(comm_buf, COMM_BUF_SIZE(payload_size));
return ret;
}
@@ -437,9 +431,9 @@ static efi_status_t tee_set_variable(efi_char16_t *name, efi_guid_t *vendor,
* the properties, if the allocation fails
*/
var_acc = setup_mm_hdr(&comm_buf, payload_size,
- SMM_VARIABLE_FUNCTION_SET_VARIABLE, &ret);
+ SMM_VARIABLE_FUNCTION_SET_VARIABLE);
if (!var_acc)
- return EFI_OUT_OF_RESOURCES;
+ return EFI_DEVICE_ERROR;
/*
* The API has the ability to override RO flags. If no RO check was
@@ -467,7 +461,7 @@ static efi_status_t tee_set_variable(efi_char16_t *name, efi_guid_t *vendor,
ret = mm_communicate(comm_buf, payload_size);
dev_dbg(pvt_data.dev, "Set Variable %s %d %lx\n", __FILE__, __LINE__, ret);
out:
- kfree(comm_buf);
+ free_pages_exact(comm_buf, COMM_BUF_SIZE(payload_size));
return ret;
}
@@ -492,10 +486,9 @@ static efi_status_t tee_query_variable_info(u32 attributes,
payload_size = sizeof(*mm_query_info);
mm_query_info = setup_mm_hdr(&comm_buf, payload_size,
- SMM_VARIABLE_FUNCTION_QUERY_VARIABLE_INFO,
- &ret);
+ SMM_VARIABLE_FUNCTION_QUERY_VARIABLE_INFO);
if (!mm_query_info)
- return EFI_OUT_OF_RESOURCES;
+ return EFI_DEVICE_ERROR;
mm_query_info->attr = attributes;
ret = mm_communicate(comm_buf, payload_size);
@@ -507,7 +500,7 @@ static efi_status_t tee_query_variable_info(u32 attributes,
*max_variable_size = mm_query_info->max_variable_size;
out:
- kfree(comm_buf);
+ free_pages_exact(comm_buf, COMM_BUF_SIZE(payload_size));
return ret;
}
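The conversion above replaces kzalloc() with alloc_pages_exact(), presumably because the StandAloneMM communication buffer needs page-backed memory; the key detail is that free_pages_exact() must be given the same size as the allocation, which is why COMM_BUF_SIZE() is factored out and reused on every free path. A minimal sketch with a hypothetical HDR_SIZE and helper names:

#include <linux/gfp.h>

#define HDR_SIZE		64	/* assumed fixed header size for this sketch */
#define BUF_SIZE(payload)	(HDR_SIZE + (payload))

static void *example_alloc_comm_buf(size_t payload)
{
	/*
	 * alloc_pages_exact() returns zeroed, physically contiguous,
	 * page-backed memory without rounding up to a power-of-two
	 * number of pages.
	 */
	return alloc_pages_exact(BUF_SIZE(payload), GFP_KERNEL | __GFP_ZERO);
}

static void example_free_comm_buf(void *buf, size_t payload)
{
	/* The exact allocation size must be passed back on free. */
	free_pages_exact(buf, BUF_SIZE(payload));
}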
diff --git a/drivers/firmware/tegra/bpmp-tegra186.c b/drivers/firmware/tegra/bpmp-tegra186.c
index 7cfc5fdfa49d..64863db7a715 100644
--- a/drivers/firmware/tegra/bpmp-tegra186.c
+++ b/drivers/firmware/tegra/bpmp-tegra186.c
@@ -198,7 +198,10 @@ static int tegra186_bpmp_dram_init(struct tegra_bpmp *bpmp)
err = of_reserved_mem_region_to_resource(bpmp->dev->of_node, 0, &res);
if (err < 0) {
- dev_warn(bpmp->dev, "failed to parse memory region: %d\n", err);
+ if (err != -ENODEV)
+ dev_warn(bpmp->dev,
+ "failed to parse memory region: %d\n", err);
+
return err;
}
diff --git a/drivers/fpga/zynq-fpga.c b/drivers/fpga/zynq-fpga.c
index 0be0d569589d..b7629a0e4813 100644
--- a/drivers/fpga/zynq-fpga.c
+++ b/drivers/fpga/zynq-fpga.c
@@ -405,12 +405,12 @@ static int zynq_fpga_ops_write(struct fpga_manager *mgr, struct sg_table *sgt)
}
}
- priv->dma_nelms =
- dma_map_sgtable(mgr->dev.parent, sgt, DMA_TO_DEVICE, 0);
- if (priv->dma_nelms == 0) {
+ err = dma_map_sgtable(mgr->dev.parent, sgt, DMA_TO_DEVICE, 0);
+ if (err) {
dev_err(&mgr->dev, "Unable to DMA map (TO_DEVICE)\n");
- return -ENOMEM;
+ return err;
}
+ priv->dma_nelms = sgt->nents;
/* enable clock */
err = clk_enable(priv->clk);
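The zynq-fpga fix above matters because dma_map_sgtable() follows a different convention than the older dma_map_sg(): it returns 0 or a negative errno instead of a mapped-entry count, and the number of mapped DMA segments is read back from sgt->nents. A minimal sketch of that usage with hypothetical names:

#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>

static int example_map(struct device *dev, struct sg_table *sgt,
		       unsigned int *nelms)
{
	int err;

	err = dma_map_sgtable(dev, sgt, DMA_TO_DEVICE, 0);
	if (err)
		return err;	/* nothing was mapped */

	*nelms = sgt->nents;	/* number of mapped DMA segments */
	return 0;
}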
diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index e43abb322fa6..23b5820e1854 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -3,6 +3,9 @@
# GPIO infrastructure and drivers
#
+config GPIOLIB_LEGACY
+ def_bool y
+
menuconfig GPIOLIB
bool "GPIO Support"
help
@@ -12,9 +15,6 @@ menuconfig GPIOLIB
If unsure, say N.
-config GPIOLIB_LEGACY
- def_bool y
-
if GPIOLIB
config GPIOLIB_FASTPATH_LIMIT
@@ -485,7 +485,6 @@ config GPIO_MM_LANTIQ
config GPIO_MPC5200
def_bool y
depends on PPC_MPC52xx
- select OF_GPIO_MM_GPIOCHIP
config GPIO_MPC8XXX
bool "MPC512x/MPC8xxx/QorIQ GPIO support"
diff --git a/drivers/gpio/gpio-mlxbf3.c b/drivers/gpio/gpio-mlxbf3.c
index 9875e34bde72..ed29b07d16c1 100644
--- a/drivers/gpio/gpio-mlxbf3.c
+++ b/drivers/gpio/gpio-mlxbf3.c
@@ -190,9 +190,7 @@ static int mlxbf3_gpio_probe(struct platform_device *pdev)
struct mlxbf3_gpio_context *gs;
struct gpio_irq_chip *girq;
struct gpio_chip *gc;
- char *colon_ptr;
int ret, irq;
- long num;
gs = devm_kzalloc(dev, sizeof(*gs), GFP_KERNEL);
if (!gs)
@@ -229,39 +227,25 @@ static int mlxbf3_gpio_probe(struct platform_device *pdev)
gc->owner = THIS_MODULE;
gc->add_pin_ranges = mlxbf3_gpio_add_pin_ranges;
- colon_ptr = strchr(dev_name(dev), ':');
- if (!colon_ptr) {
- dev_err(dev, "invalid device name format\n");
- return -EINVAL;
- }
-
- ret = kstrtol(++colon_ptr, 16, &num);
- if (ret) {
- dev_err(dev, "invalid device instance\n");
- return ret;
- }
-
- if (!num) {
- irq = platform_get_irq(pdev, 0);
- if (irq >= 0) {
- girq = &gs->gc.irq;
- gpio_irq_chip_set_chip(girq, &gpio_mlxbf3_irqchip);
- girq->default_type = IRQ_TYPE_NONE;
- /* This will let us handle the parent IRQ in the driver */
- girq->num_parents = 0;
- girq->parents = NULL;
- girq->parent_handler = NULL;
- girq->handler = handle_bad_irq;
-
- /*
- * Directly request the irq here instead of passing
- * a flow-handler because the irq is shared.
- */
- ret = devm_request_irq(dev, irq, mlxbf3_gpio_irq_handler,
- IRQF_SHARED, dev_name(dev), gs);
- if (ret)
- return dev_err_probe(dev, ret, "failed to request IRQ");
- }
+ irq = platform_get_irq_optional(pdev, 0);
+ if (irq >= 0) {
+ girq = &gs->gc.irq;
+ gpio_irq_chip_set_chip(girq, &gpio_mlxbf3_irqchip);
+ girq->default_type = IRQ_TYPE_NONE;
+ /* This will let us handle the parent IRQ in the driver */
+ girq->num_parents = 0;
+ girq->parents = NULL;
+ girq->parent_handler = NULL;
+ girq->handler = handle_bad_irq;
+
+ /*
+ * Directly request the irq here instead of passing
+ * a flow-handler because the irq is shared.
+ */
+ ret = devm_request_irq(dev, irq, mlxbf3_gpio_irq_handler,
+ IRQF_SHARED, dev_name(dev), gs);
+ if (ret)
+ return dev_err_probe(dev, ret, "failed to request IRQ");
}
platform_set_drvdata(pdev, gs);
diff --git a/drivers/gpio/gpio-mpc5200.c b/drivers/gpio/gpio-mpc5200.c
index dad0eca1ca2e..00f209157fd0 100644
--- a/drivers/gpio/gpio-mpc5200.c
+++ b/drivers/gpio/gpio-mpc5200.c
@@ -8,7 +8,7 @@
#include <linux/of.h>
#include <linux/kernel.h>
#include <linux/slab.h>
-#include <linux/gpio/legacy-of-mm-gpiochip.h>
+#include <linux/gpio/driver.h>
#include <linux/io.h>
#include <linux/platform_device.h>
#include <linux/module.h>
@@ -19,7 +19,8 @@
static DEFINE_SPINLOCK(gpio_lock);
struct mpc52xx_gpiochip {
- struct of_mm_gpio_chip mmchip;
+ struct gpio_chip gc;
+ void __iomem *regs;
unsigned int shadow_dvo;
unsigned int shadow_gpioe;
unsigned int shadow_ddr;
@@ -43,8 +44,8 @@ struct mpc52xx_gpiochip {
*/
static int mpc52xx_wkup_gpio_get(struct gpio_chip *gc, unsigned int gpio)
{
- struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc);
- struct mpc52xx_gpio_wkup __iomem *regs = mm_gc->regs;
+ struct mpc52xx_gpiochip *chip = gpiochip_get_data(gc);
+ struct mpc52xx_gpio_wkup __iomem *regs = chip->regs;
unsigned int ret;
ret = (in_8(&regs->wkup_ival) >> (7 - gpio)) & 1;
@@ -57,9 +58,8 @@ static int mpc52xx_wkup_gpio_get(struct gpio_chip *gc, unsigned int gpio)
static inline void
__mpc52xx_wkup_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val)
{
- struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc);
struct mpc52xx_gpiochip *chip = gpiochip_get_data(gc);
- struct mpc52xx_gpio_wkup __iomem *regs = mm_gc->regs;
+ struct mpc52xx_gpio_wkup __iomem *regs = chip->regs;
if (val)
chip->shadow_dvo |= 1 << (7 - gpio);
@@ -87,9 +87,8 @@ mpc52xx_wkup_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val)
static int mpc52xx_wkup_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio)
{
- struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc);
struct mpc52xx_gpiochip *chip = gpiochip_get_data(gc);
- struct mpc52xx_gpio_wkup __iomem *regs = mm_gc->regs;
+ struct mpc52xx_gpio_wkup __iomem *regs = chip->regs;
unsigned long flags;
spin_lock_irqsave(&gpio_lock, flags);
@@ -110,9 +109,8 @@ static int mpc52xx_wkup_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio)
static int
mpc52xx_wkup_gpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val)
{
- struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc);
- struct mpc52xx_gpio_wkup __iomem *regs = mm_gc->regs;
struct mpc52xx_gpiochip *chip = gpiochip_get_data(gc);
+ struct mpc52xx_gpio_wkup __iomem *regs = chip->regs;
unsigned long flags;
spin_lock_irqsave(&gpio_lock, flags);
@@ -136,30 +134,41 @@ mpc52xx_wkup_gpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val)
static int mpc52xx_wkup_gpiochip_probe(struct platform_device *ofdev)
{
+ struct device *dev = &ofdev->dev;
+ struct device_node *np = dev->of_node;
struct mpc52xx_gpiochip *chip;
struct mpc52xx_gpio_wkup __iomem *regs;
struct gpio_chip *gc;
int ret;
- chip = devm_kzalloc(&ofdev->dev, sizeof(*chip), GFP_KERNEL);
+ chip = devm_kzalloc(dev, sizeof(*chip), GFP_KERNEL);
if (!chip)
return -ENOMEM;
platform_set_drvdata(ofdev, chip);
- gc = &chip->mmchip.gc;
+ gc = &chip->gc;
+ gc->base = -1;
gc->ngpio = 8;
gc->direction_input = mpc52xx_wkup_gpio_dir_in;
gc->direction_output = mpc52xx_wkup_gpio_dir_out;
gc->get = mpc52xx_wkup_gpio_get;
gc->set = mpc52xx_wkup_gpio_set;
- ret = of_mm_gpiochip_add_data(ofdev->dev.of_node, &chip->mmchip, chip);
+ gc->label = devm_kasprintf(dev, GFP_KERNEL, "%pOF", np);
+ if (!gc->label)
+ return -ENOMEM;
+
+ chip->regs = devm_of_iomap(dev, np, 0, NULL);
+ if (IS_ERR(chip->regs))
+ return PTR_ERR(chip->regs);
+
+ ret = devm_gpiochip_add_data(dev, gc, chip);
if (ret)
return ret;
- regs = chip->mmchip.regs;
+ regs = chip->regs;
chip->shadow_gpioe = in_8(&regs->wkup_gpioe);
chip->shadow_ddr = in_8(&regs->wkup_ddr);
chip->shadow_dvo = in_8(&regs->wkup_dvo);
@@ -167,13 +176,6 @@ static int mpc52xx_wkup_gpiochip_probe(struct platform_device *ofdev)
return 0;
}
-static void mpc52xx_gpiochip_remove(struct platform_device *ofdev)
-{
- struct mpc52xx_gpiochip *chip = platform_get_drvdata(ofdev);
-
- of_mm_gpiochip_remove(&chip->mmchip);
-}
-
static const struct of_device_id mpc52xx_wkup_gpiochip_match[] = {
{ .compatible = "fsl,mpc5200-gpio-wkup", },
{}
@@ -185,7 +187,6 @@ static struct platform_driver mpc52xx_wkup_gpiochip_driver = {
.of_match_table = mpc52xx_wkup_gpiochip_match,
},
.probe = mpc52xx_wkup_gpiochip_probe,
- .remove = mpc52xx_gpiochip_remove,
};
/*
@@ -207,8 +208,8 @@ static struct platform_driver mpc52xx_wkup_gpiochip_driver = {
*/
static int mpc52xx_simple_gpio_get(struct gpio_chip *gc, unsigned int gpio)
{
- struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc);
- struct mpc52xx_gpio __iomem *regs = mm_gc->regs;
+ struct mpc52xx_gpiochip *chip = gpiochip_get_data(gc);
+ struct mpc52xx_gpio __iomem *regs = chip->regs;
unsigned int ret;
ret = (in_be32(&regs->simple_ival) >> (31 - gpio)) & 1;
@@ -219,9 +220,8 @@ static int mpc52xx_simple_gpio_get(struct gpio_chip *gc, unsigned int gpio)
static inline void
__mpc52xx_simple_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val)
{
- struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc);
struct mpc52xx_gpiochip *chip = gpiochip_get_data(gc);
- struct mpc52xx_gpio __iomem *regs = mm_gc->regs;
+ struct mpc52xx_gpio __iomem *regs = chip->regs;
if (val)
chip->shadow_dvo |= 1 << (31 - gpio);
@@ -248,9 +248,8 @@ mpc52xx_simple_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val)
static int mpc52xx_simple_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio)
{
- struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc);
struct mpc52xx_gpiochip *chip = gpiochip_get_data(gc);
- struct mpc52xx_gpio __iomem *regs = mm_gc->regs;
+ struct mpc52xx_gpio __iomem *regs = chip->regs;
unsigned long flags;
spin_lock_irqsave(&gpio_lock, flags);
@@ -271,9 +270,8 @@ static int mpc52xx_simple_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio)
static int
mpc52xx_simple_gpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val)
{
- struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc);
struct mpc52xx_gpiochip *chip = gpiochip_get_data(gc);
- struct mpc52xx_gpio __iomem *regs = mm_gc->regs;
+ struct mpc52xx_gpio __iomem *regs = chip->regs;
unsigned long flags;
spin_lock_irqsave(&gpio_lock, flags);
@@ -298,30 +296,41 @@ mpc52xx_simple_gpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val)
static int mpc52xx_simple_gpiochip_probe(struct platform_device *ofdev)
{
+ struct device *dev = &ofdev->dev;
+ struct device_node *np = dev->of_node;
struct mpc52xx_gpiochip *chip;
struct gpio_chip *gc;
struct mpc52xx_gpio __iomem *regs;
int ret;
- chip = devm_kzalloc(&ofdev->dev, sizeof(*chip), GFP_KERNEL);
+ chip = devm_kzalloc(dev, sizeof(*chip), GFP_KERNEL);
if (!chip)
return -ENOMEM;
platform_set_drvdata(ofdev, chip);
- gc = &chip->mmchip.gc;
+ gc = &chip->gc;
+ gc->base = -1;
gc->ngpio = 32;
gc->direction_input = mpc52xx_simple_gpio_dir_in;
gc->direction_output = mpc52xx_simple_gpio_dir_out;
gc->get = mpc52xx_simple_gpio_get;
gc->set = mpc52xx_simple_gpio_set;
- ret = of_mm_gpiochip_add_data(ofdev->dev.of_node, &chip->mmchip, chip);
+ gc->label = devm_kasprintf(dev, GFP_KERNEL, "%pOF", np);
+ if (!gc->label)
+ return -ENOMEM;
+
+ chip->regs = devm_of_iomap(dev, np, 0, NULL);
+ if (IS_ERR(chip->regs))
+ return PTR_ERR(chip->regs);
+
+ ret = devm_gpiochip_add_data(dev, gc, chip);
if (ret)
return ret;
- regs = chip->mmchip.regs;
+ regs = chip->regs;
chip->shadow_gpioe = in_be32(&regs->simple_gpioe);
chip->shadow_ddr = in_be32(&regs->simple_ddr);
chip->shadow_dvo = in_be32(&regs->simple_dvo);
@@ -340,7 +349,6 @@ static struct platform_driver mpc52xx_simple_gpiochip_driver = {
.of_match_table = mpc52xx_simple_gpiochip_match,
},
.probe = mpc52xx_simple_gpiochip_probe,
- .remove = mpc52xx_gpiochip_remove,
};
static struct platform_driver * const drivers[] = {
diff --git a/drivers/gpio/gpio-regmap.c b/drivers/gpio/gpio-regmap.c
index e8a32dfebdcb..3f8b72311f8e 100644
--- a/drivers/gpio/gpio-regmap.c
+++ b/drivers/gpio/gpio-regmap.c
@@ -274,7 +274,7 @@ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config
if (!chip->ngpio) {
ret = gpiochip_get_ngpios(chip, chip->parent);
if (ret)
- return ERR_PTR(ret);
+ goto err_free_gpio;
}
/* if not set, assume there is only one register */
diff --git a/drivers/gpio/gpio-timberdale.c b/drivers/gpio/gpio-timberdale.c
index 679e27f00ff6..f488939dd00a 100644
--- a/drivers/gpio/gpio-timberdale.c
+++ b/drivers/gpio/gpio-timberdale.c
@@ -137,7 +137,7 @@ static int timbgpio_irq_type(struct irq_data *d, unsigned trigger)
u32 ver;
int ret = 0;
- if (offset < 0 || offset > tgpio->gpio.ngpio)
+ if (offset < 0 || offset >= tgpio->gpio.ngpio)
return -EINVAL;
ver = ioread32(tgpio->membase + TGPIO_VER);
diff --git a/drivers/gpio/gpiolib-acpi-core.c b/drivers/gpio/gpiolib-acpi-core.c
index 12b24a717e43..284e762d92c4 100644
--- a/drivers/gpio/gpiolib-acpi-core.c
+++ b/drivers/gpio/gpiolib-acpi-core.c
@@ -942,8 +942,9 @@ struct gpio_desc *acpi_find_gpio(struct fwnode_handle *fwnode,
{
struct acpi_device *adev = to_acpi_device_node(fwnode);
bool can_fallback = acpi_can_fallback_to_crs(adev, con_id);
- struct acpi_gpio_info info;
+ struct acpi_gpio_info info = {};
struct gpio_desc *desc;
+ int ret;
desc = __acpi_find_gpio(fwnode, con_id, idx, can_fallback, &info);
if (IS_ERR(desc))
@@ -957,6 +958,12 @@ struct gpio_desc *acpi_find_gpio(struct fwnode_handle *fwnode,
acpi_gpio_update_gpiod_flags(dflags, &info);
acpi_gpio_update_gpiod_lookup_flags(lookupflags, &info);
+
+ /* ACPI uses hundredths of milliseconds units */
+ ret = gpio_set_debounce_timeout(desc, info.debounce * 10);
+ if (ret)
+ return ERR_PTR(ret);
+
return desc;
}
@@ -992,7 +999,7 @@ int acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *con_id,
int ret;
for (i = 0, idx = 0; idx <= index; i++) {
- struct acpi_gpio_info info;
+ struct acpi_gpio_info info = {};
struct gpio_desc *desc;
/* Ignore -EPROBE_DEFER, it only matters if idx matches */
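The new gpio_set_debounce_timeout() call above depends on a unit conversion: the ACPI GpioInt/GpioIo debounce field is expressed in hundredths of a millisecond (10 us units), while gpiolib expects microseconds. A hypothetical helper just to make the arithmetic explicit:

#include <linux/types.h>

/* e.g. a firmware value of 250 becomes 250 * 10 = 2500 us, i.e. 2.5 ms */
static inline unsigned int acpi_debounce_to_us(u16 acpi_debounce)
{
	return acpi_debounce * 10;
}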
diff --git a/drivers/gpio/gpiolib-acpi-quirks.c b/drivers/gpio/gpiolib-acpi-quirks.c
index c13545dce349..7b95d1b03361 100644
--- a/drivers/gpio/gpiolib-acpi-quirks.c
+++ b/drivers/gpio/gpiolib-acpi-quirks.c
@@ -319,6 +319,18 @@ static const struct dmi_system_id gpiolib_acpi_quirks[] __initconst = {
},
{
/*
+ * Same as G1619-04. New model.
+ */
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "GPD"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "G1619-05"),
+ },
+ .driver_data = &(struct acpi_gpiolib_dmi_quirk) {
+ .ignore_wake = "PNP0C50:00@8",
+ },
+ },
+ {
+ /*
* Spurious wakeups from GPIO 11
* Found in BIOS 1.04
* https://gitlab.freedesktop.org/drm/amd/-/issues/3954
@@ -344,6 +356,20 @@ static const struct dmi_system_id gpiolib_acpi_quirks[] __initconst = {
.ignore_interrupt = "AMDI0030:00@8",
},
},
+ {
+ /*
+ * Spurious wakeups from TP_ATTN# pin
+ * Found in BIOS 5.35
+ * https://gitlab.freedesktop.org/drm/amd/-/issues/4482
+ */
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+ DMI_MATCH(DMI_PRODUCT_FAMILY, "ProArt PX13"),
+ },
+ .driver_data = &(struct acpi_gpiolib_dmi_quirk) {
+ .ignore_wake = "ASCP1A00:00@8",
+ },
+ },
{} /* Terminating entry */
};
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 0d2b470a252e..74d54513730a 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -4604,6 +4604,23 @@ static struct gpio_desc *gpiod_find_by_fwnode(struct fwnode_handle *fwnode,
return desc;
}
+static struct gpio_desc *gpiod_fwnode_lookup(struct fwnode_handle *fwnode,
+ struct device *consumer,
+ const char *con_id,
+ unsigned int idx,
+ enum gpiod_flags *flags,
+ unsigned long *lookupflags)
+{
+ struct gpio_desc *desc;
+
+ desc = gpiod_find_by_fwnode(fwnode, consumer, con_id, idx, flags, lookupflags);
+ if (gpiod_not_found(desc) && !IS_ERR_OR_NULL(fwnode))
+ desc = gpiod_find_by_fwnode(fwnode->secondary, consumer, con_id,
+ idx, flags, lookupflags);
+
+ return desc;
+}
+
struct gpio_desc *gpiod_find_and_request(struct device *consumer,
struct fwnode_handle *fwnode,
const char *con_id,
@@ -4622,8 +4639,8 @@ struct gpio_desc *gpiod_find_and_request(struct device *consumer,
int ret = 0;
scoped_guard(srcu, &gpio_devices_srcu) {
- desc = gpiod_find_by_fwnode(fwnode, consumer, con_id, idx,
- &flags, &lookupflags);
+ desc = gpiod_fwnode_lookup(fwnode, consumer, con_id, idx,
+ &flags, &lookupflags);
if (gpiod_not_found(desc) && platform_lookup_allowed) {
/*
* Either we are not using DT or ACPI, or their lookup
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index fbe7616555c8..a2879d2b7c8e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -250,16 +250,24 @@ void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev,
void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool suspend_proc)
{
- if (adev->kfd.dev)
- kgd2kfd_suspend(adev->kfd.dev, suspend_proc);
+ if (adev->kfd.dev) {
+ if (adev->in_s0ix)
+ kgd2kfd_stop_sched_all_nodes(adev->kfd.dev);
+ else
+ kgd2kfd_suspend(adev->kfd.dev, suspend_proc);
+ }
}
int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool resume_proc)
{
int r = 0;
- if (adev->kfd.dev)
- r = kgd2kfd_resume(adev->kfd.dev, resume_proc);
+ if (adev->kfd.dev) {
+ if (adev->in_s0ix)
+ r = kgd2kfd_start_sched_all_nodes(adev->kfd.dev);
+ else
+ r = kgd2kfd_resume(adev->kfd.dev, resume_proc);
+ }
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 33eb4826b58b..aa88bad7416b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -426,7 +426,9 @@ void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint64_t throttle_bitmask);
int kgd2kfd_check_and_lock_kfd(struct kfd_dev *kfd);
void kgd2kfd_unlock_kfd(struct kfd_dev *kfd);
int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id);
+int kgd2kfd_start_sched_all_nodes(struct kfd_dev *kfd);
int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id);
+int kgd2kfd_stop_sched_all_nodes(struct kfd_dev *kfd);
bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id);
bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry,
bool retry_fault);
@@ -516,11 +518,21 @@ static inline int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id)
return 0;
}
+static inline int kgd2kfd_start_sched_all_nodes(struct kfd_dev *kfd)
+{
+ return 0;
+}
+
static inline int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id)
{
return 0;
}
+static inline int kgd2kfd_stop_sched_all_nodes(struct kfd_dev *kfd)
+{
+ return 0;
+}
+
static inline bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id)
{
return false;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 260165bbe373..b16cce7c22c3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -213,19 +213,35 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
spin_lock(&kfd_mem_limit.mem_limit_lock);
if (kfd_mem_limit.system_mem_used + system_mem_needed >
- kfd_mem_limit.max_system_mem_limit)
+ kfd_mem_limit.max_system_mem_limit) {
pr_debug("Set no_system_mem_limit=1 if using shared memory\n");
+ if (!no_system_mem_limit) {
+ ret = -ENOMEM;
+ goto release;
+ }
+ }
- if ((kfd_mem_limit.system_mem_used + system_mem_needed >
- kfd_mem_limit.max_system_mem_limit && !no_system_mem_limit) ||
- (kfd_mem_limit.ttm_mem_used + ttm_mem_needed >
- kfd_mem_limit.max_ttm_mem_limit) ||
- (adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] + vram_needed >
- vram_size - reserved_for_pt - reserved_for_ras - atomic64_read(&adev->vram_pin_size))) {
+ if (kfd_mem_limit.ttm_mem_used + ttm_mem_needed >
+ kfd_mem_limit.max_ttm_mem_limit) {
ret = -ENOMEM;
goto release;
}
+ /* If is_app_apu is false and apu_prefer_gtt is true, it is an APU with a
+ * carve-out smaller than GTT. In that case VRAM allocations go to the GTT
+ * domain, so skip the VRAM check since the ttm_mem_limit check above already
+ * covers this allocation.
+ */
+
+ if (adev && xcp_id >= 0 && (!adev->apu_prefer_gtt || adev->gmc.is_app_apu)) {
+ uint64_t vram_available =
+ vram_size - reserved_for_pt - reserved_for_ras -
+ atomic64_read(&adev->vram_pin_size);
+ if (adev->kfd.vram_used[xcp_id] + vram_needed > vram_available) {
+ ret = -ENOMEM;
+ goto release;
+ }
+ }
+
/* Update memory accounting by decreasing available system
* memory, TTM memory and GPU memory as computed above
*/
@@ -1626,11 +1642,15 @@ size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev,
uint64_t vram_available, system_mem_available, ttm_mem_available;
spin_lock(&kfd_mem_limit.mem_limit_lock);
- vram_available = KFD_XCP_MEMORY_SIZE(adev, xcp_id)
- - adev->kfd.vram_used_aligned[xcp_id]
- - atomic64_read(&adev->vram_pin_size)
- - reserved_for_pt
- - reserved_for_ras;
+ if (adev->apu_prefer_gtt && !adev->gmc.is_app_apu)
+ vram_available = KFD_XCP_MEMORY_SIZE(adev, xcp_id)
+ - adev->kfd.vram_used_aligned[xcp_id];
+ else
+ vram_available = KFD_XCP_MEMORY_SIZE(adev, xcp_id)
+ - adev->kfd.vram_used_aligned[xcp_id]
+ - atomic64_read(&adev->vram_pin_size)
+ - reserved_for_pt
+ - reserved_for_ras;
if (adev->apu_prefer_gtt) {
system_mem_available = no_system_mem_limit ?
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index a2adaacf6adb..d3f220be2ef9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -1139,6 +1139,9 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
}
}
+ if (!amdgpu_vm_ready(vm))
+ return -EINVAL;
+
r = amdgpu_vm_clear_freed(adev, vm, NULL);
if (r)
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 01d234cf8156..c8459337fcb8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5136,7 +5136,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
adev->in_suspend = true;
if (amdgpu_sriov_vf(adev)) {
- if (!adev->in_s0ix && !adev->in_runpm)
+ if (!adev->in_runpm)
amdgpu_amdkfd_suspend_process(adev);
amdgpu_virt_fini_data_exchange(adev);
r = amdgpu_virt_request_full_gpu(adev, false);
@@ -5156,10 +5156,8 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
amdgpu_device_ip_suspend_phase1(adev);
- if (!adev->in_s0ix) {
- amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
- amdgpu_userq_suspend(adev);
- }
+ amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
+ amdgpu_userq_suspend(adev);
r = amdgpu_device_evict_resources(adev);
if (r)
@@ -5254,15 +5252,13 @@ int amdgpu_device_resume(struct drm_device *dev, bool notify_clients)
goto exit;
}
- if (!adev->in_s0ix) {
- r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
- if (r)
- goto exit;
+ r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
+ if (r)
+ goto exit;
- r = amdgpu_userq_resume(adev);
- if (r)
- goto exit;
- }
+ r = amdgpu_userq_resume(adev);
+ if (r)
+ goto exit;
r = amdgpu_device_ip_late_init(adev);
if (r)
@@ -5275,7 +5271,7 @@ exit:
amdgpu_virt_init_data_exchange(adev);
amdgpu_virt_release_full_gpu(adev, true);
- if (!adev->in_s0ix && !r && !adev->in_runpm)
+ if (!r && !adev->in_runpm)
r = amdgpu_amdkfd_resume_process(adev);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
index ff98c87b2e0b..ce27cb5bb05e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
@@ -285,6 +285,36 @@ static int amdgpu_dma_buf_begin_cpu_access(struct dma_buf *dma_buf,
return ret;
}
+static int amdgpu_dma_buf_vmap(struct dma_buf *dma_buf, struct iosys_map *map)
+{
+ struct drm_gem_object *obj = dma_buf->priv;
+ struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);
+ int ret;
+
+ /*
+ * Pin to keep buffer in place while it's vmap'ed. The actual
+ * domain is not that important as long as it's mappable. Using
+ * GTT and VRAM should be compatible with most use cases.
+ */
+ ret = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT | AMDGPU_GEM_DOMAIN_VRAM);
+ if (ret)
+ return ret;
+ ret = drm_gem_dmabuf_vmap(dma_buf, map);
+ if (ret)
+ amdgpu_bo_unpin(bo);
+
+ return ret;
+}
+
+static void amdgpu_dma_buf_vunmap(struct dma_buf *dma_buf, struct iosys_map *map)
+{
+ struct drm_gem_object *obj = dma_buf->priv;
+ struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);
+
+ drm_gem_dmabuf_vunmap(dma_buf, map);
+ amdgpu_bo_unpin(bo);
+}
+
const struct dma_buf_ops amdgpu_dmabuf_ops = {
.attach = amdgpu_dma_buf_attach,
.pin = amdgpu_dma_buf_pin,
@@ -294,8 +324,8 @@ const struct dma_buf_ops amdgpu_dmabuf_ops = {
.release = drm_gem_dmabuf_release,
.begin_cpu_access = amdgpu_dma_buf_begin_cpu_access,
.mmap = drm_gem_dmabuf_mmap,
- .vmap = drm_gem_dmabuf_vmap,
- .vunmap = drm_gem_dmabuf_vunmap,
+ .vmap = amdgpu_dma_buf_vmap,
+ .vunmap = amdgpu_dma_buf_vunmap,
};
/**
@@ -514,7 +544,7 @@ bool amdgpu_dmabuf_is_xgmi_accessible(struct amdgpu_device *adev,
return false;
if (drm_gem_is_imported(obj)) {
- struct dma_buf *dma_buf = obj->dma_buf;
+ struct dma_buf *dma_buf = obj->import_attach->dmabuf;
if (dma_buf->ops != &amdgpu_dmabuf_ops)
/* No XGMI with non AMD GPUs */
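With the amdgpu_dma_buf_vmap()/amdgpu_dma_buf_vunmap() ops added above, the BO stays pinned for as long as the CPU mapping exists. A minimal sketch of how an importer might drive this (hypothetical caller; the _unlocked variants take care of the reservation lock, and error/lifetime handling is elided):

#include <linux/dma-buf.h>
#include <linux/iosys-map.h>

static int example_cpu_peek(struct dma_buf *buf)
{
	struct iosys_map map;
	int ret;

	ret = dma_buf_vmap_unlocked(buf, &map);
	if (ret)
		return ret;

	/* CPU access through map.vaddr (or map.vaddr_iomem for I/O memory) */

	dma_buf_vunmap_unlocked(buf, &map);
	return 0;
}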
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 6626a6e64ff5..d1ccbfcf21fa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -317,7 +317,8 @@ static int amdgpu_gem_object_open(struct drm_gem_object *obj,
*/
if (!vm->is_compute_context || !vm->process_info)
return 0;
- if (!drm_gem_is_imported(obj) || !dma_buf_is_dynamic(obj->dma_buf))
+ if (!drm_gem_is_imported(obj) ||
+ !dma_buf_is_dynamic(obj->import_attach->dmabuf))
return 0;
mutex_lock_nested(&vm->process_info->lock, 1);
if (!WARN_ON(!vm->process_info->eviction_fence)) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 0bd51a04be79..693357caa9a8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -448,7 +448,7 @@ static int psp_sw_init(struct amdgpu_ip_block *ip_block)
psp->cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL);
if (!psp->cmd) {
dev_err(adev->dev, "Failed to allocate memory to command buffer!\n");
- ret = -ENOMEM;
+ return -ENOMEM;
}
adev->psp.xgmi_context.supports_extended_data =
@@ -1039,15 +1039,28 @@ int psp_update_fw_reservation(struct psp_context *psp)
{
int ret;
uint64_t reserv_addr, reserv_addr_ext;
- uint32_t reserv_size, reserv_size_ext;
+ uint32_t reserv_size, reserv_size_ext, mp0_ip_ver;
struct amdgpu_device *adev = psp->adev;
+ mp0_ip_ver = amdgpu_ip_version(adev, MP0_HWIP, 0);
+
if (amdgpu_sriov_vf(psp->adev))
return 0;
- if ((amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(14, 0, 2)) &&
- (amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(14, 0, 3)))
+ switch (mp0_ip_ver) {
+ case IP_VERSION(14, 0, 2):
+ if (adev->psp.sos.fw_version < 0x3b0e0d)
+ return 0;
+ break;
+
+ case IP_VERSION(14, 0, 3):
+ if (adev->psp.sos.fw_version < 0x3a0e14)
+ return 0;
+ break;
+
+ default:
return 0;
+ }
ret = psp_get_fw_reservation_info(psp, GFX_CMD_ID_FB_FW_RESERV_ADDR, &reserv_addr, &reserv_size);
if (ret)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 6379bb25bf5c..486c3646710c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -421,8 +421,6 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring)
dma_fence_put(ring->vmid_wait);
ring->vmid_wait = NULL;
ring->me = 0;
-
- ring->adev->rings[ring->idx] = NULL;
}
/**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index c3ace8030530..8190c24a649a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -471,6 +471,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
if (index == (uint64_t)-EINVAL) {
drm_file_err(uq_mgr->file, "Failed to get doorbell for queue\n");
kfree(queue);
+ r = -EINVAL;
goto unlock;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 5cacf5717016..c39bb06ebda1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -654,11 +654,10 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm,
* Check if all VM PDs/PTs are ready for updates
*
* Returns:
- * True if VM is not evicting.
+ * True if the VM is not evicting and no VM entities are stopped.
*/
bool amdgpu_vm_ready(struct amdgpu_vm *vm)
{
- bool empty;
bool ret;
amdgpu_vm_eviction_lock(vm);
@@ -666,10 +665,18 @@ bool amdgpu_vm_ready(struct amdgpu_vm *vm)
amdgpu_vm_eviction_unlock(vm);
spin_lock(&vm->status_lock);
- empty = list_empty(&vm->evicted);
+ ret &= list_empty(&vm->evicted);
spin_unlock(&vm->status_lock);
- return ret && empty;
+ spin_lock(&vm->immediate.lock);
+ ret &= !vm->immediate.stopped;
+ spin_unlock(&vm->immediate.lock);
+
+ spin_lock(&vm->delayed.lock);
+ ret &= !vm->delayed.stopped;
+ spin_unlock(&vm->delayed.lock);
+
+ return ret;
}
/**
@@ -1276,7 +1283,7 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va,
struct drm_gem_object *obj = &bo->tbo.base;
if (drm_gem_is_imported(obj) && bo_va->is_xgmi) {
- struct dma_buf *dma_buf = obj->dma_buf;
+ struct dma_buf *dma_buf = obj->import_attach->dmabuf;
struct drm_gem_object *gobj = dma_buf->priv;
struct amdgpu_bo *abo = gem_to_amdgpu_bo(gobj);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 07c936e90d8e..78f9e86ccc09 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -648,9 +648,8 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man,
list_for_each_entry(block, &vres->blocks, link)
vis_usage += amdgpu_vram_mgr_vis_size(adev, block);
- amdgpu_vram_mgr_do_reserve(man);
-
drm_buddy_free_list(mm, &vres->blocks, vres->flags);
+ amdgpu_vram_mgr_do_reserve(man);
mutex_unlock(&mgr->lock);
atomic64_sub(vis_usage, &mgr->vis_usage);
diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c
index bf7c22f81cda..ba73518f5cdf 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c
@@ -1462,17 +1462,12 @@ static int dce_v10_0_audio_init(struct amdgpu_device *adev)
static void dce_v10_0_audio_fini(struct amdgpu_device *adev)
{
- int i;
-
if (!amdgpu_audio)
return;
if (!adev->mode_info.audio.enabled)
return;
- for (i = 0; i < adev->mode_info.audio.num_pins; i++)
- dce_v10_0_audio_enable(adev, &adev->mode_info.audio.pin[i], false);
-
adev->mode_info.audio.enabled = false;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c
index 47e05783c4a0..b01d88d078fa 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c
@@ -1511,17 +1511,12 @@ static int dce_v11_0_audio_init(struct amdgpu_device *adev)
static void dce_v11_0_audio_fini(struct amdgpu_device *adev)
{
- int i;
-
if (!amdgpu_audio)
return;
if (!adev->mode_info.audio.enabled)
return;
- for (i = 0; i < adev->mode_info.audio.num_pins; i++)
- dce_v11_0_audio_enable(adev, &adev->mode_info.audio.pin[i], false);
-
adev->mode_info.audio.enabled = false;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c
index 276c025c4c03..81760a26f2ff 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c
@@ -1451,17 +1451,12 @@ static int dce_v6_0_audio_init(struct amdgpu_device *adev)
static void dce_v6_0_audio_fini(struct amdgpu_device *adev)
{
- int i;
-
if (!amdgpu_audio)
return;
if (!adev->mode_info.audio.enabled)
return;
- for (i = 0; i < adev->mode_info.audio.num_pins; i++)
- dce_v6_0_audio_enable(adev, &adev->mode_info.audio.pin[i], false);
-
adev->mode_info.audio.enabled = false;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c
index e62ccf9eb73d..19a265bd4d19 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c
@@ -1443,17 +1443,12 @@ static int dce_v8_0_audio_init(struct amdgpu_device *adev)
static void dce_v8_0_audio_fini(struct amdgpu_device *adev)
{
- int i;
-
if (!amdgpu_audio)
return;
if (!adev->mode_info.audio.enabled)
return;
- for (i = 0; i < adev->mode_info.audio.num_pins; i++)
- dce_v8_0_audio_enable(adev, &adev->mode_info.audio.pin[i], false);
-
adev->mode_info.audio.enabled = false;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index c01c241a1b06..c37527704d43 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -1612,9 +1612,9 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block)
case IP_VERSION(11, 0, 2):
case IP_VERSION(11, 0, 3):
if (!adev->gfx.disable_uq &&
- adev->gfx.me_fw_version >= 2390 &&
- adev->gfx.pfp_fw_version >= 2530 &&
- adev->gfx.mec_fw_version >= 2600 &&
+ adev->gfx.me_fw_version >= 2420 &&
+ adev->gfx.pfp_fw_version >= 2580 &&
+ adev->gfx.mec_fw_version >= 2650 &&
adev->mes.fw_version[0] >= 120) {
adev->userq_funcs[AMDGPU_HW_IP_GFX] = &userq_mes_funcs;
adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] = &userq_mes_funcs;
@@ -1654,6 +1654,21 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block)
}
}
break;
+ case IP_VERSION(11, 0, 1):
+ case IP_VERSION(11, 0, 4):
+ adev->gfx.cleaner_shader_ptr = gfx_11_0_3_cleaner_shader_hex;
+ adev->gfx.cleaner_shader_size = sizeof(gfx_11_0_3_cleaner_shader_hex);
+ if (adev->gfx.pfp_fw_version >= 102 &&
+ adev->gfx.mec_fw_version >= 66 &&
+ adev->mes.fw_version[0] >= 128) {
+ adev->gfx.enable_cleaner_shader = true;
+ r = amdgpu_gfx_cleaner_shader_sw_init(adev, adev->gfx.cleaner_shader_size);
+ if (r) {
+ adev->gfx.enable_cleaner_shader = false;
+ dev_err(adev->dev, "Failed to initialize cleaner shader\n");
+ }
+ }
+ break;
case IP_VERSION(11, 5, 0):
case IP_VERSION(11, 5, 1):
adev->gfx.cleaner_shader_ptr = gfx_11_0_3_cleaner_shader_hex;
@@ -4129,6 +4144,8 @@ static int gfx_v11_0_gfx_mqd_init(struct amdgpu_device *adev, void *m,
#endif
if (prop->tmz_queue)
tmp = REG_SET_FIELD(tmp, CP_GFX_HQD_CNTL, TMZ_MATCH, 1);
+ if (!prop->kernel_queue)
+ tmp = REG_SET_FIELD(tmp, CP_GFX_HQD_CNTL, RB_NON_PRIV, 1);
mqd->cp_gfx_hqd_cntl = tmp;
/* set up cp_doorbell_control */
@@ -4281,8 +4298,10 @@ static int gfx_v11_0_compute_mqd_init(struct amdgpu_device *adev, void *m,
tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, UNORD_DISPATCH, 1);
tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, TUNNEL_DISPATCH,
prop->allow_tunneling);
- tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, PRIV_STATE, 1);
- tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, KMD_QUEUE, 1);
+ if (prop->kernel_queue) {
+ tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, PRIV_STATE, 1);
+ tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, KMD_QUEUE, 1);
+ }
if (prop->tmz_queue)
tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, TMZ, 1);
mqd->cp_hqd_pq_control = tmp;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
index 3e138527d534..fd44d5503e28 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
@@ -3026,6 +3026,8 @@ static int gfx_v12_0_gfx_mqd_init(struct amdgpu_device *adev, void *m,
#endif
if (prop->tmz_queue)
tmp = REG_SET_FIELD(tmp, CP_GFX_HQD_CNTL, TMZ_MATCH, 1);
+ if (!prop->kernel_queue)
+ tmp = REG_SET_FIELD(tmp, CP_GFX_HQD_CNTL, RB_NON_PRIV, 1);
mqd->cp_gfx_hqd_cntl = tmp;
/* set up cp_doorbell_control */
@@ -3175,8 +3177,10 @@ static int gfx_v12_0_compute_mqd_init(struct amdgpu_device *adev, void *m,
(order_base_2(AMDGPU_GPU_PAGE_SIZE / 4) - 1));
tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, UNORD_DISPATCH, 1);
tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, TUNNEL_DISPATCH, 0);
- tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, PRIV_STATE, 1);
- tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, KMD_QUEUE, 1);
+ if (prop->kernel_queue) {
+ tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, PRIV_STATE, 1);
+ tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, KMD_QUEUE, 1);
+ }
if (prop->tmz_queue)
tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, TMZ, 1);
mqd->cp_hqd_pq_control = tmp;
diff --git a/drivers/gpu/drm/amd/amdgpu/isp_v4_1_1.c b/drivers/gpu/drm/amd/amdgpu/isp_v4_1_1.c
index a887df520414..4258d3e0b706 100644
--- a/drivers/gpu/drm/amd/amdgpu/isp_v4_1_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/isp_v4_1_1.c
@@ -29,6 +29,8 @@
#include "amdgpu.h"
#include "isp_v4_1_1.h"
+MODULE_FIRMWARE("amdgpu/isp_4_1_1.bin");
+
#define ISP_PERFORMANCE_STATE_LOW 0
#define ISP_PERFORMANCE_STATE_HIGH 1
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 28eb846280dd..3f6a828cad8a 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -641,8 +641,9 @@ static int mes_v11_0_misc_op(struct amdgpu_mes *mes,
break;
case MES_MISC_OP_CHANGE_CONFIG:
if ((mes->adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) < 0x63) {
- dev_err(mes->adev->dev, "MES FW version must be larger than 0x63 to support limit single process feature.\n");
- return -EINVAL;
+ dev_warn_once(mes->adev->dev,
+ "MES FW version must be larger than 0x63 to support limit single process feature.\n");
+ return 0;
}
misc_pkt.opcode = MESAPI_MISC__CHANGE_CONFIG;
misc_pkt.change_config.opcode =
diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c
index 6cc05d36e359..64b240b51f1a 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c
@@ -149,12 +149,12 @@ static int psp_v11_0_wait_for_bootloader(struct psp_context *psp)
int ret;
int retry_loop;
- for (retry_loop = 0; retry_loop < 10; retry_loop++) {
+ for (retry_loop = 0; retry_loop < 20; retry_loop++) {
/* Wait for bootloader to signify that is
ready having bit 31 of C2PMSG_35 set to 1 */
ret = psp_wait_for(
psp, SOC15_REG_OFFSET(MP0, 0, mmMP0_SMN_C2PMSG_35),
- 0x80000000, 0x80000000, PSP_WAITREG_NOVERBOSE);
+ 0x80000000, 0x8000FFFF, PSP_WAITREG_NOVERBOSE);
if (ret == 0)
return 0;
@@ -397,18 +397,6 @@ static int psp_v11_0_mode1_reset(struct psp_context *psp)
msleep(500);
- offset = SOC15_REG_OFFSET(MP0, 0, mmMP0_SMN_C2PMSG_33);
-
- ret = psp_wait_for(psp, offset, MBOX_TOS_RESP_FLAG, MBOX_TOS_RESP_MASK,
- 0);
-
- if (ret) {
- DRM_INFO("psp mode 1 reset failed!\n");
- return -EINVAL;
- }
-
- DRM_INFO("psp mode1 reset succeed \n");
-
return 0;
}
@@ -665,7 +653,8 @@ static const struct psp_funcs psp_v11_0_funcs = {
.ring_get_wptr = psp_v11_0_ring_get_wptr,
.ring_set_wptr = psp_v11_0_ring_set_wptr,
.load_usbc_pd_fw = psp_v11_0_load_usbc_pd_fw,
- .read_usbc_pd_fw = psp_v11_0_read_usbc_pd_fw
+ .read_usbc_pd_fw = psp_v11_0_read_usbc_pd_fw,
+ .wait_for_bootloader = psp_v11_0_wait_for_bootloader
};
void psp_v11_0_set_psp_funcs(struct psp_context *psp)
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
index e6d8eddda2bf..db6e41967f12 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
@@ -1377,7 +1377,7 @@ static int sdma_v6_0_sw_init(struct amdgpu_ip_block *ip_block)
switch (amdgpu_ip_version(adev, SDMA0_HWIP, 0)) {
case IP_VERSION(6, 0, 0):
- if ((adev->sdma.instance[0].fw_version >= 24) && !adev->sdma.disable_uq)
+ if ((adev->sdma.instance[0].fw_version >= 27) && !adev->sdma.disable_uq)
adev->userq_funcs[AMDGPU_HW_IP_DMA] = &userq_mes_funcs;
break;
case IP_VERSION(6, 0, 1):
@@ -1385,11 +1385,11 @@ static int sdma_v6_0_sw_init(struct amdgpu_ip_block *ip_block)
adev->userq_funcs[AMDGPU_HW_IP_DMA] = &userq_mes_funcs;
break;
case IP_VERSION(6, 0, 2):
- if ((adev->sdma.instance[0].fw_version >= 21) && !adev->sdma.disable_uq)
+ if ((adev->sdma.instance[0].fw_version >= 23) && !adev->sdma.disable_uq)
adev->userq_funcs[AMDGPU_HW_IP_DMA] = &userq_mes_funcs;
break;
case IP_VERSION(6, 0, 3):
- if ((adev->sdma.instance[0].fw_version >= 25) && !adev->sdma.disable_uq)
+ if ((adev->sdma.instance[0].fw_version >= 27) && !adev->sdma.disable_uq)
adev->userq_funcs[AMDGPU_HW_IP_DMA] = &userq_mes_funcs;
break;
case IP_VERSION(6, 1, 0):
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
index 4b8f4407047f..2811226b0ea5 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
@@ -1888,15 +1888,19 @@ static int vcn_v3_0_limit_sched(struct amdgpu_cs_parser *p,
struct amdgpu_job *job)
{
struct drm_gpu_scheduler **scheds;
-
- /* The create msg must be in the first IB submitted */
- if (atomic_read(&job->base.entity->fence_seq))
- return -EINVAL;
+ struct dma_fence *fence;
/* if VCN0 is harvested, we can't support AV1 */
if (p->adev->vcn.harvest_config & AMDGPU_VCN_HARVEST_VCN0)
return -EINVAL;
+ /* wait for all jobs to finish before switching to instance 0 */
+ fence = amdgpu_ctx_get_fence(p->ctx, job->base.entity, ~0ull);
+ if (fence) {
+ dma_fence_wait(fence, false);
+ dma_fence_put(fence);
+ }
+
scheds = p->adev->gpu_sched[AMDGPU_HW_IP_VCN_DEC]
[AMDGPU_RING_PRIO_DEFAULT].sched;
drm_sched_entity_modify_sched(job->base.entity, scheds, 1);
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
index 1924e075b66f..706f3b2f484f 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
@@ -1808,15 +1808,19 @@ static int vcn_v4_0_limit_sched(struct amdgpu_cs_parser *p,
struct amdgpu_job *job)
{
struct drm_gpu_scheduler **scheds;
-
- /* The create msg must be in the first IB submitted */
- if (atomic_read(&job->base.entity->fence_seq))
- return -EINVAL;
+ struct dma_fence *fence;
/* if VCN0 is harvested, we can't support AV1 */
if (p->adev->vcn.harvest_config & AMDGPU_VCN_HARVEST_VCN0)
return -EINVAL;
+ /* wait for all jobs to finish before switching to instance 0 */
+ fence = amdgpu_ctx_get_fence(p->ctx, job->base.entity, ~0ull);
+ if (fence) {
+ dma_fence_wait(fence, false);
+ dma_fence_put(fence);
+ }
+
scheds = p->adev->gpu_sched[AMDGPU_HW_IP_VCN_ENC]
[AMDGPU_RING_PRIO_0].sched;
drm_sched_entity_modify_sched(job->base.entity, scheds, 1);
@@ -1907,22 +1911,16 @@ out:
#define RADEON_VCN_ENGINE_TYPE_ENCODE (0x00000002)
#define RADEON_VCN_ENGINE_TYPE_DECODE (0x00000003)
-
#define RADEON_VCN_ENGINE_INFO (0x30000001)
-#define RADEON_VCN_ENGINE_INFO_MAX_OFFSET 16
-
#define RENCODE_ENCODE_STANDARD_AV1 2
#define RENCODE_IB_PARAM_SESSION_INIT 0x00000003
-#define RENCODE_IB_PARAM_SESSION_INIT_MAX_OFFSET 64
-/* return the offset in ib if id is found, -1 otherwise
- * to speed up the searching we only search upto max_offset
- */
-static int vcn_v4_0_enc_find_ib_param(struct amdgpu_ib *ib, uint32_t id, int max_offset)
+/* return the offset in ib if id is found, -1 otherwise */
+static int vcn_v4_0_enc_find_ib_param(struct amdgpu_ib *ib, uint32_t id, int start)
{
int i;
- for (i = 0; i < ib->length_dw && i < max_offset && ib->ptr[i] >= 8; i += ib->ptr[i]/4) {
+ for (i = start; i < ib->length_dw && ib->ptr[i] >= 8; i += ib->ptr[i] / 4) {
if (ib->ptr[i + 1] == id)
return i;
}
@@ -1937,33 +1935,29 @@ static int vcn_v4_0_ring_patch_cs_in_place(struct amdgpu_cs_parser *p,
struct amdgpu_vcn_decode_buffer *decode_buffer;
uint64_t addr;
uint32_t val;
- int idx;
+ int idx = 0, sidx;
/* The first instance can decode anything */
if (!ring->me)
return 0;
- /* RADEON_VCN_ENGINE_INFO is at the top of ib block */
- idx = vcn_v4_0_enc_find_ib_param(ib, RADEON_VCN_ENGINE_INFO,
- RADEON_VCN_ENGINE_INFO_MAX_OFFSET);
- if (idx < 0) /* engine info is missing */
- return 0;
-
- val = amdgpu_ib_get_value(ib, idx + 2); /* RADEON_VCN_ENGINE_TYPE */
- if (val == RADEON_VCN_ENGINE_TYPE_DECODE) {
- decode_buffer = (struct amdgpu_vcn_decode_buffer *)&ib->ptr[idx + 6];
-
- if (!(decode_buffer->valid_buf_flag & 0x1))
- return 0;
-
- addr = ((u64)decode_buffer->msg_buffer_address_hi) << 32 |
- decode_buffer->msg_buffer_address_lo;
- return vcn_v4_0_dec_msg(p, job, addr);
- } else if (val == RADEON_VCN_ENGINE_TYPE_ENCODE) {
- idx = vcn_v4_0_enc_find_ib_param(ib, RENCODE_IB_PARAM_SESSION_INIT,
- RENCODE_IB_PARAM_SESSION_INIT_MAX_OFFSET);
- if (idx >= 0 && ib->ptr[idx + 2] == RENCODE_ENCODE_STANDARD_AV1)
- return vcn_v4_0_limit_sched(p, job);
+ while ((idx = vcn_v4_0_enc_find_ib_param(ib, RADEON_VCN_ENGINE_INFO, idx)) >= 0) {
+ val = amdgpu_ib_get_value(ib, idx + 2); /* RADEON_VCN_ENGINE_TYPE */
+ if (val == RADEON_VCN_ENGINE_TYPE_DECODE) {
+ decode_buffer = (struct amdgpu_vcn_decode_buffer *)&ib->ptr[idx + 6];
+
+ if (!(decode_buffer->valid_buf_flag & 0x1))
+ return 0;
+
+ addr = ((u64)decode_buffer->msg_buffer_address_hi) << 32 |
+ decode_buffer->msg_buffer_address_lo;
+ return vcn_v4_0_dec_msg(p, job, addr);
+ } else if (val == RADEON_VCN_ENGINE_TYPE_ENCODE) {
+ sidx = vcn_v4_0_enc_find_ib_param(ib, RENCODE_IB_PARAM_SESSION_INIT, idx);
+ if (sidx >= 0 && ib->ptr[sidx + 2] == RENCODE_ENCODE_STANDARD_AV1)
+ return vcn_v4_0_limit_sched(p, job);
+ }
+ idx += ib->ptr[idx] / 4;
}
return 0;
}
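
The reworked helper above drops the fixed search window and instead takes a start offset, so the patched loop can walk every RADEON_VCN_ENGINE_INFO package in the IB. Below is a minimal user-space sketch of that package walk, assuming the same [size-in-bytes, id, payload...] layout; the array contents and names are invented for illustration and are not part of the driver.

#include <stdint.h>
#include <stdio.h>

#define ENGINE_INFO_ID 0x30000001u

/* Return the dword offset of the first package with the given id at or
 * after 'start', or -1 if none is found; no artificial max offset. */
static int find_package(const uint32_t *ib, int len_dw, uint32_t id, int start)
{
	for (int i = start; i < len_dw && ib[i] >= 8; i += ib[i] / 4) {
		if (ib[i + 1] == id)
			return i;
	}
	return -1;
}

int main(void)
{
	/* Two packages: a 12-byte dummy package, then an 8-byte engine-info one. */
	uint32_t ib[] = { 12, 0xdeadbeef, 0, 8, ENGINE_INFO_ID };
	int idx = 0;

	/* Walk every engine-info package, as the reworked patch-cs loop does. */
	while ((idx = find_package(ib, 5, ENGINE_INFO_ID, idx)) >= 0) {
		printf("engine info package at dword %d\n", idx);
		idx += ib[idx] / 4;  /* skip past this package */
	}
	return 0;
}
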
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 7e749f9b6d69..349c351e242b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -1550,6 +1550,25 @@ int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id)
return ret;
}
+int kgd2kfd_start_sched_all_nodes(struct kfd_dev *kfd)
+{
+ struct kfd_node *node;
+ int i, r;
+
+ if (!kfd->init_complete)
+ return 0;
+
+ for (i = 0; i < kfd->num_nodes; i++) {
+ node = kfd->nodes[i];
+ r = node->dqm->ops.unhalt(node->dqm);
+ if (r) {
+ dev_err(kfd_device, "Error in starting scheduler\n");
+ return r;
+ }
+ }
+ return 0;
+}
+
int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id)
{
struct kfd_node *node;
@@ -1567,6 +1586,23 @@ int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id)
return node->dqm->ops.halt(node->dqm);
}
+int kgd2kfd_stop_sched_all_nodes(struct kfd_dev *kfd)
+{
+ struct kfd_node *node;
+ int i, r;
+
+ if (!kfd->init_complete)
+ return 0;
+
+ for (i = 0; i < kfd->num_nodes; i++) {
+ node = kfd->nodes[i];
+ r = node->dqm->ops.halt(node->dqm);
+ if (r)
+ return r;
+ }
+ return 0;
+}
+
bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id)
{
struct kfd_node *node;
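
The new kgd2kfd_{start,stop}_sched_all_nodes() helpers iterate every KFD node and bail out on the first halt/unhalt failure. A tiny user-space sketch of that loop shape is shown below; the types and the failure case are made-up stand-ins for kfd_node and its DQM ops.

#include <stdio.h>

struct fake_node { int id; };

static int halt(struct fake_node *n)
{
	/* Pretend node 2 fails to halt. */
	return n->id == 2 ? -1 : 0;
}

static int stop_all(struct fake_node *nodes, int num)
{
	for (int i = 0; i < num; i++) {
		int r = halt(&nodes[i]);
		/* Bail on the first failure; nodes halted so far stay halted,
		 * so the caller decides whether to unhalt them again. */
		if (r)
			return r;
	}
	return 0;
}

int main(void)
{
	struct fake_node nodes[] = { {0}, {1}, {2}, {3} };

	if (stop_all(nodes, 4))
		printf("halt failed, earlier nodes remain halted\n");
	return 0;
}
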
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 4ec73f33535e..720b20e842ba 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -1587,7 +1587,8 @@ static int kfd_dev_create_p2p_links(void)
break;
if (!dev->gpu || !dev->gpu->adev ||
(dev->gpu->kfd->hive_id &&
- dev->gpu->kfd->hive_id == new_dev->gpu->kfd->hive_id))
+ dev->gpu->kfd->hive_id == new_dev->gpu->kfd->hive_id &&
+ amdgpu_xgmi_get_is_sharing_enabled(dev->gpu->adev, new_dev->gpu->adev)))
goto next;
/* check if node(s) is/are peer accessible in one direction or bi-direction */
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index cd0e2976e268..ef026143dc1c 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -2037,6 +2037,8 @@ static int amdgpu_dm_init(struct amdgpu_device *adev)
dc_hardware_init(adev->dm.dc);
+ adev->dm.restore_backlight = true;
+
adev->dm.hpd_rx_offload_wq = hpd_rx_irq_create_workqueue(adev);
if (!adev->dm.hpd_rx_offload_wq) {
drm_err(adev_to_drm(adev), "failed to create hpd rx offload workqueue.\n");
@@ -2913,6 +2915,17 @@ static int dm_oem_i2c_hw_init(struct amdgpu_device *adev)
return 0;
}
+static void dm_oem_i2c_hw_fini(struct amdgpu_device *adev)
+{
+ struct amdgpu_display_manager *dm = &adev->dm;
+
+ if (dm->oem_i2c) {
+ i2c_del_adapter(&dm->oem_i2c->base);
+ kfree(dm->oem_i2c);
+ dm->oem_i2c = NULL;
+ }
+}
+
/**
* dm_hw_init() - Initialize DC device
* @ip_block: Pointer to the amdgpu_ip_block for this hw instance.
@@ -2963,7 +2976,7 @@ static int dm_hw_fini(struct amdgpu_ip_block *ip_block)
{
struct amdgpu_device *adev = ip_block->adev;
- kfree(adev->dm.oem_i2c);
+ dm_oem_i2c_hw_fini(adev);
amdgpu_dm_hpd_fini(adev);
@@ -3127,25 +3140,6 @@ static void dm_destroy_cached_state(struct amdgpu_device *adev)
dm->cached_state = NULL;
}
-static void dm_complete(struct amdgpu_ip_block *ip_block)
-{
- struct amdgpu_device *adev = ip_block->adev;
-
- dm_destroy_cached_state(adev);
-}
-
-static int dm_prepare_suspend(struct amdgpu_ip_block *ip_block)
-{
- struct amdgpu_device *adev = ip_block->adev;
-
- if (amdgpu_in_reset(adev))
- return 0;
-
- WARN_ON(adev->dm.cached_state);
-
- return dm_cache_state(adev);
-}
-
static int dm_suspend(struct amdgpu_ip_block *ip_block)
{
struct amdgpu_device *adev = ip_block->adev;
@@ -3407,6 +3401,7 @@ static int dm_resume(struct amdgpu_ip_block *ip_block)
dc_set_power_state(dm->dc, DC_ACPI_CM_POWER_STATE_D0);
dc_resume(dm->dc);
+ adev->dm.restore_backlight = true;
amdgpu_dm_irq_resume_early(adev);
@@ -3571,10 +3566,8 @@ static const struct amd_ip_funcs amdgpu_dm_funcs = {
.early_fini = amdgpu_dm_early_fini,
.hw_init = dm_hw_init,
.hw_fini = dm_hw_fini,
- .prepare_suspend = dm_prepare_suspend,
.suspend = dm_suspend,
.resume = dm_resume,
- .complete = dm_complete,
.is_idle = dm_is_idle,
.wait_for_idle = dm_wait_for_idle,
.check_soft_reset = dm_check_soft_reset,
@@ -7792,6 +7785,9 @@ amdgpu_dm_connector_atomic_check(struct drm_connector *conn,
struct amdgpu_dm_connector *aconn = to_amdgpu_dm_connector(conn);
int ret;
+ if (WARN_ON(unlikely(!old_con_state || !new_con_state)))
+ return -EINVAL;
+
trace_amdgpu_dm_connector_atomic_check(new_con_state);
if (conn->connector_type == DRM_MODE_CONNECTOR_DisplayPort) {
@@ -8378,8 +8374,7 @@ static int amdgpu_dm_connector_get_modes(struct drm_connector *connector)
drm_add_modes_noedid(connector, 1920, 1080);
} else {
amdgpu_dm_connector_ddc_get_modes(connector, drm_edid);
- if (encoder && (connector->connector_type != DRM_MODE_CONNECTOR_eDP) &&
- (connector->connector_type != DRM_MODE_CONNECTOR_LVDS))
+ if (encoder)
amdgpu_dm_connector_add_common_modes(encoder, connector);
amdgpu_dm_connector_add_freesync_modes(connector, drm_edid);
}
@@ -8725,7 +8720,16 @@ static int amdgpu_dm_encoder_init(struct drm_device *dev,
static void manage_dm_interrupts(struct amdgpu_device *adev,
struct amdgpu_crtc *acrtc,
struct dm_crtc_state *acrtc_state)
-{
+{
+ /*
+ * We cannot be sure that the frontend index maps to the same
+ * backend index - some even map to more than one.
+ * So we have to go through the CRTC to find the right IRQ.
+ */
+ int irq_type = amdgpu_display_crtc_idx_to_irq_type(
+ adev,
+ acrtc->crtc_id);
+ struct drm_device *dev = adev_to_drm(adev);
+
struct drm_vblank_crtc_config config = {0};
struct dc_crtc_timing *timing;
int offdelay;
@@ -8778,7 +8782,35 @@ static void manage_dm_interrupts(struct amdgpu_device *adev,
drm_crtc_vblank_on_config(&acrtc->base,
&config);
+ /* Allow RX6xxx, RX7700, RX7800 GPUs to call amdgpu_irq_get. */
+ switch (amdgpu_ip_version(adev, DCE_HWIP, 0)) {
+ case IP_VERSION(3, 0, 0):
+ case IP_VERSION(3, 0, 2):
+ case IP_VERSION(3, 0, 3):
+ case IP_VERSION(3, 2, 0):
+ if (amdgpu_irq_get(adev, &adev->pageflip_irq, irq_type))
+ drm_err(dev, "DM_IRQ: Cannot get pageflip irq!\n");
+#if defined(CONFIG_DRM_AMD_SECURE_DISPLAY)
+ if (amdgpu_irq_get(adev, &adev->vline0_irq, irq_type))
+ drm_err(dev, "DM_IRQ: Cannot get vline0 irq!\n");
+#endif
+ }
+
} else {
+ /* Allow RX6xxx, RX7700, RX7800 GPUs to call amdgpu_irq_put. */
+ switch (amdgpu_ip_version(adev, DCE_HWIP, 0)) {
+ case IP_VERSION(3, 0, 0):
+ case IP_VERSION(3, 0, 2):
+ case IP_VERSION(3, 0, 3):
+ case IP_VERSION(3, 2, 0):
+#if defined(CONFIG_DRM_AMD_SECURE_DISPLAY)
+ if (amdgpu_irq_put(adev, &adev->vline0_irq, irq_type))
+ drm_err(dev, "DM_IRQ: Cannot put vline0 irq!\n");
+#endif
+ if (amdgpu_irq_put(adev, &adev->pageflip_irq, irq_type))
+ drm_err(dev, "DM_IRQ: Cannot put pageflip irq!\n");
+ }
+
drm_crtc_vblank_off(&acrtc->base);
}
}
@@ -9800,7 +9832,6 @@ static void amdgpu_dm_commit_streams(struct drm_atomic_state *state,
bool mode_set_reset_required = false;
u32 i;
struct dc_commit_streams_params params = {dc_state->streams, dc_state->stream_count};
- bool set_backlight_level = false;
/* Disable writeback */
for_each_old_connector_in_state(state, connector, old_con_state, i) {
@@ -9920,7 +9951,6 @@ static void amdgpu_dm_commit_streams(struct drm_atomic_state *state,
acrtc->hw_mode = new_crtc_state->mode;
crtc->hwmode = new_crtc_state->mode;
mode_set_reset_required = true;
- set_backlight_level = true;
} else if (modereset_required(new_crtc_state)) {
drm_dbg_atomic(dev,
"Atomic commit: RESET. crtc id %d:[%p]\n",
@@ -9977,13 +10007,16 @@ static void amdgpu_dm_commit_streams(struct drm_atomic_state *state,
* to fix a flicker issue.
* It will cause the dm->actual_brightness is not the current panel brightness
* level. (the dm->brightness is the correct panel level)
- * So we set the backlight level with dm->brightness value after set mode
+ * So we set the backlight level with the dm->brightness value after the
+ * initial mode set, and use the restore_backlight flag to avoid setting
+ * the backlight level on every subsequent mode set.
*/
- if (set_backlight_level) {
+ if (dm->restore_backlight) {
for (i = 0; i < dm->num_of_edps; i++) {
if (dm->backlight_dev[i])
amdgpu_dm_backlight_set_level(dm, i, dm->brightness[i]);
}
+ dm->restore_backlight = false;
}
}
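
The restore_backlight flag added above turns the old per-modeset backlight restore into a one-shot: it is armed at init and resume and consumed by the first commit that programs a mode. A small user-space sketch of that one-shot pattern follows; the struct and values are invented for the example.

#include <stdbool.h>
#include <stdio.h>

struct fake_dm {
	bool restore_backlight;
	int brightness;          /* cached level to restore */
	int actual_brightness;   /* level currently applied to the panel */
};

static void resume(struct fake_dm *dm)
{
	dm->restore_backlight = true;  /* arm the one-shot restore */
}

static void commit_streams(struct fake_dm *dm)
{
	if (dm->restore_backlight) {
		dm->actual_brightness = dm->brightness;  /* restore cached level */
		dm->restore_backlight = false;           /* only once */
	}
}

int main(void)
{
	struct fake_dm dm = { .brightness = 200, .actual_brightness = 0 };

	resume(&dm);
	commit_streams(&dm);   /* restores to 200 */
	dm.actual_brightness = 150;
	commit_streams(&dm);   /* flag already cleared: stays 150 */
	printf("%d\n", dm.actual_brightness);
	return 0;
}
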
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
index b937da0a4e4a..6aae51c1beb3 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
@@ -611,6 +611,13 @@ struct amdgpu_display_manager {
u32 actual_brightness[AMDGPU_DM_MAX_NUM_EDP];
/**
+ * @restore_backlight:
+ *
+ * Flag to indicate whether to restore backlight after modeset.
+ */
+ bool restore_backlight;
+
+ /**
* @aux_hpd_discon_quirk:
*
* quirk for hpd discon while aux is on-going.
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c
index ebabfe3a512f..c0dfe2d8b3be 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c
@@ -821,7 +821,7 @@ int amdgpu_dm_verify_lut3d_size(struct amdgpu_device *adev,
struct dm_plane_state *dm_plane_state = to_dm_plane_state(plane_state);
const struct drm_color_lut *shaper = NULL, *lut3d = NULL;
uint32_t exp_size, size, dim_size = MAX_COLOR_3DLUT_SIZE;
- bool has_3dlut = adev->dm.dc->caps.color.dpp.hw_3d_lut;
+ bool has_3dlut = adev->dm.dc->caps.color.dpp.hw_3d_lut || adev->dm.dc->caps.color.mpc.preblend;
/* shaper LUT is only available if 3D LUT color caps */
exp_size = has_3dlut ? MAX_COLOR_LUT_ENTRIES : 0;
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c
index 010172f930ae..45feb404b097 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c
@@ -299,6 +299,25 @@ static inline int amdgpu_dm_crtc_set_vblank(struct drm_crtc *crtc, bool enable)
irq_type = amdgpu_display_crtc_idx_to_irq_type(adev, acrtc->crtc_id);
if (enable) {
+ struct dc *dc = adev->dm.dc;
+ struct drm_vblank_crtc *vblank = drm_crtc_vblank_crtc(crtc);
+ struct psr_settings *psr = &acrtc_state->stream->link->psr_settings;
+ struct replay_settings *pr = &acrtc_state->stream->link->replay_settings;
+ bool sr_supported = (psr->psr_version != DC_PSR_VERSION_UNSUPPORTED) ||
+ pr->config.replay_supported;
+
+ /*
+ * IPS & self-refresh feature can cause vblank counter resets between
+ * vblank disable and enable.
+ * It may cause system stuck due to waiting for the vblank counter.
+ * Call this function to estimate missed vblanks by using timestamps and
+ * update the vblank counter in DRM.
+ */
+ if (dc->caps.ips_support &&
+ dc->config.disable_ips != DMUB_IPS_DISABLE_ALL &&
+ sr_supported && vblank->config.disable_immediate)
+ drm_crtc_vblank_restore(crtc);
+
/* vblank irq on -> Only need vupdate irq in vrr mode */
if (amdgpu_dm_crtc_vrr_active(acrtc_state))
rc = amdgpu_dm_crtc_set_vupdate_irq(crtc, true);
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
index 7187d5aedf0a..77a9d2c7d318 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
@@ -809,6 +809,7 @@ void amdgpu_dm_initialize_dp_connector(struct amdgpu_display_manager *dm,
drm_dp_aux_init(&aconnector->dm_dp_aux.aux);
drm_dp_cec_register_connector(&aconnector->dm_dp_aux.aux,
&aconnector->base);
+ drm_dp_dpcd_set_probe(&aconnector->dm_dp_aux.aux, false);
if (aconnector->base.connector_type == DRM_MODE_CONNECTOR_eDP)
return;
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c
index eef51652ca35..3d2f8eedeef2 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c
@@ -1633,7 +1633,7 @@ dm_atomic_plane_attach_color_mgmt_properties(struct amdgpu_display_manager *dm,
drm_object_attach_property(&plane->base,
dm->adev->mode_info.plane_ctm_property, 0);
- if (dpp_color_caps.hw_3d_lut) {
+ if (dpp_color_caps.hw_3d_lut || dm->dc->caps.color.mpc.preblend) {
drm_object_attach_property(&plane->base,
mode_info.plane_shaper_lut_property, 0);
drm_object_attach_property(&plane->base,
diff --git a/drivers/gpu/drm/amd/display/dc/bios/bios_parser.c b/drivers/gpu/drm/amd/display/dc/bios/bios_parser.c
index 67f08495b7e6..154fd2c18e88 100644
--- a/drivers/gpu/drm/amd/display/dc/bios/bios_parser.c
+++ b/drivers/gpu/drm/amd/display/dc/bios/bios_parser.c
@@ -174,11 +174,8 @@ static struct graphics_object_id bios_parser_get_connector_id(
return object_id;
}
- if (tbl->ucNumberOfObjects <= i) {
- dm_error("Can't find connector id %d in connector table of size %d.\n",
- i, tbl->ucNumberOfObjects);
+ if (tbl->ucNumberOfObjects <= i)
return object_id;
- }
id = le16_to_cpu(tbl->asObjects[i].usObjectID);
object_id = object_id_from_bios_object_id(id);
diff --git a/drivers/gpu/drm/amd/display/dc/bios/command_table.c b/drivers/gpu/drm/amd/display/dc/bios/command_table.c
index 2bcae0643e61..58e88778da7f 100644
--- a/drivers/gpu/drm/amd/display/dc/bios/command_table.c
+++ b/drivers/gpu/drm/amd/display/dc/bios/command_table.c
@@ -993,7 +993,7 @@ static enum bp_result set_pixel_clock_v3(
allocation.sPCLKInput.usFbDiv =
cpu_to_le16((uint16_t)bp_params->feedback_divider);
allocation.sPCLKInput.ucFracFbDiv =
- (uint8_t)bp_params->fractional_feedback_divider;
+ (uint8_t)(bp_params->fractional_feedback_divider / 100000);
allocation.sPCLKInput.ucPostDiv =
(uint8_t)bp_params->pixel_clock_post_divider;
diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dce100/dce_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dce100/dce_clk_mgr.c
index f5ad0a177038..dbd6ef1b60a0 100644
--- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dce100/dce_clk_mgr.c
+++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dce100/dce_clk_mgr.c
@@ -72,9 +72,9 @@ static const struct state_dependent_clocks dce80_max_clks_by_state[] = {
/* ClocksStateLow */
{ .display_clk_khz = 352000, .pixel_clk_khz = 330000},
/* ClocksStateNominal */
-{ .display_clk_khz = 600000, .pixel_clk_khz = 400000 },
+{ .display_clk_khz = 625000, .pixel_clk_khz = 400000 },
/* ClocksStatePerformance */
-{ .display_clk_khz = 600000, .pixel_clk_khz = 400000 } };
+{ .display_clk_khz = 625000, .pixel_clk_khz = 400000 } };
int dentist_get_divider_from_did(int did)
{
@@ -391,8 +391,6 @@ static void dce_pplib_apply_display_requirements(
{
struct dm_pp_display_configuration *pp_display_cfg = &context->pp_display_cfg;
- pp_display_cfg->avail_mclk_switch_time_us = dce110_get_min_vblank_time_us(context);
-
dce110_fill_display_configs(context, pp_display_cfg);
if (memcmp(&dc->current_state->pp_display_cfg, pp_display_cfg, sizeof(*pp_display_cfg)) != 0)
@@ -405,11 +403,9 @@ static void dce_update_clocks(struct clk_mgr *clk_mgr_base,
{
struct clk_mgr_internal *clk_mgr_dce = TO_CLK_MGR_INTERNAL(clk_mgr_base);
struct dm_pp_power_level_change_request level_change_req;
- int patched_disp_clk = context->bw_ctx.bw.dce.dispclk_khz;
-
- /*TODO: W/A for dal3 linux, investigate why this works */
- if (!clk_mgr_dce->dfs_bypass_active)
- patched_disp_clk = patched_disp_clk * 115 / 100;
+ const int max_disp_clk =
+ clk_mgr_dce->max_clks_by_state[DM_PP_CLOCKS_STATE_PERFORMANCE].display_clk_khz;
+ int patched_disp_clk = MIN(max_disp_clk, context->bw_ctx.bw.dce.dispclk_khz);
level_change_req.power_level = dce_get_required_clocks_state(clk_mgr_base, context);
/* get max clock state from PPLIB */
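
The dce_update_clocks() change above replaces the old 15% display clock bump with a simple clamp against the ClocksStatePerformance entry. A toy calculation of that clamp, with arbitrary example numbers, is sketched below.

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
	int max_disp_clk_khz = 625000;   /* ClocksStatePerformance entry */
	int requested_khz = 680000;      /* what the bandwidth calc asked for */

	int patched_disp_clk = MIN(max_disp_clk_khz, requested_khz);

	printf("dispclk = %d kHz\n", patched_disp_clk);   /* 625000 */
	return 0;
}
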
diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dce110/dce110_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dce110/dce110_clk_mgr.c
index f8409453434c..13cf415e38e5 100644
--- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dce110/dce110_clk_mgr.c
+++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dce110/dce110_clk_mgr.c
@@ -120,9 +120,15 @@ void dce110_fill_display_configs(
const struct dc_state *context,
struct dm_pp_display_configuration *pp_display_cfg)
{
+ struct dc *dc = context->clk_mgr->ctx->dc;
int j;
int num_cfgs = 0;
+ pp_display_cfg->avail_mclk_switch_time_us = dce110_get_min_vblank_time_us(context);
+ pp_display_cfg->disp_clk_khz = dc->clk_mgr->clks.dispclk_khz;
+ pp_display_cfg->avail_mclk_switch_time_in_disp_active_us = 0;
+ pp_display_cfg->crtc_index = dc->res_pool->res_cap->num_timing_generator;
+
for (j = 0; j < context->stream_count; j++) {
int k;
@@ -164,6 +170,23 @@ void dce110_fill_display_configs(
cfg->v_refresh /= stream->timing.h_total;
cfg->v_refresh = (cfg->v_refresh + stream->timing.v_total / 2)
/ stream->timing.v_total;
+
+ /* Find first CRTC index and calculate its line time.
+ * This is necessary for DPM on SI GPUs.
+ */
+ if (cfg->pipe_idx < pp_display_cfg->crtc_index) {
+ const struct dc_crtc_timing *timing =
+ &context->streams[0]->timing;
+
+ pp_display_cfg->crtc_index = cfg->pipe_idx;
+ pp_display_cfg->line_time_in_us =
+ timing->h_total * 10000 / timing->pix_clk_100hz;
+ }
+ }
+
+ if (!num_cfgs) {
+ pp_display_cfg->crtc_index = 0;
+ pp_display_cfg->line_time_in_us = 0;
}
pp_display_cfg->display_count = num_cfgs;
@@ -223,25 +246,8 @@ void dce11_pplib_apply_display_requirements(
pp_display_cfg->min_engine_clock_deep_sleep_khz
= context->bw_ctx.bw.dce.sclk_deep_sleep_khz;
- pp_display_cfg->avail_mclk_switch_time_us =
- dce110_get_min_vblank_time_us(context);
- /* TODO: dce11.2*/
- pp_display_cfg->avail_mclk_switch_time_in_disp_active_us = 0;
-
- pp_display_cfg->disp_clk_khz = dc->clk_mgr->clks.dispclk_khz;
-
dce110_fill_display_configs(context, pp_display_cfg);
- /* TODO: is this still applicable?*/
- if (pp_display_cfg->display_count == 1) {
- const struct dc_crtc_timing *timing =
- &context->streams[0]->timing;
-
- pp_display_cfg->crtc_index =
- pp_display_cfg->disp_configs[0].pipe_idx;
- pp_display_cfg->line_time_in_us = timing->h_total * 10000 / timing->pix_clk_100hz;
- }
-
if (memcmp(&dc->current_state->pp_display_cfg, pp_display_cfg, sizeof(*pp_display_cfg)) != 0)
dm_pp_apply_display_requirements(dc->ctx, pp_display_cfg);
}
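
dce110_fill_display_configs() now derives the first CRTC's line time as h_total * 10000 / pix_clk_100hz, which works because the pixel clock is stored in 100 Hz units. The short user-space check below runs the same arithmetic on a sample 1080p60 timing (not driver data).

#include <stdio.h>

int main(void)
{
	unsigned int h_total = 2200;            /* pixels per scanline incl. blanking */
	unsigned int pix_clk_100hz = 1485000;   /* 148.5 MHz in 100 Hz units */

	/* h_total / (pix_clk_100hz * 100) seconds, scaled to microseconds */
	unsigned int line_time_us = h_total * 10000 / pix_clk_100hz;

	printf("line time ~= %u us\n", line_time_us);  /* prints 14 */
	return 0;
}
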
diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dce60/dce60_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dce60/dce60_clk_mgr.c
index 0267644717b2..a39641a0ff09 100644
--- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dce60/dce60_clk_mgr.c
+++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dce60/dce60_clk_mgr.c
@@ -83,22 +83,13 @@ static const struct state_dependent_clocks dce60_max_clks_by_state[] = {
static int dce60_get_dp_ref_freq_khz(struct clk_mgr *clk_mgr_base)
{
struct clk_mgr_internal *clk_mgr = TO_CLK_MGR_INTERNAL(clk_mgr_base);
- int dprefclk_wdivider;
- int dp_ref_clk_khz;
- int target_div;
+ struct dc_context *ctx = clk_mgr_base->ctx;
+ int dp_ref_clk_khz = 0;
- /* DCE6 has no DPREFCLK_CNTL to read DP Reference Clock source */
-
- /* Read the mmDENTIST_DISPCLK_CNTL to get the currently
- * programmed DID DENTIST_DPREFCLK_WDIVIDER*/
- REG_GET(DENTIST_DISPCLK_CNTL, DENTIST_DPREFCLK_WDIVIDER, &dprefclk_wdivider);
-
- /* Convert DENTIST_DPREFCLK_WDIVIDERto actual divider*/
- target_div = dentist_get_divider_from_did(dprefclk_wdivider);
-
- /* Calculate the current DFS clock, in kHz.*/
- dp_ref_clk_khz = (DENTIST_DIVIDER_RANGE_SCALE_FACTOR
- * clk_mgr->base.dentist_vco_freq_khz) / target_div;
+ if (ASIC_REV_IS_TAHITI_P(ctx->asic_id.hw_internal_rev))
+ dp_ref_clk_khz = ctx->dc_bios->fw_info.default_display_engine_pll_frequency;
+ else
+ dp_ref_clk_khz = clk_mgr_base->clks.dispclk_khz;
return dce_adjust_dp_ref_freq_for_ss(clk_mgr, dp_ref_clk_khz);
}
@@ -109,8 +100,6 @@ static void dce60_pplib_apply_display_requirements(
{
struct dm_pp_display_configuration *pp_display_cfg = &context->pp_display_cfg;
- pp_display_cfg->avail_mclk_switch_time_us = dce110_get_min_vblank_time_us(context);
-
dce110_fill_display_configs(context, pp_display_cfg);
if (memcmp(&dc->current_state->pp_display_cfg, pp_display_cfg, sizeof(*pp_display_cfg)) != 0)
@@ -123,11 +112,9 @@ static void dce60_update_clocks(struct clk_mgr *clk_mgr_base,
{
struct clk_mgr_internal *clk_mgr_dce = TO_CLK_MGR_INTERNAL(clk_mgr_base);
struct dm_pp_power_level_change_request level_change_req;
- int patched_disp_clk = context->bw_ctx.bw.dce.dispclk_khz;
-
- /*TODO: W/A for dal3 linux, investigate why this works */
- if (!clk_mgr_dce->dfs_bypass_active)
- patched_disp_clk = patched_disp_clk * 115 / 100;
+ const int max_disp_clk =
+ clk_mgr_dce->max_clks_by_state[DM_PP_CLOCKS_STATE_PERFORMANCE].display_clk_khz;
+ int patched_disp_clk = MIN(max_disp_clk, context->bw_ctx.bw.dce.dispclk_khz);
level_change_req.power_level = dce_get_required_clocks_state(clk_mgr_base, context);
/* get max clock state from PPLIB */
diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c
index bb1ac12a2b09..0e638bc6bf77 100644
--- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c
+++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c
@@ -587,9 +587,118 @@ bool dcn35_are_clock_states_equal(struct dc_clocks *a,
return true;
}
-static void dcn35_dump_clk_registers(struct clk_state_registers_and_bypass *regs_and_bypass,
+static void dcn35_save_clk_registers_internal(struct dcn35_clk_internal *internal, struct clk_mgr *clk_mgr_base)
+{
+ struct clk_mgr_internal *clk_mgr = TO_CLK_MGR_INTERNAL(clk_mgr_base);
+
+ // read dtbclk
+ internal->CLK1_CLK4_CURRENT_CNT = REG_READ(CLK1_CLK4_CURRENT_CNT);
+ internal->CLK1_CLK4_BYPASS_CNTL = REG_READ(CLK1_CLK4_BYPASS_CNTL);
+
+ // read dcfclk
+ internal->CLK1_CLK3_CURRENT_CNT = REG_READ(CLK1_CLK3_CURRENT_CNT);
+ internal->CLK1_CLK3_BYPASS_CNTL = REG_READ(CLK1_CLK3_BYPASS_CNTL);
+
+ // read dcf deep sleep divider
+ internal->CLK1_CLK3_DS_CNTL = REG_READ(CLK1_CLK3_DS_CNTL);
+ internal->CLK1_CLK3_ALLOW_DS = REG_READ(CLK1_CLK3_ALLOW_DS);
+
+ // read dppclk
+ internal->CLK1_CLK1_CURRENT_CNT = REG_READ(CLK1_CLK1_CURRENT_CNT);
+ internal->CLK1_CLK1_BYPASS_CNTL = REG_READ(CLK1_CLK1_BYPASS_CNTL);
+
+ // read dprefclk
+ internal->CLK1_CLK2_CURRENT_CNT = REG_READ(CLK1_CLK2_CURRENT_CNT);
+ internal->CLK1_CLK2_BYPASS_CNTL = REG_READ(CLK1_CLK2_BYPASS_CNTL);
+
+ // read dispclk
+ internal->CLK1_CLK0_CURRENT_CNT = REG_READ(CLK1_CLK0_CURRENT_CNT);
+ internal->CLK1_CLK0_BYPASS_CNTL = REG_READ(CLK1_CLK0_BYPASS_CNTL);
+}
+
+static void dcn35_save_clk_registers(struct clk_state_registers_and_bypass *regs_and_bypass,
struct clk_mgr_dcn35 *clk_mgr)
{
+ struct dcn35_clk_internal internal = {0};
+ char *bypass_clks[5] = {"0x0 DFS", "0x1 REFCLK", "0x2 ERROR", "0x3 400 FCH", "0x4 600 FCH"};
+
+ dcn35_save_clk_registers_internal(&internal, &clk_mgr->base.base);
+
+ regs_and_bypass->dcfclk = internal.CLK1_CLK3_CURRENT_CNT / 10;
+ regs_and_bypass->dcf_deep_sleep_divider = internal.CLK1_CLK3_DS_CNTL / 10;
+ regs_and_bypass->dcf_deep_sleep_allow = internal.CLK1_CLK3_ALLOW_DS;
+ regs_and_bypass->dprefclk = internal.CLK1_CLK2_CURRENT_CNT / 10;
+ regs_and_bypass->dispclk = internal.CLK1_CLK0_CURRENT_CNT / 10;
+ regs_and_bypass->dppclk = internal.CLK1_CLK1_CURRENT_CNT / 10;
+ regs_and_bypass->dtbclk = internal.CLK1_CLK4_CURRENT_CNT / 10;
+
+ regs_and_bypass->dppclk_bypass = internal.CLK1_CLK1_BYPASS_CNTL & 0x0007;
+ if (regs_and_bypass->dppclk_bypass < 0 || regs_and_bypass->dppclk_bypass > 4)
+ regs_and_bypass->dppclk_bypass = 0;
+ regs_and_bypass->dcfclk_bypass = internal.CLK1_CLK3_BYPASS_CNTL & 0x0007;
+ if (regs_and_bypass->dcfclk_bypass < 0 || regs_and_bypass->dcfclk_bypass > 4)
+ regs_and_bypass->dcfclk_bypass = 0;
+ regs_and_bypass->dispclk_bypass = internal.CLK1_CLK0_BYPASS_CNTL & 0x0007;
+ if (regs_and_bypass->dispclk_bypass < 0 || regs_and_bypass->dispclk_bypass > 4)
+ regs_and_bypass->dispclk_bypass = 0;
+ regs_and_bypass->dprefclk_bypass = internal.CLK1_CLK2_BYPASS_CNTL & 0x0007;
+ if (regs_and_bypass->dprefclk_bypass < 0 || regs_and_bypass->dprefclk_bypass > 4)
+ regs_and_bypass->dprefclk_bypass = 0;
+
+ if (clk_mgr->base.base.ctx->dc->debug.pstate_enabled) {
+ DC_LOG_SMU("clk_type,clk_value,deepsleep_cntl,deepsleep_allow,bypass\n");
+
+ DC_LOG_SMU("dcfclk,%d,%d,%d,%s\n",
+ regs_and_bypass->dcfclk,
+ regs_and_bypass->dcf_deep_sleep_divider,
+ regs_and_bypass->dcf_deep_sleep_allow,
+ bypass_clks[(int) regs_and_bypass->dcfclk_bypass]);
+
+ DC_LOG_SMU("dprefclk,%d,N/A,N/A,%s\n",
+ regs_and_bypass->dprefclk,
+ bypass_clks[(int) regs_and_bypass->dprefclk_bypass]);
+
+ DC_LOG_SMU("dispclk,%d,N/A,N/A,%s\n",
+ regs_and_bypass->dispclk,
+ bypass_clks[(int) regs_and_bypass->dispclk_bypass]);
+
+ // REGISTER VALUES
+ DC_LOG_SMU("reg_name,value,clk_type");
+
+ DC_LOG_SMU("CLK1_CLK3_CURRENT_CNT,%d,dcfclk",
+ internal.CLK1_CLK3_CURRENT_CNT);
+
+ DC_LOG_SMU("CLK1_CLK4_CURRENT_CNT,%d,dtbclk",
+ internal.CLK1_CLK4_CURRENT_CNT);
+
+ DC_LOG_SMU("CLK1_CLK3_DS_CNTL,%d,dcf_deep_sleep_divider",
+ internal.CLK1_CLK3_DS_CNTL);
+
+ DC_LOG_SMU("CLK1_CLK3_ALLOW_DS,%d,dcf_deep_sleep_allow",
+ internal.CLK1_CLK3_ALLOW_DS);
+
+ DC_LOG_SMU("CLK1_CLK2_CURRENT_CNT,%d,dprefclk",
+ internal.CLK1_CLK2_CURRENT_CNT);
+
+ DC_LOG_SMU("CLK1_CLK0_CURRENT_CNT,%d,dispclk",
+ internal.CLK1_CLK0_CURRENT_CNT);
+
+ DC_LOG_SMU("CLK1_CLK1_CURRENT_CNT,%d,dppclk",
+ internal.CLK1_CLK1_CURRENT_CNT);
+
+ DC_LOG_SMU("CLK1_CLK3_BYPASS_CNTL,%d,dcfclk_bypass",
+ internal.CLK1_CLK3_BYPASS_CNTL);
+
+ DC_LOG_SMU("CLK1_CLK2_BYPASS_CNTL,%d,dprefclk_bypass",
+ internal.CLK1_CLK2_BYPASS_CNTL);
+
+ DC_LOG_SMU("CLK1_CLK0_BYPASS_CNTL,%d,dispclk_bypass",
+ internal.CLK1_CLK0_BYPASS_CNTL);
+
+ DC_LOG_SMU("CLK1_CLK1_BYPASS_CNTL,%d,dppclk_bypass",
+ internal.CLK1_CLK1_BYPASS_CNTL);
+
+ }
}
static bool dcn35_is_spll_ssc_enabled(struct clk_mgr *clk_mgr_base)
@@ -623,6 +732,7 @@ static void init_clk_states(struct clk_mgr *clk_mgr)
void dcn35_init_clocks(struct clk_mgr *clk_mgr)
{
struct clk_mgr_internal *clk_mgr_int = TO_CLK_MGR_INTERNAL(clk_mgr);
+ struct clk_mgr_dcn35 *clk_mgr_dcn35 = TO_CLK_MGR_DCN35(clk_mgr_int);
init_clk_states(clk_mgr);
@@ -633,6 +743,13 @@ void dcn35_init_clocks(struct clk_mgr *clk_mgr)
else
clk_mgr->dp_dto_source_clock_in_khz = clk_mgr->dprefclk_khz;
+ dcn35_save_clk_registers(&clk_mgr->boot_snapshot, clk_mgr_dcn35);
+
+ clk_mgr->clks.ref_dtbclk_khz = clk_mgr->boot_snapshot.dtbclk * 10;
+ if (clk_mgr->boot_snapshot.dtbclk > 59000) {
+ /* consider dtbclk enabled when the boot snapshot shows it running above ~590 MHz */
+ clk_mgr->clks.dtbclk_en = true;
+ }
}
static struct clk_bw_params dcn35_bw_params = {
.vram_type = Ddr4MemType,
@@ -1323,7 +1440,7 @@ void dcn35_clk_mgr_construct(
dcn35_bw_params.wm_table = ddr5_wm_table;
}
/* Saved clocks configured at boot for debug purposes */
- dcn35_dump_clk_registers(&clk_mgr->base.base.boot_snapshot, clk_mgr);
+ dcn35_save_clk_registers(&clk_mgr->base.base.boot_snapshot, clk_mgr);
clk_mgr->base.base.dprefclk_khz = dcn35_smu_get_dprefclk(&clk_mgr->base);
clk_mgr->base.base.clks.ref_dtbclk_khz = 600000;
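
dcn35_save_clk_registers() masks each *_BYPASS_CNTL readback down to its low three bits and falls back to entry 0 when the selector has no name in the five-entry bypass_clks table. A standalone sketch of that sanitising step, using a made-up register value, follows.

#include <stdio.h>

static const char *bypass_clks[5] = {
	"0x0 DFS", "0x1 REFCLK", "0x2 ERROR", "0x3 400 FCH", "0x4 600 FCH"
};

int main(void)
{
	unsigned int bypass_cntl = 0x17;              /* pretend register readback */
	unsigned int sel = bypass_cntl & 0x0007;      /* low 3 bits select source */

	if (sel > 4)                                  /* 5..7 have no table entry */
		sel = 0;

	printf("bypass source: %s\n", bypass_clks[sel]);
	return 0;
}
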
diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c
index 9ab0ee20ca6f..dcc48b5238e5 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc.c
@@ -217,11 +217,24 @@ static bool create_links(
connectors_num,
num_virtual_links);
- // condition loop on link_count to allow skipping invalid indices
+ /* When getting the number of connectors, the VBIOS reports the number of valid indices,
+ * but it doesn't say which indices are valid, and not every index has an actual connector.
+ * So, if we don't find a connector on an index, that is not an error.
+ *
+ * - There is no guarantee that the first N indices will be valid
+ * - VBIOS may report a higher number of valid indices than there are actual connectors
+ * - Some VBIOS have valid configurations for more connectors than there actually are
+ * on the card. This may be because the manufacturer used the same VBIOS for different
+ * variants of the same card.
+ */
for (i = 0; dc->link_count < connectors_num && i < MAX_LINKS; i++) {
+ struct graphics_object_id connector_id = bios->funcs->get_connector_id(bios, i);
struct link_init_data link_init_params = {0};
struct dc_link *link;
+ if (connector_id.id == CONNECTOR_ID_UNKNOWN)
+ continue;
+
DC_LOG_DC("BIOS object table - printing link object info for connector number: %d, link_index: %d", i, dc->link_count);
link_init_params.ctx = dc->ctx;
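
The create_links() change above probes connector indices up to MAX_LINKS, skips indices that report CONNECTOR_ID_UNKNOWN without treating them as errors, and stops once the VBIOS-reported connector count is reached. Below is a small user-space sketch of that enumeration pattern; the table contents and limits are invented for the example.

#include <stdio.h>

#define CONNECTOR_UNKNOWN 0
#define MAX_SLOTS 8

int main(void)
{
	/* VBIOS-style table: some indices simply have no connector. */
	int connector_id[MAX_SLOTS] = { 5, CONNECTOR_UNKNOWN, 7, CONNECTOR_UNKNOWN, 9, 0, 0, 0 };
	int connectors_num = 3;   /* count reported by the table */
	int link_count = 0;

	for (int i = 0; link_count < connectors_num && i < MAX_SLOTS; i++) {
		if (connector_id[i] == CONNECTOR_UNKNOWN)
			continue;              /* empty index, not an error */
		printf("link %d uses connector index %d (id %d)\n",
		       link_count, i, connector_id[i]);
		link_count++;
	}
	return 0;
}
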
diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h
index 59c07756130d..8c230cf8939b 100644
--- a/drivers/gpu/drm/amd/display/dc/dc.h
+++ b/drivers/gpu/drm/amd/display/dc/dc.h
@@ -1145,6 +1145,7 @@ struct dc_debug_options {
bool enable_hblank_borrow;
bool force_subvp_df_throttle;
uint32_t acpi_transition_bitmasks[MAX_PIPES];
+ bool enable_pg_cntl_debug_logs;
};
@@ -1347,7 +1348,6 @@ union surface_update_flags {
uint32_t in_transfer_func_change:1;
uint32_t input_csc_change:1;
uint32_t coeff_reduction_change:1;
- uint32_t output_tf_change:1;
uint32_t pixel_format_change:1;
uint32_t plane_size_change:1;
uint32_t gamut_remap_change:1;
diff --git a/drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c b/drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c
index 58c84f555c0f..0ce9489ac6b7 100644
--- a/drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c
+++ b/drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c
@@ -133,30 +133,34 @@ enum dsc_clk_source {
};
-static void dccg35_set_dsc_clk_rcg(struct dccg *dccg, int inst, bool enable)
+static void dccg35_set_dsc_clk_rcg(struct dccg *dccg, int inst, bool allow_rcg)
{
struct dcn_dccg *dccg_dcn = TO_DCN_DCCG(dccg);
- if (!dccg->ctx->dc->debug.root_clock_optimization.bits.dsc && enable)
+ if (!dccg->ctx->dc->debug.root_clock_optimization.bits.dsc && allow_rcg)
return;
switch (inst) {
case 0:
- REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK0_ROOT_GATE_DISABLE, enable ? 0 : 1);
+ REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK0_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1);
break;
case 1:
- REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK1_ROOT_GATE_DISABLE, enable ? 0 : 1);
+ REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK1_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1);
break;
case 2:
- REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK2_ROOT_GATE_DISABLE, enable ? 0 : 1);
+ REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK2_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1);
break;
case 3:
- REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK3_ROOT_GATE_DISABLE, enable ? 0 : 1);
+ REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK3_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1);
break;
default:
BREAK_TO_DEBUGGER();
return;
}
+
+ /* Wait for clock to ramp */
+ if (!allow_rcg)
+ udelay(10);
}
static void dccg35_set_symclk32_se_rcg(
@@ -385,35 +389,34 @@ static void dccg35_set_dtbclk_p_rcg(struct dccg *dccg, int inst, bool enable)
}
}
-static void dccg35_set_dppclk_rcg(struct dccg *dccg,
- int inst, bool enable)
+static void dccg35_set_dppclk_rcg(struct dccg *dccg, int inst, bool allow_rcg)
{
-
struct dcn_dccg *dccg_dcn = TO_DCN_DCCG(dccg);
-
- if (!dccg->ctx->dc->debug.root_clock_optimization.bits.dpp && enable)
+ if (!dccg->ctx->dc->debug.root_clock_optimization.bits.dpp && allow_rcg)
return;
switch (inst) {
case 0:
- REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK0_ROOT_GATE_DISABLE, enable ? 0 : 1);
+ REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK0_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1);
break;
case 1:
- REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK1_ROOT_GATE_DISABLE, enable ? 0 : 1);
+ REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK1_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1);
break;
case 2:
- REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK2_ROOT_GATE_DISABLE, enable ? 0 : 1);
+ REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK2_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1);
break;
case 3:
- REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK3_ROOT_GATE_DISABLE, enable ? 0 : 1);
+ REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK3_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1);
break;
default:
BREAK_TO_DEBUGGER();
break;
}
- //DC_LOG_DEBUG("%s: inst(%d) DPPCLK rcg_disable: %d\n", __func__, inst, enable ? 0 : 1);
+ /* Wait for clock to ramp */
+ if (!allow_rcg)
+ udelay(10);
}
static void dccg35_set_dpstreamclk_rcg(
@@ -1177,32 +1180,34 @@ static void dccg35_update_dpp_dto(struct dccg *dccg, int dpp_inst,
}
static void dccg35_set_dppclk_root_clock_gating(struct dccg *dccg,
- uint32_t dpp_inst, uint32_t enable)
+ uint32_t dpp_inst, uint32_t disallow_rcg)
{
struct dcn_dccg *dccg_dcn = TO_DCN_DCCG(dccg);
- if (!dccg->ctx->dc->debug.root_clock_optimization.bits.dpp)
+ if (!dccg->ctx->dc->debug.root_clock_optimization.bits.dpp && !disallow_rcg)
return;
switch (dpp_inst) {
case 0:
- REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK0_ROOT_GATE_DISABLE, enable);
+ REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK0_ROOT_GATE_DISABLE, disallow_rcg);
break;
case 1:
- REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK1_ROOT_GATE_DISABLE, enable);
+ REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK1_ROOT_GATE_DISABLE, disallow_rcg);
break;
case 2:
- REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK2_ROOT_GATE_DISABLE, enable);
+ REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK2_ROOT_GATE_DISABLE, disallow_rcg);
break;
case 3:
- REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK3_ROOT_GATE_DISABLE, enable);
+ REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK3_ROOT_GATE_DISABLE, disallow_rcg);
break;
default:
break;
}
- //DC_LOG_DEBUG("%s: dpp_inst(%d) rcg: %d\n", __func__, dpp_inst, enable);
+ /* Wait for clock to ramp */
+ if (disallow_rcg)
+ udelay(10);
}
static void dccg35_get_pixel_rate_div(
@@ -1782,8 +1787,7 @@ static void dccg35_enable_dscclk(struct dccg *dccg, int inst)
//Disable DTO
switch (inst) {
case 0:
- if (dccg->ctx->dc->debug.root_clock_optimization.bits.dsc)
- REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK0_ROOT_GATE_DISABLE, 1);
+ REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK0_ROOT_GATE_DISABLE, 1);
REG_UPDATE_2(DSCCLK0_DTO_PARAM,
DSCCLK0_DTO_PHASE, 0,
@@ -1791,8 +1795,7 @@ static void dccg35_enable_dscclk(struct dccg *dccg, int inst)
REG_UPDATE(DSCCLK_DTO_CTRL, DSCCLK0_EN, 1);
break;
case 1:
- if (dccg->ctx->dc->debug.root_clock_optimization.bits.dsc)
- REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK1_ROOT_GATE_DISABLE, 1);
+ REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK1_ROOT_GATE_DISABLE, 1);
REG_UPDATE_2(DSCCLK1_DTO_PARAM,
DSCCLK1_DTO_PHASE, 0,
@@ -1800,8 +1803,7 @@ static void dccg35_enable_dscclk(struct dccg *dccg, int inst)
REG_UPDATE(DSCCLK_DTO_CTRL, DSCCLK1_EN, 1);
break;
case 2:
- if (dccg->ctx->dc->debug.root_clock_optimization.bits.dsc)
- REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK2_ROOT_GATE_DISABLE, 1);
+ REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK2_ROOT_GATE_DISABLE, 1);
REG_UPDATE_2(DSCCLK2_DTO_PARAM,
DSCCLK2_DTO_PHASE, 0,
@@ -1809,8 +1811,7 @@ static void dccg35_enable_dscclk(struct dccg *dccg, int inst)
REG_UPDATE(DSCCLK_DTO_CTRL, DSCCLK2_EN, 1);
break;
case 3:
- if (dccg->ctx->dc->debug.root_clock_optimization.bits.dsc)
- REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK3_ROOT_GATE_DISABLE, 1);
+ REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK3_ROOT_GATE_DISABLE, 1);
REG_UPDATE_2(DSCCLK3_DTO_PARAM,
DSCCLK3_DTO_PHASE, 0,
@@ -1821,6 +1822,9 @@ static void dccg35_enable_dscclk(struct dccg *dccg, int inst)
BREAK_TO_DEBUGGER();
return;
}
+
+ /* Wait for clock to ramp */
+ udelay(10);
}
static void dccg35_disable_dscclk(struct dccg *dccg,
@@ -1864,6 +1868,9 @@ static void dccg35_disable_dscclk(struct dccg *dccg,
default:
return;
}
+
+ /* Wait for clock to ramp */
+ udelay(10);
}
static void dccg35_enable_symclk_se(struct dccg *dccg, uint32_t stream_enc_inst, uint32_t link_enc_inst)
@@ -2349,10 +2356,7 @@ static void dccg35_disable_symclk_se_cb(
void dccg35_root_gate_disable_control(struct dccg *dccg, uint32_t pipe_idx, uint32_t disable_clock_gating)
{
-
- if (dccg->ctx->dc->debug.root_clock_optimization.bits.dpp) {
- dccg35_set_dppclk_root_clock_gating(dccg, pipe_idx, disable_clock_gating);
- }
+ dccg35_set_dppclk_root_clock_gating(dccg, pipe_idx, disable_clock_gating);
}
static const struct dccg_funcs dccg35_funcs_new = {
diff --git a/drivers/gpu/drm/amd/display/dc/dce/dce_link_encoder.c b/drivers/gpu/drm/amd/display/dc/dce/dce_link_encoder.c
index 4a9d07c31bc5..0c50fe266c8a 100644
--- a/drivers/gpu/drm/amd/display/dc/dce/dce_link_encoder.c
+++ b/drivers/gpu/drm/amd/display/dc/dce/dce_link_encoder.c
@@ -896,13 +896,13 @@ void dce110_link_encoder_construct(
enc110->base.id, &bp_cap_info);
/* Override features with DCE-specific values */
- if (BP_RESULT_OK == result) {
+ if (result == BP_RESULT_OK) {
enc110->base.features.flags.bits.IS_HBR2_CAPABLE =
bp_cap_info.DP_HBR2_EN;
enc110->base.features.flags.bits.IS_HBR3_CAPABLE =
bp_cap_info.DP_HBR3_EN;
enc110->base.features.flags.bits.HDMI_6GB_EN = bp_cap_info.HDMI_6GB_EN;
- } else {
+ } else if (result != BP_RESULT_NORECORD) {
DC_LOG_WARNING("%s: Failed to get encoder_cap_info from VBIOS with error code %d!\n",
__func__,
result);
@@ -1798,13 +1798,13 @@ void dce60_link_encoder_construct(
enc110->base.id, &bp_cap_info);
/* Override features with DCE-specific values */
- if (BP_RESULT_OK == result) {
+ if (result == BP_RESULT_OK) {
enc110->base.features.flags.bits.IS_HBR2_CAPABLE =
bp_cap_info.DP_HBR2_EN;
enc110->base.features.flags.bits.IS_HBR3_CAPABLE =
bp_cap_info.DP_HBR3_EN;
enc110->base.features.flags.bits.HDMI_6GB_EN = bp_cap_info.HDMI_6GB_EN;
- } else {
+ } else if (result != BP_RESULT_NORECORD) {
DC_LOG_WARNING("%s: Failed to get encoder_cap_info from VBIOS with error code %d!\n",
__func__,
result);
diff --git a/drivers/gpu/drm/amd/display/dc/dce/dmub_replay.c b/drivers/gpu/drm/amd/display/dc/dce/dmub_replay.c
index e7a318e26d38..fcd3d86ad517 100644
--- a/drivers/gpu/drm/amd/display/dc/dce/dmub_replay.c
+++ b/drivers/gpu/drm/amd/display/dc/dce/dmub_replay.c
@@ -4,7 +4,6 @@
#include "dc.h"
#include "dc_dmub_srv.h"
-#include "dc_dp_types.h"
#include "dmub/dmub_srv.h"
#include "core_types.h"
#include "dmub_replay.h"
@@ -44,45 +43,21 @@ static void dmub_replay_get_state(struct dmub_replay *dmub, enum replay_state *s
/*
* Enable/Disable Replay.
*/
-static void dmub_replay_enable(struct dmub_replay *dmub, bool enable, bool wait, uint8_t panel_inst,
- struct dc_link *link)
+static void dmub_replay_enable(struct dmub_replay *dmub, bool enable, bool wait, uint8_t panel_inst)
{
union dmub_rb_cmd cmd;
struct dc_context *dc = dmub->ctx;
uint32_t retry_count;
enum replay_state state = REPLAY_STATE_0;
- struct pipe_ctx *pipe_ctx = NULL;
- struct resource_context *res_ctx = &link->ctx->dc->current_state->res_ctx;
- uint8_t i;
memset(&cmd, 0, sizeof(cmd));
cmd.replay_enable.header.type = DMUB_CMD__REPLAY;
cmd.replay_enable.data.panel_inst = panel_inst;
cmd.replay_enable.header.sub_type = DMUB_CMD__REPLAY_ENABLE;
- if (enable) {
+ if (enable)
cmd.replay_enable.data.enable = REPLAY_ENABLE;
- // hpo stream/link encoder assignments are not static, need to update everytime we try to enable replay
- if (link->cur_link_settings.link_rate >= LINK_RATE_UHBR10) {
- for (i = 0; i < MAX_PIPES; i++) {
- if (res_ctx &&
- res_ctx->pipe_ctx[i].stream &&
- res_ctx->pipe_ctx[i].stream->link &&
- res_ctx->pipe_ctx[i].stream->link == link &&
- res_ctx->pipe_ctx[i].stream->link->connector_signal == SIGNAL_TYPE_EDP) {
- pipe_ctx = &res_ctx->pipe_ctx[i];
- //TODO: refactor for multi edp support
- break;
- }
- }
-
- if (!pipe_ctx)
- return;
-
- cmd.replay_enable.data.hpo_stream_enc_inst = pipe_ctx->stream_res.hpo_dp_stream_enc->inst;
- cmd.replay_enable.data.hpo_link_enc_inst = pipe_ctx->link_res.hpo_dp_link_enc->inst;
- }
- } else
+ else
cmd.replay_enable.data.enable = REPLAY_DISABLE;
cmd.replay_enable.header.payload_bytes = sizeof(struct dmub_rb_cmd_replay_enable_data);
@@ -174,17 +149,6 @@ static bool dmub_replay_copy_settings(struct dmub_replay *dmub,
copy_settings_data->digbe_inst = replay_context->digbe_inst;
copy_settings_data->digfe_inst = replay_context->digfe_inst;
- if (link->cur_link_settings.link_rate >= LINK_RATE_UHBR10) {
- if (pipe_ctx->stream_res.hpo_dp_stream_enc)
- copy_settings_data->hpo_stream_enc_inst = pipe_ctx->stream_res.hpo_dp_stream_enc->inst;
- else
- copy_settings_data->hpo_stream_enc_inst = 0;
- if (pipe_ctx->link_res.hpo_dp_link_enc)
- copy_settings_data->hpo_link_enc_inst = pipe_ctx->link_res.hpo_dp_link_enc->inst;
- else
- copy_settings_data->hpo_link_enc_inst = 0;
- }
-
if (pipe_ctx->plane_res.dpp)
copy_settings_data->dpp_inst = pipe_ctx->plane_res.dpp->inst;
else
@@ -247,7 +211,6 @@ static void dmub_replay_set_coasting_vtotal(struct dmub_replay *dmub,
pCmd->header.type = DMUB_CMD__REPLAY;
pCmd->header.sub_type = DMUB_CMD__REPLAY_SET_COASTING_VTOTAL;
pCmd->header.payload_bytes = sizeof(struct dmub_cmd_replay_set_coasting_vtotal_data);
- pCmd->replay_set_coasting_vtotal_data.panel_inst = panel_inst;
pCmd->replay_set_coasting_vtotal_data.coasting_vtotal = (coasting_vtotal & 0xFFFF);
pCmd->replay_set_coasting_vtotal_data.coasting_vtotal_high = (coasting_vtotal & 0xFFFF0000) >> 16;
diff --git a/drivers/gpu/drm/amd/display/dc/dce/dmub_replay.h b/drivers/gpu/drm/amd/display/dc/dce/dmub_replay.h
index ccbe385e132c..e6346c0ffc0e 100644
--- a/drivers/gpu/drm/amd/display/dc/dce/dmub_replay.h
+++ b/drivers/gpu/drm/amd/display/dc/dce/dmub_replay.h
@@ -19,7 +19,7 @@ struct dmub_replay_funcs {
void (*replay_get_state)(struct dmub_replay *dmub, enum replay_state *state,
uint8_t panel_inst);
void (*replay_enable)(struct dmub_replay *dmub, bool enable, bool wait,
- uint8_t panel_inst, struct dc_link *link);
+ uint8_t panel_inst);
bool (*replay_copy_settings)(struct dmub_replay *dmub, struct dc_link *link,
struct replay_context *replay_context, uint8_t panel_inst);
void (*replay_set_power_opt)(struct dmub_replay *dmub, unsigned int power_opt,
diff --git a/drivers/gpu/drm/amd/display/dc/dpp/dcn10/dcn10_dpp.c b/drivers/gpu/drm/amd/display/dc/dpp/dcn10/dcn10_dpp.c
index 75fb77bca83b..01480a04f85e 100644
--- a/drivers/gpu/drm/amd/display/dc/dpp/dcn10/dcn10_dpp.c
+++ b/drivers/gpu/drm/amd/display/dc/dpp/dcn10/dcn10_dpp.c
@@ -520,6 +520,15 @@ void dpp1_dppclk_control(
REG_UPDATE(DPP_CONTROL, DPP_CLOCK_ENABLE, 0);
}
+void dpp_force_disable_cursor(struct dpp *dpp_base)
+{
+ struct dcn10_dpp *dpp = TO_DCN10_DPP(dpp_base);
+
+ /* Force disable cursor */
+ REG_UPDATE(CURSOR0_CONTROL, CUR0_ENABLE, 0);
+ dpp_base->pos.cur0_ctl.bits.cur0_enable = 0;
+}
+
static const struct dpp_funcs dcn10_dpp_funcs = {
.dpp_read_state = dpp_read_state,
.dpp_reset = dpp_reset,
diff --git a/drivers/gpu/drm/amd/display/dc/dpp/dcn10/dcn10_dpp.h b/drivers/gpu/drm/amd/display/dc/dpp/dcn10/dcn10_dpp.h
index c48139bed11f..f466182963f7 100644
--- a/drivers/gpu/drm/amd/display/dc/dpp/dcn10/dcn10_dpp.h
+++ b/drivers/gpu/drm/amd/display/dc/dpp/dcn10/dcn10_dpp.h
@@ -1525,4 +1525,6 @@ void dpp1_construct(struct dcn10_dpp *dpp1,
void dpp1_cm_get_gamut_remap(struct dpp *dpp_base,
struct dpp_grph_csc_adjustment *adjust);
+void dpp_force_disable_cursor(struct dpp *dpp_base);
+
#endif
diff --git a/drivers/gpu/drm/amd/display/dc/dpp/dcn30/dcn30_dpp.c b/drivers/gpu/drm/amd/display/dc/dpp/dcn30/dcn30_dpp.c
index 2d70586cef40..09be2a90cc79 100644
--- a/drivers/gpu/drm/amd/display/dc/dpp/dcn30/dcn30_dpp.c
+++ b/drivers/gpu/drm/amd/display/dc/dpp/dcn30/dcn30_dpp.c
@@ -1494,6 +1494,7 @@ static struct dpp_funcs dcn30_dpp_funcs = {
.dpp_dppclk_control = dpp1_dppclk_control,
.dpp_set_hdr_multiplier = dpp3_set_hdr_multiplier,
.dpp_get_gamut_remap = dpp3_cm_get_gamut_remap,
+ .dpp_force_disable_cursor = dpp_force_disable_cursor,
};
diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c
index 3207addbd4eb..5e57bd1a08e7 100644
--- a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c
+++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c
@@ -955,7 +955,7 @@ enum dc_status dcn20_enable_stream_timing(
return DC_ERROR_UNEXPECTED;
}
- fsleep(stream->timing.v_total * (stream->timing.h_total * 10000u / stream->timing.pix_clk_100hz));
+ udelay(stream->timing.v_total * (stream->timing.h_total * 10000u / stream->timing.pix_clk_100hz));
params.vertical_total_min = stream->adjust.v_total_min;
params.vertical_total_max = stream->adjust.v_total_max;
@@ -1982,10 +1982,8 @@ static void dcn20_program_pipe(
* updating on slave planes
*/
if (pipe_ctx->update_flags.bits.enable ||
- pipe_ctx->update_flags.bits.plane_changed ||
- pipe_ctx->stream->update_flags.bits.out_tf ||
- (pipe_ctx->plane_state &&
- pipe_ctx->plane_state->update_flags.bits.output_tf_change))
+ pipe_ctx->update_flags.bits.plane_changed ||
+ pipe_ctx->stream->update_flags.bits.out_tf)
hws->funcs.set_output_transfer_func(dc, pipe_ctx, pipe_ctx->stream);
/* If the pipe has been enabled or has a different opp, we
diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn314/dcn314_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn314/dcn314_hwseq.c
index e68f21fd5f0f..560984533950 100644
--- a/drivers/gpu/drm/amd/display/dc/hwss/dcn314/dcn314_hwseq.c
+++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn314/dcn314_hwseq.c
@@ -528,3 +528,75 @@ void dcn314_disable_link_output(struct dc_link *link,
apply_symclk_on_tx_off_wa(link);
}
+
+/**
+ * dcn314_dpp_pg_control - DPP power gate control.
+ *
+ * @hws: dce_hwseq reference.
+ * @dpp_inst: DPP instance reference.
+ * @power_on: true to power the DPP instance up (ungate it), false to power it down.
+ *
+ * Enable or disable power gating for the given DPP instance. If DPP power
+ * gating is disabled via the debug option, force-disable the cursor on that
+ * instance instead.
+ */
+void dcn314_dpp_pg_control(
+ struct dce_hwseq *hws,
+ unsigned int dpp_inst,
+ bool power_on)
+{
+ uint32_t power_gate = power_on ? 0 : 1;
+ uint32_t pwr_status = power_on ? 0 : 2;
+
+ if (hws->ctx->dc->debug.disable_dpp_power_gate) {
+ /* Workaround for DCN314 with disabled power gating */
+ if (!power_on) {
+ /* Force disable cursor if power gating is disabled */
+ struct dpp *dpp = hws->ctx->dc->res_pool->dpps[dpp_inst];
+ if (dpp && dpp->funcs->dpp_force_disable_cursor)
+ dpp->funcs->dpp_force_disable_cursor(dpp);
+ }
+ return;
+ }
+ if (REG(DOMAIN1_PG_CONFIG) == 0)
+ return;
+
+ switch (dpp_inst) {
+ case 0: /* DPP0 */
+ REG_UPDATE(DOMAIN1_PG_CONFIG,
+ DOMAIN1_POWER_GATE, power_gate);
+
+ REG_WAIT(DOMAIN1_PG_STATUS,
+ DOMAIN1_PGFSM_PWR_STATUS, pwr_status,
+ 1, 1000);
+ break;
+ case 1: /* DPP1 */
+ REG_UPDATE(DOMAIN3_PG_CONFIG,
+ DOMAIN3_POWER_GATE, power_gate);
+
+ REG_WAIT(DOMAIN3_PG_STATUS,
+ DOMAIN3_PGFSM_PWR_STATUS, pwr_status,
+ 1, 1000);
+ break;
+ case 2: /* DPP2 */
+ REG_UPDATE(DOMAIN5_PG_CONFIG,
+ DOMAIN5_POWER_GATE, power_gate);
+
+ REG_WAIT(DOMAIN5_PG_STATUS,
+ DOMAIN5_PGFSM_PWR_STATUS, pwr_status,
+ 1, 1000);
+ break;
+ case 3: /* DPP3 */
+ REG_UPDATE(DOMAIN7_PG_CONFIG,
+ DOMAIN7_POWER_GATE, power_gate);
+
+ REG_WAIT(DOMAIN7_PG_STATUS,
+ DOMAIN7_PGFSM_PWR_STATUS, pwr_status,
+ 1, 1000);
+ break;
+ default:
+ BREAK_TO_DEBUGGER();
+ break;
+ }
+}
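
dcn314_dpp_pg_control() above programs POWER_GATE as 0/1 and then polls PGFSM_PWR_STATUS for 0 (powered up) or 2 (powered down). The tiny sketch below just shows that encoding; it is illustrative only and does not touch any registers.

#include <stdio.h>
#include <stdbool.h>

static void pg_request(bool power_on, unsigned int *gate, unsigned int *status)
{
	*gate = power_on ? 0 : 1;     /* POWER_GATE bit to program */
	*status = power_on ? 0 : 2;   /* PGFSM_PWR_STATUS value to wait for */
}

int main(void)
{
	unsigned int gate, status;

	pg_request(true, &gate, &status);
	printf("power on : gate=%u wait-status=%u\n", gate, status);
	pg_request(false, &gate, &status);
	printf("power off: gate=%u wait-status=%u\n", gate, status);
	return 0;
}
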
diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn314/dcn314_hwseq.h b/drivers/gpu/drm/amd/display/dc/hwss/dcn314/dcn314_hwseq.h
index 2305ad282f21..6c072d0274ea 100644
--- a/drivers/gpu/drm/amd/display/dc/hwss/dcn314/dcn314_hwseq.h
+++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn314/dcn314_hwseq.h
@@ -47,4 +47,6 @@ void dcn314_dpp_root_clock_control(struct dce_hwseq *hws, unsigned int dpp_inst,
void dcn314_disable_link_output(struct dc_link *link, const struct link_resource *link_res, enum signal_type signal);
+void dcn314_dpp_pg_control(struct dce_hwseq *hws, unsigned int dpp_inst, bool power_on);
+
#endif /* __DC_HWSS_DCN314_H__ */
diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn314/dcn314_init.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn314/dcn314_init.c
index f5112742edf9..9f454fa90e65 100644
--- a/drivers/gpu/drm/amd/display/dc/hwss/dcn314/dcn314_init.c
+++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn314/dcn314_init.c
@@ -141,6 +141,7 @@ static const struct hwseq_private_funcs dcn314_private_funcs = {
.enable_power_gating_plane = dcn314_enable_power_gating_plane,
.dpp_root_clock_control = dcn314_dpp_root_clock_control,
.hubp_pg_control = dcn31_hubp_pg_control,
+ .dpp_pg_control = dcn314_dpp_pg_control,
.program_all_writeback_pipes_in_tree = dcn30_program_all_writeback_pipes_in_tree,
.update_odm = dcn314_update_odm,
.dsc_pg_control = dcn314_dsc_pg_control,
diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c
index a267f574b619..764eff6a4ec6 100644
--- a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c
+++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c
@@ -113,6 +113,14 @@ static void enable_memory_low_power(struct dc *dc)
}
#endif
+static void print_pg_status(struct dc *dc, const char *debug_func, const char *debug_log)
+{
+ if (dc->debug.enable_pg_cntl_debug_logs && dc->res_pool->pg_cntl) {
+ if (dc->res_pool->pg_cntl->funcs->print_pg_status)
+ dc->res_pool->pg_cntl->funcs->print_pg_status(dc->res_pool->pg_cntl, debug_func, debug_log);
+ }
+}
+
void dcn35_set_dmu_fgcg(struct dce_hwseq *hws, bool enable)
{
REG_UPDATE_3(DMU_CLK_CNTL,
@@ -137,6 +145,8 @@ void dcn35_init_hw(struct dc *dc)
uint32_t user_level = MAX_BACKLIGHT_LEVEL;
int i;
+ print_pg_status(dc, __func__, ": start");
+
if (dc->clk_mgr && dc->clk_mgr->funcs->init_clocks)
dc->clk_mgr->funcs->init_clocks(dc->clk_mgr);
@@ -200,10 +210,7 @@ void dcn35_init_hw(struct dc *dc)
/* we want to turn off all dp displays before doing detection */
dc->link_srv->blank_all_dp_displays(dc);
-/*
- if (hws->funcs.enable_power_gating_plane)
- hws->funcs.enable_power_gating_plane(dc->hwseq, true);
-*/
+
if (res_pool->hubbub && res_pool->hubbub->funcs->dchubbub_init)
res_pool->hubbub->funcs->dchubbub_init(dc->res_pool->hubbub);
/* If taking control over from VBIOS, we may want to optimize our first
@@ -236,6 +243,8 @@ void dcn35_init_hw(struct dc *dc)
}
hws->funcs.init_pipes(dc, dc->current_state);
+ print_pg_status(dc, __func__, ": after init_pipes");
+
if (dc->res_pool->hubbub->funcs->allow_self_refresh_control &&
!dc->res_pool->hubbub->ctx->dc->debug.disable_stutter)
dc->res_pool->hubbub->funcs->allow_self_refresh_control(dc->res_pool->hubbub,
@@ -312,6 +321,7 @@ void dcn35_init_hw(struct dc *dc)
if (dc->res_pool->pg_cntl->funcs->init_pg_status)
dc->res_pool->pg_cntl->funcs->init_pg_status(dc->res_pool->pg_cntl);
}
+ print_pg_status(dc, __func__, ": after init_pg_status");
}
static void update_dsc_on_stream(struct pipe_ctx *pipe_ctx, bool enable)
@@ -500,97 +510,6 @@ void dcn35_physymclk_root_clock_control(struct dce_hwseq *hws, unsigned int phy_
}
}
-void dcn35_dsc_pg_control(
- struct dce_hwseq *hws,
- unsigned int dsc_inst,
- bool power_on)
-{
- uint32_t power_gate = power_on ? 0 : 1;
- uint32_t pwr_status = power_on ? 0 : 2;
- uint32_t org_ip_request_cntl = 0;
-
- if (hws->ctx->dc->debug.disable_dsc_power_gate)
- return;
- if (hws->ctx->dc->debug.ignore_pg)
- return;
- REG_GET(DC_IP_REQUEST_CNTL, IP_REQUEST_EN, &org_ip_request_cntl);
- if (org_ip_request_cntl == 0)
- REG_SET(DC_IP_REQUEST_CNTL, 0, IP_REQUEST_EN, 1);
-
- switch (dsc_inst) {
- case 0: /* DSC0 */
- REG_UPDATE(DOMAIN16_PG_CONFIG,
- DOMAIN_POWER_GATE, power_gate);
-
- REG_WAIT(DOMAIN16_PG_STATUS,
- DOMAIN_PGFSM_PWR_STATUS, pwr_status,
- 1, 1000);
- break;
- case 1: /* DSC1 */
- REG_UPDATE(DOMAIN17_PG_CONFIG,
- DOMAIN_POWER_GATE, power_gate);
-
- REG_WAIT(DOMAIN17_PG_STATUS,
- DOMAIN_PGFSM_PWR_STATUS, pwr_status,
- 1, 1000);
- break;
- case 2: /* DSC2 */
- REG_UPDATE(DOMAIN18_PG_CONFIG,
- DOMAIN_POWER_GATE, power_gate);
-
- REG_WAIT(DOMAIN18_PG_STATUS,
- DOMAIN_PGFSM_PWR_STATUS, pwr_status,
- 1, 1000);
- break;
- case 3: /* DSC3 */
- REG_UPDATE(DOMAIN19_PG_CONFIG,
- DOMAIN_POWER_GATE, power_gate);
-
- REG_WAIT(DOMAIN19_PG_STATUS,
- DOMAIN_PGFSM_PWR_STATUS, pwr_status,
- 1, 1000);
- break;
- default:
- BREAK_TO_DEBUGGER();
- break;
- }
-
- if (org_ip_request_cntl == 0)
- REG_SET(DC_IP_REQUEST_CNTL, 0, IP_REQUEST_EN, 0);
-}
-
-void dcn35_enable_power_gating_plane(struct dce_hwseq *hws, bool enable)
-{
- bool force_on = true; /* disable power gating */
- uint32_t org_ip_request_cntl = 0;
-
- if (hws->ctx->dc->debug.disable_hubp_power_gate)
- return;
- if (hws->ctx->dc->debug.ignore_pg)
- return;
- REG_GET(DC_IP_REQUEST_CNTL, IP_REQUEST_EN, &org_ip_request_cntl);
- if (org_ip_request_cntl == 0)
- REG_SET(DC_IP_REQUEST_CNTL, 0, IP_REQUEST_EN, 1);
- /* DCHUBP0/1/2/3/4/5 */
- REG_UPDATE(DOMAIN0_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on);
- REG_UPDATE(DOMAIN2_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on);
- /* DPP0/1/2/3/4/5 */
- REG_UPDATE(DOMAIN1_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on);
- REG_UPDATE(DOMAIN3_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on);
-
- force_on = true; /* disable power gating */
- if (enable && !hws->ctx->dc->debug.disable_dsc_power_gate)
- force_on = false;
-
- /* DCS0/1/2/3/4 */
- REG_UPDATE(DOMAIN16_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on);
- REG_UPDATE(DOMAIN17_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on);
- REG_UPDATE(DOMAIN18_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on);
- REG_UPDATE(DOMAIN19_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on);
-
-
-}
-
/* In headless boot cases, DIG may be turned
* on which causes HW/SW discrepancies.
* To avoid this, power down hardware on boot
@@ -1453,6 +1372,8 @@ void dcn35_prepare_bandwidth(
}
dcn20_prepare_bandwidth(dc, context);
+
+ print_pg_status(dc, __func__, ": after rcg and power up");
}
void dcn35_optimize_bandwidth(
@@ -1461,6 +1382,8 @@ void dcn35_optimize_bandwidth(
{
struct pg_block_update pg_update_state;
+ print_pg_status(dc, __func__, ": before rcg and power up");
+
dcn20_optimize_bandwidth(dc, context);
if (dc->hwss.calc_blocks_to_gate) {
@@ -1472,6 +1395,8 @@ void dcn35_optimize_bandwidth(
if (dc->hwss.root_clock_control)
dc->hwss.root_clock_control(dc, &pg_update_state, false);
}
+
+ print_pg_status(dc, __func__, ": after rcg and power up");
}
void dcn35_set_drr(struct pipe_ctx **pipe_ctx,
diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c
index a3ccf805bd16..aefb7c473741 100644
--- a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c
+++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c
@@ -115,7 +115,6 @@ static const struct hw_sequencer_funcs dcn35_funcs = {
.exit_optimized_pwr_state = dcn21_exit_optimized_pwr_state,
.update_visual_confirm_color = dcn10_update_visual_confirm_color,
.apply_idle_power_optimizations = dcn35_apply_idle_power_optimizations,
- .update_dsc_pg = dcn32_update_dsc_pg,
.calc_blocks_to_gate = dcn35_calc_blocks_to_gate,
.calc_blocks_to_ungate = dcn35_calc_blocks_to_ungate,
.hw_block_power_up = dcn35_hw_block_power_up,
@@ -150,7 +149,6 @@ static const struct hwseq_private_funcs dcn35_private_funcs = {
.plane_atomic_disable = dcn35_plane_atomic_disable,
//.plane_atomic_disable = dcn20_plane_atomic_disable,/*todo*/
//.hubp_pg_control = dcn35_hubp_pg_control,
- .enable_power_gating_plane = dcn35_enable_power_gating_plane,
.dpp_root_clock_control = dcn35_dpp_root_clock_control,
.dpstream_root_clock_control = dcn35_dpstream_root_clock_control,
.physymclk_root_clock_control = dcn35_physymclk_root_clock_control,
@@ -165,7 +163,6 @@ static const struct hwseq_private_funcs dcn35_private_funcs = {
.calculate_dccg_k1_k2_values = dcn32_calculate_dccg_k1_k2_values,
.resync_fifo_dccg_dio = dcn314_resync_fifo_dccg_dio,
.is_dp_dig_pixel_rate_div_policy = dcn35_is_dp_dig_pixel_rate_div_policy,
- .dsc_pg_control = dcn35_dsc_pg_control,
.dsc_pg_status = dcn32_dsc_pg_status,
.enable_plane = dcn35_enable_plane,
.wait_for_pipe_update_if_needed = dcn10_wait_for_pipe_update_if_needed,
diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn351/dcn351_init.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn351/dcn351_init.c
index 58f2be2a326b..a580a55695c3 100644
--- a/drivers/gpu/drm/amd/display/dc/hwss/dcn351/dcn351_init.c
+++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn351/dcn351_init.c
@@ -114,7 +114,6 @@ static const struct hw_sequencer_funcs dcn351_funcs = {
.exit_optimized_pwr_state = dcn21_exit_optimized_pwr_state,
.update_visual_confirm_color = dcn10_update_visual_confirm_color,
.apply_idle_power_optimizations = dcn35_apply_idle_power_optimizations,
- .update_dsc_pg = dcn32_update_dsc_pg,
.calc_blocks_to_gate = dcn351_calc_blocks_to_gate,
.calc_blocks_to_ungate = dcn351_calc_blocks_to_ungate,
.hw_block_power_up = dcn351_hw_block_power_up,
@@ -145,7 +144,6 @@ static const struct hwseq_private_funcs dcn351_private_funcs = {
.plane_atomic_disable = dcn35_plane_atomic_disable,
//.plane_atomic_disable = dcn20_plane_atomic_disable,/*todo*/
//.hubp_pg_control = dcn35_hubp_pg_control,
- .enable_power_gating_plane = dcn35_enable_power_gating_plane,
.dpp_root_clock_control = dcn35_dpp_root_clock_control,
.dpstream_root_clock_control = dcn35_dpstream_root_clock_control,
.physymclk_root_clock_control = dcn35_physymclk_root_clock_control,
@@ -159,7 +157,6 @@ static const struct hwseq_private_funcs dcn351_private_funcs = {
.setup_hpo_hw_control = dcn35_setup_hpo_hw_control,
.calculate_dccg_k1_k2_values = dcn32_calculate_dccg_k1_k2_values,
.is_dp_dig_pixel_rate_div_policy = dcn35_is_dp_dig_pixel_rate_div_policy,
- .dsc_pg_control = dcn35_dsc_pg_control,
.dsc_pg_status = dcn32_dsc_pg_status,
.enable_plane = dcn35_enable_plane,
.wait_for_pipe_update_if_needed = dcn10_wait_for_pipe_update_if_needed,
diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c
index cc9f40d97af2..61167c19359d 100644
--- a/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c
+++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c
@@ -2019,10 +2019,8 @@ void dcn401_program_pipe(
* updating on slave planes
*/
if (pipe_ctx->update_flags.bits.enable ||
- pipe_ctx->update_flags.bits.plane_changed ||
- pipe_ctx->stream->update_flags.bits.out_tf ||
- (pipe_ctx->plane_state &&
- pipe_ctx->plane_state->update_flags.bits.output_tf_change))
+ pipe_ctx->update_flags.bits.plane_changed ||
+ pipe_ctx->stream->update_flags.bits.out_tf)
hws->funcs.set_output_transfer_func(dc, pipe_ctx, pipe_ctx->stream);
/* If the pipe has been enabled or has a different opp, we
diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h b/drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h
index 0c5675d1c593..1b7c085dc2cc 100644
--- a/drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h
+++ b/drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h
@@ -349,6 +349,9 @@ struct dpp_funcs {
struct dpp *dpp_base,
enum dc_color_space color_space,
struct dc_csc_transform cursor_csc_color_matrix);
+
+ void (*dpp_force_disable_cursor)(struct dpp *dpp_base);
+
};
diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/pg_cntl.h b/drivers/gpu/drm/amd/display/dc/inc/hw/pg_cntl.h
index 44f86cc2d1d6..227e3f8d7e5f 100644
--- a/drivers/gpu/drm/amd/display/dc/inc/hw/pg_cntl.h
+++ b/drivers/gpu/drm/amd/display/dc/inc/hw/pg_cntl.h
@@ -49,6 +49,7 @@ struct pg_cntl_funcs {
void (*mem_pg_control)(struct pg_cntl *pg_cntl, bool power_on);
void (*dio_pg_control)(struct pg_cntl *pg_cntl, bool power_on);
void (*init_pg_status)(struct pg_cntl *pg_cntl);
+ void (*print_pg_status)(struct pg_cntl *pg_cntl, const char *debug_func, const char *debug_log);
};
#endif //__DC_PG_CNTL_H__
diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_edp_panel_control.c b/drivers/gpu/drm/amd/display/dc/link/protocols/link_edp_panel_control.c
index e7927b8f5ba3..98ec9b5a559c 100644
--- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_edp_panel_control.c
+++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_edp_panel_control.c
@@ -944,7 +944,7 @@ bool edp_set_replay_allow_active(struct dc_link *link, const bool *allow_active,
// TODO: Handle mux change case if force_static is set
// If force_static is set, just change the replay_allow_active state directly
if (replay != NULL && link->replay_settings.replay_feature_enabled)
- replay->funcs->replay_enable(replay, *allow_active, wait, panel_inst, link);
+ replay->funcs->replay_enable(replay, *allow_active, wait, panel_inst);
link->replay_settings.replay_allow_active = *allow_active;
}
diff --git a/drivers/gpu/drm/amd/display/dc/pg/dcn35/dcn35_pg_cntl.c b/drivers/gpu/drm/amd/display/dc/pg/dcn35/dcn35_pg_cntl.c
index af21c0a27f86..72bd43f9bbe2 100644
--- a/drivers/gpu/drm/amd/display/dc/pg/dcn35/dcn35_pg_cntl.c
+++ b/drivers/gpu/drm/amd/display/dc/pg/dcn35/dcn35_pg_cntl.c
@@ -79,16 +79,12 @@ void pg_cntl35_dsc_pg_control(struct pg_cntl *pg_cntl, unsigned int dsc_inst, bo
uint32_t power_gate = power_on ? 0 : 1;
uint32_t pwr_status = power_on ? 0 : 2;
uint32_t org_ip_request_cntl = 0;
- bool block_enabled;
-
- /*need to enable dscclk regardless DSC_PG*/
- if (pg_cntl->ctx->dc->res_pool->dccg->funcs->enable_dsc && power_on)
- pg_cntl->ctx->dc->res_pool->dccg->funcs->enable_dsc(
- pg_cntl->ctx->dc->res_pool->dccg, dsc_inst);
+ bool block_enabled = false;
+ bool skip_pg = pg_cntl->ctx->dc->debug.ignore_pg ||
+ pg_cntl->ctx->dc->debug.disable_dsc_power_gate ||
+ pg_cntl->ctx->dc->idle_optimizations_allowed;
- if (pg_cntl->ctx->dc->debug.ignore_pg ||
- pg_cntl->ctx->dc->debug.disable_dsc_power_gate ||
- pg_cntl->ctx->dc->idle_optimizations_allowed)
+ if (skip_pg && !power_on)
return;
block_enabled = pg_cntl35_dsc_pg_status(pg_cntl, dsc_inst);
@@ -111,7 +107,7 @@ void pg_cntl35_dsc_pg_control(struct pg_cntl *pg_cntl, unsigned int dsc_inst, bo
REG_WAIT(DOMAIN16_PG_STATUS,
DOMAIN_PGFSM_PWR_STATUS, pwr_status,
- 1, 1000);
+ 1, 10000);
break;
case 1: /* DSC1 */
REG_UPDATE(DOMAIN17_PG_CONFIG,
@@ -119,7 +115,7 @@ void pg_cntl35_dsc_pg_control(struct pg_cntl *pg_cntl, unsigned int dsc_inst, bo
REG_WAIT(DOMAIN17_PG_STATUS,
DOMAIN_PGFSM_PWR_STATUS, pwr_status,
- 1, 1000);
+ 1, 10000);
break;
case 2: /* DSC2 */
REG_UPDATE(DOMAIN18_PG_CONFIG,
@@ -127,7 +123,7 @@ void pg_cntl35_dsc_pg_control(struct pg_cntl *pg_cntl, unsigned int dsc_inst, bo
REG_WAIT(DOMAIN18_PG_STATUS,
DOMAIN_PGFSM_PWR_STATUS, pwr_status,
- 1, 1000);
+ 1, 10000);
break;
case 3: /* DSC3 */
REG_UPDATE(DOMAIN19_PG_CONFIG,
@@ -135,7 +131,7 @@ void pg_cntl35_dsc_pg_control(struct pg_cntl *pg_cntl, unsigned int dsc_inst, bo
REG_WAIT(DOMAIN19_PG_STATUS,
DOMAIN_PGFSM_PWR_STATUS, pwr_status,
- 1, 1000);
+ 1, 10000);
break;
default:
BREAK_TO_DEBUGGER();
@@ -144,12 +140,6 @@ void pg_cntl35_dsc_pg_control(struct pg_cntl *pg_cntl, unsigned int dsc_inst, bo
if (dsc_inst < MAX_PIPES)
pg_cntl->pg_pipe_res_enable[PG_DSC][dsc_inst] = power_on;
-
- if (pg_cntl->ctx->dc->res_pool->dccg->funcs->disable_dsc && !power_on) {
- /*this is to disable dscclk*/
- pg_cntl->ctx->dc->res_pool->dccg->funcs->disable_dsc(
- pg_cntl->ctx->dc->res_pool->dccg, dsc_inst);
- }
}
static bool pg_cntl35_hubp_dpp_pg_status(struct pg_cntl *pg_cntl, unsigned int hubp_dpp_inst)
@@ -189,11 +179,12 @@ void pg_cntl35_hubp_dpp_pg_control(struct pg_cntl *pg_cntl, unsigned int hubp_dp
uint32_t pwr_status = power_on ? 0 : 2;
uint32_t org_ip_request_cntl;
bool block_enabled;
+ bool skip_pg = pg_cntl->ctx->dc->debug.ignore_pg ||
+ pg_cntl->ctx->dc->debug.disable_hubp_power_gate ||
+ pg_cntl->ctx->dc->debug.disable_dpp_power_gate ||
+ pg_cntl->ctx->dc->idle_optimizations_allowed;
- if (pg_cntl->ctx->dc->debug.ignore_pg ||
- pg_cntl->ctx->dc->debug.disable_hubp_power_gate ||
- pg_cntl->ctx->dc->debug.disable_dpp_power_gate ||
- pg_cntl->ctx->dc->idle_optimizations_allowed)
+ if (skip_pg && !power_on)
return;
block_enabled = pg_cntl35_hubp_dpp_pg_status(pg_cntl, hubp_dpp_inst);
@@ -213,22 +204,22 @@ void pg_cntl35_hubp_dpp_pg_control(struct pg_cntl *pg_cntl, unsigned int hubp_dp
case 0:
/* DPP0 & HUBP0 */
REG_UPDATE(DOMAIN0_PG_CONFIG, DOMAIN_POWER_GATE, power_gate);
- REG_WAIT(DOMAIN0_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 1000);
+ REG_WAIT(DOMAIN0_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 10000);
break;
case 1:
/* DPP1 & HUBP1 */
REG_UPDATE(DOMAIN1_PG_CONFIG, DOMAIN_POWER_GATE, power_gate);
- REG_WAIT(DOMAIN1_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 1000);
+ REG_WAIT(DOMAIN1_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 10000);
break;
case 2:
/* DPP2 & HUBP2 */
REG_UPDATE(DOMAIN2_PG_CONFIG, DOMAIN_POWER_GATE, power_gate);
- REG_WAIT(DOMAIN2_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 1000);
+ REG_WAIT(DOMAIN2_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 10000);
break;
case 3:
/* DPP3 & HUBP3 */
REG_UPDATE(DOMAIN3_PG_CONFIG, DOMAIN_POWER_GATE, power_gate);
- REG_WAIT(DOMAIN3_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 1000);
+ REG_WAIT(DOMAIN3_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 10000);
break;
default:
BREAK_TO_DEBUGGER();
@@ -501,6 +492,36 @@ void pg_cntl35_init_pg_status(struct pg_cntl *pg_cntl)
pg_cntl->pg_res_enable[PG_DWB] = block_enabled;
}
+static void pg_cntl35_print_pg_status(struct pg_cntl *pg_cntl, const char *debug_func, const char *debug_log)
+{
+ int i = 0;
+ bool block_enabled = false;
+
+ DC_LOG_DEBUG("%s: %s", debug_func, debug_log);
+
+ DC_LOG_DEBUG("PG_CNTL status:\n");
+
+ block_enabled = pg_cntl35_io_clk_status(pg_cntl);
+ DC_LOG_DEBUG("ONO0=%d (DCCG, DIO, DCIO)\n", block_enabled ? 1 : 0);
+
+ block_enabled = pg_cntl35_mem_status(pg_cntl);
+ DC_LOG_DEBUG("ONO1=%d (DCHUBBUB, DCHVM, DCHUBBUBMEM)\n", block_enabled ? 1 : 0);
+
+ block_enabled = pg_cntl35_plane_otg_status(pg_cntl);
+ DC_LOG_DEBUG("ONO2=%d (MPC, OPP, OPTC, DWB)\n", block_enabled ? 1 : 0);
+
+ block_enabled = pg_cntl35_hpo_pg_status(pg_cntl);
+ DC_LOG_DEBUG("ONO3=%d (HPO)\n", block_enabled ? 1 : 0);
+
+ for (i = 0; i < pg_cntl->ctx->dc->res_pool->pipe_count; i++) {
+ block_enabled = pg_cntl35_hubp_dpp_pg_status(pg_cntl, i);
+ DC_LOG_DEBUG("ONO%d=%d (DCHUBP%d, DPP%d)\n", 4 + i * 2, block_enabled ? 1 : 0, i, i);
+
+ block_enabled = pg_cntl35_dsc_pg_status(pg_cntl, i);
+ DC_LOG_DEBUG("ONO%d=%d (DSC%d)\n", 5 + i * 2, block_enabled ? 1 : 0, i);
+ }
+}
+
static const struct pg_cntl_funcs pg_cntl35_funcs = {
.init_pg_status = pg_cntl35_init_pg_status,
.dsc_pg_control = pg_cntl35_dsc_pg_control,
@@ -511,7 +532,8 @@ static const struct pg_cntl_funcs pg_cntl35_funcs = {
.mpcc_pg_control = pg_cntl35_mpcc_pg_control,
.opp_pg_control = pg_cntl35_opp_pg_control,
.optc_pg_control = pg_cntl35_optc_pg_control,
- .dwb_pg_control = pg_cntl35_dwb_pg_control
+ .dwb_pg_control = pg_cntl35_dwb_pg_control,
+ .print_pg_status = pg_cntl35_print_pg_status
};
struct pg_cntl *pg_cntl35_create(
diff --git a/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h b/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h
index c587b3441e07..6a69a788abe8 100644
--- a/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h
+++ b/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h
@@ -4048,14 +4048,6 @@ struct dmub_cmd_replay_copy_settings_data {
*/
uint8_t digbe_inst;
/**
- * @hpo_stream_enc_inst: HPO stream encoder instance
- */
- uint8_t hpo_stream_enc_inst;
- /**
- * @hpo_link_enc_inst: HPO link encoder instance
- */
- uint8_t hpo_link_enc_inst;
- /**
* AUX HW instance.
*/
uint8_t aux_inst;
@@ -4159,18 +4151,6 @@ struct dmub_rb_cmd_replay_enable_data {
* This does not support HDMI/DP2 for now.
*/
uint8_t phy_rate;
- /**
- * @hpo_stream_enc_inst: HPO stream encoder instance
- */
- uint8_t hpo_stream_enc_inst;
- /**
- * @hpo_link_enc_inst: HPO link encoder instance
- */
- uint8_t hpo_link_enc_inst;
- /**
- * @pad: Align structure to 4 byte boundary.
- */
- uint8_t pad[2];
};
/**
diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_psp.c b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_psp.c
index e58e7b93810b..6b7db8ec9a53 100644
--- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_psp.c
+++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_psp.c
@@ -260,6 +260,9 @@ enum mod_hdcp_status mod_hdcp_hdcp1_create_session(struct mod_hdcp *hdcp)
return MOD_HDCP_STATUS_FAILURE;
}
+ if (!display)
+ return MOD_HDCP_STATUS_DISPLAY_NOT_FOUND;
+
hdcp_cmd = (struct ta_hdcp_shared_memory *)psp->hdcp_context.context.mem_context.shared_buf;
mutex_lock(&psp->hdcp_context.mutex);
diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index 4b64851fdb42..5fbfe7333b54 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -3458,14 +3458,16 @@ static umode_t hwmon_attributes_visible(struct kobject *kobj,
effective_mode &= ~S_IWUSR;
/* not implemented yet for APUs other than GC 10.3.1 (vangogh) and 9.4.3 */
- if (((adev->family == AMDGPU_FAMILY_SI) ||
- ((adev->flags & AMD_IS_APU) && (gc_ver != IP_VERSION(10, 3, 1)) &&
- (gc_ver != IP_VERSION(9, 4, 3) && gc_ver != IP_VERSION(9, 4, 4)))) &&
- (attr == &sensor_dev_attr_power1_cap_max.dev_attr.attr ||
- attr == &sensor_dev_attr_power1_cap_min.dev_attr.attr ||
- attr == &sensor_dev_attr_power1_cap.dev_attr.attr ||
- attr == &sensor_dev_attr_power1_cap_default.dev_attr.attr))
- return 0;
+ if (attr == &sensor_dev_attr_power1_cap_max.dev_attr.attr ||
+ attr == &sensor_dev_attr_power1_cap_min.dev_attr.attr ||
+ attr == &sensor_dev_attr_power1_cap.dev_attr.attr ||
+ attr == &sensor_dev_attr_power1_cap_default.dev_attr.attr) {
+ if (adev->family == AMDGPU_FAMILY_SI ||
+ ((adev->flags & AMD_IS_APU) && gc_ver != IP_VERSION(10, 3, 1) &&
+ (gc_ver != IP_VERSION(9, 4, 3) && gc_ver != IP_VERSION(9, 4, 4))) ||
+ (amdgpu_sriov_vf(adev) && gc_ver == IP_VERSION(11, 0, 3)))
+ return 0;
+ }
/* not implemented yet for APUs having < GC 9.3.0 (Renoir) */
if (((adev->family == AMDGPU_FAMILY_SI) ||
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index b47cb4a5f488..408f05dfab90 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -2236,7 +2236,7 @@ static int smu_resume(struct amdgpu_ip_block *ip_block)
return ret;
}
- if (smu_dpm_ctx->dpm_level == AMD_DPM_FORCED_LEVEL_MANUAL) {
+ if (smu_dpm_ctx->dpm_level == AMD_DPM_FORCED_LEVEL_MANUAL && smu->od_enabled) {
ret = smu_od_edit_dpm_table(smu, PP_OD_COMMIT_DPM_TABLE, NULL, 0);
if (ret)
return ret;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c
index 3aea32baea3d..f32474af90b3 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c
@@ -1697,9 +1697,11 @@ static int smu_v14_0_2_get_power_limit(struct smu_context *smu,
uint32_t *min_power_limit)
{
struct smu_table_context *table_context = &smu->smu_table;
+ struct smu_14_0_2_powerplay_table *powerplay_table =
+ table_context->power_play_table;
PPTable_t *pptable = table_context->driver_pptable;
CustomSkuTable_t *skutable = &pptable->CustomSkuTable;
- uint32_t power_limit;
+ uint32_t power_limit, od_percent_upper = 0, od_percent_lower = 0;
uint32_t msg_limit = pptable->SkuTable.MsgLimits.Power[PPT_THROTTLER_PPT0][POWER_SOURCE_AC];
if (smu_v14_0_get_current_power_limit(smu, &power_limit))
@@ -1712,11 +1714,29 @@ static int smu_v14_0_2_get_power_limit(struct smu_context *smu,
if (default_power_limit)
*default_power_limit = power_limit;
- if (max_power_limit)
- *max_power_limit = msg_limit;
+ if (powerplay_table) {
+ if (smu->od_enabled &&
+ smu_v14_0_2_is_od_feature_supported(smu, PP_OD_FEATURE_PPT_BIT)) {
+ od_percent_upper = pptable->SkuTable.OverDriveLimitsBasicMax.Ppt;
+ od_percent_lower = pptable->SkuTable.OverDriveLimitsBasicMin.Ppt;
+ } else if (smu_v14_0_2_is_od_feature_supported(smu, PP_OD_FEATURE_PPT_BIT)) {
+ od_percent_upper = 0;
+ od_percent_lower = pptable->SkuTable.OverDriveLimitsBasicMin.Ppt;
+ }
+ }
+
+ dev_dbg(smu->adev->dev, "od percent upper:%d, od percent lower:%d (default power: %d)\n",
+ od_percent_upper, od_percent_lower, power_limit);
+
+ if (max_power_limit) {
+ *max_power_limit = msg_limit * (100 + od_percent_upper);
+ *max_power_limit /= 100;
+ }
- if (min_power_limit)
- *min_power_limit = 0;
+ if (min_power_limit) {
+ *min_power_limit = power_limit * (100 + od_percent_lower);
+ *min_power_limit /= 100;
+ }
return 0;
}
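The limits computed in the hunk above are plain percentage scaling of the firmware values: the maximum scales msg_limit by OverDriveLimitsBasicMax.Ppt, the minimum scales the current power_limit by OverDriveLimitsBasicMin.Ppt (normally negative). A minimal standalone C sketch, with made-up wattages and OD percentages purely for illustration, shows the arithmetic:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical inputs, not from any real pptable. */
	uint32_t msg_limit = 220;       /* firmware PPT0 message limit, W   */
	uint32_t power_limit = 200;     /* current/default power limit, W   */
	int32_t od_percent_upper = 15;  /* OverDriveLimitsBasicMax.Ppt      */
	int32_t od_percent_lower = -30; /* OverDriveLimitsBasicMin.Ppt      */

	uint32_t max_power_limit = msg_limit * (100 + od_percent_upper) / 100;
	uint32_t min_power_limit = power_limit * (100 + od_percent_lower) / 100;

	printf("max=%u W min=%u W\n", max_power_limit, min_power_limit); /* 253 / 140 */
	return 0;
}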
diff --git a/drivers/gpu/drm/ast/ast_dp.c b/drivers/gpu/drm/ast/ast_dp.c
index 19c04687b0fe..8e650a02c528 100644
--- a/drivers/gpu/drm/ast/ast_dp.c
+++ b/drivers/gpu/drm/ast/ast_dp.c
@@ -134,7 +134,7 @@ static int ast_astdp_read_edid_block(void *data, u8 *buf, unsigned int block, si
* 3. The delays are often a lot longer when the system resumes from S3/S4.
*/
if (j)
- mdelay(j + 1);
+ msleep(j + 1);
/* Wait for EDID offset to show up in mirror register */
vgacrd7 = ast_get_index_reg(ast, AST_IO_VGACRI, 0xd7);
diff --git a/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c b/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c
index ed35e567d117..efe534977d12 100644
--- a/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c
+++ b/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c
@@ -1474,8 +1474,8 @@ analogix_dp_probe(struct device *dev, struct analogix_dp_plat_data *plat_data)
dp = devm_drm_bridge_alloc(dev, struct analogix_dp_device, bridge,
&analogix_dp_bridge_funcs);
- if (!dp)
- return ERR_PTR(-ENOMEM);
+ if (IS_ERR(dp))
+ return ERR_CAST(dp);
dp->dev = &pdev->dev;
dp->dpms_mode = DRM_MODE_DPMS_OFF;
diff --git a/drivers/gpu/drm/bridge/analogix/anx7625.c b/drivers/gpu/drm/bridge/analogix/anx7625.c
index c0ad8f59e483..8b3304dedcd9 100644
--- a/drivers/gpu/drm/bridge/analogix/anx7625.c
+++ b/drivers/gpu/drm/bridge/analogix/anx7625.c
@@ -2677,7 +2677,7 @@ static int anx7625_i2c_probe(struct i2c_client *client)
ret = devm_request_threaded_irq(dev, platform->pdata.intp_irq,
NULL, anx7625_intr_hpd_isr,
IRQF_TRIGGER_FALLING |
- IRQF_ONESHOT,
+ IRQF_ONESHOT | IRQF_NO_AUTOEN,
"anx7625-intp", platform);
if (ret) {
DRM_DEV_ERROR(dev, "fail to request irq\n");
@@ -2746,8 +2746,10 @@ static int anx7625_i2c_probe(struct i2c_client *client)
}
/* Add work function */
- if (platform->pdata.intp_irq)
+ if (platform->pdata.intp_irq) {
+ enable_irq(platform->pdata.intp_irq);
queue_work(platform->workqueue, &platform->work);
+ }
if (platform->pdata.audio_en)
anx7625_register_audio(dev, platform);
diff --git a/drivers/gpu/drm/bridge/aux-bridge.c b/drivers/gpu/drm/bridge/aux-bridge.c
index b63304d3a80f..b3e4cdff61d6 100644
--- a/drivers/gpu/drm/bridge/aux-bridge.c
+++ b/drivers/gpu/drm/bridge/aux-bridge.c
@@ -18,6 +18,7 @@ static void drm_aux_bridge_release(struct device *dev)
{
struct auxiliary_device *adev = to_auxiliary_dev(dev);
+ of_node_put(dev->of_node);
ida_free(&drm_aux_bridge_ida, adev->id);
kfree(adev);
@@ -65,6 +66,7 @@ int drm_aux_bridge_register(struct device *parent)
ret = auxiliary_device_init(adev);
if (ret) {
+ of_node_put(adev->dev.of_node);
ida_free(&drm_aux_bridge_ida, adev->id);
kfree(adev);
return ret;
diff --git a/drivers/gpu/drm/bridge/cadence/cdns-mhdp8546-core.c b/drivers/gpu/drm/bridge/cadence/cdns-mhdp8546-core.c
index a614d1384f71..38726ae1bf15 100644
--- a/drivers/gpu/drm/bridge/cadence/cdns-mhdp8546-core.c
+++ b/drivers/gpu/drm/bridge/cadence/cdns-mhdp8546-core.c
@@ -1984,8 +1984,10 @@ static void cdns_mhdp_atomic_enable(struct drm_bridge *bridge,
mhdp_state = to_cdns_mhdp_bridge_state(new_state);
mhdp_state->current_mode = drm_mode_duplicate(bridge->dev, mode);
- if (!mhdp_state->current_mode)
- return;
+ if (!mhdp_state->current_mode) {
+ ret = -EINVAL;
+ goto out;
+ }
drm_mode_set_name(mhdp_state->current_mode);
diff --git a/drivers/gpu/drm/bridge/ti-sn65dsi86.c b/drivers/gpu/drm/bridge/ti-sn65dsi86.c
index 464390372b34..ae0d08e5e960 100644
--- a/drivers/gpu/drm/bridge/ti-sn65dsi86.c
+++ b/drivers/gpu/drm/bridge/ti-sn65dsi86.c
@@ -393,6 +393,17 @@ static int __maybe_unused ti_sn65dsi86_resume(struct device *dev)
gpiod_set_value_cansleep(pdata->enable_gpio, 1);
/*
+ * After EN is deasserted and an external clock is detected, the bridge
+ * will sample GPIO3:1 to determine its frequency. The driver will
+ * overwrite this setting in ti_sn_bridge_set_refclk_freq(). But this is
+ * racy. Thus we have to wait a couple of us. According to the datasheet
+ * the GPIO lines have to be stable for at least 5 us (td5) but it seems
+ * that is not enough and the refclk frequency value is still lost or
+ * overwritten by the bridge itself. Waiting for 20us seems to work.
+ */
+ usleep_range(20, 30);
+
+ /*
* If we have a reference clock we can enable communication w/ the
* panel (including the aux channel) w/out any need for an input clock
* so we can do it in resume which lets us read the EDID before
diff --git a/drivers/gpu/drm/drm_bridge.c b/drivers/gpu/drm/drm_bridge.c
index dd45d9b504d8..4bde00083047 100644
--- a/drivers/gpu/drm/drm_bridge.c
+++ b/drivers/gpu/drm/drm_bridge.c
@@ -1227,6 +1227,7 @@ EXPORT_SYMBOL(drm_atomic_bridge_chain_check);
/**
* drm_bridge_detect - check if anything is attached to the bridge output
* @bridge: bridge control structure
+ * @connector: attached connector
*
* If the bridge supports output detection, as reported by the
* DRM_BRIDGE_OP_DETECT bridge ops flag, call &drm_bridge_funcs.detect for the
diff --git a/drivers/gpu/drm/drm_gpuvm.c b/drivers/gpu/drm/drm_gpuvm.c
index bbc7fecb6f4a..86853535fb7b 100644
--- a/drivers/gpu/drm/drm_gpuvm.c
+++ b/drivers/gpu/drm/drm_gpuvm.c
@@ -40,7 +40,7 @@
* mapping's backing &drm_gem_object buffers.
*
* &drm_gem_object buffers maintain a list of &drm_gpuva objects representing
- * all existent GPU VA mappings using this &drm_gem_object as backing buffer.
+ * all existing GPU VA mappings using this &drm_gem_object as backing buffer.
*
* GPU VAs can be flagged as sparse, such that drivers may use GPU VAs to also
* keep track of sparse PTEs in order to support Vulkan 'Sparse Resources'.
@@ -72,7 +72,7 @@
* but it can also be a 'dummy' object, which can be allocated with
* drm_gpuvm_resv_object_alloc().
*
- * In order to connect a struct drm_gpuva its backing &drm_gem_object each
+ * In order to connect a struct drm_gpuva to its backing &drm_gem_object each
* &drm_gem_object maintains a list of &drm_gpuvm_bo structures, and each
* &drm_gpuvm_bo contains a list of &drm_gpuva structures.
*
@@ -81,7 +81,7 @@
* This is ensured by the API through drm_gpuvm_bo_obtain() and
* drm_gpuvm_bo_obtain_prealloc() which first look into the corresponding
* &drm_gem_object list of &drm_gpuvm_bos for an existing instance of this
- * particular combination. If not existent a new instance is created and linked
+ * particular combination. If not present, a new instance is created and linked
* to the &drm_gem_object.
*
* &drm_gpuvm_bo structures, since unique for a given &drm_gpuvm, are also used
@@ -108,7 +108,7 @@
* sequence of operations to satisfy a given map or unmap request.
*
* Therefore the DRM GPU VA manager provides an algorithm implementing splitting
- * and merging of existent GPU VA mappings with the ones that are requested to
+ * and merging of existing GPU VA mappings with the ones that are requested to
* be mapped or unmapped. This feature is required by the Vulkan API to
* implement Vulkan 'Sparse Memory Bindings' - drivers UAPIs often refer to this
* as VM BIND.
@@ -119,7 +119,7 @@
* execute in order to integrate the new mapping cleanly into the current state
* of the GPU VA space.
*
- * Depending on how the new GPU VA mapping intersects with the existent mappings
+ * Depending on how the new GPU VA mapping intersects with the existing mappings
* of the GPU VA space the &drm_gpuvm_ops callbacks contain an arbitrary amount
* of unmap operations, a maximum of two remap operations and a single map
* operation. The caller might receive no callback at all if no operation is
@@ -139,16 +139,16 @@
* one unmap operation and one or two map operations, such that drivers can
* derive the page table update delta accordingly.
*
- * Note that there can't be more than two existent mappings to split up, one at
+ * Note that there can't be more than two existing mappings to split up, one at
* the beginning and one at the end of the new mapping, hence there is a
* maximum of two remap operations.
*
* Analogous to drm_gpuvm_sm_map() drm_gpuvm_sm_unmap() uses &drm_gpuvm_ops to
* call back into the driver in order to unmap a range of GPU VA space. The
- * logic behind this function is way simpler though: For all existent mappings
+ * logic behind this function is way simpler though: For all existing mappings
* enclosed by the given range unmap operations are created. For mappings which
- * are only partically located within the given range, remap operations are
- * created such that those mappings are split up and re-mapped partically.
+ * are only partially located within the given range, remap operations are
+ * created such that those mappings are split up and re-mapped partially.
*
* As an alternative to drm_gpuvm_sm_map() and drm_gpuvm_sm_unmap(),
* drm_gpuvm_sm_map_ops_create() and drm_gpuvm_sm_unmap_ops_create() can be used
@@ -168,7 +168,7 @@
* provided helper functions drm_gpuva_map(), drm_gpuva_remap() and
* drm_gpuva_unmap() instead.
*
- * The following diagram depicts the basic relationships of existent GPU VA
+ * The following diagram depicts the basic relationships of existing GPU VA
* mappings, a newly requested mapping and the resulting mappings as implemented
* by drm_gpuvm_sm_map() - it doesn't cover any arbitrary combinations of these.
*
@@ -218,7 +218,7 @@
*
*
* 4) Existent mapping is a left aligned subset of the requested one, hence
- * replace the existent one.
+ * replace the existing one.
*
* ::
*
@@ -236,9 +236,9 @@
* and/or non-contiguous BO offset.
*
*
- * 5) Requested mapping's range is a left aligned subset of the existent one,
+ * 5) Requested mapping's range is a left aligned subset of the existing one,
* but backed by a different BO. Hence, map the requested mapping and split
- * the existent one adjusting its BO offset.
+ * the existing one adjusting its BO offset.
*
* ::
*
@@ -271,9 +271,9 @@
* new: |-----|-----| (a.bo_offset=n, a'.bo_offset=n+1)
*
*
- * 7) Requested mapping's range is a right aligned subset of the existent one,
+ * 7) Requested mapping's range is a right aligned subset of the existing one,
* but backed by a different BO. Hence, map the requested mapping and split
- * the existent one, without adjusting the BO offset.
+ * the existing one, without adjusting the BO offset.
*
* ::
*
@@ -304,7 +304,7 @@
*
* 9) Existent mapping is overlapped at the end by the requested mapping backed
* by a different BO. Hence, map the requested mapping and split up the
- * existent one, without adjusting the BO offset.
+ * existing one, without adjusting the BO offset.
*
* ::
*
@@ -334,9 +334,9 @@
* new: |-----|-----------| (a'.bo_offset=n, a.bo_offset=n+1)
*
*
- * 11) Requested mapping's range is a centered subset of the existent one
+ * 11) Requested mapping's range is a centered subset of the existing one
* having a different backing BO. Hence, map the requested mapping and split
- * up the existent one in two mappings, adjusting the BO offset of the right
+ * up the existing one in two mappings, adjusting the BO offset of the right
* one accordingly.
*
* ::
@@ -351,7 +351,7 @@
* new: |-----|-----|-----| (a.bo_offset=n,b.bo_offset=m,a'.bo_offset=n+2)
*
*
- * 12) Requested mapping is a contiguous subset of the existent one. Split it
+ * 12) Requested mapping is a contiguous subset of the existing one. Split it
* up, but indicate that the backing PTEs could be kept.
*
* ::
@@ -367,7 +367,7 @@
*
*
* 13) Existent mapping is a right aligned subset of the requested one, hence
- * replace the existent one.
+ * replace the existing one.
*
* ::
*
@@ -386,7 +386,7 @@
*
*
* 14) Existent mapping is a centered subset of the requested one, hence
- * replace the existent one.
+ * replace the existing one.
*
* ::
*
@@ -406,7 +406,7 @@
*
* 15) Existent mappings is overlapped at the beginning by the requested mapping
* backed by a different BO. Hence, map the requested mapping and split up
- * the existent one, adjusting its BO offset accordingly.
+ * the existing one, adjusting its BO offset accordingly.
*
* ::
*
@@ -469,8 +469,8 @@
* make use of them.
*
* The below code is strictly limited to illustrate the generic usage pattern.
- * To maintain simplicitly, it doesn't make use of any abstractions for common
- * code, different (asyncronous) stages with fence signalling critical paths,
+ * To maintain simplicity, it doesn't make use of any abstractions for common
+ * code, different (asynchronous) stages with fence signalling critical paths,
* any other helpers or error handling in terms of freeing memory and dropping
* previously taken locks.
*
@@ -479,7 +479,7 @@
* // Allocates a new &drm_gpuva.
* struct drm_gpuva * driver_gpuva_alloc(void);
*
- * // Typically drivers would embedd the &drm_gpuvm and &drm_gpuva
+ * // Typically drivers would embed the &drm_gpuvm and &drm_gpuva
* // structure in individual driver structures and lock the dma-resv with
* // drm_exec or similar helpers.
* int driver_mapping_create(struct drm_gpuvm *gpuvm,
@@ -582,7 +582,7 @@
* .sm_step_unmap = driver_gpuva_unmap,
* };
*
- * // Typically drivers would embedd the &drm_gpuvm and &drm_gpuva
+ * // Typically drivers would embed the &drm_gpuvm and &drm_gpuva
* // structure in individual driver structures and lock the dma-resv with
* // drm_exec or similar helpers.
* int driver_mapping_create(struct drm_gpuvm *gpuvm,
@@ -680,7 +680,7 @@
*
* This helper is here to provide lockless list iteration. Lockless as in, the
* iterator releases the lock immediately after picking the first element from
- * the list, so list insertion deletion can happen concurrently.
+ * the list, so list insertion and deletion can happen concurrently.
*
* Elements popped from the original list are kept in a local list, so removal
* and is_empty checks can still happen while we're iterating the list.
@@ -1160,7 +1160,7 @@ drm_gpuvm_prepare_objects_locked(struct drm_gpuvm *gpuvm,
}
/**
- * drm_gpuvm_prepare_objects() - prepare all assoiciated BOs
+ * drm_gpuvm_prepare_objects() - prepare all associated BOs
* @gpuvm: the &drm_gpuvm
* @exec: the &drm_exec locking context
* @num_fences: the amount of &dma_fences to reserve
@@ -1230,13 +1230,13 @@ drm_gpuvm_prepare_range(struct drm_gpuvm *gpuvm, struct drm_exec *exec,
EXPORT_SYMBOL_GPL(drm_gpuvm_prepare_range);
/**
- * drm_gpuvm_exec_lock() - lock all dma-resv of all assoiciated BOs
+ * drm_gpuvm_exec_lock() - lock all dma-resv of all associated BOs
* @vm_exec: the &drm_gpuvm_exec wrapper
*
* Acquires all dma-resv locks of all &drm_gem_objects the given
* &drm_gpuvm contains mappings of.
*
- * Addionally, when calling this function with struct drm_gpuvm_exec::extra
+ * Additionally, when calling this function with struct drm_gpuvm_exec::extra
* being set the driver receives the given @fn callback to lock additional
* dma-resv in the context of the &drm_gpuvm_exec instance. Typically, drivers
* would call drm_exec_prepare_obj() from within this callback.
@@ -1293,7 +1293,7 @@ fn_lock_array(struct drm_gpuvm_exec *vm_exec)
}
/**
- * drm_gpuvm_exec_lock_array() - lock all dma-resv of all assoiciated BOs
+ * drm_gpuvm_exec_lock_array() - lock all dma-resv of all associated BOs
* @vm_exec: the &drm_gpuvm_exec wrapper
* @objs: additional &drm_gem_objects to lock
* @num_objs: the number of additional &drm_gem_objects to lock
@@ -1588,7 +1588,7 @@ drm_gpuvm_bo_find(struct drm_gpuvm *gpuvm,
EXPORT_SYMBOL_GPL(drm_gpuvm_bo_find);
/**
- * drm_gpuvm_bo_obtain() - obtains and instance of the &drm_gpuvm_bo for the
+ * drm_gpuvm_bo_obtain() - obtains an instance of the &drm_gpuvm_bo for the
* given &drm_gpuvm and &drm_gem_object
* @gpuvm: The &drm_gpuvm the @obj is mapped in.
* @obj: The &drm_gem_object being mapped in the @gpuvm.
@@ -1624,7 +1624,7 @@ drm_gpuvm_bo_obtain(struct drm_gpuvm *gpuvm,
EXPORT_SYMBOL_GPL(drm_gpuvm_bo_obtain);
/**
- * drm_gpuvm_bo_obtain_prealloc() - obtains and instance of the &drm_gpuvm_bo
+ * drm_gpuvm_bo_obtain_prealloc() - obtains an instance of the &drm_gpuvm_bo
* for the given &drm_gpuvm and &drm_gem_object
* @__vm_bo: A pre-allocated struct drm_gpuvm_bo.
*
@@ -1688,7 +1688,7 @@ EXPORT_SYMBOL_GPL(drm_gpuvm_bo_extobj_add);
* @vm_bo: the &drm_gpuvm_bo to add or remove
* @evict: indicates whether the object is evicted
*
- * Adds a &drm_gpuvm_bo to or removes it from the &drm_gpuvms evicted list.
+ * Adds a &drm_gpuvm_bo to or removes it from the &drm_gpuvm's evicted list.
*/
void
drm_gpuvm_bo_evict(struct drm_gpuvm_bo *vm_bo, bool evict)
@@ -1790,7 +1790,7 @@ __drm_gpuva_remove(struct drm_gpuva *va)
* drm_gpuva_remove() - remove a &drm_gpuva
* @va: the &drm_gpuva to remove
*
- * This removes the given &va from the underlaying tree.
+ * This removes the given &va from the underlying tree.
*
* It is safe to use this function using the safe versions of iterating the GPU
* VA space, such as drm_gpuvm_for_each_va_safe() and
@@ -2358,7 +2358,7 @@ EXPORT_SYMBOL_GPL(drm_gpuvm_sm_map);
*
* This function iterates the given range of the GPU VA space. It utilizes the
* &drm_gpuvm_ops to call back into the driver providing the operations to
- * unmap and, if required, split existent mappings.
+ * unmap and, if required, split existing mappings.
*
* Drivers may use these callbacks to update the GPU VA space right away within
* the callback. In case the driver decides to copy and store the operations for
@@ -2430,7 +2430,7 @@ static const struct drm_gpuvm_ops lock_ops = {
* remapped, and locks+prepares (drm_exec_prepare_object()) objects that
* will be newly mapped.
*
- * The expected usage is:
+ * The expected usage is::
*
* vm_bind {
* struct drm_exec exec;
@@ -2473,7 +2473,7 @@ static const struct drm_gpuvm_ops lock_ops = {
* required without the earlier DRIVER_OP_MAP. This is safe because we've
* already locked the GEM object in the earlier DRIVER_OP_MAP step.
*
- * Returns: 0 on success or a negative error codec
+ * Returns: 0 on success or a negative error code
*/
int
drm_gpuvm_sm_map_exec_lock(struct drm_gpuvm *gpuvm,
@@ -2617,12 +2617,12 @@ static const struct drm_gpuvm_ops gpuvm_list_ops = {
* @req_offset: the offset within the &drm_gem_object
*
* This function creates a list of operations to perform splitting and merging
- * of existent mapping(s) with the newly requested one.
+ * of existing mapping(s) with the newly requested one.
*
* The list can be iterated with &drm_gpuva_for_each_op and must be processed
* in the given order. It can contain map, unmap and remap operations, but it
* also can be empty if no operation is required, e.g. if the requested mapping
- * already exists is the exact same way.
+ * already exists in the exact same way.
*
* There can be an arbitrary amount of unmap operations, a maximum of two remap
* operations and a single map operation. The latter one represents the original
diff --git a/drivers/gpu/drm/drm_panic_qr.rs b/drivers/gpu/drm/drm_panic_qr.rs
index 09a9b452e8b7..50c286c5cee8 100644
--- a/drivers/gpu/drm/drm_panic_qr.rs
+++ b/drivers/gpu/drm/drm_panic_qr.rs
@@ -381,6 +381,26 @@ struct DecFifo {
len: usize,
}
+// On the arm32 architecture, dividing a `u64` by a constant will generate a call
+// to `__aeabi_uldivmod` which is not present in the kernel.
+// So use the multiply by inverse method for this architecture.
+fn div10(val: u64) -> u64 {
+ if cfg!(target_arch = "arm") {
+ let val_h = val >> 32;
+ let val_l = val & 0xFFFFFFFF;
+ let b_h: u64 = 0x66666666;
+ let b_l: u64 = 0x66666667;
+
+ let tmp1 = val_h * b_l + ((val_l * b_l) >> 32);
+ let tmp2 = val_l * b_h + (tmp1 & 0xffffffff);
+ let tmp3 = val_h * b_h + (tmp1 >> 32) + (tmp2 >> 32);
+
+ tmp3 >> 2
+ } else {
+ val / 10
+ }
+}
+
impl DecFifo {
fn push(&mut self, data: u64, len: usize) {
let mut chunk = data;
@@ -389,7 +409,7 @@ impl DecFifo {
}
for i in 0..len {
self.decimals[i] = (chunk % 10) as u8;
- chunk /= 10;
+ chunk = div10(chunk);
}
self.len += len;
}
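The div10() helper added above avoids __aeabi_uldivmod by multiplying with the 64-bit fixed-point reciprocal of 10 (0x6666666666666667, roughly 2^66/10, split into the b_h/b_l limbs) and keeping the top 64 bits of the product shifted right by two. A standalone C sketch of the same scheme, not part of the patch, with the helper name div10_c chosen here only for illustration:

#include <assert.h>
#include <stdint.h>

/* Same multiply-by-inverse scheme as div10() above: compute the top 64 bits
 * of val * 0x6666666666666667 from 32-bit limbs, then shift right by 2. */
static uint64_t div10_c(uint64_t val)
{
	uint64_t val_h = val >> 32, val_l = val & 0xffffffff;
	uint64_t b_h = 0x66666666, b_l = 0x66666667;

	uint64_t tmp1 = val_h * b_l + ((val_l * b_l) >> 32);
	uint64_t tmp2 = val_l * b_h + (tmp1 & 0xffffffff);
	uint64_t tmp3 = val_h * b_h + (tmp1 >> 32) + (tmp2 >> 32);

	return tmp3 >> 2;
}

int main(void)
{
	/* Matches val / 10 for the magnitudes the QR segment encoder pushes. */
	assert(div10_c(0) == 0);
	assert(div10_c(1000000007ULL) == 100000000ULL);
	assert(div10_c(999999999999ULL) == 99999999999ULL);
	return 0;
}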
diff --git a/drivers/gpu/drm/gma500/oaktrail_hdmi.c b/drivers/gpu/drm/gma500/oaktrail_hdmi.c
index 1cf394369127..c0feca58511d 100644
--- a/drivers/gpu/drm/gma500/oaktrail_hdmi.c
+++ b/drivers/gpu/drm/gma500/oaktrail_hdmi.c
@@ -726,8 +726,8 @@ void oaktrail_hdmi_teardown(struct drm_device *dev)
if (hdmi_dev) {
pdev = hdmi_dev->dev;
- pci_set_drvdata(pdev, NULL);
oaktrail_hdmi_i2c_exit(pdev);
+ pci_set_drvdata(pdev, NULL);
iounmap(hdmi_dev->regs);
kfree(hdmi_dev);
pci_dev_put(pdev);
diff --git a/drivers/gpu/drm/hisilicon/hibmc/dp/dp_link.c b/drivers/gpu/drm/hisilicon/hibmc/dp/dp_link.c
index 74f7832ea53e..0726cb5b736e 100644
--- a/drivers/gpu/drm/hisilicon/hibmc/dp/dp_link.c
+++ b/drivers/gpu/drm/hisilicon/hibmc/dp/dp_link.c
@@ -325,6 +325,17 @@ static int hibmc_dp_link_downgrade_training_eq(struct hibmc_dp_dev *dp)
return hibmc_dp_link_reduce_rate(dp);
}
+static void hibmc_dp_update_caps(struct hibmc_dp_dev *dp)
+{
+ dp->link.cap.link_rate = dp->dpcd[DP_MAX_LINK_RATE];
+ if (dp->link.cap.link_rate > DP_LINK_BW_8_1 || !dp->link.cap.link_rate)
+ dp->link.cap.link_rate = DP_LINK_BW_8_1;
+
+ dp->link.cap.lanes = dp->dpcd[DP_MAX_LANE_COUNT] & DP_MAX_LANE_COUNT_MASK;
+ if (dp->link.cap.lanes > HIBMC_DP_LANE_NUM_MAX)
+ dp->link.cap.lanes = HIBMC_DP_LANE_NUM_MAX;
+}
+
int hibmc_dp_link_training(struct hibmc_dp_dev *dp)
{
struct hibmc_dp_link *link = &dp->link;
@@ -334,8 +345,7 @@ int hibmc_dp_link_training(struct hibmc_dp_dev *dp)
if (ret)
drm_err(dp->dev, "dp aux read dpcd failed, ret: %d\n", ret);
- dp->link.cap.link_rate = dp->dpcd[DP_MAX_LINK_RATE];
- dp->link.cap.lanes = 0x2;
+ hibmc_dp_update_caps(dp);
ret = hibmc_dp_get_serdes_rate_cfg(dp);
if (ret < 0)
diff --git a/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.c b/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.c
index 768b97f9e74a..289304500ab0 100644
--- a/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.c
+++ b/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.c
@@ -32,7 +32,7 @@
DEFINE_DRM_GEM_FOPS(hibmc_fops);
-static const char *g_irqs_names_map[HIBMC_MAX_VECTORS] = { "vblank", "hpd" };
+static const char *g_irqs_names_map[HIBMC_MAX_VECTORS] = { "hibmc-vblank", "hibmc-hpd" };
static irqreturn_t hibmc_interrupt(int irq, void *arg)
{
@@ -115,6 +115,8 @@ static const struct drm_mode_config_funcs hibmc_mode_funcs = {
static int hibmc_kms_init(struct hibmc_drm_private *priv)
{
struct drm_device *dev = &priv->dev;
+ struct drm_encoder *encoder;
+ u32 clone_mask = 0;
int ret;
ret = drmm_mode_config_init(dev);
@@ -154,6 +156,12 @@ static int hibmc_kms_init(struct hibmc_drm_private *priv)
return ret;
}
+ drm_for_each_encoder(encoder, dev)
+ clone_mask |= drm_encoder_mask(encoder);
+
+ drm_for_each_encoder(encoder, dev)
+ encoder->possible_clones = clone_mask;
+
return 0;
}
@@ -277,7 +285,6 @@ static void hibmc_unload(struct drm_device *dev)
static int hibmc_msi_init(struct drm_device *dev)
{
struct pci_dev *pdev = to_pci_dev(dev->dev);
- char name[32] = {0};
int valid_irq_num;
int irq;
int ret;
@@ -292,9 +299,6 @@ static int hibmc_msi_init(struct drm_device *dev)
valid_irq_num = ret;
for (int i = 0; i < valid_irq_num; i++) {
- snprintf(name, ARRAY_SIZE(name) - 1, "%s-%s-%s",
- dev->driver->name, pci_name(pdev), g_irqs_names_map[i]);
-
irq = pci_irq_vector(pdev, i);
if (i)
@@ -302,10 +306,10 @@ static int hibmc_msi_init(struct drm_device *dev)
ret = devm_request_threaded_irq(&pdev->dev, irq,
hibmc_dp_interrupt,
hibmc_dp_hpd_isr,
- IRQF_SHARED, name, dev);
+ IRQF_SHARED, g_irqs_names_map[i], dev);
else
ret = devm_request_irq(&pdev->dev, irq, hibmc_interrupt,
- IRQF_SHARED, name, dev);
+ IRQF_SHARED, g_irqs_names_map[i], dev);
if (ret) {
drm_err(dev, "install irq failed: %d\n", ret);
return ret;
@@ -323,13 +327,13 @@ static int hibmc_load(struct drm_device *dev)
ret = hibmc_hw_init(priv);
if (ret)
- goto err;
+ return ret;
ret = drmm_vram_helper_init(dev, pci_resource_start(pdev, 0),
pci_resource_len(pdev, 0));
if (ret) {
drm_err(dev, "Error initializing VRAM MM; %d\n", ret);
- goto err;
+ return ret;
}
ret = hibmc_kms_init(priv);
diff --git a/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.h b/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.h
index 274feabe7df0..ca8502e2760c 100644
--- a/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.h
+++ b/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.h
@@ -69,6 +69,7 @@ int hibmc_de_init(struct hibmc_drm_private *priv);
int hibmc_vdac_init(struct hibmc_drm_private *priv);
int hibmc_ddc_create(struct drm_device *drm_dev, struct hibmc_vdac *connector);
+void hibmc_ddc_del(struct hibmc_vdac *vdac);
int hibmc_dp_init(struct hibmc_drm_private *priv);
diff --git a/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_i2c.c b/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_i2c.c
index 99b3b77b5445..44860011855e 100644
--- a/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_i2c.c
+++ b/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_i2c.c
@@ -95,3 +95,8 @@ int hibmc_ddc_create(struct drm_device *drm_dev, struct hibmc_vdac *vdac)
return i2c_bit_add_bus(&vdac->adapter);
}
+
+void hibmc_ddc_del(struct hibmc_vdac *vdac)
+{
+ i2c_del_adapter(&vdac->adapter);
+}
diff --git a/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_vdac.c b/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_vdac.c
index e8a527ede854..841e81f47b68 100644
--- a/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_vdac.c
+++ b/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_vdac.c
@@ -53,7 +53,7 @@ static void hibmc_connector_destroy(struct drm_connector *connector)
{
struct hibmc_vdac *vdac = to_hibmc_vdac(connector);
- i2c_del_adapter(&vdac->adapter);
+ hibmc_ddc_del(vdac);
drm_connector_cleanup(connector);
}
@@ -110,7 +110,7 @@ int hibmc_vdac_init(struct hibmc_drm_private *priv)
ret = drmm_encoder_init(dev, encoder, NULL, DRM_MODE_ENCODER_DAC, NULL);
if (ret) {
drm_err(dev, "failed to init encoder: %d\n", ret);
- return ret;
+ goto err;
}
drm_encoder_helper_add(encoder, &hibmc_encoder_helper_funcs);
@@ -121,7 +121,7 @@ int hibmc_vdac_init(struct hibmc_drm_private *priv)
&vdac->adapter);
if (ret) {
drm_err(dev, "failed to init connector: %d\n", ret);
- return ret;
+ goto err;
}
drm_connector_helper_add(connector, &hibmc_connector_helper_funcs);
@@ -131,4 +131,9 @@ int hibmc_vdac_init(struct hibmc_drm_private *priv)
connector->polled = DRM_CONNECTOR_POLL_CONNECT | DRM_CONNECTOR_POLL_DISCONNECT;
return 0;
+
+err:
+ hibmc_ddc_del(vdac);
+
+ return ret;
}
diff --git a/drivers/gpu/drm/i915/display/intel_ddi.c b/drivers/gpu/drm/i915/display/intel_ddi.c
index 0405396c7750..9ecbb4b99c37 100644
--- a/drivers/gpu/drm/i915/display/intel_ddi.c
+++ b/drivers/gpu/drm/i915/display/intel_ddi.c
@@ -596,8 +596,9 @@ intel_ddi_transcoder_func_reg_val_get(struct intel_encoder *encoder,
enum transcoder master;
master = crtc_state->mst_master_transcoder;
- drm_WARN_ON(display->drm,
- master == INVALID_TRANSCODER);
+ if (drm_WARN_ON(display->drm,
+ master == INVALID_TRANSCODER))
+ master = TRANSCODER_A;
temp |= TRANS_DDI_MST_TRANSPORT_SELECT(master);
}
} else {
diff --git a/drivers/gpu/drm/i915/display/intel_display_irq.c b/drivers/gpu/drm/i915/display/intel_display_irq.c
index fb25ec8adae3..68157f177b6a 100644
--- a/drivers/gpu/drm/i915/display/intel_display_irq.c
+++ b/drivers/gpu/drm/i915/display/intel_display_irq.c
@@ -1506,10 +1506,14 @@ u32 gen11_gu_misc_irq_ack(struct intel_display *display, const u32 master_ctl)
if (!(master_ctl & GEN11_GU_MISC_IRQ))
return 0;
+ intel_display_rpm_assert_block(display);
+
iir = intel_de_read(display, GEN11_GU_MISC_IIR);
if (likely(iir))
intel_de_write(display, GEN11_GU_MISC_IIR, iir);
+ intel_display_rpm_assert_unblock(display);
+
return iir;
}
diff --git a/drivers/gpu/drm/i915/display/intel_display_power.c b/drivers/gpu/drm/i915/display/intel_display_power.c
index 273054c22325..c92f3e736228 100644
--- a/drivers/gpu/drm/i915/display/intel_display_power.c
+++ b/drivers/gpu/drm/i915/display/intel_display_power.c
@@ -1172,7 +1172,7 @@ static void icl_mbus_init(struct intel_display *display)
if (DISPLAY_VER(display) == 12)
abox_regs |= BIT(0);
- for_each_set_bit(i, &abox_regs, sizeof(abox_regs))
+ for_each_set_bit(i, &abox_regs, BITS_PER_TYPE(abox_regs))
intel_de_rmw(display, MBUS_ABOX_CTL(i), mask, val);
}
@@ -1629,11 +1629,11 @@ static void tgl_bw_buddy_init(struct intel_display *display)
if (table[config].page_mask == 0) {
drm_dbg_kms(display->drm,
"Unknown memory configuration; disabling address buddy logic.\n");
- for_each_set_bit(i, &abox_mask, sizeof(abox_mask))
+ for_each_set_bit(i, &abox_mask, BITS_PER_TYPE(abox_mask))
intel_de_write(display, BW_BUDDY_CTL(i),
BW_BUDDY_DISABLE);
} else {
- for_each_set_bit(i, &abox_mask, sizeof(abox_mask)) {
+ for_each_set_bit(i, &abox_mask, BITS_PER_TYPE(abox_mask)) {
intel_de_write(display, BW_BUDDY_PAGE_MASK(i),
table[config].page_mask);
diff --git a/drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c b/drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c
index 41228478b21c..0a3a3f6a5f9d 100644
--- a/drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c
+++ b/drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c
@@ -546,7 +546,7 @@ static int intel_dp_aux_vesa_setup_backlight(struct intel_connector *connector,
luminance_range->max_luminance,
panel->vbt.backlight.pwm_freq_hz,
intel_dp->edp_dpcd, &current_level, &current_mode,
- false);
+ panel->backlight.edp.vesa.luminance_control_support);
if (ret < 0)
return ret;
diff --git a/drivers/gpu/drm/i915/display/intel_fbc.c b/drivers/gpu/drm/i915/display/intel_fbc.c
index 6e26cb4c5724..685ac98bd001 100644
--- a/drivers/gpu/drm/i915/display/intel_fbc.c
+++ b/drivers/gpu/drm/i915/display/intel_fbc.c
@@ -552,10 +552,6 @@ static void ilk_fbc_deactivate(struct intel_fbc *fbc)
if (dpfc_ctl & DPFC_CTL_EN) {
dpfc_ctl &= ~DPFC_CTL_EN;
intel_de_write(display, ILK_DPFC_CONTROL(fbc->id), dpfc_ctl);
-
- /* wa_18038517565 Enable DPFC clock gating after FBC disable */
- if (display->platform.dg2 || DISPLAY_VER(display) >= 14)
- fbc_compressor_clkgate_disable_wa(fbc, false);
}
}
@@ -1710,6 +1706,10 @@ static void __intel_fbc_disable(struct intel_fbc *fbc)
__intel_fbc_cleanup_cfb(fbc);
+ /* wa_18038517565 Enable DPFC clock gating after FBC disable */
+ if (display->platform.dg2 || DISPLAY_VER(display) >= 14)
+ fbc_compressor_clkgate_disable_wa(fbc, false);
+
fbc->state.plane = NULL;
fbc->flip_pending = false;
fbc->busy_bits = 0;
diff --git a/drivers/gpu/drm/i915/display/intel_psr.c b/drivers/gpu/drm/i915/display/intel_psr.c
index ae9053919211..41988e193a41 100644
--- a/drivers/gpu/drm/i915/display/intel_psr.c
+++ b/drivers/gpu/drm/i915/display/intel_psr.c
@@ -3275,7 +3275,9 @@ static void intel_psr_configure_full_frame_update(struct intel_dp *intel_dp)
static void _psr_invalidate_handle(struct intel_dp *intel_dp)
{
- if (intel_dp->psr.psr2_sel_fetch_enabled) {
+ struct intel_display *display = to_intel_display(intel_dp);
+
+ if (DISPLAY_VER(display) < 20 && intel_dp->psr.psr2_sel_fetch_enabled) {
if (!intel_dp->psr.psr2_sel_fetch_cff_enabled) {
intel_dp->psr.psr2_sel_fetch_cff_enabled = true;
intel_psr_configure_full_frame_update(intel_dp);
@@ -3361,7 +3363,7 @@ static void _psr_flush_handle(struct intel_dp *intel_dp)
{
struct intel_display *display = to_intel_display(intel_dp);
- if (intel_dp->psr.psr2_sel_fetch_enabled) {
+ if (DISPLAY_VER(display) < 20 && intel_dp->psr.psr2_sel_fetch_enabled) {
if (intel_dp->psr.psr2_sel_fetch_cff_enabled) {
/* can we turn CFF off? */
if (intel_dp->psr.busy_frontbuffer_bits == 0)
@@ -3378,11 +3380,13 @@ static void _psr_flush_handle(struct intel_dp *intel_dp)
* existing SU configuration
*/
intel_psr_configure_full_frame_update(intel_dp);
- }
- intel_psr_force_update(intel_dp);
+ intel_psr_force_update(intel_dp);
+ } else {
+ intel_psr_exit(intel_dp);
+ }
- if (!intel_dp->psr.psr2_sel_fetch_enabled && !intel_dp->psr.active &&
+ if ((!intel_dp->psr.psr2_sel_fetch_enabled || DISPLAY_VER(display) >= 20) &&
!intel_dp->psr.busy_frontbuffer_bits)
queue_work(display->wq.unordered, &intel_dp->psr.work);
}
diff --git a/drivers/gpu/drm/i915/display/intel_tc.c b/drivers/gpu/drm/i915/display/intel_tc.c
index 3bc57579fe53..668ef139391b 100644
--- a/drivers/gpu/drm/i915/display/intel_tc.c
+++ b/drivers/gpu/drm/i915/display/intel_tc.c
@@ -23,6 +23,7 @@
#include "intel_modeset_lock.h"
#include "intel_tc.h"
+#define DP_PIN_ASSIGNMENT_NONE 0x0
#define DP_PIN_ASSIGNMENT_C 0x3
#define DP_PIN_ASSIGNMENT_D 0x4
#define DP_PIN_ASSIGNMENT_E 0x5
@@ -66,6 +67,7 @@ struct intel_tc_port {
enum tc_port_mode init_mode;
enum phy_fia phy_fia;
u8 phy_fia_idx;
+ u8 max_lane_count;
};
static enum intel_display_power_domain
@@ -307,6 +309,8 @@ static int lnl_tc_port_get_max_lane_count(struct intel_digital_port *dig_port)
REG_FIELD_GET(TCSS_DDI_STATUS_PIN_ASSIGNMENT_MASK, val);
switch (pin_assignment) {
+ case DP_PIN_ASSIGNMENT_NONE:
+ return 0;
default:
MISSING_CASE(pin_assignment);
fallthrough;
@@ -365,12 +369,12 @@ static int intel_tc_port_get_max_lane_count(struct intel_digital_port *dig_port)
}
}
-int intel_tc_port_max_lane_count(struct intel_digital_port *dig_port)
+static int get_max_lane_count(struct intel_tc_port *tc)
{
- struct intel_display *display = to_intel_display(dig_port);
- struct intel_tc_port *tc = to_tc_port(dig_port);
+ struct intel_display *display = to_intel_display(tc->dig_port);
+ struct intel_digital_port *dig_port = tc->dig_port;
- if (!intel_encoder_is_tc(&dig_port->base) || tc->mode != TC_PORT_DP_ALT)
+ if (tc->mode != TC_PORT_DP_ALT)
return 4;
assert_tc_cold_blocked(tc);
@@ -384,6 +388,25 @@ int intel_tc_port_max_lane_count(struct intel_digital_port *dig_port)
return intel_tc_port_get_max_lane_count(dig_port);
}
+static void read_pin_configuration(struct intel_tc_port *tc)
+{
+ tc->max_lane_count = get_max_lane_count(tc);
+}
+
+int intel_tc_port_max_lane_count(struct intel_digital_port *dig_port)
+{
+ struct intel_display *display = to_intel_display(dig_port);
+ struct intel_tc_port *tc = to_tc_port(dig_port);
+
+ if (!intel_encoder_is_tc(&dig_port->base))
+ return 4;
+
+ if (DISPLAY_VER(display) < 20)
+ return get_max_lane_count(tc);
+
+ return tc->max_lane_count;
+}
+
void intel_tc_port_set_fia_lane_count(struct intel_digital_port *dig_port,
int required_lanes)
{
@@ -596,9 +619,12 @@ static void icl_tc_phy_get_hw_state(struct intel_tc_port *tc)
tc_cold_wref = __tc_cold_block(tc, &domain);
tc->mode = tc_phy_get_current_mode(tc);
- if (tc->mode != TC_PORT_DISCONNECTED)
+ if (tc->mode != TC_PORT_DISCONNECTED) {
tc->lock_wakeref = tc_cold_block(tc);
+ read_pin_configuration(tc);
+ }
+
__tc_cold_unblock(tc, domain, tc_cold_wref);
}
@@ -656,8 +682,11 @@ static bool icl_tc_phy_connect(struct intel_tc_port *tc,
tc->lock_wakeref = tc_cold_block(tc);
- if (tc->mode == TC_PORT_TBT_ALT)
+ if (tc->mode == TC_PORT_TBT_ALT) {
+ read_pin_configuration(tc);
+
return true;
+ }
if ((!tc_phy_is_ready(tc) ||
!icl_tc_phy_take_ownership(tc, true)) &&
@@ -668,6 +697,7 @@ static bool icl_tc_phy_connect(struct intel_tc_port *tc,
goto out_unblock_tc_cold;
}
+ read_pin_configuration(tc);
if (!tc_phy_verify_legacy_or_dp_alt_mode(tc, required_lanes))
goto out_release_phy;
@@ -858,9 +888,12 @@ static void adlp_tc_phy_get_hw_state(struct intel_tc_port *tc)
port_wakeref = intel_display_power_get(display, port_power_domain);
tc->mode = tc_phy_get_current_mode(tc);
- if (tc->mode != TC_PORT_DISCONNECTED)
+ if (tc->mode != TC_PORT_DISCONNECTED) {
tc->lock_wakeref = tc_cold_block(tc);
+ read_pin_configuration(tc);
+ }
+
intel_display_power_put(display, port_power_domain, port_wakeref);
}
@@ -873,6 +906,9 @@ static bool adlp_tc_phy_connect(struct intel_tc_port *tc, int required_lanes)
if (tc->mode == TC_PORT_TBT_ALT) {
tc->lock_wakeref = tc_cold_block(tc);
+
+ read_pin_configuration(tc);
+
return true;
}
@@ -894,6 +930,8 @@ static bool adlp_tc_phy_connect(struct intel_tc_port *tc, int required_lanes)
tc->lock_wakeref = tc_cold_block(tc);
+ read_pin_configuration(tc);
+
if (!tc_phy_verify_legacy_or_dp_alt_mode(tc, required_lanes))
goto out_unblock_tc_cold;
@@ -1124,9 +1162,18 @@ static void xelpdp_tc_phy_get_hw_state(struct intel_tc_port *tc)
tc_cold_wref = __tc_cold_block(tc, &domain);
tc->mode = tc_phy_get_current_mode(tc);
- if (tc->mode != TC_PORT_DISCONNECTED)
+ if (tc->mode != TC_PORT_DISCONNECTED) {
tc->lock_wakeref = tc_cold_block(tc);
+ read_pin_configuration(tc);
+ /*
+ * Set a valid lane count value for a DP-alt sink which got
+ * disconnected. The driver can only disable the output on this PHY.
+ */
+ if (tc->max_lane_count == 0)
+ tc->max_lane_count = 4;
+ }
+
drm_WARN_ON(display->drm,
(tc->mode == TC_PORT_DP_ALT || tc->mode == TC_PORT_LEGACY) &&
!xelpdp_tc_phy_tcss_power_is_enabled(tc));
@@ -1138,14 +1185,19 @@ static bool xelpdp_tc_phy_connect(struct intel_tc_port *tc, int required_lanes)
{
tc->lock_wakeref = tc_cold_block(tc);
- if (tc->mode == TC_PORT_TBT_ALT)
+ if (tc->mode == TC_PORT_TBT_ALT) {
+ read_pin_configuration(tc);
+
return true;
+ }
if (!xelpdp_tc_phy_enable_tcss_power(tc, true))
goto out_unblock_tccold;
xelpdp_tc_phy_take_ownership(tc, true);
+ read_pin_configuration(tc);
+
if (!tc_phy_verify_legacy_or_dp_alt_mode(tc, required_lanes))
goto out_release_phy;
@@ -1226,14 +1278,19 @@ static void tc_phy_get_hw_state(struct intel_tc_port *tc)
tc->phy_ops->get_hw_state(tc);
}
-static bool tc_phy_is_ready_and_owned(struct intel_tc_port *tc,
- bool phy_is_ready, bool phy_is_owned)
+/* Is the PHY owned by display i.e. is it in legacy or DP-alt mode? */
+static bool tc_phy_owned_by_display(struct intel_tc_port *tc,
+ bool phy_is_ready, bool phy_is_owned)
{
struct intel_display *display = to_intel_display(tc->dig_port);
- drm_WARN_ON(display->drm, phy_is_owned && !phy_is_ready);
+ if (DISPLAY_VER(display) < 20) {
+ drm_WARN_ON(display->drm, phy_is_owned && !phy_is_ready);
- return phy_is_ready && phy_is_owned;
+ return phy_is_ready && phy_is_owned;
+ } else {
+ return phy_is_owned;
+ }
}
static bool tc_phy_is_connected(struct intel_tc_port *tc,
@@ -1244,7 +1301,7 @@ static bool tc_phy_is_connected(struct intel_tc_port *tc,
bool phy_is_owned = tc_phy_is_owned(tc);
bool is_connected;
- if (tc_phy_is_ready_and_owned(tc, phy_is_ready, phy_is_owned))
+ if (tc_phy_owned_by_display(tc, phy_is_ready, phy_is_owned))
is_connected = port_pll_type == ICL_PORT_DPLL_MG_PHY;
else
is_connected = port_pll_type == ICL_PORT_DPLL_DEFAULT;
@@ -1352,7 +1409,7 @@ tc_phy_get_current_mode(struct intel_tc_port *tc)
phy_is_ready = tc_phy_is_ready(tc);
phy_is_owned = tc_phy_is_owned(tc);
- if (!tc_phy_is_ready_and_owned(tc, phy_is_ready, phy_is_owned)) {
+ if (!tc_phy_owned_by_display(tc, phy_is_ready, phy_is_owned)) {
mode = get_tc_mode_in_phy_not_owned_state(tc, live_mode);
} else {
drm_WARN_ON(display->drm, live_mode == TC_PORT_TBT_ALT);
@@ -1441,11 +1498,11 @@ static void intel_tc_port_reset_mode(struct intel_tc_port *tc,
intel_display_power_flush_work(display);
if (!intel_tc_cold_requires_aux_pw(dig_port)) {
enum intel_display_power_domain aux_domain;
- bool aux_powered;
aux_domain = intel_aux_power_domain(dig_port);
- aux_powered = intel_display_power_is_enabled(display, aux_domain);
- drm_WARN_ON(display->drm, aux_powered);
+ if (intel_display_power_is_enabled(display, aux_domain))
+ drm_dbg_kms(display->drm, "Port %s: AUX unexpectedly powered\n",
+ tc->port_name);
}
tc_phy_disconnect(tc);
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
index e3d188455f67..b9dae15c1d16 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
@@ -514,6 +514,13 @@ static int __create_shmem(struct drm_i915_private *i915,
if (IS_ERR(filp))
return PTR_ERR(filp);
+ /*
+ * Prevent -EFBIG by allowing large writes beyond MAX_NON_LFS on shmem
+ * objects by setting O_LARGEFILE.
+ */
+ if (force_o_largefile())
+ filp->f_flags |= O_LARGEFILE;
+
obj->filp = filp;
return 0;
}
diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c
index b37e400f74e5..5a95f06900b5 100644
--- a/drivers/gpu/drm/i915/gt/intel_workarounds.c
+++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c
@@ -634,6 +634,8 @@ static void cfl_ctx_workarounds_init(struct intel_engine_cs *engine,
static void icl_ctx_workarounds_init(struct intel_engine_cs *engine,
struct i915_wa_list *wal)
{
+ struct drm_i915_private *i915 = engine->i915;
+
/* Wa_1406697149 (WaDisableBankHangMode:icl) */
wa_write(wal, GEN8_L3CNTLREG, GEN8_ERRDETBCTRL);
@@ -669,6 +671,15 @@ static void icl_ctx_workarounds_init(struct intel_engine_cs *engine,
/* Wa_1406306137:icl,ehl */
wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, GEN11_DIS_PICK_2ND_EU);
+
+ if (IS_JASPERLAKE(i915) || IS_ELKHARTLAKE(i915)) {
+ /*
+ * Disable Repacking for Compression (masked R/W access)
+ * before rendering compressed surfaces for display.
+ */
+ wa_masked_en(wal, CACHE_MODE_0_GEN7,
+ DISABLE_REPACKING_FOR_COMPRESSION);
+ }
}
/*
@@ -2306,15 +2317,6 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
GEN8_RC_SEMA_IDLE_MSG_DISABLE);
}
- if (IS_JASPERLAKE(i915) || IS_ELKHARTLAKE(i915)) {
- /*
- * "Disable Repacking for Compression (masked R/W access)
- * before rendering compressed surfaces for display."
- */
- wa_masked_en(wal, CACHE_MODE_0_GEN7,
- DISABLE_REPACKING_FOR_COMPRESSION);
- }
-
if (GRAPHICS_VER(i915) == 11) {
/* This is not a Wa. Enable for better image quality */
wa_masked_en(wal,
diff --git a/drivers/gpu/drm/mediatek/mtk_drm_drv.c b/drivers/gpu/drm/mediatek/mtk_drm_drv.c
index d5e6bab36414..eb5537f0ac90 100644
--- a/drivers/gpu/drm/mediatek/mtk_drm_drv.c
+++ b/drivers/gpu/drm/mediatek/mtk_drm_drv.c
@@ -394,10 +394,12 @@ static bool mtk_drm_get_all_drm_priv(struct device *dev)
continue;
drm_dev = device_find_child(&pdev->dev, NULL, mtk_drm_match);
+ put_device(&pdev->dev);
if (!drm_dev)
continue;
temp_drm_priv = dev_get_drvdata(drm_dev);
+ put_device(drm_dev);
if (!temp_drm_priv)
continue;
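
Both put_device() calls above drop references taken implicitly by lookup helpers: device_find_child() returns its match with the reference count raised, as does whichever helper produced pdev earlier in the loop, and the caller owns those references even if it only needs the drvdata. A minimal sketch of the get/put pairing, with an illustrative function name:

    #include <linux/of_platform.h>
    #include <linux/platform_device.h>

    /* Illustrative only: look up a platform device and release the ref. */
    static void *example_peek_drvdata(struct device_node *node)
    {
    	struct platform_device *pdev;
    	void *drvdata;

    	pdev = of_find_device_by_node(node);
    	if (!pdev)
    		return NULL;

    	drvdata = platform_get_drvdata(pdev);
    	put_device(&pdev->dev);	/* balance of_find_device_by_node() */

    	return drvdata;
    }
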
diff --git a/drivers/gpu/drm/mediatek/mtk_dsi.c b/drivers/gpu/drm/mediatek/mtk_dsi.c
index d7726091819c..0e2bcd5f67b7 100644
--- a/drivers/gpu/drm/mediatek/mtk_dsi.c
+++ b/drivers/gpu/drm/mediatek/mtk_dsi.c
@@ -1002,6 +1002,12 @@ static int mtk_dsi_host_attach(struct mipi_dsi_host *host,
return PTR_ERR(dsi->next_bridge);
}
+ /*
+ * Set the flag to request that the DSI host bridge be pre-enabled before the
+ * device bridge in the chain, so the DSI host is ready when the device
+ * bridge is pre-enabled.
+ */
+ dsi->next_bridge->pre_enable_prev_first = true;
+
drm_bridge_add(&dsi->bridge);
ret = component_add(host->dev, &mtk_dsi_component_ops);
diff --git a/drivers/gpu/drm/mediatek/mtk_hdmi.c b/drivers/gpu/drm/mediatek/mtk_hdmi.c
index 845fd8aa43c3..b766dd5e6c8d 100644
--- a/drivers/gpu/drm/mediatek/mtk_hdmi.c
+++ b/drivers/gpu/drm/mediatek/mtk_hdmi.c
@@ -182,8 +182,8 @@ static inline struct mtk_hdmi *hdmi_ctx_from_bridge(struct drm_bridge *b)
static void mtk_hdmi_hw_vid_black(struct mtk_hdmi *hdmi, bool black)
{
- regmap_update_bits(hdmi->regs, VIDEO_SOURCE_SEL,
- VIDEO_CFG_4, black ? GEN_RGB : NORMAL_PATH);
+ regmap_update_bits(hdmi->regs, VIDEO_CFG_4,
+ VIDEO_SOURCE_SEL, black ? GEN_RGB : NORMAL_PATH);
}
static void mtk_hdmi_hw_make_reg_writable(struct mtk_hdmi *hdmi, bool enable)
@@ -310,8 +310,8 @@ static void mtk_hdmi_hw_send_info_frame(struct mtk_hdmi *hdmi, u8 *buffer,
static void mtk_hdmi_hw_send_aud_packet(struct mtk_hdmi *hdmi, bool enable)
{
- regmap_update_bits(hdmi->regs, AUDIO_PACKET_OFF,
- GRL_SHIFT_R2, enable ? 0 : AUDIO_PACKET_OFF);
+ regmap_update_bits(hdmi->regs, GRL_SHIFT_R2,
+ AUDIO_PACKET_OFF, enable ? 0 : AUDIO_PACKET_OFF);
}
static void mtk_hdmi_hw_config_sys(struct mtk_hdmi *hdmi)
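
Both mtk_hdmi hunks above fix swapped arguments: regmap_update_bits() expects the register offset second and the bit mask third, so the old calls passed the VIDEO_SOURCE_SEL / AUDIO_PACKET_OFF masks in the register slot. A generic reminder of the signature, with illustrative register and bit names:

    #include <linux/regmap.h>
    #include <linux/bits.h>

    #define EXAMPLE_CTRL_REG	0x10		/* illustrative offset */
    #define EXAMPLE_CTRL_EN		BIT(0)		/* illustrative bit */

    /* regmap_update_bits(map, reg, mask, val): RMW, only 'mask' bits change */
    static void example_toggle(struct regmap *map, bool on)
    {
    	regmap_update_bits(map, EXAMPLE_CTRL_REG, EXAMPLE_CTRL_EN,
    			   on ? EXAMPLE_CTRL_EN : 0);
    }
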
diff --git a/drivers/gpu/drm/mediatek/mtk_plane.c b/drivers/gpu/drm/mediatek/mtk_plane.c
index cbc4f37da8ba..02349bd44001 100644
--- a/drivers/gpu/drm/mediatek/mtk_plane.c
+++ b/drivers/gpu/drm/mediatek/mtk_plane.c
@@ -292,7 +292,8 @@ static void mtk_plane_atomic_disable(struct drm_plane *plane,
wmb(); /* Make sure the above parameter is set before update */
mtk_plane_state->pending.dirty = true;
- mtk_crtc_plane_disable(old_state->crtc, plane);
+ if (old_state && old_state->crtc)
+ mtk_crtc_plane_disable(old_state->crtc, plane);
}
static void mtk_plane_atomic_update(struct drm_plane *plane,
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c
index faca2a0243ab..d5d1271fce61 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c
@@ -11,7 +11,7 @@
static const unsigned int *gen7_0_0_external_core_regs[] __always_unused;
static const unsigned int *gen7_2_0_external_core_regs[] __always_unused;
static const unsigned int *gen7_9_0_external_core_regs[] __always_unused;
-static struct gen7_sptp_cluster_registers gen7_9_0_sptp_clusters[] __always_unused;
+static const struct gen7_sptp_cluster_registers gen7_9_0_sptp_clusters[] __always_unused;
static const u32 gen7_9_0_cx_debugbus_blocks[] __always_unused;
#include "adreno_gen7_0_0_snapshot.h"
@@ -174,8 +174,15 @@ static int a6xx_crashdumper_run(struct msm_gpu *gpu,
static int debugbus_read(struct msm_gpu *gpu, u32 block, u32 offset,
u32 *data)
{
- u32 reg = A6XX_DBGC_CFG_DBGBUS_SEL_D_PING_INDEX(offset) |
- A6XX_DBGC_CFG_DBGBUS_SEL_D_PING_BLK_SEL(block);
+ u32 reg;
+
+ if (to_adreno_gpu(gpu)->info->family >= ADRENO_7XX_GEN1) {
+ reg = A7XX_DBGC_CFG_DBGBUS_SEL_D_PING_INDEX(offset) |
+ A7XX_DBGC_CFG_DBGBUS_SEL_D_PING_BLK_SEL(block);
+ } else {
+ reg = A6XX_DBGC_CFG_DBGBUS_SEL_D_PING_INDEX(offset) |
+ A6XX_DBGC_CFG_DBGBUS_SEL_D_PING_BLK_SEL(block);
+ }
gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_SEL_A, reg);
gpu_write(gpu, REG_A6XX_DBGC_CFG_DBGBUS_SEL_B, reg);
@@ -198,11 +205,18 @@ static int debugbus_read(struct msm_gpu *gpu, u32 block, u32 offset,
readl((ptr) + ((offset) << 2))
/* read a value from the CX debug bus */
-static int cx_debugbus_read(void __iomem *cxdbg, u32 block, u32 offset,
+static int cx_debugbus_read(struct msm_gpu *gpu, void __iomem *cxdbg, u32 block, u32 offset,
u32 *data)
{
- u32 reg = A6XX_CX_DBGC_CFG_DBGBUS_SEL_A_PING_INDEX(offset) |
- A6XX_CX_DBGC_CFG_DBGBUS_SEL_A_PING_BLK_SEL(block);
+ u32 reg;
+
+ if (to_adreno_gpu(gpu)->info->family >= ADRENO_7XX_GEN1) {
+ reg = A7XX_CX_DBGC_CFG_DBGBUS_SEL_A_PING_INDEX(offset) |
+ A7XX_CX_DBGC_CFG_DBGBUS_SEL_A_PING_BLK_SEL(block);
+ } else {
+ reg = A6XX_CX_DBGC_CFG_DBGBUS_SEL_A_PING_INDEX(offset) |
+ A6XX_CX_DBGC_CFG_DBGBUS_SEL_A_PING_BLK_SEL(block);
+ }
cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_SEL_A, reg);
cxdbg_write(cxdbg, REG_A6XX_CX_DBGC_CFG_DBGBUS_SEL_B, reg);
@@ -315,7 +329,8 @@ static void a6xx_get_debugbus_block(struct msm_gpu *gpu,
ptr += debugbus_read(gpu, block->id, i, ptr);
}
-static void a6xx_get_cx_debugbus_block(void __iomem *cxdbg,
+static void a6xx_get_cx_debugbus_block(struct msm_gpu *gpu,
+ void __iomem *cxdbg,
struct a6xx_gpu_state *a6xx_state,
const struct a6xx_debugbus_block *block,
struct a6xx_gpu_state_obj *obj)
@@ -330,7 +345,7 @@ static void a6xx_get_cx_debugbus_block(void __iomem *cxdbg,
obj->handle = block;
for (ptr = obj->data, i = 0; i < block->count; i++)
- ptr += cx_debugbus_read(cxdbg, block->id, i, ptr);
+ ptr += cx_debugbus_read(gpu, cxdbg, block->id, i, ptr);
}
static void a6xx_get_debugbus_blocks(struct msm_gpu *gpu,
@@ -423,8 +438,9 @@ static void a7xx_get_debugbus_blocks(struct msm_gpu *gpu,
a6xx_state, &a7xx_debugbus_blocks[gbif_debugbus_blocks[i]],
&a6xx_state->debugbus[i + debugbus_blocks_count]);
}
- }
+ a6xx_state->nr_debugbus = total_debugbus_blocks;
+ }
}
static void a6xx_get_debugbus(struct msm_gpu *gpu,
@@ -526,7 +542,8 @@ static void a6xx_get_debugbus(struct msm_gpu *gpu,
int i;
for (i = 0; i < nr_cx_debugbus_blocks; i++)
- a6xx_get_cx_debugbus_block(cxdbg,
+ a6xx_get_cx_debugbus_block(gpu,
+ cxdbg,
a6xx_state,
&cx_debugbus_blocks[i],
&a6xx_state->cx_debugbus[i]);
@@ -759,15 +776,15 @@ static void a7xx_get_cluster(struct msm_gpu *gpu,
size_t datasize;
int i, regcount = 0;
- /* Some clusters need a selector register to be programmed too */
- if (cluster->sel)
- in += CRASHDUMP_WRITE(in, cluster->sel->cd_reg, cluster->sel->val);
-
in += CRASHDUMP_WRITE(in, REG_A7XX_CP_APERTURE_CNTL_CD,
A7XX_CP_APERTURE_CNTL_CD_PIPE(cluster->pipe_id) |
A7XX_CP_APERTURE_CNTL_CD_CLUSTER(cluster->cluster_id) |
A7XX_CP_APERTURE_CNTL_CD_CONTEXT(cluster->context_id));
+ /* Some clusters need a selector register to be programmed too */
+ if (cluster->sel)
+ in += CRASHDUMP_WRITE(in, cluster->sel->cd_reg, cluster->sel->val);
+
for (i = 0; cluster->regs[i] != UINT_MAX; i += 2) {
int count = RANGE(cluster->regs, i);
@@ -1796,6 +1813,7 @@ static void a7xx_show_shader(struct a6xx_gpu_state_obj *obj,
print_name(p, " - type: ", a7xx_statetype_names[block->statetype]);
print_name(p, " - pipe: ", a7xx_pipe_names[block->pipeid]);
+ drm_printf(p, " - location: %d\n", block->location);
for (i = 0; i < block->num_sps; i++) {
drm_printf(p, " - sp: %d\n", i);
@@ -1873,6 +1891,7 @@ static void a7xx_show_dbgahb_cluster(struct a6xx_gpu_state_obj *obj,
print_name(p, " - pipe: ", a7xx_pipe_names[dbgahb->pipe_id]);
print_name(p, " - cluster-name: ", a7xx_cluster_names[dbgahb->cluster_id]);
drm_printf(p, " - context: %d\n", dbgahb->context_id);
+ drm_printf(p, " - location: %d\n", dbgahb->location_id);
a7xx_show_registers_indented(dbgahb->regs, obj->data, p, 4);
}
}
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.h
index 95d93ac6812a..1c18499b60bb 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.h
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.h
@@ -419,47 +419,47 @@ static const struct a6xx_indexed_registers a6xx_indexed_reglist[] = {
REG_A6XX_CP_SQE_STAT_DATA, 0x33, NULL },
{ "CP_DRAW_STATE", REG_A6XX_CP_DRAW_STATE_ADDR,
REG_A6XX_CP_DRAW_STATE_DATA, 0x100, NULL },
- { "CP_UCODE_DBG_DATA", REG_A6XX_CP_SQE_UCODE_DBG_ADDR,
+ { "CP_SQE_UCODE_DBG", REG_A6XX_CP_SQE_UCODE_DBG_ADDR,
REG_A6XX_CP_SQE_UCODE_DBG_DATA, 0x8000, NULL },
- { "CP_ROQ", REG_A6XX_CP_ROQ_DBG_ADDR,
+ { "CP_ROQ_DBG", REG_A6XX_CP_ROQ_DBG_ADDR,
REG_A6XX_CP_ROQ_DBG_DATA, 0, a6xx_get_cp_roq_size},
};
static const struct a6xx_indexed_registers a7xx_indexed_reglist[] = {
{ "CP_SQE_STAT", REG_A6XX_CP_SQE_STAT_ADDR,
- REG_A6XX_CP_SQE_STAT_DATA, 0x33, NULL },
+ REG_A6XX_CP_SQE_STAT_DATA, 0x40, NULL },
{ "CP_DRAW_STATE", REG_A6XX_CP_DRAW_STATE_ADDR,
REG_A6XX_CP_DRAW_STATE_DATA, 0x100, NULL },
- { "CP_UCODE_DBG_DATA", REG_A6XX_CP_SQE_UCODE_DBG_ADDR,
+ { "CP_SQE_UCODE_DBG", REG_A6XX_CP_SQE_UCODE_DBG_ADDR,
REG_A6XX_CP_SQE_UCODE_DBG_DATA, 0x8000, NULL },
- { "CP_BV_SQE_STAT_ADDR", REG_A7XX_CP_BV_SQE_STAT_ADDR,
- REG_A7XX_CP_BV_SQE_STAT_DATA, 0x33, NULL },
- { "CP_BV_DRAW_STATE_ADDR", REG_A7XX_CP_BV_DRAW_STATE_ADDR,
+ { "CP_BV_SQE_STAT", REG_A7XX_CP_BV_SQE_STAT_ADDR,
+ REG_A7XX_CP_BV_SQE_STAT_DATA, 0x40, NULL },
+ { "CP_BV_DRAW_STATE", REG_A7XX_CP_BV_DRAW_STATE_ADDR,
REG_A7XX_CP_BV_DRAW_STATE_DATA, 0x100, NULL },
- { "CP_BV_SQE_UCODE_DBG_ADDR", REG_A7XX_CP_BV_SQE_UCODE_DBG_ADDR,
+ { "CP_BV_SQE_UCODE_DBG", REG_A7XX_CP_BV_SQE_UCODE_DBG_ADDR,
REG_A7XX_CP_BV_SQE_UCODE_DBG_DATA, 0x8000, NULL },
- { "CP_SQE_AC_STAT_ADDR", REG_A7XX_CP_SQE_AC_STAT_ADDR,
- REG_A7XX_CP_SQE_AC_STAT_DATA, 0x33, NULL },
- { "CP_LPAC_DRAW_STATE_ADDR", REG_A7XX_CP_LPAC_DRAW_STATE_ADDR,
+ { "CP_SQE_AC_STAT", REG_A7XX_CP_SQE_AC_STAT_ADDR,
+ REG_A7XX_CP_SQE_AC_STAT_DATA, 0x40, NULL },
+ { "CP_LPAC_DRAW_STATE", REG_A7XX_CP_LPAC_DRAW_STATE_ADDR,
REG_A7XX_CP_LPAC_DRAW_STATE_DATA, 0x100, NULL },
- { "CP_SQE_AC_UCODE_DBG_ADDR", REG_A7XX_CP_SQE_AC_UCODE_DBG_ADDR,
+ { "CP_SQE_AC_UCODE_DBG", REG_A7XX_CP_SQE_AC_UCODE_DBG_ADDR,
REG_A7XX_CP_SQE_AC_UCODE_DBG_DATA, 0x8000, NULL },
- { "CP_LPAC_FIFO_DBG_ADDR", REG_A7XX_CP_LPAC_FIFO_DBG_ADDR,
+ { "CP_LPAC_FIFO_DBG", REG_A7XX_CP_LPAC_FIFO_DBG_ADDR,
REG_A7XX_CP_LPAC_FIFO_DBG_DATA, 0x40, NULL },
- { "CP_ROQ", REG_A6XX_CP_ROQ_DBG_ADDR,
+ { "CP_ROQ_DBG", REG_A6XX_CP_ROQ_DBG_ADDR,
REG_A6XX_CP_ROQ_DBG_DATA, 0, a7xx_get_cp_roq_size },
};
static const struct a6xx_indexed_registers a6xx_cp_mempool_indexed = {
- "CP_MEMPOOL", REG_A6XX_CP_MEM_POOL_DBG_ADDR,
+ "CP_MEM_POOL_DBG", REG_A6XX_CP_MEM_POOL_DBG_ADDR,
REG_A6XX_CP_MEM_POOL_DBG_DATA, 0x2060, NULL,
};
static const struct a6xx_indexed_registers a7xx_cp_bv_mempool_indexed[] = {
- { "CP_MEMPOOL", REG_A6XX_CP_MEM_POOL_DBG_ADDR,
- REG_A6XX_CP_MEM_POOL_DBG_DATA, 0x2100, NULL },
- { "CP_BV_MEMPOOL", REG_A7XX_CP_BV_MEM_POOL_DBG_ADDR,
- REG_A7XX_CP_BV_MEM_POOL_DBG_DATA, 0x2100, NULL },
+ { "CP_MEM_POOL_DBG", REG_A6XX_CP_MEM_POOL_DBG_ADDR,
+ REG_A6XX_CP_MEM_POOL_DBG_DATA, 0x2200, NULL },
+ { "CP_BV_MEM_POOL_DBG", REG_A7XX_CP_BV_MEM_POOL_DBG_ADDR,
+ REG_A7XX_CP_BV_MEM_POOL_DBG_DATA, 0x2200, NULL },
};
#define DEBUGBUS(_id, _count) { .id = _id, .name = #_id, .count = _count }
diff --git a/drivers/gpu/drm/msm/adreno/adreno_gen7_0_0_snapshot.h b/drivers/gpu/drm/msm/adreno/adreno_gen7_0_0_snapshot.h
index cb66ece6606b..04b49d385f9d 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_gen7_0_0_snapshot.h
+++ b/drivers/gpu/drm/msm/adreno/adreno_gen7_0_0_snapshot.h
@@ -81,7 +81,7 @@ static const u32 gen7_0_0_debugbus_blocks[] = {
A7XX_DBGBUS_USPTP_7,
};
-static struct gen7_shader_block gen7_0_0_shader_blocks[] = {
+static const struct gen7_shader_block gen7_0_0_shader_blocks[] = {
{A7XX_TP0_TMO_DATA, 0x200, 4, 2, A7XX_PIPE_BR, A7XX_USPTP},
{A7XX_TP0_SMO_DATA, 0x80, 4, 2, A7XX_PIPE_BR, A7XX_USPTP},
{A7XX_TP0_MIPMAP_BASE_DATA, 0x3c0, 4, 2, A7XX_PIPE_BR, A7XX_USPTP},
@@ -668,12 +668,19 @@ static const u32 gen7_0_0_sp_noncontext_pipe_lpac_usptp_registers[] = {
};
static_assert(IS_ALIGNED(sizeof(gen7_0_0_sp_noncontext_pipe_lpac_usptp_registers), 8));
-/* Block: TPl1 Cluster: noncontext Pipeline: A7XX_PIPE_BR */
-static const u32 gen7_0_0_tpl1_noncontext_pipe_br_registers[] = {
+/* Block: TPl1 Cluster: noncontext Pipeline: A7XX_PIPE_NONE */
+static const u32 gen7_0_0_tpl1_noncontext_pipe_none_registers[] = {
0x0b600, 0x0b600, 0x0b602, 0x0b602, 0x0b604, 0x0b604, 0x0b608, 0x0b60c,
0x0b60f, 0x0b621, 0x0b630, 0x0b633,
UINT_MAX, UINT_MAX,
};
+static_assert(IS_ALIGNED(sizeof(gen7_0_0_tpl1_noncontext_pipe_none_registers), 8));
+
+/* Block: TPl1 Cluster: noncontext Pipeline: A7XX_PIPE_BR */
+static const u32 gen7_0_0_tpl1_noncontext_pipe_br_registers[] = {
+ 0x0b600, 0x0b600,
+ UINT_MAX, UINT_MAX,
+};
static_assert(IS_ALIGNED(sizeof(gen7_0_0_tpl1_noncontext_pipe_br_registers), 8));
/* Block: TPl1 Cluster: noncontext Pipeline: A7XX_PIPE_LPAC */
@@ -695,7 +702,7 @@ static const struct gen7_sel_reg gen7_0_0_rb_rbp_sel = {
.val = 0x9,
};
-static struct gen7_cluster_registers gen7_0_0_clusters[] = {
+static const struct gen7_cluster_registers gen7_0_0_clusters[] = {
{ A7XX_CLUSTER_NONE, A7XX_PIPE_BR, STATE_NON_CONTEXT,
gen7_0_0_noncontext_pipe_br_registers, },
{ A7XX_CLUSTER_NONE, A7XX_PIPE_BV, STATE_NON_CONTEXT,
@@ -764,7 +771,7 @@ static struct gen7_cluster_registers gen7_0_0_clusters[] = {
gen7_0_0_vpc_cluster_vpc_ps_pipe_bv_registers, },
};
-static struct gen7_sptp_cluster_registers gen7_0_0_sptp_clusters[] = {
+static const struct gen7_sptp_cluster_registers gen7_0_0_sptp_clusters[] = {
{ A7XX_CLUSTER_NONE, A7XX_SP_NCTX_REG, A7XX_PIPE_BR, 0, A7XX_HLSQ_STATE,
gen7_0_0_sp_noncontext_pipe_br_hlsq_state_registers, 0xae00 },
{ A7XX_CLUSTER_NONE, A7XX_SP_NCTX_REG, A7XX_PIPE_BR, 0, A7XX_SP_TOP,
@@ -914,7 +921,7 @@ static const u32 gen7_0_0_dpm_registers[] = {
};
static_assert(IS_ALIGNED(sizeof(gen7_0_0_dpm_registers), 8));
-static struct gen7_reg_list gen7_0_0_reg_list[] = {
+static const struct gen7_reg_list gen7_0_0_reg_list[] = {
{ gen7_0_0_gpu_registers, NULL },
{ gen7_0_0_cx_misc_registers, NULL },
{ gen7_0_0_dpm_registers, NULL },
diff --git a/drivers/gpu/drm/msm/adreno/adreno_gen7_2_0_snapshot.h b/drivers/gpu/drm/msm/adreno/adreno_gen7_2_0_snapshot.h
index 6f8ad50f32ce..772652eb61f3 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_gen7_2_0_snapshot.h
+++ b/drivers/gpu/drm/msm/adreno/adreno_gen7_2_0_snapshot.h
@@ -95,7 +95,7 @@ static const u32 gen7_2_0_debugbus_blocks[] = {
A7XX_DBGBUS_CCHE_2,
};
-static struct gen7_shader_block gen7_2_0_shader_blocks[] = {
+static const struct gen7_shader_block gen7_2_0_shader_blocks[] = {
{A7XX_TP0_TMO_DATA, 0x200, 6, 2, A7XX_PIPE_BR, A7XX_USPTP},
{A7XX_TP0_SMO_DATA, 0x80, 6, 2, A7XX_PIPE_BR, A7XX_USPTP},
{A7XX_TP0_MIPMAP_BASE_DATA, 0x3c0, 6, 2, A7XX_PIPE_BR, A7XX_USPTP},
@@ -489,7 +489,7 @@ static const struct gen7_sel_reg gen7_2_0_rb_rbp_sel = {
.val = 0x9,
};
-static struct gen7_cluster_registers gen7_2_0_clusters[] = {
+static const struct gen7_cluster_registers gen7_2_0_clusters[] = {
{ A7XX_CLUSTER_NONE, A7XX_PIPE_BR, STATE_NON_CONTEXT,
gen7_2_0_noncontext_pipe_br_registers, },
{ A7XX_CLUSTER_NONE, A7XX_PIPE_BV, STATE_NON_CONTEXT,
@@ -558,7 +558,7 @@ static struct gen7_cluster_registers gen7_2_0_clusters[] = {
gen7_0_0_vpc_cluster_vpc_ps_pipe_bv_registers, },
};
-static struct gen7_sptp_cluster_registers gen7_2_0_sptp_clusters[] = {
+static const struct gen7_sptp_cluster_registers gen7_2_0_sptp_clusters[] = {
{ A7XX_CLUSTER_NONE, A7XX_SP_NCTX_REG, A7XX_PIPE_BR, 0, A7XX_HLSQ_STATE,
gen7_0_0_sp_noncontext_pipe_br_hlsq_state_registers, 0xae00 },
{ A7XX_CLUSTER_NONE, A7XX_SP_NCTX_REG, A7XX_PIPE_BR, 0, A7XX_SP_TOP,
@@ -573,6 +573,8 @@ static struct gen7_sptp_cluster_registers gen7_2_0_sptp_clusters[] = {
gen7_0_0_sp_noncontext_pipe_lpac_usptp_registers, 0xaf80 },
{ A7XX_CLUSTER_NONE, A7XX_TP0_NCTX_REG, A7XX_PIPE_BR, 0, A7XX_USPTP,
gen7_0_0_tpl1_noncontext_pipe_br_registers, 0xb600 },
+ { A7XX_CLUSTER_NONE, A7XX_TP0_NCTX_REG, A7XX_PIPE_NONE, 0, A7XX_USPTP,
+ gen7_0_0_tpl1_noncontext_pipe_none_registers, 0xb600 },
{ A7XX_CLUSTER_NONE, A7XX_TP0_NCTX_REG, A7XX_PIPE_LPAC, 0, A7XX_USPTP,
gen7_0_0_tpl1_noncontext_pipe_lpac_registers, 0xb780 },
{ A7XX_CLUSTER_SP_PS, A7XX_SP_CTX0_3D_CPS_REG, A7XX_PIPE_BR, 0, A7XX_HLSQ_STATE,
@@ -737,7 +739,7 @@ static const u32 gen7_2_0_dpm_registers[] = {
};
static_assert(IS_ALIGNED(sizeof(gen7_2_0_dpm_registers), 8));
-static struct gen7_reg_list gen7_2_0_reg_list[] = {
+static const struct gen7_reg_list gen7_2_0_reg_list[] = {
{ gen7_2_0_gpu_registers, NULL },
{ gen7_2_0_cx_misc_registers, NULL },
{ gen7_2_0_dpm_registers, NULL },
diff --git a/drivers/gpu/drm/msm/adreno/adreno_gen7_9_0_snapshot.h b/drivers/gpu/drm/msm/adreno/adreno_gen7_9_0_snapshot.h
index e02cabb39f19..0956dfca1f05 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_gen7_9_0_snapshot.h
+++ b/drivers/gpu/drm/msm/adreno/adreno_gen7_9_0_snapshot.h
@@ -117,7 +117,7 @@ static const u32 gen7_9_0_cx_debugbus_blocks[] = {
A7XX_DBGBUS_GBIF_CX,
};
-static struct gen7_shader_block gen7_9_0_shader_blocks[] = {
+static const struct gen7_shader_block gen7_9_0_shader_blocks[] = {
{ A7XX_TP0_TMO_DATA, 0x0200, 6, 2, A7XX_PIPE_BR, A7XX_USPTP },
{ A7XX_TP0_SMO_DATA, 0x0080, 6, 2, A7XX_PIPE_BR, A7XX_USPTP },
{ A7XX_TP0_MIPMAP_BASE_DATA, 0x03C0, 6, 2, A7XX_PIPE_BR, A7XX_USPTP },
@@ -1116,7 +1116,7 @@ static const struct gen7_sel_reg gen7_9_0_rb_rbp_sel = {
.val = 0x9,
};
-static struct gen7_cluster_registers gen7_9_0_clusters[] = {
+static const struct gen7_cluster_registers gen7_9_0_clusters[] = {
{ A7XX_CLUSTER_NONE, A7XX_PIPE_BR, STATE_NON_CONTEXT,
gen7_9_0_non_context_pipe_br_registers, },
{ A7XX_CLUSTER_NONE, A7XX_PIPE_BV, STATE_NON_CONTEXT,
@@ -1185,7 +1185,7 @@ static struct gen7_cluster_registers gen7_9_0_clusters[] = {
gen7_9_0_vpc_pipe_bv_cluster_vpc_ps_registers, },
};
-static struct gen7_sptp_cluster_registers gen7_9_0_sptp_clusters[] = {
+static const struct gen7_sptp_cluster_registers gen7_9_0_sptp_clusters[] = {
{ A7XX_CLUSTER_NONE, A7XX_SP_NCTX_REG, A7XX_PIPE_BR, 0, A7XX_HLSQ_STATE,
gen7_9_0_non_context_sp_pipe_br_hlsq_state_registers, 0xae00},
{ A7XX_CLUSTER_NONE, A7XX_SP_NCTX_REG, A7XX_PIPE_BR, 0, A7XX_SP_TOP,
@@ -1294,34 +1294,34 @@ static struct gen7_sptp_cluster_registers gen7_9_0_sptp_clusters[] = {
gen7_9_0_tpl1_pipe_br_cluster_sp_ps_usptp_registers, 0xb000},
};
-static struct a6xx_indexed_registers gen7_9_0_cp_indexed_reg_list[] = {
+static const struct a6xx_indexed_registers gen7_9_0_cp_indexed_reg_list[] = {
{ "CP_SQE_STAT", REG_A6XX_CP_SQE_STAT_ADDR,
REG_A6XX_CP_SQE_STAT_DATA, 0x00040},
{ "CP_DRAW_STATE", REG_A6XX_CP_DRAW_STATE_ADDR,
REG_A6XX_CP_DRAW_STATE_DATA, 0x00200},
- { "CP_ROQ", REG_A6XX_CP_ROQ_DBG_ADDR,
+ { "CP_ROQ_DBG", REG_A6XX_CP_ROQ_DBG_ADDR,
REG_A6XX_CP_ROQ_DBG_DATA, 0x00800},
- { "CP_UCODE_DBG_DATA", REG_A6XX_CP_SQE_UCODE_DBG_ADDR,
+ { "CP_SQE_UCODE_DBG", REG_A6XX_CP_SQE_UCODE_DBG_ADDR,
REG_A6XX_CP_SQE_UCODE_DBG_DATA, 0x08000},
- { "CP_BV_DRAW_STATE_ADDR", REG_A7XX_CP_BV_DRAW_STATE_ADDR,
+ { "CP_BV_DRAW_STATE", REG_A7XX_CP_BV_DRAW_STATE_ADDR,
REG_A7XX_CP_BV_DRAW_STATE_DATA, 0x00200},
- { "CP_BV_ROQ_DBG_ADDR", REG_A7XX_CP_BV_ROQ_DBG_ADDR,
+ { "CP_BV_ROQ_DBG", REG_A7XX_CP_BV_ROQ_DBG_ADDR,
REG_A7XX_CP_BV_ROQ_DBG_DATA, 0x00800},
- { "CP_BV_SQE_UCODE_DBG_ADDR", REG_A7XX_CP_BV_SQE_UCODE_DBG_ADDR,
+ { "CP_BV_SQE_UCODE_DBG", REG_A7XX_CP_BV_SQE_UCODE_DBG_ADDR,
REG_A7XX_CP_BV_SQE_UCODE_DBG_DATA, 0x08000},
- { "CP_BV_SQE_STAT_ADDR", REG_A7XX_CP_BV_SQE_STAT_ADDR,
+ { "CP_BV_SQE_STAT", REG_A7XX_CP_BV_SQE_STAT_ADDR,
REG_A7XX_CP_BV_SQE_STAT_DATA, 0x00040},
- { "CP_RESOURCE_TBL", REG_A7XX_CP_RESOURCE_TABLE_DBG_ADDR,
+ { "CP_RESOURCE_TABLE_DBG", REG_A7XX_CP_RESOURCE_TABLE_DBG_ADDR,
REG_A7XX_CP_RESOURCE_TABLE_DBG_DATA, 0x04100},
- { "CP_LPAC_DRAW_STATE_ADDR", REG_A7XX_CP_LPAC_DRAW_STATE_ADDR,
+ { "CP_LPAC_DRAW_STATE", REG_A7XX_CP_LPAC_DRAW_STATE_ADDR,
REG_A7XX_CP_LPAC_DRAW_STATE_DATA, 0x00200},
- { "CP_LPAC_ROQ", REG_A7XX_CP_LPAC_ROQ_DBG_ADDR,
+ { "CP_LPAC_ROQ_DBG", REG_A7XX_CP_LPAC_ROQ_DBG_ADDR,
REG_A7XX_CP_LPAC_ROQ_DBG_DATA, 0x00200},
- { "CP_SQE_AC_UCODE_DBG_ADDR", REG_A7XX_CP_SQE_AC_UCODE_DBG_ADDR,
+ { "CP_SQE_AC_UCODE_DBG", REG_A7XX_CP_SQE_AC_UCODE_DBG_ADDR,
REG_A7XX_CP_SQE_AC_UCODE_DBG_DATA, 0x08000},
- { "CP_SQE_AC_STAT_ADDR", REG_A7XX_CP_SQE_AC_STAT_ADDR,
+ { "CP_SQE_AC_STAT", REG_A7XX_CP_SQE_AC_STAT_ADDR,
REG_A7XX_CP_SQE_AC_STAT_DATA, 0x00040},
- { "CP_LPAC_FIFO_DBG_ADDR", REG_A7XX_CP_LPAC_FIFO_DBG_ADDR,
+ { "CP_LPAC_FIFO_DBG", REG_A7XX_CP_LPAC_FIFO_DBG_ADDR,
REG_A7XX_CP_LPAC_FIFO_DBG_DATA, 0x00040},
{ "CP_AQE_ROQ_0", REG_A7XX_CP_AQE_ROQ_DBG_ADDR_0,
REG_A7XX_CP_AQE_ROQ_DBG_DATA_0, 0x00100},
@@ -1337,7 +1337,7 @@ static struct a6xx_indexed_registers gen7_9_0_cp_indexed_reg_list[] = {
REG_A7XX_CP_AQE_STAT_DATA_1, 0x00040},
};
-static struct gen7_reg_list gen7_9_0_reg_list[] = {
+static const struct gen7_reg_list gen7_9_0_reg_list[] = {
{ gen7_9_0_gpu_registers, NULL},
{ gen7_9_0_cx_misc_registers, NULL},
{ gen7_9_0_cx_dbgc_registers, NULL},
diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c
index d4b545448d74..94912b4708fb 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c
@@ -596,7 +596,7 @@ static void _dpu_crtc_complete_flip(struct drm_crtc *crtc)
spin_lock_irqsave(&dev->event_lock, flags);
if (dpu_crtc->event) {
- DRM_DEBUG_VBL("%s: send event: %pK\n", dpu_crtc->name,
+ DRM_DEBUG_VBL("%s: send event: %p\n", dpu_crtc->name,
dpu_crtc->event);
trace_dpu_crtc_complete_flip(DRMID(crtc));
drm_crtc_send_vblank_event(crtc, dpu_crtc->event);
diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c
index 05e5f3463e30..258edaa18fc0 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c
@@ -730,6 +730,8 @@ bool dpu_encoder_needs_modeset(struct drm_encoder *drm_enc, struct drm_atomic_st
return false;
conn_state = drm_atomic_get_new_connector_state(state, connector);
+ if (!conn_state)
+ return false;
/**
* These checks are duplicated from dpu_encoder_update_topology() since
diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.c
index 11fb1bc54fa9..54b20faa0b69 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.c
@@ -31,14 +31,14 @@ static void dpu_setup_dspp_pcc(struct dpu_hw_dspp *ctx,
u32 base;
if (!ctx) {
- DRM_ERROR("invalid ctx %pK\n", ctx);
+ DRM_ERROR("invalid ctx %p\n", ctx);
return;
}
base = ctx->cap->sblk->pcc.base;
if (!base) {
- DRM_ERROR("invalid ctx %pK pcc base 0x%x\n", ctx, base);
+ DRM_ERROR("invalid ctx %p pcc base 0x%x\n", ctx, base);
return;
}
diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c
index 12dcb32b4724..a306077647c3 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c
@@ -1345,7 +1345,7 @@ static int dpu_kms_mmap_mdp5(struct dpu_kms *dpu_kms)
dpu_kms->mmio = NULL;
return ret;
}
- DRM_DEBUG("mapped dpu address space @%pK\n", dpu_kms->mmio);
+ DRM_DEBUG("mapped dpu address space @%p\n", dpu_kms->mmio);
dpu_kms->vbif[VBIF_RT] = msm_ioremap_mdss(mdss_dev,
dpu_kms->pdev,
@@ -1380,7 +1380,7 @@ static int dpu_kms_mmap_dpu(struct dpu_kms *dpu_kms)
dpu_kms->mmio = NULL;
return ret;
}
- DRM_DEBUG("mapped dpu address space @%pK\n", dpu_kms->mmio);
+ DRM_DEBUG("mapped dpu address space @%p\n", dpu_kms->mmio);
dpu_kms->vbif[VBIF_RT] = msm_ioremap(pdev, "vbif");
if (IS_ERR(dpu_kms->vbif[VBIF_RT])) {
diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c
index 01171c535a27..6859e8ef6b05 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c
@@ -1129,7 +1129,7 @@ static int dpu_plane_virtual_atomic_check(struct drm_plane *plane,
struct drm_plane_state *old_plane_state =
drm_atomic_get_old_plane_state(state, plane);
struct dpu_plane_state *pstate = to_dpu_plane_state(plane_state);
- struct drm_crtc_state *crtc_state;
+ struct drm_crtc_state *crtc_state = NULL;
int ret;
if (IS_ERR(plane_state))
@@ -1162,7 +1162,7 @@ static int dpu_plane_virtual_atomic_check(struct drm_plane *plane,
if (!old_plane_state || !old_plane_state->fb ||
old_plane_state->src_w != plane_state->src_w ||
old_plane_state->src_h != plane_state->src_h ||
- old_plane_state->src_w != plane_state->src_w ||
+ old_plane_state->crtc_w != plane_state->crtc_w ||
old_plane_state->crtc_h != plane_state->crtc_h ||
msm_framebuffer_format(old_plane_state->fb) !=
msm_framebuffer_format(plane_state->fb))
diff --git a/drivers/gpu/drm/msm/dsi/phy/dsi_phy.c b/drivers/gpu/drm/msm/dsi/phy/dsi_phy.c
index 221f12db5f8b..4ea681130dba 100644
--- a/drivers/gpu/drm/msm/dsi/phy/dsi_phy.c
+++ b/drivers/gpu/drm/msm/dsi/phy/dsi_phy.c
@@ -5,6 +5,8 @@
#include <linux/clk-provider.h>
#include <linux/platform_device.h>
+#include <linux/pm_clock.h>
+#include <linux/pm_runtime.h>
#include <dt-bindings/phy/phy.h>
#include "dsi_phy.h"
@@ -511,30 +513,6 @@ int msm_dsi_cphy_timing_calc_v4(struct msm_dsi_dphy_timing *timing,
return 0;
}
-static int dsi_phy_enable_resource(struct msm_dsi_phy *phy)
-{
- struct device *dev = &phy->pdev->dev;
- int ret;
-
- ret = pm_runtime_resume_and_get(dev);
- if (ret)
- return ret;
-
- ret = clk_prepare_enable(phy->ahb_clk);
- if (ret) {
- DRM_DEV_ERROR(dev, "%s: can't enable ahb clk, %d\n", __func__, ret);
- pm_runtime_put_sync(dev);
- }
-
- return ret;
-}
-
-static void dsi_phy_disable_resource(struct msm_dsi_phy *phy)
-{
- clk_disable_unprepare(phy->ahb_clk);
- pm_runtime_put(&phy->pdev->dev);
-}
-
static const struct of_device_id dsi_phy_dt_match[] = {
#ifdef CONFIG_DRM_MSM_DSI_28NM_PHY
{ .compatible = "qcom,dsi-phy-28nm-hpm",
@@ -698,22 +676,20 @@ static int dsi_phy_driver_probe(struct platform_device *pdev)
if (ret)
return ret;
- phy->ahb_clk = msm_clk_get(pdev, "iface");
- if (IS_ERR(phy->ahb_clk))
- return dev_err_probe(dev, PTR_ERR(phy->ahb_clk),
- "Unable to get ahb clk\n");
+ platform_set_drvdata(pdev, phy);
- ret = devm_pm_runtime_enable(&pdev->dev);
+ ret = devm_pm_runtime_enable(dev);
if (ret)
return ret;
- /* PLL init will call into clk_register which requires
- * register access, so we need to enable power and ahb clock.
- */
- ret = dsi_phy_enable_resource(phy);
+ ret = devm_pm_clk_create(dev);
if (ret)
return ret;
+ ret = pm_clk_add(dev, "iface");
+ if (ret < 0)
+ return dev_err_probe(dev, ret, "Unable to get iface clk\n");
+
if (phy->cfg->ops.pll_init) {
ret = phy->cfg->ops.pll_init(phy);
if (ret)
@@ -727,18 +703,19 @@ static int dsi_phy_driver_probe(struct platform_device *pdev)
return dev_err_probe(dev, ret,
"Failed to register clk provider\n");
- dsi_phy_disable_resource(phy);
-
- platform_set_drvdata(pdev, phy);
-
return 0;
}
+static const struct dev_pm_ops dsi_phy_pm_ops = {
+ SET_RUNTIME_PM_OPS(pm_clk_suspend, pm_clk_resume, NULL)
+};
+
static struct platform_driver dsi_phy_platform_driver = {
.probe = dsi_phy_driver_probe,
.driver = {
.name = "msm_dsi_phy",
.of_match_table = dsi_phy_dt_match,
+ .pm = &dsi_phy_pm_ops,
},
};
@@ -764,9 +741,9 @@ int msm_dsi_phy_enable(struct msm_dsi_phy *phy,
dev = &phy->pdev->dev;
- ret = dsi_phy_enable_resource(phy);
+ ret = pm_runtime_resume_and_get(dev);
if (ret) {
- DRM_DEV_ERROR(dev, "%s: resource enable failed, %d\n",
+ DRM_DEV_ERROR(dev, "%s: resume failed, %d\n",
__func__, ret);
goto res_en_fail;
}
@@ -810,7 +787,7 @@ pll_restor_fail:
phy_en_fail:
regulator_bulk_disable(phy->cfg->num_regulators, phy->supplies);
reg_en_fail:
- dsi_phy_disable_resource(phy);
+ pm_runtime_put(dev);
res_en_fail:
return ret;
}
@@ -823,7 +800,7 @@ void msm_dsi_phy_disable(struct msm_dsi_phy *phy)
phy->cfg->ops.disable(phy);
regulator_bulk_disable(phy->cfg->num_regulators, phy->supplies);
- dsi_phy_disable_resource(phy);
+ pm_runtime_put(&phy->pdev->dev);
}
void msm_dsi_phy_set_usecase(struct msm_dsi_phy *phy,
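
The dsi_phy changes above move the "iface" clock under the pm_clk framework: pm_clk_add() registers the clock against the device and the runtime-PM callbacks gate it, so the open-coded enable/disable helpers and the cached ahb_clk pointer can go away and callers just use pm_runtime_resume_and_get()/pm_runtime_put(). A minimal probe-side sketch of the same pattern for a hypothetical driver:

    #include <linux/platform_device.h>
    #include <linux/pm_clock.h>
    #include <linux/pm_runtime.h>

    static int example_probe(struct platform_device *pdev)
    {
    	struct device *dev = &pdev->dev;
    	int ret;

    	ret = devm_pm_runtime_enable(dev);
    	if (ret)
    		return ret;

    	ret = devm_pm_clk_create(dev);
    	if (ret)
    		return ret;

    	/* clocks listed here are handled by pm_clk_suspend/pm_clk_resume */
    	ret = pm_clk_add(dev, "iface");
    	if (ret < 0)
    		return dev_err_probe(dev, ret, "Unable to get iface clk\n");

    	return 0;
    }

    static const struct dev_pm_ops example_pm_ops = {
    	SET_RUNTIME_PM_OPS(pm_clk_suspend, pm_clk_resume, NULL)
    };
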
diff --git a/drivers/gpu/drm/msm/dsi/phy/dsi_phy.h b/drivers/gpu/drm/msm/dsi/phy/dsi_phy.h
index c558f8df1684..3cbf08231492 100644
--- a/drivers/gpu/drm/msm/dsi/phy/dsi_phy.h
+++ b/drivers/gpu/drm/msm/dsi/phy/dsi_phy.h
@@ -104,7 +104,6 @@ struct msm_dsi_phy {
phys_addr_t lane_size;
int id;
- struct clk *ahb_clk;
struct regulator_bulk_data *supplies;
struct msm_dsi_dphy_timing timing;
diff --git a/drivers/gpu/drm/msm/msm_debugfs.c b/drivers/gpu/drm/msm/msm_debugfs.c
index bbda865addae..97dc70876442 100644
--- a/drivers/gpu/drm/msm/msm_debugfs.c
+++ b/drivers/gpu/drm/msm/msm_debugfs.c
@@ -325,25 +325,28 @@ static struct drm_info_list msm_debugfs_list[] = {
static int late_init_minor(struct drm_minor *minor)
{
- struct drm_device *dev = minor->dev;
- struct msm_drm_private *priv = dev->dev_private;
+ struct drm_device *dev;
+ struct msm_drm_private *priv;
int ret;
if (!minor)
return 0;
+ dev = minor->dev;
+ priv = dev->dev_private;
+
if (!priv->gpu_pdev)
return 0;
ret = msm_rd_debugfs_init(minor);
if (ret) {
- DRM_DEV_ERROR(minor->dev->dev, "could not install rd debugfs\n");
+ DRM_DEV_ERROR(dev->dev, "could not install rd debugfs\n");
return ret;
}
ret = msm_perf_debugfs_init(minor);
if (ret) {
- DRM_DEV_ERROR(minor->dev->dev, "could not install perf debugfs\n");
+ DRM_DEV_ERROR(dev->dev, "could not install perf debugfs\n");
return ret;
}
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
index 7ff994d4f91a..e7631f4ef530 100644
--- a/drivers/gpu/drm/msm/msm_gem.c
+++ b/drivers/gpu/drm/msm/msm_gem.c
@@ -95,7 +95,6 @@ void msm_gem_vma_get(struct drm_gem_object *obj)
void msm_gem_vma_put(struct drm_gem_object *obj)
{
struct msm_drm_private *priv = obj->dev->dev_private;
- struct drm_exec exec;
if (atomic_dec_return(&to_msm_bo(obj)->vma_ref))
return;
@@ -103,9 +102,13 @@ void msm_gem_vma_put(struct drm_gem_object *obj)
if (!priv->kms)
return;
+#ifdef CONFIG_DRM_MSM_KMS
+ struct drm_exec exec;
+
msm_gem_lock_vm_and_obj(&exec, obj, priv->kms->vm);
put_iova_spaces(obj, priv->kms->vm, true, "vma_put");
drm_exec_fini(&exec); /* drop locks */
+#endif
}
/*
@@ -663,9 +666,13 @@ int msm_gem_set_iova(struct drm_gem_object *obj,
static bool is_kms_vm(struct drm_gpuvm *vm)
{
+#ifdef CONFIG_DRM_MSM_KMS
struct msm_drm_private *priv = vm->drm->dev_private;
return priv->kms && (priv->kms->vm == vm);
+#else
+ return false;
+#endif
}
/*
@@ -1113,10 +1120,12 @@ static void msm_gem_free_object(struct drm_gem_object *obj)
put_pages(obj);
}
- if (msm_obj->flags & MSM_BO_NO_SHARE) {
+ if (obj->resv != &obj->_resv) {
struct drm_gem_object *r_obj =
container_of(obj->resv, struct drm_gem_object, _resv);
+ WARN_ON(!(msm_obj->flags & MSM_BO_NO_SHARE));
+
/* Drop reference we hold to shared resv obj: */
drm_gem_object_put(r_obj);
}
diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h
index 88239da1cd72..751c3b4965bc 100644
--- a/drivers/gpu/drm/msm/msm_gem.h
+++ b/drivers/gpu/drm/msm/msm_gem.h
@@ -100,7 +100,7 @@ struct msm_gem_vm {
*
* Only used for kernel managed VMs, unused for user managed VMs.
*
- * Protected by @mm_lock.
+ * Protected by the vm lock. See msm_gem_lock_vm_and_obj() for an example.
*/
struct drm_mm mm;
diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c
index 5f8e939a5906..3ab3b27134f9 100644
--- a/drivers/gpu/drm/msm/msm_gem_submit.c
+++ b/drivers/gpu/drm/msm/msm_gem_submit.c
@@ -271,32 +271,37 @@ out:
return ret;
}
-/* This is where we make sure all the bo's are reserved and pin'd: */
-static int submit_lock_objects(struct msm_gem_submit *submit)
+static int submit_lock_objects_vmbind(struct msm_gem_submit *submit)
{
- unsigned flags = DRM_EXEC_INTERRUPTIBLE_WAIT;
+ unsigned flags = DRM_EXEC_INTERRUPTIBLE_WAIT | DRM_EXEC_IGNORE_DUPLICATES;
struct drm_exec *exec = &submit->exec;
- int ret;
+ int ret = 0;
- if (msm_context_is_vmbind(submit->queue->ctx)) {
- flags |= DRM_EXEC_IGNORE_DUPLICATES;
+ drm_exec_init(&submit->exec, flags, submit->nr_bos);
- drm_exec_init(&submit->exec, flags, submit->nr_bos);
+ drm_exec_until_all_locked (&submit->exec) {
+ ret = drm_gpuvm_prepare_vm(submit->vm, exec, 1);
+ drm_exec_retry_on_contention(exec);
+ if (ret)
+ break;
- drm_exec_until_all_locked (&submit->exec) {
- ret = drm_gpuvm_prepare_vm(submit->vm, exec, 1);
- drm_exec_retry_on_contention(exec);
- if (ret)
- return ret;
+ ret = drm_gpuvm_prepare_objects(submit->vm, exec, 1);
+ drm_exec_retry_on_contention(exec);
+ if (ret)
+ break;
+ }
- ret = drm_gpuvm_prepare_objects(submit->vm, exec, 1);
- drm_exec_retry_on_contention(exec);
- if (ret)
- return ret;
- }
+ return ret;
+}
- return 0;
- }
+/* This is where we make sure all the bo's are reserved and pin'd: */
+static int submit_lock_objects(struct msm_gem_submit *submit)
+{
+ unsigned flags = DRM_EXEC_INTERRUPTIBLE_WAIT;
+ int ret = 0;
+
+ if (msm_context_is_vmbind(submit->queue->ctx))
+ return submit_lock_objects_vmbind(submit);
drm_exec_init(&submit->exec, flags, submit->nr_bos);
@@ -305,17 +310,17 @@ static int submit_lock_objects(struct msm_gem_submit *submit)
drm_gpuvm_resv_obj(submit->vm));
drm_exec_retry_on_contention(&submit->exec);
if (ret)
- return ret;
+ break;
for (unsigned i = 0; i < submit->nr_bos; i++) {
struct drm_gem_object *obj = submit->bos[i].obj;
ret = drm_exec_prepare_obj(&submit->exec, obj, 1);
drm_exec_retry_on_contention(&submit->exec);
if (ret)
- return ret;
+ break;
}
}
- return 0;
+ return ret;
}
static int submit_fence_sync(struct msm_gem_submit *submit)
@@ -514,14 +519,15 @@ out:
*/
static void submit_cleanup(struct msm_gem_submit *submit, bool error)
{
+ if (error)
+ submit_unpin_objects(submit);
+
if (submit->exec.objects)
drm_exec_fini(&submit->exec);
- if (error) {
- submit_unpin_objects(submit);
- /* job wasn't enqueued to scheduler, so early retirement: */
+ /* if job wasn't enqueued to scheduler, early retirement: */
+ if (error)
msm_submit_retire(submit);
- }
}
void msm_submit_retire(struct msm_gem_submit *submit)
@@ -769,12 +775,8 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
if (ret == 0 && args->flags & MSM_SUBMIT_FENCE_FD_OUT) {
sync_file = sync_file_create(submit->user_fence);
- if (!sync_file) {
+ if (!sync_file)
ret = -ENOMEM;
- } else {
- fd_install(out_fence_fd, sync_file->file);
- args->fence_fd = out_fence_fd;
- }
}
if (ret)
@@ -812,10 +814,14 @@ out:
out_unlock:
mutex_unlock(&queue->lock);
out_post_unlock:
- if (ret && (out_fence_fd >= 0)) {
- put_unused_fd(out_fence_fd);
+ if (ret) {
+ if (out_fence_fd >= 0)
+ put_unused_fd(out_fence_fd);
if (sync_file)
fput(sync_file->file);
+ } else if (sync_file) {
+ fd_install(out_fence_fd, sync_file->file);
+ args->fence_fd = out_fence_fd;
}
if (!IS_ERR_OR_NULL(submit)) {
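
The fd handling above follows the usual publish-last rule: fd_install() makes the descriptor visible to userspace and cannot be undone, so it is deferred until every failure path has been passed, while errors before that point unwind with put_unused_fd() and fput(). A hedged, self-contained sketch of the pattern with illustrative names:

    #include <linux/dma-fence.h>
    #include <linux/fcntl.h>
    #include <linux/file.h>
    #include <linux/sync_file.h>

    static int example_export_fence_fd(struct dma_fence *fence)
    {
    	struct sync_file *sf;
    	int fd;

    	fd = get_unused_fd_flags(O_CLOEXEC);
    	if (fd < 0)
    		return fd;

    	sf = sync_file_create(fence);
    	if (!sf) {
    		put_unused_fd(fd);	/* fd never became visible */
    		return -ENOMEM;
    	}

    	/*
    	 * Anything that can still fail goes here; on failure unwind with
    	 * put_unused_fd(fd) and fput(sf->file) instead of installing.
    	 */

    	fd_install(fd, sf->file);	/* point of no return */
    	return fd;
    }
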
diff --git a/drivers/gpu/drm/msm/msm_gem_vma.c b/drivers/gpu/drm/msm/msm_gem_vma.c
index 3cd8562a5109..00d0f3b7ba32 100644
--- a/drivers/gpu/drm/msm/msm_gem_vma.c
+++ b/drivers/gpu/drm/msm/msm_gem_vma.c
@@ -319,13 +319,10 @@ msm_gem_vma_map(struct drm_gpuva *vma, int prot, struct sg_table *sgt)
mutex_lock(&vm->mmu_lock);
/*
- * NOTE: iommu/io-pgtable can allocate pages, so we cannot hold
+ * NOTE: if not using pgtable preallocation, we cannot hold
* a lock across map/unmap which is also used in the job_run()
* path, as this can cause deadlock in job_run() vs shrinker/
* reclaim.
- *
- * Revisit this if we can come up with a scheme to pre-alloc pages
- * for the pgtable in map/unmap ops.
*/
ret = vm_map_op(vm, &(struct msm_vm_map_op){
.iova = vma->va.addr,
@@ -454,6 +451,8 @@ msm_gem_vm_bo_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
struct op_arg {
unsigned flags;
struct msm_vm_bind_job *job;
+ const struct msm_vm_bind_op *op;
+ bool kept;
};
static void
@@ -475,14 +474,18 @@ vma_from_op(struct op_arg *arg, struct drm_gpuva_op_map *op)
}
static int
-msm_gem_vm_sm_step_map(struct drm_gpuva_op *op, void *arg)
+msm_gem_vm_sm_step_map(struct drm_gpuva_op *op, void *_arg)
{
- struct msm_vm_bind_job *job = ((struct op_arg *)arg)->job;
+ struct op_arg *arg = _arg;
+ struct msm_vm_bind_job *job = arg->job;
struct drm_gem_object *obj = op->map.gem.obj;
struct drm_gpuva *vma;
struct sg_table *sgt;
unsigned prot;
+ if (arg->kept)
+ return 0;
+
vma = vma_from_op(arg, &op->map);
if (WARN_ON(IS_ERR(vma)))
return PTR_ERR(vma);
@@ -602,15 +605,41 @@ msm_gem_vm_sm_step_remap(struct drm_gpuva_op *op, void *arg)
}
static int
-msm_gem_vm_sm_step_unmap(struct drm_gpuva_op *op, void *arg)
+msm_gem_vm_sm_step_unmap(struct drm_gpuva_op *op, void *_arg)
{
- struct msm_vm_bind_job *job = ((struct op_arg *)arg)->job;
+ struct op_arg *arg = _arg;
+ struct msm_vm_bind_job *job = arg->job;
struct drm_gpuva *vma = op->unmap.va;
struct msm_gem_vma *msm_vma = to_msm_vma(vma);
vm_dbg("%p:%p:%p: %016llx %016llx", vma->vm, vma, vma->gem.obj,
vma->va.addr, vma->va.range);
+ /*
+ * Detect in-place remap. Turnip does this to change the vma flags,
+ * in particular MSM_VMA_DUMP. In this case we want to avoid actually
+ * touching the page tables, as that would require synchronization
+ * against SUBMIT jobs running on the GPU.
+ */
+ if (op->unmap.keep &&
+ (arg->op->op == MSM_VM_BIND_OP_MAP) &&
+ (vma->gem.obj == arg->op->obj) &&
+ (vma->gem.offset == arg->op->obj_offset) &&
+ (vma->va.addr == arg->op->iova) &&
+ (vma->va.range == arg->op->range)) {
+ /* We are only expecting a single in-place unmap+map cb pair: */
+ WARN_ON(arg->kept);
+
+ /* Leave the existing VMA in place, but signal that to the map cb: */
+ arg->kept = true;
+
+ /* Only flags are changing, so update that in-place: */
+ unsigned orig_flags = vma->flags & (DRM_GPUVA_USERBITS - 1);
+ vma->flags = orig_flags | arg->flags;
+
+ return 0;
+ }
+
if (!msm_vma->mapped)
goto out_close;
@@ -1271,6 +1300,7 @@ vm_bind_job_prepare(struct msm_vm_bind_job *job)
const struct msm_vm_bind_op *op = &job->ops[i];
struct op_arg arg = {
.job = job,
+ .op = op,
};
switch (op->op) {
@@ -1460,12 +1490,8 @@ msm_ioctl_vm_bind(struct drm_device *dev, void *data, struct drm_file *file)
if (args->flags & MSM_VM_BIND_FENCE_FD_OUT) {
sync_file = sync_file_create(job->fence);
- if (!sync_file) {
+ if (!sync_file)
ret = -ENOMEM;
- } else {
- fd_install(out_fence_fd, sync_file->file);
- args->fence_fd = out_fence_fd;
- }
}
if (ret)
@@ -1494,10 +1520,14 @@ out:
out_unlock:
mutex_unlock(&queue->lock);
out_post_unlock:
- if (ret && (out_fence_fd >= 0)) {
- put_unused_fd(out_fence_fd);
+ if (ret) {
+ if (out_fence_fd >= 0)
+ put_unused_fd(out_fence_fd);
if (sync_file)
fput(sync_file->file);
+ } else if (sync_file) {
+ fd_install(out_fence_fd, sync_file->file);
+ args->fence_fd = out_fence_fd;
}
if (!IS_ERR_OR_NULL(job)) {
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index c317b25a8162..26c5ce897cbb 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -465,6 +465,7 @@ static void recover_worker(struct kthread_work *work)
struct msm_gem_submit *submit;
struct msm_ringbuffer *cur_ring = gpu->funcs->active_ring(gpu);
char *comm = NULL, *cmd = NULL;
+ struct task_struct *task;
int i;
mutex_lock(&gpu->lock);
@@ -482,16 +483,20 @@ static void recover_worker(struct kthread_work *work)
/* Increment the fault counts */
submit->queue->faults++;
- if (submit->vm) {
+
+ task = get_pid_task(submit->pid, PIDTYPE_PID);
+ if (!task)
+ gpu->global_faults++;
+ else {
struct msm_gem_vm *vm = to_msm_vm(submit->vm);
vm->faults++;
/*
* If userspace has opted-in to VM_BIND (and therefore userspace
- * management of the VM), faults mark the VM as unusuable. This
+ * management of the VM), faults mark the VM as unusable. This
* matches vulkan expectations (vulkan is the main target for
- * VM_BIND)
+ * VM_BIND).
*/
if (!vm->managed)
msm_gem_vm_unusable(submit->vm);
@@ -553,8 +558,15 @@ static void recover_worker(struct kthread_work *work)
unsigned long flags;
spin_lock_irqsave(&ring->submit_lock, flags);
- list_for_each_entry(submit, &ring->submits, node)
+ list_for_each_entry(submit, &ring->submits, node) {
+ /*
+ * If the submit uses an unusable vm make sure
+ * we don't actually run it
+ */
+ if (to_msm_vm(submit->vm)->unusable)
+ submit->nr_cmds = 0;
gpu->funcs->submit(gpu, submit);
+ }
spin_unlock_irqrestore(&ring->submit_lock, flags);
}
}
diff --git a/drivers/gpu/drm/msm/msm_iommu.c b/drivers/gpu/drm/msm/msm_iommu.c
index 55c29f49b788..76cdd5ea06a0 100644
--- a/drivers/gpu/drm/msm/msm_iommu.c
+++ b/drivers/gpu/drm/msm/msm_iommu.c
@@ -14,7 +14,9 @@
struct msm_iommu {
struct msm_mmu base;
struct iommu_domain *domain;
- atomic_t pagetables;
+
+ struct mutex init_lock; /* protects pagetables counter and prr_page */
+ int pagetables;
struct page *prr_page;
struct kmem_cache *pt_cache;
@@ -227,7 +229,8 @@ static void msm_iommu_pagetable_destroy(struct msm_mmu *mmu)
* If this is the last attached pagetable for the parent,
* disable TTBR0 in the arm-smmu driver
*/
- if (atomic_dec_return(&iommu->pagetables) == 0) {
+ mutex_lock(&iommu->init_lock);
+ if (--iommu->pagetables == 0) {
adreno_smmu->set_ttbr0_cfg(adreno_smmu->cookie, NULL);
if (adreno_smmu->set_prr_bit) {
@@ -236,6 +239,7 @@ static void msm_iommu_pagetable_destroy(struct msm_mmu *mmu)
iommu->prr_page = NULL;
}
}
+ mutex_unlock(&iommu->init_lock);
free_io_pgtable_ops(pagetable->pgtbl_ops);
kfree(pagetable);
@@ -568,9 +572,12 @@ struct msm_mmu *msm_iommu_pagetable_create(struct msm_mmu *parent, bool kernel_m
* If this is the first pagetable that we've allocated, send it back to
* the arm-smmu driver as a trigger to set up TTBR0
*/
- if (atomic_inc_return(&iommu->pagetables) == 1) {
+ mutex_lock(&iommu->init_lock);
+ if (iommu->pagetables++ == 0) {
ret = adreno_smmu->set_ttbr0_cfg(adreno_smmu->cookie, &ttbr0_cfg);
if (ret) {
+ iommu->pagetables--;
+ mutex_unlock(&iommu->init_lock);
free_io_pgtable_ops(pagetable->pgtbl_ops);
kfree(pagetable);
return ERR_PTR(ret);
@@ -595,6 +602,7 @@ struct msm_mmu *msm_iommu_pagetable_create(struct msm_mmu *parent, bool kernel_m
adreno_smmu->set_prr_bit(adreno_smmu->cookie, true);
}
}
+ mutex_unlock(&iommu->init_lock);
/* Needed later for TLB flush */
pagetable->parent = parent;
@@ -730,7 +738,7 @@ struct msm_mmu *msm_iommu_new(struct device *dev, unsigned long quirks)
iommu->domain = domain;
msm_mmu_init(&iommu->base, dev, &funcs, MSM_MMU_IOMMU);
- atomic_set(&iommu->pagetables, 0);
+ mutex_init(&iommu->init_lock);
ret = iommu_attach_device(iommu->domain, dev);
if (ret) {
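
The msm_iommu change above trades the atomic pagetable counter for a mutex because crossing the first/last attach now carries side effects (TTBR0 configuration, the PRR page) that must not race; an atomic counter only serialized the count itself, not the work done on the 0->1 and 1->0 transitions, and it offered no way to roll back when that work fails. A generic sketch of the guarded first-user/last-user pattern, with hypothetical setup/teardown helpers:

    #include <linux/mutex.h>

    struct example_parent {
    	struct mutex init_lock;		/* protects users + shared state */
    	int users;
    };

    static int example_setup_shared(struct example_parent *p);	/* hypothetical */
    static void example_teardown_shared(struct example_parent *p);	/* hypothetical */

    static int example_attach(struct example_parent *p)
    {
    	int ret = 0;

    	mutex_lock(&p->init_lock);
    	if (p->users++ == 0) {
    		ret = example_setup_shared(p);
    		if (ret)
    			p->users--;	/* roll back the count on failure */
    	}
    	mutex_unlock(&p->init_lock);

    	return ret;
    }

    static void example_detach(struct example_parent *p)
    {
    	mutex_lock(&p->init_lock);
    	if (--p->users == 0)
    		example_teardown_shared(p);
    	mutex_unlock(&p->init_lock);
    }
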
diff --git a/drivers/gpu/drm/msm/msm_kms.c b/drivers/gpu/drm/msm/msm_kms.c
index 6889f1c1e721..56828d218e88 100644
--- a/drivers/gpu/drm/msm/msm_kms.c
+++ b/drivers/gpu/drm/msm/msm_kms.c
@@ -275,6 +275,12 @@ int msm_drm_kms_init(struct device *dev, const struct drm_driver *drv)
if (ret)
return ret;
+ ret = msm_disp_snapshot_init(ddev);
+ if (ret) {
+ DRM_DEV_ERROR(dev, "msm_disp_snapshot_init failed ret = %d\n", ret);
+ return ret;
+ }
+
ret = priv->kms_init(ddev);
if (ret) {
DRM_DEV_ERROR(dev, "failed to load kms\n");
@@ -327,10 +333,6 @@ int msm_drm_kms_init(struct device *dev, const struct drm_driver *drv)
goto err_msm_uninit;
}
- ret = msm_disp_snapshot_init(ddev);
- if (ret)
- DRM_DEV_ERROR(dev, "msm_disp_snapshot_init failed ret = %d\n", ret);
-
drm_mode_config_reset(ddev);
return 0;
diff --git a/drivers/gpu/drm/msm/msm_mdss.c b/drivers/gpu/drm/msm/msm_mdss.c
index 1f5fe7811e01..39885b333910 100644
--- a/drivers/gpu/drm/msm/msm_mdss.c
+++ b/drivers/gpu/drm/msm/msm_mdss.c
@@ -423,7 +423,7 @@ static struct msm_mdss *msm_mdss_init(struct platform_device *pdev, bool is_mdp5
if (IS_ERR(msm_mdss->mmio))
return ERR_CAST(msm_mdss->mmio);
- dev_dbg(&pdev->dev, "mapped mdss address space @%pK\n", msm_mdss->mmio);
+ dev_dbg(&pdev->dev, "mapped mdss address space @%p\n", msm_mdss->mmio);
ret = msm_mdss_parse_data_bus_icc_path(&pdev->dev, msm_mdss);
if (ret)
diff --git a/drivers/gpu/drm/msm/registers/adreno/a6xx.xml b/drivers/gpu/drm/msm/registers/adreno/a6xx.xml
index d860fd94feae..86fab2750ba7 100644
--- a/drivers/gpu/drm/msm/registers/adreno/a6xx.xml
+++ b/drivers/gpu/drm/msm/registers/adreno/a6xx.xml
@@ -594,10 +594,14 @@ by a particular renderpass/blit.
<reg32 offset="0x0600" name="DBGC_CFG_DBGBUS_SEL_A"/>
<reg32 offset="0x0601" name="DBGC_CFG_DBGBUS_SEL_B"/>
<reg32 offset="0x0602" name="DBGC_CFG_DBGBUS_SEL_C"/>
- <reg32 offset="0x0603" name="DBGC_CFG_DBGBUS_SEL_D">
+ <reg32 offset="0x0603" name="DBGC_CFG_DBGBUS_SEL_D" variants="A6XX">
<bitfield high="7" low="0" name="PING_INDEX"/>
<bitfield high="15" low="8" name="PING_BLK_SEL"/>
</reg32>
+ <reg32 offset="0x0603" name="DBGC_CFG_DBGBUS_SEL_D" variants="A7XX-">
+ <bitfield high="7" low="0" name="PING_INDEX"/>
+ <bitfield high="24" low="16" name="PING_BLK_SEL"/>
+ </reg32>
<reg32 offset="0x0604" name="DBGC_CFG_DBGBUS_CNTLT">
<bitfield high="5" low="0" name="TRACEEN"/>
<bitfield high="14" low="12" name="GRANU"/>
@@ -3796,6 +3800,14 @@ by a particular renderpass/blit.
<reg32 offset="0x0030" name="CFG_DBGBUS_TRACE_BUF2"/>
</domain>
+<domain name="A7XX_CX_DBGC" width="32">
+ <!-- Bitfields shifted, but otherwise the same: -->
+ <reg32 offset="0x0000" name="CFG_DBGBUS_SEL_A" variants="A7XX-">
+ <bitfield high="7" low="0" name="PING_INDEX"/>
+ <bitfield high="24" low="16" name="PING_BLK_SEL"/>
+ </reg32>
+</domain>
+
<domain name="A6XX_CX_MISC" width="32" prefix="variant" varset="chip">
<reg32 offset="0x0001" name="SYSTEM_CACHE_CNTL_0"/>
<reg32 offset="0x0002" name="SYSTEM_CACHE_CNTL_1"/>
diff --git a/drivers/gpu/drm/msm/registers/display/dsi.xml b/drivers/gpu/drm/msm/registers/display/dsi.xml
index 501ffc585a9f..c7a7b633d747 100644
--- a/drivers/gpu/drm/msm/registers/display/dsi.xml
+++ b/drivers/gpu/drm/msm/registers/display/dsi.xml
@@ -159,28 +159,28 @@ xsi:schemaLocation="https://gitlab.freedesktop.org/freedreno/ rules-fd.xsd">
<bitfield name="RGB_SWAP" low="12" high="14" type="dsi_rgb_swap"/>
</reg32>
<reg32 offset="0x00020" name="ACTIVE_H">
- <bitfield name="START" low="0" high="11" type="uint"/>
- <bitfield name="END" low="16" high="27" type="uint"/>
+ <bitfield name="START" low="0" high="15" type="uint"/>
+ <bitfield name="END" low="16" high="31" type="uint"/>
</reg32>
<reg32 offset="0x00024" name="ACTIVE_V">
- <bitfield name="START" low="0" high="11" type="uint"/>
- <bitfield name="END" low="16" high="27" type="uint"/>
+ <bitfield name="START" low="0" high="15" type="uint"/>
+ <bitfield name="END" low="16" high="31" type="uint"/>
</reg32>
<reg32 offset="0x00028" name="TOTAL">
- <bitfield name="H_TOTAL" low="0" high="11" type="uint"/>
- <bitfield name="V_TOTAL" low="16" high="27" type="uint"/>
+ <bitfield name="H_TOTAL" low="0" high="15" type="uint"/>
+ <bitfield name="V_TOTAL" low="16" high="31" type="uint"/>
</reg32>
<reg32 offset="0x0002c" name="ACTIVE_HSYNC">
- <bitfield name="START" low="0" high="11" type="uint"/>
- <bitfield name="END" low="16" high="27" type="uint"/>
+ <bitfield name="START" low="0" high="15" type="uint"/>
+ <bitfield name="END" low="16" high="31" type="uint"/>
</reg32>
<reg32 offset="0x00030" name="ACTIVE_VSYNC_HPOS">
- <bitfield name="START" low="0" high="11" type="uint"/>
- <bitfield name="END" low="16" high="27" type="uint"/>
+ <bitfield name="START" low="0" high="15" type="uint"/>
+ <bitfield name="END" low="16" high="31" type="uint"/>
</reg32>
<reg32 offset="0x00034" name="ACTIVE_VSYNC_VPOS">
- <bitfield name="START" low="0" high="11" type="uint"/>
- <bitfield name="END" low="16" high="27" type="uint"/>
+ <bitfield name="START" low="0" high="15" type="uint"/>
+ <bitfield name="END" low="16" high="31" type="uint"/>
</reg32>
<reg32 offset="0x00038" name="CMD_DMA_CTRL">
@@ -209,8 +209,8 @@ xsi:schemaLocation="https://gitlab.freedesktop.org/freedreno/ rules-fd.xsd">
<bitfield name="WORD_COUNT" low="16" high="31" type="uint"/>
</reg32>
<reg32 offset="0x00058" name="CMD_MDP_STREAM0_TOTAL">
- <bitfield name="H_TOTAL" low="0" high="11" type="uint"/>
- <bitfield name="V_TOTAL" low="16" high="27" type="uint"/>
+ <bitfield name="H_TOTAL" low="0" high="15" type="uint"/>
+ <bitfield name="V_TOTAL" low="16" high="31" type="uint"/>
</reg32>
<reg32 offset="0x0005c" name="CMD_MDP_STREAM1_CTRL">
<bitfield name="DATA_TYPE" low="0" high="5" type="uint"/>
diff --git a/drivers/gpu/drm/nouveau/dispnv50/wndw.c b/drivers/gpu/drm/nouveau/dispnv50/wndw.c
index 11d5b923d6e7..e2c55f4b9c5a 100644
--- a/drivers/gpu/drm/nouveau/dispnv50/wndw.c
+++ b/drivers/gpu/drm/nouveau/dispnv50/wndw.c
@@ -795,6 +795,10 @@ static bool nv50_plane_format_mod_supported(struct drm_plane *plane,
struct nouveau_drm *drm = nouveau_drm(plane->dev);
uint8_t i;
+ /* All chipsets can display all formats in linear layout */
+ if (modifier == DRM_FORMAT_MOD_LINEAR)
+ return true;
+
if (drm->client.device.info.chipset < 0xc0) {
const struct drm_format_info *info = drm_format_info(format);
const uint8_t kind = (modifier >> 12) & 0xff;
diff --git a/drivers/gpu/drm/nouveau/gv100_fence.c b/drivers/gpu/drm/nouveau/gv100_fence.c
index cccdeca72002..317e516c4ec7 100644
--- a/drivers/gpu/drm/nouveau/gv100_fence.c
+++ b/drivers/gpu/drm/nouveau/gv100_fence.c
@@ -18,7 +18,7 @@ gv100_fence_emit32(struct nouveau_channel *chan, u64 virtual, u32 sequence)
struct nvif_push *push = &chan->chan.push;
int ret;
- ret = PUSH_WAIT(push, 8);
+ ret = PUSH_WAIT(push, 13);
if (ret)
return ret;
@@ -32,6 +32,11 @@ gv100_fence_emit32(struct nouveau_channel *chan, u64 virtual, u32 sequence)
NVDEF(NVC36F, SEM_EXECUTE, PAYLOAD_SIZE, 32BIT) |
NVDEF(NVC36F, SEM_EXECUTE, RELEASE_TIMESTAMP, DIS));
+ PUSH_MTHD(push, NVC36F, MEM_OP_A, 0,
+ MEM_OP_B, 0,
+ MEM_OP_C, NVDEF(NVC36F, MEM_OP_C, MEMBAR_TYPE, SYS_MEMBAR),
+ MEM_OP_D, NVDEF(NVC36F, MEM_OP_D, OPERATION, MEMBAR));
+
PUSH_MTHD(push, NVC36F, NON_STALL_INTERRUPT, 0);
PUSH_KICK(push);
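
A note on the PUSH_WAIT bump above: assuming the newly emitted MEM_OP_A..D method costs one header dword plus four payload dwords, the fence emit grows from 8 to 8 + 5 = 13 dwords, which is what the new reservation asks for.
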
diff --git a/drivers/gpu/drm/nouveau/include/nvhw/class/clc36f.h b/drivers/gpu/drm/nouveau/include/nvhw/class/clc36f.h
index 8735dda4c8a7..338f74b9f501 100644
--- a/drivers/gpu/drm/nouveau/include/nvhw/class/clc36f.h
+++ b/drivers/gpu/drm/nouveau/include/nvhw/class/clc36f.h
@@ -7,6 +7,91 @@
#define NVC36F_NON_STALL_INTERRUPT (0x00000020)
#define NVC36F_NON_STALL_INTERRUPT_HANDLE 31:0
+// NOTE - MEM_OP_A and MEM_OP_B have been replaced in gp100 with methods for
+// specifying the page address for a targeted TLB invalidate and the uTLB for
+// a targeted REPLAY_CANCEL for UVM.
+// The previous MEM_OP_A/B functionality is in MEM_OP_C/D, with slightly
+// rearranged fields.
+#define NVC36F_MEM_OP_A (0x00000028)
+#define NVC36F_MEM_OP_A_TLB_INVALIDATE_CANCEL_TARGET_CLIENT_UNIT_ID 5:0 // only relevant for REPLAY_CANCEL_TARGETED
+#define NVC36F_MEM_OP_A_TLB_INVALIDATE_INVALIDATION_SIZE 5:0 // Used to specify size of invalidate, used for invalidates which are not of the REPLAY_CANCEL_TARGETED type
+#define NVC36F_MEM_OP_A_TLB_INVALIDATE_CANCEL_TARGET_GPC_ID 10:6 // only relevant for REPLAY_CANCEL_TARGETED
+#define NVC36F_MEM_OP_A_TLB_INVALIDATE_CANCEL_MMU_ENGINE_ID 6:0 // only relevant for REPLAY_CANCEL_VA_GLOBAL
+#define NVC36F_MEM_OP_A_TLB_INVALIDATE_SYSMEMBAR 11:11
+#define NVC36F_MEM_OP_A_TLB_INVALIDATE_SYSMEMBAR_EN 0x00000001
+#define NVC36F_MEM_OP_A_TLB_INVALIDATE_SYSMEMBAR_DIS 0x00000000
+#define NVC36F_MEM_OP_A_TLB_INVALIDATE_TARGET_ADDR_LO 31:12
+#define NVC36F_MEM_OP_B (0x0000002c)
+#define NVC36F_MEM_OP_B_TLB_INVALIDATE_TARGET_ADDR_HI 31:0
+#define NVC36F_MEM_OP_C (0x00000030)
+#define NVC36F_MEM_OP_C_MEMBAR_TYPE 2:0
+#define NVC36F_MEM_OP_C_MEMBAR_TYPE_SYS_MEMBAR 0x00000000
+#define NVC36F_MEM_OP_C_MEMBAR_TYPE_MEMBAR 0x00000001
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_PDB 0:0
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_PDB_ONE 0x00000000
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_PDB_ALL 0x00000001 // Probably nonsensical for MMU_TLB_INVALIDATE_TARGETED
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_GPC 1:1
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_GPC_ENABLE 0x00000000
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_GPC_DISABLE 0x00000001
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_REPLAY 4:2 // only relevant if GPC ENABLE
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_REPLAY_NONE 0x00000000
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_REPLAY_START 0x00000001
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_REPLAY_START_ACK_ALL 0x00000002
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_REPLAY_CANCEL_TARGETED 0x00000003
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_REPLAY_CANCEL_GLOBAL 0x00000004
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_REPLAY_CANCEL_VA_GLOBAL 0x00000005
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_ACK_TYPE 6:5 // only relevant if GPC ENABLE
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_ACK_TYPE_NONE 0x00000000
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_ACK_TYPE_GLOBALLY 0x00000001
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_ACK_TYPE_INTRANODE 0x00000002
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_ACCESS_TYPE 9:7 //only relevant for REPLAY_CANCEL_VA_GLOBAL
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_ACCESS_TYPE_VIRT_READ 0
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_ACCESS_TYPE_VIRT_WRITE 1
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_ACCESS_TYPE_VIRT_ATOMIC_STRONG 2
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_ACCESS_TYPE_VIRT_RSVRVD 3
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_ACCESS_TYPE_VIRT_ATOMIC_WEAK 4
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_ACCESS_TYPE_VIRT_ATOMIC_ALL 5
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_ACCESS_TYPE_VIRT_WRITE_AND_ATOMIC 6
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_ACCESS_TYPE_VIRT_ALL 7
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL 9:7 // Invalidate affects this level and all below
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_ALL 0x00000000 // Invalidate tlb caches at all levels of the page table
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_PTE_ONLY 0x00000001
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE0 0x00000002
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE1 0x00000003
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE2 0x00000004
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE3 0x00000005
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE4 0x00000006
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE5 0x00000007
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_PDB_APERTURE 11:10 // only relevant if PDB_ONE
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_PDB_APERTURE_VID_MEM 0x00000000
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_PDB_APERTURE_SYS_MEM_COHERENT 0x00000002
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_PDB_APERTURE_SYS_MEM_NONCOHERENT 0x00000003
+#define NVC36F_MEM_OP_C_TLB_INVALIDATE_PDB_ADDR_LO 31:12 // only relevant if PDB_ONE
+#define NVC36F_MEM_OP_C_ACCESS_COUNTER_CLR_TARGETED_NOTIFY_TAG 19:0
+// MEM_OP_D MUST be preceded by MEM_OPs A-C.
+#define NVC36F_MEM_OP_D (0x00000034)
+#define NVC36F_MEM_OP_D_TLB_INVALIDATE_PDB_ADDR_HI 26:0 // only relevant if PDB_ONE
+#define NVC36F_MEM_OP_D_OPERATION 31:27
+#define NVC36F_MEM_OP_D_OPERATION_MEMBAR 0x00000005
+#define NVC36F_MEM_OP_D_OPERATION_MMU_TLB_INVALIDATE 0x00000009
+#define NVC36F_MEM_OP_D_OPERATION_MMU_TLB_INVALIDATE_TARGETED 0x0000000a
+#define NVC36F_MEM_OP_D_OPERATION_L2_PEERMEM_INVALIDATE 0x0000000d
+#define NVC36F_MEM_OP_D_OPERATION_L2_SYSMEM_INVALIDATE 0x0000000e
+// CLEAN_LINES is an alias for Tegra/GPU IP usage
+#define NVC36F_MEM_OP_B_OPERATION_L2_INVALIDATE_CLEAN_LINES 0x0000000e
+#define NVC36F_MEM_OP_D_OPERATION_L2_CLEAN_COMPTAGS 0x0000000f
+#define NVC36F_MEM_OP_D_OPERATION_L2_FLUSH_DIRTY 0x00000010
+#define NVC36F_MEM_OP_D_OPERATION_L2_WAIT_FOR_SYS_PENDING_READS 0x00000015
+#define NVC36F_MEM_OP_D_OPERATION_ACCESS_COUNTER_CLR 0x00000016
+#define NVC36F_MEM_OP_D_ACCESS_COUNTER_CLR_TYPE 1:0
+#define NVC36F_MEM_OP_D_ACCESS_COUNTER_CLR_TYPE_MIMC 0x00000000
+#define NVC36F_MEM_OP_D_ACCESS_COUNTER_CLR_TYPE_MOMC 0x00000001
+#define NVC36F_MEM_OP_D_ACCESS_COUNTER_CLR_TYPE_ALL 0x00000002
+#define NVC36F_MEM_OP_D_ACCESS_COUNTER_CLR_TYPE_TARGETED 0x00000003
+#define NVC36F_MEM_OP_D_ACCESS_COUNTER_CLR_TARGETED_TYPE 2:2
+#define NVC36F_MEM_OP_D_ACCESS_COUNTER_CLR_TARGETED_TYPE_MIMC 0x00000000
+#define NVC36F_MEM_OP_D_ACCESS_COUNTER_CLR_TARGETED_TYPE_MOMC 0x00000001
+#define NVC36F_MEM_OP_D_ACCESS_COUNTER_CLR_TARGETED_BANK 6:3
#define NVC36F_SEM_ADDR_LO (0x0000005c)
#define NVC36F_SEM_ADDR_LO_OFFSET 31:2
#define NVC36F_SEM_ADDR_HI (0x00000060)
diff --git a/drivers/gpu/drm/nouveau/nouveau_display.c b/drivers/gpu/drm/nouveau/nouveau_display.c
index e1e542126310..805d0a87aa54 100644
--- a/drivers/gpu/drm/nouveau/nouveau_display.c
+++ b/drivers/gpu/drm/nouveau/nouveau_display.c
@@ -253,6 +253,7 @@ nouveau_check_bl_size(struct nouveau_drm *drm, struct nouveau_bo *nvbo,
int
nouveau_framebuffer_new(struct drm_device *dev,
+ const struct drm_format_info *info,
const struct drm_mode_fb_cmd2 *mode_cmd,
struct drm_gem_object *gem,
struct drm_framebuffer **pfb)
@@ -260,7 +261,6 @@ nouveau_framebuffer_new(struct drm_device *dev,
struct nouveau_drm *drm = nouveau_drm(dev);
struct nouveau_bo *nvbo = nouveau_gem_object(gem);
struct drm_framebuffer *fb;
- const struct drm_format_info *info;
unsigned int height, i;
uint32_t tile_mode;
uint8_t kind;
@@ -295,9 +295,6 @@ nouveau_framebuffer_new(struct drm_device *dev,
kind = nvbo->kind;
}
- info = drm_get_format_info(dev, mode_cmd->pixel_format,
- mode_cmd->modifier[0]);
-
for (i = 0; i < info->num_planes; i++) {
height = drm_format_info_plane_height(info,
mode_cmd->height,
@@ -321,7 +318,7 @@ nouveau_framebuffer_new(struct drm_device *dev,
if (!(fb = *pfb = kzalloc(sizeof(*fb), GFP_KERNEL)))
return -ENOMEM;
- drm_helper_mode_fill_fb_struct(dev, fb, NULL, mode_cmd);
+ drm_helper_mode_fill_fb_struct(dev, fb, info, mode_cmd);
fb->obj[0] = gem;
ret = drm_framebuffer_init(dev, fb, &nouveau_framebuffer_funcs);
@@ -344,7 +341,7 @@ nouveau_user_framebuffer_create(struct drm_device *dev,
if (!gem)
return ERR_PTR(-ENOENT);
- ret = nouveau_framebuffer_new(dev, mode_cmd, gem, &fb);
+ ret = nouveau_framebuffer_new(dev, info, mode_cmd, gem, &fb);
if (ret == 0)
return fb;
diff --git a/drivers/gpu/drm/nouveau/nouveau_display.h b/drivers/gpu/drm/nouveau/nouveau_display.h
index e45f211501f6..470e0910d484 100644
--- a/drivers/gpu/drm/nouveau/nouveau_display.h
+++ b/drivers/gpu/drm/nouveau/nouveau_display.h
@@ -8,8 +8,11 @@
#include <drm/drm_framebuffer.h>
+struct drm_format_info;
+
int
nouveau_framebuffer_new(struct drm_device *dev,
+ const struct drm_format_info *info,
const struct drm_mode_fb_cmd2 *mode_cmd,
struct drm_gem_object *gem,
struct drm_framebuffer **pfb);
diff --git a/drivers/gpu/drm/nouveau/nouveau_exec.c b/drivers/gpu/drm/nouveau/nouveau_exec.c
index edbbda78bac9..c4949e815eb3 100644
--- a/drivers/gpu/drm/nouveau/nouveau_exec.c
+++ b/drivers/gpu/drm/nouveau/nouveau_exec.c
@@ -60,14 +60,14 @@
* virtual address in the GPU's VA space there is no guarantee that the actual
* mappings are created in the GPU's MMU. If the given memory is swapped out
* at the time the bind operation is executed the kernel will stash the mapping
- * details into it's internal alloctor and create the actual MMU mappings once
+ * details into its internal allocator and create the actual MMU mappings once
* the memory is swapped back in. While this is transparent for userspace, it is
* guaranteed that all the backing memory is swapped back in and all the memory
* mappings, as requested by userspace previously, are actually mapped once the
* DRM_NOUVEAU_EXEC ioctl is called to submit an exec job.
*
* A VM_BIND job can be executed either synchronously or asynchronously. If
- * exectued asynchronously, userspace may provide a list of syncobjs this job
+ * executed asynchronously, userspace may provide a list of syncobjs this job
* will wait for and/or a list of syncobj the kernel will signal once the
* VM_BIND job finished execution. If executed synchronously the ioctl will
* block until the bind job is finished. For synchronous jobs the kernel will
@@ -82,7 +82,7 @@
* Since VM_BIND jobs update the GPU's VA space on job submit, EXEC jobs do have
* an up to date view of the VA space. However, the actual mappings might still
* be pending. Hence, EXEC jobs require to have the particular fences - of
- * the corresponding VM_BIND jobs they depent on - attached to them.
+ * the corresponding VM_BIND jobs they depend on - attached to them.
*/
static int
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
index 9f345a008717..869d4335c0f4 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -240,21 +240,6 @@ nouveau_fence_emit(struct nouveau_fence *fence)
return ret;
}
-void
-nouveau_fence_cancel(struct nouveau_fence *fence)
-{
- struct nouveau_fence_chan *fctx = nouveau_fctx(fence);
- unsigned long flags;
-
- spin_lock_irqsave(&fctx->lock, flags);
- if (!dma_fence_is_signaled_locked(&fence->base)) {
- dma_fence_set_error(&fence->base, -ECANCELED);
- if (nouveau_fence_signal(fence))
- nvif_event_block(&fctx->event);
- }
- spin_unlock_irqrestore(&fctx->lock, flags);
-}
-
bool
nouveau_fence_done(struct nouveau_fence *fence)
{
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.h b/drivers/gpu/drm/nouveau/nouveau_fence.h
index 9957a919bd38..183dd43ecfff 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.h
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.h
@@ -29,7 +29,6 @@ void nouveau_fence_unref(struct nouveau_fence **);
int nouveau_fence_emit(struct nouveau_fence *);
bool nouveau_fence_done(struct nouveau_fence *);
-void nouveau_fence_cancel(struct nouveau_fence *fence);
int nouveau_fence_wait(struct nouveau_fence *, bool lazy, bool intr);
int nouveau_fence_sync(struct nouveau_bo *, struct nouveau_channel *, bool exclusive, bool intr);
diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c b/drivers/gpu/drm/nouveau/nouveau_sched.c
index 0cc0bc9f9952..e60f7892f5ce 100644
--- a/drivers/gpu/drm/nouveau/nouveau_sched.c
+++ b/drivers/gpu/drm/nouveau/nouveau_sched.c
@@ -11,7 +11,6 @@
#include "nouveau_exec.h"
#include "nouveau_abi16.h"
#include "nouveau_sched.h"
-#include "nouveau_chan.h"
#define NOUVEAU_SCHED_JOB_TIMEOUT_MS 10000
@@ -122,9 +121,11 @@ nouveau_job_done(struct nouveau_job *job)
{
struct nouveau_sched *sched = job->sched;
- spin_lock(&sched->job_list.lock);
+ spin_lock(&sched->job.list.lock);
list_del(&job->entry);
- spin_unlock(&sched->job_list.lock);
+ spin_unlock(&sched->job.list.lock);
+
+ wake_up(&sched->job.wq);
}
void
@@ -305,9 +306,9 @@ nouveau_job_submit(struct nouveau_job *job)
}
/* Submit was successful; add the job to the schedulers job list. */
- spin_lock(&sched->job_list.lock);
- list_add(&job->entry, &sched->job_list.head);
- spin_unlock(&sched->job_list.lock);
+ spin_lock(&sched->job.list.lock);
+ list_add(&job->entry, &sched->job.list.head);
+ spin_unlock(&sched->job.list.lock);
drm_sched_job_arm(&job->base);
job->done_fence = dma_fence_get(&job->base.s_fence->finished);
@@ -392,23 +393,10 @@ nouveau_sched_free_job(struct drm_sched_job *sched_job)
nouveau_job_fini(job);
}
-static void
-nouveau_sched_cancel_job(struct drm_sched_job *sched_job)
-{
- struct nouveau_fence *fence;
- struct nouveau_job *job;
-
- job = to_nouveau_job(sched_job);
- fence = to_nouveau_fence(job->done_fence);
-
- nouveau_fence_cancel(fence);
-}
-
static const struct drm_sched_backend_ops nouveau_sched_ops = {
.run_job = nouveau_sched_run_job,
.timedout_job = nouveau_sched_timedout_job,
.free_job = nouveau_sched_free_job,
- .cancel_job = nouveau_sched_cancel_job,
};
static int
@@ -458,8 +446,9 @@ nouveau_sched_init(struct nouveau_sched *sched, struct nouveau_drm *drm,
goto fail_sched;
mutex_init(&sched->mutex);
- spin_lock_init(&sched->job_list.lock);
- INIT_LIST_HEAD(&sched->job_list.head);
+ spin_lock_init(&sched->job.list.lock);
+ INIT_LIST_HEAD(&sched->job.list.head);
+ init_waitqueue_head(&sched->job.wq);
return 0;
@@ -493,12 +482,16 @@ nouveau_sched_create(struct nouveau_sched **psched, struct nouveau_drm *drm,
return 0;
}
+
static void
nouveau_sched_fini(struct nouveau_sched *sched)
{
struct drm_gpu_scheduler *drm_sched = &sched->base;
struct drm_sched_entity *entity = &sched->entity;
+ rmb(); /* for list_empty to work without lock */
+ wait_event(sched->job.wq, list_empty(&sched->job.list.head));
+
drm_sched_entity_fini(entity);
drm_sched_fini(drm_sched);
diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.h b/drivers/gpu/drm/nouveau/nouveau_sched.h
index b98c3f0bef30..20cd1da8db73 100644
--- a/drivers/gpu/drm/nouveau/nouveau_sched.h
+++ b/drivers/gpu/drm/nouveau/nouveau_sched.h
@@ -103,9 +103,12 @@ struct nouveau_sched {
struct mutex mutex;
struct {
- struct list_head head;
- spinlock_t lock;
- } job_list;
+ struct {
+ struct list_head head;
+ spinlock_t lock;
+ } list;
+ struct wait_queue_head wq;
+ } job;
};
int nouveau_sched_create(struct nouveau_sched **psched, struct nouveau_drm *drm,
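The list/waitqueue split above exists so nouveau_sched_fini() can drain outstanding jobs before tearing the scheduler down. A minimal, self-contained sketch of that drain pattern (illustrative names only, assuming the wake happens after every list removal, as nouveau_job_done() does):

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

struct drain_demo {
	spinlock_t lock;
	struct list_head head;
	wait_queue_head_t wq;
};

/* Completion side: unlink the finished item, then wake a possible waiter. */
static void drain_demo_done(struct drain_demo *d, struct list_head *entry)
{
	spin_lock(&d->lock);
	list_del(entry);
	spin_unlock(&d->lock);

	wake_up(&d->wq);
}

/* Teardown side: sleep until every outstanding item has been unlinked. */
static void drain_demo_fini(struct drain_demo *d)
{
	wait_event(d->wq, list_empty(&d->head));
}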
diff --git a/drivers/gpu/drm/nouveau/nouveau_uvmm.c b/drivers/gpu/drm/nouveau/nouveau_uvmm.c
index ddfc46bc1b3e..48f105239f42 100644
--- a/drivers/gpu/drm/nouveau/nouveau_uvmm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_uvmm.c
@@ -1019,8 +1019,8 @@ bind_validate_map_sparse(struct nouveau_job *job, u64 addr, u64 range)
u64 end = addr + range;
again:
- spin_lock(&sched->job_list.lock);
- list_for_each_entry(__job, &sched->job_list.head, entry) {
+ spin_lock(&sched->job.list.lock);
+ list_for_each_entry(__job, &sched->job.list.head, entry) {
struct nouveau_uvmm_bind_job *bind_job = to_uvmm_bind_job(__job);
list_for_each_op(op, &bind_job->ops) {
@@ -1030,7 +1030,7 @@ again:
if (!(end <= op_addr || addr >= op_end)) {
nouveau_uvmm_bind_job_get(bind_job);
- spin_unlock(&sched->job_list.lock);
+ spin_unlock(&sched->job.list.lock);
wait_for_completion(&bind_job->complete);
nouveau_uvmm_bind_job_put(bind_job);
goto again;
@@ -1038,7 +1038,7 @@ again:
}
}
}
- spin_unlock(&sched->job_list.lock);
+ spin_unlock(&sched->job.list.lock);
}
static int
diff --git a/drivers/gpu/drm/nouveau/nvif/vmm.c b/drivers/gpu/drm/nouveau/nvif/vmm.c
index 99296f03371a..07c1ebc2a941 100644
--- a/drivers/gpu/drm/nouveau/nvif/vmm.c
+++ b/drivers/gpu/drm/nouveau/nvif/vmm.c
@@ -219,7 +219,8 @@ nvif_vmm_ctor(struct nvif_mmu *mmu, const char *name, s32 oclass,
case RAW: args->type = NVIF_VMM_V0_TYPE_RAW; break;
default:
WARN_ON(1);
- return -EINVAL;
+ ret = -EINVAL;
+ goto done;
}
memcpy(args->data, argv, argc);
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/base.c
index fdffa0391b31..6fd4e60634fb 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/base.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/base.c
@@ -350,6 +350,8 @@ nvkm_fifo_dtor(struct nvkm_engine *engine)
nvkm_chid_unref(&fifo->chid);
nvkm_event_fini(&fifo->nonstall.event);
+ if (fifo->func->nonstall_dtor)
+ fifo->func->nonstall_dtor(fifo);
mutex_destroy(&fifo->mutex);
if (fifo->func->dtor)
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga100.c b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga100.c
index e74493a4569e..6848a56f20c0 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga100.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga100.c
@@ -517,19 +517,11 @@ ga100_fifo_nonstall_intr(struct nvkm_inth *inth)
static void
ga100_fifo_nonstall_block(struct nvkm_event *event, int type, int index)
{
- struct nvkm_fifo *fifo = container_of(event, typeof(*fifo), nonstall.event);
- struct nvkm_runl *runl = nvkm_runl_get(fifo, index, 0);
-
- nvkm_inth_block(&runl->nonstall.inth);
}
static void
ga100_fifo_nonstall_allow(struct nvkm_event *event, int type, int index)
{
- struct nvkm_fifo *fifo = container_of(event, typeof(*fifo), nonstall.event);
- struct nvkm_runl *runl = nvkm_runl_get(fifo, index, 0);
-
- nvkm_inth_allow(&runl->nonstall.inth);
}
const struct nvkm_event_func
@@ -564,12 +556,26 @@ ga100_fifo_nonstall_ctor(struct nvkm_fifo *fifo)
if (ret)
return ret;
+ nvkm_inth_allow(&runl->nonstall.inth);
+
nr = max(nr, runl->id + 1);
}
return nr;
}
+void
+ga100_fifo_nonstall_dtor(struct nvkm_fifo *fifo)
+{
+ struct nvkm_runl *runl;
+
+ nvkm_runl_foreach(runl, fifo) {
+ if (runl->nonstall.vector < 0)
+ continue;
+ nvkm_inth_block(&runl->nonstall.inth);
+ }
+}
+
int
ga100_fifo_runl_ctor(struct nvkm_fifo *fifo)
{
@@ -599,6 +605,7 @@ ga100_fifo = {
.runl_ctor = ga100_fifo_runl_ctor,
.mmu_fault = &tu102_fifo_mmu_fault,
.nonstall_ctor = ga100_fifo_nonstall_ctor,
+ .nonstall_dtor = ga100_fifo_nonstall_dtor,
.nonstall = &ga100_fifo_nonstall,
.runl = &ga100_runl,
.runq = &ga100_runq,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga102.c b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga102.c
index 755235f55b3a..18a0b1f4eab7 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga102.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga102.c
@@ -30,6 +30,7 @@ ga102_fifo = {
.runl_ctor = ga100_fifo_runl_ctor,
.mmu_fault = &tu102_fifo_mmu_fault,
.nonstall_ctor = ga100_fifo_nonstall_ctor,
+ .nonstall_dtor = ga100_fifo_nonstall_dtor,
.nonstall = &ga100_fifo_nonstall,
.runl = &ga100_runl,
.runq = &ga100_runq,
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/priv.h b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/priv.h
index 5e81ae195329..fff1428ef267 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/priv.h
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/priv.h
@@ -41,6 +41,7 @@ struct nvkm_fifo_func {
void (*start)(struct nvkm_fifo *, unsigned long *);
int (*nonstall_ctor)(struct nvkm_fifo *);
+ void (*nonstall_dtor)(struct nvkm_fifo *);
const struct nvkm_event_func *nonstall;
const struct nvkm_runl_func *runl;
@@ -200,6 +201,7 @@ u32 tu102_chan_doorbell_handle(struct nvkm_chan *);
int ga100_fifo_runl_ctor(struct nvkm_fifo *);
int ga100_fifo_nonstall_ctor(struct nvkm_fifo *);
+void ga100_fifo_nonstall_dtor(struct nvkm_fifo *);
extern const struct nvkm_event_func ga100_fifo_nonstall;
extern const struct nvkm_runl_func ga100_runl;
extern const struct nvkm_runq_func ga100_runq;
diff --git a/drivers/gpu/drm/nouveau/nvkm/falcon/gm200.c b/drivers/gpu/drm/nouveau/nvkm/falcon/gm200.c
index b7da3ab44c27..7c43397c19e6 100644
--- a/drivers/gpu/drm/nouveau/nvkm/falcon/gm200.c
+++ b/drivers/gpu/drm/nouveau/nvkm/falcon/gm200.c
@@ -103,7 +103,7 @@ gm200_flcn_pio_imem_wr_init(struct nvkm_falcon *falcon, u8 port, bool sec, u32 i
static void
gm200_flcn_pio_imem_wr(struct nvkm_falcon *falcon, u8 port, const u8 *img, int len, u16 tag)
{
- nvkm_falcon_wr32(falcon, 0x188 + (port * 0x10), tag++);
+ nvkm_falcon_wr32(falcon, 0x188 + (port * 0x10), tag);
while (len >= 4) {
nvkm_falcon_wr32(falcon, 0x184 + (port * 0x10), *(u32 *)img);
img += 4;
@@ -249,9 +249,11 @@ int
gm200_flcn_fw_load(struct nvkm_falcon_fw *fw)
{
struct nvkm_falcon *falcon = fw->falcon;
- int target, ret;
+ int ret;
if (fw->inst) {
+ int target;
+
nvkm_falcon_mask(falcon, 0x048, 0x00000001, 0x00000001);
switch (nvkm_memory_target(fw->inst)) {
@@ -285,15 +287,6 @@ gm200_flcn_fw_load(struct nvkm_falcon_fw *fw)
}
if (fw->boot) {
- switch (nvkm_memory_target(&fw->fw.mem.memory)) {
- case NVKM_MEM_TARGET_VRAM: target = 4; break;
- case NVKM_MEM_TARGET_HOST: target = 5; break;
- case NVKM_MEM_TARGET_NCOH: target = 6; break;
- default:
- WARN_ON(1);
- return -EINVAL;
- }
-
ret = nvkm_falcon_pio_wr(falcon, fw->boot, 0, 0,
IMEM, falcon->code.limit - fw->boot_size, fw->boot_size,
fw->boot_addr >> 8, false);
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/fwsec.c b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/fwsec.c
index 52412965fac1..5b721bd9d799 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/fwsec.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/fwsec.c
@@ -209,11 +209,12 @@ nvkm_gsp_fwsec_v2(struct nvkm_gsp *gsp, const char *name,
fw->boot_addr = bld->start_tag << 8;
fw->boot_size = bld->code_size;
fw->boot = kmemdup(bl->data + hdr->data_offset + bld->code_off, fw->boot_size, GFP_KERNEL);
- if (!fw->boot)
- ret = -ENOMEM;
nvkm_firmware_put(bl);
+ if (!fw->boot)
+ return -ENOMEM;
+
/* Patch in interface data. */
return nvkm_gsp_fwsec_patch(gsp, fw, desc->InterfaceOffset, init_cmd);
}
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/fifo.c b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/fifo.c
index 1ac5628c5140..4ed54b386a60 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/fifo.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/fifo.c
@@ -601,6 +601,7 @@ r535_fifo_new(const struct nvkm_fifo_func *hw, struct nvkm_device *device,
rm->chan.func = &r535_chan;
rm->nonstall = &ga100_fifo_nonstall;
rm->nonstall_ctor = ga100_fifo_nonstall_ctor;
+ rm->nonstall_dtor = ga100_fifo_nonstall_dtor;
return nvkm_fifo_new_(rm, device, type, inst, pfifo);
}
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/rpc.c b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/rpc.c
index 9d06ff722fea..0dc4782df8c0 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/rpc.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/rpc.c
@@ -325,7 +325,7 @@ r535_gsp_msgq_recv(struct nvkm_gsp *gsp, u32 gsp_rpc_len, int *retries)
rpc = r535_gsp_msgq_peek(gsp, sizeof(*rpc), info.retries);
if (IS_ERR_OR_NULL(rpc)) {
- kfree(buf);
+ kvfree(buf);
return rpc;
}
@@ -334,7 +334,7 @@ r535_gsp_msgq_recv(struct nvkm_gsp *gsp, u32 gsp_rpc_len, int *retries)
rpc = r535_gsp_msgq_recv_one_elem(gsp, &info);
if (IS_ERR_OR_NULL(rpc)) {
- kfree(buf);
+ kvfree(buf);
return rpc;
}
diff --git a/drivers/gpu/drm/nova/file.rs b/drivers/gpu/drm/nova/file.rs
index 7e59a34b830d..4fe62cf98a23 100644
--- a/drivers/gpu/drm/nova/file.rs
+++ b/drivers/gpu/drm/nova/file.rs
@@ -39,7 +39,8 @@ impl File {
_ => return Err(EINVAL),
};
- getparam.set_value(value);
+ #[allow(clippy::useless_conversion)]
+ getparam.set_value(value.into());
Ok(0)
}
diff --git a/drivers/gpu/drm/omapdrm/omap_fb.c b/drivers/gpu/drm/omapdrm/omap_fb.c
index 30c81e2e5d6b..bb3105556f19 100644
--- a/drivers/gpu/drm/omapdrm/omap_fb.c
+++ b/drivers/gpu/drm/omapdrm/omap_fb.c
@@ -351,7 +351,7 @@ struct drm_framebuffer *omap_framebuffer_create(struct drm_device *dev,
}
}
- fb = omap_framebuffer_init(dev, mode_cmd, bos);
+ fb = omap_framebuffer_init(dev, info, mode_cmd, bos);
if (IS_ERR(fb))
goto error;
@@ -365,9 +365,9 @@ error:
}
struct drm_framebuffer *omap_framebuffer_init(struct drm_device *dev,
+ const struct drm_format_info *info,
const struct drm_mode_fb_cmd2 *mode_cmd, struct drm_gem_object **bos)
{
- const struct drm_format_info *format = NULL;
struct omap_framebuffer *omap_fb = NULL;
struct drm_framebuffer *fb = NULL;
unsigned int pitch = mode_cmd->pitches[0];
@@ -377,15 +377,12 @@ struct drm_framebuffer *omap_framebuffer_init(struct drm_device *dev,
dev, mode_cmd, mode_cmd->width, mode_cmd->height,
(char *)&mode_cmd->pixel_format);
- format = drm_get_format_info(dev, mode_cmd->pixel_format,
- mode_cmd->modifier[0]);
-
for (i = 0; i < ARRAY_SIZE(formats); i++) {
if (formats[i] == mode_cmd->pixel_format)
break;
}
- if (!format || i == ARRAY_SIZE(formats)) {
+ if (i == ARRAY_SIZE(formats)) {
dev_dbg(dev->dev, "unsupported pixel format: %4.4s\n",
(char *)&mode_cmd->pixel_format);
ret = -EINVAL;
@@ -399,7 +396,7 @@ struct drm_framebuffer *omap_framebuffer_init(struct drm_device *dev,
}
fb = &omap_fb->base;
- omap_fb->format = format;
+ omap_fb->format = info;
mutex_init(&omap_fb->lock);
/*
@@ -407,23 +404,23 @@ struct drm_framebuffer *omap_framebuffer_init(struct drm_device *dev,
* that the two planes of multiplane formats need the same number of
* bytes per pixel.
*/
- if (format->num_planes == 2 && pitch != mode_cmd->pitches[1]) {
+ if (info->num_planes == 2 && pitch != mode_cmd->pitches[1]) {
dev_dbg(dev->dev, "pitches differ between planes 0 and 1\n");
ret = -EINVAL;
goto fail;
}
- if (pitch % format->cpp[0]) {
+ if (pitch % info->cpp[0]) {
dev_dbg(dev->dev,
"buffer pitch (%u bytes) is not a multiple of pixel size (%u bytes)\n",
- pitch, format->cpp[0]);
+ pitch, info->cpp[0]);
ret = -EINVAL;
goto fail;
}
- for (i = 0; i < format->num_planes; i++) {
+ for (i = 0; i < info->num_planes; i++) {
struct plane *plane = &omap_fb->planes[i];
- unsigned int vsub = i == 0 ? 1 : format->vsub;
+ unsigned int vsub = i == 0 ? 1 : info->vsub;
unsigned int size;
size = pitch * mode_cmd->height / vsub;
@@ -440,7 +437,7 @@ struct drm_framebuffer *omap_framebuffer_init(struct drm_device *dev,
plane->dma_addr = 0;
}
- drm_helper_mode_fill_fb_struct(dev, fb, NULL, mode_cmd);
+ drm_helper_mode_fill_fb_struct(dev, fb, info, mode_cmd);
ret = drm_framebuffer_init(dev, fb, &omap_framebuffer_funcs);
if (ret) {
diff --git a/drivers/gpu/drm/omapdrm/omap_fb.h b/drivers/gpu/drm/omapdrm/omap_fb.h
index 0873f953cf1d..e6010302a22b 100644
--- a/drivers/gpu/drm/omapdrm/omap_fb.h
+++ b/drivers/gpu/drm/omapdrm/omap_fb.h
@@ -13,6 +13,7 @@ struct drm_connector;
struct drm_device;
struct drm_file;
struct drm_framebuffer;
+struct drm_format_info;
struct drm_gem_object;
struct drm_mode_fb_cmd2;
struct drm_plane_state;
@@ -23,6 +24,7 @@ struct drm_framebuffer *omap_framebuffer_create(struct drm_device *dev,
struct drm_file *file, const struct drm_format_info *info,
const struct drm_mode_fb_cmd2 *mode_cmd);
struct drm_framebuffer *omap_framebuffer_init(struct drm_device *dev,
+ const struct drm_format_info *info,
const struct drm_mode_fb_cmd2 *mode_cmd, struct drm_gem_object **bos);
int omap_framebuffer_pin(struct drm_framebuffer *fb);
void omap_framebuffer_unpin(struct drm_framebuffer *fb);
diff --git a/drivers/gpu/drm/omapdrm/omap_fbdev.c b/drivers/gpu/drm/omapdrm/omap_fbdev.c
index 7b6396890681..948af7ec1130 100644
--- a/drivers/gpu/drm/omapdrm/omap_fbdev.c
+++ b/drivers/gpu/drm/omapdrm/omap_fbdev.c
@@ -197,7 +197,10 @@ int omap_fbdev_driver_fbdev_probe(struct drm_fb_helper *helper,
goto fail;
}
- fb = omap_framebuffer_init(dev, &mode_cmd, &bo);
+ fb = omap_framebuffer_init(dev,
+ drm_get_format_info(dev, mode_cmd.pixel_format,
+ mode_cmd.modifier[0]),
+ &mode_cmd, &bo);
if (IS_ERR(fb)) {
dev_err(dev->dev, "failed to allocate fb\n");
/* note: if fb creation failed, we can't rely on fb destroy
diff --git a/drivers/gpu/drm/panfrost/panfrost_gem.c b/drivers/gpu/drm/panfrost/panfrost_gem.c
index bb73f2a68a12..85d6289a6eda 100644
--- a/drivers/gpu/drm/panfrost/panfrost_gem.c
+++ b/drivers/gpu/drm/panfrost/panfrost_gem.c
@@ -432,7 +432,7 @@ static void panfrost_gem_debugfs_bo_print(struct panfrost_gem_object *bo,
if (!refcount)
return;
- resident_size = bo->base.pages ? bo->base.base.size : 0;
+ resident_size = panfrost_gem_rss(&bo->base.base);
snprintf(creator_info, sizeof(creator_info),
"%s/%d", bo->debugfs.creator.process_name, bo->debugfs.creator.tgid);
diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c
index 1116f2d2826e..4d8e9b34702a 100644
--- a/drivers/gpu/drm/panthor/panthor_drv.c
+++ b/drivers/gpu/drm/panthor/panthor_drv.c
@@ -1094,7 +1094,7 @@ static int panthor_ioctl_group_create(struct drm_device *ddev, void *data,
struct drm_panthor_queue_create *queue_args;
int ret;
- if (!args->queues.count)
+ if (!args->queues.count || args->queues.count > MAX_CS_PER_CSG)
return -EINVAL;
ret = PANTHOR_UOBJ_GET_ARRAY(queue_args, &args->queues);
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index 8f17394cc82a..df76653e649a 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -886,8 +886,7 @@ static void group_free_queue(struct panthor_group *group, struct panthor_queue *
if (IS_ERR_OR_NULL(queue))
return;
- if (queue->entity.fence_context)
- drm_sched_entity_destroy(&queue->entity);
+ drm_sched_entity_destroy(&queue->entity);
if (queue->scheduler.ops)
drm_sched_fini(&queue->scheduler);
@@ -3558,11 +3557,6 @@ int panthor_group_destroy(struct panthor_file *pfile, u32 group_handle)
if (!group)
return -EINVAL;
- for (u32 i = 0; i < group->queue_count; i++) {
- if (group->queues[i])
- drm_sched_entity_destroy(&group->queues[i]->entity);
- }
-
mutex_lock(&sched->reset.lock);
mutex_lock(&sched->lock);
group->destroyed = true;
diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c
index b4bf5dfeea2d..4dc77c398617 100644
--- a/drivers/gpu/drm/radeon/radeon_display.c
+++ b/drivers/gpu/drm/radeon/radeon_display.c
@@ -1297,12 +1297,13 @@ static const struct drm_framebuffer_funcs radeon_fb_funcs = {
int
radeon_framebuffer_init(struct drm_device *dev,
struct drm_framebuffer *fb,
+ const struct drm_format_info *info,
const struct drm_mode_fb_cmd2 *mode_cmd,
struct drm_gem_object *obj)
{
int ret;
fb->obj[0] = obj;
- drm_helper_mode_fill_fb_struct(dev, fb, NULL, mode_cmd);
+ drm_helper_mode_fill_fb_struct(dev, fb, info, mode_cmd);
ret = drm_framebuffer_init(dev, fb, &radeon_fb_funcs);
if (ret) {
fb->obj[0] = NULL;
@@ -1341,7 +1342,7 @@ radeon_user_framebuffer_create(struct drm_device *dev,
return ERR_PTR(-ENOMEM);
}
- ret = radeon_framebuffer_init(dev, fb, mode_cmd, obj);
+ ret = radeon_framebuffer_init(dev, fb, info, mode_cmd, obj);
if (ret) {
kfree(fb);
drm_gem_object_put(obj);
diff --git a/drivers/gpu/drm/radeon/radeon_fbdev.c b/drivers/gpu/drm/radeon/radeon_fbdev.c
index e3a481bbee7b..dc81b0c2dbff 100644
--- a/drivers/gpu/drm/radeon/radeon_fbdev.c
+++ b/drivers/gpu/drm/radeon/radeon_fbdev.c
@@ -53,10 +53,10 @@ static void radeon_fbdev_destroy_pinned_object(struct drm_gem_object *gobj)
}
static int radeon_fbdev_create_pinned_object(struct drm_fb_helper *fb_helper,
+ const struct drm_format_info *info,
struct drm_mode_fb_cmd2 *mode_cmd,
struct drm_gem_object **gobj_p)
{
- const struct drm_format_info *info;
struct radeon_device *rdev = fb_helper->dev->dev_private;
struct drm_gem_object *gobj = NULL;
struct radeon_bo *rbo = NULL;
@@ -67,8 +67,6 @@ static int radeon_fbdev_create_pinned_object(struct drm_fb_helper *fb_helper,
int height = mode_cmd->height;
u32 cpp;
- info = drm_get_format_info(rdev_to_drm(rdev), mode_cmd->pixel_format,
- mode_cmd->modifier[0]);
cpp = info->cpp[0];
/* need to align pitch with crtc limits */
@@ -206,6 +204,7 @@ int radeon_fbdev_driver_fbdev_probe(struct drm_fb_helper *fb_helper,
struct drm_fb_helper_surface_size *sizes)
{
struct radeon_device *rdev = fb_helper->dev->dev_private;
+ const struct drm_format_info *format_info;
struct drm_mode_fb_cmd2 mode_cmd = { };
struct fb_info *info;
struct drm_gem_object *gobj;
@@ -224,7 +223,9 @@ int radeon_fbdev_driver_fbdev_probe(struct drm_fb_helper *fb_helper,
mode_cmd.pixel_format = drm_mode_legacy_fb_format(sizes->surface_bpp,
sizes->surface_depth);
- ret = radeon_fbdev_create_pinned_object(fb_helper, &mode_cmd, &gobj);
+ format_info = drm_get_format_info(rdev_to_drm(rdev), mode_cmd.pixel_format,
+ mode_cmd.modifier[0]);
+ ret = radeon_fbdev_create_pinned_object(fb_helper, format_info, &mode_cmd, &gobj);
if (ret) {
DRM_ERROR("failed to create fbcon object %d\n", ret);
return ret;
@@ -236,7 +237,7 @@ int radeon_fbdev_driver_fbdev_probe(struct drm_fb_helper *fb_helper,
ret = -ENOMEM;
goto err_radeon_fbdev_destroy_pinned_object;
}
- ret = radeon_framebuffer_init(rdev_to_drm(rdev), fb, &mode_cmd, gobj);
+ ret = radeon_framebuffer_init(rdev_to_drm(rdev), fb, format_info, &mode_cmd, gobj);
if (ret) {
DRM_ERROR("failed to initialize framebuffer %d\n", ret);
goto err_kfree;
diff --git a/drivers/gpu/drm/radeon/radeon_mode.h b/drivers/gpu/drm/radeon/radeon_mode.h
index 3102f6c2d055..9e34da2cacef 100644
--- a/drivers/gpu/drm/radeon/radeon_mode.h
+++ b/drivers/gpu/drm/radeon/radeon_mode.h
@@ -40,6 +40,7 @@
struct drm_fb_helper;
struct drm_fb_helper_surface_size;
+struct drm_format_info;
struct edid;
struct drm_edid;
@@ -890,6 +891,7 @@ extern void
radeon_combios_encoder_dpms_scratch_regs(struct drm_encoder *encoder, bool on);
int radeon_framebuffer_init(struct drm_device *dev,
struct drm_framebuffer *rfb,
+ const struct drm_format_info *info,
const struct drm_mode_fb_cmd2 *mode_cmd,
struct drm_gem_object *obj);
diff --git a/drivers/gpu/drm/rockchip/Kconfig b/drivers/gpu/drm/rockchip/Kconfig
index ab525668939a..faf50d872be3 100644
--- a/drivers/gpu/drm/rockchip/Kconfig
+++ b/drivers/gpu/drm/rockchip/Kconfig
@@ -53,6 +53,7 @@ config ROCKCHIP_CDN_DP
bool "Rockchip cdn DP"
depends on EXTCON=y || (EXTCON=m && DRM_ROCKCHIP=m)
select DRM_DISPLAY_HELPER
+ select DRM_BRIDGE_CONNECTOR
select DRM_DISPLAY_DP_HELPER
help
This selects support for Rockchip SoC specific extensions
diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_vop2.c b/drivers/gpu/drm/rockchip/rockchip_drm_vop2.c
index 186f6452a7d3..b50927a824b4 100644
--- a/drivers/gpu/drm/rockchip/rockchip_drm_vop2.c
+++ b/drivers/gpu/drm/rockchip/rockchip_drm_vop2.c
@@ -2579,12 +2579,13 @@ static int vop2_win_init(struct vop2 *vop2)
}
/*
- * The window registers are only updated when config done is written.
- * Until that they read back the old value. As we read-modify-write
- * these registers mark them as non-volatile. This makes sure we read
- * the new values from the regmap register cache.
+ * The window and video port registers are only updated when config
+ * done is written. Until then, they read back the old value. As we
+ * read-modify-write these registers, mark them as non-volatile. This
+ * makes sure we read the new values from the regmap register cache.
*/
static const struct regmap_range vop2_nonvolatile_range[] = {
+ regmap_reg_range(RK3568_VP0_CTRL_BASE, RK3588_VP3_CTRL_BASE + 255),
regmap_reg_range(0x1000, 0x23ff),
};
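For context, a sketch of how a non-volatile range list like this is typically wired into a cached regmap. The config below is an illustrative assumption, not the vop2 driver's actual setup:

#include <linux/kernel.h>
#include <linux/regmap.h>

static const struct regmap_range demo_nonvolatile_range[] = {
	regmap_reg_range(0x1000, 0x23ff),
};

/* Ranges listed as "no" in the volatile table are treated as non-volatile,
 * so reads after a read-modify-write come from the regmap cache. */
static const struct regmap_access_table demo_volatile_table = {
	.no_ranges = demo_nonvolatile_range,
	.n_no_ranges = ARRAY_SIZE(demo_nonvolatile_range),
};

static const struct regmap_config demo_regmap_config = {
	.reg_bits = 32,
	.val_bits = 32,
	.reg_stride = 4,
	.cache_type = REGCACHE_MAPLE,
	.volatile_table = &demo_volatile_table,
};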
diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
index 8867b95ab089..3d06f72531ba 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -391,7 +391,8 @@ EXPORT_SYMBOL(drm_sched_entity_set_priority);
* Add a callback to the current dependency of the entity to wake up the
* scheduler when the entity becomes available.
*/
-static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity)
+static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity,
+ struct drm_sched_job *sched_job)
{
struct drm_gpu_scheduler *sched = entity->rq->sched;
struct dma_fence *fence = entity->dependency;
@@ -421,6 +422,10 @@ static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity)
entity->dependency = fence;
}
+ if (trace_drm_sched_job_unschedulable_enabled() &&
+ !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &entity->dependency->flags))
+ trace_drm_sched_job_unschedulable(sched_job, entity->dependency);
+
if (!dma_fence_add_callback(entity->dependency, &entity->cb,
drm_sched_entity_wakeup))
return true;
@@ -461,10 +466,8 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
while ((entity->dependency =
drm_sched_job_dependency(sched_job, entity))) {
- if (drm_sched_entity_add_dependency_cb(entity)) {
- trace_drm_sched_job_unschedulable(sched_job, entity->dependency);
+ if (drm_sched_entity_add_dependency_cb(entity, sched_job))
return NULL;
- }
}
/* skip jobs from entity that marked guilty */
diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c
index 41a285ec889f..8ede07fb7a21 100644
--- a/drivers/gpu/drm/tegra/gem.c
+++ b/drivers/gpu/drm/tegra/gem.c
@@ -526,7 +526,7 @@ void tegra_bo_free_object(struct drm_gem_object *gem)
if (drm_gem_is_imported(gem)) {
dma_buf_unmap_attachment_unlocked(gem->import_attach, bo->sgt,
DMA_TO_DEVICE);
- dma_buf_detach(gem->dma_buf, gem->import_attach);
+ dma_buf_detach(gem->import_attach->dmabuf, gem->import_attach);
}
}
diff --git a/drivers/gpu/drm/tests/drm_format_helper_test.c b/drivers/gpu/drm/tests/drm_format_helper_test.c
index 7299fa8971ce..981dada8f3a8 100644
--- a/drivers/gpu/drm/tests/drm_format_helper_test.c
+++ b/drivers/gpu/drm/tests/drm_format_helper_test.c
@@ -1033,13 +1033,14 @@ static void drm_test_fb_xrgb8888_to_xrgb2101010(struct kunit *test)
NULL : &result->dst_pitch;
drm_fb_xrgb8888_to_xrgb2101010(&dst, dst_pitch, &src, &fb, &params->clip, &fmtcnv_state);
- buf = le32buf_to_cpu(test, buf, dst_size / sizeof(u32));
+ buf = le32buf_to_cpu(test, (__force const __le32 *)buf, dst_size / sizeof(u32));
KUNIT_EXPECT_MEMEQ(test, buf, result->expected, dst_size);
buf = dst.vaddr; /* restore original value of buf */
memset(buf, 0, dst_size);
drm_fb_xrgb8888_to_xrgb2101010(&dst, dst_pitch, &src, &fb, &params->clip, &fmtcnv_state);
+ buf = le32buf_to_cpu(test, (__force const __le32 *)buf, dst_size / sizeof(u32));
KUNIT_EXPECT_MEMEQ(test, buf, result->expected, dst_size);
}
diff --git a/drivers/gpu/drm/xe/abi/guc_actions_abi.h b/drivers/gpu/drm/xe/abi/guc_actions_abi.h
index 81eb046aeebf..b9f67d7a00d8 100644
--- a/drivers/gpu/drm/xe/abi/guc_actions_abi.h
+++ b/drivers/gpu/drm/xe/abi/guc_actions_abi.h
@@ -117,6 +117,7 @@ enum xe_guc_action {
XE_GUC_ACTION_ENTER_S_STATE = 0x501,
XE_GUC_ACTION_EXIT_S_STATE = 0x502,
XE_GUC_ACTION_GLOBAL_SCHED_POLICY_CHANGE = 0x506,
+ XE_GUC_ACTION_UPDATE_SCHEDULING_POLICIES_KLV = 0x509,
XE_GUC_ACTION_SCHED_CONTEXT = 0x1000,
XE_GUC_ACTION_SCHED_CONTEXT_MODE_SET = 0x1001,
XE_GUC_ACTION_SCHED_CONTEXT_MODE_DONE = 0x1002,
diff --git a/drivers/gpu/drm/xe/abi/guc_klvs_abi.h b/drivers/gpu/drm/xe/abi/guc_klvs_abi.h
index 0366a9da5977..d7719d0e36ca 100644
--- a/drivers/gpu/drm/xe/abi/guc_klvs_abi.h
+++ b/drivers/gpu/drm/xe/abi/guc_klvs_abi.h
@@ -17,6 +17,7 @@
* | 0 | 31:16 | **KEY** - KLV key identifier |
* | | | - `GuC Self Config KLVs`_ |
* | | | - `GuC Opt In Feature KLVs`_ |
+ * | | | - `GuC Scheduling Policies KLVs`_ |
* | | | - `GuC VGT Policy KLVs`_ |
* | | | - `GuC VF Configuration KLVs`_ |
* | | | |
@@ -153,6 +154,30 @@ enum {
#define GUC_KLV_OPT_IN_FEATURE_DYNAMIC_INHIBIT_CONTEXT_SWITCH_LEN 0u
/**
+ * DOC: GuC Scheduling Policies KLVs
+ *
+ * `GuC KLV`_ keys available for use with UPDATE_SCHEDULING_POLICIES_KLV.
+ *
+ * _`GUC_KLV_SCHEDULING_POLICIES_RENDER_COMPUTE_YIELD` : 0x1001
+ * Some platforms do not allow concurrent execution of RCS and CCS
+ * workloads from different address spaces. By default, the GuC prioritizes
+ * RCS submissions over CCS ones, which can lead to CCS workloads being
+ * significantly (or completely) starved of execution time. This KLV allows
+ * the driver to specify a quantum (in ms) and a ratio (percentage value
+ * between 0 and 100), and the GuC will prioritize the CCS for that
+ * percentage of each quantum. For example, specifying 100ms and 30% will
+ * make the GuC prioritize the CCS for 30ms of every 100ms.
+ * Note that this does not necessarily mean that RCS and CCS engines will
+ * only be active for their percentage of the quantum, as the restriction
+ * only kicks in if both classes are fully busy with non-compatible address
+ * spaces; i.e., if one engine is idle or running the same address space,
+ * a pending job on the other engine will still be submitted to the HW no
+ * matter what the ratio is.
+ */
+#define GUC_KLV_SCHEDULING_POLICIES_RENDER_COMPUTE_YIELD_KEY 0x1001
+#define GUC_KLV_SCHEDULING_POLICIES_RENDER_COMPUTE_YIELD_LEN 2u
+
+/**
* DOC: GuC VGT Policy KLVs
*
* `GuC KLV`_ keys available for use with PF2GUC_UPDATE_VGT_POLICY.
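To make the new KLV concrete, here is the H2G buffer one would expect for the driver's 80 ms / 20 % setting, assuming the generic KLV header layout documented earlier in this file (key in bits 31:16, data-dword count in bits 15:0):

/* Illustrative dwords only; the driver builds this via PREP_GUC_KLV_TAG(). */
static const u32 msg[] = {
	0x509,               /* XE_GUC_ACTION_UPDATE_SCHEDULING_POLICIES_KLV */
	(0x1001 << 16) | 2,  /* RENDER_COMPUTE_YIELD key, two data dwords */
	80,                  /* quantum in ms */
	20,                  /* ratio in percent: CCS prioritized 16 ms of every 80 ms */
};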
diff --git a/drivers/gpu/drm/xe/regs/xe_bars.h b/drivers/gpu/drm/xe/regs/xe_bars.h
index ce05b6ae832f..880140d6ccdc 100644
--- a/drivers/gpu/drm/xe/regs/xe_bars.h
+++ b/drivers/gpu/drm/xe/regs/xe_bars.h
@@ -7,5 +7,6 @@
#define GTTMMADR_BAR 0 /* MMIO + GTT */
#define LMEM_BAR 2 /* VRAM */
+#define VF_LMEM_BAR 9 /* VF VRAM */
#endif
diff --git a/drivers/gpu/drm/xe/tests/xe_bo.c b/drivers/gpu/drm/xe/tests/xe_bo.c
index bb469096d072..7b40cc8be1c9 100644
--- a/drivers/gpu/drm/xe/tests/xe_bo.c
+++ b/drivers/gpu/drm/xe/tests/xe_bo.c
@@ -236,7 +236,7 @@ static int evict_test_run_tile(struct xe_device *xe, struct xe_tile *tile, struc
}
xe_bo_lock(external, false);
- err = xe_bo_pin_external(external);
+ err = xe_bo_pin_external(external, false);
xe_bo_unlock(external);
if (err) {
KUNIT_FAIL(test, "external bo pin err=%pe\n",
diff --git a/drivers/gpu/drm/xe/tests/xe_dma_buf.c b/drivers/gpu/drm/xe/tests/xe_dma_buf.c
index c53f67ce4b0a..121f17c112ec 100644
--- a/drivers/gpu/drm/xe/tests/xe_dma_buf.c
+++ b/drivers/gpu/drm/xe/tests/xe_dma_buf.c
@@ -89,15 +89,7 @@ static void check_residency(struct kunit *test, struct xe_bo *exported,
return;
}
- /*
- * If on different devices, the exporter is kept in system if
- * possible, saving a migration step as the transfer is just
- * likely as fast from system memory.
- */
- if (params->mem_mask & XE_BO_FLAG_SYSTEM)
- KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(exported, XE_PL_TT));
- else
- KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(exported, mem_type));
+ KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(exported, mem_type));
if (params->force_different_devices)
KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(imported, XE_PL_TT));
diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
index 18f27da47a36..bae7ff2e5927 100644
--- a/drivers/gpu/drm/xe/xe_bo.c
+++ b/drivers/gpu/drm/xe/xe_bo.c
@@ -186,6 +186,8 @@ static void try_add_system(struct xe_device *xe, struct xe_bo *bo,
bo->placements[*c] = (struct ttm_place) {
.mem_type = XE_PL_TT,
+ .flags = (bo_flags & XE_BO_FLAG_VRAM_MASK) ?
+ TTM_PL_FLAG_FALLBACK : 0,
};
*c += 1;
}
@@ -812,14 +814,14 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
}
if (ttm_bo->type == ttm_bo_type_sg) {
- ret = xe_bo_move_notify(bo, ctx);
+ if (new_mem->mem_type == XE_PL_SYSTEM)
+ ret = xe_bo_move_notify(bo, ctx);
if (!ret)
ret = xe_bo_move_dmabuf(ttm_bo, new_mem);
return ret;
}
- tt_has_data = ttm && (ttm_tt_is_populated(ttm) ||
- (ttm->page_flags & TTM_TT_FLAG_SWAPPED));
+ tt_has_data = ttm && (ttm_tt_is_populated(ttm) || ttm_tt_is_swapped(ttm));
move_lacks_source = !old_mem || (handle_system_ccs ? (!bo->ccs_cleared) :
(!mem_type_is_vram(old_mem_type) && !tt_has_data));
@@ -2269,6 +2271,7 @@ uint64_t vram_region_gpu_offset(struct ttm_resource *res)
/**
* xe_bo_pin_external - pin an external BO
* @bo: buffer object to be pinned
+ * @in_place: Pin in current placement, don't attempt to migrate.
*
* Pin an external (not tied to a VM, can be exported via dma-buf / prime FD)
* BO. Unique call compared to xe_bo_pin as this function has it own set of
@@ -2276,7 +2279,7 @@ uint64_t vram_region_gpu_offset(struct ttm_resource *res)
*
* Returns 0 for success, negative error code otherwise.
*/
-int xe_bo_pin_external(struct xe_bo *bo)
+int xe_bo_pin_external(struct xe_bo *bo, bool in_place)
{
struct xe_device *xe = xe_bo_device(bo);
int err;
@@ -2285,9 +2288,11 @@ int xe_bo_pin_external(struct xe_bo *bo)
xe_assert(xe, xe_bo_is_user(bo));
if (!xe_bo_is_pinned(bo)) {
- err = xe_bo_validate(bo, NULL, false);
- if (err)
- return err;
+ if (!in_place) {
+ err = xe_bo_validate(bo, NULL, false);
+ if (err)
+ return err;
+ }
spin_lock(&xe->pinned.lock);
list_add_tail(&bo->pinned_link, &xe->pinned.late.external);
@@ -2438,9 +2443,11 @@ int xe_bo_validate(struct xe_bo *bo, struct xe_vm *vm, bool allow_res_evict)
.no_wait_gpu = false,
.gfp_retry_mayfail = true,
};
- struct pin_cookie cookie;
int ret;
+ if (xe_bo_is_pinned(bo))
+ return 0;
+
if (vm) {
lockdep_assert_held(&vm->lock);
xe_vm_assert_held(vm);
@@ -2449,10 +2456,10 @@ int xe_bo_validate(struct xe_bo *bo, struct xe_vm *vm, bool allow_res_evict)
ctx.resv = xe_vm_resv(vm);
}
- cookie = xe_vm_set_validating(vm, allow_res_evict);
+ xe_vm_set_validating(vm, allow_res_evict);
trace_xe_bo_validate(bo);
ret = ttm_bo_validate(&bo->ttm, &bo->placement, &ctx);
- xe_vm_clear_validating(vm, allow_res_evict, cookie);
+ xe_vm_clear_validating(vm, allow_res_evict);
return ret;
}
diff --git a/drivers/gpu/drm/xe/xe_bo.h b/drivers/gpu/drm/xe/xe_bo.h
index 02e8cde4c6b2..9ce94d252015 100644
--- a/drivers/gpu/drm/xe/xe_bo.h
+++ b/drivers/gpu/drm/xe/xe_bo.h
@@ -198,7 +198,7 @@ static inline void xe_bo_unlock_vm_held(struct xe_bo *bo)
}
}
-int xe_bo_pin_external(struct xe_bo *bo);
+int xe_bo_pin_external(struct xe_bo *bo, bool in_place);
int xe_bo_pin(struct xe_bo *bo);
void xe_bo_unpin_external(struct xe_bo *bo);
void xe_bo_unpin(struct xe_bo *bo);
diff --git a/drivers/gpu/drm/xe/xe_bo_evict.c b/drivers/gpu/drm/xe/xe_bo_evict.c
index 7484ce55a303..d5dbc51e8612 100644
--- a/drivers/gpu/drm/xe/xe_bo_evict.c
+++ b/drivers/gpu/drm/xe/xe_bo_evict.c
@@ -158,8 +158,8 @@ int xe_bo_evict_all(struct xe_device *xe)
if (ret)
return ret;
- ret = xe_bo_apply_to_pinned(xe, &xe->pinned.late.kernel_bo_present,
- &xe->pinned.late.evicted, xe_bo_evict_pinned);
+ ret = xe_bo_apply_to_pinned(xe, &xe->pinned.late.external,
+ &xe->pinned.late.external, xe_bo_evict_pinned);
if (!ret)
ret = xe_bo_apply_to_pinned(xe, &xe->pinned.late.kernel_bo_present,
diff --git a/drivers/gpu/drm/xe/xe_configfs.c b/drivers/gpu/drm/xe/xe_configfs.c
index e9b46a2d0019..58c1f397c68c 100644
--- a/drivers/gpu/drm/xe/xe_configfs.c
+++ b/drivers/gpu/drm/xe/xe_configfs.c
@@ -404,7 +404,7 @@ int __init xe_configfs_init(void)
return 0;
}
-void __exit xe_configfs_exit(void)
+void xe_configfs_exit(void)
{
configfs_unregister_subsystem(&xe_configfs);
}
diff --git a/drivers/gpu/drm/xe/xe_device_sysfs.c b/drivers/gpu/drm/xe/xe_device_sysfs.c
index bd9015761aa0..927ee7991696 100644
--- a/drivers/gpu/drm/xe/xe_device_sysfs.c
+++ b/drivers/gpu/drm/xe/xe_device_sysfs.c
@@ -308,15 +308,19 @@ int xe_device_sysfs_init(struct xe_device *xe)
return ret;
}
- if (xe->info.platform == XE_BATTLEMAGE) {
+ if (xe->info.platform == XE_BATTLEMAGE && !IS_SRIOV_VF(xe)) {
ret = sysfs_create_files(&dev->kobj, auto_link_downgrade_attrs);
if (ret)
- return ret;
+ goto cleanup;
ret = late_bind_create_files(dev);
if (ret)
- return ret;
+ goto cleanup;
}
return devm_add_action_or_reset(dev, xe_device_sysfs_fini, xe);
+
+cleanup:
+ xe_device_sysfs_fini(xe);
+ return ret;
}
diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
index d4d2c6854790..7ceb0c90f391 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -553,6 +553,12 @@ struct xe_device {
/** @pm_notifier: Our PM notifier to perform actions in response to various PM events. */
struct notifier_block pm_notifier;
+ /** @pm_block: Completion to block validating tasks on suspend / hibernate prepare */
+ struct completion pm_block;
+ /** @rebind_resume_list: List of wq items to kick on resume. */
+ struct list_head rebind_resume_list;
+ /** @rebind_resume_lock: Lock to protect the rebind_resume_list */
+ struct mutex rebind_resume_lock;
/** @pmt: Support the PMT driver callback interface */
struct {
diff --git a/drivers/gpu/drm/xe/xe_dma_buf.c b/drivers/gpu/drm/xe/xe_dma_buf.c
index 346f857f3837..af64baf872ef 100644
--- a/drivers/gpu/drm/xe/xe_dma_buf.c
+++ b/drivers/gpu/drm/xe/xe_dma_buf.c
@@ -72,7 +72,7 @@ static int xe_dma_buf_pin(struct dma_buf_attachment *attach)
return ret;
}
- ret = xe_bo_pin_external(bo);
+ ret = xe_bo_pin_external(bo, true);
xe_assert(xe, !ret);
return 0;
diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
index 44364c042ad7..374c831e691b 100644
--- a/drivers/gpu/drm/xe/xe_exec.c
+++ b/drivers/gpu/drm/xe/xe_exec.c
@@ -237,6 +237,15 @@ retry:
goto err_unlock_list;
}
+ /*
+ * It's OK to block interruptible here with the vm lock held, since
+ * on task freezing during suspend / hibernate, the call will
+ * return -ERESTARTSYS and the IOCTL will be rerun.
+ */
+ err = wait_for_completion_interruptible(&xe->pm_block);
+ if (err)
+ goto err_unlock_list;
+
vm_exec.vm = &vm->gpuvm;
vm_exec.flags = DRM_EXEC_INTERRUPTIBLE_WAIT;
if (xe_vm_in_lr_mode(vm)) {
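The pm_block completion used above follows a common kernel pattern. A minimal sketch, under the assumption (not shown in this hunk) that the PM notifier re-arms the completion on suspend prepare and opens it again on resume:

#include <linux/completion.h>

static struct completion pm_block;

static void demo_init(void)
{
	init_completion(&pm_block);
	complete_all(&pm_block);	/* device running: let tasks through */
}

static void demo_suspend_prepare(void)
{
	reinit_completion(&pm_block);	/* block new validating tasks */
}

static void demo_resume(void)
{
	complete_all(&pm_block);	/* release blocked tasks */
}

static int demo_ioctl_path(void)
{
	/* Returns -ERESTARTSYS on task freezing; the ioctl is simply rerun. */
	return wait_for_completion_interruptible(&pm_block);
}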
diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
index 8991b4aed440..c07edcda99c5 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue.c
+++ b/drivers/gpu/drm/xe/xe_exec_queue.c
@@ -151,6 +151,16 @@ err_lrc:
return err;
}
+static void __xe_exec_queue_fini(struct xe_exec_queue *q)
+{
+ int i;
+
+ q->ops->fini(q);
+
+ for (i = 0; i < q->width; ++i)
+ xe_lrc_put(q->lrc[i]);
+}
+
struct xe_exec_queue *xe_exec_queue_create(struct xe_device *xe, struct xe_vm *vm,
u32 logical_mask, u16 width,
struct xe_hw_engine *hwe, u32 flags,
@@ -181,11 +191,13 @@ struct xe_exec_queue *xe_exec_queue_create(struct xe_device *xe, struct xe_vm *v
if (xe_exec_queue_uses_pxp(q)) {
err = xe_pxp_exec_queue_add(xe->pxp, q);
if (err)
- goto err_post_alloc;
+ goto err_post_init;
}
return q;
+err_post_init:
+ __xe_exec_queue_fini(q);
err_post_alloc:
__xe_exec_queue_free(q);
return ERR_PTR(err);
@@ -283,13 +295,11 @@ void xe_exec_queue_destroy(struct kref *ref)
xe_exec_queue_put(eq);
}
- q->ops->fini(q);
+ q->ops->destroy(q);
}
void xe_exec_queue_fini(struct xe_exec_queue *q)
{
- int i;
-
/*
* Before releasing our ref to lrc and xef, accumulate our run ticks
* and wakeup any waiters.
@@ -298,9 +308,7 @@ void xe_exec_queue_fini(struct xe_exec_queue *q)
if (q->xef && atomic_dec_and_test(&q->xef->exec_queue.pending_removal))
wake_up_var(&q->xef->exec_queue.pending_removal);
- for (i = 0; i < q->width; ++i)
- xe_lrc_put(q->lrc[i]);
-
+ __xe_exec_queue_fini(q);
__xe_exec_queue_free(q);
}
diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
index cc1cffb5c87f..1c9d03f2a3e5 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
+++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
@@ -166,8 +166,14 @@ struct xe_exec_queue_ops {
int (*init)(struct xe_exec_queue *q);
/** @kill: Kill inflight submissions for backend */
void (*kill)(struct xe_exec_queue *q);
- /** @fini: Fini exec queue for submission backend */
+ /** @fini: Undoes the init() for submission backend */
void (*fini)(struct xe_exec_queue *q);
+ /**
+ * @destroy: Destroy exec queue for submission backend. The backend
+ * function must call xe_exec_queue_fini() (which will in turn call the
+ * fini() backend function) to ensure the queue is properly cleaned up.
+ */
+ void (*destroy)(struct xe_exec_queue *q);
/** @set_priority: Set priority for exec queue */
int (*set_priority)(struct xe_exec_queue *q,
enum xe_exec_queue_priority priority);
diff --git a/drivers/gpu/drm/xe/xe_execlist.c b/drivers/gpu/drm/xe/xe_execlist.c
index 788f56b066b6..f83d421ac9d3 100644
--- a/drivers/gpu/drm/xe/xe_execlist.c
+++ b/drivers/gpu/drm/xe/xe_execlist.c
@@ -385,10 +385,20 @@ err_free:
return err;
}
-static void execlist_exec_queue_fini_async(struct work_struct *w)
+static void execlist_exec_queue_fini(struct xe_exec_queue *q)
+{
+ struct xe_execlist_exec_queue *exl = q->execlist;
+
+ drm_sched_entity_fini(&exl->entity);
+ drm_sched_fini(&exl->sched);
+
+ kfree(exl);
+}
+
+static void execlist_exec_queue_destroy_async(struct work_struct *w)
{
struct xe_execlist_exec_queue *ee =
- container_of(w, struct xe_execlist_exec_queue, fini_async);
+ container_of(w, struct xe_execlist_exec_queue, destroy_async);
struct xe_exec_queue *q = ee->q;
struct xe_execlist_exec_queue *exl = q->execlist;
struct xe_device *xe = gt_to_xe(q->gt);
@@ -401,10 +411,6 @@ static void execlist_exec_queue_fini_async(struct work_struct *w)
list_del(&exl->active_link);
spin_unlock_irqrestore(&exl->port->lock, flags);
- drm_sched_entity_fini(&exl->entity);
- drm_sched_fini(&exl->sched);
- kfree(exl);
-
xe_exec_queue_fini(q);
}
@@ -413,10 +419,10 @@ static void execlist_exec_queue_kill(struct xe_exec_queue *q)
/* NIY */
}
-static void execlist_exec_queue_fini(struct xe_exec_queue *q)
+static void execlist_exec_queue_destroy(struct xe_exec_queue *q)
{
- INIT_WORK(&q->execlist->fini_async, execlist_exec_queue_fini_async);
- queue_work(system_unbound_wq, &q->execlist->fini_async);
+ INIT_WORK(&q->execlist->destroy_async, execlist_exec_queue_destroy_async);
+ queue_work(system_unbound_wq, &q->execlist->destroy_async);
}
static int execlist_exec_queue_set_priority(struct xe_exec_queue *q,
@@ -467,6 +473,7 @@ static const struct xe_exec_queue_ops execlist_exec_queue_ops = {
.init = execlist_exec_queue_init,
.kill = execlist_exec_queue_kill,
.fini = execlist_exec_queue_fini,
+ .destroy = execlist_exec_queue_destroy,
.set_priority = execlist_exec_queue_set_priority,
.set_timeslice = execlist_exec_queue_set_timeslice,
.set_preempt_timeout = execlist_exec_queue_set_preempt_timeout,
diff --git a/drivers/gpu/drm/xe/xe_execlist_types.h b/drivers/gpu/drm/xe/xe_execlist_types.h
index 415140936f11..92c4ba52db0c 100644
--- a/drivers/gpu/drm/xe/xe_execlist_types.h
+++ b/drivers/gpu/drm/xe/xe_execlist_types.h
@@ -42,7 +42,7 @@ struct xe_execlist_exec_queue {
bool has_run;
- struct work_struct fini_async;
+ struct work_struct destroy_async;
enum xe_exec_queue_priority active_priority;
struct list_head active_link;
diff --git a/drivers/gpu/drm/xe/xe_gen_wa_oob.c b/drivers/gpu/drm/xe/xe_gen_wa_oob.c
index 6581cb0f0e59..247e41c1c48d 100644
--- a/drivers/gpu/drm/xe/xe_gen_wa_oob.c
+++ b/drivers/gpu/drm/xe/xe_gen_wa_oob.c
@@ -123,11 +123,19 @@ static int parse(FILE *input, FILE *csource, FILE *cheader, char *prefix)
return 0;
}
+/* Avoid GNU vs POSIX basename() discrepancy, just use our own */
+static const char *xbasename(const char *s)
+{
+ const char *p = strrchr(s, '/');
+
+ return p ? p + 1 : s;
+}
+
static int fn_to_prefix(const char *fn, char *prefix, size_t size)
{
size_t len;
- fn = basename(fn);
+ fn = xbasename(fn);
len = strlen(fn);
if (len > size - 1)
diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
index c8eda36546d3..17634195cdc2 100644
--- a/drivers/gpu/drm/xe/xe_gt.c
+++ b/drivers/gpu/drm/xe/xe_gt.c
@@ -41,6 +41,7 @@
#include "xe_gt_topology.h"
#include "xe_guc_exec_queue_types.h"
#include "xe_guc_pc.h"
+#include "xe_guc_submit.h"
#include "xe_hw_fence.h"
#include "xe_hw_engine_class_sysfs.h"
#include "xe_irq.h"
@@ -97,7 +98,7 @@ void xe_gt_sanitize(struct xe_gt *gt)
* FIXME: if xe_uc_sanitize is called here, on TGL driver will not
* reload
*/
- gt->uc.guc.submission_state.enabled = false;
+ xe_guc_submit_disable(&gt->uc.guc);
}
static void xe_gt_enable_host_l2_vram(struct xe_gt *gt)
diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c
index 494909f74eb2..d84831a03610 100644
--- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c
+++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c
@@ -1632,7 +1632,6 @@ static u64 pf_estimate_fair_lmem(struct xe_gt *gt, unsigned int num_vfs)
u64 fair;
fair = div_u64(available, num_vfs);
- fair = rounddown_pow_of_two(fair); /* XXX: ttm_vram_mgr & drm_buddy limitation */
fair = ALIGN_DOWN(fair, alignment);
#ifdef MAX_FAIR_LMEM
fair = min_t(u64, MAX_FAIR_LMEM, fair);
diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
index b1d1d6da3758..270fc3792493 100644
--- a/drivers/gpu/drm/xe/xe_guc.c
+++ b/drivers/gpu/drm/xe/xe_guc.c
@@ -880,9 +880,7 @@ int xe_guc_post_load_init(struct xe_guc *guc)
return ret;
}
- guc->submission_state.enabled = true;
-
- return 0;
+ return xe_guc_submit_enable(guc);
}
int xe_guc_reset(struct xe_guc *guc)
@@ -1579,7 +1577,7 @@ void xe_guc_sanitize(struct xe_guc *guc)
{
xe_uc_fw_sanitize(&guc->fw);
xe_guc_ct_disable(&guc->ct);
- guc->submission_state.enabled = false;
+ xe_guc_submit_disable(guc);
}
int xe_guc_reset_prepare(struct xe_guc *guc)
diff --git a/drivers/gpu/drm/xe/xe_guc_exec_queue_types.h b/drivers/gpu/drm/xe/xe_guc_exec_queue_types.h
index a3f421e2adc0..c30c0e3ccbbb 100644
--- a/drivers/gpu/drm/xe/xe_guc_exec_queue_types.h
+++ b/drivers/gpu/drm/xe/xe_guc_exec_queue_types.h
@@ -35,8 +35,8 @@ struct xe_guc_exec_queue {
struct xe_sched_msg static_msgs[MAX_STATIC_MSG_TYPE];
/** @lr_tdr: long running TDR worker */
struct work_struct lr_tdr;
- /** @fini_async: do final fini async from this worker */
- struct work_struct fini_async;
+ /** @destroy_async: do final destroy async from this worker */
+ struct work_struct destroy_async;
/** @resume_time: time of last resume */
u64 resume_time;
/** @state: GuC specific state for this xe_exec_queue */
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index cafb47711e9b..0104afbc941c 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -32,6 +32,7 @@
#include "xe_guc_ct.h"
#include "xe_guc_exec_queue_types.h"
#include "xe_guc_id_mgr.h"
+#include "xe_guc_klv_helpers.h"
#include "xe_guc_submit_types.h"
#include "xe_hw_engine.h"
#include "xe_hw_fence.h"
@@ -316,6 +317,71 @@ int xe_guc_submit_init(struct xe_guc *guc, unsigned int num_ids)
return drmm_add_action_or_reset(&xe->drm, guc_submit_fini, guc);
}
+/*
+ * Given that we want to guarantee enough RCS throughput to avoid missing
+ * frames, we set the yield policy to 20% of each 80ms interval.
+ */
+#define RC_YIELD_DURATION 80 /* in ms */
+#define RC_YIELD_RATIO 20 /* in percent */
+static u32 *emit_render_compute_yield_klv(u32 *emit)
+{
+ *emit++ = PREP_GUC_KLV_TAG(SCHEDULING_POLICIES_RENDER_COMPUTE_YIELD);
+ *emit++ = RC_YIELD_DURATION;
+ *emit++ = RC_YIELD_RATIO;
+
+ return emit;
+}
+
+#define SCHEDULING_POLICY_MAX_DWORDS 16
+static int guc_init_global_schedule_policy(struct xe_guc *guc)
+{
+ u32 data[SCHEDULING_POLICY_MAX_DWORDS];
+ u32 *emit = data;
+ u32 count = 0;
+ int ret;
+
+ if (GUC_SUBMIT_VER(guc) < MAKE_GUC_VER(1, 1, 0))
+ return 0;
+
+ *emit++ = XE_GUC_ACTION_UPDATE_SCHEDULING_POLICIES_KLV;
+
+ if (CCS_MASK(guc_to_gt(guc)))
+ emit = emit_render_compute_yield_klv(emit);
+
+ count = emit - data;
+ if (count > 1) {
+ xe_assert(guc_to_xe(guc), count <= SCHEDULING_POLICY_MAX_DWORDS);
+
+ ret = xe_guc_ct_send_block(&guc->ct, data, count);
+ if (ret < 0) {
+ xe_gt_err(guc_to_gt(guc),
+ "failed to enable GuC sheduling policies: %pe\n",
+ ERR_PTR(ret));
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+int xe_guc_submit_enable(struct xe_guc *guc)
+{
+ int ret;
+
+ ret = guc_init_global_schedule_policy(guc);
+ if (ret)
+ return ret;
+
+ guc->submission_state.enabled = true;
+
+ return 0;
+}
+
+void xe_guc_submit_disable(struct xe_guc *guc)
+{
+ guc->submission_state.enabled = false;
+}
+
static void __release_guc_id(struct xe_guc *guc, struct xe_exec_queue *q, u32 xa_count)
{
int i;
@@ -1277,48 +1343,57 @@ rearm:
return DRM_GPU_SCHED_STAT_NO_HANG;
}
-static void __guc_exec_queue_fini_async(struct work_struct *w)
+static void guc_exec_queue_fini(struct xe_exec_queue *q)
+{
+ struct xe_guc_exec_queue *ge = q->guc;
+ struct xe_guc *guc = exec_queue_to_guc(q);
+
+ release_guc_id(guc, q);
+ xe_sched_entity_fini(&ge->entity);
+ xe_sched_fini(&ge->sched);
+
+ /*
+ * RCU free due sched being exported via DRM scheduler fences
+ * (timeline name).
+ */
+ kfree_rcu(ge, rcu);
+}
+
+static void __guc_exec_queue_destroy_async(struct work_struct *w)
{
struct xe_guc_exec_queue *ge =
- container_of(w, struct xe_guc_exec_queue, fini_async);
+ container_of(w, struct xe_guc_exec_queue, destroy_async);
struct xe_exec_queue *q = ge->q;
struct xe_guc *guc = exec_queue_to_guc(q);
xe_pm_runtime_get(guc_to_xe(guc));
trace_xe_exec_queue_destroy(q);
- release_guc_id(guc, q);
if (xe_exec_queue_is_lr(q))
cancel_work_sync(&ge->lr_tdr);
/* Confirm no work left behind accessing device structures */
cancel_delayed_work_sync(&ge->sched.base.work_tdr);
- xe_sched_entity_fini(&ge->entity);
- xe_sched_fini(&ge->sched);
- /*
- * RCU free due sched being exported via DRM scheduler fences
- * (timeline name).
- */
- kfree_rcu(ge, rcu);
xe_exec_queue_fini(q);
+
xe_pm_runtime_put(guc_to_xe(guc));
}
-static void guc_exec_queue_fini_async(struct xe_exec_queue *q)
+static void guc_exec_queue_destroy_async(struct xe_exec_queue *q)
{
struct xe_guc *guc = exec_queue_to_guc(q);
struct xe_device *xe = guc_to_xe(guc);
- INIT_WORK(&q->guc->fini_async, __guc_exec_queue_fini_async);
+ INIT_WORK(&q->guc->destroy_async, __guc_exec_queue_destroy_async);
/* We must block on kernel engines so slabs are empty on driver unload */
if (q->flags & EXEC_QUEUE_FLAG_PERMANENT || exec_queue_wedged(q))
- __guc_exec_queue_fini_async(&q->guc->fini_async);
+ __guc_exec_queue_destroy_async(&q->guc->destroy_async);
else
- queue_work(xe->destroy_wq, &q->guc->fini_async);
+ queue_work(xe->destroy_wq, &q->guc->destroy_async);
}
-static void __guc_exec_queue_fini(struct xe_guc *guc, struct xe_exec_queue *q)
+static void __guc_exec_queue_destroy(struct xe_guc *guc, struct xe_exec_queue *q)
{
/*
* Might be done from within the GPU scheduler, need to do async as we
@@ -1327,7 +1402,7 @@ static void __guc_exec_queue_fini(struct xe_guc *guc, struct xe_exec_queue *q)
* this we and don't really care when everything is fini'd, just that it
* is.
*/
- guc_exec_queue_fini_async(q);
+ guc_exec_queue_destroy_async(q);
}
static void __guc_exec_queue_process_msg_cleanup(struct xe_sched_msg *msg)
@@ -1341,7 +1416,7 @@ static void __guc_exec_queue_process_msg_cleanup(struct xe_sched_msg *msg)
if (exec_queue_registered(q))
disable_scheduling_deregister(guc, q);
else
- __guc_exec_queue_fini(guc, q);
+ __guc_exec_queue_destroy(guc, q);
}
static bool guc_exec_queue_allowed_to_change_state(struct xe_exec_queue *q)
@@ -1574,14 +1649,14 @@ static bool guc_exec_queue_try_add_msg(struct xe_exec_queue *q,
#define STATIC_MSG_CLEANUP 0
#define STATIC_MSG_SUSPEND 1
#define STATIC_MSG_RESUME 2
-static void guc_exec_queue_fini(struct xe_exec_queue *q)
+static void guc_exec_queue_destroy(struct xe_exec_queue *q)
{
struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_CLEANUP;
if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT) && !exec_queue_wedged(q))
guc_exec_queue_add_msg(q, msg, CLEANUP);
else
- __guc_exec_queue_fini(exec_queue_to_guc(q), q);
+ __guc_exec_queue_destroy(exec_queue_to_guc(q), q);
}
static int guc_exec_queue_set_priority(struct xe_exec_queue *q,
@@ -1711,6 +1786,7 @@ static const struct xe_exec_queue_ops guc_exec_queue_ops = {
.init = guc_exec_queue_init,
.kill = guc_exec_queue_kill,
.fini = guc_exec_queue_fini,
+ .destroy = guc_exec_queue_destroy,
.set_priority = guc_exec_queue_set_priority,
.set_timeslice = guc_exec_queue_set_timeslice,
.set_preempt_timeout = guc_exec_queue_set_preempt_timeout,
@@ -1732,7 +1808,7 @@ static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
if (exec_queue_extra_ref(q) || xe_exec_queue_is_lr(q))
xe_exec_queue_put(q);
else if (exec_queue_destroyed(q))
- __guc_exec_queue_fini(guc, q);
+ __guc_exec_queue_destroy(guc, q);
}
if (q->guc->suspend_pending) {
set_exec_queue_suspended(q);
@@ -1989,7 +2065,7 @@ static void handle_deregister_done(struct xe_guc *guc, struct xe_exec_queue *q)
if (exec_queue_extra_ref(q) || xe_exec_queue_is_lr(q))
xe_exec_queue_put(q);
else
- __guc_exec_queue_fini(guc, q);
+ __guc_exec_queue_destroy(guc, q);
}
int xe_guc_deregister_done_handler(struct xe_guc *guc, u32 *msg, u32 len)
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.h b/drivers/gpu/drm/xe/xe_guc_submit.h
index 9b71a986c6ca..0d126b807c10 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.h
+++ b/drivers/gpu/drm/xe/xe_guc_submit.h
@@ -13,6 +13,8 @@ struct xe_exec_queue;
struct xe_guc;
int xe_guc_submit_init(struct xe_guc *guc, unsigned int num_ids);
+int xe_guc_submit_enable(struct xe_guc *guc);
+void xe_guc_submit_disable(struct xe_guc *guc);
int xe_guc_submit_reset_prepare(struct xe_guc *guc);
void xe_guc_submit_reset_wait(struct xe_guc *guc);
diff --git a/drivers/gpu/drm/xe/xe_hwmon.c b/drivers/gpu/drm/xe/xe_hwmon.c
index f08fc4377d25..c5b63e10bb91 100644
--- a/drivers/gpu/drm/xe/xe_hwmon.c
+++ b/drivers/gpu/drm/xe/xe_hwmon.c
@@ -286,7 +286,7 @@ static struct xe_reg xe_hwmon_get_reg(struct xe_hwmon *hwmon, enum xe_hwmon_reg
*/
static void xe_hwmon_power_max_read(struct xe_hwmon *hwmon, u32 attr, int channel, long *value)
{
- u64 reg_val = 0, min, max;
+ u32 reg_val = 0;
struct xe_device *xe = hwmon->xe;
struct xe_reg rapl_limit, pkg_power_sku;
struct xe_mmio *mmio = xe_root_tile_mmio(xe);
@@ -294,7 +294,7 @@ static void xe_hwmon_power_max_read(struct xe_hwmon *hwmon, u32 attr, int channe
mutex_lock(&hwmon->hwmon_lock);
if (hwmon->xe->info.has_mbx_power_limits) {
- xe_hwmon_pcode_read_power_limit(hwmon, attr, channel, (u32 *)&reg_val);
+ xe_hwmon_pcode_read_power_limit(hwmon, attr, channel, &reg_val);
} else {
rapl_limit = xe_hwmon_get_reg(hwmon, REG_PKG_RAPL_LIMIT, channel);
pkg_power_sku = xe_hwmon_get_reg(hwmon, REG_PKG_POWER_SKU, channel);
@@ -304,19 +304,21 @@ static void xe_hwmon_power_max_read(struct xe_hwmon *hwmon, u32 attr, int channe
/* Check if PL limits are disabled. */
if (!(reg_val & PWR_LIM_EN)) {
*value = PL_DISABLE;
- drm_info(&hwmon->xe->drm, "%s disabled for channel %d, val 0x%016llx\n",
+ drm_info(&hwmon->xe->drm, "%s disabled for channel %d, val 0x%08x\n",
PWR_ATTR_TO_STR(attr), channel, reg_val);
goto unlock;
}
reg_val = REG_FIELD_GET(PWR_LIM_VAL, reg_val);
- *value = mul_u64_u32_shr(reg_val, SF_POWER, hwmon->scl_shift_power);
+ *value = mul_u32_u32(reg_val, SF_POWER) >> hwmon->scl_shift_power;
/* For platforms with mailbox power limit support clamping would be done by pcode. */
if (!hwmon->xe->info.has_mbx_power_limits) {
- reg_val = xe_mmio_read64_2x32(mmio, pkg_power_sku);
- min = REG_FIELD_GET(PKG_MIN_PWR, reg_val);
- max = REG_FIELD_GET(PKG_MAX_PWR, reg_val);
+ u64 pkg_pwr, min, max;
+
+ pkg_pwr = xe_mmio_read64_2x32(mmio, pkg_power_sku);
+ min = REG_FIELD_GET(PKG_MIN_PWR, pkg_pwr);
+ max = REG_FIELD_GET(PKG_MAX_PWR, pkg_pwr);
min = mul_u64_u32_shr(min, SF_POWER, hwmon->scl_shift_power);
max = mul_u64_u32_shr(max, SF_POWER, hwmon->scl_shift_power);
if (min && max)
@@ -332,6 +334,7 @@ static int xe_hwmon_power_max_write(struct xe_hwmon *hwmon, u32 attr, int channe
int ret = 0;
u32 reg_val, max;
struct xe_reg rapl_limit;
+ u64 max_supp_power_limit = 0;
mutex_lock(&hwmon->hwmon_lock);
@@ -356,6 +359,20 @@ static int xe_hwmon_power_max_write(struct xe_hwmon *hwmon, u32 attr, int channe
goto unlock;
}
+ /*
+ * If the sysfs value exceeds the maximum pcode-supported power limit value, clamp it to
+ * the supported maximum (U12.3 format).
+ * This avoids truncation during the reg_val calculation below and ensures that a valid
+ * power limit is sent to pcode, which clamps it to the card-supported value.
+ */
+ max_supp_power_limit = ((PWR_LIM_VAL) >> hwmon->scl_shift_power) * SF_POWER;
+ if (value > max_supp_power_limit) {
+ value = max_supp_power_limit;
+ drm_info(&hwmon->xe->drm,
+ "Power limit clamped as selected %s exceeds channel %d limit\n",
+ PWR_ATTR_TO_STR(attr), channel);
+ }
+
/* Computation in 64-bits to avoid overflow. Round to nearest. */
reg_val = DIV_ROUND_CLOSEST_ULL((u64)value << hwmon->scl_shift_power, SF_POWER);
@@ -478,8 +495,8 @@ xe_hwmon_power_max_interval_show(struct device *dev, struct device_attribute *at
{
struct xe_hwmon *hwmon = dev_get_drvdata(dev);
struct xe_mmio *mmio = xe_root_tile_mmio(hwmon->xe);
- u32 x, y, x_w = 2; /* 2 bits */
- u64 r, tau4, out;
+ u32 reg_val, x, y, x_w = 2; /* 2 bits */
+ u64 tau4, out;
int channel = (to_sensor_dev_attr(attr)->index % 2) ? CHANNEL_PKG : CHANNEL_CARD;
u32 power_attr = (to_sensor_dev_attr(attr)->index > 1) ? PL2_HWMON_ATTR : PL1_HWMON_ATTR;
@@ -490,23 +507,24 @@ xe_hwmon_power_max_interval_show(struct device *dev, struct device_attribute *at
mutex_lock(&hwmon->hwmon_lock);
if (hwmon->xe->info.has_mbx_power_limits) {
- ret = xe_hwmon_pcode_read_power_limit(hwmon, power_attr, channel, (u32 *)&r);
+ ret = xe_hwmon_pcode_read_power_limit(hwmon, power_attr, channel, &reg_val);
if (ret) {
drm_err(&hwmon->xe->drm,
- "power interval read fail, ch %d, attr %d, r 0%llx, ret %d\n",
- channel, power_attr, r, ret);
- r = 0;
+ "power interval read fail, ch %d, attr %d, val 0x%08x, ret %d\n",
+ channel, power_attr, reg_val, ret);
+ reg_val = 0;
}
} else {
- r = xe_mmio_read32(mmio, xe_hwmon_get_reg(hwmon, REG_PKG_RAPL_LIMIT, channel));
+ reg_val = xe_mmio_read32(mmio, xe_hwmon_get_reg(hwmon, REG_PKG_RAPL_LIMIT,
+ channel));
}
mutex_unlock(&hwmon->hwmon_lock);
xe_pm_runtime_put(hwmon->xe);
- x = REG_FIELD_GET(PWR_LIM_TIME_X, r);
- y = REG_FIELD_GET(PWR_LIM_TIME_Y, r);
+ x = REG_FIELD_GET(PWR_LIM_TIME_X, reg_val);
+ y = REG_FIELD_GET(PWR_LIM_TIME_Y, reg_val);
/*
* tau = (1 + (x / 4)) * power(2,y), x = bits(23:22), y = bits(21:17)
@@ -739,9 +757,23 @@ static int xe_hwmon_power_curr_crit_write(struct xe_hwmon *hwmon, int channel,
{
int ret;
u32 uval;
+ u64 max_crit_power_curr = 0;
mutex_lock(&hwmon->hwmon_lock);
+ /*
+ * If the sysfs value exceeds the pcode mailbox cmd POWER_SETUP_SUBCOMMAND_WRITE_I1
+ * max supported value, clamp it to the command's max (U10.6 format).
+ * This avoids truncation during the uval calculation below and ensures that a valid power
+ * limit is sent to pcode, which clamps it to the card-supported value.
+ */
+ max_crit_power_curr = (POWER_SETUP_I1_DATA_MASK >> POWER_SETUP_I1_SHIFT) * scale_factor;
+ if (value > max_crit_power_curr) {
+ value = max_crit_power_curr;
+ drm_info(&hwmon->xe->drm,
+ "Power limit clamped as selected exceeds channel %d limit\n",
+ channel);
+ }
uval = DIV_ROUND_CLOSEST_ULL(value << POWER_SETUP_I1_SHIFT, scale_factor);
ret = xe_hwmon_pcode_write_i1(hwmon, uval);
diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
index ba1cff2e4cda..84f412fd3c5d 100644
--- a/drivers/gpu/drm/xe/xe_migrate.c
+++ b/drivers/gpu/drm/xe/xe_migrate.c
@@ -408,7 +408,7 @@ struct xe_migrate *xe_migrate_init(struct xe_tile *tile)
/* Special layout, prepared below.. */
vm = xe_vm_create(xe, XE_VM_FLAG_MIGRATION |
- XE_VM_FLAG_SET_TILE_ID(tile));
+ XE_VM_FLAG_SET_TILE_ID(tile), NULL);
if (IS_ERR(vm))
return ERR_CAST(vm);
@@ -1820,15 +1820,19 @@ int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
if (!IS_ALIGNED(len, XE_CACHELINE_BYTES) ||
!IS_ALIGNED((unsigned long)buf + offset, XE_CACHELINE_BYTES)) {
int buf_offset = 0;
+ void *bounce;
+ int err;
+
+ BUILD_BUG_ON(!is_power_of_2(XE_CACHELINE_BYTES));
+ bounce = kmalloc(XE_CACHELINE_BYTES, GFP_KERNEL);
+ if (!bounce)
+ return -ENOMEM;
/*
* Less than ideal for large unaligned access but this should be
* fairly rare, can fixup if this becomes common.
*/
do {
- u8 bounce[XE_CACHELINE_BYTES];
- void *ptr = (void *)bounce;
- int err;
int copy_bytes = min_t(int, bytes_left,
XE_CACHELINE_BYTES -
(offset & XE_CACHELINE_MASK));
@@ -1837,22 +1841,22 @@ int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
err = xe_migrate_access_memory(m, bo,
offset &
~XE_CACHELINE_MASK,
- (void *)ptr,
- sizeof(bounce), 0);
+ bounce,
+ XE_CACHELINE_BYTES, 0);
if (err)
- return err;
+ break;
if (write) {
- memcpy(ptr + ptr_offset, buf + buf_offset, copy_bytes);
+ memcpy(bounce + ptr_offset, buf + buf_offset, copy_bytes);
err = xe_migrate_access_memory(m, bo,
offset & ~XE_CACHELINE_MASK,
- (void *)ptr,
- sizeof(bounce), write);
+ bounce,
+ XE_CACHELINE_BYTES, write);
if (err)
- return err;
+ break;
} else {
- memcpy(buf + buf_offset, ptr + ptr_offset,
+ memcpy(buf + buf_offset, bounce + ptr_offset,
copy_bytes);
}
@@ -1861,7 +1865,8 @@ int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
offset += copy_bytes;
} while (bytes_left);
- return 0;
+ kfree(bounce);
+ return err;
}
dma_addr = xe_migrate_dma_map(xe, buf, len + page_offset, write);
@@ -1882,8 +1887,11 @@ int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
else
current_bytes = min_t(int, bytes_left, cursor.size);
- if (fence)
- dma_fence_put(fence);
+ if (current_bytes & ~PAGE_MASK) {
+ int pitch = 4;
+
+ current_bytes = min_t(int, current_bytes, S16_MAX * pitch);
+ }
__fence = xe_migrate_vram(m, current_bytes,
(unsigned long)buf & ~PAGE_MASK,
@@ -1892,11 +1900,15 @@ int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
XE_MIGRATE_COPY_TO_VRAM :
XE_MIGRATE_COPY_TO_SRAM);
if (IS_ERR(__fence)) {
- if (fence)
+ if (fence) {
dma_fence_wait(fence, false);
+ dma_fence_put(fence);
+ }
fence = __fence;
goto out_err;
}
+
+ dma_fence_put(fence);
fence = __fence;
buf += current_bytes;
diff --git a/drivers/gpu/drm/xe/xe_nvm.c b/drivers/gpu/drm/xe/xe_nvm.c
index 61b0a1531a53..2cfe9eb67391 100644
--- a/drivers/gpu/drm/xe/xe_nvm.c
+++ b/drivers/gpu/drm/xe/xe_nvm.c
@@ -35,6 +35,10 @@ static const struct intel_dg_nvm_region regions[INTEL_DG_NVM_REGIONS] = {
static void xe_nvm_release_dev(struct device *dev)
{
+ struct auxiliary_device *aux = container_of(dev, struct auxiliary_device, dev);
+ struct intel_dg_nvm_dev *nvm = container_of(aux, struct intel_dg_nvm_dev, aux_dev);
+
+ kfree(nvm);
}
static bool xe_nvm_non_posted_erase(struct xe_device *xe)
@@ -162,6 +166,5 @@ void xe_nvm_fini(struct xe_device *xe)
auxiliary_device_delete(&nvm->aux_dev);
auxiliary_device_uninit(&nvm->aux_dev);
- kfree(nvm);
xe->nvm = NULL;
}
diff --git a/drivers/gpu/drm/xe/xe_pci_sriov.c b/drivers/gpu/drm/xe/xe_pci_sriov.c
index 447a7867eecb..af05db07162e 100644
--- a/drivers/gpu/drm/xe/xe_pci_sriov.c
+++ b/drivers/gpu/drm/xe/xe_pci_sriov.c
@@ -3,6 +3,10 @@
* Copyright © 2023-2024 Intel Corporation
*/
+#include <linux/bitops.h>
+#include <linux/pci.h>
+
+#include "regs/xe_bars.h"
#include "xe_assert.h"
#include "xe_device.h"
#include "xe_gt_sriov_pf_config.h"
@@ -128,6 +132,18 @@ static void pf_engine_activity_stats(struct xe_device *xe, unsigned int num_vfs,
}
}
+static int resize_vf_vram_bar(struct xe_device *xe, int num_vfs)
+{
+ struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
+ u32 sizes;
+
+ sizes = pci_iov_vf_bar_get_sizes(pdev, VF_LMEM_BAR, num_vfs);
+ if (!sizes)
+ return 0;
+
+ return pci_iov_vf_bar_set_size(pdev, VF_LMEM_BAR, __fls(sizes));
+}
+
static int pf_enable_vfs(struct xe_device *xe, int num_vfs)
{
struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
@@ -158,6 +174,12 @@ static int pf_enable_vfs(struct xe_device *xe, int num_vfs)
if (err < 0)
goto failed;
+ if (IS_DGFX(xe)) {
+ err = resize_vf_vram_bar(xe, num_vfs);
+ if (err)
+ xe_sriov_info(xe, "Failed to set VF LMEM BAR size: %d\n", err);
+ }
+
err = pci_enable_sriov(pdev, num_vfs);
if (err < 0)
goto failed;
diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c
index e279b47ba03b..bb9b6ecad2af 100644
--- a/drivers/gpu/drm/xe/xe_pm.c
+++ b/drivers/gpu/drm/xe/xe_pm.c
@@ -24,6 +24,7 @@
#include "xe_pcode.h"
#include "xe_pxp.h"
#include "xe_trace.h"
+#include "xe_vm.h"
#include "xe_wa.h"
/**
@@ -290,6 +291,19 @@ static u32 vram_threshold_value(struct xe_device *xe)
return DEFAULT_VRAM_THRESHOLD;
}
+static void xe_pm_wake_rebind_workers(struct xe_device *xe)
+{
+ struct xe_vm *vm, *next;
+
+ mutex_lock(&xe->rebind_resume_lock);
+ list_for_each_entry_safe(vm, next, &xe->rebind_resume_list,
+ preempt.pm_activate_link) {
+ list_del_init(&vm->preempt.pm_activate_link);
+ xe_vm_resume_rebind_worker(vm);
+ }
+ mutex_unlock(&xe->rebind_resume_lock);
+}
+
static int xe_pm_notifier_callback(struct notifier_block *nb,
unsigned long action, void *data)
{
@@ -299,30 +313,30 @@ static int xe_pm_notifier_callback(struct notifier_block *nb,
switch (action) {
case PM_HIBERNATION_PREPARE:
case PM_SUSPEND_PREPARE:
+ reinit_completion(&xe->pm_block);
xe_pm_runtime_get(xe);
err = xe_bo_evict_all_user(xe);
- if (err) {
+ if (err)
drm_dbg(&xe->drm, "Notifier evict user failed (%d)\n", err);
- xe_pm_runtime_put(xe);
- break;
- }
err = xe_bo_notifier_prepare_all_pinned(xe);
- if (err) {
+ if (err)
drm_dbg(&xe->drm, "Notifier prepare pin failed (%d)\n", err);
- xe_pm_runtime_put(xe);
- }
+ /*
+ * Keep the runtime pm reference until post hibernation / post suspend to
+ * avoid a runtime suspend interfering with evicted objects or backup
+ * allocations.
+ */
break;
case PM_POST_HIBERNATION:
case PM_POST_SUSPEND:
+ complete_all(&xe->pm_block);
+ xe_pm_wake_rebind_workers(xe);
xe_bo_notifier_unprepare_all_pinned(xe);
xe_pm_runtime_put(xe);
break;
}
- if (err)
- return NOTIFY_BAD;
-
return NOTIFY_DONE;
}
@@ -344,6 +358,14 @@ int xe_pm_init(struct xe_device *xe)
if (err)
return err;
+ err = drmm_mutex_init(&xe->drm, &xe->rebind_resume_lock);
+ if (err)
+ goto err_unregister;
+
+ init_completion(&xe->pm_block);
+ complete_all(&xe->pm_block);
+ INIT_LIST_HEAD(&xe->rebind_resume_list);
+
/* For now suspend/resume is only allowed with GuC */
if (!xe_device_uc_enabled(xe))
return 0;
diff --git a/drivers/gpu/drm/xe/xe_pxp_submit.c b/drivers/gpu/drm/xe/xe_pxp_submit.c
index d92ec0f515b0..ca95f2a4d4ef 100644
--- a/drivers/gpu/drm/xe/xe_pxp_submit.c
+++ b/drivers/gpu/drm/xe/xe_pxp_submit.c
@@ -101,7 +101,7 @@ static int allocate_gsc_client_resources(struct xe_gt *gt,
xe_assert(xe, hwe);
/* PXP instructions must be issued from PPGTT */
- vm = xe_vm_create(xe, XE_VM_FLAG_GSC);
+ vm = xe_vm_create(xe, XE_VM_FLAG_GSC, NULL);
if (IS_ERR(vm))
return PTR_ERR(vm);
diff --git a/drivers/gpu/drm/xe/xe_shrinker.c b/drivers/gpu/drm/xe/xe_shrinker.c
index 1c3c04d52f55..90244fe59b59 100644
--- a/drivers/gpu/drm/xe/xe_shrinker.c
+++ b/drivers/gpu/drm/xe/xe_shrinker.c
@@ -54,10 +54,10 @@ xe_shrinker_mod_pages(struct xe_shrinker *shrinker, long shrinkable, long purgea
write_unlock(&shrinker->lock);
}
-static s64 xe_shrinker_walk(struct xe_device *xe,
- struct ttm_operation_ctx *ctx,
- const struct xe_bo_shrink_flags flags,
- unsigned long to_scan, unsigned long *scanned)
+static s64 __xe_shrinker_walk(struct xe_device *xe,
+ struct ttm_operation_ctx *ctx,
+ const struct xe_bo_shrink_flags flags,
+ unsigned long to_scan, unsigned long *scanned)
{
unsigned int mem_type;
s64 freed = 0, lret;
@@ -93,6 +93,48 @@ static s64 xe_shrinker_walk(struct xe_device *xe,
return freed;
}
+/*
+ * Try shrinking idle objects without writeback first; if that is not sufficient,
+ * also try non-idle objects, and finally, if that is still not sufficient,
+ * add writeback. This avoids stalls and explicit writebacks under light or
+ * moderate memory pressure.
+ */
+static s64 xe_shrinker_walk(struct xe_device *xe,
+ struct ttm_operation_ctx *ctx,
+ const struct xe_bo_shrink_flags flags,
+ unsigned long to_scan, unsigned long *scanned)
+{
+ bool no_wait_gpu = true;
+ struct xe_bo_shrink_flags save_flags = flags;
+ s64 lret, freed;
+
+ swap(no_wait_gpu, ctx->no_wait_gpu);
+ save_flags.writeback = false;
+ lret = __xe_shrinker_walk(xe, ctx, save_flags, to_scan, scanned);
+ swap(no_wait_gpu, ctx->no_wait_gpu);
+ if (lret < 0 || *scanned >= to_scan)
+ return lret;
+
+ freed = lret;
+ if (!ctx->no_wait_gpu) {
+ lret = __xe_shrinker_walk(xe, ctx, save_flags, to_scan, scanned);
+ if (lret < 0)
+ return lret;
+ freed += lret;
+ if (*scanned >= to_scan)
+ return freed;
+ }
+
+ if (flags.writeback) {
+ lret = __xe_shrinker_walk(xe, ctx, flags, to_scan, scanned);
+ if (lret < 0)
+ return lret;
+ freed += lret;
+ }
+
+ return freed;
+}
+
static unsigned long
xe_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
{
@@ -199,6 +241,7 @@ static unsigned long xe_shrinker_scan(struct shrinker *shrink, struct shrink_con
runtime_pm = xe_shrinker_runtime_pm_get(shrinker, true, 0, can_backup);
shrink_flags.purge = false;
+
lret = xe_shrinker_walk(shrinker->xe, &ctx, shrink_flags,
nr_to_scan, &nr_scanned);
if (lret >= 0)
diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c
index 41705f5d52e3..8f7b0add2364 100644
--- a/drivers/gpu/drm/xe/xe_survivability_mode.c
+++ b/drivers/gpu/drm/xe/xe_survivability_mode.c
@@ -41,6 +41,8 @@
*
* # echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode
*
+ * It is the user's responsibility to clear the mode once the firmware flash is complete.
+ *
* Refer :ref:`xe_configfs` for more details on how to use configfs
*
* Survivability mode is indicated by the below admin-only readable sysfs which provides additional
@@ -147,7 +149,6 @@ static void xe_survivability_mode_fini(void *arg)
struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
struct device *dev = &pdev->dev;
- xe_configfs_clear_survivability_mode(pdev);
sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
}
diff --git a/drivers/gpu/drm/xe/xe_sync.c b/drivers/gpu/drm/xe/xe_sync.c
index f87276df18f2..82872a51f098 100644
--- a/drivers/gpu/drm/xe/xe_sync.c
+++ b/drivers/gpu/drm/xe/xe_sync.c
@@ -77,6 +77,7 @@ static void user_fence_worker(struct work_struct *w)
{
struct xe_user_fence *ufence = container_of(w, struct xe_user_fence, worker);
+ WRITE_ONCE(ufence->signalled, 1);
if (mmget_not_zero(ufence->mm)) {
kthread_use_mm(ufence->mm);
if (copy_to_user(ufence->addr, &ufence->value, sizeof(ufence->value)))
@@ -91,7 +92,6 @@ static void user_fence_worker(struct work_struct *w)
* Wake up waiters only after updating the ufence state, allowing the UMD
* to safely reuse the same ufence without encountering -EBUSY errors.
*/
- WRITE_ONCE(ufence->signalled, 1);
wake_up_all(&ufence->xe->ufence_wq);
user_fence_put(ufence);
}
diff --git a/drivers/gpu/drm/xe/xe_tile_sysfs.c b/drivers/gpu/drm/xe/xe_tile_sysfs.c
index b804234a6551..9e1236a9ec67 100644
--- a/drivers/gpu/drm/xe/xe_tile_sysfs.c
+++ b/drivers/gpu/drm/xe/xe_tile_sysfs.c
@@ -44,16 +44,18 @@ int xe_tile_sysfs_init(struct xe_tile *tile)
kt->tile = tile;
err = kobject_add(&kt->base, &dev->kobj, "tile%d", tile->id);
- if (err) {
- kobject_put(&kt->base);
- return err;
- }
+ if (err)
+ goto err_object;
tile->sysfs = &kt->base;
err = xe_vram_freq_sysfs_init(tile);
if (err)
- return err;
+ goto err_object;
return devm_add_action_or_reset(xe->drm.dev, tile_sysfs_fini, tile);
+
+err_object:
+ kobject_put(&kt->base);
+ return err;
}
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index 2035604121e6..5146999d27fa 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -240,8 +240,8 @@ int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
pfence = xe_preempt_fence_create(q, q->lr.context,
++q->lr.seqno);
- if (!pfence) {
- err = -ENOMEM;
+ if (IS_ERR(pfence)) {
+ err = PTR_ERR(pfence);
goto out_fini;
}
@@ -393,6 +393,9 @@ static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
list_move_tail(&gpuva_to_vma(gpuva)->combined_links.rebind,
&vm->rebind_list);
+ if (!try_wait_for_completion(&vm->xe->pm_block))
+ return -EAGAIN;
+
ret = xe_bo_validate(gem_to_xe_bo(vm_bo->obj), vm, false);
if (ret)
return ret;
@@ -479,6 +482,33 @@ static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
return xe_vm_validate_rebind(vm, exec, vm->preempt.num_exec_queues);
}
+static bool vm_suspend_rebind_worker(struct xe_vm *vm)
+{
+ struct xe_device *xe = vm->xe;
+ bool ret = false;
+
+ mutex_lock(&xe->rebind_resume_lock);
+ if (!try_wait_for_completion(&vm->xe->pm_block)) {
+ ret = true;
+ list_move_tail(&vm->preempt.pm_activate_link, &xe->rebind_resume_list);
+ }
+ mutex_unlock(&xe->rebind_resume_lock);
+
+ return ret;
+}
+
+/**
+ * xe_vm_resume_rebind_worker() - Resume the rebind worker.
+ * @vm: The vm whose preempt worker to resume.
+ *
+ * Resume a preempt worker that was previously suspended by
+ * vm_suspend_rebind_worker().
+ */
+void xe_vm_resume_rebind_worker(struct xe_vm *vm)
+{
+ queue_work(vm->xe->ordered_wq, &vm->preempt.rebind_work);
+}
+
static void preempt_rebind_work_func(struct work_struct *w)
{
struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
@@ -502,6 +532,11 @@ static void preempt_rebind_work_func(struct work_struct *w)
}
retry:
+ if (!try_wait_for_completion(&vm->xe->pm_block) && vm_suspend_rebind_worker(vm)) {
+ up_write(&vm->lock);
+ return;
+ }
+
if (xe_vm_userptr_check_repin(vm)) {
err = xe_vm_userptr_pin(vm);
if (err)
@@ -1610,8 +1645,12 @@ static int xe_vm_create_scratch(struct xe_device *xe, struct xe_tile *tile,
for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; i++) {
vm->scratch_pt[id][i] = xe_pt_create(vm, tile, i);
- if (IS_ERR(vm->scratch_pt[id][i]))
- return PTR_ERR(vm->scratch_pt[id][i]);
+ if (IS_ERR(vm->scratch_pt[id][i])) {
+ int err = PTR_ERR(vm->scratch_pt[id][i]);
+
+ vm->scratch_pt[id][i] = NULL;
+ return err;
+ }
xe_pt_populate_empty(tile, vm, vm->scratch_pt[id][i]);
}
@@ -1640,7 +1679,7 @@ static void xe_vm_free_scratch(struct xe_vm *vm)
}
}
-struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
+struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags, struct xe_file *xef)
{
struct drm_gem_object *vm_resv_obj;
struct xe_vm *vm;
@@ -1661,9 +1700,10 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
vm->xe = xe;
vm->size = 1ull << xe->info.va_bits;
-
vm->flags = flags;
+ if (xef)
+ vm->xef = xe_file_get(xef);
/**
* GSC VMs are kernel-owned, only used for PXP ops and can sometimes be
* manipulated under the PXP mutex. However, the PXP mutex can be taken
@@ -1709,6 +1749,7 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
if (flags & XE_VM_FLAG_LR_MODE) {
INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
xe_pm_runtime_get_noresume(xe);
+ INIT_LIST_HEAD(&vm->preempt.pm_activate_link);
}
if (flags & XE_VM_FLAG_FAULT_MODE) {
@@ -1794,6 +1835,20 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
if (number_tiles > 1)
vm->composite_fence_ctx = dma_fence_context_alloc(1);
+ if (xef && xe->info.has_asid) {
+ u32 asid;
+
+ down_write(&xe->usm.lock);
+ err = xa_alloc_cyclic(&xe->usm.asid_to_vm, &asid, vm,
+ XA_LIMIT(1, XE_MAX_ASID - 1),
+ &xe->usm.next_asid, GFP_KERNEL);
+ up_write(&xe->usm.lock);
+ if (err < 0)
+ goto err_unlock_close;
+
+ vm->usm.asid = asid;
+ }
+
trace_xe_vm_create(vm);
return vm;
@@ -1814,6 +1869,8 @@ err_no_resv:
for_each_tile(tile, xe, id)
xe_range_fence_tree_fini(&vm->rftree[id]);
ttm_lru_bulk_move_fini(&xe->ttm, &vm->lru_bulk_move);
+ if (vm->xef)
+ xe_file_put(vm->xef);
kfree(vm);
if (flags & XE_VM_FLAG_LR_MODE)
xe_pm_runtime_put(xe);
@@ -1874,8 +1931,12 @@ void xe_vm_close_and_put(struct xe_vm *vm)
xe_assert(xe, !vm->preempt.num_exec_queues);
xe_vm_close(vm);
- if (xe_vm_in_preempt_fence_mode(vm))
+ if (xe_vm_in_preempt_fence_mode(vm)) {
+ mutex_lock(&xe->rebind_resume_lock);
+ list_del_init(&vm->preempt.pm_activate_link);
+ mutex_unlock(&xe->rebind_resume_lock);
flush_work(&vm->preempt.rebind_work);
+ }
if (xe_vm_in_fault_mode(vm))
xe_svm_close(vm);
@@ -2059,9 +2120,8 @@ int xe_vm_create_ioctl(struct drm_device *dev, void *data,
struct xe_device *xe = to_xe_device(dev);
struct xe_file *xef = to_xe_file(file);
struct drm_xe_vm_create *args = data;
- struct xe_tile *tile;
struct xe_vm *vm;
- u32 id, asid;
+ u32 id;
int err;
u32 flags = 0;
@@ -2097,29 +2157,10 @@ int xe_vm_create_ioctl(struct drm_device *dev, void *data,
if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
flags |= XE_VM_FLAG_FAULT_MODE;
- vm = xe_vm_create(xe, flags);
+ vm = xe_vm_create(xe, flags, xef);
if (IS_ERR(vm))
return PTR_ERR(vm);
- if (xe->info.has_asid) {
- down_write(&xe->usm.lock);
- err = xa_alloc_cyclic(&xe->usm.asid_to_vm, &asid, vm,
- XA_LIMIT(1, XE_MAX_ASID - 1),
- &xe->usm.next_asid, GFP_KERNEL);
- up_write(&xe->usm.lock);
- if (err < 0)
- goto err_close_and_put;
-
- vm->usm.asid = asid;
- }
-
- vm->xef = xe_file_get(xef);
-
- /* Record BO memory for VM pagetable created against client */
- for_each_tile(tile, xe, id)
- if (vm->pt_root[id])
- xe_drm_client_add_bo(vm->xef->client, vm->pt_root[id]->bo);
-
#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_MEM)
/* Warning: Security issue - never enable by default */
args->reserved[0] = xe_bo_main_addr(vm->pt_root[0]->bo, XE_PAGE_SIZE);
@@ -3421,6 +3462,7 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe, struct xe_vm *vm,
free_bind_ops:
if (args->num_binds > 1)
kvfree(*bind_ops);
+ *bind_ops = NULL;
return err;
}
@@ -3527,7 +3569,7 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
struct xe_exec_queue *q = NULL;
u32 num_syncs, num_ufence = 0;
struct xe_sync_entry *syncs = NULL;
- struct drm_xe_vm_bind_op *bind_ops;
+ struct drm_xe_vm_bind_op *bind_ops = NULL;
struct xe_vma_ops vops;
struct dma_fence *fence;
int err;
diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h
index 3475a118f666..82b112795807 100644
--- a/drivers/gpu/drm/xe/xe_vm.h
+++ b/drivers/gpu/drm/xe/xe_vm.h
@@ -26,7 +26,7 @@ struct xe_sync_entry;
struct xe_svm_range;
struct drm_exec;
-struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags);
+struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags, struct xe_file *xef);
struct xe_vm *xe_vm_lookup(struct xe_file *xef, u32 id);
int xe_vma_cmp_vma_cb(const void *key, const struct rb_node *node);
@@ -273,6 +273,8 @@ struct dma_fence *xe_vm_bind_kernel_bo(struct xe_vm *vm, struct xe_bo *bo,
struct xe_exec_queue *q, u64 addr,
enum xe_cache_level cache_lvl);
+void xe_vm_resume_rebind_worker(struct xe_vm *vm);
+
/**
 * xe_vm_resv() - Return the vm's reservation object
* @vm: The vm
@@ -315,22 +317,14 @@ void xe_vm_snapshot_free(struct xe_vm_snapshot *snap);
* Register this task as currently making bos resident for the vm. Intended
* to avoid eviction by the same task of shared bos bound to the vm.
* Call with the vm's resv lock held.
- *
- * Return: A pin cookie that should be used for xe_vm_clear_validating().
*/
-static inline struct pin_cookie xe_vm_set_validating(struct xe_vm *vm,
- bool allow_res_evict)
+static inline void xe_vm_set_validating(struct xe_vm *vm, bool allow_res_evict)
{
- struct pin_cookie cookie = {};
-
if (vm && !allow_res_evict) {
xe_vm_assert_held(vm);
- cookie = lockdep_pin_lock(&xe_vm_resv(vm)->lock.base);
/* Pairs with READ_ONCE in xe_vm_is_validating() */
WRITE_ONCE(vm->validating, current);
}
-
- return cookie;
}
/**
@@ -338,17 +332,14 @@ static inline struct pin_cookie xe_vm_set_validating(struct xe_vm *vm,
* @vm: Pointer to the vm or NULL
* @allow_res_evict: Eviction from @vm was allowed. Must be set to the same
 * value as for xe_vm_set_validating().
- * @cookie: Cookie obtained from xe_vm_set_validating().
*
 * Unregister this task as currently making bos resident for the vm. Intended
* to avoid eviction by the same task of shared bos bound to the vm.
* Call with the vm's resv lock held.
*/
-static inline void xe_vm_clear_validating(struct xe_vm *vm, bool allow_res_evict,
- struct pin_cookie cookie)
+static inline void xe_vm_clear_validating(struct xe_vm *vm, bool allow_res_evict)
{
if (vm && !allow_res_evict) {
- lockdep_unpin_lock(&xe_vm_resv(vm)->lock.base, cookie);
/* Pairs with READ_ONCE in xe_vm_is_validating() */
WRITE_ONCE(vm->validating, NULL);
}
diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
index 8a07feef503b..6058cf739388 100644
--- a/drivers/gpu/drm/xe/xe_vm_types.h
+++ b/drivers/gpu/drm/xe/xe_vm_types.h
@@ -293,6 +293,11 @@ struct xe_vm {
* BOs
*/
struct work_struct rebind_work;
+ /**
+ * @preempt.pm_activate_link: Link to list of rebind workers to be
+ * kicked on resume.
+ */
+ struct list_head pm_activate_link;
} preempt;
/** @um: unified memory state */
diff --git a/drivers/gpu/drm/xe/xe_wa_oob.rules b/drivers/gpu/drm/xe/xe_wa_oob.rules
index e990f20eccfe..710f4423726c 100644
--- a/drivers/gpu/drm/xe/xe_wa_oob.rules
+++ b/drivers/gpu/drm/xe/xe_wa_oob.rules
@@ -30,7 +30,8 @@
16022287689 GRAPHICS_VERSION(2001)
GRAPHICS_VERSION(2004)
13011645652 GRAPHICS_VERSION(2004)
- GRAPHICS_VERSION(3001)
+ GRAPHICS_VERSION_RANGE(3000, 3001)
+ GRAPHICS_VERSION(3003)
14022293748 GRAPHICS_VERSION_RANGE(2001, 2002)
GRAPHICS_VERSION(2004)
GRAPHICS_VERSION_RANGE(3000, 3001)
diff --git a/drivers/gpu/nova-core/Kconfig b/drivers/gpu/nova-core/Kconfig
index 8726d80d6ba4..20d3e6d0d796 100644
--- a/drivers/gpu/nova-core/Kconfig
+++ b/drivers/gpu/nova-core/Kconfig
@@ -1,5 +1,6 @@
config NOVA_CORE
tristate "Nova Core GPU driver"
+ depends on 64BIT
depends on PCI
depends on RUST
depends on RUST_FW_LOADER_ABSTRACTIONS
diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig
index a57901203aeb..b934523593d9 100644
--- a/drivers/hid/Kconfig
+++ b/drivers/hid/Kconfig
@@ -597,8 +597,6 @@ config HID_LED
config HID_LENOVO
tristate "Lenovo / Thinkpad devices"
- depends on ACPI
- select ACPI_PLATFORM_PROFILE
select NEW_LEDS
select LEDS_CLASS
help
@@ -1243,7 +1241,7 @@ config HID_U2FZERO
U2F Zero supports custom commands for blinking the LED
and getting data from the internal hardware RNG.
- The internal hardware can be used to feed the enthropy pool.
+ The internal hardware can be used to feed the entropy pool.
U2F Zero only supports blinking its LED, so this driver doesn't
allow setting the brightness to anything but 1, which will
diff --git a/drivers/hid/amd-sfh-hid/amd_sfh_client.c b/drivers/hid/amd-sfh-hid/amd_sfh_client.c
index 0f2cbae39b2b..7017bfa59093 100644
--- a/drivers/hid/amd-sfh-hid/amd_sfh_client.c
+++ b/drivers/hid/amd-sfh-hid/amd_sfh_client.c
@@ -39,8 +39,12 @@ int amd_sfh_get_report(struct hid_device *hid, int report_id, int report_type)
struct amdtp_hid_data *hid_data = hid->driver_data;
struct amdtp_cl_data *cli_data = hid_data->cli_data;
struct request_list *req_list = &cli_data->req_list;
+ struct amd_input_data *in_data = cli_data->in_data;
+ struct amd_mp2_dev *mp2;
int i;
+ mp2 = container_of(in_data, struct amd_mp2_dev, in_data);
+ guard(mutex)(&mp2->lock);
for (i = 0; i < cli_data->num_hid_devices; i++) {
if (cli_data->hid_sensor_hubs[i] == hid) {
struct request_list *new = kzalloc(sizeof(*new), GFP_KERNEL);
@@ -75,6 +79,8 @@ void amd_sfh_work(struct work_struct *work)
u8 report_id, node_type;
u8 report_size = 0;
+ mp2 = container_of(in_data, struct amd_mp2_dev, in_data);
+ guard(mutex)(&mp2->lock);
req_node = list_last_entry(&req_list->list, struct request_list, list);
list_del(&req_node->list);
current_index = req_node->current_index;
@@ -83,7 +89,6 @@ void amd_sfh_work(struct work_struct *work)
node_type = req_node->report_type;
kfree(req_node);
- mp2 = container_of(in_data, struct amd_mp2_dev, in_data);
mp2_ops = mp2->mp2_ops;
if (node_type == HID_FEATURE_REPORT) {
report_size = mp2_ops->get_feat_rep(sensor_index, report_id,
@@ -107,6 +112,8 @@ void amd_sfh_work(struct work_struct *work)
cli_data->cur_hid_dev = current_index;
cli_data->sensor_requested_cnt[current_index] = 0;
amdtp_hid_wakeup(cli_data->hid_sensor_hubs[current_index]);
+ if (!list_empty(&req_list->list))
+ schedule_delayed_work(&cli_data->work, 0);
}
void amd_sfh_work_buffer(struct work_struct *work)
@@ -117,9 +124,10 @@ void amd_sfh_work_buffer(struct work_struct *work)
u8 report_size;
int i;
+ mp2 = container_of(in_data, struct amd_mp2_dev, in_data);
+ guard(mutex)(&mp2->lock);
for (i = 0; i < cli_data->num_hid_devices; i++) {
if (cli_data->sensor_sts[i] == SENSOR_ENABLED) {
- mp2 = container_of(in_data, struct amd_mp2_dev, in_data);
report_size = mp2->mp2_ops->get_in_rep(i, cli_data->sensor_idx[i],
cli_data->report_id[i], in_data);
hid_input_report(cli_data->hid_sensor_hubs[i], HID_INPUT_REPORT,
diff --git a/drivers/hid/amd-sfh-hid/amd_sfh_common.h b/drivers/hid/amd-sfh-hid/amd_sfh_common.h
index f44a3bb2fbd4..78f830c133e5 100644
--- a/drivers/hid/amd-sfh-hid/amd_sfh_common.h
+++ b/drivers/hid/amd-sfh-hid/amd_sfh_common.h
@@ -10,6 +10,7 @@
#ifndef AMD_SFH_COMMON_H
#define AMD_SFH_COMMON_H
+#include <linux/mutex.h>
#include <linux/pci.h>
#include "amd_sfh_hid.h"
@@ -59,6 +60,8 @@ struct amd_mp2_dev {
u32 mp2_acs;
struct sfh_dev_status dev_en;
struct work_struct work;
+ /* lock to protect mp2 device data */
+ struct mutex lock;
u8 init_done;
u8 rver;
};
diff --git a/drivers/hid/amd-sfh-hid/amd_sfh_pcie.c b/drivers/hid/amd-sfh-hid/amd_sfh_pcie.c
index 2983af969579..1d9f955573aa 100644
--- a/drivers/hid/amd-sfh-hid/amd_sfh_pcie.c
+++ b/drivers/hid/amd-sfh-hid/amd_sfh_pcie.c
@@ -466,6 +466,10 @@ static int amd_mp2_pci_probe(struct pci_dev *pdev, const struct pci_device_id *i
if (!privdata->cl_data)
return -ENOMEM;
+ rc = devm_mutex_init(&pdev->dev, &privdata->lock);
+ if (rc)
+ return rc;
+
privdata->sfh1_1_ops = (const struct amd_sfh1_1_ops *)id->driver_data;
if (privdata->sfh1_1_ops) {
if (boot_cpu_data.x86 >= 0x1A)
diff --git a/drivers/hid/hid-asus.c b/drivers/hid/hid-asus.c
index 4b45e31f0bab..8db9d4e7c3b0 100644
--- a/drivers/hid/hid-asus.c
+++ b/drivers/hid/hid-asus.c
@@ -974,7 +974,10 @@ static int asus_input_mapping(struct hid_device *hdev,
case 0xc4: asus_map_key_clear(KEY_KBDILLUMUP); break;
case 0xc5: asus_map_key_clear(KEY_KBDILLUMDOWN); break;
case 0xc7: asus_map_key_clear(KEY_KBDILLUMTOGGLE); break;
+ case 0x4e: asus_map_key_clear(KEY_FN_ESC); break;
+ case 0x7e: asus_map_key_clear(KEY_EMOJI_PICKER); break;
+ case 0x8b: asus_map_key_clear(KEY_PROG1); break; /* ProArt Creator Hub key */
case 0x6b: asus_map_key_clear(KEY_F21); break; /* ASUS touchpad toggle */
case 0x38: asus_map_key_clear(KEY_PROG1); break; /* ROG key */
case 0xba: asus_map_key_clear(KEY_PROG2); break; /* Fn+C ASUS Splendid */
@@ -1213,7 +1216,13 @@ static int asus_probe(struct hid_device *hdev, const struct hid_device_id *id)
return ret;
}
- if (!drvdata->input) {
+ /*
+ * Check that input registration succeeded. Checking that
+ * HID_CLAIMED_INPUT is set prevents a UAF when all input devices
+ * were freed during registration due to no usages being mapped,
+ * leaving drvdata->input pointing to freed memory.
+ */
+ if (!drvdata->input || !(hdev->claimed & HID_CLAIMED_INPUT)) {
hid_err(hdev, "Asus input not registered\n");
ret = -ENOMEM;
goto err_stop_hw;
diff --git a/drivers/hid/hid-cp2112.c b/drivers/hid/hid-cp2112.c
index 482f62a78c41..5a95ea3bec98 100644
--- a/drivers/hid/hid-cp2112.c
+++ b/drivers/hid/hid-cp2112.c
@@ -229,10 +229,12 @@ static int cp2112_gpio_set_unlocked(struct cp2112_device *dev,
ret = hid_hw_raw_request(hdev, CP2112_GPIO_SET, buf,
CP2112_GPIO_SET_LENGTH, HID_FEATURE_REPORT,
HID_REQ_SET_REPORT);
- if (ret < 0)
+ if (ret != CP2112_GPIO_SET_LENGTH) {
hid_err(hdev, "error setting GPIO values: %d\n", ret);
+ return ret < 0 ? ret : -EIO;
+ }
- return ret;
+ return 0;
}
static int cp2112_gpio_set(struct gpio_chip *chip, unsigned int offset,
@@ -309,9 +311,7 @@ static int cp2112_gpio_direction_output(struct gpio_chip *chip,
* Set gpio value when output direction is already set,
* as specified in AN495, Rev. 0.2, cpt. 4.4
*/
- cp2112_gpio_set_unlocked(dev, offset, value);
-
- return 0;
+ return cp2112_gpio_set_unlocked(dev, offset, value);
}
static int cp2112_hid_get(struct hid_device *hdev, unsigned char report_number,
diff --git a/drivers/hid/hid-elecom.c b/drivers/hid/hid-elecom.c
index 0ad7d25d9864..69771fd35006 100644
--- a/drivers/hid/hid-elecom.c
+++ b/drivers/hid/hid-elecom.c
@@ -101,6 +101,7 @@ static const __u8 *elecom_report_fixup(struct hid_device *hdev, __u8 *rdesc,
*/
mouse_button_fixup(hdev, rdesc, *rsize, 12, 30, 14, 20, 8);
break;
+ case USB_DEVICE_ID_ELECOM_M_DT2DRBK:
case USB_DEVICE_ID_ELECOM_M_HT1DRBK_011C:
/*
* Report descriptor format:
@@ -123,6 +124,7 @@ static const struct hid_device_id elecom_devices[] = {
{ HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT4DRBK) },
{ HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_DT1URBK) },
{ HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_DT1DRBK) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_DT2DRBK) },
{ HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_HT1URBK_010C) },
{ HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_HT1URBK_019B) },
{ HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_HT1DRBK_010D) },
diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h
index 5a1096283855..149798754570 100644
--- a/drivers/hid/hid-ids.h
+++ b/drivers/hid/hid-ids.h
@@ -451,6 +451,7 @@
#define USB_DEVICE_ID_ELECOM_M_XT4DRBK 0x00fd
#define USB_DEVICE_ID_ELECOM_M_DT1URBK 0x00fe
#define USB_DEVICE_ID_ELECOM_M_DT1DRBK 0x00ff
+#define USB_DEVICE_ID_ELECOM_M_DT2DRBK 0x018d
#define USB_DEVICE_ID_ELECOM_M_HT1URBK_010C 0x010c
#define USB_DEVICE_ID_ELECOM_M_HT1URBK_019B 0x019b
#define USB_DEVICE_ID_ELECOM_M_HT1DRBK_010D 0x010d
@@ -834,6 +835,8 @@
#define USB_DEVICE_ID_LENOVO_PIXART_USB_MOUSE_6019 0x6019
#define USB_DEVICE_ID_LENOVO_PIXART_USB_MOUSE_602E 0x602e
#define USB_DEVICE_ID_LENOVO_PIXART_USB_MOUSE_6093 0x6093
+#define USB_DEVICE_ID_LENOVO_LEGION_GO_DUAL_DINPUT 0x6184
+#define USB_DEVICE_ID_LENOVO_LEGION_GO2_DUAL_DINPUT 0x61ed
#define USB_VENDOR_ID_LETSKETCH 0x6161
#define USB_DEVICE_ID_WP9620N 0x4d15
@@ -907,6 +910,7 @@
#define USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_2 0xc534
#define USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED_1 0xc539
#define USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED_1_1 0xc53f
+#define USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED_1_2 0xc543
#define USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_POWERPLAY 0xc53a
#define USB_DEVICE_ID_LOGITECH_BOLT_RECEIVER 0xc548
#define USB_DEVICE_ID_SPACETRAVELLER 0xc623
diff --git a/drivers/hid/hid-input-test.c b/drivers/hid/hid-input-test.c
index 77c2d45ac62a..6f5c71660d82 100644
--- a/drivers/hid/hid-input-test.c
+++ b/drivers/hid/hid-input-test.c
@@ -7,7 +7,7 @@
#include <kunit/test.h>
-static void hid_test_input_set_battery_charge_status(struct kunit *test)
+static void hid_test_input_update_battery_charge_status(struct kunit *test)
{
struct hid_device *dev;
bool handled;
@@ -15,15 +15,15 @@ static void hid_test_input_set_battery_charge_status(struct kunit *test)
dev = kunit_kzalloc(test, sizeof(*dev), GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev);
- handled = hidinput_set_battery_charge_status(dev, HID_DG_HEIGHT, 0);
+ handled = hidinput_update_battery_charge_status(dev, HID_DG_HEIGHT, 0);
KUNIT_EXPECT_FALSE(test, handled);
KUNIT_EXPECT_EQ(test, dev->battery_charge_status, POWER_SUPPLY_STATUS_UNKNOWN);
- handled = hidinput_set_battery_charge_status(dev, HID_BAT_CHARGING, 0);
+ handled = hidinput_update_battery_charge_status(dev, HID_BAT_CHARGING, 0);
KUNIT_EXPECT_TRUE(test, handled);
KUNIT_EXPECT_EQ(test, dev->battery_charge_status, POWER_SUPPLY_STATUS_DISCHARGING);
- handled = hidinput_set_battery_charge_status(dev, HID_BAT_CHARGING, 1);
+ handled = hidinput_update_battery_charge_status(dev, HID_BAT_CHARGING, 1);
KUNIT_EXPECT_TRUE(test, handled);
KUNIT_EXPECT_EQ(test, dev->battery_charge_status, POWER_SUPPLY_STATUS_CHARGING);
}
@@ -63,7 +63,7 @@ static void hid_test_input_get_battery_property(struct kunit *test)
}
static struct kunit_case hid_input_tests[] = {
- KUNIT_CASE(hid_test_input_set_battery_charge_status),
+ KUNIT_CASE(hid_test_input_update_battery_charge_status),
KUNIT_CASE(hid_test_input_get_battery_property),
{ }
};
diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c
index ff1784b5c2a4..f45f856a127f 100644
--- a/drivers/hid/hid-input.c
+++ b/drivers/hid/hid-input.c
@@ -595,13 +595,33 @@ static void hidinput_cleanup_battery(struct hid_device *dev)
dev->battery = NULL;
}
-static void hidinput_update_battery(struct hid_device *dev, int value)
+static bool hidinput_update_battery_charge_status(struct hid_device *dev,
+ unsigned int usage, int value)
+{
+ switch (usage) {
+ case HID_BAT_CHARGING:
+ dev->battery_charge_status = value ?
+ POWER_SUPPLY_STATUS_CHARGING :
+ POWER_SUPPLY_STATUS_DISCHARGING;
+ return true;
+ }
+
+ return false;
+}
+
+static void hidinput_update_battery(struct hid_device *dev, unsigned int usage,
+ int value)
{
int capacity;
if (!dev->battery)
return;
+ if (hidinput_update_battery_charge_status(dev, usage, value)) {
+ power_supply_changed(dev->battery);
+ return;
+ }
+
if (value == 0 || value < dev->battery_min || value > dev->battery_max)
return;
@@ -617,20 +637,6 @@ static void hidinput_update_battery(struct hid_device *dev, int value)
power_supply_changed(dev->battery);
}
}
-
-static bool hidinput_set_battery_charge_status(struct hid_device *dev,
- unsigned int usage, int value)
-{
- switch (usage) {
- case HID_BAT_CHARGING:
- dev->battery_charge_status = value ?
- POWER_SUPPLY_STATUS_CHARGING :
- POWER_SUPPLY_STATUS_DISCHARGING;
- return true;
- }
-
- return false;
-}
#else /* !CONFIG_HID_BATTERY_STRENGTH */
static int hidinput_setup_battery(struct hid_device *dev, unsigned report_type,
struct hid_field *field, bool is_percentage)
@@ -642,14 +648,9 @@ static void hidinput_cleanup_battery(struct hid_device *dev)
{
}
-static void hidinput_update_battery(struct hid_device *dev, int value)
-{
-}
-
-static bool hidinput_set_battery_charge_status(struct hid_device *dev,
- unsigned int usage, int value)
+static void hidinput_update_battery(struct hid_device *dev, unsigned int usage,
+ int value)
{
- return false;
}
#endif /* CONFIG_HID_BATTERY_STRENGTH */
@@ -1515,11 +1516,7 @@ void hidinput_hid_event(struct hid_device *hid, struct hid_field *field, struct
return;
if (usage->type == EV_PWR) {
- bool handled = hidinput_set_battery_charge_status(hid, usage->hid, value);
-
- if (!handled)
- hidinput_update_battery(hid, value);
-
+ hidinput_update_battery(hid, usage->hid, value);
return;
}
diff --git a/drivers/hid/hid-lenovo.c b/drivers/hid/hid-lenovo.c
index b3121fa7a72d..654879814f97 100644
--- a/drivers/hid/hid-lenovo.c
+++ b/drivers/hid/hid-lenovo.c
@@ -32,8 +32,6 @@
#include <linux/leds.h>
#include <linux/workqueue.h>
-#include <linux/platform_profile.h>
-
#include "hid-ids.h"
/* Userspace expects F20 for mic-mute KEY_MICMUTE does not work */
@@ -734,7 +732,7 @@ static int lenovo_raw_event_TP_X12_tab(struct hid_device *hdev, u32 raw_data)
report_key_event(input, KEY_RFKILL);
return 1;
}
- platform_profile_cycle();
+ report_key_event(input, KEY_PERFORMANCE);
return 1;
case TP_X12_RAW_HOTKEY_FN_F10:
/* TAB1 has PICKUP Phone and TAB2 use Snipping tool*/
diff --git a/drivers/hid/hid-logitech-dj.c b/drivers/hid/hid-logitech-dj.c
index 34fa71ceec2b..cce54dd9884a 100644
--- a/drivers/hid/hid-logitech-dj.c
+++ b/drivers/hid/hid-logitech-dj.c
@@ -1983,6 +1983,10 @@ static const struct hid_device_id logi_dj_receivers[] = {
HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH,
USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED_1_1),
.driver_data = recvr_type_gaming_hidpp},
+ { /* Logitech lightspeed receiver (0xc543) */
+ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH,
+ USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED_1_2),
+ .driver_data = recvr_type_gaming_hidpp},
{ /* Logitech 27 MHz HID++ 1.0 receiver (0xc513) */
HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_MX3000_RECEIVER),
diff --git a/drivers/hid/hid-logitech-hidpp.c b/drivers/hid/hid-logitech-hidpp.c
index 10a3bc5f931b..aaef405a717e 100644
--- a/drivers/hid/hid-logitech-hidpp.c
+++ b/drivers/hid/hid-logitech-hidpp.c
@@ -4596,6 +4596,8 @@ static const struct hid_device_id hidpp_devices[] = {
HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, 0xC094) },
{ /* Logitech G Pro X Superlight 2 Gaming Mouse over USB */
HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, 0xC09b) },
+ { /* Logitech G PRO 2 LIGHTSPEED Wireless Mouse over USB */
+ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, 0xc09a) },
{ /* G935 Gaming Headset */
HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, 0x0a87),
diff --git a/drivers/hid/hid-mcp2221.c b/drivers/hid/hid-mcp2221.c
index 475ac352df30..33603b019f97 100644
--- a/drivers/hid/hid-mcp2221.c
+++ b/drivers/hid/hid-mcp2221.c
@@ -906,6 +906,10 @@ static int mcp2221_raw_event(struct hid_device *hdev,
}
if (data[2] == MCP2221_I2C_READ_COMPL ||
data[2] == MCP2221_I2C_READ_PARTIAL) {
+ if (!mcp->rxbuf || mcp->rxbuf_idx < 0 || data[3] > 60) {
+ mcp->status = -EINVAL;
+ break;
+ }
buf = mcp->rxbuf;
memcpy(&buf[mcp->rxbuf_idx], &data[4], data[3]);
mcp->rxbuf_idx = mcp->rxbuf_idx + data[3];
diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c
index 294516a8f541..22c6314a8843 100644
--- a/drivers/hid/hid-multitouch.c
+++ b/drivers/hid/hid-multitouch.c
@@ -1503,6 +1503,14 @@ static const __u8 *mt_report_fixup(struct hid_device *hdev, __u8 *rdesc,
if (hdev->vendor == I2C_VENDOR_ID_GOODIX &&
(hdev->product == I2C_DEVICE_ID_GOODIX_01E8 ||
hdev->product == I2C_DEVICE_ID_GOODIX_01E9)) {
+ if (*size < 608) {
+ dev_info(
+ &hdev->dev,
+ "GT7868Q fixup: report descriptor is only %u bytes, skipping\n",
+ *size);
+ return rdesc;
+ }
+
if (rdesc[607] == 0x15) {
rdesc[607] = 0x25;
dev_info(
diff --git a/drivers/hid/hid-ntrig.c b/drivers/hid/hid-ntrig.c
index 2738ce947434..0f76e241e0af 100644
--- a/drivers/hid/hid-ntrig.c
+++ b/drivers/hid/hid-ntrig.c
@@ -144,6 +144,9 @@ static void ntrig_report_version(struct hid_device *hdev)
struct usb_device *usb_dev = hid_to_usb_dev(hdev);
unsigned char *data = kmalloc(8, GFP_KERNEL);
+ if (!hid_is_usb(hdev))
+ return;
+
if (!data)
goto err_free;
diff --git a/drivers/hid/hid-quirks.c b/drivers/hid/hid-quirks.c
index ff11f1ad344d..f619ed10535d 100644
--- a/drivers/hid/hid-quirks.c
+++ b/drivers/hid/hid-quirks.c
@@ -124,6 +124,8 @@ static const struct hid_device_id hid_quirks[] = {
{ HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_MOUSEPEN_I608X_V2), HID_QUIRK_MULTI_INPUT },
{ HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_PENSKETCH_T609A), HID_QUIRK_MULTI_INPUT },
{ HID_USB_DEVICE(USB_VENDOR_ID_LABTEC, USB_DEVICE_ID_LABTEC_ODDOR_HANDBRAKE), HID_QUIRK_ALWAYS_POLL },
+ { HID_USB_DEVICE(USB_VENDOR_ID_LENOVO, USB_DEVICE_ID_LENOVO_LEGION_GO_DUAL_DINPUT), HID_QUIRK_MULTI_INPUT },
+ { HID_USB_DEVICE(USB_VENDOR_ID_LENOVO, USB_DEVICE_ID_LENOVO_LEGION_GO2_DUAL_DINPUT), HID_QUIRK_MULTI_INPUT },
{ HID_USB_DEVICE(USB_VENDOR_ID_LENOVO, USB_DEVICE_ID_LENOVO_OPTICAL_USB_MOUSE_600E), HID_QUIRK_ALWAYS_POLL },
{ HID_USB_DEVICE(USB_VENDOR_ID_LENOVO, USB_DEVICE_ID_LENOVO_PIXART_USB_MOUSE_608D), HID_QUIRK_ALWAYS_POLL },
{ HID_USB_DEVICE(USB_VENDOR_ID_LENOVO, USB_DEVICE_ID_LENOVO_PIXART_USB_MOUSE_6019), HID_QUIRK_ALWAYS_POLL },
@@ -411,6 +413,7 @@ static const struct hid_device_id hid_have_special_driver[] = {
{ HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT4DRBK) },
{ HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_DT1URBK) },
{ HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_DT1DRBK) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_DT2DRBK) },
{ HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_HT1URBK_010C) },
{ HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_HT1URBK_019B) },
{ HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_HT1DRBK_010D) },
diff --git a/drivers/hid/intel-ish-hid/ipc/pci-ish.c b/drivers/hid/intel-ish-hid/ipc/pci-ish.c
index c57483224db6..9d150ce234f2 100644
--- a/drivers/hid/intel-ish-hid/ipc/pci-ish.c
+++ b/drivers/hid/intel-ish-hid/ipc/pci-ish.c
@@ -264,9 +264,6 @@ static void ish_shutdown(struct pci_dev *pdev)
static struct device __maybe_unused *ish_resume_device;
-/* 50ms to get resume response */
-#define WAIT_FOR_RESUME_ACK_MS 50
-
/**
* ish_resume_handler() - Work function to complete resume
* @work: work struct
diff --git a/drivers/hid/intel-ish-hid/ishtp-hid-client.c b/drivers/hid/intel-ish-hid/ishtp-hid-client.c
index 6550ad5bfbb5..d8c3c54a8c0f 100644
--- a/drivers/hid/intel-ish-hid/ishtp-hid-client.c
+++ b/drivers/hid/intel-ish-hid/ishtp-hid-client.c
@@ -759,6 +759,9 @@ static void hid_ishtp_cl_resume_handler(struct work_struct *work)
if (ishtp_wait_resume(ishtp_get_ishtp_device(hid_ishtp_cl))) {
client_data->suspended = false;
wake_up_interruptible(&client_data->ishtp_resume_wait);
+ } else {
+ hid_ishtp_trace(client_data, "hid client: wait for resume timed out");
+ dev_err(cl_data_to_dev(client_data), "wait for resume timed out");
}
}
diff --git a/drivers/hid/intel-ish-hid/ishtp/bus.c b/drivers/hid/intel-ish-hid/ishtp/bus.c
index 5ac7d70a7c84..93a0432e7058 100644
--- a/drivers/hid/intel-ish-hid/ishtp/bus.c
+++ b/drivers/hid/intel-ish-hid/ishtp/bus.c
@@ -852,9 +852,6 @@ EXPORT_SYMBOL(ishtp_device);
*/
bool ishtp_wait_resume(struct ishtp_device *dev)
{
- /* 50ms to get resume response */
- #define WAIT_FOR_RESUME_ACK_MS 50
-
/* Waiting to get resume response */
if (dev->resume_flag)
wait_event_interruptible_timeout(dev->resume_wait,
diff --git a/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h b/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h
index ec9f6e87aaf2..23db97ecf21c 100644
--- a/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h
+++ b/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h
@@ -47,6 +47,9 @@
#define MAX_DMA_DELAY 20
+/* 300ms to get resume response */
+#define WAIT_FOR_RESUME_ACK_MS 300
+
/* ISHTP device states */
enum ishtp_dev_state {
ISHTP_DEV_INITIALIZING = 0,
diff --git a/drivers/hid/intel-thc-hid/intel-quicki2c/pci-quicki2c.c b/drivers/hid/intel-thc-hid/intel-quicki2c/pci-quicki2c.c
index e944a6ccb776..a2643ae790d6 100644
--- a/drivers/hid/intel-thc-hid/intel-quicki2c/pci-quicki2c.c
+++ b/drivers/hid/intel-thc-hid/intel-quicki2c/pci-quicki2c.c
@@ -419,6 +419,7 @@ static struct quicki2c_device *quicki2c_dev_init(struct pci_dev *pdev, void __io
*/
static void quicki2c_dev_deinit(struct quicki2c_device *qcdev)
{
+ thc_interrupt_quiesce(qcdev->thc_hw, true);
thc_interrupt_enable(qcdev->thc_hw, false);
thc_ltr_unconfig(qcdev->thc_hw);
thc_wot_unconfig(qcdev->thc_hw);
@@ -996,6 +997,8 @@ static const struct pci_device_id quicki2c_pci_tbl[] = {
{ PCI_DEVICE_DATA(INTEL, THC_PTL_H_DEVICE_ID_I2C_PORT2, &ptl_ddata) },
{ PCI_DEVICE_DATA(INTEL, THC_PTL_U_DEVICE_ID_I2C_PORT1, &ptl_ddata) },
{ PCI_DEVICE_DATA(INTEL, THC_PTL_U_DEVICE_ID_I2C_PORT2, &ptl_ddata) },
+ { PCI_DEVICE_DATA(INTEL, THC_WCL_DEVICE_ID_I2C_PORT1, &ptl_ddata) },
+ { PCI_DEVICE_DATA(INTEL, THC_WCL_DEVICE_ID_I2C_PORT2, &ptl_ddata) },
{ }
};
MODULE_DEVICE_TABLE(pci, quicki2c_pci_tbl);
diff --git a/drivers/hid/intel-thc-hid/intel-quicki2c/quicki2c-dev.h b/drivers/hid/intel-thc-hid/intel-quicki2c/quicki2c-dev.h
index 93d6fa982d60..4e60a7de4727 100644
--- a/drivers/hid/intel-thc-hid/intel-quicki2c/quicki2c-dev.h
+++ b/drivers/hid/intel-thc-hid/intel-quicki2c/quicki2c-dev.h
@@ -13,6 +13,8 @@
#define PCI_DEVICE_ID_INTEL_THC_PTL_H_DEVICE_ID_I2C_PORT2 0xE34A
#define PCI_DEVICE_ID_INTEL_THC_PTL_U_DEVICE_ID_I2C_PORT1 0xE448
#define PCI_DEVICE_ID_INTEL_THC_PTL_U_DEVICE_ID_I2C_PORT2 0xE44A
+#define PCI_DEVICE_ID_INTEL_THC_WCL_DEVICE_ID_I2C_PORT1 0x4D48
+#define PCI_DEVICE_ID_INTEL_THC_WCL_DEVICE_ID_I2C_PORT2 0x4D4A
/* Packet size value, the unit is 16 bytes */
#define MAX_PACKET_SIZE_VALUE_LNL 256
@@ -77,6 +79,7 @@ struct quicki2c_subip_acpi_parameter {
u16 device_address;
u64 connection_speed;
u8 addressing_mode;
+ u8 reserved;
} __packed;
/**
@@ -126,6 +129,7 @@ struct quicki2c_subip_acpi_config {
u64 HMTD;
u64 HMRD;
u64 HMSL;
+ u8 reserved;
};
/**
diff --git a/drivers/hid/intel-thc-hid/intel-quickspi/pci-quickspi.c b/drivers/hid/intel-thc-hid/intel-quickspi/pci-quickspi.c
index 5e5f179dd113..84314989dc53 100644
--- a/drivers/hid/intel-thc-hid/intel-quickspi/pci-quickspi.c
+++ b/drivers/hid/intel-thc-hid/intel-quickspi/pci-quickspi.c
@@ -976,6 +976,8 @@ static const struct pci_device_id quickspi_pci_tbl[] = {
{PCI_DEVICE_DATA(INTEL, THC_PTL_H_DEVICE_ID_SPI_PORT2, &ptl), },
{PCI_DEVICE_DATA(INTEL, THC_PTL_U_DEVICE_ID_SPI_PORT1, &ptl), },
{PCI_DEVICE_DATA(INTEL, THC_PTL_U_DEVICE_ID_SPI_PORT2, &ptl), },
+ {PCI_DEVICE_DATA(INTEL, THC_WCL_DEVICE_ID_SPI_PORT1, &ptl), },
+ {PCI_DEVICE_DATA(INTEL, THC_WCL_DEVICE_ID_SPI_PORT2, &ptl), },
{}
};
MODULE_DEVICE_TABLE(pci, quickspi_pci_tbl);
diff --git a/drivers/hid/intel-thc-hid/intel-quickspi/quickspi-dev.h b/drivers/hid/intel-thc-hid/intel-quickspi/quickspi-dev.h
index 6fdf674b21c5..f3532d866749 100644
--- a/drivers/hid/intel-thc-hid/intel-quickspi/quickspi-dev.h
+++ b/drivers/hid/intel-thc-hid/intel-quickspi/quickspi-dev.h
@@ -19,6 +19,8 @@
#define PCI_DEVICE_ID_INTEL_THC_PTL_H_DEVICE_ID_SPI_PORT2 0xE34B
#define PCI_DEVICE_ID_INTEL_THC_PTL_U_DEVICE_ID_SPI_PORT1 0xE449
#define PCI_DEVICE_ID_INTEL_THC_PTL_U_DEVICE_ID_SPI_PORT2 0xE44B
+#define PCI_DEVICE_ID_INTEL_THC_WCL_DEVICE_ID_SPI_PORT1 0x4D49
+#define PCI_DEVICE_ID_INTEL_THC_WCL_DEVICE_ID_SPI_PORT2 0x4D4B
/* HIDSPI special ACPI parameters DSM methods */
#define ACPI_QUICKSPI_REVISION_NUM 2
diff --git a/drivers/hid/intel-thc-hid/intel-thc/intel-thc-dev.c b/drivers/hid/intel-thc-hid/intel-thc/intel-thc-dev.c
index 6f2263869b20..e1cb9b117ebc 100644
--- a/drivers/hid/intel-thc-hid/intel-thc/intel-thc-dev.c
+++ b/drivers/hid/intel-thc-hid/intel-thc/intel-thc-dev.c
@@ -1540,7 +1540,7 @@ int thc_i2c_subip_regs_save(struct thc_device *dev)
for (int i = 0; i < ARRAY_SIZE(i2c_subip_regs); i++) {
ret = thc_i2c_subip_pio_read(dev, i2c_subip_regs[i],
- &read_size, (u32 *)&dev->i2c_subip_regs + i);
+ &read_size, &dev->i2c_subip_regs[i]);
if (ret < 0)
return ret;
}
@@ -1563,7 +1563,7 @@ int thc_i2c_subip_regs_restore(struct thc_device *dev)
for (int i = 0; i < ARRAY_SIZE(i2c_subip_regs); i++) {
ret = thc_i2c_subip_pio_write(dev, i2c_subip_regs[i],
- write_size, (u32 *)&dev->i2c_subip_regs + i);
+ write_size, &dev->i2c_subip_regs[i]);
if (ret < 0)
return ret;
}
diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c
index 955b39d22524..9b2c710f8da1 100644
--- a/drivers/hid/wacom_wac.c
+++ b/drivers/hid/wacom_wac.c
@@ -684,6 +684,7 @@ static bool wacom_is_art_pen(int tool_id)
case 0x885: /* Intuos3 Marker Pen */
case 0x804: /* Intuos4/5 13HD/24HD Marker Pen */
case 0x10804: /* Intuos4/5 13HD/24HD Art Pen */
+ case 0x204: /* Art Pen 2 */
is_art_pen = true;
break;
}
diff --git a/drivers/hwmon/ina238.c b/drivers/hwmon/ina238.c
index 5a394eeff676..59a2c8889fa2 100644
--- a/drivers/hwmon/ina238.c
+++ b/drivers/hwmon/ina238.c
@@ -379,7 +379,7 @@ static int ina238_write_in(struct device *dev, u32 attr, int channel,
regval = clamp_val(val, -163, 163);
regval = (regval * 1000 * 4) /
(INA238_SHUNT_VOLTAGE_LSB * data->gain);
- regval = clamp_val(regval, S16_MIN, S16_MAX);
+ regval = clamp_val(regval, S16_MIN, S16_MAX) & 0xffff;
switch (attr) {
case hwmon_in_max:
@@ -517,9 +517,10 @@ static int ina238_write_power(struct device *dev, u32 attr, long val)
 * Unsigned positive values. Compared against the 24-bit power register,
* lower 8-bits are truncated. Same conversion to/from uW as POWER
* register.
+ * The first clamp_val() is to establish a baseline to avoid overflows.
*/
- regval = clamp_val(val, 0, LONG_MAX);
- regval = div_u64(val * 4 * 100 * data->rshunt, data->config->power_calculate_factor *
+ regval = clamp_val(val, 0, LONG_MAX / 2);
+ regval = div_u64(regval * 4 * 100 * data->rshunt, data->config->power_calculate_factor *
1000ULL * INA238_FIXED_SHUNT * data->gain);
regval = clamp_val(regval >> 8, 0, U16_MAX);
@@ -572,7 +573,7 @@ static int ina238_write_temp(struct device *dev, u32 attr, long val)
return -EOPNOTSUPP;
/* Signed */
- regval = clamp_val(val, -40000, 125000);
+ val = clamp_val(val, -40000, 125000);
regval = div_s64(val * 10000, data->config->temp_lsb) << data->config->temp_shift;
regval = clamp_val(regval, S16_MIN, S16_MAX) & (0xffff << data->config->temp_shift);
diff --git a/drivers/hwmon/mlxreg-fan.c b/drivers/hwmon/mlxreg-fan.c
index a5f89aab3fb4..c25a54d5b39a 100644
--- a/drivers/hwmon/mlxreg-fan.c
+++ b/drivers/hwmon/mlxreg-fan.c
@@ -561,15 +561,14 @@ static int mlxreg_fan_cooling_config(struct device *dev, struct mlxreg_fan *fan)
if (!pwm->connected)
continue;
pwm->fan = fan;
+ /* Set minimal PWM speed. */
+ pwm->last_hwmon_state = MLXREG_FAN_PWM_DUTY2STATE(MLXREG_FAN_MIN_DUTY);
pwm->cdev = devm_thermal_of_cooling_device_register(dev, NULL, mlxreg_fan_name[i],
pwm, &mlxreg_fan_cooling_ops);
if (IS_ERR(pwm->cdev)) {
dev_err(dev, "Failed to register cooling device\n");
return PTR_ERR(pwm->cdev);
}
-
- /* Set minimal PWM speed. */
- pwm->last_hwmon_state = MLXREG_FAN_PWM_DUTY2STATE(MLXREG_FAN_MIN_DUTY);
}
return 0;
diff --git a/drivers/hwtracing/coresight/coresight-trbe.c b/drivers/hwtracing/coresight/coresight-trbe.c
index 8267dd1a2130..8f426f94e32a 100644
--- a/drivers/hwtracing/coresight/coresight-trbe.c
+++ b/drivers/hwtracing/coresight/coresight-trbe.c
@@ -23,7 +23,8 @@
#include "coresight-self-hosted-trace.h"
#include "coresight-trbe.h"
-#define PERF_IDX2OFF(idx, buf) ((idx) % ((buf)->nr_pages << PAGE_SHIFT))
+#define PERF_IDX2OFF(idx, buf) \
+ ((idx) % ((unsigned long)(buf)->nr_pages << PAGE_SHIFT))
/*
* A padding packet that will help the user space tools
diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c
index a7f89946dad4..e94ac746a741 100644
--- a/drivers/i2c/busses/i2c-i801.c
+++ b/drivers/i2c/busses/i2c-i801.c
@@ -1052,7 +1052,7 @@ static const struct pci_device_id i801_ids[] = {
{ PCI_DEVICE_DATA(INTEL, METEOR_LAKE_P_SMBUS, FEATURES_ICH5 | FEATURE_TCO_CNL) },
{ PCI_DEVICE_DATA(INTEL, METEOR_LAKE_SOC_S_SMBUS, FEATURES_ICH5 | FEATURE_TCO_CNL) },
{ PCI_DEVICE_DATA(INTEL, METEOR_LAKE_PCH_S_SMBUS, FEATURES_ICH5 | FEATURE_TCO_CNL) },
- { PCI_DEVICE_DATA(INTEL, BIRCH_STREAM_SMBUS, FEATURES_ICH5 | FEATURE_TCO_CNL) },
+ { PCI_DEVICE_DATA(INTEL, BIRCH_STREAM_SMBUS, FEATURES_ICH5) },
{ PCI_DEVICE_DATA(INTEL, ARROW_LAKE_H_SMBUS, FEATURES_ICH5 | FEATURE_TCO_CNL) },
{ PCI_DEVICE_DATA(INTEL, PANTHER_LAKE_H_SMBUS, FEATURES_ICH5 | FEATURE_TCO_CNL) },
{ PCI_DEVICE_DATA(INTEL, PANTHER_LAKE_P_SMBUS, FEATURES_ICH5 | FEATURE_TCO_CNL) },
diff --git a/drivers/i2c/busses/i2c-riic.c b/drivers/i2c/busses/i2c-riic.c
index 9c164a4b9bb9..b0ee9ac45a97 100644
--- a/drivers/i2c/busses/i2c-riic.c
+++ b/drivers/i2c/busses/i2c-riic.c
@@ -386,7 +386,7 @@ static int riic_init_hw(struct riic_dev *riic)
*/
total_ticks = DIV_ROUND_UP(rate, t->bus_freq_hz ?: 1);
- for (cks = 0; cks < 7; cks++) {
+ for (cks = 0; cks <= 7; cks++) {
/*
* 60% low time must be less than BRL + 2 + 1
* BRL max register value is 0x1F.
diff --git a/drivers/i2c/busses/i2c-rtl9300.c b/drivers/i2c/busses/i2c-rtl9300.c
index e064e8a4a1f0..af991b28e4f8 100644
--- a/drivers/i2c/busses/i2c-rtl9300.c
+++ b/drivers/i2c/busses/i2c-rtl9300.c
@@ -99,6 +99,9 @@ static int rtl9300_i2c_config_xfer(struct rtl9300_i2c *i2c, struct rtl9300_i2c_c
{
u32 val, mask;
+ if (len < 1 || len > 16)
+ return -EINVAL;
+
val = chan->bus_freq << RTL9300_I2C_MST_CTRL2_SCL_FREQ_OFS;
mask = RTL9300_I2C_MST_CTRL2_SCL_FREQ_MASK;
@@ -143,10 +146,10 @@ static int rtl9300_i2c_write(struct rtl9300_i2c *i2c, u8 *buf, int len)
return -EIO;
for (i = 0; i < len; i++) {
- if (i % 4 == 0)
- vals[i/4] = 0;
- vals[i/4] <<= 8;
- vals[i/4] |= buf[i];
+ unsigned int shift = (i % 4) * 8;
+ unsigned int reg = i / 4;
+
+ vals[reg] |= buf[i] << shift;
}
return regmap_bulk_write(i2c->regmap, i2c->reg_base + RTL9300_I2C_MST_DATA_WORD0,
@@ -175,7 +178,7 @@ static int rtl9300_i2c_execute_xfer(struct rtl9300_i2c *i2c, char read_write,
return ret;
ret = regmap_read_poll_timeout(i2c->regmap, i2c->reg_base + RTL9300_I2C_MST_CTRL1,
- val, !(val & RTL9300_I2C_MST_CTRL1_I2C_TRIG), 100, 2000);
+ val, !(val & RTL9300_I2C_MST_CTRL1_I2C_TRIG), 100, 100000);
if (ret)
return ret;
@@ -222,15 +225,6 @@ static int rtl9300_i2c_smbus_xfer(struct i2c_adapter *adap, u16 addr, unsigned s
}
switch (size) {
- case I2C_SMBUS_QUICK:
- ret = rtl9300_i2c_config_xfer(i2c, chan, addr, 0);
- if (ret)
- goto out_unlock;
- ret = rtl9300_i2c_reg_addr_set(i2c, 0, 0);
- if (ret)
- goto out_unlock;
- break;
-
case I2C_SMBUS_BYTE:
if (read_write == I2C_SMBUS_WRITE) {
ret = rtl9300_i2c_config_xfer(i2c, chan, addr, 0);
@@ -281,15 +275,19 @@ static int rtl9300_i2c_smbus_xfer(struct i2c_adapter *adap, u16 addr, unsigned s
ret = rtl9300_i2c_reg_addr_set(i2c, command, 1);
if (ret)
goto out_unlock;
- ret = rtl9300_i2c_config_xfer(i2c, chan, addr, data->block[0]);
+ if (data->block[0] < 1 || data->block[0] > I2C_SMBUS_BLOCK_MAX) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ ret = rtl9300_i2c_config_xfer(i2c, chan, addr, data->block[0] + 1);
if (ret)
goto out_unlock;
if (read_write == I2C_SMBUS_WRITE) {
- ret = rtl9300_i2c_write(i2c, &data->block[1], data->block[0]);
+ ret = rtl9300_i2c_write(i2c, &data->block[0], data->block[0] + 1);
if (ret)
goto out_unlock;
}
- len = data->block[0];
+ len = data->block[0] + 1;
break;
default:
@@ -308,9 +306,8 @@ out_unlock:
static u32 rtl9300_i2c_func(struct i2c_adapter *a)
{
- return I2C_FUNC_SMBUS_QUICK | I2C_FUNC_SMBUS_BYTE |
- I2C_FUNC_SMBUS_BYTE_DATA | I2C_FUNC_SMBUS_WORD_DATA |
- I2C_FUNC_SMBUS_BLOCK_DATA;
+ return I2C_FUNC_SMBUS_BYTE | I2C_FUNC_SMBUS_BYTE_DATA |
+ I2C_FUNC_SMBUS_WORD_DATA | I2C_FUNC_SMBUS_BLOCK_DATA;
}
static const struct i2c_algorithm rtl9300_i2c_algo = {
@@ -319,7 +316,7 @@ static const struct i2c_algorithm rtl9300_i2c_algo = {
};
static struct i2c_adapter_quirks rtl9300_i2c_quirks = {
- .flags = I2C_AQ_NO_CLK_STRETCH,
+ .flags = I2C_AQ_NO_CLK_STRETCH | I2C_AQ_NO_ZERO_LEN,
.max_read_len = 16,
.max_write_len = 16,
};
@@ -349,7 +346,7 @@ static int rtl9300_i2c_probe(struct platform_device *pdev)
platform_set_drvdata(pdev, i2c);
- if (device_get_child_node_count(dev) >= RTL9300_I2C_MUX_NCHAN)
+ if (device_get_child_node_count(dev) > RTL9300_I2C_MUX_NCHAN)
return dev_err_probe(dev, -EINVAL, "Too many channels\n");
device_for_each_child_node(dev, child) {
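For reference, a standalone sketch of what the rewritten write loop above computes: each byte lands in the data word that covers its offset, least significant byte first, so no per-word pre-clear or reshuffling is needed as long as the array starts out zeroed (function and variable names here are illustrative):

#include <linux/string.h>
#include <linux/types.h>

static void pack_le_words(const u8 *buf, unsigned int len, u32 vals[4])
{
	unsigned int i;

	memset(vals, 0, 4 * sizeof(*vals));
	for (i = 0; i < len && i < 16; i++)
		vals[i / 4] |= (u32)buf[i] << ((i % 4) * 8);
}

With buf = {0x11, 0x22, 0x33, 0x44, 0x55} this yields vals[0] = 0x44332211 and vals[1] = 0x00000055.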
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 73747d20df85..91a7b7e7c0c8 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -1679,7 +1679,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
};
static const struct x86_cpu_id intel_mwait_ids[] __initconst = {
- X86_MATCH_VENDOR_FAM_FEATURE(INTEL, 6, X86_FEATURE_MWAIT, NULL),
+ X86_MATCH_VENDOR_FAM_FEATURE(INTEL, X86_FAMILY_ANY, X86_FEATURE_MWAIT, NULL),
{}
};
diff --git a/drivers/iio/accel/sca3300.c b/drivers/iio/accel/sca3300.c
index bda370c0f660..8380b237831c 100644
--- a/drivers/iio/accel/sca3300.c
+++ b/drivers/iio/accel/sca3300.c
@@ -477,7 +477,7 @@ static irqreturn_t sca3300_trigger_handler(int irq, void *p)
struct iio_dev *indio_dev = pf->indio_dev;
struct sca3300_data *data = iio_priv(indio_dev);
int bit, ret, val, i = 0;
- IIO_DECLARE_BUFFER_WITH_TS(s16, channels, SCA3300_SCAN_MAX);
+ IIO_DECLARE_BUFFER_WITH_TS(s16, channels, SCA3300_SCAN_MAX) = { };
iio_for_each_active_channel(indio_dev, bit) {
ret = sca3300_read_reg(data, indio_dev->channels[bit].address, &val);
diff --git a/drivers/iio/adc/Kconfig b/drivers/iio/adc/Kconfig
index 6de2abad0197..24f2572c487e 100644
--- a/drivers/iio/adc/Kconfig
+++ b/drivers/iio/adc/Kconfig
@@ -1300,7 +1300,7 @@ config RN5T618_ADC
config ROHM_BD79124
tristate "Rohm BD79124 ADC driver"
- depends on I2C
+ depends on I2C && GPIOLIB
select REGMAP_I2C
select IIO_ADC_HELPER
help
diff --git a/drivers/iio/adc/ad7124.c b/drivers/iio/adc/ad7124.c
index 9808df2e9242..4d8c6bafd1c3 100644
--- a/drivers/iio/adc/ad7124.c
+++ b/drivers/iio/adc/ad7124.c
@@ -849,7 +849,7 @@ enum {
static int ad7124_syscalib_locked(struct ad7124_state *st, const struct iio_chan_spec *chan)
{
struct device *dev = &st->sd.spi->dev;
- struct ad7124_channel *ch = &st->channels[chan->channel];
+ struct ad7124_channel *ch = &st->channels[chan->address];
int ret;
if (ch->syscalib_mode == AD7124_SYSCALIB_ZERO_SCALE) {
@@ -865,8 +865,8 @@ static int ad7124_syscalib_locked(struct ad7124_state *st, const struct iio_chan
if (ret < 0)
return ret;
- dev_dbg(dev, "offset for channel %d after zero-scale calibration: 0x%x\n",
- chan->channel, ch->cfg.calibration_offset);
+ dev_dbg(dev, "offset for channel %lu after zero-scale calibration: 0x%x\n",
+ chan->address, ch->cfg.calibration_offset);
} else {
ch->cfg.calibration_gain = st->gain_default;
@@ -880,8 +880,8 @@ static int ad7124_syscalib_locked(struct ad7124_state *st, const struct iio_chan
if (ret < 0)
return ret;
- dev_dbg(dev, "gain for channel %d after full-scale calibration: 0x%x\n",
- chan->channel, ch->cfg.calibration_gain);
+ dev_dbg(dev, "gain for channel %lu after full-scale calibration: 0x%x\n",
+ chan->address, ch->cfg.calibration_gain);
}
return 0;
@@ -924,7 +924,7 @@ static int ad7124_set_syscalib_mode(struct iio_dev *indio_dev,
{
struct ad7124_state *st = iio_priv(indio_dev);
- st->channels[chan->channel].syscalib_mode = mode;
+ st->channels[chan->address].syscalib_mode = mode;
return 0;
}
@@ -934,7 +934,7 @@ static int ad7124_get_syscalib_mode(struct iio_dev *indio_dev,
{
struct ad7124_state *st = iio_priv(indio_dev);
- return st->channels[chan->channel].syscalib_mode;
+ return st->channels[chan->address].syscalib_mode;
}
static const struct iio_enum ad7124_syscalib_mode_enum = {
diff --git a/drivers/iio/adc/ad7173.c b/drivers/iio/adc/ad7173.c
index 4413207be28f..683146e83ab2 100644
--- a/drivers/iio/adc/ad7173.c
+++ b/drivers/iio/adc/ad7173.c
@@ -200,7 +200,7 @@ struct ad7173_channel_config {
/*
* Following fields are used to compare equality. If you
* make adaptations in it, you most likely also have to adapt
- * ad7173_find_live_config(), too.
+ * ad7173_is_setup_equal(), too.
*/
struct_group(config_props,
bool bipolar;
@@ -561,12 +561,19 @@ static void ad7173_reset_usage_cnts(struct ad7173_state *st)
st->config_usage_counter = 0;
}
-static struct ad7173_channel_config *
-ad7173_find_live_config(struct ad7173_state *st, struct ad7173_channel_config *cfg)
+/**
+ * ad7173_is_setup_equal - Compare two channel setups
+ * @cfg1: First channel configuration
+ * @cfg2: Second channel configuration
+ *
+ * Compares all configuration options that affect the registers connected to
+ * SETUP_SEL, namely CONFIGx, FILTERx, GAINx and OFFSETx.
+ *
+ * Returns: true if the setups are identical, false otherwise
+ */
+static bool ad7173_is_setup_equal(const struct ad7173_channel_config *cfg1,
+ const struct ad7173_channel_config *cfg2)
{
- struct ad7173_channel_config *cfg_aux;
- int i;
-
/*
* This is just to make sure that the comparison is adapted after
* struct ad7173_channel_config was changed.
@@ -579,14 +586,22 @@ ad7173_find_live_config(struct ad7173_state *st, struct ad7173_channel_config *c
u8 ref_sel;
}));
+ return cfg1->bipolar == cfg2->bipolar &&
+ cfg1->input_buf == cfg2->input_buf &&
+ cfg1->odr == cfg2->odr &&
+ cfg1->ref_sel == cfg2->ref_sel;
+}
+
+static struct ad7173_channel_config *
+ad7173_find_live_config(struct ad7173_state *st, struct ad7173_channel_config *cfg)
+{
+ struct ad7173_channel_config *cfg_aux;
+ int i;
+
for (i = 0; i < st->num_channels; i++) {
cfg_aux = &st->channels[i].cfg;
- if (cfg_aux->live &&
- cfg->bipolar == cfg_aux->bipolar &&
- cfg->input_buf == cfg_aux->input_buf &&
- cfg->odr == cfg_aux->odr &&
- cfg->ref_sel == cfg_aux->ref_sel)
+ if (cfg_aux->live && ad7173_is_setup_equal(cfg, cfg_aux))
return cfg_aux;
}
return NULL;
@@ -1228,7 +1243,7 @@ static int ad7173_update_scan_mode(struct iio_dev *indio_dev,
const unsigned long *scan_mask)
{
struct ad7173_state *st = iio_priv(indio_dev);
- int i, ret;
+ int i, j, k, ret;
for (i = 0; i < indio_dev->num_channels; i++) {
if (test_bit(i, scan_mask))
@@ -1239,6 +1254,54 @@ static int ad7173_update_scan_mode(struct iio_dev *indio_dev,
return ret;
}
+ /*
+	 * On some chips, there are more channels than setups, so if there were
+ * more unique setups requested than the number of available slots,
+ * ad7173_set_channel() will have written over some of the slots. We
+ * can detect this by making sure each assigned cfg_slot matches the
+ * requested configuration. If it doesn't, we know that the slot was
+ * overwritten by a different channel.
+ */
+ for_each_set_bit(i, scan_mask, indio_dev->num_channels) {
+ const struct ad7173_channel_config *cfg1, *cfg2;
+
+ cfg1 = &st->channels[i].cfg;
+
+ for_each_set_bit(j, scan_mask, indio_dev->num_channels) {
+ cfg2 = &st->channels[j].cfg;
+
+ /*
+ * Only compare configs that are assigned to the same
+ * SETUP_SEL slot and don't compare channel to itself.
+ */
+ if (i == j || cfg1->cfg_slot != cfg2->cfg_slot)
+ continue;
+
+ /*
+ * If we find two different configs trying to use the
+			 * same SETUP_SEL slot, then we know that we
+ * have too many unique configurations requested for
+ * the available slots and at least one was overwritten.
+ */
+ if (!ad7173_is_setup_equal(cfg1, cfg2)) {
+ /*
+ * At this point, there isn't a way to tell
+ * which setups are actually programmed in the
+ * ADC anymore, so we could read them back to
+ * see, but it is simpler to just turn off all
+ * of the live flags so that everything gets
+				 * reprogrammed on the next attempt to read a sample.
+ */
+ for (k = 0; k < st->num_channels; k++)
+ st->channels[k].cfg.live = false;
+
+ dev_err(&st->sd.spi->dev,
+ "Too many unique channel configurations requested for scan\n");
+ return -EINVAL;
+ }
+ }
+ }
+
return 0;
}
diff --git a/drivers/iio/adc/ad7380.c b/drivers/iio/adc/ad7380.c
index 6f7034b6c266..fa251dc1aae6 100644
--- a/drivers/iio/adc/ad7380.c
+++ b/drivers/iio/adc/ad7380.c
@@ -873,6 +873,7 @@ static const struct ad7380_chip_info adaq4381_4_chip_info = {
.has_hardware_gain = true,
.available_scan_masks = ad7380_4_channel_scan_masks,
.timing_specs = &ad7380_4_timing,
+ .max_conversion_rate_hz = 4 * MEGA,
};
static const struct spi_offload_config ad7380_offload_config = {
diff --git a/drivers/iio/adc/rzg2l_adc.c b/drivers/iio/adc/rzg2l_adc.c
index 9674d48074c9..cadb0446bc29 100644
--- a/drivers/iio/adc/rzg2l_adc.c
+++ b/drivers/iio/adc/rzg2l_adc.c
@@ -89,7 +89,6 @@ struct rzg2l_adc {
struct completion completion;
struct mutex lock;
u16 last_val[RZG2L_ADC_MAX_CHANNELS];
- bool was_rpm_active;
};
/**
@@ -428,6 +427,8 @@ static int rzg2l_adc_probe(struct platform_device *pdev)
if (!indio_dev)
return -ENOMEM;
+ platform_set_drvdata(pdev, indio_dev);
+
adc = iio_priv(indio_dev);
adc->hw_params = device_get_match_data(dev);
@@ -460,8 +461,6 @@ static int rzg2l_adc_probe(struct platform_device *pdev)
if (ret)
return ret;
- platform_set_drvdata(pdev, indio_dev);
-
ret = rzg2l_adc_hw_init(dev, adc);
if (ret)
return dev_err_probe(&pdev->dev, ret,
@@ -541,14 +540,9 @@ static int rzg2l_adc_suspend(struct device *dev)
};
int ret;
- if (pm_runtime_suspended(dev)) {
- adc->was_rpm_active = false;
- } else {
- ret = pm_runtime_force_suspend(dev);
- if (ret)
- return ret;
- adc->was_rpm_active = true;
- }
+ ret = pm_runtime_force_suspend(dev);
+ if (ret)
+ return ret;
ret = reset_control_bulk_assert(ARRAY_SIZE(resets), resets);
if (ret)
@@ -557,9 +551,7 @@ static int rzg2l_adc_suspend(struct device *dev)
return 0;
rpm_restore:
- if (adc->was_rpm_active)
- pm_runtime_force_resume(dev);
-
+ pm_runtime_force_resume(dev);
return ret;
}
@@ -577,11 +569,9 @@ static int rzg2l_adc_resume(struct device *dev)
if (ret)
return ret;
- if (adc->was_rpm_active) {
- ret = pm_runtime_force_resume(dev);
- if (ret)
- goto resets_restore;
- }
+ ret = pm_runtime_force_resume(dev);
+ if (ret)
+ goto resets_restore;
ret = rzg2l_adc_hw_init(dev, adc);
if (ret)
@@ -590,10 +580,7 @@ static int rzg2l_adc_resume(struct device *dev)
return 0;
rpm_restore:
- if (adc->was_rpm_active) {
- pm_runtime_mark_last_busy(dev);
- pm_runtime_put_autosuspend(dev);
- }
+ pm_runtime_force_suspend(dev);
resets_restore:
reset_control_bulk_assert(ARRAY_SIZE(resets), resets);
return ret;
diff --git a/drivers/iio/imu/inv_icm42600/inv_icm42600_temp.c b/drivers/iio/imu/inv_icm42600/inv_icm42600_temp.c
index 8b15afca498c..271a4788604a 100644
--- a/drivers/iio/imu/inv_icm42600/inv_icm42600_temp.c
+++ b/drivers/iio/imu/inv_icm42600/inv_icm42600_temp.c
@@ -32,8 +32,12 @@ static int inv_icm42600_temp_read(struct inv_icm42600_state *st, s16 *temp)
goto exit;
*temp = (s16)be16_to_cpup(raw);
+ /*
+ * Temperature data is invalid if both accel and gyro are off.
+ * Return -EBUSY in this case.
+ */
if (*temp == INV_ICM42600_DATA_INVALID)
- ret = -EINVAL;
+ ret = -EBUSY;
exit:
mutex_unlock(&st->lock);
diff --git a/drivers/iio/light/as73211.c b/drivers/iio/light/as73211.c
index 68f60dc3c79d..32719f584c47 100644
--- a/drivers/iio/light/as73211.c
+++ b/drivers/iio/light/as73211.c
@@ -639,7 +639,7 @@ static irqreturn_t as73211_trigger_handler(int irq __always_unused, void *p)
struct {
__le16 chan[4];
aligned_s64 ts;
- } scan;
+ } scan = { };
int data_result, ret;
mutex_lock(&data->mutex);
diff --git a/drivers/iio/pressure/bmp280-core.c b/drivers/iio/pressure/bmp280-core.c
index 74505c9ec1a0..6cdc8ed53520 100644
--- a/drivers/iio/pressure/bmp280-core.c
+++ b/drivers/iio/pressure/bmp280-core.c
@@ -3213,11 +3213,12 @@ int bmp280_common_probe(struct device *dev,
/* Bring chip out of reset if there is an assigned GPIO line */
gpiod = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_HIGH);
+ if (IS_ERR(gpiod))
+ return dev_err_probe(dev, PTR_ERR(gpiod), "failed to get reset GPIO\n");
+
/* Deassert the signal */
- if (gpiod) {
- dev_info(dev, "release reset\n");
- gpiod_set_value(gpiod, 0);
- }
+ dev_info(dev, "release reset\n");
+ gpiod_set_value(gpiod, 0);
data->regmap = regmap;
diff --git a/drivers/iio/proximity/isl29501.c b/drivers/iio/proximity/isl29501.c
index d1510fe24050..f69db6f2f380 100644
--- a/drivers/iio/proximity/isl29501.c
+++ b/drivers/iio/proximity/isl29501.c
@@ -938,12 +938,18 @@ static irqreturn_t isl29501_trigger_handler(int irq, void *p)
struct iio_dev *indio_dev = pf->indio_dev;
struct isl29501_private *isl29501 = iio_priv(indio_dev);
const unsigned long *active_mask = indio_dev->active_scan_mask;
- u32 buffer[4] __aligned(8) = {}; /* 1x16-bit + naturally aligned ts */
-
- if (test_bit(ISL29501_DISTANCE_SCAN_INDEX, active_mask))
- isl29501_register_read(isl29501, REG_DISTANCE, buffer);
+ u32 value;
+ struct {
+ u16 data;
+ aligned_s64 ts;
+ } scan = { };
+
+ if (test_bit(ISL29501_DISTANCE_SCAN_INDEX, active_mask)) {
+ isl29501_register_read(isl29501, REG_DISTANCE, &value);
+ scan.data = value;
+ }
- iio_push_to_buffers_with_timestamp(indio_dev, buffer, pf->timestamp);
+ iio_push_to_buffers_with_timestamp(indio_dev, &scan, pf->timestamp);
iio_trigger_notify_done(indio_dev->trig);
return IRQ_HANDLED;
diff --git a/drivers/iio/temperature/maxim_thermocouple.c b/drivers/iio/temperature/maxim_thermocouple.c
index cae8e84821d7..205939680fd4 100644
--- a/drivers/iio/temperature/maxim_thermocouple.c
+++ b/drivers/iio/temperature/maxim_thermocouple.c
@@ -11,6 +11,7 @@
#include <linux/module.h>
#include <linux/err.h>
#include <linux/spi/spi.h>
+#include <linux/types.h>
#include <linux/iio/iio.h>
#include <linux/iio/sysfs.h>
#include <linux/iio/trigger.h>
@@ -121,8 +122,15 @@ struct maxim_thermocouple_data {
struct spi_device *spi;
const struct maxim_thermocouple_chip *chip;
char tc_type;
-
- u8 buffer[16] __aligned(IIO_DMA_MINALIGN);
+ /* Buffer for reading up to 2 hardware channels. */
+ struct {
+ union {
+ __be16 raw16;
+ __be32 raw32;
+ __be16 raw[2];
+ };
+ aligned_s64 timestamp;
+ } buffer __aligned(IIO_DMA_MINALIGN);
};
static int maxim_thermocouple_read(struct maxim_thermocouple_data *data,
@@ -130,18 +138,16 @@ static int maxim_thermocouple_read(struct maxim_thermocouple_data *data,
{
unsigned int storage_bytes = data->chip->read_size;
unsigned int shift = chan->scan_type.shift + (chan->address * 8);
- __be16 buf16;
- __be32 buf32;
int ret;
switch (storage_bytes) {
case 2:
- ret = spi_read(data->spi, (void *)&buf16, storage_bytes);
- *val = be16_to_cpu(buf16);
+ ret = spi_read(data->spi, &data->buffer.raw16, storage_bytes);
+ *val = be16_to_cpu(data->buffer.raw16);
break;
case 4:
- ret = spi_read(data->spi, (void *)&buf32, storage_bytes);
- *val = be32_to_cpu(buf32);
+ ret = spi_read(data->spi, &data->buffer.raw32, storage_bytes);
+ *val = be32_to_cpu(data->buffer.raw32);
break;
default:
ret = -EINVAL;
@@ -166,9 +172,9 @@ static irqreturn_t maxim_thermocouple_trigger_handler(int irq, void *private)
struct maxim_thermocouple_data *data = iio_priv(indio_dev);
int ret;
- ret = spi_read(data->spi, data->buffer, data->chip->read_size);
+ ret = spi_read(data->spi, data->buffer.raw, data->chip->read_size);
if (!ret) {
- iio_push_to_buffers_with_ts(indio_dev, data->buffer,
+ iio_push_to_buffers_with_ts(indio_dev, &data->buffer,
sizeof(data->buffer),
iio_get_time_ns(indio_dev));
}
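Several of the IIO hunks above (sca3300, as73211, isl29501, maxim_thermocouple) converge on the same buffer shape: channel data followed by a naturally aligned 64-bit timestamp slot, zero-initialized so struct padding never leaks to userspace. A generic sketch of that layout, with invented names:

#include <linux/iio/buffer.h>
#include <linux/iio/iio.h>
#include <linux/types.h>

static void demo_push_sample(struct iio_dev *indio_dev, u16 sample)
{
	struct {
		u16 data;
		aligned_s64 ts;		/* 64-bit aligned timestamp slot */
	} scan = { };			/* zeroes the padding between data and ts */

	scan.data = sample;
	iio_push_to_buffers_with_timestamp(indio_dev, &scan,
					   iio_get_time_ns(indio_dev));
}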
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index b1c44ec1a3f3..572a91a62a7b 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -115,7 +115,7 @@ static int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
out_free_map:
if (ib_uses_virt_dma(dev))
- kfree(map->pfn_list);
+ kvfree(map->pfn_list);
else
hmm_dma_map_free(dev->dma_device, map);
return ret;
@@ -287,7 +287,7 @@ static void ib_umem_odp_free(struct ib_umem_odp *umem_odp)
mutex_unlock(&umem_odp->umem_mutex);
mmu_interval_notifier_remove(&umem_odp->notifier);
if (ib_uses_virt_dma(dev))
- kfree(umem_odp->map.pfn_list);
+ kvfree(umem_odp->map.pfn_list);
else
hmm_dma_map_free(dev->dma_device, &umem_odp->map);
}
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index 37c2bc3bdba5..260dc67b8b87 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -1921,7 +1921,6 @@ int bnxt_re_modify_srq(struct ib_srq *ib_srq, struct ib_srq_attr *srq_attr,
struct bnxt_re_srq *srq = container_of(ib_srq, struct bnxt_re_srq,
ib_srq);
struct bnxt_re_dev *rdev = srq->rdev;
- int rc;
switch (srq_attr_mask) {
case IB_SRQ_MAX_WR:
@@ -1933,11 +1932,8 @@ int bnxt_re_modify_srq(struct ib_srq *ib_srq, struct ib_srq_attr *srq_attr,
return -EINVAL;
srq->qplib_srq.threshold = srq_attr->srq_limit;
- rc = bnxt_qplib_modify_srq(&rdev->qplib_res, &srq->qplib_srq);
- if (rc) {
- ibdev_err(&rdev->ibdev, "Modify HW SRQ failed!");
- return rc;
- }
+ bnxt_qplib_srq_arm_db(&srq->qplib_srq.dbinfo, srq->qplib_srq.threshold);
+
/* On success, update the shadow */
srq->srq_limit = srq_attr->srq_limit;
/* No need to Build and send response back to udata */
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 293b0a96c8e3..df7cf8d68e27 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -2017,6 +2017,28 @@ static void bnxt_re_free_nqr_mem(struct bnxt_re_dev *rdev)
rdev->nqr = NULL;
}
+/* When DEL_GID fails, the driver does not free the GID ctx memory.
+ * To avoid the memory leak, free the memory during unload
+ */
+static void bnxt_re_free_gid_ctx(struct bnxt_re_dev *rdev)
+{
+ struct bnxt_qplib_sgid_tbl *sgid_tbl = &rdev->qplib_res.sgid_tbl;
+ struct bnxt_re_gid_ctx *ctx, **ctx_tbl;
+ int i;
+
+ if (!sgid_tbl->active)
+ return;
+
+ ctx_tbl = sgid_tbl->ctx;
+ for (i = 0; i < sgid_tbl->max; i++) {
+ if (sgid_tbl->hw_id[i] == 0xFFFF)
+ continue;
+
+ ctx = ctx_tbl[i];
+ kfree(ctx);
+ }
+}
+
static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev, u8 op_type)
{
u8 type;
@@ -2030,6 +2052,7 @@ static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev, u8 op_type)
if (test_and_clear_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags))
cancel_delayed_work_sync(&rdev->worker);
+ bnxt_re_free_gid_ctx(rdev);
if (test_and_clear_bit(BNXT_RE_FLAG_RESOURCES_INITIALIZED,
&rdev->flags))
bnxt_re_cleanup_res(rdev);
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
index dfe3177123e5..ee36b3d82cc0 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
@@ -705,9 +705,7 @@ int bnxt_qplib_create_srq(struct bnxt_qplib_res *res,
srq->dbinfo.db = srq->dpi->dbr;
srq->dbinfo.max_slot = 1;
srq->dbinfo.priv_db = res->dpi_tbl.priv_db;
- if (srq->threshold)
- bnxt_qplib_armen_db(&srq->dbinfo, DBC_DBC_TYPE_SRQ_ARMENA);
- srq->arm_req = false;
+ bnxt_qplib_armen_db(&srq->dbinfo, DBC_DBC_TYPE_SRQ_ARMENA);
return 0;
fail:
@@ -717,24 +715,6 @@ fail:
return rc;
}
-int bnxt_qplib_modify_srq(struct bnxt_qplib_res *res,
- struct bnxt_qplib_srq *srq)
-{
- struct bnxt_qplib_hwq *srq_hwq = &srq->hwq;
- u32 count;
-
- count = __bnxt_qplib_get_avail(srq_hwq);
- if (count > srq->threshold) {
- srq->arm_req = false;
- bnxt_qplib_srq_arm_db(&srq->dbinfo, srq->threshold);
- } else {
- /* Deferred arming */
- srq->arm_req = true;
- }
-
- return 0;
-}
-
int bnxt_qplib_query_srq(struct bnxt_qplib_res *res,
struct bnxt_qplib_srq *srq)
{
@@ -776,7 +756,6 @@ int bnxt_qplib_post_srq_recv(struct bnxt_qplib_srq *srq,
struct bnxt_qplib_hwq *srq_hwq = &srq->hwq;
struct rq_wqe *srqe;
struct sq_sge *hw_sge;
- u32 count = 0;
int i, next;
spin_lock(&srq_hwq->lock);
@@ -808,15 +787,8 @@ int bnxt_qplib_post_srq_recv(struct bnxt_qplib_srq *srq,
bnxt_qplib_hwq_incr_prod(&srq->dbinfo, srq_hwq, srq->dbinfo.max_slot);
- spin_lock(&srq_hwq->lock);
- count = __bnxt_qplib_get_avail(srq_hwq);
- spin_unlock(&srq_hwq->lock);
/* Ring DB */
bnxt_qplib_ring_prod_db(&srq->dbinfo, DBC_DBC_TYPE_SRQ);
- if (srq->arm_req == true && count > srq->threshold) {
- srq->arm_req = false;
- bnxt_qplib_srq_arm_db(&srq->dbinfo, srq->threshold);
- }
return 0;
}
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
index ab125f1d949e..4921a214c34c 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
@@ -546,8 +546,6 @@ int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq,
srqn_handler_t srq_handler);
int bnxt_qplib_create_srq(struct bnxt_qplib_res *res,
struct bnxt_qplib_srq *srq);
-int bnxt_qplib_modify_srq(struct bnxt_qplib_res *res,
- struct bnxt_qplib_srq *srq);
int bnxt_qplib_query_srq(struct bnxt_qplib_res *res,
struct bnxt_qplib_srq *srq);
void bnxt_qplib_destroy_srq(struct bnxt_qplib_res *res,
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c
index 6cd05207ffed..cc5c82d96839 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_res.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c
@@ -121,6 +121,7 @@ static int __alloc_pbl(struct bnxt_qplib_res *res,
pbl->pg_arr = vmalloc_array(pages, sizeof(void *));
if (!pbl->pg_arr)
return -ENOMEM;
+ memset(pbl->pg_arr, 0, pages * sizeof(void *));
pbl->pg_map_arr = vmalloc_array(pages, sizeof(dma_addr_t));
if (!pbl->pg_map_arr) {
@@ -128,6 +129,7 @@ static int __alloc_pbl(struct bnxt_qplib_res *res,
pbl->pg_arr = NULL;
return -ENOMEM;
}
+ memset(pbl->pg_map_arr, 0, pages * sizeof(dma_addr_t));
pbl->pg_count = 0;
pbl->pg_size = sginfo->pgsize;
diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c
index 94c211df09d8..fdeec33c71da 100644
--- a/drivers/infiniband/hw/erdma/erdma_verbs.c
+++ b/drivers/infiniband/hw/erdma/erdma_verbs.c
@@ -994,6 +994,8 @@ int erdma_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs,
old_entry = xa_store(&dev->qp_xa, 1, qp, GFP_KERNEL);
if (xa_is_err(old_entry))
ret = xa_err(old_entry);
+ else
+ qp->ibqp.qp_num = 1;
} else {
ret = xa_alloc_cyclic(&dev->qp_xa, &qp->ibqp.qp_num, qp,
XA_LIMIT(1, dev->attrs.max_qp - 1),
@@ -1031,7 +1033,9 @@ int erdma_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs,
if (ret)
goto err_out_cmd;
} else {
- init_kernel_qp(dev, qp, attrs);
+ ret = init_kernel_qp(dev, qp, attrs);
+ if (ret)
+ goto err_out_xa;
}
qp->attrs.max_send_sge = attrs->cap.max_send_sge;
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 64bca08f3f1a..f82bdd46a917 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -3043,7 +3043,7 @@ static void hns_roce_v2_exit(struct hns_roce_dev *hr_dev)
if (!hr_dev->is_vf)
hns_roce_free_link_table(hr_dev);
- if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP09)
+ if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09)
free_dip_entry(hr_dev);
}
@@ -5476,7 +5476,7 @@ out:
return ret;
}
-static int hns_roce_v2_query_sccc(struct hns_roce_dev *hr_dev, u32 qpn,
+static int hns_roce_v2_query_sccc(struct hns_roce_dev *hr_dev, u32 sccn,
void *buffer)
{
struct hns_roce_v2_scc_context *context;
@@ -5488,7 +5488,7 @@ static int hns_roce_v2_query_sccc(struct hns_roce_dev *hr_dev, u32 qpn,
return PTR_ERR(mailbox);
ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, HNS_ROCE_CMD_QUERY_SCCC,
- qpn);
+ sccn);
if (ret)
goto out;
diff --git a/drivers/infiniband/hw/hns/hns_roce_restrack.c b/drivers/infiniband/hw/hns/hns_roce_restrack.c
index f637b73b946e..230187dda6a0 100644
--- a/drivers/infiniband/hw/hns/hns_roce_restrack.c
+++ b/drivers/infiniband/hw/hns/hns_roce_restrack.c
@@ -100,6 +100,7 @@ int hns_roce_fill_res_qp_entry_raw(struct sk_buff *msg, struct ib_qp *ib_qp)
struct hns_roce_v2_qp_context qpc;
struct hns_roce_v2_scc_context sccc;
} context = {};
+ u32 sccn = hr_qp->qpn;
int ret;
if (!hr_dev->hw->query_qpc)
@@ -116,7 +117,13 @@ int hns_roce_fill_res_qp_entry_raw(struct sk_buff *msg, struct ib_qp *ib_qp)
!hr_dev->hw->query_sccc)
goto out;
- ret = hr_dev->hw->query_sccc(hr_dev, hr_qp->qpn, &context.sccc);
+ if (hr_qp->cong_type == CONG_TYPE_DIP) {
+ if (!hr_qp->dip)
+ goto out;
+ sccn = hr_qp->dip->dip_idx;
+ }
+
+ ret = hr_dev->hw->query_sccc(hr_dev, sccn, &context.sccc);
if (ret)
ibdev_warn_ratelimited(&hr_dev->ib_dev,
"failed to query SCCC, ret = %d.\n",
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 028d9f031dde..8b506417ad2f 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -233,6 +233,7 @@ static u16 get_legacy_obj_type(u16 opcode)
{
switch (opcode) {
case MLX5_CMD_OP_CREATE_RQ:
+ case MLX5_CMD_OP_CREATE_RMP:
return MLX5_EVENT_QUEUE_TYPE_RQ;
case MLX5_CMD_OP_CREATE_QP:
return MLX5_EVENT_QUEUE_TYPE_QP;
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index 132a87e52d5c..ac0183a2ff7a 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -345,33 +345,15 @@ int rxe_prepare(struct rxe_av *av, struct rxe_pkt_info *pkt,
static void rxe_skb_tx_dtor(struct sk_buff *skb)
{
- struct net_device *ndev = skb->dev;
- struct rxe_dev *rxe;
- unsigned int qp_index;
- struct rxe_qp *qp;
+ struct rxe_qp *qp = skb->sk->sk_user_data;
int skb_out;
- rxe = rxe_get_dev_from_net(ndev);
- if (!rxe && is_vlan_dev(ndev))
- rxe = rxe_get_dev_from_net(vlan_dev_real_dev(ndev));
- if (WARN_ON(!rxe))
- return;
-
- qp_index = (int)(uintptr_t)skb->sk->sk_user_data;
- if (!qp_index)
- return;
-
- qp = rxe_pool_get_index(&rxe->qp_pool, qp_index);
- if (!qp)
- goto put_dev;
-
skb_out = atomic_dec_return(&qp->skb_out);
- if (qp->need_req_skb && skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW)
+ if (unlikely(qp->need_req_skb &&
+ skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW))
rxe_sched_task(&qp->send_task);
rxe_put(qp);
-put_dev:
- ib_device_put(&rxe->ib_dev);
sock_put(skb->sk);
}
@@ -383,6 +365,7 @@ static int rxe_send(struct sk_buff *skb, struct rxe_pkt_info *pkt)
sock_hold(sk);
skb->sk = sk;
skb->destructor = rxe_skb_tx_dtor;
+ rxe_get(pkt->qp);
atomic_inc(&pkt->qp->skb_out);
if (skb->protocol == htons(ETH_P_IP))
@@ -405,6 +388,7 @@ static int rxe_loopback(struct sk_buff *skb, struct rxe_pkt_info *pkt)
sock_hold(sk);
skb->sk = sk;
skb->destructor = rxe_skb_tx_dtor;
+ rxe_get(pkt->qp);
atomic_inc(&pkt->qp->skb_out);
if (skb->protocol == htons(ETH_P_IP))
@@ -497,6 +481,9 @@ struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av,
goto out;
}
+ /* Add time stamp to skb. */
+ skb->tstamp = ktime_get();
+
skb_reserve(skb, hdr_len + LL_RESERVED_SPACE(ndev));
/* FIXME: hold reference to this netdev until life of this skb. */
diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c
index f2af3e0aef35..95f1c1c2949d 100644
--- a/drivers/infiniband/sw/rxe/rxe_qp.c
+++ b/drivers/infiniband/sw/rxe/rxe_qp.c
@@ -244,7 +244,7 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp,
err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &qp->sk);
if (err < 0)
return err;
- qp->sk->sk->sk_user_data = (void *)(uintptr_t)qp->elem.index;
+ qp->sk->sk->sk_user_data = qp;
/* pick a source UDP port number for this QP based on
* the source QPN. this spreads traffic for different QPs
diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c
index 4c94297e17e6..d72e89c25e50 100644
--- a/drivers/input/joystick/xpad.c
+++ b/drivers/input/joystick/xpad.c
@@ -422,6 +422,7 @@ static const struct xpad_device {
{ 0x3537, 0x1010, "GameSir G7 SE", 0, XTYPE_XBOXONE },
{ 0x366c, 0x0005, "ByoWave Proteus Controller", MAP_SHARE_BUTTON, XTYPE_XBOXONE, FLAG_DELAY_INIT },
{ 0x3767, 0x0101, "Fanatec Speedster 3 Forceshock Wheel", 0, XTYPE_XBOX },
+ { 0x37d7, 0x2501, "Flydigi Apex 5", 0, XTYPE_XBOX360 },
{ 0x413d, 0x2104, "Black Shark Green Ghost Gamepad", 0, XTYPE_XBOX360 },
{ 0xffff, 0xffff, "Chinese-made Xbox Controller", 0, XTYPE_XBOX },
{ 0x0000, 0x0000, "Generic X-Box pad", 0, XTYPE_UNKNOWN }
@@ -578,6 +579,7 @@ static const struct usb_device_id xpad_table[] = {
XPAD_XBOX360_VENDOR(0x3537), /* GameSir Controllers */
XPAD_XBOXONE_VENDOR(0x3537), /* GameSir Controllers */
XPAD_XBOXONE_VENDOR(0x366c), /* ByoWave controllers */
+ XPAD_XBOX360_VENDOR(0x37d7), /* Flydigi Controllers */
XPAD_XBOX360_VENDOR(0x413d), /* Black Shark Green Ghost Controller */
{ }
};
diff --git a/drivers/input/keyboard/mtk-pmic-keys.c b/drivers/input/keyboard/mtk-pmic-keys.c
index 50e2e792c91d..c78d9f6d97c4 100644
--- a/drivers/input/keyboard/mtk-pmic-keys.c
+++ b/drivers/input/keyboard/mtk-pmic-keys.c
@@ -55,6 +55,7 @@ struct mtk_pmic_regs {
const struct mtk_pmic_keys_regs keys_regs[MTK_PMIC_MAX_KEY_COUNT];
u32 pmic_rst_reg;
u32 rst_lprst_mask; /* Long-press reset timeout bitmask */
+ bool key_release_irq;
};
static const struct mtk_pmic_regs mt6397_regs = {
@@ -116,6 +117,7 @@ static const struct mtk_pmic_regs mt6358_regs = {
MTK_PMIC_HOMEKEY_RST),
.pmic_rst_reg = MT6358_TOP_RST_MISC,
.rst_lprst_mask = MTK_PMIC_RST_DU_MASK,
+ .key_release_irq = true,
};
static const struct mtk_pmic_regs mt6359_regs = {
@@ -129,6 +131,7 @@ static const struct mtk_pmic_regs mt6359_regs = {
MTK_PMIC_HOMEKEY_RST),
.pmic_rst_reg = MT6359_TOP_RST_MISC,
.rst_lprst_mask = MTK_PMIC_RST_DU_MASK,
+ .key_release_irq = true,
};
struct mtk_pmic_keys_info {
@@ -368,7 +371,7 @@ static int mtk_pmic_keys_probe(struct platform_device *pdev)
if (keys->keys[index].irq < 0)
return keys->keys[index].irq;
- if (of_device_is_compatible(node, "mediatek,mt6358-keys")) {
+ if (mtk_pmic_regs->key_release_irq) {
keys->keys[index].irq_r = platform_get_irq_byname(pdev,
irqnames_r[index]);
diff --git a/drivers/input/misc/iqs7222.c b/drivers/input/misc/iqs7222.c
index 6fac31c0d99f..ff23219a582a 100644
--- a/drivers/input/misc/iqs7222.c
+++ b/drivers/input/misc/iqs7222.c
@@ -2427,6 +2427,9 @@ static int iqs7222_parse_chan(struct iqs7222_private *iqs7222,
if (error)
return error;
+ if (!iqs7222->kp_type[chan_index][i])
+ continue;
+
if (!dev_desc->event_offset)
continue;
diff --git a/drivers/input/serio/i8042-acpipnpio.h b/drivers/input/serio/i8042-acpipnpio.h
index 6ed9fc34948c..1caa6c4ca435 100644
--- a/drivers/input/serio/i8042-acpipnpio.h
+++ b/drivers/input/serio/i8042-acpipnpio.h
@@ -1155,6 +1155,20 @@ static const struct dmi_system_id i8042_dmi_quirk_table[] __initconst = {
.driver_data = (void *)(SERIO_QUIRK_NOMUX | SERIO_QUIRK_RESET_ALWAYS |
SERIO_QUIRK_NOLOOP | SERIO_QUIRK_NOPNP)
},
+ {
+ .matches = {
+ DMI_MATCH(DMI_BOARD_NAME, "XxHP4NAx"),
+ },
+ .driver_data = (void *)(SERIO_QUIRK_NOMUX | SERIO_QUIRK_RESET_ALWAYS |
+ SERIO_QUIRK_NOLOOP | SERIO_QUIRK_NOPNP)
+ },
+ {
+ .matches = {
+ DMI_MATCH(DMI_BOARD_NAME, "XxKK4NAx_XxSP4NAx"),
+ },
+ .driver_data = (void *)(SERIO_QUIRK_NOMUX | SERIO_QUIRK_RESET_ALWAYS |
+ SERIO_QUIRK_NOLOOP | SERIO_QUIRK_NOPNP)
+ },
/*
* A lot of modern Clevo barebones have touchpad and/or keyboard issues
* after suspend fixable with the forcenorestore quirk.
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index 5219d7ddfdaa..95f63c5f6159 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -555,6 +555,7 @@ struct gcr3_tbl_info {
};
struct amd_io_pgtable {
+ seqcount_t seqcount; /* Protects root/mode update */
struct io_pgtable pgtbl;
int mode;
u64 *root;
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index 7b5af6176de9..ba9e582a8bbe 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -1455,12 +1455,12 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu,
PCI_FUNC(e->devid));
devid = e->devid;
- for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
- if (alias)
+ if (alias) {
+ for (dev_i = devid_start; dev_i <= devid; ++dev_i)
pci_seg->alias_table[dev_i] = devid_to;
+ set_dev_entry_from_acpi(iommu, devid_to, flags, ext_flags);
}
set_dev_entry_from_acpi_range(iommu, devid_start, devid, flags, ext_flags);
- set_dev_entry_from_acpi(iommu, devid_to, flags, ext_flags);
break;
case IVHD_DEV_SPECIAL: {
u8 handle, type;
@@ -3067,7 +3067,8 @@ static int __init early_amd_iommu_init(void)
if (!boot_cpu_has(X86_FEATURE_CX16)) {
pr_err("Failed to initialize. The CMPXCHG16B feature is required.\n");
- return -EINVAL;
+ ret = -EINVAL;
+ goto out;
}
/*
@@ -3638,7 +3639,7 @@ static int __init parse_ivrs_acpihid(char *str)
{
u32 seg = 0, bus, dev, fn;
char *hid, *uid, *p, *addr;
- char acpiid[ACPIID_LEN] = {0};
+ char acpiid[ACPIID_LEN + 1] = { }; /* size with NULL terminator */
int i;
addr = strchr(str, '@');
@@ -3664,7 +3665,7 @@ static int __init parse_ivrs_acpihid(char *str)
/* We have the '@', make it the terminator to get just the acpiid */
*addr++ = 0;
- if (strlen(str) > ACPIID_LEN + 1)
+ if (strlen(str) > ACPIID_LEN)
goto not_found;
if (sscanf(str, "=%s", acpiid) != 1)
diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c
index a91e71f981ef..70c2f5b1631b 100644
--- a/drivers/iommu/amd/io_pgtable.c
+++ b/drivers/iommu/amd/io_pgtable.c
@@ -17,6 +17,7 @@
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/dma-mapping.h>
+#include <linux/seqlock.h>
#include <asm/barrier.h>
@@ -130,8 +131,11 @@ static bool increase_address_space(struct amd_io_pgtable *pgtable,
*pte = PM_LEVEL_PDE(pgtable->mode, iommu_virt_to_phys(pgtable->root));
+ write_seqcount_begin(&pgtable->seqcount);
pgtable->root = pte;
pgtable->mode += 1;
+ write_seqcount_end(&pgtable->seqcount);
+
amd_iommu_update_and_flush_device_table(domain);
pte = NULL;
@@ -153,6 +157,7 @@ static u64 *alloc_pte(struct amd_io_pgtable *pgtable,
{
unsigned long last_addr = address + (page_size - 1);
struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
+ unsigned int seqcount;
int level, end_lvl;
u64 *pte, *page;
@@ -170,8 +175,14 @@ static u64 *alloc_pte(struct amd_io_pgtable *pgtable,
}
- level = pgtable->mode - 1;
- pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
+ do {
+ seqcount = read_seqcount_begin(&pgtable->seqcount);
+
+ level = pgtable->mode - 1;
+ pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
+ } while (read_seqcount_retry(&pgtable->seqcount, seqcount));
+
+
address = PAGE_SIZE_ALIGN(address, page_size);
end_lvl = PAGE_SIZE_LEVEL(page_size);
@@ -249,6 +260,7 @@ static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
unsigned long *page_size)
{
int level;
+ unsigned int seqcount;
u64 *pte;
*page_size = 0;
@@ -256,8 +268,12 @@ static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
if (address > PM_LEVEL_SIZE(pgtable->mode))
return NULL;
- level = pgtable->mode - 1;
- pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
+ do {
+ seqcount = read_seqcount_begin(&pgtable->seqcount);
+ level = pgtable->mode - 1;
+ pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
+ } while (read_seqcount_retry(&pgtable->seqcount, seqcount));
+
*page_size = PTE_LEVEL_PAGE_SIZE(level);
while (level > 0) {
@@ -541,6 +557,7 @@ static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *coo
if (!pgtable->root)
return NULL;
pgtable->mode = PAGE_MODE_3_LEVEL;
+ seqcount_init(&pgtable->seqcount);
cfg->pgsize_bitmap = amd_iommu_pgsize_bitmap;
cfg->ias = IOMMU_IN_ADDR_BIT_SIZE;
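The seqcount additions above follow the standard lockless-reader pattern; a minimal sketch of that pattern, with invented names and assuming writers are already serialized against each other (as increase_address_space() is under the domain lock):

#include <linux/seqlock.h>
#include <linux/types.h>

struct walk_state {
	seqcount_t seqcount;
	int mode;
	u64 *root;
};

/* Writer: publish root and mode atomically with respect to readers. */
static void publish_new_root(struct walk_state *ws, u64 *new_root)
{
	write_seqcount_begin(&ws->seqcount);
	ws->root = new_root;
	ws->mode += 1;
	write_seqcount_end(&ws->seqcount);
}

/* Reader: retry until root and mode were sampled from the same update. */
static u64 *first_level_pte(struct walk_state *ws, unsigned long address)
{
	unsigned int seq;
	int level;
	u64 *pte;

	do {
		seq = read_seqcount_begin(&ws->seqcount);
		level = ws->mode - 1;
		pte = &ws->root[(address >> (12 + 9 * level)) & 0x1ff];
	} while (read_seqcount_retry(&ws->seqcount, seq));

	return pte;
}

The counter must be seqcount_init()'d before first use, which the v1_alloc_pgtable() hunk above takes care of.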
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 5968043ac802..2a8b46b948f0 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2997,9 +2997,9 @@ void arm_smmu_attach_commit(struct arm_smmu_attach_state *state)
/* ATS is being switched off, invalidate the entire ATC */
arm_smmu_atc_inv_master(master, IOMMU_NO_PASID);
}
- master->ats_enabled = state->ats_enabled;
arm_smmu_remove_master_domain(master, state->old_domain, state->ssid);
+ master->ats_enabled = state->ats_enabled;
}
static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
index be1aaaf8cd17..378104cd395e 100644
--- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
+++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
@@ -301,9 +301,11 @@ static void tegra241_vintf_user_handle_error(struct tegra241_vintf *vintf)
struct iommu_vevent_tegra241_cmdqv vevent_data;
int i;
- for (i = 0; i < LVCMDQ_ERR_MAP_NUM_64; i++)
- vevent_data.lvcmdq_err_map[i] =
- readq_relaxed(REG_VINTF(vintf, LVCMDQ_ERR_MAP_64(i)));
+ for (i = 0; i < LVCMDQ_ERR_MAP_NUM_64; i++) {
+ u64 err = readq_relaxed(REG_VINTF(vintf, LVCMDQ_ERR_MAP_64(i)));
+
+ vevent_data.lvcmdq_err_map[i] = cpu_to_le64(err);
+ }
iommufd_viommu_report_event(viommu, IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV,
&vevent_data, sizeof(vevent_data));
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 9c3ab9d9f69a..dff2d895b8ab 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -1575,6 +1575,10 @@ static void switch_to_super_page(struct dmar_domain *domain,
unsigned long lvl_pages = lvl_to_nr_pages(level);
struct dma_pte *pte = NULL;
+ if (WARN_ON(!IS_ALIGNED(start_pfn, lvl_pages) ||
+ !IS_ALIGNED(end_pfn + 1, lvl_pages)))
+ return;
+
while (start_pfn <= end_pfn) {
if (!pte)
pte = pfn_to_dma_pte(domain, start_pfn, &level,
@@ -1650,7 +1654,8 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
unsigned long pages_to_remove;
pteval |= DMA_PTE_LARGE_PAGE;
- pages_to_remove = min_t(unsigned long, nr_pages,
+ pages_to_remove = min_t(unsigned long,
+ round_down(nr_pages, lvl_pages),
nr_pte_to_next_page(pte) * lvl_pages);
end_pfn = iov_pfn + pages_to_remove - 1;
switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index 65fbd098f9e9..4c842368289f 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -711,6 +711,8 @@ iommufd_hw_pagetable_detach(struct iommufd_device *idev, ioasid_t pasid)
iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev);
mutex_unlock(&igroup->lock);
+ iommufd_hw_pagetable_put(idev->ictx, hwpt);
+
/* Caller must destroy hwpt */
return hwpt;
}
@@ -1057,7 +1059,6 @@ void iommufd_device_detach(struct iommufd_device *idev, ioasid_t pasid)
hwpt = iommufd_hw_pagetable_detach(idev, pasid);
if (!hwpt)
return;
- iommufd_hw_pagetable_put(idev->ictx, hwpt);
refcount_dec(&idev->obj.users);
}
EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, "IOMMUFD");
diff --git a/drivers/iommu/iommufd/eventq.c b/drivers/iommu/iommufd/eventq.c
index fc4de63b0bce..e23d9ee4fe38 100644
--- a/drivers/iommu/iommufd/eventq.c
+++ b/drivers/iommu/iommufd/eventq.c
@@ -393,12 +393,12 @@ static int iommufd_eventq_init(struct iommufd_eventq *eventq, char *name,
const struct file_operations *fops)
{
struct file *filep;
- int fdno;
spin_lock_init(&eventq->lock);
INIT_LIST_HEAD(&eventq->deliver);
init_waitqueue_head(&eventq->wait_queue);
+ /* The filep is fput() by the core code during failure */
filep = anon_inode_getfile(name, fops, eventq, O_RDWR);
if (IS_ERR(filep))
return PTR_ERR(filep);
@@ -408,10 +408,7 @@ static int iommufd_eventq_init(struct iommufd_eventq *eventq, char *name,
eventq->filep = filep;
refcount_inc(&eventq->obj.users);
- fdno = get_unused_fd_flags(O_CLOEXEC);
- if (fdno < 0)
- fput(filep);
- return fdno;
+ return get_unused_fd_flags(O_CLOEXEC);
}
static const struct file_operations iommufd_fault_fops =
@@ -452,7 +449,6 @@ int iommufd_fault_alloc(struct iommufd_ucmd *ucmd)
return 0;
out_put_fdno:
put_unused_fd(fdno);
- fput(fault->common.filep);
return rc;
}
@@ -536,7 +532,6 @@ int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd)
out_put_fdno:
put_unused_fd(fdno);
- fput(veventq->common.filep);
out_abort:
iommufd_object_abort_and_destroy(ucmd->ictx, &veventq->common.obj);
out_unlock_veventqs:
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 0da2a81eedfa..627f9b78483a 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -454,9 +454,8 @@ static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx,
if (hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING) {
struct iommufd_hwpt_paging *hwpt_paging = to_hwpt_paging(hwpt);
- lockdep_assert_not_held(&hwpt_paging->ioas->mutex);
-
if (hwpt_paging->auto_domain) {
+ lockdep_assert_not_held(&hwpt_paging->ioas->mutex);
iommufd_object_put_and_try_destroy(ictx, &hwpt->obj);
return;
}
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 15af7ced0501..ce775fbbae94 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -23,6 +23,7 @@
#include "iommufd_test.h"
struct iommufd_object_ops {
+ size_t file_offset;
void (*pre_destroy)(struct iommufd_object *obj);
void (*destroy)(struct iommufd_object *obj);
void (*abort)(struct iommufd_object *obj);
@@ -121,6 +122,10 @@ void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj)
old = xas_store(&xas, NULL);
xa_unlock(&ictx->objects);
WARN_ON(old != XA_ZERO_ENTRY);
+
+ if (WARN_ON(!refcount_dec_and_test(&obj->users)))
+ return;
+
kfree(obj);
}
@@ -131,10 +136,30 @@ void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj)
void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx,
struct iommufd_object *obj)
{
- if (iommufd_object_ops[obj->type].abort)
- iommufd_object_ops[obj->type].abort(obj);
+ const struct iommufd_object_ops *ops = &iommufd_object_ops[obj->type];
+
+ if (ops->file_offset) {
+ struct file **filep = ((void *)obj) + ops->file_offset;
+
+ /*
+ * A file should hold a users refcount while the file is open
+ * and put it back in its release. The file should hold a
+		 * pointer to obj in its private data. Normal fput() is
+ * deferred to a workqueue and can get out of order with the
+ * following kfree(obj). Using the sync version ensures the
+		 * release happens immediately. During abort we require that the file
+ * refcount is one at this point - meaning the object alloc
+ * function cannot do anything to allow another thread to take a
+ * refcount prior to a guaranteed success.
+ */
+ if (*filep)
+ __fput_sync(*filep);
+ }
+
+ if (ops->abort)
+ ops->abort(obj);
else
- iommufd_object_ops[obj->type].destroy(obj);
+ ops->destroy(obj);
iommufd_object_abort(ictx, obj);
}
@@ -550,16 +575,23 @@ static int iommufd_fops_mmap(struct file *filp, struct vm_area_struct *vma)
if (vma->vm_flags & VM_EXEC)
return -EPERM;
+ mtree_lock(&ictx->mt_mmap);
/* vma->vm_pgoff carries a page-shifted start position to an immap */
immap = mtree_load(&ictx->mt_mmap, vma->vm_pgoff << PAGE_SHIFT);
- if (!immap)
+ if (!immap || !refcount_inc_not_zero(&immap->owner->users)) {
+ mtree_unlock(&ictx->mt_mmap);
return -ENXIO;
+ }
+ mtree_unlock(&ictx->mt_mmap);
+
/*
* mtree_load() returns the immap for any contained mmio_addr, so only
* allow the exact immap thing to be mapped
*/
- if (vma->vm_pgoff != immap->vm_pgoff || length != immap->length)
- return -ENXIO;
+ if (vma->vm_pgoff != immap->vm_pgoff || length != immap->length) {
+ rc = -ENXIO;
+ goto err_refcount;
+ }
vma->vm_pgoff = 0;
vma->vm_private_data = immap;
@@ -570,10 +602,11 @@ static int iommufd_fops_mmap(struct file *filp, struct vm_area_struct *vma)
immap->mmio_addr >> PAGE_SHIFT, length,
vma->vm_page_prot);
if (rc)
- return rc;
+ goto err_refcount;
+ return 0;
- /* vm_ops.open won't be called for mmap itself. */
- refcount_inc(&immap->owner->users);
+err_refcount:
+ refcount_dec(&immap->owner->users);
return rc;
}
@@ -651,6 +684,12 @@ void iommufd_ctx_put(struct iommufd_ctx *ictx)
}
EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, "IOMMUFD");
+#define IOMMUFD_FILE_OFFSET(_struct, _filep, _obj) \
+ .file_offset = (offsetof(_struct, _filep) + \
+ BUILD_BUG_ON_ZERO(!__same_type( \
+ struct file *, ((_struct *)NULL)->_filep)) + \
+ BUILD_BUG_ON_ZERO(offsetof(_struct, _obj)))
+
static const struct iommufd_object_ops iommufd_object_ops[] = {
[IOMMUFD_OBJ_ACCESS] = {
.destroy = iommufd_access_destroy_object,
@@ -661,6 +700,7 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
},
[IOMMUFD_OBJ_FAULT] = {
.destroy = iommufd_fault_destroy,
+ IOMMUFD_FILE_OFFSET(struct iommufd_fault, common.filep, common.obj),
},
[IOMMUFD_OBJ_HW_QUEUE] = {
.destroy = iommufd_hw_queue_destroy,
@@ -683,6 +723,7 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
[IOMMUFD_OBJ_VEVENTQ] = {
.destroy = iommufd_veventq_destroy,
.abort = iommufd_veventq_abort,
+ IOMMUFD_FILE_OFFSET(struct iommufd_veventq, common.filep, common.obj),
},
[IOMMUFD_OBJ_VIOMMU] = {
.destroy = iommufd_viommu_destroy,
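A compact sketch (names invented) of the mechanism behind .file_offset and the abort path above: each object type that owns a struct file records where that pointer lives, the BUILD_BUG_ON_ZERO() terms verify the member really is a struct file * and that the common object header sits at offset zero, and the abort code can then recover the file from nothing but the base object pointer:

#include <linux/build_bug.h>
#include <linux/compiler.h>
#include <linux/stddef.h>

struct demo_obj_header { unsigned int type; };

struct demo_obj_with_file {
	struct demo_obj_header obj;	/* must be the first member */
	struct file *filep;
};

#define DEMO_FILE_OFFSET(_struct, _filep, _obj)				  \
	(offsetof(_struct, _filep) +					  \
	 BUILD_BUG_ON_ZERO(!__same_type(struct file *,			  \
					((_struct *)NULL)->_filep)) +	  \
	 BUILD_BUG_ON_ZERO(offsetof(_struct, _obj)))

static struct file **demo_obj_filep(struct demo_obj_header *obj,
				    size_t file_offset)
{
	/* Valid because the header is the first member of the object. */
	return (struct file **)((void *)obj + file_offset);
}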
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index 2ca5809b238b..462b457ffd0c 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -339,7 +339,7 @@ iommufd_hw_queue_alloc_phys(struct iommu_hw_queue_alloc *cmd,
}
*base_pa = (page_to_pfn(pages[0]) << PAGE_SHIFT) + offset;
- kfree(pages);
+ kvfree(pages);
return access;
out_unpin:
@@ -349,7 +349,7 @@ out_detach:
out_destroy:
iommufd_access_destroy_internal(viommu->ictx, access);
out_free:
- kfree(pages);
+ kvfree(pages);
return ERR_PTR(rc);
}
diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index 2d0d31ba2886..0eae2f4bdc5e 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -1283,7 +1283,7 @@ static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain,
unsigned long *ptr;
ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
- if (_io_pte_none(*ptr) || !_io_pte_present(*ptr))
+ if (!ptr)
return 0;
return pfn_to_phys(__page_val_to_pfn(*ptr)) | (iova & (pte_size - 1));
diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
index 9c80d61deb2c..aa576736d60b 100644
--- a/drivers/iommu/s390-iommu.c
+++ b/drivers/iommu/s390-iommu.c
@@ -612,6 +612,23 @@ static u64 get_iota_region_flag(struct s390_domain *domain)
}
}
+static bool reg_ioat_propagate_error(int cc, u8 status)
+{
+ /*
+ * If the device is in the error state the reset routine
+ * will register the IOAT of the newly set domain on re-enable
+ */
+ if (cc == ZPCI_CC_ERR && status == ZPCI_PCI_ST_FUNC_NOT_AVAIL)
+ return false;
+ /*
+ * If the device was removed treat registration as success
+ * and let the subsequent error event trigger tear down.
+ */
+ if (cc == ZPCI_CC_INVAL_HANDLE)
+ return false;
+ return cc != ZPCI_CC_OK;
+}
+
static int s390_iommu_domain_reg_ioat(struct zpci_dev *zdev,
struct iommu_domain *domain, u8 *status)
{
@@ -696,7 +713,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
/* If we fail now DMA remains blocked via blocking domain */
cc = s390_iommu_domain_reg_ioat(zdev, domain, &status);
- if (cc && status != ZPCI_PCI_ST_FUNC_NOT_AVAIL)
+ if (reg_ioat_propagate_error(cc, status))
return -EIO;
zdev->dma_table = s390_domain->dma_table;
zdev_s390_domain_update(zdev, domain);
@@ -1032,7 +1049,8 @@ struct zpci_iommu_ctrs *zpci_get_iommu_ctrs(struct zpci_dev *zdev)
lockdep_assert_held(&zdev->dom_lock);
- if (zdev->s390_domain->type == IOMMU_DOMAIN_BLOCKED)
+ if (zdev->s390_domain->type == IOMMU_DOMAIN_BLOCKED ||
+ zdev->s390_domain->type == IOMMU_DOMAIN_IDENTITY)
return NULL;
s390_domain = to_s390_domain(zdev->s390_domain);
@@ -1123,12 +1141,7 @@ static int s390_attach_dev_identity(struct iommu_domain *domain,
/* If we fail now DMA remains blocked via blocking domain */
cc = s390_iommu_domain_reg_ioat(zdev, domain, &status);
-
- /*
- * If the device is undergoing error recovery the reset code
- * will re-establish the new domain.
- */
- if (cc && status != ZPCI_PCI_ST_FUNC_NOT_AVAIL)
+ if (reg_ioat_propagate_error(cc, status))
return -EIO;
zdev_s390_domain_update(zdev, domain);
diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
index 532db1de201b..b39d6f134ab2 100644
--- a/drivers/iommu/virtio-iommu.c
+++ b/drivers/iommu/virtio-iommu.c
@@ -998,8 +998,7 @@ static void viommu_get_resv_regions(struct device *dev, struct list_head *head)
iommu_dma_get_resv_regions(dev, head);
}
-static const struct iommu_ops viommu_ops;
-static struct virtio_driver virtio_iommu_drv;
+static const struct bus_type *virtio_bus_type;
static int viommu_match_node(struct device *dev, const void *data)
{
@@ -1008,8 +1007,9 @@ static int viommu_match_node(struct device *dev, const void *data)
static struct viommu_dev *viommu_get_by_fwnode(struct fwnode_handle *fwnode)
{
- struct device *dev = driver_find_device(&virtio_iommu_drv.driver, NULL,
- fwnode, viommu_match_node);
+ struct device *dev = bus_find_device(virtio_bus_type, NULL, fwnode,
+ viommu_match_node);
+
put_device(dev);
return dev ? dev_to_virtio(dev)->priv : NULL;
@@ -1160,6 +1160,9 @@ static int viommu_probe(struct virtio_device *vdev)
if (!viommu)
return -ENOMEM;
+ /* Borrow this for easy lookups later */
+ virtio_bus_type = dev->bus;
+
spin_lock_init(&viommu->request_lock);
ida_init(&viommu->domain_ids);
viommu->dev = dev;
@@ -1229,10 +1232,10 @@ static int viommu_probe(struct virtio_device *vdev)
if (ret)
goto err_free_vqs;
- iommu_device_register(&viommu->iommu, &viommu_ops, parent_dev);
-
vdev->priv = viommu;
+ iommu_device_register(&viommu->iommu, &viommu_ops, parent_dev);
+
dev_info(dev, "input address: %u bits\n",
order_base_2(viommu->geometry.aperture_end));
dev_info(dev, "page mask: %#llx\n", viommu->pgsize_bitmap);
diff --git a/drivers/irqchip/irq-atmel-aic.c b/drivers/irqchip/irq-atmel-aic.c
index 03aeed39a4d2..1dcc52760eca 100644
--- a/drivers/irqchip/irq-atmel-aic.c
+++ b/drivers/irqchip/irq-atmel-aic.c
@@ -188,7 +188,7 @@ static int aic_irq_domain_xlate(struct irq_domain *d,
gc = dgc->gc[idx];
- guard(raw_spinlock_irq)(&gc->lock);
+ guard(raw_spinlock_irqsave)(&gc->lock);
smr = irq_reg_readl(gc, AT91_AIC_SMR(*out_hwirq));
aic_common_set_priority(intspec[2], &smr);
irq_reg_writel(gc, smr, AT91_AIC_SMR(*out_hwirq));
diff --git a/drivers/irqchip/irq-atmel-aic5.c b/drivers/irqchip/irq-atmel-aic5.c
index 60b00d2c3d7a..1f14b401f71d 100644
--- a/drivers/irqchip/irq-atmel-aic5.c
+++ b/drivers/irqchip/irq-atmel-aic5.c
@@ -279,7 +279,7 @@ static int aic5_irq_domain_xlate(struct irq_domain *d,
if (ret)
return ret;
- guard(raw_spinlock_irq)(&bgc->lock);
+ guard(raw_spinlock_irqsave)(&bgc->lock);
irq_reg_writel(bgc, *out_hwirq, AT91_AIC5_SSR);
smr = irq_reg_readl(bgc, AT91_AIC5_SMR);
aic_common_set_priority(intspec[2], &smr);
diff --git a/drivers/irqchip/irq-gic-v5-irs.c b/drivers/irqchip/irq-gic-v5-irs.c
index ad1435a858a4..ce2732d649a3 100644
--- a/drivers/irqchip/irq-gic-v5-irs.c
+++ b/drivers/irqchip/irq-gic-v5-irs.c
@@ -5,6 +5,7 @@
#define pr_fmt(fmt) "GICv5 IRS: " fmt
+#include <linux/kmemleak.h>
#include <linux/log2.h>
#include <linux/of.h>
#include <linux/of_address.h>
@@ -117,6 +118,7 @@ static int __init gicv5_irs_init_ist_linear(struct gicv5_irs_chip_data *irs_data
kfree(ist);
return ret;
}
+ kmemleak_ignore(ist);
return 0;
}
@@ -232,6 +234,7 @@ int gicv5_irs_iste_alloc(const u32 lpi)
kfree(l2ist);
return ret;
}
+ kmemleak_ignore(l2ist);
/*
* Make sure we invalidate the cache line pulled before the IRS
@@ -623,12 +626,14 @@ static int __init gicv5_irs_of_init_affinity(struct device_node *node,
int cpu;
cpu_node = of_parse_phandle(node, "cpus", i);
- if (WARN_ON(!cpu_node))
+ if (!cpu_node) {
+ pr_warn(FW_BUG "Erroneous CPU node phandle\n");
continue;
+ }
cpu = of_cpu_node_to_id(cpu_node);
of_node_put(cpu_node);
- if (WARN_ON(cpu < 0))
+ if (cpu < 0)
continue;
if (iaffids[i] & ~iaffid_mask) {
diff --git a/drivers/irqchip/irq-mvebu-gicp.c b/drivers/irqchip/irq-mvebu-gicp.c
index 54833717f8a7..667bde3c651f 100644
--- a/drivers/irqchip/irq-mvebu-gicp.c
+++ b/drivers/irqchip/irq-mvebu-gicp.c
@@ -238,7 +238,7 @@ static int mvebu_gicp_probe(struct platform_device *pdev)
}
base = ioremap(gicp->res->start, resource_size(gicp->res));
- if (IS_ERR(base)) {
+ if (!base) {
dev_err(&pdev->dev, "ioremap() failed. Unable to clear pending interrupts.\n");
} else {
for (i = 0; i < 64; i++)
diff --git a/drivers/isdn/hardware/mISDN/hfcpci.c b/drivers/isdn/hardware/mISDN/hfcpci.c
index 2b05722d4dbe..ea8a0ab47afd 100644
--- a/drivers/isdn/hardware/mISDN/hfcpci.c
+++ b/drivers/isdn/hardware/mISDN/hfcpci.c
@@ -39,12 +39,13 @@
#include "hfc_pci.h"
+static void hfcpci_softirq(struct timer_list *unused);
static const char *hfcpci_revision = "2.0";
static int HFC_cnt;
static uint debug;
static uint poll, tics;
-static struct timer_list hfc_tl;
+static DEFINE_TIMER(hfc_tl, hfcpci_softirq);
static unsigned long hfc_jiffies;
MODULE_AUTHOR("Karsten Keil");
@@ -2305,8 +2306,7 @@ hfcpci_softirq(struct timer_list *unused)
hfc_jiffies = jiffies + 1;
else
hfc_jiffies += tics;
- hfc_tl.expires = hfc_jiffies;
- add_timer(&hfc_tl);
+ mod_timer(&hfc_tl, hfc_jiffies);
}
static int __init
@@ -2332,10 +2332,8 @@ HFC_init(void)
if (poll != HFCPCI_BTRANS_THRESHOLD) {
printk(KERN_INFO "%s: Using alternative poll value of %d\n",
__func__, poll);
- timer_setup(&hfc_tl, hfcpci_softirq, 0);
- hfc_tl.expires = jiffies + tics;
- hfc_jiffies = hfc_tl.expires;
- add_timer(&hfc_tl);
+ hfc_jiffies = jiffies + tics;
+ mod_timer(&hfc_tl, hfc_jiffies);
} else
tics = 0; /* indicate the use of controller's timer */
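The conversion above relies on DEFINE_TIMER() binding the callback at build time and on mod_timer() both starting and re-arming the timer; a minimal sketch of that pattern with invented names:

#include <linux/jiffies.h>
#include <linux/timer.h>

static void demo_poll(struct timer_list *unused);
static DEFINE_TIMER(demo_timer, demo_poll);

static void demo_poll(struct timer_list *unused)
{
	/* ... periodic work ... */
	mod_timer(&demo_timer, jiffies + msecs_to_jiffies(8));	/* re-arm */
}

static void demo_start(void)
{
	mod_timer(&demo_timer, jiffies + msecs_to_jiffies(8));	/* first arm */
}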
diff --git a/drivers/isdn/mISDN/dsp_hwec.c b/drivers/isdn/mISDN/dsp_hwec.c
index 0b3f29195330..0cd216e28f00 100644
--- a/drivers/isdn/mISDN/dsp_hwec.c
+++ b/drivers/isdn/mISDN/dsp_hwec.c
@@ -51,14 +51,14 @@ void dsp_hwec_enable(struct dsp *dsp, const char *arg)
goto _do;
{
- char *dup, *tok, *name, *val;
+ char *dup, *next, *tok, *name, *val;
int tmp;
- dup = kstrdup(arg, GFP_ATOMIC);
+ dup = next = kstrdup(arg, GFP_ATOMIC);
if (!dup)
return;
- while ((tok = strsep(&dup, ","))) {
+ while ((tok = strsep(&next, ","))) {
if (!strlen(tok))
continue;
name = strsep(&tok, "=");
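
The dsp_hwec fix above addresses how strsep() consumes its argument: it advances the pointer it is given, so parsing directly with the kstrdup() pointer leaves nothing valid to free afterwards (the pointer ends up NULL), presumably leaking the duplicated string. Keeping the allocation in dup and walking a separate next cursor is the usual pattern; a standalone sketch with an invented function name:

#include <linux/string.h>
#include <linux/slab.h>

static void demo_parse(const char *arg)
{
	char *dup, *next, *tok;

	dup = next = kstrdup(arg, GFP_ATOMIC);
	if (!dup)
		return;
	while ((tok = strsep(&next, ","))) {	/* strsep() moves 'next', never 'dup' */
		if (!*tok)
			continue;
		/* ... handle tok ... */
	}
	kfree(dup);				/* still the original allocation */
}
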
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index efeee0a873c0..ab96b692e5a3 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -133,7 +133,7 @@ struct journal_sector {
commit_id_t commit_id;
};
-#define MAX_TAG_SIZE (JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR - offsetof(struct journal_entry, last_bytes[MAX_SECTORS_PER_BLOCK]))
+#define MAX_TAG_SIZE 255
#define METADATA_PADDING_SECTORS 8
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 79ea85d18e24..f4b904e24328 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -3813,8 +3813,10 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
struct raid_set *rs = ti->private;
unsigned int chunk_size_bytes = to_bytes(rs->md.chunk_sectors);
- limits->io_min = chunk_size_bytes;
- limits->io_opt = chunk_size_bytes * mddev_data_stripes(rs);
+ if (chunk_size_bytes) {
+ limits->io_min = chunk_size_bytes;
+ limits->io_opt = chunk_size_bytes * mddev_data_stripes(rs);
+ }
}
static void raid_presuspend(struct dm_target *ti)
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 58902091bf79..1461dc740dae 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -456,11 +456,15 @@ static void stripe_io_hints(struct dm_target *ti,
struct queue_limits *limits)
{
struct stripe_c *sc = ti->private;
- unsigned int chunk_size = sc->chunk_size << SECTOR_SHIFT;
+ unsigned int io_min, io_opt;
limits->chunk_sectors = sc->chunk_size;
- limits->io_min = chunk_size;
- limits->io_opt = chunk_size * sc->stripes;
+
+ if (!check_shl_overflow(sc->chunk_size, SECTOR_SHIFT, &io_min) &&
+ !check_mul_overflow(io_min, sc->stripes, &io_opt)) {
+ limits->io_min = io_min;
+ limits->io_opt = io_opt;
+ }
}
static struct target_type stripe_target = {
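
The dm-stripe hunk above guards the limit computation with the helpers from <linux/overflow.h>: check_shl_overflow() and check_mul_overflow() return true when the shifted or multiplied value overflows the destination type, so io_min/io_opt are only updated when chunk_size << SECTOR_SHIFT and its product with the stripe count both fit in an unsigned int, and keep their stacked defaults otherwise. A reduced sketch of the idiom with invented names:

#include <linux/overflow.h>

static void demo_hints(unsigned int chunk_sectors, unsigned int stripes,
		       unsigned int *io_min, unsigned int *io_opt)
{
	unsigned int min, opt;

	/* Each helper returns true on overflow; the result is consumed
	 * only when neither computation overflowed. */
	if (!check_shl_overflow(chunk_sectors, 9, &min) &&
	    !check_mul_overflow(min, stripes, &opt)) {
		*io_min = min;
		*io_opt = opt;
	}
	/* On overflow the caller's previous values are left untouched. */
}
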
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 5497eaee96e7..6e9a0045f0ff 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -979,7 +979,7 @@ err:
lockres_free(cinfo->resync_lockres);
lockres_free(cinfo->bitmap_lockres);
if (cinfo->lockspace)
- dlm_release_lockspace(cinfo->lockspace, 2);
+ dlm_release_lockspace(cinfo->lockspace, DLM_RELEASE_NORMAL);
mddev->cluster_info = NULL;
kfree(cinfo);
return ret;
@@ -1042,7 +1042,7 @@ static int leave(struct mddev *mddev)
lockres_free(cinfo->resync_lockres);
lockres_free(cinfo->bitmap_lockres);
unlock_all_bitmaps(mddev);
- dlm_release_lockspace(cinfo->lockspace, 2);
+ dlm_release_lockspace(cinfo->lockspace, DLM_RELEASE_NORMAL);
kfree(cinfo);
return 0;
}
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 5d9b08115375..3e1f165c2d20 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -73,6 +73,7 @@ static int linear_set_limits(struct mddev *mddev)
md_init_stacking_limits(&lim);
lim.max_hw_sectors = mddev->chunk_sectors;
lim.max_write_zeroes_sectors = mddev->chunk_sectors;
+ lim.max_hw_wzeroes_unmap_sectors = mddev->chunk_sectors;
lim.io_min = mddev->chunk_sectors << 9;
err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
if (err)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index ac85ec73a409..4e033c26fdd4 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -339,6 +339,7 @@ static int start_readonly;
* so all the races disappear.
*/
static bool create_on_open = true;
+static bool legacy_async_del_gendisk = true;
/*
* We have a system wide 'event count' that is incremented
@@ -877,15 +878,18 @@ void mddev_unlock(struct mddev *mddev)
export_rdev(rdev, mddev);
}
- /* Call del_gendisk after release reconfig_mutex to avoid
- * deadlock (e.g. call del_gendisk under the lock and an
- * access to sysfs files waits the lock)
- * And MD_DELETED is only used for md raid which is set in
- * do_md_stop. dm raid only uses md_stop to stop. So dm raid
- * doesn't need to check MD_DELETED when getting reconfig lock
- */
- if (test_bit(MD_DELETED, &mddev->flags))
- del_gendisk(mddev->gendisk);
+ if (!legacy_async_del_gendisk) {
+ /*
+ * Call del_gendisk after release reconfig_mutex to avoid
+ * deadlock (e.g. call del_gendisk under the lock and an
+ * access to sysfs files waits the lock)
+ * And MD_DELETED is only used for md raid which is set in
+ * do_md_stop. dm raid only uses md_stop to stop. So dm raid
+ * doesn't need to check MD_DELETED when getting reconfig lock
+ */
+ if (test_bit(MD_DELETED, &mddev->flags))
+ del_gendisk(mddev->gendisk);
+ }
}
EXPORT_SYMBOL_GPL(mddev_unlock);
@@ -1419,7 +1423,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, stru
else {
if (sb->events_hi == sb->cp_events_hi &&
sb->events_lo == sb->cp_events_lo) {
- mddev->resync_offset = sb->resync_offset;
+ mddev->resync_offset = sb->recovery_cp;
} else
mddev->resync_offset = 0;
}
@@ -1547,13 +1551,13 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
mddev->minor_version = sb->minor_version;
if (mddev->in_sync)
{
- sb->resync_offset = mddev->resync_offset;
+ sb->recovery_cp = mddev->resync_offset;
sb->cp_events_hi = (mddev->events>>32);
sb->cp_events_lo = (u32)mddev->events;
if (mddev->resync_offset == MaxSector)
sb->state = (1<< MD_SB_CLEAN);
} else
- sb->resync_offset = 0;
+ sb->recovery_cp = 0;
sb->layout = mddev->layout;
sb->chunk_size = mddev->chunk_sectors << 9;
@@ -4835,9 +4839,42 @@ out_unlock:
static struct md_sysfs_entry md_metadata =
__ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
+static bool rdev_needs_recovery(struct md_rdev *rdev, sector_t sectors)
+{
+ return rdev->raid_disk >= 0 &&
+ !test_bit(Journal, &rdev->flags) &&
+ !test_bit(Faulty, &rdev->flags) &&
+ !test_bit(In_sync, &rdev->flags) &&
+ rdev->recovery_offset < sectors;
+}
+
+static enum sync_action md_get_active_sync_action(struct mddev *mddev)
+{
+ struct md_rdev *rdev;
+ bool is_recover = false;
+
+ if (mddev->resync_offset < MaxSector)
+ return ACTION_RESYNC;
+
+ if (mddev->reshape_position != MaxSector)
+ return ACTION_RESHAPE;
+
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev) {
+ if (rdev_needs_recovery(rdev, MaxSector)) {
+ is_recover = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return is_recover ? ACTION_RECOVER : ACTION_IDLE;
+}
+
enum sync_action md_sync_action(struct mddev *mddev)
{
unsigned long recovery = mddev->recovery;
+ enum sync_action active_action;
/*
* frozen has the highest priority, means running sync_thread will be
@@ -4861,8 +4898,17 @@ enum sync_action md_sync_action(struct mddev *mddev)
!test_bit(MD_RECOVERY_NEEDED, &recovery))
return ACTION_IDLE;
- if (test_bit(MD_RECOVERY_RESHAPE, &recovery) ||
- mddev->reshape_position != MaxSector)
+ /*
+ * Check if any sync operation (resync/recover/reshape) is
+ * currently active. This ensures that only one sync operation
+ * can run at a time. Returns the type of active operation, or
+ * ACTION_IDLE if none are active.
+ */
+ active_action = md_get_active_sync_action(mddev);
+ if (active_action != ACTION_IDLE)
+ return active_action;
+
+ if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
return ACTION_RESHAPE;
if (test_bit(MD_RECOVERY_RECOVER, &recovery))
@@ -5818,6 +5864,13 @@ static void md_kobj_release(struct kobject *ko)
{
struct mddev *mddev = container_of(ko, struct mddev, kobj);
+ if (legacy_async_del_gendisk) {
+ if (mddev->sysfs_state)
+ sysfs_put(mddev->sysfs_state);
+ if (mddev->sysfs_level)
+ sysfs_put(mddev->sysfs_level);
+ del_gendisk(mddev->gendisk);
+ }
put_disk(mddev->gendisk);
}
@@ -6021,6 +6074,9 @@ static int md_alloc_and_put(dev_t dev, char *name)
{
struct mddev *mddev = md_alloc(dev, name);
+ if (legacy_async_del_gendisk)
+ pr_warn("md: async del_gendisk mode will be removed in future, please upgrade to mdadm-4.5+\n");
+
if (IS_ERR(mddev))
return PTR_ERR(mddev);
mddev_put(mddev);
@@ -6431,10 +6487,22 @@ static void md_clean(struct mddev *mddev)
mddev->persistent = 0;
mddev->level = LEVEL_NONE;
mddev->clevel[0] = 0;
- /* if UNTIL_STOP is set, it's cleared here */
- mddev->hold_active = 0;
- /* Don't clear MD_CLOSING, or mddev can be opened again. */
- mddev->flags &= BIT_ULL_MASK(MD_CLOSING);
+
+ /*
+ * In legacy_async_del_gendisk mode, userspace can stop the array in the
+ * middle of assembling it and still needs to access it afterwards, so
+ * MD_CLOSING must be cleared here. Without legacy_async_del_gendisk, the
+ * array can't be opened again after it has been stopped, so MD_CLOSING
+ * is left set.
+ */
+ if (legacy_async_del_gendisk && mddev->hold_active) {
+ clear_bit(MD_CLOSING, &mddev->flags);
+ } else {
+ /* if UNTIL_STOP is set, it's cleared here */
+ mddev->hold_active = 0;
+ /* Don't clear MD_CLOSING, or mddev can be opened again. */
+ mddev->flags &= BIT_ULL_MASK(MD_CLOSING);
+ }
mddev->sb_flags = 0;
mddev->ro = MD_RDWR;
mddev->metadata_type[0] = 0;
@@ -6658,7 +6726,8 @@ static int do_md_stop(struct mddev *mddev, int mode)
export_array(mddev);
md_clean(mddev);
- set_bit(MD_DELETED, &mddev->flags);
+ if (!legacy_async_del_gendisk)
+ set_bit(MD_DELETED, &mddev->flags);
}
md_new_event();
sysfs_notify_dirent_safe(mddev->sysfs_state);
@@ -8968,11 +9037,7 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
start = MaxSector;
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev)
- if (rdev->raid_disk >= 0 &&
- !test_bit(Journal, &rdev->flags) &&
- !test_bit(Faulty, &rdev->flags) &&
- !test_bit(In_sync, &rdev->flags) &&
- rdev->recovery_offset < start)
+ if (rdev_needs_recovery(rdev, start))
start = rdev->recovery_offset;
rcu_read_unlock();
@@ -9060,6 +9125,11 @@ void md_do_sync(struct md_thread *thread)
}
action = md_sync_action(mddev);
+ if (action == ACTION_FROZEN || action == ACTION_IDLE) {
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ goto skip;
+ }
+
desc = md_sync_action_name(action);
mddev->last_sync_action = action;
@@ -9331,12 +9401,8 @@ void md_do_sync(struct md_thread *thread)
test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev)
- if (rdev->raid_disk >= 0 &&
- mddev->delta_disks >= 0 &&
- !test_bit(Journal, &rdev->flags) &&
- !test_bit(Faulty, &rdev->flags) &&
- !test_bit(In_sync, &rdev->flags) &&
- rdev->recovery_offset < mddev->curr_resync)
+ if (mddev->delta_disks >= 0 &&
+ rdev_needs_recovery(rdev, mddev->curr_resync))
rdev->recovery_offset = mddev->curr_resync;
rcu_read_unlock();
}
@@ -10392,6 +10458,7 @@ module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
module_param(create_on_open, bool, S_IRUSR|S_IWUSR);
+module_param(legacy_async_del_gendisk, bool, 0600);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index f1d8811a542a..419139ad7663 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -382,6 +382,7 @@ static int raid0_set_limits(struct mddev *mddev)
md_init_stacking_limits(&lim);
lim.max_hw_sectors = mddev->chunk_sectors;
lim.max_write_zeroes_sectors = mddev->chunk_sectors;
+ lim.max_hw_wzeroes_unmap_sectors = mddev->chunk_sectors;
lim.io_min = mddev->chunk_sectors << 9;
lim.io_opt = lim.io_min * mddev->raid_disks;
lim.chunk_sectors = mddev->chunk_sectors;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 408c26398321..d30b82beeb92 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1225,7 +1225,7 @@ static void alloc_behind_master_bio(struct r1bio *r1_bio,
int i = 0;
struct bio *behind_bio = NULL;
- behind_bio = bio_alloc_bioset(NULL, vcnt, 0, GFP_NOIO,
+ behind_bio = bio_alloc_bioset(NULL, vcnt, bio->bi_opf, GFP_NOIO,
&r1_bio->mddev->bio_set);
/* discard op, we don't support writezero/writesame yet */
@@ -3211,6 +3211,7 @@ static int raid1_set_limits(struct mddev *mddev)
md_init_stacking_limits(&lim);
lim.max_write_zeroes_sectors = 0;
+ lim.max_hw_wzeroes_unmap_sectors = 0;
lim.features |= BLK_FEAT_ATOMIC_WRITES;
err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
if (err)
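
On the raid1 hunk above: bio_alloc_bioset() takes the operation and flags (blk_opf_t) for the new bio as its third argument, and the behind-master bio was previously allocated with 0 there; passing bio->bi_opf through presumably keeps the clone's operation consistent with the parent write. A hedged sketch of the call; demo_clone_behind and its parameters are invented:

#include <linux/bio.h>

static struct bio *demo_clone_behind(struct bio *parent, int vcnt,
				     struct bio_set *bs)
{
	/* Third argument is the blk_opf_t of the new bio; inherit the
	 * parent's op + flags instead of 0 so the clone stays a write. */
	return bio_alloc_bioset(NULL, vcnt, parent->bi_opf, GFP_NOIO, bs);
}
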
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index b60c30bfb6c7..9832eefb2f15 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4008,6 +4008,7 @@ static int raid10_set_queue_limits(struct mddev *mddev)
md_init_stacking_limits(&lim);
lim.max_write_zeroes_sectors = 0;
+ lim.max_hw_wzeroes_unmap_sectors = 0;
lim.io_min = mddev->chunk_sectors << 9;
lim.chunk_sectors = mddev->chunk_sectors;
lim.io_opt = lim.io_min * raid10_nr_stripes(conf);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 023649fe2476..e385ef1355e8 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7732,6 +7732,7 @@ static int raid5_set_limits(struct mddev *mddev)
lim.features |= BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE;
lim.discard_granularity = stripe;
lim.max_write_zeroes_sectors = 0;
+ lim.max_hw_wzeroes_unmap_sectors = 0;
mddev_stack_rdev_limits(mddev, &lim, 0);
rdev_for_each(rdev, mddev)
queue_limits_stack_bdev(&lim, rdev->bdev, rdev->new_data_offset,
diff --git a/drivers/media/i2c/alvium-csi2.c b/drivers/media/i2c/alvium-csi2.c
index 05b708bd0a64..1f088acecf36 100644
--- a/drivers/media/i2c/alvium-csi2.c
+++ b/drivers/media/i2c/alvium-csi2.c
@@ -1841,7 +1841,6 @@ static int alvium_s_stream(struct v4l2_subdev *sd, int enable)
} else {
alvium_set_stream_mipi(alvium, enable);
- pm_runtime_mark_last_busy(&client->dev);
pm_runtime_put_autosuspend(&client->dev);
}
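
This hunk and the many similar media-driver hunks that follow all drop pm_runtime_mark_last_busy() immediately before a pm_runtime_put_autosuspend() (or _put_sync_autosuspend()) call. The apparent rationale — stated here as an assumption, not confirmed by the diff itself — is that the runtime-PM put-autosuspend helpers now update the last-busy timestamp internally, making the explicit call redundant. The pattern, as a comment-only sketch:

#include <linux/pm_runtime.h>

static void demo_done(struct device *dev)
{
	/* old:
	 *	pm_runtime_mark_last_busy(dev);
	 *	pm_runtime_put_autosuspend(dev);
	 * new (assumes put_autosuspend marks last-busy itself):
	 */
	pm_runtime_put_autosuspend(dev);
}
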
diff --git a/drivers/media/i2c/ccs/ccs-core.c b/drivers/media/i2c/ccs/ccs-core.c
index 487bcabb4a19..1c889c878abd 100644
--- a/drivers/media/i2c/ccs/ccs-core.c
+++ b/drivers/media/i2c/ccs/ccs-core.c
@@ -787,10 +787,8 @@ static int ccs_set_ctrl(struct v4l2_ctrl *ctrl)
rval = -EINVAL;
}
- if (pm_status > 0) {
- pm_runtime_mark_last_busy(&client->dev);
+ if (pm_status > 0)
pm_runtime_put_autosuspend(&client->dev);
- }
return rval;
}
@@ -1914,7 +1912,6 @@ static int ccs_set_stream(struct v4l2_subdev *subdev, int enable)
if (!enable) {
ccs_stop_streaming(sensor);
sensor->streaming = false;
- pm_runtime_mark_last_busy(&client->dev);
pm_runtime_put_autosuspend(&client->dev);
return 0;
@@ -1929,7 +1926,6 @@ static int ccs_set_stream(struct v4l2_subdev *subdev, int enable)
rval = ccs_start_streaming(sensor);
if (rval < 0) {
sensor->streaming = false;
- pm_runtime_mark_last_busy(&client->dev);
pm_runtime_put_autosuspend(&client->dev);
}
@@ -2677,7 +2673,6 @@ nvm_show(struct device *dev, struct device_attribute *attr, char *buf)
return -ENODEV;
}
- pm_runtime_mark_last_busy(&client->dev);
pm_runtime_put_autosuspend(&client->dev);
/*
diff --git a/drivers/media/i2c/dw9768.c b/drivers/media/i2c/dw9768.c
index 3a4d100b9199..d434721ba8ed 100644
--- a/drivers/media/i2c/dw9768.c
+++ b/drivers/media/i2c/dw9768.c
@@ -374,7 +374,6 @@ static int dw9768_open(struct v4l2_subdev *sd, struct v4l2_subdev_fh *fh)
static int dw9768_close(struct v4l2_subdev *sd, struct v4l2_subdev_fh *fh)
{
- pm_runtime_mark_last_busy(sd->dev);
pm_runtime_put_autosuspend(sd->dev);
return 0;
diff --git a/drivers/media/i2c/gc0308.c b/drivers/media/i2c/gc0308.c
index 069f42785b3c..cbcda0e18ff1 100644
--- a/drivers/media/i2c/gc0308.c
+++ b/drivers/media/i2c/gc0308.c
@@ -974,7 +974,6 @@ static int gc0308_s_ctrl(struct v4l2_ctrl *ctrl)
if (ret)
dev_err(gc0308->dev, "failed to set control: %d\n", ret);
- pm_runtime_mark_last_busy(gc0308->dev);
pm_runtime_put_autosuspend(gc0308->dev);
return ret;
@@ -1157,14 +1156,12 @@ static int gc0308_start_stream(struct gc0308 *gc0308)
return 0;
disable_pm:
- pm_runtime_mark_last_busy(gc0308->dev);
pm_runtime_put_autosuspend(gc0308->dev);
return ret;
}
static int gc0308_stop_stream(struct gc0308 *gc0308)
{
- pm_runtime_mark_last_busy(gc0308->dev);
pm_runtime_put_autosuspend(gc0308->dev);
return 0;
}
diff --git a/drivers/media/i2c/gc2145.c b/drivers/media/i2c/gc2145.c
index ba02161d46e7..559a851669aa 100644
--- a/drivers/media/i2c/gc2145.c
+++ b/drivers/media/i2c/gc2145.c
@@ -963,7 +963,6 @@ static int gc2145_enable_streams(struct v4l2_subdev *sd,
return 0;
err_rpm_put:
- pm_runtime_mark_last_busy(&client->dev);
pm_runtime_put_autosuspend(&client->dev);
return ret;
}
@@ -985,7 +984,6 @@ static int gc2145_disable_streams(struct v4l2_subdev *sd,
if (ret)
dev_err(&client->dev, "%s failed to write regs\n", __func__);
- pm_runtime_mark_last_busy(&client->dev);
pm_runtime_put_autosuspend(&client->dev);
return ret;
@@ -1193,7 +1191,6 @@ static int gc2145_s_ctrl(struct v4l2_ctrl *ctrl)
break;
}
- pm_runtime_mark_last_busy(&client->dev);
pm_runtime_put_autosuspend(&client->dev);
return ret;
diff --git a/drivers/media/i2c/imx219.c b/drivers/media/i2c/imx219.c
index 3b4f68543342..3faf48f34af4 100644
--- a/drivers/media/i2c/imx219.c
+++ b/drivers/media/i2c/imx219.c
@@ -771,7 +771,6 @@ static int imx219_enable_streams(struct v4l2_subdev *sd,
return 0;
err_rpm_put:
- pm_runtime_mark_last_busy(&client->dev);
pm_runtime_put_autosuspend(&client->dev);
return ret;
}
@@ -793,7 +792,6 @@ static int imx219_disable_streams(struct v4l2_subdev *sd,
__v4l2_ctrl_grab(imx219->vflip, false);
__v4l2_ctrl_grab(imx219->hflip, false);
- pm_runtime_mark_last_busy(&client->dev);
pm_runtime_put_autosuspend(&client->dev);
return ret;
diff --git a/drivers/media/i2c/imx283.c b/drivers/media/i2c/imx283.c
index da618c8cbadc..67e8bb432d10 100644
--- a/drivers/media/i2c/imx283.c
+++ b/drivers/media/i2c/imx283.c
@@ -1143,7 +1143,6 @@ static int imx283_enable_streams(struct v4l2_subdev *sd,
return 0;
err_rpm_put:
- pm_runtime_mark_last_busy(imx283->dev);
pm_runtime_put_autosuspend(imx283->dev);
return ret;
@@ -1163,7 +1162,6 @@ static int imx283_disable_streams(struct v4l2_subdev *sd,
if (ret)
dev_err(imx283->dev, "Failed to stop stream\n");
- pm_runtime_mark_last_busy(imx283->dev);
pm_runtime_put_autosuspend(imx283->dev);
return ret;
@@ -1558,7 +1556,6 @@ static int imx283_probe(struct i2c_client *client)
* Decrease the PM usage count. The device will get suspended after the
* autosuspend delay, turning the power off.
*/
- pm_runtime_mark_last_busy(imx283->dev);
pm_runtime_put_autosuspend(imx283->dev);
return 0;
diff --git a/drivers/media/i2c/imx290.c b/drivers/media/i2c/imx290.c
index 4f3f386c5353..ec172556612e 100644
--- a/drivers/media/i2c/imx290.c
+++ b/drivers/media/i2c/imx290.c
@@ -869,7 +869,6 @@ static int imx290_set_ctrl(struct v4l2_ctrl *ctrl)
break;
}
- pm_runtime_mark_last_busy(imx290->dev);
pm_runtime_put_autosuspend(imx290->dev);
return ret;
@@ -1099,7 +1098,6 @@ static int imx290_set_stream(struct v4l2_subdev *sd, int enable)
}
} else {
imx290_stop_streaming(imx290);
- pm_runtime_mark_last_busy(imx290->dev);
pm_runtime_put_autosuspend(imx290->dev);
}
@@ -1294,7 +1292,6 @@ static int imx290_subdev_init(struct imx290 *imx290)
* will already be prevented even before the delay.
*/
v4l2_i2c_subdev_init(&imx290->sd, client, &imx290_subdev_ops);
- pm_runtime_mark_last_busy(imx290->dev);
pm_runtime_put_autosuspend(imx290->dev);
imx290->sd.internal_ops = &imx290_internal_ops;
diff --git a/drivers/media/i2c/imx296.c b/drivers/media/i2c/imx296.c
index f3bec16b527c..61116f4e3f76 100644
--- a/drivers/media/i2c/imx296.c
+++ b/drivers/media/i2c/imx296.c
@@ -604,7 +604,6 @@ static int imx296_s_stream(struct v4l2_subdev *sd, int enable)
if (!enable) {
ret = imx296_stream_off(sensor);
- pm_runtime_mark_last_busy(sensor->dev);
pm_runtime_put_autosuspend(sensor->dev);
goto unlock;
diff --git a/drivers/media/i2c/imx415.c b/drivers/media/i2c/imx415.c
index 278e743646ea..276bf4d6f39d 100644
--- a/drivers/media/i2c/imx415.c
+++ b/drivers/media/i2c/imx415.c
@@ -952,7 +952,6 @@ static int imx415_s_stream(struct v4l2_subdev *sd, int enable)
if (!enable) {
ret = imx415_stream_off(sensor);
- pm_runtime_mark_last_busy(sensor->dev);
pm_runtime_put_autosuspend(sensor->dev);
goto unlock;
diff --git a/drivers/media/i2c/mt9m114.c b/drivers/media/i2c/mt9m114.c
index 3f540ca40f3c..aa3fd6c6c76c 100644
--- a/drivers/media/i2c/mt9m114.c
+++ b/drivers/media/i2c/mt9m114.c
@@ -974,7 +974,6 @@ static int mt9m114_start_streaming(struct mt9m114 *sensor,
return 0;
error:
- pm_runtime_mark_last_busy(&sensor->client->dev);
pm_runtime_put_autosuspend(&sensor->client->dev);
return ret;
@@ -988,7 +987,6 @@ static int mt9m114_stop_streaming(struct mt9m114 *sensor)
ret = mt9m114_set_state(sensor, MT9M114_SYS_STATE_ENTER_SUSPEND);
- pm_runtime_mark_last_busy(&sensor->client->dev);
pm_runtime_put_autosuspend(&sensor->client->dev);
return ret;
@@ -1046,7 +1044,6 @@ static int mt9m114_pa_g_ctrl(struct v4l2_ctrl *ctrl)
break;
}
- pm_runtime_mark_last_busy(&sensor->client->dev);
pm_runtime_put_autosuspend(&sensor->client->dev);
return ret;
@@ -1113,7 +1110,6 @@ static int mt9m114_pa_s_ctrl(struct v4l2_ctrl *ctrl)
break;
}
- pm_runtime_mark_last_busy(&sensor->client->dev);
pm_runtime_put_autosuspend(&sensor->client->dev);
return ret;
@@ -1565,7 +1561,6 @@ static int mt9m114_ifp_s_ctrl(struct v4l2_ctrl *ctrl)
break;
}
- pm_runtime_mark_last_busy(&sensor->client->dev);
pm_runtime_put_autosuspend(&sensor->client->dev);
return ret;
@@ -2472,7 +2467,6 @@ static int mt9m114_probe(struct i2c_client *client)
* Decrease the PM usage count. The device will get suspended after the
* autosuspend delay, turning the power off.
*/
- pm_runtime_mark_last_busy(dev);
pm_runtime_put_autosuspend(dev);
return 0;
diff --git a/drivers/media/i2c/ov4689.c b/drivers/media/i2c/ov4689.c
index 1c3a449f9354..7d740ad3926f 100644
--- a/drivers/media/i2c/ov4689.c
+++ b/drivers/media/i2c/ov4689.c
@@ -497,7 +497,6 @@ static int ov4689_s_stream(struct v4l2_subdev *sd, int on)
} else {
cci_write(ov4689->regmap, OV4689_REG_CTRL_MODE,
OV4689_MODE_SW_STANDBY, NULL);
- pm_runtime_mark_last_busy(dev);
pm_runtime_put_autosuspend(dev);
}
@@ -702,7 +701,6 @@ static int ov4689_set_ctrl(struct v4l2_ctrl *ctrl)
break;
}
- pm_runtime_mark_last_busy(dev);
pm_runtime_put_autosuspend(dev);
return ret;
@@ -999,7 +997,6 @@ static int ov4689_probe(struct i2c_client *client)
goto err_clean_subdev_pm;
}
- pm_runtime_mark_last_busy(dev);
pm_runtime_put_autosuspend(dev);
return 0;
diff --git a/drivers/media/i2c/ov5640.c b/drivers/media/i2c/ov5640.c
index 0dae0438aa80..84198613381d 100644
--- a/drivers/media/i2c/ov5640.c
+++ b/drivers/media/i2c/ov5640.c
@@ -3341,7 +3341,6 @@ static int ov5640_g_volatile_ctrl(struct v4l2_ctrl *ctrl)
break;
}
- pm_runtime_mark_last_busy(&sensor->i2c_client->dev);
pm_runtime_put_autosuspend(&sensor->i2c_client->dev);
return 0;
@@ -3417,7 +3416,6 @@ static int ov5640_s_ctrl(struct v4l2_ctrl *ctrl)
break;
}
- pm_runtime_mark_last_busy(&sensor->i2c_client->dev);
pm_runtime_put_autosuspend(&sensor->i2c_client->dev);
return ret;
@@ -3754,7 +3752,6 @@ out:
mutex_unlock(&sensor->lock);
if (!enable || ret) {
- pm_runtime_mark_last_busy(&sensor->i2c_client->dev);
pm_runtime_put_autosuspend(&sensor->i2c_client->dev);
}
@@ -3965,7 +3962,6 @@ static int ov5640_probe(struct i2c_client *client)
pm_runtime_set_autosuspend_delay(dev, 1000);
pm_runtime_use_autosuspend(dev);
- pm_runtime_mark_last_busy(dev);
pm_runtime_put_autosuspend(dev);
return 0;
diff --git a/drivers/media/i2c/ov5645.c b/drivers/media/i2c/ov5645.c
index 004d0ee5c3f5..58c846a44376 100644
--- a/drivers/media/i2c/ov5645.c
+++ b/drivers/media/i2c/ov5645.c
@@ -808,7 +808,6 @@ static int ov5645_s_ctrl(struct v4l2_ctrl *ctrl)
break;
}
- pm_runtime_mark_last_busy(ov5645->dev);
pm_runtime_put_autosuspend(ov5645->dev);
return ret;
@@ -979,7 +978,6 @@ static int ov5645_disable_streams(struct v4l2_subdev *sd,
OV5645_SYSTEM_CTRL0_STOP);
rpm_put:
- pm_runtime_mark_last_busy(ov5645->dev);
pm_runtime_put_autosuspend(ov5645->dev);
return ret;
@@ -1196,7 +1194,6 @@ static int ov5645_probe(struct i2c_client *client)
pm_runtime_set_autosuspend_delay(dev, 1000);
pm_runtime_use_autosuspend(dev);
- pm_runtime_mark_last_busy(dev);
pm_runtime_put_autosuspend(dev);
return 0;
diff --git a/drivers/media/i2c/ov64a40.c b/drivers/media/i2c/ov64a40.c
index a5da4fe47e0b..2031cbd05c26 100644
--- a/drivers/media/i2c/ov64a40.c
+++ b/drivers/media/i2c/ov64a40.c
@@ -2990,7 +2990,6 @@ static int ov64a40_start_streaming(struct ov64a40 *ov64a40,
return 0;
error_power_off:
- pm_runtime_mark_last_busy(ov64a40->dev);
pm_runtime_put_autosuspend(ov64a40->dev);
return ret;
@@ -3000,7 +2999,6 @@ static int ov64a40_stop_streaming(struct ov64a40 *ov64a40,
struct v4l2_subdev_state *state)
{
cci_update_bits(ov64a40->cci, OV64A40_REG_SMIA, BIT(0), 0, NULL);
- pm_runtime_mark_last_busy(ov64a40->dev);
pm_runtime_put_autosuspend(ov64a40->dev);
__v4l2_ctrl_grab(ov64a40->link_freq, false);
@@ -3329,10 +3327,8 @@ static int ov64a40_set_ctrl(struct v4l2_ctrl *ctrl)
break;
}
- if (pm_status > 0) {
- pm_runtime_mark_last_busy(ov64a40->dev);
+ if (pm_status > 0)
pm_runtime_put_autosuspend(ov64a40->dev);
- }
return ret;
}
@@ -3622,7 +3618,6 @@ static int ov64a40_probe(struct i2c_client *client)
goto error_subdev_cleanup;
}
- pm_runtime_mark_last_busy(&client->dev);
pm_runtime_put_autosuspend(&client->dev);
return 0;
diff --git a/drivers/media/i2c/ov8858.c b/drivers/media/i2c/ov8858.c
index 95f9ae794846..6b7193eaea1f 100644
--- a/drivers/media/i2c/ov8858.c
+++ b/drivers/media/i2c/ov8858.c
@@ -1391,7 +1391,6 @@ static int ov8858_s_stream(struct v4l2_subdev *sd, int on)
}
} else {
ov8858_stop_stream(ov8858);
- pm_runtime_mark_last_busy(&client->dev);
pm_runtime_put_autosuspend(&client->dev);
}
@@ -1945,7 +1944,6 @@ static int ov8858_probe(struct i2c_client *client)
goto err_power_off;
}
- pm_runtime_mark_last_busy(dev);
pm_runtime_put_autosuspend(dev);
return 0;
diff --git a/drivers/media/i2c/st-mipid02.c b/drivers/media/i2c/st-mipid02.c
index f4568e87f018..41ae25b0911f 100644
--- a/drivers/media/i2c/st-mipid02.c
+++ b/drivers/media/i2c/st-mipid02.c
@@ -465,7 +465,6 @@ static int mipid02_disable_streams(struct v4l2_subdev *sd,
if (ret)
goto error;
- pm_runtime_mark_last_busy(&client->dev);
pm_runtime_put_autosuspend(&client->dev);
error:
@@ -542,7 +541,6 @@ error:
cci_write(bridge->regmap, MIPID02_DATA_LANE0_REG1, 0, &ret);
cci_write(bridge->regmap, MIPID02_DATA_LANE1_REG1, 0, &ret);
- pm_runtime_mark_last_busy(&client->dev);
pm_runtime_put_autosuspend(&client->dev);
return ret;
}
diff --git a/drivers/media/i2c/tc358746.c b/drivers/media/i2c/tc358746.c
index 143aa1359aba..bcfc274cf891 100644
--- a/drivers/media/i2c/tc358746.c
+++ b/drivers/media/i2c/tc358746.c
@@ -816,7 +816,6 @@ static int tc358746_s_stream(struct v4l2_subdev *sd, int enable)
return 0;
err_out:
- pm_runtime_mark_last_busy(sd->dev);
pm_runtime_put_sync_autosuspend(sd->dev);
return err;
@@ -838,7 +837,6 @@ err_out:
if (err)
return err;
- pm_runtime_mark_last_busy(sd->dev);
pm_runtime_put_sync_autosuspend(sd->dev);
return v4l2_subdev_call(src, video, s_stream, 0);
@@ -1016,7 +1014,6 @@ tc358746_g_register(struct v4l2_subdev *sd, struct v4l2_dbg_register *reg)
err = tc358746_read(tc358746, reg->reg, &val);
reg->val = val;
- pm_runtime_mark_last_busy(sd->dev);
pm_runtime_put_sync_autosuspend(sd->dev);
return err;
@@ -1032,7 +1029,6 @@ tc358746_s_register(struct v4l2_subdev *sd, const struct v4l2_dbg_register *reg)
tc358746_write(tc358746, (u32)reg->reg, (u32)reg->val);
- pm_runtime_mark_last_busy(sd->dev);
pm_runtime_put_sync_autosuspend(sd->dev);
return 0;
@@ -1395,7 +1391,6 @@ static int tc358746_init_hw(struct tc358746 *tc358746)
}
err = tc358746_read(tc358746, CHIPID_REG, &val);
- pm_runtime_mark_last_busy(dev);
pm_runtime_put_sync_autosuspend(dev);
if (err)
return -ENODEV;
diff --git a/drivers/media/i2c/thp7312.c b/drivers/media/i2c/thp7312.c
index 8852c56431fe..775cfba188d8 100644
--- a/drivers/media/i2c/thp7312.c
+++ b/drivers/media/i2c/thp7312.c
@@ -808,7 +808,6 @@ static int thp7312_s_stream(struct v4l2_subdev *sd, int enable)
if (!enable) {
thp7312_stream_enable(thp7312, false);
- pm_runtime_mark_last_busy(thp7312->dev);
pm_runtime_put_autosuspend(thp7312->dev);
v4l2_subdev_unlock_state(sd_state);
@@ -839,7 +838,6 @@ static int thp7312_s_stream(struct v4l2_subdev *sd, int enable)
goto finish_unlock;
finish_pm:
- pm_runtime_mark_last_busy(thp7312->dev);
pm_runtime_put_autosuspend(thp7312->dev);
finish_unlock:
v4l2_subdev_unlock_state(sd_state);
@@ -1147,7 +1145,6 @@ static int thp7312_s_ctrl(struct v4l2_ctrl *ctrl)
break;
}
- pm_runtime_mark_last_busy(thp7312->dev);
pm_runtime_put_autosuspend(thp7312->dev);
return ret;
@@ -2183,7 +2180,6 @@ static int thp7312_probe(struct i2c_client *client)
* Decrease the PM usage count. The device will get suspended after the
* autosuspend delay, turning the power off.
*/
- pm_runtime_mark_last_busy(dev);
pm_runtime_put_autosuspend(dev);
dev_info(dev, "THP7312 firmware version %02u.%02u\n",
diff --git a/drivers/media/i2c/vd55g1.c b/drivers/media/i2c/vd55g1.c
index c0754fd03b1d..7c39183dd44b 100644
--- a/drivers/media/i2c/vd55g1.c
+++ b/drivers/media/i2c/vd55g1.c
@@ -1104,7 +1104,6 @@ static int vd55g1_disable_streams(struct v4l2_subdev *sd,
vd55g1_grab_ctrls(sensor, false);
- pm_runtime_mark_last_busy(sensor->dev);
pm_runtime_put_autosuspend(sensor->dev);
return ret;
@@ -1338,7 +1337,6 @@ static int vd55g1_g_volatile_ctrl(struct v4l2_ctrl *ctrl)
break;
}
- pm_runtime_mark_last_busy(sensor->dev);
pm_runtime_put_autosuspend(sensor->dev);
return ret;
@@ -1433,7 +1431,6 @@ static int vd55g1_s_ctrl(struct v4l2_ctrl *ctrl)
break;
}
- pm_runtime_mark_last_busy(sensor->dev);
pm_runtime_put_autosuspend(sensor->dev);
return ret;
@@ -1895,7 +1892,6 @@ static int vd55g1_probe(struct i2c_client *client)
pm_runtime_enable(dev);
pm_runtime_set_autosuspend_delay(dev, 4000);
pm_runtime_use_autosuspend(dev);
- pm_runtime_mark_last_busy(dev);
pm_runtime_put_autosuspend(dev);
ret = vd55g1_subdev_init(sensor);
diff --git a/drivers/media/i2c/vd56g3.c b/drivers/media/i2c/vd56g3.c
index 5d951ad0b478..d66e21ba4498 100644
--- a/drivers/media/i2c/vd56g3.c
+++ b/drivers/media/i2c/vd56g3.c
@@ -493,7 +493,6 @@ static int vd56g3_g_volatile_ctrl(struct v4l2_ctrl *ctrl)
break;
}
- pm_runtime_mark_last_busy(sensor->dev);
pm_runtime_put_autosuspend(sensor->dev);
return ret;
@@ -577,7 +576,6 @@ static int vd56g3_s_ctrl(struct v4l2_ctrl *ctrl)
break;
}
- pm_runtime_mark_last_busy(sensor->dev);
pm_runtime_put_autosuspend(sensor->dev);
return ret;
@@ -1021,7 +1019,6 @@ static int vd56g3_disable_streams(struct v4l2_subdev *sd,
__v4l2_ctrl_grab(sensor->vflip_ctrl, false);
__v4l2_ctrl_grab(sensor->patgen_ctrl, false);
- pm_runtime_mark_last_busy(sensor->dev);
pm_runtime_put_autosuspend(sensor->dev);
return ret;
@@ -1527,7 +1524,6 @@ static int vd56g3_probe(struct i2c_client *client)
}
/* Sensor could now be powered off (after the autosuspend delay) */
- pm_runtime_mark_last_busy(dev);
pm_runtime_put_autosuspend(dev);
dev_dbg(dev, "Successfully probe %s sensor\n",
diff --git a/drivers/media/i2c/video-i2c.c b/drivers/media/i2c/video-i2c.c
index 0dd991d70d53..1eee2d4f5b40 100644
--- a/drivers/media/i2c/video-i2c.c
+++ b/drivers/media/i2c/video-i2c.c
@@ -288,7 +288,6 @@ static int amg88xx_read(struct device *dev, enum hwmon_sensor_types type,
return tmp;
tmp = regmap_bulk_read(data->regmap, AMG88XX_REG_TTHL, &buf, 2);
- pm_runtime_mark_last_busy(regmap_get_device(data->regmap));
pm_runtime_put_autosuspend(regmap_get_device(data->regmap));
if (tmp)
return tmp;
@@ -527,7 +526,6 @@ static int start_streaming(struct vb2_queue *vq, unsigned int count)
return 0;
error_rpm_put:
- pm_runtime_mark_last_busy(dev);
pm_runtime_put_autosuspend(dev);
error_del_list:
video_i2c_del_list(vq, VB2_BUF_STATE_QUEUED);
@@ -544,7 +542,6 @@ static void stop_streaming(struct vb2_queue *vq)
kthread_stop(data->kthread_vid_cap);
data->kthread_vid_cap = NULL;
- pm_runtime_mark_last_busy(regmap_get_device(data->regmap));
pm_runtime_put_autosuspend(regmap_get_device(data->regmap));
video_i2c_del_list(vq, VB2_BUF_STATE_ERROR);
@@ -853,7 +850,6 @@ static int video_i2c_probe(struct i2c_client *client)
if (ret < 0)
goto error_pm_disable;
- pm_runtime_mark_last_busy(&client->dev);
pm_runtime_put_autosuspend(&client->dev);
return 0;
diff --git a/drivers/media/platform/chips-media/wave5/wave5-vpu-dec.c b/drivers/media/platform/chips-media/wave5/wave5-vpu-dec.c
index fd71f0c43ac3..a9ce032cc5a2 100644
--- a/drivers/media/platform/chips-media/wave5/wave5-vpu-dec.c
+++ b/drivers/media/platform/chips-media/wave5/wave5-vpu-dec.c
@@ -451,7 +451,6 @@ static void wave5_vpu_dec_finish_decode(struct vpu_instance *inst)
if (q_status.report_queue_count == 0 &&
(q_status.instance_queue_count == 0 || dec_info.sequence_changed)) {
dev_dbg(inst->dev->dev, "%s: finishing job.\n", __func__);
- pm_runtime_mark_last_busy(inst->dev->dev);
pm_runtime_put_autosuspend(inst->dev->dev);
v4l2_m2m_job_finish(inst->v4l2_m2m_dev, m2m_ctx);
}
@@ -1364,7 +1363,6 @@ static int wave5_vpu_dec_start_streaming(struct vb2_queue *q, unsigned int count
}
}
- pm_runtime_mark_last_busy(inst->dev->dev);
pm_runtime_put_autosuspend(inst->dev->dev);
return ret;
@@ -1498,7 +1496,6 @@ static void wave5_vpu_dec_stop_streaming(struct vb2_queue *q)
else
streamoff_capture(q);
- pm_runtime_mark_last_busy(inst->dev->dev);
pm_runtime_put_autosuspend(inst->dev->dev);
}
@@ -1662,7 +1659,6 @@ static void wave5_vpu_dec_device_run(void *priv)
finish_job_and_return:
dev_dbg(inst->dev->dev, "%s: leave and finish job", __func__);
- pm_runtime_mark_last_busy(inst->dev->dev);
pm_runtime_put_autosuspend(inst->dev->dev);
v4l2_m2m_job_finish(inst->v4l2_m2m_dev, m2m_ctx);
}
diff --git a/drivers/media/platform/chips-media/wave5/wave5-vpu-enc.c b/drivers/media/platform/chips-media/wave5/wave5-vpu-enc.c
index 1e5fc5f8b856..35913a7de834 100644
--- a/drivers/media/platform/chips-media/wave5/wave5-vpu-enc.c
+++ b/drivers/media/platform/chips-media/wave5/wave5-vpu-enc.c
@@ -1391,12 +1391,10 @@ static int wave5_vpu_enc_start_streaming(struct vb2_queue *q, unsigned int count
if (ret)
goto return_buffers;
- pm_runtime_mark_last_busy(inst->dev->dev);
pm_runtime_put_autosuspend(inst->dev->dev);
return 0;
return_buffers:
wave5_return_bufs(q, VB2_BUF_STATE_QUEUED);
- pm_runtime_mark_last_busy(inst->dev->dev);
pm_runtime_put_autosuspend(inst->dev->dev);
return ret;
}
@@ -1465,7 +1463,6 @@ static void wave5_vpu_enc_stop_streaming(struct vb2_queue *q)
else
streamoff_capture(inst, q);
- pm_runtime_mark_last_busy(inst->dev->dev);
pm_runtime_put_autosuspend(inst->dev->dev);
}
@@ -1520,7 +1517,6 @@ static void wave5_vpu_enc_device_run(void *priv)
break;
}
dev_dbg(inst->dev->dev, "%s: leave with active job", __func__);
- pm_runtime_mark_last_busy(inst->dev->dev);
pm_runtime_put_autosuspend(inst->dev->dev);
return;
default:
@@ -1529,7 +1525,6 @@ static void wave5_vpu_enc_device_run(void *priv)
break;
}
dev_dbg(inst->dev->dev, "%s: leave and finish job", __func__);
- pm_runtime_mark_last_busy(inst->dev->dev);
pm_runtime_put_autosuspend(inst->dev->dev);
v4l2_m2m_job_finish(inst->v4l2_m2m_dev, m2m_ctx);
}
diff --git a/drivers/media/platform/nvidia/tegra-vde/h264.c b/drivers/media/platform/nvidia/tegra-vde/h264.c
index 0e56a4331b0d..45f8f6904867 100644
--- a/drivers/media/platform/nvidia/tegra-vde/h264.c
+++ b/drivers/media/platform/nvidia/tegra-vde/h264.c
@@ -585,7 +585,6 @@ static int tegra_vde_decode_begin(struct tegra_vde *vde,
return 0;
put_runtime_pm:
- pm_runtime_mark_last_busy(dev);
pm_runtime_put_autosuspend(dev);
unlock:
@@ -612,7 +611,6 @@ static void tegra_vde_decode_abort(struct tegra_vde *vde)
if (err)
dev_err(dev, "DEC end: Failed to assert HW reset: %d\n", err);
- pm_runtime_mark_last_busy(dev);
pm_runtime_put_autosuspend(dev);
mutex_unlock(&vde->lock);
diff --git a/drivers/media/platform/qcom/iris/iris_hfi_queue.c b/drivers/media/platform/qcom/iris/iris_hfi_queue.c
index 221dcd09e1e1..b3ed06297953 100644
--- a/drivers/media/platform/qcom/iris/iris_hfi_queue.c
+++ b/drivers/media/platform/qcom/iris/iris_hfi_queue.c
@@ -142,7 +142,6 @@ int iris_hfi_queue_cmd_write(struct iris_core *core, void *pkt, u32 pkt_size)
}
mutex_unlock(&core->lock);
- pm_runtime_mark_last_busy(core->dev);
pm_runtime_put_autosuspend(core->dev);
return 0;
diff --git a/drivers/media/platform/raspberrypi/pisp_be/pisp_be.c b/drivers/media/platform/raspberrypi/pisp_be/pisp_be.c
index b30891718d8d..d60d92d2ffa1 100644
--- a/drivers/media/platform/raspberrypi/pisp_be/pisp_be.c
+++ b/drivers/media/platform/raspberrypi/pisp_be/pisp_be.c
@@ -950,7 +950,6 @@ static void pispbe_node_stop_streaming(struct vb2_queue *q)
kfree(job);
}
- pm_runtime_mark_last_busy(pispbe->dev);
pm_runtime_put_autosuspend(pispbe->dev);
dev_dbg(pispbe->dev, "Nodes streaming now 0x%x\n",
@@ -1742,7 +1741,6 @@ static int pispbe_probe(struct platform_device *pdev)
if (ret)
goto disable_devs_err;
- pm_runtime_mark_last_busy(pispbe->dev);
pm_runtime_put_autosuspend(pispbe->dev);
return 0;
diff --git a/drivers/media/platform/rockchip/rkvdec/rkvdec.c b/drivers/media/platform/rockchip/rkvdec/rkvdec.c
index d707088ec0dc..d3b31f461194 100644
--- a/drivers/media/platform/rockchip/rkvdec/rkvdec.c
+++ b/drivers/media/platform/rockchip/rkvdec/rkvdec.c
@@ -765,7 +765,6 @@ static void rkvdec_job_finish(struct rkvdec_ctx *ctx,
{
struct rkvdec_dev *rkvdec = ctx->dev;
- pm_runtime_mark_last_busy(rkvdec->dev);
pm_runtime_put_autosuspend(rkvdec->dev);
rkvdec_job_finish_no_pm(ctx, result);
}
@@ -1159,13 +1158,6 @@ static int rkvdec_probe(struct platform_device *pdev)
return ret;
}
- if (iommu_get_domain_for_dev(&pdev->dev)) {
- rkvdec->empty_domain = iommu_paging_domain_alloc(rkvdec->dev);
-
- if (!rkvdec->empty_domain)
- dev_warn(rkvdec->dev, "cannot alloc new empty domain\n");
- }
-
vb2_dma_contig_set_max_seg_size(&pdev->dev, DMA_BIT_MASK(32));
irq = platform_get_irq(pdev, 0);
@@ -1188,6 +1180,15 @@ static int rkvdec_probe(struct platform_device *pdev)
if (ret)
goto err_disable_runtime_pm;
+ if (iommu_get_domain_for_dev(&pdev->dev)) {
+ rkvdec->empty_domain = iommu_paging_domain_alloc(rkvdec->dev);
+
+ if (IS_ERR(rkvdec->empty_domain)) {
+ rkvdec->empty_domain = NULL;
+ dev_warn(rkvdec->dev, "cannot alloc new empty domain\n");
+ }
+ }
+
return 0;
err_disable_runtime_pm:
diff --git a/drivers/media/platform/verisilicon/hantro_drv.c b/drivers/media/platform/verisilicon/hantro_drv.c
index 8542238e0fb1..fa972effd4a2 100644
--- a/drivers/media/platform/verisilicon/hantro_drv.c
+++ b/drivers/media/platform/verisilicon/hantro_drv.c
@@ -89,7 +89,6 @@ static void hantro_job_finish(struct hantro_dev *vpu,
struct hantro_ctx *ctx,
enum vb2_buffer_state result)
{
- pm_runtime_mark_last_busy(vpu->dev);
pm_runtime_put_autosuspend(vpu->dev);
clk_bulk_disable(vpu->variant->num_clocks, vpu->clocks);
diff --git a/drivers/media/rc/gpio-ir-recv.c b/drivers/media/rc/gpio-ir-recv.c
index bf6d8fa983bf..a6418ef782bc 100644
--- a/drivers/media/rc/gpio-ir-recv.c
+++ b/drivers/media/rc/gpio-ir-recv.c
@@ -48,10 +48,8 @@ static irqreturn_t gpio_ir_recv_irq(int irq, void *dev_id)
if (val >= 0)
ir_raw_event_store_edge(gpio_dev->rcdev, val == 1);
- if (pmdev) {
- pm_runtime_mark_last_busy(pmdev);
+ if (pmdev)
pm_runtime_put_autosuspend(pmdev);
- }
return IRQ_HANDLED;
}
diff --git a/drivers/memstick/core/memstick.c b/drivers/memstick/core/memstick.c
index 7f3f47db4c98..e4275f8ee5db 100644
--- a/drivers/memstick/core/memstick.c
+++ b/drivers/memstick/core/memstick.c
@@ -555,7 +555,6 @@ EXPORT_SYMBOL(memstick_add_host);
*/
void memstick_remove_host(struct memstick_host *host)
{
- host->removing = 1;
flush_workqueue(workqueue);
mutex_lock(&host->lock);
if (host->card)
diff --git a/drivers/memstick/host/rtsx_usb_ms.c b/drivers/memstick/host/rtsx_usb_ms.c
index 3878136227e4..5b5e9354fb2e 100644
--- a/drivers/memstick/host/rtsx_usb_ms.c
+++ b/drivers/memstick/host/rtsx_usb_ms.c
@@ -812,6 +812,7 @@ static void rtsx_usb_ms_drv_remove(struct platform_device *pdev)
int err;
host->eject = true;
+ msh->removing = true;
cancel_work_sync(&host->handle_req);
cancel_delayed_work_sync(&host->poll_card);
diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c
index c44de892a61e..5372ed2a363e 100644
--- a/drivers/misc/ibmasm/ibmasmfs.c
+++ b/drivers/misc/ibmasm/ibmasmfs.c
@@ -94,7 +94,7 @@ static int ibmasmfs_init_fs_context(struct fs_context *fc)
static const struct super_operations ibmasmfs_s_ops = {
.statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
};
static const struct file_operations *ibmasmfs_dir_ops = &simple_dir_operations;
diff --git a/drivers/misc/lkdtm/cfi.c b/drivers/misc/lkdtm/cfi.c
index 6a33889d0902..c3971f7caa65 100644
--- a/drivers/misc/lkdtm/cfi.c
+++ b/drivers/misc/lkdtm/cfi.c
@@ -43,7 +43,7 @@ static void lkdtm_CFI_FORWARD_PROTO(void)
lkdtm_indirect_call((void *)lkdtm_increment_int);
pr_err("FAIL: survived mismatched prototype function call!\n");
- pr_expected_config(CONFIG_CFI_CLANG);
+ pr_expected_config(CONFIG_CFI);
}
/*
diff --git a/drivers/misc/lkdtm/fortify.c b/drivers/misc/lkdtm/fortify.c
index 015927665678..00ed2147113e 100644
--- a/drivers/misc/lkdtm/fortify.c
+++ b/drivers/misc/lkdtm/fortify.c
@@ -44,6 +44,9 @@ static void lkdtm_FORTIFY_STR_MEMBER(void)
char *src;
src = kmalloc(size, GFP_KERNEL);
+ if (!src)
+ return;
+
strscpy(src, "over ten bytes", size);
size = strlen(src) + 1;
@@ -109,6 +112,9 @@ static void lkdtm_FORTIFY_MEM_MEMBER(void)
char *src;
src = kmalloc(size, GFP_KERNEL);
+ if (!src)
+ return;
+
strscpy(src, "over ten bytes", size);
size = strlen(src) + 1;
diff --git a/drivers/mmc/host/mvsdio.c b/drivers/mmc/host/mvsdio.c
index a9e6277789ba..79df2fa89a3f 100644
--- a/drivers/mmc/host/mvsdio.c
+++ b/drivers/mmc/host/mvsdio.c
@@ -292,7 +292,7 @@ static u32 mvsd_finish_data(struct mvsd_host *host, struct mmc_data *data,
host->pio_ptr = NULL;
host->pio_size = 0;
} else {
- dma_unmap_sg(mmc_dev(host->mmc), data->sg, host->sg_frags,
+ dma_unmap_sg(mmc_dev(host->mmc), data->sg, data->sg_len,
mmc_get_dma_dir(data));
}
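
The mvsdio hunk above follows the DMA-API pairing rule: dma_unmap_sg() must be passed the same nents value that was given to dma_map_sg(), not the (possibly smaller) mapped count that dma_map_sg() returned — assuming data->sg_len is what the driver originally mapped. A short sketch of the rule with invented names:

#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>

static int demo_dma(struct device *dev, struct scatterlist *sg, int nents,
		    enum dma_data_direction dir)
{
	int mapped = dma_map_sg(dev, sg, nents, dir);

	if (!mapped)
		return -EIO;
	/* ... program the transfer using the 'mapped' entries ... */
	dma_unmap_sg(dev, sg, nents, dir);	/* original nents, not 'mapped' */
	return 0;
}
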
diff --git a/drivers/mmc/host/sdhci-of-arasan.c b/drivers/mmc/host/sdhci-of-arasan.c
index 42878474e56e..60dbc815e501 100644
--- a/drivers/mmc/host/sdhci-of-arasan.c
+++ b/drivers/mmc/host/sdhci-of-arasan.c
@@ -99,6 +99,9 @@
#define HIWORD_UPDATE(val, mask, shift) \
((val) << (shift) | (mask) << ((shift) + 16))
+#define CD_STABLE_TIMEOUT_US 1000000
+#define CD_STABLE_MAX_SLEEP_US 10
+
/**
* struct sdhci_arasan_soc_ctl_field - Field used in sdhci_arasan_soc_ctl_map
*
@@ -206,12 +209,15 @@ struct sdhci_arasan_data {
* 19MHz instead
*/
#define SDHCI_ARASAN_QUIRK_CLOCK_25_BROKEN BIT(2)
+/* Enable CD stable check before power-up */
+#define SDHCI_ARASAN_QUIRK_ENSURE_CD_STABLE BIT(3)
};
struct sdhci_arasan_of_data {
const struct sdhci_arasan_soc_ctl_map *soc_ctl_map;
const struct sdhci_pltfm_data *pdata;
const struct sdhci_arasan_clk_ops *clk_ops;
+ u32 quirks;
};
static const struct sdhci_arasan_soc_ctl_map rk3399_soc_ctl_map = {
@@ -514,6 +520,24 @@ static int sdhci_arasan_voltage_switch(struct mmc_host *mmc,
return -EINVAL;
}
+static void sdhci_arasan_set_power_and_bus_voltage(struct sdhci_host *host, unsigned char mode,
+ unsigned short vdd)
+{
+ struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
+ struct sdhci_arasan_data *sdhci_arasan = sdhci_pltfm_priv(pltfm_host);
+ u32 reg;
+
+ /*
+ * Ensure that the card detect logic has stabilized before powering up;
+ * this is necessary after a host controller reset.
+ */
+ if (mode == MMC_POWER_UP && sdhci_arasan->quirks & SDHCI_ARASAN_QUIRK_ENSURE_CD_STABLE)
+ read_poll_timeout(sdhci_readl, reg, reg & SDHCI_CD_STABLE, CD_STABLE_MAX_SLEEP_US,
+ CD_STABLE_TIMEOUT_US, false, host, SDHCI_PRESENT_STATE);
+
+ sdhci_set_power_and_bus_voltage(host, mode, vdd);
+}
+
static const struct sdhci_ops sdhci_arasan_ops = {
.set_clock = sdhci_arasan_set_clock,
.get_max_clock = sdhci_pltfm_clk_get_max_clock,
@@ -521,7 +545,7 @@ static const struct sdhci_ops sdhci_arasan_ops = {
.set_bus_width = sdhci_set_bus_width,
.reset = sdhci_arasan_reset,
.set_uhs_signaling = sdhci_set_uhs_signaling,
- .set_power = sdhci_set_power_and_bus_voltage,
+ .set_power = sdhci_arasan_set_power_and_bus_voltage,
.hw_reset = sdhci_arasan_hw_reset,
};
@@ -570,7 +594,7 @@ static const struct sdhci_ops sdhci_arasan_cqe_ops = {
.set_bus_width = sdhci_set_bus_width,
.reset = sdhci_arasan_reset,
.set_uhs_signaling = sdhci_set_uhs_signaling,
- .set_power = sdhci_set_power_and_bus_voltage,
+ .set_power = sdhci_arasan_set_power_and_bus_voltage,
.irq = sdhci_arasan_cqhci_irq,
};
@@ -1447,6 +1471,7 @@ static const struct sdhci_arasan_clk_ops zynqmp_clk_ops = {
static struct sdhci_arasan_of_data sdhci_arasan_zynqmp_data = {
.pdata = &sdhci_arasan_zynqmp_pdata,
.clk_ops = &zynqmp_clk_ops,
+ .quirks = SDHCI_ARASAN_QUIRK_ENSURE_CD_STABLE,
};
static const struct sdhci_arasan_clk_ops versal_clk_ops = {
@@ -1457,6 +1482,7 @@ static const struct sdhci_arasan_clk_ops versal_clk_ops = {
static struct sdhci_arasan_of_data sdhci_arasan_versal_data = {
.pdata = &sdhci_arasan_zynqmp_pdata,
.clk_ops = &versal_clk_ops,
+ .quirks = SDHCI_ARASAN_QUIRK_ENSURE_CD_STABLE,
};
static const struct sdhci_arasan_clk_ops versal_net_clk_ops = {
@@ -1467,6 +1493,7 @@ static const struct sdhci_arasan_clk_ops versal_net_clk_ops = {
static struct sdhci_arasan_of_data sdhci_arasan_versal_net_data = {
.pdata = &sdhci_arasan_versal_net_pdata,
.clk_ops = &versal_net_clk_ops,
+ .quirks = SDHCI_ARASAN_QUIRK_ENSURE_CD_STABLE,
};
static struct sdhci_arasan_of_data intel_keembay_emmc_data = {
@@ -1937,6 +1964,8 @@ static int sdhci_arasan_probe(struct platform_device *pdev)
if (of_device_is_compatible(np, "rockchip,rk3399-sdhci-5.1"))
sdhci_arasan_update_clockmultiplier(host, 0x0);
+ sdhci_arasan->quirks |= data->quirks;
+
if (of_device_is_compatible(np, "intel,keembay-sdhci-5.1-emmc") ||
of_device_is_compatible(np, "intel,keembay-sdhci-5.1-sd") ||
of_device_is_compatible(np, "intel,keembay-sdhci-5.1-sdio")) {
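
On the Arasan CD-stable wait added above: read_poll_timeout() from <linux/iopoll.h> repeatedly evaluates the accessor (here sdhci_readl(host, SDHCI_PRESENT_STATE)) into a value, sleeping between attempts, until the condition is true or the timeout elapses, returning 0 or -ETIMEDOUT. A reduced sketch of the same polling idiom; demo_wait_cd_stable is an invented wrapper and the sdhci helpers come from the driver's "sdhci.h":

#include <linux/iopoll.h>
#include "sdhci.h"	/* struct sdhci_host, sdhci_readl(), SDHCI_* */

static int demo_wait_cd_stable(struct sdhci_host *host)
{
	u32 reg;

	/* Poll PRESENT_STATE until card-detect reports stable; the bounds
	 * mirror CD_STABLE_MAX_SLEEP_US / CD_STABLE_TIMEOUT_US above. */
	return read_poll_timeout(sdhci_readl, reg, reg & SDHCI_CD_STABLE,
				 10, 1000000, false,
				 host, SDHCI_PRESENT_STATE);
}
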
diff --git a/drivers/mmc/host/sdhci-pci-gli.c b/drivers/mmc/host/sdhci-pci-gli.c
index 4c2ae71770f7..b0f91cc9e40e 100644
--- a/drivers/mmc/host/sdhci-pci-gli.c
+++ b/drivers/mmc/host/sdhci-pci-gli.c
@@ -283,10 +283,26 @@
#define PCIE_GLI_9767_UHS2_CTL2_ZC_VALUE 0xb
#define PCIE_GLI_9767_UHS2_CTL2_ZC_CTL BIT(6)
#define PCIE_GLI_9767_UHS2_CTL2_ZC_CTL_VALUE 0x1
+#define PCIE_GLI_9767_UHS2_CTL2_FORCE_PHY_RESETN BIT(13)
+#define PCIE_GLI_9767_UHS2_CTL2_FORCE_RESETN_VALUE BIT(14)
#define GLI_MAX_TUNING_LOOP 40
/* Genesys Logic chipset */
+static void sdhci_gli_mask_replay_timer_timeout(struct pci_dev *pdev)
+{
+ int aer;
+ u32 value;
+
+ /* mask the replay timer timeout of AER */
+ aer = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ERR);
+ if (aer) {
+ pci_read_config_dword(pdev, aer + PCI_ERR_COR_MASK, &value);
+ value |= PCI_ERR_COR_REP_TIMER;
+ pci_write_config_dword(pdev, aer + PCI_ERR_COR_MASK, value);
+ }
+}
+
static inline void gl9750_wt_on(struct sdhci_host *host)
{
u32 wt_value;
@@ -607,7 +623,6 @@ static void gl9750_hw_setting(struct sdhci_host *host)
{
struct sdhci_pci_slot *slot = sdhci_priv(host);
struct pci_dev *pdev;
- int aer;
u32 value;
pdev = slot->chip->pdev;
@@ -626,12 +641,7 @@ static void gl9750_hw_setting(struct sdhci_host *host)
pci_set_power_state(pdev, PCI_D0);
/* mask the replay timer timeout of AER */
- aer = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ERR);
- if (aer) {
- pci_read_config_dword(pdev, aer + PCI_ERR_COR_MASK, &value);
- value |= PCI_ERR_COR_REP_TIMER;
- pci_write_config_dword(pdev, aer + PCI_ERR_COR_MASK, value);
- }
+ sdhci_gli_mask_replay_timer_timeout(pdev);
gl9750_wt_off(host);
}
@@ -806,7 +816,6 @@ static void sdhci_gl9755_set_clock(struct sdhci_host *host, unsigned int clock)
static void gl9755_hw_setting(struct sdhci_pci_slot *slot)
{
struct pci_dev *pdev = slot->chip->pdev;
- int aer;
u32 value;
gl9755_wt_on(pdev);
@@ -841,12 +850,7 @@ static void gl9755_hw_setting(struct sdhci_pci_slot *slot)
pci_set_power_state(pdev, PCI_D0);
/* mask the replay timer timeout of AER */
- aer = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ERR);
- if (aer) {
- pci_read_config_dword(pdev, aer + PCI_ERR_COR_MASK, &value);
- value |= PCI_ERR_COR_REP_TIMER;
- pci_write_config_dword(pdev, aer + PCI_ERR_COR_MASK, value);
- }
+ sdhci_gli_mask_replay_timer_timeout(pdev);
gl9755_wt_off(pdev);
}
@@ -1177,6 +1181,65 @@ static void gl9767_set_low_power_negotiation(struct pci_dev *pdev, bool enable)
gl9767_vhs_read(pdev);
}
+static void sdhci_gl9767_uhs2_phy_reset(struct sdhci_host *host, bool assert)
+{
+ struct sdhci_pci_slot *slot = sdhci_priv(host);
+ struct pci_dev *pdev = slot->chip->pdev;
+ u32 value, set, clr;
+
+ if (assert) {
+ /* Assert reset: set RESETN and clear RESETN_VALUE */
+ set = PCIE_GLI_9767_UHS2_CTL2_FORCE_PHY_RESETN;
+ clr = PCIE_GLI_9767_UHS2_CTL2_FORCE_RESETN_VALUE;
+ } else {
+ /* De-assert reset: clear RESETN and set RESETN_VALUE */
+ set = PCIE_GLI_9767_UHS2_CTL2_FORCE_RESETN_VALUE;
+ clr = PCIE_GLI_9767_UHS2_CTL2_FORCE_PHY_RESETN;
+ }
+
+ gl9767_vhs_write(pdev);
+ pci_read_config_dword(pdev, PCIE_GLI_9767_UHS2_CTL2, &value);
+ value |= set;
+ pci_write_config_dword(pdev, PCIE_GLI_9767_UHS2_CTL2, value);
+ value &= ~clr;
+ pci_write_config_dword(pdev, PCIE_GLI_9767_UHS2_CTL2, value);
+ gl9767_vhs_read(pdev);
+}
+
+static void __gl9767_uhs2_set_power(struct sdhci_host *host, unsigned char mode, unsigned short vdd)
+{
+ u8 pwr = 0;
+
+ if (mode != MMC_POWER_OFF) {
+ pwr = sdhci_get_vdd_value(vdd);
+ if (!pwr)
+ WARN(1, "%s: Invalid vdd %#x\n",
+ mmc_hostname(host->mmc), vdd);
+ pwr |= SDHCI_VDD2_POWER_180;
+ }
+
+ if (host->pwr == pwr)
+ return;
+
+ host->pwr = pwr;
+
+ if (pwr == 0) {
+ sdhci_writeb(host, 0, SDHCI_POWER_CONTROL);
+ } else {
+ sdhci_writeb(host, 0, SDHCI_POWER_CONTROL);
+
+ pwr |= SDHCI_POWER_ON;
+ sdhci_writeb(host, pwr & 0xf, SDHCI_POWER_CONTROL);
+ usleep_range(5000, 6250);
+
+ /* Assert reset */
+ sdhci_gl9767_uhs2_phy_reset(host, true);
+ pwr |= SDHCI_VDD2_POWER_ON;
+ sdhci_writeb(host, pwr, SDHCI_POWER_CONTROL);
+ usleep_range(5000, 6250);
+ }
+}
+
static void sdhci_gl9767_set_clock(struct sdhci_host *host, unsigned int clock)
{
struct sdhci_pci_slot *slot = sdhci_priv(host);
@@ -1203,6 +1266,11 @@ static void sdhci_gl9767_set_clock(struct sdhci_host *host, unsigned int clock)
}
sdhci_enable_clk(host, clk);
+
+ if (mmc_card_uhs2(host->mmc))
+ /* De-assert reset */
+ sdhci_gl9767_uhs2_phy_reset(host, false);
+
gl9767_set_low_power_negotiation(pdev, true);
}
@@ -1474,7 +1542,7 @@ static void sdhci_gl9767_set_power(struct sdhci_host *host, unsigned char mode,
gl9767_vhs_read(pdev);
sdhci_gli_overcurrent_event_enable(host, false);
- sdhci_uhs2_set_power(host, mode, vdd);
+ __gl9767_uhs2_set_power(host, mode, vdd);
sdhci_gli_overcurrent_event_enable(host, true);
} else {
gl9767_vhs_write(pdev);
@@ -1751,7 +1819,7 @@ cleanup:
return ret;
}
-static void gli_set_gl9763e(struct sdhci_pci_slot *slot)
+static void gl9763e_hw_setting(struct sdhci_pci_slot *slot)
{
struct pci_dev *pdev = slot->chip->pdev;
u32 value;
@@ -1780,6 +1848,9 @@ static void gli_set_gl9763e(struct sdhci_pci_slot *slot)
value |= FIELD_PREP(GLI_9763E_HS400_RXDLY, GLI_9763E_HS400_RXDLY_5);
pci_write_config_dword(pdev, PCIE_GLI_9763E_CLKRXDLY, value);
+ /* mask the replay timer timeout of AER */
+ sdhci_gli_mask_replay_timer_timeout(pdev);
+
pci_read_config_dword(pdev, PCIE_GLI_9763E_VHS, &value);
value &= ~GLI_9763E_VHS_REV;
value |= FIELD_PREP(GLI_9763E_VHS_REV, GLI_9763E_VHS_REV_R);
@@ -1923,7 +1994,7 @@ static int gli_probe_slot_gl9763e(struct sdhci_pci_slot *slot)
gli_pcie_enable_msi(slot);
host->mmc_host_ops.hs400_enhanced_strobe =
gl9763e_hs400_enhanced_strobe;
- gli_set_gl9763e(slot);
+ gl9763e_hw_setting(slot);
sdhci_enable_v4_mode(host);
return 0;
diff --git a/drivers/mmc/host/sdhci-uhs2.c b/drivers/mmc/host/sdhci-uhs2.c
index 0efeb9d0c376..c459a08d01da 100644
--- a/drivers/mmc/host/sdhci-uhs2.c
+++ b/drivers/mmc/host/sdhci-uhs2.c
@@ -295,7 +295,8 @@ static void __sdhci_uhs2_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
else
sdhci_uhs2_set_power(host, ios->power_mode, ios->vdd);
- sdhci_set_clock(host, host->clock);
+ host->ops->set_clock(host, ios->clock);
+ host->clock = ios->clock;
}
static int sdhci_uhs2_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c
index 3a17821efa5c..ac7e11f37af7 100644
--- a/drivers/mmc/host/sdhci.c
+++ b/drivers/mmc/host/sdhci.c
@@ -2367,23 +2367,6 @@ void sdhci_set_ios_common(struct mmc_host *mmc, struct mmc_ios *ios)
(ios->power_mode == MMC_POWER_UP) &&
!(host->quirks2 & SDHCI_QUIRK2_PRESET_VALUE_BROKEN))
sdhci_enable_preset_value(host, false);
-
- if (!ios->clock || ios->clock != host->clock) {
- host->ops->set_clock(host, ios->clock);
- host->clock = ios->clock;
-
- if (host->quirks & SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK &&
- host->clock) {
- host->timeout_clk = mmc->actual_clock ?
- mmc->actual_clock / 1000 :
- host->clock / 1000;
- mmc->max_busy_timeout =
- host->ops->get_max_timeout_count ?
- host->ops->get_max_timeout_count(host) :
- 1 << 27;
- mmc->max_busy_timeout /= host->timeout_clk;
- }
- }
}
EXPORT_SYMBOL_GPL(sdhci_set_ios_common);
@@ -2410,6 +2393,23 @@ void sdhci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
sdhci_set_ios_common(mmc, ios);
+ if (!ios->clock || ios->clock != host->clock) {
+ host->ops->set_clock(host, ios->clock);
+ host->clock = ios->clock;
+
+ if (host->quirks & SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK &&
+ host->clock) {
+ host->timeout_clk = mmc->actual_clock ?
+ mmc->actual_clock / 1000 :
+ host->clock / 1000;
+ mmc->max_busy_timeout =
+ host->ops->get_max_timeout_count ?
+ host->ops->get_max_timeout_count(host) :
+ 1 << 27;
+ mmc->max_busy_timeout /= host->timeout_clk;
+ }
+ }
+
if (host->ops->set_power)
host->ops->set_power(host, ios->power_mode, ios->vdd);
else
diff --git a/drivers/mmc/host/sdhci_am654.c b/drivers/mmc/host/sdhci_am654.c
index e4fc345be7e5..17e62c61b6e6 100644
--- a/drivers/mmc/host/sdhci_am654.c
+++ b/drivers/mmc/host/sdhci_am654.c
@@ -156,6 +156,7 @@ struct sdhci_am654_data {
#define SDHCI_AM654_QUIRK_FORCE_CDTEST BIT(0)
#define SDHCI_AM654_QUIRK_SUPPRESS_V1P8_ENA BIT(1)
+#define SDHCI_AM654_QUIRK_DISABLE_HS400 BIT(2)
};
struct window {
@@ -765,6 +766,7 @@ static int sdhci_am654_init(struct sdhci_host *host)
{
struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
struct sdhci_am654_data *sdhci_am654 = sdhci_pltfm_priv(pltfm_host);
+ struct device *dev = mmc_dev(host->mmc);
u32 ctl_cfg_2 = 0;
u32 mask;
u32 val;
@@ -820,6 +822,12 @@ static int sdhci_am654_init(struct sdhci_host *host)
if (ret)
goto err_cleanup_host;
+ if (sdhci_am654->quirks & SDHCI_AM654_QUIRK_DISABLE_HS400 &&
+ host->mmc->caps2 & (MMC_CAP2_HS400 | MMC_CAP2_HS400_ES)) {
+ dev_info(dev, "HS400 mode not supported on this silicon revision, disabling it\n");
+ host->mmc->caps2 &= ~(MMC_CAP2_HS400 | MMC_CAP2_HS400_ES);
+ }
+
ret = __sdhci_add_host(host);
if (ret)
goto err_cleanup_host;
@@ -883,6 +891,12 @@ static int sdhci_am654_get_of_property(struct platform_device *pdev,
return 0;
}
+static const struct soc_device_attribute sdhci_am654_descope_hs400[] = {
+ { .family = "AM62PX", .revision = "SR1.0" },
+ { .family = "AM62PX", .revision = "SR1.1" },
+ { /* sentinel */ }
+};
+
static const struct of_device_id sdhci_am654_of_match[] = {
{
.compatible = "ti,am654-sdhci-5.1",
@@ -970,6 +984,10 @@ static int sdhci_am654_probe(struct platform_device *pdev)
if (ret)
return dev_err_probe(dev, ret, "parsing dt failed\n");
+ soc = soc_device_match(sdhci_am654_descope_hs400);
+ if (soc)
+ sdhci_am654->quirks |= SDHCI_AM654_QUIRK_DISABLE_HS400;
+
host->mmc_host_ops.start_signal_voltage_switch = sdhci_am654_start_signal_voltage_switch;
host->mmc_host_ops.execute_tuning = sdhci_am654_execute_tuning;
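
On the am654 change above: soc_device_match() walks a NULL-terminated soc_device_attribute table and returns the first entry matching the running SoC (or NULL), so the HS400 quirk is applied only on the AM62PX SR1.0/SR1.1 revisions listed in the table. A small sketch of the lookup pattern; the denylist and helper names are invented:

#include <linux/sys_soc.h>

static const struct soc_device_attribute demo_hs400_denylist[] = {
	{ .family = "AM62PX", .revision = "SR1.0" },
	{ .family = "AM62PX", .revision = "SR1.1" },
	{ /* sentinel */ }
};

static bool demo_hs400_broken(void)
{
	/* Returns the matching entry for the current SoC, or NULL. */
	return soc_device_match(demo_hs400_denylist) != NULL;
}
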
diff --git a/drivers/most/core.c b/drivers/most/core.c
index a635d5082ebb..da319d108ea1 100644
--- a/drivers/most/core.c
+++ b/drivers/most/core.c
@@ -538,8 +538,8 @@ static struct most_channel *get_channel(char *mdev, char *mdev_ch)
dev = bus_find_device_by_name(&mostbus, NULL, mdev);
if (!dev)
return NULL;
- put_device(dev);
iface = dev_get_drvdata(dev);
+ put_device(dev);
list_for_each_entry_safe(c, tmp, &iface->p->channel_list, list) {
if (!strcmp(dev_name(&c->dev), mdev_ch))
return c;
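
The most/core.c hunk above fixes the reference ordering: bus_find_device_by_name() returns the device with its reference count raised, and dropping that reference with put_device() before reading dev_get_drvdata(dev) leaves a window where the device could be freed under the caller. Reading the driver data first and only then dropping the reference closes that window. A sketch of the safe ordering with invented names:

#include <linux/device.h>

static void *demo_get_iface(const struct bus_type *bus, const char *name)
{
	struct device *dev = bus_find_device_by_name(bus, NULL, name);
	void *iface;

	if (!dev)
		return NULL;
	iface = dev_get_drvdata(dev);	/* use the device first... */
	put_device(dev);		/* ...then drop the reference */
	return iface;
}
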
diff --git a/drivers/mtd/devices/Kconfig b/drivers/mtd/devices/Kconfig
index 46cebde79f34..e518dfeee654 100644
--- a/drivers/mtd/devices/Kconfig
+++ b/drivers/mtd/devices/Kconfig
@@ -185,8 +185,8 @@ config MTD_POWERNV_FLASH
config MTD_INTEL_DG
tristate "Intel Discrete Graphics non-volatile memory driver"
- depends on AUXILIARY_BUS
- depends on MTD
+ depends on AUXILIARY_BUS && MTD
+ depends on DRM_I915!=n || DRM_XE!=n || COMPILE_TEST
help
This provides an MTD device to access Intel Discrete Graphics
non-volatile memory.
diff --git a/drivers/mtd/nand/raw/atmel/nand-controller.c b/drivers/mtd/nand/raw/atmel/nand-controller.c
index 84ab4a83cbd6..db94d14a3807 100644
--- a/drivers/mtd/nand/raw/atmel/nand-controller.c
+++ b/drivers/mtd/nand/raw/atmel/nand-controller.c
@@ -1378,13 +1378,23 @@ static int atmel_smc_nand_prepare_smcconf(struct atmel_nand *nand,
return ret;
/*
+ * Read setup timing depends on the operation done on the NAND:
+ *
+ * NRD_SETUP = max(tAR, tCLR)
+ */
+ timeps = max(conf->timings.sdr.tAR_min, conf->timings.sdr.tCLR_min);
+ ncycles = DIV_ROUND_UP(timeps, mckperiodps);
+ totalcycles += ncycles;
+ ret = atmel_smc_cs_conf_set_setup(smcconf, ATMEL_SMC_NRD_SHIFT, ncycles);
+ if (ret)
+ return ret;
+
+ /*
* The read cycle timing is directly matching tRC, but is also
* dependent on the setup and hold timings we calculated earlier,
* which gives:
*
- * NRD_CYCLE = max(tRC, NRD_PULSE + NRD_HOLD)
- *
- * NRD_SETUP is always 0.
+ * NRD_CYCLE = max(tRC, NRD_SETUP + NRD_PULSE + NRD_HOLD)
*/
ncycles = DIV_ROUND_UP(conf->timings.sdr.tRC_min, mckperiodps);
ncycles = max(totalcycles, ncycles);
diff --git a/drivers/mtd/nand/raw/nuvoton-ma35d1-nand-controller.c b/drivers/mtd/nand/raw/nuvoton-ma35d1-nand-controller.c
index c23b537948d5..1a285cd8fad6 100644
--- a/drivers/mtd/nand/raw/nuvoton-ma35d1-nand-controller.c
+++ b/drivers/mtd/nand/raw/nuvoton-ma35d1-nand-controller.c
@@ -935,10 +935,10 @@ static void ma35_chips_cleanup(struct ma35_nand_info *nand)
static int ma35_nand_chips_init(struct device *dev, struct ma35_nand_info *nand)
{
- struct device_node *np = dev->of_node, *nand_np;
+ struct device_node *np = dev->of_node;
int ret;
- for_each_child_of_node(np, nand_np) {
+ for_each_child_of_node_scoped(np, nand_np) {
ret = ma35_nand_chip_init(dev, nand, nand_np);
if (ret) {
ma35_chips_cleanup(nand);
diff --git a/drivers/mtd/nand/raw/stm32_fmc2_nand.c b/drivers/mtd/nand/raw/stm32_fmc2_nand.c
index a960403081f1..d957327fb4fa 100644
--- a/drivers/mtd/nand/raw/stm32_fmc2_nand.c
+++ b/drivers/mtd/nand/raw/stm32_fmc2_nand.c
@@ -272,6 +272,7 @@ struct stm32_fmc2_nfc {
struct sg_table dma_data_sg;
struct sg_table dma_ecc_sg;
u8 *ecc_buf;
+ dma_addr_t dma_ecc_addr;
int dma_ecc_len;
u32 tx_dma_max_burst;
u32 rx_dma_max_burst;
@@ -902,17 +903,10 @@ static int stm32_fmc2_nfc_xfer(struct nand_chip *chip, const u8 *buf,
if (!write_data && !raw) {
/* Configure DMA ECC status */
- p = nfc->ecc_buf;
for_each_sg(nfc->dma_ecc_sg.sgl, sg, eccsteps, s) {
- sg_set_buf(sg, p, nfc->dma_ecc_len);
- p += nfc->dma_ecc_len;
- }
-
- ret = dma_map_sg(nfc->dev, nfc->dma_ecc_sg.sgl,
- eccsteps, dma_data_dir);
- if (!ret) {
- ret = -EIO;
- goto err_unmap_data;
+ sg_dma_address(sg) = nfc->dma_ecc_addr +
+ s * nfc->dma_ecc_len;
+ sg_dma_len(sg) = nfc->dma_ecc_len;
}
desc_ecc = dmaengine_prep_slave_sg(nfc->dma_ecc_ch,
@@ -921,7 +915,7 @@ static int stm32_fmc2_nfc_xfer(struct nand_chip *chip, const u8 *buf,
DMA_PREP_INTERRUPT);
if (!desc_ecc) {
ret = -ENOMEM;
- goto err_unmap_ecc;
+ goto err_unmap_data;
}
reinit_completion(&nfc->dma_ecc_complete);
@@ -929,7 +923,7 @@ static int stm32_fmc2_nfc_xfer(struct nand_chip *chip, const u8 *buf,
desc_ecc->callback_param = &nfc->dma_ecc_complete;
ret = dma_submit_error(dmaengine_submit(desc_ecc));
if (ret)
- goto err_unmap_ecc;
+ goto err_unmap_data;
dma_async_issue_pending(nfc->dma_ecc_ch);
}
@@ -949,7 +943,7 @@ static int stm32_fmc2_nfc_xfer(struct nand_chip *chip, const u8 *buf,
if (!write_data && !raw)
dmaengine_terminate_all(nfc->dma_ecc_ch);
ret = -ETIMEDOUT;
- goto err_unmap_ecc;
+ goto err_unmap_data;
}
/* Wait DMA data transfer completion */
@@ -969,11 +963,6 @@ static int stm32_fmc2_nfc_xfer(struct nand_chip *chip, const u8 *buf,
}
}
-err_unmap_ecc:
- if (!write_data && !raw)
- dma_unmap_sg(nfc->dev, nfc->dma_ecc_sg.sgl,
- eccsteps, dma_data_dir);
-
err_unmap_data:
dma_unmap_sg(nfc->dev, nfc->dma_data_sg.sgl, eccsteps, dma_data_dir);
@@ -996,9 +985,21 @@ static int stm32_fmc2_nfc_seq_write(struct nand_chip *chip, const u8 *buf,
/* Write oob */
if (oob_required) {
- ret = nand_change_write_column_op(chip, mtd->writesize,
- chip->oob_poi, mtd->oobsize,
- false);
+ unsigned int offset_in_page = mtd->writesize;
+ const void *buf = chip->oob_poi;
+ unsigned int len = mtd->oobsize;
+
+ if (!raw) {
+ struct mtd_oob_region oob_free;
+
+ mtd_ooblayout_free(mtd, 0, &oob_free);
+ offset_in_page += oob_free.offset;
+ buf += oob_free.offset;
+ len = oob_free.length;
+ }
+
+ ret = nand_change_write_column_op(chip, offset_in_page,
+ buf, len, false);
if (ret)
return ret;
}
@@ -1610,7 +1611,8 @@ static int stm32_fmc2_nfc_dma_setup(struct stm32_fmc2_nfc *nfc)
return ret;
/* Allocate a buffer to store ECC status registers */
- nfc->ecc_buf = devm_kzalloc(nfc->dev, FMC2_MAX_ECC_BUF_LEN, GFP_KERNEL);
+ nfc->ecc_buf = dmam_alloc_coherent(nfc->dev, FMC2_MAX_ECC_BUF_LEN,
+ &nfc->dma_ecc_addr, GFP_KERNEL);
if (!nfc->ecc_buf)
return -ENOMEM;
diff --git a/drivers/mtd/nand/spi/winbond.c b/drivers/mtd/nand/spi/winbond.c
index 87053389a1fc..4870b2d5edb2 100644
--- a/drivers/mtd/nand/spi/winbond.c
+++ b/drivers/mtd/nand/spi/winbond.c
@@ -176,6 +176,36 @@ static const struct mtd_ooblayout_ops w25n02kv_ooblayout = {
.free = w25n02kv_ooblayout_free,
};
+static int w25n01jw_ooblayout_ecc(struct mtd_info *mtd, int section,
+ struct mtd_oob_region *region)
+{
+ if (section > 3)
+ return -ERANGE;
+
+ region->offset = (16 * section) + 12;
+ region->length = 4;
+
+ return 0;
+}
+
+static int w25n01jw_ooblayout_free(struct mtd_info *mtd, int section,
+ struct mtd_oob_region *region)
+{
+ if (section > 3)
+ return -ERANGE;
+
+ region->offset = (16 * section);
+ region->length = 12;
+
+ /* Extract BBM */
+ if (!section) {
+ region->offset += 2;
+ region->length -= 2;
+ }
+
+ return 0;
+}
+
static int w35n01jw_ooblayout_ecc(struct mtd_info *mtd, int section,
struct mtd_oob_region *region)
{
@@ -206,6 +236,11 @@ static int w35n01jw_ooblayout_free(struct mtd_info *mtd, int section,
return 0;
}
+static const struct mtd_ooblayout_ops w25n01jw_ooblayout = {
+ .ecc = w25n01jw_ooblayout_ecc,
+ .free = w25n01jw_ooblayout_free,
+};
+
static const struct mtd_ooblayout_ops w35n01jw_ooblayout = {
.ecc = w35n01jw_ooblayout_ecc,
.free = w35n01jw_ooblayout_free,
@@ -394,7 +429,7 @@ static const struct spinand_info winbond_spinand_table[] = {
&write_cache_variants,
&update_cache_variants),
0,
- SPINAND_ECCINFO(&w25m02gv_ooblayout, NULL),
+ SPINAND_ECCINFO(&w25n01jw_ooblayout, NULL),
SPINAND_CONFIGURE_CHIP(w25n0xjw_hs_cfg)),
SPINAND_INFO("W25N01KV", /* 3.3V */
SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0xae, 0x21),
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index b29628d46be9..ac12eaf11755 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -76,24 +76,11 @@ config WIREGUARD
tristate "WireGuard secure network tunnel"
depends on NET && INET
depends on IPV6 || !IPV6
- depends on !KMSAN # KMSAN doesn't support the crypto configs below
select NET_UDP_TUNNEL
select DST_CACHE
- select CRYPTO
select CRYPTO_LIB_CURVE25519
select CRYPTO_LIB_CHACHA20POLY1305
- select CRYPTO_CHACHA20_X86_64 if X86 && 64BIT
- select CRYPTO_POLY1305_X86_64 if X86 && 64BIT
- select CRYPTO_BLAKE2S_X86 if X86 && 64BIT
- select CRYPTO_CURVE25519_X86 if X86 && 64BIT
- select CRYPTO_CHACHA20_NEON if ARM || (ARM64 && KERNEL_MODE_NEON)
- select CRYPTO_POLY1305_NEON if ARM64 && KERNEL_MODE_NEON
- select CRYPTO_POLY1305_ARM if ARM
- select CRYPTO_BLAKE2S_ARM if ARM
- select CRYPTO_CURVE25519_NEON if ARM && KERNEL_MODE_NEON
- select CRYPTO_CHACHA_MIPS if CPU_MIPS32_R2
- select CRYPTO_POLY1305_MIPS if MIPS
- select CRYPTO_CHACHA_S390 if S390
+ select CRYPTO_LIB_UTILS
help
WireGuard is a secure, fast, and easy to use replacement for IPSec
that uses modern cryptography and clever networking tricks. It's
diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index 2fca8e84ab10..4edc8e6b6b64 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -95,13 +95,13 @@ static int ad_marker_send(struct port *port, struct bond_marker *marker);
static void ad_mux_machine(struct port *port, bool *update_slave_arr);
static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port);
static void ad_tx_machine(struct port *port);
-static void ad_periodic_machine(struct port *port, struct bond_params *bond_params);
+static void ad_periodic_machine(struct port *port);
static void ad_port_selection_logic(struct port *port, bool *update_slave_arr);
static void ad_agg_selection_logic(struct aggregator *aggregator,
bool *update_slave_arr);
static void ad_clear_agg(struct aggregator *aggregator);
static void ad_initialize_agg(struct aggregator *aggregator);
-static void ad_initialize_port(struct port *port, int lacp_fast);
+static void ad_initialize_port(struct port *port, const struct bond_params *bond_params);
static void ad_enable_collecting(struct port *port);
static void ad_disable_distributing(struct port *port,
bool *update_slave_arr);
@@ -1307,10 +1307,16 @@ static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port)
* case of EXPIRED even if LINK_DOWN didn't arrive for
* the port.
*/
- port->partner_oper.port_state &= ~LACP_STATE_SYNCHRONIZATION;
port->sm_vars &= ~AD_PORT_MATCHED;
+ /* Based on IEEE 802.1AX-2014, Figure 6-18 - Receive
+ * machine state diagram, the state should be
+ * Partner_Oper_Port_State.Synchronization = FALSE;
+ * Partner_Oper_Port_State.LACP_Timeout = Short Timeout;
+ * start current_while_timer(Short Timeout);
+ * Actor_Oper_Port_State.Expired = TRUE;
+ */
+ port->partner_oper.port_state &= ~LACP_STATE_SYNCHRONIZATION;
port->partner_oper.port_state |= LACP_STATE_LACP_TIMEOUT;
- port->partner_oper.port_state |= LACP_STATE_LACP_ACTIVITY;
port->sm_rx_timer_counter = __ad_timer_to_ticks(AD_CURRENT_WHILE_TIMER, (u16)(AD_SHORT_TIMEOUT));
port->actor_oper_port_state |= LACP_STATE_EXPIRED;
port->sm_vars |= AD_PORT_CHURNED;
@@ -1417,11 +1423,10 @@ static void ad_tx_machine(struct port *port)
/**
* ad_periodic_machine - handle a port's periodic state machine
* @port: the port we're looking at
- * @bond_params: bond parameters we will use
*
* Turn the ntt flag on periodically to perform periodic transmission of LACPDUs.
*/
-static void ad_periodic_machine(struct port *port, struct bond_params *bond_params)
+static void ad_periodic_machine(struct port *port)
{
periodic_states_t last_state;
@@ -1430,8 +1435,7 @@ static void ad_periodic_machine(struct port *port, struct bond_params *bond_para
/* check if port was reinitialized */
if (((port->sm_vars & AD_PORT_BEGIN) || !(port->sm_vars & AD_PORT_LACP_ENABLED) || !port->is_enabled) ||
- (!(port->actor_oper_port_state & LACP_STATE_LACP_ACTIVITY) && !(port->partner_oper.port_state & LACP_STATE_LACP_ACTIVITY)) ||
- !bond_params->lacp_active) {
+ (!(port->actor_oper_port_state & LACP_STATE_LACP_ACTIVITY) && !(port->partner_oper.port_state & LACP_STATE_LACP_ACTIVITY))) {
port->sm_periodic_state = AD_NO_PERIODIC;
}
/* check if state machine should change state */
@@ -1955,16 +1959,16 @@ static void ad_initialize_agg(struct aggregator *aggregator)
/**
* ad_initialize_port - initialize a given port's parameters
* @port: the port we're looking at
- * @lacp_fast: boolean. whether fast periodic should be used
+ * @bond_params: bond parameters we will use
*/
-static void ad_initialize_port(struct port *port, int lacp_fast)
+static void ad_initialize_port(struct port *port, const struct bond_params *bond_params)
{
static const struct port_params tmpl = {
.system_priority = 0xffff,
.key = 1,
.port_number = 1,
.port_priority = 0xff,
- .port_state = 1,
+ .port_state = 0,
};
static const struct lacpdu lacpdu = {
.subtype = 0x01,
@@ -1982,12 +1986,14 @@ static void ad_initialize_port(struct port *port, int lacp_fast)
port->actor_port_priority = 0xff;
port->actor_port_aggregator_identifier = 0;
port->ntt = false;
- port->actor_admin_port_state = LACP_STATE_AGGREGATION |
- LACP_STATE_LACP_ACTIVITY;
- port->actor_oper_port_state = LACP_STATE_AGGREGATION |
- LACP_STATE_LACP_ACTIVITY;
+ port->actor_admin_port_state = LACP_STATE_AGGREGATION;
+ port->actor_oper_port_state = LACP_STATE_AGGREGATION;
+ if (bond_params->lacp_active) {
+ port->actor_admin_port_state |= LACP_STATE_LACP_ACTIVITY;
+ port->actor_oper_port_state |= LACP_STATE_LACP_ACTIVITY;
+ }
- if (lacp_fast)
+ if (bond_params->lacp_fast)
port->actor_oper_port_state |= LACP_STATE_LACP_TIMEOUT;
memcpy(&port->partner_admin, &tmpl, sizeof(tmpl));
@@ -2201,7 +2207,7 @@ void bond_3ad_bind_slave(struct slave *slave)
/* port initialization */
port = &(SLAVE_AD_INFO(slave)->port);
- ad_initialize_port(port, bond->params.lacp_fast);
+ ad_initialize_port(port, &bond->params);
port->slave = slave;
port->actor_port_number = SLAVE_AD_INFO(slave)->id;
@@ -2513,7 +2519,7 @@ void bond_3ad_state_machine_handler(struct work_struct *work)
}
ad_rx_machine(NULL, port);
- ad_periodic_machine(port, &bond->params);
+ ad_periodic_machine(port);
ad_port_selection_logic(port, &update_slave_arr);
ad_mux_machine(port, &update_slave_arr);
ad_tx_machine(port);
@@ -2883,6 +2889,31 @@ void bond_3ad_update_lacp_rate(struct bonding *bond)
spin_unlock_bh(&bond->mode_lock);
}
+/**
+ * bond_3ad_update_lacp_active - change the lacp active
+ * @bond: bonding struct
+ *
+ * Update actor_oper_port_state when lacp_active is modified.
+ */
+void bond_3ad_update_lacp_active(struct bonding *bond)
+{
+ struct port *port = NULL;
+ struct list_head *iter;
+ struct slave *slave;
+ int lacp_active;
+
+ lacp_active = bond->params.lacp_active;
+ spin_lock_bh(&bond->mode_lock);
+ bond_for_each_slave(bond, slave, iter) {
+ port = &(SLAVE_AD_INFO(slave)->port);
+ if (lacp_active)
+ port->actor_oper_port_state |= LACP_STATE_LACP_ACTIVITY;
+ else
+ port->actor_oper_port_state &= ~LACP_STATE_LACP_ACTIVITY;
+ }
+ spin_unlock_bh(&bond->mode_lock);
+}
+
size_t bond_3ad_stats_size(void)
{
return nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_RX */
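
With this change, lacp_active is folded into the LACP_Activity bit of the actor state (set at port init and updated live by bond_3ad_update_lacp_active()) instead of being checked directly by the periodic machine. The periodic machine then follows 802.3ad: it stays in NO_PERIODIC only when neither the actor nor the partner advertises activity. A small standalone sketch of that condition (illustrative, assuming the standard bit value LACP_STATE_LACP_ACTIVITY = 0x1):

#include <stdio.h>
#include <stdbool.h>

#define LACP_STATE_LACP_ACTIVITY 0x1	/* bit 0 of the LACP port state */

/* Periodic LACPDU transmission runs unless both ends are passive. */
static bool periodic_tx_enabled(unsigned char actor_state, unsigned char partner_state)
{
	return (actor_state & LACP_STATE_LACP_ACTIVITY) ||
	       (partner_state & LACP_STATE_LACP_ACTIVITY);
}

int main(void)
{
	/* passive local port, passive partner: no periodic LACPDUs */
	printf("passive/passive: %d\n", periodic_tx_enabled(0x00, 0x00));
	/* lacp_active=on sets the actor bit, so periodic TX resumes */
	printf("active /passive: %d\n", periodic_tx_enabled(0x01, 0x00));
	return 0;
}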
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 257333c88710..57be04f6cb11 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2132,6 +2132,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
memcpy(ss.__data, bond_dev->dev_addr, bond_dev->addr_len);
} else if (bond->params.fail_over_mac == BOND_FOM_FOLLOW &&
BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP &&
+ bond_has_slaves(bond) &&
memcmp(slave_dev->dev_addr, bond_dev->dev_addr, bond_dev->addr_len) == 0) {
/* Set slave to random address to avoid duplicate mac
* address in later fail over.
@@ -3355,7 +3356,6 @@ static void bond_ns_send_all(struct bonding *bond, struct slave *slave)
/* Find out through which dev should the packet go */
memset(&fl6, 0, sizeof(struct flowi6));
fl6.daddr = targets[i];
- fl6.flowi6_oif = bond->dev->ifindex;
dst = ip6_route_output(dev_net(bond->dev), NULL, &fl6);
if (dst->error) {
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index 1d639a3be6ba..3b6f815c55ff 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -1660,6 +1660,7 @@ static int bond_option_lacp_active_set(struct bonding *bond,
netdev_dbg(bond->dev, "Setting LACP active to %s (%llu)\n",
newval->string, newval->value);
bond->params.lacp_active = newval->value;
+ bond_3ad_update_lacp_active(bond);
return 0;
}
diff --git a/drivers/net/can/rcar/rcar_can.c b/drivers/net/can/rcar/rcar_can.c
index 64e664f5adcc..87c134bcd48d 100644
--- a/drivers/net/can/rcar/rcar_can.c
+++ b/drivers/net/can/rcar/rcar_can.c
@@ -861,7 +861,6 @@ static int rcar_can_resume(struct device *dev)
{
struct net_device *ndev = dev_get_drvdata(dev);
struct rcar_can_priv *priv = netdev_priv(ndev);
- u16 ctlr;
int err;
if (!netif_running(ndev))
@@ -873,12 +872,7 @@ static int rcar_can_resume(struct device *dev)
return err;
}
- ctlr = readw(&priv->regs->ctlr);
- ctlr &= ~RCAR_CAN_CTLR_SLPM;
- writew(ctlr, &priv->regs->ctlr);
- ctlr &= ~RCAR_CAN_CTLR_CANM;
- writew(ctlr, &priv->regs->ctlr);
- priv->can.state = CAN_STATE_ERROR_ACTIVE;
+ rcar_can_start(ndev);
netif_device_attach(ndev);
netif_start_queue(ndev);
diff --git a/drivers/net/can/rcar/rcar_canfd.c b/drivers/net/can/rcar/rcar_canfd.c
index b3c8c592fb0e..7e8b1d2f1af6 100644
--- a/drivers/net/can/rcar/rcar_canfd.c
+++ b/drivers/net/can/rcar/rcar_canfd.c
@@ -823,9 +823,6 @@ static int rcar_canfd_reset_controller(struct rcar_canfd_global *gpriv)
/* Reset Global error flags */
rcar_canfd_write(gpriv->base, RCANFD_GERFL, 0x0);
- /* Set the controller into appropriate mode */
- rcar_canfd_set_mode(gpriv);
-
/* Transition all Channels to reset mode */
for_each_set_bit(ch, &gpriv->channels_mask, gpriv->info->max_channels) {
rcar_canfd_clear_bit(gpriv->base,
@@ -844,6 +841,10 @@ static int rcar_canfd_reset_controller(struct rcar_canfd_global *gpriv)
return err;
}
}
+
+ /* Set the controller into appropriate mode */
+ rcar_canfd_set_mode(gpriv);
+
return 0;
}
diff --git a/drivers/net/can/spi/hi311x.c b/drivers/net/can/spi/hi311x.c
index 09ae218315d7..963ea8510dd9 100644
--- a/drivers/net/can/spi/hi311x.c
+++ b/drivers/net/can/spi/hi311x.c
@@ -545,8 +545,6 @@ static int hi3110_stop(struct net_device *net)
priv->force_quit = 1;
free_irq(spi->irq, priv);
- destroy_workqueue(priv->wq);
- priv->wq = NULL;
mutex_lock(&priv->hi3110_lock);
@@ -770,34 +768,23 @@ static int hi3110_open(struct net_device *net)
goto out_close;
}
- priv->wq = alloc_workqueue("hi3110_wq", WQ_FREEZABLE | WQ_MEM_RECLAIM,
- 0);
- if (!priv->wq) {
- ret = -ENOMEM;
- goto out_free_irq;
- }
- INIT_WORK(&priv->tx_work, hi3110_tx_work_handler);
- INIT_WORK(&priv->restart_work, hi3110_restart_work_handler);
-
ret = hi3110_hw_reset(spi);
if (ret)
- goto out_free_wq;
+ goto out_free_irq;
ret = hi3110_setup(net);
if (ret)
- goto out_free_wq;
+ goto out_free_irq;
ret = hi3110_set_normal_mode(spi);
if (ret)
- goto out_free_wq;
+ goto out_free_irq;
netif_wake_queue(net);
mutex_unlock(&priv->hi3110_lock);
return 0;
- out_free_wq:
- destroy_workqueue(priv->wq);
out_free_irq:
free_irq(spi->irq, priv);
hi3110_hw_sleep(spi);
@@ -812,6 +799,7 @@ static const struct net_device_ops hi3110_netdev_ops = {
.ndo_open = hi3110_open,
.ndo_stop = hi3110_stop,
.ndo_start_xmit = hi3110_hard_start_xmit,
+ .ndo_change_mtu = can_change_mtu,
};
static const struct ethtool_ops hi3110_ethtool_ops = {
@@ -908,6 +896,15 @@ static int hi3110_can_probe(struct spi_device *spi)
if (ret)
goto out_clk;
+ priv->wq = alloc_workqueue("hi3110_wq", WQ_FREEZABLE | WQ_MEM_RECLAIM,
+ 0);
+ if (!priv->wq) {
+ ret = -ENOMEM;
+ goto out_clk;
+ }
+ INIT_WORK(&priv->tx_work, hi3110_tx_work_handler);
+ INIT_WORK(&priv->restart_work, hi3110_restart_work_handler);
+
priv->spi = spi;
mutex_init(&priv->hi3110_lock);
@@ -943,6 +940,8 @@ static int hi3110_can_probe(struct spi_device *spi)
return 0;
error_probe:
+ destroy_workqueue(priv->wq);
+ priv->wq = NULL;
hi3110_power_enable(priv->power, 0);
out_clk:
@@ -963,6 +962,9 @@ static void hi3110_can_remove(struct spi_device *spi)
hi3110_power_enable(priv->power, 0);
+ destroy_workqueue(priv->wq);
+ priv->wq = NULL;
+
clk_disable_unprepare(priv->clk);
free_candev(net);
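
The hi311x hunks move the workqueue from open()/stop() to probe()/remove(), so restart work queued around an ifdown/ifup cycle can never run against a freed workqueue. A condensed, hedged sketch of the resulting lifetime (kernel-style fragment using the same APIs, not the full driver paths):

/* probe: allocate once for the lifetime of the device */
priv->wq = alloc_workqueue("hi3110_wq", WQ_FREEZABLE | WQ_MEM_RECLAIM, 0);
if (!priv->wq)
	return -ENOMEM;
INIT_WORK(&priv->tx_work, hi3110_tx_work_handler);
INIT_WORK(&priv->restart_work, hi3110_restart_work_handler);

/* open/stop: only queue and quiesce work, never destroy the queue */
queue_work(priv->wq, &priv->restart_work);

/* remove: tear the queue down once the netdev can no longer queue work */
destroy_workqueue(priv->wq);
priv->wq = NULL;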
diff --git a/drivers/net/can/sun4i_can.c b/drivers/net/can/sun4i_can.c
index 6fcb301ef611..53bfd873de9b 100644
--- a/drivers/net/can/sun4i_can.c
+++ b/drivers/net/can/sun4i_can.c
@@ -768,6 +768,7 @@ static const struct net_device_ops sun4ican_netdev_ops = {
.ndo_open = sun4ican_open,
.ndo_stop = sun4ican_close,
.ndo_start_xmit = sun4ican_start_xmit,
+ .ndo_change_mtu = can_change_mtu,
};
static const struct ethtool_ops sun4ican_ethtool_ops = {
diff --git a/drivers/net/can/usb/etas_es58x/es58x_core.c b/drivers/net/can/usb/etas_es58x/es58x_core.c
index db1acf6d504c..adc91873c083 100644
--- a/drivers/net/can/usb/etas_es58x/es58x_core.c
+++ b/drivers/net/can/usb/etas_es58x/es58x_core.c
@@ -7,7 +7,7 @@
*
* Copyright (c) 2019 Robert Bosch Engineering and Business Solutions. All rights reserved.
* Copyright (c) 2020 ETAS K.K.. All rights reserved.
- * Copyright (c) 2020-2022 Vincent Mailhol <mailhol.vincent@wanadoo.fr>
+ * Copyright (c) 2020-2025 Vincent Mailhol <mailhol@kernel.org>
*/
#include <linux/unaligned.h>
@@ -1977,6 +1977,7 @@ static const struct net_device_ops es58x_netdev_ops = {
.ndo_stop = es58x_stop,
.ndo_start_xmit = es58x_start_xmit,
.ndo_eth_ioctl = can_eth_ioctl_hwts,
+ .ndo_change_mtu = can_change_mtu,
};
static const struct ethtool_ops es58x_ethtool_ops = {
diff --git a/drivers/net/can/usb/mcba_usb.c b/drivers/net/can/usb/mcba_usb.c
index 41c0a1c399bf..1f9b915094e6 100644
--- a/drivers/net/can/usb/mcba_usb.c
+++ b/drivers/net/can/usb/mcba_usb.c
@@ -761,6 +761,7 @@ static const struct net_device_ops mcba_netdev_ops = {
.ndo_open = mcba_usb_open,
.ndo_stop = mcba_usb_close,
.ndo_start_xmit = mcba_usb_start_xmit,
+ .ndo_change_mtu = can_change_mtu,
};
static const struct ethtool_ops mcba_ethtool_ops = {
diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_core.c b/drivers/net/can/usb/peak_usb/pcan_usb_core.c
index 117637b9b995..dd5caa1c302b 100644
--- a/drivers/net/can/usb/peak_usb/pcan_usb_core.c
+++ b/drivers/net/can/usb/peak_usb/pcan_usb_core.c
@@ -111,7 +111,7 @@ void peak_usb_update_ts_now(struct peak_time_ref *time_ref, u32 ts_now)
u32 delta_ts = time_ref->ts_dev_2 - time_ref->ts_dev_1;
if (time_ref->ts_dev_2 < time_ref->ts_dev_1)
- delta_ts &= (1 << time_ref->adapter->ts_used_bits) - 1;
+ delta_ts &= (1ULL << time_ref->adapter->ts_used_bits) - 1;
time_ref->ts_total += delta_ts;
}
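
The cast to 1ULL matters once ts_used_bits reaches 32: shifting a 32-bit 1 by 32 is undefined in C, so the wrap-around mask could be computed incorrectly. A tiny standalone check of the corrected expression (the bit widths here are illustrative, not taken from a specific adapter):

#include <stdio.h>
#include <stdint.h>

static uint32_t ts_mask(unsigned int used_bits)
{
	/* 1ULL keeps the shift in 64-bit space, so used_bits == 32 is well defined */
	return (uint32_t)((1ULL << used_bits) - 1);
}

int main(void)
{
	printf("24-bit timestamps: mask = 0x%08x\n", ts_mask(24));	/* 0x00ffffff */
	printf("32-bit timestamps: mask = 0x%08x\n", ts_mask(32));	/* 0xffffffff */
	return 0;
}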
diff --git a/drivers/net/can/xilinx_can.c b/drivers/net/can/xilinx_can.c
index 81baec8eb1e5..a25a3ca62c12 100644
--- a/drivers/net/can/xilinx_can.c
+++ b/drivers/net/can/xilinx_can.c
@@ -690,14 +690,6 @@ static void xcan_write_frame(struct net_device *ndev, struct sk_buff *skb,
dlc |= XCAN_DLCR_EDL_MASK;
}
- if (!(priv->devtype.flags & XCAN_FLAG_TX_MAILBOXES) &&
- (priv->devtype.flags & XCAN_FLAG_TXFEMP))
- can_put_echo_skb(skb, ndev, priv->tx_head % priv->tx_max, 0);
- else
- can_put_echo_skb(skb, ndev, 0, 0);
-
- priv->tx_head++;
-
priv->write_reg(priv, XCAN_FRAME_ID_OFFSET(frame_offset), id);
/* If the CAN frame is RTR frame this write triggers transmission
* (not on CAN FD)
@@ -730,6 +722,14 @@ static void xcan_write_frame(struct net_device *ndev, struct sk_buff *skb,
data[1]);
}
}
+
+ if (!(priv->devtype.flags & XCAN_FLAG_TX_MAILBOXES) &&
+ (priv->devtype.flags & XCAN_FLAG_TXFEMP))
+ can_put_echo_skb(skb, ndev, priv->tx_head % priv->tx_max, 0);
+ else
+ can_put_echo_skb(skb, ndev, 0, 0);
+
+ priv->tx_head++;
}
/**
diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 9942fb6f7f4b..2f846381d5a7 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1273,9 +1273,15 @@ static int b53_setup(struct dsa_switch *ds)
*/
ds->untag_vlan_aware_bridge_pvid = true;
- /* Ageing time is set in seconds */
- ds->ageing_time_min = 1 * 1000;
- ds->ageing_time_max = AGE_TIME_MAX * 1000;
+ if (dev->chip_id == BCM53101_DEVICE_ID) {
+ /* BCM53101 uses 0.5 second increments */
+ ds->ageing_time_min = 1 * 500;
+ ds->ageing_time_max = AGE_TIME_MAX * 500;
+ } else {
+ /* Everything else uses 1 second increments */
+ ds->ageing_time_min = 1 * 1000;
+ ds->ageing_time_max = AGE_TIME_MAX * 1000;
+ }
ret = b53_reset_switch(dev);
if (ret) {
@@ -2078,7 +2084,7 @@ int b53_fdb_dump(struct dsa_switch *ds, int port,
/* Start search operation */
reg = ARL_SRCH_STDN;
- b53_write8(priv, offset, B53_ARL_SRCH_CTL, reg);
+ b53_write8(priv, B53_ARLIO_PAGE, offset, reg);
do {
ret = b53_arl_search_wait(priv);
@@ -2559,7 +2565,10 @@ int b53_set_ageing_time(struct dsa_switch *ds, unsigned int msecs)
else
reg = B53_AGING_TIME_CONTROL;
- atc = DIV_ROUND_CLOSEST(msecs, 1000);
+ if (dev->chip_id == BCM53101_DEVICE_ID)
+ atc = DIV_ROUND_CLOSEST(msecs, 500);
+ else
+ atc = DIV_ROUND_CLOSEST(msecs, 1000);
if (!is5325(dev) && !is5365(dev))
atc |= AGE_CHANGE;
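
BCM53101 counts its ageing timer in 0.5 s steps while the other supported switches use 1 s steps, so both the advertised min/max and the register value derived from the requested milliseconds must use the matching divisor. A standalone sketch of the conversion, using a simplified unsigned form of the kernel's DIV_ROUND_CLOSEST:

#include <stdio.h>

#define DIV_ROUND_CLOSEST(x, divisor) (((x) + (divisor) / 2) / (divisor))

int main(void)
{
	unsigned int msecs = 300000;	/* requested ageing time: 300 s */

	/* BCM53101: 0.5 s increments */
	printf("BCM53101 ATC = %u\n", DIV_ROUND_CLOSEST(msecs, 500));	/* 600 */
	/* everything else: 1 s increments */
	printf("other    ATC = %u\n", DIV_ROUND_CLOSEST(msecs, 1000));	/* 300 */
	return 0;
}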
diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c
index 6eb3140d4044..84dc6e517acf 100644
--- a/drivers/net/dsa/lantiq_gswip.c
+++ b/drivers/net/dsa/lantiq_gswip.c
@@ -685,18 +685,27 @@ static int gswip_add_single_port_br(struct gswip_priv *priv, int port, bool add)
return 0;
}
-static int gswip_port_enable(struct dsa_switch *ds, int port,
- struct phy_device *phydev)
+static int gswip_port_setup(struct dsa_switch *ds, int port)
{
struct gswip_priv *priv = ds->priv;
int err;
if (!dsa_is_cpu_port(ds, port)) {
- u32 mdio_phy = 0;
-
err = gswip_add_single_port_br(priv, port, true);
if (err)
return err;
+ }
+
+ return 0;
+}
+
+static int gswip_port_enable(struct dsa_switch *ds, int port,
+ struct phy_device *phydev)
+{
+ struct gswip_priv *priv = ds->priv;
+
+ if (!dsa_is_cpu_port(ds, port)) {
+ u32 mdio_phy = 0;
if (phydev)
mdio_phy = phydev->mdio.addr & GSWIP_MDIO_PHY_ADDR_MASK;
@@ -1359,8 +1368,9 @@ static int gswip_port_fdb(struct dsa_switch *ds, int port,
int i;
int err;
+ /* Operation not supported on the CPU port, don't throw errors */
if (!bridge)
- return -EINVAL;
+ return 0;
for (i = max_ports; i < ARRAY_SIZE(priv->vlans); i++) {
if (priv->vlans[i].bridge == bridge) {
@@ -1829,6 +1839,7 @@ static const struct phylink_mac_ops gswip_phylink_mac_ops = {
static const struct dsa_switch_ops gswip_xrx200_switch_ops = {
.get_tag_protocol = gswip_get_tag_protocol,
.setup = gswip_setup,
+ .port_setup = gswip_port_setup,
.port_enable = gswip_port_enable,
.port_disable = gswip_port_disable,
.port_bridge_join = gswip_port_bridge_join,
diff --git a/drivers/net/dsa/microchip/ksz8.c b/drivers/net/dsa/microchip/ksz8.c
index 76e490070e9c..c354abdafc1b 100644
--- a/drivers/net/dsa/microchip/ksz8.c
+++ b/drivers/net/dsa/microchip/ksz8.c
@@ -36,15 +36,14 @@
static void ksz_cfg(struct ksz_device *dev, u32 addr, u8 bits, bool set)
{
- regmap_update_bits(ksz_regmap_8(dev), addr, bits, set ? bits : 0);
+ ksz_rmw8(dev, addr, bits, set ? bits : 0);
}
static void ksz_port_cfg(struct ksz_device *dev, int port, int offset, u8 bits,
bool set)
{
- regmap_update_bits(ksz_regmap_8(dev),
- dev->dev_ops->get_port_addr(port, offset),
- bits, set ? bits : 0);
+ ksz_rmw8(dev, dev->dev_ops->get_port_addr(port, offset), bits,
+ set ? bits : 0);
}
/**
@@ -1955,16 +1954,19 @@ int ksz8_setup(struct dsa_switch *ds)
ksz_cfg(dev, S_LINK_AGING_CTRL, SW_LINK_AUTO_AGING, true);
/* Enable aggressive back off algorithm in half duplex mode. */
- regmap_update_bits(ksz_regmap_8(dev), REG_SW_CTRL_1,
- SW_AGGR_BACKOFF, SW_AGGR_BACKOFF);
+ ret = ksz_rmw8(dev, REG_SW_CTRL_1, SW_AGGR_BACKOFF, SW_AGGR_BACKOFF);
+ if (ret)
+ return ret;
/*
* Make sure unicast VLAN boundary is set as default and
* enable no excessive collision drop.
*/
- regmap_update_bits(ksz_regmap_8(dev), REG_SW_CTRL_2,
- UNICAST_VLAN_BOUNDARY | NO_EXC_COLLISION_DROP,
- UNICAST_VLAN_BOUNDARY | NO_EXC_COLLISION_DROP);
+ ret = ksz_rmw8(dev, REG_SW_CTRL_2,
+ UNICAST_VLAN_BOUNDARY | NO_EXC_COLLISION_DROP,
+ UNICAST_VLAN_BOUNDARY | NO_EXC_COLLISION_DROP);
+ if (ret)
+ return ret;
ksz_cfg(dev, S_REPLACE_VID_CTRL, SW_REPLACE_VID, false);
diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index 7292bfe2f7ca..9568cc391fe3 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -1447,6 +1447,7 @@ static const struct regmap_range ksz8873_valid_regs[] = {
regmap_reg_range(0x3f, 0x3f),
/* advanced control registers */
+ regmap_reg_range(0x43, 0x43),
regmap_reg_range(0x60, 0x6f),
regmap_reg_range(0x70, 0x75),
regmap_reg_range(0x76, 0x78),
@@ -2456,6 +2457,12 @@ static void ksz_update_port_member(struct ksz_device *dev, int port)
dev->dev_ops->cfg_port_member(dev, i, val | cpu_port);
}
+ /* HSR ports are set up once, so use the assigned membership
+ * when the port is enabled.
+ */
+ if (!port_member && p->stp_state == BR_STATE_FORWARDING &&
+ (dev->hsr_ports & BIT(port)))
+ port_member = dev->hsr_ports;
dev->dev_ops->cfg_port_member(dev, port, port_member | cpu_port);
}
diff --git a/drivers/net/dsa/mv88e6xxx/leds.c b/drivers/net/dsa/mv88e6xxx/leds.c
index 1c88bfaea46b..ab3bc645da56 100644
--- a/drivers/net/dsa/mv88e6xxx/leds.c
+++ b/drivers/net/dsa/mv88e6xxx/leds.c
@@ -779,7 +779,8 @@ int mv88e6xxx_port_setup_leds(struct mv88e6xxx_chip *chip, int port)
continue;
if (led_num > 1) {
dev_err(dev, "invalid LED specified port %d\n", port);
- return -EINVAL;
+ ret = -EINVAL;
+ goto err_put_led;
}
if (led_num == 0)
@@ -823,17 +824,25 @@ int mv88e6xxx_port_setup_leds(struct mv88e6xxx_chip *chip, int port)
init_data.devname_mandatory = true;
init_data.devicename = kasprintf(GFP_KERNEL, "%s:0%d:0%d", chip->info->name,
port, led_num);
- if (!init_data.devicename)
- return -ENOMEM;
+ if (!init_data.devicename) {
+ ret = -ENOMEM;
+ goto err_put_led;
+ }
ret = devm_led_classdev_register_ext(dev, l, &init_data);
kfree(init_data.devicename);
if (ret) {
dev_err(dev, "Failed to init LED %d for port %d", led_num, port);
- return ret;
+ goto err_put_led;
}
}
+ fwnode_handle_put(leds);
return 0;
+
+err_put_led:
+ fwnode_handle_put(led);
+ fwnode_handle_put(leds);
+ return ret;
}
diff --git a/drivers/net/ethernet/airoha/airoha_ppe.c b/drivers/net/ethernet/airoha/airoha_ppe.c
index 47411d2cbd28..88694b08afa1 100644
--- a/drivers/net/ethernet/airoha/airoha_ppe.c
+++ b/drivers/net/ethernet/airoha/airoha_ppe.c
@@ -736,10 +736,8 @@ static void airoha_ppe_foe_insert_entry(struct airoha_ppe *ppe,
continue;
}
- if (commit_done || !airoha_ppe_foe_compare_entry(e, hwe)) {
- e->hash = 0xffff;
+ if (!airoha_ppe_foe_compare_entry(e, hwe))
continue;
- }
airoha_ppe_foe_commit_entry(ppe, &e->data, hash);
commit_done = true;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 5578ddcb465d..0daa08cecaf2 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -926,15 +926,21 @@ static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping,
static netmem_ref __bnxt_alloc_rx_netmem(struct bnxt *bp, dma_addr_t *mapping,
struct bnxt_rx_ring_info *rxr,
+ unsigned int *offset,
gfp_t gfp)
{
netmem_ref netmem;
- netmem = page_pool_alloc_netmems(rxr->page_pool, gfp);
+ if (PAGE_SIZE > BNXT_RX_PAGE_SIZE) {
+ netmem = page_pool_alloc_frag_netmem(rxr->page_pool, offset, BNXT_RX_PAGE_SIZE, gfp);
+ } else {
+ netmem = page_pool_alloc_netmems(rxr->page_pool, gfp);
+ *offset = 0;
+ }
if (!netmem)
return 0;
- *mapping = page_pool_get_dma_addr_netmem(netmem);
+ *mapping = page_pool_get_dma_addr_netmem(netmem) + *offset;
return netmem;
}
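
On kernels where PAGE_SIZE is larger than BNXT_RX_PAGE_SIZE (for example 64 KiB pages with 4 KiB hardware buffers), the aggregation ring now takes page-pool fragments, and the DMA address handed to the NIC is the page's DMA base plus the fragment offset. A small sketch of how successive fragment offsets and mappings advance through one page (illustrative values only, not driver code):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE		65536u	/* example: 64 KiB kernel page */
#define BNXT_RX_PAGE_SIZE	4096u	/* example: 4 KiB hardware RX buffer */

int main(void)
{
	uint64_t page_dma = 0x80000000ull;	/* DMA address of the pooled page */
	unsigned int offset;

	for (offset = 0; offset + BNXT_RX_PAGE_SIZE <= PAGE_SIZE;
	     offset += BNXT_RX_PAGE_SIZE)
		printf("frag at offset %5u -> mapping 0x%llx\n",
		       offset, (unsigned long long)(page_dma + offset));
	return 0;
}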
@@ -1029,7 +1035,7 @@ static int bnxt_alloc_rx_netmem(struct bnxt *bp, struct bnxt_rx_ring_info *rxr,
dma_addr_t mapping;
netmem_ref netmem;
- netmem = __bnxt_alloc_rx_netmem(bp, &mapping, rxr, gfp);
+ netmem = __bnxt_alloc_rx_netmem(bp, &mapping, rxr, &offset, gfp);
if (!netmem)
return -ENOMEM;
@@ -3819,7 +3825,6 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
if (BNXT_RX_PAGE_MODE(bp))
pp.pool_size += bp->rx_ring_size / rx_size_fac;
pp.nid = numa_node;
- pp.napi = &rxr->bnapi->napi;
pp.netdev = bp->dev;
pp.dev = &bp->pdev->dev;
pp.dma_dir = bp->rx_dir;
@@ -3851,6 +3856,12 @@ err_destroy_pp:
return PTR_ERR(pool);
}
+static void bnxt_enable_rx_page_pool(struct bnxt_rx_ring_info *rxr)
+{
+ page_pool_enable_direct_recycling(rxr->head_pool, &rxr->bnapi->napi);
+ page_pool_enable_direct_recycling(rxr->page_pool, &rxr->bnapi->napi);
+}
+
static int bnxt_alloc_rx_agg_bmap(struct bnxt *bp, struct bnxt_rx_ring_info *rxr)
{
u16 mem_size;
@@ -3889,6 +3900,7 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp)
rc = bnxt_alloc_rx_page_pool(bp, rxr, cpu_node);
if (rc)
return rc;
+ bnxt_enable_rx_page_pool(rxr);
rc = xdp_rxq_info_reg(&rxr->xdp_rxq, bp->dev, i, 0);
if (rc < 0)
@@ -4385,7 +4397,7 @@ static void bnxt_alloc_one_rx_ring_netmem(struct bnxt *bp,
for (i = 0; i < bp->rx_agg_ring_size; i++) {
if (bnxt_alloc_rx_netmem(bp, rxr, prod, GFP_KERNEL)) {
netdev_warn(bp->dev, "init'ed rx ring %d with %d/%d pages only\n",
- ring_nr, i, bp->rx_ring_size);
+ ring_nr, i, bp->rx_agg_ring_size);
break;
}
prod = NEXT_RX_AGG(prod);
@@ -5320,7 +5332,7 @@ static void bnxt_free_ntp_fltrs(struct bnxt *bp, bool all)
{
int i;
- netdev_assert_locked(bp->dev);
+ netdev_assert_locked_or_invisible(bp->dev);
/* Under netdev instance lock and all our NAPIs have been disabled.
* It's safe to delete the hash table.
@@ -8004,7 +8016,8 @@ static int __bnxt_reserve_rings(struct bnxt *bp)
}
rx_rings = min_t(int, rx_rings, hwr.grp);
hwr.cp = min_t(int, hwr.cp, bp->cp_nr_rings);
- if (hwr.stat > bnxt_get_ulp_stat_ctxs(bp))
+ if (bnxt_ulp_registered(bp->edev) &&
+ hwr.stat > bnxt_get_ulp_stat_ctxs(bp))
hwr.stat -= bnxt_get_ulp_stat_ctxs(bp);
hwr.cp = min_t(int, hwr.cp, hwr.stat);
rc = bnxt_trim_rings(bp, &rx_rings, &hwr.tx, hwr.cp, sh);
@@ -8012,6 +8025,11 @@ static int __bnxt_reserve_rings(struct bnxt *bp)
hwr.rx = rx_rings << 1;
tx_cp = bnxt_num_tx_to_cp(bp, hwr.tx);
hwr.cp = sh ? max_t(int, tx_cp, rx_rings) : tx_cp + rx_rings;
+ if (hwr.tx != bp->tx_nr_rings) {
+ netdev_warn(bp->dev,
+ "Able to reserve only %d out of %d requested TX rings\n",
+ hwr.tx, bp->tx_nr_rings);
+ }
bp->tx_nr_rings = hwr.tx;
/* If we cannot reserve all the RX rings, reset the RSS map only
@@ -12839,6 +12857,17 @@ static int bnxt_set_xps_mapping(struct bnxt *bp)
return rc;
}
+static int bnxt_tx_nr_rings(struct bnxt *bp)
+{
+ return bp->num_tc ? bp->tx_nr_rings_per_tc * bp->num_tc :
+ bp->tx_nr_rings_per_tc;
+}
+
+static int bnxt_tx_nr_rings_per_tc(struct bnxt *bp)
+{
+ return bp->num_tc ? bp->tx_nr_rings / bp->num_tc : bp->tx_nr_rings;
+}
+
static int __bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init)
{
int rc = 0;
@@ -12856,6 +12885,13 @@ static int __bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init)
if (rc)
return rc;
+ /* Make adjustments if reserved TX rings are less than requested */
+ bp->tx_nr_rings -= bp->tx_nr_rings_xdp;
+ bp->tx_nr_rings_per_tc = bnxt_tx_nr_rings_per_tc(bp);
+ if (bp->tx_nr_rings_xdp) {
+ bp->tx_nr_rings_xdp = bp->tx_nr_rings_per_tc;
+ bp->tx_nr_rings += bp->tx_nr_rings_xdp;
+ }
rc = bnxt_alloc_mem(bp, irq_re_init);
if (rc) {
netdev_err(bp->dev, "bnxt_alloc_mem err: %x\n", rc);
@@ -16031,6 +16067,7 @@ static int bnxt_queue_start(struct net_device *dev, void *qmem, int idx)
goto err_reset;
}
+ bnxt_enable_rx_page_pool(rxr);
napi_enable_locked(&bnapi->napi);
bnxt_db_nq_arm(bp, &cpr->cp_db, cpr->cp_raw_cons);
@@ -16312,7 +16349,7 @@ static void bnxt_trim_dflt_sh_rings(struct bnxt *bp)
bp->cp_nr_rings = min_t(int, bp->tx_nr_rings_per_tc, bp->rx_nr_rings);
bp->rx_nr_rings = bp->cp_nr_rings;
bp->tx_nr_rings_per_tc = bp->cp_nr_rings;
- bp->tx_nr_rings = bp->tx_nr_rings_per_tc;
+ bp->tx_nr_rings = bnxt_tx_nr_rings(bp);
}
static int bnxt_set_dflt_rings(struct bnxt *bp, bool sh)
@@ -16344,7 +16381,7 @@ static int bnxt_set_dflt_rings(struct bnxt *bp, bool sh)
bnxt_trim_dflt_sh_rings(bp);
else
bp->cp_nr_rings = bp->tx_nr_rings_per_tc + bp->rx_nr_rings;
- bp->tx_nr_rings = bp->tx_nr_rings_per_tc;
+ bp->tx_nr_rings = bnxt_tx_nr_rings(bp);
avail_msix = bnxt_get_max_func_irqs(bp) - bp->cp_nr_rings;
if (avail_msix >= BNXT_MIN_ROCE_CP_RINGS) {
@@ -16357,7 +16394,7 @@ static int bnxt_set_dflt_rings(struct bnxt *bp, bool sh)
rc = __bnxt_reserve_rings(bp);
if (rc && rc != -ENODEV)
netdev_warn(bp->dev, "Unable to reserve tx rings\n");
- bp->tx_nr_rings_per_tc = bp->tx_nr_rings;
+ bp->tx_nr_rings_per_tc = bnxt_tx_nr_rings_per_tc(bp);
if (sh)
bnxt_trim_dflt_sh_rings(bp);
@@ -16366,7 +16403,7 @@ static int bnxt_set_dflt_rings(struct bnxt *bp, bool sh)
rc = __bnxt_reserve_rings(bp);
if (rc && rc != -ENODEV)
netdev_warn(bp->dev, "2nd rings reservation failed.\n");
- bp->tx_nr_rings_per_tc = bp->tx_nr_rings;
+ bp->tx_nr_rings_per_tc = bnxt_tx_nr_rings_per_tc(bp);
}
if (BNXT_CHIP_TYPE_NITRO_A0(bp)) {
bp->rx_nr_rings++;
@@ -16400,7 +16437,7 @@ static int bnxt_init_dflt_ring_mode(struct bnxt *bp)
if (rc)
goto init_dflt_ring_err;
- bp->tx_nr_rings_per_tc = bp->tx_nr_rings;
+ bp->tx_nr_rings_per_tc = bnxt_tx_nr_rings_per_tc(bp);
bnxt_set_dflt_rfs(bp);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
index d72fd248f3aa..2d66bf59cd64 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
@@ -244,7 +244,7 @@ bnxt_tc_parse_pedit(struct bnxt *bp, struct bnxt_tc_actions *actions,
offset < offset_of_ip6_daddr + 16) {
actions->nat.src_xlate = false;
idx = (offset - offset_of_ip6_daddr) / 4;
- actions->nat.l3.ipv6.saddr.s6_addr32[idx] = htonl(val);
+ actions->nat.l3.ipv6.daddr.s6_addr32[idx] = htonl(val);
} else {
netdev_err(bp->dev,
"%s: IPv6_hdr: Invalid pedit field\n",
diff --git a/drivers/net/ethernet/broadcom/cnic.c b/drivers/net/ethernet/broadcom/cnic.c
index a9040c42d2ff..6e97a5a7daaf 100644
--- a/drivers/net/ethernet/broadcom/cnic.c
+++ b/drivers/net/ethernet/broadcom/cnic.c
@@ -4230,8 +4230,7 @@ static void cnic_cm_stop_bnx2x_hw(struct cnic_dev *dev)
cnic_bnx2x_delete_wait(dev, 0);
- cancel_delayed_work(&cp->delete_task);
- flush_workqueue(cnic_wq);
+ cancel_delayed_work_sync(&cp->delete_task);
if (atomic_read(&cp->iscsi_conn) != 0)
netdev_warn(dev->netdev, "%d iSCSI connections not destroyed\n",
diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index ce95fad8cedd..c769b7dbd3ba 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -1223,12 +1223,13 @@ static int macb_tx_complete(struct macb_queue *queue, int budget)
{
struct macb *bp = queue->bp;
u16 queue_index = queue - bp->queues;
+ unsigned long flags;
unsigned int tail;
unsigned int head;
int packets = 0;
u32 bytes = 0;
- spin_lock(&queue->tx_ptr_lock);
+ spin_lock_irqsave(&queue->tx_ptr_lock, flags);
head = queue->tx_head;
for (tail = queue->tx_tail; tail != head && packets < budget; tail++) {
struct macb_tx_skb *tx_skb;
@@ -1291,7 +1292,7 @@ static int macb_tx_complete(struct macb_queue *queue, int budget)
CIRC_CNT(queue->tx_head, queue->tx_tail,
bp->tx_ring_size) <= MACB_TX_WAKEUP_THRESH(bp))
netif_wake_subqueue(bp->dev, queue_index);
- spin_unlock(&queue->tx_ptr_lock);
+ spin_unlock_irqrestore(&queue->tx_ptr_lock, flags);
return packets;
}
@@ -1707,8 +1708,9 @@ static void macb_tx_restart(struct macb_queue *queue)
{
struct macb *bp = queue->bp;
unsigned int head_idx, tbqp;
+ unsigned long flags;
- spin_lock(&queue->tx_ptr_lock);
+ spin_lock_irqsave(&queue->tx_ptr_lock, flags);
if (queue->tx_head == queue->tx_tail)
goto out_tx_ptr_unlock;
@@ -1720,19 +1722,20 @@ static void macb_tx_restart(struct macb_queue *queue)
if (tbqp == head_idx)
goto out_tx_ptr_unlock;
- spin_lock_irq(&bp->lock);
+ spin_lock(&bp->lock);
macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
- spin_unlock_irq(&bp->lock);
+ spin_unlock(&bp->lock);
out_tx_ptr_unlock:
- spin_unlock(&queue->tx_ptr_lock);
+ spin_unlock_irqrestore(&queue->tx_ptr_lock, flags);
}
static bool macb_tx_complete_pending(struct macb_queue *queue)
{
bool retval = false;
+ unsigned long flags;
- spin_lock(&queue->tx_ptr_lock);
+ spin_lock_irqsave(&queue->tx_ptr_lock, flags);
if (queue->tx_head != queue->tx_tail) {
/* Make hw descriptor updates visible to CPU */
rmb();
@@ -1740,7 +1743,7 @@ static bool macb_tx_complete_pending(struct macb_queue *queue)
if (macb_tx_desc(queue, queue->tx_tail)->ctrl & MACB_BIT(TX_USED))
retval = true;
}
- spin_unlock(&queue->tx_ptr_lock);
+ spin_unlock_irqrestore(&queue->tx_ptr_lock, flags);
return retval;
}
@@ -2308,6 +2311,7 @@ static netdev_tx_t macb_start_xmit(struct sk_buff *skb, struct net_device *dev)
struct macb_queue *queue = &bp->queues[queue_index];
unsigned int desc_cnt, nr_frags, frag_size, f;
unsigned int hdrlen;
+ unsigned long flags;
bool is_lso;
netdev_tx_t ret = NETDEV_TX_OK;
@@ -2368,7 +2372,7 @@ static netdev_tx_t macb_start_xmit(struct sk_buff *skb, struct net_device *dev)
desc_cnt += DIV_ROUND_UP(frag_size, bp->max_tx_length);
}
- spin_lock_bh(&queue->tx_ptr_lock);
+ spin_lock_irqsave(&queue->tx_ptr_lock, flags);
/* This is a hard error, log it. */
if (CIRC_SPACE(queue->tx_head, queue->tx_tail,
@@ -2392,15 +2396,15 @@ static netdev_tx_t macb_start_xmit(struct sk_buff *skb, struct net_device *dev)
netdev_tx_sent_queue(netdev_get_tx_queue(bp->dev, queue_index),
skb->len);
- spin_lock_irq(&bp->lock);
+ spin_lock(&bp->lock);
macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
- spin_unlock_irq(&bp->lock);
+ spin_unlock(&bp->lock);
if (CIRC_SPACE(queue->tx_head, queue->tx_tail, bp->tx_ring_size) < 1)
netif_stop_subqueue(dev, queue_index);
unlock:
- spin_unlock_bh(&queue->tx_ptr_lock);
+ spin_unlock_irqrestore(&queue->tx_ptr_lock, flags);
return ret;
}
@@ -3090,7 +3094,7 @@ static void gem_update_stats(struct macb *bp)
/* Add GEM_OCTTXH, GEM_OCTRXH */
val = bp->macb_reg_readl(bp, offset + 4);
bp->ethtool_stats[i] += ((u64)val) << 32;
- *(p++) += ((u64)val) << 32;
+ *p += ((u64)val) << 32;
}
}
@@ -5113,7 +5117,8 @@ static const struct macb_config sama7g5_gem_config = {
static const struct macb_config sama7g5_emac_config = {
.caps = MACB_CAPS_USRIO_DEFAULT_IS_MII_GMII |
- MACB_CAPS_MIIONRGMII | MACB_CAPS_GEM_HAS_PTP,
+ MACB_CAPS_USRIO_HAS_CLKEN | MACB_CAPS_MIIONRGMII |
+ MACB_CAPS_GEM_HAS_PTP,
.dma_burst_length = 16,
.clk_init = macb_clk_init,
.init = macb_init,
@@ -5398,19 +5403,16 @@ static void macb_remove(struct platform_device *pdev)
if (dev) {
bp = netdev_priv(dev);
+ unregister_netdev(dev);
phy_exit(bp->sgmii_phy);
mdiobus_unregister(bp->mii_bus);
mdiobus_free(bp->mii_bus);
- unregister_netdev(dev);
+ device_set_wakeup_enable(&bp->pdev->dev, 0);
cancel_work_sync(&bp->hresp_err_bh_work);
pm_runtime_disable(&pdev->dev);
pm_runtime_dont_use_autosuspend(&pdev->dev);
- if (!pm_runtime_suspended(&pdev->dev)) {
- macb_clks_disable(bp->pclk, bp->hclk, bp->tx_clk,
- bp->rx_clk, bp->tsu_clk);
- pm_runtime_set_suspended(&pdev->dev);
- }
+ pm_runtime_set_suspended(&pdev->dev);
phylink_destroy(bp->phylink);
free_netdev(dev);
}
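
The macb hunks all follow one pattern: tx_ptr_lock can now be taken from hard-IRQ context, so every acquisition becomes spin_lock_irqsave(), and the nested bp->lock drops its own _irq variant because interrupts are already disabled by the outer lock. A minimal sketch of that nesting (kernel-style fragment using the real spinlock API, not the full driver paths):

unsigned long flags;

spin_lock_irqsave(&queue->tx_ptr_lock, flags);	/* safe from any context */

/* ... walk or update the TX ring pointers ... */

spin_lock(&bp->lock);		/* IRQs already off, a plain lock is enough */
macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
spin_unlock(&bp->lock);

spin_unlock_irqrestore(&queue->tx_ptr_lock, flags);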
diff --git a/drivers/net/ethernet/cavium/liquidio/request_manager.c b/drivers/net/ethernet/cavium/liquidio/request_manager.c
index de8a6ce86ad7..12105ffb5dac 100644
--- a/drivers/net/ethernet/cavium/liquidio/request_manager.c
+++ b/drivers/net/ethernet/cavium/liquidio/request_manager.c
@@ -126,7 +126,7 @@ int octeon_init_instr_queue(struct octeon_device *oct,
oct->io_qmask.iq |= BIT_ULL(iq_no);
/* Set the 32B/64B mode for each input queue */
- oct->io_qmask.iq64B |= ((conf->instr_type == 64) << iq_no);
+ oct->io_qmask.iq64B |= ((u64)(conf->instr_type == 64) << iq_no);
iq->iqcmd_64B = (conf->instr_type == 64);
oct->fn_list.setup_iq_regs(oct, iq_no);
diff --git a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c
index 21495b5dce25..9efb60842ad1 100644
--- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c
+++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c
@@ -1493,13 +1493,17 @@ static int bgx_init_of_phy(struct bgx *bgx)
* this cortina phy, for which there is no driver
* support, ignore it.
*/
- if (phy_np &&
- !of_device_is_compatible(phy_np, "cortina,cs4223-slice")) {
- /* Wait until the phy drivers are available */
- pd = of_phy_find_device(phy_np);
- if (!pd)
- goto defer;
- bgx->lmac[lmac].phydev = pd;
+ if (phy_np) {
+ if (!of_device_is_compatible(phy_np, "cortina,cs4223-slice")) {
+ /* Wait until the phy drivers are available */
+ pd = of_phy_find_device(phy_np);
+ if (!pd) {
+ of_node_put(phy_np);
+ goto defer;
+ }
+ bgx->lmac[lmac].phydev = pd;
+ }
+ of_node_put(phy_np);
}
lmac++;
@@ -1515,11 +1519,11 @@ defer:
* for phy devices we may have already found.
*/
while (lmac) {
+ lmac--;
if (bgx->lmac[lmac].phydev) {
put_device(&bgx->lmac[lmac].phydev->mdio.dev);
bgx->lmac[lmac].phydev = NULL;
}
- lmac--;
}
of_node_put(node);
return -EPROBE_DEFER;
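
Decrementing lmac before the test (instead of after the body) makes the unwind loop visit exactly the already-initialised indices 0..lmac-1; the old order skipped index 0 and touched the uninitialised slot at index lmac. A standalone comparison of the two loop shapes (indices only, no device code):

#include <stdio.h>

int main(void)
{
	int initialised = 3;	/* lmacs 0, 1 and 2 were set up before the failure */
	int lmac;

	printf("old loop visits:");
	for (lmac = initialised; lmac; lmac--)
		printf(" %d", lmac);		/* 3 2 1 -> misses 0, touches 3 */
	printf("\nnew loop visits:");
	lmac = initialised;
	while (lmac) {
		lmac--;
		printf(" %d", lmac);		/* 2 1 0 -> exactly the initialised ones */
	}
	printf("\n");
	return 0;
}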
diff --git a/drivers/net/ethernet/dlink/Kconfig b/drivers/net/ethernet/dlink/Kconfig
index e9e13654812c..0d77f84c8e7b 100644
--- a/drivers/net/ethernet/dlink/Kconfig
+++ b/drivers/net/ethernet/dlink/Kconfig
@@ -32,4 +32,24 @@ config DL2K
To compile this driver as a module, choose M here: the
module will be called dl2k.
+config SUNDANCE
+ tristate "Sundance Alta support"
+ depends on PCI
+ select CRC32
+ select MII
+ help
+ This driver is for the Sundance "Alta" chip.
+ More specific information and updates are available from
+ <http://www.scyld.com/network/sundance.html>.
+
+config SUNDANCE_MMIO
+ bool "Use MMIO instead of PIO"
+ depends on SUNDANCE
+ help
+ Enable memory-mapped I/O for interaction with Sundance NIC registers.
+ Do NOT enable this by default, PIO (enabled when MMIO is disabled)
+ is known to solve bugs on certain chips.
+
+ If unsure, say N.
+
endif # NET_VENDOR_DLINK
diff --git a/drivers/net/ethernet/dlink/Makefile b/drivers/net/ethernet/dlink/Makefile
index 38c236eb6007..3ff503c747db 100644
--- a/drivers/net/ethernet/dlink/Makefile
+++ b/drivers/net/ethernet/dlink/Makefile
@@ -4,3 +4,4 @@
#
obj-$(CONFIG_DL2K) += dl2k.o
+obj-$(CONFIG_SUNDANCE) += sundance.o
diff --git a/drivers/net/ethernet/dlink/dl2k.c b/drivers/net/ethernet/dlink/dl2k.c
index cc60ee454bf9..6bbf6e5584e5 100644
--- a/drivers/net/ethernet/dlink/dl2k.c
+++ b/drivers/net/ethernet/dlink/dl2k.c
@@ -1099,7 +1099,7 @@ get_stats (struct net_device *dev)
dev->stats.rx_bytes += dr32(OctetRcvOk);
dev->stats.tx_bytes += dr32(OctetXmtOk);
- dev->stats.multicast = dr32(McstFramesRcvdOk);
+ dev->stats.multicast += dr32(McstFramesRcvdOk);
dev->stats.collisions += dr32(SingleColFrames)
+ dr32(MultiColFrames);
diff --git a/drivers/net/ethernet/dlink/sundance.c b/drivers/net/ethernet/dlink/sundance.c
new file mode 100644
index 000000000000..277c50ef773f
--- /dev/null
+++ b/drivers/net/ethernet/dlink/sundance.c
@@ -0,0 +1,1990 @@
+/* sundance.c: A Linux device driver for the Sundance ST201 "Alta". */
+/*
+ Written 1999-2000 by Donald Becker.
+
+ This software may be used and distributed according to the terms of
+ the GNU General Public License (GPL), incorporated herein by reference.
+ Drivers based on or derived from this code fall under the GPL and must
+ retain the authorship, copyright and license notice. This file is not
+ a complete program and may only be used when the entire operating
+ system is licensed under the GPL.
+
+ The author may be reached as becker@scyld.com, or C/O
+ Scyld Computing Corporation
+ 410 Severn Ave., Suite 210
+ Annapolis MD 21403
+
+ Support and updates available at
+ http://www.scyld.com/network/sundance.html
+ [link no longer provides useful info -jgarzik]
+ Archives of the mailing list are still available at
+ https://www.beowulf.org/pipermail/netdrivers/
+
+*/
+
+#define DRV_NAME "sundance"
+
+/* The user-configurable values.
+ These may be modified when a driver module is loaded.*/
+static int debug = 1; /* 1 normal messages, 0 quiet .. 7 verbose. */
+/* Maximum number of multicast addresses to filter (vs. rx-all-multicast).
+ Typical is a 64 element hash table based on the Ethernet CRC. */
+static const int multicast_filter_limit = 32;
+
+/* Set the copy breakpoint for the copy-only-tiny-frames scheme.
+ Setting to > 1518 effectively disables this feature.
+ This chip can receive into offset buffers, so the Alpha does not
+ need a copy-align. */
+static int rx_copybreak;
+static int flowctrl=1;
+
+/* media[] specifies the media type the NIC operates at.
+ autosense Autosensing active media.
+ 10mbps_hd 10Mbps half duplex.
+ 10mbps_fd 10Mbps full duplex.
+ 100mbps_hd 100Mbps half duplex.
+ 100mbps_fd 100Mbps full duplex.
+ 0 Autosensing active media.
+ 1 10Mbps half duplex.
+ 2 10Mbps full duplex.
+ 3 100Mbps half duplex.
+ 4 100Mbps full duplex.
+*/
+#define MAX_UNITS 8
+static char *media[MAX_UNITS];
+
+
+/* Operational parameters that are set at compile time. */
+
+/* Keep the ring sizes a power of two for compile efficiency.
+ The compiler will convert <unsigned>'%'<2^N> into a bit mask.
+ Making the Tx ring too large decreases the effectiveness of channel
+ bonding and packet priority, and more than 128 requires modifying the
+ Tx error recovery.
+ Large receive rings merely waste memory. */
+#define TX_RING_SIZE 32
+#define TX_QUEUE_LEN (TX_RING_SIZE - 1) /* Limit ring entries actually used. */
+#define RX_RING_SIZE 64
+#define RX_BUDGET 32
+#define TX_TOTAL_SIZE TX_RING_SIZE*sizeof(struct netdev_desc)
+#define RX_TOTAL_SIZE RX_RING_SIZE*sizeof(struct netdev_desc)
+
+/* Operational parameters that usually are not changed. */
+/* Time in jiffies before concluding the transmitter is hung. */
+#define TX_TIMEOUT (4*HZ)
+#define PKT_BUF_SZ 1536 /* Size of each temporary Rx buffer.*/
+
+/* Include files, designed to support most kernel versions 2.0.0 and later. */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/timer.h>
+#include <linux/errno.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/bitops.h>
+#include <linux/uaccess.h>
+#include <asm/processor.h> /* Processor type for cache alignment. */
+#include <asm/io.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/dma-mapping.h>
+#include <linux/crc32.h>
+#include <linux/ethtool.h>
+#include <linux/mii.h>
+
+MODULE_AUTHOR("Donald Becker <becker@scyld.com>");
+MODULE_DESCRIPTION("Sundance Alta Ethernet driver");
+MODULE_LICENSE("GPL");
+
+module_param(debug, int, 0);
+module_param(rx_copybreak, int, 0);
+module_param_array(media, charp, NULL, 0);
+module_param(flowctrl, int, 0);
+MODULE_PARM_DESC(debug, "Sundance Alta debug level (0-5)");
+MODULE_PARM_DESC(rx_copybreak, "Sundance Alta copy breakpoint for copy-only-tiny-frames");
+MODULE_PARM_DESC(flowctrl, "Sundance Alta flow control [0|1]");
+
+/*
+ Theory of Operation
+
+I. Board Compatibility
+
+This driver is designed for the Sundance Technologies "Alta" ST201 chip.
+
+II. Board-specific settings
+
+III. Driver operation
+
+IIIa. Ring buffers
+
+This driver uses two statically allocated fixed-size descriptor lists
+formed into rings by a branch from the final descriptor to the beginning of
+the list. The ring sizes are set at compile time by RX/TX_RING_SIZE.
+Some chips explicitly use only 2^N sized rings, while others use a
+'next descriptor' pointer that the driver forms into rings.
+
+IIIb/c. Transmit/Receive Structure
+
+This driver uses a zero-copy receive and transmit scheme.
+The driver allocates full frame size skbuffs for the Rx ring buffers at
+open() time and passes the skb->data field to the chip as receive data
+buffers. When an incoming frame is less than RX_COPYBREAK bytes long,
+a fresh skbuff is allocated and the frame is copied to the new skbuff.
+When the incoming frame is larger, the skbuff is passed directly up the
+protocol stack. Buffers consumed this way are replaced by newly allocated
+skbuffs in a later phase of receives.
+
+The RX_COPYBREAK value is chosen to trade-off the memory wasted by
+using a full-sized skbuff for small frames vs. the copying costs of larger
+frames. New boards are typically used in generously configured machines
+and the underfilled buffers have negligible impact compared to the benefit of
+a single allocation size, so the default value of zero results in never
+copying packets. When copying is done, the cost is usually mitigated by using
+a combined copy/checksum routine. Copying also preloads the cache, which is
+most useful with small frames.
+
+A subtle aspect of the operation is that the IP header at offset 14 in an
+ethernet frame isn't longword aligned for further processing.
+Unaligned buffers are permitted by the Sundance hardware, so
+frames are received into the skbuff at an offset of "+2", 16-byte aligning
+the IP header.
+
+IIId. Synchronization
+
+The driver runs as two independent, single-threaded flows of control. One
+is the send-packet routine, which enforces single-threaded use by the
+dev->tbusy flag. The other thread is the interrupt handler, which is single
+threaded by the hardware and interrupt handling software.
+
+The send packet thread has partial control over the Tx ring and 'dev->tbusy'
+flag. It sets the tbusy flag whenever it's queuing a Tx packet. If the next
+queue slot is empty, it clears the tbusy flag when finished otherwise it sets
+the 'lp->tx_full' flag.
+
+The interrupt handler has exclusive control over the Rx ring and records stats
+from the Tx ring. After reaping the stats, it marks the Tx queue entry as
+empty by incrementing the dirty_tx mark. Iff the 'lp->tx_full' flag is set, it
+clears both the tx_full and tbusy flags.
+
+IV. Notes
+
+IVb. References
+
+The Sundance ST201 datasheet, preliminary version.
+The Kendin KS8723 datasheet, preliminary version.
+The ICplus IP100 datasheet, preliminary version.
+http://www.scyld.com/expert/100mbps.html
+http://www.scyld.com/expert/NWay.html
+
+IVc. Errata
+
+*/
+
+/* Work-around for Kendin chip bugs. */
+#ifndef CONFIG_SUNDANCE_MMIO
+#define USE_IO_OPS 1
+#endif
+
+static const struct pci_device_id sundance_pci_tbl[] = {
+ { 0x1186, 0x1002, 0x1186, 0x1002, 0, 0, 0 },
+ { 0x1186, 0x1002, 0x1186, 0x1003, 0, 0, 1 },
+ { 0x1186, 0x1002, 0x1186, 0x1012, 0, 0, 2 },
+ { 0x1186, 0x1002, 0x1186, 0x1040, 0, 0, 3 },
+ { 0x1186, 0x1002, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 4 },
+ { 0x13F0, 0x0201, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 5 },
+ { 0x13F0, 0x0200, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 6 },
+ { }
+};
+MODULE_DEVICE_TABLE(pci, sundance_pci_tbl);
+
+enum {
+ netdev_io_size = 128
+};
+
+struct pci_id_info {
+ const char *name;
+};
+static const struct pci_id_info pci_id_tbl[] = {
+ {"D-Link DFE-550TX FAST Ethernet Adapter"},
+ {"D-Link DFE-550FX 100Mbps Fiber-optics Adapter"},
+ {"D-Link DFE-580TX 4 port Server Adapter"},
+ {"D-Link DFE-530TXS FAST Ethernet Adapter"},
+ {"D-Link DL10050-based FAST Ethernet Adapter"},
+ {"Sundance Technology Alta"},
+ {"IC Plus Corporation IP100A FAST Ethernet Adapter"},
+ { } /* terminate list. */
+};
+
+/* This driver was written to use PCI memory space, however x86-oriented
+ hardware often uses I/O space accesses. */
+
+/* Offsets to the device registers.
+ Unlike software-only systems, device drivers interact with complex hardware.
+ It's not useful to define symbolic names for every register bit in the
+ device. The name can only partially document the semantics and make
+ the driver longer and more difficult to read.
+ In general, only the important configuration values or bits changed
+ multiple times should be defined symbolically.
+*/
+enum alta_offsets {
+ DMACtrl = 0x00,
+ TxListPtr = 0x04,
+ TxDMABurstThresh = 0x08,
+ TxDMAUrgentThresh = 0x09,
+ TxDMAPollPeriod = 0x0a,
+ RxDMAStatus = 0x0c,
+ RxListPtr = 0x10,
+ DebugCtrl0 = 0x1a,
+ DebugCtrl1 = 0x1c,
+ RxDMABurstThresh = 0x14,
+ RxDMAUrgentThresh = 0x15,
+ RxDMAPollPeriod = 0x16,
+ LEDCtrl = 0x1a,
+ ASICCtrl = 0x30,
+ EEData = 0x34,
+ EECtrl = 0x36,
+ FlashAddr = 0x40,
+ FlashData = 0x44,
+ WakeEvent = 0x45,
+ TxStatus = 0x46,
+ TxFrameId = 0x47,
+ DownCounter = 0x18,
+ IntrClear = 0x4a,
+ IntrEnable = 0x4c,
+ IntrStatus = 0x4e,
+ MACCtrl0 = 0x50,
+ MACCtrl1 = 0x52,
+ StationAddr = 0x54,
+ MaxFrameSize = 0x5A,
+ RxMode = 0x5c,
+ MIICtrl = 0x5e,
+ MulticastFilter0 = 0x60,
+ MulticastFilter1 = 0x64,
+ RxOctetsLow = 0x68,
+ RxOctetsHigh = 0x6a,
+ TxOctetsLow = 0x6c,
+ TxOctetsHigh = 0x6e,
+ TxFramesOK = 0x70,
+ RxFramesOK = 0x72,
+ StatsCarrierError = 0x74,
+ StatsLateColl = 0x75,
+ StatsMultiColl = 0x76,
+ StatsOneColl = 0x77,
+ StatsTxDefer = 0x78,
+ RxMissed = 0x79,
+ StatsTxXSDefer = 0x7a,
+ StatsTxAbort = 0x7b,
+ StatsBcastTx = 0x7c,
+ StatsBcastRx = 0x7d,
+ StatsMcastTx = 0x7e,
+ StatsMcastRx = 0x7f,
+ /* Aliased and bogus values! */
+ RxStatus = 0x0c,
+};
+
+#define ASIC_HI_WORD(x) ((x) + 2)
+
+enum ASICCtrl_HiWord_bit {
+ GlobalReset = 0x0001,
+ RxReset = 0x0002,
+ TxReset = 0x0004,
+ DMAReset = 0x0008,
+ FIFOReset = 0x0010,
+ NetworkReset = 0x0020,
+ HostReset = 0x0040,
+ ResetBusy = 0x0400,
+};
+
+/* Bits in the interrupt status/mask registers. */
+enum intr_status_bits {
+ IntrSummary=0x0001, IntrPCIErr=0x0002, IntrMACCtrl=0x0008,
+ IntrTxDone=0x0004, IntrRxDone=0x0010, IntrRxStart=0x0020,
+ IntrDrvRqst=0x0040,
+ StatsMax=0x0080, LinkChange=0x0100,
+ IntrTxDMADone=0x0200, IntrRxDMADone=0x0400,
+};
+
+/* Bits in the RxMode register. */
+enum rx_mode_bits {
+ AcceptAllIPMulti=0x20, AcceptMultiHash=0x10, AcceptAll=0x08,
+ AcceptBroadcast=0x04, AcceptMulticast=0x02, AcceptMyPhys=0x01,
+};
+/* Bits in MACCtrl. */
+enum mac_ctrl0_bits {
+ EnbFullDuplex=0x20, EnbRcvLargeFrame=0x40,
+ EnbFlowCtrl=0x100, EnbPassRxCRC=0x200,
+};
+enum mac_ctrl1_bits {
+ StatsEnable=0x0020, StatsDisable=0x0040, StatsEnabled=0x0080,
+ TxEnable=0x0100, TxDisable=0x0200, TxEnabled=0x0400,
+ RxEnable=0x0800, RxDisable=0x1000, RxEnabled=0x2000,
+};
+
+/* Bits in WakeEvent register. */
+enum wake_event_bits {
+ WakePktEnable = 0x01,
+ MagicPktEnable = 0x02,
+ LinkEventEnable = 0x04,
+ WolEnable = 0x80,
+};
+
+/* The Rx and Tx buffer descriptors. */
+/* Note that using only 32-bit fields simplifies conversion to big-endian
+ architectures. */
+struct netdev_desc {
+ __le32 next_desc;
+ __le32 status;
+ struct desc_frag { __le32 addr, length; } frag;
+};
+
+/* Bits in netdev_desc.status */
+enum desc_status_bits {
+ DescOwn=0x8000,
+ DescEndPacket=0x4000,
+ DescEndRing=0x2000,
+ LastFrag=0x80000000,
+ DescIntrOnTx=0x8000,
+ DescIntrOnDMADone=0x80000000,
+ DisableAlign = 0x00000001,
+};
+
+#define PRIV_ALIGN 15 /* Required alignment mask */
+/* Use __attribute__((aligned (L1_CACHE_BYTES))) to maintain alignment
+ within the structure. */
+#define MII_CNT 4
+struct netdev_private {
+ /* Descriptor rings first for alignment. */
+ struct netdev_desc *rx_ring;
+ struct netdev_desc *tx_ring;
+ struct sk_buff* rx_skbuff[RX_RING_SIZE];
+ struct sk_buff* tx_skbuff[TX_RING_SIZE];
+ dma_addr_t tx_ring_dma;
+ dma_addr_t rx_ring_dma;
+ struct timer_list timer; /* Media monitoring timer. */
+ struct net_device *ndev; /* backpointer */
+ /* ethtool extra stats */
+ struct {
+ u64 tx_multiple_collisions;
+ u64 tx_single_collisions;
+ u64 tx_late_collisions;
+ u64 tx_deferred;
+ u64 tx_deferred_excessive;
+ u64 tx_aborted;
+ u64 tx_bcasts;
+ u64 rx_bcasts;
+ u64 tx_mcasts;
+ u64 rx_mcasts;
+ } xstats;
+ /* Frequently used values: keep some adjacent for cache effect. */
+ spinlock_t lock;
+ int msg_enable;
+ int chip_id;
+ unsigned int cur_rx, dirty_rx; /* Producer/consumer ring indices */
+ unsigned int rx_buf_sz; /* Based on MTU+slack. */
+ struct netdev_desc *last_tx; /* Last Tx descriptor used. */
+ unsigned int cur_tx, dirty_tx;
+ /* These values keep track of the transceiver/media in use. */
+ unsigned int flowctrl:1;
+ unsigned int default_port:4; /* Last dev->if_port value. */
+ unsigned int an_enable:1;
+ unsigned int speed;
+ unsigned int wol_enabled:1; /* Wake on LAN enabled */
+ struct tasklet_struct rx_tasklet;
+ struct tasklet_struct tx_tasklet;
+ int budget;
+ int cur_task;
+ /* Multicast and receive mode. */
+ spinlock_t mcastlock; /* SMP lock multicast updates. */
+ u16 mcast_filter[4];
+ /* MII transceiver section. */
+ struct mii_if_info mii_if;
+ int mii_preamble_required;
+ unsigned char phys[MII_CNT]; /* MII device addresses, only first one used. */
+ struct pci_dev *pci_dev;
+ void __iomem *base;
+ spinlock_t statlock;
+};
+
+/* The station address location in the EEPROM. */
+#define EEPROM_SA_OFFSET 0x10
+#define DEFAULT_INTR (IntrRxDMADone | IntrPCIErr | \
+ IntrDrvRqst | IntrTxDone | StatsMax | \
+ LinkChange)
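+/* DEFAULT_INTR is the set of interrupt sources left unmasked during normal
+ operation; it is written to IntrEnable in netdev_open() and re-armed at the
+ end of rx_poll() and tx_timeout(). */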
+
+static int change_mtu(struct net_device *dev, int new_mtu);
+static int eeprom_read(void __iomem *ioaddr, int location);
+static int mdio_read(struct net_device *dev, int phy_id, int location);
+static void mdio_write(struct net_device *dev, int phy_id, int location, int value);
+static int mdio_wait_link(struct net_device *dev, int wait);
+static int netdev_open(struct net_device *dev);
+static void check_duplex(struct net_device *dev);
+static void netdev_timer(struct timer_list *t);
+static void tx_timeout(struct net_device *dev, unsigned int txqueue);
+static void init_ring(struct net_device *dev);
+static netdev_tx_t start_tx(struct sk_buff *skb, struct net_device *dev);
+static int reset_tx (struct net_device *dev);
+static irqreturn_t intr_handler(int irq, void *dev_instance);
+static void rx_poll(struct tasklet_struct *t);
+static void tx_poll(struct tasklet_struct *t);
+static void refill_rx (struct net_device *dev);
+static void netdev_error(struct net_device *dev, int intr_status);
+static void set_rx_mode(struct net_device *dev);
+static int __set_mac_addr(struct net_device *dev);
+static int sundance_set_mac_addr(struct net_device *dev, void *data);
+static struct net_device_stats *get_stats(struct net_device *dev);
+static int netdev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
+static int netdev_close(struct net_device *dev);
+static const struct ethtool_ops ethtool_ops;
+
+static void sundance_reset(struct net_device *dev, unsigned long reset_cmd)
+{
+ struct netdev_private *np = netdev_priv(dev);
+ void __iomem *ioaddr = np->base + ASICCtrl;
+ int countdown;
+
+ /* ST201 documentation states ASICCtrl is a 32-bit register */
+ iowrite32 (reset_cmd | ioread32 (ioaddr), ioaddr);
+ /* ST201 documentation states reset can take up to 1 ms */
+ countdown = 10 + 1;
+ while (ioread32 (ioaddr) & (ResetBusy << 16)) {
+ if (--countdown == 0) {
+ printk(KERN_WARNING "%s : reset not completed !!\n", dev->name);
+ break;
+ }
+ udelay(100);
+ }
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static void sundance_poll_controller(struct net_device *dev)
+{
+ struct netdev_private *np = netdev_priv(dev);
+
+ disable_irq(np->pci_dev->irq);
+ intr_handler(np->pci_dev->irq, dev);
+ enable_irq(np->pci_dev->irq);
+}
+#endif
+
+static const struct net_device_ops netdev_ops = {
+ .ndo_open = netdev_open,
+ .ndo_stop = netdev_close,
+ .ndo_start_xmit = start_tx,
+ .ndo_get_stats = get_stats,
+ .ndo_set_rx_mode = set_rx_mode,
+ .ndo_eth_ioctl = netdev_ioctl,
+ .ndo_tx_timeout = tx_timeout,
+ .ndo_change_mtu = change_mtu,
+ .ndo_set_mac_address = sundance_set_mac_addr,
+ .ndo_validate_addr = eth_validate_addr,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+ .ndo_poll_controller = sundance_poll_controller,
+#endif
+};
+
+static int sundance_probe1(struct pci_dev *pdev,
+ const struct pci_device_id *ent)
+{
+ struct net_device *dev;
+ struct netdev_private *np;
+ static int card_idx;
+ int chip_idx = ent->driver_data;
+ int irq;
+ int i;
+ void __iomem *ioaddr;
+ u16 mii_ctl;
+ void *ring_space;
+ dma_addr_t ring_dma;
+#ifdef USE_IO_OPS
+ int bar = 0;
+#else
+ int bar = 1;
+#endif
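+ /* With USE_IO_OPS the driver maps BAR 0, otherwise BAR 1; presumably the
+ former is the chip's I/O-space window and the latter its memory-mapped
+ window (cf. the PCI memory space comment above the register offsets). */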
+ int phy, phy_end, phy_idx = 0;
+ __le16 addr[ETH_ALEN / 2];
+
+ if (pci_enable_device(pdev))
+ return -EIO;
+ pci_set_master(pdev);
+
+ irq = pdev->irq;
+
+ dev = alloc_etherdev(sizeof(*np));
+ if (!dev)
+ return -ENOMEM;
+ SET_NETDEV_DEV(dev, &pdev->dev);
+
+ if (pci_request_regions(pdev, DRV_NAME))
+ goto err_out_netdev;
+
+ ioaddr = pci_iomap(pdev, bar, netdev_io_size);
+ if (!ioaddr)
+ goto err_out_res;
+
+ for (i = 0; i < 3; i++)
+ addr[i] =
+ cpu_to_le16(eeprom_read(ioaddr, i + EEPROM_SA_OFFSET));
+ eth_hw_addr_set(dev, (u8 *)addr);
+
+ np = netdev_priv(dev);
+ np->ndev = dev;
+ np->base = ioaddr;
+ np->pci_dev = pdev;
+ np->chip_id = chip_idx;
+ np->msg_enable = (1 << debug) - 1;
+ spin_lock_init(&np->lock);
+ spin_lock_init(&np->statlock);
+ tasklet_setup(&np->rx_tasklet, rx_poll);
+ tasklet_setup(&np->tx_tasklet, tx_poll);
+
+ ring_space = dma_alloc_coherent(&pdev->dev, TX_TOTAL_SIZE,
+ &ring_dma, GFP_KERNEL);
+ if (!ring_space)
+ goto err_out_cleardev;
+ np->tx_ring = (struct netdev_desc *)ring_space;
+ np->tx_ring_dma = ring_dma;
+
+ ring_space = dma_alloc_coherent(&pdev->dev, RX_TOTAL_SIZE,
+ &ring_dma, GFP_KERNEL);
+ if (!ring_space)
+ goto err_out_unmap_tx;
+ np->rx_ring = (struct netdev_desc *)ring_space;
+ np->rx_ring_dma = ring_dma;
+
+ np->mii_if.dev = dev;
+ np->mii_if.mdio_read = mdio_read;
+ np->mii_if.mdio_write = mdio_write;
+ np->mii_if.phy_id_mask = 0x1f;
+ np->mii_if.reg_num_mask = 0x1f;
+
+ /* The chip-specific entries in the device structure. */
+ dev->netdev_ops = &netdev_ops;
+ dev->ethtool_ops = &ethtool_ops;
+ dev->watchdog_timeo = TX_TIMEOUT;
+
+ /* MTU range: 68 - 8191 */
+ dev->min_mtu = ETH_MIN_MTU;
+ dev->max_mtu = 8191;
+
+ pci_set_drvdata(pdev, dev);
+
+ i = register_netdev(dev);
+ if (i)
+ goto err_out_unmap_rx;
+
+ printk(KERN_INFO "%s: %s at %p, %pM, IRQ %d.\n",
+ dev->name, pci_id_tbl[chip_idx].name, ioaddr,
+ dev->dev_addr, irq);
+
+ np->phys[0] = 1; /* Default setting */
+ np->mii_preamble_required++;
+
+ /*
+ * It seems some PHYs don't deal well with address 0 being accessed
+ * first.
+ */
+ if (sundance_pci_tbl[np->chip_id].device == 0x0200) {
+ phy = 0;
+ phy_end = 31;
+ } else {
+ phy = 1;
+ phy_end = 32; /* wraps to zero, due to 'phy & 0x1f' */
+ }
+ for (; phy <= phy_end && phy_idx < MII_CNT; phy++) {
+ int phyx = phy & 0x1f;
+ int mii_status = mdio_read(dev, phyx, MII_BMSR);
+ if (mii_status != 0xffff && mii_status != 0x0000) {
+ np->phys[phy_idx++] = phyx;
+ np->mii_if.advertising = mdio_read(dev, phyx, MII_ADVERTISE);
+ if ((mii_status & 0x0040) == 0)
+ np->mii_preamble_required++;
+ printk(KERN_INFO "%s: MII PHY found at address %d, status "
+ "0x%4.4x advertising %4.4x.\n",
+ dev->name, phyx, mii_status, np->mii_if.advertising);
+ }
+ }
+ np->mii_preamble_required--;
+
+ if (phy_idx == 0) {
+ printk(KERN_INFO "%s: No MII transceiver found, aborting. ASIC status %x\n",
+ dev->name, ioread32(ioaddr + ASICCtrl));
+ goto err_out_unregister;
+ }
+
+ np->mii_if.phy_id = np->phys[0];
+
+ /* Parse override configuration */
+ np->an_enable = 1;
+ if (card_idx < MAX_UNITS) {
+ if (media[card_idx] != NULL) {
+ np->an_enable = 0;
+ if (strcmp (media[card_idx], "100mbps_fd") == 0 ||
+ strcmp (media[card_idx], "4") == 0) {
+ np->speed = 100;
+ np->mii_if.full_duplex = 1;
+ } else if (strcmp (media[card_idx], "100mbps_hd") == 0 ||
+ strcmp (media[card_idx], "3") == 0) {
+ np->speed = 100;
+ np->mii_if.full_duplex = 0;
+ } else if (strcmp (media[card_idx], "10mbps_fd") == 0 ||
+ strcmp (media[card_idx], "2") == 0) {
+ np->speed = 10;
+ np->mii_if.full_duplex = 1;
+ } else if (strcmp (media[card_idx], "10mbps_hd") == 0 ||
+ strcmp (media[card_idx], "1") == 0) {
+ np->speed = 10;
+ np->mii_if.full_duplex = 0;
+ } else {
+ np->an_enable = 1;
+ }
+ }
+ if (flowctrl == 1)
+ np->flowctrl = 1;
+ }
+
+ /* Fibre PHY? */
+ if (ioread32 (ioaddr + ASICCtrl) & 0x80) {
+ /* Default 100Mbps Full */
+ if (np->an_enable) {
+ np->speed = 100;
+ np->mii_if.full_duplex = 1;
+ np->an_enable = 0;
+ }
+ }
+ /* Reset PHY */
+ mdio_write (dev, np->phys[0], MII_BMCR, BMCR_RESET);
+ mdelay (300);
+ /* If flow control enabled, we need to advertise it.*/
+ if (np->flowctrl)
+ mdio_write (dev, np->phys[0], MII_ADVERTISE, np->mii_if.advertising | 0x0400);
+ mdio_write (dev, np->phys[0], MII_BMCR, BMCR_ANENABLE|BMCR_ANRESTART);
+ /* Force media type */
+ if (!np->an_enable) {
+ mii_ctl = 0;
+ mii_ctl |= (np->speed == 100) ? BMCR_SPEED100 : 0;
+ mii_ctl |= (np->mii_if.full_duplex) ? BMCR_FULLDPLX : 0;
+ mdio_write (dev, np->phys[0], MII_BMCR, mii_ctl);
+ printk (KERN_INFO "Override speed=%d, %s duplex\n",
+ np->speed, np->mii_if.full_duplex ? "Full" : "Half");
+
+ }
+
+ /* Perhaps move the reset here? */
+ /* Reset the chip to erase previous misconfiguration. */
+ if (netif_msg_hw(np))
+ printk("ASIC Control is %x.\n", ioread32(ioaddr + ASICCtrl));
+ sundance_reset(dev, 0x00ff << 16);
+ if (netif_msg_hw(np))
+ printk("ASIC Control is now %x.\n", ioread32(ioaddr + ASICCtrl));
+
+ card_idx++;
+ return 0;
+
+err_out_unregister:
+ unregister_netdev(dev);
+err_out_unmap_rx:
+ dma_free_coherent(&pdev->dev, RX_TOTAL_SIZE,
+ np->rx_ring, np->rx_ring_dma);
+err_out_unmap_tx:
+ dma_free_coherent(&pdev->dev, TX_TOTAL_SIZE,
+ np->tx_ring, np->tx_ring_dma);
+err_out_cleardev:
+ pci_iounmap(pdev, ioaddr);
+err_out_res:
+ pci_release_regions(pdev);
+err_out_netdev:
+ free_netdev (dev);
+ return -ENODEV;
+}
+
+static int change_mtu(struct net_device *dev, int new_mtu)
+{
+ if (netif_running(dev))
+ return -EBUSY;
+ WRITE_ONCE(dev->mtu, new_mtu);
+ return 0;
+}
+
+#define eeprom_delay(ee_addr) ioread32(ee_addr)
+/* Read the EEPROM and MII Management Data I/O (MDIO) interfaces. */
+static int eeprom_read(void __iomem *ioaddr, int location)
+{
+ int boguscnt = 10000; /* Typical 1900 ticks. */
+ iowrite16(0x0200 | (location & 0xff), ioaddr + EECtrl);
+ do {
+ eeprom_delay(ioaddr + EECtrl);
+ if (! (ioread16(ioaddr + EECtrl) & 0x8000)) {
+ return ioread16(ioaddr + EEData);
+ }
+ } while (--boguscnt > 0);
+ return 0;
+}
+
+/* MII transceiver control section.
+ Read and write the MII registers using software-generated serial
+ MDIO protocol. See the MII specifications or DP83840A data sheet
+ for details.
+
+ The maximum data clock rate is 2.5 MHz. The minimum timing is usually
+ met by back-to-back 33 MHz PCI cycles. */
+#define mdio_delay() ioread8(mdio_addr)
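+/* mdio_delay() is just a dummy read of the MII control register; the time a
+ read across the PCI bus takes is used to pace the bit-banged MDIO clock. */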
+
+enum mii_reg_bits {
+ MDIO_ShiftClk=0x0001, MDIO_Data=0x0002, MDIO_EnbOutput=0x0004,
+};
+#define MDIO_EnbIn (0)
+#define MDIO_WRITE0 (MDIO_EnbOutput)
+#define MDIO_WRITE1 (MDIO_Data | MDIO_EnbOutput)
+
+/* Generate the preamble required for initial synchronization and
+ a few older transceivers. */
+static void mdio_sync(void __iomem *mdio_addr)
+{
+ int bits = 32;
+
+ /* Establish sync by sending at least 32 logic ones. */
+ while (--bits >= 0) {
+ iowrite8(MDIO_WRITE1, mdio_addr);
+ mdio_delay();
+ iowrite8(MDIO_WRITE1 | MDIO_ShiftClk, mdio_addr);
+ mdio_delay();
+ }
+}
+
+static int mdio_read(struct net_device *dev, int phy_id, int location)
+{
+ struct netdev_private *np = netdev_priv(dev);
+ void __iomem *mdio_addr = np->base + MIICtrl;
+ int mii_cmd = (0xf6 << 10) | (phy_id << 5) | location;
+ int i, retval = 0;
+
+ if (np->mii_preamble_required)
+ mdio_sync(mdio_addr);
+
+ /* Shift the read command bits out. */
+ for (i = 15; i >= 0; i--) {
+ int dataval = (mii_cmd & (1 << i)) ? MDIO_WRITE1 : MDIO_WRITE0;
+
+ iowrite8(dataval, mdio_addr);
+ mdio_delay();
+ iowrite8(dataval | MDIO_ShiftClk, mdio_addr);
+ mdio_delay();
+ }
+ /* Read the two transition, 16 data, and wire-idle bits. */
+ for (i = 19; i > 0; i--) {
+ iowrite8(MDIO_EnbIn, mdio_addr);
+ mdio_delay();
+ retval = (retval << 1) | ((ioread8(mdio_addr) & MDIO_Data) ? 1 : 0);
+ iowrite8(MDIO_EnbIn | MDIO_ShiftClk, mdio_addr);
+ mdio_delay();
+ }
+ return (retval>>1) & 0xffff;
+}
+
+static void mdio_write(struct net_device *dev, int phy_id, int location, int value)
+{
+ struct netdev_private *np = netdev_priv(dev);
+ void __iomem *mdio_addr = np->base + MIICtrl;
+ int mii_cmd = (0x5002 << 16) | (phy_id << 23) | (location<<18) | value;
+ int i;
+
+ if (np->mii_preamble_required)
+ mdio_sync(mdio_addr);
+
+ /* Shift the command bits out. */
+ for (i = 31; i >= 0; i--) {
+ int dataval = (mii_cmd & (1 << i)) ? MDIO_WRITE1 : MDIO_WRITE0;
+
+ iowrite8(dataval, mdio_addr);
+ mdio_delay();
+ iowrite8(dataval | MDIO_ShiftClk, mdio_addr);
+ mdio_delay();
+ }
+ /* Clear out extra bits. */
+ for (i = 2; i > 0; i--) {
+ iowrite8(MDIO_EnbIn, mdio_addr);
+ mdio_delay();
+ iowrite8(MDIO_EnbIn | MDIO_ShiftClk, mdio_addr);
+ mdio_delay();
+ }
+}
+
+static int mdio_wait_link(struct net_device *dev, int wait)
+{
+ int bmsr;
+ int phy_id;
+ struct netdev_private *np;
+
+ np = netdev_priv(dev);
+ phy_id = np->phys[0];
+
+ do {
+ bmsr = mdio_read(dev, phy_id, MII_BMSR);
+ if (bmsr & 0x0004)
+ return 0;
+ mdelay(1);
+ } while (--wait > 0);
+ return -1;
+}
+
+static int netdev_open(struct net_device *dev)
+{
+ struct netdev_private *np = netdev_priv(dev);
+ void __iomem *ioaddr = np->base;
+ const int irq = np->pci_dev->irq;
+ unsigned long flags;
+ int i;
+
+ sundance_reset(dev, 0x00ff << 16);
+
+ i = request_irq(irq, intr_handler, IRQF_SHARED, dev->name, dev);
+ if (i)
+ return i;
+
+ if (netif_msg_ifup(np))
+ printk(KERN_DEBUG "%s: netdev_open() irq %d\n", dev->name, irq);
+
+ init_ring(dev);
+
+ iowrite32(np->rx_ring_dma, ioaddr + RxListPtr);
+ /* The Tx list pointer is written as packets are queued. */
+
+ /* Initialize other registers. */
+ __set_mac_addr(dev);
+#if IS_ENABLED(CONFIG_VLAN_8021Q)
+ iowrite16(dev->mtu + 18, ioaddr + MaxFrameSize);
+#else
+ iowrite16(dev->mtu + 14, ioaddr + MaxFrameSize);
+#endif
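+ /* The 18/14 byte slack on top of the MTU leaves room for the 14-byte
+ Ethernet header, plus a 4-byte VLAN tag when 802.1Q support is enabled. */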
+ if (dev->mtu > 2047)
+ iowrite32(ioread32(ioaddr + ASICCtrl) | 0x0C, ioaddr + ASICCtrl);
+
+ /* Configure the PCI bus bursts and FIFO thresholds. */
+
+ if (dev->if_port == 0)
+ dev->if_port = np->default_port;
+
+ spin_lock_init(&np->mcastlock);
+
+ set_rx_mode(dev);
+ iowrite16(0, ioaddr + IntrEnable);
+ iowrite16(0, ioaddr + DownCounter);
+ /* Set the chip to poll every N*320nsec. */
+ iowrite8(100, ioaddr + RxDMAPollPeriod);
+ iowrite8(127, ioaddr + TxDMAPollPeriod);
+ /* Fix DFE-580TX packet drop issue */
+ if (np->pci_dev->revision >= 0x14)
+ iowrite8(0x01, ioaddr + DebugCtrl1);
+ netif_start_queue(dev);
+
+ spin_lock_irqsave(&np->lock, flags);
+ reset_tx(dev);
+ spin_unlock_irqrestore(&np->lock, flags);
+
+ iowrite16 (StatsEnable | RxEnable | TxEnable, ioaddr + MACCtrl1);
+
+ /* Disable Wol */
+ iowrite8(ioread8(ioaddr + WakeEvent) | 0x00, ioaddr + WakeEvent);
+ np->wol_enabled = 0;
+
+ if (netif_msg_ifup(np))
+ printk(KERN_DEBUG "%s: Done netdev_open(), status: Rx %x Tx %x "
+ "MAC Control %x, %4.4x %4.4x.\n",
+ dev->name, ioread32(ioaddr + RxStatus), ioread8(ioaddr + TxStatus),
+ ioread32(ioaddr + MACCtrl0),
+ ioread16(ioaddr + MACCtrl1), ioread16(ioaddr + MACCtrl0));
+
+ /* Set the timer to check for link beat. */
+ timer_setup(&np->timer, netdev_timer, 0);
+ np->timer.expires = jiffies + 3*HZ;
+ add_timer(&np->timer);
+
+ /* Enable interrupts by setting the interrupt mask. */
+ iowrite16(DEFAULT_INTR, ioaddr + IntrEnable);
+
+ return 0;
+}
+
+static void check_duplex(struct net_device *dev)
+{
+ struct netdev_private *np = netdev_priv(dev);
+ void __iomem *ioaddr = np->base;
+ int mii_lpa = mdio_read(dev, np->phys[0], MII_LPA);
+ int negotiated = mii_lpa & np->mii_if.advertising;
+ int duplex;
+
+ /* Force media */
+ if (!np->an_enable || mii_lpa == 0xffff) {
+ if (np->mii_if.full_duplex)
+ iowrite16 (ioread16 (ioaddr + MACCtrl0) | EnbFullDuplex,
+ ioaddr + MACCtrl0);
+ return;
+ }
+
+ /* Autonegotiation */
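+ /* 0x0100 in the negotiated abilities is 100BASE-TX full duplex; failing
+ that, full duplex is chosen when 0x0040 (10BASE-T full duplex) is the
+ only one of the 0x01C0 ability bits both ends share. */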
+ duplex = (negotiated & 0x0100) || (negotiated & 0x01C0) == 0x0040;
+ if (np->mii_if.full_duplex != duplex) {
+ np->mii_if.full_duplex = duplex;
+ if (netif_msg_link(np))
+ printk(KERN_INFO "%s: Setting %s-duplex based on MII #%d "
+ "negotiated capability %4.4x.\n", dev->name,
+ duplex ? "full" : "half", np->phys[0], negotiated);
+ iowrite16(ioread16(ioaddr + MACCtrl0) | (duplex ? 0x20 : 0), ioaddr + MACCtrl0);
+ }
+}
+
+static void netdev_timer(struct timer_list *t)
+{
+ struct netdev_private *np = timer_container_of(np, t, timer);
+ struct net_device *dev = np->mii_if.dev;
+ void __iomem *ioaddr = np->base;
+ int next_tick = 10*HZ;
+
+ if (netif_msg_timer(np)) {
+ printk(KERN_DEBUG "%s: Media selection timer tick, intr status %4.4x, "
+ "Tx %x Rx %x.\n",
+ dev->name, ioread16(ioaddr + IntrEnable),
+ ioread8(ioaddr + TxStatus), ioread32(ioaddr + RxStatus));
+ }
+ check_duplex(dev);
+ np->timer.expires = jiffies + next_tick;
+ add_timer(&np->timer);
+}
+
+static void tx_timeout(struct net_device *dev, unsigned int txqueue)
+{
+ struct netdev_private *np = netdev_priv(dev);
+ void __iomem *ioaddr = np->base;
+ unsigned long flag;
+
+ netif_stop_queue(dev);
+ tasklet_disable_in_atomic(&np->tx_tasklet);
+ iowrite16(0, ioaddr + IntrEnable);
+ printk(KERN_WARNING "%s: Transmit timed out, TxStatus %2.2x "
+ "TxFrameId %2.2x,"
+ " resetting...\n", dev->name, ioread8(ioaddr + TxStatus),
+ ioread8(ioaddr + TxFrameId));
+
+ {
+ int i;
+ for (i=0; i<TX_RING_SIZE; i++) {
+ printk(KERN_DEBUG "%02x %08llx %08x %08x(%02x) %08x %08x\n", i,
+ (unsigned long long)(np->tx_ring_dma + i*sizeof(*np->tx_ring)),
+ le32_to_cpu(np->tx_ring[i].next_desc),
+ le32_to_cpu(np->tx_ring[i].status),
+ (le32_to_cpu(np->tx_ring[i].status) >> 2) & 0xff,
+ le32_to_cpu(np->tx_ring[i].frag.addr),
+ le32_to_cpu(np->tx_ring[i].frag.length));
+ }
+ printk(KERN_DEBUG "TxListPtr=%08x netif_queue_stopped=%d\n",
+ ioread32(np->base + TxListPtr),
+ netif_queue_stopped(dev));
+ printk(KERN_DEBUG "cur_tx=%d(%02x) dirty_tx=%d(%02x)\n",
+ np->cur_tx, np->cur_tx % TX_RING_SIZE,
+ np->dirty_tx, np->dirty_tx % TX_RING_SIZE);
+ printk(KERN_DEBUG "cur_rx=%d dirty_rx=%d\n", np->cur_rx, np->dirty_rx);
+ printk(KERN_DEBUG "cur_task=%d\n", np->cur_task);
+ }
+ spin_lock_irqsave(&np->lock, flag);
+
+ /* Stop and restart the chip's Tx processes. */
+ reset_tx(dev);
+ spin_unlock_irqrestore(&np->lock, flag);
+
+ dev->if_port = 0;
+
+ netif_trans_update(dev); /* prevent tx timeout */
+ dev->stats.tx_errors++;
+ if (np->cur_tx - np->dirty_tx < TX_QUEUE_LEN - 4) {
+ netif_wake_queue(dev);
+ }
+ iowrite16(DEFAULT_INTR, ioaddr + IntrEnable);
+ tasklet_enable(&np->tx_tasklet);
+}
+
+
+/* Initialize the Rx and Tx rings, along with various 'dev' bits. */
+static void init_ring(struct net_device *dev)
+{
+ struct netdev_private *np = netdev_priv(dev);
+ int i;
+
+ np->cur_rx = np->cur_tx = 0;
+ np->dirty_rx = np->dirty_tx = 0;
+ np->cur_task = 0;
+
+ np->rx_buf_sz = (dev->mtu <= 1520 ? PKT_BUF_SZ : dev->mtu + 16);
+
+ /* Initialize all Rx descriptors. */
+ for (i = 0; i < RX_RING_SIZE; i++) {
+ np->rx_ring[i].next_desc = cpu_to_le32(np->rx_ring_dma +
+ ((i+1)%RX_RING_SIZE)*sizeof(*np->rx_ring));
+ np->rx_ring[i].status = 0;
+ np->rx_ring[i].frag.length = 0;
+ np->rx_skbuff[i] = NULL;
+ }
+
+ /* Fill in the Rx buffers. Handle allocation failure gracefully. */
+ for (i = 0; i < RX_RING_SIZE; i++) {
+ dma_addr_t addr;
+
+ struct sk_buff *skb =
+ netdev_alloc_skb(dev, np->rx_buf_sz + 2);
+ np->rx_skbuff[i] = skb;
+ if (skb == NULL)
+ break;
+ skb_reserve(skb, 2); /* 16 byte align the IP header. */
+ addr = dma_map_single(&np->pci_dev->dev, skb->data,
+ np->rx_buf_sz, DMA_FROM_DEVICE);
+ if (dma_mapping_error(&np->pci_dev->dev, addr)) {
+ dev_kfree_skb(skb);
+ np->rx_skbuff[i] = NULL;
+ break;
+ }
+ np->rx_ring[i].frag.addr = cpu_to_le32(addr);
+ np->rx_ring[i].frag.length = cpu_to_le32(np->rx_buf_sz | LastFrag);
+ }
+ np->dirty_rx = (unsigned int)(i - RX_RING_SIZE);
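+ /* dirty_rx trails cur_rx (still 0 here) by RX_RING_SIZE - i, i.e. by the
+ number of slots that could not be given a buffer above; refill_rx() will
+ retry those slots later. */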
+
+ for (i = 0; i < TX_RING_SIZE; i++) {
+ np->tx_skbuff[i] = NULL;
+ np->tx_ring[i].status = 0;
+ }
+}
+
+static void tx_poll(struct tasklet_struct *t)
+{
+ struct netdev_private *np = from_tasklet(np, t, tx_tasklet);
+ unsigned head = np->cur_task % TX_RING_SIZE;
+ struct netdev_desc *txdesc =
+ &np->tx_ring[(np->cur_tx - 1) % TX_RING_SIZE];
+
+ /* Chain the next pointer */
+ for (; np->cur_tx - np->cur_task > 0; np->cur_task++) {
+ int entry = np->cur_task % TX_RING_SIZE;
+ txdesc = &np->tx_ring[entry];
+ if (np->last_tx) {
+ np->last_tx->next_desc = cpu_to_le32(np->tx_ring_dma +
+ entry*sizeof(struct netdev_desc));
+ }
+ np->last_tx = txdesc;
+ }
+ /* Mark the latest descriptor in the Tx ring */
+ txdesc->status |= cpu_to_le32(DescIntrOnTx);
+
+ if (ioread32 (np->base + TxListPtr) == 0)
+ iowrite32 (np->tx_ring_dma + head * sizeof(struct netdev_desc),
+ np->base + TxListPtr);
+}
+
+static netdev_tx_t
+start_tx (struct sk_buff *skb, struct net_device *dev)
+{
+ struct netdev_private *np = netdev_priv(dev);
+ struct netdev_desc *txdesc;
+ dma_addr_t addr;
+ unsigned entry;
+
+ /* Calculate the next Tx descriptor entry. */
+ entry = np->cur_tx % TX_RING_SIZE;
+ np->tx_skbuff[entry] = skb;
+ txdesc = &np->tx_ring[entry];
+
+ addr = dma_map_single(&np->pci_dev->dev, skb->data, skb->len,
+ DMA_TO_DEVICE);
+ if (dma_mapping_error(&np->pci_dev->dev, addr))
+ goto drop_frame;
+
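+ /* The ring index is stored in bits 2..9 of the status word as a software
+ frame id; intr_handler() compares it against the chip's TxFrameId
+ register to work out which descriptors have completed. */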
+ txdesc->next_desc = 0;
+ txdesc->status = cpu_to_le32 ((entry << 2) | DisableAlign);
+ txdesc->frag.addr = cpu_to_le32(addr);
+ txdesc->frag.length = cpu_to_le32 (skb->len | LastFrag);
+
+ /* Increment cur_tx before tasklet_schedule() */
+ np->cur_tx++;
+ mb();
+ /* Schedule a tx_poll() task */
+ tasklet_schedule(&np->tx_tasklet);
+
+ /* On some architectures: explicitly flush cache lines here. */
+ if (np->cur_tx - np->dirty_tx < TX_QUEUE_LEN - 1 &&
+ !netif_queue_stopped(dev)) {
+ /* do nothing */
+ } else {
+ netif_stop_queue (dev);
+ }
+ if (netif_msg_tx_queued(np)) {
+ printk (KERN_DEBUG
+ "%s: Transmit frame #%d queued in slot %d.\n",
+ dev->name, np->cur_tx, entry);
+ }
+ return NETDEV_TX_OK;
+
+drop_frame:
+ dev_kfree_skb_any(skb);
+ np->tx_skbuff[entry] = NULL;
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
+}
+
+/* Reset hardware tx and free all of tx buffers */
+static int
+reset_tx (struct net_device *dev)
+{
+ struct netdev_private *np = netdev_priv(dev);
+ void __iomem *ioaddr = np->base;
+ struct sk_buff *skb;
+ int i;
+
+ /* Reset tx logic, TxListPtr will be cleaned */
+ iowrite16 (TxDisable, ioaddr + MACCtrl1);
+ sundance_reset(dev, (NetworkReset|FIFOReset|DMAReset|TxReset) << 16);
+
+ /* free all tx skbuff */
+ for (i = 0; i < TX_RING_SIZE; i++) {
+ np->tx_ring[i].next_desc = 0;
+
+ skb = np->tx_skbuff[i];
+ if (skb) {
+ dma_unmap_single(&np->pci_dev->dev,
+ le32_to_cpu(np->tx_ring[i].frag.addr),
+ skb->len, DMA_TO_DEVICE);
+ dev_kfree_skb_any(skb);
+ np->tx_skbuff[i] = NULL;
+ dev->stats.tx_dropped++;
+ }
+ }
+ np->cur_tx = np->dirty_tx = 0;
+ np->cur_task = 0;
+
+ np->last_tx = NULL;
+ iowrite8(127, ioaddr + TxDMAPollPeriod);
+
+ iowrite16 (StatsEnable | RxEnable | TxEnable, ioaddr + MACCtrl1);
+ return 0;
+}
+
+/* The interrupt handler cleans up after the Tx thread
+ and schedules Rx work. */
+static irqreturn_t intr_handler(int irq, void *dev_instance)
+{
+ struct net_device *dev = (struct net_device *)dev_instance;
+ struct netdev_private *np = netdev_priv(dev);
+ void __iomem *ioaddr = np->base;
+ int hw_frame_id;
+ int tx_cnt;
+ int tx_status;
+ int handled = 0;
+ int i;
+
+ do {
+ int intr_status = ioread16(ioaddr + IntrStatus);
+ iowrite16(intr_status, ioaddr + IntrStatus);
+
+ if (netif_msg_intr(np))
+ printk(KERN_DEBUG "%s: Interrupt, status %4.4x.\n",
+ dev->name, intr_status);
+
+ if (!(intr_status & DEFAULT_INTR))
+ break;
+
+ handled = 1;
+
+ if (intr_status & (IntrRxDMADone)) {
+ iowrite16(DEFAULT_INTR & ~(IntrRxDone|IntrRxDMADone),
+ ioaddr + IntrEnable);
+ if (np->budget < 0)
+ np->budget = RX_BUDGET;
+ tasklet_schedule(&np->rx_tasklet);
+ }
+ if (intr_status & (IntrTxDone | IntrDrvRqst)) {
+ tx_status = ioread16 (ioaddr + TxStatus);
+ for (tx_cnt=32; tx_status & 0x80; --tx_cnt) {
+ if (netif_msg_tx_done(np))
+ printk
+ ("%s: Transmit status is %2.2x.\n",
+ dev->name, tx_status);
+ if (tx_status & 0x1e) {
+ if (netif_msg_tx_err(np))
+ printk("%s: Transmit error status %4.4x.\n",
+ dev->name, tx_status);
+ dev->stats.tx_errors++;
+ if (tx_status & 0x10)
+ dev->stats.tx_fifo_errors++;
+ if (tx_status & 0x08)
+ dev->stats.collisions++;
+ if (tx_status & 0x04)
+ dev->stats.tx_fifo_errors++;
+ if (tx_status & 0x02)
+ dev->stats.tx_window_errors++;
+
+ /*
+ ** This reset has been verified on
+ ** DFE-580TX boards ! phdm@macqel.be.
+ */
+ if (tx_status & 0x10) { /* TxUnderrun */
+ /* Restart Tx FIFO and transmitter */
+ sundance_reset(dev, (NetworkReset|FIFOReset|TxReset) << 16);
+ /* No need to reset the Tx pointer here */
+ }
+ /* Restart the Tx. Need to make sure tx enabled */
+ i = 10;
+ do {
+ iowrite16(ioread16(ioaddr + MACCtrl1) | TxEnable, ioaddr + MACCtrl1);
+ if (ioread16(ioaddr + MACCtrl1) & TxEnabled)
+ break;
+ mdelay(1);
+ } while (--i);
+ }
+ /* Yup, this is a documentation bug. It cost me *hours*. */
+ iowrite16 (0, ioaddr + TxStatus);
+ if (tx_cnt < 0) {
+ iowrite32(5000, ioaddr + DownCounter);
+ break;
+ }
+ tx_status = ioread16 (ioaddr + TxStatus);
+ }
+ hw_frame_id = (tx_status >> 8) & 0xff;
+ } else {
+ hw_frame_id = ioread8(ioaddr + TxFrameId);
+ }
+
+ if (np->pci_dev->revision >= 0x14) {
+ spin_lock(&np->lock);
+ for (; np->cur_tx - np->dirty_tx > 0; np->dirty_tx++) {
+ int entry = np->dirty_tx % TX_RING_SIZE;
+ struct sk_buff *skb;
+ int sw_frame_id;
+ sw_frame_id = (le32_to_cpu(
+ np->tx_ring[entry].status) >> 2) & 0xff;
+ if (sw_frame_id == hw_frame_id &&
+ !(le32_to_cpu(np->tx_ring[entry].status)
+ & 0x00010000))
+ break;
+ if (sw_frame_id == (hw_frame_id + 1) %
+ TX_RING_SIZE)
+ break;
+ skb = np->tx_skbuff[entry];
+ /* Free the original skb. */
+ dma_unmap_single(&np->pci_dev->dev,
+ le32_to_cpu(np->tx_ring[entry].frag.addr),
+ skb->len, DMA_TO_DEVICE);
+ dev_consume_skb_irq(np->tx_skbuff[entry]);
+ np->tx_skbuff[entry] = NULL;
+ np->tx_ring[entry].frag.addr = 0;
+ np->tx_ring[entry].frag.length = 0;
+ }
+ spin_unlock(&np->lock);
+ } else {
+ spin_lock(&np->lock);
+ for (; np->cur_tx - np->dirty_tx > 0; np->dirty_tx++) {
+ int entry = np->dirty_tx % TX_RING_SIZE;
+ struct sk_buff *skb;
+ if (!(le32_to_cpu(np->tx_ring[entry].status)
+ & 0x00010000))
+ break;
+ skb = np->tx_skbuff[entry];
+ /* Free the original skb. */
+ dma_unmap_single(&np->pci_dev->dev,
+ le32_to_cpu(np->tx_ring[entry].frag.addr),
+ skb->len, DMA_TO_DEVICE);
+ dev_consume_skb_irq(np->tx_skbuff[entry]);
+ np->tx_skbuff[entry] = NULL;
+ np->tx_ring[entry].frag.addr = 0;
+ np->tx_ring[entry].frag.length = 0;
+ }
+ spin_unlock(&np->lock);
+ }
+
+ if (netif_queue_stopped(dev) &&
+ np->cur_tx - np->dirty_tx < TX_QUEUE_LEN - 4) {
+ /* The ring is no longer full, clear busy flag. */
+ netif_wake_queue (dev);
+ }
+ /* Abnormal error summary/uncommon events handlers. */
+ if (intr_status & (IntrPCIErr | LinkChange | StatsMax))
+ netdev_error(dev, intr_status);
+ } while (0);
+ if (netif_msg_intr(np))
+ printk(KERN_DEBUG "%s: exiting interrupt, status=%#4.4x.\n",
+ dev->name, ioread16(ioaddr + IntrStatus));
+ return IRQ_RETVAL(handled);
+}
+
+static void rx_poll(struct tasklet_struct *t)
+{
+ struct netdev_private *np = from_tasklet(np, t, rx_tasklet);
+ struct net_device *dev = np->ndev;
+ int entry = np->cur_rx % RX_RING_SIZE;
+ int boguscnt = np->budget;
+ void __iomem *ioaddr = np->base;
+ int received = 0;
+
+ /* If EOP is set on the next entry, it's a new packet. Send it up. */
+ while (1) {
+ struct netdev_desc *desc = &(np->rx_ring[entry]);
+ u32 frame_status = le32_to_cpu(desc->status);
+ int pkt_len;
+
+ if (--boguscnt < 0) {
+ goto not_done;
+ }
+ if (!(frame_status & DescOwn))
+ break;
+ pkt_len = frame_status & 0x1fff; /* Chip omits the CRC. */
+ if (netif_msg_rx_status(np))
+ printk(KERN_DEBUG " netdev_rx() status was %8.8x.\n",
+ frame_status);
+ if (frame_status & 0x001f4000) {
+ /* There was an error. */
+ if (netif_msg_rx_err(np))
+ printk(KERN_DEBUG " netdev_rx() Rx error was %8.8x.\n",
+ frame_status);
+ dev->stats.rx_errors++;
+ if (frame_status & 0x00100000)
+ dev->stats.rx_length_errors++;
+ if (frame_status & 0x00010000)
+ dev->stats.rx_fifo_errors++;
+ if (frame_status & 0x00060000)
+ dev->stats.rx_frame_errors++;
+ if (frame_status & 0x00080000)
+ dev->stats.rx_crc_errors++;
+ if (frame_status & 0x00100000) {
+ printk(KERN_WARNING "%s: Oversized Ethernet frame,"
+ " status %8.8x.\n",
+ dev->name, frame_status);
+ }
+ } else {
+ struct sk_buff *skb;
+#ifndef final_version
+ if (netif_msg_rx_status(np))
+ printk(KERN_DEBUG " netdev_rx() normal Rx pkt length %d"
+ ", bogus_cnt %d.\n",
+ pkt_len, boguscnt);
+#endif
+ /* Check if the packet is long enough to accept without copying
+ to a minimally-sized skbuff. */
+ if (pkt_len < rx_copybreak &&
+ (skb = netdev_alloc_skb(dev, pkt_len + 2)) != NULL) {
+ skb_reserve(skb, 2); /* 16 byte align the IP header */
+ dma_sync_single_for_cpu(&np->pci_dev->dev,
+ le32_to_cpu(desc->frag.addr),
+ np->rx_buf_sz, DMA_FROM_DEVICE);
+ skb_copy_to_linear_data(skb, np->rx_skbuff[entry]->data, pkt_len);
+ dma_sync_single_for_device(&np->pci_dev->dev,
+ le32_to_cpu(desc->frag.addr),
+ np->rx_buf_sz, DMA_FROM_DEVICE);
+ skb_put(skb, pkt_len);
+ } else {
+ dma_unmap_single(&np->pci_dev->dev,
+ le32_to_cpu(desc->frag.addr),
+ np->rx_buf_sz, DMA_FROM_DEVICE);
+ skb_put(skb = np->rx_skbuff[entry], pkt_len);
+ np->rx_skbuff[entry] = NULL;
+ }
+ skb->protocol = eth_type_trans(skb, dev);
+ /* Note: checksum -> skb->ip_summed = CHECKSUM_UNNECESSARY; */
+ netif_rx(skb);
+ }
+ entry = (entry + 1) % RX_RING_SIZE;
+ received++;
+ }
+ np->cur_rx = entry;
+ refill_rx (dev);
+ np->budget -= received;
+ iowrite16(DEFAULT_INTR, ioaddr + IntrEnable);
+ return;
+
+not_done:
+ np->cur_rx = entry;
+ refill_rx (dev);
+ if (!received)
+ received = 1;
+ np->budget -= received;
+ if (np->budget <= 0)
+ np->budget = RX_BUDGET;
+ tasklet_schedule(&np->rx_tasklet);
+}
+
+static void refill_rx (struct net_device *dev)
+{
+ struct netdev_private *np = netdev_priv(dev);
+ int entry;
+
+ /* Refill the Rx ring buffers. */
+ for (;(np->cur_rx - np->dirty_rx + RX_RING_SIZE) % RX_RING_SIZE > 0;
+ np->dirty_rx = (np->dirty_rx + 1) % RX_RING_SIZE) {
+ struct sk_buff *skb;
+ dma_addr_t addr;
+
+ entry = np->dirty_rx % RX_RING_SIZE;
+ if (np->rx_skbuff[entry] == NULL) {
+ skb = netdev_alloc_skb(dev, np->rx_buf_sz + 2);
+ np->rx_skbuff[entry] = skb;
+ if (skb == NULL)
+ break; /* Better luck next round. */
+ skb_reserve(skb, 2); /* Align IP on 16 byte boundaries */
+ addr = dma_map_single(&np->pci_dev->dev, skb->data,
+ np->rx_buf_sz, DMA_FROM_DEVICE);
+ if (dma_mapping_error(&np->pci_dev->dev, addr)) {
+ dev_kfree_skb_irq(skb);
+ np->rx_skbuff[entry] = NULL;
+ break;
+ }
+
+ np->rx_ring[entry].frag.addr = cpu_to_le32(addr);
+ }
+ /* Perhaps we need not reset this field. */
+ np->rx_ring[entry].frag.length =
+ cpu_to_le32(np->rx_buf_sz | LastFrag);
+ np->rx_ring[entry].status = 0;
+ }
+}
+static void netdev_error(struct net_device *dev, int intr_status)
+{
+ struct netdev_private *np = netdev_priv(dev);
+ void __iomem *ioaddr = np->base;
+ u16 mii_ctl, mii_advertise, mii_lpa;
+ int speed;
+
+ if (intr_status & LinkChange) {
+ if (mdio_wait_link(dev, 10) == 0) {
+ printk(KERN_INFO "%s: Link up\n", dev->name);
+ if (np->an_enable) {
+ mii_advertise = mdio_read(dev, np->phys[0],
+ MII_ADVERTISE);
+ mii_lpa = mdio_read(dev, np->phys[0], MII_LPA);
+ mii_advertise &= mii_lpa;
+ printk(KERN_INFO "%s: Link changed: ",
+ dev->name);
+ if (mii_advertise & ADVERTISE_100FULL) {
+ np->speed = 100;
+ printk("100Mbps, full duplex\n");
+ } else if (mii_advertise & ADVERTISE_100HALF) {
+ np->speed = 100;
+ printk("100Mbps, half duplex\n");
+ } else if (mii_advertise & ADVERTISE_10FULL) {
+ np->speed = 10;
+ printk("10Mbps, full duplex\n");
+ } else if (mii_advertise & ADVERTISE_10HALF) {
+ np->speed = 10;
+ printk("10Mbps, half duplex\n");
+ } else
+ printk("\n");
+
+ } else {
+ mii_ctl = mdio_read(dev, np->phys[0], MII_BMCR);
+ speed = (mii_ctl & BMCR_SPEED100) ? 100 : 10;
+ np->speed = speed;
+ printk(KERN_INFO "%s: Link changed: %dMbps ,",
+ dev->name, speed);
+ printk("%s duplex.\n",
+ (mii_ctl & BMCR_FULLDPLX) ?
+ "full" : "half");
+ }
+ check_duplex(dev);
+ if (np->flowctrl && np->mii_if.full_duplex) {
+ iowrite16(ioread16(ioaddr + MulticastFilter1+2) | 0x0200,
+ ioaddr + MulticastFilter1+2);
+ iowrite16(ioread16(ioaddr + MACCtrl0) | EnbFlowCtrl,
+ ioaddr + MACCtrl0);
+ }
+ netif_carrier_on(dev);
+ } else {
+ printk(KERN_INFO "%s: Link down\n", dev->name);
+ netif_carrier_off(dev);
+ }
+ }
+ if (intr_status & StatsMax) {
+ get_stats(dev);
+ }
+ if (intr_status & IntrPCIErr) {
+ printk(KERN_ERR "%s: Something Wicked happened! %4.4x.\n",
+ dev->name, intr_status);
+ /* We must do a global reset of DMA to continue. */
+ }
+}
+
+static struct net_device_stats *get_stats(struct net_device *dev)
+{
+ struct netdev_private *np = netdev_priv(dev);
+ void __iomem *ioaddr = np->base;
+ unsigned long flags;
+ u8 late_coll, single_coll, mult_coll;
+
+ spin_lock_irqsave(&np->statlock, flags);
+ /* The chip only needs to report frames it silently dropped. */
+ dev->stats.rx_missed_errors += ioread8(ioaddr + RxMissed);
+ dev->stats.tx_packets += ioread16(ioaddr + TxFramesOK);
+ dev->stats.rx_packets += ioread16(ioaddr + RxFramesOK);
+ dev->stats.tx_carrier_errors += ioread8(ioaddr + StatsCarrierError);
+
+ mult_coll = ioread8(ioaddr + StatsMultiColl);
+ np->xstats.tx_multiple_collisions += mult_coll;
+ single_coll = ioread8(ioaddr + StatsOneColl);
+ np->xstats.tx_single_collisions += single_coll;
+ late_coll = ioread8(ioaddr + StatsLateColl);
+ np->xstats.tx_late_collisions += late_coll;
+ dev->stats.collisions += mult_coll
+ + single_coll
+ + late_coll;
+
+ np->xstats.tx_deferred += ioread8(ioaddr + StatsTxDefer);
+ np->xstats.tx_deferred_excessive += ioread8(ioaddr + StatsTxXSDefer);
+ np->xstats.tx_aborted += ioread8(ioaddr + StatsTxAbort);
+ np->xstats.tx_bcasts += ioread8(ioaddr + StatsBcastTx);
+ np->xstats.rx_bcasts += ioread8(ioaddr + StatsBcastRx);
+ np->xstats.tx_mcasts += ioread8(ioaddr + StatsMcastTx);
+ np->xstats.rx_mcasts += ioread8(ioaddr + StatsMcastRx);
+
+ dev->stats.tx_bytes += ioread16(ioaddr + TxOctetsLow);
+ dev->stats.tx_bytes += ioread16(ioaddr + TxOctetsHigh) << 16;
+ dev->stats.rx_bytes += ioread16(ioaddr + RxOctetsLow);
+ dev->stats.rx_bytes += ioread16(ioaddr + RxOctetsHigh) << 16;
+
+ spin_unlock_irqrestore(&np->statlock, flags);
+
+ return &dev->stats;
+}
+
+static void set_rx_mode(struct net_device *dev)
+{
+ struct netdev_private *np = netdev_priv(dev);
+ void __iomem *ioaddr = np->base;
+ u16 mc_filter[4]; /* Multicast hash filter */
+ u32 rx_mode;
+ int i;
+
+ if (dev->flags & IFF_PROMISC) { /* Set promiscuous. */
+ memset(mc_filter, 0xff, sizeof(mc_filter));
+ rx_mode = AcceptBroadcast | AcceptMulticast | AcceptAll | AcceptMyPhys;
+ } else if ((netdev_mc_count(dev) > multicast_filter_limit) ||
+ (dev->flags & IFF_ALLMULTI)) {
+ /* Too many to match, or accept all multicasts. */
+ memset(mc_filter, 0xff, sizeof(mc_filter));
+ rx_mode = AcceptBroadcast | AcceptMulticast | AcceptMyPhys;
+ } else if (!netdev_mc_empty(dev)) {
+ struct netdev_hw_addr *ha;
+ int bit;
+ int index;
+ int crc;
+ memset (mc_filter, 0, sizeof (mc_filter));
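+ /* Hash each address with a little-endian CRC-32 and use the CRC's top
+ six bits as an index into the 64-bit multicast filter, which is spread
+ across four 16-bit registers. */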
+ netdev_for_each_mc_addr(ha, dev) {
+ crc = ether_crc_le(ETH_ALEN, ha->addr);
+ for (index=0, bit=0; bit < 6; bit++, crc <<= 1)
+ if (crc & 0x80000000) index |= 1 << bit;
+ mc_filter[index/16] |= (1 << (index % 16));
+ }
+ rx_mode = AcceptBroadcast | AcceptMultiHash | AcceptMyPhys;
+ } else {
+ iowrite8(AcceptBroadcast | AcceptMyPhys, ioaddr + RxMode);
+ return;
+ }
+ if (np->mii_if.full_duplex && np->flowctrl)
+ mc_filter[3] |= 0x0200;
+
+ for (i = 0; i < 4; i++)
+ iowrite16(mc_filter[i], ioaddr + MulticastFilter0 + i*2);
+ iowrite8(rx_mode, ioaddr + RxMode);
+}
+
+static int __set_mac_addr(struct net_device *dev)
+{
+ struct netdev_private *np = netdev_priv(dev);
+ u16 addr16;
+
+ addr16 = (dev->dev_addr[0] | (dev->dev_addr[1] << 8));
+ iowrite16(addr16, np->base + StationAddr);
+ addr16 = (dev->dev_addr[2] | (dev->dev_addr[3] << 8));
+ iowrite16(addr16, np->base + StationAddr+2);
+ addr16 = (dev->dev_addr[4] | (dev->dev_addr[5] << 8));
+ iowrite16(addr16, np->base + StationAddr+4);
+ return 0;
+}
+
+/* Invoked with rtnl_lock held */
+static int sundance_set_mac_addr(struct net_device *dev, void *data)
+{
+ const struct sockaddr *addr = data;
+
+ if (!is_valid_ether_addr(addr->sa_data))
+ return -EADDRNOTAVAIL;
+ eth_hw_addr_set(dev, addr->sa_data);
+ __set_mac_addr(dev);
+
+ return 0;
+}
+
+static const struct {
+ const char name[ETH_GSTRING_LEN];
+} sundance_stats[] = {
+ { "tx_multiple_collisions" },
+ { "tx_single_collisions" },
+ { "tx_late_collisions" },
+ { "tx_deferred" },
+ { "tx_deferred_excessive" },
+ { "tx_aborted" },
+ { "tx_bcasts" },
+ { "rx_bcasts" },
+ { "tx_mcasts" },
+ { "rx_mcasts" },
+};
+
+static int check_if_running(struct net_device *dev)
+{
+ if (!netif_running(dev))
+ return -EINVAL;
+ return 0;
+}
+
+static void get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
+{
+ struct netdev_private *np = netdev_priv(dev);
+ strscpy(info->driver, DRV_NAME, sizeof(info->driver));
+ strscpy(info->bus_info, pci_name(np->pci_dev), sizeof(info->bus_info));
+}
+
+static int get_link_ksettings(struct net_device *dev,
+ struct ethtool_link_ksettings *cmd)
+{
+ struct netdev_private *np = netdev_priv(dev);
+ spin_lock_irq(&np->lock);
+ mii_ethtool_get_link_ksettings(&np->mii_if, cmd);
+ spin_unlock_irq(&np->lock);
+ return 0;
+}
+
+static int set_link_ksettings(struct net_device *dev,
+ const struct ethtool_link_ksettings *cmd)
+{
+ struct netdev_private *np = netdev_priv(dev);
+ int res;
+ spin_lock_irq(&np->lock);
+ res = mii_ethtool_set_link_ksettings(&np->mii_if, cmd);
+ spin_unlock_irq(&np->lock);
+ return res;
+}
+
+static int nway_reset(struct net_device *dev)
+{
+ struct netdev_private *np = netdev_priv(dev);
+ return mii_nway_restart(&np->mii_if);
+}
+
+static u32 get_link(struct net_device *dev)
+{
+ struct netdev_private *np = netdev_priv(dev);
+ return mii_link_ok(&np->mii_if);
+}
+
+static u32 get_msglevel(struct net_device *dev)
+{
+ struct netdev_private *np = netdev_priv(dev);
+ return np->msg_enable;
+}
+
+static void set_msglevel(struct net_device *dev, u32 val)
+{
+ struct netdev_private *np = netdev_priv(dev);
+ np->msg_enable = val;
+}
+
+static void get_strings(struct net_device *dev, u32 stringset,
+ u8 *data)
+{
+ if (stringset == ETH_SS_STATS)
+ memcpy(data, sundance_stats, sizeof(sundance_stats));
+}
+
+static int get_sset_count(struct net_device *dev, int sset)
+{
+ switch (sset) {
+ case ETH_SS_STATS:
+ return ARRAY_SIZE(sundance_stats);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static void get_ethtool_stats(struct net_device *dev,
+ struct ethtool_stats *stats, u64 *data)
+{
+ struct netdev_private *np = netdev_priv(dev);
+ int i = 0;
+
+ get_stats(dev);
+ data[i++] = np->xstats.tx_multiple_collisions;
+ data[i++] = np->xstats.tx_single_collisions;
+ data[i++] = np->xstats.tx_late_collisions;
+ data[i++] = np->xstats.tx_deferred;
+ data[i++] = np->xstats.tx_deferred_excessive;
+ data[i++] = np->xstats.tx_aborted;
+ data[i++] = np->xstats.tx_bcasts;
+ data[i++] = np->xstats.rx_bcasts;
+ data[i++] = np->xstats.tx_mcasts;
+ data[i++] = np->xstats.rx_mcasts;
+}
+
+#ifdef CONFIG_PM
+
+static void sundance_get_wol(struct net_device *dev,
+ struct ethtool_wolinfo *wol)
+{
+ struct netdev_private *np = netdev_priv(dev);
+ void __iomem *ioaddr = np->base;
+ u8 wol_bits;
+
+ wol->wolopts = 0;
+
+ wol->supported = (WAKE_PHY | WAKE_MAGIC);
+ if (!np->wol_enabled)
+ return;
+
+ wol_bits = ioread8(ioaddr + WakeEvent);
+ if (wol_bits & MagicPktEnable)
+ wol->wolopts |= WAKE_MAGIC;
+ if (wol_bits & LinkEventEnable)
+ wol->wolopts |= WAKE_PHY;
+}
+
+static int sundance_set_wol(struct net_device *dev,
+ struct ethtool_wolinfo *wol)
+{
+ struct netdev_private *np = netdev_priv(dev);
+ void __iomem *ioaddr = np->base;
+ u8 wol_bits;
+
+ if (!device_can_wakeup(&np->pci_dev->dev))
+ return -EOPNOTSUPP;
+
+ np->wol_enabled = !!(wol->wolopts);
+ wol_bits = ioread8(ioaddr + WakeEvent);
+ wol_bits &= ~(WakePktEnable | MagicPktEnable |
+ LinkEventEnable | WolEnable);
+
+ if (np->wol_enabled) {
+ if (wol->wolopts & WAKE_MAGIC)
+ wol_bits |= (MagicPktEnable | WolEnable);
+ if (wol->wolopts & WAKE_PHY)
+ wol_bits |= (LinkEventEnable | WolEnable);
+ }
+ iowrite8(wol_bits, ioaddr + WakeEvent);
+
+ device_set_wakeup_enable(&np->pci_dev->dev, np->wol_enabled);
+
+ return 0;
+}
+#else
+#define sundance_get_wol NULL
+#define sundance_set_wol NULL
+#endif /* CONFIG_PM */
+
+static const struct ethtool_ops ethtool_ops = {
+ .begin = check_if_running,
+ .get_drvinfo = get_drvinfo,
+ .nway_reset = nway_reset,
+ .get_link = get_link,
+ .get_wol = sundance_get_wol,
+ .set_wol = sundance_set_wol,
+ .get_msglevel = get_msglevel,
+ .set_msglevel = set_msglevel,
+ .get_strings = get_strings,
+ .get_sset_count = get_sset_count,
+ .get_ethtool_stats = get_ethtool_stats,
+ .get_link_ksettings = get_link_ksettings,
+ .set_link_ksettings = set_link_ksettings,
+};
+
+static int netdev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+{
+ struct netdev_private *np = netdev_priv(dev);
+ int rc;
+
+ if (!netif_running(dev))
+ return -EINVAL;
+
+ spin_lock_irq(&np->lock);
+ rc = generic_mii_ioctl(&np->mii_if, if_mii(rq), cmd, NULL);
+ spin_unlock_irq(&np->lock);
+
+ return rc;
+}
+
+static int netdev_close(struct net_device *dev)
+{
+ struct netdev_private *np = netdev_priv(dev);
+ void __iomem *ioaddr = np->base;
+ struct sk_buff *skb;
+ int i;
+
+ /* Wait and kill tasklet */
+ tasklet_kill(&np->rx_tasklet);
+ tasklet_kill(&np->tx_tasklet);
+ np->cur_tx = 0;
+ np->dirty_tx = 0;
+ np->cur_task = 0;
+ np->last_tx = NULL;
+
+ netif_stop_queue(dev);
+
+ if (netif_msg_ifdown(np)) {
+ printk(KERN_DEBUG "%s: Shutting down ethercard, status was Tx %2.2x "
+ "Rx %4.4x Int %2.2x.\n",
+ dev->name, ioread8(ioaddr + TxStatus),
+ ioread32(ioaddr + RxStatus), ioread16(ioaddr + IntrStatus));
+ printk(KERN_DEBUG "%s: Queue pointers were Tx %d / %d, Rx %d / %d.\n",
+ dev->name, np->cur_tx, np->dirty_tx, np->cur_rx, np->dirty_rx);
+ }
+
+ /* Disable interrupts by clearing the interrupt mask. */
+ iowrite16(0x0000, ioaddr + IntrEnable);
+
+ /* Disable Rx and Tx DMA so resources can be released safely */
+ iowrite32(0x500, ioaddr + DMACtrl);
+
+ /* Stop the chip's Tx and Rx processes. */
+ iowrite16(TxDisable | RxDisable | StatsDisable, ioaddr + MACCtrl1);
+
+ for (i = 2000; i > 0; i--) {
+ if ((ioread32(ioaddr + DMACtrl) & 0xc000) == 0)
+ break;
+ mdelay(1);
+ }
+
+ iowrite16(GlobalReset | DMAReset | FIFOReset | NetworkReset,
+ ioaddr + ASIC_HI_WORD(ASICCtrl));
+
+ for (i = 2000; i > 0; i--) {
+ if ((ioread16(ioaddr + ASIC_HI_WORD(ASICCtrl)) & ResetBusy) == 0)
+ break;
+ mdelay(1);
+ }
+
+#ifdef __i386__
+ if (netif_msg_hw(np)) {
+ printk(KERN_DEBUG " Tx ring at %8.8x:\n",
+ (int)(np->tx_ring_dma));
+ for (i = 0; i < TX_RING_SIZE; i++)
+ printk(KERN_DEBUG " #%d desc. %4.4x %8.8x %8.8x.\n",
+ i, np->tx_ring[i].status, np->tx_ring[i].frag.addr,
+ np->tx_ring[i].frag.length);
+ printk(KERN_DEBUG " Rx ring %8.8x:\n",
+ (int)(np->rx_ring_dma));
+ for (i = 0; i < /*RX_RING_SIZE*/4 ; i++) {
+ printk(KERN_DEBUG " #%d desc. %4.4x %4.4x %8.8x\n",
+ i, np->rx_ring[i].status, np->rx_ring[i].frag.addr,
+ np->rx_ring[i].frag.length);
+ }
+ }
+#endif /* __i386__ debugging only */
+
+ free_irq(np->pci_dev->irq, dev);
+
+ timer_delete_sync(&np->timer);
+
+ /* Free all the skbuffs in the Rx queue. */
+ for (i = 0; i < RX_RING_SIZE; i++) {
+ np->rx_ring[i].status = 0;
+ skb = np->rx_skbuff[i];
+ if (skb) {
+ dma_unmap_single(&np->pci_dev->dev,
+ le32_to_cpu(np->rx_ring[i].frag.addr),
+ np->rx_buf_sz, DMA_FROM_DEVICE);
+ dev_kfree_skb(skb);
+ np->rx_skbuff[i] = NULL;
+ }
+ np->rx_ring[i].frag.addr = cpu_to_le32(0xBADF00D0); /* poison */
+ }
+ for (i = 0; i < TX_RING_SIZE; i++) {
+ np->tx_ring[i].next_desc = 0;
+ skb = np->tx_skbuff[i];
+ if (skb) {
+ dma_unmap_single(&np->pci_dev->dev,
+ le32_to_cpu(np->tx_ring[i].frag.addr),
+ skb->len, DMA_TO_DEVICE);
+ dev_kfree_skb(skb);
+ np->tx_skbuff[i] = NULL;
+ }
+ }
+
+ return 0;
+}
+
+static void sundance_remove1(struct pci_dev *pdev)
+{
+ struct net_device *dev = pci_get_drvdata(pdev);
+
+ if (dev) {
+ struct netdev_private *np = netdev_priv(dev);
+ unregister_netdev(dev);
+ dma_free_coherent(&pdev->dev, RX_TOTAL_SIZE,
+ np->rx_ring, np->rx_ring_dma);
+ dma_free_coherent(&pdev->dev, TX_TOTAL_SIZE,
+ np->tx_ring, np->tx_ring_dma);
+ pci_iounmap(pdev, np->base);
+ pci_release_regions(pdev);
+ free_netdev(dev);
+ }
+}
+
+static int __maybe_unused sundance_suspend(struct device *dev_d)
+{
+ struct net_device *dev = dev_get_drvdata(dev_d);
+ struct netdev_private *np = netdev_priv(dev);
+ void __iomem *ioaddr = np->base;
+
+ if (!netif_running(dev))
+ return 0;
+
+ netdev_close(dev);
+ netif_device_detach(dev);
+
+ if (np->wol_enabled) {
+ iowrite8(AcceptBroadcast | AcceptMyPhys, ioaddr + RxMode);
+ iowrite16(RxEnable, ioaddr + MACCtrl1);
+ }
+
+ device_set_wakeup_enable(dev_d, np->wol_enabled);
+
+ return 0;
+}
+
+static int __maybe_unused sundance_resume(struct device *dev_d)
+{
+ struct net_device *dev = dev_get_drvdata(dev_d);
+ int err = 0;
+
+ if (!netif_running(dev))
+ return 0;
+
+ err = netdev_open(dev);
+ if (err) {
+ printk(KERN_ERR "%s: Can't resume interface!\n",
+ dev->name);
+ goto out;
+ }
+
+ netif_device_attach(dev);
+
+out:
+ return err;
+}
+
+static SIMPLE_DEV_PM_OPS(sundance_pm_ops, sundance_suspend, sundance_resume);
+
+static struct pci_driver sundance_driver = {
+ .name = DRV_NAME,
+ .id_table = sundance_pci_tbl,
+ .probe = sundance_probe1,
+ .remove = sundance_remove1,
+ .driver.pm = &sundance_pm_ops,
+};
+
+module_pci_driver(sundance_driver);
diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
index 4643a3380618..b1e1ad9e4b48 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
@@ -2736,7 +2736,7 @@ static int dpaa2_switch_setup_dpbp(struct ethsw_core *ethsw)
dev_err(dev, "dpsw_ctrl_if_set_pools() failed\n");
goto err_get_attr;
}
- ethsw->bpid = dpbp_attrs.id;
+ ethsw->bpid = dpbp_attrs.bpid;
return 0;
diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 1383918f8a3f..adf1f2bbcbb1 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -2363,7 +2363,8 @@ static void fec_enet_phy_reset_after_clk_enable(struct net_device *ndev)
*/
phy_dev = of_phy_find_device(fep->phy_node);
phy_reset_after_clk_enable(phy_dev);
- put_device(&phy_dev->mdio.dev);
+ if (phy_dev)
+ put_device(&phy_dev->mdio.dev);
}
}
diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c
index 1f411d7c4373..1be1b1ef31ee 100644
--- a/drivers/net/ethernet/google/gve/gve_main.c
+++ b/drivers/net/ethernet/google/gve/gve_main.c
@@ -2870,6 +2870,8 @@ static void gve_shutdown(struct pci_dev *pdev)
struct gve_priv *priv = netdev_priv(netdev);
bool was_up = netif_running(priv->dev);
+ netif_device_detach(netdev);
+
rtnl_lock();
netdev_lock(netdev);
if (was_up && gve_close(priv->dev)) {
diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c
index 503cfbfb4a8a..83cf75bf7a17 100644
--- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c
+++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c
@@ -53,9 +53,11 @@ static int hbg_reset_prepare(struct hbg_priv *priv, enum hbg_reset_type type)
{
int ret;
- ASSERT_RTNL();
+ if (test_and_set_bit(HBG_NIC_STATE_RESETTING, &priv->state))
+ return -EBUSY;
if (netif_running(priv->netdev)) {
+ clear_bit(HBG_NIC_STATE_RESETTING, &priv->state);
dev_warn(&priv->pdev->dev,
"failed to reset because port is up\n");
return -EBUSY;
@@ -64,7 +66,6 @@ static int hbg_reset_prepare(struct hbg_priv *priv, enum hbg_reset_type type)
netif_device_detach(priv->netdev);
priv->reset_type = type;
- set_bit(HBG_NIC_STATE_RESETTING, &priv->state);
clear_bit(HBG_NIC_STATE_RESET_FAIL, &priv->state);
ret = hbg_hw_event_notify(priv, HBG_HW_EVENT_RESET);
if (ret) {
@@ -84,29 +85,26 @@ static int hbg_reset_done(struct hbg_priv *priv, enum hbg_reset_type type)
type != priv->reset_type)
return 0;
- ASSERT_RTNL();
-
- clear_bit(HBG_NIC_STATE_RESETTING, &priv->state);
ret = hbg_rebuild(priv);
if (ret) {
priv->stats.reset_fail_cnt++;
set_bit(HBG_NIC_STATE_RESET_FAIL, &priv->state);
+ clear_bit(HBG_NIC_STATE_RESETTING, &priv->state);
dev_err(&priv->pdev->dev, "failed to rebuild after reset\n");
return ret;
}
netif_device_attach(priv->netdev);
+ clear_bit(HBG_NIC_STATE_RESETTING, &priv->state);
dev_info(&priv->pdev->dev, "reset done\n");
return ret;
}
-/* must be protected by rtnl lock */
int hbg_reset(struct hbg_priv *priv)
{
int ret;
- ASSERT_RTNL();
ret = hbg_reset_prepare(priv, HBG_RESET_TYPE_FUNCTION);
if (ret)
return ret;
@@ -171,7 +169,6 @@ static void hbg_pci_err_reset_prepare(struct pci_dev *pdev)
struct net_device *netdev = pci_get_drvdata(pdev);
struct hbg_priv *priv = netdev_priv(netdev);
- rtnl_lock();
hbg_reset_prepare(priv, HBG_RESET_TYPE_FLR);
}
@@ -181,7 +178,6 @@ static void hbg_pci_err_reset_done(struct pci_dev *pdev)
struct hbg_priv *priv = netdev_priv(netdev);
hbg_reset_done(priv, HBG_RESET_TYPE_FLR);
- rtnl_unlock();
}
static const struct pci_error_handlers hbg_pci_err_handler = {
diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c
index 8cca8316ba40..d0aa0661ecd4 100644
--- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c
+++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_hw.c
@@ -12,6 +12,8 @@
#define HBG_HW_EVENT_WAIT_TIMEOUT_US (2 * 1000 * 1000)
#define HBG_HW_EVENT_WAIT_INTERVAL_US (10 * 1000)
+#define HBG_MAC_LINK_WAIT_TIMEOUT_US (500 * 1000)
+#define HBG_MAC_LINK_WAIT_INTERVAL_US (5 * 1000)
/* little endian or big endian.
* ctrl means packet description, data means skb packet data
*/
@@ -228,6 +230,9 @@ void hbg_hw_fill_buffer(struct hbg_priv *priv, u32 buffer_dma_addr)
void hbg_hw_adjust_link(struct hbg_priv *priv, u32 speed, u32 duplex)
{
+ u32 link_status;
+ int ret;
+
hbg_hw_mac_enable(priv, HBG_STATUS_DISABLE);
hbg_reg_write_field(priv, HBG_REG_PORT_MODE_ADDR,
@@ -239,8 +244,14 @@ void hbg_hw_adjust_link(struct hbg_priv *priv, u32 speed, u32 duplex)
hbg_hw_mac_enable(priv, HBG_STATUS_ENABLE);
- if (!hbg_reg_read_field(priv, HBG_REG_AN_NEG_STATE_ADDR,
- HBG_REG_AN_NEG_STATE_NP_LINK_OK_B))
+ /* wait MAC link up */
+ ret = readl_poll_timeout(priv->io_base + HBG_REG_AN_NEG_STATE_ADDR,
+ link_status,
+ FIELD_GET(HBG_REG_AN_NEG_STATE_NP_LINK_OK_B,
+ link_status),
+ HBG_MAC_LINK_WAIT_INTERVAL_US,
+ HBG_MAC_LINK_WAIT_TIMEOUT_US);
+ if (ret)
hbg_np_link_fail_task_schedule(priv);
}
diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h b/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h
index 2883a5899ae2..8b6110599e10 100644
--- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h
+++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.h
@@ -29,7 +29,12 @@ static inline bool hbg_fifo_is_full(struct hbg_priv *priv, enum hbg_dir dir)
static inline u32 hbg_get_queue_used_num(struct hbg_ring *ring)
{
- return (ring->ntu + ring->len - ring->ntc) % ring->len;
+ u32 len = READ_ONCE(ring->len);
+
+ if (!len)
+ return 0;
+
+ return (READ_ONCE(ring->ntu) + len - READ_ONCE(ring->ntc)) % len;
}
netdev_tx_t hbg_net_start_xmit(struct sk_buff *skb, struct net_device *netdev);
diff --git a/drivers/net/ethernet/intel/e1000e/ethtool.c b/drivers/net/ethernet/intel/e1000e/ethtool.c
index c0bbb12eed2e..cf01a108a5bb 100644
--- a/drivers/net/ethernet/intel/e1000e/ethtool.c
+++ b/drivers/net/ethernet/intel/e1000e/ethtool.c
@@ -549,12 +549,12 @@ static int e1000_set_eeprom(struct net_device *netdev,
{
struct e1000_adapter *adapter = netdev_priv(netdev);
struct e1000_hw *hw = &adapter->hw;
+ size_t total_len, max_len;
u16 *eeprom_buff;
- void *ptr;
- int max_len;
+ int ret_val = 0;
int first_word;
int last_word;
- int ret_val = 0;
+ void *ptr;
u16 i;
if (eeprom->len == 0)
@@ -569,6 +569,10 @@ static int e1000_set_eeprom(struct net_device *netdev,
max_len = hw->nvm.word_size * 2;
+ if (check_add_overflow(eeprom->offset, eeprom->len, &total_len) ||
+ total_len > max_len)
+ return -EFBIG;
+
first_word = eeprom->offset >> 1;
last_word = (eeprom->offset + eeprom->len - 1) >> 1;
eeprom_buff = kmalloc(max_len, GFP_KERNEL);
diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index 49aa4497efce..801a57a925da 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -1278,7 +1278,8 @@ struct i40e_mac_filter *i40e_add_mac_filter(struct i40e_vsi *vsi,
const u8 *macaddr);
int i40e_del_mac_filter(struct i40e_vsi *vsi, const u8 *macaddr);
bool i40e_is_vsi_in_vlan(struct i40e_vsi *vsi);
-int i40e_count_filters(struct i40e_vsi *vsi);
+int i40e_count_all_filters(struct i40e_vsi *vsi);
+int i40e_count_active_filters(struct i40e_vsi *vsi);
struct i40e_mac_filter *i40e_find_mac(struct i40e_vsi *vsi, const u8 *macaddr);
void i40e_vlan_stripping_enable(struct i40e_vsi *vsi);
static inline bool i40e_is_sw_dcb(struct i40e_pf *pf)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
index 76d872b91a38..cc02a85ad42b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
@@ -1561,6 +1561,7 @@ I40E_CHECK_CMD_LENGTH(i40e_aq_set_phy_config);
struct i40e_aq_set_mac_config {
__le16 max_frame_size;
u8 params;
+#define I40E_AQ_SET_MAC_CONFIG_CRC_EN BIT(2)
u8 tx_timer_priority; /* bitmap */
__le16 tx_timer_value;
__le16 fc_refresh_threshold;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_client.c b/drivers/net/ethernet/intel/i40e/i40e_client.c
index 5f1a405cbbf8..518bc738ea3b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_client.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_client.c
@@ -359,8 +359,8 @@ static void i40e_client_add_instance(struct i40e_pf *pf)
if (i40e_client_get_params(vsi, &cdev->lan_info.params))
goto free_cdev;
- mac = list_first_entry(&cdev->lan_info.netdev->dev_addrs.list,
- struct netdev_hw_addr, list);
+ mac = list_first_entry_or_null(&cdev->lan_info.netdev->dev_addrs.list,
+ struct netdev_hw_addr, list);
if (mac)
ether_addr_copy(cdev->lan_info.lanmac, mac->addr);
else
diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c
index 270e7e8cf9cf..59f5c1e810eb 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -1190,6 +1190,40 @@ int i40e_set_fc(struct i40e_hw *hw, u8 *aq_failures,
}
/**
+ * i40e_aq_set_mac_config - Configure MAC settings
+ * @hw: pointer to the hw struct
+ * @max_frame_size: Maximum Frame Size to be supported by the port
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Set MAC configuration (0x0603). Note that max_frame_size must be greater
+ * than zero.
+ *
+ * Return: 0 on success, or a negative error code on failure.
+ */
+int i40e_aq_set_mac_config(struct i40e_hw *hw, u16 max_frame_size,
+ struct i40e_asq_cmd_details *cmd_details)
+{
+ struct i40e_aq_set_mac_config *cmd;
+ struct libie_aq_desc desc;
+
+ cmd = libie_aq_raw(&desc);
+
+ if (max_frame_size == 0)
+ return -EINVAL;
+
+ i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_set_mac_config);
+
+ cmd->max_frame_size = cpu_to_le16(max_frame_size);
+ cmd->params = I40E_AQ_SET_MAC_CONFIG_CRC_EN;
+
+#define I40E_AQ_SET_MAC_CONFIG_FC_DEFAULT_THRESHOLD 0x7FFF
+ cmd->fc_refresh_threshold =
+ cpu_to_le16(I40E_AQ_SET_MAC_CONFIG_FC_DEFAULT_THRESHOLD);
+
+ return i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+}
+
+/**
* i40e_aq_clear_pxe_mode
* @hw: pointer to the hw struct
* @cmd_details: pointer to command details structure or NULL
diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
index 6cd6f23d42a6..c17b5d290f0a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
@@ -40,48 +40,6 @@ static struct i40e_vsi *i40e_dbg_find_vsi(struct i40e_pf *pf, int seid)
* setup, adding or removing filters, or other things. Many of
* these will be useful for some forms of unit testing.
**************************************************************/
-static char i40e_dbg_command_buf[256] = "";
-
-/**
- * i40e_dbg_command_read - read for command datum
- * @filp: the opened file
- * @buffer: where to write the data for the user to read
- * @count: the size of the user's buffer
- * @ppos: file position offset
- **/
-static ssize_t i40e_dbg_command_read(struct file *filp, char __user *buffer,
- size_t count, loff_t *ppos)
-{
- struct i40e_pf *pf = filp->private_data;
- struct i40e_vsi *main_vsi;
- int bytes_not_copied;
- int buf_size = 256;
- char *buf;
- int len;
-
- /* don't allow partial reads */
- if (*ppos != 0)
- return 0;
- if (count < buf_size)
- return -ENOSPC;
-
- buf = kzalloc(buf_size, GFP_KERNEL);
- if (!buf)
- return -ENOSPC;
-
- main_vsi = i40e_pf_get_main_vsi(pf);
- len = snprintf(buf, buf_size, "%s: %s\n", main_vsi->netdev->name,
- i40e_dbg_command_buf);
-
- bytes_not_copied = copy_to_user(buffer, buf, len);
- kfree(buf);
-
- if (bytes_not_copied)
- return -EFAULT;
-
- *ppos = len;
- return len;
-}
static char *i40e_filter_state_string[] = {
"INVALID",
@@ -1621,7 +1579,6 @@ command_write_done:
static const struct file_operations i40e_dbg_command_fops = {
.owner = THIS_MODULE,
.open = simple_open,
- .read = i40e_dbg_command_read,
.write = i40e_dbg_command_write,
};
@@ -1630,48 +1587,6 @@ static const struct file_operations i40e_dbg_command_fops = {
* The netdev_ops entry in debugfs is for giving the driver commands
* to be executed from the netdev operations.
**************************************************************/
-static char i40e_dbg_netdev_ops_buf[256] = "";
-
-/**
- * i40e_dbg_netdev_ops_read - read for netdev_ops datum
- * @filp: the opened file
- * @buffer: where to write the data for the user to read
- * @count: the size of the user's buffer
- * @ppos: file position offset
- **/
-static ssize_t i40e_dbg_netdev_ops_read(struct file *filp, char __user *buffer,
- size_t count, loff_t *ppos)
-{
- struct i40e_pf *pf = filp->private_data;
- struct i40e_vsi *main_vsi;
- int bytes_not_copied;
- int buf_size = 256;
- char *buf;
- int len;
-
- /* don't allow partal reads */
- if (*ppos != 0)
- return 0;
- if (count < buf_size)
- return -ENOSPC;
-
- buf = kzalloc(buf_size, GFP_KERNEL);
- if (!buf)
- return -ENOSPC;
-
- main_vsi = i40e_pf_get_main_vsi(pf);
- len = snprintf(buf, buf_size, "%s: %s\n", main_vsi->netdev->name,
- i40e_dbg_netdev_ops_buf);
-
- bytes_not_copied = copy_to_user(buffer, buf, len);
- kfree(buf);
-
- if (bytes_not_copied)
- return -EFAULT;
-
- *ppos = len;
- return len;
-}
/**
* i40e_dbg_netdev_ops_write - write into netdev_ops datum
@@ -1685,35 +1600,36 @@ static ssize_t i40e_dbg_netdev_ops_write(struct file *filp,
size_t count, loff_t *ppos)
{
struct i40e_pf *pf = filp->private_data;
+ char *cmd_buf, *buf_tmp;
int bytes_not_copied;
struct i40e_vsi *vsi;
- char *buf_tmp;
int vsi_seid;
int i, cnt;
/* don't allow partial writes */
if (*ppos != 0)
return 0;
- if (count >= sizeof(i40e_dbg_netdev_ops_buf))
- return -ENOSPC;
- memset(i40e_dbg_netdev_ops_buf, 0, sizeof(i40e_dbg_netdev_ops_buf));
- bytes_not_copied = copy_from_user(i40e_dbg_netdev_ops_buf,
- buffer, count);
- if (bytes_not_copied)
+ cmd_buf = kzalloc(count + 1, GFP_KERNEL);
+ if (!cmd_buf)
+ return count;
+ bytes_not_copied = copy_from_user(cmd_buf, buffer, count);
+ if (bytes_not_copied) {
+ kfree(cmd_buf);
return -EFAULT;
- i40e_dbg_netdev_ops_buf[count] = '\0';
+ }
+ cmd_buf[count] = '\0';
- buf_tmp = strchr(i40e_dbg_netdev_ops_buf, '\n');
+ buf_tmp = strchr(cmd_buf, '\n');
if (buf_tmp) {
*buf_tmp = '\0';
- count = buf_tmp - i40e_dbg_netdev_ops_buf + 1;
+ count = buf_tmp - cmd_buf + 1;
}
- if (strncmp(i40e_dbg_netdev_ops_buf, "change_mtu", 10) == 0) {
+ if (strncmp(cmd_buf, "change_mtu", 10) == 0) {
int mtu;
- cnt = sscanf(&i40e_dbg_netdev_ops_buf[11], "%i %i",
+ cnt = sscanf(&cmd_buf[11], "%i %i",
&vsi_seid, &mtu);
if (cnt != 2) {
dev_info(&pf->pdev->dev, "change_mtu <vsi_seid> <mtu>\n");
@@ -1735,8 +1651,8 @@ static ssize_t i40e_dbg_netdev_ops_write(struct file *filp,
dev_info(&pf->pdev->dev, "Could not acquire RTNL - please try again\n");
}
- } else if (strncmp(i40e_dbg_netdev_ops_buf, "set_rx_mode", 11) == 0) {
- cnt = sscanf(&i40e_dbg_netdev_ops_buf[11], "%i", &vsi_seid);
+ } else if (strncmp(cmd_buf, "set_rx_mode", 11) == 0) {
+ cnt = sscanf(&cmd_buf[11], "%i", &vsi_seid);
if (cnt != 1) {
dev_info(&pf->pdev->dev, "set_rx_mode <vsi_seid>\n");
goto netdev_ops_write_done;
@@ -1756,8 +1672,8 @@ static ssize_t i40e_dbg_netdev_ops_write(struct file *filp,
dev_info(&pf->pdev->dev, "Could not acquire RTNL - please try again\n");
}
- } else if (strncmp(i40e_dbg_netdev_ops_buf, "napi", 4) == 0) {
- cnt = sscanf(&i40e_dbg_netdev_ops_buf[4], "%i", &vsi_seid);
+ } else if (strncmp(cmd_buf, "napi", 4) == 0) {
+ cnt = sscanf(&cmd_buf[4], "%i", &vsi_seid);
if (cnt != 1) {
dev_info(&pf->pdev->dev, "napi <vsi_seid>\n");
goto netdev_ops_write_done;
@@ -1775,21 +1691,20 @@ static ssize_t i40e_dbg_netdev_ops_write(struct file *filp,
dev_info(&pf->pdev->dev, "napi called\n");
}
} else {
- dev_info(&pf->pdev->dev, "unknown command '%s'\n",
- i40e_dbg_netdev_ops_buf);
+ dev_info(&pf->pdev->dev, "unknown command '%s'\n", cmd_buf);
dev_info(&pf->pdev->dev, "available commands\n");
dev_info(&pf->pdev->dev, " change_mtu <vsi_seid> <mtu>\n");
dev_info(&pf->pdev->dev, " set_rx_mode <vsi_seid>\n");
dev_info(&pf->pdev->dev, " napi <vsi_seid>\n");
}
netdev_ops_write_done:
+ kfree(cmd_buf);
return count;
}
static const struct file_operations i40e_dbg_netdev_ops_fops = {
.owner = THIS_MODULE,
.open = simple_open,
- .read = i40e_dbg_netdev_ops_read,
.write = i40e_dbg_netdev_ops_write,
};
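The rewritten handler copies each write into a freshly allocated, NUL-terminated buffer instead of a shared static array, which removes the size cap and the cross-writer races. The kernel also has a helper that folds the allocate/copy/terminate steps into one call; a hedged sketch of that alternative shape (demo_write is illustrative, not the driver's handler):

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/uaccess.h>

static ssize_t demo_write(struct file *filp, const char __user *buffer,
                          size_t count, loff_t *ppos)
{
        char *cmd_buf, *nl;

        /* memdup_user_nul() allocates count + 1 bytes, copies the user data
         * and NUL-terminates it, returning an ERR_PTR() on failure.
         */
        cmd_buf = memdup_user_nul(buffer, count);
        if (IS_ERR(cmd_buf))
                return PTR_ERR(cmd_buf);

        nl = strchr(cmd_buf, '\n');
        if (nl)
                *nl = '\0';

        /* ... parse cmd_buf and act on the command here ... */

        kfree(cmd_buf);
        return count;
}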
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index b83f823e4917..529d5501baac 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -1243,12 +1243,30 @@ void i40e_update_stats(struct i40e_vsi *vsi)
}
/**
- * i40e_count_filters - counts VSI mac filters
+ * i40e_count_all_filters - counts VSI MAC filters
* @vsi: the VSI to be searched
*
- * Returns count of mac filters
- **/
-int i40e_count_filters(struct i40e_vsi *vsi)
+ * Return: count of MAC filters in any state.
+ */
+int i40e_count_all_filters(struct i40e_vsi *vsi)
+{
+ struct i40e_mac_filter *f;
+ struct hlist_node *h;
+ int bkt, cnt = 0;
+
+ hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist)
+ cnt++;
+
+ return cnt;
+}
+
+/**
+ * i40e_count_active_filters - counts active VSI MAC filters
+ * @vsi: the VSI to be searched
+ *
+ * Return: count of active MAC filters.
+ */
+int i40e_count_active_filters(struct i40e_vsi *vsi)
{
struct i40e_mac_filter *f;
struct hlist_node *h;
@@ -4156,7 +4174,7 @@ free_queue_irqs:
irq_num = pf->msix_entries[base + vector].vector;
irq_set_affinity_notifier(irq_num, NULL);
irq_update_affinity_hint(irq_num, NULL);
- free_irq(irq_num, &vsi->q_vectors[vector]);
+ free_irq(irq_num, vsi->q_vectors[vector]);
}
return err;
}
@@ -16045,13 +16063,17 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
dev_dbg(&pf->pdev->dev, "get supported phy types ret = %pe last_status = %s\n",
ERR_PTR(err), libie_aq_str(pf->hw.aq.asq_last_status));
- /* make sure the MFS hasn't been set lower than the default */
#define MAX_FRAME_SIZE_DEFAULT 0x2600
- val = FIELD_GET(I40E_PRTGL_SAH_MFS_MASK,
- rd32(&pf->hw, I40E_PRTGL_SAH));
- if (val < MAX_FRAME_SIZE_DEFAULT)
- dev_warn(&pdev->dev, "MFS for port %x (%d) has been set below the default (%d)\n",
- pf->hw.port, val, MAX_FRAME_SIZE_DEFAULT);
+
+ err = i40e_aq_set_mac_config(hw, MAX_FRAME_SIZE_DEFAULT, NULL);
+ if (err)
+ dev_warn(&pdev->dev, "set mac config ret = %pe last_status = %s\n",
+ ERR_PTR(err), libie_aq_str(pf->hw.aq.asq_last_status));
+
+ /* Make sure the MFS is set to the expected value */
+ val = rd32(hw, I40E_PRTGL_SAH);
+ FIELD_MODIFY(I40E_PRTGL_SAH_MFS_MASK, &val, MAX_FRAME_SIZE_DEFAULT);
+ wr32(hw, I40E_PRTGL_SAH, val);
/* Add a filter to drop all Flow control frames from any VSI from being
* transmitted. By doing so we stop a malicious VF from sending out
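Instead of only warning when the port's MFS field is below the default, the probe path now forces the field to MAX_FRAME_SIZE_DEFAULT with a read-modify-write. FIELD_MODIFY(mask, &val, new) clears the masked field and inserts the new value; the open-coded equivalent looks roughly like this (the mask below is illustrative, not the real I40E_PRTGL_SAH_MFS_MASK):

#include <stdint.h>

#define DEMO_MFS_MASK  0xFFFF0000u   /* illustrative field mask */
#define DEMO_MFS_SHIFT 16            /* position of the mask's lowest bit */

/* Equivalent of FIELD_MODIFY(DEMO_MFS_MASK, &reg, mfs): clear the field,
 * then OR in the new value shifted into place and truncated to the field.
 */
static uint32_t demo_set_mfs(uint32_t reg, uint32_t mfs)
{
        reg &= ~DEMO_MFS_MASK;
        reg |= (mfs << DEMO_MFS_SHIFT) & DEMO_MFS_MASK;
        return reg;
}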
diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h b/drivers/net/ethernet/intel/i40e/i40e_prototype.h
index aef5de53ce3b..26bb7bffe361 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_prototype.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h
@@ -98,6 +98,8 @@ int i40e_aq_set_mac_loopback(struct i40e_hw *hw,
struct i40e_asq_cmd_details *cmd_details);
int i40e_aq_set_phy_int_mask(struct i40e_hw *hw, u16 mask,
struct i40e_asq_cmd_details *cmd_details);
+int i40e_aq_set_mac_config(struct i40e_hw *hw, u16 max_frame_size,
+ struct i40e_asq_cmd_details *cmd_details);
int i40e_aq_clear_pxe_mode(struct i40e_hw *hw,
struct i40e_asq_cmd_details *cmd_details);
int i40e_aq_set_link_restart_an(struct i40e_hw *hw,
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 048c33039130..b194eae03208 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -948,9 +948,6 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
if (!eop_desc)
break;
- /* prevent any other reads prior to eop_desc */
- smp_rmb();
-
i40e_trace(clean_tx_irq, tx_ring, tx_desc, tx_buf);
/* we have caught up to head, no work left to do */
if (tx_head == tx_desc)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index 9b8efdeafbcf..081a4526a2f0 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -448,7 +448,7 @@ static void i40e_config_irq_link_list(struct i40e_vf *vf, u16 vsi_id,
(qtype << I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT) |
(pf_queue_id << I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT) |
BIT(I40E_QINT_RQCTL_CAUSE_ENA_SHIFT) |
- (itr_idx << I40E_QINT_RQCTL_ITR_INDX_SHIFT);
+ FIELD_PREP(I40E_QINT_RQCTL_ITR_INDX_MASK, itr_idx);
wr32(hw, reg_idx, reg);
}
@@ -653,6 +653,13 @@ static int i40e_config_vsi_tx_queue(struct i40e_vf *vf, u16 vsi_id,
/* only set the required fields */
tx_ctx.base = info->dma_ring_addr / 128;
+
+ /* ring_len has to be multiple of 8 */
+ if (!IS_ALIGNED(info->ring_len, 8) ||
+ info->ring_len > I40E_MAX_NUM_DESCRIPTORS_XL710) {
+ ret = -EINVAL;
+ goto error_context;
+ }
tx_ctx.qlen = info->ring_len;
tx_ctx.rdylist = le16_to_cpu(vsi->info.qs_handle[0]);
tx_ctx.rdylist_act = 0;
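The VF-supplied ring_len is now validated before it is written into the queue context: it must be a multiple of the hardware's descriptor granularity (8 for Tx, 32 for Rx) and no larger than the device maximum. The same check in a standalone form (the limit constant is a stand-in for I40E_MAX_NUM_DESCRIPTORS_XL710):

#include <stdbool.h>
#include <stdint.h>

#define DEMO_MAX_DESCRIPTORS 8160u   /* stand-in for the device limit */

/* IS_ALIGNED(len, align) reduces to (len & (align - 1)) == 0 for the
 * power-of-two alignments used here (8 for Tx rings, 32 for Rx rings).
 */
static bool demo_ring_len_valid(uint32_t ring_len, uint32_t align)
{
        if (ring_len > DEMO_MAX_DESCRIPTORS)
                return false;

        return (ring_len & (align - 1)) == 0;
}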
@@ -716,6 +723,13 @@ static int i40e_config_vsi_rx_queue(struct i40e_vf *vf, u16 vsi_id,
/* only set the required fields */
rx_ctx.base = info->dma_ring_addr / 128;
+
+ /* ring_len has to be multiple of 32 */
+ if (!IS_ALIGNED(info->ring_len, 32) ||
+ info->ring_len > I40E_MAX_NUM_DESCRIPTORS_XL710) {
+ ret = -EINVAL;
+ goto error_param;
+ }
rx_ctx.qlen = info->ring_len;
if (info->splithdr_enabled) {
@@ -1450,6 +1464,7 @@ static void i40e_trigger_vf_reset(struct i40e_vf *vf, bool flr)
* functions that may still be running at this point.
*/
clear_bit(I40E_VF_STATE_INIT, &vf->vf_states);
+ clear_bit(I40E_VF_STATE_RESOURCES_LOADED, &vf->vf_states);
/* In the case of a VFLR, the HW has already reset the VF and we
* just need to clean up, so don't hit the VFRTRIG register.
@@ -2116,7 +2131,10 @@ static int i40e_vc_get_vf_resources_msg(struct i40e_vf *vf, u8 *msg)
size_t len = 0;
int ret;
- if (!i40e_sync_vf_state(vf, I40E_VF_STATE_INIT)) {
+ i40e_sync_vf_state(vf, I40E_VF_STATE_INIT);
+
+ if (!test_bit(I40E_VF_STATE_INIT, &vf->vf_states) ||
+ test_bit(I40E_VF_STATE_RESOURCES_LOADED, &vf->vf_states)) {
aq_ret = -EINVAL;
goto err;
}
@@ -2219,6 +2237,7 @@ static int i40e_vc_get_vf_resources_msg(struct i40e_vf *vf, u8 *msg)
vf->default_lan_addr.addr);
}
set_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states);
+ set_bit(I40E_VF_STATE_RESOURCES_LOADED, &vf->vf_states);
err:
/* send the response back to the VF */
@@ -2381,7 +2400,7 @@ static int i40e_vc_config_queues_msg(struct i40e_vf *vf, u8 *msg)
}
if (vf->adq_enabled) {
- if (idx >= ARRAY_SIZE(vf->ch)) {
+ if (idx >= vf->num_tc) {
aq_ret = -ENODEV;
goto error_param;
}
@@ -2402,7 +2421,7 @@ static int i40e_vc_config_queues_msg(struct i40e_vf *vf, u8 *msg)
* to its appropriate VSIs based on TC mapping
*/
if (vf->adq_enabled) {
- if (idx >= ARRAY_SIZE(vf->ch)) {
+ if (idx >= vf->num_tc) {
aq_ret = -ENODEV;
goto error_param;
}
@@ -2452,8 +2471,10 @@ static int i40e_validate_queue_map(struct i40e_vf *vf, u16 vsi_id,
u16 vsi_queue_id, queue_id;
for_each_set_bit(vsi_queue_id, &queuemap, I40E_MAX_VSI_QP) {
- if (vf->adq_enabled) {
- vsi_id = vf->ch[vsi_queue_id / I40E_MAX_VF_VSI].vsi_id;
+ u16 idx = vsi_queue_id / I40E_MAX_VF_VSI;
+
+ if (vf->adq_enabled && idx < vf->num_tc) {
+ vsi_id = vf->ch[idx].vsi_id;
queue_id = (vsi_queue_id % I40E_DEFAULT_QUEUES_PER_VF);
} else {
queue_id = vsi_queue_id;
@@ -2841,24 +2862,6 @@ error_param:
(u8 *)&stats, sizeof(stats));
}
-/**
- * i40e_can_vf_change_mac
- * @vf: pointer to the VF info
- *
- * Return true if the VF is allowed to change its MAC filters, false otherwise
- */
-static bool i40e_can_vf_change_mac(struct i40e_vf *vf)
-{
- /* If the VF MAC address has been set administratively (via the
- * ndo_set_vf_mac command), then deny permission to the VF to
- * add/delete unicast MAC addresses, unless the VF is trusted
- */
- if (vf->pf_set_mac && !vf->trusted)
- return false;
-
- return true;
-}
-
#define I40E_MAX_MACVLAN_PER_HW 3072
#define I40E_MAX_MACVLAN_PER_PF(num_ports) (I40E_MAX_MACVLAN_PER_HW / \
(num_ports))
@@ -2897,8 +2900,10 @@ static inline int i40e_check_vf_permission(struct i40e_vf *vf,
struct i40e_pf *pf = vf->pf;
struct i40e_vsi *vsi = pf->vsi[vf->lan_vsi_idx];
struct i40e_hw *hw = &pf->hw;
- int mac2add_cnt = 0;
- int i;
+ int i, mac_add_max, mac_add_cnt = 0;
+ bool vf_trusted;
+
+ vf_trusted = test_bit(I40E_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps);
for (i = 0; i < al->num_elements; i++) {
struct i40e_mac_filter *f;
@@ -2918,9 +2923,8 @@ static inline int i40e_check_vf_permission(struct i40e_vf *vf,
* The VF may request to set the MAC address filter already
* assigned to it so do not return an error in that case.
*/
- if (!i40e_can_vf_change_mac(vf) &&
- !is_multicast_ether_addr(addr) &&
- !ether_addr_equal(addr, vf->default_lan_addr.addr)) {
+ if (!vf_trusted && !is_multicast_ether_addr(addr) &&
+ vf->pf_set_mac && !ether_addr_equal(addr, vf->default_lan_addr.addr)) {
dev_err(&pf->pdev->dev,
"VF attempting to override administratively set MAC address, bring down and up the VF interface to resume normal operation\n");
return -EPERM;
@@ -2929,29 +2933,33 @@ static inline int i40e_check_vf_permission(struct i40e_vf *vf,
/*count filters that really will be added*/
f = i40e_find_mac(vsi, addr);
if (!f)
- ++mac2add_cnt;
+ ++mac_add_cnt;
}
/* If this VF is not privileged, then we can't add more than a limited
- * number of addresses. Check to make sure that the additions do not
- * push us over the limit.
- */
- if (!test_bit(I40E_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps)) {
- if ((i40e_count_filters(vsi) + mac2add_cnt) >
- I40E_VC_MAX_MAC_ADDR_PER_VF) {
- dev_err(&pf->pdev->dev,
- "Cannot add more MAC addresses, VF is not trusted, switch the VF to trusted to add more functionality\n");
- return -EPERM;
- }
- /* If this VF is trusted, it can use more resources than untrusted.
+ * number of addresses.
+ *
+ * If this VF is trusted, it can use more resources than untrusted.
* However to ensure that every trusted VF has appropriate number of
* resources, divide whole pool of resources per port and then across
* all VFs.
*/
- } else {
- if ((i40e_count_filters(vsi) + mac2add_cnt) >
- I40E_VC_MAX_MACVLAN_PER_TRUSTED_VF(pf->num_alloc_vfs,
- hw->num_ports)) {
+ if (!vf_trusted)
+ mac_add_max = I40E_VC_MAX_MAC_ADDR_PER_VF;
+ else
+ mac_add_max = I40E_VC_MAX_MACVLAN_PER_TRUSTED_VF(pf->num_alloc_vfs, hw->num_ports);
+
+ /* A VF can replace all of its filters in one step: in that case up to
+ * mac_add_max filters will be active while another mac_add_max are
+ * still in a to-be-removed state. Account for that.
+ */
+ if ((i40e_count_active_filters(vsi) + mac_add_cnt) > mac_add_max ||
+ (i40e_count_all_filters(vsi) + mac_add_cnt) > 2 * mac_add_max) {
+ if (!vf_trusted) {
+ dev_err(&pf->pdev->dev,
+ "Cannot add more MAC addresses, VF is not trusted, switch the VF to trusted to add more functionality\n");
+ return -EPERM;
+ } else {
dev_err(&pf->pdev->dev,
"Cannot add more MAC addresses, trusted VF exhausted it's resources\n");
return -EPERM;
@@ -3587,7 +3595,7 @@ static int i40e_validate_cloud_filter(struct i40e_vf *vf,
/* action_meta is TC number here to which the filter is applied */
if (!tc_filter->action_meta ||
- tc_filter->action_meta > vf->num_tc) {
+ tc_filter->action_meta >= vf->num_tc) {
dev_info(&pf->pdev->dev, "VF %d: Invalid TC number %u\n",
vf->vf_id, tc_filter->action_meta);
goto err;
@@ -3884,6 +3892,8 @@ err:
aq_ret);
}
+#define I40E_MAX_VF_CLOUD_FILTER 0xFF00
+
/**
* i40e_vc_add_cloud_filter
* @vf: pointer to the VF info
@@ -3923,6 +3933,14 @@ static int i40e_vc_add_cloud_filter(struct i40e_vf *vf, u8 *msg)
goto err_out;
}
+ if (vf->num_cloud_filters >= I40E_MAX_VF_CLOUD_FILTER) {
+ dev_warn(&pf->pdev->dev,
+ "VF %d: Max number of filters reached, can't apply cloud filter\n",
+ vf->vf_id);
+ aq_ret = -ENOSPC;
+ goto err_out;
+ }
+
cfilter = kzalloc(sizeof(*cfilter), GFP_KERNEL);
if (!cfilter) {
aq_ret = -ENOMEM;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h
index 5cf74f16f433..f558b45725c8 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h
@@ -41,7 +41,8 @@ enum i40e_vf_states {
I40E_VF_STATE_MC_PROMISC,
I40E_VF_STATE_UC_PROMISC,
I40E_VF_STATE_PRE_ENABLE,
- I40E_VF_STATE_RESETTING
+ I40E_VF_STATE_RESETTING,
+ I40E_VF_STATE_RESOURCES_LOADED,
};
/* VF capabilities */
diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
index 2098f00b3cd3..8a8a01a4bb40 100644
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -510,6 +510,7 @@ enum ice_pf_flags {
ICE_FLAG_LINK_LENIENT_MODE_ENA,
ICE_FLAG_PLUG_AUX_DEV,
ICE_FLAG_UNPLUG_AUX_DEV,
+ ICE_FLAG_AUX_DEV_CREATED,
ICE_FLAG_MTU_CHANGED,
ICE_FLAG_GNSS, /* GNSS successfully initialized */
ICE_FLAG_DPLL, /* SyncE/PTP dplls initialized */
diff --git a/drivers/net/ethernet/intel/ice/ice_adapter.c b/drivers/net/ethernet/intel/ice/ice_adapter.c
index 9e4adc43e474..b53561c34708 100644
--- a/drivers/net/ethernet/intel/ice/ice_adapter.c
+++ b/drivers/net/ethernet/intel/ice/ice_adapter.c
@@ -13,16 +13,45 @@
static DEFINE_XARRAY(ice_adapters);
static DEFINE_MUTEX(ice_adapters_mutex);
-static unsigned long ice_adapter_index(u64 dsn)
+#define ICE_ADAPTER_FIXED_INDEX BIT_ULL(63)
+
+#define ICE_ADAPTER_INDEX_E825C \
+ (ICE_DEV_ID_E825C_BACKPLANE | ICE_ADAPTER_FIXED_INDEX)
+
+static u64 ice_adapter_index(struct pci_dev *pdev)
{
+ switch (pdev->device) {
+ case ICE_DEV_ID_E825C_BACKPLANE:
+ case ICE_DEV_ID_E825C_QSFP:
+ case ICE_DEV_ID_E825C_SFP:
+ case ICE_DEV_ID_E825C_SGMII:
+ /* E825C devices have multiple NACs which are connected to the
+ * same clock source, and which must share the same
+ * ice_adapter structure. We can't use the serial number since
+ * each NAC has its own NVM generated with its own unique
+ * Device Serial Number. Instead, rely on the embedded nature
+ * of the E825C devices, and use a fixed index. This relies on
+ * the fact that all E825C physical functions in a given
+ * system are part of the same overall device.
+ */
+ return ICE_ADAPTER_INDEX_E825C;
+ default:
+ return pci_get_dsn(pdev) & ~ICE_ADAPTER_FIXED_INDEX;
+ }
+}
+
+static unsigned long ice_adapter_xa_index(struct pci_dev *pdev)
+{
+ u64 index = ice_adapter_index(pdev);
+
#if BITS_PER_LONG == 64
- return dsn;
+ return index;
#else
- return (u32)dsn ^ (u32)(dsn >> 32);
+ return (u32)index ^ (u32)(index >> 32);
#endif
}
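xarray indices are unsigned long, so on 32-bit kernels the 64-bit adapter index has to be folded before it can be used as a key; the XOR of the two halves keeps the distribution while the later 64-bit comparison in ice_adapter_get() still catches collisions. The fold in portable form:

#include <stdint.h>

/* Fold a 64-bit key into an unsigned long. On 64-bit builds this is the
 * identity; on 32-bit builds the halves are XORed together, which is not
 * collision free, hence the separate full-width check on lookup.
 */
static unsigned long demo_fold_index(uint64_t index)
{
        if (sizeof(unsigned long) == 8)
                return (unsigned long)index;

        return (unsigned long)((uint32_t)index ^ (uint32_t)(index >> 32));
}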
-static struct ice_adapter *ice_adapter_new(u64 dsn)
+static struct ice_adapter *ice_adapter_new(struct pci_dev *pdev)
{
struct ice_adapter *adapter;
@@ -30,7 +59,7 @@ static struct ice_adapter *ice_adapter_new(u64 dsn)
if (!adapter)
return NULL;
- adapter->device_serial_number = dsn;
+ adapter->index = ice_adapter_index(pdev);
spin_lock_init(&adapter->ptp_gltsyn_time_lock);
spin_lock_init(&adapter->txq_ctx_lock);
refcount_set(&adapter->refcount, 1);
@@ -64,24 +93,23 @@ static void ice_adapter_free(struct ice_adapter *adapter)
*/
struct ice_adapter *ice_adapter_get(struct pci_dev *pdev)
{
- u64 dsn = pci_get_dsn(pdev);
struct ice_adapter *adapter;
unsigned long index;
int err;
- index = ice_adapter_index(dsn);
+ index = ice_adapter_xa_index(pdev);
scoped_guard(mutex, &ice_adapters_mutex) {
err = xa_insert(&ice_adapters, index, NULL, GFP_KERNEL);
if (err == -EBUSY) {
adapter = xa_load(&ice_adapters, index);
refcount_inc(&adapter->refcount);
- WARN_ON_ONCE(adapter->device_serial_number != dsn);
+ WARN_ON_ONCE(adapter->index != ice_adapter_index(pdev));
return adapter;
}
if (err)
return ERR_PTR(err);
- adapter = ice_adapter_new(dsn);
+ adapter = ice_adapter_new(pdev);
if (!adapter)
return ERR_PTR(-ENOMEM);
xa_store(&ice_adapters, index, adapter, GFP_KERNEL);
@@ -100,11 +128,10 @@ struct ice_adapter *ice_adapter_get(struct pci_dev *pdev)
*/
void ice_adapter_put(struct pci_dev *pdev)
{
- u64 dsn = pci_get_dsn(pdev);
struct ice_adapter *adapter;
unsigned long index;
- index = ice_adapter_index(dsn);
+ index = ice_adapter_xa_index(pdev);
scoped_guard(mutex, &ice_adapters_mutex) {
adapter = xa_load(&ice_adapters, index);
if (WARN_ON(!adapter))
diff --git a/drivers/net/ethernet/intel/ice/ice_adapter.h b/drivers/net/ethernet/intel/ice/ice_adapter.h
index db66d03c9f96..e95266c7f20b 100644
--- a/drivers/net/ethernet/intel/ice/ice_adapter.h
+++ b/drivers/net/ethernet/intel/ice/ice_adapter.h
@@ -33,7 +33,7 @@ struct ice_port_list {
* @txq_ctx_lock: Spinlock protecting access to the GLCOMM_QTX_CNTX_CTL register
* @ctrl_pf: Control PF of the adapter
* @ports: Ports list
- * @device_serial_number: DSN cached for collision detection on 32bit systems
+ * @index: 64-bit index cached for collision detection on 32bit systems
*/
struct ice_adapter {
refcount_t refcount;
@@ -44,7 +44,7 @@ struct ice_adapter {
struct ice_pf *ctrl_pf;
struct ice_port_list ports;
- u64 device_serial_number;
+ u64 index;
};
struct ice_adapter *ice_adapter_get(struct pci_dev *pdev);
diff --git a/drivers/net/ethernet/intel/ice/ice_ddp.c b/drivers/net/ethernet/intel/ice/ice_ddp.c
index e2a036ce76ca..3b2d9c436979 100644
--- a/drivers/net/ethernet/intel/ice/ice_ddp.c
+++ b/drivers/net/ethernet/intel/ice/ice_ddp.c
@@ -2377,7 +2377,13 @@ ice_get_set_tx_topo(struct ice_hw *hw, u8 *buf, u16 buf_size,
* The function will apply the new Tx topology from the package buffer
* if available.
*
- * Return: zero when update was successful, negative values otherwise.
+ * Return:
+ * * 0 - Successfully applied topology configuration.
+ * * -EBUSY - Failed to acquire global configuration lock.
+ * * -EEXIST - Topology configuration has already been applied.
+ * * -EIO - Unable to apply topology configuration.
+ * * -ENODEV - Failed to re-initialize device after applying configuration.
+ * * Other negative error codes indicate unexpected failures.
*/
int ice_cfg_tx_topo(struct ice_hw *hw, const void *buf, u32 len)
{
@@ -2410,7 +2416,7 @@ int ice_cfg_tx_topo(struct ice_hw *hw, const void *buf, u32 len)
if (status) {
ice_debug(hw, ICE_DBG_INIT, "Get current topology is failed\n");
- return status;
+ return -EIO;
}
/* Is default topology already applied ? */
@@ -2497,31 +2503,45 @@ update_topo:
ICE_GLOBAL_CFG_LOCK_TIMEOUT);
if (status) {
ice_debug(hw, ICE_DBG_INIT, "Failed to acquire global lock\n");
- return status;
+ return -EBUSY;
}
/* Check if reset was triggered already. */
reg = rd32(hw, GLGEN_RSTAT);
if (reg & GLGEN_RSTAT_DEVSTATE_M) {
- /* Reset is in progress, re-init the HW again */
ice_debug(hw, ICE_DBG_INIT, "Reset is in progress. Layer topology might be applied already\n");
ice_check_reset(hw);
- return 0;
+ /* Reset is in progress, re-init the HW again */
+ goto reinit_hw;
}
/* Set new topology */
status = ice_get_set_tx_topo(hw, new_topo, size, NULL, NULL, true);
if (status) {
- ice_debug(hw, ICE_DBG_INIT, "Failed setting Tx topology\n");
- return status;
+ ice_debug(hw, ICE_DBG_INIT, "Failed to set Tx topology, status %pe\n",
+ ERR_PTR(status));
+ /* Only report -EIO here; the caller checks the error value and prints
+ * an informational message saying the driver failed to program the
+ * Tx topology.
+ */
+ status = -EIO;
}
- /* New topology is updated, delay 1 second before issuing the CORER */
+ /* Even if the Tx topology config failed, a CORER is needed here to
+ * clear the global configuration lock. Delay 1 second to allow the
+ * hardware to settle, then issue the CORER.
+ */
msleep(1000);
ice_reset(hw, ICE_RESET_CORER);
- /* CORER will clear the global lock, so no explicit call
- * required for release.
- */
+ ice_check_reset(hw);
+
+reinit_hw:
+ /* Since we triggered a CORER, re-initialize hardware */
+ ice_deinit_hw(hw);
+ if (ice_init_hw(hw)) {
+ ice_debug(hw, ICE_DBG_INIT, "Failed to re-init hardware after setting Tx topology\n");
+ return -ENODEV;
+ }
- return 0;
+ return status;
}
diff --git a/drivers/net/ethernet/intel/ice/ice_idc.c b/drivers/net/ethernet/intel/ice/ice_idc.c
index 6ab53e430f91..420d45c2558b 100644
--- a/drivers/net/ethernet/intel/ice/ice_idc.c
+++ b/drivers/net/ethernet/intel/ice/ice_idc.c
@@ -336,6 +336,7 @@ int ice_plug_aux_dev(struct ice_pf *pf)
mutex_lock(&pf->adev_mutex);
cdev->adev = adev;
mutex_unlock(&pf->adev_mutex);
+ set_bit(ICE_FLAG_AUX_DEV_CREATED, pf->flags);
return 0;
}
@@ -347,15 +348,16 @@ void ice_unplug_aux_dev(struct ice_pf *pf)
{
struct auxiliary_device *adev;
+ if (!test_and_clear_bit(ICE_FLAG_AUX_DEV_CREATED, pf->flags))
+ return;
+
mutex_lock(&pf->adev_mutex);
adev = pf->cdev_info->adev;
pf->cdev_info->adev = NULL;
mutex_unlock(&pf->adev_mutex);
- if (adev) {
- auxiliary_device_delete(adev);
- auxiliary_device_uninit(adev);
- }
+ auxiliary_device_delete(adev);
+ auxiliary_device_uninit(adev);
}
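Guarding the unplug path with test_and_clear_bit() makes it idempotent: the flag is read and cleared atomically, so if the teardown is reached twice only the first caller sees the bit set and actually deletes the auxiliary device. The idiom in isolation (the flag name below is made up for illustration):

#include <linux/bitops.h>

#define DEMO_FLAG_AUX_DEV_CREATED 0

static void demo_unplug(unsigned long *flags)
{
        /* Atomic read-and-clear: exactly one caller wins and performs the
         * teardown, every later caller returns without touching the device.
         */
        if (!test_and_clear_bit(DEMO_FLAG_AUX_DEV_CREATED, flags))
                return;

        /* ... auxiliary_device_delete() / auxiliary_device_uninit() ... */
}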
/**
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 8e0b06c1e02b..77781277aa8e 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -3176,12 +3176,14 @@ static irqreturn_t ice_ll_ts_intr(int __always_unused irq, void *data)
hw = &pf->hw;
tx = &pf->ptp.port.tx;
spin_lock_irqsave(&tx->lock, flags);
- ice_ptp_complete_tx_single_tstamp(tx);
+ if (tx->init) {
+ ice_ptp_complete_tx_single_tstamp(tx);
- idx = find_next_bit_wrap(tx->in_use, tx->len,
- tx->last_ll_ts_idx_read + 1);
- if (idx != tx->len)
- ice_ptp_req_tx_single_tstamp(tx, idx);
+ idx = find_next_bit_wrap(tx->in_use, tx->len,
+ tx->last_ll_ts_idx_read + 1);
+ if (idx != tx->len)
+ ice_ptp_req_tx_single_tstamp(tx, idx);
+ }
spin_unlock_irqrestore(&tx->lock, flags);
val = GLINT_DYN_CTL_INTENA_M | GLINT_DYN_CTL_CLEARPBA_M |
@@ -4536,17 +4538,23 @@ ice_init_tx_topology(struct ice_hw *hw, const struct firmware *firmware)
dev_info(dev, "Tx scheduling layers switching feature disabled\n");
else
dev_info(dev, "Tx scheduling layers switching feature enabled\n");
- /* if there was a change in topology ice_cfg_tx_topo triggered
- * a CORER and we need to re-init hw
+ return 0;
+ } else if (err == -ENODEV) {
+ /* If we failed to re-initialize the device, we can no longer
+ * continue loading.
*/
- ice_deinit_hw(hw);
- err = ice_init_hw(hw);
-
+ dev_warn(dev, "Failed to initialize hardware after applying Tx scheduling configuration.\n");
return err;
} else if (err == -EIO) {
dev_info(dev, "DDP package does not support Tx scheduling layers switching feature - please update to the latest DDP package and try again\n");
+ return 0;
+ } else if (err == -EEXIST) {
+ return 0;
}
+ /* Do not treat this as a fatal error. */
+ dev_info(dev, "Failed to apply Tx scheduling configuration, err %pe\n",
+ ERR_PTR(err));
return 0;
}
diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c
index e358eb1d719f..fb0f6365a6d6 100644
--- a/drivers/net/ethernet/intel/ice/ice_ptp.c
+++ b/drivers/net/ethernet/intel/ice/ice_ptp.c
@@ -2701,16 +2701,19 @@ irqreturn_t ice_ptp_ts_irq(struct ice_pf *pf)
*/
if (hw->dev_caps.ts_dev_info.ts_ll_int_read) {
struct ice_ptp_tx *tx = &pf->ptp.port.tx;
- u8 idx;
+ u8 idx, last;
if (!ice_pf_state_is_nominal(pf))
return IRQ_HANDLED;
spin_lock(&tx->lock);
- idx = find_next_bit_wrap(tx->in_use, tx->len,
- tx->last_ll_ts_idx_read + 1);
- if (idx != tx->len)
- ice_ptp_req_tx_single_tstamp(tx, idx);
+ if (tx->init) {
+ last = tx->last_ll_ts_idx_read + 1;
+ idx = find_next_bit_wrap(tx->in_use, tx->len,
+ last);
+ if (idx != tx->len)
+ ice_ptp_req_tx_single_tstamp(tx, idx);
+ }
spin_unlock(&tx->lock);
return IRQ_HANDLED;
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 29e0088ab6b2..41e7e29879a3 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -894,10 +894,6 @@ ice_add_xdp_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
__skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, rx_buf->page,
rx_buf->page_offset, size);
sinfo->xdp_frags_size += size;
- /* remember frag count before XDP prog execution; bpf_xdp_adjust_tail()
- * can pop off frags but driver has to handle it on its own
- */
- rx_ring->nr_frags = sinfo->nr_frags;
if (page_is_pfmemalloc(rx_buf->page))
xdp_buff_set_frag_pfmemalloc(xdp);
@@ -968,20 +964,20 @@ ice_get_rx_buf(struct ice_rx_ring *rx_ring, const unsigned int size,
/**
* ice_get_pgcnts - grab page_count() for gathered fragments
* @rx_ring: Rx descriptor ring to store the page counts on
+ * @ntc: the next to clean element (not included in this frame!)
*
* This function is intended to be called right before running XDP
* program so that the page recycling mechanism will be able to take
* a correct decision regarding underlying pages; this is done in such
* way as XDP program can change the refcount of page
*/
-static void ice_get_pgcnts(struct ice_rx_ring *rx_ring)
+static void ice_get_pgcnts(struct ice_rx_ring *rx_ring, unsigned int ntc)
{
- u32 nr_frags = rx_ring->nr_frags + 1;
u32 idx = rx_ring->first_desc;
struct ice_rx_buf *rx_buf;
u32 cnt = rx_ring->count;
- for (int i = 0; i < nr_frags; i++) {
+ while (idx != ntc) {
rx_buf = &rx_ring->rx_buf[idx];
rx_buf->pgcnt = page_count(rx_buf->page);
@@ -1154,62 +1150,51 @@ ice_put_rx_buf(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf)
}
/**
- * ice_put_rx_mbuf - ice_put_rx_buf() caller, for all frame frags
+ * ice_put_rx_mbuf - ice_put_rx_buf() caller, for all buffers in frame
* @rx_ring: Rx ring with all the auxiliary data
* @xdp: XDP buffer carrying linear + frags part
- * @xdp_xmit: XDP_TX/XDP_REDIRECT verdict storage
- * @ntc: a current next_to_clean value to be stored at rx_ring
+ * @ntc: the next to clean element (not included in this frame!)
* @verdict: return code from XDP program execution
*
- * Walk through gathered fragments and satisfy internal page
- * recycle mechanism; we take here an action related to verdict
- * returned by XDP program;
+ * Called after XDP program is completed, or on error with verdict set to
+ * ICE_XDP_CONSUMED.
+ *
+ * Walk through buffers from first_desc to the end of the frame, releasing
+ * buffers and satisfying internal page recycle mechanism. The action depends
+ * on verdict from XDP program.
*/
static void ice_put_rx_mbuf(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
- u32 *xdp_xmit, u32 ntc, u32 verdict)
+ u32 ntc, u32 verdict)
{
- u32 nr_frags = rx_ring->nr_frags + 1;
u32 idx = rx_ring->first_desc;
u32 cnt = rx_ring->count;
- u32 post_xdp_frags = 1;
struct ice_rx_buf *buf;
- int i;
+ u32 xdp_frags = 0;
+ int i = 0;
if (unlikely(xdp_buff_has_frags(xdp)))
- post_xdp_frags += xdp_get_shared_info_from_buff(xdp)->nr_frags;
+ xdp_frags = xdp_get_shared_info_from_buff(xdp)->nr_frags;
- for (i = 0; i < post_xdp_frags; i++) {
+ while (idx != ntc) {
buf = &rx_ring->rx_buf[idx];
+ if (++idx == cnt)
+ idx = 0;
- if (verdict & (ICE_XDP_TX | ICE_XDP_REDIR)) {
+ /* An XDP program could release fragments from the end of the
+ * buffer. For these, we need to keep the pagecnt_bias as-is.
+ * To do this, only adjust pagecnt_bias for fragments up to
+ * the total remaining after the XDP program has run.
+ */
+ if (verdict != ICE_XDP_CONSUMED)
ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz);
- *xdp_xmit |= verdict;
- } else if (verdict & ICE_XDP_CONSUMED) {
+ else if (i++ <= xdp_frags)
buf->pagecnt_bias++;
- } else if (verdict == ICE_XDP_PASS) {
- ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz);
- }
ice_put_rx_buf(rx_ring, buf);
-
- if (++idx == cnt)
- idx = 0;
- }
- /* handle buffers that represented frags released by XDP prog;
- * for these we keep pagecnt_bias as-is; refcount from struct page
- * has been decremented within XDP prog and we do not have to increase
- * the biased refcnt
- */
- for (; i < nr_frags; i++) {
- buf = &rx_ring->rx_buf[idx];
- ice_put_rx_buf(rx_ring, buf);
- if (++idx == cnt)
- idx = 0;
}
xdp->data = NULL;
rx_ring->first_desc = ntc;
- rx_ring->nr_frags = 0;
}
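With rx_ring->nr_frags gone, both ice_get_pgcnts() and ice_put_rx_mbuf() treat the current frame as the half-open index range [first_desc, ntc) on the circular ring. The traversal shape, reduced to a standalone helper (names are generic, not the driver's):

#include <stdint.h>

/* Visit every ring slot in [first, end) with wraparound at 'count'.
 * 'end' itself is not visited, matching the "ntc not included" comments.
 */
static void demo_walk_frame(uint32_t first, uint32_t end, uint32_t count,
                            void (*visit)(uint32_t idx, void *ctx), void *ctx)
{
        uint32_t idx = first;

        while (idx != end) {
                visit(idx, ctx);
                if (++idx == count)
                        idx = 0;
        }
}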
/**
@@ -1317,6 +1302,10 @@ static int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
/* retrieve a buffer from the ring */
rx_buf = ice_get_rx_buf(rx_ring, size, ntc);
+ /* Increment ntc before calls to ice_put_rx_mbuf() */
+ if (++ntc == cnt)
+ ntc = 0;
+
if (!xdp->data) {
void *hard_start;
@@ -1325,24 +1314,23 @@ static int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
xdp_prepare_buff(xdp, hard_start, offset, size, !!offset);
xdp_buff_clear_frags_flag(xdp);
} else if (ice_add_xdp_frag(rx_ring, xdp, rx_buf, size)) {
- ice_put_rx_mbuf(rx_ring, xdp, NULL, ntc, ICE_XDP_CONSUMED);
+ ice_put_rx_mbuf(rx_ring, xdp, ntc, ICE_XDP_CONSUMED);
break;
}
- if (++ntc == cnt)
- ntc = 0;
/* skip if it is NOP desc */
if (ice_is_non_eop(rx_ring, rx_desc))
continue;
- ice_get_pgcnts(rx_ring);
+ ice_get_pgcnts(rx_ring, ntc);
xdp_verdict = ice_run_xdp(rx_ring, xdp, xdp_prog, xdp_ring, rx_desc);
if (xdp_verdict == ICE_XDP_PASS)
goto construct_skb;
total_rx_bytes += xdp_get_buff_len(xdp);
total_rx_pkts++;
- ice_put_rx_mbuf(rx_ring, xdp, &xdp_xmit, ntc, xdp_verdict);
+ ice_put_rx_mbuf(rx_ring, xdp, ntc, xdp_verdict);
+ xdp_xmit |= xdp_verdict & (ICE_XDP_TX | ICE_XDP_REDIR);
continue;
construct_skb:
@@ -1352,10 +1340,10 @@ construct_skb:
skb = ice_construct_skb(rx_ring, xdp);
/* exit if we failed to retrieve a buffer */
if (!skb) {
- rx_ring->ring_stats->rx_stats.alloc_page_failed++;
+ rx_ring->ring_stats->rx_stats.alloc_buf_failed++;
xdp_verdict = ICE_XDP_CONSUMED;
}
- ice_put_rx_mbuf(rx_ring, xdp, &xdp_xmit, ntc, xdp_verdict);
+ ice_put_rx_mbuf(rx_ring, xdp, ntc, xdp_verdict);
if (!skb)
break;
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h
index fef750c5f288..2fd8e78178a2 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -358,7 +358,6 @@ struct ice_rx_ring {
struct ice_tx_ring *xdp_ring;
struct ice_rx_ring *next; /* pointer to next ring in q_vector */
struct xsk_buff_pool *xsk_pool;
- u32 nr_frags;
u16 max_frame;
u16 rx_buf_len;
dma_addr_t dma; /* physical address of ring */
diff --git a/drivers/net/ethernet/intel/idpf/idpf_idc.c b/drivers/net/ethernet/intel/idpf/idpf_idc.c
index 4d2905103215..7e20a07e98e5 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_idc.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_idc.c
@@ -247,10 +247,10 @@ static void idpf_unplug_aux_dev(struct auxiliary_device *adev)
if (!adev)
return;
+ ida_free(&idpf_idc_ida, adev->id);
+
auxiliary_device_delete(adev);
auxiliary_device_uninit(adev);
-
- ida_free(&idpf_idc_ida, adev->id);
}
/**
diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c
index 2c2a3e85d693..513032cb5f08 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_lib.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c
@@ -2344,6 +2344,7 @@ static int idpf_set_mac(struct net_device *netdev, void *p)
struct idpf_netdev_priv *np = netdev_priv(netdev);
struct idpf_vport_config *vport_config;
struct sockaddr *addr = p;
+ u8 old_mac_addr[ETH_ALEN];
struct idpf_vport *vport;
int err = 0;
@@ -2367,17 +2368,19 @@ static int idpf_set_mac(struct net_device *netdev, void *p)
if (ether_addr_equal(netdev->dev_addr, addr->sa_data))
goto unlock_mutex;
+ ether_addr_copy(old_mac_addr, vport->default_mac_addr);
+ ether_addr_copy(vport->default_mac_addr, addr->sa_data);
vport_config = vport->adapter->vport_config[vport->idx];
err = idpf_add_mac_filter(vport, np, addr->sa_data, false);
if (err) {
__idpf_del_mac_filter(vport_config, addr->sa_data);
+ ether_addr_copy(vport->default_mac_addr, netdev->dev_addr);
goto unlock_mutex;
}
- if (is_valid_ether_addr(vport->default_mac_addr))
- idpf_del_mac_filter(vport, np, vport->default_mac_addr, false);
+ if (is_valid_ether_addr(old_mac_addr))
+ __idpf_del_mac_filter(vport_config, old_mac_addr);
- ether_addr_copy(vport->default_mac_addr, addr->sa_data);
eth_hw_addr_set(netdev, addr->sa_data);
unlock_mutex:
diff --git a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c
index 555879b1248d..b19b462e0bb6 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c
@@ -180,6 +180,58 @@ static int idpf_tx_singleq_csum(struct sk_buff *skb,
}
/**
+ * idpf_tx_singleq_dma_map_error - handle TX DMA map errors
+ * @txq: queue to send buffer on
+ * @skb: send buffer
+ * @first: original first buffer info buffer for packet
+ * @idx: starting point on ring to unwind
+ */
+static void idpf_tx_singleq_dma_map_error(struct idpf_tx_queue *txq,
+ struct sk_buff *skb,
+ struct idpf_tx_buf *first, u16 idx)
+{
+ struct libeth_sq_napi_stats ss = { };
+ struct libeth_cq_pp cp = {
+ .dev = txq->dev,
+ .ss = &ss,
+ };
+
+ u64_stats_update_begin(&txq->stats_sync);
+ u64_stats_inc(&txq->q_stats.dma_map_errs);
+ u64_stats_update_end(&txq->stats_sync);
+
+ /* clear dma mappings for failed tx_buf map */
+ for (;;) {
+ struct idpf_tx_buf *tx_buf;
+
+ tx_buf = &txq->tx_buf[idx];
+ libeth_tx_complete(tx_buf, &cp);
+ if (tx_buf == first)
+ break;
+ if (idx == 0)
+ idx = txq->desc_count;
+ idx--;
+ }
+
+ if (skb_is_gso(skb)) {
+ union idpf_tx_flex_desc *tx_desc;
+
+ /* If we failed a DMA mapping for a TSO packet, we will have
+ * used one additional descriptor for a context
+ * descriptor. Reset that here.
+ */
+ tx_desc = &txq->flex_tx[idx];
+ memset(tx_desc, 0, sizeof(*tx_desc));
+ if (idx == 0)
+ idx = txq->desc_count;
+ idx--;
+ }
+
+ /* Update tail in case netdev_xmit_more was previously true */
+ idpf_tx_buf_hw_update(txq, idx, false);
+}
+
+/**
* idpf_tx_singleq_map - Build the Tx base descriptor
* @tx_q: queue to send buffer on
* @first: first buffer info buffer to use
@@ -219,8 +271,9 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q,
for (frag = &skb_shinfo(skb)->frags[0];; frag++) {
unsigned int max_data = IDPF_TX_MAX_DESC_DATA_ALIGNED;
- if (dma_mapping_error(tx_q->dev, dma))
- return idpf_tx_dma_map_error(tx_q, skb, first, i);
+ if (unlikely(dma_mapping_error(tx_q->dev, dma)))
+ return idpf_tx_singleq_dma_map_error(tx_q, skb,
+ first, i);
/* record length, and DMA address */
dma_unmap_len_set(tx_buf, len, size);
@@ -362,11 +415,11 @@ netdev_tx_t idpf_tx_singleq_frame(struct sk_buff *skb,
{
struct idpf_tx_offload_params offload = { };
struct idpf_tx_buf *first;
+ u32 count, buf_count = 1;
int csum, tso, needed;
- unsigned int count;
__be16 protocol;
- count = idpf_tx_desc_count_required(tx_q, skb);
+ count = idpf_tx_res_count_required(tx_q, skb, &buf_count);
if (unlikely(!count))
return idpf_tx_drop_skb(tx_q, skb);
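idpf_tx_singleq_dma_map_error() above walks backwards from the slot that failed to map to the packet's first buffer, wrapping from index 0 back to the end of the ring, and releases each buffer on the way. The reverse-walk idiom on its own (callback and types are placeholders):

#include <stdint.h>

/* Release slots from 'idx' back to 'first' (inclusive), wrapping from 0
 * to count - 1, mirroring the unwind loop in the error handler above.
 */
static void demo_unwind(uint32_t idx, uint32_t first, uint32_t count,
                        void (*release)(uint32_t id))
{
        for (;;) {
                release(idx);
                if (idx == first)
                        break;
                if (idx == 0)
                        idx = count;
                idx--;
        }
}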
diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c
index 66a1b040639d..eaad52a83b04 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c
@@ -8,48 +8,13 @@
#include "idpf_ptp.h"
#include "idpf_virtchnl.h"
-struct idpf_tx_stash {
- struct hlist_node hlist;
- struct libeth_sqe buf;
-};
-
-#define idpf_tx_buf_compl_tag(buf) (*(u32 *)&(buf)->priv)
+#define idpf_tx_buf_next(buf) (*(u32 *)&(buf)->priv)
LIBETH_SQE_CHECK_PRIV(u32);
static bool idpf_chk_linearize(struct sk_buff *skb, unsigned int max_bufs,
unsigned int count);
/**
- * idpf_buf_lifo_push - push a buffer pointer onto stack
- * @stack: pointer to stack struct
- * @buf: pointer to buf to push
- *
- * Returns 0 on success, negative on failure
- **/
-static int idpf_buf_lifo_push(struct idpf_buf_lifo *stack,
- struct idpf_tx_stash *buf)
-{
- if (unlikely(stack->top == stack->size))
- return -ENOSPC;
-
- stack->bufs[stack->top++] = buf;
-
- return 0;
-}
-
-/**
- * idpf_buf_lifo_pop - pop a buffer pointer from stack
- * @stack: pointer to stack struct
- **/
-static struct idpf_tx_stash *idpf_buf_lifo_pop(struct idpf_buf_lifo *stack)
-{
- if (unlikely(!stack->top))
- return NULL;
-
- return stack->bufs[--stack->top];
-}
-
-/**
* idpf_tx_timeout - Respond to a Tx Hang
* @netdev: network interface device structure
* @txqueue: TX queue
@@ -77,52 +42,22 @@ void idpf_tx_timeout(struct net_device *netdev, unsigned int txqueue)
static void idpf_tx_buf_rel_all(struct idpf_tx_queue *txq)
{
struct libeth_sq_napi_stats ss = { };
- struct idpf_buf_lifo *buf_stack;
- struct idpf_tx_stash *stash;
struct libeth_cq_pp cp = {
.dev = txq->dev,
.ss = &ss,
};
- struct hlist_node *tmp;
- u32 i, tag;
+ u32 i;
/* Buffers already cleared, nothing to do */
if (!txq->tx_buf)
return;
/* Free all the Tx buffer sk_buffs */
- for (i = 0; i < txq->desc_count; i++)
+ for (i = 0; i < txq->buf_pool_size; i++)
libeth_tx_complete(&txq->tx_buf[i], &cp);
kfree(txq->tx_buf);
txq->tx_buf = NULL;
-
- if (!idpf_queue_has(FLOW_SCH_EN, txq))
- return;
-
- buf_stack = &txq->stash->buf_stack;
- if (!buf_stack->bufs)
- return;
-
- /*
- * If a Tx timeout occurred, there are potentially still bufs in the
- * hash table, free them here.
- */
- hash_for_each_safe(txq->stash->sched_buf_hash, tag, tmp, stash,
- hlist) {
- if (!stash)
- continue;
-
- libeth_tx_complete(&stash->buf, &cp);
- hash_del(&stash->hlist);
- idpf_buf_lifo_push(buf_stack, stash);
- }
-
- for (i = 0; i < buf_stack->size; i++)
- kfree(buf_stack->bufs[i]);
-
- kfree(buf_stack->bufs);
- buf_stack->bufs = NULL;
}
/**
@@ -139,6 +74,9 @@ static void idpf_tx_desc_rel(struct idpf_tx_queue *txq)
if (!txq->desc_ring)
return;
+ if (txq->refillq)
+ kfree(txq->refillq->ring);
+
dmam_free_coherent(txq->dev, txq->size, txq->desc_ring, txq->dma);
txq->desc_ring = NULL;
txq->next_to_use = 0;
@@ -195,41 +133,18 @@ static void idpf_tx_desc_rel_all(struct idpf_vport *vport)
*/
static int idpf_tx_buf_alloc_all(struct idpf_tx_queue *tx_q)
{
- struct idpf_buf_lifo *buf_stack;
- int buf_size;
- int i;
-
/* Allocate book keeping buffers only. Buffers to be supplied to HW
* are allocated by kernel network stack and received as part of skb
*/
- buf_size = sizeof(struct idpf_tx_buf) * tx_q->desc_count;
- tx_q->tx_buf = kzalloc(buf_size, GFP_KERNEL);
+ if (idpf_queue_has(FLOW_SCH_EN, tx_q))
+ tx_q->buf_pool_size = U16_MAX;
+ else
+ tx_q->buf_pool_size = tx_q->desc_count;
+ tx_q->tx_buf = kcalloc(tx_q->buf_pool_size, sizeof(*tx_q->tx_buf),
+ GFP_KERNEL);
if (!tx_q->tx_buf)
return -ENOMEM;
- if (!idpf_queue_has(FLOW_SCH_EN, tx_q))
- return 0;
-
- buf_stack = &tx_q->stash->buf_stack;
-
- /* Initialize tx buf stack for out-of-order completions if
- * flow scheduling offload is enabled
- */
- buf_stack->bufs = kcalloc(tx_q->desc_count, sizeof(*buf_stack->bufs),
- GFP_KERNEL);
- if (!buf_stack->bufs)
- return -ENOMEM;
-
- buf_stack->size = tx_q->desc_count;
- buf_stack->top = tx_q->desc_count;
-
- for (i = 0; i < tx_q->desc_count; i++) {
- buf_stack->bufs[i] = kzalloc(sizeof(*buf_stack->bufs[i]),
- GFP_KERNEL);
- if (!buf_stack->bufs[i])
- return -ENOMEM;
- }
-
return 0;
}
@@ -244,6 +159,7 @@ static int idpf_tx_desc_alloc(const struct idpf_vport *vport,
struct idpf_tx_queue *tx_q)
{
struct device *dev = tx_q->dev;
+ struct idpf_sw_queue *refillq;
int err;
err = idpf_tx_buf_alloc_all(tx_q);
@@ -267,6 +183,31 @@ static int idpf_tx_desc_alloc(const struct idpf_vport *vport,
tx_q->next_to_clean = 0;
idpf_queue_set(GEN_CHK, tx_q);
+ if (!idpf_queue_has(FLOW_SCH_EN, tx_q))
+ return 0;
+
+ refillq = tx_q->refillq;
+ refillq->desc_count = tx_q->buf_pool_size;
+ refillq->ring = kcalloc(refillq->desc_count, sizeof(u32),
+ GFP_KERNEL);
+ if (!refillq->ring) {
+ err = -ENOMEM;
+ goto err_alloc;
+ }
+
+ for (unsigned int i = 0; i < refillq->desc_count; i++)
+ refillq->ring[i] =
+ FIELD_PREP(IDPF_RFL_BI_BUFID_M, i) |
+ FIELD_PREP(IDPF_RFL_BI_GEN_M,
+ idpf_queue_has(GEN_CHK, refillq));
+
+ /* Go ahead and flip the GEN bit since this counts as filling
+ * up the ring, i.e. we already ring wrapped.
+ */
+ idpf_queue_change(GEN_CHK, refillq);
+
+ tx_q->last_re = tx_q->desc_count - IDPF_TX_SPLITQ_RE_MIN_GAP;
+
return 0;
err_alloc:
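Each refill queue entry packs a buffer ID together with a software generation (GEN) bit; after every wrap the producer flips its GEN state, so the consumer can tell freshly posted entries from stale ones left over from the previous pass. A reduced sketch of encoding and validating such an entry (the field layout here is illustrative, not the IDPF_RFL_BI_* masks):

#include <stdbool.h>
#include <stdint.h>

#define DEMO_RFL_BUFID_MASK 0x0000FFFFu   /* illustrative buffer-id field */
#define DEMO_RFL_GEN_MASK   0x00010000u   /* illustrative generation bit */

static uint32_t demo_rfl_encode(uint32_t buf_id, bool gen)
{
        return (buf_id & DEMO_RFL_BUFID_MASK) |
               (gen ? DEMO_RFL_GEN_MASK : 0);
}

/* An entry is consumable only while its GEN bit matches the consumer's
 * current generation; entries from before the last wrap no longer match.
 */
static bool demo_rfl_fresh(uint32_t entry, bool consumer_gen)
{
        return !!(entry & DEMO_RFL_GEN_MASK) == consumer_gen;
}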
@@ -317,8 +258,6 @@ static int idpf_tx_desc_alloc_all(struct idpf_vport *vport)
for (i = 0; i < vport->num_txq_grp; i++) {
for (j = 0; j < vport->txq_grps[i].num_txq; j++) {
struct idpf_tx_queue *txq = vport->txq_grps[i].txqs[j];
- u8 gen_bits = 0;
- u16 bufidx_mask;
err = idpf_tx_desc_alloc(vport, txq);
if (err) {
@@ -327,34 +266,6 @@ static int idpf_tx_desc_alloc_all(struct idpf_vport *vport)
i);
goto err_out;
}
-
- if (!idpf_is_queue_model_split(vport->txq_model))
- continue;
-
- txq->compl_tag_cur_gen = 0;
-
- /* Determine the number of bits in the bufid
- * mask and add one to get the start of the
- * generation bits
- */
- bufidx_mask = txq->desc_count - 1;
- while (bufidx_mask >> 1) {
- txq->compl_tag_gen_s++;
- bufidx_mask = bufidx_mask >> 1;
- }
- txq->compl_tag_gen_s++;
-
- gen_bits = IDPF_TX_SPLITQ_COMPL_TAG_WIDTH -
- txq->compl_tag_gen_s;
- txq->compl_tag_gen_max = GETMAXVAL(gen_bits);
-
- /* Set bufid mask based on location of first
- * gen bit; it cannot simply be the descriptor
- * ring size-1 since we can have size values
- * where not all of those bits are set.
- */
- txq->compl_tag_bufid_m =
- GETMAXVAL(txq->compl_tag_gen_s);
}
if (!idpf_is_queue_model_split(vport->txq_model))
@@ -603,18 +514,18 @@ static int idpf_rx_hdr_buf_alloc_all(struct idpf_buf_queue *bufq)
}
/**
- * idpf_rx_post_buf_refill - Post buffer id to refill queue
+ * idpf_post_buf_refill - Post buffer id to refill queue
* @refillq: refill queue to post to
* @buf_id: buffer id to post
*/
-static void idpf_rx_post_buf_refill(struct idpf_sw_queue *refillq, u16 buf_id)
+static void idpf_post_buf_refill(struct idpf_sw_queue *refillq, u16 buf_id)
{
u32 nta = refillq->next_to_use;
/* store the buffer ID and the SW maintained GEN bit to the refillq */
refillq->ring[nta] =
- FIELD_PREP(IDPF_RX_BI_BUFID_M, buf_id) |
- FIELD_PREP(IDPF_RX_BI_GEN_M,
+ FIELD_PREP(IDPF_RFL_BI_BUFID_M, buf_id) |
+ FIELD_PREP(IDPF_RFL_BI_GEN_M,
idpf_queue_has(GEN_CHK, refillq));
if (unlikely(++nta == refillq->desc_count)) {
@@ -995,6 +906,11 @@ static void idpf_txq_group_rel(struct idpf_vport *vport)
struct idpf_txq_group *txq_grp = &vport->txq_grps[i];
for (j = 0; j < txq_grp->num_txq; j++) {
+ if (flow_sch_en) {
+ kfree(txq_grp->txqs[j]->refillq);
+ txq_grp->txqs[j]->refillq = NULL;
+ }
+
kfree(txq_grp->txqs[j]);
txq_grp->txqs[j] = NULL;
}
@@ -1004,9 +920,6 @@ static void idpf_txq_group_rel(struct idpf_vport *vport)
kfree(txq_grp->complq);
txq_grp->complq = NULL;
-
- if (flow_sch_en)
- kfree(txq_grp->stashes);
}
kfree(vport->txq_grps);
vport->txq_grps = NULL;
@@ -1367,7 +1280,6 @@ static int idpf_txq_group_alloc(struct idpf_vport *vport, u16 num_txq)
for (i = 0; i < vport->num_txq_grp; i++) {
struct idpf_txq_group *tx_qgrp = &vport->txq_grps[i];
struct idpf_adapter *adapter = vport->adapter;
- struct idpf_txq_stash *stashes;
int j;
tx_qgrp->vport = vport;
@@ -1380,15 +1292,6 @@ static int idpf_txq_group_alloc(struct idpf_vport *vport, u16 num_txq)
goto err_alloc;
}
- if (split && flow_sch_en) {
- stashes = kcalloc(num_txq, sizeof(*stashes),
- GFP_KERNEL);
- if (!stashes)
- goto err_alloc;
-
- tx_qgrp->stashes = stashes;
- }
-
for (j = 0; j < tx_qgrp->num_txq; j++) {
struct idpf_tx_queue *q = tx_qgrp->txqs[j];
@@ -1408,12 +1311,14 @@ static int idpf_txq_group_alloc(struct idpf_vport *vport, u16 num_txq)
if (!flow_sch_en)
continue;
- if (split) {
- q->stash = &stashes[j];
- hash_init(q->stash->sched_buf_hash);
- }
-
idpf_queue_set(FLOW_SCH_EN, q);
+
+ q->refillq = kzalloc(sizeof(*q->refillq), GFP_KERNEL);
+ if (!q->refillq)
+ goto err_alloc;
+
+ idpf_queue_set(GEN_CHK, q->refillq);
+ idpf_queue_set(RFL_GEN_CHK, q->refillq);
}
if (!split)
@@ -1697,87 +1602,6 @@ static void idpf_tx_read_tstamp(struct idpf_tx_queue *txq, struct sk_buff *skb)
spin_unlock_bh(&tx_tstamp_caps->status_lock);
}
-/**
- * idpf_tx_clean_stashed_bufs - clean bufs that were stored for
- * out of order completions
- * @txq: queue to clean
- * @compl_tag: completion tag of packet to clean (from completion descriptor)
- * @cleaned: pointer to stats struct to track cleaned packets/bytes
- * @budget: Used to determine if we are in netpoll
- */
-static void idpf_tx_clean_stashed_bufs(struct idpf_tx_queue *txq,
- u16 compl_tag,
- struct libeth_sq_napi_stats *cleaned,
- int budget)
-{
- struct idpf_tx_stash *stash;
- struct hlist_node *tmp_buf;
- struct libeth_cq_pp cp = {
- .dev = txq->dev,
- .ss = cleaned,
- .napi = budget,
- };
-
- /* Buffer completion */
- hash_for_each_possible_safe(txq->stash->sched_buf_hash, stash, tmp_buf,
- hlist, compl_tag) {
- if (unlikely(idpf_tx_buf_compl_tag(&stash->buf) != compl_tag))
- continue;
-
- hash_del(&stash->hlist);
-
- if (stash->buf.type == LIBETH_SQE_SKB &&
- (skb_shinfo(stash->buf.skb)->tx_flags & SKBTX_IN_PROGRESS))
- idpf_tx_read_tstamp(txq, stash->buf.skb);
-
- libeth_tx_complete(&stash->buf, &cp);
-
- /* Push shadow buf back onto stack */
- idpf_buf_lifo_push(&txq->stash->buf_stack, stash);
- }
-}
-
-/**
- * idpf_stash_flow_sch_buffers - store buffer parameters info to be freed at a
- * later time (only relevant for flow scheduling mode)
- * @txq: Tx queue to clean
- * @tx_buf: buffer to store
- */
-static int idpf_stash_flow_sch_buffers(struct idpf_tx_queue *txq,
- struct idpf_tx_buf *tx_buf)
-{
- struct idpf_tx_stash *stash;
-
- if (unlikely(tx_buf->type <= LIBETH_SQE_CTX))
- return 0;
-
- stash = idpf_buf_lifo_pop(&txq->stash->buf_stack);
- if (unlikely(!stash)) {
- net_err_ratelimited("%s: No out-of-order TX buffers left!\n",
- netdev_name(txq->netdev));
-
- return -ENOMEM;
- }
-
- /* Store buffer params in shadow buffer */
- stash->buf.skb = tx_buf->skb;
- stash->buf.bytes = tx_buf->bytes;
- stash->buf.packets = tx_buf->packets;
- stash->buf.type = tx_buf->type;
- stash->buf.nr_frags = tx_buf->nr_frags;
- dma_unmap_addr_set(&stash->buf, dma, dma_unmap_addr(tx_buf, dma));
- dma_unmap_len_set(&stash->buf, len, dma_unmap_len(tx_buf, len));
- idpf_tx_buf_compl_tag(&stash->buf) = idpf_tx_buf_compl_tag(tx_buf);
-
- /* Add buffer to buf_hash table to be freed later */
- hash_add(txq->stash->sched_buf_hash, &stash->hlist,
- idpf_tx_buf_compl_tag(&stash->buf));
-
- tx_buf->type = LIBETH_SQE_EMPTY;
-
- return 0;
-}
-
#define idpf_tx_splitq_clean_bump_ntc(txq, ntc, desc, buf) \
do { \
if (unlikely(++(ntc) == (txq)->desc_count)) { \
@@ -1805,14 +1629,8 @@ do { \
* Separate packet completion events will be reported on the completion queue,
* and the buffers will be cleaned separately. The stats are not updated from
* this function when using flow-based scheduling.
- *
- * Furthermore, in flow scheduling mode, check to make sure there are enough
- * reserve buffers to stash the packet. If there are not, return early, which
- * will leave next_to_clean pointing to the packet that failed to be stashed.
- *
- * Return: false in the scenario above, true otherwise.
*/
-static bool idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end,
+static void idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end,
int napi_budget,
struct libeth_sq_napi_stats *cleaned,
bool descs_only)
@@ -1826,7 +1644,12 @@ static bool idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end,
.napi = napi_budget,
};
struct idpf_tx_buf *tx_buf;
- bool clean_complete = true;
+
+ if (descs_only) {
+ /* Bump ring index to mark as cleaned. */
+ tx_q->next_to_clean = end;
+ return;
+ }
tx_desc = &tx_q->flex_tx[ntc];
next_pending_desc = &tx_q->flex_tx[end];
@@ -1846,136 +1669,61 @@ static bool idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end,
break;
eop_idx = tx_buf->rs_idx;
+ libeth_tx_complete(tx_buf, &cp);
- if (descs_only) {
- if (IDPF_TX_BUF_RSV_UNUSED(tx_q) < tx_buf->nr_frags) {
- clean_complete = false;
- goto tx_splitq_clean_out;
- }
-
- idpf_stash_flow_sch_buffers(tx_q, tx_buf);
+ /* unmap remaining buffers */
+ while (ntc != eop_idx) {
+ idpf_tx_splitq_clean_bump_ntc(tx_q, ntc,
+ tx_desc, tx_buf);
- while (ntc != eop_idx) {
- idpf_tx_splitq_clean_bump_ntc(tx_q, ntc,
- tx_desc, tx_buf);
- idpf_stash_flow_sch_buffers(tx_q, tx_buf);
- }
- } else {
+ /* unmap any remaining paged data */
libeth_tx_complete(tx_buf, &cp);
-
- /* unmap remaining buffers */
- while (ntc != eop_idx) {
- idpf_tx_splitq_clean_bump_ntc(tx_q, ntc,
- tx_desc, tx_buf);
-
- /* unmap any remaining paged data */
- libeth_tx_complete(tx_buf, &cp);
- }
}
fetch_next_txq_desc:
idpf_tx_splitq_clean_bump_ntc(tx_q, ntc, tx_desc, tx_buf);
}
-tx_splitq_clean_out:
tx_q->next_to_clean = ntc;
-
- return clean_complete;
}
-#define idpf_tx_clean_buf_ring_bump_ntc(txq, ntc, buf) \
-do { \
- (buf)++; \
- (ntc)++; \
- if (unlikely((ntc) == (txq)->desc_count)) { \
- buf = (txq)->tx_buf; \
- ntc = 0; \
- } \
-} while (0)
-
/**
- * idpf_tx_clean_buf_ring - clean flow scheduling TX queue buffers
+ * idpf_tx_clean_bufs - clean flow scheduling TX queue buffers
* @txq: queue to clean
- * @compl_tag: completion tag of packet to clean (from completion descriptor)
+ * @buf_id: packet's starting buffer ID, from completion descriptor
* @cleaned: pointer to stats struct to track cleaned packets/bytes
* @budget: Used to determine if we are in netpoll
*
- * Cleans all buffers associated with the input completion tag either from the
- * TX buffer ring or from the hash table if the buffers were previously
- * stashed. Returns the byte/segment count for the cleaned packet associated
- * this completion tag.
+ * Clean all buffers associated with the packet starting at buf_id. The byte
+ * and segment counts for the cleaned packet are accumulated in @cleaned.
*/
-static bool idpf_tx_clean_buf_ring(struct idpf_tx_queue *txq, u16 compl_tag,
- struct libeth_sq_napi_stats *cleaned,
- int budget)
+static void idpf_tx_clean_bufs(struct idpf_tx_queue *txq, u32 buf_id,
+ struct libeth_sq_napi_stats *cleaned,
+ int budget)
{
- u16 idx = compl_tag & txq->compl_tag_bufid_m;
struct idpf_tx_buf *tx_buf = NULL;
struct libeth_cq_pp cp = {
.dev = txq->dev,
.ss = cleaned,
.napi = budget,
};
- u16 ntc, orig_idx = idx;
-
- tx_buf = &txq->tx_buf[idx];
-
- if (unlikely(tx_buf->type <= LIBETH_SQE_CTX ||
- idpf_tx_buf_compl_tag(tx_buf) != compl_tag))
- return false;
+ tx_buf = &txq->tx_buf[buf_id];
if (tx_buf->type == LIBETH_SQE_SKB) {
if (skb_shinfo(tx_buf->skb)->tx_flags & SKBTX_IN_PROGRESS)
idpf_tx_read_tstamp(txq, tx_buf->skb);
libeth_tx_complete(tx_buf, &cp);
+ idpf_post_buf_refill(txq->refillq, buf_id);
}
- idpf_tx_clean_buf_ring_bump_ntc(txq, idx, tx_buf);
+ while (idpf_tx_buf_next(tx_buf) != IDPF_TXBUF_NULL) {
+ buf_id = idpf_tx_buf_next(tx_buf);
- while (idpf_tx_buf_compl_tag(tx_buf) == compl_tag) {
+ tx_buf = &txq->tx_buf[buf_id];
libeth_tx_complete(tx_buf, &cp);
- idpf_tx_clean_buf_ring_bump_ntc(txq, idx, tx_buf);
+ idpf_post_buf_refill(txq->refillq, buf_id);
}
-
- /*
- * It's possible the packet we just cleaned was an out of order
- * completion, which means we can stash the buffers starting from
- * the original next_to_clean and reuse the descriptors. We need
- * to compare the descriptor ring next_to_clean packet's "first" buffer
- * to the "first" buffer of the packet we just cleaned to determine if
- * this is the case. Howevever, next_to_clean can point to either a
- * reserved buffer that corresponds to a context descriptor used for the
- * next_to_clean packet (TSO packet) or the "first" buffer (single
- * packet). The orig_idx from the packet we just cleaned will always
- * point to the "first" buffer. If next_to_clean points to a reserved
- * buffer, let's bump ntc once and start the comparison from there.
- */
- ntc = txq->next_to_clean;
- tx_buf = &txq->tx_buf[ntc];
-
- if (tx_buf->type == LIBETH_SQE_CTX)
- idpf_tx_clean_buf_ring_bump_ntc(txq, ntc, tx_buf);
-
- /*
- * If ntc still points to a different "first" buffer, clean the
- * descriptor ring and stash all of the buffers for later cleaning. If
- * we cannot stash all of the buffers, next_to_clean will point to the
- * "first" buffer of the packet that could not be stashed and cleaning
- * will start there next time.
- */
- if (unlikely(tx_buf != &txq->tx_buf[orig_idx] &&
- !idpf_tx_splitq_clean(txq, orig_idx, budget, cleaned,
- true)))
- return true;
-
- /*
- * Otherwise, update next_to_clean to reflect the cleaning that was
- * done above.
- */
- txq->next_to_clean = idx;
-
- return true;
}
/**
@@ -1994,22 +1742,17 @@ static void idpf_tx_handle_rs_completion(struct idpf_tx_queue *txq,
struct libeth_sq_napi_stats *cleaned,
int budget)
{
- u16 compl_tag;
+ /* RS completion contains queue head for queue based scheduling or
+ * completion tag for flow based scheduling.
+ */
+ u16 rs_compl_val = le16_to_cpu(desc->q_head_compl_tag.q_head);
if (!idpf_queue_has(FLOW_SCH_EN, txq)) {
- u16 head = le16_to_cpu(desc->q_head_compl_tag.q_head);
-
- idpf_tx_splitq_clean(txq, head, budget, cleaned, false);
+ idpf_tx_splitq_clean(txq, rs_compl_val, budget, cleaned, false);
return;
}
- compl_tag = le16_to_cpu(desc->q_head_compl_tag.compl_tag);
-
- /* If we didn't clean anything on the ring, this packet must be
- * in the hash table. Go clean it there.
- */
- if (!idpf_tx_clean_buf_ring(txq, compl_tag, cleaned, budget))
- idpf_tx_clean_stashed_bufs(txq, compl_tag, cleaned, budget);
+ idpf_tx_clean_bufs(txq, rs_compl_val, cleaned, budget);
}
/**
@@ -2126,8 +1869,7 @@ fetch_next_desc:
/* Update BQL */
nq = netdev_get_tx_queue(tx_q->netdev, tx_q->idx);
- dont_wake = !complq_ok || IDPF_TX_BUF_RSV_LOW(tx_q) ||
- np->state != __IDPF_VPORT_UP ||
+ dont_wake = !complq_ok || np->state != __IDPF_VPORT_UP ||
!netif_carrier_ok(tx_q->netdev);
/* Check if the TXQ needs to and can be restarted */
__netif_txq_completed_wake(nq, tx_q->cleaned_pkts, tx_q->cleaned_bytes,
@@ -2184,15 +1926,21 @@ void idpf_tx_splitq_build_flow_desc(union idpf_tx_flex_desc *desc,
desc->flow.qw1.compl_tag = cpu_to_le16(params->compl_tag);
}
-/* Global conditions to tell whether the txq (and related resources)
- * has room to allow the use of "size" descriptors.
+/**
+ * idpf_tx_splitq_has_room - check if enough Tx splitq resources are available
+ * @tx_q: the queue to be checked
+ * @descs_needed: number of descriptors required for this packet
+ * @bufs_needed: number of Tx buffers required for this packet
+ *
+ * Return: 0 if no room available, 1 otherwise
*/
-static int idpf_txq_has_room(struct idpf_tx_queue *tx_q, u32 size)
+static int idpf_txq_has_room(struct idpf_tx_queue *tx_q, u32 descs_needed,
+ u32 bufs_needed)
{
- if (IDPF_DESC_UNUSED(tx_q) < size ||
+ if (IDPF_DESC_UNUSED(tx_q) < descs_needed ||
IDPF_TX_COMPLQ_PENDING(tx_q->txq_grp) >
IDPF_TX_COMPLQ_OVERFLOW_THRESH(tx_q->txq_grp->complq) ||
- IDPF_TX_BUF_RSV_LOW(tx_q))
+ idpf_tx_splitq_get_free_bufs(tx_q->refillq) < bufs_needed)
return 0;
return 1;
}
@@ -2201,14 +1949,21 @@ static int idpf_txq_has_room(struct idpf_tx_queue *tx_q, u32 size)
* idpf_tx_maybe_stop_splitq - 1st level check for Tx splitq stop conditions
* @tx_q: the queue to be checked
* @descs_needed: number of descriptors required for this packet
+ * @bufs_needed: number of buffers needed for this packet
*
- * Returns 0 if stop is not needed
+ * Return: 0 if stop is not needed
*/
static int idpf_tx_maybe_stop_splitq(struct idpf_tx_queue *tx_q,
- unsigned int descs_needed)
+ u32 descs_needed,
+ u32 bufs_needed)
{
+	/* Since we have multiple resources to check for splitq, the
+	 * start_thrs/stop_thrs arguments become a boolean check instead
+	 * of a count threshold.
+	 */
if (netif_subqueue_maybe_stop(tx_q->netdev, tx_q->idx,
- idpf_txq_has_room(tx_q, descs_needed),
+ idpf_txq_has_room(tx_q, descs_needed,
+ bufs_needed),
1, 1))
return 0;
@@ -2250,14 +2005,16 @@ void idpf_tx_buf_hw_update(struct idpf_tx_queue *tx_q, u32 val,
}
/**
- * idpf_tx_desc_count_required - calculate number of Tx descriptors needed
+ * idpf_tx_res_count_required - get number of Tx resources needed for this pkt
* @txq: queue to send buffer on
* @skb: send buffer
+ * @bufs_needed: (output) number of buffers needed for this skb.
*
- * Returns number of data descriptors needed for this skb.
+ * Return: number of data descriptors needed for this skb; the buffer count
+ * is returned via @bufs_needed.
*/
-unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq,
- struct sk_buff *skb)
+unsigned int idpf_tx_res_count_required(struct idpf_tx_queue *txq,
+ struct sk_buff *skb,
+ u32 *bufs_needed)
{
const struct skb_shared_info *shinfo;
unsigned int count = 0, i;
@@ -2268,6 +2025,7 @@ unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq,
return count;
shinfo = skb_shinfo(skb);
+ *bufs_needed += shinfo->nr_frags;
for (i = 0; i < shinfo->nr_frags; i++) {
unsigned int size;
@@ -2297,71 +2055,89 @@ unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq,
}
/**
- * idpf_tx_dma_map_error - handle TX DMA map errors
- * @txq: queue to send buffer on
- * @skb: send buffer
- * @first: original first buffer info buffer for packet
- * @idx: starting point on ring to unwind
+ * idpf_tx_splitq_bump_ntu - adjust NTU and generation
+ * @txq: the tx ring to wrap
+ * @ntu: ring index to bump
*/
-void idpf_tx_dma_map_error(struct idpf_tx_queue *txq, struct sk_buff *skb,
- struct idpf_tx_buf *first, u16 idx)
+static unsigned int idpf_tx_splitq_bump_ntu(struct idpf_tx_queue *txq, u16 ntu)
{
- struct libeth_sq_napi_stats ss = { };
- struct libeth_cq_pp cp = {
- .dev = txq->dev,
- .ss = &ss,
- };
+ ntu++;
- u64_stats_update_begin(&txq->stats_sync);
- u64_stats_inc(&txq->q_stats.dma_map_errs);
- u64_stats_update_end(&txq->stats_sync);
+ if (ntu == txq->desc_count)
+ ntu = 0;
- /* clear dma mappings for failed tx_buf map */
- for (;;) {
- struct idpf_tx_buf *tx_buf;
+ return ntu;
+}
- tx_buf = &txq->tx_buf[idx];
- libeth_tx_complete(tx_buf, &cp);
- if (tx_buf == first)
- break;
- if (idx == 0)
- idx = txq->desc_count;
- idx--;
- }
+/**
+ * idpf_tx_get_free_buf_id - get a free buffer ID from the refill queue
+ * @refillq: refill queue to get buffer ID from
+ * @buf_id: return buffer ID
+ *
+ * Return: true if a buffer ID was found, false if not
+ */
+static bool idpf_tx_get_free_buf_id(struct idpf_sw_queue *refillq,
+ u32 *buf_id)
+{
+ u32 ntc = refillq->next_to_clean;
+ u32 refill_desc;
- if (skb_is_gso(skb)) {
- union idpf_tx_flex_desc *tx_desc;
+ refill_desc = refillq->ring[ntc];
- /* If we failed a DMA mapping for a TSO packet, we will have
- * used one additional descriptor for a context
- * descriptor. Reset that here.
- */
- tx_desc = &txq->flex_tx[idx];
- memset(tx_desc, 0, sizeof(*tx_desc));
- if (idx == 0)
- idx = txq->desc_count;
- idx--;
+ if (unlikely(idpf_queue_has(RFL_GEN_CHK, refillq) !=
+ !!(refill_desc & IDPF_RFL_BI_GEN_M)))
+ return false;
+
+ *buf_id = FIELD_GET(IDPF_RFL_BI_BUFID_M, refill_desc);
+
+ if (unlikely(++ntc == refillq->desc_count)) {
+ idpf_queue_change(RFL_GEN_CHK, refillq);
+ ntc = 0;
}
- /* Update tail in case netdev_xmit_more was previously true */
- idpf_tx_buf_hw_update(txq, idx, false);
+ refillq->next_to_clean = ntc;
+
+ return true;
}
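The generation-bit handshake above is what lets the refillq producer and consumer share one ring without exchanging head/tail pointers. A minimal userspace sketch of the same idea (hypothetical demo_* names and standard C types, not part of the patch): the consumer accepts a slot only while the slot's stored generation bit matches its own running generation, and flips that expectation each time next_to_clean wraps, so leftover entries from the previous lap are ignored.

#include <stdbool.h>
#include <stdint.h>

#define DEMO_GEN_BIT	(1u << 16)	/* plays the role of IDPF_RFL_BI_GEN_M */
#define DEMO_BUFID_MSK	0xffffu		/* plays the role of IDPF_RFL_BI_BUFID_M */

static bool demo_pop_buf_id(const uint32_t *ring, uint32_t size,
			    uint32_t *ntc, bool *gen, uint32_t *buf_id)
{
	uint32_t desc = ring[*ntc];

	/* Slot still holds a value from the previous lap: nothing to pop. */
	if (!!(desc & DEMO_GEN_BIT) != *gen)
		return false;

	*buf_id = desc & DEMO_BUFID_MSK;

	if (++(*ntc) == size) {
		*ntc = 0;
		*gen = !*gen;	/* wrapped: expect the opposite generation */
	}

	return true;
}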
/**
- * idpf_tx_splitq_bump_ntu - adjust NTU and generation
- * @txq: the tx ring to wrap
- * @ntu: ring index to bump
+ * idpf_tx_splitq_pkt_err_unmap - Unmap buffers and bump tail in case of error
+ * @txq: Tx queue to unwind
+ * @params: pointer to splitq params struct
+ * @first: starting buffer for packet to unmap
*/
-static unsigned int idpf_tx_splitq_bump_ntu(struct idpf_tx_queue *txq, u16 ntu)
+static void idpf_tx_splitq_pkt_err_unmap(struct idpf_tx_queue *txq,
+ struct idpf_tx_splitq_params *params,
+ struct idpf_tx_buf *first)
{
- ntu++;
+ struct idpf_sw_queue *refillq = txq->refillq;
+ struct libeth_sq_napi_stats ss = { };
+ struct idpf_tx_buf *tx_buf = first;
+ struct libeth_cq_pp cp = {
+ .dev = txq->dev,
+ .ss = &ss,
+ };
- if (ntu == txq->desc_count) {
- ntu = 0;
- txq->compl_tag_cur_gen = IDPF_TX_ADJ_COMPL_TAG_GEN(txq);
+ u64_stats_update_begin(&txq->stats_sync);
+ u64_stats_inc(&txq->q_stats.dma_map_errs);
+ u64_stats_update_end(&txq->stats_sync);
+
+ libeth_tx_complete(tx_buf, &cp);
+ while (idpf_tx_buf_next(tx_buf) != IDPF_TXBUF_NULL) {
+ tx_buf = &txq->tx_buf[idpf_tx_buf_next(tx_buf)];
+ libeth_tx_complete(tx_buf, &cp);
}
- return ntu;
+ /* Update tail in case netdev_xmit_more was previously true. */
+ idpf_tx_buf_hw_update(txq, params->prev_ntu, false);
+
+ if (!refillq)
+ return;
+
+ /* Restore refillq state to avoid leaking tags. */
+ if (params->prev_refill_gen != idpf_queue_has(RFL_GEN_CHK, refillq))
+ idpf_queue_change(RFL_GEN_CHK, refillq);
+ refillq->next_to_clean = params->prev_refill_ntc;
}
/**
@@ -2385,6 +2161,7 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q,
struct netdev_queue *nq;
struct sk_buff *skb;
skb_frag_t *frag;
+ u32 next_buf_id;
u16 td_cmd = 0;
dma_addr_t dma;
@@ -2402,17 +2179,16 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q,
tx_buf = first;
first->nr_frags = 0;
- params->compl_tag =
- (tx_q->compl_tag_cur_gen << tx_q->compl_tag_gen_s) | i;
-
for (frag = &skb_shinfo(skb)->frags[0];; frag++) {
unsigned int max_data = IDPF_TX_MAX_DESC_DATA_ALIGNED;
- if (dma_mapping_error(tx_q->dev, dma))
- return idpf_tx_dma_map_error(tx_q, skb, first, i);
+ if (unlikely(dma_mapping_error(tx_q->dev, dma))) {
+ idpf_tx_buf_next(tx_buf) = IDPF_TXBUF_NULL;
+ return idpf_tx_splitq_pkt_err_unmap(tx_q, params,
+ first);
+ }
first->nr_frags++;
- idpf_tx_buf_compl_tag(tx_buf) = params->compl_tag;
tx_buf->type = LIBETH_SQE_FRAG;
/* record length, and DMA address */
@@ -2468,29 +2244,12 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q,
max_data);
if (unlikely(++i == tx_q->desc_count)) {
- tx_buf = tx_q->tx_buf;
tx_desc = &tx_q->flex_tx[0];
i = 0;
- tx_q->compl_tag_cur_gen =
- IDPF_TX_ADJ_COMPL_TAG_GEN(tx_q);
} else {
- tx_buf++;
tx_desc++;
}
- /* Since this packet has a buffer that is going to span
- * multiple descriptors, it's going to leave holes in
- * to the TX buffer ring. To ensure these holes do not
- * cause issues in the cleaning routines, we will clear
- * them of any stale data and assign them the same
- * completion tag as the current packet. Then when the
- * packet is being cleaned, the cleaning routines will
- * simply pass over these holes and finish cleaning the
- * rest of the packet.
- */
- tx_buf->type = LIBETH_SQE_EMPTY;
- idpf_tx_buf_compl_tag(tx_buf) = params->compl_tag;
-
/* Adjust the DMA offset and the remaining size of the
* fragment. On the first iteration of this loop,
* max_data will be >= 12K and <= 16K-1. On any
@@ -2515,15 +2274,25 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q,
idpf_tx_splitq_build_desc(tx_desc, params, td_cmd, size);
if (unlikely(++i == tx_q->desc_count)) {
- tx_buf = tx_q->tx_buf;
tx_desc = &tx_q->flex_tx[0];
i = 0;
- tx_q->compl_tag_cur_gen = IDPF_TX_ADJ_COMPL_TAG_GEN(tx_q);
} else {
- tx_buf++;
tx_desc++;
}
+ if (idpf_queue_has(FLOW_SCH_EN, tx_q)) {
+ if (unlikely(!idpf_tx_get_free_buf_id(tx_q->refillq,
+ &next_buf_id))) {
+ idpf_tx_buf_next(tx_buf) = IDPF_TXBUF_NULL;
+ return idpf_tx_splitq_pkt_err_unmap(tx_q, params,
+ first);
+ }
+ } else {
+ next_buf_id = i;
+ }
+ idpf_tx_buf_next(tx_buf) = next_buf_id;
+ tx_buf = &tx_q->tx_buf[next_buf_id];
+
size = skb_frag_size(frag);
data_len -= size;
@@ -2538,6 +2307,7 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q,
/* write last descriptor with RS and EOP bits */
first->rs_idx = i;
+ idpf_tx_buf_next(tx_buf) = IDPF_TXBUF_NULL;
td_cmd |= params->eop_cmd;
idpf_tx_splitq_build_desc(tx_desc, params, td_cmd, size);
i = idpf_tx_splitq_bump_ntu(tx_q, i);
@@ -2746,8 +2516,6 @@ idpf_tx_splitq_get_ctx_desc(struct idpf_tx_queue *txq)
union idpf_flex_tx_ctx_desc *desc;
int i = txq->next_to_use;
- txq->tx_buf[i].type = LIBETH_SQE_CTX;
-
/* grab the next descriptor */
desc = &txq->flex_ctx[i];
txq->next_to_use = idpf_tx_splitq_bump_ntu(txq, i);
@@ -2841,6 +2609,21 @@ static void idpf_tx_set_tstamp_desc(union idpf_flex_tx_ctx_desc *ctx_desc,
#endif /* CONFIG_PTP_1588_CLOCK */
/**
+ * idpf_tx_splitq_need_re - check whether RE bit needs to be set
+ * @tx_q: pointer to Tx queue
+ *
+ * Return: true if RE bit needs to be set, false otherwise
+ */
+static bool idpf_tx_splitq_need_re(struct idpf_tx_queue *tx_q)
+{
+ int gap = tx_q->next_to_use - tx_q->last_re;
+
+ gap += (gap < 0) ? tx_q->desc_count : 0;
+
+ return gap >= IDPF_TX_SPLITQ_RE_MIN_GAP;
+}
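For illustration (made-up numbers), the signed-gap arithmetic above handles ring wraparound as follows: with desc_count = 512, last_re = 500 and next_to_use = 20, gap = 20 - 500 = -480, corrected to -480 + 512 = 32, which is below IDPF_TX_SPLITQ_RE_MIN_GAP (64), so no RE is requested yet; once next_to_use reaches 52 the corrected gap is 64 and the next descriptor carries the RE bit.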
+
+/**
* idpf_tx_splitq_frame - Sends buffer on Tx ring using flex descriptors
* @skb: send buffer
* @tx_q: queue to send buffer on
@@ -2850,13 +2633,16 @@ static void idpf_tx_set_tstamp_desc(union idpf_flex_tx_ctx_desc *ctx_desc,
static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb,
struct idpf_tx_queue *tx_q)
{
- struct idpf_tx_splitq_params tx_params = { };
+ struct idpf_tx_splitq_params tx_params = {
+ .prev_ntu = tx_q->next_to_use,
+ };
union idpf_flex_tx_ctx_desc *ctx_desc;
struct idpf_tx_buf *first;
- unsigned int count;
+ u32 count, buf_count = 1;
int tso, idx;
+ u32 buf_id;
- count = idpf_tx_desc_count_required(tx_q, skb);
+ count = idpf_tx_res_count_required(tx_q, skb, &buf_count);
if (unlikely(!count))
return idpf_tx_drop_skb(tx_q, skb);
@@ -2866,7 +2652,7 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb,
/* Check for splitq specific TX resources */
count += (IDPF_TX_DESCS_PER_CACHE_LINE + tso);
- if (idpf_tx_maybe_stop_splitq(tx_q, count)) {
+ if (idpf_tx_maybe_stop_splitq(tx_q, count, buf_count)) {
idpf_tx_buf_hw_update(tx_q, tx_q->next_to_use, false);
return NETDEV_TX_BUSY;
@@ -2898,36 +2684,47 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb,
idpf_tx_set_tstamp_desc(ctx_desc, idx);
}
- /* record the location of the first descriptor for this packet */
- first = &tx_q->tx_buf[tx_q->next_to_use];
- first->skb = skb;
+ if (idpf_queue_has(FLOW_SCH_EN, tx_q)) {
+ struct idpf_sw_queue *refillq = tx_q->refillq;
- if (tso) {
- first->packets = tx_params.offload.tso_segs;
- first->bytes = skb->len +
- ((first->packets - 1) * tx_params.offload.tso_hdr_len);
- } else {
- first->packets = 1;
- first->bytes = max_t(unsigned int, skb->len, ETH_ZLEN);
- }
+ /* Save refillq state in case of a packet rollback. Otherwise,
+ * the tags will be leaked since they will be popped from the
+ * refillq but never reposted during cleaning.
+ */
+ tx_params.prev_refill_gen =
+ idpf_queue_has(RFL_GEN_CHK, refillq);
+ tx_params.prev_refill_ntc = refillq->next_to_clean;
+
+ if (unlikely(!idpf_tx_get_free_buf_id(tx_q->refillq,
+ &buf_id))) {
+ if (tx_params.prev_refill_gen !=
+ idpf_queue_has(RFL_GEN_CHK, refillq))
+ idpf_queue_change(RFL_GEN_CHK, refillq);
+ refillq->next_to_clean = tx_params.prev_refill_ntc;
+
+ tx_q->next_to_use = tx_params.prev_ntu;
+ return idpf_tx_drop_skb(tx_q, skb);
+ }
+ tx_params.compl_tag = buf_id;
- if (idpf_queue_has(FLOW_SCH_EN, tx_q)) {
tx_params.dtype = IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE;
tx_params.eop_cmd = IDPF_TXD_FLEX_FLOW_CMD_EOP;
- /* Set the RE bit to catch any packets that may have not been
- * stashed during RS completion cleaning. MIN_GAP is set to
- * MIN_RING size to ensure it will be set at least once each
- * time around the ring.
+ /* Set the RE bit to periodically "clean" the descriptor ring.
+ * MIN_GAP is set to MIN_RING size to ensure it will be set at
+ * least once each time around the ring.
*/
- if (!(tx_q->next_to_use % IDPF_TX_SPLITQ_RE_MIN_GAP)) {
+ if (idpf_tx_splitq_need_re(tx_q)) {
tx_params.eop_cmd |= IDPF_TXD_FLEX_FLOW_CMD_RE;
tx_q->txq_grp->num_completions_pending++;
+ tx_q->last_re = tx_q->next_to_use;
}
if (skb->ip_summed == CHECKSUM_PARTIAL)
tx_params.offload.td_cmd |= IDPF_TXD_FLEX_FLOW_CMD_CS_EN;
} else {
+ buf_id = tx_q->next_to_use;
+
tx_params.dtype = IDPF_TX_DESC_DTYPE_FLEX_L2TAG1_L2TAG2;
tx_params.eop_cmd = IDPF_TXD_LAST_DESC_CMD;
@@ -2935,6 +2732,18 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb,
tx_params.offload.td_cmd |= IDPF_TX_FLEX_DESC_CMD_CS_EN;
}
+ first = &tx_q->tx_buf[buf_id];
+ first->skb = skb;
+
+ if (tso) {
+ first->packets = tx_params.offload.tso_segs;
+ first->bytes = skb->len +
+ ((first->packets - 1) * tx_params.offload.tso_hdr_len);
+ } else {
+ first->packets = 1;
+ first->bytes = max_t(unsigned int, skb->len, ETH_ZLEN);
+ }
+
idpf_tx_splitq_map(tx_q, &tx_params, first);
return NETDEV_TX_OK;
@@ -3472,7 +3281,7 @@ payload:
skip_data:
rx_buf->netmem = 0;
- idpf_rx_post_buf_refill(refillq, buf_id);
+ idpf_post_buf_refill(refillq, buf_id);
IDPF_RX_BUMP_NTC(rxq, ntc);
/* skip if it is non EOP desc */
@@ -3580,10 +3389,10 @@ static void idpf_rx_clean_refillq(struct idpf_buf_queue *bufq,
bool failure;
if (idpf_queue_has(RFL_GEN_CHK, refillq) !=
- !!(refill_desc & IDPF_RX_BI_GEN_M))
+ !!(refill_desc & IDPF_RFL_BI_GEN_M))
break;
- buf_id = FIELD_GET(IDPF_RX_BI_BUFID_M, refill_desc);
+ buf_id = FIELD_GET(IDPF_RFL_BI_BUFID_M, refill_desc);
failure = idpf_rx_update_bufq_desc(bufq, buf_id, buf_desc);
if (failure)
break;
diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h
index 281de655a813..52753dff381c 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h
+++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h
@@ -108,8 +108,8 @@ do { \
*/
#define IDPF_TX_SPLITQ_RE_MIN_GAP 64
-#define IDPF_RX_BI_GEN_M BIT(16)
-#define IDPF_RX_BI_BUFID_M GENMASK(15, 0)
+#define IDPF_RFL_BI_GEN_M BIT(16)
+#define IDPF_RFL_BI_BUFID_M GENMASK(15, 0)
#define IDPF_RXD_EOF_SPLITQ VIRTCHNL2_RX_FLEX_DESC_ADV_STATUS0_EOF_M
#define IDPF_RXD_EOF_SINGLEQ VIRTCHNL2_RX_BASE_DESC_STATUS_EOF_M
@@ -118,10 +118,6 @@ do { \
((((txq)->next_to_clean > (txq)->next_to_use) ? 0 : (txq)->desc_count) + \
(txq)->next_to_clean - (txq)->next_to_use - 1)
-#define IDPF_TX_BUF_RSV_UNUSED(txq) ((txq)->stash->buf_stack.top)
-#define IDPF_TX_BUF_RSV_LOW(txq) (IDPF_TX_BUF_RSV_UNUSED(txq) < \
- (txq)->desc_count >> 2)
-
#define IDPF_TX_COMPLQ_OVERFLOW_THRESH(txcq) ((txcq)->desc_count >> 1)
/* Determine the absolute number of completions pending, i.e. the number of
* completions that are expected to arrive on the TX completion queue.
@@ -131,11 +127,7 @@ do { \
0 : U32_MAX) + \
(txq)->num_completions_pending - (txq)->complq->num_completions)
-#define IDPF_TX_SPLITQ_COMPL_TAG_WIDTH 16
-/* Adjust the generation for the completion tag and wrap if necessary */
-#define IDPF_TX_ADJ_COMPL_TAG_GEN(txq) \
- ((++(txq)->compl_tag_cur_gen) >= (txq)->compl_tag_gen_max ? \
- 0 : (txq)->compl_tag_cur_gen)
+#define IDPF_TXBUF_NULL U32_MAX
#define IDPF_TXD_LAST_DESC_CMD (IDPF_TX_DESC_CMD_EOP | IDPF_TX_DESC_CMD_RS)
@@ -153,18 +145,6 @@ union idpf_tx_flex_desc {
#define idpf_tx_buf libeth_sqe
/**
- * struct idpf_buf_lifo - LIFO for managing OOO completions
- * @top: Used to know how many buffers are left
- * @size: Total size of LIFO
- * @bufs: Backing array
- */
-struct idpf_buf_lifo {
- u16 top;
- u16 size;
- struct idpf_tx_stash **bufs;
-};
-
-/**
* struct idpf_tx_offload_params - Offload parameters for a given packet
* @tx_flags: Feature flags enabled for this packet
* @hdr_offsets: Offset parameter for single queue model
@@ -196,6 +176,9 @@ struct idpf_tx_offload_params {
* @compl_tag: Associated tag for completion
* @td_tag: Descriptor tunneling tag
* @offload: Offload parameters
+ * @prev_ntu: stored TxQ next_to_use in case of rollback
+ * @prev_refill_ntc: stored refillq next_to_clean in case of packet rollback
+ * @prev_refill_gen: stored refillq generation bit in case of packet rollback
*/
struct idpf_tx_splitq_params {
enum idpf_tx_desc_dtype_value dtype;
@@ -206,6 +189,10 @@ struct idpf_tx_splitq_params {
};
struct idpf_tx_offload_params offload;
+
+ u16 prev_ntu;
+ u16 prev_refill_ntc;
+ bool prev_refill_gen;
};
enum idpf_tx_ctx_desc_eipt_offload {
@@ -468,17 +455,6 @@ struct idpf_tx_queue_stats {
#define IDPF_DIM_DEFAULT_PROFILE_IX 1
/**
- * struct idpf_txq_stash - Tx buffer stash for Flow-based scheduling mode
- * @buf_stack: Stack of empty buffers to store buffer info for out of order
- * buffer completions. See struct idpf_buf_lifo
- * @sched_buf_hash: Hash table to store buffers
- */
-struct idpf_txq_stash {
- struct idpf_buf_lifo buf_stack;
- DECLARE_HASHTABLE(sched_buf_hash, 12);
-} ____cacheline_aligned;
-
-/**
* struct idpf_rx_queue - software structure representing a receive queue
* @rx: universal receive descriptor array
* @single_buf: buffer descriptor array in singleq
@@ -610,6 +586,8 @@ libeth_cacheline_set_assert(struct idpf_rx_queue, 64,
* @netdev: &net_device corresponding to this queue
* @next_to_use: Next descriptor to use
* @next_to_clean: Next descriptor to clean
+ * @last_re: last descriptor index that RE bit was set
+ * @tx_max_bufs: Max buffers that can be transmitted with scatter-gather

* @cleaned_bytes: Splitq only, TXQ only: When a TX completion is received on
* the TX completion queue, it can be for any TXQ associated
* with that completion queue. This means we can clean up to
@@ -620,11 +598,7 @@ libeth_cacheline_set_assert(struct idpf_rx_queue, 64,
* only once at the end of the cleaning routine.
* @clean_budget: singleq only, queue cleaning budget
* @cleaned_pkts: Number of packets cleaned for the above said case
- * @tx_max_bufs: Max buffers that can be transmitted with scatter-gather
- * @stash: Tx buffer stash for Flow-based scheduling mode
- * @compl_tag_bufid_m: Completion tag buffer id mask
- * @compl_tag_cur_gen: Used to keep track of current completion tag generation
- * @compl_tag_gen_max: To determine when compl_tag_cur_gen should be reset
+ * @refillq: Pointer to refill queue
* @cached_tstamp_caps: Tx timestamp capabilities negotiated with the CP
* @tstamp_task: Work that handles Tx timestamp read
* @stats_sync: See struct u64_stats_sync
@@ -633,6 +607,7 @@ libeth_cacheline_set_assert(struct idpf_rx_queue, 64,
* @size: Length of descriptor ring in bytes
* @dma: Physical address of ring
* @q_vector: Backreference to associated vector
+ * @buf_pool_size: Total number of idpf_tx_buf
*/
struct idpf_tx_queue {
__cacheline_group_begin_aligned(read_mostly);
@@ -654,7 +629,6 @@ struct idpf_tx_queue {
u16 desc_count;
u16 tx_min_pkt_len;
- u16 compl_tag_gen_s;
struct net_device *netdev;
__cacheline_group_end_aligned(read_mostly);
@@ -662,6 +636,8 @@ struct idpf_tx_queue {
__cacheline_group_begin_aligned(read_write);
u16 next_to_use;
u16 next_to_clean;
+ u16 last_re;
+ u16 tx_max_bufs;
union {
u32 cleaned_bytes;
@@ -669,12 +645,7 @@ struct idpf_tx_queue {
};
u16 cleaned_pkts;
- u16 tx_max_bufs;
- struct idpf_txq_stash *stash;
-
- u16 compl_tag_bufid_m;
- u16 compl_tag_cur_gen;
- u16 compl_tag_gen_max;
+ struct idpf_sw_queue *refillq;
struct idpf_ptp_vport_tx_tstamp_caps *cached_tstamp_caps;
struct work_struct *tstamp_task;
@@ -689,11 +660,12 @@ struct idpf_tx_queue {
dma_addr_t dma;
struct idpf_q_vector *q_vector;
+ u32 buf_pool_size;
__cacheline_group_end_aligned(cold);
};
libeth_cacheline_set_assert(struct idpf_tx_queue, 64,
- 112 + sizeof(struct u64_stats_sync),
- 24);
+ 104 + sizeof(struct u64_stats_sync),
+ 32);
/**
* struct idpf_buf_queue - software structure representing a buffer queue
@@ -903,7 +875,6 @@ struct idpf_rxq_group {
* @vport: Vport back pointer
* @num_txq: Number of TX queues associated
* @txqs: Array of TX queue pointers
- * @stashes: array of OOO stashes for the queues
* @complq: Associated completion queue pointer, split queue only
* @num_completions_pending: Total number of completions pending for the
* completion queue, accumulated for all TX queues
@@ -918,7 +889,6 @@ struct idpf_txq_group {
u16 num_txq;
struct idpf_tx_queue *txqs[IDPF_LARGE_MAX_Q];
- struct idpf_txq_stash *stashes;
struct idpf_compl_queue *complq;
@@ -1011,6 +981,17 @@ static inline void idpf_vport_intr_set_wb_on_itr(struct idpf_q_vector *q_vector)
reg->dyn_ctl);
}
+/**
+ * idpf_tx_splitq_get_free_bufs - get number of free buf_ids in refillq
+ * @refillq: pointer to refillq containing buf_ids
+ */
+static inline u32 idpf_tx_splitq_get_free_bufs(struct idpf_sw_queue *refillq)
+{
+ return (refillq->next_to_use > refillq->next_to_clean ?
+ 0 : refillq->desc_count) +
+ refillq->next_to_use - refillq->next_to_clean - 1;
+}
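As a worked example with illustrative values: for a refillq with desc_count = 256, next_to_use = 10 and next_to_clean = 200, next_to_use is not greater than next_to_clean, so the helper returns 256 + 10 - 200 - 1 = 65 free buffer IDs; the trailing -1 follows the usual ring convention of keeping one slot in reserve.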
+
int idpf_vport_singleq_napi_poll(struct napi_struct *napi, int budget);
void idpf_vport_init_num_qs(struct idpf_vport *vport,
struct virtchnl2_create_vport *vport_msg);
@@ -1038,10 +1019,8 @@ void idpf_tx_buf_hw_update(struct idpf_tx_queue *tx_q, u32 val,
bool xmit_more);
unsigned int idpf_size_to_txd_count(unsigned int size);
netdev_tx_t idpf_tx_drop_skb(struct idpf_tx_queue *tx_q, struct sk_buff *skb);
-void idpf_tx_dma_map_error(struct idpf_tx_queue *txq, struct sk_buff *skb,
- struct idpf_tx_buf *first, u16 ring_idx);
-unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq,
- struct sk_buff *skb);
+unsigned int idpf_tx_res_count_required(struct idpf_tx_queue *txq,
+ struct sk_buff *skb, u32 *buf_count);
void idpf_tx_timeout(struct net_device *netdev, unsigned int txqueue);
netdev_tx_t idpf_tx_singleq_frame(struct sk_buff *skb,
struct idpf_tx_queue *tx_q);
diff --git a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c
index a028c69f7fdc..6330d4a0ae07 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c
@@ -3765,6 +3765,16 @@ u32 idpf_get_vport_id(struct idpf_vport *vport)
return le32_to_cpu(vport_msg->vport_id);
}
+static void idpf_set_mac_type(struct idpf_vport *vport,
+ struct virtchnl2_mac_addr *mac_addr)
+{
+ bool is_primary;
+
+ is_primary = ether_addr_equal(vport->default_mac_addr, mac_addr->addr);
+ mac_addr->type = is_primary ? VIRTCHNL2_MAC_ADDR_PRIMARY :
+ VIRTCHNL2_MAC_ADDR_EXTRA;
+}
+
/**
* idpf_mac_filter_async_handler - Async callback for mac filters
* @adapter: private data struct
@@ -3894,6 +3904,7 @@ int idpf_add_del_mac_filters(struct idpf_vport *vport,
list) {
if (add && f->add) {
ether_addr_copy(mac_addr[i].addr, f->macaddr);
+ idpf_set_mac_type(vport, &mac_addr[i]);
i++;
f->add = false;
if (i == total_filters)
@@ -3901,6 +3912,7 @@ int idpf_add_del_mac_filters(struct idpf_vport *vport,
}
if (!add && f->remove) {
ether_addr_copy(mac_addr[i].addr, f->macaddr);
+ idpf_set_mac_type(vport, &mac_addr[i]);
i++;
f->remove = false;
if (i == total_filters)
diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c
index 92ef33459aec..7b8f32c5169a 100644
--- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
@@ -2081,11 +2081,8 @@ static void igb_diag_test(struct net_device *netdev,
} else {
dev_info(&adapter->pdev->dev, "online testing starting\n");
- /* PHY is powered down when interface is down */
- if (if_running && igb_link_test(adapter, &data[TEST_LINK]))
+ if (igb_link_test(adapter, &data[TEST_LINK]))
eth_test->flags |= ETH_TEST_FL_FAILED;
- else
- data[TEST_LINK] = 0;
/* Online tests aren't run; pass by default */
data[TEST_REG] = 0;
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index a9a7a94ae61e..453deb6d14b3 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -4453,8 +4453,7 @@ int igb_setup_rx_resources(struct igb_ring *rx_ring)
if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq))
xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
res = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
- rx_ring->queue_index,
- rx_ring->q_vector->napi.napi_id);
+ rx_ring->queue_index, 0);
if (res < 0) {
dev_err(dev, "Failed to register xdp_rxq index %u\n",
rx_ring->queue_index);
diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
index 266bfcf2a28f..a427f05814c1 100644
--- a/drivers/net/ethernet/intel/igc/igc.h
+++ b/drivers/net/ethernet/intel/igc/igc.h
@@ -345,6 +345,7 @@ struct igc_adapter {
/* LEDs */
struct mutex led_mutex;
struct igc_led_classdev *leds;
+ bool leds_available;
};
void igc_up(struct igc_adapter *adapter);
diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index 458e5eaa92e5..728d7ca5338b 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -7149,6 +7149,13 @@ static int igc_probe(struct pci_dev *pdev,
adapter->port_num = hw->bus.func;
adapter->msg_enable = netif_msg_init(debug, DEFAULT_MSG_ENABLE);
+ /* PCI config space info */
+ hw->vendor_id = pdev->vendor;
+ hw->device_id = pdev->device;
+ hw->revision_id = pdev->revision;
+ hw->subsystem_vendor_id = pdev->subsystem_vendor;
+ hw->subsystem_device_id = pdev->subsystem_device;
+
/* Disable ASPM L1.2 on I226 devices to avoid packet loss */
if (igc_is_device_id_i226(hw))
pci_disable_link_state(pdev, PCIE_LINK_STATE_L1_2);
@@ -7175,13 +7182,6 @@ static int igc_probe(struct pci_dev *pdev,
netdev->mem_start = pci_resource_start(pdev, 0);
netdev->mem_end = pci_resource_end(pdev, 0);
- /* PCI config space info */
- hw->vendor_id = pdev->vendor;
- hw->device_id = pdev->device;
- hw->revision_id = pdev->revision;
- hw->subsystem_vendor_id = pdev->subsystem_vendor;
- hw->subsystem_device_id = pdev->subsystem_device;
-
/* Copy the default MAC and PHY function pointers */
memcpy(&hw->mac.ops, ei->mac_ops, sizeof(hw->mac.ops));
memcpy(&hw->phy.ops, ei->phy_ops, sizeof(hw->phy.ops));
@@ -7335,8 +7335,14 @@ static int igc_probe(struct pci_dev *pdev,
if (IS_ENABLED(CONFIG_IGC_LEDS)) {
err = igc_led_setup(adapter);
- if (err)
- goto err_register;
+ if (err) {
+ netdev_warn_once(netdev,
+ "LED init failed (%d); continuing without LED support\n",
+ err);
+ adapter->leds_available = false;
+ } else {
+ adapter->leds_available = true;
+ }
}
return 0;
@@ -7392,7 +7398,7 @@ static void igc_remove(struct pci_dev *pdev)
cancel_work_sync(&adapter->watchdog_task);
hrtimer_cancel(&adapter->hrtimer);
- if (IS_ENABLED(CONFIG_IGC_LEDS))
+ if (IS_ENABLED(CONFIG_IGC_LEDS) && adapter->leds_available)
igc_led_free(adapter);
/* Release control of h/w to f/w. If f/w is AMT enabled, this
diff --git a/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c
index 54f1b83dfe42..d227f4d2a2d1 100644
--- a/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c
+++ b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c
@@ -543,6 +543,7 @@ int ixgbe_devlink_register_port(struct ixgbe_adapter *adapter)
attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL;
attrs.phys.port_number = adapter->hw.bus.func;
+ attrs.no_phys_port_name = 1;
ixgbe_devlink_set_switch_id(adapter, &attrs.switch_id);
devlink_port_attrs_set(devlink_port, &attrs);
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c
index d74116441d1c..bfeef5b0b99d 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c
@@ -3125,7 +3125,7 @@ static int ixgbe_get_orom_ver_info(struct ixgbe_hw *hw,
if (err)
return err;
- combo_ver = le32_to_cpu(civd.combo_ver);
+ combo_ver = get_unaligned_le32(&civd.combo_ver);
orom->major = (u8)FIELD_GET(IXGBE_OROM_VER_MASK, combo_ver);
orom->patch = (u8)FIELD_GET(IXGBE_OROM_VER_PATCH_MASK, combo_ver);
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index 25c3a09ad7f1..1a2f1bdb91aa 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -3571,13 +3571,13 @@ ixgbe_get_eee_fw(struct ixgbe_adapter *adapter, struct ethtool_keee *edata)
for (i = 0; i < ARRAY_SIZE(ixgbe_ls_map); ++i) {
if (hw->phy.eee_speeds_supported & ixgbe_ls_map[i].mac_speed)
- linkmode_set_bit(ixgbe_lp_map[i].link_mode,
+ linkmode_set_bit(ixgbe_ls_map[i].link_mode,
edata->supported);
}
for (i = 0; i < ARRAY_SIZE(ixgbe_ls_map); ++i) {
if (hw->phy.eee_speeds_advertised & ixgbe_ls_map[i].mac_speed)
- linkmode_set_bit(ixgbe_lp_map[i].link_mode,
+ linkmode_set_bit(ixgbe_ls_map[i].link_mode,
edata->advertised);
}
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 6122a0abb41f..6218bdb7f941 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -968,10 +968,6 @@ static void ixgbe_update_xoff_rx_lfc(struct ixgbe_adapter *adapter)
for (i = 0; i < adapter->num_tx_queues; i++)
clear_bit(__IXGBE_HANG_CHECK_ARMED,
&adapter->tx_ring[i]->state);
-
- for (i = 0; i < adapter->num_xdp_queues; i++)
- clear_bit(__IXGBE_HANG_CHECK_ARMED,
- &adapter->xdp_ring[i]->state);
}
static void ixgbe_update_xoff_received(struct ixgbe_adapter *adapter)
@@ -1214,7 +1210,7 @@ static void ixgbe_pf_handle_tx_hang(struct ixgbe_ring *tx_ring,
struct ixgbe_adapter *adapter = netdev_priv(tx_ring->netdev);
struct ixgbe_hw *hw = &adapter->hw;
- e_err(drv, "Detected Tx Unit Hang%s\n"
+ e_err(drv, "Detected Tx Unit Hang\n"
" Tx Queue <%d>\n"
" TDH, TDT <%x>, <%x>\n"
" next_to_use <%x>\n"
@@ -1222,16 +1218,14 @@ static void ixgbe_pf_handle_tx_hang(struct ixgbe_ring *tx_ring,
"tx_buffer_info[next_to_clean]\n"
" time_stamp <%lx>\n"
" jiffies <%lx>\n",
- ring_is_xdp(tx_ring) ? " (XDP)" : "",
tx_ring->queue_index,
IXGBE_READ_REG(hw, IXGBE_TDH(tx_ring->reg_idx)),
IXGBE_READ_REG(hw, IXGBE_TDT(tx_ring->reg_idx)),
tx_ring->next_to_use, next,
tx_ring->tx_buffer_info[next].time_stamp, jiffies);
- if (!ring_is_xdp(tx_ring))
- netif_stop_subqueue(tx_ring->netdev,
- tx_ring->queue_index);
+ netif_stop_subqueue(tx_ring->netdev,
+ tx_ring->queue_index);
}
/**
@@ -1451,6 +1445,9 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
total_bytes);
adapter->tx_ipsec += total_ipsec;
+ if (ring_is_xdp(tx_ring))
+ return !!budget;
+
if (check_for_tx_hang(tx_ring) && ixgbe_check_tx_hang(tx_ring)) {
if (adapter->hw.mac.type == ixgbe_mac_e610)
ixgbe_handle_mdd_event(adapter, tx_ring);
@@ -1468,9 +1465,6 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
return true;
}
- if (ring_is_xdp(tx_ring))
- return !!budget;
-
#define TX_WAKE_THRESHOLD (DESC_NEEDED * 2)
txq = netdev_get_tx_queue(tx_ring->netdev, tx_ring->queue_index);
if (!__netif_txq_completed_wake(txq, total_packets, total_bytes,
@@ -6979,6 +6973,13 @@ static int ixgbe_sw_init(struct ixgbe_adapter *adapter,
break;
}
+ /* Make sure the SWFW semaphore is in a valid state */
+ if (hw->mac.ops.init_swfw_sync)
+ hw->mac.ops.init_swfw_sync(hw);
+
+ if (hw->mac.type == ixgbe_mac_e610)
+ mutex_init(&hw->aci.lock);
+
#ifdef IXGBE_FCOE
/* FCoE support exists, always init the FCoE lock */
spin_lock_init(&adapter->fcoe.lock);
@@ -7974,12 +7975,9 @@ static void ixgbe_check_hang_subtask(struct ixgbe_adapter *adapter)
return;
/* Force detection of hung controller */
- if (netif_carrier_ok(adapter->netdev)) {
+ if (netif_carrier_ok(adapter->netdev))
for (i = 0; i < adapter->num_tx_queues; i++)
set_check_for_tx_hang(adapter->tx_ring[i]);
- for (i = 0; i < adapter->num_xdp_queues; i++)
- set_check_for_tx_hang(adapter->xdp_ring[i]);
- }
if (!(adapter->flags & IXGBE_FLAG_MSIX_ENABLED)) {
/*
@@ -8199,13 +8197,6 @@ static bool ixgbe_ring_tx_pending(struct ixgbe_adapter *adapter)
return true;
}
- for (i = 0; i < adapter->num_xdp_queues; i++) {
- struct ixgbe_ring *ring = adapter->xdp_ring[i];
-
- if (ring->next_to_use != ring->next_to_clean)
- return true;
- }
-
return false;
}
@@ -11005,6 +10996,10 @@ static int ixgbe_xdp_xmit(struct net_device *dev, int n,
if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state)))
return -ENETDOWN;
+ if (!netif_carrier_ok(adapter->netdev) ||
+ !netif_running(adapter->netdev))
+ return -ENETDOWN;
+
if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
return -EINVAL;
@@ -11655,10 +11650,6 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
if (err)
goto err_sw_init;
- /* Make sure the SWFW semaphore is in a valid state */
- if (hw->mac.ops.init_swfw_sync)
- hw->mac.ops.init_swfw_sync(hw);
-
if (ixgbe_check_fw_error(adapter))
return ixgbe_recovery_probe(adapter);
@@ -11862,8 +11853,6 @@ skip_sriov:
ether_addr_copy(hw->mac.addr, hw->mac.perm_addr);
ixgbe_mac_set_default_filter(adapter);
- if (hw->mac.type == ixgbe_mac_e610)
- mutex_init(&hw->aci.lock);
timer_setup(&adapter->service_timer, ixgbe_service_timer, 0);
if (ixgbe_removed(hw->hw_addr)) {
@@ -12019,9 +12008,9 @@ err_register:
devl_unlock(adapter->devlink);
ixgbe_release_hw_control(adapter);
ixgbe_clear_interrupt_scheme(adapter);
+err_sw_init:
if (hw->mac.type == ixgbe_mac_e610)
mutex_destroy(&adapter->hw.aci.lock);
-err_sw_init:
ixgbe_disable_sriov(adapter);
adapter->flags2 &= ~IXGBE_FLAG2_SEARCH_FOR_SFP;
iounmap(adapter->io_addr);
@@ -12072,10 +12061,8 @@ static void ixgbe_remove(struct pci_dev *pdev)
set_bit(__IXGBE_REMOVING, &adapter->state);
cancel_work_sync(&adapter->service_task);
- if (adapter->hw.mac.type == ixgbe_mac_e610) {
+ if (adapter->hw.mac.type == ixgbe_mac_e610)
ixgbe_disable_link_status_events(adapter);
- mutex_destroy(&adapter->hw.aci.lock);
- }
if (adapter->mii_bus)
mdiobus_unregister(adapter->mii_bus);
@@ -12135,6 +12122,9 @@ static void ixgbe_remove(struct pci_dev *pdev)
disable_dev = !test_and_set_bit(__IXGBE_DISABLED, &adapter->state);
free_netdev(netdev);
+ if (adapter->hw.mac.type == ixgbe_mac_e610)
+ mutex_destroy(&adapter->hw.aci.lock);
+
if (disable_dev)
pci_disable_device(pdev);
}
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h
index d2f22d8558f8..ff8d640a50b1 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h
@@ -932,7 +932,7 @@ struct ixgbe_orom_civd_info {
__le32 combo_ver; /* Combo Image Version number */
u8 combo_name_len; /* Length of the unicode combo image version string, max of 32 */
__le16 combo_name[32]; /* Unicode string representing the Combo Image version */
-};
+} __packed;
/* Function specific capabilities */
struct ixgbe_hw_func_caps {
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
index ac58964b2f08..7b941505a9d0 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
@@ -398,7 +398,7 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget)
dma_addr_t dma;
u32 cmd_type;
- while (budget-- > 0) {
+ while (likely(budget)) {
if (unlikely(!ixgbe_desc_unused(xdp_ring))) {
work_done = false;
break;
@@ -433,6 +433,8 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget)
xdp_ring->next_to_use++;
if (xdp_ring->next_to_use == xdp_ring->count)
xdp_ring->next_to_use = 0;
+
+ budget--;
}
if (tx_desc) {
diff --git a/drivers/net/ethernet/intel/libie/adminq.c b/drivers/net/ethernet/intel/libie/adminq.c
index 55356548e3f0..7b4ff479e7e5 100644
--- a/drivers/net/ethernet/intel/libie/adminq.c
+++ b/drivers/net/ethernet/intel/libie/adminq.c
@@ -6,7 +6,7 @@
static const char * const libie_aq_str_arr[] = {
#define LIBIE_AQ_STR(x) \
- [LIBIE_AQ_RC_##x] = "LIBIE_AQ_RC" #x
+ [LIBIE_AQ_RC_##x] = "LIBIE_AQ_RC_" #x
LIBIE_AQ_STR(OK),
LIBIE_AQ_STR(EPERM),
LIBIE_AQ_STR(ENOENT),
diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c
index 24499bb36c00..bcea3fc26a8c 100644
--- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c
+++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c
@@ -1124,11 +1124,24 @@ static int octep_set_features(struct net_device *dev, netdev_features_t features
return err;
}
+static bool octep_is_vf_valid(struct octep_device *oct, int vf)
+{
+ if (vf >= CFG_GET_ACTIVE_VFS(oct->conf)) {
+ netdev_err(oct->netdev, "Invalid VF ID %d\n", vf);
+ return false;
+ }
+
+ return true;
+}
+
static int octep_get_vf_config(struct net_device *dev, int vf,
struct ifla_vf_info *ivi)
{
struct octep_device *oct = netdev_priv(dev);
+ if (!octep_is_vf_valid(oct, vf))
+ return -EINVAL;
+
ivi->vf = vf;
ether_addr_copy(ivi->mac, oct->vf_info[vf].mac_addr);
ivi->spoofchk = true;
@@ -1143,6 +1156,9 @@ static int octep_set_vf_mac(struct net_device *dev, int vf, u8 *mac)
struct octep_device *oct = netdev_priv(dev);
int err;
+ if (!octep_is_vf_valid(oct, vf))
+ return -EINVAL;
+
if (!is_valid_ether_addr(mac)) {
dev_err(&oct->pdev->dev, "Invalid MAC Address %pM\n", mac);
return -EADDRNOTAVAIL;
diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c b/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c
index ebecdd29f3bd..0867fab61b19 100644
--- a/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c
+++ b/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c
@@ -196,6 +196,7 @@ static void octep_pfvf_get_mac_addr(struct octep_device *oct, u32 vf_id,
vf_id);
return;
}
+ ether_addr_copy(oct->vf_info[vf_id].mac_addr, rsp->s_set_mac.mac_addr);
rsp->s_set_mac.type = OCTEP_PFVF_MBOX_TYPE_RSP_ACK;
}
@@ -205,6 +206,8 @@ static void octep_pfvf_dev_remove(struct octep_device *oct, u32 vf_id,
{
int err;
+ /* Reset VF-specific information maintained by the PF */
+ memset(&oct->vf_info[vf_id], 0, sizeof(struct octep_pfvf_info));
err = octep_ctrl_net_dev_remove(oct, vf_id);
if (err) {
rsp->s.type = OCTEP_PFVF_MBOX_TYPE_RSP_NACK;
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
index 4ff19a04b23e..69324ae09397 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
@@ -21,8 +21,7 @@
#include "rvu.h"
#include "lmac_common.h"
-#define DRV_NAME "Marvell-CGX/RPM"
-#define DRV_STRING "Marvell CGX/RPM Driver"
+#define DRV_NAME "Marvell-CGX-RPM"
#define CGX_RX_STAT_GLOBAL_INDEX 9
@@ -1978,6 +1977,13 @@ static int cgx_probe(struct pci_dev *pdev, const struct pci_device_id *id)
goto err_release_regions;
}
+ if (!is_cn20k(pdev) &&
+ !is_cgx_mapped_to_nix(pdev->subsystem_device, cgx->cgx_id)) {
+ dev_notice(dev, "CGX %d not mapped to NIX, skipping probe\n",
+ cgx->cgx_id);
+ goto err_release_regions;
+ }
+
cgx->lmac_count = cgx->mac_ops->get_nr_lmacs(cgx);
if (!cgx->lmac_count) {
dev_notice(dev, "CGX %d LMAC count is zero, skipping probe\n", cgx->cgx_id);
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
index 7ee1fdeb5295..18c7bb39dbc7 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
@@ -783,6 +783,20 @@ static inline bool is_cn10kb(struct rvu *rvu)
return false;
}
+static inline bool is_cgx_mapped_to_nix(unsigned short id, u8 cgx_id)
+{
+ /* On CNF10KA and CNF10KB silicons only two CGX blocks are connected
+ * to NIX.
+ */
+ if (id == PCI_SUBSYS_DEVID_CNF10K_A || id == PCI_SUBSYS_DEVID_CNF10K_B)
+ return cgx_id <= 1;
+
+ return !(cgx_id && !(id == PCI_SUBSYS_DEVID_96XX ||
+ id == PCI_SUBSYS_DEVID_98XX ||
+ id == PCI_SUBSYS_DEVID_CN10K_A ||
+ id == PCI_SUBSYS_DEVID_CN10K_B));
+}
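Spelled out, the final return above is equivalent to !cgx_id || id == PCI_SUBSYS_DEVID_96XX || id == PCI_SUBSYS_DEVID_98XX || id == PCI_SUBSYS_DEVID_CN10K_A || id == PCI_SUBSYS_DEVID_CN10K_B: on those four silicons every CGX block is treated as mapped to NIX, while on anything else (apart from the CNF10K cases handled just before) only CGX 0 is.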
+
static inline bool is_rvu_npc_hash_extract_en(struct rvu *rvu)
{
u64 npc_const3;
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c
index 1b765045aa63..b56395ac5a74 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c
@@ -606,8 +606,8 @@ static void npc_set_features(struct rvu *rvu, int blkaddr, u8 intf)
if (!npc_check_field(rvu, blkaddr, NPC_LB, intf))
*features &= ~BIT_ULL(NPC_OUTER_VID);
- /* Set SPI flag only if AH/ESP and IPSEC_SPI are in the key */
- if (npc_check_field(rvu, blkaddr, NPC_IPSEC_SPI, intf) &&
+	/* Allow extracting the SPI field from AH and ESP headers at the same offset */
+ if (npc_is_field_present(rvu, NPC_IPSEC_SPI, intf) &&
(*features & (BIT_ULL(NPC_IPPROTO_ESP) | BIT_ULL(NPC_IPPROTO_AH))))
*features |= BIT_ULL(NPC_IPSEC_SPI);
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
index f674729124e6..aff17c37ddde 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
@@ -124,7 +124,9 @@ void otx2_get_dev_stats(struct otx2_nic *pfvf)
dev_stats->rx_ucast_frames;
dev_stats->tx_bytes = OTX2_GET_TX_STATS(TX_OCTS);
- dev_stats->tx_drops = OTX2_GET_TX_STATS(TX_DROP);
+ dev_stats->tx_drops = OTX2_GET_TX_STATS(TX_DROP) +
+ (unsigned long)atomic_long_read(&dev_stats->tx_discards);
+
dev_stats->tx_bcast_frames = OTX2_GET_TX_STATS(TX_BCAST);
dev_stats->tx_mcast_frames = OTX2_GET_TX_STATS(TX_MCAST);
dev_stats->tx_ucast_frames = OTX2_GET_TX_STATS(TX_UCAST);
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
index e3765b73c434..1c8a3c078a64 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
@@ -153,6 +153,7 @@ struct otx2_dev_stats {
u64 tx_bcast_frames;
u64 tx_mcast_frames;
u64 tx_drops;
+ atomic_long_t tx_discards;
};
/* Driver counted stats */
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
index b23585c5e5c2..5027fae0aa77 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
@@ -2220,6 +2220,7 @@ static netdev_tx_t otx2_xmit(struct sk_buff *skb, struct net_device *netdev)
{
struct otx2_nic *pf = netdev_priv(netdev);
int qidx = skb_get_queue_mapping(skb);
+ struct otx2_dev_stats *dev_stats;
struct otx2_snd_queue *sq;
struct netdev_queue *txq;
int sq_idx;
@@ -2232,6 +2233,8 @@ static netdev_tx_t otx2_xmit(struct sk_buff *skb, struct net_device *netdev)
/* Check for minimum and maximum packet length */
if (skb->len <= ETH_HLEN ||
(!skb_shinfo(skb)->gso_size && skb->len > pf->tx_max_pktlen)) {
+ dev_stats = &pf->hw.dev_stats;
+ atomic_long_inc(&dev_stats->tx_discards);
dev_kfree_skb(skb);
return NETDEV_TX_OK;
}
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c
index e52cc6b1a26c..dedd586ed310 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c
@@ -491,7 +491,7 @@ void otx2_ptp_destroy(struct otx2_nic *pfvf)
if (!ptp)
return;
- cancel_delayed_work(&pfvf->ptp->synctstamp_work);
+ cancel_delayed_work_sync(&pfvf->ptp->synctstamp_work);
ptp_clock_unregister(ptp->ptp_clock);
kfree(ptp);
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c
index 5f80b23c5335..26a08d2cfbb1 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c
@@ -1326,7 +1326,6 @@ static int otx2_tc_add_flow(struct otx2_nic *nic,
free_leaf:
otx2_tc_del_from_flow_list(flow_cfg, new_node);
- kfree_rcu(new_node, rcu);
if (new_node->is_act_police) {
mutex_lock(&nic->mbox.lock);
@@ -1346,6 +1345,7 @@ free_leaf:
mutex_unlock(&nic->mbox.lock);
}
+ kfree_rcu(new_node, rcu);
return rc;
}
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c
index 5589fccd370b..7ebb6e656884 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c
@@ -417,9 +417,19 @@ static netdev_tx_t otx2vf_xmit(struct sk_buff *skb, struct net_device *netdev)
{
struct otx2_nic *vf = netdev_priv(netdev);
int qidx = skb_get_queue_mapping(skb);
+ struct otx2_dev_stats *dev_stats;
struct otx2_snd_queue *sq;
struct netdev_queue *txq;
+ /* Check for minimum and maximum packet length */
+ if (skb->len <= ETH_HLEN ||
+ (!skb_shinfo(skb)->gso_size && skb->len > vf->tx_max_pktlen)) {
+ dev_stats = &vf->hw.dev_stats;
+ atomic_long_inc(&dev_stats->tx_discards);
+ dev_kfree_skb(skb);
+ return NETDEV_TX_OK;
+ }
+
sq = &vf->qset.sq[qidx];
txq = netdev_get_tx_queue(netdev, qidx);
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/rep.c b/drivers/net/ethernet/marvell/octeontx2/nic/rep.c
index 25af98034e2e..b476733a0234 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/rep.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/rep.c
@@ -371,7 +371,8 @@ static void rvu_rep_get_stats(struct work_struct *work)
stats->rx_mcast_frames = rsp->rx.mcast;
stats->tx_bytes = rsp->tx.octs;
stats->tx_frames = rsp->tx.ucast + rsp->tx.bcast + rsp->tx.mcast;
- stats->tx_drops = rsp->tx.drop;
+ stats->tx_drops = rsp->tx.drop +
+ (unsigned long)atomic_long_read(&stats->tx_discards);
exit:
mutex_unlock(&priv->mbox.lock);
}
@@ -418,6 +419,16 @@ static netdev_tx_t rvu_rep_xmit(struct sk_buff *skb, struct net_device *dev)
struct otx2_nic *pf = rep->mdev;
struct otx2_snd_queue *sq;
struct netdev_queue *txq;
+ struct rep_stats *stats;
+
+ /* Check for minimum and maximum packet length */
+ if (skb->len <= ETH_HLEN ||
+ (!skb_shinfo(skb)->gso_size && skb->len > pf->tx_max_pktlen)) {
+ stats = &rep->stats;
+ atomic_long_inc(&stats->tx_discards);
+ dev_kfree_skb(skb);
+ return NETDEV_TX_OK;
+ }
sq = &pf->qset.sq[rep->rep_id];
txq = netdev_get_tx_queue(dev, 0);
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/rep.h b/drivers/net/ethernet/marvell/octeontx2/nic/rep.h
index 38446b3e4f13..5bc9e2c7d800 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/rep.h
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/rep.h
@@ -27,6 +27,7 @@ struct rep_stats {
u64 tx_bytes;
u64 tx_frames;
u64 tx_drops;
+ atomic_long_t tx_discards;
};
struct rep_dev {
diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index 5a5fcde76dc0..e68997a29191 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -1761,6 +1761,13 @@ static netdev_tx_t mtk_start_xmit(struct sk_buff *skb, struct net_device *dev)
bool gso = false;
int tx_num;
+ if (skb_vlan_tag_present(skb) &&
+ !eth_proto_is_802_3(eth_hdr(skb)->h_proto)) {
+ skb = __vlan_hwaccel_push_inside(skb);
+ if (!skb)
+ goto dropped;
+ }
+
/* normally we can rely on the stack not calling this more than once,
* however we have 2 queues running on the same ring so we need to lock
* the ring access
@@ -1806,8 +1813,9 @@ static netdev_tx_t mtk_start_xmit(struct sk_buff *skb, struct net_device *dev)
drop:
spin_unlock(&eth->page_lock);
- stats->tx_dropped++;
dev_kfree_skb_any(skb);
+dropped:
+ stats->tx_dropped++;
return NETDEV_TX_OK;
}
diff --git a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
index c855fb799ce1..e9bd32741983 100644
--- a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
+++ b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
@@ -101,7 +101,9 @@ mtk_flow_get_wdma_info(struct net_device *dev, const u8 *addr, struct mtk_wdma_i
if (!IS_ENABLED(CONFIG_NET_MEDIATEK_SOC_WED))
return -1;
+ rcu_read_lock();
err = dev_fill_forward_path(dev, addr, &stack);
+ rcu_read_unlock();
if (err)
return err;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 92a16ddb7d86..13666d50b90f 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -267,8 +267,10 @@ int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
pp.dma_dir = priv->dma_dir;
ring->pp = page_pool_create(&pp);
- if (!ring->pp)
+ if (IS_ERR(ring->pp)) {
+ err = PTR_ERR(ring->pp);
goto err_ring;
+ }
if (xdp_rxq_info_reg(&ring->xdp_rxq, priv->dev, queue_index, 0) < 0)
goto err_pp;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
index 3ffa3fbacd16..2c0e0c16ca90 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
@@ -160,7 +160,7 @@ static int mlx5_devlink_reload_fw_activate(struct devlink *devlink, struct netli
if (err)
return err;
- mlx5_unload_one_devl_locked(dev, true);
+ mlx5_sync_reset_unload_flow(dev, true);
err = mlx5_health_wait_pci_up(dev);
if (err)
NL_SET_ERR_MSG_MOD(extack, "FW activate aborted, PCI reads fail after reset");
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h b/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h
index b59aee75de94..2c98a5299df3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h
@@ -26,7 +26,6 @@ struct mlx5e_dcbx {
u8 cap;
/* Buffer configuration */
- bool manual_buffer;
u32 cable_len;
u32 xoff;
u16 port_buff_cell_sz;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h
index 9560fcba643f..ac65e3191480 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h
@@ -92,6 +92,7 @@ enum {
MLX5E_ACCEL_FS_ESP_FT_LEVEL = MLX5E_INNER_TTC_FT_LEVEL + 1,
MLX5E_ACCEL_FS_ESP_FT_ERR_LEVEL,
MLX5E_ACCEL_FS_POL_FT_LEVEL,
+ MLX5E_ACCEL_FS_POL_MISS_FT_LEVEL,
MLX5E_ACCEL_FS_ESP_FT_ROCE_LEVEL,
#endif
};
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c
index 5ae787656a7c..4720523813b9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c
@@ -272,8 +272,8 @@ static int port_update_shared_buffer(struct mlx5_core_dev *mdev,
/* Total shared buffer size is split in a ratio of 3:1 between
* lossy and lossless pools respectively.
*/
- lossy_epool_size = (shared_buffer_size / 4) * 3;
lossless_ipool_size = shared_buffer_size / 4;
+ lossy_epool_size = shared_buffer_size - lossless_ipool_size;
mlx5e_port_set_sbpr(mdev, 0, MLX5_EGRESS_DIR, MLX5_LOSSY_POOL, 0,
lossy_epool_size);
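A quick illustration of why the lossy pool is now derived by subtraction (numbers invented for the example): with shared_buffer_size = 1022 cells, the previous (size / 4) * 3 computation gave 255 * 3 = 765 lossy cells and 255 lossless cells, leaving 2 cells of the shared buffer unassigned; computing the lossless pool first (255) and then the lossy pool as 1022 - 255 = 767 assigns every cell while preserving the intended 3:1 ratio.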
@@ -288,14 +288,12 @@ static int port_set_buffer(struct mlx5e_priv *priv,
u16 port_buff_cell_sz = priv->dcbx.port_buff_cell_sz;
struct mlx5_core_dev *mdev = priv->mdev;
int sz = MLX5_ST_SZ_BYTES(pbmc_reg);
- u32 new_headroom_size = 0;
- u32 current_headroom_size;
+ u32 current_headroom_cells = 0;
+ u32 new_headroom_cells = 0;
void *in;
int err;
int i;
- current_headroom_size = port_buffer->headroom_size;
-
in = kzalloc(sz, GFP_KERNEL);
if (!in)
return -ENOMEM;
@@ -306,12 +304,14 @@ static int port_set_buffer(struct mlx5e_priv *priv,
for (i = 0; i < MLX5E_MAX_NETWORK_BUFFER; i++) {
void *buffer = MLX5_ADDR_OF(pbmc_reg, in, buffer[i]);
+ current_headroom_cells += MLX5_GET(bufferx_reg, buffer, size);
+
u64 size = port_buffer->buffer[i].size;
u64 xoff = port_buffer->buffer[i].xoff;
u64 xon = port_buffer->buffer[i].xon;
- new_headroom_size += size;
do_div(size, port_buff_cell_sz);
+ new_headroom_cells += size;
do_div(xoff, port_buff_cell_sz);
do_div(xon, port_buff_cell_sz);
MLX5_SET(bufferx_reg, buffer, size, size);
@@ -320,10 +320,8 @@ static int port_set_buffer(struct mlx5e_priv *priv,
MLX5_SET(bufferx_reg, buffer, xon_threshold, xon);
}
- new_headroom_size /= port_buff_cell_sz;
- current_headroom_size /= port_buff_cell_sz;
- err = port_update_shared_buffer(priv->mdev, current_headroom_size,
- new_headroom_size);
+ err = port_update_shared_buffer(priv->mdev, current_headroom_cells,
+ new_headroom_cells);
if (err)
goto out;
@@ -577,7 +575,6 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv,
if (err)
return err;
}
- priv->dcbx.xoff = xoff;
/* Apply the settings */
if (update_buffer) {
@@ -586,6 +583,8 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv,
return err;
}
+ priv->dcbx.xoff = xoff;
+
if (update_prio2buffer)
err = mlx5e_port_set_priority2buffer(priv->mdev, prio2buffer);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h
index f4a19ffbb641..66d276a1be83 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h
@@ -66,11 +66,23 @@ struct mlx5e_port_buffer {
struct mlx5e_bufferx_reg buffer[MLX5E_MAX_NETWORK_BUFFER];
};
+#ifdef CONFIG_MLX5_CORE_EN_DCB
int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv,
u32 change, unsigned int mtu,
struct ieee_pfc *pfc,
u32 *buffer_size,
u8 *prio2buffer);
+#else
+static inline int
+mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv,
+ u32 change, unsigned int mtu,
+ void *pfc,
+ u32 *buffer_size,
+ u8 *prio2buffer)
+{
+ return 0;
+}
+#endif
int mlx5e_port_query_buffer(struct mlx5e_priv *priv,
struct mlx5e_port_buffer *port_buffer);
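Editorial aside, not part of the patch: the port_buffer.h hunk adds a static inline stub for builds without CONFIG_MLX5_CORE_EN_DCB so call sites need no #ifdefs. A minimal compilable sketch of that Kconfig-stub pattern follows; FEATURE_X and feature_x_configure() are hypothetical.

#include <stdio.h>

/* FEATURE_X stands in for a compile-time configuration switch. */
#ifdef FEATURE_X
int feature_x_configure(int value);	/* real implementation lives elsewhere */
#else
static inline int feature_x_configure(int value)
{
	(void)value;
	return 0;	/* feature compiled out: report success, do nothing */
}
#endif

int main(void)
{
	/* The call site builds the same way whether or not FEATURE_X is set. */
	printf("configure -> %d\n", feature_x_configure(42));
	return 0;
}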
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_hmfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_hmfs.c
index a4263137fef5..01d522b02947 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_hmfs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_hmfs.c
@@ -173,6 +173,8 @@ static void mlx5_ct_fs_hmfs_fill_rule_actions(struct mlx5_ct_fs_hmfs *fs_hmfs,
memset(rule_actions, 0, NUM_CT_HMFS_RULES * sizeof(*rule_actions));
rule_actions[0].action = mlx5_fc_get_hws_action(fs_hmfs->ctx, attr->counter);
+ rule_actions[0].counter.offset =
+ attr->counter->id - attr->counter->bulk->base_id;
/* Modify header is special, it may require extra arguments outside the action itself. */
if (mh_action->mh_data) {
rule_actions[1].modify_header.offset = mh_action->mh_data->offset;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h
index ffcd0cdeb775..23703f28386a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h
@@ -185,6 +185,7 @@ struct mlx5e_ipsec_rx_create_attr {
u32 family;
int prio;
int pol_level;
+ int pol_miss_level;
int sa_level;
int status_level;
enum mlx5_flow_namespace_type chains_ns;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c
index 98b6a3a623f9..65dc3529283b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c
@@ -747,6 +747,7 @@ static void ipsec_rx_create_attr_set(struct mlx5e_ipsec *ipsec,
attr->family = family;
attr->prio = MLX5E_NIC_PRIO;
attr->pol_level = MLX5E_ACCEL_FS_POL_FT_LEVEL;
+ attr->pol_miss_level = MLX5E_ACCEL_FS_POL_MISS_FT_LEVEL;
attr->sa_level = MLX5E_ACCEL_FS_ESP_FT_LEVEL;
attr->status_level = MLX5E_ACCEL_FS_ESP_FT_ERR_LEVEL;
attr->chains_ns = MLX5_FLOW_NAMESPACE_KERNEL;
@@ -833,7 +834,7 @@ static int ipsec_rx_chains_create_miss(struct mlx5e_ipsec *ipsec,
ft_attr.max_fte = 1;
ft_attr.autogroup.max_num_groups = 1;
- ft_attr.level = attr->pol_level;
+ ft_attr.level = attr->pol_miss_level;
ft_attr.prio = attr->prio;
ft = mlx5_create_auto_grouped_flow_table(attr->ns, &ft_attr);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
index 5fe016e477b3..d166c0d5189e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
@@ -362,6 +362,7 @@ static int mlx5e_dcbnl_ieee_getpfc(struct net_device *dev,
static int mlx5e_dcbnl_ieee_setpfc(struct net_device *dev,
struct ieee_pfc *pfc)
{
+ u8 buffer_ownership = MLX5_BUF_OWNERSHIP_UNKNOWN;
struct mlx5e_priv *priv = netdev_priv(dev);
struct mlx5_core_dev *mdev = priv->mdev;
u32 old_cable_len = priv->dcbx.cable_len;
@@ -389,7 +390,14 @@ static int mlx5e_dcbnl_ieee_setpfc(struct net_device *dev,
if (MLX5_BUFFER_SUPPORTED(mdev)) {
pfc_new.pfc_en = (changed & MLX5E_PORT_BUFFER_PFC) ? pfc->pfc_en : curr_pfc_en;
- if (priv->dcbx.manual_buffer)
+ ret = mlx5_query_port_buffer_ownership(mdev,
+ &buffer_ownership);
+ if (ret)
+ netdev_err(dev,
+ "%s, Failed to get buffer ownership: %d\n",
+ __func__, ret);
+
+ if (buffer_ownership == MLX5_BUF_OWNERSHIP_SW_OWNED)
ret = mlx5e_port_manual_buffer_config(priv, changed,
dev->mtu, &pfc_new,
NULL, NULL);
@@ -982,7 +990,6 @@ static int mlx5e_dcbnl_setbuffer(struct net_device *dev,
if (!changed)
return 0;
- priv->dcbx.manual_buffer = true;
err = mlx5e_port_manual_buffer_config(priv, changed, dev->mtu, NULL,
buffer_size, prio2buffer);
return err;
@@ -1252,7 +1259,6 @@ void mlx5e_dcbnl_initialize(struct mlx5e_priv *priv)
priv->dcbx.cap |= DCB_CAP_DCBX_HOST;
priv->dcbx.port_buff_cell_sz = mlx5e_query_port_buffers_cell_size(priv);
- priv->dcbx.manual_buffer = false;
priv->dcbx.cable_len = MLX5E_DEFAULT_CABLE_LEN;
mlx5e_ets_init(priv);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 21bb88c5d3dc..15eded36b872 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -49,6 +49,7 @@
#include "en.h"
#include "en/dim.h"
#include "en/txrx.h"
+#include "en/port_buffer.h"
#include "en_tc.h"
#include "en_rep.h"
#include "en_accel/ipsec.h"
@@ -3040,9 +3041,11 @@ int mlx5e_set_dev_port_mtu(struct mlx5e_priv *priv)
struct mlx5e_params *params = &priv->channels.params;
struct net_device *netdev = priv->netdev;
struct mlx5_core_dev *mdev = priv->mdev;
- u16 mtu;
+ u16 mtu, prev_mtu;
int err;
+ mlx5e_query_mtu(mdev, params, &prev_mtu);
+
err = mlx5e_set_mtu(mdev, params, params->sw_mtu);
if (err)
return err;
@@ -3052,6 +3055,18 @@ int mlx5e_set_dev_port_mtu(struct mlx5e_priv *priv)
netdev_warn(netdev, "%s: VPort MTU %d is different than netdev mtu %d\n",
__func__, mtu, params->sw_mtu);
+ if (mtu != prev_mtu && MLX5_BUFFER_SUPPORTED(mdev)) {
+ err = mlx5e_port_manual_buffer_config(priv, 0, mtu,
+ NULL, NULL, NULL);
+ if (err) {
+ netdev_warn(netdev, "%s: Failed to set Xon/Xoff values with MTU %d (err %d), setting back to previous MTU %d\n",
+ __func__, mtu, err, prev_mtu);
+
+ mlx5e_set_mtu(mdev, params, prev_mtu);
+ return err;
+ }
+ }
+
params->sw_mtu = mtu;
return 0;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 63a7a788fb0d..cd0242eb008c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -1506,12 +1506,21 @@ static const struct mlx5e_profile mlx5e_uplink_rep_profile = {
static int
mlx5e_vport_uplink_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
{
- struct mlx5e_priv *priv = netdev_priv(mlx5_uplink_netdev_get(dev));
struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep);
+ struct net_device *netdev;
+ struct mlx5e_priv *priv;
+ int err;
+
+ netdev = mlx5_uplink_netdev_get(dev);
+ if (!netdev)
+ return 0;
+ priv = netdev_priv(netdev);
rpriv->netdev = priv->netdev;
- return mlx5e_netdev_change_profile(priv, &mlx5e_uplink_rep_profile,
- rpriv);
+ err = mlx5e_netdev_change_profile(priv, &mlx5e_uplink_rep_profile,
+ rpriv);
+ mlx5_uplink_netdev_put(dev, netdev);
+ return err;
}
static void
@@ -1638,8 +1647,16 @@ mlx5e_vport_rep_unload(struct mlx5_eswitch_rep *rep)
{
struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep);
struct net_device *netdev = rpriv->netdev;
- struct mlx5e_priv *priv = netdev_priv(netdev);
- void *ppriv = priv->ppriv;
+ struct mlx5e_priv *priv;
+ void *ppriv;
+
+ if (!netdev) {
+ ppriv = rpriv;
+ goto free_ppriv;
+ }
+
+ priv = netdev_priv(netdev);
+ ppriv = priv->ppriv;
if (rep->vport == MLX5_VPORT_UPLINK) {
mlx5e_vport_uplink_rep_unload(rpriv);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
index 87536f158d07..c6185ddba04b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
@@ -1466,6 +1466,7 @@ static void fec_set_block_stats(struct mlx5e_priv *priv,
case MLX5E_FEC_RS_528_514:
case MLX5E_FEC_RS_544_514:
case MLX5E_FEC_LLRS_272_257_1:
+ case MLX5E_FEC_RS_544_514_INTERLEAVED_QUAD:
fec_set_rs_stats(fec_stats, out);
return;
case MLX5E_FEC_FIRECODE:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c
index b7102e14d23d..c33accadae0f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c
@@ -47,10 +47,12 @@ static void mlx5_esw_offloads_pf_vf_devlink_port_attrs_set(struct mlx5_eswitch *
devlink_port_attrs_pci_vf_set(dl_port, controller_num, pfnum,
vport_num - 1, external);
} else if (mlx5_core_is_ec_vf_vport(esw->dev, vport_num)) {
+ u16 base_vport = mlx5_core_ec_vf_vport_base(dev);
+
memcpy(dl_port->attrs.switch_id.id, ppid.id, ppid.id_len);
dl_port->attrs.switch_id.id_len = ppid.id_len;
devlink_port_attrs_pci_vf_set(dl_port, 0, pfnum,
- vport_num - 1, false);
+ vport_num - base_vport, false);
}
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
index 91d863c8c152..5f2d6c35f1ad 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
@@ -102,6 +102,8 @@ struct mlx5_esw_sched_node {
u8 level;
/* Valid only when this node represents a traffic class. */
u8 tc;
+ /* Valid only for a TC arbiter node or vport TC arbiter. */
+ u32 tc_bw[DEVLINK_RATE_TCS_MAX];
};
static void esw_qos_node_attach_to_parent(struct mlx5_esw_sched_node *node)
@@ -462,6 +464,7 @@ static int
esw_qos_vport_create_sched_element(struct mlx5_esw_sched_node *vport_node,
struct netlink_ext_ack *extack)
{
+ struct mlx5_esw_sched_node *parent = vport_node->parent;
u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {};
struct mlx5_core_dev *dev = vport_node->esw->dev;
void *attr;
@@ -477,7 +480,7 @@ esw_qos_vport_create_sched_element(struct mlx5_esw_sched_node *vport_node,
attr = MLX5_ADDR_OF(scheduling_context, sched_ctx, element_attributes);
MLX5_SET(vport_element, attr, vport_number, vport_node->vport->vport);
MLX5_SET(scheduling_context, sched_ctx, parent_element_id,
- vport_node->parent->ix);
+ parent ? parent->ix : vport_node->esw->qos.root_tsar_ix);
MLX5_SET(scheduling_context, sched_ctx, max_average_bw,
vport_node->max_rate);
@@ -608,10 +611,7 @@ static void
esw_qos_tc_arbiter_get_bw_shares(struct mlx5_esw_sched_node *tc_arbiter_node,
u32 *tc_bw)
{
- struct mlx5_esw_sched_node *vports_tc_node;
-
- list_for_each_entry(vports_tc_node, &tc_arbiter_node->children, entry)
- tc_bw[vports_tc_node->tc] = vports_tc_node->bw_share;
+ memcpy(tc_bw, tc_arbiter_node->tc_bw, sizeof(tc_arbiter_node->tc_bw));
}
static void
@@ -628,6 +628,7 @@ esw_qos_set_tc_arbiter_bw_shares(struct mlx5_esw_sched_node *tc_arbiter_node,
u8 tc = vports_tc_node->tc;
u32 bw_share;
+ tc_arbiter_node->tc_bw[tc] = tc_bw[tc];
bw_share = tc_bw[tc] * fw_max_bw_share;
bw_share = esw_qos_calc_bw_share(bw_share, divider,
fw_max_bw_share);
@@ -786,48 +787,15 @@ static int esw_qos_create(struct mlx5_eswitch *esw, struct netlink_ext_ack *exta
return err;
}
- if (MLX5_CAP_QOS(dev, log_esw_max_sched_depth)) {
- esw->qos.node0 = __esw_qos_create_vports_sched_node(esw, NULL, extack);
- } else {
- /* The eswitch doesn't support scheduling nodes.
- * Create a software-only node0 using the root TSAR to attach vport QoS to.
- */
- if (!__esw_qos_alloc_node(esw,
- esw->qos.root_tsar_ix,
- SCHED_NODE_TYPE_VPORTS_TSAR,
- NULL))
- esw->qos.node0 = ERR_PTR(-ENOMEM);
- else
- list_add_tail(&esw->qos.node0->entry,
- &esw->qos.domain->nodes);
- }
- if (IS_ERR(esw->qos.node0)) {
- err = PTR_ERR(esw->qos.node0);
- esw_warn(dev, "E-Switch create rate node 0 failed (%d)\n", err);
- goto err_node0;
- }
refcount_set(&esw->qos.refcnt, 1);
return 0;
-
-err_node0:
- if (mlx5_destroy_scheduling_element_cmd(esw->dev, SCHEDULING_HIERARCHY_E_SWITCH,
- esw->qos.root_tsar_ix))
- esw_warn(esw->dev, "E-Switch destroy root TSAR failed.\n");
-
- return err;
}
static void esw_qos_destroy(struct mlx5_eswitch *esw)
{
int err;
- if (esw->qos.node0->ix != esw->qos.root_tsar_ix)
- __esw_qos_destroy_node(esw->qos.node0, NULL);
- else
- __esw_qos_free_node(esw->qos.node0);
- esw->qos.node0 = NULL;
-
err = mlx5_destroy_scheduling_element_cmd(esw->dev,
SCHEDULING_HIERARCHY_E_SWITCH,
esw->qos.root_tsar_ix);
@@ -990,13 +958,16 @@ esw_qos_vport_tc_enable(struct mlx5_vport *vport, enum sched_node_type type,
struct netlink_ext_ack *extack)
{
struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node;
- int err, new_level, max_level;
+ struct mlx5_esw_sched_node *parent = vport_node->parent;
+ int err;
if (type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) {
+ int new_level, max_level;
+
/* Increase the parent's level by 2 to account for both the
* TC arbiter and the vports TC scheduling element.
*/
- new_level = vport_node->parent->level + 2;
+ new_level = (parent ? parent->level : 2) + 2;
max_level = 1 << MLX5_CAP_QOS(vport_node->esw->dev,
log_esw_max_sched_depth);
if (new_level > max_level) {
@@ -1033,9 +1004,7 @@ esw_qos_vport_tc_enable(struct mlx5_vport *vport, enum sched_node_type type,
err_sched_nodes:
if (type == SCHED_NODE_TYPE_RATE_LIMITER) {
esw_qos_node_destroy_sched_element(vport_node, NULL);
- list_add_tail(&vport_node->entry,
- &vport_node->parent->children);
- vport_node->level = vport_node->parent->level + 1;
+ esw_qos_node_attach_to_parent(vport_node);
} else {
esw_qos_tc_arbiter_scheduling_teardown(vport_node, NULL);
}
@@ -1083,7 +1052,6 @@ err_out:
static void esw_qos_vport_disable(struct mlx5_vport *vport, struct netlink_ext_ack *extack)
{
struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node;
- struct mlx5_esw_sched_node *parent = vport_node->parent;
enum sched_node_type curr_type = vport_node->type;
if (curr_type == SCHED_NODE_TYPE_VPORT)
@@ -1092,8 +1060,9 @@ static void esw_qos_vport_disable(struct mlx5_vport *vport, struct netlink_ext_a
esw_qos_vport_tc_disable(vport, extack);
vport_node->bw_share = 0;
+ memset(vport_node->tc_bw, 0, sizeof(vport_node->tc_bw));
list_del_init(&vport_node->entry);
- esw_qos_normalize_min_rate(parent->esw, parent, extack);
+ esw_qos_normalize_min_rate(vport_node->esw, vport_node->parent, extack);
trace_mlx5_esw_vport_qos_destroy(vport_node->esw->dev, vport);
}
@@ -1103,25 +1072,23 @@ static int esw_qos_vport_enable(struct mlx5_vport *vport,
struct mlx5_esw_sched_node *parent,
struct netlink_ext_ack *extack)
{
+ struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node;
int err;
esw_assert_qos_lock_held(vport->dev->priv.eswitch);
- esw_qos_node_set_parent(vport->qos.sched_node, parent);
- if (type == SCHED_NODE_TYPE_VPORT) {
- err = esw_qos_vport_create_sched_element(vport->qos.sched_node,
- extack);
- } else {
+ esw_qos_node_set_parent(vport_node, parent);
+ if (type == SCHED_NODE_TYPE_VPORT)
+ err = esw_qos_vport_create_sched_element(vport_node, extack);
+ else
err = esw_qos_vport_tc_enable(vport, type, extack);
- }
if (err)
return err;
- vport->qos.sched_node->type = type;
- esw_qos_normalize_min_rate(parent->esw, parent, extack);
- trace_mlx5_esw_vport_qos_create(vport->dev, vport,
- vport->qos.sched_node->max_rate,
- vport->qos.sched_node->bw_share);
+ vport_node->type = type;
+ esw_qos_normalize_min_rate(vport_node->esw, parent, extack);
+ trace_mlx5_esw_vport_qos_create(vport->dev, vport, vport_node->max_rate,
+ vport_node->bw_share);
return 0;
}
@@ -1132,6 +1099,7 @@ static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_t
{
struct mlx5_eswitch *esw = vport->dev->priv.eswitch;
struct mlx5_esw_sched_node *sched_node;
+ struct mlx5_eswitch *parent_esw;
int err;
esw_assert_qos_lock_held(esw);
@@ -1139,10 +1107,14 @@ static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_t
if (err)
return err;
- parent = parent ?: esw->qos.node0;
- sched_node = __esw_qos_alloc_node(parent->esw, 0, type, parent);
- if (!sched_node)
+ parent_esw = parent ? parent->esw : esw;
+ sched_node = __esw_qos_alloc_node(parent_esw, 0, type, parent);
+ if (!sched_node) {
+ esw_qos_put(esw);
return -ENOMEM;
+ }
+ if (!parent)
+ list_add_tail(&sched_node->entry, &esw->qos.domain->nodes);
sched_node->max_rate = max_rate;
sched_node->min_rate = min_rate;
@@ -1150,6 +1122,7 @@ static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_t
vport->qos.sched_node = sched_node;
err = esw_qos_vport_enable(vport, type, parent, extack);
if (err) {
+ __esw_qos_free_node(sched_node);
esw_qos_put(esw);
vport->qos.sched_node = NULL;
}
@@ -1157,6 +1130,19 @@ static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_t
return err;
}
+static void mlx5_esw_qos_vport_disable_locked(struct mlx5_vport *vport)
+{
+ struct mlx5_eswitch *esw = vport->dev->priv.eswitch;
+
+ esw_assert_qos_lock_held(esw);
+ if (!vport->qos.sched_node)
+ return;
+
+ esw_qos_vport_disable(vport, NULL);
+ mlx5_esw_qos_vport_qos_free(vport);
+ esw_qos_put(esw);
+}
+
void mlx5_esw_qos_vport_disable(struct mlx5_vport *vport)
{
struct mlx5_eswitch *esw = vport->dev->priv.eswitch;
@@ -1168,11 +1154,9 @@ void mlx5_esw_qos_vport_disable(struct mlx5_vport *vport)
goto unlock;
parent = vport->qos.sched_node->parent;
- WARN(parent != esw->qos.node0, "Disabling QoS on port before detaching it from node");
+ WARN(parent, "Disabling QoS on port before detaching it from node");
- esw_qos_vport_disable(vport, NULL);
- mlx5_esw_qos_vport_qos_free(vport);
- esw_qos_put(esw);
+ mlx5_esw_qos_vport_disable_locked(vport);
unlock:
esw_qos_unlock(esw);
}
@@ -1262,13 +1246,13 @@ static int esw_qos_vport_update(struct mlx5_vport *vport,
struct mlx5_esw_sched_node *parent,
struct netlink_ext_ack *extack)
{
- struct mlx5_esw_sched_node *curr_parent = vport->qos.sched_node->parent;
- enum sched_node_type curr_type = vport->qos.sched_node->type;
+ struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node;
+ struct mlx5_esw_sched_node *curr_parent = vport_node->parent;
+ enum sched_node_type curr_type = vport_node->type;
u32 curr_tc_bw[DEVLINK_RATE_TCS_MAX] = {0};
int err;
esw_assert_qos_lock_held(vport->dev->priv.eswitch);
- parent = parent ?: curr_parent;
if (curr_type == type && curr_parent == parent)
return 0;
@@ -1276,10 +1260,8 @@ static int esw_qos_vport_update(struct mlx5_vport *vport,
if (err)
return err;
- if (curr_type == SCHED_NODE_TYPE_TC_ARBITER_TSAR && curr_type == type) {
- esw_qos_tc_arbiter_get_bw_shares(vport->qos.sched_node,
- curr_tc_bw);
- }
+ if (curr_type == SCHED_NODE_TYPE_TC_ARBITER_TSAR && curr_type == type)
+ esw_qos_tc_arbiter_get_bw_shares(vport_node, curr_tc_bw);
esw_qos_vport_disable(vport, extack);
@@ -1290,8 +1272,8 @@ static int esw_qos_vport_update(struct mlx5_vport *vport,
}
if (curr_type == SCHED_NODE_TYPE_TC_ARBITER_TSAR && curr_type == type) {
- esw_qos_set_tc_arbiter_bw_shares(vport->qos.sched_node,
- curr_tc_bw, extack);
+ esw_qos_set_tc_arbiter_bw_shares(vport_node, curr_tc_bw,
+ extack);
}
return err;
@@ -1306,16 +1288,16 @@ static int esw_qos_vport_update_parent(struct mlx5_vport *vport, struct mlx5_esw
esw_assert_qos_lock_held(esw);
curr_parent = vport->qos.sched_node->parent;
- parent = parent ?: esw->qos.node0;
if (curr_parent == parent)
return 0;
/* Set vport QoS type based on parent node type if different from
* default QoS; otherwise, use the vport's current QoS type.
*/
- if (parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR)
+ if (parent && parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR)
type = SCHED_NODE_TYPE_RATE_LIMITER;
- else if (curr_parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR)
+ else if (curr_parent &&
+ curr_parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR)
type = SCHED_NODE_TYPE_VPORT;
else
type = vport->qos.sched_node->type;
@@ -1533,6 +1515,7 @@ static u32 mlx5_esw_qos_lag_link_speed_get_locked(struct mlx5_core_dev *mdev)
speed = lksettings.base.speed;
out:
+ mlx5_uplink_netdev_put(mdev, slave);
return speed;
}
@@ -1654,9 +1637,10 @@ static bool esw_qos_validate_unsupported_tc_bw(struct mlx5_eswitch *esw,
static bool esw_qos_vport_validate_unsupported_tc_bw(struct mlx5_vport *vport,
u32 *tc_bw)
{
- struct mlx5_eswitch *esw = vport->qos.sched_node ?
- vport->qos.sched_node->parent->esw :
- vport->dev->priv.eswitch;
+ struct mlx5_esw_sched_node *node = vport->qos.sched_node;
+ struct mlx5_eswitch *esw = vport->dev->priv.eswitch;
+
+ esw = (node && node->parent) ? node->parent->esw : esw;
return esw_qos_validate_unsupported_tc_bw(esw, tc_bw);
}
@@ -1673,6 +1657,21 @@ static bool esw_qos_tc_bw_disabled(u32 *tc_bw)
return true;
}
+static void esw_vport_qos_prune_empty(struct mlx5_vport *vport)
+{
+ struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node;
+
+ esw_assert_qos_lock_held(vport->dev->priv.eswitch);
+ if (!vport_node)
+ return;
+
+ if (vport_node->parent || vport_node->max_rate ||
+ vport_node->min_rate || !esw_qos_tc_bw_disabled(vport_node->tc_bw))
+ return;
+
+ mlx5_esw_qos_vport_disable_locked(vport);
+}
+
int mlx5_esw_qos_init(struct mlx5_eswitch *esw)
{
if (esw->qos.domain)
@@ -1706,6 +1705,10 @@ int mlx5_esw_devlink_rate_leaf_tx_share_set(struct devlink_rate *rate_leaf, void
esw_qos_lock(esw);
err = mlx5_esw_qos_set_vport_min_rate(vport, tx_share, extack);
+ if (err)
+ goto out;
+ esw_vport_qos_prune_empty(vport);
+out:
esw_qos_unlock(esw);
return err;
}
@@ -1727,6 +1730,10 @@ int mlx5_esw_devlink_rate_leaf_tx_max_set(struct devlink_rate *rate_leaf, void *
esw_qos_lock(esw);
err = mlx5_esw_qos_set_vport_max_rate(vport, tx_max, extack);
+ if (err)
+ goto out;
+ esw_vport_qos_prune_empty(vport);
+out:
esw_qos_unlock(esw);
return err;
}
@@ -1763,7 +1770,8 @@ int mlx5_esw_devlink_rate_leaf_tc_bw_set(struct devlink_rate *rate_leaf,
if (disable) {
if (vport_node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR)
err = esw_qos_vport_update(vport, SCHED_NODE_TYPE_VPORT,
- NULL, extack);
+ vport_node->parent, extack);
+ esw_vport_qos_prune_empty(vport);
goto unlock;
}
@@ -1775,7 +1783,7 @@ int mlx5_esw_devlink_rate_leaf_tc_bw_set(struct devlink_rate *rate_leaf,
} else {
err = esw_qos_vport_update(vport,
SCHED_NODE_TYPE_TC_ARBITER_TSAR,
- NULL, extack);
+ vport_node->parent, extack);
}
if (!err)
esw_qos_set_tc_arbiter_bw_shares(vport_node, tc_bw, extack);
@@ -1924,14 +1932,20 @@ int mlx5_esw_devlink_rate_leaf_parent_set(struct devlink_rate *devlink_rate,
void *priv, void *parent_priv,
struct netlink_ext_ack *extack)
{
- struct mlx5_esw_sched_node *node;
+ struct mlx5_esw_sched_node *node = parent ? parent_priv : NULL;
struct mlx5_vport *vport = priv;
+ int err;
- if (!parent)
- return mlx5_esw_qos_vport_update_parent(vport, NULL, extack);
+ err = mlx5_esw_qos_vport_update_parent(vport, node, extack);
+ if (!err) {
+ struct mlx5_eswitch *esw = vport->dev->priv.eswitch;
+
+ esw_qos_lock(esw);
+ esw_vport_qos_prune_empty(vport);
+ esw_qos_unlock(esw);
+ }
- node = parent_priv;
- return mlx5_esw_qos_vport_update_parent(vport, node, extack);
+ return err;
}
static bool esw_qos_is_node_empty(struct mlx5_esw_sched_node *node)
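Editorial aside, not part of the patch: with the implicit node0 removed from esw/qos.c, a NULL parent now stands for the root TSAR, and scheduling-context setup falls back to the root index when no parent node is attached. A minimal sketch of that fallback, with hypothetical types:

#include <stdio.h>

struct sched_node {
	unsigned int ix;
	struct sched_node *parent;	/* NULL means "attached to the root TSAR" */
};

static unsigned int parent_ix(const struct sched_node *node,
			      unsigned int root_tsar_ix)
{
	return node->parent ? node->parent->ix : root_tsar_ix;
}

int main(void)
{
	struct sched_node group = { .ix = 12, .parent = NULL };
	struct sched_node vport = { .ix = 34, .parent = &group };
	struct sched_node orphan = { .ix = 56, .parent = NULL };

	printf("vport parent ix=%u\n", parent_ix(&vport, 3));	/* 12: explicit group */
	printf("orphan parent ix=%u\n", parent_ix(&orphan, 3));	/* 3: root TSAR */
	return 0;
}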
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index b0b8ef3ec3c4..45506ad56847 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -373,11 +373,6 @@ struct mlx5_eswitch {
refcount_t refcnt;
u32 root_tsar_ix;
struct mlx5_qos_domain *domain;
- /* Contains all vports with QoS enabled but no explicit node.
- * Cannot be NULL if QoS is enabled, but may be a fake node
- * referencing the root TSAR if the esw doesn't support nodes.
- */
- struct mlx5_esw_sched_node *node0;
} qos;
struct mlx5_esw_bridge_offloads *br_offloads;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index d87392360dbd..80245c38dbad 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -114,9 +114,9 @@
#define ETHTOOL_NUM_PRIOS 11
#define ETHTOOL_MIN_LEVEL (KERNEL_MIN_LEVEL + ETHTOOL_NUM_PRIOS)
/* Vlan, mac, ttc, inner ttc, {UDP/ANY/aRFS/accel/{esp, esp_err}}, IPsec policy,
- * {IPsec RoCE MPV,Alias table},IPsec RoCE policy
+ * IPsec policy miss, {IPsec RoCE MPV,Alias table},IPsec RoCE policy
*/
-#define KERNEL_NIC_PRIO_NUM_LEVELS 10
+#define KERNEL_NIC_PRIO_NUM_LEVELS 11
#define KERNEL_NIC_NUM_PRIOS 1
/* One more level for tc, and one more for promisc */
#define KERNEL_MIN_LEVEL (KERNEL_NIC_PRIO_NUM_LEVELS + 2)
@@ -663,7 +663,7 @@ static void del_sw_hw_rule(struct fs_node *node)
BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION) |
BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS);
fte->act_dests.action.action &= ~MLX5_FLOW_CONTEXT_ACTION_COUNT;
- mlx5_fc_local_destroy(rule->dest_attr.counter);
+ mlx5_fc_local_put(rule->dest_attr.counter);
goto out;
}
@@ -3734,6 +3734,13 @@ static int mlx5_fs_mode_validate(struct devlink *devlink, u32 id,
char *value = val.vstr;
u8 eswitch_mode;
+ eswitch_mode = mlx5_eswitch_mode(dev);
+ if (eswitch_mode == MLX5_ESWITCH_OFFLOADS) {
+ NL_SET_ERR_MSG_FMT_MOD(extack,
+ "Changing fs mode is not supported when eswitch offloads enabled.");
+ return -EOPNOTSUPP;
+ }
+
if (!strcmp(value, "dmfs"))
return 0;
@@ -3759,14 +3766,6 @@ static int mlx5_fs_mode_validate(struct devlink *devlink, u32 id,
return -EINVAL;
}
- eswitch_mode = mlx5_eswitch_mode(dev);
- if (eswitch_mode == MLX5_ESWITCH_OFFLOADS) {
- NL_SET_ERR_MSG_FMT_MOD(extack,
- "Moving to %s is not supported when eswitch offloads enabled.",
- value);
- return -EOPNOTSUPP;
- }
-
return 0;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index 500826229b0b..e6a95b310b55 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -343,6 +343,7 @@ struct mlx5_fc {
enum mlx5_fc_type type;
struct mlx5_fc_bulk *bulk;
struct mlx5_fc_cache cache;
+ refcount_t fc_local_refcount;
/* last{packets,bytes} are used for calculating deltas since last reading. */
u64 lastpackets;
u64 lastbytes;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
index 492775d3d193..83001eda3884 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
@@ -562,17 +562,36 @@ mlx5_fc_local_create(u32 counter_id, u32 offset, u32 bulk_size)
counter->id = counter_id;
fc_bulk->base_id = counter_id - offset;
fc_bulk->fs_bulk.bulk_len = bulk_size;
+ refcount_set(&fc_bulk->hws_data.hws_action_refcount, 0);
+ mutex_init(&fc_bulk->hws_data.lock);
counter->bulk = fc_bulk;
+ refcount_set(&counter->fc_local_refcount, 1);
return counter;
}
EXPORT_SYMBOL(mlx5_fc_local_create);
void mlx5_fc_local_destroy(struct mlx5_fc *counter)
{
- if (!counter || counter->type != MLX5_FC_TYPE_LOCAL)
- return;
-
kfree(counter->bulk);
kfree(counter);
}
EXPORT_SYMBOL(mlx5_fc_local_destroy);
+
+void mlx5_fc_local_get(struct mlx5_fc *counter)
+{
+ if (!counter || counter->type != MLX5_FC_TYPE_LOCAL)
+ return;
+
+ refcount_inc(&counter->fc_local_refcount);
+}
+
+void mlx5_fc_local_put(struct mlx5_fc *counter)
+{
+ if (!counter || counter->type != MLX5_FC_TYPE_LOCAL)
+ return;
+
+ if (!refcount_dec_and_test(&counter->fc_local_refcount))
+ return;
+
+ mlx5_fc_local_destroy(counter);
+}
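Editorial aside, not part of the patch: the fs_counters.c change turns destruction of local counters into a get/put pair so the object is only freed when the last user drops its reference. A minimal userspace sketch of that pairing follows; it uses a plain integer and hypothetical names, not the kernel refcount_t API.

#include <stdio.h>
#include <stdlib.h>

struct local_counter {
	unsigned int refcount;
	unsigned int id;
};

static struct local_counter *counter_create(unsigned int id)
{
	struct local_counter *c = calloc(1, sizeof(*c));

	if (!c)
		return NULL;
	c->id = id;
	c->refcount = 1;	/* the creator holds the first reference */
	return c;
}

static void counter_get(struct local_counter *c)
{
	c->refcount++;
}

static void counter_put(struct local_counter *c)
{
	if (--c->refcount == 0) {	/* last reference: free the object */
		printf("destroying counter %u\n", c->id);
		free(c);
	}
}

int main(void)
{
	struct local_counter *c = counter_create(7);

	counter_get(c);		/* e.g. a steering action takes a reference */
	counter_put(c);		/* action released */
	counter_put(c);		/* creator's reference dropped, object freed here */
	return 0;
}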
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
index 69933addd921..22995131824a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
@@ -6,13 +6,15 @@
#include "fw_reset.h"
#include "diag/fw_tracer.h"
#include "lib/tout.h"
+#include "sf/sf.h"
enum {
MLX5_FW_RESET_FLAGS_RESET_REQUESTED,
MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST,
MLX5_FW_RESET_FLAGS_PENDING_COMP,
MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS,
- MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED
+ MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED,
+ MLX5_FW_RESET_FLAGS_UNLOAD_EVENT,
};
struct mlx5_fw_reset {
@@ -219,7 +221,7 @@ int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev)
return mlx5_reg_mfrl_set(dev, MLX5_MFRL_REG_RESET_LEVEL0, 0, 0, false);
}
-static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev, bool unloaded)
+static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev)
{
struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
struct devlink *devlink = priv_to_devlink(dev);
@@ -228,8 +230,7 @@ static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev, bool unload
if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) {
complete(&fw_reset->done);
} else {
- if (!unloaded)
- mlx5_unload_one(dev, false);
+ mlx5_sync_reset_unload_flow(dev, false);
if (mlx5_health_wait_pci_up(dev))
mlx5_core_err(dev, "reset reload flow aborted, PCI reads still not working\n");
else
@@ -272,7 +273,7 @@ static void mlx5_sync_reset_reload_work(struct work_struct *work)
mlx5_sync_reset_clear_reset_requested(dev, false);
mlx5_enter_error_state(dev, true);
- mlx5_fw_reset_complete_reload(dev, false);
+ mlx5_fw_reset_complete_reload(dev);
}
#define MLX5_RESET_POLL_INTERVAL (HZ / 10)
@@ -428,6 +429,11 @@ static bool mlx5_is_reset_now_capable(struct mlx5_core_dev *dev,
return false;
}
+ if (!mlx5_core_is_ecpf(dev) && !mlx5_sf_table_empty(dev)) {
+ mlx5_core_warn(dev, "SFs should be removed before reset\n");
+ return false;
+ }
+
#if IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)
if (reset_method != MLX5_MFRL_REG_PCI_RESET_METHOD_HOT_RESET) {
err = mlx5_check_hotplug_interrupt(dev, bridge);
@@ -586,6 +592,65 @@ static int mlx5_sync_pci_reset(struct mlx5_core_dev *dev, u8 reset_method)
return err;
}
+void mlx5_sync_reset_unload_flow(struct mlx5_core_dev *dev, bool locked)
+{
+ struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
+ unsigned long timeout;
+ int poll_freq = 20;
+ bool reset_action;
+ u8 rst_state;
+ int err;
+
+ if (locked)
+ mlx5_unload_one_devl_locked(dev, false);
+ else
+ mlx5_unload_one(dev, false);
+
+ if (!test_bit(MLX5_FW_RESET_FLAGS_UNLOAD_EVENT, &fw_reset->reset_flags))
+ return;
+
+ mlx5_set_fw_rst_ack(dev);
+ mlx5_core_warn(dev, "Sync Reset Unload done, device reset expected\n");
+
+ reset_action = false;
+ timeout = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, RESET_UNLOAD));
+ do {
+ rst_state = mlx5_get_fw_rst_state(dev);
+ if (rst_state == MLX5_FW_RST_STATE_TOGGLE_REQ ||
+ rst_state == MLX5_FW_RST_STATE_IDLE) {
+ reset_action = true;
+ break;
+ }
+ if (rst_state == MLX5_FW_RST_STATE_DROP_MODE) {
+ mlx5_core_info(dev, "Sync Reset Drop mode ack\n");
+ mlx5_set_fw_rst_ack(dev);
+ poll_freq = 1000;
+ }
+ msleep(poll_freq);
+ } while (!time_after(jiffies, timeout));
+
+ if (!reset_action) {
+ mlx5_core_err(dev, "Got timeout waiting for sync reset action, state = %u\n",
+ rst_state);
+ fw_reset->ret = -ETIMEDOUT;
+ goto done;
+ }
+
+ mlx5_core_warn(dev, "Sync Reset, got reset action. rst_state = %u\n",
+ rst_state);
+ if (rst_state == MLX5_FW_RST_STATE_TOGGLE_REQ) {
+ err = mlx5_sync_pci_reset(dev, fw_reset->reset_method);
+ if (err) {
+ mlx5_core_warn(dev, "mlx5_sync_pci_reset failed, err %d\n",
+ err);
+ fw_reset->ret = err;
+ }
+ }
+
+done:
+ clear_bit(MLX5_FW_RESET_FLAGS_UNLOAD_EVENT, &fw_reset->reset_flags);
+}
+
static void mlx5_sync_reset_now_event(struct work_struct *work)
{
struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset,
@@ -613,17 +678,13 @@ static void mlx5_sync_reset_now_event(struct work_struct *work)
mlx5_enter_error_state(dev, true);
done:
fw_reset->ret = err;
- mlx5_fw_reset_complete_reload(dev, false);
+ mlx5_fw_reset_complete_reload(dev);
}
static void mlx5_sync_reset_unload_event(struct work_struct *work)
{
struct mlx5_fw_reset *fw_reset;
struct mlx5_core_dev *dev;
- unsigned long timeout;
- int poll_freq = 20;
- bool reset_action;
- u8 rst_state;
int err;
fw_reset = container_of(work, struct mlx5_fw_reset, reset_unload_work);
@@ -632,6 +693,7 @@ static void mlx5_sync_reset_unload_event(struct work_struct *work)
if (mlx5_sync_reset_clear_reset_requested(dev, false))
return;
+ set_bit(MLX5_FW_RESET_FLAGS_UNLOAD_EVENT, &fw_reset->reset_flags);
mlx5_core_warn(dev, "Sync Reset Unload. Function is forced down.\n");
err = mlx5_cmd_fast_teardown_hca(dev);
@@ -640,49 +702,7 @@ static void mlx5_sync_reset_unload_event(struct work_struct *work)
else
mlx5_enter_error_state(dev, true);
- if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags))
- mlx5_unload_one_devl_locked(dev, false);
- else
- mlx5_unload_one(dev, false);
-
- mlx5_set_fw_rst_ack(dev);
- mlx5_core_warn(dev, "Sync Reset Unload done, device reset expected\n");
-
- reset_action = false;
- timeout = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, RESET_UNLOAD));
- do {
- rst_state = mlx5_get_fw_rst_state(dev);
- if (rst_state == MLX5_FW_RST_STATE_TOGGLE_REQ ||
- rst_state == MLX5_FW_RST_STATE_IDLE) {
- reset_action = true;
- break;
- }
- if (rst_state == MLX5_FW_RST_STATE_DROP_MODE) {
- mlx5_core_info(dev, "Sync Reset Drop mode ack\n");
- mlx5_set_fw_rst_ack(dev);
- poll_freq = 1000;
- }
- msleep(poll_freq);
- } while (!time_after(jiffies, timeout));
-
- if (!reset_action) {
- mlx5_core_err(dev, "Got timeout waiting for sync reset action, state = %u\n",
- rst_state);
- fw_reset->ret = -ETIMEDOUT;
- goto done;
- }
-
- mlx5_core_warn(dev, "Sync Reset, got reset action. rst_state = %u\n", rst_state);
- if (rst_state == MLX5_FW_RST_STATE_TOGGLE_REQ) {
- err = mlx5_sync_pci_reset(dev, fw_reset->reset_method);
- if (err) {
- mlx5_core_warn(dev, "mlx5_sync_pci_reset failed, err %d\n", err);
- fw_reset->ret = err;
- }
- }
-
-done:
- mlx5_fw_reset_complete_reload(dev, true);
+ mlx5_fw_reset_complete_reload(dev);
}
static void mlx5_sync_reset_abort_event(struct work_struct *work)
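Editorial aside, not part of the patch: the sync-reset unload flow moved above polls the firmware reset state at a base interval, slows the polling when the device signals drop mode, and gives up after a deadline. A minimal userspace sketch of that poll-until-timeout shape follows; state values and timing are hypothetical.

#include <stdio.h>
#include <time.h>
#include <unistd.h>

enum rst_state { RST_IDLE, RST_TOGGLE_REQ, RST_DROP_MODE, RST_BUSY };

/* Stand-in for reading the reset state from the device. */
static enum rst_state read_state(int iteration)
{
	return iteration < 3 ? RST_BUSY : RST_TOGGLE_REQ;
}

int main(void)
{
	int poll_ms = 20, i = 0;
	time_t deadline = time(NULL) + 5;	/* five second budget */

	while (time(NULL) <= deadline) {
		enum rst_state st = read_state(i++);

		if (st == RST_TOGGLE_REQ || st == RST_IDLE) {
			printf("reset action after %d polls\n", i);
			return 0;
		}
		if (st == RST_DROP_MODE)
			poll_ms = 1000;		/* device asked for slower polling */
		usleep(poll_ms * 1000);
	}
	fprintf(stderr, "timed out waiting for reset action\n");
	return 1;
}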
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h
index ea527d06a85f..d5b28525c960 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h
@@ -12,6 +12,7 @@ int mlx5_fw_reset_set_reset_sync(struct mlx5_core_dev *dev, u8 reset_type_sel,
int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev);
int mlx5_fw_reset_wait_reset_done(struct mlx5_core_dev *dev);
+void mlx5_sync_reset_unload_flow(struct mlx5_core_dev *dev, bool locked);
int mlx5_fw_reset_verify_fw_complete(struct mlx5_core_dev *dev,
struct netlink_ext_ack *extack);
void mlx5_fw_reset_events_start(struct mlx5_core_dev *dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
index b111ccd03b02..74ea5da58b7e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
@@ -47,7 +47,20 @@ int mlx5_crdump_collect(struct mlx5_core_dev *dev, u32 *cr_data);
static inline struct net_device *mlx5_uplink_netdev_get(struct mlx5_core_dev *mdev)
{
- return mdev->mlx5e_res.uplink_netdev;
+ struct mlx5e_resources *mlx5e_res = &mdev->mlx5e_res;
+ struct net_device *netdev;
+
+ mutex_lock(&mlx5e_res->uplink_netdev_lock);
+ netdev = mlx5e_res->uplink_netdev;
+ netdev_hold(netdev, &mlx5e_res->tracker, GFP_KERNEL);
+ mutex_unlock(&mlx5e_res->uplink_netdev_lock);
+ return netdev;
+}
+
+static inline void mlx5_uplink_netdev_put(struct mlx5_core_dev *mdev,
+ struct net_device *netdev)
+{
+ netdev_put(netdev, &mdev->mlx5e_res.tracker);
}
struct mlx5_sd;
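Editorial aside, not part of the patch: the new uplink-netdev helpers look the pointer up under a lock, take a hold before dropping the lock, and require every caller to check for NULL and release the hold. A minimal sketch of that lookup-under-lock pattern follows, using a hypothetical object and plain pthreads instead of kernel primitives.

#include <pthread.h>
#include <stdio.h>

struct object {
	int refs;
};

static struct object registered = { .refs = 1 };
static struct object *slot = &registered;	/* may become NULL at teardown */
static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;

static struct object *object_get(void)
{
	struct object *obj;

	pthread_mutex_lock(&slot_lock);
	obj = slot;
	if (obj)
		obj->refs++;	/* reference held across the unlock */
	pthread_mutex_unlock(&slot_lock);
	return obj;
}

static void object_put(struct object *obj)
{
	pthread_mutex_lock(&slot_lock);
	obj->refs--;
	pthread_mutex_unlock(&slot_lock);
}

int main(void)
{
	struct object *obj = object_get();

	if (!obj)		/* callers must tolerate a missing object */
		return 0;
	printf("using object, refs=%d\n", obj->refs);
	object_put(obj);
	return 0;
}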
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index b6d53db27cd5..9d3504f5abfa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -367,6 +367,8 @@ int mlx5_query_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *out);
int mlx5_set_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *in);
int mlx5_set_trust_state(struct mlx5_core_dev *mdev, u8 trust_state);
int mlx5_query_trust_state(struct mlx5_core_dev *mdev, u8 *trust_state);
+int mlx5_query_port_buffer_ownership(struct mlx5_core_dev *mdev,
+ u8 *buffer_ownership);
int mlx5_set_dscp2prio(struct mlx5_core_dev *mdev, u8 dscp, u8 prio);
int mlx5_query_dscp2prio(struct mlx5_core_dev *mdev, u8 *dscp2prio);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index 549f1066d2a5..aa9f2b0a77d3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -968,6 +968,26 @@ int mlx5_query_trust_state(struct mlx5_core_dev *mdev, u8 *trust_state)
return err;
}
+int mlx5_query_port_buffer_ownership(struct mlx5_core_dev *mdev,
+ u8 *buffer_ownership)
+{
+ u32 out[MLX5_ST_SZ_DW(pfcc_reg)] = {};
+ int err;
+
+ if (!MLX5_CAP_PCAM_FEATURE(mdev, buffer_ownership)) {
+ *buffer_ownership = MLX5_BUF_OWNERSHIP_UNKNOWN;
+ return 0;
+ }
+
+ err = mlx5_query_pfcc_reg(mdev, out, sizeof(out));
+ if (err)
+ return err;
+
+ *buffer_ownership = MLX5_GET(pfcc_reg, out, buf_ownership);
+
+ return 0;
+}
+
int mlx5_set_dscp2prio(struct mlx5_core_dev *mdev, u8 dscp, u8 prio)
{
int sz = MLX5_ST_SZ_BYTES(qpdpm_reg);
@@ -1150,7 +1170,11 @@ const struct mlx5_link_info *mlx5_port_ptys2info(struct mlx5_core_dev *mdev,
mlx5e_port_get_link_mode_info_arr(mdev, &table, &max_size,
force_legacy);
i = find_first_bit(&temp, max_size);
- if (i < max_size)
+
+ /* mlx5e_link_info has holes. Check speed
+ * is not zero as indication of one.
+ */
+ if (i < max_size && table[i].speed)
return &table[i];
return NULL;
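Editorial aside, not part of the patch: the port.c hunk above notes that the link-info table has holes and treats a zero speed as a miss rather than returning the empty slot. A minimal sketch of that sparse-table lookup follows, with hypothetical table contents.

#include <stdio.h>

struct link_info {
	unsigned int speed;	/* 0 marks an unused slot (a "hole") */
};

static const struct link_info table[] = {
	[0] = { .speed = 1000 },
	[1] = { .speed = 0 },		/* hole */
	[2] = { .speed = 25000 },
};

static const struct link_info *lookup(unsigned int i)
{
	if (i >= sizeof(table) / sizeof(table[0]))
		return NULL;
	if (!table[i].speed)		/* a hole: no valid mode here */
		return NULL;
	return &table[i];
}

int main(void)
{
	for (unsigned int i = 0; i < 4; i++) {
		const struct link_info *info = lookup(i);

		printf("slot %u: %s\n", i, info ? "valid" : "hole/miss");
	}
	return 0;
}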
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
index 0864ba625c07..3304f25cc805 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
@@ -518,3 +518,13 @@ void mlx5_sf_table_cleanup(struct mlx5_core_dev *dev)
WARN_ON(!xa_empty(&table->function_ids));
kfree(table);
}
+
+bool mlx5_sf_table_empty(const struct mlx5_core_dev *dev)
+{
+ struct mlx5_sf_table *table = dev->priv.sf_table;
+
+ if (!table)
+ return true;
+
+ return xa_empty(&table->function_ids);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h
index 860f9ddb7107..89559a37997a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h
@@ -17,6 +17,7 @@ void mlx5_sf_hw_table_destroy(struct mlx5_core_dev *dev);
int mlx5_sf_table_init(struct mlx5_core_dev *dev);
void mlx5_sf_table_cleanup(struct mlx5_core_dev *dev);
+bool mlx5_sf_table_empty(const struct mlx5_core_dev *dev);
int mlx5_devlink_sf_port_new(struct devlink *devlink,
const struct devlink_port_new_attrs *add_attr,
@@ -61,6 +62,11 @@ static inline void mlx5_sf_table_cleanup(struct mlx5_core_dev *dev)
{
}
+static inline bool mlx5_sf_table_empty(const struct mlx5_core_dev *dev)
+{
+ return true;
+}
+
#endif
#endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
index 396804369b00..fe56b59e24c5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c
@@ -117,7 +117,7 @@ static int hws_action_get_shared_stc_nic(struct mlx5hws_context *ctx,
mlx5hws_err(ctx, "No such stc_type: %d\n", stc_type);
pr_warn("HWS: Invalid stc_type: %d\n", stc_type);
ret = -EINVAL;
- goto unlock_and_out;
+ goto free_shared_stc;
}
ret = mlx5hws_action_alloc_single_stc(ctx, &stc_attr, tbl_type,
@@ -1360,7 +1360,7 @@ free_action:
struct mlx5hws_action *
mlx5hws_action_create_dest_array(struct mlx5hws_context *ctx, size_t num_dest,
struct mlx5hws_action_dest_attr *dests,
- bool ignore_flow_level, u32 flags)
+ u32 flags)
{
struct mlx5hws_cmd_set_fte_dest *dest_list = NULL;
struct mlx5hws_cmd_ft_create_attr ft_attr = {0};
@@ -1397,7 +1397,7 @@ mlx5hws_action_create_dest_array(struct mlx5hws_context *ctx, size_t num_dest,
MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
dest_list[i].destination_id = dests[i].dest->dest_obj.obj_id;
fte_attr.action_flags |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
- fte_attr.ignore_flow_level = ignore_flow_level;
+ fte_attr.ignore_flow_level = 1;
if (dests[i].is_wire_ft)
last_dest_idx = i;
break;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
index 92de4b761a83..adeccc588e5d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c
@@ -74,9 +74,9 @@ static void hws_bwc_matcher_init_attr(struct mlx5hws_bwc_matcher *bwc_matcher,
static int
hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_matcher)
{
- bool move_error = false, poll_error = false, drain_error = false;
struct mlx5hws_context *ctx = bwc_matcher->matcher->tbl->ctx;
struct mlx5hws_matcher *matcher = bwc_matcher->matcher;
+ int drain_error = 0, move_error = 0, poll_error = 0;
u16 bwc_queues = mlx5hws_bwc_queues(ctx);
struct mlx5hws_rule_attr rule_attr;
struct mlx5hws_bwc_rule *bwc_rule;
@@ -84,6 +84,7 @@ hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_matcher)
struct list_head *rules_list;
u32 pending_rules;
int i, ret = 0;
+ bool drain;
mlx5hws_bwc_rule_fill_attr(bwc_matcher, 0, 0, &rule_attr);
@@ -99,23 +100,37 @@ hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_matcher)
ret = mlx5hws_matcher_resize_rule_move(matcher,
bwc_rule->rule,
&rule_attr);
- if (unlikely(ret && !move_error)) {
- mlx5hws_err(ctx,
- "Moving BWC rule: move failed (%d), attempting to move rest of the rules\n",
- ret);
- move_error = true;
+ if (unlikely(ret)) {
+ if (!move_error) {
+ mlx5hws_err(ctx,
+ "Moving BWC rule: move failed (%d), attempting to move rest of the rules\n",
+ ret);
+ move_error = ret;
+ }
+ /* Rule wasn't queued, no need to poll */
+ continue;
}
pending_rules++;
+ drain = pending_rules >=
+ hws_bwc_get_burst_th(ctx, rule_attr.queue_id);
ret = mlx5hws_bwc_queue_poll(ctx,
rule_attr.queue_id,
&pending_rules,
- false);
- if (unlikely(ret && !poll_error)) {
- mlx5hws_err(ctx,
- "Moving BWC rule: poll failed (%d), attempting to move rest of the rules\n",
- ret);
- poll_error = true;
+ drain);
+ if (unlikely(ret)) {
+ if (ret == -ETIMEDOUT) {
+ mlx5hws_err(ctx,
+ "Moving BWC rule: timeout polling for completions (%d), aborting rehash\n",
+ ret);
+ return ret;
+ }
+ if (!poll_error) {
+ mlx5hws_err(ctx,
+ "Moving BWC rule: polling for completions failed (%d), attempting to move rest of the rules\n",
+ ret);
+ poll_error = ret;
+ }
}
}
@@ -126,17 +141,30 @@ hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_matcher)
rule_attr.queue_id,
&pending_rules,
true);
- if (unlikely(ret && !drain_error)) {
- mlx5hws_err(ctx,
- "Moving BWC rule: drain failed (%d), attempting to move rest of the rules\n",
- ret);
- drain_error = true;
+ if (unlikely(ret)) {
+ if (ret == -ETIMEDOUT) {
+ mlx5hws_err(ctx,
+ "Moving bwc rule: timeout draining completions (%d), aborting rehash\n",
+ ret);
+ return ret;
+ }
+ if (!drain_error) {
+ mlx5hws_err(ctx,
+ "Moving bwc rule: drain failed (%d), attempting to move rest of the rules\n",
+ ret);
+ drain_error = ret;
+ }
}
}
}
- if (move_error || poll_error || drain_error)
- ret = -EINVAL;
+ /* Return the first error that happened */
+ if (unlikely(move_error))
+ return move_error;
+ if (unlikely(poll_error))
+ return poll_error;
+ if (unlikely(drain_error))
+ return drain_error;
return ret;
}
@@ -1035,6 +1063,21 @@ int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule,
return 0; /* rule inserted successfully */
}
+ /* Rule insertion could fail due to queue being full, timeout, or
+ * matcher in resize. In such cases, no point in trying to rehash.
+ */
+ if (ret == -EBUSY || ret == -ETIMEDOUT || ret == -EAGAIN) {
+ mutex_unlock(queue_lock);
+ mlx5hws_err(ctx,
+ "BWC rule insertion failed - %s (%d)\n",
+ ret == -EBUSY ? "queue is full" :
+ ret == -ETIMEDOUT ? "timeout" :
+ ret == -EAGAIN ? "matcher in resize" : "N/A",
+ ret);
+ hws_bwc_rule_cnt_dec(bwc_rule);
+ return ret;
+ }
+
/* At this point the rule wasn't added.
* It could be because there was collision, or some other problem.
* Try rehash by size and insert rule again - last chance.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c
index ca7501c57468..14e79579c719 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c
@@ -1328,11 +1328,11 @@ mlx5hws_bwc_matcher_move_all_complex(struct mlx5hws_bwc_matcher *bwc_matcher)
{
struct mlx5hws_context *ctx = bwc_matcher->matcher->tbl->ctx;
struct mlx5hws_matcher *matcher = bwc_matcher->matcher;
- bool move_error = false, poll_error = false;
u16 bwc_queues = mlx5hws_bwc_queues(ctx);
struct mlx5hws_bwc_rule *tmp_bwc_rule;
struct mlx5hws_rule_attr rule_attr;
struct mlx5hws_table *isolated_tbl;
+ int move_error = 0, poll_error = 0;
struct mlx5hws_rule *tmp_rule;
struct list_head *rules_list;
u32 expected_completions = 1;
@@ -1391,11 +1391,15 @@ mlx5hws_bwc_matcher_move_all_complex(struct mlx5hws_bwc_matcher *bwc_matcher)
ret = mlx5hws_matcher_resize_rule_move(matcher,
tmp_rule,
&rule_attr);
- if (unlikely(ret && !move_error)) {
- mlx5hws_err(ctx,
- "Moving complex BWC rule failed (%d), attempting to move rest of the rules\n",
- ret);
- move_error = true;
+ if (unlikely(ret)) {
+ if (!move_error) {
+ mlx5hws_err(ctx,
+ "Moving complex BWC rule: move failed (%d), attempting to move rest of the rules\n",
+ ret);
+ move_error = ret;
+ }
+ /* Rule wasn't queued, no need to poll */
+ continue;
}
expected_completions = 1;
@@ -1403,11 +1407,19 @@ mlx5hws_bwc_matcher_move_all_complex(struct mlx5hws_bwc_matcher *bwc_matcher)
rule_attr.queue_id,
&expected_completions,
true);
- if (unlikely(ret && !poll_error)) {
- mlx5hws_err(ctx,
- "Moving complex BWC rule: poll failed (%d), attempting to move rest of the rules\n",
- ret);
- poll_error = true;
+ if (unlikely(ret)) {
+ if (ret == -ETIMEDOUT) {
+ mlx5hws_err(ctx,
+ "Moving complex BWC rule: timeout polling for completions (%d), aborting rehash\n",
+ ret);
+ return ret;
+ }
+ if (!poll_error) {
+ mlx5hws_err(ctx,
+ "Moving complex BWC rule: polling for completions failed (%d), attempting to move rest of the rules\n",
+ ret);
+ poll_error = ret;
+ }
}
/* Done moving the rule to the new matcher,
@@ -1422,8 +1434,11 @@ mlx5hws_bwc_matcher_move_all_complex(struct mlx5hws_bwc_matcher *bwc_matcher)
}
}
- if (move_error || poll_error)
- ret = -EINVAL;
+ /* Return the first error that happened */
+ if (unlikely(move_error))
+ return move_error;
+ if (unlikely(poll_error))
+ return poll_error;
return ret;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c
index 9c83753e4592..0bdcab2e5cf3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c
@@ -55,6 +55,7 @@ int mlx5hws_cmd_flow_table_create(struct mlx5_core_dev *mdev,
MLX5_SET(create_flow_table_in, in, opcode, MLX5_CMD_OP_CREATE_FLOW_TABLE);
MLX5_SET(create_flow_table_in, in, table_type, ft_attr->type);
+ MLX5_SET(create_flow_table_in, in, uid, ft_attr->uid);
ft_ctx = MLX5_ADDR_OF(create_flow_table_in, in, flow_table_context);
MLX5_SET(flow_table_context, ft_ctx, level, ft_attr->level);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h
index fa6bff210266..122ccc671628 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h
@@ -36,6 +36,7 @@ struct mlx5hws_cmd_set_fte_attr {
struct mlx5hws_cmd_ft_create_attr {
u8 type;
u8 level;
+ u16 uid;
bool rtc_valid;
bool decap_en;
bool reformat_en;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c
index 57592b92e24b..6a4c4cccd643 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c
@@ -267,6 +267,7 @@ static int mlx5_cmd_hws_create_flow_table(struct mlx5_flow_root_namespace *ns,
tbl_attr.type = MLX5HWS_TABLE_TYPE_FDB;
tbl_attr.level = ft_attr->level;
+ tbl_attr.uid = ft_attr->uid;
tbl = mlx5hws_table_create(ctx, &tbl_attr);
if (!tbl) {
mlx5_core_err(ns->dev, "Failed creating hws flow_table\n");
@@ -571,12 +572,12 @@ static void mlx5_fs_put_dest_action_sampler(struct mlx5_fs_hws_context *fs_ctx,
static struct mlx5hws_action *
mlx5_fs_create_action_dest_array(struct mlx5hws_context *ctx,
struct mlx5hws_action_dest_attr *dests,
- u32 num_of_dests, bool ignore_flow_level)
+ u32 num_of_dests)
{
u32 flags = MLX5HWS_ACTION_FLAG_HWS_FDB | MLX5HWS_ACTION_FLAG_SHARED;
return mlx5hws_action_create_dest_array(ctx, num_of_dests, dests,
- ignore_flow_level, flags);
+ flags);
}
static struct mlx5hws_action *
@@ -1013,19 +1014,14 @@ static int mlx5_fs_fte_get_hws_actions(struct mlx5_flow_root_namespace *ns,
}
(*ractions)[num_actions++].action = dest_actions->dest;
} else if (num_dest_actions > 1) {
- bool ignore_flow_level;
-
if (num_actions == MLX5_FLOW_CONTEXT_ACTION_MAX ||
num_fs_actions == MLX5_FLOW_CONTEXT_ACTION_MAX) {
err = -EOPNOTSUPP;
goto free_actions;
}
- ignore_flow_level =
- !!(fte_action->flags & FLOW_ACT_IGNORE_FLOW_LEVEL);
tmp_action =
mlx5_fs_create_action_dest_array(ctx, dest_actions,
- num_dest_actions,
- ignore_flow_level);
+ num_dest_actions);
if (!tmp_action) {
err = -EOPNOTSUPP;
goto free_actions;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.c
index f1ecdba74e1f..839d71bd4216 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.c
@@ -407,15 +407,21 @@ struct mlx5hws_action *mlx5_fc_get_hws_action(struct mlx5hws_context *ctx,
{
struct mlx5_fs_hws_create_action_ctx create_ctx;
struct mlx5_fc_bulk *fc_bulk = counter->bulk;
+ struct mlx5hws_action *hws_action;
create_ctx.hws_ctx = ctx;
create_ctx.id = fc_bulk->base_id;
create_ctx.actions_type = MLX5HWS_ACTION_TYP_CTR;
- return mlx5_fs_get_hws_action(&fc_bulk->hws_data, &create_ctx);
+ mlx5_fc_local_get(counter);
+ hws_action = mlx5_fs_get_hws_action(&fc_bulk->hws_data, &create_ctx);
+ if (!hws_action)
+ mlx5_fc_local_put(counter);
+ return hws_action;
}
void mlx5_fc_put_hws_action(struct mlx5_fc *counter)
{
mlx5_fs_put_hws_action(&counter->bulk->hws_data);
+ mlx5_fc_local_put(counter);
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c
index f3ea09caba2b..32f87fdf3213 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c
@@ -85,6 +85,7 @@ static int hws_matcher_create_end_ft_isolated(struct mlx5hws_matcher *matcher)
ret = mlx5hws_table_create_default_ft(tbl->ctx->mdev,
tbl,
+ 0,
&matcher->end_ft_id);
if (ret) {
mlx5hws_err(tbl->ctx, "Isolated matcher: failed to create end flow table\n");
@@ -112,7 +113,9 @@ static int hws_matcher_create_end_ft(struct mlx5hws_matcher *matcher)
if (mlx5hws_matcher_is_isolated(matcher))
ret = hws_matcher_create_end_ft_isolated(matcher);
else
- ret = mlx5hws_table_create_default_ft(tbl->ctx->mdev, tbl,
+ ret = mlx5hws_table_create_default_ft(tbl->ctx->mdev,
+ tbl,
+ 0,
&matcher->end_ft_id);
if (ret) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h
index 59c14745ed0c..1ad7a50d938b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h
@@ -75,6 +75,7 @@ struct mlx5hws_context_attr {
struct mlx5hws_table_attr {
enum mlx5hws_table_type type;
u32 level;
+ u16 uid;
};
enum mlx5hws_matcher_flow_src {
@@ -734,7 +735,6 @@ mlx5hws_action_create_push_vlan(struct mlx5hws_context *ctx, u32 flags);
* @num_dest: The number of dests attributes.
* @dests: The destination array. Each contains a destination action and can
* have additional actions.
- * @ignore_flow_level: Whether to turn on 'ignore_flow_level' for this dest.
* @flags: Action creation flags (enum mlx5hws_action_flags).
*
* Return: pointer to mlx5hws_action on success NULL otherwise.
@@ -742,7 +742,7 @@ mlx5hws_action_create_push_vlan(struct mlx5hws_context *ctx, u32 flags);
struct mlx5hws_action *
mlx5hws_action_create_dest_array(struct mlx5hws_context *ctx, size_t num_dest,
struct mlx5hws_action_dest_attr *dests,
- bool ignore_flow_level, u32 flags);
+ u32 flags);
/**
* mlx5hws_action_create_insert_header - Create insert header action.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c
index 51e4c551e0ef..d56271a9e4f0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pat_arg.c
@@ -279,7 +279,7 @@ int mlx5hws_pat_get_pattern(struct mlx5hws_context *ctx,
return ret;
clean_pattern:
- mlx5hws_cmd_header_modify_pattern_destroy(ctx->mdev, *pattern_id);
+ mlx5hws_cmd_header_modify_pattern_destroy(ctx->mdev, ptrn_id);
out_unlock:
mutex_unlock(&ctx->pattern_cache->lock);
return ret;
@@ -527,7 +527,6 @@ int mlx5hws_pat_calc_nop(__be64 *pattern, size_t num_actions,
u32 *nop_locations, __be64 *new_pat)
{
u16 prev_src_field = INVALID_FIELD, prev_dst_field = INVALID_FIELD;
- u16 src_field, dst_field;
u8 action_type;
bool dependent;
size_t i, j;
@@ -539,6 +538,9 @@ int mlx5hws_pat_calc_nop(__be64 *pattern, size_t num_actions,
return 0;
for (i = 0, j = 0; i < num_actions; i++, j++) {
+ u16 src_field = INVALID_FIELD;
+ u16 dst_field = INVALID_FIELD;
+
if (j >= max_actions)
return -EINVAL;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c
index 7e37d6e9eb83..7b5071c3df36 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c
@@ -124,6 +124,7 @@ static int hws_pool_buddy_init(struct mlx5hws_pool *pool)
mlx5hws_err(pool->ctx, "Failed to create resource type: %d size %zu\n",
pool->type, pool->alloc_log_sz);
mlx5hws_buddy_cleanup(buddy);
+ kfree(buddy);
return -ENOMEM;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c
index c4b22be19a9b..b0595c9b09e4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c
@@ -964,7 +964,6 @@ static int hws_send_ring_open_cq(struct mlx5_core_dev *mdev,
return -ENOMEM;
MLX5_SET(cqc, cqc_data, uar_page, mdev->priv.uar->index);
- MLX5_SET(cqc, cqc_data, cqe_sz, queue->num_entries);
MLX5_SET(cqc, cqc_data, log_cq_size, ilog2(queue->num_entries));
err = hws_send_ring_alloc_cq(mdev, numa_node, queue, cqc_data, cq);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c
index 568f691733f3..6113383ae47b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c
@@ -9,6 +9,7 @@ u32 mlx5hws_table_get_id(struct mlx5hws_table *tbl)
}
static void hws_table_init_next_ft_attr(struct mlx5hws_table *tbl,
+ u16 uid,
struct mlx5hws_cmd_ft_create_attr *ft_attr)
{
ft_attr->type = tbl->fw_ft_type;
@@ -16,7 +17,9 @@ static void hws_table_init_next_ft_attr(struct mlx5hws_table *tbl,
ft_attr->level = tbl->ctx->caps->fdb_ft.max_level - 1;
else
ft_attr->level = tbl->ctx->caps->nic_ft.max_level - 1;
+
ft_attr->rtc_valid = true;
+ ft_attr->uid = uid;
}
static void hws_table_set_cap_attr(struct mlx5hws_table *tbl,
@@ -119,12 +122,12 @@ static int hws_table_connect_to_default_miss_tbl(struct mlx5hws_table *tbl, u32
int mlx5hws_table_create_default_ft(struct mlx5_core_dev *mdev,
struct mlx5hws_table *tbl,
- u32 *ft_id)
+ u16 uid, u32 *ft_id)
{
struct mlx5hws_cmd_ft_create_attr ft_attr = {0};
int ret;
- hws_table_init_next_ft_attr(tbl, &ft_attr);
+ hws_table_init_next_ft_attr(tbl, uid, &ft_attr);
hws_table_set_cap_attr(tbl, &ft_attr);
ret = mlx5hws_cmd_flow_table_create(mdev, &ft_attr, ft_id);
@@ -189,7 +192,10 @@ static int hws_table_init(struct mlx5hws_table *tbl)
}
mutex_lock(&ctx->ctrl_lock);
- ret = mlx5hws_table_create_default_ft(tbl->ctx->mdev, tbl, &tbl->ft_id);
+ ret = mlx5hws_table_create_default_ft(tbl->ctx->mdev,
+ tbl,
+ tbl->uid,
+ &tbl->ft_id);
if (ret) {
mlx5hws_err(tbl->ctx, "Failed to create flow table object\n");
mutex_unlock(&ctx->ctrl_lock);
@@ -239,6 +245,7 @@ struct mlx5hws_table *mlx5hws_table_create(struct mlx5hws_context *ctx,
tbl->ctx = ctx;
tbl->type = attr->type;
tbl->level = attr->level;
+ tbl->uid = attr->uid;
ret = hws_table_init(tbl);
if (ret) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.h
index 0400cce0c317..1246f9bd8422 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.h
@@ -18,6 +18,7 @@ struct mlx5hws_table {
enum mlx5hws_table_type type;
u32 fw_ft_type;
u32 level;
+ u16 uid;
struct list_head matchers_list;
struct list_head tbl_list_node;
struct mlx5hws_default_miss default_miss;
@@ -47,7 +48,7 @@ u32 mlx5hws_table_get_res_fw_ft_type(enum mlx5hws_table_type tbl_type,
int mlx5hws_table_create_default_ft(struct mlx5_core_dev *mdev,
struct mlx5hws_table *tbl,
- u32 *ft_id);
+ u16 uid, u32 *ft_id);
void mlx5hws_table_destroy_default_ft(struct mlx5hws_table *tbl,
u32 ft_id);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 618957d65663..9a2d64a0a858 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -2375,6 +2375,8 @@ static const struct mlxsw_listener mlxsw_sp_listener[] = {
ROUTER_EXP, false),
MLXSW_SP_RXL_NO_MARK(DISCARD_ING_ROUTER_DIP_LINK_LOCAL, FORWARD,
ROUTER_EXP, false),
+ MLXSW_SP_RXL_NO_MARK(DISCARD_ING_ROUTER_SIP_LINK_LOCAL, FORWARD,
+ ROUTER_EXP, false),
/* Multicast Router Traps */
MLXSW_SP_RXL_MARK(ACL1, TRAP_TO_CPU, MULTICAST, false),
MLXSW_SP_RXL_L3_MARK(ACL2, TRAP_TO_CPU, MULTICAST, false),
diff --git a/drivers/net/ethernet/mellanox/mlxsw/trap.h b/drivers/net/ethernet/mellanox/mlxsw/trap.h
index 80ee5c4825dc..9962dc157901 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/trap.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/trap.h
@@ -94,6 +94,7 @@ enum {
MLXSW_TRAP_ID_DISCARD_ING_ROUTER_IPV4_SIP_BC = 0x16A,
MLXSW_TRAP_ID_DISCARD_ING_ROUTER_IPV4_DIP_LOCAL_NET = 0x16B,
MLXSW_TRAP_ID_DISCARD_ING_ROUTER_DIP_LINK_LOCAL = 0x16C,
+ MLXSW_TRAP_ID_DISCARD_ING_ROUTER_SIP_LINK_LOCAL = 0x16D,
MLXSW_TRAP_ID_DISCARD_ROUTER_IRIF_EN = 0x178,
MLXSW_TRAP_ID_DISCARD_ROUTER_ERIF_EN = 0x179,
MLXSW_TRAP_ID_DISCARD_ROUTER_LPM4 = 0x17B,
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c
index e67e99487a27..40581550da1a 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c
@@ -52,6 +52,8 @@ int __fbnic_open(struct fbnic_net *fbn)
fbnic_bmc_rpc_init(fbd);
fbnic_rss_reinit(fbd, fbn);
+ phylink_resume(fbn->phylink);
+
return 0;
time_stop:
fbnic_time_stop(fbn);
@@ -84,6 +86,8 @@ static int fbnic_stop(struct net_device *netdev)
{
struct fbnic_net *fbn = netdev_priv(netdev);
+ phylink_suspend(fbn->phylink, fbnic_bmc_present(fbn->fbd));
+
fbnic_down(fbn);
fbnic_pcs_free_irq(fbn->fbd);
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c
index b70e4cadb37b..28e23e3ffca8 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c
@@ -118,14 +118,12 @@ static void fbnic_service_task_start(struct fbnic_net *fbn)
struct fbnic_dev *fbd = fbn->fbd;
schedule_delayed_work(&fbd->service_task, HZ);
- phylink_resume(fbn->phylink);
}
static void fbnic_service_task_stop(struct fbnic_net *fbn)
{
struct fbnic_dev *fbd = fbn->fbd;
- phylink_suspend(fbn->phylink, fbnic_bmc_present(fbd));
cancel_delayed_work(&fbd->service_task);
}
@@ -443,11 +441,10 @@ static int __fbnic_pm_resume(struct device *dev)
/* Re-enable mailbox */
err = fbnic_fw_request_mbx(fbd);
+ devl_unlock(priv_to_devlink(fbd));
if (err)
goto err_free_irqs;
- devl_unlock(priv_to_devlink(fbd));
-
/* Only send log history if log buffer is empty to prevent duplicate
* log entries.
*/
@@ -464,20 +461,20 @@ static int __fbnic_pm_resume(struct device *dev)
rtnl_lock();
- if (netif_running(netdev)) {
+ if (netif_running(netdev))
err = __fbnic_open(fbn);
- if (err)
- goto err_free_mbx;
- }
rtnl_unlock();
+ if (err)
+ goto err_free_mbx;
return 0;
err_free_mbx:
fbnic_fw_log_disable(fbd);
- rtnl_unlock();
+ devl_lock(priv_to_devlink(fbd));
fbnic_fw_free_mbx(fbd);
+ devl_unlock(priv_to_devlink(fbd));
err_free_irqs:
fbnic_free_irqs(fbd);
err_invalidate_uc_addr:
diff --git a/drivers/net/ethernet/microchip/lan865x/lan865x.c b/drivers/net/ethernet/microchip/lan865x/lan865x.c
index dd436bdff0f8..79b800d2b72c 100644
--- a/drivers/net/ethernet/microchip/lan865x/lan865x.c
+++ b/drivers/net/ethernet/microchip/lan865x/lan865x.c
@@ -32,6 +32,10 @@
/* MAC Specific Addr 1 Top Reg */
#define LAN865X_REG_MAC_H_SADDR1 0x00010023
+/* MAC TSU Timer Increment Register */
+#define LAN865X_REG_MAC_TSU_TIMER_INCR 0x00010077
+#define MAC_TSU_TIMER_INCR_COUNT_NANOSECONDS 0x0028
+
struct lan865x_priv {
struct work_struct multicast_work;
struct net_device *netdev;
@@ -311,6 +315,8 @@ static int lan865x_net_open(struct net_device *netdev)
phy_start(netdev->phydev);
+ netif_start_queue(netdev);
+
return 0;
}
@@ -344,6 +350,21 @@ static int lan865x_probe(struct spi_device *spi)
goto free_netdev;
}
+ /* LAN865x Rev.B0/B1 configuration parameters from AN1760
+ * As per the Configuration Application Note AN1760 published in the
+ * link, https://www.microchip.com/en-us/application-notes/an1760
+ * Revision F (DS60001760G - June 2024), configure the MAC to set time
+ * stamping at the end of the Start of Frame Delimiter (SFD) and set the
+ * Timer Increment reg to 40 ns to be used as a 25 MHz internal clock.
+ */
+ ret = oa_tc6_write_register(priv->tc6, LAN865X_REG_MAC_TSU_TIMER_INCR,
+ MAC_TSU_TIMER_INCR_COUNT_NANOSECONDS);
+ if (ret) {
+ dev_err(&spi->dev, "Failed to config TSU Timer Incr reg: %d\n",
+ ret);
+ goto oa_tc6_exit;
+ }
+
/* As per the point s3 in the below errata, SPI receive Ethernet frame
* transfer may halt when starting the next frame in the same data block
* (chunk) as the end of a previous frame. The RFA field should be
@@ -402,13 +423,16 @@ static void lan865x_remove(struct spi_device *spi)
free_netdev(priv->netdev);
}
-static const struct spi_device_id spidev_spi_ids[] = {
+static const struct spi_device_id lan865x_ids[] = {
{ .name = "lan8650" },
+ { .name = "lan8651" },
{},
};
+MODULE_DEVICE_TABLE(spi, lan865x_ids);
static const struct of_device_id lan865x_dt_ids[] = {
{ .compatible = "microchip,lan8650" },
+ { .compatible = "microchip,lan8651" },
{ /* Sentinel */ }
};
MODULE_DEVICE_TABLE(of, lan865x_dt_ids);
@@ -420,7 +444,7 @@ static struct spi_driver lan865x_driver = {
},
.probe = lan865x_probe,
.remove = lan865x_remove,
- .id_table = spidev_spi_ids,
+ .id_table = lan865x_ids,
};
module_spi_driver(lan865x_driver);
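The comment in the probe hunk sets the TSU timer increment to 0x28 (40 ns) so the timestamping unit effectively counts at 25 MHz. A standalone check of that arithmetic, plain C rather than driver code:

	#include <stdio.h>

	int main(void)
	{
		const unsigned int incr_ns = 0x0028;	/* 40 ns, the value written above */
		const double clock_hz = 1e9 / incr_ns;	/* increment period -> frequency */

		printf("increment = %u ns -> %.0f MHz\n", incr_ns, clock_hz / 1e6);
		return 0;
	}

Running it prints "increment = 40 ns -> 25 MHz", matching the AN1760 figure quoted in the comment.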
diff --git a/drivers/net/ethernet/natsemi/ns83820.c b/drivers/net/ethernet/natsemi/ns83820.c
index 56d5464222d9..cdbf82affa7b 100644
--- a/drivers/net/ethernet/natsemi/ns83820.c
+++ b/drivers/net/ethernet/natsemi/ns83820.c
@@ -820,7 +820,7 @@ static void rx_irq(struct net_device *ndev)
struct ns83820 *dev = PRIV(ndev);
struct rx_info *info = &dev->rx_info;
unsigned next_rx;
- int rx_rc, len;
+ int len;
u32 cmdsts;
__le32 *desc;
unsigned long flags;
@@ -881,8 +881,10 @@ static void rx_irq(struct net_device *ndev)
if (likely(CMDSTS_OK & cmdsts)) {
#endif
skb_put(skb, len);
- if (unlikely(!skb))
+ if (unlikely(!skb)) {
+ ndev->stats.rx_dropped++;
goto netdev_mangle_me_harder_failed;
+ }
if (cmdsts & CMDSTS_DEST_MULTI)
ndev->stats.multicast++;
ndev->stats.rx_packets++;
@@ -901,15 +903,12 @@ static void rx_irq(struct net_device *ndev)
__vlan_hwaccel_put_tag(skb, htons(ETH_P_IPV6), tag);
}
#endif
- rx_rc = netif_rx(skb);
- if (NET_RX_DROP == rx_rc) {
-netdev_mangle_me_harder_failed:
- ndev->stats.rx_dropped++;
- }
+ netif_rx(skb);
} else {
dev_kfree_skb_irq(skb);
}
+netdev_mangle_me_harder_failed:
nr++;
next_rx = info->next_rx;
desc = info->descs + (DESC_SIZE * next_rx);
diff --git a/drivers/net/ethernet/oa_tc6.c b/drivers/net/ethernet/oa_tc6.c
index db200e4ec284..91a906a7918a 100644
--- a/drivers/net/ethernet/oa_tc6.c
+++ b/drivers/net/ethernet/oa_tc6.c
@@ -1249,7 +1249,8 @@ struct oa_tc6 *oa_tc6_init(struct spi_device *spi, struct net_device *netdev)
/* Set the SPI controller to pump at realtime priority */
tc6->spi->rt = true;
- spi_setup(tc6->spi);
+ if (spi_setup(tc6->spi) < 0)
+ return NULL;
tc6->spi_ctrl_tx_buf = devm_kzalloc(&tc6->spi->dev,
OA_TC6_CTRL_SPI_BUF_SIZE,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.c b/drivers/net/ethernet/qlogic/qed/qed_debug.c
index 9c3d3dd2f847..1f0cea3cae92 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_debug.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_debug.c
@@ -4462,10 +4462,11 @@ static enum dbg_status qed_protection_override_dump(struct qed_hwfn *p_hwfn,
goto out;
}
- /* Add override window info to buffer */
+ /* Add override window info to buffer, preventing buffer overflow */
override_window_dwords =
- qed_rd(p_hwfn, p_ptt, GRC_REG_NUMBER_VALID_OVERRIDE_WINDOW) *
- PROTECTION_OVERRIDE_ELEMENT_DWORDS;
+ min(qed_rd(p_hwfn, p_ptt, GRC_REG_NUMBER_VALID_OVERRIDE_WINDOW) *
+ PROTECTION_OVERRIDE_ELEMENT_DWORDS,
+ PROTECTION_OVERRIDE_DEPTH_DWORDS);
if (override_window_dwords) {
addr = BYTES_TO_DWORDS(GRC_REG_PROTECTION_OVERRIDE_WINDOW);
offset += qed_grc_dump_addr_range(p_hwfn,
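The min() in this hunk bounds the dword count read back from hardware so a bogus register value cannot push the dump past PROTECTION_OVERRIDE_DEPTH_DWORDS. The clamp-before-copy pattern in isolation, with a hypothetical fixed capacity instead of the qed register layout:

	#include <linux/minmax.h>
	#include <linux/string.h>
	#include <linux/types.h>

	#define DUMP_DEPTH_DWORDS 32	/* hypothetical buffer capacity */

	/* Copy at most DUMP_DEPTH_DWORDS even if the device reports more. */
	static size_t dump_window(u32 *dst, const u32 *src, u32 reported_dwords)
	{
		u32 n = min_t(u32, reported_dwords, DUMP_DEPTH_DWORDS);

		memcpy(dst, src, n * sizeof(*dst));
		return n;
	}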
diff --git a/drivers/net/ethernet/realtek/rtase/rtase.h b/drivers/net/ethernet/realtek/rtase/rtase.h
index 20decdeb9fdb..b9209eb6ea73 100644
--- a/drivers/net/ethernet/realtek/rtase/rtase.h
+++ b/drivers/net/ethernet/realtek/rtase/rtase.h
@@ -241,7 +241,7 @@ union rtase_rx_desc {
#define RTASE_RX_RES BIT(20)
#define RTASE_RX_RUNT BIT(19)
#define RTASE_RX_RWT BIT(18)
-#define RTASE_RX_CRC BIT(16)
+#define RTASE_RX_CRC BIT(17)
#define RTASE_RX_V6F BIT(31)
#define RTASE_RX_V4F BIT(30)
#define RTASE_RX_UDPT BIT(29)
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c
index 09ae16e026eb..6c363f9b0ce2 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c
@@ -330,15 +330,11 @@ static int dwc_eth_dwmac_probe(struct platform_device *pdev)
if (IS_ERR(plat_dat))
return PTR_ERR(plat_dat);
- ret = devm_clk_bulk_get_all(&pdev->dev, &plat_dat->clks);
+ ret = devm_clk_bulk_get_all_enabled(&pdev->dev, &plat_dat->clks);
if (ret < 0)
- return dev_err_probe(&pdev->dev, ret, "Failed to retrieve all required clocks\n");
+ return dev_err_probe(&pdev->dev, ret, "Failed to retrieve and enable all required clocks\n");
plat_dat->num_clks = ret;
- ret = clk_bulk_prepare_enable(plat_dat->num_clks, plat_dat->clks);
- if (ret)
- return dev_err_probe(&pdev->dev, ret, "Failed to enable clocks\n");
-
plat_dat->stmmac_clk = stmmac_pltfr_find_clk(plat_dat,
data->stmmac_clk_name);
@@ -346,7 +342,6 @@ static int dwc_eth_dwmac_probe(struct platform_device *pdev)
ret = data->probe(pdev, plat_dat, &stmmac_res);
if (ret < 0) {
dev_err_probe(&pdev->dev, ret, "failed to probe subdriver\n");
- clk_bulk_disable_unprepare(plat_dat->num_clks, plat_dat->clks);
return ret;
}
@@ -370,15 +365,11 @@ remove:
static void dwc_eth_dwmac_remove(struct platform_device *pdev)
{
const struct dwc_eth_dwmac_data *data = device_get_match_data(&pdev->dev);
- struct plat_stmmacenet_data *plat_dat = dev_get_platdata(&pdev->dev);
stmmac_dvr_remove(&pdev->dev);
if (data->remove)
data->remove(pdev);
-
- if (plat_dat)
- clk_bulk_disable_unprepare(plat_dat->num_clks, plat_dat->clks);
}
static const struct of_device_id dwc_eth_dwmac_match[] = {
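Switching to devm_clk_bulk_get_all_enabled() ties the prepare/enable lifetime to the device, which is why the explicit clk_bulk_disable_unprepare() calls in the probe error path and in remove can be dropped. A hedged sketch of the pattern in a generic platform driver, not the dwmac glue itself:

	#include <linux/clk.h>
	#include <linux/device.h>
	#include <linux/platform_device.h>

	static int example_probe(struct platform_device *pdev)
	{
		struct clk_bulk_data *clks;
		int num;

		/* Clocks are acquired, prepared and enabled here; devres
		 * disables and releases them automatically on probe failure
		 * or driver unbind, so no manual cleanup path is needed.
		 */
		num = devm_clk_bulk_get_all_enabled(&pdev->dev, &clks);
		if (num < 0)
			return dev_err_probe(&pdev->dev, num,
					     "failed to get and enable clocks\n");

		return 0;
	}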
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
index 79b92130a03f..f6687c2f30f6 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
@@ -1765,11 +1765,15 @@ err_gmac_powerdown:
static void rk_gmac_remove(struct platform_device *pdev)
{
- struct rk_priv_data *bsp_priv = get_stmmac_bsp_priv(&pdev->dev);
+ struct stmmac_priv *priv = netdev_priv(platform_get_drvdata(pdev));
+ struct rk_priv_data *bsp_priv = priv->plat->bsp_priv;
stmmac_dvr_remove(&pdev->dev);
rk_gmac_powerdown(bsp_priv);
+
+ if (priv->plat->phy_node && bsp_priv->integrated_phy)
+ clk_put(bsp_priv->clk_phy);
}
#ifdef CONFIG_PM_SLEEP
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c
index c72ee759aae5..6c6c49e4b66f 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c
@@ -152,7 +152,7 @@ static int thead_set_clk_tx_rate(void *bsp_priv, struct clk *clk_tx_i,
static int thead_dwmac_enable_clk(struct plat_stmmacenet_data *plat)
{
struct thead_dwmac *dwmac = plat->bsp_priv;
- u32 reg;
+ u32 reg, div;
switch (plat->mac_interface) {
case PHY_INTERFACE_MODE_MII:
@@ -164,6 +164,13 @@ static int thead_dwmac_enable_clk(struct plat_stmmacenet_data *plat)
case PHY_INTERFACE_MODE_RGMII_RXID:
case PHY_INTERFACE_MODE_RGMII_TXID:
/* use pll */
+ div = clk_get_rate(plat->stmmac_clk) / rgmii_clock(SPEED_1000);
+ reg = FIELD_PREP(GMAC_PLLCLK_DIV_EN, 1) |
+ FIELD_PREP(GMAC_PLLCLK_DIV_NUM, div);
+
+ writel(0, dwmac->apb_base + GMAC_PLLCLK_DIV);
+ writel(reg, dwmac->apb_base + GMAC_PLLCLK_DIV);
+
writel(GMAC_GTXCLK_SEL_PLL, dwmac->apb_base + GMAC_GTXCLK_SEL);
reg = GMAC_TX_CLK_EN | GMAC_TX_CLK_N_EN | GMAC_TX_CLK_OUT_EN |
GMAC_RX_CLK_EN | GMAC_RX_CLK_N_EN;
@@ -211,6 +218,7 @@ static int thead_dwmac_probe(struct platform_device *pdev)
struct stmmac_resources stmmac_res;
struct plat_stmmacenet_data *plat;
struct thead_dwmac *dwmac;
+ struct clk *apb_clk;
void __iomem *apb;
int ret;
@@ -224,6 +232,19 @@ static int thead_dwmac_probe(struct platform_device *pdev)
return dev_err_probe(&pdev->dev, PTR_ERR(plat),
"dt configuration failed\n");
+ /*
+ * The APB clock is essential for accessing glue registers. However,
+ * old devicetrees don't describe it correctly. We continue to probe
+ * and emit a warning if it isn't present.
+ */
+ apb_clk = devm_clk_get_enabled(&pdev->dev, "apb");
+ if (PTR_ERR(apb_clk) == -ENOENT)
+ dev_warn(&pdev->dev,
+ "cannot get apb clock, link may break after speed changes\n");
+ else if (IS_ERR(apb_clk))
+ return dev_err_probe(&pdev->dev, PTR_ERR(apb_clk),
+ "failed to get apb clock\n");
+
dwmac = devm_kzalloc(&pdev->dev, sizeof(*dwmac), GFP_KERNEL);
if (!dwmac)
return -ENOMEM;
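The probe hunk treats the "apb" clock as effectively optional: -ENOENT (old device trees that do not describe it) only produces a warning, while any other error, including probe deferral, aborts probe. A minimal sketch of that distinction, keeping only the error-handling shape:

	#include <linux/clk.h>
	#include <linux/device.h>
	#include <linux/err.h>

	static int example_get_apb_clk(struct device *dev)
	{
		struct clk *apb_clk = devm_clk_get_enabled(dev, "apb");

		if (PTR_ERR(apb_clk) == -ENOENT) {
			/* Missing in old DTs: warn but keep probing. */
			dev_warn(dev, "no apb clock described, continuing\n");
			return 0;
		}
		if (IS_ERR(apb_clk))
			/* Any other failure (e.g. -EPROBE_DEFER) is fatal. */
			return dev_err_probe(dev, PTR_ERR(apb_clk),
					     "failed to get apb clock\n");
		return 0;
	}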
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
index 6cadf8de4fdf..00e929bf280b 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
@@ -49,6 +49,14 @@ static void dwxgmac2_core_init(struct mac_device_info *hw,
writel(XGMAC_INT_DEFAULT_EN, ioaddr + XGMAC_INT_EN);
}
+static void dwxgmac2_update_caps(struct stmmac_priv *priv)
+{
+ if (!priv->dma_cap.mbps_10_100)
+ priv->hw->link.caps &= ~(MAC_10 | MAC_100);
+ else if (!priv->dma_cap.half_duplex)
+ priv->hw->link.caps &= ~(MAC_10HD | MAC_100HD);
+}
+
static void dwxgmac2_set_mac(void __iomem *ioaddr, bool enable)
{
u32 tx = readl(ioaddr + XGMAC_TX_CONFIG);
@@ -1424,6 +1432,7 @@ static void dwxgmac2_set_arp_offload(struct mac_device_info *hw, bool en,
const struct stmmac_ops dwxgmac210_ops = {
.core_init = dwxgmac2_core_init,
+ .update_caps = dwxgmac2_update_caps,
.set_mac = dwxgmac2_set_mac,
.rx_ipc = dwxgmac2_rx_ipc,
.rx_queue_enable = dwxgmac2_rx_queue_enable,
@@ -1532,8 +1541,8 @@ int dwxgmac2_setup(struct stmmac_priv *priv)
mac->mcast_bits_log2 = ilog2(mac->multicast_filter_bins);
mac->link.caps = MAC_ASYM_PAUSE | MAC_SYM_PAUSE |
- MAC_1000FD | MAC_2500FD | MAC_5000FD |
- MAC_10000FD;
+ MAC_10 | MAC_100 | MAC_1000FD |
+ MAC_2500FD | MAC_5000FD | MAC_10000FD;
mac->link.duplex = 0;
mac->link.speed10 = XGMAC_CONFIG_SS_10_MII;
mac->link.speed100 = XGMAC_CONFIG_SS_100_MII;
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c
index 5dcc95bc0ad2..4d6bb995d8d8 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c
@@ -203,10 +203,6 @@ static void dwxgmac2_dma_rx_mode(struct stmmac_priv *priv, void __iomem *ioaddr,
}
writel(value, ioaddr + XGMAC_MTL_RXQ_OPMODE(channel));
-
- /* Enable MTL RX overflow */
- value = readl(ioaddr + XGMAC_MTL_QINTEN(channel));
- writel(value | XGMAC_RXOIE, ioaddr + XGMAC_MTL_QINTEN(channel));
}
static void dwxgmac2_dma_tx_mode(struct stmmac_priv *priv, void __iomem *ioaddr,
@@ -386,8 +382,11 @@ static int dwxgmac2_dma_interrupt(struct stmmac_priv *priv,
static int dwxgmac2_get_hw_feature(void __iomem *ioaddr,
struct dma_features *dma_cap)
{
+ struct stmmac_priv *priv;
u32 hw_cap;
+ priv = container_of(dma_cap, struct stmmac_priv, dma_cap);
+
/* MAC HW feature 0 */
hw_cap = readl(ioaddr + XGMAC_HW_FEATURE0);
dma_cap->edma = (hw_cap & XGMAC_HWFEAT_EDMA) >> 31;
@@ -410,6 +409,8 @@ static int dwxgmac2_get_hw_feature(void __iomem *ioaddr,
dma_cap->vlhash = (hw_cap & XGMAC_HWFEAT_VLHASH) >> 4;
dma_cap->half_duplex = (hw_cap & XGMAC_HWFEAT_HDSEL) >> 3;
dma_cap->mbps_1000 = (hw_cap & XGMAC_HWFEAT_GMIISEL) >> 1;
+ if (dma_cap->mbps_1000 && priv->synopsys_id >= DWXGMAC_CORE_2_20)
+ dma_cap->mbps_10_100 = 1;
/* MAC HW feature 1 */
hw_cap = readl(ioaddr + XGMAC_HW_FEATURE1);
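The get_hw_feature() hunk recovers the driver private structure from the dma_cap pointer with container_of(), which works because dma_cap is embedded inside struct stmmac_priv; that is what lets the function consult priv->synopsys_id. The idiom in isolation, with hypothetical structures:

	#include <linux/container_of.h>

	struct caps {
		unsigned int mbps_10_100:1;
	};

	struct priv {
		unsigned int core_version;
		struct caps dma_cap;	/* embedded member */
	};

	/* Given only a pointer to the embedded member, step back to the
	 * containing structure.
	 */
	static struct priv *priv_from_caps(struct caps *cap)
	{
		return container_of(cap, struct priv, dma_cap);
	}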
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index f1abf4242cd2..7b16d1207b80 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -2584,6 +2584,7 @@ static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget)
struct netdev_queue *nq = netdev_get_tx_queue(priv->dev, queue);
struct stmmac_tx_queue *tx_q = &priv->dma_conf.tx_queue[queue];
struct stmmac_txq_stats *txq_stats = &priv->xstats.txq_stats[queue];
+ bool csum = !priv->plat->tx_queues_cfg[queue].coe_unsupported;
struct xsk_buff_pool *pool = tx_q->xsk_pool;
unsigned int entry = tx_q->cur_tx;
struct dma_desc *tx_desc = NULL;
@@ -2671,7 +2672,7 @@ static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget)
}
stmmac_prepare_tx_desc(priv, tx_desc, 1, xdp_desc.len,
- true, priv->mode, true, true,
+ csum, priv->mode, true, true,
xdp_desc.len);
stmmac_enable_dma_transmission(priv, priv->ioaddr, queue);
@@ -4983,6 +4984,7 @@ static int stmmac_xdp_xmit_xdpf(struct stmmac_priv *priv, int queue,
{
struct stmmac_txq_stats *txq_stats = &priv->xstats.txq_stats[queue];
struct stmmac_tx_queue *tx_q = &priv->dma_conf.tx_queue[queue];
+ bool csum = !priv->plat->tx_queues_cfg[queue].coe_unsupported;
unsigned int entry = tx_q->cur_tx;
struct dma_desc *tx_desc;
dma_addr_t dma_addr;
@@ -5034,7 +5036,7 @@ static int stmmac_xdp_xmit_xdpf(struct stmmac_priv *priv, int queue,
stmmac_set_desc_addr(priv, tx_desc, dma_addr);
stmmac_prepare_tx_desc(priv, tx_desc, 1, xdpf->len,
- true, priv->mode, true, true,
+ csum, priv->mode, true, true,
xdpf->len);
tx_q->tx_count_frames++;
diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
index ecd6ecac87bb..8b2364f5f731 100644
--- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c
+++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
@@ -1522,7 +1522,7 @@ static int am65_cpsw_nuss_tx_compl_packets(struct am65_cpsw_common *common,
}
}
- if (single_port) {
+ if (single_port && num_tx) {
netif_txq = netdev_get_tx_queue(ndev, chn);
netdev_tx_completed_queue(netif_txq, num_tx, total_bytes);
am65_cpsw_nuss_tx_wake(tx_chn, ndev, netif_txq);
diff --git a/drivers/net/ethernet/ti/icssg/icss_iep.c b/drivers/net/ethernet/ti/icssg/icss_iep.c
index 50bfbc2779e4..d8c9fe1d98c4 100644
--- a/drivers/net/ethernet/ti/icssg/icss_iep.c
+++ b/drivers/net/ethernet/ti/icssg/icss_iep.c
@@ -621,7 +621,8 @@ exit:
static int icss_iep_extts_enable(struct icss_iep *iep, u32 index, int on)
{
- u32 val, cap, ret = 0;
+ u32 val, cap;
+ int ret = 0;
mutex_lock(&iep->ptp_clk_mutex);
diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c
index 2b973d6e2341..e42d0fdefee1 100644
--- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c
+++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c
@@ -50,6 +50,8 @@
/* CTRLMMR_ICSSG_RGMII_CTRL register bits */
#define ICSSG_CTRL_RGMII_ID_MODE BIT(24)
+static void emac_adjust_link(struct net_device *ndev);
+
static int emac_get_tx_ts(struct prueth_emac *emac,
struct emac_tx_ts_response *rsp)
{
@@ -201,6 +203,44 @@ static void prueth_emac_stop(struct prueth *prueth)
}
}
+static void icssg_enable_fw_offload(struct prueth *prueth)
+{
+ struct prueth_emac *emac;
+ int mac;
+
+ for (mac = PRUETH_MAC0; mac < PRUETH_NUM_MACS; mac++) {
+ emac = prueth->emac[mac];
+ if (prueth->is_hsr_offload_mode) {
+ if (emac->ndev->features & NETIF_F_HW_HSR_TAG_RM)
+ icssg_set_port_state(emac, ICSSG_EMAC_HSR_RX_OFFLOAD_ENABLE);
+ else
+ icssg_set_port_state(emac, ICSSG_EMAC_HSR_RX_OFFLOAD_DISABLE);
+ }
+
+ if (prueth->is_switch_mode || prueth->is_hsr_offload_mode) {
+ if (netif_running(emac->ndev)) {
+ icssg_fdb_add_del(emac, eth_stp_addr, prueth->default_vlan,
+ ICSSG_FDB_ENTRY_P0_MEMBERSHIP |
+ ICSSG_FDB_ENTRY_P1_MEMBERSHIP |
+ ICSSG_FDB_ENTRY_P2_MEMBERSHIP |
+ ICSSG_FDB_ENTRY_BLOCK,
+ true);
+ icssg_vtbl_modify(emac, emac->port_vlan | DEFAULT_VID,
+ BIT(emac->port_id) | DEFAULT_PORT_MASK,
+ BIT(emac->port_id) | DEFAULT_UNTAG_MASK,
+ true);
+ if (prueth->is_hsr_offload_mode)
+ icssg_vtbl_modify(emac, DEFAULT_VID,
+ DEFAULT_PORT_MASK,
+ DEFAULT_UNTAG_MASK, true);
+ icssg_set_pvid(prueth, emac->port_vlan, emac->port_id);
+ if (prueth->is_switch_mode)
+ icssg_set_port_state(emac, ICSSG_EMAC_PORT_VLAN_AWARE_ENABLE);
+ }
+ }
+ }
+}
+
static int prueth_emac_common_start(struct prueth *prueth)
{
struct prueth_emac *emac;
@@ -229,6 +269,10 @@ static int prueth_emac_common_start(struct prueth *prueth)
ret = icssg_config(prueth, emac, slice);
if (ret)
goto disable_class;
+
+ mutex_lock(&emac->ndev->phydev->lock);
+ emac_adjust_link(emac->ndev);
+ mutex_unlock(&emac->ndev->phydev->lock);
}
ret = prueth_emac_start(prueth);
@@ -610,7 +654,7 @@ static void icssg_prueth_hsr_fdb_add_del(struct prueth_emac *emac,
static int icssg_prueth_hsr_add_mcast(struct net_device *ndev, const u8 *addr)
{
- struct net_device *real_dev;
+ struct net_device *real_dev, *port_dev;
struct prueth_emac *emac;
u8 vlan_id, i;
@@ -619,11 +663,15 @@ static int icssg_prueth_hsr_add_mcast(struct net_device *ndev, const u8 *addr)
if (is_hsr_master(real_dev)) {
for (i = HSR_PT_SLAVE_A; i < HSR_PT_INTERLINK; i++) {
- emac = netdev_priv(hsr_get_port_ndev(real_dev, i));
- if (!emac)
+ port_dev = hsr_get_port_ndev(real_dev, i);
+ emac = netdev_priv(port_dev);
+ if (!emac) {
+ dev_put(port_dev);
return -EINVAL;
+ }
icssg_prueth_hsr_fdb_add_del(emac, addr, vlan_id,
true);
+ dev_put(port_dev);
}
} else {
emac = netdev_priv(real_dev);
@@ -635,7 +683,7 @@ static int icssg_prueth_hsr_add_mcast(struct net_device *ndev, const u8 *addr)
static int icssg_prueth_hsr_del_mcast(struct net_device *ndev, const u8 *addr)
{
- struct net_device *real_dev;
+ struct net_device *real_dev, *port_dev;
struct prueth_emac *emac;
u8 vlan_id, i;
@@ -644,11 +692,15 @@ static int icssg_prueth_hsr_del_mcast(struct net_device *ndev, const u8 *addr)
if (is_hsr_master(real_dev)) {
for (i = HSR_PT_SLAVE_A; i < HSR_PT_INTERLINK; i++) {
- emac = netdev_priv(hsr_get_port_ndev(real_dev, i));
- if (!emac)
+ port_dev = hsr_get_port_ndev(real_dev, i);
+ emac = netdev_priv(port_dev);
+ if (!emac) {
+ dev_put(port_dev);
return -EINVAL;
+ }
icssg_prueth_hsr_fdb_add_del(emac, addr, vlan_id,
false);
+ dev_put(port_dev);
}
} else {
emac = netdev_priv(real_dev);
@@ -747,6 +799,7 @@ static int emac_ndo_open(struct net_device *ndev)
ret = prueth_emac_common_start(prueth);
if (ret)
goto free_rx_irq;
+ icssg_enable_fw_offload(prueth);
}
flow_cfg = emac->dram.va + ICSSG_CONFIG_OFFSET + PSI_L_REGULAR_FLOW_ID_BASE_OFFSET;
@@ -1354,8 +1407,7 @@ static int prueth_emac_restart(struct prueth *prueth)
static void icssg_change_mode(struct prueth *prueth)
{
- struct prueth_emac *emac;
- int mac, ret;
+ int ret;
ret = prueth_emac_restart(prueth);
if (ret) {
@@ -1363,35 +1415,7 @@ static void icssg_change_mode(struct prueth *prueth)
return;
}
- for (mac = PRUETH_MAC0; mac < PRUETH_NUM_MACS; mac++) {
- emac = prueth->emac[mac];
- if (prueth->is_hsr_offload_mode) {
- if (emac->ndev->features & NETIF_F_HW_HSR_TAG_RM)
- icssg_set_port_state(emac, ICSSG_EMAC_HSR_RX_OFFLOAD_ENABLE);
- else
- icssg_set_port_state(emac, ICSSG_EMAC_HSR_RX_OFFLOAD_DISABLE);
- }
-
- if (netif_running(emac->ndev)) {
- icssg_fdb_add_del(emac, eth_stp_addr, prueth->default_vlan,
- ICSSG_FDB_ENTRY_P0_MEMBERSHIP |
- ICSSG_FDB_ENTRY_P1_MEMBERSHIP |
- ICSSG_FDB_ENTRY_P2_MEMBERSHIP |
- ICSSG_FDB_ENTRY_BLOCK,
- true);
- icssg_vtbl_modify(emac, emac->port_vlan | DEFAULT_VID,
- BIT(emac->port_id) | DEFAULT_PORT_MASK,
- BIT(emac->port_id) | DEFAULT_UNTAG_MASK,
- true);
- if (prueth->is_hsr_offload_mode)
- icssg_vtbl_modify(emac, DEFAULT_VID,
- DEFAULT_PORT_MASK,
- DEFAULT_UNTAG_MASK, true);
- icssg_set_pvid(prueth, emac->port_vlan, emac->port_id);
- if (prueth->is_switch_mode)
- icssg_set_port_state(emac, ICSSG_EMAC_PORT_VLAN_AWARE_ENABLE);
- }
- }
+ icssg_enable_fw_offload(prueth);
}
static int prueth_netdevice_port_link(struct net_device *ndev,
diff --git a/drivers/net/ethernet/wangxun/libwx/wx_hw.c b/drivers/net/ethernet/wangxun/libwx/wx_hw.c
index bcd07a715752..5cb353a97d6d 100644
--- a/drivers/net/ethernet/wangxun/libwx/wx_hw.c
+++ b/drivers/net/ethernet/wangxun/libwx/wx_hw.c
@@ -2078,10 +2078,6 @@ static void wx_setup_mrqc(struct wx *wx)
{
u32 rss_field = 0;
- /* VT, and RSS do not coexist at the same time */
- if (test_bit(WX_FLAG_VMDQ_ENABLED, wx->flags))
- return;
-
/* Disable indicating checksum in descriptor, enables RSS hash */
wr32m(wx, WX_PSR_CTL, WX_PSR_CTL_PCSD, WX_PSR_CTL_PCSD);
diff --git a/drivers/net/ethernet/wangxun/libwx/wx_vf_lib.c b/drivers/net/ethernet/wangxun/libwx/wx_vf_lib.c
index 5d48df7a849f..3023ea2732ef 100644
--- a/drivers/net/ethernet/wangxun/libwx/wx_vf_lib.c
+++ b/drivers/net/ethernet/wangxun/libwx/wx_vf_lib.c
@@ -192,7 +192,7 @@ void wx_setup_vfmrqc_vf(struct wx *wx)
u8 i, j;
/* Fill out hash function seeds */
- netdev_rss_key_fill(wx->rss_key, sizeof(wx->rss_key));
+ netdev_rss_key_fill(wx->rss_key, WX_RSS_KEY_SIZE);
for (i = 0; i < WX_RSS_KEY_SIZE / 4; i++)
wr32(wx, WX_VXRSSRK(i), wx->rss_key[i]);
diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
index 6011d7eae0c7..ec6d47dc984a 100644
--- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
+++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
@@ -1160,6 +1160,7 @@ static void axienet_dma_rx_cb(void *data, const struct dmaengine_result *result)
struct axienet_local *lp = data;
struct sk_buff *skb;
u32 *app_metadata;
+ int i;
skbuf_dma = axienet_get_rx_desc(lp, lp->rx_ring_tail++);
skb = skbuf_dma->skb;
@@ -1167,6 +1168,15 @@ static void axienet_dma_rx_cb(void *data, const struct dmaengine_result *result)
&meta_max_len);
dma_unmap_single(lp->dev, skbuf_dma->dma_address, lp->max_frm_size,
DMA_FROM_DEVICE);
+
+ if (IS_ERR(app_metadata)) {
+ if (net_ratelimit())
+ netdev_err(lp->ndev, "Failed to get RX metadata pointer\n");
+ dev_kfree_skb_any(skb);
+ lp->ndev->stats.rx_dropped++;
+ goto rx_submit;
+ }
+
/* TODO: Derive app word index programmatically */
rx_len = (app_metadata[LEN_APP] & 0xFFFF);
skb_put(skb, rx_len);
@@ -1178,7 +1188,11 @@ static void axienet_dma_rx_cb(void *data, const struct dmaengine_result *result)
u64_stats_add(&lp->rx_packets, 1);
u64_stats_add(&lp->rx_bytes, rx_len);
u64_stats_update_end(&lp->rx_stat_sync);
- axienet_rx_submit_desc(lp->ndev);
+
+rx_submit:
+ for (i = 0; i < CIRC_SPACE(lp->rx_ring_head, lp->rx_ring_tail,
+ RX_BUF_NUM_DEFAULT); i++)
+ axienet_rx_submit_desc(lp->ndev);
dma_async_issue_pending(lp->rx_chan);
}
@@ -1457,7 +1471,6 @@ static void axienet_rx_submit_desc(struct net_device *ndev)
if (!skbuf_dma)
return;
- lp->rx_ring_head++;
skb = netdev_alloc_skb(ndev, lp->max_frm_size);
if (!skb)
return;
@@ -1482,6 +1495,7 @@ static void axienet_rx_submit_desc(struct net_device *ndev)
skbuf_dma->desc = dma_rx_desc;
dma_rx_desc->callback_param = lp;
dma_rx_desc->callback_result = axienet_dma_rx_cb;
+ lp->rx_ring_head++;
dmaengine_submit(dma_rx_desc);
return;
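The rx_submit loop above uses CIRC_SPACE() so the completion callback tops the RX ring back up with exactly as many descriptors as are free, and rx_ring_head is now advanced only once a descriptor has been fully prepared. A small standalone illustration of the ring accounting, mirroring the macros from linux/circ_buf.h (ring size must be a power of two):

	#include <stdio.h>

	/* Same definitions as linux/circ_buf.h. */
	#define CIRC_CNT(head, tail, size)   (((head) - (tail)) & ((size) - 1))
	#define CIRC_SPACE(head, tail, size) CIRC_CNT((tail), ((head) + 1), (size))

	int main(void)
	{
		unsigned int size = 128;	/* hypothetical ring depth */
		unsigned int head = 10, tail = 4;

		/* Descriptors in flight vs. slots free for resubmission. */
		printf("in flight: %u, refill budget: %u\n",
		       CIRC_CNT(head, tail, size), CIRC_SPACE(head, tail, size));
		return 0;
	}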
diff --git a/drivers/net/ethernet/xircom/xirc2ps_cs.c b/drivers/net/ethernet/xircom/xirc2ps_cs.c
index a31d5d5e6593..97e88886253f 100644
--- a/drivers/net/ethernet/xircom/xirc2ps_cs.c
+++ b/drivers/net/ethernet/xircom/xirc2ps_cs.c
@@ -1576,7 +1576,7 @@ do_reset(struct net_device *dev, int full)
msleep(40); /* wait 40 msec to let it complete */
}
if (full_duplex)
- PutByte(XIRCREG1_ECR, GetByte(XIRCREG1_ECR | FullDuplex));
+ PutByte(XIRCREG1_ECR, GetByte(XIRCREG1_ECR) | FullDuplex);
} else { /* No MII */
SelectPage(0);
value = GetByte(XIRCREG_ESR); /* read the ESR */
diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c
index 0e0fe32d2da4..045c5177262e 100644
--- a/drivers/net/hamradio/bpqether.c
+++ b/drivers/net/hamradio/bpqether.c
@@ -138,7 +138,7 @@ static inline struct net_device *bpq_get_ax25_dev(struct net_device *dev)
static inline int dev_is_ethdev(struct net_device *dev)
{
- return dev->type == ARPHRD_ETHER && strncmp(dev->name, "dummy", 5);
+ return dev->type == ARPHRD_ETHER && !netdev_need_ops_lock(dev);
}
/* ------------------------------------------------------------------------ */
diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index cb6f5482d203..7397c693f984 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -1061,6 +1061,7 @@ struct net_device_context {
struct net_device __rcu *vf_netdev;
struct netvsc_vf_pcpu_stats __percpu *vf_stats;
struct delayed_work vf_takeover;
+ struct delayed_work vfns_work;
/* 1: allocated, serial number is valid. 0: not allocated */
u32 vf_alloc;
@@ -1075,6 +1076,8 @@ struct net_device_context {
struct netvsc_device_info *saved_netvsc_dev_info;
};
+void netvsc_vfns_work(struct work_struct *w);
+
/* Azure hosts don't support non-TCP port numbers in hashing for fragmented
* packets. We can use ethtool to change UDP hash level when necessary.
*/
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 720104661d7f..60a4629fe6ba 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -1812,6 +1812,11 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device,
/* Enable NAPI handler before init callbacks */
netif_napi_add(ndev, &net_device->chan_table[0].napi, netvsc_poll);
+ napi_enable(&net_device->chan_table[0].napi);
+ netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_RX,
+ &net_device->chan_table[0].napi);
+ netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_TX,
+ &net_device->chan_table[0].napi);
/* Open the channel */
device->channel->next_request_id_callback = vmbus_next_request_id;
@@ -1831,12 +1836,6 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device,
/* Channel is opened */
netdev_dbg(ndev, "hv_netvsc channel opened successfully\n");
- napi_enable(&net_device->chan_table[0].napi);
- netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_RX,
- &net_device->chan_table[0].napi);
- netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_TX,
- &net_device->chan_table[0].napi);
-
/* Connect with the NetVsp */
ret = netvsc_connect_vsp(device, net_device, device_info);
if (ret != 0) {
@@ -1854,14 +1853,14 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device,
close:
RCU_INIT_POINTER(net_device_ctx->nvdev, NULL);
- netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_TX, NULL);
- netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_RX, NULL);
- napi_disable(&net_device->chan_table[0].napi);
/* Now, we can close the channel safely */
vmbus_close(device->channel);
cleanup:
+ netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_TX, NULL);
+ netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_RX, NULL);
+ napi_disable(&net_device->chan_table[0].napi);
netif_napi_del(&net_device->chan_table[0].napi);
cleanup2:
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index f44753756358..39c892e46cb0 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -2522,6 +2522,7 @@ static int netvsc_probe(struct hv_device *dev,
spin_lock_init(&net_device_ctx->lock);
INIT_LIST_HEAD(&net_device_ctx->reconfig_events);
INIT_DELAYED_WORK(&net_device_ctx->vf_takeover, netvsc_vf_setup);
+ INIT_DELAYED_WORK(&net_device_ctx->vfns_work, netvsc_vfns_work);
net_device_ctx->vf_stats
= netdev_alloc_pcpu_stats(struct netvsc_vf_pcpu_stats);
@@ -2666,6 +2667,8 @@ static void netvsc_remove(struct hv_device *dev)
cancel_delayed_work_sync(&ndev_ctx->dwork);
rtnl_lock();
+ cancel_delayed_work_sync(&ndev_ctx->vfns_work);
+
nvdev = rtnl_dereference(ndev_ctx->nvdev);
if (nvdev) {
cancel_work_sync(&nvdev->subchan_work);
@@ -2707,6 +2710,7 @@ static int netvsc_suspend(struct hv_device *dev)
cancel_delayed_work_sync(&ndev_ctx->dwork);
rtnl_lock();
+ cancel_delayed_work_sync(&ndev_ctx->vfns_work);
nvdev = rtnl_dereference(ndev_ctx->nvdev);
if (nvdev == NULL) {
@@ -2800,6 +2804,27 @@ static void netvsc_event_set_vf_ns(struct net_device *ndev)
}
}
+void netvsc_vfns_work(struct work_struct *w)
+{
+ struct net_device_context *ndev_ctx =
+ container_of(w, struct net_device_context, vfns_work.work);
+ struct net_device *ndev;
+
+ if (!rtnl_trylock()) {
+ schedule_delayed_work(&ndev_ctx->vfns_work, 1);
+ return;
+ }
+
+ ndev = hv_get_drvdata(ndev_ctx->device_ctx);
+ if (!ndev)
+ goto out;
+
+ netvsc_event_set_vf_ns(ndev);
+
+out:
+ rtnl_unlock();
+}
+
/*
* On Hyper-V, every VF interface is matched with a corresponding
* synthetic interface. The synthetic interface is presented first
@@ -2810,10 +2835,12 @@ static int netvsc_netdev_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+ struct net_device_context *ndev_ctx;
int ret = 0;
if (event_dev->netdev_ops == &device_ops && event == NETDEV_REGISTER) {
- netvsc_event_set_vf_ns(event_dev);
+ ndev_ctx = netdev_priv(event_dev);
+ schedule_delayed_work(&ndev_ctx->vfns_work, 0);
return NOTIFY_DONE;
}
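netvsc_vfns_work() above uses the usual way to take the RTNL from a workqueue without deadlocking against a path that holds RTNL while cancelling the work: rtnl_trylock(), and if that fails, requeue the work and return. A hedged sketch of the same shape with a hypothetical work item, not the hv_netvsc context:

	#include <linux/rtnetlink.h>
	#include <linux/workqueue.h>

	struct my_ctx {
		struct delayed_work work;
	};

	static void my_work_fn(struct work_struct *w)
	{
		struct my_ctx *ctx = container_of(w, struct my_ctx, work.work);

		if (!rtnl_trylock()) {
			/* Someone holds RTNL, possibly waiting to cancel us:
			 * back off and retry on the next tick.
			 */
			schedule_delayed_work(&ctx->work, 1);
			return;
		}

		/* ... do the RTNL-protected update here ... */

		rtnl_unlock();
	}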
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index 9e73959e61ee..c35f9685b6bf 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -1252,17 +1252,26 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc)
new_sc->rqstor_size = netvsc_rqstor_size(netvsc_ring_bytes);
new_sc->max_pkt_size = NETVSC_MAX_PKT_SIZE;
+ /* Enable napi before opening the vmbus channel to avoid races
+ * as the host placing data on the host->guest ring may be left
+ * out if napi was not enabled.
+ */
+ napi_enable(&nvchan->napi);
+ netif_queue_set_napi(ndev, chn_index, NETDEV_QUEUE_TYPE_RX,
+ &nvchan->napi);
+ netif_queue_set_napi(ndev, chn_index, NETDEV_QUEUE_TYPE_TX,
+ &nvchan->napi);
+
ret = vmbus_open(new_sc, netvsc_ring_bytes,
netvsc_ring_bytes, NULL, 0,
netvsc_channel_cb, nvchan);
- if (ret == 0) {
- napi_enable(&nvchan->napi);
- netif_queue_set_napi(ndev, chn_index, NETDEV_QUEUE_TYPE_RX,
- &nvchan->napi);
- netif_queue_set_napi(ndev, chn_index, NETDEV_QUEUE_TYPE_TX,
- &nvchan->napi);
- } else {
+ if (ret != 0) {
netdev_notice(ndev, "sub channel open failed: %d\n", ret);
+ netif_queue_set_napi(ndev, chn_index, NETDEV_QUEUE_TYPE_TX,
+ NULL);
+ netif_queue_set_napi(ndev, chn_index, NETDEV_QUEUE_TYPE_RX,
+ NULL);
+ napi_disable(&nvchan->napi);
}
if (atomic_inc_return(&nvscdev->open_chn) == nvscdev->num_chn)
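The comment in the hunk explains the reordering: NAPI must be enabled before vmbus_open(), otherwise data the host places on the ring before napi_enable() could go unpolled, and on open failure the setup is unwound in reverse order. A reduced sketch of that ordering, where the hypothetical open_ring callback stands in for vmbus_open():

	#include <linux/netdevice.h>

	static int example_open_subchannel(struct napi_struct *napi,
					   int (*open_ring)(void *arg), void *arg)
	{
		int ret;

		napi_enable(napi);		/* ready to poll before data can arrive */

		ret = open_ring(arg);
		if (ret) {
			napi_disable(napi);	/* unwind in reverse order */
			return ret;
		}
		return 0;
	}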
diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
index 4c75d1fea552..0eca96eeed58 100644
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -1844,7 +1844,7 @@ static int macsec_add_rxsa(struct sk_buff *skb, struct genl_info *info)
if (tb_sa[MACSEC_SA_ATTR_PN]) {
spin_lock_bh(&rx_sa->lock);
- rx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]);
+ rx_sa->next_pn = nla_get_uint(tb_sa[MACSEC_SA_ATTR_PN]);
spin_unlock_bh(&rx_sa->lock);
}
@@ -2086,7 +2086,7 @@ static int macsec_add_txsa(struct sk_buff *skb, struct genl_info *info)
}
spin_lock_bh(&tx_sa->lock);
- tx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]);
+ tx_sa->next_pn = nla_get_uint(tb_sa[MACSEC_SA_ATTR_PN]);
spin_unlock_bh(&tx_sa->lock);
if (tb_sa[MACSEC_SA_ATTR_ACTIVE])
@@ -2398,7 +2398,7 @@ static int macsec_upd_txsa(struct sk_buff *skb, struct genl_info *info)
spin_lock_bh(&tx_sa->lock);
prev_pn = tx_sa->next_pn_halves;
- tx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]);
+ tx_sa->next_pn = nla_get_uint(tb_sa[MACSEC_SA_ATTR_PN]);
spin_unlock_bh(&tx_sa->lock);
}
@@ -2496,7 +2496,7 @@ static int macsec_upd_rxsa(struct sk_buff *skb, struct genl_info *info)
spin_lock_bh(&rx_sa->lock);
prev_pn = rx_sa->next_pn_halves;
- rx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]);
+ rx_sa->next_pn = nla_get_uint(tb_sa[MACSEC_SA_ATTR_PN]);
spin_unlock_bh(&rx_sa->lock);
}
@@ -4286,6 +4286,7 @@ static int macsec_newlink(struct net_device *dev,
if (err < 0)
goto del_dev;
+ netdev_update_features(dev);
netif_stacked_transfer_operstate(real_dev, dev);
linkwatch_fire_event(dev);
diff --git a/drivers/net/mctp/mctp-usb.c b/drivers/net/mctp/mctp-usb.c
index 775a386d0aca..36ccc53b1797 100644
--- a/drivers/net/mctp/mctp-usb.c
+++ b/drivers/net/mctp/mctp-usb.c
@@ -183,6 +183,7 @@ static void mctp_usb_in_complete(struct urb *urb)
struct mctp_usb_hdr *hdr;
u8 pkt_len; /* length of MCTP packet, no USB header */
+ skb_reset_mac_header(skb);
hdr = skb_pull_data(skb, sizeof(*hdr));
if (!hdr)
break;
diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c
index 39fe28af48b9..0178219f0db5 100644
--- a/drivers/net/netdevsim/netdev.c
+++ b/drivers/net/netdevsim/netdev.c
@@ -710,9 +710,13 @@ static struct nsim_rq *nsim_queue_alloc(void)
static void nsim_queue_free(struct net_device *dev, struct nsim_rq *rq)
{
hrtimer_cancel(&rq->napi_timer);
- local_bh_disable();
- dev_dstats_rx_dropped_add(dev, rq->skb_queue.qlen);
- local_bh_enable();
+
+ if (rq->skb_queue.qlen) {
+ local_bh_disable();
+ dev_dstats_rx_dropped_add(dev, rq->skb_queue.qlen);
+ local_bh_enable();
+ }
+
skb_queue_purge_reason(&rq->skb_queue, SKB_DROP_REASON_QUEUE_PURGE);
kfree(rq);
}
diff --git a/drivers/net/pcs/pcs-rzn1-miic.c b/drivers/net/pcs/pcs-rzn1-miic.c
index d79bb9b06cd2..ce73d9474d5b 100644
--- a/drivers/net/pcs/pcs-rzn1-miic.c
+++ b/drivers/net/pcs/pcs-rzn1-miic.c
@@ -19,7 +19,7 @@
#define MIIC_PRCMD 0x0
#define MIIC_ESID_CODE 0x4
-#define MIIC_MODCTRL 0x20
+#define MIIC_MODCTRL 0x8
#define MIIC_MODCTRL_SW_MODE GENMASK(4, 0)
#define MIIC_CONVCTRL(port) (0x100 + (port) * 4)
diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index 28acc6392cfc..392749aae54d 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -361,7 +361,7 @@ config NXP_TJA11XX_PHY
tristate "NXP TJA11xx PHYs support"
depends on HWMON
help
- Currently supports the NXP TJA1100 and TJA1101 PHY.
+ Currently supports the NXP TJA1100, TJA1101 and TJA1102 PHYs.
config NCN26000_PHY
tristate "Onsemi 10BASE-T1S Ethernet PHY"
diff --git a/drivers/net/phy/bcm-phy-ptp.c b/drivers/net/phy/bcm-phy-ptp.c
index eba8b5fb1365..d3501f8487d9 100644
--- a/drivers/net/phy/bcm-phy-ptp.c
+++ b/drivers/net/phy/bcm-phy-ptp.c
@@ -597,10 +597,6 @@ static int bcm_ptp_perout_locked(struct bcm_ptp_private *priv,
period = BCM_MAX_PERIOD_8NS; /* write nonzero value */
- /* Reject unsupported flags */
- if (req->flags & ~PTP_PEROUT_DUTY_CYCLE)
- return -EOPNOTSUPP;
-
if (req->flags & PTP_PEROUT_DUTY_CYCLE)
pulse = ktime_to_ns(ktime_set(req->on.sec, req->on.nsec));
else
@@ -741,6 +737,8 @@ static const struct ptp_clock_info bcm_ptp_clock_info = {
.n_pins = 1,
.n_per_out = 1,
.n_ext_ts = 1,
+ .supported_perout_flags = PTP_PEROUT_DUTY_CYCLE,
+ .supported_extts_flags = PTP_STRICT_FLAGS | PTP_RISING_EDGE,
};
static void bcm_ptp_txtstamp(struct mii_timestamper *mii_ts,
diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c
index fda2e27c1810..cad6ed3aa10b 100644
--- a/drivers/net/phy/mdio_bus.c
+++ b/drivers/net/phy/mdio_bus.c
@@ -91,6 +91,7 @@ int mdiobus_unregister_device(struct mdio_device *mdiodev)
if (mdiodev->bus->mdio_map[mdiodev->addr] != mdiodev)
return -EINVAL;
+ gpiod_put(mdiodev->reset_gpio);
reset_control_put(mdiodev->reset_ctrl);
mdiodev->bus->mdio_map[mdiodev->addr] = NULL;
diff --git a/drivers/net/phy/mdio_bus_provider.c b/drivers/net/phy/mdio_bus_provider.c
index 48dc4bf85125..f43973e73ea3 100644
--- a/drivers/net/phy/mdio_bus_provider.c
+++ b/drivers/net/phy/mdio_bus_provider.c
@@ -443,9 +443,6 @@ void mdiobus_unregister(struct mii_bus *bus)
if (!mdiodev)
continue;
- if (mdiodev->reset_gpio)
- gpiod_put(mdiodev->reset_gpio);
-
mdiodev->device_remove(mdiodev);
mdiodev->device_free(mdiodev);
}
diff --git a/drivers/net/phy/mscc/mscc.h b/drivers/net/phy/mscc/mscc.h
index 6a3d8a754eb8..2bfe314ef881 100644
--- a/drivers/net/phy/mscc/mscc.h
+++ b/drivers/net/phy/mscc/mscc.h
@@ -362,6 +362,13 @@ struct vsc85xx_hw_stat {
u16 mask;
};
+struct vsc8531_skb_cb {
+ u32 ns;
+};
+
+#define VSC8531_SKB_CB(skb) \
+ ((struct vsc8531_skb_cb *)((skb)->cb))
+
struct vsc8531_private {
int rate_magic;
u16 supp_led_modes;
@@ -410,6 +417,11 @@ struct vsc8531_private {
*/
struct mutex ts_lock;
struct mutex phc_lock;
+
+ /* list of skbs that were received and need timestamp information but
+ * have not received it yet
+ */
+ struct sk_buff_head rx_skbs_list;
};
/* Shared structure between the PHYs of the same package.
@@ -469,6 +481,7 @@ static inline void vsc8584_config_macsec_intr(struct phy_device *phydev)
void vsc85xx_link_change_notify(struct phy_device *phydev);
void vsc8584_config_ts_intr(struct phy_device *phydev);
int vsc8584_ptp_init(struct phy_device *phydev);
+void vsc8584_ptp_deinit(struct phy_device *phydev);
int vsc8584_ptp_probe_once(struct phy_device *phydev);
int vsc8584_ptp_probe(struct phy_device *phydev);
irqreturn_t vsc8584_handle_ts_interrupt(struct phy_device *phydev);
@@ -483,6 +496,9 @@ static inline int vsc8584_ptp_init(struct phy_device *phydev)
{
return 0;
}
+static inline void vsc8584_ptp_deinit(struct phy_device *phydev)
+{
+}
static inline int vsc8584_ptp_probe_once(struct phy_device *phydev)
{
return 0;
diff --git a/drivers/net/phy/mscc/mscc_main.c b/drivers/net/phy/mscc/mscc_main.c
index 7ed6522fb0ef..24c75903f535 100644
--- a/drivers/net/phy/mscc/mscc_main.c
+++ b/drivers/net/phy/mscc/mscc_main.c
@@ -2335,6 +2335,11 @@ static int vsc85xx_probe(struct phy_device *phydev)
return vsc85xx_dt_led_modes_get(phydev, default_mode);
}
+static void vsc85xx_remove(struct phy_device *phydev)
+{
+ vsc8584_ptp_deinit(phydev);
+}
+
/* Microsemi VSC85xx PHYs */
static struct phy_driver vsc85xx_driver[] = {
{
@@ -2589,6 +2594,7 @@ static struct phy_driver vsc85xx_driver[] = {
.config_intr = &vsc85xx_config_intr,
.suspend = &genphy_suspend,
.resume = &genphy_resume,
+ .remove = &vsc85xx_remove,
.probe = &vsc8574_probe,
.set_wol = &vsc85xx_wol_set,
.get_wol = &vsc85xx_wol_get,
@@ -2614,6 +2620,7 @@ static struct phy_driver vsc85xx_driver[] = {
.config_intr = &vsc85xx_config_intr,
.suspend = &genphy_suspend,
.resume = &genphy_resume,
+ .remove = &vsc85xx_remove,
.probe = &vsc8574_probe,
.set_wol = &vsc85xx_wol_set,
.get_wol = &vsc85xx_wol_get,
@@ -2639,6 +2646,7 @@ static struct phy_driver vsc85xx_driver[] = {
.config_intr = &vsc85xx_config_intr,
.suspend = &genphy_suspend,
.resume = &genphy_resume,
+ .remove = &vsc85xx_remove,
.probe = &vsc8584_probe,
.get_tunable = &vsc85xx_get_tunable,
.set_tunable = &vsc85xx_set_tunable,
@@ -2662,6 +2670,7 @@ static struct phy_driver vsc85xx_driver[] = {
.config_intr = &vsc85xx_config_intr,
.suspend = &genphy_suspend,
.resume = &genphy_resume,
+ .remove = &vsc85xx_remove,
.probe = &vsc8584_probe,
.get_tunable = &vsc85xx_get_tunable,
.set_tunable = &vsc85xx_set_tunable,
@@ -2685,6 +2694,7 @@ static struct phy_driver vsc85xx_driver[] = {
.config_intr = &vsc85xx_config_intr,
.suspend = &genphy_suspend,
.resume = &genphy_resume,
+ .remove = &vsc85xx_remove,
.probe = &vsc8584_probe,
.get_tunable = &vsc85xx_get_tunable,
.set_tunable = &vsc85xx_set_tunable,
diff --git a/drivers/net/phy/mscc/mscc_ptp.c b/drivers/net/phy/mscc/mscc_ptp.c
index 275706de5847..d692df7d975c 100644
--- a/drivers/net/phy/mscc/mscc_ptp.c
+++ b/drivers/net/phy/mscc/mscc_ptp.c
@@ -456,12 +456,12 @@ static void vsc85xx_dequeue_skb(struct vsc85xx_ptp *ptp)
*p++ = (reg >> 24) & 0xff;
}
- len = skb_queue_len(&ptp->tx_queue);
+ len = skb_queue_len_lockless(&ptp->tx_queue);
if (len < 1)
return;
while (len--) {
- skb = __skb_dequeue(&ptp->tx_queue);
+ skb = skb_dequeue(&ptp->tx_queue);
if (!skb)
return;
@@ -486,7 +486,7 @@ static void vsc85xx_dequeue_skb(struct vsc85xx_ptp *ptp)
* packet in the FIFO right now, reschedule it for later
* packets.
*/
- __skb_queue_tail(&ptp->tx_queue, skb);
+ skb_queue_tail(&ptp->tx_queue, skb);
}
}
@@ -1068,6 +1068,7 @@ static int vsc85xx_hwtstamp(struct mii_timestamper *mii_ts,
case HWTSTAMP_TX_ON:
break;
case HWTSTAMP_TX_OFF:
+ skb_queue_purge(&vsc8531->ptp->tx_queue);
break;
default:
return -ERANGE;
@@ -1092,9 +1093,6 @@ static int vsc85xx_hwtstamp(struct mii_timestamper *mii_ts,
mutex_lock(&vsc8531->ts_lock);
- __skb_queue_purge(&vsc8531->ptp->tx_queue);
- __skb_queue_head_init(&vsc8531->ptp->tx_queue);
-
/* Disable predictor while configuring the 1588 block */
val = vsc85xx_ts_read_csr(phydev, PROCESSOR,
MSCC_PHY_PTP_INGR_PREDICTOR);
@@ -1180,9 +1178,7 @@ static void vsc85xx_txtstamp(struct mii_timestamper *mii_ts,
skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
- mutex_lock(&vsc8531->ts_lock);
- __skb_queue_tail(&vsc8531->ptp->tx_queue, skb);
- mutex_unlock(&vsc8531->ts_lock);
+ skb_queue_tail(&vsc8531->ptp->tx_queue, skb);
return;
out:
@@ -1194,9 +1190,7 @@ static bool vsc85xx_rxtstamp(struct mii_timestamper *mii_ts,
{
struct vsc8531_private *vsc8531 =
container_of(mii_ts, struct vsc8531_private, mii_ts);
- struct skb_shared_hwtstamps *shhwtstamps = NULL;
struct vsc85xx_ptphdr *ptphdr;
- struct timespec64 ts;
unsigned long ns;
if (!vsc8531->ptp->configured)
@@ -1206,27 +1200,52 @@ static bool vsc85xx_rxtstamp(struct mii_timestamper *mii_ts,
type == PTP_CLASS_NONE)
return false;
- vsc85xx_gettime(&vsc8531->ptp->caps, &ts);
-
ptphdr = get_ptp_header_rx(skb, vsc8531->ptp->rx_filter);
if (!ptphdr)
return false;
- shhwtstamps = skb_hwtstamps(skb);
- memset(shhwtstamps, 0, sizeof(struct skb_shared_hwtstamps));
-
ns = ntohl(ptphdr->rsrvd2);
- /* nsec is in reserved field */
- if (ts.tv_nsec < ns)
- ts.tv_sec--;
+ VSC8531_SKB_CB(skb)->ns = ns;
+ skb_queue_tail(&vsc8531->rx_skbs_list, skb);
- shhwtstamps->hwtstamp = ktime_set(ts.tv_sec, ns);
- netif_rx(skb);
+ ptp_schedule_worker(vsc8531->ptp->ptp_clock, 0);
return true;
}
+static long vsc85xx_do_aux_work(struct ptp_clock_info *info)
+{
+ struct vsc85xx_ptp *ptp = container_of(info, struct vsc85xx_ptp, caps);
+ struct skb_shared_hwtstamps *shhwtstamps = NULL;
+ struct phy_device *phydev = ptp->phydev;
+ struct vsc8531_private *priv = phydev->priv;
+ struct sk_buff_head received;
+ struct sk_buff *rx_skb;
+ struct timespec64 ts;
+ unsigned long flags;
+
+ __skb_queue_head_init(&received);
+ spin_lock_irqsave(&priv->rx_skbs_list.lock, flags);
+ skb_queue_splice_tail_init(&priv->rx_skbs_list, &received);
+ spin_unlock_irqrestore(&priv->rx_skbs_list.lock, flags);
+
+ vsc85xx_gettime(info, &ts);
+ while ((rx_skb = __skb_dequeue(&received)) != NULL) {
+ shhwtstamps = skb_hwtstamps(rx_skb);
+ memset(shhwtstamps, 0, sizeof(struct skb_shared_hwtstamps));
+
+ if (ts.tv_nsec < VSC8531_SKB_CB(rx_skb)->ns)
+ ts.tv_sec--;
+
+ shhwtstamps->hwtstamp = ktime_set(ts.tv_sec,
+ VSC8531_SKB_CB(rx_skb)->ns);
+ netif_rx(rx_skb);
+ }
+
+ return -1;
+}
+
static const struct ptp_clock_info vsc85xx_clk_caps = {
.owner = THIS_MODULE,
.name = "VSC85xx timer",
@@ -1240,6 +1259,7 @@ static const struct ptp_clock_info vsc85xx_clk_caps = {
.adjfine = &vsc85xx_adjfine,
.gettime64 = &vsc85xx_gettime,
.settime64 = &vsc85xx_settime,
+ .do_aux_work = &vsc85xx_do_aux_work,
};
static struct vsc8531_private *vsc8584_base_priv(struct phy_device *phydev)
@@ -1274,7 +1294,6 @@ static void vsc8584_set_input_clk_configured(struct phy_device *phydev)
static int __vsc8584_init_ptp(struct phy_device *phydev)
{
- struct vsc8531_private *vsc8531 = phydev->priv;
static const u32 ltc_seq_e[] = { 0, 400000, 0, 0, 0 };
static const u8 ltc_seq_a[] = { 8, 6, 5, 4, 2 };
u32 val;
@@ -1491,17 +1510,7 @@ static int __vsc8584_init_ptp(struct phy_device *phydev)
vsc85xx_ts_eth_cmp1_sig(phydev);
- vsc8531->mii_ts.rxtstamp = vsc85xx_rxtstamp;
- vsc8531->mii_ts.txtstamp = vsc85xx_txtstamp;
- vsc8531->mii_ts.hwtstamp = vsc85xx_hwtstamp;
- vsc8531->mii_ts.ts_info = vsc85xx_ts_info;
- phydev->mii_ts = &vsc8531->mii_ts;
-
- memcpy(&vsc8531->ptp->caps, &vsc85xx_clk_caps, sizeof(vsc85xx_clk_caps));
-
- vsc8531->ptp->ptp_clock = ptp_clock_register(&vsc8531->ptp->caps,
- &phydev->mdio.dev);
- return PTR_ERR_OR_ZERO(vsc8531->ptp->ptp_clock);
+ return 0;
}
void vsc8584_config_ts_intr(struct phy_device *phydev)
@@ -1528,6 +1537,17 @@ int vsc8584_ptp_init(struct phy_device *phydev)
return 0;
}
+void vsc8584_ptp_deinit(struct phy_device *phydev)
+{
+ struct vsc8531_private *vsc8531 = phydev->priv;
+
+ if (vsc8531->ptp->ptp_clock) {
+ ptp_clock_unregister(vsc8531->ptp->ptp_clock);
+ skb_queue_purge(&vsc8531->rx_skbs_list);
+ skb_queue_purge(&vsc8531->ptp->tx_queue);
+ }
+}
+
irqreturn_t vsc8584_handle_ts_interrupt(struct phy_device *phydev)
{
struct vsc8531_private *priv = phydev->priv;
@@ -1548,7 +1568,7 @@ irqreturn_t vsc8584_handle_ts_interrupt(struct phy_device *phydev)
if (rc & VSC85XX_1588_INT_FIFO_ADD) {
vsc85xx_get_tx_ts(priv->ptp);
} else if (rc & VSC85XX_1588_INT_FIFO_OVERFLOW) {
- __skb_queue_purge(&priv->ptp->tx_queue);
+ skb_queue_purge(&priv->ptp->tx_queue);
vsc85xx_ts_reset_fifo(phydev);
}
@@ -1567,6 +1587,8 @@ int vsc8584_ptp_probe(struct phy_device *phydev)
mutex_init(&vsc8531->phc_lock);
mutex_init(&vsc8531->ts_lock);
+ skb_queue_head_init(&vsc8531->rx_skbs_list);
+ skb_queue_head_init(&vsc8531->ptp->tx_queue);
/* Retrieve the shared load/save GPIO. Request it as non exclusive as
* the same GPIO can be requested by all the PHYs of the same package.
@@ -1587,7 +1609,16 @@ int vsc8584_ptp_probe(struct phy_device *phydev)
vsc8531->ptp->phydev = phydev;
- return 0;
+ vsc8531->mii_ts.rxtstamp = vsc85xx_rxtstamp;
+ vsc8531->mii_ts.txtstamp = vsc85xx_txtstamp;
+ vsc8531->mii_ts.hwtstamp = vsc85xx_hwtstamp;
+ vsc8531->mii_ts.ts_info = vsc85xx_ts_info;
+ phydev->mii_ts = &vsc8531->mii_ts;
+
+ memcpy(&vsc8531->ptp->caps, &vsc85xx_clk_caps, sizeof(vsc85xx_clk_caps));
+ vsc8531->ptp->ptp_clock = ptp_clock_register(&vsc8531->ptp->caps,
+ &phydev->mdio.dev);
+ return PTR_ERR_OR_ZERO(vsc8531->ptp->ptp_clock);
}
int vsc8584_ptp_probe_once(struct phy_device *phydev)
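With this rework the RX hot path only stashes the header nanoseconds in skb->cb, queues the skb on rx_skbs_list and kicks ptp_schedule_worker(); the gettime read and netif_rx() move into the do_aux_work callback, which drains the queue outside the timestamping fast path. A compressed sketch of that hand-off with hypothetical structure names, returning -1 from the worker (no periodic rescheduling) as the hunk does:

	#include <linux/container_of.h>
	#include <linux/netdevice.h>
	#include <linux/ptp_clock_kernel.h>
	#include <linux/skbuff.h>

	struct example_ptp {
		struct ptp_clock_info caps;
		struct ptp_clock *clock;
		struct sk_buff_head rx_skbs;	/* skbs waiting for a timestamp */
	};

	/* Hot path: park the skb and defer the rest to the PTP aux worker. */
	static void example_defer_rx(struct example_ptp *p, struct sk_buff *skb)
	{
		skb_queue_tail(&p->rx_skbs, skb);
		ptp_schedule_worker(p->clock, 0);
	}

	/* Aux worker: complete the timestamps and hand skbs to the stack. */
	static long example_do_aux_work(struct ptp_clock_info *info)
	{
		struct example_ptp *p = container_of(info, struct example_ptp, caps);
		struct sk_buff *skb;

		while ((skb = skb_dequeue(&p->rx_skbs)) != NULL)
			netif_rx(skb);

		return -1;	/* no periodic rescheduling needed */
	}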
diff --git a/drivers/net/phy/nxp-c45-tja11xx.c b/drivers/net/phy/nxp-c45-tja11xx.c
index 4c6d905f0a9f..87adb6508017 100644
--- a/drivers/net/phy/nxp-c45-tja11xx.c
+++ b/drivers/net/phy/nxp-c45-tja11xx.c
@@ -1965,24 +1965,27 @@ static int nxp_c45_macsec_ability(struct phy_device *phydev)
return macsec_ability;
}
+static bool tja11xx_phy_id_compare(struct phy_device *phydev,
+ const struct phy_driver *phydrv)
+{
+ u32 id = phydev->is_c45 ? phydev->c45_ids.device_ids[MDIO_MMD_PMAPMD] :
+ phydev->phy_id;
+
+ return phy_id_compare(id, phydrv->phy_id, phydrv->phy_id_mask);
+}
+
static int tja11xx_no_macsec_match_phy_device(struct phy_device *phydev,
const struct phy_driver *phydrv)
{
- if (!phy_id_compare(phydev->phy_id, phydrv->phy_id,
- phydrv->phy_id_mask))
- return 0;
-
- return !nxp_c45_macsec_ability(phydev);
+ return tja11xx_phy_id_compare(phydev, phydrv) &&
+ !nxp_c45_macsec_ability(phydev);
}
static int tja11xx_macsec_match_phy_device(struct phy_device *phydev,
const struct phy_driver *phydrv)
{
- if (!phy_id_compare(phydev->phy_id, phydrv->phy_id,
- phydrv->phy_id_mask))
- return 0;
-
- return nxp_c45_macsec_ability(phydev);
+ return tja11xx_phy_id_compare(phydev, phydrv) &&
+ nxp_c45_macsec_ability(phydev);
}
static const struct nxp_c45_regmap tja1120_regmap = {
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 13df28445f02..c02da57a4da5 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -1065,23 +1065,19 @@ EXPORT_SYMBOL_GPL(phy_inband_caps);
*/
int phy_config_inband(struct phy_device *phydev, unsigned int modes)
{
- int err;
+ lockdep_assert_held(&phydev->lock);
if (!!(modes & LINK_INBAND_DISABLE) +
!!(modes & LINK_INBAND_ENABLE) +
!!(modes & LINK_INBAND_BYPASS) != 1)
return -EINVAL;
- mutex_lock(&phydev->lock);
if (!phydev->drv)
- err = -EIO;
+ return -EIO;
else if (!phydev->drv->config_inband)
- err = -EOPNOTSUPP;
- else
- err = phydev->drv->config_inband(phydev, modes);
- mutex_unlock(&phydev->lock);
+ return -EOPNOTSUPP;
- return err;
+ return phydev->drv->config_inband(phydev, modes);
}
EXPORT_SYMBOL(phy_config_inband);
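The phy_config_inband() rework pushes locking out to the caller: instead of taking phydev->lock itself, the function documents the requirement with lockdep_assert_held() and returns errors directly. A minimal sketch of that convention with a hypothetical structure:

	#include <linux/lockdep.h>
	#include <linux/mutex.h>

	struct dev_state {
		struct mutex lock;
		int configured;
	};

	/* Caller must hold s->lock; lockdep flags violations at runtime on
	 * kernels built with CONFIG_PROVE_LOCKING.
	 */
	static int dev_state_configure(struct dev_state *s, int mode)
	{
		lockdep_assert_held(&s->lock);

		s->configured = mode;
		return 0;
	}

	static int dev_state_update(struct dev_state *s, int mode)
	{
		int err;

		mutex_lock(&s->lock);
		err = dev_state_configure(s, mode);
		mutex_unlock(&s->lock);
		return err;
	}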
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 7556aa3dd7ee..c82c1997147b 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -287,8 +287,7 @@ static bool phy_uses_state_machine(struct phy_device *phydev)
if (phydev->phy_link_change == phy_link_change)
return phydev->attached_dev && phydev->adjust_link;
- /* phydev->phy_link_change is implicitly phylink_phy_change() */
- return true;
+ return !!phydev->phy_link_change;
}
static bool mdio_bus_phy_may_suspend(struct phy_device *phydev)
@@ -1864,6 +1863,8 @@ void phy_detach(struct phy_device *phydev)
phydev->attached_dev = NULL;
phy_link_topo_del_phy(dev, phydev);
}
+
+ phydev->phy_link_change = NULL;
phydev->phylink = NULL;
if (!phydev->is_on_sfp_module)
diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index c7f867b361dd..1988b7d2089a 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -67,6 +67,8 @@ struct phylink {
struct timer_list link_poll;
struct mutex state_mutex;
+ /* Serialize updates to pl->phydev with phylink_resolve() */
+ struct mutex phydev_mutex;
struct phylink_link_state phy_state;
unsigned int phy_ib_mode;
struct work_struct resolve;
@@ -1016,6 +1018,42 @@ static void phylink_pcs_an_restart(struct phylink *pl)
pl->pcs->ops->pcs_an_restart(pl->pcs);
}
+enum inband_type {
+ INBAND_NONE,
+ INBAND_CISCO_SGMII,
+ INBAND_BASEX,
+};
+
+static enum inband_type phylink_get_inband_type(phy_interface_t interface)
+{
+ switch (interface) {
+ case PHY_INTERFACE_MODE_SGMII:
+ case PHY_INTERFACE_MODE_QSGMII:
+ case PHY_INTERFACE_MODE_QUSGMII:
+ case PHY_INTERFACE_MODE_USXGMII:
+ case PHY_INTERFACE_MODE_10G_QXGMII:
+ /* These protocols are designed for use with a PHY which
+ * communicates its negotiation result back to the MAC via
+ * inband communication. Note: there exist PHYs that run
+ * with SGMII but do not send the inband data.
+ */
+ return INBAND_CISCO_SGMII;
+
+ case PHY_INTERFACE_MODE_1000BASEX:
+ case PHY_INTERFACE_MODE_2500BASEX:
+ /* 1000base-X is designed for use media-side for Fibre
+ * connections, and thus the Autoneg bit needs to be
+ * taken into account. We also do this for 2500base-X
+ * as well, but drivers may not support this, so may
+ * need to override this.
+ */
+ return INBAND_BASEX;
+
+ default:
+ return INBAND_NONE;
+ }
+}
+
/**
* phylink_pcs_neg_mode() - helper to determine PCS inband mode
* @pl: a pointer to a &struct phylink returned from phylink_create()
@@ -1043,46 +1081,19 @@ static void phylink_pcs_neg_mode(struct phylink *pl, struct phylink_pcs *pcs,
unsigned int pcs_ib_caps = 0;
unsigned int phy_ib_caps = 0;
unsigned int neg_mode, mode;
- enum {
- INBAND_CISCO_SGMII,
- INBAND_BASEX,
- } type;
-
- mode = pl->req_link_an_mode;
-
- pl->phy_ib_mode = 0;
-
- switch (interface) {
- case PHY_INTERFACE_MODE_SGMII:
- case PHY_INTERFACE_MODE_QSGMII:
- case PHY_INTERFACE_MODE_QUSGMII:
- case PHY_INTERFACE_MODE_USXGMII:
- case PHY_INTERFACE_MODE_10G_QXGMII:
- /* These protocols are designed for use with a PHY which
- * communicates its negotiation result back to the MAC via
- * inband communication. Note: there exist PHYs that run
- * with SGMII but do not send the inband data.
- */
- type = INBAND_CISCO_SGMII;
- break;
+ enum inband_type type;
- case PHY_INTERFACE_MODE_1000BASEX:
- case PHY_INTERFACE_MODE_2500BASEX:
- /* 1000base-X is designed for use media-side for Fibre
- * connections, and thus the Autoneg bit needs to be
- * taken into account. We also do this for 2500base-X
- * as well, but drivers may not support this, so may
- * need to override this.
- */
- type = INBAND_BASEX;
- break;
-
- default:
+ type = phylink_get_inband_type(interface);
+ if (type == INBAND_NONE) {
pl->pcs_neg_mode = PHYLINK_PCS_NEG_NONE;
- pl->act_link_an_mode = mode;
+ pl->act_link_an_mode = pl->req_link_an_mode;
return;
}
+ mode = pl->req_link_an_mode;
+
+ pl->phy_ib_mode = 0;
+
if (pcs)
pcs_ib_caps = phylink_pcs_inband_caps(pcs, interface);
@@ -1423,6 +1434,7 @@ static void phylink_get_fixed_state(struct phylink *pl,
static void phylink_mac_initial_config(struct phylink *pl, bool force_restart)
{
struct phylink_link_state link_state;
+ struct phy_device *phy = pl->phydev;
switch (pl->req_link_an_mode) {
case MLO_AN_PHY:
@@ -1446,7 +1458,11 @@ static void phylink_mac_initial_config(struct phylink *pl, bool force_restart)
link_state.link = false;
phylink_apply_manual_flow(pl, &link_state);
+ if (phy)
+ mutex_lock(&phy->lock);
phylink_major_config(pl, force_restart, &link_state);
+ if (phy)
+ mutex_unlock(&phy->lock);
}
static const char *phylink_pause_to_str(int pause)
@@ -1582,8 +1598,13 @@ static void phylink_resolve(struct work_struct *w)
struct phylink_link_state link_state;
bool mac_config = false;
bool retrigger = false;
+ struct phy_device *phy;
bool cur_link_state;
+ mutex_lock(&pl->phydev_mutex);
+ phy = pl->phydev;
+ if (phy)
+ mutex_lock(&phy->lock);
mutex_lock(&pl->state_mutex);
cur_link_state = phylink_link_is_up(pl);
@@ -1617,11 +1638,11 @@ static void phylink_resolve(struct work_struct *w)
/* If we have a phy, the "up" state is the union of both the
* PHY and the MAC
*/
- if (pl->phydev)
+ if (phy)
link_state.link &= pl->phy_state.link;
/* Only update if the PHY link is up */
- if (pl->phydev && pl->phy_state.link) {
+ if (phy && pl->phy_state.link) {
/* If the interface has changed, force a link down
* event if the link isn't already down, and re-resolve.
*/
@@ -1685,6 +1706,9 @@ static void phylink_resolve(struct work_struct *w)
queue_work(system_power_efficient_wq, &pl->resolve);
}
mutex_unlock(&pl->state_mutex);
+ if (phy)
+ mutex_unlock(&phy->lock);
+ mutex_unlock(&pl->phydev_mutex);
}
static void phylink_run_resolve(struct phylink *pl)
@@ -1820,6 +1844,7 @@ struct phylink *phylink_create(struct phylink_config *config,
if (!pl)
return ERR_PTR(-ENOMEM);
+ mutex_init(&pl->phydev_mutex);
mutex_init(&pl->state_mutex);
INIT_WORK(&pl->resolve, phylink_resolve);
@@ -2080,6 +2105,7 @@ static int phylink_bringup_phy(struct phylink *pl, struct phy_device *phy,
dev_name(&phy->mdio.dev), phy->drv->name, irq_str);
kfree(irq_str);
+ mutex_lock(&pl->phydev_mutex);
mutex_lock(&phy->lock);
mutex_lock(&pl->state_mutex);
pl->phydev = phy;
@@ -2125,6 +2151,7 @@ static int phylink_bringup_phy(struct phylink *pl, struct phy_device *phy,
mutex_unlock(&pl->state_mutex);
mutex_unlock(&phy->lock);
+ mutex_unlock(&pl->phydev_mutex);
phylink_dbg(pl,
"phy: %s setting supported %*pb advertising %*pb\n",
@@ -2132,9 +2159,6 @@ static int phylink_bringup_phy(struct phylink *pl, struct phy_device *phy,
__ETHTOOL_LINK_MODE_MASK_NBITS, pl->supported,
__ETHTOOL_LINK_MODE_MASK_NBITS, phy->advertising);
- if (phy_interrupt_is_valid(phy))
- phy_request_interrupt(phy);
-
if (pl->config->mac_managed_pm)
phy->mac_managed_pm = true;
@@ -2151,6 +2175,9 @@ static int phylink_bringup_phy(struct phylink *pl, struct phy_device *phy,
ret = 0;
}
+ if (ret == 0 && phy_interrupt_is_valid(phy))
+ phy_request_interrupt(phy);
+
return ret;
}
@@ -2303,6 +2330,7 @@ void phylink_disconnect_phy(struct phylink *pl)
ASSERT_RTNL();
+ mutex_lock(&pl->phydev_mutex);
phy = pl->phydev;
if (phy) {
mutex_lock(&phy->lock);
@@ -2312,8 +2340,11 @@ void phylink_disconnect_phy(struct phylink *pl)
pl->mac_tx_clk_stop = false;
mutex_unlock(&pl->state_mutex);
mutex_unlock(&phy->lock);
- flush_work(&pl->resolve);
+ }
+ mutex_unlock(&pl->phydev_mutex);
+ if (phy) {
+ flush_work(&pl->resolve);
phy_disconnect(phy);
}
}
@@ -3625,6 +3656,7 @@ static int phylink_sfp_config_optical(struct phylink *pl)
{
__ETHTOOL_DECLARE_LINK_MODE_MASK(support);
struct phylink_link_state config;
+ enum inband_type inband_type;
phy_interface_t interface;
int ret;
@@ -3671,6 +3703,23 @@ static int phylink_sfp_config_optical(struct phylink *pl)
phylink_dbg(pl, "optical SFP: chosen %s interface\n",
phy_modes(interface));
+ inband_type = phylink_get_inband_type(interface);
+ if (inband_type == INBAND_NONE) {
+ /* If this is the sole interface, and there is no inband
+ * support, clear the advertising mask and Autoneg bit in
+ * the support mask. Otherwise, just clear the Autoneg bit
+ * in the advertising mask.
+ */
+ if (phy_interface_weight(pl->sfp_interfaces) == 1) {
+ linkmode_clear_bit(ETHTOOL_LINK_MODE_Autoneg_BIT,
+ pl->sfp_support);
+ linkmode_zero(config.advertising);
+ } else {
+ linkmode_clear_bit(ETHTOOL_LINK_MODE_Autoneg_BIT,
+ config.advertising);
+ }
+ }
+
if (!phylink_validate_pcs_inband_autoneg(pl, interface,
config.advertising)) {
phylink_err(pl, "autoneg setting not compatible with PCS");
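
The phylink changes above add a phydev_mutex that is taken before the PHY's own lock and phylink's state_mutex, so that the resolver, PHY attach and PHY disconnect all agree on pl->phydev before touching link state. A minimal sketch of that lock ordering, using a hypothetical example_resolve() helper rather than the driver's actual code:

	/* Illustrative only: lock order is phydev_mutex -> phy->lock -> state_mutex */
	static void example_resolve(struct phylink *pl)
	{
		struct phy_device *phy;

		mutex_lock(&pl->phydev_mutex);		/* pin pl->phydev */
		phy = pl->phydev;
		if (phy)
			mutex_lock(&phy->lock);		/* serialize with PHY updates */
		mutex_lock(&pl->state_mutex);		/* protect phylink link state */

		/* ... resolve and apply the link state here ... */

		mutex_unlock(&pl->state_mutex);
		if (phy)
			mutex_unlock(&phy->lock);
		mutex_unlock(&pl->phydev_mutex);
	}
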
diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c
index 5347c95d1e77..4cd1d6c51dc2 100644
--- a/drivers/net/phy/sfp.c
+++ b/drivers/net/phy/sfp.c
@@ -492,6 +492,9 @@ static const struct sfp_quirk sfp_quirks[] = {
SFP_QUIRK("ALCATELLUCENT", "3FE46541AA", sfp_quirk_2500basex,
sfp_fixup_nokia),
+ // FLYPRO SFP-10GT-CS-30M uses Rollball protocol to talk to the PHY.
+ SFP_QUIRK_F("FLYPRO", "SFP-10GT-CS-30M", sfp_fixup_rollball),
+
// Fiberstore SFP-10G-T doesn't identify as copper, uses the Rollball
// protocol to talk to the PHY and needs 4 sec wait before probing the
// PHY.
diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
index 8c98cbd4b06d..702a7f7183ce 100644
--- a/drivers/net/ppp/ppp_generic.c
+++ b/drivers/net/ppp/ppp_generic.c
@@ -33,6 +33,7 @@
#include <linux/ppp_channel.h>
#include <linux/ppp-comp.h>
#include <linux/skbuff.h>
+#include <linux/rculist.h>
#include <linux/rtnetlink.h>
#include <linux/if_arp.h>
#include <linux/ip.h>
@@ -1598,11 +1599,14 @@ static int ppp_fill_forward_path(struct net_device_path_ctx *ctx,
if (ppp->flags & SC_MULTILINK)
return -EOPNOTSUPP;
- if (list_empty(&ppp->channels))
+ pch = list_first_or_null_rcu(&ppp->channels, struct channel, clist);
+ if (!pch)
+ return -ENODEV;
+
+ chan = READ_ONCE(pch->chan);
+ if (!chan)
return -ENODEV;
- pch = list_first_entry(&ppp->channels, struct channel, clist);
- chan = pch->chan;
if (!chan->ops->fill_forward_path)
return -EOPNOTSUPP;
@@ -1740,7 +1744,6 @@ pad_compress_skb(struct ppp *ppp, struct sk_buff *skb)
*/
if (net_ratelimit())
netdev_err(ppp->dev, "ppp: compressor dropped pkt\n");
- kfree_skb(skb);
consume_skb(new_skb);
new_skb = NULL;
}
@@ -1841,9 +1844,10 @@ ppp_send_frame(struct ppp *ppp, struct sk_buff *skb)
"down - pkt dropped.\n");
goto drop;
}
- skb = pad_compress_skb(ppp, skb);
- if (!skb)
+ new_skb = pad_compress_skb(ppp, skb);
+ if (!new_skb)
goto drop;
+ skb = new_skb;
}
/*
@@ -2994,7 +2998,7 @@ ppp_unregister_channel(struct ppp_channel *chan)
*/
down_write(&pch->chan_sem);
spin_lock_bh(&pch->downl);
- pch->chan = NULL;
+ WRITE_ONCE(pch->chan, NULL);
spin_unlock_bh(&pch->downl);
up_write(&pch->chan_sem);
ppp_disconnect_channel(pch);
@@ -3515,7 +3519,7 @@ ppp_connect_channel(struct channel *pch, int unit)
hdrlen = pch->file.hdrlen + 2; /* for protocol bytes */
if (hdrlen > ppp->dev->hard_header_len)
ppp->dev->hard_header_len = hdrlen;
- list_add_tail(&pch->clist, &ppp->channels);
+ list_add_tail_rcu(&pch->clist, &ppp->channels);
++ppp->n_channels;
pch->ppp = ppp;
refcount_inc(&ppp->file.refcnt);
@@ -3545,10 +3549,11 @@ ppp_disconnect_channel(struct channel *pch)
if (ppp) {
/* remove it from the ppp unit's list */
ppp_lock(ppp);
- list_del(&pch->clist);
+ list_del_rcu(&pch->clist);
if (--ppp->n_channels == 0)
wake_up_interruptible(&ppp->file.rwait);
ppp_unlock(ppp);
+ synchronize_net();
if (refcount_dec_and_test(&ppp->file.refcnt))
ppp_destroy_interface(ppp);
err = 0;
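
The ppp_generic hunks above switch ppp->channels to RCU list primitives so that ppp_fill_forward_path() can read the first channel without taking the channel lock, while writers still serialize under ppp_lock() and wait out readers with synchronize_net(). A minimal sketch of that reader/writer split, with hypothetical example_* helpers rather than the driver's actual code:

	/* Writer: unpublish the entry, then wait until no reader can see it. */
	static void example_del_channel(struct ppp *ppp, struct channel *pch)
	{
		ppp_lock(ppp);
		list_del_rcu(&pch->clist);
		ppp_unlock(ppp);
		synchronize_net();	/* all RCU readers are done with pch now */
	}

	/* Reader: caller holds rcu_read_lock(); use only RCU-safe accessors. */
	static struct ppp_channel *example_first_chan(struct ppp *ppp)
	{
		struct channel *pch;

		pch = list_first_or_null_rcu(&ppp->channels, struct channel, clist);
		return pch ? READ_ONCE(pch->chan) : NULL;
	}
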
diff --git a/drivers/net/pse-pd/pd692x0.c b/drivers/net/pse-pd/pd692x0.c
index 399ce9febda4..f4e91ba64a66 100644
--- a/drivers/net/pse-pd/pd692x0.c
+++ b/drivers/net/pse-pd/pd692x0.c
@@ -1041,6 +1041,10 @@ pd692x0_configure_managers(struct pd692x0_priv *priv, int nmanagers)
int pw_budget;
pw_budget = regulator_get_unclaimed_power_budget(supply);
+ if (!pw_budget)
+ /* Do nothing if no power budget */
+ continue;
+
/* Max power budget per manager */
if (pw_budget > 6000000)
pw_budget = 6000000;
@@ -1162,12 +1166,44 @@ pd692x0_write_ports_matrix(struct pd692x0_priv *priv,
return 0;
}
+static void pd692x0_of_put_managers(struct pd692x0_priv *priv,
+ struct pd692x0_manager *manager,
+ int nmanagers)
+{
+ int i, j;
+
+ for (i = 0; i < nmanagers; i++) {
+ for (j = 0; j < manager[i].nports; j++)
+ of_node_put(manager[i].port_node[j]);
+ of_node_put(manager[i].node);
+ }
+}
+
+static void pd692x0_managers_free_pw_budget(struct pd692x0_priv *priv)
+{
+ int i;
+
+ for (i = 0; i < PD692X0_MAX_MANAGERS; i++) {
+ struct regulator *supply;
+
+ if (!priv->manager_reg[i] || !priv->manager_pw_budget[i])
+ continue;
+
+ supply = priv->manager_reg[i]->supply;
+ if (!supply)
+ continue;
+
+ regulator_free_power_budget(supply,
+ priv->manager_pw_budget[i]);
+ }
+}
+
static int pd692x0_setup_pi_matrix(struct pse_controller_dev *pcdev)
{
struct pd692x0_manager *manager __free(kfree) = NULL;
struct pd692x0_priv *priv = to_pd692x0_priv(pcdev);
struct pd692x0_matrix port_matrix[PD692X0_MAX_PIS];
- int ret, i, j, nmanagers;
+ int ret, nmanagers;
/* Should we flash the port matrix */
if (priv->fw_state != PD692X0_FW_OK &&
@@ -1185,31 +1221,27 @@ static int pd692x0_setup_pi_matrix(struct pse_controller_dev *pcdev)
nmanagers = ret;
ret = pd692x0_register_managers_regulator(priv, manager, nmanagers);
if (ret)
- goto out;
+ goto err_of_managers;
ret = pd692x0_configure_managers(priv, nmanagers);
if (ret)
- goto out;
+ goto err_of_managers;
ret = pd692x0_set_ports_matrix(priv, manager, nmanagers, port_matrix);
if (ret)
- goto out;
+ goto err_managers_req_pw;
ret = pd692x0_write_ports_matrix(priv, port_matrix);
if (ret)
- goto out;
-
-out:
- for (i = 0; i < nmanagers; i++) {
- struct regulator *supply = priv->manager_reg[i]->supply;
+ goto err_managers_req_pw;
- regulator_free_power_budget(supply,
- priv->manager_pw_budget[i]);
+ pd692x0_of_put_managers(priv, manager, nmanagers);
+ return 0;
- for (j = 0; j < manager[i].nports; j++)
- of_node_put(manager[i].port_node[j]);
- of_node_put(manager[i].node);
- }
+err_managers_req_pw:
+ pd692x0_managers_free_pw_budget(priv);
+err_of_managers:
+ pd692x0_of_put_managers(priv, manager, nmanagers);
return ret;
}
@@ -1748,6 +1780,7 @@ static void pd692x0_i2c_remove(struct i2c_client *client)
{
struct pd692x0_priv *priv = i2c_get_clientdata(client);
+ pd692x0_managers_free_pw_budget(priv);
firmware_upload_unregister(priv->fwl);
}
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index cc6c50180663..47ddcb4b9a78 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1875,6 +1875,9 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
local_bh_enable();
goto unlock_frags;
}
+
+ if (frags && skb != tfile->napi.skb)
+ tfile->napi.skb = skb;
}
rcu_read_unlock();
local_bh_enable();
diff --git a/drivers/net/usb/asix_devices.c b/drivers/net/usb/asix_devices.c
index 9b0318fb50b5..792ddda1ad49 100644
--- a/drivers/net/usb/asix_devices.c
+++ b/drivers/net/usb/asix_devices.c
@@ -676,6 +676,7 @@ static int ax88772_init_mdio(struct usbnet *dev)
priv->mdio->read = &asix_mdio_bus_read;
priv->mdio->write = &asix_mdio_bus_write;
priv->mdio->name = "Asix MDIO Bus";
+ priv->mdio->phy_mask = ~(BIT(priv->phy_addr & 0x1f) | BIT(AX_EMBD_PHY_ADDR));
/* mii bus name is usb-<usb bus number>-<usb device number> */
snprintf(priv->mdio->id, MII_BUS_ID_SIZE, "usb-%03d:%03d",
dev->udev->bus->busnum, dev->udev->devnum);
diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c
index ea0e5e276cd6..5d123df0a866 100644
--- a/drivers/net/usb/cdc_ncm.c
+++ b/drivers/net/usb/cdc_ncm.c
@@ -2087,6 +2087,13 @@ static const struct usb_device_id cdc_devs[] = {
.driver_info = (unsigned long)&wwan_info,
},
+ /* Intel modem (label from OEM reads Fibocom L850-GL) */
+ { USB_DEVICE_AND_INTERFACE_INFO(0x8087, 0x095a,
+ USB_CLASS_COMM,
+ USB_CDC_SUBCLASS_NCM, USB_CDC_PROTO_NONE),
+ .driver_info = (unsigned long)&wwan_info,
+ },
+
/* DisplayLink docking stations */
{ .match_flags = USB_DEVICE_ID_MATCH_INT_INFO
| USB_DEVICE_ID_MATCH_VENDOR,
diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index f5647ee0adde..11352d85475a 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -1355,12 +1355,16 @@ static const struct usb_device_id products[] = {
{QMI_FIXED_INTF(0x2357, 0x0201, 4)}, /* TP-LINK HSUPA Modem MA180 */
{QMI_FIXED_INTF(0x2357, 0x9000, 4)}, /* TP-LINK MA260 */
{QMI_QUIRK_SET_DTR(0x1bc7, 0x1031, 3)}, /* Telit LE910C1-EUX */
+ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1034, 2)}, /* Telit LE910C4-WWX */
+ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1037, 4)}, /* Telit LE910C4-WWX */
+ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1038, 3)}, /* Telit LE910C4-WWX */
{QMI_QUIRK_SET_DTR(0x1bc7, 0x103a, 0)}, /* Telit LE910C4-WWX */
{QMI_QUIRK_SET_DTR(0x1bc7, 0x1040, 2)}, /* Telit LE922A */
{QMI_QUIRK_SET_DTR(0x1bc7, 0x1050, 2)}, /* Telit FN980 */
{QMI_QUIRK_SET_DTR(0x1bc7, 0x1057, 2)}, /* Telit FN980 */
{QMI_QUIRK_SET_DTR(0x1bc7, 0x1060, 2)}, /* Telit LN920 */
{QMI_QUIRK_SET_DTR(0x1bc7, 0x1070, 2)}, /* Telit FN990A */
+ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1077, 2)}, /* Telit FN990A w/audio */
{QMI_QUIRK_SET_DTR(0x1bc7, 0x1080, 2)}, /* Telit FE990A */
{QMI_QUIRK_SET_DTR(0x1bc7, 0x10a0, 0)}, /* Telit FN920C04 */
{QMI_QUIRK_SET_DTR(0x1bc7, 0x10a4, 0)}, /* Telit FN920C04 */
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index d14e6d602273..975bdc5dab84 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -5758,14 +5758,15 @@ static void virtnet_freeze_down(struct virtio_device *vdev)
disable_rx_mode_work(vi);
flush_work(&vi->rx_mode_work);
- netif_tx_lock_bh(vi->dev);
- netif_device_detach(vi->dev);
- netif_tx_unlock_bh(vi->dev);
if (netif_running(vi->dev)) {
rtnl_lock();
virtnet_close(vi->dev);
rtnl_unlock();
}
+
+ netif_tx_lock_bh(vi->dev);
+ netif_device_detach(vi->dev);
+ netif_tx_unlock_bh(vi->dev);
}
static int init_vqs(struct virtnet_info *vi);
diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
index f32be2e301f2..dab864bc733c 100644
--- a/drivers/net/vxlan/vxlan_core.c
+++ b/drivers/net/vxlan/vxlan_core.c
@@ -1445,6 +1445,10 @@ static enum skb_drop_reason vxlan_snoop(struct net_device *dev,
if (READ_ONCE(f->updated) != now)
WRITE_ONCE(f->updated, now);
+	/* Don't override an fdb that has a nexthop with a learnt entry */
+ if (rcu_access_pointer(f->nh))
+ return SKB_DROP_REASON_VXLAN_ENTRY_EXISTS;
+
if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip) &&
rdst->remote_ifindex == ifindex))
return SKB_NOT_DROPPED_YET;
@@ -1453,10 +1457,6 @@ static enum skb_drop_reason vxlan_snoop(struct net_device *dev,
if (f->state & (NUD_PERMANENT | NUD_NOARP))
return SKB_DROP_REASON_VXLAN_ENTRY_EXISTS;
- /* Don't override an fdb with nexthop with a learnt entry */
- if (rcu_access_pointer(f->nh))
- return SKB_DROP_REASON_VXLAN_ENTRY_EXISTS;
-
if (net_ratelimit())
netdev_info(dev,
"%pM migrated from %pIS to %pIS\n",
@@ -1877,6 +1877,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
n = neigh_lookup(&arp_tbl, &tip, dev);
if (n) {
+ struct vxlan_rdst *rdst = NULL;
struct vxlan_fdb *f;
struct sk_buff *reply;
@@ -1887,7 +1888,9 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
rcu_read_lock();
f = vxlan_find_mac_tx(vxlan, n->ha, vni);
- if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
+ if (f)
+ rdst = first_remote_rcu(f);
+ if (rdst && vxlan_addr_any(&rdst->remote_ip)) {
/* bridge-local neighbor */
neigh_release(n);
rcu_read_unlock();
@@ -2044,6 +2047,7 @@ static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, dev);
if (n) {
+ struct vxlan_rdst *rdst = NULL;
struct vxlan_fdb *f;
struct sk_buff *reply;
@@ -2053,7 +2057,9 @@ static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
}
f = vxlan_find_mac_tx(vxlan, n->ha, vni);
- if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
+ if (f)
+ rdst = first_remote_rcu(f);
+ if (rdst && vxlan_addr_any(&rdst->remote_ip)) {
/* bridge-local neighbor */
neigh_release(n);
goto out;
diff --git a/drivers/net/vxlan/vxlan_private.h b/drivers/net/vxlan/vxlan_private.h
index 6c625fb29c6c..99fe772ad679 100644
--- a/drivers/net/vxlan/vxlan_private.h
+++ b/drivers/net/vxlan/vxlan_private.h
@@ -61,9 +61,7 @@ static inline struct hlist_head *vs_head(struct net *net, __be16 port)
return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)];
}
-/* First remote destination for a forwarding entry.
- * Guaranteed to be non-NULL because remotes are never deleted.
- */
+/* First remote destination for a forwarding entry. */
static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb)
{
if (rcu_access_pointer(fdb->nh))
diff --git a/drivers/net/wan/lapbether.c b/drivers/net/wan/lapbether.c
index 995a7207bdf8..f357a7ac70ac 100644
--- a/drivers/net/wan/lapbether.c
+++ b/drivers/net/wan/lapbether.c
@@ -81,7 +81,7 @@ static struct lapbethdev *lapbeth_get_x25_dev(struct net_device *dev)
static __inline__ int dev_is_ethdev(struct net_device *dev)
{
- return dev->type == ARPHRD_ETHER && strncmp(dev->name, "dummy", 5);
+ return dev->type == ARPHRD_ETHER && !netdev_need_ops_lock(dev);
}
/* ------------------------------------------------------------------------ */
diff --git a/drivers/net/wireless/ath/ath11k/core.h b/drivers/net/wireless/ath/ath11k/core.h
index 220d69a7a429..e8780b05ce11 100644
--- a/drivers/net/wireless/ath/ath11k/core.h
+++ b/drivers/net/wireless/ath/ath11k/core.h
@@ -411,6 +411,8 @@ struct ath11k_vif {
bool do_not_send_tmpl;
struct ath11k_arp_ns_offload arp_ns_offload;
struct ath11k_rekey_data rekey_data;
+ u32 num_stations;
+ bool reinstall_group_keys;
struct ath11k_reg_tpc_power_info reg_tpc_info;
diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c
index 1fadf5faafb8..106e2530b64e 100644
--- a/drivers/net/wireless/ath/ath11k/mac.c
+++ b/drivers/net/wireless/ath/ath11k/mac.c
@@ -4317,6 +4317,40 @@ static int ath11k_clear_peer_keys(struct ath11k_vif *arvif,
return first_errno;
}
+static int ath11k_set_group_keys(struct ath11k_vif *arvif)
+{
+ struct ath11k *ar = arvif->ar;
+ struct ath11k_base *ab = ar->ab;
+ const u8 *addr = arvif->bssid;
+ int i, ret, first_errno = 0;
+ struct ath11k_peer *peer;
+
+ spin_lock_bh(&ab->base_lock);
+ peer = ath11k_peer_find(ab, arvif->vdev_id, addr);
+ spin_unlock_bh(&ab->base_lock);
+
+ if (!peer)
+ return -ENOENT;
+
+ for (i = 0; i < ARRAY_SIZE(peer->keys); i++) {
+ struct ieee80211_key_conf *key = peer->keys[i];
+
+ if (!key || (key->flags & IEEE80211_KEY_FLAG_PAIRWISE))
+ continue;
+
+ ret = ath11k_install_key(arvif, key, SET_KEY, addr,
+ WMI_KEY_GROUP);
+ if (ret < 0 && first_errno == 0)
+ first_errno = ret;
+
+ if (ret < 0)
+ ath11k_warn(ab, "failed to set group key of idx %d for vdev %d: %d\n",
+ i, arvif->vdev_id, ret);
+ }
+
+ return first_errno;
+}
+
static int ath11k_mac_op_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd,
struct ieee80211_vif *vif, struct ieee80211_sta *sta,
struct ieee80211_key_conf *key)
@@ -4326,6 +4360,7 @@ static int ath11k_mac_op_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd,
struct ath11k_vif *arvif = ath11k_vif_to_arvif(vif);
struct ath11k_peer *peer;
struct ath11k_sta *arsta;
+ bool is_ap_with_no_sta;
const u8 *peer_addr;
int ret = 0;
u32 flags = 0;
@@ -4386,16 +4421,57 @@ static int ath11k_mac_op_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd,
else
flags |= WMI_KEY_GROUP;
- ret = ath11k_install_key(arvif, key, cmd, peer_addr, flags);
- if (ret) {
- ath11k_warn(ab, "ath11k_install_key failed (%d)\n", ret);
- goto exit;
- }
+ ath11k_dbg(ar->ab, ATH11K_DBG_MAC,
+ "%s for peer %pM on vdev %d flags 0x%X, type = %d, num_sta %d\n",
+ cmd == SET_KEY ? "SET_KEY" : "DEL_KEY", peer_addr, arvif->vdev_id,
+ flags, arvif->vdev_type, arvif->num_stations);
+
+ /* Allow group key clearing only in AP mode when no stations are
+ * associated. There is a known race condition in firmware where
+ * group addressed packets may be dropped if the key is cleared
+ * and immediately set again during rekey.
+ *
+ * During GTK rekey, mac80211 issues a clear key (if the old key
+	 * exists) followed by an install key operation for the same key
+	 * index. This causes ath11k to send two WMI commands in quick
+ * succession: one to clear the old key and another to install the
+ * new key in the same slot.
+ *
+	 * Under certain conditions, especially under high load or in
+	 * time-sensitive scenarios, firmware may process these commands
+	 * asynchronously such that firmware assumes the key has been
+	 * cleared while hardware still holds a valid key. This
+	 * inconsistency between hardware and firmware leads to group
+	 * addressed packet drops after rekey.
+	 * Only setting the same key again can restore a valid key in
+	 * firmware and allow packets to be transmitted.
+	 *
+	 * There is a use case where an AP can transition from secure
+	 * mode to open mode without a vdev restart by simply deleting
+	 * all associated peers and clearing the key, hence allow
+	 * clearing the key for that case alone. Mark
+	 * arvif->reinstall_group_keys in such cases and reinstall the
+	 * same key when the first peer is added, allowing firmware to
+	 * recover from the race if it had occurred.
+ */
- ret = ath11k_dp_peer_rx_pn_replay_config(arvif, peer_addr, cmd, key);
- if (ret) {
- ath11k_warn(ab, "failed to offload PN replay detection %d\n", ret);
- goto exit;
+ is_ap_with_no_sta = (vif->type == NL80211_IFTYPE_AP &&
+ !arvif->num_stations);
+ if ((flags & WMI_KEY_PAIRWISE) || cmd == SET_KEY || is_ap_with_no_sta) {
+ ret = ath11k_install_key(arvif, key, cmd, peer_addr, flags);
+ if (ret) {
+ ath11k_warn(ab, "ath11k_install_key failed (%d)\n", ret);
+ goto exit;
+ }
+
+ ret = ath11k_dp_peer_rx_pn_replay_config(arvif, peer_addr, cmd, key);
+ if (ret) {
+ ath11k_warn(ab, "failed to offload PN replay detection %d\n",
+ ret);
+ goto exit;
+ }
+
+ if ((flags & WMI_KEY_GROUP) && cmd == SET_KEY && is_ap_with_no_sta)
+ arvif->reinstall_group_keys = true;
}
spin_lock_bh(&ab->base_lock);
@@ -4994,6 +5070,7 @@ static int ath11k_mac_inc_num_stations(struct ath11k_vif *arvif,
return -ENOBUFS;
ar->num_stations++;
+ arvif->num_stations++;
return 0;
}
@@ -5009,6 +5086,7 @@ static void ath11k_mac_dec_num_stations(struct ath11k_vif *arvif,
return;
ar->num_stations--;
+ arvif->num_stations--;
}
static u32 ath11k_mac_ieee80211_sta_bw_to_wmi(struct ath11k *ar,
@@ -9540,6 +9618,21 @@ static int ath11k_mac_station_add(struct ath11k *ar,
goto exit;
}
+	/* The driver allows the DEL KEY followed by SET KEY sequence for
+	 * group keys only when no clients are associated. If firmware has
+	 * entered the race during that window, reinstalling the same key
+	 * when the first station connects allows firmware to recover from
+	 * the race.
+ */
+ if (arvif->num_stations == 1 && arvif->reinstall_group_keys) {
+ ath11k_dbg(ab, ATH11K_DBG_MAC, "set group keys on 1st station add for vdev %d\n",
+ arvif->vdev_id);
+ ret = ath11k_set_group_keys(arvif);
+ if (ret)
+ goto dec_num_station;
+ arvif->reinstall_group_keys = false;
+ }
+
arsta->rx_stats = kzalloc(sizeof(*arsta->rx_stats), GFP_KERNEL);
if (!arsta->rx_stats) {
ret = -ENOMEM;
diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c
index bd1ec3b2c084..3a3965b79942 100644
--- a/drivers/net/wireless/ath/ath12k/mac.c
+++ b/drivers/net/wireless/ath/ath12k/mac.c
@@ -4078,12 +4078,68 @@ static int ath12k_mac_fils_discovery(struct ath12k_link_vif *arvif,
return ret;
}
+static void ath12k_mac_vif_setup_ps(struct ath12k_link_vif *arvif)
+{
+ struct ath12k *ar = arvif->ar;
+ struct ieee80211_vif *vif = arvif->ahvif->vif;
+ struct ieee80211_conf *conf = &ath12k_ar_to_hw(ar)->conf;
+ enum wmi_sta_powersave_param param;
+ struct ieee80211_bss_conf *info;
+ enum wmi_sta_ps_mode psmode;
+ int ret;
+ int timeout;
+ bool enable_ps;
+
+ lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy);
+
+ if (vif->type != NL80211_IFTYPE_STATION)
+ return;
+
+ enable_ps = arvif->ahvif->ps;
+ if (enable_ps) {
+ psmode = WMI_STA_PS_MODE_ENABLED;
+ param = WMI_STA_PS_PARAM_INACTIVITY_TIME;
+
+ timeout = conf->dynamic_ps_timeout;
+ if (timeout == 0) {
+ info = ath12k_mac_get_link_bss_conf(arvif);
+ if (!info) {
+ ath12k_warn(ar->ab, "unable to access bss link conf in setup ps for vif %pM link %u\n",
+ vif->addr, arvif->link_id);
+ return;
+ }
+
+ /* firmware doesn't like 0 */
+ timeout = ieee80211_tu_to_usec(info->beacon_int) / 1000;
+ }
+
+ ret = ath12k_wmi_set_sta_ps_param(ar, arvif->vdev_id, param,
+ timeout);
+ if (ret) {
+ ath12k_warn(ar->ab, "failed to set inactivity time for vdev %d: %i\n",
+ arvif->vdev_id, ret);
+ return;
+ }
+ } else {
+ psmode = WMI_STA_PS_MODE_DISABLED;
+ }
+
+ ath12k_dbg(ar->ab, ATH12K_DBG_MAC, "mac vdev %d psmode %s\n",
+ arvif->vdev_id, psmode ? "enable" : "disable");
+
+ ret = ath12k_wmi_pdev_set_ps_mode(ar, arvif->vdev_id, psmode);
+ if (ret)
+ ath12k_warn(ar->ab, "failed to set sta power save mode %d for vdev %d: %d\n",
+ psmode, arvif->vdev_id, ret);
+}
+
static void ath12k_mac_op_vif_cfg_changed(struct ieee80211_hw *hw,
struct ieee80211_vif *vif,
u64 changed)
{
struct ath12k_vif *ahvif = ath12k_vif_to_ahvif(vif);
unsigned long links = ahvif->links_map;
+ struct ieee80211_vif_cfg *vif_cfg;
struct ieee80211_bss_conf *info;
struct ath12k_link_vif *arvif;
struct ieee80211_sta *sta;
@@ -4147,61 +4203,24 @@ static void ath12k_mac_op_vif_cfg_changed(struct ieee80211_hw *hw,
}
}
}
-}
-
-static void ath12k_mac_vif_setup_ps(struct ath12k_link_vif *arvif)
-{
- struct ath12k *ar = arvif->ar;
- struct ieee80211_vif *vif = arvif->ahvif->vif;
- struct ieee80211_conf *conf = &ath12k_ar_to_hw(ar)->conf;
- enum wmi_sta_powersave_param param;
- struct ieee80211_bss_conf *info;
- enum wmi_sta_ps_mode psmode;
- int ret;
- int timeout;
- bool enable_ps;
- lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy);
+ if (changed & BSS_CHANGED_PS) {
+ links = ahvif->links_map;
+ vif_cfg = &vif->cfg;
- if (vif->type != NL80211_IFTYPE_STATION)
- return;
+ for_each_set_bit(link_id, &links, IEEE80211_MLD_MAX_NUM_LINKS) {
+ arvif = wiphy_dereference(hw->wiphy, ahvif->link[link_id]);
+ if (!arvif || !arvif->ar)
+ continue;
- enable_ps = arvif->ahvif->ps;
- if (enable_ps) {
- psmode = WMI_STA_PS_MODE_ENABLED;
- param = WMI_STA_PS_PARAM_INACTIVITY_TIME;
+ ar = arvif->ar;
- timeout = conf->dynamic_ps_timeout;
- if (timeout == 0) {
- info = ath12k_mac_get_link_bss_conf(arvif);
- if (!info) {
- ath12k_warn(ar->ab, "unable to access bss link conf in setup ps for vif %pM link %u\n",
- vif->addr, arvif->link_id);
- return;
+ if (ar->ab->hw_params->supports_sta_ps) {
+ ahvif->ps = vif_cfg->ps;
+ ath12k_mac_vif_setup_ps(arvif);
}
-
- /* firmware doesn't like 0 */
- timeout = ieee80211_tu_to_usec(info->beacon_int) / 1000;
}
-
- ret = ath12k_wmi_set_sta_ps_param(ar, arvif->vdev_id, param,
- timeout);
- if (ret) {
- ath12k_warn(ar->ab, "failed to set inactivity time for vdev %d: %i\n",
- arvif->vdev_id, ret);
- return;
- }
- } else {
- psmode = WMI_STA_PS_MODE_DISABLED;
}
-
- ath12k_dbg(ar->ab, ATH12K_DBG_MAC, "mac vdev %d psmode %s\n",
- arvif->vdev_id, psmode ? "enable" : "disable");
-
- ret = ath12k_wmi_pdev_set_ps_mode(ar, arvif->vdev_id, psmode);
- if (ret)
- ath12k_warn(ar->ab, "failed to set sta power save mode %d for vdev %d: %d\n",
- psmode, arvif->vdev_id, ret);
}
static bool ath12k_mac_supports_tpc(struct ath12k *ar, struct ath12k_vif *ahvif,
@@ -4223,7 +4242,6 @@ static void ath12k_mac_bss_info_changed(struct ath12k *ar,
{
struct ath12k_vif *ahvif = arvif->ahvif;
struct ieee80211_vif *vif = ath12k_ahvif_to_vif(ahvif);
- struct ieee80211_vif_cfg *vif_cfg = &vif->cfg;
struct cfg80211_chan_def def;
u32 param_id, param_value;
enum nl80211_band band;
@@ -4510,12 +4528,6 @@ static void ath12k_mac_bss_info_changed(struct ath12k *ar,
}
ath12k_mac_fils_discovery(arvif, info);
-
- if (changed & BSS_CHANGED_PS &&
- ar->ab->hw_params->supports_sta_ps) {
- ahvif->ps = vif_cfg->ps;
- ath12k_mac_vif_setup_ps(arvif);
- }
}
static struct ath12k_vif_cache *ath12k_ahvif_get_link_cache(struct ath12k_vif *ahvif,
diff --git a/drivers/net/wireless/ath/ath12k/wmi.c b/drivers/net/wireless/ath/ath12k/wmi.c
index da85c28ec355..29dadedefdd2 100644
--- a/drivers/net/wireless/ath/ath12k/wmi.c
+++ b/drivers/net/wireless/ath/ath12k/wmi.c
@@ -843,7 +843,7 @@ int ath12k_wmi_mgmt_send(struct ath12k_link_vif *arvif, u32 buf_id,
cmd->tx_params_valid = 0;
frame_tlv = (struct wmi_tlv *)(skb->data + sizeof(*cmd));
- frame_tlv->header = ath12k_wmi_tlv_hdr(WMI_TAG_ARRAY_BYTE, buf_len);
+ frame_tlv->header = ath12k_wmi_tlv_hdr(WMI_TAG_ARRAY_BYTE, buf_len_aligned);
memcpy(frame_tlv->value, frame->data, buf_len);
@@ -2423,6 +2423,7 @@ int ath12k_wmi_send_peer_assoc_cmd(struct ath12k *ar,
eml_cap = arg->ml.eml_cap;
if (u16_get_bits(eml_cap, IEEE80211_EML_CAP_EMLSR_SUPP)) {
+ ml_params->flags |= cpu_to_le32(ATH12K_WMI_FLAG_MLO_EMLSR_SUPPORT);
/* Padding delay */
eml_pad_delay = ieee80211_emlsr_pad_delay_in_us(eml_cap);
ml_params->emlsr_padding_delay_us = cpu_to_le32(eml_pad_delay);
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/btcoex.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/btcoex.c
index 69ef8cf203d2..67c0c5a92f99 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/btcoex.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/btcoex.c
@@ -393,10 +393,8 @@ void brcmf_btcoex_detach(struct brcmf_cfg80211_info *cfg)
if (!cfg->btcoex)
return;
- if (cfg->btcoex->timer_on) {
- cfg->btcoex->timer_on = false;
- timer_shutdown_sync(&cfg->btcoex->timer);
- }
+ timer_shutdown_sync(&cfg->btcoex->timer);
+ cfg->btcoex->timer_on = false;
cancel_work_sync(&cfg->btcoex->work);
diff --git a/drivers/net/wireless/intel/iwlwifi/fw/acpi.c b/drivers/net/wireless/intel/iwlwifi/fw/acpi.c
index bee7d92293b8..7ec22738b5d6 100644
--- a/drivers/net/wireless/intel/iwlwifi/fw/acpi.c
+++ b/drivers/net/wireless/intel/iwlwifi/fw/acpi.c
@@ -169,7 +169,7 @@ int iwl_acpi_get_dsm(struct iwl_fw_runtime *fwrt,
BUILD_BUG_ON(ARRAY_SIZE(acpi_dsm_size) != DSM_FUNC_NUM_FUNCS);
- if (WARN_ON(func >= ARRAY_SIZE(acpi_dsm_size)))
+ if (WARN_ON(func >= ARRAY_SIZE(acpi_dsm_size) || !func))
return -EINVAL;
expected_size = acpi_dsm_size[func];
@@ -178,6 +178,29 @@ int iwl_acpi_get_dsm(struct iwl_fw_runtime *fwrt,
if (expected_size != sizeof(u8) && expected_size != sizeof(u32))
return -EOPNOTSUPP;
+ if (!fwrt->acpi_dsm_funcs_valid) {
+ ret = iwl_acpi_get_dsm_integer(fwrt->dev, ACPI_DSM_REV,
+ DSM_FUNC_QUERY,
+ &iwl_guid, &tmp,
+ acpi_dsm_size[DSM_FUNC_QUERY]);
+ if (ret) {
+ /* always indicate BIT(0) to avoid re-reading */
+ fwrt->acpi_dsm_funcs_valid = BIT(0);
+ return ret;
+ }
+
+ IWL_DEBUG_RADIO(fwrt, "ACPI DSM validity bitmap 0x%x\n",
+ (u32)tmp);
+ /* always indicate BIT(0) to avoid re-reading */
+ fwrt->acpi_dsm_funcs_valid = tmp | BIT(0);
+ }
+
+ if (!(fwrt->acpi_dsm_funcs_valid & BIT(func))) {
+ IWL_DEBUG_RADIO(fwrt, "ACPI DSM %d not indicated as valid\n",
+ func);
+ return -ENODATA;
+ }
+
ret = iwl_acpi_get_dsm_integer(fwrt->dev, ACPI_DSM_REV, func,
&iwl_guid, &tmp, expected_size);
if (ret)
diff --git a/drivers/net/wireless/intel/iwlwifi/fw/runtime.h b/drivers/net/wireless/intel/iwlwifi/fw/runtime.h
index 0444a736c2b2..bd3bc2846cfa 100644
--- a/drivers/net/wireless/intel/iwlwifi/fw/runtime.h
+++ b/drivers/net/wireless/intel/iwlwifi/fw/runtime.h
@@ -113,6 +113,10 @@ struct iwl_txf_iter_data {
* @phy_filters: specific phy filters as read from WPFC BIOS table
* @ppag_bios_rev: PPAG BIOS revision
* @ppag_bios_source: see &enum bios_source
+ * @acpi_dsm_funcs_valid: bitmap indicating which DSM values are valid,
+ *	zero (the default initialization) means it hasn't been read yet;
+ *	BIT(0) is set once it has been read, since function 0 (which
+ *	returns this bitmap) is always supported
*/
struct iwl_fw_runtime {
struct iwl_trans *trans;
@@ -189,6 +193,10 @@ struct iwl_fw_runtime {
bool uats_valid;
u8 uefi_tables_lock_status;
struct iwl_phy_specific_cfg phy_filters;
+
+#ifdef CONFIG_ACPI
+ u32 acpi_dsm_funcs_valid;
+#endif
};
void iwl_fw_runtime_init(struct iwl_fw_runtime *fwrt, struct iwl_trans *trans,
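
The runtime.h and acpi.c hunks above cache the ACPI DSM validity bitmap in the firmware runtime so the QUERY function is evaluated only once, with BIT(0) doubling as the "already read" marker because function 0 is the query itself and is always supported. A minimal sketch of that lazy-read-and-cache pattern, with a hypothetical example_dsm_valid() helper rather than the driver's actual code:

	/* Illustrative only: read the DSM validity bitmap once and cache it. */
	static bool example_dsm_valid(struct iwl_fw_runtime *fwrt, enum iwl_dsm_funcs func)
	{
		u64 tmp;

		if (!fwrt->acpi_dsm_funcs_valid) {
			if (iwl_acpi_get_dsm_integer(fwrt->dev, ACPI_DSM_REV,
						     DSM_FUNC_QUERY, &iwl_guid, &tmp,
						     acpi_dsm_size[DSM_FUNC_QUERY]))
				tmp = 0;	/* query failed: nothing is valid */
			/* BIT(0) marks the bitmap as read even on failure */
			fwrt->acpi_dsm_funcs_valid = tmp | BIT(0);
		}

		return fwrt->acpi_dsm_funcs_valid & BIT(func);
	}
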
diff --git a/drivers/net/wireless/intel/iwlwifi/fw/uefi.c b/drivers/net/wireless/intel/iwlwifi/fw/uefi.c
index 48126ec6b94b..99a17b9323e9 100644
--- a/drivers/net/wireless/intel/iwlwifi/fw/uefi.c
+++ b/drivers/net/wireless/intel/iwlwifi/fw/uefi.c
@@ -747,6 +747,12 @@ int iwl_uefi_get_dsm(struct iwl_fw_runtime *fwrt, enum iwl_dsm_funcs func,
goto out;
}
+ if (!(data->functions[DSM_FUNC_QUERY] & BIT(func))) {
+ IWL_DEBUG_RADIO(fwrt, "DSM func %d not in 0x%x\n",
+ func, data->functions[DSM_FUNC_QUERY]);
+ goto out;
+ }
+
*value = data->functions[func];
IWL_DEBUG_RADIO(fwrt,
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c
index b7add05f7a85..7e56e4ff7642 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c
@@ -124,13 +124,13 @@ VISIBLE_IF_IWLWIFI_KUNIT const struct pci_device_id iwl_hw_card_ids[] = {
{IWL_PCI_DEVICE(0x0082, 0x1304, iwl6005_mac_cfg)},/* low 5GHz active */
{IWL_PCI_DEVICE(0x0082, 0x1305, iwl6005_mac_cfg)},/* high 5GHz active */
-/* 6x30 Series */
- {IWL_PCI_DEVICE(0x008A, 0x5305, iwl1000_mac_cfg)},
- {IWL_PCI_DEVICE(0x008A, 0x5307, iwl1000_mac_cfg)},
- {IWL_PCI_DEVICE(0x008A, 0x5325, iwl1000_mac_cfg)},
- {IWL_PCI_DEVICE(0x008A, 0x5327, iwl1000_mac_cfg)},
- {IWL_PCI_DEVICE(0x008B, 0x5315, iwl1000_mac_cfg)},
- {IWL_PCI_DEVICE(0x008B, 0x5317, iwl1000_mac_cfg)},
+/* 1030/6x30 Series */
+ {IWL_PCI_DEVICE(0x008A, 0x5305, iwl6030_mac_cfg)},
+ {IWL_PCI_DEVICE(0x008A, 0x5307, iwl6030_mac_cfg)},
+ {IWL_PCI_DEVICE(0x008A, 0x5325, iwl6030_mac_cfg)},
+ {IWL_PCI_DEVICE(0x008A, 0x5327, iwl6030_mac_cfg)},
+ {IWL_PCI_DEVICE(0x008B, 0x5315, iwl6030_mac_cfg)},
+ {IWL_PCI_DEVICE(0x008B, 0x5317, iwl6030_mac_cfg)},
{IWL_PCI_DEVICE(0x0090, 0x5211, iwl6030_mac_cfg)},
{IWL_PCI_DEVICE(0x0090, 0x5215, iwl6030_mac_cfg)},
{IWL_PCI_DEVICE(0x0090, 0x5216, iwl6030_mac_cfg)},
@@ -181,12 +181,12 @@ VISIBLE_IF_IWLWIFI_KUNIT const struct pci_device_id iwl_hw_card_ids[] = {
{IWL_PCI_DEVICE(0x08AE, 0x1027, iwl1000_mac_cfg)},
/* 130 Series WiFi */
- {IWL_PCI_DEVICE(0x0896, 0x5005, iwl1000_mac_cfg)},
- {IWL_PCI_DEVICE(0x0896, 0x5007, iwl1000_mac_cfg)},
- {IWL_PCI_DEVICE(0x0897, 0x5015, iwl1000_mac_cfg)},
- {IWL_PCI_DEVICE(0x0897, 0x5017, iwl1000_mac_cfg)},
- {IWL_PCI_DEVICE(0x0896, 0x5025, iwl1000_mac_cfg)},
- {IWL_PCI_DEVICE(0x0896, 0x5027, iwl1000_mac_cfg)},
+ {IWL_PCI_DEVICE(0x0896, 0x5005, iwl6030_mac_cfg)},
+ {IWL_PCI_DEVICE(0x0896, 0x5007, iwl6030_mac_cfg)},
+ {IWL_PCI_DEVICE(0x0897, 0x5015, iwl6030_mac_cfg)},
+ {IWL_PCI_DEVICE(0x0897, 0x5017, iwl6030_mac_cfg)},
+ {IWL_PCI_DEVICE(0x0896, 0x5025, iwl6030_mac_cfg)},
+ {IWL_PCI_DEVICE(0x0896, 0x5027, iwl6030_mac_cfg)},
/* 2x00 Series */
{IWL_PCI_DEVICE(0x0890, 0x4022, iwl2000_mac_cfg)},
@@ -673,6 +673,8 @@ VISIBLE_IF_IWLWIFI_KUNIT const struct iwl_dev_info iwl_dev_info_table[] = {
IWL_DEV_INFO(iwl6005_n_cfg, iwl6005_2agn_sff_name,
DEVICE(0x0082), SUBDEV_MASKED(0xC000, 0xF000)),
+ IWL_DEV_INFO(iwl6005_n_cfg, iwl6005_2agn_sff_name,
+ DEVICE(0x0085), SUBDEV_MASKED(0xC000, 0xF000)),
IWL_DEV_INFO(iwl6005_n_cfg, iwl6005_2agn_d_name,
DEVICE(0x0082), SUBDEV(0x4820)),
IWL_DEV_INFO(iwl6005_n_cfg, iwl6005_2agn_mow1_name,
@@ -729,10 +731,10 @@ VISIBLE_IF_IWLWIFI_KUNIT const struct iwl_dev_info iwl_dev_info_table[] = {
DEVICE(0x0083), SUBDEV_MASKED(0x5, 0xF)),
IWL_DEV_INFO(iwl1000_bg_cfg, iwl1000_bg_name,
DEVICE(0x0083), SUBDEV_MASKED(0x6, 0xF)),
+ IWL_DEV_INFO(iwl1000_bgn_cfg, iwl1000_bgn_name,
+ DEVICE(0x0084), SUBDEV_MASKED(0x5, 0xF)),
IWL_DEV_INFO(iwl1000_bg_cfg, iwl1000_bg_name,
- DEVICE(0x0084), SUBDEV(0x1216)),
- IWL_DEV_INFO(iwl1000_bg_cfg, iwl1000_bg_name,
- DEVICE(0x0084), SUBDEV(0x1316)),
+ DEVICE(0x0084), SUBDEV_MASKED(0x6, 0xF)),
/* 100 Series WiFi */
IWL_DEV_INFO(iwl100_bgn_cfg, iwl100_bgn_name,
@@ -964,6 +966,12 @@ VISIBLE_IF_IWLWIFI_KUNIT const struct iwl_dev_info iwl_dev_info_table[] = {
DEVICE(0x24F3), SUBDEV(0x0004)),
IWL_DEV_INFO(iwl8260_cfg, iwl8260_2n_name,
DEVICE(0x24F3), SUBDEV(0x0044)),
+ IWL_DEV_INFO(iwl8260_cfg, iwl8260_2ac_name,
+ DEVICE(0x24F4)),
+ IWL_DEV_INFO(iwl8260_cfg, iwl4165_2ac_name,
+ DEVICE(0x24F5)),
+ IWL_DEV_INFO(iwl8260_cfg, iwl4165_2ac_name,
+ DEVICE(0x24F6)),
IWL_DEV_INFO(iwl8265_cfg, iwl8265_2ac_name,
DEVICE(0x24FD)),
IWL_DEV_INFO(iwl8265_cfg, iwl8275_2ac_name,
@@ -1222,11 +1230,15 @@ static int _iwl_pci_resume(struct device *device, bool restore)
* Note: MAC (bits 0:7) will be cleared upon suspend even with wowlan,
* but not bits [15:8]. So if we have bits set in lower word, assume
* the device is alive.
+ * Alternatively, if the scratch value is 0xFFFFFFFF, then we no longer
+ * have access to the device and consider it powered off.
* For older devices, just try silently to grab the NIC.
*/
if (trans->mac_cfg->device_family >= IWL_DEVICE_FAMILY_BZ) {
- if (!(iwl_read32(trans, CSR_FUNC_SCRATCH) &
- CSR_FUNC_SCRATCH_POWER_OFF_MASK))
+ u32 scratch = iwl_read32(trans, CSR_FUNC_SCRATCH);
+
+ if (!(scratch & CSR_FUNC_SCRATCH_POWER_OFF_MASK) ||
+ scratch == ~0U)
device_was_powered_off = true;
} else {
/*
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/tx.c b/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/tx.c
index 84a05cc1c27a..bb03dad4a300 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/tx.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/tx.c
@@ -2092,7 +2092,8 @@ static void iwl_txq_gen1_update_byte_cnt_tbl(struct iwl_trans *trans,
break;
}
- if (trans->mac_cfg->device_family < IWL_DEVICE_FAMILY_AX210)
+ if (trans->mac_cfg->device_family >= IWL_DEVICE_FAMILY_7000 &&
+ trans->mac_cfg->device_family < IWL_DEVICE_FAMILY_AX210)
len = DIV_ROUND_UP(len, 4);
if (WARN_ON(len > 0xFFF || write_ptr >= TFD_QUEUE_SIZE_MAX))
diff --git a/drivers/net/wireless/marvell/libertas/cfg.c b/drivers/net/wireless/marvell/libertas/cfg.c
index 94dd488becaf..caba7491cd5a 100644
--- a/drivers/net/wireless/marvell/libertas/cfg.c
+++ b/drivers/net/wireless/marvell/libertas/cfg.c
@@ -1151,10 +1151,13 @@ static int lbs_associate(struct lbs_private *priv,
/* add SSID TLV */
rcu_read_lock();
ssid_eid = ieee80211_bss_get_ie(bss, WLAN_EID_SSID);
- if (ssid_eid)
- pos += lbs_add_ssid_tlv(pos, ssid_eid + 2, ssid_eid[1]);
- else
+ if (ssid_eid) {
+ u32 ssid_len = min(ssid_eid[1], IEEE80211_MAX_SSID_LEN);
+
+ pos += lbs_add_ssid_tlv(pos, ssid_eid + 2, ssid_len);
+ } else {
lbs_deb_assoc("no SSID\n");
+ }
rcu_read_unlock();
/* add DS param TLV */
diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
index 3498743d5ec0..4c8c7a5fdf23 100644
--- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c
+++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
@@ -4673,8 +4673,9 @@ int mwifiex_init_channel_scan_gap(struct mwifiex_adapter *adapter)
* additional active scan request for hidden SSIDs on passive channels.
*/
adapter->num_in_chan_stats = 2 * (n_channels_bg + n_channels_a);
- adapter->chan_stats = vmalloc(array_size(sizeof(*adapter->chan_stats),
- adapter->num_in_chan_stats));
+ adapter->chan_stats = kcalloc(adapter->num_in_chan_stats,
+ sizeof(*adapter->chan_stats),
+ GFP_KERNEL);
if (!adapter->chan_stats)
return -ENOMEM;
diff --git a/drivers/net/wireless/marvell/mwifiex/main.c b/drivers/net/wireless/marvell/mwifiex/main.c
index 7b50a88a18e5..1ec069bc8ea1 100644
--- a/drivers/net/wireless/marvell/mwifiex/main.c
+++ b/drivers/net/wireless/marvell/mwifiex/main.c
@@ -642,7 +642,7 @@ static int _mwifiex_fw_dpc(const struct firmware *firmware, void *context)
goto done;
err_add_intf:
- vfree(adapter->chan_stats);
+ kfree(adapter->chan_stats);
err_init_chan_scan:
wiphy_unregister(adapter->wiphy);
wiphy_free(adapter->wiphy);
@@ -1485,7 +1485,7 @@ static void mwifiex_uninit_sw(struct mwifiex_adapter *adapter)
wiphy_free(adapter->wiphy);
adapter->wiphy = NULL;
- vfree(adapter->chan_stats);
+ kfree(adapter->chan_stats);
mwifiex_free_cmd_buffers(adapter);
}
diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c
index 3afe4c4cd7bb..59adf3312617 100644
--- a/drivers/net/wireless/mediatek/mt76/mac80211.c
+++ b/drivers/net/wireless/mediatek/mt76/mac80211.c
@@ -818,6 +818,43 @@ void mt76_free_device(struct mt76_dev *dev)
}
EXPORT_SYMBOL_GPL(mt76_free_device);
+static void mt76_reset_phy(struct mt76_phy *phy)
+{
+ if (!phy)
+ return;
+
+ INIT_LIST_HEAD(&phy->tx_list);
+}
+
+void mt76_reset_device(struct mt76_dev *dev)
+{
+ int i;
+
+ rcu_read_lock();
+ for (i = 0; i < ARRAY_SIZE(dev->wcid); i++) {
+ struct mt76_wcid *wcid;
+
+ wcid = rcu_dereference(dev->wcid[i]);
+ if (!wcid)
+ continue;
+
+ wcid->sta = 0;
+ mt76_wcid_cleanup(dev, wcid);
+ rcu_assign_pointer(dev->wcid[i], NULL);
+ }
+ rcu_read_unlock();
+
+ INIT_LIST_HEAD(&dev->wcid_list);
+ INIT_LIST_HEAD(&dev->sta_poll_list);
+ dev->vif_mask = 0;
+ memset(dev->wcid_mask, 0, sizeof(dev->wcid_mask));
+
+ mt76_reset_phy(&dev->phy);
+ for (i = 0; i < ARRAY_SIZE(dev->phys); i++)
+ mt76_reset_phy(dev->phys[i]);
+}
+EXPORT_SYMBOL_GPL(mt76_reset_device);
+
struct mt76_phy *mt76_vif_phy(struct ieee80211_hw *hw,
struct ieee80211_vif *vif)
{
@@ -1679,6 +1716,10 @@ void mt76_wcid_cleanup(struct mt76_dev *dev, struct mt76_wcid *wcid)
skb_queue_splice_tail_init(&wcid->tx_pending, &list);
spin_unlock(&wcid->tx_pending.lock);
+ spin_lock(&wcid->tx_offchannel.lock);
+ skb_queue_splice_tail_init(&wcid->tx_offchannel, &list);
+ spin_unlock(&wcid->tx_offchannel.lock);
+
spin_unlock_bh(&phy->tx_lock);
while ((skb = __skb_dequeue(&list)) != NULL) {
@@ -1690,7 +1731,7 @@ EXPORT_SYMBOL_GPL(mt76_wcid_cleanup);
void mt76_wcid_add_poll(struct mt76_dev *dev, struct mt76_wcid *wcid)
{
- if (test_bit(MT76_MCU_RESET, &dev->phy.state))
+ if (test_bit(MT76_MCU_RESET, &dev->phy.state) || !wcid->sta)
return;
spin_lock_bh(&dev->sta_poll_lock);
diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h
index 8dd5c29fb75b..127637454c82 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76.h
@@ -1243,6 +1243,7 @@ int mt76_register_device(struct mt76_dev *dev, bool vht,
struct ieee80211_rate *rates, int n_rates);
void mt76_unregister_device(struct mt76_dev *dev);
void mt76_free_device(struct mt76_dev *dev);
+void mt76_reset_device(struct mt76_dev *dev);
void mt76_unregister_phy(struct mt76_phy *phy);
struct mt76_phy *mt76_alloc_radio_phy(struct mt76_dev *dev, unsigned int size,
diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mac.c b/drivers/net/wireless/mediatek/mt76/mt7915/mac.c
index 6639976afcee..1c0d310146d6 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7915/mac.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7915/mac.c
@@ -1460,17 +1460,15 @@ mt7915_mac_full_reset(struct mt7915_dev *dev)
if (i == 10)
dev_err(dev->mt76.dev, "chip full reset failed\n");
- spin_lock_bh(&dev->mt76.sta_poll_lock);
- while (!list_empty(&dev->mt76.sta_poll_list))
- list_del_init(dev->mt76.sta_poll_list.next);
- spin_unlock_bh(&dev->mt76.sta_poll_lock);
-
- memset(dev->mt76.wcid_mask, 0, sizeof(dev->mt76.wcid_mask));
- dev->mt76.vif_mask = 0;
dev->phy.omac_mask = 0;
if (phy2)
phy2->omac_mask = 0;
+ mt76_reset_device(&dev->mt76);
+
+ INIT_LIST_HEAD(&dev->sta_rc_list);
+ INIT_LIST_HEAD(&dev->twt_list);
+
i = mt76_wcid_alloc(dev->mt76.wcid_mask, MT7915_WTBL_STA);
dev->mt76.global_wcid.idx = i;
dev->recovery.hw_full_reset = false;
diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
index 40954e64c7fc..5881040ac195 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
@@ -1459,11 +1459,8 @@ static int mt7921_pre_channel_switch(struct ieee80211_hw *hw,
if (vif->type != NL80211_IFTYPE_STATION || !vif->cfg.assoc)
return -EOPNOTSUPP;
- /* Avoid beacon loss due to the CAC(Channel Availability Check) time
- * of the AP.
- */
if (!cfg80211_chandef_usable(hw->wiphy, &chsw->chandef,
- IEEE80211_CHAN_RADAR))
+ IEEE80211_CHAN_DISABLED))
return -EOPNOTSUPP;
return 0;
diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/mac.c b/drivers/net/wireless/mediatek/mt76/mt7925/mac.c
index 75823c9fd3a1..b581ab9427f2 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7925/mac.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7925/mac.c
@@ -1449,7 +1449,7 @@ void mt7925_usb_sdio_tx_complete_skb(struct mt76_dev *mdev,
sta = wcid_to_sta(wcid);
if (sta && likely(e->skb->protocol != cpu_to_be16(ETH_P_PAE)))
- mt76_connac2_tx_check_aggr(sta, txwi);
+ mt7925_tx_check_aggr(sta, e->skb, wcid);
skb_pull(e->skb, headroom);
mt76_tx_complete_skb(mdev, e->wcid, e->skb);
diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/main.c b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
index a8d25b7d47d0..b0e053b15227 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7925/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
@@ -1191,6 +1191,9 @@ mt7925_mac_sta_remove_links(struct mt792x_dev *dev, struct ieee80211_vif *vif,
struct mt792x_bss_conf *mconf;
struct mt792x_link_sta *mlink;
+ if (vif->type == NL80211_IFTYPE_AP)
+ break;
+
link_sta = mt792x_sta_to_link_sta(vif, sta, link_id);
if (!link_sta)
continue;
@@ -2069,8 +2072,10 @@ mt7925_change_vif_links(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
GFP_KERNEL);
mlink = devm_kzalloc(dev->mt76.dev, sizeof(*mlink),
GFP_KERNEL);
- if (!mconf || !mlink)
+ if (!mconf || !mlink) {
+ mt792x_mutex_release(dev);
return -ENOMEM;
+ }
}
mconfs[link_id] = mconf;
diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c
index 300c863f0e3e..cd457be26523 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c
@@ -1834,13 +1834,13 @@ mt7925_mcu_sta_eht_mld_tlv(struct sk_buff *skb,
struct tlv *tlv;
u16 eml_cap;
+ if (!ieee80211_vif_is_mld(vif))
+ return;
+
tlv = mt76_connac_mcu_add_tlv(skb, STA_REC_EHT_MLD, sizeof(*eht_mld));
eht_mld = (struct sta_rec_eht_mld *)tlv;
eht_mld->mld_type = 0xff;
- if (!ieee80211_vif_is_mld(vif))
- return;
-
ext_capa = cfg80211_get_iftype_ext_capa(wiphy,
ieee80211_vif_type_p2p(vif));
if (!ext_capa)
@@ -1912,6 +1912,7 @@ mt7925_mcu_sta_cmd(struct mt76_phy *phy,
struct mt76_dev *dev = phy->dev;
struct mt792x_bss_conf *mconf;
struct sk_buff *skb;
+ int conn_state;
mconf = mt792x_vif_to_link(mvif, info->wcid->link_id);
@@ -1920,10 +1921,13 @@ mt7925_mcu_sta_cmd(struct mt76_phy *phy,
if (IS_ERR(skb))
return PTR_ERR(skb);
+ conn_state = info->enable ? CONN_STATE_PORT_SECURE :
+ CONN_STATE_DISCONNECT;
+
if (info->enable && info->link_sta) {
mt76_connac_mcu_sta_basic_tlv(dev, skb, info->link_conf,
info->link_sta,
- info->enable, info->newly);
+ conn_state, info->newly);
mt7925_mcu_sta_phy_tlv(skb, info->vif, info->link_sta);
mt7925_mcu_sta_ht_tlv(skb, info->link_sta);
mt7925_mcu_sta_vht_tlv(skb, info->link_sta);
diff --git a/drivers/net/wireless/mediatek/mt76/mt7996/mac.c b/drivers/net/wireless/mediatek/mt76/mt7996/mac.c
index 226534490792..b3fcca9bbb95 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7996/mac.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7996/mac.c
@@ -62,7 +62,7 @@ static struct mt76_wcid *mt7996_rx_get_wcid(struct mt7996_dev *dev,
int i;
wcid = mt76_wcid_ptr(dev, idx);
- if (!wcid)
+ if (!wcid || !wcid->sta)
return NULL;
if (!mt7996_band_valid(dev, band_idx))
@@ -903,8 +903,12 @@ void mt7996_mac_write_txwi(struct mt7996_dev *dev, __le32 *txwi,
IEEE80211_TX_CTRL_MLO_LINK);
mvif = vif ? (struct mt7996_vif *)vif->drv_priv : NULL;
- if (mvif)
- mlink = rcu_dereference(mvif->mt76.link[link_id]);
+ if (mvif) {
+ if (wcid->offchannel)
+ mlink = rcu_dereference(mvif->mt76.offchannel_link);
+ if (!mlink)
+ mlink = rcu_dereference(mvif->mt76.link[link_id]);
+ }
if (mlink) {
omac_idx = mlink->omac_idx;
@@ -1243,8 +1247,10 @@ mt7996_mac_tx_free(struct mt7996_dev *dev, void *data, int len)
idx = FIELD_GET(MT_TXFREE_INFO_WLAN_ID, info);
wcid = mt76_wcid_ptr(dev, idx);
sta = wcid_to_sta(wcid);
- if (!sta)
+ if (!sta) {
+ link_sta = NULL;
goto next;
+ }
link_sta = rcu_dereference(sta->link[wcid->link_id]);
if (!link_sta)
@@ -1694,43 +1700,53 @@ mt7996_wait_reset_state(struct mt7996_dev *dev, u32 state)
static void
mt7996_update_vif_beacon(void *priv, u8 *mac, struct ieee80211_vif *vif)
{
- struct ieee80211_hw *hw = priv;
+ struct ieee80211_bss_conf *link_conf;
+ struct mt7996_phy *phy = priv;
+ struct mt7996_dev *dev = phy->dev;
+ unsigned int link_id;
+
switch (vif->type) {
case NL80211_IFTYPE_MESH_POINT:
case NL80211_IFTYPE_ADHOC:
case NL80211_IFTYPE_AP:
- mt7996_mcu_add_beacon(hw, vif, &vif->bss_conf);
break;
default:
- break;
+ return;
+ }
+
+ for_each_vif_active_link(vif, link_conf, link_id) {
+ struct mt7996_vif_link *link;
+
+ link = mt7996_vif_link(dev, vif, link_id);
+ if (!link || link->phy != phy)
+ continue;
+
+ mt7996_mcu_add_beacon(dev->mt76.hw, vif, link_conf);
}
}
+void mt7996_mac_update_beacons(struct mt7996_phy *phy)
+{
+ ieee80211_iterate_active_interfaces(phy->mt76->hw,
+ IEEE80211_IFACE_ITER_RESUME_ALL,
+ mt7996_update_vif_beacon, phy);
+}
+
static void
mt7996_update_beacons(struct mt7996_dev *dev)
{
struct mt76_phy *phy2, *phy3;
- ieee80211_iterate_active_interfaces(dev->mt76.hw,
- IEEE80211_IFACE_ITER_RESUME_ALL,
- mt7996_update_vif_beacon, dev->mt76.hw);
+ mt7996_mac_update_beacons(&dev->phy);
phy2 = dev->mt76.phys[MT_BAND1];
- if (!phy2)
- return;
-
- ieee80211_iterate_active_interfaces(phy2->hw,
- IEEE80211_IFACE_ITER_RESUME_ALL,
- mt7996_update_vif_beacon, phy2->hw);
+ if (phy2)
+ mt7996_mac_update_beacons(phy2->priv);
phy3 = dev->mt76.phys[MT_BAND2];
- if (!phy3)
- return;
-
- ieee80211_iterate_active_interfaces(phy3->hw,
- IEEE80211_IFACE_ITER_RESUME_ALL,
- mt7996_update_vif_beacon, phy3->hw);
+ if (phy3)
+ mt7996_mac_update_beacons(phy3->priv);
}
void mt7996_tx_token_put(struct mt7996_dev *dev)
diff --git a/drivers/net/wireless/mediatek/mt76/mt7996/main.c b/drivers/net/wireless/mediatek/mt76/mt7996/main.c
index 92b57bcce749..84f731b387d2 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7996/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7996/main.c
@@ -516,6 +516,9 @@ int mt7996_set_channel(struct mt76_phy *mphy)
struct mt7996_phy *phy = mphy->priv;
int ret;
+ if (mphy->offchannel)
+ mt7996_mac_update_beacons(phy);
+
ret = mt7996_mcu_set_chan_info(phy, UNI_CHANNEL_SWITCH);
if (ret)
goto out;
@@ -533,6 +536,8 @@ int mt7996_set_channel(struct mt76_phy *mphy)
mt7996_mac_reset_counters(phy);
phy->noise = 0;
+ if (!mphy->offchannel)
+ mt7996_mac_update_beacons(phy);
out:
ieee80211_queue_delayed_work(mphy->hw, &mphy->mac_work,
diff --git a/drivers/net/wireless/mediatek/mt76/mt7996/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7996/mcu.c
index 3593fd40c51b..0be03eb3cf46 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7996/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7996/mcu.c
@@ -1879,8 +1879,8 @@ mt7996_mcu_get_mmps_mode(enum ieee80211_smps_mode smps)
int mt7996_mcu_set_fixed_rate_ctrl(struct mt7996_dev *dev,
void *data, u16 version)
{
+ struct uni_header hdr = {};
struct ra_fixed_rate *req;
- struct uni_header hdr;
struct sk_buff *skb;
struct tlv *tlv;
int len;
@@ -2755,13 +2755,15 @@ int mt7996_mcu_add_beacon(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
struct ieee80211_bss_conf *link_conf)
{
struct mt7996_dev *dev = mt7996_hw_dev(hw);
- struct mt76_vif_link *mlink = mt76_vif_conf_link(&dev->mt76, vif, link_conf);
+ struct mt7996_vif_link *link = mt7996_vif_conf_link(dev, vif, link_conf);
+ struct mt76_vif_link *mlink = link ? &link->mt76 : NULL;
struct ieee80211_mutable_offsets offs;
struct ieee80211_tx_info *info;
struct sk_buff *skb, *rskb;
struct tlv *tlv;
struct bss_bcn_content_tlv *bcn;
int len, extra_len = 0;
+ bool enabled = link_conf->enable_beacon;
if (link_conf->nontransmitted)
return 0;
@@ -2769,13 +2771,16 @@ int mt7996_mcu_add_beacon(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
if (!mlink)
return -EINVAL;
+ if (link->phy && link->phy->mt76->offchannel)
+ enabled = false;
+
rskb = __mt7996_mcu_alloc_bss_req(&dev->mt76, mlink,
MT7996_MAX_BSS_OFFLOAD_SIZE);
if (IS_ERR(rskb))
return PTR_ERR(rskb);
skb = ieee80211_beacon_get_template(hw, vif, &offs, link_conf->link_id);
- if (link_conf->enable_beacon && !skb) {
+ if (enabled && !skb) {
dev_kfree_skb(rskb);
return -EINVAL;
}
@@ -2794,7 +2799,7 @@ int mt7996_mcu_add_beacon(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
len = ALIGN(sizeof(*bcn) + MT_TXD_SIZE + extra_len, 4);
tlv = mt7996_mcu_add_uni_tlv(rskb, UNI_BSS_INFO_BCN_CONTENT, len);
bcn = (struct bss_bcn_content_tlv *)tlv;
- bcn->enable = link_conf->enable_beacon;
+ bcn->enable = enabled;
if (!bcn->enable)
goto out;
@@ -3372,7 +3377,7 @@ int mt7996_mcu_set_hdr_trans(struct mt7996_dev *dev, bool hdr_trans)
{
struct {
u8 __rsv[4];
- } __packed hdr;
+ } __packed hdr = {};
struct hdr_trans_blacklist *req_blacklist;
struct hdr_trans_en *req_en;
struct sk_buff *skb;
diff --git a/drivers/net/wireless/mediatek/mt76/mt7996/mt7996.h b/drivers/net/wireless/mediatek/mt76/mt7996/mt7996.h
index 33ac16b64ef1..8509d508e1e1 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7996/mt7996.h
+++ b/drivers/net/wireless/mediatek/mt76/mt7996/mt7996.h
@@ -732,6 +732,7 @@ void mt7996_mac_write_txwi(struct mt7996_dev *dev, __le32 *txwi,
struct sk_buff *skb, struct mt76_wcid *wcid,
struct ieee80211_key_conf *key, int pid,
enum mt76_txq_id qid, u32 changed);
+void mt7996_mac_update_beacons(struct mt7996_phy *phy);
void mt7996_mac_set_coverage_class(struct mt7996_phy *phy);
void mt7996_mac_work(struct work_struct *work);
void mt7996_mac_reset_work(struct work_struct *work);
diff --git a/drivers/net/wireless/mediatek/mt76/tx.c b/drivers/net/wireless/mediatek/mt76/tx.c
index e6cf16706667..8ab5840fee57 100644
--- a/drivers/net/wireless/mediatek/mt76/tx.c
+++ b/drivers/net/wireless/mediatek/mt76/tx.c
@@ -332,6 +332,7 @@ mt76_tx(struct mt76_phy *phy, struct ieee80211_sta *sta,
struct mt76_wcid *wcid, struct sk_buff *skb)
{
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_hdr *hdr = (void *)skb->data;
struct sk_buff_head *head;
if (mt76_testmode_enabled(phy)) {
@@ -349,7 +350,8 @@ mt76_tx(struct mt76_phy *phy, struct ieee80211_sta *sta,
info->hw_queue |= FIELD_PREP(MT_TX_HW_QUEUE_PHY, phy->band_idx);
if ((info->flags & IEEE80211_TX_CTL_TX_OFFCHAN) ||
- (info->control.flags & IEEE80211_TX_CTRL_DONT_USE_RATE_MASK))
+ ((info->control.flags & IEEE80211_TX_CTRL_DONT_USE_RATE_MASK) &&
+ ieee80211_is_probe_req(hdr->frame_control)))
head = &wcid->tx_offchannel;
else
head = &wcid->tx_pending;
@@ -644,6 +646,7 @@ mt76_txq_schedule_pending_wcid(struct mt76_phy *phy, struct mt76_wcid *wcid,
static void mt76_txq_schedule_pending(struct mt76_phy *phy)
{
LIST_HEAD(tx_list);
+ int ret = 0;
if (list_empty(&phy->tx_list))
return;
@@ -655,13 +658,13 @@ static void mt76_txq_schedule_pending(struct mt76_phy *phy)
list_splice_init(&phy->tx_list, &tx_list);
while (!list_empty(&tx_list)) {
struct mt76_wcid *wcid;
- int ret;
wcid = list_first_entry(&tx_list, struct mt76_wcid, tx_list);
list_del_init(&wcid->tx_list);
spin_unlock(&phy->tx_lock);
- ret = mt76_txq_schedule_pending_wcid(phy, wcid, &wcid->tx_offchannel);
+ if (ret >= 0)
+ ret = mt76_txq_schedule_pending_wcid(phy, wcid, &wcid->tx_offchannel);
if (ret >= 0 && !phy->offchannel)
ret = mt76_txq_schedule_pending_wcid(phy, wcid, &wcid->tx_pending);
spin_lock(&phy->tx_lock);
@@ -670,9 +673,6 @@ static void mt76_txq_schedule_pending(struct mt76_phy *phy)
!skb_queue_empty(&wcid->tx_offchannel) &&
list_empty(&wcid->tx_list))
list_add_tail(&wcid->tx_list, &phy->tx_list);
-
- if (ret < 0)
- break;
}
spin_unlock(&phy->tx_lock);
diff --git a/drivers/net/wireless/microchip/wilc1000/wlan_cfg.c b/drivers/net/wireless/microchip/wilc1000/wlan_cfg.c
index 131388886acb..cfabd5aebb54 100644
--- a/drivers/net/wireless/microchip/wilc1000/wlan_cfg.c
+++ b/drivers/net/wireless/microchip/wilc1000/wlan_cfg.c
@@ -41,10 +41,10 @@ static const struct wilc_cfg_word g_cfg_word[] = {
};
static const struct wilc_cfg_str g_cfg_str[] = {
- {WID_FIRMWARE_VERSION, NULL},
- {WID_MAC_ADDR, NULL},
- {WID_ASSOC_RES_INFO, NULL},
- {WID_NIL, NULL}
+ {WID_FIRMWARE_VERSION, 0, NULL},
+ {WID_MAC_ADDR, 0, NULL},
+ {WID_ASSOC_RES_INFO, 0, NULL},
+ {WID_NIL, 0, NULL}
};
#define WILC_RESP_MSG_TYPE_CONFIG_REPLY 'R'
@@ -147,44 +147,58 @@ static void wilc_wlan_parse_response_frame(struct wilc *wl, u8 *info, int size)
switch (FIELD_GET(WILC_WID_TYPE, wid)) {
case WID_CHAR:
+ len = 3;
+ if (len + 2 > size)
+ return;
+
while (cfg->b[i].id != WID_NIL && cfg->b[i].id != wid)
i++;
if (cfg->b[i].id == wid)
cfg->b[i].val = info[4];
- len = 3;
break;
case WID_SHORT:
+ len = 4;
+ if (len + 2 > size)
+ return;
+
while (cfg->hw[i].id != WID_NIL && cfg->hw[i].id != wid)
i++;
if (cfg->hw[i].id == wid)
cfg->hw[i].val = get_unaligned_le16(&info[4]);
- len = 4;
break;
case WID_INT:
+ len = 6;
+ if (len + 2 > size)
+ return;
+
while (cfg->w[i].id != WID_NIL && cfg->w[i].id != wid)
i++;
if (cfg->w[i].id == wid)
cfg->w[i].val = get_unaligned_le32(&info[4]);
- len = 6;
break;
case WID_STR:
+ len = 2 + get_unaligned_le16(&info[2]);
+
while (cfg->s[i].id != WID_NIL && cfg->s[i].id != wid)
i++;
- if (cfg->s[i].id == wid)
+ if (cfg->s[i].id == wid) {
+ if (len > cfg->s[i].len || (len + 2 > size))
+ return;
+
memcpy(cfg->s[i].str, &info[2],
- get_unaligned_le16(&info[2]) + 2);
+ len);
+ }
- len = 2 + get_unaligned_le16(&info[2]);
break;
default:
@@ -384,12 +398,15 @@ int wilc_wlan_cfg_init(struct wilc *wl)
/* store the string cfg parameters */
wl->cfg.s[i].id = WID_FIRMWARE_VERSION;
wl->cfg.s[i].str = str_vals->firmware_version;
+ wl->cfg.s[i].len = sizeof(str_vals->firmware_version);
i++;
wl->cfg.s[i].id = WID_MAC_ADDR;
wl->cfg.s[i].str = str_vals->mac_address;
+ wl->cfg.s[i].len = sizeof(str_vals->mac_address);
i++;
wl->cfg.s[i].id = WID_ASSOC_RES_INFO;
wl->cfg.s[i].str = str_vals->assoc_rsp;
+ wl->cfg.s[i].len = sizeof(str_vals->assoc_rsp);
i++;
wl->cfg.s[i].id = WID_NIL;
wl->cfg.s[i].str = NULL;
diff --git a/drivers/net/wireless/microchip/wilc1000/wlan_cfg.h b/drivers/net/wireless/microchip/wilc1000/wlan_cfg.h
index 7038b74f8e8f..5ae74bced7d7 100644
--- a/drivers/net/wireless/microchip/wilc1000/wlan_cfg.h
+++ b/drivers/net/wireless/microchip/wilc1000/wlan_cfg.h
@@ -24,12 +24,13 @@ struct wilc_cfg_word {
struct wilc_cfg_str {
u16 id;
+ u16 len;
u8 *str;
};
struct wilc_cfg_str_vals {
- u8 mac_address[7];
- u8 firmware_version[129];
+ u8 mac_address[8];
+ u8 firmware_version[130];
u8 assoc_rsp[WILC_MAX_ASSOC_RESP_FRAME_SIZE];
};
diff --git a/drivers/net/wireless/ralink/rt2x00/Kconfig b/drivers/net/wireless/ralink/rt2x00/Kconfig
index 4d98b7723c56..17f063fc0b57 100644
--- a/drivers/net/wireless/ralink/rt2x00/Kconfig
+++ b/drivers/net/wireless/ralink/rt2x00/Kconfig
@@ -66,7 +66,6 @@ config RT2800PCI
select RT2X00_LIB_PCI
select RT2X00_LIB_FIRMWARE
select RT2X00_LIB_CRYPTO
- select CRC_CCITT
select EEPROM_93CX6
help
This adds support for rt27xx/rt28xx/rt30xx wireless chipset family.
@@ -142,7 +141,6 @@ config RT2800USB
select RT2X00_LIB_USB
select RT2X00_LIB_FIRMWARE
select RT2X00_LIB_CRYPTO
- select CRC_CCITT
help
This adds support for rt27xx/rt28xx/rt30xx wireless chipset family.
Supported chips: RT2770, RT2870 & RT3070, RT3071 & RT3072
@@ -217,6 +215,7 @@ config RT2800SOC
config RT2800_LIB
tristate
+ select CRC_CCITT
config RT2800_LIB_MMIO
tristate
@@ -225,6 +224,7 @@ config RT2800_LIB_MMIO
config RT2X00_LIB_MMIO
tristate
+ select RT2X00_LIB
config RT2X00_LIB_PCI
tristate
diff --git a/drivers/net/wireless/st/cw1200/sta.c b/drivers/net/wireless/st/cw1200/sta.c
index b1dd76e8aecb..5d8eaa700779 100644
--- a/drivers/net/wireless/st/cw1200/sta.c
+++ b/drivers/net/wireless/st/cw1200/sta.c
@@ -1291,7 +1291,7 @@ static void cw1200_do_join(struct cw1200_common *priv)
rcu_read_lock();
ssidie = ieee80211_bss_get_ie(bss, WLAN_EID_SSID);
if (ssidie) {
- join.ssid_len = ssidie[1];
+ join.ssid_len = min(ssidie[1], IEEE80211_MAX_SSID_LEN);
memcpy(join.ssid, &ssidie[2], join.ssid_len);
}
rcu_read_unlock();
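
The cw1200 fix above clamps the SSID length taken from a received information element before it is copied into a fixed-size buffer. A small illustrative sketch of that clamping, assuming the usual 802.11 element layout (ID, length, payload) and a 32-byte SSID limit; not the cw1200 code:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define MAX_SSID_LEN 32

static size_t copy_ssid(uint8_t dst[MAX_SSID_LEN], const uint8_t *ssid_ie)
{
	size_t len = ssid_ie[1];	/* ssid_ie[0] = element ID, [1] = length */

	if (len > MAX_SSID_LEN)		/* clamp the untrusted length */
		len = MAX_SSID_LEN;
	memcpy(dst, &ssid_ie[2], len);
	return len;
}
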
diff --git a/drivers/net/wireless/virtual/virt_wifi.c b/drivers/net/wireless/virtual/virt_wifi.c
index 1fffeff2190c..4eae89376feb 100644
--- a/drivers/net/wireless/virtual/virt_wifi.c
+++ b/drivers/net/wireless/virtual/virt_wifi.c
@@ -277,7 +277,9 @@ static void virt_wifi_connect_complete(struct work_struct *work)
priv->is_connected = true;
/* Schedules an event that acquires the rtnl lock. */
- cfg80211_connect_result(priv->upperdev, requested_bss, NULL, 0, NULL, 0,
+ cfg80211_connect_result(priv->upperdev,
+ priv->is_connected ? fake_router_bssid : NULL,
+ NULL, 0, NULL, 0,
status, GFP_KERNEL);
netif_carrier_on(priv->upperdev);
}
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 812c1565114f..6b7493934535 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -903,6 +903,15 @@ static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd,
u32 upper, lower;
u64 ref48;
+	/* only type 1 and type 2 PI formats have a reftag */
+ switch (ns->head->pi_type) {
+ case NVME_NS_DPS_PI_TYPE1:
+ case NVME_NS_DPS_PI_TYPE2:
+ break;
+ default:
+ return;
+ }
+
/* both rw and write zeroes share the same reftag format */
switch (ns->head->guard_type) {
case NVME_NVM_NS_16B_GUARD:
@@ -942,13 +951,7 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
if (nvme_ns_has_pi(ns->head)) {
cmnd->write_zeroes.control |= cpu_to_le16(NVME_RW_PRINFO_PRACT);
-
- switch (ns->head->pi_type) {
- case NVME_NS_DPS_PI_TYPE1:
- case NVME_NS_DPS_PI_TYPE2:
- nvme_set_ref_tag(ns, cmnd, req);
- break;
- }
+ nvme_set_ref_tag(ns, cmnd, req);
}
return BLK_STS_OK;
@@ -1039,6 +1042,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
if (WARN_ON_ONCE(!nvme_ns_has_pi(ns->head)))
return BLK_STS_NOTSUPP;
control |= NVME_RW_PRINFO_PRACT;
+ nvme_set_ref_tag(ns, cmnd, req);
}
if (bio_integrity_flagged(req->bio, BIP_CHECK_GUARD))
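
The NVMe change above moves the PI-type check into nvme_set_ref_tag() itself, so the helper becomes a no-op for formats without a reference tag and callers no longer duplicate the switch. A schematic sketch of that refactor with illustrative types, not the NVMe structures:

enum pi_type { PI_NONE, PI_TYPE1, PI_TYPE2, PI_TYPE3 };

static void set_ref_tag(enum pi_type type, unsigned long long *reftag,
			unsigned long long value)
{
	switch (type) {
	case PI_TYPE1:
	case PI_TYPE2:
		*reftag = value;	/* only these formats carry a reference tag */
		break;
	default:
		return;			/* silently do nothing for other types */
	}
}
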
diff --git a/drivers/of/device.c b/drivers/of/device.c
index c80426510ec2..f7e75e527667 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -17,8 +17,8 @@
/**
* of_match_device - Tell if a struct device matches an of_device_id list
- * @matches: array of of device match structures to search in
- * @dev: the of device structure to match against
+ * @matches: array of of_device_id match structures to search in
+ * @dev: the OF device structure to match against
*
 * Used by a driver to check whether a platform_device present in the
* system is in its list of supported devices.
diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
index 0aba760f7577..2eaaddcb0ec4 100644
--- a/drivers/of/dynamic.c
+++ b/drivers/of/dynamic.c
@@ -935,10 +935,15 @@ static int of_changeset_add_prop_helper(struct of_changeset *ocs,
return -ENOMEM;
ret = of_changeset_add_property(ocs, np, new_pp);
- if (ret)
+ if (ret) {
__of_prop_free(new_pp);
+ return ret;
+ }
- return ret;
+ new_pp->next = np->deadprops;
+ np->deadprops = new_pp;
+
+ return 0;
}
/**
diff --git a/drivers/of/of_numa.c b/drivers/of/of_numa.c
index 230d5f628c1b..cd2dc8e825c9 100644
--- a/drivers/of/of_numa.c
+++ b/drivers/of/of_numa.c
@@ -59,8 +59,11 @@ static int __init of_numa_parse_memory_nodes(void)
r = -EINVAL;
}
- for (i = 0; !r && !of_address_to_resource(np, i, &rsrc); i++)
+ for (i = 0; !r && !of_address_to_resource(np, i, &rsrc); i++) {
r = numa_add_memblk(nid, rsrc.start, rsrc.end + 1);
+ if (!r)
+ node_set(nid, numa_nodes_parsed);
+ }
if (!i || r) {
of_node_put(np);
diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c
index 77016c0cc296..2e9ea751ed2d 100644
--- a/drivers/of/of_reserved_mem.c
+++ b/drivers/of/of_reserved_mem.c
@@ -25,6 +25,7 @@
#include <linux/memblock.h>
#include <linux/kmemleak.h>
#include <linux/cma.h>
+#include <linux/dma-map-ops.h>
#include "of_private.h"
@@ -175,13 +176,17 @@ static int __init __reserved_mem_reserve_reg(unsigned long node,
base = dt_mem_next_cell(dt_root_addr_cells, &prop);
size = dt_mem_next_cell(dt_root_size_cells, &prop);
- if (size &&
- early_init_dt_reserve_memory(base, size, nomap) == 0)
+ if (size && early_init_dt_reserve_memory(base, size, nomap) == 0) {
+ /* Architecture specific contiguous memory fixup. */
+ if (of_flat_dt_is_compatible(node, "shared-dma-pool") &&
+ of_get_flat_dt_prop(node, "reusable", NULL))
+ dma_contiguous_early_fixup(base, size);
pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n",
uname, &base, (unsigned long)(size / SZ_1M));
- else
+ } else {
pr_err("Reserved memory: failed to reserve memory for node '%s': base %pa, size %lu MiB\n",
uname, &base, (unsigned long)(size / SZ_1M));
+ }
len -= t_len;
}
@@ -472,7 +477,10 @@ static int __init __reserved_mem_alloc_size(unsigned long node, const char *unam
uname, (unsigned long)(size / SZ_1M));
return -ENOMEM;
}
-
+ /* Architecture specific contiguous memory fixup. */
+ if (of_flat_dt_is_compatible(node, "shared-dma-pool") &&
+ of_get_flat_dt_prop(node, "reusable", NULL))
+ dma_contiguous_early_fixup(base, size);
/* Save region in the reserved_mem array */
fdt_reserved_mem_save_node(node, uname, base, size);
return 0;
@@ -771,6 +779,7 @@ int of_reserved_mem_region_to_resource(const struct device_node *np,
return -EINVAL;
resource_set_range(res, rmem->base, rmem->size);
+ res->flags = IORESOURCE_MEM;
res->name = rmem->name;
return 0;
}
diff --git a/drivers/pci/controller/pci-mvebu.c b/drivers/pci/controller/pci-mvebu.c
index 755651f33811..a72aa57591c0 100644
--- a/drivers/pci/controller/pci-mvebu.c
+++ b/drivers/pci/controller/pci-mvebu.c
@@ -1168,12 +1168,6 @@ static void __iomem *mvebu_pcie_map_registers(struct platform_device *pdev,
return devm_ioremap_resource(&pdev->dev, &port->regs);
}
-#define DT_FLAGS_TO_TYPE(flags) (((flags) >> 24) & 0x03)
-#define DT_TYPE_IO 0x1
-#define DT_TYPE_MEM32 0x2
-#define DT_CPUADDR_TO_TARGET(cpuaddr) (((cpuaddr) >> 56) & 0xFF)
-#define DT_CPUADDR_TO_ATTR(cpuaddr) (((cpuaddr) >> 48) & 0xFF)
-
static int mvebu_get_tgt_attr(struct device_node *np, int devfn,
unsigned long type,
unsigned int *tgt,
@@ -1189,19 +1183,12 @@ static int mvebu_get_tgt_attr(struct device_node *np, int devfn,
return -EINVAL;
for_each_of_range(&parser, &range) {
- unsigned long rtype;
u32 slot = upper_32_bits(range.bus_addr);
- if (DT_FLAGS_TO_TYPE(range.flags) == DT_TYPE_IO)
- rtype = IORESOURCE_IO;
- else if (DT_FLAGS_TO_TYPE(range.flags) == DT_TYPE_MEM32)
- rtype = IORESOURCE_MEM;
- else
- continue;
-
- if (slot == PCI_SLOT(devfn) && type == rtype) {
- *tgt = DT_CPUADDR_TO_TARGET(range.cpu_addr);
- *attr = DT_CPUADDR_TO_ATTR(range.cpu_addr);
+ if (slot == PCI_SLOT(devfn) &&
+ type == (range.flags & IORESOURCE_TYPE_BITS)) {
+ *tgt = (range.parent_bus_addr >> 56) & 0xFF;
+ *attr = (range.parent_bus_addr >> 48) & 0xFF;
return 0;
}
}
diff --git a/drivers/pci/controller/pcie-xilinx.c b/drivers/pci/controller/pcie-xilinx.c
index f121836c3cf4..937ea6ae1ac4 100644
--- a/drivers/pci/controller/pcie-xilinx.c
+++ b/drivers/pci/controller/pcie-xilinx.c
@@ -400,7 +400,7 @@ static irqreturn_t xilinx_pcie_intr_handler(int irq, void *data)
if (val & XILINX_PCIE_RPIFR1_MSI_INTR) {
val = pcie_read(pcie, XILINX_PCIE_REG_RPIFR2) &
XILINX_PCIE_RPIFR2_MSG_DATA;
- domain = pcie->msi_domain->parent;
+ domain = pcie->msi_domain;
} else {
val = (val & XILINX_PCIE_RPIFR1_INTR_MASK) >>
XILINX_PCIE_RPIFR1_INTR_SHIFT;
diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
index b679c7f28f51..1bd5bf4a6097 100644
--- a/drivers/pci/controller/vmd.c
+++ b/drivers/pci/controller/vmd.c
@@ -306,9 +306,6 @@ static bool vmd_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
struct irq_domain *real_parent,
struct msi_domain_info *info)
{
- if (WARN_ON_ONCE(info->bus_token != DOMAIN_BUS_PCI_DEVICE_MSIX))
- return false;
-
if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info))
return false;
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index b0f4d98036cd..005b92e6585e 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -5932,6 +5932,7 @@ int pcie_set_readrq(struct pci_dev *dev, int rq)
{
u16 v;
int ret;
+ unsigned int firstbit;
struct pci_host_bridge *bridge = pci_find_host_bridge(dev->bus);
if (rq < 128 || rq > 4096 || !is_power_of_2(rq))
@@ -5949,7 +5950,10 @@ int pcie_set_readrq(struct pci_dev *dev, int rq)
rq = mps;
}
- v = FIELD_PREP(PCI_EXP_DEVCTL_READRQ, ffs(rq) - 8);
+ firstbit = ffs(rq);
+ if (firstbit < 8)
+ return -EINVAL;
+ v = FIELD_PREP(PCI_EXP_DEVCTL_READRQ, firstbit - 8);
if (bridge->no_inc_mrrs) {
int max_mrrs = pcie_get_readrq(dev);
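
For context on the pcie_set_readrq() guard above: the DEVCTL READRQ field encodes the maximum read request size as log2(size) - 7, so 128 bytes maps to 0 and 4096 bytes to 5, and ffs(rq) - 8 yields the field value for a power-of-two rq. A small stand-alone sketch of that encoding, including the new lower-bound check:

#include <stdio.h>
#include <strings.h>		/* ffs() */

static int readrq_field(int rq)
{
	int firstbit;

	if (rq < 128 || rq > 4096 || (rq & (rq - 1)))
		return -1;	/* must be a power of two in [128, 4096] */
	firstbit = ffs(rq);
	if (firstbit < 8)	/* defensive lower bound, as in the patch */
		return -1;
	return firstbit - 8;	/* 128 -> 0, 256 -> 1, ..., 4096 -> 5 */
}

int main(void)
{
	printf("%d %d\n", readrq_field(128), readrq_field(4096));	/* 0 5 */
	return 0;
}
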
diff --git a/drivers/pcmcia/Kconfig b/drivers/pcmcia/Kconfig
index dddb235dd020..660a95805524 100644
--- a/drivers/pcmcia/Kconfig
+++ b/drivers/pcmcia/Kconfig
@@ -250,7 +250,4 @@ config ELECTRA_CF
config PCCARD_NONSTATIC
bool
-config PCCARD_IODYN
- bool
-
endif # PCCARD
diff --git a/drivers/pcmcia/Makefile b/drivers/pcmcia/Makefile
index c9d51b150682..d16a0317ce43 100644
--- a/drivers/pcmcia/Makefile
+++ b/drivers/pcmcia/Makefile
@@ -12,7 +12,6 @@ obj-$(CONFIG_PCMCIA) += pcmcia.o
pcmcia_rsrc-y += rsrc_mgr.o
pcmcia_rsrc-$(CONFIG_PCCARD_NONSTATIC) += rsrc_nonstatic.o
-pcmcia_rsrc-$(CONFIG_PCCARD_IODYN) += rsrc_iodyn.o
obj-$(CONFIG_PCCARD) += pcmcia_rsrc.o
diff --git a/drivers/pcmcia/cs.c b/drivers/pcmcia/cs.c
index c75f55e1250a..adbc486af2ea 100644
--- a/drivers/pcmcia/cs.c
+++ b/drivers/pcmcia/cs.c
@@ -229,23 +229,6 @@ void pcmcia_unregister_socket(struct pcmcia_socket *socket)
EXPORT_SYMBOL(pcmcia_unregister_socket);
-struct pcmcia_socket *pcmcia_get_socket_by_nr(unsigned int nr)
-{
- struct pcmcia_socket *s;
-
- down_read(&pcmcia_socket_list_rwsem);
- list_for_each_entry(s, &pcmcia_socket_list, socket_list)
- if (s->sock == nr) {
- up_read(&pcmcia_socket_list_rwsem);
- return s;
- }
- up_read(&pcmcia_socket_list_rwsem);
-
- return NULL;
-
-}
-EXPORT_SYMBOL(pcmcia_get_socket_by_nr);
-
static int socket_reset(struct pcmcia_socket *skt)
{
int status, i;
diff --git a/drivers/pcmcia/cs_internal.h b/drivers/pcmcia/cs_internal.h
index 02a83ca44e77..5ac810ffda31 100644
--- a/drivers/pcmcia/cs_internal.h
+++ b/drivers/pcmcia/cs_internal.h
@@ -116,7 +116,6 @@ extern struct list_head pcmcia_socket_list;
extern const struct class pcmcia_socket_class;
int pccard_register_pcmcia(struct pcmcia_socket *s, struct pcmcia_callback *c);
-struct pcmcia_socket *pcmcia_get_socket_by_nr(unsigned int nr);
void pcmcia_parse_uevents(struct pcmcia_socket *socket, unsigned int events);
#define PCMCIA_UEVENT_EJECT 0x0001
diff --git a/drivers/pcmcia/ds.c b/drivers/pcmcia/ds.c
index da6f66f357cc..18f4eef28dbc 100644
--- a/drivers/pcmcia/ds.c
+++ b/drivers/pcmcia/ds.c
@@ -1308,7 +1308,7 @@ static int pcmcia_bus_early_resume(struct pcmcia_socket *skt)
* physically present, even if the call to this function returns
* non-NULL. Furthermore, the device driver most likely is unbound
* almost immediately, so the timeframe where pcmcia_dev_present
- * returns NULL is probably really really small.
+ * returns NULL is probably really, really small.
*/
struct pcmcia_device *pcmcia_dev_present(struct pcmcia_device *_p_dev)
{
diff --git a/drivers/pcmcia/omap_cf.c b/drivers/pcmcia/omap_cf.c
index 1b1dff56ec7b..d6f24c7d1562 100644
--- a/drivers/pcmcia/omap_cf.c
+++ b/drivers/pcmcia/omap_cf.c
@@ -215,6 +215,8 @@ static int __init omap_cf_probe(struct platform_device *pdev)
return -EINVAL;
res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ if (!res)
+ return -EINVAL;
cf = kzalloc(sizeof *cf, GFP_KERNEL);
if (!cf)
@@ -302,7 +304,13 @@ static void __exit omap_cf_remove(struct platform_device *pdev)
kfree(cf);
}
-static struct platform_driver omap_cf_driver = {
+/*
+ * omap_cf_remove() lives in .exit.text. For drivers registered via
+ * platform_driver_probe() this is ok because they cannot get unbound at
+ * runtime. So mark the driver struct with __refdata to prevent modpost
+ * triggering a section mismatch warning.
+ */
+static struct platform_driver omap_cf_driver __refdata = {
.driver = {
.name = driver_name,
},
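
The omap_cf comment above describes a common pattern: drivers registered with platform_driver_probe() can never be unbound at runtime, so the remove callback may stay in .exit.text, and marking the driver struct __refdata silences the otherwise-correct section-mismatch warning from modpost. A hedged sketch of that pattern for a hypothetical driver, not the omap_cf code:

#include <linux/module.h>
#include <linux/platform_device.h>

static int __init foo_probe(struct platform_device *pdev)
{
	return 0;	/* one-shot probe; the device can never be unbound */
}

static void __exit foo_remove(struct platform_device *pdev)
{
}

static struct platform_driver foo_driver __refdata = {
	.remove = __exit_p(foo_remove),
	.driver = {
		.name = "foo",
	},
};
module_platform_driver_probe(foo_driver, foo_probe);

MODULE_DESCRIPTION("Illustration of __refdata with platform_driver_probe()");
MODULE_LICENSE("GPL");
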
diff --git a/drivers/pcmcia/rsrc_iodyn.c b/drivers/pcmcia/rsrc_iodyn.c
deleted file mode 100644
index b04b16496b0c..000000000000
--- a/drivers/pcmcia/rsrc_iodyn.c
+++ /dev/null
@@ -1,168 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * rsrc_iodyn.c -- Resource management routines for MEM-static sockets.
- *
- * The initial developer of the original code is David A. Hinds
- * <dahinds@users.sourceforge.net>. Portions created by David A. Hinds
- * are Copyright (C) 1999 David A. Hinds. All Rights Reserved.
- *
- * (C) 1999 David A. Hinds
- */
-
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include <pcmcia/ss.h>
-#include <pcmcia/cistpl.h>
-#include "cs_internal.h"
-
-
-struct pcmcia_align_data {
- unsigned long mask;
- unsigned long offset;
-};
-
-static resource_size_t pcmcia_align(void *align_data,
- const struct resource *res,
- resource_size_t size, resource_size_t align)
-{
- struct pcmcia_align_data *data = align_data;
- resource_size_t start;
-
- start = (res->start & ~data->mask) + data->offset;
- if (start < res->start)
- start += data->mask + 1;
-
-#ifdef CONFIG_X86
- if (res->flags & IORESOURCE_IO) {
- if (start & 0x300)
- start = (start + 0x3ff) & ~0x3ff;
- }
-#endif
-
-#ifdef CONFIG_M68K
- if (res->flags & IORESOURCE_IO) {
- if ((res->start + size - 1) >= 1024)
- start = res->end;
- }
-#endif
-
- return start;
-}
-
-
-static struct resource *__iodyn_find_io_region(struct pcmcia_socket *s,
- unsigned long base, int num,
- unsigned long align)
-{
- struct resource *res = pcmcia_make_resource(0, num, IORESOURCE_IO,
- dev_name(&s->dev));
- struct pcmcia_align_data data;
- unsigned long min = base;
- int ret;
-
- data.mask = align - 1;
- data.offset = base & data.mask;
-
-#ifdef CONFIG_PCI
- if (s->cb_dev) {
- ret = pci_bus_alloc_resource(s->cb_dev->bus, res, num, 1,
- min, 0, pcmcia_align, &data);
- } else
-#endif
- ret = allocate_resource(&ioport_resource, res, num, min, ~0UL,
- 1, pcmcia_align, &data);
-
- if (ret != 0) {
- kfree(res);
- res = NULL;
- }
- return res;
-}
-
-static int iodyn_find_io(struct pcmcia_socket *s, unsigned int attr,
- unsigned int *base, unsigned int num,
- unsigned int align, struct resource **parent)
-{
- int i, ret = 0;
-
- /* Check for an already-allocated window that must conflict with
- * what was asked for. It is a hack because it does not catch all
- * potential conflicts, just the most obvious ones.
- */
- for (i = 0; i < MAX_IO_WIN; i++) {
- if (!s->io[i].res)
- continue;
-
- if (!*base)
- continue;
-
- if ((s->io[i].res->start & (align-1)) == *base)
- return -EBUSY;
- }
-
- for (i = 0; i < MAX_IO_WIN; i++) {
- struct resource *res = s->io[i].res;
- unsigned int try;
-
- if (res && (res->flags & IORESOURCE_BITS) !=
- (attr & IORESOURCE_BITS))
- continue;
-
- if (!res) {
- if (align == 0)
- align = 0x10000;
-
- res = s->io[i].res = __iodyn_find_io_region(s, *base,
- num, align);
- if (!res)
- return -EINVAL;
-
- *base = res->start;
- s->io[i].res->flags =
- ((res->flags & ~IORESOURCE_BITS) |
- (attr & IORESOURCE_BITS));
- s->io[i].InUse = num;
- *parent = res;
- return 0;
- }
-
- /* Try to extend top of window */
- try = res->end + 1;
- if ((*base == 0) || (*base == try)) {
- if (adjust_resource(s->io[i].res, res->start,
- resource_size(res) + num))
- continue;
- *base = try;
- s->io[i].InUse += num;
- *parent = res;
- return 0;
- }
-
- /* Try to extend bottom of window */
- try = res->start - num;
- if ((*base == 0) || (*base == try)) {
- if (adjust_resource(s->io[i].res,
- res->start - num,
- resource_size(res) + num))
- continue;
- *base = try;
- s->io[i].InUse += num;
- *parent = res;
- return 0;
- }
- }
-
- return -EINVAL;
-}
-
-
-struct pccard_resource_ops pccard_iodyn_ops = {
- .validate_mem = NULL,
- .find_io = iodyn_find_io,
- .find_mem = NULL,
- .init = static_init,
- .exit = NULL,
-};
-EXPORT_SYMBOL(pccard_iodyn_ops);
diff --git a/drivers/pcmcia/rsrc_nonstatic.c b/drivers/pcmcia/rsrc_nonstatic.c
index bf9d070a4496..da494fe451ba 100644
--- a/drivers/pcmcia/rsrc_nonstatic.c
+++ b/drivers/pcmcia/rsrc_nonstatic.c
@@ -375,7 +375,9 @@ static int do_validate_mem(struct pcmcia_socket *s,
if (validate && !s->fake_cis) {
/* move it to the validated data set */
- add_interval(&s_data->mem_db_valid, base, size);
+ ret = add_interval(&s_data->mem_db_valid, base, size);
+ if (ret)
+ return ret;
sub_interval(&s_data->mem_db, base, size);
}
diff --git a/drivers/pcmcia/socket_sysfs.c b/drivers/pcmcia/socket_sysfs.c
index c7a906664c36..4eadd0485066 100644
--- a/drivers/pcmcia/socket_sysfs.c
+++ b/drivers/pcmcia/socket_sysfs.c
@@ -10,6 +10,7 @@
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/string.h>
+#include <linux/string_choices.h>
#include <linux/major.h>
#include <linux/errno.h>
#include <linux/mm.h>
@@ -98,7 +99,7 @@ static ssize_t pccard_show_card_pm_state(struct device *dev,
char *buf)
{
struct pcmcia_socket *s = to_socket(dev);
- return sysfs_emit(buf, "%s\n", s->state & SOCKET_SUSPEND ? "off" : "on");
+ return sysfs_emit(buf, "%s\n", str_off_on(s->state & SOCKET_SUSPEND));
}
static ssize_t pccard_store_card_pm_state(struct device *dev,
@@ -177,7 +178,7 @@ static ssize_t pccard_show_resource(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct pcmcia_socket *s = to_socket(dev);
- return sysfs_emit(buf, "%s\n", s->resource_setup_done ? "yes" : "no");
+ return sysfs_emit(buf, "%s\n", str_yes_no(s->resource_setup_done));
}
static ssize_t pccard_store_resource(struct device *dev,
diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index a9188dec36fe..638321fc9800 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -178,6 +178,15 @@ config FSL_IMX9_DDR_PMU
can give information about memory throughput and other related
events.
+config FUJITSU_UNCORE_PMU
+ tristate "Fujitsu Uncore PMU"
+ depends on (ARM64 && ACPI) || (COMPILE_TEST && 64BIT)
+ help
+ Provides support for the Uncore performance monitor unit (PMU)
+ in Fujitsu processors.
+ Adds the Uncore PMU into the perf events subsystem for
+ monitoring Uncore events.
+
config QCOM_L2_PMU
bool "Qualcomm Technologies L2-cache PMU"
depends on ARCH_QCOM && ARM64 && ACPI
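
The new FUJITSU_UNCORE_PMU entry above exposes uncore counters through the perf events subsystem; as with other uncore PMUs, events are opened system-wide on a CPU rather than per task. A hedged user-space sketch of counting one such event with perf_event_open(); the PMU type id (42) and event number (0x10) are placeholders, and the real type id would be read from /sys/bus/event_source/devices/<pmu>/type:

#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count = 0;
	long fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = 42;			/* placeholder: dynamic PMU type id */
	attr.config = 0x10;		/* placeholder: "event" field (config:0-7) */

	/* uncore events are not per-task: pid = -1, explicit cpu, no group */
	fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	sleep(1);
	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("count=%llu\n", (unsigned long long)count);
	close(fd);
	return 0;
}
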
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index 192fc8b16204..ea52711a87e3 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -13,6 +13,7 @@ obj-$(CONFIG_ARM_XSCALE_PMU) += arm_xscale_pmu.o
obj-$(CONFIG_ARM_SMMU_V3_PMU) += arm_smmuv3_pmu.o
obj-$(CONFIG_FSL_IMX8_DDR_PMU) += fsl_imx8_ddr_perf.o
obj-$(CONFIG_FSL_IMX9_DDR_PMU) += fsl_imx9_ddr_perf.o
+obj-$(CONFIG_FUJITSU_UNCORE_PMU) += fujitsu_uncore_pmu.o
obj-$(CONFIG_HISI_PMU) += hisilicon/
obj-$(CONFIG_QCOM_L2_PMU) += qcom_l2_pmu.o
obj-$(CONFIG_QCOM_L3_PMU) += qcom_l3_pmu.o
diff --git a/drivers/perf/arm-ccn.c b/drivers/perf/arm-ccn.c
index 1a0d0e1a2263..8af3563fdf60 100644
--- a/drivers/perf/arm-ccn.c
+++ b/drivers/perf/arm-ccn.c
@@ -565,7 +565,7 @@ module_param_named(pmu_poll_period_us, arm_ccn_pmu_poll_period_us, uint,
static ktime_t arm_ccn_pmu_timer_period(void)
{
- return ns_to_ktime((u64)arm_ccn_pmu_poll_period_us * 1000);
+ return us_to_ktime((u64)arm_ccn_pmu_poll_period_us);
}
diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index 11fb2234b10f..23245352a3fc 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -65,7 +65,7 @@
/* PMU registers occupy the 3rd 4KB page of each node's region */
#define CMN_PMU_OFFSET 0x2000
/* ...except when they don't :( */
-#define CMN_S3_DTM_OFFSET 0xa000
+#define CMN_S3_R1_DTM_OFFSET 0xa000
#define CMN_S3_PMU_OFFSET 0xd900
/* For most nodes, this is all there is */
@@ -233,6 +233,9 @@ enum cmn_revision {
REV_CMN700_R1P0,
REV_CMN700_R2P0,
REV_CMN700_R3P0,
+ REV_CMNS3_R0P0 = 0,
+ REV_CMNS3_R0P1,
+ REV_CMNS3_R1P0,
REV_CI700_R0P0 = 0,
REV_CI700_R1P0,
REV_CI700_R2P0,
@@ -425,8 +428,8 @@ static enum cmn_model arm_cmn_model(const struct arm_cmn *cmn)
static int arm_cmn_pmu_offset(const struct arm_cmn *cmn, const struct arm_cmn_node *dn)
{
if (cmn->part == PART_CMN_S3) {
- if (dn->type == CMN_TYPE_XP)
- return CMN_S3_DTM_OFFSET;
+ if (cmn->rev >= REV_CMNS3_R1P0 && dn->type == CMN_TYPE_XP)
+ return CMN_S3_R1_DTM_OFFSET;
return CMN_S3_PMU_OFFSET;
}
return CMN_PMU_OFFSET;
diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index f6d7bab5d555..69c5cc8f5606 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -978,6 +978,32 @@ static int armv8pmu_get_chain_idx(struct pmu_hw_events *cpuc,
return -EAGAIN;
}
+static bool armv8pmu_can_use_pmccntr(struct pmu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ unsigned long evtype = hwc->config_base & ARMV8_PMU_EVTYPE_EVENT;
+
+ if (evtype != ARMV8_PMUV3_PERFCTR_CPU_CYCLES)
+ return false;
+
+ /*
+ * A CPU_CYCLES event with threshold counting cannot use PMCCNTR_EL0
+ * since it lacks threshold support.
+ */
+ if (armv8pmu_event_get_threshold(&event->attr))
+ return false;
+
+ /*
+ * PMCCNTR_EL0 is not affected by BRBE controls like BRBCR_ELx.FZP.
+ * So don't use it for branch events.
+ */
+ if (has_branch_stack(event))
+ return false;
+
+ return true;
+}
+
static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc,
struct perf_event *event)
{
@@ -986,8 +1012,7 @@ static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc,
unsigned long evtype = hwc->config_base & ARMV8_PMU_EVTYPE_EVENT;
/* Always prefer to place a cycle counter into the cycle counter. */
- if ((evtype == ARMV8_PMUV3_PERFCTR_CPU_CYCLES) &&
- !armv8pmu_event_get_threshold(&event->attr) && !has_branch_stack(event)) {
+ if (armv8pmu_can_use_pmccntr(cpuc, event)) {
if (!test_and_set_bit(ARMV8_PMU_CYCLE_IDX, cpuc->used_mask))
return ARMV8_PMU_CYCLE_IDX;
else if (armv8pmu_event_is_64bit(event) &&
diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c
index 369e77ad5f13..fa50645fedda 100644
--- a/drivers/perf/arm_spe_pmu.c
+++ b/drivers/perf/arm_spe_pmu.c
@@ -86,9 +86,11 @@ struct arm_spe_pmu {
#define SPE_PMU_FEAT_ERND (1UL << 5)
#define SPE_PMU_FEAT_INV_FILT_EVT (1UL << 6)
#define SPE_PMU_FEAT_DISCARD (1UL << 7)
+#define SPE_PMU_FEAT_EFT (1UL << 8)
#define SPE_PMU_FEAT_DEV_PROBED (1UL << 63)
u64 features;
+ u64 pmsevfr_res0;
u16 max_record_sz;
u16 align;
struct perf_output_handle __percpu *handle;
@@ -97,7 +99,8 @@ struct arm_spe_pmu {
#define to_spe_pmu(p) (container_of(p, struct arm_spe_pmu, pmu))
/* Convert a free-running index from perf into an SPE buffer offset */
-#define PERF_IDX2OFF(idx, buf) ((idx) % ((buf)->nr_pages << PAGE_SHIFT))
+#define PERF_IDX2OFF(idx, buf) \
+ ((idx) % ((unsigned long)(buf)->nr_pages << PAGE_SHIFT))
/* Keep track of our dynamic hotplug state */
static enum cpuhp_state arm_spe_pmu_online;
@@ -115,6 +118,7 @@ enum arm_spe_pmu_capabilities {
SPE_PMU_CAP_FEAT_MAX,
SPE_PMU_CAP_CNT_SZ = SPE_PMU_CAP_FEAT_MAX,
SPE_PMU_CAP_MIN_IVAL,
+ SPE_PMU_CAP_EVENT_FILTER,
};
static int arm_spe_pmu_feat_caps[SPE_PMU_CAP_FEAT_MAX] = {
@@ -122,7 +126,7 @@ static int arm_spe_pmu_feat_caps[SPE_PMU_CAP_FEAT_MAX] = {
[SPE_PMU_CAP_ERND] = SPE_PMU_FEAT_ERND,
};
-static u32 arm_spe_pmu_cap_get(struct arm_spe_pmu *spe_pmu, int cap)
+static u64 arm_spe_pmu_cap_get(struct arm_spe_pmu *spe_pmu, int cap)
{
if (cap < SPE_PMU_CAP_FEAT_MAX)
return !!(spe_pmu->features & arm_spe_pmu_feat_caps[cap]);
@@ -132,6 +136,8 @@ static u32 arm_spe_pmu_cap_get(struct arm_spe_pmu *spe_pmu, int cap)
return spe_pmu->counter_sz;
case SPE_PMU_CAP_MIN_IVAL:
return spe_pmu->min_period;
+ case SPE_PMU_CAP_EVENT_FILTER:
+ return ~spe_pmu->pmsevfr_res0;
default:
WARN(1, "unknown cap %d\n", cap);
}
@@ -148,7 +154,19 @@ static ssize_t arm_spe_pmu_cap_show(struct device *dev,
container_of(attr, struct dev_ext_attribute, attr);
int cap = (long)ea->var;
- return sysfs_emit(buf, "%u\n", arm_spe_pmu_cap_get(spe_pmu, cap));
+ return sysfs_emit(buf, "%llu\n", arm_spe_pmu_cap_get(spe_pmu, cap));
+}
+
+static ssize_t arm_spe_pmu_cap_show_hex(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct arm_spe_pmu *spe_pmu = dev_get_drvdata(dev);
+ struct dev_ext_attribute *ea =
+ container_of(attr, struct dev_ext_attribute, attr);
+ int cap = (long)ea->var;
+
+ return sysfs_emit(buf, "0x%llx\n", arm_spe_pmu_cap_get(spe_pmu, cap));
}
#define SPE_EXT_ATTR_ENTRY(_name, _func, _var) \
@@ -158,12 +176,15 @@ static ssize_t arm_spe_pmu_cap_show(struct device *dev,
#define SPE_CAP_EXT_ATTR_ENTRY(_name, _var) \
SPE_EXT_ATTR_ENTRY(_name, arm_spe_pmu_cap_show, _var)
+#define SPE_CAP_EXT_ATTR_ENTRY_HEX(_name, _var) \
+ SPE_EXT_ATTR_ENTRY(_name, arm_spe_pmu_cap_show_hex, _var)
static struct attribute *arm_spe_pmu_cap_attr[] = {
SPE_CAP_EXT_ATTR_ENTRY(arch_inst, SPE_PMU_CAP_ARCH_INST),
SPE_CAP_EXT_ATTR_ENTRY(ernd, SPE_PMU_CAP_ERND),
SPE_CAP_EXT_ATTR_ENTRY(count_size, SPE_PMU_CAP_CNT_SZ),
SPE_CAP_EXT_ATTR_ENTRY(min_interval, SPE_PMU_CAP_MIN_IVAL),
+ SPE_CAP_EXT_ATTR_ENTRY_HEX(event_filter, SPE_PMU_CAP_EVENT_FILTER),
NULL,
};
@@ -197,6 +218,27 @@ static const struct attribute_group arm_spe_pmu_cap_group = {
#define ATTR_CFG_FLD_discard_CFG config /* PMBLIMITR_EL1.FM = DISCARD */
#define ATTR_CFG_FLD_discard_LO 35
#define ATTR_CFG_FLD_discard_HI 35
+#define ATTR_CFG_FLD_branch_filter_mask_CFG config /* PMSFCR_EL1.Bm */
+#define ATTR_CFG_FLD_branch_filter_mask_LO 36
+#define ATTR_CFG_FLD_branch_filter_mask_HI 36
+#define ATTR_CFG_FLD_load_filter_mask_CFG config /* PMSFCR_EL1.LDm */
+#define ATTR_CFG_FLD_load_filter_mask_LO 37
+#define ATTR_CFG_FLD_load_filter_mask_HI 37
+#define ATTR_CFG_FLD_store_filter_mask_CFG config /* PMSFCR_EL1.STm */
+#define ATTR_CFG_FLD_store_filter_mask_LO 38
+#define ATTR_CFG_FLD_store_filter_mask_HI 38
+#define ATTR_CFG_FLD_simd_filter_CFG config /* PMSFCR_EL1.SIMD */
+#define ATTR_CFG_FLD_simd_filter_LO 39
+#define ATTR_CFG_FLD_simd_filter_HI 39
+#define ATTR_CFG_FLD_simd_filter_mask_CFG config /* PMSFCR_EL1.SIMDm */
+#define ATTR_CFG_FLD_simd_filter_mask_LO 40
+#define ATTR_CFG_FLD_simd_filter_mask_HI 40
+#define ATTR_CFG_FLD_float_filter_CFG config /* PMSFCR_EL1.FP */
+#define ATTR_CFG_FLD_float_filter_LO 41
+#define ATTR_CFG_FLD_float_filter_HI 41
+#define ATTR_CFG_FLD_float_filter_mask_CFG config /* PMSFCR_EL1.FPm */
+#define ATTR_CFG_FLD_float_filter_mask_LO 42
+#define ATTR_CFG_FLD_float_filter_mask_HI 42
#define ATTR_CFG_FLD_event_filter_CFG config1 /* PMSEVFR_EL1 */
#define ATTR_CFG_FLD_event_filter_LO 0
@@ -215,8 +257,15 @@ GEN_PMU_FORMAT_ATTR(pa_enable);
GEN_PMU_FORMAT_ATTR(pct_enable);
GEN_PMU_FORMAT_ATTR(jitter);
GEN_PMU_FORMAT_ATTR(branch_filter);
+GEN_PMU_FORMAT_ATTR(branch_filter_mask);
GEN_PMU_FORMAT_ATTR(load_filter);
+GEN_PMU_FORMAT_ATTR(load_filter_mask);
GEN_PMU_FORMAT_ATTR(store_filter);
+GEN_PMU_FORMAT_ATTR(store_filter_mask);
+GEN_PMU_FORMAT_ATTR(simd_filter);
+GEN_PMU_FORMAT_ATTR(simd_filter_mask);
+GEN_PMU_FORMAT_ATTR(float_filter);
+GEN_PMU_FORMAT_ATTR(float_filter_mask);
GEN_PMU_FORMAT_ATTR(event_filter);
GEN_PMU_FORMAT_ATTR(inv_event_filter);
GEN_PMU_FORMAT_ATTR(min_latency);
@@ -228,8 +277,15 @@ static struct attribute *arm_spe_pmu_formats_attr[] = {
&format_attr_pct_enable.attr,
&format_attr_jitter.attr,
&format_attr_branch_filter.attr,
+ &format_attr_branch_filter_mask.attr,
&format_attr_load_filter.attr,
+ &format_attr_load_filter_mask.attr,
&format_attr_store_filter.attr,
+ &format_attr_store_filter_mask.attr,
+ &format_attr_simd_filter.attr,
+ &format_attr_simd_filter_mask.attr,
+ &format_attr_float_filter.attr,
+ &format_attr_float_filter_mask.attr,
&format_attr_event_filter.attr,
&format_attr_inv_event_filter.attr,
&format_attr_min_latency.attr,
@@ -250,6 +306,16 @@ static umode_t arm_spe_pmu_format_attr_is_visible(struct kobject *kobj,
if (attr == &format_attr_inv_event_filter.attr && !(spe_pmu->features & SPE_PMU_FEAT_INV_FILT_EVT))
return 0;
+ if ((attr == &format_attr_branch_filter_mask.attr ||
+ attr == &format_attr_load_filter_mask.attr ||
+ attr == &format_attr_store_filter_mask.attr ||
+ attr == &format_attr_simd_filter.attr ||
+ attr == &format_attr_simd_filter_mask.attr ||
+ attr == &format_attr_float_filter.attr ||
+ attr == &format_attr_float_filter_mask.attr) &&
+ !(spe_pmu->features & SPE_PMU_FEAT_EFT))
+ return 0;
+
return attr->mode;
}
@@ -345,8 +411,15 @@ static u64 arm_spe_event_to_pmsfcr(struct perf_event *event)
u64 reg = 0;
reg |= FIELD_PREP(PMSFCR_EL1_LD, ATTR_CFG_GET_FLD(attr, load_filter));
+ reg |= FIELD_PREP(PMSFCR_EL1_LDm, ATTR_CFG_GET_FLD(attr, load_filter_mask));
reg |= FIELD_PREP(PMSFCR_EL1_ST, ATTR_CFG_GET_FLD(attr, store_filter));
+ reg |= FIELD_PREP(PMSFCR_EL1_STm, ATTR_CFG_GET_FLD(attr, store_filter_mask));
reg |= FIELD_PREP(PMSFCR_EL1_B, ATTR_CFG_GET_FLD(attr, branch_filter));
+ reg |= FIELD_PREP(PMSFCR_EL1_Bm, ATTR_CFG_GET_FLD(attr, branch_filter_mask));
+ reg |= FIELD_PREP(PMSFCR_EL1_SIMD, ATTR_CFG_GET_FLD(attr, simd_filter));
+ reg |= FIELD_PREP(PMSFCR_EL1_SIMDm, ATTR_CFG_GET_FLD(attr, simd_filter_mask));
+ reg |= FIELD_PREP(PMSFCR_EL1_FP, ATTR_CFG_GET_FLD(attr, float_filter));
+ reg |= FIELD_PREP(PMSFCR_EL1_FPm, ATTR_CFG_GET_FLD(attr, float_filter_mask));
if (reg)
reg |= PMSFCR_EL1_FT;
@@ -697,20 +770,6 @@ static irqreturn_t arm_spe_pmu_irq_handler(int irq, void *dev)
return IRQ_HANDLED;
}
-static u64 arm_spe_pmsevfr_res0(u16 pmsver)
-{
- switch (pmsver) {
- case ID_AA64DFR0_EL1_PMSVer_IMP:
- return PMSEVFR_EL1_RES0_IMP;
- case ID_AA64DFR0_EL1_PMSVer_V1P1:
- return PMSEVFR_EL1_RES0_V1P1;
- case ID_AA64DFR0_EL1_PMSVer_V1P2:
- /* Return the highest version we support in default */
- default:
- return PMSEVFR_EL1_RES0_V1P2;
- }
-}
-
/* Perf callbacks */
static int arm_spe_pmu_event_init(struct perf_event *event)
{
@@ -726,10 +785,10 @@ static int arm_spe_pmu_event_init(struct perf_event *event)
!cpumask_test_cpu(event->cpu, &spe_pmu->supported_cpus))
return -ENOENT;
- if (arm_spe_event_to_pmsevfr(event) & arm_spe_pmsevfr_res0(spe_pmu->pmsver))
+ if (arm_spe_event_to_pmsevfr(event) & spe_pmu->pmsevfr_res0)
return -EOPNOTSUPP;
- if (arm_spe_event_to_pmsnevfr(event) & arm_spe_pmsevfr_res0(spe_pmu->pmsver))
+ if (arm_spe_event_to_pmsnevfr(event) & spe_pmu->pmsevfr_res0)
return -EOPNOTSUPP;
if (attr->exclude_idle)
@@ -762,6 +821,16 @@ static int arm_spe_pmu_event_init(struct perf_event *event)
!(spe_pmu->features & SPE_PMU_FEAT_FILT_LAT))
return -EOPNOTSUPP;
+ if ((FIELD_GET(PMSFCR_EL1_LDm, reg) ||
+ FIELD_GET(PMSFCR_EL1_STm, reg) ||
+ FIELD_GET(PMSFCR_EL1_Bm, reg) ||
+ FIELD_GET(PMSFCR_EL1_SIMD, reg) ||
+ FIELD_GET(PMSFCR_EL1_SIMDm, reg) ||
+ FIELD_GET(PMSFCR_EL1_FP, reg) ||
+ FIELD_GET(PMSFCR_EL1_FPm, reg)) &&
+ !(spe_pmu->features & SPE_PMU_FEAT_EFT))
+ return -EOPNOTSUPP;
+
if (ATTR_CFG_GET_FLD(&event->attr, discard) &&
!(spe_pmu->features & SPE_PMU_FEAT_DISCARD))
return -EOPNOTSUPP;
@@ -1053,6 +1122,9 @@ static void __arm_spe_pmu_dev_probe(void *info)
if (spe_pmu->pmsver >= ID_AA64DFR0_EL1_PMSVer_V1P2)
spe_pmu->features |= SPE_PMU_FEAT_DISCARD;
+ if (FIELD_GET(PMSIDR_EL1_EFT, reg))
+ spe_pmu->features |= SPE_PMU_FEAT_EFT;
+
/* This field has a spaced out encoding, so just use a look-up */
fld = FIELD_GET(PMSIDR_EL1_INTERVAL, reg);
switch (fld) {
@@ -1107,6 +1179,10 @@ static void __arm_spe_pmu_dev_probe(void *info)
spe_pmu->counter_sz = 16;
}
+ /* Write all 1s and then read back. Unsupported filter bits are RAZ/WI. */
+ write_sysreg_s(U64_MAX, SYS_PMSEVFR_EL1);
+ spe_pmu->pmsevfr_res0 = ~read_sysreg_s(SYS_PMSEVFR_EL1);
+
dev_info(dev,
"probed SPEv1.%d for CPUs %*pbl [max_record_sz %u, align %u, features 0x%llx]\n",
spe_pmu->pmsver - 1, cpumask_pr_args(&spe_pmu->supported_cpus),
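
The SPE probe change above replaces the per-version PMSEVFR_EL1 RES0 lookup with direct discovery: write all ones to the register, read it back, and treat any bit that did not stick (RAZ/WI) as unsupported. A stand-alone illustration of that probing idiom with a simulated register that only implements bits 0-15; not the SPE driver's sysreg accessors:

#include <stdint.h>
#include <stdio.h>

static uint64_t hw_reg;

static void reg_write(uint64_t val)
{
	hw_reg = val & 0xffffULL;	/* unimplemented bits are RAZ/WI */
}

static uint64_t reg_read(void)
{
	return hw_reg;
}

static uint64_t probe_res0_mask(void)
{
	reg_write(UINT64_MAX);		/* try to set every filter bit */
	return ~reg_read();		/* bits that stayed clear are RES0 */
}

int main(void)
{
	printf("res0 mask = 0x%llx\n", (unsigned long long)probe_res0_mask());
	return 0;
}
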
diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c
index 146ff57813fb..22f73ac894e9 100644
--- a/drivers/perf/dwc_pcie_pmu.c
+++ b/drivers/perf/dwc_pcie_pmu.c
@@ -39,6 +39,10 @@
#define DWC_PCIE_EVENT_CLEAR GENMASK(1, 0)
#define DWC_PCIE_EVENT_PER_CLEAR 0x1
+/* Event Selection Field has two subfields */
+#define DWC_PCIE_CNT_EVENT_SEL_GROUP GENMASK(11, 8)
+#define DWC_PCIE_CNT_EVENT_SEL_EVID GENMASK(7, 0)
+
#define DWC_PCIE_EVENT_CNT_DATA 0xC
#define DWC_PCIE_TIME_BASED_ANAL_CTL 0x10
@@ -73,6 +77,10 @@ enum dwc_pcie_event_type {
DWC_PCIE_EVENT_TYPE_MAX,
};
+#define DWC_PCIE_LANE_GROUP_6 6
+#define DWC_PCIE_LANE_GROUP_7 7
+#define DWC_PCIE_LANE_MAX_EVENTS_PER_GROUP 256
+
#define DWC_PCIE_LANE_EVENT_MAX_PERIOD GENMASK_ULL(31, 0)
#define DWC_PCIE_MAX_PERIOD GENMASK_ULL(63, 0)
@@ -82,8 +90,11 @@ struct dwc_pcie_pmu {
u16 ras_des_offset;
u32 nr_lanes;
+ /* Groups #6 and #7 */
+ DECLARE_BITMAP(lane_events, 2 * DWC_PCIE_LANE_MAX_EVENTS_PER_GROUP);
+ struct perf_event *time_based_event;
+
struct hlist_node cpuhp_node;
- struct perf_event *event[DWC_PCIE_EVENT_TYPE_MAX];
int on_cpu;
};
@@ -246,19 +257,26 @@ static const struct attribute_group *dwc_pcie_attr_groups[] = {
};
static void dwc_pcie_pmu_lane_event_enable(struct dwc_pcie_pmu *pcie_pmu,
+ struct perf_event *event,
bool enable)
{
struct pci_dev *pdev = pcie_pmu->pdev;
u16 ras_des_offset = pcie_pmu->ras_des_offset;
+ int event_id = DWC_PCIE_EVENT_ID(event);
+ int lane = DWC_PCIE_EVENT_LANE(event);
+ u32 ctrl;
+
+ ctrl = FIELD_PREP(DWC_PCIE_CNT_EVENT_SEL, event_id) |
+ FIELD_PREP(DWC_PCIE_CNT_LANE_SEL, lane) |
+ FIELD_PREP(DWC_PCIE_EVENT_CLEAR, DWC_PCIE_EVENT_PER_CLEAR);
if (enable)
- pci_clear_and_set_config_dword(pdev,
- ras_des_offset + DWC_PCIE_EVENT_CNT_CTL,
- DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_ON);
+ ctrl |= FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_ON);
else
- pci_clear_and_set_config_dword(pdev,
- ras_des_offset + DWC_PCIE_EVENT_CNT_CTL,
- DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_OFF);
+ ctrl |= FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_OFF);
+
+ pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL,
+ ctrl);
}
static void dwc_pcie_pmu_time_based_event_enable(struct dwc_pcie_pmu *pcie_pmu,
@@ -276,11 +294,22 @@ static u64 dwc_pcie_pmu_read_lane_event_counter(struct perf_event *event)
{
struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
struct pci_dev *pdev = pcie_pmu->pdev;
+ int event_id = DWC_PCIE_EVENT_ID(event);
+ int lane = DWC_PCIE_EVENT_LANE(event);
u16 ras_des_offset = pcie_pmu->ras_des_offset;
- u32 val;
+ u32 val, ctrl;
+ ctrl = FIELD_PREP(DWC_PCIE_CNT_EVENT_SEL, event_id) |
+ FIELD_PREP(DWC_PCIE_CNT_LANE_SEL, lane) |
+ FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_ON);
+ pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL,
+ ctrl);
pci_read_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_DATA, &val);
+ ctrl |= FIELD_PREP(DWC_PCIE_EVENT_CLEAR, DWC_PCIE_EVENT_PER_CLEAR);
+ pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL,
+ ctrl);
+
return val;
}
@@ -329,26 +358,77 @@ static void dwc_pcie_pmu_event_update(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event);
- u64 delta, prev, now = 0;
+ u64 delta, prev, now;
+
+ if (type == DWC_PCIE_LANE_EVENT) {
+ now = dwc_pcie_pmu_read_lane_event_counter(event) &
+ DWC_PCIE_LANE_EVENT_MAX_PERIOD;
+ local64_add(now, &event->count);
+ return;
+ }
do {
prev = local64_read(&hwc->prev_count);
-
- if (type == DWC_PCIE_LANE_EVENT)
- now = dwc_pcie_pmu_read_lane_event_counter(event);
- else if (type == DWC_PCIE_TIME_BASE_EVENT)
- now = dwc_pcie_pmu_read_time_based_counter(event);
+ now = dwc_pcie_pmu_read_time_based_counter(event);
} while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
delta = (now - prev) & DWC_PCIE_MAX_PERIOD;
- /* 32-bit counter for Lane Event Counting */
- if (type == DWC_PCIE_LANE_EVENT)
- delta &= DWC_PCIE_LANE_EVENT_MAX_PERIOD;
-
local64_add(delta, &event->count);
}
+static int dwc_pcie_pmu_validate_add_lane_event(struct perf_event *event,
+ unsigned long val_lane_events[])
+{
+ int event_id, event_nr, group;
+
+ event_id = DWC_PCIE_EVENT_ID(event);
+ event_nr = FIELD_GET(DWC_PCIE_CNT_EVENT_SEL_EVID, event_id);
+ group = FIELD_GET(DWC_PCIE_CNT_EVENT_SEL_GROUP, event_id);
+
+ if (group != DWC_PCIE_LANE_GROUP_6 && group != DWC_PCIE_LANE_GROUP_7)
+ return -EINVAL;
+
+ group -= DWC_PCIE_LANE_GROUP_6;
+
+ if (test_and_set_bit(group * DWC_PCIE_LANE_MAX_EVENTS_PER_GROUP + event_nr,
+ val_lane_events))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int dwc_pcie_pmu_validate_group(struct perf_event *event)
+{
+ struct perf_event *sibling, *leader = event->group_leader;
+ DECLARE_BITMAP(val_lane_events, 2 * DWC_PCIE_LANE_MAX_EVENTS_PER_GROUP);
+ bool time_event = false;
+ int type;
+
+ type = DWC_PCIE_EVENT_TYPE(leader);
+ if (type == DWC_PCIE_TIME_BASE_EVENT)
+ time_event = true;
+ else
+ if (dwc_pcie_pmu_validate_add_lane_event(leader, val_lane_events))
+ return -ENOSPC;
+
+ for_each_sibling_event(sibling, leader) {
+ type = DWC_PCIE_EVENT_TYPE(sibling);
+ if (type == DWC_PCIE_TIME_BASE_EVENT) {
+ if (time_event)
+ return -ENOSPC;
+
+ time_event = true;
+ continue;
+ }
+
+ if (dwc_pcie_pmu_validate_add_lane_event(sibling, val_lane_events))
+ return -ENOSPC;
+ }
+
+ return 0;
+}
+
static int dwc_pcie_pmu_event_init(struct perf_event *event)
{
struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu);
@@ -367,10 +447,6 @@ static int dwc_pcie_pmu_event_init(struct perf_event *event)
if (event->cpu < 0 || event->attach_state & PERF_ATTACH_TASK)
return -EINVAL;
- if (event->group_leader != event &&
- !is_software_event(event->group_leader))
- return -EINVAL;
-
for_each_sibling_event(sibling, event->group_leader) {
if (sibling->pmu != event->pmu && !is_software_event(sibling))
return -EINVAL;
@@ -385,6 +461,9 @@ static int dwc_pcie_pmu_event_init(struct perf_event *event)
return -EINVAL;
}
+ if (dwc_pcie_pmu_validate_group(event))
+ return -ENOSPC;
+
event->cpu = pcie_pmu->on_cpu;
return 0;
@@ -400,7 +479,7 @@ static void dwc_pcie_pmu_event_start(struct perf_event *event, int flags)
local64_set(&hwc->prev_count, 0);
if (type == DWC_PCIE_LANE_EVENT)
- dwc_pcie_pmu_lane_event_enable(pcie_pmu, true);
+ dwc_pcie_pmu_lane_event_enable(pcie_pmu, event, true);
else if (type == DWC_PCIE_TIME_BASE_EVENT)
dwc_pcie_pmu_time_based_event_enable(pcie_pmu, true);
}
@@ -414,12 +493,13 @@ static void dwc_pcie_pmu_event_stop(struct perf_event *event, int flags)
if (event->hw.state & PERF_HES_STOPPED)
return;
+ dwc_pcie_pmu_event_update(event);
+
if (type == DWC_PCIE_LANE_EVENT)
- dwc_pcie_pmu_lane_event_enable(pcie_pmu, false);
+ dwc_pcie_pmu_lane_event_enable(pcie_pmu, event, false);
else if (type == DWC_PCIE_TIME_BASE_EVENT)
dwc_pcie_pmu_time_based_event_enable(pcie_pmu, false);
- dwc_pcie_pmu_event_update(event);
hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
}
@@ -434,14 +514,17 @@ static int dwc_pcie_pmu_event_add(struct perf_event *event, int flags)
u16 ras_des_offset = pcie_pmu->ras_des_offset;
u32 ctrl;
- /* one counter for each type and it is in use */
- if (pcie_pmu->event[type])
- return -ENOSPC;
-
- pcie_pmu->event[type] = event;
hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
if (type == DWC_PCIE_LANE_EVENT) {
+ int event_nr = FIELD_GET(DWC_PCIE_CNT_EVENT_SEL_EVID, event_id);
+ int group = FIELD_GET(DWC_PCIE_CNT_EVENT_SEL_GROUP, event_id) -
+ DWC_PCIE_LANE_GROUP_6;
+
+ if (test_and_set_bit(group * DWC_PCIE_LANE_MAX_EVENTS_PER_GROUP + event_nr,
+ pcie_pmu->lane_events))
+ return -ENOSPC;
+
/* EVENT_COUNTER_DATA_REG needs to be cleared manually */
ctrl = FIELD_PREP(DWC_PCIE_CNT_EVENT_SEL, event_id) |
FIELD_PREP(DWC_PCIE_CNT_LANE_SEL, lane) |
@@ -450,6 +533,11 @@ static int dwc_pcie_pmu_event_add(struct perf_event *event, int flags)
pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL,
ctrl);
} else if (type == DWC_PCIE_TIME_BASE_EVENT) {
+ if (pcie_pmu->time_based_event)
+ return -ENOSPC;
+
+ pcie_pmu->time_based_event = event;
+
/*
* TIME_BASED_ANAL_DATA_REG is a 64 bit register, we can safely
* use it with any manually controlled duration. And it is
@@ -478,7 +566,18 @@ static void dwc_pcie_pmu_event_del(struct perf_event *event, int flags)
dwc_pcie_pmu_event_stop(event, flags | PERF_EF_UPDATE);
perf_event_update_userpage(event);
- pcie_pmu->event[type] = NULL;
+
+ if (type == DWC_PCIE_TIME_BASE_EVENT) {
+ pcie_pmu->time_based_event = NULL;
+ } else {
+ int event_id = DWC_PCIE_EVENT_ID(event);
+ int event_nr = FIELD_GET(DWC_PCIE_CNT_EVENT_SEL_EVID, event_id);
+ int group = FIELD_GET(DWC_PCIE_CNT_EVENT_SEL_GROUP, event_id) -
+ DWC_PCIE_LANE_GROUP_6;
+
+ clear_bit(group * DWC_PCIE_LANE_MAX_EVENTS_PER_GROUP + event_nr,
+ pcie_pmu->lane_events);
+ }
}
static void dwc_pcie_pmu_remove_cpuhp_instance(void *hotplug_node)
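
The dwc_pcie_pmu rework above replaces the single per-type event slot with a bitmap keyed by (group, event id), so the driver tracks exactly which lane events are currently added and rejects duplicates. A user-space sketch of that bookkeeping; the group numbers (6 and 7) and 256 events per group mirror the driver, while the bitmap helpers are illustrative:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EVENTS_PER_GROUP 256
#define FIRST_GROUP      6
#define NUM_GROUPS       2

static uint8_t lane_events[NUM_GROUPS * EVENTS_PER_GROUP / 8];

static bool test_and_set(unsigned int bit)
{
	uint8_t mask = 1u << (bit & 7);
	bool old = lane_events[bit >> 3] & mask;

	lane_events[bit >> 3] |= mask;
	return old;
}

static int claim_lane_event(unsigned int group, unsigned int event_id)
{
	if (group != 6 && group != 7)
		return -1;	/* only groups 6 and 7 carry lane events */
	if (event_id >= EVENTS_PER_GROUP)
		return -1;
	if (test_and_set((group - FIRST_GROUP) * EVENTS_PER_GROUP + event_id))
		return -1;	/* this (group, event) is already being counted */
	return 0;
}

int main(void)
{
	printf("%d %d\n", claim_lane_event(6, 0x10), claim_lane_event(6, 0x10));	/* 0 -1 */
	return 0;
}
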
diff --git a/drivers/perf/fsl_imx9_ddr_perf.c b/drivers/perf/fsl_imx9_ddr_perf.c
index 267754fdf581..7050b48c0467 100644
--- a/drivers/perf/fsl_imx9_ddr_perf.c
+++ b/drivers/perf/fsl_imx9_ddr_perf.c
@@ -104,6 +104,11 @@ static const struct imx_ddr_devtype_data imx93_devtype_data = {
.filter_ver = DDR_PERF_AXI_FILTER_V1
};
+static const struct imx_ddr_devtype_data imx94_devtype_data = {
+ .identifier = "imx94",
+ .filter_ver = DDR_PERF_AXI_FILTER_V2
+};
+
static const struct imx_ddr_devtype_data imx95_devtype_data = {
.identifier = "imx95",
.filter_ver = DDR_PERF_AXI_FILTER_V2
@@ -122,6 +127,7 @@ static inline bool axi_filter_v2(struct ddr_pmu *pmu)
static const struct of_device_id imx_ddr_pmu_dt_ids[] = {
{ .compatible = "fsl,imx91-ddr-pmu", .data = &imx91_devtype_data },
{ .compatible = "fsl,imx93-ddr-pmu", .data = &imx93_devtype_data },
+ { .compatible = "fsl,imx94-ddr-pmu", .data = &imx94_devtype_data },
{ .compatible = "fsl,imx95-ddr-pmu", .data = &imx95_devtype_data },
{ /* sentinel */ }
};
diff --git a/drivers/perf/fujitsu_uncore_pmu.c b/drivers/perf/fujitsu_uncore_pmu.c
new file mode 100644
index 000000000000..c3c6f56474ad
--- /dev/null
+++ b/drivers/perf/fujitsu_uncore_pmu.c
@@ -0,0 +1,613 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Driver for the Uncore PMUs in Fujitsu chips.
+ *
+ * See Documentation/admin-guide/perf/fujitsu_uncore_pmu.rst for more details.
+ *
+ * Copyright (c) 2025 Fujitsu. All rights reserved.
+ */
+
+#include <linux/acpi.h>
+#include <linux/bitfield.h>
+#include <linux/bitops.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/list.h>
+#include <linux/mod_devicetable.h>
+#include <linux/module.h>
+#include <linux/perf_event.h>
+#include <linux/platform_device.h>
+
+/* Number of counters on each PMU */
+#define MAC_NUM_COUNTERS 8
+#define PCI_NUM_COUNTERS 8
+/* Mask for the event type field within perf_event_attr.config and EVTYPE reg */
+#define UNCORE_EVTYPE_MASK 0xFF
+
+/* Perfmon registers */
+#define PM_EVCNTR(__cntr) (0x000 + (__cntr) * 8)
+#define PM_CNTCTL(__cntr) (0x100 + (__cntr) * 8)
+#define PM_CNTCTL_RESET 0
+#define PM_EVTYPE(__cntr) (0x200 + (__cntr) * 8)
+#define PM_EVTYPE_EVSEL(__val) FIELD_GET(UNCORE_EVTYPE_MASK, __val)
+#define PM_CR 0x400
+#define PM_CR_RESET BIT(1)
+#define PM_CR_ENABLE BIT(0)
+#define PM_CNTENSET 0x410
+#define PM_CNTENSET_IDX(__cntr) BIT(__cntr)
+#define PM_CNTENCLR 0x418
+#define PM_CNTENCLR_IDX(__cntr) BIT(__cntr)
+#define PM_CNTENCLR_RESET 0xFF
+#define PM_INTENSET 0x420
+#define PM_INTENSET_IDX(__cntr) BIT(__cntr)
+#define PM_INTENCLR 0x428
+#define PM_INTENCLR_IDX(__cntr) BIT(__cntr)
+#define PM_INTENCLR_RESET 0xFF
+#define PM_OVSR 0x440
+#define PM_OVSR_OVSRCLR_RESET 0xFF
+
+enum fujitsu_uncore_pmu {
+ FUJITSU_UNCORE_PMU_MAC = 1,
+ FUJITSU_UNCORE_PMU_PCI = 2,
+};
+
+struct uncore_pmu {
+ int num_counters;
+ struct pmu pmu;
+ struct hlist_node node;
+ void __iomem *regs;
+ struct perf_event **events;
+ unsigned long *used_mask;
+ int cpu;
+ int irq;
+ struct device *dev;
+};
+
+#define to_uncore_pmu(p) (container_of(p, struct uncore_pmu, pmu))
+
+static int uncore_pmu_cpuhp_state;
+
+static void fujitsu_uncore_counter_start(struct perf_event *event)
+{
+ struct uncore_pmu *uncorepmu = to_uncore_pmu(event->pmu);
+ int idx = event->hw.idx;
+
+	/* Initialize the hardware counter and reset prev_count */
+ local64_set(&event->hw.prev_count, 0);
+ writeq_relaxed(0, uncorepmu->regs + PM_EVCNTR(idx));
+
+ /* Set the event type */
+ writeq_relaxed(PM_EVTYPE_EVSEL(event->attr.config), uncorepmu->regs + PM_EVTYPE(idx));
+
+ /* Enable interrupt generation by this counter */
+ writeq_relaxed(PM_INTENSET_IDX(idx), uncorepmu->regs + PM_INTENSET);
+
+ /* Finally, enable the counter */
+ writeq_relaxed(PM_CNTCTL_RESET, uncorepmu->regs + PM_CNTCTL(idx));
+ writeq_relaxed(PM_CNTENSET_IDX(idx), uncorepmu->regs + PM_CNTENSET);
+}
+
+static void fujitsu_uncore_counter_stop(struct perf_event *event)
+{
+ struct uncore_pmu *uncorepmu = to_uncore_pmu(event->pmu);
+ int idx = event->hw.idx;
+
+ /* Disable the counter */
+ writeq_relaxed(PM_CNTENCLR_IDX(idx), uncorepmu->regs + PM_CNTENCLR);
+
+ /* Disable interrupt generation by this counter */
+ writeq_relaxed(PM_INTENCLR_IDX(idx), uncorepmu->regs + PM_INTENCLR);
+}
+
+static void fujitsu_uncore_counter_update(struct perf_event *event)
+{
+ struct uncore_pmu *uncorepmu = to_uncore_pmu(event->pmu);
+ int idx = event->hw.idx;
+ u64 prev, new;
+
+ do {
+ prev = local64_read(&event->hw.prev_count);
+ new = readq_relaxed(uncorepmu->regs + PM_EVCNTR(idx));
+ } while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev);
+
+ local64_add(new - prev, &event->count);
+}
+
+static inline void fujitsu_uncore_init(struct uncore_pmu *uncorepmu)
+{
+ int i;
+
+ writeq_relaxed(PM_CR_RESET, uncorepmu->regs + PM_CR);
+
+ writeq_relaxed(PM_CNTENCLR_RESET, uncorepmu->regs + PM_CNTENCLR);
+ writeq_relaxed(PM_INTENCLR_RESET, uncorepmu->regs + PM_INTENCLR);
+ writeq_relaxed(PM_OVSR_OVSRCLR_RESET, uncorepmu->regs + PM_OVSR);
+
+ for (i = 0; i < uncorepmu->num_counters; ++i) {
+ writeq_relaxed(PM_CNTCTL_RESET, uncorepmu->regs + PM_CNTCTL(i));
+ writeq_relaxed(PM_EVTYPE_EVSEL(0), uncorepmu->regs + PM_EVTYPE(i));
+ }
+ writeq_relaxed(PM_CR_ENABLE, uncorepmu->regs + PM_CR);
+}
+
+static irqreturn_t fujitsu_uncore_handle_irq(int irq_num, void *data)
+{
+ struct uncore_pmu *uncorepmu = data;
+ /* Read the overflow status register */
+ long status = readq_relaxed(uncorepmu->regs + PM_OVSR);
+ int idx;
+
+ if (status == 0)
+ return IRQ_NONE;
+
+ /* Clear the bits we read on the overflow status register */
+ writeq_relaxed(status, uncorepmu->regs + PM_OVSR);
+
+ for_each_set_bit(idx, &status, uncorepmu->num_counters) {
+ struct perf_event *event;
+
+ event = uncorepmu->events[idx];
+ if (!event)
+ continue;
+
+ fujitsu_uncore_counter_update(event);
+ }
+
+ return IRQ_HANDLED;
+}
+
+static void fujitsu_uncore_pmu_enable(struct pmu *pmu)
+{
+ writeq_relaxed(PM_CR_ENABLE, to_uncore_pmu(pmu)->regs + PM_CR);
+}
+
+static void fujitsu_uncore_pmu_disable(struct pmu *pmu)
+{
+ writeq_relaxed(0, to_uncore_pmu(pmu)->regs + PM_CR);
+}
+
+static bool fujitsu_uncore_validate_event_group(struct perf_event *event)
+{
+ struct uncore_pmu *uncorepmu = to_uncore_pmu(event->pmu);
+ struct perf_event *leader = event->group_leader;
+ struct perf_event *sibling;
+ int counters = 1;
+
+ if (leader == event)
+ return true;
+
+ if (leader->pmu == event->pmu)
+ counters++;
+
+ for_each_sibling_event(sibling, leader) {
+ if (sibling->pmu == event->pmu)
+ counters++;
+ }
+
+ /*
+ * If the group requires more counters than the HW has, it
+ * cannot ever be scheduled.
+ */
+ return counters <= uncorepmu->num_counters;
+}
+
+static int fujitsu_uncore_event_init(struct perf_event *event)
+{
+ struct uncore_pmu *uncorepmu = to_uncore_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
+
+ /* Is the event for this PMU? */
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ /*
+ * Sampling not supported since these events are not
+ * core-attributable.
+ */
+ if (is_sampling_event(event))
+ return -EINVAL;
+
+ /*
+	 * Task mode is not available: these are socket-wide counters, not
+	 * attributable to any CPU, so per-task attribution is not possible.
+ */
+ if (event->cpu < 0)
+ return -EINVAL;
+
+ /* Validate the group */
+ if (!fujitsu_uncore_validate_event_group(event))
+ return -EINVAL;
+
+ hwc->idx = -1;
+
+ event->cpu = uncorepmu->cpu;
+
+ return 0;
+}
+
+static void fujitsu_uncore_event_start(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ hwc->state = 0;
+ fujitsu_uncore_counter_start(event);
+}
+
+static void fujitsu_uncore_event_stop(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (hwc->state & PERF_HES_STOPPED)
+ return;
+
+ fujitsu_uncore_counter_stop(event);
+ if (flags & PERF_EF_UPDATE)
+ fujitsu_uncore_counter_update(event);
+ hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+}
+
+static int fujitsu_uncore_event_add(struct perf_event *event, int flags)
+{
+ struct uncore_pmu *uncorepmu = to_uncore_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
+ int idx;
+
+ /* Try to allocate a counter. */
+ idx = bitmap_find_free_region(uncorepmu->used_mask, uncorepmu->num_counters, 0);
+ if (idx < 0)
+ /* The counters are all in use. */
+ return -EAGAIN;
+
+ hwc->idx = idx;
+ hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+ uncorepmu->events[idx] = event;
+
+ if (flags & PERF_EF_START)
+ fujitsu_uncore_event_start(event, 0);
+
+ /* Propagate changes to the userspace mapping. */
+ perf_event_update_userpage(event);
+
+ return 0;
+}
+
+static void fujitsu_uncore_event_del(struct perf_event *event, int flags)
+{
+ struct uncore_pmu *uncorepmu = to_uncore_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
+
+ /* Stop and clean up */
+ fujitsu_uncore_event_stop(event, flags | PERF_EF_UPDATE);
+ uncorepmu->events[hwc->idx] = NULL;
+ bitmap_release_region(uncorepmu->used_mask, hwc->idx, 0);
+
+ /* Propagate changes to the userspace mapping. */
+ perf_event_update_userpage(event);
+}
+
+static void fujitsu_uncore_event_read(struct perf_event *event)
+{
+ fujitsu_uncore_counter_update(event);
+}
+
+#define UNCORE_PMU_FORMAT_ATTR(_name, _config) \
+ (&((struct dev_ext_attribute[]) { \
+ { .attr = __ATTR(_name, 0444, device_show_string, NULL), \
+ .var = (void *)_config, } \
+ })[0].attr.attr)
+
+static struct attribute *fujitsu_uncore_pmu_formats[] = {
+ UNCORE_PMU_FORMAT_ATTR(event, "config:0-7"),
+ NULL
+};
+
+static const struct attribute_group fujitsu_uncore_pmu_format_group = {
+ .name = "format",
+ .attrs = fujitsu_uncore_pmu_formats,
+};
+
+static ssize_t fujitsu_uncore_pmu_event_show(struct device *dev,
+ struct device_attribute *attr, char *page)
+{
+ struct perf_pmu_events_attr *pmu_attr;
+
+ pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
+ return sysfs_emit(page, "event=0x%02llx\n", pmu_attr->id);
+}
+
+#define MAC_EVENT_ATTR(_name, _id) \
+ PMU_EVENT_ATTR_ID(_name, fujitsu_uncore_pmu_event_show, _id)
+
+static struct attribute *fujitsu_uncore_mac_pmu_events[] = {
+ MAC_EVENT_ATTR(cycles, 0x00),
+ MAC_EVENT_ATTR(read-count, 0x10),
+ MAC_EVENT_ATTR(read-count-request, 0x11),
+ MAC_EVENT_ATTR(read-count-return, 0x12),
+ MAC_EVENT_ATTR(read-count-request-pftgt, 0x13),
+ MAC_EVENT_ATTR(read-count-request-normal, 0x14),
+ MAC_EVENT_ATTR(read-count-return-pftgt-hit, 0x15),
+ MAC_EVENT_ATTR(read-count-return-pftgt-miss, 0x16),
+ MAC_EVENT_ATTR(read-wait, 0x17),
+ MAC_EVENT_ATTR(write-count, 0x20),
+ MAC_EVENT_ATTR(write-count-write, 0x21),
+ MAC_EVENT_ATTR(write-count-pwrite, 0x22),
+ MAC_EVENT_ATTR(memory-read-count, 0x40),
+ MAC_EVENT_ATTR(memory-write-count, 0x50),
+ MAC_EVENT_ATTR(memory-pwrite-count, 0x60),
+ MAC_EVENT_ATTR(ea-mac, 0x80),
+ MAC_EVENT_ATTR(ea-memory, 0x90),
+ MAC_EVENT_ATTR(ea-memory-mac-write, 0x92),
+ MAC_EVENT_ATTR(ea-ha, 0xa0),
+ NULL
+};
+
+#define PCI_EVENT_ATTR(_name, _id) \
+ PMU_EVENT_ATTR_ID(_name, fujitsu_uncore_pmu_event_show, _id)
+
+static struct attribute *fujitsu_uncore_pci_pmu_events[] = {
+ PCI_EVENT_ATTR(pci-port0-cycles, 0x00),
+ PCI_EVENT_ATTR(pci-port0-read-count, 0x10),
+ PCI_EVENT_ATTR(pci-port0-read-count-bus, 0x14),
+ PCI_EVENT_ATTR(pci-port0-write-count, 0x20),
+ PCI_EVENT_ATTR(pci-port0-write-count-bus, 0x24),
+ PCI_EVENT_ATTR(pci-port1-cycles, 0x40),
+ PCI_EVENT_ATTR(pci-port1-read-count, 0x50),
+ PCI_EVENT_ATTR(pci-port1-read-count-bus, 0x54),
+ PCI_EVENT_ATTR(pci-port1-write-count, 0x60),
+ PCI_EVENT_ATTR(pci-port1-write-count-bus, 0x64),
+ PCI_EVENT_ATTR(ea-pci, 0x80),
+ NULL
+};
+
+static const struct attribute_group fujitsu_uncore_mac_pmu_events_group = {
+ .name = "events",
+ .attrs = fujitsu_uncore_mac_pmu_events,
+};
+
+static const struct attribute_group fujitsu_uncore_pci_pmu_events_group = {
+ .name = "events",
+ .attrs = fujitsu_uncore_pci_pmu_events,
+};
+
+static ssize_t cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct uncore_pmu *uncorepmu = to_uncore_pmu(dev_get_drvdata(dev));
+
+ return cpumap_print_to_pagebuf(true, buf, cpumask_of(uncorepmu->cpu));
+}
+static DEVICE_ATTR_RO(cpumask);
+
+static struct attribute *fujitsu_uncore_pmu_cpumask_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL
+};
+
+static const struct attribute_group fujitsu_uncore_pmu_cpumask_attr_group = {
+ .attrs = fujitsu_uncore_pmu_cpumask_attrs,
+};
+
+static const struct attribute_group *fujitsu_uncore_mac_pmu_attr_grps[] = {
+ &fujitsu_uncore_pmu_format_group,
+ &fujitsu_uncore_mac_pmu_events_group,
+ &fujitsu_uncore_pmu_cpumask_attr_group,
+ NULL
+};
+
+static const struct attribute_group *fujitsu_uncore_pci_pmu_attr_grps[] = {
+ &fujitsu_uncore_pmu_format_group,
+ &fujitsu_uncore_pci_pmu_events_group,
+ &fujitsu_uncore_pmu_cpumask_attr_group,
+ NULL
+};
+
+static void fujitsu_uncore_pmu_migrate(struct uncore_pmu *uncorepmu, unsigned int cpu)
+{
+ perf_pmu_migrate_context(&uncorepmu->pmu, uncorepmu->cpu, cpu);
+ irq_set_affinity(uncorepmu->irq, cpumask_of(cpu));
+ uncorepmu->cpu = cpu;
+}
+
+static int fujitsu_uncore_pmu_online_cpu(unsigned int cpu, struct hlist_node *cpuhp_node)
+{
+ struct uncore_pmu *uncorepmu;
+ int node;
+
+ uncorepmu = hlist_entry_safe(cpuhp_node, struct uncore_pmu, node);
+ node = dev_to_node(uncorepmu->dev);
+ if (cpu_to_node(uncorepmu->cpu) != node && cpu_to_node(cpu) == node)
+ fujitsu_uncore_pmu_migrate(uncorepmu, cpu);
+
+ return 0;
+}
+
+static int fujitsu_uncore_pmu_offline_cpu(unsigned int cpu, struct hlist_node *cpuhp_node)
+{
+ struct uncore_pmu *uncorepmu;
+ unsigned int target;
+ int node;
+
+ uncorepmu = hlist_entry_safe(cpuhp_node, struct uncore_pmu, node);
+ if (cpu != uncorepmu->cpu)
+ return 0;
+
+ node = dev_to_node(uncorepmu->dev);
+ target = cpumask_any_and_but(cpumask_of_node(node), cpu_online_mask, cpu);
+ if (target >= nr_cpu_ids)
+ target = cpumask_any_but(cpu_online_mask, cpu);
+
+ if (target < nr_cpu_ids)
+ fujitsu_uncore_pmu_migrate(uncorepmu, target);
+
+ return 0;
+}
+
+static int fujitsu_uncore_pmu_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ unsigned long device_type = (unsigned long)device_get_match_data(dev);
+ const struct attribute_group **attr_groups;
+ struct uncore_pmu *uncorepmu;
+ struct resource *memrc;
+ size_t alloc_size;
+ char *name;
+ int ret;
+ int irq;
+ u64 uid;
+
+ ret = acpi_dev_uid_to_integer(ACPI_COMPANION(dev), &uid);
+ if (ret)
+ return dev_err_probe(dev, ret, "unable to read ACPI uid\n");
+
+ uncorepmu = devm_kzalloc(dev, sizeof(*uncorepmu), GFP_KERNEL);
+ if (!uncorepmu)
+ return -ENOMEM;
+ uncorepmu->dev = dev;
+ uncorepmu->cpu = cpumask_local_spread(0, dev_to_node(dev));
+ platform_set_drvdata(pdev, uncorepmu);
+
+ switch (device_type) {
+ case FUJITSU_UNCORE_PMU_MAC:
+ uncorepmu->num_counters = MAC_NUM_COUNTERS;
+ attr_groups = fujitsu_uncore_mac_pmu_attr_grps;
+ name = devm_kasprintf(dev, GFP_KERNEL, "mac_iod%llu_mac%llu_ch%llu",
+ (uid >> 8) & 0xF, (uid >> 4) & 0xF, uid & 0xF);
+ break;
+ case FUJITSU_UNCORE_PMU_PCI:
+ uncorepmu->num_counters = PCI_NUM_COUNTERS;
+ attr_groups = fujitsu_uncore_pci_pmu_attr_grps;
+ name = devm_kasprintf(dev, GFP_KERNEL, "pci_iod%llu_pci%llu",
+ (uid >> 4) & 0xF, uid & 0xF);
+ break;
+ default:
+ return dev_err_probe(dev, -EINVAL, "illegal device type: %lu\n", device_type);
+ }
+ if (!name)
+ return -ENOMEM;
+
+ uncorepmu->pmu = (struct pmu) {
+ .parent = dev,
+ .task_ctx_nr = perf_invalid_context,
+
+ .attr_groups = attr_groups,
+
+ .pmu_enable = fujitsu_uncore_pmu_enable,
+ .pmu_disable = fujitsu_uncore_pmu_disable,
+ .event_init = fujitsu_uncore_event_init,
+ .add = fujitsu_uncore_event_add,
+ .del = fujitsu_uncore_event_del,
+ .start = fujitsu_uncore_event_start,
+ .stop = fujitsu_uncore_event_stop,
+ .read = fujitsu_uncore_event_read,
+
+ .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT,
+ };
+
+ alloc_size = sizeof(uncorepmu->events[0]) * uncorepmu->num_counters;
+ uncorepmu->events = devm_kzalloc(dev, alloc_size, GFP_KERNEL);
+ if (!uncorepmu->events)
+ return -ENOMEM;
+
+ alloc_size = sizeof(uncorepmu->used_mask[0]) * BITS_TO_LONGS(uncorepmu->num_counters);
+ uncorepmu->used_mask = devm_kzalloc(dev, alloc_size, GFP_KERNEL);
+ if (!uncorepmu->used_mask)
+ return -ENOMEM;
+
+ uncorepmu->regs = devm_platform_get_and_ioremap_resource(pdev, 0, &memrc);
+ if (IS_ERR(uncorepmu->regs))
+ return PTR_ERR(uncorepmu->regs);
+
+ fujitsu_uncore_init(uncorepmu);
+
+ irq = platform_get_irq(pdev, 0);
+ if (irq < 0)
+ return irq;
+
+ ret = devm_request_irq(dev, irq, fujitsu_uncore_handle_irq,
+ IRQF_NOBALANCING | IRQF_NO_THREAD,
+ name, uncorepmu);
+ if (ret)
+ return dev_err_probe(dev, ret, "Failed to request IRQ:%d\n", irq);
+
+ ret = irq_set_affinity(irq, cpumask_of(uncorepmu->cpu));
+ if (ret)
+ return dev_err_probe(dev, ret, "Failed to set irq affinity:%d\n", irq);
+
+ uncorepmu->irq = irq;
+
+ /* Add this instance to the list used by the offline callback */
+ ret = cpuhp_state_add_instance(uncore_pmu_cpuhp_state, &uncorepmu->node);
+ if (ret)
+ return dev_err_probe(dev, ret, "Error registering hotplug");
+
+ ret = perf_pmu_register(&uncorepmu->pmu, name, -1);
+ if (ret < 0) {
+ cpuhp_state_remove_instance_nocalls(uncore_pmu_cpuhp_state, &uncorepmu->node);
+ return dev_err_probe(dev, ret, "Failed to register %s PMU\n", name);
+ }
+
+ dev_dbg(dev, "Registered %s, type: %d\n", name, uncorepmu->pmu.type);
+
+ return 0;
+}
+
+static void fujitsu_uncore_pmu_remove(struct platform_device *pdev)
+{
+ struct uncore_pmu *uncorepmu = platform_get_drvdata(pdev);
+
+ writeq_relaxed(0, uncorepmu->regs + PM_CR);
+
+ perf_pmu_unregister(&uncorepmu->pmu);
+ cpuhp_state_remove_instance_nocalls(uncore_pmu_cpuhp_state, &uncorepmu->node);
+}
+
+static const struct acpi_device_id fujitsu_uncore_pmu_acpi_match[] = {
+ { "FUJI200C", FUJITSU_UNCORE_PMU_MAC },
+ { "FUJI200D", FUJITSU_UNCORE_PMU_PCI },
+ { }
+};
+MODULE_DEVICE_TABLE(acpi, fujitsu_uncore_pmu_acpi_match);
+
+static struct platform_driver fujitsu_uncore_pmu_driver = {
+ .driver = {
+ .name = "fujitsu-uncore-pmu",
+ .acpi_match_table = fujitsu_uncore_pmu_acpi_match,
+ .suppress_bind_attrs = true,
+ },
+ .probe = fujitsu_uncore_pmu_probe,
+ .remove = fujitsu_uncore_pmu_remove,
+};
+
+static int __init fujitsu_uncore_pmu_init(void)
+{
+ int ret;
+
+ /* Install a hook to update the reader CPU in case it goes offline */
+ ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
+ "perf/fujitsu/uncore:online",
+ fujitsu_uncore_pmu_online_cpu,
+ fujitsu_uncore_pmu_offline_cpu);
+ if (ret < 0)
+ return ret;
+
+ uncore_pmu_cpuhp_state = ret;
+
+ ret = platform_driver_register(&fujitsu_uncore_pmu_driver);
+ if (ret)
+ cpuhp_remove_multi_state(uncore_pmu_cpuhp_state);
+
+ return ret;
+}
+
+static void __exit fujitsu_uncore_pmu_exit(void)
+{
+ platform_driver_unregister(&fujitsu_uncore_pmu_driver);
+ cpuhp_remove_multi_state(uncore_pmu_cpuhp_state);
+}
+
+module_init(fujitsu_uncore_pmu_init);
+module_exit(fujitsu_uncore_pmu_exit);
+
+MODULE_AUTHOR("Koichi Okuno <fj2767dz@fujitsu.com>");
+MODULE_DESCRIPTION("Fujitsu Uncore PMU driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/perf/hisilicon/Makefile b/drivers/perf/hisilicon/Makefile
index 48dcc8381ea7..186be3d02238 100644
--- a/drivers/perf/hisilicon/Makefile
+++ b/drivers/perf/hisilicon/Makefile
@@ -1,7 +1,8 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o \
hisi_uncore_hha_pmu.o hisi_uncore_ddrc_pmu.o hisi_uncore_sllc_pmu.o \
- hisi_uncore_pa_pmu.o hisi_uncore_cpa_pmu.o hisi_uncore_uc_pmu.o
+ hisi_uncore_pa_pmu.o hisi_uncore_cpa_pmu.o hisi_uncore_uc_pmu.o \
+ hisi_uncore_noc_pmu.o hisi_uncore_mn_pmu.o
obj-$(CONFIG_HISI_PCIE_PMU) += hisi_pcie_pmu.o
obj-$(CONFIG_HNS3_PMU) += hns3_pmu.o
diff --git a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c
index 412fc3a97963..bbd81a43047d 100644
--- a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c
+++ b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c
@@ -39,6 +39,7 @@
/* L3C has 8-counters */
#define L3C_NR_COUNTERS 0x8
+#define L3C_MAX_EXT 2
#define L3C_PERF_CTRL_EN 0x10000
#define L3C_TRACETAG_EN BIT(31)
@@ -55,59 +56,152 @@
#define L3C_V1_NR_EVENTS 0x59
#define L3C_V2_NR_EVENTS 0xFF
-HISI_PMU_EVENT_ATTR_EXTRACTOR(tt_core, config1, 7, 0);
+HISI_PMU_EVENT_ATTR_EXTRACTOR(ext, config, 17, 16);
HISI_PMU_EVENT_ATTR_EXTRACTOR(tt_req, config1, 10, 8);
HISI_PMU_EVENT_ATTR_EXTRACTOR(datasrc_cfg, config1, 15, 11);
HISI_PMU_EVENT_ATTR_EXTRACTOR(datasrc_skt, config1, 16, 16);
+HISI_PMU_EVENT_ATTR_EXTRACTOR(tt_core, config2, 15, 0);
-static void hisi_l3c_pmu_config_req_tracetag(struct perf_event *event)
+struct hisi_l3c_pmu {
+ struct hisi_pmu l3c_pmu;
+
+ /* MMIO and IRQ resources for extension events */
+ void __iomem *ext_base[L3C_MAX_EXT];
+ int ext_irq[L3C_MAX_EXT];
+ int ext_num;
+};
+
+#define to_hisi_l3c_pmu(_l3c_pmu) \
+ container_of(_l3c_pmu, struct hisi_l3c_pmu, l3c_pmu)
+
+/*
+ * The hardware counter idx used in counter enable/disable,
+ * interrupt enable/disable and status check, etc.
+ */
+#define L3C_HW_IDX(_cntr_idx) ((_cntr_idx) % L3C_NR_COUNTERS)
+
+/* Range of ext counters in used mask. */
+#define L3C_CNTR_EXT_L(_ext) (((_ext) + 1) * L3C_NR_COUNTERS)
+#define L3C_CNTR_EXT_H(_ext) (((_ext) + 2) * L3C_NR_COUNTERS)
+
+struct hisi_l3c_pmu_ext {
+ bool support_ext;
+};
+
+static bool support_ext(struct hisi_l3c_pmu *pmu)
+{
+ struct hisi_l3c_pmu_ext *l3c_pmu_ext = pmu->l3c_pmu.dev_info->private;
+
+ return l3c_pmu_ext->support_ext;
+}
+
+static int hisi_l3c_pmu_get_event_idx(struct perf_event *event)
{
struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu);
+ struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu);
+ unsigned long *used_mask = l3c_pmu->pmu_events.used_mask;
+ int ext = hisi_get_ext(event);
+ int idx;
+
+ /*
+ * For an L3C PMU that supports extension events, we can monitor at most
+ * 2 * num_counters to 3 * num_counters events, depending on the number
+ * of ext regions supported by the hardware. Thus use bits
+ * [0, num_counters - 1] for normal events and bits
+ * [ext * num_counters, (ext + 1) * num_counters - 1] for extension
+ * events. The idx allocation remains unchanged for normal events, and
+ * we can also use the idx to distinguish whether an event is an
+ * extension event or not.
+ *
+ * Since normal events and extension events live in different address
+ * spaces, save the base address to event->hw.event_base.
+ */
+ if (ext && !support_ext(hisi_l3c_pmu))
+ return -EOPNOTSUPP;
+
+ if (ext)
+ event->hw.event_base = (unsigned long)hisi_l3c_pmu->ext_base[ext - 1];
+ else
+ event->hw.event_base = (unsigned long)l3c_pmu->base;
+
+ ext -= 1;
+ idx = find_next_zero_bit(used_mask, L3C_CNTR_EXT_H(ext), L3C_CNTR_EXT_L(ext));
+
+ if (idx >= L3C_CNTR_EXT_H(ext))
+ return -EAGAIN;
+
+ set_bit(idx, used_mask);
+
+ return idx;
+}
+
+static u32 hisi_l3c_pmu_event_readl(struct hw_perf_event *hwc, u32 reg)
+{
+ return readl((void __iomem *)hwc->event_base + reg);
+}
+
+static void hisi_l3c_pmu_event_writel(struct hw_perf_event *hwc, u32 reg, u32 val)
+{
+ writel(val, (void __iomem *)hwc->event_base + reg);
+}
+
+static u64 hisi_l3c_pmu_event_readq(struct hw_perf_event *hwc, u32 reg)
+{
+ return readq((void __iomem *)hwc->event_base + reg);
+}
+
+static void hisi_l3c_pmu_event_writeq(struct hw_perf_event *hwc, u32 reg, u64 val)
+{
+ writeq(val, (void __iomem *)hwc->event_base + reg);
+}
+
+static void hisi_l3c_pmu_config_req_tracetag(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
u32 tt_req = hisi_get_tt_req(event);
if (tt_req) {
u32 val;
/* Set request-type for tracetag */
- val = readl(l3c_pmu->base + L3C_TRACETAG_CTRL);
+ val = hisi_l3c_pmu_event_readl(hwc, L3C_TRACETAG_CTRL);
val |= tt_req << L3C_TRACETAG_REQ_SHIFT;
val |= L3C_TRACETAG_REQ_EN;
- writel(val, l3c_pmu->base + L3C_TRACETAG_CTRL);
+ hisi_l3c_pmu_event_writel(hwc, L3C_TRACETAG_CTRL, val);
/* Enable request-tracetag statistics */
- val = readl(l3c_pmu->base + L3C_PERF_CTRL);
+ val = hisi_l3c_pmu_event_readl(hwc, L3C_PERF_CTRL);
val |= L3C_TRACETAG_EN;
- writel(val, l3c_pmu->base + L3C_PERF_CTRL);
+ hisi_l3c_pmu_event_writel(hwc, L3C_PERF_CTRL, val);
}
}
static void hisi_l3c_pmu_clear_req_tracetag(struct perf_event *event)
{
- struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
u32 tt_req = hisi_get_tt_req(event);
if (tt_req) {
u32 val;
/* Clear request-type */
- val = readl(l3c_pmu->base + L3C_TRACETAG_CTRL);
+ val = hisi_l3c_pmu_event_readl(hwc, L3C_TRACETAG_CTRL);
val &= ~(tt_req << L3C_TRACETAG_REQ_SHIFT);
val &= ~L3C_TRACETAG_REQ_EN;
- writel(val, l3c_pmu->base + L3C_TRACETAG_CTRL);
+ hisi_l3c_pmu_event_writel(hwc, L3C_TRACETAG_CTRL, val);
/* Disable request-tracetag statistics */
- val = readl(l3c_pmu->base + L3C_PERF_CTRL);
+ val = hisi_l3c_pmu_event_readl(hwc, L3C_PERF_CTRL);
val &= ~L3C_TRACETAG_EN;
- writel(val, l3c_pmu->base + L3C_PERF_CTRL);
+ hisi_l3c_pmu_event_writel(hwc, L3C_PERF_CTRL, val);
}
}
static void hisi_l3c_pmu_write_ds(struct perf_event *event, u32 ds_cfg)
{
- struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu);
struct hw_perf_event *hwc = &event->hw;
u32 reg, reg_idx, shift, val;
- int idx = hwc->idx;
+ int idx = L3C_HW_IDX(hwc->idx);
/*
* Select the appropriate datasource register(L3C_DATSRC_TYPE0/1).
@@ -120,15 +214,15 @@ static void hisi_l3c_pmu_write_ds(struct perf_event *event, u32 ds_cfg)
reg_idx = idx % 4;
shift = 8 * reg_idx;
- val = readl(l3c_pmu->base + reg);
+ val = hisi_l3c_pmu_event_readl(hwc, reg);
val &= ~(L3C_DATSRC_MASK << shift);
val |= ds_cfg << shift;
- writel(val, l3c_pmu->base + reg);
+ hisi_l3c_pmu_event_writel(hwc, reg, val);
}
static void hisi_l3c_pmu_config_ds(struct perf_event *event)
{
- struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
u32 ds_cfg = hisi_get_datasrc_cfg(event);
u32 ds_skt = hisi_get_datasrc_skt(event);
@@ -138,15 +232,15 @@ static void hisi_l3c_pmu_config_ds(struct perf_event *event)
if (ds_skt) {
u32 val;
- val = readl(l3c_pmu->base + L3C_DATSRC_CTRL);
+ val = hisi_l3c_pmu_event_readl(hwc, L3C_DATSRC_CTRL);
val |= L3C_DATSRC_SKT_EN;
- writel(val, l3c_pmu->base + L3C_DATSRC_CTRL);
+ hisi_l3c_pmu_event_writel(hwc, L3C_DATSRC_CTRL, val);
}
}
static void hisi_l3c_pmu_clear_ds(struct perf_event *event)
{
- struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
u32 ds_cfg = hisi_get_datasrc_cfg(event);
u32 ds_skt = hisi_get_datasrc_skt(event);
@@ -156,57 +250,63 @@ static void hisi_l3c_pmu_clear_ds(struct perf_event *event)
if (ds_skt) {
u32 val;
- val = readl(l3c_pmu->base + L3C_DATSRC_CTRL);
+ val = hisi_l3c_pmu_event_readl(hwc, L3C_DATSRC_CTRL);
val &= ~L3C_DATSRC_SKT_EN;
- writel(val, l3c_pmu->base + L3C_DATSRC_CTRL);
+ hisi_l3c_pmu_event_writel(hwc, L3C_DATSRC_CTRL, val);
}
}
static void hisi_l3c_pmu_config_core_tracetag(struct perf_event *event)
{
- struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
u32 core = hisi_get_tt_core(event);
if (core) {
u32 val;
/* Config and enable core information */
- writel(core, l3c_pmu->base + L3C_CORE_CTRL);
- val = readl(l3c_pmu->base + L3C_PERF_CTRL);
+ hisi_l3c_pmu_event_writel(hwc, L3C_CORE_CTRL, core);
+ val = hisi_l3c_pmu_event_readl(hwc, L3C_PERF_CTRL);
val |= L3C_CORE_EN;
- writel(val, l3c_pmu->base + L3C_PERF_CTRL);
+ hisi_l3c_pmu_event_writel(hwc, L3C_PERF_CTRL, val);
/* Enable core-tracetag statistics */
- val = readl(l3c_pmu->base + L3C_TRACETAG_CTRL);
+ val = hisi_l3c_pmu_event_readl(hwc, L3C_TRACETAG_CTRL);
val |= L3C_TRACETAG_CORE_EN;
- writel(val, l3c_pmu->base + L3C_TRACETAG_CTRL);
+ hisi_l3c_pmu_event_writel(hwc, L3C_TRACETAG_CTRL, val);
}
}
static void hisi_l3c_pmu_clear_core_tracetag(struct perf_event *event)
{
- struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
u32 core = hisi_get_tt_core(event);
if (core) {
u32 val;
/* Clear core information */
- writel(L3C_COER_NONE, l3c_pmu->base + L3C_CORE_CTRL);
- val = readl(l3c_pmu->base + L3C_PERF_CTRL);
+ hisi_l3c_pmu_event_writel(hwc, L3C_CORE_CTRL, L3C_COER_NONE);
+ val = hisi_l3c_pmu_event_readl(hwc, L3C_PERF_CTRL);
val &= ~L3C_CORE_EN;
- writel(val, l3c_pmu->base + L3C_PERF_CTRL);
+ hisi_l3c_pmu_event_writel(hwc, L3C_PERF_CTRL, val);
/* Disable core-tracetag statistics */
- val = readl(l3c_pmu->base + L3C_TRACETAG_CTRL);
+ val = hisi_l3c_pmu_event_readl(hwc, L3C_TRACETAG_CTRL);
val &= ~L3C_TRACETAG_CORE_EN;
- writel(val, l3c_pmu->base + L3C_TRACETAG_CTRL);
+ hisi_l3c_pmu_event_writel(hwc, L3C_TRACETAG_CTRL, val);
}
}
+static bool hisi_l3c_pmu_have_filter(struct perf_event *event)
+{
+ return hisi_get_tt_req(event) || hisi_get_tt_core(event) ||
+ hisi_get_datasrc_cfg(event) || hisi_get_datasrc_skt(event);
+}
+
static void hisi_l3c_pmu_enable_filter(struct perf_event *event)
{
- if (event->attr.config1 != 0x0) {
+ if (hisi_l3c_pmu_have_filter(event)) {
hisi_l3c_pmu_config_req_tracetag(event);
hisi_l3c_pmu_config_core_tracetag(event);
hisi_l3c_pmu_config_ds(event);
@@ -215,38 +315,53 @@ static void hisi_l3c_pmu_enable_filter(struct perf_event *event)
static void hisi_l3c_pmu_disable_filter(struct perf_event *event)
{
- if (event->attr.config1 != 0x0) {
+ if (hisi_l3c_pmu_have_filter(event)) {
hisi_l3c_pmu_clear_ds(event);
hisi_l3c_pmu_clear_core_tracetag(event);
hisi_l3c_pmu_clear_req_tracetag(event);
}
}
+static int hisi_l3c_pmu_check_filter(struct perf_event *event)
+{
+ struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu);
+ struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu);
+ int ext = hisi_get_ext(event);
+
+ if (ext < 0 || ext > hisi_l3c_pmu->ext_num)
+ return -EINVAL;
+
+ return 0;
+}
+
/*
* Select the counter register offset using the counter index
*/
static u32 hisi_l3c_pmu_get_counter_offset(int cntr_idx)
{
- return (L3C_CNTR0_LOWER + (cntr_idx * 8));
+ return L3C_CNTR0_LOWER + L3C_HW_IDX(cntr_idx) * 8;
}
static u64 hisi_l3c_pmu_read_counter(struct hisi_pmu *l3c_pmu,
struct hw_perf_event *hwc)
{
- return readq(l3c_pmu->base + hisi_l3c_pmu_get_counter_offset(hwc->idx));
+ return hisi_l3c_pmu_event_readq(hwc, hisi_l3c_pmu_get_counter_offset(hwc->idx));
}
static void hisi_l3c_pmu_write_counter(struct hisi_pmu *l3c_pmu,
struct hw_perf_event *hwc, u64 val)
{
- writeq(val, l3c_pmu->base + hisi_l3c_pmu_get_counter_offset(hwc->idx));
+ hisi_l3c_pmu_event_writeq(hwc, hisi_l3c_pmu_get_counter_offset(hwc->idx), val);
}
static void hisi_l3c_pmu_write_evtype(struct hisi_pmu *l3c_pmu, int idx,
u32 type)
{
+ struct hw_perf_event *hwc = &l3c_pmu->pmu_events.hw_events[idx]->hw;
u32 reg, reg_idx, shift, val;
+ idx = L3C_HW_IDX(idx);
+
/*
* Select the appropriate event select register(L3C_EVENT_TYPE0/1).
* There are 2 event select registers for the 8 hardware counters.
@@ -259,36 +374,72 @@ static void hisi_l3c_pmu_write_evtype(struct hisi_pmu *l3c_pmu, int idx,
shift = 8 * reg_idx;
/* Write event code to L3C_EVENT_TYPEx Register */
- val = readl(l3c_pmu->base + reg);
+ val = hisi_l3c_pmu_event_readl(hwc, reg);
val &= ~(L3C_EVTYPE_NONE << shift);
- val |= (type << shift);
- writel(val, l3c_pmu->base + reg);
+ val |= type << shift;
+ hisi_l3c_pmu_event_writel(hwc, reg, val);
}
static void hisi_l3c_pmu_start_counters(struct hisi_pmu *l3c_pmu)
{
+ struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu);
+ unsigned long *used_mask = l3c_pmu->pmu_events.used_mask;
+ unsigned long used_cntr = find_first_bit(used_mask, l3c_pmu->num_counters);
u32 val;
+ int i;
/*
- * Set perf_enable bit in L3C_PERF_CTRL register to start counting
- * for all enabled counters.
+ * Check if any counter belongs to the normal range (instead of ext
+ * range). If so, enable it.
*/
- val = readl(l3c_pmu->base + L3C_PERF_CTRL);
- val |= L3C_PERF_CTRL_EN;
- writel(val, l3c_pmu->base + L3C_PERF_CTRL);
+ if (used_cntr < L3C_NR_COUNTERS) {
+ val = readl(l3c_pmu->base + L3C_PERF_CTRL);
+ val |= L3C_PERF_CTRL_EN;
+ writel(val, l3c_pmu->base + L3C_PERF_CTRL);
+ }
+
+ /* Also enable counting on any ext range that has counters in use. */
+ for (i = 0; i < hisi_l3c_pmu->ext_num; i++) {
+ /* Find a used counter in this ext range; skip the range if there is none. */
+ used_cntr = find_next_bit(used_mask, L3C_CNTR_EXT_H(i), L3C_CNTR_EXT_L(i));
+ if (used_cntr >= L3C_CNTR_EXT_H(i))
+ continue;
+
+ val = readl(hisi_l3c_pmu->ext_base[i] + L3C_PERF_CTRL);
+ val |= L3C_PERF_CTRL_EN;
+ writel(val, hisi_l3c_pmu->ext_base[i] + L3C_PERF_CTRL);
+ }
}
static void hisi_l3c_pmu_stop_counters(struct hisi_pmu *l3c_pmu)
{
+ struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu);
+ unsigned long *used_mask = l3c_pmu->pmu_events.used_mask;
+ unsigned long used_cntr = find_first_bit(used_mask, l3c_pmu->num_counters);
u32 val;
+ int i;
/*
- * Clear perf_enable bit in L3C_PERF_CTRL register to stop counting
- * for all enabled counters.
+ * Check if any counter belongs to the normal range (instead of ext
+ * range). If so, stop it.
*/
- val = readl(l3c_pmu->base + L3C_PERF_CTRL);
- val &= ~(L3C_PERF_CTRL_EN);
- writel(val, l3c_pmu->base + L3C_PERF_CTRL);
+ if (used_cntr < L3C_NR_COUNTERS) {
+ val = readl(l3c_pmu->base + L3C_PERF_CTRL);
+ val &= ~L3C_PERF_CTRL_EN;
+ writel(val, l3c_pmu->base + L3C_PERF_CTRL);
+ }
+
+ /* Also stop counting on any ext range that has counters in use. */
+ for (i = 0; i < hisi_l3c_pmu->ext_num; i++) {
+ /* Find a used counter in this ext range; skip the range if there is none. */
+ used_cntr = find_next_bit(used_mask, L3C_CNTR_EXT_H(i), L3C_CNTR_EXT_L(i));
+ if (used_cntr >= L3C_CNTR_EXT_H(i))
+ continue;
+
+ val = readl(hisi_l3c_pmu->ext_base[i] + L3C_PERF_CTRL);
+ val &= ~L3C_PERF_CTRL_EN;
+ writel(val, hisi_l3c_pmu->ext_base[i] + L3C_PERF_CTRL);
+ }
}
static void hisi_l3c_pmu_enable_counter(struct hisi_pmu *l3c_pmu,
@@ -297,9 +448,9 @@ static void hisi_l3c_pmu_enable_counter(struct hisi_pmu *l3c_pmu,
u32 val;
/* Enable counter index in L3C_EVENT_CTRL register */
- val = readl(l3c_pmu->base + L3C_EVENT_CTRL);
- val |= (1 << hwc->idx);
- writel(val, l3c_pmu->base + L3C_EVENT_CTRL);
+ val = hisi_l3c_pmu_event_readl(hwc, L3C_EVENT_CTRL);
+ val |= 1 << L3C_HW_IDX(hwc->idx);
+ hisi_l3c_pmu_event_writel(hwc, L3C_EVENT_CTRL, val);
}
static void hisi_l3c_pmu_disable_counter(struct hisi_pmu *l3c_pmu,
@@ -308,9 +459,9 @@ static void hisi_l3c_pmu_disable_counter(struct hisi_pmu *l3c_pmu,
u32 val;
/* Clear counter index in L3C_EVENT_CTRL register */
- val = readl(l3c_pmu->base + L3C_EVENT_CTRL);
- val &= ~(1 << hwc->idx);
- writel(val, l3c_pmu->base + L3C_EVENT_CTRL);
+ val = hisi_l3c_pmu_event_readl(hwc, L3C_EVENT_CTRL);
+ val &= ~(1 << L3C_HW_IDX(hwc->idx));
+ hisi_l3c_pmu_event_writel(hwc, L3C_EVENT_CTRL, val);
}
static void hisi_l3c_pmu_enable_counter_int(struct hisi_pmu *l3c_pmu,
@@ -318,10 +469,10 @@ static void hisi_l3c_pmu_enable_counter_int(struct hisi_pmu *l3c_pmu,
{
u32 val;
- val = readl(l3c_pmu->base + L3C_INT_MASK);
+ val = hisi_l3c_pmu_event_readl(hwc, L3C_INT_MASK);
/* Write 0 to enable interrupt */
- val &= ~(1 << hwc->idx);
- writel(val, l3c_pmu->base + L3C_INT_MASK);
+ val &= ~(1 << L3C_HW_IDX(hwc->idx));
+ hisi_l3c_pmu_event_writel(hwc, L3C_INT_MASK, val);
}
static void hisi_l3c_pmu_disable_counter_int(struct hisi_pmu *l3c_pmu,
@@ -329,28 +480,37 @@ static void hisi_l3c_pmu_disable_counter_int(struct hisi_pmu *l3c_pmu,
{
u32 val;
- val = readl(l3c_pmu->base + L3C_INT_MASK);
+ val = hisi_l3c_pmu_event_readl(hwc, L3C_INT_MASK);
/* Write 1 to mask interrupt */
- val |= (1 << hwc->idx);
- writel(val, l3c_pmu->base + L3C_INT_MASK);
+ val |= 1 << L3C_HW_IDX(hwc->idx);
+ hisi_l3c_pmu_event_writel(hwc, L3C_INT_MASK, val);
}
static u32 hisi_l3c_pmu_get_int_status(struct hisi_pmu *l3c_pmu)
{
- return readl(l3c_pmu->base + L3C_INT_STATUS);
+ struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu);
+ u32 ext_int, status, status_ext = 0;
+ int i;
+
+ status = readl(l3c_pmu->base + L3C_INT_STATUS);
+
+ if (!support_ext(hisi_l3c_pmu))
+ return status;
+
+ for (i = 0; i < hisi_l3c_pmu->ext_num; i++) {
+ ext_int = readl(hisi_l3c_pmu->ext_base[i] + L3C_INT_STATUS);
+ status_ext |= ext_int << (L3C_NR_COUNTERS * i);
+ }
+
+ return status | (status_ext << L3C_NR_COUNTERS);
}
static void hisi_l3c_pmu_clear_int_status(struct hisi_pmu *l3c_pmu, int idx)
{
- writel(1 << idx, l3c_pmu->base + L3C_INT_CLEAR);
-}
+ struct hw_perf_event *hwc = &l3c_pmu->pmu_events.hw_events[idx]->hw;
-static const struct acpi_device_id hisi_l3c_pmu_acpi_match[] = {
- { "HISI0213", },
- { "HISI0214", },
- {}
-};
-MODULE_DEVICE_TABLE(acpi, hisi_l3c_pmu_acpi_match);
+ hisi_l3c_pmu_event_writel(hwc, L3C_INT_CLEAR, 1 << L3C_HW_IDX(idx));
+}
static int hisi_l3c_pmu_init_data(struct platform_device *pdev,
struct hisi_pmu *l3c_pmu)
@@ -371,6 +531,10 @@ static int hisi_l3c_pmu_init_data(struct platform_device *pdev,
return -EINVAL;
}
+ l3c_pmu->dev_info = device_get_match_data(&pdev->dev);
+ if (!l3c_pmu->dev_info)
+ return -ENODEV;
+
l3c_pmu->base = devm_platform_ioremap_resource(pdev, 0);
if (IS_ERR(l3c_pmu->base)) {
dev_err(&pdev->dev, "ioremap failed for l3c_pmu resource\n");
@@ -382,6 +546,50 @@ static int hisi_l3c_pmu_init_data(struct platform_device *pdev,
return 0;
}
+static int hisi_l3c_pmu_init_ext(struct hisi_pmu *l3c_pmu, struct platform_device *pdev)
+{
+ struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu);
+ int ret, irq, ext_num, i;
+ char *irqname;
+
+ /* A HiSilicon L3C PMU supporting ext should have more than one IRQ resource. */
+ ext_num = platform_irq_count(pdev);
+ if (ext_num < L3C_MAX_EXT)
+ return -ENODEV;
+
+ /*
+ * The number of ext regions supported equals the number of IRQs minus 1,
+ * since one of the IRQs belongs to the normal part of the PMU.
+ */
+ hisi_l3c_pmu->ext_num = ext_num - 1;
+
+ for (i = 0; i < hisi_l3c_pmu->ext_num; i++) {
+ hisi_l3c_pmu->ext_base[i] = devm_platform_ioremap_resource(pdev, i + 1);
+ if (IS_ERR(hisi_l3c_pmu->ext_base[i]))
+ return PTR_ERR(hisi_l3c_pmu->ext_base[i]);
+
+ irq = platform_get_irq(pdev, i + 1);
+ if (irq < 0)
+ return irq;
+
+ irqname = devm_kasprintf(&pdev->dev, GFP_KERNEL, "%s ext%d",
+ dev_name(&pdev->dev), i + 1);
+ if (!irqname)
+ return -ENOMEM;
+
+ ret = devm_request_irq(&pdev->dev, irq, hisi_uncore_pmu_isr,
+ IRQF_NOBALANCING | IRQF_NO_THREAD,
+ irqname, l3c_pmu);
+ if (ret < 0)
+ return dev_err_probe(&pdev->dev, ret,
+ "Fail to request EXT IRQ: %d.\n", irq);
+
+ hisi_l3c_pmu->ext_irq[i] = irq;
+ }
+
+ return 0;
+}
+
static struct attribute *hisi_l3c_pmu_v1_format_attr[] = {
HISI_PMU_FORMAT_ATTR(event, "config:0-7"),
NULL,
@@ -394,7 +602,7 @@ static const struct attribute_group hisi_l3c_pmu_v1_format_group = {
static struct attribute *hisi_l3c_pmu_v2_format_attr[] = {
HISI_PMU_FORMAT_ATTR(event, "config:0-7"),
- HISI_PMU_FORMAT_ATTR(tt_core, "config1:0-7"),
+ HISI_PMU_FORMAT_ATTR(tt_core, "config2:0-15"),
HISI_PMU_FORMAT_ATTR(tt_req, "config1:8-10"),
HISI_PMU_FORMAT_ATTR(datasrc_cfg, "config1:11-15"),
HISI_PMU_FORMAT_ATTR(datasrc_skt, "config1:16"),
@@ -406,6 +614,19 @@ static const struct attribute_group hisi_l3c_pmu_v2_format_group = {
.attrs = hisi_l3c_pmu_v2_format_attr,
};
+static struct attribute *hisi_l3c_pmu_v3_format_attr[] = {
+ HISI_PMU_FORMAT_ATTR(event, "config:0-7"),
+ HISI_PMU_FORMAT_ATTR(ext, "config:16-17"),
+ HISI_PMU_FORMAT_ATTR(tt_req, "config1:8-10"),
+ HISI_PMU_FORMAT_ATTR(tt_core, "config2:0-15"),
+ NULL
+};
+
+static const struct attribute_group hisi_l3c_pmu_v3_format_group = {
+ .name = "format",
+ .attrs = hisi_l3c_pmu_v3_format_attr,
+};
+
static struct attribute *hisi_l3c_pmu_v1_events_attr[] = {
HISI_PMU_EVENT_ATTR(rd_cpipe, 0x00),
HISI_PMU_EVENT_ATTR(wr_cpipe, 0x01),
@@ -441,6 +662,26 @@ static const struct attribute_group hisi_l3c_pmu_v2_events_group = {
.attrs = hisi_l3c_pmu_v2_events_attr,
};
+static struct attribute *hisi_l3c_pmu_v3_events_attr[] = {
+ HISI_PMU_EVENT_ATTR(rd_spipe, 0x18),
+ HISI_PMU_EVENT_ATTR(rd_hit_spipe, 0x19),
+ HISI_PMU_EVENT_ATTR(wr_spipe, 0x1a),
+ HISI_PMU_EVENT_ATTR(wr_hit_spipe, 0x1b),
+ HISI_PMU_EVENT_ATTR(io_rd_spipe, 0x1c),
+ HISI_PMU_EVENT_ATTR(io_rd_hit_spipe, 0x1d),
+ HISI_PMU_EVENT_ATTR(io_wr_spipe, 0x1e),
+ HISI_PMU_EVENT_ATTR(io_wr_hit_spipe, 0x1f),
+ HISI_PMU_EVENT_ATTR(cycles, 0x7f),
+ HISI_PMU_EVENT_ATTR(l3c_ref, 0xbc),
+ HISI_PMU_EVENT_ATTR(l3c2ring, 0xbd),
+ NULL
+};
+
+static const struct attribute_group hisi_l3c_pmu_v3_events_group = {
+ .name = "events",
+ .attrs = hisi_l3c_pmu_v3_events_attr,
+};
+
static const struct attribute_group *hisi_l3c_pmu_v1_attr_groups[] = {
&hisi_l3c_pmu_v1_format_group,
&hisi_l3c_pmu_v1_events_group,
@@ -457,9 +698,46 @@ static const struct attribute_group *hisi_l3c_pmu_v2_attr_groups[] = {
NULL
};
+static const struct attribute_group *hisi_l3c_pmu_v3_attr_groups[] = {
+ &hisi_l3c_pmu_v3_format_group,
+ &hisi_l3c_pmu_v3_events_group,
+ &hisi_pmu_cpumask_attr_group,
+ &hisi_pmu_identifier_group,
+ NULL
+};
+
+static struct hisi_l3c_pmu_ext hisi_l3c_pmu_support_ext = {
+ .support_ext = true,
+};
+
+static struct hisi_l3c_pmu_ext hisi_l3c_pmu_not_support_ext = {
+ .support_ext = false,
+};
+
+static const struct hisi_pmu_dev_info hisi_l3c_pmu_v1 = {
+ .attr_groups = hisi_l3c_pmu_v1_attr_groups,
+ .counter_bits = 48,
+ .check_event = L3C_V1_NR_EVENTS,
+ .private = &hisi_l3c_pmu_not_support_ext,
+};
+
+static const struct hisi_pmu_dev_info hisi_l3c_pmu_v2 = {
+ .attr_groups = hisi_l3c_pmu_v2_attr_groups,
+ .counter_bits = 64,
+ .check_event = L3C_V2_NR_EVENTS,
+ .private = &hisi_l3c_pmu_not_support_ext,
+};
+
+static const struct hisi_pmu_dev_info hisi_l3c_pmu_v3 = {
+ .attr_groups = hisi_l3c_pmu_v3_attr_groups,
+ .counter_bits = 64,
+ .check_event = L3C_V2_NR_EVENTS,
+ .private = &hisi_l3c_pmu_support_ext,
+};
+
static const struct hisi_uncore_ops hisi_uncore_l3c_ops = {
.write_evtype = hisi_l3c_pmu_write_evtype,
- .get_event_idx = hisi_uncore_pmu_get_event_idx,
+ .get_event_idx = hisi_l3c_pmu_get_event_idx,
.start_counters = hisi_l3c_pmu_start_counters,
.stop_counters = hisi_l3c_pmu_stop_counters,
.enable_counter = hisi_l3c_pmu_enable_counter,
@@ -472,11 +750,14 @@ static const struct hisi_uncore_ops hisi_uncore_l3c_ops = {
.clear_int_status = hisi_l3c_pmu_clear_int_status,
.enable_filter = hisi_l3c_pmu_enable_filter,
.disable_filter = hisi_l3c_pmu_disable_filter,
+ .check_filter = hisi_l3c_pmu_check_filter,
};
static int hisi_l3c_pmu_dev_probe(struct platform_device *pdev,
struct hisi_pmu *l3c_pmu)
{
+ struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu);
+ struct hisi_l3c_pmu_ext *l3c_pmu_dev_ext;
int ret;
ret = hisi_l3c_pmu_init_data(pdev, l3c_pmu);
@@ -487,42 +768,55 @@ static int hisi_l3c_pmu_dev_probe(struct platform_device *pdev,
if (ret)
return ret;
- if (l3c_pmu->identifier >= HISI_PMU_V2) {
- l3c_pmu->counter_bits = 64;
- l3c_pmu->check_event = L3C_V2_NR_EVENTS;
- l3c_pmu->pmu_events.attr_groups = hisi_l3c_pmu_v2_attr_groups;
- } else {
- l3c_pmu->counter_bits = 48;
- l3c_pmu->check_event = L3C_V1_NR_EVENTS;
- l3c_pmu->pmu_events.attr_groups = hisi_l3c_pmu_v1_attr_groups;
- }
-
+ l3c_pmu->pmu_events.attr_groups = l3c_pmu->dev_info->attr_groups;
+ l3c_pmu->counter_bits = l3c_pmu->dev_info->counter_bits;
+ l3c_pmu->check_event = l3c_pmu->dev_info->check_event;
l3c_pmu->num_counters = L3C_NR_COUNTERS;
l3c_pmu->ops = &hisi_uncore_l3c_ops;
l3c_pmu->dev = &pdev->dev;
l3c_pmu->on_cpu = -1;
+ l3c_pmu_dev_ext = l3c_pmu->dev_info->private;
+ if (l3c_pmu_dev_ext->support_ext) {
+ ret = hisi_l3c_pmu_init_ext(l3c_pmu, pdev);
+ if (ret)
+ return ret;
+ /*
+ * The extension events have their own counters, with the same
+ * number of counters as the normal events. So we can monitor up to
+ * (ext_num + 1) * num_counters events in total.
+ */
+ l3c_pmu->num_counters += hisi_l3c_pmu->ext_num * L3C_NR_COUNTERS;
+ }
+
return 0;
}
static int hisi_l3c_pmu_probe(struct platform_device *pdev)
{
+ struct hisi_l3c_pmu *hisi_l3c_pmu;
struct hisi_pmu *l3c_pmu;
char *name;
int ret;
- l3c_pmu = devm_kzalloc(&pdev->dev, sizeof(*l3c_pmu), GFP_KERNEL);
- if (!l3c_pmu)
+ hisi_l3c_pmu = devm_kzalloc(&pdev->dev, sizeof(*hisi_l3c_pmu), GFP_KERNEL);
+ if (!hisi_l3c_pmu)
return -ENOMEM;
+ l3c_pmu = &hisi_l3c_pmu->l3c_pmu;
platform_set_drvdata(pdev, l3c_pmu);
ret = hisi_l3c_pmu_dev_probe(pdev, l3c_pmu);
if (ret)
return ret;
- name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_sccl%d_l3c%d",
- l3c_pmu->topo.sccl_id, l3c_pmu->topo.ccl_id);
+ if (l3c_pmu->topo.sub_id >= 0)
+ name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_sccl%d_l3c%d_%d",
+ l3c_pmu->topo.sccl_id, l3c_pmu->topo.ccl_id,
+ l3c_pmu->topo.sub_id);
+ else
+ name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_sccl%d_l3c%d",
+ l3c_pmu->topo.sccl_id, l3c_pmu->topo.ccl_id);
if (!name)
return -ENOMEM;
@@ -554,6 +848,14 @@ static void hisi_l3c_pmu_remove(struct platform_device *pdev)
&l3c_pmu->node);
}
+static const struct acpi_device_id hisi_l3c_pmu_acpi_match[] = {
+ { "HISI0213", (kernel_ulong_t)&hisi_l3c_pmu_v1 },
+ { "HISI0214", (kernel_ulong_t)&hisi_l3c_pmu_v2 },
+ { "HISI0215", (kernel_ulong_t)&hisi_l3c_pmu_v3 },
+ {}
+};
+MODULE_DEVICE_TABLE(acpi, hisi_l3c_pmu_acpi_match);
+
static struct platform_driver hisi_l3c_pmu_driver = {
.driver = {
.name = "hisi_l3c_pmu",
@@ -564,14 +866,60 @@ static struct platform_driver hisi_l3c_pmu_driver = {
.remove = hisi_l3c_pmu_remove,
};
+static int hisi_l3c_pmu_online_cpu(unsigned int cpu, struct hlist_node *node)
+{
+ struct hisi_pmu *l3c_pmu = hlist_entry_safe(node, struct hisi_pmu, node);
+ struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu);
+ int ret, i;
+
+ ret = hisi_uncore_pmu_online_cpu(cpu, node);
+ if (ret)
+ return ret;
+
+ /* Skip ext IRQ migration for an L3C PMU that doesn't support ext events. */
+ if (!support_ext(hisi_l3c_pmu))
+ return 0;
+
+ for (i = 0; i < hisi_l3c_pmu->ext_num; i++)
+ WARN_ON(irq_set_affinity(hisi_l3c_pmu->ext_irq[i],
+ cpumask_of(l3c_pmu->on_cpu)));
+
+ return 0;
+}
+
+static int hisi_l3c_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
+{
+ struct hisi_pmu *l3c_pmu = hlist_entry_safe(node, struct hisi_pmu, node);
+ struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu);
+ int ret, i;
+
+ ret = hisi_uncore_pmu_offline_cpu(cpu, node);
+ if (ret)
+ return ret;
+
+ /* If we failed to find any available CPU, skip IRQ migration. */
+ if (l3c_pmu->on_cpu < 0)
+ return 0;
+
+ /* Skip ext IRQ migration for an L3C PMU that doesn't support ext events. */
+ if (!support_ext(hisi_l3c_pmu))
+ return 0;
+
+ for (i = 0; i < hisi_l3c_pmu->ext_num; i++)
+ WARN_ON(irq_set_affinity(hisi_l3c_pmu->ext_irq[i],
+ cpumask_of(l3c_pmu->on_cpu)));
+
+ return 0;
+}
+
static int __init hisi_l3c_pmu_module_init(void)
{
int ret;
ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_HISI_L3_ONLINE,
"AP_PERF_ARM_HISI_L3_ONLINE",
- hisi_uncore_pmu_online_cpu,
- hisi_uncore_pmu_offline_cpu);
+ hisi_l3c_pmu_online_cpu,
+ hisi_l3c_pmu_offline_cpu);
if (ret) {
pr_err("L3C PMU: Error setup hotplug, ret = %d\n", ret);
return ret;
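
To make the index scheme in hisi_l3c_pmu_get_event_idx() and the L3C_HW_IDX()/L3C_CNTR_EXT_*() helpers above easier to follow, here is a minimal standalone sketch (illustration only, not kernel code) that mirrors those macros, assuming L3C_NR_COUNTERS is 8 as defined earlier in the file.

/* Standalone illustration of the L3C counter index layout (not kernel code). */
#include <assert.h>

#define NR_COUNTERS	8				/* L3C_NR_COUNTERS */
#define HW_IDX(i)	((i) % NR_COUNTERS)		/* L3C_HW_IDX() */
#define EXT_L(e)	(((e) + 1) * NR_COUNTERS)	/* L3C_CNTR_EXT_L() */
#define EXT_H(e)	(((e) + 2) * NR_COUNTERS)	/* L3C_CNTR_EXT_H() */

int main(void)
{
	/* Normal events (ext field 0): after "ext -= 1" the window is [0, 8). */
	assert(EXT_L(-1) == 0 && EXT_H(-1) == 8);
	/* First ext region (ext field 1): window [8, 16); hw idx is idx % 8. */
	assert(EXT_L(0) == 8 && EXT_H(0) == 16);
	assert(HW_IDX(13) == 5);
	/* Second ext region (ext field 2): window [16, 24). */
	assert(EXT_L(1) == 16 && EXT_H(1) == 24);
	return 0;
}
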
diff --git a/drivers/perf/hisilicon/hisi_uncore_mn_pmu.c b/drivers/perf/hisilicon/hisi_uncore_mn_pmu.c
new file mode 100644
index 000000000000..4df4eebe243e
--- /dev/null
+++ b/drivers/perf/hisilicon/hisi_uncore_mn_pmu.c
@@ -0,0 +1,411 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * HiSilicon SoC MN uncore Hardware event counters support
+ *
+ * Copyright (c) 2025 HiSilicon Technologies Co., Ltd.
+ */
+#include <linux/cpuhotplug.h>
+#include <linux/interrupt.h>
+#include <linux/iopoll.h>
+#include <linux/irq.h>
+#include <linux/list.h>
+#include <linux/mod_devicetable.h>
+#include <linux/property.h>
+
+#include "hisi_uncore_pmu.h"
+
+/* Dynamic CPU hotplug state used by MN PMU */
+static enum cpuhp_state hisi_mn_pmu_online;
+
+/* MN register definition */
+#define HISI_MN_DYNAMIC_CTRL_REG 0x400
+#define HISI_MN_DYNAMIC_CTRL_EN BIT(0)
+#define HISI_MN_PERF_CTRL_REG 0x408
+#define HISI_MN_PERF_CTRL_EN BIT(6)
+#define HISI_MN_INT_MASK_REG 0x800
+#define HISI_MN_INT_STATUS_REG 0x808
+#define HISI_MN_INT_CLEAR_REG 0x80C
+#define HISI_MN_EVENT_CTRL_REG 0x1C00
+#define HISI_MN_VERSION_REG 0x1C04
+#define HISI_MN_EVTYPE0_REG 0x1d00
+#define HISI_MN_EVTYPE_MASK GENMASK(7, 0)
+#define HISI_MN_CNTR0_REG 0x1e00
+#define HISI_MN_EVTYPE_REGn(evtype0, n) ((evtype0) + (n) * 4)
+#define HISI_MN_CNTR_REGn(cntr0, n) ((cntr0) + (n) * 8)
+
+#define HISI_MN_NR_COUNTERS 4
+#define HISI_MN_TIMEOUT_US 500U
+
+struct hisi_mn_pmu_regs {
+ u32 version;
+ u32 dyn_ctrl;
+ u32 perf_ctrl;
+ u32 int_mask;
+ u32 int_clear;
+ u32 int_status;
+ u32 event_ctrl;
+ u32 event_type0;
+ u32 event_cntr0;
+};
+
+/*
+ * Each event request takes a certain amount of time to complete. If
+ * we are counting a latency-related event, we need to wait for all the
+ * requests to complete. Otherwise, the counter value will be slightly larger.
+ */
+static void hisi_mn_pmu_counter_flush(struct hisi_pmu *mn_pmu)
+{
+ struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private;
+ int ret;
+ u32 val;
+
+ val = readl(mn_pmu->base + reg_info->dyn_ctrl);
+ val |= HISI_MN_DYNAMIC_CTRL_EN;
+ writel(val, mn_pmu->base + reg_info->dyn_ctrl);
+
+ ret = readl_poll_timeout_atomic(mn_pmu->base + reg_info->dyn_ctrl,
+ val, !(val & HISI_MN_DYNAMIC_CTRL_EN),
+ 1, HISI_MN_TIMEOUT_US);
+ if (ret)
+ dev_warn(mn_pmu->dev, "Counter flush timeout\n");
+}
+
+static u64 hisi_mn_pmu_read_counter(struct hisi_pmu *mn_pmu,
+ struct hw_perf_event *hwc)
+{
+ struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private;
+
+ return readq(mn_pmu->base + HISI_MN_CNTR_REGn(reg_info->event_cntr0, hwc->idx));
+}
+
+static void hisi_mn_pmu_write_counter(struct hisi_pmu *mn_pmu,
+ struct hw_perf_event *hwc, u64 val)
+{
+ struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private;
+
+ writeq(val, mn_pmu->base + HISI_MN_CNTR_REGn(reg_info->event_cntr0, hwc->idx));
+}
+
+static void hisi_mn_pmu_write_evtype(struct hisi_pmu *mn_pmu, int idx, u32 type)
+{
+ struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private;
+ u32 val;
+
+ /*
+ * Select the appropriate event select register.
+ * There are 2 32-bit event select registers for the
+ * 8 hardware counters, and each event code is 8 bits wide.
+ */
+ val = readl(mn_pmu->base + HISI_MN_EVTYPE_REGn(reg_info->event_type0, idx / 4));
+ val &= ~(HISI_MN_EVTYPE_MASK << HISI_PMU_EVTYPE_SHIFT(idx));
+ val |= (type << HISI_PMU_EVTYPE_SHIFT(idx));
+ writel(val, mn_pmu->base + HISI_MN_EVTYPE_REGn(reg_info->event_type0, idx / 4));
+}
+
+static void hisi_mn_pmu_start_counters(struct hisi_pmu *mn_pmu)
+{
+ struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private;
+ u32 val;
+
+ val = readl(mn_pmu->base + reg_info->perf_ctrl);
+ val |= HISI_MN_PERF_CTRL_EN;
+ writel(val, mn_pmu->base + reg_info->perf_ctrl);
+}
+
+static void hisi_mn_pmu_stop_counters(struct hisi_pmu *mn_pmu)
+{
+ struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private;
+ u32 val;
+
+ val = readl(mn_pmu->base + reg_info->perf_ctrl);
+ val &= ~HISI_MN_PERF_CTRL_EN;
+ writel(val, mn_pmu->base + reg_info->perf_ctrl);
+
+ hisi_mn_pmu_counter_flush(mn_pmu);
+}
+
+static void hisi_mn_pmu_enable_counter(struct hisi_pmu *mn_pmu,
+ struct hw_perf_event *hwc)
+{
+ struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private;
+ u32 val;
+
+ val = readl(mn_pmu->base + reg_info->event_ctrl);
+ val |= BIT(hwc->idx);
+ writel(val, mn_pmu->base + reg_info->event_ctrl);
+}
+
+static void hisi_mn_pmu_disable_counter(struct hisi_pmu *mn_pmu,
+ struct hw_perf_event *hwc)
+{
+ struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private;
+ u32 val;
+
+ val = readl(mn_pmu->base + reg_info->event_ctrl);
+ val &= ~BIT(hwc->idx);
+ writel(val, mn_pmu->base + reg_info->event_ctrl);
+}
+
+static void hisi_mn_pmu_enable_counter_int(struct hisi_pmu *mn_pmu,
+ struct hw_perf_event *hwc)
+{
+ struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private;
+ u32 val;
+
+ val = readl(mn_pmu->base + reg_info->int_mask);
+ val &= ~BIT(hwc->idx);
+ writel(val, mn_pmu->base + reg_info->int_mask);
+}
+
+static void hisi_mn_pmu_disable_counter_int(struct hisi_pmu *mn_pmu,
+ struct hw_perf_event *hwc)
+{
+ struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private;
+ u32 val;
+
+ val = readl(mn_pmu->base + reg_info->int_mask);
+ val |= BIT(hwc->idx);
+ writel(val, mn_pmu->base + reg_info->int_mask);
+}
+
+static u32 hisi_mn_pmu_get_int_status(struct hisi_pmu *mn_pmu)
+{
+ struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private;
+
+ return readl(mn_pmu->base + reg_info->int_status);
+}
+
+static void hisi_mn_pmu_clear_int_status(struct hisi_pmu *mn_pmu, int idx)
+{
+ struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private;
+
+ writel(BIT(idx), mn_pmu->base + reg_info->int_clear);
+}
+
+static struct attribute *hisi_mn_pmu_format_attr[] = {
+ HISI_PMU_FORMAT_ATTR(event, "config:0-7"),
+ NULL
+};
+
+static const struct attribute_group hisi_mn_pmu_format_group = {
+ .name = "format",
+ .attrs = hisi_mn_pmu_format_attr,
+};
+
+static struct attribute *hisi_mn_pmu_events_attr[] = {
+ HISI_PMU_EVENT_ATTR(req_eobarrier_num, 0x00),
+ HISI_PMU_EVENT_ATTR(req_ecbarrier_num, 0x01),
+ HISI_PMU_EVENT_ATTR(req_dvmop_num, 0x02),
+ HISI_PMU_EVENT_ATTR(req_dvmsync_num, 0x03),
+ HISI_PMU_EVENT_ATTR(req_retry_num, 0x04),
+ HISI_PMU_EVENT_ATTR(req_writenosnp_num, 0x05),
+ HISI_PMU_EVENT_ATTR(req_readnosnp_num, 0x06),
+ HISI_PMU_EVENT_ATTR(snp_dvm_num, 0x07),
+ HISI_PMU_EVENT_ATTR(snp_dvmsync_num, 0x08),
+ HISI_PMU_EVENT_ATTR(l3t_req_dvm_num, 0x09),
+ HISI_PMU_EVENT_ATTR(l3t_req_dvmsync_num, 0x0A),
+ HISI_PMU_EVENT_ATTR(mn_req_dvm_num, 0x0B),
+ HISI_PMU_EVENT_ATTR(mn_req_dvmsync_num, 0x0C),
+ HISI_PMU_EVENT_ATTR(pa_req_dvm_num, 0x0D),
+ HISI_PMU_EVENT_ATTR(pa_req_dvmsync_num, 0x0E),
+ HISI_PMU_EVENT_ATTR(snp_dvm_latency, 0x80),
+ HISI_PMU_EVENT_ATTR(snp_dvmsync_latency, 0x81),
+ HISI_PMU_EVENT_ATTR(l3t_req_dvm_latency, 0x82),
+ HISI_PMU_EVENT_ATTR(l3t_req_dvmsync_latency, 0x83),
+ HISI_PMU_EVENT_ATTR(mn_req_dvm_latency, 0x84),
+ HISI_PMU_EVENT_ATTR(mn_req_dvmsync_latency, 0x85),
+ HISI_PMU_EVENT_ATTR(pa_req_dvm_latency, 0x86),
+ HISI_PMU_EVENT_ATTR(pa_req_dvmsync_latency, 0x87),
+ NULL
+};
+
+static const struct attribute_group hisi_mn_pmu_events_group = {
+ .name = "events",
+ .attrs = hisi_mn_pmu_events_attr,
+};
+
+static const struct attribute_group *hisi_mn_pmu_attr_groups[] = {
+ &hisi_mn_pmu_format_group,
+ &hisi_mn_pmu_events_group,
+ &hisi_pmu_cpumask_attr_group,
+ &hisi_pmu_identifier_group,
+ NULL
+};
+
+static const struct hisi_uncore_ops hisi_uncore_mn_ops = {
+ .write_evtype = hisi_mn_pmu_write_evtype,
+ .get_event_idx = hisi_uncore_pmu_get_event_idx,
+ .start_counters = hisi_mn_pmu_start_counters,
+ .stop_counters = hisi_mn_pmu_stop_counters,
+ .enable_counter = hisi_mn_pmu_enable_counter,
+ .disable_counter = hisi_mn_pmu_disable_counter,
+ .enable_counter_int = hisi_mn_pmu_enable_counter_int,
+ .disable_counter_int = hisi_mn_pmu_disable_counter_int,
+ .write_counter = hisi_mn_pmu_write_counter,
+ .read_counter = hisi_mn_pmu_read_counter,
+ .get_int_status = hisi_mn_pmu_get_int_status,
+ .clear_int_status = hisi_mn_pmu_clear_int_status,
+};
+
+static int hisi_mn_pmu_dev_init(struct platform_device *pdev,
+ struct hisi_pmu *mn_pmu)
+{
+ struct hisi_mn_pmu_regs *reg_info;
+ int ret;
+
+ hisi_uncore_pmu_init_topology(mn_pmu, &pdev->dev);
+
+ if (mn_pmu->topo.scl_id < 0)
+ return dev_err_probe(&pdev->dev, -EINVAL,
+ "Failed to read MN scl id\n");
+
+ if (mn_pmu->topo.index_id < 0)
+ return dev_err_probe(&pdev->dev, -EINVAL,
+ "Failed to read MN index id\n");
+
+ mn_pmu->base = devm_platform_ioremap_resource(pdev, 0);
+ if (IS_ERR(mn_pmu->base))
+ return dev_err_probe(&pdev->dev, PTR_ERR(mn_pmu->base),
+ "Failed to ioremap resource\n");
+
+ ret = hisi_uncore_pmu_init_irq(mn_pmu, pdev);
+ if (ret)
+ return ret;
+
+ mn_pmu->dev_info = device_get_match_data(&pdev->dev);
+ if (!mn_pmu->dev_info)
+ return -ENODEV;
+
+ mn_pmu->pmu_events.attr_groups = mn_pmu->dev_info->attr_groups;
+ mn_pmu->counter_bits = mn_pmu->dev_info->counter_bits;
+ mn_pmu->check_event = mn_pmu->dev_info->check_event;
+ mn_pmu->num_counters = HISI_MN_NR_COUNTERS;
+ mn_pmu->ops = &hisi_uncore_mn_ops;
+ mn_pmu->dev = &pdev->dev;
+ mn_pmu->on_cpu = -1;
+
+ reg_info = mn_pmu->dev_info->private;
+ mn_pmu->identifier = readl(mn_pmu->base + reg_info->version);
+
+ return 0;
+}
+
+static void hisi_mn_pmu_remove_cpuhp(void *hotplug_node)
+{
+ cpuhp_state_remove_instance_nocalls(hisi_mn_pmu_online, hotplug_node);
+}
+
+static void hisi_mn_pmu_unregister(void *pmu)
+{
+ perf_pmu_unregister(pmu);
+}
+
+static int hisi_mn_pmu_probe(struct platform_device *pdev)
+{
+ struct hisi_pmu *mn_pmu;
+ char *name;
+ int ret;
+
+ mn_pmu = devm_kzalloc(&pdev->dev, sizeof(*mn_pmu), GFP_KERNEL);
+ if (!mn_pmu)
+ return -ENOMEM;
+
+ platform_set_drvdata(pdev, mn_pmu);
+
+ ret = hisi_mn_pmu_dev_init(pdev, mn_pmu);
+ if (ret)
+ return ret;
+
+ name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_scl%d_mn%d",
+ mn_pmu->topo.scl_id, mn_pmu->topo.index_id);
+ if (!name)
+ return -ENOMEM;
+
+ ret = cpuhp_state_add_instance(hisi_mn_pmu_online, &mn_pmu->node);
+ if (ret)
+ return dev_err_probe(&pdev->dev, ret, "Failed to register cpu hotplug\n");
+
+ ret = devm_add_action_or_reset(&pdev->dev, hisi_mn_pmu_remove_cpuhp, &mn_pmu->node);
+ if (ret)
+ return ret;
+
+ hisi_pmu_init(mn_pmu, THIS_MODULE);
+
+ ret = perf_pmu_register(&mn_pmu->pmu, name, -1);
+ if (ret)
+ return dev_err_probe(mn_pmu->dev, ret, "Failed to register MN PMU\n");
+
+ return devm_add_action_or_reset(&pdev->dev, hisi_mn_pmu_unregister, &mn_pmu->pmu);
+}
+
+static struct hisi_mn_pmu_regs hisi_mn_v1_pmu_regs = {
+ .version = HISI_MN_VERSION_REG,
+ .dyn_ctrl = HISI_MN_DYNAMIC_CTRL_REG,
+ .perf_ctrl = HISI_MN_PERF_CTRL_REG,
+ .int_mask = HISI_MN_INT_MASK_REG,
+ .int_clear = HISI_MN_INT_CLEAR_REG,
+ .int_status = HISI_MN_INT_STATUS_REG,
+ .event_ctrl = HISI_MN_EVENT_CTRL_REG,
+ .event_type0 = HISI_MN_EVTYPE0_REG,
+ .event_cntr0 = HISI_MN_CNTR0_REG,
+};
+
+static const struct hisi_pmu_dev_info hisi_mn_v1 = {
+ .attr_groups = hisi_mn_pmu_attr_groups,
+ .counter_bits = 48,
+ .check_event = HISI_MN_EVTYPE_MASK,
+ .private = &hisi_mn_v1_pmu_regs,
+};
+
+static const struct acpi_device_id hisi_mn_pmu_acpi_match[] = {
+ { "HISI0222", (kernel_ulong_t) &hisi_mn_v1 },
+ { }
+};
+MODULE_DEVICE_TABLE(acpi, hisi_mn_pmu_acpi_match);
+
+static struct platform_driver hisi_mn_pmu_driver = {
+ .driver = {
+ .name = "hisi_mn_pmu",
+ .acpi_match_table = hisi_mn_pmu_acpi_match,
+ /*
+ * We have not worked out a safe bind/unbind process.
+ * Forcefully unbinding during sampling will lead to a
+ * kernel panic, so this is not supported yet.
+ */
+ .suppress_bind_attrs = true,
+ },
+ .probe = hisi_mn_pmu_probe,
+};
+
+static int __init hisi_mn_pmu_module_init(void)
+{
+ int ret;
+
+ ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "perf/hisi/mn:online",
+ hisi_uncore_pmu_online_cpu,
+ hisi_uncore_pmu_offline_cpu);
+ if (ret < 0) {
+ pr_err("hisi_mn_pmu: Failed to setup MN PMU hotplug: %d\n", ret);
+ return ret;
+ }
+ hisi_mn_pmu_online = ret;
+
+ ret = platform_driver_register(&hisi_mn_pmu_driver);
+ if (ret)
+ cpuhp_remove_multi_state(hisi_mn_pmu_online);
+
+ return ret;
+}
+module_init(hisi_mn_pmu_module_init);
+
+static void __exit hisi_mn_pmu_module_exit(void)
+{
+ platform_driver_unregister(&hisi_mn_pmu_driver);
+ cpuhp_remove_multi_state(hisi_mn_pmu_online);
+}
+module_exit(hisi_mn_pmu_module_exit);
+
+MODULE_IMPORT_NS("HISI_PMU");
+MODULE_DESCRIPTION("HiSilicon SoC MN uncore PMU driver");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Junhao He <hejunhao3@huawei.com>");
diff --git a/drivers/perf/hisilicon/hisi_uncore_noc_pmu.c b/drivers/perf/hisilicon/hisi_uncore_noc_pmu.c
new file mode 100644
index 000000000000..de3b9cc7aada
--- /dev/null
+++ b/drivers/perf/hisilicon/hisi_uncore_noc_pmu.c
@@ -0,0 +1,443 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Driver for HiSilicon Uncore NoC (Network on Chip) PMU device
+ *
+ * Copyright (c) 2025 HiSilicon Technologies Co., Ltd.
+ * Author: Yicong Yang <yangyicong@hisilicon.com>
+ */
+#include <linux/bitops.h>
+#include <linux/cpuhotplug.h>
+#include <linux/device.h>
+#include <linux/io.h>
+#include <linux/mod_devicetable.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/property.h>
+#include <linux/sysfs.h>
+
+#include "hisi_uncore_pmu.h"
+
+#define NOC_PMU_VERSION 0x1e00
+#define NOC_PMU_GLOBAL_CTRL 0x1e04
+#define NOC_PMU_GLOBAL_CTRL_PMU_EN BIT(0)
+#define NOC_PMU_GLOBAL_CTRL_TT_EN BIT(1)
+#define NOC_PMU_CNT_INFO 0x1e08
+#define NOC_PMU_CNT_INFO_OVERFLOW(n) BIT(n)
+#define NOC_PMU_EVENT_CTRL0 0x1e20
+#define NOC_PMU_EVENT_CTRL_TYPE GENMASK(4, 0)
+/*
+ * Note that a channel value of 0x0 will reset the counter value, so don't
+ * write it before the counter has been read out.
+ */
+#define NOC_PMU_EVENT_CTRL_CHANNEL GENMASK(10, 8)
+#define NOC_PMU_EVENT_CTRL_EN BIT(11)
+#define NOC_PMU_EVENT_COUNTER0 0x1e80
+
+#define NOC_PMU_NR_COUNTERS 4
+#define NOC_PMU_CH_DEFAULT 0x7
+
+#define NOC_PMU_EVENT_CTRLn(ctrl0, n) ((ctrl0) + 4 * (n))
+#define NOC_PMU_EVENT_CNTRn(cntr0, n) ((cntr0) + 8 * (n))
+
+HISI_PMU_EVENT_ATTR_EXTRACTOR(ch, config1, 2, 0);
+HISI_PMU_EVENT_ATTR_EXTRACTOR(tt_en, config1, 3, 3);
+
+/* Dynamic CPU hotplug state used by this PMU driver */
+static enum cpuhp_state hisi_noc_pmu_cpuhp_state;
+
+struct hisi_noc_pmu_regs {
+ u32 version;
+ u32 pmu_ctrl;
+ u32 event_ctrl0;
+ u32 event_cntr0;
+ u32 overflow_status;
+};
+
+/*
+ * Tracetag filtering is not per-event; all the events must use a
+ * consistent configuration. Return true only if the newcomer matches the
+ * tracetag filtering configuration of the currently scheduled events.
+ */
+static bool hisi_noc_pmu_check_global_filter(struct perf_event *curr,
+ struct perf_event *new)
+{
+ return hisi_get_tt_en(curr) == hisi_get_tt_en(new);
+}
+
+static void hisi_noc_pmu_write_evtype(struct hisi_pmu *noc_pmu, int idx, u32 type)
+{
+ struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private;
+ u32 reg;
+
+ reg = readl(noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, idx));
+ reg &= ~NOC_PMU_EVENT_CTRL_TYPE;
+ reg |= FIELD_PREP(NOC_PMU_EVENT_CTRL_TYPE, type);
+ writel(reg, noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, idx));
+}
+
+static int hisi_noc_pmu_get_event_idx(struct perf_event *event)
+{
+ struct hisi_pmu *noc_pmu = to_hisi_pmu(event->pmu);
+ struct hisi_pmu_hwevents *pmu_events = &noc_pmu->pmu_events;
+ int cur_idx;
+
+ cur_idx = find_first_bit(pmu_events->used_mask, noc_pmu->num_counters);
+ if (cur_idx != noc_pmu->num_counters &&
+ !hisi_noc_pmu_check_global_filter(pmu_events->hw_events[cur_idx], event))
+ return -EAGAIN;
+
+ return hisi_uncore_pmu_get_event_idx(event);
+}
+
+static u64 hisi_noc_pmu_read_counter(struct hisi_pmu *noc_pmu,
+ struct hw_perf_event *hwc)
+{
+ struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private;
+
+ return readq(noc_pmu->base + NOC_PMU_EVENT_CNTRn(reg_info->event_cntr0, hwc->idx));
+}
+
+static void hisi_noc_pmu_write_counter(struct hisi_pmu *noc_pmu,
+ struct hw_perf_event *hwc, u64 val)
+{
+ struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private;
+
+ writeq(val, noc_pmu->base + NOC_PMU_EVENT_CNTRn(reg_info->event_cntr0, hwc->idx));
+}
+
+static void hisi_noc_pmu_enable_counter(struct hisi_pmu *noc_pmu,
+ struct hw_perf_event *hwc)
+{
+ struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private;
+ u32 reg;
+
+ reg = readl(noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, hwc->idx));
+ reg |= NOC_PMU_EVENT_CTRL_EN;
+ writel(reg, noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, hwc->idx));
+}
+
+static void hisi_noc_pmu_disable_counter(struct hisi_pmu *noc_pmu,
+ struct hw_perf_event *hwc)
+{
+ struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private;
+ u32 reg;
+
+ reg = readl(noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, hwc->idx));
+ reg &= ~NOC_PMU_EVENT_CTRL_EN;
+ writel(reg, noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, hwc->idx));
+}
+
+static void hisi_noc_pmu_enable_counter_int(struct hisi_pmu *noc_pmu,
+ struct hw_perf_event *hwc)
+{
+ /* Interrupts are not supported, so this is a stub. */
+}
+
+static void hisi_noc_pmu_disable_counter_int(struct hisi_pmu *noc_pmu,
+ struct hw_perf_event *hwc)
+{
+}
+
+static void hisi_noc_pmu_start_counters(struct hisi_pmu *noc_pmu)
+{
+ struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private;
+ u32 reg;
+
+ reg = readl(noc_pmu->base + reg_info->pmu_ctrl);
+ reg |= NOC_PMU_GLOBAL_CTRL_PMU_EN;
+ writel(reg, noc_pmu->base + reg_info->pmu_ctrl);
+}
+
+static void hisi_noc_pmu_stop_counters(struct hisi_pmu *noc_pmu)
+{
+ struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private;
+ u32 reg;
+
+ reg = readl(noc_pmu->base + reg_info->pmu_ctrl);
+ reg &= ~NOC_PMU_GLOBAL_CTRL_PMU_EN;
+ writel(reg, noc_pmu->base + reg_info->pmu_ctrl);
+}
+
+static u32 hisi_noc_pmu_get_int_status(struct hisi_pmu *noc_pmu)
+{
+ struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private;
+
+ return readl(noc_pmu->base + reg_info->overflow_status);
+}
+
+static void hisi_noc_pmu_clear_int_status(struct hisi_pmu *noc_pmu, int idx)
+{
+ struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private;
+ u32 reg;
+
+ reg = readl(noc_pmu->base + reg_info->overflow_status);
+ reg &= ~NOC_PMU_CNT_INFO_OVERFLOW(idx);
+ writel(reg, noc_pmu->base + reg_info->overflow_status);
+}
+
+static void hisi_noc_pmu_enable_filter(struct perf_event *event)
+{
+ struct hisi_pmu *noc_pmu = to_hisi_pmu(event->pmu);
+ struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private;
+ struct hw_perf_event *hwc = &event->hw;
+ u32 tt_en = hisi_get_tt_en(event);
+ u32 ch = hisi_get_ch(event);
+ u32 reg;
+
+ if (!ch)
+ ch = NOC_PMU_CH_DEFAULT;
+
+ reg = readl(noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, hwc->idx));
+ reg &= ~NOC_PMU_EVENT_CTRL_CHANNEL;
+ reg |= FIELD_PREP(NOC_PMU_EVENT_CTRL_CHANNEL, ch);
+ writel(reg, noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, hwc->idx));
+
+ /*
+ * Since the tracetag filter applies to all the counters, don't touch
+ * it unless the user specifies it explicitly.
+ */
+ if (tt_en) {
+ reg = readl(noc_pmu->base + reg_info->pmu_ctrl);
+ reg |= NOC_PMU_GLOBAL_CTRL_TT_EN;
+ writel(reg, noc_pmu->base + reg_info->pmu_ctrl);
+ }
+}
+
+static void hisi_noc_pmu_disable_filter(struct perf_event *event)
+{
+ struct hisi_pmu *noc_pmu = to_hisi_pmu(event->pmu);
+ struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private;
+ u32 tt_en = hisi_get_tt_en(event);
+ u32 reg;
+
+ /*
+ * If we're not the last counter, don't touch the global tracetag
+ * configuration.
+ */
+ if (bitmap_weight(noc_pmu->pmu_events.used_mask, noc_pmu->num_counters) > 1)
+ return;
+
+ if (tt_en) {
+ reg = readl(noc_pmu->base + reg_info->pmu_ctrl);
+ reg &= ~NOC_PMU_GLOBAL_CTRL_TT_EN;
+ writel(reg, noc_pmu->base + reg_info->pmu_ctrl);
+ }
+}
+
+static const struct hisi_uncore_ops hisi_uncore_noc_ops = {
+ .write_evtype = hisi_noc_pmu_write_evtype,
+ .get_event_idx = hisi_noc_pmu_get_event_idx,
+ .read_counter = hisi_noc_pmu_read_counter,
+ .write_counter = hisi_noc_pmu_write_counter,
+ .enable_counter = hisi_noc_pmu_enable_counter,
+ .disable_counter = hisi_noc_pmu_disable_counter,
+ .enable_counter_int = hisi_noc_pmu_enable_counter_int,
+ .disable_counter_int = hisi_noc_pmu_disable_counter_int,
+ .start_counters = hisi_noc_pmu_start_counters,
+ .stop_counters = hisi_noc_pmu_stop_counters,
+ .get_int_status = hisi_noc_pmu_get_int_status,
+ .clear_int_status = hisi_noc_pmu_clear_int_status,
+ .enable_filter = hisi_noc_pmu_enable_filter,
+ .disable_filter = hisi_noc_pmu_disable_filter,
+};
+
+static struct attribute *hisi_noc_pmu_format_attrs[] = {
+ HISI_PMU_FORMAT_ATTR(event, "config:0-7"),
+ HISI_PMU_FORMAT_ATTR(ch, "config1:0-2"),
+ HISI_PMU_FORMAT_ATTR(tt_en, "config1:3"),
+ NULL
+};
+
+static const struct attribute_group hisi_noc_pmu_format_group = {
+ .name = "format",
+ .attrs = hisi_noc_pmu_format_attrs,
+};
+
+static struct attribute *hisi_noc_pmu_events_attrs[] = {
+ HISI_PMU_EVENT_ATTR(cycles, 0x0e),
+ /* Flux on/off the ring */
+ HISI_PMU_EVENT_ATTR(ingress_flow_sum, 0x1a),
+ HISI_PMU_EVENT_ATTR(egress_flow_sum, 0x17),
+ /* Buffer full duration on/off the ring */
+ HISI_PMU_EVENT_ATTR(ingress_buf_full, 0x19),
+ HISI_PMU_EVENT_ATTR(egress_buf_full, 0x12),
+ /* Failure packets count on/off the ring */
+ HISI_PMU_EVENT_ATTR(cw_ingress_fail, 0x01),
+ HISI_PMU_EVENT_ATTR(cc_ingress_fail, 0x09),
+ HISI_PMU_EVENT_ATTR(cw_egress_fail, 0x03),
+ HISI_PMU_EVENT_ATTR(cc_egress_fail, 0x0b),
+ /* Flux of the ring */
+ HISI_PMU_EVENT_ATTR(cw_main_flow_sum, 0x05),
+ HISI_PMU_EVENT_ATTR(cc_main_flow_sum, 0x0d),
+ NULL
+};
+
+static const struct attribute_group hisi_noc_pmu_events_group = {
+ .name = "events",
+ .attrs = hisi_noc_pmu_events_attrs,
+};
+
+static const struct attribute_group *hisi_noc_pmu_attr_groups[] = {
+ &hisi_noc_pmu_format_group,
+ &hisi_noc_pmu_events_group,
+ &hisi_pmu_cpumask_attr_group,
+ &hisi_pmu_identifier_group,
+ NULL
+};
+
+static int hisi_noc_pmu_dev_init(struct platform_device *pdev, struct hisi_pmu *noc_pmu)
+{
+ struct hisi_noc_pmu_regs *reg_info;
+
+ hisi_uncore_pmu_init_topology(noc_pmu, &pdev->dev);
+
+ if (noc_pmu->topo.scl_id < 0)
+ return dev_err_probe(&pdev->dev, -EINVAL, "failed to get scl-id\n");
+
+ if (noc_pmu->topo.index_id < 0)
+ return dev_err_probe(&pdev->dev, -EINVAL, "failed to get idx-id\n");
+
+ if (noc_pmu->topo.sub_id < 0)
+ return dev_err_probe(&pdev->dev, -EINVAL, "failed to get sub-id\n");
+
+ noc_pmu->base = devm_platform_ioremap_resource(pdev, 0);
+ if (IS_ERR(noc_pmu->base))
+ return dev_err_probe(&pdev->dev, PTR_ERR(noc_pmu->base),
+ "fail to remap io memory\n");
+
+ noc_pmu->dev_info = device_get_match_data(&pdev->dev);
+ if (!noc_pmu->dev_info)
+ return -ENODEV;
+
+ noc_pmu->pmu_events.attr_groups = noc_pmu->dev_info->attr_groups;
+ noc_pmu->counter_bits = noc_pmu->dev_info->counter_bits;
+ noc_pmu->check_event = noc_pmu->dev_info->check_event;
+ noc_pmu->num_counters = NOC_PMU_NR_COUNTERS;
+ noc_pmu->ops = &hisi_uncore_noc_ops;
+ noc_pmu->dev = &pdev->dev;
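+ /* No associated CPU yet; the CPU hotplug online callback picks one. */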
+ noc_pmu->on_cpu = -1;
+
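+ /* The hardware version is exposed as the PMU identifier via sysfs. */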
+ reg_info = noc_pmu->dev_info->private;
+ noc_pmu->identifier = readl(noc_pmu->base + reg_info->version);
+
+ return 0;
+}
+
+static void hisi_noc_pmu_remove_cpuhp_instance(void *hotplug_node)
+{
+ cpuhp_state_remove_instance_nocalls(hisi_noc_pmu_cpuhp_state, hotplug_node);
+}
+
+static void hisi_noc_pmu_unregister_pmu(void *pmu)
+{
+ perf_pmu_unregister(pmu);
+}
+
+static int hisi_noc_pmu_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct hisi_pmu *noc_pmu;
+ char *name;
+ int ret;
+
+ noc_pmu = devm_kzalloc(dev, sizeof(*noc_pmu), GFP_KERNEL);
+ if (!noc_pmu)
+ return -ENOMEM;
+
+ /*
+ * The HiSilicon uncore PMU framework retrieves the common hisi_pmu
+ * device from the device's drvdata.
+ */
+ platform_set_drvdata(pdev, noc_pmu);
+
+ ret = hisi_noc_pmu_dev_init(pdev, noc_pmu);
+ if (ret)
+ return ret;
+
+ ret = cpuhp_state_add_instance(hisi_noc_pmu_cpuhp_state, &noc_pmu->node);
+ if (ret)
+ return dev_err_probe(dev, ret, "Fail to register cpuhp instance\n");
+
+ ret = devm_add_action_or_reset(dev, hisi_noc_pmu_remove_cpuhp_instance,
+ &noc_pmu->node);
+ if (ret)
+ return ret;
+
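+ /* Fill in the common struct pmu callbacks shared by HiSilicon uncore PMUs. */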
+ hisi_pmu_init(noc_pmu, THIS_MODULE);
+
+ name = devm_kasprintf(dev, GFP_KERNEL, "hisi_scl%d_noc%d_%d",
+ noc_pmu->topo.scl_id, noc_pmu->topo.index_id,
+ noc_pmu->topo.sub_id);
+ if (!name)
+ return -ENOMEM;
+
+ ret = perf_pmu_register(&noc_pmu->pmu, name, -1);
+ if (ret)
+ return dev_err_probe(dev, ret, "Fail to register PMU\n");
+
+ return devm_add_action_or_reset(dev, hisi_noc_pmu_unregister_pmu,
+ &noc_pmu->pmu);
+}
+
+static struct hisi_noc_pmu_regs hisi_noc_v1_pmu_regs = {
+ .version = NOC_PMU_VERSION,
+ .pmu_ctrl = NOC_PMU_GLOBAL_CTRL,
+ .event_ctrl0 = NOC_PMU_EVENT_CTRL0,
+ .event_cntr0 = NOC_PMU_EVENT_COUNTER0,
+ .overflow_status = NOC_PMU_CNT_INFO,
+};
+
+static const struct hisi_pmu_dev_info hisi_noc_v1 = {
+ .attr_groups = hisi_noc_pmu_attr_groups,
+ .counter_bits = 64,
+ .check_event = NOC_PMU_EVENT_CTRL_TYPE,
+ .private = &hisi_noc_v1_pmu_regs,
+};
+
+static const struct acpi_device_id hisi_noc_pmu_ids[] = {
+ { "HISI04E0", (kernel_ulong_t) &hisi_noc_v1 },
+ { }
+};
+MODULE_DEVICE_TABLE(acpi, hisi_noc_pmu_ids);
+
+static struct platform_driver hisi_noc_pmu_driver = {
+ .driver = {
+ .name = "hisi_noc_pmu",
+ .acpi_match_table = hisi_noc_pmu_ids,
+ .suppress_bind_attrs = true,
+ },
+ .probe = hisi_noc_pmu_probe,
+};
+
+static int __init hisi_noc_pmu_module_init(void)
+{
+ int ret;
+
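+ /* Request a dynamic hotplug state shared by all NoC PMU instances. */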
+ ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "perf/hisi/noc:online",
+ hisi_uncore_pmu_online_cpu,
+ hisi_uncore_pmu_offline_cpu);
+ if (ret < 0) {
+ pr_err("hisi_noc_pmu: Fail to setup cpuhp callbacks, ret = %d\n", ret);
+ return ret;
+ }
+ hisi_noc_pmu_cpuhp_state = ret;
+
+ ret = platform_driver_register(&hisi_noc_pmu_driver);
+ if (ret)
+ cpuhp_remove_multi_state(hisi_noc_pmu_cpuhp_state);
+
+ return ret;
+}
+module_init(hisi_noc_pmu_module_init);
+
+static void __exit hisi_noc_pmu_module_exit(void)
+{
+ platform_driver_unregister(&hisi_noc_pmu_driver);
+ cpuhp_remove_multi_state(hisi_noc_pmu_cpuhp_state);
+}
+module_exit(hisi_noc_pmu_module_exit);
+
+MODULE_IMPORT_NS("HISI_PMU");
+MODULE_DESCRIPTION("HiSilicon SoC Uncore NoC PMU driver");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Yicong Yang <yangyicong@hisilicon.com>");
diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pmu.c
index a449651f79c9..de71dcf11653 100644
--- a/drivers/perf/hisilicon/hisi_uncore_pmu.c
+++ b/drivers/perf/hisilicon/hisi_uncore_pmu.c
@@ -149,7 +149,7 @@ static void hisi_uncore_pmu_clear_event_idx(struct hisi_pmu *hisi_pmu, int idx)
clear_bit(idx, hisi_pmu->pmu_events.used_mask);
}
-static irqreturn_t hisi_uncore_pmu_isr(int irq, void *data)
+irqreturn_t hisi_uncore_pmu_isr(int irq, void *data)
{
struct hisi_pmu *hisi_pmu = data;
struct perf_event *event;
@@ -178,6 +178,7 @@ static irqreturn_t hisi_uncore_pmu_isr(int irq, void *data)
return IRQ_HANDLED;
}
+EXPORT_SYMBOL_NS_GPL(hisi_uncore_pmu_isr, "HISI_PMU");
int hisi_uncore_pmu_init_irq(struct hisi_pmu *hisi_pmu,
struct platform_device *pdev)
@@ -234,7 +235,7 @@ int hisi_uncore_pmu_event_init(struct perf_event *event)
return -EINVAL;
hisi_pmu = to_hisi_pmu(event->pmu);
- if (event->attr.config > hisi_pmu->check_event)
+ if ((event->attr.config & HISI_EVENTID_MASK) > hisi_pmu->check_event)
return -EINVAL;
if (hisi_pmu->on_cpu == -1)
diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.h b/drivers/perf/hisilicon/hisi_uncore_pmu.h
index 777675838b80..3ffe6acda653 100644
--- a/drivers/perf/hisilicon/hisi_uncore_pmu.h
+++ b/drivers/perf/hisilicon/hisi_uncore_pmu.h
@@ -24,7 +24,7 @@
#define pr_fmt(fmt) "hisi_pmu: " fmt
#define HISI_PMU_V2 0x30
-#define HISI_MAX_COUNTERS 0x10
+#define HISI_MAX_COUNTERS 0x18
#define to_hisi_pmu(p) (container_of(p, struct hisi_pmu, pmu))
#define HISI_PMU_ATTR(_name, _func, _config) \
@@ -43,7 +43,8 @@
return FIELD_GET(GENMASK_ULL(hi, lo), event->attr.config); \
}
-#define HISI_GET_EVENTID(ev) (ev->hw.config_base & 0xff)
+#define HISI_EVENTID_MASK GENMASK(7, 0)
+#define HISI_GET_EVENTID(ev) ((ev)->hw.config_base & HISI_EVENTID_MASK)
#define HISI_PMU_EVTYPE_BITS 8
#define HISI_PMU_EVTYPE_SHIFT(idx) ((idx) % 4 * HISI_PMU_EVTYPE_BITS)
@@ -164,6 +165,7 @@ int hisi_uncore_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node);
ssize_t hisi_uncore_pmu_identifier_attr_show(struct device *dev,
struct device_attribute *attr,
char *page);
+irqreturn_t hisi_uncore_pmu_isr(int irq, void *data);
int hisi_uncore_pmu_init_irq(struct hisi_pmu *hisi_pmu,
struct platform_device *pdev);
void hisi_uncore_pmu_init_topology(struct hisi_pmu *hisi_pmu, struct device *dev);
diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c
index 698de8ddf895..3fc16bbab025 100644
--- a/drivers/perf/riscv_pmu_sbi.c
+++ b/drivers/perf/riscv_pmu_sbi.c
@@ -339,7 +339,7 @@ static bool pmu_sbi_ctr_is_fw(int cidx)
if (!info)
return false;
- return (info->type == SBI_PMU_CTR_TYPE_FW) ? true : false;
+ return info->type == SBI_PMU_CTR_TYPE_FW;
}
/*
@@ -877,8 +877,10 @@ static inline void pmu_sbi_start_ovf_ctrs_sbi(struct cpu_hw_events *cpu_hw_evt,
for (i = 0; i < BITS_TO_LONGS(RISCV_MAX_COUNTERS); i++) {
ctr_start_mask = cpu_hw_evt->used_hw_ctrs[i] & ~ctr_ovf_mask;
/* Start all the counters that did not overflow in a single shot */
- sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_START, i * BITS_PER_LONG, ctr_start_mask,
- 0, 0, 0, 0);
+ if (ctr_start_mask) {
+ sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_START, i * BITS_PER_LONG,
+ ctr_start_mask, 0, 0, 0, 0);
+ }
}
/* Reinitialize and start all the counter that overflowed */
diff --git a/drivers/phy/qualcomm/phy-qcom-eusb2-repeater.c b/drivers/phy/qualcomm/phy-qcom-eusb2-repeater.c
index e0f2acc8109c..8fcbc312fd61 100644
--- a/drivers/phy/qualcomm/phy-qcom-eusb2-repeater.c
+++ b/drivers/phy/qualcomm/phy-qcom-eusb2-repeater.c
@@ -127,13 +127,13 @@ static int eusb2_repeater_init(struct phy *phy)
rptr->cfg->init_tbl[i].value);
/* Override registers from devicetree values */
- if (!of_property_read_u8(np, "qcom,tune-usb2-amplitude", &val))
+ if (!of_property_read_u8(np, "qcom,tune-usb2-preem", &val))
regmap_write(regmap, base + EUSB2_TUNE_USB2_PREEM, val);
if (!of_property_read_u8(np, "qcom,tune-usb2-disc-thres", &val))
regmap_write(regmap, base + EUSB2_TUNE_HSDISC, val);
- if (!of_property_read_u8(np, "qcom,tune-usb2-preem", &val))
+ if (!of_property_read_u8(np, "qcom,tune-usb2-amplitude", &val))
regmap_write(regmap, base + EUSB2_TUNE_IUSB2, val);
/* Wait for status OK */
diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-pcie.c b/drivers/phy/qualcomm/phy-qcom-qmp-pcie.c
index 95830dcfdec9..0fa63b734b67 100644
--- a/drivers/phy/qualcomm/phy-qcom-qmp-pcie.c
+++ b/drivers/phy/qualcomm/phy-qcom-qmp-pcie.c
@@ -3067,6 +3067,14 @@ struct qmp_pcie {
struct clk_fixed_rate aux_clk_fixed;
};
+static bool qphy_checkbits(const void __iomem *base, u32 offset, u32 val)
+{
+ u32 reg;
+
+ reg = readl(base + offset);
+ return (reg & val) == val;
+}
+
static inline void qphy_setbits(void __iomem *base, u32 offset, u32 val)
{
u32 reg;
@@ -4339,16 +4347,21 @@ static int qmp_pcie_init(struct phy *phy)
struct qmp_pcie *qmp = phy_get_drvdata(phy);
const struct qmp_phy_cfg *cfg = qmp->cfg;
void __iomem *pcs = qmp->pcs;
- bool phy_initialized = !!(readl(pcs + cfg->regs[QPHY_START_CTRL]));
int ret;
- qmp->skip_init = qmp->nocsr_reset && phy_initialized;
/*
- * We need to check the existence of init sequences in two cases:
- * 1. The PHY doesn't support no_csr reset.
- * 2. The PHY supports no_csr reset but isn't initialized by bootloader.
- * As we can't skip init in these two cases.
+ * We can skip PHY initialization if all of the following conditions
+ * are met:
+ * 1. The PHY supports the nocsr_reset that preserves the PHY config.
+ * 2. The PHY was started (and not powered down again) by the
+ * bootloader, with all of the expected bits set correctly.
+ * In this case, we can continue without having the init sequence
+ * defined in the driver.
*/
+ qmp->skip_init = qmp->nocsr_reset &&
+ qphy_checkbits(pcs, cfg->regs[QPHY_START_CTRL], SERDES_START | PCS_START) &&
+ qphy_checkbits(pcs, cfg->regs[QPHY_PCS_POWER_DOWN_CONTROL], cfg->pwrdn_ctrl);
+
if (!qmp->skip_init && !cfg->tbls.serdes_num) {
dev_err(qmp->dev, "Init sequence not available\n");
return -ENODATA;
diff --git a/drivers/phy/tegra/xusb-tegra210.c b/drivers/phy/tegra/xusb-tegra210.c
index ebc8a7e21a31..3409924498e9 100644
--- a/drivers/phy/tegra/xusb-tegra210.c
+++ b/drivers/phy/tegra/xusb-tegra210.c
@@ -3164,18 +3164,22 @@ tegra210_xusb_padctl_probe(struct device *dev,
}
pdev = of_find_device_by_node(np);
+ of_node_put(np);
if (!pdev) {
dev_warn(dev, "PMC device is not available\n");
goto out;
}
- if (!platform_get_drvdata(pdev))
+ if (!platform_get_drvdata(pdev)) {
+ put_device(&pdev->dev);
return ERR_PTR(-EPROBE_DEFER);
+ }
padctl->regmap = dev_get_regmap(&pdev->dev, "usb_sleepwalk");
if (!padctl->regmap)
dev_info(dev, "failed to find PMC regmap\n");
+ put_device(&pdev->dev);
out:
return &padctl->base;
}
diff --git a/drivers/phy/ti/phy-gmii-sel.c b/drivers/phy/ti/phy-gmii-sel.c
index ff5d5e29629f..50adabb867cb 100644
--- a/drivers/phy/ti/phy-gmii-sel.c
+++ b/drivers/phy/ti/phy-gmii-sel.c
@@ -34,6 +34,7 @@ enum {
PHY_GMII_SEL_PORT_MODE = 0,
PHY_GMII_SEL_RGMII_ID_MODE,
PHY_GMII_SEL_RMII_IO_CLK_EN,
+ PHY_GMII_SEL_FIXED_TX_DELAY,
PHY_GMII_SEL_LAST,
};
@@ -127,6 +128,11 @@ static int phy_gmii_sel_mode(struct phy *phy, enum phy_mode mode, int submode)
goto unsupported;
}
+ /* With a fixed delay, some modes are not supported at all. */
+ if (soc_data->features & BIT(PHY_GMII_SEL_FIXED_TX_DELAY) &&
+ rgmii_id != 0)
+ return -EINVAL;
+
if_phy->phy_if_mode = submode;
dev_dbg(dev, "%s id:%u mode:%u rgmii_id:%d rmii_clk_ext:%d\n",
@@ -210,25 +216,46 @@ struct phy_gmii_sel_soc_data phy_gmii_sel_soc_dm814 = {
static const
struct reg_field phy_gmii_sel_fields_am654[][PHY_GMII_SEL_LAST] = {
- { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x0, 0, 2), },
- { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x4, 0, 2), },
- { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x8, 0, 2), },
- { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0xC, 0, 2), },
- { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x10, 0, 2), },
- { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x14, 0, 2), },
- { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x18, 0, 2), },
- { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x1C, 0, 2), },
+ {
+ [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x0, 0, 2),
+ [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0x0, 4, 4),
+ }, {
+ [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x4, 0, 2),
+ [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0x4, 4, 4),
+ }, {
+ [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x8, 0, 2),
+ [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0x8, 4, 4),
+ }, {
+ [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0xC, 0, 2),
+ [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0xC, 4, 4),
+ }, {
+ [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x10, 0, 2),
+ [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0x10, 4, 4),
+ }, {
+ [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x14, 0, 2),
+ [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0x14, 4, 4),
+ }, {
+ [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x18, 0, 2),
+ [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0x18, 4, 4),
+ }, {
+ [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x1C, 0, 2),
+ [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0x1C, 4, 4),
+ },
};
static const
struct phy_gmii_sel_soc_data phy_gmii_sel_soc_am654 = {
.use_of_data = true,
+ .features = BIT(PHY_GMII_SEL_RGMII_ID_MODE) |
+ BIT(PHY_GMII_SEL_FIXED_TX_DELAY),
.regfields = phy_gmii_sel_fields_am654,
};
static const
struct phy_gmii_sel_soc_data phy_gmii_sel_cpsw5g_soc_j7200 = {
.use_of_data = true,
+ .features = BIT(PHY_GMII_SEL_RGMII_ID_MODE) |
+ BIT(PHY_GMII_SEL_FIXED_TX_DELAY),
.regfields = phy_gmii_sel_fields_am654,
.extra_modes = BIT(PHY_INTERFACE_MODE_QSGMII) | BIT(PHY_INTERFACE_MODE_SGMII) |
BIT(PHY_INTERFACE_MODE_USXGMII),
@@ -239,6 +266,8 @@ struct phy_gmii_sel_soc_data phy_gmii_sel_cpsw5g_soc_j7200 = {
static const
struct phy_gmii_sel_soc_data phy_gmii_sel_cpsw9g_soc_j721e = {
.use_of_data = true,
+ .features = BIT(PHY_GMII_SEL_RGMII_ID_MODE) |
+ BIT(PHY_GMII_SEL_FIXED_TX_DELAY),
.regfields = phy_gmii_sel_fields_am654,
.extra_modes = BIT(PHY_INTERFACE_MODE_QSGMII) | BIT(PHY_INTERFACE_MODE_SGMII),
.num_ports = 8,
@@ -248,6 +277,8 @@ struct phy_gmii_sel_soc_data phy_gmii_sel_cpsw9g_soc_j721e = {
static const
struct phy_gmii_sel_soc_data phy_gmii_sel_cpsw9g_soc_j784s4 = {
.use_of_data = true,
+ .features = BIT(PHY_GMII_SEL_RGMII_ID_MODE) |
+ BIT(PHY_GMII_SEL_FIXED_TX_DELAY),
.regfields = phy_gmii_sel_fields_am654,
.extra_modes = BIT(PHY_INTERFACE_MODE_QSGMII) | BIT(PHY_INTERFACE_MODE_SGMII) |
BIT(PHY_INTERFACE_MODE_USXGMII),
diff --git a/drivers/phy/ti/phy-omap-usb2.c b/drivers/phy/ti/phy-omap-usb2.c
index c1a0ef979142..c444bb2530ca 100644
--- a/drivers/phy/ti/phy-omap-usb2.c
+++ b/drivers/phy/ti/phy-omap-usb2.c
@@ -363,6 +363,13 @@ static void omap_usb2_init_errata(struct omap_usb *phy)
phy->flags |= OMAP_USB2_DISABLE_CHRG_DET;
}
+static void omap_usb2_put_device(void *_dev)
+{
+ struct device *dev = _dev;
+
+ put_device(dev);
+}
+
static int omap_usb2_probe(struct platform_device *pdev)
{
struct omap_usb *phy;
@@ -373,6 +380,7 @@ static int omap_usb2_probe(struct platform_device *pdev)
struct device_node *control_node;
struct platform_device *control_pdev;
const struct usb_phy_data *phy_data;
+ int ret;
phy_data = device_get_match_data(&pdev->dev);
if (!phy_data)
@@ -423,6 +431,11 @@ static int omap_usb2_probe(struct platform_device *pdev)
return -EINVAL;
}
phy->control_dev = &control_pdev->dev;
+
+ ret = devm_add_action_or_reset(&pdev->dev, omap_usb2_put_device,
+ phy->control_dev);
+ if (ret)
+ return ret;
} else {
if (of_property_read_u32_index(node,
"syscon-phy-power", 1,
diff --git a/drivers/phy/ti/phy-ti-pipe3.c b/drivers/phy/ti/phy-ti-pipe3.c
index da2cbacb982c..ae764d6524c9 100644
--- a/drivers/phy/ti/phy-ti-pipe3.c
+++ b/drivers/phy/ti/phy-ti-pipe3.c
@@ -667,12 +667,20 @@ static int ti_pipe3_get_clk(struct ti_pipe3 *phy)
return 0;
}
+static void ti_pipe3_put_device(void *_dev)
+{
+ struct device *dev = _dev;
+
+ put_device(dev);
+}
+
static int ti_pipe3_get_sysctrl(struct ti_pipe3 *phy)
{
struct device *dev = phy->dev;
struct device_node *node = dev->of_node;
struct device_node *control_node;
struct platform_device *control_pdev;
+ int ret;
phy->phy_power_syscon = syscon_regmap_lookup_by_phandle(node,
"syscon-phy-power");
@@ -704,6 +712,11 @@ static int ti_pipe3_get_sysctrl(struct ti_pipe3 *phy)
}
phy->control_dev = &control_pdev->dev;
+
+ ret = devm_add_action_or_reset(dev, ti_pipe3_put_device,
+ phy->control_dev);
+ if (ret)
+ return ret;
}
if (phy->mode == PIPE3_MODE_PCIE) {
diff --git a/drivers/pinctrl/Kconfig b/drivers/pinctrl/Kconfig
index ddd11668457c..be1ca8e85754 100644
--- a/drivers/pinctrl/Kconfig
+++ b/drivers/pinctrl/Kconfig
@@ -539,6 +539,7 @@ config PINCTRL_STMFX
tristate "STMicroelectronics STMFX GPIO expander pinctrl driver"
depends on I2C
depends on OF_GPIO
+ depends on HAS_IOMEM
select GENERIC_PINCONF
select GPIOLIB_IRQCHIP
select MFD_STMFX
diff --git a/drivers/pinctrl/mediatek/pinctrl-airoha.c b/drivers/pinctrl/mediatek/pinctrl-airoha.c
index 5f1ec9e0de21..b405dfa20891 100644
--- a/drivers/pinctrl/mediatek/pinctrl-airoha.c
+++ b/drivers/pinctrl/mediatek/pinctrl-airoha.c
@@ -108,6 +108,9 @@
#define JTAG_UDI_EN_MASK BIT(4)
#define JTAG_DFD_EN_MASK BIT(3)
+#define REG_FORCE_GPIO_EN 0x0228
+#define FORCE_GPIO_EN(n) BIT(n)
+
/* LED MAP */
#define REG_LAN_LED0_MAPPING 0x027c
#define REG_LAN_LED1_MAPPING 0x0280
@@ -719,16 +722,16 @@ static const struct airoha_pinctrl_func_group mdio_func_group[] = {
.name = "mdio",
.regmap[0] = {
AIROHA_FUNC_MUX,
- REG_GPIO_PON_MODE,
- GPIO_SGMII_MDIO_MODE_MASK,
- GPIO_SGMII_MDIO_MODE_MASK
- },
- .regmap[1] = {
- AIROHA_FUNC_MUX,
REG_GPIO_2ND_I2C_MODE,
GPIO_MDC_IO_MASTER_MODE_MODE,
GPIO_MDC_IO_MASTER_MODE_MODE
},
+ .regmap[1] = {
+ AIROHA_FUNC_MUX,
+ REG_FORCE_GPIO_EN,
+ FORCE_GPIO_EN(1) | FORCE_GPIO_EN(2),
+ FORCE_GPIO_EN(1) | FORCE_GPIO_EN(2)
+ },
.regmap_size = 2,
},
};
@@ -1752,8 +1755,8 @@ static const struct airoha_pinctrl_func_group phy1_led1_func_group[] = {
.regmap[0] = {
AIROHA_FUNC_MUX,
REG_GPIO_2ND_I2C_MODE,
- GPIO_LAN3_LED0_MODE_MASK,
- GPIO_LAN3_LED0_MODE_MASK
+ GPIO_LAN3_LED1_MODE_MASK,
+ GPIO_LAN3_LED1_MODE_MASK
},
.regmap[1] = {
AIROHA_FUNC_MUX,
@@ -1816,8 +1819,8 @@ static const struct airoha_pinctrl_func_group phy2_led1_func_group[] = {
.regmap[0] = {
AIROHA_FUNC_MUX,
REG_GPIO_2ND_I2C_MODE,
- GPIO_LAN3_LED0_MODE_MASK,
- GPIO_LAN3_LED0_MODE_MASK
+ GPIO_LAN3_LED1_MODE_MASK,
+ GPIO_LAN3_LED1_MODE_MASK
},
.regmap[1] = {
AIROHA_FUNC_MUX,
@@ -1880,8 +1883,8 @@ static const struct airoha_pinctrl_func_group phy3_led1_func_group[] = {
.regmap[0] = {
AIROHA_FUNC_MUX,
REG_GPIO_2ND_I2C_MODE,
- GPIO_LAN3_LED0_MODE_MASK,
- GPIO_LAN3_LED0_MODE_MASK
+ GPIO_LAN3_LED1_MODE_MASK,
+ GPIO_LAN3_LED1_MODE_MASK
},
.regmap[1] = {
AIROHA_FUNC_MUX,
@@ -1944,8 +1947,8 @@ static const struct airoha_pinctrl_func_group phy4_led1_func_group[] = {
.regmap[0] = {
AIROHA_FUNC_MUX,
REG_GPIO_2ND_I2C_MODE,
- GPIO_LAN3_LED0_MODE_MASK,
- GPIO_LAN3_LED0_MODE_MASK
+ GPIO_LAN3_LED1_MODE_MASK,
+ GPIO_LAN3_LED1_MODE_MASK
},
.regmap[1] = {
AIROHA_FUNC_MUX,
@@ -2696,7 +2699,7 @@ static int airoha_pinconf_get(struct pinctrl_dev *pctrl_dev,
arg = 1;
break;
default:
- return -EOPNOTSUPP;
+ return -ENOTSUPP;
}
*config = pinconf_to_config_packed(param, arg);
@@ -2788,7 +2791,7 @@ static int airoha_pinconf_set(struct pinctrl_dev *pctrl_dev,
break;
}
default:
- return -EOPNOTSUPP;
+ return -ENOTSUPP;
}
}
@@ -2805,10 +2808,10 @@ static int airoha_pinconf_group_get(struct pinctrl_dev *pctrl_dev,
if (airoha_pinconf_get(pctrl_dev,
airoha_pinctrl_groups[group].pins[i],
config))
- return -EOPNOTSUPP;
+ return -ENOTSUPP;
if (i && cur_config != *config)
- return -EOPNOTSUPP;
+ return -ENOTSUPP;
cur_config = *config;
}
diff --git a/drivers/pinctrl/meson/pinctrl-amlogic-a4.c b/drivers/pinctrl/meson/pinctrl-amlogic-a4.c
index e34e984c2b38..6132710aff68 100644
--- a/drivers/pinctrl/meson/pinctrl-amlogic-a4.c
+++ b/drivers/pinctrl/meson/pinctrl-amlogic-a4.c
@@ -1093,7 +1093,7 @@ static const struct of_device_id aml_pctl_of_match[] = {
{ .compatible = "amlogic,pinctrl-s6", .data = &s6_priv_data, },
{ /* sentinel */ }
};
-MODULE_DEVICE_TABLE(of, aml_pctl_dt_match);
+MODULE_DEVICE_TABLE(of, aml_pctl_of_match);
static struct platform_driver aml_pctl_driver = {
.driver = {
diff --git a/drivers/platform/x86/acer-wmi.c b/drivers/platform/x86/acer-wmi.c
index 69336bd778ee..13eb22b35aa8 100644
--- a/drivers/platform/x86/acer-wmi.c
+++ b/drivers/platform/x86/acer-wmi.c
@@ -129,6 +129,7 @@ enum acer_wmi_predator_v4_oc {
enum acer_wmi_gaming_misc_setting {
ACER_WMID_MISC_SETTING_OC_1 = 0x0005,
ACER_WMID_MISC_SETTING_OC_2 = 0x0007,
+ /* Unreliable on some models */
ACER_WMID_MISC_SETTING_SUPPORTED_PROFILES = 0x000A,
ACER_WMID_MISC_SETTING_PLATFORM_PROFILE = 0x000B,
};
@@ -794,9 +795,6 @@ static bool platform_profile_support;
*/
static int last_non_turbo_profile = INT_MIN;
-/* The most performant supported profile */
-static int acer_predator_v4_max_perf;
-
enum acer_predator_v4_thermal_profile {
ACER_PREDATOR_V4_THERMAL_PROFILE_QUIET = 0x00,
ACER_PREDATOR_V4_THERMAL_PROFILE_BALANCED = 0x01,
@@ -2014,7 +2012,7 @@ acer_predator_v4_platform_profile_set(struct device *dev,
if (err)
return err;
- if (tp != acer_predator_v4_max_perf)
+ if (tp != ACER_PREDATOR_V4_THERMAL_PROFILE_TURBO)
last_non_turbo_profile = tp;
return 0;
@@ -2023,55 +2021,14 @@ acer_predator_v4_platform_profile_set(struct device *dev,
static int
acer_predator_v4_platform_profile_probe(void *drvdata, unsigned long *choices)
{
- unsigned long supported_profiles;
- int err;
+ set_bit(PLATFORM_PROFILE_PERFORMANCE, choices);
+ set_bit(PLATFORM_PROFILE_BALANCED_PERFORMANCE, choices);
+ set_bit(PLATFORM_PROFILE_BALANCED, choices);
+ set_bit(PLATFORM_PROFILE_QUIET, choices);
+ set_bit(PLATFORM_PROFILE_LOW_POWER, choices);
- err = WMID_gaming_get_misc_setting(ACER_WMID_MISC_SETTING_SUPPORTED_PROFILES,
- (u8 *)&supported_profiles);
- if (err)
- return err;
-
- /* Iterate through supported profiles in order of increasing performance */
- if (test_bit(ACER_PREDATOR_V4_THERMAL_PROFILE_ECO, &supported_profiles)) {
- set_bit(PLATFORM_PROFILE_LOW_POWER, choices);
- acer_predator_v4_max_perf = ACER_PREDATOR_V4_THERMAL_PROFILE_ECO;
- last_non_turbo_profile = ACER_PREDATOR_V4_THERMAL_PROFILE_ECO;
- }
-
- if (test_bit(ACER_PREDATOR_V4_THERMAL_PROFILE_QUIET, &supported_profiles)) {
- set_bit(PLATFORM_PROFILE_QUIET, choices);
- acer_predator_v4_max_perf = ACER_PREDATOR_V4_THERMAL_PROFILE_QUIET;
- last_non_turbo_profile = ACER_PREDATOR_V4_THERMAL_PROFILE_QUIET;
- }
-
- if (test_bit(ACER_PREDATOR_V4_THERMAL_PROFILE_BALANCED, &supported_profiles)) {
- set_bit(PLATFORM_PROFILE_BALANCED, choices);
- acer_predator_v4_max_perf = ACER_PREDATOR_V4_THERMAL_PROFILE_BALANCED;
- last_non_turbo_profile = ACER_PREDATOR_V4_THERMAL_PROFILE_BALANCED;
- }
-
- if (test_bit(ACER_PREDATOR_V4_THERMAL_PROFILE_PERFORMANCE, &supported_profiles)) {
- set_bit(PLATFORM_PROFILE_BALANCED_PERFORMANCE, choices);
- acer_predator_v4_max_perf = ACER_PREDATOR_V4_THERMAL_PROFILE_PERFORMANCE;
-
- /* We only use this profile as a fallback option in case no prior
- * profile is supported.
- */
- if (last_non_turbo_profile < 0)
- last_non_turbo_profile = ACER_PREDATOR_V4_THERMAL_PROFILE_PERFORMANCE;
- }
-
- if (test_bit(ACER_PREDATOR_V4_THERMAL_PROFILE_TURBO, &supported_profiles)) {
- set_bit(PLATFORM_PROFILE_PERFORMANCE, choices);
- acer_predator_v4_max_perf = ACER_PREDATOR_V4_THERMAL_PROFILE_TURBO;
-
- /* We need to handle the hypothetical case where only the turbo profile
- * is supported. In this case the turbo toggle will essentially be a
- * no-op.
- */
- if (last_non_turbo_profile < 0)
- last_non_turbo_profile = ACER_PREDATOR_V4_THERMAL_PROFILE_TURBO;
- }
+ /* Set default non-turbo profile */
+ last_non_turbo_profile = ACER_PREDATOR_V4_THERMAL_PROFILE_BALANCED;
return 0;
}
@@ -2108,19 +2065,15 @@ static int acer_thermal_profile_change(void)
if (cycle_gaming_thermal_profile) {
platform_profile_cycle();
} else {
- /* Do nothing if no suitable platform profiles where found */
- if (last_non_turbo_profile < 0)
- return 0;
-
err = WMID_gaming_get_misc_setting(
ACER_WMID_MISC_SETTING_PLATFORM_PROFILE, &current_tp);
if (err)
return err;
- if (current_tp == acer_predator_v4_max_perf)
+ if (current_tp == ACER_PREDATOR_V4_THERMAL_PROFILE_TURBO)
tp = last_non_turbo_profile;
else
- tp = acer_predator_v4_max_perf;
+ tp = ACER_PREDATOR_V4_THERMAL_PROFILE_TURBO;
err = WMID_gaming_set_misc_setting(
ACER_WMID_MISC_SETTING_PLATFORM_PROFILE, tp);
@@ -2128,7 +2081,7 @@ static int acer_thermal_profile_change(void)
return err;
/* Store last profile for toggle */
- if (current_tp != acer_predator_v4_max_perf)
+ if (current_tp != ACER_PREDATOR_V4_THERMAL_PROFILE_TURBO)
last_non_turbo_profile = current_tp;
platform_profile_notify(platform_profile_device);
diff --git a/drivers/platform/x86/amd/hfi/hfi.c b/drivers/platform/x86/amd/hfi/hfi.c
index 4f56149b3774..a465ac6f607e 100644
--- a/drivers/platform/x86/amd/hfi/hfi.c
+++ b/drivers/platform/x86/amd/hfi/hfi.c
@@ -385,12 +385,16 @@ static int amd_hfi_metadata_parser(struct platform_device *pdev,
amd_hfi_data->pcct_entry = pcct_entry;
pcct_ext = (struct acpi_pcct_ext_pcc_slave *)pcct_entry;
- if (pcct_ext->length <= 0)
- return -EINVAL;
+ if (pcct_ext->length <= 0) {
+ ret = -EINVAL;
+ goto out;
+ }
amd_hfi_data->shmem = devm_kzalloc(amd_hfi_data->dev, pcct_ext->length, GFP_KERNEL);
- if (!amd_hfi_data->shmem)
- return -ENOMEM;
+ if (!amd_hfi_data->shmem) {
+ ret = -ENOMEM;
+ goto out;
+ }
pcc_chan->shmem_base_addr = pcct_ext->base_address;
pcc_chan->shmem_size = pcct_ext->length;
@@ -398,6 +402,8 @@ static int amd_hfi_metadata_parser(struct platform_device *pdev,
/* parse the shared memory info from the PCCT table */
ret = amd_hfi_fill_metadata(amd_hfi_data);
+out:
+ /* Don't leak any ACPI memory */
acpi_put_table(pcct_tbl);
return ret;
diff --git a/drivers/platform/x86/amd/hsmp/acpi.c b/drivers/platform/x86/amd/hsmp/acpi.c
index 54986a752f7d..a94009203e01 100644
--- a/drivers/platform/x86/amd/hsmp/acpi.c
+++ b/drivers/platform/x86/amd/hsmp/acpi.c
@@ -504,7 +504,7 @@ static int init_acpi(struct device *dev)
dev_set_drvdata(dev, &hsmp_pdev->sock[sock_ind]);
- return ret;
+ return 0;
}
static const struct bin_attribute hsmp_metric_tbl_attr = {
diff --git a/drivers/platform/x86/amd/hsmp/hsmp.c b/drivers/platform/x86/amd/hsmp/hsmp.c
index 885e2f8136fd..19f82c1d3090 100644
--- a/drivers/platform/x86/amd/hsmp/hsmp.c
+++ b/drivers/platform/x86/amd/hsmp/hsmp.c
@@ -356,6 +356,11 @@ ssize_t hsmp_metric_tbl_read(struct hsmp_socket *sock, char *buf, size_t size)
if (!sock || !buf)
return -EINVAL;
+ if (!sock->metric_tbl_addr) {
+ dev_err(sock->dev, "Metrics table address not available\n");
+ return -ENOMEM;
+ }
+
/* Do not support lseek(), also don't allow more than the size of metric table */
if (size != sizeof(struct hsmp_metric_table)) {
dev_err(sock->dev, "Wrong buffer size\n");
diff --git a/drivers/platform/x86/amd/pmc/pmc-quirks.c b/drivers/platform/x86/amd/pmc/pmc-quirks.c
index ded4c84f5ed1..d63aaad7ef59 100644
--- a/drivers/platform/x86/amd/pmc/pmc-quirks.c
+++ b/drivers/platform/x86/amd/pmc/pmc-quirks.c
@@ -28,10 +28,15 @@ static struct quirk_entry quirk_spurious_8042 = {
.spurious_8042 = true,
};
+static struct quirk_entry quirk_s2idle_spurious_8042 = {
+ .s2idle_bug_mmio = FCH_PM_BASE + FCH_PM_SCRATCH,
+ .spurious_8042 = true,
+};
+
static const struct dmi_system_id fwbug_list[] = {
{
.ident = "L14 Gen2 AMD",
- .driver_data = &quirk_s2idle_bug,
+ .driver_data = &quirk_s2idle_spurious_8042,
.matches = {
DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
DMI_MATCH(DMI_PRODUCT_NAME, "20X5"),
@@ -39,7 +44,7 @@ static const struct dmi_system_id fwbug_list[] = {
},
{
.ident = "T14s Gen2 AMD",
- .driver_data = &quirk_s2idle_bug,
+ .driver_data = &quirk_s2idle_spurious_8042,
.matches = {
DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
DMI_MATCH(DMI_PRODUCT_NAME, "20XF"),
@@ -47,7 +52,7 @@ static const struct dmi_system_id fwbug_list[] = {
},
{
.ident = "X13 Gen2 AMD",
- .driver_data = &quirk_s2idle_bug,
+ .driver_data = &quirk_s2idle_spurious_8042,
.matches = {
DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
DMI_MATCH(DMI_PRODUCT_NAME, "20XH"),
@@ -55,7 +60,7 @@ static const struct dmi_system_id fwbug_list[] = {
},
{
.ident = "T14 Gen2 AMD",
- .driver_data = &quirk_s2idle_bug,
+ .driver_data = &quirk_s2idle_spurious_8042,
.matches = {
DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
DMI_MATCH(DMI_PRODUCT_NAME, "20XK"),
@@ -63,7 +68,7 @@ static const struct dmi_system_id fwbug_list[] = {
},
{
.ident = "T14 Gen1 AMD",
- .driver_data = &quirk_s2idle_bug,
+ .driver_data = &quirk_s2idle_spurious_8042,
.matches = {
DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
DMI_MATCH(DMI_PRODUCT_NAME, "20UD"),
@@ -71,7 +76,7 @@ static const struct dmi_system_id fwbug_list[] = {
},
{
.ident = "T14 Gen1 AMD",
- .driver_data = &quirk_s2idle_bug,
+ .driver_data = &quirk_s2idle_spurious_8042,
.matches = {
DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
DMI_MATCH(DMI_PRODUCT_NAME, "20UE"),
@@ -79,7 +84,7 @@ static const struct dmi_system_id fwbug_list[] = {
},
{
.ident = "T14s Gen1 AMD",
- .driver_data = &quirk_s2idle_bug,
+ .driver_data = &quirk_s2idle_spurious_8042,
.matches = {
DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
DMI_MATCH(DMI_PRODUCT_NAME, "20UH"),
@@ -87,7 +92,7 @@ static const struct dmi_system_id fwbug_list[] = {
},
{
.ident = "T14s Gen1 AMD",
- .driver_data = &quirk_s2idle_bug,
+ .driver_data = &quirk_s2idle_spurious_8042,
.matches = {
DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
DMI_MATCH(DMI_PRODUCT_NAME, "20UJ"),
@@ -95,7 +100,7 @@ static const struct dmi_system_id fwbug_list[] = {
},
{
.ident = "P14s Gen1 AMD",
- .driver_data = &quirk_s2idle_bug,
+ .driver_data = &quirk_s2idle_spurious_8042,
.matches = {
DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
DMI_MATCH(DMI_PRODUCT_NAME, "20Y1"),
@@ -103,7 +108,7 @@ static const struct dmi_system_id fwbug_list[] = {
},
{
.ident = "P14s Gen2 AMD",
- .driver_data = &quirk_s2idle_bug,
+ .driver_data = &quirk_s2idle_spurious_8042,
.matches = {
DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
DMI_MATCH(DMI_PRODUCT_NAME, "21A0"),
@@ -111,7 +116,7 @@ static const struct dmi_system_id fwbug_list[] = {
},
{
.ident = "P14s Gen2 AMD",
- .driver_data = &quirk_s2idle_bug,
+ .driver_data = &quirk_s2idle_spurious_8042,
.matches = {
DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
DMI_MATCH(DMI_PRODUCT_NAME, "21A1"),
@@ -152,7 +157,7 @@ static const struct dmi_system_id fwbug_list[] = {
},
{
.ident = "IdeaPad 1 14AMN7",
- .driver_data = &quirk_s2idle_bug,
+ .driver_data = &quirk_s2idle_spurious_8042,
.matches = {
DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
DMI_MATCH(DMI_PRODUCT_NAME, "82VF"),
@@ -160,7 +165,7 @@ static const struct dmi_system_id fwbug_list[] = {
},
{
.ident = "IdeaPad 1 15AMN7",
- .driver_data = &quirk_s2idle_bug,
+ .driver_data = &quirk_s2idle_spurious_8042,
.matches = {
DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
DMI_MATCH(DMI_PRODUCT_NAME, "82VG"),
@@ -168,7 +173,7 @@ static const struct dmi_system_id fwbug_list[] = {
},
{
.ident = "IdeaPad 1 15AMN7",
- .driver_data = &quirk_s2idle_bug,
+ .driver_data = &quirk_s2idle_spurious_8042,
.matches = {
DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
DMI_MATCH(DMI_PRODUCT_NAME, "82X5"),
@@ -176,7 +181,7 @@ static const struct dmi_system_id fwbug_list[] = {
},
{
.ident = "IdeaPad Slim 3 14AMN8",
- .driver_data = &quirk_s2idle_bug,
+ .driver_data = &quirk_s2idle_spurious_8042,
.matches = {
DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
DMI_MATCH(DMI_PRODUCT_NAME, "82XN"),
@@ -184,7 +189,7 @@ static const struct dmi_system_id fwbug_list[] = {
},
{
.ident = "IdeaPad Slim 3 15AMN8",
- .driver_data = &quirk_s2idle_bug,
+ .driver_data = &quirk_s2idle_spurious_8042,
.matches = {
DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
DMI_MATCH(DMI_PRODUCT_NAME, "82XQ"),
@@ -193,7 +198,7 @@ static const struct dmi_system_id fwbug_list[] = {
/* https://gitlab.freedesktop.org/drm/amd/-/issues/4434 */
{
.ident = "Lenovo Yoga 6 13ALC6",
- .driver_data = &quirk_s2idle_bug,
+ .driver_data = &quirk_s2idle_spurious_8042,
.matches = {
DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
DMI_MATCH(DMI_PRODUCT_NAME, "82ND"),
@@ -202,7 +207,7 @@ static const struct dmi_system_id fwbug_list[] = {
/* https://gitlab.freedesktop.org/drm/amd/-/issues/2684 */
{
.ident = "HP Laptop 15s-eq2xxx",
- .driver_data = &quirk_s2idle_bug,
+ .driver_data = &quirk_s2idle_spurious_8042,
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "HP"),
DMI_MATCH(DMI_PRODUCT_NAME, "HP Laptop 15s-eq2xxx"),
@@ -234,6 +239,14 @@ static const struct dmi_system_id fwbug_list[] = {
DMI_MATCH(DMI_BOARD_NAME, "WUJIE14-GX4HRXL"),
}
},
+ {
+ .ident = "MECHREVO Yilong15Pro Series GM5HG7A",
+ .driver_data = &quirk_spurious_8042,
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "MECHREVO"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Yilong15Pro Series GM5HG7A"),
+ }
+ },
/* https://bugzilla.kernel.org/show_bug.cgi?id=220116 */
{
.ident = "PCSpecialist Lafite Pro V 14M",
@@ -243,6 +256,27 @@ static const struct dmi_system_id fwbug_list[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "Lafite Pro V 14M"),
}
},
+ {
+ .ident = "TUXEDO Stellaris Slim 15 AMD Gen6",
+ .driver_data = &quirk_spurious_8042,
+ .matches = {
+ DMI_MATCH(DMI_BOARD_NAME, "GMxHGxx"),
+ }
+ },
+ {
+ .ident = "TUXEDO InfinityBook Pro 14/15 AMD Gen10",
+ .driver_data = &quirk_spurious_8042,
+ .matches = {
+ DMI_MATCH(DMI_BOARD_NAME, "XxHP4NAx"),
+ }
+ },
+ {
+ .ident = "TUXEDO InfinityBook Pro 14/15 AMD Gen10",
+ .driver_data = &quirk_spurious_8042,
+ .matches = {
+ DMI_MATCH(DMI_BOARD_NAME, "XxKK4NAx_XxSP4NAx"),
+ }
+ },
{}
};
@@ -285,6 +319,16 @@ void amd_pmc_quirks_init(struct amd_pmc_dev *dev)
{
const struct dmi_system_id *dmi_id;
+ /*
+ * IRQ1 may cause an interrupt during resume even without a keyboard
+ * press.
+ *
+ * Affects Renoir, Cezanne and Barcelo SoCs
+ *
+ * A solution is available in PMFW 64.66.0, but it must be activated by
+ * SBIOS. If the SBIOS is known to have the fix, a quirk can be added
+ * for a given system to avoid the workaround.
+ */
if (dev->cpu_id == AMD_CPU_ID_CZN)
dev->disable_8042_wakeup = true;
@@ -295,6 +339,5 @@ void amd_pmc_quirks_init(struct amd_pmc_dev *dev)
if (dev->quirks->s2idle_bug_mmio)
pr_info("Using s2idle quirk to avoid %s platform firmware bug\n",
dmi_id->ident);
- if (dev->quirks->spurious_8042)
- dev->disable_8042_wakeup = true;
+ dev->disable_8042_wakeup = dev->quirks->spurious_8042;
}
diff --git a/drivers/platform/x86/amd/pmc/pmc.c b/drivers/platform/x86/amd/pmc/pmc.c
index 0b9b23eb7c2c..bd318fd02ccf 100644
--- a/drivers/platform/x86/amd/pmc/pmc.c
+++ b/drivers/platform/x86/amd/pmc/pmc.c
@@ -530,19 +530,6 @@ static int amd_pmc_get_os_hint(struct amd_pmc_dev *dev)
static int amd_pmc_wa_irq1(struct amd_pmc_dev *pdev)
{
struct device *d;
- int rc;
-
- /* cezanne platform firmware has a fix in 64.66.0 */
- if (pdev->cpu_id == AMD_CPU_ID_CZN) {
- if (!pdev->major) {
- rc = amd_pmc_get_smu_version(pdev);
- if (rc)
- return rc;
- }
-
- if (pdev->major > 64 || (pdev->major == 64 && pdev->minor > 65))
- return 0;
- }
d = bus_find_device_by_name(&serio_bus, NULL, "serio0");
if (!d)
diff --git a/drivers/platform/x86/amd/pmf/core.c b/drivers/platform/x86/amd/pmf/core.c
index ef988605c4da..bc544a4a5266 100644
--- a/drivers/platform/x86/amd/pmf/core.c
+++ b/drivers/platform/x86/amd/pmf/core.c
@@ -403,6 +403,7 @@ static const struct acpi_device_id amd_pmf_acpi_ids[] = {
{"AMDI0103", 0},
{"AMDI0105", 0},
{"AMDI0107", 0},
+ {"AMDI0108", 0},
{ }
};
MODULE_DEVICE_TABLE(acpi, amd_pmf_acpi_ids);
diff --git a/drivers/platform/x86/asus-nb-wmi.c b/drivers/platform/x86/asus-nb-wmi.c
index f84c3d03c1de..6a62bc5b02fd 100644
--- a/drivers/platform/x86/asus-nb-wmi.c
+++ b/drivers/platform/x86/asus-nb-wmi.c
@@ -147,7 +147,12 @@ static struct quirk_entry quirk_asus_ignore_fan = {
};
static struct quirk_entry quirk_asus_zenbook_duo_kbd = {
- .ignore_key_wlan = true,
+ .key_wlan_event = ASUS_WMI_KEY_IGNORE,
+};
+
+static struct quirk_entry quirk_asus_z13 = {
+ .key_wlan_event = ASUS_WMI_KEY_ARMOURY,
+ .tablet_switch_mode = asus_wmi_kbd_dock_devid,
};
static int dmi_matched(const struct dmi_system_id *dmi)
@@ -539,6 +544,15 @@ static const struct dmi_system_id asus_quirks[] = {
},
.driver_data = &quirk_asus_zenbook_duo_kbd,
},
+ {
+ .callback = dmi_matched,
+ .ident = "ASUS ROG Z13",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "ROG Flow Z13"),
+ },
+ .driver_data = &quirk_asus_z13,
+ },
{},
};
@@ -618,6 +632,7 @@ static const struct key_entry asus_nb_wmi_keymap[] = {
{ KE_KEY, 0x93, { KEY_SWITCHVIDEOMODE } }, /* SDSP LCD + CRT + TV + DVI */
{ KE_KEY, 0x95, { KEY_MEDIA } },
{ KE_KEY, 0x99, { KEY_PHONE } }, /* Conflicts with fan mode switch */
+ { KE_KEY, 0x9D, { KEY_FN_F } },
{ KE_KEY, 0xA0, { KEY_SWITCHVIDEOMODE } }, /* SDSP HDMI only */
{ KE_KEY, 0xA1, { KEY_SWITCHVIDEOMODE } }, /* SDSP LCD + HDMI */
{ KE_KEY, 0xA2, { KEY_SWITCHVIDEOMODE } }, /* SDSP CRT + HDMI */
@@ -632,10 +647,13 @@ static const struct key_entry asus_nb_wmi_keymap[] = {
{ KE_IGNORE, 0xC0, }, /* External display connect/disconnect notification */
{ KE_KEY, 0xC4, { KEY_KBDILLUMUP } },
{ KE_KEY, 0xC5, { KEY_KBDILLUMDOWN } },
+ { KE_KEY, 0xCA, { KEY_F13 } }, /* Noise cancelling on Expertbook B9 */
+ { KE_KEY, 0xCB, { KEY_F14 } }, /* Fn+noise-cancel */
{ KE_IGNORE, 0xC6, }, /* Ambient Light Sensor notification */
{ KE_IGNORE, 0xCF, }, /* AC mode */
{ KE_KEY, 0xFA, { KEY_PROG2 } }, /* Lid flip action */
{ KE_KEY, 0xBD, { KEY_PROG2 } }, /* Lid flip action on ROG xflow laptops */
+ { KE_KEY, ASUS_WMI_KEY_ARMOURY, { KEY_PROG3 } },
{ KE_END, 0},
};
@@ -656,10 +674,10 @@ static void asus_nb_wmi_key_filter(struct asus_wmi_driver *asus_wmi, int *code,
*code = ASUS_WMI_KEY_IGNORE;
break;
case 0x5D: /* Wireless console Toggle */
- case 0x5E: /* Wireless console Enable */
- case 0x5F: /* Wireless console Disable */
- if (quirks->ignore_key_wlan)
- *code = ASUS_WMI_KEY_IGNORE;
+ case 0x5E: /* Wireless console Enable / Keyboard Attach, Detach */
+ case 0x5F: /* Wireless console Disable / Special Key */
+ if (quirks->key_wlan_event)
+ *code = quirks->key_wlan_event;
break;
}
}
diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index f7191fdded14..e72a2b5d158e 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -5088,16 +5088,22 @@ static int asus_wmi_probe(struct platform_device *pdev)
asus_s2idle_check_register();
- return asus_wmi_add(pdev);
+ ret = asus_wmi_add(pdev);
+ if (ret)
+ asus_s2idle_check_unregister();
+
+ return ret;
}
static bool used;
+static DEFINE_MUTEX(register_mutex);
int __init_or_module asus_wmi_register_driver(struct asus_wmi_driver *driver)
{
struct platform_driver *platform_driver;
struct platform_device *platform_device;
+ guard(mutex)(&register_mutex);
if (used)
return -EBUSY;
@@ -5120,6 +5126,7 @@ EXPORT_SYMBOL_GPL(asus_wmi_register_driver);
void asus_wmi_unregister_driver(struct asus_wmi_driver *driver)
{
+ guard(mutex)(&register_mutex);
asus_s2idle_check_unregister();
platform_device_unregister(driver->platform_device);
diff --git a/drivers/platform/x86/asus-wmi.h b/drivers/platform/x86/asus-wmi.h
index 018dfde4025e..5cd4392b964e 100644
--- a/drivers/platform/x86/asus-wmi.h
+++ b/drivers/platform/x86/asus-wmi.h
@@ -18,6 +18,7 @@
#include <linux/i8042.h>
#define ASUS_WMI_KEY_IGNORE (-1)
+#define ASUS_WMI_KEY_ARMOURY 0xffff01
#define ASUS_WMI_BRN_DOWN 0x2e
#define ASUS_WMI_BRN_UP 0x2f
@@ -40,7 +41,7 @@ struct quirk_entry {
bool wmi_force_als_set;
bool wmi_ignore_fan;
bool filter_i8042_e1_extended_codes;
- bool ignore_key_wlan;
+ int key_wlan_event;
enum asus_wmi_tablet_switch_mode tablet_switch_mode;
int wapf;
/*
diff --git a/drivers/platform/x86/dell/dell-lis3lv02d.c b/drivers/platform/x86/dell/dell-lis3lv02d.c
index 732de5f556f8..77905a9ddde9 100644
--- a/drivers/platform/x86/dell/dell-lis3lv02d.c
+++ b/drivers/platform/x86/dell/dell-lis3lv02d.c
@@ -48,6 +48,7 @@ static const struct dmi_system_id lis3lv02d_devices[] __initconst = {
DELL_LIS3LV02D_DMI_ENTRY("Latitude 5500", 0x29),
DELL_LIS3LV02D_DMI_ENTRY("Latitude E6330", 0x29),
DELL_LIS3LV02D_DMI_ENTRY("Latitude E6430", 0x29),
+ DELL_LIS3LV02D_DMI_ENTRY("Latitude E6530", 0x29),
DELL_LIS3LV02D_DMI_ENTRY("Precision 3540", 0x29),
DELL_LIS3LV02D_DMI_ENTRY("Precision 3551", 0x29),
DELL_LIS3LV02D_DMI_ENTRY("Precision M6800", 0x29),
diff --git a/drivers/platform/x86/dell/dell-pc.c b/drivers/platform/x86/dell/dell-pc.c
index 48cc7511905a..becdd9aaef29 100644
--- a/drivers/platform/x86/dell/dell-pc.c
+++ b/drivers/platform/x86/dell/dell-pc.c
@@ -228,6 +228,8 @@ static int thermal_platform_profile_get(struct device *dev,
static int thermal_platform_profile_probe(void *drvdata, unsigned long *choices)
{
+ int current_mode;
+
if (supported_modes & DELL_QUIET)
__set_bit(PLATFORM_PROFILE_QUIET, choices);
if (supported_modes & DELL_COOL_BOTTOM)
@@ -237,6 +239,13 @@ static int thermal_platform_profile_probe(void *drvdata, unsigned long *choices)
if (supported_modes & DELL_PERFORMANCE)
__set_bit(PLATFORM_PROFILE_PERFORMANCE, choices);
+ /* Make sure that ACPI is in sync with the profile set by USTT */
+ current_mode = thermal_get_mode();
+ if (current_mode < 0)
+ return current_mode;
+
+ thermal_set_mode(current_mode);
+
return 0;
}
diff --git a/drivers/platform/x86/dell/dell-smbios-base.c b/drivers/platform/x86/dell/dell-smbios-base.c
index 01c72b91a50d..444786102f02 100644
--- a/drivers/platform/x86/dell/dell-smbios-base.c
+++ b/drivers/platform/x86/dell/dell-smbios-base.c
@@ -39,6 +39,7 @@ struct token_sysfs_data {
struct smbios_device {
struct list_head list;
struct device *device;
+ int priority;
int (*call_fn)(struct calling_interface_buffer *arg);
};
@@ -145,7 +146,7 @@ int dell_smbios_error(int value)
}
EXPORT_SYMBOL_GPL(dell_smbios_error);
-int dell_smbios_register_device(struct device *d, void *call_fn)
+int dell_smbios_register_device(struct device *d, int priority, void *call_fn)
{
struct smbios_device *priv;
@@ -154,6 +155,7 @@ int dell_smbios_register_device(struct device *d, void *call_fn)
return -ENOMEM;
get_device(d);
priv->device = d;
+ priv->priority = priority;
priv->call_fn = call_fn;
mutex_lock(&smbios_mutex);
list_add_tail(&priv->list, &smbios_device_list);
@@ -292,28 +294,25 @@ EXPORT_SYMBOL_GPL(dell_smbios_call_filter);
int dell_smbios_call(struct calling_interface_buffer *buffer)
{
- int (*call_fn)(struct calling_interface_buffer *) = NULL;
- struct device *selected_dev = NULL;
+ struct smbios_device *selected = NULL;
struct smbios_device *priv;
int ret;
mutex_lock(&smbios_mutex);
list_for_each_entry(priv, &smbios_device_list, list) {
- if (!selected_dev || priv->device->id >= selected_dev->id) {
- dev_dbg(priv->device, "Trying device ID: %d\n",
- priv->device->id);
- call_fn = priv->call_fn;
- selected_dev = priv->device;
+ if (!selected || priv->priority >= selected->priority) {
+ dev_dbg(priv->device, "Trying device ID: %d\n", priv->priority);
+ selected = priv;
}
}
- if (!selected_dev) {
+ if (!selected) {
ret = -ENODEV;
pr_err("No dell-smbios drivers are loaded\n");
goto out_smbios_call;
}
- ret = call_fn(buffer);
+ ret = selected->call_fn(buffer);
out_smbios_call:
mutex_unlock(&smbios_mutex);
diff --git a/drivers/platform/x86/dell/dell-smbios-smm.c b/drivers/platform/x86/dell/dell-smbios-smm.c
index 4d375985c85f..7055e2c40f34 100644
--- a/drivers/platform/x86/dell/dell-smbios-smm.c
+++ b/drivers/platform/x86/dell/dell-smbios-smm.c
@@ -125,8 +125,7 @@ int init_dell_smbios_smm(void)
if (ret)
goto fail_platform_device_add;
- ret = dell_smbios_register_device(&platform_device->dev,
- &dell_smbios_smm_call);
+ ret = dell_smbios_register_device(&platform_device->dev, 0, &dell_smbios_smm_call);
if (ret)
goto fail_register;
diff --git a/drivers/platform/x86/dell/dell-smbios-wmi.c b/drivers/platform/x86/dell/dell-smbios-wmi.c
index ae9012549560..a7dca8c59d60 100644
--- a/drivers/platform/x86/dell/dell-smbios-wmi.c
+++ b/drivers/platform/x86/dell/dell-smbios-wmi.c
@@ -264,9 +264,7 @@ static int dell_smbios_wmi_probe(struct wmi_device *wdev, const void *context)
if (ret)
return ret;
- /* ID is used by dell-smbios to set priority of drivers */
- wdev->dev.id = 1;
- ret = dell_smbios_register_device(&wdev->dev, &dell_smbios_wmi_call);
+ ret = dell_smbios_register_device(&wdev->dev, 1, &dell_smbios_wmi_call);
if (ret)
return ret;
diff --git a/drivers/platform/x86/dell/dell-smbios.h b/drivers/platform/x86/dell/dell-smbios.h
index 77baa15eb523..f421b8533a9e 100644
--- a/drivers/platform/x86/dell/dell-smbios.h
+++ b/drivers/platform/x86/dell/dell-smbios.h
@@ -64,7 +64,7 @@ struct calling_interface_structure {
struct calling_interface_token tokens[];
} __packed;
-int dell_smbios_register_device(struct device *d, void *call_fn);
+int dell_smbios_register_device(struct device *d, int priority, void *call_fn);
void dell_smbios_unregister_device(struct device *d);
int dell_smbios_error(int value);
diff --git a/drivers/platform/x86/hp/hp-wmi.c b/drivers/platform/x86/hp/hp-wmi.c
index db5fdee2109c..8b3533d6ba09 100644
--- a/drivers/platform/x86/hp/hp-wmi.c
+++ b/drivers/platform/x86/hp/hp-wmi.c
@@ -92,9 +92,9 @@ static const char * const victus_thermal_profile_boards[] = {
"8A25"
};
-/* DMI Board names of Victus 16-s1000 laptops */
+/* DMI Board names of Victus 16-r1000 and Victus 16-s1000 laptops */
static const char * const victus_s_thermal_profile_boards[] = {
- "8C9C"
+ "8C99", "8C9C"
};
enum hp_wmi_radio {
@@ -122,6 +122,7 @@ enum hp_wmi_event_ids {
HPWMI_BATTERY_CHARGE_PERIOD = 0x10,
HPWMI_SANITIZATION_MODE = 0x17,
HPWMI_CAMERA_TOGGLE = 0x1A,
+ HPWMI_FN_P_HOTKEY = 0x1B,
HPWMI_OMEN_KEY = 0x1D,
HPWMI_SMART_EXPERIENCE_APP = 0x21,
};
@@ -981,6 +982,9 @@ static void hp_wmi_notify(union acpi_object *obj, void *context)
key_code, 1, true))
pr_info("Unknown key code - 0x%x\n", key_code);
break;
+ case HPWMI_FN_P_HOTKEY:
+ platform_profile_cycle();
+ break;
case HPWMI_OMEN_KEY:
if (event_data) /* Only should be true for HP Omen */
key_code = event_data;
diff --git a/drivers/platform/x86/intel/int3472/discrete.c b/drivers/platform/x86/intel/int3472/discrete.c
index 4c0aed6e626f..bdfb8a800c54 100644
--- a/drivers/platform/x86/intel/int3472/discrete.c
+++ b/drivers/platform/x86/intel/int3472/discrete.c
@@ -193,6 +193,10 @@ static void int3472_get_con_id_and_polarity(struct int3472_discrete_device *int3
*con_id = "privacy-led";
*gpio_flags = GPIO_ACTIVE_HIGH;
break;
+ case INT3472_GPIO_TYPE_HOTPLUG_DETECT:
+ *con_id = "hpd";
+ *gpio_flags = GPIO_ACTIVE_HIGH;
+ break;
case INT3472_GPIO_TYPE_POWER_ENABLE:
*con_id = "avdd";
*gpio_flags = GPIO_ACTIVE_HIGH;
@@ -223,6 +227,7 @@ static void int3472_get_con_id_and_polarity(struct int3472_discrete_device *int3
* 0x0b Power enable
* 0x0c Clock enable
* 0x0d Privacy LED
+ * 0x13 Hotplug detect
*
* There are some known platform specific quirks where that does not quite
* hold up; for example where a pin with type 0x01 (Power down) is mapped to
@@ -292,6 +297,7 @@ static int skl_int3472_handle_gpio_resources(struct acpi_resource *ares,
switch (type) {
case INT3472_GPIO_TYPE_RESET:
case INT3472_GPIO_TYPE_POWERDOWN:
+ case INT3472_GPIO_TYPE_HOTPLUG_DETECT:
ret = skl_int3472_map_gpio_to_sensor(int3472, agpio, con_id, gpio_flags);
if (ret)
err_msg = "Failed to map GPIO pin to sensor\n";
diff --git a/drivers/platform/x86/intel/pmc/core.c b/drivers/platform/x86/intel/pmc/core.c
index 540cd2fb0673..d040290e80ff 100644
--- a/drivers/platform/x86/intel/pmc/core.c
+++ b/drivers/platform/x86/intel/pmc/core.c
@@ -1625,6 +1625,7 @@ static const struct x86_cpu_id intel_pmc_core_ids[] = {
X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &tgl_l_pmc_dev),
X86_MATCH_VFM(INTEL_RAPTORLAKE, &adl_pmc_dev),
X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &adl_pmc_dev),
+ X86_MATCH_VFM(INTEL_BARTLETTLAKE, &adl_pmc_dev),
X86_MATCH_VFM(INTEL_METEORLAKE_L, &mtl_pmc_dev),
X86_MATCH_VFM(INTEL_ARROWLAKE, &arl_pmc_dev),
X86_MATCH_VFM(INTEL_ARROWLAKE_H, &arl_h_pmc_dev),
diff --git a/drivers/platform/x86/intel/speed_select_if/isst_if_common.c b/drivers/platform/x86/intel/speed_select_if/isst_if_common.c
index 71e104a068e9..7449873c3d40 100644
--- a/drivers/platform/x86/intel/speed_select_if/isst_if_common.c
+++ b/drivers/platform/x86/intel/speed_select_if/isst_if_common.c
@@ -790,7 +790,7 @@ static const struct x86_cpu_id isst_cpu_ids[] = {
X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, SST_HPM_SUPPORTED),
X86_MATCH_VFM(INTEL_ICELAKE_D, 0),
X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
- X86_MATCH_VFM(INTEL_PANTHERCOVE_X, SST_HPM_SUPPORTED),
+ X86_MATCH_VFM(INTEL_DIAMONDRAPIDS_X, SST_HPM_SUPPORTED),
X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0),
X86_MATCH_VFM(INTEL_SKYLAKE_X, SST_MBOX_SUPPORTED),
{}
diff --git a/drivers/platform/x86/intel/tpmi_power_domains.c b/drivers/platform/x86/intel/tpmi_power_domains.c
index 9d8247bb9cfa..7d93119a4c30 100644
--- a/drivers/platform/x86/intel/tpmi_power_domains.c
+++ b/drivers/platform/x86/intel/tpmi_power_domains.c
@@ -85,7 +85,7 @@ static const struct x86_cpu_id tpmi_cpu_ids[] = {
X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, NULL),
X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, NULL),
X86_MATCH_VFM(INTEL_GRANITERAPIDS_D, NULL),
- X86_MATCH_VFM(INTEL_PANTHERCOVE_X, NULL),
+ X86_MATCH_VFM(INTEL_DIAMONDRAPIDS_X, NULL),
{}
};
MODULE_DEVICE_TABLE(x86cpu, tpmi_cpu_ids);
@@ -178,7 +178,7 @@ static int tpmi_get_logical_id(unsigned int cpu, struct tpmi_cpu_info *info)
info->punit_thread_id = FIELD_GET(LP_ID_MASK, data);
info->punit_core_id = FIELD_GET(MODULE_ID_MASK, data);
- info->pkg_id = topology_physical_package_id(cpu);
+ info->pkg_id = topology_logical_package_id(cpu);
info->linux_cpu = cpu;
return 0;
diff --git a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c
index 6df55c8e16b7..bfcf92aa4d69 100644
--- a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c
+++ b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c
@@ -192,9 +192,14 @@ static int uncore_read_control_freq(struct uncore_data *data, unsigned int *valu
static int write_eff_lat_ctrl(struct uncore_data *data, unsigned int val, enum uncore_index index)
{
struct tpmi_uncore_cluster_info *cluster_info;
+ struct tpmi_uncore_struct *uncore_root;
u64 control;
cluster_info = container_of(data, struct tpmi_uncore_cluster_info, uncore_data);
+ uncore_root = cluster_info->uncore_root;
+
+ if (uncore_root->write_blocked)
+ return -EPERM;
if (cluster_info->root_domain)
return -ENODATA;
diff --git a/drivers/platform/x86/lg-laptop.c b/drivers/platform/x86/lg-laptop.c
index 4b57102c7f62..6af6cf477c5b 100644
--- a/drivers/platform/x86/lg-laptop.c
+++ b/drivers/platform/x86/lg-laptop.c
@@ -8,6 +8,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/acpi.h>
+#include <linux/bitfield.h>
#include <linux/bits.h>
#include <linux/device.h>
#include <linux/dev_printk.h>
@@ -75,6 +76,9 @@ MODULE_PARM_DESC(fw_debug, "Enable printing of firmware debug messages");
#define WMBB_USB_CHARGE 0x10B
#define WMBB_BATT_LIMIT 0x10C
+#define FAN_MODE_LOWER GENMASK(1, 0)
+#define FAN_MODE_UPPER GENMASK(5, 4)
+
#define PLATFORM_NAME "lg-laptop"
MODULE_ALIAS("wmi:" WMI_EVENT_GUID0);
@@ -274,29 +278,19 @@ static ssize_t fan_mode_store(struct device *dev,
struct device_attribute *attr,
const char *buffer, size_t count)
{
- bool value;
+ unsigned long value;
union acpi_object *r;
- u32 m;
int ret;
- ret = kstrtobool(buffer, &value);
+ ret = kstrtoul(buffer, 10, &value);
if (ret)
return ret;
+ if (value >= 3)
+ return -EINVAL;
- r = lg_wmab(dev, WM_FAN_MODE, WM_GET, 0);
- if (!r)
- return -EIO;
-
- if (r->type != ACPI_TYPE_INTEGER) {
- kfree(r);
- return -EIO;
- }
-
- m = r->integer.value;
- kfree(r);
- r = lg_wmab(dev, WM_FAN_MODE, WM_SET, (m & 0xffffff0f) | (value << 4));
- kfree(r);
- r = lg_wmab(dev, WM_FAN_MODE, WM_SET, (m & 0xfffffff0) | value);
+ r = lg_wmab(dev, WM_FAN_MODE, WM_SET,
+ FIELD_PREP(FAN_MODE_LOWER, value) |
+ FIELD_PREP(FAN_MODE_UPPER, value));
kfree(r);
return count;
@@ -305,7 +299,7 @@ static ssize_t fan_mode_store(struct device *dev,
static ssize_t fan_mode_show(struct device *dev,
struct device_attribute *attr, char *buffer)
{
- unsigned int status;
+ unsigned int mode;
union acpi_object *r;
r = lg_wmab(dev, WM_FAN_MODE, WM_GET, 0);
@@ -317,10 +311,10 @@ static ssize_t fan_mode_show(struct device *dev,
return -EIO;
}
- status = r->integer.value & 0x01;
+ mode = FIELD_GET(FAN_MODE_LOWER, r->integer.value);
kfree(r);
- return sysfs_emit(buffer, "%d\n", status);
+ return sysfs_emit(buffer, "%d\n", mode);
}
static ssize_t usb_charge_store(struct device *dev,
diff --git a/drivers/platform/x86/oxpec.c b/drivers/platform/x86/oxpec.c
index eb076bb4099b..54377b282ff8 100644
--- a/drivers/platform/x86/oxpec.c
+++ b/drivers/platform/x86/oxpec.c
@@ -126,6 +126,13 @@ static const struct dmi_system_id dmi_table[] = {
},
{
.matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "AOKZOE"),
+ DMI_EXACT_MATCH(DMI_BOARD_NAME, "AOKZOE A1X"),
+ },
+ .driver_data = (void *)oxp_fly,
+ },
+ {
+ .matches = {
DMI_MATCH(DMI_BOARD_VENDOR, "AYANEO"),
DMI_MATCH(DMI_BOARD_NAME, "AYANEO 2"),
},
@@ -306,6 +313,13 @@ static const struct dmi_system_id dmi_table[] = {
},
.driver_data = (void *)oxp_x1,
},
+ {
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "ONE-NETBOOK"),
+ DMI_EXACT_MATCH(DMI_BOARD_NAME, "ONEXPLAYER X1Pro EVA-02"),
+ },
+ .driver_data = (void *)oxp_x1,
+ },
{},
};
diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c
index 0006ab3d0789..61c2277c9ce3 100644
--- a/drivers/pmdomain/core.c
+++ b/drivers/pmdomain/core.c
@@ -187,6 +187,7 @@ static const struct genpd_lock_ops genpd_raw_spin_ops = {
#define genpd_is_opp_table_fw(genpd) (genpd->flags & GENPD_FLAG_OPP_TABLE_FW)
#define genpd_is_dev_name_fw(genpd) (genpd->flags & GENPD_FLAG_DEV_NAME_FW)
#define genpd_is_no_sync_state(genpd) (genpd->flags & GENPD_FLAG_NO_SYNC_STATE)
+#define genpd_is_no_stay_on(genpd) (genpd->flags & GENPD_FLAG_NO_STAY_ON)
static inline bool irq_safe_dev_in_sleep_domain(struct device *dev,
const struct generic_pm_domain *genpd)
@@ -1357,7 +1358,6 @@ err_poweroff:
return ret;
}
-#ifndef CONFIG_PM_GENERIC_DOMAINS_OF
static bool pd_ignore_unused;
static int __init pd_ignore_unused_setup(char *__unused)
{
@@ -1382,9 +1382,6 @@ static int __init genpd_power_off_unused(void)
mutex_lock(&gpd_list_lock);
list_for_each_entry(genpd, &gpd_list, gpd_list_node) {
- genpd_lock(genpd);
- genpd->stay_on = false;
- genpd_unlock(genpd);
genpd_queue_power_off_work(genpd);
}
@@ -1393,7 +1390,6 @@ static int __init genpd_power_off_unused(void)
return 0;
}
late_initcall_sync(genpd_power_off_unused);
-#endif
#ifdef CONFIG_PM_SLEEP
@@ -2367,6 +2363,18 @@ static void genpd_lock_init(struct generic_pm_domain *genpd)
}
}
+#ifdef CONFIG_PM_GENERIC_DOMAINS_OF
+static void genpd_set_stay_on(struct generic_pm_domain *genpd, bool is_off)
+{
+ genpd->stay_on = !genpd_is_no_stay_on(genpd) && !is_off;
+}
+#else
+static void genpd_set_stay_on(struct generic_pm_domain *genpd, bool is_off)
+{
+ genpd->stay_on = false;
+}
+#endif
+
/**
* pm_genpd_init - Initialize a generic I/O PM domain object.
* @genpd: PM domain object to initialize.
@@ -2392,7 +2400,7 @@ int pm_genpd_init(struct generic_pm_domain *genpd,
INIT_WORK(&genpd->power_off_work, genpd_power_off_work_fn);
atomic_set(&genpd->sd_count, 0);
genpd->status = is_off ? GENPD_STATE_OFF : GENPD_STATE_ON;
- genpd->stay_on = !is_off;
+ genpd_set_stay_on(genpd, is_off);
genpd->sync_state = GENPD_SYNC_STATE_OFF;
genpd->device_count = 0;
genpd->provider = NULL;
diff --git a/drivers/pmdomain/mediatek/mt8195-pm-domains.h b/drivers/pmdomain/mediatek/mt8195-pm-domains.h
index 59aa031ae632..414dfe82361f 100644
--- a/drivers/pmdomain/mediatek/mt8195-pm-domains.h
+++ b/drivers/pmdomain/mediatek/mt8195-pm-domains.h
@@ -123,6 +123,7 @@ static const struct scpsys_domain_data scpsys_domain_data_mt8195[] = {
MT8195_TOP_AXI_PROT_EN_2_CLR,
MT8195_TOP_AXI_PROT_EN_2_STA1),
},
+ .caps = MTK_SCPD_KEEP_DEFAULT_OFF | MTK_SCPD_ACTIVE_WAKEUP,
},
[MT8195_POWER_DOMAIN_MFG0] = {
.name = "mfg0",
diff --git a/drivers/pmdomain/renesas/rcar-gen4-sysc.c b/drivers/pmdomain/renesas/rcar-gen4-sysc.c
index 5aa7fa1df8fe..7434bf42d215 100644
--- a/drivers/pmdomain/renesas/rcar-gen4-sysc.c
+++ b/drivers/pmdomain/renesas/rcar-gen4-sysc.c
@@ -251,6 +251,7 @@ static int __init rcar_gen4_sysc_pd_setup(struct rcar_gen4_sysc_pd *pd)
genpd->detach_dev = cpg_mssr_detach_dev;
}
+ genpd->flags |= GENPD_FLAG_NO_STAY_ON;
genpd->power_off = rcar_gen4_sysc_pd_power_off;
genpd->power_on = rcar_gen4_sysc_pd_power_on;
diff --git a/drivers/pmdomain/renesas/rcar-sysc.c b/drivers/pmdomain/renesas/rcar-sysc.c
index 4b310c1d35fa..d8a8ffcde38d 100644
--- a/drivers/pmdomain/renesas/rcar-sysc.c
+++ b/drivers/pmdomain/renesas/rcar-sysc.c
@@ -241,6 +241,7 @@ static int __init rcar_sysc_pd_setup(struct rcar_sysc_pd *pd)
}
}
+ genpd->flags |= GENPD_FLAG_NO_STAY_ON;
genpd->power_off = rcar_sysc_pd_power_off;
genpd->power_on = rcar_sysc_pd_power_on;
@@ -342,7 +343,7 @@ struct rcar_pm_domains {
};
static struct genpd_onecell_data *rcar_sysc_onecell_data;
-static struct device_node *rcar_sysc_onecell_np;
+static struct device_node *rcar_sysc_onecell_np __initdata = NULL;
static int __init rcar_sysc_pd_init(void)
{
diff --git a/drivers/pmdomain/renesas/rmobile-sysc.c b/drivers/pmdomain/renesas/rmobile-sysc.c
index 8eedc9a1d825..a6bf7295e909 100644
--- a/drivers/pmdomain/renesas/rmobile-sysc.c
+++ b/drivers/pmdomain/renesas/rmobile-sysc.c
@@ -100,7 +100,8 @@ static void rmobile_init_pm_domain(struct rmobile_pm_domain *rmobile_pd)
struct generic_pm_domain *genpd = &rmobile_pd->genpd;
struct dev_power_governor *gov = rmobile_pd->gov;
- genpd->flags |= GENPD_FLAG_PM_CLK | GENPD_FLAG_ACTIVE_WAKEUP;
+ genpd->flags |= GENPD_FLAG_PM_CLK | GENPD_FLAG_ACTIVE_WAKEUP |
+ GENPD_FLAG_NO_STAY_ON;
genpd->attach_dev = cpg_mstp_attach_dev;
genpd->detach_dev = cpg_mstp_detach_dev;
diff --git a/drivers/pmdomain/rockchip/pm-domains.c b/drivers/pmdomain/rockchip/pm-domains.c
index 242570c505fb..1955c6d453e4 100644
--- a/drivers/pmdomain/rockchip/pm-domains.c
+++ b/drivers/pmdomain/rockchip/pm-domains.c
@@ -865,7 +865,7 @@ static int rockchip_pm_add_one_domain(struct rockchip_pmu *pmu,
pd->genpd.power_on = rockchip_pd_power_on;
pd->genpd.attach_dev = rockchip_pd_attach_dev;
pd->genpd.detach_dev = rockchip_pd_detach_dev;
- pd->genpd.flags = GENPD_FLAG_PM_CLK;
+ pd->genpd.flags = GENPD_FLAG_PM_CLK | GENPD_FLAG_NO_STAY_ON;
if (pd_info->active_wakeup)
pd->genpd.flags |= GENPD_FLAG_ACTIVE_WAKEUP;
pm_genpd_init(&pd->genpd, NULL,
diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c
index 93dcebbe1141..ad2d9ecf32a5 100644
--- a/drivers/power/supply/bq27xxx_battery.c
+++ b/drivers/power/supply/bq27xxx_battery.c
@@ -1919,8 +1919,8 @@ static void bq27xxx_battery_update_unlocked(struct bq27xxx_device_info *di)
bool has_singe_flag = di->opts & BQ27XXX_O_ZERO;
cache.flags = bq27xxx_read(di, BQ27XXX_REG_FLAGS, has_singe_flag);
- if ((cache.flags & 0xff) == 0xff)
- cache.flags = -1; /* read error */
+ if (di->chip == BQ27000 && (cache.flags & 0xff) == 0xff)
+ cache.flags = -ENODEV; /* bq27000 hdq read error */
if (cache.flags >= 0) {
cache.capacity = bq27xxx_battery_read_soc(di);
diff --git a/drivers/ps3/ps3stor_lib.c b/drivers/ps3/ps3stor_lib.c
index a12a1ad9b5fe..3d4d343ee0c8 100644
--- a/drivers/ps3/ps3stor_lib.c
+++ b/drivers/ps3/ps3stor_lib.c
@@ -8,6 +8,7 @@
#include <linux/dma-mapping.h>
#include <linux/module.h>
+#include <linux/string_choices.h>
#include <asm/lv1call.h>
#include <asm/ps3stor.h>
@@ -265,7 +266,7 @@ u64 ps3stor_read_write_sectors(struct ps3_storage_device *dev, u64 lpar,
u64 start_sector, u64 sectors, int write)
{
unsigned int region_id = dev->regions[dev->region_idx].id;
- const char *op = write ? "write" : "read";
+ const char *op = str_write_read(write);
int res;
dev_dbg(&dev->sbd.core, "%s:%u: %s %llu sectors starting at %llu\n",
diff --git a/drivers/ptp/ptp_ocp.c b/drivers/ptp/ptp_ocp.c
index d39073dc4072..4e1286ce05c9 100644
--- a/drivers/ptp/ptp_ocp.c
+++ b/drivers/ptp/ptp_ocp.c
@@ -4557,8 +4557,7 @@ ptp_ocp_detach(struct ptp_ocp *bp)
ptp_ocp_debugfs_remove_device(bp);
ptp_ocp_detach_sysfs(bp);
ptp_ocp_attr_group_del(bp);
- if (timer_pending(&bp->watchdog))
- timer_delete_sync(&bp->watchdog);
+ timer_delete_sync(&bp->watchdog);
if (bp->ts0)
ptp_ocp_unregister_ext(bp->ts0);
if (bp->ts1)
diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h
index a6aad743c282..b352df4cd3f9 100644
--- a/drivers/ptp/ptp_private.h
+++ b/drivers/ptp/ptp_private.h
@@ -24,6 +24,11 @@
#define PTP_DEFAULT_MAX_VCLOCKS 20
#define PTP_MAX_CHANNELS 2048
+enum {
+ PTP_LOCK_PHYSICAL = 0,
+ PTP_LOCK_VIRTUAL,
+};
+
struct timestamp_event_queue {
struct ptp_extts_event buf[PTP_MAX_TIMESTAMPS];
int head;
diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c
index 2fdeedd60e21..64c950456517 100644
--- a/drivers/ptp/ptp_vclock.c
+++ b/drivers/ptp/ptp_vclock.c
@@ -154,6 +154,11 @@ static long ptp_vclock_refresh(struct ptp_clock_info *ptp)
return PTP_VCLOCK_REFRESH_INTERVAL;
}
+static void ptp_vclock_set_subclass(struct ptp_clock *ptp)
+{
+ lockdep_set_subclass(&ptp->clock.rwsem, PTP_LOCK_VIRTUAL);
+}
+
static const struct ptp_clock_info ptp_vclock_info = {
.owner = THIS_MODULE,
.name = "ptp virtual clock",
@@ -213,6 +218,8 @@ struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock)
return NULL;
}
+ ptp_vclock_set_subclass(vclock->clock);
+
timecounter_init(&vclock->tc, &vclock->cc, 0);
ptp_schedule_worker(vclock->clock, PTP_VCLOCK_REFRESH_INTERVAL);
diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
index a6e4792a1b2e..ac0e132ccc3e 100644
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -51,6 +51,7 @@ void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id,
{
trace_non_standard_event(sec_type, fru_id, fru_text, sev, err, len);
}
+EXPORT_SYMBOL_GPL(log_non_standard_event);
void log_arm_hw_error(struct cper_sec_proc_arm *err)
{
diff --git a/drivers/regulator/pca9450-regulator.c b/drivers/regulator/pca9450-regulator.c
index feadb21a8f30..4be270f4d6c3 100644
--- a/drivers/regulator/pca9450-regulator.c
+++ b/drivers/regulator/pca9450-regulator.c
@@ -40,7 +40,6 @@ struct pca9450 {
struct device *dev;
struct regmap *regmap;
struct gpio_desc *sd_vsel_gpio;
- struct notifier_block restart_nb;
enum pca9450_chip_type type;
unsigned int rcnt;
int irq;
@@ -1100,10 +1099,9 @@ static irqreturn_t pca9450_irq_handler(int irq, void *data)
return IRQ_HANDLED;
}
-static int pca9450_i2c_restart_handler(struct notifier_block *nb,
- unsigned long action, void *data)
+static int pca9450_i2c_restart_handler(struct sys_off_data *data)
{
- struct pca9450 *pca9450 = container_of(nb, struct pca9450, restart_nb);
+ struct pca9450 *pca9450 = data->cb_data;
struct i2c_client *i2c = container_of(pca9450->dev, struct i2c_client, dev);
dev_dbg(&i2c->dev, "Restarting device..\n");
@@ -1261,10 +1259,9 @@ static int pca9450_i2c_probe(struct i2c_client *i2c)
pca9450->sd_vsel_fixed_low =
of_property_read_bool(ldo5->dev.of_node, "nxp,sd-vsel-fixed-low");
- pca9450->restart_nb.notifier_call = pca9450_i2c_restart_handler;
- pca9450->restart_nb.priority = PCA9450_RESTART_HANDLER_PRIORITY;
-
- if (register_restart_handler(&pca9450->restart_nb))
+ if (devm_register_sys_off_handler(&i2c->dev, SYS_OFF_MODE_RESTART,
+ PCA9450_RESTART_HANDLER_PRIORITY,
+ pca9450_i2c_restart_handler, pca9450))
dev_warn(&i2c->dev, "Failed to register restart handler\n");
dev_info(&i2c->dev, "%s probed.\n",
diff --git a/drivers/regulator/qcom-pm8008-regulator.c b/drivers/regulator/qcom-pm8008-regulator.c
index da017c1969d0..90c78ee1c37b 100644
--- a/drivers/regulator/qcom-pm8008-regulator.c
+++ b/drivers/regulator/qcom-pm8008-regulator.c
@@ -96,7 +96,7 @@ static int pm8008_regulator_get_voltage_sel(struct regulator_dev *rdev)
uV = le16_to_cpu(val) * 1000;
- return (uV - preg->desc.min_uV) / preg->desc.uV_step;
+ return regulator_map_voltage_linear_range(rdev, uV, INT_MAX);
}
static const struct regulator_ops pm8008_regulator_ops = {
diff --git a/drivers/regulator/sy7636a-regulator.c b/drivers/regulator/sy7636a-regulator.c
index d1e7ba1fb3e1..27e3d939b7bb 100644
--- a/drivers/regulator/sy7636a-regulator.c
+++ b/drivers/regulator/sy7636a-regulator.c
@@ -83,9 +83,11 @@ static int sy7636a_regulator_probe(struct platform_device *pdev)
if (!regmap)
return -EPROBE_DEFER;
- gdp = devm_gpiod_get(pdev->dev.parent, "epd-pwr-good", GPIOD_IN);
+ device_set_of_node_from_dev(&pdev->dev, pdev->dev.parent);
+
+ gdp = devm_gpiod_get(&pdev->dev, "epd-pwr-good", GPIOD_IN);
if (IS_ERR(gdp)) {
- dev_err(pdev->dev.parent, "Power good GPIO fault %ld\n", PTR_ERR(gdp));
+ dev_err(&pdev->dev, "Power good GPIO fault %ld\n", PTR_ERR(gdp));
return PTR_ERR(gdp);
}
@@ -105,7 +107,6 @@ static int sy7636a_regulator_probe(struct platform_device *pdev)
}
config.dev = &pdev->dev;
- config.dev->of_node = pdev->dev.parent->of_node;
config.regmap = regmap;
rdev = devm_regulator_register(&pdev->dev, &desc, &config);
diff --git a/drivers/regulator/tps65219-regulator.c b/drivers/regulator/tps65219-regulator.c
index 5e67fdc88f49..d77ca486879f 100644
--- a/drivers/regulator/tps65219-regulator.c
+++ b/drivers/regulator/tps65219-regulator.c
@@ -454,9 +454,9 @@ static int tps65219_regulator_probe(struct platform_device *pdev)
irq_type->irq_name,
irq_data);
if (error)
- return dev_err_probe(tps->dev, PTR_ERR(rdev),
- "Failed to request %s IRQ %d: %d\n",
- irq_type->irq_name, irq, error);
+ return dev_err_probe(tps->dev, error,
+ "Failed to request %s IRQ %d\n",
+ irq_type->irq_name, irq);
}
for (i = 0; i < pmic->dev_irq_size; ++i) {
@@ -477,9 +477,9 @@ static int tps65219_regulator_probe(struct platform_device *pdev)
irq_type->irq_name,
irq_data);
if (error)
- return dev_err_probe(tps->dev, PTR_ERR(rdev),
- "Failed to request %s IRQ %d: %d\n",
- irq_type->irq_name, irq, error);
+ return dev_err_probe(tps->dev, error,
+ "Failed to request %s IRQ %d\n",
+ irq_type->irq_name, irq);
}
return 0;
diff --git a/drivers/reset/reset-eyeq.c b/drivers/reset/reset-eyeq.c
index 02d50041048b..2d3998368a1c 100644
--- a/drivers/reset/reset-eyeq.c
+++ b/drivers/reset/reset-eyeq.c
@@ -410,6 +410,13 @@ static int eqr_of_xlate_twocells(struct reset_controller_dev *rcdev,
return eqr_of_xlate_internal(rcdev, reset_spec->args[0], reset_spec->args[1]);
}
+static void eqr_of_node_put(void *_dev)
+{
+ struct device *dev = _dev;
+
+ of_node_put(dev->of_node);
+}
+
static int eqr_probe(struct auxiliary_device *adev,
const struct auxiliary_device_id *id)
{
@@ -428,6 +435,10 @@ static int eqr_probe(struct auxiliary_device *adev,
if (!dev->of_node)
return -ENODEV;
+ ret = devm_add_action_or_reset(dev, eqr_of_node_put, dev);
+ if (ret)
+ return ret;
+
/*
* Using our newfound OF node, we can get match data. We cannot use
* device_get_match_data() because it does not match reused OF nodes.
diff --git a/drivers/s390/block/Kconfig b/drivers/s390/block/Kconfig
index 8c1c908d2c6e..877a9bc7f04b 100644
--- a/drivers/s390/block/Kconfig
+++ b/drivers/s390/block/Kconfig
@@ -5,19 +5,11 @@ comment "S/390 block device drivers"
config DCSSBLK
def_tristate m
prompt "DCSSBLK support"
- depends on S390 && BLOCK && (DAX || DAX=n)
+ depends on S390 && BLOCK && ZONE_DEVICE
+ select FS_DAX
help
Support for dcss block device
-config DCSSBLK_DAX
- def_bool y
- depends on DCSSBLK
- # requires S390 ZONE_DEVICE support
- depends on BROKEN
- prompt "DCSSBLK DAX support"
- help
- Enable DAX operation for the dcss block device
-
config DASD
def_tristate y
prompt "Support for DASD devices"
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 94fa5edecadd..86fef4b15015 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -79,6 +79,8 @@ struct dcssblk_dev_info {
int num_of_segments;
struct list_head seg_list;
struct dax_device *dax_dev;
+ struct dev_pagemap pgmap;
+ void *pgmap_addr;
};
struct segment_info {
@@ -415,6 +417,8 @@ removeseg:
dax_remove_host(dev_info->gd);
kill_dax(dev_info->dax_dev);
put_dax(dev_info->dax_dev);
+ if (dev_info->pgmap_addr)
+ devm_memunmap_pages(&dev_info->dev, &dev_info->pgmap);
del_gendisk(dev_info->gd);
put_disk(dev_info->gd);
@@ -537,9 +541,6 @@ static int dcssblk_setup_dax(struct dcssblk_dev_info *dev_info)
{
struct dax_device *dax_dev;
- if (!IS_ENABLED(CONFIG_DCSSBLK_DAX))
- return 0;
-
dax_dev = alloc_dax(dev_info, &dcssblk_dax_ops);
if (IS_ERR(dax_dev))
return PTR_ERR(dax_dev);
@@ -562,6 +563,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
struct dcssblk_dev_info *dev_info;
struct segment_info *seg_info, *temp;
char *local_buf;
+ void *addr;
unsigned long seg_byte_size;
dev_info = NULL;
@@ -687,9 +689,26 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
if (rc)
goto put_dev;
- rc = dcssblk_setup_dax(dev_info);
- if (rc)
- goto out_dax;
+ if (!IS_ALIGNED(dev_info->start, SUBSECTION_SIZE) ||
+ !IS_ALIGNED(dev_info->end + 1, SUBSECTION_SIZE)) {
+ pr_info("DCSS %s is not aligned to %lu bytes, DAX support disabled\n",
+ local_buf, SUBSECTION_SIZE);
+ } else {
+ dev_info->pgmap.type = MEMORY_DEVICE_FS_DAX;
+ dev_info->pgmap.range.start = dev_info->start;
+ dev_info->pgmap.range.end = dev_info->end;
+ dev_info->pgmap.nr_range = 1;
+ addr = devm_memremap_pages(&dev_info->dev, &dev_info->pgmap);
+ if (IS_ERR(addr)) {
+ rc = PTR_ERR(addr);
+ goto put_dev;
+ }
+ dev_info->pgmap_addr = addr;
+ rc = dcssblk_setup_dax(dev_info);
+ if (rc)
+ goto out_dax;
+ pr_info("DAX support enabled for DCSS %s\n", local_buf);
+ }
get_device(&dev_info->dev);
rc = device_add_disk(&dev_info->dev, dev_info->gd, NULL);
@@ -716,6 +735,8 @@ out_dax_host:
out_dax:
kill_dax(dev_info->dax_dev);
put_dax(dev_info->dax_dev);
+ if (dev_info->pgmap_addr)
+ devm_memunmap_pages(&dev_info->dev, &dev_info->pgmap);
put_dev:
list_del(&dev_info->lh);
put_disk(dev_info->gd);
@@ -801,6 +822,8 @@ dcssblk_remove_store(struct device *dev, struct device_attribute *attr, const ch
dax_remove_host(dev_info->gd);
kill_dax(dev_info->dax_dev);
put_dax(dev_info->dax_dev);
+ if (dev_info->pgmap_addr)
+ devm_memunmap_pages(&dev_info->dev, &dev_info->pgmap);
del_gendisk(dev_info->gd);
put_disk(dev_info->gd);
diff --git a/drivers/s390/char/Makefile b/drivers/s390/char/Makefile
index 81d6744e1861..dcbd51152ee3 100644
--- a/drivers/s390/char/Makefile
+++ b/drivers/s390/char/Makefile
@@ -21,6 +21,7 @@ obj-y += ctrlchar.o keyboard.o defkeymap.o sclp.o sclp_rw.o sclp_quiesce.o \
sclp_cmd.o sclp_config.o sclp_cpi_sys.o sclp_ocf.o sclp_ctl.o \
sclp_early.o sclp_early_core.o sclp_sd.o
+obj-$(CONFIG_MEMORY_HOTPLUG) += sclp_mem.o
obj-$(CONFIG_TN3270) += raw3270.o con3270.o
obj-$(CONFIG_TN3270_FS) += fs3270.o
diff --git a/drivers/s390/char/hmcdrv_dev.c b/drivers/s390/char/hmcdrv_dev.c
index e069dd685899..b26fcf6849f2 100644
--- a/drivers/s390/char/hmcdrv_dev.c
+++ b/drivers/s390/char/hmcdrv_dev.c
@@ -244,24 +244,17 @@ static ssize_t hmcdrv_dev_write(struct file *fp, const char __user *ubuf,
size_t len, loff_t *pos)
{
ssize_t retlen;
+ void *pdata;
pr_debug("writing file '/dev/%pD' at pos. %lld with length %zd\n",
fp, (long long) *pos, len);
if (!fp->private_data) { /* first expect a cmd write */
- fp->private_data = kmalloc(len + 1, GFP_KERNEL);
-
- if (!fp->private_data)
- return -ENOMEM;
-
- if (!copy_from_user(fp->private_data, ubuf, len)) {
- ((char *)fp->private_data)[len] = '\0';
- return len;
- }
-
- kfree(fp->private_data);
- fp->private_data = NULL;
- return -EFAULT;
+ pdata = memdup_user_nul(ubuf, len);
+ if (IS_ERR(pdata))
+ return PTR_ERR(pdata);
+ fp->private_data = pdata;
+ return len;
}
retlen = hmcdrv_dev_transfer((char *) fp->private_data,
diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c
index f2e42c1d51aa..98e334724a62 100644
--- a/drivers/s390/char/sclp.c
+++ b/drivers/s390/char/sclp.c
@@ -77,6 +77,13 @@ unsigned long sclp_console_full;
/* The currently active SCLP command word. */
static sclp_cmdw_t active_cmd;
+static inline struct sccb_header *sclpint_to_sccb(u32 sccb_int)
+{
+ if (sccb_int)
+ return __va(sccb_int);
+ return NULL;
+}
+
static inline void sclp_trace(int prio, char *id, u32 a, u64 b, bool err)
{
struct sclp_trace_entry e;
@@ -620,7 +627,7 @@ __sclp_find_req(u32 sccb)
static bool ok_response(u32 sccb_int, sclp_cmdw_t cmd)
{
- struct sccb_header *sccb = (struct sccb_header *)__va(sccb_int);
+ struct sccb_header *sccb = sclpint_to_sccb(sccb_int);
struct evbuf_header *evbuf;
u16 response;
@@ -659,7 +666,7 @@ static void sclp_interrupt_handler(struct ext_code ext_code,
/* INT: Interrupt received (a=intparm, b=cmd) */
sclp_trace_sccb(0, "INT", param32, active_cmd, active_cmd,
- (struct sccb_header *)__va(finished_sccb),
+ sclpint_to_sccb(finished_sccb),
!ok_response(finished_sccb, active_cmd));
if (finished_sccb) {
diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c
index 16469678548f..3480198eac02 100644
--- a/drivers/s390/char/sclp_cmd.c
+++ b/drivers/s390/char/sclp_cmd.c
@@ -8,31 +8,46 @@
#define KMSG_COMPONENT "sclp_cmd"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
-#include <linux/cpufeature.h>
#include <linux/completion.h>
-#include <linux/init.h>
-#include <linux/errno.h>
#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/init.h>
#include <linux/slab.h>
#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/mmzone.h>
-#include <linux/memory.h>
-#include <linux/memory_hotplug.h>
-#include <linux/module.h>
-#include <asm/ctlreg.h>
#include <asm/chpid.h>
-#include <asm/setup.h>
-#include <asm/page.h>
+#include <asm/ctlreg.h>
#include <asm/sclp.h>
-#include <asm/numa.h>
-#include <asm/facility.h>
-#include <asm/page-states.h>
#include "sclp.h"
-#define SCLP_CMDW_ASSIGN_STORAGE 0x000d0001
-#define SCLP_CMDW_UNASSIGN_STORAGE 0x000c0001
+/* CPU configuration related functions */
+#define SCLP_CMDW_CONFIGURE_CPU 0x00110001
+#define SCLP_CMDW_DECONFIGURE_CPU 0x00100001
+/* Channel path configuration related functions */
+#define SCLP_CMDW_CONFIGURE_CHPATH 0x000f0001
+#define SCLP_CMDW_DECONFIGURE_CHPATH 0x000e0001
+#define SCLP_CMDW_READ_CHPATH_INFORMATION 0x00030001
+
+struct cpu_configure_sccb {
+ struct sccb_header header;
+} __packed __aligned(8);
+
+struct chp_cfg_sccb {
+ struct sccb_header header;
+ u8 ccm;
+ u8 reserved[6];
+ u8 cssid;
+} __packed;
+
+struct chp_info_sccb {
+ struct sccb_header header;
+ u8 recognized[SCLP_CHP_INFO_MASK_SIZE];
+ u8 standby[SCLP_CHP_INFO_MASK_SIZE];
+ u8 configured[SCLP_CHP_INFO_MASK_SIZE];
+ u8 ccm;
+ u8 reserved[6];
+ u8 cssid;
+} __packed;
static void sclp_sync_callback(struct sclp_req *req, void *data)
{
@@ -64,13 +79,11 @@ int sclp_sync_request_timeout(sclp_cmdw_t cmd, void *sccb, int timeout)
request->callback_data = &completion;
init_completion(&completion);
- /* Perform sclp request. */
rc = sclp_add_request(request);
if (rc)
goto out;
wait_for_completion(&completion);
- /* Check response. */
if (request->status != SCLP_REQ_DONE) {
pr_warn("sync request failed (cmd=0x%08x, status=0x%02x)\n",
cmd, request->status);
@@ -81,22 +94,15 @@ out:
return rc;
}
-/*
- * CPU configuration related functions.
- */
-
-#define SCLP_CMDW_CONFIGURE_CPU 0x00110001
-#define SCLP_CMDW_DECONFIGURE_CPU 0x00100001
-
int _sclp_get_core_info(struct sclp_core_info *info)
{
- int rc;
- int length = test_facility(140) ? EXT_SCCB_READ_CPU : PAGE_SIZE;
struct read_cpu_info_sccb *sccb;
+ int rc, length;
if (!SCLP_HAS_CPU_INFO)
return -EOPNOTSUPP;
+ length = test_facility(140) ? EXT_SCCB_READ_CPU : PAGE_SIZE;
sccb = (void *)__get_free_pages(GFP_KERNEL | GFP_DMA | __GFP_ZERO, get_order(length));
if (!sccb)
return -ENOMEM;
@@ -114,14 +120,10 @@ int _sclp_get_core_info(struct sclp_core_info *info)
}
sclp_fill_core_info(info, sccb);
out:
- free_pages((unsigned long) sccb, get_order(length));
+ free_pages((unsigned long)sccb, get_order(length));
return rc;
}
-struct cpu_configure_sccb {
- struct sccb_header header;
-} __attribute__((packed, aligned(8)));
-
static int do_core_configure(sclp_cmdw_t cmd)
{
struct cpu_configure_sccb *sccb;
@@ -130,8 +132,8 @@ static int do_core_configure(sclp_cmdw_t cmd)
if (!SCLP_HAS_CPU_RECONFIG)
return -EOPNOTSUPP;
/*
- * This is not going to cross a page boundary since we force
- * kmalloc to have a minimum alignment of 8 bytes on s390.
+ * Use kmalloc to get a minimum alignment of 8 bytes and to ensure the
+ * sccb does not cross a page boundary.
*/
sccb = kzalloc(sizeof(*sccb), GFP_KERNEL | GFP_DMA);
if (!sccb)
@@ -165,394 +167,6 @@ int sclp_core_deconfigure(u8 core)
return do_core_configure(SCLP_CMDW_DECONFIGURE_CPU | core << 8);
}
-#ifdef CONFIG_MEMORY_HOTPLUG
-
-static DEFINE_MUTEX(sclp_mem_mutex);
-static LIST_HEAD(sclp_mem_list);
-static u8 sclp_max_storage_id;
-static DECLARE_BITMAP(sclp_storage_ids, 256);
-
-struct memory_increment {
- struct list_head list;
- u16 rn;
- int standby;
-};
-
-struct assign_storage_sccb {
- struct sccb_header header;
- u16 rn;
-} __packed;
-
-int arch_get_memory_phys_device(unsigned long start_pfn)
-{
- if (!sclp.rzm)
- return 0;
- return PFN_PHYS(start_pfn) >> ilog2(sclp.rzm);
-}
-
-static unsigned long long rn2addr(u16 rn)
-{
- return (unsigned long long) (rn - 1) * sclp.rzm;
-}
-
-static int do_assign_storage(sclp_cmdw_t cmd, u16 rn)
-{
- struct assign_storage_sccb *sccb;
- int rc;
-
- sccb = (void *) get_zeroed_page(GFP_KERNEL | GFP_DMA);
- if (!sccb)
- return -ENOMEM;
- sccb->header.length = PAGE_SIZE;
- sccb->rn = rn;
- rc = sclp_sync_request_timeout(cmd, sccb, SCLP_QUEUE_INTERVAL);
- if (rc)
- goto out;
- switch (sccb->header.response_code) {
- case 0x0020:
- case 0x0120:
- break;
- default:
- pr_warn("assign storage failed (cmd=0x%08x, response=0x%04x, rn=0x%04x)\n",
- cmd, sccb->header.response_code, rn);
- rc = -EIO;
- break;
- }
-out:
- free_page((unsigned long) sccb);
- return rc;
-}
-
-static int sclp_assign_storage(u16 rn)
-{
- unsigned long long start;
- int rc;
-
- rc = do_assign_storage(SCLP_CMDW_ASSIGN_STORAGE, rn);
- if (rc)
- return rc;
- start = rn2addr(rn);
- storage_key_init_range(start, start + sclp.rzm);
- return 0;
-}
-
-static int sclp_unassign_storage(u16 rn)
-{
- return do_assign_storage(SCLP_CMDW_UNASSIGN_STORAGE, rn);
-}
-
-struct attach_storage_sccb {
- struct sccb_header header;
- u16 :16;
- u16 assigned;
- u32 :32;
- u32 entries[];
-} __packed;
-
-static int sclp_attach_storage(u8 id)
-{
- struct attach_storage_sccb *sccb;
- int rc;
- int i;
-
- sccb = (void *) get_zeroed_page(GFP_KERNEL | GFP_DMA);
- if (!sccb)
- return -ENOMEM;
- sccb->header.length = PAGE_SIZE;
- sccb->header.function_code = 0x40;
- rc = sclp_sync_request_timeout(0x00080001 | id << 8, sccb,
- SCLP_QUEUE_INTERVAL);
- if (rc)
- goto out;
- switch (sccb->header.response_code) {
- case 0x0020:
- set_bit(id, sclp_storage_ids);
- for (i = 0; i < sccb->assigned; i++) {
- if (sccb->entries[i])
- sclp_unassign_storage(sccb->entries[i] >> 16);
- }
- break;
- default:
- rc = -EIO;
- break;
- }
-out:
- free_page((unsigned long) sccb);
- return rc;
-}
-
-static int sclp_mem_change_state(unsigned long start, unsigned long size,
- int online)
-{
- struct memory_increment *incr;
- unsigned long long istart;
- int rc = 0;
-
- list_for_each_entry(incr, &sclp_mem_list, list) {
- istart = rn2addr(incr->rn);
- if (start + size - 1 < istart)
- break;
- if (start > istart + sclp.rzm - 1)
- continue;
- if (online)
- rc |= sclp_assign_storage(incr->rn);
- else
- sclp_unassign_storage(incr->rn);
- if (rc == 0)
- incr->standby = online ? 0 : 1;
- }
- return rc ? -EIO : 0;
-}
-
-static bool contains_standby_increment(unsigned long start, unsigned long end)
-{
- struct memory_increment *incr;
- unsigned long istart;
-
- list_for_each_entry(incr, &sclp_mem_list, list) {
- istart = rn2addr(incr->rn);
- if (end - 1 < istart)
- continue;
- if (start > istart + sclp.rzm - 1)
- continue;
- if (incr->standby)
- return true;
- }
- return false;
-}
-
-static int sclp_mem_notifier(struct notifier_block *nb,
- unsigned long action, void *data)
-{
- unsigned long start, size;
- struct memory_notify *arg;
- unsigned char id;
- int rc = 0;
-
- arg = data;
- start = arg->start_pfn << PAGE_SHIFT;
- size = arg->nr_pages << PAGE_SHIFT;
- mutex_lock(&sclp_mem_mutex);
- for_each_clear_bit(id, sclp_storage_ids, sclp_max_storage_id + 1)
- sclp_attach_storage(id);
- switch (action) {
- case MEM_GOING_OFFLINE:
- /*
- * We do not allow to set memory blocks offline that contain
- * standby memory. This is done to simplify the "memory online"
- * case.
- */
- if (contains_standby_increment(start, start + size))
- rc = -EPERM;
- break;
- case MEM_PREPARE_ONLINE:
- /*
- * Access the altmap_start_pfn and altmap_nr_pages fields
- * within the struct memory_notify specifically when dealing
- * with only MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers.
- *
- * When altmap is in use, take the specified memory range
- * online, which includes the altmap.
- */
- if (arg->altmap_nr_pages) {
- start = PFN_PHYS(arg->altmap_start_pfn);
- size += PFN_PHYS(arg->altmap_nr_pages);
- }
- rc = sclp_mem_change_state(start, size, 1);
- if (rc || !arg->altmap_nr_pages)
- break;
- /*
- * Set CMMA state to nodat here, since the struct page memory
- * at the beginning of the memory block will not go through the
- * buddy allocator later.
- */
- __arch_set_page_nodat((void *)__va(start), arg->altmap_nr_pages);
- break;
- case MEM_FINISH_OFFLINE:
- /*
- * When altmap is in use, take the specified memory range
- * offline, which includes the altmap.
- */
- if (arg->altmap_nr_pages) {
- start = PFN_PHYS(arg->altmap_start_pfn);
- size += PFN_PHYS(arg->altmap_nr_pages);
- }
- sclp_mem_change_state(start, size, 0);
- break;
- default:
- break;
- }
- mutex_unlock(&sclp_mem_mutex);
- return rc ? NOTIFY_BAD : NOTIFY_OK;
-}
-
-static struct notifier_block sclp_mem_nb = {
- .notifier_call = sclp_mem_notifier,
-};
-
-static void __init align_to_block_size(unsigned long long *start,
- unsigned long long *size,
- unsigned long long alignment)
-{
- unsigned long long start_align, size_align;
-
- start_align = roundup(*start, alignment);
- size_align = rounddown(*start + *size, alignment) - start_align;
-
- pr_info("Standby memory at 0x%llx (%lluM of %lluM usable)\n",
- *start, size_align >> 20, *size >> 20);
- *start = start_align;
- *size = size_align;
-}
-
-static void __init add_memory_merged(u16 rn)
-{
- unsigned long long start, size, addr, block_size;
- static u16 first_rn, num;
-
- if (rn && first_rn && (first_rn + num == rn)) {
- num++;
- return;
- }
- if (!first_rn)
- goto skip_add;
- start = rn2addr(first_rn);
- size = (unsigned long long) num * sclp.rzm;
- if (start >= ident_map_size)
- goto skip_add;
- if (start + size > ident_map_size)
- size = ident_map_size - start;
- block_size = memory_block_size_bytes();
- align_to_block_size(&start, &size, block_size);
- if (!size)
- goto skip_add;
- for (addr = start; addr < start + size; addr += block_size)
- add_memory(0, addr, block_size,
- cpu_has_edat1() ?
- MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE : MHP_NONE);
-skip_add:
- first_rn = rn;
- num = 1;
-}
-
-static void __init sclp_add_standby_memory(void)
-{
- struct memory_increment *incr;
-
- list_for_each_entry(incr, &sclp_mem_list, list)
- if (incr->standby)
- add_memory_merged(incr->rn);
- add_memory_merged(0);
-}
-
-static void __init insert_increment(u16 rn, int standby, int assigned)
-{
- struct memory_increment *incr, *new_incr;
- struct list_head *prev;
- u16 last_rn;
-
- new_incr = kzalloc(sizeof(*new_incr), GFP_KERNEL);
- if (!new_incr)
- return;
- new_incr->rn = rn;
- new_incr->standby = standby;
- last_rn = 0;
- prev = &sclp_mem_list;
- list_for_each_entry(incr, &sclp_mem_list, list) {
- if (assigned && incr->rn > rn)
- break;
- if (!assigned && incr->rn - last_rn > 1)
- break;
- last_rn = incr->rn;
- prev = &incr->list;
- }
- if (!assigned)
- new_incr->rn = last_rn + 1;
- if (new_incr->rn > sclp.rnmax) {
- kfree(new_incr);
- return;
- }
- list_add(&new_incr->list, prev);
-}
-
-static int __init sclp_detect_standby_memory(void)
-{
- struct read_storage_sccb *sccb;
- int i, id, assigned, rc;
-
- if (oldmem_data.start) /* No standby memory in kdump mode */
- return 0;
- if ((sclp.facilities & 0xe00000000000ULL) != 0xe00000000000ULL)
- return 0;
- rc = -ENOMEM;
- sccb = (void *) __get_free_page(GFP_KERNEL | GFP_DMA);
- if (!sccb)
- goto out;
- assigned = 0;
- for (id = 0; id <= sclp_max_storage_id; id++) {
- memset(sccb, 0, PAGE_SIZE);
- sccb->header.length = PAGE_SIZE;
- rc = sclp_sync_request(SCLP_CMDW_READ_STORAGE_INFO | id << 8, sccb);
- if (rc)
- goto out;
- switch (sccb->header.response_code) {
- case 0x0010:
- set_bit(id, sclp_storage_ids);
- for (i = 0; i < sccb->assigned; i++) {
- if (!sccb->entries[i])
- continue;
- assigned++;
- insert_increment(sccb->entries[i] >> 16, 0, 1);
- }
- break;
- case 0x0310:
- break;
- case 0x0410:
- for (i = 0; i < sccb->assigned; i++) {
- if (!sccb->entries[i])
- continue;
- assigned++;
- insert_increment(sccb->entries[i] >> 16, 1, 1);
- }
- break;
- default:
- rc = -EIO;
- break;
- }
- if (!rc)
- sclp_max_storage_id = sccb->max_id;
- }
- if (rc || list_empty(&sclp_mem_list))
- goto out;
- for (i = 1; i <= sclp.rnmax - assigned; i++)
- insert_increment(0, 1, 0);
- rc = register_memory_notifier(&sclp_mem_nb);
- if (rc)
- goto out;
- sclp_add_standby_memory();
-out:
- free_page((unsigned long) sccb);
- return rc;
-}
-__initcall(sclp_detect_standby_memory);
-
-#endif /* CONFIG_MEMORY_HOTPLUG */
-
-/*
- * Channel path configuration related functions.
- */
-
-#define SCLP_CMDW_CONFIGURE_CHPATH 0x000f0001
-#define SCLP_CMDW_DECONFIGURE_CHPATH 0x000e0001
-#define SCLP_CMDW_READ_CHPATH_INFORMATION 0x00030001
-
-struct chp_cfg_sccb {
- struct sccb_header header;
- u8 ccm;
- u8 reserved[6];
- u8 cssid;
-} __attribute__((packed));
-
static int do_chp_configure(sclp_cmdw_t cmd)
{
struct chp_cfg_sccb *sccb;
@@ -560,8 +174,7 @@ static int do_chp_configure(sclp_cmdw_t cmd)
if (!SCLP_HAS_CHP_RECONFIG)
return -EOPNOTSUPP;
- /* Prepare sccb. */
- sccb = (struct chp_cfg_sccb *) get_zeroed_page(GFP_KERNEL | GFP_DMA);
+ sccb = (struct chp_cfg_sccb *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
if (!sccb)
return -ENOMEM;
sccb->header.length = sizeof(*sccb);
@@ -581,7 +194,7 @@ static int do_chp_configure(sclp_cmdw_t cmd)
break;
}
out:
- free_page((unsigned long) sccb);
+ free_page((unsigned long)sccb);
return rc;
}
@@ -609,16 +222,6 @@ int sclp_chp_deconfigure(struct chp_id chpid)
return do_chp_configure(SCLP_CMDW_DECONFIGURE_CHPATH | chpid.id << 8);
}
-struct chp_info_sccb {
- struct sccb_header header;
- u8 recognized[SCLP_CHP_INFO_MASK_SIZE];
- u8 standby[SCLP_CHP_INFO_MASK_SIZE];
- u8 configured[SCLP_CHP_INFO_MASK_SIZE];
- u8 ccm;
- u8 reserved[6];
- u8 cssid;
-} __attribute__((packed));
-
/**
* sclp_chp_read_info - perform read channel-path information sclp command
* @info: resulting channel-path information data
@@ -634,8 +237,7 @@ int sclp_chp_read_info(struct sclp_chp_info *info)
if (!SCLP_HAS_CHP_INFO)
return -EOPNOTSUPP;
- /* Prepare sccb. */
- sccb = (struct chp_info_sccb *) get_zeroed_page(GFP_KERNEL | GFP_DMA);
+ sccb = (struct chp_info_sccb *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
if (!sccb)
return -ENOMEM;
sccb->header.length = sizeof(*sccb);
@@ -652,6 +254,6 @@ int sclp_chp_read_info(struct sclp_chp_info *info)
memcpy(info->standby, sccb->standby, SCLP_CHP_INFO_MASK_SIZE);
memcpy(info->configured, sccb->configured, SCLP_CHP_INFO_MASK_SIZE);
out:
- free_page((unsigned long) sccb);
+ free_page((unsigned long)sccb);
return rc;
}
diff --git a/drivers/s390/char/sclp_mem.c b/drivers/s390/char/sclp_mem.c
new file mode 100644
index 000000000000..27f49f5fd358
--- /dev/null
+++ b/drivers/s390/char/sclp_mem.c
@@ -0,0 +1,399 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Memory hotplug support via sclp
+ *
+ * Copyright IBM Corp. 2025
+ */
+
+#define KMSG_COMPONENT "sclp_mem"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/cpufeature.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/memory.h>
+#include <linux/memory_hotplug.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/slab.h>
+#include <asm/facility.h>
+#include <asm/page.h>
+#include <asm/page-states.h>
+#include <asm/sclp.h>
+
+#include "sclp.h"
+
+#define SCLP_CMDW_ASSIGN_STORAGE 0x000d0001
+#define SCLP_CMDW_UNASSIGN_STORAGE 0x000c0001
+
+static DEFINE_MUTEX(sclp_mem_mutex);
+static LIST_HEAD(sclp_mem_list);
+static u8 sclp_max_storage_id;
+static DECLARE_BITMAP(sclp_storage_ids, 256);
+
+struct memory_increment {
+ struct list_head list;
+ u16 rn;
+ int standby;
+};
+
+struct assign_storage_sccb {
+ struct sccb_header header;
+ u16 rn;
+} __packed;
+
+struct attach_storage_sccb {
+ struct sccb_header header;
+ u16 :16;
+ u16 assigned;
+ u32 :32;
+ u32 entries[];
+} __packed;
+
+int arch_get_memory_phys_device(unsigned long start_pfn)
+{
+ if (!sclp.rzm)
+ return 0;
+ return PFN_PHYS(start_pfn) >> ilog2(sclp.rzm);
+}
+
+static unsigned long rn2addr(u16 rn)
+{
+ return (unsigned long)(rn - 1) * sclp.rzm;
+}
+
+static int do_assign_storage(sclp_cmdw_t cmd, u16 rn)
+{
+ struct assign_storage_sccb *sccb;
+ int rc;
+
+ sccb = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
+ if (!sccb)
+ return -ENOMEM;
+ sccb->header.length = PAGE_SIZE;
+ sccb->rn = rn;
+ rc = sclp_sync_request_timeout(cmd, sccb, SCLP_QUEUE_INTERVAL);
+ if (rc)
+ goto out;
+ switch (sccb->header.response_code) {
+ case 0x0020:
+ case 0x0120:
+ break;
+ default:
+ pr_warn("assign storage failed (cmd=0x%08x, response=0x%04x, rn=0x%04x)\n",
+ cmd, sccb->header.response_code, rn);
+ rc = -EIO;
+ break;
+ }
+out:
+ free_page((unsigned long)sccb);
+ return rc;
+}
+
+static int sclp_assign_storage(u16 rn)
+{
+ unsigned long start;
+ int rc;
+
+ rc = do_assign_storage(SCLP_CMDW_ASSIGN_STORAGE, rn);
+ if (rc)
+ return rc;
+ start = rn2addr(rn);
+ storage_key_init_range(start, start + sclp.rzm);
+ return 0;
+}
+
+static int sclp_unassign_storage(u16 rn)
+{
+ return do_assign_storage(SCLP_CMDW_UNASSIGN_STORAGE, rn);
+}
+
+static int sclp_attach_storage(u8 id)
+{
+ struct attach_storage_sccb *sccb;
+ int rc, i;
+
+ sccb = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
+ if (!sccb)
+ return -ENOMEM;
+ sccb->header.length = PAGE_SIZE;
+ sccb->header.function_code = 0x40;
+ rc = sclp_sync_request_timeout(0x00080001 | id << 8, sccb,
+ SCLP_QUEUE_INTERVAL);
+ if (rc)
+ goto out;
+ switch (sccb->header.response_code) {
+ case 0x0020:
+ set_bit(id, sclp_storage_ids);
+ for (i = 0; i < sccb->assigned; i++) {
+ if (sccb->entries[i])
+ sclp_unassign_storage(sccb->entries[i] >> 16);
+ }
+ break;
+ default:
+ rc = -EIO;
+ break;
+ }
+out:
+ free_page((unsigned long)sccb);
+ return rc;
+}
+
+static int sclp_mem_change_state(unsigned long start, unsigned long size,
+ int online)
+{
+ struct memory_increment *incr;
+ unsigned long istart;
+ int rc = 0;
+
+ list_for_each_entry(incr, &sclp_mem_list, list) {
+ istart = rn2addr(incr->rn);
+ if (start + size - 1 < istart)
+ break;
+ if (start > istart + sclp.rzm - 1)
+ continue;
+ if (online)
+ rc |= sclp_assign_storage(incr->rn);
+ else
+ sclp_unassign_storage(incr->rn);
+ if (rc == 0)
+ incr->standby = online ? 0 : 1;
+ }
+ return rc ? -EIO : 0;
+}
+
+static bool contains_standby_increment(unsigned long start, unsigned long end)
+{
+ struct memory_increment *incr;
+ unsigned long istart;
+
+ list_for_each_entry(incr, &sclp_mem_list, list) {
+ istart = rn2addr(incr->rn);
+ if (end - 1 < istart)
+ continue;
+ if (start > istart + sclp.rzm - 1)
+ continue;
+ if (incr->standby)
+ return true;
+ }
+ return false;
+}
+
+static int sclp_mem_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ unsigned long start, size;
+ struct memory_notify *arg;
+ unsigned char id;
+ int rc = 0;
+
+ arg = data;
+ start = arg->start_pfn << PAGE_SHIFT;
+ size = arg->nr_pages << PAGE_SHIFT;
+ mutex_lock(&sclp_mem_mutex);
+ for_each_clear_bit(id, sclp_storage_ids, sclp_max_storage_id + 1)
+ sclp_attach_storage(id);
+ switch (action) {
+ case MEM_GOING_OFFLINE:
+ /*
+ * Do not allow setting memory blocks offline that contain
+ * standby memory. This is done to simplify the "memory online"
+ * case.
+ */
+ if (contains_standby_increment(start, start + size))
+ rc = -EPERM;
+ break;
+ case MEM_PREPARE_ONLINE:
+ /*
+ * Access the altmap_start_pfn and altmap_nr_pages fields
+ * within the struct memory_notify only when dealing with
+ * MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers.
+ *
+ * When altmap is in use, take the specified memory range
+ * online, which includes the altmap.
+ */
+ if (arg->altmap_nr_pages) {
+ start = PFN_PHYS(arg->altmap_start_pfn);
+ size += PFN_PHYS(arg->altmap_nr_pages);
+ }
+ rc = sclp_mem_change_state(start, size, 1);
+ if (rc || !arg->altmap_nr_pages)
+ break;
+ /*
+ * Set CMMA state to nodat here, since the struct page memory
+ * at the beginning of the memory block will not go through the
+ * buddy allocator later.
+ */
+ __arch_set_page_nodat((void *)__va(start), arg->altmap_nr_pages);
+ break;
+ case MEM_FINISH_OFFLINE:
+ /*
+ * When altmap is in use, take the specified memory range
+ * offline, which includes the altmap.
+ */
+ if (arg->altmap_nr_pages) {
+ start = PFN_PHYS(arg->altmap_start_pfn);
+ size += PFN_PHYS(arg->altmap_nr_pages);
+ }
+ sclp_mem_change_state(start, size, 0);
+ break;
+ default:
+ break;
+ }
+ mutex_unlock(&sclp_mem_mutex);
+ return rc ? NOTIFY_BAD : NOTIFY_OK;
+}
+
+static struct notifier_block sclp_mem_nb = {
+ .notifier_call = sclp_mem_notifier,
+};
+
+static void __init align_to_block_size(unsigned long *start,
+ unsigned long *size,
+ unsigned long alignment)
+{
+ unsigned long start_align, size_align;
+
+ start_align = roundup(*start, alignment);
+ size_align = rounddown(*start + *size, alignment) - start_align;
+
+ pr_info("Standby memory at 0x%lx (%luM of %luM usable)\n",
+ *start, size_align >> 20, *size >> 20);
+ *start = start_align;
+ *size = size_align;
+}
+
+static void __init add_memory_merged(u16 rn)
+{
+ unsigned long start, size, addr, block_size;
+ static u16 first_rn, num;
+
+ if (rn && first_rn && (first_rn + num == rn)) {
+ num++;
+ return;
+ }
+ if (!first_rn)
+ goto skip_add;
+ start = rn2addr(first_rn);
+ size = (unsigned long)num * sclp.rzm;
+ if (start >= ident_map_size)
+ goto skip_add;
+ if (start + size > ident_map_size)
+ size = ident_map_size - start;
+ block_size = memory_block_size_bytes();
+ align_to_block_size(&start, &size, block_size);
+ if (!size)
+ goto skip_add;
+ for (addr = start; addr < start + size; addr += block_size) {
+ add_memory(0, addr, block_size,
+ cpu_has_edat1() ?
+ MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE : MHP_NONE);
+ }
+skip_add:
+ first_rn = rn;
+ num = 1;
+}
+
+static void __init sclp_add_standby_memory(void)
+{
+ struct memory_increment *incr;
+
+ list_for_each_entry(incr, &sclp_mem_list, list) {
+ if (incr->standby)
+ add_memory_merged(incr->rn);
+ }
+ add_memory_merged(0);
+}
+
+static void __init insert_increment(u16 rn, int standby, int assigned)
+{
+ struct memory_increment *incr, *new_incr;
+ struct list_head *prev;
+ u16 last_rn;
+
+ new_incr = kzalloc(sizeof(*new_incr), GFP_KERNEL);
+ if (!new_incr)
+ return;
+ new_incr->rn = rn;
+ new_incr->standby = standby;
+ last_rn = 0;
+ prev = &sclp_mem_list;
+ list_for_each_entry(incr, &sclp_mem_list, list) {
+ if (assigned && incr->rn > rn)
+ break;
+ if (!assigned && incr->rn - last_rn > 1)
+ break;
+ last_rn = incr->rn;
+ prev = &incr->list;
+ }
+ if (!assigned)
+ new_incr->rn = last_rn + 1;
+ if (new_incr->rn > sclp.rnmax) {
+ kfree(new_incr);
+ return;
+ }
+ list_add(&new_incr->list, prev);
+}
+
+static int __init sclp_detect_standby_memory(void)
+{
+ struct read_storage_sccb *sccb;
+ int i, id, assigned, rc;
+
+ /* No standby memory in kdump mode */
+ if (oldmem_data.start)
+ return 0;
+ if ((sclp.facilities & 0xe00000000000UL) != 0xe00000000000UL)
+ return 0;
+ rc = -ENOMEM;
+ sccb = (void *)__get_free_page(GFP_KERNEL | GFP_DMA);
+ if (!sccb)
+ goto out;
+ assigned = 0;
+ for (id = 0; id <= sclp_max_storage_id; id++) {
+ memset(sccb, 0, PAGE_SIZE);
+ sccb->header.length = PAGE_SIZE;
+ rc = sclp_sync_request(SCLP_CMDW_READ_STORAGE_INFO | id << 8, sccb);
+ if (rc)
+ goto out;
+ switch (sccb->header.response_code) {
+ case 0x0010:
+ set_bit(id, sclp_storage_ids);
+ for (i = 0; i < sccb->assigned; i++) {
+ if (!sccb->entries[i])
+ continue;
+ assigned++;
+ insert_increment(sccb->entries[i] >> 16, 0, 1);
+ }
+ break;
+ case 0x0310:
+ break;
+ case 0x0410:
+ for (i = 0; i < sccb->assigned; i++) {
+ if (!sccb->entries[i])
+ continue;
+ assigned++;
+ insert_increment(sccb->entries[i] >> 16, 1, 1);
+ }
+ break;
+ default:
+ rc = -EIO;
+ break;
+ }
+ if (!rc)
+ sclp_max_storage_id = sccb->max_id;
+ }
+ if (rc || list_empty(&sclp_mem_list))
+ goto out;
+ for (i = 1; i <= sclp.rnmax - assigned; i++)
+ insert_increment(0, 1, 0);
+ rc = register_memory_notifier(&sclp_mem_nb);
+ if (rc)
+ goto out;
+ sclp_add_standby_memory();
+out:
+ free_page((unsigned long)sccb);
+ return rc;
+}
+__initcall(sclp_detect_standby_memory);
diff --git a/drivers/s390/char/tape_3590.c b/drivers/s390/char/tape_3590.c
index a1bafaf73f87..2a2931d303cb 100644
--- a/drivers/s390/char/tape_3590.c
+++ b/drivers/s390/char/tape_3590.c
@@ -1671,7 +1671,7 @@ tape_3590_init(void)
DBF_EVENT(3, "3590 init\n");
- tape_3590_wq = alloc_workqueue("tape_3590", 0, 0);
+ tape_3590_wq = alloc_workqueue("tape_3590", WQ_PERCPU, 0);
if (!tape_3590_wq)
return -ENOMEM;
diff --git a/drivers/s390/crypto/zcrypt_ep11misc.c b/drivers/s390/crypto/zcrypt_ep11misc.c
index 3bf09a89a089..e92e2fd8ce5d 100644
--- a/drivers/s390/crypto/zcrypt_ep11misc.c
+++ b/drivers/s390/crypto/zcrypt_ep11misc.c
@@ -1405,7 +1405,9 @@ int ep11_clr2keyblob(u16 card, u16 domain, u32 keybitsize, u32 keygenflags,
/* Step 3: import the encrypted key value as a new key */
rc = ep11_unwrapkey(card, domain, kek, keklen,
encbuf, encbuflen, 0, def_iv,
- keybitsize, 0, keybuf, keybufsize, keytype, xflags);
+ keybitsize, keygenflags,
+ keybuf, keybufsize,
+ keytype, xflags);
if (rc) {
ZCRYPT_DBF_ERR("%s importing key value as new key failed, rc=%d\n",
__func__, rc);
diff --git a/drivers/scsi/fnic/fnic.h b/drivers/scsi/fnic/fnic.h
index c2fdc6553e62..1199d701c3f5 100644
--- a/drivers/scsi/fnic/fnic.h
+++ b/drivers/scsi/fnic/fnic.h
@@ -323,8 +323,6 @@ enum fnic_state {
FNIC_IN_ETH_TRANS_FC_MODE,
};
-struct mempool;
-
enum fnic_role_e {
FNIC_ROLE_FCP_INITIATOR = 0,
};
diff --git a/drivers/scsi/lpfc/lpfc_debugfs.c b/drivers/scsi/lpfc/lpfc_debugfs.c
index 2db8d9529b8f..7c4d7bb3a56f 100644
--- a/drivers/scsi/lpfc/lpfc_debugfs.c
+++ b/drivers/scsi/lpfc/lpfc_debugfs.c
@@ -6280,7 +6280,6 @@ lpfc_debugfs_initialize(struct lpfc_vport *vport)
}
phba->nvmeio_trc_on = 1;
phba->nvmeio_trc_output_idx = 0;
- phba->nvmeio_trc = NULL;
} else {
nvmeio_off:
phba->nvmeio_trc_size = 0;
diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c
index fba2e62027b7..4cfc928bcf2d 100644
--- a/drivers/scsi/lpfc/lpfc_nvmet.c
+++ b/drivers/scsi/lpfc/lpfc_nvmet.c
@@ -1243,7 +1243,7 @@ lpfc_nvmet_defer_rcv(struct nvmet_fc_target_port *tgtport,
struct lpfc_nvmet_tgtport *tgtp;
struct lpfc_async_xchg_ctx *ctxp =
container_of(rsp, struct lpfc_async_xchg_ctx, hdlrctx.fcp_req);
- struct rqb_dmabuf *nvmebuf = ctxp->rqb_buffer;
+ struct rqb_dmabuf *nvmebuf;
struct lpfc_hba *phba = ctxp->phba;
unsigned long iflag;
@@ -1251,13 +1251,18 @@ lpfc_nvmet_defer_rcv(struct nvmet_fc_target_port *tgtport,
lpfc_nvmeio_data(phba, "NVMET DEFERRCV: xri x%x sz %d CPU %02x\n",
ctxp->oxid, ctxp->size, raw_smp_processor_id());
+ spin_lock_irqsave(&ctxp->ctxlock, iflag);
+ nvmebuf = ctxp->rqb_buffer;
if (!nvmebuf) {
+ spin_unlock_irqrestore(&ctxp->ctxlock, iflag);
lpfc_printf_log(phba, KERN_INFO, LOG_NVME_IOERR,
"6425 Defer rcv: no buffer oxid x%x: "
"flg %x ste %x\n",
ctxp->oxid, ctxp->flag, ctxp->state);
return;
}
+ ctxp->rqb_buffer = NULL;
+ spin_unlock_irqrestore(&ctxp->ctxlock, iflag);
tgtp = phba->targetport->private;
if (tgtp)
@@ -1265,9 +1270,6 @@ lpfc_nvmet_defer_rcv(struct nvmet_fc_target_port *tgtport,
/* Free the nvmebuf since a new buffer already replaced it */
nvmebuf->hrq->rqbp->rqb_free_buffer(phba, nvmebuf);
- spin_lock_irqsave(&ctxp->ctxlock, iflag);
- ctxp->rqb_buffer = NULL;
- spin_unlock_irqrestore(&ctxp->ctxlock, iflag);
}
/**
diff --git a/drivers/scsi/lpfc/lpfc_vport.c b/drivers/scsi/lpfc/lpfc_vport.c
index 2797aa75a689..aff6c9d5e7c2 100644
--- a/drivers/scsi/lpfc/lpfc_vport.c
+++ b/drivers/scsi/lpfc/lpfc_vport.c
@@ -666,7 +666,7 @@ lpfc_vport_delete(struct fc_vport *fc_vport)
* Take early refcount for outstanding I/O requests we schedule during
* delete processing for unreg_vpi. Always keep this before
* scsi_remove_host() as we can no longer obtain a reference through
- * scsi_host_get() after scsi_host_remove as shost is set to SHOST_DEL.
+ * scsi_host_get() after scsi_remove_host as shost is set to SHOST_DEL.
*/
if (!scsi_host_get(shost))
return VPORT_INVAL;
diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c
index a39f1da4ce47..a761c0aa5127 100644
--- a/drivers/scsi/qla4xxx/ql4_os.c
+++ b/drivers/scsi/qla4xxx/ql4_os.c
@@ -6606,6 +6606,8 @@ static struct iscsi_endpoint *qla4xxx_get_ep_fwdb(struct scsi_qla_host *ha,
ep = qla4xxx_ep_connect(ha->host, (struct sockaddr *)dst_addr, 0);
vfree(dst_addr);
+ if (IS_ERR(ep))
+ return NULL;
return ep;
}
diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index 0847767d4d43..353cb60e1abe 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -2674,8 +2674,10 @@ static int resp_rsup_tmfs(struct scsi_cmnd *scp,
static int resp_err_recov_pg(unsigned char *p, int pcontrol, int target)
{ /* Read-Write Error Recovery page for mode_sense */
- unsigned char err_recov_pg[] = {0x1, 0xa, 0xc0, 11, 240, 0, 0, 0,
- 5, 0, 0xff, 0xff};
+ static const unsigned char err_recov_pg[] = {
+ 0x1, 0xa, 0xc0, 11, 240, 0, 0, 0,
+ 5, 0, 0xff, 0xff
+ };
memcpy(p, err_recov_pg, sizeof(err_recov_pg));
if (1 == pcontrol)
@@ -2685,8 +2687,10 @@ static int resp_err_recov_pg(unsigned char *p, int pcontrol, int target)
static int resp_disconnect_pg(unsigned char *p, int pcontrol, int target)
{ /* Disconnect-Reconnect page for mode_sense */
- unsigned char disconnect_pg[] = {0x2, 0xe, 128, 128, 0, 10, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0};
+ static const unsigned char disconnect_pg[] = {
+ 0x2, 0xe, 128, 128, 0, 10, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0
+ };
memcpy(p, disconnect_pg, sizeof(disconnect_pg));
if (1 == pcontrol)
@@ -2696,9 +2700,11 @@ static int resp_disconnect_pg(unsigned char *p, int pcontrol, int target)
static int resp_format_pg(unsigned char *p, int pcontrol, int target)
{ /* Format device page for mode_sense */
- unsigned char format_pg[] = {0x3, 0x16, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0x40, 0, 0, 0};
+ static const unsigned char format_pg[] = {
+ 0x3, 0x16, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0x40, 0, 0, 0
+ };
memcpy(p, format_pg, sizeof(format_pg));
put_unaligned_be16(sdebug_sectors_per, p + 10);
@@ -2716,10 +2722,14 @@ static unsigned char caching_pg[] = {0x8, 18, 0x14, 0, 0xff, 0xff, 0, 0,
static int resp_caching_pg(unsigned char *p, int pcontrol, int target)
{ /* Caching page for mode_sense */
- unsigned char ch_caching_pg[] = {/* 0x8, 18, */ 0x4, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
- unsigned char d_caching_pg[] = {0x8, 18, 0x14, 0, 0xff, 0xff, 0, 0,
- 0xff, 0xff, 0xff, 0xff, 0x80, 0x14, 0, 0, 0, 0, 0, 0};
+ static const unsigned char ch_caching_pg[] = {
+ /* 0x8, 18, */ 0x4, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ };
+ static const unsigned char d_caching_pg[] = {
+ 0x8, 18, 0x14, 0, 0xff, 0xff, 0, 0,
+ 0xff, 0xff, 0xff, 0xff, 0x80, 0x14, 0, 0, 0, 0, 0, 0
+ };
if (SDEBUG_OPT_N_WCE & sdebug_opts)
caching_pg[2] &= ~0x4; /* set WCE=0 (default WCE=1) */
@@ -2738,8 +2748,10 @@ static int resp_ctrl_m_pg(unsigned char *p, int pcontrol, int target)
{ /* Control mode page for mode_sense */
unsigned char ch_ctrl_m_pg[] = {/* 0xa, 10, */ 0x6, 0, 0, 0, 0, 0,
0, 0, 0, 0};
- unsigned char d_ctrl_m_pg[] = {0xa, 10, 2, 0, 0, 0, 0, 0,
- 0, 0, 0x2, 0x4b};
+ static const unsigned char d_ctrl_m_pg[] = {
+ 0xa, 10, 2, 0, 0, 0, 0, 0,
+ 0, 0, 0x2, 0x4b
+ };
if (sdebug_dsense)
ctrl_m_pg[2] |= 0x4;
@@ -2794,10 +2806,14 @@ static int resp_grouping_m_pg(unsigned char *p, int pcontrol, int target)
static int resp_iec_m_pg(unsigned char *p, int pcontrol, int target)
{ /* Informational Exceptions control mode page for mode_sense */
- unsigned char ch_iec_m_pg[] = {/* 0x1c, 0xa, */ 0x4, 0xf, 0, 0, 0, 0,
- 0, 0, 0x0, 0x0};
- unsigned char d_iec_m_pg[] = {0x1c, 0xa, 0x08, 0, 0, 0, 0, 0,
- 0, 0, 0x0, 0x0};
+ static const unsigned char ch_iec_m_pg[] = {
+ /* 0x1c, 0xa, */ 0x4, 0xf, 0, 0, 0, 0,
+ 0, 0, 0x0, 0x0
+ };
+ static const unsigned char d_iec_m_pg[] = {
+ 0x1c, 0xa, 0x08, 0, 0, 0, 0, 0,
+ 0, 0, 0x0, 0x0
+ };
memcpy(p, iec_m_pg, sizeof(iec_m_pg));
if (1 == pcontrol)
@@ -2809,8 +2825,9 @@ static int resp_iec_m_pg(unsigned char *p, int pcontrol, int target)
static int resp_sas_sf_m_pg(unsigned char *p, int pcontrol, int target)
{ /* SAS SSP mode page - short format for mode_sense */
- unsigned char sas_sf_m_pg[] = {0x19, 0x6,
- 0x6, 0x0, 0x7, 0xd0, 0x0, 0x0};
+ static const unsigned char sas_sf_m_pg[] = {
+ 0x19, 0x6, 0x6, 0x0, 0x7, 0xd0, 0x0, 0x0
+ };
memcpy(p, sas_sf_m_pg, sizeof(sas_sf_m_pg));
if (1 == pcontrol)
@@ -2854,9 +2871,10 @@ static int resp_sas_pcd_m_spg(unsigned char *p, int pcontrol, int target,
static int resp_sas_sha_m_spg(unsigned char *p, int pcontrol)
{ /* SAS SSP shared protocol specific port mode subpage */
- unsigned char sas_sha_m_pg[] = {0x59, 0x2, 0, 0xc, 0, 0x6, 0x10, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- };
+ static const unsigned char sas_sha_m_pg[] = {
+ 0x59, 0x2, 0, 0xc, 0, 0x6, 0x10, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ };
memcpy(p, sas_sha_m_pg, sizeof(sas_sha_m_pg));
if (1 == pcontrol)
@@ -2923,8 +2941,10 @@ static int process_medium_part_m_pg(struct sdebug_dev_info *devip,
static int resp_compression_m_pg(unsigned char *p, int pcontrol, int target,
unsigned char dce)
{ /* Compression page for mode_sense (tape) */
- unsigned char compression_pg[] = {0x0f, 14, 0x40, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 00, 00};
+ static const unsigned char compression_pg[] = {
+ 0x0f, 14, 0x40, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0
+ };
memcpy(p, compression_pg, sizeof(compression_pg));
if (dce)
@@ -3282,9 +3302,10 @@ bad_pcode:
static int resp_temp_l_pg(unsigned char *arr)
{
- unsigned char temp_l_pg[] = {0x0, 0x0, 0x3, 0x2, 0x0, 38,
- 0x0, 0x1, 0x3, 0x2, 0x0, 65,
- };
+ static const unsigned char temp_l_pg[] = {
+ 0x0, 0x0, 0x3, 0x2, 0x0, 38,
+ 0x0, 0x1, 0x3, 0x2, 0x0, 65,
+ };
memcpy(arr, temp_l_pg, sizeof(temp_l_pg));
return sizeof(temp_l_pg);
@@ -3292,8 +3313,9 @@ static int resp_temp_l_pg(unsigned char *arr)
static int resp_ie_l_pg(unsigned char *arr)
{
- unsigned char ie_l_pg[] = {0x0, 0x0, 0x3, 0x3, 0x0, 0x0, 38,
- };
+ static const unsigned char ie_l_pg[] = {
+ 0x0, 0x0, 0x3, 0x3, 0x0, 0x0, 38,
+ };
memcpy(arr, ie_l_pg, sizeof(ie_l_pg));
if (iec_m_pg[2] & 0x4) { /* TEST bit set */
@@ -3305,11 +3327,12 @@ static int resp_ie_l_pg(unsigned char *arr)
static int resp_env_rep_l_spg(unsigned char *arr)
{
- unsigned char env_rep_l_spg[] = {0x0, 0x0, 0x23, 0x8,
- 0x0, 40, 72, 0xff, 45, 18, 0, 0,
- 0x1, 0x0, 0x23, 0x8,
- 0x0, 55, 72, 35, 55, 45, 0, 0,
- };
+ static const unsigned char env_rep_l_spg[] = {
+ 0x0, 0x0, 0x23, 0x8,
+ 0x0, 40, 72, 0xff, 45, 18, 0, 0,
+ 0x1, 0x0, 0x23, 0x8,
+ 0x0, 55, 72, 35, 55, 45, 0, 0,
+ };
memcpy(arr, env_rep_l_spg, sizeof(env_rep_l_spg));
return sizeof(env_rep_l_spg);
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index 169af7d47ce7..15ba493d2138 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -265,7 +265,7 @@ show_shost_supported_mode(struct device *dev, struct device_attribute *attr,
return show_shost_mode(supported_mode, buf);
}
-static DEVICE_ATTR(supported_mode, S_IRUGO | S_IWUSR, show_shost_supported_mode, NULL);
+static DEVICE_ATTR(supported_mode, S_IRUGO, show_shost_supported_mode, NULL);
static ssize_t
show_shost_active_mode(struct device *dev,
@@ -279,7 +279,7 @@ show_shost_active_mode(struct device *dev,
return show_shost_mode(shost->active_mode, buf);
}
-static DEVICE_ATTR(active_mode, S_IRUGO | S_IWUSR, show_shost_active_mode, NULL);
+static DEVICE_ATTR(active_mode, S_IRUGO, show_shost_active_mode, NULL);
static int check_reset_type(const char *str)
{
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index b17796d5ee66..add13e306898 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -475,13 +475,21 @@ static blk_status_t sr_init_command(struct scsi_cmnd *SCpnt)
static int sr_revalidate_disk(struct scsi_cd *cd)
{
+ struct request_queue *q = cd->device->request_queue;
struct scsi_sense_hdr sshdr;
+ struct queue_limits lim;
+ int sector_size;
/* if the unit is not ready, nothing more to do */
if (scsi_test_unit_ready(cd->device, SR_TIMEOUT, MAX_RETRIES, &sshdr))
return 0;
sr_cd_check(&cd->cdi);
- return get_sectorsize(cd);
+ sector_size = get_sectorsize(cd);
+
+ lim = queue_limits_start_update(q);
+ lim.logical_block_size = sector_size;
+ lim.features |= BLK_FEAT_ROTATIONAL;
+ return queue_limits_commit_update_frozen(q, &lim);
}
static int sr_block_open(struct gendisk *disk, blk_mode_t mode)
@@ -721,10 +729,8 @@ fail:
static int get_sectorsize(struct scsi_cd *cd)
{
- struct request_queue *q = cd->device->request_queue;
static const u8 cmd[10] = { READ_CAPACITY };
unsigned char buffer[8] = { };
- struct queue_limits lim;
int err;
int sector_size;
struct scsi_failure failure_defs[] = {
@@ -795,9 +801,7 @@ static int get_sectorsize(struct scsi_cd *cd)
set_capacity(cd->disk, cd->capacity);
}
- lim = queue_limits_start_update(q);
- lim.logical_block_size = sector_size;
- return queue_limits_commit_update_frozen(q, &lim);
+ return sector_size;
}
static int get_capabilities(struct scsi_cd *cd)
diff --git a/drivers/soc/qcom/mdt_loader.c b/drivers/soc/qcom/mdt_loader.c
index 0ca268bdf1f8..5710ac0c07a8 100644
--- a/drivers/soc/qcom/mdt_loader.c
+++ b/drivers/soc/qcom/mdt_loader.c
@@ -39,12 +39,14 @@ static bool mdt_header_valid(const struct firmware *fw)
if (phend > fw->size)
return false;
- if (ehdr->e_shentsize != sizeof(struct elf32_shdr))
- return false;
+ if (ehdr->e_shentsize || ehdr->e_shnum) {
+ if (ehdr->e_shentsize != sizeof(struct elf32_shdr))
+ return false;
- shend = size_add(size_mul(sizeof(struct elf32_shdr), ehdr->e_shnum), ehdr->e_shoff);
- if (shend > fw->size)
- return false;
+ shend = size_add(size_mul(sizeof(struct elf32_shdr), ehdr->e_shnum), ehdr->e_shoff);
+ if (shend > fw->size)
+ return false;
+ }
return true;
}
diff --git a/drivers/soc/qcom/ubwc_config.c b/drivers/soc/qcom/ubwc_config.c
index bd0a98aad9f3..15d373bff231 100644
--- a/drivers/soc/qcom/ubwc_config.c
+++ b/drivers/soc/qcom/ubwc_config.c
@@ -12,6 +12,10 @@
#include <linux/soc/qcom/ubwc.h>
+static const struct qcom_ubwc_cfg_data no_ubwc_data = {
+ /* no UBWC, no HBB */
+};
+
static const struct qcom_ubwc_cfg_data msm8937_data = {
.ubwc_enc_version = UBWC_1_0,
.ubwc_dec_version = UBWC_1_0,
@@ -215,12 +219,20 @@ static const struct qcom_ubwc_cfg_data x1e80100_data = {
};
static const struct of_device_id qcom_ubwc_configs[] __maybe_unused = {
+ { .compatible = "qcom,apq8016", .data = &no_ubwc_data },
+ { .compatible = "qcom,apq8026", .data = &no_ubwc_data },
+ { .compatible = "qcom,apq8074", .data = &no_ubwc_data },
{ .compatible = "qcom,apq8096", .data = &msm8998_data },
- { .compatible = "qcom,msm8917", .data = &msm8937_data },
+ { .compatible = "qcom,msm8226", .data = &no_ubwc_data },
+ { .compatible = "qcom,msm8916", .data = &no_ubwc_data },
+ { .compatible = "qcom,msm8917", .data = &no_ubwc_data },
{ .compatible = "qcom,msm8937", .data = &msm8937_data },
+ { .compatible = "qcom,msm8929", .data = &no_ubwc_data },
+ { .compatible = "qcom,msm8939", .data = &no_ubwc_data },
{ .compatible = "qcom,msm8953", .data = &msm8937_data },
- { .compatible = "qcom,msm8956", .data = &msm8937_data },
- { .compatible = "qcom,msm8976", .data = &msm8937_data },
+ { .compatible = "qcom,msm8956", .data = &no_ubwc_data },
+ { .compatible = "qcom,msm8974", .data = &no_ubwc_data },
+ { .compatible = "qcom,msm8976", .data = &no_ubwc_data },
{ .compatible = "qcom,msm8996", .data = &msm8998_data },
{ .compatible = "qcom,msm8998", .data = &msm8998_data },
{ .compatible = "qcom,qcm2290", .data = &qcm2290_data, },
@@ -233,7 +245,10 @@ static const struct of_device_id qcom_ubwc_configs[] __maybe_unused = {
{ .compatible = "qcom,sc7280", .data = &sc7280_data, },
{ .compatible = "qcom,sc8180x", .data = &sc8180x_data, },
{ .compatible = "qcom,sc8280xp", .data = &sc8280xp_data, },
+ { .compatible = "qcom,sda660", .data = &msm8937_data },
+ { .compatible = "qcom,sdm450", .data = &msm8937_data },
{ .compatible = "qcom,sdm630", .data = &msm8937_data },
+ { .compatible = "qcom,sdm632", .data = &msm8937_data },
{ .compatible = "qcom,sdm636", .data = &msm8937_data },
{ .compatible = "qcom,sdm660", .data = &msm8937_data },
{ .compatible = "qcom,sdm670", .data = &sdm670_data, },
@@ -246,6 +261,8 @@ static const struct of_device_id qcom_ubwc_configs[] __maybe_unused = {
{ .compatible = "qcom,sm6375", .data = &sm6350_data, },
{ .compatible = "qcom,sm7125", .data = &sc7180_data },
{ .compatible = "qcom,sm7150", .data = &sm7150_data, },
+ { .compatible = "qcom,sm7225", .data = &sm6350_data, },
+ { .compatible = "qcom,sm7325", .data = &sc7280_data, },
{ .compatible = "qcom,sm8150", .data = &sm8150_data, },
{ .compatible = "qcom,sm8250", .data = &sm8250_data, },
{ .compatible = "qcom,sm8350", .data = &sm8350_data, },
diff --git a/drivers/soc/tegra/pmc.c b/drivers/soc/tegra/pmc.c
index 2a5f24ee858c..034a2a535a1e 100644
--- a/drivers/soc/tegra/pmc.c
+++ b/drivers/soc/tegra/pmc.c
@@ -1232,7 +1232,7 @@ err:
}
static int tegra_powergate_of_get_resets(struct tegra_powergate *pg,
- struct device_node *np, bool off)
+ struct device_node *np)
{
struct device *dev = pg->pmc->dev;
int err;
@@ -1247,22 +1247,6 @@ static int tegra_powergate_of_get_resets(struct tegra_powergate *pg,
err = reset_control_acquire(pg->reset);
if (err < 0) {
pr_err("failed to acquire resets: %d\n", err);
- goto out;
- }
-
- if (off) {
- err = reset_control_assert(pg->reset);
- } else {
- err = reset_control_deassert(pg->reset);
- if (err < 0)
- goto out;
-
- reset_control_release(pg->reset);
- }
-
-out:
- if (err) {
- reset_control_release(pg->reset);
reset_control_put(pg->reset);
}
@@ -1308,20 +1292,43 @@ static int tegra_powergate_add(struct tegra_pmc *pmc, struct device_node *np)
goto set_available;
}
- err = tegra_powergate_of_get_resets(pg, np, off);
+ err = tegra_powergate_of_get_resets(pg, np);
if (err < 0) {
dev_err(dev, "failed to get resets for %pOFn: %d\n", np, err);
goto remove_clks;
}
- if (!IS_ENABLED(CONFIG_PM_GENERIC_DOMAINS)) {
- if (off)
- WARN_ON(tegra_powergate_power_up(pg, true));
+ /*
+ * If the power-domain is off, then ensure the resets are asserted.
+ * If the power-domain is on, then power down to ensure that when it
+ * is turned on, the power-domain, clocks and resets are all in the
+ * expected state.
+ */
+ if (off) {
+ err = reset_control_assert(pg->reset);
+ if (err) {
+ pr_err("failed to assert resets: %d\n", err);
+ goto remove_resets;
+ }
+ } else {
+ err = tegra_powergate_power_down(pg);
+ if (err) {
+ dev_err(dev, "failed to turn off PM domain %s: %d\n",
+ pg->genpd.name, err);
+ goto remove_resets;
+ }
+ }
+ /*
+ * If PM_GENERIC_DOMAINS is not enabled, power on
+ * the domain and skip the genpd registration.
+ */
+ if (!IS_ENABLED(CONFIG_PM_GENERIC_DOMAINS)) {
+ WARN_ON(tegra_powergate_power_up(pg, true));
goto remove_resets;
}
- err = pm_genpd_init(&pg->genpd, NULL, off);
+ err = pm_genpd_init(&pg->genpd, NULL, true);
if (err < 0) {
dev_err(dev, "failed to initialise PM domain %pOFn: %d\n", np,
err);
diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c
index 177f9a33f3a2..d288e9d9c187 100644
--- a/drivers/spi/spi-cadence-quadspi.c
+++ b/drivers/spi/spi-cadence-quadspi.c
@@ -46,6 +46,7 @@ static_assert(CQSPI_MAX_CHIPSELECT <= SPI_CS_CNT_MAX);
#define CQSPI_DMA_SET_MASK BIT(7)
#define CQSPI_SUPPORT_DEVICE_RESET BIT(8)
#define CQSPI_DISABLE_STIG_MODE BIT(9)
+#define CQSPI_DISABLE_RUNTIME_PM BIT(10)
/* Capabilities */
#define CQSPI_SUPPORTS_OCTAL BIT(0)
@@ -108,6 +109,8 @@ struct cqspi_st {
bool is_jh7110; /* Flag for StarFive JH7110 SoC */
bool disable_stig_mode;
+ refcount_t refcount;
+ refcount_t inflight_ops;
const struct cqspi_driver_platdata *ddata;
};
@@ -735,6 +738,9 @@ static int cqspi_indirect_read_execute(struct cqspi_flash_pdata *f_pdata,
u8 *rxbuf_end = rxbuf + n_rx;
int ret = 0;
+ if (!refcount_read(&cqspi->refcount))
+ return -ENODEV;
+
writel(from_addr, reg_base + CQSPI_REG_INDIRECTRDSTARTADDR);
writel(remaining, reg_base + CQSPI_REG_INDIRECTRDBYTES);
@@ -1071,6 +1077,9 @@ static int cqspi_indirect_write_execute(struct cqspi_flash_pdata *f_pdata,
unsigned int write_bytes;
int ret;
+ if (!refcount_read(&cqspi->refcount))
+ return -ENODEV;
+
writel(to_addr, reg_base + CQSPI_REG_INDIRECTWRSTARTADDR);
writel(remaining, reg_base + CQSPI_REG_INDIRECTWRBYTES);
@@ -1460,20 +1469,41 @@ static int cqspi_exec_mem_op(struct spi_mem *mem, const struct spi_mem_op *op)
int ret;
struct cqspi_st *cqspi = spi_controller_get_devdata(mem->spi->controller);
struct device *dev = &cqspi->pdev->dev;
+ const struct cqspi_driver_platdata *ddata = of_device_get_match_data(dev);
- ret = pm_runtime_resume_and_get(dev);
- if (ret) {
- dev_err(&mem->spi->dev, "resume failed with %d\n", ret);
- return ret;
+ if (refcount_read(&cqspi->inflight_ops) == 0)
+ return -ENODEV;
+
+ if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM))) {
+ ret = pm_runtime_resume_and_get(dev);
+ if (ret) {
+ dev_err(&mem->spi->dev, "resume failed with %d\n", ret);
+ return ret;
+ }
+ }
+
+ if (!refcount_read(&cqspi->refcount))
+ return -EBUSY;
+
+ refcount_inc(&cqspi->inflight_ops);
+
+ if (!refcount_read(&cqspi->refcount)) {
+ if (refcount_read(&cqspi->inflight_ops))
+ refcount_dec(&cqspi->inflight_ops);
+ return -EBUSY;
}
ret = cqspi_mem_process(mem, op);
- pm_runtime_put_autosuspend(dev);
+ if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM)))
+ pm_runtime_put_autosuspend(dev);
if (ret)
dev_err(&mem->spi->dev, "operation failed with %d\n", ret);
+ if (refcount_read(&cqspi->inflight_ops) > 1)
+ refcount_dec(&cqspi->inflight_ops);
+
return ret;
}
@@ -1925,6 +1955,9 @@ static int cqspi_probe(struct platform_device *pdev)
}
}
+ refcount_set(&cqspi->refcount, 1);
+ refcount_set(&cqspi->inflight_ops, 1);
+
ret = devm_request_irq(dev, irq, cqspi_irq_handler, 0,
pdev->name, cqspi);
if (ret) {
@@ -1957,11 +1990,12 @@ static int cqspi_probe(struct platform_device *pdev)
goto probe_setup_failed;
}
- pm_runtime_enable(dev);
-
- pm_runtime_set_autosuspend_delay(dev, CQSPI_AUTOSUSPEND_TIMEOUT);
- pm_runtime_use_autosuspend(dev);
- pm_runtime_get_noresume(dev);
+ if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM))) {
+ pm_runtime_enable(dev);
+ pm_runtime_set_autosuspend_delay(dev, CQSPI_AUTOSUSPEND_TIMEOUT);
+ pm_runtime_use_autosuspend(dev);
+ pm_runtime_get_noresume(dev);
+ }
ret = spi_register_controller(host);
if (ret) {
@@ -1969,12 +2003,17 @@ static int cqspi_probe(struct platform_device *pdev)
goto probe_setup_failed;
}
- pm_runtime_put_autosuspend(dev);
+ if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM))) {
+ pm_runtime_put_autosuspend(dev);
+ pm_runtime_mark_last_busy(dev);
+ pm_runtime_put_autosuspend(dev);
+ }
return 0;
probe_setup_failed:
cqspi_controller_enable(cqspi, 0);
- pm_runtime_disable(dev);
+ if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM)))
+ pm_runtime_disable(dev);
probe_reset_failed:
if (cqspi->is_jh7110)
cqspi_jh7110_disable_clk(pdev, cqspi);
@@ -1985,7 +2024,16 @@ probe_clk_failed:
static void cqspi_remove(struct platform_device *pdev)
{
+ const struct cqspi_driver_platdata *ddata;
struct cqspi_st *cqspi = platform_get_drvdata(pdev);
+ struct device *dev = &pdev->dev;
+
+ ddata = of_device_get_match_data(dev);
+
+ refcount_set(&cqspi->refcount, 0);
+
+ if (!refcount_dec_and_test(&cqspi->inflight_ops))
+ cqspi_wait_idle(cqspi);
spi_unregister_controller(cqspi->host);
cqspi_controller_enable(cqspi, 0);
@@ -1993,14 +2041,17 @@ static void cqspi_remove(struct platform_device *pdev)
if (cqspi->rx_chan)
dma_release_channel(cqspi->rx_chan);
- if (pm_runtime_get_sync(&pdev->dev) >= 0)
- clk_disable(cqspi->clk);
+ if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM)))
+ if (pm_runtime_get_sync(&pdev->dev) >= 0)
+ clk_disable(cqspi->clk);
if (cqspi->is_jh7110)
cqspi_jh7110_disable_clk(pdev, cqspi);
- pm_runtime_put_sync(&pdev->dev);
- pm_runtime_disable(&pdev->dev);
+ if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM))) {
+ pm_runtime_put_sync(&pdev->dev);
+ pm_runtime_disable(&pdev->dev);
+ }
}
static int cqspi_runtime_suspend(struct device *dev)
@@ -2079,7 +2130,8 @@ static const struct cqspi_driver_platdata socfpga_qspi = {
.quirks = CQSPI_DISABLE_DAC_MODE
| CQSPI_NO_SUPPORT_WR_COMPLETION
| CQSPI_SLOW_SRAM
- | CQSPI_DISABLE_STIG_MODE,
+ | CQSPI_DISABLE_STIG_MODE
+ | CQSPI_DISABLE_RUNTIME_PM,
};
static const struct cqspi_driver_platdata versal_ospi = {
diff --git a/drivers/spi/spi-fsl-lpspi.c b/drivers/spi/spi-fsl-lpspi.c
index 67d4000c3cef..431439d4cdda 100644
--- a/drivers/spi/spi-fsl-lpspi.c
+++ b/drivers/spi/spi-fsl-lpspi.c
@@ -3,8 +3,9 @@
// Freescale i.MX7ULP LPSPI driver
//
// Copyright 2016 Freescale Semiconductor, Inc.
-// Copyright 2018 NXP Semiconductors
+// Copyright 2018, 2023, 2025 NXP
+#include <linux/bitfield.h>
#include <linux/clk.h>
#include <linux/completion.h>
#include <linux/delay.h>
@@ -70,7 +71,7 @@
#define DER_TDDE BIT(0)
#define CFGR1_PCSCFG BIT(27)
#define CFGR1_PINCFG (BIT(24)|BIT(25))
-#define CFGR1_PCSPOL BIT(8)
+#define CFGR1_PCSPOL_MASK GENMASK(11, 8)
#define CFGR1_NOSTALL BIT(3)
#define CFGR1_HOST BIT(0)
#define FSR_TXCOUNT (0xFF)
@@ -82,8 +83,11 @@
#define TCR_RXMSK BIT(19)
#define TCR_TXMSK BIT(18)
+#define SR_CLEAR_MASK GENMASK(13, 8)
+
struct fsl_lpspi_devtype_data {
- u8 prescale_max;
+ u8 prescale_max : 3; /* 0 == no limit */
+ bool query_hw_for_num_cs : 1;
};
struct lpspi_config {
@@ -129,20 +133,26 @@ struct fsl_lpspi_data {
};
/*
- * ERR051608 fixed or not:
- * https://www.nxp.com/docs/en/errata/i.MX93_1P87f.pdf
+ * Devices with ERR051608 have a max TCR_PRESCALE value of 1, otherwise there is
+ * no prescale limit: https://www.nxp.com/docs/en/errata/i.MX93_1P87f.pdf
*/
-static struct fsl_lpspi_devtype_data imx93_lpspi_devtype_data = {
+static const struct fsl_lpspi_devtype_data imx93_lpspi_devtype_data = {
.prescale_max = 1,
+ .query_hw_for_num_cs = true,
};
-static struct fsl_lpspi_devtype_data imx7ulp_lpspi_devtype_data = {
- .prescale_max = 7,
+static const struct fsl_lpspi_devtype_data imx7ulp_lpspi_devtype_data = {
+ /* All defaults */
+};
+
+static const struct fsl_lpspi_devtype_data s32g_lpspi_devtype_data = {
+ .query_hw_for_num_cs = true,
};
static const struct of_device_id fsl_lpspi_dt_ids[] = {
{ .compatible = "fsl,imx7ulp-spi", .data = &imx7ulp_lpspi_devtype_data,},
{ .compatible = "fsl,imx93-spi", .data = &imx93_lpspi_devtype_data,},
+ { .compatible = "nxp,s32g2-lpspi", .data = &s32g_lpspi_devtype_data,},
{ /* sentinel */ }
};
MODULE_DEVICE_TABLE(of, fsl_lpspi_dt_ids);
@@ -321,7 +331,7 @@ static int fsl_lpspi_set_bitrate(struct fsl_lpspi_data *fsl_lpspi)
int scldiv;
perclk_rate = clk_get_rate(fsl_lpspi->clk_per);
- prescale_max = fsl_lpspi->devtype_data->prescale_max;
+ prescale_max = fsl_lpspi->devtype_data->prescale_max ?: 7;
if (!config.speed_hz) {
dev_err(fsl_lpspi->dev,
@@ -330,13 +340,11 @@ static int fsl_lpspi_set_bitrate(struct fsl_lpspi_data *fsl_lpspi)
}
if (config.speed_hz > perclk_rate / 2) {
- dev_err(fsl_lpspi->dev,
- "per-clk should be at least two times of transfer speed");
- return -EINVAL;
+ div = 2;
+ } else {
+ div = DIV_ROUND_UP(perclk_rate, config.speed_hz);
}
- div = DIV_ROUND_UP(perclk_rate, config.speed_hz);
-
for (prescale = 0; prescale <= prescale_max; prescale++) {
scldiv = div / (1 << prescale) - 2;
if (scldiv >= 0 && scldiv < 256) {
@@ -425,7 +433,9 @@ static int fsl_lpspi_config(struct fsl_lpspi_data *fsl_lpspi)
else
temp = CFGR1_PINCFG;
if (fsl_lpspi->config.mode & SPI_CS_HIGH)
- temp |= CFGR1_PCSPOL;
+ temp |= FIELD_PREP(CFGR1_PCSPOL_MASK,
+ BIT(fsl_lpspi->config.chip_select));
+
writel(temp, fsl_lpspi->base + IMX7ULP_CFGR1);
temp = readl(fsl_lpspi->base + IMX7ULP_CR);
@@ -534,14 +544,13 @@ static int fsl_lpspi_reset(struct fsl_lpspi_data *fsl_lpspi)
fsl_lpspi_intctrl(fsl_lpspi, 0);
}
- /* W1C for all flags in SR */
- temp = 0x3F << 8;
- writel(temp, fsl_lpspi->base + IMX7ULP_SR);
-
/* Clear FIFO and disable module */
temp = CR_RRF | CR_RTF;
writel(temp, fsl_lpspi->base + IMX7ULP_CR);
+ /* W1C for all flags in SR */
+ writel(SR_CLEAR_MASK, fsl_lpspi->base + IMX7ULP_SR);
+
return 0;
}
@@ -732,12 +741,10 @@ static int fsl_lpspi_pio_transfer(struct spi_controller *controller,
fsl_lpspi_write_tx_fifo(fsl_lpspi);
ret = fsl_lpspi_wait_for_completion(controller);
- if (ret)
- return ret;
fsl_lpspi_reset(fsl_lpspi);
- return 0;
+ return ret;
}
static int fsl_lpspi_transfer_one(struct spi_controller *controller,
@@ -787,7 +794,7 @@ static irqreturn_t fsl_lpspi_isr(int irq, void *dev_id)
if (temp_SR & SR_MBF ||
readl(fsl_lpspi->base + IMX7ULP_FSR) & FSR_TXCOUNT) {
writel(SR_FCF, fsl_lpspi->base + IMX7ULP_SR);
- fsl_lpspi_intctrl(fsl_lpspi, IER_FCIE);
+ fsl_lpspi_intctrl(fsl_lpspi, IER_FCIE | (temp_IER & IER_TDIE));
return IRQ_HANDLED;
}
@@ -932,7 +939,7 @@ static int fsl_lpspi_probe(struct platform_device *pdev)
fsl_lpspi->rxfifosize = 1 << ((temp >> 8) & 0x0f);
if (of_property_read_u32((&pdev->dev)->of_node, "num-cs",
&num_cs)) {
- if (of_device_is_compatible(pdev->dev.of_node, "fsl,imx93-spi"))
+ if (devtype_data->query_hw_for_num_cs)
num_cs = ((temp >> 16) & 0xf);
else
num_cs = 1;
diff --git a/drivers/spi/spi-mem.c b/drivers/spi/spi-mem.c
index d3b7e857b377..064b99204d9a 100644
--- a/drivers/spi/spi-mem.c
+++ b/drivers/spi/spi-mem.c
@@ -265,6 +265,9 @@ static bool spi_mem_internal_supports_op(struct spi_mem *mem,
*/
bool spi_mem_supports_op(struct spi_mem *mem, const struct spi_mem_op *op)
{
+ /* Make sure the operation frequency is correct before going further */
+ spi_mem_adjust_op_freq(mem, (struct spi_mem_op *)op);
+
if (spi_mem_check_op(op))
return false;
@@ -577,6 +580,7 @@ EXPORT_SYMBOL_GPL(spi_mem_adjust_op_freq);
* spi_mem_calc_op_duration() - Derives the theoretical length (in ns) of an
* operation. This helps finding the best variant
* among a list of possible choices.
+ * @mem: the SPI memory
* @op: the operation to benchmark
*
* Some chips have per-op frequency limitations, PCBs usually have their own
diff --git a/drivers/spi/spi-microchip-core-qspi.c b/drivers/spi/spi-microchip-core-qspi.c
index d13a9b755c7f..8dc98b17f77b 100644
--- a/drivers/spi/spi-microchip-core-qspi.c
+++ b/drivers/spi/spi-microchip-core-qspi.c
@@ -531,10 +531,6 @@ error:
static bool mchp_coreqspi_supports_op(struct spi_mem *mem, const struct spi_mem_op *op)
{
- struct mchp_coreqspi *qspi = spi_controller_get_devdata(mem->spi->controller);
- unsigned long clk_hz;
- u32 baud_rate_val;
-
if (!spi_mem_default_supports_op(mem, op))
return false;
@@ -557,14 +553,6 @@ static bool mchp_coreqspi_supports_op(struct spi_mem *mem, const struct spi_mem_
return false;
}
- clk_hz = clk_get_rate(qspi->clk);
- if (!clk_hz)
- return false;
-
- baud_rate_val = DIV_ROUND_UP(clk_hz, 2 * op->max_freq);
- if (baud_rate_val > MAX_DIVIDER || baud_rate_val < MIN_DIVIDER)
- return false;
-
return true;
}
diff --git a/drivers/spi/spi-omap2-mcspi.c b/drivers/spi/spi-omap2-mcspi.c
index 6dc58a30804a..69c2e9d9be3c 100644
--- a/drivers/spi/spi-omap2-mcspi.c
+++ b/drivers/spi/spi-omap2-mcspi.c
@@ -988,6 +988,7 @@ static int omap2_mcspi_setup_transfer(struct spi_device *spi,
else
l &= ~OMAP2_MCSPI_CHCONF_PHA;
+ mcspi_write_chconf0(spi, l | OMAP2_MCSPI_CHCONF_FORCE);
mcspi_write_chconf0(spi, l);
cs->mode = spi->mode;
diff --git a/drivers/spi/spi-qpic-snand.c b/drivers/spi/spi-qpic-snand.c
index a8c4eb1cbde1..780abb967822 100644
--- a/drivers/spi/spi-qpic-snand.c
+++ b/drivers/spi/spi-qpic-snand.c
@@ -210,13 +210,21 @@ static int qcom_spi_ooblayout_ecc(struct mtd_info *mtd, int section,
struct qcom_nand_controller *snandc = nand_to_qcom_snand(nand);
struct qpic_ecc *qecc = snandc->qspi->ecc;
- if (section > 1)
- return -ERANGE;
-
- oobregion->length = qecc->ecc_bytes_hw + qecc->spare_bytes;
- oobregion->offset = mtd->oobsize - oobregion->length;
+ switch (section) {
+ case 0:
+ oobregion->offset = 0;
+ oobregion->length = qecc->bytes * (qecc->steps - 1) +
+ qecc->bbm_size;
+ return 0;
+ case 1:
+ oobregion->offset = qecc->bytes * (qecc->steps - 1) +
+ qecc->bbm_size +
+ qecc->steps * 4;
+ oobregion->length = mtd->oobsize - oobregion->offset;
+ return 0;
+ }
- return 0;
+ return -ERANGE;
}
static int qcom_spi_ooblayout_free(struct mtd_info *mtd, int section,
@@ -1196,7 +1204,7 @@ static int qcom_spi_program_oob(struct qcom_nand_controller *snandc,
u32 cfg0, cfg1, ecc_bch_cfg, ecc_buf_cfg;
cfg0 = (ecc_cfg->cfg0 & ~CW_PER_PAGE_MASK) |
- FIELD_PREP(CW_PER_PAGE_MASK, num_cw - 1);
+ FIELD_PREP(CW_PER_PAGE_MASK, 0);
cfg1 = ecc_cfg->cfg1;
ecc_bch_cfg = ecc_cfg->ecc_bch_cfg;
ecc_buf_cfg = ecc_cfg->ecc_buf_cfg;
@@ -1607,11 +1615,13 @@ static int qcom_spi_probe(struct platform_device *pdev)
ret = spi_register_controller(ctlr);
if (ret) {
dev_err(&pdev->dev, "spi_register_controller failed.\n");
- goto err_spi_init;
+ goto err_register_controller;
}
return 0;
+err_register_controller:
+ nand_ecc_unregister_on_host_hw_engine(&snandc->qspi->ecc_eng);
err_spi_init:
qcom_nandc_unalloc(snandc);
err_snand_alloc:
@@ -1633,7 +1643,7 @@ static void qcom_spi_remove(struct platform_device *pdev)
struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
spi_unregister_controller(ctlr);
-
+ nand_ecc_unregister_on_host_hw_engine(&snandc->qspi->ecc_eng);
qcom_nandc_unalloc(snandc);
clk_disable_unprepare(snandc->aon_clk);
diff --git a/drivers/spi/spi-st-ssc4.c b/drivers/spi/spi-st-ssc4.c
index 49ab4c515156..c07c61dc4938 100644
--- a/drivers/spi/spi-st-ssc4.c
+++ b/drivers/spi/spi-st-ssc4.c
@@ -378,7 +378,7 @@ static void spi_st_remove(struct platform_device *pdev)
pinctrl_pm_select_sleep_state(&pdev->dev);
}
-static int __maybe_unused spi_st_runtime_suspend(struct device *dev)
+static int spi_st_runtime_suspend(struct device *dev)
{
struct spi_controller *host = dev_get_drvdata(dev);
struct spi_st *spi_st = spi_controller_get_devdata(host);
@@ -391,7 +391,7 @@ static int __maybe_unused spi_st_runtime_suspend(struct device *dev)
return 0;
}
-static int __maybe_unused spi_st_runtime_resume(struct device *dev)
+static int spi_st_runtime_resume(struct device *dev)
{
struct spi_controller *host = dev_get_drvdata(dev);
struct spi_st *spi_st = spi_controller_get_devdata(host);
@@ -428,8 +428,8 @@ static int __maybe_unused spi_st_resume(struct device *dev)
}
static const struct dev_pm_ops spi_st_pm = {
- SET_SYSTEM_SLEEP_PM_OPS(spi_st_suspend, spi_st_resume)
- SET_RUNTIME_PM_OPS(spi_st_runtime_suspend, spi_st_runtime_resume, NULL)
+ SYSTEM_SLEEP_PM_OPS(spi_st_suspend, spi_st_resume)
+ RUNTIME_PM_OPS(spi_st_runtime_suspend, spi_st_runtime_resume, NULL)
};
static const struct of_device_id stm_spi_match[] = {
@@ -441,7 +441,7 @@ MODULE_DEVICE_TABLE(of, stm_spi_match);
static struct platform_driver spi_st_driver = {
.driver = {
.name = "spi-st",
- .pm = pm_sleep_ptr(&spi_st_pm),
+ .pm = pm_ptr(&spi_st_pm),
.of_match_table = of_match_ptr(stm_spi_match),
},
.probe = spi_st_probe,
diff --git a/drivers/tee/optee/ffa_abi.c b/drivers/tee/optee/ffa_abi.c
index f9ef7d94cebd..a963eed70c1d 100644
--- a/drivers/tee/optee/ffa_abi.c
+++ b/drivers/tee/optee/ffa_abi.c
@@ -657,7 +657,7 @@ static int optee_ffa_do_call_with_arg(struct tee_context *ctx,
* with a matching configuration.
*/
-static bool optee_ffa_api_is_compatbile(struct ffa_device *ffa_dev,
+static bool optee_ffa_api_is_compatible(struct ffa_device *ffa_dev,
const struct ffa_ops *ops)
{
const struct ffa_msg_ops *msg_ops = ops->msg_ops;
@@ -908,7 +908,7 @@ static int optee_ffa_probe(struct ffa_device *ffa_dev)
ffa_ops = ffa_dev->ops;
notif_ops = ffa_ops->notifier_ops;
- if (!optee_ffa_api_is_compatbile(ffa_dev, ffa_ops))
+ if (!optee_ffa_api_is_compatible(ffa_dev, ffa_ops))
return -EINVAL;
if (!optee_ffa_exchange_caps(ffa_dev, ffa_ops, &sec_caps,
diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c
index daf6e5cfd59a..2a7d253d9c55 100644
--- a/drivers/tee/tee_shm.c
+++ b/drivers/tee/tee_shm.c
@@ -230,7 +230,7 @@ int tee_dyn_shm_alloc_helper(struct tee_shm *shm, size_t size, size_t align,
pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
if (!pages) {
rc = -ENOMEM;
- goto err;
+ goto err_pages;
}
for (i = 0; i < nr_pages; i++)
@@ -243,11 +243,13 @@ int tee_dyn_shm_alloc_helper(struct tee_shm *shm, size_t size, size_t align,
rc = shm_register(shm->ctx, shm, pages, nr_pages,
(unsigned long)shm->kaddr);
if (rc)
- goto err;
+ goto err_kfree;
}
return 0;
-err:
+err_kfree:
+ kfree(pages);
+err_pages:
free_pages_exact(shm->kaddr, shm->size);
shm->kaddr = NULL;
return rc;
@@ -560,9 +562,13 @@ EXPORT_SYMBOL_GPL(tee_shm_get_from_id);
*/
void tee_shm_put(struct tee_shm *shm)
{
- struct tee_device *teedev = shm->ctx->teedev;
+ struct tee_device *teedev;
bool do_release = false;
+ if (!shm || !shm->ctx || !shm->ctx->teedev)
+ return;
+
+ teedev = shm->ctx->teedev;
mutex_lock(&teedev->mutex);
if (refcount_dec_and_test(&shm->refcount)) {
/*
diff --git a/drivers/tty/hvc/hvc_console.c b/drivers/tty/hvc/hvc_console.c
index cd1f657f782d..13c663a154c4 100644
--- a/drivers/tty/hvc/hvc_console.c
+++ b/drivers/tty/hvc/hvc_console.c
@@ -543,10 +543,10 @@ static ssize_t hvc_write(struct tty_struct *tty, const u8 *buf, size_t count)
}
/*
- * Racy, but harmless, kick thread if there is still pending data.
+ * Kick thread to flush if there's still pending data
+ * or to wake up the write queue.
*/
- if (hp->n_outbuf)
- hvc_kick();
+ hvc_kick();
return written;
}
diff --git a/drivers/tty/serial/8250/8250_rsa.c b/drivers/tty/serial/8250/8250_rsa.c
index d34093cc03ad..12a65b79583c 100644
--- a/drivers/tty/serial/8250/8250_rsa.c
+++ b/drivers/tty/serial/8250/8250_rsa.c
@@ -147,7 +147,7 @@ void rsa_enable(struct uart_8250_port *up)
if (up->port.uartclk == SERIAL_RSA_BAUD_BASE * 16)
serial_out(up, UART_RSA_FRR, 0);
}
-EXPORT_SYMBOL_GPL_FOR_MODULES(rsa_enable, "8250_base");
+EXPORT_SYMBOL_FOR_MODULES(rsa_enable, "8250_base");
/*
* Attempts to turn off the RSA FIFO and resets the RSA board back to 115kbps compat mode. It is
@@ -179,7 +179,7 @@ void rsa_disable(struct uart_8250_port *up)
up->port.uartclk = SERIAL_RSA_BAUD_BASE_LO * 16;
uart_port_unlock_irq(&up->port);
}
-EXPORT_SYMBOL_GPL_FOR_MODULES(rsa_disable, "8250_base");
+EXPORT_SYMBOL_FOR_MODULES(rsa_disable, "8250_base");
void rsa_autoconfig(struct uart_8250_port *up)
{
@@ -192,7 +192,7 @@ void rsa_autoconfig(struct uart_8250_port *up)
if (__rsa_enable(up))
up->port.type = PORT_RSA;
}
-EXPORT_SYMBOL_GPL_FOR_MODULES(rsa_autoconfig, "8250_base");
+EXPORT_SYMBOL_FOR_MODULES(rsa_autoconfig, "8250_base");
void rsa_reset(struct uart_8250_port *up)
{
@@ -201,7 +201,7 @@ void rsa_reset(struct uart_8250_port *up)
serial_out(up, UART_RSA_FRR, 0);
}
-EXPORT_SYMBOL_GPL_FOR_MODULES(rsa_reset, "8250_base");
+EXPORT_SYMBOL_FOR_MODULES(rsa_reset, "8250_base");
#ifdef CONFIG_SERIAL_8250_DEPRECATED_OPTIONS
#ifndef MODULE
diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c
index 3f38fba8f6ea..a668e0bb26b3 100644
--- a/drivers/tty/serial/sc16is7xx.c
+++ b/drivers/tty/serial/sc16is7xx.c
@@ -1177,17 +1177,6 @@ static int sc16is7xx_startup(struct uart_port *port)
sc16is7xx_port_write(port, SC16IS7XX_FCR_REG,
SC16IS7XX_FCR_FIFO_BIT);
- /* Enable EFR */
- sc16is7xx_port_write(port, SC16IS7XX_LCR_REG,
- SC16IS7XX_LCR_CONF_MODE_B);
-
- regcache_cache_bypass(one->regmap, true);
-
- /* Enable write access to enhanced features and internal clock div */
- sc16is7xx_port_update(port, SC16IS7XX_EFR_REG,
- SC16IS7XX_EFR_ENABLE_BIT,
- SC16IS7XX_EFR_ENABLE_BIT);
-
/* Enable TCR/TLR */
sc16is7xx_port_update(port, SC16IS7XX_MCR_REG,
SC16IS7XX_MCR_TCRTLR_BIT,
@@ -1199,7 +1188,8 @@ static int sc16is7xx_startup(struct uart_port *port)
SC16IS7XX_TCR_RX_RESUME(24) |
SC16IS7XX_TCR_RX_HALT(48));
- regcache_cache_bypass(one->regmap, false);
+ /* Disable TCR/TLR access */
+ sc16is7xx_port_update(port, SC16IS7XX_MCR_REG, SC16IS7XX_MCR_TCRTLR_BIT, 0);
/* Now, initialize the UART */
sc16is7xx_port_write(port, SC16IS7XX_LCR_REG, SC16IS7XX_LCR_WORD_LEN_8);
diff --git a/drivers/tty/serial/xilinx_uartps.c b/drivers/tty/serial/xilinx_uartps.c
index fe457bf1e15b..a66b44d21fba 100644
--- a/drivers/tty/serial/xilinx_uartps.c
+++ b/drivers/tty/serial/xilinx_uartps.c
@@ -33,7 +33,6 @@
#define CDNS_UART_MINOR 0 /* works best with devtmpfs */
#define CDNS_UART_NR_PORTS 16
#define CDNS_UART_FIFO_SIZE 64 /* FIFO size */
-#define CDNS_UART_REGISTER_SPACE 0x1000
#define TX_TIMEOUT 500000
/* Rx Trigger level */
@@ -1098,15 +1097,15 @@ static int cdns_uart_verify_port(struct uart_port *port,
*/
static int cdns_uart_request_port(struct uart_port *port)
{
- if (!request_mem_region(port->mapbase, CDNS_UART_REGISTER_SPACE,
+ if (!request_mem_region(port->mapbase, port->mapsize,
CDNS_UART_NAME)) {
return -ENOMEM;
}
- port->membase = ioremap(port->mapbase, CDNS_UART_REGISTER_SPACE);
+ port->membase = ioremap(port->mapbase, port->mapsize);
if (!port->membase) {
dev_err(port->dev, "Unable to map registers\n");
- release_mem_region(port->mapbase, CDNS_UART_REGISTER_SPACE);
+ release_mem_region(port->mapbase, port->mapsize);
return -ENOMEM;
}
return 0;
@@ -1121,7 +1120,7 @@ static int cdns_uart_request_port(struct uart_port *port)
*/
static void cdns_uart_release_port(struct uart_port *port)
{
- release_mem_region(port->mapbase, CDNS_UART_REGISTER_SPACE);
+ release_mem_region(port->mapbase, port->mapsize);
iounmap(port->membase);
port->membase = NULL;
}
@@ -1780,6 +1779,7 @@ static int cdns_uart_probe(struct platform_device *pdev)
* and triggers invocation of the config_port() entry point.
*/
port->mapbase = res->start;
+ port->mapsize = resource_size(res);
port->irq = irq;
port->dev = &pdev->dev;
port->uartclk = clk_get_rate(cdns_uart_data->uartclk);
diff --git a/drivers/ufs/core/ufs-mcq.c b/drivers/ufs/core/ufs-mcq.c
index 1e50675772fe..cc88aaa106da 100644
--- a/drivers/ufs/core/ufs-mcq.c
+++ b/drivers/ufs/core/ufs-mcq.c
@@ -243,7 +243,7 @@ int ufshcd_mcq_memory_alloc(struct ufs_hba *hba)
hwq->sqe_base_addr = dmam_alloc_coherent(hba->dev, utrdl_size,
&hwq->sqe_dma_addr,
GFP_KERNEL);
- if (!hwq->sqe_dma_addr) {
+ if (!hwq->sqe_base_addr) {
dev_err(hba->dev, "SQE allocation failed\n");
return -ENOMEM;
}
@@ -252,7 +252,7 @@ int ufshcd_mcq_memory_alloc(struct ufs_hba *hba)
hwq->cqe_base_addr = dmam_alloc_coherent(hba->dev, cqe_size,
&hwq->cqe_dma_addr,
GFP_KERNEL);
- if (!hwq->cqe_dma_addr) {
+ if (!hwq->cqe_base_addr) {
dev_err(hba->dev, "CQE allocation failed\n");
return -ENOMEM;
}
diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 96ad57c3144b..9a43102b2b21 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -1303,7 +1303,7 @@ static u32 ufshcd_pending_cmds(struct ufs_hba *hba)
*
* Return: 0 upon success; -EBUSY upon timeout.
*/
-static int ufshcd_wait_for_doorbell_clr(struct ufs_hba *hba,
+static int ufshcd_wait_for_pending_cmds(struct ufs_hba *hba,
u64 wait_timeout_us)
{
int ret = 0;
@@ -1431,7 +1431,7 @@ static int ufshcd_clock_scaling_prepare(struct ufs_hba *hba, u64 timeout_us)
down_write(&hba->clk_scaling_lock);
if (!hba->clk_scaling.is_allowed ||
- ufshcd_wait_for_doorbell_clr(hba, timeout_us)) {
+ ufshcd_wait_for_pending_cmds(hba, timeout_us)) {
ret = -EBUSY;
up_write(&hba->clk_scaling_lock);
mutex_unlock(&hba->wb_mutex);
@@ -3199,7 +3199,8 @@ ufshcd_dev_cmd_completion(struct ufs_hba *hba, struct ufshcd_lrb *lrbp)
}
/*
- * Return: 0 upon success; < 0 upon failure.
+ * Return: 0 upon success; > 0 in case the UFS device reported an OCS error;
+ * < 0 if another error occurred.
*/
static int ufshcd_wait_for_dev_cmd(struct ufs_hba *hba,
struct ufshcd_lrb *lrbp, int max_timeout)
@@ -3275,7 +3276,6 @@ retry:
}
}
- WARN_ONCE(err > 0, "Incorrect return value %d > 0\n", err);
return err;
}
@@ -3294,7 +3294,8 @@ static void ufshcd_dev_man_unlock(struct ufs_hba *hba)
}
/*
- * Return: 0 upon success; < 0 upon failure.
+ * Return: 0 upon success; > 0 in case the UFS device reported an OCS error;
+ * < 0 if another error occurred.
*/
static int ufshcd_issue_dev_cmd(struct ufs_hba *hba, struct ufshcd_lrb *lrbp,
const u32 tag, int timeout)
@@ -3317,7 +3318,8 @@ static int ufshcd_issue_dev_cmd(struct ufs_hba *hba, struct ufshcd_lrb *lrbp,
* @cmd_type: specifies the type (NOP, Query...)
* @timeout: timeout in milliseconds
*
- * Return: 0 upon success; < 0 upon failure.
+ * Return: 0 upon success; > 0 in case the UFS device reported an OCS error;
+ * < 0 if another error occurred.
*
* NOTE: Since there is only one available tag for device management commands,
* it is expected you hold the hba->dev_cmd.lock mutex.
@@ -3363,6 +3365,10 @@ static inline void ufshcd_init_query(struct ufs_hba *hba,
(*request)->upiu_req.selector = selector;
}
+/*
+ * Return: 0 upon success; > 0 in case the UFS device reported an OCS error;
+ * < 0 if another error occurred.
+ */
static int ufshcd_query_flag_retry(struct ufs_hba *hba,
enum query_opcode opcode, enum flag_idn idn, u8 index, bool *flag_res)
{
@@ -3383,7 +3389,6 @@ static int ufshcd_query_flag_retry(struct ufs_hba *hba,
dev_err(hba->dev,
"%s: query flag, opcode %d, idn %d, failed with error %d after %d retries\n",
__func__, opcode, idn, ret, retries);
- WARN_ONCE(ret > 0, "Incorrect return value %d > 0\n", ret);
return ret;
}
@@ -3395,7 +3400,8 @@ static int ufshcd_query_flag_retry(struct ufs_hba *hba,
* @index: flag index to access
* @flag_res: the flag value after the query request completes
*
- * Return: 0 for success; < 0 upon failure.
+ * Return: 0 upon success; > 0 in case the UFS device reported an OCS error;
+ * < 0 if another error occurred.
*/
int ufshcd_query_flag(struct ufs_hba *hba, enum query_opcode opcode,
enum flag_idn idn, u8 index, bool *flag_res)
@@ -3451,7 +3457,6 @@ int ufshcd_query_flag(struct ufs_hba *hba, enum query_opcode opcode,
out_unlock:
ufshcd_dev_man_unlock(hba);
- WARN_ONCE(err > 0, "Incorrect return value %d > 0\n", err);
return err;
}
@@ -3464,8 +3469,9 @@ out_unlock:
* @selector: selector field
* @attr_val: the attribute value after the query request completes
*
- * Return: 0 upon success; < 0 upon failure.
-*/
+ * Return: 0 upon success; > 0 in case the UFS device reported an OCS error;
+ * < 0 if another error occurred.
+ */
int ufshcd_query_attr(struct ufs_hba *hba, enum query_opcode opcode,
enum attr_idn idn, u8 index, u8 selector, u32 *attr_val)
{
@@ -3513,7 +3519,6 @@ int ufshcd_query_attr(struct ufs_hba *hba, enum query_opcode opcode,
out_unlock:
ufshcd_dev_man_unlock(hba);
- WARN_ONCE(err > 0, "Incorrect return value %d > 0\n", err);
return err;
}
@@ -3528,8 +3533,9 @@ out_unlock:
* @attr_val: the attribute value after the query request
* completes
*
- * Return: 0 for success; < 0 upon failure.
-*/
+ * Return: 0 upon success; > 0 in case the UFS device reported an OCS error;
+ * < 0 if another error occurred.
+ */
int ufshcd_query_attr_retry(struct ufs_hba *hba,
enum query_opcode opcode, enum attr_idn idn, u8 index, u8 selector,
u32 *attr_val)
@@ -3551,12 +3557,12 @@ int ufshcd_query_attr_retry(struct ufs_hba *hba,
dev_err(hba->dev,
"%s: query attribute, idn %d, failed with error %d after %d retries\n",
__func__, idn, ret, QUERY_REQ_RETRIES);
- WARN_ONCE(ret > 0, "Incorrect return value %d > 0\n", ret);
return ret;
}
/*
- * Return: 0 if successful; < 0 upon failure.
+ * Return: 0 upon success; > 0 in case the UFS device reported an OCS error;
+ * < 0 if another error occurred.
*/
static int __ufshcd_query_descriptor(struct ufs_hba *hba,
enum query_opcode opcode, enum desc_idn idn, u8 index,
@@ -3615,7 +3621,6 @@ static int __ufshcd_query_descriptor(struct ufs_hba *hba,
out_unlock:
hba->dev_cmd.query.descriptor = NULL;
ufshcd_dev_man_unlock(hba);
- WARN_ONCE(err > 0, "Incorrect return value %d > 0\n", err);
return err;
}
@@ -3632,7 +3637,8 @@ out_unlock:
* The buf_len parameter will contain, on return, the length parameter
* received on the response.
*
- * Return: 0 for success; < 0 upon failure.
+ * Return: 0 upon success; > 0 in case the UFS device reported an OCS error;
+ * < 0 if another error occurred.
*/
int ufshcd_query_descriptor_retry(struct ufs_hba *hba,
enum query_opcode opcode,
@@ -3650,7 +3656,6 @@ int ufshcd_query_descriptor_retry(struct ufs_hba *hba,
break;
}
- WARN_ONCE(err > 0, "Incorrect return value %d > 0\n", err);
return err;
}
@@ -3663,7 +3668,8 @@ int ufshcd_query_descriptor_retry(struct ufs_hba *hba,
* @param_read_buf: pointer to buffer where parameter would be read
* @param_size: sizeof(param_read_buf)
*
- * Return: 0 in case of success; < 0 upon failure.
+ * Return: 0 upon success; > 0 in case the UFS device reported an OCS error;
+ * < 0 if another error occurred.
*/
int ufshcd_read_desc_param(struct ufs_hba *hba,
enum desc_idn desc_id,
@@ -3730,7 +3736,6 @@ int ufshcd_read_desc_param(struct ufs_hba *hba,
out:
if (is_kmalloc)
kfree(desc_buf);
- WARN_ONCE(ret > 0, "Incorrect return value %d > 0\n", ret);
return ret;
}
@@ -4781,7 +4786,8 @@ EXPORT_SYMBOL_GPL(ufshcd_config_pwr_mode);
*
* Set fDeviceInit flag and poll until device toggles it.
*
- * Return: 0 upon success; < 0 upon failure.
+ * Return: 0 upon success; > 0 in case the UFS device reported an OCS error;
+ * < 0 if another error occurred.
*/
static int ufshcd_complete_dev_init(struct ufs_hba *hba)
{
@@ -5135,7 +5141,8 @@ out:
* not respond with NOP IN UPIU within timeout of %NOP_OUT_TIMEOUT
* and we retry sending NOP OUT for %NOP_OUT_RETRIES iterations.
*
- * Return: 0 upon success; < 0 upon failure.
+ * Return: 0 upon success; > 0 in case the UFS device reported an OCS error;
+ * < 0 if another error occurred.
*/
static int ufshcd_verify_dev_init(struct ufs_hba *hba)
{
@@ -5559,9 +5566,9 @@ static irqreturn_t ufshcd_uic_cmd_compl(struct ufs_hba *hba, u32 intr_status)
irqreturn_t retval = IRQ_NONE;
struct uic_command *cmd;
- spin_lock(hba->host->host_lock);
+ guard(spinlock_irqsave)(hba->host->host_lock);
cmd = hba->active_uic_cmd;
- if (WARN_ON_ONCE(!cmd))
+ if (!cmd)
goto unlock;
if (ufshcd_is_auto_hibern8_error(hba, intr_status))
@@ -5586,8 +5593,6 @@ static irqreturn_t ufshcd_uic_cmd_compl(struct ufs_hba *hba, u32 intr_status)
ufshcd_add_uic_command_trace(hba, cmd, UFS_CMD_COMP);
unlock:
- spin_unlock(hba->host->host_lock);
-
return retval;
}
@@ -5869,7 +5874,8 @@ static inline int ufshcd_enable_ee(struct ufs_hba *hba, u16 mask)
* as the device is allowed to manage its own way of handling background
* operations.
*
- * Return: zero on success, non-zero on failure.
+ * Return: 0 upon success; > 0 in case the UFS device reported an OCS error;
+ * < 0 if another error occurred.
*/
static int ufshcd_enable_auto_bkops(struct ufs_hba *hba)
{
@@ -5908,7 +5914,8 @@ out:
* host is idle so that BKOPS are managed effectively without any negative
* impacts.
*
- * Return: zero on success, non-zero on failure.
+ * Return: 0 upon success; > 0 in case the UFS device reported an OCS error;
+ * < 0 if another error occurred.
*/
static int ufshcd_disable_auto_bkops(struct ufs_hba *hba)
{
@@ -6058,6 +6065,10 @@ out:
__func__, err);
}
+/*
+ * Return: 0 upon success; > 0 in case the UFS device reported an OCS error;
+ * < 0 if another error occurred.
+ */
int ufshcd_read_device_lvl_exception_id(struct ufs_hba *hba, u64 *exception_id)
{
struct utp_upiu_query_v4_0 *upiu_resp;
@@ -6920,7 +6931,7 @@ static irqreturn_t ufshcd_check_errors(struct ufs_hba *hba, u32 intr_status)
bool queue_eh_work = false;
irqreturn_t retval = IRQ_NONE;
- spin_lock(hba->host->host_lock);
+ guard(spinlock_irqsave)(hba->host->host_lock);
hba->errors |= UFSHCD_ERROR_MASK & intr_status;
if (hba->errors & INT_FATAL_ERRORS) {
@@ -6979,7 +6990,7 @@ static irqreturn_t ufshcd_check_errors(struct ufs_hba *hba, u32 intr_status)
*/
hba->errors = 0;
hba->uic_error = 0;
- spin_unlock(hba->host->host_lock);
+
return retval;
}
@@ -7138,14 +7149,19 @@ static irqreturn_t ufshcd_threaded_intr(int irq, void *__hba)
static irqreturn_t ufshcd_intr(int irq, void *__hba)
{
struct ufs_hba *hba = __hba;
+ u32 intr_status, enabled_intr_status;
/* Move interrupt handling to thread when MCQ & ESI are not enabled */
if (!hba->mcq_enabled || !hba->mcq_esi_enabled)
return IRQ_WAKE_THREAD;
+ intr_status = ufshcd_readl(hba, REG_INTERRUPT_STATUS);
+ enabled_intr_status = intr_status & ufshcd_readl(hba, REG_INTERRUPT_ENABLE);
+
+ ufshcd_writel(hba, intr_status, REG_INTERRUPT_STATUS);
+
/* Directly handle interrupts since MCQ ESI handlers does the hard job */
- return ufshcd_sl_intr(hba, ufshcd_readl(hba, REG_INTERRUPT_STATUS) &
- ufshcd_readl(hba, REG_INTERRUPT_ENABLE));
+ return ufshcd_sl_intr(hba, enabled_intr_status);
}
static int ufshcd_clear_tm_cmd(struct ufs_hba *hba, int tag)
@@ -7449,7 +7465,8 @@ int ufshcd_exec_raw_upiu_cmd(struct ufs_hba *hba,
* @sg_list: Pointer to SG list when DATA IN/OUT UPIU is required in ARPMB operation
* @dir: DMA direction
*
- * Return: zero on success, non-zero on failure.
+ * Return: 0 upon success; > 0 in case the UFS device reported an OCS error;
+ * < 0 if another error occurred.
*/
int ufshcd_advanced_rpmb_req_handler(struct ufs_hba *hba, struct utp_upiu_req *req_upiu,
struct utp_upiu_req *rsp_upiu, struct ufs_ehs *req_ehs,
@@ -10516,8 +10533,7 @@ int ufshcd_alloc_host(struct device *dev, struct ufs_hba **hba_handle)
err = devm_add_action_or_reset(dev, ufshcd_devres_release,
host);
if (err)
- return dev_err_probe(dev, err,
- "failed to add ufshcd dealloc action\n");
+ return err;
host->nr_maps = HCTX_TYPE_POLL + 1;
hba = shost_priv(host);
diff --git a/drivers/ufs/host/ufs-mediatek.c b/drivers/ufs/host/ufs-mediatek.c
index 86ae73b89d4d..f902ce08c95a 100644
--- a/drivers/ufs/host/ufs-mediatek.c
+++ b/drivers/ufs/host/ufs-mediatek.c
@@ -818,7 +818,7 @@ static u32 ufs_mtk_mcq_get_irq(struct ufs_hba *hba, unsigned int cpu)
unsigned int q_index;
q_index = map->mq_map[cpu];
- if (q_index > nr) {
+ if (q_index >= nr) {
dev_err(hba->dev, "hwq index %d exceed %d\n",
q_index, nr);
return MTK_MCQ_INVALID_IRQ;
diff --git a/drivers/ufs/host/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c
index 76fc70503a62..9574fdc2bb0f 100644
--- a/drivers/ufs/host/ufs-qcom.c
+++ b/drivers/ufs/host/ufs-qcom.c
@@ -2070,17 +2070,6 @@ static irqreturn_t ufs_qcom_mcq_esi_handler(int irq, void *data)
return IRQ_HANDLED;
}
-static void ufs_qcom_irq_free(struct ufs_qcom_irq *uqi)
-{
- for (struct ufs_qcom_irq *q = uqi; q->irq; q++)
- devm_free_irq(q->hba->dev, q->irq, q->hba);
-
- platform_device_msi_free_irqs_all(uqi->hba->dev);
- devm_kfree(uqi->hba->dev, uqi);
-}
-
-DEFINE_FREE(ufs_qcom_irq, struct ufs_qcom_irq *, if (_T) ufs_qcom_irq_free(_T))
-
static int ufs_qcom_config_esi(struct ufs_hba *hba)
{
struct ufs_qcom_host *host = ufshcd_get_variant(hba);
@@ -2095,18 +2084,18 @@ static int ufs_qcom_config_esi(struct ufs_hba *hba)
*/
nr_irqs = hba->nr_hw_queues - hba->nr_queues[HCTX_TYPE_POLL];
- struct ufs_qcom_irq *qi __free(ufs_qcom_irq) =
- devm_kcalloc(hba->dev, nr_irqs, sizeof(*qi), GFP_KERNEL);
- if (!qi)
- return -ENOMEM;
- /* Preset so __free() has a pointer to hba in all error paths */
- qi[0].hba = hba;
-
ret = platform_device_msi_init_and_alloc_irqs(hba->dev, nr_irqs,
ufs_qcom_write_msi_msg);
if (ret) {
- dev_err(hba->dev, "Failed to request Platform MSI %d\n", ret);
- return ret;
+ dev_warn(hba->dev, "Platform MSI not supported or failed, continuing without ESI\n");
+ return ret; /* Continue without ESI */
+ }
+
+ struct ufs_qcom_irq *qi = devm_kcalloc(hba->dev, nr_irqs, sizeof(*qi), GFP_KERNEL);
+
+ if (!qi) {
+ platform_device_msi_free_irqs_all(hba->dev);
+ return -ENOMEM;
}
for (int idx = 0; idx < nr_irqs; idx++) {
@@ -2117,15 +2106,17 @@ static int ufs_qcom_config_esi(struct ufs_hba *hba)
ret = devm_request_irq(hba->dev, qi[idx].irq, ufs_qcom_mcq_esi_handler,
IRQF_SHARED, "qcom-mcq-esi", qi + idx);
if (ret) {
- dev_err(hba->dev, "%s: Fail to request IRQ for %d, err = %d\n",
+ dev_err(hba->dev, "%s: Failed to request IRQ for %d, err = %d\n",
__func__, qi[idx].irq, ret);
- qi[idx].irq = 0;
+ /* Free previously allocated IRQs */
+ for (int j = 0; j < idx; j++)
+ devm_free_irq(hba->dev, qi[j].irq, qi + j);
+ platform_device_msi_free_irqs_all(hba->dev);
+ devm_kfree(hba->dev, qi);
return ret;
}
}
- retain_and_null_ptr(qi);
-
if (host->hw_ver.major >= 6) {
ufshcd_rmwl(hba, ESI_VEC_MASK, FIELD_PREP(ESI_VEC_MASK, MAX_ESI_VEC - 1),
REG_UFS_CFG3);
diff --git a/drivers/ufs/host/ufshcd-pci.c b/drivers/ufs/host/ufshcd-pci.c
index b39239f641f2..b87e03777395 100644
--- a/drivers/ufs/host/ufshcd-pci.c
+++ b/drivers/ufs/host/ufshcd-pci.c
@@ -630,6 +630,7 @@ static const struct pci_device_id ufshcd_pci_tbl[] = {
{ PCI_VDEVICE(INTEL, 0xA847), (kernel_ulong_t)&ufs_intel_mtl_hba_vops },
{ PCI_VDEVICE(INTEL, 0x7747), (kernel_ulong_t)&ufs_intel_mtl_hba_vops },
{ PCI_VDEVICE(INTEL, 0xE447), (kernel_ulong_t)&ufs_intel_mtl_hba_vops },
+ { PCI_VDEVICE(INTEL, 0x4D47), (kernel_ulong_t)&ufs_intel_mtl_hba_vops },
{ } /* terminate list */
};
diff --git a/drivers/usb/chipidea/ci_hdrc_imx.c b/drivers/usb/chipidea/ci_hdrc_imx.c
index e1ec9b38f5b9..d7c2a1a3c271 100644
--- a/drivers/usb/chipidea/ci_hdrc_imx.c
+++ b/drivers/usb/chipidea/ci_hdrc_imx.c
@@ -338,7 +338,8 @@ static int ci_hdrc_imx_notify_event(struct ci_hdrc *ci, unsigned int event)
schedule_work(&ci->usb_phy->chg_work);
break;
case CI_HDRC_CONTROLLER_PULLUP_EVENT:
- if (ci->role == CI_ROLE_GADGET)
+ if (ci->role == CI_ROLE_GADGET &&
+ ci->gadget.speed == USB_SPEED_HIGH)
imx_usbmisc_pullup(data->usbmisc_data,
ci->gadget.connected);
break;
diff --git a/drivers/usb/chipidea/usbmisc_imx.c b/drivers/usb/chipidea/usbmisc_imx.c
index 3d20c5e76c6a..b1418885707c 100644
--- a/drivers/usb/chipidea/usbmisc_imx.c
+++ b/drivers/usb/chipidea/usbmisc_imx.c
@@ -1068,15 +1068,24 @@ static void usbmisc_imx7d_pullup(struct imx_usbmisc_data *data, bool on)
unsigned long flags;
u32 val;
+ if (on)
+ return;
+
spin_lock_irqsave(&usbmisc->lock, flags);
val = readl(usbmisc->base + MX7D_USBNC_USB_CTRL2);
- if (!on) {
- val &= ~MX7D_USBNC_USB_CTRL2_OPMODE_OVERRIDE_MASK;
- val |= MX7D_USBNC_USB_CTRL2_OPMODE(1);
- val |= MX7D_USBNC_USB_CTRL2_OPMODE_OVERRIDE_EN;
- } else {
- val &= ~MX7D_USBNC_USB_CTRL2_OPMODE_OVERRIDE_EN;
- }
+ val &= ~MX7D_USBNC_USB_CTRL2_OPMODE_OVERRIDE_MASK;
+ val |= MX7D_USBNC_USB_CTRL2_OPMODE(1);
+ val |= MX7D_USBNC_USB_CTRL2_OPMODE_OVERRIDE_EN;
+ writel(val, usbmisc->base + MX7D_USBNC_USB_CTRL2);
+ spin_unlock_irqrestore(&usbmisc->lock, flags);
+
+ /* Hold for at least 1 micro-frame to let the host see the disconnect signal */
+ usleep_range(125, 150);
+
+ spin_lock_irqsave(&usbmisc->lock, flags);
+ val &= ~MX7D_USBNC_USB_CTRL2_OPMODE_OVERRIDE_MASK;
+ val |= MX7D_USBNC_USB_CTRL2_OPMODE(0);
+ val &= ~MX7D_USBNC_USB_CTRL2_OPMODE_OVERRIDE_EN;
writel(val, usbmisc->base + MX7D_USBNC_USB_CTRL2);
spin_unlock_irqrestore(&usbmisc->lock, flags);
}
diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c
index c3177034b779..f441958b0ef4 100644
--- a/drivers/usb/core/driver.c
+++ b/drivers/usb/core/driver.c
@@ -119,11 +119,11 @@ ssize_t usb_show_dynids(struct usb_dynids *dynids, char *buf)
guard(mutex)(&usb_dynids_lock);
list_for_each_entry(dynid, &dynids->list, node)
if (dynid->id.bInterfaceClass != 0)
- count += sysfs_emit_at(&buf[count], count, "%04x %04x %02x\n",
+ count += sysfs_emit_at(buf, count, "%04x %04x %02x\n",
dynid->id.idVendor, dynid->id.idProduct,
dynid->id.bInterfaceClass);
else
- count += sysfs_emit_at(&buf[count], count, "%04x %04x\n",
+ count += sysfs_emit_at(buf, count, "%04x %04x\n",
dynid->id.idVendor, dynid->id.idProduct);
return count;
}
diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index 03771bbc6c01..9dd79769cad1 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -1636,7 +1636,6 @@ static void __usb_hcd_giveback_urb(struct urb *urb)
struct usb_hcd *hcd = bus_to_hcd(urb->dev->bus);
struct usb_anchor *anchor = urb->anchor;
int status = urb->unlinked;
- unsigned long flags;
urb->hcpriv = NULL;
if (unlikely((urb->transfer_flags & URB_SHORT_NOT_OK) &&
@@ -1654,14 +1653,13 @@ static void __usb_hcd_giveback_urb(struct urb *urb)
/* pass ownership to the completion handler */
urb->status = status;
/*
- * Only collect coverage in the softirq context and disable interrupts
- * to avoid scenarios with nested remote coverage collection sections
- * that KCOV does not support.
- * See the comment next to kcov_remote_start_usb_softirq() for details.
+ * This function can be called in task context inside another remote
+ * coverage collection section, but kcov doesn't support that kind of
+ * recursion yet. Only collect coverage in softirq context for now.
*/
- flags = kcov_remote_start_usb_softirq((u64)urb->dev->bus->busnum);
+ kcov_remote_start_usb_softirq((u64)urb->dev->bus->busnum);
urb->complete(urb);
- kcov_remote_stop_softirq(flags);
+ kcov_remote_stop_softirq();
usb_anchor_resume_wakeups(anchor);
atomic_dec(&urb->use_count);
@@ -1719,10 +1717,10 @@ static void usb_giveback_urb_bh(struct work_struct *work)
* @urb: urb being returned to the USB device driver.
* @status: completion status code for the URB.
*
- * Context: atomic. The completion callback is invoked in caller's context.
- * For HCDs with HCD_BH flag set, the completion callback is invoked in BH
- * context (except for URBs submitted to the root hub which always complete in
- * caller's context).
+ * Context: atomic. The completion callback is invoked either in a work queue
+ * (BH) context or in the caller's context, depending on whether the HCD_BH
+ * flag is set in the @hcd structure, except that URBs submitted to the
+ * root hub always complete in BH context.
*
* This hands the URB from HCD to its USB device driver, using its
* completion function. The HCD has freed all per-urb resources
@@ -2166,7 +2164,7 @@ static struct urb *request_single_step_set_feature_urb(
urb->complete = usb_ehset_completion;
urb->status = -EINPROGRESS;
urb->actual_length = 0;
- urb->transfer_flags = URB_DIR_IN;
+ urb->transfer_flags = URB_DIR_IN | URB_NO_TRANSFER_DMA_MAP;
usb_get_urb(urb);
atomic_inc(&urb->use_count);
atomic_inc(&urb->dev->urbnum);
@@ -2230,9 +2228,15 @@ int ehset_single_step_set_feature(struct usb_hcd *hcd, int port)
/* Complete remaining DATA and STATUS stages using the same URB */
urb->status = -EINPROGRESS;
+ urb->transfer_flags &= ~URB_NO_TRANSFER_DMA_MAP;
usb_get_urb(urb);
atomic_inc(&urb->use_count);
atomic_inc(&urb->dev->urbnum);
+ if (map_urb_for_dma(hcd, urb, GFP_KERNEL)) {
+ usb_put_urb(urb);
+ goto out1;
+ }
+
retval = hcd->driver->submit_single_step_set_feature(hcd, urb, 0);
if (!retval && !wait_for_completion_timeout(&done,
msecs_to_jiffies(2000))) {
diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c
index ff0ff95d5cca..f5bc53875330 100644
--- a/drivers/usb/core/quirks.c
+++ b/drivers/usb/core/quirks.c
@@ -371,6 +371,7 @@ static const struct usb_device_id usb_quirk_list[] = {
{ USB_DEVICE(0x0781, 0x5591), .driver_info = USB_QUIRK_NO_LPM },
/* SanDisk Corp. SanDisk 3.2Gen1 */
+ { USB_DEVICE(0x0781, 0x5596), .driver_info = USB_QUIRK_DELAY_INIT },
{ USB_DEVICE(0x0781, 0x55a3), .driver_info = USB_QUIRK_DELAY_INIT },
/* SanDisk Extreme 55AE */
diff --git a/drivers/usb/dwc3/dwc3-pci.c b/drivers/usb/dwc3/dwc3-pci.c
index 54a4ee2b90b7..39c72cb52ce7 100644
--- a/drivers/usb/dwc3/dwc3-pci.c
+++ b/drivers/usb/dwc3/dwc3-pci.c
@@ -41,6 +41,7 @@
#define PCI_DEVICE_ID_INTEL_TGPLP 0xa0ee
#define PCI_DEVICE_ID_INTEL_TGPH 0x43ee
#define PCI_DEVICE_ID_INTEL_JSP 0x4dee
+#define PCI_DEVICE_ID_INTEL_WCL 0x4d7e
#define PCI_DEVICE_ID_INTEL_ADL 0x460e
#define PCI_DEVICE_ID_INTEL_ADL_PCH 0x51ee
#define PCI_DEVICE_ID_INTEL_ADLN 0x465e
@@ -431,6 +432,7 @@ static const struct pci_device_id dwc3_pci_id_table[] = {
{ PCI_DEVICE_DATA(INTEL, TGPLP, &dwc3_pci_intel_swnode) },
{ PCI_DEVICE_DATA(INTEL, TGPH, &dwc3_pci_intel_swnode) },
{ PCI_DEVICE_DATA(INTEL, JSP, &dwc3_pci_intel_swnode) },
+ { PCI_DEVICE_DATA(INTEL, WCL, &dwc3_pci_intel_swnode) },
{ PCI_DEVICE_DATA(INTEL, ADL, &dwc3_pci_intel_swnode) },
{ PCI_DEVICE_DATA(INTEL, ADL_PCH, &dwc3_pci_intel_swnode) },
{ PCI_DEVICE_DATA(INTEL, ADLN, &dwc3_pci_intel_swnode) },
diff --git a/drivers/usb/dwc3/ep0.c b/drivers/usb/dwc3/ep0.c
index 666ac432f52d..b4229aa13f37 100644
--- a/drivers/usb/dwc3/ep0.c
+++ b/drivers/usb/dwc3/ep0.c
@@ -288,7 +288,9 @@ void dwc3_ep0_out_start(struct dwc3 *dwc)
dwc3_ep0_prepare_one_trb(dep, dwc->ep0_trb_addr, 8,
DWC3_TRBCTL_CONTROL_SETUP, false);
ret = dwc3_ep0_start_trans(dep);
- WARN_ON(ret < 0);
+ if (ret < 0)
+ dev_err(dwc->dev, "ep0 out start transfer failed: %d\n", ret);
+
for (i = 2; i < DWC3_ENDPOINTS_NUM; i++) {
struct dwc3_ep *dwc3_ep;
@@ -1061,7 +1063,9 @@ static void __dwc3_ep0_do_control_data(struct dwc3 *dwc,
ret = dwc3_ep0_start_trans(dep);
}
- WARN_ON(ret < 0);
+ if (ret < 0)
+ dev_err(dwc->dev,
+ "ep0 data phase start transfer failed: %d\n", ret);
}
static int dwc3_ep0_start_control_status(struct dwc3_ep *dep)
@@ -1078,7 +1082,12 @@ static int dwc3_ep0_start_control_status(struct dwc3_ep *dep)
static void __dwc3_ep0_do_control_status(struct dwc3 *dwc, struct dwc3_ep *dep)
{
- WARN_ON(dwc3_ep0_start_control_status(dep));
+ int ret;
+
+ ret = dwc3_ep0_start_control_status(dep);
+ if (ret)
+ dev_err(dwc->dev,
+ "ep0 status phase start transfer failed: %d\n", ret);
}
static void dwc3_ep0_do_control_status(struct dwc3 *dwc,
@@ -1121,7 +1130,10 @@ void dwc3_ep0_end_control_data(struct dwc3 *dwc, struct dwc3_ep *dep)
cmd |= DWC3_DEPCMD_PARAM(dep->resource_index);
memset(&params, 0, sizeof(params));
ret = dwc3_send_gadget_ep_cmd(dep, cmd, &params);
- WARN_ON_ONCE(ret);
+ if (ret)
+ dev_err_ratelimited(dwc->dev,
+ "ep0 data phase end transfer failed: %d\n", ret);
+
dep->resource_index = 0;
}
diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
index 25db36c63951..554f997eb8c4 100644
--- a/drivers/usb/dwc3/gadget.c
+++ b/drivers/usb/dwc3/gadget.c
@@ -1772,7 +1772,11 @@ static int __dwc3_stop_active_transfer(struct dwc3_ep *dep, bool force, bool int
dep->flags |= DWC3_EP_DELAY_STOP;
return 0;
}
- WARN_ON_ONCE(ret);
+
+ if (ret)
+ dev_err_ratelimited(dep->dwc->dev,
+ "end transfer failed: %d\n", ret);
+
dep->resource_index = 0;
if (!interrupt)
@@ -3777,6 +3781,15 @@ static void dwc3_gadget_endpoint_transfer_complete(struct dwc3_ep *dep,
static void dwc3_gadget_endpoint_transfer_not_ready(struct dwc3_ep *dep,
const struct dwc3_event_depevt *event)
{
+ /*
+ * During a device-initiated disconnect, a late xferNotReady event can
+ * be generated after the End Transfer command resets the event filter,
+ * but before the controller is halted. Ignore it to prevent a new
+ * transfer from starting.
+ */
+ if (!dep->dwc->connected)
+ return;
+
dwc3_gadget_endpoint_frame_from_event(dep, event);
/*
@@ -4039,7 +4052,9 @@ static void dwc3_clear_stall_all_ep(struct dwc3 *dwc)
dep->flags &= ~DWC3_EP_STALL;
ret = dwc3_send_clear_stall_ep_cmd(dep);
- WARN_ON_ONCE(ret);
+ if (ret)
+ dev_err_ratelimited(dwc->dev,
+ "failed to clear STALL on %s\n", dep->name);
}
}
diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
index 08a251df20c4..5246fa6af3d6 100644
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -1891,7 +1891,7 @@ static struct dentry *ffs_sb_create_file(struct super_block *sb,
/* Super block */
static const struct super_operations ffs_sb_operations = {
.statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
};
struct ffs_sb_fill_data {
diff --git a/drivers/usb/gadget/function/f_midi2.c b/drivers/usb/gadget/function/f_midi2.c
index 0a800ba53816..de16b02d857e 100644
--- a/drivers/usb/gadget/function/f_midi2.c
+++ b/drivers/usb/gadget/function/f_midi2.c
@@ -1599,6 +1599,7 @@ static int f_midi2_create_card(struct f_midi2 *midi2)
strscpy(fb->info.name, ump_fb_name(b),
sizeof(fb->info.name));
}
+ snd_ump_update_group_attrs(ump);
}
for (i = 0; i < midi2->num_eps; i++) {
@@ -1736,9 +1737,12 @@ static int f_midi2_create_usb_configs(struct f_midi2 *midi2,
case USB_SPEED_HIGH:
midi2_midi1_ep_out_desc.wMaxPacketSize = cpu_to_le16(512);
midi2_midi1_ep_in_desc.wMaxPacketSize = cpu_to_le16(512);
- for (i = 0; i < midi2->num_eps; i++)
+ for (i = 0; i < midi2->num_eps; i++) {
midi2_midi2_ep_out_desc[i].wMaxPacketSize =
cpu_to_le16(512);
+ midi2_midi2_ep_in_desc[i].wMaxPacketSize =
+ cpu_to_le16(512);
+ }
fallthrough;
case USB_SPEED_FULL:
midi1_in_eps = midi2_midi1_ep_in_descs;
@@ -1747,9 +1751,12 @@ static int f_midi2_create_usb_configs(struct f_midi2 *midi2,
case USB_SPEED_SUPER:
midi2_midi1_ep_out_desc.wMaxPacketSize = cpu_to_le16(1024);
midi2_midi1_ep_in_desc.wMaxPacketSize = cpu_to_le16(1024);
- for (i = 0; i < midi2->num_eps; i++)
+ for (i = 0; i < midi2->num_eps; i++) {
midi2_midi2_ep_out_desc[i].wMaxPacketSize =
cpu_to_le16(1024);
+ midi2_midi2_ep_in_desc[i].wMaxPacketSize =
+ cpu_to_le16(1024);
+ }
midi1_in_eps = midi2_midi1_ep_in_ss_descs;
midi1_out_eps = midi2_midi1_ep_out_ss_descs;
break;
diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
index b51e132b0cd2..13c3da49348c 100644
--- a/drivers/usb/gadget/legacy/inode.c
+++ b/drivers/usb/gadget/legacy/inode.c
@@ -2011,7 +2011,7 @@ gadgetfs_create_file (struct super_block *sb, char const *name,
static const struct super_operations gadget_fs_operations = {
.statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
};
static int
diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c
index 21dbfb0b3bac..1cefca660773 100644
--- a/drivers/usb/gadget/udc/dummy_hcd.c
+++ b/drivers/usb/gadget/udc/dummy_hcd.c
@@ -765,8 +765,7 @@ static int dummy_dequeue(struct usb_ep *_ep, struct usb_request *_req)
if (!dum->driver)
return -ESHUTDOWN;
- local_irq_save(flags);
- spin_lock(&dum->lock);
+ spin_lock_irqsave(&dum->lock, flags);
list_for_each_entry(iter, &ep->queue, queue) {
if (&iter->req != _req)
continue;
@@ -776,15 +775,16 @@ static int dummy_dequeue(struct usb_ep *_ep, struct usb_request *_req)
retval = 0;
break;
}
- spin_unlock(&dum->lock);
if (retval == 0) {
dev_dbg(udc_dev(dum),
"dequeued req %p from %s, len %d buf %p\n",
req, _ep->name, _req->length, _req->buf);
+ spin_unlock(&dum->lock);
usb_gadget_giveback_request(_ep, _req);
+ spin_lock(&dum->lock);
}
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&dum->lock, flags);
return retval;
}
diff --git a/drivers/usb/gadget/udc/tegra-xudc.c b/drivers/usb/gadget/udc/tegra-xudc.c
index 2957316fd3d0..1d3085cc9d22 100644
--- a/drivers/usb/gadget/udc/tegra-xudc.c
+++ b/drivers/usb/gadget/udc/tegra-xudc.c
@@ -502,6 +502,7 @@ struct tegra_xudc {
struct clk_bulk_data *clks;
bool device_mode;
+ bool current_device_mode;
struct work_struct usb_role_sw_work;
struct phy **usb3_phy;
@@ -715,6 +716,8 @@ static void tegra_xudc_device_mode_on(struct tegra_xudc *xudc)
phy_set_mode_ext(xudc->curr_utmi_phy, PHY_MODE_USB_OTG,
USB_ROLE_DEVICE);
+
+ xudc->current_device_mode = true;
}
static void tegra_xudc_device_mode_off(struct tegra_xudc *xudc)
@@ -725,6 +728,8 @@ static void tegra_xudc_device_mode_off(struct tegra_xudc *xudc)
dev_dbg(xudc->dev, "device mode off\n");
+ xudc->current_device_mode = false;
+
connected = !!(xudc_readl(xudc, PORTSC) & PORTSC_CCS);
reinit_completion(&xudc->disconnect_complete);
@@ -4044,10 +4049,10 @@ static int __maybe_unused tegra_xudc_resume(struct device *dev)
spin_lock_irqsave(&xudc->lock, flags);
xudc->suspended = false;
+ if (xudc->device_mode != xudc->current_device_mode)
+ schedule_work(&xudc->usb_role_sw_work);
spin_unlock_irqrestore(&xudc->lock, flags);
- schedule_work(&xudc->usb_role_sw_work);
-
pm_runtime_enable(dev);
return 0;
diff --git a/drivers/usb/host/xhci-dbgcap.c b/drivers/usb/host/xhci-dbgcap.c
index 06a2edb9e86e..63edf2d8f245 100644
--- a/drivers/usb/host/xhci-dbgcap.c
+++ b/drivers/usb/host/xhci-dbgcap.c
@@ -101,13 +101,34 @@ static u32 xhci_dbc_populate_strings(struct dbc_str_descs *strings)
return string_length;
}
+static void xhci_dbc_init_ep_contexts(struct xhci_dbc *dbc)
+{
+ struct xhci_ep_ctx *ep_ctx;
+ unsigned int max_burst;
+ dma_addr_t deq;
+
+ max_burst = DBC_CTRL_MAXBURST(readl(&dbc->regs->control));
+
+ /* Populate bulk out endpoint context: */
+ ep_ctx = dbc_bulkout_ctx(dbc);
+ deq = dbc_bulkout_enq(dbc);
+ ep_ctx->ep_info = 0;
+ ep_ctx->ep_info2 = dbc_epctx_info2(BULK_OUT_EP, 1024, max_burst);
+ ep_ctx->deq = cpu_to_le64(deq | dbc->ring_out->cycle_state);
+
+ /* Populate bulk in endpoint context: */
+ ep_ctx = dbc_bulkin_ctx(dbc);
+ deq = dbc_bulkin_enq(dbc);
+ ep_ctx->ep_info = 0;
+ ep_ctx->ep_info2 = dbc_epctx_info2(BULK_IN_EP, 1024, max_burst);
+ ep_ctx->deq = cpu_to_le64(deq | dbc->ring_in->cycle_state);
+}
+
static void xhci_dbc_init_contexts(struct xhci_dbc *dbc, u32 string_length)
{
struct dbc_info_context *info;
- struct xhci_ep_ctx *ep_ctx;
u32 dev_info;
- dma_addr_t deq, dma;
- unsigned int max_burst;
+ dma_addr_t dma;
if (!dbc)
return;
@@ -121,20 +142,8 @@ static void xhci_dbc_init_contexts(struct xhci_dbc *dbc, u32 string_length)
info->serial = cpu_to_le64(dma + DBC_MAX_STRING_LENGTH * 3);
info->length = cpu_to_le32(string_length);
- /* Populate bulk out endpoint context: */
- ep_ctx = dbc_bulkout_ctx(dbc);
- max_burst = DBC_CTRL_MAXBURST(readl(&dbc->regs->control));
- deq = dbc_bulkout_enq(dbc);
- ep_ctx->ep_info = 0;
- ep_ctx->ep_info2 = dbc_epctx_info2(BULK_OUT_EP, 1024, max_burst);
- ep_ctx->deq = cpu_to_le64(deq | dbc->ring_out->cycle_state);
-
- /* Populate bulk in endpoint context: */
- ep_ctx = dbc_bulkin_ctx(dbc);
- deq = dbc_bulkin_enq(dbc);
- ep_ctx->ep_info = 0;
- ep_ctx->ep_info2 = dbc_epctx_info2(BULK_IN_EP, 1024, max_burst);
- ep_ctx->deq = cpu_to_le64(deq | dbc->ring_in->cycle_state);
+ /* Populate bulk in and out endpoint contexts: */
+ xhci_dbc_init_ep_contexts(dbc);
/* Set DbC context and info registers: */
lo_hi_writeq(dbc->ctx->dma, &dbc->regs->dccp);
@@ -436,6 +445,42 @@ dbc_alloc_ctx(struct device *dev, gfp_t flags)
return ctx;
}
+static void xhci_dbc_ring_init(struct xhci_ring *ring)
+{
+ struct xhci_segment *seg = ring->first_seg;
+
+ /* clear all trbs on ring in case of old ring */
+ memset(seg->trbs, 0, TRB_SEGMENT_SIZE);
+
+ /* Only event ring does not use link TRB */
+ if (ring->type != TYPE_EVENT) {
+ union xhci_trb *trb = &seg->trbs[TRBS_PER_SEGMENT - 1];
+
+ trb->link.segment_ptr = cpu_to_le64(ring->first_seg->dma);
+ trb->link.control = cpu_to_le32(LINK_TOGGLE | TRB_TYPE(TRB_LINK));
+ }
+ xhci_initialize_ring_info(ring);
+}
+
+static int xhci_dbc_reinit_ep_rings(struct xhci_dbc *dbc)
+{
+ struct xhci_ring *in_ring = dbc->eps[BULK_IN].ring;
+ struct xhci_ring *out_ring = dbc->eps[BULK_OUT].ring;
+
+ if (!in_ring || !out_ring || !dbc->ctx) {
+ dev_warn(dbc->dev, "Can't re-init unallocated endpoints\n");
+ return -ENODEV;
+ }
+
+ xhci_dbc_ring_init(in_ring);
+ xhci_dbc_ring_init(out_ring);
+
+ /* set ep context enqueue, dequeue, and cycle to initial values */
+ xhci_dbc_init_ep_contexts(dbc);
+
+ return 0;
+}
+
static struct xhci_ring *
xhci_dbc_ring_alloc(struct device *dev, enum xhci_ring_type type, gfp_t flags)
{
@@ -464,15 +509,10 @@ xhci_dbc_ring_alloc(struct device *dev, enum xhci_ring_type type, gfp_t flags)
seg->dma = dma;
- /* Only event ring does not use link TRB */
- if (type != TYPE_EVENT) {
- union xhci_trb *trb = &seg->trbs[TRBS_PER_SEGMENT - 1];
-
- trb->link.segment_ptr = cpu_to_le64(dma);
- trb->link.control = cpu_to_le32(LINK_TOGGLE | TRB_TYPE(TRB_LINK));
- }
INIT_LIST_HEAD(&ring->td_list);
- xhci_initialize_ring_info(ring);
+
+ xhci_dbc_ring_init(ring);
+
return ring;
dma_fail:
kfree(seg);
@@ -864,7 +904,7 @@ static enum evtreturn xhci_dbc_do_handle_events(struct xhci_dbc *dbc)
dev_info(dbc->dev, "DbC cable unplugged\n");
dbc->state = DS_ENABLED;
xhci_dbc_flush_requests(dbc);
-
+ xhci_dbc_reinit_ep_rings(dbc);
return EVT_DISC;
}
@@ -874,7 +914,7 @@ static enum evtreturn xhci_dbc_do_handle_events(struct xhci_dbc *dbc)
writel(portsc, &dbc->regs->portsc);
dbc->state = DS_ENABLED;
xhci_dbc_flush_requests(dbc);
-
+ xhci_dbc_reinit_ep_rings(dbc);
return EVT_DISC;
}
diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c
index 92bb84f8132a..b3a59ce1b3f4 100644
--- a/drivers/usb/host/xhci-hub.c
+++ b/drivers/usb/host/xhci-hub.c
@@ -704,8 +704,7 @@ static int xhci_enter_test_mode(struct xhci_hcd *xhci,
if (!xhci->devs[i])
continue;
- retval = xhci_disable_slot(xhci, i);
- xhci_free_virt_device(xhci, i);
+ retval = xhci_disable_and_free_slot(xhci, i);
if (retval)
xhci_err(xhci, "Failed to disable slot %d, %d. Enter test mode anyway\n",
i, retval);
diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c
index 07289333a1e8..c4a6544aa107 100644
--- a/drivers/usb/host/xhci-mem.c
+++ b/drivers/usb/host/xhci-mem.c
@@ -865,21 +865,20 @@ free_tts:
* will be manipulated by the configure endpoint, allocate device, or update
* hub functions while this function is removing the TT entries from the list.
*/
-void xhci_free_virt_device(struct xhci_hcd *xhci, int slot_id)
+void xhci_free_virt_device(struct xhci_hcd *xhci, struct xhci_virt_device *dev,
+ int slot_id)
{
- struct xhci_virt_device *dev;
int i;
int old_active_eps = 0;
/* Slot ID 0 is reserved */
- if (slot_id == 0 || !xhci->devs[slot_id])
+ if (slot_id == 0 || !dev)
return;
- dev = xhci->devs[slot_id];
-
- xhci->dcbaa->dev_context_ptrs[slot_id] = 0;
- if (!dev)
- return;
+ /* If device ctx array still points to _this_ device, clear it */
+ if (dev->out_ctx &&
+ xhci->dcbaa->dev_context_ptrs[slot_id] == cpu_to_le64(dev->out_ctx->dma))
+ xhci->dcbaa->dev_context_ptrs[slot_id] = 0;
trace_xhci_free_virt_device(dev);
@@ -920,8 +919,9 @@ void xhci_free_virt_device(struct xhci_hcd *xhci, int slot_id)
dev->udev->slot_id = 0;
if (dev->rhub_port && dev->rhub_port->slot_id == slot_id)
dev->rhub_port->slot_id = 0;
- kfree(xhci->devs[slot_id]);
- xhci->devs[slot_id] = NULL;
+ if (xhci->devs[slot_id] == dev)
+ xhci->devs[slot_id] = NULL;
+ kfree(dev);
}
/*
@@ -962,7 +962,7 @@ static void xhci_free_virt_devices_depth_first(struct xhci_hcd *xhci, int slot_i
out:
/* we are now at a leaf device */
xhci_debugfs_remove_slot(xhci, slot_id);
- xhci_free_virt_device(xhci, slot_id);
+ xhci_free_virt_device(xhci, xhci->devs[slot_id], slot_id);
}
int xhci_alloc_virt_device(struct xhci_hcd *xhci, int slot_id,
diff --git a/drivers/usb/host/xhci-pci-renesas.c b/drivers/usb/host/xhci-pci-renesas.c
index 620f8f0febb8..86df80399c9f 100644
--- a/drivers/usb/host/xhci-pci-renesas.c
+++ b/drivers/usb/host/xhci-pci-renesas.c
@@ -47,8 +47,9 @@
#define RENESAS_ROM_ERASE_MAGIC 0x5A65726F
#define RENESAS_ROM_WRITE_MAGIC 0x53524F4D
-#define RENESAS_RETRY 10000
-#define RENESAS_DELAY 10
+#define RENESAS_RETRY 50000 /* 50000 * RENESAS_DELAY ~= 500ms */
+#define RENESAS_CHIP_ERASE_RETRY 500000 /* 500000 * RENESAS_DELAY ~= 5s */
+#define RENESAS_DELAY 10
#define RENESAS_FW_NAME "renesas_usb_fw.mem"
@@ -407,7 +408,7 @@ static void renesas_rom_erase(struct pci_dev *pdev)
/* sleep a bit while ROM is erased */
msleep(20);
- for (i = 0; i < RENESAS_RETRY; i++) {
+ for (i = 0; i < RENESAS_CHIP_ERASE_RETRY; i++) {
retval = pci_read_config_byte(pdev, RENESAS_ROM_STATUS,
&status);
status &= RENESAS_ROM_STATUS_ERASE;
diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index ecd757d482c5..4f8f5aab109d 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -1592,7 +1592,8 @@ static void xhci_handle_cmd_enable_slot(int slot_id, struct xhci_command *comman
command->slot_id = 0;
}
-static void xhci_handle_cmd_disable_slot(struct xhci_hcd *xhci, int slot_id)
+static void xhci_handle_cmd_disable_slot(struct xhci_hcd *xhci, int slot_id,
+ u32 cmd_comp_code)
{
struct xhci_virt_device *virt_dev;
struct xhci_slot_ctx *slot_ctx;
@@ -1607,6 +1608,10 @@ static void xhci_handle_cmd_disable_slot(struct xhci_hcd *xhci, int slot_id)
if (xhci->quirks & XHCI_EP_LIMIT_QUIRK)
/* Delete default control endpoint resources */
xhci_free_device_endpoint_resources(xhci, virt_dev, true);
+ if (cmd_comp_code == COMP_SUCCESS) {
+ xhci->dcbaa->dev_context_ptrs[slot_id] = 0;
+ xhci->devs[slot_id] = NULL;
+ }
}
static void xhci_handle_cmd_config_ep(struct xhci_hcd *xhci, int slot_id)
@@ -1856,7 +1861,7 @@ static void handle_cmd_completion(struct xhci_hcd *xhci,
xhci_handle_cmd_enable_slot(slot_id, cmd, cmd_comp_code);
break;
case TRB_DISABLE_SLOT:
- xhci_handle_cmd_disable_slot(xhci, slot_id);
+ xhci_handle_cmd_disable_slot(xhci, slot_id, cmd_comp_code);
break;
case TRB_CONFIG_EP:
if (!cmd->completion)
diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index 47151ca527bf..742c23826e17 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -309,6 +309,7 @@ int xhci_enable_interrupter(struct xhci_interrupter *ir)
return -EINVAL;
iman = readl(&ir->ir_set->iman);
+ iman &= ~IMAN_IP;
iman |= IMAN_IE;
writel(iman, &ir->ir_set->iman);
@@ -325,6 +326,7 @@ int xhci_disable_interrupter(struct xhci_hcd *xhci, struct xhci_interrupter *ir)
return -EINVAL;
iman = readl(&ir->ir_set->iman);
+ iman &= ~IMAN_IP;
iman &= ~IMAN_IE;
writel(iman, &ir->ir_set->iman);
@@ -3932,8 +3934,7 @@ static int xhci_discover_or_reset_device(struct usb_hcd *hcd,
* Obtaining a new device slot to inform the xHCI host that
* the USB device has been reset.
*/
- ret = xhci_disable_slot(xhci, udev->slot_id);
- xhci_free_virt_device(xhci, udev->slot_id);
+ ret = xhci_disable_and_free_slot(xhci, udev->slot_id);
if (!ret) {
ret = xhci_alloc_dev(hcd, udev);
if (ret == 1)
@@ -4090,7 +4091,7 @@ static void xhci_free_dev(struct usb_hcd *hcd, struct usb_device *udev)
xhci_disable_slot(xhci, udev->slot_id);
spin_lock_irqsave(&xhci->lock, flags);
- xhci_free_virt_device(xhci, udev->slot_id);
+ xhci_free_virt_device(xhci, virt_dev, udev->slot_id);
spin_unlock_irqrestore(&xhci->lock, flags);
}
@@ -4139,6 +4140,16 @@ int xhci_disable_slot(struct xhci_hcd *xhci, u32 slot_id)
return 0;
}
+int xhci_disable_and_free_slot(struct xhci_hcd *xhci, u32 slot_id)
+{
+ struct xhci_virt_device *vdev = xhci->devs[slot_id];
+ int ret;
+
+ ret = xhci_disable_slot(xhci, slot_id);
+ xhci_free_virt_device(xhci, vdev, slot_id);
+ return ret;
+}
+
/*
* Checks if we have enough host controller resources for the default control
* endpoint.
@@ -4245,8 +4256,7 @@ int xhci_alloc_dev(struct usb_hcd *hcd, struct usb_device *udev)
return 1;
disable_slot:
- xhci_disable_slot(xhci, udev->slot_id);
- xhci_free_virt_device(xhci, udev->slot_id);
+ xhci_disable_and_free_slot(xhci, udev->slot_id);
return 0;
}
@@ -4382,8 +4392,7 @@ static int xhci_setup_device(struct usb_hcd *hcd, struct usb_device *udev,
dev_warn(&udev->dev, "Device not responding to setup %s.\n", act);
mutex_unlock(&xhci->mutex);
- ret = xhci_disable_slot(xhci, udev->slot_id);
- xhci_free_virt_device(xhci, udev->slot_id);
+ ret = xhci_disable_and_free_slot(xhci, udev->slot_id);
if (!ret) {
if (xhci_alloc_dev(hcd, udev) == 1)
xhci_setup_addressable_virt_dev(xhci, udev);
diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h
index a20f4e7cd43a..85d5b964bf1e 100644
--- a/drivers/usb/host/xhci.h
+++ b/drivers/usb/host/xhci.h
@@ -1791,7 +1791,7 @@ void xhci_dbg_trace(struct xhci_hcd *xhci, void (*trace)(struct va_format *),
/* xHCI memory management */
void xhci_mem_cleanup(struct xhci_hcd *xhci);
int xhci_mem_init(struct xhci_hcd *xhci, gfp_t flags);
-void xhci_free_virt_device(struct xhci_hcd *xhci, int slot_id);
+void xhci_free_virt_device(struct xhci_hcd *xhci, struct xhci_virt_device *dev, int slot_id);
int xhci_alloc_virt_device(struct xhci_hcd *xhci, int slot_id, struct usb_device *udev, gfp_t flags);
int xhci_setup_addressable_virt_dev(struct xhci_hcd *xhci, struct usb_device *udev);
void xhci_copy_ep0_dequeue_into_input_ctx(struct xhci_hcd *xhci,
@@ -1888,6 +1888,7 @@ void xhci_reset_bandwidth(struct usb_hcd *hcd, struct usb_device *udev);
int xhci_update_hub_device(struct usb_hcd *hcd, struct usb_device *hdev,
struct usb_tt *tt, gfp_t mem_flags);
int xhci_disable_slot(struct xhci_hcd *xhci, u32 slot_id);
+int xhci_disable_and_free_slot(struct xhci_hcd *xhci, u32 slot_id);
int xhci_ext_cap_init(struct xhci_hcd *xhci);
int xhci_suspend(struct xhci_hcd *xhci, bool do_wakeup);
diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c
index e5cd33093423..fc869b7f803f 100644
--- a/drivers/usb/serial/option.c
+++ b/drivers/usb/serial/option.c
@@ -1322,7 +1322,18 @@ static const struct usb_device_id option_ids[] = {
.driver_info = NCTRL(0) | RSVD(3) },
{ USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1033, 0xff), /* Telit LE910C1-EUX (ECM) */
.driver_info = NCTRL(0) },
+ { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1034, 0xff), /* Telit LE910C4-WWX (rmnet) */
+ .driver_info = RSVD(2) },
{ USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1035, 0xff) }, /* Telit LE910C4-WWX (ECM) */
+ { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1036, 0xff) }, /* Telit LE910C4-WWX */
+ { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1037, 0xff), /* Telit LE910C4-WWX (rmnet) */
+ .driver_info = NCTRL(0) | NCTRL(1) | RSVD(4) },
+ { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1038, 0xff), /* Telit LE910C4-WWX (rmnet) */
+ .driver_info = NCTRL(0) | RSVD(3) },
+ { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x103b, 0xff), /* Telit LE910C4-WWX */
+ .driver_info = NCTRL(0) | NCTRL(1) },
+ { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x103c, 0xff), /* Telit LE910C4-WWX */
+ .driver_info = NCTRL(0) },
{ USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE922_USBCFG0),
.driver_info = RSVD(0) | RSVD(1) | NCTRL(2) | RSVD(3) },
{ USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE922_USBCFG1),
@@ -1369,6 +1380,12 @@ static const struct usb_device_id option_ids[] = {
.driver_info = NCTRL(0) | RSVD(1) },
{ USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1075, 0xff), /* Telit FN990A (PCIe) */
.driver_info = RSVD(0) },
+ { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1077, 0xff), /* Telit FN990A (rmnet + audio) */
+ .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) },
+ { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1078, 0xff), /* Telit FN990A (MBIM + audio) */
+ .driver_info = NCTRL(0) | RSVD(1) },
+ { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1079, 0xff), /* Telit FN990A (RNDIS + audio) */
+ .driver_info = NCTRL(2) | RSVD(3) },
{ USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1080, 0xff), /* Telit FE990A (rmnet) */
.driver_info = NCTRL(0) | RSVD(1) | RSVD(2) },
{ USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1081, 0xff), /* Telit FE990A (MBIM) */
diff --git a/drivers/usb/storage/realtek_cr.c b/drivers/usb/storage/realtek_cr.c
index 7dea28c2b8ee..cb5bbb19060e 100644
--- a/drivers/usb/storage/realtek_cr.c
+++ b/drivers/usb/storage/realtek_cr.c
@@ -252,7 +252,7 @@ static int rts51x_bulk_transport(struct us_data *us, u8 lun,
return USB_STOR_TRANSPORT_ERROR;
}
- residue = bcs->Residue;
+ residue = le32_to_cpu(bcs->Residue);
if (bcs->Tag != us->tag)
return USB_STOR_TRANSPORT_ERROR;
diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h
index 54f0b1c83317..dfa5276a5a43 100644
--- a/drivers/usb/storage/unusual_devs.h
+++ b/drivers/usb/storage/unusual_devs.h
@@ -934,6 +934,13 @@ UNUSUAL_DEV( 0x05e3, 0x0723, 0x9451, 0x9451,
USB_SC_DEVICE, USB_PR_DEVICE, NULL,
US_FL_SANE_SENSE ),
+/* Added by Maël GUERIN <mael.guerin@murena.io> */
+UNUSUAL_DEV( 0x0603, 0x8611, 0x0000, 0xffff,
+ "Novatek",
+ "NTK96550-based camera",
+ USB_SC_SCSI, USB_PR_BULK, NULL,
+ US_FL_BULK_IGNORE_TAG ),
+
/*
* Reported by Hanno Boeck <hanno@gmx.de>
* Taken from the Lycoris Kernel
@@ -1494,6 +1501,28 @@ UNUSUAL_DEV( 0x0bc2, 0x3332, 0x0000, 0x9999,
USB_SC_DEVICE, USB_PR_DEVICE, NULL,
US_FL_NO_WP_DETECT ),
+/*
+ * Reported by Zenm Chen <zenmchen@gmail.com>
+ * Ignore driver CD mode, otherwise usb_modeswitch may fail to switch
+ * the device into Wi-Fi mode.
+ */
+UNUSUAL_DEV( 0x0bda, 0x1a2b, 0x0000, 0xffff,
+ "Realtek",
+ "DISK",
+ USB_SC_DEVICE, USB_PR_DEVICE, NULL,
+ US_FL_IGNORE_DEVICE ),
+
+/*
+ * Reported by Zenm Chen <zenmchen@gmail.com>
+ * Ignore driver CD mode, otherwise usb_modeswitch may fail to switch
+ * the device into Wi-Fi mode.
+ */
+UNUSUAL_DEV( 0x0bda, 0xa192, 0x0000, 0xffff,
+ "Realtek",
+ "DISK",
+ USB_SC_DEVICE, USB_PR_DEVICE, NULL,
+ US_FL_IGNORE_DEVICE ),
+
UNUSUAL_DEV( 0x0d49, 0x7310, 0x0000, 0x9999,
"Maxtor",
"USB to SATA",
diff --git a/drivers/usb/typec/tcpm/fusb302.c b/drivers/usb/typec/tcpm/fusb302.c
index a4ff2403ddd6..870a71f953f6 100644
--- a/drivers/usb/typec/tcpm/fusb302.c
+++ b/drivers/usb/typec/tcpm/fusb302.c
@@ -1485,6 +1485,9 @@ static irqreturn_t fusb302_irq_intn(int irq, void *dev_id)
struct fusb302_chip *chip = dev_id;
unsigned long flags;
+ /* Disable our level triggered IRQ until our irq_work has cleared it */
+ disable_irq_nosync(chip->gpio_int_n_irq);
+
spin_lock_irqsave(&chip->irq_lock, flags);
if (chip->irq_suspended)
chip->irq_while_suspended = true;
@@ -1627,6 +1630,7 @@ static void fusb302_irq_work(struct work_struct *work)
}
done:
mutex_unlock(&chip->lock);
+ enable_irq(chip->gpio_int_n_irq);
}
static int init_gpio(struct fusb302_chip *chip)
@@ -1751,10 +1755,9 @@ static int fusb302_probe(struct i2c_client *client)
goto destroy_workqueue;
}
- ret = devm_request_threaded_irq(dev, chip->gpio_int_n_irq,
- NULL, fusb302_irq_intn,
- IRQF_ONESHOT | IRQF_TRIGGER_LOW,
- "fsc_interrupt_int_n", chip);
+ ret = request_irq(chip->gpio_int_n_irq, fusb302_irq_intn,
+ IRQF_ONESHOT | IRQF_TRIGGER_LOW,
+ "fsc_interrupt_int_n", chip);
if (ret < 0) {
dev_err(dev, "cannot request IRQ for GPIO Int_N, ret=%d", ret);
goto tcpm_unregister_port;
@@ -1779,6 +1782,7 @@ static void fusb302_remove(struct i2c_client *client)
struct fusb302_chip *chip = i2c_get_clientdata(client);
disable_irq_wake(chip->gpio_int_n_irq);
+ free_irq(chip->gpio_int_n_irq, chip);
cancel_work_sync(&chip->irq_work);
cancel_delayed_work_sync(&chip->bc_lvl_handler);
tcpm_unregister_port(chip->tcpm_port);
diff --git a/drivers/usb/typec/tcpm/maxim_contaminant.c b/drivers/usb/typec/tcpm/maxim_contaminant.c
index 0cdda06592fd..af8da6dc60ae 100644
--- a/drivers/usb/typec/tcpm/maxim_contaminant.c
+++ b/drivers/usb/typec/tcpm/maxim_contaminant.c
@@ -188,6 +188,11 @@ static int max_contaminant_read_comparators(struct max_tcpci_chip *chip, u8 *ven
if (ret < 0)
return ret;
+ /* Disable low power mode */
+ ret = regmap_update_bits(regmap, TCPC_VENDOR_CC_CTRL2, CCLPMODESEL,
+ FIELD_PREP(CCLPMODESEL,
+ LOW_POWER_MODE_DISABLE));
+
/* Sleep to allow comparators settle */
usleep_range(5000, 6000);
ret = regmap_update_bits(regmap, TCPC_TCPC_CTRL, TCPC_TCPC_CTRL_ORIENTATION, PLUG_ORNT_CC1);
@@ -324,6 +329,39 @@ static int max_contaminant_enable_dry_detection(struct max_tcpci_chip *chip)
return 0;
}
+static int max_contaminant_enable_toggling(struct max_tcpci_chip *chip)
+{
+ struct regmap *regmap = chip->data.regmap;
+ int ret;
+
+ /* Disable dry detection if enabled. */
+ ret = regmap_update_bits(regmap, TCPC_VENDOR_CC_CTRL2, CCLPMODESEL,
+ FIELD_PREP(CCLPMODESEL,
+ LOW_POWER_MODE_DISABLE));
+ if (ret)
+ return ret;
+
+ ret = regmap_update_bits(regmap, TCPC_VENDOR_CC_CTRL1, CCCONNDRY, 0);
+ if (ret)
+ return ret;
+
+ ret = max_tcpci_write8(chip, TCPC_ROLE_CTRL, TCPC_ROLE_CTRL_DRP |
+ FIELD_PREP(TCPC_ROLE_CTRL_CC1,
+ TCPC_ROLE_CTRL_CC_RD) |
+ FIELD_PREP(TCPC_ROLE_CTRL_CC2,
+ TCPC_ROLE_CTRL_CC_RD));
+ if (ret)
+ return ret;
+
+ ret = regmap_update_bits(regmap, TCPC_TCPC_CTRL,
+ TCPC_TCPC_CTRL_EN_LK4CONN_ALRT,
+ TCPC_TCPC_CTRL_EN_LK4CONN_ALRT);
+ if (ret)
+ return ret;
+
+ return max_tcpci_write8(chip, TCPC_COMMAND, TCPC_CMD_LOOK4CONNECTION);
+}
+
bool max_contaminant_is_contaminant(struct max_tcpci_chip *chip, bool disconnect_while_debounce,
bool *cc_handled)
{
@@ -340,6 +378,12 @@ bool max_contaminant_is_contaminant(struct max_tcpci_chip *chip, bool disconnect
if (ret < 0)
return false;
+ if (cc_status & TCPC_CC_STATUS_TOGGLING) {
+ if (chip->contaminant_state == DETECTED)
+ return true;
+ return false;
+ }
+
if (chip->contaminant_state == NOT_DETECTED || chip->contaminant_state == SINK) {
if (!disconnect_while_debounce)
msleep(100);
@@ -372,6 +416,12 @@ bool max_contaminant_is_contaminant(struct max_tcpci_chip *chip, bool disconnect
max_contaminant_enable_dry_detection(chip);
return true;
}
+
+ ret = max_contaminant_enable_toggling(chip);
+ if (ret)
+ dev_err(chip->dev,
+ "Failed to enable toggling, ret=%d",
+ ret);
}
} else if (chip->contaminant_state == DETECTED) {
if (!(cc_status & TCPC_CC_STATUS_TOGGLING)) {
@@ -379,6 +429,14 @@ bool max_contaminant_is_contaminant(struct max_tcpci_chip *chip, bool disconnect
if (chip->contaminant_state == DETECTED) {
max_contaminant_enable_dry_detection(chip);
return true;
+ } else {
+ ret = max_contaminant_enable_toggling(chip);
+ if (ret) {
+ dev_err(chip->dev,
+ "Failed to enable toggling, ret=%d",
+ ret);
+ return true;
+ }
}
}
}
diff --git a/drivers/usb/typec/tcpm/tcpci_maxim.h b/drivers/usb/typec/tcpm/tcpci_maxim.h
index 76270d5c2838..b33540a42a95 100644
--- a/drivers/usb/typec/tcpm/tcpci_maxim.h
+++ b/drivers/usb/typec/tcpm/tcpci_maxim.h
@@ -21,6 +21,7 @@
#define CCOVPDIS BIT(6)
#define SBURPCTRL BIT(5)
#define CCLPMODESEL GENMASK(4, 3)
+#define LOW_POWER_MODE_DISABLE 0
#define ULTRA_LOW_POWER_MODE 1
#define CCRPCTRL GENMASK(2, 0)
#define UA_1_SRC 1
diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c
index 1f6fdfaa34bf..b2a568a5bc9b 100644
--- a/drivers/usb/typec/tcpm/tcpm.c
+++ b/drivers/usb/typec/tcpm/tcpm.c
@@ -2426,17 +2426,21 @@ static void tcpm_handle_vdm_request(struct tcpm_port *port,
case ADEV_NONE:
break;
case ADEV_NOTIFY_USB_AND_QUEUE_VDM:
- WARN_ON(typec_altmode_notify(adev, TYPEC_STATE_USB, NULL));
- typec_altmode_vdm(adev, p[0], &p[1], cnt);
+ if (rx_sop_type == TCPC_TX_SOP_PRIME) {
+ typec_cable_altmode_vdm(adev, TYPEC_PLUG_SOP_P, p[0], &p[1], cnt);
+ } else {
+ WARN_ON(typec_altmode_notify(adev, TYPEC_STATE_USB, NULL));
+ typec_altmode_vdm(adev, p[0], &p[1], cnt);
+ }
break;
case ADEV_QUEUE_VDM:
- if (response_tx_sop_type == TCPC_TX_SOP_PRIME)
+ if (rx_sop_type == TCPC_TX_SOP_PRIME)
typec_cable_altmode_vdm(adev, TYPEC_PLUG_SOP_P, p[0], &p[1], cnt);
else
typec_altmode_vdm(adev, p[0], &p[1], cnt);
break;
case ADEV_QUEUE_VDM_SEND_EXIT_MODE_ON_FAIL:
- if (response_tx_sop_type == TCPC_TX_SOP_PRIME) {
+ if (rx_sop_type == TCPC_TX_SOP_PRIME) {
if (typec_cable_altmode_vdm(adev, TYPEC_PLUG_SOP_P,
p[0], &p[1], cnt)) {
int svdm_version = typec_get_cable_svdm_version(
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 6edac0c1ba9b..35ded4330431 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -99,6 +99,7 @@ struct vhost_net_ubuf_ref {
atomic_t refcount;
wait_queue_head_t wait;
struct vhost_virtqueue *vq;
+ struct rcu_head rcu;
};
#define VHOST_NET_BATCH 64
@@ -250,9 +251,13 @@ vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
static int vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs)
{
- int r = atomic_sub_return(1, &ubufs->refcount);
+ int r;
+
+ rcu_read_lock();
+ r = atomic_sub_return(1, &ubufs->refcount);
if (unlikely(!r))
wake_up(&ubufs->wait);
+ rcu_read_unlock();
return r;
}
@@ -265,7 +270,7 @@ static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs)
static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs)
{
vhost_net_ubuf_put_and_wait(ubufs);
- kfree(ubufs);
+ kfree_rcu(ubufs, rcu);
}
static void vhost_net_clear_ubuf_info(struct vhost_net *n)
@@ -760,11 +765,11 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
int err;
int sent_pkts = 0;
bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);
- bool busyloop_intr;
bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
do {
- busyloop_intr = false;
+ bool busyloop_intr = false;
+
if (nvq->done_idx == VHOST_NET_BATCH)
vhost_tx_batch(net, nvq, sock, &msg);
@@ -775,10 +780,18 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
break;
/* Nothing new? Wait for eventfd to tell us they refilled. */
if (head == vq->num) {
- /* Kicks are disabled at this point, break loop and
- * process any remaining batched packets. Queue will
- * be re-enabled afterwards.
+ /* Flush batched packets to handle pending RX
+ * work (if busyloop_intr is set) and to avoid
+ * unnecessary virtqueue kicks.
*/
+ vhost_tx_batch(net, nvq, sock, &msg);
+ if (unlikely(busyloop_intr)) {
+ vhost_poll_queue(&vq->poll);
+ } else if (unlikely(vhost_enable_notify(&net->dev,
+ vq))) {
+ vhost_disable_notify(&net->dev, vq);
+ continue;
+ }
break;
}
@@ -834,22 +847,7 @@ done:
++nvq->done_idx;
} while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
- /* Kicks are still disabled, dispatch any remaining batched msgs. */
vhost_tx_batch(net, nvq, sock, &msg);
-
- if (unlikely(busyloop_intr))
- /* If interrupted while doing busy polling, requeue the
- * handler to be fair handle_rx as well as other tasks
- * waiting on cpu.
- */
- vhost_poll_queue(&vq->poll);
- else
- /* All of our work has been completed; however, before
- * leaving the TX handler, do one last check for work,
- * and requeue handler if necessary. If there is no work,
- * queue will be reenabled.
- */
- vhost_net_busy_poll_try_queue(net, vq);
}
static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
@@ -1009,7 +1007,7 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
}
static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
- bool *busyloop_intr, unsigned int count)
+ bool *busyloop_intr, unsigned int *count)
{
struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
@@ -1019,7 +1017,8 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
if (!len && rvq->busyloop_timeout) {
/* Flush batched heads first */
- vhost_net_signal_used(rnvq, count);
+ vhost_net_signal_used(rnvq, *count);
+ *count = 0;
/* Both tx vq and rx socket were polled here */
vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true);
@@ -1175,7 +1174,7 @@ static void handle_rx(struct vhost_net *net)
do {
sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
- &busyloop_intr, count);
+ &busyloop_intr, &count);
if (!sock_len)
break;
sock_len += sock_hlen;
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index abf51332a5c5..98e4f68f4e3c 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -2884,7 +2884,7 @@ vhost_scsi_make_tport(struct target_fabric_configfs *tf,
check_len:
if (strlen(name) >= VHOST_SCSI_NAMELEN) {
pr_err("Emulated %s Address: %s, exceeds"
- " max: %d\n", name, vhost_scsi_dump_proto_id(tport),
+ " max: %d\n", vhost_scsi_dump_proto_id(tport), name,
VHOST_SCSI_NAMELEN);
kfree(tport);
return ERR_PTR(-EINVAL);
diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
index 55f5731e94c3..5940e2eb9231 100644
--- a/drivers/video/fbdev/core/fbcon.c
+++ b/drivers/video/fbdev/core/fbcon.c
@@ -2504,7 +2504,7 @@ static int fbcon_set_font(struct vc_data *vc, const struct console_font *font,
unsigned charcount = font->charcount;
int w = font->width;
int h = font->height;
- int size;
+ int size, alloc_size;
int i, csum;
u8 *new_data, *data = font->data;
int pitch = PITCH(font->width);
@@ -2531,9 +2531,16 @@ static int fbcon_set_font(struct vc_data *vc, const struct console_font *font,
if (fbcon_invalid_charcount(info, charcount))
return -EINVAL;
- size = CALC_FONTSZ(h, pitch, charcount);
+ /* Check for integer overflow in font size calculation */
+ if (check_mul_overflow(h, pitch, &size) ||
+ check_mul_overflow(size, charcount, &size))
+ return -EINVAL;
+
+ /* Check for overflow in allocation size calculation */
+ if (check_add_overflow(FONT_EXTRA_WORDS * sizeof(int), size, &alloc_size))
+ return -EINVAL;
- new_data = kmalloc(FONT_EXTRA_WORDS * sizeof(int) + size, GFP_USER);
+ new_data = kmalloc(alloc_size, GFP_USER);
if (!new_data)
return -ENOMEM;
diff --git a/drivers/virt/coco/efi_secret/Kconfig b/drivers/virt/coco/efi_secret/Kconfig
index 4404d198f3b2..94d88e5da707 100644
--- a/drivers/virt/coco/efi_secret/Kconfig
+++ b/drivers/virt/coco/efi_secret/Kconfig
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config EFI_SECRET
tristate "EFI secret area securityfs support"
- depends on EFI && X86_64
+ depends on EFI && (X86_64 || ARM64)
select EFI_COCO_SECRET
select SECURITYFS
help
diff --git a/drivers/virt/coco/sev-guest/sev-guest.c b/drivers/virt/coco/sev-guest/sev-guest.c
index d2b3ae7113ab..b01ec99106cd 100644
--- a/drivers/virt/coco/sev-guest/sev-guest.c
+++ b/drivers/virt/coco/sev-guest/sev-guest.c
@@ -116,13 +116,11 @@ e_free:
static int get_derived_key(struct snp_guest_dev *snp_dev, struct snp_guest_request_ioctl *arg)
{
+ struct snp_derived_key_resp *derived_key_resp __free(kfree) = NULL;
struct snp_derived_key_req *derived_key_req __free(kfree) = NULL;
- struct snp_derived_key_resp derived_key_resp = {0};
struct snp_msg_desc *mdesc = snp_dev->msg_desc;
struct snp_guest_req req = {};
int rc, resp_len;
- /* Response data is 64 bytes and max authsize for GCM is 16 bytes. */
- u8 buf[64 + 16];
if (!arg->req_data || !arg->resp_data)
return -EINVAL;
@@ -132,8 +130,9 @@ static int get_derived_key(struct snp_guest_dev *snp_dev, struct snp_guest_reque
* response payload. Make sure that it has enough space to cover the
* authtag.
*/
- resp_len = sizeof(derived_key_resp.data) + mdesc->ctx->authsize;
- if (sizeof(buf) < resp_len)
+ resp_len = sizeof(derived_key_resp->data) + mdesc->ctx->authsize;
+ derived_key_resp = kzalloc(resp_len, GFP_KERNEL_ACCOUNT);
+ if (!derived_key_resp)
return -ENOMEM;
derived_key_req = kzalloc(sizeof(*derived_key_req), GFP_KERNEL_ACCOUNT);
@@ -149,23 +148,21 @@ static int get_derived_key(struct snp_guest_dev *snp_dev, struct snp_guest_reque
req.vmpck_id = mdesc->vmpck_id;
req.req_buf = derived_key_req;
req.req_sz = sizeof(*derived_key_req);
- req.resp_buf = buf;
+ req.resp_buf = derived_key_resp;
req.resp_sz = resp_len;
req.exit_code = SVM_VMGEXIT_GUEST_REQUEST;
rc = snp_send_guest_request(mdesc, &req);
arg->exitinfo2 = req.exitinfo2;
- if (rc)
- return rc;
-
- memcpy(derived_key_resp.data, buf, sizeof(derived_key_resp.data));
- if (copy_to_user((void __user *)arg->resp_data, &derived_key_resp,
- sizeof(derived_key_resp)))
- rc = -EFAULT;
+ if (!rc) {
+ if (copy_to_user((void __user *)arg->resp_data, derived_key_resp,
+ sizeof(derived_key_resp->data)))
+ rc = -EFAULT;
+ }
/* The response buffer contains the sensitive data, explicitly clear it. */
- memzero_explicit(buf, sizeof(buf));
- memzero_explicit(&derived_key_resp, sizeof(derived_key_resp));
+ memzero_explicit(derived_key_resp, sizeof(*derived_key_resp));
+
return rc;
}
diff --git a/drivers/virtio/virtio_input.c b/drivers/virtio/virtio_input.c
index a5d63269f20b..d0728285b6ce 100644
--- a/drivers/virtio/virtio_input.c
+++ b/drivers/virtio/virtio_input.c
@@ -360,11 +360,15 @@ static int virtinput_freeze(struct virtio_device *vdev)
{
struct virtio_input *vi = vdev->priv;
unsigned long flags;
+ void *buf;
spin_lock_irqsave(&vi->lock, flags);
vi->ready = false;
spin_unlock_irqrestore(&vi->lock, flags);
+ virtio_reset_device(vdev);
+ while ((buf = virtqueue_detach_unused_buf(vi->sts)) != NULL)
+ kfree(buf);
vdev->config->del_vqs(vdev);
return 0;
}
diff --git a/drivers/virtio/virtio_pci_legacy_dev.c b/drivers/virtio/virtio_pci_legacy_dev.c
index 677d1f68bc9b..bbbf89c22880 100644
--- a/drivers/virtio/virtio_pci_legacy_dev.c
+++ b/drivers/virtio/virtio_pci_legacy_dev.c
@@ -140,9 +140,9 @@ EXPORT_SYMBOL_GPL(vp_legacy_set_status);
* vp_legacy_queue_vector - set the MSIX vector for a specific virtqueue
* @ldev: the legacy virtio-pci device
* @index: queue index
- * @vector: the config vector
+ * @vector: the queue vector
*
- * Returns the config vector read from the device
+ * Returns the queue vector read from the device
*/
u16 vp_legacy_queue_vector(struct virtio_pci_legacy_device *ldev,
u16 index, u16 vector)
diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c
index d665f8f73ea8..9e503b7a58d8 100644
--- a/drivers/virtio/virtio_pci_modern_dev.c
+++ b/drivers/virtio/virtio_pci_modern_dev.c
@@ -546,9 +546,9 @@ EXPORT_SYMBOL_GPL(vp_modern_set_queue_reset);
* vp_modern_queue_vector - set the MSIX vector for a specific virtqueue
* @mdev: the modern virtio-pci device
* @index: queue index
- * @vector: the config vector
+ * @vector: the queue vector
*
- * Returns the config vector read from the device
+ * Returns the queue vector read from the device
*/
u16 vp_modern_queue_vector(struct virtio_pci_modern_device *mdev,
u16 index, u16 vector)
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 2de37dcd7556..49c3f9926394 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -302,7 +302,7 @@ static enum bp_state reserve_additional_memory(void)
* are not restored since this region is now known not to
* conflict with any devices.
*/
- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ if (xen_pv_domain()) {
unsigned long pfn, i;
pfn = PFN_DOWN(resource->start);
@@ -626,7 +626,7 @@ int xen_alloc_ballooned_pages(unsigned int nr_pages, struct page **pages)
*/
BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE);
- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ if (xen_pv_domain()) {
ret = xen_alloc_p2m_entry(page_to_pfn(page));
if (ret < 0)
goto out_undo;
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index 41309d38f78c..9478fae014e5 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -1314,14 +1314,17 @@ int bind_interdomain_evtchn_to_irq_lateeoi(struct xenbus_device *dev,
}
EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irq_lateeoi);
-static int find_virq(unsigned int virq, unsigned int cpu, evtchn_port_t *evtchn)
+static int find_virq(unsigned int virq, unsigned int cpu, evtchn_port_t *evtchn,
+ bool percpu)
{
struct evtchn_status status;
evtchn_port_t port;
- int rc = -ENOENT;
+ bool exists = false;
memset(&status, 0, sizeof(status));
for (port = 0; port < xen_evtchn_max_channels(); port++) {
+ int rc;
+
status.dom = DOMID_SELF;
status.port = port;
rc = HYPERVISOR_event_channel_op(EVTCHNOP_status, &status);
@@ -1329,12 +1332,16 @@ static int find_virq(unsigned int virq, unsigned int cpu, evtchn_port_t *evtchn)
continue;
if (status.status != EVTCHNSTAT_virq)
continue;
- if (status.u.virq == virq && status.vcpu == xen_vcpu_nr(cpu)) {
+ if (status.u.virq != virq)
+ continue;
+ if (status.vcpu == xen_vcpu_nr(cpu)) {
*evtchn = port;
- break;
+ return 0;
+ } else if (!percpu) {
+ exists = true;
}
}
- return rc;
+ return exists ? -EEXIST : -ENOENT;
}
/**
@@ -1381,8 +1388,11 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu)
evtchn = bind_virq.port;
else {
if (ret == -EEXIST)
- ret = find_virq(virq, cpu, &evtchn);
- BUG_ON(ret < 0);
+ ret = find_virq(virq, cpu, &evtchn, percpu);
+ if (ret) {
+ __unbind_from_irq(info, info->irq);
+ goto out;
+ }
}
ret = xen_irq_info_virq_setup(info, cpu, evtchn, virq);
@@ -1787,9 +1797,20 @@ static int xen_rebind_evtchn_to_cpu(struct irq_info *info, unsigned int tcpu)
* virq or IPI channel, which don't actually need to be rebound. Ignore
* it, but don't do the xenlinux-level rebind in that case.
*/
- if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) {
+ int old_cpu = info->cpu;
+
bind_evtchn_to_cpu(info, tcpu, false);
+ if (info->type == IRQT_VIRQ) {
+ int virq = info->u.virq;
+ int irq = per_cpu(virq_to_irq, old_cpu)[virq];
+
+ per_cpu(virq_to_irq, old_cpu)[virq] = -1;
+ per_cpu(virq_to_irq, tcpu)[virq] = irq;
+ }
+ }
+
do_unmask(info, EVT_MASK_REASON_TEMPORARY);
return 0;
diff --git a/drivers/xen/gntdev-dmabuf.c b/drivers/xen/gntdev-dmabuf.c
index 82855105ab85..550980dd3b0b 100644
--- a/drivers/xen/gntdev-dmabuf.c
+++ b/drivers/xen/gntdev-dmabuf.c
@@ -720,16 +720,15 @@ static void dmabuf_imp_release_all(struct gntdev_dmabuf_priv *priv)
/* DMA buffer IOCTL support. */
-long gntdev_ioctl_dmabuf_exp_from_refs(struct gntdev_priv *priv, int use_ptemod,
+long gntdev_ioctl_dmabuf_exp_from_refs(struct gntdev_priv *priv,
struct ioctl_gntdev_dmabuf_exp_from_refs __user *u)
{
struct ioctl_gntdev_dmabuf_exp_from_refs op;
u32 *refs;
long ret;
- if (use_ptemod) {
- pr_debug("Cannot provide dma-buf: use_ptemode %d\n",
- use_ptemod);
+ if (xen_pv_domain()) {
+ pr_debug("Cannot provide dma-buf in a PV domain\n");
return -EINVAL;
}
diff --git a/drivers/xen/gntdev-dmabuf.h b/drivers/xen/gntdev-dmabuf.h
index 3d9b9cf9d5a1..9adf96ac74d3 100644
--- a/drivers/xen/gntdev-dmabuf.h
+++ b/drivers/xen/gntdev-dmabuf.h
@@ -18,7 +18,7 @@ struct gntdev_dmabuf_priv *gntdev_dmabuf_init(struct file *filp);
void gntdev_dmabuf_fini(struct gntdev_dmabuf_priv *priv);
-long gntdev_ioctl_dmabuf_exp_from_refs(struct gntdev_priv *priv, int use_ptemod,
+long gntdev_ioctl_dmabuf_exp_from_refs(struct gntdev_priv *priv,
struct ioctl_gntdev_dmabuf_exp_from_refs __user *u);
long gntdev_ioctl_dmabuf_exp_wait_released(struct gntdev_priv *priv,
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 1f2160765618..91ba5078c9d9 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -73,9 +73,6 @@ module_param(limit, uint, 0644);
MODULE_PARM_DESC(limit,
"Maximum number of grants that may be mapped by one mapping request");
-/* True in PV mode, false otherwise */
-static int use_ptemod;
-
static void unmap_grant_pages(struct gntdev_grant_map *map,
int offset, int pages);
@@ -163,7 +160,7 @@ struct gntdev_grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count,
NULL == add->pages ||
NULL == add->being_removed)
goto err;
- if (use_ptemod) {
+ if (xen_pv_domain()) {
add->kmap_ops = kvmalloc_array(count, sizeof(add->kmap_ops[0]),
GFP_KERNEL);
add->kunmap_ops = kvmalloc_array(count, sizeof(add->kunmap_ops[0]),
@@ -211,7 +208,7 @@ struct gntdev_grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count,
add->grants[i].ref = INVALID_GRANT_REF;
add->map_ops[i].handle = INVALID_GRANT_HANDLE;
add->unmap_ops[i].handle = INVALID_GRANT_HANDLE;
- if (use_ptemod) {
+ if (xen_pv_domain()) {
add->kmap_ops[i].handle = INVALID_GRANT_HANDLE;
add->kunmap_ops[i].handle = INVALID_GRANT_HANDLE;
}
@@ -268,7 +265,7 @@ void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map)
if (!refcount_dec_and_test(&map->users))
return;
- if (map->pages && !use_ptemod) {
+ if (map->pages && !xen_pv_domain()) {
/*
* Increment the reference count. This ensures that the
* subsequent call to unmap_grant_pages() will not wind up
@@ -298,7 +295,7 @@ void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map)
*/
}
- if (use_ptemod && map->notifier_init)
+ if (xen_pv_domain() && map->notifier_init)
mmu_interval_notifier_remove(&map->notifier);
if (map->notify.flags & UNMAP_NOTIFY_SEND_EVENT) {
@@ -334,7 +331,7 @@ int gntdev_map_grant_pages(struct gntdev_grant_map *map)
size_t alloced = 0;
int i, err = 0;
- if (!use_ptemod) {
+ if (!xen_pv_domain()) {
/* Note: it could already be mapped */
if (map->map_ops[0].handle != INVALID_GRANT_HANDLE)
return 0;
@@ -389,7 +386,7 @@ int gntdev_map_grant_pages(struct gntdev_grant_map *map)
if (map->flags & GNTMAP_device_map)
map->unmap_ops[i].dev_bus_addr = map->map_ops[i].dev_bus_addr;
- if (use_ptemod) {
+ if (xen_pv_domain()) {
if (map->kmap_ops[i].status == GNTST_okay) {
alloced++;
map->kunmap_ops[i].handle = map->kmap_ops[i].handle;
@@ -421,7 +418,7 @@ static void __unmap_grant_pages_done(int result,
map->unmap_ops[offset+i].handle,
map->unmap_ops[offset+i].status);
map->unmap_ops[offset+i].handle = INVALID_GRANT_HANDLE;
- if (use_ptemod) {
+ if (xen_pv_domain()) {
if (map->kunmap_ops[offset + i].status == GNTST_okay &&
map->kunmap_ops[offset + i].handle != INVALID_GRANT_HANDLE)
successful_unmaps++;
@@ -464,7 +461,7 @@ static void __unmap_grant_pages(struct gntdev_grant_map *map, int offset,
}
map->unmap_data.unmap_ops = map->unmap_ops + offset;
- map->unmap_data.kunmap_ops = use_ptemod ? map->kunmap_ops + offset : NULL;
+ map->unmap_data.kunmap_ops = xen_pv_domain() ? map->kunmap_ops + offset : NULL;
map->unmap_data.pages = map->pages + offset;
map->unmap_data.count = pages;
map->unmap_data.done = __unmap_grant_pages_done;
@@ -1039,7 +1036,7 @@ static long gntdev_ioctl(struct file *flip,
#ifdef CONFIG_XEN_GNTDEV_DMABUF
case IOCTL_GNTDEV_DMABUF_EXP_FROM_REFS:
- return gntdev_ioctl_dmabuf_exp_from_refs(priv, use_ptemod, ptr);
+ return gntdev_ioctl_dmabuf_exp_from_refs(priv, ptr);
case IOCTL_GNTDEV_DMABUF_EXP_WAIT_RELEASED:
return gntdev_ioctl_dmabuf_exp_wait_released(priv, ptr);
@@ -1086,7 +1083,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP | VM_MIXEDMAP);
- if (use_ptemod)
+ if (xen_pv_domain())
vm_flags_set(vma, VM_DONTCOPY);
vma->vm_private_data = map;
@@ -1102,7 +1099,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
map->pages_vm_start = vma->vm_start;
- if (use_ptemod) {
+ if (xen_pv_domain()) {
err = mmu_interval_notifier_insert_locked(
&map->notifier, vma->vm_mm, vma->vm_start,
vma->vm_end - vma->vm_start, &gntdev_mmu_ops);
@@ -1113,7 +1110,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
}
mutex_unlock(&priv->lock);
- if (use_ptemod) {
+ if (xen_pv_domain()) {
/*
* gntdev takes the address of the PTE in find_grant_ptes() and
* passes it to the hypervisor in gntdev_map_grant_pages(). The
@@ -1139,7 +1136,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
if (err)
goto out_put_map;
- if (!use_ptemod) {
+ if (!xen_pv_domain()) {
err = vm_map_pages_zero(vma, map->pages, map->count);
if (err)
goto out_put_map;
@@ -1154,7 +1151,7 @@ unlock_out:
out_unlock_put:
mutex_unlock(&priv->lock);
out_put_map:
- if (use_ptemod)
+ if (xen_pv_domain())
unmap_grant_pages(map, 0, map->count);
gntdev_put_map(priv, map);
return err;
@@ -1183,8 +1180,6 @@ static int __init gntdev_init(void)
if (!xen_domain())
return -ENODEV;
- use_ptemod = !xen_feature(XENFEAT_auto_translated_physmap);
-
err = misc_register(&gntdev_miscdev);
if (err != 0) {
pr_err("Could not register gntdev device\n");
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 04a6b470b15d..478d2ad725ac 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -1449,7 +1449,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
unsigned int nr_gframes = end_idx + 1;
int rc;
- if (xen_feature(XENFEAT_auto_translated_physmap)) {
+ if (!xen_pv_domain()) {
struct xen_add_to_physmap xatp;
unsigned int i = end_idx;
rc = 0;
@@ -1570,7 +1570,7 @@ static int gnttab_setup(void)
if (max_nr_gframes < nr_grant_frames)
return -ENOSYS;
- if (xen_feature(XENFEAT_auto_translated_physmap) && gnttab_shared.addr == NULL) {
+ if (!xen_pv_domain() && gnttab_shared.addr == NULL) {
gnttab_shared.addr = xen_auto_xlat_grant_frames.vaddr;
if (gnttab_shared.addr == NULL) {
pr_warn("gnttab share frames is not mapped!\n");
@@ -1588,7 +1588,7 @@ int gnttab_resume(void)
int gnttab_suspend(void)
{
- if (!xen_feature(XENFEAT_auto_translated_physmap))
+ if (xen_pv_domain())
gnttab_interface->unmap_frames();
return 0;
}
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 841afa4933c7..e20c40a62e64 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -11,6 +11,7 @@
#include <linux/reboot.h>
#include <linux/sysrq.h>
#include <linux/stop_machine.h>
+#include <linux/suspend.h>
#include <linux/freezer.h>
#include <linux/syscore_ops.h>
#include <linux/export.h>
@@ -95,10 +96,16 @@ static void do_suspend(void)
shutting_down = SHUTDOWN_SUSPEND;
+ if (!mutex_trylock(&system_transition_mutex))
+ {
+ pr_err("%s: failed to take system_transition_mutex\n", __func__);
+ goto out;
+ }
+
err = freeze_processes();
if (err) {
pr_err("%s: freeze processes failed %d\n", __func__, err);
- goto out;
+ goto out_unlock;
}
err = freeze_kernel_threads();
@@ -110,7 +117,7 @@ static void do_suspend(void)
err = dpm_suspend_start(PMSG_FREEZE);
if (err) {
pr_err("%s: dpm_suspend_start %d\n", __func__, err);
- goto out_thaw;
+ goto out_resume_end;
}
printk(KERN_DEBUG "suspending xenstore...\n");
@@ -150,10 +157,13 @@ out_resume:
else
xs_suspend_cancel();
+out_resume_end:
dpm_resume_end(si.cancelled ? PMSG_THAW : PMSG_RESTORE);
out_thaw:
thaw_processes();
+out_unlock:
+ mutex_unlock(&system_transition_mutex);
out:
shutting_down = SHUTDOWN_INVALID;
}
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index 13a10f3294a8..f52a457b302d 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -271,7 +271,7 @@ static long privcmd_ioctl_mmap(struct file *file, void __user *udata)
struct mmap_gfn_state state;
/* We only support privcmd_ioctl_mmap_batch for non-auto-translated. */
- if (xen_feature(XENFEAT_auto_translated_physmap))
+ if (!xen_pv_domain())
return -ENOSYS;
if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
@@ -353,7 +353,7 @@ static int mmap_batch_fn(void *data, int nr, void *state)
struct page **cur_pages = NULL;
int ret;
- if (xen_feature(XENFEAT_auto_translated_physmap))
+ if (!xen_pv_domain())
cur_pages = &pages[st->index];
BUG_ON(nr < 0);
@@ -535,7 +535,7 @@ static long privcmd_ioctl_mmap_batch(
ret = -EINVAL;
goto out_unlock;
}
- if (xen_feature(XENFEAT_auto_translated_physmap)) {
+ if (!xen_pv_domain()) {
ret = alloc_empty_pages(vma, nr_pages);
if (ret < 0)
goto out_unlock;
@@ -779,8 +779,7 @@ static long privcmd_ioctl_mmap_resource(struct file *file,
goto out;
}
- if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) &&
- xen_feature(XENFEAT_auto_translated_physmap)) {
+ if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) && !xen_pv_domain()) {
unsigned int nr = DIV_ROUND_UP(kdata.num, XEN_PFN_PER_PAGE);
struct page **pages;
unsigned int i;
@@ -811,8 +810,7 @@ static long privcmd_ioctl_mmap_resource(struct file *file,
if (rc)
goto out;
- if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) &&
- xen_feature(XENFEAT_auto_translated_physmap)) {
+ if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) && !xen_pv_domain()) {
rc = xen_remap_vma_range(vma, kdata.addr, kdata.num << PAGE_SHIFT);
} else {
unsigned int domid =
@@ -1591,7 +1589,7 @@ static void privcmd_close(struct vm_area_struct *vma)
int numgfns = (vma->vm_end - vma->vm_start) >> XEN_PAGE_SHIFT;
int rc;
- if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages)
+ if (xen_pv_domain() || !numpgs || !pages)
return;
rc = xen_unmap_domain_gfn_range(vma, numgfns, pages);
diff --git a/drivers/xen/unpopulated-alloc.c b/drivers/xen/unpopulated-alloc.c
index a39f2d36dd9c..d6fc2aefe264 100644
--- a/drivers/xen/unpopulated-alloc.c
+++ b/drivers/xen/unpopulated-alloc.c
@@ -105,7 +105,7 @@ static int fill_list(unsigned int nr_pages)
* are not restored since this region is now known not to
* conflict with any devices.
*/
- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ if (xen_pv_domain()) {
xen_pfn_t pfn = PFN_DOWN(res->start);
for (i = 0; i < alloc_pages; i++) {
@@ -184,7 +184,7 @@ int xen_alloc_unpopulated_pages(unsigned int nr_pages, struct page **pages)
pages[i] = pg;
#ifdef CONFIG_XEN_HAVE_PVMMU
- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ if (xen_pv_domain()) {
ret = xen_alloc_p2m_entry(page_to_pfn(pg));
if (ret < 0) {
unsigned int j;
diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
index e73ec225d4a6..2dc874fb5506 100644
--- a/drivers/xen/xenbus/xenbus_client.c
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -955,7 +955,7 @@ static const struct xenbus_ring_ops ring_ops_hvm = {
void __init xenbus_ring_ops_init(void)
{
#ifdef CONFIG_XEN_PV
- if (!xen_feature(XENFEAT_auto_translated_physmap))
+ if (xen_pv_domain())
ring_ops = &ring_ops_pv;
else
#endif
diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c
index 3c9da446b85d..528682bf0c7f 100644
--- a/drivers/xen/xenbus/xenbus_xs.c
+++ b/drivers/xen/xenbus/xenbus_xs.c
@@ -718,26 +718,6 @@ int xs_watch_msg(struct xs_watch_event *event)
return 0;
}
-/*
- * Certain older XenBus toolstack cannot handle reading values that are
- * not populated. Some Xen 3.4 installation are incapable of doing this
- * so if we are running on anything older than 4 do not attempt to read
- * control/platform-feature-xs_reset_watches.
- */
-static bool xen_strict_xenbus_quirk(void)
-{
-#ifdef CONFIG_X86
- uint32_t eax, ebx, ecx, edx, base;
-
- base = xen_cpuid_base();
- cpuid(base + 1, &eax, &ebx, &ecx, &edx);
-
- if ((eax >> 16) < 4)
- return true;
-#endif
- return false;
-
-}
static void xs_reset_watches(void)
{
int err;
@@ -745,9 +725,6 @@ static void xs_reset_watches(void)
if (!xen_hvm_domain() || xen_initial_domain())
return;
- if (xen_strict_xenbus_quirk())
- return;
-
if (!xenbus_read_unsigned("control",
"platform-feature-xs_reset_watches", 0))
return;
diff --git a/drivers/zorro/names.c b/drivers/zorro/names.c
index 077114ccc840..b44f90989a66 100644
--- a/drivers/zorro/names.c
+++ b/drivers/zorro/names.c
@@ -36,21 +36,21 @@ struct zorro_manuf_info {
* real memory.. Parse the same file multiple times
* to get all the info.
*/
-#define MANUF( manuf, name ) static char __manufstr_##manuf[] __initdata = name;
+#define MANUF(manuf, name) static char __manufstr_##manuf[] __initdata = name;
#define ENDMANUF()
-#define PRODUCT( manuf, prod, name ) static char __prodstr_##manuf##prod[] __initdata = name;
+#define PRODUCT(manuf, prod, name) static char __prodstr_##manuf##prod[] __initdata = name;
#include "devlist.h"
-#define MANUF( manuf, name ) static struct zorro_prod_info __prods_##manuf[] __initdata = {
+#define MANUF(manuf, name) static struct zorro_prod_info __prods_##manuf[] __initdata = {
#define ENDMANUF() };
-#define PRODUCT( manuf, prod, name ) { 0x##prod, 0, __prodstr_##manuf##prod },
+#define PRODUCT(manuf, prod, name) { 0x##prod, 0, __prodstr_##manuf##prod },
#include "devlist.h"
static struct zorro_manuf_info __initdata zorro_manuf_list[] = {
-#define MANUF( manuf, name ) { 0x##manuf, ARRAY_SIZE(__prods_##manuf), __manufstr_##manuf, __prods_##manuf },
+#define MANUF(manuf, name) { 0x##manuf, ARRAY_SIZE(__prods_##manuf), __manufstr_##manuf, __prods_##manuf },
#define ENDMANUF()
-#define PRODUCT( manuf, prod, name )
+#define PRODUCT(manuf, prod, name)
#include "devlist.h"
};
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 795c6388744c..1581ebac5bb4 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -252,7 +252,7 @@ static int v9fs_drop_inode(struct inode *inode)
v9ses = v9fs_inode2v9ses(inode);
if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
- return generic_drop_inode(inode);
+ return inode_generic_drop(inode);
/*
* in case of non cached mode always drop the
* inode because we want the inode attribute
diff --git a/fs/Kconfig b/fs/Kconfig
index c654a3642897..7815379032da 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -51,7 +51,6 @@ source "fs/ocfs2/Kconfig"
source "fs/btrfs/Kconfig"
source "fs/nilfs2/Kconfig"
source "fs/f2fs/Kconfig"
-source "fs/bcachefs/Kconfig"
source "fs/zonefs/Kconfig"
endif # BLOCK
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index bd2f530e5740..1949e25c7741 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -184,4 +184,13 @@ config EXEC_KUNIT_TEST
This builds the exec KUnit tests, which tests boundary conditions
of various aspects of the exec internals.
+config ARCH_HAS_ELF_CORE_EFLAGS
+ bool
+ depends on BINFMT_ELF && ELF_CORE
+ default n
+ help
+ Select this option if the architecture makes use of the e_flags
+ field in the ELF header to store ABI or other architecture-specific
+ information that should be preserved in core dumps.
+
endmenu
diff --git a/fs/Makefile b/fs/Makefile
index 334654f9584b..e3523ab2e587 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -121,7 +121,6 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
obj-$(CONFIG_F2FS_FS) += f2fs/
-obj-$(CONFIG_BCACHEFS_FS) += bcachefs/
obj-$(CONFIG_CEPH_FS) += ceph/
obj-$(CONFIG_PSTORE) += pstore/
obj-$(CONFIG_EFIVAR_FS) += efivarfs/
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 69e1dd55b160..894d2bad6b6c 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -42,7 +42,7 @@ static void afs_volume_init_callback(struct afs_volume *volume)
list_for_each_entry(vnode, &volume->open_mmaps, cb_mmap_link) {
if (vnode->cb_v_check != atomic_read(&volume->cb_v_break)) {
afs_clear_cb_promise(vnode, afs_cb_promise_clear_vol_init_cb);
- queue_work(system_unbound_wq, &vnode->cb_work);
+ queue_work(system_dfl_wq, &vnode->cb_work);
}
}
@@ -90,7 +90,7 @@ void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reas
if (reason != afs_cb_break_for_deleted &&
vnode->status.type == AFS_FTYPE_FILE &&
atomic_read(&vnode->cb_nr_mmap))
- queue_work(system_unbound_wq, &vnode->cb_work);
+ queue_work(system_dfl_wq, &vnode->cb_work);
trace_afs_cb_break(&vnode->fid, vnode->cb_break, reason, true);
} else {
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index bfb69e066672..89d36e3e5c79 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1823,7 +1823,8 @@ error:
static void afs_rename_success(struct afs_operation *op)
{
- struct afs_vnode *vnode = AFS_FS_I(d_inode(op->dentry));
+ struct afs_vnode *vnode = op->more_files[0].vnode;
+ struct afs_vnode *new_vnode = op->more_files[1].vnode;
_enter("op=%08x", op->debug_id);
@@ -1834,22 +1835,40 @@ static void afs_rename_success(struct afs_operation *op)
op->ctime = op->file[1].scb.status.mtime_client;
afs_vnode_commit_status(op, &op->file[1]);
}
+ if (op->more_files[0].scb.have_status)
+ afs_vnode_commit_status(op, &op->more_files[0]);
+ if (op->more_files[1].scb.have_status)
+ afs_vnode_commit_status(op, &op->more_files[1]);
/* If we're moving a subdir between dirs, we need to update
* its DV counter too as the ".." will be altered.
*/
- if (S_ISDIR(vnode->netfs.inode.i_mode) &&
- op->file[0].vnode != op->file[1].vnode) {
- u64 new_dv;
+ if (op->file[0].vnode != op->file[1].vnode) {
+ if (S_ISDIR(vnode->netfs.inode.i_mode)) {
+ u64 new_dv;
- write_seqlock(&vnode->cb_lock);
+ write_seqlock(&vnode->cb_lock);
- new_dv = vnode->status.data_version + 1;
- trace_afs_set_dv(vnode, new_dv);
- vnode->status.data_version = new_dv;
- inode_set_iversion_raw(&vnode->netfs.inode, new_dv);
+ new_dv = vnode->status.data_version + 1;
+ trace_afs_set_dv(vnode, new_dv);
+ vnode->status.data_version = new_dv;
+ inode_set_iversion_raw(&vnode->netfs.inode, new_dv);
- write_sequnlock(&vnode->cb_lock);
+ write_sequnlock(&vnode->cb_lock);
+ }
+
+ if ((op->rename.rename_flags & RENAME_EXCHANGE) &&
+ S_ISDIR(new_vnode->netfs.inode.i_mode)) {
+ u64 new_dv;
+
+ write_seqlock(&new_vnode->cb_lock);
+
+ new_dv = new_vnode->status.data_version + 1;
+ new_vnode->status.data_version = new_dv;
+ inode_set_iversion_raw(&new_vnode->netfs.inode, new_dv);
+
+ write_sequnlock(&new_vnode->cb_lock);
+ }
}
}
@@ -1900,8 +1919,8 @@ static void afs_rename_edit_dir(struct afs_operation *op)
if (S_ISDIR(vnode->netfs.inode.i_mode) &&
new_dvnode != orig_dvnode &&
test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
- afs_edit_dir_update_dotdot(vnode, new_dvnode,
- afs_edit_dir_for_rename_sub);
+ afs_edit_dir_update(vnode, &dotdot_name, new_dvnode,
+ afs_edit_dir_for_rename_sub);
new_inode = d_inode(new_dentry);
if (new_inode) {
@@ -1915,9 +1934,6 @@ static void afs_rename_edit_dir(struct afs_operation *op)
/* Now we can update d_fsdata on the dentries to reflect their
* new parent's data_version.
- *
- * Note that if we ever implement RENAME_EXCHANGE, we'll have
- * to update both dentries with opposing dir versions.
*/
afs_update_dentry_version(op, new_dvp, op->dentry);
afs_update_dentry_version(op, new_dvp, op->dentry_2);
@@ -1930,6 +1946,67 @@ static void afs_rename_edit_dir(struct afs_operation *op)
fscache_end_operation(&new_cres);
}
+static void afs_rename_exchange_edit_dir(struct afs_operation *op)
+{
+ struct afs_vnode_param *orig_dvp = &op->file[0];
+ struct afs_vnode_param *new_dvp = &op->file[1];
+ struct afs_vnode *orig_dvnode = orig_dvp->vnode;
+ struct afs_vnode *new_dvnode = new_dvp->vnode;
+ struct afs_vnode *old_vnode = op->more_files[0].vnode;
+ struct afs_vnode *new_vnode = op->more_files[1].vnode;
+ struct dentry *old_dentry = op->dentry;
+ struct dentry *new_dentry = op->dentry_2;
+
+ _enter("op=%08x", op->debug_id);
+
+ if (new_dvnode == orig_dvnode) {
+ down_write(&orig_dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) &&
+ orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta) {
+ afs_edit_dir_update(orig_dvnode, &old_dentry->d_name,
+ new_vnode, afs_edit_dir_for_rename_0);
+ afs_edit_dir_update(orig_dvnode, &new_dentry->d_name,
+ old_vnode, afs_edit_dir_for_rename_1);
+ }
+
+ d_exchange(old_dentry, new_dentry);
+ up_write(&orig_dvnode->validate_lock);
+ } else {
+ down_write(&orig_dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) &&
+ orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta)
+ afs_edit_dir_update(orig_dvnode, &old_dentry->d_name,
+ new_vnode, afs_edit_dir_for_rename_0);
+
+ up_write(&orig_dvnode->validate_lock);
+ down_write(&new_dvnode->validate_lock);
+
+ if (test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags) &&
+ new_dvnode->status.data_version == new_dvp->dv_before + new_dvp->dv_delta)
+ afs_edit_dir_update(new_dvnode, &new_dentry->d_name,
+ old_vnode, afs_edit_dir_for_rename_1);
+
+ if (S_ISDIR(old_vnode->netfs.inode.i_mode) &&
+ test_bit(AFS_VNODE_DIR_VALID, &old_vnode->flags))
+ afs_edit_dir_update(old_vnode, &dotdot_name, new_dvnode,
+ afs_edit_dir_for_rename_sub);
+
+ if (S_ISDIR(new_vnode->netfs.inode.i_mode) &&
+ test_bit(AFS_VNODE_DIR_VALID, &new_vnode->flags))
+ afs_edit_dir_update(new_vnode, &dotdot_name, orig_dvnode,
+ afs_edit_dir_for_rename_sub);
+
+ /* Now we can update d_fsdata on the dentries to reflect their
+ * new parents' data_version.
+ */
+ afs_update_dentry_version(op, new_dvp, old_dentry);
+ afs_update_dentry_version(op, orig_dvp, new_dentry);
+
+ d_exchange(old_dentry, new_dentry);
+ up_write(&new_dvnode->validate_lock);
+ }
+}
+
static void afs_rename_put(struct afs_operation *op)
{
_enter("op=%08x", op->debug_id);
@@ -1948,6 +2025,32 @@ static const struct afs_operation_ops afs_rename_operation = {
.put = afs_rename_put,
};
+#if 0 /* Autoswitched in yfs_fs_rename_replace(). */
+static const struct afs_operation_ops afs_rename_replace_operation = {
+ .issue_afs_rpc = NULL,
+ .issue_yfs_rpc = yfs_fs_rename_replace,
+ .success = afs_rename_success,
+ .edit_dir = afs_rename_edit_dir,
+ .put = afs_rename_put,
+};
+#endif
+
+static const struct afs_operation_ops afs_rename_noreplace_operation = {
+ .issue_afs_rpc = NULL,
+ .issue_yfs_rpc = yfs_fs_rename_noreplace,
+ .success = afs_rename_success,
+ .edit_dir = afs_rename_edit_dir,
+ .put = afs_rename_put,
+};
+
+static const struct afs_operation_ops afs_rename_exchange_operation = {
+ .issue_afs_rpc = NULL,
+ .issue_yfs_rpc = yfs_fs_rename_exchange,
+ .success = afs_rename_success,
+ .edit_dir = afs_rename_exchange_edit_dir,
+ .put = afs_rename_put,
+};
+
/*
* rename a file in an AFS filesystem and/or move it between directories
*/
@@ -1956,10 +2059,10 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
struct dentry *new_dentry, unsigned int flags)
{
struct afs_operation *op;
- struct afs_vnode *orig_dvnode, *new_dvnode, *vnode;
+ struct afs_vnode *orig_dvnode, *new_dvnode, *vnode, *new_vnode = NULL;
int ret;
- if (flags)
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
return -EINVAL;
/* Don't allow silly-rename files be moved around. */
@@ -1969,6 +2072,8 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
vnode = AFS_FS_I(d_inode(old_dentry));
orig_dvnode = AFS_FS_I(old_dir);
new_dvnode = AFS_FS_I(new_dir);
+ if (d_is_positive(new_dentry))
+ new_vnode = AFS_FS_I(d_inode(new_dentry));
_enter("{%llx:%llu},{%llx:%llu},{%llx:%llu},{%pd}",
orig_dvnode->fid.vid, orig_dvnode->fid.vnode,
@@ -1989,6 +2094,11 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (ret < 0)
goto error;
+ ret = -ENOMEM;
+ op->more_files = kvcalloc(2, sizeof(struct afs_vnode_param), GFP_KERNEL);
+ if (!op->more_files)
+ goto error;
+
afs_op_set_vnode(op, 0, orig_dvnode);
afs_op_set_vnode(op, 1, new_dvnode); /* May be same as orig_dvnode */
op->file[0].dv_delta = 1;
@@ -1997,46 +2107,63 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
op->file[1].modification = true;
op->file[0].update_ctime = true;
op->file[1].update_ctime = true;
+ op->more_files[0].vnode = vnode;
+ op->more_files[0].speculative = true;
+ op->more_files[1].vnode = new_vnode;
+ op->more_files[1].speculative = true;
+ op->nr_files = 4;
op->dentry = old_dentry;
op->dentry_2 = new_dentry;
+ op->rename.rename_flags = flags;
op->rename.new_negative = d_is_negative(new_dentry);
- op->ops = &afs_rename_operation;
- /* For non-directories, check whether the target is busy and if so,
- * make a copy of the dentry and then do a silly-rename. If the
- * silly-rename succeeds, the copied dentry is hashed and becomes the
- * new target.
- */
- if (d_is_positive(new_dentry) && !d_is_dir(new_dentry)) {
- /* To prevent any new references to the target during the
- * rename, we unhash the dentry in advance.
+ if (flags & RENAME_NOREPLACE) {
+ op->ops = &afs_rename_noreplace_operation;
+ } else if (flags & RENAME_EXCHANGE) {
+ op->ops = &afs_rename_exchange_operation;
+ d_drop(new_dentry);
+ } else {
+ /* If we might displace the target, we may need to do a
+ * silly-rename.
*/
- if (!d_unhashed(new_dentry)) {
- d_drop(new_dentry);
- op->rename.rehash = new_dentry;
- }
+ op->ops = &afs_rename_operation;
- if (d_count(new_dentry) > 2) {
- /* copy the target dentry's name */
- op->rename.tmp = d_alloc(new_dentry->d_parent,
- &new_dentry->d_name);
- if (!op->rename.tmp) {
- afs_op_nomem(op);
- goto error;
+ /* For non-directories, check whether the target is busy and if
+ * so, make a copy of the dentry and then do a silly-rename.
+ * If the silly-rename succeeds, the copied dentry is hashed
+ * and becomes the new target.
+ */
+ if (d_is_positive(new_dentry) && !d_is_dir(new_dentry)) {
+ /* To prevent any new references to the target during
+ * the rename, we unhash the dentry in advance.
+ */
+ if (!d_unhashed(new_dentry)) {
+ d_drop(new_dentry);
+ op->rename.rehash = new_dentry;
}
- ret = afs_sillyrename(new_dvnode,
- AFS_FS_I(d_inode(new_dentry)),
- new_dentry, op->key);
- if (ret) {
- afs_op_set_error(op, ret);
- goto error;
+ if (d_count(new_dentry) > 2) {
+ /* copy the target dentry's name */
+ op->rename.tmp = d_alloc(new_dentry->d_parent,
+ &new_dentry->d_name);
+ if (!op->rename.tmp) {
+ afs_op_nomem(op);
+ goto error;
+ }
+
+ ret = afs_sillyrename(new_dvnode,
+ AFS_FS_I(d_inode(new_dentry)),
+ new_dentry, op->key);
+ if (ret) {
+ afs_op_set_error(op, ret);
+ goto error;
+ }
+
+ op->dentry_2 = op->rename.tmp;
+ op->rename.rehash = NULL;
+ op->rename.new_negative = true;
}
-
- op->dentry_2 = op->rename.tmp;
- op->rename.rehash = NULL;
- op->rename.new_negative = true;
}
}
@@ -2052,6 +2179,8 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
d_drop(old_dentry);
ret = afs_do_sync_operation(op);
+ if (ret == -ENOTSUPP)
+ ret = -EINVAL;
out:
afs_dir_unuse_cookie(orig_dvnode, ret);
if (new_dvnode != orig_dvnode)
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index 60a549f1d9c5..4b1342c72089 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -522,11 +522,11 @@ error:
}
/*
- * Edit a subdirectory that has been moved between directories to update the
- * ".." entry.
+ * Edit an entry in a directory to update the vnode it refers to. This is also
+ * used to update the ".." entry in a directory.
*/
-void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode,
- enum afs_edit_dir_reason why)
+void afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name,
+ struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why)
{
union afs_xdr_dir_block *block;
union afs_xdr_dirent *de;
@@ -557,7 +557,7 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d
if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
goto already_invalidated;
- slot = afs_dir_scan_block(block, &dotdot_name, b);
+ slot = afs_dir_scan_block(block, name, b);
if (slot >= 0)
goto found_dirent;
@@ -566,7 +566,7 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d
/* Didn't find the dirent to clobber. Download the directory again. */
trace_afs_edit_dir(vnode, why, afs_edit_dir_update_nodd,
- 0, 0, 0, 0, "..");
+ 0, 0, 0, 0, name->name);
afs_invalidate_dir(vnode, afs_dir_invalid_edit_upd_no_dd);
goto out;
@@ -576,7 +576,7 @@ found_dirent:
de->u.unique = htonl(new_dvnode->fid.unique);
trace_afs_edit_dir(vnode, why, afs_edit_dir_update_dd, b, slot,
- ntohl(de->u.vnode), ntohl(de->u.unique), "..");
+ ntohl(de->u.vnode), ntohl(de->u.unique), name->name);
kunmap_local(block);
netfs_single_mark_inode_dirty(&vnode->netfs.inode);
@@ -589,12 +589,12 @@ out:
already_invalidated:
kunmap_local(block);
trace_afs_edit_dir(vnode, why, afs_edit_dir_update_inval,
- 0, 0, 0, 0, "..");
+ 0, 0, 0, 0, name->name);
goto out;
error:
trace_afs_edit_dir(vnode, why, afs_edit_dir_update_error,
- 0, 0, 0, 0, "..");
+ 0, 0, 0, 0, name->name);
goto out;
}
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index 0b80eb93fa40..014495d4b868 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -69,6 +69,12 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
if (IS_ERR(op))
return PTR_ERR(op);
+ op->more_files = kvcalloc(2, sizeof(struct afs_vnode_param), GFP_KERNEL);
+ if (!op->more_files) {
+ afs_put_operation(op);
+ return -ENOMEM;
+ }
+
afs_op_set_vnode(op, 0, dvnode);
afs_op_set_vnode(op, 1, dvnode);
op->file[0].dv_delta = 1;
@@ -77,6 +83,11 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
op->file[1].modification = true;
op->file[0].update_ctime = true;
op->file[1].update_ctime = true;
+ op->more_files[0].vnode = AFS_FS_I(d_inode(old));
+ op->more_files[0].speculative = true;
+ op->more_files[1].vnode = AFS_FS_I(d_inode(new));
+ op->more_files[1].speculative = true;
+ op->nr_files = 4;
op->dentry = old;
op->dentry_2 = new;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index e9538e91f848..e1cb17b85791 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -723,9 +723,9 @@ int afs_drop_inode(struct inode *inode)
_enter("");
if (test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(inode)->flags))
- return generic_delete_inode(inode);
+ return inode_just_drop(inode);
else
- return generic_drop_inode(inode);
+ return inode_generic_drop(inode);
}
/*
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 1124ea4000cb..444a3ea4fdf6 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -562,6 +562,7 @@ struct afs_server {
#define AFS_SERVER_FL_NO_IBULK 17 /* Fileserver doesn't support FS.InlineBulkStatus */
#define AFS_SERVER_FL_NO_RM2 18 /* Fileserver doesn't support YFS.RemoveFile2 */
#define AFS_SERVER_FL_HAS_FS64 19 /* Fileserver supports FS.{Fetch,Store}Data64 */
+#define AFS_SERVER_FL_NO_RENAME2 20 /* YFS Fileserver doesn't support enhanced rename */
refcount_t ref; /* Object refcount */
atomic_t active; /* Active user count */
u32 addr_version; /* Address list version */
@@ -891,9 +892,10 @@ struct afs_operation {
bool need_rehash;
} unlink;
struct {
- struct dentry *rehash;
- struct dentry *tmp;
- bool new_negative;
+ struct dentry *rehash;
+ struct dentry *tmp;
+ unsigned int rename_flags;
+ bool new_negative;
} rename;
struct {
struct netfs_io_subrequest *subreq;
@@ -1100,8 +1102,8 @@ int afs_single_writepages(struct address_space *mapping,
extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *,
enum afs_edit_dir_reason);
extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason);
-void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode,
- enum afs_edit_dir_reason why);
+void afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name,
+ struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why);
void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_vnode);
/*
@@ -1693,6 +1695,9 @@ extern void yfs_fs_remove_dir(struct afs_operation *);
extern void yfs_fs_link(struct afs_operation *);
extern void yfs_fs_symlink(struct afs_operation *);
extern void yfs_fs_rename(struct afs_operation *);
+void yfs_fs_rename_replace(struct afs_operation *op);
+void yfs_fs_rename_noreplace(struct afs_operation *op);
+void yfs_fs_rename_exchange(struct afs_operation *op);
extern void yfs_fs_store_data(struct afs_operation *);
extern void yfs_fs_setattr(struct afs_operation *);
extern void yfs_fs_get_volume_status(struct afs_operation *);
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 02475d415d88..e6bb8237db98 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -169,13 +169,13 @@ static int __init afs_init(void)
printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n");
- afs_wq = alloc_workqueue("afs", 0, 0);
+ afs_wq = alloc_workqueue("afs", WQ_PERCPU, 0);
if (!afs_wq)
goto error_afs_wq;
afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
if (!afs_async_calls)
goto error_async;
- afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM, 0);
+ afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!afs_lock_manager)
goto error_lockmgr;
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 8f2b3a177690..c8a7f266080d 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -131,6 +131,7 @@ int afs_abort_to_error(u32 abort_code)
case KRB5_PROG_KEYTYPE_NOSUPP: return -ENOPKG;
case RXGEN_OPCODE: return -ENOTSUPP;
+ case RX_INVALID_OPERATION: return -ENOTSUPP;
default: return -EREMOTEIO;
}
diff --git a/fs/afs/protocol_yfs.h b/fs/afs/protocol_yfs.h
index e4cd89c44c46..b2f06c1917c2 100644
--- a/fs/afs/protocol_yfs.h
+++ b/fs/afs/protocol_yfs.h
@@ -50,6 +50,9 @@ enum YFS_FS_Operations {
YFSREMOVEACL = 64171,
YFSREMOVEFILE2 = 64173,
YFSSTOREOPAQUEACL2 = 64174,
+ YFSRENAME_REPLACE = 64176,
+ YFSRENAME_NOREPLACE = 64177,
+ YFSRENAME_EXCHANGE = 64187,
YFSINLINEBULKSTATUS = 64536, /* YFS Fetch multiple file statuses with errors */
YFSFETCHDATA64 = 64537, /* YFS Fetch file data */
YFSSTOREDATA64 = 64538, /* YFS Store file data */
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index a1c24f589d9e..6a4e7da10fc4 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -432,6 +432,16 @@ bool afs_select_fileserver(struct afs_operation *op)
afs_op_set_error(op, -EDQUOT);
goto failed_but_online;
+ case RX_INVALID_OPERATION:
+ case RXGEN_OPCODE:
+ /* Handle downgrading to an older operation. */
+ afs_op_set_error(op, -ENOTSUPP);
+ if (op->flags & AFS_OPERATION_DOWNGRADE) {
+ op->flags &= ~AFS_OPERATION_DOWNGRADE;
+ goto go_again;
+ }
+ goto failed_but_online;
+
default:
afs_op_accumulate_error(op, error, abort_code);
failed_but_online:
@@ -620,12 +630,13 @@ iterate_address:
op->addr_index = addr_index;
set_bit(addr_index, &op->addr_tried);
- op->volsync.creation = TIME64_MIN;
- op->volsync.update = TIME64_MIN;
- op->call_responded = false;
_debug("address [%u] %u/%u %pISp",
op->server_index, addr_index, alist->nr_addrs,
rxrpc_kernel_remote_addr(alist->addrs[op->addr_index].peer));
+go_again:
+ op->volsync.creation = TIME64_MIN;
+ op->volsync.update = TIME64_MIN;
+ op->call_responded = false;
_leave(" = t");
return true;
diff --git a/fs/afs/server.c b/fs/afs/server.c
index a97562f831eb..c4428ebddb1d 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -331,13 +331,14 @@ struct afs_server *afs_use_server(struct afs_server *server, bool activate,
void afs_put_server(struct afs_net *net, struct afs_server *server,
enum afs_server_trace reason)
{
- unsigned int a, debug_id = server->debug_id;
+ unsigned int a, debug_id;
bool zero;
int r;
if (!server)
return;
+ debug_id = server->debug_id;
a = atomic_read(&server->active);
zero = __refcount_dec_and_test(&server->ref, &r);
trace_afs_server(debug_id, r - 1, a, reason);
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 2e7526ea883a..93ad86ff3345 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -172,7 +172,7 @@ static void afs_issue_write_worker(struct work_struct *work)
void afs_issue_write(struct netfs_io_subrequest *subreq)
{
subreq->work.func = afs_issue_write_worker;
- if (!queue_work(system_unbound_wq, &subreq->work))
+ if (!queue_work(system_dfl_wq, &subreq->work))
WARN_ON_ONCE(1);
}
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 257af259c04a..febf13a49f0b 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -1042,6 +1042,9 @@ void yfs_fs_rename(struct afs_operation *op)
_enter("");
+ if (!test_bit(AFS_SERVER_FL_NO_RENAME2, &op->server->flags))
+ return yfs_fs_rename_replace(op);
+
call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename,
sizeof(__be32) +
sizeof(struct yfs_xdr_RPCFlags) +
@@ -1071,6 +1074,252 @@ void yfs_fs_rename(struct afs_operation *op)
}
/*
+ * Deliver reply data to a YFS.Rename_NoReplace operation. This does not
+ * return the status of a displaced target inode as there cannot be one.
+ */
+static int yfs_deliver_fs_rename_1(struct afs_call *call)
+{
+ struct afs_operation *op = call->op;
+ struct afs_vnode_param *orig_dvp = &op->file[0];
+ struct afs_vnode_param *new_dvp = &op->file[1];
+ struct afs_vnode_param *old_vp = &op->more_files[0];
+ const __be32 *bp;
+ int ret;
+
+ _enter("{%u}", call->unmarshall);
+
+ ret = afs_transfer_reply(call);
+ if (ret < 0)
+ return ret;
+
+ bp = call->buffer;
+ /* If the two dirs are the same, we have two copies of the same status
+ * report, so we just decode it twice.
+ */
+ xdr_decode_YFSFetchStatus(&bp, call, &orig_dvp->scb);
+ xdr_decode_YFSFid(&bp, &old_vp->fid);
+ xdr_decode_YFSFetchStatus(&bp, call, &old_vp->scb);
+ xdr_decode_YFSFetchStatus(&bp, call, &new_dvp->scb);
+ xdr_decode_YFSVolSync(&bp, &op->volsync);
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * Deliver reply data to a YFS.Rename_Replace or a YFS.Rename_Exchange
+ * operation. These return the status of the displaced target inode if there
+ * was one.
+ */
+static int yfs_deliver_fs_rename_2(struct afs_call *call)
+{
+ struct afs_operation *op = call->op;
+ struct afs_vnode_param *orig_dvp = &op->file[0];
+ struct afs_vnode_param *new_dvp = &op->file[1];
+ struct afs_vnode_param *old_vp = &op->more_files[0];
+ struct afs_vnode_param *new_vp = &op->more_files[1];
+ const __be32 *bp;
+ int ret;
+
+ _enter("{%u}", call->unmarshall);
+
+ ret = afs_transfer_reply(call);
+ if (ret < 0)
+ return ret;
+
+ bp = call->buffer;
+ /* If the two dirs are the same, we have two copies of the same status
+ * report, so we just decode it twice.
+ */
+ xdr_decode_YFSFetchStatus(&bp, call, &orig_dvp->scb);
+ xdr_decode_YFSFid(&bp, &old_vp->fid);
+ xdr_decode_YFSFetchStatus(&bp, call, &old_vp->scb);
+ xdr_decode_YFSFetchStatus(&bp, call, &new_dvp->scb);
+ xdr_decode_YFSFid(&bp, &new_vp->fid);
+ xdr_decode_YFSFetchStatus(&bp, call, &new_vp->scb);
+ xdr_decode_YFSVolSync(&bp, &op->volsync);
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+static void yfs_done_fs_rename_replace(struct afs_call *call)
+{
+ if (call->error == -ECONNABORTED &&
+ (call->abort_code == RX_INVALID_OPERATION ||
+ call->abort_code == RXGEN_OPCODE)) {
+ set_bit(AFS_SERVER_FL_NO_RENAME2, &call->op->server->flags);
+ call->op->flags |= AFS_OPERATION_DOWNGRADE;
+ }
+}
+
+/*
+ * YFS.Rename_Replace operation type
+ */
+static const struct afs_call_type yfs_RXYFSRename_Replace = {
+ .name = "FS.Rename_Replace",
+ .op = yfs_FS_Rename_Replace,
+ .deliver = yfs_deliver_fs_rename_2,
+ .done = yfs_done_fs_rename_replace,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * YFS.Rename_NoReplace operation type
+ */
+static const struct afs_call_type yfs_RXYFSRename_NoReplace = {
+ .name = "FS.Rename_NoReplace",
+ .op = yfs_FS_Rename_NoReplace,
+ .deliver = yfs_deliver_fs_rename_1,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * YFS.Rename_Exchange operation type
+ */
+static const struct afs_call_type yfs_RXYFSRename_Exchange = {
+ .name = "FS.Rename_Exchange",
+ .op = yfs_FS_Rename_Exchange,
+ .deliver = yfs_deliver_fs_rename_2,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Rename a file or directory, replacing the target if it exists. The status
+ * of a displaced target is returned.
+ */
+void yfs_fs_rename_replace(struct afs_operation *op)
+{
+ struct afs_vnode_param *orig_dvp = &op->file[0];
+ struct afs_vnode_param *new_dvp = &op->file[1];
+ const struct qstr *orig_name = &op->dentry->d_name;
+ const struct qstr *new_name = &op->dentry_2->d_name;
+ struct afs_call *call;
+ __be32 *bp;
+
+ _enter("");
+
+ call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_Replace,
+ sizeof(__be32) +
+ sizeof(struct yfs_xdr_RPCFlags) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(orig_name->len) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(new_name->len),
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return afs_op_nomem(op);
+
+ /* Marshall the parameters. */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSRENAME_REPLACE);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &orig_dvp->fid);
+ bp = xdr_encode_name(bp, orig_name);
+ bp = xdr_encode_YFSFid(bp, &new_dvp->fid);
+ bp = xdr_encode_name(bp, new_name);
+ yfs_check_req(call, bp);
+
+ call->fid = orig_dvp->fid;
+ trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name);
+ afs_make_op_call(op, call, GFP_NOFS);
+}
+
+/*
+ * Rename a file or directory, failing if the target dirent exists.
+ */
+void yfs_fs_rename_noreplace(struct afs_operation *op)
+{
+ struct afs_vnode_param *orig_dvp = &op->file[0];
+ struct afs_vnode_param *new_dvp = &op->file[1];
+ const struct qstr *orig_name = &op->dentry->d_name;
+ const struct qstr *new_name = &op->dentry_2->d_name;
+ struct afs_call *call;
+ __be32 *bp;
+
+ _enter("");
+
+ call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_NoReplace,
+ sizeof(__be32) +
+ sizeof(struct yfs_xdr_RPCFlags) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(orig_name->len) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(new_name->len),
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return afs_op_nomem(op);
+
+ /* Marshall the parameters. */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSRENAME_NOREPLACE);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &orig_dvp->fid);
+ bp = xdr_encode_name(bp, orig_name);
+ bp = xdr_encode_YFSFid(bp, &new_dvp->fid);
+ bp = xdr_encode_name(bp, new_name);
+ yfs_check_req(call, bp);
+
+ call->fid = orig_dvp->fid;
+ trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name);
+ afs_make_op_call(op, call, GFP_NOFS);
+}
+
+/*
+ * Exchange a pair of files or directories.
+ */
+void yfs_fs_rename_exchange(struct afs_operation *op)
+{
+ struct afs_vnode_param *orig_dvp = &op->file[0];
+ struct afs_vnode_param *new_dvp = &op->file[1];
+ const struct qstr *orig_name = &op->dentry->d_name;
+ const struct qstr *new_name = &op->dentry_2->d_name;
+ struct afs_call *call;
+ __be32 *bp;
+
+ _enter("");
+
+ call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_Exchange,
+ sizeof(__be32) +
+ sizeof(struct yfs_xdr_RPCFlags) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(orig_name->len) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(new_name->len),
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return afs_op_nomem(op);
+
+ /* Marshall the parameters. */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSRENAME_EXCHANGE);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &orig_dvp->fid);
+ bp = xdr_encode_name(bp, orig_name);
+ bp = xdr_encode_YFSFid(bp, &new_dvp->fid);
+ bp = xdr_encode_name(bp, new_name);
+ yfs_check_req(call, bp);
+
+ call->fid = orig_dvp->fid;
+ trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name);
+ afs_make_op_call(op, call, GFP_NOFS);
+}
+
+/*
* YFS.StoreData64 operation type.
*/
static const struct afs_call_type yfs_RXYFSStoreData64 = {
diff --git a/fs/aio.c b/fs/aio.c
index 7fc7b6221312..6002617f078c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -636,7 +636,7 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
/* Synchronize against RCU protected table->table[] dereferences */
INIT_RCU_WORK(&ctx->free_rwork, free_ioctx);
- queue_rcu_work(system_wq, &ctx->free_rwork);
+ queue_rcu_work(system_percpu_wq, &ctx->free_rwork);
}
/*
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 1d847a939f29..180a458fc4f7 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -129,7 +129,7 @@ struct inode *anon_inode_make_secure_inode(struct super_block *sb, const char *n
}
return inode;
}
-EXPORT_SYMBOL_GPL_FOR_MODULES(anon_inode_make_secure_inode, "kvm");
+EXPORT_SYMBOL_FOR_MODULES(anon_inode_make_secure_inode, "kvm");
static struct file *__anon_inode_getfile(const char *name,
const struct file_operations *fops,
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
deleted file mode 100644
index 8cb2b9d5da96..000000000000
--- a/fs/bcachefs/Kconfig
+++ /dev/null
@@ -1,121 +0,0 @@
-
-config BCACHEFS_FS
- tristate "bcachefs filesystem support (EXPERIMENTAL)"
- depends on BLOCK
- select EXPORTFS
- select CLOSURES
- select CRC32
- select CRC64
- select FS_POSIX_ACL
- select LZ4_COMPRESS
- select LZ4_DECOMPRESS
- select LZ4HC_COMPRESS
- select LZ4HC_DECOMPRESS
- select ZLIB_DEFLATE
- select ZLIB_INFLATE
- select ZSTD_COMPRESS
- select ZSTD_DECOMPRESS
- select CRYPTO_LIB_SHA256
- select CRYPTO_LIB_CHACHA
- select CRYPTO_LIB_POLY1305
- select KEYS
- select RAID6_PQ
- select XOR_BLOCKS
- select XXHASH
- select SRCU
- select SYMBOLIC_ERRNAME
- select MIN_HEAP
- select XARRAY_MULTI
- help
- The bcachefs filesystem - a modern, copy on write filesystem, with
- support for multiple devices, compression, checksumming, etc.
-
-config BCACHEFS_QUOTA
- bool "bcachefs quota support"
- depends on BCACHEFS_FS
- select QUOTACTL
-
-config BCACHEFS_ERASURE_CODING
- bool "bcachefs erasure coding (RAID5/6) support (EXPERIMENTAL)"
- depends on BCACHEFS_FS
- select QUOTACTL
- help
- This enables the "erasure_code" filesysystem and inode option, which
- organizes data into reed-solomon stripes instead of ordinary
- replication.
-
- WARNING: this feature is still undergoing on disk format changes, and
- should only be enabled for testing purposes.
-
-config BCACHEFS_POSIX_ACL
- bool "bcachefs POSIX ACL support"
- depends on BCACHEFS_FS
- select FS_POSIX_ACL
-
-config BCACHEFS_DEBUG
- bool "bcachefs debugging"
- depends on BCACHEFS_FS
- help
- Enables many extra debugging checks and assertions.
-
- The resulting code will be significantly slower than normal; you
- probably shouldn't select this option unless you're a developer.
-
-config BCACHEFS_INJECT_TRANSACTION_RESTARTS
- bool "Randomly inject transaction restarts"
- depends on BCACHEFS_DEBUG
- help
- Randomly inject transaction restarts in a few core paths - may have a
- significant performance penalty
-
-config BCACHEFS_TESTS
- bool "bcachefs unit and performance tests"
- depends on BCACHEFS_FS
- help
- Include some unit and performance tests for the core btree code
-
-config BCACHEFS_LOCK_TIME_STATS
- bool "bcachefs lock time statistics"
- depends on BCACHEFS_FS
- help
- Expose statistics for how long we held a lock in debugfs
-
-config BCACHEFS_NO_LATENCY_ACCT
- bool "disable latency accounting and time stats"
- depends on BCACHEFS_FS
- help
- This disables device latency tracking and time stats, only for performance testing
-
-config BCACHEFS_SIX_OPTIMISTIC_SPIN
- bool "Optimistic spinning for six locks"
- depends on BCACHEFS_FS
- depends on SMP
- default y
- help
- Instead of immediately sleeping when attempting to take a six lock that
- is held by another thread, spin for a short while, as long as the
- thread owning the lock is running.
-
-config BCACHEFS_PATH_TRACEPOINTS
- bool "Extra btree_path tracepoints"
- depends on BCACHEFS_FS && TRACING
- help
- Enable extra tracepoints for debugging btree_path operations; we don't
- normally want these enabled because they happen at very high rates.
-
-config BCACHEFS_TRANS_KMALLOC_TRACE
- bool "Trace bch2_trans_kmalloc() calls"
- depends on BCACHEFS_FS
-
-config BCACHEFS_ASYNC_OBJECT_LISTS
- bool "Keep async objects on fast_lists for debugfs visibility"
- depends on BCACHEFS_FS && DEBUG_FS
-
-config MEAN_AND_VARIANCE_UNIT_TEST
- tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
- depends on KUNIT
- depends on BCACHEFS_FS
- default KUNIT_ALL_TESTS
- help
- This option enables the kunit tests for mean_and_variance module.
- If unsure, say N.
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
deleted file mode 100644
index 93c8ee5425c8..000000000000
--- a/fs/bcachefs/Makefile
+++ /dev/null
@@ -1,107 +0,0 @@
-
-obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o
-
-bcachefs-y := \
- acl.o \
- alloc_background.o \
- alloc_foreground.o \
- backpointers.o \
- bkey.o \
- bkey_methods.o \
- bkey_sort.o \
- bset.o \
- btree_cache.o \
- btree_gc.o \
- btree_io.o \
- btree_iter.o \
- btree_journal_iter.o \
- btree_key_cache.o \
- btree_locking.o \
- btree_node_scan.o \
- btree_trans_commit.o \
- btree_update.o \
- btree_update_interior.o \
- btree_write_buffer.o \
- buckets.o \
- buckets_waiting_for_journal.o \
- chardev.o \
- checksum.o \
- clock.o \
- compress.o \
- darray.o \
- data_update.o \
- debug.o \
- dirent.o \
- disk_accounting.o \
- disk_groups.o \
- ec.o \
- enumerated_ref.o \
- errcode.o \
- error.o \
- extents.o \
- extent_update.o \
- eytzinger.o \
- fast_list.o \
- fs.o \
- fs-ioctl.o \
- fs-io.o \
- fs-io-buffered.o \
- fs-io-direct.o \
- fs-io-pagecache.o \
- fsck.o \
- inode.o \
- io_read.o \
- io_misc.o \
- io_write.o \
- journal.o \
- journal_io.o \
- journal_reclaim.o \
- journal_sb.o \
- journal_seq_blacklist.o \
- keylist.o \
- logged_ops.o \
- lru.o \
- mean_and_variance.o \
- migrate.o \
- move.o \
- movinggc.o \
- namei.o \
- nocow_locking.o \
- opts.o \
- printbuf.o \
- progress.o \
- quota.o \
- rebalance.o \
- rcu_pending.o \
- recovery.o \
- recovery_passes.o \
- reflink.o \
- replicas.o \
- sb-clean.o \
- sb-counters.o \
- sb-downgrade.o \
- sb-errors.o \
- sb-members.o \
- siphash.o \
- six.o \
- snapshot.o \
- str_hash.o \
- subvolume.o \
- super.o \
- super-io.o \
- sysfs.o \
- tests.o \
- time_stats.o \
- thread_with_file.o \
- trace.o \
- two_state_shared_lock.o \
- util.o \
- varint.o \
- xattr.o
-
-bcachefs-$(CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS) += async_objs.o
-
-obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
-
-# Silence "note: xyz changed in GCC X.X" messages
-subdir-ccflags-y += $(call cc-disable-warning, psabi)
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
deleted file mode 100644
index d03adc36100e..000000000000
--- a/fs/bcachefs/acl.c
+++ /dev/null
@@ -1,445 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-
-#include "acl.h"
-#include "xattr.h"
-
-#include <linux/posix_acl.h>
-
-static const char * const acl_types[] = {
- [ACL_USER_OBJ] = "user_obj",
- [ACL_USER] = "user",
- [ACL_GROUP_OBJ] = "group_obj",
- [ACL_GROUP] = "group",
- [ACL_MASK] = "mask",
- [ACL_OTHER] = "other",
- NULL,
-};
-
-void bch2_acl_to_text(struct printbuf *out, const void *value, size_t size)
-{
- const void *p, *end = value + size;
-
- if (!value ||
- size < sizeof(bch_acl_header) ||
- ((bch_acl_header *)value)->a_version != cpu_to_le32(BCH_ACL_VERSION))
- return;
-
- p = value + sizeof(bch_acl_header);
- while (p < end) {
- const bch_acl_entry *in = p;
- unsigned tag = le16_to_cpu(in->e_tag);
-
- prt_str(out, acl_types[tag]);
-
- switch (tag) {
- case ACL_USER_OBJ:
- case ACL_GROUP_OBJ:
- case ACL_MASK:
- case ACL_OTHER:
- p += sizeof(bch_acl_entry_short);
- break;
- case ACL_USER:
- prt_printf(out, " uid %u", le32_to_cpu(in->e_id));
- p += sizeof(bch_acl_entry);
- break;
- case ACL_GROUP:
- prt_printf(out, " gid %u", le32_to_cpu(in->e_id));
- p += sizeof(bch_acl_entry);
- break;
- }
-
- prt_printf(out, " %o", le16_to_cpu(in->e_perm));
-
- if (p != end)
- prt_char(out, ' ');
- }
-}
-
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
-
-#include "fs.h"
-
-#include <linux/fs.h>
-#include <linux/posix_acl_xattr.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-
-static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long)
-{
- return sizeof(bch_acl_header) +
- sizeof(bch_acl_entry_short) * nr_short +
- sizeof(bch_acl_entry) * nr_long;
-}
-
-static inline int acl_to_xattr_type(int type)
-{
- switch (type) {
- case ACL_TYPE_ACCESS:
- return KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS;
- case ACL_TYPE_DEFAULT:
- return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT;
- default:
- BUG();
- }
-}
-
-/*
- * Convert from filesystem to in-memory representation.
- */
-static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans,
- const void *value, size_t size)
-{
- const void *p, *end = value + size;
- struct posix_acl *acl;
- struct posix_acl_entry *out;
- unsigned count = 0;
- int ret;
-
- if (!value)
- return NULL;
- if (size < sizeof(bch_acl_header))
- goto invalid;
- if (((bch_acl_header *)value)->a_version !=
- cpu_to_le32(BCH_ACL_VERSION))
- goto invalid;
-
- p = value + sizeof(bch_acl_header);
- while (p < end) {
- const bch_acl_entry *entry = p;
-
- if (p + sizeof(bch_acl_entry_short) > end)
- goto invalid;
-
- switch (le16_to_cpu(entry->e_tag)) {
- case ACL_USER_OBJ:
- case ACL_GROUP_OBJ:
- case ACL_MASK:
- case ACL_OTHER:
- p += sizeof(bch_acl_entry_short);
- break;
- case ACL_USER:
- case ACL_GROUP:
- p += sizeof(bch_acl_entry);
- break;
- default:
- goto invalid;
- }
-
- count++;
- }
-
- if (p > end)
- goto invalid;
-
- if (!count)
- return NULL;
-
- acl = allocate_dropping_locks(trans, ret,
- posix_acl_alloc(count, _gfp));
- if (!acl)
- return ERR_PTR(-ENOMEM);
- if (ret) {
- kfree(acl);
- return ERR_PTR(ret);
- }
-
- out = acl->a_entries;
-
- p = value + sizeof(bch_acl_header);
- while (p < end) {
- const bch_acl_entry *in = p;
-
- out->e_tag = le16_to_cpu(in->e_tag);
- out->e_perm = le16_to_cpu(in->e_perm);
-
- switch (out->e_tag) {
- case ACL_USER_OBJ:
- case ACL_GROUP_OBJ:
- case ACL_MASK:
- case ACL_OTHER:
- p += sizeof(bch_acl_entry_short);
- break;
- case ACL_USER:
- out->e_uid = make_kuid(&init_user_ns,
- le32_to_cpu(in->e_id));
- p += sizeof(bch_acl_entry);
- break;
- case ACL_GROUP:
- out->e_gid = make_kgid(&init_user_ns,
- le32_to_cpu(in->e_id));
- p += sizeof(bch_acl_entry);
- break;
- }
-
- out++;
- }
-
- BUG_ON(out != acl->a_entries + acl->a_count);
-
- return acl;
-invalid:
- pr_err("invalid acl entry");
- return ERR_PTR(-EINVAL);
-}
-
-/*
- * Convert from in-memory to filesystem representation.
- */
-static struct bkey_i_xattr *
-bch2_acl_to_xattr(struct btree_trans *trans,
- const struct posix_acl *acl,
- int type)
-{
- struct bkey_i_xattr *xattr;
- bch_acl_header *acl_header;
- const struct posix_acl_entry *acl_e, *pe;
- void *outptr;
- unsigned nr_short = 0, nr_long = 0, acl_len, u64s;
-
- FOREACH_ACL_ENTRY(acl_e, acl, pe) {
- switch (acl_e->e_tag) {
- case ACL_USER:
- case ACL_GROUP:
- nr_long++;
- break;
- case ACL_USER_OBJ:
- case ACL_GROUP_OBJ:
- case ACL_MASK:
- case ACL_OTHER:
- nr_short++;
- break;
- default:
- return ERR_PTR(-EINVAL);
- }
- }
-
- acl_len = bch2_acl_size(nr_short, nr_long);
- u64s = BKEY_U64s + xattr_val_u64s(0, acl_len);
-
- if (u64s > U8_MAX)
- return ERR_PTR(-E2BIG);
-
- xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
- if (IS_ERR(xattr))
- return xattr;
-
- bkey_xattr_init(&xattr->k_i);
- xattr->k.u64s = u64s;
- xattr->v.x_type = acl_to_xattr_type(type);
- xattr->v.x_name_len = 0;
- xattr->v.x_val_len = cpu_to_le16(acl_len);
-
- acl_header = xattr_val(&xattr->v);
- acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION);
-
- outptr = (void *) acl_header + sizeof(*acl_header);
-
- FOREACH_ACL_ENTRY(acl_e, acl, pe) {
- bch_acl_entry *entry = outptr;
-
- entry->e_tag = cpu_to_le16(acl_e->e_tag);
- entry->e_perm = cpu_to_le16(acl_e->e_perm);
- switch (acl_e->e_tag) {
- case ACL_USER:
- entry->e_id = cpu_to_le32(
- from_kuid(&init_user_ns, acl_e->e_uid));
- outptr += sizeof(bch_acl_entry);
- break;
- case ACL_GROUP:
- entry->e_id = cpu_to_le32(
- from_kgid(&init_user_ns, acl_e->e_gid));
- outptr += sizeof(bch_acl_entry);
- break;
-
- case ACL_USER_OBJ:
- case ACL_GROUP_OBJ:
- case ACL_MASK:
- case ACL_OTHER:
- outptr += sizeof(bch_acl_entry_short);
- break;
- }
- }
-
- BUG_ON(outptr != xattr_val(&xattr->v) + acl_len);
-
- return xattr;
-}
-
-struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu)
-{
- struct bch_inode_info *inode = to_bch_ei(vinode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
- struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
- struct btree_iter iter = {};
- struct posix_acl *acl = NULL;
-
- if (rcu)
- return ERR_PTR(-ECHILD);
-
- struct btree_trans *trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
-
- struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
- &hash, inode_inum(inode), &search, 0);
- int ret = bkey_err(k);
- if (ret)
- goto err;
-
- struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
- acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
- le16_to_cpu(xattr.v->x_val_len));
- ret = PTR_ERR_OR_ZERO(acl);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- if (ret)
- acl = !bch2_err_matches(ret, ENOENT) ? ERR_PTR(ret) : NULL;
-
- if (!IS_ERR_OR_NULL(acl))
- set_cached_acl(&inode->v, type, acl);
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return acl;
-}
-
-int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
- struct bch_inode_unpacked *inode_u,
- struct posix_acl *acl, int type)
-{
- struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u);
- int ret;
-
- if (type == ACL_TYPE_DEFAULT &&
- !S_ISDIR(inode_u->bi_mode))
- return acl ? -EACCES : 0;
-
- if (acl) {
- struct bkey_i_xattr *xattr =
- bch2_acl_to_xattr(trans, acl, type);
- if (IS_ERR(xattr))
- return PTR_ERR(xattr);
-
- ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info,
- inum, &xattr->k_i, 0);
- } else {
- struct xattr_search_key search =
- X_SEARCH(acl_to_xattr_type(type), "", 0);
-
- ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info,
- inum, &search);
- }
-
- return bch2_err_matches(ret, ENOENT) ? 0 : ret;
-}
-
-int bch2_set_acl(struct mnt_idmap *idmap,
- struct dentry *dentry,
- struct posix_acl *_acl, int type)
-{
- struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_iter inode_iter = {};
- struct bch_inode_unpacked inode_u;
- struct posix_acl *acl;
- umode_t mode;
- int ret;
-
- mutex_lock(&inode->ei_update_lock);
- struct btree_trans *trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
- acl = _acl;
-
- ret = bch2_subvol_is_ro_trans(trans, inode->ei_inum.subvol) ?:
- bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
- BTREE_ITER_intent);
- if (ret)
- goto btree_err;
-
- mode = inode_u.bi_mode;
-
- if (type == ACL_TYPE_ACCESS) {
- ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl);
- if (ret)
- goto btree_err;
- }
-
- ret = bch2_set_acl_trans(trans, inode_inum(inode), &inode_u, acl, type);
- if (ret)
- goto btree_err;
-
- inode_u.bi_ctime = bch2_current_time(c);
- inode_u.bi_mode = mode;
-
- ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
- bch2_trans_commit(trans, NULL, NULL, 0);
-btree_err:
- bch2_trans_iter_exit(trans, &inode_iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
- if (unlikely(ret))
- goto err;
-
- bch2_inode_update_after_write(trans, inode, &inode_u,
- ATTR_CTIME|ATTR_MODE);
-
- set_cached_acl(&inode->v, type, acl);
-err:
- bch2_trans_put(trans);
- mutex_unlock(&inode->ei_update_lock);
-
- return ret;
-}
-
-int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
- struct bch_inode_unpacked *inode,
- umode_t mode,
- struct posix_acl **new_acl)
-{
- struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode);
- struct xattr_search_key search = X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0);
- struct btree_iter iter;
- struct posix_acl *acl = NULL;
-
- struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
- &hash_info, inum, &search, BTREE_ITER_intent);
- int ret = bkey_err(k);
- if (ret)
- return bch2_err_matches(ret, ENOENT) ? 0 : ret;
-
- struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
-
- acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
- le16_to_cpu(xattr.v->x_val_len));
- ret = PTR_ERR_OR_ZERO(acl);
- if (ret)
- goto err;
-
- ret = allocate_dropping_locks_errcode(trans, __posix_acl_chmod(&acl, _gfp, mode));
- if (ret)
- goto err;
-
- struct bkey_i_xattr *new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS);
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto err;
-
- new->k.p = iter.pos;
- ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
- *new_acl = acl;
- acl = NULL;
-err:
- bch2_trans_iter_exit(trans, &iter);
- if (!IS_ERR_OR_NULL(acl))
- kfree(acl);
- return ret;
-}
-
-#endif /* CONFIG_BCACHEFS_POSIX_ACL */
diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h
deleted file mode 100644
index fe730a6bf0c1..000000000000
--- a/fs/bcachefs/acl.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ACL_H
-#define _BCACHEFS_ACL_H
-
-struct bch_inode_unpacked;
-struct bch_hash_info;
-struct bch_inode_info;
-struct posix_acl;
-
-#define BCH_ACL_VERSION 0x0001
-
-typedef struct {
- __le16 e_tag;
- __le16 e_perm;
- __le32 e_id;
-} bch_acl_entry;
-
-typedef struct {
- __le16 e_tag;
- __le16 e_perm;
-} bch_acl_entry_short;
-
-typedef struct {
- __le32 a_version;
-} bch_acl_header;
-
-void bch2_acl_to_text(struct printbuf *, const void *, size_t);
-
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
-
-struct posix_acl *bch2_get_acl(struct inode *, int, bool);
-
-int bch2_set_acl_trans(struct btree_trans *, subvol_inum,
- struct bch_inode_unpacked *,
- struct posix_acl *, int);
-int bch2_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int);
-int bch2_acl_chmod(struct btree_trans *, subvol_inum,
- struct bch_inode_unpacked *,
- umode_t, struct posix_acl **);
-
-#else
-
-static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
- struct bch_inode_unpacked *inode_u,
- struct posix_acl *acl, int type)
-{
- return 0;
-}
-
-static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
- struct bch_inode_unpacked *inode,
- umode_t mode,
- struct posix_acl **new_acl)
-{
- return 0;
-}
-
-#endif /* CONFIG_BCACHEFS_POSIX_ACL */
-
-#endif /* _BCACHEFS_ACL_H */
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
deleted file mode 100644
index 66de46318620..000000000000
--- a/fs/bcachefs/alloc_background.c
+++ /dev/null
@@ -1,2680 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_key_cache.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_gc.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "buckets_waiting_for_journal.h"
-#include "clock.h"
-#include "debug.h"
-#include "disk_accounting.h"
-#include "ec.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "lru.h"
-#include "recovery.h"
-#include "varint.h"
-
-#include <linux/kthread.h>
-#include <linux/math64.h>
-#include <linux/random.h>
-#include <linux/rculist.h>
-#include <linux/rcupdate.h>
-#include <linux/sched/task.h>
-#include <linux/sort.h>
-#include <linux/jiffies.h>
-
-static void bch2_discard_one_bucket_fast(struct bch_dev *, u64);
-
-/* Persistent alloc info: */
-
-static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
-#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
- BCH_ALLOC_FIELDS_V1()
-#undef x
-};
-
-struct bkey_alloc_unpacked {
- u64 journal_seq;
- u8 gen;
- u8 oldest_gen;
- u8 data_type;
- bool need_discard:1;
- bool need_inc_gen:1;
-#define x(_name, _bits) u##_bits _name;
- BCH_ALLOC_FIELDS_V2()
-#undef x
-};
-
-static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
- const void **p, unsigned field)
-{
- unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
- u64 v;
-
- if (!(a->fields & (1 << field)))
- return 0;
-
- switch (bytes) {
- case 1:
- v = *((const u8 *) *p);
- break;
- case 2:
- v = le16_to_cpup(*p);
- break;
- case 4:
- v = le32_to_cpup(*p);
- break;
- case 8:
- v = le64_to_cpup(*p);
- break;
- default:
- BUG();
- }
-
- *p += bytes;
- return v;
-}
-
-static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
- struct bkey_s_c k)
-{
- const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
- const void *d = in->data;
- unsigned idx = 0;
-
- out->gen = in->gen;
-
-#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
- BCH_ALLOC_FIELDS_V1()
-#undef x
-}
-
-static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
- struct bkey_s_c k)
-{
- struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
- const u8 *in = a.v->data;
- const u8 *end = bkey_val_end(a);
- unsigned fieldnr = 0;
- int ret;
- u64 v;
-
- out->gen = a.v->gen;
- out->oldest_gen = a.v->oldest_gen;
- out->data_type = a.v->data_type;
-
-#define x(_name, _bits) \
- if (fieldnr < a.v->nr_fields) { \
- ret = bch2_varint_decode_fast(in, end, &v); \
- if (ret < 0) \
- return ret; \
- in += ret; \
- } else { \
- v = 0; \
- } \
- out->_name = v; \
- if (v != out->_name) \
- return -1; \
- fieldnr++;
-
- BCH_ALLOC_FIELDS_V2()
-#undef x
- return 0;
-}
-
-static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
- struct bkey_s_c k)
-{
- struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k);
- const u8 *in = a.v->data;
- const u8 *end = bkey_val_end(a);
- unsigned fieldnr = 0;
- int ret;
- u64 v;
-
- out->gen = a.v->gen;
- out->oldest_gen = a.v->oldest_gen;
- out->data_type = a.v->data_type;
- out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v);
- out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v);
- out->journal_seq = le64_to_cpu(a.v->journal_seq);
-
-#define x(_name, _bits) \
- if (fieldnr < a.v->nr_fields) { \
- ret = bch2_varint_decode_fast(in, end, &v); \
- if (ret < 0) \
- return ret; \
- in += ret; \
- } else { \
- v = 0; \
- } \
- out->_name = v; \
- if (v != out->_name) \
- return -1; \
- fieldnr++;
-
- BCH_ALLOC_FIELDS_V2()
-#undef x
- return 0;
-}
-
-static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
-{
- struct bkey_alloc_unpacked ret = { .gen = 0 };
-
- switch (k.k->type) {
- case KEY_TYPE_alloc:
- bch2_alloc_unpack_v1(&ret, k);
- break;
- case KEY_TYPE_alloc_v2:
- bch2_alloc_unpack_v2(&ret, k);
- break;
- case KEY_TYPE_alloc_v3:
- bch2_alloc_unpack_v3(&ret, k);
- break;
- }
-
- return ret;
-}
-
-static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
-{
- unsigned i, bytes = offsetof(struct bch_alloc, data);
-
- for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
- if (a->fields & (1 << i))
- bytes += BCH_ALLOC_V1_FIELD_BYTES[i];
-
- return DIV_ROUND_UP(bytes, sizeof(u64));
-}
-
-int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
- int ret = 0;
-
- /* allow for unknown fields */
- bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v),
- c, alloc_v1_val_size_bad,
- "incorrect value size (%zu < %u)",
- bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
-fsck_err:
- return ret;
-}
-
-int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_alloc_unpacked u;
- int ret = 0;
-
- bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k),
- c, alloc_v2_unpack_error,
- "unpack error");
-fsck_err:
- return ret;
-}
-
-int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_alloc_unpacked u;
- int ret = 0;
-
- bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k),
- c, alloc_v3_unpack_error,
- "unpack error");
-fsck_err:
- return ret;
-}
-
-int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bch_alloc_v4 a;
- int ret = 0;
-
- bkey_val_copy(&a, bkey_s_c_to_alloc_v4(k));
-
- bkey_fsck_err_on(alloc_v4_u64s_noerror(&a) > bkey_val_u64s(k.k),
- c, alloc_v4_val_size_bad,
- "bad val size (%u > %zu)",
- alloc_v4_u64s_noerror(&a), bkey_val_u64s(k.k));
-
- bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(&a) &&
- BCH_ALLOC_V4_NR_BACKPOINTERS(&a),
- c, alloc_v4_backpointers_start_bad,
- "invalid backpointers_start");
-
- bkey_fsck_err_on(alloc_data_type(a, a.data_type) != a.data_type,
- c, alloc_key_data_type_bad,
- "invalid data type (got %u should be %u)",
- a.data_type, alloc_data_type(a, a.data_type));
-
- for (unsigned i = 0; i < 2; i++)
- bkey_fsck_err_on(a.io_time[i] > LRU_TIME_MAX,
- c, alloc_key_io_time_bad,
- "invalid io_time[%s]: %llu, max %llu",
- i == READ ? "read" : "write",
- a.io_time[i], LRU_TIME_MAX);
-
- unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(&a) * sizeof(u64) >
- offsetof(struct bch_alloc_v4, stripe_sectors)
- ? a.stripe_sectors
- : 0;
-
- switch (a.data_type) {
- case BCH_DATA_free:
- case BCH_DATA_need_gc_gens:
- case BCH_DATA_need_discard:
- bkey_fsck_err_on(stripe_sectors ||
- a.dirty_sectors ||
- a.cached_sectors ||
- a.stripe,
- c, alloc_key_empty_but_have_data,
- "empty data type free but have data %u.%u.%u %u",
- stripe_sectors,
- a.dirty_sectors,
- a.cached_sectors,
- a.stripe);
- break;
- case BCH_DATA_sb:
- case BCH_DATA_journal:
- case BCH_DATA_btree:
- case BCH_DATA_user:
- case BCH_DATA_parity:
- bkey_fsck_err_on(!a.dirty_sectors &&
- !stripe_sectors,
- c, alloc_key_dirty_sectors_0,
- "data_type %s but dirty_sectors==0",
- bch2_data_type_str(a.data_type));
- break;
- case BCH_DATA_cached:
- bkey_fsck_err_on(!a.cached_sectors ||
- a.dirty_sectors ||
- stripe_sectors ||
- a.stripe,
- c, alloc_key_cached_inconsistency,
- "data type inconsistency");
-
- bkey_fsck_err_on(!a.io_time[READ] &&
- !(c->recovery.passes_to_run &
- BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs)),
- c, alloc_key_cached_but_read_time_zero,
- "cached bucket with read_time == 0");
- break;
- case BCH_DATA_stripe:
- break;
- }
-fsck_err:
- return ret;
-}
-
-void bch2_alloc_v4_swab(struct bkey_s k)
-{
- struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
-
- a->journal_seq_nonempty = swab64(a->journal_seq_nonempty);
- a->journal_seq_empty = swab64(a->journal_seq_empty);
- a->flags = swab32(a->flags);
- a->dirty_sectors = swab32(a->dirty_sectors);
- a->cached_sectors = swab32(a->cached_sectors);
- a->io_time[0] = swab64(a->io_time[0]);
- a->io_time[1] = swab64(a->io_time[1]);
- a->stripe = swab32(a->stripe);
- a->nr_external_backpointers = swab32(a->nr_external_backpointers);
- a->stripe_sectors = swab32(a->stripe_sectors);
-}
-
-static inline void __bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c,
- unsigned dev, const struct bch_alloc_v4 *a)
-{
- struct bch_dev *ca = c ? bch2_dev_tryget_noerror(c, dev) : NULL;
-
- prt_newline(out);
- printbuf_indent_add(out, 2);
-
- prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen);
- bch2_prt_data_type(out, a->data_type);
- prt_newline(out);
- prt_printf(out, "journal_seq_nonempty %llu\n", a->journal_seq_nonempty);
- prt_printf(out, "journal_seq_empty %llu\n", a->journal_seq_empty);
- prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a));
- prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a));
- prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors);
- prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors);
- prt_printf(out, "cached_sectors %u\n", a->cached_sectors);
- prt_printf(out, "stripe %u\n", a->stripe);
- prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy);
- prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]);
- prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]);
-
- if (ca)
- prt_printf(out, "fragmentation %llu\n", alloc_lru_idx_fragmentation(*a, ca));
- prt_printf(out, "bp_start %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a));
- printbuf_indent_sub(out, 2);
-
- bch2_dev_put(ca);
-}
-
-void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
- struct bch_alloc_v4 _a;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
-
- __bch2_alloc_v4_to_text(out, c, k.k->p.inode, a);
-}
-
-void bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
- __bch2_alloc_v4_to_text(out, c, k.k->p.inode, bkey_s_c_to_alloc_v4(k).v);
-}
-
-void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
-{
- if (k.k->type == KEY_TYPE_alloc_v4) {
- void *src, *dst;
-
- *out = *bkey_s_c_to_alloc_v4(k).v;
-
- src = alloc_v4_backpointers(out);
- SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
- dst = alloc_v4_backpointers(out);
-
- if (src < dst)
- memset(src, 0, dst - src);
-
- SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0);
- } else {
- struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
-
- *out = (struct bch_alloc_v4) {
- .journal_seq_nonempty = u.journal_seq,
- .flags = u.need_discard,
- .gen = u.gen,
- .oldest_gen = u.oldest_gen,
- .data_type = u.data_type,
- .stripe_redundancy = u.stripe_redundancy,
- .dirty_sectors = u.dirty_sectors,
- .cached_sectors = u.cached_sectors,
- .io_time[READ] = u.read_time,
- .io_time[WRITE] = u.write_time,
- .stripe = u.stripe,
- };
-
- SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
- }
-}
-
-static noinline struct bkey_i_alloc_v4 *
-__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
-{
- struct bkey_i_alloc_v4 *ret;
-
- ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4)));
- if (IS_ERR(ret))
- return ret;
-
- if (k.k->type == KEY_TYPE_alloc_v4) {
- void *src, *dst;
-
- bkey_reassemble(&ret->k_i, k);
-
- src = alloc_v4_backpointers(&ret->v);
- SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s);
- dst = alloc_v4_backpointers(&ret->v);
-
- if (src < dst)
- memset(src, 0, dst - src);
-
- SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0);
- set_alloc_v4_u64s(ret);
- } else {
- bkey_alloc_v4_init(&ret->k_i);
- ret->k.p = k.k->p;
- bch2_alloc_to_v4(k, &ret->v);
- }
- return ret;
-}
-
-static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k)
-{
- struct bkey_s_c_alloc_v4 a;
-
- if (likely(k.k->type == KEY_TYPE_alloc_v4) &&
- ((a = bkey_s_c_to_alloc_v4(k), true) &&
- BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0))
- return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4);
-
- return __bch2_alloc_to_v4_mut(trans, k);
-}
-
-struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
-{
- return bch2_alloc_to_v4_mut_inlined(trans, k);
-}
-
-struct bkey_i_alloc_v4 *
-bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos pos)
-{
- struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos,
- BTREE_ITER_with_updates|
- BTREE_ITER_cached|
- BTREE_ITER_intent);
- int ret = bkey_err(k);
- if (unlikely(ret))
- return ERR_PTR(ret);
-
- struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k);
- ret = PTR_ERR_OR_ZERO(a);
- if (unlikely(ret))
- goto err;
- return a;
-err:
- bch2_trans_iter_exit(trans, iter);
- return ERR_PTR(ret);
-}
-
-__flatten
-struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos,
- enum btree_iter_update_trigger_flags flags)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, pos,
- BTREE_ITER_with_updates|
- BTREE_ITER_cached|
- BTREE_ITER_intent);
- int ret = bkey_err(k);
- if (unlikely(ret))
- return ERR_PTR(ret);
-
- if ((void *) k.v >= trans->mem &&
- (void *) k.v < trans->mem + trans->mem_top) {
- bch2_trans_iter_exit(trans, &iter);
- return container_of(bkey_s_c_to_alloc_v4(k).v, struct bkey_i_alloc_v4, v);
- }
-
- struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k);
- if (IS_ERR(a)) {
- bch2_trans_iter_exit(trans, &iter);
- return a;
- }
-
- ret = bch2_trans_update_ip(trans, &iter, &a->k_i, flags, _RET_IP_);
- bch2_trans_iter_exit(trans, &iter);
- return unlikely(ret) ? ERR_PTR(ret) : a;
-}
-
-static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset)
-{
- *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK;
-
- pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS;
- return pos;
-}
-
-static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset)
-{
- pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS;
- pos.offset += offset;
- return pos;
-}
-
-static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
-{
- return k.k->type == KEY_TYPE_bucket_gens
- ? bkey_s_c_to_bucket_gens(k).v->gens[offset]
- : 0;
-}
-
-int bch2_bucket_gens_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens),
- c, bucket_gens_val_size_bad,
- "bad val size (%zu != %zu)",
- bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
-fsck_err:
- return ret;
-}
-
-void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k);
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) {
- if (i)
- prt_char(out, ' ');
- prt_printf(out, "%u", g.v->gens[i]);
- }
-}
-
-int bch2_bucket_gens_init(struct bch_fs *c)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct bkey_i_bucket_gens g;
- bool have_bucket_gens_key = false;
- int ret;
-
- ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_prefetch, k, ({
- /*
- * Not a fsck error because this is checked/repaired by
- * bch2_check_alloc_key() which runs later:
- */
- if (!bch2_dev_bucket_exists(c, k.k->p))
- continue;
-
- struct bch_alloc_v4 a;
- u8 gen = bch2_alloc_to_v4(k, &a)->gen;
- unsigned offset;
- struct bpos pos = alloc_gens_pos(iter.pos, &offset);
- int ret2 = 0;
-
- if (have_bucket_gens_key && !bkey_eq(g.k.p, pos)) {
- ret2 = bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
- if (ret2)
- goto iter_err;
- have_bucket_gens_key = false;
- }
-
- if (!have_bucket_gens_key) {
- bkey_bucket_gens_init(&g.k_i);
- g.k.p = pos;
- have_bucket_gens_key = true;
- }
-
- g.v.gens[offset] = gen;
-iter_err:
- ret2;
- }));
-
- if (have_bucket_gens_key && !ret)
- ret = commit_do(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
- bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
-
- bch2_trans_put(trans);
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_alloc_read(struct bch_fs *c)
-{
- down_read(&c->state_lock);
-
- struct btree_trans *trans = bch2_trans_get(c);
- struct bch_dev *ca = NULL;
- int ret;
-
- if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
- ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
- BTREE_ITER_prefetch, k, ({
- u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
- u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
-
- if (k.k->type != KEY_TYPE_bucket_gens)
- continue;
-
- ca = bch2_dev_iterate(c, ca, k.k->p.inode);
- /*
- * Not a fsck error because this is checked/repaired by
- * bch2_check_alloc_key() which runs later:
- */
- if (!ca) {
- bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
- continue;
- }
-
- const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v;
-
- for (u64 b = max_t(u64, ca->mi.first_bucket, start);
- b < min_t(u64, ca->mi.nbuckets, end);
- b++)
- *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK];
- 0;
- }));
- } else {
- ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_prefetch, k, ({
- ca = bch2_dev_iterate(c, ca, k.k->p.inode);
- /*
- * Not a fsck error because this is checked/repaired by
- * bch2_check_alloc_key() which runs later:
- */
- if (!ca) {
- bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
- continue;
- }
-
- if (k.k->p.offset < ca->mi.first_bucket) {
- bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode, ca->mi.first_bucket));
- continue;
- }
-
- if (k.k->p.offset >= ca->mi.nbuckets) {
- bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
- continue;
- }
-
- struct bch_alloc_v4 a;
- *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
- 0;
- }));
- }
-
- bch2_dev_put(ca);
- bch2_trans_put(trans);
-
- up_read(&c->state_lock);
- bch_err_fn(c, ret);
- return ret;
-}
-
-/* Free space/discard btree: */
-
-static int __need_discard_or_freespace_err(struct btree_trans *trans,
- struct bkey_s_c alloc_k,
- bool set, bool discard, bool repair)
-{
- struct bch_fs *c = trans->c;
- enum bch_fsck_flags flags = FSCK_CAN_IGNORE|(repair ? FSCK_CAN_FIX : 0);
- enum bch_sb_error_id err_id = discard
- ? BCH_FSCK_ERR_need_discard_key_wrong
- : BCH_FSCK_ERR_freespace_key_wrong;
- enum btree_id btree = discard ? BTREE_ID_need_discard : BTREE_ID_freespace;
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, alloc_k);
-
- int ret = __bch2_fsck_err(NULL, trans, flags, err_id,
- "bucket incorrectly %sset in %s btree\n%s",
- set ? "" : "un",
- bch2_btree_id_str(btree),
- buf.buf);
- if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) ||
- bch2_err_matches(ret, BCH_ERR_fsck_errors_not_fixed))
- ret = 0;
-
- printbuf_exit(&buf);
- return ret;
-}
-
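-/*
- * need_discard_or_freespace_err() reports a bucket incorrectly present or
- * absent in the need_discard/freespace btree; fsck errors the user chose to
- * ignore, or that couldn't be fixed, are squashed to 0 above.
- */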
-#define need_discard_or_freespace_err(...) \
- fsck_err_wrap(__need_discard_or_freespace_err(__VA_ARGS__))
-
-#define need_discard_or_freespace_err_on(cond, ...) \
- (unlikely(cond) ? need_discard_or_freespace_err(__VA_ARGS__) : false)
-
-static int bch2_bucket_do_index(struct btree_trans *trans,
- struct bch_dev *ca,
- struct bkey_s_c alloc_k,
- const struct bch_alloc_v4 *a,
- bool set)
-{
- enum btree_id btree;
- struct bpos pos;
-
- if (a->data_type != BCH_DATA_free &&
- a->data_type != BCH_DATA_need_discard)
- return 0;
-
- switch (a->data_type) {
- case BCH_DATA_free:
- btree = BTREE_ID_freespace;
- pos = alloc_freespace_pos(alloc_k.k->p, *a);
- break;
- case BCH_DATA_need_discard:
- btree = BTREE_ID_need_discard;
- pos = alloc_k.k->p;
- break;
- default:
- return 0;
- }
-
- struct btree_iter iter;
- struct bkey_s_c old = bch2_bkey_get_iter(trans, &iter, btree, pos, BTREE_ITER_intent);
- int ret = bkey_err(old);
- if (ret)
- return ret;
-
- need_discard_or_freespace_err_on(ca->mi.freespace_initialized &&
- !old.k->type != set,
- trans, alloc_k, set,
- btree == BTREE_ID_need_discard, false);
-
- ret = bch2_btree_bit_mod_iter(trans, &iter, set);
-fsck_err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
- struct bpos bucket, u8 gen)
-{
- struct btree_iter iter;
- unsigned offset;
- struct bpos pos = alloc_gens_pos(bucket, &offset);
- struct bkey_i_bucket_gens *g;
- struct bkey_s_c k;
- int ret;
-
- g = bch2_trans_kmalloc(trans, sizeof(*g));
- ret = PTR_ERR_OR_ZERO(g);
- if (ret)
- return ret;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos,
- BTREE_ITER_intent|
- BTREE_ITER_with_updates);
- ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (k.k->type != KEY_TYPE_bucket_gens) {
- bkey_bucket_gens_init(&g->k_i);
- g->k.p = iter.pos;
- } else {
- bkey_reassemble(&g->k_i, k);
- }
-
- g->v.gens[offset] = gen;
-
- ret = bch2_trans_update(trans, &iter, &g->k_i, 0);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, struct bch_dev *ca,
- enum bch_data_type data_type,
- s64 delta_buckets,
- s64 delta_sectors,
- s64 delta_fragmented, unsigned flags)
-{
- s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented };
-
- return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc,
- d, dev_data_type,
- .dev = ca->dev_idx,
- .data_type = data_type);
-}
-
-int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca,
- const struct bch_alloc_v4 *old,
- const struct bch_alloc_v4 *new,
- unsigned flags)
-{
- s64 old_sectors = bch2_bucket_sectors(*old);
- s64 new_sectors = bch2_bucket_sectors(*new);
- if (old->data_type != new->data_type) {
- int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
- 1, new_sectors, bch2_bucket_sectors_fragmented(ca, *new), flags) ?:
- bch2_dev_data_type_accounting_mod(trans, ca, old->data_type,
- -1, -old_sectors, -bch2_bucket_sectors_fragmented(ca, *old), flags);
- if (ret)
- return ret;
- } else if (old_sectors != new_sectors) {
- int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
- 0,
- new_sectors - old_sectors,
- bch2_bucket_sectors_fragmented(ca, *new) -
- bch2_bucket_sectors_fragmented(ca, *old), flags);
- if (ret)
- return ret;
- }
-
- s64 old_unstriped = bch2_bucket_sectors_unstriped(*old);
- s64 new_unstriped = bch2_bucket_sectors_unstriped(*new);
- if (old_unstriped != new_unstriped) {
- int ret = bch2_dev_data_type_accounting_mod(trans, ca, BCH_DATA_unstriped,
- !!new_unstriped - !!old_unstriped,
- new_unstriped - old_unstriped,
- 0,
- flags);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-int bch2_trigger_alloc(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
- if (!ca)
- return bch_err_throw(c, trigger_alloc);
-
- struct bch_alloc_v4 old_a_convert;
- const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
-
- struct bch_alloc_v4 *new_a;
- if (likely(new.k->type == KEY_TYPE_alloc_v4)) {
- new_a = bkey_s_to_alloc_v4(new).v;
- } else {
- BUG_ON(!(flags & (BTREE_TRIGGER_gc|BTREE_TRIGGER_check_repair)));
-
- struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c);
- ret = PTR_ERR_OR_ZERO(new_ka);
- if (unlikely(ret))
- goto err;
- new_a = &new_ka->v;
- }
-
- if (flags & BTREE_TRIGGER_transactional) {
- alloc_data_type_set(new_a, new_a->data_type);
-
- int is_empty_delta = (int) data_type_is_empty(new_a->data_type) -
- (int) data_type_is_empty(old_a->data_type);
-
- if (is_empty_delta < 0) {
- new_a->io_time[READ] = bch2_current_io_time(c, READ);
- new_a->io_time[WRITE] = bch2_current_io_time(c, WRITE);
- SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
- SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
- }
-
- if (data_type_is_empty(new_a->data_type) &&
- BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
- !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
- if (new_a->oldest_gen == new_a->gen &&
- !bch2_bucket_sectors_total(*new_a))
- new_a->oldest_gen++;
- new_a->gen++;
- SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
- alloc_data_type_set(new_a, new_a->data_type);
- }
-
- if (old_a->data_type != new_a->data_type ||
- (new_a->data_type == BCH_DATA_free &&
- alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
- ret = bch2_bucket_do_index(trans, ca, old, old_a, false) ?:
- bch2_bucket_do_index(trans, ca, new.s_c, new_a, true);
- if (ret)
- goto err;
- }
-
- if (new_a->data_type == BCH_DATA_cached &&
- !new_a->io_time[READ])
- new_a->io_time[READ] = bch2_current_io_time(c, READ);
-
- ret = bch2_lru_change(trans, new.k->p.inode,
- bucket_to_u64(new.k->p),
- alloc_lru_idx_read(*old_a),
- alloc_lru_idx_read(*new_a));
- if (ret)
- goto err;
-
- ret = bch2_lru_change(trans,
- BCH_LRU_BUCKET_FRAGMENTATION,
- bucket_to_u64(new.k->p),
- alloc_lru_idx_fragmentation(*old_a, ca),
- alloc_lru_idx_fragmentation(*new_a, ca));
- if (ret)
- goto err;
-
- if (old_a->gen != new_a->gen) {
- ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen);
- if (ret)
- goto err;
- }
-
- ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags);
- if (ret)
- goto err;
- }
-
- if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
- u64 transaction_seq = trans->journal_res.seq;
- BUG_ON(!transaction_seq);
-
- if (log_fsck_err_on(transaction_seq && new_a->journal_seq_nonempty > transaction_seq,
- trans, alloc_key_journal_seq_in_future,
- "bucket journal seq in future (currently at %llu)\n%s",
- journal_cur_seq(&c->journal),
- (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)))
- new_a->journal_seq_nonempty = transaction_seq;
-
- int is_empty_delta = (int) data_type_is_empty(new_a->data_type) -
- (int) data_type_is_empty(old_a->data_type);
-
- /*
- * Record journal sequence number of empty -> nonempty transition:
- * Note that there may be multiple empty -> nonempty
- * transitions, data in a bucket may be overwritten while we're
- * still writing to it - so be careful to only record the first:
- */
- if (is_empty_delta < 0 &&
- new_a->journal_seq_empty <= c->journal.flushed_seq_ondisk) {
- new_a->journal_seq_nonempty = transaction_seq;
- new_a->journal_seq_empty = 0;
- }
-
- /*
- * Bucket becomes empty: mark it as waiting for a journal flush,
- * unless updates since empty -> nonempty transition were never
- * flushed - we may need to ask the journal not to flush
- * intermediate sequence numbers:
- */
- if (is_empty_delta > 0) {
- if (new_a->journal_seq_nonempty == transaction_seq ||
- bch2_journal_noflush_seq(&c->journal,
- new_a->journal_seq_nonempty,
- transaction_seq)) {
- new_a->journal_seq_nonempty = new_a->journal_seq_empty = 0;
- } else {
- new_a->journal_seq_empty = transaction_seq;
-
- ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
- c->journal.flushed_seq_ondisk,
- new.k->p.inode, new.k->p.offset,
- transaction_seq);
- if (bch2_fs_fatal_err_on(ret, c,
- "setting bucket_needs_journal_commit: %s",
- bch2_err_str(ret)))
- goto err;
- }
- }
-
- if (new_a->gen != old_a->gen) {
- guard(rcu)();
- u8 *gen = bucket_gen(ca, new.k->p.offset);
- if (unlikely(!gen))
- goto invalid_bucket;
- *gen = new_a->gen;
- }
-
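-/*
- * eval_state() evaluates @expr with `a` bound to the given alloc key;
- * statechange(expr) is true when @expr holds for the new key but not the old:
- */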
-#define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; })
-#define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr)
-#define bucket_flushed(a) (a->journal_seq_empty <= c->journal.flushed_seq_ondisk)
-
- if (statechange(a->data_type == BCH_DATA_free) &&
- bucket_flushed(new_a))
- closure_wake_up(&c->freelist_wait);
-
- if (statechange(a->data_type == BCH_DATA_need_discard) &&
- !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) &&
- bucket_flushed(new_a))
- bch2_discard_one_bucket_fast(ca, new.k->p.offset);
-
- if (statechange(a->data_type == BCH_DATA_cached) &&
- !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
- should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
- bch2_dev_do_invalidates(ca);
-
- if (statechange(a->data_type == BCH_DATA_need_gc_gens))
- bch2_gc_gens_async(c);
- }
-
- if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) {
- guard(rcu)();
- struct bucket *g = gc_bucket(ca, new.k->p.offset);
- if (unlikely(!g))
- goto invalid_bucket;
- g->gen_valid = 1;
- g->gen = new_a->gen;
- }
-err:
-fsck_err:
- printbuf_exit(&buf);
- bch2_dev_put(ca);
- return ret;
-invalid_bucket:
- bch2_fs_inconsistent(c, "reference to invalid bucket\n%s",
- (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf));
- ret = bch_err_throw(c, trigger_alloc);
- goto err;
-}
-
-/*
- * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for
- * extents style btrees, but works on non-extents btrees:
- */
-static struct bkey_s_c bch2_get_key_or_hole(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos end, struct bkey *hole)
-{
- struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);
-
- if (bkey_err(k))
- return k;
-
- if (k.k->type) {
- return k;
- } else {
- struct btree_iter iter2;
- struct bpos next;
-
- bch2_trans_copy_iter(trans, &iter2, iter);
-
- struct btree_path *path = btree_iter_path(trans, iter);
- if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX))
- end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p));
-
- end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1));
-
- /*
- * btree node min/max is a closed interval, while peek_max takes a
- * half open interval:
- */
- k = bch2_btree_iter_peek_max(trans, &iter2, end);
- next = iter2.pos;
- bch2_trans_iter_exit(trans, &iter2);
-
- BUG_ON(next.offset >= iter->pos.offset + U32_MAX);
-
- if (bkey_err(k))
- return k;
-
- bkey_init(hole);
- hole->p = iter->pos;
-
- bch2_key_resize(hole, next.offset - iter->pos.offset);
- return (struct bkey_s_c) { hole, NULL };
- }
-}
-
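-/*
- * Advance *bucket to the next valid bucket, moving to the next device when the
- * current one is exhausted; a ref is left held on the device the bucket lands
- * on, and false is returned once there are no more devices:
- */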
-static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *bucket)
-{
- if (*ca) {
- if (bucket->offset < (*ca)->mi.first_bucket)
- bucket->offset = (*ca)->mi.first_bucket;
-
- if (bucket->offset < (*ca)->mi.nbuckets)
- return true;
-
- bch2_dev_put(*ca);
- *ca = NULL;
- bucket->inode++;
- bucket->offset = 0;
- }
-
- guard(rcu)();
- *ca = __bch2_next_dev_idx(c, bucket->inode, NULL);
- if (*ca) {
- *bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket);
- bch2_dev_get(*ca);
- }
-
- return *ca != NULL;
-}
-
-static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_dev **ca, struct bkey *hole)
-{
- struct bch_fs *c = trans->c;
- struct bkey_s_c k;
-again:
- k = bch2_get_key_or_hole(trans, iter, POS_MAX, hole);
- if (bkey_err(k))
- return k;
-
- *ca = bch2_dev_iterate_noerror(c, *ca, k.k->p.inode);
-
- if (!k.k->type) {
- struct bpos hole_start = bkey_start_pos(k.k);
-
- if (!*ca || !bucket_valid(*ca, hole_start.offset)) {
- if (!next_bucket(c, ca, &hole_start))
- return bkey_s_c_null;
-
- bch2_btree_iter_set_pos(trans, iter, hole_start);
- goto again;
- }
-
- if (k.k->p.offset > (*ca)->mi.nbuckets)
- bch2_key_resize(hole, (*ca)->mi.nbuckets - hole_start.offset);
- }
-
- return k;
-}
-
-static noinline_for_stack
-int bch2_check_alloc_key(struct btree_trans *trans,
- struct bkey_s_c alloc_k,
- struct btree_iter *alloc_iter,
- struct btree_iter *discard_iter,
- struct btree_iter *freespace_iter,
- struct btree_iter *bucket_gens_iter)
-{
- struct bch_fs *c = trans->c;
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a;
- unsigned gens_offset;
- struct bkey_s_c k;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p);
- if (fsck_err_on(!ca,
- trans, alloc_key_to_missing_dev_bucket,
- "alloc key for invalid device:bucket %llu:%llu",
- alloc_k.k->p.inode, alloc_k.k->p.offset))
- ret = bch2_btree_delete_at(trans, alloc_iter, 0);
- if (!ca)
- return ret;
-
- if (!ca->mi.freespace_initialized)
- goto out;
-
- a = bch2_alloc_to_v4(alloc_k, &a_convert);
-
- bch2_btree_iter_set_pos(trans, discard_iter, alloc_k.k->p);
- k = bch2_btree_iter_peek_slot(trans, discard_iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- bool is_discarded = a->data_type == BCH_DATA_need_discard;
- if (need_discard_or_freespace_err_on(!!k.k->type != is_discarded,
- trans, alloc_k, !is_discarded, true, true)) {
- ret = bch2_btree_bit_mod_iter(trans, discard_iter, is_discarded);
- if (ret)
- goto err;
- }
-
- bch2_btree_iter_set_pos(trans, freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
- k = bch2_btree_iter_peek_slot(trans, freespace_iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- bool is_free = a->data_type == BCH_DATA_free;
- if (need_discard_or_freespace_err_on(!!k.k->type != is_free,
- trans, alloc_k, !is_free, false, true)) {
- ret = bch2_btree_bit_mod_iter(trans, freespace_iter, is_free);
- if (ret)
- goto err;
- }
-
- bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
- k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (fsck_err_on(a->gen != alloc_gen(k, gens_offset),
- trans, bucket_gens_key_wrong,
- "incorrect gen in bucket_gens btree (got %u should be %u)\n%s",
- alloc_gen(k, gens_offset), a->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
- struct bkey_i_bucket_gens *g =
- bch2_trans_kmalloc(trans, sizeof(*g));
-
- ret = PTR_ERR_OR_ZERO(g);
- if (ret)
- goto err;
-
- if (k.k->type == KEY_TYPE_bucket_gens) {
- bkey_reassemble(&g->k_i, k);
- } else {
- bkey_bucket_gens_init(&g->k_i);
- g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset);
- }
-
- g->v.gens[gens_offset] = a->gen;
-
- ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0);
- if (ret)
- goto err;
- }
-out:
-err:
-fsck_err:
- bch2_dev_put(ca);
- printbuf_exit(&buf);
- return ret;
-}
-
-static noinline_for_stack
-int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
- struct bch_dev *ca,
- struct bpos start,
- struct bpos *end,
- struct btree_iter *freespace_iter)
-{
- struct bkey_s_c k;
- struct printbuf buf = PRINTBUF;
- int ret;
-
- if (!ca->mi.freespace_initialized)
- return 0;
-
- bch2_btree_iter_set_pos(trans, freespace_iter, start);
-
- k = bch2_btree_iter_peek_slot(trans, freespace_iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- *end = bkey_min(k.k->p, *end);
-
- if (fsck_err_on(k.k->type != KEY_TYPE_set,
- trans, freespace_hole_missing,
- "hole in alloc btree missing in freespace btree\n"
- "device %llu buckets %llu-%llu",
- freespace_iter->pos.inode,
- freespace_iter->pos.offset,
- end->offset)) {
- struct bkey_i *update =
- bch2_trans_kmalloc(trans, sizeof(*update));
-
- ret = PTR_ERR_OR_ZERO(update);
- if (ret)
- goto err;
-
- bkey_init(&update->k);
- update->k.type = KEY_TYPE_set;
- update->k.p = freespace_iter->pos;
- bch2_key_resize(&update->k,
- min_t(u64, U32_MAX, end->offset -
- freespace_iter->pos.offset));
-
- ret = bch2_trans_update(trans, freespace_iter, update, 0);
- if (ret)
- goto err;
- }
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static noinline_for_stack
-int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
- struct bpos start,
- struct bpos *end,
- struct btree_iter *bucket_gens_iter)
-{
- struct bkey_s_c k;
- struct printbuf buf = PRINTBUF;
- unsigned i, gens_offset, gens_end_offset;
- int ret;
-
- bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(start, &gens_offset));
-
- k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (bkey_cmp(alloc_gens_pos(start, &gens_offset),
- alloc_gens_pos(*end, &gens_end_offset)))
- gens_end_offset = KEY_TYPE_BUCKET_GENS_NR;
-
- if (k.k->type == KEY_TYPE_bucket_gens) {
- struct bkey_i_bucket_gens g;
- bool need_update = false;
-
- bkey_reassemble(&g.k_i, k);
-
- for (i = gens_offset; i < gens_end_offset; i++) {
- if (fsck_err_on(g.v.gens[i], trans,
- bucket_gens_hole_wrong,
- "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
- bucket_gens_pos_to_alloc(k.k->p, i).inode,
- bucket_gens_pos_to_alloc(k.k->p, i).offset,
- g.v.gens[i])) {
- g.v.gens[i] = 0;
- need_update = true;
- }
- }
-
- if (need_update) {
- struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
-
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- goto err;
-
- memcpy(u, &g, sizeof(g));
-
- ret = bch2_trans_update(trans, bucket_gens_iter, u, 0);
- if (ret)
- goto err;
- }
- }
-
- *end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0));
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-struct check_discard_freespace_key_async {
- struct work_struct work;
- struct bch_fs *c;
- struct bbpos pos;
-};
-
-static int bch2_recheck_discard_freespace_key(struct btree_trans *trans, struct bbpos pos)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, pos.btree, pos.pos, 0);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- u8 gen;
- ret = k.k->type != KEY_TYPE_set
- ? bch2_check_discard_freespace_key(trans, &iter, &gen, false)
- : 0;
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static void check_discard_freespace_key_work(struct work_struct *work)
-{
- struct check_discard_freespace_key_async *w =
- container_of(work, struct check_discard_freespace_key_async, work);
-
- bch2_trans_do(w->c, bch2_recheck_discard_freespace_key(trans, w->pos));
- enumerated_ref_put(&w->c->writes, BCH_WRITE_REF_check_discard_freespace_key);
- kfree(w);
-}
-
-int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen,
- bool async_repair)
-{
- struct bch_fs *c = trans->c;
- enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard
- ? BCH_DATA_need_discard
- : BCH_DATA_free;
- struct printbuf buf = PRINTBUF;
-
- unsigned fsck_flags = (async_repair ? FSCK_ERR_NO_LOG : 0)|
- FSCK_CAN_FIX|FSCK_CAN_IGNORE;
-
- struct bpos bucket = iter->pos;
- bucket.offset &= ~(~0ULL << 56);
- u64 genbits = iter->pos.offset & (~0ULL << 56);
-
- struct btree_iter alloc_iter;
- struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter,
- BTREE_ID_alloc, bucket,
- async_repair ? BTREE_ITER_cached : 0);
- int ret = bkey_err(alloc_k);
- if (ret)
- return ret;
-
- if (!bch2_dev_bucket_exists(c, bucket)) {
- if (__fsck_err(trans, fsck_flags,
- need_discard_freespace_key_to_invalid_dev_bucket,
- "entry in %s btree for nonexistent dev:bucket %llu:%llu",
- bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset))
- goto delete;
- ret = 1;
- goto out;
- }
-
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
-
- if (a->data_type != state ||
- (state == BCH_DATA_free &&
- genbits != alloc_freespace_genbits(*a))) {
- if (__fsck_err(trans, fsck_flags,
- need_discard_freespace_key_bad,
- "%s\nincorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
- (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
- bch2_btree_id_str(iter->btree_id),
- iter->pos.inode,
- iter->pos.offset,
- a->data_type == state,
- genbits >> 56, alloc_freespace_genbits(*a) >> 56))
- goto delete;
- ret = 1;
- goto out;
- }
-
- *gen = a->gen;
-out:
-fsck_err:
- bch2_set_btree_iter_dontneed(trans, &alloc_iter);
- bch2_trans_iter_exit(trans, &alloc_iter);
- printbuf_exit(&buf);
- return ret;
-delete:
- if (!async_repair) {
- ret = bch2_btree_bit_mod_iter(trans, iter, false) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc) ?:
- bch_err_throw(c, transaction_restart_commit);
- goto out;
- } else {
- /*
- * We can't repair here when called from the allocator path: the
- * commit will recurse back into the allocator
- */
- struct check_discard_freespace_key_async *w =
- kzalloc(sizeof(*w), GFP_KERNEL);
- if (!w)
- goto out;
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_check_discard_freespace_key)) {
- kfree(w);
- goto out;
- }
-
- INIT_WORK(&w->work, check_discard_freespace_key_work);
- w->c = c;
- w->pos = BBPOS(iter->btree_id, iter->pos);
- queue_work(c->write_ref_wq, &w->work);
-
- ret = 1; /* don't allocate from this bucket */
- goto out;
- }
-}
-
-static int bch2_check_discard_freespace_key_fsck(struct btree_trans *trans, struct btree_iter *iter)
-{
- u8 gen;
- int ret = bch2_check_discard_freespace_key(trans, iter, &gen, false);
- return ret < 0 ? ret : 0;
-}
-
-/*
- * We've already checked that generation numbers in the bucket_gens btree are
- * valid for buckets that exist; this just checks for keys for nonexistent
- * buckets.
- */
-static noinline_for_stack
-int bch2_check_bucket_gens_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bkey_i_bucket_gens g;
- u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
- u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
- u64 b;
- bool need_update = false;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- BUG_ON(k.k->type != KEY_TYPE_bucket_gens);
- bkey_reassemble(&g.k_i, k);
-
- struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode);
- if (!ca) {
- if (fsck_err(trans, bucket_gens_to_invalid_dev,
- "bucket_gens key for invalid device:\n%s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- ret = bch2_btree_delete_at(trans, iter, 0);
- goto out;
- }
-
- if (fsck_err_on(end <= ca->mi.first_bucket ||
- start >= ca->mi.nbuckets,
- trans, bucket_gens_to_invalid_buckets,
- "bucket_gens key for invalid buckets:\n%s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = bch2_btree_delete_at(trans, iter, 0);
- goto out;
- }
-
- for (b = start; b < ca->mi.first_bucket; b++)
- if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK],
- trans, bucket_gens_nonzero_for_invalid_buckets,
- "bucket_gens key has nonzero gen for invalid bucket")) {
- g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
- need_update = true;
- }
-
- for (b = ca->mi.nbuckets; b < end; b++)
- if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK],
- trans, bucket_gens_nonzero_for_invalid_buckets,
- "bucket_gens key has nonzero gen for invalid bucket")) {
- g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
- need_update = true;
- }
-
- if (need_update) {
- struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
-
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- goto out;
-
- memcpy(u, &g, sizeof(g));
- ret = bch2_trans_update(trans, iter, u, 0);
- }
-out:
-fsck_err:
- bch2_dev_put(ca);
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_check_alloc_info(struct bch_fs *c)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter;
- struct bch_dev *ca = NULL;
- struct bkey hole;
- struct bkey_s_c k;
- int ret = 0;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_prefetch);
- bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
- BTREE_ITER_prefetch);
- bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
- BTREE_ITER_prefetch);
- bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
- BTREE_ITER_prefetch);
-
- while (1) {
- struct bpos next;
-
- bch2_trans_begin(trans);
-
- k = bch2_get_key_or_real_bucket_hole(trans, &iter, &ca, &hole);
- ret = bkey_err(k);
- if (ret)
- goto bkey_err;
-
- if (!k.k)
- break;
-
- if (k.k->type) {
- next = bpos_nosnap_successor(k.k->p);
-
- ret = bch2_check_alloc_key(trans,
- k, &iter,
- &discard_iter,
- &freespace_iter,
- &bucket_gens_iter);
- if (ret)
- goto bkey_err;
- } else {
- next = k.k->p;
-
- ret = bch2_check_alloc_hole_freespace(trans, ca,
- bkey_start_pos(k.k),
- &next,
- &freespace_iter) ?:
- bch2_check_alloc_hole_bucket_gens(trans,
- bkey_start_pos(k.k),
- &next,
- &bucket_gens_iter);
- if (ret)
- goto bkey_err;
- }
-
- ret = bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
- if (ret)
- goto bkey_err;
-
- bch2_btree_iter_set_pos(trans, &iter, next);
-bkey_err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &bucket_gens_iter);
- bch2_trans_iter_exit(trans, &freespace_iter);
- bch2_trans_iter_exit(trans, &discard_iter);
- bch2_trans_iter_exit(trans, &iter);
- bch2_dev_put(ca);
- ca = NULL;
-
- if (ret < 0)
- goto err;
-
- ret = for_each_btree_key(trans, iter,
- BTREE_ID_need_discard, POS_MIN,
- BTREE_ITER_prefetch, k,
- bch2_check_discard_freespace_key_fsck(trans, &iter));
- if (ret)
- goto err;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN,
- BTREE_ITER_prefetch);
- while (1) {
- bch2_trans_begin(trans);
- k = bch2_btree_iter_peek(trans, &iter);
- if (!k.k)
- break;
-
- ret = bkey_err(k) ?:
- bch2_check_discard_freespace_key_fsck(trans, &iter);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- ret = 0;
- continue;
- }
- if (ret) {
- struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&buf, c, k);
-
- bch_err(c, "while checking %s", buf.buf);
- printbuf_exit(&buf);
- break;
- }
-
- bch2_btree_iter_set_pos(trans, &iter, bpos_nosnap_successor(iter.pos));
- }
- bch2_trans_iter_exit(trans, &iter);
- if (ret)
- goto err;
-
- ret = for_each_btree_key_commit(trans, iter,
- BTREE_ID_bucket_gens, POS_MIN,
- BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_check_bucket_gens_key(trans, &iter, k));
-err:
- bch2_trans_put(trans);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
- struct btree_iter *alloc_iter,
- struct bkey_buf *last_flushed)
-{
- struct bch_fs *c = trans->c;
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a;
- struct bkey_s_c alloc_k;
- struct printbuf buf = PRINTBUF;
- int ret;
-
- alloc_k = bch2_btree_iter_peek(trans, alloc_iter);
- if (!alloc_k.k)
- return 0;
-
- ret = bkey_err(alloc_k);
- if (ret)
- return ret;
-
- struct bch_dev *ca = bch2_dev_tryget_noerror(c, alloc_k.k->p.inode);
- if (!ca)
- return 0;
-
- a = bch2_alloc_to_v4(alloc_k, &a_convert);
-
- u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
- if (lru_idx) {
- ret = bch2_lru_check_set(trans, BCH_LRU_BUCKET_FRAGMENTATION,
- bucket_to_u64(alloc_k.k->p),
- lru_idx, alloc_k, last_flushed);
- if (ret)
- goto err;
- }
-
- if (a->data_type != BCH_DATA_cached)
- goto err;
-
- if (fsck_err_on(!a->io_time[READ],
- trans, alloc_key_cached_but_read_time_zero,
- "cached bucket with read_time 0\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
- struct bkey_i_alloc_v4 *a_mut =
- bch2_alloc_to_v4_mut(trans, alloc_k);
- ret = PTR_ERR_OR_ZERO(a_mut);
- if (ret)
- goto err;
-
- a_mut->v.io_time[READ] = bch2_current_io_time(c, READ);
- ret = bch2_trans_update(trans, alloc_iter,
- &a_mut->k_i, BTREE_TRIGGER_norun);
- if (ret)
- goto err;
-
- a = &a_mut->v;
- }
-
- ret = bch2_lru_check_set(trans, alloc_k.k->p.inode,
- bucket_to_u64(alloc_k.k->p),
- a->io_time[READ],
- alloc_k, last_flushed);
- if (ret)
- goto err;
-err:
-fsck_err:
- bch2_dev_put(ca);
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
-{
- struct bkey_buf last_flushed;
-
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
- POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))) ?:
- bch2_check_stripe_to_lru_refs(c);
-
- bch2_bkey_buf_exit(&last_flushed, c);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress)
-{
- struct bch_fs *c = ca->fs;
- int ret;
-
- mutex_lock(&ca->discard_buckets_in_flight_lock);
- struct discard_in_flight *i =
- darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket);
- if (i) {
- ret = bch_err_throw(c, EEXIST_discard_in_flight_add);
- goto out;
- }
-
- ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) {
- .in_progress = in_progress,
- .bucket = bucket,
- }));
-out:
- mutex_unlock(&ca->discard_buckets_in_flight_lock);
- return ret;
-}
-
-static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket)
-{
- mutex_lock(&ca->discard_buckets_in_flight_lock);
- struct discard_in_flight *i =
- darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket);
- BUG_ON(!i || !i->in_progress);
-
- darray_remove_item(&ca->discard_buckets_in_flight, i);
- mutex_unlock(&ca->discard_buckets_in_flight_lock);
-}
-
-struct discard_buckets_state {
- u64 seen;
- u64 open;
- u64 need_journal_commit;
- u64 discarded;
-};
-
-static int bch2_discard_one_bucket(struct btree_trans *trans,
- struct bch_dev *ca,
- struct btree_iter *need_discard_iter,
- struct bpos *discard_pos_done,
- struct discard_buckets_state *s,
- bool fastpath)
-{
- struct bch_fs *c = trans->c;
- struct bpos pos = need_discard_iter->pos;
- struct btree_iter iter = {};
- struct bkey_s_c k;
- struct bkey_i_alloc_v4 *a;
- struct printbuf buf = PRINTBUF;
- bool discard_locked = false;
- int ret = 0;
-
- if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
- s->open++;
- goto out;
- }
-
- u64 seq_ready = bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal,
- pos.inode, pos.offset);
- if (seq_ready > c->journal.flushed_seq_ondisk) {
- if (seq_ready > c->journal.flushing_seq)
- s->need_journal_commit++;
- goto out;
- }
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
- need_discard_iter->pos,
- BTREE_ITER_cached);
- ret = bkey_err(k);
- if (ret)
- goto out;
-
- a = bch2_alloc_to_v4_mut(trans, k);
- ret = PTR_ERR_OR_ZERO(a);
- if (ret)
- goto out;
-
- if (a->v.data_type != BCH_DATA_need_discard) {
- if (need_discard_or_freespace_err(trans, k, true, true, true)) {
- ret = bch2_btree_bit_mod_iter(trans, need_discard_iter, false);
- if (ret)
- goto out;
- goto commit;
- }
-
- goto out;
- }
-
- if (!fastpath) {
- if (discard_in_flight_add(ca, iter.pos.offset, true))
- goto out;
-
- discard_locked = true;
- }
-
- if (!bkey_eq(*discard_pos_done, iter.pos)) {
- s->discarded++;
- *discard_pos_done = iter.pos;
-
- if (bch2_discard_opt_enabled(c, ca) && !c->opts.nochanges) {
- /*
- * This works without any other locks because this is the only
- * thread that removes items from the need_discard tree
- */
- bch2_trans_unlock_long(trans);
- blkdev_issue_discard(ca->disk_sb.bdev,
- k.k->p.offset * ca->mi.bucket_size,
- ca->mi.bucket_size,
- GFP_KERNEL);
- ret = bch2_trans_relock_notrace(trans);
- if (ret)
- goto out;
- }
- }
-
- SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
- alloc_data_type_set(&a->v, a->v.data_type);
-
- ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
- if (ret)
- goto out;
-commit:
- ret = bch2_trans_commit(trans, NULL, NULL,
- BCH_WATERMARK_btree|
- BCH_TRANS_COMMIT_no_enospc);
- if (ret)
- goto out;
-
- if (!fastpath)
- count_event(c, bucket_discard);
- else
- count_event(c, bucket_discard_fast);
-out:
-fsck_err:
- if (discard_locked)
- discard_in_flight_remove(ca, iter.pos.offset);
- if (!ret)
- s->seen++;
- bch2_trans_iter_exit(trans, &iter);
- printbuf_exit(&buf);
- return ret;
-}
-
-static void bch2_do_discards_work(struct work_struct *work)
-{
- struct bch_dev *ca = container_of(work, struct bch_dev, discard_work);
- struct bch_fs *c = ca->fs;
- struct discard_buckets_state s = {};
- struct bpos discard_pos_done = POS_MAX;
- int ret;
-
- /*
- * We're doing the commit in bch2_discard_one_bucket instead of using
- * for_each_btree_key_commit() so that we can increment counters after
- * successful commit:
- */
- ret = bch2_trans_run(c,
- for_each_btree_key_max(trans, iter,
- BTREE_ID_need_discard,
- POS(ca->dev_idx, 0),
- POS(ca->dev_idx, U64_MAX), 0, k,
- bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s, false)));
-
- if (s.need_journal_commit > dev_buckets_available(ca, BCH_WATERMARK_normal))
- bch2_journal_flush_async(&c->journal, NULL);
-
- trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
- bch2_err_str(ret));
-
- enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards);
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard);
-}
-
-void bch2_dev_do_discards(struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard))
- return;
-
- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_dev_do_discards))
- goto put_write_ref;
-
- if (queue_work(c->write_ref_wq, &ca->discard_work))
- return;
-
- enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards);
-put_write_ref:
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard);
-}
-
-void bch2_do_discards(struct bch_fs *c)
-{
- for_each_member_device(c, ca)
- bch2_dev_do_discards(ca);
-}
-
-static int bch2_do_discards_fast_one(struct btree_trans *trans,
- struct bch_dev *ca,
- u64 bucket,
- struct bpos *discard_pos_done,
- struct discard_buckets_state *s)
-{
- struct btree_iter need_discard_iter;
- struct bkey_s_c discard_k = bch2_bkey_get_iter(trans, &need_discard_iter,
- BTREE_ID_need_discard, POS(ca->dev_idx, bucket), 0);
- int ret = bkey_err(discard_k);
- if (ret)
- return ret;
-
- if (log_fsck_err_on(discard_k.k->type != KEY_TYPE_set,
- trans, discarding_bucket_not_in_need_discard_btree,
- "attempting to discard bucket %u:%llu not in need_discard btree",
- ca->dev_idx, bucket))
- goto out;
-
- ret = bch2_discard_one_bucket(trans, ca, &need_discard_iter, discard_pos_done, s, true);
-out:
-fsck_err:
- bch2_trans_iter_exit(trans, &need_discard_iter);
- return ret;
-}
-
-static void bch2_do_discards_fast_work(struct work_struct *work)
-{
- struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work);
- struct bch_fs *c = ca->fs;
- struct discard_buckets_state s = {};
- struct bpos discard_pos_done = POS_MAX;
- struct btree_trans *trans = bch2_trans_get(c);
- int ret = 0;
-
- while (1) {
- bool got_bucket = false;
- u64 bucket;
-
- mutex_lock(&ca->discard_buckets_in_flight_lock);
- darray_for_each(ca->discard_buckets_in_flight, i) {
- if (i->in_progress)
- continue;
-
- got_bucket = true;
- bucket = i->bucket;
- i->in_progress = true;
- break;
- }
- mutex_unlock(&ca->discard_buckets_in_flight_lock);
-
- if (!got_bucket)
- break;
-
- ret = lockrestart_do(trans,
- bch2_do_discards_fast_one(trans, ca, bucket, &discard_pos_done, &s));
- bch_err_fn(c, ret);
-
- discard_in_flight_remove(ca, bucket);
-
- if (ret)
- break;
- }
-
- trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret));
-
- bch2_trans_put(trans);
- enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast);
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast);
-}
-
-static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
-{
- struct bch_fs *c = ca->fs;
-
- if (discard_in_flight_add(ca, bucket, false))
- return;
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard_fast))
- return;
-
- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_discard_one_bucket_fast))
- goto put_ref;
-
- if (queue_work(c->write_ref_wq, &ca->discard_fast_work))
- return;
-
- enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast);
-put_ref:
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast);
-}
-
-static int invalidate_one_bp(struct btree_trans *trans,
- struct bch_dev *ca,
- struct bkey_s_c_backpointer bp,
- struct bkey_buf *last_flushed)
-{
- struct btree_iter extent_iter;
- struct bkey_s_c extent_k =
- bch2_backpointer_get_key(trans, bp, &extent_iter, 0, last_flushed);
- int ret = bkey_err(extent_k);
- if (ret)
- return ret;
-
- if (!extent_k.k)
- return 0;
-
- struct bkey_i *n =
- bch2_bkey_make_mut(trans, &extent_iter, &extent_k,
- BTREE_UPDATE_internal_snapshot_node);
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- goto err;
-
- bch2_bkey_drop_device(bkey_i_to_s(n), ca->dev_idx);
-err:
- bch2_trans_iter_exit(trans, &extent_iter);
- return ret;
-}
-
-static int invalidate_one_bucket_by_bps(struct btree_trans *trans,
- struct bch_dev *ca,
- struct bpos bucket,
- u8 gen,
- struct bkey_buf *last_flushed)
-{
- struct bpos bp_start = bucket_pos_to_bp_start(ca, bucket);
- struct bpos bp_end = bucket_pos_to_bp_end(ca, bucket);
-
- return for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers,
- bp_start, bp_end, 0, k,
- NULL, NULL,
- BCH_WATERMARK_btree|
- BCH_TRANS_COMMIT_no_enospc, ({
- if (k.k->type != KEY_TYPE_backpointer)
- continue;
-
- struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
-
- /* filter out bps with gens that don't match */
- if (bp.v->bucket_gen != gen)
- continue;
-
- invalidate_one_bp(trans, ca, bp, last_flushed);
- }));
-}
-
-noinline_for_stack
-static int invalidate_one_bucket(struct btree_trans *trans,
- struct bch_dev *ca,
- struct btree_iter *lru_iter,
- struct bkey_s_c lru_k,
- struct bkey_buf *last_flushed,
- s64 *nr_to_invalidate)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
- struct btree_iter alloc_iter = {};
- int ret = 0;
-
- if (*nr_to_invalidate <= 0)
- return 1;
-
- if (!bch2_dev_bucket_exists(c, bucket)) {
- if (fsck_err(trans, lru_entry_to_invalid_bucket,
- "lru key points to nonexistent device:bucket %llu:%llu",
- bucket.inode, bucket.offset))
- return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false);
- goto out;
- }
-
- if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
- return 0;
-
- struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter,
- BTREE_ID_alloc, bucket,
- BTREE_ITER_cached);
- ret = bkey_err(alloc_k);
- if (ret)
- return ret;
-
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
-
- /* We expect harmless races here due to the btree write buffer: */
- if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(*a))
- goto out;
-
- /*
- * Impossible since alloc_lru_idx_read() only returns nonzero if the
- * bucket is supposed to be on the cached bucket LRU (i.e.
- * BCH_DATA_cached)
- *
- * bch2_lru_validate() also disallows lru keys with lru_pos_time() == 0
- */
- BUG_ON(a->data_type != BCH_DATA_cached);
- BUG_ON(a->dirty_sectors);
-
- if (!a->cached_sectors) {
- bch2_check_bucket_backpointer_mismatch(trans, ca, bucket.offset,
- true, last_flushed);
- goto out;
- }
-
- unsigned cached_sectors = a->cached_sectors;
- u8 gen = a->gen;
-
- ret = invalidate_one_bucket_by_bps(trans, ca, bucket, gen, last_flushed);
- if (ret)
- goto out;
-
- trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
- --*nr_to_invalidate;
-out:
-fsck_err:
- bch2_trans_iter_exit(trans, &alloc_iter);
- printbuf_exit(&buf);
- return ret;
-}
-
-static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter,
- struct bch_dev *ca, bool *wrapped)
-{
- struct bkey_s_c k;
-again:
- k = bch2_btree_iter_peek_max(trans, iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
- if (!k.k && !*wrapped) {
- bch2_btree_iter_set_pos(trans, iter, lru_pos(ca->dev_idx, 0, 0));
- *wrapped = true;
- goto again;
- }
-
- return k;
-}
-
-static void bch2_do_invalidates_work(struct work_struct *work)
-{
- struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work);
- struct bch_fs *c = ca->fs;
- struct btree_trans *trans = bch2_trans_get(c);
- int ret = 0;
-
- struct bkey_buf last_flushed;
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- ret = bch2_btree_write_buffer_tryflush(trans);
- if (ret)
- goto err;
-
- s64 nr_to_invalidate =
- should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
- struct btree_iter iter;
- bool wrapped = false;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_lru,
- lru_pos(ca->dev_idx, 0,
- ((bch2_current_io_time(c, READ) + U32_MAX) &
- LRU_TIME_MAX)), 0);
-
- while (true) {
- bch2_trans_begin(trans);
-
- struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped);
- ret = bkey_err(k);
- if (ret)
- goto restart_err;
- if (!k.k)
- break;
-
- ret = invalidate_one_bucket(trans, ca, &iter, k, &last_flushed, &nr_to_invalidate);
-restart_err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
-
- bch2_btree_iter_advance(trans, &iter);
- }
- bch2_trans_iter_exit(trans, &iter);
-err:
- bch2_trans_put(trans);
- bch2_bkey_buf_exit(&last_flushed, c);
- enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates);
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate);
-}
-
-void bch2_dev_do_invalidates(struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_invalidate))
- return;
-
- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_do_invalidates))
- goto put_ref;
-
- if (queue_work(c->write_ref_wq, &ca->invalidate_work))
- return;
-
- enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates);
-put_ref:
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate);
-}
-
-void bch2_do_invalidates(struct bch_fs *c)
-{
- for_each_member_device(c, ca)
- bch2_dev_do_invalidates(ca);
-}
-
-int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
- u64 bucket_start, u64 bucket_end)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey hole;
- struct bpos end = POS(ca->dev_idx, bucket_end);
- struct bch_member *m;
- unsigned long last_updated = jiffies;
- int ret;
-
- BUG_ON(bucket_start > bucket_end);
- BUG_ON(bucket_end > ca->mi.nbuckets);
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
- POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)),
- BTREE_ITER_prefetch);
- /*
- * Scan the alloc btree for every bucket on @ca, and add buckets to the
- * freespace and need_discard btrees as needed:
- */
- while (1) {
- if (time_after(jiffies, last_updated + HZ * 10)) {
- bch_info(ca, "%s: currently at %llu/%llu",
- __func__, iter.pos.offset, ca->mi.nbuckets);
- last_updated = jiffies;
- }
-
- bch2_trans_begin(trans);
-
- if (bkey_ge(iter.pos, end)) {
- ret = 0;
- break;
- }
-
- k = bch2_get_key_or_hole(trans, &iter, end, &hole);
- ret = bkey_err(k);
- if (ret)
- goto bkey_err;
-
- if (k.k->type) {
- /*
- * We process live keys in the alloc btree one at a
- * time:
- */
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
-
- ret = bch2_bucket_do_index(trans, ca, k, a, true) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
- if (ret)
- goto bkey_err;
-
- bch2_btree_iter_advance(trans, &iter);
- } else {
- struct bkey_i *freespace;
-
- freespace = bch2_trans_kmalloc(trans, sizeof(*freespace));
- ret = PTR_ERR_OR_ZERO(freespace);
- if (ret)
- goto bkey_err;
-
- bkey_init(&freespace->k);
- freespace->k.type = KEY_TYPE_set;
- freespace->k.p = k.k->p;
- freespace->k.size = k.k->size;
-
- ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
- if (ret)
- goto bkey_err;
-
- bch2_btree_iter_set_pos(trans, &iter, k.k->p);
- }
-bkey_err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
- }
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
-
- if (ret < 0) {
- bch_err_msg(ca, ret, "initializing free space");
- return ret;
- }
-
- mutex_lock(&c->sb_lock);
- m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
- SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
- mutex_unlock(&c->sb_lock);
-
- return 0;
-}
-
-int bch2_fs_freespace_init(struct bch_fs *c)
-{
- if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image))
- return 0;
-
- /*
- * We can crash during the device add path, so we need to check this on
- * every mount:
- */
-
- bool doing_init = false;
- for_each_member_device(c, ca) {
- if (ca->mi.freespace_initialized)
- continue;
-
- if (!doing_init) {
- bch_info(c, "initializing freespace");
- doing_init = true;
- }
-
- int ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
- if (ret) {
- bch2_dev_put(ca);
- bch_err_fn(c, ret);
- return ret;
- }
- }
-
- if (doing_init) {
- mutex_lock(&c->sb_lock);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- bch_verbose(c, "done initializing freespace");
- }
-
- return 0;
-}
-
-/* device removal */
-
-int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
-{
- struct bpos start = POS(ca->dev_idx, 0);
- struct bpos end = POS(ca->dev_idx, U64_MAX);
- int ret;
-
- /*
- * We clear the LRU and need_discard btrees first so that we don't race
- * with bch2_do_invalidates() and bch2_do_discards()
- */
- ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_dev_usage_remove(c, ca->dev_idx);
- bch_err_msg(ca, ret, "removing dev alloc info");
- return ret;
-}
-
-/* Bucket IO clocks: */
-
-static int __bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
- size_t bucket_nr, int rw)
-{
- struct bch_fs *c = trans->c;
-
- struct btree_iter iter;
- struct bkey_i_alloc_v4 *a =
- bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr));
- int ret = PTR_ERR_OR_ZERO(a);
- if (ret)
- return ret;
-
- u64 now = bch2_current_io_time(c, rw);
- if (a->v.io_time[rw] == now)
- goto out;
-
- a->v.io_time[rw] = now;
-
- ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
- bch2_trans_commit(trans, NULL, NULL, 0);
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
- size_t bucket_nr, int rw)
-{
- if (bch2_trans_relock(trans))
- bch2_trans_begin(trans);
-
- return nested_lockrestart_do(trans, __bch2_bucket_io_time_reset(trans, dev, bucket_nr, rw));
-}
-
-/* Startup/shutdown (ro/rw): */
-
-void bch2_recalc_capacity(struct bch_fs *c)
-{
- u64 capacity = 0, reserved_sectors = 0, gc_reserve;
- unsigned bucket_size_max = 0;
- unsigned long ra_pages = 0;
-
- lockdep_assert_held(&c->state_lock);
-
- guard(rcu)();
- for_each_member_device_rcu(c, ca, NULL) {
- struct block_device *bdev = READ_ONCE(ca->disk_sb.bdev);
- if (bdev)
- ra_pages += bdev->bd_disk->bdi->ra_pages;
-
- if (ca->mi.state != BCH_MEMBER_STATE_rw)
- continue;
-
- u64 dev_reserve = 0;
-
- /*
- * We need to reserve buckets (from the number
- * of currently available buckets) against
- * foreground writes so that mainly copygc can
- * make forward progress.
- *
- * We need enough to refill the various reserves
- * from scratch - copygc will use its entire
- * reserve all at once, then run again when
- * its reserve is refilled (from the formerly
- * available buckets).
- *
- * This reserve is just used when considering if
- * allocations for foreground writes must wait -
- * not -ENOSPC calculations.
- */
-
- dev_reserve += ca->nr_btree_reserve * 2;
- dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
-
- dev_reserve += 1; /* btree write point */
- dev_reserve += 1; /* copygc write point */
- dev_reserve += 1; /* rebalance write point */
-
- dev_reserve *= ca->mi.bucket_size;
-
- capacity += bucket_to_sector(ca, ca->mi.nbuckets -
- ca->mi.first_bucket);
-
- reserved_sectors += dev_reserve * 2;
-
- bucket_size_max = max_t(unsigned, bucket_size_max,
- ca->mi.bucket_size);
- }
-
- bch2_set_ra_pages(c, ra_pages);
-
- gc_reserve = c->opts.gc_reserve_bytes
- ? c->opts.gc_reserve_bytes >> 9
- : div64_u64(capacity * c->opts.gc_reserve_percent, 100);
-
- reserved_sectors = max(gc_reserve, reserved_sectors);
-
- reserved_sectors = min(reserved_sectors, capacity);
-
- c->reserved = reserved_sectors;
- c->capacity = capacity - reserved_sectors;
-
- c->bucket_size_max = bucket_size_max;
-
- /* Wake up in case someone was waiting for buckets */
- closure_wake_up(&c->freelist_wait);
-}
-
-u64 bch2_min_rw_member_capacity(struct bch_fs *c)
-{
- u64 ret = U64_MAX;
-
- guard(rcu)();
- for_each_rw_member_rcu(c, ca)
- ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
- return ret;
-}
-
-static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
-{
- struct open_bucket *ob;
-
- for (ob = c->open_buckets;
- ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
- ob++) {
- scoped_guard(spinlock, &ob->lock) {
- if (ob->valid && !ob->on_partial_list &&
- ob->dev == ca->dev_idx)
- return true;
- }
- }
-
- return false;
-}
-
-void bch2_dev_allocator_set_rw(struct bch_fs *c, struct bch_dev *ca, bool rw)
-{
- /* BCH_DATA_free == all rw devs */
-
- for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
- if (rw &&
- (i == BCH_DATA_free ||
- (ca->mi.data_allowed & BIT(i))))
- set_bit(ca->dev_idx, c->rw_devs[i].d);
- else
- clear_bit(ca->dev_idx, c->rw_devs[i].d);
-}
-
-/* device goes ro: */
-void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
-{
- lockdep_assert_held(&c->state_lock);
-
- /* First, remove device from allocation groups: */
- bch2_dev_allocator_set_rw(c, ca, false);
-
- c->rw_devs_change_count++;
-
- /*
- * Capacity is calculated based off of devices in allocation groups:
- */
- bch2_recalc_capacity(c);
-
- bch2_open_buckets_stop(c, ca, false);
-
- /*
- * Wake up threads that were blocked on allocation, so they can notice
- * the device can no longer be removed and the capacity has changed:
- */
- closure_wake_up(&c->freelist_wait);
-
- /*
- * journal_res_get() can block waiting for free space in the journal -
- * it needs to notice there may not be devices to allocate from anymore:
- */
- wake_up(&c->journal.wait);
-
- /* Now wait for any in flight writes: */
-
- closure_wait_event(&c->open_buckets_wait,
- !bch2_dev_has_open_write_point(c, ca));
-}
-
-/* device goes rw: */
-void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
-{
- lockdep_assert_held(&c->state_lock);
-
- bch2_dev_allocator_set_rw(c, ca, true);
- c->rw_devs_change_count++;
-}
-
-void bch2_dev_allocator_background_exit(struct bch_dev *ca)
-{
- darray_exit(&ca->discard_buckets_in_flight);
-}
-
-void bch2_dev_allocator_background_init(struct bch_dev *ca)
-{
- mutex_init(&ca->discard_buckets_in_flight_lock);
- INIT_WORK(&ca->discard_work, bch2_do_discards_work);
- INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work);
- INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work);
-}
-
-void bch2_fs_allocator_background_init(struct bch_fs *c)
-{
- spin_lock_init(&c->freelist_lock);
-}
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
deleted file mode 100644
index 0cc5adc55b6f..000000000000
--- a/fs/bcachefs/alloc_background.h
+++ /dev/null
@@ -1,361 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ALLOC_BACKGROUND_H
-#define _BCACHEFS_ALLOC_BACKGROUND_H
-
-#include "bcachefs.h"
-#include "alloc_types.h"
-#include "buckets.h"
-#include "debug.h"
-#include "super.h"
-
-/* How out of date a pointer gen is allowed to be: */
-#define BUCKET_GC_GEN_MAX 96U
-
-static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos)
-{
- guard(rcu)();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode);
- return ca && bucket_valid(ca, pos.offset);
-}
-
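-/*
- * Pack a device:bucket pos into a single u64, with the device index above bit
- * 48 and the bucket offset in the low 48 bits (the inverse of u64_to_bucket()):
- */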
-static inline u64 bucket_to_u64(struct bpos bucket)
-{
- return (bucket.inode << 48) | bucket.offset;
-}
-
-static inline struct bpos u64_to_bucket(u64 bucket)
-{
- return POS(bucket >> 48, bucket & ~(~0ULL << 48));
-}
-
-static inline u8 alloc_gc_gen(struct bch_alloc_v4 a)
-{
- return a.gen - a.oldest_gen;
-}
-
-static inline void alloc_to_bucket(struct bucket *dst, struct bch_alloc_v4 src)
-{
- dst->gen = src.gen;
- dst->data_type = src.data_type;
- dst->stripe_sectors = src.stripe_sectors;
- dst->dirty_sectors = src.dirty_sectors;
- dst->cached_sectors = src.cached_sectors;
- dst->stripe = src.stripe;
-}
-
-static inline void __bucket_m_to_alloc(struct bch_alloc_v4 *dst, struct bucket src)
-{
- dst->gen = src.gen;
- dst->data_type = src.data_type;
- dst->stripe_sectors = src.stripe_sectors;
- dst->dirty_sectors = src.dirty_sectors;
- dst->cached_sectors = src.cached_sectors;
- dst->stripe = src.stripe;
-}
-
-static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b)
-{
- struct bch_alloc_v4 ret = {};
- __bucket_m_to_alloc(&ret, b);
- return ret;
-}
-
-static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type)
-{
- switch (data_type) {
- case BCH_DATA_cached:
- case BCH_DATA_stripe:
- return BCH_DATA_user;
- default:
- return data_type;
- }
-}
-
-static inline bool bucket_data_type_mismatch(enum bch_data_type bucket,
- enum bch_data_type ptr)
-{
- return !data_type_is_empty(bucket) &&
- bucket_data_type(bucket) != bucket_data_type(ptr);
-}
-
-/*
- * It is my general preference to use unsigned types for unsigned quantities -
- * however, these helpers are used in disk accounting calculations run by
- * triggers where the output will be negated and added to an s64. unsigned is
- * right out even though all these quantities will fit in 32 bits, since it
- * won't be sign extended correctly; u64 will negate "correctly", but s64 is the
- * simpler option here.
- */
-static inline s64 bch2_bucket_sectors_total(struct bch_alloc_v4 a)
-{
- return a.stripe_sectors + a.dirty_sectors + a.cached_sectors;
-}
-
-static inline s64 bch2_bucket_sectors_dirty(struct bch_alloc_v4 a)
-{
- return a.stripe_sectors + a.dirty_sectors;
-}
-
-static inline s64 bch2_bucket_sectors(struct bch_alloc_v4 a)
-{
- return a.data_type == BCH_DATA_cached
- ? a.cached_sectors
- : bch2_bucket_sectors_dirty(a);
-}
-
-static inline s64 bch2_bucket_sectors_fragmented(struct bch_dev *ca,
- struct bch_alloc_v4 a)
-{
- int d = bch2_bucket_sectors(a);
-
- return d ? max(0, ca->mi.bucket_size - d) : 0;
-}
-
-static inline s64 bch2_gc_bucket_sectors_fragmented(struct bch_dev *ca, struct bucket a)
-{
- int d = a.stripe_sectors + a.dirty_sectors;
-
- return d ? max(0, ca->mi.bucket_size - d) : 0;
-}
-
-static inline s64 bch2_bucket_sectors_unstriped(struct bch_alloc_v4 a)
-{
- return a.data_type == BCH_DATA_stripe ? a.dirty_sectors : 0;
-}
-
-static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
- enum bch_data_type data_type)
-{
- if (a.stripe)
- return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe;
- if (bch2_bucket_sectors_dirty(a))
- return bucket_data_type(data_type);
- if (a.cached_sectors)
- return BCH_DATA_cached;
- if (BCH_ALLOC_V4_NEED_DISCARD(&a))
- return BCH_DATA_need_discard;
- if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX)
- return BCH_DATA_need_gc_gens;
- return BCH_DATA_free;
-}
-
-static inline void alloc_data_type_set(struct bch_alloc_v4 *a, enum bch_data_type data_type)
-{
- a->data_type = alloc_data_type(*a, data_type);
-}
-
-static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a)
-{
- return a.data_type == BCH_DATA_cached
- ? a.io_time[READ] & LRU_TIME_MAX
- : 0;
-}
-
-#define DATA_TYPES_MOVABLE \
- ((1U << BCH_DATA_btree)| \
- (1U << BCH_DATA_user)| \
- (1U << BCH_DATA_stripe))
-
-static inline bool data_type_movable(enum bch_data_type type)
-{
- return (1U << type) & DATA_TYPES_MOVABLE;
-}
-
-static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a,
- struct bch_dev *ca)
-{
- if (a.data_type >= BCH_DATA_NR)
- return 0;
-
- if (!data_type_movable(a.data_type) ||
- !bch2_bucket_sectors_fragmented(ca, a))
- return 0;
-
- /*
- * avoid overflowing LRU_TIME_BITS on a corrupted fs, when
- * bucket_sectors_dirty is (much) bigger than bucket_size
- */
- u64 d = min_t(s64, bch2_bucket_sectors_dirty(a),
- ca->mi.bucket_size);
-
- return div_u64(d * (1ULL << 31), ca->mi.bucket_size);
-}
-
-static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a)
-{
- return ((u64) alloc_gc_gen(a) >> 4) << 56;
-}
-
-static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a)
-{
- pos.offset |= alloc_freespace_genbits(a);
- return pos;
-}
-
-static inline unsigned alloc_v4_u64s_noerror(const struct bch_alloc_v4 *a)
-{
- return (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
- BCH_ALLOC_V4_U64s_V0) +
- BCH_ALLOC_V4_NR_BACKPOINTERS(a) *
- (sizeof(struct bch_backpointer) / sizeof(u64));
-}
-
-static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a)
-{
- unsigned ret = alloc_v4_u64s_noerror(a);
- BUG_ON(ret > U8_MAX - BKEY_U64s);
- return ret;
-}
-
-static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a)
-{
- set_bkey_val_u64s(&a->k, alloc_v4_u64s(&a->v));
-}
-
-struct bkey_i_alloc_v4 *
-bch2_trans_start_alloc_update_noupdate(struct btree_trans *, struct btree_iter *, struct bpos);
-struct bkey_i_alloc_v4 *
-bch2_trans_start_alloc_update(struct btree_trans *, struct bpos,
- enum btree_iter_update_trigger_flags);
-
-void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *);
-
-static inline const struct bch_alloc_v4 *bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *convert)
-{
- const struct bch_alloc_v4 *ret;
-
- if (unlikely(k.k->type != KEY_TYPE_alloc_v4))
- goto slowpath;
-
- ret = bkey_s_c_to_alloc_v4(k).v;
- if (BCH_ALLOC_V4_BACKPOINTERS_START(ret) != BCH_ALLOC_V4_U64s)
- goto slowpath;
-
- return ret;
-slowpath:
- __bch2_alloc_to_v4(k, convert);
- return convert;
-}
-
-struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c);
-
-int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
-
-int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-int bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_alloc_v4_swab(struct bkey_s);
-void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-void bch2_alloc_v4_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_alloc ((struct bkey_ops) { \
- .key_validate = bch2_alloc_v1_validate, \
- .val_to_text = bch2_alloc_to_text, \
- .trigger = bch2_trigger_alloc, \
- .min_val_size = 8, \
-})
-
-#define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \
- .key_validate = bch2_alloc_v2_validate, \
- .val_to_text = bch2_alloc_to_text, \
- .trigger = bch2_trigger_alloc, \
- .min_val_size = 8, \
-})
-
-#define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \
- .key_validate = bch2_alloc_v3_validate, \
- .val_to_text = bch2_alloc_to_text, \
- .trigger = bch2_trigger_alloc, \
- .min_val_size = 16, \
-})
-
-#define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \
- .key_validate = bch2_alloc_v4_validate, \
- .val_to_text = bch2_alloc_v4_to_text, \
- .swab = bch2_alloc_v4_swab, \
- .trigger = bch2_trigger_alloc, \
- .min_val_size = 48, \
-})
-
-int bch2_bucket_gens_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \
- .key_validate = bch2_bucket_gens_validate, \
- .val_to_text = bch2_bucket_gens_to_text, \
-})
-
-int bch2_bucket_gens_init(struct bch_fs *);
-
-static inline bool bkey_is_alloc(const struct bkey *k)
-{
- return k->type == KEY_TYPE_alloc ||
- k->type == KEY_TYPE_alloc_v2 ||
- k->type == KEY_TYPE_alloc_v3;
-}
-
-int bch2_alloc_read(struct bch_fs *);
-
-int bch2_alloc_key_to_dev_counters(struct btree_trans *, struct bch_dev *,
- const struct bch_alloc_v4 *,
- const struct bch_alloc_v4 *, unsigned);
-int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-
-int bch2_check_discard_freespace_key(struct btree_trans *, struct btree_iter *, u8 *, bool);
-int bch2_check_alloc_info(struct bch_fs *);
-int bch2_check_alloc_to_lru_refs(struct bch_fs *);
-void bch2_dev_do_discards(struct bch_dev *);
-void bch2_do_discards(struct bch_fs *);
-
-static inline u64 should_invalidate_buckets(struct bch_dev *ca,
- struct bch_dev_usage u)
-{
- u64 want_free = ca->mi.nbuckets >> 7;
- u64 free = max_t(s64, 0,
- u.buckets[BCH_DATA_free]
- + u.buckets[BCH_DATA_need_discard]
- - bch2_dev_buckets_reserved(ca, BCH_WATERMARK_stripe));
-
- return clamp_t(s64, want_free - free, 0, u.buckets[BCH_DATA_cached]);
-}
-
-void bch2_dev_do_invalidates(struct bch_dev *);
-void bch2_do_invalidates(struct bch_fs *);
-
-static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a)
-{
- return (void *) ((u64 *) &a->v +
- (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
- BCH_ALLOC_V4_U64s_V0));
-}
-
-static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct bch_alloc_v4 *a)
-{
- return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a));
-}
-
-int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64);
-int bch2_fs_freespace_init(struct bch_fs *);
-int bch2_dev_remove_alloc(struct bch_fs *, struct bch_dev *);
-
-void bch2_recalc_capacity(struct bch_fs *);
-u64 bch2_min_rw_member_capacity(struct bch_fs *);
-
-void bch2_dev_allocator_set_rw(struct bch_fs *, struct bch_dev *, bool);
-void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
-void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
-
-void bch2_dev_allocator_background_exit(struct bch_dev *);
-void bch2_dev_allocator_background_init(struct bch_dev *);
-
-void bch2_fs_allocator_background_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h
deleted file mode 100644
index 740238369a5a..000000000000
--- a/fs/bcachefs/alloc_background_format.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H
-#define _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H
-
-struct bch_alloc {
- struct bch_val v;
- __u8 fields;
- __u8 gen;
- __u8 data[];
-} __packed __aligned(8);
-
-#define BCH_ALLOC_FIELDS_V1() \
- x(read_time, 16) \
- x(write_time, 16) \
- x(data_type, 8) \
- x(dirty_sectors, 16) \
- x(cached_sectors, 16) \
- x(oldest_gen, 8) \
- x(stripe, 32) \
- x(stripe_redundancy, 8)
-
-enum {
-#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
- BCH_ALLOC_FIELDS_V1()
-#undef x
-};
-
-struct bch_alloc_v2 {
- struct bch_val v;
- __u8 nr_fields;
- __u8 gen;
- __u8 oldest_gen;
- __u8 data_type;
- __u8 data[];
-} __packed __aligned(8);
-
-#define BCH_ALLOC_FIELDS_V2() \
- x(read_time, 64) \
- x(write_time, 64) \
- x(dirty_sectors, 32) \
- x(cached_sectors, 32) \
- x(stripe, 32) \
- x(stripe_redundancy, 8)
-
-struct bch_alloc_v3 {
- struct bch_val v;
- __le64 journal_seq;
- __le32 flags;
- __u8 nr_fields;
- __u8 gen;
- __u8 oldest_gen;
- __u8 data_type;
- __u8 data[];
-} __packed __aligned(8);
-
-LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1)
-LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2)
-
-struct bch_alloc_v4 {
- struct bch_val v;
- __u64 journal_seq_nonempty;
- __u32 flags;
- __u8 gen;
- __u8 oldest_gen;
- __u8 data_type;
- __u8 stripe_redundancy;
- __u32 dirty_sectors;
- __u32 cached_sectors;
- __u64 io_time[2];
- __u32 stripe;
- __u32 nr_external_backpointers;
- /* end of fields in original version of alloc_v4 */
- __u64 journal_seq_empty;
- __u32 stripe_sectors;
- __u32 pad;
-} __packed __aligned(8);
-
-#define BCH_ALLOC_V4_U64s_V0 6
-#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64))
-
-BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1)
-BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2)
-BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8)
-BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14)
-
-#define KEY_TYPE_BUCKET_GENS_BITS 8
-#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS)
-#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1)
-
-struct bch_bucket_gens {
- struct bch_val v;
- u8 gens[KEY_TYPE_BUCKET_GENS_NR];
-} __packed __aligned(8);
-
-#endif /* _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H */
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
deleted file mode 100644
index b58525ec7b4d..000000000000
--- a/fs/bcachefs/alloc_foreground.c
+++ /dev/null
@@ -1,1683 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright 2012 Google, Inc.
- *
- * Foreground allocator code: allocate buckets from freelist, and allocate in
- * sector granularity from writepoints.
- *
- * bch2_bucket_alloc() allocates a single bucket from a specific device.
- *
- * bch2_bucket_alloc_set() allocates one or more buckets from different devices
- * in a given filesystem.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "btree_gc.h"
-#include "buckets.h"
-#include "buckets_waiting_for_journal.h"
-#include "clock.h"
-#include "debug.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "error.h"
-#include "io_write.h"
-#include "journal.h"
-#include "movinggc.h"
-#include "nocow_locking.h"
-#include "trace.h"
-
-#include <linux/math64.h>
-#include <linux/rculist.h>
-#include <linux/rcupdate.h>
-
-static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans,
- struct mutex *lock)
-{
- if (!mutex_trylock(lock)) {
- bch2_trans_unlock(trans);
- mutex_lock(lock);
- }
-}
-
-const char * const bch2_watermarks[] = {
-#define x(t) #t,
- BCH_WATERMARKS()
-#undef x
- NULL
-};
-
-/*
- * Open buckets represent a bucket that's currently being allocated from. They
- * serve two purposes:
- *
- * - They track buckets that have been partially allocated, allowing for
- * sub-bucket sized allocations - they're used by the sector allocator below
- *
- * - They provide a reference to the buckets they own that mark and sweep GC
- * can find, until the new allocation has a pointer to it inserted into the
- * btree
- *
- * When allocating some space with the sector allocator, the allocation comes
- * with a reference to an open bucket - the caller is required to put that
- * reference _after_ doing the index update that makes its allocation reachable.
- */
-
-void bch2_reset_alloc_cursors(struct bch_fs *c)
-{
- guard(rcu)();
- for_each_member_device_rcu(c, ca, NULL)
- memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor));
-}
-
-static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
-{
- open_bucket_idx_t idx = ob - c->open_buckets;
- open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
-
- ob->hash = *slot;
- *slot = idx;
-}
-
-static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob)
-{
- open_bucket_idx_t idx = ob - c->open_buckets;
- open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
-
- while (*slot != idx) {
- BUG_ON(!*slot);
- slot = &c->open_buckets[*slot].hash;
- }
-
- *slot = ob->hash;
- ob->hash = 0;
-}
-
-void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
-{
- struct bch_dev *ca = ob_dev(c, ob);
-
- if (ob->ec) {
- ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);
- return;
- }
-
- spin_lock(&ob->lock);
- ob->valid = false;
- ob->data_type = 0;
- spin_unlock(&ob->lock);
-
- spin_lock(&c->freelist_lock);
- bch2_open_bucket_hash_remove(c, ob);
-
- ob->freelist = c->open_buckets_freelist;
- c->open_buckets_freelist = ob - c->open_buckets;
-
- c->open_buckets_nr_free++;
- ca->nr_open_buckets--;
- spin_unlock(&c->freelist_lock);
-
- closure_wake_up(&c->open_buckets_wait);
-}
-
-void bch2_open_bucket_write_error(struct bch_fs *c,
- struct open_buckets *obs,
- unsigned dev, int err)
-{
- struct open_bucket *ob;
- unsigned i;
-
- open_bucket_for_each(c, obs, ob, i)
- if (ob->dev == dev && ob->ec)
- bch2_ec_bucket_cancel(c, ob, err);
-}
-
-static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
-{
- struct open_bucket *ob;
-
- BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);
-
- ob = c->open_buckets + c->open_buckets_freelist;
- c->open_buckets_freelist = ob->freelist;
- atomic_set(&ob->pin, 1);
- ob->data_type = 0;
-
- c->open_buckets_nr_free--;
- return ob;
-}
-
-static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
-{
- if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_trans_mark_dev_sbs))
- return false;
-
- return bch2_is_superblock_bucket(ca, b);
-}
-
-static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
-{
- BUG_ON(c->open_buckets_partial_nr >=
- ARRAY_SIZE(c->open_buckets_partial));
-
- spin_lock(&c->freelist_lock);
- scoped_guard(rcu)
- bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++;
-
- ob->on_partial_list = true;
- c->open_buckets_partial[c->open_buckets_partial_nr++] =
- ob - c->open_buckets;
- spin_unlock(&c->freelist_lock);
-
- closure_wake_up(&c->open_buckets_wait);
- closure_wake_up(&c->freelist_wait);
-}
-
-static inline bool may_alloc_bucket(struct bch_fs *c,
- struct alloc_request *req,
- struct bpos bucket)
-{
- if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) {
- req->counters.skipped_open++;
- return false;
- }
-
- u64 journal_seq_ready =
- bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal,
- bucket.inode, bucket.offset);
- if (journal_seq_ready > c->journal.flushed_seq_ondisk) {
- if (journal_seq_ready > c->journal.flushing_seq)
- req->counters.need_journal_commit++;
- req->counters.skipped_need_journal_commit++;
- return false;
- }
-
- if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) {
- req->counters.skipped_nocow++;
- return false;
- }
-
- return true;
-}
-
-static struct open_bucket *__try_alloc_bucket(struct bch_fs *c,
- struct alloc_request *req,
- u64 bucket, u8 gen,
- struct closure *cl)
-{
- struct bch_dev *ca = req->ca;
-
- if (unlikely(is_superblock_bucket(c, ca, bucket)))
- return NULL;
-
- if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
- req->counters.skipped_nouse++;
- return NULL;
- }
-
- spin_lock(&c->freelist_lock);
-
- if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(req->watermark))) {
- if (cl)
- closure_wait(&c->open_buckets_wait, cl);
-
- track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
- spin_unlock(&c->freelist_lock);
- return ERR_PTR(bch_err_throw(c, open_buckets_empty));
- }
-
- /* Recheck under lock: */
- if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
- spin_unlock(&c->freelist_lock);
- req->counters.skipped_open++;
- return NULL;
- }
-
- struct open_bucket *ob = bch2_open_bucket_alloc(c);
-
- spin_lock(&ob->lock);
- ob->valid = true;
- ob->sectors_free = ca->mi.bucket_size;
- ob->dev = ca->dev_idx;
- ob->gen = gen;
- ob->bucket = bucket;
- spin_unlock(&ob->lock);
-
- ca->nr_open_buckets++;
- bch2_open_bucket_hash_add(c, ob);
-
- track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false);
- track_event_change(&c->times[BCH_TIME_blocked_allocate], false);
-
- spin_unlock(&c->freelist_lock);
- return ob;
-}
-
-static struct open_bucket *try_alloc_bucket(struct btree_trans *trans,
- struct alloc_request *req,
- struct btree_iter *freespace_iter,
- struct closure *cl)
-{
- struct bch_fs *c = trans->c;
- u64 b = freespace_iter->pos.offset & ~(~0ULL << 56);
-
- if (!may_alloc_bucket(c, req, POS(req->ca->dev_idx, b)))
- return NULL;
-
- u8 gen;
- int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen, true);
- if (ret < 0)
- return ERR_PTR(ret);
- if (ret)
- return NULL;
-
- return __try_alloc_bucket(c, req, b, gen, cl);
-}
-
-/*
- * This path is used before the freespace btree is initialized:
- */
-static noinline struct open_bucket *
-bch2_bucket_alloc_early(struct btree_trans *trans,
- struct alloc_request *req,
- struct closure *cl)
-{
- struct bch_fs *c = trans->c;
- struct bch_dev *ca = req->ca;
- struct btree_iter iter, citer;
- struct bkey_s_c k, ck;
- struct open_bucket *ob = NULL;
- u64 first_bucket = ca->mi.first_bucket;
- u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap];
- u64 alloc_start = max(first_bucket, *dev_alloc_cursor);
- u64 alloc_cursor = alloc_start;
- int ret;
-
- /*
- * Scan with an uncached iterator to avoid polluting the key cache. An
- * uncached iter will return a cached key if one exists, but if not
- * there is no other underlying protection for the associated key cache
- * slot. To avoid racing bucket allocations, look up the cached key slot
- * of any likely allocation candidate before attempting to proceed with
- * the allocation. This provides proper exclusion on the associated
- * bucket.
- */
-again:
- for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
- BTREE_ITER_slots, k, ret) {
- u64 bucket = k.k->p.offset;
-
- if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
- break;
-
- if (req->btree_bitmap != BTREE_BITMAP_ANY &&
- req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
- bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
- if (req->btree_bitmap == BTREE_BITMAP_YES &&
- bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
- break;
-
- bucket = sector_to_bucket(ca,
- round_up(bucket_to_sector(ca, bucket) + 1,
- 1ULL << ca->mi.btree_bitmap_shift));
- bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, bucket));
- req->counters.buckets_seen++;
- req->counters.skipped_mi_btree_bitmap++;
- continue;
- }
-
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
- if (a->data_type != BCH_DATA_free)
- continue;
-
- /* now check the cached key to serialize concurrent allocs of the bucket */
- ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_cached);
- ret = bkey_err(ck);
- if (ret)
- break;
-
- a = bch2_alloc_to_v4(ck, &a_convert);
- if (a->data_type != BCH_DATA_free)
- goto next;
-
- req->counters.buckets_seen++;
-
- ob = may_alloc_bucket(c, req, k.k->p)
- ? __try_alloc_bucket(c, req, k.k->p.offset, a->gen, cl)
- : NULL;
-next:
- bch2_set_btree_iter_dontneed(trans, &citer);
- bch2_trans_iter_exit(trans, &citer);
- if (ob)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
-
- alloc_cursor = iter.pos.offset;
-
- if (!ob && ret)
- ob = ERR_PTR(ret);
-
- if (!ob && alloc_start > first_bucket) {
- alloc_cursor = alloc_start = first_bucket;
- goto again;
- }
-
- *dev_alloc_cursor = alloc_cursor;
-
- return ob;
-}
-
-static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
- struct alloc_request *req,
- struct closure *cl)
-{
- struct bch_dev *ca = req->ca;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct open_bucket *ob = NULL;
- u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap];
- u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor));
- u64 alloc_cursor = alloc_start;
- int ret;
-again:
- for_each_btree_key_max_norestart(trans, iter, BTREE_ID_freespace,
- POS(ca->dev_idx, alloc_cursor),
- POS(ca->dev_idx, U64_MAX),
- 0, k, ret) {
- /*
- * peek normally doesn't trim extents - they can span iter.pos,
- * which is not what we want here:
- */
- iter.k.size = iter.k.p.offset - iter.pos.offset;
-
- while (iter.k.size) {
- req->counters.buckets_seen++;
-
- u64 bucket = iter.pos.offset & ~(~0ULL << 56);
- if (req->btree_bitmap != BTREE_BITMAP_ANY &&
- req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
- bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
- if (req->btree_bitmap == BTREE_BITMAP_YES &&
- bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
- goto fail;
-
- bucket = sector_to_bucket(ca,
- round_up(bucket_to_sector(ca, bucket + 1),
- 1ULL << ca->mi.btree_bitmap_shift));
- alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56));
-
- bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, alloc_cursor));
- req->counters.skipped_mi_btree_bitmap++;
- goto next;
- }
-
- ob = try_alloc_bucket(trans, req, &iter, cl);
- if (ob) {
- if (!IS_ERR(ob))
- *dev_alloc_cursor = iter.pos.offset;
- bch2_set_btree_iter_dontneed(trans, &iter);
- break;
- }
-
- iter.k.size--;
- iter.pos.offset++;
- }
-next:
- if (ob || ret)
- break;
- }
-fail:
- bch2_trans_iter_exit(trans, &iter);
-
- BUG_ON(ob && ret);
-
- if (ret)
- ob = ERR_PTR(ret);
-
- if (!ob && alloc_start > ca->mi.first_bucket) {
- alloc_cursor = alloc_start = ca->mi.first_bucket;
- goto again;
- }
-
- return ob;
-}
-
-static noinline void trace_bucket_alloc2(struct bch_fs *c,
- struct alloc_request *req,
- struct closure *cl,
- struct open_bucket *ob)
-{
- struct printbuf buf = PRINTBUF;
-
- printbuf_tabstop_push(&buf, 24);
-
- prt_printf(&buf, "dev\t%s (%u)\n", req->ca->name, req->ca->dev_idx);
- prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[req->watermark]);
- prt_printf(&buf, "data type\t%s\n", __bch2_data_types[req->data_type]);
- prt_printf(&buf, "blocking\t%u\n", cl != NULL);
- prt_printf(&buf, "free\t%llu\n", req->usage.buckets[BCH_DATA_free]);
- prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(req->ca, req->usage, req->watermark));
- prt_printf(&buf, "copygc_wait\t%llu/%lli\n",
- bch2_copygc_wait_amount(c),
- c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now));
- prt_printf(&buf, "seen\t%llu\n", req->counters.buckets_seen);
- prt_printf(&buf, "open\t%llu\n", req->counters.skipped_open);
- prt_printf(&buf, "need journal commit\t%llu\n", req->counters.skipped_need_journal_commit);
- prt_printf(&buf, "nocow\t%llu\n", req->counters.skipped_nocow);
- prt_printf(&buf, "nouse\t%llu\n", req->counters.skipped_nouse);
- prt_printf(&buf, "mi_btree_bitmap\t%llu\n", req->counters.skipped_mi_btree_bitmap);
-
- if (!IS_ERR(ob)) {
- prt_printf(&buf, "allocated\t%llu\n", ob->bucket);
- trace_bucket_alloc(c, buf.buf);
- } else {
- prt_printf(&buf, "err\t%s\n", bch2_err_str(PTR_ERR(ob)));
- trace_bucket_alloc_fail(c, buf.buf);
- }
-
- printbuf_exit(&buf);
-}
-
-/**
- * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
- * @trans: transaction object
- * @req: state for the entire allocation
- * @cl: if not NULL, closure to be used to wait if buckets not available
- * @nowait: if true, do not wait for buckets to become available
- *
- * Returns: an open_bucket on success, or an ERR_PTR() on failure.
- */
-static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
- struct alloc_request *req,
- struct closure *cl,
- bool nowait)
-{
- struct bch_fs *c = trans->c;
- struct bch_dev *ca = req->ca;
- struct open_bucket *ob = NULL;
- bool freespace = READ_ONCE(ca->mi.freespace_initialized);
- u64 avail;
- bool waiting = nowait;
-
- req->btree_bitmap = req->data_type == BCH_DATA_btree;
- memset(&req->counters, 0, sizeof(req->counters));
-again:
- bch2_dev_usage_read_fast(ca, &req->usage);
- avail = dev_buckets_free(ca, req->usage, req->watermark);
-
- if (req->usage.buckets[BCH_DATA_need_discard] >
- min(avail, ca->mi.nbuckets >> 7))
- bch2_dev_do_discards(ca);
-
- if (req->usage.buckets[BCH_DATA_need_gc_gens] > avail)
- bch2_gc_gens_async(c);
-
- if (should_invalidate_buckets(ca, req->usage))
- bch2_dev_do_invalidates(ca);
-
- if (!avail) {
- if (req->watermark > BCH_WATERMARK_normal &&
- c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations)
- goto alloc;
-
- if (cl && !waiting) {
- closure_wait(&c->freelist_wait, cl);
- waiting = true;
- goto again;
- }
-
- track_event_change(&c->times[BCH_TIME_blocked_allocate], true);
-
- ob = ERR_PTR(bch_err_throw(c, freelist_empty));
- goto err;
- }
-
- if (waiting)
- closure_wake_up(&c->freelist_wait);
-alloc:
- ob = likely(freespace)
- ? bch2_bucket_alloc_freelist(trans, req, cl)
- : bch2_bucket_alloc_early(trans, req, cl);
-
- if (req->counters.need_journal_commit * 2 > avail)
- bch2_journal_flush_async(&c->journal, NULL);
-
- if (!ob && req->btree_bitmap != BTREE_BITMAP_ANY) {
- req->btree_bitmap = BTREE_BITMAP_ANY;
- goto alloc;
- }
-
- if (!ob && freespace && c->recovery.pass_done < BCH_RECOVERY_PASS_check_alloc_info) {
- freespace = false;
- goto alloc;
- }
-err:
- if (!ob)
- ob = ERR_PTR(bch_err_throw(c, no_buckets_found));
-
- if (!IS_ERR(ob))
- ob->data_type = req->data_type;
-
- if (!IS_ERR(ob))
- count_event(c, bucket_alloc);
- else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
- count_event(c, bucket_alloc_fail);
-
- if (!IS_ERR(ob)
- ? trace_bucket_alloc_enabled()
- : trace_bucket_alloc_fail_enabled())
- trace_bucket_alloc2(c, req, cl, ob);
-
- return ob;
-}
-
-struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
- enum bch_watermark watermark,
- enum bch_data_type data_type,
- struct closure *cl)
-{
- struct open_bucket *ob;
- struct alloc_request req = {
- .watermark = watermark,
- .data_type = data_type,
- .ca = ca,
- };
-
- bch2_trans_do(c,
- PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req, cl, false)));
- return ob;
-}
-
-static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
- unsigned l, unsigned r)
-{
- return cmp_int(stripe->next_alloc[l], stripe->next_alloc[r]);
-}
-
-#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
-
-void bch2_dev_alloc_list(struct bch_fs *c,
- struct dev_stripe_state *stripe,
- struct bch_devs_mask *devs,
- struct dev_alloc_list *ret)
-{
- ret->nr = 0;
-
- unsigned i;
- for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
- ret->data[ret->nr++] = i;
-
- bubble_sort(ret->data, ret->nr, dev_stripe_cmp);
-}
-
-static const u64 stripe_clock_hand_rescale = 1ULL << 62; /* trigger rescale at */
-static const u64 stripe_clock_hand_max = 1ULL << 56; /* max after rescale */
-static const u64 stripe_clock_hand_inv = 1ULL << 52; /* max increment, if a device is empty */
-
-static noinline void bch2_stripe_state_rescale(struct dev_stripe_state *stripe)
-{
- /*
- * Avoid underflowing clock hands if at all possible: if clock hands go
- * to 0 we lose information. Clock hands can end up spanning a wide range
- * when we have devices we rarely try to allocate from - e.g. if we
- * generally allocate from a specified target but only sometimes have to
- * fall back to the whole filesystem.
- */
- u64 scale_max = U64_MAX; /* maximum we can subtract without underflow */
- u64 scale_min = 0; /* minimum we must subtract to avoid overflow */
-
- for (u64 *v = stripe->next_alloc;
- v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) {
- if (*v)
- scale_max = min(scale_max, *v);
- if (*v > stripe_clock_hand_max)
- scale_min = max(scale_min, *v - stripe_clock_hand_max);
- }
-
- u64 scale = max(scale_min, scale_max);
-
- for (u64 *v = stripe->next_alloc;
- v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
- *v = *v < scale ? 0 : *v - scale;
-}
-
-static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
- struct dev_stripe_state *stripe,
- struct bch_dev_usage *usage)
-{
- /*
- * Stripe state has a per device clock hand: we allocate from the device
- * with the smallest clock hand.
- *
- * When we allocate, we don't do a simple increment; we add the inverse
- * of the device's free space. This results in round robin behavior that
- * biases in favor of the device(s) with more free space.
- */
-
- u64 *v = stripe->next_alloc + ca->dev_idx;
- u64 free_space = __dev_buckets_available(ca, *usage, BCH_WATERMARK_normal);
- u64 free_space_inv = free_space
- ? div64_u64(stripe_clock_hand_inv, free_space)
- : stripe_clock_hand_inv;
-
- /* Saturating add, avoid overflow: */
- u64 sum = *v + free_space_inv;
- *v = sum >= *v ? sum : U64_MAX;
-
- if (unlikely(*v > stripe_clock_hand_rescale))
- bch2_stripe_state_rescale(stripe);
-}
-
-void bch2_dev_stripe_increment(struct bch_dev *ca,
- struct dev_stripe_state *stripe)
-{
- struct bch_dev_usage usage;
-
- bch2_dev_usage_read_fast(ca, &usage);
- bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
-}
-
-static int add_new_bucket(struct bch_fs *c,
- struct alloc_request *req,
- struct open_bucket *ob)
-{
- unsigned durability = ob_dev(c, ob)->mi.durability;
-
- BUG_ON(req->nr_effective >= req->nr_replicas);
-
- __clear_bit(ob->dev, req->devs_may_alloc.d);
- req->nr_effective += durability;
- req->have_cache |= !durability;
-
- ob_push(c, &req->ptrs, ob);
-
- if (req->nr_effective >= req->nr_replicas)
- return 1;
- if (ob->ec)
- return 1;
- return 0;
-}
-
-inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
- struct alloc_request *req,
- struct dev_stripe_state *stripe,
- struct closure *cl)
-{
- struct bch_fs *c = trans->c;
- int ret = 0;
-
- BUG_ON(req->nr_effective >= req->nr_replicas);
-
- bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc, &req->devs_sorted);
-
- darray_for_each(req->devs_sorted, i) {
- req->ca = bch2_dev_tryget_noerror(c, *i);
- if (!req->ca)
- continue;
-
- if (!req->ca->mi.durability && req->have_cache) {
- bch2_dev_put(req->ca);
- continue;
- }
-
- struct open_bucket *ob = bch2_bucket_alloc_trans(trans, req, cl,
- req->flags & BCH_WRITE_alloc_nowait);
- if (!IS_ERR(ob))
- bch2_dev_stripe_increment_inlined(req->ca, stripe, &req->usage);
- bch2_dev_put(req->ca);
-
- if (IS_ERR(ob)) {
- ret = PTR_ERR(ob);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl)
- break;
- continue;
- }
-
- ret = add_new_bucket(c, req, ob);
- if (ret)
- break;
- }
-
- if (ret == 1)
- return 0;
- if (ret)
- return ret;
- return bch_err_throw(c, insufficient_devices);
-}
-
-/* Allocate from stripes: */
-
-/*
- * if we can't allocate a new stripe because there are already too many
- * partially filled stripes, force allocating from an existing stripe even when
- * it's to a device we don't want:
- */
-
-static int bucket_alloc_from_stripe(struct btree_trans *trans,
- struct alloc_request *req,
- struct closure *cl)
-{
- struct bch_fs *c = trans->c;
- int ret = 0;
-
- if (req->nr_replicas < 2)
- return 0;
-
- if (ec_open_bucket(c, &req->ptrs))
- return 0;
-
- struct ec_stripe_head *h =
- bch2_ec_stripe_head_get(trans, req, 0, cl);
- if (IS_ERR(h))
- return PTR_ERR(h);
- if (!h)
- return 0;
-
- bch2_dev_alloc_list(c, &req->wp->stripe, &req->devs_may_alloc, &req->devs_sorted);
-
- darray_for_each(req->devs_sorted, i)
- for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
- if (!h->s->blocks[ec_idx])
- continue;
-
- struct open_bucket *ob = c->open_buckets + h->s->blocks[ec_idx];
- if (ob->dev == *i && !test_and_set_bit(ec_idx, h->s->blocks_allocated)) {
- ob->ec_idx = ec_idx;
- ob->ec = h->s;
- ec_stripe_new_get(h->s, STRIPE_REF_io);
-
- ret = add_new_bucket(c, req, ob);
- goto out;
- }
- }
-out:
- bch2_ec_stripe_head_put(c, h);
- return ret;
-}
-
-/* Sector allocator */
-
-static bool want_bucket(struct bch_fs *c,
- struct alloc_request *req,
- struct open_bucket *ob)
-{
- struct bch_dev *ca = ob_dev(c, ob);
-
- if (!test_bit(ob->dev, req->devs_may_alloc.d))
- return false;
-
- if (ob->data_type != req->wp->data_type)
- return false;
-
- if (!ca->mi.durability &&
- (req->wp->data_type == BCH_DATA_btree || req->ec || req->have_cache))
- return false;
-
- if (req->ec != (ob->ec != NULL))
- return false;
-
- return true;
-}
-
-static int bucket_alloc_set_writepoint(struct bch_fs *c,
- struct alloc_request *req)
-{
- struct open_bucket *ob;
- unsigned i;
- int ret = 0;
-
- req->scratch_ptrs.nr = 0;
-
- open_bucket_for_each(c, &req->wp->ptrs, ob, i) {
- if (!ret && want_bucket(c, req, ob))
- ret = add_new_bucket(c, req, ob);
- else
- ob_push(c, &req->scratch_ptrs, ob);
- }
- req->wp->ptrs = req->scratch_ptrs;
-
- return ret;
-}
-
-static int bucket_alloc_set_partial(struct bch_fs *c,
- struct alloc_request *req)
-{
- int i, ret = 0;
-
- if (!c->open_buckets_partial_nr)
- return 0;
-
- spin_lock(&c->freelist_lock);
-
- if (!c->open_buckets_partial_nr)
- goto unlock;
-
- for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
- struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];
-
- if (want_bucket(c, req, ob)) {
- struct bch_dev *ca = ob_dev(c, ob);
- u64 avail;
-
- bch2_dev_usage_read_fast(ca, &req->usage);
- avail = dev_buckets_free(ca, req->usage, req->watermark) + ca->nr_partial_buckets;
- if (!avail)
- continue;
-
- array_remove_item(c->open_buckets_partial,
- c->open_buckets_partial_nr,
- i);
- ob->on_partial_list = false;
-
- scoped_guard(rcu)
- bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
-
- ret = add_new_bucket(c, req, ob);
- if (ret)
- break;
- }
- }
-unlock:
- spin_unlock(&c->freelist_lock);
- return ret;
-}
-
-static int __open_bucket_add_buckets(struct btree_trans *trans,
- struct alloc_request *req,
- struct closure *_cl)
-{
- struct bch_fs *c = trans->c;
- struct open_bucket *ob;
- struct closure *cl = NULL;
- unsigned i;
- int ret;
-
- req->devs_may_alloc = target_rw_devs(c, req->wp->data_type, req->target);
-
- /* Don't allocate from devices we already have pointers to: */
- darray_for_each(*req->devs_have, i)
- __clear_bit(*i, req->devs_may_alloc.d);
-
- open_bucket_for_each(c, &req->ptrs, ob, i)
- __clear_bit(ob->dev, req->devs_may_alloc.d);
-
- ret = bucket_alloc_set_writepoint(c, req);
- if (ret)
- return ret;
-
- ret = bucket_alloc_set_partial(c, req);
- if (ret)
- return ret;
-
- if (req->ec) {
- ret = bucket_alloc_from_stripe(trans, req, _cl);
- } else {
-retry_blocking:
- /*
- * Try nonblocking first, so that if one device is full we'll try from
- * other devices:
- */
- ret = bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe, cl);
- if (ret &&
- !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
- !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
- !cl && _cl) {
- cl = _cl;
- goto retry_blocking;
- }
- }
-
- return ret;
-}
-
-static int open_bucket_add_buckets(struct btree_trans *trans,
- struct alloc_request *req,
- struct closure *cl)
-{
- int ret;
-
- if (req->ec && !ec_open_bucket(trans->c, &req->ptrs)) {
- ret = __open_bucket_add_buckets(trans, req, cl);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
- bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
- bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
- bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
- return ret;
- if (req->nr_effective >= req->nr_replicas)
- return 0;
- }
-
- bool ec = false;
- swap(ec, req->ec);
- ret = __open_bucket_add_buckets(trans, req, cl);
- swap(ec, req->ec);
-
- return ret < 0 ? ret : 0;
-}
-
-/**
- * should_drop_bucket - check if this open_bucket should go away
- * @ob: open_bucket to predicate on
- * @c: filesystem handle
- * @ca: if set, we're killing buckets for a particular device
- * @ec: if true, we're shutting down erasure coding and killing all ec
- * open_buckets
- * Returns: true if we should kill this open_bucket; if neither @ca nor @ec
- * is given, every open_bucket matches
- *
- * We're killing open_buckets because we're shutting down a device, erasure
- * coding, or the entire filesystem - check if this open_bucket matches:
- */
-static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
- struct bch_dev *ca, bool ec)
-{
- if (ec) {
- return ob->ec != NULL;
- } else if (ca) {
- bool drop = ob->dev == ca->dev_idx;
- struct open_bucket *ob2;
- unsigned i;
-
- if (!drop && ob->ec) {
- unsigned nr_blocks;
-
- mutex_lock(&ob->ec->lock);
- nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks;
-
- for (i = 0; i < nr_blocks; i++) {
- if (!ob->ec->blocks[i])
- continue;
-
- ob2 = c->open_buckets + ob->ec->blocks[i];
- drop |= ob2->dev == ca->dev_idx;
- }
- mutex_unlock(&ob->ec->lock);
- }
-
- return drop;
- } else {
- return true;
- }
-}
-
-static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
- bool ec, struct write_point *wp)
-{
- struct open_buckets ptrs = { .nr = 0 };
- struct open_bucket *ob;
- unsigned i;
-
- mutex_lock(&wp->lock);
- open_bucket_for_each(c, &wp->ptrs, ob, i)
- if (should_drop_bucket(ob, c, ca, ec))
- bch2_open_bucket_put(c, ob);
- else
- ob_push(c, &ptrs, ob);
- wp->ptrs = ptrs;
- mutex_unlock(&wp->lock);
-}
-
-void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
- bool ec)
-{
- unsigned i;
-
- /* Next, close write points that point to this device... */
- for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
- bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);
-
- bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
- bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
- bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);
-
- mutex_lock(&c->btree_reserve_cache_lock);
- while (c->btree_reserve_cache_nr) {
- struct btree_alloc *a =
- &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
-
- bch2_open_buckets_put(c, &a->ob);
- }
- mutex_unlock(&c->btree_reserve_cache_lock);
-
- spin_lock(&c->freelist_lock);
- i = 0;
- while (i < c->open_buckets_partial_nr) {
- struct open_bucket *ob =
- c->open_buckets + c->open_buckets_partial[i];
-
- if (should_drop_bucket(ob, c, ca, ec)) {
- --c->open_buckets_partial_nr;
- swap(c->open_buckets_partial[i],
- c->open_buckets_partial[c->open_buckets_partial_nr]);
-
- ob->on_partial_list = false;
-
- scoped_guard(rcu)
- bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
-
- spin_unlock(&c->freelist_lock);
- bch2_open_bucket_put(c, ob);
- spin_lock(&c->freelist_lock);
- } else {
- i++;
- }
- }
- spin_unlock(&c->freelist_lock);
-
- bch2_ec_stop_dev(c, ca);
-}
-
-static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
- unsigned long write_point)
-{
- unsigned hash =
- hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));
-
- return &c->write_points_hash[hash];
-}
-
-static struct write_point *__writepoint_find(struct hlist_head *head,
- unsigned long write_point)
-{
- struct write_point *wp;
-
- guard(rcu)();
- hlist_for_each_entry_rcu(wp, head, node)
- if (wp->write_point == write_point)
- return wp;
- return NULL;
-}
-
-static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
-{
- u64 stranded = c->write_points_nr * c->bucket_size_max;
- u64 free = bch2_fs_usage_read_short(c).free;
-
- return stranded * factor > free;
-}
-
-static noinline bool try_increase_writepoints(struct bch_fs *c)
-{
- struct write_point *wp;
-
- if (c->write_points_nr == ARRAY_SIZE(c->write_points) ||
- too_many_writepoints(c, 32))
- return false;
-
- wp = c->write_points + c->write_points_nr++;
- hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
- return true;
-}
-
-static noinline bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
-{
- struct bch_fs *c = trans->c;
- struct write_point *wp;
- struct open_bucket *ob;
- unsigned i;
-
- mutex_lock(&c->write_points_hash_lock);
- if (c->write_points_nr < old_nr) {
- mutex_unlock(&c->write_points_hash_lock);
- return true;
- }
-
- if (c->write_points_nr == 1 ||
- !too_many_writepoints(c, 8)) {
- mutex_unlock(&c->write_points_hash_lock);
- return false;
- }
-
- wp = c->write_points + --c->write_points_nr;
-
- hlist_del_rcu(&wp->node);
- mutex_unlock(&c->write_points_hash_lock);
-
- bch2_trans_mutex_lock_norelock(trans, &wp->lock);
- open_bucket_for_each(c, &wp->ptrs, ob, i)
- open_bucket_free_unused(c, ob);
- wp->ptrs.nr = 0;
- mutex_unlock(&wp->lock);
- return true;
-}
-
-static struct write_point *writepoint_find(struct btree_trans *trans,
- unsigned long write_point)
-{
- struct bch_fs *c = trans->c;
- struct write_point *wp, *oldest;
- struct hlist_head *head;
-
- if (!(write_point & 1UL)) {
- wp = (struct write_point *) write_point;
- bch2_trans_mutex_lock_norelock(trans, &wp->lock);
- return wp;
- }
-
- head = writepoint_hash(c, write_point);
-restart_find:
- wp = __writepoint_find(head, write_point);
- if (wp) {
-lock_wp:
- bch2_trans_mutex_lock_norelock(trans, &wp->lock);
- if (wp->write_point == write_point)
- goto out;
- mutex_unlock(&wp->lock);
- goto restart_find;
- }
-restart_find_oldest:
- oldest = NULL;
- for (wp = c->write_points;
- wp < c->write_points + c->write_points_nr; wp++)
- if (!oldest || time_before64(wp->last_used, oldest->last_used))
- oldest = wp;
-
- bch2_trans_mutex_lock_norelock(trans, &oldest->lock);
- bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock);
- if (oldest >= c->write_points + c->write_points_nr ||
- try_increase_writepoints(c)) {
- mutex_unlock(&c->write_points_hash_lock);
- mutex_unlock(&oldest->lock);
- goto restart_find_oldest;
- }
-
- wp = __writepoint_find(head, write_point);
- if (wp && wp != oldest) {
- mutex_unlock(&c->write_points_hash_lock);
- mutex_unlock(&oldest->lock);
- goto lock_wp;
- }
-
- wp = oldest;
- hlist_del_rcu(&wp->node);
- wp->write_point = write_point;
- hlist_add_head_rcu(&wp->node, head);
- mutex_unlock(&c->write_points_hash_lock);
-out:
- wp->last_used = local_clock();
- return wp;
-}
-
-static noinline void
-deallocate_extra_replicas(struct bch_fs *c,
- struct alloc_request *req)
-{
- struct open_bucket *ob;
- unsigned extra_replicas = req->nr_effective - req->nr_replicas;
- unsigned i;
-
- req->scratch_ptrs.nr = 0;
-
- open_bucket_for_each(c, &req->ptrs, ob, i) {
- unsigned d = ob_dev(c, ob)->mi.durability;
-
- if (d && d <= extra_replicas) {
- extra_replicas -= d;
- ob_push(c, &req->wp->ptrs, ob);
- } else {
- ob_push(c, &req->scratch_ptrs, ob);
- }
- }
-
- req->ptrs = req->scratch_ptrs;
-}
-
-/*
- * Get us an open_bucket we can allocate from, return with it locked:
- */
-int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
- unsigned target,
- unsigned erasure_code,
- struct write_point_specifier write_point,
- struct bch_devs_list *devs_have,
- unsigned nr_replicas,
- unsigned nr_replicas_required,
- enum bch_watermark watermark,
- enum bch_write_flags flags,
- struct closure *cl,
- struct write_point **wp_ret)
-{
- struct bch_fs *c = trans->c;
- struct open_bucket *ob;
- unsigned write_points_nr;
- int i;
-
- struct alloc_request *req = bch2_trans_kmalloc_nomemzero(trans, sizeof(*req));
- int ret = PTR_ERR_OR_ZERO(req);
- if (unlikely(ret))
- return ret;
-
- if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
- erasure_code = false;
-
- req->nr_replicas = nr_replicas;
- req->target = target;
- req->ec = erasure_code;
- req->watermark = watermark;
- req->flags = flags;
- req->devs_have = devs_have;
-
- BUG_ON(!nr_replicas || !nr_replicas_required);
-retry:
- req->ptrs.nr = 0;
- req->nr_effective = 0;
- req->have_cache = false;
- write_points_nr = c->write_points_nr;
-
- *wp_ret = req->wp = writepoint_find(trans, write_point.v);
-
- req->data_type = req->wp->data_type;
-
- ret = bch2_trans_relock(trans);
- if (ret)
- goto err;
-
- /* metadata may not allocate on cache devices: */
- if (req->data_type != BCH_DATA_user)
- req->have_cache = true;
-
- if (target && !(flags & BCH_WRITE_only_specified_devs)) {
- ret = open_bucket_add_buckets(trans, req, NULL);
- if (!ret ||
- bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto alloc_done;
-
- /* Don't retry from all devices if we're out of open buckets: */
- if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
- int ret2 = open_bucket_add_buckets(trans, req, cl);
- if (!ret2 ||
- bch2_err_matches(ret2, BCH_ERR_transaction_restart) ||
- bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) {
- ret = ret2;
- goto alloc_done;
- }
- }
-
- /*
- * Only try to allocate cache (durability = 0 devices) from the
- * specified target:
- */
- req->have_cache = true;
- req->target = 0;
-
- ret = open_bucket_add_buckets(trans, req, cl);
- } else {
- ret = open_bucket_add_buckets(trans, req, cl);
- }
-alloc_done:
- BUG_ON(!ret && req->nr_effective < req->nr_replicas);
-
- if (erasure_code && !ec_open_bucket(c, &req->ptrs))
- pr_debug("failed to get ec bucket: ret %u", ret);
-
- if (ret == -BCH_ERR_insufficient_devices &&
- req->nr_effective >= nr_replicas_required)
- ret = 0;
-
- if (ret)
- goto err;
-
- if (req->nr_effective > req->nr_replicas)
- deallocate_extra_replicas(c, req);
-
- /* Free buckets we didn't use: */
- open_bucket_for_each(c, &req->wp->ptrs, ob, i)
- open_bucket_free_unused(c, ob);
-
- req->wp->ptrs = req->ptrs;
-
- req->wp->sectors_free = UINT_MAX;
-
- open_bucket_for_each(c, &req->wp->ptrs, ob, i) {
- /*
- * Ensure proper write alignment - either due to misaligned
- * bucket sizes (from buggy bcachefs-tools), or writes that mix
- * logical/physical alignment:
- */
- struct bch_dev *ca = ob_dev(c, ob);
- u64 offset = bucket_to_sector(ca, ob->bucket) +
- ca->mi.bucket_size -
- ob->sectors_free;
- unsigned align = round_up(offset, block_sectors(c)) - offset;
-
- ob->sectors_free = max_t(int, 0, ob->sectors_free - align);
-
- req->wp->sectors_free = min(req->wp->sectors_free, ob->sectors_free);
- }
-
- req->wp->sectors_free = rounddown(req->wp->sectors_free, block_sectors(c));
-
- /* Did alignment use up space in an open_bucket? */
- if (unlikely(!req->wp->sectors_free)) {
- bch2_alloc_sectors_done(c, req->wp);
- goto retry;
- }
-
- BUG_ON(!req->wp->sectors_free || req->wp->sectors_free == UINT_MAX);
-
- return 0;
-err:
- open_bucket_for_each(c, &req->wp->ptrs, ob, i)
- if (req->ptrs.nr < ARRAY_SIZE(req->ptrs.v))
- ob_push(c, &req->ptrs, ob);
- else
- open_bucket_free_unused(c, ob);
- req->wp->ptrs = req->ptrs;
-
- mutex_unlock(&req->wp->lock);
-
- if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
- try_decrease_writepoints(trans, write_points_nr))
- goto retry;
-
- if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
- ret = bch_err_throw(c, bucket_alloc_blocked);
-
- if (cl && !(flags & BCH_WRITE_alloc_nowait) &&
- bch2_err_matches(ret, BCH_ERR_freelist_empty))
- ret = bch_err_throw(c, bucket_alloc_blocked);
-
- return ret;
-}
-
-/*
- * Append pointers to the space we just allocated to @k, and mark @sectors space
- * as allocated out of @ob
- */
-void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
- struct bkey_i *k, unsigned sectors,
- bool cached)
-{
- bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached);
-}
-
-void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
-{
- bch2_alloc_sectors_done_inlined(c, wp);
-}
-
-static inline void writepoint_init(struct write_point *wp,
- enum bch_data_type type)
-{
- mutex_init(&wp->lock);
- wp->data_type = type;
-
- INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
- INIT_LIST_HEAD(&wp->writes);
- spin_lock_init(&wp->writes_lock);
-}
-
-void bch2_fs_allocator_foreground_init(struct bch_fs *c)
-{
- struct open_bucket *ob;
- struct write_point *wp;
-
- mutex_init(&c->write_points_hash_lock);
- c->write_points_nr = ARRAY_SIZE(c->write_points);
-
- /* open bucket 0 is a sentinel NULL: */
- spin_lock_init(&c->open_buckets[0].lock);
-
- for (ob = c->open_buckets + 1;
- ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
- spin_lock_init(&ob->lock);
- c->open_buckets_nr_free++;
-
- ob->freelist = c->open_buckets_freelist;
- c->open_buckets_freelist = ob - c->open_buckets;
- }
-
- writepoint_init(&c->btree_write_point, BCH_DATA_btree);
- writepoint_init(&c->rebalance_write_point, BCH_DATA_user);
- writepoint_init(&c->copygc_write_point, BCH_DATA_user);
-
- for (wp = c->write_points;
- wp < c->write_points + c->write_points_nr; wp++) {
- writepoint_init(wp, BCH_DATA_user);
-
- wp->last_used = local_clock();
- wp->write_point = (unsigned long) wp;
- hlist_add_head_rcu(&wp->node,
- writepoint_hash(c, wp->write_point));
- }
-}
-
-void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
-{
- struct bch_dev *ca = ob_dev(c, ob);
- unsigned data_type = ob->data_type;
- barrier(); /* READ_ONCE() doesn't work on bitfields */
-
- prt_printf(out, "%zu ref %u ",
- ob - c->open_buckets,
- atomic_read(&ob->pin));
- bch2_prt_data_type(out, data_type);
- prt_printf(out, " %u:%llu gen %u allocated %u/%u",
- ob->dev, ob->bucket, ob->gen,
- ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
- if (ob->ec)
- prt_printf(out, " ec idx %llu", ob->ec->idx);
- if (ob->on_partial_list)
- prt_str(out, " partial");
- prt_newline(out);
-}
-
-void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c,
- struct bch_dev *ca)
-{
- struct open_bucket *ob;
-
- out->atomic++;
-
- for (ob = c->open_buckets;
- ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
- ob++) {
- spin_lock(&ob->lock);
- if (ob->valid && (!ca || ob->dev == ca->dev_idx))
- bch2_open_bucket_to_text(out, c, ob);
- spin_unlock(&ob->lock);
- }
-
- --out->atomic;
-}
-
-void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c)
-{
- unsigned i;
-
- out->atomic++;
- spin_lock(&c->freelist_lock);
-
- for (i = 0; i < c->open_buckets_partial_nr; i++)
- bch2_open_bucket_to_text(out, c,
- c->open_buckets + c->open_buckets_partial[i]);
-
- spin_unlock(&c->freelist_lock);
- --out->atomic;
-}
-
-static const char * const bch2_write_point_states[] = {
-#define x(n) #n,
- WRITE_POINT_STATES()
-#undef x
- NULL
-};
-
-static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
- struct write_point *wp)
-{
- struct open_bucket *ob;
- unsigned i;
-
- mutex_lock(&wp->lock);
-
- prt_printf(out, "%lu: ", wp->write_point);
- prt_human_readable_u64(out, wp->sectors_allocated << 9);
-
- prt_printf(out, " last wrote: ");
- bch2_pr_time_units(out, sched_clock() - wp->last_used);
-
- for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
- prt_printf(out, " %s: ", bch2_write_point_states[i]);
- bch2_pr_time_units(out, wp->time[i]);
- }
-
- prt_newline(out);
-
- printbuf_indent_add(out, 2);
- open_bucket_for_each(c, &wp->ptrs, ob, i)
- bch2_open_bucket_to_text(out, c, ob);
- printbuf_indent_sub(out, 2);
-
- mutex_unlock(&wp->lock);
-}
-
-void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
-{
- struct write_point *wp;
-
- prt_str(out, "Foreground write points\n");
- for (wp = c->write_points;
- wp < c->write_points + ARRAY_SIZE(c->write_points);
- wp++)
- bch2_write_point_to_text(out, c, wp);
-
- prt_str(out, "Copygc write point\n");
- bch2_write_point_to_text(out, c, &c->copygc_write_point);
-
- prt_str(out, "Rebalance write point\n");
- bch2_write_point_to_text(out, c, &c->rebalance_write_point);
-
- prt_str(out, "Btree write point\n");
- bch2_write_point_to_text(out, c, &c->btree_write_point);
-}
-
-void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
-{
- unsigned nr[BCH_DATA_NR];
-
- memset(nr, 0, sizeof(nr));
-
- for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
- nr[c->open_buckets[i].data_type]++;
-
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, 24);
-
- prt_printf(out, "capacity\t%llu\n", c->capacity);
- prt_printf(out, "reserved\t%llu\n", c->reserved);
- prt_printf(out, "hidden\t%llu\n", percpu_u64_get(&c->usage->hidden));
- prt_printf(out, "btree\t%llu\n", percpu_u64_get(&c->usage->btree));
- prt_printf(out, "data\t%llu\n", percpu_u64_get(&c->usage->data));
- prt_printf(out, "cached\t%llu\n", percpu_u64_get(&c->usage->cached));
- prt_printf(out, "reserved\t%llu\n", percpu_u64_get(&c->usage->reserved));
- prt_printf(out, "online_reserved\t%llu\n", percpu_u64_get(c->online_reserved));
- prt_printf(out, "nr_inodes\t%llu\n", percpu_u64_get(&c->usage->nr_inodes));
-
- prt_newline(out);
- prt_printf(out, "freelist_wait\t%s\n", c->freelist_wait.list.first ? "waiting" : "empty");
- prt_printf(out, "open buckets allocated\t%i\n", OPEN_BUCKETS_COUNT - c->open_buckets_nr_free);
- prt_printf(out, "open buckets total\t%u\n", OPEN_BUCKETS_COUNT);
- prt_printf(out, "open_buckets_wait\t%s\n", c->open_buckets_wait.list.first ? "waiting" : "empty");
- prt_printf(out, "open_buckets_btree\t%u\n", nr[BCH_DATA_btree]);
- prt_printf(out, "open_buckets_user\t%u\n", nr[BCH_DATA_user]);
- prt_printf(out, "btree reserve cache\t%u\n", c->btree_reserve_cache_nr);
-}
-
-void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
- struct bch_dev_usage_full stats = bch2_dev_usage_full_read(ca);
- unsigned nr[BCH_DATA_NR];
-
- memset(nr, 0, sizeof(nr));
-
- for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
- nr[c->open_buckets[i].data_type]++;
-
- bch2_dev_usage_to_text(out, ca, &stats);
-
- prt_newline(out);
-
- prt_printf(out, "reserves:\n");
- for (unsigned i = 0; i < BCH_WATERMARK_NR; i++)
- prt_printf(out, "%s\t%llu\r\n", bch2_watermarks[i], bch2_dev_buckets_reserved(ca, i));
-
- prt_newline(out);
-
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, 12);
- printbuf_tabstop_push(out, 16);
-
- prt_printf(out, "open buckets\t%i\r\n", ca->nr_open_buckets);
- prt_printf(out, "buckets to invalidate\t%llu\r\n",
- should_invalidate_buckets(ca, bch2_dev_usage_read(ca)));
-}
-
-static noinline void bch2_print_allocator_stuck(struct bch_fs *c)
-{
- struct printbuf buf = PRINTBUF;
-
- prt_printf(&buf, "Allocator stuck? Waited for %u seconds\n",
- c->opts.allocator_stuck_timeout);
-
- prt_printf(&buf, "Allocator debug:\n");
- printbuf_indent_add(&buf, 2);
- bch2_fs_alloc_debug_to_text(&buf, c);
- printbuf_indent_sub(&buf, 2);
- prt_newline(&buf);
-
- bch2_printbuf_make_room(&buf, 4096);
-
- buf.atomic++;
- scoped_guard(rcu)
- for_each_online_member_rcu(c, ca) {
- prt_printf(&buf, "Dev %u:\n", ca->dev_idx);
- printbuf_indent_add(&buf, 2);
- bch2_dev_alloc_debug_to_text(&buf, ca);
- printbuf_indent_sub(&buf, 2);
- prt_newline(&buf);
- }
- --buf.atomic;
-
- prt_printf(&buf, "Copygc debug:\n");
- printbuf_indent_add(&buf, 2);
- bch2_copygc_wait_to_text(&buf, c);
- printbuf_indent_sub(&buf, 2);
- prt_newline(&buf);
-
- prt_printf(&buf, "Journal debug:\n");
- printbuf_indent_add(&buf, 2);
- bch2_journal_debug_to_text(&buf, &c->journal);
- printbuf_indent_sub(&buf, 2);
-
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
-}
-
-static inline unsigned allocator_wait_timeout(struct bch_fs *c)
-{
- if (c->allocator_last_stuck &&
- time_after(c->allocator_last_stuck + HZ * 60 * 2, jiffies))
- return 0;
-
- return c->opts.allocator_stuck_timeout * HZ;
-}
-
-void __bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl)
-{
- unsigned t = allocator_wait_timeout(c);
-
- if (t && closure_sync_timeout(cl, t)) {
- c->allocator_last_stuck = jiffies;
- bch2_print_allocator_stuck(c);
- }
-
- closure_sync(cl);
-}
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
deleted file mode 100644
index 1b3fc8460096..000000000000
--- a/fs/bcachefs/alloc_foreground.h
+++ /dev/null
@@ -1,318 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ALLOC_FOREGROUND_H
-#define _BCACHEFS_ALLOC_FOREGROUND_H
-
-#include "bcachefs.h"
-#include "buckets.h"
-#include "alloc_types.h"
-#include "extents.h"
-#include "io_write_types.h"
-#include "sb-members.h"
-
-#include <linux/hash.h>
-
-struct bkey;
-struct bch_dev;
-struct bch_fs;
-struct bch_devs_list;
-
-extern const char * const bch2_watermarks[];
-
-void bch2_reset_alloc_cursors(struct bch_fs *);
-
-struct dev_alloc_list {
- unsigned nr;
- u8 data[BCH_SB_MEMBERS_MAX];
-};
-
-struct alloc_request {
- unsigned nr_replicas;
- unsigned target;
- bool ec;
- enum bch_watermark watermark;
- enum bch_write_flags flags;
- enum bch_data_type data_type;
- struct bch_devs_list *devs_have;
- struct write_point *wp;
-
- /* These fields are used primarily by open_bucket_add_buckets */
- struct open_buckets ptrs;
- unsigned nr_effective; /* sum of @ptrs durability */
- bool have_cache; /* have we allocated from a 0 durability dev */
- struct bch_devs_mask devs_may_alloc;
-
- /* bch2_bucket_alloc_set_trans(): */
- struct dev_alloc_list devs_sorted;
- struct bch_dev_usage usage;
-
- /* bch2_bucket_alloc_trans(): */
- struct bch_dev *ca;
-
- enum {
- BTREE_BITMAP_NO,
- BTREE_BITMAP_YES,
- BTREE_BITMAP_ANY,
- } btree_bitmap;
-
- struct {
- u64 buckets_seen;
- u64 skipped_open;
- u64 skipped_need_journal_commit;
- u64 need_journal_commit;
- u64 skipped_nocow;
- u64 skipped_nouse;
- u64 skipped_mi_btree_bitmap;
- } counters;
-
- unsigned scratch_nr_replicas;
- unsigned scratch_nr_effective;
- bool scratch_have_cache;
- enum bch_data_type scratch_data_type;
- struct open_buckets scratch_ptrs;
- struct bch_devs_mask scratch_devs_may_alloc;
-};
-
-void bch2_dev_alloc_list(struct bch_fs *,
- struct dev_stripe_state *,
- struct bch_devs_mask *,
- struct dev_alloc_list *);
-void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *);
-
-static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob)
-{
- return bch2_dev_have_ref(c, ob->dev);
-}
-
-static inline unsigned bch2_open_buckets_reserved(enum bch_watermark watermark)
-{
- switch (watermark) {
- case BCH_WATERMARK_interior_updates:
- return 0;
- case BCH_WATERMARK_reclaim:
- return OPEN_BUCKETS_COUNT / 6;
- case BCH_WATERMARK_btree:
- case BCH_WATERMARK_btree_copygc:
- return OPEN_BUCKETS_COUNT / 4;
- case BCH_WATERMARK_copygc:
- return OPEN_BUCKETS_COUNT / 3;
- default:
- return OPEN_BUCKETS_COUNT / 2;
- }
-}
-
-struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *,
- enum bch_watermark, enum bch_data_type,
- struct closure *);
-
-static inline void ob_push(struct bch_fs *c, struct open_buckets *obs,
- struct open_bucket *ob)
-{
- BUG_ON(obs->nr >= ARRAY_SIZE(obs->v));
-
- obs->v[obs->nr++] = ob - c->open_buckets;
-}
-
-#define open_bucket_for_each(_c, _obs, _ob, _i) \
- for ((_i) = 0; \
- (_i) < (_obs)->nr && \
- ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \
- (_i)++)
-
-static inline struct open_bucket *ec_open_bucket(struct bch_fs *c,
- struct open_buckets *obs)
-{
- struct open_bucket *ob;
- unsigned i;
-
- open_bucket_for_each(c, obs, ob, i)
- if (ob->ec)
- return ob;
-
- return NULL;
-}
-
-void bch2_open_bucket_write_error(struct bch_fs *,
- struct open_buckets *, unsigned, int);
-
-void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
-
-static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
-{
- if (atomic_dec_and_test(&ob->pin))
- __bch2_open_bucket_put(c, ob);
-}
-
-static inline void bch2_open_buckets_put(struct bch_fs *c,
- struct open_buckets *ptrs)
-{
- struct open_bucket *ob;
- unsigned i;
-
- open_bucket_for_each(c, ptrs, ob, i)
- bch2_open_bucket_put(c, ob);
- ptrs->nr = 0;
-}
-
-static inline void bch2_alloc_sectors_done_inlined(struct bch_fs *c, struct write_point *wp)
-{
- struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 };
- struct open_bucket *ob;
- unsigned i;
-
- open_bucket_for_each(c, &wp->ptrs, ob, i)
- ob_push(c, ob->sectors_free < block_sectors(c)
- ? &ptrs
- : &keep, ob);
- wp->ptrs = keep;
-
- mutex_unlock(&wp->lock);
-
- bch2_open_buckets_put(c, &ptrs);
-}
-
-static inline void bch2_open_bucket_get(struct bch_fs *c,
- struct write_point *wp,
- struct open_buckets *ptrs)
-{
- struct open_bucket *ob;
- unsigned i;
-
- open_bucket_for_each(c, &wp->ptrs, ob, i) {
- ob->data_type = wp->data_type;
- atomic_inc(&ob->pin);
- ob_push(c, ptrs, ob);
- }
-}
-
-static inline open_bucket_idx_t *open_bucket_hashslot(struct bch_fs *c,
- unsigned dev, u64 bucket)
-{
- return c->open_buckets_hash +
- (jhash_3words(dev, bucket, bucket >> 32, 0) &
- (OPEN_BUCKETS_COUNT - 1));
-}
-
-static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucket)
-{
- open_bucket_idx_t slot = *open_bucket_hashslot(c, dev, bucket);
-
- while (slot) {
- struct open_bucket *ob = &c->open_buckets[slot];
-
- if (ob->dev == dev && ob->bucket == bucket)
- return true;
-
- slot = ob->hash;
- }
-
- return false;
-}
-
-static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket)
-{
- bool ret;
-
- if (bch2_bucket_is_open(c, dev, bucket))
- return true;
-
- spin_lock(&c->freelist_lock);
- ret = bch2_bucket_is_open(c, dev, bucket);
- spin_unlock(&c->freelist_lock);
-
- return ret;
-}
-
-enum bch_write_flags;
-int bch2_bucket_alloc_set_trans(struct btree_trans *, struct alloc_request *,
- struct dev_stripe_state *, struct closure *);
-
-int bch2_alloc_sectors_start_trans(struct btree_trans *,
- unsigned, unsigned,
- struct write_point_specifier,
- struct bch_devs_list *,
- unsigned, unsigned,
- enum bch_watermark,
- enum bch_write_flags,
- struct closure *,
- struct write_point **);
-
-static inline struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
-{
- struct bch_dev *ca = ob_dev(c, ob);
-
- return (struct bch_extent_ptr) {
- .type = 1 << BCH_EXTENT_ENTRY_ptr,
- .gen = ob->gen,
- .dev = ob->dev,
- .offset = bucket_to_sector(ca, ob->bucket) +
- ca->mi.bucket_size -
- ob->sectors_free,
- };
-}
-
-/*
- * Append pointers to the space we just allocated to @k, and mark @sectors space
- * as allocated out of @ob
- */
-static inline void
-bch2_alloc_sectors_append_ptrs_inlined(struct bch_fs *c, struct write_point *wp,
- struct bkey_i *k, unsigned sectors,
- bool cached)
-{
- struct open_bucket *ob;
- unsigned i;
-
- BUG_ON(sectors > wp->sectors_free);
- wp->sectors_free -= sectors;
- wp->sectors_allocated += sectors;
-
- open_bucket_for_each(c, &wp->ptrs, ob, i) {
- struct bch_dev *ca = ob_dev(c, ob);
- struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob);
-
- ptr.cached = cached ||
- (!ca->mi.durability &&
- wp->data_type == BCH_DATA_user);
-
- bch2_bkey_append_ptr(k, ptr);
-
- BUG_ON(sectors > ob->sectors_free);
- ob->sectors_free -= sectors;
- }
-}
-
-void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
- struct bkey_i *, unsigned, bool);
-void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
-
-void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *, bool);
-
-static inline struct write_point_specifier writepoint_hashed(unsigned long v)
-{
- return (struct write_point_specifier) { .v = v | 1 };
-}
-
-static inline struct write_point_specifier writepoint_ptr(struct write_point *wp)
-{
- return (struct write_point_specifier) { .v = (unsigned long) wp };
-}
-
-void bch2_fs_allocator_foreground_init(struct bch_fs *);
-
-void bch2_open_bucket_to_text(struct printbuf *, struct bch_fs *, struct open_bucket *);
-void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *, struct bch_dev *);
-void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_write_points_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_fs_alloc_debug_to_text(struct printbuf *, struct bch_fs *);
-void bch2_dev_alloc_debug_to_text(struct printbuf *, struct bch_dev *);
-
-void __bch2_wait_on_allocator(struct bch_fs *, struct closure *);
-static inline void bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl)
-{
- if (cl->closure_get_happened)
- __bch2_wait_on_allocator(c, cl);
-}
-
-#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
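writepoint_hashed() and writepoint_ptr() above fold two kinds of specifier into one unsigned long: a hashed specifier always has its low bit set, while a pointer specifier stores the (at least 2-byte aligned) struct write_point address, whose low bit is clear. The consumer side is not in this header, so the discriminating check below is an assumption; this is only a rough standalone sketch of the low-bit tagging scheme, using hypothetical sketch_* names:

#include <stdbool.h>
#include <stdio.h>

struct sketch_write_point {
	unsigned long	last_used;
};

struct sketch_specifier {
	unsigned long	v;
};

static struct sketch_specifier specifier_hashed(unsigned long v)
{
	return (struct sketch_specifier) { .v = v | 1 };	/* tag the low bit */
}

static struct sketch_specifier specifier_ptr(struct sketch_write_point *wp)
{
	return (struct sketch_specifier) { .v = (unsigned long) wp };	/* aligned, low bit clear */
}

/* assumed discriminator: a set low bit can only come from specifier_hashed() */
static bool specifier_is_hashed(struct sketch_specifier s)
{
	return s.v & 1;
}

int main(void)
{
	struct sketch_write_point wp = { 0 };

	printf("hashed: %d, ptr: %d\n",
	       specifier_is_hashed(specifier_hashed(0x1234)),
	       specifier_is_hashed(specifier_ptr(&wp)));
	return 0;
}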
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
deleted file mode 100644
index e7becdf22cba..000000000000
--- a/fs/bcachefs/alloc_types.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ALLOC_TYPES_H
-#define _BCACHEFS_ALLOC_TYPES_H
-
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
-
-#include "clock_types.h"
-#include "fifo.h"
-
-#define BCH_WATERMARKS() \
- x(stripe) \
- x(normal) \
- x(copygc) \
- x(btree) \
- x(btree_copygc) \
- x(reclaim) \
- x(interior_updates)
-
-enum bch_watermark {
-#define x(name) BCH_WATERMARK_##name,
- BCH_WATERMARKS()
-#undef x
- BCH_WATERMARK_NR,
-};
-
-#define BCH_WATERMARK_BITS 3
-#define BCH_WATERMARK_MASK ~(~0U << BCH_WATERMARK_BITS)
-
-#define OPEN_BUCKETS_COUNT 1024
-
-#define WRITE_POINT_HASH_NR 32
-#define WRITE_POINT_MAX 32
-
-/*
- * 0 is never a valid open_bucket_idx_t:
- */
-typedef u16 open_bucket_idx_t;
-
-struct open_bucket {
- spinlock_t lock;
- atomic_t pin;
- open_bucket_idx_t freelist;
- open_bucket_idx_t hash;
-
- /*
- * When an open bucket has an ec_stripe attached, this is the index of
- * the block in the stripe this open_bucket corresponds to:
- */
- u8 ec_idx;
- enum bch_data_type data_type:6;
- unsigned valid:1;
- unsigned on_partial_list:1;
-
- u8 dev;
- u8 gen;
- u32 sectors_free;
- u64 bucket;
- struct ec_stripe_new *ec;
-};
-
-#define OPEN_BUCKET_LIST_MAX 15
-
-struct open_buckets {
- open_bucket_idx_t nr;
- open_bucket_idx_t v[OPEN_BUCKET_LIST_MAX];
-};
-
-struct dev_stripe_state {
- u64 next_alloc[BCH_SB_MEMBERS_MAX];
-};
-
-#define WRITE_POINT_STATES() \
- x(stopped) \
- x(waiting_io) \
- x(waiting_work) \
- x(runnable) \
- x(running)
-
-enum write_point_state {
-#define x(n) WRITE_POINT_##n,
- WRITE_POINT_STATES()
-#undef x
- WRITE_POINT_STATE_NR
-};
-
-struct write_point {
- struct {
- struct hlist_node node;
- struct mutex lock;
- u64 last_used;
- unsigned long write_point;
- enum bch_data_type data_type;
-
- /* calculated based on how many pointers we're actually going to use: */
- unsigned sectors_free;
-
- struct open_buckets ptrs;
- struct dev_stripe_state stripe;
-
- u64 sectors_allocated;
- } __aligned(SMP_CACHE_BYTES);
-
- struct {
- struct work_struct index_update_work;
-
- struct list_head writes;
- spinlock_t writes_lock;
-
- enum write_point_state state;
- u64 last_state_change;
- u64 time[WRITE_POINT_STATE_NR];
- u64 last_runtime;
- } __aligned(SMP_CACHE_BYTES);
-};
-
-struct write_point_specifier {
- unsigned long v;
-};
-
-#endif /* _BCACHEFS_ALLOC_TYPES_H */
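BCH_WATERMARKS() and WRITE_POINT_STATES() above are x-macro lists: the same list is expanded once here into enum constants and again, in other files, into a matching string table (bch2_watermarks[] is declared extern in alloc_foreground.h; its definition lives elsewhere). A minimal standalone sketch of the pattern, using hypothetical SKETCH_* names, is:

#include <stdio.h>

#define SKETCH_STATES()		\
	x(stopped)		\
	x(runnable)		\
	x(running)

/* first expansion: enum constants */
enum sketch_state {
#define x(n)	SKETCH_STATE_##n,
	SKETCH_STATES()
#undef x
	SKETCH_STATE_NR,
};

/* second expansion: a name table that cannot drift out of sync with the enum */
static const char * const sketch_state_names[] = {
#define x(n)	#n,
	SKETCH_STATES()
#undef x
};

int main(void)
{
	for (int i = 0; i < SKETCH_STATE_NR; i++)
		printf("%d: %s\n", i, sketch_state_names[i]);
	return 0;
}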
diff --git a/fs/bcachefs/async_objs.c b/fs/bcachefs/async_objs.c
deleted file mode 100644
index a7cd1f0f0964..000000000000
--- a/fs/bcachefs/async_objs.c
+++ /dev/null
@@ -1,132 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Async obj debugging: keep asynchronous objects on (very fast) lists, make
- * them visible in debugfs:
- */
-
-#include "bcachefs.h"
-#include "async_objs.h"
-#include "btree_io.h"
-#include "debug.h"
-#include "io_read.h"
-#include "io_write.h"
-
-#include <linux/debugfs.h>
-
-static void promote_obj_to_text(struct printbuf *out, void *obj)
-{
- bch2_promote_op_to_text(out, obj);
-}
-
-static void rbio_obj_to_text(struct printbuf *out, void *obj)
-{
- bch2_read_bio_to_text(out, obj);
-}
-
-static void write_op_obj_to_text(struct printbuf *out, void *obj)
-{
- bch2_write_op_to_text(out, obj);
-}
-
-static void btree_read_bio_obj_to_text(struct printbuf *out, void *obj)
-{
- struct btree_read_bio *rbio = obj;
- bch2_btree_read_bio_to_text(out, rbio);
-}
-
-static void btree_write_bio_obj_to_text(struct printbuf *out, void *obj)
-{
- struct btree_write_bio *wbio = obj;
- bch2_bio_to_text(out, &wbio->wbio.bio);
-}
-
-static int bch2_async_obj_list_open(struct inode *inode, struct file *file)
-{
- struct async_obj_list *list = inode->i_private;
- struct dump_iter *i;
-
- i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
- if (!i)
- return -ENOMEM;
-
- file->private_data = i;
- i->from = POS_MIN;
- i->iter = 0;
- i->c = container_of(list, struct bch_fs, async_objs[list->idx]);
- i->list = list;
- i->buf = PRINTBUF;
- return 0;
-}
-
-static ssize_t bch2_async_obj_list_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
- struct async_obj_list *list = i->list;
- ssize_t ret = 0;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- struct genradix_iter iter;
- void *obj;
- fast_list_for_each_from(&list->list, iter, obj, i->iter) {
- ret = bch2_debugfs_flush_buf(i);
- if (ret)
- return ret;
-
- if (!i->size)
- break;
-
- list->obj_to_text(&i->buf, obj);
- }
-
- if (i->buf.allocation_failure)
- ret = -ENOMEM;
- else
- i->iter = iter.pos;
-
- if (!ret)
- ret = bch2_debugfs_flush_buf(i);
-
- return ret ?: i->ret;
-}
-
-static const struct file_operations async_obj_ops = {
- .owner = THIS_MODULE,
- .open = bch2_async_obj_list_open,
- .release = bch2_dump_release,
- .read = bch2_async_obj_list_read,
-};
-
-void bch2_fs_async_obj_debugfs_init(struct bch_fs *c)
-{
- c->async_obj_dir = debugfs_create_dir("async_objs", c->fs_debug_dir);
-
-#define x(n) debugfs_create_file(#n, 0400, c->async_obj_dir, \
- &c->async_objs[BCH_ASYNC_OBJ_LIST_##n], &async_obj_ops);
- BCH_ASYNC_OBJ_LISTS()
-#undef x
-}
-
-void bch2_fs_async_obj_exit(struct bch_fs *c)
-{
- for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++)
- fast_list_exit(&c->async_objs[i].list);
-}
-
-int bch2_fs_async_obj_init(struct bch_fs *c)
-{
- for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++) {
- if (fast_list_init(&c->async_objs[i].list))
- return -BCH_ERR_ENOMEM_async_obj_init;
- c->async_objs[i].idx = i;
- }
-
-#define x(n) c->async_objs[BCH_ASYNC_OBJ_LIST_##n].obj_to_text = n##_obj_to_text;
- BCH_ASYNC_OBJ_LISTS()
-#undef x
-
- return 0;
-}
diff --git a/fs/bcachefs/async_objs.h b/fs/bcachefs/async_objs.h
deleted file mode 100644
index cd6489b8cf76..000000000000
--- a/fs/bcachefs/async_objs.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ASYNC_OBJS_H
-#define _BCACHEFS_ASYNC_OBJS_H
-
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
-static inline void __async_object_list_del(struct fast_list *head, unsigned idx)
-{
- fast_list_remove(head, idx);
-}
-
-static inline int __async_object_list_add(struct fast_list *head, void *obj, unsigned *idx)
-{
- int ret = fast_list_add(head, obj);
- *idx = ret > 0 ? ret : 0;
- return ret < 0 ? ret : 0;
-}
-
-#define async_object_list_del(_c, _list, idx) \
- __async_object_list_del(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, idx)
-
-#define async_object_list_add(_c, _list, obj, idx) \
- __async_object_list_add(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, obj, idx)
-
-void bch2_fs_async_obj_debugfs_init(struct bch_fs *);
-void bch2_fs_async_obj_exit(struct bch_fs *);
-int bch2_fs_async_obj_init(struct bch_fs *);
-
-#else /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */
-
-#define async_object_list_del(_c, _n, idx) do {} while (0)
-
-static inline int __async_object_list_add(void)
-{
- return 0;
-}
-#define async_object_list_add(_c, _n, obj, idx) __async_object_list_add()
-
-static inline void bch2_fs_async_obj_debugfs_init(struct bch_fs *c) {}
-static inline void bch2_fs_async_obj_exit(struct bch_fs *c) {}
-static inline int bch2_fs_async_obj_init(struct bch_fs *c) { return 0; }
-
-#endif /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */
-
-#endif /* _BCACHEFS_ASYNC_OBJS_H */
diff --git a/fs/bcachefs/async_objs_types.h b/fs/bcachefs/async_objs_types.h
deleted file mode 100644
index 8d713c0f5841..000000000000
--- a/fs/bcachefs/async_objs_types.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ASYNC_OBJS_TYPES_H
-#define _BCACHEFS_ASYNC_OBJS_TYPES_H
-
-#define BCH_ASYNC_OBJ_LISTS() \
- x(promote) \
- x(rbio) \
- x(write_op) \
- x(btree_read_bio) \
- x(btree_write_bio)
-
-enum bch_async_obj_lists {
-#define x(n) BCH_ASYNC_OBJ_LIST_##n,
- BCH_ASYNC_OBJ_LISTS()
-#undef x
- BCH_ASYNC_OBJ_NR
-};
-
-struct async_obj_list {
- struct fast_list list;
- void (*obj_to_text)(struct printbuf *, void *);
- unsigned idx;
-};
-
-#endif /* _BCACHEFS_ASYNC_OBJS_TYPES_H */
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
deleted file mode 100644
index 77d93beb3c8f..000000000000
--- a/fs/bcachefs/backpointers.c
+++ /dev/null
@@ -1,1391 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "bbpos.h"
-#include "alloc_background.h"
-#include "backpointers.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_write_buffer.h"
-#include "checksum.h"
-#include "disk_accounting.h"
-#include "error.h"
-#include "progress.h"
-#include "recovery_passes.h"
-
-#include <linux/mm.h>
-
-static int bch2_bucket_bitmap_set(struct bch_dev *, struct bucket_bitmap *, u64);
-
-static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp)
-{
- return (struct bbpos) {
- .btree = bp.btree_id,
- .pos = bp.pos,
- };
-}
-
-int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
- int ret = 0;
-
- bkey_fsck_err_on(bp.v->level > BTREE_MAX_DEPTH,
- c, backpointer_level_bad,
- "backpointer level bad: %u >= %u",
- bp.v->level, BTREE_MAX_DEPTH);
-
- bkey_fsck_err_on(bp.k->p.inode == BCH_SB_MEMBER_INVALID,
- c, backpointer_dev_bad,
- "backpointer for BCH_SB_MEMBER_INVALID");
-fsck_err:
- return ret;
-}
-
-void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
-
- struct bch_dev *ca;
- u32 bucket_offset;
- struct bpos bucket;
- scoped_guard(rcu) {
- ca = bch2_dev_rcu_noerror(c, bp.k->p.inode);
- if (ca)
- bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset);
- }
-
- if (ca)
- prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, bucket_offset);
- else
- prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT);
-
- bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level);
- prt_str(out, " data_type=");
- bch2_prt_data_type(out, bp.v->data_type);
- prt_printf(out, " suboffset=%u len=%u gen=%u pos=",
- (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
- bp.v->bucket_len,
- bp.v->bucket_gen);
- bch2_bpos_to_text(out, bp.v->pos);
-}
-
-void bch2_backpointer_swab(struct bkey_s k)
-{
- struct bkey_s_backpointer bp = bkey_s_to_backpointer(k);
-
- bp.v->bucket_len = swab32(bp.v->bucket_len);
- bch2_bpos_swab(&bp.v->pos);
-}
-
-static bool extent_matches_bp(struct bch_fs *c,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k,
- struct bkey_s_c_backpointer bp)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- struct bkey_i_backpointer bp2;
- bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp2);
-
- if (bpos_eq(bp.k->p, bp2.k.p) &&
- !memcmp(bp.v, &bp2.v, sizeof(bp2.v)))
- return true;
- }
-
- return false;
-}
-
-static noinline int backpointer_mod_err(struct btree_trans *trans,
- struct bkey_s_c orig_k,
- struct bkey_i_backpointer *new_bp,
- struct bkey_s_c found_bp,
- bool insert)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- bool will_check = c->recovery.passes_to_run &
- BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers);
- int ret = 0;
-
- if (insert) {
- prt_printf(&buf, "existing backpointer found when inserting ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i));
- prt_newline(&buf);
- printbuf_indent_add(&buf, 2);
-
- prt_printf(&buf, "found ");
- bch2_bkey_val_to_text(&buf, c, found_bp);
- prt_newline(&buf);
-
- prt_printf(&buf, "for ");
- bch2_bkey_val_to_text(&buf, c, orig_k);
- } else if (!will_check) {
- prt_printf(&buf, "backpointer not found when deleting\n");
- printbuf_indent_add(&buf, 2);
-
- prt_printf(&buf, "searching for ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i));
- prt_newline(&buf);
-
- prt_printf(&buf, "got ");
- bch2_bkey_val_to_text(&buf, c, found_bp);
- prt_newline(&buf);
-
- prt_printf(&buf, "for ");
- bch2_bkey_val_to_text(&buf, c, orig_k);
- }
-
- if (!will_check && __bch2_inconsistent_error(c, &buf))
- ret = bch_err_throw(c, erofs_unfixed_errors);
-
- bch_err(c, "%s", buf.buf);
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
- struct bkey_s_c orig_k,
- struct bkey_i_backpointer *bp,
- bool insert)
-{
- struct btree_iter bp_iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
- bp->k.p,
- BTREE_ITER_intent|
- BTREE_ITER_slots|
- BTREE_ITER_with_updates);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (insert
- ? k.k->type
- : (k.k->type != KEY_TYPE_backpointer ||
- memcmp(bkey_s_c_to_backpointer(k).v, &bp->v, sizeof(bp->v)))) {
- ret = backpointer_mod_err(trans, orig_k, bp, k, insert);
- if (ret)
- goto err;
- }
-
- if (!insert) {
- bp->k.type = KEY_TYPE_deleted;
- set_bkey_val_u64s(&bp->k, 0);
- }
-
- ret = bch2_trans_update(trans, &bp_iter, &bp->k_i, 0);
-err:
- bch2_trans_iter_exit(trans, &bp_iter);
- return ret;
-}
-
-static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos)
-{
- return (!static_branch_unlikely(&bch2_backpointers_no_use_write_buffer)
- ? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos)
- : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0)) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-}
-
-static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans,
- struct bkey_s_c visiting_k,
- struct bkey_buf *last_flushed)
-{
- return !static_branch_unlikely(&bch2_backpointers_no_use_write_buffer)
- ? bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed)
- : 0;
-}
-
-static int backpointer_target_not_found(struct btree_trans *trans,
- struct bkey_s_c_backpointer bp,
- struct bkey_s_c target_k,
- struct bkey_buf *last_flushed,
- bool commit)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- /*
- * If we're using the btree write buffer, the backpointer we were
- * looking at may have already been deleted - failure to find what it
- * pointed to is not an error:
- */
- ret = last_flushed
- ? bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed)
- : 0;
- if (ret)
- return ret;
-
- prt_printf(&buf, "backpointer doesn't match %s it points to:\n",
- bp.v->level ? "btree node" : "extent");
- bch2_bkey_val_to_text(&buf, c, bp.s_c);
-
- prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, c, target_k);
-
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(target_k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- bkey_for_each_ptr_decode(target_k.k, ptrs, p, entry)
- if (p.ptr.dev == bp.k->p.inode) {
- prt_newline(&buf);
- struct bkey_i_backpointer bp2;
- bch2_extent_ptr_to_bp(c, bp.v->btree_id, bp.v->level, target_k, p, entry, &bp2);
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp2.k_i));
- }
-
- if (fsck_err(trans, backpointer_to_missing_ptr,
- "%s", buf.buf)) {
- ret = bch2_backpointer_del(trans, bp.k->p);
- if (ret || !commit)
- goto out;
-
- /*
- * Normally, on transaction commit from inside a transaction,
- * we'll return -BCH_ERR_transaction_restart_nested, since a
- * transaction commit invalidates pointers given out by peek().
- *
- * However, since we're updating a write buffer btree, if we
- * return a transaction restart and loop we won't see that the
- * backpointer has been deleted without an additional write
- * buffer flush - and those are expensive.
- *
- * So we're relying on the caller immediately advancing to the
- * next backpointer and starting a new transaction immediately
- * after backpointer_get_key() returns NULL:
- */
- ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
- }
-out:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans,
- struct bkey_s_c_backpointer bp,
- struct btree_iter *iter,
- struct bkey_buf *last_flushed,
- bool commit)
-{
- struct bch_fs *c = trans->c;
-
- BUG_ON(!bp.v->level);
-
- bch2_trans_node_iter_init(trans, iter,
- bp.v->btree_id,
- bp.v->pos,
- 0,
- bp.v->level - 1,
- 0);
- struct btree *b = bch2_btree_iter_peek_node(trans, iter);
- if (IS_ERR_OR_NULL(b))
- goto err;
-
- BUG_ON(b->c.level != bp.v->level - 1);
-
- if (extent_matches_bp(c, bp.v->btree_id, bp.v->level,
- bkey_i_to_s_c(&b->key), bp))
- return b;
-
- if (btree_node_will_make_reachable(b)) {
- b = ERR_PTR(bch_err_throw(c, backpointer_to_overwritten_btree_node));
- } else {
- int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key),
- last_flushed, commit);
- b = ret ? ERR_PTR(ret) : NULL;
- }
-err:
- bch2_trans_iter_exit(trans, iter);
- return b;
-}
-
-static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans,
- struct bkey_s_c_backpointer bp,
- struct btree_iter *iter,
- unsigned iter_flags,
- struct bkey_buf *last_flushed,
- bool commit)
-{
- struct bch_fs *c = trans->c;
-
- if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c)))
- return bkey_s_c_null;
-
- bch2_trans_node_iter_init(trans, iter,
- bp.v->btree_id,
- bp.v->pos,
- 0,
- bp.v->level,
- iter_flags);
- struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);
- if (bkey_err(k)) {
- bch2_trans_iter_exit(trans, iter);
- return k;
- }
-
- /*
- * peek_slot() doesn't normally return NULL - except when we ask for a
- * key at a btree level that doesn't exist.
- *
- * We may want to revisit this and change peek_slot():
- */
- if (!k.k) {
- bkey_init(&iter->k);
- iter->k.p = bp.v->pos;
- k.k = &iter->k;
- }
-
- if (k.k &&
- extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp))
- return k;
-
- bch2_trans_iter_exit(trans, iter);
-
- if (!bp.v->level) {
- int ret = backpointer_target_not_found(trans, bp, k, last_flushed, commit);
- return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
- } else {
- struct btree *b = __bch2_backpointer_get_node(trans, bp, iter, last_flushed, commit);
- if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node))
- return bkey_s_c_null;
- if (IS_ERR_OR_NULL(b))
- return ((struct bkey_s_c) { .k = ERR_CAST(b) });
-
- return bkey_i_to_s_c(&b->key);
- }
-}
-
-struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
- struct bkey_s_c_backpointer bp,
- struct btree_iter *iter,
- struct bkey_buf *last_flushed)
-{
- return __bch2_backpointer_get_node(trans, bp, iter, last_flushed, true);
-}
-
-struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
- struct bkey_s_c_backpointer bp,
- struct btree_iter *iter,
- unsigned iter_flags,
- struct bkey_buf *last_flushed)
-{
- return __bch2_backpointer_get_key(trans, bp, iter, iter_flags, last_flushed, true);
-}
-
-static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k,
- struct bkey_buf *last_flushed)
-{
- if (k.k->type != KEY_TYPE_backpointer)
- return 0;
-
- struct bch_fs *c = trans->c;
- struct btree_iter alloc_iter = {};
- struct bkey_s_c alloc_k;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- struct bpos bucket;
- if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) {
- ret = bch2_backpointers_maybe_flush(trans, k, last_flushed);
- if (ret)
- goto out;
-
- if (fsck_err(trans, backpointer_to_missing_device,
- "backpointer for missing device:\n%s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- ret = bch2_backpointer_del(trans, k.k->p);
- goto out;
- }
-
- alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bucket, 0);
- ret = bkey_err(alloc_k);
- if (ret)
- goto out;
-
- if (alloc_k.k->type != KEY_TYPE_alloc_v4) {
- ret = bch2_backpointers_maybe_flush(trans, k, last_flushed);
- if (ret)
- goto out;
-
- if (fsck_err(trans, backpointer_to_missing_alloc,
- "backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
- alloc_iter.pos.inode, alloc_iter.pos.offset,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- ret = bch2_backpointer_del(trans, k.k->p);
- }
-out:
-fsck_err:
- bch2_trans_iter_exit(trans, &alloc_iter);
- printbuf_exit(&buf);
- return ret;
-}
-
-/* verify that every backpointer has a corresponding alloc key */
-int bch2_check_btree_backpointers(struct bch_fs *c)
-{
- struct bkey_buf last_flushed;
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_backpointers, POS_MIN, 0, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed)));
-
- bch2_bkey_buf_exit(&last_flushed, c);
- bch_err_fn(c, ret);
- return ret;
-}
-
-struct extents_to_bp_state {
- struct bpos bp_start;
- struct bpos bp_end;
- struct bkey_buf last_flushed;
-};
-
-static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree,
- struct bkey_s_c extent, unsigned dev)
-{
- struct bkey_i *n = bch2_bkey_make_mut_noupdate(trans, extent);
- int ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- return ret;
-
- bch2_bkey_drop_device(bkey_i_to_s(n), dev);
- return bch2_btree_insert_trans(trans, btree, n, 0);
-}
-
-static int check_extent_checksum(struct btree_trans *trans,
- enum btree_id btree, struct bkey_s_c extent,
- enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev)
-{
- struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- struct printbuf buf = PRINTBUF;
- void *data_buf = NULL;
- struct bio *bio = NULL;
- size_t bytes;
- int ret = 0;
-
- if (bkey_is_btree_ptr(extent.k))
- return false;
-
- bkey_for_each_ptr_decode(extent.k, ptrs, p, entry)
- if (p.ptr.dev == dev)
- goto found;
- BUG();
-found:
- if (!p.crc.csum_type)
- return false;
-
- bytes = p.crc.compressed_size << 9;
-
- struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ,
- BCH_DEV_READ_REF_check_extent_checksums);
- if (!ca)
- return false;
-
- data_buf = kvmalloc(bytes, GFP_KERNEL);
- if (!data_buf) {
- ret = -ENOMEM;
- goto err;
- }
-
- bio = bio_alloc(ca->disk_sb.bdev, buf_pages(data_buf, bytes), REQ_OP_READ, GFP_KERNEL);
- bio->bi_iter.bi_sector = p.ptr.offset;
- bch2_bio_map(bio, data_buf, bytes);
- ret = submit_bio_wait(bio);
- if (ret)
- goto err;
-
- prt_printf(&buf, "extents pointing to same space, but first extent checksum bad:\n");
- bch2_btree_id_to_text(&buf, btree);
- prt_str(&buf, " ");
- bch2_bkey_val_to_text(&buf, c, extent);
- prt_newline(&buf);
- bch2_btree_id_to_text(&buf, o_btree);
- prt_str(&buf, " ");
- bch2_bkey_val_to_text(&buf, c, extent2);
-
- struct nonce nonce = extent_nonce(extent.k->bversion, p.crc);
- struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes);
- if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum),
- trans, dup_backpointer_to_bad_csum_extent,
- "%s", buf.buf))
- ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1;
-fsck_err:
-err:
- if (bio)
- bio_put(bio);
- kvfree(data_buf);
- enumerated_ref_put(&ca->io_ref[READ],
- BCH_DEV_READ_REF_check_extent_checksums);
- printbuf_exit(&buf);
- return ret;
-}
-
-static int check_bp_exists(struct btree_trans *trans,
- struct extents_to_bp_state *s,
- struct bkey_i_backpointer *bp,
- struct bkey_s_c orig_k)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter other_extent_iter = {};
- struct printbuf buf = PRINTBUF;
-
- if (bpos_lt(bp->k.p, s->bp_start) ||
- bpos_gt(bp->k.p, s->bp_end))
- return 0;
-
- struct btree_iter bp_iter;
- struct bkey_s_c bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, 0);
- int ret = bkey_err(bp_k);
- if (ret)
- goto err;
-
- if (bp_k.k->type != KEY_TYPE_backpointer ||
- memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp->v, sizeof(bp->v))) {
- ret = bch2_btree_write_buffer_maybe_flush(trans, orig_k, &s->last_flushed);
- if (ret)
- goto err;
-
- goto check_existing_bp;
- }
-out:
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &other_extent_iter);
- bch2_trans_iter_exit(trans, &bp_iter);
- printbuf_exit(&buf);
- return ret;
-check_existing_bp:
- /* Do we have a backpointer for a different extent? */
- if (bp_k.k->type != KEY_TYPE_backpointer)
- goto missing;
-
- struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k);
-
- struct bkey_s_c other_extent =
- __bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL, false);
- ret = bkey_err(other_extent);
- if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
- ret = 0;
- if (ret)
- goto err;
-
- if (!other_extent.k)
- goto missing;
-
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp->k.p.inode);
- if (ca) {
- struct bkey_ptrs_c other_extent_ptrs = bch2_bkey_ptrs_c(other_extent);
- bkey_for_each_ptr(other_extent_ptrs, ptr)
- if (ptr->dev == bp->k.p.inode &&
- dev_ptr_stale_rcu(ca, ptr)) {
- rcu_read_unlock();
- ret = drop_dev_and_update(trans, other_bp.v->btree_id,
- other_extent, bp->k.p.inode);
- if (ret)
- goto err;
- goto out;
- }
- }
- rcu_read_unlock();
-
- if (bch2_extents_match(orig_k, other_extent)) {
- printbuf_reset(&buf);
- prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n");
- bch2_bkey_val_to_text(&buf, c, orig_k);
- prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, c, other_extent);
- bch_err(c, "%s", buf.buf);
-
- if (other_extent.k->size <= orig_k.k->size) {
- ret = drop_dev_and_update(trans, other_bp.v->btree_id,
- other_extent, bp->k.p.inode);
- if (ret)
- goto err;
- goto out;
- } else {
- ret = drop_dev_and_update(trans, bp->v.btree_id, orig_k, bp->k.p.inode);
- if (ret)
- goto err;
- goto missing;
- }
- }
-
- ret = check_extent_checksum(trans,
- other_bp.v->btree_id, other_extent,
- bp->v.btree_id, orig_k,
- bp->k.p.inode);
- if (ret < 0)
- goto err;
- if (ret) {
- ret = 0;
- goto missing;
- }
-
- ret = check_extent_checksum(trans, bp->v.btree_id, orig_k,
- other_bp.v->btree_id, other_extent, bp->k.p.inode);
- if (ret < 0)
- goto err;
- if (ret) {
- ret = 0;
- goto out;
- }
-
- printbuf_reset(&buf);
- prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n", bp->k.p.inode);
- bch2_bkey_val_to_text(&buf, c, orig_k);
- prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, c, other_extent);
- bch_err(c, "%s", buf.buf);
- ret = bch_err_throw(c, fsck_repair_unimplemented);
- goto err;
-missing:
- printbuf_reset(&buf);
- prt_str(&buf, "missing backpointer\nfor: ");
- bch2_bkey_val_to_text(&buf, c, orig_k);
- prt_printf(&buf, "\nwant: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i));
- prt_printf(&buf, "\ngot: ");
- bch2_bkey_val_to_text(&buf, c, bp_k);
-
- if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf))
- ret = bch2_bucket_backpointer_mod(trans, orig_k, bp, true);
-
- goto out;
-}
-
-static int check_extent_to_backpointers(struct btree_trans *trans,
- struct extents_to_bp_state *s,
- enum btree_id btree, unsigned level,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (p.ptr.dev == BCH_SB_MEMBER_INVALID)
- continue;
-
- bool empty;
- {
- /* scoped_guard() is a loop, so it breaks continue */
- guard(rcu)();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
- if (!ca)
- continue;
-
- if (p.ptr.cached && dev_ptr_stale_rcu(ca, &p.ptr))
- continue;
-
- u64 b = PTR_BUCKET_NR(ca, &p.ptr);
- if (!bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b))
- continue;
-
- empty = bch2_bucket_bitmap_test(&ca->bucket_backpointer_empty, b);
- }
-
- struct bkey_i_backpointer bp;
- bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp);
-
- int ret = !empty
- ? check_bp_exists(trans, s, &bp, k)
- : bch2_bucket_backpointer_mod(trans, k, &bp, true);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static int check_btree_root_to_backpointers(struct btree_trans *trans,
- struct extents_to_bp_state *s,
- enum btree_id btree_id,
- int *level)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct btree *b;
- struct bkey_s_c k;
- int ret;
-retry:
- bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN,
- 0, bch2_btree_id_root(c, btree_id)->b->c.level, 0);
- b = bch2_btree_iter_peek_node(trans, &iter);
- ret = PTR_ERR_OR_ZERO(b);
- if (ret)
- goto err;
-
- if (b != btree_node_root(c, b)) {
- bch2_trans_iter_exit(trans, &iter);
- goto retry;
- }
-
- *level = b->c.level;
-
- k = bkey_i_to_s_c(&b->key);
- ret = check_extent_to_backpointers(trans, s, btree_id, b->c.level + 1, k);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static u64 mem_may_pin_bytes(struct bch_fs *c)
-{
- struct sysinfo i;
- si_meminfo(&i);
-
- u64 mem_bytes = i.totalram * i.mem_unit;
- return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100);
-}
-
-static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
-{
- return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size);
-}
-
-static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
- u64 btree_leaf_mask,
- u64 btree_interior_mask,
- struct bbpos start, struct bbpos *end)
-{
- struct bch_fs *c = trans->c;
- s64 mem_may_pin = mem_may_pin_bytes(c);
- int ret = 0;
-
- bch2_btree_cache_unpin(c);
-
- btree_interior_mask |= btree_leaf_mask;
-
- c->btree_cache.pinned_nodes_mask[0] = btree_leaf_mask;
- c->btree_cache.pinned_nodes_mask[1] = btree_interior_mask;
- c->btree_cache.pinned_nodes_start = start;
- c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX;
-
- for (enum btree_id btree = start.btree;
- btree < BTREE_ID_NR && !ret;
- btree++) {
- unsigned depth = (BIT_ULL(btree) & btree_leaf_mask) ? 0 : 1;
-
- if (!(BIT_ULL(btree) & btree_leaf_mask) &&
- !(BIT_ULL(btree) & btree_interior_mask))
- continue;
-
- ret = __for_each_btree_node(trans, iter, btree,
- btree == start.btree ? start.pos : POS_MIN,
- 0, depth, BTREE_ITER_prefetch, b, ({
- mem_may_pin -= btree_buf_bytes(b);
- if (mem_may_pin <= 0) {
- c->btree_cache.pinned_nodes_end = *end =
- BBPOS(btree, b->key.k.p);
- break;
- }
- bch2_node_pin(c, b);
- 0;
- }));
- }
-
- return ret;
-}
-
-static inline int bch2_fs_going_ro(struct bch_fs *c)
-{
- return test_bit(BCH_FS_going_ro, &c->flags)
- ? -EROFS
- : 0;
-}
-
-static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
- struct extents_to_bp_state *s)
-{
- struct bch_fs *c = trans->c;
- struct progress_indicator_state progress;
- int ret = 0;
-
- bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink));
-
- for (enum btree_id btree_id = 0;
- btree_id < btree_id_nr_alive(c);
- btree_id++) {
- int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
-
- ret = commit_do(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
- check_btree_root_to_backpointers(trans, s, btree_id, &level));
- if (ret)
- return ret;
-
- while (level >= depth) {
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, level,
- BTREE_ITER_prefetch);
-
- ret = for_each_btree_key_continue(trans, iter, 0, k, ({
- bch2_progress_update_iter(trans, &progress, &iter, "extents_to_backpointers");
- bch2_fs_going_ro(c) ?:
- check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
- }));
- if (ret)
- return ret;
-
- --level;
- }
- }
-
- return 0;
-}
-
-enum alloc_sector_counter {
- ALLOC_dirty,
- ALLOC_cached,
- ALLOC_stripe,
- ALLOC_SECTORS_NR
-};
-
-static int data_type_to_alloc_counter(enum bch_data_type t)
-{
- switch (t) {
- case BCH_DATA_btree:
- case BCH_DATA_user:
- return ALLOC_dirty;
- case BCH_DATA_cached:
- return ALLOC_cached;
- case BCH_DATA_stripe:
- case BCH_DATA_parity:
- return ALLOC_stripe;
- default:
- return -1;
- }
-}
-
-static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos);
-
-static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k,
- bool *had_mismatch,
- struct bkey_buf *last_flushed)
-{
- struct bch_fs *c = trans->c;
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
- bool need_commit = false;
-
- *had_mismatch = false;
-
- if (a->data_type == BCH_DATA_sb ||
- a->data_type == BCH_DATA_journal ||
- a->data_type == BCH_DATA_parity)
- return 0;
-
- u32 sectors[ALLOC_SECTORS_NR];
- memset(sectors, 0, sizeof(sectors));
-
- struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(trans->c, alloc_k.k->p);
- if (!ca)
- return 0;
-
- struct btree_iter iter;
- struct bkey_s_c bp_k;
- int ret = 0;
- for_each_btree_key_max_norestart(trans, iter, BTREE_ID_backpointers,
- bucket_pos_to_bp_start(ca, alloc_k.k->p),
- bucket_pos_to_bp_end(ca, alloc_k.k->p), 0, bp_k, ret) {
- if (bp_k.k->type != KEY_TYPE_backpointer)
- continue;
-
- struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
-
- if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen &&
- (bp.v->bucket_gen != a->gen ||
- bp.v->pad)) {
- ret = bch2_backpointer_del(trans, bp_k.k->p);
- if (ret)
- break;
-
- need_commit = true;
- continue;
- }
-
- if (bp.v->bucket_gen != a->gen)
- continue;
-
- int alloc_counter = data_type_to_alloc_counter(bp.v->data_type);
- if (alloc_counter < 0)
- continue;
-
- sectors[alloc_counter] += bp.v->bucket_len;
- }
- bch2_trans_iter_exit(trans, &iter);
- if (ret)
- goto err;
-
- if (need_commit) {
- ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
- if (ret)
- goto err;
- }
-
- if (sectors[ALLOC_dirty] != a->dirty_sectors ||
- sectors[ALLOC_cached] != a->cached_sectors ||
- sectors[ALLOC_stripe] != a->stripe_sectors) {
- if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) {
- ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed);
- if (ret)
- goto err;
- }
-
- if (sectors[ALLOC_dirty] > a->dirty_sectors ||
- sectors[ALLOC_cached] > a->cached_sectors ||
- sectors[ALLOC_stripe] > a->stripe_sectors) {
- ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?:
- bch_err_throw(c, transaction_restart_nested);
- goto err;
- }
-
- bool empty = (sectors[ALLOC_dirty] +
- sectors[ALLOC_stripe] +
- sectors[ALLOC_cached]) == 0;
-
- ret = bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_mismatch,
- alloc_k.k->p.offset) ?:
- (empty
- ? bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_empty,
- alloc_k.k->p.offset)
- : 0);
-
- *had_mismatch = true;
- }
-err:
- bch2_dev_put(ca);
- return ret;
-}
-
-static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k)
-{
- switch (k.k->type) {
- case KEY_TYPE_btree_ptr_v2: {
- bool ret = false;
-
- guard(rcu)();
- struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key;
- while (pos.inode <= k.k->p.inode) {
- if (pos.inode >= c->sb.nr_devices)
- break;
-
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode);
- if (!ca)
- goto next;
-
- struct bpos bucket = bp_pos_to_bucket(ca, pos);
- u64 next = ca->mi.nbuckets;
-
- unsigned long *bitmap = READ_ONCE(ca->bucket_backpointer_mismatch.buckets);
- if (bitmap)
- next = min_t(u64, next,
- find_next_bit(bitmap, ca->mi.nbuckets, bucket.offset));
-
- bucket.offset = next;
- if (bucket.offset == ca->mi.nbuckets)
- goto next;
-
- ret = bpos_le(bucket_pos_to_bp_end(ca, bucket), k.k->p);
- if (ret)
- break;
-next:
- pos = SPOS(pos.inode + 1, 0, 0);
- }
-
- return ret;
- }
- case KEY_TYPE_btree_ptr:
- return true;
- default:
- return false;
- }
-}
-
-static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k,
- enum btree_id btree, unsigned level)
-{
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0);
- struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
- int ret = PTR_ERR_OR_ZERO(b);
- if (ret)
- goto err;
-
- if (b)
- bch2_node_pin(trans->c, b);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int bch2_pin_backpointer_nodes_with_missing(struct btree_trans *trans,
- struct bpos start, struct bpos *end)
-{
- struct bch_fs *c = trans->c;
- int ret = 0;
-
- struct bkey_buf tmp;
- bch2_bkey_buf_init(&tmp);
-
- bch2_btree_cache_unpin(c);
-
- *end = SPOS_MAX;
-
- s64 mem_may_pin = mem_may_pin_bytes(c);
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start,
- 0, 1, BTREE_ITER_prefetch);
- ret = for_each_btree_key_continue(trans, iter, 0, k, ({
- if (!backpointer_node_has_missing(c, k))
- continue;
-
- mem_may_pin -= c->opts.btree_node_size;
- if (mem_may_pin <= 0)
- break;
-
- bch2_bkey_buf_reassemble(&tmp, c, k);
- struct btree_path *path = btree_iter_path(trans, &iter);
-
- BUG_ON(path->level != 1);
-
- bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, path->level - 1);
- }));
- if (ret)
- return ret;
-
- struct bpos pinned = SPOS_MAX;
- mem_may_pin = mem_may_pin_bytes(c);
- bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start,
- 0, 1, BTREE_ITER_prefetch);
- ret = for_each_btree_key_continue(trans, iter, 0, k, ({
- if (!backpointer_node_has_missing(c, k))
- continue;
-
- mem_may_pin -= c->opts.btree_node_size;
- if (mem_may_pin <= 0) {
- *end = pinned;
- break;
- }
-
- bch2_bkey_buf_reassemble(&tmp, c, k);
- struct btree_path *path = btree_iter_path(trans, &iter);
-
- BUG_ON(path->level != 1);
-
- int ret2 = btree_node_get_and_pin(trans, tmp.k, path->btree_id, path->level - 1);
-
- if (!ret2)
- pinned = tmp.k->k.p;
-
- ret;
- }));
- if (ret)
- return ret;
-
- return ret;
-}
-
-int bch2_check_extents_to_backpointers(struct bch_fs *c)
-{
- int ret = 0;
-
- struct btree_trans *trans = bch2_trans_get(c);
- struct extents_to_bp_state s = { .bp_start = POS_MIN };
-
- bch2_bkey_buf_init(&s.last_flushed);
- bkey_init(&s.last_flushed.k->k);
-
- ret = for_each_btree_key(trans, iter, BTREE_ID_alloc,
- POS_MIN, BTREE_ITER_prefetch, k, ({
- bool had_mismatch;
- bch2_fs_going_ro(c) ?:
- check_bucket_backpointer_mismatch(trans, k, &had_mismatch, &s.last_flushed);
- }));
- if (ret)
- goto err;
-
- u64 nr_buckets = 0, nr_mismatches = 0;
- for_each_member_device(c, ca) {
- nr_buckets += ca->mi.nbuckets;
- nr_mismatches += ca->bucket_backpointer_mismatch.nr;
- }
-
- if (!nr_mismatches)
- goto err;
-
- bch_info(c, "scanning for missing backpointers in %llu/%llu buckets",
- nr_mismatches, nr_buckets);
-
- while (1) {
- ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end);
- if (ret)
- break;
-
- if ( bpos_eq(s.bp_start, POS_MIN) &&
- !bpos_eq(s.bp_end, SPOS_MAX))
- bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass",
- __func__, btree_nodes_fit_in_ram(c));
-
- if (!bpos_eq(s.bp_start, POS_MIN) ||
- !bpos_eq(s.bp_end, SPOS_MAX)) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "check_extents_to_backpointers(): ");
- bch2_bpos_to_text(&buf, s.bp_start);
- prt_str(&buf, "-");
- bch2_bpos_to_text(&buf, s.bp_end);
-
- bch_verbose(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-
- ret = bch2_check_extents_to_backpointers_pass(trans, &s);
- if (ret || bpos_eq(s.bp_end, SPOS_MAX))
- break;
-
- s.bp_start = bpos_successor(s.bp_end);
- }
-
- for_each_member_device(c, ca) {
- bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch);
- bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty);
- }
-err:
- bch2_trans_put(trans);
- bch2_bkey_buf_exit(&s.last_flushed, c);
- bch2_btree_cache_unpin(c);
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int check_bucket_backpointer_pos_mismatch(struct btree_trans *trans,
- struct bpos bucket,
- bool *had_mismatch,
- struct bkey_buf *last_flushed)
-{
- struct btree_iter alloc_iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &alloc_iter,
- BTREE_ID_alloc, bucket,
- BTREE_ITER_cached);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- ret = check_bucket_backpointer_mismatch(trans, k, had_mismatch, last_flushed);
- bch2_trans_iter_exit(trans, &alloc_iter);
- return ret;
-}
-
-int bch2_check_bucket_backpointer_mismatch(struct btree_trans *trans,
- struct bch_dev *ca, u64 bucket,
- bool copygc,
- struct bkey_buf *last_flushed)
-{
- struct bch_fs *c = trans->c;
- bool had_mismatch;
- int ret = lockrestart_do(trans,
- check_bucket_backpointer_pos_mismatch(trans, POS(ca->dev_idx, bucket),
- &had_mismatch, last_flushed));
- if (ret || !had_mismatch)
- return ret;
-
- u64 nr = ca->bucket_backpointer_mismatch.nr;
- u64 allowed = copygc ? ca->mi.nbuckets >> 7 : 0;
-
- struct printbuf buf = PRINTBUF;
- __bch2_log_msg_start(ca->name, &buf);
-
- prt_printf(&buf, "Detected missing backpointers in bucket %llu, now have %llu/%llu with missing\n",
- bucket, nr, ca->mi.nbuckets);
-
- bch2_run_explicit_recovery_pass(c, &buf,
- BCH_RECOVERY_PASS_check_extents_to_backpointers,
- nr < allowed ? RUN_RECOVERY_PASS_ratelimit : 0);
-
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- return 0;
-}
-
-/* backpointers -> extents */
-
-static int check_one_backpointer(struct btree_trans *trans,
- struct bbpos start,
- struct bbpos end,
- struct bkey_s_c bp_k,
- struct bkey_buf *last_flushed)
-{
- if (bp_k.k->type != KEY_TYPE_backpointer)
- return 0;
-
- struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
- struct bbpos pos = bp_to_bbpos(*bp.v);
-
- if (bbpos_cmp(pos, start) < 0 ||
- bbpos_cmp(pos, end) > 0)
- return 0;
-
- struct btree_iter iter;
- struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, 0, last_flushed);
- int ret = bkey_err(k);
- if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
- return 0;
- if (ret)
- return ret;
-
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int check_bucket_backpointers_to_extents(struct btree_trans *trans,
- struct bch_dev *ca, struct bpos bucket)
-{
- u32 restart_count = trans->restart_count;
- struct bkey_buf last_flushed;
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- int ret = for_each_btree_key_max(trans, iter, BTREE_ID_backpointers,
- bucket_pos_to_bp_start(ca, bucket),
- bucket_pos_to_bp_end(ca, bucket),
- 0, k,
- check_one_backpointer(trans, BBPOS_MIN, BBPOS_MAX, k, &last_flushed)
- );
-
- bch2_bkey_buf_exit(&last_flushed, trans->c);
- return ret ?: trans_was_restarted(trans, restart_count);
-}
-
-static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
- struct bbpos start,
- struct bbpos end)
-{
- struct bch_fs *c = trans->c;
- struct bkey_buf last_flushed;
- struct progress_indicator_state progress;
-
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
- bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers));
-
- int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers,
- POS_MIN, BTREE_ITER_prefetch, k, ({
- bch2_progress_update_iter(trans, &progress, &iter, "backpointers_to_extents");
- check_one_backpointer(trans, start, end, k, &last_flushed);
- }));
-
- bch2_bkey_buf_exit(&last_flushed, c);
- return ret;
-}
-
-int bch2_check_backpointers_to_extents(struct bch_fs *c)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end;
- int ret;
-
- while (1) {
- ret = bch2_get_btree_in_memory_pos(trans,
- BIT_ULL(BTREE_ID_extents)|
- BIT_ULL(BTREE_ID_reflink),
- ~0,
- start, &end);
- if (ret)
- break;
-
- if (!bbpos_cmp(start, BBPOS_MIN) &&
- bbpos_cmp(end, BBPOS_MAX))
- bch_verbose(c, "%s(): extents do not fit in ram, running in multiple passes with %zu nodes per pass",
- __func__, btree_nodes_fit_in_ram(c));
-
- if (bbpos_cmp(start, BBPOS_MIN) ||
- bbpos_cmp(end, BBPOS_MAX)) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "check_backpointers_to_extents(): ");
- bch2_bbpos_to_text(&buf, start);
- prt_str(&buf, "-");
- bch2_bbpos_to_text(&buf, end);
-
- bch_verbose(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-
- ret = bch2_check_backpointers_to_extents_pass(trans, start, end);
- if (ret || !bbpos_cmp(end, BBPOS_MAX))
- break;
-
- start = bbpos_successor(end);
- }
- bch2_trans_put(trans);
-
- bch2_btree_cache_unpin(c);
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int bch2_bucket_bitmap_set(struct bch_dev *ca, struct bucket_bitmap *b, u64 bit)
-{
- scoped_guard(mutex, &b->lock) {
- if (!b->buckets) {
- b->buckets = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets),
- sizeof(unsigned long), GFP_KERNEL);
- if (!b->buckets)
- return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap);
- }
-
- b->nr += !__test_and_set_bit(bit, b->buckets);
- }
-
- return 0;
-}
-
-int bch2_bucket_bitmap_resize(struct bch_dev *ca, struct bucket_bitmap *b,
- u64 old_size, u64 new_size)
-{
- scoped_guard(mutex, &b->lock) {
- if (!b->buckets)
- return 0;
-
- unsigned long *n = kvcalloc(BITS_TO_LONGS(new_size),
- sizeof(unsigned long), GFP_KERNEL);
- if (!n)
- return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap);
-
- memcpy(n, b->buckets,
- BITS_TO_LONGS(min(old_size, new_size)) * sizeof(unsigned long));
- kvfree(b->buckets);
- b->buckets = n;
- }
-
- return 0;
-}
-
-void bch2_bucket_bitmap_free(struct bucket_bitmap *b)
-{
- mutex_lock(&b->lock);
- kvfree(b->buckets);
- b->buckets = NULL;
- b->nr = 0;
- mutex_unlock(&b->lock);
-}
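bch2_bucket_bitmap_set() and bch2_bucket_bitmap_resize() above allocate lazily, size the array in longs via BITS_TO_LONGS(), and bump the counter only when __test_and_set_bit() reports a first-time set. A rough userspace sketch of that bookkeeping, with hypothetical names, plain calloc() in place of kvcalloc(), and no locking, is:

#include <stdlib.h>
#include <stdio.h>

#define BITS_PER_LONG		(8 * sizeof(unsigned long))
#define BITS_TO_LONGS(n)	(((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

struct sketch_bitmap {
	unsigned long	*bits;
	unsigned long	nr;	/* number of bits currently set */
};

/* lazily allocate the bitmap, set @bit, and count it only the first time */
static int sketch_bitmap_set(struct sketch_bitmap *b, unsigned long nbits, unsigned long bit)
{
	if (!b->bits) {
		b->bits = calloc(BITS_TO_LONGS(nbits), sizeof(unsigned long));
		if (!b->bits)
			return -1;
	}

	unsigned long *word = &b->bits[bit / BITS_PER_LONG];
	unsigned long mask  = 1UL << (bit % BITS_PER_LONG);

	if (!(*word & mask)) {		/* __test_and_set_bit() equivalent */
		*word |= mask;
		b->nr++;
	}
	return 0;
}

int main(void)
{
	struct sketch_bitmap b = { 0 };

	sketch_bitmap_set(&b, 1024, 7);
	sketch_bitmap_set(&b, 1024, 7);	/* setting the same bit again is not recounted */
	printf("bits set: %lu\n", b.nr);
	free(b.bits);
	return 0;
}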
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
deleted file mode 100644
index 7e71afee1ac0..000000000000
--- a/fs/bcachefs/backpointers.h
+++ /dev/null
@@ -1,200 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BACKPOINTERS_H
-#define _BCACHEFS_BACKPOINTERS_H
-
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "error.h"
-#include "super.h"
-
-static inline u64 swab40(u64 x)
-{
- return (((x & 0x00000000ffULL) << 32)|
- ((x & 0x000000ff00ULL) << 16)|
- ((x & 0x0000ff0000ULL) >> 0)|
- ((x & 0x00ff000000ULL) >> 16)|
- ((x & 0xff00000000ULL) >> 32));
-}
-
-int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k,
- struct bkey_validate_context);
-void bch2_backpointer_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-void bch2_backpointer_swab(struct bkey_s);
-
-#define bch2_bkey_ops_backpointer ((struct bkey_ops) { \
- .key_validate = bch2_backpointer_validate, \
- .val_to_text = bch2_backpointer_to_text, \
- .swab = bch2_backpointer_swab, \
- .min_val_size = 32, \
-})
-
-#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10
-
-/*
- * Convert from pos in backpointer btree to pos of corresponding bucket in alloc
- * btree:
- */
-static inline struct bpos bp_pos_to_bucket(const struct bch_dev *ca, struct bpos bp_pos)
-{
- u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
-
- return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector));
-}
-
-static inline struct bpos bp_pos_to_bucket_and_offset(const struct bch_dev *ca, struct bpos bp_pos,
- u32 *bucket_offset)
-{
- u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
-
- return POS(bp_pos.inode, sector_to_bucket_and_offset(ca, bucket_sector, bucket_offset));
-}
-
-static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket)
-{
- guard(rcu)();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp_pos.inode);
- if (ca)
- *bucket = bp_pos_to_bucket(ca, bp_pos);
- return ca != NULL;
-}
-
-static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca,
- struct bpos bucket,
- u64 bucket_offset)
-{
- return POS(bucket.inode,
- (bucket_to_sector(ca, bucket.offset) <<
- MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset);
-}
-
-/*
- * Convert from pos in alloc btree + bucket offset to pos in backpointer btree:
- */
-static inline struct bpos bucket_pos_to_bp(const struct bch_dev *ca,
- struct bpos bucket,
- u64 bucket_offset)
-{
- struct bpos ret = bucket_pos_to_bp_noerror(ca, bucket, bucket_offset);
- EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(ca, ret)));
- return ret;
-}
-
-static inline struct bpos bucket_pos_to_bp_start(const struct bch_dev *ca, struct bpos bucket)
-{
- return bucket_pos_to_bp(ca, bucket, 0);
-}
-
-static inline struct bpos bucket_pos_to_bp_end(const struct bch_dev *ca, struct bpos bucket)
-{
- return bpos_nosnap_predecessor(bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0));
-}
-
-int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *,
- struct bkey_s_c,
- struct bkey_i_backpointer *,
- bool);
-
-static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
- struct bkey_s_c orig_k,
- struct bkey_i_backpointer *bp,
- bool insert)
-{
- if (static_branch_unlikely(&bch2_backpointers_no_use_write_buffer))
- return bch2_bucket_backpointer_mod_nowritebuffer(trans, orig_k, bp, insert);
-
- if (!insert) {
- bp->k.type = KEY_TYPE_deleted;
- set_bkey_val_u64s(&bp->k, 0);
- }
-
- return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp->k_i);
-}
-
-static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k,
- struct extent_ptr_decoded p,
- const union bch_extent_entry *entry)
-{
- switch (k.k->type) {
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_btree_ptr_v2:
- return BCH_DATA_btree;
- case KEY_TYPE_extent:
- case KEY_TYPE_reflink_v:
- if (p.has_ec)
- return BCH_DATA_stripe;
- if (p.ptr.cached)
- return BCH_DATA_cached;
- else
- return BCH_DATA_user;
- case KEY_TYPE_stripe: {
- const struct bch_extent_ptr *ptr = &entry->ptr;
- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-
- BUG_ON(ptr < s.v->ptrs ||
- ptr >= s.v->ptrs + s.v->nr_blocks);
-
- return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant
- ? BCH_DATA_parity
- : BCH_DATA_user;
- }
- default:
- BUG();
- }
-}
-
-static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, struct extent_ptr_decoded p,
- const union bch_extent_entry *entry,
- struct bkey_i_backpointer *bp)
-{
- bkey_backpointer_init(&bp->k_i);
- bp->k.p.inode = p.ptr.dev;
-
- if (k.k->type != KEY_TYPE_stripe)
- bp->k.p.offset = ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset;
- else {
- /*
- * Put stripe backpointers where they won't collide with the
- * extent backpointers within the stripe:
- */
- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
- bp->k.p.offset = ((u64) (p.ptr.offset + le16_to_cpu(s.v->sectors)) <<
- MAX_EXTENT_COMPRESS_RATIO_SHIFT) - 1;
- }
-
- bp->v = (struct bch_backpointer) {
- .btree_id = btree_id,
- .level = level,
- .data_type = bch2_bkey_ptr_data_type(k, p, entry),
- .bucket_gen = p.ptr.gen,
- .bucket_len = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p),
- .pos = k.k->p,
- };
-}
-
-struct bkey_buf;
-struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_backpointer,
- struct btree_iter *, unsigned, struct bkey_buf *);
-struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer,
- struct btree_iter *, struct bkey_buf *);
-
-int bch2_check_bucket_backpointer_mismatch(struct btree_trans *, struct bch_dev *, u64,
- bool, struct bkey_buf *);
-
-int bch2_check_btree_backpointers(struct bch_fs *);
-int bch2_check_extents_to_backpointers(struct bch_fs *);
-int bch2_check_backpointers_to_extents(struct bch_fs *);
-
-static inline bool bch2_bucket_bitmap_test(struct bucket_bitmap *b, u64 i)
-{
- unsigned long *bitmap = READ_ONCE(b->buckets);
- return bitmap && test_bit(i, bitmap);
-}
-
-int bch2_bucket_bitmap_resize(struct bch_dev *, struct bucket_bitmap *, u64, u64);
-void bch2_bucket_bitmap_free(struct bucket_bitmap *);
-
-#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */
diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h
deleted file mode 100644
index 63abe17f35ea..000000000000
--- a/fs/bcachefs/bbpos.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BBPOS_H
-#define _BCACHEFS_BBPOS_H
-
-#include "bbpos_types.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-
-static inline int bbpos_cmp(struct bbpos l, struct bbpos r)
-{
- return cmp_int(l.btree, r.btree) ?: bpos_cmp(l.pos, r.pos);
-}
-
-static inline struct bbpos bbpos_successor(struct bbpos pos)
-{
- if (bpos_cmp(pos.pos, SPOS_MAX)) {
- pos.pos = bpos_successor(pos.pos);
- return pos;
- }
-
- if (pos.btree != BTREE_ID_NR) {
- pos.btree++;
- pos.pos = POS_MIN;
- return pos;
- }
-
- BUG();
-}
-
-static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos)
-{
- bch2_btree_id_to_text(out, pos.btree);
- prt_char(out, ':');
- bch2_bpos_to_text(out, pos.pos);
-}
-
-#endif /* _BCACHEFS_BBPOS_H */
diff --git a/fs/bcachefs/bbpos_types.h b/fs/bcachefs/bbpos_types.h
deleted file mode 100644
index f63893344f80..000000000000
--- a/fs/bcachefs/bbpos_types.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BBPOS_TYPES_H
-#define _BCACHEFS_BBPOS_TYPES_H
-
-struct bbpos {
- enum btree_id btree;
- struct bpos pos;
-};
-
-static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
-{
- return (struct bbpos) { btree, pos };
-}
-
-#define BBPOS_MIN BBPOS(0, POS_MIN)
-#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, SPOS_MAX)
-
-#endif /* _BCACHEFS_BBPOS_TYPES_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
deleted file mode 100644
index ddfacad0f70c..000000000000
--- a/fs/bcachefs/bcachefs.h
+++ /dev/null
@@ -1,1295 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_H
-#define _BCACHEFS_H
-
-/*
- * SOME HIGH LEVEL CODE DOCUMENTATION:
- *
- * Bcache mostly works with cache sets, cache devices, and backing devices.
- *
- * Support for multiple cache devices hasn't quite been finished off yet, but
- * it's about 95% plumbed through. A cache set and its cache devices are sort of
- * like an md raid array and its component devices. Most of the code doesn't care
- * about individual cache devices; the main abstraction is the cache set.
- *
- * Multiple cache devices is intended to give us the ability to mirror dirty
- * cached data and metadata, without mirroring clean cached data.
- *
- * Backing devices are different, in that they have a lifetime independent of a
- * cache set. When you register a newly formatted backing device it'll come up
- * in passthrough mode, and then you can attach and detach a backing device from
- * a cache set at runtime - while it's mounted and in use. Detaching implicitly
- * invalidates any cached data for that backing device.
- *
- * A cache set can have multiple (many) backing devices attached to it.
- *
- * There's also flash only volumes - this is the reason for the distinction
- * between struct cached_dev and struct bcache_device. A flash only volume
- * works much like a bcache device that has a backing device, except the
- * "cached" data is always dirty. The end result is that we get thin
- * provisioning with very little additional code.
- *
- * Flash only volumes work but they're not production ready because the moving
- * garbage collector needs more work. More on that later.
- *
- * BUCKETS/ALLOCATION:
- *
- * Bcache is primarily designed for caching, which means that in normal
- * operation all of our available space will be allocated. Thus, we need an
- * efficient way of deleting things from the cache so we can write new things to
- * it.
- *
- * To do this, we first divide the cache device up into buckets. A bucket is the
- * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+
- * works efficiently.
- *
- * Each bucket has a 16 bit priority, and an 8 bit generation associated with
- * it. The gens and priorities for all the buckets are stored contiguously and
- * packed on disk (in a linked list of buckets - aside from the superblock, all
- * of bcache's metadata is stored in buckets).
- *
- * The priority is used to implement an LRU. We reset a bucket's priority when
- * we allocate it or on a cache hit, and every so often we decrement the priority
- * of each bucket. It could be used to implement something more sophisticated,
- * if anyone ever gets around to it.
- *
- * The generation is used for invalidating buckets. Each pointer also has an 8
- * bit generation embedded in it; for a pointer to be considered valid, its gen
- * must match the gen of the bucket it points into. Thus, to reuse a bucket all
- * we have to do is increment its gen (and write its new gen to disk; we batch
- * this up).
- *
- * Bcache is entirely COW - we never write twice to a bucket, even buckets that
- * contain metadata (including btree nodes).
- *
- * THE BTREE:
- *
- * Bcache is in large part designed around the btree.
- *
- * At a high level, the btree is just an index of key -> ptr tuples.
- *
- * Keys represent extents, and thus have a size field. Keys also have a variable
- * number of pointers attached to them (potentially zero, which is handy for
- * invalidating the cache).
- *
- * The key itself is an inode:offset pair. The inode number corresponds to a
- * backing device or a flash only volume. The offset is the ending offset of the
- * extent within the inode - not the starting offset; this makes lookups
- * slightly more convenient.
- *
- * Pointers contain the cache device id, the offset on that device, and an 8 bit
- * generation number. More on the gen later.
- *
- * Index lookups are not fully abstracted - cache lookups in particular are
- * still somewhat mixed in with the btree code, but things are headed in that
- * direction.
- *
- * Updates are fairly well abstracted, though. There are two different ways of
- * updating the btree; insert and replace.
- *
- * BTREE_INSERT will just take a list of keys and insert them into the btree -
- * overwriting (possibly only partially) any extents they overlap with. This is
- * used to update the index after a write.
- *
- * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is
- * overwriting a key that matches another given key. This is used for inserting
- * data into the cache after a cache miss, and for background writeback, and for
- * the moving garbage collector.
- *
- * There is no "delete" operation; deleting things from the index is
- * accomplished either by invalidating pointers (by incrementing a bucket's
- * gen) or by inserting a key with 0 pointers - which will overwrite anything
- * previously present at that location in the index.
- *
- * This means that there are always stale/invalid keys in the btree. They're
- * filtered out by the code that iterates through a btree node, and removed when
- * a btree node is rewritten.
- *
- * BTREE NODES:
- *
- * Our unit of allocation is a bucket, and we can't arbitrarily allocate and
- * free smaller than a bucket - so, that's how big our btree nodes are.
- *
- * (If buckets are really big we'll only use part of the bucket for a btree node
- * - no less than 1/4th - but a bucket still contains no more than a single
- * btree node. I'd actually like to change this, but for now we rely on the
- * bucket's gen for deleting btree nodes when we rewrite/split a node.)
- *
- * Anyways, btree nodes are big - big enough to be inefficient with a textbook
- * btree implementation.
- *
- * The way this is solved is that btree nodes are internally log structured; we
- * can append new keys to an existing btree node without rewriting it. This
- * means each set of keys we write is sorted, but the node is not.
- *
- * We maintain this log structure in memory - keeping 1Mb of keys sorted would
- * be expensive, and we have to distinguish between the keys we have written and
- * the keys we haven't. So to do a lookup in a btree node, we have to search
- * each sorted set. But we do merge written sets together lazily, so the cost of
- * these extra searches is quite low (normally most of the keys in a btree node
- * will be in one big set, and then there'll be one or two sets that are much
- * smaller).
- *
- * This log structure makes bcache's btree more of a hybrid between a
- * conventional btree and a compacting data structure, with some of the
- * advantages of both.
- *
- * GARBAGE COLLECTION:
- *
- * We can't just invalidate any bucket - it might contain dirty data or
- * metadata. If it once contained dirty data, other writes might overwrite it
- * later, leaving no valid pointers into that bucket in the index.
- *
- * Thus, the primary purpose of garbage collection is to find buckets to reuse.
- * It also counts how much valid data each bucket currently contains, so that
- * allocation can reuse buckets sooner when they've been mostly overwritten.
- *
- * It also does some things that are really internal to the btree
- * implementation. If a btree node contains pointers that are stale by more than
- * some threshold, it rewrites the btree node to avoid the bucket's generation
- * wrapping around. It also merges adjacent btree nodes if they're empty enough.
- *
- * THE JOURNAL:
- *
- * Bcache's journal is not necessary for consistency; we always strictly
- * order metadata writes so that the btree and everything else is consistent on
- * disk in the event of an unclean shutdown, and in fact bcache had writeback
- * caching (with recovery from unclean shutdown) before journalling was
- * implemented.
- *
- * Rather, the journal is purely a performance optimization; we can't complete a
- * write until we've updated the index on disk, otherwise the cache would be
- * inconsistent in the event of an unclean shutdown. This means that without the
- * journal, on random write workloads we constantly have to update all the leaf
- * nodes in the btree, and those writes will be mostly empty (appending at most
- * a few keys each) - highly inefficient in terms of amount of metadata writes,
- * and it puts more strain on the various btree resorting/compacting code.
- *
- * The journal is just a log of keys we've inserted; on startup we just reinsert
- * all the keys in the open journal entries. That means that when we're updating
- * a node in the btree, we can wait until a 4k block of keys fills up before
- * writing them out.
- *
- * For simplicity, we only journal updates to leaf nodes; updates to parent
- * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth
- * the complexity to deal with journalling them (in particular, journal replay)
- * - updates to non leaf nodes just happen synchronously (see btree_split()).
- */
-
-#undef pr_fmt
-#ifdef __KERNEL__
-#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__
-#else
-#define pr_fmt(fmt) "%s() " fmt "\n", __func__
-#endif
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define ENUMERATED_REF_DEBUG
-#endif
-
-#ifndef dynamic_fault
-#define dynamic_fault(...) 0
-#endif
-
-#define race_fault(...) dynamic_fault("bcachefs:race")
-
-#include <linux/backing-dev-defs.h>
-#include <linux/bug.h>
-#include <linux/bio.h>
-#include <linux/closure.h>
-#include <linux/kobject.h>
-#include <linux/list.h>
-#include <linux/math64.h>
-#include <linux/mutex.h>
-#include <linux/percpu-refcount.h>
-#include <linux/percpu-rwsem.h>
-#include <linux/refcount.h>
-#include <linux/rhashtable.h>
-#include <linux/rwsem.h>
-#include <linux/semaphore.h>
-#include <linux/seqlock.h>
-#include <linux/shrinker.h>
-#include <linux/srcu.h>
-#include <linux/types.h>
-#include <linux/workqueue.h>
-#include <linux/zstd.h>
-#include <linux/unicode.h>
-
-#include "bcachefs_format.h"
-#include "btree_journal_iter_types.h"
-#include "disk_accounting_types.h"
-#include "errcode.h"
-#include "fast_list.h"
-#include "fifo.h"
-#include "nocow_locking_types.h"
-#include "opts.h"
-#include "sb-errors_types.h"
-#include "seqmutex.h"
-#include "snapshot_types.h"
-#include "time_stats.h"
-#include "util.h"
-
-#include "alloc_types.h"
-#include "async_objs_types.h"
-#include "btree_gc_types.h"
-#include "btree_types.h"
-#include "btree_node_scan_types.h"
-#include "btree_write_buffer_types.h"
-#include "buckets_types.h"
-#include "buckets_waiting_for_journal_types.h"
-#include "clock_types.h"
-#include "disk_groups_types.h"
-#include "ec_types.h"
-#include "enumerated_ref_types.h"
-#include "journal_types.h"
-#include "keylist_types.h"
-#include "quota_types.h"
-#include "rebalance_types.h"
-#include "recovery_passes_types.h"
-#include "replicas_types.h"
-#include "sb-members_types.h"
-#include "subvolume_types.h"
-#include "super_types.h"
-#include "thread_with_file_types.h"
-
-#include "trace.h"
-
-#define count_event(_c, _name) this_cpu_inc((_c)->counters[BCH_COUNTER_##_name])
-
-#define trace_and_count(_c, _name, ...) \
-do { \
- count_event(_c, _name); \
- trace_##_name(__VA_ARGS__); \
-} while (0)
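-
-/*
- * Minimal usage sketch: the event name below is hypothetical, and must have
- * both a BCH_COUNTER_<name> counter and a trace_<name>() tracepoint defined
- * for the macro to compile:
- *
- *	trace_and_count(c, btree_node_split, trans, b);
- */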
-
-#define bch2_fs_init_fault(name) \
- dynamic_fault("bcachefs:bch_fs_init:" name)
-#define bch2_meta_read_fault(name) \
- dynamic_fault("bcachefs:meta:read:" name)
-#define bch2_meta_write_fault(name) \
- dynamic_fault("bcachefs:meta:write:" name)
-
-#ifdef __KERNEL__
-#define BCACHEFS_LOG_PREFIX
-#endif
-
-#ifdef BCACHEFS_LOG_PREFIX
-
-#define bch2_log_msg(_c, fmt) "bcachefs (%s): " fmt, ((_c)->name)
-#define bch2_fmt_dev(_ca, fmt) "bcachefs (%s): " fmt "\n", ((_ca)->name)
-#define bch2_fmt_dev_offset(_ca, _offset, fmt) "bcachefs (%s sector %llu): " fmt "\n", ((_ca)->name), (_offset)
-#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum)
-#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \
- "bcachefs (%s inum %llu offset %llu): " fmt "\n", ((_c)->name), (_inum), (_offset)
-
-#else
-
-#define bch2_log_msg(_c, fmt) fmt
-#define bch2_fmt_dev(_ca, fmt) "%s: " fmt "\n", ((_ca)->name)
-#define bch2_fmt_dev_offset(_ca, _offset, fmt) "%s sector %llu: " fmt "\n", ((_ca)->name), (_offset)
-#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum)
-#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \
- "inum %llu offset %llu: " fmt "\n", (_inum), (_offset)
-
-#endif
-
-#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n")
-
-void bch2_print_str(struct bch_fs *, const char *, const char *);
-
-__printf(2, 3)
-void bch2_print_opts(struct bch_opts *, const char *, ...);
-
-__printf(2, 3)
-void __bch2_print(struct bch_fs *c, const char *fmt, ...);
-
-#define maybe_dev_to_fs(_c) _Generic((_c), \
- struct bch_dev *: ((struct bch_dev *) (_c))->fs, \
- struct bch_fs *: (_c))
-
-#define bch2_print(_c, ...) __bch2_print(maybe_dev_to_fs(_c), __VA_ARGS__)
-
-#define bch2_print_ratelimited(_c, ...) \
-do { \
- static DEFINE_RATELIMIT_STATE(_rs, \
- DEFAULT_RATELIMIT_INTERVAL, \
- DEFAULT_RATELIMIT_BURST); \
- \
- if (__ratelimit(&_rs)) \
- bch2_print(_c, __VA_ARGS__); \
-} while (0)
-
-#define bch2_print_str_ratelimited(_c, ...) \
-do { \
- static DEFINE_RATELIMIT_STATE(_rs, \
- DEFAULT_RATELIMIT_INTERVAL, \
- DEFAULT_RATELIMIT_BURST); \
- \
- if (__ratelimit(&_rs)) \
- bch2_print_str(_c, __VA_ARGS__); \
-} while (0)
-
-#define bch_info(c, fmt, ...) \
- bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_info_ratelimited(c, fmt, ...) \
- bch2_print_ratelimited(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_notice(c, fmt, ...) \
- bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_warn(c, fmt, ...) \
- bch2_print(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_warn_ratelimited(c, fmt, ...) \
- bch2_print_ratelimited(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
-
-#define bch_err(c, fmt, ...) \
- bch2_print(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_err_dev(ca, fmt, ...) \
- bch2_print(c, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
-#define bch_err_dev_offset(ca, _offset, fmt, ...) \
- bch2_print(c, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
-#define bch_err_inum(c, _inum, fmt, ...) \
- bch2_print(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
-#define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \
- bch2_print(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
-
-#define bch_err_ratelimited(c, fmt, ...) \
- bch2_print_ratelimited(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_err_dev_ratelimited(ca, fmt, ...) \
- bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
-#define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \
- bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
-#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \
- bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
-#define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \
- bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
-
-static inline bool should_print_err(int err)
-{
- return err && !bch2_err_matches(err, BCH_ERR_transaction_restart);
-}
-
-#define bch_err_fn(_c, _ret) \
-do { \
- if (should_print_err(_ret)) \
- bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\
-} while (0)
-
-#define bch_err_fn_ratelimited(_c, _ret) \
-do { \
- if (should_print_err(_ret)) \
- bch_err_ratelimited(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\
-} while (0)
-
-#define bch_err_msg(_c, _ret, _msg, ...) \
-do { \
- if (should_print_err(_ret)) \
- bch_err(_c, "%s(): error " _msg " %s", __func__, \
- ##__VA_ARGS__, bch2_err_str(_ret)); \
-} while (0)
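-
-/*
- * A minimal sketch of the usual call-site pattern (the operation and its
- * arguments here are placeholders for anything returning a bch2 error code):
- *
- *	int ret = some_bch2_operation(trans);
- *	bch_err_msg(c, ret, "updating inode %llu", inum);
- *	// nothing is printed for 0 or for transaction restarts, per
- *	// should_print_err()
- */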
-
-#define bch_verbose(c, fmt, ...) \
-do { \
- if ((c)->opts.verbose) \
- bch_info(c, fmt, ##__VA_ARGS__); \
-} while (0)
-
-#define bch_verbose_ratelimited(c, fmt, ...) \
-do { \
- if ((c)->opts.verbose) \
- bch_info_ratelimited(c, fmt, ##__VA_ARGS__); \
-} while (0)
-
-#define pr_verbose_init(opts, fmt, ...) \
-do { \
- if (opt_get(opts, verbose)) \
- pr_info(fmt, ##__VA_ARGS__); \
-} while (0)
-
-static inline int __bch2_err_trace(struct bch_fs *c, int err)
-{
- trace_error_throw(c, err, _THIS_IP_);
- return err;
-}
-
-#define bch_err_throw(_c, _err) __bch2_err_trace(_c, -BCH_ERR_##_err)
-
-/* Parameters that are useful for debugging, but should always be compiled in: */
-#define BCH_DEBUG_PARAMS_ALWAYS() \
- BCH_DEBUG_PARAM(key_merging_disabled, \
- "Disables merging of extents") \
- BCH_DEBUG_PARAM(btree_node_merging_disabled, \
- "Disables merging of btree nodes") \
- BCH_DEBUG_PARAM(btree_gc_always_rewrite, \
- "Causes mark and sweep to compact and rewrite every " \
- "btree node it traverses") \
- BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \
- "Disables rewriting of btree nodes during mark and sweep")\
- BCH_DEBUG_PARAM(btree_shrinker_disabled, \
- "Disables the shrinker callback for the btree node cache")\
- BCH_DEBUG_PARAM(verify_btree_ondisk, \
- "Reread btree nodes at various points to verify the " \
- "mergesort in the read path against modifications " \
- "done in memory") \
- BCH_DEBUG_PARAM(verify_all_btree_replicas, \
- "When reading btree nodes, read all replicas and " \
- "compare them") \
- BCH_DEBUG_PARAM(backpointers_no_use_write_buffer, \
- "Don't use the write buffer for backpointers, enabling "\
- "extra runtime checks") \
- BCH_DEBUG_PARAM(debug_check_btree_locking, \
- "Enable additional asserts for btree locking") \
- BCH_DEBUG_PARAM(debug_check_iterators, \
- "Enables extra verification for btree iterators") \
- BCH_DEBUG_PARAM(debug_check_bset_lookups, \
- "Enables extra verification for bset lookups") \
- BCH_DEBUG_PARAM(debug_check_btree_accounting, \
- "Verify btree accounting for keys within a node") \
- BCH_DEBUG_PARAM(debug_check_bkey_unpack, \
- "Enables extra verification for bkey unpack")
-
-/* Parameters that should only be compiled in debug mode: */
-#define BCH_DEBUG_PARAMS_DEBUG() \
- BCH_DEBUG_PARAM(journal_seq_verify, \
- "Store the journal sequence number in the version " \
- "number of every btree key, and verify that btree " \
- "update ordering is preserved during recovery") \
- BCH_DEBUG_PARAM(inject_invalid_keys, \
- "Store the journal sequence number in the version " \
- "number of every btree key, and verify that btree " \
- "update ordering is preserved during recovery") \
- BCH_DEBUG_PARAM(test_alloc_startup, \
- "Force allocator startup to use the slowpath where it" \
- "can't find enough free buckets without invalidating" \
- "cached data") \
- BCH_DEBUG_PARAM(force_reconstruct_read, \
- "Force reads to use the reconstruct path, when reading" \
- "from erasure coded extents") \
- BCH_DEBUG_PARAM(test_restart_gc, \
- "Test restarting mark and sweep gc when bucket gens change")
-
-#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL()
-#else
-#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
-#endif
-
-#define BCH_DEBUG_PARAM(name, description) extern struct static_key_false bch2_##name;
-BCH_DEBUG_PARAMS_ALL()
-#undef BCH_DEBUG_PARAM
-
-#define BCH_TIME_STATS() \
- x(btree_node_mem_alloc) \
- x(btree_node_split) \
- x(btree_node_compact) \
- x(btree_node_merge) \
- x(btree_node_sort) \
- x(btree_node_get) \
- x(btree_node_read) \
- x(btree_node_read_done) \
- x(btree_node_write) \
- x(btree_interior_update_foreground) \
- x(btree_interior_update_total) \
- x(btree_gc) \
- x(data_write) \
- x(data_write_to_submit) \
- x(data_write_to_queue) \
- x(data_write_to_btree_update) \
- x(data_write_btree_update) \
- x(data_read) \
- x(data_promote) \
- x(journal_flush_write) \
- x(journal_noflush_write) \
- x(journal_flush_seq) \
- x(blocked_journal_low_on_space) \
- x(blocked_journal_low_on_pin) \
- x(blocked_journal_max_in_flight) \
- x(blocked_journal_max_open) \
- x(blocked_key_cache_flush) \
- x(blocked_allocate) \
- x(blocked_allocate_open_bucket) \
- x(blocked_write_buffer_full) \
- x(nocow_lock_contended)
-
-enum bch_time_stats {
-#define x(name) BCH_TIME_##name,
- BCH_TIME_STATS()
-#undef x
- BCH_TIME_STAT_NR
-};
-
-/* Number of nodes btree coalesce will try to coalesce at once */
-#define GC_MERGE_NODES 4U
-
-/* Maximum number of nodes we might need to allocate atomically: */
-#define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1))
-
-/* Size of the freelist we allocate btree nodes from: */
-#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4)
-
-#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX)
-
-struct btree;
-
-struct io_count {
- u64 sectors[2][BCH_DATA_NR];
-};
-
-struct discard_in_flight {
- bool in_progress:1;
- u64 bucket:63;
-};
-
-#define BCH_DEV_READ_REFS() \
- x(bch2_online_devs) \
- x(trans_mark_dev_sbs) \
- x(read_fua_test) \
- x(sb_field_resize) \
- x(write_super) \
- x(journal_read) \
- x(fs_journal_alloc) \
- x(fs_resize_on_mount) \
- x(btree_node_read) \
- x(btree_node_read_all_replicas) \
- x(btree_node_scrub) \
- x(btree_node_write) \
- x(btree_node_scan) \
- x(btree_verify_replicas) \
- x(btree_node_ondisk_to_text) \
- x(io_read) \
- x(check_extent_checksums) \
- x(ec_block)
-
-enum bch_dev_read_ref {
-#define x(n) BCH_DEV_READ_REF_##n,
- BCH_DEV_READ_REFS()
-#undef x
- BCH_DEV_READ_REF_NR,
-};
-
-#define BCH_DEV_WRITE_REFS() \
- x(journal_write) \
- x(journal_do_discards) \
- x(dev_do_discards) \
- x(discard_one_bucket_fast) \
- x(do_invalidates) \
- x(nocow_flush) \
- x(io_write) \
- x(ec_block) \
- x(ec_bucket_zero)
-
-enum bch_dev_write_ref {
-#define x(n) BCH_DEV_WRITE_REF_##n,
- BCH_DEV_WRITE_REFS()
-#undef x
- BCH_DEV_WRITE_REF_NR,
-};
-
-struct bucket_bitmap {
- unsigned long *buckets;
- u64 nr;
- struct mutex lock;
-};
-
-struct bch_dev {
- struct kobject kobj;
-#ifdef CONFIG_BCACHEFS_DEBUG
- atomic_long_t ref;
- bool dying;
- unsigned long last_put;
-#else
- struct percpu_ref ref;
-#endif
- struct completion ref_completion;
- struct enumerated_ref io_ref[2];
-
- struct bch_fs *fs;
-
- u8 dev_idx;
- /*
- * Cached version of this device's member info from superblock
- * Committed by bch2_write_super() -> bch_fs_mi_update()
- */
- struct bch_member_cpu mi;
- atomic64_t errors[BCH_MEMBER_ERROR_NR];
- unsigned long write_errors_start;
-
- __uuid_t uuid;
- char name[BDEVNAME_SIZE];
-
- struct bch_sb_handle disk_sb;
- struct bch_sb *sb_read_scratch;
- int sb_write_error;
- dev_t dev;
- atomic_t flush_seq;
-
- struct bch_devs_mask self;
-
- /*
- * Buckets:
- * Per-bucket arrays are protected by either rcu_read_lock or
- * state_lock, for device resize.
- */
- GENRADIX(struct bucket) buckets_gc;
- struct bucket_gens __rcu *bucket_gens;
- u8 *oldest_gen;
- unsigned long *buckets_nouse;
-
- struct bucket_bitmap bucket_backpointer_mismatch;
- struct bucket_bitmap bucket_backpointer_empty;
-
- struct bch_dev_usage_full __percpu
- *usage;
-
- /* Allocator: */
- u64 alloc_cursor[3];
-
- unsigned nr_open_buckets;
- unsigned nr_partial_buckets;
- unsigned nr_btree_reserve;
-
- struct work_struct invalidate_work;
- struct work_struct discard_work;
- struct mutex discard_buckets_in_flight_lock;
- DARRAY(struct discard_in_flight) discard_buckets_in_flight;
- struct work_struct discard_fast_work;
-
- atomic64_t rebalance_work;
-
- struct journal_device journal;
- u64 prev_journal_sector;
-
- struct work_struct io_error_work;
-
- /* The rest of this all shows up in sysfs */
- atomic64_t cur_latency[2];
- struct bch2_time_stats_quantiles io_latency[2];
-
-#define CONGESTED_MAX 1024
- atomic_t congested;
- u64 congested_last;
-
- struct io_count __percpu *io_done;
-};
-
-/*
- * initial_gc_unfixed
- * error
- * topology error
- */
-
-#define BCH_FS_FLAGS() \
- x(new_fs) \
- x(started) \
- x(clean_recovery) \
- x(btree_running) \
- x(accounting_replay_done) \
- x(may_go_rw) \
- x(rw) \
- x(rw_init_done) \
- x(was_rw) \
- x(stopping) \
- x(emergency_ro) \
- x(going_ro) \
- x(write_disable_complete) \
- x(clean_shutdown) \
- x(in_recovery) \
- x(in_fsck) \
- x(initial_gc_unfixed) \
- x(need_delete_dead_snapshots) \
- x(error) \
- x(topology_error) \
- x(errors_fixed) \
- x(errors_not_fixed) \
- x(no_invalid_checks) \
- x(discard_mount_opt_set) \
-
-enum bch_fs_flags {
-#define x(n) BCH_FS_##n,
- BCH_FS_FLAGS()
-#undef x
-};
-
-struct btree_debug {
- unsigned id;
-};
-
-#define BCH_TRANSACTIONS_NR 128
-
-struct btree_transaction_stats {
- struct bch2_time_stats duration;
- struct bch2_time_stats lock_hold_times;
- struct mutex lock;
- unsigned nr_max_paths;
- unsigned max_mem;
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
- darray_trans_kmalloc_trace trans_kmalloc_trace;
-#endif
- char *max_paths_text;
-};
-
-struct bch_fs_pcpu {
- u64 sectors_available;
-};
-
-struct journal_seq_blacklist_table {
- size_t nr;
- struct journal_seq_blacklist_table_entry {
- u64 start;
- u64 end;
- bool dirty;
- } entries[];
-};
-
-struct btree_trans_buf {
- struct btree_trans *trans;
-};
-
-#define BCH_WRITE_REFS() \
- x(journal) \
- x(trans) \
- x(write) \
- x(promote) \
- x(node_rewrite) \
- x(stripe_create) \
- x(stripe_delete) \
- x(reflink) \
- x(fallocate) \
- x(fsync) \
- x(dio_write) \
- x(discard) \
- x(discard_fast) \
- x(check_discard_freespace_key) \
- x(invalidate) \
- x(delete_dead_snapshots) \
- x(gc_gens) \
- x(snapshot_delete_pagecache) \
- x(sysfs) \
- x(btree_write_buffer) \
- x(btree_node_scrub) \
- x(async_recovery_passes) \
- x(ioctl_data)
-
-enum bch_write_ref {
-#define x(n) BCH_WRITE_REF_##n,
- BCH_WRITE_REFS()
-#undef x
- BCH_WRITE_REF_NR,
-};
-
-#define BCH_FS_DEFAULT_UTF8_ENCODING UNICODE_AGE(12, 1, 0)
-
-struct bch_fs {
- struct closure cl;
-
- struct list_head list;
- struct kobject kobj;
- struct kobject counters_kobj;
- struct kobject internal;
- struct kobject opts_dir;
- struct kobject time_stats;
- unsigned long flags;
-
- int minor;
- struct device *chardev;
- struct super_block *vfs_sb;
- dev_t dev;
- char name[40];
- struct stdio_redirect *stdio;
- struct task_struct *stdio_filter;
-
- /* ro/rw, add/remove/resize devices: */
- struct rw_semaphore state_lock;
-
- /* Counts outstanding writes, for clean transition to read-only */
- struct enumerated_ref writes;
- /*
- * Certain operations are only allowed in single threaded mode, during
- * recovery, and we want to assert that this is the case:
- */
- struct task_struct *recovery_task;
-
- /*
-	 * Analogous to c->writes, for asynchronous ops that don't necessarily
- * need fs to be read-write
- */
- refcount_t ro_ref;
- wait_queue_head_t ro_ref_wait;
-
- struct work_struct read_only_work;
-
- struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX];
-
- struct bch_accounting_mem accounting;
-
- struct bch_replicas_cpu replicas;
- struct bch_replicas_cpu replicas_gc;
- struct mutex replicas_gc_lock;
-
- struct journal_entry_res btree_root_journal_res;
- struct journal_entry_res clock_journal_res;
-
- struct bch_disk_groups_cpu __rcu *disk_groups;
-
- struct bch_opts opts;
-
- /* Updated by bch2_sb_update():*/
- struct {
- __uuid_t uuid;
- __uuid_t user_uuid;
-
- u16 version;
- u16 version_incompat;
- u16 version_incompat_allowed;
- u16 version_min;
- u16 version_upgrade_complete;
-
- u8 nr_devices;
- u8 clean;
- bool multi_device; /* true if we've ever had more than one device */
-
- u8 encryption_type;
-
- u64 time_base_lo;
- u32 time_base_hi;
- unsigned time_units_per_sec;
- unsigned nsec_per_time_unit;
- u64 features;
- u64 compat;
- u64 recovery_passes_required;
- unsigned long errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)];
- u64 btrees_lost_data;
- } sb;
- DARRAY(enum bcachefs_metadata_version)
- incompat_versions_requested;
-
- struct unicode_map *cf_encoding;
-
- struct bch_sb_handle disk_sb;
-
- unsigned short block_bits; /* ilog2(block_size) */
-
- u16 btree_foreground_merge_threshold;
-
- struct closure sb_write;
- struct mutex sb_lock;
-
- /* snapshot.c: */
- struct snapshot_table __rcu *snapshots;
- struct mutex snapshot_table_lock;
- struct rw_semaphore snapshot_create_lock;
-
- struct snapshot_delete snapshot_delete;
- struct work_struct snapshot_wait_for_pagecache_and_delete_work;
- snapshot_id_list snapshots_unlinked;
- struct mutex snapshots_unlinked_lock;
-
- /* BTREE CACHE */
- struct bio_set btree_bio;
- struct workqueue_struct *btree_read_complete_wq;
- struct workqueue_struct *btree_write_submit_wq;
-
- struct btree_root btree_roots_known[BTREE_ID_NR];
- DARRAY(struct btree_root) btree_roots_extra;
- struct mutex btree_root_lock;
-
- struct btree_cache btree_cache;
-
- /*
- * Cache of allocated btree nodes - if we allocate a btree node and
- * don't use it, if we free it that space can't be reused until going
- * _all_ the way through the allocator (which exposes us to a livelock
-	 * when allocating btree reserves fails halfway through) - instead, we
- * can stick them here:
- */
- struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2];
- unsigned btree_reserve_cache_nr;
- struct mutex btree_reserve_cache_lock;
-
- mempool_t btree_interior_update_pool;
- struct list_head btree_interior_update_list;
- struct list_head btree_interior_updates_unwritten;
- struct mutex btree_interior_update_lock;
- struct closure_waitlist btree_interior_update_wait;
-
- struct workqueue_struct *btree_interior_update_worker;
- struct work_struct btree_interior_update_work;
-
- struct workqueue_struct *btree_node_rewrite_worker;
- struct list_head btree_node_rewrites;
- struct list_head btree_node_rewrites_pending;
- spinlock_t btree_node_rewrites_lock;
- struct closure_waitlist btree_node_rewrites_wait;
-
- /* btree_io.c: */
- spinlock_t btree_write_error_lock;
- struct btree_write_stats {
- atomic64_t nr;
- atomic64_t bytes;
- } btree_write_stats[BTREE_WRITE_TYPE_NR];
-
- /* btree_iter.c: */
- struct seqmutex btree_trans_lock;
- struct list_head btree_trans_list;
- mempool_t btree_trans_pool;
- mempool_t btree_trans_mem_pool;
- struct btree_trans_buf __percpu *btree_trans_bufs;
-
- struct srcu_struct btree_trans_barrier;
- bool btree_trans_barrier_initialized;
-
- struct btree_key_cache btree_key_cache;
- unsigned btree_key_cache_btrees;
-
- struct btree_write_buffer btree_write_buffer;
-
- struct workqueue_struct *btree_update_wq;
- struct workqueue_struct *btree_write_complete_wq;
- /* copygc needs its own workqueue for index updates.. */
- struct workqueue_struct *copygc_wq;
- /*
- * Use a dedicated wq for write ref holder tasks. Required to avoid
- * dependency problems with other wq tasks that can block on ref
- * draining, such as read-only transition.
- */
- struct workqueue_struct *write_ref_wq;
-
- /* ALLOCATION */
- struct bch_devs_mask online_devs;
- struct bch_devs_mask rw_devs[BCH_DATA_NR];
- unsigned long rw_devs_change_count;
-
- u64 capacity; /* sectors */
- u64 reserved; /* sectors */
-
- /*
- * When capacity _decreases_ (due to a disk being removed), we
- * increment capacity_gen - this invalidates outstanding reservations
- * and forces them to be revalidated
- */
- u32 capacity_gen;
- unsigned bucket_size_max;
-
- atomic64_t sectors_available;
- struct mutex sectors_available_lock;
-
- struct bch_fs_pcpu __percpu *pcpu;
-
- struct percpu_rw_semaphore mark_lock;
-
- seqcount_t usage_lock;
- struct bch_fs_usage_base __percpu *usage;
- u64 __percpu *online_reserved;
-
- unsigned long allocator_last_stuck;
-
- struct io_clock io_clock[2];
-
- /* JOURNAL SEQ BLACKLIST */
- struct journal_seq_blacklist_table *
- journal_seq_blacklist_table;
-
- /* ALLOCATOR */
- spinlock_t freelist_lock;
- struct closure_waitlist freelist_wait;
-
- open_bucket_idx_t open_buckets_freelist;
- open_bucket_idx_t open_buckets_nr_free;
- struct closure_waitlist open_buckets_wait;
- struct open_bucket open_buckets[OPEN_BUCKETS_COUNT];
- open_bucket_idx_t open_buckets_hash[OPEN_BUCKETS_COUNT];
-
- open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT];
- open_bucket_idx_t open_buckets_partial_nr;
-
- struct write_point btree_write_point;
- struct write_point rebalance_write_point;
-
- struct write_point write_points[WRITE_POINT_MAX];
- struct hlist_head write_points_hash[WRITE_POINT_HASH_NR];
- struct mutex write_points_hash_lock;
- unsigned write_points_nr;
-
- struct buckets_waiting_for_journal buckets_waiting_for_journal;
-
- /* GARBAGE COLLECTION */
- struct work_struct gc_gens_work;
- unsigned long gc_count;
-
- enum btree_id gc_gens_btree;
- struct bpos gc_gens_pos;
-
- /*
- * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos]
- * has been marked by GC.
- *
- * gc_cur_phase is a superset of btree_ids (BTREE_ID_extents etc.)
- *
- * Protected by gc_pos_lock. Only written to by GC thread, so GC thread
- * can read without a lock.
- */
- seqcount_t gc_pos_lock;
- struct gc_pos gc_pos;
-
- /*
- * The allocation code needs gc_mark in struct bucket to be correct, but
- * it's not while a gc is in progress.
- */
- struct rw_semaphore gc_lock;
- struct mutex gc_gens_lock;
-
- /* IO PATH */
- struct semaphore io_in_flight;
- struct bio_set bio_read;
- struct bio_set bio_read_split;
- struct bio_set bio_write;
- struct bio_set replica_set;
- struct mutex bio_bounce_pages_lock;
- mempool_t bio_bounce_pages;
- struct bucket_nocow_lock_table
- nocow_locks;
- struct rhashtable promote_table;
-
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
- struct async_obj_list async_objs[BCH_ASYNC_OBJ_NR];
-#endif
-
- mempool_t compression_bounce[2];
- mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR];
- size_t zstd_workspace_size;
-
- struct bch_key chacha20_key;
- bool chacha20_key_set;
-
- atomic64_t key_version;
-
- mempool_t large_bkey_pool;
-
- /* MOVE.C */
- struct list_head moving_context_list;
- struct mutex moving_context_lock;
-
- /* REBALANCE */
- struct bch_fs_rebalance rebalance;
-
- /* COPYGC */
- struct task_struct *copygc_thread;
- struct write_point copygc_write_point;
- s64 copygc_wait_at;
- s64 copygc_wait;
- bool copygc_running;
- wait_queue_head_t copygc_running_wq;
-
- /* STRIPES: */
- GENRADIX(struct gc_stripe) gc_stripes;
-
- struct hlist_head ec_stripes_new[32];
- spinlock_t ec_stripes_new_lock;
-
- /* ERASURE CODING */
- struct list_head ec_stripe_head_list;
- struct mutex ec_stripe_head_lock;
-
- struct list_head ec_stripe_new_list;
- struct mutex ec_stripe_new_lock;
- wait_queue_head_t ec_stripe_new_wait;
-
- struct work_struct ec_stripe_create_work;
- u64 ec_stripe_hint;
-
- struct work_struct ec_stripe_delete_work;
-
- struct bio_set ec_bioset;
-
- /* REFLINK */
- reflink_gc_table reflink_gc_table;
- size_t reflink_gc_nr;
-
- /* fs.c */
- struct list_head vfs_inodes_list;
- struct mutex vfs_inodes_lock;
- struct rhashtable vfs_inodes_table;
- struct rhltable vfs_inodes_by_inum_table;
-
- /* VFS IO PATH - fs-io.c */
- struct bio_set writepage_bioset;
- struct bio_set dio_write_bioset;
- struct bio_set dio_read_bioset;
- struct bio_set nocow_flush_bioset;
-
- /* QUOTAS */
- struct bch_memquota_type quotas[QTYP_NR];
-
- /* RECOVERY */
- u64 journal_replay_seq_start;
- u64 journal_replay_seq_end;
- struct bch_fs_recovery recovery;
-
- /* DEBUG JUNK */
- struct dentry *fs_debug_dir;
- struct dentry *btree_debug_dir;
- struct dentry *async_obj_dir;
- struct btree_debug btree_debug[BTREE_ID_NR];
- struct btree *verify_data;
- struct btree_node *verify_ondisk;
- struct mutex verify_lock;
-
- /*
- * A btree node on disk could have too many bsets for an iterator to fit
- * on the stack - have to dynamically allocate them
- */
- mempool_t fill_iter;
-
- mempool_t btree_bounce_pool;
-
- struct journal journal;
- GENRADIX(struct journal_replay *) journal_entries;
- u64 journal_entries_base_seq;
- struct journal_keys journal_keys;
- struct list_head journal_iters;
-
- struct find_btree_nodes found_btree_nodes;
-
- u64 last_bucket_seq_cleanup;
-
- u64 counters_on_mount[BCH_COUNTER_NR];
- u64 __percpu *counters;
-
- struct bch2_time_stats times[BCH_TIME_STAT_NR];
-
- struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];
-
- /* ERRORS */
- struct list_head fsck_error_msgs;
- struct mutex fsck_error_msgs_lock;
- bool fsck_alloc_msgs_err;
-
- bch_sb_errors_cpu fsck_error_counts;
- struct mutex fsck_error_counts_lock;
-};
-
-extern struct wait_queue_head bch2_read_only_wait;
-
-static inline bool bch2_ro_ref_tryget(struct bch_fs *c)
-{
- if (test_bit(BCH_FS_stopping, &c->flags))
- return false;
-
- return refcount_inc_not_zero(&c->ro_ref);
-}
-
-static inline void bch2_ro_ref_put(struct bch_fs *c)
-{
- if (refcount_dec_and_test(&c->ro_ref))
- wake_up(&c->ro_ref_wait);
-}
-
-static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
-{
-#ifndef NO_BCACHEFS_FS
- if (c->vfs_sb)
- c->vfs_sb->s_bdi->ra_pages = ra_pages;
-#endif
-}
-
-static inline unsigned bucket_bytes(const struct bch_dev *ca)
-{
- return ca->mi.bucket_size << 9;
-}
-
-static inline unsigned block_bytes(const struct bch_fs *c)
-{
- return c->opts.block_size;
-}
-
-static inline unsigned block_sectors(const struct bch_fs *c)
-{
- return c->opts.block_size >> 9;
-}
-
-static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
-{
- return c->btree_key_cache_btrees & (1U << btree);
-}
-
-static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time)
-{
- struct timespec64 t;
- s64 sec;
- s32 rem;
-
- time += c->sb.time_base_lo;
-
- sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem);
-
- set_normalized_timespec64(&t, sec, rem * (s64)c->sb.nsec_per_time_unit);
-
- return t;
-}
-
-static inline s64 timespec_to_bch2_time(const struct bch_fs *c, struct timespec64 ts)
-{
- return (ts.tv_sec * c->sb.time_units_per_sec +
- (int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo;
-}
-
-static inline s64 bch2_current_time(const struct bch_fs *c)
-{
- struct timespec64 now;
-
- ktime_get_coarse_real_ts64(&now);
- return timespec_to_bch2_time(c, now);
-}
-
-static inline u64 bch2_current_io_time(const struct bch_fs *c, int rw)
-{
- return max(1ULL, (u64) atomic64_read(&c->io_clock[rw].now) & LRU_TIME_MAX);
-}
-
-static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c)
-{
- struct stdio_redirect *stdio = c->stdio;
-
- if (c->stdio_filter && c->stdio_filter != current)
- stdio = NULL;
- return stdio;
-}
-
-static inline unsigned metadata_replicas_required(struct bch_fs *c)
-{
- return min(c->opts.metadata_replicas,
- c->opts.metadata_replicas_required);
-}
-
-static inline unsigned data_replicas_required(struct bch_fs *c)
-{
- return min(c->opts.data_replicas,
- c->opts.data_replicas_required);
-}
-
-#define BKEY_PADDED_ONSTACK(key, pad) \
- struct { struct bkey_i key; __u64 key ## _pad[pad]; }
-
-/*
- * This is needed because discard is both a filesystem option and a device
- * option, and mount options are supposed to apply to that mount and not be
- * persisted, i.e. if it's set as a mount option we can't propagate it to the
- * device.
- */
-static inline bool bch2_discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca)
-{
- return test_bit(BCH_FS_discard_mount_opt_set, &c->flags)
- ? c->opts.discard
- : ca->mi.discard;
-}
-
-static inline bool bch2_fs_casefold_enabled(struct bch_fs *c)
-{
-#ifdef CONFIG_UNICODE
- return !c->opts.casefold_disabled;
-#else
- return false;
-#endif
-}
-
-#endif /* _BCACHEFS_H */
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
deleted file mode 100644
index b4a04df5ea95..000000000000
--- a/fs/bcachefs/bcachefs_format.h
+++ /dev/null
@@ -1,1545 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FORMAT_H
-#define _BCACHEFS_FORMAT_H
-
-/*
- * bcachefs on disk data structures
- *
- * OVERVIEW:
- *
- * There are three main types of on disk data structures in bcachefs (this is
- * reduced from 5 in bcache)
- *
- * - superblock
- * - journal
- * - btree
- *
- * The btree is the primary structure; most metadata exists as keys in the
- * various btrees. There are only a small number of btrees, they're not
- * sharded - we have one btree for extents, another for inodes, et cetera.
- *
- * SUPERBLOCK:
- *
- * The superblock contains the location of the journal, the list of devices in
- * the filesystem, and in general any metadata we need in order to decide
- * whether we can start a filesystem or prior to reading the journal/btree
- * roots.
- *
- * The superblock is extensible, and most of the contents of the superblock are
- * in variable length, type tagged fields; see struct bch_sb_field.
- *
- * Backup superblocks do not reside in a fixed location; also, superblocks do
- * not have a fixed size. To locate backup superblocks we have struct
- * bch_sb_layout; we store a copy of this inside every superblock, and also
- * before the first superblock.
- *
- * JOURNAL:
- *
- * The journal primarily records btree updates in the order they occurred;
- * journal replay consists of just iterating over all the keys in the open
- * journal entries and re-inserting them into the btrees.
- *
- * The journal also contains entry types for the btree roots, and blacklisted
- * journal sequence numbers (see journal_seq_blacklist.c).
- *
- * BTREE:
- *
- * bcachefs btrees are copy on write b+ trees, where nodes are big (typically
- * 128k-256k) and log structured. We use struct btree_node for writing the first
- * entry in a given node (offset 0), and struct btree_node_entry for all
- * subsequent writes.
- *
- * After the header, btree node entries contain a list of keys in sorted order.
- * Values are stored inline with the keys; since values are variable length (and
- * keys effectively are variable length too, due to packing) we can't do random
- * access without building up additional in memory tables in the btree node read
- * path.
- *
- * BTREE KEYS (struct bkey):
- *
- * The various btrees share a common format for the key - so as to avoid
- * switching in fastpath lookup/comparison code - but define their own
- * structures for the key values.
- *
- * The size of a key/value pair is stored as a u8 in units of u64s, so the max
- * size is just under 2k. The common part also contains a type tag for the
- * value, and a format field indicating whether the key is packed or not (and
- * also meant to allow adding new key fields in the future, if desired).
- *
- * bkeys, when stored within a btree node, may also be packed. In that case, the
- * bkey_format in that node is used to unpack it. Packed bkeys mean that we can
- * be generous with field sizes in the common part of the key format (64 bit
- * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost.
- */
-
-#include <asm/types.h>
-#include <asm/byteorder.h>
-#include <linux/kernel.h>
-#include <linux/uuid.h>
-#include <uapi/linux/magic.h>
-#include "vstructs.h"
-
-#ifdef __KERNEL__
-typedef uuid_t __uuid_t;
-#endif
-
-#define BITMASK(name, type, field, offset, end) \
-static const __maybe_unused unsigned name##_OFFSET = offset; \
-static const __maybe_unused unsigned name##_BITS = (end - offset); \
- \
-static inline __u64 name(const type *k) \
-{ \
- return (k->field >> offset) & ~(~0ULL << (end - offset)); \
-} \
- \
-static inline void SET_##name(type *k, __u64 v) \
-{ \
- k->field &= ~(~(~0ULL << (end - offset)) << offset); \
- k->field |= (v & ~(~0ULL << (end - offset))) << offset; \
-}
-
-#define LE_BITMASK(_bits, name, type, field, offset, end) \
-static const __maybe_unused unsigned name##_OFFSET = offset; \
-static const __maybe_unused unsigned name##_BITS = (end - offset); \
-static const __maybe_unused __u##_bits name##_MAX = (1ULL << (end - offset)) - 1;\
- \
-static inline __u64 name(const type *k) \
-{ \
- return (__le##_bits##_to_cpu(k->field) >> offset) & \
- ~(~0ULL << (end - offset)); \
-} \
- \
-static inline void SET_##name(type *k, __u64 v) \
-{ \
- __u##_bits new = __le##_bits##_to_cpu(k->field); \
- \
- new &= ~(~(~0ULL << (end - offset)) << offset); \
- new |= (v & ~(~0ULL << (end - offset))) << offset; \
- k->field = __cpu_to_le##_bits(new); \
-}
-
-#define LE16_BITMASK(n, t, f, o, e) LE_BITMASK(16, n, t, f, o, e)
-#define LE32_BITMASK(n, t, f, o, e) LE_BITMASK(32, n, t, f, o, e)
-#define LE64_BITMASK(n, t, f, o, e) LE_BITMASK(64, n, t, f, o, e)
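-
-/*
- * Sketch of what one LE_BITMASK() invocation generates, using a field that is
- * defined later in this file:
- *
- *	LE64_BITMASK(BCH_CRYPT_KDF_TYPE, struct bch_sb_field_crypt, flags, 0, 4);
- *
- * produces BCH_CRYPT_KDF_TYPE(const struct bch_sb_field_crypt *) returning
- * bits [0, 4) of the little-endian flags word, SET_BCH_CRYPT_KDF_TYPE() to
- * store a value back into those bits, and BCH_CRYPT_KDF_TYPE_OFFSET/_BITS/_MAX
- * constants.
- */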
-
-struct bkey_format {
- __u8 key_u64s;
- __u8 nr_fields;
- /* One unused slot for now: */
- __u8 bits_per_field[6];
- __le64 field_offset[6];
-};
-
-/* Btree keys - all units are in sectors */
-
-struct bpos {
- /*
- * Word order matches machine byte order - btree code treats a bpos as a
- * single large integer, for search/comparison purposes
- *
- * Note that wherever a bpos is embedded in another on disk data
- * structure, it has to be byte swabbed when reading in metadata that
- * wasn't written in native endian order:
- */
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- __u32 snapshot;
- __u64 offset;
- __u64 inode;
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- __u64 inode;
- __u64 offset; /* Points to end of extent - sectors */
- __u32 snapshot;
-#else
-#error edit for your odd byteorder.
-#endif
-} __packed
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-__aligned(4)
-#endif
-;
-
-#define KEY_INODE_MAX ((__u64)~0ULL)
-#define KEY_OFFSET_MAX ((__u64)~0ULL)
-#define KEY_SNAPSHOT_MAX ((__u32)~0U)
-#define KEY_SIZE_MAX ((__u32)~0U)
-
-static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot)
-{
- return (struct bpos) {
- .inode = inode,
- .offset = offset,
- .snapshot = snapshot,
- };
-}
-
-#define POS_MIN SPOS(0, 0, 0)
-#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, 0)
-#define SPOS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX)
-#define POS(_inode, _offset) SPOS(_inode, _offset, 0)
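-
-/*
- * Minimal sketch of constructing positions with these helpers (the inode and
- * offset values are arbitrary):
- *
- *	struct bpos extent_end	= POS(4096, 128);	// snapshot == 0
- *	struct bpos snap_pos	= SPOS(4096, 128, 7);	// explicit snapshot
- *
- * Comparison treats a bpos as one large integer: inode is most significant,
- * then offset, then snapshot.
- */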
-
-/* Empty placeholder struct, for container_of() */
-struct bch_val {
- __u64 __nothing[0];
-};
-
-struct bversion {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- __u64 lo;
- __u32 hi;
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- __u32 hi;
- __u64 lo;
-#endif
-} __packed
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-__aligned(4)
-#endif
-;
-
-struct bkey {
- /* Size of combined key and value, in u64s */
- __u8 u64s;
-
- /* Format of key (0 for format local to btree node) */
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u8 format:7,
- needs_whiteout:1;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u8 needs_whiteout:1,
- format:7;
-#else
-#error edit for your odd byteorder.
-#endif
-
- /* Type of the value */
- __u8 type;
-
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- __u8 pad[1];
-
- struct bversion bversion;
- __u32 size; /* extent size, in sectors */
- struct bpos p;
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- struct bpos p;
- __u32 size; /* extent size, in sectors */
- struct bversion bversion;
-
- __u8 pad[1];
-#endif
-} __packed
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-/*
- * The big-endian version of bkey can't be compiled by rustc with the "aligned"
- * attr since it doesn't allow types to have both "packed" and "aligned" attrs.
- * So for Rust compatibility, don't include this. It can be included in the LE
- * version because the "packed" attr is redundant in that case.
- *
- * History: (quoting Kent)
- *
- * Specifically, when I was designing bkey, I wanted the header to be no
- * bigger than necessary so that bkey_packed could use the rest. That means that
- * decently often extent keys will fit into only 8 bytes, instead of spilling over
- * to 16.
- *
- * But packed_bkey treats the part after the header - the packed section -
- * as a single multi word, variable length integer. And bkey, the unpacked
- * version, is just a special case version of a bkey_packed; all the packed
- * bkey code will work on keys in any packed format, the in-memory
- * representation of an unpacked key also is just one type of packed key...
- *
- * So that constrains the key part of a big endian bkey to start right
- * after the header.
- *
- * If we ever do a bkey_v2 and need to expand the header by another byte for
- * some reason - that will clean up this wart.
- */
-__aligned(8)
-#endif
-;
-
-struct bkey_packed {
- __u64 _data[0];
-
- /* Size of combined key and value, in u64s */
- __u8 u64s;
-
- /* Format of key (0 for format local to btree node) */
-
- /*
- * XXX: next incompat on disk format change, switch format and
- * needs_whiteout - bkey_packed() will be cheaper if format is the high
- * bits of the bitfield
- */
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u8 format:7,
- needs_whiteout:1;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u8 needs_whiteout:1,
- format:7;
-#endif
-
- /* Type of the value */
- __u8 type;
- __u8 key_start[0];
-
- /*
- * We copy bkeys with struct assignment in various places, and while
- * that shouldn't be done with packed bkeys we can't disallow it in C,
- * and it's legal to cast a bkey to a bkey_packed - so padding it out
- * to the same size as struct bkey should hopefully be safest.
- */
- __u8 pad[sizeof(struct bkey) - 3];
-} __packed __aligned(8);
-
-typedef struct {
- __le64 lo;
- __le64 hi;
-} bch_le128;
-
-#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64))
-#define BKEY_U64s_MAX U8_MAX
-#define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s)
-
-#define KEY_PACKED_BITS_START 24
-
-#define KEY_FORMAT_LOCAL_BTREE 0
-#define KEY_FORMAT_CURRENT 1
-
-enum bch_bkey_fields {
- BKEY_FIELD_INODE,
- BKEY_FIELD_OFFSET,
- BKEY_FIELD_SNAPSHOT,
- BKEY_FIELD_SIZE,
- BKEY_FIELD_VERSION_HI,
- BKEY_FIELD_VERSION_LO,
- BKEY_NR_FIELDS,
-};
-
-#define bkey_format_field(name, field) \
- [BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8)
-
-#define BKEY_FORMAT_CURRENT \
-((struct bkey_format) { \
- .key_u64s = BKEY_U64s, \
- .nr_fields = BKEY_NR_FIELDS, \
- .bits_per_field = { \
- bkey_format_field(INODE, p.inode), \
- bkey_format_field(OFFSET, p.offset), \
- bkey_format_field(SNAPSHOT, p.snapshot), \
- bkey_format_field(SIZE, size), \
- bkey_format_field(VERSION_HI, bversion.hi), \
- bkey_format_field(VERSION_LO, bversion.lo), \
- }, \
-})
-
-/* bkey with inline value */
-struct bkey_i {
- __u64 _data[0];
-
- struct bkey k;
- struct bch_val v;
-};
-
-#define POS_KEY(_pos) \
-((struct bkey) { \
- .u64s = BKEY_U64s, \
- .format = KEY_FORMAT_CURRENT, \
- .p = _pos, \
-})
-
-#define KEY(_inode, _offset, _size) \
-((struct bkey) { \
- .u64s = BKEY_U64s, \
- .format = KEY_FORMAT_CURRENT, \
- .p = POS(_inode, _offset), \
- .size = _size, \
-})
-
-static inline void bkey_init(struct bkey *k)
-{
- *k = KEY(0, 0, 0);
-}
-
-#define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64))
-
-#define __BKEY_PADDED(key, pad) \
- struct bkey_i key; __u64 key ## _pad[pad]
-
-enum bch_bkey_type_flags {
- BKEY_TYPE_strict_btree_checks = BIT(0),
-};
-
-/*
- * - DELETED keys are used internally to mark keys that should be ignored but
- * override keys in composition order. Their version number is ignored.
- *
- * - DISCARDED keys indicate that the data is all 0s because it has been
- * discarded. DISCARDs may have a version; if the version is nonzero the key
- * will be persistent, otherwise the key will be dropped whenever the btree
- * node is rewritten (like DELETED keys).
- *
- * - ERROR: any read of the data returns a read error, as the data was lost due
- * to a failing device. Like DISCARDED keys, they can be removed (overridden)
- * by new writes or cluster-wide GC. Node repair can also overwrite them with
- * the same or a more recent version number, but not with an older version
- * number.
- *
- * - WHITEOUT: for hash table btrees
- */
-#define BCH_BKEY_TYPES() \
- x(deleted, 0, 0) \
- x(whiteout, 1, 0) \
- x(error, 2, 0) \
- x(cookie, 3, 0) \
- x(hash_whiteout, 4, BKEY_TYPE_strict_btree_checks) \
- x(btree_ptr, 5, BKEY_TYPE_strict_btree_checks) \
- x(extent, 6, BKEY_TYPE_strict_btree_checks) \
- x(reservation, 7, BKEY_TYPE_strict_btree_checks) \
- x(inode, 8, BKEY_TYPE_strict_btree_checks) \
- x(inode_generation, 9, BKEY_TYPE_strict_btree_checks) \
- x(dirent, 10, BKEY_TYPE_strict_btree_checks) \
- x(xattr, 11, BKEY_TYPE_strict_btree_checks) \
- x(alloc, 12, BKEY_TYPE_strict_btree_checks) \
- x(quota, 13, BKEY_TYPE_strict_btree_checks) \
- x(stripe, 14, BKEY_TYPE_strict_btree_checks) \
- x(reflink_p, 15, BKEY_TYPE_strict_btree_checks) \
- x(reflink_v, 16, BKEY_TYPE_strict_btree_checks) \
- x(inline_data, 17, BKEY_TYPE_strict_btree_checks) \
- x(btree_ptr_v2, 18, BKEY_TYPE_strict_btree_checks) \
- x(indirect_inline_data, 19, BKEY_TYPE_strict_btree_checks) \
- x(alloc_v2, 20, BKEY_TYPE_strict_btree_checks) \
- x(subvolume, 21, BKEY_TYPE_strict_btree_checks) \
- x(snapshot, 22, BKEY_TYPE_strict_btree_checks) \
- x(inode_v2, 23, BKEY_TYPE_strict_btree_checks) \
- x(alloc_v3, 24, BKEY_TYPE_strict_btree_checks) \
- x(set, 25, 0) \
- x(lru, 26, BKEY_TYPE_strict_btree_checks) \
- x(alloc_v4, 27, BKEY_TYPE_strict_btree_checks) \
- x(backpointer, 28, BKEY_TYPE_strict_btree_checks) \
- x(inode_v3, 29, BKEY_TYPE_strict_btree_checks) \
- x(bucket_gens, 30, BKEY_TYPE_strict_btree_checks) \
- x(snapshot_tree, 31, BKEY_TYPE_strict_btree_checks) \
- x(logged_op_truncate, 32, BKEY_TYPE_strict_btree_checks) \
- x(logged_op_finsert, 33, BKEY_TYPE_strict_btree_checks) \
- x(accounting, 34, BKEY_TYPE_strict_btree_checks) \
- x(inode_alloc_cursor, 35, BKEY_TYPE_strict_btree_checks)
-
-enum bch_bkey_type {
-#define x(name, nr, ...) KEY_TYPE_##name = nr,
- BCH_BKEY_TYPES()
-#undef x
- KEY_TYPE_MAX,
-};
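/*
 * For illustration (not part of the original header): with the x() definition
 * used in enum bch_bkey_type above, BCH_BKEY_TYPES() expands to plain
 * enumerators, e.g.
 *
 *	KEY_TYPE_deleted		= 0,
 *	KEY_TYPE_whiteout		= 1,
 *	KEY_TYPE_error			= 2,
 *	...
 *	KEY_TYPE_inode_alloc_cursor	= 35,
 *
 * followed by KEY_TYPE_MAX. The third x() argument (the
 * BKEY_TYPE_strict_btree_checks flag) is unused in this particular expansion
 * and is presumably consumed by other x() definitions elsewhere in the code.
 */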
-
-struct bch_deleted {
- struct bch_val v;
-};
-
-struct bch_whiteout {
- struct bch_val v;
-};
-
-struct bch_error {
- struct bch_val v;
-};
-
-struct bch_cookie {
- struct bch_val v;
- __le64 cookie;
-};
-
-struct bch_hash_whiteout {
- struct bch_val v;
-};
-
-struct bch_set {
- struct bch_val v;
-};
-
-/* 128 bits, sufficient for cryptographic MACs: */
-struct bch_csum {
- __le64 lo;
- __le64 hi;
-} __packed __aligned(8);
-
-struct bch_backpointer {
- struct bch_val v;
- __u8 btree_id;
- __u8 level;
- __u8 data_type;
- __u8 bucket_gen;
- __u32 pad;
- __u32 bucket_len;
- struct bpos pos;
-} __packed __aligned(8);
-
-/* Optional/variable size superblock sections: */
-
-struct bch_sb_field {
- __u64 _data[0];
- __le32 u64s;
- __le32 type;
-};
-
-#define BCH_SB_FIELDS() \
- x(journal, 0) \
- x(members_v1, 1) \
- x(crypt, 2) \
- x(replicas_v0, 3) \
- x(quota, 4) \
- x(disk_groups, 5) \
- x(clean, 6) \
- x(replicas, 7) \
- x(journal_seq_blacklist, 8) \
- x(journal_v2, 9) \
- x(counters, 10) \
- x(members_v2, 11) \
- x(errors, 12) \
- x(ext, 13) \
- x(downgrade, 14) \
- x(recovery_passes, 15)
-
-#include "alloc_background_format.h"
-#include "dirent_format.h"
-#include "disk_accounting_format.h"
-#include "disk_groups_format.h"
-#include "extents_format.h"
-#include "ec_format.h"
-#include "inode_format.h"
-#include "journal_seq_blacklist_format.h"
-#include "logged_ops_format.h"
-#include "lru_format.h"
-#include "quota_format.h"
-#include "recovery_passes_format.h"
-#include "reflink_format.h"
-#include "replicas_format.h"
-#include "snapshot_format.h"
-#include "subvolume_format.h"
-#include "sb-counters_format.h"
-#include "sb-downgrade_format.h"
-#include "sb-errors_format.h"
-#include "sb-members_format.h"
-#include "xattr_format.h"
-
-enum bch_sb_field_type {
-#define x(f, nr) BCH_SB_FIELD_##f = nr,
- BCH_SB_FIELDS()
-#undef x
- BCH_SB_FIELD_NR
-};
-
-/*
- * Most superblock fields are replicated in all devices' superblocks - a few are
- * not:
- */
-#define BCH_SINGLE_DEVICE_SB_FIELDS \
- ((1U << BCH_SB_FIELD_journal)| \
- (1U << BCH_SB_FIELD_journal_v2))
-
-/* BCH_SB_FIELD_journal: */
-
-struct bch_sb_field_journal {
- struct bch_sb_field field;
- __le64 buckets[];
-};
-
-struct bch_sb_field_journal_v2 {
- struct bch_sb_field field;
-
- struct bch_sb_field_journal_v2_entry {
- __le64 start;
- __le64 nr;
- } d[];
-};
-
-/* BCH_SB_FIELD_crypt: */
-
-struct nonce {
- __le32 d[4];
-};
-
-struct bch_key {
- __le64 key[4];
-};
-
-#define BCH_KEY_MAGIC \
- (((__u64) 'b' << 0)|((__u64) 'c' << 8)| \
- ((__u64) 'h' << 16)|((__u64) '*' << 24)| \
- ((__u64) '*' << 32)|((__u64) 'k' << 40)| \
- ((__u64) 'e' << 48)|((__u64) 'y' << 56))
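/*
 * For reference (not part of the original header): taken as a little-endian
 * u64 and read back as bytes, BCH_KEY_MAGIC is the string "bch**key" - 'b' in
 * the lowest byte through 'y' in the highest.
 */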
-
-struct bch_encrypted_key {
- __le64 magic;
- struct bch_key key;
-};
-
-/*
- * If this field is present in the superblock, it stores an encryption key which
- * is used to encrypt all other data/metadata. The key will normally be encrypted
- * with the key userspace provides, but if encryption has been turned off we'll
- * just store the master key unencrypted in the superblock so we can access the
- * previously encrypted data.
- */
-struct bch_sb_field_crypt {
- struct bch_sb_field field;
-
- __le64 flags;
- __le64 kdf_flags;
- struct bch_encrypted_key key;
-};
-
-LE64_BITMASK(BCH_CRYPT_KDF_TYPE, struct bch_sb_field_crypt, flags, 0, 4);
-
-enum bch_kdf_types {
- BCH_KDF_SCRYPT = 0,
- BCH_KDF_NR = 1,
-};
-
-/* stored as base 2 log of scrypt params: */
-LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16);
-LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32);
-LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
-
-/*
- * On clean shutdown, store btree roots and current journal sequence number in
- * the superblock:
- */
-struct jset_entry {
- __le16 u64s;
- __u8 btree_id;
- __u8 level;
- __u8 type; /* designates what this jset holds */
- __u8 pad[3];
-
- struct bkey_i start[0];
- __u64 _data[];
-};
-
-struct bch_sb_field_clean {
- struct bch_sb_field field;
-
- __le32 flags;
- __le16 _read_clock; /* no longer used */
- __le16 _write_clock;
- __le64 journal_seq;
-
- struct jset_entry start[0];
- __u64 _data[];
-};
-
-struct bch_sb_field_ext {
- struct bch_sb_field field;
- __le64 recovery_passes_required[2];
- __le64 errors_silent[8];
- __le64 btrees_lost_data;
-};
-
-/* Superblock: */
-
-/*
- * New versioning scheme:
- * One common version number for all on disk data structures - superblock, btree
- * nodes, journal entries
- */
-#define BCH_VERSION_MAJOR(_v) ((__u16) ((_v) >> 10))
-#define BCH_VERSION_MINOR(_v) ((__u16) ((_v) & ~(~0U << 10)))
-#define BCH_VERSION(_major, _minor) (((_major) << 10)|(_minor) << 0)
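/*
 * For illustration (not part of the original header): the major number lives
 * in bits 10 and up, the minor in bits 0-9, so e.g.
 *
 *	BCH_VERSION(1, 3)	== (1 << 10) | 3 == 1027
 *	BCH_VERSION_MAJOR(1027)	== 1027 >> 10    == 1
 *	BCH_VERSION_MINOR(1027)	== 1027 & 0x3ff  == 3
 */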
-
-/*
- * field 1: version name
- * field 2: BCH_VERSION(major, minor)
- * field 3: recovery passes required on upgrade
- */
-#define BCH_METADATA_VERSIONS() \
- x(bkey_renumber, BCH_VERSION(0, 10)) \
- x(inode_btree_change, BCH_VERSION(0, 11)) \
- x(snapshot, BCH_VERSION(0, 12)) \
- x(inode_backpointers, BCH_VERSION(0, 13)) \
- x(btree_ptr_sectors_written, BCH_VERSION(0, 14)) \
- x(snapshot_2, BCH_VERSION(0, 15)) \
- x(reflink_p_fix, BCH_VERSION(0, 16)) \
- x(subvol_dirent, BCH_VERSION(0, 17)) \
- x(inode_v2, BCH_VERSION(0, 18)) \
- x(freespace, BCH_VERSION(0, 19)) \
- x(alloc_v4, BCH_VERSION(0, 20)) \
- x(new_data_types, BCH_VERSION(0, 21)) \
- x(backpointers, BCH_VERSION(0, 22)) \
- x(inode_v3, BCH_VERSION(0, 23)) \
- x(unwritten_extents, BCH_VERSION(0, 24)) \
- x(bucket_gens, BCH_VERSION(0, 25)) \
- x(lru_v2, BCH_VERSION(0, 26)) \
- x(fragmentation_lru, BCH_VERSION(0, 27)) \
- x(no_bps_in_alloc_keys, BCH_VERSION(0, 28)) \
- x(snapshot_trees, BCH_VERSION(0, 29)) \
- x(major_minor, BCH_VERSION(1, 0)) \
- x(snapshot_skiplists, BCH_VERSION(1, 1)) \
- x(deleted_inodes, BCH_VERSION(1, 2)) \
- x(rebalance_work, BCH_VERSION(1, 3)) \
- x(member_seq, BCH_VERSION(1, 4)) \
- x(subvolume_fs_parent, BCH_VERSION(1, 5)) \
- x(btree_subvolume_children, BCH_VERSION(1, 6)) \
- x(mi_btree_bitmap, BCH_VERSION(1, 7)) \
- x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \
- x(disk_accounting_v2, BCH_VERSION(1, 9)) \
- x(disk_accounting_v3, BCH_VERSION(1, 10)) \
- x(disk_accounting_inum, BCH_VERSION(1, 11)) \
- x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \
- x(inode_has_child_snapshots, BCH_VERSION(1, 13)) \
- x(backpointer_bucket_gen, BCH_VERSION(1, 14)) \
- x(disk_accounting_big_endian, BCH_VERSION(1, 15)) \
- x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) \
- x(inode_depth, BCH_VERSION(1, 17)) \
- x(persistent_inode_cursors, BCH_VERSION(1, 18)) \
- x(autofix_errors, BCH_VERSION(1, 19)) \
- x(directory_size, BCH_VERSION(1, 20)) \
- x(cached_backpointers, BCH_VERSION(1, 21)) \
- x(stripe_backpointers, BCH_VERSION(1, 22)) \
- x(stripe_lru, BCH_VERSION(1, 23)) \
- x(casefolding, BCH_VERSION(1, 24)) \
- x(extent_flags, BCH_VERSION(1, 25)) \
- x(snapshot_deletion_v2, BCH_VERSION(1, 26)) \
- x(fast_device_removal, BCH_VERSION(1, 27)) \
- x(inode_has_case_insensitive, BCH_VERSION(1, 28))
-
-enum bcachefs_metadata_version {
- bcachefs_metadata_version_min = 9,
-#define x(t, n) bcachefs_metadata_version_##t = n,
- BCH_METADATA_VERSIONS()
-#undef x
- bcachefs_metadata_version_max
-};
-
-static const __maybe_unused
-unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work;
-
-#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
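/*
 * For illustration (not part of the original header): with the list above, the
 * last named version is inode_has_case_insensitive == BCH_VERSION(1, 28) ==
 * 1052, so bcachefs_metadata_version_max is 1053 and
 * bcachefs_metadata_version_current evaluates to 1052 - i.e. "current" always
 * tracks the newest named version.
 */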
-
-#define BCH_SB_SECTOR 8
-
-#define BCH_SB_LAYOUT_SIZE_BITS_MAX 16 /* 32 MB */
-
-struct bch_sb_layout {
- __uuid_t magic; /* bcachefs superblock UUID */
- __u8 layout_type;
- __u8 sb_max_size_bits; /* log2 of max superblock size, in 512 byte sectors */
- __u8 nr_superblocks;
- __u8 pad[5];
- __le64 sb_offset[61];
-} __packed __aligned(8);
-
-#define BCH_SB_LAYOUT_SECTOR 7
-
-/*
- * @offset - sector where this sb was written
- * @version - on disk format version
- * @version_min - Oldest metadata version this filesystem contains; so we can
- * safely drop compatibility code and refuse to mount filesystems
- * we'd need it for
- * @magic - identifies as a bcachefs superblock (BCHFS_MAGIC)
- * @seq - identifies most recent superblock, incremented each time
- * superblock is written
- * @uuid - used for generating various magic numbers and identifying
- * member devices, never changes
- * @user_uuid - user visible UUID, may be changed
- * @label - filesystem label
- * @features - enabled incompatible features
- */
-struct bch_sb {
- struct bch_csum csum;
- __le16 version;
- __le16 version_min;
- __le16 pad[2];
- __uuid_t magic;
- __uuid_t uuid;
- __uuid_t user_uuid;
- __u8 label[BCH_SB_LABEL_SIZE];
- __le64 offset;
- __le64 seq;
-
- __le16 block_size;
- __u8 dev_idx;
- __u8 nr_devices;
- __le32 u64s;
-
- __le64 time_base_lo;
- __le32 time_base_hi;
- __le32 time_precision;
-
- __le64 flags[7];
- __le64 write_time;
- __le64 features[2];
- __le64 compat[2];
-
- struct bch_sb_layout layout;
-
- struct bch_sb_field start[0];
- __le64 _data[];
-} __packed __aligned(8);
-
-/*
- * Flags:
- * BCH_SB_INITIALIZED - set on first mount
- * BCH_SB_CLEAN - did we shut down cleanly? Just a hint, doesn't affect
- * behaviour of mount/recovery path:
- * BCH_SB_INODE_32BIT - limit inode numbers to 32 bits
- * BCH_SB_128_BIT_MACS - 128 bit macs instead of 80
- * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides
- * DATA/META_CSUM_TYPE. Also indicates encryption
- * algorithm in use, if/when we get more than one
- */
-
-LE16_BITMASK(BCH_SB_BLOCK_SIZE, struct bch_sb, block_size, 0, 16);
-
-LE64_BITMASK(BCH_SB_INITIALIZED, struct bch_sb, flags[0], 0, 1);
-LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2);
-LE64_BITMASK(BCH_SB_CSUM_TYPE, struct bch_sb, flags[0], 2, 8);
-LE64_BITMASK(BCH_SB_ERROR_ACTION, struct bch_sb, flags[0], 8, 12);
-
-LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE, struct bch_sb, flags[0], 12, 28);
-
-LE64_BITMASK(BCH_SB_GC_RESERVE, struct bch_sb, flags[0], 28, 33);
-LE64_BITMASK(BCH_SB_ROOT_RESERVE, struct bch_sb, flags[0], 33, 40);
-
-LE64_BITMASK(BCH_SB_META_CSUM_TYPE, struct bch_sb, flags[0], 40, 44);
-LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48);
-
-LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52);
-LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56);
-
-LE64_BITMASK(BCH_SB_POSIX_ACL, struct bch_sb, flags[0], 56, 57);
-LE64_BITMASK(BCH_SB_USRQUOTA, struct bch_sb, flags[0], 57, 58);
-LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59);
-LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60);
-
-LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61);
-LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62);
-
-LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63);
-LE64_BITMASK(BCH_SB_PROMOTE_WHOLE_EXTENTS,
- struct bch_sb, flags[0], 63, 64);
-
-LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4);
-LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1], 4, 8);
-LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9);
-
-LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10);
-LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14);
-
-/*
- * Max size of an extent that may require bouncing to read or write
- * (checksummed, compressed): 64k
- */
-LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS,
- struct bch_sb, flags[1], 14, 20);
-
-LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24);
-LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28);
-
-LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40);
-LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52);
-LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64);
-
-LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO,
- struct bch_sb, flags[2], 0, 4);
-LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64);
-
-LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
-LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28);
-LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29);
-LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
-LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
-LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
-LE64_BITMASK(BCH_SB_MULTI_DEVICE, struct bch_sb, flags[3], 63, 64);
-LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
-LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
-LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34);
-LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE, struct bch_sb, flags[4], 34, 54);
-LE64_BITMASK(BCH_SB_VERSION_UPGRADE, struct bch_sb, flags[4], 54, 56);
-
-LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_HI,struct bch_sb, flags[4], 56, 60);
-LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI,
- struct bch_sb, flags[4], 60, 64);
-
-LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE,
- struct bch_sb, flags[5], 0, 16);
-LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT,
- struct bch_sb, flags[5], 16, 32);
-LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48);
-LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
- struct bch_sb, flags[5], 48, 64);
-LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4);
-LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14);
-LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20);
-LE64_BITMASK(BCH_SB_DEGRADED_ACTION, struct bch_sb, flags[6], 20, 22);
-LE64_BITMASK(BCH_SB_CASEFOLD, struct bch_sb, flags[6], 22, 23);
-LE64_BITMASK(BCH_SB_REBALANCE_AC_ONLY, struct bch_sb, flags[6], 23, 24);
-
-static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
-{
- return BCH_SB_COMPRESSION_TYPE_LO(sb) | (BCH_SB_COMPRESSION_TYPE_HI(sb) << 4);
-}
-
-static inline void SET_BCH_SB_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v)
-{
- SET_BCH_SB_COMPRESSION_TYPE_LO(sb, v);
- SET_BCH_SB_COMPRESSION_TYPE_HI(sb, v >> 4);
-}
-
-static inline __u64 BCH_SB_BACKGROUND_COMPRESSION_TYPE(const struct bch_sb *sb)
-{
- return BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb) |
- (BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb) << 4);
-}
-
-static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v)
-{
- SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb, v);
- SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb, v >> 4);
-}
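/*
 * For illustration (not part of the original header): the compression type is
 * an 8-bit value split across two non-adjacent bitfields, so storing e.g. 0x2b
 * via SET_BCH_SB_COMPRESSION_TYPE() puts 0xb in the 4-bit _LO field and 0x2 in
 * the 4-bit _HI field; the getter above reassembles it as LO | (HI << 4).
 */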
-
-/*
- * Features:
- *
- * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist
- * reflink: gates KEY_TYPE_reflink
- * inline_data: gates KEY_TYPE_inline_data
- * new_siphash: gates BCH_STR_HASH_siphash
- * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE
- */
-#define BCH_SB_FEATURES() \
- x(lz4, 0) \
- x(gzip, 1) \
- x(zstd, 2) \
- x(atomic_nlink, 3) \
- x(ec, 4) \
- x(journal_seq_blacklist_v3, 5) \
- x(reflink, 6) \
- x(new_siphash, 7) \
- x(inline_data, 8) \
- x(new_extent_overwrite, 9) \
- x(incompressible, 10) \
- x(btree_ptr_v2, 11) \
- x(extents_above_btree_updates, 12) \
- x(btree_updates_journalled, 13) \
- x(reflink_inline_data, 14) \
- x(new_varint, 15) \
- x(journal_no_flush, 16) \
- x(alloc_v2, 17) \
- x(extents_across_btree_nodes, 18) \
- x(incompat_version_field, 19) \
- x(casefolding, 20) \
- x(no_alloc_info, 21) \
- x(small_image, 22)
-
-#define BCH_SB_FEATURES_ALWAYS \
- (BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \
- BIT_ULL(BCH_FEATURE_extents_above_btree_updates)|\
- BIT_ULL(BCH_FEATURE_btree_updates_journalled)|\
- BIT_ULL(BCH_FEATURE_alloc_v2)|\
- BIT_ULL(BCH_FEATURE_extents_across_btree_nodes))
-
-#define BCH_SB_FEATURES_ALL \
- (BCH_SB_FEATURES_ALWAYS| \
- BIT_ULL(BCH_FEATURE_new_siphash)| \
- BIT_ULL(BCH_FEATURE_btree_ptr_v2)| \
- BIT_ULL(BCH_FEATURE_new_varint)| \
- BIT_ULL(BCH_FEATURE_journal_no_flush)| \
- BIT_ULL(BCH_FEATURE_incompat_version_field))
-
-enum bch_sb_feature {
-#define x(f, n) BCH_FEATURE_##f,
- BCH_SB_FEATURES()
-#undef x
- BCH_FEATURE_NR,
-};
-
-#define BCH_SB_COMPAT() \
- x(alloc_info, 0) \
- x(alloc_metadata, 1) \
- x(extents_above_btree_updates_done, 2) \
- x(bformat_overflow_done, 3)
-
-enum bch_sb_compat {
-#define x(f, n) BCH_COMPAT_##f,
- BCH_SB_COMPAT()
-#undef x
- BCH_COMPAT_NR,
-};
-
-/* options: */
-
-#define BCH_VERSION_UPGRADE_OPTS() \
- x(compatible, 0) \
- x(incompatible, 1) \
- x(none, 2)
-
-enum bch_version_upgrade_opts {
-#define x(t, n) BCH_VERSION_UPGRADE_##t = n,
- BCH_VERSION_UPGRADE_OPTS()
-#undef x
-};
-
-#define BCH_REPLICAS_MAX 4U
-
-#define BCH_BKEY_PTRS_MAX 16U
-
-#define BCH_ERROR_ACTIONS() \
- x(continue, 0) \
- x(fix_safe, 1) \
- x(panic, 2) \
- x(ro, 3)
-
-enum bch_error_actions {
-#define x(t, n) BCH_ON_ERROR_##t = n,
- BCH_ERROR_ACTIONS()
-#undef x
- BCH_ON_ERROR_NR
-};
-
-#define BCH_DEGRADED_ACTIONS() \
- x(ask, 0) \
- x(yes, 1) \
- x(very, 2) \
- x(no, 3)
-
-enum bch_degraded_actions {
-#define x(t, n) BCH_DEGRADED_##t = n,
- BCH_DEGRADED_ACTIONS()
-#undef x
- BCH_DEGRADED_ACTIONS_NR
-};
-
-#define BCH_STR_HASH_TYPES() \
- x(crc32c, 0) \
- x(crc64, 1) \
- x(siphash_old, 2) \
- x(siphash, 3)
-
-enum bch_str_hash_type {
-#define x(t, n) BCH_STR_HASH_##t = n,
- BCH_STR_HASH_TYPES()
-#undef x
- BCH_STR_HASH_NR
-};
-
-#define BCH_STR_HASH_OPTS() \
- x(crc32c, 0) \
- x(crc64, 1) \
- x(siphash, 2)
-
-enum bch_str_hash_opts {
-#define x(t, n) BCH_STR_HASH_OPT_##t = n,
- BCH_STR_HASH_OPTS()
-#undef x
- BCH_STR_HASH_OPT_NR
-};
-
-#define BCH_CSUM_TYPES() \
- x(none, 0) \
- x(crc32c_nonzero, 1) \
- x(crc64_nonzero, 2) \
- x(chacha20_poly1305_80, 3) \
- x(chacha20_poly1305_128, 4) \
- x(crc32c, 5) \
- x(crc64, 6) \
- x(xxhash, 7)
-
-enum bch_csum_type {
-#define x(t, n) BCH_CSUM_##t = n,
- BCH_CSUM_TYPES()
-#undef x
- BCH_CSUM_NR
-};
-
-static const __maybe_unused unsigned bch_crc_bytes[] = {
- [BCH_CSUM_none] = 0,
- [BCH_CSUM_crc32c_nonzero] = 4,
- [BCH_CSUM_crc32c] = 4,
- [BCH_CSUM_crc64_nonzero] = 8,
- [BCH_CSUM_crc64] = 8,
- [BCH_CSUM_xxhash] = 8,
- [BCH_CSUM_chacha20_poly1305_80] = 10,
- [BCH_CSUM_chacha20_poly1305_128] = 16,
-};
-
-static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
-{
- switch (type) {
- case BCH_CSUM_chacha20_poly1305_80:
- case BCH_CSUM_chacha20_poly1305_128:
- return true;
- default:
- return false;
- }
-}
-
-#define BCH_CSUM_OPTS() \
- x(none, 0) \
- x(crc32c, 1) \
- x(crc64, 2) \
- x(xxhash, 3)
-
-enum bch_csum_opt {
-#define x(t, n) BCH_CSUM_OPT_##t = n,
- BCH_CSUM_OPTS()
-#undef x
- BCH_CSUM_OPT_NR
-};
-
-#define BCH_COMPRESSION_TYPES() \
- x(none, 0) \
- x(lz4_old, 1) \
- x(gzip, 2) \
- x(lz4, 3) \
- x(zstd, 4) \
- x(incompressible, 5)
-
-enum bch_compression_type {
-#define x(t, n) BCH_COMPRESSION_TYPE_##t = n,
- BCH_COMPRESSION_TYPES()
-#undef x
- BCH_COMPRESSION_TYPE_NR
-};
-
-#define BCH_COMPRESSION_OPTS() \
- x(none, 0) \
- x(lz4, 1) \
- x(gzip, 2) \
- x(zstd, 3)
-
-enum bch_compression_opts {
-#define x(t, n) BCH_COMPRESSION_OPT_##t = n,
- BCH_COMPRESSION_OPTS()
-#undef x
- BCH_COMPRESSION_OPT_NR
-};
-
-/*
- * Magic numbers
- *
- * The various other data structures have their own magic numbers, which are
- * xored with the first 64 bits of the filesystem's UUID (sb->uuid)
- */
-
-#define BCACHE_MAGIC \
- UUID_INIT(0xc68573f6, 0x4e1a, 0x45ca, \
- 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81)
-#define BCHFS_MAGIC \
- UUID_INIT(0xc68573f6, 0x66ce, 0x90a9, \
- 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)
-
-#define BCACHEFS_STATFS_MAGIC BCACHEFS_SUPER_MAGIC
-
-#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL)
-#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL)
-
-static inline __le64 __bch2_sb_magic(struct bch_sb *sb)
-{
- __le64 ret;
-
- memcpy(&ret, &sb->uuid, sizeof(ret));
- return ret;
-}
-
-static inline __u64 __jset_magic(struct bch_sb *sb)
-{
- return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC);
-}
-
-static inline __u64 __bset_magic(struct bch_sb *sb)
-{
- return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC);
-}
-
-/* Journal */
-
-#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
-
-#define BCH_JSET_ENTRY_TYPES() \
- x(btree_keys, 0) \
- x(btree_root, 1) \
- x(prio_ptrs, 2) \
- x(blacklist, 3) \
- x(blacklist_v2, 4) \
- x(usage, 5) \
- x(data_usage, 6) \
- x(clock, 7) \
- x(dev_usage, 8) \
- x(log, 9) \
- x(overwrite, 10) \
- x(write_buffer_keys, 11) \
- x(datetime, 12) \
- x(log_bkey, 13)
-
-enum bch_jset_entry_type {
-#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
- BCH_JSET_ENTRY_TYPES()
-#undef x
- BCH_JSET_ENTRY_NR
-};
-
-static inline bool jset_entry_is_key(struct jset_entry *e)
-{
- switch (e->type) {
- case BCH_JSET_ENTRY_btree_keys:
- case BCH_JSET_ENTRY_btree_root:
- case BCH_JSET_ENTRY_write_buffer_keys:
- return true;
- }
-
- return false;
-}
-
-/*
- * Journal sequence numbers can be blacklisted: bsets record the max sequence
- * number of all the journal entries they contain updates for, so that on
- * recovery we can ignore those bsets that contain index updates newer than what
- * made it into the journal.
- *
- * This means that we can't reuse that journal_seq - we have to skip it, and
- * then record that we skipped it so that the next time we crash and recover we
- * don't think there was a missing journal entry.
- */
-struct jset_entry_blacklist {
- struct jset_entry entry;
- __le64 seq;
-};
-
-struct jset_entry_blacklist_v2 {
- struct jset_entry entry;
- __le64 start;
- __le64 end;
-};
-
-#define BCH_FS_USAGE_TYPES() \
- x(reserved, 0) \
- x(inodes, 1) \
- x(key_version, 2)
-
-enum bch_fs_usage_type {
-#define x(f, nr) BCH_FS_USAGE_##f = nr,
- BCH_FS_USAGE_TYPES()
-#undef x
- BCH_FS_USAGE_NR
-};
-
-struct jset_entry_usage {
- struct jset_entry entry;
- __le64 v;
-} __packed;
-
-struct jset_entry_data_usage {
- struct jset_entry entry;
- __le64 v;
- struct bch_replicas_entry_v1 r;
-} __packed;
-
-struct jset_entry_clock {
- struct jset_entry entry;
- __u8 rw;
- __u8 pad[7];
- __le64 time;
-} __packed;
-
-struct jset_entry_dev_usage_type {
- __le64 buckets;
- __le64 sectors;
- __le64 fragmented;
-} __packed;
-
-struct jset_entry_dev_usage {
- struct jset_entry entry;
- __le32 dev;
- __u32 pad;
-
- __le64 _buckets_ec; /* No longer used */
- __le64 _buckets_unavailable; /* No longer used */
-
- struct jset_entry_dev_usage_type d[];
-};
-
-static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u)
-{
- return (vstruct_bytes(&u->entry) - sizeof(struct jset_entry_dev_usage)) /
- sizeof(struct jset_entry_dev_usage_type);
-}
-
-struct jset_entry_log {
- struct jset_entry entry;
- u8 d[];
-} __packed __aligned(8);
-
-static inline unsigned jset_entry_log_msg_bytes(struct jset_entry_log *l)
-{
- unsigned b = vstruct_bytes(&l->entry) - offsetof(struct jset_entry_log, d);
-
- while (b && !l->d[b - 1])
- --b;
- return b;
-}
-
-struct jset_entry_datetime {
- struct jset_entry entry;
- __le64 seconds;
-} __packed __aligned(8);
-
-/*
- * On disk format for a journal entry:
- * seq is monotonically increasing; every journal entry has its own unique
- * sequence number.
- *
- * last_seq is the oldest journal entry that still has keys the btree hasn't
- * flushed to disk yet.
- *
- * version is for on disk format changes.
- */
-struct jset {
- struct bch_csum csum;
-
- __le64 magic;
- __le64 seq;
- __le32 version;
- __le32 flags;
-
- __le32 u64s; /* size of d[] in u64s */
-
- __u8 encrypted_start[0];
-
- __le16 _read_clock; /* no longer used */
- __le16 _write_clock;
-
- /* Sequence number of oldest dirty journal entry */
- __le64 last_seq;
-
-
- struct jset_entry start[0];
- __u64 _data[];
-} __packed __aligned(8);
-
-LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4);
-LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5);
-LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6);
-
-#define BCH_JOURNAL_BUCKETS_MIN 8
-
-/* Btree: */
-
-enum btree_id_flags {
- BTREE_IS_extents = BIT(0),
- BTREE_IS_snapshots = BIT(1),
- BTREE_IS_snapshot_field = BIT(2),
- BTREE_IS_data = BIT(3),
- BTREE_IS_write_buffer = BIT(4),
-};
-
-#define BCH_BTREE_IDS() \
- x(extents, 0, \
- BTREE_IS_extents| \
- BTREE_IS_snapshots| \
- BTREE_IS_data, \
- BIT_ULL(KEY_TYPE_whiteout)| \
- BIT_ULL(KEY_TYPE_error)| \
- BIT_ULL(KEY_TYPE_cookie)| \
- BIT_ULL(KEY_TYPE_extent)| \
- BIT_ULL(KEY_TYPE_reservation)| \
- BIT_ULL(KEY_TYPE_reflink_p)| \
- BIT_ULL(KEY_TYPE_inline_data)) \
- x(inodes, 1, \
- BTREE_IS_snapshots, \
- BIT_ULL(KEY_TYPE_whiteout)| \
- BIT_ULL(KEY_TYPE_inode)| \
- BIT_ULL(KEY_TYPE_inode_v2)| \
- BIT_ULL(KEY_TYPE_inode_v3)| \
- BIT_ULL(KEY_TYPE_inode_generation)) \
- x(dirents, 2, \
- BTREE_IS_snapshots, \
- BIT_ULL(KEY_TYPE_whiteout)| \
- BIT_ULL(KEY_TYPE_hash_whiteout)| \
- BIT_ULL(KEY_TYPE_dirent)) \
- x(xattrs, 3, \
- BTREE_IS_snapshots, \
- BIT_ULL(KEY_TYPE_whiteout)| \
- BIT_ULL(KEY_TYPE_cookie)| \
- BIT_ULL(KEY_TYPE_hash_whiteout)| \
- BIT_ULL(KEY_TYPE_xattr)) \
- x(alloc, 4, 0, \
- BIT_ULL(KEY_TYPE_alloc)| \
- BIT_ULL(KEY_TYPE_alloc_v2)| \
- BIT_ULL(KEY_TYPE_alloc_v3)| \
- BIT_ULL(KEY_TYPE_alloc_v4)) \
- x(quotas, 5, 0, \
- BIT_ULL(KEY_TYPE_quota)) \
- x(stripes, 6, 0, \
- BIT_ULL(KEY_TYPE_stripe)) \
- x(reflink, 7, \
- BTREE_IS_extents| \
- BTREE_IS_data, \
- BIT_ULL(KEY_TYPE_reflink_v)| \
- BIT_ULL(KEY_TYPE_indirect_inline_data)| \
- BIT_ULL(KEY_TYPE_error)) \
- x(subvolumes, 8, 0, \
- BIT_ULL(KEY_TYPE_subvolume)) \
- x(snapshots, 9, 0, \
- BIT_ULL(KEY_TYPE_snapshot)) \
- x(lru, 10, \
- BTREE_IS_write_buffer, \
- BIT_ULL(KEY_TYPE_set)) \
- x(freespace, 11, \
- BTREE_IS_extents, \
- BIT_ULL(KEY_TYPE_set)) \
- x(need_discard, 12, 0, \
- BIT_ULL(KEY_TYPE_set)) \
- x(backpointers, 13, \
- BTREE_IS_write_buffer, \
- BIT_ULL(KEY_TYPE_backpointer)) \
- x(bucket_gens, 14, 0, \
- BIT_ULL(KEY_TYPE_bucket_gens)) \
- x(snapshot_trees, 15, 0, \
- BIT_ULL(KEY_TYPE_snapshot_tree)) \
- x(deleted_inodes, 16, \
- BTREE_IS_snapshot_field| \
- BTREE_IS_write_buffer, \
- BIT_ULL(KEY_TYPE_set)) \
- x(logged_ops, 17, 0, \
- BIT_ULL(KEY_TYPE_logged_op_truncate)| \
- BIT_ULL(KEY_TYPE_logged_op_finsert)| \
- BIT_ULL(KEY_TYPE_inode_alloc_cursor)) \
- x(rebalance_work, 18, \
- BTREE_IS_snapshot_field| \
- BTREE_IS_write_buffer, \
- BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \
- x(subvolume_children, 19, 0, \
- BIT_ULL(KEY_TYPE_set)) \
- x(accounting, 20, \
- BTREE_IS_snapshot_field| \
- BTREE_IS_write_buffer, \
- BIT_ULL(KEY_TYPE_accounting)) \
-
-enum btree_id {
-#define x(name, nr, ...) BTREE_ID_##name = nr,
- BCH_BTREE_IDS()
-#undef x
- BTREE_ID_NR
-};
-
-/*
- * Maximum number of btrees that we will _ever_ have under the current scheme,
- * where we refer to them with 64 bit bitfields - and we also need a bit for
- * the interior btree node type:
- */
-#define BTREE_ID_NR_MAX 63
-
-static inline bool btree_id_is_alloc(enum btree_id id)
-{
- switch (id) {
- case BTREE_ID_alloc:
- case BTREE_ID_backpointers:
- case BTREE_ID_need_discard:
- case BTREE_ID_freespace:
- case BTREE_ID_bucket_gens:
- case BTREE_ID_lru:
- case BTREE_ID_accounting:
- return true;
- default:
- return false;
- }
-}
-
-#define BTREE_MAX_DEPTH 4U
-
-/* Btree nodes */
-
-/*
- * Btree nodes
- *
- * On disk a btree node is a list/log of these; within each set the keys are
- * sorted
- */
-struct bset {
- __le64 seq;
-
- /*
- * Highest journal entry this bset contains keys for.
- * If on recovery we don't see that journal entry, this bset is ignored:
- * this allows us to preserve the order of all index updates after a
- * crash, since the journal records a total order of all index updates
- * and anything that didn't make it to the journal doesn't get used.
- */
- __le64 journal_seq;
-
- __le32 flags;
- __le16 version;
- __le16 u64s; /* count of d[] in u64s */
-
- struct bkey_packed start[0];
- __u64 _data[];
-} __packed __aligned(8);
-
-LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4);
-
-LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5);
-LE32_BITMASK(BSET_SEPARATE_WHITEOUTS,
- struct bset, flags, 5, 6);
-
-/* Sector offset within the btree node: */
-LE32_BITMASK(BSET_OFFSET, struct bset, flags, 16, 32);
-
-struct btree_node {
- struct bch_csum csum;
- __le64 magic;
-
- /* this flags field is encrypted, unlike bset->flags: */
- __le64 flags;
-
- /* Closed interval: */
- struct bpos min_key;
- struct bpos max_key;
- struct bch_extent_ptr _ptr; /* not used anymore */
- struct bkey_format format;
-
- union {
- struct bset keys;
- struct {
- __u8 pad[22];
- __le16 u64s;
- __u64 _data[0];
-
- };
- };
-} __packed __aligned(8);
-
-LE64_BITMASK(BTREE_NODE_ID_LO, struct btree_node, flags, 0, 4);
-LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8);
-LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE,
- struct btree_node, flags, 8, 9);
-LE64_BITMASK(BTREE_NODE_ID_HI, struct btree_node, flags, 9, 25);
-/* 25-32 unused */
-LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64);
-
-static inline __u64 BTREE_NODE_ID(struct btree_node *n)
-{
- return BTREE_NODE_ID_LO(n) | (BTREE_NODE_ID_HI(n) << 4);
-}
-
-static inline void SET_BTREE_NODE_ID(struct btree_node *n, __u64 v)
-{
- SET_BTREE_NODE_ID_LO(n, v);
- SET_BTREE_NODE_ID_HI(n, v >> 4);
-}
-
-struct btree_node_entry {
- struct bch_csum csum;
-
- union {
- struct bset keys;
- struct {
- __u8 pad[22];
- __le16 u64s;
- __u64 _data[0];
- };
- };
-} __packed __aligned(8);
-
-#endif /* _BCACHEFS_FORMAT_H */
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
deleted file mode 100644
index 52594e925eb7..000000000000
--- a/fs/bcachefs/bcachefs_ioctl.h
+++ /dev/null
@@ -1,473 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IOCTL_H
-#define _BCACHEFS_IOCTL_H
-
-#include <linux/uuid.h>
-#include <asm/ioctl.h>
-#include "bcachefs_format.h"
-#include "bkey_types.h"
-
-/*
- * Flags common to multiple ioctls:
- */
-#define BCH_FORCE_IF_DATA_LOST (1 << 0)
-#define BCH_FORCE_IF_METADATA_LOST (1 << 1)
-#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2)
-#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3)
-
-#define BCH_FORCE_IF_LOST \
- (BCH_FORCE_IF_DATA_LOST| \
- BCH_FORCE_IF_METADATA_LOST)
-#define BCH_FORCE_IF_DEGRADED \
- (BCH_FORCE_IF_DATA_DEGRADED| \
- BCH_FORCE_IF_METADATA_DEGRADED)
-
-/*
- * If cleared, ioctls that refer to a device pass it as a pointer to a pathname
- * (e.g. /dev/sda1); if set, the dev field is the device's index within the
- * filesystem:
- */
-#define BCH_BY_INDEX (1 << 4)
-
-/*
- * For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem
- * wide superblock:
- */
-#define BCH_READ_DEV (1 << 5)
-
-/* global control dev: */
-
-/* These are currently broken, and probably unnecessary: */
-#if 0
-#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble)
-#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental)
-
-struct bch_ioctl_assemble {
- __u32 flags;
- __u32 nr_devs;
- __u64 pad;
- __u64 devs[];
-};
-
-struct bch_ioctl_incremental {
- __u32 flags;
- __u64 pad;
- __u64 dev;
-};
-#endif
-
-/* filesystem ioctls: */
-
-#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid)
-
-/* These only make sense when we also have incremental assembly */
-#if 0
-#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start)
-#define BCH_IOCTL_STOP _IO(0xbc, 3)
-#endif
-
-#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state)
-#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data)
-#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage)
-#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage)
-#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super)
-#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx)
-#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize)
-#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15, struct bch_ioctl_disk_resize_journal)
-
-#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume)
-#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume)
-
-#define BCH_IOCTL_DEV_USAGE_V2 _IOWR(0xbc, 18, struct bch_ioctl_dev_usage_v2)
-
-#define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline)
-#define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online)
-#define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting)
-#define BCH_IOCTL_QUERY_COUNTERS _IOW(0xbc, 21, struct bch_ioctl_query_counters)
-
-/* ioctls below act on a particular file, not the filesystem as a whole: */
-
-#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *)
-
-/*
- * BCH_IOCTL_QUERY_UUID: get filesystem UUID
- *
- * Returns user visible UUID, not internal UUID (which may not ever be changed);
- * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with
- * this UUID.
- */
-struct bch_ioctl_query_uuid {
- __uuid_t uuid;
-};
-
-#if 0
-struct bch_ioctl_start {
- __u32 flags;
- __u32 pad;
-};
-#endif
-
-/*
- * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem
- *
- * The specified device must not be open or in use. On success, the new device
- * will be an online member of the filesystem just like any other member.
- *
- * The device must first be prepared by userspace by formatting with a bcachefs
- * superblock, which is only used for passing in superblock options/parameters
- * for that device (in struct bch_member). The new device's superblock should
- * not claim to be a member of any existing filesystem - UUIDs on it will be
- * ignored.
- */
-
-/*
- * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem
- *
- * Any data present on @dev will be permanently deleted, and @dev will be
- * removed from its slot in the filesystem's list of member devices. The device
- * may be either online or offline.
- *
- * Will fail if removing @dev would leave us with insufficient read/write
- * devices or degraded/unavailable data, unless the appropriate BCH_FORCE_IF_*
- * flags are set.
- */
-
-/*
- * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem
- * but is not open (e.g. because we started in degraded mode), bring it online
- *
- * all existing data on @dev will be available once the device is online,
- * exactly as if @dev was present when the filesystem was first mounted
- */
-
-/*
- * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that
- * block device, without removing it from the filesystem (so it can be brought
- * back online later)
- *
- * Data present on @dev will be unavailable while @dev is offline (unless
- * replicated), but will still be intact and untouched if @dev is brought back
- * online
- *
- * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would
- * leave us with insufficient read/write devices or degraded/unavailable data,
- * unless the appropriate BCH_FORCE_IF_* flags are set.
- */
-
-struct bch_ioctl_disk {
- __u32 flags;
- __u32 pad;
- __u64 dev;
-};
-
-/*
- * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem
- *
- * @new_state - one of the bch_member_state states (rw, ro, failed,
- * spare)
- *
- * Will refuse to change member state if we would then have insufficient devices
- * to write to, or if it would result in degraded data (when @new_state is
- * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set.
- */
-struct bch_ioctl_disk_set_state {
- __u32 flags;
- __u8 new_state;
- __u8 pad[3];
- __u64 dev;
-};
-
-#define BCH_DATA_OPS() \
- x(scrub, 0) \
- x(rereplicate, 1) \
- x(migrate, 2) \
- x(rewrite_old_nodes, 3) \
- x(drop_extra_replicas, 4)
-
-enum bch_data_ops {
-#define x(t, n) BCH_DATA_OP_##t = n,
- BCH_DATA_OPS()
-#undef x
- BCH_DATA_OP_NR
-};
-
-/*
- * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g.
- * scrub, rereplicate, migrate).
- *
- * This ioctl kicks off a job in the background, and returns a file descriptor.
- * Reading from the file descriptor returns a struct bch_ioctl_data_event,
- * indicating current progress, and closing the file descriptor will stop the
- * job. The file descriptor is O_CLOEXEC.
- */
-struct bch_ioctl_data {
- __u16 op;
- __u8 start_btree;
- __u8 end_btree;
- __u32 flags;
-
- struct bpos start_pos;
- struct bpos end_pos;
-
- union {
- struct {
- __u32 dev;
- __u32 data_types;
- } scrub;
- struct {
- __u32 dev;
- __u32 pad;
- } migrate;
- struct {
- __u64 pad[8];
- };
- };
-} __packed __aligned(8);
-
-enum bch_data_event {
- BCH_DATA_EVENT_PROGRESS = 0,
- /* XXX: add an event for reporting errors */
- BCH_DATA_EVENT_NR = 1,
-};
-
-enum data_progress_data_type_special {
- DATA_PROGRESS_DATA_TYPE_phys = 254,
- DATA_PROGRESS_DATA_TYPE_done = 255,
-};
-
-struct bch_ioctl_data_progress {
- __u8 data_type;
- __u8 btree_id;
- __u8 pad[2];
- struct bpos pos;
-
- __u64 sectors_done;
- __u64 sectors_total;
- __u64 sectors_error_corrected;
- __u64 sectors_error_uncorrected;
-} __packed __aligned(8);
-
-enum bch_ioctl_data_event_ret {
- BCH_IOCTL_DATA_EVENT_RET_done = 1,
- BCH_IOCTL_DATA_EVENT_RET_device_offline = 2,
-};
-
-struct bch_ioctl_data_event {
- __u8 type;
- __u8 ret;
- __u8 pad[6];
- union {
- struct bch_ioctl_data_progress p;
- __u64 pad2[15];
- };
-} __packed __aligned(8);
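/*
 * A minimal userspace sketch (not part of the original header) of driving
 * BCH_IOCTL_DATA as described above. It assumes fs_fd is a descriptor on
 * which the filesystem accepts these ioctls, and that <sys/ioctl.h>,
 * <unistd.h> and the bcachefs UAPI headers are included; the ioctl's return
 * value is the job file descriptor, which is read for progress events and
 * closed to stop the job.
 */
static int run_rereplicate(int fs_fd)
{
	struct bch_ioctl_data cmd = {
		.op		= BCH_DATA_OP_rereplicate,
		.start_btree	= 0,
		.end_btree	= BTREE_ID_NR,
		.start_pos	= POS_MIN,
		.end_pos	= POS_MAX,
	};
	struct bch_ioctl_data_event ev;
	int job_fd = ioctl(fs_fd, BCH_IOCTL_DATA, &cmd);

	if (job_fd < 0)
		return -1;

	/* read progress events until the job reports completion */
	while (read(job_fd, &ev, sizeof(ev)) == sizeof(ev) &&
	       ev.type == BCH_DATA_EVENT_PROGRESS &&
	       ev.p.data_type != DATA_PROGRESS_DATA_TYPE_done)
		; /* ev.p.sectors_done / ev.p.sectors_total give progress */

	close(job_fd);	/* closing the descriptor stops the job */
	return 0;
}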
-
-struct bch_replicas_usage {
- __u64 sectors;
- struct bch_replicas_entry_v1 r;
-} __packed;
-
-static inline unsigned replicas_usage_bytes(struct bch_replicas_usage *u)
-{
- return offsetof(struct bch_replicas_usage, r) + replicas_entry_bytes(&u->r);
-}
-
-static inline struct bch_replicas_usage *
-replicas_usage_next(struct bch_replicas_usage *u)
-{
- return (void *) u + replicas_usage_bytes(u);
-}
-
-/* Obsolete */
-/*
- * BCH_IOCTL_FS_USAGE: query filesystem disk space usage
- *
- * Returns disk space usage broken out by data type, number of replicas, and
- * by component device
- *
- * @replica_entries_bytes - size, in bytes, allocated for replica usage entries
- *
- * On success, @replica_entries_bytes will be changed to indicate the number of
- * bytes actually used.
- *
- * Returns -ERANGE if @replica_entries_bytes was too small
- */
-struct bch_ioctl_fs_usage {
- __u64 capacity;
- __u64 used;
- __u64 online_reserved;
- __u64 persistent_reserved[BCH_REPLICAS_MAX];
-
- __u32 replica_entries_bytes;
- __u32 pad;
-
- struct bch_replicas_usage replicas[];
-};
-
-/* Obsolete */
-/*
- * BCH_IOCTL_DEV_USAGE: query device disk space usage
- *
- * Returns disk space usage broken out by data type - both by buckets and
- * sectors.
- */
-struct bch_ioctl_dev_usage {
- __u64 dev;
- __u32 flags;
- __u8 state;
- __u8 pad[7];
-
- __u32 bucket_size;
- __u64 nr_buckets;
-
- __u64 buckets_ec;
-
- struct bch_ioctl_dev_usage_type {
- __u64 buckets;
- __u64 sectors;
- __u64 fragmented;
- } d[10];
-};
-
-/* Obsolete */
-struct bch_ioctl_dev_usage_v2 {
- __u64 dev;
- __u32 flags;
- __u8 state;
- __u8 nr_data_types;
- __u8 pad[6];
-
- __u32 bucket_size;
- __u64 nr_buckets;
-
- struct bch_ioctl_dev_usage_type d[];
-};
-
-/*
- * BCH_IOCTL_READ_SUPER: read filesystem superblock
- *
- * Equivalent to reading the superblock directly from the block device, except
- * avoids racing with the kernel writing the superblock or having to figure out
- * which block device to read
- *
- * @sb - buffer to read into
- * @size - size of userspace allocated buffer
- * @dev - device to read superblock for, if BCH_READ_DEV flag is
- * specified
- *
- * Returns -ERANGE if buffer provided is too small
- */
-struct bch_ioctl_read_super {
- __u32 flags;
- __u32 pad;
- __u64 dev;
- __u64 size;
- __u64 sb;
-};
-
-/*
- * BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to
- * determine if disk is an (online) member - if so, returns device's index
- *
- * Returns -ENOENT if not found
- */
-struct bch_ioctl_disk_get_idx {
- __u64 dev;
-};
-
-/*
- * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device
- *
- * @dev - member to resize
- * @nbuckets - new number of buckets
- */
-struct bch_ioctl_disk_resize {
- __u32 flags;
- __u32 pad;
- __u64 dev;
- __u64 nbuckets;
-};
-
-/*
- * BCH_IOCTL_DISK_RESIZE_JOURNAL: resize journal on a device
- *
- * @dev - member to resize
- * @nbuckets - new number of buckets
- */
-struct bch_ioctl_disk_resize_journal {
- __u32 flags;
- __u32 pad;
- __u64 dev;
- __u64 nbuckets;
-};
-
-struct bch_ioctl_subvolume {
- __u32 flags;
- __u32 dirfd;
- __u16 mode;
- __u16 pad[3];
- __u64 dst_ptr;
- __u64 src_ptr;
-};
-
-#define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0)
-#define BCH_SUBVOL_SNAPSHOT_RO (1U << 1)
-
-/*
- * BCH_IOCTL_FSCK_OFFLINE: run fsck from the 'bcachefs fsck' userspace command,
- * but with the kernel's implementation of fsck:
- */
-struct bch_ioctl_fsck_offline {
- __u64 flags;
- __u64 opts; /* string */
- __u64 nr_devs;
- __u64 devs[] __counted_by(nr_devs);
-};
-
-/*
- * BCH_IOCTL_FSCK_ONLINE: run fsck from the 'bcachefs fsck' userspace command,
- * but with the kernel's implementation of fsck:
- */
-struct bch_ioctl_fsck_online {
- __u64 flags;
- __u64 opts; /* string */
-};
-
-/*
- * BCH_IOCTL_QUERY_ACCOUNTING: query filesystem disk accounting
- *
- * Returns disk space usage broken out by data type, number of replicas, and
- * by component device
- *
- * @replica_entries_bytes - size, in bytes, allocated for replica usage entries
- *
- * On success, @replica_entries_bytes will be changed to indicate the number of
- * bytes actually used.
- *
- * Returns -ERANGE if @replica_entries_bytes was too small
- */
-struct bch_ioctl_query_accounting {
- __u64 capacity;
- __u64 used;
- __u64 online_reserved;
-
- __u32 accounting_u64s; /* input parameter */
- __u32 accounting_types_mask; /* input parameter */
-
- struct bkey_i_accounting accounting[];
-};
-
-#define BCH_IOCTL_QUERY_COUNTERS_MOUNT (1 << 0)
-
-struct bch_ioctl_query_counters {
- __u16 nr;
- __u16 flags;
- __u32 pad;
- __u64 d[];
-};
-
-#endif /* _BCACHEFS_IOCTL_H */
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
deleted file mode 100644
index ee823c640642..000000000000
--- a/fs/bcachefs/bkey.c
+++ /dev/null
@@ -1,1112 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey.h"
-#include "bkey_cmp.h"
-#include "bkey_methods.h"
-#include "bset.h"
-#include "util.h"
-
-const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT;
-
-void bch2_bkey_packed_to_binary_text(struct printbuf *out,
- const struct bkey_format *f,
- const struct bkey_packed *k)
-{
- const u64 *p = high_word(f, k);
- unsigned word_bits = 64 - high_bit_offset;
- unsigned nr_key_bits = bkey_format_key_bits(f) + high_bit_offset;
- u64 v = *p & (~0ULL >> high_bit_offset);
-
- if (!nr_key_bits) {
- prt_str(out, "(empty)");
- return;
- }
-
- while (1) {
- unsigned next_key_bits = nr_key_bits;
-
- if (nr_key_bits < 64) {
- v >>= 64 - nr_key_bits;
- next_key_bits = 0;
- } else {
- next_key_bits -= 64;
- }
-
- bch2_prt_u64_base2_nbits(out, v, min(word_bits, nr_key_bits));
-
- if (!next_key_bits)
- break;
-
- prt_char(out, ' ');
-
- p = next_word(p);
- v = *p;
- word_bits = 64;
- nr_key_bits = next_key_bits;
- }
-}
-
-static void __bch2_bkey_pack_verify(const struct bkey_packed *packed,
- const struct bkey *unpacked,
- const struct bkey_format *format)
-{
- struct bkey tmp;
-
- BUG_ON(bkeyp_val_u64s(format, packed) !=
- bkey_val_u64s(unpacked));
-
- BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed));
-
- tmp = __bch2_bkey_unpack_key(format, packed);
-
- if (memcmp(&tmp, unpacked, sizeof(struct bkey))) {
- struct printbuf buf = PRINTBUF;
-
- prt_printf(&buf, "keys differ: format u64s %u fields %u %u %u %u %u\n",
- format->key_u64s,
- format->bits_per_field[0],
- format->bits_per_field[1],
- format->bits_per_field[2],
- format->bits_per_field[3],
- format->bits_per_field[4]);
-
- prt_printf(&buf, "compiled unpack: ");
- bch2_bkey_to_text(&buf, unpacked);
- prt_newline(&buf);
-
- prt_printf(&buf, "c unpack: ");
- bch2_bkey_to_text(&buf, &tmp);
- prt_newline(&buf);
-
- prt_printf(&buf, "compiled unpack: ");
- bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current,
- (struct bkey_packed *) unpacked);
- prt_newline(&buf);
-
- prt_printf(&buf, "c unpack: ");
- bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current,
- (struct bkey_packed *) &tmp);
- prt_newline(&buf);
-
- panic("%s", buf.buf);
- }
-}
-
-static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed,
- const struct bkey *unpacked,
- const struct bkey_format *format)
-{
- if (static_branch_unlikely(&bch2_debug_check_bkey_unpack))
- __bch2_bkey_pack_verify(packed, unpacked, format);
-}
-
-struct pack_state {
- const struct bkey_format *format;
- unsigned bits; /* bits remaining in current word */
- u64 w; /* current word */
- u64 *p; /* pointer to next word */
-};
-
-__always_inline
-static struct pack_state pack_state_init(const struct bkey_format *format,
- struct bkey_packed *k)
-{
- u64 *p = high_word(format, k);
-
- return (struct pack_state) {
- .format = format,
- .bits = 64 - high_bit_offset,
- .w = 0,
- .p = p,
- };
-}
-
-__always_inline
-static void pack_state_finish(struct pack_state *state,
- struct bkey_packed *k)
-{
- EBUG_ON(state->p < k->_data);
- EBUG_ON(state->p >= (u64 *) k->_data + state->format->key_u64s);
-
- *state->p = state->w;
-}
-
-struct unpack_state {
- const struct bkey_format *format;
- unsigned bits; /* bits remaining in current word */
- u64 w; /* current word */
- const u64 *p; /* pointer to next word */
-};
-
-__always_inline
-static struct unpack_state unpack_state_init(const struct bkey_format *format,
- const struct bkey_packed *k)
-{
- const u64 *p = high_word(format, k);
-
- return (struct unpack_state) {
- .format = format,
- .bits = 64 - high_bit_offset,
- .w = *p << high_bit_offset,
- .p = p,
- };
-}
-
-__always_inline
-static u64 get_inc_field(struct unpack_state *state, unsigned field)
-{
- unsigned bits = state->format->bits_per_field[field];
- u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]);
-
- if (bits >= state->bits) {
- v = state->w >> (64 - bits);
- bits -= state->bits;
-
- state->p = next_word(state->p);
- state->w = *state->p;
- state->bits = 64;
- }
-
- /* avoid shift by 64 if bits is 0 - bits is never 64 here: */
- v |= (state->w >> 1) >> (63 - bits);
- state->w <<= bits;
- state->bits -= bits;
-
- return v + offset;
-}
-
-__always_inline
-static void __set_inc_field(struct pack_state *state, unsigned field, u64 v)
-{
- unsigned bits = state->format->bits_per_field[field];
-
- if (bits) {
- if (bits > state->bits) {
- bits -= state->bits;
- /* avoid shift by 64 if bits is 64 - bits is never 0 here: */
- state->w |= (v >> 1) >> (bits - 1);
-
- *state->p = state->w;
- state->p = next_word(state->p);
- state->w = 0;
- state->bits = 64;
- }
-
- state->bits -= bits;
- state->w |= v << state->bits;
- }
-}
-
-__always_inline
-static bool set_inc_field(struct pack_state *state, unsigned field, u64 v)
-{
- unsigned bits = state->format->bits_per_field[field];
- u64 offset = le64_to_cpu(state->format->field_offset[field]);
-
- if (v < offset)
- return false;
-
- v -= offset;
-
- if (fls64(v) > bits)
- return false;
-
- __set_inc_field(state, field, v);
- return true;
-}
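/*
 * For illustration (not part of the original source): with a hypothetical
 * format where bits_per_field[BKEY_FIELD_INODE] == 8 and
 * field_offset[BKEY_FIELD_INODE] == 4096, an inode number of 4196 packs to
 * the 8-bit value 100; get_inc_field() adds the offset back when unpacking,
 * and set_inc_field() returns false for inode numbers below 4096 or at/above
 * 4096 + 256, so callers must handle packing failure (e.g. by keeping the key
 * in the unpacked format).
 */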
-
-/*
- * Note: does NOT set out->format (we don't know what it should be here!)
- *
- * Also: doesn't work on extents - it doesn't preserve the invariant that
- * if k is packed bkey_start_pos(k) will successfully pack
- */
-static bool bch2_bkey_transform_key(const struct bkey_format *out_f,
- struct bkey_packed *out,
- const struct bkey_format *in_f,
- const struct bkey_packed *in)
-{
- struct pack_state out_s = pack_state_init(out_f, out);
- struct unpack_state in_s = unpack_state_init(in_f, in);
- u64 *w = out->_data;
- unsigned i;
-
- *w = 0;
-
- for (i = 0; i < BKEY_NR_FIELDS; i++)
- if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i)))
- return false;
-
- /* Can't happen because the val would be too big to unpack: */
- EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX);
-
- pack_state_finish(&out_s, out);
- out->u64s = out_f->key_u64s + in->u64s - in_f->key_u64s;
- out->needs_whiteout = in->needs_whiteout;
- out->type = in->type;
-
- return true;
-}
-
-bool bch2_bkey_transform(const struct bkey_format *out_f,
- struct bkey_packed *out,
- const struct bkey_format *in_f,
- const struct bkey_packed *in)
-{
- if (!bch2_bkey_transform_key(out_f, out, in_f, in))
- return false;
-
- memcpy_u64s((u64 *) out + out_f->key_u64s,
- (u64 *) in + in_f->key_u64s,
- (in->u64s - in_f->key_u64s));
- return true;
-}
-
-struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format,
- const struct bkey_packed *in)
-{
- struct unpack_state state = unpack_state_init(format, in);
- struct bkey out;
-
- EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
- EBUG_ON(in->u64s < format->key_u64s);
- EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
- EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX);
-
- out.u64s = BKEY_U64s + in->u64s - format->key_u64s;
- out.format = KEY_FORMAT_CURRENT;
- out.needs_whiteout = in->needs_whiteout;
- out.type = in->type;
- out.pad[0] = 0;
-
-#define x(id, field) out.field = get_inc_field(&state, id);
- bkey_fields()
-#undef x
-
- return out;
-}
-
-#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
-struct bpos __bkey_unpack_pos(const struct bkey_format *format,
- const struct bkey_packed *in)
-{
- struct unpack_state state = unpack_state_init(format, in);
- struct bpos out;
-
- EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
- EBUG_ON(in->u64s < format->key_u64s);
- EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
-
- out.inode = get_inc_field(&state, BKEY_FIELD_INODE);
- out.offset = get_inc_field(&state, BKEY_FIELD_OFFSET);
- out.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT);
-
- return out;
-}
-#endif
-
-/**
- * bch2_bkey_pack_key -- pack just the key, not the value
- * @out: packed result
- * @in: key to pack
- * @format: format of packed result
- *
- * Returns: true on success, false on failure
- */
-bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
- const struct bkey_format *format)
-{
- struct pack_state state = pack_state_init(format, out);
- u64 *w = out->_data;
-
- EBUG_ON((void *) in == (void *) out);
- EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
- EBUG_ON(in->format != KEY_FORMAT_CURRENT);
-
- *w = 0;
-
-#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false;
- bkey_fields()
-#undef x
- pack_state_finish(&state, out);
- out->u64s = format->key_u64s + in->u64s - BKEY_U64s;
- out->format = KEY_FORMAT_LOCAL_BTREE;
- out->needs_whiteout = in->needs_whiteout;
- out->type = in->type;
-
- bch2_bkey_pack_verify(out, in, format);
- return true;
-}
-
-/**
- * bch2_bkey_unpack -- unpack the key and the value
- * @b: btree node of @src key (for packed format)
- * @dst: unpacked result
- * @src: packed input
- */
-void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst,
- const struct bkey_packed *src)
-{
- __bkey_unpack_key(b, &dst->k, src);
-
- memcpy_u64s(&dst->v,
- bkeyp_val(&b->format, src),
- bkeyp_val_u64s(&b->format, src));
-}
-
-/**
- * bch2_bkey_pack -- pack the key and the value
- * @dst: packed result
- * @src: unpacked input
- * @format: format of packed result
- *
- * Returns: true on success, false on failure
- */
-bool bch2_bkey_pack(struct bkey_packed *dst, const struct bkey_i *src,
- const struct bkey_format *format)
-{
- struct bkey_packed tmp;
-
- if (!bch2_bkey_pack_key(&tmp, &src->k, format))
- return false;
-
- memmove_u64s((u64 *) dst + format->key_u64s,
- &src->v,
- bkey_val_u64s(&src->k));
- memcpy_u64s_small(dst, &tmp, format->key_u64s);
-
- return true;
-}
-
-__always_inline
-static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v)
-{
- unsigned bits = state->format->bits_per_field[field];
- u64 offset = le64_to_cpu(state->format->field_offset[field]);
- bool ret = true;
-
- EBUG_ON(v < offset);
- v -= offset;
-
- if (fls64(v) > bits) {
- v = ~(~0ULL << bits);
- ret = false;
- }
-
- __set_inc_field(state, field, v);
- return ret;
-}
-
-static bool bkey_packed_successor(struct bkey_packed *out,
- const struct btree *b,
- struct bkey_packed k)
-{
- const struct bkey_format *f = &b->format;
- unsigned nr_key_bits = b->nr_key_bits;
- unsigned first_bit, offset;
- u64 *p;
-
- EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
-
- if (!nr_key_bits)
- return false;
-
- *out = k;
-
- first_bit = high_bit_offset + nr_key_bits - 1;
- p = nth_word(high_word(f, out), first_bit >> 6);
- offset = 63 - (first_bit & 63);
-
- while (nr_key_bits) {
- unsigned bits = min(64 - offset, nr_key_bits);
- u64 mask = (~0ULL >> (64 - bits)) << offset;
-
- if ((*p & mask) != mask) {
- *p += 1ULL << offset;
- EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0);
- return true;
- }
-
- *p &= ~mask;
- p = prev_word(p);
- nr_key_bits -= bits;
- offset = 0;
- }
-
- return false;
-}
-
-static bool bkey_format_has_too_big_fields(const struct bkey_format *f)
-{
- for (unsigned i = 0; i < f->nr_fields; i++) {
- unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
- u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
- u64 packed_max = f->bits_per_field[i]
- ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
- : 0;
- u64 field_offset = le64_to_cpu(f->field_offset[i]);
-
- if (packed_max + field_offset < packed_max ||
- packed_max + field_offset > unpacked_max)
- return true;
- }
-
- return false;
-}
-
-/*
- * Returns a packed key that compares <= in
- *
- * This is used in bset_search_tree(), where we need a packed pos in order to be
- * able to compare against the keys in the auxiliary search tree - and it's
- * legal to use a packed pos that isn't equivalent to the original pos,
- * _provided_ it compares <= to the original pos.
- */
-enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
- struct bpos in,
- const struct btree *b)
-{
- const struct bkey_format *f = &b->format;
- struct pack_state state = pack_state_init(f, out);
- u64 *w = out->_data;
- struct bpos orig = in;
- bool exact = true;
- unsigned i;
-
- /*
- * bch2_bkey_pack_key() will write to all of f->key_u64s, minus the 3
- * byte header, but pack_pos() won't if the len/version fields are big
- * enough - we need to make sure to zero them out:
- */
- for (i = 0; i < f->key_u64s; i++)
- w[i] = 0;
-
- if (unlikely(in.snapshot <
- le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) {
- if (!in.offset-- &&
- !in.inode--)
- return BKEY_PACK_POS_FAIL;
- in.snapshot = KEY_SNAPSHOT_MAX;
- exact = false;
- }
-
- if (unlikely(in.offset <
- le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) {
- if (!in.inode--)
- return BKEY_PACK_POS_FAIL;
- in.offset = KEY_OFFSET_MAX;
- in.snapshot = KEY_SNAPSHOT_MAX;
- exact = false;
- }
-
- if (unlikely(in.inode <
- le64_to_cpu(f->field_offset[BKEY_FIELD_INODE])))
- return BKEY_PACK_POS_FAIL;
-
- if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode))) {
- in.offset = KEY_OFFSET_MAX;
- in.snapshot = KEY_SNAPSHOT_MAX;
- exact = false;
- }
-
- if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset))) {
- in.snapshot = KEY_SNAPSHOT_MAX;
- exact = false;
- }
-
- if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot)))
- exact = false;
-
- pack_state_finish(&state, out);
- out->u64s = f->key_u64s;
- out->format = KEY_FORMAT_LOCAL_BTREE;
- out->type = KEY_TYPE_deleted;
-
- if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) {
- if (exact) {
- BUG_ON(bkey_cmp_left_packed(b, out, &orig));
- } else {
- struct bkey_packed successor;
-
- BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0);
- BUG_ON(bkey_packed_successor(&successor, b, *out) &&
- bkey_cmp_left_packed(b, &successor, &orig) < 0 &&
- !bkey_format_has_too_big_fields(f));
- }
- }
-
- return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER;
-}
-
-void bch2_bkey_format_init(struct bkey_format_state *s)
-{
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(s->field_min); i++)
- s->field_min[i] = U64_MAX;
-
- for (i = 0; i < ARRAY_SIZE(s->field_max); i++)
- s->field_max[i] = 0;
-
- /* Make sure we can store a size of 0: */
- s->field_min[BKEY_FIELD_SIZE] = 0;
-}
-
-void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p)
-{
- unsigned field = 0;
-
- __bkey_format_add(s, field++, p.inode);
- __bkey_format_add(s, field++, p.offset);
- __bkey_format_add(s, field++, p.snapshot);
-}
-
-/*
- * We don't want it to be possible for the packed format to represent fields
- * bigger than a u64... that will cause confusion and issues (like with
- * bkey_packed_successor())
- */
-static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i,
- unsigned bits, u64 offset)
-{
- unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
- u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
-
- bits = min(bits, unpacked_bits);
-
- offset = bits == unpacked_bits ? 0 : min(offset, unpacked_max - ((1ULL << bits) - 1));
-
- f->bits_per_field[i] = bits;
- f->field_offset[i] = cpu_to_le64(offset);
-}
-
-struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s)
-{
- unsigned i, bits = KEY_PACKED_BITS_START;
- struct bkey_format ret = {
- .nr_fields = BKEY_NR_FIELDS,
- };
-
- for (i = 0; i < ARRAY_SIZE(s->field_min); i++) {
- s->field_min[i] = min(s->field_min[i], s->field_max[i]);
-
- set_format_field(&ret, i,
- fls64(s->field_max[i] - s->field_min[i]),
- s->field_min[i]);
-
- bits += ret.bits_per_field[i];
- }
-
- /* allow for extent merging: */
- if (ret.bits_per_field[BKEY_FIELD_SIZE]) {
- unsigned b = min(4U, 32U - ret.bits_per_field[BKEY_FIELD_SIZE]);
-
- ret.bits_per_field[BKEY_FIELD_SIZE] += b;
- bits += b;
- }
-
- ret.key_u64s = DIV_ROUND_UP(bits, 64);
-
- /* if we have enough spare bits, round fields up to nearest byte */
- bits = ret.key_u64s * 64 - bits;
-
- for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) {
- unsigned r = round_up(ret.bits_per_field[i], 8) -
- ret.bits_per_field[i];
-
- if (r <= bits) {
- set_format_field(&ret, i,
- ret.bits_per_field[i] + r,
- le64_to_cpu(ret.field_offset[i]));
- bits -= r;
- }
- }
-
- if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) {
- struct printbuf buf = PRINTBUF;
-
- BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf));
- printbuf_exit(&buf);
- }
-
- return ret;
-}
-
-int bch2_bkey_format_invalid(struct bch_fs *c,
- struct bkey_format *f,
- enum bch_validate_flags flags,
- struct printbuf *err)
-{
- unsigned bits = KEY_PACKED_BITS_START;
-
- if (f->nr_fields != BKEY_NR_FIELDS) {
- prt_printf(err, "incorrect number of fields: got %u, should be %u",
- f->nr_fields, BKEY_NR_FIELDS);
- return -BCH_ERR_invalid;
- }
-
- /*
- * Verify that the packed format can't represent fields larger than the
- * unpacked format:
- */
- for (unsigned i = 0; i < f->nr_fields; i++) {
- if (bch2_bkey_format_field_overflows(f, i)) {
- unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
- u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
- unsigned packed_bits = min(64, f->bits_per_field[i]);
- u64 packed_max = packed_bits
- ? ~((~0ULL << 1) << (packed_bits - 1))
- : 0;
-
- prt_printf(err, "field %u too large: %llu + %llu > %llu",
- i, packed_max, le64_to_cpu(f->field_offset[i]), unpacked_max);
- return -BCH_ERR_invalid;
- }
-
- bits += f->bits_per_field[i];
- }
-
- if (f->key_u64s != DIV_ROUND_UP(bits, 64)) {
- prt_printf(err, "incorrect key_u64s: got %u, should be %u",
- f->key_u64s, DIV_ROUND_UP(bits, 64));
- return -BCH_ERR_invalid;
- }
-
- return 0;
-}
-
-void bch2_bkey_format_to_text(struct printbuf *out, const struct bkey_format *f)
-{
- prt_printf(out, "u64s %u fields ", f->key_u64s);
-
- for (unsigned i = 0; i < ARRAY_SIZE(f->bits_per_field); i++) {
- if (i)
- prt_str(out, ", ");
- prt_printf(out, "%u:%llu",
- f->bits_per_field[i],
- le64_to_cpu(f->field_offset[i]));
- }
-}
-
-/*
- * Most significant differing bit
- * Bits are indexed from 0 - return is [0, nr_key_bits)
- */
-__pure
-unsigned bch2_bkey_greatest_differing_bit(const struct btree *b,
- const struct bkey_packed *l_k,
- const struct bkey_packed *r_k)
-{
- const u64 *l = high_word(&b->format, l_k);
- const u64 *r = high_word(&b->format, r_k);
- unsigned nr_key_bits = b->nr_key_bits;
- unsigned word_bits = 64 - high_bit_offset;
- u64 l_v, r_v;
-
- EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format));
-
- /* for big endian, skip past header */
- l_v = *l & (~0ULL >> high_bit_offset);
- r_v = *r & (~0ULL >> high_bit_offset);
-
- while (nr_key_bits) {
- if (nr_key_bits < word_bits) {
- l_v >>= word_bits - nr_key_bits;
- r_v >>= word_bits - nr_key_bits;
- nr_key_bits = 0;
- } else {
- nr_key_bits -= word_bits;
- }
-
- if (l_v != r_v)
- return fls64(l_v ^ r_v) - 1 + nr_key_bits;
-
- l = next_word(l);
- r = next_word(r);
-
- l_v = *l;
- r_v = *r;
- word_bits = 64;
- }
-
- return 0;
-}
-
-/*
- * First set bit
- * Bits are indexed from 0 - return is [0, nr_key_bits)
- */
-__pure
-unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k)
-{
- const u64 *p = high_word(&b->format, k);
- unsigned nr_key_bits = b->nr_key_bits;
- unsigned ret = 0, offset;
-
- EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format));
-
- offset = nr_key_bits;
- while (offset > 64) {
- p = next_word(p);
- offset -= 64;
- }
-
- offset = 64 - offset;
-
- while (nr_key_bits) {
- unsigned bits = nr_key_bits + offset < 64
- ? nr_key_bits
- : 64 - offset;
-
- u64 mask = (~0ULL >> (64 - bits)) << offset;
-
- if (*p & mask)
- return ret + __ffs64(*p & mask) - offset;
-
- p = prev_word(p);
- nr_key_bits -= bits;
- ret += bits;
- offset = 0;
- }
-
- return 0;
-}
-
-#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
-
-#define I(_x) (*(out)++ = (_x))
-#define I1(i0) I(i0)
-#define I2(i0, i1) (I1(i0), I(i1))
-#define I3(i0, i1, i2) (I2(i0, i1), I(i2))
-#define I4(i0, i1, i2, i3) (I3(i0, i1, i2), I(i3))
-#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3), I(i4))
-
-static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
- enum bch_bkey_fields field,
- unsigned dst_offset, unsigned dst_size,
- bool *eax_zeroed)
-{
- unsigned bits = format->bits_per_field[field];
- u64 offset = le64_to_cpu(format->field_offset[field]);
- unsigned i, byte, bit_offset, align, shl, shr;
-
- if (!bits && !offset) {
- if (!*eax_zeroed) {
- /* xor eax, eax */
- I2(0x31, 0xc0);
- }
-
- *eax_zeroed = true;
- goto set_field;
- }
-
- if (!bits) {
- /* just return offset: */
-
- switch (dst_size) {
- case 8:
- if (offset > S32_MAX) {
- /* mov [rdi + dst_offset], offset */
- I3(0xc7, 0x47, dst_offset);
- memcpy(out, &offset, 4);
- out += 4;
-
- I3(0xc7, 0x47, dst_offset + 4);
- memcpy(out, (void *) &offset + 4, 4);
- out += 4;
- } else {
- /* mov [rdi + dst_offset], offset */
- /* sign extended */
- I4(0x48, 0xc7, 0x47, dst_offset);
- memcpy(out, &offset, 4);
- out += 4;
- }
- break;
- case 4:
- /* mov [rdi + dst_offset], offset */
- I3(0xc7, 0x47, dst_offset);
- memcpy(out, &offset, 4);
- out += 4;
- break;
- default:
- BUG();
- }
-
- return out;
- }
-
- bit_offset = format->key_u64s * 64;
- for (i = 0; i <= field; i++)
- bit_offset -= format->bits_per_field[i];
-
- byte = bit_offset / 8;
- bit_offset -= byte * 8;
-
- *eax_zeroed = false;
-
- if (bit_offset == 0 && bits == 8) {
- /* movzx eax, BYTE PTR [rsi + imm8] */
- I4(0x0f, 0xb6, 0x46, byte);
- } else if (bit_offset == 0 && bits == 16) {
- /* movzx eax, WORD PTR [rsi + imm8] */
- I4(0x0f, 0xb7, 0x46, byte);
- } else if (bit_offset + bits <= 32) {
- align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3);
- byte -= align;
- bit_offset += align * 8;
-
- BUG_ON(bit_offset + bits > 32);
-
- /* mov eax, [rsi + imm8] */
- I3(0x8b, 0x46, byte);
-
- if (bit_offset) {
- /* shr eax, imm8 */
- I3(0xc1, 0xe8, bit_offset);
- }
-
- if (bit_offset + bits < 32) {
- unsigned mask = ~0U >> (32 - bits);
-
- /* and eax, imm32 */
- I1(0x25);
- memcpy(out, &mask, 4);
- out += 4;
- }
- } else if (bit_offset + bits <= 64) {
- align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7);
- byte -= align;
- bit_offset += align * 8;
-
- BUG_ON(bit_offset + bits > 64);
-
- /* mov rax, [rsi + imm8] */
- I4(0x48, 0x8b, 0x46, byte);
-
- shl = 64 - bit_offset - bits;
- shr = bit_offset + shl;
-
- if (shl) {
- /* shl rax, imm8 */
- I4(0x48, 0xc1, 0xe0, shl);
- }
-
- if (shr) {
- /* shr rax, imm8 */
- I4(0x48, 0xc1, 0xe8, shr);
- }
- } else {
- align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3);
- byte -= align;
- bit_offset += align * 8;
-
- BUG_ON(bit_offset + bits > 96);
-
- /* mov rax, [rsi + byte] */
- I4(0x48, 0x8b, 0x46, byte);
-
- /* mov edx, [rsi + byte + 8] */
- I3(0x8b, 0x56, byte + 8);
-
- /* bits from next word: */
- shr = bit_offset + bits - 64;
- BUG_ON(shr > bit_offset);
-
- /* shr rax, bit_offset */
- I4(0x48, 0xc1, 0xe8, shr);
-
- /* shl rdx, imm8 */
- I4(0x48, 0xc1, 0xe2, 64 - shr);
-
- /* or rax, rdx */
- I3(0x48, 0x09, 0xd0);
-
- shr = bit_offset - shr;
-
- if (shr) {
- /* shr rax, imm8 */
- I4(0x48, 0xc1, 0xe8, shr);
- }
- }
-
- /* rax += offset: */
- if (offset > S32_MAX) {
- /* mov rdx, imm64 */
- I2(0x48, 0xba);
- memcpy(out, &offset, 8);
- out += 8;
- /* add %rdx, %rax */
- I3(0x48, 0x01, 0xd0);
- } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) {
- /* add rax, imm32 */
- I2(0x48, 0x05);
- memcpy(out, &offset, 4);
- out += 4;
- } else if (offset) {
- /* add eax, imm32 */
- I1(0x05);
- memcpy(out, &offset, 4);
- out += 4;
- }
-set_field:
- switch (dst_size) {
- case 8:
- /* mov [rdi + dst_offset], rax */
- I4(0x48, 0x89, 0x47, dst_offset);
- break;
- case 4:
- /* mov [rdi + dst_offset], eax */
- I3(0x89, 0x47, dst_offset);
- break;
- default:
- BUG();
- }
-
- return out;
-}
-
-int bch2_compile_bkey_format(const struct bkey_format *format, void *_out)
-{
- bool eax_zeroed = false;
- u8 *out = _out;
-
- /*
- * rdi: dst - unpacked key
- * rsi: src - packed key
- */
-
- /* k->u64s, k->format, k->type */
-
- /* mov eax, [rsi] */
- I2(0x8b, 0x06);
-
- /* add eax, BKEY_U64s - format->key_u64s */
- I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0);
-
- /* and eax, imm32: mask out k->pad: */
- I5(0x25, 0xff, 0xff, 0xff, 0);
-
- /* mov [rdi], eax */
- I2(0x89, 0x07);
-
-#define x(id, field) \
- out = compile_bkey_field(format, out, id, \
- offsetof(struct bkey, field), \
- sizeof(((struct bkey *) NULL)->field), \
- &eax_zeroed);
- bkey_fields()
-#undef x
-
- /* retq */
- I1(0xc3);
-
- return (void *) out - _out;
-}
-
-#else
-#endif
-
-__pure
-int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l,
- const struct bkey_packed *r,
- const struct btree *b)
-{
- return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b);
-}
-
-__pure __flatten
-int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b,
- const struct bkey_packed *l,
- const struct bpos *r)
-{
- return bpos_cmp(bkey_unpack_pos_format_checked(b, l), *r);
-}
-
-__pure __flatten
-int bch2_bkey_cmp_packed(const struct btree *b,
- const struct bkey_packed *l,
- const struct bkey_packed *r)
-{
- return bch2_bkey_cmp_packed_inlined(b, l, r);
-}
-
-__pure __flatten
-int __bch2_bkey_cmp_left_packed(const struct btree *b,
- const struct bkey_packed *l,
- const struct bpos *r)
-{
- const struct bkey *l_unpacked;
-
- return unlikely(l_unpacked = packed_to_bkey_c(l))
- ? bpos_cmp(l_unpacked->p, *r)
- : __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
-}
-
-void bch2_bpos_swab(struct bpos *p)
-{
- u8 *l = (u8 *) p;
- u8 *h = ((u8 *) &p[1]) - 1;
-
- while (l < h) {
- swap(*l, *h);
- l++;
- --h;
- }
-}
-
-void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k)
-{
- const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current;
- u8 *l = k->key_start;
- u8 *h = (u8 *) ((u64 *) k->_data + f->key_u64s) - 1;
-
- while (l < h) {
- swap(*l, *h);
- l++;
- --h;
- }
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_bkey_pack_test(void)
-{
- struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0);
- struct bkey_packed p;
-
- struct bkey_format test_format = {
- .key_u64s = 3,
- .nr_fields = BKEY_NR_FIELDS,
- .bits_per_field = {
- 13,
- 64,
- 32,
- },
- };
-
- struct unpack_state in_s =
- unpack_state_init(&bch2_bkey_format_current, (void *) &t);
- struct pack_state out_s = pack_state_init(&test_format, &p);
- unsigned i;
-
- for (i = 0; i < out_s.format->nr_fields; i++) {
- u64 a, v = get_inc_field(&in_s, i);
-
- switch (i) {
-#define x(id, field) case id: a = t.field; break;
- bkey_fields()
-#undef x
- default:
- BUG();
- }
-
- if (a != v)
- panic("got %llu actual %llu i %u\n", v, a, i);
-
- if (!set_inc_field(&out_s, i, v))
- panic("failed at %u\n", i);
- }
-
- BUG_ON(!bch2_bkey_pack_key(&p, &t, &test_format));
-}
-#endif
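
The bkey_format code above packs each key field as (value - field_offset) in
bits_per_field bits, deriving both the offset and the width from the smallest
and largest values the format must represent. A minimal, self-contained sketch
of the same idea, using invented names rather than the bcachefs API:

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

struct field_format {
	unsigned	bits;	/* bits used by the packed field */
	uint64_t	offset;	/* subtracted from the value before packing */
};

/* derive a format from the smallest/largest values it must represent */
static struct field_format format_for_range(uint64_t min, uint64_t max)
{
	uint64_t range = max - min;
	unsigned bits = 0;

	while (bits < 64 && (range >> bits))
		bits++;

	return (struct field_format) { .bits = bits, .offset = min };
}

static uint64_t pack_field(struct field_format f, uint64_t v)
{
	assert(v >= f.offset);
	assert(f.bits >= 64 || v - f.offset < (1ULL << f.bits));
	return v - f.offset;
}

static uint64_t unpack_field(struct field_format f, uint64_t packed)
{
	return packed + f.offset;
}

int main(void)
{
	struct field_format f = format_for_range(1000, 1255);

	printf("bits %u offset %llu\n", f.bits, (unsigned long long) f.offset);
	printf("roundtrip %llu\n",
	       (unsigned long long) unpack_field(f, pack_field(f, 1042)));
	return 0;
}
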
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
deleted file mode 100644
index 3ccd521c190a..000000000000
--- a/fs/bcachefs/bkey.h
+++ /dev/null
@@ -1,605 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_H
-#define _BCACHEFS_BKEY_H
-
-#include <linux/bug.h>
-#include "bcachefs_format.h"
-#include "bkey_types.h"
-#include "btree_types.h"
-#include "util.h"
-#include "vstructs.h"
-
-#if 0
-
-/*
- * compiled unpack functions are disabled, pending a new interface for
- * dynamically allocating executable memory:
- */
-
-#ifdef CONFIG_X86_64
-#define HAVE_BCACHEFS_COMPILED_UNPACK 1
-#endif
-#endif
-
-void bch2_bkey_packed_to_binary_text(struct printbuf *,
- const struct bkey_format *,
- const struct bkey_packed *);
-
-enum bkey_lr_packed {
- BKEY_PACKED_BOTH,
- BKEY_PACKED_RIGHT,
- BKEY_PACKED_LEFT,
- BKEY_PACKED_NONE,
-};
-
-#define bkey_lr_packed(_l, _r) \
- ((_l)->format + ((_r)->format << 1))
-
-static inline void bkey_p_copy(struct bkey_packed *dst, const struct bkey_packed *src)
-{
- memcpy_u64s_small(dst, src, src->u64s);
-}
-
-static inline void bkey_copy(struct bkey_i *dst, const struct bkey_i *src)
-{
- memcpy_u64s_small(dst, src, src->k.u64s);
-}
-
-struct btree;
-
-__pure
-unsigned bch2_bkey_greatest_differing_bit(const struct btree *,
- const struct bkey_packed *,
- const struct bkey_packed *);
-__pure
-unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *);
-
-__pure
-int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *,
- const struct bkey_packed *,
- const struct btree *);
-
-__pure
-int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *,
- const struct bkey_packed *,
- const struct bpos *);
-
-__pure
-int bch2_bkey_cmp_packed(const struct btree *,
- const struct bkey_packed *,
- const struct bkey_packed *);
-
-__pure
-int __bch2_bkey_cmp_left_packed(const struct btree *,
- const struct bkey_packed *,
- const struct bpos *);
-
-static inline __pure
-int bkey_cmp_left_packed(const struct btree *b,
- const struct bkey_packed *l, const struct bpos *r)
-{
- return __bch2_bkey_cmp_left_packed(b, l, r);
-}
-
-/*
- * The compiler generates better code when we pass bpos by ref, but it's often
- * much more convenient to pass it by value... as much as I hate C++, a const
- * ref would be nice here:
- */
-__pure __flatten
-static inline int bkey_cmp_left_packed_byval(const struct btree *b,
- const struct bkey_packed *l,
- struct bpos r)
-{
- return bkey_cmp_left_packed(b, l, &r);
-}
-
-static __always_inline bool bpos_eq(struct bpos l, struct bpos r)
-{
- return !((l.inode ^ r.inode) |
- (l.offset ^ r.offset) |
- (l.snapshot ^ r.snapshot));
-}
-
-static __always_inline bool bpos_lt(struct bpos l, struct bpos r)
-{
- return l.inode != r.inode ? l.inode < r.inode :
- l.offset != r.offset ? l.offset < r.offset :
- l.snapshot != r.snapshot ? l.snapshot < r.snapshot : false;
-}
-
-static __always_inline bool bpos_le(struct bpos l, struct bpos r)
-{
- return l.inode != r.inode ? l.inode < r.inode :
- l.offset != r.offset ? l.offset < r.offset :
- l.snapshot != r.snapshot ? l.snapshot < r.snapshot : true;
-}
-
-static __always_inline bool bpos_gt(struct bpos l, struct bpos r)
-{
- return bpos_lt(r, l);
-}
-
-static __always_inline bool bpos_ge(struct bpos l, struct bpos r)
-{
- return bpos_le(r, l);
-}
-
-static __always_inline int bpos_cmp(struct bpos l, struct bpos r)
-{
- return cmp_int(l.inode, r.inode) ?:
- cmp_int(l.offset, r.offset) ?:
- cmp_int(l.snapshot, r.snapshot);
-}
-
-static inline struct bpos bpos_min(struct bpos l, struct bpos r)
-{
- return bpos_lt(l, r) ? l : r;
-}
-
-static inline struct bpos bpos_max(struct bpos l, struct bpos r)
-{
- return bpos_gt(l, r) ? l : r;
-}
-
-static __always_inline bool bkey_eq(struct bpos l, struct bpos r)
-{
- return !((l.inode ^ r.inode) |
- (l.offset ^ r.offset));
-}
-
-static __always_inline bool bkey_lt(struct bpos l, struct bpos r)
-{
- return l.inode != r.inode
- ? l.inode < r.inode
- : l.offset < r.offset;
-}
-
-static __always_inline bool bkey_le(struct bpos l, struct bpos r)
-{
- return l.inode != r.inode
- ? l.inode < r.inode
- : l.offset <= r.offset;
-}
-
-static __always_inline bool bkey_gt(struct bpos l, struct bpos r)
-{
- return bkey_lt(r, l);
-}
-
-static __always_inline bool bkey_ge(struct bpos l, struct bpos r)
-{
- return bkey_le(r, l);
-}
-
-static __always_inline int bkey_cmp(struct bpos l, struct bpos r)
-{
- return cmp_int(l.inode, r.inode) ?:
- cmp_int(l.offset, r.offset);
-}
-
-static inline struct bpos bkey_min(struct bpos l, struct bpos r)
-{
- return bkey_lt(l, r) ? l : r;
-}
-
-static inline struct bpos bkey_max(struct bpos l, struct bpos r)
-{
- return bkey_gt(l, r) ? l : r;
-}
-
-static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r)
-{
- return bpos_eq(l.k->p, r.k->p) &&
- l.k->size == r.k->size &&
- bkey_bytes(l.k) == bkey_bytes(r.k) &&
- !memcmp(l.v, r.v, bkey_val_bytes(l.k));
-}
-
-void bch2_bpos_swab(struct bpos *);
-void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
-
-static __always_inline int bversion_cmp(struct bversion l, struct bversion r)
-{
- return cmp_int(l.hi, r.hi) ?:
- cmp_int(l.lo, r.lo);
-}
-
-#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 })
-#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL })
-
-static __always_inline bool bversion_zero(struct bversion v)
-{
- return bversion_cmp(v, ZERO_VERSION) == 0;
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-/* statement expressions confusing unlikely()? */
-#define bkey_packed(_k) \
- ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \
- (_k)->format != KEY_FORMAT_CURRENT; })
-#else
-#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT)
-#endif
-
-/*
- * It's safe to treat an unpacked bkey as a packed one, but not the reverse
- */
-static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k)
-{
- return (struct bkey_packed *) k;
-}
-
-static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k)
-{
- return (const struct bkey_packed *) k;
-}
-
-static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k)
-{
- return bkey_packed(k) ? NULL : (struct bkey_i *) k;
-}
-
-static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k)
-{
- return bkey_packed(k) ? NULL : (const struct bkey *) k;
-}
-
-static inline unsigned bkey_format_key_bits(const struct bkey_format *format)
-{
- return format->bits_per_field[BKEY_FIELD_INODE] +
- format->bits_per_field[BKEY_FIELD_OFFSET] +
- format->bits_per_field[BKEY_FIELD_SNAPSHOT];
-}
-
-static inline struct bpos bpos_successor(struct bpos p)
-{
- if (!++p.snapshot &&
- !++p.offset &&
- !++p.inode)
- BUG();
-
- return p;
-}
-
-static inline struct bpos bpos_predecessor(struct bpos p)
-{
- if (!p.snapshot-- &&
- !p.offset-- &&
- !p.inode--)
- BUG();
-
- return p;
-}
-
-static inline struct bpos bpos_nosnap_successor(struct bpos p)
-{
- p.snapshot = 0;
-
- if (!++p.offset &&
- !++p.inode)
- BUG();
-
- return p;
-}
-
-static inline struct bpos bpos_nosnap_predecessor(struct bpos p)
-{
- p.snapshot = 0;
-
- if (!p.offset-- &&
- !p.inode--)
- BUG();
-
- return p;
-}
-
-static inline u64 bkey_start_offset(const struct bkey *k)
-{
- return k->p.offset - k->size;
-}
-
-static inline struct bpos bkey_start_pos(const struct bkey *k)
-{
- return (struct bpos) {
- .inode = k->p.inode,
- .offset = bkey_start_offset(k),
- .snapshot = k->p.snapshot,
- };
-}
-
-/* Packed helpers */
-
-static inline unsigned bkeyp_key_u64s(const struct bkey_format *format,
- const struct bkey_packed *k)
-{
- return bkey_packed(k) ? format->key_u64s : BKEY_U64s;
-}
-
-static inline bool bkeyp_u64s_valid(const struct bkey_format *f,
- const struct bkey_packed *k)
-{
- return ((unsigned) k->u64s - bkeyp_key_u64s(f, k) <= U8_MAX - BKEY_U64s);
-}
-
-static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
- const struct bkey_packed *k)
-{
- return bkeyp_key_u64s(format, k) * sizeof(u64);
-}
-
-static inline unsigned bkeyp_val_u64s(const struct bkey_format *format,
- const struct bkey_packed *k)
-{
- return k->u64s - bkeyp_key_u64s(format, k);
-}
-
-static inline size_t bkeyp_val_bytes(const struct bkey_format *format,
- const struct bkey_packed *k)
-{
- return bkeyp_val_u64s(format, k) * sizeof(u64);
-}
-
-static inline void set_bkeyp_val_u64s(const struct bkey_format *format,
- struct bkey_packed *k, unsigned val_u64s)
-{
- k->u64s = bkeyp_key_u64s(format, k) + val_u64s;
-}
-
-#define bkeyp_val(_format, _k) \
- ((struct bch_val *) ((u64 *) (_k)->_data + bkeyp_key_u64s(_format, _k)))
-
-extern const struct bkey_format bch2_bkey_format_current;
-
-bool bch2_bkey_transform(const struct bkey_format *,
- struct bkey_packed *,
- const struct bkey_format *,
- const struct bkey_packed *);
-
-struct bkey __bch2_bkey_unpack_key(const struct bkey_format *,
- const struct bkey_packed *);
-
-#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
-struct bpos __bkey_unpack_pos(const struct bkey_format *,
- const struct bkey_packed *);
-#endif
-
-bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *,
- const struct bkey_format *);
-
-enum bkey_pack_pos_ret {
- BKEY_PACK_POS_EXACT,
- BKEY_PACK_POS_SMALLER,
- BKEY_PACK_POS_FAIL,
-};
-
-enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos,
- const struct btree *);
-
-static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in,
- const struct btree *b)
-{
- return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT;
-}
-
-void bch2_bkey_unpack(const struct btree *, struct bkey_i *,
- const struct bkey_packed *);
-bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *,
- const struct bkey_format *);
-
-typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *);
-
-static inline void
-__bkey_unpack_key_format_checked(const struct btree *b,
- struct bkey *dst,
- const struct bkey_packed *src)
-{
- if (IS_ENABLED(HAVE_BCACHEFS_COMPILED_UNPACK)) {
- compiled_unpack_fn unpack_fn = b->aux_data;
- unpack_fn(dst, src);
-
- if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) {
- struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
-
- BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
- }
- } else {
- *dst = __bch2_bkey_unpack_key(&b->format, src);
- }
-}
-
-static inline struct bkey
-bkey_unpack_key_format_checked(const struct btree *b,
- const struct bkey_packed *src)
-{
- struct bkey dst;
-
- __bkey_unpack_key_format_checked(b, &dst, src);
- return dst;
-}
-
-static inline void __bkey_unpack_key(const struct btree *b,
- struct bkey *dst,
- const struct bkey_packed *src)
-{
- if (likely(bkey_packed(src)))
- __bkey_unpack_key_format_checked(b, dst, src);
- else
- *dst = *packed_to_bkey_c(src);
-}
-
-/**
- * bkey_unpack_key -- unpack just the key, not the value
- */
-static inline struct bkey bkey_unpack_key(const struct btree *b,
- const struct bkey_packed *src)
-{
- return likely(bkey_packed(src))
- ? bkey_unpack_key_format_checked(b, src)
- : *packed_to_bkey_c(src);
-}
-
-static inline struct bpos
-bkey_unpack_pos_format_checked(const struct btree *b,
- const struct bkey_packed *src)
-{
-#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
- return bkey_unpack_key_format_checked(b, src).p;
-#else
- return __bkey_unpack_pos(&b->format, src);
-#endif
-}
-
-static inline struct bpos bkey_unpack_pos(const struct btree *b,
- const struct bkey_packed *src)
-{
- return likely(bkey_packed(src))
- ? bkey_unpack_pos_format_checked(b, src)
- : packed_to_bkey_c(src)->p;
-}
-
-/* Disassembled bkeys */
-
-static inline struct bkey_s_c bkey_disassemble(const struct btree *b,
- const struct bkey_packed *k,
- struct bkey *u)
-{
- __bkey_unpack_key(b, u, k);
-
- return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), };
-}
-
-/* non const version: */
-static inline struct bkey_s __bkey_disassemble(const struct btree *b,
- struct bkey_packed *k,
- struct bkey *u)
-{
- __bkey_unpack_key(b, u, k);
-
- return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), };
-}
-
-static inline u64 bkey_field_max(const struct bkey_format *f,
- enum bch_bkey_fields nr)
-{
- return f->bits_per_field[nr] < 64
- ? (le64_to_cpu(f->field_offset[nr]) +
- ~(~0ULL << f->bits_per_field[nr]))
- : U64_MAX;
-}
-
-#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
-
-int bch2_compile_bkey_format(const struct bkey_format *, void *);
-
-#else
-
-static inline int bch2_compile_bkey_format(const struct bkey_format *format,
- void *out) { return 0; }
-
-#endif
-
-static inline void bkey_reassemble(struct bkey_i *dst,
- struct bkey_s_c src)
-{
- dst->k = *src.k;
- memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k));
-}
-
-/* byte order helpers */
-
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-
-static inline unsigned high_word_offset(const struct bkey_format *f)
-{
- return f->key_u64s - 1;
-}
-
-#define high_bit_offset 0
-#define nth_word(p, n) ((p) - (n))
-
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-
-static inline unsigned high_word_offset(const struct bkey_format *f)
-{
- return 0;
-}
-
-#define high_bit_offset KEY_PACKED_BITS_START
-#define nth_word(p, n) ((p) + (n))
-
-#else
-#error edit for your odd byteorder.
-#endif
-
-#define high_word(f, k) ((u64 *) (k)->_data + high_word_offset(f))
-#define next_word(p) nth_word(p, 1)
-#define prev_word(p) nth_word(p, -1)
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_bkey_pack_test(void);
-#else
-static inline void bch2_bkey_pack_test(void) {}
-#endif
-
-#define bkey_fields() \
- x(BKEY_FIELD_INODE, p.inode) \
- x(BKEY_FIELD_OFFSET, p.offset) \
- x(BKEY_FIELD_SNAPSHOT, p.snapshot) \
- x(BKEY_FIELD_SIZE, size) \
- x(BKEY_FIELD_VERSION_HI, bversion.hi) \
- x(BKEY_FIELD_VERSION_LO, bversion.lo)
-
-struct bkey_format_state {
- u64 field_min[BKEY_NR_FIELDS];
- u64 field_max[BKEY_NR_FIELDS];
-};
-
-void bch2_bkey_format_init(struct bkey_format_state *);
-
-static inline void __bkey_format_add(struct bkey_format_state *s, unsigned field, u64 v)
-{
- s->field_min[field] = min(s->field_min[field], v);
- s->field_max[field] = max(s->field_max[field], v);
-}
-
-/*
- * Changes @format so that @k can be successfully packed with @format
- */
-static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k)
-{
-#define x(id, field) __bkey_format_add(s, id, k->field);
- bkey_fields()
-#undef x
-}
-
-void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
-struct bkey_format bch2_bkey_format_done(struct bkey_format_state *);
-
-static inline bool bch2_bkey_format_field_overflows(struct bkey_format *f, unsigned i)
-{
- unsigned f_bits = f->bits_per_field[i];
- unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
- u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
- u64 field_offset = le64_to_cpu(f->field_offset[i]);
-
- if (f_bits > unpacked_bits)
- return true;
-
- if ((f_bits == unpacked_bits) && field_offset)
- return true;
-
- u64 f_mask = f_bits
- ? ~((~0ULL << (f_bits - 1)) << 1)
- : 0;
-
- if (((field_offset + f_mask) & unpacked_mask) < field_offset)
- return true;
- return false;
-}
-
-int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *,
- enum bch_validate_flags, struct printbuf *);
-void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *);
-
-#endif /* _BCACHEFS_BKEY_H */
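
The bpos helpers above chain per-field three-way comparisons and implement
successor/predecessor with a carry across the snapshot, offset and inode
fields. A small standalone sketch of the same pattern (struct and helper names
are invented for the example):

#include <stdint.h>
#include <stdio.h>

struct pos { uint64_t inode, offset; uint32_t snapshot; };

static int cmp_u64(uint64_t l, uint64_t r) { return (l > r) - (l < r); }

static int pos_cmp(struct pos l, struct pos r)
{
	int c;

	/* compare fields from most to least significant; first difference wins */
	if ((c = cmp_u64(l.inode, r.inode)))
		return c;
	if ((c = cmp_u64(l.offset, r.offset)))
		return c;
	return cmp_u64(l.snapshot, r.snapshot);
}

static struct pos pos_successor(struct pos p)
{
	/* increment the least significant field; carry into the next on wrap */
	if (!++p.snapshot && !++p.offset)
		++p.inode;
	return p;
}

int main(void)
{
	struct pos a = { 1, 5, UINT32_MAX };
	struct pos b = pos_successor(a);

	printf("cmp(a, b) = %d\n", pos_cmp(a, b));	/* -1: successor is larger */
	printf("b = %llu:%llu:%u\n",
	       (unsigned long long) b.inode,
	       (unsigned long long) b.offset,
	       (unsigned) b.snapshot);
	return 0;
}
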
diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h
deleted file mode 100644
index a30c4ae8eb36..000000000000
--- a/fs/bcachefs/bkey_buf.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_BUF_H
-#define _BCACHEFS_BKEY_BUF_H
-
-#include "bcachefs.h"
-#include "bkey.h"
-
-struct bkey_buf {
- struct bkey_i *k;
- u64 onstack[12];
-};
-
-static inline void bch2_bkey_buf_realloc(struct bkey_buf *s,
- struct bch_fs *c, unsigned u64s)
-{
- if (s->k == (void *) s->onstack &&
- u64s > ARRAY_SIZE(s->onstack)) {
- s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS);
- memcpy(s->k, s->onstack, sizeof(s->onstack));
- }
-}
-
-static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s,
- struct bch_fs *c,
- struct bkey_s_c k)
-{
- bch2_bkey_buf_realloc(s, c, k.k->u64s);
- bkey_reassemble(s->k, k);
-}
-
-static inline void bch2_bkey_buf_copy(struct bkey_buf *s,
- struct bch_fs *c,
- struct bkey_i *src)
-{
- bch2_bkey_buf_realloc(s, c, src->k.u64s);
- bkey_copy(s->k, src);
-}
-
-static inline void bch2_bkey_buf_unpack(struct bkey_buf *s,
- struct bch_fs *c,
- struct btree *b,
- struct bkey_packed *src)
-{
- bch2_bkey_buf_realloc(s, c, BKEY_U64s +
- bkeyp_val_u64s(&b->format, src));
- bch2_bkey_unpack(b, s->k, src);
-}
-
-static inline void bch2_bkey_buf_init(struct bkey_buf *s)
-{
- s->k = (void *) s->onstack;
-}
-
-static inline void bch2_bkey_buf_exit(struct bkey_buf *s, struct bch_fs *c)
-{
- if (s->k != (void *) s->onstack)
- mempool_free(s->k, &c->large_bkey_pool);
- s->k = NULL;
-}
-
-#endif /* _BCACHEFS_BKEY_BUF_H */
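
bkey_buf above keeps small keys in an on-stack array and only falls back to an
allocation for oversized keys. A standalone sketch of that pattern, using plain
malloc() where the kernel code draws from a mempool (all names invented):

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

struct small_buf {
	void		*p;
	size_t		len;
	unsigned char	onstack[64];
};

static void small_buf_init(struct small_buf *b)
{
	b->p = b->onstack;
	b->len = sizeof(b->onstack);
}

static int small_buf_realloc(struct small_buf *b, size_t len)
{
	if (len <= b->len)
		return 0;		/* still fits, nothing to do */

	void *n = malloc(len);
	if (!n)
		return -1;
	memcpy(n, b->p, b->len);	/* preserve existing contents */
	if (b->p != b->onstack)
		free(b->p);
	b->p = n;
	b->len = len;
	return 0;
}

static void small_buf_exit(struct small_buf *b)
{
	if (b->p != b->onstack)
		free(b->p);
	b->p = NULL;
}

int main(void)
{
	struct small_buf b;

	small_buf_init(&b);
	small_buf_realloc(&b, 32);	/* stays on the stack */
	small_buf_realloc(&b, 4096);	/* switches to the heap */
	printf("len %zu heap %d\n", b.len, b.p != b.onstack);
	small_buf_exit(&b);
	return 0;
}
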
diff --git a/fs/bcachefs/bkey_cmp.h b/fs/bcachefs/bkey_cmp.h
deleted file mode 100644
index 5f42a6e69360..000000000000
--- a/fs/bcachefs/bkey_cmp.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_CMP_H
-#define _BCACHEFS_BKEY_CMP_H
-
-#include "bkey.h"
-
-#ifdef CONFIG_X86_64
-static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
- unsigned nr_key_bits)
-{
- long d0, d1, d2, d3;
- int cmp;
-
-	/* we shouldn't need asm for this, but gcc generates poor code here: */
-
- asm(".intel_syntax noprefix;"
- "xor eax, eax;"
- "xor edx, edx;"
- "1:;"
- "mov r8, [rdi];"
- "mov r9, [rsi];"
- "sub ecx, 64;"
- "jl 2f;"
-
- "cmp r8, r9;"
- "jnz 3f;"
-
- "lea rdi, [rdi - 8];"
- "lea rsi, [rsi - 8];"
- "jmp 1b;"
-
- "2:;"
- "not ecx;"
- "shr r8, 1;"
- "shr r9, 1;"
- "shr r8, cl;"
- "shr r9, cl;"
- "cmp r8, r9;"
-
- "3:\n"
- "seta al;"
- "setb dl;"
- "sub eax, edx;"
- ".att_syntax prefix;"
- : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp)
- : "0" (l), "1" (r), "3" (nr_key_bits)
- : "r8", "r9", "cc", "memory");
-
- return cmp;
-}
-#else
-static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
- unsigned nr_key_bits)
-{
- u64 l_v, r_v;
-
- if (!nr_key_bits)
- return 0;
-
- /* for big endian, skip past header */
- nr_key_bits += high_bit_offset;
- l_v = *l & (~0ULL >> high_bit_offset);
- r_v = *r & (~0ULL >> high_bit_offset);
-
- while (1) {
- if (nr_key_bits < 64) {
- l_v >>= 64 - nr_key_bits;
- r_v >>= 64 - nr_key_bits;
- nr_key_bits = 0;
- } else {
- nr_key_bits -= 64;
- }
-
- if (!nr_key_bits || l_v != r_v)
- break;
-
- l = next_word(l);
- r = next_word(r);
-
- l_v = *l;
- r_v = *r;
- }
-
- return cmp_int(l_v, r_v);
-}
-#endif
-
-static inline __pure __flatten
-int __bch2_bkey_cmp_packed_format_checked_inlined(const struct bkey_packed *l,
- const struct bkey_packed *r,
- const struct btree *b)
-{
- const struct bkey_format *f = &b->format;
- int ret;
-
- EBUG_ON(!bkey_packed(l) || !bkey_packed(r));
- EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
-
- ret = __bkey_cmp_bits(high_word(f, l),
- high_word(f, r),
- b->nr_key_bits);
-
- EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l),
- bkey_unpack_pos(b, r)));
- return ret;
-}
-
-static inline __pure __flatten
-int bch2_bkey_cmp_packed_inlined(const struct btree *b,
- const struct bkey_packed *l,
- const struct bkey_packed *r)
-{
- struct bkey unpacked;
-
- if (likely(bkey_packed(l) && bkey_packed(r)))
- return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b);
-
- if (bkey_packed(l)) {
- __bkey_unpack_key_format_checked(b, &unpacked, l);
- l = (void *) &unpacked;
- } else if (bkey_packed(r)) {
- __bkey_unpack_key_format_checked(b, &unpacked, r);
- r = (void *) &unpacked;
- }
-
- return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
-}
-
-#endif /* _BCACHEFS_BKEY_CMP_H */
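
The portable __bkey_cmp_bits() fallback above compares two bit strings word by
word starting from the most significant word, truncating the final word to the
remaining number of key bits. A simplified standalone version of the same idea,
with invented names and the word order flattened to a forward array:

#include <stdint.h>
#include <stdio.h>

static int cmp_u64(uint64_t l, uint64_t r) { return (l > r) - (l < r); }

/* l and r point at the most significant word; nr_bits need not be a multiple of 64 */
static int cmp_bits(const uint64_t *l, const uint64_t *r, unsigned nr_bits)
{
	while (nr_bits) {
		unsigned bits = nr_bits < 64 ? nr_bits : 64;
		uint64_t l_v = *l, r_v = *r;

		if (bits < 64) {
			/* only the top @bits bits of the last word belong to the key */
			l_v >>= 64 - bits;
			r_v >>= 64 - bits;
		}

		if (l_v != r_v)
			return cmp_u64(l_v, r_v);

		l++;			/* walk toward the least significant word */
		r++;
		nr_bits -= bits;
	}

	return 0;
}

int main(void)
{
	uint64_t a[2] = { 0x0123456789abcdefULL, 0xff00000000000000ULL };
	uint64_t b[2] = { 0x0123456789abcdefULL, 0xfe00000000000000ULL };

	/* with 66 key bits only the top 2 bits of the second word count */
	printf("%d\n", cmp_bits(a, b, 66));	/* both are 0b11 -> 0 */
	printf("%d\n", cmp_bits(a, b, 72));	/* 0xff > 0xfe -> 1 */
	return 0;
}
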
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
deleted file mode 100644
index fcd8c82cba4f..000000000000
--- a/fs/bcachefs/bkey_methods.c
+++ /dev/null
@@ -1,497 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "backpointers.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-#include "btree_types.h"
-#include "alloc_background.h"
-#include "dirent.h"
-#include "disk_accounting.h"
-#include "ec.h"
-#include "error.h"
-#include "extents.h"
-#include "inode.h"
-#include "io_misc.h"
-#include "lru.h"
-#include "quota.h"
-#include "reflink.h"
-#include "snapshot.h"
-#include "subvolume.h"
-#include "xattr.h"
-
-const char * const bch2_bkey_types[] = {
-#define x(name, nr, ...) #name,
- BCH_BKEY_TYPES()
-#undef x
- NULL
-};
-
-static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- return 0;
-}
-
-#define bch2_bkey_ops_deleted ((struct bkey_ops) { \
- .key_validate = deleted_key_validate, \
-})
-
-#define bch2_bkey_ops_whiteout ((struct bkey_ops) { \
- .key_validate = deleted_key_validate, \
-})
-
-static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(bkey_val_bytes(k.k),
- c, bkey_val_size_nonzero,
- "incorrect value size (%zu != 0)",
- bkey_val_bytes(k.k));
-fsck_err:
- return ret;
-}
-
-#define bch2_bkey_ops_error ((struct bkey_ops) { \
- .key_validate = empty_val_key_validate, \
-})
-
-static int key_type_cookie_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- return 0;
-}
-
-static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_cookie ck = bkey_s_c_to_cookie(k);
-
- prt_printf(out, "%llu", le64_to_cpu(ck.v->cookie));
-}
-
-#define bch2_bkey_ops_cookie ((struct bkey_ops) { \
- .key_validate = key_type_cookie_validate, \
- .val_to_text = key_type_cookie_to_text, \
- .min_val_size = 8, \
-})
-
-#define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\
- .key_validate = empty_val_key_validate, \
-})
-
-static int key_type_inline_data_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- return 0;
-}
-
-static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k);
- unsigned datalen = bkey_inline_data_bytes(k.k);
-
- prt_printf(out, "datalen %u: %*phN",
- datalen, min(datalen, 32U), d.v->data);
-}
-
-#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \
- .key_validate = key_type_inline_data_validate, \
- .val_to_text = key_type_inline_data_to_text, \
-})
-
-static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
-{
- bch2_key_resize(l.k, l.k->size + r.k->size);
- return true;
-}
-
-#define bch2_bkey_ops_set ((struct bkey_ops) { \
- .key_validate = empty_val_key_validate, \
- .key_merge = key_type_set_merge, \
-})
-
-const struct bkey_ops bch2_bkey_ops[] = {
-#define x(name, nr, ...) [KEY_TYPE_##name] = bch2_bkey_ops_##name,
- BCH_BKEY_TYPES()
-#undef x
-};
-
-const struct bkey_ops bch2_bkey_null_ops = {
-};
-
-int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- if (test_bit(BCH_FS_no_invalid_checks, &c->flags))
- return 0;
-
- const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
- int ret = 0;
-
- bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size,
- c, bkey_val_size_too_small,
- "bad val size (%zu < %u)",
- bkey_val_bytes(k.k), ops->min_val_size);
-
- if (!ops->key_validate)
- return 0;
-
- ret = ops->key_validate(c, k, from);
-fsck_err:
- return ret;
-}
-
-static u64 bch2_key_types_allowed[] = {
- [BKEY_TYPE_btree] =
- BIT_ULL(KEY_TYPE_deleted)|
- BIT_ULL(KEY_TYPE_btree_ptr)|
- BIT_ULL(KEY_TYPE_btree_ptr_v2),
-#define x(name, nr, flags, keys) [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys,
- BCH_BTREE_IDS()
-#undef x
-};
-
-static const enum bch_bkey_type_flags bch2_bkey_type_flags[] = {
-#define x(name, nr, flags) [KEY_TYPE_##name] = flags,
- BCH_BKEY_TYPES()
-#undef x
-};
-
-const char *bch2_btree_node_type_str(enum btree_node_type type)
-{
- return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1);
-}
-
-int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- enum btree_node_type type = __btree_node_type(from.level, from.btree);
-
- if (test_bit(BCH_FS_no_invalid_checks, &c->flags))
- return 0;
-
- int ret = 0;
-
- bkey_fsck_err_on(k.k->u64s < BKEY_U64s,
- c, bkey_u64s_too_small,
- "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s);
-
- if (type >= BKEY_TYPE_NR)
- return 0;
-
- enum bch_bkey_type_flags bkey_flags = k.k->type < KEY_TYPE_MAX
- ? bch2_bkey_type_flags[k.k->type]
- : 0;
-
- bool strict_key_type_allowed =
- (from.flags & BCH_VALIDATE_commit) ||
- type == BKEY_TYPE_btree ||
- (from.btree < BTREE_ID_NR &&
- (bkey_flags & BKEY_TYPE_strict_btree_checks));
-
- bkey_fsck_err_on(strict_key_type_allowed &&
- k.k->type < KEY_TYPE_MAX &&
- !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)),
- c, bkey_invalid_type_for_btree,
- "invalid key type for btree %s (%s)",
- bch2_btree_node_type_str(type),
- k.k->type < KEY_TYPE_MAX
- ? bch2_bkey_types[k.k->type]
- : "(unknown)");
-
- if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
- bkey_fsck_err_on(k.k->size == 0,
- c, bkey_extent_size_zero,
- "size == 0");
-
- bkey_fsck_err_on(k.k->size > k.k->p.offset,
- c, bkey_extent_size_greater_than_offset,
- "size greater than offset (%u > %llu)",
- k.k->size, k.k->p.offset);
- } else {
- bkey_fsck_err_on(k.k->size,
- c, bkey_size_nonzero,
- "size != 0");
- }
-
- if (type != BKEY_TYPE_btree) {
- enum btree_id btree = type - 1;
-
- if (btree_type_has_snapshots(btree)) {
- bkey_fsck_err_on(!k.k->p.snapshot,
- c, bkey_snapshot_zero,
- "snapshot == 0");
- } else if (!btree_type_has_snapshot_field(btree)) {
- bkey_fsck_err_on(k.k->p.snapshot,
- c, bkey_snapshot_nonzero,
- "nonzero snapshot");
- } else {
- /*
- * btree uses snapshot field but it's not required to be
- * nonzero
- */
- }
-
- bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX),
- c, bkey_at_pos_max,
- "key at POS_MAX");
- }
-fsck_err:
- return ret;
-}
-
-int bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- return __bch2_bkey_validate(c, k, from) ?:
- bch2_bkey_val_validate(c, k, from);
-}
-
-int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b,
- struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key),
- c, bkey_before_start_of_btree_node,
- "key before start of btree node");
-
- bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key),
- c, bkey_after_end_of_btree_node,
- "key past end of btree node");
-fsck_err:
- return ret;
-}
-
-void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
-{
- if (bpos_eq(pos, POS_MIN))
- prt_printf(out, "POS_MIN");
- else if (bpos_eq(pos, POS_MAX))
- prt_printf(out, "POS_MAX");
- else if (bpos_eq(pos, SPOS_MAX))
- prt_printf(out, "SPOS_MAX");
- else {
- if (pos.inode == U64_MAX)
- prt_printf(out, "U64_MAX");
- else
- prt_printf(out, "%llu", pos.inode);
- prt_printf(out, ":");
- if (pos.offset == U64_MAX)
- prt_printf(out, "U64_MAX");
- else
- prt_printf(out, "%llu", pos.offset);
- prt_printf(out, ":");
- if (pos.snapshot == U32_MAX)
- prt_printf(out, "U32_MAX");
- else
- prt_printf(out, "%u", pos.snapshot);
- }
-}
-
-void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
-{
- if (k) {
- prt_printf(out, "u64s %u type ", k->u64s);
-
- if (k->type < KEY_TYPE_MAX)
- prt_printf(out, "%s ", bch2_bkey_types[k->type]);
- else
- prt_printf(out, "%u ", k->type);
-
- bch2_bpos_to_text(out, k->p);
-
- prt_printf(out, " len %u ver %llu", k->size, k->bversion.lo);
- } else {
- prt_printf(out, "(null)");
- }
-}
-
-void bch2_val_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
-
- if (likely(ops->val_to_text))
- ops->val_to_text(out, c, k);
-}
-
-void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- bch2_bkey_to_text(out, k.k);
-
- if (bkey_val_bytes(k.k)) {
- prt_printf(out, ": ");
- bch2_val_to_text(out, c, k);
- }
-}
-
-void bch2_bkey_swab_val(struct bkey_s k)
-{
- const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
-
- if (ops->swab)
- ops->swab(k);
-}
-
-bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k)
-{
- const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
-
- return ops->key_normalize
- ? ops->key_normalize(c, k)
- : false;
-}
-
-bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
-{
- const struct bkey_ops *ops = bch2_bkey_type_ops(l.k->type);
-
- return ops->key_merge &&
- bch2_bkey_maybe_mergable(l.k, r.k) &&
- (u64) l.k->size + r.k->size <= KEY_SIZE_MAX &&
- !static_branch_unlikely(&bch2_key_merging_disabled) &&
- ops->key_merge(c, l, r);
-}
-
-static const struct old_bkey_type {
- u8 btree_node_type;
- u8 old;
- u8 new;
-} bkey_renumber_table[] = {
- {BKEY_TYPE_btree, 128, KEY_TYPE_btree_ptr },
- {BKEY_TYPE_extents, 128, KEY_TYPE_extent },
- {BKEY_TYPE_extents, 129, KEY_TYPE_extent },
- {BKEY_TYPE_extents, 130, KEY_TYPE_reservation },
- {BKEY_TYPE_inodes, 128, KEY_TYPE_inode },
- {BKEY_TYPE_inodes, 130, KEY_TYPE_inode_generation },
- {BKEY_TYPE_dirents, 128, KEY_TYPE_dirent },
- {BKEY_TYPE_dirents, 129, KEY_TYPE_hash_whiteout },
- {BKEY_TYPE_xattrs, 128, KEY_TYPE_xattr },
- {BKEY_TYPE_xattrs, 129, KEY_TYPE_hash_whiteout },
- {BKEY_TYPE_alloc, 128, KEY_TYPE_alloc },
- {BKEY_TYPE_quotas, 128, KEY_TYPE_quota },
-};
-
-void bch2_bkey_renumber(enum btree_node_type btree_node_type,
- struct bkey_packed *k,
- int write)
-{
- const struct old_bkey_type *i;
-
- for (i = bkey_renumber_table;
- i < bkey_renumber_table + ARRAY_SIZE(bkey_renumber_table);
- i++)
- if (btree_node_type == i->btree_node_type &&
- k->type == (write ? i->new : i->old)) {
- k->type = write ? i->old : i->new;
- break;
- }
-}
-
-void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
- unsigned version, unsigned big_endian,
- int write,
- struct bkey_format *f,
- struct bkey_packed *k)
-{
- const struct bkey_ops *ops;
- struct bkey uk;
- unsigned nr_compat = 5;
- int i;
-
- /*
- * Do these operations in reverse order in the write path:
- */
-
- for (i = 0; i < nr_compat; i++)
- switch (!write ? i : nr_compat - 1 - i) {
- case 0:
- if (big_endian != CPU_BIG_ENDIAN) {
- bch2_bkey_swab_key(f, k);
- } else if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
- bch2_bkey_swab_key(f, k);
- bch2_bkey_swab_key(f, k);
- }
- break;
- case 1:
- if (version < bcachefs_metadata_version_bkey_renumber)
- bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write);
- break;
- case 2:
- if (version < bcachefs_metadata_version_inode_btree_change &&
- btree_id == BTREE_ID_inodes) {
- if (!bkey_packed(k)) {
- struct bkey_i *u = packed_to_bkey(k);
-
- swap(u->k.p.inode, u->k.p.offset);
- } else if (f->bits_per_field[BKEY_FIELD_INODE] &&
- f->bits_per_field[BKEY_FIELD_OFFSET]) {
- struct bkey_format tmp = *f, *in = f, *out = &tmp;
-
- swap(tmp.bits_per_field[BKEY_FIELD_INODE],
- tmp.bits_per_field[BKEY_FIELD_OFFSET]);
- swap(tmp.field_offset[BKEY_FIELD_INODE],
- tmp.field_offset[BKEY_FIELD_OFFSET]);
-
- if (!write)
- swap(in, out);
-
- uk = __bch2_bkey_unpack_key(in, k);
- swap(uk.p.inode, uk.p.offset);
- BUG_ON(!bch2_bkey_pack_key(k, &uk, out));
- }
- }
- break;
- case 3:
- if (version < bcachefs_metadata_version_snapshot &&
- (level || btree_type_has_snapshots(btree_id))) {
- struct bkey_i *u = packed_to_bkey(k);
-
- if (u) {
- u->k.p.snapshot = write
- ? 0 : U32_MAX;
- } else {
- u64 min_packed = le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]);
- u64 max_packed = min_packed +
- ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
-
- uk = __bch2_bkey_unpack_key(f, k);
- uk.p.snapshot = write
- ? min_packed : min_t(u64, U32_MAX, max_packed);
-
- BUG_ON(!bch2_bkey_pack_key(k, &uk, f));
- }
- }
-
- break;
- case 4: {
- struct bkey_s u;
-
- if (!bkey_packed(k)) {
- u = bkey_i_to_s(packed_to_bkey(k));
- } else {
- uk = __bch2_bkey_unpack_key(f, k);
- u.k = &uk;
- u.v = bkeyp_val(f, k);
- }
-
- if (big_endian != CPU_BIG_ENDIAN)
- bch2_bkey_swab_val(u);
-
- ops = bch2_bkey_type_ops(k->type);
-
- if (ops->compat)
- ops->compat(btree_id, version, big_endian, write, u);
- break;
- }
- default:
- BUG();
- }
-}
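
bch2_bkey_ops above is a table of per-type operations indexed by key type, with
optional hooks and a null-ops fallback for unknown types so callers never have
to range-check before dispatching. A minimal standalone sketch of that pattern
(types and names invented for the example):

#include <stdio.h>

enum val_type { TYPE_COOKIE, TYPE_SET, TYPE_MAX };

struct val_ops {
	const char	*name;
	void		(*to_text)(unsigned v);
};

static void cookie_to_text(unsigned v) { printf("cookie %u\n", v); }

static const struct val_ops ops_table[TYPE_MAX] = {
	[TYPE_COOKIE]	= { .name = "cookie", .to_text = cookie_to_text },
	[TYPE_SET]	= { .name = "set" },	/* to_text is an optional hook */
};

static const struct val_ops null_ops;		/* all hooks NULL */

static const struct val_ops *type_ops(unsigned type)
{
	return type < TYPE_MAX ? &ops_table[type] : &null_ops;
}

static void val_to_text(unsigned type, unsigned v)
{
	const struct val_ops *ops = type_ops(type);

	if (ops->to_text)			/* hooks are optional */
		ops->to_text(v);
	else
		printf("(no printer for type %u)\n", type);
}

int main(void)
{
	val_to_text(TYPE_COOKIE, 42);
	val_to_text(TYPE_SET, 0);
	val_to_text(17, 0);			/* unknown type: null_ops fallback */
	return 0;
}
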
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
deleted file mode 100644
index bf34111cdf00..000000000000
--- a/fs/bcachefs/bkey_methods.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_METHODS_H
-#define _BCACHEFS_BKEY_METHODS_H
-
-#include "bkey.h"
-
-struct bch_fs;
-struct btree;
-struct btree_trans;
-struct bkey;
-enum btree_node_type;
-
-extern const char * const bch2_bkey_types[];
-extern const struct bkey_ops bch2_bkey_null_ops;
-
-/*
- * key_validate: checks validity of @k, returns 0 if good or -EINVAL if bad. If
- * invalid, the entire key will be deleted.
- *
- * The @from context describes where the key is being validated (btree ID,
- * level, read vs. commit path); more aggressive checks are enabled on commit.
- */
-struct bkey_ops {
- int (*key_validate)(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from);
- void (*val_to_text)(struct printbuf *, struct bch_fs *,
- struct bkey_s_c);
- void (*swab)(struct bkey_s);
- bool (*key_normalize)(struct bch_fs *, struct bkey_s);
- bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c);
- int (*trigger)(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
- void (*compat)(enum btree_id id, unsigned version,
- unsigned big_endian, int write,
- struct bkey_s);
-
- /* Size of value type when first created: */
- unsigned min_val_size;
-};
-
-extern const struct bkey_ops bch2_bkey_ops[];
-
-static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type)
-{
- return likely(type < KEY_TYPE_MAX)
- ? &bch2_bkey_ops[type]
- : &bch2_bkey_null_ops;
-}
-
-int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, struct bkey_s_c,
- struct bkey_validate_context from);
-
-void bch2_bpos_to_text(struct printbuf *, struct bpos);
-void bch2_bkey_to_text(struct printbuf *, const struct bkey *);
-void bch2_val_to_text(struct printbuf *, struct bch_fs *,
- struct bkey_s_c);
-void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *,
- struct bkey_s_c);
-
-void bch2_bkey_swab_val(struct bkey_s);
-
-bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s);
-
-static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r)
-{
- return l->type == r->type &&
- !bversion_cmp(l->bversion, r->bversion) &&
- bpos_eq(l->p, bkey_start_pos(r));
-}
-
-bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-
-static inline int bch2_key_trigger(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type);
-
- return ops->trigger
- ? ops->trigger(trans, btree, level, old, new, flags)
- : 0;
-}
-
-static inline int bch2_key_trigger_old(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bkey_i deleted;
-
- bkey_init(&deleted.k);
- deleted.k.p = old.k->p;
-
- return bch2_key_trigger(trans, btree_id, level, old, bkey_i_to_s(&deleted),
- BTREE_TRIGGER_overwrite|flags);
-}
-
-static inline int bch2_key_trigger_new(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bkey_i deleted;
-
- bkey_init(&deleted.k);
- deleted.k.p = new.k->p;
-
- return bch2_key_trigger(trans, btree_id, level, bkey_i_to_s_c(&deleted), new,
- BTREE_TRIGGER_insert|flags);
-}
-
-void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int);
-
-void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned,
- int, struct bkey_format *, struct bkey_packed *);
-
-static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id,
- unsigned version, unsigned big_endian,
- int write,
- struct bkey_format *f,
- struct bkey_packed *k)
-{
- if (version < bcachefs_metadata_version_current ||
- big_endian != CPU_BIG_ENDIAN ||
- IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
- __bch2_bkey_compat(level, btree_id, version,
- big_endian, write, f, k);
-
-}
-
-#endif /* _BCACHEFS_BKEY_METHODS_H */
diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c
deleted file mode 100644
index 4536eb50fc40..000000000000
--- a/fs/bcachefs/bkey_sort.c
+++ /dev/null
@@ -1,214 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "bkey_cmp.h"
-#include "bkey_sort.h"
-#include "bset.h"
-#include "extents.h"
-
-typedef int (*sort_cmp_fn)(const struct btree *,
- const struct bkey_packed *,
- const struct bkey_packed *);
-
-static inline bool sort_iter_end(struct sort_iter *iter)
-{
- return !iter->used;
-}
-
-static inline void sort_iter_sift(struct sort_iter *iter, unsigned from,
- sort_cmp_fn cmp)
-{
- unsigned i;
-
- for (i = from;
- i + 1 < iter->used &&
- cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0;
- i++)
- swap(iter->data[i], iter->data[i + 1]);
-}
-
-static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp)
-{
- unsigned i = iter->used;
-
- while (i--)
- sort_iter_sift(iter, i, cmp);
-}
-
-static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
-{
- return !sort_iter_end(iter) ? iter->data->k : NULL;
-}
-
-static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
-{
- struct sort_iter_set *i = iter->data;
-
- BUG_ON(!iter->used);
-
- i->k = bkey_p_next(i->k);
-
- BUG_ON(i->k > i->end);
-
- if (i->k == i->end)
- array_remove_item(iter->data, iter->used, 0);
- else
- sort_iter_sift(iter, 0, cmp);
-}
-
-static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
- sort_cmp_fn cmp)
-{
- struct bkey_packed *ret = sort_iter_peek(iter);
-
- if (ret)
- sort_iter_advance(iter, cmp);
-
- return ret;
-}
-
-/*
- * If keys compare equal, compare by pointer order:
- */
-static inline int key_sort_fix_overlapping_cmp(const struct btree *b,
- const struct bkey_packed *l,
- const struct bkey_packed *r)
-{
- return bch2_bkey_cmp_packed(b, l, r) ?:
- cmp_int((unsigned long) l, (unsigned long) r);
-}
-
-static inline bool should_drop_next_key(struct sort_iter *iter)
-{
- /*
-	 * key_sort_fix_overlapping_cmp() ensures that when keys compare equal
-	 * the older key comes first; so if l->k compares equal to r->k then
-	 * l->k is older and should be dropped.
- */
- return iter->used >= 2 &&
- !bch2_bkey_cmp_packed(iter->b,
- iter->data[0].k,
- iter->data[1].k);
-}
-
-struct btree_nr_keys
-bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
- struct sort_iter *iter)
-{
- struct bkey_packed *out = dst->start;
- struct bkey_packed *k;
- struct btree_nr_keys nr;
-
- memset(&nr, 0, sizeof(nr));
-
- sort_iter_sort(iter, key_sort_fix_overlapping_cmp);
-
- while ((k = sort_iter_peek(iter))) {
- if (!bkey_deleted(k) &&
- !should_drop_next_key(iter)) {
- bkey_p_copy(out, k);
- btree_keys_account_key_add(&nr, 0, out);
- out = bkey_p_next(out);
- }
-
- sort_iter_advance(iter, key_sort_fix_overlapping_cmp);
- }
-
- dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
- return nr;
-}
-
-/* Sort + repack in a new format: */
-struct btree_nr_keys
-bch2_sort_repack(struct bset *dst, struct btree *src,
- struct btree_node_iter *src_iter,
- struct bkey_format *out_f,
- bool filter_whiteouts)
-{
- struct bkey_format *in_f = &src->format;
- struct bkey_packed *in, *out = vstruct_last(dst);
- struct btree_nr_keys nr;
- bool transform = memcmp(out_f, &src->format, sizeof(*out_f));
-
- memset(&nr, 0, sizeof(nr));
-
- while ((in = bch2_btree_node_iter_next_all(src_iter, src))) {
- if (filter_whiteouts && bkey_deleted(in))
- continue;
-
- if (!transform)
- bkey_p_copy(out, in);
- else if (bch2_bkey_transform(out_f, out, bkey_packed(in)
- ? in_f : &bch2_bkey_format_current, in))
- out->format = KEY_FORMAT_LOCAL_BTREE;
- else
- bch2_bkey_unpack(src, (void *) out, in);
-
- out->needs_whiteout = false;
-
- btree_keys_account_key_add(&nr, 0, out);
- out = bkey_p_next(out);
- }
-
- dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
- return nr;
-}
-
-static inline int keep_unwritten_whiteouts_cmp(const struct btree *b,
- const struct bkey_packed *l,
- const struct bkey_packed *r)
-{
- return bch2_bkey_cmp_packed_inlined(b, l, r) ?:
- (int) bkey_deleted(r) - (int) bkey_deleted(l) ?:
- (long) l - (long) r;
-}
-
-#include "btree_update_interior.h"
-
-/*
- * For sorting in the btree node write path: whiteouts not in the unwritten
- * whiteouts area are dropped, whiteouts in the unwritten whiteouts area are
- * dropped if overwritten by real keys:
- */
-unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *dst, struct sort_iter *iter)
-{
- struct bkey_packed *in, *next, *out = dst;
-
- sort_iter_sort(iter, keep_unwritten_whiteouts_cmp);
-
- while ((in = sort_iter_next(iter, keep_unwritten_whiteouts_cmp))) {
- if (bkey_deleted(in) && in < unwritten_whiteouts_start(iter->b))
- continue;
-
- if ((next = sort_iter_peek(iter)) &&
- !bch2_bkey_cmp_packed_inlined(iter->b, in, next))
- continue;
-
- bkey_p_copy(out, in);
- out = bkey_p_next(out);
- }
-
- return (u64 *) out - (u64 *) dst;
-}
-
-/*
- * Main sort routine for compacting a btree node in memory: we always drop
- * whiteouts because any whiteouts that need to be written are in the unwritten
- * whiteouts area:
- */
-unsigned bch2_sort_keys(struct bkey_packed *dst, struct sort_iter *iter)
-{
- struct bkey_packed *in, *out = dst;
-
- sort_iter_sort(iter, bch2_bkey_cmp_packed_inlined);
-
- while ((in = sort_iter_next(iter, bch2_bkey_cmp_packed_inlined))) {
- if (bkey_deleted(in))
- continue;
-
- bkey_p_copy(out, in);
- out = bkey_p_next(out);
- }
-
- return (u64 *) out - (u64 *) dst;
-}
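
sort_iter above merges several already-sorted key streams by keeping the
sources ordered by their current front key and re-sifting the front source
after each advance. The same idea with plain integers, in a standalone sketch
(all names invented for the example):

#include <stdio.h>

struct src { const int *p, *end; };

static void sift(struct src *s, unsigned used, unsigned from)
{
	/* bubble source @from down until the heads are in ascending order again */
	for (unsigned i = from; i + 1 < used && *s[i].p > *s[i + 1].p; i++) {
		struct src tmp = s[i];
		s[i] = s[i + 1];
		s[i + 1] = tmp;
	}
}

static unsigned merge(struct src *s, unsigned used, int *out)
{
	unsigned n = 0;

	/* initial sort of the sources by their first element */
	for (unsigned i = used; i--;)
		sift(s, used, i);

	while (used) {
		out[n++] = *s[0].p++;

		if (s[0].p == s[0].end) {
			/* source exhausted: drop it from the front */
			for (unsigned i = 0; i + 1 < used; i++)
				s[i] = s[i + 1];
			used--;
		} else {
			sift(s, used, 0);
		}
	}

	return n;
}

int main(void)
{
	const int a[] = { 1, 4, 9 }, b[] = { 2, 3, 10 }, c[] = { 5 };
	struct src s[] = { { a, a + 3 }, { b, b + 3 }, { c, c + 1 } };
	int out[7];
	unsigned n = merge(s, 3, out);

	for (unsigned i = 0; i < n; i++)
		printf("%d ", out[i]);		/* 1 2 3 4 5 9 10 */
	printf("\n");
	return 0;
}
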
diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h
deleted file mode 100644
index 9be969d46890..000000000000
--- a/fs/bcachefs/bkey_sort.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_SORT_H
-#define _BCACHEFS_BKEY_SORT_H
-
-struct sort_iter {
- struct btree *b;
- unsigned used;
- unsigned size;
-
- struct sort_iter_set {
- struct bkey_packed *k, *end;
- } data[];
-};
-
-static inline void sort_iter_init(struct sort_iter *iter, struct btree *b, unsigned size)
-{
- iter->b = b;
- iter->used = 0;
- iter->size = size;
-}
-
-struct sort_iter_stack {
- struct sort_iter iter;
- struct sort_iter_set sets[MAX_BSETS + 1];
-};
-
-static inline void sort_iter_stack_init(struct sort_iter_stack *iter, struct btree *b)
-{
- sort_iter_init(&iter->iter, b, ARRAY_SIZE(iter->sets));
-}
-
-static inline void sort_iter_add(struct sort_iter *iter,
- struct bkey_packed *k,
- struct bkey_packed *end)
-{
- BUG_ON(iter->used >= iter->size);
-
- if (k != end)
- iter->data[iter->used++] = (struct sort_iter_set) { k, end };
-}
-
-struct btree_nr_keys
-bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *,
- struct sort_iter *);
-
-struct btree_nr_keys
-bch2_sort_repack(struct bset *, struct btree *,
- struct btree_node_iter *,
- struct bkey_format *, bool);
-
-unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *, struct sort_iter *);
-unsigned bch2_sort_keys(struct bkey_packed *, struct sort_iter *);
-
-#endif /* _BCACHEFS_BKEY_SORT_H */
diff --git a/fs/bcachefs/bkey_types.h b/fs/bcachefs/bkey_types.h
deleted file mode 100644
index b4f328f9853c..000000000000
--- a/fs/bcachefs/bkey_types.h
+++ /dev/null
@@ -1,241 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_TYPES_H
-#define _BCACHEFS_BKEY_TYPES_H
-
-#include "bcachefs_format.h"
-
-/*
- * bkey_i - bkey with inline value
- * bkey_s - bkey with split value
- * bkey_s_c - bkey with split value, const
- */
-
-#define bkey_p_next(_k) vstruct_next(_k)
-
-static inline struct bkey_i *bkey_next(struct bkey_i *k)
-{
- return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s);
-}
-
-#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s)
-
-static inline size_t bkey_val_bytes(const struct bkey *k)
-{
- return bkey_val_u64s(k) * sizeof(u64);
-}
-
-static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
-{
- unsigned u64s = BKEY_U64s + val_u64s;
-
- BUG_ON(u64s > U8_MAX);
- k->u64s = u64s;
-}
-
-static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
-{
- set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64)));
-}
-
-#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
-
-#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted)
-
-#define bkey_whiteout(_k) \
- ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
-
-/* bkey with split value, const */
-struct bkey_s_c {
- const struct bkey *k;
- const struct bch_val *v;
-};
-
-/* bkey with split value */
-struct bkey_s {
- union {
- struct {
- struct bkey *k;
- struct bch_val *v;
- };
- struct bkey_s_c s_c;
- };
-};
-
-#define bkey_s_null ((struct bkey_s) { .k = NULL })
-#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL })
-
-#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) })
-#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) })
-
-static inline struct bkey_s bkey_to_s(struct bkey *k)
-{
- return (struct bkey_s) { .k = k, .v = NULL };
-}
-
-static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
-{
- return (struct bkey_s_c) { .k = k, .v = NULL };
-}
-
-static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
-{
- return (struct bkey_s) { .k = &k->k, .v = &k->v };
-}
-
-static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
-{
- return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
-}
-
-/*
- * For a given type of value (e.g. struct bch_extent), generates the types for
- * bkey + bch_extent - inline, split, split const - and also all the conversion
- * functions, which also check that the value is of the correct type.
- *
- * We use anonymous unions for upcasting - e.g. converting from e.g. a
- * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
- * functions.
- */
-#define x(name, ...) \
-struct bkey_i_##name { \
- union { \
- struct bkey k; \
- struct bkey_i k_i; \
- }; \
- struct bch_##name v; \
-}; \
- \
-struct bkey_s_c_##name { \
- union { \
- struct { \
- const struct bkey *k; \
- const struct bch_##name *v; \
- }; \
- struct bkey_s_c s_c; \
- }; \
-}; \
- \
-struct bkey_s_##name { \
- union { \
- struct { \
- struct bkey *k; \
- struct bch_##name *v; \
- }; \
- struct bkey_s_c_##name c; \
- struct bkey_s s; \
- struct bkey_s_c s_c; \
- }; \
-}; \
- \
-static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
- return container_of(&k->k, struct bkey_i_##name, k); \
-} \
- \
-static inline const struct bkey_i_##name * \
-bkey_i_to_##name##_c(const struct bkey_i *k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
- return container_of(&k->k, struct bkey_i_##name, k); \
-} \
- \
-static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
- return (struct bkey_s_##name) { \
- .k = k.k, \
- .v = container_of(k.v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
- return (struct bkey_s_c_##name) { \
- .k = k.k, \
- .v = container_of(k.v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
-{ \
- return (struct bkey_s_##name) { \
- .k = &k->k, \
- .v = &k->v, \
- }; \
-} \
- \
-static inline struct bkey_s_c_##name \
-name##_i_to_s_c(const struct bkey_i_##name *k) \
-{ \
- return (struct bkey_s_c_##name) { \
- .k = &k->k, \
- .v = &k->v, \
- }; \
-} \
- \
-static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
- return (struct bkey_s_##name) { \
- .k = &k->k, \
- .v = container_of(&k->v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_s_c_##name \
-bkey_i_to_s_c_##name(const struct bkey_i *k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
- return (struct bkey_s_c_##name) { \
- .k = &k->k, \
- .v = container_of(&k->v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
-{ \
- struct bkey_i_##name *k = \
- container_of(&_k->k, struct bkey_i_##name, k); \
- \
- bkey_init(&k->k); \
- memset(&k->v, 0, sizeof(k->v)); \
- k->k.type = KEY_TYPE_##name; \
- set_bkey_val_bytes(&k->k, sizeof(k->v)); \
- \
- return k; \
-}
-
-BCH_BKEY_TYPES();
-#undef x
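The block above is the x-macro pattern: a single list macro expanded repeatedly with different definitions of x() to generate per-type structs and helpers. A minimal standalone sketch of the same mechanism, with a made-up DEMO_COLORS() list instead of BCH_BKEY_TYPES():

#include <stdio.h>

#define DEMO_COLORS()	\
	x(red)		\
	x(green)	\
	x(blue)

/* First expansion: an enum with one entry per list item */
enum demo_color {
#define x(n) COLOR_##n,
	DEMO_COLORS()
#undef x
	COLOR_NR,
};

/* Second expansion: a name table indexed by the enum */
static const char * const demo_color_names[] = {
#define x(n) [COLOR_##n] = #n,
	DEMO_COLORS()
#undef x
};

int main(void)
{
	for (unsigned i = 0; i < COLOR_NR; i++)
		printf("%u: %s\n", i, demo_color_names[i]);
	return 0;
}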
-
-enum bch_validate_flags {
- BCH_VALIDATE_write = BIT(0),
- BCH_VALIDATE_commit = BIT(1),
- BCH_VALIDATE_silent = BIT(2),
-};
-
-#define BKEY_VALIDATE_CONTEXTS() \
- x(unknown) \
- x(superblock) \
- x(journal) \
- x(btree_root) \
- x(btree_node) \
- x(commit)
-
-struct bkey_validate_context {
- enum {
-#define x(n) BKEY_VALIDATE_##n,
- BKEY_VALIDATE_CONTEXTS()
-#undef x
- } from:8;
- enum bch_validate_flags flags:8;
- u8 level;
- enum btree_id btree;
- bool root:1;
- unsigned journal_offset;
- u64 journal_seq;
-};
-
-#endif /* _BCACHEFS_BKEY_TYPES_H */
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
deleted file mode 100644
index 32841f762eb2..000000000000
--- a/fs/bcachefs/bset.c
+++ /dev/null
@@ -1,1576 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Code for working with individual keys, and sorted sets of keys within a
- * btree node
- *
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "bset.h"
-#include "eytzinger.h"
-#include "trace.h"
-#include "util.h"
-
-#include <linux/unaligned.h>
-#include <linux/console.h>
-#include <linux/random.h>
-#include <linux/prefetch.h>
-
-static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *,
- struct btree *);
-
-static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
-{
- unsigned n = ARRAY_SIZE(iter->data);
-
- while (n && __btree_node_iter_set_end(iter, n - 1))
- --n;
-
- return n;
-}
-
-struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
-{
- return bch2_bkey_to_bset_inlined(b, k);
-}
-
-/*
- * There are never duplicate live keys in the btree - but including keys that
- * have been flagged as deleted (and will be cleaned up later) we _will_ see
- * duplicates.
- *
- * Thus the sort order is: usual key comparison first, but for keys that compare
- * equal the deleted key(s) come first, and the (at most one) live version comes
- * last.
- *
- * The main reason for this is insertion: to handle overwrites, we first iterate
- * over keys that compare equal to our insert key, and then insert immediately
- * prior to the first key greater than the key we're inserting - our insert
- * position will be after all keys that compare equal to our insert key, which
- * by the time we actually do the insert will all be deleted.
- */
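A standalone sketch of that ordering, assuming a hypothetical demo_key and plain qsort() rather than the btree sort paths - position first, and among equal positions the deleted entries sort before the live one:

#include <stdbool.h>
#include <stdlib.h>

struct demo_key {
	unsigned long long	pos;
	bool			deleted;
};

static int demo_key_cmp(const void *_l, const void *_r)
{
	const struct demo_key *l = _l, *r = _r;

	if (l->pos != r->pos)
		return l->pos < r->pos ? -1 : 1;

	/* equal positions: deleted sorts first, live last */
	return (int) r->deleted - (int) l->deleted;
}

int main(void)
{
	struct demo_key keys[] = {
		{ 5, false }, { 5, true }, { 2, false }, { 5, true },
	};

	qsort(keys, sizeof(keys) / sizeof(keys[0]), sizeof(keys[0]), demo_key_cmp);
	/* order is now: 2 live, 5 deleted, 5 deleted, 5 live */
	return 0;
}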
-
-void bch2_dump_bset(struct bch_fs *c, struct btree *b,
- struct bset *i, unsigned set)
-{
- struct bkey_packed *_k, *_n;
- struct bkey uk, n;
- struct bkey_s_c k;
- struct printbuf buf = PRINTBUF;
-
- if (!i->u64s)
- return;
-
- for (_k = i->start;
- _k < vstruct_last(i);
- _k = _n) {
- _n = bkey_p_next(_k);
-
- if (!_k->u64s) {
- printk(KERN_ERR "block %u key %5zu - u64s 0? aieee!\n", set,
- _k->_data - i->_data);
- break;
- }
-
- k = bkey_disassemble(b, _k, &uk);
-
- printbuf_reset(&buf);
- if (c)
- bch2_bkey_val_to_text(&buf, c, k);
- else
- bch2_bkey_to_text(&buf, k.k);
- printk(KERN_ERR "block %u key %5zu: %s\n", set,
- _k->_data - i->_data, buf.buf);
-
- if (_n == vstruct_last(i))
- continue;
-
- n = bkey_unpack_key(b, _n);
-
- if (bpos_lt(n.p, k.k->p)) {
- printk(KERN_ERR "Key skipped backwards\n");
- continue;
- }
-
- if (!bkey_deleted(k.k) && bpos_eq(n.p, k.k->p))
- printk(KERN_ERR "Duplicate keys\n");
- }
-
- printbuf_exit(&buf);
-}
-
-void bch2_dump_btree_node(struct bch_fs *c, struct btree *b)
-{
- console_lock();
- for_each_bset(b, t)
- bch2_dump_bset(c, b, bset(b, t), t - b->set);
- console_unlock();
-}
-
-void bch2_dump_btree_node_iter(struct btree *b,
- struct btree_node_iter *iter)
-{
- struct btree_node_iter_set *set;
- struct printbuf buf = PRINTBUF;
-
- printk(KERN_ERR "btree node iter with %u/%u sets:\n",
- __btree_node_iter_used(iter), b->nsets);
-
- btree_node_iter_for_each(iter, set) {
- struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
- struct bset_tree *t = bch2_bkey_to_bset(b, k);
- struct bkey uk = bkey_unpack_key(b, k);
-
- printbuf_reset(&buf);
- bch2_bkey_to_text(&buf, &uk);
- printk(KERN_ERR "set %zu key %u: %s\n",
- t - b->set, set->k, buf.buf);
- }
-
- printbuf_exit(&buf);
-}
-
-struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b)
-{
- struct bkey_packed *k;
- struct btree_nr_keys nr = {};
-
- for_each_bset(b, t)
- bset_tree_for_each_key(b, t, k)
- if (!bkey_deleted(k))
- btree_keys_account_key_add(&nr, t - b->set, k);
- return nr;
-}
-
-void __bch2_verify_btree_nr_keys(struct btree *b)
-{
- struct btree_nr_keys nr = bch2_btree_node_count_keys(b);
-
- BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
-}
-
-static void __bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
- struct btree *b)
-{
- struct btree_node_iter iter = *_iter;
- const struct bkey_packed *k, *n;
-
- k = bch2_btree_node_iter_peek_all(&iter, b);
- __bch2_btree_node_iter_advance(&iter, b);
- n = bch2_btree_node_iter_peek_all(&iter, b);
-
- bkey_unpack_key(b, k);
-
- if (n &&
- bkey_iter_cmp(b, k, n) > 0) {
- struct btree_node_iter_set *set;
- struct bkey ku = bkey_unpack_key(b, k);
- struct bkey nu = bkey_unpack_key(b, n);
- struct printbuf buf1 = PRINTBUF;
- struct printbuf buf2 = PRINTBUF;
-
- bch2_dump_btree_node(NULL, b);
- bch2_bkey_to_text(&buf1, &ku);
- bch2_bkey_to_text(&buf2, &nu);
- printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n",
- buf1.buf, buf2.buf);
- printk(KERN_ERR "iter was:");
-
- btree_node_iter_for_each(_iter, set) {
- struct bkey_packed *k2 = __btree_node_offset_to_key(b, set->k);
- struct bset_tree *t = bch2_bkey_to_bset(b, k2);
- printk(" [%zi %zi]", t - b->set,
- k2->_data - bset(b, t)->_data);
- }
- panic("\n");
- }
-}
-
-void __bch2_btree_node_iter_verify(struct btree_node_iter *iter,
- struct btree *b)
-{
- struct btree_node_iter_set *set, *s2;
- struct bkey_packed *k, *p;
-
- if (bch2_btree_node_iter_end(iter))
- return;
-
- /* Verify no duplicates: */
- btree_node_iter_for_each(iter, set) {
- BUG_ON(set->k > set->end);
- btree_node_iter_for_each(iter, s2)
- BUG_ON(set != s2 && set->end == s2->end);
- }
-
- /* Verify that set->end is correct: */
- btree_node_iter_for_each(iter, set) {
- for_each_bset(b, t)
- if (set->end == t->end_offset) {
- BUG_ON(set->k < btree_bkey_first_offset(t) ||
- set->k >= t->end_offset);
- goto found;
- }
- BUG();
-found:
- do {} while (0);
- }
-
- /* Verify iterator is sorted: */
- btree_node_iter_for_each(iter, set)
- BUG_ON(set != iter->data &&
- btree_node_iter_cmp(b, set[-1], set[0]) > 0);
-
- k = bch2_btree_node_iter_peek_all(iter, b);
-
- for_each_bset(b, t) {
- if (iter->data[0].end == t->end_offset)
- continue;
-
- p = bch2_bkey_prev_all(b, t,
- bch2_btree_node_iter_bset_pos(iter, b, t));
-
- BUG_ON(p && bkey_iter_cmp(b, k, p) < 0);
- }
-}
-
-static void __bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
- struct bkey_packed *insert, unsigned clobber_u64s)
-{
- struct bset_tree *t = bch2_bkey_to_bset(b, where);
- struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
- struct bkey_packed *next = (void *) ((u64 *) where->_data + clobber_u64s);
- struct printbuf buf1 = PRINTBUF;
- struct printbuf buf2 = PRINTBUF;
-#if 0
- BUG_ON(prev &&
- bkey_iter_cmp(b, prev, insert) > 0);
-#else
- if (prev &&
- bkey_iter_cmp(b, prev, insert) > 0) {
- struct bkey k1 = bkey_unpack_key(b, prev);
- struct bkey k2 = bkey_unpack_key(b, insert);
-
- bch2_dump_btree_node(NULL, b);
- bch2_bkey_to_text(&buf1, &k1);
- bch2_bkey_to_text(&buf2, &k2);
-
- panic("prev > insert:\n"
- "prev key %s\n"
- "insert key %s\n",
- buf1.buf, buf2.buf);
- }
-#endif
-#if 0
- BUG_ON(next != btree_bkey_last(b, t) &&
- bkey_iter_cmp(b, insert, next) > 0);
-#else
- if (next != btree_bkey_last(b, t) &&
- bkey_iter_cmp(b, insert, next) > 0) {
- struct bkey k1 = bkey_unpack_key(b, insert);
- struct bkey k2 = bkey_unpack_key(b, next);
-
- bch2_dump_btree_node(NULL, b);
- bch2_bkey_to_text(&buf1, &k1);
- bch2_bkey_to_text(&buf2, &k2);
-
- panic("insert > next:\n"
- "insert key %s\n"
- "next key %s\n",
- buf1.buf, buf2.buf);
- }
-#endif
-}
-
-static inline void bch2_verify_insert_pos(struct btree *b,
- struct bkey_packed *where,
- struct bkey_packed *insert,
- unsigned clobber_u64s)
-{
- if (static_branch_unlikely(&bch2_debug_check_bset_lookups))
- __bch2_verify_insert_pos(b, where, insert, clobber_u64s);
-}
-
-
-/* Auxiliary search trees */
-
-#define BFLOAT_FAILED_UNPACKED U8_MAX
-#define BFLOAT_FAILED U8_MAX
-
-struct bkey_float {
- u8 exponent;
- u8 key_offset;
- u16 mantissa;
-};
-#define BKEY_MANTISSA_BITS 16
-
-struct ro_aux_tree {
- u8 nothing[0];
- struct bkey_float f[];
-};
-
-struct rw_aux_tree {
- u16 offset;
- struct bpos k;
-};
-
-static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
-{
- BUG_ON(t->aux_data_offset == U16_MAX);
-
- switch (bset_aux_tree_type(t)) {
- case BSET_NO_AUX_TREE:
- return t->aux_data_offset;
- case BSET_RO_AUX_TREE:
- return t->aux_data_offset +
- DIV_ROUND_UP(t->size * sizeof(struct bkey_float), 8);
- case BSET_RW_AUX_TREE:
- return t->aux_data_offset +
- DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
- default:
- BUG();
- }
-}
-
-static unsigned bset_aux_tree_buf_start(const struct btree *b,
- const struct bset_tree *t)
-{
- return t == b->set
- ? DIV_ROUND_UP(b->unpack_fn_len, 8)
- : bset_aux_tree_buf_end(t - 1);
-}
-
-static void *__aux_tree_base(const struct btree *b,
- const struct bset_tree *t)
-{
- return b->aux_data + t->aux_data_offset * 8;
-}
-
-static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
- const struct bset_tree *t)
-{
- EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
-
- return __aux_tree_base(b, t);
-}
-
-static struct bkey_float *bkey_float(const struct btree *b,
- const struct bset_tree *t,
- unsigned idx)
-{
- return ro_aux_tree_base(b, t)->f + idx;
-}
-
-static void __bset_aux_tree_verify(struct btree *b)
-{
- for_each_bset(b, t) {
- if (t->aux_data_offset == U16_MAX)
- continue;
-
- BUG_ON(t != b->set &&
- t[-1].aux_data_offset == U16_MAX);
-
- BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t));
- BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b));
- BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b));
- }
-}
-
-static inline void bset_aux_tree_verify(struct btree *b)
-{
- if (static_branch_unlikely(&bch2_debug_check_bset_lookups))
- __bset_aux_tree_verify(b);
-}
-
-void bch2_btree_keys_init(struct btree *b)
-{
- unsigned i;
-
- b->nsets = 0;
- memset(&b->nr, 0, sizeof(b->nr));
-
- for (i = 0; i < MAX_BSETS; i++)
- b->set[i].data_offset = U16_MAX;
-
- bch2_bset_set_no_aux_tree(b, b->set);
-}
-
-/* Binary tree stuff for auxiliary search trees */
-
-/*
- * Cacheline/offset <-> bkey pointer arithmetic:
- *
- * t->tree is a binary search tree in an array; each node corresponds to a key
- * in one cacheline in t->set (BSET_CACHELINE bytes).
- *
- * This means we don't have to store the full index of the key that a node in
- * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and
- * then bkey_float->key_offset gives us the offset within that cacheline, in units of 8
- * bytes.
- *
- * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
- * make this work.
- *
- * To construct the bfloat for an arbitrary key we need to know what the key
- * immediately preceding it is: we have to check if the two keys differ in the
- * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size
- * of the previous key so we can walk backwards to it from t->tree[j]'s key.
- */
-
-static inline void *bset_cacheline(const struct btree *b,
- const struct bset_tree *t,
- unsigned cacheline)
-{
- return (void *) round_down((unsigned long) btree_bkey_first(b, t),
- L1_CACHE_BYTES) +
- cacheline * BSET_CACHELINE;
-}
-
-static struct bkey_packed *cacheline_to_bkey(const struct btree *b,
- const struct bset_tree *t,
- unsigned cacheline,
- unsigned offset)
-{
- return bset_cacheline(b, t, cacheline) + offset * 8;
-}
-
-static unsigned bkey_to_cacheline(const struct btree *b,
- const struct bset_tree *t,
- const struct bkey_packed *k)
-{
- return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE;
-}
-
-static ssize_t __bkey_to_cacheline_offset(const struct btree *b,
- const struct bset_tree *t,
- unsigned cacheline,
- const struct bkey_packed *k)
-{
- return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline);
-}
-
-static unsigned bkey_to_cacheline_offset(const struct btree *b,
- const struct bset_tree *t,
- unsigned cacheline,
- const struct bkey_packed *k)
-{
- size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k);
-
- EBUG_ON(m > U8_MAX);
- return m;
-}
-
-static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
- const struct bset_tree *t,
- unsigned j)
-{
- return cacheline_to_bkey(b, t,
- __eytzinger1_to_inorder(j, t->size - 1, t->extra),
- bkey_float(b, t, j)->key_offset);
-}
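A rough sketch of the pointer arithmetic these helpers implement: a key's address is rebuilt from (cacheline index, u64 offset within that cacheline), so each aux tree node only has to store a small offset. DEMO_CACHELINE and the flat buffer are hypothetical stand-ins for BSET_CACHELINE and the bset:

#include <assert.h>
#include <stdint.h>

#define DEMO_CACHELINE	256

static uint64_t *demo_cacheline_to_key(uint64_t *base, unsigned cacheline,
					unsigned offset)
{
	return (uint64_t *) ((char *) base + cacheline * DEMO_CACHELINE) + offset;
}

static unsigned demo_key_to_cacheline(uint64_t *base, const uint64_t *k)
{
	return ((char *) k - (char *) base) / DEMO_CACHELINE;
}

static unsigned demo_key_to_offset(uint64_t *base, unsigned cacheline,
				   const uint64_t *k)
{
	return k - (uint64_t *) ((char *) base + cacheline * DEMO_CACHELINE);
}

int main(void)
{
	static uint64_t buf[1024];
	uint64_t *k = demo_cacheline_to_key(buf, 3, 7);

	/* the (cacheline, offset) pair round-trips back to the same key */
	assert(demo_key_to_cacheline(buf, k) == 3);
	assert(demo_key_to_offset(buf, 3, k) == 7);
	return 0;
}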
-
-static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
- const struct bset_tree *t)
-{
- EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
-
- return __aux_tree_base(b, t);
-}
-
-/*
- * For the write set - the one we're currently inserting keys into - we don't
- * maintain a full search tree, we just keep a simple lookup table in t->prev.
- */
-static struct bkey_packed *rw_aux_to_bkey(const struct btree *b,
- struct bset_tree *t,
- unsigned j)
-{
- return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset);
-}
-
-static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t,
- unsigned j, struct bkey_packed *k)
-{
- EBUG_ON(k >= btree_bkey_last(b, t));
-
- rw_aux_tree(b, t)[j] = (struct rw_aux_tree) {
- .offset = __btree_node_key_to_offset(b, k),
- .k = bkey_unpack_pos(b, k),
- };
-}
-
-static void __bch2_bset_verify_rw_aux_tree(struct btree *b, struct bset_tree *t)
-{
- struct bkey_packed *k = btree_bkey_first(b, t);
- unsigned j = 0;
-
- BUG_ON(bset_has_ro_aux_tree(t));
-
- if (!bset_has_rw_aux_tree(t))
- return;
-
- BUG_ON(t->size < 1);
- BUG_ON(rw_aux_to_bkey(b, t, j) != k);
-
- goto start;
- while (1) {
- if (rw_aux_to_bkey(b, t, j) == k) {
- BUG_ON(!bpos_eq(rw_aux_tree(b, t)[j].k,
- bkey_unpack_pos(b, k)));
-start:
- if (++j == t->size)
- break;
-
- BUG_ON(rw_aux_tree(b, t)[j].offset <=
- rw_aux_tree(b, t)[j - 1].offset);
- }
-
- k = bkey_p_next(k);
- BUG_ON(k >= btree_bkey_last(b, t));
- }
-}
-
-static inline void bch2_bset_verify_rw_aux_tree(struct btree *b,
- struct bset_tree *t)
-{
- if (static_branch_unlikely(&bch2_debug_check_bset_lookups))
- __bch2_bset_verify_rw_aux_tree(b, t);
-}
-
-/* returns idx of first entry >= offset: */
-static unsigned rw_aux_tree_bsearch(struct btree *b,
- struct bset_tree *t,
- unsigned offset)
-{
- unsigned bset_offs = offset - btree_bkey_first_offset(t);
- unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t);
- unsigned idx = bset_u64s ? bset_offs * t->size / bset_u64s : 0;
-
- EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
- EBUG_ON(!t->size);
- EBUG_ON(idx > t->size);
-
- while (idx < t->size &&
- rw_aux_tree(b, t)[idx].offset < offset)
- idx++;
-
- while (idx &&
- rw_aux_tree(b, t)[idx - 1].offset >= offset)
- idx--;
-
- EBUG_ON(idx < t->size &&
- rw_aux_tree(b, t)[idx].offset < offset);
- EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset);
- EBUG_ON(idx + 1 < t->size &&
- rw_aux_tree(b, t)[idx].offset ==
- rw_aux_tree(b, t)[idx + 1].offset);
-
- return idx;
-}
-
-static inline unsigned bkey_mantissa(const struct bkey_packed *k,
- const struct bkey_float *f)
-{
- u64 v;
-
- EBUG_ON(!bkey_packed(k));
-
- v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3)));
-
- /*
- * In little endian, we're shifting off low bits (and then the bits we
- * want are at the low end), in big endian we're shifting off high bits
- * (and then the bits we want are at the high end, so we shift them
- * back down):
- */
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- v >>= f->exponent & 7;
-#else
- v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS;
-#endif
- return (u16) v;
-}
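A simplified sketch of that extraction, assuming a little-endian host and using memcpy() in place of get_unaligned(): load 8 bytes at the byte containing the wanted bit offset, shift off the low bits, keep 16 bits:

#include <assert.h>
#include <stdint.h>
#include <string.h>

static uint16_t demo_mantissa(const uint8_t *key, unsigned bit_offset)
{
	uint64_t v;

	memcpy(&v, key + (bit_offset >> 3), sizeof(v));
	v >>= bit_offset & 7;		/* little endian: shift off low bits */

	return (uint16_t) v;
}

int main(void)
{
	/* 32 bytes of a hypothetical packed key */
	uint8_t key[32] = { 0 };

	key[2] = 0xab;			/* bits 16..23 */
	key[3] = 0xcd;			/* bits 24..31 */

	assert(demo_mantissa(key, 16) == 0xcdab);
	assert(demo_mantissa(key, 20) == 0x0cda);
	return 0;
}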
-
-static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t,
- unsigned j,
- struct bkey_packed *min_key,
- struct bkey_packed *max_key)
-{
- struct bkey_float *f = bkey_float(b, t, j);
- struct bkey_packed *m = tree_to_bkey(b, t, j);
- struct bkey_packed *l = is_power_of_2(j)
- ? min_key
- : tree_to_bkey(b, t, j >> ffs(j));
- struct bkey_packed *r = is_power_of_2(j + 1)
- ? max_key
- : tree_to_bkey(b, t, j >> (ffz(j) + 1));
- unsigned mantissa;
- int shift, exponent, high_bit;
-
- /*
- * for failed bfloats, the lookup code falls back to comparing against
- * the original key.
- */
-
- if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) ||
- !b->nr_key_bits) {
- f->exponent = BFLOAT_FAILED_UNPACKED;
- return;
- }
-
- /*
- * The greatest differing bit of l and r is the first bit we must
- * include in the bfloat mantissa we're creating in order to do
- * comparisons - that bit always becomes the high bit of
- * bfloat->mantissa, and thus the exponent we're calculating here is
- * the position of what will become the low bit in bfloat->mantissa:
- *
- * Note that this may be negative - we may be running off the low end
- * of the key: we handle this later:
- */
- high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r),
- min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1);
- exponent = high_bit - (BKEY_MANTISSA_BITS - 1);
-
- /*
- * Then we calculate the actual shift value, from the start of the key
- * (k->_data), to get the key bits starting at exponent:
- */
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
-
- EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64);
-#else
- shift = high_bit_offset +
- b->nr_key_bits -
- exponent -
- BKEY_MANTISSA_BITS;
-
- EBUG_ON(shift < KEY_PACKED_BITS_START);
-#endif
- EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
-
- f->exponent = shift;
- mantissa = bkey_mantissa(m, f);
-
- /*
- * If we've got garbage bits, set them to all 1s - it's legal for the
- * bfloat to compare larger than the original key, but not smaller:
- */
- if (exponent < 0)
- mantissa |= ~(~0U << -exponent);
-
- f->mantissa = mantissa;
-}
-
-/* bytes remaining - only valid for last bset: */
-static unsigned __bset_tree_capacity(struct btree *b, const struct bset_tree *t)
-{
- bset_aux_tree_verify(b);
-
- return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
-}
-
-static unsigned bset_ro_tree_capacity(struct btree *b, const struct bset_tree *t)
-{
- return __bset_tree_capacity(b, t) / sizeof(struct bkey_float);
-}
-
-static unsigned bset_rw_tree_capacity(struct btree *b, const struct bset_tree *t)
-{
- return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
-}
-
-static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
-{
- struct bkey_packed *k;
-
- t->size = 1;
- t->extra = BSET_RW_AUX_TREE_VAL;
- rw_aux_tree(b, t)[0].offset =
- __btree_node_key_to_offset(b, btree_bkey_first(b, t));
-
- bset_tree_for_each_key(b, t, k) {
- if (t->size == bset_rw_tree_capacity(b, t))
- break;
-
- if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) >
- L1_CACHE_BYTES)
- rw_aux_tree_set(b, t, t->size++, k);
- }
-}
-
-static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
-{
- struct bkey_packed *k = btree_bkey_first(b, t);
- struct bkey_i min_key, max_key;
- unsigned cacheline = 1;
-
- t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
- bset_ro_tree_capacity(b, t));
-retry:
- if (t->size < 2) {
- t->size = 0;
- t->extra = BSET_NO_AUX_TREE_VAL;
- return;
- }
-
- t->extra = eytzinger1_extra(t->size - 1);
-
- /* First we figure out where the first key in each cacheline is */
- eytzinger1_for_each(j, t->size - 1) {
- while (bkey_to_cacheline(b, t, k) < cacheline)
- k = bkey_p_next(k);
-
- if (k >= btree_bkey_last(b, t)) {
- /* XXX: this path sucks */
- t->size--;
- goto retry;
- }
-
- bkey_float(b, t, j)->key_offset =
- bkey_to_cacheline_offset(b, t, cacheline++, k);
-
- EBUG_ON(tree_to_bkey(b, t, j) != k);
- }
-
- if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) {
- bkey_init(&min_key.k);
- min_key.k.p = b->data->min_key;
- }
-
- if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) {
- bkey_init(&max_key.k);
- max_key.k.p = b->data->max_key;
- }
-
- /* Then we build the tree */
- eytzinger1_for_each(j, t->size - 1)
- make_bfloat(b, t, j,
- bkey_to_packed(&min_key),
- bkey_to_packed(&max_key));
-}
-
-static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
-{
- struct bset_tree *i;
-
- for (i = b->set; i != t; i++)
- BUG_ON(bset_has_rw_aux_tree(i));
-
- bch2_bset_set_no_aux_tree(b, t);
-
- /* round up to next cacheline: */
- t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t),
- SMP_CACHE_BYTES / sizeof(u64));
-
- bset_aux_tree_verify(b);
-}
-
-void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t,
- bool writeable)
-{
- if (writeable
- ? bset_has_rw_aux_tree(t)
- : bset_has_ro_aux_tree(t))
- return;
-
- bset_alloc_tree(b, t);
-
- if (!__bset_tree_capacity(b, t))
- return;
-
- if (writeable)
- __build_rw_aux_tree(b, t);
- else
- __build_ro_aux_tree(b, t);
-
- bset_aux_tree_verify(b);
-}
-
-void bch2_bset_init_first(struct btree *b, struct bset *i)
-{
- struct bset_tree *t;
-
- BUG_ON(b->nsets);
-
- memset(i, 0, sizeof(*i));
- get_random_bytes(&i->seq, sizeof(i->seq));
- SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
-
- t = &b->set[b->nsets++];
- set_btree_bset(b, t, i);
-}
-
-void bch2_bset_init_next(struct btree *b, struct btree_node_entry *bne)
-{
- struct bset *i = &bne->keys;
- struct bset_tree *t;
-
- BUG_ON(bset_byte_offset(b, bne) >= btree_buf_bytes(b));
- BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
- BUG_ON(b->nsets >= MAX_BSETS);
-
- memset(i, 0, sizeof(*i));
- i->seq = btree_bset_first(b)->seq;
- SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
-
- t = &b->set[b->nsets++];
- set_btree_bset(b, t, i);
-}
-
-/*
- * find _some_ key in the same bset as @k that precedes @k - not necessarily the
- * immediate predecessor:
- */
-static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t,
- struct bkey_packed *k)
-{
- struct bkey_packed *p;
- unsigned offset;
- int j;
-
- EBUG_ON(k < btree_bkey_first(b, t) ||
- k > btree_bkey_last(b, t));
-
- if (k == btree_bkey_first(b, t))
- return NULL;
-
- switch (bset_aux_tree_type(t)) {
- case BSET_NO_AUX_TREE:
- p = btree_bkey_first(b, t);
- break;
- case BSET_RO_AUX_TREE:
- j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k));
-
- do {
- p = j ? tree_to_bkey(b, t,
- __inorder_to_eytzinger1(j--,
- t->size - 1, t->extra))
- : btree_bkey_first(b, t);
- } while (p >= k);
- break;
- case BSET_RW_AUX_TREE:
- offset = __btree_node_key_to_offset(b, k);
- j = rw_aux_tree_bsearch(b, t, offset);
- p = j ? rw_aux_to_bkey(b, t, j - 1)
- : btree_bkey_first(b, t);
- break;
- }
-
- return p;
-}
-
-struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
- struct bset_tree *t,
- struct bkey_packed *k,
- unsigned min_key_type)
-{
- struct bkey_packed *p, *i, *ret = NULL, *orig_k = k;
-
- while ((p = __bkey_prev(b, t, k)) && !ret) {
- for (i = p; i != k; i = bkey_p_next(i))
- if (i->type >= min_key_type)
- ret = i;
-
- k = p;
- }
-
- if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) {
- BUG_ON(ret >= orig_k);
-
- for (i = ret
- ? bkey_p_next(ret)
- : btree_bkey_first(b, t);
- i != orig_k;
- i = bkey_p_next(i))
- BUG_ON(i->type >= min_key_type);
- }
-
- return ret;
-}
-
-/* Insert */
-
-static void rw_aux_tree_insert_entry(struct btree *b,
- struct bset_tree *t,
- unsigned idx)
-{
- EBUG_ON(!idx || idx > t->size);
- struct bkey_packed *start = rw_aux_to_bkey(b, t, idx - 1);
- struct bkey_packed *end = idx < t->size
- ? rw_aux_to_bkey(b, t, idx)
- : btree_bkey_last(b, t);
-
- if (t->size < bset_rw_tree_capacity(b, t) &&
- (void *) end - (void *) start > L1_CACHE_BYTES) {
- struct bkey_packed *k = start;
-
- while (1) {
- k = bkey_p_next(k);
- if (k == end)
- break;
-
- if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
- memmove(&rw_aux_tree(b, t)[idx + 1],
- &rw_aux_tree(b, t)[idx],
- (void *) &rw_aux_tree(b, t)[t->size] -
- (void *) &rw_aux_tree(b, t)[idx]);
- t->size++;
- rw_aux_tree_set(b, t, idx, k);
- break;
- }
- }
- }
-}
-
-static void bch2_bset_fix_lookup_table(struct btree *b,
- struct bset_tree *t,
- struct bkey_packed *_where,
- unsigned clobber_u64s,
- unsigned new_u64s)
-{
- int shift = new_u64s - clobber_u64s;
- unsigned idx, j, where = __btree_node_key_to_offset(b, _where);
-
- EBUG_ON(bset_has_ro_aux_tree(t));
-
- if (!bset_has_rw_aux_tree(t))
- return;
-
- if (where > rw_aux_tree(b, t)[t->size - 1].offset) {
- rw_aux_tree_insert_entry(b, t, t->size);
- goto verify;
- }
-
- /* returns first entry >= where */
- idx = rw_aux_tree_bsearch(b, t, where);
-
- if (rw_aux_tree(b, t)[idx].offset == where) {
- if (!idx) { /* never delete first entry */
- idx++;
- } else if (where < t->end_offset) {
- rw_aux_tree_set(b, t, idx++, _where);
- } else {
- EBUG_ON(where != t->end_offset);
- rw_aux_tree_insert_entry(b, t, --t->size);
- goto verify;
- }
- }
-
- EBUG_ON(idx < t->size && rw_aux_tree(b, t)[idx].offset <= where);
- if (idx < t->size &&
- rw_aux_tree(b, t)[idx].offset + shift ==
- rw_aux_tree(b, t)[idx - 1].offset) {
- memmove(&rw_aux_tree(b, t)[idx],
- &rw_aux_tree(b, t)[idx + 1],
- (void *) &rw_aux_tree(b, t)[t->size] -
- (void *) &rw_aux_tree(b, t)[idx + 1]);
- t->size -= 1;
- }
-
- for (j = idx; j < t->size; j++)
- rw_aux_tree(b, t)[j].offset += shift;
-
- EBUG_ON(idx < t->size &&
- rw_aux_tree(b, t)[idx].offset ==
- rw_aux_tree(b, t)[idx - 1].offset);
-
- rw_aux_tree_insert_entry(b, t, idx);
-
-verify:
- bch2_bset_verify_rw_aux_tree(b, t);
- bset_aux_tree_verify(b);
-}
-
-void bch2_bset_insert(struct btree *b,
- struct bkey_packed *where,
- struct bkey_i *insert,
- unsigned clobber_u64s)
-{
- struct bkey_format *f = &b->format;
- struct bset_tree *t = bset_tree_last(b);
- struct bkey_packed packed, *src = bkey_to_packed(insert);
-
- bch2_bset_verify_rw_aux_tree(b, t);
- bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s);
-
- if (bch2_bkey_pack_key(&packed, &insert->k, f))
- src = &packed;
-
- if (!bkey_deleted(&insert->k))
- btree_keys_account_key_add(&b->nr, t - b->set, src);
-
- if (src->u64s != clobber_u64s) {
- u64 *src_p = (u64 *) where->_data + clobber_u64s;
- u64 *dst_p = (u64 *) where->_data + src->u64s;
-
- EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
- (int) clobber_u64s - src->u64s);
-
- memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
- le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s);
- set_btree_bset_end(b, t);
- }
-
- memcpy_u64s_small(where, src,
- bkeyp_key_u64s(f, src));
- memcpy_u64s(bkeyp_val(f, where), &insert->v,
- bkeyp_val_u64s(f, src));
-
- if (src->u64s != clobber_u64s)
- bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);
-
- bch2_verify_btree_nr_keys(b);
-}
-
-void bch2_bset_delete(struct btree *b,
- struct bkey_packed *where,
- unsigned clobber_u64s)
-{
- struct bset_tree *t = bset_tree_last(b);
- u64 *src_p = (u64 *) where->_data + clobber_u64s;
- u64 *dst_p = where->_data;
-
- bch2_bset_verify_rw_aux_tree(b, t);
-
- EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s);
-
- memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
- le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s);
- set_btree_bset_end(b, t);
-
- bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0);
-}
-
-/* Lookup */
-
-__flatten
-static struct bkey_packed *bset_search_write_set(const struct btree *b,
- struct bset_tree *t,
- struct bpos *search)
-{
- unsigned l = 0, r = t->size;
-
- while (l + 1 != r) {
- unsigned m = (l + r) >> 1;
-
- if (bpos_lt(rw_aux_tree(b, t)[m].k, *search))
- l = m;
- else
- r = m;
- }
-
- return rw_aux_to_bkey(b, t, l);
-}
-
-static inline void prefetch_four_cachelines(void *p)
-{
-#ifdef CONFIG_X86_64
- asm("prefetcht0 (-127 + 64 * 0)(%0);"
- "prefetcht0 (-127 + 64 * 1)(%0);"
- "prefetcht0 (-127 + 64 * 2)(%0);"
- "prefetcht0 (-127 + 64 * 3)(%0);"
- :
- : "r" (p + 127));
-#else
- prefetch(p + L1_CACHE_BYTES * 0);
- prefetch(p + L1_CACHE_BYTES * 1);
- prefetch(p + L1_CACHE_BYTES * 2);
- prefetch(p + L1_CACHE_BYTES * 3);
-#endif
-}
-
-static inline bool bkey_mantissa_bits_dropped(const struct btree *b,
- const struct bkey_float *f)
-{
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits;
-
- return f->exponent > key_bits_start;
-#else
- unsigned key_bits_end = high_bit_offset + b->nr_key_bits;
-
- return f->exponent + BKEY_MANTISSA_BITS < key_bits_end;
-#endif
-}
-
-__flatten
-static struct bkey_packed *bset_search_tree(const struct btree *b,
- const struct bset_tree *t,
- const struct bpos *search,
- const struct bkey_packed *packed_search)
-{
- struct ro_aux_tree *base = ro_aux_tree_base(b, t);
- struct bkey_float *f;
- struct bkey_packed *k;
- unsigned inorder, n = 1, l, r;
- int cmp;
-
- do {
- if (likely(n << 4 < t->size))
- prefetch(&base->f[n << 4]);
-
- f = &base->f[n];
- if (unlikely(f->exponent >= BFLOAT_FAILED))
- goto slowpath;
-
- l = f->mantissa;
- r = bkey_mantissa(packed_search, f);
-
- if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f))
- goto slowpath;
-
- n = n * 2 + (l < r);
- continue;
-slowpath:
- k = tree_to_bkey(b, t, n);
- cmp = bkey_cmp_p_or_unp(b, k, packed_search, search);
- if (!cmp)
- return k;
-
- n = n * 2 + (cmp < 0);
- } while (n < t->size);
-
- inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra);
-
- /*
- * n would have been the node we recursed to - the low bit tells us if
- * we recursed left or recursed right.
- */
- if (likely(!(n & 1))) {
- --inorder;
- if (unlikely(!inorder))
- return btree_bkey_first(b, t);
-
- f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)];
- }
-
- return cacheline_to_bkey(b, t, inorder, f->key_offset);
-}
-
-static __always_inline __flatten
-struct bkey_packed *__bch2_bset_search(struct btree *b,
- struct bset_tree *t,
- struct bpos *search,
- const struct bkey_packed *lossy_packed_search)
-{
-
- /*
- * First we search for a cacheline, and then we do a linear search
- * within that cacheline.
- *
- * To search for the cacheline, there's three different possibilities:
- * * The set is too small to have a search tree, so we just do a linear
- * search over the whole set.
- * * The set is the one we're currently inserting into; keeping a full
- * auxiliary search tree up to date would be too expensive, so we
- * use a much simpler lookup table to do a binary search -
- * bset_search_write_set().
- * * Or we use the auxiliary search tree we constructed earlier -
- * bset_search_tree()
- */
-
- switch (bset_aux_tree_type(t)) {
- case BSET_NO_AUX_TREE:
- return btree_bkey_first(b, t);
- case BSET_RW_AUX_TREE:
- return bset_search_write_set(b, t, search);
- case BSET_RO_AUX_TREE:
- return bset_search_tree(b, t, search, lossy_packed_search);
- default:
- BUG();
- }
-}
-
-static __always_inline __flatten
-struct bkey_packed *bch2_bset_search_linear(struct btree *b,
- struct bset_tree *t,
- struct bpos *search,
- struct bkey_packed *packed_search,
- const struct bkey_packed *lossy_packed_search,
- struct bkey_packed *m)
-{
- if (lossy_packed_search)
- while (m != btree_bkey_last(b, t) &&
- bkey_iter_cmp_p_or_unp(b, m,
- lossy_packed_search, search) < 0)
- m = bkey_p_next(m);
-
- if (!packed_search)
- while (m != btree_bkey_last(b, t) &&
- bkey_iter_pos_cmp(b, m, search) < 0)
- m = bkey_p_next(m);
-
- if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) {
- struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
-
- BUG_ON(prev &&
- bkey_iter_cmp_p_or_unp(b, prev,
- packed_search, search) >= 0);
- }
-
- return m;
-}
-
-/* Btree node iterator */
-
-static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter,
- struct btree *b,
- const struct bkey_packed *k,
- const struct bkey_packed *end)
-{
- if (k != end) {
- struct btree_node_iter_set *pos;
-
- btree_node_iter_for_each(iter, pos)
- ;
-
- BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data));
- *pos = (struct btree_node_iter_set) {
- __btree_node_key_to_offset(b, k),
- __btree_node_key_to_offset(b, end)
- };
- }
-}
-
-void bch2_btree_node_iter_push(struct btree_node_iter *iter,
- struct btree *b,
- const struct bkey_packed *k,
- const struct bkey_packed *end)
-{
- __bch2_btree_node_iter_push(iter, b, k, end);
- bch2_btree_node_iter_sort(iter, b);
-}
-
-noinline __flatten __cold
-static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
- struct btree *b, struct bpos *search)
-{
- struct bkey_packed *k;
-
- trace_bkey_pack_pos_fail(search);
-
- bch2_btree_node_iter_init_from_start(iter, b);
-
- while ((k = bch2_btree_node_iter_peek(iter, b)) &&
- bkey_iter_pos_cmp(b, k, search) < 0)
- bch2_btree_node_iter_advance(iter, b);
-}
-
-/**
- * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a
- * given position
- *
- * @iter: iterator to initialize
- * @b: btree node to search
- * @search: search key
- *
- * Main entry point to the lookup code for individual btree nodes:
- *
- * NOTE:
- *
- * When you don't filter out deleted keys, btree nodes _do_ contain duplicate
- * keys. This doesn't matter for most code, but it does matter for lookups.
- *
- * Consider some adjacent keys, including a run of equal keys:
- * i j k k k k l m
- *
- * If you search for k, the lookup code isn't guaranteed to return you any
- * specific k. The lookup code is conceptually doing a binary search and
- * iterating backwards is very expensive so if the pivot happens to land at the
- * last k that's what you'll get.
- *
- * This works out ok, but it's something to be aware of:
- *
- * - For non extents, we guarantee that the live key comes last - see
- * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't
- * see will only be deleted keys you don't care about.
- *
- * - For extents, deleted keys sort last (see the comment at the top of this
- * file). But when you're searching for extents, you actually want the first
- * key strictly greater than your search key - an extent that compares equal
- * to the search key is going to have 0 sectors after the search key.
- *
- * But this does mean that we can't just search for
- * bpos_successor(start_of_range) to get the first extent that overlaps with
- * the range we want - if we're unlucky and there's an extent that ends
- * exactly where we searched, then there could be a deleted key at the same
- * position and we'd get that when we search instead of the preceding extent
- * we needed.
- *
- * So we've got to search for start_of_range, then after the lookup iterate
- * past any extents that compare equal to the position we searched for.
- */
-__flatten
-void bch2_btree_node_iter_init(struct btree_node_iter *iter,
- struct btree *b, struct bpos *search)
-{
- struct bkey_packed p, *packed_search = NULL;
- struct btree_node_iter_set *pos = iter->data;
- struct bkey_packed *k[MAX_BSETS];
- unsigned i;
-
- EBUG_ON(bpos_lt(*search, b->data->min_key));
- EBUG_ON(bpos_gt(*search, b->data->max_key));
- bset_aux_tree_verify(b);
-
- memset(iter, 0, sizeof(*iter));
-
- switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) {
- case BKEY_PACK_POS_EXACT:
- packed_search = &p;
- break;
- case BKEY_PACK_POS_SMALLER:
- packed_search = NULL;
- break;
- case BKEY_PACK_POS_FAIL:
- btree_node_iter_init_pack_failed(iter, b, search);
- return;
- }
-
- for (i = 0; i < b->nsets; i++) {
- k[i] = __bch2_bset_search(b, b->set + i, search, &p);
- prefetch_four_cachelines(k[i]);
- }
-
- for (i = 0; i < b->nsets; i++) {
- struct bset_tree *t = b->set + i;
- struct bkey_packed *end = btree_bkey_last(b, t);
-
- k[i] = bch2_bset_search_linear(b, t, search,
- packed_search, &p, k[i]);
- if (k[i] != end)
- *pos++ = (struct btree_node_iter_set) {
- __btree_node_key_to_offset(b, k[i]),
- __btree_node_key_to_offset(b, end)
- };
- }
-
- bch2_btree_node_iter_sort(iter, b);
-}
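A small sketch of the duplicate-handling note above, on a plain sorted uint64_t array rather than real bkeys: find the first key >= the search position, then step past everything that still compares equal to get the first key strictly greater:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static size_t demo_first_greater(const uint64_t *keys, size_t nr, uint64_t search)
{
	size_t l = 0, r = nr;

	/* binary search for the first key >= search */
	while (l < r) {
		size_t m = l + (r - l) / 2;

		if (keys[m] < search)
			l = m + 1;
		else
			r = m;
	}

	/* then step past keys that compare equal to the search position */
	while (l < nr && keys[l] == search)
		l++;

	return l;
}

int main(void)
{
	const uint64_t keys[] = { 1, 3, 5, 5, 5, 5, 8, 9 };

	assert(demo_first_greater(keys, 8, 5) == 6);	/* index of 8 */
	assert(demo_first_greater(keys, 8, 2) == 1);	/* index of 3 */
	return 0;
}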
-
-void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter,
- struct btree *b)
-{
- memset(iter, 0, sizeof(*iter));
-
- for_each_bset(b, t)
- __bch2_btree_node_iter_push(iter, b,
- btree_bkey_first(b, t),
- btree_bkey_last(b, t));
- bch2_btree_node_iter_sort(iter, b);
-}
-
-struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter,
- struct btree *b,
- struct bset_tree *t)
-{
- struct btree_node_iter_set *set;
-
- btree_node_iter_for_each(iter, set)
- if (set->end == t->end_offset)
- return __btree_node_offset_to_key(b, set->k);
-
- return btree_bkey_last(b, t);
-}
-
-static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter,
- struct btree *b,
- unsigned first)
-{
- bool ret;
-
- if ((ret = (btree_node_iter_cmp(b,
- iter->data[first],
- iter->data[first + 1]) > 0)))
- swap(iter->data[first], iter->data[first + 1]);
- return ret;
-}
-
-void bch2_btree_node_iter_sort(struct btree_node_iter *iter,
- struct btree *b)
-{
- /* unrolled bubble sort: */
-
- if (!__btree_node_iter_set_end(iter, 2)) {
- btree_node_iter_sort_two(iter, b, 0);
- btree_node_iter_sort_two(iter, b, 1);
- }
-
- if (!__btree_node_iter_set_end(iter, 1))
- btree_node_iter_sort_two(iter, b, 0);
-}
-
-void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter,
- struct btree_node_iter_set *set)
-{
- struct btree_node_iter_set *last =
- iter->data + ARRAY_SIZE(iter->data) - 1;
-
- memmove(&set[0], &set[1], (void *) last - (void *) set);
- *last = (struct btree_node_iter_set) { 0, 0 };
-}
-
-static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
- struct btree *b)
-{
- iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s;
-
- EBUG_ON(iter->data->k > iter->data->end);
-
- if (unlikely(__btree_node_iter_set_end(iter, 0))) {
- /* avoid an expensive memmove call: */
- iter->data[0] = iter->data[1];
- iter->data[1] = iter->data[2];
- iter->data[2] = (struct btree_node_iter_set) { 0, 0 };
- return;
- }
-
- if (__btree_node_iter_set_end(iter, 1))
- return;
-
- if (!btree_node_iter_sort_two(iter, b, 0))
- return;
-
- if (__btree_node_iter_set_end(iter, 2))
- return;
-
- btree_node_iter_sort_two(iter, b, 1);
-}
-
-void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
- struct btree *b)
-{
- if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) {
- __bch2_btree_node_iter_verify(iter, b);
- __bch2_btree_node_iter_next_check(iter, b);
- }
-
- __bch2_btree_node_iter_advance(iter, b);
-}
-
-/*
- * Expensive:
- */
-struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
- struct btree *b)
-{
- struct bkey_packed *k, *prev = NULL;
- struct btree_node_iter_set *set;
- unsigned end = 0;
-
- bch2_btree_node_iter_verify(iter, b);
-
- for_each_bset(b, t) {
- k = bch2_bkey_prev_all(b, t,
- bch2_btree_node_iter_bset_pos(iter, b, t));
- if (k &&
- (!prev || bkey_iter_cmp(b, k, prev) > 0)) {
- prev = k;
- end = t->end_offset;
- }
- }
-
- if (!prev)
- return NULL;
-
- /*
- * We're manually memmoving instead of just calling sort() to ensure the
- * prev we picked ends up in slot 0 - sort won't necessarily put it
- * there because of duplicate deleted keys:
- */
- btree_node_iter_for_each(iter, set)
- if (set->end == end)
- goto found;
-
- BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]);
-found:
- BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data));
-
- memmove(&iter->data[1],
- &iter->data[0],
- (void *) set - (void *) &iter->data[0]);
-
- iter->data[0].k = __btree_node_key_to_offset(b, prev);
- iter->data[0].end = end;
-
- bch2_btree_node_iter_verify(iter, b);
- return prev;
-}
-
-struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter,
- struct btree *b)
-{
- struct bkey_packed *prev;
-
- do {
- prev = bch2_btree_node_iter_prev_all(iter, b);
- } while (prev && bkey_deleted(prev));
-
- return prev;
-}
-
-struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
- struct btree *b,
- struct bkey *u)
-{
- struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b);
-
- return k ? bkey_disassemble(b, k, u) : bkey_s_c_null;
-}
-
-/* Mergesort */
-
-void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats)
-{
- for_each_bset_c(b, t) {
- enum bset_aux_tree_type type = bset_aux_tree_type(t);
- size_t j;
-
- stats->sets[type].nr++;
- stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) *
- sizeof(u64);
-
- if (bset_has_ro_aux_tree(t)) {
- stats->floats += t->size - 1;
-
- for (j = 1; j < t->size; j++)
- stats->failed +=
- bkey_float(b, t, j)->exponent ==
- BFLOAT_FAILED;
- }
- }
-}
-
-void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
- struct bkey_packed *k)
-{
- struct bset_tree *t = bch2_bkey_to_bset(b, k);
- struct bkey uk;
- unsigned j, inorder;
-
- if (!bset_has_ro_aux_tree(t))
- return;
-
- inorder = bkey_to_cacheline(b, t, k);
- if (!inorder || inorder >= t->size)
- return;
-
- j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra);
- if (k != tree_to_bkey(b, t, j))
- return;
-
- switch (bkey_float(b, t, j)->exponent) {
- case BFLOAT_FAILED:
- uk = bkey_unpack_key(b, k);
- prt_printf(out,
- " failed unpacked at depth %u\n"
- "\t",
- ilog2(j));
- bch2_bpos_to_text(out, uk.p);
- prt_printf(out, "\n");
- break;
- }
-}
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
deleted file mode 100644
index a15ecf9d006e..000000000000
--- a/fs/bcachefs/bset.h
+++ /dev/null
@@ -1,536 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BSET_H
-#define _BCACHEFS_BSET_H
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-
-#include "bcachefs.h"
-#include "bkey.h"
-#include "bkey_methods.h"
-#include "btree_types.h"
-#include "util.h" /* for time_stats */
-#include "vstructs.h"
-
-/*
- * BKEYS:
- *
- * A bkey contains a key, a size field, a variable number of pointers, and some
- * ancillary flag bits.
- *
- * We use two different functions for validating bkeys, bkey_invalid and
- * bkey_deleted().
- *
- * The one exception to the rule that ptr_invalid() filters out invalid keys is
- * that it also filters out keys of size 0 - these are keys that have been
- * completely overwritten. It'd be safe to delete these in memory while leaving
- * them on disk, just unnecessary work - so we filter them out when resorting
- * instead.
- *
- * We can't filter out stale keys when we're resorting, because garbage
- * collection needs to find them to ensure bucket gens don't wrap around -
- * unless we're rewriting the btree node those stale keys still exist on disk.
- *
- * We also implement functions here for removing some number of sectors from the
- * front or the back of a bkey - this is mainly used for fixing overlapping
- * extents, by removing the overlapping sectors from the older key.
- *
- * BSETS:
- *
- * A bset is an array of bkeys laid out contiguously in memory in sorted order,
- * along with a header. A btree node is made up of a number of these, written at
- * different times.
- *
- * There could be many of them on disk, but we never allow there to be more than
- * 4 in memory - we lazily resort as needed.
- *
- * We implement code here for creating and maintaining auxiliary search trees
- * (described below) for searching an individual bset, and on top of that we
- * implement a btree iterator.
- *
- * BTREE ITERATOR:
- *
- * Most of the code in bcache doesn't care about an individual bset - it needs
- * to search entire btree nodes and iterate over them in sorted order.
- *
- * The btree iterator code serves both functions; it iterates through the keys
- * in a btree node in sorted order, starting from either keys after a specific
- * point (if you pass it a search key) or the start of the btree node.
- *
- * AUXILIARY SEARCH TREES:
- *
- * Since keys are variable length, we can't use a binary search on a bset - we
- * wouldn't be able to find the start of the next key. But binary searches are
- * slow anyways, due to terrible cache behaviour; bcache originally used binary
- * searches and that code topped out at under 50k lookups/second.
- *
- * So we need to construct some sort of lookup table. Since we only insert keys
- * into the last (unwritten) set, most of the keys within a given btree node are
- * usually in sets that are mostly constant. We use two different types of
- * lookup tables to take advantage of this.
- *
- * Both lookup tables share in common that they don't index every key in the
- * set; they index one key every BSET_CACHELINE bytes, and then a linear search
- * is used for the rest.
- *
- * For sets that have been written to disk and are no longer being inserted
- * into, we construct a binary search tree in an array - traversing a binary
- * search tree in an array gives excellent locality of reference and is very
- * fast, since both children of any node are adjacent to each other in memory
- * (and their grandchildren, and great grandchildren...) - this means
- * prefetching can be used to great effect.
- *
- * It's quite useful performance wise to keep these nodes small - not just
- * because they're more likely to be in L2, but also because we can prefetch
- * more nodes on a single cacheline and thus prefetch more iterations in advance
- * when traversing this tree.
- *
- * Nodes in the auxiliary search tree must contain both a key to compare against
- * (we don't want to fetch the key from the set, that would defeat the purpose),
- * and a pointer to the key. We use a few tricks to compress both of these.
- *
- * To compress the pointer, we take advantage of the fact that one node in the
- * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have
- * a function (to_inorder()) that takes the index of a node in a binary tree and
- * returns what its index would be in an inorder traversal, so we only have to
- * store the low bits of the offset.
- *
- * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To
- * compress that, we take advantage of the fact that when we're traversing the
- * search tree at every iteration we know that both our search key and the key
- * we're looking for lie within some range - bounded by our previous
- * comparisons. (We special case the start of a search so that this is true even
- * at the root of the tree).
- *
- * So we know the key we're looking for is between a and b, and a and b don't
- * differ higher than bit 50, we don't need to check anything higher than bit
- * 50.
- *
- * We don't usually need the rest of the bits, either; we only need enough bits
- * to partition the key range we're currently checking. Consider key n - the
- * key our auxiliary search tree node corresponds to, and key p, the key
- * immediately preceding n. The lowest bit we need to store in the auxiliary
- * search tree is the highest bit that differs between n and p.
- *
- * Note that this could be bit 0 - we might sometimes need all 80 bits to do the
- * comparison. But we'd really like our nodes in the auxiliary search tree to be
- * of fixed size.
- *
- * The solution is to make them fixed size, and when we're constructing a node
- * check if p and n differed in the bits we needed them to. If they don't we
- * flag that node, and when doing lookups we fallback to comparing against the
- * real key. As long as this doesn't happen to often (and it seems to reliably
- * happen a bit less than 1% of the time), we win - even on failures, that key
- * is then more likely to be in cache than if we were doing binary searches all
- * the way, since we're touching so much less memory.
- *
- * The keys in the auxiliary search tree are stored in (software) floating
- * point, with an exponent and a mantissa. The exponent needs to be big enough
- * to address all the bits in the original key, but the number of bits in the
- * mantissa is somewhat arbitrary; more bits just gets us fewer failures.
- *
- * We need 7 bits for the exponent and 3 bits for the key's offset (since keys
- * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes.
- * We need one node per 128 bytes in the btree node, which means the auxiliary
- * search trees take up 3% as much memory as the btree itself.
- *
- * Constructing these auxiliary search trees is moderately expensive, and we
- * don't want to be constantly rebuilding the search tree for the last set
- * whenever we insert another key into it. For the unwritten set, we use a much
- * simpler lookup table - it's just a flat array, so index i in the lookup table
- * corresponds to the i'th range of BSET_CACHELINE bytes in the set. Indexing
- * within each byte range works the same as with the auxiliary search trees.
- *
- * These are much easier to keep up to date when we insert a key - we do it
- * somewhat lazily; when we shift a key up we usually just increment the pointer
- * to it, only when it would overflow do we go to the trouble of finding the
- * first key in that range of bytes again.
- */
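A standalone sketch of the "binary search tree in an array" layout described above: a sorted array stored in eytzinger (breadth-first) order, searched by descending the implicit tree with k = 2k + (tree[k] < search). Plain uint64_t keys, no bkey packing or bfloat compression - just the layout and traversal:

#include <assert.h>
#include <stdint.h>

#define DEMO_NR	7

/* Fill tree[1..n] (1-indexed) from a sorted array, by in-order traversal. */
static unsigned eytz_build(const uint64_t *sorted, unsigned n, uint64_t *tree,
			   unsigned i, unsigned k)
{
	if (k <= n) {
		i = eytz_build(sorted, n, tree, i, 2 * k);
		tree[k] = sorted[i++];
		i = eytz_build(sorted, n, tree, i, 2 * k + 1);
	}
	return i;
}

/* Returns the eytzinger index of the first element >= search, 0 if none. */
static unsigned eytz_lower_bound(const uint64_t *tree, unsigned n, uint64_t search)
{
	unsigned k = 1;

	while (k <= n)
		k = 2 * k + (tree[k] < search);

	/* undo the trailing "went right" steps to land back on the answer */
	return k >> __builtin_ffs(~k);
}

int main(void)
{
	const uint64_t sorted[DEMO_NR] = { 10, 20, 30, 40, 50, 60, 70 };
	uint64_t tree[DEMO_NR + 1];

	eytz_build(sorted, DEMO_NR, tree, 0, 1);

	assert(tree[eytz_lower_bound(tree, DEMO_NR, 35)] == 40);
	assert(tree[eytz_lower_bound(tree, DEMO_NR, 60)] == 60);
	assert(eytz_lower_bound(tree, DEMO_NR, 75) == 0);
	return 0;
}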
-
-enum bset_aux_tree_type {
- BSET_NO_AUX_TREE,
- BSET_RO_AUX_TREE,
- BSET_RW_AUX_TREE,
-};
-
-#define BSET_TREE_NR_TYPES 3
-
-#define BSET_NO_AUX_TREE_VAL (U16_MAX)
-#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1)
-
-static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t)
-{
- switch (t->extra) {
- case BSET_NO_AUX_TREE_VAL:
- EBUG_ON(t->size);
- return BSET_NO_AUX_TREE;
- case BSET_RW_AUX_TREE_VAL:
- EBUG_ON(!t->size);
- return BSET_RW_AUX_TREE;
- default:
- EBUG_ON(!t->size);
- return BSET_RO_AUX_TREE;
- }
-}
-
-/*
- * BSET_CACHELINE was originally intended to match the hardware cacheline size -
- * it used to be 64, but I realized the lookup code would touch slightly less
- * memory if it was 128.
- *
- * It defines the number of bytes (in struct bset) per struct bkey_float in
- * the auxiliary search tree - when we're done searching the bkey_float tree we
- * have this many bytes left that we do a linear search over.
- *
- * Since (after level 5) every level of the bset_tree is on a new cacheline,
- * we're touching one fewer cacheline in the bset tree in exchange for one more
- * cacheline in the linear search - but the linear search might stop before it
- * gets to the second cacheline.
- */
-
-#define BSET_CACHELINE 256
-
-static inline size_t btree_keys_cachelines(const struct btree *b)
-{
- return (1U << b->byte_order) / BSET_CACHELINE;
-}
-
-static inline size_t btree_aux_data_bytes(const struct btree *b)
-{
- return btree_keys_cachelines(b) * 8;
-}
-
-static inline size_t btree_aux_data_u64s(const struct btree *b)
-{
- return btree_aux_data_bytes(b) / sizeof(u64);
-}
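A quick sketch of what this sizing works out to - one lookup entry per BSET_CACHELINE bytes of keys, 8 bytes of aux-data budget per cacheline - using a 256KiB node purely as an example value:

#include <stddef.h>
#include <stdio.h>

#define DEMO_BSET_CACHELINE	256

int main(void)
{
	unsigned byte_order = 18;			/* 256KiB btree node */
	size_t node_bytes = (size_t) 1 << byte_order;
	size_t cachelines = node_bytes / DEMO_BSET_CACHELINE;
	size_t aux_bytes  = cachelines * 8;

	printf("%zu byte node -> %zu lookup entries, %zu bytes of aux data\n",
	       node_bytes, cachelines, aux_bytes);	/* 262144 -> 1024, 8192 */
	return 0;
}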
-
-#define for_each_bset(_b, _t) \
- for (struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
-
-#define for_each_bset_c(_b, _t) \
- for (const struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
-
-#define bset_tree_for_each_key(_b, _t, _k) \
- for (_k = btree_bkey_first(_b, _t); \
- _k != btree_bkey_last(_b, _t); \
- _k = bkey_p_next(_k))
-
-static inline bool bset_has_ro_aux_tree(const struct bset_tree *t)
-{
- return bset_aux_tree_type(t) == BSET_RO_AUX_TREE;
-}
-
-static inline bool bset_has_rw_aux_tree(struct bset_tree *t)
-{
- return bset_aux_tree_type(t) == BSET_RW_AUX_TREE;
-}
-
-static inline void bch2_bset_set_no_aux_tree(struct btree *b,
- struct bset_tree *t)
-{
- BUG_ON(t < b->set);
-
- for (; t < b->set + ARRAY_SIZE(b->set); t++) {
- t->size = 0;
- t->extra = BSET_NO_AUX_TREE_VAL;
- t->aux_data_offset = U16_MAX;
- }
-}
-
-static inline void btree_node_set_format(struct btree *b,
- struct bkey_format f)
-{
- int len;
-
- b->format = f;
- b->nr_key_bits = bkey_format_key_bits(&f);
-
- len = bch2_compile_bkey_format(&b->format, b->aux_data);
- BUG_ON(len < 0 || len > U8_MAX);
-
- b->unpack_fn_len = len;
-
- bch2_bset_set_no_aux_tree(b, b->set);
-}
-
-static inline struct bset *bset_next_set(struct btree *b,
- unsigned block_bytes)
-{
- struct bset *i = btree_bset_last(b);
-
- EBUG_ON(!is_power_of_2(block_bytes));
-
- return ((void *) i) + round_up(vstruct_bytes(i), block_bytes);
-}
-
-void bch2_btree_keys_init(struct btree *);
-
-void bch2_bset_init_first(struct btree *, struct bset *);
-void bch2_bset_init_next(struct btree *, struct btree_node_entry *);
-void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
-
-void bch2_bset_insert(struct btree *, struct bkey_packed *, struct bkey_i *,
- unsigned);
-void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned);
-
-/* Bkey utility code */
-
-/* packed or unpacked */
-static inline int bkey_cmp_p_or_unp(const struct btree *b,
- const struct bkey_packed *l,
- const struct bkey_packed *r_packed,
- const struct bpos *r)
-{
- EBUG_ON(r_packed && !bkey_packed(r_packed));
-
- if (unlikely(!bkey_packed(l)))
- return bpos_cmp(packed_to_bkey_c(l)->p, *r);
-
- if (likely(r_packed))
- return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b);
-
- return __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
-}
-
-static inline struct bset_tree *
-bch2_bkey_to_bset_inlined(struct btree *b, struct bkey_packed *k)
-{
- unsigned offset = __btree_node_key_to_offset(b, k);
-
- for_each_bset(b, t)
- if (offset <= t->end_offset) {
- EBUG_ON(offset < btree_bkey_first_offset(t));
- return t;
- }
-
- BUG();
-}
-
-struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *);
-
-struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *,
- struct bkey_packed *, unsigned);
-
-static inline struct bkey_packed *
-bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
-{
- return bch2_bkey_prev_filter(b, t, k, 0);
-}
-
-static inline struct bkey_packed *
-bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
-{
- return bch2_bkey_prev_filter(b, t, k, 1);
-}
-
-/* Btree key iteration */
-
-void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *,
- const struct bkey_packed *,
- const struct bkey_packed *);
-void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *,
- struct bpos *);
-void bch2_btree_node_iter_init_from_start(struct btree_node_iter *,
- struct btree *);
-struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *,
- struct btree *,
- struct bset_tree *);
-
-void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *);
-void bch2_btree_node_iter_set_drop(struct btree_node_iter *,
- struct btree_node_iter_set *);
-void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *);
-
-#define btree_node_iter_for_each(_iter, _set) \
- for (_set = (_iter)->data; \
- _set < (_iter)->data + ARRAY_SIZE((_iter)->data) && \
- (_set)->k != (_set)->end; \
- _set++)
-
-static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter,
- unsigned i)
-{
- return iter->data[i].k == iter->data[i].end;
-}
-
-static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter)
-{
- return __btree_node_iter_set_end(iter, 0);
-}
-
-/*
- * When keys compare equal, deleted keys compare first:
- *
- * XXX: only need to compare pointers for keys that are both within a
- * btree_node_iterator - we need to break ties for prev() to work correctly
- */
-static inline int bkey_iter_cmp(const struct btree *b,
- const struct bkey_packed *l,
- const struct bkey_packed *r)
-{
- return bch2_bkey_cmp_packed(b, l, r)
- ?: (int) bkey_deleted(r) - (int) bkey_deleted(l)
- ?: cmp_int(l, r);
-}
-
-static inline int btree_node_iter_cmp(const struct btree *b,
- struct btree_node_iter_set l,
- struct btree_node_iter_set r)
-{
- return bkey_iter_cmp(b,
- __btree_node_offset_to_key(b, l.k),
- __btree_node_offset_to_key(b, r.k));
-}
-
-/* These assume r (the search key) is not a deleted key: */
-static inline int bkey_iter_pos_cmp(const struct btree *b,
- const struct bkey_packed *l,
- const struct bpos *r)
-{
- return bkey_cmp_left_packed(b, l, r)
- ?: -((int) bkey_deleted(l));
-}
-
-static inline int bkey_iter_cmp_p_or_unp(const struct btree *b,
- const struct bkey_packed *l,
- const struct bkey_packed *r_packed,
- const struct bpos *r)
-{
- return bkey_cmp_p_or_unp(b, l, r_packed, r)
- ?: -((int) bkey_deleted(l));
-}
-
-static inline struct bkey_packed *
-__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter,
- struct btree *b)
-{
- return __btree_node_offset_to_key(b, iter->data->k);
-}
-
-static inline struct bkey_packed *
-bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, struct btree *b)
-{
- return !bch2_btree_node_iter_end(iter)
- ? __btree_node_offset_to_key(b, iter->data->k)
- : NULL;
-}
-
-static inline struct bkey_packed *
-bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b)
-{
- struct bkey_packed *k;
-
- while ((k = bch2_btree_node_iter_peek_all(iter, b)) &&
- bkey_deleted(k))
- bch2_btree_node_iter_advance(iter, b);
-
- return k;
-}
-
-static inline struct bkey_packed *
-bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b)
-{
- struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b);
-
- if (ret)
- bch2_btree_node_iter_advance(iter, b);
-
- return ret;
-}
-
-struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *,
- struct btree *);
-struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *,
- struct btree *);
-
-struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
- struct btree *,
- struct bkey *);
-
-#define for_each_btree_node_key(b, k, iter) \
- for (bch2_btree_node_iter_init_from_start((iter), (b)); \
- (k = bch2_btree_node_iter_peek((iter), (b))); \
- bch2_btree_node_iter_advance(iter, b))
-
-#define for_each_btree_node_key_unpack(b, k, iter, unpacked) \
- for (bch2_btree_node_iter_init_from_start((iter), (b)); \
- (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\
- bch2_btree_node_iter_advance(iter, b))
-
-/* Accounting: */
-
-struct btree_nr_keys bch2_btree_node_count_keys(struct btree *);
-
-static inline void btree_keys_account_key(struct btree_nr_keys *n,
- unsigned bset,
- struct bkey_packed *k,
- int sign)
-{
- n->live_u64s += k->u64s * sign;
- n->bset_u64s[bset] += k->u64s * sign;
-
- if (bkey_packed(k))
- n->packed_keys += sign;
- else
- n->unpacked_keys += sign;
-}
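-
-/*
- * For example: accounting a packed key with k->u64s == 3 in bset 1 bumps
- * live_u64s and bset_u64s[1] by 3 and packed_keys by 1; calling it again with
- * sign == -1 undoes exactly those counters, which is what the
- * btree_keys_account_key_add()/_drop() wrappers below rely on.
- */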
-
-static inline void btree_keys_account_val_delta(struct btree *b,
- struct bkey_packed *k,
- int delta)
-{
- struct bset_tree *t = bch2_bkey_to_bset(b, k);
-
- b->nr.live_u64s += delta;
- b->nr.bset_u64s[t - b->set] += delta;
-}
-
-#define btree_keys_account_key_add(_nr, _bset_idx, _k) \
- btree_keys_account_key(_nr, _bset_idx, _k, 1)
-#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \
- btree_keys_account_key(_nr, _bset_idx, _k, -1)
-
-#define btree_account_key_add(_b, _k) \
- btree_keys_account_key(&(_b)->nr, \
- bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1)
-#define btree_account_key_drop(_b, _k) \
- btree_keys_account_key(&(_b)->nr, \
- bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1)
-
-struct bset_stats {
- struct {
- size_t nr, bytes;
- } sets[BSET_TREE_NR_TYPES];
-
- size_t floats;
- size_t failed;
-};
-
-void bch2_btree_keys_stats(const struct btree *, struct bset_stats *);
-void bch2_bfloat_to_text(struct printbuf *, struct btree *,
- struct bkey_packed *);
-
-/* Debug stuff */
-
-void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned);
-void bch2_dump_btree_node(struct bch_fs *, struct btree *);
-void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *);
-
-void __bch2_verify_btree_nr_keys(struct btree *);
-void __bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *);
-
-static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
- struct btree *b)
-{
- if (static_branch_unlikely(&bch2_debug_check_bset_lookups))
- __bch2_btree_node_iter_verify(iter, b);
-}
-
-static inline void bch2_verify_btree_nr_keys(struct btree *b)
-{
- if (static_branch_unlikely(&bch2_debug_check_btree_accounting))
- __bch2_verify_btree_nr_keys(b);
-}
-
-#endif /* _BCACHEFS_BSET_H */
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
deleted file mode 100644
index 83c9860e6b82..000000000000
--- a/fs/bcachefs/btree_cache.c
+++ /dev/null
@@ -1,1516 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bbpos.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_locking.h"
-#include "debug.h"
-#include "errcode.h"
-#include "error.h"
-#include "journal.h"
-#include "trace.h"
-
-#include <linux/prefetch.h>
-#include <linux/sched/mm.h>
-#include <linux/swap.h>
-
-const char * const bch2_btree_node_flags[] = {
- "typebit",
- "typebit",
- "typebit",
-#define x(f) [BTREE_NODE_##f] = #f,
- BTREE_FLAGS()
-#undef x
- NULL
-};
-
-void bch2_recalc_btree_reserve(struct bch_fs *c)
-{
- unsigned reserve = 16;
-
- if (!c->btree_roots_known[0].b)
- reserve += 8;
-
- for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
- struct btree_root *r = bch2_btree_id_root(c, i);
-
- if (r->b)
- reserve += min_t(unsigned, 1, r->b->c.level) * 8;
- }
-
- c->btree_cache.nr_reserve = reserve;
-}
-
-static inline size_t btree_cache_can_free(struct btree_cache_list *list)
-{
- struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
-
- size_t can_free = list->nr;
- if (!list->idx)
- can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
- return can_free;
-}
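-
-/*
- * For example (assuming all btree roots are known): with 10 btrees whose
- * roots are interior nodes, bch2_recalc_btree_reserve() above sets nr_reserve
- * to 16 + 10 * 8 == 96; if the unpinned live list then holds 200 nodes,
- * btree_cache_can_free() reports 104.  The pinned list (idx 1) always reports
- * its full count, since the reserve is only charged against list 0.
- */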
-
-static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
-{
- BUG_ON(!list_empty(&b->list));
-
- if (b->c.lock.readers)
- list_add(&b->list, &bc->freed_pcpu);
- else
- list_add(&b->list, &bc->freed_nonpcpu);
-}
-
-static void __bch2_btree_node_to_freelist(struct btree_cache *bc, struct btree *b)
-{
- BUG_ON(!list_empty(&b->list));
- BUG_ON(!b->data);
-
- bc->nr_freeable++;
- list_add(&b->list, &bc->freeable);
-}
-
-void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
-{
- struct btree_cache *bc = &c->btree_cache;
-
- mutex_lock(&bc->lock);
- __bch2_btree_node_to_freelist(bc, b);
- mutex_unlock(&bc->lock);
-
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
-}
-
-void __btree_node_data_free(struct btree *b)
-{
- BUG_ON(!list_empty(&b->list));
- BUG_ON(btree_node_hashed(b));
-
- /*
- * This should really be done in slub/vmalloc, but we're using the
- * kmalloc_large() path, so we're working around a slub bug by doing
- * this here:
- */
- if (b->data)
- mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE);
- if (b->aux_data)
- mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE);
-
- EBUG_ON(btree_node_write_in_flight(b));
-
- clear_btree_node_just_written(b);
-
- kvfree(b->data);
- b->data = NULL;
-#ifdef __KERNEL__
- kvfree(b->aux_data);
-#else
- munmap(b->aux_data, btree_aux_data_bytes(b));
-#endif
- b->aux_data = NULL;
-}
-
-static void btree_node_data_free(struct btree_cache *bc, struct btree *b)
-{
- BUG_ON(list_empty(&b->list));
- list_del_init(&b->list);
-
- __btree_node_data_free(b);
-
- --bc->nr_freeable;
- btree_node_to_freedlist(bc, b);
-}
-
-static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
- const void *obj)
-{
- const struct btree *b = obj;
- const u64 *v = arg->key;
-
- return b->hash_val == *v ? 0 : 1;
-}
-
-static const struct rhashtable_params bch_btree_cache_params = {
- .head_offset = offsetof(struct btree, hash),
- .key_offset = offsetof(struct btree, hash_val),
- .key_len = sizeof(u64),
- .obj_cmpfn = bch2_btree_cache_cmp_fn,
- .automatic_shrinking = true,
-};
-
-static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
-{
- BUG_ON(b->data || b->aux_data);
-
- gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;
-
- b->data = kvmalloc(btree_buf_bytes(b), gfp);
- if (!b->data)
- return bch_err_throw(c, ENOMEM_btree_node_mem_alloc);
-#ifdef __KERNEL__
- b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
-#else
- b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
- PROT_READ|PROT_WRITE|PROT_EXEC,
- MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
- if (b->aux_data == MAP_FAILED)
- b->aux_data = NULL;
-#endif
- if (!b->aux_data) {
- kvfree(b->data);
- b->data = NULL;
- return bch_err_throw(c, ENOMEM_btree_node_mem_alloc);
- }
-
- return 0;
-}
-
-static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
-{
- struct btree *b;
-
- b = kzalloc(sizeof(struct btree), gfp);
- if (!b)
- return NULL;
-
- bkey_btree_ptr_init(&b->key);
- INIT_LIST_HEAD(&b->list);
- INIT_LIST_HEAD(&b->write_blocked);
- b->byte_order = ilog2(c->opts.btree_node_size);
- return b;
-}
-
-struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
-{
- struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL);
- if (!b)
- return NULL;
-
- if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
- kfree(b);
- return NULL;
- }
-
- bch2_btree_lock_init(&b->c, 0, GFP_KERNEL);
- return b;
-}
-
-static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
-{
- struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
-
- u64 mask = bc->pinned_nodes_mask[!!b->c.level];
-
- return ((mask & BIT_ULL(b->c.btree_id)) &&
- bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
- bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
-}
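-
-/*
- * For example, a caller that wants to keep the interior nodes of the extents
- * btree resident over some key range might (roughly) set up:
- *
- *	bc->pinned_nodes_mask[1] = BIT_ULL(BTREE_ID_extents);
- *	bc->pinned_nodes_start	 = BBPOS(BTREE_ID_extents, start_pos);
- *	bc->pinned_nodes_end	 = BBPOS(BTREE_ID_extents, end_pos);
- *
- * Leaf nodes in the same range stay unpinned because pinned_nodes_mask[0] is
- * left clear; the check above only consults the mask for the node's level
- * class (leaf vs. interior).
- */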
-
-void bch2_node_pin(struct bch_fs *c, struct btree *b)
-{
- struct btree_cache *bc = &c->btree_cache;
-
- mutex_lock(&bc->lock);
- if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
- set_btree_node_pinned(b);
- list_move(&b->list, &bc->live[1].list);
- bc->live[0].nr--;
- bc->live[1].nr++;
- }
- mutex_unlock(&bc->lock);
-}
-
-void bch2_btree_cache_unpin(struct bch_fs *c)
-{
- struct btree_cache *bc = &c->btree_cache;
- struct btree *b, *n;
-
- mutex_lock(&bc->lock);
- c->btree_cache.pinned_nodes_mask[0] = 0;
- c->btree_cache.pinned_nodes_mask[1] = 0;
-
- list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
- clear_btree_node_pinned(b);
- list_move(&b->list, &bc->live[0].list);
- bc->live[0].nr++;
- bc->live[1].nr--;
- }
-
- mutex_unlock(&bc->lock);
-}
-
-/* Btree in memory cache - hash table */
-
-void __bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
-{
- lockdep_assert_held(&bc->lock);
-
- int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
- BUG_ON(ret);
-
- /* Cause future lookups for this node to fail: */
- b->hash_val = 0;
-
- if (b->c.btree_id < BTREE_ID_NR)
- --bc->nr_by_btree[b->c.btree_id];
- --bc->live[btree_node_pinned(b)].nr;
- list_del_init(&b->list);
-}
-
-void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
-{
- __bch2_btree_node_hash_remove(bc, b);
- __bch2_btree_node_to_freelist(bc, b);
-}
-
-int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
-{
- BUG_ON(!list_empty(&b->list));
- BUG_ON(b->hash_val);
-
- b->hash_val = btree_ptr_hash_val(&b->key);
- int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
- bch_btree_cache_params);
- if (ret)
- return ret;
-
- if (b->c.btree_id < BTREE_ID_NR)
- bc->nr_by_btree[b->c.btree_id]++;
-
- bool p = __btree_node_pinned(bc, b);
- mod_bit(BTREE_NODE_pinned, &b->flags, p);
-
- list_add_tail(&b->list, &bc->live[p].list);
- bc->live[p].nr++;
- return 0;
-}
-
-int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
- unsigned level, enum btree_id id)
-{
- b->c.level = level;
- b->c.btree_id = id;
-
- mutex_lock(&bc->lock);
- int ret = __bch2_btree_node_hash_insert(bc, b);
- mutex_unlock(&bc->lock);
-
- return ret;
-}
-
-void bch2_btree_node_update_key_early(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_i *new)
-{
- struct bch_fs *c = trans->c;
- struct btree *b;
- struct bkey_buf tmp;
- int ret;
-
- bch2_bkey_buf_init(&tmp);
- bch2_bkey_buf_reassemble(&tmp, c, old);
-
- b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
- if (!IS_ERR_OR_NULL(b)) {
- mutex_lock(&c->btree_cache.lock);
-
- __bch2_btree_node_hash_remove(&c->btree_cache, b);
-
- bkey_copy(&b->key, new);
- ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
- BUG_ON(ret);
-
- mutex_unlock(&c->btree_cache.lock);
- six_unlock_read(&b->c.lock);
- }
-
- bch2_bkey_buf_exit(&tmp, c);
-}
-
-__flatten
-static inline struct btree *btree_cache_find(struct btree_cache *bc,
- const struct bkey_i *k)
-{
- u64 v = btree_ptr_hash_val(k);
-
- return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
-}
-
-static int __btree_node_reclaim_checks(struct bch_fs *c, struct btree *b,
- bool flush, bool locked)
-{
- struct btree_cache *bc = &c->btree_cache;
-
- lockdep_assert_held(&bc->lock);
-
- if (btree_node_noevict(b)) {
- bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_noevict]++;
- return bch_err_throw(c, ENOMEM_btree_node_reclaim);
- }
- if (btree_node_write_blocked(b)) {
- bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_blocked]++;
- return bch_err_throw(c, ENOMEM_btree_node_reclaim);
- }
- if (btree_node_will_make_reachable(b)) {
- bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_will_make_reachable]++;
- return bch_err_throw(c, ENOMEM_btree_node_reclaim);
- }
-
- if (btree_node_dirty(b)) {
- if (!flush) {
- bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_dirty]++;
- return bch_err_throw(c, ENOMEM_btree_node_reclaim);
- }
-
- if (locked) {
- /*
- * Using the underscore version because we don't want to compact
- * bsets after the write, since this node is about to be evicted
- * - unless btree verify mode is enabled, since it runs out of
- * the post write cleanup:
- */
- if (static_branch_unlikely(&bch2_verify_btree_ondisk))
- bch2_btree_node_write(c, b, SIX_LOCK_intent,
- BTREE_WRITE_cache_reclaim);
- else
- __bch2_btree_node_write(c, b,
- BTREE_WRITE_cache_reclaim);
- }
- }
-
- if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
- (1U << BTREE_NODE_write_in_flight))) {
- if (!flush) {
- if (btree_node_read_in_flight(b))
- bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_read_in_flight]++;
- else if (btree_node_write_in_flight(b))
- bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_in_flight]++;
- return bch_err_throw(c, ENOMEM_btree_node_reclaim);
- }
-
- if (locked)
- return -EINTR;
-
- /* XXX: waiting on IO with btree cache lock held */
- bch2_btree_node_wait_on_read(b);
- bch2_btree_node_wait_on_write(b);
- }
-
- return 0;
-}
-
-/*
- * this version is for btree nodes that have already been freed (we're not
- * reaping a real btree node)
- */
-static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
-{
- struct btree_cache *bc = &c->btree_cache;
- int ret = 0;
-
- lockdep_assert_held(&bc->lock);
-retry_unlocked:
- ret = __btree_node_reclaim_checks(c, b, flush, false);
- if (ret)
- return ret;
-
- if (!six_trylock_intent(&b->c.lock)) {
- bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_intent]++;
- return bch_err_throw(c, ENOMEM_btree_node_reclaim);
- }
-
- if (!six_trylock_write(&b->c.lock)) {
- bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_write]++;
- six_unlock_intent(&b->c.lock);
- return bch_err_throw(c, ENOMEM_btree_node_reclaim);
- }
-
- /* recheck under lock */
- ret = __btree_node_reclaim_checks(c, b, flush, true);
- if (ret) {
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
- if (ret == -EINTR)
- goto retry_unlocked;
- return ret;
- }
-
- if (b->hash_val && !ret)
- trace_and_count(c, btree_cache_reap, c, b);
- return 0;
-}
-
-static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
-{
- return __btree_node_reclaim(c, b, false);
-}
-
-static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
-{
- return __btree_node_reclaim(c, b, true);
-}
-
-static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
- struct shrink_control *sc)
-{
- struct btree_cache_list *list = shrink->private_data;
- struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
- struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
- struct btree *b, *t;
- unsigned long nr = sc->nr_to_scan;
- unsigned long can_free = 0;
- unsigned long freed = 0;
- unsigned long touched = 0;
- unsigned i, flags;
- unsigned long ret = SHRINK_STOP;
- bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;
-
- if (static_branch_unlikely(&bch2_btree_shrinker_disabled))
- return SHRINK_STOP;
-
- mutex_lock(&bc->lock);
- flags = memalloc_nofs_save();
-
- /*
- * It's _really_ critical that we don't free too many btree nodes - we
- * have to always leave ourselves a reserve. The reserve is how we
- * guarantee that allocating memory for a new btree node can always
- * succeed, so that inserting keys into the btree can always succeed and
- * IO can always make forward progress:
- */
- can_free = btree_cache_can_free(list);
- if (nr > can_free) {
- bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_cache_reserve] += nr - can_free;
- nr = can_free;
- }
-
- i = 0;
- list_for_each_entry_safe(b, t, &bc->freeable, list) {
- /*
- * Leave a few nodes on the freeable list, so that a btree split
- * won't have to hit the system allocator:
- */
- if (++i <= 3)
- continue;
-
- touched++;
-
- if (touched >= nr)
- goto out;
-
- if (!btree_node_reclaim(c, b)) {
- btree_node_data_free(bc, b);
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
- freed++;
- bc->nr_freed++;
- }
- }
-restart:
- list_for_each_entry_safe(b, t, &list->list, list) {
- touched++;
-
- if (btree_node_accessed(b)) {
- clear_btree_node_accessed(b);
- bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++;
-			--touched;
- } else if (!btree_node_reclaim(c, b)) {
- __bch2_btree_node_hash_remove(bc, b);
- __btree_node_data_free(b);
- btree_node_to_freedlist(bc, b);
-
- freed++;
- bc->nr_freed++;
-
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
-
- if (freed == nr)
- goto out_rotate;
- } else if (trigger_writes &&
- btree_node_dirty(b) &&
- !btree_node_will_make_reachable(b) &&
- !btree_node_write_blocked(b) &&
- six_trylock_read(&b->c.lock)) {
- list_move(&list->list, &b->list);
- mutex_unlock(&bc->lock);
- __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
- six_unlock_read(&b->c.lock);
- if (touched >= nr)
- goto out_nounlock;
- mutex_lock(&bc->lock);
- goto restart;
- }
-
- if (touched >= nr)
- break;
- }
-out_rotate:
- if (&t->list != &list->list)
- list_move_tail(&list->list, &t->list);
-out:
- mutex_unlock(&bc->lock);
-out_nounlock:
- ret = freed;
- memalloc_nofs_restore(flags);
- trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
- return ret;
-}
-
-static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
- struct shrink_control *sc)
-{
- struct btree_cache_list *list = shrink->private_data;
-
- if (static_branch_unlikely(&bch2_btree_shrinker_disabled))
- return 0;
-
- return btree_cache_can_free(list);
-}
-
-void bch2_fs_btree_cache_exit(struct bch_fs *c)
-{
- struct btree_cache *bc = &c->btree_cache;
- struct btree *b, *t;
- unsigned long flags;
-
- shrinker_free(bc->live[1].shrink);
- shrinker_free(bc->live[0].shrink);
-
- /* vfree() can allocate memory: */
- flags = memalloc_nofs_save();
- mutex_lock(&bc->lock);
-
- if (c->verify_data)
- list_move(&c->verify_data->list, &bc->live[0].list);
-
- kvfree(c->verify_ondisk);
-
- for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
- struct btree_root *r = bch2_btree_id_root(c, i);
-
- if (r->b)
- list_add(&r->b->list, &bc->live[0].list);
- }
-
- list_for_each_entry_safe(b, t, &bc->live[1].list, list)
- bch2_btree_node_hash_remove(bc, b);
- list_for_each_entry_safe(b, t, &bc->live[0].list, list)
- bch2_btree_node_hash_remove(bc, b);
-
- list_for_each_entry_safe(b, t, &bc->freeable, list) {
- BUG_ON(btree_node_read_in_flight(b) ||
- btree_node_write_in_flight(b));
-
- btree_node_data_free(bc, b);
- cond_resched();
- }
-
- BUG_ON(!bch2_journal_error(&c->journal) &&
- atomic_long_read(&c->btree_cache.nr_dirty));
-
- list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
-
- list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) {
- list_del(&b->list);
- six_lock_exit(&b->c.lock);
- kfree(b);
- }
-
- mutex_unlock(&bc->lock);
- memalloc_nofs_restore(flags);
-
- for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
- BUG_ON(bc->nr_by_btree[i]);
- BUG_ON(bc->live[0].nr);
- BUG_ON(bc->live[1].nr);
- BUG_ON(bc->nr_freeable);
-
- if (bc->table_init_done)
- rhashtable_destroy(&bc->table);
-}
-
-int bch2_fs_btree_cache_init(struct bch_fs *c)
-{
- struct btree_cache *bc = &c->btree_cache;
- struct shrinker *shrink;
- unsigned i;
- int ret = 0;
-
- ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
- if (ret)
- goto err;
-
- bc->table_init_done = true;
-
- bch2_recalc_btree_reserve(c);
-
- for (i = 0; i < bc->nr_reserve; i++) {
- struct btree *b = __bch2_btree_node_mem_alloc(c);
- if (!b)
- goto err;
- __bch2_btree_node_to_freelist(bc, b);
- }
-
- list_splice_init(&bc->live[0].list, &bc->freeable);
-
- mutex_init(&c->verify_lock);
-
- shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
- if (!shrink)
- goto err;
- bc->live[0].shrink = shrink;
- shrink->count_objects = bch2_btree_cache_count;
- shrink->scan_objects = bch2_btree_cache_scan;
- shrink->seeks = 2;
- shrink->private_data = &bc->live[0];
- shrinker_register(shrink);
-
- shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name);
- if (!shrink)
- goto err;
- bc->live[1].shrink = shrink;
- shrink->count_objects = bch2_btree_cache_count;
- shrink->scan_objects = bch2_btree_cache_scan;
- shrink->seeks = 8;
- shrink->private_data = &bc->live[1];
- shrinker_register(shrink);
-
- return 0;
-err:
- return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
-}
-
-void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
-{
- mutex_init(&bc->lock);
- for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
- bc->live[i].idx = i;
- INIT_LIST_HEAD(&bc->live[i].list);
- }
- INIT_LIST_HEAD(&bc->freeable);
- INIT_LIST_HEAD(&bc->freed_pcpu);
- INIT_LIST_HEAD(&bc->freed_nonpcpu);
-}
-
-/*
- * We can only have one thread cannibalizing other cached btree nodes at a time,
- * or we'll deadlock. We use an open coded mutex to ensure that, taken via
- * bch2_btree_cache_cannibalize_lock(). This means every time we unlock the
- * root of the btree, we need to release this lock if we have it held.
- */
-void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- struct btree_cache *bc = &c->btree_cache;
-
- if (bc->alloc_lock == current) {
- trace_and_count(c, btree_cache_cannibalize_unlock, trans);
- bc->alloc_lock = NULL;
- closure_wake_up(&bc->alloc_wait);
- }
-}
-
-int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
-{
- struct bch_fs *c = trans->c;
- struct btree_cache *bc = &c->btree_cache;
- struct task_struct *old;
-
- old = NULL;
- if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current)
- goto success;
-
- if (!cl) {
- trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
- return bch_err_throw(c, ENOMEM_btree_cache_cannibalize_lock);
- }
-
- closure_wait(&bc->alloc_wait, cl);
-
- /* Try again, after adding ourselves to waitlist */
- old = NULL;
- if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) {
- /* We raced */
- closure_wake_up(&bc->alloc_wait);
- goto success;
- }
-
- trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
- return bch_err_throw(c, btree_cache_cannibalize_lock_blocked);
-
-success:
- trace_and_count(c, btree_cache_cannibalize_lock, trans);
- return 0;
-}
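-
-/*
- * A rough caller-side sketch (error handling and transaction unlocking
- * elided): take the cannibalize lock before an allocation that must succeed,
- * waiting on the closure if another task holds it, then release it once the
- * allocation is done:
- *
- *	struct closure cl;
- *	closure_init_stack(&cl);
- *
- *	do {
- *		ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
- *		closure_sync(&cl);
- *	} while (bch2_err_matches(ret, BCH_ERR_btree_cache_cannibalize_lock_blocked));
- *
- *	b = bch2_btree_node_mem_alloc(trans, false);
- *	...
- *	bch2_btree_cache_cannibalize_unlock(trans);
- */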
-
-static struct btree *btree_node_cannibalize(struct bch_fs *c)
-{
- struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
-
- for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
- list_for_each_entry_reverse(b, &bc->live[i].list, list)
- if (!btree_node_reclaim(c, b))
- return b;
-
- while (1) {
- for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
- list_for_each_entry_reverse(b, &bc->live[i].list, list)
- if (!btree_node_write_and_reclaim(c, b))
- return b;
-
- /*
- * Rare case: all nodes were intent-locked.
- * Just busy-wait.
- */
- WARN_ONCE(1, "btree cache cannibalize failed\n");
- cond_resched();
- }
-}
-
-struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
-{
- struct bch_fs *c = trans->c;
- struct btree_cache *bc = &c->btree_cache;
- struct list_head *freed = pcpu_read_locks
- ? &bc->freed_pcpu
- : &bc->freed_nonpcpu;
- struct btree *b, *b2;
- u64 start_time = local_clock();
-
- mutex_lock(&bc->lock);
-
- /*
- * We never free struct btree itself, just the memory that holds the on
- * disk node. Check the freed list before allocating a new one:
- */
- list_for_each_entry(b, freed, list)
- if (!btree_node_reclaim(c, b)) {
- list_del_init(&b->list);
- goto got_node;
- }
-
- b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
- if (b) {
- bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT);
- } else {
- mutex_unlock(&bc->lock);
- bch2_trans_unlock(trans);
- b = __btree_node_mem_alloc(c, GFP_KERNEL);
- if (!b)
- goto err;
- bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL);
- mutex_lock(&bc->lock);
- }
-
- BUG_ON(!six_trylock_intent(&b->c.lock));
- BUG_ON(!six_trylock_write(&b->c.lock));
-
-got_node:
- /*
- * btree_free() doesn't free memory; it sticks the node on the end of
- * the list. Check if there's any freed nodes there:
- */
- list_for_each_entry(b2, &bc->freeable, list)
- if (!btree_node_reclaim(c, b2)) {
- swap(b->data, b2->data);
- swap(b->aux_data, b2->aux_data);
-
- list_del_init(&b2->list);
- --bc->nr_freeable;
- btree_node_to_freedlist(bc, b2);
- mutex_unlock(&bc->lock);
-
- six_unlock_write(&b2->c.lock);
- six_unlock_intent(&b2->c.lock);
- goto got_mem;
- }
-
- mutex_unlock(&bc->lock);
-
- if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
- bch2_trans_unlock(trans);
- if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
- goto err;
- }
-
-got_mem:
- BUG_ON(!list_empty(&b->list));
- BUG_ON(btree_node_hashed(b));
- BUG_ON(btree_node_dirty(b));
- BUG_ON(btree_node_write_in_flight(b));
-out:
- b->flags = 0;
- b->written = 0;
- b->nsets = 0;
- b->sib_u64s[0] = 0;
- b->sib_u64s[1] = 0;
- b->whiteout_u64s = 0;
- bch2_btree_keys_init(b);
-
- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
- start_time);
-
- int ret = bch2_trans_relock(trans);
- if (unlikely(ret)) {
- bch2_btree_node_to_freelist(c, b);
- return ERR_PTR(ret);
- }
-
- return b;
-err:
- mutex_lock(&bc->lock);
-
- /* Try to cannibalize another cached btree node: */
- if (bc->alloc_lock == current) {
- b2 = btree_node_cannibalize(c);
- clear_btree_node_just_written(b2);
- __bch2_btree_node_hash_remove(bc, b2);
-
- if (b) {
- swap(b->data, b2->data);
- swap(b->aux_data, b2->aux_data);
- btree_node_to_freedlist(bc, b2);
- six_unlock_write(&b2->c.lock);
- six_unlock_intent(&b2->c.lock);
- } else {
- b = b2;
- }
-
- BUG_ON(!list_empty(&b->list));
- mutex_unlock(&bc->lock);
-
- trace_and_count(c, btree_cache_cannibalize, trans);
- goto out;
- }
-
- mutex_unlock(&bc->lock);
- return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
-}
-
-/* Slowpath, don't want it inlined into btree_iter_traverse() */
-static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
- struct btree_path *path,
- const struct bkey_i *k,
- enum btree_id btree_id,
- unsigned level,
- enum six_lock_type lock_type,
- bool sync)
-{
- struct bch_fs *c = trans->c;
- struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
-
- if (unlikely(level >= BTREE_MAX_DEPTH)) {
- int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
- level, BTREE_MAX_DEPTH);
- return ERR_PTR(ret);
- }
-
- if (unlikely(!bkey_is_btree_ptr(&k->k))) {
- struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
-
- int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf);
- printbuf_exit(&buf);
- return ERR_PTR(ret);
- }
-
- if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) {
- struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
-
- int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
- printbuf_exit(&buf);
- return ERR_PTR(ret);
- }
-
- /*
- * Parent node must be locked, else we could read in a btree node that's
- * been freed:
- */
- if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
- trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
- return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
- }
-
- b = bch2_btree_node_mem_alloc(trans, level != 0);
-
- if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
- if (!path)
- return b;
-
- trans->memory_allocation_failure = true;
- trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
- return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
- }
-
- if (IS_ERR(b))
- return b;
-
- bkey_copy(&b->key, k);
- if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
- /* raced with another fill: */
-
- /* mark as unhashed... */
- b->hash_val = 0;
-
- mutex_lock(&bc->lock);
- __bch2_btree_node_to_freelist(bc, b);
- mutex_unlock(&bc->lock);
-
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
- return NULL;
- }
-
- set_btree_node_read_in_flight(b);
- six_unlock_write(&b->c.lock);
-
- if (path) {
- u32 seq = six_lock_seq(&b->c.lock);
-
- /* Unlock before doing IO: */
- six_unlock_intent(&b->c.lock);
- bch2_trans_unlock(trans);
-
- bch2_btree_node_read(trans, b, sync);
-
- int ret = bch2_trans_relock(trans);
- if (ret)
- return ERR_PTR(ret);
-
- if (!sync)
- return NULL;
-
- if (!six_relock_type(&b->c.lock, lock_type, seq))
- b = NULL;
- } else {
- bch2_btree_node_read(trans, b, sync);
- if (lock_type == SIX_LOCK_read)
- six_lock_downgrade(&b->c.lock);
- }
-
- return b;
-}
-
-static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
-{
- struct printbuf buf = PRINTBUF;
-
- if (c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations)
- return;
-
- prt_printf(&buf,
- "btree node header doesn't match ptr: ");
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_str(&buf, "\nptr: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-
- prt_str(&buf, "\nheader: ");
- bch2_btree_id_level_to_text(&buf, BTREE_NODE_ID(b->data), BTREE_NODE_LEVEL(b->data));
- prt_str(&buf, "\nmin ");
- bch2_bpos_to_text(&buf, b->data->min_key);
-
- prt_printf(&buf, "\nmax ");
- bch2_bpos_to_text(&buf, b->data->max_key);
-
- bch2_fs_topology_error(c, "%s", buf.buf);
-
- printbuf_exit(&buf);
-}
-
-static inline void btree_check_header(struct bch_fs *c, struct btree *b)
-{
- if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
- b->c.level != BTREE_NODE_LEVEL(b->data) ||
- !bpos_eq(b->data->max_key, b->key.k.p) ||
- (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
- !bpos_eq(b->data->min_key,
- bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
- btree_bad_header(c, b);
-}
-
-static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
- const struct bkey_i *k, unsigned level,
- enum six_lock_type lock_type,
- unsigned long trace_ip)
-{
- struct bch_fs *c = trans->c;
- struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
- bool need_relock = false;
- int ret;
-
- EBUG_ON(level >= BTREE_MAX_DEPTH);
-retry:
- b = btree_cache_find(bc, k);
- if (unlikely(!b)) {
- /*
- * We must have the parent locked to call bch2_btree_node_fill(),
- * else we could read in a btree node from disk that's been
- * freed:
- */
- b = bch2_btree_node_fill(trans, path, k, path->btree_id,
- level, lock_type, true);
- need_relock = true;
-
- /* We raced and found the btree node in the cache */
- if (!b)
- goto retry;
-
- if (IS_ERR(b))
- return b;
- } else {
- if (btree_node_read_locked(path, level + 1))
- btree_node_unlock(trans, path, level + 1);
-
- ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- return ERR_PTR(ret);
-
- BUG_ON(ret);
-
- if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
- b->c.level != level ||
- race_fault())) {
- six_unlock_type(&b->c.lock, lock_type);
- if (bch2_btree_node_relock(trans, path, level + 1))
- goto retry;
-
- trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
- return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
- }
-
- /* avoid atomic set bit if it's not needed: */
- if (!btree_node_accessed(b))
- set_btree_node_accessed(b);
- }
-
- if (unlikely(btree_node_read_in_flight(b))) {
- u32 seq = six_lock_seq(&b->c.lock);
-
- six_unlock_type(&b->c.lock, lock_type);
- bch2_trans_unlock(trans);
- need_relock = true;
-
- bch2_btree_node_wait_on_read(b);
-
- ret = bch2_trans_relock(trans);
- if (ret)
- return ERR_PTR(ret);
-
- /*
- * should_be_locked is not set on this path yet, so we need to
- * relock it specifically:
- */
- if (!six_relock_type(&b->c.lock, lock_type, seq))
- goto retry;
- }
-
- if (unlikely(need_relock)) {
- ret = bch2_trans_relock(trans) ?:
- bch2_btree_path_relock_intent(trans, path);
- if (ret) {
- six_unlock_type(&b->c.lock, lock_type);
- return ERR_PTR(ret);
- }
- }
-
- prefetch(b->aux_data);
-
- for_each_bset(b, t) {
- void *p = (u64 *) b->aux_data + t->aux_data_offset;
-
- prefetch(p + L1_CACHE_BYTES * 0);
- prefetch(p + L1_CACHE_BYTES * 1);
- prefetch(p + L1_CACHE_BYTES * 2);
- }
-
- if (unlikely(btree_node_read_error(b))) {
- six_unlock_type(&b->c.lock, lock_type);
- return ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
- }
-
- EBUG_ON(b->c.btree_id != path->btree_id);
- EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
- btree_check_header(c, b);
-
- return b;
-}
-
-/**
- * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
- * in from disk if necessary.
- *
- * @trans: btree transaction object
- * @path: btree_path being traversed
- * @k: pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
- * @level: level of btree node being looked up (0 == leaf node)
- * @lock_type: SIX_LOCK_read or SIX_LOCK_intent
- * @trace_ip: ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
- *
- * The btree node will have either a read or an intent lock held, depending on
- * @lock_type.
- *
- * Returns: btree node or ERR_PTR()
- */
-struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
- const struct bkey_i *k, unsigned level,
- enum six_lock_type lock_type,
- unsigned long trace_ip)
-{
- struct bch_fs *c = trans->c;
- struct btree *b;
- int ret;
-
- EBUG_ON(level >= BTREE_MAX_DEPTH);
-
- b = btree_node_mem_ptr(k);
-
- /*
- * Check b->hash_val _before_ calling btree_node_lock() - this might not
- * be the node we want anymore, and trying to lock the wrong node could
-	 * cause an unnecessary transaction restart:
- */
- if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
- !b ||
- b->hash_val != btree_ptr_hash_val(k)))
- return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
-
- if (btree_node_read_locked(path, level + 1))
- btree_node_unlock(trans, path, level + 1);
-
- ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- return ERR_PTR(ret);
-
- BUG_ON(ret);
-
- if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
- b->c.level != level ||
- race_fault())) {
- six_unlock_type(&b->c.lock, lock_type);
- if (bch2_btree_node_relock(trans, path, level + 1))
- return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
-
- trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
- return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
- }
-
- if (unlikely(btree_node_read_in_flight(b))) {
- six_unlock_type(&b->c.lock, lock_type);
- return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
- }
-
- prefetch(b->aux_data);
-
- for_each_bset(b, t) {
- void *p = (u64 *) b->aux_data + t->aux_data_offset;
-
- prefetch(p + L1_CACHE_BYTES * 0);
- prefetch(p + L1_CACHE_BYTES * 1);
- prefetch(p + L1_CACHE_BYTES * 2);
- }
-
- /* avoid atomic set bit if it's not needed: */
- if (!btree_node_accessed(b))
- set_btree_node_accessed(b);
-
- if (unlikely(btree_node_read_error(b))) {
- six_unlock_type(&b->c.lock, lock_type);
- return ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
- }
-
- EBUG_ON(b->c.btree_id != path->btree_id);
- EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
- btree_check_header(c, b);
-
- return b;
-}
-
-struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
- const struct bkey_i *k,
- enum btree_id btree_id,
- unsigned level,
- bool nofill)
-{
- struct bch_fs *c = trans->c;
- struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
- int ret;
-
- EBUG_ON(level >= BTREE_MAX_DEPTH);
-
- if (c->opts.btree_node_mem_ptr_optimization) {
- b = btree_node_mem_ptr(k);
- if (b)
- goto lock_node;
- }
-retry:
- b = btree_cache_find(bc, k);
- if (unlikely(!b)) {
- if (nofill)
- goto out;
-
- b = bch2_btree_node_fill(trans, NULL, k, btree_id,
- level, SIX_LOCK_read, true);
-
- /* We raced and found the btree node in the cache */
- if (!b)
- goto retry;
-
- if (IS_ERR(b) &&
- !bch2_btree_cache_cannibalize_lock(trans, NULL))
- goto retry;
-
- if (IS_ERR(b))
- goto out;
- } else {
-lock_node:
- ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- return ERR_PTR(ret);
-
- BUG_ON(ret);
-
- if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
- b->c.btree_id != btree_id ||
- b->c.level != level)) {
- six_unlock_read(&b->c.lock);
- goto retry;
- }
-
- /* avoid atomic set bit if it's not needed: */
- if (!btree_node_accessed(b))
- set_btree_node_accessed(b);
- }
-
- /* XXX: waiting on IO with btree locks held: */
- __bch2_btree_node_wait_on_read(b);
-
- prefetch(b->aux_data);
-
- for_each_bset(b, t) {
- void *p = (u64 *) b->aux_data + t->aux_data_offset;
-
- prefetch(p + L1_CACHE_BYTES * 0);
- prefetch(p + L1_CACHE_BYTES * 1);
- prefetch(p + L1_CACHE_BYTES * 2);
- }
-
- if (unlikely(btree_node_read_error(b))) {
- six_unlock_read(&b->c.lock);
- b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
- goto out;
- }
-
- EBUG_ON(b->c.btree_id != btree_id);
- EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
- btree_check_header(c, b);
-out:
- bch2_btree_cache_cannibalize_unlock(trans);
- return b;
-}
-
-int bch2_btree_node_prefetch(struct btree_trans *trans,
- struct btree_path *path,
- const struct bkey_i *k,
- enum btree_id btree_id, unsigned level)
-{
- struct bch_fs *c = trans->c;
- struct btree_cache *bc = &c->btree_cache;
-
- BUG_ON(path && !btree_node_locked(path, level + 1));
- BUG_ON(level >= BTREE_MAX_DEPTH);
-
- struct btree *b = btree_cache_find(bc, k);
- if (b)
- return 0;
-
- b = bch2_btree_node_fill(trans, path, k, btree_id,
- level, SIX_LOCK_read, false);
- int ret = PTR_ERR_OR_ZERO(b);
- if (ret)
- return ret;
- if (b)
- six_unlock_read(&b->c.lock);
- return 0;
-}
-
-void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
-{
- struct bch_fs *c = trans->c;
- struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
-
- b = btree_cache_find(bc, k);
- if (!b)
- return;
-
- BUG_ON(b == btree_node_root(trans->c, b));
-wait_on_io:
- /* not allowed to wait on io with btree locks held: */
-
- /* XXX we're called from btree_gc which will be holding other btree
- * nodes locked
- */
- __bch2_btree_node_wait_on_read(b);
- __bch2_btree_node_wait_on_write(b);
-
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
- if (unlikely(b->hash_val != btree_ptr_hash_val(k)))
- goto out;
-
- if (btree_node_dirty(b)) {
- __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
- goto wait_on_io;
- }
-
- BUG_ON(btree_node_dirty(b));
-
- mutex_lock(&bc->lock);
- bch2_btree_node_hash_remove(bc, b);
- btree_node_data_free(bc, b);
- mutex_unlock(&bc->lock);
-out:
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
-}
-
-const char *bch2_btree_id_str(enum btree_id btree)
-{
- return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
-}
-
-void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree)
-{
- if (btree < BTREE_ID_NR)
- prt_str(out, __bch2_btree_ids[btree]);
- else
- prt_printf(out, "(unknown btree %u)", btree);
-}
-
-void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id btree, unsigned level)
-{
- prt_str(out, "btree=");
- bch2_btree_id_to_text(out, btree);
- prt_printf(out, " level=%u", level);
-}
-
-void __bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
- enum btree_id btree, unsigned level, struct bkey_s_c k)
-{
- bch2_btree_id_to_text(out, btree);
- prt_printf(out, " level %u/", level);
- struct btree_root *r = bch2_btree_id_root(c, btree);
- if (r)
- prt_printf(out, "%u", r->level);
- else
- prt_printf(out, "(unknown)");
- prt_newline(out);
-
- bch2_bkey_val_to_text(out, c, k);
-}
-
-void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
-{
- __bch2_btree_pos_to_text(out, c, b->c.btree_id, b->c.level, bkey_i_to_s_c(&b->key));
-}
-
-void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
-{
- struct bset_stats stats;
-
- memset(&stats, 0, sizeof(stats));
-
- bch2_btree_keys_stats(b, &stats);
-
- prt_printf(out, "l %u ", b->c.level);
- bch2_bpos_to_text(out, b->data->min_key);
- prt_printf(out, " - ");
- bch2_bpos_to_text(out, b->data->max_key);
- prt_printf(out, ":\n"
- " ptrs: ");
- bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
- prt_newline(out);
-
- prt_printf(out,
- " format: ");
- bch2_bkey_format_to_text(out, &b->format);
-
- prt_printf(out,
- " unpack fn len: %u\n"
- " bytes used %zu/%zu (%zu%% full)\n"
- " sib u64s: %u, %u (merge threshold %u)\n"
- " nr packed keys %u\n"
- " nr unpacked keys %u\n"
- " floats %zu\n"
- " failed unpacked %zu\n",
- b->unpack_fn_len,
- b->nr.live_u64s * sizeof(u64),
- btree_buf_bytes(b) - sizeof(struct btree_node),
- b->nr.live_u64s * 100 / btree_max_u64s(c),
- b->sib_u64s[0],
- b->sib_u64s[1],
- c->btree_foreground_merge_threshold,
- b->nr.packed_keys,
- b->nr.unpacked_keys,
- stats.floats,
- stats.failed);
-}
-
-static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c,
- const char *label, size_t nr)
-{
- prt_printf(out, "%s\t", label);
- prt_human_readable_u64(out, nr * c->opts.btree_node_size);
- prt_printf(out, " (%zu)\n", nr);
-}
-
-static const char * const bch2_btree_cache_not_freed_reasons_strs[] = {
-#define x(n) #n,
- BCH_BTREE_CACHE_NOT_FREED_REASONS()
-#undef x
- NULL
-};
-
-void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
-{
- struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
-
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 32);
-
- prt_btree_cache_line(out, c, "live:", bc->live[0].nr);
- prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr);
- prt_btree_cache_line(out, c, "reserve:", bc->nr_reserve);
- prt_btree_cache_line(out, c, "freed:", bc->nr_freeable);
- prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty));
- prt_printf(out, "cannibalize lock:\t%s\n", bc->alloc_lock ? "held" : "not held");
- prt_newline(out);
-
- for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) {
- bch2_btree_id_to_text(out, i);
- prt_printf(out, "\t");
- prt_human_readable_u64(out, bc->nr_by_btree[i] * c->opts.btree_node_size);
- prt_printf(out, " (%zu)\n", bc->nr_by_btree[i]);
- }
-
- prt_newline(out);
- prt_printf(out, "counters since mount:\n");
- prt_printf(out, "freed:\t%zu\n", bc->nr_freed);
- prt_printf(out, "not freed:\n");
-
- for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++)
- prt_printf(out, " %s\t%llu\n",
- bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]);
-}
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
deleted file mode 100644
index be275f87a60e..000000000000
--- a/fs/bcachefs/btree_cache.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_CACHE_H
-#define _BCACHEFS_BTREE_CACHE_H
-
-#include "bcachefs.h"
-#include "btree_types.h"
-#include "bkey_methods.h"
-
-extern const char * const bch2_btree_node_flags[];
-
-struct btree_iter;
-
-void bch2_recalc_btree_reserve(struct bch_fs *);
-
-void bch2_btree_node_to_freelist(struct bch_fs *, struct btree *);
-
-void __bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
-void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
-
-int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
-int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
- unsigned, enum btree_id);
-
-void bch2_node_pin(struct bch_fs *, struct btree *);
-void bch2_btree_cache_unpin(struct bch_fs *);
-
-void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_i *);
-
-void bch2_btree_cache_cannibalize_unlock(struct btree_trans *);
-int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *);
-
-void __btree_node_data_free(struct btree *);
-struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
-struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool);
-
-struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *,
- const struct bkey_i *, unsigned,
- enum six_lock_type, unsigned long);
-
-struct btree *bch2_btree_node_get_noiter(struct btree_trans *, const struct bkey_i *,
- enum btree_id, unsigned, bool);
-
-int bch2_btree_node_prefetch(struct btree_trans *, struct btree_path *,
- const struct bkey_i *, enum btree_id, unsigned);
-
-void bch2_btree_node_evict(struct btree_trans *, const struct bkey_i *);
-
-void bch2_fs_btree_cache_exit(struct bch_fs *);
-int bch2_fs_btree_cache_init(struct bch_fs *);
-void bch2_fs_btree_cache_init_early(struct btree_cache *);
-
-static inline u64 btree_ptr_hash_val(const struct bkey_i *k)
-{
- switch (k->k.type) {
- case KEY_TYPE_btree_ptr:
- return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start);
- case KEY_TYPE_btree_ptr_v2:
- /*
- * The cast/deref is only necessary to avoid sparse endianness
- * warnings:
- */
- return *((u64 *) &bkey_i_to_btree_ptr_v2_c(k)->v.seq);
- default:
- return 0;
- }
-}
-
-static inline struct btree *btree_node_mem_ptr(const struct bkey_i *k)
-{
- return k->k.type == KEY_TYPE_btree_ptr_v2
- ? (void *)(unsigned long)bkey_i_to_btree_ptr_v2_c(k)->v.mem_ptr
- : NULL;
-}
-
-/* is btree node in hash table? */
-static inline bool btree_node_hashed(struct btree *b)
-{
- return b->hash_val != 0;
-}
-
-#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \
- for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl, \
- &(_c)->btree_cache.table), \
- _iter = 0; _iter < (_tbl)->size; _iter++) \
- rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash)
-
-static inline size_t btree_buf_bytes(const struct btree *b)
-{
- return 1UL << b->byte_order;
-}
-
-static inline size_t btree_buf_max_u64s(const struct btree *b)
-{
- return (btree_buf_bytes(b) - sizeof(struct btree_node)) / sizeof(u64);
-}
-
-static inline size_t btree_max_u64s(const struct bch_fs *c)
-{
- return (c->opts.btree_node_size - sizeof(struct btree_node)) / sizeof(u64);
-}
-
-static inline size_t btree_sectors(const struct bch_fs *c)
-{
- return c->opts.btree_node_size >> SECTOR_SHIFT;
-}
-
-static inline unsigned btree_blocks(const struct bch_fs *c)
-{
- return btree_sectors(c) >> c->block_bits;
-}
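-
-/*
- * For example: a 256 KiB btree node gives btree_buf_bytes() == 262144 and
- * btree_sectors() == 512; assuming 4 KiB filesystem blocks (block_bits == 3),
- * btree_blocks() == 64.
- */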
-
-#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3)
-
-#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3)
-#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \
- (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \
- (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2))
-
-static inline unsigned btree_id_nr_alive(struct bch_fs *c)
-{
- return BTREE_ID_NR + c->btree_roots_extra.nr;
-}
-
-static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned id)
-{
- if (likely(id < BTREE_ID_NR)) {
- return &c->btree_roots_known[id];
- } else {
- unsigned idx = id - BTREE_ID_NR;
-
- /* This can happen when we're called from btree_node_scan */
- if (idx >= c->btree_roots_extra.nr)
- return NULL;
-
- return &c->btree_roots_extra.data[idx];
- }
-}
-
-static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b)
-{
- struct btree_root *r = bch2_btree_id_root(c, b->c.btree_id);
-
- return r ? r->b : NULL;
-}
-
-const char *bch2_btree_id_str(enum btree_id); /* avoid */
-void bch2_btree_id_to_text(struct printbuf *, enum btree_id);
-void bch2_btree_id_level_to_text(struct printbuf *, enum btree_id, unsigned);
-
-void __bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *,
- enum btree_id, unsigned, struct bkey_s_c);
-void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
-void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
-void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *);
-
-#endif /* _BCACHEFS_BTREE_CACHE_H */
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
deleted file mode 100644
index bac108e93823..000000000000
--- a/fs/bcachefs/btree_gc.c
+++ /dev/null
@@ -1,1308 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright (C) 2014 Datera Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "bkey_methods.h"
-#include "bkey_buf.h"
-#include "btree_journal_iter.h"
-#include "btree_key_cache.h"
-#include "btree_locking.h"
-#include "btree_node_scan.h"
-#include "btree_update_interior.h"
-#include "btree_io.h"
-#include "btree_gc.h"
-#include "buckets.h"
-#include "clock.h"
-#include "debug.h"
-#include "disk_accounting.h"
-#include "ec.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "extents.h"
-#include "journal.h"
-#include "keylist.h"
-#include "move.h"
-#include "progress.h"
-#include "recovery_passes.h"
-#include "reflink.h"
-#include "recovery.h"
-#include "replicas.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/slab.h>
-#include <linux/bitops.h>
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/preempt.h>
-#include <linux/rcupdate.h>
-#include <linux/sched/task.h>
-
-#define DROP_THIS_NODE 10
-#define DROP_PREV_NODE 11
-#define DID_FILL_FROM_SCAN 12
-
-/*
- * Returns false if it's a btree we can easily reconstruct, or that otherwise
- * won't cause data loss if it's missing:
- */
-static bool btree_id_important(enum btree_id btree)
-{
- if (btree_id_is_alloc(btree))
- return false;
-
- switch (btree) {
- case BTREE_ID_quotas:
- case BTREE_ID_snapshot_trees:
- case BTREE_ID_logged_ops:
- case BTREE_ID_rebalance_work:
- case BTREE_ID_subvolume_children:
- return false;
- default:
- return true;
- }
-}
-
-static const char * const bch2_gc_phase_strs[] = {
-#define x(n) #n,
- GC_PHASES()
-#undef x
- NULL
-};
-
-void bch2_gc_pos_to_text(struct printbuf *out, struct gc_pos *p)
-{
- prt_str(out, bch2_gc_phase_strs[p->phase]);
- prt_char(out, ' ');
- bch2_btree_id_level_to_text(out, p->btree, p->level);
- prt_char(out, ' ');
- bch2_bpos_to_text(out, p->pos);
-}
-
-static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
-{
- return (struct bkey_s) {{{
- (struct bkey *) k.k,
- (struct bch_val *) k.v
- }}};
-}
-
-static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
-{
- preempt_disable();
- write_seqcount_begin(&c->gc_pos_lock);
- c->gc_pos = new_pos;
- write_seqcount_end(&c->gc_pos_lock);
- preempt_enable();
-}
-
-static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
-{
- BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) < 0);
- __gc_pos_set(c, new_pos);
-}
-
-static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
-{
- switch (b->key.k.type) {
- case KEY_TYPE_btree_ptr: {
- struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key);
-
- dst->k.p = src->k.p;
- dst->v.mem_ptr = 0;
- dst->v.seq = b->data->keys.seq;
- dst->v.sectors_written = 0;
- dst->v.flags = 0;
- dst->v.min_key = b->data->min_key;
- set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k));
- memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k));
- break;
- }
- case KEY_TYPE_btree_ptr_v2:
- bkey_copy(&dst->k_i, &b->key);
- break;
- default:
- BUG();
- }
-}
-
-static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
-{
- struct bkey_i_btree_ptr_v2 *new;
- int ret;
-
- if (c->opts.verbose) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- prt_str(&buf, " -> ");
- bch2_bpos_to_text(&buf, new_min);
-
- bch_info(c, "%s(): %s", __func__, buf.buf);
- printbuf_exit(&buf);
- }
-
- new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
- if (!new)
- return bch_err_throw(c, ENOMEM_gc_repair_key);
-
- btree_ptr_to_v2(b, new);
- b->data->min_key = new_min;
- new->v.min_key = new_min;
- SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
-
- ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
- if (ret) {
- kfree(new);
- return ret;
- }
-
- bch2_btree_node_drop_keys_outside_node(b);
- bkey_copy(&b->key, &new->k_i);
- return 0;
-}
-
-static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
-{
- struct bkey_i_btree_ptr_v2 *new;
- int ret;
-
- if (c->opts.verbose) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- prt_str(&buf, " -> ");
- bch2_bpos_to_text(&buf, new_max);
-
- bch_info(c, "%s(): %s", __func__, buf.buf);
- printbuf_exit(&buf);
- }
-
- ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
- if (ret)
- return ret;
-
- new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
- if (!new)
- return bch_err_throw(c, ENOMEM_gc_repair_key);
-
- btree_ptr_to_v2(b, new);
- b->data->max_key = new_max;
- new->k.p = new_max;
- SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
-
- ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
- if (ret) {
- kfree(new);
- return ret;
- }
-
- bch2_btree_node_drop_keys_outside_node(b);
-
- mutex_lock(&c->btree_cache.lock);
- __bch2_btree_node_hash_remove(&c->btree_cache, b);
-
- bkey_copy(&b->key, &new->k_i);
- ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
- BUG_ON(ret);
- mutex_unlock(&c->btree_cache.lock);
- return 0;
-}
-
-static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *b,
- struct btree *prev, struct btree *cur,
- struct bpos *pulled_from_scan)
-{
- struct bch_fs *c = trans->c;
- struct bpos expected_start = !prev
- ? b->data->min_key
- : bpos_successor(prev->key.k.p);
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
- !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
- b->data->min_key));
-
- if (bpos_eq(expected_start, cur->data->min_key))
- return 0;
-
- prt_printf(&buf, " at ");
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_printf(&buf, ":\nparent: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-
- if (prev) {
- prt_printf(&buf, "\nprev: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key));
- }
-
- prt_str(&buf, "\nnext: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key));
-
- if (bpos_lt(expected_start, cur->data->min_key)) { /* gap */
- if (b->c.level == 1 &&
- bpos_lt(*pulled_from_scan, cur->data->min_key)) {
- ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
- expected_start,
- bpos_predecessor(cur->data->min_key));
- if (ret)
- goto err;
-
- *pulled_from_scan = cur->data->min_key;
- ret = DID_FILL_FROM_SCAN;
- } else {
- if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
- "btree node with incorrect min_key%s", buf.buf))
- ret = set_node_min(c, cur, expected_start);
- }
- } else { /* overlap */
- if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */
- if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */
- if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_next_node,
- "btree node overwritten by next node%s", buf.buf))
- ret = DROP_PREV_NODE;
- } else {
- if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
- "btree node with incorrect max_key%s", buf.buf))
- ret = set_node_max(c, prev,
- bpos_predecessor(cur->data->min_key));
- }
- } else {
- if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */
- if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_prev_node,
- "btree node overwritten by prev node%s", buf.buf))
- ret = DROP_THIS_NODE;
- } else {
- if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
- "btree node with incorrect min_key%s", buf.buf))
- ret = set_node_min(c, cur, expected_start);
- }
- }
- }
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static int btree_repair_node_end(struct btree_trans *trans, struct btree *b,
- struct btree *child, struct bpos *pulled_from_scan)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- if (bpos_eq(child->key.k.p, b->key.k.p))
- return 0;
-
- prt_printf(&buf, "\nat: ");
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_printf(&buf, "\nparent: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-
- prt_str(&buf, "\nchild: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key));
-
- if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
- "btree node with incorrect max_key%s", buf.buf)) {
- if (b->c.level == 1 &&
- bpos_lt(*pulled_from_scan, b->key.k.p)) {
- ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
- bpos_successor(child->key.k.p), b->key.k.p);
- if (ret)
- goto err;
-
- *pulled_from_scan = b->key.k.p;
- ret = DID_FILL_FROM_SCAN;
- } else {
- ret = set_node_max(c, child, b->key.k.p);
- }
- }
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b,
- struct bpos *pulled_from_scan)
-{
- struct bch_fs *c = trans->c;
- struct btree_and_journal_iter iter;
- struct bkey_s_c k;
- struct bkey_buf prev_k, cur_k;
- struct btree *prev = NULL, *cur = NULL;
- bool have_child, new_pass = false;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- if (!b->c.level)
- return 0;
-
- bch2_bkey_buf_init(&prev_k);
- bch2_bkey_buf_init(&cur_k);
-again:
- cur = prev = NULL;
- have_child = new_pass = false;
- bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
- iter.prefetch = true;
-
- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- BUG_ON(bpos_lt(k.k->p, b->data->min_key));
- BUG_ON(bpos_gt(k.k->p, b->data->max_key));
-
- bch2_btree_and_journal_iter_advance(&iter);
- bch2_bkey_buf_reassemble(&cur_k, c, k);
-
- cur = bch2_btree_node_get_noiter(trans, cur_k.k,
- b->c.btree_id, b->c.level - 1,
- false);
- ret = PTR_ERR_OR_ZERO(cur);
-
- printbuf_reset(&buf);
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level - 1);
- prt_char(&buf, ' ');
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
-
- if (bch2_err_matches(ret, EIO)) {
- bch2_btree_node_evict(trans, cur_k.k);
- cur = NULL;
- ret = bch2_journal_key_delete(c, b->c.btree_id,
- b->c.level, cur_k.k->k.p);
- if (ret)
- break;
- continue;
- }
-
- bch_err_msg(c, ret, "getting btree node");
- if (ret)
- break;
-
- if (bch2_btree_node_is_stale(c, cur)) {
- bch_info(c, "btree node older than nodes found by scanning\n %s", buf.buf);
- six_unlock_read(&cur->c.lock);
- bch2_btree_node_evict(trans, cur_k.k);
- ret = bch2_journal_key_delete(c, b->c.btree_id,
- b->c.level, cur_k.k->k.p);
- cur = NULL;
- if (ret)
- break;
- continue;
- }
-
- ret = lockrestart_do(trans,
- btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan));
- if (ret < 0)
- goto err;
-
- if (ret == DID_FILL_FROM_SCAN) {
- new_pass = true;
- ret = 0;
- }
-
- if (ret == DROP_THIS_NODE) {
- six_unlock_read(&cur->c.lock);
- bch2_btree_node_evict(trans, cur_k.k);
- ret = bch2_journal_key_delete(c, b->c.btree_id,
- b->c.level, cur_k.k->k.p);
- cur = NULL;
- if (ret)
- break;
- continue;
- }
-
- if (prev)
- six_unlock_read(&prev->c.lock);
- prev = NULL;
-
- if (ret == DROP_PREV_NODE) {
- bch_info(c, "dropped prev node");
- bch2_btree_node_evict(trans, prev_k.k);
- ret = bch2_journal_key_delete(c, b->c.btree_id,
- b->c.level, prev_k.k->k.p);
- if (ret)
- break;
-
- bch2_btree_and_journal_iter_exit(&iter);
- goto again;
- } else if (ret)
- break;
-
- prev = cur;
- cur = NULL;
- bch2_bkey_buf_copy(&prev_k, c, cur_k.k);
- }
-
- if (!ret && !IS_ERR_OR_NULL(prev)) {
- BUG_ON(cur);
- ret = lockrestart_do(trans,
- btree_repair_node_end(trans, b, prev, pulled_from_scan));
- if (ret == DID_FILL_FROM_SCAN) {
- new_pass = true;
- ret = 0;
- }
- }
-
- if (!IS_ERR_OR_NULL(prev))
- six_unlock_read(&prev->c.lock);
- prev = NULL;
- if (!IS_ERR_OR_NULL(cur))
- six_unlock_read(&cur->c.lock);
- cur = NULL;
-
- if (ret)
- goto err;
-
- bch2_btree_and_journal_iter_exit(&iter);
-
- if (new_pass)
- goto again;
-
- bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
- iter.prefetch = true;
-
- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- bch2_bkey_buf_reassemble(&cur_k, c, k);
- bch2_btree_and_journal_iter_advance(&iter);
-
- cur = bch2_btree_node_get_noiter(trans, cur_k.k,
- b->c.btree_id, b->c.level - 1,
- false);
- ret = PTR_ERR_OR_ZERO(cur);
-
- bch_err_msg(c, ret, "getting btree node");
- if (ret)
- goto err;
-
- ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan);
- six_unlock_read(&cur->c.lock);
- cur = NULL;
-
- if (ret == DROP_THIS_NODE) {
- bch2_btree_node_evict(trans, cur_k.k);
- ret = bch2_journal_key_delete(c, b->c.btree_id,
- b->c.level, cur_k.k->k.p);
- new_pass = true;
- }
-
- if (ret)
- goto err;
-
- have_child = true;
- }
-
- printbuf_reset(&buf);
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-
- /*
- * XXX: we're not passing the trans object here because we're not set up
- * to handle a transaction restart - this code needs to be rewritten
- * when we start doing online topology repair
- */
- bch2_trans_unlock_long(trans);
- if (mustfix_fsck_err_on(!have_child,
- c, btree_node_topology_interior_node_empty,
- "empty interior btree node at %s", buf.buf))
- ret = DROP_THIS_NODE;
-err:
-fsck_err:
- if (!IS_ERR_OR_NULL(prev))
- six_unlock_read(&prev->c.lock);
- if (!IS_ERR_OR_NULL(cur))
- six_unlock_read(&cur->c.lock);
-
- bch2_btree_and_journal_iter_exit(&iter);
-
- if (!ret && new_pass)
- goto again;
-
- BUG_ON(!ret && bch2_btree_node_check_topology(trans, b));
-
- bch2_bkey_buf_exit(&prev_k, c);
- bch2_bkey_buf_exit(&cur_k, c);
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int bch2_check_root(struct btree_trans *trans, enum btree_id btree,
- bool *reconstructed_root)
-{
- struct bch_fs *c = trans->c;
- struct btree_root *r = bch2_btree_id_root(c, btree);
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- bch2_btree_id_to_text(&buf, btree);
-
- if (r->error) {
- bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf);
-
- ret = bch2_btree_has_scanned_nodes(c, btree);
- if (ret < 0)
- goto err;
-
- if (!ret) {
- __fsck_err(trans,
- FSCK_CAN_FIX|(!btree_id_important(btree) ? FSCK_AUTOFIX : 0),
- btree_root_unreadable_and_scan_found_nothing,
- "no nodes found for btree %s, continue?", buf.buf);
-
- r->alive = false;
- r->error = 0;
- bch2_btree_root_alloc_fake_trans(trans, btree, 0);
- } else {
- r->alive = false;
- r->error = 0;
- bch2_btree_root_alloc_fake_trans(trans, btree, 1);
-
- bch2_shoot_down_journal_keys(c, btree, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
- ret = bch2_get_scanned_nodes(c, btree, 0, POS_MIN, SPOS_MAX);
- if (ret)
- goto err;
- }
-
- *reconstructed_root = true;
- }
-err:
-fsck_err:
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_check_topology(struct bch_fs *c)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct bpos pulled_from_scan = POS_MIN;
- int ret = 0;
-
- bch2_trans_srcu_unlock(trans);
-
- for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
- bool reconstructed_root = false;
-recover:
- ret = lockrestart_do(trans, bch2_check_root(trans, i, &reconstructed_root));
- if (ret)
- break;
-
- struct btree_root *r = bch2_btree_id_root(c, i);
- struct btree *b = r->b;
-
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
- ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan);
- six_unlock_read(&b->c.lock);
-
- if (ret == DROP_THIS_NODE) {
- mutex_lock(&c->btree_cache.lock);
- bch2_btree_node_hash_remove(&c->btree_cache, b);
- mutex_unlock(&c->btree_cache.lock);
-
- r->b = NULL;
-
- if (!reconstructed_root) {
- r->error = -EIO;
- goto recover;
- }
-
- struct printbuf buf = PRINTBUF;
- bch2_btree_id_to_text(&buf, i);
- bch_err(c, "empty btree root %s", buf.buf);
- printbuf_exit(&buf);
- bch2_btree_root_alloc_fake_trans(trans, i, 0);
- r->alive = false;
- ret = 0;
- }
- }
-
- bch2_trans_put(trans);
- return ret;
-}
-
-/* marking of btree keys/nodes: */
-
-static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
- unsigned level, struct btree **prev,
- struct btree_iter *iter, struct bkey_s_c k,
- bool initial)
-{
- struct bch_fs *c = trans->c;
-
- if (iter) {
- struct btree_path *path = btree_iter_path(trans, iter);
- struct btree *b = path_l(path)->b;
-
- if (*prev != b) {
- int ret = bch2_btree_node_check_topology(trans, b);
- if (ret)
- return ret;
- }
- *prev = b;
- }
-
- struct bkey deleted = KEY(0, 0, 0);
- struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- deleted.p = k.k->p;
-
- if (initial) {
- BUG_ON(static_branch_unlikely(&bch2_journal_seq_verify) &&
- k.k->bversion.lo > atomic64_read(&c->journal.seq));
-
- if (fsck_err_on(btree_id != BTREE_ID_accounting &&
- k.k->bversion.lo > atomic64_read(&c->key_version),
- trans, bkey_version_in_future,
- "key version number higher than recorded %llu\n%s",
- atomic64_read(&c->key_version),
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- atomic64_set(&c->key_version, k.k->bversion.lo);
- }
-
- if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
- trans, btree_bitmap_not_marked,
- "btree ptr not marked in member info btree allocated bitmap\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k),
- buf.buf))) {
- mutex_lock(&c->sb_lock);
- bch2_dev_btree_bitmap_mark(c, k);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- }
-
- /*
- * We require a commit before key_trigger() because
-	 * key_trigger(BTREE_TRIGGER_GC) is not idempotent; we'll calculate the
- * wrong result if we run it multiple times.
- */
- unsigned flags = !iter ? BTREE_TRIGGER_is_root : 0;
-
- ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
- BTREE_TRIGGER_check_repair|flags);
- if (ret)
- goto out;
-
- if (trans->nr_updates) {
- ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
- -BCH_ERR_transaction_restart_nested;
- goto out;
- }
-
- ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
- BTREE_TRIGGER_gc|BTREE_TRIGGER_insert|flags);
-out:
-fsck_err:
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int bch2_gc_btree(struct btree_trans *trans,
- struct progress_indicator_state *progress,
- enum btree_id btree, bool initial)
-{
- struct bch_fs *c = trans->c;
- unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1;
- int ret = 0;
-
- /* We need to make sure every leaf node is readable before going RW */
- if (initial)
- target_depth = 0;
-
- for (unsigned level = target_depth; level < BTREE_MAX_DEPTH; level++) {
- struct btree *prev = NULL;
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level,
- BTREE_ITER_prefetch);
-
- ret = for_each_btree_key_continue(trans, iter, 0, k, ({
- bch2_progress_update_iter(trans, progress, &iter, "check_allocations");
- gc_pos_set(c, gc_pos_btree(btree, level, k.k->p));
- bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial);
- }));
- if (ret)
- goto err;
- }
-
- /* root */
- do {
-retry_root:
- bch2_trans_begin(trans);
-
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN,
- 0, bch2_btree_id_root(c, btree)->b->c.level, 0);
- struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
- ret = PTR_ERR_OR_ZERO(b);
- if (ret)
- goto err_root;
-
- if (b != btree_node_root(c, b)) {
- bch2_trans_iter_exit(trans, &iter);
- goto retry_root;
- }
-
- gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX));
- struct bkey_s_c k = bkey_i_to_s_c(&b->key);
- ret = bch2_gc_mark_key(trans, btree, b->c.level + 1, NULL, NULL, k, initial);
-err_root:
- bch2_trans_iter_exit(trans, &iter);
- } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
-err:
- bch_err_fn(c, ret);
- return ret;
-}
-
-static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
-{
- return cmp_int(gc_btree_order(l), gc_btree_order(r));
-}
-
-static int bch2_gc_btrees(struct bch_fs *c)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- struct progress_indicator_state progress;
- bch2_progress_init(&progress, c, ~0ULL);
-
- enum btree_id ids[BTREE_ID_NR];
- for (unsigned i = 0; i < BTREE_ID_NR; i++)
- ids[i] = i;
- bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
-
- for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
- unsigned btree = i < BTREE_ID_NR ? ids[i] : i;
-
- if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b))
- continue;
-
- ret = bch2_gc_btree(trans, &progress, btree, true);
- }
-
- printbuf_exit(&buf);
- bch2_trans_put(trans);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int bch2_mark_superblocks(struct bch_fs *c)
-{
- gc_pos_set(c, gc_phase(GC_PHASE_sb));
-
- return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc);
-}
-
-static void bch2_gc_free(struct bch_fs *c)
-{
- bch2_accounting_gc_free(c);
-
- genradix_free(&c->reflink_gc_table);
- genradix_free(&c->gc_stripes);
-
- for_each_member_device(c, ca)
- genradix_free(&ca->buckets_gc);
-}
-
-static int bch2_gc_start(struct bch_fs *c)
-{
- for_each_member_device(c, ca) {
- int ret = bch2_dev_usage_init(ca, true);
- if (ret) {
- bch2_dev_put(ca);
- return ret;
- }
- }
-
- return 0;
-}
-
-/* returns true if not equal */
-static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
- struct bch_alloc_v4 r)
-{
- return l.gen != r.gen ||
- l.oldest_gen != r.oldest_gen ||
- l.data_type != r.data_type ||
- l.dirty_sectors != r.dirty_sectors ||
- l.stripe_sectors != r.stripe_sectors ||
- l.cached_sectors != r.cached_sectors ||
- l.stripe_redundancy != r.stripe_redundancy ||
- l.stripe != r.stripe;
-}
-
-static int bch2_alloc_write_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_dev *ca,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bkey_i_alloc_v4 *a;
- struct bch_alloc_v4 old_gc, gc, old_convert, new;
- const struct bch_alloc_v4 *old;
- int ret;
-
- if (!bucket_valid(ca, k.k->p.offset))
- return 0;
-
- old = bch2_alloc_to_v4(k, &old_convert);
- gc = new = *old;
-
- __bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset));
-
- old_gc = gc;
-
- if ((old->data_type == BCH_DATA_sb ||
- old->data_type == BCH_DATA_journal) &&
- !bch2_dev_is_online(ca)) {
- gc.data_type = old->data_type;
- gc.dirty_sectors = old->dirty_sectors;
- }
-
- /*
- * gc.data_type doesn't yet include need_discard & need_gc_gen states -
- * fix that here:
- */
- alloc_data_type_set(&gc, gc.data_type);
- if (gc.data_type != old_gc.data_type ||
- gc.dirty_sectors != old_gc.dirty_sectors) {
- ret = bch2_alloc_key_to_dev_counters(trans, ca, &old_gc, &gc, BTREE_TRIGGER_gc);
- if (ret)
- return ret;
-
- /*
- * Ugly: alloc_key_to_dev_counters(..., BTREE_TRIGGER_gc) is not
- * safe w.r.t. transaction restarts, so fixup the gc_bucket so
- * we don't run it twice:
- */
- struct bucket *gc_m = gc_bucket(ca, iter->pos.offset);
- gc_m->data_type = gc.data_type;
- gc_m->dirty_sectors = gc.dirty_sectors;
- }
-
- if (fsck_err_on(new.data_type != gc.data_type,
- trans, alloc_key_data_type_wrong,
- "bucket %llu:%llu gen %u has wrong data_type"
- ": got %s, should be %s",
- iter->pos.inode, iter->pos.offset,
- gc.gen,
- bch2_data_type_str(new.data_type),
- bch2_data_type_str(gc.data_type)))
- new.data_type = gc.data_type;
-
-#define copy_bucket_field(_errtype, _f) \
- if (fsck_err_on(new._f != gc._f, \
- trans, _errtype, \
- "bucket %llu:%llu gen %u data type %s has wrong " #_f \
- ": got %llu, should be %llu", \
- iter->pos.inode, iter->pos.offset, \
- gc.gen, \
- bch2_data_type_str(gc.data_type), \
- (u64) new._f, (u64) gc._f)) \
- new._f = gc._f; \
-
- copy_bucket_field(alloc_key_gen_wrong, gen);
- copy_bucket_field(alloc_key_dirty_sectors_wrong, dirty_sectors);
- copy_bucket_field(alloc_key_stripe_sectors_wrong, stripe_sectors);
- copy_bucket_field(alloc_key_cached_sectors_wrong, cached_sectors);
- copy_bucket_field(alloc_key_stripe_wrong, stripe);
- copy_bucket_field(alloc_key_stripe_redundancy_wrong, stripe_redundancy);
-#undef copy_bucket_field
-
- if (!bch2_alloc_v4_cmp(*old, new))
- return 0;
-
- a = bch2_alloc_to_v4_mut(trans, k);
- ret = PTR_ERR_OR_ZERO(a);
- if (ret)
- return ret;
-
- a->v = new;
-
- /*
- * The trigger normally makes sure these are set, but we're not running
- * triggers:
- */
- if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
- a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
-
- ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_norun);
-fsck_err:
- return ret;
-}
-
-static int bch2_gc_alloc_done(struct bch_fs *c)
-{
- int ret = 0;
-
- for_each_member_device(c, ca) {
- ret = bch2_trans_run(c,
- for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc,
- POS(ca->dev_idx, ca->mi.first_bucket),
- POS(ca->dev_idx, ca->mi.nbuckets - 1),
- BTREE_ITER_slots|BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_alloc_write_key(trans, &iter, ca, k)));
- if (ret) {
- bch2_dev_put(ca);
- break;
- }
- }
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int bch2_gc_alloc_start(struct bch_fs *c)
-{
- int ret = 0;
-
- for_each_member_device(c, ca) {
- ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL);
- if (ret) {
- bch2_dev_put(ca);
- ret = bch_err_throw(c, ENOMEM_gc_alloc_start);
- break;
- }
- }
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int bch2_gc_write_stripes_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- const struct bch_stripe *s;
- struct gc_stripe *m;
- bool bad = false;
- unsigned i;
- int ret = 0;
-
- if (k.k->type != KEY_TYPE_stripe)
- return 0;
-
- s = bkey_s_c_to_stripe(k).v;
- m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
-
- for (i = 0; i < s->nr_blocks; i++) {
- u32 old = stripe_blockcount_get(s, i);
- u32 new = (m ? m->block_sectors[i] : 0);
-
- if (old != new) {
- prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n",
- i, old, new);
- bad = true;
- }
- }
-
- if (bad)
- bch2_bkey_val_to_text(&buf, c, k);
-
- if (fsck_err_on(bad,
- trans, stripe_sector_count_wrong,
- "%s", buf.buf)) {
- struct bkey_i_stripe *new;
-
- new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- return ret;
-
- bkey_reassemble(&new->k_i, k);
-
- for (i = 0; i < new->v.nr_blocks; i++)
- stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
-
- ret = bch2_trans_update(trans, iter, &new->k_i, 0);
- }
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static int bch2_gc_stripes_done(struct bch_fs *c)
-{
- return bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_stripes, POS_MIN,
- BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_gc_write_stripes_key(trans, &iter, k)));
-}
-
-/**
- * bch2_check_allocations - walk all references to buckets, and recompute them:
- *
- * @c: filesystem object
- *
- * Returns: 0 on success, or standard errcode on failure
- *
- * Order matters here:
- * - Concurrent GC relies on the fact that we have a total ordering for
- * everything that GC walks - see gc_will_visit_node(),
- * gc_will_visit_root()
- *
- * - also, references move around in the course of index updates and
- * various other crap: everything needs to agree on the ordering
- * references are allowed to move around in - e.g., we're allowed to
- * start with a reference owned by an open_bucket (the allocator) and
- * move it to the btree, but not the reverse.
- *
- * This is necessary to ensure that gc doesn't miss references that
- * move around - if references move backwards in the ordering GC
- * uses, GC could skip past them
- */
-int bch2_check_allocations(struct bch_fs *c)
-{
- int ret;
-
- down_read(&c->state_lock);
- down_write(&c->gc_lock);
-
- bch2_btree_interior_updates_flush(c);
-
- ret = bch2_gc_accounting_start(c) ?:
- bch2_gc_start(c) ?:
- bch2_gc_alloc_start(c) ?:
- bch2_gc_reflink_start(c);
- if (ret)
- goto out;
-
- gc_pos_set(c, gc_phase(GC_PHASE_start));
-
- ret = bch2_mark_superblocks(c);
- bch_err_msg(c, ret, "marking superblocks");
- if (ret)
- goto out;
-
- ret = bch2_gc_btrees(c);
- if (ret)
- goto out;
-
- c->gc_count++;
-
- ret = bch2_gc_alloc_done(c) ?:
- bch2_gc_accounting_done(c) ?:
- bch2_gc_stripes_done(c) ?:
- bch2_gc_reflink_done(c);
-out:
- percpu_down_write(&c->mark_lock);
- /* Indicates that gc is no longer in progress: */
- __gc_pos_set(c, gc_phase(GC_PHASE_not_running));
-
- bch2_gc_free(c);
- percpu_up_write(&c->mark_lock);
-
- up_write(&c->gc_lock);
- up_read(&c->state_lock);
-
- /*
- * At startup, allocations can happen directly instead of via the
- * allocator thread - issue wakeup in case they blocked on gc_lock:
- */
- closure_wake_up(&c->freelist_wait);
-
- if (!ret && !test_bit(BCH_FS_errors_not_fixed, &c->flags))
- bch2_sb_members_clean_deleted(c);
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int gc_btree_gens_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- if (unlikely(test_bit(BCH_FS_going_ro, &c->flags)))
- return -EROFS;
-
- bool too_stale = false;
- scoped_guard(rcu) {
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- if (!ca)
- continue;
-
- too_stale |= dev_ptr_stale(ca, ptr) > 16;
- }
-
- if (!too_stale)
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- if (!ca)
- continue;
-
- u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
- if (gen_after(*gen, ptr->gen))
- *gen = ptr->gen;
- }
- }
-
- if (too_stale) {
- struct bkey_i *u = bch2_bkey_make_mut(trans, iter, &k, 0);
- int ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- return ret;
-
- bch2_extent_normalize(c, bkey_i_to_s(u));
- }
-
- return 0;
-}
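
The dev_ptr_stale()/gen_after() checks above depend on wraparound-safe
comparison of small per-bucket generation numbers; those helpers are defined
elsewhere, so the sketch below only assumes the usual idiom: compare the
signed difference, so "newer than" keeps working after the 8-bit counter
wraps. Illustrative only, standalone:

#include <stdio.h>

/* assumed semantics of a gen_after()-style helper, not the kernel's code */
static int gen_newer(unsigned char a, unsigned char b)
{
	return (signed char)(a - b) > 0;
}

int main(void)
{
	printf("%d\n", gen_newer(5, 3));	/* 1: 5 is newer than 3 */
	printf("%d\n", gen_newer(2, 250));	/* 1: 2 is newer, counter wrapped */
	printf("%d\n", gen_newer(3, 3));	/* 0: same generation */
	return 0;
}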
-
-static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev *ca,
- struct btree_iter *iter, struct bkey_s_c k)
-{
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
- struct bkey_i_alloc_v4 *a_mut;
- int ret;
-
- if (a->oldest_gen == ca->oldest_gen[iter->pos.offset])
- return 0;
-
- a_mut = bch2_alloc_to_v4_mut(trans, k);
- ret = PTR_ERR_OR_ZERO(a_mut);
- if (ret)
- return ret;
-
- a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
-
- return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
-}
-
-int bch2_gc_gens(struct bch_fs *c)
-{
- u64 b, start_time = local_clock();
- int ret;
-
- if (!mutex_trylock(&c->gc_gens_lock))
- return 0;
-
- trace_and_count(c, gc_gens_start, c);
-
- /*
- * We have to use trylock here. Otherwise, we would
- * introduce a deadlock in the RO path - we take the
- * state lock at the start of going RO.
- */
- if (!down_read_trylock(&c->state_lock)) {
- mutex_unlock(&c->gc_gens_lock);
- return 0;
- }
-
- for_each_member_device(c, ca) {
- struct bucket_gens *gens = bucket_gens(ca);
-
- BUG_ON(ca->oldest_gen);
-
- ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
- if (!ca->oldest_gen) {
- bch2_dev_put(ca);
- ret = bch_err_throw(c, ENOMEM_gc_gens);
- goto err;
- }
-
- for (b = gens->first_bucket;
- b < gens->nbuckets; b++)
- ca->oldest_gen[b] = gens->b[b];
- }
-
- for (unsigned i = 0; i < BTREE_ID_NR; i++)
- if (btree_type_has_ptrs(i)) {
- c->gc_gens_btree = i;
- c->gc_gens_pos = POS_MIN;
-
- ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, i,
- POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
- k,
- NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
- gc_btree_gens_key(trans, &iter, k)));
- if (ret)
- goto err;
- }
-
- struct bch_dev *ca = NULL;
- ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
- POS_MIN,
- BTREE_ITER_prefetch,
- k,
- NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc, ({
- ca = bch2_dev_iterate(c, ca, k.k->p.inode);
- if (!ca) {
- bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
- continue;
- }
- bch2_alloc_write_oldest_gen(trans, ca, &iter, k);
- })));
- bch2_dev_put(ca);
-
- if (ret)
- goto err;
-
- c->gc_gens_btree = 0;
- c->gc_gens_pos = POS_MIN;
-
- c->gc_count++;
-
- bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
- trace_and_count(c, gc_gens_end, c);
-err:
- for_each_member_device(c, ca) {
- kvfree(ca->oldest_gen);
- ca->oldest_gen = NULL;
- }
-
- up_read(&c->state_lock);
- mutex_unlock(&c->gc_gens_lock);
- if (!bch2_err_matches(ret, EROFS))
- bch_err_fn(c, ret);
- return ret;
-}
-
-static void bch2_gc_gens_work(struct work_struct *work)
-{
- struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work);
- bch2_gc_gens(c);
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens);
-}
-
-void bch2_gc_gens_async(struct bch_fs *c)
-{
- if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_gc_gens) &&
- !queue_work(c->write_ref_wq, &c->gc_gens_work))
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens);
-}
-
-void bch2_fs_btree_gc_init_early(struct bch_fs *c)
-{
- seqcount_init(&c->gc_pos_lock);
- INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work);
-
- init_rwsem(&c->gc_lock);
- mutex_init(&c->gc_gens_lock);
-}
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
deleted file mode 100644
index ec77662369a2..000000000000
--- a/fs/bcachefs/btree_gc.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_GC_H
-#define _BCACHEFS_BTREE_GC_H
-
-#include "bkey.h"
-#include "btree_gc_types.h"
-#include "btree_types.h"
-
-int bch2_check_topology(struct bch_fs *);
-int bch2_check_allocations(struct bch_fs *);
-
-/*
- * For concurrent mark and sweep (with other index updates), we define a total
- * ordering of _all_ references GC walks:
- *
- * Note that some references will have the same GC position as others - e.g.
- * everything within the same btree node; in those cases we're relying on
- * whatever locking exists for where those references live, i.e. the write lock
- * on a btree node.
- *
- * That locking is also required to ensure GC doesn't pass the updater in
- * between the updater adding/removing the reference and updating the GC marks;
- * without that, we would at best double count sometimes.
- *
- * That part is important - whenever calling bch2_mark_pointers(), a lock _must_
- * be held that prevents GC from passing the position the updater is at.
- *
- * (What about the start of gc, when we're clearing all the marks? GC clears the
- * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc
- * position inside its cmpxchg loop, so crap magically works).
- */
-
-/* Position of (the start of) a gc phase: */
-static inline struct gc_pos gc_phase(enum gc_phase phase)
-{
- return (struct gc_pos) { .phase = phase, };
-}
-
-static inline struct gc_pos gc_pos_btree(enum btree_id btree, unsigned level,
- struct bpos pos)
-{
- return (struct gc_pos) {
- .phase = GC_PHASE_btree,
- .btree = btree,
- .level = level,
- .pos = pos,
- };
-}
-
-static inline int gc_btree_order(enum btree_id btree)
-{
- if (btree == BTREE_ID_alloc)
- return -2;
- if (btree == BTREE_ID_stripes)
- return -1;
- return btree;
-}
-
-static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
-{
- return cmp_int(l.phase, r.phase) ?:
- cmp_int(gc_btree_order(l.btree),
- gc_btree_order(r.btree)) ?:
- cmp_int(l.level, r.level) ?:
- bpos_cmp(l.pos, r.pos);
-}
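
gc_pos_cmp() above chains the individual comparisons with the GNU "a ?: b"
operator, so each later field only decides the ordering when all earlier
fields compared equal. The same idiom on a made-up three-field struct
(standalone sketch; "?:" with the middle operand omitted is a GNU C
extension, as used throughout the kernel):

#include <stdio.h>

struct pos3 { int a, b, c; };

static int cmp_int3(int l, int r)
{
	return (l > r) - (l < r);
}

static int pos3_cmp(struct pos3 l, struct pos3 r)
{
	return cmp_int3(l.a, r.a) ?:
	       cmp_int3(l.b, r.b) ?:
	       cmp_int3(l.c, r.c);
}

int main(void)
{
	struct pos3 x = { 1, 2, 3 }, y = { 1, 2, 4 };

	printf("%d\n", pos3_cmp(x, y));		/* -1: x sorts before y */
	return 0;
}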
-
-static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
-{
- unsigned seq;
- bool ret;
-
- do {
- seq = read_seqcount_begin(&c->gc_pos_lock);
- ret = gc_pos_cmp(pos, c->gc_pos) <= 0;
- } while (read_seqcount_retry(&c->gc_pos_lock, seq));
-
- return ret;
-}
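
gc_visited() is the reader half of a seqcount pair; __gc_pos_set() in
btree_gc.c is the writer half, publishing the new gc_pos between two bumps
of the sequence count. A rough userspace approximation with C11 atomics
(illustrative only; the real seqcount_t adds the memory barriers, preemption
handling and lockdep annotations this sketch leaves out):

#include <stdatomic.h>
#include <stdio.h>

struct seq_pos {
	atomic_uint	   seq;
	unsigned long long pos;			/* stand-in for struct gc_pos */
};

/* writer: odd count means "update in progress", even means "stable" */
static void write_pos(struct seq_pos *s, unsigned long long new_pos)
{
	atomic_fetch_add(&s->seq, 1);
	s->pos = new_pos;
	atomic_fetch_add(&s->seq, 1);
}

/* reader: retry whenever a writer was active or the count moved under us */
static unsigned long long read_pos(struct seq_pos *s)
{
	unsigned seq;
	unsigned long long v;

	do {
		while ((seq = atomic_load(&s->seq)) & 1)
			;			/* writer active, spin */
		v = s->pos;
	} while (atomic_load(&s->seq) != seq);

	return v;
}

int main(void)
{
	struct seq_pos s = { .seq = 0, .pos = 0 };

	write_pos(&s, 42);
	printf("gc pos = %llu\n", read_pos(&s));
	return 0;
}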
-
-void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *);
-
-int bch2_gc_gens(struct bch_fs *);
-void bch2_gc_gens_async(struct bch_fs *);
-
-void bch2_fs_btree_gc_init_early(struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_GC_H */
diff --git a/fs/bcachefs/btree_gc_types.h b/fs/bcachefs/btree_gc_types.h
deleted file mode 100644
index c24dd6edf377..000000000000
--- a/fs/bcachefs/btree_gc_types.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_GC_TYPES_H
-#define _BCACHEFS_BTREE_GC_TYPES_H
-
-#include <linux/generic-radix-tree.h>
-
-#define GC_PHASES() \
- x(not_running) \
- x(start) \
- x(sb) \
- x(btree)
-
-enum gc_phase {
-#define x(n) GC_PHASE_##n,
- GC_PHASES()
-#undef x
-};
-
-struct gc_pos {
- enum gc_phase phase:8;
- enum btree_id btree:8;
- u16 level;
- struct bpos pos;
-};
-
-struct reflink_gc {
- u64 offset;
- u32 size;
- u32 refcount;
-};
-
-typedef GENRADIX(struct reflink_gc) reflink_gc_table;
-
-#endif /* _BCACHEFS_BTREE_GC_TYPES_H */
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
deleted file mode 100644
index 590cd29f3e86..000000000000
--- a/fs/bcachefs/btree_io.c
+++ /dev/null
@@ -1,2742 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "async_objs.h"
-#include "bkey_buf.h"
-#include "bkey_methods.h"
-#include "bkey_sort.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "debug.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "extents.h"
-#include "io_write.h"
-#include "journal_reclaim.h"
-#include "journal_seq_blacklist.h"
-#include "recovery.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/sched/mm.h>
-
-static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn)
-{
- bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn));
- prt_printf(out, " seq %llx %llu\n", bn->keys.seq, BTREE_NODE_SEQ(bn));
- prt_str(out, "min: ");
- bch2_bpos_to_text(out, bn->min_key);
- prt_newline(out);
- prt_str(out, "max: ");
- bch2_bpos_to_text(out, bn->max_key);
-}
-
-void bch2_btree_node_io_unlock(struct btree *b)
-{
- EBUG_ON(!btree_node_write_in_flight(b));
-
- clear_btree_node_write_in_flight_inner(b);
- clear_btree_node_write_in_flight(b);
- smp_mb__after_atomic();
- wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
-}
-
-void bch2_btree_node_io_lock(struct btree *b)
-{
- wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
- TASK_UNINTERRUPTIBLE);
-}
-
-void __bch2_btree_node_wait_on_read(struct btree *b)
-{
- wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
- TASK_UNINTERRUPTIBLE);
-}
-
-void __bch2_btree_node_wait_on_write(struct btree *b)
-{
- wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
- TASK_UNINTERRUPTIBLE);
-}
-
-void bch2_btree_node_wait_on_read(struct btree *b)
-{
- wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
- TASK_UNINTERRUPTIBLE);
-}
-
-void bch2_btree_node_wait_on_write(struct btree *b)
-{
- wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
- TASK_UNINTERRUPTIBLE);
-}
-
-static void verify_no_dups(struct btree *b,
- struct bkey_packed *start,
- struct bkey_packed *end)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct bkey_packed *k, *p;
-
- if (start == end)
- return;
-
- for (p = start, k = bkey_p_next(start);
- k != end;
- p = k, k = bkey_p_next(k)) {
- struct bkey l = bkey_unpack_key(b, p);
- struct bkey r = bkey_unpack_key(b, k);
-
- BUG_ON(bpos_ge(l.p, bkey_start_pos(&r)));
- }
-#endif
-}
-
-static void set_needs_whiteout(struct bset *i, int v)
-{
- struct bkey_packed *k;
-
- for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
- k->needs_whiteout = v;
-}
-
-static void btree_bounce_free(struct bch_fs *c, size_t size,
- bool used_mempool, void *p)
-{
- if (used_mempool)
- mempool_free(p, &c->btree_bounce_pool);
- else
- kvfree(p);
-}
-
-static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
- bool *used_mempool)
-{
- unsigned flags = memalloc_nofs_save();
- void *p;
-
- BUG_ON(size > c->opts.btree_node_size);
-
- *used_mempool = false;
- p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
- if (!p) {
- *used_mempool = true;
- p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
- }
- memalloc_nofs_restore(flags);
- return p;
-}
-
-static void sort_bkey_ptrs(const struct btree *bt,
- struct bkey_packed **ptrs, unsigned nr)
-{
- unsigned n = nr, a = nr / 2, b, c, d;
-
- if (!a)
- return;
-
- /* Heap sort: see lib/sort.c: */
- while (1) {
- if (a)
- a--;
- else if (--n)
- swap(ptrs[0], ptrs[n]);
- else
- break;
-
- for (b = a; c = 2 * b + 1, (d = c + 1) < n;)
- b = bch2_bkey_cmp_packed(bt,
- ptrs[c],
- ptrs[d]) >= 0 ? c : d;
- if (d == n)
- b = c;
-
- while (b != a &&
- bch2_bkey_cmp_packed(bt,
- ptrs[a],
- ptrs[b]) >= 0)
- b = (b - 1) / 2;
- c = b;
- while (b != a) {
- b = (b - 1) / 2;
- swap(ptrs[b], ptrs[c]);
- }
- }
-}
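
sort_bkey_ptrs() above is a heapsort over an array of key pointers, with
bch2_bkey_cmp_packed() as the comparator (the comment points at lib/sort.c
for the bottom-up variant it mirrors). The overall shape is easier to see on
plain integers; a standalone sketch using the simpler sift-down formulation,
so not byte-for-byte the same algorithm:

#include <stdio.h>

static void sift_down(int *a, unsigned root, unsigned end)
{
	while (2 * root + 1 < end) {
		unsigned child = 2 * root + 1;

		if (child + 1 < end && a[child] < a[child + 1])
			child++;			/* pick the larger child */
		if (a[root] >= a[child])
			return;				/* heap property holds */

		int tmp = a[root]; a[root] = a[child]; a[child] = tmp;
		root = child;
	}
}

static void heapsort_ints(int *a, unsigned n)
{
	/* heapify, then repeatedly move the max to the end */
	for (unsigned i = n / 2; i-- > 0;)
		sift_down(a, i, n);

	for (unsigned i = n; i-- > 1;) {
		int tmp = a[0]; a[0] = a[i]; a[i] = tmp;
		sift_down(a, 0, i);
	}
}

int main(void)
{
	int a[] = { 5, 1, 4, 2, 3 };

	heapsort_ints(a, 5);
	for (unsigned i = 0; i < 5; i++)
		printf("%d ", a[i]);			/* 1 2 3 4 5 */
	printf("\n");
	return 0;
}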
-
-static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
-{
- struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k;
- bool used_mempool = false;
- size_t bytes = b->whiteout_u64s * sizeof(u64);
-
- if (!b->whiteout_u64s)
- return;
-
- new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool);
-
- ptrs = ptrs_end = ((void *) new_whiteouts + bytes);
-
- for (k = unwritten_whiteouts_start(b);
- k != unwritten_whiteouts_end(b);
- k = bkey_p_next(k))
- *--ptrs = k;
-
- sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs);
-
- k = new_whiteouts;
-
- while (ptrs != ptrs_end) {
- bkey_p_copy(k, *ptrs);
- k = bkey_p_next(k);
- ptrs++;
- }
-
- verify_no_dups(b, new_whiteouts,
- (void *) ((u64 *) new_whiteouts + b->whiteout_u64s));
-
- memcpy_u64s(unwritten_whiteouts_start(b),
- new_whiteouts, b->whiteout_u64s);
-
- btree_bounce_free(c, bytes, used_mempool, new_whiteouts);
-}
-
-static bool should_compact_bset(struct btree *b, struct bset_tree *t,
- bool compacting, enum compact_mode mode)
-{
- if (!bset_dead_u64s(b, t))
- return false;
-
- switch (mode) {
- case COMPACT_LAZY:
- return should_compact_bset_lazy(b, t) ||
- (compacting && !bset_written(b, bset(b, t)));
- case COMPACT_ALL:
- return true;
- default:
- BUG();
- }
-}
-
-static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
-{
- bool ret = false;
-
- for_each_bset(b, t) {
- struct bset *i = bset(b, t);
- struct bkey_packed *k, *n, *out, *start, *end;
- struct btree_node_entry *src = NULL, *dst = NULL;
-
- if (t != b->set && !bset_written(b, i)) {
- src = container_of(i, struct btree_node_entry, keys);
- dst = max(write_block(b),
- (void *) btree_bkey_last(b, t - 1));
- }
-
- if (src != dst)
- ret = true;
-
- if (!should_compact_bset(b, t, ret, mode)) {
- if (src != dst) {
- memmove(dst, src, sizeof(*src) +
- le16_to_cpu(src->keys.u64s) *
- sizeof(u64));
- i = &dst->keys;
- set_btree_bset(b, t, i);
- }
- continue;
- }
-
- start = btree_bkey_first(b, t);
- end = btree_bkey_last(b, t);
-
- if (src != dst) {
- memmove(dst, src, sizeof(*src));
- i = &dst->keys;
- set_btree_bset(b, t, i);
- }
-
- out = i->start;
-
- for (k = start; k != end; k = n) {
- n = bkey_p_next(k);
-
- if (!bkey_deleted(k)) {
- bkey_p_copy(out, k);
- out = bkey_p_next(out);
- } else {
- BUG_ON(k->needs_whiteout);
- }
- }
-
- i->u64s = cpu_to_le16((u64 *) out - i->_data);
- set_btree_bset_end(b, t);
- bch2_bset_set_no_aux_tree(b, t);
- ret = true;
- }
-
- bch2_verify_btree_nr_keys(b);
-
- bch2_btree_build_aux_trees(b);
-
- return ret;
-}
-
-bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
- enum compact_mode mode)
-{
- return bch2_drop_whiteouts(b, mode);
-}
-
-static void btree_node_sort(struct bch_fs *c, struct btree *b,
- unsigned start_idx,
- unsigned end_idx)
-{
- struct btree_node *out;
- struct sort_iter_stack sort_iter;
- struct bset_tree *t;
- struct bset *start_bset = bset(b, &b->set[start_idx]);
- bool used_mempool = false;
- u64 start_time, seq = 0;
- unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1;
- bool sorting_entire_node = start_idx == 0 &&
- end_idx == b->nsets;
-
- sort_iter_stack_init(&sort_iter, b);
-
- for (t = b->set + start_idx;
- t < b->set + end_idx;
- t++) {
- u64s += le16_to_cpu(bset(b, t)->u64s);
- sort_iter_add(&sort_iter.iter,
- btree_bkey_first(b, t),
- btree_bkey_last(b, t));
- }
-
- bytes = sorting_entire_node
- ? btree_buf_bytes(b)
- : __vstruct_bytes(struct btree_node, u64s);
-
- out = btree_bounce_alloc(c, bytes, &used_mempool);
-
- start_time = local_clock();
-
- u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter);
-
- out->keys.u64s = cpu_to_le16(u64s);
-
- BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes);
-
- if (sorting_entire_node)
- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
- start_time);
-
- /* Make sure we preserve bset journal_seq: */
- for (t = b->set + start_idx; t < b->set + end_idx; t++)
- seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq));
- start_bset->journal_seq = cpu_to_le64(seq);
-
- if (sorting_entire_node) {
- u64s = le16_to_cpu(out->keys.u64s);
-
- BUG_ON(bytes != btree_buf_bytes(b));
-
- /*
- * Our temporary buffer is the same size as the btree node's
-		 * buffer, so we can just swap buffers instead of doing a big
- * memcpy()
- */
- *out = *b->data;
- out->keys.u64s = cpu_to_le16(u64s);
- swap(out, b->data);
- set_btree_bset(b, b->set, &b->data->keys);
- } else {
- start_bset->u64s = out->keys.u64s;
- memcpy_u64s(start_bset->start,
- out->keys.start,
- le16_to_cpu(out->keys.u64s));
- }
-
- for (i = start_idx + 1; i < end_idx; i++)
- b->nr.bset_u64s[start_idx] +=
- b->nr.bset_u64s[i];
-
- b->nsets -= shift;
-
- for (i = start_idx + 1; i < b->nsets; i++) {
- b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift];
- b->set[i] = b->set[i + shift];
- }
-
- for (i = b->nsets; i < MAX_BSETS; i++)
- b->nr.bset_u64s[i] = 0;
-
- set_btree_bset_end(b, &b->set[start_idx]);
- bch2_bset_set_no_aux_tree(b, &b->set[start_idx]);
-
- btree_bounce_free(c, bytes, used_mempool, out);
-
- bch2_verify_btree_nr_keys(b);
-}
-
-void bch2_btree_sort_into(struct bch_fs *c,
- struct btree *dst,
- struct btree *src)
-{
- struct btree_nr_keys nr;
- struct btree_node_iter src_iter;
- u64 start_time = local_clock();
-
- BUG_ON(dst->nsets != 1);
-
- bch2_bset_set_no_aux_tree(dst, dst->set);
-
- bch2_btree_node_iter_init_from_start(&src_iter, src);
-
- nr = bch2_sort_repack(btree_bset_first(dst),
- src, &src_iter,
- &dst->format,
- true);
-
- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
- start_time);
-
- set_btree_bset_end(dst, dst->set);
-
- dst->nr.live_u64s += nr.live_u64s;
- dst->nr.bset_u64s[0] += nr.bset_u64s[0];
- dst->nr.packed_keys += nr.packed_keys;
- dst->nr.unpacked_keys += nr.unpacked_keys;
-
- bch2_verify_btree_nr_keys(dst);
-}
-
-/*
- * We're about to add another bset to the btree node, so if there are currently
- * too many bsets, sort some of them together:
- */
-static bool btree_node_compact(struct bch_fs *c, struct btree *b)
-{
- unsigned unwritten_idx;
- bool ret = false;
-
- for (unwritten_idx = 0;
- unwritten_idx < b->nsets;
- unwritten_idx++)
- if (!bset_written(b, bset(b, &b->set[unwritten_idx])))
- break;
-
- if (b->nsets - unwritten_idx > 1) {
- btree_node_sort(c, b, unwritten_idx, b->nsets);
- ret = true;
- }
-
- if (unwritten_idx > 1) {
- btree_node_sort(c, b, 0, unwritten_idx);
- ret = true;
- }
-
- return ret;
-}
-
-void bch2_btree_build_aux_trees(struct btree *b)
-{
- for_each_bset(b, t)
- bch2_bset_build_aux_tree(b, t,
- !bset_written(b, bset(b, t)) &&
- t == bset_tree_last(b));
-}
-
-/*
- * If we have MAX_BSETS (3) bsets, should we sort them all down to just one?
- *
- * The first bset is going to be of similar order to the size of the node, the
- * last bset is bounded by btree_write_set_buffer(), which is set to keep the
- * memmove on insert from being too expensive: the middle bset should, ideally,
- * be the geometric mean of the first and the last.
- *
- * Returns true if the middle bset is greater than that geometric mean:
- */
-static inline bool should_compact_all(struct bch_fs *c, struct btree *b)
-{
- unsigned mid_u64s_bits =
- (ilog2(btree_max_u64s(c)) + BTREE_WRITE_SET_U64s_BITS) / 2;
-
- return bset_u64s(&b->set[1]) > 1U << mid_u64s_bits;
-}
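
should_compact_all() never computes an actual geometric mean: for sizes that
are powers of two, sqrt(2^a * 2^b) == 2^((a+b)/2), so averaging the two log2
sizes (rounded down when the sum is odd) and comparing against 1 << mid is
equivalent and stays in integer arithmetic. A tiny check with made-up numbers
standing in for ilog2(btree_max_u64s(c)) and BTREE_WRITE_SET_U64s_BITS:

#include <math.h>
#include <stdio.h>

int main(void)			/* link with -lm */
{
	unsigned first_bits = 16;	/* pretend log2 size of the first bset */
	unsigned last_bits  = 8;	/* pretend BTREE_WRITE_SET_U64s_BITS */
	unsigned mid_bits   = (first_bits + last_bits) / 2;

	double gmean = sqrt((double)(1UL << first_bits) * (double)(1UL << last_bits));

	printf("1 << mid_bits  = %u\n", 1U << mid_bits);	/* 4096 */
	printf("geometric mean = %.0f\n", gmean);		/* 4096 */
	return 0;
}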
-
-/*
- * bch2_btree_init_next - initialize a new (unwritten) bset that can then be
- * inserted into
- *
- * Safe to call if there already is an unwritten bset - will only add a new bset
- * if @b doesn't already have one.
- *
- * May sort existing bsets, which invalidates iterators.
- */
-void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
-{
- struct bch_fs *c = trans->c;
- struct btree_node_entry *bne;
- bool reinit_iter = false;
-
- EBUG_ON(!six_lock_counts(&b->c.lock).n[SIX_LOCK_write]);
- BUG_ON(bset_written(b, bset(b, &b->set[1])));
- BUG_ON(btree_node_just_written(b));
-
- if (b->nsets == MAX_BSETS &&
- !btree_node_write_in_flight(b) &&
- should_compact_all(c, b)) {
- bch2_btree_node_write_trans(trans, b, SIX_LOCK_write,
- BTREE_WRITE_init_next_bset);
- reinit_iter = true;
- }
-
- if (b->nsets == MAX_BSETS &&
- btree_node_compact(c, b))
- reinit_iter = true;
-
- BUG_ON(b->nsets >= MAX_BSETS);
-
- bne = want_new_bset(c, b);
- if (bne)
- bch2_bset_init_next(b, bne);
-
- bch2_btree_build_aux_trees(b);
-
- if (reinit_iter)
- bch2_trans_node_reinit_iter(trans, b);
-}
-
-static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
- struct bch_dev *ca,
- bool print_pos,
- struct btree *b, struct bset *i, struct bkey_packed *k,
- unsigned offset, int rw)
-{
- if (print_pos) {
- prt_str(out, rw == READ
- ? "error validating btree node "
- : "corrupt btree node before write ");
- prt_printf(out, "at btree ");
- bch2_btree_pos_to_text(out, c, b);
- prt_newline(out);
- }
-
- if (ca)
- prt_printf(out, "%s ", ca->name);
-
- prt_printf(out, "node offset %u/%u",
- b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)));
- if (i)
- prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
- if (k)
- prt_printf(out, " bset byte offset %lu",
- (unsigned long)(void *)k -
- ((unsigned long)(void *)i & ~511UL));
- prt_str(out, ": ");
-}
-
-__printf(11, 12)
-static int __btree_err(int ret,
- struct bch_fs *c,
- struct bch_dev *ca,
- struct btree *b,
- struct bset *i,
- struct bkey_packed *k,
- int rw,
- enum bch_sb_error_id err_type,
- struct bch_io_failures *failed,
- struct printbuf *err_msg,
- const char *fmt, ...)
-{
- if (c->recovery.curr_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes)
- return ret == -BCH_ERR_btree_node_read_err_fixable
- ? bch_err_throw(c, fsck_fix)
- : ret;
-
- bool have_retry = false;
- int ret2;
-
- if (ca) {
- bch2_mark_btree_validate_failure(failed, ca->dev_idx);
-
- struct extent_ptr_decoded pick;
- have_retry = bch2_bkey_pick_read_device(c,
- bkey_i_to_s_c(&b->key),
- failed, &pick, -1) == 1;
- }
-
- if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
- ret = bch_err_throw(c, btree_node_read_err_fixable);
- if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
- ret = bch_err_throw(c, btree_node_read_err_bad_node);
-
- bch2_sb_error_count(c, err_type);
-
- bool print_deferred = err_msg &&
- rw == READ &&
- !(test_bit(BCH_FS_in_fsck, &c->flags) &&
- c->opts.fix_errors == FSCK_FIX_ask);
-
- struct printbuf out = PRINTBUF;
- bch2_log_msg_start(c, &out);
-
- if (!print_deferred)
- err_msg = &out;
-
- btree_err_msg(err_msg, c, ca, !print_deferred, b, i, k, b->written, rw);
-
- va_list args;
- va_start(args, fmt);
- prt_vprintf(err_msg, fmt, args);
- va_end(args);
-
- if (print_deferred) {
- prt_newline(err_msg);
-
- switch (ret) {
- case -BCH_ERR_btree_node_read_err_fixable:
- ret2 = bch2_fsck_err_opt(c, FSCK_CAN_FIX, err_type);
- if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) &&
- !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) {
- ret = ret2;
- goto fsck_err;
- }
-
- if (!have_retry)
- ret = bch_err_throw(c, fsck_fix);
- goto out;
- case -BCH_ERR_btree_node_read_err_bad_node:
- prt_str(&out, ", ");
- break;
- }
-
- goto out;
- }
-
- if (rw == WRITE) {
- prt_str(&out, ", ");
- ret = __bch2_inconsistent_error(c, &out)
- ? -BCH_ERR_fsck_errors_not_fixed
- : 0;
- goto print;
- }
-
- switch (ret) {
- case -BCH_ERR_btree_node_read_err_fixable:
- ret2 = __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf);
- if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) &&
- !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) {
- ret = ret2;
- goto fsck_err;
- }
-
- if (!have_retry)
- ret = bch_err_throw(c, fsck_fix);
- goto out;
- case -BCH_ERR_btree_node_read_err_bad_node:
- prt_str(&out, ", ");
- break;
- }
-print:
- bch2_print_str(c, KERN_ERR, out.buf);
-out:
-fsck_err:
- printbuf_exit(&out);
- return ret;
-}
-
-#define btree_err(type, c, ca, b, i, k, _err_type, msg, ...) \
-({ \
- int _ret = __btree_err(type, c, ca, b, i, k, write, \
- BCH_FSCK_ERR_##_err_type, \
- failed, err_msg, \
- msg, ##__VA_ARGS__); \
- \
- if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix)) { \
- ret = _ret; \
- goto fsck_err; \
- } \
- \
- true; \
-})
-
-#define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false)
-
-/*
- * When btree topology repair changes the start or end of a node, that might
- * mean we have to drop keys that are no longer inside the node:
- */
-__cold
-void bch2_btree_node_drop_keys_outside_node(struct btree *b)
-{
- for_each_bset(b, t) {
- struct bset *i = bset(b, t);
- struct bkey_packed *k;
-
- for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
- if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0)
- break;
-
- if (k != i->start) {
- unsigned shift = (u64 *) k - (u64 *) i->start;
-
- memmove_u64s_down(i->start, k,
- (u64 *) vstruct_end(i) - (u64 *) k);
- i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift);
- set_btree_bset_end(b, t);
- }
-
- for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
- if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0)
- break;
-
- if (k != vstruct_last(i)) {
- i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start);
- set_btree_bset_end(b, t);
- }
- }
-
- /*
- * Always rebuild search trees: eytzinger search tree nodes directly
- * depend on the values of min/max key:
- */
- bch2_bset_set_no_aux_tree(b, b->set);
- bch2_btree_build_aux_trees(b);
- b->nr = bch2_btree_node_count_keys(b);
-
- struct bkey_s_c k;
- struct bkey unpacked;
- struct btree_node_iter iter;
- for_each_btree_node_key_unpack(b, k, &iter, &unpacked) {
- BUG_ON(bpos_lt(k.k->p, b->data->min_key));
- BUG_ON(bpos_gt(k.k->p, b->data->max_key));
- }
-}
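
The function above clamps each bset to the node's [min_key, max_key] range in
two steps: memmove everything from the first in-range key down to the start
of the bset, then truncate u64s at the first key past max_key. The same
clamp-a-sorted-array operation on plain ints, as a standalone sketch:

#include <stdio.h>
#include <string.h>

/* keep only the elements of a sorted array that fall in [min, max] */
static unsigned clamp_sorted(int *a, unsigned n, int min, int max)
{
	unsigned first = 0, last = n;

	while (first < n && a[first] < min)
		first++;
	while (last > first && a[last - 1] > max)
		last--;

	memmove(a, a + first, (last - first) * sizeof(*a));
	return last - first;
}

int main(void)
{
	int a[] = { 1, 3, 5, 7, 9, 11 };
	unsigned n = clamp_sorted(a, 6, 4, 9);

	for (unsigned i = 0; i < n; i++)
		printf("%d ", a[i]);		/* 5 7 9 */
	printf("\n");
	return 0;
}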
-
-static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
- struct btree *b, struct bset *i,
- unsigned offset, int write,
- struct bch_io_failures *failed,
- struct printbuf *err_msg)
-{
- unsigned version = le16_to_cpu(i->version);
- struct printbuf buf1 = PRINTBUF;
- struct printbuf buf2 = PRINTBUF;
- int ret = 0;
-
- btree_err_on(!bch2_version_compatible(version),
- -BCH_ERR_btree_node_read_err_incompatible,
- c, ca, b, i, NULL,
- btree_node_unsupported_version,
- "unsupported bset version %u.%u",
- BCH_VERSION_MAJOR(version),
- BCH_VERSION_MINOR(version));
-
- if (c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes &&
- btree_err_on(version < c->sb.version_min,
- -BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i, NULL,
- btree_node_bset_older_than_sb_min,
- "bset version %u older than superblock version_min %u",
- version, c->sb.version_min)) {
- if (bch2_version_compatible(version)) {
- mutex_lock(&c->sb_lock);
- c->disk_sb.sb->version_min = cpu_to_le16(version);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- } else {
- /* We have no idea what's going on: */
- i->version = cpu_to_le16(c->sb.version);
- }
- }
-
- if (btree_err_on(BCH_VERSION_MAJOR(version) >
- BCH_VERSION_MAJOR(c->sb.version),
- -BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i, NULL,
- btree_node_bset_newer_than_sb,
- "bset version %u newer than superblock version %u",
- version, c->sb.version)) {
- mutex_lock(&c->sb_lock);
- c->disk_sb.sb->version = cpu_to_le16(version);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- }
-
- btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
- -BCH_ERR_btree_node_read_err_incompatible,
- c, ca, b, i, NULL,
- btree_node_unsupported_version,
- "BSET_SEPARATE_WHITEOUTS no longer supported");
-
- btree_err_on(offset && !i->u64s,
- -BCH_ERR_btree_node_read_err_fixable,
- c, ca, b, i, NULL,
- bset_empty,
- "empty bset");
-
- btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset,
- -BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i, NULL,
- bset_wrong_sector_offset,
- "bset at wrong sector offset");
-
- if (!offset) {
- struct btree_node *bn =
- container_of(i, struct btree_node, keys);
- /* These indicate that we read the wrong btree node: */
-
- if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
- struct bch_btree_ptr_v2 *bp =
- &bkey_i_to_btree_ptr_v2(&b->key)->v;
-
- /* XXX endianness */
- btree_err_on(bp->seq != bn->keys.seq,
- -BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL, NULL,
- bset_bad_seq,
- "incorrect sequence number (wrong btree node)");
- }
-
- btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
- -BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, i, NULL,
- btree_node_bad_btree,
- "incorrect btree id");
-
- btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
- -BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, i, NULL,
- btree_node_bad_level,
- "incorrect level");
-
- if (!write)
- compat_btree_node(b->c.level, b->c.btree_id, version,
- BSET_BIG_ENDIAN(i), write, bn);
-
- if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
- struct bch_btree_ptr_v2 *bp =
- &bkey_i_to_btree_ptr_v2(&b->key)->v;
-
- if (BTREE_PTR_RANGE_UPDATED(bp)) {
- b->data->min_key = bp->min_key;
- b->data->max_key = b->key.k.p;
- }
-
- btree_err_on(!bpos_eq(b->data->min_key, bp->min_key),
- -BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL, NULL,
- btree_node_bad_min_key,
- "incorrect min_key: got %s should be %s",
- (printbuf_reset(&buf1),
- bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf),
- (printbuf_reset(&buf2),
- bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf));
- }
-
- btree_err_on(!bpos_eq(bn->max_key, b->key.k.p),
- -BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, i, NULL,
- btree_node_bad_max_key,
- "incorrect max key %s",
- (printbuf_reset(&buf1),
- bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf));
-
- if (write)
- compat_btree_node(b->c.level, b->c.btree_id, version,
- BSET_BIG_ENDIAN(i), write, bn);
-
- btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1),
- -BCH_ERR_btree_node_read_err_bad_node,
- c, ca, b, i, NULL,
- btree_node_bad_format,
- "invalid bkey format: %s\n%s", buf1.buf,
- (printbuf_reset(&buf2),
- bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf));
- printbuf_reset(&buf1);
-
- compat_bformat(b->c.level, b->c.btree_id, version,
- BSET_BIG_ENDIAN(i), write,
- &bn->format);
- }
-fsck_err:
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
- return ret;
-}
-
-static int btree_node_bkey_val_validate(struct bch_fs *c, struct btree *b,
- struct bkey_s_c k,
- enum bch_validate_flags flags)
-{
- return bch2_bkey_val_validate(c, k, (struct bkey_validate_context) {
- .from = BKEY_VALIDATE_btree_node,
- .level = b->c.level,
- .btree = b->c.btree_id,
- .flags = flags
- });
-}
-
-static int bset_key_validate(struct bch_fs *c, struct btree *b,
- struct bkey_s_c k,
- bool updated_range,
- enum bch_validate_flags flags)
-{
- struct bkey_validate_context from = (struct bkey_validate_context) {
- .from = BKEY_VALIDATE_btree_node,
- .level = b->c.level,
- .btree = b->c.btree_id,
- .flags = flags,
- };
- return __bch2_bkey_validate(c, k, from) ?:
- (!updated_range ? bch2_bkey_in_btree_node(c, b, k, from) : 0) ?:
- (flags & BCH_VALIDATE_write ? btree_node_bkey_val_validate(c, b, k, flags) : 0);
-}
-
-static bool bkey_packed_valid(struct bch_fs *c, struct btree *b,
- struct bset *i, struct bkey_packed *k)
-{
- if (bkey_p_next(k) > vstruct_last(i))
- return false;
-
- if (k->format > KEY_FORMAT_CURRENT)
- return false;
-
- if (!bkeyp_u64s_valid(&b->format, k))
- return false;
-
- struct bkey tmp;
- struct bkey_s u = __bkey_disassemble(b, k, &tmp);
- return !__bch2_bkey_validate(c, u.s_c,
- (struct bkey_validate_context) {
- .from = BKEY_VALIDATE_btree_node,
- .level = b->c.level,
- .btree = b->c.btree_id,
- .flags = BCH_VALIDATE_silent
- });
-}
-
-static inline int btree_node_read_bkey_cmp(const struct btree *b,
- const struct bkey_packed *l,
- const struct bkey_packed *r)
-{
- return bch2_bkey_cmp_packed(b, l, r)
- ?: (int) bkey_deleted(r) - (int) bkey_deleted(l);
-}
-
-static int validate_bset_keys(struct bch_fs *c, struct btree *b,
- struct bset *i, int write,
- struct bch_io_failures *failed,
- struct printbuf *err_msg)
-{
- unsigned version = le16_to_cpu(i->version);
- struct bkey_packed *k, *prev = NULL;
- struct printbuf buf = PRINTBUF;
- bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
- BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
- int ret = 0;
-
- for (k = i->start;
- k != vstruct_last(i);) {
- struct bkey_s u;
- struct bkey tmp;
- unsigned next_good_key;
-
- if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
- -BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i, k,
- btree_node_bkey_past_bset_end,
- "key extends past end of bset")) {
- i->u64s = cpu_to_le16((u64 *) k - i->_data);
- break;
- }
-
- if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
- -BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i, k,
- btree_node_bkey_bad_format,
- "invalid bkey format %u", k->format))
- goto drop_this_key;
-
- if (btree_err_on(!bkeyp_u64s_valid(&b->format, k),
- -BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i, k,
- btree_node_bkey_bad_u64s,
- "bad k->u64s %u (min %u max %zu)", k->u64s,
- bkeyp_key_u64s(&b->format, k),
- U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k)))
- goto drop_this_key;
-
- if (!write)
- bch2_bkey_compat(b->c.level, b->c.btree_id, version,
- BSET_BIG_ENDIAN(i), write,
- &b->format, k);
-
- u = __bkey_disassemble(b, k, &tmp);
-
- ret = bset_key_validate(c, b, u.s_c, updated_range, write);
- if (ret == -BCH_ERR_fsck_delete_bkey)
- goto drop_this_key;
- if (ret)
- goto fsck_err;
-
- if (write)
- bch2_bkey_compat(b->c.level, b->c.btree_id, version,
- BSET_BIG_ENDIAN(i), write,
- &b->format, k);
-
- if (prev && btree_node_read_bkey_cmp(b, prev, k) >= 0) {
- struct bkey up = bkey_unpack_key(b, prev);
-
- printbuf_reset(&buf);
- prt_printf(&buf, "keys out of order: ");
- bch2_bkey_to_text(&buf, &up);
- prt_printf(&buf, " > ");
- bch2_bkey_to_text(&buf, u.k);
-
- if (btree_err(-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i, k,
- btree_node_bkey_out_of_order,
- "%s", buf.buf))
- goto drop_this_key;
- }
-
- prev = k;
- k = bkey_p_next(k);
- continue;
-drop_this_key:
- next_good_key = k->u64s;
-
- if (!next_good_key ||
- (BSET_BIG_ENDIAN(i) == CPU_BIG_ENDIAN &&
- version >= bcachefs_metadata_version_snapshot)) {
- /*
- * only do scanning if bch2_bkey_compat() has nothing to
- * do
- */
-
- if (!bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
- for (next_good_key = 1;
- next_good_key < (u64 *) vstruct_last(i) - (u64 *) k;
- next_good_key++)
- if (bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
- goto got_good_key;
- }
-
- /*
- * didn't find a good key, have to truncate the rest of
- * the bset
- */
- next_good_key = (u64 *) vstruct_last(i) - (u64 *) k;
- }
-got_good_key:
- le16_add_cpu(&i->u64s, -next_good_key);
- memmove_u64s_down(k, (u64 *) k + next_good_key, (u64 *) vstruct_end(i) - (u64 *) k);
- set_btree_node_need_rewrite(b);
- set_btree_node_need_rewrite_error(b);
- }
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
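The drop_this_key path in validate_bset_keys() above recovers from a corrupt key by scanning forward one u64 at a time until something that still parses as a valid packed key is found, then shifting the remaining keys down over the bad bytes (or truncating the bset if nothing valid remains). A minimal standalone sketch of that recovery idea, in plain C with a hypothetical record_valid() predicate standing in for bkey_packed_valid() (not a bcachefs API):

/*
 * Illustrative sketch only: skip past a corrupt variable-length record in a
 * u64 array.  record_valid() is a hypothetical stand-in for bkey_packed_valid().
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef bool (*record_valid_fn)(const uint64_t *rec, size_t u64s_left);

static size_t drop_corrupt_record(uint64_t *buf, size_t pos, size_t end,
				  record_valid_fn record_valid)
{
	size_t skip;

	/*
	 * scan forward one u64 at a time for the next plausible record; if
	 * none is found the loop leaves skip == end - pos, i.e. truncate
	 */
	for (skip = 1; pos + skip < end; skip++)
		if (record_valid(buf + pos + skip, end - (pos + skip)))
			break;

	/* close the gap, like memmove_u64s_down() in the real code */
	memmove(buf + pos, buf + pos + skip,
		(end - (pos + skip)) * sizeof(*buf));

	return end - skip;	/* new length in u64s */
}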
-int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
- struct btree *b,
- struct bch_io_failures *failed,
- struct printbuf *err_msg)
-{
- struct btree_node_entry *bne;
- struct sort_iter *iter;
- struct btree_node *sorted;
- struct bkey_packed *k;
- struct bset *i;
- bool used_mempool, blacklisted;
- bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
- BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
- unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
- u64 max_journal_seq = 0;
- struct printbuf buf = PRINTBUF;
- int ret = 0, write = READ;
- u64 start_time = local_clock();
-
- b->version_ondisk = U16_MAX;
- /* We might get called multiple times on read retry: */
- b->written = 0;
-
- iter = mempool_alloc(&c->fill_iter, GFP_NOFS);
- sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2);
-
- if (bch2_meta_read_fault("btree"))
- btree_err(-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL, NULL,
- btree_node_fault_injected,
- "dynamic fault");
-
- btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
- -BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL, NULL,
- btree_node_bad_magic,
- "bad magic: want %llx, got %llx",
- bset_magic(c), le64_to_cpu(b->data->magic));
-
- if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
- struct bch_btree_ptr_v2 *bp =
- &bkey_i_to_btree_ptr_v2(&b->key)->v;
-
- bch2_bpos_to_text(&buf, b->data->min_key);
- prt_str(&buf, "-");
- bch2_bpos_to_text(&buf, b->data->max_key);
-
- btree_err_on(b->data->keys.seq != bp->seq,
- -BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL, NULL,
- btree_node_bad_seq,
- "got wrong btree node: got\n%s",
- (printbuf_reset(&buf),
- bch2_btree_node_header_to_text(&buf, b->data),
- buf.buf));
- } else {
- btree_err_on(!b->data->keys.seq,
- -BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL, NULL,
- btree_node_bad_seq,
- "bad btree header: seq 0\n%s",
- (printbuf_reset(&buf),
- bch2_btree_node_header_to_text(&buf, b->data),
- buf.buf));
- }
-
- while (b->written < (ptr_written ?: btree_sectors(c))) {
- unsigned sectors;
- bool first = !b->written;
-
- if (first) {
- bne = NULL;
- i = &b->data->keys;
- } else {
- bne = write_block(b);
- i = &bne->keys;
-
- if (i->seq != b->data->keys.seq)
- break;
- }
-
- struct nonce nonce = btree_nonce(i, b->written << 9);
- bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i));
-
- btree_err_on(!good_csum_type,
- bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))
- ? -BCH_ERR_btree_node_read_err_must_retry
- : -BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i, NULL,
- bset_unknown_csum,
- "unknown checksum type %llu", BSET_CSUM_TYPE(i));
-
- if (first) {
- sectors = vstruct_sectors(b->data, c->block_bits);
- if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)),
- -BCH_ERR_btree_node_read_err_fixable,
- c, ca, b, i, NULL,
- bset_past_end_of_btree_node,
- "bset past end of btree node (offset %u len %u but written %zu)",
- b->written, sectors, ptr_written ?: btree_sectors(c)))
- i->u64s = 0;
- if (good_csum_type) {
- struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
- bool csum_bad = bch2_crc_cmp(b->data->csum, csum);
- if (csum_bad)
- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
-
- btree_err_on(csum_bad,
- -BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i, NULL,
- bset_bad_csum,
- "%s",
- (printbuf_reset(&buf),
- bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum),
- buf.buf));
-
- ret = bset_encrypt(c, i, b->written << 9);
- if (bch2_fs_fatal_err_on(ret, c,
- "decrypting btree node: %s", bch2_err_str(ret)))
- goto fsck_err;
- }
-
- btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
- !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
- -BCH_ERR_btree_node_read_err_incompatible,
- c, NULL, b, NULL, NULL,
- btree_node_unsupported_version,
- "btree node does not have NEW_EXTENT_OVERWRITE set");
- } else {
- sectors = vstruct_sectors(bne, c->block_bits);
- if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)),
- -BCH_ERR_btree_node_read_err_fixable,
- c, ca, b, i, NULL,
- bset_past_end_of_btree_node,
- "bset past end of btree node (offset %u len %u but written %zu)",
- b->written, sectors, ptr_written ?: btree_sectors(c)))
- i->u64s = 0;
- if (good_csum_type) {
- struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
- bool csum_bad = bch2_crc_cmp(bne->csum, csum);
- if (ca && csum_bad)
- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
-
- btree_err_on(csum_bad,
- -BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i, NULL,
- bset_bad_csum,
- "%s",
- (printbuf_reset(&buf),
- bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum),
- buf.buf));
-
- ret = bset_encrypt(c, i, b->written << 9);
- if (bch2_fs_fatal_err_on(ret, c,
- "decrypting btree node: %s", bch2_err_str(ret)))
- goto fsck_err;
- }
- }
-
- b->version_ondisk = min(b->version_ondisk,
- le16_to_cpu(i->version));
-
- ret = validate_bset(c, ca, b, i, b->written, READ, failed, err_msg);
- if (ret)
- goto fsck_err;
-
- if (!b->written)
- btree_node_set_format(b, b->data->format);
-
- ret = validate_bset_keys(c, b, i, READ, failed, err_msg);
- if (ret)
- goto fsck_err;
-
- SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
-
- blacklisted = bch2_journal_seq_is_blacklisted(c,
- le64_to_cpu(i->journal_seq),
- true);
-
- btree_err_on(blacklisted && first,
- -BCH_ERR_btree_node_read_err_fixable,
- c, ca, b, i, NULL,
- bset_blacklisted_journal_seq,
- "first btree node bset has blacklisted journal seq (%llu)",
- le64_to_cpu(i->journal_seq));
-
- btree_err_on(blacklisted && ptr_written,
- -BCH_ERR_btree_node_read_err_fixable,
- c, ca, b, i, NULL,
- first_bset_blacklisted_journal_seq,
- "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u",
- le64_to_cpu(i->journal_seq),
- b->written, b->written + sectors, ptr_written);
-
- b->written = min(b->written + sectors, btree_sectors(c));
-
- if (blacklisted && !first)
- continue;
-
- sort_iter_add(iter,
- vstruct_idx(i, 0),
- vstruct_last(i));
-
- max_journal_seq = max(max_journal_seq, le64_to_cpu(i->journal_seq));
- }
-
- if (ptr_written) {
- btree_err_on(b->written < ptr_written,
- -BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, NULL, NULL,
- btree_node_data_missing,
- "btree node data missing: expected %u sectors, found %u",
- ptr_written, b->written);
- } else {
- for (bne = write_block(b);
- bset_byte_offset(b, bne) < btree_buf_bytes(b);
- bne = (void *) bne + block_bytes(c))
- btree_err_on(bne->keys.seq == b->data->keys.seq &&
- !bch2_journal_seq_is_blacklisted(c,
- le64_to_cpu(bne->keys.journal_seq),
- true),
- -BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, NULL, NULL,
- btree_node_bset_after_end,
- "found bset signature after last bset");
- }
-
- sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool);
- sorted->keys.u64s = 0;
-
- b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);
- memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0,
- btree_buf_bytes(b) -
- sizeof(struct btree_node) -
- b->nr.live_u64s * sizeof(u64));
-
- b->data->keys.u64s = sorted->keys.u64s;
- *sorted = *b->data;
- swap(sorted, b->data);
- set_btree_bset(b, b->set, &b->data->keys);
- b->nsets = 1;
- b->data->keys.journal_seq = cpu_to_le64(max_journal_seq);
-
- BUG_ON(b->nr.live_u64s != le16_to_cpu(b->data->keys.u64s));
-
- btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted);
-
- i = &b->data->keys;
- for (k = i->start; k != vstruct_last(i);) {
- struct bkey tmp;
- struct bkey_s u = __bkey_disassemble(b, k, &tmp);
-
- ret = btree_node_bkey_val_validate(c, b, u.s_c, READ);
- if (ret == -BCH_ERR_fsck_delete_bkey ||
- (static_branch_unlikely(&bch2_inject_invalid_keys) &&
- !bversion_cmp(u.k->bversion, MAX_VERSION))) {
- btree_keys_account_key_drop(&b->nr, 0, k);
-
- i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
- memmove_u64s_down(k, bkey_p_next(k),
- (u64 *) vstruct_end(i) - (u64 *) k);
- set_btree_bset_end(b, b->set);
- set_btree_node_need_rewrite(b);
- set_btree_node_need_rewrite_error(b);
- continue;
- }
- if (ret)
- goto fsck_err;
-
- if (u.k->type == KEY_TYPE_btree_ptr_v2) {
- struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u);
-
- bp.v->mem_ptr = 0;
- }
-
- k = bkey_p_next(k);
- }
-
- bch2_bset_build_aux_tree(b, b->set, false);
-
- set_needs_whiteout(btree_bset_first(b), true);
-
- btree_node_reset_sib_u64s(b);
-
- if (updated_range)
- bch2_btree_node_drop_keys_outside_node(b);
-
- /*
- * XXX:
- *
- * We deadlock if too many btree updates require node rewrites while
- * we're still in journal replay.
- *
- * This is because btree node rewrites generate more updates for the
- * interior updates (alloc, backpointers), and if those updates touch
- * new nodes and generate more rewrites - well, you see the problem.
- *
- * The biggest cause is that we don't use the btree write buffer (for
-	 * the backpointer updates) - this needs some real thought on locking in
- * order to fix.
- *
- * The problem with this workaround (not doing the rewrite for degraded
- * nodes in journal replay) is that those degraded nodes persist, and we
- * don't want that (this is a real bug when a btree node write completes
- * with fewer replicas than we wanted and leaves a degraded node due to
- * device _removal_, i.e. the device went away mid write).
- *
- * It's less of a bug here, but still a problem because we don't yet
-	 * have a way of tracking degraded data - we need another index (all
- * extents/btree nodes, by replicas entry) in order to fix properly
- * (re-replicate degraded data at the earliest possible time).
- */
- if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay)) {
- scoped_guard(rcu)
- bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
- struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
-
- if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
- set_btree_node_need_rewrite(b);
- set_btree_node_need_rewrite_degraded(b);
- }
- }
- }
-
- if (!ptr_written) {
- set_btree_node_need_rewrite(b);
- set_btree_node_need_rewrite_ptr_written_zero(b);
- }
-fsck_err:
- mempool_free(iter, &c->fill_iter);
- printbuf_exit(&buf);
- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
- return ret;
-}
-
-static void btree_node_read_work(struct work_struct *work)
-{
- struct btree_read_bio *rb =
- container_of(work, struct btree_read_bio, work);
- struct bch_fs *c = rb->c;
- struct bch_dev *ca = rb->have_ioref ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
- struct btree *b = rb->b;
- struct bio *bio = &rb->bio;
- struct bch_io_failures failed = { .nr = 0 };
- int ret = 0;
-
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
-
- prt_printf(&buf, "btree node read error at btree ");
- bch2_btree_pos_to_text(&buf, c, b);
- prt_newline(&buf);
-
- goto start;
- while (1) {
- ret = bch2_bkey_pick_read_device(c,
- bkey_i_to_s_c(&b->key),
- &failed, &rb->pick, -1);
- if (ret <= 0) {
- set_btree_node_read_error(b);
- break;
- }
-
- ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read);
- rb->have_ioref = ca != NULL;
- rb->start_time = local_clock();
- bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
- bio->bi_iter.bi_sector = rb->pick.ptr.offset;
- bio->bi_iter.bi_size = btree_buf_bytes(b);
-
- if (rb->have_ioref) {
- bio_set_dev(bio, ca->disk_sb.bdev);
- submit_bio_wait(bio);
- } else {
- bio->bi_status = BLK_STS_REMOVED;
- }
-
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
- rb->start_time, !bio->bi_status);
-start:
- if (rb->have_ioref)
- enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_read);
- rb->have_ioref = false;
-
- if (bio->bi_status) {
- bch2_mark_io_failure(&failed, &rb->pick, false);
- continue;
- }
-
- ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf);
- if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
- ret == -BCH_ERR_btree_node_read_err_must_retry)
- continue;
-
- if (ret)
- set_btree_node_read_error(b);
-
- break;
- }
-
- bch2_io_failures_to_text(&buf, c, &failed);
-
- if (btree_node_read_error(b))
- bch2_btree_lost_data(c, &buf, b->c.btree_id);
-
- /*
- * only print retry success if we read from a replica with no errors
- */
- if (btree_node_read_error(b))
- prt_printf(&buf, "ret %s", bch2_err_str(ret));
- else if (failed.nr) {
- if (!bch2_dev_io_failures(&failed, rb->pick.ptr.dev))
- prt_printf(&buf, "retry success");
- else
- prt_printf(&buf, "repair success");
- }
-
- if ((failed.nr ||
- btree_node_need_rewrite(b)) &&
- !btree_node_read_error(b) &&
- c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
- prt_printf(&buf, " (rewriting node)");
- bch2_btree_node_rewrite_async(c, b);
- }
- prt_newline(&buf);
-
- if (failed.nr)
- bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
-
- async_object_list_del(c, btree_read_bio, rb->list_idx);
- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
- rb->start_time);
- bio_put(&rb->bio);
- printbuf_exit(&buf);
- clear_btree_node_read_in_flight(b);
- smp_mb__after_atomic();
- wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
-}
-
-static void btree_node_read_endio(struct bio *bio)
-{
- struct btree_read_bio *rb =
- container_of(bio, struct btree_read_bio, bio);
- struct bch_fs *c = rb->c;
- struct bch_dev *ca = rb->have_ioref
- ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
-
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
- rb->start_time, !bio->bi_status);
-
- queue_work(c->btree_read_complete_wq, &rb->work);
-}
-
-void bch2_btree_read_bio_to_text(struct printbuf *out, struct btree_read_bio *rbio)
-{
- bch2_bio_to_text(out, &rbio->bio);
-}
-
-struct btree_node_read_all {
- struct closure cl;
- struct bch_fs *c;
- struct btree *b;
- unsigned nr;
- void *buf[BCH_REPLICAS_MAX];
- struct bio *bio[BCH_REPLICAS_MAX];
- blk_status_t err[BCH_REPLICAS_MAX];
-};
-
-static unsigned btree_node_sectors_written(struct bch_fs *c, void *data)
-{
- struct btree_node *bn = data;
- struct btree_node_entry *bne;
- unsigned offset = 0;
-
- if (le64_to_cpu(bn->magic) != bset_magic(c))
- return 0;
-
- while (offset < btree_sectors(c)) {
- if (!offset) {
- offset += vstruct_sectors(bn, c->block_bits);
- } else {
- bne = data + (offset << 9);
- if (bne->keys.seq != bn->keys.seq)
- break;
- offset += vstruct_sectors(bne, c->block_bits);
- }
- }
-
- return offset;
-}
-
-static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data)
-{
- struct btree_node *bn = data;
- struct btree_node_entry *bne;
-
- if (!offset)
- return false;
-
- while (offset < btree_sectors(c)) {
- bne = data + (offset << 9);
- if (bne->keys.seq == bn->keys.seq)
- return true;
- offset++;
- }
-
- return false;
-}
-
-static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
-{
- closure_type(ra, struct btree_node_read_all, cl);
- struct bch_fs *c = ra->c;
- struct btree *b = ra->b;
- struct printbuf buf = PRINTBUF;
- bool dump_bset_maps = false;
- int ret = 0, best = -1, write = READ;
- unsigned i, written = 0, written2 = 0;
- __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
- ? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
- bool _saw_error = false, *saw_error = &_saw_error;
- struct printbuf *err_msg = NULL;
- struct bch_io_failures *failed = NULL;
-
- for (i = 0; i < ra->nr; i++) {
- struct btree_node *bn = ra->buf[i];
-
- if (ra->err[i])
- continue;
-
- if (le64_to_cpu(bn->magic) != bset_magic(c) ||
- (seq && seq != bn->keys.seq))
- continue;
-
- if (best < 0) {
- best = i;
- written = btree_node_sectors_written(c, bn);
- continue;
- }
-
- written2 = btree_node_sectors_written(c, ra->buf[i]);
- if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, NULL, NULL,
- btree_node_replicas_sectors_written_mismatch,
- "btree node sectors written mismatch: %u != %u",
- written, written2) ||
- btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
- -BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, NULL, NULL,
- btree_node_bset_after_end,
- "found bset signature after last bset") ||
- btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9),
- -BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, NULL, NULL,
- btree_node_replicas_data_mismatch,
- "btree node replicas content mismatch"))
- dump_bset_maps = true;
-
- if (written2 > written) {
- written = written2;
- best = i;
- }
- }
-fsck_err:
- if (dump_bset_maps) {
- for (i = 0; i < ra->nr; i++) {
- struct btree_node *bn = ra->buf[i];
- struct btree_node_entry *bne = NULL;
- unsigned offset = 0, sectors;
- bool gap = false;
-
- if (ra->err[i])
- continue;
-
- printbuf_reset(&buf);
-
- while (offset < btree_sectors(c)) {
- if (!offset) {
- sectors = vstruct_sectors(bn, c->block_bits);
- } else {
- bne = ra->buf[i] + (offset << 9);
- if (bne->keys.seq != bn->keys.seq)
- break;
- sectors = vstruct_sectors(bne, c->block_bits);
- }
-
- prt_printf(&buf, " %u-%u", offset, offset + sectors);
- if (bne && bch2_journal_seq_is_blacklisted(c,
- le64_to_cpu(bne->keys.journal_seq), false))
- prt_printf(&buf, "*");
- offset += sectors;
- }
-
- while (offset < btree_sectors(c)) {
- bne = ra->buf[i] + (offset << 9);
- if (bne->keys.seq == bn->keys.seq) {
- if (!gap)
- prt_printf(&buf, " GAP");
- gap = true;
-
- sectors = vstruct_sectors(bne, c->block_bits);
- prt_printf(&buf, " %u-%u", offset, offset + sectors);
- if (bch2_journal_seq_is_blacklisted(c,
- le64_to_cpu(bne->keys.journal_seq), false))
- prt_printf(&buf, "*");
- }
- offset++;
- }
-
- bch_err(c, "replica %u:%s", i, buf.buf);
- }
- }
-
- if (best >= 0) {
- memcpy(b->data, ra->buf[best], btree_buf_bytes(b));
- ret = bch2_btree_node_read_done(c, NULL, b, NULL, NULL);
- } else {
- ret = -1;
- }
-
- if (ret) {
- set_btree_node_read_error(b);
-
- struct printbuf buf = PRINTBUF;
- bch2_btree_lost_data(c, &buf, b->c.btree_id);
- if (buf.pos)
- bch_err(c, "%s", buf.buf);
- printbuf_exit(&buf);
- } else if (*saw_error)
- bch2_btree_node_rewrite_async(c, b);
-
- for (i = 0; i < ra->nr; i++) {
- mempool_free(ra->buf[i], &c->btree_bounce_pool);
- bio_put(ra->bio[i]);
- }
-
- closure_debug_destroy(&ra->cl);
- kfree(ra);
- printbuf_exit(&buf);
-
- clear_btree_node_read_in_flight(b);
- smp_mb__after_atomic();
- wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
-}
-
-static void btree_node_read_all_replicas_endio(struct bio *bio)
-{
- struct btree_read_bio *rb =
- container_of(bio, struct btree_read_bio, bio);
- struct bch_fs *c = rb->c;
- struct btree_node_read_all *ra = rb->ra;
-
- if (rb->have_ioref) {
- struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
-
- bch2_latency_acct(ca, rb->start_time, READ);
- enumerated_ref_put(&ca->io_ref[READ],
- BCH_DEV_READ_REF_btree_node_read_all_replicas);
- }
-
- ra->err[rb->idx] = bio->bi_status;
- closure_put(&ra->cl);
-}
-
-/*
- * XXX This allocates multiple times from the same mempools, and can deadlock
- * under sufficient memory pressure (but is only a debug path)
- */
-static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync)
-{
- struct bkey_s_c k = bkey_i_to_s_c(&b->key);
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded pick;
- struct btree_node_read_all *ra;
- unsigned i;
-
- ra = kzalloc(sizeof(*ra), GFP_NOFS);
- if (!ra)
- return bch_err_throw(c, ENOMEM_btree_node_read_all_replicas);
-
- closure_init(&ra->cl, NULL);
- ra->c = c;
- ra->b = b;
- ra->nr = bch2_bkey_nr_ptrs(k);
-
- for (i = 0; i < ra->nr; i++) {
- ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
- ra->bio[i] = bio_alloc_bioset(NULL,
- buf_pages(ra->buf[i], btree_buf_bytes(b)),
- REQ_OP_READ|REQ_SYNC|REQ_META,
- GFP_NOFS,
- &c->btree_bio);
- }
-
- i = 0;
- bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) {
- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
- BCH_DEV_READ_REF_btree_node_read_all_replicas);
- struct btree_read_bio *rb =
- container_of(ra->bio[i], struct btree_read_bio, bio);
- rb->c = c;
- rb->b = b;
- rb->ra = ra;
- rb->start_time = local_clock();
- rb->have_ioref = ca != NULL;
- rb->idx = i;
- rb->pick = pick;
- rb->bio.bi_iter.bi_sector = pick.ptr.offset;
- rb->bio.bi_end_io = btree_node_read_all_replicas_endio;
- bch2_bio_map(&rb->bio, ra->buf[i], btree_buf_bytes(b));
-
- if (rb->have_ioref) {
- this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
- bio_sectors(&rb->bio));
- bio_set_dev(&rb->bio, ca->disk_sb.bdev);
-
- closure_get(&ra->cl);
- submit_bio(&rb->bio);
- } else {
- ra->err[i] = BLK_STS_REMOVED;
- }
-
- i++;
- }
-
- if (sync) {
- closure_sync(&ra->cl);
- btree_node_read_all_replicas_done(&ra->cl.work);
- } else {
- continue_at(&ra->cl, btree_node_read_all_replicas_done,
- c->btree_read_complete_wq);
- }
-
- return 0;
-}
-
-void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
- bool sync)
-{
- struct bch_fs *c = trans->c;
- struct extent_ptr_decoded pick;
- struct btree_read_bio *rb;
- struct bch_dev *ca;
- struct bio *bio;
- int ret;
-
- trace_and_count(c, btree_node_read, trans, b);
-
- if (static_branch_unlikely(&bch2_verify_all_btree_replicas) &&
- !btree_node_read_all_replicas(c, b, sync))
- return;
-
- ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
- NULL, &pick, -1);
-
- if (ret <= 0) {
- bool ratelimit = true;
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
-
- prt_str(&buf, "btree node read error: no device to read from\n at ");
- bch2_btree_pos_to_text(&buf, c, b);
- prt_newline(&buf);
- bch2_btree_lost_data(c, &buf, b->c.btree_id);
-
- if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
- bch2_fs_emergency_read_only2(c, &buf))
- ratelimit = false;
-
- static DEFINE_RATELIMIT_STATE(rs,
- DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
- if (!ratelimit || __ratelimit(&rs))
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
-
- set_btree_node_read_error(b);
- clear_btree_node_read_in_flight(b);
- smp_mb__after_atomic();
- wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
- return;
- }
-
- ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read);
-
- bio = bio_alloc_bioset(NULL,
- buf_pages(b->data, btree_buf_bytes(b)),
- REQ_OP_READ|REQ_SYNC|REQ_META,
- GFP_NOFS,
- &c->btree_bio);
- rb = container_of(bio, struct btree_read_bio, bio);
- rb->c = c;
- rb->b = b;
- rb->ra = NULL;
- rb->start_time = local_clock();
- rb->have_ioref = ca != NULL;
- rb->pick = pick;
- INIT_WORK(&rb->work, btree_node_read_work);
- bio->bi_iter.bi_sector = pick.ptr.offset;
- bio->bi_end_io = btree_node_read_endio;
- bch2_bio_map(bio, b->data, btree_buf_bytes(b));
-
- async_object_list_add(c, btree_read_bio, rb, &rb->list_idx);
-
- if (rb->have_ioref) {
- this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
- bio_sectors(bio));
- bio_set_dev(bio, ca->disk_sb.bdev);
-
- if (sync) {
- submit_bio_wait(bio);
- bch2_latency_acct(ca, rb->start_time, READ);
- btree_node_read_work(&rb->work);
- } else {
- submit_bio(bio);
- }
- } else {
- bio->bi_status = BLK_STS_REMOVED;
-
- if (sync)
- btree_node_read_work(&rb->work);
- else
- queue_work(c->btree_read_complete_wq, &rb->work);
- }
-}
-
-static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
- const struct bkey_i *k, unsigned level)
-{
- struct bch_fs *c = trans->c;
- struct closure cl;
- struct btree *b;
- int ret;
-
- closure_init_stack(&cl);
-
- do {
- ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
- closure_sync(&cl);
- } while (ret);
-
- b = bch2_btree_node_mem_alloc(trans, level != 0);
- bch2_btree_cache_cannibalize_unlock(trans);
-
- BUG_ON(IS_ERR(b));
-
- bkey_copy(&b->key, k);
- BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
-
- set_btree_node_read_in_flight(b);
-
- /* we can't pass the trans to read_done() for fsck errors, so it must be unlocked */
- bch2_trans_unlock(trans);
- bch2_btree_node_read(trans, b, true);
-
- if (btree_node_read_error(b)) {
- mutex_lock(&c->btree_cache.lock);
- bch2_btree_node_hash_remove(&c->btree_cache, b);
- mutex_unlock(&c->btree_cache.lock);
-
- ret = bch_err_throw(c, btree_node_read_error);
- goto err;
- }
-
- bch2_btree_set_root_for_read(c, b);
-err:
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
-
- return ret;
-}
-
-int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
- const struct bkey_i *k, unsigned level)
-{
- return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level));
-}
-
-struct btree_node_scrub {
- struct bch_fs *c;
- struct bch_dev *ca;
- void *buf;
- bool used_mempool;
- unsigned written;
-
- enum btree_id btree;
- unsigned level;
- struct bkey_buf key;
- __le64 seq;
-
- struct work_struct work;
- struct bio bio;
-};
-
-static bool btree_node_scrub_check(struct bch_fs *c, struct btree_node *data, unsigned ptr_written,
- struct printbuf *err)
-{
- unsigned written = 0;
-
- if (le64_to_cpu(data->magic) != bset_magic(c)) {
- prt_printf(err, "bad magic: want %llx, got %llx",
- bset_magic(c), le64_to_cpu(data->magic));
- return false;
- }
-
- while (written < (ptr_written ?: btree_sectors(c))) {
- struct btree_node_entry *bne;
- struct bset *i;
- bool first = !written;
-
- if (first) {
- bne = NULL;
- i = &data->keys;
- } else {
- bne = (void *) data + (written << 9);
- i = &bne->keys;
-
- if (!ptr_written && i->seq != data->keys.seq)
- break;
- }
-
- struct nonce nonce = btree_nonce(i, written << 9);
- bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i));
-
- if (first) {
- if (good_csum_type) {
- struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, data);
- if (bch2_crc_cmp(data->csum, csum)) {
- bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), data->csum, csum);
- return false;
- }
- }
-
- written += vstruct_sectors(data, c->block_bits);
- } else {
- if (good_csum_type) {
- struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
- if (bch2_crc_cmp(bne->csum, csum)) {
- bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), bne->csum, csum);
- return false;
- }
- }
-
- written += vstruct_sectors(bne, c->block_bits);
- }
- }
-
- return true;
-}
-
-static void btree_node_scrub_work(struct work_struct *work)
-{
- struct btree_node_scrub *scrub = container_of(work, struct btree_node_scrub, work);
- struct bch_fs *c = scrub->c;
- struct printbuf err = PRINTBUF;
-
- __bch2_btree_pos_to_text(&err, c, scrub->btree, scrub->level,
- bkey_i_to_s_c(scrub->key.k));
- prt_newline(&err);
-
- if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) {
- int ret = bch2_trans_do(c,
- bch2_btree_node_rewrite_key(trans, scrub->btree, scrub->level - 1,
- scrub->key.k, 0));
- if (!bch2_err_matches(ret, ENOENT) &&
- !bch2_err_matches(ret, EROFS))
- bch_err_fn_ratelimited(c, ret);
- }
-
- printbuf_exit(&err);
-	bch2_bkey_buf_exit(&scrub->key, c);
- btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf);
- enumerated_ref_put(&scrub->ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub);
- kfree(scrub);
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub);
-}
-
-static void btree_node_scrub_endio(struct bio *bio)
-{
- struct btree_node_scrub *scrub = container_of(bio, struct btree_node_scrub, bio);
-
- queue_work(scrub->c->btree_read_complete_wq, &scrub->work);
-}
-
-int bch2_btree_node_scrub(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c k, unsigned dev)
-{
- if (k.k->type != KEY_TYPE_btree_ptr_v2)
- return 0;
-
- struct bch_fs *c = trans->c;
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_node_scrub))
- return bch_err_throw(c, erofs_no_writes);
-
- struct extent_ptr_decoded pick;
- int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev);
- if (ret <= 0)
- goto err;
-
- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
- BCH_DEV_READ_REF_btree_node_scrub);
- if (!ca) {
- ret = bch_err_throw(c, device_offline);
- goto err;
- }
-
- bool used_mempool = false;
- void *buf = btree_bounce_alloc(c, c->opts.btree_node_size, &used_mempool);
-
- unsigned vecs = buf_pages(buf, c->opts.btree_node_size);
-
- struct btree_node_scrub *scrub =
- kzalloc(sizeof(*scrub) + sizeof(struct bio_vec) * vecs, GFP_KERNEL);
- if (!scrub) {
- ret = -ENOMEM;
- goto err_free;
- }
-
- scrub->c = c;
- scrub->ca = ca;
- scrub->buf = buf;
- scrub->used_mempool = used_mempool;
- scrub->written = btree_ptr_sectors_written(k);
-
- scrub->btree = btree;
- scrub->level = level;
- bch2_bkey_buf_init(&scrub->key);
- bch2_bkey_buf_reassemble(&scrub->key, c, k);
- scrub->seq = bkey_s_c_to_btree_ptr_v2(k).v->seq;
-
- INIT_WORK(&scrub->work, btree_node_scrub_work);
-
- bio_init(&scrub->bio, ca->disk_sb.bdev, scrub->bio.bi_inline_vecs, vecs, REQ_OP_READ);
- bch2_bio_map(&scrub->bio, scrub->buf, c->opts.btree_node_size);
- scrub->bio.bi_iter.bi_sector = pick.ptr.offset;
- scrub->bio.bi_end_io = btree_node_scrub_endio;
- submit_bio(&scrub->bio);
- return 0;
-err_free:
- btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf);
- enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub);
-err:
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub);
- return ret;
-}
-
-static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
- struct btree_write *w)
-{
- unsigned long old, new;
-
- old = READ_ONCE(b->will_make_reachable);
- do {
- new = old;
- if (!(old & 1))
- break;
-
- new &= ~1UL;
- } while (!try_cmpxchg(&b->will_make_reachable, &old, new));
-
- if (old & 1)
- closure_put(&((struct btree_update *) new)->cl);
-
- bch2_journal_pin_drop(&c->journal, &w->journal);
-}
-
-static void __btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time)
-{
- struct btree_write *w = btree_prev_write(b);
- unsigned long old, new;
- unsigned type = 0;
-
- bch2_btree_complete_write(c, b, w);
-
- if (start_time)
- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_write], start_time);
-
- old = READ_ONCE(b->flags);
- do {
- new = old;
-
- if ((old & (1U << BTREE_NODE_dirty)) &&
- (old & (1U << BTREE_NODE_need_write)) &&
- !(old & (1U << BTREE_NODE_never_write)) &&
- !(old & (1U << BTREE_NODE_write_blocked)) &&
- !(old & (1U << BTREE_NODE_will_make_reachable))) {
- new &= ~(1U << BTREE_NODE_dirty);
- new &= ~(1U << BTREE_NODE_need_write);
- new |= (1U << BTREE_NODE_write_in_flight);
- new |= (1U << BTREE_NODE_write_in_flight_inner);
- new |= (1U << BTREE_NODE_just_written);
- new ^= (1U << BTREE_NODE_write_idx);
-
- type = new & BTREE_WRITE_TYPE_MASK;
- new &= ~BTREE_WRITE_TYPE_MASK;
- } else {
- new &= ~(1U << BTREE_NODE_write_in_flight);
- new &= ~(1U << BTREE_NODE_write_in_flight_inner);
- }
- } while (!try_cmpxchg(&b->flags, &old, new));
-
- if (new & (1U << BTREE_NODE_write_in_flight))
- __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type);
- else {
- smp_mb__after_atomic();
- wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
- }
-}
-
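__btree_node_write_done() above (and __bch2_btree_node_write() further down) update b->flags with the usual lock-free read/compute/try_cmpxchg retry loop instead of taking a lock. A minimal sketch of just that pattern, using C11 atomics and made-up flag bits so it stands alone (the real BTREE_NODE_* bits and transitions are richer):

/*
 * Sketch of the read-compute-compare_exchange loop used on b->flags.
 * Flag bits here are illustrative, not the real BTREE_NODE_* values.
 */
#include <stdatomic.h>
#include <stdbool.h>

#define F_DIRTY		(1UL << 0)
#define F_IN_FLIGHT	(1UL << 1)

static bool claim_write(_Atomic unsigned long *flags)
{
	unsigned long old = atomic_load(flags), new;

	do {
		if (!(old & F_DIRTY) || (old & F_IN_FLIGHT))
			return false;	/* nothing to do, or already claimed */
		new = (old & ~F_DIRTY) | F_IN_FLIGHT;
		/* on failure, old is refreshed and the loop retries */
	} while (!atomic_compare_exchange_weak(flags, &old, new));

	return true;		/* we own this write */
}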
-static void btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time)
-{
- struct btree_trans *trans = bch2_trans_get(c);
-
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
-
- /* we don't need transaction context anymore after we got the lock. */
- bch2_trans_put(trans);
- __btree_node_write_done(c, b, start_time);
- six_unlock_read(&b->c.lock);
-}
-
-static void btree_node_write_work(struct work_struct *work)
-{
- struct btree_write_bio *wbio =
- container_of(work, struct btree_write_bio, work);
- struct bch_fs *c = wbio->wbio.c;
- struct btree *b = wbio->wbio.bio.bi_private;
- u64 start_time = wbio->start_time;
- int ret = 0;
-
- btree_bounce_free(c,
- wbio->data_bytes,
- wbio->wbio.used_mempool,
- wbio->data);
-
- bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
- bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
-
- if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
- ret = bch_err_throw(c, btree_node_write_all_failed);
- goto err;
- }
-
- if (wbio->wbio.first_btree_write) {
- if (wbio->wbio.failed.nr) {
-
- }
- } else {
- ret = bch2_trans_do(c,
- bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
- BCH_WATERMARK_interior_updates|
- BCH_TRANS_COMMIT_journal_reclaim|
- BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_no_check_rw,
- !wbio->wbio.failed.nr));
- if (ret)
- goto err;
- }
-out:
- async_object_list_del(c, btree_write_bio, wbio->list_idx);
- bio_put(&wbio->wbio.bio);
- btree_node_write_done(c, b, start_time);
- return;
-err:
- set_btree_node_noevict(b);
-
- if (!bch2_err_matches(ret, EROFS)) {
- struct printbuf buf = PRINTBUF;
- prt_printf(&buf, "writing btree node: %s\n ", bch2_err_str(ret));
- bch2_btree_pos_to_text(&buf, c, b);
- bch2_fs_fatal_error(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
- goto out;
-}
-
-static void btree_node_write_endio(struct bio *bio)
-{
- struct bch_write_bio *wbio = to_wbio(bio);
- struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
- struct bch_write_bio *orig = parent ?: wbio;
- struct btree_write_bio *wb = container_of(orig, struct btree_write_bio, wbio);
- struct bch_fs *c = wbio->c;
- struct btree *b = wbio->bio.bi_private;
- struct bch_dev *ca = wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL;
-
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
- wbio->submit_time, !bio->bi_status);
-
- if (ca && bio->bi_status) {
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
- prt_printf(&buf, "btree write error: %s\n ",
- bch2_blk_status_to_str(bio->bi_status));
- bch2_btree_pos_to_text(&buf, c, b);
- bch_err_dev_ratelimited(ca, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-
- if (bio->bi_status) {
- unsigned long flags;
- spin_lock_irqsave(&c->btree_write_error_lock, flags);
- bch2_dev_list_add_dev(&orig->failed, wbio->dev);
- spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
- }
-
- /*
- * XXX: we should be using io_ref[WRITE], but we aren't retrying failed
- * btree writes yet (due to device removal/ro):
- */
- if (wbio->have_ioref)
- enumerated_ref_put(&ca->io_ref[READ],
- BCH_DEV_READ_REF_btree_node_write);
-
- if (parent) {
- bio_put(bio);
- bio_endio(&parent->bio);
- return;
- }
-
- clear_btree_node_write_in_flight_inner(b);
- smp_mb__after_atomic();
- wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner);
- INIT_WORK(&wb->work, btree_node_write_work);
- queue_work(c->btree_write_complete_wq, &wb->work);
-}
-
-static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
- struct bset *i)
-{
- int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key),
- (struct bkey_validate_context) {
- .from = BKEY_VALIDATE_btree_node,
- .level = b->c.level + 1,
- .btree = b->c.btree_id,
- .flags = BCH_VALIDATE_write,
- });
- if (ret) {
- bch2_fs_inconsistent(c, "invalid btree node key before write");
- return ret;
- }
-
- ret = validate_bset_keys(c, b, i, WRITE, NULL, NULL) ?:
- validate_bset(c, NULL, b, i, b->written, WRITE, NULL, NULL);
- if (ret) {
- bch2_inconsistent_error(c);
- dump_stack();
- }
-
- return ret;
-}
-
-static void btree_write_submit(struct work_struct *work)
-{
- struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work);
- BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
-
- bkey_copy(&tmp.k, &wbio->key);
-
- bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr)
- ptr->offset += wbio->sector_offset;
-
- bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree,
- &tmp.k, false);
-}
-
-void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
-{
- struct btree_write_bio *wbio;
- struct bset *i;
- struct btree_node *bn = NULL;
- struct btree_node_entry *bne = NULL;
- struct sort_iter_stack sort_iter;
- struct nonce nonce;
- unsigned bytes_to_write, sectors_to_write, bytes, u64s;
- u64 seq = 0;
- bool used_mempool;
- unsigned long old, new;
- bool validate_before_checksum = false;
- enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK;
- void *data;
- u64 start_time = local_clock();
- int ret;
-
- if (flags & BTREE_WRITE_ALREADY_STARTED)
- goto do_write;
-
- /*
- * We may only have a read lock on the btree node - the dirty bit is our
- * "lock" against racing with other threads that may be trying to start
-	 * a write: we do a write iff we clear the dirty bit. Since setting the
- * dirty bit requires a write lock, we can't race with other threads
- * redirtying it:
- */
- old = READ_ONCE(b->flags);
- do {
- new = old;
-
- if (!(old & (1 << BTREE_NODE_dirty)))
- return;
-
- if ((flags & BTREE_WRITE_ONLY_IF_NEED) &&
- !(old & (1 << BTREE_NODE_need_write)))
- return;
-
- if (old &
- ((1 << BTREE_NODE_never_write)|
- (1 << BTREE_NODE_write_blocked)))
- return;
-
- if (b->written &&
- (old & (1 << BTREE_NODE_will_make_reachable)))
- return;
-
- if (old & (1 << BTREE_NODE_write_in_flight))
- return;
-
- if (flags & BTREE_WRITE_ONLY_IF_NEED)
- type = new & BTREE_WRITE_TYPE_MASK;
- new &= ~BTREE_WRITE_TYPE_MASK;
-
- new &= ~(1 << BTREE_NODE_dirty);
- new &= ~(1 << BTREE_NODE_need_write);
- new |= (1 << BTREE_NODE_write_in_flight);
- new |= (1 << BTREE_NODE_write_in_flight_inner);
- new |= (1 << BTREE_NODE_just_written);
- new ^= (1 << BTREE_NODE_write_idx);
- } while (!try_cmpxchg_acquire(&b->flags, &old, new));
-
- if (new & (1U << BTREE_NODE_need_write))
- return;
-do_write:
- BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0));
-
- atomic_long_dec(&c->btree_cache.nr_dirty);
-
- BUG_ON(btree_node_fake(b));
- BUG_ON((b->will_make_reachable != 0) != !b->written);
-
- BUG_ON(b->written >= btree_sectors(c));
- BUG_ON(b->written & (block_sectors(c) - 1));
- BUG_ON(bset_written(b, btree_bset_last(b)));
- BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
- BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
-
- bch2_sort_whiteouts(c, b);
-
- sort_iter_stack_init(&sort_iter, b);
-
- bytes = !b->written
- ? sizeof(struct btree_node)
- : sizeof(struct btree_node_entry);
-
- bytes += b->whiteout_u64s * sizeof(u64);
-
- for_each_bset(b, t) {
- i = bset(b, t);
-
- if (bset_written(b, i))
- continue;
-
- bytes += le16_to_cpu(i->u64s) * sizeof(u64);
- sort_iter_add(&sort_iter.iter,
- btree_bkey_first(b, t),
- btree_bkey_last(b, t));
- seq = max(seq, le64_to_cpu(i->journal_seq));
- }
-
- BUG_ON(b->written && !seq);
-
- /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */
- bytes += 8;
-
- /* buffer must be a multiple of the block size */
- bytes = round_up(bytes, block_bytes(c));
-
- data = btree_bounce_alloc(c, bytes, &used_mempool);
-
- if (!b->written) {
- bn = data;
- *bn = *b->data;
- i = &bn->keys;
- } else {
- bne = data;
- bne->keys = b->data->keys;
- i = &bne->keys;
- }
-
- i->journal_seq = cpu_to_le64(seq);
- i->u64s = 0;
-
- sort_iter_add(&sort_iter.iter,
- unwritten_whiteouts_start(b),
- unwritten_whiteouts_end(b));
- SET_BSET_SEPARATE_WHITEOUTS(i, false);
-
- u64s = bch2_sort_keys_keep_unwritten_whiteouts(i->start, &sort_iter.iter);
- le16_add_cpu(&i->u64s, u64s);
-
- b->whiteout_u64s = 0;
-
- BUG_ON(!b->written && i->u64s != b->data->keys.u64s);
-
- set_needs_whiteout(i, false);
-
- /* do we have data to write? */
- if (b->written && !i->u64s)
- goto nowrite;
-
- bytes_to_write = vstruct_end(i) - data;
- sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
-
- if (!b->written &&
- b->key.k.type == KEY_TYPE_btree_ptr_v2)
- BUG_ON(btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)) != sectors_to_write);
-
- memset(data + bytes_to_write, 0,
- (sectors_to_write << 9) - bytes_to_write);
-
- BUG_ON(b->written + sectors_to_write > btree_sectors(c));
- BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
- BUG_ON(i->seq != b->data->keys.seq);
-
- i->version = cpu_to_le16(c->sb.version);
- SET_BSET_OFFSET(i, b->written);
- SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));
-
- if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)))
- validate_before_checksum = true;
-
- /* validate_bset will be modifying: */
- if (le16_to_cpu(i->version) < bcachefs_metadata_version_current)
- validate_before_checksum = true;
-
- /* if we're going to be encrypting, check metadata validity first: */
- if (validate_before_checksum &&
- validate_bset_for_write(c, b, i))
- goto err;
-
- ret = bset_encrypt(c, i, b->written << 9);
- if (bch2_fs_fatal_err_on(ret, c,
- "encrypting btree node: %s", bch2_err_str(ret)))
- goto err;
-
- nonce = btree_nonce(i, b->written << 9);
-
- if (bn)
- bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
- else
- bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
-
- /* if we're not encrypting, check metadata after checksumming: */
- if (!validate_before_checksum &&
- validate_bset_for_write(c, b, i))
- goto err;
-
- /*
- * We handle btree write errors by immediately halting the journal -
- * after we've done that, we can't issue any subsequent btree writes
- * because they might have pointers to new nodes that failed to write.
- *
- * Furthermore, there's no point in doing any more btree writes because
- * with the journal stopped, we're never going to update the journal to
- * reflect that those writes were done and the data flushed from the
- * journal:
- *
- * Also on journal error, the pending write may have updates that were
- * never journalled (interior nodes, see btree_update_nodes_written()) -
- * it's critical that we don't do the write in that case otherwise we
- * will have updates visible that weren't in the journal:
- *
- * Make sure to update b->written so bch2_btree_init_next() doesn't
- * break:
- */
- if (bch2_journal_error(&c->journal) ||
- c->opts.nochanges)
- goto err;
-
- trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write);
-
- wbio = container_of(bio_alloc_bioset(NULL,
- buf_pages(data, sectors_to_write << 9),
- REQ_OP_WRITE|REQ_META,
- GFP_NOFS,
- &c->btree_bio),
- struct btree_write_bio, wbio.bio);
- wbio_init(&wbio->wbio.bio);
- wbio->data = data;
- wbio->data_bytes = bytes;
- wbio->sector_offset = b->written;
- wbio->start_time = start_time;
- wbio->wbio.c = c;
- wbio->wbio.used_mempool = used_mempool;
- wbio->wbio.first_btree_write = !b->written;
- wbio->wbio.bio.bi_end_io = btree_node_write_endio;
- wbio->wbio.bio.bi_private = b;
-
- bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);
-
- bkey_copy(&wbio->key, &b->key);
-
- b->written += sectors_to_write;
-
- if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2)
- bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written =
- cpu_to_le16(b->written);
-
- atomic64_inc(&c->btree_write_stats[type].nr);
- atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);
-
- async_object_list_add(c, btree_write_bio, wbio, &wbio->list_idx);
-
- INIT_WORK(&wbio->work, btree_write_submit);
- queue_work(c->btree_write_submit_wq, &wbio->work);
- return;
-err:
- set_btree_node_noevict(b);
- b->written += sectors_to_write;
-nowrite:
- btree_bounce_free(c, bytes, used_mempool, data);
- __btree_node_write_done(c, b, 0);
-}
-
-/*
- * Work that must be done with write lock held:
- */
-bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
-{
- bool invalidated_iter = false;
- struct btree_node_entry *bne;
-
- if (!btree_node_just_written(b))
- return false;
-
- BUG_ON(b->whiteout_u64s);
-
- clear_btree_node_just_written(b);
-
- /*
- * Note: immediately after write, bset_written() doesn't work - the
- * amount of data we had to write after compaction might have been
- * smaller than the offset of the last bset.
- *
- * However, we know that all bsets have been written here, as long as
- * we're still holding the write lock:
- */
-
- /*
- * XXX: decide if we really want to unconditionally sort down to a
- * single bset:
- */
- if (b->nsets > 1) {
- btree_node_sort(c, b, 0, b->nsets);
- invalidated_iter = true;
- } else {
- invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL);
- }
-
- for_each_bset(b, t)
- set_needs_whiteout(bset(b, t), true);
-
- bch2_btree_verify(c, b);
-
- /*
- * If later we don't unconditionally sort down to a single bset, we have
- * to ensure this is still true:
- */
- BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b));
-
- bne = want_new_bset(c, b);
- if (bne)
- bch2_bset_init_next(b, bne);
-
- bch2_btree_build_aux_trees(b);
-
- return invalidated_iter;
-}
-
-/*
- * Use this one if the node is intent locked:
- */
-void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
- enum six_lock_type lock_type_held,
- unsigned flags)
-{
- if (lock_type_held == SIX_LOCK_intent ||
- (lock_type_held == SIX_LOCK_read &&
- six_lock_tryupgrade(&b->c.lock))) {
- __bch2_btree_node_write(c, b, flags);
-
- /* don't cycle lock unnecessarily: */
- if (btree_node_just_written(b) &&
- six_trylock_write(&b->c.lock)) {
- bch2_btree_post_write_cleanup(c, b);
- six_unlock_write(&b->c.lock);
- }
-
- if (lock_type_held == SIX_LOCK_read)
- six_lock_downgrade(&b->c.lock);
- } else {
- __bch2_btree_node_write(c, b, flags);
- if (lock_type_held == SIX_LOCK_write &&
- btree_node_just_written(b))
- bch2_btree_post_write_cleanup(c, b);
- }
-}
-
-void bch2_btree_node_write_trans(struct btree_trans *trans, struct btree *b,
- enum six_lock_type lock_type_held,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
-
- if (lock_type_held == SIX_LOCK_intent ||
- (lock_type_held == SIX_LOCK_read &&
- six_lock_tryupgrade(&b->c.lock))) {
- __bch2_btree_node_write(c, b, flags);
-
- /* don't cycle lock unnecessarily: */
- if (btree_node_just_written(b) &&
- six_trylock_write(&b->c.lock)) {
- bch2_btree_post_write_cleanup(c, b);
- __bch2_btree_node_unlock_write(trans, b);
- }
-
- if (lock_type_held == SIX_LOCK_read)
- six_lock_downgrade(&b->c.lock);
- } else {
- __bch2_btree_node_write(c, b, flags);
- if (lock_type_held == SIX_LOCK_write &&
- btree_node_just_written(b))
- bch2_btree_post_write_cleanup(c, b);
- }
-}
-
-static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
-{
- struct bucket_table *tbl;
- struct rhash_head *pos;
- struct btree *b;
- unsigned i;
- bool ret = false;
-restart:
- rcu_read_lock();
- for_each_cached_btree(b, c, tbl, i, pos)
- if (test_bit(flag, &b->flags)) {
- rcu_read_unlock();
- wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
- ret = true;
- goto restart;
- }
- rcu_read_unlock();
-
- return ret;
-}
-
-bool bch2_btree_flush_all_reads(struct bch_fs *c)
-{
- return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
-}
-
-bool bch2_btree_flush_all_writes(struct bch_fs *c)
-{
- return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
-}
-
-static const char * const bch2_btree_write_types[] = {
-#define x(t, n) [n] = #t,
- BCH_BTREE_WRITE_TYPES()
- NULL
-};
-
-void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c)
-{
- printbuf_tabstop_push(out, 20);
- printbuf_tabstop_push(out, 10);
-
- prt_printf(out, "\tnr\tsize\n");
-
- for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) {
- u64 nr = atomic64_read(&c->btree_write_stats[i].nr);
- u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes);
-
- prt_printf(out, "%s:\t%llu\t", bch2_btree_write_types[i], nr);
- prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0);
- prt_newline(out);
- }
-}
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
deleted file mode 100644
index 30a5180532c8..000000000000
--- a/fs/bcachefs/btree_io.h
+++ /dev/null
@@ -1,239 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_IO_H
-#define _BCACHEFS_BTREE_IO_H
-
-#include "bkey_methods.h"
-#include "bset.h"
-#include "btree_locking.h"
-#include "checksum.h"
-#include "extents.h"
-#include "io_write_types.h"
-
-struct bch_fs;
-struct btree_write;
-struct btree;
-struct btree_iter;
-struct btree_node_read_all;
-
-static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
-{
- if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags))
- atomic_long_inc(&c->btree_cache.nr_dirty);
-}
-
-static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
-{
- if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags))
- atomic_long_dec(&c->btree_cache.nr_dirty);
-}
-
-static inline unsigned btree_ptr_sectors_written(struct bkey_s_c k)
-{
- return k.k->type == KEY_TYPE_btree_ptr_v2
- ? le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors_written)
- : 0;
-}
-
-struct btree_read_bio {
- struct bch_fs *c;
- struct btree *b;
- struct btree_node_read_all *ra;
- u64 start_time;
- unsigned have_ioref:1;
- unsigned idx:7;
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
- unsigned list_idx;
-#endif
- struct extent_ptr_decoded pick;
- struct work_struct work;
- struct bio bio;
-};
-
-struct btree_write_bio {
- struct work_struct work;
- __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
- void *data;
- unsigned data_bytes;
- unsigned sector_offset;
- u64 start_time;
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
- unsigned list_idx;
-#endif
- struct bch_write_bio wbio;
-};
-
-void bch2_btree_node_io_unlock(struct btree *);
-void bch2_btree_node_io_lock(struct btree *);
-void __bch2_btree_node_wait_on_read(struct btree *);
-void __bch2_btree_node_wait_on_write(struct btree *);
-void bch2_btree_node_wait_on_read(struct btree *);
-void bch2_btree_node_wait_on_write(struct btree *);
-
-enum compact_mode {
- COMPACT_LAZY,
- COMPACT_ALL,
-};
-
-bool bch2_compact_whiteouts(struct bch_fs *, struct btree *,
- enum compact_mode);
-
-static inline bool should_compact_bset_lazy(struct btree *b,
- struct bset_tree *t)
-{
- unsigned total_u64s = bset_u64s(t);
- unsigned dead_u64s = bset_dead_u64s(b, t);
-
- return dead_u64s > 64 && dead_u64s * 3 > total_u64s;
-}
-
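As a worked example of the should_compact_bset_lazy() heuristic above (numbers invented purely for illustration): with total_u64s = 300 and dead_u64s = 120 the test fires, since 120 > 64 and 120 * 3 = 360 > 300; with dead_u64s = 90 it does not, since 90 * 3 = 270 < 300. In other words, lazy compaction only bothers once more than a third of a bset, and at least 64 u64s of it, is dead weight.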
-static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b)
-{
- for_each_bset(b, t)
- if (should_compact_bset_lazy(b, t))
- return bch2_compact_whiteouts(c, b, COMPACT_LAZY);
-
- return false;
-}
-
-static inline struct nonce btree_nonce(struct bset *i, unsigned offset)
-{
- return (struct nonce) {{
- [0] = cpu_to_le32(offset),
- [1] = ((__le32 *) &i->seq)[0],
- [2] = ((__le32 *) &i->seq)[1],
- [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE,
- }};
-}
-
-static inline int bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
-{
- struct nonce nonce = btree_nonce(i, offset);
- int ret;
-
- if (!offset) {
- struct btree_node *bn = container_of(i, struct btree_node, keys);
- unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
-
- ret = bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce,
- &bn->flags, bytes);
- if (ret)
- return ret;
-
- nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
- }
-
- return bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
- vstruct_end(i) - (void *) i->_data);
-}
-
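One detail worth calling out in bset_encrypt() above: after encrypting the btree node header fields it advances the nonce with nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)) rather than by the raw byte count, so the key area is encrypted starting on a fresh ChaCha block and does not reuse keystream that already covered the header. Assuming CHACHA_BLOCK_SIZE is 64 (its value in the kernel's include/crypto/chacha.h), a header span of, say, 40 bytes advances the nonce by round_up(40, 64) = 64.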
-void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
-
-void bch2_btree_node_drop_keys_outside_node(struct btree *);
-
-void bch2_btree_build_aux_trees(struct btree *);
-void bch2_btree_init_next(struct btree_trans *, struct btree *);
-
-int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
- struct btree *,
- struct bch_io_failures *,
- struct printbuf *);
-void bch2_btree_node_read(struct btree_trans *, struct btree *, bool);
-int bch2_btree_root_read(struct bch_fs *, enum btree_id,
- const struct bkey_i *, unsigned);
-
-void bch2_btree_read_bio_to_text(struct printbuf *, struct btree_read_bio *);
-
-int bch2_btree_node_scrub(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, unsigned);
-
-bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
-
-enum btree_write_flags {
- __BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS,
- __BTREE_WRITE_ALREADY_STARTED,
-};
-#define BTREE_WRITE_ONLY_IF_NEED BIT(__BTREE_WRITE_ONLY_IF_NEED)
-#define BTREE_WRITE_ALREADY_STARTED BIT(__BTREE_WRITE_ALREADY_STARTED)
-
-void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned);
-void bch2_btree_node_write(struct bch_fs *, struct btree *,
- enum six_lock_type, unsigned);
-void bch2_btree_node_write_trans(struct btree_trans *, struct btree *,
- enum six_lock_type, unsigned);
-
-static inline void btree_node_write_if_need(struct btree_trans *trans, struct btree *b,
- enum six_lock_type lock_held)
-{
- bch2_btree_node_write_trans(trans, b, lock_held, BTREE_WRITE_ONLY_IF_NEED);
-}
-
-bool bch2_btree_flush_all_reads(struct bch_fs *);
-bool bch2_btree_flush_all_writes(struct bch_fs *);
-
-static inline void compat_bformat(unsigned level, enum btree_id btree_id,
- unsigned version, unsigned big_endian,
- int write, struct bkey_format *f)
-{
- if (version < bcachefs_metadata_version_inode_btree_change &&
- btree_id == BTREE_ID_inodes) {
- swap(f->bits_per_field[BKEY_FIELD_INODE],
- f->bits_per_field[BKEY_FIELD_OFFSET]);
- swap(f->field_offset[BKEY_FIELD_INODE],
- f->field_offset[BKEY_FIELD_OFFSET]);
- }
-
- if (version < bcachefs_metadata_version_snapshot &&
- (level || btree_type_has_snapshots(btree_id))) {
- u64 max_packed =
- ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
-
- f->field_offset[BKEY_FIELD_SNAPSHOT] = write
- ? 0
- : cpu_to_le64(U32_MAX - max_packed);
- }
-}
-
-static inline void compat_bpos(unsigned level, enum btree_id btree_id,
- unsigned version, unsigned big_endian,
- int write, struct bpos *p)
-{
- if (big_endian != CPU_BIG_ENDIAN)
- bch2_bpos_swab(p);
-
- if (version < bcachefs_metadata_version_inode_btree_change &&
- btree_id == BTREE_ID_inodes)
- swap(p->inode, p->offset);
-}
-
-static inline void compat_btree_node(unsigned level, enum btree_id btree_id,
- unsigned version, unsigned big_endian,
- int write,
- struct btree_node *bn)
-{
- if (version < bcachefs_metadata_version_inode_btree_change &&
- btree_id_is_extents(btree_id) &&
- !bpos_eq(bn->min_key, POS_MIN) &&
- write)
- bn->min_key = bpos_nosnap_predecessor(bn->min_key);
-
- if (version < bcachefs_metadata_version_snapshot &&
- write)
- bn->max_key.snapshot = 0;
-
- compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key);
- compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key);
-
- if (version < bcachefs_metadata_version_snapshot &&
- !write)
- bn->max_key.snapshot = U32_MAX;
-
- if (version < bcachefs_metadata_version_inode_btree_change &&
- btree_id_is_extents(btree_id) &&
- !bpos_eq(bn->min_key, POS_MIN) &&
- !write)
- bn->min_key = bpos_nosnap_successor(bn->min_key);
-}
-
-void bch2_btree_write_stats_to_text(struct printbuf *, struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_IO_H */
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
deleted file mode 100644
index f8829b667ad3..000000000000
--- a/fs/bcachefs/btree_iter.c
+++ /dev/null
@@ -1,3804 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_methods.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "btree_journal_iter.h"
-#include "btree_key_cache.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "debug.h"
-#include "error.h"
-#include "extents.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "replicas.h"
-#include "snapshot.h"
-#include "super.h"
-#include "trace.h"
-
-#include <linux/random.h>
-#include <linux/prefetch.h>
-
-static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
-static inline void btree_path_list_add(struct btree_trans *,
- btree_path_idx_t, btree_path_idx_t);
-
-static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
-{
-#ifdef TRACK_PATH_ALLOCATED
- return iter->ip_allocated;
-#else
- return 0;
-#endif
-}
-
-static btree_path_idx_t btree_path_alloc(struct btree_trans *, btree_path_idx_t);
-static void bch2_trans_srcu_lock(struct btree_trans *);
-
-static inline int __btree_path_cmp(const struct btree_path *l,
- enum btree_id r_btree_id,
- bool r_cached,
- struct bpos r_pos,
- unsigned r_level)
-{
- /*
- * Must match lock ordering as defined by __bch2_btree_node_lock:
- */
- return cmp_int(l->btree_id, r_btree_id) ?:
- cmp_int((int) l->cached, (int) r_cached) ?:
- bpos_cmp(l->pos, r_pos) ?:
- -cmp_int(l->level, r_level);
-}
-
-static inline int btree_path_cmp(const struct btree_path *l,
- const struct btree_path *r)
-{
- return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level);
-}
-
-static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
-{
- /* Are we iterating over keys in all snapshots? */
- if (iter->flags & BTREE_ITER_all_snapshots) {
- p = bpos_successor(p);
- } else {
- p = bpos_nosnap_successor(p);
- p.snapshot = iter->snapshot;
- }
-
- return p;
-}
-
-static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
-{
- /* Are we iterating over keys in all snapshots? */
- if (iter->flags & BTREE_ITER_all_snapshots) {
- p = bpos_predecessor(p);
- } else {
- p = bpos_nosnap_predecessor(p);
- p.snapshot = iter->snapshot;
- }
-
- return p;
-}
-
-static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
-{
- struct bpos pos = iter->pos;
-
- if ((iter->flags & BTREE_ITER_is_extents) &&
- !bkey_eq(pos, POS_MAX))
- pos = bkey_successor(iter, pos);
- return pos;
-}
-
-static inline bool btree_path_pos_before_node(struct btree_path *path,
- struct btree *b)
-{
- return bpos_lt(path->pos, b->data->min_key);
-}
-
-static inline bool btree_path_pos_after_node(struct btree_path *path,
- struct btree *b)
-{
- return bpos_gt(path->pos, b->key.k.p);
-}
-
-static inline bool btree_path_pos_in_node(struct btree_path *path,
- struct btree *b)
-{
- return path->btree_id == b->c.btree_id &&
- !btree_path_pos_before_node(path, b) &&
- !btree_path_pos_after_node(path, b);
-}
-
-/* Debug: */
-
-static void __bch2_btree_path_verify_cached(struct btree_trans *trans,
- struct btree_path *path)
-{
- struct bkey_cached *ck;
- bool locked = btree_node_locked(path, 0);
-
- if (!bch2_btree_node_relock(trans, path, 0))
- return;
-
- ck = (void *) path->l[0].b;
- BUG_ON(ck->key.btree_id != path->btree_id ||
- !bkey_eq(ck->key.pos, path->pos));
-
- if (!locked)
- btree_node_unlock(trans, path, 0);
-}
-
-static void __bch2_btree_path_verify_level(struct btree_trans *trans,
- struct btree_path *path, unsigned level)
-{
- struct btree_path_level *l;
- struct btree_node_iter tmp;
- bool locked;
- struct bkey_packed *p, *k;
- struct printbuf buf1 = PRINTBUF;
- struct printbuf buf2 = PRINTBUF;
- struct printbuf buf3 = PRINTBUF;
- const char *msg;
-
- l = &path->l[level];
- tmp = l->iter;
- locked = btree_node_locked(path, level);
-
- if (path->cached) {
- if (!level)
- __bch2_btree_path_verify_cached(trans, path);
- return;
- }
-
- if (!btree_path_node(path, level))
- return;
-
- if (!bch2_btree_node_relock_notrace(trans, path, level))
- return;
-
- BUG_ON(!btree_path_pos_in_node(path, l->b));
-
- bch2_btree_node_iter_verify(&l->iter, l->b);
-
- /*
- * For interior nodes, the iterator will have skipped past deleted keys:
- */
- p = level
- ? bch2_btree_node_iter_prev(&tmp, l->b)
- : bch2_btree_node_iter_prev_all(&tmp, l->b);
- k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
-
- if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) {
- msg = "before";
- goto err;
- }
-
- if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
- msg = "after";
- goto err;
- }
-
- if (!locked)
- btree_node_unlock(trans, path, level);
- return;
-err:
- bch2_bpos_to_text(&buf1, path->pos);
-
- if (p) {
- struct bkey uk = bkey_unpack_key(l->b, p);
-
- bch2_bkey_to_text(&buf2, &uk);
- } else {
- prt_printf(&buf2, "(none)");
- }
-
- if (k) {
- struct bkey uk = bkey_unpack_key(l->b, k);
-
- bch2_bkey_to_text(&buf3, &uk);
- } else {
- prt_printf(&buf3, "(none)");
- }
-
- panic("path should be %s key at level %u:\n"
- "path pos %s\n"
- "prev key %s\n"
- "cur key %s\n",
- msg, level, buf1.buf, buf2.buf, buf3.buf);
-}
-
-static void __bch2_btree_path_verify(struct btree_trans *trans,
- struct btree_path *path)
-{
- struct bch_fs *c = trans->c;
-
- for (unsigned i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
- if (!path->l[i].b) {
- BUG_ON(!path->cached &&
- bch2_btree_id_root(c, path->btree_id)->b->c.level > i);
- break;
- }
-
- __bch2_btree_path_verify_level(trans, path, i);
- }
-
- bch2_btree_path_verify_locks(trans, path);
-}
-
-void __bch2_trans_verify_paths(struct btree_trans *trans)
-{
- struct btree_path *path;
- unsigned iter;
-
- trans_for_each_path(trans, path, iter)
- __bch2_btree_path_verify(trans, path);
-}
-
-static void __bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter *iter)
-{
- BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached);
-
- BUG_ON((iter->flags & BTREE_ITER_is_extents) &&
- (iter->flags & BTREE_ITER_all_snapshots));
-
- BUG_ON(!(iter->flags & BTREE_ITER_snapshot_field) &&
- (iter->flags & BTREE_ITER_all_snapshots) &&
- !btree_type_has_snapshot_field(iter->btree_id));
-
- if (iter->update_path)
- __bch2_btree_path_verify(trans, &trans->paths[iter->update_path]);
- __bch2_btree_path_verify(trans, btree_iter_path(trans, iter));
-}
-
-static void __bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
-{
- BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) &&
- !iter->pos.snapshot);
-
- BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) &&
- iter->pos.snapshot != iter->snapshot);
-
- BUG_ON(iter->flags & BTREE_ITER_all_snapshots ? !bpos_eq(iter->pos, iter->k.p) :
- !(iter->flags & BTREE_ITER_is_extents) ? !bkey_eq(iter->pos, iter->k.p) :
- (bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
- bkey_gt(iter->pos, iter->k.p)));
-}
-
-static int __bch2_btree_iter_verify_ret(struct btree_trans *trans,
- struct btree_iter *iter, struct bkey_s_c k)
-{
- struct btree_iter copy;
- struct bkey_s_c prev;
- int ret = 0;
-
- if (!(iter->flags & BTREE_ITER_filter_snapshots))
- return 0;
-
- if (bkey_err(k) || !k.k)
- return 0;
-
- BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
- iter->snapshot,
- k.k->p.snapshot));
-
- bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
- BTREE_ITER_nopreserve|
- BTREE_ITER_all_snapshots);
- prev = bch2_btree_iter_prev(trans, &copy);
- if (!prev.k)
- goto out;
-
- ret = bkey_err(prev);
- if (ret)
- goto out;
-
- if (bkey_eq(prev.k->p, k.k->p) &&
- bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
- prev.k->p.snapshot) > 0) {
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
-
- bch2_bkey_to_text(&buf1, k.k);
- bch2_bkey_to_text(&buf2, prev.k);
-
- panic("iter snap %u\n"
- "k %s\n"
- "prev %s\n",
- iter->snapshot,
- buf1.buf, buf2.buf);
- }
-out:
- bch2_trans_iter_exit(trans, &copy);
- return ret;
-}
-
-void __bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
- struct bpos pos)
-{
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
- struct btree_path *path;
- struct trans_for_each_path_inorder_iter iter;
- struct printbuf buf = PRINTBUF;
-
- btree_trans_sort_paths(trans);
-
- trans_for_each_path_inorder(trans, path, iter) {
- if (path->btree_id != id ||
- !btree_node_locked(path, 0) ||
- !path->should_be_locked)
- continue;
-
- if (!path->cached) {
- if (bkey_ge(pos, path->l[0].b->data->min_key) &&
- bkey_le(pos, path->l[0].b->key.k.p))
- return;
- } else {
- if (bkey_eq(pos, path->pos))
- return;
- }
- }
-
- bch2_dump_trans_paths_updates(trans);
- bch2_bpos_to_text(&buf, pos);
-
- panic("not locked: %s %s\n", bch2_btree_id_str(id), buf.buf);
-}
-
-static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
- struct btree_path *path, unsigned l)
-{
- if (static_branch_unlikely(&bch2_debug_check_iterators))
- __bch2_btree_path_verify_level(trans, path, l);
-}
-
-static inline void bch2_btree_path_verify(struct btree_trans *trans,
- struct btree_path *path)
-{
- if (static_branch_unlikely(&bch2_debug_check_iterators))
- __bch2_btree_path_verify(trans, path);
-}
-
-static inline void bch2_btree_iter_verify(struct btree_trans *trans,
- struct btree_iter *iter)
-{
- if (static_branch_unlikely(&bch2_debug_check_iterators))
- __bch2_btree_iter_verify(trans, iter);
-}
-
-static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
-{
- if (static_branch_unlikely(&bch2_debug_check_iterators))
- __bch2_btree_iter_verify_entry_exit(iter);
-}
-
-static inline int bch2_btree_iter_verify_ret(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k)
-{
- return static_branch_unlikely(&bch2_debug_check_iterators)
- ? __bch2_btree_iter_verify_ret(trans, iter, k)
- : 0;
-}
-
-/* Btree path: fixups after btree updates */
-
-static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
- struct btree *b,
- struct bset_tree *t,
- struct bkey_packed *k)
-{
- struct btree_node_iter_set *set;
-
- btree_node_iter_for_each(iter, set)
- if (set->end == t->end_offset) {
- set->k = __btree_node_key_to_offset(b, k);
- bch2_btree_node_iter_sort(iter, b);
- return;
- }
-
- bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
-}
-
-static void __bch2_btree_path_fix_key_modified(struct btree_path *path,
- struct btree *b,
- struct bkey_packed *where)
-{
- struct btree_path_level *l = &path->l[b->c.level];
-
- if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
- return;
-
- if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0)
- bch2_btree_node_iter_advance(&l->iter, l->b);
-}
-
-void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
- struct btree *b,
- struct bkey_packed *where)
-{
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path_with_node(trans, b, path, i) {
- __bch2_btree_path_fix_key_modified(path, b, where);
- bch2_btree_path_verify_level(trans, path, b->c.level);
- }
-}
-
-static void __bch2_btree_node_iter_fix(struct btree_path *path,
- struct btree *b,
- struct btree_node_iter *node_iter,
- struct bset_tree *t,
- struct bkey_packed *where,
- unsigned clobber_u64s,
- unsigned new_u64s)
-{
- const struct bkey_packed *end = btree_bkey_last(b, t);
- struct btree_node_iter_set *set;
- unsigned offset = __btree_node_key_to_offset(b, where);
- int shift = new_u64s - clobber_u64s;
- unsigned old_end = t->end_offset - shift;
- unsigned orig_iter_pos = node_iter->data[0].k;
- bool iter_current_key_modified =
- orig_iter_pos >= offset &&
- orig_iter_pos <= offset + clobber_u64s;
-
- btree_node_iter_for_each(node_iter, set)
- if (set->end == old_end)
- goto found;
-
-	/* didn't find the bset in the iterator - might have to re-add it: */
- if (new_u64s &&
- bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
- bch2_btree_node_iter_push(node_iter, b, where, end);
- goto fixup_done;
- } else {
- /* Iterator is after key that changed */
- return;
- }
-found:
- set->end = t->end_offset;
-
- /* Iterator hasn't gotten to the key that changed yet: */
- if (set->k < offset)
- return;
-
- if (new_u64s &&
- bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
- set->k = offset;
- } else if (set->k < offset + clobber_u64s) {
- set->k = offset + new_u64s;
- if (set->k == set->end)
- bch2_btree_node_iter_set_drop(node_iter, set);
- } else {
- /* Iterator is after key that changed */
- set->k = (int) set->k + shift;
- return;
- }
-
- bch2_btree_node_iter_sort(node_iter, b);
-fixup_done:
- if (node_iter->data[0].k != orig_iter_pos)
- iter_current_key_modified = true;
-
- /*
- * When a new key is added, and the node iterator now points to that
- * key, the iterator might have skipped past deleted keys that should
- * come after the key the iterator now points to. We have to rewind to
- * before those deleted keys - otherwise
- * bch2_btree_node_iter_prev_all() breaks:
- */
- if (!bch2_btree_node_iter_end(node_iter) &&
- iter_current_key_modified &&
- b->c.level) {
- struct bkey_packed *k, *k2, *p;
-
- k = bch2_btree_node_iter_peek_all(node_iter, b);
-
- for_each_bset(b, t) {
- bool set_pos = false;
-
- if (node_iter->data[0].end == t->end_offset)
- continue;
-
- k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
-
- while ((p = bch2_bkey_prev_all(b, t, k2)) &&
- bkey_iter_cmp(b, k, p) < 0) {
- k2 = p;
- set_pos = true;
- }
-
- if (set_pos)
- btree_node_iter_set_set_pos(node_iter,
- b, t, k2);
- }
- }
-}
-
-void bch2_btree_node_iter_fix(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct btree_node_iter *node_iter,
- struct bkey_packed *where,
- unsigned clobber_u64s,
- unsigned new_u64s)
-{
- struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
- struct btree_path *linked;
- unsigned i;
-
- if (node_iter != &path->l[b->c.level].iter) {
- __bch2_btree_node_iter_fix(path, b, node_iter, t,
- where, clobber_u64s, new_u64s);
-
- if (static_branch_unlikely(&bch2_debug_check_iterators))
- bch2_btree_node_iter_verify(node_iter, b);
- }
-
- trans_for_each_path_with_node(trans, b, linked, i) {
- __bch2_btree_node_iter_fix(linked, b,
- &linked->l[b->c.level].iter, t,
- where, clobber_u64s, new_u64s);
- bch2_btree_path_verify_level(trans, linked, b->c.level);
- }
-}
-
-/* Btree path level: pointer to a particular btree node and node iter */
-
-static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
- struct btree_path_level *l,
- struct bkey *u,
- struct bkey_packed *k)
-{
- if (unlikely(!k)) {
- /*
- * signal to bch2_btree_iter_peek_slot() that we're currently at
- * a hole
- */
- u->type = KEY_TYPE_deleted;
- return bkey_s_c_null;
- }
-
- return bkey_disassemble(l->b, k, u);
-}
-
-static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
- struct btree_path_level *l,
- struct bkey *u)
-{
- return __btree_iter_unpack(c, l, u,
- bch2_btree_node_iter_peek_all(&l->iter, l->b));
-}
-
-static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
- struct btree_path *path,
- struct btree_path_level *l,
- struct bkey *u)
-{
- struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
- bch2_btree_node_iter_prev(&l->iter, l->b));
-
- path->pos = k.k ? k.k->p : l->b->data->min_key;
- trans->paths_sorted = false;
- bch2_btree_path_verify_level(trans, path, l - path->l);
- return k;
-}
-
-static inline bool btree_path_advance_to_pos(struct btree_path *path,
- struct btree_path_level *l,
- int max_advance)
-{
- struct bkey_packed *k;
- int nr_advanced = 0;
-
- while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
- bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
- if (max_advance > 0 && nr_advanced >= max_advance)
- return false;
-
- bch2_btree_node_iter_advance(&l->iter, l->b);
- nr_advanced++;
- }
-
- return true;
-}
-
-static inline void __btree_path_level_init(struct btree_path *path,
- unsigned level)
-{
- struct btree_path_level *l = &path->l[level];
-
- bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
-
- /*
- * Iterators to interior nodes should always be pointed at the first non
- * whiteout:
- */
- if (level)
- bch2_btree_node_iter_peek(&l->iter, l->b);
-}
-
-void bch2_btree_path_level_init(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b)
-{
- BUG_ON(path->cached);
-
- EBUG_ON(!btree_path_pos_in_node(path, b));
-
- path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
- path->l[b->c.level].b = b;
- __btree_path_level_init(path, b->c.level);
-}
-
-/* Btree path: fixups after btree node updates: */
-
-static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b)
-{
- struct bch_fs *c = trans->c;
-
- trans_for_each_update(trans, i)
- if (!i->cached &&
- i->level == b->c.level &&
- i->btree_id == b->c.btree_id &&
- bpos_cmp(i->k->k.p, b->data->min_key) >= 0 &&
- bpos_cmp(i->k->k.p, b->data->max_key) <= 0) {
- i->old_v = bch2_btree_path_peek_slot(trans->paths + i->path, &i->old_k).v;
-
- if (unlikely(trans->journal_replay_not_finished)) {
- struct bkey_i *j_k =
- bch2_journal_keys_peek_slot(c, i->btree_id, i->level,
- i->k->k.p);
-
- if (j_k) {
- i->old_k = j_k->k;
- i->old_v = &j_k->v;
- }
- }
- }
-}
-
-/*
- * A btree node is being replaced - update the iterator to point to the new
- * node:
- */
-void bch2_trans_node_add(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b)
-{
- struct btree_path *prev;
-
- BUG_ON(!btree_path_pos_in_node(path, b));
-
- while ((prev = prev_btree_path(trans, path)) &&
- btree_path_pos_in_node(prev, b))
- path = prev;
-
- for (;
- path && btree_path_pos_in_node(path, b);
- path = next_btree_path(trans, path))
- if (path->uptodate == BTREE_ITER_UPTODATE && !path->cached) {
- enum btree_node_locked_type t =
- btree_lock_want(path, b->c.level);
-
- if (t != BTREE_NODE_UNLOCKED) {
- btree_node_unlock(trans, path, b->c.level);
- six_lock_increment(&b->c.lock, (enum six_lock_type) t);
- mark_btree_node_locked(trans, path, b->c.level, t);
- }
-
- bch2_btree_path_level_init(trans, path, b);
- }
-
- bch2_trans_revalidate_updates_in_node(trans, b);
-}
-
-void bch2_trans_node_drop(struct btree_trans *trans,
- struct btree *b)
-{
- struct btree_path *path;
- unsigned i, level = b->c.level;
-
- trans_for_each_path(trans, path, i)
- if (path->l[level].b == b) {
- btree_node_unlock(trans, path, level);
- path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
- }
-}
-
-/*
- * A btree node has been modified in such a way as to invalidate iterators - fix
- * them:
- */
-void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
-{
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path_with_node(trans, b, path, i)
- __btree_path_level_init(path, b->c.level);
-
- bch2_trans_revalidate_updates_in_node(trans, b);
-}
-
-/* Btree path: traverse, set_pos: */
-
-static inline int btree_path_lock_root(struct btree_trans *trans,
- struct btree_path *path,
- unsigned depth_want,
- unsigned long trace_ip)
-{
- struct bch_fs *c = trans->c;
- struct btree_root *r = bch2_btree_id_root(c, path->btree_id);
- enum six_lock_type lock_type;
- unsigned i;
- int ret;
-
- EBUG_ON(path->nodes_locked);
-
- while (1) {
- struct btree *b = READ_ONCE(r->b);
- if (unlikely(!b)) {
- BUG_ON(!r->error);
- return r->error;
- }
-
- path->level = READ_ONCE(b->c.level);
-
- if (unlikely(path->level < depth_want)) {
- /*
- * the root is at a lower depth than the depth we want:
- * got to the end of the btree, or we're walking nodes
- * greater than some depth and there are no nodes >=
- * that depth
- */
- path->level = depth_want;
- for (i = path->level; i < BTREE_MAX_DEPTH; i++)
- path->l[i].b = NULL;
- return 1;
- }
-
- lock_type = __btree_lock_want(path, path->level);
- ret = btree_node_lock(trans, path, &b->c,
- path->level, lock_type, trace_ip);
- if (unlikely(ret)) {
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- return ret;
- BUG();
- }
-
- if (likely(b == READ_ONCE(r->b) &&
- b->c.level == path->level &&
- !race_fault())) {
- for (i = 0; i < path->level; i++)
- path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root);
- path->l[path->level].b = b;
- for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
- path->l[i].b = NULL;
-
- mark_btree_node_locked(trans, path, path->level,
- (enum btree_node_locked_type) lock_type);
- bch2_btree_path_level_init(trans, path, b);
- return 0;
- }
-
- six_unlock_type(&b->c.lock, lock_type);
- }
-}
-
-noinline
-static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path)
-{
- struct bch_fs *c = trans->c;
- struct btree_path_level *l = path_l(path);
- struct btree_node_iter node_iter = l->iter;
- struct bkey_packed *k;
- struct bkey_buf tmp;
- unsigned nr = test_bit(BCH_FS_started, &c->flags)
- ? (path->level > 1 ? 0 : 2)
- : (path->level > 1 ? 1 : 16);
- bool was_locked = btree_node_locked(path, path->level);
- int ret = 0;
-
- bch2_bkey_buf_init(&tmp);
-
- while (nr-- && !ret) {
- if (!bch2_btree_node_relock(trans, path, path->level))
- break;
-
- bch2_btree_node_iter_advance(&node_iter, l->b);
- k = bch2_btree_node_iter_peek(&node_iter, l->b);
- if (!k)
- break;
-
- bch2_bkey_buf_unpack(&tmp, c, l->b, k);
- ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
- path->level - 1);
- }
-
- if (!was_locked)
- btree_node_unlock(trans, path, path->level);
-
- bch2_bkey_buf_exit(&tmp, c);
- return ret;
-}
-
-static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
- struct btree_and_journal_iter *jiter)
-{
- struct bch_fs *c = trans->c;
- struct bkey_s_c k;
- struct bkey_buf tmp;
- unsigned nr = test_bit(BCH_FS_started, &c->flags)
- ? (path->level > 1 ? 0 : 2)
- : (path->level > 1 ? 1 : 16);
- bool was_locked = btree_node_locked(path, path->level);
- int ret = 0;
-
- bch2_bkey_buf_init(&tmp);
-
- jiter->fail_if_too_many_whiteouts = true;
-
- while (nr-- && !ret) {
- if (!bch2_btree_node_relock(trans, path, path->level))
- break;
-
- bch2_btree_and_journal_iter_advance(jiter);
- k = bch2_btree_and_journal_iter_peek(jiter);
- if (!k.k)
- break;
-
- bch2_bkey_buf_reassemble(&tmp, c, k);
- ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
- path->level - 1);
- }
-
- if (!was_locked)
- btree_node_unlock(trans, path, path->level);
-
- bch2_bkey_buf_exit(&tmp, c);
- return ret;
-}
-
-static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
- struct btree_path *path,
- unsigned plevel, struct btree *b)
-{
- struct btree_path_level *l = &path->l[plevel];
- bool locked = btree_node_locked(path, plevel);
- struct bkey_packed *k;
- struct bch_btree_ptr_v2 *bp;
-
- if (!bch2_btree_node_relock(trans, path, plevel))
- return;
-
- k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
- BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);
-
- bp = (void *) bkeyp_val(&l->b->format, k);
- bp->mem_ptr = (unsigned long)b;
-
- if (!locked)
- btree_node_unlock(trans, path, plevel);
-}
-
-static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
- struct btree_path *path,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct btree_path_level *l = path_l(path);
- struct btree_and_journal_iter jiter;
- struct bkey_s_c k;
- int ret = 0;
-
- __bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos);
-
- k = bch2_btree_and_journal_iter_peek(&jiter);
- if (!k.k) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "node not found at pos ");
- bch2_bpos_to_text(&buf, path->pos);
- prt_str(&buf, " at btree ");
- bch2_btree_pos_to_text(&buf, c, l->b);
-
- ret = bch2_fs_topology_error(c, "%s", buf.buf);
- printbuf_exit(&buf);
- goto err;
- }
-
- bkey_reassemble(&trans->btree_path_down, k);
-
- if ((flags & BTREE_ITER_prefetch) &&
- c->opts.btree_node_prefetch)
- ret = btree_path_prefetch_j(trans, path, &jiter);
-
-err:
- bch2_btree_and_journal_iter_exit(&jiter);
- return ret;
-}
-
-static noinline_for_stack int btree_node_missing_err(struct btree_trans *trans,
- struct btree_path *path)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "node not found at pos ");
- bch2_bpos_to_text(&buf, path->pos);
- prt_str(&buf, " within parent node ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&path_l(path)->b->key));
-
- bch2_fs_fatal_error(c, "%s", buf.buf);
- printbuf_exit(&buf);
- return bch_err_throw(c, btree_need_topology_repair);
-}
-
-static __always_inline int btree_path_down(struct btree_trans *trans,
- struct btree_path *path,
- unsigned flags,
- unsigned long trace_ip)
-{
- struct bch_fs *c = trans->c;
- struct btree_path_level *l = path_l(path);
- struct btree *b;
- unsigned level = path->level - 1;
- enum six_lock_type lock_type = __btree_lock_want(path, level);
- int ret;
-
- EBUG_ON(!btree_node_locked(path, path->level));
-
- if (unlikely(trans->journal_replay_not_finished)) {
- ret = btree_node_iter_and_journal_peek(trans, path, flags);
- if (ret)
- return ret;
- } else {
- struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b);
- if (unlikely(!k))
- return btree_node_missing_err(trans, path);
-
- bch2_bkey_unpack(l->b, &trans->btree_path_down, k);
-
- if (unlikely((flags & BTREE_ITER_prefetch)) &&
- c->opts.btree_node_prefetch) {
- ret = btree_path_prefetch(trans, path);
- if (ret)
- return ret;
- }
- }
-
- b = bch2_btree_node_get(trans, path, &trans->btree_path_down,
- level, lock_type, trace_ip);
- ret = PTR_ERR_OR_ZERO(b);
- if (unlikely(ret))
- return ret;
-
- if (unlikely(b != btree_node_mem_ptr(&trans->btree_path_down)) &&
- likely(!trans->journal_replay_not_finished &&
- trans->btree_path_down.k.type == KEY_TYPE_btree_ptr_v2))
- btree_node_mem_ptr_set(trans, path, level + 1, b);
-
- if (btree_node_read_locked(path, level + 1))
- btree_node_unlock(trans, path, level + 1);
-
- mark_btree_node_locked(trans, path, level,
- (enum btree_node_locked_type) lock_type);
- path->level = level;
- bch2_btree_path_level_init(trans, path, b);
-
- bch2_btree_path_verify_locks(trans, path);
- return 0;
-}
-
-static int bch2_btree_path_traverse_all(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- struct btree_path *path;
- unsigned long trace_ip = _RET_IP_;
- unsigned i;
- int ret = 0;
-
- if (trans->in_traverse_all)
- return bch_err_throw(c, transaction_restart_in_traverse_all);
-
- trans->in_traverse_all = true;
-retry_all:
- trans->restarted = 0;
- trans->last_restarted_ip = 0;
-
- trans_for_each_path(trans, path, i)
- path->should_be_locked = false;
-
- btree_trans_sort_paths(trans);
-
- bch2_trans_unlock(trans);
- cond_resched();
- trans_set_locked(trans, false);
-
- if (unlikely(trans->memory_allocation_failure)) {
- struct closure cl;
-
- closure_init_stack(&cl);
-
- do {
- ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
- closure_sync(&cl);
- } while (ret);
- }
-
- /* Now, redo traversals in correct order: */
- i = 0;
- while (i < trans->nr_sorted) {
- btree_path_idx_t idx = trans->sorted[i];
-
- /*
- * Traversing a path can cause another path to be added at about
- * the same position:
- */
- if (trans->paths[idx].uptodate) {
- __btree_path_get(trans, &trans->paths[idx], false);
- ret = bch2_btree_path_traverse_one(trans, idx, 0, _THIS_IP_);
- __btree_path_put(trans, &trans->paths[idx], false);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
- bch2_err_matches(ret, ENOMEM))
- goto retry_all;
- if (ret)
- goto err;
- } else {
- i++;
- }
- }
-
- /*
- * We used to assert that all paths had been traversed here
- * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since
- * path->should_be_locked is not set yet, we might have unlocked and
- * then failed to relock a path - that's fine.
- */
-err:
- bch2_btree_cache_cannibalize_unlock(trans);
-
- trans->in_traverse_all = false;
-
- trace_and_count(c, trans_traverse_all, trans, trace_ip);
- return ret;
-}
-
-static inline bool btree_path_check_pos_in_node(struct btree_path *path,
- unsigned l, int check_pos)
-{
- if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
- return false;
- if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
- return false;
- return true;
-}
-
-static inline bool btree_path_good_node(struct btree_trans *trans,
- struct btree_path *path,
- unsigned l, int check_pos)
-{
- return is_btree_node(path, l) &&
- bch2_btree_node_relock(trans, path, l) &&
- btree_path_check_pos_in_node(path, l, check_pos);
-}
-
-static void btree_path_set_level_down(struct btree_trans *trans,
- struct btree_path *path,
- unsigned new_level)
-{
- unsigned l;
-
- path->level = new_level;
-
- for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
- if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
- btree_node_unlock(trans, path, l);
-
- btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
- bch2_btree_path_verify(trans, path);
-}
-
-static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans,
- struct btree_path *path,
- int check_pos)
-{
- unsigned i, l = path->level;
-again:
- while (btree_path_node(path, l) &&
- !btree_path_good_node(trans, path, l, check_pos))
- __btree_path_set_level_up(trans, path, l++);
-
- /* If we need intent locks, take them too: */
- for (i = l + 1;
- i < path->locks_want && btree_path_node(path, i);
- i++)
- if (!bch2_btree_node_relock(trans, path, i)) {
- while (l <= i)
- __btree_path_set_level_up(trans, path, l++);
- goto again;
- }
-
- return l;
-}
-
-static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
- struct btree_path *path,
- int check_pos)
-{
- return likely(btree_node_locked(path, path->level) &&
- btree_path_check_pos_in_node(path, path->level, check_pos))
- ? path->level
- : __btree_path_up_until_good_node(trans, path, check_pos);
-}
-
-/*
- * This is the main state machine for walking down the btree - walks down to a
- * specified depth
- *
- * Returns 0 on success, -EIO on error (error reading in a btree node).
- *
- * On error, caller (peek_node()/peek_key()) must return NULL; the error is
- * stashed in the iterator and returned from bch2_trans_exit().
- */
-int bch2_btree_path_traverse_one(struct btree_trans *trans,
- btree_path_idx_t path_idx,
- unsigned flags,
- unsigned long trace_ip)
-{
- struct btree_path *path = &trans->paths[path_idx];
- unsigned depth_want = path->level;
- int ret = -((int) trans->restarted);
-
- if (unlikely(ret))
- goto out;
-
- if (unlikely(!trans->srcu_held))
- bch2_trans_srcu_lock(trans);
-
- trace_btree_path_traverse_start(trans, path);
-
- /*
- * Ensure we obey path->should_be_locked: if it's set, we can't unlock
- * and re-traverse the path without a transaction restart:
- */
- if (path->should_be_locked) {
- ret = bch2_btree_path_relock(trans, path, trace_ip);
- goto out;
- }
-
- if (path->cached) {
- ret = bch2_btree_path_traverse_cached(trans, path_idx, flags);
- goto out;
- }
-
- path = &trans->paths[path_idx];
-
- if (unlikely(path->level >= BTREE_MAX_DEPTH))
- goto out_uptodate;
-
- path->level = btree_path_up_until_good_node(trans, path, 0);
- unsigned max_level = path->level;
-
- EBUG_ON(btree_path_node(path, path->level) &&
- !btree_node_locked(path, path->level));
-
- /*
- * Note: path->nodes[path->level] may be temporarily NULL here - that
- * would indicate to other code that we got to the end of the btree,
- * here it indicates that relocking the root failed - it's critical that
- * btree_path_lock_root() comes next and that it can't fail
- */
- while (path->level > depth_want) {
- ret = btree_path_node(path, path->level)
- ? btree_path_down(trans, path, flags, trace_ip)
- : btree_path_lock_root(trans, path, depth_want, trace_ip);
- if (unlikely(ret)) {
- if (ret == 1) {
- /*
- * No nodes at this level - got to the end of
- * the btree:
- */
- ret = 0;
- goto out;
- }
-
- __bch2_btree_path_unlock(trans, path);
- path->level = depth_want;
- path->l[path->level].b = ERR_PTR(ret);
- goto out;
- }
- }
-
- if (unlikely(max_level > path->level)) {
- struct btree_path *linked;
- unsigned iter;
-
- trans_for_each_path_with_node(trans, path_l(path)->b, linked, iter)
- for (unsigned j = path->level + 1; j < max_level; j++)
- linked->l[j] = path->l[j];
- }
-
-out_uptodate:
- path->uptodate = BTREE_ITER_UPTODATE;
- trace_btree_path_traverse_end(trans, path);
-out:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
- panic("ret %s (%i) trans->restarted %s (%i)\n",
- bch2_err_str(ret), ret,
- bch2_err_str(trans->restarted), trans->restarted);
- bch2_btree_path_verify(trans, path);
- return ret;
-}
-
-static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
- struct btree_path *src)
-{
- unsigned i, offset = offsetof(struct btree_path, pos);
-
- memcpy((void *) dst + offset,
- (void *) src + offset,
- sizeof(struct btree_path) - offset);
-
- for (i = 0; i < BTREE_MAX_DEPTH; i++) {
- unsigned t = btree_node_locked_type(dst, i);
-
- if (t != BTREE_NODE_UNLOCKED)
- six_lock_increment(&dst->l[i].b->c.lock, t);
- }
-}
-
-static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_idx_t src,
- bool intent, unsigned long ip)
-{
- btree_path_idx_t new = btree_path_alloc(trans, src);
- btree_path_copy(trans, trans->paths + new, trans->paths + src);
- __btree_path_get(trans, trans->paths + new, intent);
-#ifdef TRACK_PATH_ALLOCATED
- trans->paths[new].ip_allocated = ip;
-#endif
- return new;
-}
-
-__flatten
-btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans,
- btree_path_idx_t path, bool intent, unsigned long ip)
-{
- struct btree_path *old = trans->paths + path;
- __btree_path_put(trans, trans->paths + path, intent);
- path = btree_path_clone(trans, path, intent, ip);
- trace_btree_path_clone(trans, old, trans->paths + path);
- trans->paths[path].preserve = false;
- return path;
-}
-
-btree_path_idx_t __must_check
-__bch2_btree_path_set_pos(struct btree_trans *trans,
- btree_path_idx_t path_idx, struct bpos new_pos,
- bool intent, unsigned long ip)
-{
- int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos);
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
- EBUG_ON(!trans->paths[path_idx].ref);
-
- trace_btree_path_set_pos(trans, trans->paths + path_idx, &new_pos);
-
- path_idx = bch2_btree_path_make_mut(trans, path_idx, intent, ip);
-
- struct btree_path *path = trans->paths + path_idx;
- path->pos = new_pos;
- trans->paths_sorted = false;
-
- if (unlikely(path->cached)) {
- btree_node_unlock(trans, path, 0);
- path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
- btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
- goto out;
- }
-
- unsigned level = btree_path_up_until_good_node(trans, path, cmp);
-
- if (btree_path_node(path, level)) {
- struct btree_path_level *l = &path->l[level];
-
- BUG_ON(!btree_node_locked(path, level));
- /*
- * We might have to skip over many keys, or just a few: try
- * advancing the node iterator, and if we have to skip over too
- * many keys just reinit it (or if we're rewinding, since that
- * is expensive).
- */
- if (cmp < 0 ||
- !btree_path_advance_to_pos(path, l, 8))
- bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
-
- /*
- * Iterators to interior nodes should always be pointed at the first non
- * whiteout:
- */
- if (unlikely(level))
- bch2_btree_node_iter_peek(&l->iter, l->b);
- }
-
- if (unlikely(level != path->level)) {
- btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
- __bch2_btree_path_unlock(trans, path);
- }
-out:
- bch2_btree_path_verify(trans, path);
- return path_idx;
-}
-
-/* Btree path: main interface: */
-
-static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
-{
- struct btree_path *sib;
-
- sib = prev_btree_path(trans, path);
- if (sib && !btree_path_cmp(sib, path))
- return sib;
-
- sib = next_btree_path(trans, path);
- if (sib && !btree_path_cmp(sib, path))
- return sib;
-
- return NULL;
-}
-
-static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
-{
- struct btree_path *sib;
-
- sib = prev_btree_path(trans, path);
- if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
- return sib;
-
- sib = next_btree_path(trans, path);
- if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
- return sib;
-
- return NULL;
-}
-
-static inline void __bch2_path_free(struct btree_trans *trans, btree_path_idx_t path)
-{
- __bch2_btree_path_unlock(trans, trans->paths + path);
- btree_path_list_remove(trans, trans->paths + path);
- __clear_bit(path, trans->paths_allocated);
-}
-
-static bool bch2_btree_path_can_relock(struct btree_trans *trans, struct btree_path *path)
-{
- unsigned l = path->level;
-
- do {
- if (!btree_path_node(path, l))
- break;
-
- if (!is_btree_node(path, l))
- return false;
-
- if (path->l[l].lock_seq != path->l[l].b->c.lock.seq)
- return false;
-
- l++;
- } while (l < path->locks_want);
-
- return true;
-}
-
-void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent)
-{
- struct btree_path *path = trans->paths + path_idx, *dup = NULL;
-
- if (!__btree_path_put(trans, path, intent))
- return;
-
- if (!path->preserve && !path->should_be_locked)
- goto free;
-
- dup = path->preserve
- ? have_path_at_pos(trans, path)
- : have_node_at_pos(trans, path);
- if (!dup)
- return;
-
- /*
-	 * If we need this path locked, the duplicate also has to be locked
- * before we free this one:
- */
- if (path->should_be_locked &&
- !dup->should_be_locked &&
- !trans->restarted) {
- if (!(trans->locked
- ? bch2_btree_path_relock_norestart(trans, dup)
- : bch2_btree_path_can_relock(trans, dup)))
- return;
-
- dup->should_be_locked = true;
- }
-
- BUG_ON(path->should_be_locked &&
- !trans->restarted &&
- trans->locked &&
- !btree_node_locked(dup, dup->level));
-
- path->should_be_locked = false;
- dup->preserve |= path->preserve;
-free:
- trace_btree_path_free(trans, path_idx, dup);
- __bch2_path_free(trans, path_idx);
-}
-
-void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
-{
- panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
- trans->restart_count, restart_count,
- (void *) trans->last_begin_ip);
-}
-
-static void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct printbuf buf = PRINTBUF;
- bch2_prt_backtrace(&buf, &trans->last_restarted_trace);
- panic("in transaction restart: %s, last restarted by\n%s",
- bch2_err_str(trans->restarted),
- buf.buf);
-#else
- panic("in transaction restart: %s, last restarted by %pS\n",
- bch2_err_str(trans->restarted),
- (void *) trans->last_restarted_ip);
-#endif
-}
-
-void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *trans)
-{
- if (trans->restarted)
- bch2_trans_in_restart_error(trans);
-
- if (!trans->locked)
- panic("trans should be locked, unlocked by %pS\n",
- (void *) trans->last_unlock_ip);
-
- BUG();
-}
-
-noinline __cold
-void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
-{
- prt_printf(buf, "%u transaction updates for %s journal seq %llu\n",
- trans->nr_updates, trans->fn, trans->journal_res.seq);
- printbuf_indent_add(buf, 2);
-
- trans_for_each_update(trans, i) {
- struct bkey_s_c old = { &i->old_k, i->old_v };
-
- prt_str(buf, "update: btree=");
- bch2_btree_id_to_text(buf, i->btree_id);
- prt_printf(buf, " cached=%u %pS\n",
- i->cached,
- (void *) i->ip_allocated);
-
- prt_printf(buf, " old ");
- bch2_bkey_val_to_text(buf, trans->c, old);
- prt_newline(buf);
-
- prt_printf(buf, " new ");
- bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
- prt_newline(buf);
- }
-
- for (struct jset_entry *e = btree_trans_journal_entries_start(trans);
- e != btree_trans_journal_entries_top(trans);
- e = vstruct_next(e)) {
- bch2_journal_entry_to_text(buf, trans->c, e);
- prt_newline(buf);
- }
-
- printbuf_indent_sub(buf, 2);
-}
-
-static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
-{
- struct btree_path *path = trans->paths + path_idx;
-
- prt_printf(out, "path: idx %3u ref %u:%u %c %c %c ",
- path_idx, path->ref, path->intent_ref,
- path->preserve ? 'P' : ' ',
- path->should_be_locked ? 'S' : ' ',
- path->cached ? 'C' : 'B');
- bch2_btree_id_level_to_text(out, path->btree_id, path->level);
- prt_str(out, " pos ");
- bch2_bpos_to_text(out, path->pos);
-
- if (!path->cached && btree_node_locked(path, path->level)) {
- prt_char(out, ' ');
- struct btree *b = path_l(path)->b;
- bch2_bpos_to_text(out, b->data->min_key);
- prt_char(out, '-');
- bch2_bpos_to_text(out, b->key.k.p);
- }
-
-#ifdef TRACK_PATH_ALLOCATED
- prt_printf(out, " %pS", (void *) path->ip_allocated);
-#endif
-}
-
-static const char *btree_node_locked_str(enum btree_node_locked_type t)
-{
- switch (t) {
- case BTREE_NODE_UNLOCKED:
- return "unlocked";
- case BTREE_NODE_READ_LOCKED:
- return "read";
- case BTREE_NODE_INTENT_LOCKED:
- return "intent";
- case BTREE_NODE_WRITE_LOCKED:
- return "write";
- default:
- return NULL;
- }
-}
-
-void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
-{
- bch2_btree_path_to_text_short(out, trans, path_idx);
-
- struct btree_path *path = trans->paths + path_idx;
-
- prt_printf(out, " uptodate %u locks_want %u", path->uptodate, path->locks_want);
- prt_newline(out);
-
- printbuf_indent_add(out, 2);
- for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) {
- prt_printf(out, "l=%u locks %s seq %u node ", l,
- btree_node_locked_str(btree_node_locked_type(path, l)),
- path->l[l].lock_seq);
-
- int ret = PTR_ERR_OR_ZERO(path->l[l].b);
- if (ret)
- prt_str(out, bch2_err_str(ret));
- else
- prt_printf(out, "%px", path->l[l].b);
- prt_newline(out);
- }
- printbuf_indent_sub(out, 2);
-}
-
-static noinline __cold
-void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans,
- bool nosort)
-{
- struct trans_for_each_path_inorder_iter iter;
-
- if (!nosort)
- btree_trans_sort_paths(trans);
-
- trans_for_each_path_idx_inorder(trans, iter) {
- bch2_btree_path_to_text_short(out, trans, iter.path_idx);
- prt_newline(out);
- }
-}
-
-noinline __cold
-void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
-{
- __bch2_trans_paths_to_text(out, trans, false);
-}
-
-static noinline __cold
-void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
-{
- struct printbuf buf = PRINTBUF;
-
- __bch2_trans_paths_to_text(&buf, trans, nosort);
- bch2_trans_updates_to_text(&buf, trans);
-
- bch2_print_str(trans->c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
-}
-
-noinline __cold
-void bch2_dump_trans_paths_updates(struct btree_trans *trans)
-{
- __bch2_dump_trans_paths_updates(trans, false);
-}
-
-noinline __cold
-static void bch2_trans_update_max_paths(struct btree_trans *trans)
-{
- struct btree_transaction_stats *s = btree_trans_stats(trans);
- struct printbuf buf = PRINTBUF;
- size_t nr = bitmap_weight(trans->paths_allocated, trans->nr_paths);
-
- bch2_trans_paths_to_text(&buf, trans);
-
- if (!buf.allocation_failure) {
- mutex_lock(&s->lock);
- if (nr > s->nr_max_paths) {
- s->nr_max_paths = nr;
- swap(s->max_paths_text, buf.buf);
- }
- mutex_unlock(&s->lock);
- }
-
- printbuf_exit(&buf);
-
- trans->nr_paths_max = nr;
-}
-
-noinline __cold
-int __bch2_btree_trans_too_many_iters(struct btree_trans *trans)
-{
- if (trace_trans_restart_too_many_iters_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_trans_paths_to_text(&buf, trans);
- trace_trans_restart_too_many_iters(trans, _THIS_IP_, buf.buf);
- printbuf_exit(&buf);
- }
-
- count_event(trans->c, trans_restart_too_many_iters);
-
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters);
-}
-
-static noinline void btree_path_overflow(struct btree_trans *trans)
-{
- bch2_dump_trans_paths_updates(trans);
- bch_err(trans->c, "trans path overflow");
-}
-
-static noinline void btree_paths_realloc(struct btree_trans *trans)
-{
- unsigned nr = trans->nr_paths * 2;
-
- void *p = kvzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) +
- sizeof(struct btree_trans_paths) +
- nr * sizeof(struct btree_path) +
- nr * sizeof(btree_path_idx_t) + 8 +
- nr * sizeof(struct btree_insert_entry), GFP_KERNEL|__GFP_NOFAIL);
-
- unsigned long *paths_allocated = p;
- memcpy(paths_allocated, trans->paths_allocated, BITS_TO_LONGS(trans->nr_paths) * sizeof(unsigned long));
- p += BITS_TO_LONGS(nr) * sizeof(unsigned long);
-
- p += sizeof(struct btree_trans_paths);
- struct btree_path *paths = p;
- *trans_paths_nr(paths) = nr;
- memcpy(paths, trans->paths, trans->nr_paths * sizeof(struct btree_path));
- p += nr * sizeof(struct btree_path);
-
- btree_path_idx_t *sorted = p;
- memcpy(sorted, trans->sorted, trans->nr_sorted * sizeof(btree_path_idx_t));
- p += nr * sizeof(btree_path_idx_t) + 8;
-
- struct btree_insert_entry *updates = p;
- memcpy(updates, trans->updates, trans->nr_paths * sizeof(struct btree_insert_entry));
-
- unsigned long *old = trans->paths_allocated;
-
- rcu_assign_pointer(trans->paths_allocated, paths_allocated);
- rcu_assign_pointer(trans->paths, paths);
- rcu_assign_pointer(trans->sorted, sorted);
- rcu_assign_pointer(trans->updates, updates);
-
- trans->nr_paths = nr;
-
- if (old != trans->_paths_allocated)
- kfree_rcu_mightsleep(old);
-}
-
-static inline btree_path_idx_t btree_path_alloc(struct btree_trans *trans,
- btree_path_idx_t pos)
-{
- btree_path_idx_t idx = find_first_zero_bit(trans->paths_allocated, trans->nr_paths);
-
- if (unlikely(idx == trans->nr_paths)) {
- if (trans->nr_paths == BTREE_ITER_MAX) {
- btree_path_overflow(trans);
- return 0;
- }
-
- btree_paths_realloc(trans);
- }
-
- /*
- * Do this before marking the new path as allocated, since it won't be
- * initialized yet:
- */
- if (unlikely(idx > trans->nr_paths_max))
- bch2_trans_update_max_paths(trans);
-
- __set_bit(idx, trans->paths_allocated);
-
- struct btree_path *path = &trans->paths[idx];
- path->ref = 0;
- path->intent_ref = 0;
- path->nodes_locked = 0;
-
- btree_path_list_add(trans, pos, idx);
- trans->paths_sorted = false;
- return idx;
-}
-
-btree_path_idx_t bch2_path_get(struct btree_trans *trans,
- enum btree_id btree_id, struct bpos pos,
- unsigned locks_want, unsigned level,
- unsigned flags, unsigned long ip)
-{
- struct btree_path *path;
- bool cached = flags & BTREE_ITER_cached;
- bool intent = flags & BTREE_ITER_intent;
- struct trans_for_each_path_inorder_iter iter;
- btree_path_idx_t path_pos = 0, path_idx;
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
- bch2_trans_verify_locks(trans);
-
- btree_trans_sort_paths(trans);
-
- if (intent)
- locks_want = max(locks_want, level + 1);
- locks_want = min(locks_want, BTREE_MAX_DEPTH);
-
- trans_for_each_path_inorder(trans, path, iter) {
- if (__btree_path_cmp(path,
- btree_id,
- cached,
- pos,
- level) > 0)
- break;
-
- path_pos = iter.path_idx;
- }
-
- if (path_pos &&
- trans->paths[path_pos].cached == cached &&
- trans->paths[path_pos].btree_id == btree_id &&
- trans->paths[path_pos].level == level &&
- bch2_btree_path_upgrade_norestart(trans, trans->paths + path_pos, locks_want)) {
- trace_btree_path_get(trans, trans->paths + path_pos, &pos);
-
- __btree_path_get(trans, trans->paths + path_pos, intent);
- path_idx = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
- path = trans->paths + path_idx;
- } else {
- path_idx = btree_path_alloc(trans, path_pos);
- path = trans->paths + path_idx;
-
- __btree_path_get(trans, path, intent);
- path->pos = pos;
- path->btree_id = btree_id;
- path->cached = cached;
- path->uptodate = BTREE_ITER_NEED_TRAVERSE;
- path->should_be_locked = false;
- path->level = level;
- path->locks_want = locks_want;
- path->nodes_locked = 0;
- for (unsigned i = 0; i < ARRAY_SIZE(path->l); i++)
- path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
-#ifdef TRACK_PATH_ALLOCATED
- path->ip_allocated = ip;
-#endif
- trans->paths_sorted = false;
-
- trace_btree_path_alloc(trans, path);
- }
-
- if (!(flags & BTREE_ITER_nopreserve))
- path->preserve = true;
-
- /*
- * If the path has locks_want greater than requested, we don't downgrade
- * it here - on transaction restart because btree node split needs to
- * upgrade locks, we might be putting/getting the iterator again.
- * Downgrading iterators only happens via bch2_trans_downgrade(), after
- * a successful transaction commit.
- */
-
- return path_idx;
-}
-
-btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *trans,
- enum btree_id btree_id,
- unsigned level,
- struct bpos pos)
-{
- btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level,
- BTREE_ITER_nopreserve|
- BTREE_ITER_intent, _RET_IP_);
- path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_);
-
- struct btree_path *path = trans->paths + path_idx;
- bch2_btree_path_downgrade(trans, path);
- __bch2_btree_path_unlock(trans, path);
- return path_idx;
-}
-
-struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
-{
-
- struct btree_path_level *l = path_l(path);
- struct bkey_packed *_k;
- struct bkey_s_c k;
-
- if (unlikely(!l->b))
- return bkey_s_c_null;
-
- EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
- EBUG_ON(!btree_node_locked(path, path->level));
-
- if (!path->cached) {
- _k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
- k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
-
- EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos));
-
- if (!k.k || !bpos_eq(path->pos, k.k->p))
- goto hole;
- } else {
- struct bkey_cached *ck = (void *) path->l[0].b;
- if (!ck)
- return bkey_s_c_null;
-
- EBUG_ON(path->btree_id != ck->key.btree_id ||
- !bkey_eq(path->pos, ck->key.pos));
-
- *u = ck->k->k;
- k = (struct bkey_s_c) { u, &ck->k->v };
- }
-
- return k;
-hole:
- bkey_init(u);
- u->p = path->pos;
- return (struct bkey_s_c) { u, NULL };
-}
-
-void bch2_set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter)
-{
- if (!iter->path || trans->restarted)
- return;
-
- struct btree_path *path = btree_iter_path(trans, iter);
- path->preserve = false;
- if (path->ref == 1)
- path->should_be_locked = false;
-}
-/* Btree iterators: */
-
-int __must_check
-__bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter)
-{
- return bch2_btree_path_traverse(trans, iter->path, iter->flags);
-}
-
-int __must_check
-bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter)
-{
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
- iter->path = bch2_btree_path_set_pos(trans, iter->path,
- btree_iter_search_key(iter),
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
-
- int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
- if (ret)
- return ret;
-
- struct btree_path *path = btree_iter_path(trans, iter);
- if (btree_path_node(path, path->level))
- btree_path_set_should_be_locked(trans, path);
- return 0;
-}
-
-/* Iterate across nodes (leaf and interior nodes) */
-
-struct btree *bch2_btree_iter_peek_node(struct btree_trans *trans,
- struct btree_iter *iter)
-{
- struct btree *b = NULL;
- int ret;
-
- EBUG_ON(trans->paths[iter->path].cached);
- bch2_btree_iter_verify(trans, iter);
-
- ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
- if (ret)
- goto err;
-
- struct btree_path *path = btree_iter_path(trans, iter);
- b = btree_path_node(path, path->level);
- if (!b)
- goto out;
-
- BUG_ON(bpos_lt(b->key.k.p, iter->pos));
-
- bkey_init(&iter->k);
- iter->k.p = iter->pos = b->key.k.p;
-
- iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
- btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
-out:
- bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(trans, iter);
-
- return b;
-err:
- b = ERR_PTR(ret);
- goto out;
-}
-
-/* Only kept for -tools */
-struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *trans,
- struct btree_iter *iter)
-{
- struct btree *b;
-
- while (b = bch2_btree_iter_peek_node(trans, iter),
- bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart))
- bch2_trans_begin(trans);
-
- return b;
-}
-
-struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_iter *iter)
-{
- struct btree *b = NULL;
- int ret;
-
- EBUG_ON(trans->paths[iter->path].cached);
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
- bch2_btree_iter_verify(trans, iter);
-
- ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
- if (ret)
- goto err;
-
-
- struct btree_path *path = btree_iter_path(trans, iter);
-
- /* already at end? */
- if (!btree_path_node(path, path->level))
- return NULL;
-
- /* got to end? */
- if (!btree_path_node(path, path->level + 1)) {
- path->should_be_locked = false;
- btree_path_set_level_up(trans, path);
- return NULL;
- }
-
- /*
- * We don't correctly handle nodes with extra intent locks here:
- * downgrade so we don't violate locking invariants
- */
- bch2_btree_path_downgrade(trans, path);
-
- if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
- trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
- __bch2_btree_path_unlock(trans, path);
- path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
- path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
- btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
- goto err;
- }
-
- b = btree_path_node(path, path->level + 1);
-
- if (bpos_eq(iter->pos, b->key.k.p)) {
- __btree_path_set_level_up(trans, path, path->level++);
- } else {
- if (btree_lock_want(path, path->level + 1) == BTREE_NODE_UNLOCKED)
- btree_node_unlock(trans, path, path->level + 1);
-
- /*
- * Haven't gotten to the end of the parent node: go back down to
- * the next child node
- */
- iter->path = bch2_btree_path_set_pos(trans, iter->path,
- bpos_successor(iter->pos),
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
-
- path = btree_iter_path(trans, iter);
- btree_path_set_level_down(trans, path, iter->min_depth);
-
- ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
- if (ret)
- goto err;
-
- path = btree_iter_path(trans, iter);
- b = path->l[path->level].b;
- }
-
- bkey_init(&iter->k);
- iter->k.p = iter->pos = b->key.k.p;
-
- iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
- btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
- EBUG_ON(btree_iter_path(trans, iter)->uptodate);
-out:
- bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(trans, iter);
-
- return b;
-err:
- b = ERR_PTR(ret);
- goto out;
-}
-
-/* Iterate across keys (in leaf nodes only) */
-
-inline bool bch2_btree_iter_advance(struct btree_trans *trans, struct btree_iter *iter)
-{
- struct bpos pos = iter->k.p;
- bool ret = !(iter->flags & BTREE_ITER_all_snapshots
- ? bpos_eq(pos, SPOS_MAX)
- : bkey_eq(pos, SPOS_MAX));
-
- if (ret && !(iter->flags & BTREE_ITER_is_extents))
- pos = bkey_successor(iter, pos);
- bch2_btree_iter_set_pos(trans, iter, pos);
- return ret;
-}
-
-inline bool bch2_btree_iter_rewind(struct btree_trans *trans, struct btree_iter *iter)
-{
- struct bpos pos = bkey_start_pos(&iter->k);
- bool ret = !(iter->flags & BTREE_ITER_all_snapshots
- ? bpos_eq(pos, POS_MIN)
- : bkey_eq(pos, POS_MIN));
-
- if (ret && !(iter->flags & BTREE_ITER_is_extents))
- pos = bkey_predecessor(iter, pos);
- bch2_btree_iter_set_pos(trans, iter, pos);
- return ret;
-}
-
-static noinline
-void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos search_key, struct bkey_s_c *k)
-{
- struct bpos end = path_l(btree_iter_path(trans, iter))->b->data->min_key;
-
- trans_for_each_update(trans, i)
- if (!i->key_cache_already_flushed &&
- i->btree_id == iter->btree_id &&
- bpos_le(i->k->k.p, search_key) &&
- bpos_ge(i->k->k.p, k->k ? k->k->p : end)) {
- iter->k = i->k->k;
- *k = bkey_i_to_s_c(i->k);
- }
-}
-
-static noinline
-void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos search_key,
- struct bkey_s_c *k)
-{
- struct btree_path *path = btree_iter_path(trans, iter);
- struct bpos end = path_l(path)->b->key.k.p;
-
- trans_for_each_update(trans, i)
- if (!i->key_cache_already_flushed &&
- i->btree_id == iter->btree_id &&
- bpos_ge(i->k->k.p, search_key) &&
- bpos_le(i->k->k.p, k->k ? k->k->p : end)) {
- iter->k = i->k->k;
- *k = bkey_i_to_s_c(i->k);
- }
-}
-
-static noinline
-void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c *k)
-{
- trans_for_each_update(trans, i)
- if (!i->key_cache_already_flushed &&
- i->btree_id == iter->btree_id &&
- bpos_eq(i->k->k.p, iter->pos)) {
- iter->k = i->k->k;
- *k = bkey_i_to_s_c(i->k);
- }
-}
-
-static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos search_pos,
- struct bpos end_pos)
-{
- struct btree_path *path = btree_iter_path(trans, iter);
-
- return bch2_journal_keys_peek_max(trans->c, iter->btree_id,
- path->level,
- search_pos,
- end_pos,
- &iter->journal_idx);
-}
-
-static noinline
-struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
- struct btree_iter *iter)
-{
- struct btree_path *path = btree_iter_path(trans, iter);
- struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos, path->pos);
-
- if (k) {
- iter->k = k->k;
- return bkey_i_to_s_c(k);
- } else {
- return bkey_s_c_null;
- }
-}
-
-static noinline
-void btree_trans_peek_journal(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos search_key,
- struct bkey_s_c *k)
-{
- struct btree_path *path = btree_iter_path(trans, iter);
- struct bkey_i *next_journal =
- bch2_btree_journal_peek(trans, iter, search_key,
- k->k ? k->k->p : path_l(path)->b->key.k.p);
- if (next_journal) {
- iter->k = next_journal->k;
- *k = bkey_i_to_s_c(next_journal);
- }
-}
-
-static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos search_key,
- struct bpos end_pos)
-{
- struct btree_path *path = btree_iter_path(trans, iter);
-
- return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id,
- path->level,
- search_key,
- end_pos,
- &iter->journal_idx);
-}
-
-static noinline
-void btree_trans_peek_prev_journal(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos search_key,
- struct bkey_s_c *k)
-{
- struct btree_path *path = btree_iter_path(trans, iter);
- struct bkey_i *next_journal =
- bch2_btree_journal_peek_prev(trans, iter, search_key,
- k->k ? k->k->p : path_l(path)->b->data->min_key);
-
- if (next_journal) {
- iter->k = next_journal->k;
- *k = bkey_i_to_s_c(next_journal);
- }
-}
-
-/*
- * Checks btree key cache for key at iter->pos and returns it if present, or
- * bkey_s_c_null:
- */
-static noinline
-struct bkey_s_c btree_trans_peek_key_cache(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos pos)
-{
- struct bch_fs *c = trans->c;
- struct bkey u;
- struct bkey_s_c k;
- int ret;
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
- if ((iter->flags & BTREE_ITER_key_cache_fill) &&
- bpos_eq(iter->pos, pos))
- return bkey_s_c_null;
-
- if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
- return bkey_s_c_null;
-
- if (!iter->key_cache_path)
- iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
- iter->flags & BTREE_ITER_intent, 0,
- iter->flags|BTREE_ITER_cached|
- BTREE_ITER_cached_nofill,
- _THIS_IP_);
-
- iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
-
- ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
- iter->flags|BTREE_ITER_cached) ?:
- bch2_btree_path_relock(trans, btree_iter_path(trans, iter), _THIS_IP_);
- if (unlikely(ret))
- return bkey_s_c_err(ret);
-
- k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u);
- if (!k.k)
- return k;
-
- if ((iter->flags & BTREE_ITER_all_snapshots) &&
- !bpos_eq(pos, k.k->p))
- return bkey_s_c_null;
-
- iter->k = u;
- k.k = &iter->k;
- btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
- return k;
-}
-
-static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos search_key)
-{
- struct bkey_s_c k, k2;
- int ret;
-
- EBUG_ON(btree_iter_path(trans, iter)->cached);
- bch2_btree_iter_verify(trans, iter);
-
- while (1) {
- iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
-
- ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
- if (unlikely(ret)) {
- /* ensure that iter->k is consistent with iter->pos: */
- bch2_btree_iter_set_pos(trans, iter, iter->pos);
- k = bkey_s_c_err(ret);
- break;
- }
-
- struct btree_path *path = btree_iter_path(trans, iter);
- struct btree_path_level *l = path_l(path);
-
- if (unlikely(!l->b)) {
- /* No btree nodes at requested level: */
- bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
- k = bkey_s_c_null;
- break;
- }
-
- btree_path_set_should_be_locked(trans, path);
-
- k = btree_path_level_peek_all(trans->c, l, &iter->k);
-
- if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
- k.k &&
- (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) {
- k = k2;
- if (bkey_err(k)) {
- bch2_btree_iter_set_pos(trans, iter, iter->pos);
- break;
- }
- }
-
- if (unlikely(iter->flags & BTREE_ITER_with_journal))
- btree_trans_peek_journal(trans, iter, search_key, &k);
-
- if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
- trans->nr_updates))
- bch2_btree_trans_peek_updates(trans, iter, search_key, &k);
-
- if (k.k && bkey_deleted(k.k)) {
- /*
- * If we've got a whiteout, and it's after the search
- * key, advance the search key to the whiteout instead
- * of just after the whiteout - it might be a btree
- * whiteout, with a real key at the same position, since
- * in the btree, deleted keys sort before non-deleted ones.
- */
- search_key = !bpos_eq(search_key, k.k->p)
- ? k.k->p
- : bpos_successor(k.k->p);
- continue;
- }
-
- if (likely(k.k)) {
- break;
- } else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) {
- /* Advance to next leaf node: */
- search_key = bpos_successor(l->b->key.k.p);
- } else {
- /* End of btree: */
- bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
- k = bkey_s_c_null;
- break;
- }
- }
-
- bch2_btree_iter_verify(trans, iter);
-
- if (trace___btree_iter_peek_enabled()) {
- CLASS(printbuf, buf)();
-
- int ret = bkey_err(k);
- if (ret)
- prt_str(&buf, bch2_err_str(ret));
- else if (k.k)
- bch2_bkey_val_to_text(&buf, trans->c, k);
- else
- prt_str(&buf, "(null)");
- trace___btree_iter_peek(trans->c, buf.buf);
- }
-
- return k;
-}
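
__bch2_btree_iter_peek() (and the peek variants built on it) can fail with a transaction-restart error, in which case the caller is expected to reset the transaction and retry; bch2_btree_iter_peek_and_restart_outlined() further down in this file shows the same idiom. A minimal caller-side sketch, assuming trans and iter have already been set up on the wanted btree:

	struct bkey_s_c k;
	int ret;

	do {
		bch2_trans_begin(trans);

		k = bch2_btree_iter_peek(trans, iter);
		ret = bkey_err(k);
	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));

	if (!ret && k.k) {
		/* a key >= the iterator position was found; use it here */
	}
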
-
-/**
- * bch2_btree_iter_peek_max() - returns first key greater than or equal to
- * iterator's current position
- * @trans: btree transaction object
- * @iter: iterator to peek from
- * @end: search limit: returns keys less than or equal to @end
- *
- * Returns: key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos end)
-{
- struct bpos search_key = btree_iter_search_key(iter);
- struct bkey_s_c k;
- struct bpos iter_pos = iter->pos;
- int ret;
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
- bch2_btree_iter_verify_entry_exit(iter);
- EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX));
-
- ret = trans_maybe_inject_restart(trans, _RET_IP_);
- if (unlikely(ret)) {
- k = bkey_s_c_err(ret);
- goto out_no_locked;
- }
-
- if (iter->update_path) {
- bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_intent);
- iter->update_path = 0;
- }
-
- while (1) {
- k = __bch2_btree_iter_peek(trans, iter, search_key);
- if (unlikely(!k.k))
- goto end;
- if (unlikely(bkey_err(k)))
- goto out_no_locked;
-
- if (iter->flags & BTREE_ITER_filter_snapshots) {
- /*
- * We need to check against @end before FILTER_SNAPSHOTS because
- * if we get to a different inode than requested we might be
- * seeing keys for a different snapshot tree that will all be
- * filtered out.
- *
- * But we can't do the full check here, because bkey_start_pos()
- * isn't monotonically increasing before FILTER_SNAPSHOTS, and
- * that's what we check against in extents mode:
- */
- if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
- ? bkey_gt(k.k->p, end)
- : k.k->p.inode > end.inode))
- goto end;
-
- if (iter->update_path &&
- !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) {
- bch2_path_put(trans, iter->update_path,
- iter->flags & BTREE_ITER_intent);
- iter->update_path = 0;
- }
-
- if ((iter->flags & BTREE_ITER_intent) &&
- !(iter->flags & BTREE_ITER_is_extents) &&
- !iter->update_path) {
- struct bpos pos = k.k->p;
-
- if (pos.snapshot < iter->snapshot) {
- search_key = bpos_successor(k.k->p);
- continue;
- }
-
- pos.snapshot = iter->snapshot;
-
- /*
- * advance, same as on exit for iter->path, but only up
- * to snapshot
- */
- __btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
- iter->update_path = iter->path;
-
- iter->update_path = bch2_btree_path_set_pos(trans,
- iter->update_path, pos,
- iter->flags & BTREE_ITER_intent,
- _THIS_IP_);
- ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
- if (unlikely(ret)) {
- k = bkey_s_c_err(ret);
- goto out_no_locked;
- }
- }
-
- /*
- * We can never have a key in a leaf node at POS_MAX, so
- * we don't have to check these successor() calls:
- */
- if (!bch2_snapshot_is_ancestor(trans->c,
- iter->snapshot,
- k.k->p.snapshot)) {
- search_key = bpos_successor(k.k->p);
- continue;
- }
-
- if (bkey_whiteout(k.k) &&
- !(iter->flags & BTREE_ITER_key_cache_fill)) {
- search_key = bkey_successor(iter, k.k->p);
- continue;
- }
- }
-
- /*
- * iter->pos should be monotonically increasing, and always be
- * equal to the key we just returned - except extents can
- * straddle iter->pos:
- */
- if (!(iter->flags & BTREE_ITER_is_extents))
- iter_pos = k.k->p;
- else
- iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
-
- if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(iter_pos, end) :
- iter->flags & BTREE_ITER_is_extents ? bkey_ge(iter_pos, end) :
- bkey_gt(iter_pos, end)))
- goto end;
-
- break;
- }
-
- iter->pos = iter_pos;
-
- iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
-
- btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
-out_no_locked:
- if (iter->update_path) {
- ret = bch2_btree_path_relock(trans, trans->paths + iter->update_path, _THIS_IP_);
- if (unlikely(ret))
- k = bkey_s_c_err(ret);
- else
- btree_path_set_should_be_locked(trans, trans->paths + iter->update_path);
- }
-
- if (!(iter->flags & BTREE_ITER_all_snapshots))
- iter->pos.snapshot = iter->snapshot;
-
- ret = bch2_btree_iter_verify_ret(trans, iter, k);
- if (unlikely(ret)) {
- bch2_btree_iter_set_pos(trans, iter, iter->pos);
- k = bkey_s_c_err(ret);
- }
-
- bch2_btree_iter_verify_entry_exit(iter);
-
- if (trace_btree_iter_peek_max_enabled()) {
- CLASS(printbuf, buf)();
-
- int ret = bkey_err(k);
- if (ret)
- prt_str(&buf, bch2_err_str(ret));
- else if (k.k)
- bch2_bkey_val_to_text(&buf, trans->c, k);
- else
- prt_str(&buf, "(null)");
- trace_btree_iter_peek_max(trans->c, buf.buf);
- }
-
- return k;
-end:
- bch2_btree_iter_set_pos(trans, iter, end);
- k = bkey_s_c_null;
- goto out_no_locked;
-}
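
A hedged sketch of the typical caller-side loop around bch2_btree_iter_peek_max(): walk every key up to a search limit, process each one, then step the iterator forward. Iterator setup and transaction-restart handling are elided; end is the caller's search limit and process_key() is a hypothetical per-key callback, not part of this file:

	struct bkey_s_c k;
	int ret = 0;

	while (1) {
		k = bch2_btree_iter_peek_max(trans, iter, end);
		ret = bkey_err(k);
		if (ret || !k.k)
			break;

		ret = process_key(trans, k);	/* hypothetical per-key work */
		if (ret)
			break;

		if (!bch2_btree_iter_advance(trans, iter))
			break;
	}
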
-
-/**
- * bch2_btree_iter_next() - returns first key greater than iterator's current
- * position
- * @trans: btree transaction object
- * @iter: iterator to peek from
- *
- * Returns: key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_next(struct btree_trans *trans, struct btree_iter *iter)
-{
- if (!bch2_btree_iter_advance(trans, iter))
- return bkey_s_c_null;
-
- return bch2_btree_iter_peek(trans, iter);
-}
-
-static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos search_key)
-{
- struct bkey_s_c k, k2;
-
- bch2_btree_iter_verify(trans, iter);
-
- while (1) {
- iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
-
- int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
- if (unlikely(ret)) {
- /* ensure that iter->k is consistent with iter->pos: */
- bch2_btree_iter_set_pos(trans, iter, iter->pos);
- k = bkey_s_c_err(ret);
- break;
- }
-
- struct btree_path *path = btree_iter_path(trans, iter);
- struct btree_path_level *l = path_l(path);
-
- if (unlikely(!l->b)) {
- /* No btree nodes at requested level: */
- bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
- k = bkey_s_c_null;
- break;
- }
-
- btree_path_set_should_be_locked(trans, path);
-
- k = btree_path_level_peek_all(trans->c, l, &iter->k);
- if (!k.k || bpos_gt(k.k->p, search_key)) {
- k = btree_path_level_prev(trans, path, l, &iter->k);
-
- BUG_ON(k.k && bpos_gt(k.k->p, search_key));
- }
-
- if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
- k.k &&
- (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) {
- k = k2;
- if (bkey_err(k2)) {
- bch2_btree_iter_set_pos(trans, iter, iter->pos);
- break;
- }
- }
-
- if (unlikely(iter->flags & BTREE_ITER_with_journal))
- btree_trans_peek_prev_journal(trans, iter, search_key, &k);
-
- if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
- trans->nr_updates))
- bch2_btree_trans_peek_prev_updates(trans, iter, search_key, &k);
-
- if (likely(k.k && !bkey_deleted(k.k))) {
- break;
- } else if (k.k) {
- search_key = bpos_predecessor(k.k->p);
- } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) {
- /* Advance to previous leaf node: */
- search_key = bpos_predecessor(path->l[0].b->data->min_key);
- } else {
- /* Start of btree: */
- bch2_btree_iter_set_pos(trans, iter, POS_MIN);
- k = bkey_s_c_null;
- break;
- }
- }
-
- bch2_btree_iter_verify(trans, iter);
- return k;
-}
-
-/**
- * bch2_btree_iter_peek_prev_min() - returns first key less than or equal to
- * iterator's current position
- * @trans: btree transaction object
- * @iter: iterator to peek from
- * @end: search limit: returns keys greater than or equal to @end
- *
- * Returns: key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos end)
-{
- if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) &&
- !bkey_eq(iter->pos, POS_MAX) &&
- !((iter->flags & BTREE_ITER_is_extents) &&
- iter->pos.offset == U64_MAX)) {
-
- /*
- * bkey_start_pos(), for extents, is not monotonically
- * increasing until after filtering for snapshots:
- *
- * Thus, for extents we need to search forward until we find a
- * real visible extent - easiest to just use peek_slot() (which
- * internally uses peek() for extents)
- */
- struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);
- if (bkey_err(k))
- return k;
-
- if (!bkey_deleted(k.k) &&
- (!(iter->flags & BTREE_ITER_is_extents) ||
- bkey_lt(bkey_start_pos(k.k), iter->pos)))
- return k;
- }
-
- struct bpos search_key = iter->pos;
- struct bkey_s_c k;
- btree_path_idx_t saved_path = 0;
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
- bch2_btree_iter_verify_entry_exit(iter);
- EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && iter->pos.inode != end.inode);
-
- int ret = trans_maybe_inject_restart(trans, _RET_IP_);
- if (unlikely(ret)) {
- k = bkey_s_c_err(ret);
- goto out_no_locked;
- }
-
- while (1) {
- k = __bch2_btree_iter_peek_prev(trans, iter, search_key);
- if (unlikely(!k.k))
- goto end;
- if (unlikely(bkey_err(k)))
- goto out_no_locked;
-
- if (iter->flags & BTREE_ITER_filter_snapshots) {
- struct btree_path *s = saved_path ? trans->paths + saved_path : NULL;
- if (s && bpos_lt(k.k->p, SPOS(s->pos.inode, s->pos.offset, iter->snapshot))) {
- /*
- * If we have a saved candidate, and we're past
- * the last possible snapshot overwrite, return
- * it:
- */
- bch2_path_put(trans, iter->path,
- iter->flags & BTREE_ITER_intent);
- iter->path = saved_path;
- saved_path = 0;
- k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
- break;
- }
-
- /*
- * We need to check against @end before FILTER_SNAPSHOTS because
- * if we get to a different inode than requested we might be
- * seeing keys for a different snapshot tree that will all be
- * filtered out.
- */
- if (unlikely(bkey_lt(k.k->p, end)))
- goto end;
-
- if (!bch2_snapshot_is_ancestor(trans->c, iter->snapshot, k.k->p.snapshot)) {
- search_key = bpos_predecessor(k.k->p);
- continue;
- }
-
- if (k.k->p.snapshot != iter->snapshot) {
- /*
- * We have a key visible in iter->snapshot, but it
- * might have overwrites - save it and keep
- * searching. Unless it's a whiteout - then drop
- * our previously saved candidate:
- */
- if (saved_path) {
- bch2_path_put(trans, saved_path,
- iter->flags & BTREE_ITER_intent);
- saved_path = 0;
- }
-
- if (!bkey_whiteout(k.k)) {
- saved_path = btree_path_clone(trans, iter->path,
- iter->flags & BTREE_ITER_intent,
- _THIS_IP_);
- trace_btree_path_save_pos(trans,
- trans->paths + iter->path,
- trans->paths + saved_path);
- }
-
- search_key = bpos_predecessor(k.k->p);
- continue;
- }
-
- if (bkey_whiteout(k.k)) {
- search_key = bkey_predecessor(iter, k.k->p);
- search_key.snapshot = U32_MAX;
- continue;
- }
- }
-
- EBUG_ON(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(k.k->p, iter->pos) :
- iter->flags & BTREE_ITER_is_extents ? bkey_ge(bkey_start_pos(k.k), iter->pos) :
- bkey_gt(k.k->p, iter->pos));
-
- if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_lt(k.k->p, end) :
- iter->flags & BTREE_ITER_is_extents ? bkey_le(k.k->p, end) :
- bkey_lt(k.k->p, end)))
- goto end;
-
- break;
- }
-
- /* Extents can straddle iter->pos: */
- iter->pos = bpos_min(iter->pos, k.k->p);
-
- if (iter->flags & BTREE_ITER_filter_snapshots)
- iter->pos.snapshot = iter->snapshot;
-out_no_locked:
- if (saved_path)
- bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_intent);
-
- bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(trans, iter);
-
- if (trace_btree_iter_peek_prev_min_enabled()) {
- CLASS(printbuf, buf)();
-
- int ret = bkey_err(k);
- if (ret)
- prt_str(&buf, bch2_err_str(ret));
- else if (k.k)
- bch2_bkey_val_to_text(&buf, trans->c, k);
- else
- prt_str(&buf, "(null)");
- trace_btree_iter_peek_prev_min(trans->c, buf.buf);
- }
- return k;
-end:
- bch2_btree_iter_set_pos(trans, iter, end);
- k = bkey_s_c_null;
- goto out_no_locked;
-}
-
-/**
- * bch2_btree_iter_prev() - returns first key less than iterator's current
- * position
- * @trans: btree transaction object
- * @iter: iterator to peek from
- *
- * Returns: key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *trans, struct btree_iter *iter)
-{
- if (!bch2_btree_iter_rewind(trans, iter))
- return bkey_s_c_null;
-
- return bch2_btree_iter_peek_prev(trans, iter);
-}
-
-struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btree_iter *iter)
-{
- struct bpos search_key;
- struct bkey_s_c k;
- int ret;
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
- bch2_btree_iter_verify(trans, iter);
- bch2_btree_iter_verify_entry_exit(iter);
- EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache));
-
- ret = trans_maybe_inject_restart(trans, _RET_IP_);
- if (unlikely(ret)) {
- k = bkey_s_c_err(ret);
- goto out;
- }
-
- /* extents can't span inode numbers: */
- if ((iter->flags & BTREE_ITER_is_extents) &&
- unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
- if (iter->pos.inode == KEY_INODE_MAX) {
- k = bkey_s_c_null;
- goto out2;
- }
-
- bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos));
- }
-
- search_key = btree_iter_search_key(iter);
- iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
-
- ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
- if (unlikely(ret)) {
- k = bkey_s_c_err(ret);
- goto out;
- }
-
- struct btree_path *path = btree_iter_path(trans, iter);
- if (unlikely(!btree_path_node(path, path->level))) {
- k = bkey_s_c_null;
- goto out2;
- }
-
- btree_path_set_should_be_locked(trans, path);
-
- if ((iter->flags & BTREE_ITER_cached) ||
- !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) {
- k = bkey_s_c_null;
-
- if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
- trans->nr_updates)) {
- bch2_btree_trans_peek_slot_updates(trans, iter, &k);
- if (k.k)
- goto out;
- }
-
- if (unlikely(iter->flags & BTREE_ITER_with_journal) &&
- (k = btree_trans_peek_slot_journal(trans, iter)).k)
- goto out;
-
- if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
- (k = btree_trans_peek_key_cache(trans, iter, iter->pos)).k) {
- if (!bkey_err(k))
- iter->k = *k.k;
- /* We're not returning a key from iter->path: */
- goto out;
- }
-
- k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
- if (unlikely(!k.k))
- goto out;
-
- if (unlikely(k.k->type == KEY_TYPE_whiteout &&
- (iter->flags & BTREE_ITER_filter_snapshots) &&
- !(iter->flags & BTREE_ITER_key_cache_fill)))
- iter->k.type = KEY_TYPE_deleted;
- } else {
- struct bpos next;
- struct bpos end = iter->pos;
-
- if (iter->flags & BTREE_ITER_is_extents)
- end.offset = U64_MAX;
-
- EBUG_ON(btree_iter_path(trans, iter)->level);
-
- if (iter->flags & BTREE_ITER_intent) {
- struct btree_iter iter2;
-
- bch2_trans_copy_iter(trans, &iter2, iter);
- k = bch2_btree_iter_peek_max(trans, &iter2, end);
-
- if (k.k && !bkey_err(k)) {
- swap(iter->key_cache_path, iter2.key_cache_path);
- iter->k = iter2.k;
- k.k = &iter->k;
- }
- bch2_trans_iter_exit(trans, &iter2);
- } else {
- struct bpos pos = iter->pos;
-
- k = bch2_btree_iter_peek_max(trans, iter, end);
- if (unlikely(bkey_err(k)))
- bch2_btree_iter_set_pos(trans, iter, pos);
- else
- iter->pos = pos;
- }
-
- if (unlikely(bkey_err(k)))
- goto out;
-
- next = k.k ? bkey_start_pos(k.k) : POS_MAX;
-
- if (bkey_lt(iter->pos, next)) {
- bkey_init(&iter->k);
- iter->k.p = iter->pos;
-
- if (iter->flags & BTREE_ITER_is_extents) {
- bch2_key_resize(&iter->k,
- min_t(u64, KEY_SIZE_MAX,
- (next.inode == iter->pos.inode
- ? next.offset
- : KEY_OFFSET_MAX) -
- iter->pos.offset));
- EBUG_ON(!iter->k.size);
- }
-
- k = (struct bkey_s_c) { &iter->k, NULL };
- }
- }
-out:
- bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(trans, iter);
- ret = bch2_btree_iter_verify_ret(trans, iter, k);
- if (unlikely(ret))
- k = bkey_s_c_err(ret);
-out2:
- if (trace_btree_iter_peek_slot_enabled()) {
- CLASS(printbuf, buf)();
-
- int ret = bkey_err(k);
- if (ret)
- prt_str(&buf, bch2_err_str(ret));
- else if (k.k)
- bch2_bkey_val_to_text(&buf, trans->c, k);
- else
- prt_str(&buf, "(null)");
- trace_btree_iter_peek_slot(trans->c, buf.buf);
- }
-
- return k;
-}
-
-struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *trans, struct btree_iter *iter)
-{
- if (!bch2_btree_iter_advance(trans, iter))
- return bkey_s_c_null;
-
- return bch2_btree_iter_peek_slot(trans, iter);
-}
-
-struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *trans, struct btree_iter *iter)
-{
- if (!bch2_btree_iter_rewind(trans, iter))
- return bkey_s_c_null;
-
- return bch2_btree_iter_peek_slot(trans, iter);
-}
-
-/* Obsolete, but still used by rust wrapper in -tools */
-struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *trans, struct btree_iter *iter)
-{
- struct bkey_s_c k;
-
- while (btree_trans_too_many_iters(trans) ||
- (k = bch2_btree_iter_peek_type(trans, iter, iter->flags),
- bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
- bch2_trans_begin(trans);
-
- return k;
-}
-
-/* new transactional stuff: */
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-static void btree_trans_verify_sorted_refs(struct btree_trans *trans)
-{
- struct btree_path *path;
- unsigned i;
-
- BUG_ON(trans->nr_sorted != bitmap_weight(trans->paths_allocated, trans->nr_paths) - 1);
-
- trans_for_each_path(trans, path, i) {
- BUG_ON(path->sorted_idx >= trans->nr_sorted);
- BUG_ON(trans->sorted[path->sorted_idx] != i);
- }
-
- for (i = 0; i < trans->nr_sorted; i++) {
- unsigned idx = trans->sorted[i];
-
- BUG_ON(!test_bit(idx, trans->paths_allocated));
- BUG_ON(trans->paths[idx].sorted_idx != i);
- }
-}
-
-static void btree_trans_verify_sorted(struct btree_trans *trans)
-{
- struct btree_path *path, *prev = NULL;
- struct trans_for_each_path_inorder_iter iter;
-
- if (!static_branch_unlikely(&bch2_debug_check_iterators))
- return;
-
- trans_for_each_path_inorder(trans, path, iter) {
- if (prev && btree_path_cmp(prev, path) > 0) {
- __bch2_dump_trans_paths_updates(trans, true);
- panic("trans paths out of order!\n");
- }
- prev = path;
- }
-}
-#else
-static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {}
-static inline void btree_trans_verify_sorted(struct btree_trans *trans) {}
-#endif
-
-void __bch2_btree_trans_sort_paths(struct btree_trans *trans)
-{
- int i, l = 0, r = trans->nr_sorted, inc = 1;
- bool swapped;
-
- btree_trans_verify_sorted_refs(trans);
-
- if (trans->paths_sorted)
- goto out;
-
- /*
- * Cocktail shaker sort: this is efficient because iterators will be
- * mostly sorted.
- */
- do {
- swapped = false;
-
- for (i = inc > 0 ? l : r - 2;
- i + 1 < r && i >= l;
- i += inc) {
- if (btree_path_cmp(trans->paths + trans->sorted[i],
- trans->paths + trans->sorted[i + 1]) > 0) {
- swap(trans->sorted[i], trans->sorted[i + 1]);
- trans->paths[trans->sorted[i]].sorted_idx = i;
- trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1;
- swapped = true;
- }
- }
-
- if (inc > 0)
- --r;
- else
- l++;
- inc = -inc;
- } while (swapped);
-
- trans->paths_sorted = true;
-out:
- btree_trans_verify_sorted(trans);
-}
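
The comment above names the algorithm: a cocktail shaker sort is a bidirectional bubble sort whose sorted region grows from both ends, which does very little work when the input is already mostly in order - exactly the situation for a transaction's paths. A self-contained illustration on a plain int array (not bcachefs code, just the algorithm; the sorted_idx back-pointer updates done by the real sort are omitted):

	#include <stdbool.h>
	#include <stddef.h>

	static void swap_int(int *a, int *b)
	{
		int t = *a;
		*a = *b;
		*b = t;
	}

	static void cocktail_shaker_sort(int *v, size_t n)
	{
		if (n < 2)
			return;

		size_t l = 0, r = n - 1;	/* unsorted window is [l, r], inclusive */
		bool swapped = true;

		while (swapped && l < r) {
			swapped = false;

			/* forward pass: largest element in the window bubbles up to r */
			for (size_t i = l; i < r; i++)
				if (v[i] > v[i + 1]) {
					swap_int(&v[i], &v[i + 1]);
					swapped = true;
				}
			r--;

			/* backward pass: smallest element in the window bubbles down to l */
			for (size_t i = r; i > l; i--)
				if (v[i - 1] > v[i]) {
					swap_int(&v[i - 1], &v[i]);
					swapped = true;
				}
			l++;
		}
	}
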
-
-static inline void btree_path_list_remove(struct btree_trans *trans,
- struct btree_path *path)
-{
- EBUG_ON(path->sorted_idx >= trans->nr_sorted);
-#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
- trans->nr_sorted--;
- memmove_u64s_down_small(trans->sorted + path->sorted_idx,
- trans->sorted + path->sorted_idx + 1,
- DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx,
- sizeof(u64) / sizeof(btree_path_idx_t)));
-#else
- array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
-#endif
- for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++)
- trans->paths[trans->sorted[i]].sorted_idx = i;
-}
-
-static inline void btree_path_list_add(struct btree_trans *trans,
- btree_path_idx_t pos,
- btree_path_idx_t path_idx)
-{
- struct btree_path *path = trans->paths + path_idx;
-
- path->sorted_idx = pos ? trans->paths[pos].sorted_idx + 1 : trans->nr_sorted;
-
-#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
- memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1,
- trans->sorted + path->sorted_idx,
- DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx,
- sizeof(u64) / sizeof(btree_path_idx_t)));
- trans->nr_sorted++;
- trans->sorted[path->sorted_idx] = path_idx;
-#else
- array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path_idx);
-#endif
-
- for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++)
- trans->paths[trans->sorted[i]].sorted_idx = i;
-
- btree_trans_verify_sorted_refs(trans);
-}
-
-void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
-{
- if (iter->update_path)
- bch2_path_put(trans, iter->update_path,
- iter->flags & BTREE_ITER_intent);
- if (iter->path)
- bch2_path_put(trans, iter->path,
- iter->flags & BTREE_ITER_intent);
- if (iter->key_cache_path)
- bch2_path_put(trans, iter->key_cache_path,
- iter->flags & BTREE_ITER_intent);
- iter->path = 0;
- iter->update_path = 0;
- iter->key_cache_path = 0;
-}
-
-void bch2_trans_iter_init_outlined(struct btree_trans *trans,
- struct btree_iter *iter,
- enum btree_id btree_id, struct bpos pos,
- unsigned flags)
-{
- bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
- bch2_btree_iter_flags(trans, btree_id, 0, flags),
- _RET_IP_);
-}
-
-void bch2_trans_node_iter_init(struct btree_trans *trans,
- struct btree_iter *iter,
- enum btree_id btree_id,
- struct bpos pos,
- unsigned locks_want,
- unsigned depth,
- unsigned flags)
-{
- flags |= BTREE_ITER_not_extents;
- flags |= BTREE_ITER_snapshot_field;
- flags |= BTREE_ITER_all_snapshots;
-
- if (!depth && btree_id_cached(trans->c, btree_id))
- flags |= BTREE_ITER_with_key_cache;
-
- bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
- bch2_btree_iter_flags(trans, btree_id, depth, flags),
- _RET_IP_);
-
- iter->min_depth = depth;
-
- struct btree_path *path = btree_iter_path(trans, iter);
- BUG_ON(path->locks_want < min(locks_want, BTREE_MAX_DEPTH));
- BUG_ON(path->level != depth);
- BUG_ON(iter->min_depth != depth);
-}
-
-void bch2_trans_copy_iter(struct btree_trans *trans,
- struct btree_iter *dst, struct btree_iter *src)
-{
- *dst = *src;
-#ifdef TRACK_PATH_ALLOCATED
- dst->ip_allocated = _RET_IP_;
-#endif
- if (src->path)
- __btree_path_get(trans, trans->paths + src->path, src->flags & BTREE_ITER_intent);
- if (src->update_path)
- __btree_path_get(trans, trans->paths + src->update_path, src->flags & BTREE_ITER_intent);
- dst->key_cache_path = 0;
-}
-
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
-void bch2_trans_kmalloc_trace_to_text(struct printbuf *out,
- darray_trans_kmalloc_trace *trace)
-{
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, 60);
-
- darray_for_each(*trace, i)
- prt_printf(out, "%pS\t%zu\n", (void *) i->ip, i->bytes);
-}
-#endif
-
-void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long ip)
-{
- struct bch_fs *c = trans->c;
- unsigned new_top = trans->mem_top + size;
- unsigned old_bytes = trans->mem_bytes;
- unsigned new_bytes = roundup_pow_of_two(new_top);
- int ret;
- void *new_mem;
- void *p;
-
- if (WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX)) {
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
- prt_printf(&buf, "bump allocator exceeded BTREE_TRANS_MEM_MAX (%u)\n",
- BTREE_TRANS_MEM_MAX);
-
- bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace);
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
-#endif
- }
-
- ret = trans_maybe_inject_restart(trans, _RET_IP_);
- if (ret)
- return ERR_PTR(ret);
-
- struct btree_transaction_stats *s = btree_trans_stats(trans);
- if (new_bytes > s->max_mem) {
- mutex_lock(&s->lock);
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
- darray_resize(&s->trans_kmalloc_trace, trans->trans_kmalloc_trace.nr);
- s->trans_kmalloc_trace.nr = min(s->trans_kmalloc_trace.size,
- trans->trans_kmalloc_trace.nr);
-
- memcpy(s->trans_kmalloc_trace.data,
- trans->trans_kmalloc_trace.data,
- sizeof(s->trans_kmalloc_trace.data[0]) *
- s->trans_kmalloc_trace.nr);
-#endif
- s->max_mem = new_bytes;
- mutex_unlock(&s->lock);
- }
-
- if (trans->used_mempool || new_bytes > BTREE_TRANS_MEM_MAX) {
- EBUG_ON(trans->mem_bytes >= new_bytes);
- return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
- }
-
- if (old_bytes) {
- trans->realloc_bytes_required = new_bytes;
- trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
- return ERR_PTR(btree_trans_restart_ip(trans,
- BCH_ERR_transaction_restart_mem_realloced, _RET_IP_));
- }
-
- EBUG_ON(trans->mem);
-
- new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN);
- if (unlikely(!new_mem)) {
- bch2_trans_unlock(trans);
-
- new_mem = kmalloc(new_bytes, GFP_KERNEL);
- if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
- new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
- new_bytes = BTREE_TRANS_MEM_MAX;
- trans->used_mempool = true;
- }
-
- EBUG_ON(!new_mem);
-
- trans->mem = new_mem;
- trans->mem_bytes = new_bytes;
-
- ret = bch2_trans_relock(trans);
- if (ret)
- return ERR_PTR(ret);
- }
-
- trans->mem = new_mem;
- trans->mem_bytes = new_bytes;
-
- p = trans->mem + trans->mem_top;
- trans->mem_top += size;
- memset(p, 0, size);
- return p;
-}
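
__bch2_trans_kmalloc() is a bump allocator: each allocation carves space off the top of a single per-transaction buffer, and everything is released at once when mem_top is reset in bch2_trans_begin() below. A self-contained sketch of the scheme, using made-up names and none of the bcachefs types, purely to illustrate the allocation pattern:

	#include <stddef.h>
	#include <string.h>

	struct bump_alloc {
		char	*mem;		/* backing buffer */
		size_t	top;		/* bytes handed out so far */
		size_t	bytes;		/* size of the backing buffer */
	};

	/* hand out zeroed memory from the top of the buffer, or NULL when full */
	static void *bump_alloc(struct bump_alloc *a, size_t size)
	{
		size = (size + 7) & ~(size_t) 7;	/* keep allocations 8-byte aligned */

		if (a->top + size > a->bytes)
			return NULL;

		void *p = a->mem + a->top;
		a->top += size;
		memset(p, 0, size);
		return p;
	}

	/* freeing is a single reset - everything allocated so far goes away at once */
	static void bump_reset(struct bump_alloc *a)
	{
		a->top = 0;
	}
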
-
-static inline void check_srcu_held_too_long(struct btree_trans *trans)
-{
- WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10),
- "btree trans held srcu lock (delaying memory reclaim) for %lu seconds",
- (jiffies - trans->srcu_lock_time) / HZ);
-}
-
-void bch2_trans_srcu_unlock(struct btree_trans *trans)
-{
- if (trans->srcu_held) {
- struct bch_fs *c = trans->c;
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path(trans, path, i)
- if (path->cached && !btree_node_locked(path, 0))
- path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
-
- check_srcu_held_too_long(trans);
- srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
- trans->srcu_held = false;
- }
-}
-
-static void bch2_trans_srcu_lock(struct btree_trans *trans)
-{
- if (!trans->srcu_held) {
- trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier);
- trans->srcu_lock_time = jiffies;
- trans->srcu_held = true;
- }
-}
-
-/**
- * bch2_trans_begin() - reset a transaction after an interrupted attempt
- * @trans: transaction to reset
- *
- * Returns: current restart counter, to be used with trans_was_restarted()
- *
- * While iterating over nodes or updating nodes, an attempt to lock a btree node
- * may return BCH_ERR_transaction_restart when the trylock fails. When this
- * occurs, bch2_trans_begin() should be called and the transaction retried.
- */
-u32 bch2_trans_begin(struct btree_trans *trans)
-{
- struct btree_path *path;
- unsigned i;
- u64 now;
-
- bch2_trans_reset_updates(trans);
-
- trans->restart_count++;
- trans->mem_top = 0;
-
- if (trans->restarted == BCH_ERR_transaction_restart_mem_realloced) {
- EBUG_ON(!trans->mem || !trans->mem_bytes);
- unsigned new_bytes = trans->realloc_bytes_required;
- void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
- if (unlikely(!new_mem)) {
- bch2_trans_unlock(trans);
- new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
-
- EBUG_ON(new_bytes > BTREE_TRANS_MEM_MAX);
-
- if (!new_mem) {
- new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
- new_bytes = BTREE_TRANS_MEM_MAX;
- trans->used_mempool = true;
- kfree(trans->mem);
- }
- }
- trans->mem = new_mem;
- trans->mem_bytes = new_bytes;
- }
-
- trans_for_each_path(trans, path, i) {
- path->should_be_locked = false;
-
- /*
- * If the transaction wasn't restarted, we presume we're doing
- * something new: don't keep iterators except the ones that are
- * in use - and except for the subvolumes btree:
- */
- if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes)
- path->preserve = false;
-
- /*
- * XXX: we probably shouldn't be doing this if the transaction
- * was restarted, but currently we still overflow transaction
- * iterators if we do that
- */
- if (!path->ref && !path->preserve)
- __bch2_path_free(trans, i);
- else
- path->preserve = false;
- }
-
- now = local_clock();
-
- if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT) &&
- time_after64(now, trans->last_begin_time + 10))
- __bch2_time_stats_update(&btree_trans_stats(trans)->duration,
- trans->last_begin_time, now);
-
- if (!trans->restarted &&
- (need_resched() ||
- time_after64(now, trans->last_begin_time + BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS))) {
- bch2_trans_unlock(trans);
- cond_resched();
- now = local_clock();
- }
- trans->last_begin_time = now;
-
- if (unlikely(trans->srcu_held &&
- time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
- bch2_trans_srcu_unlock(trans);
-
- trans->last_begin_ip = _RET_IP_;
-
-#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
- if (trans->restarted) {
- trans->restart_count_this_trans++;
- } else {
- trans->restart_count_this_trans = 0;
- }
-#endif
-
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
- trans->trans_kmalloc_trace.nr = 0;
-#endif
-
- trans_set_locked(trans, false);
-
- if (trans->restarted) {
- bch2_btree_path_traverse_all(trans);
- trans->notrace_relock_fail = false;
- }
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
- return trans->restart_count;
-}
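
The restart counter returned by bch2_trans_begin() lets a caller detect whether code it called restarted the transaction underneath it, invalidating its iterators. A hedged sketch of that check; helper_that_may_restart() is a hypothetical stand-in for any transactional helper:

	u32 restart_count = bch2_trans_begin(trans);

	int ret = helper_that_may_restart(trans);	/* hypothetical helper */

	/* fails with -BCH_ERR_transaction_restart_nested if a restart happened */
	if (!ret)
		ret = trans_was_restarted(trans, restart_count);
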
-
-const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR] = { "(unknown)" };
-
-unsigned bch2_trans_get_fn_idx(const char *fn)
-{
- for (unsigned i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
- if (!bch2_btree_transaction_fns[i] ||
- bch2_btree_transaction_fns[i] == fn) {
- bch2_btree_transaction_fns[i] = fn;
- return i;
- }
-
- pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
- return 0;
-}
-
-struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
- __acquires(&c->btree_trans_barrier)
-{
- struct btree_trans *trans;
-
- if (IS_ENABLED(__KERNEL__)) {
- trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
- if (trans) {
- memset(trans, 0, offsetof(struct btree_trans, list));
- goto got_trans;
- }
- }
-
- trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
- memset(trans, 0, sizeof(*trans));
-
- seqmutex_lock(&c->btree_trans_lock);
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
- struct btree_trans *pos;
- pid_t pid = current->pid;
-
- trans->locking_wait.task = current;
-
- list_for_each_entry(pos, &c->btree_trans_list, list) {
- struct task_struct *pos_task = READ_ONCE(pos->locking_wait.task);
- /*
- * We'd much prefer to be stricter here and completely
- * disallow multiple btree_trans in the same thread -
- * but the data move path calls bch2_write when we
- * already have a btree_trans initialized.
- */
- BUG_ON(pos_task &&
- pid == pos_task->pid &&
- pos->locked);
- }
- }
-
- list_add(&trans->list, &c->btree_trans_list);
- seqmutex_unlock(&c->btree_trans_lock);
-got_trans:
- trans->c = c;
- trans->last_begin_time = local_clock();
- trans->fn_idx = fn_idx;
- trans->locking_wait.task = current;
- trans->journal_replay_not_finished =
- unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)) &&
- atomic_inc_not_zero(&c->journal_keys.ref);
- trans->nr_paths = ARRAY_SIZE(trans->_paths);
- trans->paths_allocated = trans->_paths_allocated;
- trans->sorted = trans->_sorted;
- trans->paths = trans->_paths;
- trans->updates = trans->_updates;
-
- *trans_paths_nr(trans->paths) = BTREE_ITER_INITIAL;
-
- trans->paths_allocated[0] = 1;
-
- static struct lock_class_key lockdep_key;
- lockdep_init_map(&trans->dep_map, "bcachefs_btree", &lockdep_key, 0);
-
- if (fn_idx < BCH_TRANSACTIONS_NR) {
- trans->fn = bch2_btree_transaction_fns[fn_idx];
-
- struct btree_transaction_stats *s = &c->btree_transaction_stats[fn_idx];
-
- if (s->max_mem) {
- unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
-
- trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
- if (likely(trans->mem))
- trans->mem_bytes = expected_mem_bytes;
- }
-
- trans->nr_paths_max = s->nr_max_paths;
- }
-
- trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
- trans->srcu_lock_time = jiffies;
- trans->srcu_held = true;
- trans_set_locked(trans, false);
-
- closure_init_stack_release(&trans->ref);
- return trans;
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-static bool btree_paths_leaked(struct btree_trans *trans)
-{
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path(trans, path, i)
- if (path->ref)
- return true;
- return false;
-}
-
-static void check_btree_paths_leaked(struct btree_trans *trans)
-{
- if (btree_paths_leaked(trans)) {
- struct bch_fs *c = trans->c;
- struct btree_path *path;
- unsigned i;
-
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
-
- prt_printf(&buf, "btree paths leaked from %s!\n", trans->fn);
- trans_for_each_path(trans, path, i)
- if (path->ref)
- prt_printf(&buf, "btree %s %pS\n",
- bch2_btree_id_str(path->btree_id),
- (void *) path->ip_allocated);
-
- bch2_fs_emergency_read_only2(c, &buf);
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- }
-}
-#else
-static inline void check_btree_paths_leaked(struct btree_trans *trans) {}
-#endif
-
-void bch2_trans_put(struct btree_trans *trans)
- __releases(&c->btree_trans_barrier)
-{
- struct bch_fs *c = trans->c;
-
- if (trans->restarted)
- bch2_trans_in_restart_error(trans);
-
- bch2_trans_unlock(trans);
-
- trans_for_each_update(trans, i)
- __btree_path_put(trans, trans->paths + i->path, true);
- trans->nr_updates = 0;
-
- check_btree_paths_leaked(trans);
-
- if (trans->srcu_held) {
- check_srcu_held_too_long(trans);
- srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
- }
-
- if (unlikely(trans->journal_replay_not_finished))
- bch2_journal_keys_put(c);
-
- /*
- * trans->ref protects trans->locking_wait.task, btree_paths array; used
- * by cycle detector
- */
- closure_return_sync(&trans->ref);
- trans->locking_wait.task = NULL;
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- darray_exit(&trans->last_restarted_trace);
-#endif
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
- darray_exit(&trans->trans_kmalloc_trace);
-#endif
-
- unsigned long *paths_allocated = trans->paths_allocated;
- trans->paths_allocated = NULL;
- trans->paths = NULL;
-
- if (paths_allocated != trans->_paths_allocated)
- kvfree_rcu_mightsleep(paths_allocated);
-
- if (trans->used_mempool)
- mempool_free(trans->mem, &c->btree_trans_mem_pool);
- else
- kfree(trans->mem);
-
- /* Userspace doesn't have a real percpu implementation: */
- if (IS_ENABLED(__KERNEL__))
- trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
-
- if (trans) {
- seqmutex_lock(&c->btree_trans_lock);
- list_del(&trans->list);
- seqmutex_unlock(&c->btree_trans_lock);
-
- mempool_free(trans, &c->btree_trans_pool);
- }
-}
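
Transaction objects are obtained and released in matched pairs; the lockdep annotation at the bottom of this file does exactly this. A minimal lifecycle sketch (hedged - real callers typically go through convenience wrappers rather than open-coding this, and any iterators must be exited before the put, or check_btree_paths_leaked() above will complain):

	struct btree_trans *trans = bch2_trans_get(c);

	/* ... use the iterator and update interfaces above ... */

	bch2_trans_put(trans);
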
-
-bool bch2_current_has_btree_trans(struct bch_fs *c)
-{
- seqmutex_lock(&c->btree_trans_lock);
- struct btree_trans *trans;
- bool ret = false;
- list_for_each_entry(trans, &c->btree_trans_list, list)
- if (trans->locking_wait.task == current &&
- trans->locked) {
- ret = true;
- break;
- }
- seqmutex_unlock(&c->btree_trans_lock);
- return ret;
-}
-
-static void __maybe_unused
-bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
- struct btree_bkey_cached_common *b)
-{
- struct six_lock_count c = six_lock_counts(&b->lock);
- pid_t pid;
-
- scoped_guard(rcu) {
- struct task_struct *owner = READ_ONCE(b->lock.owner);
- pid = owner ? owner->pid : 0;
- }
-
- prt_printf(out, "\t%px %c ", b, b->cached ? 'c' : 'b');
- bch2_btree_id_to_text(out, b->btree_id);
- prt_printf(out, " l=%u:", b->level);
- bch2_bpos_to_text(out, btree_node_pos(b));
-
- prt_printf(out, "\t locks %u:%u:%u held by pid %u",
- c.n[0], c.n[1], c.n[2], pid);
-}
-
-void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
-{
- struct btree_bkey_cached_common *b;
- static char lock_types[] = { 'r', 'i', 'w' };
- struct task_struct *task = READ_ONCE(trans->locking_wait.task);
- unsigned l, idx;
-
- /* before rcu_read_lock(): */
- bch2_printbuf_make_room(out, 4096);
-
- if (!out->nr_tabstops) {
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 32);
- }
-
- prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn);
-
- /* trans->paths is rcu protected vs. freeing */
- guard(rcu)();
- out->atomic++;
-
- struct btree_path *paths = rcu_dereference(trans->paths);
- if (!paths)
- goto out;
-
- unsigned long *paths_allocated = trans_paths_allocated(paths);
-
- trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths), idx, 1) {
- struct btree_path *path = paths + idx;
- if (!path->nodes_locked)
- continue;
-
- prt_printf(out, " path %u %c ",
- idx,
- path->cached ? 'c' : 'b');
- bch2_btree_id_to_text(out, path->btree_id);
- prt_printf(out, " l=%u:", path->level);
- bch2_bpos_to_text(out, path->pos);
- prt_newline(out);
-
- for (l = 0; l < BTREE_MAX_DEPTH; l++) {
- if (btree_node_locked(path, l) &&
- !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) {
- prt_printf(out, " %c l=%u ",
- lock_types[btree_node_locked_type(path, l)], l);
- bch2_btree_bkey_cached_common_to_text(out, b);
- prt_newline(out);
- }
- }
- }
-
- b = READ_ONCE(trans->locking);
- if (b) {
- prt_printf(out, " blocked for %lluus on\n",
- div_u64(local_clock() - trans->locking_wait.start_time, 1000));
- prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]);
- bch2_btree_bkey_cached_common_to_text(out, b);
- prt_newline(out);
- }
-out:
- --out->atomic;
-}
-
-void bch2_fs_btree_iter_exit(struct bch_fs *c)
-{
- struct btree_transaction_stats *s;
- struct btree_trans *trans;
- int cpu;
-
- if (c->btree_trans_bufs)
- for_each_possible_cpu(cpu) {
- struct btree_trans *trans =
- per_cpu_ptr(c->btree_trans_bufs, cpu)->trans;
-
- if (trans) {
- seqmutex_lock(&c->btree_trans_lock);
- list_del(&trans->list);
- seqmutex_unlock(&c->btree_trans_lock);
- }
- kfree(trans);
- }
- free_percpu(c->btree_trans_bufs);
-
- trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list);
- if (trans)
- panic("%s leaked btree_trans\n", trans->fn);
-
- for (s = c->btree_transaction_stats;
- s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
- s++) {
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
- darray_exit(&s->trans_kmalloc_trace);
-#endif
- kfree(s->max_paths_text);
- bch2_time_stats_exit(&s->lock_hold_times);
- }
-
- if (c->btree_trans_barrier_initialized) {
- synchronize_srcu_expedited(&c->btree_trans_barrier);
- cleanup_srcu_struct(&c->btree_trans_barrier);
- }
- mempool_exit(&c->btree_trans_mem_pool);
- mempool_exit(&c->btree_trans_pool);
-}
-
-void bch2_fs_btree_iter_init_early(struct bch_fs *c)
-{
- struct btree_transaction_stats *s;
-
- for (s = c->btree_transaction_stats;
- s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
- s++) {
- bch2_time_stats_init(&s->duration);
- bch2_time_stats_init(&s->lock_hold_times);
- mutex_init(&s->lock);
- }
-
- INIT_LIST_HEAD(&c->btree_trans_list);
- seqmutex_init(&c->btree_trans_lock);
-}
-
-int bch2_fs_btree_iter_init(struct bch_fs *c)
-{
- int ret;
-
- c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf);
- if (!c->btree_trans_bufs)
- return -ENOMEM;
-
- ret = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1,
- sizeof(struct btree_trans)) ?:
- mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
- BTREE_TRANS_MEM_MAX) ?:
- init_srcu_struct(&c->btree_trans_barrier);
- if (ret)
- return ret;
-
- /*
- * static annotation (hackily done) for lock ordering of reclaim vs.
- * btree node locks:
- */
-#ifdef CONFIG_LOCKDEP
- fs_reclaim_acquire(GFP_KERNEL);
- struct btree_trans *trans = bch2_trans_get(c);
- trans_set_locked(trans, false);
- bch2_trans_put(trans);
- fs_reclaim_release(GFP_KERNEL);
-#endif
-
- c->btree_trans_barrier_initialized = true;
- return 0;
-}
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
deleted file mode 100644
index 09dd3e52622e..000000000000
--- a/fs/bcachefs/btree_iter.h
+++ /dev/null
@@ -1,1010 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_ITER_H
-#define _BCACHEFS_BTREE_ITER_H
-
-#include "bset.h"
-#include "btree_types.h"
-#include "trace.h"
-
-void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
-void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t);
-void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
-void bch2_dump_trans_paths_updates(struct btree_trans *);
-
-static inline int __bkey_err(const struct bkey *k)
-{
- return PTR_ERR_OR_ZERO(k);
-}
-
-#define bkey_err(_k) __bkey_err((_k).k)
-
-static inline void __btree_path_get(struct btree_trans *trans, struct btree_path *path, bool intent)
-{
- unsigned idx = path - trans->paths;
-
- EBUG_ON(idx >= trans->nr_paths);
- EBUG_ON(!test_bit(idx, trans->paths_allocated));
- if (unlikely(path->ref == U8_MAX)) {
- bch2_dump_trans_paths_updates(trans);
- panic("path %u refcount overflow\n", idx);
- }
-
- path->ref++;
- path->intent_ref += intent;
- trace_btree_path_get_ll(trans, path);
-}
-
-static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
-{
- EBUG_ON(path - trans->paths >= trans->nr_paths);
- EBUG_ON(!test_bit(path - trans->paths, trans->paths_allocated));
- EBUG_ON(!path->ref);
- EBUG_ON(!path->intent_ref && intent);
-
- trace_btree_path_put_ll(trans, path);
- path->intent_ref -= intent;
- return --path->ref == 0;
-}
-
-static inline void btree_path_set_dirty(struct btree_trans *trans,
- struct btree_path *path,
- enum btree_path_uptodate u)
-{
- BUG_ON(path->should_be_locked && trans->locked && !trans->restarted);
- path->uptodate = max_t(unsigned, path->uptodate, u);
-}
-
-static inline struct btree *btree_path_node(struct btree_path *path,
- unsigned level)
-{
- return level < BTREE_MAX_DEPTH ? path->l[level].b : NULL;
-}
-
-static inline bool btree_node_lock_seq_matches(const struct btree_path *path,
- const struct btree *b, unsigned level)
-{
- return path->l[level].lock_seq == six_lock_seq(&b->c.lock);
-}
-
-static inline struct btree *btree_node_parent(struct btree_path *path,
- struct btree *b)
-{
- return btree_path_node(path, b->c.level + 1);
-}
-
-/* Iterate over paths within a transaction: */
-
-void __bch2_btree_trans_sort_paths(struct btree_trans *);
-
-static inline void btree_trans_sort_paths(struct btree_trans *trans)
-{
- if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
- trans->paths_sorted)
- return;
- __bch2_btree_trans_sort_paths(trans);
-}
-
-static inline unsigned long *trans_paths_nr(struct btree_path *paths)
-{
- return &container_of(paths, struct btree_trans_paths, paths[0])->nr_paths;
-}
-
-static inline unsigned long *trans_paths_allocated(struct btree_path *paths)
-{
- unsigned long *v = trans_paths_nr(paths);
- return v - BITS_TO_LONGS(*v);
-}
-
-#define trans_for_each_path_idx_from(_paths_allocated, _nr, _idx, _start)\
- for (_idx = _start; \
- (_idx = find_next_bit(_paths_allocated, _nr, _idx)) < _nr; \
- _idx++)
-
-static inline struct btree_path *
-__trans_next_path(struct btree_trans *trans, unsigned *idx)
-{
- unsigned long *w = trans->paths_allocated + *idx / BITS_PER_LONG;
- /*
- * Open coded find_next_bit(), because
- * - this is the fast path, we can't afford the function call
- * - and we know that nr_paths is a multiple of BITS_PER_LONG.
- */
- while (*idx < trans->nr_paths) {
- unsigned long v = *w >> (*idx & (BITS_PER_LONG - 1));
- if (v) {
- *idx += __ffs(v);
- return trans->paths + *idx;
- }
-
- *idx += BITS_PER_LONG;
- *idx &= ~(BITS_PER_LONG - 1);
- w++;
- }
-
- return NULL;
-}
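
The loop above open-codes find_next_bit() over the paths_allocated bitmap to avoid a function call on a hot path. A self-contained version of the same scan, for illustration only: __builtin_ctzl() stands in for the kernel's __ffs(), and, as in the original, nbits is assumed to be a multiple of the word size:

	#include <limits.h>

	#define BITS_PER_ULONG	(sizeof(unsigned long) * CHAR_BIT)

	/* index of the first set bit >= start, or nbits if there is none */
	static unsigned find_next_set_bit(const unsigned long *bitmap,
					  unsigned nbits, unsigned start)
	{
		while (start < nbits) {
			unsigned long v = bitmap[start / BITS_PER_ULONG]
				>> (start % BITS_PER_ULONG);

			if (v)
				return start + __builtin_ctzl(v);

			/* nothing left in this word: jump to the next word boundary */
			start = (start / BITS_PER_ULONG + 1) * BITS_PER_ULONG;
		}

		return nbits;
	}
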
-
-/*
- * This version is intended to be safe for use on a btree_trans that is owned by
- * another thread, for bch2_btree_trans_to_text();
- */
-#define trans_for_each_path_from(_trans, _path, _idx, _start) \
- for (_idx = _start; \
- (_path = __trans_next_path((_trans), &_idx)); \
- _idx++)
-
-#define trans_for_each_path(_trans, _path, _idx) \
- trans_for_each_path_from(_trans, _path, _idx, 1)
-
-static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path)
-{
- unsigned idx = path ? path->sorted_idx + 1 : 0;
-
- EBUG_ON(idx > trans->nr_sorted);
-
- return idx < trans->nr_sorted
- ? trans->paths + trans->sorted[idx]
- : NULL;
-}
-
-static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path)
-{
- unsigned idx = path ? path->sorted_idx : trans->nr_sorted;
-
- return idx
- ? trans->paths + trans->sorted[idx - 1]
- : NULL;
-}
-
-#define trans_for_each_path_idx_inorder(_trans, _iter) \
- for (_iter = (struct trans_for_each_path_inorder_iter) { 0 }; \
- (_iter.path_idx = trans->sorted[_iter.sorted_idx], \
- _iter.sorted_idx < (_trans)->nr_sorted); \
- _iter.sorted_idx++)
-
-struct trans_for_each_path_inorder_iter {
- btree_path_idx_t sorted_idx;
- btree_path_idx_t path_idx;
-};
-
-#define trans_for_each_path_inorder(_trans, _path, _iter) \
- for (_iter = (struct trans_for_each_path_inorder_iter) { 0 }; \
- (_iter.path_idx = trans->sorted[_iter.sorted_idx], \
- _path = (_trans)->paths + _iter.path_idx, \
- _iter.sorted_idx < (_trans)->nr_sorted); \
- _iter.sorted_idx++)
-
-#define trans_for_each_path_inorder_reverse(_trans, _path, _i) \
- for (_i = trans->nr_sorted - 1; \
- ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) >= 0;\
- --_i)
-
-static inline bool __path_has_node(const struct btree_path *path,
- const struct btree *b)
-{
- return path->l[b->c.level].b == b &&
- btree_node_lock_seq_matches(path, b, b->c.level);
-}
-
-static inline struct btree_path *
-__trans_next_path_with_node(struct btree_trans *trans, struct btree *b,
- unsigned *idx)
-{
- struct btree_path *path;
-
- while ((path = __trans_next_path(trans, idx)) &&
- !__path_has_node(path, b))
- (*idx)++;
-
- return path;
-}
-
-#define trans_for_each_path_with_node(_trans, _b, _path, _iter) \
- for (_iter = 1; \
- (_path = __trans_next_path_with_node((_trans), (_b), &_iter));\
- _iter++)
-
-btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *, btree_path_idx_t,
- bool, unsigned long);
-
-static inline btree_path_idx_t __must_check
-bch2_btree_path_make_mut(struct btree_trans *trans,
- btree_path_idx_t path, bool intent,
- unsigned long ip)
-{
- if (trans->paths[path].ref > 1 ||
- trans->paths[path].preserve)
- path = __bch2_btree_path_make_mut(trans, path, intent, ip);
- trans->paths[path].should_be_locked = false;
- return path;
-}
-
-btree_path_idx_t __must_check
-__bch2_btree_path_set_pos(struct btree_trans *, btree_path_idx_t,
- struct bpos, bool, unsigned long);
-
-static inline btree_path_idx_t __must_check
-bch2_btree_path_set_pos(struct btree_trans *trans,
- btree_path_idx_t path, struct bpos new_pos,
- bool intent, unsigned long ip)
-{
- return !bpos_eq(new_pos, trans->paths[path].pos)
- ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip)
- : path;
-}
-
-int __must_check bch2_btree_path_traverse_one(struct btree_trans *,
- btree_path_idx_t,
- unsigned, unsigned long);
-
-static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *);
-
-static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
- btree_path_idx_t path, unsigned flags)
-{
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
- if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK)
- return 0;
-
- return bch2_btree_path_traverse_one(trans, path, flags, _RET_IP_);
-}
-
-btree_path_idx_t bch2_path_get(struct btree_trans *, enum btree_id, struct bpos,
- unsigned, unsigned, unsigned, unsigned long);
-btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *, enum btree_id,
- unsigned, struct bpos);
-
-struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
-
-/*
- * bch2_btree_path_peek_slot() for a cached iterator might return a key in a
- * different snapshot:
- */
-static inline struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u)
-{
- struct bkey_s_c k = bch2_btree_path_peek_slot(path, u);
-
- if (k.k && bpos_eq(path->pos, k.k->p))
- return k;
-
- bkey_init(u);
- u->p = path->pos;
- return (struct bkey_s_c) { u, NULL };
-}
-
-struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *,
- struct btree_iter *, struct bpos);
-
-void bch2_btree_path_level_init(struct btree_trans *, struct btree_path *, struct btree *);
-
-int __bch2_trans_mutex_lock(struct btree_trans *, struct mutex *);
-
-static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex *lock)
-{
- return mutex_trylock(lock)
- ? 0
- : __bch2_trans_mutex_lock(trans, lock);
-}
-
-/* Debug: */
-
-void __bch2_trans_verify_paths(struct btree_trans *);
-void __bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos);
-
-static inline void bch2_trans_verify_paths(struct btree_trans *trans)
-{
- if (static_branch_unlikely(&bch2_debug_check_iterators))
- __bch2_trans_verify_paths(trans);
-}
-
-static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id btree,
- struct bpos pos)
-{
- if (static_branch_unlikely(&bch2_debug_check_iterators))
- __bch2_assert_pos_locked(trans, btree, pos);
-}
-
-void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
- struct btree *, struct bkey_packed *);
-void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *,
- struct btree *, struct btree_node_iter *,
- struct bkey_packed *, unsigned, unsigned);
-
-int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *);
-
-void bch2_path_put(struct btree_trans *, btree_path_idx_t, bool);
-
-int bch2_trans_relock(struct btree_trans *);
-int bch2_trans_relock_notrace(struct btree_trans *);
-void bch2_trans_unlock(struct btree_trans *);
-void bch2_trans_unlock_long(struct btree_trans *);
-
-static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count)
-{
- return restart_count != trans->restart_count
- ? -BCH_ERR_transaction_restart_nested
- : 0;
-}
-
-void __noreturn bch2_trans_restart_error(struct btree_trans *, u32);
-
-static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans,
- u32 restart_count)
-{
- if (trans_was_restarted(trans, restart_count))
- bch2_trans_restart_error(trans, restart_count);
-}
-
-void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *);
-
-static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *trans)
-{
- if (trans->restarted || !trans->locked)
- bch2_trans_unlocked_or_in_restart_error(trans);
-}
-
-__always_inline
-static int btree_trans_restart_foreign_task(struct btree_trans *trans, int err, unsigned long ip)
-{
- BUG_ON(err <= 0);
- BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart));
-
- trans->restarted = err;
- trans->last_restarted_ip = ip;
- return -err;
-}
-
-__always_inline
-static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip)
-{
- btree_trans_restart_foreign_task(trans, err, ip);
-#ifdef CONFIG_BCACHEFS_DEBUG
- darray_exit(&trans->last_restarted_trace);
- bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT);
-#endif
- return -err;
-}
-
-__always_inline
-static int btree_trans_restart(struct btree_trans *trans, int err)
-{
- return btree_trans_restart_ip(trans, err, _THIS_IP_);
-}
-
-static inline int trans_maybe_inject_restart(struct btree_trans *trans, unsigned long ip)
-{
-#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
- if (!(ktime_get_ns() & ~(~0ULL << min(63, (10 + trans->restart_count_this_trans))))) {
- trace_and_count(trans->c, trans_restart_injected, trans, ip);
- return btree_trans_restart_ip(trans,
- BCH_ERR_transaction_restart_fault_inject, ip);
- }
-#endif
- return 0;
-}
-
-bool bch2_btree_node_upgrade(struct btree_trans *,
- struct btree_path *, unsigned);
-
-void __bch2_btree_path_downgrade(struct btree_trans *, struct btree_path *, unsigned);
-
-static inline void bch2_btree_path_downgrade(struct btree_trans *trans,
- struct btree_path *path)
-{
- unsigned new_locks_want = path->level + !!path->intent_ref;
-
- if (path->locks_want > new_locks_want)
- __bch2_btree_path_downgrade(trans, path, new_locks_want);
-}
-
-void bch2_trans_downgrade(struct btree_trans *);
-
-void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct btree *);
-void bch2_trans_node_drop(struct btree_trans *trans, struct btree *);
-void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *);
-
-int __must_check __bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *);
-int __must_check bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *);
-
-struct btree *bch2_btree_iter_peek_node(struct btree_trans *, struct btree_iter *);
-struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *, struct btree_iter *);
-struct btree *bch2_btree_iter_next_node(struct btree_trans *, struct btree_iter *);
-
-struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *, struct btree_iter *, struct bpos);
-struct bkey_s_c bch2_btree_iter_next(struct btree_trans *, struct btree_iter *);
-
-static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_trans *trans,
- struct btree_iter *iter)
-{
- return bch2_btree_iter_peek_max(trans, iter, SPOS_MAX);
-}
-
-struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *, struct btree_iter *, struct bpos);
-
-static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter)
-{
- return bch2_btree_iter_peek_prev_min(trans, iter, POS_MIN);
-}
-
-struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *, struct btree_iter *);
-
-struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *, struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *, struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *, struct btree_iter *);
-
-bool bch2_btree_iter_advance(struct btree_trans *, struct btree_iter *);
-bool bch2_btree_iter_rewind(struct btree_trans *, struct btree_iter *);
-
-static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
-{
- iter->k.type = KEY_TYPE_deleted;
- iter->k.p.inode = iter->pos.inode = new_pos.inode;
- iter->k.p.offset = iter->pos.offset = new_pos.offset;
- iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot;
- iter->k.size = 0;
-}
-
-static inline void bch2_btree_iter_set_pos(struct btree_trans *trans,
- struct btree_iter *iter, struct bpos new_pos)
-{
- if (unlikely(iter->update_path))
- bch2_path_put(trans, iter->update_path,
- iter->flags & BTREE_ITER_intent);
- iter->update_path = 0;
-
- if (!(iter->flags & BTREE_ITER_all_snapshots))
- new_pos.snapshot = iter->snapshot;
-
- __bch2_btree_iter_set_pos(iter, new_pos);
-}
-
-static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter)
-{
- BUG_ON(!(iter->flags & BTREE_ITER_is_extents));
- iter->pos = bkey_start_pos(&iter->k);
-}
-
-static inline void bch2_btree_iter_set_snapshot(struct btree_trans *trans,
- struct btree_iter *iter, u32 snapshot)
-{
- struct bpos pos = iter->pos;
-
- iter->snapshot = snapshot;
- pos.snapshot = snapshot;
- bch2_btree_iter_set_pos(trans, iter, pos);
-}
-
-void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *);
-
-static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans,
- unsigned btree_id,
- unsigned level,
- unsigned flags)
-{
- if (level || !btree_id_cached(trans->c, btree_id)) {
- flags &= ~BTREE_ITER_cached;
- flags &= ~BTREE_ITER_with_key_cache;
- } else if (!(flags & BTREE_ITER_cached))
- flags |= BTREE_ITER_with_key_cache;
-
- if (!(flags & (BTREE_ITER_all_snapshots|BTREE_ITER_not_extents)) &&
- btree_id_is_extents(btree_id))
- flags |= BTREE_ITER_is_extents;
-
- if (!(flags & BTREE_ITER_snapshot_field) &&
- !btree_type_has_snapshot_field(btree_id))
- flags &= ~BTREE_ITER_all_snapshots;
-
- if (!(flags & BTREE_ITER_all_snapshots) &&
- btree_type_has_snapshots(btree_id))
- flags |= BTREE_ITER_filter_snapshots;
-
- if (trans->journal_replay_not_finished)
- flags |= BTREE_ITER_with_journal;
-
- return flags;
-}
-
-static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- unsigned locks_want,
- unsigned depth,
- unsigned flags,
- unsigned long ip)
-{
- iter->update_path = 0;
- iter->key_cache_path = 0;
- iter->btree_id = btree_id;
- iter->min_depth = 0;
- iter->flags = flags;
- iter->snapshot = pos.snapshot;
- iter->pos = pos;
- iter->k = POS_KEY(pos);
- iter->journal_idx = 0;
-#ifdef CONFIG_BCACHEFS_DEBUG
- iter->ip_allocated = ip;
-#endif
- iter->path = bch2_path_get(trans, btree_id, iter->pos,
- locks_want, depth, flags, ip);
-}
-
-void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *,
- enum btree_id, struct bpos, unsigned);
-
-static inline void bch2_trans_iter_init(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- unsigned flags)
-{
- if (__builtin_constant_p(btree_id) &&
- __builtin_constant_p(flags))
- bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
- bch2_btree_iter_flags(trans, btree_id, 0, flags),
- _THIS_IP_);
- else
- bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags);
-}
-
-void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *,
- enum btree_id, struct bpos,
- unsigned, unsigned, unsigned);
-void bch2_trans_copy_iter(struct btree_trans *, struct btree_iter *, struct btree_iter *);
-
-void bch2_set_btree_iter_dontneed(struct btree_trans *, struct btree_iter *);
-
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
-void bch2_trans_kmalloc_trace_to_text(struct printbuf *,
- darray_trans_kmalloc_trace *);
-#endif
-
-void *__bch2_trans_kmalloc(struct btree_trans *, size_t, unsigned long);
-
-static inline void bch2_trans_kmalloc_trace(struct btree_trans *trans, size_t size,
- unsigned long ip)
-{
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
- darray_push(&trans->trans_kmalloc_trace,
- ((struct trans_kmalloc_trace) { .ip = ip, .bytes = size }));
-#endif
-}
-
-static __always_inline void *bch2_trans_kmalloc_nomemzero_ip(struct btree_trans *trans, size_t size,
- unsigned long ip)
-{
- size = roundup(size, 8);
-
- bch2_trans_kmalloc_trace(trans, size, ip);
-
- if (likely(trans->mem_top + size <= trans->mem_bytes)) {
- void *p = trans->mem + trans->mem_top;
-
- trans->mem_top += size;
- return p;
- } else {
- return __bch2_trans_kmalloc(trans, size, ip);
- }
-}
-
-static __always_inline void *bch2_trans_kmalloc_ip(struct btree_trans *trans, size_t size,
- unsigned long ip)
-{
- size = roundup(size, 8);
-
- bch2_trans_kmalloc_trace(trans, size, ip);
-
- if (likely(trans->mem_top + size <= trans->mem_bytes)) {
- void *p = trans->mem + trans->mem_top;
-
- trans->mem_top += size;
- memset(p, 0, size);
- return p;
- } else {
- return __bch2_trans_kmalloc(trans, size, ip);
- }
-}
-
-/**
- * bch2_trans_kmalloc - allocate memory for use by the current transaction
- *
- * Must be called after bch2_trans_begin, which on second and further calls
- * frees all memory allocated in this transaction
- */
-static __always_inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
-{
- return bch2_trans_kmalloc_ip(trans, size, _THIS_IP_);
-}
-
-static __always_inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size)
-{
- return bch2_trans_kmalloc_nomemzero_ip(trans, size, _THIS_IP_);
-}
-
-static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- unsigned flags, unsigned type)
-{
- struct bkey_s_c k;
-
- bch2_trans_iter_init(trans, iter, btree_id, pos, flags);
- k = bch2_btree_iter_peek_slot(trans, iter);
-
- if (!bkey_err(k) && type && k.k->type != type)
- k = bkey_s_c_err(-BCH_ERR_ENOENT_bkey_type_mismatch);
- if (unlikely(bkey_err(k)))
- bch2_trans_iter_exit(trans, iter);
- return k;
-}
-
-static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- unsigned flags)
-{
- return __bch2_bkey_get_iter(trans, iter, btree_id, pos, flags, 0);
-}
-
-#define bch2_bkey_get_iter_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\
- bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \
- _btree_id, _pos, _flags, KEY_TYPE_##_type))
-
-static inline void __bkey_val_copy(void *dst_v, unsigned dst_size, struct bkey_s_c src_k)
-{
- unsigned b = min_t(unsigned, dst_size, bkey_val_bytes(src_k.k));
- memcpy(dst_v, src_k.v, b);
- if (unlikely(b < dst_size))
- memset(dst_v + b, 0, dst_size - b);
-}
-
-#define bkey_val_copy(_dst_v, _src_k) \
-do { \
- BUILD_BUG_ON(!__typecheck(*_dst_v, *_src_k.v)); \
- __bkey_val_copy(_dst_v, sizeof(*_dst_v), _src_k.s_c); \
-} while (0)
-
-static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans,
- unsigned btree_id, struct bpos pos,
- unsigned flags, unsigned type,
- unsigned val_size, void *val)
-{
- struct btree_iter iter;
- struct bkey_s_c k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type);
- int ret = bkey_err(k);
- if (!ret) {
- __bkey_val_copy(val, val_size, k);
- bch2_trans_iter_exit(trans, &iter);
- }
-
- return ret;
-}
-
-#define bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, _type, _val)\
- __bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, \
- KEY_TYPE_##_type, sizeof(*_val), _val)
-
-void bch2_trans_srcu_unlock(struct btree_trans *);
-
-u32 bch2_trans_begin(struct btree_trans *);
-
-#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \
- _locks_want, _depth, _flags, _b, _do) \
-({ \
- bch2_trans_begin((_trans)); \
- \
- struct btree_iter _iter; \
- bch2_trans_node_iter_init((_trans), &_iter, (_btree_id), \
- _start, _locks_want, _depth, _flags); \
- int _ret3 = 0; \
- do { \
- _ret3 = lockrestart_do((_trans), ({ \
- struct btree *_b = bch2_btree_iter_peek_node(_trans, &_iter);\
- if (!_b) \
- break; \
- \
- PTR_ERR_OR_ZERO(_b) ?: (_do); \
- })) ?: \
- lockrestart_do((_trans), \
- PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(_trans, &_iter)));\
- } while (!_ret3); \
- \
- bch2_trans_iter_exit((_trans), &(_iter)); \
- _ret3; \
-})
-
-#define for_each_btree_node(_trans, _iter, _btree_id, _start, \
- _flags, _b, _do) \
- __for_each_btree_node(_trans, _iter, _btree_id, _start, \
- 0, 0, _flags, _b, _do)
-
-static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned flags)
-{
- return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(trans, iter) :
- bch2_btree_iter_peek_prev(trans, iter);
-}
-
-static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned flags)
-{
- return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(trans, iter) :
- bch2_btree_iter_peek(trans, iter);
-}
-
-static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos end,
- unsigned flags)
-{
- if (!(flags & BTREE_ITER_slots))
- return bch2_btree_iter_peek_max(trans, iter, end);
-
- if (bkey_gt(iter->pos, end))
- return bkey_s_c_null;
-
- return bch2_btree_iter_peek_slot(trans, iter);
-}
-
-int __bch2_btree_trans_too_many_iters(struct btree_trans *);
-
-static inline int btree_trans_too_many_iters(struct btree_trans *trans)
-{
- if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_NORMAL_LIMIT - 8)
- return __bch2_btree_trans_too_many_iters(trans);
-
- return 0;
-}
-
-/*
- * goto instead of loop, so that when used inside for_each_btree_key2()
- * break/continue work correctly
- */
-#define lockrestart_do(_trans, _do) \
-({ \
- __label__ transaction_restart; \
- u32 _restart_count; \
- int _ret2; \
-transaction_restart: \
- _restart_count = bch2_trans_begin(_trans); \
- _ret2 = (_do); \
- \
- if (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)) \
- goto transaction_restart; \
- \
- if (!_ret2) \
- bch2_trans_verify_not_restarted(_trans, _restart_count);\
- _ret2; \
-})
-
-/*
- * nested_lockrestart_do(), nested_commit_do():
- *
- * These are like lockrestart_do() and commit_do(), with two differences:
- *
- * - We don't call bch2_trans_begin() unless we had a transaction restart
- * - We return -BCH_ERR_transaction_restart_nested if we succeeded after a
- * transaction restart
- */
-#define nested_lockrestart_do(_trans, _do) \
-({ \
- u32 _restart_count, _orig_restart_count; \
- int _ret2; \
- \
- _restart_count = _orig_restart_count = (_trans)->restart_count; \
- \
- while (bch2_err_matches(_ret2 = (_do), BCH_ERR_transaction_restart))\
- _restart_count = bch2_trans_begin(_trans); \
- \
- if (!_ret2) \
- bch2_trans_verify_not_restarted(_trans, _restart_count);\
- \
- _ret2 ?: trans_was_restarted(_trans, _orig_restart_count); \
-})
-
-#define for_each_btree_key_max_continue(_trans, _iter, \
- _end, _flags, _k, _do) \
-({ \
- struct bkey_s_c _k; \
- int _ret3 = 0; \
- \
- do { \
- _ret3 = lockrestart_do(_trans, ({ \
- (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), \
- _end, (_flags)); \
- if (!(_k).k) \
- break; \
- \
- bkey_err(_k) ?: (_do); \
- })); \
- } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \
- \
- bch2_trans_iter_exit((_trans), &(_iter)); \
- _ret3; \
-})
-
-#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _do) \
- for_each_btree_key_max_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do)
-
-#define for_each_btree_key_max(_trans, _iter, _btree_id, \
- _start, _end, _flags, _k, _do) \
-({ \
- bch2_trans_begin(trans); \
- \
- struct btree_iter _iter; \
- bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- \
- for_each_btree_key_max_continue(_trans, _iter, _end, _flags, _k, _do);\
-})
-
-#define for_each_btree_key(_trans, _iter, _btree_id, \
- _start, _flags, _k, _do) \
- for_each_btree_key_max(_trans, _iter, _btree_id, _start, \
- SPOS_MAX, _flags, _k, _do)
-
-#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \
- _start, _flags, _k, _do) \
-({ \
- struct btree_iter _iter; \
- struct bkey_s_c _k; \
- int _ret3 = 0; \
- \
- bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- \
- do { \
- _ret3 = lockrestart_do(_trans, ({ \
- (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), \
- (_flags)); \
- if (!(_k).k) \
- break; \
- \
- bkey_err(_k) ?: (_do); \
- })); \
- } while (!_ret3 && bch2_btree_iter_rewind(_trans, &(_iter))); \
- \
- bch2_trans_iter_exit((_trans), &(_iter)); \
- _ret3; \
-})
-
-#define for_each_btree_key_commit(_trans, _iter, _btree_id, \
- _start, _iter_flags, _k, \
- _disk_res, _journal_seq, _commit_flags,\
- _do) \
- for_each_btree_key(_trans, _iter, _btree_id, _start, _iter_flags, _k,\
- (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
- (_journal_seq), (_commit_flags)))
-
-#define for_each_btree_key_reverse_commit(_trans, _iter, _btree_id, \
- _start, _iter_flags, _k, \
- _disk_res, _journal_seq, _commit_flags,\
- _do) \
- for_each_btree_key_reverse(_trans, _iter, _btree_id, _start, _iter_flags, _k,\
- (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
- (_journal_seq), (_commit_flags)))
-
-#define for_each_btree_key_max_commit(_trans, _iter, _btree_id, \
- _start, _end, _iter_flags, _k, \
- _disk_res, _journal_seq, _commit_flags,\
- _do) \
- for_each_btree_key_max(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\
- (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
- (_journal_seq), (_commit_flags)))
-
-struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *,
- struct btree_iter *);
-
-#define for_each_btree_key_max_norestart(_trans, _iter, _btree_id, \
- _start, _end, _flags, _k, _ret) \
- for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags),\
- !((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_advance(_trans, &(_iter)))
-
-#define for_each_btree_key_max_continue_norestart(_trans, _iter, _end, _flags, _k, _ret)\
- for (; \
- (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags), \
- !((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_advance(_trans, &(_iter)))
-
-#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \
- _start, _flags, _k, _ret) \
- for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start,\
- SPOS_MAX, _flags, _k, _ret)
-
-#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \
- _start, _flags, _k, _ret) \
- for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), _flags), \
- !((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_rewind(_trans, &(_iter)))
-
-#define for_each_btree_key_continue_norestart(_trans, _iter, _flags, _k, _ret) \
- for_each_btree_key_max_continue_norestart(_trans, _iter, SPOS_MAX, _flags, _k, _ret)
-
-/*
- * This should not be used in a fastpath, without first trying _do in
- * nonblocking mode - it will cause excessive transaction restarts and
- * potentially livelocking:
- */
-#define drop_locks_do(_trans, _do) \
-({ \
- bch2_trans_unlock(_trans); \
- (_do) ?: bch2_trans_relock(_trans); \
-})
-
-#define allocate_dropping_locks_errcode(_trans, _do) \
-({ \
- gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \
- int _ret = _do; \
- \
- if (bch2_err_matches(_ret, ENOMEM)) { \
- _gfp = GFP_KERNEL; \
- _ret = drop_locks_do(_trans, _do); \
- } \
- _ret; \
-})
-
-#define allocate_dropping_locks(_trans, _ret, _do) \
-({ \
- gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \
- typeof(_do) _p = _do; \
- \
- _ret = 0; \
- if (unlikely(!_p)) { \
- _gfp = GFP_KERNEL; \
- _ret = drop_locks_do(_trans, ((_p = _do), 0)); \
- } \
- _p; \
-})
-
-struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned);
-void bch2_trans_put(struct btree_trans *);
-
-bool bch2_current_has_btree_trans(struct bch_fs *);
-
-extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
-unsigned bch2_trans_get_fn_idx(const char *);
-
-#define bch2_trans_get(_c) \
-({ \
- static unsigned trans_fn_idx; \
- \
- if (unlikely(!trans_fn_idx)) \
- trans_fn_idx = bch2_trans_get_fn_idx(__func__); \
- __bch2_trans_get(_c, trans_fn_idx); \
-})
-
-/*
- * We don't use DEFINE_CLASS() because using a function for the constructor
- * breaks bch2_trans_get()'s use of __func__
- */
-typedef struct btree_trans * class_btree_trans_t;
-static inline void class_btree_trans_destructor(struct btree_trans **p)
-{
- struct btree_trans *trans = *p;
- bch2_trans_put(trans);
-}
-
-#define class_btree_trans_constructor(_c) bch2_trans_get(_c)
-
-#define bch2_trans_run(_c, _do) \
-({ \
- CLASS(btree_trans, trans)(_c); \
- (_do); \
-})
-
-#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do))
-
-void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *);
-
-void bch2_fs_btree_iter_exit(struct bch_fs *);
-void bch2_fs_btree_iter_init_early(struct bch_fs *);
-int bch2_fs_btree_iter_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_ITER_H */
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
deleted file mode 100644
index ea839560a136..000000000000
--- a/fs/bcachefs/btree_journal_iter.c
+++ /dev/null
@@ -1,830 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "bset.h"
-#include "btree_cache.h"
-#include "btree_journal_iter.h"
-#include "journal_io.h"
-
-#include <linux/sort.h>
-
-/*
- * For managing keys we read from the journal: until journal replay finishes,
- * normal btree lookups need to be able to find and return keys from the journal where
- * they overwrite what's in the btree, so we have a special iterator and
- * operations for the regular btree iter code to use:
- */
-
-static inline size_t pos_to_idx(struct journal_keys *keys, size_t pos)
-{
- size_t gap_size = keys->size - keys->nr;
-
- BUG_ON(pos >= keys->gap && pos < keys->gap + gap_size);
-
- if (pos >= keys->gap)
- pos -= gap_size;
- return pos;
-}
-
-static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
-{
- size_t gap_size = keys->size - keys->nr;
-
- if (idx >= keys->gap)
- idx += gap_size;
- return idx;
-}
-
-static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
-{
- return keys->data + idx_to_pos(keys, idx);
-}
-
-static size_t __bch2_journal_key_search(struct journal_keys *keys,
- enum btree_id id, unsigned level,
- struct bpos pos)
-{
- size_t l = 0, r = keys->nr, m;
-
- while (l < r) {
- m = l + ((r - l) >> 1);
- if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
- l = m + 1;
- else
- r = m;
- }
-
- BUG_ON(l < keys->nr &&
- __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);
-
- BUG_ON(l &&
- __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);
-
- return l;
-}
-
-static size_t bch2_journal_key_search(struct journal_keys *keys,
- enum btree_id id, unsigned level,
- struct bpos pos)
-{
- return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
-}
-
-/* Returns first non-overwritten key >= search key: */
-struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_id,
- unsigned level, struct bpos pos,
- struct bpos end_pos, size_t *idx)
-{
- struct journal_keys *keys = &c->journal_keys;
- unsigned iters = 0;
- struct journal_key *k;
-
- BUG_ON(*idx > keys->nr);
-search:
- if (!*idx)
- *idx = __bch2_journal_key_search(keys, btree_id, level, pos);
-
- while (*idx &&
- __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
- --(*idx);
- iters++;
- if (iters == 10) {
- *idx = 0;
- goto search;
- }
- }
-
- struct bkey_i *ret = NULL;
- rcu_read_lock(); /* for overwritten_ranges */
-
- while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
- if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
- break;
-
- if (k->overwritten) {
- if (k->overwritten_range)
- *idx = rcu_dereference(k->overwritten_range)->end;
- else
- *idx += 1;
- continue;
- }
-
- if (__journal_key_cmp(btree_id, level, pos, k) <= 0) {
- ret = k->k;
- break;
- }
-
- (*idx)++;
- iters++;
- if (iters == 10) {
- *idx = 0;
- rcu_read_unlock();
- goto search;
- }
- }
-
- rcu_read_unlock();
- return ret;
-}
-
-struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id,
- unsigned level, struct bpos pos,
- struct bpos end_pos, size_t *idx)
-{
- struct journal_keys *keys = &c->journal_keys;
- unsigned iters = 0;
- struct journal_key *k;
-
- BUG_ON(*idx > keys->nr);
-
- if (!keys->nr)
- return NULL;
-search:
- if (!*idx)
- *idx = __bch2_journal_key_search(keys, btree_id, level, pos);
-
- while (*idx < keys->nr &&
- __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx)) >= 0) {
- (*idx)++;
- iters++;
- if (iters == 10) {
- *idx = 0;
- goto search;
- }
- }
-
- if (*idx == keys->nr)
- --(*idx);
-
- struct bkey_i *ret = NULL;
- rcu_read_lock(); /* for overwritten_ranges */
-
- while (true) {
- k = idx_to_key(keys, *idx);
- if (__journal_key_cmp(btree_id, level, end_pos, k) > 0)
- break;
-
- if (k->overwritten) {
- if (k->overwritten_range)
- *idx = rcu_dereference(k->overwritten_range)->start;
- if (!*idx)
- break;
- --(*idx);
- continue;
- }
-
- if (__journal_key_cmp(btree_id, level, pos, k) >= 0) {
- ret = k->k;
- break;
- }
-
- if (!*idx)
- break;
- --(*idx);
- iters++;
- if (iters == 10) {
- *idx = 0;
- goto search;
- }
- }
-
- rcu_read_unlock();
- return ret;
-}
-
-struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
- unsigned level, struct bpos pos)
-{
- size_t idx = 0;
-
- return bch2_journal_keys_peek_max(c, btree_id, level, pos, pos, &idx);
-}
-
-static void journal_iter_verify(struct journal_iter *iter)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct journal_keys *keys = iter->keys;
- size_t gap_size = keys->size - keys->nr;
-
- BUG_ON(iter->idx >= keys->gap &&
- iter->idx < keys->gap + gap_size);
-
- if (iter->idx < keys->size) {
- struct journal_key *k = keys->data + iter->idx;
-
- int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k);
- BUG_ON(cmp > 0);
- }
-#endif
-}
-
-static void journal_iters_fix(struct bch_fs *c)
-{
- struct journal_keys *keys = &c->journal_keys;
- /* The key we just inserted is immediately before the gap: */
- size_t gap_end = keys->gap + (keys->size - keys->nr);
- struct journal_key *new_key = &keys->data[keys->gap - 1];
- struct journal_iter *iter;
-
- /*
- * If an iterator points one after the key we just inserted, decrement
- * the iterator so it points at the key we just inserted - if the
- * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
- * handle that:
- */
- list_for_each_entry(iter, &c->journal_iters, list) {
- journal_iter_verify(iter);
- if (iter->idx == gap_end &&
- new_key->btree_id == iter->btree_id &&
- new_key->level == iter->level)
- iter->idx = keys->gap - 1;
- journal_iter_verify(iter);
- }
-}
-
-static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
-{
- struct journal_keys *keys = &c->journal_keys;
- struct journal_iter *iter;
- size_t gap_size = keys->size - keys->nr;
-
- list_for_each_entry(iter, &c->journal_iters, list) {
- if (iter->idx > old_gap)
- iter->idx -= gap_size;
- if (iter->idx >= new_gap)
- iter->idx += gap_size;
- }
-}
-
-int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
- unsigned level, struct bkey_i *k)
-{
- struct journal_key n = {
- .btree_id = id,
- .level = level,
- .k = k,
- .allocated = true,
- /*
- * Ensure these keys are done last by journal replay, to unblock
- * journal reclaim:
- */
- .journal_seq = U64_MAX,
- };
- struct journal_keys *keys = &c->journal_keys;
- size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
-
- BUG_ON(test_bit(BCH_FS_rw, &c->flags));
-
- if (idx < keys->size &&
- journal_key_cmp(&n, &keys->data[idx]) == 0) {
- if (keys->data[idx].allocated)
- kfree(keys->data[idx].k);
- keys->data[idx] = n;
- return 0;
- }
-
- if (idx > keys->gap)
- idx -= keys->size - keys->nr;
-
- size_t old_gap = keys->gap;
-
- if (keys->nr == keys->size) {
- journal_iters_move_gap(c, old_gap, keys->size);
- old_gap = keys->size;
-
- struct journal_keys new_keys = {
- .nr = keys->nr,
- .size = max_t(size_t, keys->size, 8) * 2,
- };
-
- new_keys.data = bch2_kvmalloc(new_keys.size * sizeof(new_keys.data[0]), GFP_KERNEL);
- if (!new_keys.data) {
- bch_err(c, "%s: error allocating new key array (size %zu)",
- __func__, new_keys.size);
- return bch_err_throw(c, ENOMEM_journal_key_insert);
- }
-
- /* Since @keys was full, there was no gap: */
- memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr);
- kvfree(keys->data);
- keys->data = new_keys.data;
- keys->nr = new_keys.nr;
- keys->size = new_keys.size;
-
- /* And now the gap is at the end: */
- keys->gap = keys->nr;
- }
-
- journal_iters_move_gap(c, old_gap, idx);
-
- move_gap(keys, idx);
-
- keys->nr++;
- keys->data[keys->gap++] = n;
-
- journal_iters_fix(c);
-
- return 0;
-}
-
-/*
- * Can only be used from the recovery thread while we're still RO - can't be
- * used once we've gone RW, as journal_keys is at that point used by multiple
- * threads:
- */
-int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
- unsigned level, struct bkey_i *k)
-{
- struct bkey_i *n;
- int ret;
-
- n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
- if (!n)
- return bch_err_throw(c, ENOMEM_journal_key_insert);
-
- bkey_copy(n, k);
- ret = bch2_journal_key_insert_take(c, id, level, n);
- if (ret)
- kfree(n);
- return ret;
-}
-
-int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
- unsigned level, struct bpos pos)
-{
- struct bkey_i whiteout;
-
- bkey_init(&whiteout.k);
- whiteout.k.p = pos;
-
- return bch2_journal_key_insert(c, id, level, &whiteout);
-}
-
-bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree,
- unsigned level, struct bpos pos)
-{
- struct journal_keys *keys = &trans->c->journal_keys;
- size_t idx = bch2_journal_key_search(keys, btree, level, pos);
-
- if (!trans->journal_replay_not_finished)
- return false;
-
- return (idx < keys->size &&
- keys->data[idx].btree_id == btree &&
- keys->data[idx].level == level &&
- bpos_eq(keys->data[idx].k->k.p, pos) &&
- bkey_deleted(&keys->data[idx].k->k));
-}
-
-static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos)
-{
- struct journal_key *k = keys->data + pos;
- size_t idx = pos_to_idx(keys, pos);
-
- k->overwritten = true;
-
- struct journal_key *prev = idx > 0 ? keys->data + idx_to_pos(keys, idx - 1) : NULL;
- struct journal_key *next = idx + 1 < keys->nr ? keys->data + idx_to_pos(keys, idx + 1) : NULL;
-
- bool prev_overwritten = prev && prev->overwritten;
- bool next_overwritten = next && next->overwritten;
-
- struct journal_key_range_overwritten *prev_range =
- prev_overwritten ? prev->overwritten_range : NULL;
- struct journal_key_range_overwritten *next_range =
- next_overwritten ? next->overwritten_range : NULL;
-
- BUG_ON(prev_range && prev_range->end != idx);
- BUG_ON(next_range && next_range->start != idx + 1);
-
- if (prev_range && next_range) {
- prev_range->end = next_range->end;
-
- keys->data[pos].overwritten_range = prev_range;
- for (size_t i = next_range->start; i < next_range->end; i++) {
- struct journal_key *ip = keys->data + idx_to_pos(keys, i);
- BUG_ON(ip->overwritten_range != next_range);
- ip->overwritten_range = prev_range;
- }
-
- kfree_rcu_mightsleep(next_range);
- } else if (prev_range) {
- prev_range->end++;
- k->overwritten_range = prev_range;
- if (next_overwritten) {
- prev_range->end++;
- next->overwritten_range = prev_range;
- }
- } else if (next_range) {
- next_range->start--;
- k->overwritten_range = next_range;
- if (prev_overwritten) {
- next_range->start--;
- prev->overwritten_range = next_range;
- }
- } else if (prev_overwritten || next_overwritten) {
- struct journal_key_range_overwritten *r = kmalloc(sizeof(*r), GFP_KERNEL);
- if (!r)
- return;
-
- r->start = idx - (size_t) prev_overwritten;
- r->end = idx + 1 + (size_t) next_overwritten;
-
- rcu_assign_pointer(k->overwritten_range, r);
- if (prev_overwritten)
- prev->overwritten_range = r;
- if (next_overwritten)
- next->overwritten_range = r;
- }
-}
-
-void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
- unsigned level, struct bpos pos)
-{
- struct journal_keys *keys = &c->journal_keys;
- size_t idx = bch2_journal_key_search(keys, btree, level, pos);
-
- if (idx < keys->size &&
- keys->data[idx].btree_id == btree &&
- keys->data[idx].level == level &&
- bpos_eq(keys->data[idx].k->k.p, pos) &&
- !keys->data[idx].overwritten) {
- mutex_lock(&keys->overwrite_lock);
- __bch2_journal_key_overwritten(keys, idx);
- mutex_unlock(&keys->overwrite_lock);
- }
-}
-
-static void bch2_journal_iter_advance(struct journal_iter *iter)
-{
- if (iter->idx < iter->keys->size) {
- iter->idx++;
- if (iter->idx == iter->keys->gap)
- iter->idx += iter->keys->size - iter->keys->nr;
- }
-}
-
-static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
-{
- journal_iter_verify(iter);
-
- guard(rcu)();
- while (iter->idx < iter->keys->size) {
- struct journal_key *k = iter->keys->data + iter->idx;
-
- int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k);
- if (cmp < 0)
- break;
- BUG_ON(cmp);
-
- if (!k->overwritten)
- return bkey_i_to_s_c(k->k);
-
- if (k->overwritten_range)
- iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end);
- else
- bch2_journal_iter_advance(iter);
- }
-
- return bkey_s_c_null;
-}
-
-static void bch2_journal_iter_exit(struct journal_iter *iter)
-{
- list_del(&iter->list);
-}
-
-static void bch2_journal_iter_init(struct bch_fs *c,
- struct journal_iter *iter,
- enum btree_id id, unsigned level,
- struct bpos pos)
-{
- iter->btree_id = id;
- iter->level = level;
- iter->keys = &c->journal_keys;
- iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos);
-
- journal_iter_verify(iter);
-}
-
-static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
-{
- return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
- iter->b, &iter->unpacked);
-}
-
-static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
-{
- bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
-}
-
-void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
-{
- if (bpos_eq(iter->pos, SPOS_MAX))
- iter->at_end = true;
- else
- iter->pos = bpos_successor(iter->pos);
-}
-
-static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter)
-{
- struct btree_and_journal_iter iter = *_iter;
- struct bch_fs *c = iter.trans->c;
- unsigned level = iter.journal.level;
- struct bkey_buf tmp;
- unsigned nr = test_bit(BCH_FS_started, &c->flags)
- ? (level > 1 ? 0 : 2)
- : (level > 1 ? 1 : 16);
-
- iter.prefetch = false;
- iter.fail_if_too_many_whiteouts = true;
- bch2_bkey_buf_init(&tmp);
-
- while (nr--) {
- bch2_btree_and_journal_iter_advance(&iter);
- struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter);
- if (!k.k)
- break;
-
- bch2_bkey_buf_reassemble(&tmp, c, k);
- bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1);
- }
-
- bch2_bkey_buf_exit(&tmp, c);
-}
-
-struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
-{
- struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret;
- size_t iters = 0;
-
- if (iter->prefetch && iter->journal.level)
- btree_and_journal_iter_prefetch(iter);
-again:
- if (iter->at_end)
- return bkey_s_c_null;
-
- iters++;
-
- if (iters > 20 && iter->fail_if_too_many_whiteouts)
- return bkey_s_c_null;
-
- while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
- bpos_lt(btree_k.k->p, iter->pos))
- bch2_journal_iter_advance_btree(iter);
-
- if (iter->trans->journal_replay_not_finished)
- while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
- bpos_lt(journal_k.k->p, iter->pos))
- bch2_journal_iter_advance(&iter->journal);
-
- ret = journal_k.k &&
- (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
- ? journal_k
- : btree_k;
-
- if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))
- ret = bkey_s_c_null;
-
- if (ret.k) {
- iter->pos = ret.k->p;
- if (bkey_deleted(ret.k)) {
- bch2_btree_and_journal_iter_advance(iter);
- goto again;
- }
- } else {
- iter->pos = SPOS_MAX;
- iter->at_end = true;
- }
-
- return ret;
-}
-
-void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
-{
- bch2_journal_iter_exit(&iter->journal);
-}
-
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
- struct btree_and_journal_iter *iter,
- struct btree *b,
- struct btree_node_iter node_iter,
- struct bpos pos)
-{
- memset(iter, 0, sizeof(*iter));
-
- iter->trans = trans;
- iter->b = b;
- iter->node_iter = node_iter;
- iter->pos = b->data->min_key;
- iter->at_end = false;
- INIT_LIST_HEAD(&iter->journal.list);
-
- if (trans->journal_replay_not_finished) {
- bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
- if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags))
- list_add(&iter->journal.list, &trans->c->journal_iters);
- }
-}
-
-/*
- * this version is used by btree_gc before the filesystem has gone RW and
- * multithreaded, so uses the journal_iters list:
- */
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
- struct btree_and_journal_iter *iter,
- struct btree *b)
-{
- struct btree_node_iter node_iter;
-
- bch2_btree_node_iter_init_from_start(&node_iter, b);
- __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
-}
-
-/* sort and dedup all keys in the journal: */
-
-/*
- * When keys compare equal, oldest compares first:
- */
-static int journal_sort_key_cmp(const void *_l, const void *_r)
-{
- const struct journal_key *l = _l;
- const struct journal_key *r = _r;
- int rewind = l->rewind && r->rewind ? -1 : 1;
-
- return journal_key_cmp(l, r) ?:
- ((cmp_int(l->journal_seq, r->journal_seq) ?:
- cmp_int(l->journal_offset, r->journal_offset)) * rewind);
-}
-
-void bch2_journal_keys_put(struct bch_fs *c)
-{
- struct journal_keys *keys = &c->journal_keys;
-
- BUG_ON(atomic_read(&keys->ref) <= 0);
-
- if (!atomic_dec_and_test(&keys->ref))
- return;
-
- move_gap(keys, keys->nr);
-
- darray_for_each(*keys, i) {
- if (i->overwritten_range &&
- (i == &darray_last(*keys) ||
- i->overwritten_range != i[1].overwritten_range))
- kfree(i->overwritten_range);
-
- if (i->allocated)
- kfree(i->k);
- }
-
- kvfree(keys->data);
- keys->data = NULL;
- keys->nr = keys->gap = keys->size = 0;
-
- struct journal_replay **i;
- struct genradix_iter iter;
-
- genradix_for_each(&c->journal_entries, iter, i)
- kvfree(*i);
- genradix_free(&c->journal_entries);
-}
-
-static void __journal_keys_sort(struct journal_keys *keys)
-{
- sort_nonatomic(keys->data, keys->nr, sizeof(keys->data[0]),
- journal_sort_key_cmp, NULL);
-
- cond_resched();
-
- struct journal_key *dst = keys->data;
-
- darray_for_each(*keys, src) {
- /*
- * We don't accumulate accounting keys here because we have to
- * compare each individual accounting key against the version in
- * the btree during replay:
- */
- if (src->k->k.type != KEY_TYPE_accounting &&
- src + 1 < &darray_top(*keys) &&
- !journal_key_cmp(src, src + 1))
- continue;
-
- *dst++ = *src;
- }
-
- keys->nr = dst - keys->data;
-}
-
-int bch2_journal_keys_sort(struct bch_fs *c)
-{
- struct genradix_iter iter;
- struct journal_replay *i, **_i;
- struct journal_keys *keys = &c->journal_keys;
- size_t nr_read = 0;
-
- u64 rewind_seq = c->opts.journal_rewind ?: U64_MAX;
-
- genradix_for_each(&c->journal_entries, iter, _i) {
- i = *_i;
-
- if (journal_replay_ignore(i))
- continue;
-
- cond_resched();
-
- vstruct_for_each(&i->j, entry) {
- bool rewind = !entry->level &&
- !btree_id_is_alloc(entry->btree_id) &&
- le64_to_cpu(i->j.seq) >= rewind_seq;
-
- if (entry->type != (rewind
- ? BCH_JSET_ENTRY_overwrite
- : BCH_JSET_ENTRY_btree_keys))
- continue;
-
- if (!rewind && le64_to_cpu(i->j.seq) < c->journal_replay_seq_start)
- continue;
-
- jset_entry_for_each_key(entry, k) {
- struct journal_key n = (struct journal_key) {
- .btree_id = entry->btree_id,
- .level = entry->level,
- .rewind = rewind,
- .k = k,
- .journal_seq = le64_to_cpu(i->j.seq),
- .journal_offset = k->_data - i->j._data,
- };
-
- if (darray_push(keys, n)) {
- __journal_keys_sort(keys);
-
- if (keys->nr * 8 > keys->size * 7) {
- bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu",
- keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq));
- return bch_err_throw(c, ENOMEM_journal_keys_sort);
- }
-
- BUG_ON(darray_push(keys, n));
- }
-
- nr_read++;
- }
- }
- }
-
- __journal_keys_sort(keys);
- keys->gap = keys->nr;
-
- bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr);
- return 0;
-}
-
-void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree,
- unsigned level_min, unsigned level_max,
- struct bpos start, struct bpos end)
-{
- struct journal_keys *keys = &c->journal_keys;
- size_t dst = 0;
-
- move_gap(keys, keys->nr);
-
- darray_for_each(*keys, i)
- if (!(i->btree_id == btree &&
- i->level >= level_min &&
- i->level <= level_max &&
- bpos_ge(i->k->k.p, start) &&
- bpos_le(i->k->k.p, end)))
- keys->data[dst++] = *i;
- keys->nr = keys->gap = dst;
-}
-
-void bch2_journal_keys_dump(struct bch_fs *c)
-{
- struct journal_keys *keys = &c->journal_keys;
- struct printbuf buf = PRINTBUF;
-
- pr_info("%zu keys:", keys->nr);
-
- move_gap(keys, keys->nr);
-
- darray_for_each(*keys, i) {
- printbuf_reset(&buf);
- prt_printf(&buf, "btree=");
- bch2_btree_id_to_text(&buf, i->btree_id);
- prt_printf(&buf, " l=%u ", i->level);
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
- pr_err("%s", buf.buf);
- }
- printbuf_exit(&buf);
-}
-
-void bch2_fs_journal_keys_init(struct bch_fs *c)
-{
- struct journal_keys *keys = &c->journal_keys;
-
- atomic_set(&keys->ref, 1);
- keys->initial_ref_held = true;
- mutex_init(&keys->overwrite_lock);
-}
diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h
deleted file mode 100644
index 2a3082919b8d..000000000000
--- a/fs/bcachefs/btree_journal_iter.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H
-#define _BCACHEFS_BTREE_JOURNAL_ITER_H
-
-#include "bkey.h"
-
-struct journal_iter {
- struct list_head list;
- enum btree_id btree_id;
- unsigned level;
- size_t idx;
- struct journal_keys *keys;
-};
-
-/*
- * Iterate over keys in the btree, with keys from the journal overlaid on top:
- */
-
-struct btree_and_journal_iter {
- struct btree_trans *trans;
- struct btree *b;
- struct btree_node_iter node_iter;
- struct bkey unpacked;
-
- struct journal_iter journal;
- struct bpos pos;
- bool at_end;
- bool prefetch;
- bool fail_if_too_many_whiteouts;
-};
-
-static inline int __journal_key_btree_cmp(enum btree_id l_btree_id,
- unsigned l_level,
- const struct journal_key *r)
-{
- return -cmp_int(l_level, r->level) ?:
- cmp_int(l_btree_id, r->btree_id);
-}
-
-static inline int __journal_key_cmp(enum btree_id l_btree_id,
- unsigned l_level,
- struct bpos l_pos,
- const struct journal_key *r)
-{
- return __journal_key_btree_cmp(l_btree_id, l_level, r) ?:
- bpos_cmp(l_pos, r->k->k.p);
-}
-
-static inline int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
-{
- return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
-}
-
-struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id,
- unsigned, struct bpos, struct bpos, size_t *);
-struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id,
- unsigned, struct bpos, struct bpos, size_t *);
-struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
- unsigned, struct bpos);
-
-int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *,
- struct btree_and_journal_iter *);
-
-int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
- unsigned, struct bkey_i *);
-int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
- unsigned, struct bkey_i *);
-int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
- unsigned, struct bpos);
-bool bch2_key_deleted_in_journal(struct btree_trans *, enum btree_id, unsigned, struct bpos);
-void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, unsigned, struct bpos);
-
-void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
-struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
-
-void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
- struct btree_and_journal_iter *, struct btree *,
- struct btree_node_iter, struct bpos);
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
- struct btree_and_journal_iter *, struct btree *);
-
-void bch2_journal_keys_put(struct bch_fs *);
-
-static inline void bch2_journal_keys_put_initial(struct bch_fs *c)
-{
- if (c->journal_keys.initial_ref_held)
- bch2_journal_keys_put(c);
- c->journal_keys.initial_ref_held = false;
-}
-
-int bch2_journal_keys_sort(struct bch_fs *);
-
-void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id,
- unsigned, unsigned,
- struct bpos, struct bpos);
-
-void bch2_journal_keys_dump(struct bch_fs *);
-
-void bch2_fs_journal_keys_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */
diff --git a/fs/bcachefs/btree_journal_iter_types.h b/fs/bcachefs/btree_journal_iter_types.h
deleted file mode 100644
index 86aacb254fb2..000000000000
--- a/fs/bcachefs/btree_journal_iter_types.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H
-#define _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H
-
-struct journal_key_range_overwritten {
- size_t start, end;
-};
-
-struct journal_key {
- u64 journal_seq;
- u32 journal_offset;
- enum btree_id btree_id:8;
- unsigned level:8;
- bool allocated:1;
- bool overwritten:1;
- bool rewind:1;
- struct journal_key_range_overwritten __rcu *
- overwritten_range;
- struct bkey_i *k;
-};
-
-struct journal_keys {
- /* must match layout in darray_types.h */
- size_t nr, size;
- struct journal_key *data;
- /*
- * Gap buffer: instead of all the empty space in the array being at the
- * end of the buffer - from @nr to @size - the empty space is at @gap.
- * This means that sequential insertions are O(n) instead of O(n^2).
- */
- size_t gap;
- atomic_t ref;
- bool initial_ref_held;
- struct mutex overwrite_lock;
-};
-
-#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H */
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
deleted file mode 100644
index d96188b92db2..000000000000
--- a/fs/bcachefs/btree_key_cache.c
+++ /dev/null
@@ -1,880 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "btree_key_cache.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "errcode.h"
-#include "error.h"
-#include "journal.h"
-#include "journal_reclaim.h"
-#include "trace.h"
-
-#include <linux/sched/mm.h>
-
-static inline bool btree_uses_pcpu_readers(enum btree_id id)
-{
- return id == BTREE_ID_subvolumes;
-}
-
-static struct kmem_cache *bch2_key_cache;
-
-static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
- const void *obj)
-{
- const struct bkey_cached *ck = obj;
- const struct bkey_cached_key *key = arg->key;
-
- return ck->key.btree_id != key->btree_id ||
- !bpos_eq(ck->key.pos, key->pos);
-}
-
-static const struct rhashtable_params bch2_btree_key_cache_params = {
- .head_offset = offsetof(struct bkey_cached, hash),
- .key_offset = offsetof(struct bkey_cached, key),
- .key_len = sizeof(struct bkey_cached_key),
- .obj_cmpfn = bch2_btree_key_cache_cmp_fn,
- .automatic_shrinking = true,
-};
-
-static inline void btree_path_cached_set(struct btree_trans *trans, struct btree_path *path,
- struct bkey_cached *ck,
- enum btree_node_locked_type lock_held)
-{
- path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
- path->l[0].b = (void *) ck;
- mark_btree_node_locked(trans, path, 0, lock_held);
-}
-
-__flatten
-inline struct bkey_cached *
-bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
-{
- struct bkey_cached_key key = {
- .btree_id = btree_id,
- .pos = pos,
- };
-
- return rhashtable_lookup_fast(&c->btree_key_cache.table, &key,
- bch2_btree_key_cache_params);
-}
-
-static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
-{
- if (!six_trylock_intent(&ck->c.lock))
- return false;
-
- if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- six_unlock_intent(&ck->c.lock);
- return false;
- }
-
- if (!six_trylock_write(&ck->c.lock)) {
- six_unlock_intent(&ck->c.lock);
- return false;
- }
-
- return true;
-}
-
-static bool bkey_cached_evict(struct btree_key_cache *c,
- struct bkey_cached *ck)
-{
- bool ret = !rhashtable_remove_fast(&c->table, &ck->hash,
- bch2_btree_key_cache_params);
- if (ret) {
- memset(&ck->key, ~0, sizeof(ck->key));
- atomic_long_dec(&c->nr_keys);
- }
-
- return ret;
-}
-
-static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu)
-{
- struct bch_fs *c = container_of(pending->srcu, struct bch_fs, btree_trans_barrier);
- struct bkey_cached *ck = container_of(rcu, struct bkey_cached, rcu);
-
- this_cpu_dec(*c->btree_key_cache.nr_pending);
- kmem_cache_free(bch2_key_cache, ck);
-}
-
-static inline void bkey_cached_free_noassert(struct btree_key_cache *bc,
- struct bkey_cached *ck)
-{
- kfree(ck->k);
- ck->k = NULL;
- ck->u64s = 0;
-
- six_unlock_write(&ck->c.lock);
- six_unlock_intent(&ck->c.lock);
-
- bool pcpu_readers = ck->c.lock.readers != NULL;
- rcu_pending_enqueue(&bc->pending[pcpu_readers], &ck->rcu);
- this_cpu_inc(*bc->nr_pending);
-}
-
-static void bkey_cached_free(struct btree_trans *trans,
- struct btree_key_cache *bc,
- struct bkey_cached *ck)
-{
- /*
- * we'll hit strange issues in the SRCU code if we aren't holding an
- * SRCU read lock...
- */
- EBUG_ON(!trans->srcu_held);
-
- bkey_cached_free_noassert(bc, ck);
-}
-
-static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp)
-{
- gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;
-
- struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp);
- if (unlikely(!ck))
- return NULL;
- ck->k = kmalloc(key_u64s * sizeof(u64), gfp);
- if (unlikely(!ck->k)) {
- kmem_cache_free(bch2_key_cache, ck);
- return NULL;
- }
- ck->u64s = key_u64s;
- return ck;
-}
-
-static struct bkey_cached *
-bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned key_u64s)
-{
- struct bch_fs *c = trans->c;
- struct btree_key_cache *bc = &c->btree_key_cache;
- bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
- int ret;
-
- struct bkey_cached *ck = container_of_or_null(
- rcu_pending_dequeue(&bc->pending[pcpu_readers]),
- struct bkey_cached, rcu);
- if (ck)
- goto lock;
-
- ck = allocate_dropping_locks(trans, ret,
- __bkey_cached_alloc(key_u64s, _gfp));
- if (ret) {
- if (ck)
- kfree(ck->k);
- kmem_cache_free(bch2_key_cache, ck);
- return ERR_PTR(ret);
- }
-
- if (ck) {
- bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL);
- ck->c.cached = true;
- goto lock;
- }
-
- ck = container_of_or_null(rcu_pending_dequeue_from_all(&bc->pending[pcpu_readers]),
- struct bkey_cached, rcu);
- if (ck)
- goto lock;
-lock:
- six_lock_intent(&ck->c.lock, NULL, NULL);
- six_lock_write(&ck->c.lock, NULL, NULL);
- return ck;
-}
-
-static struct bkey_cached *
-bkey_cached_reuse(struct btree_key_cache *c)
-{
-
- guard(rcu)();
- struct bucket_table *tbl = rht_dereference_rcu(c->table.tbl, &c->table);
- struct rhash_head *pos;
- struct bkey_cached *ck;
-
- for (unsigned i = 0; i < tbl->size; i++)
- rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
- if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
- bkey_cached_lock_for_evict(ck)) {
- if (bkey_cached_evict(c, ck))
- return ck;
- six_unlock_write(&ck->c.lock);
- six_unlock_intent(&ck->c.lock);
- }
- }
- return NULL;
-}
-
-static int btree_key_cache_create(struct btree_trans *trans,
- struct btree_path *path,
- struct btree_path *ck_path,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct btree_key_cache *bc = &c->btree_key_cache;
-
- /*
- * bch2_varint_decode can read past the end of the buffer by at
- * most 7 bytes (it won't be used):
- */
- unsigned key_u64s = k.k->u64s + 1;
-
- /*
- * Allocate some extra space so that the transaction commit path is less
- * likely to have to reallocate, since that requires a transaction
- * restart:
- */
- key_u64s = min(256U, (key_u64s * 3) / 2);
- key_u64s = roundup_pow_of_two(key_u64s);
-
- struct bkey_cached *ck = bkey_cached_alloc(trans, ck_path, key_u64s);
- int ret = PTR_ERR_OR_ZERO(ck);
- if (ret)
- return ret;
-
- if (unlikely(!ck)) {
- ck = bkey_cached_reuse(bc);
- if (unlikely(!ck)) {
- bch_err(c, "error allocating memory for key cache item, btree %s",
- bch2_btree_id_str(ck_path->btree_id));
- return bch_err_throw(c, ENOMEM_btree_key_cache_create);
- }
- }
-
- ck->c.level = 0;
- ck->c.btree_id = ck_path->btree_id;
- ck->key.btree_id = ck_path->btree_id;
- ck->key.pos = ck_path->pos;
- ck->flags = 1U << BKEY_CACHED_ACCESSED;
-
- if (unlikely(key_u64s > ck->u64s)) {
- mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED);
-
- struct bkey_i *new_k = allocate_dropping_locks(trans, ret,
- kmalloc(key_u64s * sizeof(u64), _gfp));
- if (unlikely(!new_k)) {
- bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
- bch2_btree_id_str(ck->key.btree_id), key_u64s);
- ret = bch_err_throw(c, ENOMEM_btree_key_cache_fill);
- } else if (ret) {
- kfree(new_k);
- goto err;
- }
-
- kfree(ck->k);
- ck->k = new_k;
- ck->u64s = key_u64s;
- }
-
- bkey_reassemble(ck->k, k);
-
- ret = bch2_btree_node_lock_write(trans, path, &path_l(path)->b->c);
- if (unlikely(ret))
- goto err;
-
- ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params);
-
- bch2_btree_node_unlock_write(trans, path, path_l(path)->b);
-
- if (unlikely(ret)) /* raced with another fill? */
- goto err;
-
- atomic_long_inc(&bc->nr_keys);
- six_unlock_write(&ck->c.lock);
-
- enum six_lock_type lock_want = __btree_lock_want(ck_path, 0);
- if (lock_want == SIX_LOCK_read)
- six_lock_downgrade(&ck->c.lock);
- btree_path_cached_set(trans, ck_path, ck, (enum btree_node_locked_type) lock_want);
- ck_path->uptodate = BTREE_ITER_UPTODATE;
- return 0;
-err:
- bkey_cached_free(trans, bc, ck);
- mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED);
-
- return ret;
-}
-
-static noinline_for_stack void do_trace_key_cache_fill(struct btree_trans *trans,
- struct btree_path *ck_path,
- struct bkey_s_c k)
-{
- struct printbuf buf = PRINTBUF;
-
- bch2_bpos_to_text(&buf, ck_path->pos);
- prt_char(&buf, ' ');
- bch2_bkey_val_to_text(&buf, trans->c, k);
- trace_key_cache_fill(trans, buf.buf);
- printbuf_exit(&buf);
-}
-
-static noinline int btree_key_cache_fill(struct btree_trans *trans,
- btree_path_idx_t ck_path_idx,
- unsigned flags)
-{
- struct btree_path *ck_path = trans->paths + ck_path_idx;
-
- if (flags & BTREE_ITER_cached_nofill) {
- ck_path->l[0].b = NULL;
- return 0;
- }
-
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos,
- BTREE_ITER_intent|
- BTREE_ITER_key_cache_fill|
- BTREE_ITER_cached_nofill);
- iter.flags &= ~BTREE_ITER_with_journal;
- k = bch2_btree_iter_peek_slot(trans, &iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- /* Recheck after btree lookup, before allocating: */
- ck_path = trans->paths + ck_path_idx;
- ret = bch2_btree_key_cache_find(c, ck_path->btree_id, ck_path->pos) ? -EEXIST : 0;
- if (unlikely(ret))
- goto out;
-
- ret = btree_key_cache_create(trans, btree_iter_path(trans, &iter), ck_path, k);
- if (ret)
- goto err;
-
- if (trace_key_cache_fill_enabled())
- do_trace_key_cache_fill(trans, ck_path, k);
-out:
- /* We're not likely to need this iterator again: */
- bch2_set_btree_iter_dontneed(trans, &iter);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static inline int btree_path_traverse_cached_fast(struct btree_trans *trans,
- btree_path_idx_t path_idx)
-{
- struct bch_fs *c = trans->c;
- struct bkey_cached *ck;
- struct btree_path *path = trans->paths + path_idx;
-retry:
- ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
- if (!ck)
- return -ENOENT;
-
- enum six_lock_type lock_want = __btree_lock_want(path, 0);
-
- int ret = btree_node_lock(trans, path, (void *) ck, 0, lock_want, _THIS_IP_);
- if (ret)
- return ret;
-
- if (ck->key.btree_id != path->btree_id ||
- !bpos_eq(ck->key.pos, path->pos)) {
- six_unlock_type(&ck->c.lock, lock_want);
- goto retry;
- }
-
- if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
- set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
-
- btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want);
- path->uptodate = BTREE_ITER_UPTODATE;
- return 0;
-}
-
-int bch2_btree_path_traverse_cached(struct btree_trans *trans,
- btree_path_idx_t path_idx,
- unsigned flags)
-{
- EBUG_ON(trans->paths[path_idx].level);
-
- int ret;
- do {
- ret = btree_path_traverse_cached_fast(trans, path_idx);
- if (unlikely(ret == -ENOENT))
- ret = btree_key_cache_fill(trans, path_idx, flags);
- } while (ret == -EEXIST);
-
- struct btree_path *path = trans->paths + path_idx;
-
- if (unlikely(ret)) {
- path->uptodate = BTREE_ITER_NEED_TRAVERSE;
- if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- btree_node_unlock(trans, path, 0);
- path->l[0].b = ERR_PTR(ret);
- }
- } else {
- BUG_ON(path->uptodate);
- BUG_ON(!path->nodes_locked);
- }
-
- return ret;
-}
-
-static int btree_key_cache_flush_pos(struct btree_trans *trans,
- struct bkey_cached_key key,
- u64 journal_seq,
- unsigned commit_flags,
- bool evict)
-{
- struct bch_fs *c = trans->c;
- struct journal *j = &c->journal;
- struct btree_iter c_iter, b_iter;
- struct bkey_cached *ck = NULL;
- int ret;
-
- bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
- BTREE_ITER_slots|
- BTREE_ITER_intent|
- BTREE_ITER_all_snapshots);
- bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
- BTREE_ITER_cached|
- BTREE_ITER_intent);
- b_iter.flags &= ~BTREE_ITER_with_key_cache;
-
- ret = bch2_btree_iter_traverse(trans, &c_iter);
- if (ret)
- goto out;
-
- ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b;
- if (!ck)
- goto out;
-
- if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- if (evict)
- goto evict;
- goto out;
- }
-
- if (journal_seq && ck->journal.seq != journal_seq)
- goto out;
-
- trans->journal_res.seq = ck->journal.seq;
-
- /*
- * If we're at the end of the journal, we really want to free up space
- * in the journal right away - we don't want to pin that old journal
- * sequence number with a new btree node write, we want to re-journal
- * the update
- */
- if (ck->journal.seq == journal_last_seq(j))
- commit_flags |= BCH_WATERMARK_reclaim;
-
- if (ck->journal.seq != journal_last_seq(j) ||
- !test_bit(JOURNAL_space_low, &c->journal.flags))
- commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
-
- struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(trans, &b_iter);
- ret = bkey_err(btree_k);
- if (ret)
- goto err;
-
- /* Check that we're not violating cache coherency rules: */
- BUG_ON(bkey_deleted(btree_k.k));
-
- ret = bch2_trans_update(trans, &b_iter, ck->k,
- BTREE_UPDATE_key_cache_reclaim|
- BTREE_UPDATE_internal_snapshot_node|
- BTREE_TRIGGER_norun) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc|
- commit_flags);
-err:
- bch2_fs_fatal_err_on(ret &&
- !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
- !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
- !bch2_journal_error(j), c,
- "flushing key cache: %s", bch2_err_str(ret));
- if (ret)
- goto out;
-
- bch2_journal_pin_drop(j, &ck->journal);
-
- struct btree_path *path = btree_iter_path(trans, &c_iter);
- BUG_ON(!btree_node_locked(path, 0));
-
- if (!evict) {
- if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
- atomic_long_dec(&c->btree_key_cache.nr_dirty);
- }
- } else {
- struct btree_path *path2;
- unsigned i;
-evict:
- trans_for_each_path(trans, path2, i)
- if (path2 != path)
- __bch2_btree_path_unlock(trans, path2);
-
- bch2_btree_node_lock_write_nofail(trans, path, &ck->c);
-
- if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
- atomic_long_dec(&c->btree_key_cache.nr_dirty);
- }
-
- mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
- if (bkey_cached_evict(&c->btree_key_cache, ck)) {
- bkey_cached_free(trans, &c->btree_key_cache, ck);
- } else {
- six_unlock_write(&ck->c.lock);
- six_unlock_intent(&ck->c.lock);
- }
- }
-out:
- bch2_trans_iter_exit(trans, &b_iter);
- bch2_trans_iter_exit(trans, &c_iter);
- return ret;
-}
-
-int bch2_btree_key_cache_journal_flush(struct journal *j,
- struct journal_entry_pin *pin, u64 seq)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bkey_cached *ck =
- container_of(pin, struct bkey_cached, journal);
- struct bkey_cached_key key;
- struct btree_trans *trans = bch2_trans_get(c);
- int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
- int ret = 0;
-
- btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read);
- key = ck->key;
-
- if (ck->journal.seq != seq ||
- !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- six_unlock_read(&ck->c.lock);
- goto unlock;
- }
-
- if (ck->seq != seq) {
- bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal,
- bch2_btree_key_cache_journal_flush);
- six_unlock_read(&ck->c.lock);
- goto unlock;
- }
- six_unlock_read(&ck->c.lock);
-
- ret = lockrestart_do(trans,
- btree_key_cache_flush_pos(trans, key, seq,
- BCH_TRANS_COMMIT_journal_reclaim, false));
-unlock:
- srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
-
- bch2_trans_put(trans);
- return ret;
-}
-
-bool bch2_btree_insert_key_cached(struct btree_trans *trans,
- unsigned flags,
- struct btree_insert_entry *insert_entry)
-{
- struct bch_fs *c = trans->c;
- struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b;
- struct bkey_i *insert = insert_entry->k;
- bool kick_reclaim = false;
-
- BUG_ON(insert->k.u64s > ck->u64s);
-
- bkey_copy(ck->k, insert);
-
- if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
- set_bit(BKEY_CACHED_DIRTY, &ck->flags);
- atomic_long_inc(&c->btree_key_cache.nr_dirty);
-
- if (bch2_nr_btree_keys_need_flush(c))
- kick_reclaim = true;
- }
-
- /*
- * To minimize lock contention, we only add the journal pin here and
- * defer pin updates to the flush callback via ->seq. Be careful not to
- * update ->seq on nojournal commits because we don't want to update the
- * pin to a seq that doesn't include journal updates on disk. Otherwise
- * we risk losing the update after a crash.
- *
- * The only exception is if the pin is not active in the first place. We
- * have to add the pin because journal reclaim drives key cache
- * flushing. The flush callback will not proceed unless ->seq matches
- * the latest pin, so make sure it starts with a consistent value.
- */
- if (!(insert_entry->flags & BTREE_UPDATE_nojournal) ||
- !journal_pin_active(&ck->journal)) {
- ck->seq = trans->journal_res.seq;
- }
- bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
- &ck->journal, bch2_btree_key_cache_journal_flush);
-
- if (kick_reclaim)
- journal_reclaim_kick(&c->journal);
- return true;
-}
-
-void bch2_btree_key_cache_drop(struct btree_trans *trans,
- struct btree_path *path)
-{
- struct bch_fs *c = trans->c;
- struct btree_key_cache *bc = &c->btree_key_cache;
- struct bkey_cached *ck = (void *) path->l[0].b;
-
- /*
- * We just did an update to the btree, bypassing the key cache: the key
- * cache key is now stale and must be dropped, even if dirty:
- */
- if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
- atomic_long_dec(&c->btree_key_cache.nr_dirty);
- bch2_journal_pin_drop(&c->journal, &ck->journal);
- }
-
- bkey_cached_evict(bc, ck);
- bkey_cached_free(trans, bc, ck);
-
- mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
-
- struct btree_path *path2;
- unsigned i;
- trans_for_each_path(trans, path2, i)
- if (path2->l[0].b == (void *) ck) {
- /*
- * It's safe to clear should_be_locked here because
- * we're evicting from the key cache, and we still have
- * the underlying btree locked: filling into the key
- * cache would require taking a write lock on the btree
- * node
- */
- path2->should_be_locked = false;
- __bch2_btree_path_unlock(trans, path2);
- path2->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_drop);
- btree_path_set_dirty(trans, path2, BTREE_ITER_NEED_TRAVERSE);
- }
-
- bch2_trans_verify_locks(trans);
-}
-
-static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
- struct shrink_control *sc)
-{
- struct bch_fs *c = shrink->private_data;
- struct btree_key_cache *bc = &c->btree_key_cache;
- struct bucket_table *tbl;
- struct bkey_cached *ck;
- size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
- unsigned iter, start;
- int srcu_idx;
-
- srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
- rcu_read_lock();
-
- tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
-
- /*
- * Scanning is expensive while a rehash is in progress - most elements
- * will already be on the new hashtable.
- *
- * A rehash could still start while we're scanning - that's ok, we'll
- * still see most elements.
- */
- if (unlikely(tbl->nest)) {
- rcu_read_unlock();
- srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
- return SHRINK_STOP;
- }
-
- iter = bc->shrink_iter;
- if (iter >= tbl->size)
- iter = 0;
- start = iter;
-
- do {
- struct rhash_head *pos, *next;
-
- pos = rht_ptr_rcu(&tbl->buckets[iter]);
-
- while (!rht_is_a_nulls(pos)) {
- next = rht_dereference_bucket_rcu(pos->next, tbl, iter);
- ck = container_of(pos, struct bkey_cached, hash);
-
- if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- bc->skipped_dirty++;
- } else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) {
- clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
- bc->skipped_accessed++;
- } else if (!bkey_cached_lock_for_evict(ck)) {
- bc->skipped_lock_fail++;
- } else if (bkey_cached_evict(bc, ck)) {
- bkey_cached_free_noassert(bc, ck);
- bc->freed++;
- freed++;
- } else {
- six_unlock_write(&ck->c.lock);
- six_unlock_intent(&ck->c.lock);
- }
-
- scanned++;
- if (scanned >= nr)
- goto out;
-
- pos = next;
- }
-
- iter++;
- if (iter >= tbl->size)
- iter = 0;
- } while (scanned < nr && iter != start);
-out:
- bc->shrink_iter = iter;
-
- rcu_read_unlock();
- srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
-
- return freed;
-}
-
-static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
- struct shrink_control *sc)
-{
- struct bch_fs *c = shrink->private_data;
- struct btree_key_cache *bc = &c->btree_key_cache;
- long nr = atomic_long_read(&bc->nr_keys) -
- atomic_long_read(&bc->nr_dirty);
-
- /*
- * Avoid hammering our shrinker too much if it's nearly empty - the
- * shrinker code doesn't take into account how big our cache is; if it's
- * mostly empty but the system is under memory pressure, it causes nasty
- * lock contention:
- */
- nr -= 128;
-
- return max(0L, nr);
-}
-
-void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
-{
- struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
- struct bucket_table *tbl;
- struct bkey_cached *ck;
- struct rhash_head *pos;
- LIST_HEAD(items);
- unsigned i;
-
- shrinker_free(bc->shrink);
-
- /*
- * The loop is needed to guard against racing with rehash:
- */
- while (atomic_long_read(&bc->nr_keys)) {
- rcu_read_lock();
- tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
- if (tbl) {
- if (tbl->nest) {
- /* wait for in progress rehash */
- rcu_read_unlock();
- mutex_lock(&bc->table.mutex);
- mutex_unlock(&bc->table.mutex);
- continue;
- }
- for (i = 0; i < tbl->size; i++)
- while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) {
- ck = container_of(pos, struct bkey_cached, hash);
- BUG_ON(!bkey_cached_evict(bc, ck));
- kfree(ck->k);
- kmem_cache_free(bch2_key_cache, ck);
- }
- }
- rcu_read_unlock();
- }
-
- if (atomic_long_read(&bc->nr_dirty) &&
- !bch2_journal_error(&c->journal) &&
- test_bit(BCH_FS_was_rw, &c->flags))
- panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
- atomic_long_read(&bc->nr_dirty));
-
- if (atomic_long_read(&bc->nr_keys))
- panic("btree key cache shutdown error: nr_keys nonzero (%li)\n",
- atomic_long_read(&bc->nr_keys));
-
- if (bc->table_init_done)
- rhashtable_destroy(&bc->table);
-
- rcu_pending_exit(&bc->pending[0]);
- rcu_pending_exit(&bc->pending[1]);
-
- free_percpu(bc->nr_pending);
-}
-
-void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
-{
-}
-
-int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
-{
- struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
- struct shrinker *shrink;
-
- bc->nr_pending = alloc_percpu(size_t);
- if (!bc->nr_pending)
- return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
-
- if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) ||
- rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free))
- return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
-
- if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
- return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
-
- bc->table_init_done = true;
-
- shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
- if (!shrink)
- return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
- bc->shrink = shrink;
- shrink->count_objects = bch2_btree_key_cache_count;
- shrink->scan_objects = bch2_btree_key_cache_scan;
- shrink->batch = 1 << 14;
- shrink->seeks = 0;
- shrink->private_data = c;
- shrinker_register(shrink);
- return 0;
-}
-
-void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc)
-{
- printbuf_tabstop_push(out, 24);
- printbuf_tabstop_push(out, 12);
-
- prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys));
- prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty));
- prt_printf(out, "table size:\t%u\r\n", bc->table.tbl->size);
- prt_newline(out);
- prt_printf(out, "shrinker:\n");
- prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free);
- prt_printf(out, "freed:\t%lu\r\n", bc->freed);
- prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty);
- prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed);
- prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail);
- prt_newline(out);
- prt_printf(out, "pending:\t%zu\r\n", per_cpu_sum(bc->nr_pending));
-}
-
-void bch2_btree_key_cache_exit(void)
-{
- kmem_cache_destroy(bch2_key_cache);
-}
-
-int __init bch2_btree_key_cache_init(void)
-{
- bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT);
- if (!bch2_key_cache)
- return -ENOMEM;
-
- return 0;
-}
diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h
deleted file mode 100644
index 82d8c72512a9..000000000000
--- a/fs/bcachefs/btree_key_cache.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_KEY_CACHE_H
-#define _BCACHEFS_BTREE_KEY_CACHE_H
-
-static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c)
-{
- size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
- size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
- size_t max_dirty = 1024 + nr_keys / 2;
-
- return max_t(ssize_t, 0, nr_dirty - max_dirty);
-}
-
-static inline ssize_t __bch2_btree_key_cache_must_wait(struct bch_fs *c)
-{
- size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
- size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
- size_t max_dirty = 4096 + (nr_keys * 3) / 4;
-
- return nr_dirty - max_dirty;
-}
-
-static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
-{
- return __bch2_btree_key_cache_must_wait(c) > 0;
-}
-
-static inline bool bch2_btree_key_cache_wait_done(struct bch_fs *c)
-{
- size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
- size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
- size_t max_dirty = 2048 + (nr_keys * 5) / 8;
-
- return nr_dirty <= max_dirty;
-}
-
-int bch2_btree_key_cache_journal_flush(struct journal *,
- struct journal_entry_pin *, u64);
-
-struct bkey_cached *
-bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
-
-int bch2_btree_path_traverse_cached(struct btree_trans *, btree_path_idx_t, unsigned);
-
-bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned,
- struct btree_insert_entry *);
-void bch2_btree_key_cache_drop(struct btree_trans *,
- struct btree_path *);
-
-void bch2_fs_btree_key_cache_exit(struct btree_key_cache *);
-void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *);
-int bch2_fs_btree_key_cache_init(struct btree_key_cache *);
-
-void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *);
-
-void bch2_btree_key_cache_exit(void);
-int __init bch2_btree_key_cache_init(void);
-
-#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */
diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h
deleted file mode 100644
index 722f1ed10551..000000000000
--- a/fs/bcachefs/btree_key_cache_types.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
-#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
-
-#include "rcu_pending.h"
-
-struct btree_key_cache {
- struct rhashtable table;
- bool table_init_done;
-
- struct shrinker *shrink;
- unsigned shrink_iter;
-
- /* 0: non pcpu reader locks, 1: pcpu reader locks */
- struct rcu_pending pending[2];
- size_t __percpu *nr_pending;
-
- atomic_long_t nr_keys;
- atomic_long_t nr_dirty;
-
- /* shrinker stats */
- unsigned long requested_to_free;
- unsigned long freed;
- unsigned long skipped_dirty;
- unsigned long skipped_accessed;
- unsigned long skipped_lock_fail;
-};
-
-struct bkey_cached_key {
- u32 btree_id;
- struct bpos pos;
-} __packed __aligned(4);
-
-#endif /* _BCACHEFS_BTREE_KEY_CACHE_TYPES_H */
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
deleted file mode 100644
index bed2b4b6ffb9..000000000000
--- a/fs/bcachefs/btree_locking.c
+++ /dev/null
@@ -1,936 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "btree_locking.h"
-#include "btree_types.h"
-
-static struct lock_class_key bch2_btree_node_lock_key;
-
-void bch2_btree_lock_init(struct btree_bkey_cached_common *b,
- enum six_lock_init_flags flags,
- gfp_t gfp)
-{
- __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags, gfp);
- lockdep_set_notrack_class(&b->lock);
-}
-
-/* Btree node locking: */
-
-struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans,
- struct btree_path *skip,
- struct btree_bkey_cached_common *b,
- unsigned level)
-{
- struct btree_path *path;
- struct six_lock_count ret;
- unsigned i;
-
- memset(&ret, 0, sizeof(ret));
-
- if (IS_ERR_OR_NULL(b))
- return ret;
-
- trans_for_each_path(trans, path, i)
- if (path != skip && &path->l[level].b->c == b) {
- int t = btree_node_locked_type(path, level);
-
- if (t != BTREE_NODE_UNLOCKED)
- ret.n[t]++;
- }
-
- return ret;
-}
-
-/* unlock */
-
-void bch2_btree_node_unlock_write(struct btree_trans *trans,
- struct btree_path *path, struct btree *b)
-{
- bch2_btree_node_unlock_write_inlined(trans, path, b);
-}
-
-/* lock */
-
-/*
- * @trans wants to lock @b with type @type
- */
-struct trans_waiting_for_lock {
- struct btree_trans *trans;
- struct btree_bkey_cached_common *node_want;
- enum six_lock_type lock_want;
-
- /* for iterating over held locks: */
- u8 path_idx;
- u8 level;
- u64 lock_start_time;
-};
-
-struct lock_graph {
- struct trans_waiting_for_lock g[8];
- unsigned nr;
-};
-
-static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
-{
- struct trans_waiting_for_lock *i;
-
- prt_printf(out, "Found lock cycle (%u entries):\n", g->nr);
-
- for (i = g->g; i < g->g + g->nr; i++) {
- struct task_struct *task = READ_ONCE(i->trans->locking_wait.task);
- if (!task)
- continue;
-
- bch2_btree_trans_to_text(out, i->trans);
- bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1, GFP_NOWAIT);
- }
-}
-
-static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
-{
- struct trans_waiting_for_lock *i;
-
- for (i = g->g; i != g->g + g->nr; i++) {
- struct task_struct *task = READ_ONCE(i->trans->locking_wait.task);
- if (i != g->g)
- prt_str(out, "<- ");
- prt_printf(out, "%u ", task ? task->pid : 0);
- }
- prt_newline(out);
-}
-
-static void lock_graph_up(struct lock_graph *g)
-{
- closure_put(&g->g[--g->nr].trans->ref);
-}
-
-static noinline void lock_graph_pop_all(struct lock_graph *g)
-{
- while (g->nr)
- lock_graph_up(g);
-}
-
-static noinline void lock_graph_pop_from(struct lock_graph *g, struct trans_waiting_for_lock *i)
-{
- while (g->g + g->nr > i)
- lock_graph_up(g);
-}
-
-static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
-{
- g->g[g->nr++] = (struct trans_waiting_for_lock) {
- .trans = trans,
- .node_want = trans->locking,
- .lock_want = trans->locking_wait.lock_want,
- };
-}
-
-static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
-{
- closure_get(&trans->ref);
- __lock_graph_down(g, trans);
-}
-
-static bool lock_graph_remove_non_waiters(struct lock_graph *g,
- struct trans_waiting_for_lock *from)
-{
- struct trans_waiting_for_lock *i;
-
- if (from->trans->locking != from->node_want) {
- lock_graph_pop_from(g, from);
- return true;
- }
-
- for (i = from + 1; i < g->g + g->nr; i++)
- if (i->trans->locking != i->node_want ||
- i->trans->locking_wait.start_time != i[-1].lock_start_time) {
- lock_graph_pop_from(g, i);
- return true;
- }
-
- return false;
-}
-
-static void trace_would_deadlock(struct lock_graph *g, struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
-
- count_event(c, trans_restart_would_deadlock);
-
- if (trace_trans_restart_would_deadlock_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- buf.atomic++;
- print_cycle(&buf, g);
-
- trace_trans_restart_would_deadlock(trans, buf.buf);
- printbuf_exit(&buf);
- }
-}
-
-static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
-{
- if (i == g->g) {
- trace_would_deadlock(g, i->trans);
- return btree_trans_restart_foreign_task(i->trans,
- BCH_ERR_transaction_restart_would_deadlock,
- _THIS_IP_);
- } else {
- i->trans->lock_must_abort = true;
- wake_up_process(i->trans->locking_wait.task);
- return 0;
- }
-}
-
-static int btree_trans_abort_preference(struct btree_trans *trans)
-{
- if (trans->lock_may_not_fail)
- return 0;
- if (trans->locking_wait.lock_want == SIX_LOCK_write)
- return 1;
- if (!trans->in_traverse_all)
- return 2;
- return 3;
-}
-
-static noinline __noreturn void break_cycle_fail(struct lock_graph *g)
-{
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
-
- prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
-
- for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++) {
- struct btree_trans *trans = i->trans;
-
- bch2_btree_trans_to_text(&buf, trans);
-
- prt_printf(&buf, "backtrace:\n");
- printbuf_indent_add(&buf, 2);
- bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT);
- printbuf_indent_sub(&buf, 2);
- prt_newline(&buf);
- }
-
- bch2_print_str(g->g->trans->c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- BUG();
-}
-
-static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle,
- struct trans_waiting_for_lock *from)
-{
- struct trans_waiting_for_lock *i, *abort = NULL;
- unsigned best = 0, pref;
- int ret;
-
- if (lock_graph_remove_non_waiters(g, from))
- return 0;
-
- /* Only checking, for debugfs: */
- if (cycle) {
- print_cycle(cycle, g);
- ret = -1;
- goto out;
- }
-
- for (i = from; i < g->g + g->nr; i++) {
- pref = btree_trans_abort_preference(i->trans);
- if (pref > best) {
- abort = i;
- best = pref;
- }
- }
-
- if (unlikely(!best))
- break_cycle_fail(g);
-
- ret = abort_lock(g, abort);
-out:
- if (ret)
- lock_graph_pop_all(g);
- else
- lock_graph_pop_from(g, abort);
- return ret;
-}
-
-static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans,
- struct printbuf *cycle)
-{
- struct btree_trans *orig_trans = g->g->trans;
-
- for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++)
- if (i->trans == trans) {
- closure_put(&trans->ref);
- return break_cycle(g, cycle, i);
- }
-
- if (unlikely(g->nr == ARRAY_SIZE(g->g))) {
- closure_put(&trans->ref);
-
- if (orig_trans->lock_may_not_fail)
- return 0;
-
- lock_graph_pop_all(g);
-
- if (cycle)
- return 0;
-
- trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_);
- return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit);
- }
-
- __lock_graph_down(g, trans);
- return 0;
-}
-
-static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2)
-{
- return t1 + t2 > 1;
-}
-
-int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle)
-{
- struct lock_graph g;
- struct trans_waiting_for_lock *top;
- struct btree_bkey_cached_common *b;
- btree_path_idx_t path_idx;
- int ret = 0;
-
- g.nr = 0;
-
- if (trans->lock_must_abort && !trans->lock_may_not_fail) {
- if (cycle)
- return -1;
-
- trace_would_deadlock(&g, trans);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
- }
-
- lock_graph_down(&g, trans);
-
- /* trans->paths is rcu protected vs. freeing */
- guard(rcu)();
- if (cycle)
- cycle->atomic++;
-next:
- if (!g.nr)
- goto out;
-
- top = &g.g[g.nr - 1];
-
- struct btree_path *paths = rcu_dereference(top->trans->paths);
- if (!paths)
- goto up;
-
- unsigned long *paths_allocated = trans_paths_allocated(paths);
-
- trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths),
- path_idx, top->path_idx) {
- struct btree_path *path = paths + path_idx;
- if (!path->nodes_locked)
- continue;
-
- if (path_idx != top->path_idx) {
- top->path_idx = path_idx;
- top->level = 0;
- top->lock_start_time = 0;
- }
-
- for (;
- top->level < BTREE_MAX_DEPTH;
- top->level++, top->lock_start_time = 0) {
- int lock_held = btree_node_locked_type(path, top->level);
-
- if (lock_held == BTREE_NODE_UNLOCKED)
- continue;
-
- b = &READ_ONCE(path->l[top->level].b)->c;
-
- if (IS_ERR_OR_NULL(b)) {
- /*
- * If we get here, it means we raced with the
- * other thread updating its btree_path
- * structures - which means it can't be blocked
- * waiting on a lock:
- */
- if (!lock_graph_remove_non_waiters(&g, g.g)) {
- /*
- * If lock_graph_remove_non_waiters()
- * didn't do anything, it must be
- * because we're being called by debugfs
- * checking for lock cycles, which
- * invokes us on btree_transactions that
- * aren't actually waiting on anything.
- * Just bail out:
- */
- lock_graph_pop_all(&g);
- }
-
- goto next;
- }
-
- if (list_empty_careful(&b->lock.wait_list))
- continue;
-
- raw_spin_lock(&b->lock.wait_lock);
- list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) {
- BUG_ON(b != trans->locking);
-
- if (top->lock_start_time &&
- time_after_eq64(top->lock_start_time, trans->locking_wait.start_time))
- continue;
-
- top->lock_start_time = trans->locking_wait.start_time;
-
- /* Don't check for self deadlock: */
- if (trans == top->trans ||
- !lock_type_conflicts(lock_held, trans->locking_wait.lock_want))
- continue;
-
- closure_get(&trans->ref);
- raw_spin_unlock(&b->lock.wait_lock);
-
- ret = lock_graph_descend(&g, trans, cycle);
- if (ret)
- goto out;
- goto next;
-
- }
- raw_spin_unlock(&b->lock.wait_lock);
- }
- }
-up:
- if (g.nr > 1 && cycle)
- print_chain(cycle, &g);
- lock_graph_up(&g);
- goto next;
-out:
- if (cycle)
- --cycle->atomic;
- return ret;
-}
-
-int bch2_six_check_for_deadlock(struct six_lock *lock, void *p)
-{
- struct btree_trans *trans = p;
-
- return bch2_check_for_deadlock(trans, NULL);
-}
-
-int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path,
- struct btree_bkey_cached_common *b,
- bool lock_may_not_fail)
-{
- int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read];
- int ret;
-
- /*
- * Must drop our read locks before calling six_lock_write() -
- * six_unlock() won't do wakeups until the reader count
- * goes to 0, and it's safe because we have the node intent
- * locked:
- */
- six_lock_readers_add(&b->lock, -readers);
- ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write,
- lock_may_not_fail, _RET_IP_);
- six_lock_readers_add(&b->lock, readers);
-
- if (ret)
- mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_INTENT_LOCKED);
-
- return ret;
-}
-
-void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
- struct btree_path *path,
- struct btree_bkey_cached_common *b)
-{
- int ret = __btree_node_lock_write(trans, path, b, true);
- BUG_ON(ret);
-}
-
-/* relock */
-
-static int btree_path_get_locks(struct btree_trans *trans,
- struct btree_path *path,
- bool upgrade,
- struct get_locks_fail *f,
- int restart_err)
-{
- unsigned l = path->level;
-
- do {
- if (!btree_path_node(path, l))
- break;
-
- if (!(upgrade
- ? bch2_btree_node_upgrade(trans, path, l)
- : bch2_btree_node_relock(trans, path, l)))
- goto err;
-
- l++;
- } while (l < path->locks_want);
-
- if (path->uptodate == BTREE_ITER_NEED_RELOCK)
- path->uptodate = BTREE_ITER_UPTODATE;
-
- return path->uptodate < BTREE_ITER_NEED_RELOCK ? 0 : -1;
-err:
- if (f) {
- f->l = l;
- f->b = path->l[l].b;
- }
-
- /*
- * Do transaction restart before unlocking, so we don't pop
- * should_be_locked asserts
- */
- if (restart_err) {
- btree_trans_restart(trans, restart_err);
- } else if (path->should_be_locked && !trans->restarted) {
- if (upgrade)
- path->locks_want = l;
- return -1;
- }
-
- __bch2_btree_path_unlock(trans, path);
- btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
-
- /*
- * When we fail to get a lock, we have to ensure that any child nodes
- * can't be relocked so bch2_btree_path_traverse has to walk back up to
- * the node that we failed to relock:
- */
- do {
- path->l[l].b = upgrade
- ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade)
- : ERR_PTR(-BCH_ERR_no_btree_node_relock);
- } while (l--);
-
- return -restart_err ?: -1;
-}
-
-bool __bch2_btree_node_relock(struct btree_trans *trans,
- struct btree_path *path, unsigned level,
- bool trace)
-{
- struct btree *b = btree_path_node(path, level);
- int want = __btree_lock_want(path, level);
-
- if (race_fault())
- goto fail;
-
- if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
- (btree_node_lock_seq_matches(path, b, level) &&
- btree_node_lock_increment(trans, &b->c, level, want))) {
- mark_btree_node_locked(trans, path, level, want);
- return true;
- }
-fail:
- if (trace && !trans->notrace_relock_fail)
- trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level);
- return false;
-}
-
-/* upgrade */
-
-bool bch2_btree_node_upgrade(struct btree_trans *trans,
- struct btree_path *path, unsigned level)
-{
- struct btree *b = path->l[level].b;
-
- if (!is_btree_node(path, level))
- return false;
-
- switch (btree_lock_want(path, level)) {
- case BTREE_NODE_UNLOCKED:
- BUG_ON(btree_node_locked(path, level));
- return true;
- case BTREE_NODE_READ_LOCKED:
- BUG_ON(btree_node_intent_locked(path, level));
- return bch2_btree_node_relock(trans, path, level);
- case BTREE_NODE_INTENT_LOCKED:
- break;
- case BTREE_NODE_WRITE_LOCKED:
- BUG();
- }
-
- if (btree_node_intent_locked(path, level))
- return true;
-
- if (race_fault())
- return false;
-
- if (btree_node_locked(path, level)
- ? six_lock_tryupgrade(&b->c.lock)
- : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
- goto success;
-
- if (btree_node_lock_seq_matches(path, b, level) &&
- btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) {
- btree_node_unlock(trans, path, level);
- goto success;
- }
-
- trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level);
- return false;
-success:
- mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
- return true;
-}
-
-/* Btree path locking: */
-
-/*
- * Only for btree_cache.c - only relocks intent locks
- */
-int bch2_btree_path_relock_intent(struct btree_trans *trans,
- struct btree_path *path)
-{
- unsigned l;
-
- for (l = path->level;
- l < path->locks_want && btree_path_node(path, l);
- l++) {
- if (!bch2_btree_node_relock(trans, path, l)) {
- __bch2_btree_path_unlock(trans, path);
- btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
- trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent);
- }
- }
-
- return 0;
-}
-
-__flatten
-bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path)
-{
- bool ret = !btree_path_get_locks(trans, path, false, NULL, 0);
- bch2_trans_verify_locks(trans);
- return ret;
-}
-
-int __bch2_btree_path_relock(struct btree_trans *trans,
- struct btree_path *path, unsigned long trace_ip)
-{
- if (!bch2_btree_path_relock_norestart(trans, path)) {
- trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path);
- }
-
- return 0;
-}
-
-bool __bch2_btree_path_upgrade_norestart(struct btree_trans *trans,
- struct btree_path *path,
- unsigned new_locks_want)
-{
- path->locks_want = new_locks_want;
-
- /*
- * If we need it locked, we can't touch it. Otherwise, we can return
- * success - bch2_path_get() will use this path, and it'll just be
- * retraversed:
- */
- bool ret = !btree_path_get_locks(trans, path, true, NULL, 0) ||
- !path->should_be_locked;
-
- bch2_btree_path_verify_locks(trans, path);
- return ret;
-}
-
-int __bch2_btree_path_upgrade(struct btree_trans *trans,
- struct btree_path *path,
- unsigned new_locks_want)
-{
- unsigned old_locks = path->nodes_locked;
- unsigned old_locks_want = path->locks_want;
-
- path->locks_want = max_t(unsigned, path->locks_want, new_locks_want);
-
- struct get_locks_fail f = {};
- int ret = btree_path_get_locks(trans, path, true, &f,
- BCH_ERR_transaction_restart_upgrade);
- if (!ret)
- goto out;
-
- /*
- * XXX: this is ugly - we'd prefer to not be mucking with other
- * iterators in the btree_trans here.
- *
- * On failure to upgrade the iterator, setting iter->locks_want and
- * calling get_locks() is sufficient to make bch2_btree_path_traverse()
- * get the locks we want on transaction restart.
- *
- * But if this iterator was a clone, on transaction restart what we did
- * to this iterator isn't going to be preserved.
- *
- * Possibly we could add an iterator field for the parent iterator when
- * an iterator is a copy - for now, we'll just upgrade any other
- * iterators with the same btree id.
- *
- * The code below used to be needed to ensure ancestor nodes get locked
- * before interior nodes - now that's handled by
- * bch2_btree_path_traverse_all().
- */
- if (!path->cached && !trans->in_traverse_all) {
- struct btree_path *linked;
- unsigned i;
-
- trans_for_each_path(trans, linked, i)
- if (linked != path &&
- linked->cached == path->cached &&
- linked->btree_id == path->btree_id &&
- linked->locks_want < new_locks_want) {
- linked->locks_want = new_locks_want;
- btree_path_get_locks(trans, linked, true, NULL, 0);
- }
- }
-
- count_event(trans->c, trans_restart_upgrade);
- if (trace_trans_restart_upgrade_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- prt_printf(&buf, "%s %pS\n", trans->fn, (void *) _RET_IP_);
- prt_printf(&buf, "btree %s pos\n", bch2_btree_id_str(path->btree_id));
- bch2_bpos_to_text(&buf, path->pos);
- prt_printf(&buf, "locks want %u -> %u level %u\n",
- old_locks_want, new_locks_want, f.l);
- prt_printf(&buf, "nodes_locked %x -> %x\n",
- old_locks, path->nodes_locked);
- prt_printf(&buf, "node %s ", IS_ERR(f.b) ? bch2_err_str(PTR_ERR(f.b)) :
- !f.b ? "(null)" : "(node)");
- prt_printf(&buf, "path seq %u node seq %u\n",
- IS_ERR_OR_NULL(f.b) ? 0 : f.b->c.lock.seq,
- path->l[f.l].lock_seq);
-
- trace_trans_restart_upgrade(trans->c, buf.buf);
- printbuf_exit(&buf);
- }
-out:
- bch2_trans_verify_locks(trans);
- return ret;
-}
-
-void __bch2_btree_path_downgrade(struct btree_trans *trans,
- struct btree_path *path,
- unsigned new_locks_want)
-{
- unsigned l, old_locks_want = path->locks_want;
-
- if (trans->restarted)
- return;
-
- EBUG_ON(path->locks_want < new_locks_want);
-
- path->locks_want = new_locks_want;
-
- while (path->nodes_locked &&
- (l = btree_path_highest_level_locked(path)) >= path->locks_want) {
- if (l > path->level) {
- btree_node_unlock(trans, path, l);
- } else {
- if (btree_node_intent_locked(path, l)) {
- six_lock_downgrade(&path->l[l].b->c.lock);
- mark_btree_node_locked_noreset(path, l, BTREE_NODE_READ_LOCKED);
- }
- break;
- }
- }
-
- bch2_btree_path_verify_locks(trans, path);
-
- trace_path_downgrade(trans, _RET_IP_, path, old_locks_want);
-}
-
-/* Btree transaction locking: */
-
-void bch2_trans_downgrade(struct btree_trans *trans)
-{
- struct btree_path *path;
- unsigned i;
-
- if (trans->restarted)
- return;
-
- trans_for_each_path(trans, path, i)
- if (path->ref)
- bch2_btree_path_downgrade(trans, path);
-}
-
-static inline void __bch2_trans_unlock(struct btree_trans *trans)
-{
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path(trans, path, i)
- __bch2_btree_path_unlock(trans, path);
-}
-
-static noinline __cold void bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path,
- struct get_locks_fail *f, bool trace, ulong ip)
-{
- if (!trace)
- goto out;
-
- if (trace_trans_restart_relock_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bpos_to_text(&buf, path->pos);
- prt_printf(&buf, " %s l=%u seq=%u node seq=",
- bch2_btree_id_str(path->btree_id),
- f->l, path->l[f->l].lock_seq);
- if (IS_ERR_OR_NULL(f->b)) {
- prt_str(&buf, bch2_err_str(PTR_ERR(f->b)));
- } else {
- prt_printf(&buf, "%u", f->b->c.lock.seq);
-
- struct six_lock_count c =
- bch2_btree_node_lock_counts(trans, NULL, &f->b->c, f->l);
- prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
-
- c = six_lock_counts(&f->b->c.lock);
- prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
- }
-
- trace_trans_restart_relock(trans, ip, buf.buf);
- printbuf_exit(&buf);
- }
-
- count_event(trans->c, trans_restart_relock);
-out:
- __bch2_trans_unlock(trans);
- bch2_trans_verify_locks(trans);
-}
-
-static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace, ulong ip)
-{
- bch2_trans_verify_locks(trans);
-
- if (unlikely(trans->restarted))
- return -((int) trans->restarted);
- if (unlikely(trans->locked))
- goto out;
-
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path(trans, path, i) {
- struct get_locks_fail f;
- int ret;
-
- if (path->should_be_locked &&
- (ret = btree_path_get_locks(trans, path, false, &f,
- BCH_ERR_transaction_restart_relock))) {
- bch2_trans_relock_fail(trans, path, &f, trace, ip);
- return ret;
- }
- }
-
- trans_set_locked(trans, true);
-out:
- bch2_trans_verify_locks(trans);
- return 0;
-}
-
-int bch2_trans_relock(struct btree_trans *trans)
-{
- return __bch2_trans_relock(trans, true, _RET_IP_);
-}
-
-int bch2_trans_relock_notrace(struct btree_trans *trans)
-{
- return __bch2_trans_relock(trans, false, _RET_IP_);
-}
-
-void bch2_trans_unlock(struct btree_trans *trans)
-{
- trans_set_unlocked(trans);
-
- __bch2_trans_unlock(trans);
-}
-
-void bch2_trans_unlock_long(struct btree_trans *trans)
-{
- bch2_trans_unlock(trans);
- bch2_trans_srcu_unlock(trans);
-}
-
-void bch2_trans_unlock_write(struct btree_trans *trans)
-{
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path(trans, path, i)
- for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++)
- if (btree_node_write_locked(path, l))
- bch2_btree_node_unlock_write(trans, path, path->l[l].b);
-}
-
-int __bch2_trans_mutex_lock(struct btree_trans *trans,
- struct mutex *lock)
-{
- int ret = drop_locks_do(trans, (mutex_lock(lock), 0));
-
- if (ret)
- mutex_unlock(lock);
- return ret;
-}
-
-/* Debug */
-
-void __bch2_btree_path_verify_locks(struct btree_trans *trans, struct btree_path *path)
-{
- if (!path->nodes_locked && btree_path_node(path, path->level)) {
- /*
- * A path may be uptodate and yet have nothing locked if and only if
- * there is no node at path->level, which generally means we were
- * iterating over all nodes and got to the end of the btree
- */
- BUG_ON(path->uptodate == BTREE_ITER_UPTODATE);
- BUG_ON(path->should_be_locked && trans->locked && !trans->restarted);
- }
-
- if (!path->nodes_locked)
- return;
-
- for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) {
- int want = btree_lock_want(path, l);
- int have = btree_node_locked_type_nowrite(path, l);
-
- BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED);
-
- BUG_ON(is_btree_node(path, l) && want != have);
-
- BUG_ON(btree_node_locked(path, l) &&
- path->l[l].lock_seq != six_lock_seq(&path->l[l].b->c.lock));
- }
-}
-
-static bool bch2_trans_locked(struct btree_trans *trans)
-{
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path(trans, path, i)
- if (path->nodes_locked)
- return true;
- return false;
-}
-
-void __bch2_trans_verify_locks(struct btree_trans *trans)
-{
- if (!trans->locked) {
- BUG_ON(bch2_trans_locked(trans));
- return;
- }
-
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path(trans, path, i)
- __bch2_btree_path_verify_locks(trans, path);
-}
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
deleted file mode 100644
index f2173a3316f4..000000000000
--- a/fs/bcachefs/btree_locking.h
+++ /dev/null
@@ -1,466 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_LOCKING_H
-#define _BCACHEFS_BTREE_LOCKING_H
-
-/*
- * Only for internal btree use:
- *
- * The btree iterator tracks what locks it wants to take, and what locks it
- * currently has - here we have wrappers for locking/unlocking btree nodes and
- * updating the iterator state
- */
-
-#include "btree_iter.h"
-#include "six.h"
-
-void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags, gfp_t gfp);
-
-void bch2_trans_unlock_write(struct btree_trans *);
-
-static inline bool is_btree_node(struct btree_path *path, unsigned l)
-{
- return l < BTREE_MAX_DEPTH && !IS_ERR_OR_NULL(path->l[l].b);
-}
-
-static inline struct btree_transaction_stats *btree_trans_stats(struct btree_trans *trans)
-{
- return trans->fn_idx < ARRAY_SIZE(trans->c->btree_transaction_stats)
- ? &trans->c->btree_transaction_stats[trans->fn_idx]
- : NULL;
-}
-
-/* matches six lock types */
-enum btree_node_locked_type {
- BTREE_NODE_UNLOCKED = -1,
- BTREE_NODE_READ_LOCKED = SIX_LOCK_read,
- BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent,
- BTREE_NODE_WRITE_LOCKED = SIX_LOCK_write,
-};
-
-static inline int btree_node_locked_type(struct btree_path *path,
- unsigned level)
-{
- return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3);
-}
-
-static inline int btree_node_locked_type_nowrite(struct btree_path *path,
- unsigned level)
-{
- int have = btree_node_locked_type(path, level);
- return have == BTREE_NODE_WRITE_LOCKED
- ? BTREE_NODE_INTENT_LOCKED
- : have;
-}
-
-static inline bool btree_node_write_locked(struct btree_path *path, unsigned l)
-{
- return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED;
-}
-
-static inline bool btree_node_intent_locked(struct btree_path *path, unsigned l)
-{
- return btree_node_locked_type(path, l) == BTREE_NODE_INTENT_LOCKED;
-}
-
-static inline bool btree_node_read_locked(struct btree_path *path, unsigned l)
-{
- return btree_node_locked_type(path, l) == BTREE_NODE_READ_LOCKED;
-}
-
-static inline bool btree_node_locked(struct btree_path *path, unsigned level)
-{
- return btree_node_locked_type(path, level) != BTREE_NODE_UNLOCKED;
-}
-
-static inline void mark_btree_node_locked_noreset(struct btree_path *path,
- unsigned level,
- enum btree_node_locked_type type)
-{
- /* relying on this to avoid a branch */
- BUILD_BUG_ON(SIX_LOCK_read != 0);
- BUILD_BUG_ON(SIX_LOCK_intent != 1);
-
- path->nodes_locked &= ~(3U << (level << 1));
- path->nodes_locked |= (type + 1) << (level << 1);
-}
-
-static inline void mark_btree_node_locked(struct btree_trans *trans,
- struct btree_path *path,
- unsigned level,
- enum btree_node_locked_type type)
-{
- mark_btree_node_locked_noreset(path, level, (enum btree_node_locked_type) type);
-#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
- path->l[level].lock_taken_time = local_clock();
-#endif
-}
-
-static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level)
-{
- return level < path->locks_want
- ? SIX_LOCK_intent
- : SIX_LOCK_read;
-}
-
-static inline enum btree_node_locked_type
-btree_lock_want(struct btree_path *path, int level)
-{
- if (level < path->level)
- return BTREE_NODE_UNLOCKED;
- if (level < path->locks_want)
- return BTREE_NODE_INTENT_LOCKED;
- if (level == path->level)
- return BTREE_NODE_READ_LOCKED;
- return BTREE_NODE_UNLOCKED;
-}
-
-static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
- struct btree_path *path, unsigned level)
-{
-#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
- __bch2_time_stats_update(&btree_trans_stats(trans)->lock_hold_times,
- path->l[level].lock_taken_time,
- local_clock());
-#endif
-}
-
-/* unlock: */
-
-void bch2_btree_node_unlock_write(struct btree_trans *,
- struct btree_path *, struct btree *);
-
-static inline void btree_node_unlock(struct btree_trans *trans,
- struct btree_path *path, unsigned level)
-{
- int lock_type = btree_node_locked_type(path, level);
-
- EBUG_ON(level >= BTREE_MAX_DEPTH);
-
- if (lock_type != BTREE_NODE_UNLOCKED) {
- if (unlikely(lock_type == BTREE_NODE_WRITE_LOCKED)) {
- bch2_btree_node_unlock_write(trans, path, path->l[level].b);
- lock_type = BTREE_NODE_INTENT_LOCKED;
- }
- six_unlock_type(&path->l[level].b->c.lock, lock_type);
- btree_trans_lock_hold_time_update(trans, path, level);
- mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED);
- }
-}
-
-static inline int btree_path_lowest_level_locked(struct btree_path *path)
-{
- return __ffs(path->nodes_locked) >> 1;
-}
-
-static inline int btree_path_highest_level_locked(struct btree_path *path)
-{
- return __fls(path->nodes_locked) >> 1;
-}
-
-static inline void __bch2_btree_path_unlock(struct btree_trans *trans,
- struct btree_path *path)
-{
- btree_path_set_dirty(trans, path, BTREE_ITER_NEED_RELOCK);
-
- while (path->nodes_locked)
- btree_node_unlock(trans, path, btree_path_lowest_level_locked(path));
-}
-
-/*
- * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
- * succeed:
- */
-static inline void
-__bch2_btree_node_unlock_write(struct btree_trans *trans, struct btree *b)
-{
- if (!b->c.lock.write_lock_recurse) {
- struct btree_path *linked;
- unsigned i;
-
- trans_for_each_path_with_node(trans, b, linked, i)
- linked->l[b->c.level].lock_seq++;
- }
-
- six_unlock_write(&b->c.lock);
-}
-
-static inline void
-bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path,
- struct btree *b)
-{
- EBUG_ON(path->l[b->c.level].b != b);
- EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock));
- EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write);
-
- mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
- __bch2_btree_node_unlock_write(trans, b);
-}
-
-int bch2_six_check_for_deadlock(struct six_lock *lock, void *p);
-
-/* lock: */
-
-static inline void trans_set_locked(struct btree_trans *trans, bool try)
-{
- if (!trans->locked) {
- lock_acquire_exclusive(&trans->dep_map, 0, try, NULL, _THIS_IP_);
- trans->locked = true;
- trans->last_unlock_ip = 0;
-
- trans->pf_memalloc_nofs = (current->flags & PF_MEMALLOC_NOFS) != 0;
- current->flags |= PF_MEMALLOC_NOFS;
- }
-}
-
-static inline void trans_set_unlocked(struct btree_trans *trans)
-{
- if (trans->locked) {
- lock_release(&trans->dep_map, _THIS_IP_);
- trans->locked = false;
- trans->last_unlock_ip = _RET_IP_;
-
- if (!trans->pf_memalloc_nofs)
- current->flags &= ~PF_MEMALLOC_NOFS;
- }
-}
-
-static inline int __btree_node_lock_nopath(struct btree_trans *trans,
- struct btree_bkey_cached_common *b,
- enum six_lock_type type,
- bool lock_may_not_fail,
- unsigned long ip)
-{
- trans->lock_may_not_fail = lock_may_not_fail;
- trans->lock_must_abort = false;
- trans->locking = b;
-
- int ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait,
- bch2_six_check_for_deadlock, trans, ip);
- WRITE_ONCE(trans->locking, NULL);
- WRITE_ONCE(trans->locking_wait.start_time, 0);
-
- if (!ret)
- trace_btree_path_lock(trans, _THIS_IP_, b);
- return ret;
-}
-
-static inline int __must_check
-btree_node_lock_nopath(struct btree_trans *trans,
- struct btree_bkey_cached_common *b,
- enum six_lock_type type,
- unsigned long ip)
-{
- return __btree_node_lock_nopath(trans, b, type, false, ip);
-}
-
-static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans,
- struct btree_bkey_cached_common *b,
- enum six_lock_type type)
-{
- int ret = __btree_node_lock_nopath(trans, b, type, true, _THIS_IP_);
-
- BUG_ON(ret);
-}
-
-/*
- * Lock a btree node if we already have it locked on one of our linked
- * iterators:
- */
-static inline bool btree_node_lock_increment(struct btree_trans *trans,
- struct btree_bkey_cached_common *b,
- unsigned level,
- enum btree_node_locked_type want)
-{
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path(trans, path, i)
- if (&path->l[level].b->c == b &&
- btree_node_locked_type(path, level) >= want) {
- six_lock_increment(&b->lock, (enum six_lock_type) want);
- return true;
- }
-
- return false;
-}
-
-static inline int btree_node_lock(struct btree_trans *trans,
- struct btree_path *path,
- struct btree_bkey_cached_common *b,
- unsigned level,
- enum six_lock_type type,
- unsigned long ip)
-{
- int ret = 0;
-
- EBUG_ON(level >= BTREE_MAX_DEPTH);
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
- if (likely(six_trylock_type(&b->lock, type)) ||
- btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) ||
- !(ret = btree_node_lock_nopath(trans, b, type, btree_path_ip_allocated(path)))) {
-#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
- path->l[b->level].lock_taken_time = local_clock();
-#endif
- }
-
- return ret;
-}
-
-int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_path *,
- struct btree_bkey_cached_common *b, bool);
-
-static inline int __btree_node_lock_write(struct btree_trans *trans,
- struct btree_path *path,
- struct btree_bkey_cached_common *b,
- bool lock_may_not_fail)
-{
- EBUG_ON(&path->l[b->level].b->c != b);
- EBUG_ON(path->l[b->level].lock_seq != six_lock_seq(&b->lock));
- EBUG_ON(!btree_node_intent_locked(path, b->level));
-
- /*
- * six locks are unfair, and read locks block while a thread wants a
- * write lock: thus, we need to tell the cycle detector we have a write
- * lock _before_ taking the lock:
- */
- mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_WRITE_LOCKED);
-
- return likely(six_trylock_write(&b->lock))
- ? 0
- : __bch2_btree_node_lock_write(trans, path, b, lock_may_not_fail);
-}
-
-static inline int __must_check
-bch2_btree_node_lock_write(struct btree_trans *trans,
- struct btree_path *path,
- struct btree_bkey_cached_common *b)
-{
- return __btree_node_lock_write(trans, path, b, false);
-}
-
-void bch2_btree_node_lock_write_nofail(struct btree_trans *,
- struct btree_path *,
- struct btree_bkey_cached_common *);
-
-/* relock: */
-
-bool bch2_btree_path_relock_norestart(struct btree_trans *, struct btree_path *);
-int __bch2_btree_path_relock(struct btree_trans *,
- struct btree_path *, unsigned long);
-
-static inline int bch2_btree_path_relock(struct btree_trans *trans,
- struct btree_path *path, unsigned long trace_ip)
-{
- return btree_node_locked(path, path->level)
- ? 0
- : __bch2_btree_path_relock(trans, path, trace_ip);
-}
-
-bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned, bool trace);
-
-static inline bool bch2_btree_node_relock(struct btree_trans *trans,
- struct btree_path *path, unsigned level)
-{
- EBUG_ON(btree_node_locked(path, level) &&
- !btree_node_write_locked(path, level) &&
- btree_node_locked_type(path, level) != __btree_lock_want(path, level));
-
- return likely(btree_node_locked(path, level)) ||
- (!IS_ERR_OR_NULL(path->l[level].b) &&
- __bch2_btree_node_relock(trans, path, level, true));
-}
-
-static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
- struct btree_path *path, unsigned level)
-{
- EBUG_ON(btree_node_locked(path, level) &&
- btree_node_locked_type_nowrite(path, level) !=
- __btree_lock_want(path, level));
-
- return likely(btree_node_locked(path, level)) ||
- (!IS_ERR_OR_NULL(path->l[level].b) &&
- __bch2_btree_node_relock(trans, path, level, false));
-}
-
-/* upgrade */
-
-bool __bch2_btree_path_upgrade_norestart(struct btree_trans *, struct btree_path *, unsigned);
-
-static inline bool bch2_btree_path_upgrade_norestart(struct btree_trans *trans,
- struct btree_path *path,
- unsigned new_locks_want)
-{
- return new_locks_want > path->locks_want
- ? __bch2_btree_path_upgrade_norestart(trans, path, new_locks_want)
- : true;
-}
-
-int __bch2_btree_path_upgrade(struct btree_trans *,
- struct btree_path *, unsigned);
-
-static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
- struct btree_path *path,
- unsigned new_locks_want)
-{
- new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
-
- return likely(path->locks_want >= new_locks_want && path->nodes_locked)
- ? 0
- : __bch2_btree_path_upgrade(trans, path, new_locks_want);
-}
-
-/* misc: */
-
-static inline void btree_path_set_should_be_locked(struct btree_trans *trans, struct btree_path *path)
-{
- EBUG_ON(!btree_node_locked(path, path->level));
- EBUG_ON(path->uptodate);
-
- if (!path->should_be_locked) {
- path->should_be_locked = true;
- trace_btree_path_should_be_locked(trans, path);
- }
-}
-
-static inline void __btree_path_set_level_up(struct btree_trans *trans,
- struct btree_path *path,
- unsigned l)
-{
- btree_node_unlock(trans, path, l);
- path->l[l].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
-}
-
-static inline void btree_path_set_level_up(struct btree_trans *trans,
- struct btree_path *path)
-{
- __btree_path_set_level_up(trans, path, path->level++);
- btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
-}
-
-/* debug */
-
-struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *,
- struct btree_path *,
- struct btree_bkey_cached_common *b,
- unsigned);
-
-int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *);
-
-void __bch2_btree_path_verify_locks(struct btree_trans *, struct btree_path *);
-void __bch2_trans_verify_locks(struct btree_trans *);
-
-static inline void bch2_btree_path_verify_locks(struct btree_trans *trans,
- struct btree_path *path)
-{
- if (static_branch_unlikely(&bch2_debug_check_btree_locking))
- __bch2_btree_path_verify_locks(trans, path);
-}
-
-static inline void bch2_trans_verify_locks(struct btree_trans *trans)
-{
- if (static_branch_unlikely(&bch2_debug_check_btree_locking))
- __bch2_trans_verify_locks(trans);
-}
-
-#endif /* _BCACHEFS_BTREE_LOCKING_H */
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
deleted file mode 100644
index a3fb07c60e25..000000000000
--- a/fs/bcachefs/btree_node_scan.c
+++ /dev/null
@@ -1,611 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_journal_iter.h"
-#include "btree_node_scan.h"
-#include "btree_update_interior.h"
-#include "buckets.h"
-#include "error.h"
-#include "journal_io.h"
-#include "recovery_passes.h"
-
-#include <linux/kthread.h>
-#include <linux/min_heap.h>
-#include <linux/sched/sysctl.h>
-#include <linux/sort.h>
-
-struct find_btree_nodes_worker {
- struct closure *cl;
- struct find_btree_nodes *f;
- struct bch_dev *ca;
-};
-
-static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
-{
- bch2_btree_id_level_to_text(out, n->btree_id, n->level);
- prt_printf(out, " seq=%u journal_seq=%llu cookie=%llx ",
- n->seq, n->journal_seq, n->cookie);
- bch2_bpos_to_text(out, n->min_key);
- prt_str(out, "-");
- bch2_bpos_to_text(out, n->max_key);
-
- if (n->range_updated)
- prt_str(out, " range updated");
-
- for (unsigned i = 0; i < n->nr_ptrs; i++) {
- prt_char(out, ' ');
- bch2_extent_ptr_to_text(out, c, n->ptrs + i);
- }
-}
-
-static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes)
-{
- printbuf_indent_add(out, 2);
- darray_for_each(nodes, i) {
- found_btree_node_to_text(out, c, i);
- prt_newline(out);
- }
- printbuf_indent_sub(out, 2);
-}
-
-static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f)
-{
- struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k);
-
- set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs);
- bp->k.p = f->max_key;
- bp->v.seq = cpu_to_le64(f->cookie);
- bp->v.sectors_written = 0;
- bp->v.flags = 0;
- bp->v.sectors_written = cpu_to_le16(f->sectors_written);
- bp->v.min_key = f->min_key;
- SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated);
- memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
-}
-
-static inline u64 bkey_journal_seq(struct bkey_s_c k)
-{
- switch (k.k->type) {
- case KEY_TYPE_inode_v3:
- return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_journal_seq);
- default:
- return 0;
- }
-}
-
-static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
-{
- const struct found_btree_node *l = _l;
- const struct found_btree_node *r = _r;
-
- return cmp_int(l->btree_id, r->btree_id) ?:
- cmp_int(l->level, r->level) ?:
- cmp_int(l->cookie, r->cookie);
-}
-
-/*
- * Given two found btree nodes, if their sequence numbers are equal, take the
- * one that's readable:
- */
-static int found_btree_node_cmp_time(const struct found_btree_node *l,
- const struct found_btree_node *r)
-{
- return cmp_int(l->seq, r->seq) ?:
- cmp_int(l->journal_seq, r->journal_seq);
-}
-
-static int found_btree_node_cmp_pos(const void *_l, const void *_r)
-{
- const struct found_btree_node *l = _l;
- const struct found_btree_node *r = _r;
-
- return cmp_int(l->btree_id, r->btree_id) ?:
- -cmp_int(l->level, r->level) ?:
- bpos_cmp(l->min_key, r->min_key) ?:
- -found_btree_node_cmp_time(l, r);
-}
-
-static inline bool found_btree_node_cmp_pos_less(const void *l, const void *r, void *arg)
-{
- return found_btree_node_cmp_pos(l, r) < 0;
-}
-
-static inline void found_btree_node_swap(void *_l, void *_r, void *arg)
-{
- struct found_btree_node *l = _l;
- struct found_btree_node *r = _r;
-
- swap(*l, *r);
-}
-
-static const struct min_heap_callbacks found_btree_node_heap_cbs = {
- .less = found_btree_node_cmp_pos_less,
- .swp = found_btree_node_swap,
-};
-
-static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
- struct btree *b, struct bio *bio, u64 offset)
-{
- struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
- struct btree_node *bn = b->data;
-
- bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
- bio->bi_iter.bi_sector = offset;
- bch2_bio_map(bio, b->data, c->opts.block_size);
-
- u64 submit_time = local_clock();
- submit_bio_wait(bio);
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
-
- if (bio->bi_status) {
- bch_err_dev_ratelimited(ca,
- "IO error in try_read_btree_node() at %llu: %s",
- offset, bch2_blk_status_to_str(bio->bi_status));
- return;
- }
-
- if (le64_to_cpu(bn->magic) != bset_magic(c))
- return;
-
- if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
- if (!c->chacha20_key_set)
- return;
-
- struct nonce nonce = btree_nonce(&bn->keys, 0);
- unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
-
- bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes);
- }
-
- if (btree_id_is_alloc(BTREE_NODE_ID(bn)))
- return;
-
- if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
- return;
-
- if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX)
- return;
-
- rcu_read_lock();
- struct found_btree_node n = {
- .btree_id = BTREE_NODE_ID(bn),
- .level = BTREE_NODE_LEVEL(bn),
- .seq = BTREE_NODE_SEQ(bn),
- .cookie = le64_to_cpu(bn->keys.seq),
- .min_key = bn->min_key,
- .max_key = bn->max_key,
- .nr_ptrs = 1,
- .ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr,
- .ptrs[0].offset = offset,
- .ptrs[0].dev = ca->dev_idx,
- .ptrs[0].gen = bucket_gen_get(ca, sector_to_bucket(ca, offset)),
- };
- rcu_read_unlock();
-
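-	/*
-	 * The header checks passed: re-read the entire node (only the first
-	 * block was read above) so bch2_btree_node_read_done() can validate it:
-	 */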
- bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
- bio->bi_iter.bi_sector = offset;
- bch2_bio_map(bio, b->data, c->opts.btree_node_size);
-
- submit_time = local_clock();
- submit_bio_wait(bio);
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
-
- found_btree_node_to_key(&b->key, &n);
-
- CLASS(printbuf, buf)();
- if (!bch2_btree_node_read_done(c, ca, b, NULL, &buf)) {
- /* read_done will swap out b->data for another buffer */
- bn = b->data;
- /*
- * Grab journal_seq here because we want the max journal_seq of
- * any bset; read_done sorts down to a single set and picks the
- * max journal_seq
- */
-		n.journal_seq = le64_to_cpu(bn->keys.journal_seq);
- n.sectors_written = b->written;
-
- mutex_lock(&f->lock);
- if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
- bch_err(c, "try_read_btree_node() can't handle endian conversion");
- f->ret = -EINVAL;
- goto unlock;
- }
-
- if (darray_push(&f->nodes, n))
- f->ret = -ENOMEM;
-unlock:
- mutex_unlock(&f->lock);
- }
-}
-
-static int read_btree_nodes_worker(void *p)
-{
- struct find_btree_nodes_worker *w = p;
- struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
- struct bch_dev *ca = w->ca;
- unsigned long last_print = jiffies;
- struct btree *b = NULL;
- struct bio *bio = NULL;
-
- b = __bch2_btree_node_mem_alloc(c);
- if (!b) {
- bch_err(c, "read_btree_nodes_worker: error allocating buf");
- w->f->ret = -ENOMEM;
- goto err;
- }
-
- bio = bio_alloc(NULL, buf_pages(b->data, c->opts.btree_node_size), 0, GFP_KERNEL);
- if (!bio) {
- bch_err(c, "read_btree_nodes_worker: error allocating bio");
- w->f->ret = -ENOMEM;
- goto err;
- }
-
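-	/* Scan every btree-node-sized chunk of every bucket on this device: */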
- for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++)
- for (unsigned bucket_offset = 0;
- bucket_offset + btree_sectors(c) <= ca->mi.bucket_size;
- bucket_offset += btree_sectors(c)) {
- if (time_after(jiffies, last_print + HZ * 30)) {
- u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset;
- u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size;
-
- bch_info(ca, "%s: %2u%% done", __func__,
- (unsigned) div64_u64(cur_sector * 100, end_sector));
- last_print = jiffies;
- }
-
- u64 sector = bucket * ca->mi.bucket_size + bucket_offset;
-
- if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap &&
- !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
- continue;
-
- try_read_btree_node(w->f, ca, b, bio, sector);
- }
-err:
- if (b)
- __btree_node_data_free(b);
- kfree(b);
-	if (bio)
-		bio_put(bio);
- enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
- closure_put(w->cl);
- kfree(w);
- return 0;
-}
-
-static int read_btree_nodes(struct find_btree_nodes *f)
-{
- struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
- struct closure cl;
- int ret = 0;
-
- closure_init_stack(&cl);
-
- for_each_online_member(c, ca, BCH_DEV_READ_REF_btree_node_scan) {
- if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
- continue;
-
- struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
- if (!w) {
- enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
- ret = -ENOMEM;
- goto err;
- }
-
- w->cl = &cl;
- w->f = f;
- w->ca = ca;
-
- struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
- ret = PTR_ERR_OR_ZERO(t);
- if (ret) {
- enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
- kfree(w);
- bch_err_msg(c, ret, "starting kthread");
- break;
- }
-
- closure_get(&cl);
- enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
- wake_up_process(t);
- }
-err:
- while (closure_sync_timeout(&cl, sysctl_hung_task_timeout_secs * HZ / 2))
- ;
- return f->ret ?: ret;
-}
-
-static bool nodes_overlap(const struct found_btree_node *l,
- const struct found_btree_node *r)
-{
- return (l->btree_id == r->btree_id &&
- l->level == r->level &&
- bpos_gt(l->max_key, r->min_key));
-}
-
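-/*
- * Resolve overlaps between @l, the node most recently kept, and the candidates
- * remaining on @nodes_heap: the node whose sequence numbers are older (or, on
- * a tie, the candidate from the heap) is trimmed or dropped until the
- * surviving ranges no longer overlap.
- */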
-static int handle_overwrites(struct bch_fs *c,
- struct found_btree_node *l,
- found_btree_nodes *nodes_heap)
-{
- struct found_btree_node *r;
-
- while ((r = min_heap_peek(nodes_heap)) &&
- nodes_overlap(l, r)) {
- int cmp = found_btree_node_cmp_time(l, r);
-
- if (cmp > 0) {
- if (bpos_cmp(l->max_key, r->max_key) >= 0)
- min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
- else {
- r->range_updated = true;
- r->min_key = bpos_successor(l->max_key);
- min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
- }
- } else if (cmp < 0) {
- BUG_ON(bpos_eq(l->min_key, r->min_key));
-
- l->max_key = bpos_predecessor(r->min_key);
- l->range_updated = true;
- } else if (r->level) {
- min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
- } else {
- if (bpos_cmp(l->max_key, r->max_key) >= 0)
- min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
- else {
- r->range_updated = true;
- r->min_key = bpos_successor(l->max_key);
- min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
- }
- }
-
- cond_resched();
- }
-
- return 0;
-}
-
-int bch2_scan_for_btree_nodes(struct bch_fs *c)
-{
- struct find_btree_nodes *f = &c->found_btree_nodes;
- struct printbuf buf = PRINTBUF;
- found_btree_nodes nodes_heap = {};
- size_t dst;
- int ret = 0;
-
- if (f->nodes.nr)
- return 0;
-
- mutex_init(&f->lock);
-
- ret = read_btree_nodes(f);
- if (ret)
- return ret;
-
- if (!f->nodes.nr) {
- bch_err(c, "%s: no btree nodes found", __func__);
- ret = -EINVAL;
- goto err;
- }
-
- if (0 && c->opts.verbose) {
- printbuf_reset(&buf);
- prt_printf(&buf, "%s: nodes found:\n", __func__);
- found_btree_nodes_to_text(&buf, c, f->nodes);
- bch2_print_str(c, KERN_INFO, buf.buf);
- }
-
- sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
-
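-	/*
-	 * Entries with the same cookie are replicas of the same btree node:
-	 * merge their pointers into a single entry.
-	 */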
- dst = 0;
- darray_for_each(f->nodes, i) {
- struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL;
-
- if (prev &&
- prev->cookie == i->cookie) {
- if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) {
- bch_err(c, "%s: found too many replicas for btree node", __func__);
- ret = -EINVAL;
- goto err;
- }
- prev->ptrs[prev->nr_ptrs++] = i->ptrs[0];
- } else {
- f->nodes.data[dst++] = *i;
- }
- }
- f->nodes.nr = dst;
-
- sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
-
- if (0 && c->opts.verbose) {
- printbuf_reset(&buf);
- prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
- found_btree_nodes_to_text(&buf, c, f->nodes);
- bch2_print_str(c, KERN_INFO, buf.buf);
- }
-
- swap(nodes_heap, f->nodes);
-
- {
- /* darray must have same layout as a heap */
- min_heap_char real_heap;
- BUILD_BUG_ON(sizeof(nodes_heap.nr) != sizeof(real_heap.nr));
- BUILD_BUG_ON(sizeof(nodes_heap.size) != sizeof(real_heap.size));
- BUILD_BUG_ON(offsetof(found_btree_nodes, nr) != offsetof(min_heap_char, nr));
- BUILD_BUG_ON(offsetof(found_btree_nodes, size) != offsetof(min_heap_char, size));
- }
-
- min_heapify_all(&nodes_heap, &found_btree_node_heap_cbs, NULL);
-
- if (nodes_heap.nr) {
- ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap));
- if (ret)
- goto err;
-
- min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL);
- }
-
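-	/*
-	 * Pop nodes off the heap in order (per found_btree_node_cmp_pos),
-	 * resolving overlaps against the last node we kept as we go:
-	 */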
- while (true) {
- ret = handle_overwrites(c, &darray_last(f->nodes), &nodes_heap);
- if (ret)
- goto err;
-
- if (!nodes_heap.nr)
- break;
-
- ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap));
- if (ret)
- goto err;
-
- min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL);
- }
-
- for (struct found_btree_node *n = f->nodes.data; n < &darray_last(f->nodes); n++)
- BUG_ON(nodes_overlap(n, n + 1));
-
- if (0 && c->opts.verbose) {
- printbuf_reset(&buf);
- prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
- found_btree_nodes_to_text(&buf, c, f->nodes);
- bch2_print_str(c, KERN_INFO, buf.buf);
- } else {
- bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr);
- }
-
- eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
-err:
- darray_exit(&nodes_heap);
- printbuf_exit(&buf);
- return ret;
-}
-
-static int found_btree_node_range_start_cmp(const void *_l, const void *_r)
-{
- const struct found_btree_node *l = _l;
- const struct found_btree_node *r = _r;
-
- return cmp_int(l->btree_id, r->btree_id) ?:
- -cmp_int(l->level, r->level) ?:
- bpos_cmp(l->max_key, r->min_key);
-}
-
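-/*
- * Iterate over every found node of the given btree and level whose key range
- * overlaps [_search.min_key, _search.max_key], starting from an eytzinger
- * search for the first overlapping node:
- */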
-#define for_each_found_btree_node_in_range(_f, _search, _idx) \
- for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr, \
- sizeof((_f)->nodes.data[0]), \
-					found_btree_node_range_start_cmp, &_search); \
- _idx < (_f)->nodes.nr && \
- (_f)->nodes.data[_idx].btree_id == _search.btree_id && \
- (_f)->nodes.data[_idx].level == _search.level && \
- bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key); \
- _idx = eytzinger0_next(_idx, (_f)->nodes.nr))
-
-bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
-{
- struct find_btree_nodes *f = &c->found_btree_nodes;
-
- struct found_btree_node search = {
- .btree_id = b->c.btree_id,
- .level = b->c.level,
- .min_key = b->data->min_key,
- .max_key = b->key.k.p,
- };
-
- for_each_found_btree_node_in_range(f, search, idx)
- if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data))
- return true;
- return false;
-}
-
-int bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
-{
- int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
- if (ret)
- return ret;
-
- struct found_btree_node search = {
- .btree_id = btree,
- .level = 0,
- .min_key = POS_MIN,
- .max_key = SPOS_MAX,
- };
-
- for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx)
- return true;
- return false;
-}
-
-int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
- unsigned level, struct bpos node_min, struct bpos node_max)
-{
- if (btree_id_is_alloc(btree))
- return 0;
-
- struct find_btree_nodes *f = &c->found_btree_nodes;
-
- int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
- if (ret)
- return ret;
-
- if (c->opts.verbose) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "recovery ");
- bch2_btree_id_level_to_text(&buf, btree, level);
- prt_str(&buf, " ");
- bch2_bpos_to_text(&buf, node_min);
- prt_str(&buf, " - ");
- bch2_bpos_to_text(&buf, node_max);
-
- bch_info(c, "%s(): %s", __func__, buf.buf);
- printbuf_exit(&buf);
- }
-
- struct found_btree_node search = {
- .btree_id = btree,
- .level = level,
- .min_key = node_min,
- .max_key = node_max,
- };
-
- for_each_found_btree_node_in_range(f, search, idx) {
- struct found_btree_node n = f->nodes.data[idx];
-
- n.range_updated |= bpos_lt(n.min_key, node_min);
- n.min_key = bpos_max(n.min_key, node_min);
-
- n.range_updated |= bpos_gt(n.max_key, node_max);
- n.max_key = bpos_min(n.max_key, node_max);
-
- struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
-
- found_btree_node_to_key(&tmp.k, &n);
-
- if (c->opts.verbose) {
- struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
- bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
- printbuf_exit(&buf);
- }
-
- BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k),
- (struct bkey_validate_context) {
- .from = BKEY_VALIDATE_btree_node,
- .level = level + 1,
- .btree = btree,
- }));
-
- ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-void bch2_find_btree_nodes_exit(struct find_btree_nodes *f)
-{
- darray_exit(&f->nodes);
-}
diff --git a/fs/bcachefs/btree_node_scan.h b/fs/bcachefs/btree_node_scan.h
deleted file mode 100644
index 66e6f9ed19d0..000000000000
--- a/fs/bcachefs/btree_node_scan.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_NODE_SCAN_H
-#define _BCACHEFS_BTREE_NODE_SCAN_H
-
-int bch2_scan_for_btree_nodes(struct bch_fs *);
-bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *);
-int bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id);
-int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos);
-void bch2_find_btree_nodes_exit(struct find_btree_nodes *);
-
-#endif /* _BCACHEFS_BTREE_NODE_SCAN_H */
diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h
deleted file mode 100644
index 2811b6857c97..000000000000
--- a/fs/bcachefs/btree_node_scan_types.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
-#define _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
-
-#include "darray.h"
-
-struct found_btree_node {
- bool range_updated:1;
- u8 btree_id;
- u8 level;
- unsigned sectors_written;
- u32 seq;
- u64 journal_seq;
- u64 cookie;
-
- struct bpos min_key;
- struct bpos max_key;
-
- unsigned nr_ptrs;
- struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX];
-};
-
-typedef DARRAY(struct found_btree_node) found_btree_nodes;
-
-struct find_btree_nodes {
- int ret;
- struct mutex lock;
- found_btree_nodes nodes;
-};
-
-#endif /* _BCACHEFS_BTREE_NODE_SCAN_TYPES_H */
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
deleted file mode 100644
index 639ef75b3dbd..000000000000
--- a/fs/bcachefs/btree_trans_commit.c
+++ /dev/null
@@ -1,1121 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "btree_gc.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_journal_iter.h"
-#include "btree_key_cache.h"
-#include "btree_update_interior.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "disk_accounting.h"
-#include "enumerated_ref.h"
-#include "errcode.h"
-#include "error.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "journal_reclaim.h"
-#include "replicas.h"
-#include "snapshot.h"
-
-#include <linux/prefetch.h>
-#include <linux/string_helpers.h>
-
-static const char * const trans_commit_flags_strs[] = {
-#define x(n, ...) #n,
- BCH_TRANS_COMMIT_FLAGS()
-#undef x
- NULL
-};
-
-void bch2_trans_commit_flags_to_text(struct printbuf *out, enum bch_trans_commit_flags flags)
-{
- enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
-
- prt_printf(out, "watermark=%s", bch2_watermarks[watermark]);
-
- flags >>= BCH_WATERMARK_BITS;
- if (flags) {
- prt_char(out, ' ');
- bch2_prt_bitflags(out, trans_commit_flags_strs, flags);
- }
-}
-
-static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct bch_fs *c = trans->c;
- struct bkey u;
- struct bkey_s_c k = bch2_btree_path_peek_slot_exact(trans->paths + i->path, &u);
-
- if (unlikely(trans->journal_replay_not_finished)) {
- struct bkey_i *j_k =
- bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p);
-
- if (j_k)
- k = bkey_i_to_s_c(j_k);
- }
-
- u = *k.k;
- u.needs_whiteout = i->old_k.needs_whiteout;
-
- BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey)));
- BUG_ON(i->old_v != k.v);
-#endif
-}
-
-static inline struct btree_path_level *insert_l(struct btree_trans *trans, struct btree_insert_entry *i)
-{
- return (trans->paths + i->path)->l + i->level;
-}
-
-static inline bool same_leaf_as_prev(struct btree_trans *trans,
- struct btree_insert_entry *i)
-{
- return i != trans->updates &&
- insert_l(trans, &i[0])->b == insert_l(trans, &i[-1])->b;
-}
-
-static inline bool same_leaf_as_next(struct btree_trans *trans,
- struct btree_insert_entry *i)
-{
- return i + 1 < trans->updates + trans->nr_updates &&
- insert_l(trans, &i[0])->b == insert_l(trans, &i[1])->b;
-}
-
-inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b)
-{
- struct bch_fs *c = trans->c;
-
- if (unlikely(btree_node_just_written(b)) &&
- bch2_btree_post_write_cleanup(c, b))
- bch2_trans_node_reinit_iter(trans, b);
-
- /*
- * If the last bset has been written, or if it's gotten too big - start
- * a new bset to insert into:
- */
- if (want_new_bset(c, b))
- bch2_btree_init_next(trans, b);
-}
-
-static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
-{
- while (--i >= trans->updates) {
- if (same_leaf_as_prev(trans, i))
- continue;
-
- bch2_btree_node_unlock_write(trans, trans->paths + i->path, insert_l(trans, i)->b);
- }
-
- trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
-}
-
-static inline int bch2_trans_lock_write(struct btree_trans *trans)
-{
- EBUG_ON(trans->write_locked);
-
- trans_for_each_update(trans, i) {
- if (same_leaf_as_prev(trans, i))
- continue;
-
- if (bch2_btree_node_lock_write(trans, trans->paths + i->path, &insert_l(trans, i)->b->c))
- return trans_lock_write_fail(trans, i);
-
- if (!i->cached)
- bch2_btree_node_prep_for_write(trans, trans->paths + i->path, insert_l(trans, i)->b);
- }
-
- trans->write_locked = true;
- return 0;
-}
-
-static inline void bch2_trans_unlock_updates_write(struct btree_trans *trans)
-{
- if (likely(trans->write_locked)) {
- trans_for_each_update(trans, i)
- if (btree_node_locked_type(trans->paths + i->path, i->level) ==
- BTREE_NODE_WRITE_LOCKED)
- bch2_btree_node_unlock_write_inlined(trans,
- trans->paths + i->path, insert_l(trans, i)->b);
- trans->write_locked = false;
- }
-}
-
-/* Inserting into a given leaf node (last stage of insert): */
-
-/* Handle overwrites and do insert, for non extents: */
-bool bch2_btree_bset_insert_key(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct btree_node_iter *node_iter,
- struct bkey_i *insert)
-{
- struct bkey_packed *k;
- unsigned clobber_u64s = 0, new_u64s = 0;
-
- EBUG_ON(btree_node_just_written(b));
- EBUG_ON(bset_written(b, btree_bset_last(b)));
- EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
- EBUG_ON(bpos_lt(insert->k.p, b->data->min_key));
- EBUG_ON(bpos_gt(insert->k.p, b->data->max_key));
- EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b));
- EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos));
- kmsan_check_memory(insert, bkey_bytes(&insert->k));
-
- k = bch2_btree_node_iter_peek_all(node_iter, b);
- if (k && bkey_cmp_left_packed(b, k, &insert->k.p))
- k = NULL;
-
- /* @k is the key being overwritten/deleted, if any: */
- EBUG_ON(k && bkey_deleted(k));
-
- /* Deleting, but not found? nothing to do: */
- if (bkey_deleted(&insert->k) && !k)
- return false;
-
- if (bkey_deleted(&insert->k)) {
- /* Deleting: */
- btree_account_key_drop(b, k);
- k->type = KEY_TYPE_deleted;
-
- if (k->needs_whiteout)
- push_whiteout(b, insert->k.p);
- k->needs_whiteout = false;
-
- if (k >= btree_bset_last(b)->start) {
- clobber_u64s = k->u64s;
- bch2_bset_delete(b, k, clobber_u64s);
- goto fix_iter;
- } else {
- bch2_btree_path_fix_key_modified(trans, b, k);
- }
-
- return true;
- }
-
- if (k) {
- /* Overwriting: */
- btree_account_key_drop(b, k);
- k->type = KEY_TYPE_deleted;
-
- insert->k.needs_whiteout = k->needs_whiteout;
- k->needs_whiteout = false;
-
- if (k >= btree_bset_last(b)->start) {
- clobber_u64s = k->u64s;
- goto overwrite;
- } else {
- bch2_btree_path_fix_key_modified(trans, b, k);
- }
- }
-
- k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b));
-overwrite:
- bch2_bset_insert(b, k, insert, clobber_u64s);
- new_u64s = k->u64s;
-fix_iter:
- if (clobber_u64s != new_u64s)
- bch2_btree_node_iter_fix(trans, path, b, node_iter, k,
- clobber_u64s, new_u64s);
- return true;
-}
-
-static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
- unsigned i, u64 seq)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct btree_write *w = container_of(pin, struct btree_write, journal);
- struct btree *b = container_of(w, struct btree, writes[i]);
- struct btree_trans *trans = bch2_trans_get(c);
- unsigned long old, new;
- unsigned idx = w - b->writes;
-
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
-
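-	/*
-	 * If this write is still pending - node dirty, write index matches,
-	 * journal pin sequence unchanged - mark it as a journal reclaim write
-	 * that needs to happen now:
-	 */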
- old = READ_ONCE(b->flags);
- do {
- new = old;
-
- if (!(old & (1 << BTREE_NODE_dirty)) ||
- !!(old & (1 << BTREE_NODE_write_idx)) != idx ||
- w->journal.seq != seq)
- break;
-
- new &= ~BTREE_WRITE_TYPE_MASK;
- new |= BTREE_WRITE_journal_reclaim;
- new |= 1 << BTREE_NODE_need_write;
- } while (!try_cmpxchg(&b->flags, &old, new));
-
- btree_node_write_if_need(trans, b, SIX_LOCK_read);
- six_unlock_read(&b->c.lock);
-
- bch2_trans_put(trans);
- return 0;
-}
-
-int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
-{
- return __btree_node_flush(j, pin, 0, seq);
-}
-
-int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
-{
- return __btree_node_flush(j, pin, 1, seq);
-}
-
-inline void bch2_btree_add_journal_pin(struct bch_fs *c,
- struct btree *b, u64 seq)
-{
- struct btree_write *w = btree_current_write(b);
-
- bch2_journal_pin_add(&c->journal, seq, &w->journal,
- btree_node_write_idx(b) == 0
- ? bch2_btree_node_flush0
- : bch2_btree_node_flush1);
-}
-
-/**
- * bch2_btree_insert_key_leaf() - insert a key into a leaf node
- * @trans: btree transaction object
- * @path: path pointing to @insert's pos
- * @insert: key to insert
- * @journal_seq: sequence number of journal reservation
- */
-inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
- struct btree_path *path,
- struct bkey_i *insert,
- u64 journal_seq)
-{
- struct bch_fs *c = trans->c;
- struct btree *b = path_l(path)->b;
- struct bset_tree *t = bset_tree_last(b);
- struct bset *i = bset(b, t);
- int old_u64s = bset_u64s(t);
- int old_live_u64s = b->nr.live_u64s;
- int live_u64s_added, u64s_added;
-
- if (unlikely(!bch2_btree_bset_insert_key(trans, path, b,
- &path_l(path)->iter, insert)))
- return;
-
- i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq)));
-
- bch2_btree_add_journal_pin(c, b, journal_seq);
-
- if (unlikely(!btree_node_dirty(b))) {
- EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
- set_btree_node_dirty_acct(c, b);
- }
-
- live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
- u64s_added = (int) bset_u64s(t) - old_u64s;
-
- if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
- b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
- if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
- b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
-
- if (u64s_added > live_u64s_added &&
- bch2_maybe_compact_whiteouts(c, b))
- bch2_trans_node_reinit_iter(trans, b);
-}
-
-/* Cached btree updates: */
-
-/* Normal update interface: */
-
-static inline void btree_insert_entry_checks(struct btree_trans *trans,
- struct btree_insert_entry *i)
-{
- struct btree_path *path = trans->paths + i->path;
-
- BUG_ON(!bpos_eq(i->k->k.p, path->pos));
- BUG_ON(i->cached != path->cached);
- BUG_ON(i->level != path->level);
- BUG_ON(i->btree_id != path->btree_id);
- BUG_ON(i->bkey_type != __btree_node_type(path->level, path->btree_id));
- EBUG_ON(!i->level &&
- btree_type_has_snapshots(i->btree_id) &&
- !(i->flags & BTREE_UPDATE_internal_snapshot_node) &&
- test_bit(JOURNAL_replay_done, &trans->c->journal.flags) &&
- i->k->k.p.snapshot &&
- bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot) > 0);
-}
-
-static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
- unsigned flags)
-{
- return bch2_journal_res_get(&trans->c->journal, &trans->journal_res,
- trans->journal_u64s, flags, trans);
-}
-
-#define JSET_ENTRY_LOG_U64s 4
-
-static noinline void journal_transaction_name(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- struct journal *j = &c->journal;
- struct jset_entry *entry =
- bch2_journal_add_entry(j, &trans->journal_res,
- BCH_JSET_ENTRY_log, 0, 0,
- JSET_ENTRY_LOG_U64s);
- struct jset_entry_log *l =
- container_of(entry, struct jset_entry_log, entry);
-
- memcpy_and_pad(l->d, JSET_ENTRY_LOG_U64s * sizeof(u64),
- trans->fn, strlen(trans->fn), 0);
-}
-
-static inline int btree_key_can_insert(struct btree_trans *trans,
- struct btree *b, unsigned u64s)
-{
- if (!bch2_btree_node_insert_fits(b, u64s))
- return bch_err_throw(trans->c, btree_insert_btree_node_full);
-
- return 0;
-}
-
-noinline static int
-btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
- struct btree_path *path, unsigned new_u64s)
-{
- struct bkey_cached *ck = (void *) path->l[0].b;
- struct bkey_i *new_k;
- int ret;
-
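-	/*
-	 * A GFP_KERNEL allocation may block, so drop our locks first; relock
-	 * and retake the write locks before installing the new buffer:
-	 */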
- bch2_trans_unlock_updates_write(trans);
- bch2_trans_unlock(trans);
-
- new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
- if (!new_k) {
- struct bch_fs *c = trans->c;
- bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
- bch2_btree_id_str(path->btree_id), new_u64s);
- return bch_err_throw(c, ENOMEM_btree_key_cache_insert);
- }
-
- ret = bch2_trans_relock(trans) ?:
- bch2_trans_lock_write(trans);
- if (unlikely(ret)) {
- kfree(new_k);
- return ret;
- }
-
- memcpy(new_k, ck->k, ck->u64s * sizeof(u64));
-
- trans_for_each_update(trans, i)
- if (i->old_v == &ck->k->v)
- i->old_v = &new_k->v;
-
- kfree(ck->k);
- ck->u64s = new_u64s;
- ck->k = new_k;
- return 0;
-}
-
-static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags,
- struct btree_path *path, unsigned u64s)
-{
- struct bch_fs *c = trans->c;
- struct bkey_cached *ck = (void *) path->l[0].b;
- unsigned new_u64s;
- struct bkey_i *new_k;
- unsigned watermark = flags & BCH_WATERMARK_MASK;
-
- EBUG_ON(path->level);
-
- if (watermark < BCH_WATERMARK_reclaim &&
- !test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
- bch2_btree_key_cache_must_wait(c))
- return bch_err_throw(c, btree_insert_need_journal_reclaim);
-
- /*
- * bch2_varint_decode can read past the end of the buffer by at most 7
- * bytes (it won't be used):
- */
- u64s += 1;
-
- if (u64s <= ck->u64s)
- return 0;
-
- new_u64s = roundup_pow_of_two(u64s);
- new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
- if (unlikely(!new_k))
- return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);
-
- trans_for_each_update(trans, i)
- if (i->old_v == &ck->k->v)
- i->old_v = &new_k->v;
-
- ck->u64s = new_u64s;
- ck->k = new_k;
- return 0;
-}
-
-/* Triggers: */
-
-static int run_one_mem_trigger(struct btree_trans *trans,
- struct btree_insert_entry *i,
- unsigned flags)
-{
- verify_update_old_key(trans, i);
-
- if (unlikely(flags & BTREE_TRIGGER_norun))
- return 0;
-
- struct bkey_s_c old = { &i->old_k, i->old_v };
- struct bkey_i *new = i->k;
- const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
- const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
-
- if (old_ops->trigger == new_ops->trigger)
- return bch2_key_trigger(trans, i->btree_id, i->level,
- old, bkey_i_to_s(new),
- BTREE_TRIGGER_insert|BTREE_TRIGGER_overwrite|flags);
- else
- return bch2_key_trigger_new(trans, i->btree_id, i->level,
- bkey_i_to_s(new), flags) ?:
- bch2_key_trigger_old(trans, i->btree_id, i->level,
- old, flags);
-}
-
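-/*
- * Returns 1 if a trigger was run, 0 if there was nothing left to do, negative
- * on error; the commit path loops until every trigger has run.
- */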
-static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i)
-{
- verify_update_old_key(trans, i);
-
- if ((i->flags & BTREE_TRIGGER_norun) ||
- !btree_node_type_has_trans_triggers(i->bkey_type))
- return 0;
-
- /*
- * Transactional triggers create new btree_insert_entries, so we can't
- * pass them a pointer to a btree_insert_entry, that memory is going to
- * move:
- */
- struct bkey old_k = i->old_k;
- struct bkey_s_c old = { &old_k, i->old_v };
- const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
- const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
- unsigned flags = i->flags|BTREE_TRIGGER_transactional;
-
- if (!i->insert_trigger_run &&
- !i->overwrite_trigger_run &&
- old_ops->trigger == new_ops->trigger) {
- i->overwrite_trigger_run = true;
- i->insert_trigger_run = true;
- return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k),
- BTREE_TRIGGER_insert|
- BTREE_TRIGGER_overwrite|flags) ?: 1;
- } else if (!i->overwrite_trigger_run) {
- i->overwrite_trigger_run = true;
- return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1;
- } else if (!i->insert_trigger_run) {
- i->insert_trigger_run = true;
- return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1;
- } else {
- return 0;
- }
-}
-
-static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
-{
- unsigned sort_id_start = 0;
-
- while (sort_id_start < trans->nr_updates) {
- unsigned i, sort_id = trans->updates[sort_id_start].sort_order;
- bool trans_trigger_run;
-
- /*
- * For a given btree, this algorithm runs insert triggers before
- * overwrite triggers: this is so that when extents are being
- * moved (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop
- * references before they are re-added.
- *
- * Running triggers will append more updates to the list of
- * updates as we're walking it:
- */
- do {
- trans_trigger_run = false;
-
- for (i = sort_id_start;
- i < trans->nr_updates && trans->updates[i].sort_order <= sort_id;
- i++) {
- if (trans->updates[i].sort_order < sort_id) {
- sort_id_start = i;
- continue;
- }
-
- int ret = run_one_trans_trigger(trans, trans->updates + i);
- if (ret < 0)
- return ret;
- if (ret)
- trans_trigger_run = true;
- }
- } while (trans_trigger_run);
-
- sort_id_start = i;
- }
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- trans_for_each_update(trans, i)
- BUG_ON(!(i->flags & BTREE_TRIGGER_norun) &&
- btree_node_type_has_trans_triggers(i->bkey_type) &&
- (!i->insert_trigger_run || !i->overwrite_trigger_run));
-#endif
- return 0;
-}
-
-static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
-{
- trans_for_each_update(trans, i)
- if (btree_node_type_has_triggers(i->bkey_type) &&
- gc_visited(trans->c, gc_pos_btree(i->btree_id, i->level, i->k->k.p))) {
- int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_gc);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
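-/*
- * The core of the commit path, called with the relevant leaf nodes write
- * locked: check that the updates fit, get a journal reservation, run triggers
- * and commit hooks, then apply the updates to the btree and/or key cache.
- */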
-static inline int
-bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
- struct btree_insert_entry **stopped_at,
- unsigned long trace_ip)
-{
- struct bch_fs *c = trans->c;
- struct btree_trans_commit_hook *h;
- unsigned u64s = 0;
- int ret = 0;
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
-#if 0
- /* todo: bring back dynamic fault injection */
- if (race_fault()) {
- trace_and_count(c, trans_restart_fault_inject, trans, trace_ip);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject);
- }
-#endif
- /*
- * Check if the insert will fit in the leaf node with the write lock
- * held, otherwise another thread could write the node changing the
- * amount of space available:
- */
-
- prefetch(&trans->c->journal.flags);
-
- trans_for_each_update(trans, i) {
- /* Multiple inserts might go to same leaf: */
- if (!same_leaf_as_prev(trans, i))
- u64s = 0;
-
- u64s += i->k->k.u64s;
- ret = !i->cached
- ? btree_key_can_insert(trans, insert_l(trans, i)->b, u64s)
- : btree_key_can_insert_cached(trans, flags, trans->paths + i->path, u64s);
- if (ret) {
- *stopped_at = i;
- return ret;
- }
-
- i->k->k.needs_whiteout = false;
- }
-
- /*
- * Don't get journal reservation until after we know insert will
- * succeed:
- */
- if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
- ret = bch2_trans_journal_res_get(trans,
- (flags & BCH_WATERMARK_MASK)|
- JOURNAL_RES_GET_NONBLOCK);
- if (ret)
- return ret;
-
- if (unlikely(trans->journal_transaction_names))
- journal_transaction_name(trans);
- }
-
- /*
- * Not allowed to fail after we've gotten our journal reservation - we
- * have to use it:
- */
-
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
- !(flags & BCH_TRANS_COMMIT_no_journal_res)) {
- if (static_branch_unlikely(&bch2_journal_seq_verify))
- trans_for_each_update(trans, i)
- i->k->k.bversion.lo = trans->journal_res.seq;
- else if (static_branch_unlikely(&bch2_inject_invalid_keys))
- trans_for_each_update(trans, i)
- i->k->k.bversion = MAX_VERSION;
- }
-
- h = trans->hooks;
- while (h) {
- ret = h->fn(trans, h);
- if (ret)
- return ret;
- h = h->next;
- }
-
- struct bkey_i *accounting;
-
- percpu_down_read(&c->mark_lock);
- for (accounting = btree_trans_subbuf_base(trans, &trans->accounting);
- accounting != btree_trans_subbuf_top(trans, &trans->accounting);
- accounting = bkey_next(accounting)) {
- ret = bch2_accounting_trans_commit_hook(trans,
- bkey_i_to_accounting(accounting), flags);
- if (ret)
- goto revert_fs_usage;
- }
- percpu_up_read(&c->mark_lock);
-
- /* XXX: we only want to run this if deltas are nonzero */
- bch2_trans_account_disk_usage_change(trans);
-
- trans_for_each_update(trans, i)
- if (btree_node_type_has_atomic_triggers(i->bkey_type)) {
- ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_atomic|i->flags);
- if (ret)
- goto fatal_err;
- }
-
- if (unlikely(c->gc_pos.phase)) {
- ret = bch2_trans_commit_run_gc_triggers(trans);
- if (ret)
- goto fatal_err;
- }
-
- struct bkey_validate_context validate_context = { .from = BKEY_VALIDATE_commit };
-
- if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
- validate_context.flags = BCH_VALIDATE_write|BCH_VALIDATE_commit;
-
- for (struct jset_entry *i = btree_trans_journal_entries_start(trans);
- i != btree_trans_journal_entries_top(trans);
- i = vstruct_next(i)) {
- ret = bch2_journal_entry_validate(c, NULL, i,
- bcachefs_metadata_version_current,
- CPU_BIG_ENDIAN, validate_context);
- if (unlikely(ret)) {
- bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n",
- trans->fn);
- goto fatal_err;
- }
- }
-
- trans_for_each_update(trans, i) {
- validate_context.level = i->level;
- validate_context.btree = i->btree_id;
-
- ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), validate_context);
-		if (unlikely(ret)) {
- bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n",
- trans->fn, (void *) i->ip_allocated);
- goto fatal_err;
- }
- btree_insert_entry_checks(trans, i);
- }
-
- if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
- struct journal *j = &c->journal;
- struct jset_entry *entry;
-
- trans_for_each_update(trans, i) {
- if (i->key_cache_already_flushed)
- continue;
-
- if (i->flags & BTREE_UPDATE_nojournal)
- continue;
-
- verify_update_old_key(trans, i);
-
- if (trans->journal_transaction_names) {
- entry = bch2_journal_add_entry(j, &trans->journal_res,
- BCH_JSET_ENTRY_overwrite,
- i->btree_id, i->level,
- i->old_k.u64s);
- bkey_reassemble((struct bkey_i *) entry->start,
- (struct bkey_s_c) { &i->old_k, i->old_v });
- }
-
- entry = bch2_journal_add_entry(j, &trans->journal_res,
- BCH_JSET_ENTRY_btree_keys,
- i->btree_id, i->level,
- i->k->k.u64s);
- bkey_copy((struct bkey_i *) entry->start, i->k);
- }
-
- memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
- btree_trans_journal_entries_start(trans),
- trans->journal_entries.u64s);
-
- EBUG_ON(trans->journal_res.u64s < trans->journal_entries.u64s);
-
- trans->journal_res.offset += trans->journal_entries.u64s;
- trans->journal_res.u64s -= trans->journal_entries.u64s;
-
- memcpy_u64s_small(bch2_journal_add_entry(j, &trans->journal_res,
- BCH_JSET_ENTRY_write_buffer_keys,
- BTREE_ID_accounting, 0,
- trans->accounting.u64s)->_data,
- btree_trans_subbuf_base(trans, &trans->accounting),
- trans->accounting.u64s);
-
- if (trans->journal_seq)
- *trans->journal_seq = trans->journal_res.seq;
- }
-
- trans_for_each_update(trans, i) {
- struct btree_path *path = trans->paths + i->path;
-
- if (!i->cached)
- bch2_btree_insert_key_leaf(trans, path, i->k, trans->journal_res.seq);
- else if (!i->key_cache_already_flushed)
- bch2_btree_insert_key_cached(trans, flags, i);
- else
- bch2_btree_key_cache_drop(trans, path);
- }
-
- return 0;
-fatal_err:
- bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret));
- percpu_down_read(&c->mark_lock);
-revert_fs_usage:
- for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting);
- i != accounting;
- i = bkey_next(i))
- bch2_accounting_trans_commit_revert(trans, bkey_i_to_accounting(i), flags);
- percpu_up_read(&c->mark_lock);
- return ret;
-}
-
-static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
-{
- /*
- * Accounting keys aren't deduped in the journal: we have to compare
- * each individual update against what's in the btree to see if it has
- * been applied yet, and accounting updates also don't overwrite,
- * they're deltas that accumulate.
- */
- trans_for_each_update(trans, i)
- if (i->k->k.type != KEY_TYPE_accounting)
- bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
-}
-
-static int bch2_trans_commit_journal_pin_flush(struct journal *j,
- struct journal_entry_pin *_pin, u64 seq)
-{
- return 0;
-}
-
-/*
- * Get journal reservation, take write locks, and attempt to do btree update(s):
- */
-static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags,
- struct btree_insert_entry **stopped_at,
- unsigned long trace_ip)
-{
- struct bch_fs *c = trans->c;
- int ret = 0, u64s_delta = 0;
-
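-	/*
-	 * If this commit shrinks a leaf node on net, try merging it with a
-	 * sibling before taking write locks:
-	 */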
- for (unsigned idx = 0; idx < trans->nr_updates; idx++) {
- struct btree_insert_entry *i = trans->updates + idx;
- if (i->cached)
- continue;
-
- u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
- u64s_delta -= i->old_btree_u64s;
-
- if (!same_leaf_as_next(trans, i)) {
- if (u64s_delta <= 0) {
- ret = bch2_foreground_maybe_merge(trans, i->path,
- i->level, flags);
- if (unlikely(ret))
- return ret;
- }
-
- u64s_delta = 0;
- }
- }
-
- ret = bch2_trans_lock_write(trans);
- if (unlikely(ret))
- return ret;
-
- ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip);
-
- if (!ret && unlikely(trans->journal_replay_not_finished))
- bch2_drop_overwrites_from_journal(trans);
-
- bch2_trans_unlock_updates_write(trans);
-
- if (!ret && trans->journal_pin)
- bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
- trans->journal_pin,
- bch2_trans_commit_journal_pin_flush);
-
- /*
- * Drop journal reservation after dropping write locks, since dropping
- * the journal reservation may kick off a journal write:
- */
- if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
- bch2_journal_res_put(&c->journal, &trans->journal_res);
-
- return ret;
-}
-
-static int journal_reclaim_wait_done(struct bch_fs *c)
-{
- int ret = bch2_journal_error(&c->journal) ?:
- bch2_btree_key_cache_wait_done(c);
-
- if (!ret)
- journal_reclaim_kick(&c->journal);
- return ret;
-}
-
-static noinline
-int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
- struct btree_insert_entry *i,
- int ret, unsigned long trace_ip)
-{
- struct bch_fs *c = trans->c;
- enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
-
- if (bch2_err_matches(ret, BCH_ERR_journal_res_blocked)) {
- /*
- * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
- * flag
- */
- if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
- watermark < BCH_WATERMARK_reclaim) {
- ret = bch_err_throw(c, journal_reclaim_would_deadlock);
- goto out;
- }
-
- ret = drop_locks_do(trans,
- bch2_trans_journal_res_get(trans,
- (flags & BCH_WATERMARK_MASK)|
- JOURNAL_RES_GET_CHECK));
- goto out;
- }
-
- switch (ret) {
- case -BCH_ERR_btree_insert_btree_node_full:
- ret = bch2_btree_split_leaf(trans, i->path, flags);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- trace_and_count(c, trans_restart_btree_node_split, trans,
- trace_ip, trans->paths + i->path);
- break;
- case -BCH_ERR_btree_insert_need_mark_replicas:
- ret = drop_locks_do(trans,
- bch2_accounting_update_sb(trans));
- break;
- case -BCH_ERR_btree_insert_need_journal_reclaim:
- bch2_trans_unlock(trans);
-
- trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip);
- track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], true);
-
- wait_event_freezable(c->journal.reclaim_wait,
- (ret = journal_reclaim_wait_done(c)));
-
- track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], false);
-
- if (ret < 0)
- break;
-
- ret = bch2_trans_relock(trans);
- break;
- default:
- BUG_ON(ret >= 0);
- break;
- }
-out:
- BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
-
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
- (flags & BCH_TRANS_COMMIT_no_enospc), c,
- "%s: incorrectly got %s\n", __func__, bch2_err_str(ret));
-
- return ret;
-}
-
-/*
- * This is for updates done in the early part of fsck - btree_gc - before we've
- * gone RW. We only add the new key to the list of keys for journal replay to
- * do.
- */
-static noinline int
-do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
-
- BUG_ON(current != c->recovery_task);
-
- trans_for_each_update(trans, i) {
- int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
- if (ret)
- return ret;
- }
-
- for (struct jset_entry *i = btree_trans_journal_entries_start(trans);
- i != btree_trans_journal_entries_top(trans);
- i = vstruct_next(i)) {
- if (i->type == BCH_JSET_ENTRY_btree_keys ||
- i->type == BCH_JSET_ENTRY_write_buffer_keys) {
- jset_entry_for_each_key(i, k) {
- int ret = bch2_journal_key_insert(c, i->btree_id, i->level, k);
- if (ret)
- return ret;
- }
- }
-
- if (i->type == BCH_JSET_ENTRY_btree_root) {
- guard(mutex)(&c->btree_root_lock);
-
- struct btree_root *r = bch2_btree_id_root(c, i->btree_id);
-
- bkey_copy(&r->key, i->start);
- r->level = i->level;
- r->alive = true;
- }
- }
-
- for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting);
- i != btree_trans_subbuf_top(trans, &trans->accounting);
- i = bkey_next(i)) {
- int ret = bch2_journal_key_insert(c, BTREE_ID_accounting, 0, i);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
-{
- struct btree_insert_entry *errored_at = NULL;
- struct bch_fs *c = trans->c;
- unsigned journal_u64s = 0;
- int ret = 0;
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
- ret = trans_maybe_inject_restart(trans, _RET_IP_);
- if (unlikely(ret))
- goto out_reset;
-
- if (!trans->nr_updates &&
- !trans->journal_entries.u64s &&
- !trans->accounting.u64s)
- goto out_reset;
-
- ret = bch2_trans_commit_run_triggers(trans);
- if (ret)
- goto out_reset;
-
- if (!(flags & BCH_TRANS_COMMIT_no_check_rw) &&
- unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_trans))) {
- if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags)))
- ret = do_bch2_trans_commit_to_journal_replay(trans);
- else
- ret = bch_err_throw(c, erofs_trans_commit);
- goto out_reset;
- }
-
- EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
-
- journal_u64s = jset_u64s(trans->accounting.u64s);
- trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
- if (trans->journal_transaction_names)
- journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
-
- trans_for_each_update(trans, i) {
- struct btree_path *path = trans->paths + i->path;
-
- EBUG_ON(!path->should_be_locked);
-
- ret = bch2_btree_path_upgrade(trans, path, i->level + 1);
- if (unlikely(ret))
- goto out;
-
- EBUG_ON(!btree_node_intent_locked(path, i->level));
-
- if (i->key_cache_already_flushed)
- continue;
-
- if (i->flags & BTREE_UPDATE_nojournal)
- continue;
-
- /* we're going to journal the key being updated: */
- journal_u64s += jset_u64s(i->k->k.u64s);
-
- /* and we're also going to log the overwrite: */
- if (trans->journal_transaction_names)
- journal_u64s += jset_u64s(i->old_k.u64s);
- }
-
- if (trans->extra_disk_res) {
- ret = bch2_disk_reservation_add(c, trans->disk_res,
- trans->extra_disk_res,
- (flags & BCH_TRANS_COMMIT_no_enospc)
- ? BCH_DISK_RESERVATION_NOFAIL : 0);
- if (ret)
- goto err;
- }
-retry:
- errored_at = NULL;
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
- if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
- memset(&trans->journal_res, 0, sizeof(trans->journal_res));
- memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
-
- trans->journal_u64s = journal_u64s + trans->journal_entries.u64s;
-
- ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_);
-
- /* make sure we didn't drop or screw up locks: */
- bch2_trans_verify_locks(trans);
-
- if (ret)
- goto err;
-
- trace_and_count(c, transaction_commit, trans, _RET_IP_);
-out:
- if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw)))
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_trans);
-out_reset:
- if (!ret)
- bch2_trans_downgrade(trans);
- bch2_trans_reset_updates(trans);
-
- return ret;
-err:
- ret = bch2_trans_commit_error(trans, flags, errored_at, ret, _RET_IP_);
- if (ret)
- goto out;
-
- /*
- * We might have done another transaction commit in the error path -
- * i.e. btree write buffer flush - which will have made use of
- * trans->journal_res, but with BCH_TRANS_COMMIT_no_journal_res that is
- * how the journal sequence number to pin is passed in - so we must
- * restart:
- */
- if (flags & BCH_TRANS_COMMIT_no_journal_res) {
- ret = bch_err_throw(c, transaction_restart_nested);
- goto out;
- }
-
- goto retry;
-}
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
deleted file mode 100644
index 112170fd9c8f..000000000000
--- a/fs/bcachefs/btree_types.h
+++ /dev/null
@@ -1,937 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_TYPES_H
-#define _BCACHEFS_BTREE_TYPES_H
-
-#include <linux/list.h>
-#include <linux/rhashtable.h>
-
-#include "bbpos_types.h"
-#include "btree_key_cache_types.h"
-#include "buckets_types.h"
-#include "darray.h"
-#include "errcode.h"
-#include "journal_types.h"
-#include "replicas_types.h"
-#include "six.h"
-
-struct open_bucket;
-struct btree_update;
-struct btree_trans;
-
-#define MAX_BSETS 3U
-
-struct btree_nr_keys {
-
- /*
- * Amount of live metadata (i.e. size of node after a compaction) in
- * units of u64s
- */
- u16 live_u64s;
- u16 bset_u64s[MAX_BSETS];
-
- /* live keys only: */
- u16 packed_keys;
- u16 unpacked_keys;
-};
-
-struct bset_tree {
- /*
- * We construct a binary tree in an array as if the array
- * started at 1, so that things line up on the same cachelines
- * better: see comments in bset.c at cacheline_to_bkey() for
- * details
- */
-
- /* size of the binary tree and prev array */
- u16 size;
-
- /* function of size - precalculated for to_inorder() */
- u16 extra;
-
- u16 data_offset;
- u16 aux_data_offset;
- u16 end_offset;
-};
-
-struct btree_write {
- struct journal_entry_pin journal;
-};
-
-struct btree_alloc {
- struct open_buckets ob;
- __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX);
-};
-
-struct btree_bkey_cached_common {
- struct six_lock lock;
- u8 level;
- u8 btree_id;
- bool cached;
-};
-
-struct btree {
- struct btree_bkey_cached_common c;
-
- struct rhash_head hash;
- u64 hash_val;
-
- unsigned long flags;
- u16 written;
- u8 nsets;
- u8 nr_key_bits;
- u16 version_ondisk;
-
- struct bkey_format format;
-
- struct btree_node *data;
- void *aux_data;
-
- /*
- * Sets of sorted keys - the real btree node - plus a binary search tree
- *
- * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
- * to the memory we have allocated for this btree node. Additionally,
- * set[0]->data points to the entire btree node as it exists on disk.
- */
- struct bset_tree set[MAX_BSETS];
-
- struct btree_nr_keys nr;
- u16 sib_u64s[2];
- u16 whiteout_u64s;
- u8 byte_order;
- u8 unpack_fn_len;
-
- struct btree_write writes[2];
-
- /* Key/pointer for this btree node */
- __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
-
- /*
- * XXX: add a delete sequence number, so when bch2_btree_node_relock()
- * fails because the lock sequence number has changed - i.e. the
- * contents were modified - we can still relock the node if it's still
- * the one we want, without redoing the traversal
- */
-
- /*
- * For asynchronous splits/interior node updates:
- * When we do a split, we allocate new child nodes and update the parent
- * node to point to them: we update the parent in memory immediately,
- * but then we must wait until the children have been written out before
- * the update to the parent can be written - this is a list of the
- * btree_updates that are blocking this node from being
- * written:
- */
- struct list_head write_blocked;
-
- /*
- * Also for asynchronous splits/interior node updates:
- * If a btree node isn't reachable yet, we don't want to kick off
- * another write - because that write also won't yet be reachable and
- * marking it as completed before it's reachable would be incorrect:
- */
- unsigned long will_make_reachable;
-
- struct open_buckets ob;
-
- /* lru list */
- struct list_head list;
-};
-
-#define BCH_BTREE_CACHE_NOT_FREED_REASONS() \
- x(cache_reserve) \
- x(lock_intent) \
- x(lock_write) \
- x(dirty) \
- x(read_in_flight) \
- x(write_in_flight) \
- x(noevict) \
- x(write_blocked) \
- x(will_make_reachable) \
- x(access_bit)
-
-enum bch_btree_cache_not_freed_reasons {
-#define x(n) BCH_BTREE_CACHE_NOT_FREED_##n,
- BCH_BTREE_CACHE_NOT_FREED_REASONS()
-#undef x
- BCH_BTREE_CACHE_NOT_FREED_REASONS_NR,
-};
-
-struct btree_cache_list {
- unsigned idx;
- struct shrinker *shrink;
- struct list_head list;
- size_t nr;
-};
-
-struct btree_cache {
- struct rhashtable table;
- bool table_init_done;
- /*
- * We never free a struct btree, except on shutdown - we just put it on
- * the btree_cache_freed list and reuse it later. This simplifies the
- * code, and it doesn't cost us much memory as the memory usage is
- * dominated by buffers that hold the actual btree node data and those
- * can be freed - and the number of struct btrees allocated is
- * effectively bounded.
- *
- * btree_cache_freeable effectively is a small cache - we use it because
- * high order page allocations can be rather expensive, and it's quite
- * common to delete and allocate btree nodes in quick succession. It
- * should never grow past ~2-3 nodes in practice.
- */
- struct mutex lock;
- struct list_head freeable;
- struct list_head freed_pcpu;
- struct list_head freed_nonpcpu;
- struct btree_cache_list live[2];
-
- size_t nr_freeable;
- size_t nr_reserve;
- size_t nr_by_btree[BTREE_ID_NR];
- atomic_long_t nr_dirty;
-
- /* shrinker stats */
- size_t nr_freed;
- u64 not_freed[BCH_BTREE_CACHE_NOT_FREED_REASONS_NR];
-
- /*
- * If we need to allocate memory for a new btree node and that
- * allocation fails, we can cannibalize another node in the btree cache
- * to satisfy the allocation - lock to guarantee only one thread does
- * this at a time:
- */
- struct task_struct *alloc_lock;
- struct closure_waitlist alloc_wait;
-
- struct bbpos pinned_nodes_start;
- struct bbpos pinned_nodes_end;
- /* btree id mask: 0 for leaves, 1 for interior */
- u64 pinned_nodes_mask[2];
-};
-
-struct btree_node_iter {
- struct btree_node_iter_set {
- u16 k, end;
- } data[MAX_BSETS];
-};
-
-#define BTREE_ITER_FLAGS() \
- x(slots) \
- x(intent) \
- x(prefetch) \
- x(is_extents) \
- x(not_extents) \
- x(cached) \
- x(with_key_cache) \
- x(with_updates) \
- x(with_journal) \
- x(snapshot_field) \
- x(all_snapshots) \
- x(filter_snapshots) \
- x(nopreserve) \
- x(cached_nofill) \
- x(key_cache_fill) \
-
-#define STR_HASH_FLAGS() \
- x(must_create) \
- x(must_replace)
-
-#define BTREE_UPDATE_FLAGS() \
- x(internal_snapshot_node) \
- x(nojournal) \
- x(key_cache_reclaim)
-
-
-/*
- * BTREE_TRIGGER_norun - don't run triggers at all
- *
- * BTREE_TRIGGER_transactional - we're running transactional triggers as part of
- * a transaction commit: triggers may generate new updates
- *
- * BTREE_TRIGGER_atomic - we're running atomic triggers during a transaction
- * commit: we have our journal reservation, we're holding btree node write
- * locks, and we know the transaction is going to commit (returning an error
- * here is a fatal error, causing us to go emergency read-only)
- *
- * BTREE_TRIGGER_gc - we're in gc/fsck: running triggers to recalculate e.g. disk usage
- *
- * BTREE_TRIGGER_insert - @new is entering the btree
- * BTREE_TRIGGER_overwrite - @old is leaving the btree
- */
-#define BTREE_TRIGGER_FLAGS() \
- x(norun) \
- x(transactional) \
- x(atomic) \
- x(check_repair) \
- x(gc) \
- x(insert) \
- x(overwrite) \
- x(is_root)
-
-enum {
-#define x(n) BTREE_ITER_FLAG_BIT_##n,
- BTREE_ITER_FLAGS()
- STR_HASH_FLAGS()
- BTREE_UPDATE_FLAGS()
- BTREE_TRIGGER_FLAGS()
-#undef x
-};
-
-/* iter flags must fit in a u16: */
-//BUILD_BUG_ON(BTREE_ITER_FLAG_BIT_key_cache_fill > 15);
-
-enum btree_iter_update_trigger_flags {
-#define x(n) BTREE_ITER_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
- BTREE_ITER_FLAGS()
-#undef x
-#define x(n) STR_HASH_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
- STR_HASH_FLAGS()
-#undef x
-#define x(n) BTREE_UPDATE_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
- BTREE_UPDATE_FLAGS()
-#undef x
-#define x(n) BTREE_TRIGGER_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
- BTREE_TRIGGER_FLAGS()
-#undef x
-};
-
-enum btree_path_uptodate {
- BTREE_ITER_UPTODATE = 0,
- BTREE_ITER_NEED_RELOCK = 1,
- BTREE_ITER_NEED_TRAVERSE = 2,
-};
-
-#if defined(CONFIG_BCACHEFS_LOCK_TIME_STATS) || defined(CONFIG_BCACHEFS_DEBUG)
-#define TRACK_PATH_ALLOCATED
-#endif
-
-typedef u16 btree_path_idx_t;
-
-struct btree_path {
- btree_path_idx_t sorted_idx;
- u8 ref;
- u8 intent_ref;
-
- /* btree_iter_copy starts here: */
- struct bpos pos;
-
- enum btree_id btree_id:5;
- bool cached:1;
- bool preserve:1;
- enum btree_path_uptodate uptodate:2;
- /*
- * When true, failing to relock this path will cause the transaction to
- * restart:
- */
- bool should_be_locked:1;
- unsigned level:3,
- locks_want:3;
- u8 nodes_locked;
-
- struct btree_path_level {
- struct btree *b;
- struct btree_node_iter iter;
- u32 lock_seq;
-#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
- u64 lock_taken_time;
-#endif
- } l[BTREE_MAX_DEPTH];
-#ifdef TRACK_PATH_ALLOCATED
- unsigned long ip_allocated;
-#endif
-};
-
-static inline struct btree_path_level *path_l(struct btree_path *path)
-{
- return path->l + path->level;
-}
-
-static inline unsigned long btree_path_ip_allocated(struct btree_path *path)
-{
-#ifdef TRACK_PATH_ALLOCATED
- return path->ip_allocated;
-#else
- return _THIS_IP_;
-#endif
-}
-
-/*
- * @pos - iterator's current position
- * @level - current btree depth
- * @locks_want - btree level below which we start taking intent locks
- * @nodes_locked - bitmask indicating which nodes in @nodes are locked
- * @nodes_intent_locked - bitmask indicating which locks are intent locks
- */
-struct btree_iter {
- btree_path_idx_t path;
- btree_path_idx_t update_path;
- btree_path_idx_t key_cache_path;
-
- enum btree_id btree_id:8;
- u8 min_depth;
-
- /* btree_iter_copy starts here: */
- u16 flags;
-
- /* When we're filtering by snapshot, the snapshot ID we're looking for: */
- unsigned snapshot;
-
- struct bpos pos;
- /*
- * Current unpacked key - so that bch2_btree_iter_next()/
- * bch2_btree_iter_next_slot() can correctly advance pos.
- */
- struct bkey k;
-
- /* BTREE_ITER_with_journal: */
- size_t journal_idx;
-#ifdef TRACK_PATH_ALLOCATED
- unsigned long ip_allocated;
-#endif
-};
-
-#define BKEY_CACHED_ACCESSED 0
-#define BKEY_CACHED_DIRTY 1
-
-struct bkey_cached {
- struct btree_bkey_cached_common c;
-
- unsigned long flags;
- u16 u64s;
- struct bkey_cached_key key;
-
- struct rhash_head hash;
-
- struct journal_entry_pin journal;
- u64 seq;
-
- struct bkey_i *k;
- struct rcu_head rcu;
-};
-
-static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
-{
- return !b->cached
- ? container_of(b, struct btree, c)->key.k.p
- : container_of(b, struct bkey_cached, c)->key.pos;
-}
-
-struct btree_insert_entry {
- unsigned flags;
- u8 sort_order;
- u8 bkey_type;
- enum btree_id btree_id:8;
- u8 level:4;
- bool cached:1;
- bool insert_trigger_run:1;
- bool overwrite_trigger_run:1;
- bool key_cache_already_flushed:1;
- /*
- * @old_k may be a key from the journal; @old_btree_u64s always refers
- * to the size of the key being overwritten in the btree:
- */
- u8 old_btree_u64s;
- btree_path_idx_t path;
- struct bkey_i *k;
- /* key being overwritten: */
- struct bkey old_k;
- const struct bch_val *old_v;
- unsigned long ip_allocated;
-};
-
-/* Number of btree paths we preallocate, usually enough */
-#define BTREE_ITER_INITIAL 64
-/*
- * Limit for btree_trans_too_many_iters(); this is enough that almost all code
- * paths should run inside this limit, and if they don't it usually indicates a
- * bug (leaking/duplicated btree paths).
- *
- * exception: some fsck paths
- *
- * bugs with excessive path usage seem to have been eliminated now, so
- * we might consider eliminating this (and btree_trans_too_many_iters()) at some
- * point.
- */
-#define BTREE_ITER_NORMAL_LIMIT 256
-/* never exceed limit */
-#define BTREE_ITER_MAX (1U << 10)
-
-struct btree_trans_commit_hook;
-typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
-
-struct btree_trans_commit_hook {
- btree_trans_commit_hook_fn *fn;
- struct btree_trans_commit_hook *next;
-};
-
-#define BTREE_TRANS_MEM_MAX (1U << 16)
-
-#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS 10000
-
-struct btree_trans_paths {
- unsigned long nr_paths;
- struct btree_path paths[];
-};
-
-struct trans_kmalloc_trace {
- unsigned long ip;
- size_t bytes;
-};
-typedef DARRAY(struct trans_kmalloc_trace) darray_trans_kmalloc_trace;
-
-struct btree_trans_subbuf {
- u16 base;
- u16 u64s;
- u16 size;
-};
-
-struct btree_trans {
- struct bch_fs *c;
-
- unsigned long *paths_allocated;
- struct btree_path *paths;
- btree_path_idx_t *sorted;
- struct btree_insert_entry *updates;
-
- void *mem;
- unsigned mem_top;
- unsigned mem_bytes;
- unsigned realloc_bytes_required;
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
- darray_trans_kmalloc_trace trans_kmalloc_trace;
-#endif
-
- btree_path_idx_t nr_sorted;
- btree_path_idx_t nr_paths;
- btree_path_idx_t nr_paths_max;
- btree_path_idx_t nr_updates;
- u8 fn_idx;
- u8 lock_must_abort;
- bool lock_may_not_fail:1;
- bool srcu_held:1;
- bool locked:1;
- bool pf_memalloc_nofs:1;
- bool write_locked:1;
- bool used_mempool:1;
- bool in_traverse_all:1;
- bool paths_sorted:1;
- bool memory_allocation_failure:1;
- bool journal_transaction_names:1;
- bool journal_replay_not_finished:1;
- bool notrace_relock_fail:1;
- enum bch_errcode restarted:16;
- u32 restart_count;
-#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
- u32 restart_count_this_trans;
-#endif
-
- u64 last_begin_time;
- unsigned long last_begin_ip;
- unsigned long last_restarted_ip;
-#ifdef CONFIG_BCACHEFS_DEBUG
- bch_stacktrace last_restarted_trace;
-#endif
- unsigned long last_unlock_ip;
- unsigned long srcu_lock_time;
-
- const char *fn;
- struct btree_bkey_cached_common *locking;
- struct six_lock_waiter locking_wait;
- int srcu_idx;
-
- /* update path: */
- struct btree_trans_subbuf journal_entries;
- struct btree_trans_subbuf accounting;
-
- struct btree_trans_commit_hook *hooks;
- struct journal_entry_pin *journal_pin;
-
- struct journal_res journal_res;
- u64 *journal_seq;
- struct disk_reservation *disk_res;
-
- struct bch_fs_usage_base fs_usage_delta;
-
- unsigned journal_u64s;
- unsigned extra_disk_res; /* XXX kill */
-
- __BKEY_PADDED(btree_path_down, BKEY_BTREE_PTR_VAL_U64s_MAX);
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map dep_map;
-#endif
- /* Entries before this are zeroed out on every bch2_trans_get() call */
-
- struct list_head list;
- struct closure ref;
-
- unsigned long _paths_allocated[BITS_TO_LONGS(BTREE_ITER_INITIAL)];
- struct btree_trans_paths trans_paths;
- struct btree_path _paths[BTREE_ITER_INITIAL];
- btree_path_idx_t _sorted[BTREE_ITER_INITIAL + 4];
- struct btree_insert_entry _updates[BTREE_ITER_INITIAL];
-};
-
-static inline struct btree_path *btree_iter_path(struct btree_trans *trans, struct btree_iter *iter)
-{
- return trans->paths + iter->path;
-}
-
-static inline struct btree_path *btree_iter_key_cache_path(struct btree_trans *trans, struct btree_iter *iter)
-{
- return iter->key_cache_path
- ? trans->paths + iter->key_cache_path
- : NULL;
-}
-
-#define BCH_BTREE_WRITE_TYPES() \
- x(initial, 0) \
- x(init_next_bset, 1) \
- x(cache_reclaim, 2) \
- x(journal_reclaim, 3) \
- x(interior, 4)
-
-enum btree_write_type {
-#define x(t, n) BTREE_WRITE_##t,
- BCH_BTREE_WRITE_TYPES()
-#undef x
- BTREE_WRITE_TYPE_NR,
-};
-
-#define BTREE_WRITE_TYPE_MASK (roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1)
-#define BTREE_WRITE_TYPE_BITS ilog2(roundup_pow_of_two(BTREE_WRITE_TYPE_NR))
-
-#define BTREE_FLAGS() \
- x(read_in_flight) \
- x(read_error) \
- x(dirty) \
- x(need_write) \
- x(write_blocked) \
- x(will_make_reachable) \
- x(noevict) \
- x(write_idx) \
- x(accessed) \
- x(write_in_flight) \
- x(write_in_flight_inner) \
- x(just_written) \
- x(dying) \
- x(fake) \
- x(need_rewrite) \
- x(need_rewrite_error) \
- x(need_rewrite_degraded) \
- x(need_rewrite_ptr_written_zero) \
- x(never_write) \
- x(pinned)
-
-enum btree_flags {
- /* First bits for btree node write type */
- BTREE_NODE_FLAGS_START = BTREE_WRITE_TYPE_BITS - 1,
-#define x(flag) BTREE_NODE_##flag,
- BTREE_FLAGS()
-#undef x
-};
-
-#define x(flag) \
-static inline bool btree_node_ ## flag(struct btree *b) \
-{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
- \
-static inline void set_btree_node_ ## flag(struct btree *b) \
-{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \
- \
-static inline void clear_btree_node_ ## flag(struct btree *b) \
-{ clear_bit(BTREE_NODE_ ## flag, &b->flags); }
-
-BTREE_FLAGS()
-#undef x
-
-#define BTREE_NODE_REWRITE_REASON() \
- x(none) \
- x(unknown) \
- x(error) \
- x(degraded) \
- x(ptr_written_zero)
-
-enum btree_node_rewrite_reason {
-#define x(n) BTREE_NODE_REWRITE_##n,
- BTREE_NODE_REWRITE_REASON()
-#undef x
-};
-
-static inline enum btree_node_rewrite_reason btree_node_rewrite_reason(struct btree *b)
-{
- if (btree_node_need_rewrite_ptr_written_zero(b))
- return BTREE_NODE_REWRITE_ptr_written_zero;
- if (btree_node_need_rewrite_degraded(b))
- return BTREE_NODE_REWRITE_degraded;
- if (btree_node_need_rewrite_error(b))
- return BTREE_NODE_REWRITE_error;
- if (btree_node_need_rewrite(b))
- return BTREE_NODE_REWRITE_unknown;
- return BTREE_NODE_REWRITE_none;
-}
-
-static inline struct btree_write *btree_current_write(struct btree *b)
-{
- return b->writes + btree_node_write_idx(b);
-}
-
-static inline struct btree_write *btree_prev_write(struct btree *b)
-{
- return b->writes + (btree_node_write_idx(b) ^ 1);
-}
-
-static inline struct bset_tree *bset_tree_last(struct btree *b)
-{
- EBUG_ON(!b->nsets);
- return b->set + b->nsets - 1;
-}
-
-static inline void *
-__btree_node_offset_to_ptr(const struct btree *b, u16 offset)
-{
- return (void *) ((u64 *) b->data + offset);
-}
-
-static inline u16
-__btree_node_ptr_to_offset(const struct btree *b, const void *p)
-{
- u16 ret = (u64 *) p - (u64 *) b->data;
-
- EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p);
- return ret;
-}
-
-static inline struct bset *bset(const struct btree *b,
- const struct bset_tree *t)
-{
- return __btree_node_offset_to_ptr(b, t->data_offset);
-}
-
-static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
-{
- t->end_offset =
- __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t)));
-}
-
-static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
- const struct bset *i)
-{
- t->data_offset = __btree_node_ptr_to_offset(b, i);
- set_btree_bset_end(b, t);
-}
-
-static inline struct bset *btree_bset_first(struct btree *b)
-{
- return bset(b, b->set);
-}
-
-static inline struct bset *btree_bset_last(struct btree *b)
-{
- return bset(b, bset_tree_last(b));
-}
-
-static inline u16
-__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k)
-{
- return __btree_node_ptr_to_offset(b, k);
-}
-
-static inline struct bkey_packed *
-__btree_node_offset_to_key(const struct btree *b, u16 k)
-{
- return __btree_node_offset_to_ptr(b, k);
-}
-
-static inline unsigned btree_bkey_first_offset(const struct bset_tree *t)
-{
- return t->data_offset + offsetof(struct bset, _data) / sizeof(u64);
-}
-
-#define btree_bkey_first(_b, _t) \
-({ \
- EBUG_ON(bset(_b, _t)->start != \
- __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\
- \
- bset(_b, _t)->start; \
-})
-
-#define btree_bkey_last(_b, _t) \
-({ \
- EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \
- vstruct_last(bset(_b, _t))); \
- \
- __btree_node_offset_to_key(_b, (_t)->end_offset); \
-})
-
-static inline unsigned bset_u64s(struct bset_tree *t)
-{
- return t->end_offset - t->data_offset -
- sizeof(struct bset) / sizeof(u64);
-}
-
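-/* u64s in this bset that are not counted as live keys in b->nr: */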
-static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t)
-{
- return bset_u64s(t) - b->nr.bset_u64s[t - b->set];
-}
-
-static inline unsigned bset_byte_offset(struct btree *b, void *i)
-{
- return i - (void *) b->data;
-}
-
-enum btree_node_type {
- BKEY_TYPE_btree,
-#define x(kwd, val, ...) BKEY_TYPE_##kwd = val + 1,
- BCH_BTREE_IDS()
-#undef x
- BKEY_TYPE_NR
-};
-
-/* Type of a key in btree @id at level @level: */
-static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id)
-{
- return level ? BKEY_TYPE_btree : (unsigned) id + 1;
-}
-
-/* Type of keys @b contains: */
-static inline enum btree_node_type btree_node_type(struct btree *b)
-{
- return __btree_node_type(b->c.level, b->c.btree_id);
-}
-
-const char *bch2_btree_node_type_str(enum btree_node_type);
-
-#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \
- (BIT_ULL(BKEY_TYPE_extents)| \
- BIT_ULL(BKEY_TYPE_alloc)| \
- BIT_ULL(BKEY_TYPE_inodes)| \
- BIT_ULL(BKEY_TYPE_stripes)| \
- BIT_ULL(BKEY_TYPE_reflink)| \
- BIT_ULL(BKEY_TYPE_subvolumes)| \
- BIT_ULL(BKEY_TYPE_btree))
-
-#define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \
- (BIT_ULL(BKEY_TYPE_alloc)| \
- BIT_ULL(BKEY_TYPE_inodes)| \
- BIT_ULL(BKEY_TYPE_stripes)| \
- BIT_ULL(BKEY_TYPE_snapshots))
-
-#define BTREE_NODE_TYPE_HAS_TRIGGERS \
- (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
- BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS)
-
-static inline bool btree_node_type_has_trans_triggers(enum btree_node_type type)
-{
- return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS;
-}
-
-static inline bool btree_node_type_has_atomic_triggers(enum btree_node_type type)
-{
- return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS;
-}
-
-static inline bool btree_node_type_has_triggers(enum btree_node_type type)
-{
- return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRIGGERS;
-}
-
-static inline bool btree_id_is_extents(enum btree_id btree)
-{
- const u64 mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_extents)) << nr)
- BCH_BTREE_IDS()
-#undef x
- ;
-
- return BIT_ULL(btree) & mask;
-}
-
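-/* BKEY_TYPE_<btree> values are btree_id + 1 (see enum btree_node_type), hence the - 1: */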
-static inline bool btree_node_type_is_extents(enum btree_node_type type)
-{
- return type != BKEY_TYPE_btree && btree_id_is_extents(type - 1);
-}
-
-static inline bool btree_type_has_snapshots(enum btree_id btree)
-{
- const u64 mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_snapshots)) << nr)
- BCH_BTREE_IDS()
-#undef x
- ;
-
- return BIT_ULL(btree) & mask;
-}
-
-static inline bool btree_type_has_snapshot_field(enum btree_id btree)
-{
- const u64 mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_IS_snapshot_field|BTREE_IS_snapshots))) << nr)
- BCH_BTREE_IDS()
-#undef x
- ;
-
- return BIT_ULL(btree) & mask;
-}
-
-static inline bool btree_type_has_ptrs(enum btree_id btree)
-{
- const u64 mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_data)) << nr)
- BCH_BTREE_IDS()
-#undef x
- ;
-
- return BIT_ULL(btree) & mask;
-}
-
-static inline bool btree_type_uses_write_buffer(enum btree_id btree)
-{
- const u64 mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_write_buffer)) << nr)
- BCH_BTREE_IDS()
-#undef x
- ;
-
- return BIT_ULL(btree) & mask;
-}
-
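-/*
- * Sort order for pending updates (see btree_insert_entry_cmp()): alloc and
- * stripes updates sort after updates to all other btrees:
- */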
-static inline u8 btree_trigger_order(enum btree_id btree)
-{
- switch (btree) {
- case BTREE_ID_alloc:
- return U8_MAX;
- case BTREE_ID_stripes:
- return U8_MAX - 1;
- default:
- return btree;
- }
-}
-
-struct btree_root {
- struct btree *b;
-
- /* On disk root - see async splits: */
- __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
- u8 level;
- u8 alive;
- s16 error;
-};
-
-enum btree_gc_coalesce_fail_reason {
- BTREE_GC_COALESCE_FAIL_RESERVE_GET,
- BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
- BTREE_GC_COALESCE_FAIL_FORMAT_FITS,
-};
-
-enum btree_node_sibling {
- btree_prev_sib,
- btree_next_sib,
-};
-
-struct get_locks_fail {
- unsigned l;
- struct btree *b;
-};
-
-#endif /* _BCACHEFS_BTREE_TYPES_H */
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
deleted file mode 100644
index ee657b9f4b96..000000000000
--- a/fs/bcachefs/btree_update.c
+++ /dev/null
@@ -1,916 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_update.h"
-#include "btree_iter.h"
-#include "btree_journal_iter.h"
-#include "btree_locking.h"
-#include "buckets.h"
-#include "debug.h"
-#include "errcode.h"
-#include "error.h"
-#include "extents.h"
-#include "keylist.h"
-#include "snapshot.h"
-#include "trace.h"
-
-#include <linux/string_helpers.h>
-
-static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
- const struct btree_insert_entry *r)
-{
- return cmp_int(l->sort_order, r->sort_order) ?:
- cmp_int(l->cached, r->cached) ?:
- -cmp_int(l->level, r->level) ?:
- bpos_cmp(l->k->k.p, r->k->k.p);
-}
-
-static int __must_check
-bch2_trans_update_by_path(struct btree_trans *, btree_path_idx_t,
- struct bkey_i *, enum btree_iter_update_trigger_flags,
- unsigned long ip);
-
-static noinline int extent_front_merge(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- struct bkey_i **insert,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- struct bkey_i *update;
- int ret;
-
- if (unlikely(trans->journal_replay_not_finished))
- return 0;
-
- update = bch2_bkey_make_mut_noupdate(trans, k);
- ret = PTR_ERR_OR_ZERO(update);
- if (ret)
- return ret;
-
- if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert)))
- return 0;
-
- ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p) ?:
- bch2_key_has_snapshot_overwrites(trans, iter->btree_id, (*insert)->k.p);
- if (ret < 0)
- return ret;
- if (ret)
- return 0;
-
- ret = bch2_btree_delete_at(trans, iter, flags);
- if (ret)
- return ret;
-
- *insert = update;
- return 0;
-}
-
-static noinline int extent_back_merge(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *insert,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- int ret;
-
- if (unlikely(trans->journal_replay_not_finished))
- return 0;
-
- ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?:
- bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p);
- if (ret < 0)
- return ret;
- if (ret)
- return 0;
-
- bch2_bkey_merge(c, bkey_i_to_s(insert), k);
- return 0;
-}
-
-/*
- * When deleting, check if we need to emit a whiteout (because we're overwriting
- * something in an ancestor snapshot)
- */
-static int need_whiteout_for_snapshot(struct btree_trans *trans,
- enum btree_id btree_id, struct bpos pos)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- u32 snapshot = pos.snapshot;
- int ret;
-
- if (!bch2_snapshot_parent(trans->c, pos.snapshot))
- return 0;
-
- pos.snapshot++;
-
- for_each_btree_key_norestart(trans, iter, btree_id, pos,
- BTREE_ITER_all_snapshots|
- BTREE_ITER_nopreserve, k, ret) {
- if (!bkey_eq(k.k->p, pos))
- break;
-
- if (bch2_snapshot_is_ancestor(trans->c, snapshot,
- k.k->p.snapshot)) {
- ret = !bkey_whiteout(k.k);
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
-}
-
-int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
- enum btree_id btree, struct bpos pos,
- snapshot_id_list *s)
-{
- int ret = 0;
-
- darray_for_each(*s, id) {
- pos.snapshot = *id;
-
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, pos,
- BTREE_ITER_not_extents|
- BTREE_ITER_intent);
- ret = bkey_err(k);
- if (ret)
- break;
-
- if (k.k->type == KEY_TYPE_deleted) {
- struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
- ret = PTR_ERR_OR_ZERO(update);
- if (ret) {
- bch2_trans_iter_exit(trans, &iter);
- break;
- }
-
- bkey_init(&update->k);
- update->k.p = pos;
- update->k.type = KEY_TYPE_whiteout;
-
- ret = bch2_trans_update(trans, &iter, update,
- BTREE_UPDATE_internal_snapshot_node);
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret)
- break;
- }
-
- darray_exit(s);
- return ret;
-}
-
-int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
- struct btree_iter *iter,
- enum btree_iter_update_trigger_flags flags,
- struct bkey_s_c old,
- struct bkey_s_c new)
-{
- enum btree_id btree_id = iter->btree_id;
- struct bkey_i *update;
- struct bpos new_start = bkey_start_pos(new.k);
- unsigned front_split = bkey_lt(bkey_start_pos(old.k), new_start);
- unsigned back_split = bkey_gt(old.k->p, new.k->p);
- unsigned middle_split = (front_split || back_split) &&
- old.k->p.snapshot != new.k->p.snapshot;
- unsigned nr_splits = front_split + back_split + middle_split;
- int ret = 0, compressed_sectors;
-
- /*
- * If we're going to be splitting a compressed extent, note it
- * so that __bch2_trans_commit() can increase our disk
- * reservation:
- */
- if (nr_splits > 1 &&
- (compressed_sectors = bch2_bkey_sectors_compressed(old)))
- trans->extra_disk_res += compressed_sectors * (nr_splits - 1);
-
- if (front_split) {
- update = bch2_bkey_make_mut_noupdate(trans, old);
- if ((ret = PTR_ERR_OR_ZERO(update)))
- return ret;
-
- bch2_cut_back(new_start, update);
-
- ret = bch2_insert_snapshot_whiteouts(trans, btree_id,
- old.k->p, update->k.p) ?:
- bch2_btree_insert_nonextent(trans, btree_id, update,
- BTREE_UPDATE_internal_snapshot_node|flags);
- if (ret)
- return ret;
- }
-
- /* If we're overwriting in a different snapshot - middle split: */
- if (middle_split) {
- update = bch2_bkey_make_mut_noupdate(trans, old);
- if ((ret = PTR_ERR_OR_ZERO(update)))
- return ret;
-
- bch2_cut_front(new_start, update);
- bch2_cut_back(new.k->p, update);
-
- ret = bch2_insert_snapshot_whiteouts(trans, btree_id,
- old.k->p, update->k.p) ?:
- bch2_btree_insert_nonextent(trans, btree_id, update,
- BTREE_UPDATE_internal_snapshot_node|flags);
- if (ret)
- return ret;
- }
-
- if (bkey_le(old.k->p, new.k->p)) {
- update = bch2_trans_kmalloc(trans, sizeof(*update));
- if ((ret = PTR_ERR_OR_ZERO(update)))
- return ret;
-
- bkey_init(&update->k);
- update->k.p = old.k->p;
- update->k.p.snapshot = new.k->p.snapshot;
-
- if (new.k->p.snapshot != old.k->p.snapshot) {
- update->k.type = KEY_TYPE_whiteout;
- } else if (btree_type_has_snapshots(btree_id)) {
- ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p);
- if (ret < 0)
- return ret;
- if (ret)
- update->k.type = KEY_TYPE_whiteout;
- }
-
- ret = bch2_btree_insert_nonextent(trans, btree_id, update,
- BTREE_UPDATE_internal_snapshot_node|flags);
- if (ret)
- return ret;
- }
-
- if (back_split) {
- update = bch2_bkey_make_mut_noupdate(trans, old);
- if ((ret = PTR_ERR_OR_ZERO(update)))
- return ret;
-
- bch2_cut_front(new.k->p, update);
-
- ret = bch2_trans_update_by_path(trans, iter->path, update,
- BTREE_UPDATE_internal_snapshot_node|
- flags, _RET_IP_);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static int bch2_trans_update_extent(struct btree_trans *trans,
- struct btree_iter *orig_iter,
- struct bkey_i *insert,
- enum btree_iter_update_trigger_flags flags)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- enum btree_id btree_id = orig_iter->btree_id;
- int ret = 0;
-
- bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k),
- BTREE_ITER_intent|
- BTREE_ITER_with_updates|
- BTREE_ITER_not_extents);
- k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX));
- if ((ret = bkey_err(k)))
- goto err;
- if (!k.k)
- goto out;
-
- if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) {
- if (bch2_bkey_maybe_mergable(k.k, &insert->k)) {
- ret = extent_front_merge(trans, &iter, k, &insert, flags);
- if (ret)
- goto err;
- }
-
- goto next;
- }
-
- while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) {
- bool done = bkey_lt(insert->k.p, k.k->p);
-
- ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert));
- if (ret)
- goto err;
-
- if (done)
- goto out;
-next:
- bch2_btree_iter_advance(trans, &iter);
- k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX));
- if ((ret = bkey_err(k)))
- goto err;
- if (!k.k)
- goto out;
- }
-
- if (bch2_bkey_maybe_mergable(&insert->k, k.k)) {
- ret = extent_back_merge(trans, &iter, insert, k);
- if (ret)
- goto err;
- }
-out:
- if (!bkey_deleted(&insert->k))
- ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags);
-err:
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
-}
-
-static noinline int flush_new_cached_update(struct btree_trans *trans,
- struct btree_insert_entry *i,
- enum btree_iter_update_trigger_flags flags,
- unsigned long ip)
-{
- struct bkey k;
- int ret;
-
- btree_path_idx_t path_idx =
- bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0,
- BTREE_ITER_intent, _THIS_IP_);
- ret = bch2_btree_path_traverse(trans, path_idx, 0);
- if (ret)
- goto out;
-
- struct btree_path *btree_path = trans->paths + path_idx;
-
- /*
- * The old key in the insert entry might actually refer to an existing
- * key in the btree that has been deleted from cache and not yet
- * flushed. Check for this and skip the flush so we don't run triggers
- * against a stale key.
- */
- bch2_btree_path_peek_slot_exact(btree_path, &k);
- if (!bkey_deleted(&k))
- goto out;
-
- i->key_cache_already_flushed = true;
- i->flags |= BTREE_TRIGGER_norun;
-
- btree_path_set_should_be_locked(trans, btree_path);
- ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip);
-out:
- bch2_path_put(trans, path_idx, true);
- return ret;
-}
-
-static int __must_check
-bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
- struct bkey_i *k, enum btree_iter_update_trigger_flags flags,
- unsigned long ip)
-{
- struct bch_fs *c = trans->c;
- struct btree_insert_entry *i, n;
- int cmp;
-
- struct btree_path *path = trans->paths + path_idx;
- EBUG_ON(!path->should_be_locked);
- EBUG_ON(trans->nr_updates >= trans->nr_paths);
- EBUG_ON(!bpos_eq(k->k.p, path->pos));
-
- n = (struct btree_insert_entry) {
- .flags = flags,
- .sort_order = btree_trigger_order(path->btree_id),
- .bkey_type = __btree_node_type(path->level, path->btree_id),
- .btree_id = path->btree_id,
- .level = path->level,
- .cached = path->cached,
- .path = path_idx,
- .k = k,
- .ip_allocated = ip,
- };
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- trans_for_each_update(trans, i)
- BUG_ON(i != trans->updates &&
- btree_insert_entry_cmp(i - 1, i) >= 0);
-#endif
-
- /*
- * Pending updates are kept sorted: first, find position of new update,
- * then delete/trim any updates the new update overwrites:
- */
- for (i = trans->updates; i < trans->updates + trans->nr_updates; i++) {
- cmp = btree_insert_entry_cmp(&n, i);
- if (cmp <= 0)
- break;
- }
-
- bool overwrite = !cmp && i < trans->updates + trans->nr_updates;
-
- if (overwrite) {
- EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
-
- bch2_path_put(trans, i->path, true);
- i->flags = n.flags;
- i->cached = n.cached;
- i->k = n.k;
- i->path = n.path;
- i->ip_allocated = n.ip_allocated;
- } else {
- array_insert_item(trans->updates, trans->nr_updates,
- i - trans->updates, n);
-
- i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v;
- i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0;
-
- if (unlikely(trans->journal_replay_not_finished)) {
- struct bkey_i *j_k =
- bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p);
-
- if (j_k) {
- i->old_k = j_k->k;
- i->old_v = &j_k->v;
- }
- }
- }
-
- __btree_path_get(trans, trans->paths + i->path, true);
-
- trace_update_by_path(trans, path, i, overwrite);
-
- /*
- * If a key is present in the key cache, it must also exist in the
- * btree - this is necessary for cache coherency. When iterating over
- * a btree that's cached in the key cache, the btree iter code checks
- * the key cache - but the key has to exist in the btree for that to
- * work:
- */
- if (path->cached && !i->old_btree_u64s)
- return flush_new_cached_update(trans, i, flags, ip);
-
- return 0;
-}
-
-static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
- struct btree_iter *iter,
- struct btree_path *path)
-{
- struct btree_path *key_cache_path = btree_iter_key_cache_path(trans, iter);
-
- if (!key_cache_path ||
- !key_cache_path->should_be_locked ||
- !bpos_eq(key_cache_path->pos, iter->pos)) {
- struct bkey_cached *ck;
- int ret;
-
- if (!iter->key_cache_path)
- iter->key_cache_path =
- bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
- BTREE_ITER_intent|
- BTREE_ITER_cached, _THIS_IP_);
-
- iter->key_cache_path =
- bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
- iter->flags & BTREE_ITER_intent,
- _THIS_IP_);
-
- ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_cached);
- if (unlikely(ret))
- return ret;
-
- ck = (void *) trans->paths[iter->key_cache_path].l[0].b;
-
- if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
- }
-
- btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
- }
-
- return 0;
-}
-
-int __must_check bch2_trans_update_ip(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_i *k, enum btree_iter_update_trigger_flags flags,
- unsigned long ip)
-{
- kmsan_check_memory(k, bkey_bytes(&k->k));
-
- btree_path_idx_t path_idx = iter->update_path ?: iter->path;
- int ret;
-
- if (iter->flags & BTREE_ITER_is_extents)
- return bch2_trans_update_extent(trans, iter, k, flags);
-
- if (bkey_deleted(&k->k) &&
- !(flags & BTREE_UPDATE_key_cache_reclaim) &&
- (iter->flags & BTREE_ITER_filter_snapshots)) {
- ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
- if (unlikely(ret < 0))
- return ret;
-
- if (ret)
- k->k.type = KEY_TYPE_whiteout;
- }
-
- /*
- * Ensure that updates to cached btrees go to the key cache:
- */
- struct btree_path *path = trans->paths + path_idx;
- if (!(flags & BTREE_UPDATE_key_cache_reclaim) &&
- !path->cached &&
- !path->level &&
- btree_id_cached(trans->c, path->btree_id)) {
- ret = bch2_trans_update_get_key_cache(trans, iter, path);
- if (ret)
- return ret;
-
- path_idx = iter->key_cache_path;
- }
-
- return bch2_trans_update_by_path(trans, path_idx, k, flags, ip);
-}
-
-int bch2_btree_insert_clone_trans(struct btree_trans *trans,
- enum btree_id btree,
- struct bkey_i *k)
-{
- struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(&k->k));
- int ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- return ret;
-
- bkey_copy(n, k);
- return bch2_btree_insert_trans(trans, btree, n, 0);
-}
-
-void *__bch2_trans_subbuf_alloc(struct btree_trans *trans,
- struct btree_trans_subbuf *buf,
- unsigned u64s)
-{
- unsigned new_top = buf->u64s + u64s;
- unsigned new_size = buf->size;
-
- BUG_ON(roundup_pow_of_two(new_top) > U16_MAX);
-
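- /* grow to the next power of two that fits the new top: */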
- if (new_top > new_size)
- new_size = roundup_pow_of_two(new_top);
-
- void *n = bch2_trans_kmalloc_nomemzero(trans, new_size * sizeof(u64));
- if (IS_ERR(n))
- return n;
-
- unsigned offset = (u64 *) n - (u64 *) trans->mem;
- BUG_ON(offset > U16_MAX);
-
- if (buf->u64s)
- memcpy(n,
- btree_trans_subbuf_base(trans, buf),
- buf->size * sizeof(u64));
- buf->base = (u64 *) n - (u64 *) trans->mem;
- buf->size = new_size;
-
- void *p = btree_trans_subbuf_top(trans, buf);
- buf->u64s = new_top;
- return p;
-}
-
-int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
- enum btree_id btree, struct bpos end)
-{
- bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent);
- struct bkey_s_c k = bch2_btree_iter_peek_prev(trans, iter);
- int ret = bkey_err(k);
- if (ret)
- goto err;
-
- bch2_btree_iter_advance(trans, iter);
- k = bch2_btree_iter_peek_slot(trans, iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- BUG_ON(k.k->type != KEY_TYPE_deleted);
-
- if (bkey_gt(k.k->p, end)) {
- ret = bch_err_throw(trans->c, ENOSPC_btree_slot);
- goto err;
- }
-
- return 0;
-err:
- bch2_trans_iter_exit(trans, iter);
- return ret;
-}
-
-void bch2_trans_commit_hook(struct btree_trans *trans,
- struct btree_trans_commit_hook *h)
-{
- h->next = trans->hooks;
- trans->hooks = h;
-}
-
-int bch2_btree_insert_nonextent(struct btree_trans *trans,
- enum btree_id btree, struct bkey_i *k,
- enum btree_iter_update_trigger_flags flags)
-{
- struct btree_iter iter;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, btree, k->k.p,
- BTREE_ITER_cached|
- BTREE_ITER_not_extents|
- BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_trans_update(trans, &iter, k, flags);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id,
- struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
-{
- struct btree_iter iter;
- bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
- BTREE_ITER_intent|flags);
- int ret = bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_trans_update(trans, &iter, k, flags);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/**
- * bch2_btree_insert - insert a key into the given btree
- * @c: pointer to struct bch_fs
- * @id: btree to insert into
- * @k: key to insert
- * @disk_res: must be non-NULL whenever inserting or potentially
- * splitting data extents
- * @flags: transaction commit flags
- * @iter_flags: btree iter update trigger flags
- *
- * Returns: 0 on success, error code on failure
- */
-int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k,
- struct disk_reservation *disk_res, int flags,
- enum btree_iter_update_trigger_flags iter_flags)
-{
- return bch2_trans_commit_do(c, disk_res, NULL, flags,
- bch2_btree_insert_trans(trans, id, k, iter_flags));
-}
-
-int bch2_btree_delete_at(struct btree_trans *trans,
- struct btree_iter *iter, unsigned update_flags)
-{
- struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
- int ret = PTR_ERR_OR_ZERO(k);
- if (ret)
- return ret;
-
- bkey_init(&k->k);
- k->k.p = iter->pos;
- return bch2_trans_update(trans, iter, k, update_flags);
-}
-
-int bch2_btree_delete(struct btree_trans *trans,
- enum btree_id btree, struct bpos pos,
- unsigned update_flags)
-{
- struct btree_iter iter;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, btree, pos,
- BTREE_ITER_cached|
- BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_btree_delete_at(trans, &iter, update_flags);
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
-}
-
-int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
- struct bpos start, struct bpos end,
- unsigned update_flags,
- u64 *journal_seq)
-{
- u32 restart_count = trans->restart_count;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent);
- while ((k = bch2_btree_iter_peek_max(trans, &iter, end)).k) {
- struct disk_reservation disk_res =
- bch2_disk_reservation_init(trans->c, 0);
- struct bkey_i delete;
-
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- bkey_init(&delete.k);
-
- /*
- * This could probably be more efficient for extents:
- */
-
- /*
- * For extents, iter.pos won't necessarily be the same as
- * bkey_start_pos(k.k) (for non extents they always will be the
- * same). It's important that we delete starting from iter.pos
- * because the range we want to delete could start in the middle
- * of k.
- *
- * (bch2_btree_iter_peek() does guarantee that iter.pos >=
- * bkey_start_pos(k.k)).
- */
- delete.k.p = iter.pos;
-
- if (iter.flags & BTREE_ITER_is_extents)
- bch2_key_resize(&delete.k,
- bpos_min(end, k.k->p).offset -
- iter.pos.offset);
-
- ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?:
- bch2_trans_commit(trans, &disk_res, journal_seq,
- BCH_TRANS_COMMIT_no_enospc);
- bch2_disk_reservation_put(trans->c, &disk_res);
-err:
- /*
- * the bch2_trans_begin() call is in a weird place because we
- * need to call it after every transaction commit, to avoid path
- * overflow, but don't want to call it if the delete operation
- * is a no-op and we have no work to do:
- */
- bch2_trans_begin(trans);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- ret = 0;
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
-
- return ret ?: trans_was_restarted(trans, restart_count);
-}
-
-/*
- * bch2_btree_delete_range - delete everything within a given range
- *
- * Range is a half open interval - [start, end)
- */
-int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
- struct bpos start, struct bpos end,
- unsigned update_flags,
- u64 *journal_seq)
-{
- int ret = bch2_trans_run(c,
- bch2_btree_delete_range_trans(trans, id, start, end,
- update_flags, journal_seq));
- if (ret == -BCH_ERR_transaction_restart_nested)
- ret = 0;
- return ret;
-}
-
-int bch2_btree_bit_mod_iter(struct btree_trans *trans, struct btree_iter *iter, bool set)
-{
- struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
- int ret = PTR_ERR_OR_ZERO(k);
- if (ret)
- return ret;
-
- bkey_init(&k->k);
- k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
- k->k.p = iter->pos;
- if (iter->flags & BTREE_ITER_is_extents)
- bch2_key_resize(&k->k, 1);
-
- return bch2_trans_update(trans, iter, k, 0);
-}
-
-int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
- struct bpos pos, bool set)
-{
- struct btree_iter iter;
- bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent);
-
- int ret = bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_btree_bit_mod_iter(trans, &iter, set);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree,
- struct bpos pos, bool set)
-{
- struct bkey_i k;
-
- bkey_init(&k.k);
- k.k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
- k.k.p = pos;
-
- return bch2_trans_update_buffered(trans, btree, &k);
-}
-
-static int __bch2_trans_log_str(struct btree_trans *trans, const char *str, unsigned len)
-{
- unsigned u64s = DIV_ROUND_UP(len, sizeof(u64));
-
- struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s));
- int ret = PTR_ERR_OR_ZERO(e);
- if (ret)
- return ret;
-
- struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry);
- journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s);
- memcpy_and_pad(l->d, u64s * sizeof(u64), str, len, 0);
- return 0;
-}
-
-int bch2_trans_log_str(struct btree_trans *trans, const char *str)
-{
- return __bch2_trans_log_str(trans, str, strlen(str));
-}
-
-int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf)
-{
- int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
- if (ret)
- return ret;
-
- return __bch2_trans_log_str(trans, buf->buf, buf->pos);
-}
-
-int bch2_trans_log_bkey(struct btree_trans *trans, enum btree_id btree,
- unsigned level, struct bkey_i *k)
-{
- struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s));
- int ret = PTR_ERR_OR_ZERO(e);
- if (ret)
- return ret;
-
- journal_entry_init(e, BCH_JSET_ENTRY_log_bkey, btree, level, k->k.u64s);
- bkey_copy(e->start, k);
- return 0;
-}
-
-__printf(3, 0)
-static int
-__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
- va_list args)
-{
- struct printbuf buf = PRINTBUF;
- prt_vprintf(&buf, fmt, args);
-
- unsigned u64s = DIV_ROUND_UP(buf.pos, sizeof(u64));
-
- int ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
- if (ret)
- goto err;
-
- if (!test_bit(JOURNAL_running, &c->journal.flags)) {
- ret = darray_make_room(&c->journal.early_journal_entries, jset_u64s(u64s));
- if (ret)
- goto err;
-
- struct jset_entry_log *l = (void *) &darray_top(c->journal.early_journal_entries);
- journal_entry_init(&l->entry, BCH_JSET_ENTRY_log, 0, 1, u64s);
- memcpy_and_pad(l->d, u64s * sizeof(u64), buf.buf, buf.pos, 0);
- c->journal.early_journal_entries.nr += jset_u64s(u64s);
- } else {
- ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags,
- bch2_trans_log_msg(trans, &buf));
- }
-err:
- printbuf_exit(&buf);
- return ret;
-}
-
-__printf(2, 3)
-int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...)
-{
- va_list args;
- int ret;
-
- va_start(args, fmt);
- ret = __bch2_fs_log_msg(c, 0, fmt, args);
- va_end(args);
- return ret;
-}
-
-/*
- * Use for logging messages during recovery to enable reserved space and avoid
- * blocking.
- */
-__printf(2, 3)
-int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...)
-{
- va_list args;
- int ret;
-
- va_start(args, fmt);
- ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args);
- va_end(args);
- return ret;
-}
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
deleted file mode 100644
index 0b98ab959719..000000000000
--- a/fs/bcachefs/btree_update.h
+++ /dev/null
@@ -1,429 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_UPDATE_H
-#define _BCACHEFS_BTREE_UPDATE_H
-
-#include "btree_iter.h"
-#include "journal.h"
-#include "snapshot.h"
-
-struct bch_fs;
-struct btree;
-
-void bch2_btree_node_prep_for_write(struct btree_trans *,
- struct btree_path *, struct btree *);
-bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *,
- struct btree *, struct btree_node_iter *,
- struct bkey_i *);
-
-int bch2_btree_node_flush0(struct journal *, struct journal_entry_pin *, u64);
-int bch2_btree_node_flush1(struct journal *, struct journal_entry_pin *, u64);
-void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
-
-void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
- struct bkey_i *, u64);
-
-#define BCH_TRANS_COMMIT_FLAGS() \
- x(no_enospc, "don't check for enospc") \
- x(no_check_rw, "don't attempt to take a ref on c->writes") \
- x(no_journal_res, "don't take a journal reservation, instead " \
- "pin journal entry referred to by trans->journal_res.seq") \
- x(journal_reclaim, "operation required for journal reclaim; may return error " \
- "instead of deadlocking if BCH_WATERMARK_reclaim not specified")\
- x(skip_accounting_apply, "we're in journal replay - accounting updates have already been applied")
-
-enum __bch_trans_commit_flags {
- /* First bits for bch_watermark: */
- __BCH_TRANS_COMMIT_FLAGS_START = BCH_WATERMARK_BITS,
-#define x(n, ...) __BCH_TRANS_COMMIT_##n,
- BCH_TRANS_COMMIT_FLAGS()
-#undef x
-};
-
-enum bch_trans_commit_flags {
-#define x(n, ...) BCH_TRANS_COMMIT_##n = BIT(__BCH_TRANS_COMMIT_##n),
- BCH_TRANS_COMMIT_FLAGS()
-#undef x
-};
-
-void bch2_trans_commit_flags_to_text(struct printbuf *, enum bch_trans_commit_flags);
-
-int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
-int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned);
-
-int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id,
- struct bkey_i *, enum btree_iter_update_trigger_flags);
-
-int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *,
- enum btree_iter_update_trigger_flags);
-int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct
- disk_reservation *, int flags, enum
- btree_iter_update_trigger_flags iter_flags);
-
-int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
- struct bpos, struct bpos, unsigned, u64 *);
-int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
- struct bpos, struct bpos, unsigned, u64 *);
-
-int bch2_btree_bit_mod_iter(struct btree_trans *, struct btree_iter *, bool);
-int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool);
-int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool);
-
-static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans,
- enum btree_id btree, struct bpos pos)
-{
- return bch2_btree_bit_mod_buffered(trans, btree, pos, false);
-}
-
-int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id,
- struct bpos, snapshot_id_list *);
-
-/*
- * For use when splitting extents in existing snapshots:
- *
- * If @old_pos is an interior snapshot node, iterate over descendant snapshot
- * nodes: for every descendant snapshot in which @old_pos is overwritten and
- * not visible, emit a whiteout at @new_pos.
- */
-static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
- enum btree_id btree,
- struct bpos old_pos,
- struct bpos new_pos)
-{
- BUG_ON(old_pos.snapshot != new_pos.snapshot);
-
- if (!btree_type_has_snapshots(btree) ||
- bkey_eq(old_pos, new_pos))
- return 0;
-
- snapshot_id_list s;
- int ret = bch2_get_snapshot_overwrites(trans, btree, old_pos, &s);
- if (ret)
- return ret;
-
- return s.nr
- ? __bch2_insert_snapshot_whiteouts(trans, btree, new_pos, &s)
- : 0;
-}
-
-int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *,
- enum btree_iter_update_trigger_flags,
- struct bkey_s_c, struct bkey_s_c);
-
-int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *,
- enum btree_id, struct bpos);
-
-int __must_check bch2_trans_update_ip(struct btree_trans *, struct btree_iter *,
- struct bkey_i *, enum btree_iter_update_trigger_flags,
- unsigned long);
-
-static inline int __must_check
-bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
-{
- return bch2_trans_update_ip(trans, iter, k, flags, _THIS_IP_);
-}
-
-static inline void *btree_trans_subbuf_base(struct btree_trans *trans,
- struct btree_trans_subbuf *buf)
-{
- return (u64 *) trans->mem + buf->base;
-}
-
-static inline void *btree_trans_subbuf_top(struct btree_trans *trans,
- struct btree_trans_subbuf *buf)
-{
- return (u64 *) trans->mem + buf->base + buf->u64s;
-}
-
-void *__bch2_trans_subbuf_alloc(struct btree_trans *,
- struct btree_trans_subbuf *,
- unsigned);
-
-static inline void *
-bch2_trans_subbuf_alloc(struct btree_trans *trans,
- struct btree_trans_subbuf *buf,
- unsigned u64s)
-{
- if (buf->u64s + u64s > buf->size)
- return __bch2_trans_subbuf_alloc(trans, buf, u64s);
-
- void *p = btree_trans_subbuf_top(trans, buf);
- buf->u64s += u64s;
- return p;
-}
-
-static inline struct jset_entry *btree_trans_journal_entries_start(struct btree_trans *trans)
-{
- return btree_trans_subbuf_base(trans, &trans->journal_entries);
-}
-
-static inline struct jset_entry *btree_trans_journal_entries_top(struct btree_trans *trans)
-{
- return btree_trans_subbuf_top(trans, &trans->journal_entries);
-}
-
-static inline struct jset_entry *
-bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
-{
- return bch2_trans_subbuf_alloc(trans, &trans->journal_entries, u64s);
-}
-
-int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *);
-
-int bch2_btree_write_buffer_insert_err(struct bch_fs *, enum btree_id, struct bkey_i *);
-
-static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
- enum btree_id btree,
- struct bkey_i *k)
-{
- kmsan_check_memory(k, bkey_bytes(&k->k));
-
- EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
-
- if (unlikely(!btree_type_uses_write_buffer(btree))) {
- int ret = bch2_btree_write_buffer_insert_err(trans->c, btree, k);
- dump_stack();
- return ret;
- }
- /*
- * Most updates skip the btree write buffer until journal replay is
- * finished because synchronization with journal replay relies on having
- * a btree node locked - if we're overwriting a key in the journal that
- * journal replay hasn't yet replayed, we have to mark it as
- * overwritten.
- *
- * But accounting updates don't overwrite, they're deltas, and they have
- * to be flushed to the btree strictly in order for journal replay to be
- * able to tell which updates need to be applied:
- */
- if (k->k.type != KEY_TYPE_accounting &&
- unlikely(trans->journal_replay_not_finished))
- return bch2_btree_insert_clone_trans(trans, btree, k);
-
- struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s));
- int ret = PTR_ERR_OR_ZERO(e);
- if (ret)
- return ret;
-
- journal_entry_init(e, BCH_JSET_ENTRY_write_buffer_keys, btree, 0, k->k.u64s);
- bkey_copy(e->start, k);
- return 0;
-}
-
-void bch2_trans_commit_hook(struct btree_trans *,
- struct btree_trans_commit_hook *);
-int __bch2_trans_commit(struct btree_trans *, unsigned);
-
-int bch2_trans_log_str(struct btree_trans *, const char *);
-int bch2_trans_log_msg(struct btree_trans *, struct printbuf *);
-int bch2_trans_log_bkey(struct btree_trans *, enum btree_id, unsigned, struct bkey_i *);
-
-__printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...);
-__printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...);
-
-/**
- * bch2_trans_commit - insert keys at given iterator positions
- *
- * This is the main entry point for btree updates.
- *
- * Return values:
- * -EROFS: filesystem read only
- * -EIO: journal or btree node IO error
- */
-static inline int bch2_trans_commit(struct btree_trans *trans,
- struct disk_reservation *disk_res,
- u64 *journal_seq,
- unsigned flags)
-{
- trans->disk_res = disk_res;
- trans->journal_seq = journal_seq;
-
- return __bch2_trans_commit(trans, flags);
-}
-
-#define commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \
- lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
- (_journal_seq), (_flags)))
-
-#define nested_commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \
- nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
- (_journal_seq), (_flags)))
-
-#define bch2_trans_commit_do(_c, _disk_res, _journal_seq, _flags, _do) \
- bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do))
-
-#define trans_for_each_update(_trans, _i) \
- for (struct btree_insert_entry *_i = (_trans)->updates; \
- (_i) < (_trans)->updates + (_trans)->nr_updates; \
- (_i)++)
-
-static inline void bch2_trans_reset_updates(struct btree_trans *trans)
-{
- trans_for_each_update(trans, i)
- bch2_path_put(trans, i->path, true);
-
- trans->nr_updates = 0;
- trans->journal_entries.u64s = 0;
- trans->journal_entries.size = 0;
- trans->accounting.u64s = 0;
- trans->accounting.size = 0;
- trans->hooks = NULL;
- trans->extra_disk_res = 0;
-}
-
-static __always_inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k,
- unsigned type, unsigned min_bytes)
-{
- unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k));
- struct bkey_i *mut;
-
- if (type && k.k->type != type)
- return ERR_PTR(-ENOENT);
-
- /* extra padding for varint_decode_fast... */
- mut = bch2_trans_kmalloc_nomemzero(trans, bytes + 8);
- if (!IS_ERR(mut)) {
- bkey_reassemble(mut, k);
-
- if (unlikely(bytes > bkey_bytes(k.k))) {
- memset((void *) mut + bkey_bytes(k.k), 0,
- bytes - bkey_bytes(k.k));
- mut->k.u64s = DIV_ROUND_UP(bytes, sizeof(u64));
- }
- }
- return mut;
-}
-
-static __always_inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k)
-{
- return __bch2_bkey_make_mut_noupdate(trans, k, 0, 0);
-}
-
-#define bch2_bkey_make_mut_noupdate_typed(_trans, _k, _type) \
- bkey_i_to_##_type(__bch2_bkey_make_mut_noupdate(_trans, _k, \
- KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
-
-static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c *k,
- enum btree_iter_update_trigger_flags flags,
- unsigned type, unsigned min_bytes)
-{
- struct bkey_i *mut = __bch2_bkey_make_mut_noupdate(trans, *k, type, min_bytes);
- int ret;
-
- if (IS_ERR(mut))
- return mut;
-
- ret = bch2_trans_update(trans, iter, mut, flags);
- if (ret)
- return ERR_PTR(ret);
-
- *k = bkey_i_to_s_c(mut);
- return mut;
-}
-
-static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans,
- struct btree_iter *iter, struct bkey_s_c *k,
- enum btree_iter_update_trigger_flags flags)
-{
- return __bch2_bkey_make_mut(trans, iter, k, flags, 0, 0);
-}
-
-#define bch2_bkey_make_mut_typed(_trans, _iter, _k, _flags, _type) \
- bkey_i_to_##_type(__bch2_bkey_make_mut(_trans, _iter, _k, _flags,\
- KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
-
-static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- enum btree_iter_update_trigger_flags flags,
- unsigned type, unsigned min_bytes)
-{
- struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter,
- btree_id, pos, flags|BTREE_ITER_intent, type);
- struct bkey_i *ret = IS_ERR(k.k)
- ? ERR_CAST(k.k)
- : __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes);
- if (IS_ERR(ret))
- bch2_trans_iter_exit(trans, iter);
- return ret;
-}
-
-static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- enum btree_iter_update_trigger_flags flags)
-{
- return __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags, 0, 0);
-}
-
-static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- enum btree_iter_update_trigger_flags flags,
- unsigned type, unsigned min_bytes)
-{
- struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter,
- btree_id, pos, flags|BTREE_ITER_intent, type, min_bytes);
- int ret;
-
- if (IS_ERR(mut))
- return mut;
-
- ret = bch2_trans_update(trans, iter, mut, flags);
- if (ret) {
- bch2_trans_iter_exit(trans, iter);
- return ERR_PTR(ret);
- }
-
- return mut;
-}
-
-static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- enum btree_iter_update_trigger_flags flags,
- unsigned min_bytes)
-{
- return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, min_bytes);
-}
-
-static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- enum btree_iter_update_trigger_flags flags)
-{
- return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, 0);
-}
-
-#define bch2_bkey_get_mut_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\
- bkey_i_to_##_type(__bch2_bkey_get_mut(_trans, _iter, \
- _btree_id, _pos, _flags, \
- KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
-
-static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter,
- enum btree_iter_update_trigger_flags flags,
- unsigned type, unsigned val_size)
-{
- struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k) + val_size);
- int ret;
-
- if (IS_ERR(k))
- return k;
-
- bkey_init(&k->k);
- k->k.p = iter->pos;
- k->k.type = type;
- set_bkey_val_bytes(&k->k, val_size);
-
- ret = bch2_trans_update(trans, iter, k, flags);
- if (unlikely(ret))
- return ERR_PTR(ret);
- return k;
-}
-
-#define bch2_bkey_alloc(_trans, _iter, _flags, _type) \
- bkey_i_to_##_type(__bch2_bkey_alloc(_trans, _iter, _flags, \
- KEY_TYPE_##_type, sizeof(struct bch_##_type)))
-
-#endif /* _BCACHEFS_BTREE_UPDATE_H */
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
deleted file mode 100644
index 553059b33bfd..000000000000
--- a/fs/bcachefs/btree_update_interior.c
+++ /dev/null
@@ -1,2854 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-#include "btree_gc.h"
-#include "btree_journal_iter.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_locking.h"
-#include "buckets.h"
-#include "clock.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "extents.h"
-#include "io_write.h"
-#include "journal.h"
-#include "journal_reclaim.h"
-#include "keylist.h"
-#include "recovery_passes.h"
-#include "replicas.h"
-#include "sb-members.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/random.h>
-
-static const char * const bch2_btree_update_modes[] = {
-#define x(t) #t,
- BTREE_UPDATE_MODES()
-#undef x
- NULL
-};
-
-static void bch2_btree_update_to_text(struct printbuf *, struct btree_update *);
-
-static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
- btree_path_idx_t, struct btree *, struct keylist *);
-static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
-
-/*
- * Verify that child nodes correctly span parent node's range:
- */
-int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
-{
- struct bch_fs *c = trans->c;
- struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2
- ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key
- : b->data->min_key;
- struct btree_and_journal_iter iter;
- struct bkey_s_c k;
- struct printbuf buf = PRINTBUF;
- struct bkey_buf prev;
- int ret = 0;
-
- BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
- !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
- b->data->min_key));
-
- bch2_bkey_buf_init(&prev);
- bkey_init(&prev.k->k);
- bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
-
- if (b == btree_node_root(c, b)) {
- if (!bpos_eq(b->data->min_key, POS_MIN)) {
- bch2_log_msg_start(c, &buf);
- prt_printf(&buf, "btree root with incorrect min_key: ");
- bch2_bpos_to_text(&buf, b->data->min_key);
- prt_newline(&buf);
-
- bch2_count_fsck_err(c, btree_root_bad_min_key, &buf);
- goto err;
- }
-
- if (!bpos_eq(b->data->max_key, SPOS_MAX)) {
- bch2_log_msg_start(c, &buf);
- prt_printf(&buf, "btree root with incorrect max_key: ");
- bch2_bpos_to_text(&buf, b->data->max_key);
- prt_newline(&buf);
-
- bch2_count_fsck_err(c, btree_root_bad_max_key, &buf);
- goto err;
- }
- }
-
- if (!b->c.level)
- goto out;
-
- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- if (k.k->type != KEY_TYPE_btree_ptr_v2)
- goto out;
-
- struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
-
- struct bpos expected_min = bkey_deleted(&prev.k->k)
- ? node_min
- : bpos_successor(prev.k->k.p);
-
- if (!bpos_eq(expected_min, bp.v->min_key)) {
- prt_str(&buf, "end of prev node doesn't match start of next node");
- prt_str(&buf, "\nprev ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
- prt_str(&buf, "\nnext ");
- bch2_bkey_val_to_text(&buf, c, k);
- prt_newline(&buf);
-
- bch2_count_fsck_err(c, btree_node_topology_bad_min_key, &buf);
- goto err;
- }
-
- bch2_bkey_buf_reassemble(&prev, c, k);
- bch2_btree_and_journal_iter_advance(&iter);
- }
-
- if (bkey_deleted(&prev.k->k)) {
- prt_printf(&buf, "empty interior node\n");
- bch2_count_fsck_err(c, btree_node_topology_empty_interior_node, &buf);
- goto err;
- }
-
- if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
- prt_str(&buf, "last child node doesn't end at end of parent node\nchild: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
- prt_newline(&buf);
-
- bch2_count_fsck_err(c, btree_node_topology_bad_max_key, &buf);
- goto err;
- }
-out:
- bch2_btree_and_journal_iter_exit(&iter);
- bch2_bkey_buf_exit(&prev, c);
- printbuf_exit(&buf);
- return ret;
-err:
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_char(&buf, ' ');
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- prt_newline(&buf);
-
- ret = __bch2_topology_error(c, &buf);
- bch2_print_str(c, KERN_ERR, buf.buf);
- BUG_ON(!ret);
- goto out;
-}
-
-/* Calculate ideal packed bkey format for new btree nodes: */
-
-static void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
-{
- struct bkey_packed *k;
- struct bkey uk;
-
- for_each_bset(b, t)
- bset_tree_for_each_key(b, t, k)
- if (!bkey_deleted(k)) {
- uk = bkey_unpack_key(b, k);
- bch2_bkey_format_add_key(s, &uk);
- }
-}
-
-static struct bkey_format bch2_btree_calc_format(struct btree *b)
-{
- struct bkey_format_state s;
-
- bch2_bkey_format_init(&s);
- bch2_bkey_format_add_pos(&s, b->data->min_key);
- bch2_bkey_format_add_pos(&s, b->data->max_key);
- __bch2_btree_calc_format(&s, b);
-
- return bch2_bkey_format_done(&s);
-}
-
-static size_t btree_node_u64s_with_format(struct btree_nr_keys nr,
- struct bkey_format *old_f,
- struct bkey_format *new_f)
-{
- /* stupid integer promotion rules */
- ssize_t delta =
- (((int) new_f->key_u64s - old_f->key_u64s) *
- (int) nr.packed_keys) +
- (((int) new_f->key_u64s - BKEY_U64s) *
- (int) nr.unpacked_keys);
-
- BUG_ON(delta + nr.live_u64s < 0);
-
- return nr.live_u64s + delta;
-}
-
-/**
- * bch2_btree_node_format_fits - check if we could rewrite node with a new format
- *
- * @c: filesystem handle
- * @b: btree node to rewrite
- * @nr: number of keys for new node (i.e. b->nr)
- * @new_f: bkey format to translate keys to
- *
- * Returns: true if all re-packed keys will be able to fit in a new node.
- *
- * Assumes all keys will successfully pack with the new format.
- */
-static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
- struct btree_nr_keys nr,
- struct bkey_format *new_f)
-{
- size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f);
-
- return __vstruct_bytes(struct btree_node, u64s) < btree_buf_bytes(b);
-}
-
-/* Btree node freeing/allocation: */
-
-static void __btree_node_free(struct btree_trans *trans, struct btree *b)
-{
- struct bch_fs *c = trans->c;
-
- trace_and_count(c, btree_node_free, trans, b);
-
- BUG_ON(btree_node_write_blocked(b));
- BUG_ON(btree_node_dirty(b));
- BUG_ON(btree_node_need_write(b));
- BUG_ON(b == btree_node_root(c, b));
- BUG_ON(b->ob.nr);
- BUG_ON(!list_empty(&b->write_blocked));
- BUG_ON(b->will_make_reachable);
-
- clear_btree_node_noevict(b);
-}
-
-static void bch2_btree_node_free_inmem(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b)
-{
- struct bch_fs *c = trans->c;
-
- bch2_btree_node_lock_write_nofail(trans, path, &b->c);
-
- __btree_node_free(trans, b);
-
- mutex_lock(&c->btree_cache.lock);
- bch2_btree_node_hash_remove(&c->btree_cache, b);
- mutex_unlock(&c->btree_cache.lock);
-
- six_unlock_write(&b->c.lock);
- mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
-
- bch2_trans_node_drop(trans, b);
-}
-
-static void bch2_btree_node_free_never_used(struct btree_update *as,
- struct btree_trans *trans,
- struct btree *b)
-{
- struct bch_fs *c = as->c;
- struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL];
-
- BUG_ON(!list_empty(&b->write_blocked));
- BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as));
-
- b->will_make_reachable = 0;
- closure_put(&as->cl);
-
- clear_btree_node_will_make_reachable(b);
- clear_btree_node_accessed(b);
- clear_btree_node_dirty_acct(c, b);
- clear_btree_node_need_write(b);
-
- mutex_lock(&c->btree_cache.lock);
- __bch2_btree_node_hash_remove(&c->btree_cache, b);
- mutex_unlock(&c->btree_cache.lock);
-
- BUG_ON(p->nr >= ARRAY_SIZE(p->b));
- p->b[p->nr++] = b;
-
- six_unlock_intent(&b->c.lock);
-
- bch2_trans_node_drop(trans, b);
-}
-
-static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
- struct disk_reservation *res,
- struct closure *cl,
- bool interior_node,
- unsigned target,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct write_point *wp;
- struct btree *b;
- BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
- struct open_buckets obs = { .nr = 0 };
- struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
- enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
- unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim
- ? BTREE_NODE_RESERVE
- : 0;
- int ret;
-
- b = bch2_btree_node_mem_alloc(trans, interior_node);
- if (IS_ERR(b))
- return b;
-
- BUG_ON(b->ob.nr);
-
- mutex_lock(&c->btree_reserve_cache_lock);
- if (c->btree_reserve_cache_nr > nr_reserve) {
- struct btree_alloc *a =
- &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
-
- obs = a->ob;
- bkey_copy(&tmp.k, &a->k);
- mutex_unlock(&c->btree_reserve_cache_lock);
- goto out;
- }
- mutex_unlock(&c->btree_reserve_cache_lock);
-retry:
- ret = bch2_alloc_sectors_start_trans(trans,
- target ?:
- c->opts.metadata_target ?:
- c->opts.foreground_target,
- 0,
- writepoint_ptr(&c->btree_write_point),
- &devs_have,
- res->nr_replicas,
- min(res->nr_replicas,
- c->opts.metadata_replicas_required),
- watermark,
- target ? BCH_WRITE_only_specified_devs : 0,
- cl, &wp);
- if (unlikely(ret))
- goto err;
-
- if (wp->sectors_free < btree_sectors(c)) {
- struct open_bucket *ob;
- unsigned i;
-
- open_bucket_for_each(c, &wp->ptrs, ob, i)
- if (ob->sectors_free < btree_sectors(c))
- ob->sectors_free = 0;
-
- bch2_alloc_sectors_done(c, wp);
- goto retry;
- }
-
- bkey_btree_ptr_v2_init(&tmp.k);
- bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false);
-
- bch2_open_bucket_get(c, wp, &obs);
- bch2_alloc_sectors_done(c, wp);
-out:
- bkey_copy(&b->key, &tmp.k);
- b->ob = obs;
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
-
- return b;
-err:
- bch2_btree_node_to_freelist(c, b);
- return ERR_PTR(ret);
-}
-
-static struct btree *bch2_btree_node_alloc(struct btree_update *as,
- struct btree_trans *trans,
- unsigned level)
-{
- struct bch_fs *c = as->c;
- struct btree *b;
- struct prealloc_nodes *p = &as->prealloc_nodes[!!level];
- int ret;
-
- BUG_ON(level >= BTREE_MAX_DEPTH);
- BUG_ON(!p->nr);
-
- b = p->b[--p->nr];
-
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
-
- set_btree_node_accessed(b);
- set_btree_node_dirty_acct(c, b);
- set_btree_node_need_write(b);
-
- bch2_bset_init_first(b, &b->data->keys);
- b->c.level = level;
- b->c.btree_id = as->btree_id;
- b->version_ondisk = c->sb.version;
-
- memset(&b->nr, 0, sizeof(b->nr));
- b->data->magic = cpu_to_le64(bset_magic(c));
- memset(&b->data->_ptr, 0, sizeof(b->data->_ptr));
- b->data->flags = 0;
- SET_BTREE_NODE_ID(b->data, as->btree_id);
- SET_BTREE_NODE_LEVEL(b->data, level);
-
- if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
- struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);
-
- bp->v.mem_ptr = 0;
- bp->v.seq = b->data->keys.seq;
- bp->v.sectors_written = 0;
- }
-
- SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);
-
- bch2_btree_build_aux_trees(b);
-
- ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
- BUG_ON(ret);
-
- trace_and_count(c, btree_node_alloc, trans, b);
- bch2_increment_clock(c, btree_sectors(c), WRITE);
- return b;
-}
-
-static void btree_set_min(struct btree *b, struct bpos pos)
-{
- if (b->key.k.type == KEY_TYPE_btree_ptr_v2)
- bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos;
- b->data->min_key = pos;
-}
-
-static void btree_set_max(struct btree *b, struct bpos pos)
-{
- b->key.k.p = pos;
- b->data->max_key = pos;
-}
-
-static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
- struct btree_trans *trans,
- struct btree *b)
-{
- struct btree *n = bch2_btree_node_alloc(as, trans, b->c.level);
- struct bkey_format format = bch2_btree_calc_format(b);
-
- /*
- * The keys might expand with the new format - if they wouldn't fit in
- * the btree node anymore, use the old format for now:
- */
- if (!bch2_btree_node_format_fits(as->c, b, b->nr, &format))
- format = b->format;
-
- SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
-
- btree_set_min(n, b->data->min_key);
- btree_set_max(n, b->data->max_key);
-
- n->data->format = format;
- btree_node_set_format(n, format);
-
- bch2_btree_sort_into(as->c, n, b);
-
- btree_node_reset_sib_u64s(n);
- return n;
-}
-
-static struct btree *__btree_root_alloc(struct btree_update *as,
- struct btree_trans *trans, unsigned level)
-{
- struct btree *b = bch2_btree_node_alloc(as, trans, level);
-
- btree_set_min(b, POS_MIN);
- btree_set_max(b, SPOS_MAX);
- b->data->format = bch2_btree_calc_format(b);
-
- btree_node_set_format(b, b->data->format);
- bch2_btree_build_aux_trees(b);
-
- return b;
-}
-
-static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans)
-{
- struct bch_fs *c = as->c;
- struct prealloc_nodes *p;
-
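-	/*
-	 * Return unused preallocated nodes: stash their allocations in the
-	 * btree reserve cache if there's room, otherwise release the open
-	 * buckets, then free the nodes themselves:
-	 */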
- for (p = as->prealloc_nodes;
- p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes);
- p++) {
- while (p->nr) {
- struct btree *b = p->b[--p->nr];
-
- mutex_lock(&c->btree_reserve_cache_lock);
-
- if (c->btree_reserve_cache_nr <
- ARRAY_SIZE(c->btree_reserve_cache)) {
- struct btree_alloc *a =
- &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
-
- a->ob = b->ob;
- b->ob.nr = 0;
- bkey_copy(&a->k, &b->key);
- } else {
- bch2_open_buckets_put(c, &b->ob);
- }
-
- mutex_unlock(&c->btree_reserve_cache_lock);
-
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
- __btree_node_free(trans, b);
- bch2_btree_node_to_freelist(c, b);
- }
- }
-}
-
-static int bch2_btree_reserve_get(struct btree_trans *trans,
- struct btree_update *as,
- unsigned nr_nodes[2],
- unsigned target,
- unsigned flags,
- struct closure *cl)
-{
- struct btree *b;
- unsigned interior;
- int ret = 0;
-
- BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX);
-
- /*
- * Protects reaping from the btree node cache and using the btree node
- * open bucket reserve:
- */
- ret = bch2_btree_cache_cannibalize_lock(trans, cl);
- if (ret)
- return ret;
-
- for (interior = 0; interior < 2; interior++) {
- struct prealloc_nodes *p = as->prealloc_nodes + interior;
-
- while (p->nr < nr_nodes[interior]) {
- b = __bch2_btree_node_alloc(trans, &as->disk_res, cl,
- interior, target, flags);
- if (IS_ERR(b)) {
- ret = PTR_ERR(b);
- goto err;
- }
-
- p->b[p->nr++] = b;
- }
- }
-err:
- bch2_btree_cache_cannibalize_unlock(trans);
- return ret;
-}
-
-/* Asynchronous interior node update machinery */
-
-static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans)
-{
- struct bch_fs *c = as->c;
-
- if (as->took_gc_lock)
- up_read(&c->gc_lock);
- as->took_gc_lock = false;
-
- bch2_journal_pin_drop(&c->journal, &as->journal);
- bch2_journal_pin_flush(&c->journal, &as->journal);
- bch2_disk_reservation_put(c, &as->disk_res);
- bch2_btree_reserve_put(as, trans);
-
- bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
- as->start_time);
-
- mutex_lock(&c->btree_interior_update_lock);
- list_del(&as->unwritten_list);
- list_del(&as->list);
-
- closure_debug_destroy(&as->cl);
- mempool_free(as, &c->btree_interior_update_pool);
-
- /*
- * Have to do the wakeup with btree_interior_update_lock still held,
- * since being on btree_interior_update_list is our ref on @c:
- */
- closure_wake_up(&c->btree_interior_update_wait);
-
- mutex_unlock(&c->btree_interior_update_lock);
-}
-
-static void btree_update_add_key(struct btree_update *as,
- struct keylist *keys, struct btree *b)
-{
- struct bkey_i *k = &b->key;
-
- BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s >
- ARRAY_SIZE(as->_old_keys));
-
- bkey_copy(keys->top, k);
- bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1;
-
- bch2_keylist_push(keys);
-}
-
-static bool btree_update_new_nodes_marked_sb(struct btree_update *as)
-{
- for_each_keylist_key(&as->new_keys, k)
- if (!bch2_dev_btree_bitmap_marked(as->c, bkey_i_to_s_c(k)))
- return false;
- return true;
-}
-
-static void btree_update_new_nodes_mark_sb(struct btree_update *as)
-{
- struct bch_fs *c = as->c;
-
- mutex_lock(&c->sb_lock);
- for_each_keylist_key(&as->new_keys, k)
- bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(k));
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-}
-
-/*
- * The transactional part of an interior btree node update, where we journal the
- * update we did to the interior node and update alloc info:
- */
-static int btree_update_nodes_written_trans(struct btree_trans *trans,
- struct btree_update *as)
-{
- struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, as->journal_u64s);
- int ret = PTR_ERR_OR_ZERO(e);
- if (ret)
- return ret;
-
- memcpy(e, as->journal_entries, as->journal_u64s * sizeof(u64));
-
- trans->journal_pin = &as->journal;
-
- for_each_keylist_key(&as->old_keys, k) {
- unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
-
- ret = bch2_key_trigger_old(trans, as->btree_id, level, bkey_i_to_s_c(k),
- BTREE_TRIGGER_transactional);
- if (ret)
- return ret;
- }
-
- for_each_keylist_key(&as->new_keys, k) {
- unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
-
- ret = bch2_key_trigger_new(trans, as->btree_id, level, bkey_i_to_s(k),
- BTREE_TRIGGER_transactional);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/* If the node has been reused, we might be reading uninitialized memory - that's fine: */
-static noinline __no_kmsan_checks bool btree_node_seq_matches(struct btree *b, __le64 seq)
-{
- struct btree_node *b_data = READ_ONCE(b->data);
-
- return (b_data ? b_data->keys.seq : 0) == seq;
-}
-
-static void btree_update_nodes_written(struct btree_update *as)
-{
- struct bch_fs *c = as->c;
- struct btree *b;
- struct btree_trans *trans = bch2_trans_get(c);
- u64 journal_seq = 0;
- unsigned i;
- int ret;
-
- /*
- * If we're already in an error state, it might be because a btree node
- * was never written, and we might be trying to free that same btree
- * node here, but it won't have been marked as allocated and we'll see
- * spurious disk usage inconsistencies in the transactional part below
- * if we don't skip it:
- */
- ret = bch2_journal_error(&c->journal);
- if (ret)
- goto err;
-
- if (!btree_update_new_nodes_marked_sb(as))
- btree_update_new_nodes_mark_sb(as);
-
- /*
- * Wait for any in flight writes to finish before we free the old nodes
- * on disk. But we haven't pinned those old nodes in the btree cache,
- * they might have already been evicted.
- *
- * The update we're completing deleted references to those nodes from the
- * btree, so we know if they've been evicted they can't be pulled back in.
- * We just have to check if the nodes we have pointers to are still those
- * old nodes, and haven't been reused.
- *
- * This can't be done locklessly because the data buffer might have been
- * vmalloc allocated, and it's not RCU freed. We also need the
- * __no_kmsan_checks annotation because even with the btree node read
- * lock, nothing tells us that the data buffer has been initialized (if
- * the btree node has been reused for a different node, and the data
- * buffer swapped for a new data buffer).
- */
- for (i = 0; i < as->nr_old_nodes; i++) {
- b = as->old_nodes[i];
-
- bch2_trans_begin(trans);
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
- bool seq_matches = btree_node_seq_matches(b, as->old_nodes_seq[i]);
- six_unlock_read(&b->c.lock);
- bch2_trans_unlock_long(trans);
-
- if (seq_matches)
- wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
- TASK_UNINTERRUPTIBLE);
- }
-
- /*
- * We did an update to a parent node where the pointers we added pointed
- * to child nodes that weren't written yet: now, the child nodes have
- * been written so we can write out the update to the interior node.
- */
-
- /*
- * We can't call into journal reclaim here: we'd block on the journal
- * reclaim lock, but we may need to release the open buckets we have
- * pinned in order for other btree updates to make forward progress, and
- * journal reclaim does btree updates when flushing bkey_cached entries,
- * which may require allocations as well.
- */
- ret = commit_do(trans, &as->disk_res, &journal_seq,
- BCH_WATERMARK_interior_updates|
- BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_journal_reclaim,
- btree_update_nodes_written_trans(trans, as));
- bch2_trans_unlock(trans);
-
- bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
- "%s", bch2_err_str(ret));
-err:
- /*
- * Ensure transaction is unlocked before using btree_node_lock_nopath()
- * (the use of which is always suspect, we need to work on removing this
- * in the future)
- *
- * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get()
- * calls bch2_path_upgrade(), before we call path_make_mut(), so we may
- * rarely end up with a locked path besides the one we have here:
- */
- bch2_trans_unlock(trans);
- bch2_trans_begin(trans);
-
- /*
- * We have to be careful because another thread might be getting ready
- * to free as->b and calling btree_update_reparent() on us - we'll
- * recheck under btree_update_lock below:
- */
- b = READ_ONCE(as->b);
- if (b) {
- /*
- * @b is the node we did the final insert into:
- *
- * On failure to get a journal reservation, we still have to
- * unblock the write and allow most of the write path to happen
- * so that shutdown works, but the i->journal_seq mechanism
- * won't work to prevent the btree write from being visible (we
- * didn't get a journal sequence number) - instead
- * __bch2_btree_node_write() doesn't do the actual write if
- * we're in journal error state:
- */
-
- btree_path_idx_t path_idx = bch2_path_get_unlocked_mut(trans,
- as->btree_id, b->c.level, b->key.k.p);
- struct btree_path *path = trans->paths + path_idx;
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
- mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED);
- path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
- path->l[b->c.level].b = b;
-
- bch2_btree_node_lock_write_nofail(trans, path, &b->c);
-
- mutex_lock(&c->btree_interior_update_lock);
-
- list_del(&as->write_blocked_list);
- if (list_empty(&b->write_blocked))
- clear_btree_node_write_blocked(b);
-
- /*
- * Node might have been freed, recheck under
- * btree_interior_update_lock:
- */
- if (as->b == b) {
- BUG_ON(!b->c.level);
- BUG_ON(!btree_node_dirty(b));
-
- if (!ret) {
- struct bset *last = btree_bset_last(b);
-
- last->journal_seq = cpu_to_le64(
- max(journal_seq,
- le64_to_cpu(last->journal_seq)));
-
- bch2_btree_add_journal_pin(c, b, journal_seq);
- } else {
- /*
- * If we didn't get a journal sequence number we
- * can't write this btree node, because recovery
- * won't know to ignore this write:
- */
- set_btree_node_never_write(b);
- }
- }
-
- mutex_unlock(&c->btree_interior_update_lock);
-
- mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
- six_unlock_write(&b->c.lock);
-
- btree_node_write_if_need(trans, b, SIX_LOCK_intent);
- btree_node_unlock(trans, path, b->c.level);
- bch2_path_put(trans, path_idx, true);
- }
-
- bch2_journal_pin_drop(&c->journal, &as->journal);
-
- mutex_lock(&c->btree_interior_update_lock);
- for (i = 0; i < as->nr_new_nodes; i++) {
- b = as->new_nodes[i];
-
- BUG_ON(b->will_make_reachable != (unsigned long) as);
- b->will_make_reachable = 0;
- clear_btree_node_will_make_reachable(b);
- }
- mutex_unlock(&c->btree_interior_update_lock);
-
- for (i = 0; i < as->nr_new_nodes; i++) {
- b = as->new_nodes[i];
-
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
- btree_node_write_if_need(trans, b, SIX_LOCK_read);
- six_unlock_read(&b->c.lock);
- }
-
- for (i = 0; i < as->nr_open_buckets; i++)
- bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);
-
- bch2_btree_update_free(as, trans);
- bch2_trans_put(trans);
-}
-
-static void btree_interior_update_work(struct work_struct *work)
-{
- struct bch_fs *c =
- container_of(work, struct bch_fs, btree_interior_update_work);
- struct btree_update *as;
-
- while (1) {
- mutex_lock(&c->btree_interior_update_lock);
- as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
- struct btree_update, unwritten_list);
- if (as && !as->nodes_written)
- as = NULL;
- mutex_unlock(&c->btree_interior_update_lock);
-
- if (!as)
- break;
-
- btree_update_nodes_written(as);
- }
-}
-
-static CLOSURE_CALLBACK(btree_update_set_nodes_written)
-{
- closure_type(as, struct btree_update, cl);
- struct bch_fs *c = as->c;
-
- mutex_lock(&c->btree_interior_update_lock);
- as->nodes_written = true;
- mutex_unlock(&c->btree_interior_update_lock);
-
- queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
-}
-
-/*
- * We're updating @b with pointers to nodes that haven't finished writing yet:
- * block @b from being written until @as completes
- */
-static void btree_update_updated_node(struct btree_update *as, struct btree *b)
-{
- struct bch_fs *c = as->c;
-
- BUG_ON(as->mode != BTREE_UPDATE_none);
- BUG_ON(as->update_level_end < b->c.level);
- BUG_ON(!btree_node_dirty(b));
- BUG_ON(!b->c.level);
-
- mutex_lock(&c->btree_interior_update_lock);
- list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
-
- as->mode = BTREE_UPDATE_node;
- as->b = b;
- as->update_level_end = b->c.level;
-
- set_btree_node_write_blocked(b);
- list_add(&as->write_blocked_list, &b->write_blocked);
-
- mutex_unlock(&c->btree_interior_update_lock);
-}
-
-static int bch2_update_reparent_journal_pin_flush(struct journal *j,
- struct journal_entry_pin *_pin, u64 seq)
-{
- return 0;
-}
-
-static void btree_update_reparent(struct btree_update *as,
- struct btree_update *child)
-{
- struct bch_fs *c = as->c;
-
- lockdep_assert_held(&c->btree_interior_update_lock);
-
- child->b = NULL;
- child->mode = BTREE_UPDATE_update;
-
- bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal,
- bch2_update_reparent_journal_pin_flush);
-}
-
-static void btree_update_updated_root(struct btree_update *as, struct btree *b)
-{
- struct bkey_i *insert = &b->key;
- struct bch_fs *c = as->c;
-
- BUG_ON(as->mode != BTREE_UPDATE_none);
-
- BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
- ARRAY_SIZE(as->journal_entries));
-
- as->journal_u64s +=
- journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
- BCH_JSET_ENTRY_btree_root,
- b->c.btree_id, b->c.level,
- insert, insert->k.u64s);
-
- mutex_lock(&c->btree_interior_update_lock);
- list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
-
- as->mode = BTREE_UPDATE_root;
- mutex_unlock(&c->btree_interior_update_lock);
-}
-
-/*
- * bch2_btree_update_add_new_node:
- *
- * This causes @as to wait on @b to be written, before it gets to
- * bch2_btree_update_nodes_written
- *
- * Additionally, it sets b->will_make_reachable to prevent any additional writes
- * to @b from happening besides the first until @b is reachable on disk
- *
- * And it adds @b to the list of @as's new nodes, so that we can update sector
- * counts in bch2_btree_update_nodes_written:
- */
-static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
-{
- struct bch_fs *c = as->c;
-
- closure_get(&as->cl);
-
- mutex_lock(&c->btree_interior_update_lock);
- BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
- BUG_ON(b->will_make_reachable);
-
- as->new_nodes[as->nr_new_nodes++] = b;
- b->will_make_reachable = 1UL|(unsigned long) as;
- set_btree_node_will_make_reachable(b);
-
- mutex_unlock(&c->btree_interior_update_lock);
-
- btree_update_add_key(as, &as->new_keys, b);
-
- if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
- unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data;
- unsigned sectors = round_up(bytes, block_bytes(c)) >> 9;
-
- bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
- cpu_to_le16(sectors);
- }
-}
-
-/*
- * Remove @b from the btree_update that was making it reachable (if any),
- * dropping the ref it held on that btree_update:
- */
-static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
-{
- struct btree_update *as;
- unsigned long v;
- unsigned i;
-
- mutex_lock(&c->btree_interior_update_lock);
- /*
- * When b->will_make_reachable != 0, it owns a ref on as->cl that's
- * dropped when it gets written by bch2_btree_complete_write - the
- * xchg() is for synchronization with bch2_btree_complete_write:
- */
- v = xchg(&b->will_make_reachable, 0);
- clear_btree_node_will_make_reachable(b);
- as = (struct btree_update *) (v & ~1UL);
-
- if (!as) {
- mutex_unlock(&c->btree_interior_update_lock);
- return;
- }
-
- for (i = 0; i < as->nr_new_nodes; i++)
- if (as->new_nodes[i] == b)
- goto found;
-
- BUG();
-found:
- array_remove_item(as->new_nodes, as->nr_new_nodes, i);
- mutex_unlock(&c->btree_interior_update_lock);
-
- if (v & 1)
- closure_put(&as->cl);
-}
-
-static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
-{
- while (b->ob.nr)
- as->open_buckets[as->nr_open_buckets++] =
- b->ob.v[--b->ob.nr];
-}
-
-static int bch2_btree_update_will_free_node_journal_pin_flush(struct journal *j,
- struct journal_entry_pin *_pin, u64 seq)
-{
- return 0;
-}
-
-/*
- * @b is being split/rewritten: it may have pointers to not-yet-written btree
- * nodes and thus outstanding btree_updates - redirect @b's
- * btree_updates to point to this btree_update:
- */
-static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
- struct btree *b)
-{
- struct bch_fs *c = as->c;
- struct btree_update *p, *n;
- struct btree_write *w;
-
- set_btree_node_dying(b);
-
- if (btree_node_fake(b))
- return;
-
- mutex_lock(&c->btree_interior_update_lock);
-
- /*
- * Does this node have any btree_update operations preventing
- * it from being written?
- *
- * If so, redirect them to point to this btree_update: we can
- * write out our new nodes, but we won't make them visible until those
- * operations complete
- */
- list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
- list_del_init(&p->write_blocked_list);
- btree_update_reparent(as, p);
-
- /*
- * for flush_held_btree_writes() waiting on updates to flush or
- * nodes to be writeable:
- */
- closure_wake_up(&c->btree_interior_update_wait);
- }
-
- clear_btree_node_dirty_acct(c, b);
- clear_btree_node_need_write(b);
- clear_btree_node_write_blocked(b);
-
- /*
- * Does this node have unwritten data that has a pin on the journal?
- *
- * If so, transfer that pin to the btree_update operation -
- * note that if we're freeing multiple nodes, we only need to keep the
- * oldest pin of any of the nodes we're freeing. We'll release the pin
- * when the new nodes are persistent and reachable on disk:
- */
- w = btree_current_write(b);
- bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
- bch2_btree_update_will_free_node_journal_pin_flush);
- bch2_journal_pin_drop(&c->journal, &w->journal);
-
- w = btree_prev_write(b);
- bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
- bch2_btree_update_will_free_node_journal_pin_flush);
- bch2_journal_pin_drop(&c->journal, &w->journal);
-
- mutex_unlock(&c->btree_interior_update_lock);
-
- /*
- * Is this a node that isn't reachable on disk yet?
- *
- * Nodes that aren't reachable yet have writes blocked until they're
- * reachable - now that we've cancelled any pending writes and moved
- * things waiting on that write to wait on this update, we can drop this
- * node from the list of nodes that the other update is making
- * reachable, prior to freeing it:
- */
- btree_update_drop_new_node(c, b);
-
- btree_update_add_key(as, &as->old_keys, b);
-
- as->old_nodes[as->nr_old_nodes] = b;
- as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq;
- as->nr_old_nodes++;
-}
-
-static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans)
-{
- struct bch_fs *c = as->c;
- u64 start_time = as->start_time;
-
- BUG_ON(as->mode == BTREE_UPDATE_none);
-
- if (as->took_gc_lock)
- up_read(&as->c->gc_lock);
- as->took_gc_lock = false;
-
- bch2_btree_reserve_put(as, trans);
-
- continue_at(&as->cl, btree_update_set_nodes_written,
- as->c->btree_interior_update_worker);
-
- bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
- start_time);
-}
-
-static const char * const btree_node_reawrite_reason_strs[] = {
-#define x(n) #n,
- BTREE_NODE_REWRITE_REASON()
-#undef x
- NULL,
-};
-
-static struct btree_update *
-bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
- unsigned level_start, bool split,
- unsigned target, unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct btree_update *as;
- u64 start_time = local_clock();
- int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc)
- ? BCH_DISK_RESERVATION_NOFAIL : 0;
- unsigned nr_nodes[2] = { 0, 0 };
- unsigned level_end = level_start;
- enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
- int ret = 0;
- u32 restart_count = trans->restart_count;
-
- BUG_ON(!path->should_be_locked);
-
- if (watermark == BCH_WATERMARK_copygc)
- watermark = BCH_WATERMARK_btree_copygc;
- if (watermark < BCH_WATERMARK_btree)
- watermark = BCH_WATERMARK_btree;
-
- flags &= ~BCH_WATERMARK_MASK;
- flags |= watermark;
-
- if (watermark < BCH_WATERMARK_reclaim &&
- test_bit(JOURNAL_space_low, &c->journal.flags)) {
- if (flags & BCH_TRANS_COMMIT_journal_reclaim)
- return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock);
-
- ret = drop_locks_do(trans,
- ({ wait_event(c->journal.wait, !test_bit(JOURNAL_space_low, &c->journal.flags)); 0; }));
- if (ret)
- return ERR_PTR(ret);
- }
-
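-	/*
-	 * Figure out how many new nodes to reserve: walk up from level_start,
-	 * assuming a split (or compaction) at each level until we reach a node
-	 * with room for the new keys:
-	 */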
- while (1) {
- nr_nodes[!!level_end] += 1 + split;
- level_end++;
-
- ret = bch2_btree_path_upgrade(trans, path, level_end + 1);
- if (ret)
- return ERR_PTR(ret);
-
- if (!btree_path_node(path, level_end)) {
- /* Allocating new root? */
- nr_nodes[1] += split;
- level_end = BTREE_MAX_DEPTH;
- break;
- }
-
- /*
- * Always check for space for two keys, even if we won't have to
- * split at prior level - it might have been a merge instead:
- */
- if (bch2_btree_node_insert_fits(path->l[level_end].b,
- BKEY_BTREE_PTR_U64s_MAX * 2))
- break;
-
- split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
- }
-
- if (!down_read_trylock(&c->gc_lock)) {
- ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0));
- if (ret) {
- up_read(&c->gc_lock);
- return ERR_PTR(ret);
- }
- }
-
- as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS);
- memset(as, 0, sizeof(*as));
- closure_init(&as->cl, NULL);
- as->c = c;
- as->start_time = start_time;
- as->ip_started = _RET_IP_;
- as->mode = BTREE_UPDATE_none;
- as->flags = flags;
- as->took_gc_lock = true;
- as->btree_id = path->btree_id;
- as->update_level_start = level_start;
- as->update_level_end = level_end;
- INIT_LIST_HEAD(&as->list);
- INIT_LIST_HEAD(&as->unwritten_list);
- INIT_LIST_HEAD(&as->write_blocked_list);
- bch2_keylist_init(&as->old_keys, as->_old_keys);
- bch2_keylist_init(&as->new_keys, as->_new_keys);
- bch2_keylist_init(&as->parent_keys, as->inline_keys);
-
- mutex_lock(&c->btree_interior_update_lock);
- list_add_tail(&as->list, &c->btree_interior_update_list);
- mutex_unlock(&c->btree_interior_update_lock);
-
- struct btree *b = btree_path_node(path, path->level);
- as->node_start = b->data->min_key;
- as->node_end = b->data->max_key;
- as->node_needed_rewrite = btree_node_rewrite_reason(b);
- as->node_written = b->written;
- as->node_sectors = btree_buf_bytes(b) >> 9;
- as->node_remaining = __bch2_btree_u64s_remaining(b,
- btree_bkey_last(b, bset_tree_last(b)));
-
- /*
- * We don't want to allocate if we're in an error state, that can cause
- * deadlock on emergency shutdown due to open buckets getting stuck in
- * the btree_reserve_cache after allocator shutdown has cleared it out.
- * This check needs to come after adding us to the btree_interior_update
- * list but before calling bch2_btree_reserve_get, to synchronize with
- * __bch2_fs_read_only().
- */
- ret = bch2_journal_error(&c->journal);
- if (ret)
- goto err;
-
- ret = bch2_disk_reservation_get(c, &as->disk_res,
- (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
- READ_ONCE(c->opts.metadata_replicas),
- disk_res_flags);
- if (ret)
- goto err;
-
- ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, NULL);
- if (bch2_err_matches(ret, ENOSPC) ||
- bch2_err_matches(ret, ENOMEM)) {
- struct closure cl;
-
- /*
- * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
- * flag
- */
- if (bch2_err_matches(ret, ENOSPC) &&
- (flags & BCH_TRANS_COMMIT_journal_reclaim) &&
- watermark < BCH_WATERMARK_reclaim) {
- ret = bch_err_throw(c, journal_reclaim_would_deadlock);
- goto err;
- }
-
- closure_init_stack(&cl);
-
- do {
- ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, &cl);
- if (!bch2_err_matches(ret, BCH_ERR_operation_blocked))
- break;
- bch2_trans_unlock(trans);
- bch2_wait_on_allocator(c, &cl);
- } while (1);
- }
-
- if (ret) {
- trace_and_count(c, btree_reserve_get_fail, trans->fn,
- _RET_IP_, nr_nodes[0] + nr_nodes[1], ret);
- goto err;
- }
-
- ret = bch2_trans_relock(trans);
- if (ret)
- goto err;
-
- bch2_trans_verify_not_restarted(trans, restart_count);
- return as;
-err:
- bch2_btree_update_free(as, trans);
- if (!bch2_err_matches(ret, ENOSPC) &&
- !bch2_err_matches(ret, EROFS) &&
- ret != -BCH_ERR_journal_reclaim_would_deadlock &&
- ret != -BCH_ERR_journal_shutdown)
- bch_err_fn_ratelimited(c, ret);
- return ERR_PTR(ret);
-}
-
-/* Btree root updates: */
-
-static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
-{
- /* Root nodes cannot be reaped */
- mutex_lock(&c->btree_cache.lock);
- list_del_init(&b->list);
- mutex_unlock(&c->btree_cache.lock);
-
- mutex_lock(&c->btree_root_lock);
- bch2_btree_id_root(c, b->c.btree_id)->b = b;
- mutex_unlock(&c->btree_root_lock);
-
- bch2_recalc_btree_reserve(c);
-}
-
-static int bch2_btree_set_root(struct btree_update *as,
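-	/*
-	 * Not enough contiguous space at this write point for a full btree
-	 * node: mark the too-small buckets as full and retry the allocation:
-	 */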
- struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- bool nofail)
-{
- struct bch_fs *c = as->c;
-
- trace_and_count(c, btree_node_set_root, trans, b);
-
- struct btree *old = btree_node_root(c, b);
-
- /*
- * Ensure no one is using the old root while we switch to the
- * new root:
- */
- if (nofail) {
- bch2_btree_node_lock_write_nofail(trans, path, &old->c);
- } else {
- int ret = bch2_btree_node_lock_write(trans, path, &old->c);
- if (ret)
- return ret;
- }
-
- bch2_btree_set_root_inmem(c, b);
-
- btree_update_updated_root(as, b);
-
- /*
- * Unlock old root after new root is visible:
- *
- * The new root isn't persistent, but that's ok: we still have
- * an intent lock on the new root, and any updates that would
- * depend on the new root would have to update the new root.
- */
- bch2_btree_node_unlock_write(trans, path, old);
- return 0;
-}
-
-/* Interior node updates: */
-
-static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
- struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct btree_node_iter *node_iter,
- struct bkey_i *insert)
-{
- struct bch_fs *c = as->c;
- struct bkey_packed *k;
- struct printbuf buf = PRINTBUF;
- unsigned long old, new;
-
- BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
- !btree_ptr_sectors_written(bkey_i_to_s_c(insert)));
-
- if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)))
- bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
-
- struct bkey_validate_context from = (struct bkey_validate_context) {
- .from = BKEY_VALIDATE_btree_node,
- .level = b->c.level,
- .btree = b->c.btree_id,
- .flags = BCH_VALIDATE_commit,
- };
- if (bch2_bkey_validate(c, bkey_i_to_s_c(insert), from) ?:
- bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), from)) {
- bch2_fs_inconsistent(c, "%s: inserting invalid bkey", __func__);
- dump_stack();
- }
-
- BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
- ARRAY_SIZE(as->journal_entries));
-
- as->journal_u64s +=
- journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
- BCH_JSET_ENTRY_btree_keys,
- b->c.btree_id, b->c.level,
- insert, insert->k.u64s);
-
- while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
- bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
- bch2_btree_node_iter_advance(node_iter, b);
-
- bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
- set_btree_node_dirty_acct(c, b);
-
- old = READ_ONCE(b->flags);
- do {
- new = old;
-
- new &= ~BTREE_WRITE_TYPE_MASK;
- new |= BTREE_WRITE_interior;
- new |= 1 << BTREE_NODE_need_write;
- } while (!try_cmpxchg(&b->flags, &old, new));
-
- printbuf_exit(&buf);
-}
-
-static int
-bch2_btree_insert_keys_interior(struct btree_update *as,
- struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct btree_node_iter node_iter,
- struct keylist *keys)
-{
- struct bkey_i *insert = bch2_keylist_front(keys);
- struct bkey_packed *k;
-
- BUG_ON(btree_node_type(b) != BKEY_TYPE_btree);
-
- while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
- (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
- ;
-
- for (;
- insert != keys->top && bpos_le(insert->k.p, b->key.k.p);
- insert = bkey_next(insert))
- bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert);
-
- int ret = bch2_btree_node_check_topology(trans, b);
- if (ret) {
- struct printbuf buf = PRINTBUF;
-
- for (struct bkey_i *k = keys->keys;
- k != insert;
- k = bkey_next(k)) {
- bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k));
- prt_newline(&buf);
- }
-
- bch2_fs_fatal_error(as->c, "%ps -> %s(): check_topology error %s: inserted keys\n%s",
- (void *) _RET_IP_, __func__, bch2_err_str(ret), buf.buf);
- dump_stack();
- return ret;
- }
-
- memmove_u64s_down(keys->keys, insert, keys->top_p - insert->_data);
- keys->top_p -= insert->_data - keys->keys_p;
- return 0;
-}
-
-static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos)
-{
- if (insert_keys)
- for_each_keylist_key(insert_keys, k)
- if (bkey_deleted(&k->k) && bpos_eq(k->k.p, pos))
- return true;
- return false;
-}
-
-/*
- * Move keys from n1 (original replacement node, now lower node) to n2 (higher
- * node)
- */
-static void __btree_split_node(struct btree_update *as,
- struct btree_trans *trans,
- struct btree *b,
- struct btree *n[2],
- struct keylist *insert_keys)
-{
- struct bkey_packed *k;
- struct bpos n1_pos = POS_MIN;
- struct btree_node_iter iter;
- struct bset *bsets[2];
- struct bkey_format_state format[2];
- struct bkey_packed *out[2];
- struct bkey uk;
- unsigned u64s, n1_u64s = (b->nr.live_u64s * 3) / 5;
- struct { unsigned nr_keys, val_u64s; } nr_keys[2];
- int i;
-
- memset(&nr_keys, 0, sizeof(nr_keys));
-
- for (i = 0; i < 2; i++) {
- BUG_ON(n[i]->nsets != 1);
-
- bsets[i] = btree_bset_first(n[i]);
- out[i] = bsets[i]->start;
-
- SET_BTREE_NODE_SEQ(n[i]->data, BTREE_NODE_SEQ(b->data) + 1);
- bch2_bkey_format_init(&format[i]);
- }
-
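-	/*
-	 * First pass: pick the pivot (n1_pos) and build up the packed key
-	 * formats for the two new nodes:
-	 */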
- u64s = 0;
- for_each_btree_node_key(b, k, &iter) {
- if (bkey_deleted(k))
- continue;
-
- uk = bkey_unpack_key(b, k);
-
- if (b->c.level &&
- u64s < n1_u64s &&
- u64s + k->u64s >= n1_u64s &&
- (bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p) ||
- key_deleted_in_insert(insert_keys, uk.p)))
- n1_u64s += k->u64s;
-
- i = u64s >= n1_u64s;
- u64s += k->u64s;
- if (!i)
- n1_pos = uk.p;
- bch2_bkey_format_add_key(&format[i], &uk);
-
- nr_keys[i].nr_keys++;
- nr_keys[i].val_u64s += bkeyp_val_u64s(&b->format, k);
- }
-
- btree_set_min(n[0], b->data->min_key);
- btree_set_max(n[0], n1_pos);
- btree_set_min(n[1], bpos_successor(n1_pos));
- btree_set_max(n[1], b->data->max_key);
-
- for (i = 0; i < 2; i++) {
- bch2_bkey_format_add_pos(&format[i], n[i]->data->min_key);
- bch2_bkey_format_add_pos(&format[i], n[i]->data->max_key);
-
- n[i]->data->format = bch2_bkey_format_done(&format[i]);
-
- unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s +
- nr_keys[i].val_u64s;
- if (__vstruct_bytes(struct btree_node, u64s) > btree_buf_bytes(b))
- n[i]->data->format = b->format;
-
- btree_node_set_format(n[i], n[i]->data->format);
- }
-
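-	/*
-	 * Second pass: copy keys into the new nodes, repacking them into each
-	 * node's format (unpacking them if they can't be repacked):
-	 */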
- u64s = 0;
- for_each_btree_node_key(b, k, &iter) {
- if (bkey_deleted(k))
- continue;
-
- i = u64s >= n1_u64s;
- u64s += k->u64s;
-
- if (bch2_bkey_transform(&n[i]->format, out[i], bkey_packed(k)
- ? &b->format: &bch2_bkey_format_current, k))
- out[i]->format = KEY_FORMAT_LOCAL_BTREE;
- else
- bch2_bkey_unpack(b, (void *) out[i], k);
-
- out[i]->needs_whiteout = false;
-
- btree_keys_account_key_add(&n[i]->nr, 0, out[i]);
- out[i] = bkey_p_next(out[i]);
- }
-
- for (i = 0; i < 2; i++) {
- bsets[i]->u64s = cpu_to_le16((u64 *) out[i] - bsets[i]->_data);
-
- BUG_ON(!bsets[i]->u64s);
-
- set_btree_bset_end(n[i], n[i]->set);
-
- btree_node_reset_sib_u64s(n[i]);
-
- bch2_verify_btree_nr_keys(n[i]);
-
- BUG_ON(bch2_btree_node_check_topology(trans, n[i]));
- }
-}
-
-/*
- * For updates to interior nodes, we've got to do the insert before we split
- * because the stuff we're inserting has to be inserted atomically. Post split,
- * the keys might have to go in different nodes and the split would no longer be
- * atomic.
- *
- * Worse, if the insert is from btree node coalescing, if we do the insert after
- * we do the split (and pick the pivot) - the pivot we pick might be between
- * nodes that were coalesced, and thus in the middle of a child node post
- * coalescing:
- */
-static int btree_split_insert_keys(struct btree_update *as,
- struct btree_trans *trans,
- btree_path_idx_t path_idx,
- struct btree *b,
- struct keylist *keys)
-{
- struct btree_path *path = trans->paths + path_idx;
-
- if (!bch2_keylist_empty(keys) &&
- bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) {
- struct btree_node_iter node_iter;
-
- bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);
-
- int ret = bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static int btree_split(struct btree_update *as, struct btree_trans *trans,
- btree_path_idx_t path, struct btree *b,
- struct keylist *keys)
-{
- struct bch_fs *c = as->c;
- struct btree *parent = btree_node_parent(trans->paths + path, b);
- struct btree *n1, *n2 = NULL, *n3 = NULL;
- btree_path_idx_t path1 = 0, path2 = 0;
- u64 start_time = local_clock();
- int ret = 0;
-
- bch2_verify_btree_nr_keys(b);
- BUG_ON(!parent && (b != btree_node_root(c, b)));
- BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1));
-
- ret = bch2_btree_node_check_topology(trans, b);
- if (ret)
- return ret;
-
- if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
- struct btree *n[2];
-
- trace_and_count(c, btree_node_split, trans, b);
-
- n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level);
- n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level);
-
- __btree_split_node(as, trans, b, n, keys);
-
- if (keys) {
- ret = btree_split_insert_keys(as, trans, path, n1, keys) ?:
- btree_split_insert_keys(as, trans, path, n2, keys);
- if (ret)
- goto err;
- BUG_ON(!bch2_keylist_empty(keys));
- }
-
- bch2_btree_build_aux_trees(n2);
- bch2_btree_build_aux_trees(n1);
-
- bch2_btree_update_add_new_node(as, n1);
- bch2_btree_update_add_new_node(as, n2);
- six_unlock_write(&n2->c.lock);
- six_unlock_write(&n1->c.lock);
-
- path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p);
- six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, trans->paths + path1, n1);
-
- path2 = bch2_path_get_unlocked_mut(trans, as->btree_id, n2->c.level, n2->key.k.p);
- six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, trans->paths + path2, n2);
-
- /*
- * Note that in the recursive case parent_keys == keys, so we
- * can't start adding new keys to parent_keys before emptying it
- * out (which we did with btree_split_insert_keys() above)
- */
- bch2_keylist_add(&as->parent_keys, &n1->key);
- bch2_keylist_add(&as->parent_keys, &n2->key);
-
- if (!parent) {
- /* Depth increases, make a new root */
- n3 = __btree_root_alloc(as, trans, b->c.level + 1);
-
- bch2_btree_update_add_new_node(as, n3);
- six_unlock_write(&n3->c.lock);
-
- trans->paths[path2].locks_want++;
- BUG_ON(btree_node_locked(trans->paths + path2, n3->c.level));
- six_lock_increment(&n3->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, trans->paths + path2, n3->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, trans->paths + path2, n3);
-
- n3->sib_u64s[0] = U16_MAX;
- n3->sib_u64s[1] = U16_MAX;
-
- ret = btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
- if (ret)
- goto err;
- }
- } else {
- trace_and_count(c, btree_node_compact, trans, b);
-
- n1 = bch2_btree_node_alloc_replacement(as, trans, b);
-
- if (keys) {
- ret = btree_split_insert_keys(as, trans, path, n1, keys);
- if (ret)
- goto err;
- BUG_ON(!bch2_keylist_empty(keys));
- }
-
- bch2_btree_build_aux_trees(n1);
- bch2_btree_update_add_new_node(as, n1);
- six_unlock_write(&n1->c.lock);
-
- path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p);
- six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, trans->paths + path1, n1);
-
- if (parent)
- bch2_keylist_add(&as->parent_keys, &n1->key);
- }
-
- /* New nodes all written, now make them visible: */
-
- if (parent) {
- /* Split a non root node */
- ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
- } else if (n3) {
- ret = bch2_btree_set_root(as, trans, trans->paths + path, n3, false);
- } else {
- /* Root filled up but didn't need to be split */
- ret = bch2_btree_set_root(as, trans, trans->paths + path, n1, false);
- }
-
- if (ret)
- goto err;
-
- bch2_btree_interior_update_will_free_node(as, b);
-
- if (n3) {
- bch2_btree_update_get_open_buckets(as, n3);
- bch2_btree_node_write_trans(trans, n3, SIX_LOCK_intent, 0);
- }
- if (n2) {
- bch2_btree_update_get_open_buckets(as, n2);
- bch2_btree_node_write_trans(trans, n2, SIX_LOCK_intent, 0);
- }
- bch2_btree_update_get_open_buckets(as, n1);
- bch2_btree_node_write_trans(trans, n1, SIX_LOCK_intent, 0);
-
- /*
- * The old node must be freed (in memory) _before_ unlocking the new
- * nodes - else another thread could re-acquire a read lock on the old
- * node after another thread has locked and updated the new node, thus
- * seeing stale data:
- */
- bch2_btree_node_free_inmem(trans, trans->paths + path, b);
-
- if (n3)
- bch2_trans_node_add(trans, trans->paths + path, n3);
- if (n2)
- bch2_trans_node_add(trans, trans->paths + path2, n2);
- bch2_trans_node_add(trans, trans->paths + path1, n1);
-
- if (n3)
- six_unlock_intent(&n3->c.lock);
- if (n2)
- six_unlock_intent(&n2->c.lock);
- six_unlock_intent(&n1->c.lock);
-out:
- if (path2) {
- __bch2_btree_path_unlock(trans, trans->paths + path2);
- bch2_path_put(trans, path2, true);
- }
- if (path1) {
- __bch2_btree_path_unlock(trans, trans->paths + path1);
- bch2_path_put(trans, path1, true);
- }
-
- bch2_trans_verify_locks(trans);
-
- bch2_time_stats_update(&c->times[n2
- ? BCH_TIME_btree_node_split
- : BCH_TIME_btree_node_compact],
- start_time);
- return ret;
-err:
- if (n3)
- bch2_btree_node_free_never_used(as, trans, n3);
- if (n2)
- bch2_btree_node_free_never_used(as, trans, n2);
- bch2_btree_node_free_never_used(as, trans, n1);
- goto out;
-}
-
-/**
- * bch2_btree_insert_node - insert bkeys into a given btree node
- *
- * @as: btree_update object
- * @trans: btree_trans object
- * @path_idx: path that points to current node
- * @b: node to insert keys into
- * @keys: list of keys to insert
- *
- * Returns: 0 on success, typically transaction restart error on failure
- *
- * Inserts as many keys as it can into a given btree node, splitting it if full.
- * If a split occurred, this function will return early. This can only happen
- * for leaf nodes -- inserts into interior nodes have to be atomic.
- */
-static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
- btree_path_idx_t path_idx, struct btree *b,
- struct keylist *keys)
-{
- struct bch_fs *c = as->c;
- struct btree_path *path = trans->paths + path_idx, *linked;
- unsigned i;
- int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
- int old_live_u64s = b->nr.live_u64s;
- int live_u64s_added, u64s_added;
- int ret;
-
- lockdep_assert_held(&c->gc_lock);
- BUG_ON(!b->c.level);
- BUG_ON(!as || as->b);
- bch2_verify_keylist_sorted(keys);
-
- if (!btree_node_intent_locked(path, b->c.level)) {
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
- prt_printf(&buf, "%s(): node not locked at level %u\n",
- __func__, b->c.level);
- bch2_btree_update_to_text(&buf, as);
- bch2_btree_path_to_text(&buf, trans, path_idx);
- bch2_fs_emergency_read_only2(c, &buf);
-
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- return -EIO;
- }
-
- ret = bch2_btree_node_lock_write(trans, path, &b->c);
- if (ret)
- return ret;
-
- bch2_btree_node_prep_for_write(trans, path, b);
-
- if (!bch2_btree_node_insert_fits(b, bch2_keylist_u64s(keys))) {
- bch2_btree_node_unlock_write(trans, path, b);
- goto split;
- }
-
- ret = bch2_btree_node_check_topology(trans, b) ?:
- bch2_btree_insert_keys_interior(as, trans, path, b,
- path->l[b->c.level].iter, keys);
- if (ret) {
- bch2_btree_node_unlock_write(trans, path, b);
- return ret;
- }
-
- trans_for_each_path_with_node(trans, b, linked, i)
- bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
-
- bch2_trans_verify_paths(trans);
-
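-	/*
-	 * Adjust the cached sibling merge size estimates, and try compacting
-	 * whiteouts if this insert left dead u64s behind:
-	 */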
- live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
- u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
-
- if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
- b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
- if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
- b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
-
- if (u64s_added > live_u64s_added &&
- bch2_maybe_compact_whiteouts(c, b))
- bch2_trans_node_reinit_iter(trans, b);
-
- btree_update_updated_node(as, b);
- bch2_btree_node_unlock_write(trans, path, b);
- return 0;
-split:
- /*
- * We could attempt to avoid the transaction restart, by calling
- * bch2_btree_path_upgrade() and allocating more nodes:
- */
- if (b->c.level >= as->update_level_end) {
- trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
- }
-
- return btree_split(as, trans, path_idx, b, keys);
-}
-
-int bch2_btree_split_leaf(struct btree_trans *trans,
- btree_path_idx_t path,
- unsigned flags)
-{
- /* btree_split & merge may both cause paths array to be reallocated */
- struct btree *b = path_l(trans->paths + path)->b;
- struct btree_update *as;
- unsigned l;
- int ret = 0;
-
- as = bch2_btree_update_start(trans, trans->paths + path,
- trans->paths[path].level,
- true, 0, flags);
- if (IS_ERR(as))
- return PTR_ERR(as);
-
- ret = btree_split(as, trans, path, b, NULL);
- if (ret) {
- bch2_btree_update_free(as, trans);
- return ret;
- }
-
- bch2_btree_update_done(as, trans);
-
- for (l = trans->paths[path].level + 1;
- btree_node_intent_locked(&trans->paths[path], l) && !ret;
- l++)
- ret = bch2_foreground_maybe_merge(trans, path, l, flags);
-
- return ret;
-}
-
-static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans,
- btree_path_idx_t path_idx)
-{
- struct bch_fs *c = as->c;
- struct btree_path *path = trans->paths + path_idx;
- struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b;
-
- BUG_ON(!btree_node_locked(path, b->c.level));
-
- n = __btree_root_alloc(as, trans, b->c.level + 1);
-
- bch2_btree_update_add_new_node(as, n);
- six_unlock_write(&n->c.lock);
-
- path->locks_want++;
- BUG_ON(btree_node_locked(path, n->c.level));
- six_lock_increment(&n->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, path, n);
-
- n->sib_u64s[0] = U16_MAX;
- n->sib_u64s[1] = U16_MAX;
-
- bch2_keylist_add(&as->parent_keys, &b->key);
- btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys);
-
- int ret = bch2_btree_set_root(as, trans, path, n, true);
- BUG_ON(ret);
-
- bch2_btree_update_get_open_buckets(as, n);
- bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);
- bch2_trans_node_add(trans, path, n);
- six_unlock_intent(&n->c.lock);
-
- mutex_lock(&c->btree_cache.lock);
- list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list);
- mutex_unlock(&c->btree_cache.lock);
-
- bch2_trans_verify_locks(trans);
-}
-
-int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b;
-
- if (btree_node_fake(b))
- return bch2_btree_split_leaf(trans, path, flags);
-
- struct btree_update *as =
- bch2_btree_update_start(trans, trans->paths + path, b->c.level,
- true, 0, flags);
- if (IS_ERR(as))
- return PTR_ERR(as);
-
- __btree_increase_depth(as, trans, path);
- bch2_btree_update_done(as, trans);
- return 0;
-}
-
-int __bch2_foreground_maybe_merge(struct btree_trans *trans,
- btree_path_idx_t path,
- unsigned level,
- unsigned flags,
- enum btree_node_sibling sib)
-{
- struct bch_fs *c = trans->c;
- struct btree_update *as;
- struct bkey_format_state new_s;
- struct bkey_format new_f;
- struct bkey_i delete;
- struct btree *b, *m, *n, *prev, *next, *parent;
- struct bpos sib_pos;
- size_t sib_u64s;
- enum btree_id btree = trans->paths[path].btree_id;
- btree_path_idx_t sib_path = 0, new_path = 0;
- u64 start_time = local_clock();
- int ret = 0;
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
- BUG_ON(!trans->paths[path].should_be_locked);
- BUG_ON(!btree_node_locked(&trans->paths[path], level));
-
- /*
- * Work around a deadlock caused by the btree write buffer not doing
- * merges and leaving tons of merges for us to do - we really don't need
- * to be doing merges at all from the interior update path, and if the
- * interior update path is generating too many new interior updates we
- * deadlock:
- */
- if ((flags & BCH_WATERMARK_MASK) == BCH_WATERMARK_interior_updates)
- return 0;
-
- if ((flags & BCH_WATERMARK_MASK) <= BCH_WATERMARK_reclaim) {
- flags &= ~BCH_WATERMARK_MASK;
- flags |= BCH_WATERMARK_btree;
- flags |= BCH_TRANS_COMMIT_journal_reclaim;
- }
-
- b = trans->paths[path].l[level].b;
-
- if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) ||
- (sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) {
- b->sib_u64s[sib] = U16_MAX;
- return 0;
- }
-
- sib_pos = sib == btree_prev_sib
- ? bpos_predecessor(b->data->min_key)
- : bpos_successor(b->data->max_key);
-
- sib_path = bch2_path_get(trans, btree, sib_pos,
- U8_MAX, level, BTREE_ITER_intent, _THIS_IP_);
- ret = bch2_btree_path_traverse(trans, sib_path, false);
- if (ret)
- goto err;
-
- btree_path_set_should_be_locked(trans, trans->paths + sib_path);
-
- m = trans->paths[sib_path].l[level].b;
-
- if (btree_node_parent(trans->paths + path, b) !=
- btree_node_parent(trans->paths + sib_path, m)) {
- b->sib_u64s[sib] = U16_MAX;
- goto out;
- }
-
- if (sib == btree_prev_sib) {
- prev = m;
- next = b;
- } else {
- prev = b;
- next = m;
- }
-
- if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) {
- struct printbuf buf = PRINTBUF;
-
- printbuf_indent_add_nextline(&buf, 2);
- prt_printf(&buf, "%s(): ", __func__);
- ret = __bch2_topology_error(c, &buf);
- prt_newline(&buf);
-
- prt_printf(&buf, "prev ends at ");
- bch2_bpos_to_text(&buf, prev->data->max_key);
- prt_newline(&buf);
-
- prt_printf(&buf, "next starts at ");
- bch2_bpos_to_text(&buf, next->data->min_key);
-
- bch_err(c, "%s", buf.buf);
- printbuf_exit(&buf);
- goto err;
- }
-
- bch2_bkey_format_init(&new_s);
- bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
- __bch2_btree_calc_format(&new_s, prev);
- __bch2_btree_calc_format(&new_s, next);
- bch2_bkey_format_add_pos(&new_s, next->data->max_key);
- new_f = bch2_bkey_format_done(&new_s);
-
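-	/*
-	 * Estimate the size of the merged node; apply hysteresis so that a
-	 * merge won't be immediately followed by another split:
-	 */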
- sib_u64s = btree_node_u64s_with_format(b->nr, &b->format, &new_f) +
- btree_node_u64s_with_format(m->nr, &m->format, &new_f);
-
- if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
- sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
- sib_u64s /= 2;
- sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
- }
-
- sib_u64s = min(sib_u64s, btree_max_u64s(c));
- sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1);
- b->sib_u64s[sib] = sib_u64s;
-
- if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
- goto out;
-
- parent = btree_node_parent(trans->paths + path, b);
- as = bch2_btree_update_start(trans, trans->paths + path, level, false,
- 0, BCH_TRANS_COMMIT_no_enospc|flags);
- ret = PTR_ERR_OR_ZERO(as);
- if (ret)
- goto err;
-
- as->node_start = prev->data->min_key;
- as->node_end = next->data->max_key;
-
- trace_and_count(c, btree_node_merge, trans, b);
-
- n = bch2_btree_node_alloc(as, trans, b->c.level);
-
- SET_BTREE_NODE_SEQ(n->data,
- max(BTREE_NODE_SEQ(b->data),
- BTREE_NODE_SEQ(m->data)) + 1);
-
- btree_set_min(n, prev->data->min_key);
- btree_set_max(n, next->data->max_key);
-
- n->data->format = new_f;
- btree_node_set_format(n, new_f);
-
- bch2_btree_sort_into(c, n, prev);
- bch2_btree_sort_into(c, n, next);
-
- bch2_btree_build_aux_trees(n);
- bch2_btree_update_add_new_node(as, n);
- six_unlock_write(&n->c.lock);
-
- new_path = bch2_path_get_unlocked_mut(trans, btree, n->c.level, n->key.k.p);
- six_lock_increment(&n->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, trans->paths + new_path, n);
-
- bkey_init(&delete.k);
- delete.k.p = prev->key.k.p;
- bch2_keylist_add(&as->parent_keys, &delete);
- bch2_keylist_add(&as->parent_keys, &n->key);
-
- bch2_trans_verify_paths(trans);
-
- ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
- if (ret)
- goto err_free_update;
-
- bch2_btree_interior_update_will_free_node(as, b);
- bch2_btree_interior_update_will_free_node(as, m);
-
- bch2_trans_verify_paths(trans);
-
- bch2_btree_update_get_open_buckets(as, n);
- bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);
-
- bch2_btree_node_free_inmem(trans, trans->paths + path, b);
- bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m);
-
- bch2_trans_node_add(trans, trans->paths + path, n);
-
- bch2_trans_verify_paths(trans);
-
- six_unlock_intent(&n->c.lock);
-
- bch2_btree_update_done(as, trans);
-
- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
-out:
-err:
- if (new_path)
- bch2_path_put(trans, new_path, true);
- bch2_path_put(trans, sib_path, true);
- bch2_trans_verify_locks(trans);
- if (ret == -BCH_ERR_journal_reclaim_would_deadlock)
- ret = 0;
- if (!ret)
- ret = bch2_trans_relock(trans);
- return ret;
-err_free_update:
- bch2_btree_node_free_never_used(as, trans, n);
- bch2_btree_update_free(as, trans);
- goto out;
-}
-
-static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter,
- struct btree *b)
-{
- bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p,
- BTREE_MAX_DEPTH, b->c.level,
- BTREE_ITER_intent);
- int ret = bch2_btree_iter_traverse(trans, iter);
- if (ret)
- goto err;
-
- /* has node been freed? */
- if (btree_iter_path(trans, iter)->l[b->c.level].b != b) {
- /* node has been freed: */
- BUG_ON(!btree_node_dying(b));
- ret = bch_err_throw(trans->c, btree_node_dying);
- goto err;
- }
-
- BUG_ON(!btree_node_hashed(b));
- return 0;
-err:
- bch2_trans_iter_exit(trans, iter);
- return ret;
-}
-
-int bch2_btree_node_rewrite(struct btree_trans *trans,
- struct btree_iter *iter,
- struct btree *b,
- unsigned target,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct btree *n, *parent;
- struct btree_update *as;
- btree_path_idx_t new_path = 0;
- int ret;
-
- flags |= BCH_TRANS_COMMIT_no_enospc;
-
- struct btree_path *path = btree_iter_path(trans, iter);
- parent = btree_node_parent(path, b);
- as = bch2_btree_update_start(trans, path, b->c.level,
- false, target, flags);
- ret = PTR_ERR_OR_ZERO(as);
- if (ret)
- goto out;
-
- n = bch2_btree_node_alloc_replacement(as, trans, b);
-
- bch2_btree_build_aux_trees(n);
- bch2_btree_update_add_new_node(as, n);
- six_unlock_write(&n->c.lock);
-
- new_path = bch2_path_get_unlocked_mut(trans, iter->btree_id, n->c.level, n->key.k.p);
- six_lock_increment(&n->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, trans->paths + new_path, n);
-
- trace_and_count(c, btree_node_rewrite, trans, b);
-
- if (parent) {
- bch2_keylist_add(&as->parent_keys, &n->key);
- ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys);
- } else {
- ret = bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n, false);
- }
-
- if (ret)
- goto err;
-
- bch2_btree_interior_update_will_free_node(as, b);
-
- bch2_btree_update_get_open_buckets(as, n);
- bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);
-
- bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b);
-
- bch2_trans_node_add(trans, trans->paths + iter->path, n);
- six_unlock_intent(&n->c.lock);
-
- bch2_btree_update_done(as, trans);
-out:
- if (new_path)
- bch2_path_put(trans, new_path, true);
- bch2_trans_downgrade(trans);
- return ret;
-err:
- bch2_btree_node_free_never_used(as, trans, n);
- bch2_btree_update_free(as, trans);
- goto out;
-}
-
-int bch2_btree_node_rewrite_key(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_i *k, unsigned flags)
-{
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter,
- btree, k->k.p,
- BTREE_MAX_DEPTH, level, 0);
- struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
- int ret = PTR_ERR_OR_ZERO(b);
- if (ret)
- goto out;
-
- bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k);
- ret = found
- ? bch2_btree_node_rewrite(trans, &iter, b, 0, flags)
- : -ENOENT;
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_btree_node_rewrite_pos(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bpos pos,
- unsigned target,
- unsigned flags)
-{
- BUG_ON(!level);
-
- /* Traverse one depth lower to get a pointer to the node itself: */
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, btree, pos, 0, level - 1, 0);
- struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
- int ret = PTR_ERR_OR_ZERO(b);
- if (ret)
- goto err;
-
- ret = bch2_btree_node_rewrite(trans, &iter, b, target, flags);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans,
- struct btree *b, unsigned flags)
-{
- struct btree_iter iter;
- int ret = get_iter_to_node(trans, &iter, b);
- if (ret)
- return ret == -BCH_ERR_btree_node_dying ? 0 : ret;
-
- ret = bch2_btree_node_rewrite(trans, &iter, b, 0, flags);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-struct async_btree_rewrite {
- struct bch_fs *c;
- struct work_struct work;
- struct list_head list;
- enum btree_id btree_id;
- unsigned level;
- struct bkey_buf key;
-};
-
-static void async_btree_node_rewrite_work(struct work_struct *work)
-{
- struct async_btree_rewrite *a =
- container_of(work, struct async_btree_rewrite, work);
- struct bch_fs *c = a->c;
-
- int ret = bch2_trans_do(c, bch2_btree_node_rewrite_key(trans,
- a->btree_id, a->level, a->key.k, 0));
- if (!bch2_err_matches(ret, ENOENT) &&
- !bch2_err_matches(ret, EROFS))
- bch_err_fn_ratelimited(c, ret);
-
- spin_lock(&c->btree_node_rewrites_lock);
- list_del(&a->list);
- spin_unlock(&c->btree_node_rewrites_lock);
-
- closure_wake_up(&c->btree_node_rewrites_wait);
-
- bch2_bkey_buf_exit(&a->key, c);
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_node_rewrite);
- kfree(a);
-}
-
-void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
-{
- struct async_btree_rewrite *a = kmalloc(sizeof(*a), GFP_NOFS);
- if (!a)
- return;
-
- a->c = c;
- a->btree_id = b->c.btree_id;
- a->level = b->c.level;
- INIT_WORK(&a->work, async_btree_node_rewrite_work);
-
- bch2_bkey_buf_init(&a->key);
- bch2_bkey_buf_copy(&a->key, c, &b->key);
-
- bool now = false, pending = false;
-
- spin_lock(&c->btree_node_rewrites_lock);
- if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay) &&
- enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_node_rewrite)) {
- list_add(&a->list, &c->btree_node_rewrites);
- now = true;
- } else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) {
- list_add(&a->list, &c->btree_node_rewrites_pending);
- pending = true;
- }
- spin_unlock(&c->btree_node_rewrites_lock);
-
- if (now) {
- queue_work(c->btree_node_rewrite_worker, &a->work);
- } else if (pending) {
- /* bch2_do_pending_node_rewrites will execute */
- } else {
- bch2_bkey_buf_exit(&a->key, c);
- kfree(a);
- }
-}
-
-void bch2_async_btree_node_rewrites_flush(struct bch_fs *c)
-{
- closure_wait_event(&c->btree_node_rewrites_wait,
- list_empty(&c->btree_node_rewrites));
-}
-
-void bch2_do_pending_node_rewrites(struct bch_fs *c)
-{
- while (1) {
- spin_lock(&c->btree_node_rewrites_lock);
- struct async_btree_rewrite *a =
- list_pop_entry(&c->btree_node_rewrites_pending,
- struct async_btree_rewrite, list);
- if (a)
- list_add(&a->list, &c->btree_node_rewrites);
- spin_unlock(&c->btree_node_rewrites_lock);
-
- if (!a)
- break;
-
- enumerated_ref_get(&c->writes, BCH_WRITE_REF_node_rewrite);
- queue_work(c->btree_node_rewrite_worker, &a->work);
- }
-}
-
-void bch2_free_pending_node_rewrites(struct bch_fs *c)
-{
- while (1) {
- spin_lock(&c->btree_node_rewrites_lock);
- struct async_btree_rewrite *a =
- list_pop_entry(&c->btree_node_rewrites_pending,
- struct async_btree_rewrite, list);
- spin_unlock(&c->btree_node_rewrites_lock);
-
- if (!a)
- break;
-
- bch2_bkey_buf_exit(&a->key, c);
- kfree(a);
- }
-}
-
-static int __bch2_btree_node_update_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct btree *b, struct btree *new_hash,
- struct bkey_i *new_key,
- unsigned commit_flags,
- bool skip_triggers)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter2 = {};
- struct btree *parent;
- int ret;
-
- if (!skip_triggers) {
- ret = bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1,
- bkey_i_to_s_c(&b->key),
- BTREE_TRIGGER_transactional) ?:
- bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1,
- bkey_i_to_s(new_key),
- BTREE_TRIGGER_transactional);
- if (ret)
- return ret;
- }
-
- if (new_hash) {
- bkey_copy(&new_hash->key, new_key);
- ret = bch2_btree_node_hash_insert(&c->btree_cache,
- new_hash, b->c.level, b->c.btree_id);
- BUG_ON(ret);
- }
-
- parent = btree_node_parent(btree_iter_path(trans, iter), b);
- if (parent) {
- bch2_trans_copy_iter(trans, &iter2, iter);
-
- iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
- iter2.flags & BTREE_ITER_intent,
- _THIS_IP_);
-
- struct btree_path *path2 = btree_iter_path(trans, &iter2);
- BUG_ON(path2->level != b->c.level);
- BUG_ON(!bpos_eq(path2->pos, new_key->k.p));
-
- btree_path_set_level_up(trans, path2);
-
- trans->paths_sorted = false;
-
- ret = bch2_btree_iter_traverse(trans, &iter2) ?:
- bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun);
- if (ret)
- goto err;
- } else {
- BUG_ON(btree_node_root(c, b) != b);
-
- struct jset_entry *e = bch2_trans_jset_entry_alloc(trans,
- jset_u64s(new_key->k.u64s));
- ret = PTR_ERR_OR_ZERO(e);
- if (ret)
- return ret;
-
- journal_entry_set(e,
- BCH_JSET_ENTRY_btree_root,
- b->c.btree_id, b->c.level,
- new_key, new_key->k.u64s);
- }
-
- ret = bch2_trans_commit(trans, NULL, NULL, commit_flags);
- if (ret)
- goto err;
-
- bch2_btree_node_lock_write_nofail(trans, btree_iter_path(trans, iter), &b->c);
-
- if (new_hash) {
- mutex_lock(&c->btree_cache.lock);
- bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
-
- __bch2_btree_node_hash_remove(&c->btree_cache, b);
-
- bkey_copy(&b->key, new_key);
- ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
- BUG_ON(ret);
- mutex_unlock(&c->btree_cache.lock);
- } else {
- bkey_copy(&b->key, new_key);
- }
-
- bch2_btree_node_unlock_write(trans, btree_iter_path(trans, iter), b);
-out:
- bch2_trans_iter_exit(trans, &iter2);
- return ret;
-err:
- if (new_hash) {
- mutex_lock(&c->btree_cache.lock);
- bch2_btree_node_hash_remove(&c->btree_cache, b);
- mutex_unlock(&c->btree_cache.lock);
- }
- goto out;
-}
-
-int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter,
- struct btree *b, struct bkey_i *new_key,
- unsigned commit_flags, bool skip_triggers)
-{
- struct bch_fs *c = trans->c;
- struct btree *new_hash = NULL;
- struct btree_path *path = btree_iter_path(trans, iter);
- struct closure cl;
- int ret = 0;
-
- ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1);
- if (ret)
- return ret;
-
- closure_init_stack(&cl);
-
- /*
- * check btree_ptr_hash_val() after @b is locked by
- * btree_iter_traverse():
- */
- if (btree_ptr_hash_val(new_key) != b->hash_val) {
- ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
- if (ret) {
- ret = drop_locks_do(trans, (closure_sync(&cl), 0));
- if (ret)
- return ret;
- }
-
- new_hash = bch2_btree_node_mem_alloc(trans, false);
- ret = PTR_ERR_OR_ZERO(new_hash);
- if (ret)
- goto err;
- }
-
- path->intent_ref++;
- ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, new_key,
- commit_flags, skip_triggers);
- --path->intent_ref;
-
- if (new_hash)
- bch2_btree_node_to_freelist(c, new_hash);
-err:
- closure_sync(&cl);
- bch2_btree_cache_cannibalize_unlock(trans);
- return ret;
-}
-
-int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
- struct btree *b, struct bkey_i *new_key,
- unsigned commit_flags, bool skip_triggers)
-{
- struct btree_iter iter;
- int ret = get_iter_to_node(trans, &iter, b);
- if (ret)
- return ret == -BCH_ERR_btree_node_dying ? 0 : ret;
-
- bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
- !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));
-
- ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
- commit_flags, skip_triggers);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/* Init code: */
-
-/*
- * Only for filesystem bringup, when first reading the btree roots or allocating
- * btree roots when initializing a new filesystem:
- */
-void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
-{
- BUG_ON(btree_node_root(c, b));
-
- bch2_btree_set_root_inmem(c, b);
-}
-
-int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id, unsigned level)
-{
- struct bch_fs *c = trans->c;
- struct closure cl;
- struct btree *b;
- int ret;
-
- closure_init_stack(&cl);
-
- do {
- ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
- closure_sync(&cl);
- } while (ret);
-
- b = bch2_btree_node_mem_alloc(trans, false);
- bch2_btree_cache_cannibalize_unlock(trans);
-
- ret = PTR_ERR_OR_ZERO(b);
- if (ret)
- return ret;
-
- set_btree_node_fake(b);
- set_btree_node_need_rewrite(b);
- b->c.level = level;
- b->c.btree_id = id;
-
- bkey_btree_ptr_init(&b->key);
- b->key.k.p = SPOS_MAX;
- *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id;
-
- bch2_bset_init_first(b, &b->data->keys);
- bch2_btree_build_aux_trees(b);
-
- b->data->flags = 0;
- btree_set_min(b, POS_MIN);
- btree_set_max(b, SPOS_MAX);
- b->data->format = bch2_btree_calc_format(b);
- btree_node_set_format(b, b->data->format);
-
- ret = bch2_btree_node_hash_insert(&c->btree_cache, b,
- b->c.level, b->c.btree_id);
- BUG_ON(ret);
-
- bch2_btree_set_root_inmem(c, b);
-
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
- return 0;
-}
-
-void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level)
-{
- bch2_trans_run(c, lockrestart_do(trans, bch2_btree_root_alloc_fake_trans(trans, id, level)));
-}
-
-static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
-{
- prt_printf(out, "%ps: ", (void *) as->ip_started);
- bch2_trans_commit_flags_to_text(out, as->flags);
-
- prt_str(out, " ");
- bch2_btree_id_to_text(out, as->btree_id);
- prt_printf(out, " l=%u-%u ",
- as->update_level_start,
- as->update_level_end);
- bch2_bpos_to_text(out, as->node_start);
- prt_char(out, ' ');
- bch2_bpos_to_text(out, as->node_end);
- prt_printf(out, "\nwritten %u/%u u64s_remaining %u need_rewrite %s",
- as->node_written,
- as->node_sectors,
- as->node_remaining,
- btree_node_reawrite_reason_strs[as->node_needed_rewrite]);
-
- prt_printf(out, "\nmode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
- bch2_btree_update_modes[as->mode],
- as->nodes_written,
- closure_nr_remaining(&as->cl),
- as->journal.seq);
-}
-
-void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
-{
- struct btree_update *as;
-
- mutex_lock(&c->btree_interior_update_lock);
- list_for_each_entry(as, &c->btree_interior_update_list, list)
- bch2_btree_update_to_text(out, as);
- mutex_unlock(&c->btree_interior_update_lock);
-}
-
-static bool bch2_btree_interior_updates_pending(struct bch_fs *c)
-{
- bool ret;
-
- mutex_lock(&c->btree_interior_update_lock);
- ret = !list_empty(&c->btree_interior_update_list);
- mutex_unlock(&c->btree_interior_update_lock);
-
- return ret;
-}
-
-bool bch2_btree_interior_updates_flush(struct bch_fs *c)
-{
- bool ret = bch2_btree_interior_updates_pending(c);
-
- if (ret)
- closure_wait_event(&c->btree_interior_update_wait,
- !bch2_btree_interior_updates_pending(c));
- return ret;
-}
-
-void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry)
-{
- struct btree_root *r = bch2_btree_id_root(c, entry->btree_id);
-
- mutex_lock(&c->btree_root_lock);
-
- r->level = entry->level;
- r->alive = true;
- bkey_copy(&r->key, (struct bkey_i *) entry->start);
-
- mutex_unlock(&c->btree_root_lock);
-}
-
-struct jset_entry *
-bch2_btree_roots_to_journal_entries(struct bch_fs *c,
- struct jset_entry *end,
- unsigned long skip)
-{
- unsigned i;
-
- mutex_lock(&c->btree_root_lock);
-
- for (i = 0; i < btree_id_nr_alive(c); i++) {
- struct btree_root *r = bch2_btree_id_root(c, i);
-
- if (r->alive && !test_bit(i, &skip)) {
- journal_entry_set(end, BCH_JSET_ENTRY_btree_root,
- i, r->level, &r->key, r->key.k.u64s);
- end = vstruct_next(end);
- }
- }
-
- mutex_unlock(&c->btree_root_lock);
-
- return end;
-}
-
-static void bch2_btree_alloc_to_text(struct printbuf *out,
- struct bch_fs *c,
- struct btree_alloc *a)
-{
- printbuf_indent_add(out, 2);
- bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&a->k));
- prt_newline(out);
-
- struct open_bucket *ob;
- unsigned i;
- open_bucket_for_each(c, &a->ob, ob, i)
- bch2_open_bucket_to_text(out, c, ob);
-
- printbuf_indent_sub(out, 2);
-}
-
-void bch2_btree_reserve_cache_to_text(struct printbuf *out, struct bch_fs *c)
-{
- for (unsigned i = 0; i < c->btree_reserve_cache_nr; i++)
- bch2_btree_alloc_to_text(out, c, &c->btree_reserve_cache[i]);
-}
-
-void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
-{
- WARN_ON(!list_empty(&c->btree_node_rewrites));
- WARN_ON(!list_empty(&c->btree_node_rewrites_pending));
-
- if (c->btree_node_rewrite_worker)
- destroy_workqueue(c->btree_node_rewrite_worker);
- if (c->btree_interior_update_worker)
- destroy_workqueue(c->btree_interior_update_worker);
- mempool_exit(&c->btree_interior_update_pool);
-}
-
-void bch2_fs_btree_interior_update_init_early(struct bch_fs *c)
-{
- mutex_init(&c->btree_reserve_cache_lock);
- INIT_LIST_HEAD(&c->btree_interior_update_list);
- INIT_LIST_HEAD(&c->btree_interior_updates_unwritten);
- mutex_init(&c->btree_interior_update_lock);
- INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work);
-
- INIT_LIST_HEAD(&c->btree_node_rewrites);
- INIT_LIST_HEAD(&c->btree_node_rewrites_pending);
- spin_lock_init(&c->btree_node_rewrites_lock);
-}
-
-int bch2_fs_btree_interior_update_init(struct bch_fs *c)
-{
- c->btree_interior_update_worker =
- alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8);
- if (!c->btree_interior_update_worker)
- return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init);
-
- c->btree_node_rewrite_worker =
- alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND);
- if (!c->btree_node_rewrite_worker)
- return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init);
-
- if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
- sizeof(struct btree_update)))
- return bch_err_throw(c, ENOMEM_btree_interior_update_pool_init);
-
- return 0;
-}
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
deleted file mode 100644
index ac04e45a8515..000000000000
--- a/fs/bcachefs/btree_update_interior.h
+++ /dev/null
@@ -1,364 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H
-#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H
-
-#include "btree_cache.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-
-#define BTREE_UPDATE_NODES_MAX ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES)
-
-#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1))
-
-int bch2_btree_node_check_topology(struct btree_trans *, struct btree *);
-
-#define BTREE_UPDATE_MODES() \
- x(none) \
- x(node) \
- x(root) \
- x(update)
-
-enum btree_update_mode {
-#define x(n) BTREE_UPDATE_##n,
- BTREE_UPDATE_MODES()
-#undef x
-};
-
-/*
- * Tracks an in progress split/rewrite of a btree node and the update to the
- * parent node:
- *
- * When we split/rewrite a node, we do all the updates in memory without
- * waiting for any writes to complete - we allocate the new node(s) and update
- * the parent node, possibly recursively up to the root.
- *
- * The end result is that we have one or more new nodes being written -
- * possibly several, if there were multiple splits - and then a write (updating
- * an interior node) which will make all these new nodes visible.
- *
- * Additionally, as we split/rewrite nodes we free the old nodes - but the old
- * nodes can't be freed (their space on disk can't be reclaimed) until the
- * update to the interior node that makes the new node visible completes -
- * until then, the old nodes are still reachable on disk.
- *
- */
-struct btree_update {
- struct closure cl;
- struct bch_fs *c;
- u64 start_time;
- unsigned long ip_started;
-
- struct list_head list;
- struct list_head unwritten_list;
-
- enum btree_update_mode mode;
- enum bch_trans_commit_flags flags;
- unsigned nodes_written:1;
- unsigned took_gc_lock:1;
-
- enum btree_id btree_id;
- struct bpos node_start;
- struct bpos node_end;
- enum btree_node_rewrite_reason node_needed_rewrite;
- u16 node_written;
- u16 node_sectors;
- u16 node_remaining;
-
- unsigned update_level_start;
- unsigned update_level_end;
-
- struct disk_reservation disk_res;
-
- /*
- * BTREE_UPDATE_node:
- * The update that made the new nodes visible was a regular update to an
- * existing interior node - @b. We can't write out the update to @b
- * until the new nodes we created are finished writing, so we block @b
- * from writing by putting this btree_interior update on the
- * @b->write_blocked list with @write_blocked_list:
- */
- struct btree *b;
- struct list_head write_blocked_list;
-
- /*
- * We may be freeing nodes that were dirty, and thus had journal entries
- * pinned: we need to transfer the oldest of those pins to the
- * btree_update operation, and release it when the new node(s)
- * are all persistent and reachable:
- */
- struct journal_entry_pin journal;
-
- /* Preallocated nodes we reserve when we start the update: */
- struct prealloc_nodes {
- struct btree *b[BTREE_UPDATE_NODES_MAX];
- unsigned nr;
- } prealloc_nodes[2];
-
- /* Nodes being freed: */
- struct keylist old_keys;
- u64 _old_keys[BTREE_UPDATE_NODES_MAX *
- BKEY_BTREE_PTR_U64s_MAX];
-
- /* Nodes being added: */
- struct keylist new_keys;
- u64 _new_keys[BTREE_UPDATE_NODES_MAX *
- BKEY_BTREE_PTR_U64s_MAX];
-
- /* New nodes, that will be made reachable by this update: */
- struct btree *new_nodes[BTREE_UPDATE_NODES_MAX];
- unsigned nr_new_nodes;
-
- struct btree *old_nodes[BTREE_UPDATE_NODES_MAX];
- __le64 old_nodes_seq[BTREE_UPDATE_NODES_MAX];
- unsigned nr_old_nodes;
-
- open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX *
- BCH_REPLICAS_MAX];
- open_bucket_idx_t nr_open_buckets;
-
- unsigned journal_u64s;
- u64 journal_entries[BTREE_UPDATE_JOURNAL_RES];
-
- /* Only here to reduce stack usage on recursive splits: */
- struct keylist parent_keys;
- /*
- * Enough room for btree_split's keys without realloc - btree node
- * pointers never have crc/compression info, so we only need to account
- * for the pointers for three keys
- */
- u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
-};
-
-struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
- struct btree_trans *,
- struct btree *,
- struct bkey_format);
-
-int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned);
-
-int bch2_btree_increase_depth(struct btree_trans *, btree_path_idx_t, unsigned);
-
-int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t,
- unsigned, unsigned, enum btree_node_sibling);
-
-static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
- btree_path_idx_t path_idx,
- unsigned level, unsigned flags,
- enum btree_node_sibling sib)
-{
- struct btree_path *path = trans->paths + path_idx;
- struct btree *b;
-
- EBUG_ON(!btree_node_locked(path, level));
-
- if (static_branch_unlikely(&bch2_btree_node_merging_disabled))
- return 0;
-
- b = path->l[level].b;
- if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold)
- return 0;
-
- return __bch2_foreground_maybe_merge(trans, path_idx, level, flags, sib);
-}
-
-static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
- btree_path_idx_t path,
- unsigned level,
- unsigned flags)
-{
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
- return bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
- btree_prev_sib) ?:
- bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
- btree_next_sib);
-}
-
-int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
- struct btree *, unsigned, unsigned);
-int bch2_btree_node_rewrite_key(struct btree_trans *,
- enum btree_id, unsigned,
- struct bkey_i *, unsigned);
-int bch2_btree_node_rewrite_pos(struct btree_trans *,
- enum btree_id, unsigned,
- struct bpos, unsigned, unsigned);
-int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *,
- struct btree *, unsigned);
-
-void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);
-
-int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
- struct btree *, struct bkey_i *,
- unsigned, bool);
-int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *,
- struct bkey_i *, unsigned, bool);
-
-void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
-
-int bch2_btree_root_alloc_fake_trans(struct btree_trans *, enum btree_id, unsigned);
-void bch2_btree_root_alloc_fake(struct bch_fs *, enum btree_id, unsigned);
-
-static inline unsigned btree_update_reserve_required(struct bch_fs *c,
- struct btree *b)
-{
- unsigned depth = btree_node_root(c, b)->c.level + 1;
-
- /*
- * Number of nodes we might have to allocate in a worst case btree
- * split operation - we split all the way up to the root, then allocate
- * a new root, unless we're already at max depth:
- */
- if (depth < BTREE_MAX_DEPTH)
- return (depth - b->c.level) * 2 + 1;
- else
- return (depth - b->c.level) * 2 - 1;
-}
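A quick worked example of the reserve calculation above, with hypothetical numbers: if the root sits at level 3 and @b is a leaf at level 0, then depth = 3 + 1 = 4; since that is below BTREE_MAX_DEPTH, the reserve is (4 - 0) * 2 + 1 = 9 nodes — two new nodes per level for splits all the way up, plus one for a freshly allocated root. At maximum depth no new root can be added, hence the (depth - level) * 2 - 1 variant.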
-
-static inline void btree_node_reset_sib_u64s(struct btree *b)
-{
- b->sib_u64s[0] = b->nr.live_u64s;
- b->sib_u64s[1] = b->nr.live_u64s;
-}
-
-static inline void *btree_data_end(struct btree *b)
-{
- return (void *) b->data + btree_buf_bytes(b);
-}
-
-static inline struct bkey_packed *unwritten_whiteouts_start(struct btree *b)
-{
- return (void *) ((u64 *) btree_data_end(b) - b->whiteout_u64s);
-}
-
-static inline struct bkey_packed *unwritten_whiteouts_end(struct btree *b)
-{
- return btree_data_end(b);
-}
-
-static inline void *write_block(struct btree *b)
-{
- return (void *) b->data + (b->written << 9);
-}
-
-static inline bool __btree_addr_written(struct btree *b, void *p)
-{
- return p < write_block(b);
-}
-
-static inline bool bset_written(struct btree *b, struct bset *i)
-{
- return __btree_addr_written(b, i);
-}
-
-static inline bool bkey_written(struct btree *b, struct bkey_packed *k)
-{
- return __btree_addr_written(b, k);
-}
-
-static inline ssize_t __bch2_btree_u64s_remaining(struct btree *b, void *end)
-{
- ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
- b->whiteout_u64s;
- ssize_t total = btree_buf_bytes(b) >> 3;
-
- /* Always leave one extra u64 for bch2_varint_decode: */
- used++;
-
- return total - used;
-}
-
-static inline size_t bch2_btree_keys_u64s_remaining(struct btree *b)
-{
- ssize_t remaining = __bch2_btree_u64s_remaining(b,
- btree_bkey_last(b, bset_tree_last(b)));
-
- BUG_ON(remaining < 0);
-
- if (bset_written(b, btree_bset_last(b)))
- return 0;
-
- return remaining;
-}
-
-#define BTREE_WRITE_SET_U64s_BITS 9
-
-static inline unsigned btree_write_set_buffer(struct btree *b)
-{
- /*
- * Could buffer up larger amounts of keys for btrees with larger keys,
- * pending benchmarking:
- */
- return 8 << BTREE_WRITE_SET_U64s_BITS;
-}
-
-static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree *b)
-{
- struct bset_tree *t = bset_tree_last(b);
- struct btree_node_entry *bne = max(write_block(b),
- (void *) btree_bkey_last(b, t));
- ssize_t remaining_space =
- __bch2_btree_u64s_remaining(b, bne->keys.start);
-
- if (unlikely(bset_written(b, bset(b, t)))) {
- if (b->written + block_sectors(c) <= btree_sectors(c))
- return bne;
- } else {
- if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) &&
- remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3))
- return bne;
- }
-
- return NULL;
-}
-
-static inline void push_whiteout(struct btree *b, struct bpos pos)
-{
- struct bkey_packed k;
-
- BUG_ON(bch2_btree_keys_u64s_remaining(b) < BKEY_U64s);
- EBUG_ON(btree_node_just_written(b));
-
- if (!bkey_pack_pos(&k, pos, b)) {
- struct bkey *u = (void *) &k;
-
- bkey_init(u);
- u->p = pos;
- }
-
- k.needs_whiteout = true;
-
- b->whiteout_u64s += k.u64s;
- bkey_p_copy(unwritten_whiteouts_start(b), &k);
-}
-
-/*
- * write lock must be held on @b (else the dirty bset that we were going to
- * insert into could be written out from under us)
- */
-static inline bool bch2_btree_node_insert_fits(struct btree *b, unsigned u64s)
-{
- if (unlikely(btree_node_need_rewrite(b)))
- return false;
-
- return u64s <= bch2_btree_keys_u64s_remaining(b);
-}
-
-void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *);
-
-bool bch2_btree_interior_updates_flush(struct bch_fs *);
-
-void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *);
-struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
- struct jset_entry *, unsigned long);
-
-void bch2_async_btree_node_rewrites_flush(struct bch_fs *);
-void bch2_do_pending_node_rewrites(struct bch_fs *);
-void bch2_free_pending_node_rewrites(struct bch_fs *);
-
-void bch2_btree_reserve_cache_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_fs_btree_interior_update_exit(struct bch_fs *);
-void bch2_fs_btree_interior_update_init_early(struct bch_fs *);
-int bch2_fs_btree_interior_update_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */
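A minimal sketch of how a caller might drive the rewrite interface declared above — a hypothetical helper, assuming the bch2_trans_do() transaction wrapper used elsewhere in this patch, with target = 0 and no extra commit flags:

#include "bcachefs.h"
#include "btree_update_interior.h"

/* Illustrative only: rewrite the level-1 node covering @pos in @btree. */
static int example_rewrite_node_at(struct bch_fs *c, enum btree_id btree,
				   struct bpos pos)
{
	/* bch2_trans_do() runs the expression in a transaction, retrying on restart */
	return bch2_trans_do(c,
		bch2_btree_node_rewrite_pos(trans, btree, 1, pos, 0, 0));
}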
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
deleted file mode 100644
index 4b095235a0d2..000000000000
--- a/fs/bcachefs/btree_write_buffer.c
+++ /dev/null
@@ -1,893 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_write_buffer.h"
-#include "disk_accounting.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "extents.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "journal_reclaim.h"
-
-#include <linux/prefetch.h>
-#include <linux/sort.h>
-
-static int bch2_btree_write_buffer_journal_flush(struct journal *,
- struct journal_entry_pin *, u64);
-
-static inline bool __wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
-{
- return (cmp_int(l->hi, r->hi) ?:
- cmp_int(l->mi, r->mi) ?:
- cmp_int(l->lo, r->lo)) >= 0;
-}
-
-static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
-{
-#ifdef CONFIG_X86_64
- int cmp;
-
- asm("mov (%[l]), %%rax;"
- "sub (%[r]), %%rax;"
- "mov 8(%[l]), %%rax;"
- "sbb 8(%[r]), %%rax;"
- "mov 16(%[l]), %%rax;"
- "sbb 16(%[r]), %%rax;"
- : "=@ccae" (cmp)
- : [l] "r" (l), [r] "r" (r)
- : "rax", "cc");
-
- EBUG_ON(cmp != __wb_key_ref_cmp(l, r));
- return cmp;
-#else
- return __wb_key_ref_cmp(l, r);
-#endif
-}
-
-static int wb_key_seq_cmp(const void *_l, const void *_r)
-{
- const struct btree_write_buffered_key *l = _l;
- const struct btree_write_buffered_key *r = _r;
-
- return cmp_int(l->journal_seq, r->journal_seq);
-}
-
-/* Compare excluding idx, the low 24 bits: */
-static inline bool wb_key_eq(const void *_l, const void *_r)
-{
- const struct wb_key_ref *l = _l;
- const struct wb_key_ref *r = _r;
-
- return !((l->hi ^ r->hi)|
- (l->mi ^ r->mi)|
- ((l->lo >> 24) ^ (r->lo >> 24)));
-}
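In the wb_key_ref layout (see btree_write_buffer_types.h at the end of this patch), the low 24 bits of lo hold the idx field on both endiannesses, so masking them off here means two refs compare equal exactly when they name the same btree and position, regardless of which flushing-array slot they index.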
-
-static noinline void wb_sort(struct wb_key_ref *base, size_t num)
-{
- size_t n = num, a = num / 2;
-
- if (!a) /* num < 2 || size == 0 */
- return;
-
- for (;;) {
- size_t b, c, d;
-
- if (a) /* Building heap: sift down --a */
- --a;
- else if (--n) /* Sorting: Extract root to --n */
- swap(base[0], base[n]);
- else /* Sort complete */
- break;
-
- /*
- * Sift element at "a" down into heap. This is the
- * "bottom-up" variant, which significantly reduces
- * calls to cmp_func(): we find the sift-down path all
- * the way to the leaves (one compare per level), then
- * backtrack to find where to insert the target element.
- *
- * Because elements tend to sift down close to the leaves,
- * this uses fewer compares than doing two per level
- * on the way down. (A bit more than half as many on
- * average, 3/4 worst-case.)
- */
- for (b = a; c = 2*b + 1, (d = c + 1) < n;)
- b = wb_key_ref_cmp(base + c, base + d) ? c : d;
- if (d == n) /* Special case last leaf with no sibling */
- b = c;
-
- /* Now backtrack from "b" to the correct location for "a" */
- while (b != a && wb_key_ref_cmp(base + a, base + b))
- b = (b - 1) / 2;
- c = b; /* Where "a" belongs */
- while (b != a) { /* Shift it into place */
- b = (b - 1) / 2;
- swap(base[b], base[c]);
- }
- }
-}
-
-static noinline int wb_flush_one_slowpath(struct btree_trans *trans,
- struct btree_iter *iter,
- struct btree_write_buffered_key *wb)
-{
- struct btree_path *path = btree_iter_path(trans, iter);
-
- bch2_btree_node_unlock_write(trans, path, path->l[0].b);
-
- trans->journal_res.seq = wb->journal_seq;
-
- return bch2_trans_update(trans, iter, &wb->k,
- BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_journal_res|
- BCH_TRANS_COMMIT_journal_reclaim);
-}
-
-static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter,
- struct btree_write_buffered_key *wb,
- bool *write_locked,
- bool *accounting_accumulated,
- size_t *fast)
-{
- struct btree_path *path;
- int ret;
-
- EBUG_ON(!wb->journal_seq);
- EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq);
- EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq);
-
- ret = bch2_btree_iter_traverse(trans, iter);
- if (ret)
- return ret;
-
- if (!*accounting_accumulated && wb->k.k.type == KEY_TYPE_accounting) {
- struct bkey u;
- struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, iter), &u);
-
- if (k.k->type == KEY_TYPE_accounting)
- bch2_accounting_accumulate(bkey_i_to_accounting(&wb->k),
- bkey_s_c_to_accounting(k));
- }
- *accounting_accumulated = true;
-
- /*
- * We can't clone a path that has write locks: unshare it now, before
- * set_pos and traverse():
- */
- if (btree_iter_path(trans, iter)->ref > 1)
- iter->path = __bch2_btree_path_make_mut(trans, iter->path, true, _THIS_IP_);
-
- path = btree_iter_path(trans, iter);
-
- if (!*write_locked) {
- ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c);
- if (ret)
- return ret;
-
- bch2_btree_node_prep_for_write(trans, path, path->l[0].b);
- *write_locked = true;
- }
-
- if (unlikely(!bch2_btree_node_insert_fits(path->l[0].b, wb->k.k.u64s))) {
- *write_locked = false;
- return wb_flush_one_slowpath(trans, iter, wb);
- }
-
- EBUG_ON(!bpos_eq(wb->k.k.p, path->pos));
-
- bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
- (*fast)++;
- return 0;
-}
-
-/*
- * Update a btree with a write buffered key using the journal seq of the
- * original write buffer insert.
- *
- * It is not safe to rejournal the key once it has been inserted into the write
- * buffer because that may break recovery ordering. For example, the key may
- * have already been modified in the active write buffer in a seq that comes
- * before the current transaction. If we were to journal this key again and
- * crash, recovery would process updates in the wrong order.
- */
-static int
-btree_write_buffered_insert(struct btree_trans *trans,
- struct btree_write_buffered_key *wb)
-{
- struct btree_iter iter;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k),
- BTREE_ITER_cached|BTREE_ITER_intent);
-
- trans->journal_res.seq = wb->journal_seq;
-
- ret = bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_trans_update(trans, &iter, &wb->k,
- BTREE_UPDATE_internal_snapshot_node);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb)
-{
- struct bch_fs *c = container_of(wb, struct bch_fs, btree_write_buffer);
- struct journal *j = &c->journal;
-
- if (!wb->inc.keys.nr)
- return;
-
- bch2_journal_pin_add(j, wb->inc.keys.data[0].journal_seq, &wb->flushing.pin,
- bch2_btree_write_buffer_journal_flush);
-
- darray_resize(&wb->flushing.keys, min_t(size_t, 1U << 20, wb->flushing.keys.nr + wb->inc.keys.nr));
- darray_resize(&wb->sorted, wb->flushing.keys.size);
-
- if (!wb->flushing.keys.nr && wb->sorted.size >= wb->inc.keys.nr) {
- swap(wb->flushing.keys, wb->inc.keys);
- goto out;
- }
-
- size_t nr = min(darray_room(wb->flushing.keys),
- wb->sorted.size - wb->flushing.keys.nr);
- nr = min(nr, wb->inc.keys.nr);
-
- memcpy(&darray_top(wb->flushing.keys),
- wb->inc.keys.data,
- sizeof(wb->inc.keys.data[0]) * nr);
-
- memmove(wb->inc.keys.data,
- wb->inc.keys.data + nr,
- sizeof(wb->inc.keys.data[0]) * (wb->inc.keys.nr - nr));
-
- wb->flushing.keys.nr += nr;
- wb->inc.keys.nr -= nr;
-out:
- if (!wb->inc.keys.nr)
- bch2_journal_pin_drop(j, &wb->inc.pin);
- else
- bch2_journal_pin_update(j, wb->inc.keys.data[0].journal_seq, &wb->inc.pin,
- bch2_btree_write_buffer_journal_flush);
-
- if (j->watermark) {
- spin_lock(&j->lock);
- bch2_journal_set_watermark(j);
- spin_unlock(&j->lock);
- }
-
- BUG_ON(wb->sorted.size < wb->flushing.keys.nr);
-}
-
-int bch2_btree_write_buffer_insert_err(struct bch_fs *c,
- enum btree_id btree, struct bkey_i *k)
-{
- struct printbuf buf = PRINTBUF;
-
- prt_printf(&buf, "attempting to do write buffer update on non wb btree=");
- bch2_btree_id_to_text(&buf, btree);
- prt_str(&buf, "\n");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
-
- bch2_fs_inconsistent(c, "%s", buf.buf);
- printbuf_exit(&buf);
- return -EROFS;
-}
-
-static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- struct journal *j = &c->journal;
- struct btree_write_buffer *wb = &c->btree_write_buffer;
- struct btree_iter iter = {};
- size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0;
- bool write_locked = false;
- bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags);
- int ret = 0;
-
- ret = bch2_journal_error(&c->journal);
- if (ret)
- return ret;
-
- bch2_trans_unlock(trans);
- bch2_trans_begin(trans);
-
- mutex_lock(&wb->inc.lock);
- move_keys_from_inc_to_flushing(wb);
- mutex_unlock(&wb->inc.lock);
-
- for (size_t i = 0; i < wb->flushing.keys.nr; i++) {
- wb->sorted.data[i].idx = i;
- wb->sorted.data[i].btree = wb->flushing.keys.data[i].btree;
- memcpy(&wb->sorted.data[i].pos, &wb->flushing.keys.data[i].k.k.p, sizeof(struct bpos));
- }
- wb->sorted.nr = wb->flushing.keys.nr;
-
- /*
- * We first sort so that we can detect and skip redundant updates, and
- * then we attempt to flush in sorted btree order, as this is most
- * efficient.
- *
- * However, since we're not flushing in the order they appear in the
- * journal we won't be able to drop our journal pin until everything is
- * flushed - which means this could deadlock the journal if we weren't
- * passing BCH_TRANS_COMMIT_journal_reclaim. This causes the update to fail
- * if it would block taking a journal reservation.
- *
- * If that happens, simply skip the key so we can optimistically insert
- * as many keys as possible in the fast path.
- */
- wb_sort(wb->sorted.data, wb->sorted.nr);
-
- darray_for_each(wb->sorted, i) {
- struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
-
- if (unlikely(!btree_type_uses_write_buffer(k->btree))) {
- ret = bch2_btree_write_buffer_insert_err(trans->c, k->btree, &k->k);
- goto err;
- }
-
- for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++)
- prefetch(&wb->flushing.keys.data[n->idx]);
-
- BUG_ON(!k->journal_seq);
-
- if (!accounting_replay_done &&
- k->k.k.type == KEY_TYPE_accounting) {
- slowpath++;
- continue;
- }
-
- if (i + 1 < &darray_top(wb->sorted) &&
- wb_key_eq(i, i + 1)) {
- struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx];
-
- if (k->k.k.type == KEY_TYPE_accounting &&
- n->k.k.type == KEY_TYPE_accounting)
- bch2_accounting_accumulate(bkey_i_to_accounting(&n->k),
- bkey_i_to_s_c_accounting(&k->k));
-
- overwritten++;
- n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq);
- k->journal_seq = 0;
- continue;
- }
-
- if (write_locked) {
- struct btree_path *path = btree_iter_path(trans, &iter);
-
- if (path->btree_id != i->btree ||
- bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) {
- bch2_btree_node_unlock_write(trans, path, path->l[0].b);
- write_locked = false;
-
- ret = lockrestart_do(trans,
- bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_foreground_maybe_merge(trans, iter.path, 0,
- BCH_WATERMARK_reclaim|
- BCH_TRANS_COMMIT_journal_reclaim|
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc));
- if (ret)
- goto err;
- }
- }
-
- if (!iter.path || iter.btree_id != k->btree) {
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p,
- BTREE_ITER_intent|BTREE_ITER_all_snapshots);
- }
-
- bch2_btree_iter_set_pos(trans, &iter, k->k.k.p);
- btree_iter_path(trans, &iter)->preserve = false;
-
- bool accounting_accumulated = false;
- do {
- if (race_fault()) {
- ret = bch_err_throw(c, journal_reclaim_would_deadlock);
- break;
- }
-
- ret = wb_flush_one(trans, &iter, k, &write_locked,
- &accounting_accumulated, &fast);
- if (!write_locked)
- bch2_trans_begin(trans);
- } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
-
- if (!ret) {
- k->journal_seq = 0;
- } else if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
- slowpath++;
- ret = 0;
- } else
- break;
- }
-
- if (write_locked) {
- struct btree_path *path = btree_iter_path(trans, &iter);
- bch2_btree_node_unlock_write(trans, path, path->l[0].b);
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret)
- goto err;
-
- if (slowpath) {
- /*
- * Flush in the order they were present in the journal, so that
- * we can release journal pins:
- * The fast path zeroed the journal_seq of keys that were successfully
- * flushed, so we can skip those here.
- */
- trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr);
-
- sort_nonatomic(wb->flushing.keys.data,
- wb->flushing.keys.nr,
- sizeof(wb->flushing.keys.data[0]),
- wb_key_seq_cmp, NULL);
-
- darray_for_each(wb->flushing.keys, i) {
- if (!i->journal_seq)
- continue;
-
- if (!accounting_replay_done &&
- i->k.k.type == KEY_TYPE_accounting) {
- could_not_insert++;
- continue;
- }
-
- if (!could_not_insert)
- bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
- bch2_btree_write_buffer_journal_flush);
-
- bch2_trans_begin(trans);
-
- ret = commit_do(trans, NULL, NULL,
- BCH_WATERMARK_reclaim|
- BCH_TRANS_COMMIT_journal_reclaim|
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_no_journal_res,
- btree_write_buffered_insert(trans, i));
- if (ret)
- goto err;
-
- i->journal_seq = 0;
- }
-
- /*
- * If journal replay hasn't finished with accounting keys we
- * can't flush accounting keys at all - condense them and leave
- * them for next time.
- *
- * Q: Can the write buffer overflow?
- * A: Shouldn't be any actual risk. It's just new accounting
- * updates that the write buffer can't flush, and those are only
- * going to be generated by interior btree node updates as
- * journal replay has to split/rewrite nodes to make room for
- * its updates.
- *
- * And for those new accounting updates, updates to the same
- * counters get accumulated as they're flushed from the journal
- * to the write buffer - see the patch for eytzinger tree
- * accumulation. So we could only overflow if the number of
- * distinct counters touched somehow was very large.
- */
- if (could_not_insert) {
- struct btree_write_buffered_key *dst = wb->flushing.keys.data;
-
- darray_for_each(wb->flushing.keys, i)
- if (i->journal_seq)
- *dst++ = *i;
- wb->flushing.keys.nr = dst - wb->flushing.keys.data;
- }
- }
-err:
- if (ret || !could_not_insert) {
- bch2_journal_pin_drop(j, &wb->flushing.pin);
- wb->flushing.keys.nr = 0;
- }
-
- bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret));
- trace_write_buffer_flush(trans, wb->flushing.keys.nr, overwritten, fast, 0);
- return ret;
-}
-
-static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
-{
- struct journal_keys_to_wb dst;
- int ret = 0;
-
- bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq));
-
- for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) {
- jset_entry_for_each_key(entry, k) {
- ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k);
- if (ret)
- goto out;
- }
-
- entry->type = BCH_JSET_ENTRY_btree_keys;
- }
-out:
- ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret;
- return ret;
-}
-
-static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 max_seq)
-{
- struct journal *j = &c->journal;
- struct journal_buf *buf;
- bool blocked;
- int ret = 0;
-
- while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, max_seq, &blocked))) {
- ret = bch2_journal_keys_to_write_buffer(c, buf);
-
- if (!blocked && !ret) {
- spin_lock(&j->lock);
- buf->need_flush_to_write_buffer = false;
- spin_unlock(&j->lock);
- }
-
- mutex_unlock(&j->buf_lock);
-
- if (blocked) {
- bch2_journal_unblock(j);
- break;
- }
- }
-
- return ret;
-}
-
-static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 max_seq,
- bool *did_work)
-{
- struct bch_fs *c = trans->c;
- struct btree_write_buffer *wb = &c->btree_write_buffer;
- int ret = 0, fetch_from_journal_err;
-
- do {
- bch2_trans_unlock(trans);
-
- fetch_from_journal_err = fetch_wb_keys_from_journal(c, max_seq);
-
- *did_work |= wb->inc.keys.nr || wb->flushing.keys.nr;
-
- /*
- * On memory allocation failure, bch2_btree_write_buffer_flush_locked()
- * is not guaranteed to empty wb->inc:
- */
- mutex_lock(&wb->flushing.lock);
- ret = bch2_btree_write_buffer_flush_locked(trans);
- mutex_unlock(&wb->flushing.lock);
- } while (!ret &&
- (fetch_from_journal_err ||
- (wb->inc.pin.seq && wb->inc.pin.seq <= max_seq) ||
- (wb->flushing.pin.seq && wb->flushing.pin.seq <= max_seq)));
-
- return ret;
-}
-
-static int bch2_btree_write_buffer_journal_flush(struct journal *j,
- struct journal_entry_pin *_pin, u64 seq)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- bool did_work = false;
-
- return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq, &did_work));
-}
-
-int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- bool did_work = false;
-
- trace_and_count(c, write_buffer_flush_sync, trans, _RET_IP_);
-
- return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal), &did_work);
-}
-
-/*
- * The write buffer requires flushing when going RO: keys in the journal for the
- * write buffer don't have a journal pin yet
- */
-bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *c)
-{
- if (bch2_journal_error(&c->journal))
- return false;
-
- bool did_work = false;
- bch2_trans_run(c, btree_write_buffer_flush_seq(trans,
- journal_cur_seq(&c->journal), &did_work));
- return did_work;
-}
-
-int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- struct btree_write_buffer *wb = &c->btree_write_buffer;
- int ret = 0;
-
- if (mutex_trylock(&wb->flushing.lock)) {
- ret = bch2_btree_write_buffer_flush_locked(trans);
- mutex_unlock(&wb->flushing.lock);
- }
-
- return ret;
-}
-
-int bch2_btree_write_buffer_tryflush(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer))
- return bch_err_throw(c, erofs_no_writes);
-
- int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans);
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer);
- return ret;
-}
-
-/*
- * In check and repair code, when checking references to write buffer btrees we
- * need to issue a flush before we have a definitive error: this issues a flush
- * if this is a key we haven't yet checked.
- */
-int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans,
- struct bkey_s_c referring_k,
- struct bkey_buf *last_flushed)
-{
- struct bch_fs *c = trans->c;
- struct bkey_buf tmp;
- int ret = 0;
-
- bch2_bkey_buf_init(&tmp);
-
- if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) {
- if (trace_write_buffer_maybe_flush_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, referring_k);
- trace_write_buffer_maybe_flush(trans, _RET_IP_, buf.buf);
- printbuf_exit(&buf);
- }
-
- bch2_bkey_buf_reassemble(&tmp, c, referring_k);
-
- if (bkey_is_btree_ptr(referring_k.k)) {
- bch2_trans_unlock(trans);
- bch2_btree_interior_updates_flush(c);
- }
-
- ret = bch2_btree_write_buffer_flush_sync(trans);
- if (ret)
- goto err;
-
- bch2_bkey_buf_copy(last_flushed, c, tmp.k);
-
- /* can we avoid the unconditional restart? */
- trace_and_count(c, trans_restart_write_buffer_flush, trans, _RET_IP_);
- ret = bch_err_throw(c, transaction_restart_write_buffer_flush);
- }
-err:
- bch2_bkey_buf_exit(&tmp, c);
- return ret;
-}
-
-static void bch2_btree_write_buffer_flush_work(struct work_struct *work)
-{
- struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work);
- struct btree_write_buffer *wb = &c->btree_write_buffer;
- int ret;
-
- mutex_lock(&wb->flushing.lock);
- do {
- ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans));
- } while (!ret && bch2_btree_write_buffer_should_flush(c));
- mutex_unlock(&wb->flushing.lock);
-
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer);
-}
-
-static void wb_accounting_sort(struct btree_write_buffer *wb)
-{
- eytzinger0_sort(wb->accounting.data, wb->accounting.nr,
- sizeof(wb->accounting.data[0]),
- wb_key_cmp, NULL);
-}
-
-int bch2_accounting_key_to_wb_slowpath(struct bch_fs *c, enum btree_id btree,
- struct bkey_i_accounting *k)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
- struct btree_write_buffered_key new = { .btree = btree };
-
- bkey_copy(&new.k, &k->k_i);
-
- int ret = darray_push(&wb->accounting, new);
- if (ret)
- return ret;
-
- wb_accounting_sort(wb);
- return 0;
-}
-
-int bch2_journal_key_to_wb_slowpath(struct bch_fs *c,
- struct journal_keys_to_wb *dst,
- enum btree_id btree, struct bkey_i *k)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
- int ret;
-retry:
- ret = darray_make_room_gfp(&dst->wb->keys, 1, GFP_KERNEL);
- if (!ret && dst->wb == &wb->flushing)
- ret = darray_resize(&wb->sorted, wb->flushing.keys.size);
-
- if (unlikely(ret)) {
- if (dst->wb == &c->btree_write_buffer.flushing) {
- mutex_unlock(&dst->wb->lock);
- dst->wb = &c->btree_write_buffer.inc;
- bch2_journal_pin_add(&c->journal, dst->seq, &dst->wb->pin,
- bch2_btree_write_buffer_journal_flush);
- goto retry;
- }
-
- return ret;
- }
-
- dst->room = darray_room(dst->wb->keys);
- if (dst->wb == &wb->flushing)
- dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
- BUG_ON(!dst->room);
- BUG_ON(!dst->seq);
-
- struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
- wb_k->journal_seq = dst->seq;
- wb_k->btree = btree;
- bkey_copy(&wb_k->k, k);
- dst->wb->keys.nr++;
- dst->room--;
- return 0;
-}
-
-void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_keys_to_wb *dst, u64 seq)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
-
- if (mutex_trylock(&wb->flushing.lock)) {
- mutex_lock(&wb->inc.lock);
- move_keys_from_inc_to_flushing(wb);
-
- /*
- * Attempt to skip wb->inc, and add keys directly to
- * wb->flushing, saving us a copy later:
- */
-
- if (!wb->inc.keys.nr) {
- dst->wb = &wb->flushing;
- } else {
- mutex_unlock(&wb->flushing.lock);
- dst->wb = &wb->inc;
- }
- } else {
- mutex_lock(&wb->inc.lock);
- dst->wb = &wb->inc;
- }
-
- dst->room = darray_room(dst->wb->keys);
- if (dst->wb == &wb->flushing)
- dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
- dst->seq = seq;
-
- bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin,
- bch2_btree_write_buffer_journal_flush);
-
- darray_for_each(wb->accounting, i)
- memset(&i->k.v, 0, bkey_val_bytes(&i->k.k));
-}
-
-int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
- unsigned live_accounting_keys = 0;
- int ret = 0;
-
- darray_for_each(wb->accounting, i)
- if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&i->k))) {
- i->journal_seq = dst->seq;
- live_accounting_keys++;
- ret = __bch2_journal_key_to_wb(c, dst, i->btree, &i->k);
- if (ret)
- break;
- }
-
- if (live_accounting_keys * 2 < wb->accounting.nr) {
- struct btree_write_buffered_key *dst = wb->accounting.data;
-
- darray_for_each(wb->accounting, src)
- if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&src->k)))
- *dst++ = *src;
- wb->accounting.nr = dst - wb->accounting.data;
- wb_accounting_sort(wb);
- }
-
- if (!dst->wb->keys.nr)
- bch2_journal_pin_drop(&c->journal, &dst->wb->pin);
-
- if (bch2_btree_write_buffer_should_flush(c) &&
- __enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer) &&
- !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work))
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer);
-
- if (dst->wb == &wb->flushing)
- mutex_unlock(&wb->flushing.lock);
- mutex_unlock(&wb->inc.lock);
-
- return ret;
-}
-
-static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size)
-{
- if (wb->keys.size >= new_size)
- return 0;
-
- if (!mutex_trylock(&wb->lock))
- return -EINTR;
-
- int ret = darray_resize(&wb->keys, new_size);
- mutex_unlock(&wb->lock);
- return ret;
-}
-
-int bch2_btree_write_buffer_resize(struct bch_fs *c, size_t new_size)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
-
- return wb_keys_resize(&wb->flushing, new_size) ?:
- wb_keys_resize(&wb->inc, new_size);
-}
-
-void bch2_fs_btree_write_buffer_exit(struct bch_fs *c)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
-
- BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) &&
- !bch2_journal_error(&c->journal));
-
- darray_exit(&wb->accounting);
- darray_exit(&wb->sorted);
- darray_exit(&wb->flushing.keys);
- darray_exit(&wb->inc.keys);
-}
-
-void bch2_fs_btree_write_buffer_init_early(struct bch_fs *c)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
-
- mutex_init(&wb->inc.lock);
- mutex_init(&wb->flushing.lock);
- INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work);
-}
-
-int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
-
- /* Will be resized by journal as needed: */
- unsigned initial_size = 1 << 16;
-
- return darray_make_room(&wb->inc.keys, initial_size) ?:
- darray_make_room(&wb->flushing.keys, initial_size) ?:
- darray_make_room(&wb->sorted, initial_size);
-}
diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h
deleted file mode 100644
index c351d21aca0b..000000000000
--- a/fs/bcachefs/btree_write_buffer.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H
-#define _BCACHEFS_BTREE_WRITE_BUFFER_H
-
-#include "bkey.h"
-#include "disk_accounting.h"
-
-static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
-
- return wb->inc.keys.nr + wb->flushing.keys.nr > wb->inc.keys.size / 4;
-}
-
-static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
-
- return wb->inc.keys.nr > wb->inc.keys.size * 3 / 4;
-}
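Concretely, and assuming the darray capacities stay at the initial 1 << 16 entries set up by bch2_fs_btree_write_buffer_init() earlier in this patch (the journal may resize them later): a background flush is queued once inc plus flushing hold more than 65536 / 4 = 16384 keys, and writers must wait on a flush once inc alone exceeds 65536 * 3 / 4 = 49152 keys.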
-
-struct btree_trans;
-int bch2_btree_write_buffer_flush_sync(struct btree_trans *);
-bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *);
-int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *);
-int bch2_btree_write_buffer_tryflush(struct btree_trans *);
-
-struct bkey_buf;
-int bch2_btree_write_buffer_maybe_flush(struct btree_trans *, struct bkey_s_c, struct bkey_buf *);
-
-struct journal_keys_to_wb {
- struct btree_write_buffer_keys *wb;
- size_t room;
- u64 seq;
-};
-
-static inline int wb_key_cmp(const void *_l, const void *_r)
-{
- const struct btree_write_buffered_key *l = _l;
- const struct btree_write_buffered_key *r = _r;
-
- return cmp_int(l->btree, r->btree) ?: bpos_cmp(l->k.k.p, r->k.k.p);
-}
-
-int bch2_accounting_key_to_wb_slowpath(struct bch_fs *,
- enum btree_id, struct bkey_i_accounting *);
-
-static inline int bch2_accounting_key_to_wb(struct bch_fs *c,
- enum btree_id btree, struct bkey_i_accounting *k)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
- struct btree_write_buffered_key search;
- search.btree = btree;
- search.k.k.p = k->k.p;
-
- unsigned idx = eytzinger0_find(wb->accounting.data, wb->accounting.nr,
- sizeof(wb->accounting.data[0]),
- wb_key_cmp, &search);
-
- if (idx >= wb->accounting.nr)
- return bch2_accounting_key_to_wb_slowpath(c, btree, k);
-
- struct bkey_i_accounting *dst = bkey_i_to_accounting(&wb->accounting.data[idx].k);
- bch2_accounting_accumulate(dst, accounting_i_to_s_c(k));
- return 0;
-}
-
-int bch2_journal_key_to_wb_slowpath(struct bch_fs *,
- struct journal_keys_to_wb *,
- enum btree_id, struct bkey_i *);
-
-static inline int __bch2_journal_key_to_wb(struct bch_fs *c,
- struct journal_keys_to_wb *dst,
- enum btree_id btree, struct bkey_i *k)
-{
- if (unlikely(!dst->room))
- return bch2_journal_key_to_wb_slowpath(c, dst, btree, k);
-
- struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
- wb_k->journal_seq = dst->seq;
- wb_k->btree = btree;
- bkey_copy(&wb_k->k, k);
- dst->wb->keys.nr++;
- dst->room--;
- return 0;
-}
-
-static inline int bch2_journal_key_to_wb(struct bch_fs *c,
- struct journal_keys_to_wb *dst,
- enum btree_id btree, struct bkey_i *k)
-{
- if (unlikely(!btree_type_uses_write_buffer(btree))) {
- int ret = bch2_btree_write_buffer_insert_err(c, btree, k);
- dump_stack();
- return ret;
- }
-
- EBUG_ON(!dst->seq);
-
- return k->k.type == KEY_TYPE_accounting
- ? bch2_accounting_key_to_wb(c, btree, bkey_i_to_accounting(k))
- : __bch2_journal_key_to_wb(c, dst, btree, k);
-}
-
-void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64);
-int bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *);
-
-int bch2_btree_write_buffer_resize(struct bch_fs *, size_t);
-void bch2_fs_btree_write_buffer_exit(struct bch_fs *);
-void bch2_fs_btree_write_buffer_init_early(struct bch_fs *);
-int bch2_fs_btree_write_buffer_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */
diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h
deleted file mode 100644
index e9e76e20f43b..000000000000
--- a/fs/bcachefs/btree_write_buffer_types.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
-#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
-
-#include "darray.h"
-#include "journal_types.h"
-
-#define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4
-#define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX)
-
-struct wb_key_ref {
-union {
- struct {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- unsigned idx:24;
- u8 pos[sizeof(struct bpos)];
- enum btree_id btree:8;
-#else
- enum btree_id btree:8;
- u8 pos[sizeof(struct bpos)];
- unsigned idx:24;
-#endif
- } __packed;
- struct {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- u64 lo;
- u64 mi;
- u64 hi;
-#else
- u64 hi;
- u64 mi;
- u64 lo;
-#endif
- };
-};
-};
-
-struct btree_write_buffered_key {
- enum btree_id btree:8;
- u64 journal_seq:56;
- __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX);
-};
-
-struct btree_write_buffer_keys {
- DARRAY(struct btree_write_buffered_key) keys;
- struct journal_entry_pin pin;
- struct mutex lock;
-};
-
-struct btree_write_buffer {
- DARRAY(struct wb_key_ref) sorted;
- struct btree_write_buffer_keys inc;
- struct btree_write_buffer_keys flushing;
- struct work_struct flush_work;
-
- DARRAY(struct btree_write_buffered_key) accounting;
-};
-
-#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
deleted file mode 100644
index f25903c10e8a..000000000000
--- a/fs/bcachefs/buckets.c
+++ /dev/null
@@ -1,1395 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Code for manipulating bucket marks for garbage collection.
- *
- * Copyright 2014 Datera, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "backpointers.h"
-#include "bset.h"
-#include "btree_gc.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "buckets_waiting_for_journal.h"
-#include "disk_accounting.h"
-#include "ec.h"
-#include "error.h"
-#include "inode.h"
-#include "movinggc.h"
-#include "rebalance.h"
-#include "recovery.h"
-#include "recovery_passes.h"
-#include "reflink.h"
-#include "replicas.h"
-#include "subvolume.h"
-#include "trace.h"
-
-#include <linux/preempt.h>
-
-void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage)
-{
- for (unsigned i = 0; i < BCH_DATA_NR; i++)
- usage->buckets[i] = percpu_u64_get(&ca->usage->d[i].buckets);
-}
-
-void bch2_dev_usage_full_read_fast(struct bch_dev *ca, struct bch_dev_usage_full *usage)
-{
- memset(usage, 0, sizeof(*usage));
- acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage,
- sizeof(struct bch_dev_usage_full) / sizeof(u64));
-}
-
-static u64 reserve_factor(u64 r)
-{
- return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
-}
-
-static struct bch_fs_usage_short
-__bch2_fs_usage_read_short(struct bch_fs *c)
-{
- struct bch_fs_usage_short ret;
- u64 data, reserved;
-
- ret.capacity = c->capacity -
- percpu_u64_get(&c->usage->hidden);
-
- data = percpu_u64_get(&c->usage->data) +
- percpu_u64_get(&c->usage->btree);
- reserved = percpu_u64_get(&c->usage->reserved) +
- percpu_u64_get(c->online_reserved);
-
- ret.used = min(ret.capacity, data + reserve_factor(reserved));
- ret.free = ret.capacity - ret.used;
-
- ret.nr_inodes = percpu_u64_get(&c->usage->nr_inodes);
-
- return ret;
-}
-
-struct bch_fs_usage_short
-bch2_fs_usage_read_short(struct bch_fs *c)
-{
- struct bch_fs_usage_short ret;
-
- percpu_down_read(&c->mark_lock);
- ret = __bch2_fs_usage_read_short(c);
- percpu_up_read(&c->mark_lock);
-
- return ret;
-}
-
-void bch2_dev_usage_to_text(struct printbuf *out,
- struct bch_dev *ca,
- struct bch_dev_usage_full *usage)
-{
- if (out->nr_tabstops < 5) {
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, 12);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
- }
-
- prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n");
-
- for (unsigned i = 0; i < BCH_DATA_NR; i++) {
- bch2_prt_data_type(out, i);
- prt_printf(out, "\t%llu\r%llu\r%llu\r\n",
- usage->d[i].buckets,
- usage->d[i].sectors,
- usage->d[i].fragmented);
- }
-
- prt_printf(out, "capacity\t%llu\r\n", ca->mi.nbuckets);
-}
-
-static int bch2_check_fix_ptr(struct btree_trans *trans,
- struct bkey_s_c k,
- struct extent_ptr_decoded p,
- const union bch_extent_entry *entry,
- bool *do_update)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
- if (!ca) {
- if (fsck_err_on(p.ptr.dev != BCH_SB_MEMBER_INVALID,
- trans, ptr_to_invalid_device,
- "pointer to missing device %u\n"
- "while marking %s",
- p.ptr.dev,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- *do_update = true;
- return 0;
- }
-
- struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
- if (!g) {
- if (fsck_err(trans, ptr_to_invalid_device,
- "pointer to invalid bucket on device %u\n"
- "while marking %s",
- p.ptr.dev,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- *do_update = true;
- goto out;
- }
-
- enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry);
-
- if (fsck_err_on(!g->gen_valid,
- trans, ptr_to_missing_alloc_key,
- "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
- p.ptr.gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- if (!p.ptr.cached) {
- g->gen_valid = true;
- g->gen = p.ptr.gen;
- } else {
- /* this pointer will be dropped */
- *do_update = true;
- goto out;
- }
- }
-
- /* g->gen_valid == true */
-
- if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
- trans, ptr_gen_newer_than_bucket_gen,
- "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
- p.ptr.gen, g->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- if (!p.ptr.cached &&
- (g->data_type != BCH_DATA_btree ||
- data_type == BCH_DATA_btree)) {
- g->data_type = data_type;
- g->stripe_sectors = 0;
- g->dirty_sectors = 0;
- g->cached_sectors = 0;
- }
-
- *do_update = true;
- }
-
- if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
- trans, ptr_gen_newer_than_bucket_gen,
- "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
- bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
- p.ptr.gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- *do_update = true;
-
- if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0,
- trans, stale_dirty_ptr,
- "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
- p.ptr.gen, g->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- *do_update = true;
-
- if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
- goto out;
-
- if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type),
- trans, ptr_bucket_data_type_mismatch,
- "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
- bch2_data_type_str(g->data_type),
- bch2_data_type_str(data_type),
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- if (!p.ptr.cached &&
- data_type == BCH_DATA_btree) {
- switch (g->data_type) {
- case BCH_DATA_sb:
- bch_err(c, "btree and superblock in the same bucket - cannot repair");
- ret = bch_err_throw(c, fsck_repair_unimplemented);
- goto out;
- case BCH_DATA_journal:
- ret = bch2_dev_journal_bucket_delete(ca, PTR_BUCKET_NR(ca, &p.ptr));
- bch_err_msg(c, ret, "error deleting journal bucket %zu",
- PTR_BUCKET_NR(ca, &p.ptr));
- if (ret)
- goto out;
- break;
- }
-
- g->data_type = data_type;
- g->stripe_sectors = 0;
- g->dirty_sectors = 0;
- g->cached_sectors = 0;
- } else {
- *do_update = true;
- }
- }
-
- if (p.has_ec) {
- struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
-
- if (fsck_err_on(!m || !m->alive,
- trans, ptr_to_missing_stripe,
- "pointer to nonexistent stripe %llu\n"
- "while marking %s",
- (u64) p.ec.idx,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- *do_update = true;
-
- if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p),
- trans, ptr_to_incorrect_stripe,
- "pointer does not match stripe %llu\n"
- "while marking %s",
- (u64) p.ec.idx,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- *do_update = true;
- }
-out:
-fsck_err:
- bch2_dev_put(ca);
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_check_fix_ptrs(struct btree_trans *trans,
- enum btree_id btree, unsigned level, struct bkey_s_c k,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry_c;
- struct extent_ptr_decoded p = { 0 };
- bool do_update = false;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- /* We don't yet do btree key updates correctly for when we're RW */
- BUG_ON(test_bit(BCH_FS_rw, &c->flags));
-
- bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) {
- ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update);
- if (ret)
- goto err;
- }
-
- if (do_update) {
- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto err;
-
- scoped_guard(rcu)
- bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev));
-
- if (level) {
- /*
- * We don't want to drop btree node pointers - if the
- * btree node isn't there anymore, the read path will
- * sort it out:
- */
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
- scoped_guard(rcu)
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- ptr->gen = PTR_GC_BUCKET(ca, ptr)->gen;
- }
- } else {
- struct bkey_ptrs ptrs;
- union bch_extent_entry *entry;
-
- rcu_read_lock();
-restart_drop_ptrs:
- ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
- bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) {
- struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
- struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
- enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry);
-
- if ((p.ptr.cached &&
- (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) ||
- (!p.ptr.cached &&
- gen_cmp(p.ptr.gen, g->gen) < 0) ||
- gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX ||
- (g->data_type &&
- g->data_type != data_type)) {
- bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr);
- goto restart_drop_ptrs;
- }
- }
- rcu_read_unlock();
-again:
- ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
- bkey_extent_entry_for_each(ptrs, entry) {
- if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
- struct gc_stripe *m = genradix_ptr(&c->gc_stripes,
- entry->stripe_ptr.idx);
- union bch_extent_entry *next_ptr;
-
- bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
- if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
- goto found;
- next_ptr = NULL;
-found:
- if (!next_ptr) {
- bch_err(c, "aieee, found stripe ptr with no data ptr");
- continue;
- }
-
- if (!m || !m->alive ||
- !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
- &next_ptr->ptr,
- m->sectors)) {
- bch2_bkey_extent_entry_drop(new, entry);
- goto again;
- }
- }
- }
- }
-
- if (0) {
- printbuf_reset(&buf);
- bch2_bkey_val_to_text(&buf, c, k);
- bch_info(c, "updated %s", buf.buf);
-
- printbuf_reset(&buf);
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
- bch_info(c, "new key %s", buf.buf);
- }
-
- if (!(flags & BTREE_TRIGGER_is_root)) {
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level,
- BTREE_ITER_intent|BTREE_ITER_all_snapshots);
- ret = bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_trans_update(trans, &iter, new,
- BTREE_UPDATE_internal_snapshot_node|
- BTREE_TRIGGER_norun);
- bch2_trans_iter_exit(trans, &iter);
- if (ret)
- goto err;
-
- if (level)
- bch2_btree_node_update_key_early(trans, btree, level - 1, k, new);
- } else {
- struct jset_entry *e = bch2_trans_jset_entry_alloc(trans,
- jset_u64s(new->k.u64s));
- ret = PTR_ERR_OR_ZERO(e);
- if (ret)
- goto err;
-
- journal_entry_set(e,
- BCH_JSET_ENTRY_btree_root,
- btree, level - 1,
- new, new->k.u64s);
-
- /*
- * no locking, we're single threaded and not rw yet, see
-		 * the big assertion above that we repeat here:
- */
- BUG_ON(test_bit(BCH_FS_rw, &c->flags));
-
- struct btree *b = bch2_btree_id_root(c, btree)->b;
- bkey_copy(&b->key, new);
- }
- }
-err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static int bucket_ref_update_err(struct btree_trans *trans, struct printbuf *buf,
- struct bkey_s_c k, bool insert, enum bch_sb_error_id id)
-{
- struct bch_fs *c = trans->c;
-
- prt_printf(buf, "\nwhile marking ");
- bch2_bkey_val_to_text(buf, c, k);
- prt_newline(buf);
-
- bool print = __bch2_count_fsck_err(c, id, buf);
-
- int ret = bch2_run_explicit_recovery_pass(c, buf,
- BCH_RECOVERY_PASS_check_allocations, 0);
-
- if (insert) {
- bch2_trans_updates_to_text(buf, trans);
- __bch2_inconsistent_error(c, buf);
- /*
- * If we're in recovery, run_explicit_recovery_pass might give
- * us an error code for rewinding recovery
- */
- if (!ret)
- ret = bch_err_throw(c, bucket_ref_update);
- } else {
- /* Always ignore overwrite errors, so that deletion works */
- ret = 0;
- }
-
- if (print || insert)
- bch2_print_str(c, KERN_ERR, buf->buf);
- return ret;
-}
-
-int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
- struct bkey_s_c k,
- const struct bch_extent_ptr *ptr,
- s64 sectors, enum bch_data_type ptr_data_type,
- u8 b_gen, u8 bucket_data_type,
- u32 *bucket_sectors)
-{
- struct bch_fs *c = trans->c;
- size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
- struct printbuf buf = PRINTBUF;
- bool inserting = sectors > 0;
- int ret = 0;
-
- BUG_ON(!sectors);
-
- if (unlikely(gen_after(ptr->gen, b_gen))) {
- bch2_log_msg_start(c, &buf);
- prt_printf(&buf,
- "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen",
- ptr->dev, bucket_nr, b_gen,
- bch2_data_type_str(bucket_data_type ?: ptr_data_type),
- ptr->gen);
-
- ret = bucket_ref_update_err(trans, &buf, k, inserting,
- BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen);
- goto out;
- }
-
- if (unlikely(gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX)) {
- bch2_log_msg_start(c, &buf);
- prt_printf(&buf,
- "bucket %u:%zu gen %u data type %s: ptr gen %u too stale",
- ptr->dev, bucket_nr, b_gen,
- bch2_data_type_str(bucket_data_type ?: ptr_data_type),
- ptr->gen);
-
- ret = bucket_ref_update_err(trans, &buf, k, inserting,
- BCH_FSCK_ERR_ptr_too_stale);
- goto out;
- }
-
- if (b_gen != ptr->gen && ptr->cached) {
- ret = 1;
- goto out;
- }
-
- if (unlikely(b_gen != ptr->gen)) {
- bch2_log_msg_start(c, &buf);
- prt_printf(&buf,
- "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)",
- ptr->dev, bucket_nr, b_gen,
- bucket_gen_get(ca, bucket_nr),
- bch2_data_type_str(bucket_data_type ?: ptr_data_type),
- ptr->gen);
-
- ret = bucket_ref_update_err(trans, &buf, k, inserting,
- BCH_FSCK_ERR_stale_dirty_ptr);
- goto out;
- }
-
- if (unlikely(bucket_data_type_mismatch(bucket_data_type, ptr_data_type))) {
- bch2_log_msg_start(c, &buf);
- prt_printf(&buf, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s",
- ptr->dev, bucket_nr, b_gen,
- bch2_data_type_str(bucket_data_type),
- bch2_data_type_str(ptr_data_type));
-
- ret = bucket_ref_update_err(trans, &buf, k, inserting,
- BCH_FSCK_ERR_ptr_bucket_data_type_mismatch);
- goto out;
- }
-
- if (unlikely((u64) *bucket_sectors + sectors > U32_MAX)) {
- bch2_log_msg_start(c, &buf);
- prt_printf(&buf,
- "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX",
- ptr->dev, bucket_nr, b_gen,
- bch2_data_type_str(bucket_data_type ?: ptr_data_type),
- *bucket_sectors, sectors);
-
- ret = bucket_ref_update_err(trans, &buf, k, inserting,
- BCH_FSCK_ERR_bucket_sector_count_overflow);
- sectors = -*bucket_sectors;
- goto out;
- }
-
- *bucket_sectors += sectors;
-out:
- printbuf_exit(&buf);
- return ret;
-}
-
-void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
- static int warned_disk_usage = 0;
- bool warn = false;
-
- percpu_down_read(&c->mark_lock);
- struct bch_fs_usage_base *src = &trans->fs_usage_delta;
-
- s64 added = src->btree + src->data + src->reserved;
-
- /*
- * Not allowed to reduce sectors_available except by getting a
- * reservation:
- */
- s64 should_not_have_added = added - (s64) disk_res_sectors;
- if (unlikely(should_not_have_added > 0)) {
- u64 old, new;
-
- old = atomic64_read(&c->sectors_available);
- do {
- new = max_t(s64, 0, old - should_not_have_added);
- } while (!atomic64_try_cmpxchg(&c->sectors_available,
- &old, new));
-
- added -= should_not_have_added;
- warn = true;
- }
-
- if (added > 0) {
- trans->disk_res->sectors -= added;
- this_cpu_sub(*c->online_reserved, added);
- }
-
- preempt_disable();
- struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
- acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64));
- preempt_enable();
- percpu_up_read(&c->mark_lock);
-
- if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
- bch2_trans_inconsistent(trans,
-			"disk usage increased %lli more than %llu sectors reserved",
- should_not_have_added, disk_res_sectors);
-}
-
-/* KEY_TYPE_extent: */
-
-static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca,
- struct bkey_s_c k,
- const struct extent_ptr_decoded *p,
- s64 sectors, enum bch_data_type ptr_data_type,
- struct bch_alloc_v4 *a,
- bool insert)
-{
- u32 *dst_sectors = p->has_ec ? &a->stripe_sectors :
- !p->ptr.cached ? &a->dirty_sectors :
- &a->cached_sectors;
- int ret = bch2_bucket_ref_update(trans, ca, k, &p->ptr, sectors, ptr_data_type,
- a->gen, a->data_type, dst_sectors);
-
- if (ret)
- return ret;
- if (insert)
- alloc_data_type_set(a, ptr_data_type);
- return 0;
-}
-
-static int bch2_trigger_pointer(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, struct extent_ptr_decoded p,
- const union bch_extent_entry *entry,
- s64 *sectors,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- bool insert = !(flags & BTREE_TRIGGER_overwrite);
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- struct bkey_i_backpointer bp;
- bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp);
-
- *sectors = insert ? bp.v.bucket_len : -(s64) bp.v.bucket_len;
-
- struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
- if (unlikely(!ca)) {
- if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID)
- ret = bch_err_throw(c, trigger_pointer);
- goto err;
- }
-
- struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr);
- if (!bucket_valid(ca, bucket.offset)) {
- if (insert) {
- bch2_dev_bucket_missing(ca, bucket.offset);
- ret = bch_err_throw(c, trigger_pointer);
- }
- goto err;
- }
-
- if (flags & BTREE_TRIGGER_transactional) {
- struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0);
- ret = PTR_ERR_OR_ZERO(a) ?:
- __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v, insert);
- if (ret)
- goto err;
-
- ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert);
- if (ret)
- goto err;
- }
-
- if (flags & BTREE_TRIGGER_gc) {
- struct bucket *g = gc_bucket(ca, bucket.offset);
- if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
- p.ptr.dev,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = bch_err_throw(c, trigger_pointer);
- goto err;
- }
-
- bucket_lock(g);
- struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
- ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new, insert);
- alloc_to_bucket(g, new);
- bucket_unlock(g);
-
- if (!ret)
- ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
- }
-err:
- bch2_dev_put(ca);
- printbuf_exit(&buf);
- return ret;
-}
-
-static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
- struct bkey_s_c k,
- struct extent_ptr_decoded p,
- enum bch_data_type data_type,
- s64 sectors,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
-
- if (flags & BTREE_TRIGGER_transactional) {
- struct btree_iter iter;
- struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_stripes, POS(0, p.ec.idx),
- BTREE_ITER_with_updates, stripe);
- int ret = PTR_ERR_OR_ZERO(s);
- if (unlikely(ret)) {
- bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans,
- "pointer to nonexistent stripe %llu",
- (u64) p.ec.idx);
- goto err;
- }
-
- if (!bch2_ptr_matches_stripe(&s->v, p)) {
- bch2_trans_inconsistent(trans,
- "stripe pointer doesn't match stripe %llu",
- (u64) p.ec.idx);
- ret = bch_err_throw(c, trigger_stripe_pointer);
- goto err;
- }
-
- stripe_blockcount_set(&s->v, p.ec.block,
- stripe_blockcount_get(&s->v, p.ec.block) +
- sectors);
-
- struct disk_accounting_pos acc;
- memset(&acc, 0, sizeof(acc));
- acc.type = BCH_DISK_ACCOUNTING_replicas;
- bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
- acc.replicas.data_type = data_type;
- ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
- }
-
- if (flags & BTREE_TRIGGER_gc) {
- struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL);
- if (!m) {
- bch_err(c, "error allocating memory for gc_stripes, idx %llu",
- (u64) p.ec.idx);
- return bch_err_throw(c, ENOMEM_mark_stripe_ptr);
- }
-
- gc_stripe_lock(m);
-
- if (!m || !m->alive) {
- gc_stripe_unlock(m);
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
- prt_printf(&buf, "pointer to nonexistent stripe %llu\n while marking ",
- (u64) p.ec.idx);
- bch2_bkey_val_to_text(&buf, c, k);
- __bch2_inconsistent_error(c, &buf);
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- return bch_err_throw(c, trigger_stripe_pointer);
- }
-
- m->block_sectors[p.ec.block] += sectors;
-
- struct disk_accounting_pos acc;
- memset(&acc, 0, sizeof(acc));
- acc.type = BCH_DISK_ACCOUNTING_replicas;
- unsafe_memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e), "VLA");
- gc_stripe_unlock(m);
-
- acc.replicas.data_type = data_type;
- int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, true);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static int __trigger_extent(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k,
- enum btree_iter_update_trigger_flags flags)
-{
- bool gc = flags & BTREE_TRIGGER_gc;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
- ? BCH_DATA_btree
- : BCH_DATA_user;
- int ret = 0;
-
- s64 replicas_sectors = 0;
-
- struct disk_accounting_pos acc_replicas_key;
- memset(&acc_replicas_key, 0, sizeof(acc_replicas_key));
- acc_replicas_key.type = BCH_DISK_ACCOUNTING_replicas;
- acc_replicas_key.replicas.data_type = data_type;
- acc_replicas_key.replicas.nr_devs = 0;
- acc_replicas_key.replicas.nr_required = 1;
-
- unsigned cur_compression_type = 0;
- u64 compression_acct[3] = { 1, 0, 0 };
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- s64 disk_sectors = 0;
- ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags);
- if (ret < 0)
- return ret;
-
- bool stale = ret > 0;
-
- if (p.ptr.cached && stale)
- continue;
-
- if (p.ptr.cached) {
- ret = bch2_mod_dev_cached_sectors(trans, p.ptr.dev, disk_sectors, gc);
- if (ret)
- return ret;
- } else if (!p.has_ec) {
- replicas_sectors += disk_sectors;
- replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev);
- } else {
- ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
- if (ret)
- return ret;
-
- /*
- * There may be other dirty pointers in this extent, but
- * if so they're not required for mounting if we have an
- * erasure coded pointer in this extent:
- */
- acc_replicas_key.replicas.nr_required = 0;
- }
-
- if (cur_compression_type &&
- cur_compression_type != p.crc.compression_type) {
- if (flags & BTREE_TRIGGER_overwrite)
- bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));
-
- ret = bch2_disk_accounting_mod2(trans, gc, compression_acct,
- compression, cur_compression_type);
- if (ret)
- return ret;
-
- compression_acct[0] = 1;
- compression_acct[1] = 0;
- compression_acct[2] = 0;
- }
-
- cur_compression_type = p.crc.compression_type;
- if (p.crc.compression_type) {
- compression_acct[1] += p.crc.uncompressed_size;
- compression_acct[2] += p.crc.compressed_size;
- }
- }
-
- if (acc_replicas_key.replicas.nr_devs) {
- ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, &replicas_sectors, 1, gc);
- if (ret)
- return ret;
- }
-
- if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) {
- ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, snapshot, k.k->p.snapshot);
- if (ret)
- return ret;
- }
-
- if (cur_compression_type) {
- if (flags & BTREE_TRIGGER_overwrite)
- bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));
-
- ret = bch2_disk_accounting_mod2(trans, gc, compression_acct,
- compression, cur_compression_type);
- if (ret)
- return ret;
- }
-
- if (level) {
- ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, btree, btree_id);
- if (ret)
- return ret;
- } else {
- bool insert = !(flags & BTREE_TRIGGER_overwrite);
-
- s64 v[3] = {
- insert ? 1 : -1,
- insert ? k.k->size : -((s64) k.k->size),
- replicas_sectors,
- };
- ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-int bch2_trigger_extent(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c);
- struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old);
- unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start;
- unsigned old_ptrs_bytes = (void *) old_ptrs.end - (void *) old_ptrs.start;
-
- if (unlikely(flags & BTREE_TRIGGER_check_repair))
- return bch2_check_fix_ptrs(trans, btree, level, new.s_c, flags);
-
- /* if pointers aren't changing - nothing to do: */
- if (new_ptrs_bytes == old_ptrs_bytes &&
- !memcmp(new_ptrs.start,
- old_ptrs.start,
- new_ptrs_bytes))
- return 0;
-
- if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
- if (old.k->type) {
- int ret = __trigger_extent(trans, btree, level, old,
- flags & ~BTREE_TRIGGER_insert);
- if (ret)
- return ret;
- }
-
- if (new.k->type) {
- int ret = __trigger_extent(trans, btree, level, new.s_c,
- flags & ~BTREE_TRIGGER_overwrite);
- if (ret)
- return ret;
- }
-
- int need_rebalance_delta = 0;
- s64 need_rebalance_sectors_delta[1] = { 0 };
-
- s64 s = bch2_bkey_sectors_need_rebalance(c, old);
- need_rebalance_delta -= s != 0;
- need_rebalance_sectors_delta[0] -= s;
-
- s = bch2_bkey_sectors_need_rebalance(c, new.s_c);
- need_rebalance_delta += s != 0;
- need_rebalance_sectors_delta[0] += s;
-
- if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) {
- int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
- new.k->p, need_rebalance_delta > 0);
- if (ret)
- return ret;
- }
-
- if (need_rebalance_sectors_delta[0]) {
- int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc,
- need_rebalance_sectors_delta, rebalance_work);
- if (ret)
- return ret;
- }
- }
-
- return 0;
-}
-
-/* KEY_TYPE_reservation */
-
-static int __trigger_reservation(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level, struct bkey_s_c k,
- enum btree_iter_update_trigger_flags flags)
-{
- if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
- s64 sectors[1] = { k.k->size };
-
- if (flags & BTREE_TRIGGER_overwrite)
- sectors[0] = -sectors[0];
-
- return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, sectors,
- persistent_reserved, bkey_s_c_to_reservation(k).v->nr_replicas);
- }
-
- return 0;
-}
-
-int bch2_trigger_reservation(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- return trigger_run_overwrite_then_insert(__trigger_reservation, trans, btree_id, level, old, new, flags);
-}
-
-/* Mark superblocks: */
-
-static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
- struct bch_dev *ca, u64 b,
- enum bch_data_type type,
- unsigned sectors)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- int ret = 0;
-
- struct bkey_i_alloc_v4 *a =
- bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(ca->dev_idx, b));
- if (IS_ERR(a))
- return PTR_ERR(a);
-
- if (a->v.data_type && type && a->v.data_type != type) {
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
- prt_printf(&buf, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
- "while marking %s\n",
- iter.pos.inode, iter.pos.offset, a->v.gen,
- bch2_data_type_str(a->v.data_type),
- bch2_data_type_str(type),
- bch2_data_type_str(type));
-
- bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf);
-
- ret = bch2_run_explicit_recovery_pass(c, &buf,
- BCH_RECOVERY_PASS_check_allocations, 0);
-
- /* Always print, this is always fatal */
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- if (!ret)
- ret = bch_err_throw(c, metadata_bucket_inconsistency);
- goto err;
- }
-
- if (a->v.data_type != type ||
- a->v.dirty_sectors != sectors) {
- a->v.data_type = type;
- a->v.dirty_sectors = sectors;
- ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
- }
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca,
- u64 b, enum bch_data_type data_type, unsigned sectors,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- int ret = 0;
-
- struct bucket *g = gc_bucket(ca, b);
- if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s",
- ca->dev_idx, bch2_data_type_str(data_type)))
- goto err;
-
- bucket_lock(g);
- struct bch_alloc_v4 old = bucket_m_to_alloc(*g);
-
- if (bch2_fs_inconsistent_on(g->data_type &&
- g->data_type != data_type, c,
- "different types of data in same bucket: %s, %s",
- bch2_data_type_str(g->data_type),
- bch2_data_type_str(data_type)))
- goto err_unlock;
-
- if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
- "bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size",
- ca->dev_idx, b, g->gen,
- bch2_data_type_str(g->data_type ?: data_type),
- g->dirty_sectors, sectors))
- goto err_unlock;
-
- g->data_type = data_type;
- g->dirty_sectors += sectors;
- struct bch_alloc_v4 new = bucket_m_to_alloc(*g);
- bucket_unlock(g);
- ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
- return ret;
-err_unlock:
- bucket_unlock(g);
-err:
- return bch_err_throw(c, metadata_bucket_inconsistency);
-}
-
-int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
- struct bch_dev *ca, u64 b,
- enum bch_data_type type, unsigned sectors,
- enum btree_iter_update_trigger_flags flags)
-{
- BUG_ON(type != BCH_DATA_free &&
- type != BCH_DATA_sb &&
- type != BCH_DATA_journal);
-
- /*
- * Backup superblock might be past the end of our normal usable space:
- */
- if (b >= ca->mi.nbuckets)
- return 0;
-
- if (flags & BTREE_TRIGGER_gc)
- return bch2_mark_metadata_bucket(trans, ca, b, type, sectors, flags);
- else if (flags & BTREE_TRIGGER_transactional)
- return commit_do(trans, NULL, NULL, 0,
- __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
- else
- BUG();
-}
-
-static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
- struct bch_dev *ca, u64 start, u64 end,
- enum bch_data_type type, u64 *bucket, unsigned *bucket_sectors,
- enum btree_iter_update_trigger_flags flags)
-{
- do {
- u64 b = sector_to_bucket(ca, start);
- unsigned sectors =
- min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
-
- if (b != *bucket && *bucket_sectors) {
- int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket,
- type, *bucket_sectors, flags);
- if (ret)
- return ret;
-
- *bucket_sectors = 0;
- }
-
- *bucket = b;
- *bucket_sectors += sectors;
- start += sectors;
- } while (start < end);
-
- return 0;
-}
-
-static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *ca,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
-
- mutex_lock(&c->sb_lock);
- struct bch_sb_layout layout = ca->disk_sb.sb->layout;
- mutex_unlock(&c->sb_lock);
-
- u64 bucket = 0;
- unsigned i, bucket_sectors = 0;
- int ret;
-
- for (i = 0; i < layout.nr_superblocks; i++) {
- u64 offset = le64_to_cpu(layout.sb_offset[i]);
-
- if (offset == BCH_SB_SECTOR) {
- ret = bch2_trans_mark_metadata_sectors(trans, ca,
- 0, BCH_SB_SECTOR,
- BCH_DATA_sb, &bucket, &bucket_sectors, flags);
- if (ret)
- return ret;
- }
-
- ret = bch2_trans_mark_metadata_sectors(trans, ca, offset,
- offset + (1 << layout.sb_max_size_bits),
- BCH_DATA_sb, &bucket, &bucket_sectors, flags);
- if (ret)
- return ret;
- }
-
- if (bucket_sectors) {
- ret = bch2_trans_mark_metadata_bucket(trans, ca,
- bucket, BCH_DATA_sb, bucket_sectors, flags);
- if (ret)
- return ret;
- }
-
- for (i = 0; i < ca->journal.nr; i++) {
- ret = bch2_trans_mark_metadata_bucket(trans, ca,
- ca->journal.buckets[i],
- BCH_DATA_journal, ca->mi.bucket_size, flags);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca,
- enum btree_iter_update_trigger_flags flags)
-{
- int ret = bch2_trans_run(c,
- __bch2_trans_mark_dev_sb(trans, ca, flags));
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c,
- enum btree_iter_update_trigger_flags flags)
-{
- for_each_online_member(c, ca, BCH_DEV_READ_REF_trans_mark_dev_sbs) {
- int ret = bch2_trans_mark_dev_sb(c, ca, flags);
- if (ret) {
- enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_trans_mark_dev_sbs);
- return ret;
- }
- }
-
- return 0;
-}
-
-int bch2_trans_mark_dev_sbs(struct bch_fs *c)
-{
- return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_transactional);
-}
-
-bool bch2_is_superblock_bucket(struct bch_dev *ca, u64 b)
-{
- struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
- u64 b_offset = bucket_to_sector(ca, b);
- u64 b_end = bucket_to_sector(ca, b + 1);
- unsigned i;
-
- if (!b)
- return true;
-
- for (i = 0; i < layout->nr_superblocks; i++) {
- u64 offset = le64_to_cpu(layout->sb_offset[i]);
- u64 end = offset + (1 << layout->sb_max_size_bits);
-
- if (!(offset >= b_end || end <= b_offset))
- return true;
- }
-
- for (i = 0; i < ca->journal.nr; i++)
- if (b == ca->journal.buckets[i])
- return true;
-
- return false;
-}
-
-/* Disk reservations: */
-
-#define SECTORS_CACHE 1024
-
-int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
- u64 sectors, enum bch_reservation_flags flags)
-{
- struct bch_fs_pcpu *pcpu;
- u64 old, get;
- u64 sectors_available;
- int ret;
-
- percpu_down_read(&c->mark_lock);
- preempt_disable();
- pcpu = this_cpu_ptr(c->pcpu);
-
- if (sectors <= pcpu->sectors_available)
- goto out;
-
- old = atomic64_read(&c->sectors_available);
- do {
- get = min((u64) sectors + SECTORS_CACHE, old);
-
- if (get < sectors) {
- preempt_enable();
- goto recalculate;
- }
- } while (!atomic64_try_cmpxchg(&c->sectors_available,
- &old, old - get));
-
- pcpu->sectors_available += get;
-
-out:
- pcpu->sectors_available -= sectors;
- this_cpu_add(*c->online_reserved, sectors);
- res->sectors += sectors;
-
- preempt_enable();
- percpu_up_read(&c->mark_lock);
- return 0;
-
-recalculate:
- mutex_lock(&c->sectors_available_lock);
-
- percpu_u64_set(&c->pcpu->sectors_available, 0);
- sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);
-
- if (sectors_available && (flags & BCH_DISK_RESERVATION_PARTIAL))
- sectors = min(sectors, sectors_available);
-
- if (sectors <= sectors_available ||
- (flags & BCH_DISK_RESERVATION_NOFAIL)) {
- atomic64_set(&c->sectors_available,
- max_t(s64, 0, sectors_available - sectors));
- this_cpu_add(*c->online_reserved, sectors);
- res->sectors += sectors;
- ret = 0;
- } else {
- atomic64_set(&c->sectors_available, sectors_available);
- ret = bch_err_throw(c, ENOSPC_disk_reservation);
- }
-
- mutex_unlock(&c->sectors_available_lock);
- percpu_up_read(&c->mark_lock);
-
- return ret;
-}
-
-/* Startup/shutdown: */
-
-void bch2_buckets_nouse_free(struct bch_fs *c)
-{
- for_each_member_device(c, ca) {
- kvfree_rcu_mightsleep(ca->buckets_nouse);
- ca->buckets_nouse = NULL;
- }
-}
-
-int bch2_buckets_nouse_alloc(struct bch_fs *c)
-{
- for_each_member_device(c, ca) {
- BUG_ON(ca->buckets_nouse);
-
- ca->buckets_nouse = bch2_kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) *
- sizeof(unsigned long),
- GFP_KERNEL|__GFP_ZERO);
- if (!ca->buckets_nouse) {
- bch2_dev_put(ca);
- return bch_err_throw(c, ENOMEM_buckets_nouse);
- }
- }
-
- return 0;
-}
-
-static void bucket_gens_free_rcu(struct rcu_head *rcu)
-{
- struct bucket_gens *buckets =
- container_of(rcu, struct bucket_gens, rcu);
-
- kvfree(buckets);
-}
-
-int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
-{
- struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
- bool resize = ca->bucket_gens != NULL;
- int ret;
-
- if (resize)
- lockdep_assert_held(&c->state_lock);
-
- if (resize && ca->buckets_nouse)
- return bch_err_throw(c, no_resize_with_buckets_nouse);
-
- bucket_gens = bch2_kvmalloc(struct_size(bucket_gens, b, nbuckets),
- GFP_KERNEL|__GFP_ZERO);
- if (!bucket_gens) {
- ret = bch_err_throw(c, ENOMEM_bucket_gens);
- goto err;
- }
-
- bucket_gens->first_bucket = ca->mi.first_bucket;
- bucket_gens->nbuckets = nbuckets;
- bucket_gens->nbuckets_minus_first =
- bucket_gens->nbuckets - bucket_gens->first_bucket;
-
- old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);
-
- if (resize) {
- u64 copy = min(bucket_gens->nbuckets,
- old_bucket_gens->nbuckets);
- memcpy(bucket_gens->b,
- old_bucket_gens->b,
- sizeof(bucket_gens->b[0]) * copy);
- }
-
- ret = bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_mismatch,
- ca->mi.nbuckets, nbuckets) ?:
- bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_empty,
- ca->mi.nbuckets, nbuckets);
-
- rcu_assign_pointer(ca->bucket_gens, bucket_gens);
- bucket_gens = old_bucket_gens;
-
- nbuckets = ca->mi.nbuckets;
-
- ret = 0;
-err:
- if (bucket_gens)
- call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
-
- return ret;
-}
-
-void bch2_dev_buckets_free(struct bch_dev *ca)
-{
- kvfree(ca->buckets_nouse);
- kvfree(rcu_dereference_protected(ca->bucket_gens, 1));
- free_percpu(ca->usage);
-}
-
-int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
-{
- ca->usage = alloc_percpu(struct bch_dev_usage_full);
- if (!ca->usage)
- return bch_err_throw(c, ENOMEM_usage_init);
-
- return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
-}
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
deleted file mode 100644
index 49a3807a5eab..000000000000
--- a/fs/bcachefs/buckets.h
+++ /dev/null
@@ -1,369 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Code for manipulating bucket marks for garbage collection.
- *
- * Copyright 2014 Datera, Inc.
- */
-
-#ifndef _BUCKETS_H
-#define _BUCKETS_H
-
-#include "buckets_types.h"
-#include "extents.h"
-#include "sb-members.h"
-
-static inline u64 sector_to_bucket(const struct bch_dev *ca, sector_t s)
-{
- return div_u64(s, ca->mi.bucket_size);
-}
-
-static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b)
-{
- return ((sector_t) b) * ca->mi.bucket_size;
-}
-
-static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
-{
- u32 remainder;
-
- div_u64_rem(s, ca->mi.bucket_size, &remainder);
- return remainder;
-}
-
-static inline u64 sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, u32 *offset)
-{
- return div_u64_rem(s, ca->mi.bucket_size, offset);
-}
-
-#define for_each_bucket(_b, _buckets) \
- for (_b = (_buckets)->b + (_buckets)->first_bucket; \
- _b < (_buckets)->b + (_buckets)->nbuckets; _b++)
-
-static inline void bucket_unlock(struct bucket *b)
-{
- BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);
-
- clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock);
- smp_mb__after_atomic();
- wake_up_bit((void *) &b->lock, BUCKET_LOCK_BITNR);
-}
-
-static inline void bucket_lock(struct bucket *b)
-{
- wait_on_bit_lock((void *) &b->lock, BUCKET_LOCK_BITNR,
- TASK_UNINTERRUPTIBLE);
-}
-
-static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
-{
- return bucket_valid(ca, b)
- ? genradix_ptr(&ca->buckets_gc, b)
- : NULL;
-}
-
-static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
-{
- return rcu_dereference_check(ca->bucket_gens,
- lockdep_is_held(&ca->fs->state_lock));
-}
-
-static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
-{
- struct bucket_gens *gens = bucket_gens(ca);
-
- if (b - gens->first_bucket >= gens->nbuckets_minus_first)
- return NULL;
- return gens->b + b;
-}
-
-static inline int bucket_gen_get_rcu(struct bch_dev *ca, size_t b)
-{
- u8 *gen = bucket_gen(ca, b);
- return gen ? *gen : -1;
-}
-
-static inline int bucket_gen_get(struct bch_dev *ca, size_t b)
-{
- guard(rcu)();
- return bucket_gen_get_rcu(ca, b);
-}
-
-static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
- const struct bch_extent_ptr *ptr)
-{
- return sector_to_bucket(ca, ptr->offset);
-}
-
-static inline struct bpos PTR_BUCKET_POS(const struct bch_dev *ca,
- const struct bch_extent_ptr *ptr)
-{
- return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
-}
-
-static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_dev *ca,
- const struct bch_extent_ptr *ptr,
- u32 *bucket_offset)
-{
- return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset));
-}
-
-static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca,
- const struct bch_extent_ptr *ptr)
-{
- return gc_bucket(ca, PTR_BUCKET_NR(ca, ptr));
-}
-
-static inline enum bch_data_type ptr_data_type(const struct bkey *k,
- const struct bch_extent_ptr *ptr)
-{
- if (bkey_is_btree_ptr(k))
- return BCH_DATA_btree;
-
- return ptr->cached ? BCH_DATA_cached : BCH_DATA_user;
-}
-
-static inline s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p)
-{
- EBUG_ON(sectors < 0);
-
- return crc_is_compressed(p.crc)
- ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size,
- p.crc.uncompressed_size)
- : sectors;
-}
-
-static inline int gen_cmp(u8 a, u8 b)
-{
- return (s8) (a - b);
-}
-
-static inline int gen_after(u8 a, u8 b)
-{
- return max(0, gen_cmp(a, b));
-}
-
-static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
-{
- int gen = bucket_gen_get_rcu(ca, PTR_BUCKET_NR(ca, ptr));
- return gen < 0 ? gen : gen_after(gen, ptr->gen);
-}
-
-/**
- * dev_ptr_stale() - check if a pointer points into a bucket that has been
- * invalidated.
- */
-static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
-{
- guard(rcu)();
- return dev_ptr_stale_rcu(ca, ptr);
-}
-
-/* Device usage: */
-
-void bch2_dev_usage_read_fast(struct bch_dev *, struct bch_dev_usage *);
-static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
-{
- struct bch_dev_usage ret;
-
- bch2_dev_usage_read_fast(ca, &ret);
- return ret;
-}
-
-void bch2_dev_usage_full_read_fast(struct bch_dev *, struct bch_dev_usage_full *);
-static inline struct bch_dev_usage_full bch2_dev_usage_full_read(struct bch_dev *ca)
-{
- struct bch_dev_usage_full ret;
-
- bch2_dev_usage_full_read_fast(ca, &ret);
- return ret;
-}
-
-void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage_full *);
-
-static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark)
-{
- s64 reserved = 0;
-
- switch (watermark) {
- case BCH_WATERMARK_NR:
- BUG();
- case BCH_WATERMARK_stripe:
- reserved += ca->mi.nbuckets >> 6;
- fallthrough;
- case BCH_WATERMARK_normal:
- reserved += ca->mi.nbuckets >> 6;
- fallthrough;
- case BCH_WATERMARK_copygc:
- reserved += ca->nr_btree_reserve;
- fallthrough;
- case BCH_WATERMARK_btree:
- reserved += ca->nr_btree_reserve;
- fallthrough;
- case BCH_WATERMARK_btree_copygc:
- case BCH_WATERMARK_reclaim:
- case BCH_WATERMARK_interior_updates:
- break;
- }
-
- return reserved;
-}
-
-static inline u64 dev_buckets_free(struct bch_dev *ca,
- struct bch_dev_usage usage,
- enum bch_watermark watermark)
-{
- return max_t(s64, 0,
-		     usage.buckets[BCH_DATA_free] -
- ca->nr_open_buckets -
- bch2_dev_buckets_reserved(ca, watermark));
-}
-
-static inline u64 __dev_buckets_available(struct bch_dev *ca,
- struct bch_dev_usage usage,
- enum bch_watermark watermark)
-{
- return max_t(s64, 0,
- usage.buckets[BCH_DATA_free]
- + usage.buckets[BCH_DATA_cached]
- + usage.buckets[BCH_DATA_need_gc_gens]
- + usage.buckets[BCH_DATA_need_discard]
- - ca->nr_open_buckets
- - bch2_dev_buckets_reserved(ca, watermark));
-}
-
-static inline u64 dev_buckets_available(struct bch_dev *ca,
- enum bch_watermark watermark)
-{
- return __dev_buckets_available(ca, bch2_dev_usage_read(ca), watermark);
-}
-
-/* Filesystem usage: */
-
-struct bch_fs_usage_short
-bch2_fs_usage_read_short(struct bch_fs *);
-
-int bch2_bucket_ref_update(struct btree_trans *, struct bch_dev *,
- struct bkey_s_c, const struct bch_extent_ptr *,
- s64, enum bch_data_type, u8, u8, u32 *);
-
-int bch2_check_fix_ptrs(struct btree_trans *,
- enum btree_id, unsigned, struct bkey_s_c,
- enum btree_iter_update_trigger_flags);
-
-int bch2_trigger_extent(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-
-#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\
-({ \
- int ret = 0; \
- \
- if (_old.k->type) \
- ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_insert); \
- if (!ret && _new.k->type) \
- ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_overwrite);\
- ret; \
-})
-
-void bch2_trans_account_disk_usage_change(struct btree_trans *);
-
-int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, u64,
- enum bch_data_type, unsigned,
- enum btree_iter_update_trigger_flags);
-int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *,
- enum btree_iter_update_trigger_flags);
-int bch2_trans_mark_dev_sbs_flags(struct bch_fs *,
- enum btree_iter_update_trigger_flags);
-int bch2_trans_mark_dev_sbs(struct bch_fs *);
-
-bool bch2_is_superblock_bucket(struct bch_dev *, u64);
-
-static inline const char *bch2_data_type_str(enum bch_data_type type)
-{
- return type < BCH_DATA_NR
- ? __bch2_data_types[type]
- : "(invalid data type)";
-}
-
-/* disk reservations: */
-
-static inline void bch2_disk_reservation_put(struct bch_fs *c,
- struct disk_reservation *res)
-{
- if (res->sectors) {
- this_cpu_sub(*c->online_reserved, res->sectors);
- res->sectors = 0;
- }
-}
-
-enum bch_reservation_flags {
- BCH_DISK_RESERVATION_NOFAIL = 1 << 0,
- BCH_DISK_RESERVATION_PARTIAL = 1 << 1,
-};
-
-int __bch2_disk_reservation_add(struct bch_fs *, struct disk_reservation *,
- u64, enum bch_reservation_flags);
-
-static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
- u64 sectors, enum bch_reservation_flags flags)
-{
-#ifdef __KERNEL__
- u64 old, new;
-
- old = this_cpu_read(c->pcpu->sectors_available);
- do {
- if (sectors > old)
- return __bch2_disk_reservation_add(c, res, sectors, flags);
-
- new = old - sectors;
- } while (!this_cpu_try_cmpxchg(c->pcpu->sectors_available, &old, new));
-
- this_cpu_add(*c->online_reserved, sectors);
- res->sectors += sectors;
- return 0;
-#else
- return __bch2_disk_reservation_add(c, res, sectors, flags);
-#endif
-}
-
-static inline struct disk_reservation
-bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas)
-{
- return (struct disk_reservation) {
- .sectors = 0,
-#if 0
- /* not used yet: */
- .gen = c->capacity_gen,
-#endif
- .nr_replicas = nr_replicas,
- };
-}
-
-static inline int bch2_disk_reservation_get(struct bch_fs *c,
- struct disk_reservation *res,
- u64 sectors, unsigned nr_replicas,
- int flags)
-{
- *res = bch2_disk_reservation_init(c, nr_replicas);
-
- return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags);
-}
-
-#define RESERVE_FACTOR 6
-
-static inline u64 avail_factor(u64 r)
-{
- return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1);
-}
-
-void bch2_buckets_nouse_free(struct bch_fs *);
-int bch2_buckets_nouse_alloc(struct bch_fs *);
-
-int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64);
-void bch2_dev_buckets_free(struct bch_dev *);
-int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *);
-
-#endif /* _BUCKETS_H */
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
deleted file mode 100644
index 0aed2500ade3..000000000000
--- a/fs/bcachefs/buckets_types.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BUCKETS_TYPES_H
-#define _BUCKETS_TYPES_H
-
-#include "bcachefs_format.h"
-#include "util.h"
-
-#define BUCKET_JOURNAL_SEQ_BITS 16
-
-/*
- * Ugly hack alert:
- *
- * We need to cram a spinlock in a single byte, because that's what we have left
- * in struct bucket, and we care about the size of these - during fsck, we need
- * in memory state for every single bucket on every device.
- *
- * We used to do
- *   while (xchg(&b->lock, 1)) cpu_relax();
- * but, it turns out not all architectures support xchg on a single byte.
- *
- * So now we use bit_spin_lock(), with fun games since we can't burn a whole
- * ulong for this - we just need to make sure the lock bit always ends up in the
- * first byte.
- */
-
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-#define BUCKET_LOCK_BITNR 0
-#else
-#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1)
-#endif
-
-union ulong_byte_assert {
- ulong ulong;
- u8 byte;
-};
-
-struct bucket {
- u8 lock;
- u8 gen_valid:1;
- u8 data_type:7;
- u8 gen;
- u8 stripe_redundancy;
- u32 stripe;
- u32 dirty_sectors;
- u32 cached_sectors;
- u32 stripe_sectors;
-} __aligned(sizeof(long));
-
-struct bucket_gens {
- struct rcu_head rcu;
- u16 first_bucket;
- size_t nbuckets;
- size_t nbuckets_minus_first;
- u8 b[] __counted_by(nbuckets);
-};
-
-/* Only info on bucket counts: */
-struct bch_dev_usage {
- u64 buckets[BCH_DATA_NR];
-};
-
-struct bch_dev_usage_full {
- struct bch_dev_usage_type {
- u64 buckets;
- u64 sectors; /* _compressed_ sectors: */
- /*
- * XXX
- * Why do we have this? Isn't it just buckets * bucket_size -
- * sectors?
- */
- u64 fragmented;
- } d[BCH_DATA_NR];
-};
-
-struct bch_fs_usage_base {
- u64 hidden;
- u64 btree;
- u64 data;
- u64 cached;
- u64 reserved;
- u64 nr_inodes;
-};
-
-struct bch_fs_usage_short {
- u64 capacity;
- u64 used;
- u64 free;
- u64 nr_inodes;
-};
-
-/*
- * A reservation for space on disk:
- */
-struct disk_reservation {
- u64 sectors;
- u32 gen;
- unsigned nr_replicas;
-};
-
-#endif /* _BUCKETS_TYPES_H */
diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c
deleted file mode 100644
index 832eff93acb6..000000000000
--- a/fs/bcachefs/buckets_waiting_for_journal.c
+++ /dev/null
@@ -1,174 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "buckets_waiting_for_journal.h"
-#include <linux/hash.h>
-#include <linux/random.h>
-
-static inline struct bucket_hashed *
-bucket_hash(struct buckets_waiting_for_journal_table *t,
- unsigned hash_seed_idx, u64 dev_bucket)
-{
- return t->d + hash_64(dev_bucket ^ t->hash_seeds[hash_seed_idx], t->bits);
-}
-
-static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t bits)
-{
- unsigned i;
-
- t->bits = bits;
- for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++)
- get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i]));
- memset(t->d, 0, sizeof(t->d[0]) << t->bits);
-}
-
-u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *b,
- unsigned dev, u64 bucket)
-{
- struct buckets_waiting_for_journal_table *t;
- u64 dev_bucket = (u64) dev << 56 | bucket;
- u64 ret = 0;
-
- mutex_lock(&b->lock);
- t = b->t;
-
- for (unsigned i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
- struct bucket_hashed *h = bucket_hash(t, i, dev_bucket);
-
- if (h->dev_bucket == dev_bucket) {
- ret = h->journal_seq;
- break;
- }
- }
-
- mutex_unlock(&b->lock);
-
- return ret;
-}
-
-static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t,
- struct bucket_hashed *new,
- u64 flushed_seq)
-{
- struct bucket_hashed *last_evicted = NULL;
- unsigned tries, i;
-
- for (tries = 0; tries < 10; tries++) {
- struct bucket_hashed *old, *victim = NULL;
-
- for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
- old = bucket_hash(t, i, new->dev_bucket);
-
- if (old->dev_bucket == new->dev_bucket ||
- old->journal_seq <= flushed_seq) {
- *old = *new;
- return true;
- }
-
- if (last_evicted != old)
- victim = old;
- }
-
- /* hashed to same slot 3 times: */
- if (!victim)
- break;
-
- /* Failed to find an empty slot: */
- swap(*new, *victim);
- last_evicted = victim;
- }
-
- return false;
-}
-
-int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
- u64 flushed_seq,
- unsigned dev, u64 bucket,
- u64 journal_seq)
-{
- struct buckets_waiting_for_journal_table *t, *n;
- struct bucket_hashed tmp, new = {
- .dev_bucket = (u64) dev << 56 | bucket,
- .journal_seq = journal_seq,
- };
- size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0, nr_rehashes_this_size = 0;
- int ret = 0;
-
- mutex_lock(&b->lock);
-
- if (likely(bucket_table_insert(b->t, &new, flushed_seq)))
- goto out;
-
- t = b->t;
- size = 1UL << t->bits;
- for (i = 0; i < size; i++)
- nr_elements += t->d[i].journal_seq > flushed_seq;
-
- new_bits = ilog2(roundup_pow_of_two(nr_elements * 3));
-realloc:
- n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL);
- if (!n) {
- struct bch_fs *c = container_of(b, struct bch_fs, buckets_waiting_for_journal);
- ret = bch_err_throw(c, ENOMEM_buckets_waiting_for_journal_set);
- goto out;
- }
-
-retry_rehash:
- if (nr_rehashes_this_size == 3) {
- new_bits++;
- nr_rehashes_this_size = 0;
- kvfree(n);
- goto realloc;
- }
-
- nr_rehashes++;
- nr_rehashes_this_size++;
-
- bucket_table_init(n, new_bits);
-
- tmp = new;
- BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq));
-
- for (i = 0; i < 1UL << t->bits; i++) {
- if (t->d[i].journal_seq <= flushed_seq)
- continue;
-
- tmp = t->d[i];
- if (!bucket_table_insert(n, &tmp, flushed_seq))
- goto retry_rehash;
- }
-
- b->t = n;
- kvfree(t);
-
- pr_debug("took %zu rehashes, table at %zu/%lu elements",
- nr_rehashes, nr_elements, 1UL << b->t->bits);
-out:
- mutex_unlock(&b->lock);
-
- return ret;
-}
-
-void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c)
-{
- struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
-
- kvfree(b->t);
-}
-
-#define INITIAL_TABLE_BITS 3
-
-int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c)
-{
- struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
-
- mutex_init(&b->lock);
-
- b->t = kvmalloc(sizeof(*b->t) +
- (sizeof(b->t->d[0]) << INITIAL_TABLE_BITS), GFP_KERNEL);
- if (!b->t)
- return -BCH_ERR_ENOMEM_buckets_waiting_for_journal_init;
-
- bucket_table_init(b->t, INITIAL_TABLE_BITS);
- return 0;
-}
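bucket_table_insert() above is a small cuckoo-style hash: each key has one candidate slot per hash seed, a stale occupant (journal_seq <= flushed_seq) can simply be overwritten, and otherwise an occupant is evicted and reinserted, giving up after a bounded number of kicks so the caller can grow and rehash the table. A minimal standalone sketch of that insert loop (simplified: fixed-size table, no sequence numbers, hypothetical hash helper):

#include <stdbool.h>
#include <stdint.h>

#define TABLE_BITS	6
#define NR_SEEDS	3

struct entry { uint64_t key; uint64_t val; };

static struct entry table[1 << TABLE_BITS];
static const uint64_t seeds[NR_SEEDS] = {
	0x9e3779b97f4a7c15ull, 0xc2b2ae3d27d4eb4full, 0x165667b19e3779f9ull,
};

static struct entry *slot(unsigned seed_idx, uint64_t key)
{
	/* cheap stand-in for hash_64() */
	uint64_t h = (key ^ seeds[seed_idx]) * 0x2545f4914f6cdd1dull;

	return &table[h >> (64 - TABLE_BITS)];
}

static bool cuckoo_insert(struct entry new)
{
	struct entry *last_evicted = NULL;

	for (int tries = 0; tries < 10; tries++) {
		struct entry *victim = NULL;

		for (int i = 0; i < NR_SEEDS; i++) {
			struct entry *e = slot(i, new.key);

			if (!e->key || e->key == new.key) {	/* empty or same key */
				*e = new;
				return true;
			}
			if (e != last_evicted)			/* don't bounce in place */
				victim = e;
		}
		if (!victim)					/* all seeds hit the same slot */
			return false;

		struct entry tmp = *victim;			/* evict and keep going */
		*victim = new;
		new = tmp;
		last_evicted = victim;
	}
	return false;						/* caller would grow/rehash */
}

int main(void)
{
	struct entry e = { .key = 42, .val = 7 };

	return cuckoo_insert(e) ? 0 : 1;
}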
diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h
deleted file mode 100644
index 365619ca44c8..000000000000
--- a/fs/bcachefs/buckets_waiting_for_journal.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H
-#define _BUCKETS_WAITING_FOR_JOURNAL_H
-
-#include "buckets_waiting_for_journal_types.h"
-
-u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *,
- unsigned, u64);
-int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
- u64, unsigned, u64, u64);
-
-void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *);
-int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *);
-
-#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */
diff --git a/fs/bcachefs/buckets_waiting_for_journal_types.h b/fs/bcachefs/buckets_waiting_for_journal_types.h
deleted file mode 100644
index e593db061d81..000000000000
--- a/fs/bcachefs/buckets_waiting_for_journal_types.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
-#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
-
-#include <linux/siphash.h>
-
-struct bucket_hashed {
- u64 dev_bucket;
- u64 journal_seq;
-};
-
-struct buckets_waiting_for_journal_table {
- unsigned bits;
- u64 hash_seeds[3];
- struct bucket_hashed d[];
-};
-
-struct buckets_waiting_for_journal {
- struct mutex lock;
- struct buckets_waiting_for_journal_table *t;
-};
-
-#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
deleted file mode 100644
index 5ea89aa2b0c4..000000000000
--- a/fs/bcachefs/chardev.c
+++ /dev/null
@@ -1,843 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_CHARDEV
-
-#include "bcachefs.h"
-#include "bcachefs_ioctl.h"
-#include "buckets.h"
-#include "chardev.h"
-#include "disk_accounting.h"
-#include "fsck.h"
-#include "journal.h"
-#include "move.h"
-#include "recovery_passes.h"
-#include "replicas.h"
-#include "sb-counters.h"
-#include "super-io.h"
-#include "thread_with_file.h"
-
-#include <linux/cdev.h>
-#include <linux/device.h>
-#include <linux/fs.h>
-#include <linux/ioctl.h>
-#include <linux/major.h>
-#include <linux/sched/task.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-
-/* returns with ref on ca->ref */
-static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
- unsigned flags)
-{
- struct bch_dev *ca;
-
- if (flags & BCH_BY_INDEX) {
- if (dev >= c->sb.nr_devices)
- return ERR_PTR(-EINVAL);
-
- ca = bch2_dev_tryget_noerror(c, dev);
- if (!ca)
- return ERR_PTR(-EINVAL);
- } else {
- char *path;
-
- path = strndup_user((const char __user *)
- (unsigned long) dev, PATH_MAX);
- if (IS_ERR(path))
- return ERR_CAST(path);
-
- ca = bch2_dev_lookup(c, path);
- kfree(path);
- }
-
- return ca;
-}
-
-#if 0
-static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
-{
- struct bch_ioctl_assemble arg;
- struct bch_fs *c;
- u64 *user_devs = NULL;
- char **devs = NULL;
- unsigned i;
- int ret = -EFAULT;
-
- if (copy_from_user(&arg, user_arg, sizeof(arg)))
- return -EFAULT;
-
- if (arg.flags || arg.pad)
- return -EINVAL;
-
- user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL);
- if (!user_devs)
- return -ENOMEM;
-
- devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL);
-
- if (copy_from_user(user_devs, user_arg->devs,
- sizeof(u64) * arg.nr_devs))
- goto err;
-
- for (i = 0; i < arg.nr_devs; i++) {
- devs[i] = strndup_user((const char __user *)(unsigned long)
- user_devs[i],
- PATH_MAX);
-		ret = PTR_ERR_OR_ZERO(devs[i]);
- if (ret)
- goto err;
- }
-
- c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty());
- ret = PTR_ERR_OR_ZERO(c);
- if (!ret)
- closure_put(&c->cl);
-err:
- if (devs)
- for (i = 0; i < arg.nr_devs; i++)
- kfree(devs[i]);
- kfree(devs);
- return ret;
-}
-
-static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg)
-{
- struct bch_ioctl_incremental arg;
- const char *err;
-	char *path;
-	int ret;
-
- if (copy_from_user(&arg, user_arg, sizeof(arg)))
- return -EFAULT;
-
- if (arg.flags || arg.pad)
- return -EINVAL;
-
- path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
- ret = PTR_ERR_OR_ZERO(path);
- if (ret)
- return ret;
-
- err = bch2_fs_open_incremental(path);
- kfree(path);
-
- if (err) {
- pr_err("Could not register bcachefs devices: %s", err);
- return -EINVAL;
- }
-
- return 0;
-}
-#endif
-
-static long bch2_global_ioctl(unsigned cmd, void __user *arg)
-{
- long ret;
-
- switch (cmd) {
-#if 0
- case BCH_IOCTL_ASSEMBLE:
- return bch2_ioctl_assemble(arg);
- case BCH_IOCTL_INCREMENTAL:
- return bch2_ioctl_incremental(arg);
-#endif
- case BCH_IOCTL_FSCK_OFFLINE: {
- ret = bch2_ioctl_fsck_offline(arg);
- break;
- }
- default:
- ret = -ENOTTY;
- break;
- }
-
- if (ret < 0)
- ret = bch2_err_class(ret);
- return ret;
-}
-
-static long bch2_ioctl_query_uuid(struct bch_fs *c,
- struct bch_ioctl_query_uuid __user *user_arg)
-{
- return copy_to_user_errcode(&user_arg->uuid, &c->sb.user_uuid,
- sizeof(c->sb.user_uuid));
-}
-
-#if 0
-static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg)
-{
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (arg.flags || arg.pad)
- return -EINVAL;
-
- return bch2_fs_start(c);
-}
-
-static long bch2_ioctl_stop(struct bch_fs *c)
-{
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- bch2_fs_stop(c);
- return 0;
-}
-#endif
-
-static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
-{
- char *path;
- int ret;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (arg.flags || arg.pad)
- return -EINVAL;
-
- path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
- ret = PTR_ERR_OR_ZERO(path);
- if (ret)
- return ret;
-
- ret = bch2_dev_add(c, path);
- if (!IS_ERR(path))
- kfree(path);
-
- return ret;
-}
-
-static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg)
-{
- struct bch_dev *ca;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
- BCH_FORCE_IF_METADATA_LOST|
- BCH_FORCE_IF_DEGRADED|
- BCH_BY_INDEX)) ||
- arg.pad)
- return -EINVAL;
-
- ca = bch2_device_lookup(c, arg.dev, arg.flags);
- if (IS_ERR(ca))
- return PTR_ERR(ca);
-
- return bch2_dev_remove(c, ca, arg.flags);
-}
-
-static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg)
-{
- char *path;
- int ret;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (arg.flags || arg.pad)
- return -EINVAL;
-
- path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
- ret = PTR_ERR_OR_ZERO(path);
- if (ret)
- return ret;
-
- ret = bch2_dev_online(c, path);
- kfree(path);
- return ret;
-}
-
-static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg)
-{
- struct bch_dev *ca;
- int ret;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
- BCH_FORCE_IF_METADATA_LOST|
- BCH_FORCE_IF_DEGRADED|
- BCH_BY_INDEX)) ||
- arg.pad)
- return -EINVAL;
-
- ca = bch2_device_lookup(c, arg.dev, arg.flags);
- if (IS_ERR(ca))
- return PTR_ERR(ca);
-
- ret = bch2_dev_offline(c, ca, arg.flags);
- bch2_dev_put(ca);
- return ret;
-}
-
-static long bch2_ioctl_disk_set_state(struct bch_fs *c,
- struct bch_ioctl_disk_set_state arg)
-{
- struct bch_dev *ca;
- int ret;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
- BCH_FORCE_IF_METADATA_LOST|
- BCH_FORCE_IF_DEGRADED|
- BCH_BY_INDEX)) ||
- arg.pad[0] || arg.pad[1] || arg.pad[2] ||
- arg.new_state >= BCH_MEMBER_STATE_NR)
- return -EINVAL;
-
- ca = bch2_device_lookup(c, arg.dev, arg.flags);
- if (IS_ERR(ca))
- return PTR_ERR(ca);
-
- ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags);
- if (ret)
- bch_err(c, "Error setting device state: %s", bch2_err_str(ret));
-
- bch2_dev_put(ca);
- return ret;
-}
-
-struct bch_data_ctx {
- struct thread_with_file thr;
-
- struct bch_fs *c;
- struct bch_ioctl_data arg;
- struct bch_move_stats stats;
-};
-
-static int bch2_data_thread(void *arg)
-{
- struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr);
-
- ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
- if (ctx->thr.ret == -BCH_ERR_device_offline)
- ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_device_offline;
- else {
- ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done;
- ctx->stats.data_type = (int) DATA_PROGRESS_DATA_TYPE_done;
- }
- enumerated_ref_put(&ctx->c->writes, BCH_WRITE_REF_ioctl_data);
- return 0;
-}
-
-static int bch2_data_job_release(struct inode *inode, struct file *file)
-{
- struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
-
- bch2_thread_with_file_exit(&ctx->thr);
- kfree(ctx);
- return 0;
-}
-
-static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
- size_t len, loff_t *ppos)
-{
- struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
- struct bch_fs *c = ctx->c;
- struct bch_ioctl_data_event e = {
- .type = BCH_DATA_EVENT_PROGRESS,
- .ret = ctx->stats.ret,
- .p.data_type = ctx->stats.data_type,
- .p.btree_id = ctx->stats.pos.btree,
- .p.pos = ctx->stats.pos.pos,
- .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
- .p.sectors_error_corrected = atomic64_read(&ctx->stats.sectors_error_corrected),
- .p.sectors_error_uncorrected = atomic64_read(&ctx->stats.sectors_error_uncorrected),
- };
-
- if (ctx->arg.op == BCH_DATA_OP_scrub) {
- struct bch_dev *ca = bch2_dev_tryget(c, ctx->arg.scrub.dev);
- if (ca) {
- struct bch_dev_usage_full u;
- bch2_dev_usage_full_read_fast(ca, &u);
- for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++)
- if (ctx->arg.scrub.data_types & BIT(i))
- e.p.sectors_total += u.d[i].sectors;
- bch2_dev_put(ca);
- }
- } else {
- e.p.sectors_total = bch2_fs_usage_read_short(c).used;
- }
-
- if (len < sizeof(e))
- return -EINVAL;
-
- return copy_to_user_errcode(buf, &e, sizeof(e)) ?: sizeof(e);
-}
-
-static const struct file_operations bcachefs_data_ops = {
- .release = bch2_data_job_release,
- .read = bch2_data_job_read,
-};
-
-static long bch2_ioctl_data(struct bch_fs *c,
- struct bch_ioctl_data arg)
-{
- struct bch_data_ctx *ctx;
- int ret;
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_ioctl_data))
- return -EROFS;
-
- if (!capable(CAP_SYS_ADMIN)) {
- ret = -EPERM;
- goto put_ref;
- }
-
- if (arg.op >= BCH_DATA_OP_NR || arg.flags) {
- ret = -EINVAL;
- goto put_ref;
- }
-
- ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
- if (!ctx) {
- ret = -ENOMEM;
- goto put_ref;
- }
-
- ctx->c = c;
- ctx->arg = arg;
-
- ret = bch2_run_thread_with_file(&ctx->thr,
- &bcachefs_data_ops,
- bch2_data_thread);
- if (ret < 0)
- goto cleanup;
- return ret;
-cleanup:
- kfree(ctx);
-put_ref:
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_ioctl_data);
- return ret;
-}
-
-static noinline_for_stack long bch2_ioctl_fs_usage(struct bch_fs *c,
- struct bch_ioctl_fs_usage __user *user_arg)
-{
- struct bch_ioctl_fs_usage arg = {};
- darray_char replicas = {};
- u32 replica_entries_bytes;
- int ret = 0;
-
- if (!test_bit(BCH_FS_started, &c->flags))
- return -EINVAL;
-
- if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes))
- return -EFAULT;
-
- ret = bch2_fs_replicas_usage_read(c, &replicas) ?:
- (replica_entries_bytes < replicas.nr ? -ERANGE : 0) ?:
- copy_to_user_errcode(&user_arg->replicas, replicas.data, replicas.nr);
- if (ret)
- goto err;
-
- struct bch_fs_usage_short u = bch2_fs_usage_read_short(c);
- arg.capacity = c->capacity;
- arg.used = u.used;
- arg.online_reserved = percpu_u64_get(c->online_reserved);
- arg.replica_entries_bytes = replicas.nr;
-
- for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) {
- struct disk_accounting_pos k;
- disk_accounting_key_init(k, persistent_reserved, .nr_replicas = i);
-
- bch2_accounting_mem_read(c,
- disk_accounting_pos_to_bpos(&k),
- &arg.persistent_reserved[i], 1);
- }
-
- ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg));
-err:
- darray_exit(&replicas);
- return ret;
-}
-
-static long bch2_ioctl_query_accounting(struct bch_fs *c,
- struct bch_ioctl_query_accounting __user *user_arg)
-{
- struct bch_ioctl_query_accounting arg;
- darray_char accounting = {};
- int ret = 0;
-
- if (!test_bit(BCH_FS_started, &c->flags))
- return -EINVAL;
-
- ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)) ?:
- bch2_fs_accounting_read(c, &accounting, arg.accounting_types_mask) ?:
- (arg.accounting_u64s * sizeof(u64) < accounting.nr ? -ERANGE : 0) ?:
- copy_to_user_errcode(&user_arg->accounting, accounting.data, accounting.nr);
- if (ret)
- goto err;
-
- arg.capacity = c->capacity;
- arg.used = bch2_fs_usage_read_short(c).used;
- arg.online_reserved = percpu_u64_get(c->online_reserved);
- arg.accounting_u64s = accounting.nr / sizeof(u64);
-
- ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg));
-err:
- darray_exit(&accounting);
- return ret;
-}
-
-/* obsolete, didn't allow for new data types: */
-static noinline_for_stack long bch2_ioctl_dev_usage(struct bch_fs *c,
- struct bch_ioctl_dev_usage __user *user_arg)
-{
- struct bch_ioctl_dev_usage arg;
- struct bch_dev_usage_full src;
- struct bch_dev *ca;
- unsigned i;
-
- if (!test_bit(BCH_FS_started, &c->flags))
- return -EINVAL;
-
- if (copy_from_user(&arg, user_arg, sizeof(arg)))
- return -EFAULT;
-
- if ((arg.flags & ~BCH_BY_INDEX) ||
- arg.pad[0] ||
- arg.pad[1] ||
- arg.pad[2])
- return -EINVAL;
-
- ca = bch2_device_lookup(c, arg.dev, arg.flags);
- if (IS_ERR(ca))
- return PTR_ERR(ca);
-
- src = bch2_dev_usage_full_read(ca);
-
- arg.state = ca->mi.state;
- arg.bucket_size = ca->mi.bucket_size;
- arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket;
-
- for (i = 0; i < ARRAY_SIZE(arg.d); i++) {
- arg.d[i].buckets = src.d[i].buckets;
- arg.d[i].sectors = src.d[i].sectors;
- arg.d[i].fragmented = src.d[i].fragmented;
- }
-
- bch2_dev_put(ca);
-
- return copy_to_user_errcode(user_arg, &arg, sizeof(arg));
-}
-
-static long bch2_ioctl_dev_usage_v2(struct bch_fs *c,
- struct bch_ioctl_dev_usage_v2 __user *user_arg)
-{
- struct bch_ioctl_dev_usage_v2 arg;
- struct bch_dev_usage_full src;
- struct bch_dev *ca;
- int ret = 0;
-
- if (!test_bit(BCH_FS_started, &c->flags))
- return -EINVAL;
-
- if (copy_from_user(&arg, user_arg, sizeof(arg)))
- return -EFAULT;
-
- if ((arg.flags & ~BCH_BY_INDEX) ||
- arg.pad[0] ||
- arg.pad[1] ||
- arg.pad[2])
- return -EINVAL;
-
- ca = bch2_device_lookup(c, arg.dev, arg.flags);
- if (IS_ERR(ca))
- return PTR_ERR(ca);
-
- src = bch2_dev_usage_full_read(ca);
-
- arg.state = ca->mi.state;
- arg.bucket_size = ca->mi.bucket_size;
- arg.nr_data_types = min(arg.nr_data_types, BCH_DATA_NR);
- arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket;
-
- ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg));
- if (ret)
- goto err;
-
- for (unsigned i = 0; i < arg.nr_data_types; i++) {
- struct bch_ioctl_dev_usage_type t = {
- .buckets = src.d[i].buckets,
- .sectors = src.d[i].sectors,
- .fragmented = src.d[i].fragmented,
- };
-
- ret = copy_to_user_errcode(&user_arg->d[i], &t, sizeof(t));
- if (ret)
- goto err;
- }
-err:
- bch2_dev_put(ca);
- return ret;
-}
-
-static long bch2_ioctl_read_super(struct bch_fs *c,
- struct bch_ioctl_read_super arg)
-{
- struct bch_dev *ca = NULL;
- struct bch_sb *sb;
- int ret = 0;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) ||
- arg.pad)
- return -EINVAL;
-
- mutex_lock(&c->sb_lock);
-
- if (arg.flags & BCH_READ_DEV) {
- ca = bch2_device_lookup(c, arg.dev, arg.flags);
- ret = PTR_ERR_OR_ZERO(ca);
- if (ret)
- goto err_unlock;
-
- sb = ca->disk_sb.sb;
- } else {
- sb = c->disk_sb.sb;
- }
-
- if (vstruct_bytes(sb) > arg.size) {
- ret = -ERANGE;
- goto err;
- }
-
- ret = copy_to_user_errcode((void __user *)(unsigned long)arg.sb, sb,
- vstruct_bytes(sb));
-err:
- bch2_dev_put(ca);
-err_unlock:
- mutex_unlock(&c->sb_lock);
- return ret;
-}
-
-static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
- struct bch_ioctl_disk_get_idx arg)
-{
- dev_t dev = huge_decode_dev(arg.dev);
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (!dev)
- return -EINVAL;
-
- guard(rcu)();
- for_each_online_member_rcu(c, ca)
- if (ca->dev == dev)
- return ca->dev_idx;
-
- return bch_err_throw(c, ENOENT_dev_idx_not_found);
-}
-
-static long bch2_ioctl_disk_resize(struct bch_fs *c,
- struct bch_ioctl_disk_resize arg)
-{
- struct bch_dev *ca;
- int ret;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if ((arg.flags & ~BCH_BY_INDEX) ||
- arg.pad)
- return -EINVAL;
-
- ca = bch2_device_lookup(c, arg.dev, arg.flags);
- if (IS_ERR(ca))
- return PTR_ERR(ca);
-
- ret = bch2_dev_resize(c, ca, arg.nbuckets);
-
- bch2_dev_put(ca);
- return ret;
-}
-
-static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
- struct bch_ioctl_disk_resize_journal arg)
-{
- struct bch_dev *ca;
- int ret;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if ((arg.flags & ~BCH_BY_INDEX) ||
- arg.pad)
- return -EINVAL;
-
- if (arg.nbuckets > U32_MAX)
- return -EINVAL;
-
- ca = bch2_device_lookup(c, arg.dev, arg.flags);
- if (IS_ERR(ca))
- return PTR_ERR(ca);
-
- ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets);
-
- bch2_dev_put(ca);
- return ret;
-}
-
-#define BCH_IOCTL(_name, _argtype) \
-do { \
- _argtype i; \
- \
- if (copy_from_user(&i, arg, sizeof(i))) \
- return -EFAULT; \
- ret = bch2_ioctl_##_name(c, i); \
- goto out; \
-} while (0)
-
-long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
-{
- long ret;
-
- switch (cmd) {
- case BCH_IOCTL_QUERY_UUID:
- return bch2_ioctl_query_uuid(c, arg);
- case BCH_IOCTL_FS_USAGE:
- return bch2_ioctl_fs_usage(c, arg);
- case BCH_IOCTL_DEV_USAGE:
- return bch2_ioctl_dev_usage(c, arg);
- case BCH_IOCTL_DEV_USAGE_V2:
- return bch2_ioctl_dev_usage_v2(c, arg);
-#if 0
- case BCH_IOCTL_START:
- BCH_IOCTL(start, struct bch_ioctl_start);
- case BCH_IOCTL_STOP:
- return bch2_ioctl_stop(c);
-#endif
- case BCH_IOCTL_READ_SUPER:
- BCH_IOCTL(read_super, struct bch_ioctl_read_super);
- case BCH_IOCTL_DISK_GET_IDX:
- BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx);
- }
-
- if (!test_bit(BCH_FS_started, &c->flags))
- return -EINVAL;
-
- switch (cmd) {
- case BCH_IOCTL_DISK_ADD:
- BCH_IOCTL(disk_add, struct bch_ioctl_disk);
- case BCH_IOCTL_DISK_REMOVE:
- BCH_IOCTL(disk_remove, struct bch_ioctl_disk);
- case BCH_IOCTL_DISK_ONLINE:
- BCH_IOCTL(disk_online, struct bch_ioctl_disk);
- case BCH_IOCTL_DISK_OFFLINE:
- BCH_IOCTL(disk_offline, struct bch_ioctl_disk);
- case BCH_IOCTL_DISK_SET_STATE:
- BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state);
- case BCH_IOCTL_DATA:
- BCH_IOCTL(data, struct bch_ioctl_data);
- case BCH_IOCTL_DISK_RESIZE:
- BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);
- case BCH_IOCTL_DISK_RESIZE_JOURNAL:
- BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal);
- case BCH_IOCTL_FSCK_ONLINE:
- BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online);
- case BCH_IOCTL_QUERY_ACCOUNTING:
- return bch2_ioctl_query_accounting(c, arg);
- case BCH_IOCTL_QUERY_COUNTERS:
- return bch2_ioctl_query_counters(c, arg);
- default:
- return -ENOTTY;
- }
-out:
- if (ret < 0)
- ret = bch2_err_class(ret);
- return ret;
-}
-
-static DEFINE_IDR(bch_chardev_minor);
-
-static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v)
-{
- unsigned minor = iminor(file_inode(filp));
- struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL;
- void __user *arg = (void __user *) v;
-
- return c
- ? bch2_fs_ioctl(c, cmd, arg)
- : bch2_global_ioctl(cmd, arg);
-}
-
-static const struct file_operations bch_chardev_fops = {
- .owner = THIS_MODULE,
- .unlocked_ioctl = bch2_chardev_ioctl,
- .open = nonseekable_open,
-};
-
-static int bch_chardev_major;
-static const struct class bch_chardev_class = {
- .name = "bcachefs",
-};
-static struct device *bch_chardev;
-
-void bch2_fs_chardev_exit(struct bch_fs *c)
-{
- if (!IS_ERR_OR_NULL(c->chardev))
- device_unregister(c->chardev);
- if (c->minor >= 0)
- idr_remove(&bch_chardev_minor, c->minor);
-}
-
-int bch2_fs_chardev_init(struct bch_fs *c)
-{
- c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL);
- if (c->minor < 0)
- return c->minor;
-
- c->chardev = device_create(&bch_chardev_class, NULL,
- MKDEV(bch_chardev_major, c->minor), c,
- "bcachefs%u-ctl", c->minor);
- if (IS_ERR(c->chardev))
- return PTR_ERR(c->chardev);
-
- return 0;
-}
-
-void bch2_chardev_exit(void)
-{
- device_destroy(&bch_chardev_class, MKDEV(bch_chardev_major, U8_MAX));
- class_unregister(&bch_chardev_class);
- if (bch_chardev_major > 0)
- unregister_chrdev(bch_chardev_major, "bcachefs");
-}
-
-int __init bch2_chardev_init(void)
-{
- int ret;
-
- bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops);
- if (bch_chardev_major < 0)
- return bch_chardev_major;
-
- ret = class_register(&bch_chardev_class);
- if (ret)
- goto major_out;
-
- bch_chardev = device_create(&bch_chardev_class, NULL,
- MKDEV(bch_chardev_major, U8_MAX),
- NULL, "bcachefs-ctl");
- if (IS_ERR(bch_chardev)) {
- ret = PTR_ERR(bch_chardev);
- goto class_out;
- }
-
- return 0;
-
-class_out:
- class_unregister(&bch_chardev_class);
-major_out:
- unregister_chrdev(bch_chardev_major, "bcachefs-ctl");
- return ret;
-}
-
-#endif /* NO_BCACHEFS_CHARDEV */
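For context, the control node registered above is driven from userspace with ordinary ioctl() calls. A hedged sketch follows; the include path and the /dev node name are assumptions (the node name is inferred from the "bcachefs%u-ctl" format string passed to device_create() above):

/* Userspace sketch: query the filesystem UUID through the per-fs control node.
 * Assumptions: bcachefs_ioctl.h (shipped with the kernel / bcachefs-tools
 * sources) is on the include path, and udev created /dev/bcachefs0-ctl. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include "bcachefs_ioctl.h"	/* BCH_IOCTL_QUERY_UUID, struct bch_ioctl_query_uuid */

int main(void)
{
	struct bch_ioctl_query_uuid u;
	int fd = open("/dev/bcachefs0-ctl", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, BCH_IOCTL_QUERY_UUID, &u) < 0) {
		perror("BCH_IOCTL_QUERY_UUID");
		close(fd);
		return 1;
	}

	const unsigned char *p = (const unsigned char *) &u.uuid;
	for (int i = 0; i < 16; i++)
		printf("%02x", p[i]);
	printf("\n");

	close(fd);
	return 0;
}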
diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h
deleted file mode 100644
index 0f563ca53c36..000000000000
--- a/fs/bcachefs/chardev.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_CHARDEV_H
-#define _BCACHEFS_CHARDEV_H
-
-#ifndef NO_BCACHEFS_FS
-
-long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *);
-
-void bch2_fs_chardev_exit(struct bch_fs *);
-int bch2_fs_chardev_init(struct bch_fs *);
-
-void bch2_chardev_exit(void);
-int __init bch2_chardev_init(void);
-
-#else
-
-static inline long bch2_fs_ioctl(struct bch_fs *c,
- unsigned cmd, void __user * arg)
-{
- return -ENOTTY;
-}
-
-static inline void bch2_fs_chardev_exit(struct bch_fs *c) {}
-static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; }
-
-static inline void bch2_chardev_exit(void) {}
-static inline int __init bch2_chardev_init(void) { return 0; }
-
-#endif /* NO_BCACHEFS_FS */
-
-#endif /* _BCACHEFS_CHARDEV_H */
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
deleted file mode 100644
index a6795e73f0b9..000000000000
--- a/fs/bcachefs/checksum.c
+++ /dev/null
@@ -1,698 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "checksum.h"
-#include "errcode.h"
-#include "error.h"
-#include "super.h"
-#include "super-io.h"
-
-#include <linux/crc32c.h>
-#include <linux/xxhash.h>
-#include <linux/key.h>
-#include <linux/random.h>
-#include <linux/ratelimit.h>
-#include <crypto/chacha.h>
-#include <crypto/poly1305.h>
-#include <keys/user-type.h>
-
-/*
- * bch2_checksum_state is an abstraction of the checksum state calculated over different pages.
- * It allows the data to be fed in page-sized chunks without the checksum algorithm losing its state.
- * For native checksum algorithms (like CRC), a default seed value will do;
- * for hash-like algorithms, the full state needs to be stored.
- */
-
-struct bch2_checksum_state {
- union {
- u64 seed;
- struct xxh64_state h64state;
- };
- unsigned int type;
-};
-
-static void bch2_checksum_init(struct bch2_checksum_state *state)
-{
- switch (state->type) {
- case BCH_CSUM_none:
- case BCH_CSUM_crc32c:
- case BCH_CSUM_crc64:
- state->seed = 0;
- break;
- case BCH_CSUM_crc32c_nonzero:
- state->seed = U32_MAX;
- break;
- case BCH_CSUM_crc64_nonzero:
- state->seed = U64_MAX;
- break;
- case BCH_CSUM_xxhash:
- xxh64_reset(&state->h64state, 0);
- break;
- default:
- BUG();
- }
-}
-
-static u64 bch2_checksum_final(const struct bch2_checksum_state *state)
-{
- switch (state->type) {
- case BCH_CSUM_none:
- case BCH_CSUM_crc32c:
- case BCH_CSUM_crc64:
- return state->seed;
- case BCH_CSUM_crc32c_nonzero:
- return state->seed ^ U32_MAX;
- case BCH_CSUM_crc64_nonzero:
- return state->seed ^ U64_MAX;
- case BCH_CSUM_xxhash:
- return xxh64_digest(&state->h64state);
- default:
- BUG();
- }
-}
-
-static void bch2_checksum_update(struct bch2_checksum_state *state, const void *data, size_t len)
-{
- switch (state->type) {
- case BCH_CSUM_none:
- return;
- case BCH_CSUM_crc32c_nonzero:
- case BCH_CSUM_crc32c:
- state->seed = crc32c(state->seed, data, len);
- break;
- case BCH_CSUM_crc64_nonzero:
- case BCH_CSUM_crc64:
- state->seed = crc64_be(state->seed, data, len);
- break;
- case BCH_CSUM_xxhash:
- xxh64_update(&state->h64state, data, len);
- break;
- default:
- BUG();
- }
-}
-
-static void bch2_chacha20_init(struct chacha_state *state,
- const struct bch_key *key, struct nonce nonce)
-{
- u32 key_words[CHACHA_KEY_SIZE / sizeof(u32)];
-
- BUILD_BUG_ON(sizeof(key_words) != sizeof(*key));
- memcpy(key_words, key, sizeof(key_words));
- le32_to_cpu_array(key_words, ARRAY_SIZE(key_words));
-
- BUILD_BUG_ON(sizeof(nonce) != CHACHA_IV_SIZE);
- chacha_init(state, key_words, (const u8 *)nonce.d);
-
- memzero_explicit(key_words, sizeof(key_words));
-}
-
-void bch2_chacha20(const struct bch_key *key, struct nonce nonce,
- void *data, size_t len)
-{
- struct chacha_state state;
-
- bch2_chacha20_init(&state, key, nonce);
- chacha20_crypt(&state, data, data, len);
- chacha_zeroize_state(&state);
-}
-
-static void bch2_poly1305_init(struct poly1305_desc_ctx *desc,
- struct bch_fs *c, struct nonce nonce)
-{
- u8 key[POLY1305_KEY_SIZE] = { 0 };
-
- nonce.d[3] ^= BCH_NONCE_POLY;
-
- bch2_chacha20(&c->chacha20_key, nonce, key, sizeof(key));
- poly1305_init(desc, key);
-}
-
-struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
- struct nonce nonce, const void *data, size_t len)
-{
- switch (type) {
- case BCH_CSUM_none:
- case BCH_CSUM_crc32c_nonzero:
- case BCH_CSUM_crc64_nonzero:
- case BCH_CSUM_crc32c:
- case BCH_CSUM_xxhash:
- case BCH_CSUM_crc64: {
- struct bch2_checksum_state state;
-
- state.type = type;
-
- bch2_checksum_init(&state);
- bch2_checksum_update(&state, data, len);
-
- return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) };
- }
-
- case BCH_CSUM_chacha20_poly1305_80:
- case BCH_CSUM_chacha20_poly1305_128: {
- struct poly1305_desc_ctx dctx;
- u8 digest[POLY1305_DIGEST_SIZE];
- struct bch_csum ret = { 0 };
-
- bch2_poly1305_init(&dctx, c, nonce);
- poly1305_update(&dctx, data, len);
- poly1305_final(&dctx, digest);
-
- memcpy(&ret, digest, bch_crc_bytes[type]);
- return ret;
- }
- default:
- return (struct bch_csum) {};
- }
-}
-
-int bch2_encrypt(struct bch_fs *c, unsigned type,
- struct nonce nonce, void *data, size_t len)
-{
- if (!bch2_csum_type_is_encryption(type))
- return 0;
-
- if (bch2_fs_inconsistent_on(!c->chacha20_key_set,
- c, "attempting to encrypt without encryption key"))
- return bch_err_throw(c, no_encryption_key);
-
- bch2_chacha20(&c->chacha20_key, nonce, data, len);
- return 0;
-}
-
-static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
- struct nonce nonce, struct bio *bio,
- struct bvec_iter *iter)
-{
- struct bio_vec bv;
-
- switch (type) {
- case BCH_CSUM_none:
- return (struct bch_csum) { 0 };
- case BCH_CSUM_crc32c_nonzero:
- case BCH_CSUM_crc64_nonzero:
- case BCH_CSUM_crc32c:
- case BCH_CSUM_xxhash:
- case BCH_CSUM_crc64: {
- struct bch2_checksum_state state;
-
- state.type = type;
- bch2_checksum_init(&state);
-
-#ifdef CONFIG_HIGHMEM
- __bio_for_each_segment(bv, bio, *iter, *iter) {
- void *p = kmap_local_page(bv.bv_page) + bv.bv_offset;
-
- bch2_checksum_update(&state, p, bv.bv_len);
- kunmap_local(p);
- }
-#else
- __bio_for_each_bvec(bv, bio, *iter, *iter)
- bch2_checksum_update(&state, page_address(bv.bv_page) + bv.bv_offset,
- bv.bv_len);
-#endif
- return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) };
- }
-
- case BCH_CSUM_chacha20_poly1305_80:
- case BCH_CSUM_chacha20_poly1305_128: {
- struct poly1305_desc_ctx dctx;
- u8 digest[POLY1305_DIGEST_SIZE];
- struct bch_csum ret = { 0 };
-
- bch2_poly1305_init(&dctx, c, nonce);
-
-#ifdef CONFIG_HIGHMEM
- __bio_for_each_segment(bv, bio, *iter, *iter) {
- void *p = kmap_local_page(bv.bv_page) + bv.bv_offset;
-
- poly1305_update(&dctx, p, bv.bv_len);
- kunmap_local(p);
- }
-#else
- __bio_for_each_bvec(bv, bio, *iter, *iter)
- poly1305_update(&dctx,
- page_address(bv.bv_page) + bv.bv_offset,
- bv.bv_len);
-#endif
- poly1305_final(&dctx, digest);
-
- memcpy(&ret, digest, bch_crc_bytes[type]);
- return ret;
- }
- default:
- return (struct bch_csum) {};
- }
-}
-
-struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
- struct nonce nonce, struct bio *bio)
-{
- struct bvec_iter iter = bio->bi_iter;
-
- return __bch2_checksum_bio(c, type, nonce, bio, &iter);
-}
-
-int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
- struct nonce nonce, struct bio *bio)
-{
- struct bio_vec bv;
- struct bvec_iter iter;
- struct chacha_state chacha_state;
- int ret = 0;
-
- if (bch2_fs_inconsistent_on(!c->chacha20_key_set,
- c, "attempting to encrypt without encryption key"))
- return bch_err_throw(c, no_encryption_key);
-
- bch2_chacha20_init(&chacha_state, &c->chacha20_key, nonce);
-
- bio_for_each_segment(bv, bio, iter) {
- void *p;
-
- /*
- * chacha_crypt() assumes that the length is a multiple of
- * CHACHA_BLOCK_SIZE on any non-final call.
- */
- if (!IS_ALIGNED(bv.bv_len, CHACHA_BLOCK_SIZE)) {
- bch_err_ratelimited(c, "bio not aligned for encryption");
- ret = -EIO;
- break;
- }
-
- p = bvec_kmap_local(&bv);
- chacha20_crypt(&chacha_state, p, p, bv.bv_len);
- kunmap_local(p);
- }
- chacha_zeroize_state(&chacha_state);
- return ret;
-}
-
-struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a,
- struct bch_csum b, size_t b_len)
-{
- struct bch2_checksum_state state;
-
- state.type = type;
- bch2_checksum_init(&state);
- state.seed = le64_to_cpu(a.lo);
-
- BUG_ON(!bch2_checksum_mergeable(type));
-
- while (b_len) {
- unsigned page_len = min_t(unsigned, b_len, PAGE_SIZE);
-
- bch2_checksum_update(&state,
- page_address(ZERO_PAGE(0)), page_len);
- b_len -= page_len;
- }
- a.lo = cpu_to_le64(bch2_checksum_final(&state));
- a.lo ^= b.lo;
- a.hi ^= b.hi;
- return a;
-}
-
-int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
- struct bversion version,
- struct bch_extent_crc_unpacked crc_old,
- struct bch_extent_crc_unpacked *crc_a,
- struct bch_extent_crc_unpacked *crc_b,
- unsigned len_a, unsigned len_b,
- unsigned new_csum_type)
-{
- struct bvec_iter iter = bio->bi_iter;
- struct nonce nonce = extent_nonce(version, crc_old);
- struct bch_csum merged = { 0 };
- struct crc_split {
- struct bch_extent_crc_unpacked *crc;
- unsigned len;
- unsigned csum_type;
- struct bch_csum csum;
- } splits[3] = {
- { crc_a, len_a, new_csum_type, { 0 }},
- { crc_b, len_b, new_csum_type, { 0 } },
- { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type, { 0 } },
- }, *i;
- bool mergeable = crc_old.csum_type == new_csum_type &&
- bch2_checksum_mergeable(new_csum_type);
- unsigned crc_nonce = crc_old.nonce;
-
- BUG_ON(len_a + len_b > bio_sectors(bio));
- BUG_ON(crc_old.uncompressed_size != bio_sectors(bio));
- BUG_ON(crc_is_compressed(crc_old));
- BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) !=
- bch2_csum_type_is_encryption(new_csum_type));
-
- for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
- iter.bi_size = i->len << 9;
- if (mergeable || i->crc)
- i->csum = __bch2_checksum_bio(c, i->csum_type,
- nonce, bio, &iter);
- else
- bio_advance_iter(bio, &iter, i->len << 9);
- nonce = nonce_add(nonce, i->len << 9);
- }
-
- if (mergeable)
- for (i = splits; i < splits + ARRAY_SIZE(splits); i++)
- merged = bch2_checksum_merge(new_csum_type, merged,
- i->csum, i->len << 9);
- else
- merged = bch2_checksum_bio(c, crc_old.csum_type,
- extent_nonce(version, crc_old), bio);
-
- if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) {
- struct printbuf buf = PRINTBUF;
- prt_printf(&buf, "checksum error in %s() (memory corruption or bug?)\n"
- " expected %0llx:%0llx got %0llx:%0llx (old type ",
- __func__,
- crc_old.csum.hi,
- crc_old.csum.lo,
- merged.hi,
- merged.lo);
- bch2_prt_csum_type(&buf, crc_old.csum_type);
- prt_str(&buf, " new type ");
- bch2_prt_csum_type(&buf, new_csum_type);
- prt_str(&buf, ")");
- WARN_RATELIMIT(1, "%s", buf.buf);
- printbuf_exit(&buf);
- return bch_err_throw(c, recompute_checksum);
- }
-
- for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
- if (i->crc)
- *i->crc = (struct bch_extent_crc_unpacked) {
- .csum_type = i->csum_type,
- .compression_type = crc_old.compression_type,
- .compressed_size = i->len,
- .uncompressed_size = i->len,
- .offset = 0,
- .live_size = i->len,
- .nonce = crc_nonce,
- .csum = i->csum,
- };
-
- if (bch2_csum_type_is_encryption(new_csum_type))
- crc_nonce += i->len;
- }
-
- return 0;
-}
-
-/* BCH_SB_FIELD_crypt: */
-
-static int bch2_sb_crypt_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
-
- if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) {
- prt_printf(err, "wrong size (got %zu should be %zu)",
- vstruct_bytes(&crypt->field), sizeof(*crypt));
- return -BCH_ERR_invalid_sb_crypt;
- }
-
- if (BCH_CRYPT_KDF_TYPE(crypt)) {
- prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt));
- return -BCH_ERR_invalid_sb_crypt;
- }
-
- return 0;
-}
-
-static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
-
-	prt_printf(out, "KDF: %llu\n", BCH_CRYPT_KDF_TYPE(crypt));
- prt_printf(out, "scrypt n: %llu\n", BCH_KDF_SCRYPT_N(crypt));
- prt_printf(out, "scrypt r: %llu\n", BCH_KDF_SCRYPT_R(crypt));
- prt_printf(out, "scrypt p: %llu\n", BCH_KDF_SCRYPT_P(crypt));
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
- .validate = bch2_sb_crypt_validate,
- .to_text = bch2_sb_crypt_to_text,
-};
-
-#ifdef __KERNEL__
-static int __bch2_request_key(char *key_description, struct bch_key *key)
-{
- struct key *keyring_key;
- const struct user_key_payload *ukp;
- int ret;
-
- keyring_key = request_key(&key_type_user, key_description, NULL);
- if (IS_ERR(keyring_key))
- return PTR_ERR(keyring_key);
-
- down_read(&keyring_key->sem);
- ukp = dereference_key_locked(keyring_key);
- if (ukp->datalen == sizeof(*key)) {
- memcpy(key, ukp->data, ukp->datalen);
- ret = 0;
- } else {
- ret = -EINVAL;
- }
- up_read(&keyring_key->sem);
- key_put(keyring_key);
-
- return ret;
-}
-#else
-#include <keyutils.h>
-
-static int __bch2_request_key(char *key_description, struct bch_key *key)
-{
- key_serial_t key_id;
-
- key_id = request_key("user", key_description, NULL,
- KEY_SPEC_SESSION_KEYRING);
- if (key_id >= 0)
- goto got_key;
-
- key_id = request_key("user", key_description, NULL,
- KEY_SPEC_USER_KEYRING);
- if (key_id >= 0)
- goto got_key;
-
- key_id = request_key("user", key_description, NULL,
- KEY_SPEC_USER_SESSION_KEYRING);
- if (key_id >= 0)
- goto got_key;
-
- return -errno;
-got_key:
-
- if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key))
- return -1;
-
- return 0;
-}
-
-#include "crypto.h"
-#endif
-
-int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
-{
- struct printbuf key_description = PRINTBUF;
- int ret;
-
- prt_printf(&key_description, "bcachefs:");
- pr_uuid(&key_description, sb->user_uuid.b);
-
- ret = __bch2_request_key(key_description.buf, key);
- printbuf_exit(&key_description);
-
-#ifndef __KERNEL__
- if (ret) {
- char *passphrase = read_passphrase("Enter passphrase: ");
- struct bch_encrypted_key sb_key;
-
- bch2_passphrase_check(sb, passphrase,
- key, &sb_key);
- ret = 0;
- }
-#endif
-
- /* stash with memfd, pass memfd fd to mount */
-
- return ret;
-}
-
-#ifndef __KERNEL__
-int bch2_revoke_key(struct bch_sb *sb)
-{
- key_serial_t key_id;
- struct printbuf key_description = PRINTBUF;
-
- prt_printf(&key_description, "bcachefs:");
- pr_uuid(&key_description, sb->user_uuid.b);
-
- key_id = request_key("user", key_description.buf, NULL, KEY_SPEC_USER_KEYRING);
- printbuf_exit(&key_description);
- if (key_id < 0)
- return errno;
-
- keyctl_revoke(key_id);
-
- return 0;
-}
-#endif
-
-int bch2_decrypt_sb_key(struct bch_fs *c,
- struct bch_sb_field_crypt *crypt,
- struct bch_key *key)
-{
- struct bch_encrypted_key sb_key = crypt->key;
- struct bch_key user_key;
- int ret = 0;
-
- /* is key encrypted? */
- if (!bch2_key_is_encrypted(&sb_key))
- goto out;
-
- ret = bch2_request_key(c->disk_sb.sb, &user_key);
- if (ret) {
- bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret));
- goto err;
- }
-
- /* decrypt real key: */
- bch2_chacha20(&user_key, bch2_sb_key_nonce(c), &sb_key, sizeof(sb_key));
-
- if (bch2_key_is_encrypted(&sb_key)) {
- bch_err(c, "incorrect encryption key");
- ret = -EINVAL;
- goto err;
- }
-out:
- *key = sb_key.key;
-err:
- memzero_explicit(&sb_key, sizeof(sb_key));
- memzero_explicit(&user_key, sizeof(user_key));
- return ret;
-}
-
-#if 0
-
-/*
- * This seems to be duplicating code in cmd_remove_passphrase() in
- * bcachefs-tools, but we might want to switch userspace to use this - and
- * perhaps add an ioctl for calling this at runtime, so we can take the
- * passphrase off of a mounted filesystem (which has come up).
- */
-int bch2_disable_encryption(struct bch_fs *c)
-{
- struct bch_sb_field_crypt *crypt;
- struct bch_key key;
- int ret = -EINVAL;
-
- mutex_lock(&c->sb_lock);
-
- crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
- if (!crypt)
- goto out;
-
- /* is key encrypted? */
- ret = 0;
- if (bch2_key_is_encrypted(&crypt->key))
- goto out;
-
- ret = bch2_decrypt_sb_key(c, crypt, &key);
- if (ret)
- goto out;
-
- crypt->key.magic = cpu_to_le64(BCH_KEY_MAGIC);
- crypt->key.key = key;
-
- SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0);
- bch2_write_super(c);
-out:
- mutex_unlock(&c->sb_lock);
-
- return ret;
-}
-
-/*
- * For enabling encryption on an existing filesystem: not hooked up yet, but it
- * should be
- */
-int bch2_enable_encryption(struct bch_fs *c, bool keyed)
-{
- struct bch_encrypted_key key;
- struct bch_key user_key;
- struct bch_sb_field_crypt *crypt;
- int ret = -EINVAL;
-
- mutex_lock(&c->sb_lock);
-
- /* Do we already have an encryption key? */
- if (bch2_sb_field_get(c->disk_sb.sb, crypt))
- goto err;
-
- ret = bch2_alloc_ciphers(c);
- if (ret)
- goto err;
-
- key.magic = cpu_to_le64(BCH_KEY_MAGIC);
- get_random_bytes(&key.key, sizeof(key.key));
-
- if (keyed) {
- ret = bch2_request_key(c->disk_sb.sb, &user_key);
- if (ret) {
- bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret));
- goto err;
- }
-
- ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
- &key, sizeof(key));
- if (ret)
- goto err;
- }
-
- ret = crypto_skcipher_setkey(&c->chacha20->base,
- (void *) &key.key, sizeof(key.key));
- if (ret)
- goto err;
-
- crypt = bch2_sb_field_resize(&c->disk_sb, crypt,
- sizeof(*crypt) / sizeof(u64));
- if (!crypt) {
- ret = bch_err_throw(c, ENOSPC_sb_crypt);
- goto err;
- }
-
- crypt->key = key;
-
- /* write superblock */
- SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1);
- bch2_write_super(c);
-err:
- mutex_unlock(&c->sb_lock);
- memzero_explicit(&user_key, sizeof(user_key));
- memzero_explicit(&key, sizeof(key));
- return ret;
-}
-#endif
-
-void bch2_fs_encryption_exit(struct bch_fs *c)
-{
- memzero_explicit(&c->chacha20_key, sizeof(c->chacha20_key));
-}
-
-int bch2_fs_encryption_init(struct bch_fs *c)
-{
- struct bch_sb_field_crypt *crypt;
- int ret;
-
- crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
- if (!crypt)
- return 0;
-
- ret = bch2_decrypt_sb_key(c, crypt, &c->chacha20_key);
- if (ret)
- return ret;
- c->chacha20_key_set = true;
- return 0;
-}
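The comment at the top of this file is about carrying checksum state across separately mapped pages: for CRC-style types the running value itself is the state, so feeding the data in chunks gives the same result as one pass. A standalone illustration of that seed-continuation property, using zlib's streaming crc32 rather than the kernel's crc32c (analogous API, userspace only):

/* Build with: cc crc_chunks.c -lz */
#include <assert.h>
#include <stdio.h>
#include <zlib.h>

int main(void)
{
	const unsigned char buf[] = "checksum state carried across chunks";
	size_t split = 10;

	/* one pass over the whole buffer */
	uLong whole = crc32(0L, buf, sizeof(buf));

	/* two passes: the intermediate CRC is the only state we keep */
	uLong state = crc32(0L, buf, split);
	state = crc32(state, buf + split, sizeof(buf) - split);

	assert(state == whole);
	printf("crc32 = %08lx (chunked == single pass)\n", state);
	return 0;
}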
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
deleted file mode 100644
index 7bd9cf6104ca..000000000000
--- a/fs/bcachefs/checksum.h
+++ /dev/null
@@ -1,240 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_CHECKSUM_H
-#define _BCACHEFS_CHECKSUM_H
-
-#include "bcachefs.h"
-#include "extents_types.h"
-#include "super-io.h"
-
-#include <linux/crc64.h>
-#include <crypto/chacha.h>
-
-static inline bool bch2_checksum_mergeable(unsigned type)
-{
- switch (type) {
- case BCH_CSUM_none:
- case BCH_CSUM_crc32c:
- case BCH_CSUM_crc64:
- return true;
- default:
- return false;
- }
-}
-
-struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum,
- struct bch_csum, size_t);
-
-#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28)
-#define BCH_NONCE_BTREE cpu_to_le32(2 << 28)
-#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28)
-#define BCH_NONCE_PRIO cpu_to_le32(4 << 28)
-#define BCH_NONCE_POLY cpu_to_le32(1 << 31)
-
-struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce,
- const void *, size_t);
-
-/*
- * This is used for various on disk data structures - bch_sb, prio_set, bset,
- * jset: The checksum is _always_ the first field of these structs
- */
-#define csum_vstruct(_c, _type, _nonce, _i) \
-({ \
- const void *_start = ((const void *) (_i)) + sizeof((_i)->csum);\
- \
- bch2_checksum(_c, _type, _nonce, _start, vstruct_end(_i) - _start);\
-})
-
-static inline void bch2_csum_to_text(struct printbuf *out,
- enum bch_csum_type type,
- struct bch_csum csum)
-{
- const u8 *p = (u8 *) &csum;
- unsigned bytes = type < BCH_CSUM_NR ? bch_crc_bytes[type] : 16;
-
- for (unsigned i = 0; i < bytes; i++)
- prt_hex_byte(out, p[i]);
-}
-
-static inline void bch2_csum_err_msg(struct printbuf *out,
- enum bch_csum_type type,
- struct bch_csum expected,
- struct bch_csum got)
-{
- prt_str(out, "checksum error, type ");
- bch2_prt_csum_type(out, type);
- prt_str(out, ": got ");
- bch2_csum_to_text(out, type, got);
- prt_str(out, " should be ");
- bch2_csum_to_text(out, type, expected);
-}
-
-void bch2_chacha20(const struct bch_key *, struct nonce, void *, size_t);
-
-int bch2_request_key(struct bch_sb *, struct bch_key *);
-#ifndef __KERNEL__
-int bch2_revoke_key(struct bch_sb *);
-#endif
-
-int bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
- void *data, size_t);
-
-struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned,
- struct nonce, struct bio *);
-
-int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion,
- struct bch_extent_crc_unpacked,
- struct bch_extent_crc_unpacked *,
- struct bch_extent_crc_unpacked *,
- unsigned, unsigned, unsigned);
-
-int __bch2_encrypt_bio(struct bch_fs *, unsigned,
- struct nonce, struct bio *);
-
-static inline int bch2_encrypt_bio(struct bch_fs *c, unsigned type,
- struct nonce nonce, struct bio *bio)
-{
- return bch2_csum_type_is_encryption(type)
- ? __bch2_encrypt_bio(c, type, nonce, bio)
- : 0;
-}
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_crypt;
-
-int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
- struct bch_key *);
-
-#if 0
-int bch2_disable_encryption(struct bch_fs *);
-int bch2_enable_encryption(struct bch_fs *, bool);
-#endif
-
-void bch2_fs_encryption_exit(struct bch_fs *);
-int bch2_fs_encryption_init(struct bch_fs *);
-
-static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opt type,
- bool data)
-{
- switch (type) {
- case BCH_CSUM_OPT_none:
- return BCH_CSUM_none;
- case BCH_CSUM_OPT_crc32c:
- return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero;
- case BCH_CSUM_OPT_crc64:
- return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero;
- case BCH_CSUM_OPT_xxhash:
- return BCH_CSUM_xxhash;
- default:
- BUG();
- }
-}
-
-static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
- struct bch_io_opts opts)
-{
- if (opts.nocow)
- return 0;
-
- if (c->sb.encryption_type)
- return c->opts.wide_macs
- ? BCH_CSUM_chacha20_poly1305_128
- : BCH_CSUM_chacha20_poly1305_80;
-
- return bch2_csum_opt_to_type(opts.data_checksum, true);
-}
-
-static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
-{
- if (c->sb.encryption_type)
- return BCH_CSUM_chacha20_poly1305_128;
-
- return bch2_csum_opt_to_type(c->opts.metadata_checksum, false);
-}
-
-static inline bool bch2_checksum_type_valid(const struct bch_fs *c,
- unsigned type)
-{
- if (type >= BCH_CSUM_NR)
- return false;
-
- if (bch2_csum_type_is_encryption(type) && !c->chacha20_key_set)
- return false;
-
- return true;
-}
-
-/* returns true if not equal */
-static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
-{
- /*
- * XXX: need some way of preventing the compiler from optimizing this
- * into a form that isn't constant time..
- */
- return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0;
-}
-
-/* for skipping ahead and encrypting/decrypting at an offset: */
-static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
-{
- EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1));
-
- le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE);
- return nonce;
-}
-
-static inline struct nonce null_nonce(void)
-{
- struct nonce ret;
-
- memset(&ret, 0, sizeof(ret));
- return ret;
-}
-
-static inline struct nonce extent_nonce(struct bversion version,
- struct bch_extent_crc_unpacked crc)
-{
- unsigned compression_type = crc_is_compressed(crc)
- ? crc.compression_type
- : 0;
- unsigned size = compression_type ? crc.uncompressed_size : 0;
- struct nonce nonce = (struct nonce) {{
- [0] = cpu_to_le32(size << 22),
- [1] = cpu_to_le32(version.lo),
- [2] = cpu_to_le32(version.lo >> 32),
- [3] = cpu_to_le32(version.hi|
- (compression_type << 24))^BCH_NONCE_EXTENT,
- }};
-
- return nonce_add(nonce, crc.nonce << 9);
-}
-
-static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key)
-{
- return le64_to_cpu(key->magic) != BCH_KEY_MAGIC;
-}
-
-static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb)
-{
- __le64 magic = __bch2_sb_magic(sb);
-
- return (struct nonce) {{
- [0] = 0,
- [1] = 0,
- [2] = ((__le32 *) &magic)[0],
- [3] = ((__le32 *) &magic)[1],
- }};
-}
-
-static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c)
-{
- __le64 magic = bch2_sb_magic(c);
-
- return (struct nonce) {{
- [0] = 0,
- [1] = 0,
- [2] = ((__le32 *) &magic)[0],
- [3] = ((__le32 *) &magic)[1],
- }};
-}
-
-#endif /* _BCACHEFS_CHECKSUM_H */
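bch2_crc_cmp() above notes in its XXX comment that the comparison should stay constant time. A standalone sketch of the usual accumulate-differences idiom, with volatile as a common (though not guaranteed) way to discourage the compiler from short-circuiting:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* OR together every byte difference so the loop shape does not depend on
 * where the first mismatch is; returns nonzero if the buffers differ. */
int crc_neq(const void *a, const void *b, size_t len)
{
	const volatile uint8_t *pa = a, *pb = b;
	uint8_t diff = 0;

	for (size_t i = 0; i < len; i++)
		diff |= pa[i] ^ pb[i];

	return diff != 0;
}

int main(void)
{
	uint64_t x[2] = { 1, 2 }, y[2] = { 1, 3 };

	printf("%d %d\n", crc_neq(x, x, sizeof(x)), crc_neq(x, y, sizeof(y)));
	return 0;
}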
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
deleted file mode 100644
index 8e9264b5a84e..000000000000
--- a/fs/bcachefs/clock.c
+++ /dev/null
@@ -1,181 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "clock.h"
-
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/preempt.h>
-
-static inline bool io_timer_cmp(const void *l, const void *r, void __always_unused *args)
-{
- struct io_timer **_l = (struct io_timer **)l;
- struct io_timer **_r = (struct io_timer **)r;
-
- return (*_l)->expire < (*_r)->expire;
-}
-
-static const struct min_heap_callbacks callbacks = {
- .less = io_timer_cmp,
- .swp = NULL,
-};
-
-void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
-{
- spin_lock(&clock->timer_lock);
-
- if (time_after_eq64((u64) atomic64_read(&clock->now), timer->expire)) {
- spin_unlock(&clock->timer_lock);
- timer->fn(timer);
- return;
- }
-
- for (size_t i = 0; i < clock->timers.nr; i++)
- if (clock->timers.data[i] == timer)
- goto out;
-
- BUG_ON(!min_heap_push(&clock->timers, &timer, &callbacks, NULL));
-out:
- spin_unlock(&clock->timer_lock);
-}
-
-void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
-{
- spin_lock(&clock->timer_lock);
-
- for (size_t i = 0; i < clock->timers.nr; i++)
- if (clock->timers.data[i] == timer) {
- min_heap_del(&clock->timers, i, &callbacks, NULL);
- break;
- }
-
- spin_unlock(&clock->timer_lock);
-}
-
-struct io_clock_wait {
- struct io_timer io_timer;
- struct task_struct *task;
- int expired;
-};
-
-static void io_clock_wait_fn(struct io_timer *timer)
-{
- struct io_clock_wait *wait = container_of(timer,
- struct io_clock_wait, io_timer);
-
- wait->expired = 1;
- wake_up_process(wait->task);
-}
-
-void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until)
-{
- struct io_clock_wait wait = {
- .io_timer.expire = until,
- .io_timer.fn = io_clock_wait_fn,
- .io_timer.fn2 = (void *) _RET_IP_,
- .task = current,
- };
-
- bch2_io_timer_add(clock, &wait.io_timer);
- schedule();
- bch2_io_timer_del(clock, &wait.io_timer);
-}
-
-unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *clock,
- u64 io_until, unsigned long cpu_timeout)
-{
- bool kthread = (current->flags & PF_KTHREAD) != 0;
- struct io_clock_wait wait = {
- .io_timer.expire = io_until,
- .io_timer.fn = io_clock_wait_fn,
- .io_timer.fn2 = (void *) _RET_IP_,
- .task = current,
- };
-
- bch2_io_timer_add(clock, &wait.io_timer);
-
- set_current_state(TASK_INTERRUPTIBLE);
- if (!(kthread && kthread_should_stop())) {
- cpu_timeout = schedule_timeout(cpu_timeout);
- try_to_freeze();
- }
-
- __set_current_state(TASK_RUNNING);
- bch2_io_timer_del(clock, &wait.io_timer);
- return cpu_timeout;
-}
-
-void bch2_kthread_io_clock_wait(struct io_clock *clock,
- u64 io_until, unsigned long cpu_timeout)
-{
- bool kthread = (current->flags & PF_KTHREAD) != 0;
-
- while (!(kthread && kthread_should_stop()) &&
- cpu_timeout &&
- atomic64_read(&clock->now) < io_until)
- cpu_timeout = bch2_kthread_io_clock_wait_once(clock, io_until, cpu_timeout);
-}
-
-static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now)
-{
- struct io_timer *ret = NULL;
-
- if (clock->timers.nr &&
- time_after_eq64(now, clock->timers.data[0]->expire)) {
- ret = *min_heap_peek(&clock->timers);
- min_heap_pop(&clock->timers, &callbacks, NULL);
- }
-
- return ret;
-}
-
-void __bch2_increment_clock(struct io_clock *clock, u64 sectors)
-{
- struct io_timer *timer;
- u64 now = atomic64_add_return(sectors, &clock->now);
-
- spin_lock(&clock->timer_lock);
- while ((timer = get_expired_timer(clock, now)))
- timer->fn(timer);
- spin_unlock(&clock->timer_lock);
-}
-
-void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
-{
- out->atomic++;
- spin_lock(&clock->timer_lock);
- u64 now = atomic64_read(&clock->now);
-
- printbuf_tabstop_push(out, 40);
- prt_printf(out, "current time:\t%llu\n", now);
-
- for (unsigned i = 0; i < clock->timers.nr; i++)
- prt_printf(out, "%ps %ps:\t%llu\n",
- clock->timers.data[i]->fn,
- clock->timers.data[i]->fn2,
- clock->timers.data[i]->expire);
- spin_unlock(&clock->timer_lock);
- --out->atomic;
-}
-
-void bch2_io_clock_exit(struct io_clock *clock)
-{
- free_heap(&clock->timers);
- free_percpu(clock->pcpu_buf);
-}
-
-int bch2_io_clock_init(struct io_clock *clock)
-{
- atomic64_set(&clock->now, 0);
- spin_lock_init(&clock->timer_lock);
-
- clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus();
-
- clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf);
- if (!clock->pcpu_buf)
- return -BCH_ERR_ENOMEM_io_clock_init;
-
- if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL))
- return -BCH_ERR_ENOMEM_io_clock_init;
-
- return 0;
-}
diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h
deleted file mode 100644
index 8769be2aa21e..000000000000
--- a/fs/bcachefs/clock.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_CLOCK_H
-#define _BCACHEFS_CLOCK_H
-
-void bch2_io_timer_add(struct io_clock *, struct io_timer *);
-void bch2_io_timer_del(struct io_clock *, struct io_timer *);
-unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *, u64, unsigned long);
-void bch2_kthread_io_clock_wait(struct io_clock *, u64, unsigned long);
-
-void __bch2_increment_clock(struct io_clock *, u64);
-
-static inline void bch2_increment_clock(struct bch_fs *c, u64 sectors,
- int rw)
-{
- struct io_clock *clock = &c->io_clock[rw];
-
- if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >=
- IO_CLOCK_PCPU_SECTORS))
- __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0));
-}
-
-void bch2_io_clock_schedule_timeout(struct io_clock *, u64);
-
-void bch2_io_timers_to_text(struct printbuf *, struct io_clock *);
-
-void bch2_io_clock_exit(struct io_clock *);
-int bch2_io_clock_init(struct io_clock *);
-
-#endif /* _BCACHEFS_CLOCK_H */
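bch2_increment_clock() above batches increments in a percpu buffer and only touches the shared atomic once IO_CLOCK_PCPU_SECTORS have accumulated, which is why the clock is described as approximate. A single-threaded userspace sketch of the same batching pattern (hypothetical names):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define BATCH_THRESHOLD 128	/* mirrors IO_CLOCK_PCPU_SECTORS */

static atomic_uint_fast64_t clock_now;		/* shared clock, in sectors */
static _Thread_local uint64_t pcpu_buf;		/* per-thread batch buffer */

/* Buffer small increments locally and only touch the shared atomic once a
 * full batch has built up; anything still buffered is simply not visible yet. */
static void increment_clock(uint64_t sectors)
{
	pcpu_buf += sectors;
	if (pcpu_buf >= BATCH_THRESHOLD) {
		atomic_fetch_add(&clock_now, pcpu_buf);
		pcpu_buf = 0;
	}
}

int main(void)
{
	for (int i = 0; i < 1000; i++)
		increment_clock(8);	/* e.g. one 4KiB write = 8 sectors */

	printf("flushed: %llu sectors, still buffered: %llu\n",
	       (unsigned long long) atomic_load(&clock_now),
	       (unsigned long long) pcpu_buf);
	return 0;
}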
diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h
deleted file mode 100644
index 37554e4514fe..000000000000
--- a/fs/bcachefs/clock_types.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_CLOCK_TYPES_H
-#define _BCACHEFS_CLOCK_TYPES_H
-
-#include "util.h"
-
-#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3)
-
-/*
- * Clocks/timers in units of sectors of IO:
- *
- * Note - they use percpu batching, so they're only approximate.
- */
-
-struct io_timer;
-typedef void (*io_timer_fn)(struct io_timer *);
-
-struct io_timer {
- io_timer_fn fn;
- void *fn2;
- u64 expire;
-};
-
-/* Amount to buffer up on a percpu counter */
-#define IO_CLOCK_PCPU_SECTORS 128
-
-typedef DEFINE_MIN_HEAP(struct io_timer *, io_timer_heap) io_timer_heap;
-
-struct io_clock {
- atomic64_t now;
- u16 __percpu *pcpu_buf;
- unsigned max_slop;
-
- spinlock_t timer_lock;
- io_timer_heap timers;
-};
-
-#endif /* _BCACHEFS_CLOCK_TYPES_H */
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
deleted file mode 100644
index b37b1f325f0a..000000000000
--- a/fs/bcachefs/compress.c
+++ /dev/null
@@ -1,773 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "checksum.h"
-#include "compress.h"
-#include "error.h"
-#include "extents.h"
-#include "io_write.h"
-#include "opts.h"
-#include "super-io.h"
-
-#include <linux/lz4.h>
-#include <linux/zlib.h>
-#include <linux/zstd.h>
-
-static inline enum bch_compression_opts bch2_compression_type_to_opt(enum bch_compression_type type)
-{
- switch (type) {
- case BCH_COMPRESSION_TYPE_none:
- case BCH_COMPRESSION_TYPE_incompressible:
- return BCH_COMPRESSION_OPT_none;
- case BCH_COMPRESSION_TYPE_lz4_old:
- case BCH_COMPRESSION_TYPE_lz4:
- return BCH_COMPRESSION_OPT_lz4;
- case BCH_COMPRESSION_TYPE_gzip:
- return BCH_COMPRESSION_OPT_gzip;
- case BCH_COMPRESSION_TYPE_zstd:
- return BCH_COMPRESSION_OPT_zstd;
- default:
- BUG();
- }
-}
-
-/* Bounce buffer: */
-struct bbuf {
- void *b;
- enum {
- BB_NONE,
- BB_VMAP,
- BB_KMALLOC,
- BB_MEMPOOL,
- } type;
- int rw;
-};
-
-static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw)
-{
- void *b;
-
- BUG_ON(size > c->opts.encoded_extent_max);
-
- b = kmalloc(size, GFP_NOFS|__GFP_NOWARN);
- if (b)
- return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw };
-
- b = mempool_alloc(&c->compression_bounce[rw], GFP_NOFS);
- if (b)
- return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw };
-
- BUG();
-}
-
-static bool bio_phys_contig(struct bio *bio, struct bvec_iter start)
-{
- struct bio_vec bv;
- struct bvec_iter iter;
- void *expected_start = NULL;
-
- __bio_for_each_bvec(bv, bio, iter, start) {
- if (expected_start &&
- expected_start != page_address(bv.bv_page) + bv.bv_offset)
- return false;
-
- expected_start = page_address(bv.bv_page) +
- bv.bv_offset + bv.bv_len;
- }
-
- return true;
-}
-
-static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
- struct bvec_iter start, int rw)
-{
- struct bbuf ret;
- struct bio_vec bv;
- struct bvec_iter iter;
- unsigned nr_pages = 0;
- struct page *stack_pages[16];
- struct page **pages = NULL;
- void *data;
-
- BUG_ON(start.bi_size > c->opts.encoded_extent_max);
-
- if (!PageHighMem(bio_iter_page(bio, start)) &&
- bio_phys_contig(bio, start))
- return (struct bbuf) {
- .b = page_address(bio_iter_page(bio, start)) +
- bio_iter_offset(bio, start),
- .type = BB_NONE, .rw = rw
- };
-
- /* check if we can map the pages contiguously: */
- __bio_for_each_segment(bv, bio, iter, start) {
- if (iter.bi_size != start.bi_size &&
- bv.bv_offset)
- goto bounce;
-
- if (bv.bv_len < iter.bi_size &&
- bv.bv_offset + bv.bv_len < PAGE_SIZE)
- goto bounce;
-
- nr_pages++;
- }
-
- BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages);
-
- pages = nr_pages > ARRAY_SIZE(stack_pages)
- ? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS)
- : stack_pages;
- if (!pages)
- goto bounce;
-
- nr_pages = 0;
- __bio_for_each_segment(bv, bio, iter, start)
- pages[nr_pages++] = bv.bv_page;
-
- data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
- if (pages != stack_pages)
- kfree(pages);
-
- if (data)
- return (struct bbuf) {
- .b = data + bio_iter_offset(bio, start),
- .type = BB_VMAP, .rw = rw
- };
-bounce:
- ret = __bounce_alloc(c, start.bi_size, rw);
-
- if (rw == READ)
- memcpy_from_bio(ret.b, bio, start);
-
- return ret;
-}
-
-static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw)
-{
- return __bio_map_or_bounce(c, bio, bio->bi_iter, rw);
-}
-
-static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf)
-{
- switch (buf.type) {
- case BB_NONE:
- break;
- case BB_VMAP:
- vunmap((void *) ((unsigned long) buf.b & PAGE_MASK));
- break;
- case BB_KMALLOC:
- kfree(buf.b);
- break;
- case BB_MEMPOOL:
- mempool_free(buf.b, &c->compression_bounce[buf.rw]);
- break;
- }
-}
-
-static inline void zlib_set_workspace(z_stream *strm, void *workspace)
-{
-#ifdef __KERNEL__
- strm->workspace = workspace;
-#endif
-}
-
-static int __bio_uncompress(struct bch_fs *c, struct bio *src,
- void *dst_data, struct bch_extent_crc_unpacked crc)
-{
- struct bbuf src_data = { NULL };
- size_t src_len = src->bi_iter.bi_size;
- size_t dst_len = crc.uncompressed_size << 9;
- void *workspace;
- int ret = 0, ret2;
-
- enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type);
- mempool_t *workspace_pool = &c->compress_workspace[opt];
- if (unlikely(!mempool_initialized(workspace_pool))) {
- if (fsck_err(c, compression_type_not_marked_in_sb,
- "compression type %s set but not marked in superblock",
- __bch2_compression_types[crc.compression_type]))
- ret = bch2_check_set_has_compressed_data(c, opt);
- else
- ret = bch_err_throw(c, compression_workspace_not_initialized);
- if (ret)
- goto err;
- }
-
- src_data = bio_map_or_bounce(c, src, READ);
-
- switch (crc.compression_type) {
- case BCH_COMPRESSION_TYPE_lz4_old:
- case BCH_COMPRESSION_TYPE_lz4:
- ret2 = LZ4_decompress_safe_partial(src_data.b, dst_data,
- src_len, dst_len, dst_len);
- if (ret2 != dst_len)
- ret = bch_err_throw(c, decompress_lz4);
- break;
- case BCH_COMPRESSION_TYPE_gzip: {
- z_stream strm = {
- .next_in = src_data.b,
- .avail_in = src_len,
- .next_out = dst_data,
- .avail_out = dst_len,
- };
-
- workspace = mempool_alloc(workspace_pool, GFP_NOFS);
-
- zlib_set_workspace(&strm, workspace);
- zlib_inflateInit2(&strm, -MAX_WBITS);
- ret2 = zlib_inflate(&strm, Z_FINISH);
-
- mempool_free(workspace, workspace_pool);
-
- if (ret2 != Z_STREAM_END)
- ret = bch_err_throw(c, decompress_gzip);
- break;
- }
- case BCH_COMPRESSION_TYPE_zstd: {
- ZSTD_DCtx *ctx;
- size_t real_src_len = le32_to_cpup(src_data.b);
-
- if (real_src_len > src_len - 4) {
- ret = bch_err_throw(c, decompress_zstd_src_len_bad);
- goto err;
- }
-
- workspace = mempool_alloc(workspace_pool, GFP_NOFS);
- ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound());
-
- ret2 = zstd_decompress_dctx(ctx,
- dst_data, dst_len,
- src_data.b + 4, real_src_len);
-
- mempool_free(workspace, workspace_pool);
-
- if (ret2 != dst_len)
- ret = bch_err_throw(c, decompress_zstd);
- break;
- }
- default:
- BUG();
- }
-err:
-fsck_err:
- bio_unmap_or_unbounce(c, src_data);
- return ret;
-}
-
-int bch2_bio_uncompress_inplace(struct bch_write_op *op,
- struct bio *bio)
-{
- struct bch_fs *c = op->c;
- struct bch_extent_crc_unpacked *crc = &op->crc;
- struct bbuf data = { NULL };
- size_t dst_len = crc->uncompressed_size << 9;
- int ret = 0;
-
- /* bio must own its pages: */
- BUG_ON(!bio->bi_vcnt);
- BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs);
-
- if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max) {
- bch2_write_op_error(op, op->pos.offset,
- "extent too big to decompress (%u > %u)",
- crc->uncompressed_size << 9, c->opts.encoded_extent_max);
- return bch_err_throw(c, decompress_exceeded_max_encoded_extent);
- }
-
- data = __bounce_alloc(c, dst_len, WRITE);
-
- ret = __bio_uncompress(c, bio, data.b, *crc);
-
- if (c->opts.no_data_io)
- ret = 0;
-
- if (ret) {
- bch2_write_op_error(op, op->pos.offset, "%s", bch2_err_str(ret));
- goto err;
- }
-
- /*
- * XXX: don't have a good way to assert that the bio was allocated with
- * enough space, we depend on bch2_move_extent doing the right thing
- */
- bio->bi_iter.bi_size = crc->live_size << 9;
-
- memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9));
-
- crc->csum_type = 0;
- crc->compression_type = 0;
- crc->compressed_size = crc->live_size;
- crc->uncompressed_size = crc->live_size;
- crc->offset = 0;
- crc->csum = (struct bch_csum) { 0, 0 };
-err:
- bio_unmap_or_unbounce(c, data);
- return ret;
-}
-
-int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
- struct bio *dst, struct bvec_iter dst_iter,
- struct bch_extent_crc_unpacked crc)
-{
- struct bbuf dst_data = { NULL };
- size_t dst_len = crc.uncompressed_size << 9;
- int ret;
-
- if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max ||
- crc.compressed_size << 9 > c->opts.encoded_extent_max)
- return bch_err_throw(c, decompress_exceeded_max_encoded_extent);
-
- dst_data = dst_len == dst_iter.bi_size
- ? __bio_map_or_bounce(c, dst, dst_iter, WRITE)
- : __bounce_alloc(c, dst_len, WRITE);
-
- ret = __bio_uncompress(c, src, dst_data.b, crc);
- if (ret)
- goto err;
-
- if (dst_data.type != BB_NONE &&
- dst_data.type != BB_VMAP)
- memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9));
-err:
- bio_unmap_or_unbounce(c, dst_data);
- return ret;
-}
-
-static int attempt_compress(struct bch_fs *c,
- void *workspace,
- void *dst, size_t dst_len,
- void *src, size_t src_len,
- struct bch_compression_opt compression)
-{
- enum bch_compression_type compression_type =
- __bch2_compression_opt_to_type[compression.type];
-
- switch (compression_type) {
- case BCH_COMPRESSION_TYPE_lz4:
- if (compression.level < LZ4HC_MIN_CLEVEL) {
- int len = src_len;
- int ret = LZ4_compress_destSize(
- src, dst,
- &len, dst_len,
- workspace);
- if (len < src_len)
- return -len;
-
- return ret;
- } else {
- int ret = LZ4_compress_HC(
- src, dst,
- src_len, dst_len,
- compression.level,
- workspace);
-
- return ret ?: -1;
- }
- case BCH_COMPRESSION_TYPE_gzip: {
- z_stream strm = {
- .next_in = src,
- .avail_in = src_len,
- .next_out = dst,
- .avail_out = dst_len,
- };
-
- zlib_set_workspace(&strm, workspace);
- if (zlib_deflateInit2(&strm,
- compression.level
- ? clamp_t(unsigned, compression.level,
- Z_BEST_SPEED, Z_BEST_COMPRESSION)
- : Z_DEFAULT_COMPRESSION,
- Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
- Z_DEFAULT_STRATEGY) != Z_OK)
- return 0;
-
- if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END)
- return 0;
-
- if (zlib_deflateEnd(&strm) != Z_OK)
- return 0;
-
- return strm.total_out;
- }
- case BCH_COMPRESSION_TYPE_zstd: {
- /*
- * rescale:
- * zstd max compression level is 22, our max level is 15
- */
- unsigned level = min((compression.level * 3) / 2, zstd_max_clevel());
- ZSTD_parameters params = zstd_get_params(level, c->opts.encoded_extent_max);
- ZSTD_CCtx *ctx = zstd_init_cctx(workspace, c->zstd_workspace_size);
-
- /*
- * ZSTD requires that when we decompress we pass in the exact
- * compressed size - rounding it up to the nearest sector
- * doesn't work, so we use the first 4 bytes of the buffer for
- * that.
- *
- * Additionally, the ZSTD code seems to have a bug where it will
- * write just past the end of the buffer - so subtract a fudge
- * factor (7 bytes) from the dst buffer size to account for
- * that.
- */
- size_t len = zstd_compress_cctx(ctx,
- dst + 4, dst_len - 4 - 7,
- src, src_len,
- &params);
- if (zstd_is_error(len))
- return 0;
-
- *((__le32 *) dst) = cpu_to_le32(len);
- return len + 4;
- }
- default:
- BUG();
- }
-}
-
-static unsigned __bio_compress(struct bch_fs *c,
- struct bio *dst, size_t *dst_len,
- struct bio *src, size_t *src_len,
- struct bch_compression_opt compression)
-{
- struct bbuf src_data = { NULL }, dst_data = { NULL };
- void *workspace;
- enum bch_compression_type compression_type =
- __bch2_compression_opt_to_type[compression.type];
- unsigned pad;
- int ret = 0;
-
- /* bch2_compression_decode catches unknown compression types: */
- BUG_ON(compression.type >= BCH_COMPRESSION_OPT_NR);
-
- mempool_t *workspace_pool = &c->compress_workspace[compression.type];
- if (unlikely(!mempool_initialized(workspace_pool))) {
- if (fsck_err(c, compression_opt_not_marked_in_sb,
- "compression opt %s set but not marked in superblock",
- bch2_compression_opts[compression.type])) {
- ret = bch2_check_set_has_compressed_data(c, compression.type);
- if (ret) /* memory allocation failure, don't compress */
- return 0;
- } else {
- return 0;
- }
- }
-
- /* If it's only one block, don't bother trying to compress: */
- if (src->bi_iter.bi_size <= c->opts.block_size)
- return BCH_COMPRESSION_TYPE_incompressible;
-
- dst_data = bio_map_or_bounce(c, dst, WRITE);
- src_data = bio_map_or_bounce(c, src, READ);
-
- workspace = mempool_alloc(workspace_pool, GFP_NOFS);
-
- *src_len = src->bi_iter.bi_size;
- *dst_len = dst->bi_iter.bi_size;
-
- /*
- * XXX: this algorithm sucks when the compression code doesn't tell us
- * how much would fit, like LZ4 does:
- */
- while (1) {
- if (*src_len <= block_bytes(c)) {
- ret = -1;
- break;
- }
-
- ret = attempt_compress(c, workspace,
- dst_data.b, *dst_len,
- src_data.b, *src_len,
- compression);
- if (ret > 0) {
- *dst_len = ret;
- ret = 0;
- break;
- }
-
- /* Didn't fit: should we retry with a smaller amount? */
- if (*src_len <= *dst_len) {
- ret = -1;
- break;
- }
-
- /*
- * If ret is negative, it's a hint as to how much data would fit
- */
- BUG_ON(-ret >= *src_len);
-
- if (ret < 0)
- *src_len = -ret;
- else
- *src_len -= (*src_len - *dst_len) / 2;
- *src_len = round_down(*src_len, block_bytes(c));
- }
-
- mempool_free(workspace, workspace_pool);
-
- if (ret)
- goto err;
-
- /* Didn't get smaller: */
- if (round_up(*dst_len, block_bytes(c)) >= *src_len)
- goto err;
-
- pad = round_up(*dst_len, block_bytes(c)) - *dst_len;
-
- memset(dst_data.b + *dst_len, 0, pad);
- *dst_len += pad;
-
- if (dst_data.type != BB_NONE &&
- dst_data.type != BB_VMAP)
- memcpy_to_bio(dst, dst->bi_iter, dst_data.b);
-
- BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size);
- BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size);
- BUG_ON(*dst_len & (block_bytes(c) - 1));
- BUG_ON(*src_len & (block_bytes(c) - 1));
- ret = compression_type;
-out:
- bio_unmap_or_unbounce(c, src_data);
- bio_unmap_or_unbounce(c, dst_data);
- return ret;
-err:
- ret = BCH_COMPRESSION_TYPE_incompressible;
- goto out;
-fsck_err:
- ret = 0;
- goto out;
-}
-
-unsigned bch2_bio_compress(struct bch_fs *c,
- struct bio *dst, size_t *dst_len,
- struct bio *src, size_t *src_len,
- unsigned compression_opt)
-{
- unsigned orig_dst = dst->bi_iter.bi_size;
- unsigned orig_src = src->bi_iter.bi_size;
- unsigned compression_type;
-
- /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */
- src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size,
- c->opts.encoded_extent_max);
- /* Don't generate a bigger output than input: */
- dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
-
- compression_type =
- __bio_compress(c, dst, dst_len, src, src_len,
- bch2_compression_decode(compression_opt));
-
- dst->bi_iter.bi_size = orig_dst;
- src->bi_iter.bi_size = orig_src;
- return compression_type;
-}
-
-static int __bch2_fs_compress_init(struct bch_fs *, u64);
-
-#define BCH_FEATURE_none 0
-
-static const unsigned bch2_compression_opt_to_feature[] = {
-#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t,
- BCH_COMPRESSION_OPTS()
-#undef x
-};
-
-#undef BCH_FEATURE_none
-
-static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
-{
- int ret = 0;
-
- if ((c->sb.features & f) == f)
- return 0;
-
- mutex_lock(&c->sb_lock);
-
- if ((c->sb.features & f) == f) {
- mutex_unlock(&c->sb_lock);
- return 0;
- }
-
- ret = __bch2_fs_compress_init(c, c->sb.features|f);
- if (ret) {
- mutex_unlock(&c->sb_lock);
- return ret;
- }
-
- c->disk_sb.sb->features[0] |= cpu_to_le64(f);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- return 0;
-}
-
-int bch2_check_set_has_compressed_data(struct bch_fs *c,
- unsigned compression_opt)
-{
- unsigned compression_type = bch2_compression_decode(compression_opt).type;
-
- BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature));
-
- return compression_type
- ? __bch2_check_set_has_compressed_data(c,
- 1ULL << bch2_compression_opt_to_feature[compression_type])
- : 0;
-}
-
-void bch2_fs_compress_exit(struct bch_fs *c)
-{
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++)
- mempool_exit(&c->compress_workspace[i]);
- mempool_exit(&c->compression_bounce[WRITE]);
- mempool_exit(&c->compression_bounce[READ]);
-}
-
-static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
-{
- ZSTD_parameters params = zstd_get_params(zstd_max_clevel(),
- c->opts.encoded_extent_max);
-
- c->zstd_workspace_size = zstd_cctx_workspace_bound(&params.cParams);
-
- struct {
- unsigned feature;
- enum bch_compression_opts type;
- size_t compress_workspace;
- } compression_types[] = {
- { BCH_FEATURE_lz4, BCH_COMPRESSION_OPT_lz4,
- max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) },
- { BCH_FEATURE_gzip, BCH_COMPRESSION_OPT_gzip,
- max(zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
- zlib_inflate_workspacesize()) },
- { BCH_FEATURE_zstd, BCH_COMPRESSION_OPT_zstd,
- max(c->zstd_workspace_size,
- zstd_dctx_workspace_bound()) },
- }, *i;
- bool have_compressed = false;
-
- for (i = compression_types;
- i < compression_types + ARRAY_SIZE(compression_types);
- i++)
- have_compressed |= (features & (1 << i->feature)) != 0;
-
- if (!have_compressed)
- return 0;
-
- if (!mempool_initialized(&c->compression_bounce[READ]) &&
- mempool_init_kvmalloc_pool(&c->compression_bounce[READ],
- 1, c->opts.encoded_extent_max))
- return bch_err_throw(c, ENOMEM_compression_bounce_read_init);
-
- if (!mempool_initialized(&c->compression_bounce[WRITE]) &&
- mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE],
- 1, c->opts.encoded_extent_max))
- return bch_err_throw(c, ENOMEM_compression_bounce_write_init);
-
- for (i = compression_types;
- i < compression_types + ARRAY_SIZE(compression_types);
- i++) {
- if (!(features & (1 << i->feature)))
- continue;
-
- if (mempool_initialized(&c->compress_workspace[i->type]))
- continue;
-
- if (mempool_init_kvmalloc_pool(
- &c->compress_workspace[i->type],
- 1, i->compress_workspace))
- return bch_err_throw(c, ENOMEM_compression_workspace_init);
- }
-
- return 0;
-}
-
-static u64 compression_opt_to_feature(unsigned v)
-{
- unsigned type = bch2_compression_decode(v).type;
-
- return BIT_ULL(bch2_compression_opt_to_feature[type]);
-}
-
-int bch2_fs_compress_init(struct bch_fs *c)
-{
- u64 f = c->sb.features;
-
- f |= compression_opt_to_feature(c->opts.compression);
- f |= compression_opt_to_feature(c->opts.background_compression);
-
- return __bch2_fs_compress_init(c, f);
-}
-
-int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res,
- struct printbuf *err)
-{
- char *val = kstrdup(_val, GFP_KERNEL);
- char *p = val, *type_str, *level_str;
- struct bch_compression_opt opt = { 0 };
- int ret;
-
- if (!val)
- return -ENOMEM;
-
- type_str = strsep(&p, ":");
- level_str = p;
-
- ret = match_string(bch2_compression_opts, -1, type_str);
- if (ret < 0 && err)
- prt_printf(err, "invalid compression type\n");
- if (ret < 0)
- goto err;
-
- opt.type = ret;
-
- if (level_str) {
- unsigned level;
-
- ret = kstrtouint(level_str, 10, &level);
- if (!ret && !opt.type && level)
- ret = -EINVAL;
- if (!ret && level > 15)
- ret = -EINVAL;
- if (ret < 0 && err)
- prt_printf(err, "invalid compression level\n");
- if (ret < 0)
- goto err;
-
- opt.level = level;
- }
-
- *res = bch2_compression_encode(opt);
-err:
- kfree(val);
- return ret;
-}
-
-void bch2_compression_opt_to_text(struct printbuf *out, u64 v)
-{
- struct bch_compression_opt opt = bch2_compression_decode(v);
-
- if (opt.type < BCH_COMPRESSION_OPT_NR)
- prt_str(out, bch2_compression_opts[opt.type]);
- else
- prt_printf(out, "(unknown compression opt %u)", opt.type);
- if (opt.level)
- prt_printf(out, ":%u", opt.level);
-}
-
-void bch2_opt_compression_to_text(struct printbuf *out,
- struct bch_fs *c,
- struct bch_sb *sb,
- u64 v)
-{
- return bch2_compression_opt_to_text(out, v);
-}
-
-int bch2_opt_compression_validate(u64 v, struct printbuf *err)
-{
- if (!bch2_compression_opt_valid(v)) {
- prt_printf(err, "invalid compression opt %llu", v);
- return -BCH_ERR_invalid_sb_opt_compression;
- }
-
- return 0;
-}
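
One detail worth pulling out of compress.c above: zstd extents are stored with their exact compressed size in the first 4 bytes (little-endian), because decompression needs the exact length and rounding up to a sector boundary would not work; the read path also bounds-checks that length against what was actually read before trusting it. The following is a small self-contained sketch of just that framing and check, with no real compressor involved — buffer names and sizes here are invented for illustration.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void put_le32(void *dst, uint32_t v)
{
	uint8_t *p = dst;
	p[0] = v; p[1] = v >> 8; p[2] = v >> 16; p[3] = v >> 24;
}

static uint32_t get_le32(const void *src)
{
	const uint8_t *p = src;
	return p[0] | (p[1] << 8) | ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

int main(void)
{
	uint8_t buf[64];
	const char payload[] = "compressed bytes would go here";
	uint32_t len = sizeof(payload);

	/* encode: length prefix, then the (nominally compressed) payload */
	put_le32(buf, len);
	memcpy(buf + 4, payload, len);
	size_t src_len = 4 + len;	/* what actually went to disk */

	/* decode: recover the exact compressed size, bounds-check it first */
	uint32_t real_src_len = get_le32(buf);
	if (real_src_len > src_len - 4) {
		fprintf(stderr, "corrupt length prefix\n");
		return 1;
	}
	printf("payload is %u bytes: %s\n", real_src_len, (char *)(buf + 4));
	return 0;
}
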
diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h
deleted file mode 100644
index bec2f05bfd52..000000000000
--- a/fs/bcachefs/compress.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_COMPRESS_H
-#define _BCACHEFS_COMPRESS_H
-
-#include "extents_types.h"
-
-static const unsigned __bch2_compression_opt_to_type[] = {
-#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t,
- BCH_COMPRESSION_OPTS()
-#undef x
-};
-
-struct bch_compression_opt {
- u8 type:4,
- level:4;
-};
-
-static inline struct bch_compression_opt __bch2_compression_decode(unsigned v)
-{
- return (struct bch_compression_opt) {
- .type = v & 15,
- .level = v >> 4,
- };
-}
-
-static inline bool bch2_compression_opt_valid(unsigned v)
-{
- struct bch_compression_opt opt = __bch2_compression_decode(v);
-
- return opt.type < ARRAY_SIZE(__bch2_compression_opt_to_type) && !(!opt.type && opt.level);
-}
-
-static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
-{
- return bch2_compression_opt_valid(v)
- ? __bch2_compression_decode(v)
- : (struct bch_compression_opt) { 0 };
-}
-
-static inline unsigned bch2_compression_encode(struct bch_compression_opt opt)
-{
- return opt.type|(opt.level << 4);
-}
-
-static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
-{
- return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
-}
-
-struct bch_write_op;
-int bch2_bio_uncompress_inplace(struct bch_write_op *, struct bio *);
-int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
- struct bvec_iter, struct bch_extent_crc_unpacked);
-unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *,
- struct bio *, size_t *, unsigned);
-
-int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
-void bch2_fs_compress_exit(struct bch_fs *);
-int bch2_fs_compress_init(struct bch_fs *);
-
-void bch2_compression_opt_to_text(struct printbuf *, u64);
-
-int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
-void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
-int bch2_opt_compression_validate(u64, struct printbuf *);
-
-#define bch2_opt_compression (struct bch_opt_fn) { \
- .parse = bch2_opt_compression_parse, \
- .to_text = bch2_opt_compression_to_text, \
- .validate = bch2_opt_compression_validate, \
-}
-
-#endif /* _BCACHEFS_COMPRESS_H */
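
compress.h above packs a compression option into a single byte: the type in the low 4 bits, the level in the high 4 bits, and a level is only valid for a non-"none" type. A standalone sketch of that encode/decode/validate round trip follows; the type indices used here are placeholders, not the real enum values.

#include <stdbool.h>
#include <stdio.h>

struct compression_opt { unsigned type:4, level:4; };

static unsigned opt_encode(struct compression_opt opt)
{
	return opt.type | (opt.level << 4);
}

static struct compression_opt opt_decode(unsigned v)
{
	return (struct compression_opt) { .type = v & 15, .level = v >> 4 };
}

static bool opt_valid(unsigned v, unsigned nr_types)
{
	struct compression_opt opt = opt_decode(v);

	/* the type must be known, and type 0 ("none") cannot take a level */
	return opt.type < nr_types && !(!opt.type && opt.level);
}

int main(void)
{
	/* a hypothetical type index 3 at level 7 */
	unsigned v = opt_encode((struct compression_opt) { .type = 3, .level = 7 });
	struct compression_opt opt = opt_decode(v);

	printf("encoded=%u type=%u level=%u valid=%d\n",
	       v, (unsigned)opt.type, (unsigned)opt.level, (int)opt_valid(v, 4));
	return 0;
}
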
diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c
deleted file mode 100644
index e86d36d23e9e..000000000000
--- a/fs/bcachefs/darray.c
+++ /dev/null
@@ -1,38 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/log2.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include "darray.h"
-
-int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
-{
- if (new_size > d->size) {
- new_size = roundup_pow_of_two(new_size);
-
- /*
- * This is a workaround: kvmalloc() doesn't support > INT_MAX
- * allocations, but vmalloc() does.
-		 * The limit needs to be lifted from kvmalloc, and when it is
-		 * we'll go back to just using that.
- */
- size_t bytes;
- if (unlikely(check_mul_overflow(new_size, element_size, &bytes)))
- return -ENOMEM;
-
- void *data = likely(bytes < INT_MAX)
- ? kvmalloc_noprof(bytes, gfp)
- : vmalloc_noprof(bytes);
- if (!data)
- return -ENOMEM;
-
- if (d->size)
- memcpy(data, d->data, d->size * element_size);
- if (d->data != d->preallocated)
- kvfree(d->data);
- d->data = data;
- d->size = new_size;
- }
-
- return 0;
-}
diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h
deleted file mode 100644
index 4080ee99aadd..000000000000
--- a/fs/bcachefs/darray.h
+++ /dev/null
@@ -1,158 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DARRAY_H
-#define _BCACHEFS_DARRAY_H
-
-/*
- * Dynamic arrays:
- *
- * Inspired by CCAN's darray
- */
-
-#include <linux/cleanup.h>
-#include <linux/slab.h>
-
-#define DARRAY_PREALLOCATED(_type, _nr) \
-struct { \
- size_t nr, size; \
- _type *data; \
- _type preallocated[_nr]; \
-}
-
-#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0)
-
-typedef DARRAY(char) darray_char;
-typedef DARRAY(char *) darray_str;
-typedef DARRAY(const char *) darray_const_str;
-
-typedef DARRAY(u8) darray_u8;
-typedef DARRAY(u16) darray_u16;
-typedef DARRAY(u32) darray_u32;
-typedef DARRAY(u64) darray_u64;
-
-typedef DARRAY(s8) darray_s8;
-typedef DARRAY(s16) darray_s16;
-typedef DARRAY(s32) darray_s32;
-typedef DARRAY(s64) darray_s64;
-
-int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
-
-#define __bch2_darray_resize(...) alloc_hooks(__bch2_darray_resize_noprof(__VA_ARGS__))
-
-#define __darray_resize(_d, _element_size, _new_size, _gfp) \
- (unlikely((_new_size) > (_d)->size) \
- ? __bch2_darray_resize((_d), (_element_size), (_new_size), (_gfp))\
- : 0)
-
-#define darray_resize_gfp(_d, _new_size, _gfp) \
- __darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp)
-
-#define darray_resize(_d, _new_size) \
- darray_resize_gfp(_d, _new_size, GFP_KERNEL)
-
-#define darray_make_room_gfp(_d, _more, _gfp) \
- darray_resize_gfp((_d), (_d)->nr + (_more), _gfp)
-
-#define darray_make_room(_d, _more) \
- darray_make_room_gfp(_d, _more, GFP_KERNEL)
-
-#define darray_room(_d) ((_d).size - (_d).nr)
-
-#define darray_top(_d) ((_d).data[(_d).nr])
-
-#define darray_push_gfp(_d, _item, _gfp) \
-({ \
- int _ret = darray_make_room_gfp((_d), 1, _gfp); \
- \
- if (!_ret) \
- (_d)->data[(_d)->nr++] = (_item); \
- _ret; \
-})
-
-#define darray_push(_d, _item) darray_push_gfp(_d, _item, GFP_KERNEL)
-
-#define darray_pop(_d) ((_d)->data[--(_d)->nr])
-
-#define darray_first(_d) ((_d).data[0])
-#define darray_last(_d) ((_d).data[(_d).nr - 1])
-
-#define darray_insert_item(_d, pos, _item) \
-({ \
- size_t _pos = (pos); \
- int _ret = darray_make_room((_d), 1); \
- \
- if (!_ret) \
- array_insert_item((_d)->data, (_d)->nr, _pos, (_item)); \
- _ret; \
-})
-
-#define darray_remove_item(_d, _pos) \
- array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data)
-
-#define darray_find_p(_d, _i, cond) \
-({ \
- typeof((_d).data) _ret = NULL; \
- \
- darray_for_each(_d, _i) \
- if (cond) { \
- _ret = _i; \
- break; \
- } \
- _ret; \
-})
-
-#define darray_find(_d, _item) darray_find_p(_d, _i, *_i == _item)
-
-/* Iteration: */
-
-#define __darray_for_each(_d, _i) \
- for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++)
-
-#define darray_for_each(_d, _i) \
- for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++)
-
-#define darray_for_each_reverse(_d, _i) \
- for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i)
-
-/* Init/exit */
-
-#define darray_init(_d) \
-do { \
- (_d)->nr = 0; \
- (_d)->size = ARRAY_SIZE((_d)->preallocated); \
- (_d)->data = (_d)->size ? (_d)->preallocated : NULL; \
-} while (0)
-
-#define darray_exit(_d) \
-do { \
- if (!ARRAY_SIZE((_d)->preallocated) || \
- (_d)->data != (_d)->preallocated) \
- kvfree((_d)->data); \
- darray_init(_d); \
-} while (0)
-
-#define DEFINE_DARRAY_CLASS(_type) \
-DEFINE_CLASS(_type, _type, darray_exit(&(_T)), (_type) {}, void)
-
-#define DEFINE_DARRAY(_type) \
-typedef DARRAY(_type) darray_##_type; \
-DEFINE_DARRAY_CLASS(darray_##_type)
-
-#define DEFINE_DARRAY_NAMED(_name, _type) \
-typedef DARRAY(_type) _name; \
-DEFINE_DARRAY_CLASS(_name)
-
-DEFINE_DARRAY_CLASS(darray_char);
-DEFINE_DARRAY_CLASS(darray_str)
-DEFINE_DARRAY_CLASS(darray_const_str)
-
-DEFINE_DARRAY_CLASS(darray_u8)
-DEFINE_DARRAY_CLASS(darray_u16)
-DEFINE_DARRAY_CLASS(darray_u32)
-DEFINE_DARRAY_CLASS(darray_u64)
-
-DEFINE_DARRAY_CLASS(darray_s8)
-DEFINE_DARRAY_CLASS(darray_s16)
-DEFINE_DARRAY_CLASS(darray_s32)
-DEFINE_DARRAY_CLASS(darray_s64)
-
-#endif /* _BCACHEFS_DARRAY_H */
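
The darray above is a small dynamic-array-of-anything built from macros; its push path makes room by growing to the next power of two, with an overflow check on the byte count (see darray.c earlier). A simplified userspace analogue of that pattern — plain functions instead of the macros, invented names, no preallocated inline storage — just to show the shape:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct darray_u64_sketch {
	size_t nr, size;
	uint64_t *data;
};

static size_t roundup_pow_of_two(size_t n)
{
	size_t r = 1;
	while (r < n)
		r <<= 1;
	return r;
}

static int darray_make_room(struct darray_u64_sketch *d, size_t more)
{
	size_t new_size = d->nr + more;

	if (new_size <= d->size)
		return 0;

	new_size = roundup_pow_of_two(new_size);
	if (new_size > SIZE_MAX / sizeof(*d->data))
		return -1;	/* multiplication would overflow */

	uint64_t *data = realloc(d->data, new_size * sizeof(*d->data));
	if (!data)
		return -1;

	d->data = data;
	d->size = new_size;
	return 0;
}

static int darray_push(struct darray_u64_sketch *d, uint64_t item)
{
	int ret = darray_make_room(d, 1);

	if (!ret)
		d->data[d->nr++] = item;
	return ret;
}

int main(void)
{
	struct darray_u64_sketch d = { 0 };

	for (uint64_t i = 0; i < 10; i++)
		if (darray_push(&d, i * i))
			return 1;

	for (size_t i = 0; i < d.nr; i++)
		printf("%zu: %llu\n", i, (unsigned long long)d.data[i]);

	free(d.data);
	return 0;
}
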
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
deleted file mode 100644
index e848e210a9bf..000000000000
--- a/fs/bcachefs/data_update.c
+++ /dev/null
@@ -1,1021 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "compress.h"
-#include "data_update.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "error.h"
-#include "extents.h"
-#include "io_write.h"
-#include "keylist.h"
-#include "move.h"
-#include "nocow_locking.h"
-#include "rebalance.h"
-#include "snapshot.h"
-#include "subvolume.h"
-#include "trace.h"
-
-#include <linux/ioprio.h>
-
-static const char * const bch2_data_update_type_strs[] = {
-#define x(t, n, ...) [n] = #t,
- BCH_DATA_UPDATE_TYPES()
-#undef x
- NULL
-};
-
-static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(ptrs, ptr)
- bch2_dev_put(bch2_dev_have_ref(c, ptr->dev));
-}
-
-static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(ptrs, ptr) {
- if (unlikely(!bch2_dev_tryget(c, ptr->dev))) {
- bkey_for_each_ptr(ptrs, ptr2) {
- if (ptr2 == ptr)
- break;
- bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev));
- }
- return false;
- }
- }
- return true;
-}
-
-static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
- struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
-
- bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
- }
-}
-
-static noinline_for_stack
-bool __bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs,
- const struct bch_extent_ptr *start)
-{
- if (!ctxt) {
- bkey_for_each_ptr(ptrs, ptr) {
- if (ptr == start)
- break;
-
- struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
- struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
- bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
- }
- return false;
- }
-
- __bkey_for_each_ptr(start, ptrs.end, ptr) {
- struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
- struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
-
- bool locked;
- move_ctxt_wait_event(ctxt,
- (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) ||
- list_empty(&ctxt->ios));
- if (!locked)
- bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
- }
- return true;
-}
-
-static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs)
-{
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
- struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
-
- if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0))
- return __bkey_nocow_lock(c, ctxt, ptrs, ptr);
- }
-
- return true;
-}
-
-noinline_for_stack
-static void trace_io_move_finish2(struct data_update *u,
- struct bkey_i *new,
- struct bkey_i *insert)
-{
- struct bch_fs *c = u->op.c;
- struct printbuf buf = PRINTBUF;
-
- prt_newline(&buf);
-
- bch2_data_update_to_text(&buf, u);
- prt_newline(&buf);
-
- prt_str_indented(&buf, "new replicas:\t");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
- prt_newline(&buf);
-
- prt_str_indented(&buf, "insert:\t");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
- prt_newline(&buf);
-
- trace_io_move_finish(c, buf.buf);
- printbuf_exit(&buf);
-}
-
-noinline_for_stack
-static void trace_io_move_fail2(struct data_update *m,
- struct bkey_s_c new,
- struct bkey_s_c wrote,
- struct bkey_i *insert,
- const char *msg)
-{
- struct bch_fs *c = m->op.c;
- struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
- struct printbuf buf = PRINTBUF;
- unsigned rewrites_found = 0;
-
- if (!trace_io_move_fail_enabled())
- return;
-
- prt_str(&buf, msg);
-
- if (insert) {
- const union bch_extent_entry *entry;
- struct bch_extent_ptr *ptr;
- struct extent_ptr_decoded p;
-
- unsigned ptr_bit = 1;
- bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
- if ((ptr_bit & m->data_opts.rewrite_ptrs) &&
- (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
- !ptr->cached)
- rewrites_found |= ptr_bit;
- ptr_bit <<= 1;
- }
- }
-
- prt_str(&buf, "rewrites found:\t");
- bch2_prt_u64_base2(&buf, rewrites_found);
- prt_newline(&buf);
-
- bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts);
-
- prt_str(&buf, "\nold: ");
- bch2_bkey_val_to_text(&buf, c, old);
-
- prt_str(&buf, "\nnew: ");
- bch2_bkey_val_to_text(&buf, c, new);
-
- prt_str(&buf, "\nwrote: ");
- bch2_bkey_val_to_text(&buf, c, wrote);
-
- if (insert) {
- prt_str(&buf, "\ninsert: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
- }
-
- trace_io_move_fail(c, buf.buf);
- printbuf_exit(&buf);
-}
-
-noinline_for_stack
-static void trace_data_update2(struct data_update *m,
- struct bkey_s_c old, struct bkey_s_c k,
- struct bkey_i *insert)
-{
- struct bch_fs *c = m->op.c;
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "\nold: ");
- bch2_bkey_val_to_text(&buf, c, old);
- prt_str(&buf, "\nk: ");
- bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, "\nnew: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
-
- trace_data_update(c, buf.buf);
- printbuf_exit(&buf);
-}
-
-noinline_for_stack
-static void trace_io_move_created_rebalance2(struct data_update *m,
- struct bkey_s_c old, struct bkey_s_c k,
- struct bkey_i *insert)
-{
- struct bch_fs *c = m->op.c;
- struct printbuf buf = PRINTBUF;
-
- bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts);
-
- prt_str(&buf, "\nold: ");
- bch2_bkey_val_to_text(&buf, c, old);
- prt_str(&buf, "\nk: ");
- bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, "\nnew: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
-
- trace_io_move_created_rebalance(c, buf.buf);
- printbuf_exit(&buf);
-
- this_cpu_inc(c->counters[BCH_COUNTER_io_move_created_rebalance]);
-}
-
-noinline_for_stack
-static int data_update_invalid_bkey(struct data_update *m,
- struct bkey_s_c old, struct bkey_s_c k,
- struct bkey_i *insert)
-{
- struct bch_fs *c = m->op.c;
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
-
- prt_str(&buf, "about to insert invalid key in data update path");
- prt_printf(&buf, "\nop.nonce: %u", m->op.nonce);
- prt_str(&buf, "\nold: ");
- bch2_bkey_val_to_text(&buf, c, old);
- prt_str(&buf, "\nk: ");
- bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, "\nnew: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
- prt_newline(&buf);
-
- bch2_fs_emergency_read_only2(c, &buf);
-
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
-
- return bch_err_throw(c, invalid_bkey);
-}
-
-static int __bch2_data_update_index_update(struct btree_trans *trans,
- struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct btree_iter iter;
- struct data_update *m = container_of(op, struct data_update, op);
- int ret = 0;
-
- bch2_trans_iter_init(trans, &iter, m->btree_id,
- bkey_start_pos(&bch2_keylist_front(&op->insert_keys)->k),
- BTREE_ITER_slots|BTREE_ITER_intent);
-
- while (1) {
- struct bkey_s_c k;
- struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
- struct bkey_i *insert = NULL;
- struct bkey_i_extent *new;
- const union bch_extent_entry *entry_c;
- union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- struct bch_extent_ptr *ptr;
- const struct bch_extent_ptr *ptr_c;
- struct bpos next_pos;
- bool should_check_enospc;
- s64 i_sectors_delta = 0, disk_sectors_delta = 0;
- unsigned rewrites_found = 0, durability, ptr_bit;
-
- bch2_trans_begin(trans);
-
- k = bch2_btree_iter_peek_slot(trans, &iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- new = bkey_i_to_extent(bch2_keylist_front(&op->insert_keys));
-
- if (!bch2_extents_match(k, old)) {
- trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i),
- NULL, "no match:");
- goto nowork;
- }
-
- insert = bch2_trans_kmalloc(trans,
- bkey_bytes(k.k) +
- bkey_val_bytes(&new->k) +
- sizeof(struct bch_extent_rebalance));
- ret = PTR_ERR_OR_ZERO(insert);
- if (ret)
- goto err;
-
- bkey_reassemble(insert, k);
-
- new = bch2_trans_kmalloc(trans, bkey_bytes(&new->k));
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto err;
-
- bkey_copy(&new->k_i, bch2_keylist_front(&op->insert_keys));
- bch2_cut_front(iter.pos, &new->k_i);
-
- bch2_cut_front(iter.pos, insert);
- bch2_cut_back(new->k.p, insert);
- bch2_cut_back(insert->k.p, &new->k_i);
-
- /*
- * @old: extent that we read from
- * @insert: key that we're going to update, initialized from
- * extent currently in btree - same as @old unless we raced with
- * other updates
- * @new: extent with new pointers that we'll be adding to @insert
- *
-	 * First, drop rewrite_ptrs from @new:
- */
- ptr_bit = 1;
- bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) {
- if ((ptr_bit & m->data_opts.rewrite_ptrs) &&
- (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
- !ptr->cached) {
- bch2_extent_ptr_set_cached(c, &m->op.opts,
- bkey_i_to_s(insert), ptr);
- rewrites_found |= ptr_bit;
- }
- ptr_bit <<= 1;
- }
-
- if (m->data_opts.rewrite_ptrs &&
- !rewrites_found &&
- bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) {
- trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:");
- goto nowork;
- }
-
- /*
- * A replica that we just wrote might conflict with a replica
- * that we want to keep, due to racing with another move:
- */
-restart_drop_conflicting_replicas:
- extent_for_each_ptr(extent_i_to_s(new), ptr)
- if ((ptr_c = bch2_bkey_has_device_c(bkey_i_to_s_c(insert), ptr->dev)) &&
- !ptr_c->cached) {
- bch2_bkey_drop_ptr_noerror(bkey_i_to_s(&new->k_i), ptr);
- goto restart_drop_conflicting_replicas;
- }
-
- if (!bkey_val_u64s(&new->k)) {
- trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:");
- goto nowork;
- }
-
- /* Now, drop pointers that conflict with what we just wrote: */
- extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
- if ((ptr = bch2_bkey_has_device(bkey_i_to_s(insert), p.ptr.dev)))
- bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);
-
- durability = bch2_bkey_durability(c, bkey_i_to_s_c(insert)) +
- bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i));
-
- /* Now, drop excess replicas: */
- scoped_guard(rcu) {
-restart_drop_extra_replicas:
- bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
- unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);
-
- if (!p.ptr.cached &&
- durability - ptr_durability >= m->op.opts.data_replicas) {
- durability -= ptr_durability;
-
- bch2_extent_ptr_set_cached(c, &m->op.opts,
- bkey_i_to_s(insert), &entry->ptr);
- goto restart_drop_extra_replicas;
- }
- }
- }
-
- /* Finally, add the pointers we just wrote: */
- extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
- bch2_extent_ptr_decoded_append(insert, &p);
-
- bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 });
- bch2_extent_normalize_by_opts(c, &m->op.opts, bkey_i_to_s(insert));
-
- ret = bch2_sum_sector_overwrites(trans, &iter, insert,
- &should_check_enospc,
- &i_sectors_delta,
- &disk_sectors_delta);
- if (ret)
- goto err;
-
- if (disk_sectors_delta > (s64) op->res.sectors) {
- ret = bch2_disk_reservation_add(c, &op->res,
- disk_sectors_delta - op->res.sectors,
- !should_check_enospc
- ? BCH_DISK_RESERVATION_NOFAIL : 0);
- if (ret)
- goto out;
- }
-
- next_pos = insert->k.p;
-
- /*
- * Check for nonce offset inconsistency:
- * This is debug code - we've been seeing this bug rarely, and
- * it's been hard to reproduce, so this should give us some more
- * information when it does occur:
- */
- int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert),
- (struct bkey_validate_context) {
- .btree = m->btree_id,
- .flags = BCH_VALIDATE_commit,
- });
- if (unlikely(invalid)) {
- ret = data_update_invalid_bkey(m, old, k, insert);
- goto out;
- }
-
- ret = bch2_trans_log_str(trans, bch2_data_update_type_strs[m->type]) ?:
- bch2_trans_log_bkey(trans, m->btree_id, 0, m->k.k) ?:
- bch2_insert_snapshot_whiteouts(trans, m->btree_id,
- k.k->p, bkey_start_pos(&insert->k)) ?:
- bch2_insert_snapshot_whiteouts(trans, m->btree_id,
- k.k->p, insert->k.p) ?:
- bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?:
- bch2_trans_update(trans, &iter, insert,
- BTREE_UPDATE_internal_snapshot_node);
- if (ret)
- goto err;
-
- if (trace_data_update_enabled())
- trace_data_update2(m, old, k, insert);
-
- if (bch2_bkey_sectors_need_rebalance(c, bkey_i_to_s_c(insert)) * k.k->size >
- bch2_bkey_sectors_need_rebalance(c, k) * insert->k.size)
- trace_io_move_created_rebalance2(m, old, k, insert);
-
- ret = bch2_trans_commit(trans, &op->res,
- NULL,
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc|
- m->data_opts.btree_insert_flags);
- if (ret)
- goto err;
-
- bch2_btree_iter_set_pos(trans, &iter, next_pos);
-
- this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size);
- if (trace_io_move_finish_enabled())
- trace_io_move_finish2(m, &new->k_i, insert);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- ret = 0;
- if (ret)
- break;
-next:
- while (bkey_ge(iter.pos, bch2_keylist_front(&op->insert_keys)->k.p)) {
- bch2_keylist_pop_front(&op->insert_keys);
- if (bch2_keylist_empty(&op->insert_keys))
- goto out;
- }
- continue;
-nowork:
- if (m->stats) {
- BUG_ON(k.k->p.offset <= iter.pos.offset);
- atomic64_inc(&m->stats->keys_raced);
- atomic64_add(k.k->p.offset - iter.pos.offset,
- &m->stats->sectors_raced);
- }
-
- count_event(c, io_move_fail);
-
- bch2_btree_iter_advance(trans, &iter);
- goto next;
- }
-out:
- bch2_trans_iter_exit(trans, &iter);
- BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
- return ret;
-}
-
-int bch2_data_update_index_update(struct bch_write_op *op)
-{
- return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op));
-}
-
-void bch2_data_update_read_done(struct data_update *m)
-{
- m->read_done = true;
-
- /* write bio must own pages: */
- BUG_ON(!m->op.wbio.bio.bi_vcnt);
-
- m->op.crc = m->rbio.pick.crc;
- m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
-
- this_cpu_add(m->op.c->counters[BCH_COUNTER_io_move_write], m->k.k->k.size);
-
- closure_call(&m->op.cl, bch2_write, NULL, NULL);
-}
-
-void bch2_data_update_exit(struct data_update *update)
-{
- struct bch_fs *c = update->op.c;
- struct bkey_s_c k = bkey_i_to_s_c(update->k.k);
-
- bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
- kfree(update->bvecs);
- update->bvecs = NULL;
-
- if (c->opts.nocow_enabled)
- bkey_nocow_unlock(c, k);
- bkey_put_dev_refs(c, k);
- bch2_disk_reservation_put(c, &update->op.res);
- bch2_bkey_buf_exit(&update->k, c);
-}
-
-static noinline_for_stack
-int bch2_update_unwritten_extent(struct btree_trans *trans,
- struct data_update *update)
-{
- struct bch_fs *c = update->op.c;
- struct bkey_i_extent *e;
- struct write_point *wp;
- struct closure cl;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- closure_init_stack(&cl);
- bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys);
-
- while (bpos_lt(update->op.pos, update->k.k->k.p)) {
- unsigned sectors = update->k.k->k.p.offset -
- update->op.pos.offset;
-
- bch2_trans_begin(trans);
-
- bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos,
- BTREE_ITER_slots);
- ret = lockrestart_do(trans, ({
- k = bch2_btree_iter_peek_slot(trans, &iter);
- bkey_err(k);
- }));
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret || !bch2_extents_match(k, bkey_i_to_s_c(update->k.k)))
- break;
-
- e = bkey_extent_init(update->op.insert_keys.top);
- e->k.p = update->op.pos;
-
- ret = bch2_alloc_sectors_start_trans(trans,
- update->op.target,
- false,
- update->op.write_point,
- &update->op.devs_have,
- update->op.nr_replicas,
- update->op.nr_replicas,
- update->op.watermark,
- 0, &cl, &wp);
- if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) {
- bch2_trans_unlock(trans);
- closure_sync(&cl);
- continue;
- }
-
- bch_err_fn_ratelimited(c, ret);
-
- if (ret)
- break;
-
- sectors = min(sectors, wp->sectors_free);
-
- bch2_key_resize(&e->k, sectors);
-
- bch2_open_bucket_get(c, wp, &update->op.open_buckets);
- bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
- bch2_alloc_sectors_done(c, wp);
-
- update->op.pos.offset += sectors;
-
- extent_for_each_ptr(extent_i_to_s(e), ptr)
- ptr->unwritten = true;
- bch2_keylist_push(&update->op.insert_keys);
-
- ret = __bch2_data_update_index_update(trans, &update->op);
-
- bch2_open_buckets_put(c, &update->op.open_buckets);
-
- if (ret)
- break;
- }
-
- if (closure_nr_remaining(&cl) != 1) {
- bch2_trans_unlock(trans);
- closure_sync(&cl);
- }
-
- return ret;
-}
-
-void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 20);
-
- prt_str_indented(out, "rewrite ptrs:\t");
- bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
- prt_newline(out);
-
- prt_str_indented(out, "kill ptrs:\t");
- bch2_prt_u64_base2(out, data_opts->kill_ptrs);
- prt_newline(out);
-
- prt_str_indented(out, "target:\t");
- bch2_target_to_text(out, c, data_opts->target);
- prt_newline(out);
-
- prt_str_indented(out, "compression:\t");
- bch2_compression_opt_to_text(out, io_opts->background_compression);
- prt_newline(out);
-
- prt_str_indented(out, "opts.replicas:\t");
- prt_u64(out, io_opts->data_replicas);
- prt_newline(out);
-
- prt_str_indented(out, "extra replicas:\t");
- prt_u64(out, data_opts->extra_replicas);
- prt_newline(out);
-
- prt_str_indented(out, "scrub:\t");
- prt_u64(out, data_opts->scrub);
-}
-
-void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
-{
- prt_str(out, bch2_data_update_type_strs[m->type]);
- prt_newline(out);
-
- bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
- prt_newline(out);
-
- prt_str_indented(out, "old key:\t");
- bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
-}
-
-void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update *m)
-{
- bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
- prt_newline(out);
- printbuf_indent_add(out, 2);
- bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
-
- if (!m->read_done) {
- prt_printf(out, "read:\n");
- printbuf_indent_add(out, 2);
- bch2_read_bio_to_text(out, &m->rbio);
- } else {
- prt_printf(out, "write:\n");
- printbuf_indent_add(out, 2);
- bch2_write_op_to_text(out, &m->op);
- }
- printbuf_indent_sub(out, 4);
-}
-
-int bch2_extent_drop_ptrs(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- struct bch_fs *c = trans->c;
- struct bkey_i *n;
- int ret;
-
- n = bch2_bkey_make_mut_noupdate(trans, k);
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- return ret;
-
- while (data_opts->kill_ptrs) {
- unsigned i = 0, drop = __fls(data_opts->kill_ptrs);
-
- bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, i++ == drop);
- data_opts->kill_ptrs ^= 1U << drop;
- }
-
- /*
- * If the new extent no longer has any pointers, bch2_extent_normalize()
- * will do the appropriate thing with it (turning it into a
- * KEY_TYPE_error key, or just a discard if it was a cached extent)
- */
- bch2_extent_normalize_by_opts(c, io_opts, bkey_i_to_s(n));
-
- /*
- * Since we're not inserting through an extent iterator
- * (BTREE_ITER_all_snapshots iterators aren't extent iterators),
- * we aren't using the extent overwrite path to delete, we're
- * just using the normal key deletion path:
- */
- if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_is_extents))
- n->k.size = 0;
-
- return bch2_trans_relock(trans) ?:
- bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-}
-
-static int __bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
- struct bch_io_opts *io_opts,
- unsigned buf_bytes)
-{
- unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE);
-
- m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL);
- if (!m->bvecs)
- return -ENOMEM;
-
- bio_init(&m->rbio.bio, NULL, m->bvecs, nr_vecs, REQ_OP_READ);
- bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0);
-
- if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) {
- kfree(m->bvecs);
- m->bvecs = NULL;
- return -ENOMEM;
- }
-
- rbio_init(&m->rbio.bio, c, *io_opts, NULL);
- m->rbio.data_update = true;
- m->rbio.bio.bi_iter.bi_size = buf_bytes;
- m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&m->k.k->k);
- m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
- return 0;
-}
-
-int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
- struct bch_io_opts *io_opts)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- /* write path might have to decompress data: */
- unsigned buf_bytes = 0;
- bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry)
- buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
-
- return __bch2_data_update_bios_init(m, c, io_opts, buf_bytes);
-}
-
-static int can_write_extent(struct bch_fs *c, struct data_update *m)
-{
- if ((m->op.flags & BCH_WRITE_alloc_nowait) &&
- unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark)))
- return bch_err_throw(c, data_update_done_would_block);
-
- unsigned target = m->op.flags & BCH_WRITE_only_specified_devs
- ? m->op.target
- : 0;
- struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);
-
- darray_for_each(m->op.devs_have, i)
- __clear_bit(*i, devs.d);
-
- guard(rcu)();
-
- unsigned nr_replicas = 0, i;
- for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) {
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, i);
- if (!ca)
- continue;
-
- struct bch_dev_usage usage;
- bch2_dev_usage_read_fast(ca, &usage);
-
- if (!dev_buckets_free(ca, usage, m->op.watermark))
- continue;
-
- nr_replicas += ca->mi.durability;
- if (nr_replicas >= m->op.nr_replicas)
- break;
- }
-
- if (!nr_replicas)
- return bch_err_throw(c, data_update_done_no_rw_devs);
- if (nr_replicas < m->op.nr_replicas)
- return bch_err_throw(c, insufficient_devices);
- return 0;
-}
-
-int bch2_data_update_init(struct btree_trans *trans,
- struct btree_iter *iter,
- struct moving_context *ctxt,
- struct data_update *m,
- struct write_point_specifier wp,
- struct bch_io_opts *io_opts,
- struct data_update_opts data_opts,
- enum btree_id btree_id,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- int ret = 0;
-
- if (k.k->p.snapshot) {
- ret = bch2_check_key_has_snapshot(trans, iter, k);
- if (bch2_err_matches(ret, BCH_ERR_recovery_will_run)) {
- /* Can't repair yet, waiting on other recovery passes */
- return bch_err_throw(c, data_update_done_no_snapshot);
- }
- if (ret < 0)
- return ret;
- if (ret) /* key was deleted */
- return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
- bch_err_throw(c, data_update_done_no_snapshot);
- ret = 0;
- }
-
- bch2_bkey_buf_init(&m->k);
- bch2_bkey_buf_reassemble(&m->k, c, k);
- m->type = data_opts.btree_insert_flags & BCH_WATERMARK_copygc
- ? BCH_DATA_UPDATE_copygc
- : BCH_DATA_UPDATE_rebalance;
- m->btree_id = btree_id;
- m->data_opts = data_opts;
- m->ctxt = ctxt;
- m->stats = ctxt ? ctxt->stats : NULL;
-
- bch2_write_op_init(&m->op, c, *io_opts);
- m->op.pos = bkey_start_pos(k.k);
- m->op.version = k.k->bversion;
- m->op.target = data_opts.target;
- m->op.write_point = wp;
- m->op.nr_replicas = 0;
- m->op.flags |= BCH_WRITE_pages_stable|
- BCH_WRITE_pages_owned|
- BCH_WRITE_data_encoded|
- BCH_WRITE_move|
- m->data_opts.write_flags;
- m->op.compression_opt = io_opts->background_compression;
- m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
-
- unsigned durability_have = 0, durability_removing = 0;
-
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned reserve_sectors = k.k->size * data_opts.extra_replicas;
- unsigned buf_bytes = 0;
- bool unwritten = false;
-
- unsigned ptr_bit = 1;
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (!p.ptr.cached) {
- guard(rcu)();
- if (ptr_bit & m->data_opts.rewrite_ptrs) {
- if (crc_is_compressed(p.crc))
- reserve_sectors += k.k->size;
-
- m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
- durability_removing += bch2_extent_ptr_desired_durability(c, &p);
- } else if (!(ptr_bit & m->data_opts.kill_ptrs)) {
- bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
- durability_have += bch2_extent_ptr_durability(c, &p);
- }
- }
-
- /*
- * op->csum_type is normally initialized from the fs/file's
- * current options - but if an extent is encrypted, we require
- * that it stays encrypted:
- */
- if (bch2_csum_type_is_encryption(p.crc.csum_type)) {
- m->op.nonce = p.crc.nonce + p.crc.offset;
- m->op.csum_type = p.crc.csum_type;
- }
-
- if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
- m->op.incompressible = true;
-
- buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
- unwritten |= p.ptr.unwritten;
-
- ptr_bit <<= 1;
- }
-
- unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have));
-
- /*
- * If current extent durability is less than io_opts.data_replicas,
- * we're not trying to rereplicate the extent up to data_replicas here -
- * unless extra_replicas was specified
- *
- * Increasing replication is an explicit operation triggered by
- * rereplicate, currently, so that users don't get an unexpected -ENOSPC
- */
- m->op.nr_replicas = min(durability_removing, durability_required) +
- m->data_opts.extra_replicas;
-
- /*
- * If device(s) were set to durability=0 after data was written to them
-	 * we can end up with a durability=0 extent, and the normal algorithm
- * that tries not to increase durability doesn't work:
- */
- if (!(durability_have + durability_removing))
- m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1);
-
- m->op.nr_replicas_required = m->op.nr_replicas;
-
- /*
- * It might turn out that we don't need any new replicas, if the
- * replicas or durability settings have been changed since the extent
- * was written:
- */
- if (!m->op.nr_replicas) {
- m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
- m->data_opts.rewrite_ptrs = 0;
- /* if iter == NULL, it's just a promote */
- if (iter)
- ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts);
- if (!ret)
- ret = bch_err_throw(c, data_update_done_no_writes_needed);
- goto out_bkey_buf_exit;
- }
-
- /*
- * Check if the allocation will succeed, to avoid getting an error later
- * in bch2_write() -> bch2_alloc_sectors_start() and doing a useless
- * read:
- *
- * This guards against
- * - BCH_WRITE_alloc_nowait allocations failing (promotes)
- * - Destination target full
- * - Device(s) in destination target offline
- * - Insufficient durability available in destination target
- * (i.e. trying to move a durability=2 replica to a target with a
- * single durability=2 device)
- */
- ret = can_write_extent(c, m);
- if (ret)
- goto out_bkey_buf_exit;
-
- if (reserve_sectors) {
- ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
- m->data_opts.extra_replicas
- ? 0
- : BCH_DISK_RESERVATION_NOFAIL);
- if (ret)
- goto out_bkey_buf_exit;
- }
-
- if (!bkey_get_dev_refs(c, k)) {
- ret = bch_err_throw(c, data_update_done_no_dev_refs);
- goto out_put_disk_res;
- }
-
- if (c->opts.nocow_enabled &&
- !bkey_nocow_lock(c, ctxt, ptrs)) {
- ret = bch_err_throw(c, nocow_lock_blocked);
- goto out_put_dev_refs;
- }
-
- if (unwritten) {
- ret = bch2_update_unwritten_extent(trans, m) ?:
- bch_err_throw(c, data_update_done_unwritten);
- goto out_nocow_unlock;
- }
-
- bch2_trans_unlock(trans);
-
- ret = __bch2_data_update_bios_init(m, c, io_opts, buf_bytes);
- if (ret)
- goto out_nocow_unlock;
-
- return 0;
-out_nocow_unlock:
- if (c->opts.nocow_enabled)
- bkey_nocow_unlock(c, k);
-out_put_dev_refs:
- bkey_put_dev_refs(c, k);
-out_put_disk_res:
- bch2_disk_reservation_put(c, &m->op.res);
-out_bkey_buf_exit:
- bch2_bkey_buf_exit(&m->k, c);
- return ret;
-}
-
-void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- unsigned ptr_bit = 1;
-
- bkey_for_each_ptr(ptrs, ptr) {
- if ((opts->rewrite_ptrs & ptr_bit) && ptr->cached) {
- opts->kill_ptrs |= ptr_bit;
- opts->rewrite_ptrs ^= ptr_bit;
- }
-
- ptr_bit <<= 1;
- }
-}
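
A recurring convention in data_update.c above: rewrite_ptrs and kill_ptrs are bitmasks indexed by pointer position within the extent, walked with a ptr_bit that shifts left once per pointer, and the normalize step demotes a cached pointer from "rewrite" to "kill" since rewriting a cached copy is pointless. A toy standalone version of that walk (struct and field names invented for illustration, not the kernel types):

#include <stdbool.h>
#include <stdio.h>

struct ptr_sketch { bool cached; };
struct opts_sketch { unsigned rewrite_ptrs, kill_ptrs; };

static void opts_normalize(const struct ptr_sketch *ptrs, unsigned nr,
			   struct opts_sketch *opts)
{
	unsigned ptr_bit = 1;

	for (unsigned i = 0; i < nr; i++) {
		/* rewriting a cached copy is pointless - drop it instead */
		if ((opts->rewrite_ptrs & ptr_bit) && ptrs[i].cached) {
			opts->kill_ptrs    |= ptr_bit;
			opts->rewrite_ptrs ^= ptr_bit;
		}
		ptr_bit <<= 1;
	}
}

int main(void)
{
	/* three pointers; the middle one is a cached copy */
	struct ptr_sketch ptrs[] = { { false }, { true }, { false } };
	/* ask to rewrite pointers 1 and 2 (bits 0x2 and 0x4) */
	struct opts_sketch opts = { .rewrite_ptrs = 0x6, .kill_ptrs = 0 };

	opts_normalize(ptrs, 3, &opts);

	/* pointer 1 was cached: rewrite becomes 0x4, kill becomes 0x2 */
	printf("rewrite=%#x kill=%#x\n", opts.rewrite_ptrs, opts.kill_ptrs);
	return 0;
}
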
diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h
deleted file mode 100644
index 5e14d13568de..000000000000
--- a/fs/bcachefs/data_update.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _BCACHEFS_DATA_UPDATE_H
-#define _BCACHEFS_DATA_UPDATE_H
-
-#include "bkey_buf.h"
-#include "io_read.h"
-#include "io_write_types.h"
-
-struct moving_context;
-
-struct data_update_opts {
- unsigned rewrite_ptrs;
- unsigned kill_ptrs;
- u16 target;
- u8 extra_replicas;
- unsigned btree_insert_flags;
- unsigned write_flags;
-
- int read_dev;
- bool scrub;
-};
-
-void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *,
- struct bch_io_opts *, struct data_update_opts *);
-
-#define BCH_DATA_UPDATE_TYPES() \
- x(copygc, 0) \
- x(rebalance, 1) \
- x(promote, 2)
-
-enum bch_data_update_types {
-#define x(n, id) BCH_DATA_UPDATE_##n = id,
- BCH_DATA_UPDATE_TYPES()
-#undef x
-};
-
-struct data_update {
- enum bch_data_update_types type;
- /* extent being updated: */
- bool read_done;
- enum btree_id btree_id;
- struct bkey_buf k;
- struct data_update_opts data_opts;
- struct moving_context *ctxt;
- struct bch_move_stats *stats;
-
- struct bch_read_bio rbio;
- struct bch_write_op op;
- struct bio_vec *bvecs;
-};
-
-struct promote_op {
- struct rcu_head rcu;
- u64 start_time;
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
- unsigned list_idx;
-#endif
-
- struct rhash_head hash;
- struct bpos pos;
-
- struct work_struct work;
- struct data_update write;
- struct bio_vec bi_inline_vecs[]; /* must be last */
-};
-
-void bch2_data_update_to_text(struct printbuf *, struct data_update *);
-void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *);
-
-int bch2_data_update_index_update(struct bch_write_op *);
-
-void bch2_data_update_read_done(struct data_update *);
-
-int bch2_extent_drop_ptrs(struct btree_trans *,
- struct btree_iter *,
- struct bkey_s_c,
- struct bch_io_opts *,
- struct data_update_opts *);
-
-int bch2_data_update_bios_init(struct data_update *, struct bch_fs *,
- struct bch_io_opts *);
-
-void bch2_data_update_exit(struct data_update *);
-int bch2_data_update_init(struct btree_trans *, struct btree_iter *,
- struct moving_context *,
- struct data_update *,
- struct write_point_specifier,
- struct bch_io_opts *, struct data_update_opts,
- enum btree_id, struct bkey_s_c);
-void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *);
-
-#endif /* _BCACHEFS_DATA_UPDATE_H */
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
deleted file mode 100644
index 07c2a0f73cc2..000000000000
--- a/fs/bcachefs/debug.c
+++ /dev/null
@@ -1,996 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Assorted bcachefs debug code
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "async_objs.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "buckets.h"
-#include "data_update.h"
-#include "debug.h"
-#include "error.h"
-#include "extents.h"
-#include "fsck.h"
-#include "inode.h"
-#include "journal_reclaim.h"
-#include "super.h"
-
-#include <linux/console.h>
-#include <linux/debugfs.h>
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/seq_file.h>
-
-static struct dentry *bch_debug;
-
-static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
- struct extent_ptr_decoded pick)
-{
- struct btree *v = c->verify_data;
- struct btree_node *n_ondisk = c->verify_ondisk;
- struct btree_node *n_sorted = c->verify_data->data;
- struct bset *sorted, *inmemory = &b->data->keys;
- struct bio *bio;
- bool failed = false;
-
- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
- BCH_DEV_READ_REF_btree_verify_replicas);
- if (!ca)
- return false;
-
- bio = bio_alloc_bioset(ca->disk_sb.bdev,
- buf_pages(n_sorted, btree_buf_bytes(b)),
- REQ_OP_READ|REQ_META,
- GFP_NOFS,
- &c->btree_bio);
- bio->bi_iter.bi_sector = pick.ptr.offset;
- bch2_bio_map(bio, n_sorted, btree_buf_bytes(b));
-
- submit_bio_wait(bio);
-
- bio_put(bio);
- enumerated_ref_put(&ca->io_ref[READ],
- BCH_DEV_READ_REF_btree_verify_replicas);
-
- memcpy(n_ondisk, n_sorted, btree_buf_bytes(b));
-
- v->written = 0;
- if (bch2_btree_node_read_done(c, ca, v, NULL, NULL))
- return false;
-
- n_sorted = c->verify_data->data;
- sorted = &n_sorted->keys;
-
- if (inmemory->u64s != sorted->u64s ||
- memcmp(inmemory->start,
- sorted->start,
- vstruct_end(inmemory) - (void *) inmemory->start)) {
- unsigned offset = 0, sectors;
- struct bset *i;
- unsigned j;
-
- console_lock();
-
- printk(KERN_ERR "*** in memory:\n");
- bch2_dump_bset(c, b, inmemory, 0);
-
- printk(KERN_ERR "*** read back in:\n");
- bch2_dump_bset(c, v, sorted, 0);
-
- while (offset < v->written) {
- if (!offset) {
- i = &n_ondisk->keys;
- sectors = vstruct_blocks(n_ondisk, c->block_bits) <<
- c->block_bits;
- } else {
- struct btree_node_entry *bne =
- (void *) n_ondisk + (offset << 9);
- i = &bne->keys;
-
- sectors = vstruct_blocks(bne, c->block_bits) <<
- c->block_bits;
- }
-
- printk(KERN_ERR "*** on disk block %u:\n", offset);
- bch2_dump_bset(c, b, i, offset);
-
- offset += sectors;
- }
-
- for (j = 0; j < le16_to_cpu(inmemory->u64s); j++)
- if (inmemory->_data[j] != sorted->_data[j])
- break;
-
- console_unlock();
- bch_err(c, "verify failed at key %u", j);
-
- failed = true;
- }
-
- if (v->written != b->written) {
- bch_err(c, "written wrong: expected %u, got %u",
- b->written, v->written);
- failed = true;
- }
-
- return failed;
-}
-
-void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
-{
- struct bkey_ptrs_c ptrs;
- struct extent_ptr_decoded p;
- const union bch_extent_entry *entry;
- struct btree *v;
- struct bset *inmemory = &b->data->keys;
- struct bkey_packed *k;
- bool failed = false;
-
- if (c->opts.nochanges)
- return;
-
- bch2_btree_node_io_lock(b);
- mutex_lock(&c->verify_lock);
-
- if (!c->verify_ondisk) {
- c->verify_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
- if (!c->verify_ondisk)
- goto out;
- }
-
- if (!c->verify_data) {
- c->verify_data = __bch2_btree_node_mem_alloc(c);
- if (!c->verify_data)
- goto out;
- }
-
- BUG_ON(b->nsets != 1);
-
- for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_p_next(k))
- if (k->type == KEY_TYPE_btree_ptr_v2)
- ((struct bch_btree_ptr_v2 *) bkeyp_val(&b->format, k))->mem_ptr = 0;
-
- v = c->verify_data;
- bkey_copy(&v->key, &b->key);
- v->c.level = b->c.level;
- v->c.btree_id = b->c.btree_id;
- bch2_btree_keys_init(v);
-
- ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key));
- bkey_for_each_ptr_decode(&b->key.k, ptrs, p, entry)
- failed |= bch2_btree_verify_replica(c, b, p);
-
- if (failed) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- bch2_fs_fatal_error(c, ": btree node verify failed for: %s\n", buf.buf);
- printbuf_exit(&buf);
- }
-out:
- mutex_unlock(&c->verify_lock);
- bch2_btree_node_io_unlock(b);
-}
-
-void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
- const struct btree *b)
-{
- struct btree_node *n_ondisk = NULL;
- struct extent_ptr_decoded pick;
- struct bch_dev *ca;
- struct bio *bio = NULL;
- unsigned offset = 0;
- int ret;
-
- if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick, -1) <= 0) {
- prt_printf(out, "error getting device to read from: invalid device\n");
- return;
- }
-
- ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
- BCH_DEV_READ_REF_btree_node_ondisk_to_text);
- if (!ca) {
- prt_printf(out, "error getting device to read from: not online\n");
- return;
- }
-
- n_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
- if (!n_ondisk) {
- prt_printf(out, "memory allocation failure\n");
- goto out;
- }
-
- bio = bio_alloc_bioset(ca->disk_sb.bdev,
- buf_pages(n_ondisk, btree_buf_bytes(b)),
- REQ_OP_READ|REQ_META,
- GFP_NOFS,
- &c->btree_bio);
- bio->bi_iter.bi_sector = pick.ptr.offset;
- bch2_bio_map(bio, n_ondisk, btree_buf_bytes(b));
-
- ret = submit_bio_wait(bio);
- if (ret) {
- prt_printf(out, "IO error reading btree node: %s\n", bch2_err_str(ret));
- goto out;
- }
-
- while (offset < btree_sectors(c)) {
- struct bset *i;
- struct nonce nonce;
- struct bch_csum csum;
- struct bkey_packed *k;
- unsigned sectors;
-
- if (!offset) {
- i = &n_ondisk->keys;
-
- if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) {
- prt_printf(out, "unknown checksum type at offset %u: %llu\n",
- offset, BSET_CSUM_TYPE(i));
- goto out;
- }
-
- nonce = btree_nonce(i, offset << 9);
- csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, n_ondisk);
-
- if (bch2_crc_cmp(csum, n_ondisk->csum)) {
- prt_printf(out, "invalid checksum\n");
- goto out;
- }
-
- bset_encrypt(c, i, offset << 9);
-
- sectors = vstruct_sectors(n_ondisk, c->block_bits);
- } else {
- struct btree_node_entry *bne = (void *) n_ondisk + (offset << 9);
-
- i = &bne->keys;
-
- if (i->seq != n_ondisk->keys.seq)
- break;
-
- if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) {
- prt_printf(out, "unknown checksum type at offset %u: %llu\n",
- offset, BSET_CSUM_TYPE(i));
- goto out;
- }
-
- nonce = btree_nonce(i, offset << 9);
- csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
-
- if (bch2_crc_cmp(csum, bne->csum)) {
- prt_printf(out, "invalid checksum");
- goto out;
- }
-
- bset_encrypt(c, i, offset << 9);
-
- sectors = vstruct_sectors(bne, c->block_bits);
- }
-
- prt_printf(out, " offset %u version %u, journal seq %llu\n",
- offset,
- le16_to_cpu(i->version),
- le64_to_cpu(i->journal_seq));
- offset += sectors;
-
- printbuf_indent_add(out, 4);
-
- for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) {
- struct bkey u;
-
- bch2_bkey_val_to_text(out, c, bkey_disassemble(b, k, &u));
- prt_newline(out);
- }
-
- printbuf_indent_sub(out, 4);
- }
-out:
- if (bio)
- bio_put(bio);
- kvfree(n_ondisk);
- enumerated_ref_put(&ca->io_ref[READ],
- BCH_DEV_READ_REF_btree_node_ondisk_to_text);
-}
-
-#ifdef CONFIG_DEBUG_FS
-
-ssize_t bch2_debugfs_flush_buf(struct dump_iter *i)
-{
- if (i->buf.pos) {
- size_t bytes = min_t(size_t, i->buf.pos, i->size);
- int copied = bytes - copy_to_user(i->ubuf, i->buf.buf, bytes);
-
- i->ret += copied;
- i->ubuf += copied;
- i->size -= copied;
- i->buf.pos -= copied;
- memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos);
-
- if (i->buf.last_newline >= copied)
- i->buf.last_newline -= copied;
- if (i->buf.last_field >= copied)
- i->buf.last_field -= copied;
-
- if (copied != bytes)
- return -EFAULT;
- }
-
- return i->size ? 0 : i->ret;
-}
-
-static int bch2_dump_open(struct inode *inode, struct file *file)
-{
- struct btree_debug *bd = inode->i_private;
- struct dump_iter *i;
-
- i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
- if (!i)
- return -ENOMEM;
-
- file->private_data = i;
- i->from = POS_MIN;
- i->iter = 0;
- i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]);
- i->id = bd->id;
- i->buf = PRINTBUF;
-
- return 0;
-}
-
-int bch2_dump_release(struct inode *inode, struct file *file)
-{
- struct dump_iter *i = file->private_data;
-
- printbuf_exit(&i->buf);
- kfree(i);
- return 0;
-}
-
-static ssize_t bch2_read_btree(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- return bch2_debugfs_flush_buf(i) ?:
- bch2_trans_run(i->c,
- for_each_btree_key(trans, iter, i->id, i->from,
- BTREE_ITER_prefetch|
- BTREE_ITER_all_snapshots, k, ({
- bch2_bkey_val_to_text(&i->buf, i->c, k);
- prt_newline(&i->buf);
- bch2_trans_unlock(trans);
- i->from = bpos_successor(iter.pos);
- bch2_debugfs_flush_buf(i);
- }))) ?:
- i->ret;
-}
-
-static const struct file_operations btree_debug_ops = {
- .owner = THIS_MODULE,
- .open = bch2_dump_open,
- .release = bch2_dump_release,
- .read = bch2_read_btree,
-};
-
-static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- ssize_t ret = bch2_debugfs_flush_buf(i);
- if (ret)
- return ret;
-
- if (bpos_eq(SPOS_MAX, i->from))
- return i->ret;
-
- return bch2_trans_run(i->c,
- for_each_btree_node(trans, iter, i->id, i->from, 0, b, ({
- bch2_btree_node_to_text(&i->buf, i->c, b);
- i->from = !bpos_eq(SPOS_MAX, b->key.k.p)
- ? bpos_successor(b->key.k.p)
- : b->key.k.p;
-
- drop_locks_do(trans, bch2_debugfs_flush_buf(i));
- }))) ?: i->ret;
-}
-
-static const struct file_operations btree_format_debug_ops = {
- .owner = THIS_MODULE,
- .open = bch2_dump_open,
- .release = bch2_dump_release,
- .read = bch2_read_btree_formats,
-};
-
-static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- return bch2_debugfs_flush_buf(i) ?:
- bch2_trans_run(i->c,
- for_each_btree_key(trans, iter, i->id, i->from,
- BTREE_ITER_prefetch|
- BTREE_ITER_all_snapshots, k, ({
- struct btree_path_level *l =
- &btree_iter_path(trans, &iter)->l[0];
- struct bkey_packed *_k =
- bch2_btree_node_iter_peek(&l->iter, l->b);
-
- if (bpos_gt(l->b->key.k.p, i->prev_node)) {
- bch2_btree_node_to_text(&i->buf, i->c, l->b);
- i->prev_node = l->b->key.k.p;
- }
-
- bch2_bfloat_to_text(&i->buf, l->b, _k);
- bch2_trans_unlock(trans);
- i->from = bpos_successor(iter.pos);
- bch2_debugfs_flush_buf(i);
- }))) ?:
- i->ret;
-}
-
-static const struct file_operations bfloat_failed_debug_ops = {
- .owner = THIS_MODULE,
- .open = bch2_dump_open,
- .release = bch2_dump_release,
- .read = bch2_read_bfloat_failed,
-};
-
-static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
- struct btree *b)
-{
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 32);
-
- prt_printf(out, "%px ", b);
- bch2_btree_id_level_to_text(out, b->c.btree_id, b->c.level);
- prt_printf(out, "\n");
-
- printbuf_indent_add(out, 2);
-
- bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
- prt_newline(out);
-
- prt_printf(out, "flags:\t");
- prt_bitflags(out, bch2_btree_node_flags, b->flags);
- prt_newline(out);
-
- prt_printf(out, "pcpu read locks:\t%u\n", b->c.lock.readers != NULL);
- prt_printf(out, "written:\t%u\n", b->written);
- prt_printf(out, "writes blocked:\t%u\n", !list_empty_careful(&b->write_blocked));
- prt_printf(out, "will make reachable:\t%lx\n", b->will_make_reachable);
-
- prt_printf(out, "journal pin %px:\t%llu\n",
- &b->writes[0].journal, b->writes[0].journal.seq);
- prt_printf(out, "journal pin %px:\t%llu\n",
- &b->writes[1].journal, b->writes[1].journal.seq);
-
- prt_printf(out, "ob:\t%u\n", b->ob.nr);
-
- printbuf_indent_sub(out, 2);
-}
-
-static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
- struct bch_fs *c = i->c;
- bool done = false;
- ssize_t ret = 0;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- do {
- ret = bch2_debugfs_flush_buf(i);
- if (ret)
- return ret;
-
- i->buf.atomic++;
- scoped_guard(rcu) {
- struct bucket_table *tbl =
- rht_dereference_rcu(c->btree_cache.table.tbl,
- &c->btree_cache.table);
- if (i->iter < tbl->size) {
- struct rhash_head *pos;
- struct btree *b;
-
- rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash)
- bch2_cached_btree_node_to_text(&i->buf, c, b);
- i->iter++;
- } else {
- done = true;
- }
- }
- --i->buf.atomic;
- } while (!done);
-
- if (i->buf.allocation_failure)
- ret = -ENOMEM;
-
- if (!ret)
- ret = bch2_debugfs_flush_buf(i);
-
- return ret ?: i->ret;
-}
-
-static const struct file_operations cached_btree_nodes_ops = {
- .owner = THIS_MODULE,
- .open = bch2_dump_open,
- .release = bch2_dump_release,
- .read = bch2_cached_btree_nodes_read,
-};
-
-typedef int (*list_cmp_fn)(const struct list_head *l, const struct list_head *r);
-
-static void list_sort(struct list_head *head, list_cmp_fn cmp)
-{
- struct list_head *pos;
-
- list_for_each(pos, head)
- while (!list_is_last(pos, head) &&
- cmp(pos, pos->next) > 0) {
- struct list_head *pos2, *next = pos->next;
-
- list_del(next);
- list_for_each(pos2, head)
- if (cmp(next, pos2) < 0)
- goto pos_found;
- BUG();
-pos_found:
- list_add_tail(next, pos2);
- }
-}
-
-static int list_ptr_order_cmp(const struct list_head *l, const struct list_head *r)
-{
- return cmp_int(l, r);
-}
-
-static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
- struct bch_fs *c = i->c;
- struct btree_trans *trans;
- ssize_t ret = 0;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
-restart:
- seqmutex_lock(&c->btree_trans_lock);
- list_sort(&c->btree_trans_list, list_ptr_order_cmp);
-
- list_for_each_entry(trans, &c->btree_trans_list, list) {
- if ((ulong) trans <= i->iter)
- continue;
-
- i->iter = (ulong) trans;
-
- if (!closure_get_not_zero(&trans->ref))
- continue;
-
- if (!trans->srcu_held) {
- closure_put(&trans->ref);
- continue;
- }
-
- u32 seq = seqmutex_unlock(&c->btree_trans_lock);
-
- bch2_btree_trans_to_text(&i->buf, trans);
-
- prt_printf(&i->buf, "backtrace:\n");
- printbuf_indent_add(&i->buf, 2);
- bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task, 0, GFP_KERNEL);
- printbuf_indent_sub(&i->buf, 2);
- prt_newline(&i->buf);
-
- closure_put(&trans->ref);
-
- ret = bch2_debugfs_flush_buf(i);
- if (ret)
- goto unlocked;
-
- if (!seqmutex_relock(&c->btree_trans_lock, seq))
- goto restart;
- }
- seqmutex_unlock(&c->btree_trans_lock);
-unlocked:
- srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
-
- if (i->buf.allocation_failure)
- ret = -ENOMEM;
-
- if (!ret)
- ret = bch2_debugfs_flush_buf(i);
-
- return ret ?: i->ret;
-}
-
-static const struct file_operations btree_transactions_ops = {
- .owner = THIS_MODULE,
- .open = bch2_dump_open,
- .release = bch2_dump_release,
- .read = bch2_btree_transactions_read,
-};
-
-static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
- struct bch_fs *c = i->c;
- bool done = false;
- int err;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- while (1) {
- err = bch2_debugfs_flush_buf(i);
- if (err)
- return err;
-
- if (!i->size)
- break;
-
- if (done)
- break;
-
- done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter);
- i->iter++;
- }
-
- if (i->buf.allocation_failure)
- return -ENOMEM;
-
- return i->ret;
-}
-
-static const struct file_operations journal_pins_ops = {
- .owner = THIS_MODULE,
- .open = bch2_dump_open,
- .release = bch2_dump_release,
- .read = bch2_journal_pins_read,
-};
-
-static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
- struct bch_fs *c = i->c;
- int err;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- if (!i->iter) {
- bch2_btree_updates_to_text(&i->buf, c);
- i->iter++;
- }
-
- err = bch2_debugfs_flush_buf(i);
- if (err)
- return err;
-
- if (i->buf.allocation_failure)
- return -ENOMEM;
-
- return i->ret;
-}
-
-static const struct file_operations btree_updates_ops = {
- .owner = THIS_MODULE,
- .open = bch2_dump_open,
- .release = bch2_dump_release,
- .read = bch2_btree_updates_read,
-};
-
-static int btree_transaction_stats_open(struct inode *inode, struct file *file)
-{
- struct bch_fs *c = inode->i_private;
- struct dump_iter *i;
-
- i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
- if (!i)
- return -ENOMEM;
-
- i->iter = 1;
- i->c = c;
- i->buf = PRINTBUF;
- file->private_data = i;
-
- return 0;
-}
-
-static int btree_transaction_stats_release(struct inode *inode, struct file *file)
-{
- struct dump_iter *i = file->private_data;
-
- printbuf_exit(&i->buf);
- kfree(i);
-
- return 0;
-}
-
-static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
- struct bch_fs *c = i->c;
- int err;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- while (1) {
- struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter];
-
- err = bch2_debugfs_flush_buf(i);
- if (err)
- return err;
-
- if (!i->size)
- break;
-
- if (i->iter == ARRAY_SIZE(bch2_btree_transaction_fns) ||
- !bch2_btree_transaction_fns[i->iter])
- break;
-
- prt_printf(&i->buf, "%s:\n", bch2_btree_transaction_fns[i->iter]);
- printbuf_indent_add(&i->buf, 2);
-
- mutex_lock(&s->lock);
-
- prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem);
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
- printbuf_indent_add(&i->buf, 2);
- bch2_trans_kmalloc_trace_to_text(&i->buf, &s->trans_kmalloc_trace);
- printbuf_indent_sub(&i->buf, 2);
-#endif
-
- prt_printf(&i->buf, "Transaction duration:\n");
-
- printbuf_indent_add(&i->buf, 2);
- bch2_time_stats_to_text(&i->buf, &s->duration);
- printbuf_indent_sub(&i->buf, 2);
-
- if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) {
- prt_printf(&i->buf, "Lock hold times:\n");
-
- printbuf_indent_add(&i->buf, 2);
- bch2_time_stats_to_text(&i->buf, &s->lock_hold_times);
- printbuf_indent_sub(&i->buf, 2);
- }
-
- if (s->max_paths_text) {
- prt_printf(&i->buf, "Maximum allocated btree paths (%u):\n", s->nr_max_paths);
-
- printbuf_indent_add(&i->buf, 2);
- prt_str_indented(&i->buf, s->max_paths_text);
- printbuf_indent_sub(&i->buf, 2);
- }
-
- mutex_unlock(&s->lock);
-
- printbuf_indent_sub(&i->buf, 2);
- prt_newline(&i->buf);
- i->iter++;
- }
-
- if (i->buf.allocation_failure)
- return -ENOMEM;
-
- return i->ret;
-}
-
-static const struct file_operations btree_transaction_stats_op = {
- .owner = THIS_MODULE,
- .open = btree_transaction_stats_open,
- .release = btree_transaction_stats_release,
- .read = btree_transaction_stats_read,
-};
-
-/* walk btree transactions until we find a deadlock and print it */
-static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c)
-{
- struct btree_trans *trans;
- ulong iter = 0;
-restart:
- seqmutex_lock(&c->btree_trans_lock);
- list_sort(&c->btree_trans_list, list_ptr_order_cmp);
-
- list_for_each_entry(trans, &c->btree_trans_list, list) {
- if ((ulong) trans <= iter)
- continue;
-
- iter = (ulong) trans;
-
- if (!closure_get_not_zero(&trans->ref))
- continue;
-
- u32 seq = seqmutex_unlock(&c->btree_trans_lock);
-
- bool found = bch2_check_for_deadlock(trans, out) != 0;
-
- closure_put(&trans->ref);
-
- if (found)
- return;
-
- if (!seqmutex_relock(&c->btree_trans_lock, seq))
- goto restart;
- }
- seqmutex_unlock(&c->btree_trans_lock);
-}
-
-typedef void (*fs_to_text_fn)(struct printbuf *, struct bch_fs *);
-
-static ssize_t bch2_simple_print(struct file *file, char __user *buf,
- size_t size, loff_t *ppos,
- fs_to_text_fn fn)
-{
- struct dump_iter *i = file->private_data;
- struct bch_fs *c = i->c;
- ssize_t ret = 0;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- if (!i->iter) {
- fn(&i->buf, c);
- i->iter++;
- }
-
- if (i->buf.allocation_failure)
- ret = -ENOMEM;
-
- if (!ret)
- ret = bch2_debugfs_flush_buf(i);
-
- return ret ?: i->ret;
-}
-
-static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- return bch2_simple_print(file, buf, size, ppos, btree_deadlock_to_text);
-}
-
-static const struct file_operations btree_deadlock_ops = {
- .owner = THIS_MODULE,
- .open = bch2_dump_open,
- .release = bch2_dump_release,
- .read = bch2_btree_deadlock_read,
-};
-
-static ssize_t bch2_write_points_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- return bch2_simple_print(file, buf, size, ppos, bch2_write_points_to_text);
-}
-
-static const struct file_operations write_points_ops = {
- .owner = THIS_MODULE,
- .open = bch2_dump_open,
- .release = bch2_dump_release,
- .read = bch2_write_points_read,
-};
-
-void bch2_fs_debug_exit(struct bch_fs *c)
-{
- if (!IS_ERR_OR_NULL(c->fs_debug_dir))
- debugfs_remove_recursive(c->fs_debug_dir);
-}
-
-static void bch2_fs_debug_btree_init(struct bch_fs *c, struct btree_debug *bd)
-{
- struct dentry *d;
-
- d = debugfs_create_dir(bch2_btree_id_str(bd->id), c->btree_debug_dir);
-
- debugfs_create_file("keys", 0400, d, bd, &btree_debug_ops);
-
- debugfs_create_file("formats", 0400, d, bd, &btree_format_debug_ops);
-
- debugfs_create_file("bfloat-failed", 0400, d, bd,
- &bfloat_failed_debug_ops);
-}
-
-void bch2_fs_debug_init(struct bch_fs *c)
-{
- struct btree_debug *bd;
- char name[100];
-
- if (IS_ERR_OR_NULL(bch_debug))
- return;
-
- if (c->sb.multi_device)
- snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
- else
- strscpy(name, c->name, sizeof(name));
-
- c->fs_debug_dir = debugfs_create_dir(name, bch_debug);
- if (IS_ERR_OR_NULL(c->fs_debug_dir))
- return;
-
- debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir,
- c->btree_debug, &cached_btree_nodes_ops);
-
- debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir,
- c->btree_debug, &btree_transactions_ops);
-
- debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
- c->btree_debug, &journal_pins_ops);
-
- debugfs_create_file("btree_updates", 0400, c->fs_debug_dir,
- c->btree_debug, &btree_updates_ops);
-
- debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir,
- c, &btree_transaction_stats_op);
-
- debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir,
- c->btree_debug, &btree_deadlock_ops);
-
- debugfs_create_file("write_points", 0400, c->fs_debug_dir,
- c->btree_debug, &write_points_ops);
-
- bch2_fs_async_obj_debugfs_init(c);
-
- c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir);
- if (IS_ERR_OR_NULL(c->btree_debug_dir))
- return;
-
- for (bd = c->btree_debug;
- bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
- bd++) {
- bd->id = bd - c->btree_debug;
- bch2_fs_debug_btree_init(c, bd);
- }
-}
-
-#endif
-
-void bch2_debug_exit(void)
-{
- if (!IS_ERR_OR_NULL(bch_debug))
- debugfs_remove_recursive(bch_debug);
-}
-
-int __init bch2_debug_init(void)
-{
- bch_debug = debugfs_create_dir("bcachefs", NULL);
- return 0;
-}
diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h
deleted file mode 100644
index d88b1194b8ac..000000000000
--- a/fs/bcachefs/debug.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DEBUG_H
-#define _BCACHEFS_DEBUG_H
-
-#include "bcachefs.h"
-
-struct bio;
-struct btree;
-struct bch_fs;
-
-void __bch2_btree_verify(struct bch_fs *, struct btree *);
-void bch2_btree_node_ondisk_to_text(struct printbuf *, struct bch_fs *,
- const struct btree *);
-
-static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b)
-{
- if (static_branch_unlikely(&bch2_verify_btree_ondisk))
- __bch2_btree_verify(c, b);
-}
-
-#ifdef CONFIG_DEBUG_FS
-struct dump_iter {
- struct bch_fs *c;
- struct async_obj_list *list;
- enum btree_id id;
- struct bpos from;
- struct bpos prev_node;
- u64 iter;
-
- struct printbuf buf;
-
- char __user *ubuf; /* destination user buffer */
- size_t size; /* size of requested read */
- ssize_t ret; /* bytes read so far */
-};
-
-ssize_t bch2_debugfs_flush_buf(struct dump_iter *);
-int bch2_dump_release(struct inode *, struct file *);
-
-void bch2_fs_debug_exit(struct bch_fs *);
-void bch2_fs_debug_init(struct bch_fs *);
-#else
-static inline void bch2_fs_debug_exit(struct bch_fs *c) {}
-static inline void bch2_fs_debug_init(struct bch_fs *c) {}
-#endif
-
-void bch2_debug_exit(void);
-int bch2_debug_init(void);
-
-#endif /* _BCACHEFS_DEBUG_H */
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
deleted file mode 100644
index 28875c5c86ad..000000000000
--- a/fs/bcachefs/dirent.c
+++ /dev/null
@@ -1,766 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "bkey_methods.h"
-#include "btree_update.h"
-#include "extents.h"
-#include "dirent.h"
-#include "fs.h"
-#include "keylist.h"
-#include "str_hash.h"
-#include "subvolume.h"
-
-#include <linux/dcache.h>
-
-#ifdef CONFIG_UNICODE
-int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info,
- const struct qstr *str, struct qstr *out_cf)
-{
- *out_cf = (struct qstr) QSTR_INIT(NULL, 0);
-
- if (!bch2_fs_casefold_enabled(trans->c))
- return -EOPNOTSUPP;
-
- unsigned char *buf = bch2_trans_kmalloc(trans, BCH_NAME_MAX + 1);
- int ret = PTR_ERR_OR_ZERO(buf);
- if (ret)
- return ret;
-
- ret = utf8_casefold(info->cf_encoding, str, buf, BCH_NAME_MAX + 1);
- if (ret <= 0)
- return ret;
-
- *out_cf = (struct qstr) QSTR_INIT(buf, ret);
- return 0;
-}
-#endif
-
-static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
-{
- if (bkey_val_bytes(d.k) < offsetof(struct bch_dirent, d_name))
- return 0;
-
- unsigned bkey_u64s = bkey_val_u64s(d.k);
- unsigned bkey_bytes = bkey_u64s * sizeof(u64);
- u64 last_u64 = ((u64*)d.v)[bkey_u64s - 1];
-#if CPU_BIG_ENDIAN
- unsigned trailing_nuls = last_u64 ? __builtin_ctzll(last_u64) / 8 : 64 / 8;
-#else
- unsigned trailing_nuls = last_u64 ? __builtin_clzll(last_u64) / 8 : 64 / 8;
-#endif
-
- return bkey_bytes -
- (d.v->d_casefold
- ? offsetof(struct bch_dirent, d_cf_name_block.d_names)
- : offsetof(struct bch_dirent, d_name)) -
- trailing_nuls;
-}
-
-struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d)
-{
- if (d.v->d_casefold) {
- unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len);
- return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[0], name_len);
- } else {
- return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
- }
-}
-
-static struct qstr bch2_dirent_get_casefold_name(struct bkey_s_c_dirent d)
-{
- if (d.v->d_casefold) {
- unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len);
- unsigned cf_name_len = le16_to_cpu(d.v->d_cf_name_block.d_cf_name_len);
- return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[name_len], cf_name_len);
- } else {
- return (struct qstr) QSTR_INIT(NULL, 0);
- }
-}
-
-static inline struct qstr bch2_dirent_get_lookup_name(struct bkey_s_c_dirent d)
-{
- return d.v->d_casefold
- ? bch2_dirent_get_casefold_name(d)
- : bch2_dirent_get_name(d);
-}
-
-static u64 bch2_dirent_hash(const struct bch_hash_info *info,
- const struct qstr *name)
-{
- struct bch_str_hash_ctx ctx;
-
- bch2_str_hash_init(&ctx, info);
- bch2_str_hash_update(&ctx, info, name->name, name->len);
-
- /* [0,2) reserved for dots */
- return max_t(u64, bch2_str_hash_end(&ctx, info), 2);
-}
-
-static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key)
-{
- return bch2_dirent_hash(info, key);
-}
-
-static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
-{
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
- struct qstr name = bch2_dirent_get_lookup_name(d);
-
- return bch2_dirent_hash(info, &name);
-}
-
-static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
-{
- struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
- const struct qstr l_name = bch2_dirent_get_lookup_name(l);
- const struct qstr *r_name = _r;
-
- return !qstr_eq(l_name, *r_name);
-}
-
-static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
-{
- struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
- struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r);
- const struct qstr l_name = bch2_dirent_get_lookup_name(l);
- const struct qstr r_name = bch2_dirent_get_lookup_name(r);
-
- return !qstr_eq(l_name, r_name);
-}
-
-static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k)
-{
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-
- if (d.v->d_type == DT_SUBVOL)
- return le32_to_cpu(d.v->d_parent_subvol) == inum.subvol;
- return true;
-}
-
-const struct bch_hash_desc bch2_dirent_hash_desc = {
- .btree_id = BTREE_ID_dirents,
- .key_type = KEY_TYPE_dirent,
- .hash_key = dirent_hash_key,
- .hash_bkey = dirent_hash_bkey,
- .cmp_key = dirent_cmp_key,
- .cmp_bkey = dirent_cmp_bkey,
- .is_visible = dirent_is_visible,
-};
-
-int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
- unsigned name_block_len = bch2_dirent_name_bytes(d);
- struct qstr d_name = bch2_dirent_get_name(d);
- struct qstr d_cf_name = bch2_dirent_get_casefold_name(d);
- int ret = 0;
-
- bkey_fsck_err_on(!d_name.len,
- c, dirent_empty_name,
- "empty name");
-
- bkey_fsck_err_on(d_name.len + d_cf_name.len > name_block_len,
- c, dirent_val_too_big,
- "dirent names exceed bkey size (%d + %d > %d)",
- d_name.len, d_cf_name.len, name_block_len);
-
- /*
- * Check new keys don't exceed the max length
- * (older keys may be larger.)
- */
- bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX,
- c, dirent_name_too_long,
- "dirent name too big (%u > %u)",
- d_name.len, BCH_NAME_MAX);
-
- bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len),
- c, dirent_name_embedded_nul,
- "dirent has stray data after name's NUL");
-
- bkey_fsck_err_on((d_name.len == 1 && !memcmp(d_name.name, ".", 1)) ||
- (d_name.len == 2 && !memcmp(d_name.name, "..", 2)),
- c, dirent_name_dot_or_dotdot,
- "invalid name");
-
- bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len),
- c, dirent_name_has_slash,
- "name with /");
-
- bkey_fsck_err_on(d.v->d_type != DT_SUBVOL &&
- le64_to_cpu(d.v->d_inum) == d.k->p.inode,
- c, dirent_to_itself,
- "dirent points to own directory");
-
- if (d.v->d_casefold) {
- bkey_fsck_err_on(from.from == BKEY_VALIDATE_commit &&
- d_cf_name.len > BCH_NAME_MAX,
- c, dirent_cf_name_too_big,
- "dirent w/ cf name too big (%u > %u)",
- d_cf_name.len, BCH_NAME_MAX);
-
- bkey_fsck_err_on(d_cf_name.len != strnlen(d_cf_name.name, d_cf_name.len),
- c, dirent_stray_data_after_cf_name,
- "dirent has stray data after cf name's NUL");
- }
-fsck_err:
- return ret;
-}
-
-void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
- struct qstr d_name = bch2_dirent_get_name(d);
-
- prt_printf(out, "%.*s", d_name.len, d_name.name);
-
- if (d.v->d_casefold) {
- struct qstr d_name = bch2_dirent_get_lookup_name(d);
- prt_printf(out, " (casefold %.*s)", d_name.len, d_name.name);
- }
-
- prt_str(out, " ->");
-
- if (d.v->d_type != DT_SUBVOL)
- prt_printf(out, " %llu", le64_to_cpu(d.v->d_inum));
- else
- prt_printf(out, " %u -> %u",
- le32_to_cpu(d.v->d_parent_subvol),
- le32_to_cpu(d.v->d_child_subvol));
-
- prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type));
-}
-
-int bch2_dirent_init_name(struct bch_fs *c,
- struct bkey_i_dirent *dirent,
- const struct bch_hash_info *hash_info,
- const struct qstr *name,
- const struct qstr *cf_name)
-{
- EBUG_ON(hash_info->cf_encoding == NULL && cf_name);
- int cf_len = 0;
-
- if (name->len > BCH_NAME_MAX)
- return -ENAMETOOLONG;
-
- dirent->v.d_casefold = hash_info->cf_encoding != NULL;
-
- if (!dirent->v.d_casefold) {
- memcpy(&dirent->v.d_name[0], name->name, name->len);
- memset(&dirent->v.d_name[name->len], 0,
- bkey_val_bytes(&dirent->k) -
- offsetof(struct bch_dirent, d_name) -
- name->len);
- } else {
- if (!bch2_fs_casefold_enabled(c))
- return -EOPNOTSUPP;
-
-#ifdef CONFIG_UNICODE
- memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len);
-
- char *cf_out = &dirent->v.d_cf_name_block.d_names[name->len];
-
- if (cf_name) {
- cf_len = cf_name->len;
-
- memcpy(cf_out, cf_name->name, cf_name->len);
- } else {
- cf_len = utf8_casefold(hash_info->cf_encoding, name,
- cf_out,
- bkey_val_end(bkey_i_to_s(&dirent->k_i)) - (void *) cf_out);
- if (cf_len <= 0)
- return cf_len;
- }
-
- memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_len], 0,
- bkey_val_bytes(&dirent->k) -
- offsetof(struct bch_dirent, d_cf_name_block.d_names) -
- name->len - cf_len);
-
- dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len);
- dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_len);
-
- EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_len);
-#endif
- }
-
- unsigned u64s = dirent_val_u64s(name->len, cf_len);
- BUG_ON(u64s > bkey_val_u64s(&dirent->k));
- set_bkey_val_u64s(&dirent->k, u64s);
- return 0;
-}
-
-struct bkey_i_dirent *bch2_dirent_create_key(struct btree_trans *trans,
- const struct bch_hash_info *hash_info,
- subvol_inum dir,
- u8 type,
- const struct qstr *name,
- const struct qstr *cf_name,
- u64 dst)
-{
- struct bkey_i_dirent *dirent = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64));
- if (IS_ERR(dirent))
- return dirent;
-
- bkey_dirent_init(&dirent->k_i);
- dirent->k.u64s = BKEY_U64s_MAX;
-
- if (type != DT_SUBVOL) {
- dirent->v.d_inum = cpu_to_le64(dst);
- } else {
- dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol);
- dirent->v.d_child_subvol = cpu_to_le32(dst);
- }
-
- dirent->v.d_type = type;
- dirent->v.d_unused = 0;
-
- int ret = bch2_dirent_init_name(trans->c, dirent, hash_info, name, cf_name);
- if (ret)
- return ERR_PTR(ret);
-
- EBUG_ON(bch2_dirent_get_name(dirent_i_to_s_c(dirent)).len != name->len);
- return dirent;
-}
-
-int bch2_dirent_create_snapshot(struct btree_trans *trans,
- u32 dir_subvol, u64 dir, u32 snapshot,
- const struct bch_hash_info *hash_info,
- u8 type, const struct qstr *name, u64 dst_inum,
- u64 *dir_offset,
- enum btree_iter_update_trigger_flags flags)
-{
- subvol_inum dir_inum = { .subvol = dir_subvol, .inum = dir };
- struct bkey_i_dirent *dirent;
- int ret;
-
- dirent = bch2_dirent_create_key(trans, hash_info, dir_inum, type, name, NULL, dst_inum);
- ret = PTR_ERR_OR_ZERO(dirent);
- if (ret)
- return ret;
-
- dirent->k.p.inode = dir;
- dirent->k.p.snapshot = snapshot;
-
- ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
- dir_inum, snapshot, &dirent->k_i, flags);
- *dir_offset = dirent->k.p.offset;
-
- return ret;
-}
-
-int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
- const struct bch_hash_info *hash_info,
- u8 type, const struct qstr *name, u64 dst_inum,
- u64 *dir_offset,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bkey_i_dirent *dirent;
- int ret;
-
- dirent = bch2_dirent_create_key(trans, hash_info, dir, type, name, NULL, dst_inum);
- ret = PTR_ERR_OR_ZERO(dirent);
- if (ret)
- return ret;
-
- ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
- dir, &dirent->k_i, flags);
- *dir_offset = dirent->k.p.offset;
-
- return ret;
-}
-
-int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
- struct bkey_s_c_dirent d, subvol_inum *target)
-{
- struct bch_subvolume s;
- int ret = 0;
-
- if (d.v->d_type == DT_SUBVOL &&
- le32_to_cpu(d.v->d_parent_subvol) != dir.subvol)
- return 1;
-
- if (likely(d.v->d_type != DT_SUBVOL)) {
- target->subvol = dir.subvol;
- target->inum = le64_to_cpu(d.v->d_inum);
- } else {
- target->subvol = le32_to_cpu(d.v->d_child_subvol);
-
- ret = bch2_subvolume_get(trans, target->subvol, true, &s);
-
- target->inum = le64_to_cpu(s.inode);
- }
-
- return ret;
-}
-
-int bch2_dirent_rename(struct btree_trans *trans,
- subvol_inum src_dir, struct bch_hash_info *src_hash,
- subvol_inum dst_dir, struct bch_hash_info *dst_hash,
- const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset,
- const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset,
- enum bch_rename_mode mode)
-{
- struct qstr src_name_lookup, dst_name_lookup;
- struct btree_iter src_iter = {};
- struct btree_iter dst_iter = {};
- struct bkey_s_c old_src, old_dst = bkey_s_c_null;
- struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
- struct bpos dst_pos =
- POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name));
- unsigned src_update_flags = 0;
- bool delete_src, delete_dst;
- int ret = 0;
-
- memset(src_inum, 0, sizeof(*src_inum));
- memset(dst_inum, 0, sizeof(*dst_inum));
-
- /* Lookup src: */
- ret = bch2_maybe_casefold(trans, src_hash, src_name, &src_name_lookup);
- if (ret)
- goto out;
- old_src = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc,
- src_hash, src_dir, &src_name_lookup,
- BTREE_ITER_intent);
- ret = bkey_err(old_src);
- if (ret)
- goto out;
-
- ret = bch2_dirent_read_target(trans, src_dir,
- bkey_s_c_to_dirent(old_src), src_inum);
- if (ret)
- goto out;
-
- /* Lookup dst: */
- ret = bch2_maybe_casefold(trans, dst_hash, dst_name, &dst_name_lookup);
- if (ret)
- goto out;
- if (mode == BCH_RENAME) {
- /*
- * Note that we're _not_ checking if the target already exists -
- * we're relying on the VFS to do that check for us for
- * correctness:
- */
- ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc,
- dst_hash, dst_dir, &dst_name_lookup);
- if (ret)
- goto out;
- } else {
- old_dst = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc,
- dst_hash, dst_dir, &dst_name_lookup,
- BTREE_ITER_intent);
- ret = bkey_err(old_dst);
- if (ret)
- goto out;
-
- ret = bch2_dirent_read_target(trans, dst_dir,
- bkey_s_c_to_dirent(old_dst), dst_inum);
- if (ret)
- goto out;
- }
-
- if (mode != BCH_RENAME_EXCHANGE)
- *src_offset = dst_iter.pos.offset;
-
- /* Create new dst key: */
- new_dst = bch2_dirent_create_key(trans, dst_hash, dst_dir, 0, dst_name,
- dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0);
- ret = PTR_ERR_OR_ZERO(new_dst);
- if (ret)
- goto out;
-
- dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
- new_dst->k.p = dst_iter.pos;
-
- /* Create new src key: */
- if (mode == BCH_RENAME_EXCHANGE) {
- new_src = bch2_dirent_create_key(trans, src_hash, src_dir, 0, src_name,
- src_hash->cf_encoding ? &src_name_lookup : NULL, 0);
- ret = PTR_ERR_OR_ZERO(new_src);
- if (ret)
- goto out;
-
- dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
- new_src->k.p = src_iter.pos;
- } else {
- new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
- ret = PTR_ERR_OR_ZERO(new_src);
- if (ret)
- goto out;
-
- bkey_init(&new_src->k);
- new_src->k.p = src_iter.pos;
-
- if (bkey_le(dst_pos, src_iter.pos) &&
- bkey_lt(src_iter.pos, dst_iter.pos)) {
- /*
- * We have a hash collision for the new dst key,
- * and new_src - the key we're deleting - is between
- * new_dst's hashed slot and the slot we're going to be
- * inserting it into - oops. This will break the hash
- * table if we don't deal with it:
- */
- if (mode == BCH_RENAME) {
- /*
- * If we're not overwriting, we can just insert
- * new_dst at the src position:
- */
- new_src = new_dst;
- new_src->k.p = src_iter.pos;
- goto out_set_src;
- } else {
- /* If we're overwriting, we can't insert new_dst
- * at a different slot because it has to
- * overwrite old_dst - just make sure to use a
- * whiteout when deleting src:
- */
- new_src->k.type = KEY_TYPE_hash_whiteout;
- }
- } else {
- /* Check if we need a whiteout to delete src: */
- ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc,
- src_hash, &src_iter);
- if (ret < 0)
- goto out;
-
- if (ret)
- new_src->k.type = KEY_TYPE_hash_whiteout;
- }
- }
-
- if (new_dst->v.d_type == DT_SUBVOL)
- new_dst->v.d_parent_subvol = cpu_to_le32(dst_dir.subvol);
-
- if ((mode == BCH_RENAME_EXCHANGE) &&
- new_src->v.d_type == DT_SUBVOL)
- new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol);
-
- ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
- if (ret)
- goto out;
-out_set_src:
- /*
- * If we're deleting a subvolume we need to really delete the dirent,
- * not just emit a whiteout in the current snapshot - there can only be
- * single dirent that points to a given subvolume.
- *
- * IOW, we don't maintain multiple versions in different snapshots of
- * dirents that point to subvolumes - dirents that point to subvolumes
- * are only visible in one particular subvolume so it's not necessary,
- * and it would be particularly confusing for fsck to have to deal with.
- */
- delete_src = bkey_s_c_to_dirent(old_src).v->d_type == DT_SUBVOL &&
- new_src->k.p.snapshot != old_src.k->p.snapshot;
-
- delete_dst = old_dst.k &&
- bkey_s_c_to_dirent(old_dst).v->d_type == DT_SUBVOL &&
- new_dst->k.p.snapshot != old_dst.k->p.snapshot;
-
- if (!delete_src || !bkey_deleted(&new_src->k)) {
- ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
- if (ret)
- goto out;
- }
-
- if (delete_src) {
- bch2_btree_iter_set_snapshot(trans, &src_iter, old_src.k->p.snapshot);
- ret = bch2_btree_iter_traverse(trans, &src_iter) ?:
- bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_internal_snapshot_node);
- if (ret)
- goto out;
- }
-
- if (delete_dst) {
- bch2_btree_iter_set_snapshot(trans, &dst_iter, old_dst.k->p.snapshot);
- ret = bch2_btree_iter_traverse(trans, &dst_iter) ?:
- bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_internal_snapshot_node);
- if (ret)
- goto out;
- }
-
- if (mode == BCH_RENAME_EXCHANGE)
- *src_offset = new_src->k.p.offset;
- *dst_offset = new_dst->k.p.offset;
-out:
- bch2_trans_iter_exit(trans, &src_iter);
- bch2_trans_iter_exit(trans, &dst_iter);
- return ret;
-}
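A concrete, hypothetical instance of the hash-collision case handled inside bch2_dirent_rename() above (slot numbers are made up for illustration):

/*
 * Say dst_name hashes to slot 10 (dst_pos), slots 10 and 11 are already
 * occupied, and the slot chosen for new_dst is 12 (dst_iter.pos: a free
 * slot for BCH_RENAME, old_dst's slot when overwriting). If src - the
 * dirent being renamed away - happens to live in slot 11, then
 * dst_pos <= src_iter.pos < dst_iter.pos holds. Deleting src outright
 * would leave a hole at slot 11, and a later lookup of dst_name probing
 * from slot 10 would stop at that hole before ever reaching slot 12. So
 * BCH_RENAME simply inserts new_dst at src's slot instead, while
 * BCH_RENAME_OVERWRITE - which must write new_dst over old_dst's slot -
 * deletes src with a hash_whiteout to keep the probe chain intact.
 */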
-
-int bch2_dirent_lookup_trans(struct btree_trans *trans,
- struct btree_iter *iter,
- subvol_inum dir,
- const struct bch_hash_info *hash_info,
- const struct qstr *name, subvol_inum *inum,
- unsigned flags)
-{
- struct qstr lookup_name;
- int ret = bch2_maybe_casefold(trans, hash_info, name, &lookup_name);
- if (ret)
- return ret;
-
- struct bkey_s_c k = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
- hash_info, dir, &lookup_name, flags);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), inum);
- if (ret > 0)
- ret = -ENOENT;
-err:
- if (ret)
- bch2_trans_iter_exit(trans, iter);
- return ret;
-}
-
-u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
- const struct bch_hash_info *hash_info,
- const struct qstr *name, subvol_inum *inum)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = {};
-
- int ret = lockrestart_do(trans,
- bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return ret;
-}
-
-int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 snapshot)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents,
- SPOS(dir, 0, snapshot),
- POS(dir, U64_MAX), 0, k, ret)
- if (k.k->type == KEY_TYPE_dirent) {
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
- if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol)
- continue;
- ret = bch_err_throw(trans->c, ENOTEMPTY_dir_not_empty);
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
-}
-
-int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
-{
- u32 snapshot;
-
- return bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot) ?:
- bch2_empty_dir_snapshot(trans, dir.inum, dir.subvol, snapshot);
-}
-
-static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subvol_inum target)
-{
- struct qstr name = bch2_dirent_get_name(d);
- /*
- * Although not required by the kernel code, updating ctx->pos is needed
- * for the bcachefs FUSE driver. Without this update, the FUSE
- * implementation will be stuck in an infinite loop when reading
- * directories (via the bcachefs_fuse_readdir callback).
- * In kernel space, ctx->pos is updated by the VFS code.
- */
- ctx->pos = d.k->p.offset;
- bool ret = dir_emit(ctx, name.name,
- name.len,
- target.inum,
- vfs_d_type(d.v->d_type));
- if (ret)
- ctx->pos = d.k->p.offset + 1;
- return !ret;
-}
-
-int bch2_readdir(struct bch_fs *c, subvol_inum inum,
- struct bch_hash_info *hash_info,
- struct dir_context *ctx)
-{
- struct bkey_buf sk;
- bch2_bkey_buf_init(&sk);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_dirents,
- POS(inum.inum, ctx->pos),
- POS(inum.inum, U64_MAX),
- inum.subvol, 0, k, ({
- if (k.k->type != KEY_TYPE_dirent)
- continue;
-
- /* dir_emit() can fault and block: */
- bch2_bkey_buf_reassemble(&sk, c, k);
- struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k);
-
- subvol_inum target;
-
- bool need_second_pass = false;
- int ret2 = bch2_str_hash_check_key(trans, NULL, &bch2_dirent_hash_desc,
- hash_info, &iter, k, &need_second_pass) ?:
- bch2_dirent_read_target(trans, inum, dirent, &target);
- if (ret2 > 0)
- continue;
-
- ret2 ?: (bch2_trans_unlock(trans), bch2_dir_emit(ctx, dirent, target));
- })));
-
- bch2_bkey_buf_exit(&sk, c);
-
- return ret < 0 ? ret : 0;
-}
-
-/* fsck */
-
-static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
- struct bch_inode_unpacked *inode)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
- BTREE_ITER_all_snapshots, k, ret) {
- if (k.k->p.offset != inode_nr)
- break;
- if (!bkey_is_inode(k.k))
- continue;
- ret = bch2_inode_unpack(k, inode);
- goto found;
- }
- ret = bch_err_throw(trans->c, ENOENT_inode);
-found:
- bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_fsck_remove_dirent(struct btree_trans *trans, struct bpos pos)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bch_inode_unpacked dir_inode;
- struct bch_hash_info dir_hash_info;
- int ret;
-
- ret = lookup_first_inode(trans, pos.inode, &dir_inode);
- if (ret)
- goto err;
-
- dir_hash_info = bch2_hash_info_init(c, &dir_inode);
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
-
- ret = bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
- &dir_hash_info, &iter,
- BTREE_UPDATE_internal_snapshot_node);
- bch2_trans_iter_exit(trans, &iter);
-err:
- bch_err_fn(c, ret);
- return ret;
-}
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
deleted file mode 100644
index 0417608c18d5..000000000000
--- a/fs/bcachefs/dirent.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DIRENT_H
-#define _BCACHEFS_DIRENT_H
-
-#include "str_hash.h"
-
-extern const struct bch_hash_desc bch2_dirent_hash_desc;
-
-int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_dirent ((struct bkey_ops) { \
- .key_validate = bch2_dirent_validate, \
- .val_to_text = bch2_dirent_to_text, \
- .min_val_size = 16, \
-})
-
-struct qstr;
-struct file;
-struct dir_context;
-struct bch_fs;
-struct bch_hash_info;
-struct bch_inode_info;
-
-#ifdef CONFIG_UNICODE
-int bch2_casefold(struct btree_trans *, const struct bch_hash_info *,
- const struct qstr *, struct qstr *);
-#else
-static inline int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info,
- const struct qstr *str, struct qstr *out_cf)
-{
- return -EOPNOTSUPP;
-}
-#endif
-
-static inline int bch2_maybe_casefold(struct btree_trans *trans,
- const struct bch_hash_info *info,
- const struct qstr *str, struct qstr *out_cf)
-{
- if (likely(!info->cf_encoding)) {
- *out_cf = *str;
- return 0;
- } else {
- return bch2_casefold(trans, info, str, out_cf);
- }
-}
-
-struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent);
-
-static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len)
-{
- unsigned bytes = cf_len
- ? offsetof(struct bch_dirent, d_cf_name_block.d_names) + len + cf_len
- : offsetof(struct bch_dirent, d_name) + len;
-
- return DIV_ROUND_UP(bytes, sizeof(u64));
-}
-
-int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
- struct bkey_s_c_dirent, subvol_inum *);
-
-static inline void dirent_copy_target(struct bkey_i_dirent *dst,
- struct bkey_s_c_dirent src)
-{
- dst->v.d_inum = src.v->d_inum;
- dst->v.d_type = src.v->d_type;
-}
-
-int bch2_dirent_init_name(struct bch_fs *,
- struct bkey_i_dirent *,
- const struct bch_hash_info *,
- const struct qstr *,
- const struct qstr *);
-struct bkey_i_dirent *bch2_dirent_create_key(struct btree_trans *,
- const struct bch_hash_info *, subvol_inum, u8,
- const struct qstr *, const struct qstr *, u64);
-
-int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32,
- const struct bch_hash_info *, u8,
- const struct qstr *, u64, u64 *,
- enum btree_iter_update_trigger_flags);
-int bch2_dirent_create(struct btree_trans *, subvol_inum,
- const struct bch_hash_info *, u8,
- const struct qstr *, u64, u64 *,
- enum btree_iter_update_trigger_flags);
-
-static inline unsigned vfs_d_type(unsigned type)
-{
- return type == DT_SUBVOL ? DT_DIR : type;
-}
-
-enum bch_rename_mode {
- BCH_RENAME,
- BCH_RENAME_OVERWRITE,
- BCH_RENAME_EXCHANGE,
-};
-
-int bch2_dirent_rename(struct btree_trans *,
- subvol_inum, struct bch_hash_info *,
- subvol_inum, struct bch_hash_info *,
- const struct qstr *, subvol_inum *, u64 *,
- const struct qstr *, subvol_inum *, u64 *,
- enum bch_rename_mode);
-
-int bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
- subvol_inum, const struct bch_hash_info *,
- const struct qstr *, subvol_inum *, unsigned);
-u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,
- const struct bch_hash_info *,
- const struct qstr *, subvol_inum *);
-
-int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32);
-int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
-int bch2_readdir(struct bch_fs *, subvol_inum, struct bch_hash_info *, struct dir_context *);
-
-int bch2_fsck_remove_dirent(struct btree_trans *, struct bpos);
-
-#endif /* _BCACHEFS_DIRENT_H */
diff --git a/fs/bcachefs/dirent_format.h b/fs/bcachefs/dirent_format.h
deleted file mode 100644
index a46dbddd21aa..000000000000
--- a/fs/bcachefs/dirent_format.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DIRENT_FORMAT_H
-#define _BCACHEFS_DIRENT_FORMAT_H
-
-/*
- * Dirents (and xattrs) have to implement string lookups; since our b-tree
- * doesn't support arbitrary length strings for the key, we instead index by a
- * 64 bit hash (currently truncated sha1) of the string, stored in the offset
- * field of the key - using linear probing to resolve hash collisions. This also
- * provides us with the readdir cookie posix requires.
- *
- * Linear probing requires us to use whiteouts for deletions, in the event of a
- * collision:
- */
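A rough sketch of the probing scheme described above may help; this is toy code, not bcachefs API, and every name in it is invented. A lookup probes forward from the hashed slot; an empty slot terminates the probe chain, while a whiteout left behind by a deletion does not:

#include <linux/types.h>

enum toy_slot_state { TOY_EMPTY, TOY_WHITEOUT, TOY_USED };

struct toy_slot {
	enum toy_slot_state	state;
	u64			hash;	/* plays the role of the key's offset field */
	u64			inum;	/* lookup result */
};

static struct toy_slot *toy_dirent_lookup(struct toy_slot *tbl, size_t nr, u64 hash)
{
	for (size_t i = 0; i < nr; i++) {
		struct toy_slot *s = &tbl[(hash + i) % nr];

		if (s->state == TOY_EMPTY)
			return NULL;	/* end of probe chain: not found */
		if (s->state == TOY_USED && s->hash == hash)
			return s;	/* real code also compares names, since hashes can collide */
		/* TOY_WHITEOUT (or a different hash): keep probing */
	}
	return NULL;
}

Deleting an entry mid-chain must therefore store a whiteout rather than emptying the slot, or entries further along the same probe chain would become unreachable.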
-
-struct bch_dirent {
- struct bch_val v;
-
- /* Target inode number: */
- union {
- __le64 d_inum;
- struct { /* DT_SUBVOL */
- __le32 d_child_subvol;
- __le32 d_parent_subvol;
- };
- };
-
- /*
- * Copy of mode bits 12-15 from the target inode - so userspace can get
- * the filetype without having to do a stat()
- */
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u8 d_type:5,
- d_unused:2,
- d_casefold:1;
-#elif defined(__BIG_ENDIAN_BITFIELD)
- __u8 d_casefold:1,
- d_unused:2,
- d_type:5;
-#endif
-
- union {
- struct {
- __u8 d_pad;
- __le16 d_name_len;
- __le16 d_cf_name_len;
- __u8 d_names[];
- } d_cf_name_block __packed;
- __DECLARE_FLEX_ARRAY(__u8, d_name);
- } __packed;
-} __packed __aligned(8);
-
-#define DT_SUBVOL 16
-#define BCH_DT_MAX 17
-
-#define BCH_NAME_MAX 512
-
-#endif /* _BCACHEFS_DIRENT_FORMAT_H */
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
deleted file mode 100644
index f7528cd69c73..000000000000
--- a/fs/bcachefs/disk_accounting.c
+++ /dev/null
@@ -1,1074 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bcachefs_ioctl.h"
-#include "btree_cache.h"
-#include "btree_journal_iter.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "compress.h"
-#include "disk_accounting.h"
-#include "error.h"
-#include "journal_io.h"
-#include "replicas.h"
-
-/*
- * Notes on disk accounting:
- *
- * We have two parallel sets of counters to be concerned with, and both must be
- * kept in sync.
- *
- * - Persistent/on disk accounting, stored in the accounting btree and updated
- * via btree write buffer updates that treat new accounting keys as deltas to
- * apply to existing values. But reading from a write buffer btree is
- * expensive, so we also have
- *
- * - In memory accounting, where accounting is stored as an array of percpu
- * counters, indexed by an eytzinger array of disk accounting keys/bpos (which
- * are the same thing, excepting byte swabbing on big endian).
- *
- * Cheap to read, but non persistent.
- *
- * Disk accounting updates are generated by transactional triggers; these run as
- * keys enter and leave the btree, and can compare old and new versions of keys;
- * the output of these triggers is a set of deltas to the various counters.
- *
- * Disk accounting updates are done as btree write buffer updates, where the
- * counters in the disk accounting key are deltas that will be applied to the
- * counter in the btree when the key is flushed by the write buffer (or journal
- * replay).
- *
- * To do a disk accounting update:
- * - initialize a disk_accounting_pos, to specify which counter is being updated
- * - initialize counter deltas, as an array of 1-3 s64s
- * - call bch2_disk_accounting_mod()
- *
- * This queues up the accounting update to be done at transaction commit time.
- * Underneath, it's a normal btree write buffer update.
- *
- * The transaction commit path is responsible for propagating updates to the in
- * memory counters, with bch2_accounting_mem_mod().
- *
- * The commit path also assigns every disk accounting update a unique version
- * number, based on the journal sequence number and offset within that journal
- * buffer; this is used by journal replay to determine which updates have been
- * done.
- *
- * The transaction commit path also ensures that replicas entry accounting
- * updates are properly marked in the superblock (so that we know whether we can
- * mount without data being unavailable); it will update the superblock if
- * bch2_accounting_mem_mod() tells it to.
- */
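Following the three steps just described, a caller (for example a transactional trigger) would look roughly like the sketch below. This is illustrative only: it assumes BCH_DISK_ACCOUNTING_nr_inodes carries a single counter, and it simply mirrors the shape of bch2_mod_dev_cached_sectors() further down in this file.

static int example_mod_nr_inodes(struct btree_trans *trans, s64 delta, bool gc)
{
	struct disk_accounting_pos acc;
	s64 d[1] = { delta };		/* counter deltas: an array of 1-3 s64s */

	/* pick which counter is being updated */
	memset(&acc, 0, sizeof(acc));
	acc.type = BCH_DISK_ACCOUNTING_nr_inodes;

	/* queue the delta to be applied at transaction commit time */
	return bch2_disk_accounting_mod(trans, &acc, d, ARRAY_SIZE(d), gc);
}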
-
-static const char * const disk_accounting_type_strs[] = {
-#define x(t, n, ...) [n] = #t,
- BCH_DISK_ACCOUNTING_TYPES()
-#undef x
- NULL
-};
-
-static inline void __accounting_key_init(struct bkey_i *k, struct bpos pos,
- s64 *d, unsigned nr)
-{
- struct bkey_i_accounting *acc = bkey_accounting_init(k);
-
- acc->k.p = pos;
- set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr);
-
- memcpy_u64s_small(acc->v.d, d, nr);
-}
-
-static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos,
- s64 *d, unsigned nr)
-{
- return __accounting_key_init(k, disk_accounting_pos_to_bpos(pos), d, nr);
-}
-
-static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos);
-
-int bch2_disk_accounting_mod(struct btree_trans *trans,
- struct disk_accounting_pos *k,
- s64 *d, unsigned nr, bool gc)
-{
- BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS);
-
- /* Normalize: */
- switch (k->type) {
- case BCH_DISK_ACCOUNTING_replicas:
- bubble_sort(k->replicas.devs, k->replicas.nr_devs, u8_cmp);
- break;
- }
-
- struct bpos pos = disk_accounting_pos_to_bpos(k);
-
- if (likely(!gc)) {
- struct bkey_i_accounting *a;
-#if 0
- for (a = btree_trans_subbuf_base(trans, &trans->accounting);
- a != btree_trans_subbuf_top(trans, &trans->accounting);
- a = (void *) bkey_next(&a->k_i))
- if (bpos_eq(a->k.p, pos)) {
- BUG_ON(nr != bch2_accounting_counters(&a->k));
- acc_u64s(a->v.d, d, nr);
-
- if (bch2_accounting_key_is_zero(accounting_i_to_s_c(a))) {
- unsigned offset = (u64 *) a -
- (u64 *) btree_trans_subbuf_base(trans, &trans->accounting);
-
- trans->accounting.u64s -= a->k.u64s;
- memmove_u64s_down(a,
- bkey_next(&a->k_i),
- trans->accounting.u64s - offset);
- }
- return 0;
- }
-#endif
- unsigned u64s = sizeof(*a) / sizeof(u64) + nr;
- a = bch2_trans_subbuf_alloc(trans, &trans->accounting, u64s);
- int ret = PTR_ERR_OR_ZERO(a);
- if (ret)
- return ret;
-
- __accounting_key_init(&a->k_i, pos, d, nr);
- return 0;
- } else {
- struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;
-
- __accounting_key_init(&k_i.k, pos, d, nr);
-
- int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
- if (ret == -BCH_ERR_btree_insert_need_mark_replicas)
- ret = drop_locks_do(trans,
- bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?:
- bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
- return ret;
- }
-}
-
-int bch2_mod_dev_cached_sectors(struct btree_trans *trans,
- unsigned dev, s64 sectors,
- bool gc)
-{
- struct disk_accounting_pos acc;
- memset(&acc, 0, sizeof(acc));
- acc.type = BCH_DISK_ACCOUNTING_replicas;
- bch2_replicas_entry_cached(&acc.replicas, dev);
-
- return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
-}
-
-static inline bool is_zero(char *start, char *end)
-{
- BUG_ON(start > end);
-
- for (; start < end; start++)
- if (*start)
- return false;
- return true;
-}
-
-#define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member))
-
-static const unsigned bch2_accounting_type_nr_counters[] = {
-#define x(f, id, nr) [BCH_DISK_ACCOUNTING_##f] = nr,
- BCH_DISK_ACCOUNTING_TYPES()
-#undef x
-};
-
-int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, k.k->p);
- void *end = &acc_k + 1;
- int ret = 0;
-
- bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) &&
- bversion_zero(k.k->bversion),
- c, accounting_key_version_0,
- "accounting key with version=0");
-
- switch (acc_k.type) {
- case BCH_DISK_ACCOUNTING_nr_inodes:
- end = field_end(acc_k, nr_inodes);
- break;
- case BCH_DISK_ACCOUNTING_persistent_reserved:
- end = field_end(acc_k, persistent_reserved);
- break;
- case BCH_DISK_ACCOUNTING_replicas:
- bkey_fsck_err_on(!acc_k.replicas.nr_devs,
- c, accounting_key_replicas_nr_devs_0,
- "accounting key replicas entry with nr_devs=0");
-
- bkey_fsck_err_on(acc_k.replicas.nr_required > acc_k.replicas.nr_devs ||
- (acc_k.replicas.nr_required > 1 &&
- acc_k.replicas.nr_required == acc_k.replicas.nr_devs),
- c, accounting_key_replicas_nr_required_bad,
- "accounting key replicas entry with bad nr_required");
-
- for (unsigned i = 0; i + 1 < acc_k.replicas.nr_devs; i++)
- bkey_fsck_err_on(acc_k.replicas.devs[i] >= acc_k.replicas.devs[i + 1],
- c, accounting_key_replicas_devs_unsorted,
- "accounting key replicas entry with unsorted devs");
-
- end = (void *) &acc_k.replicas + replicas_entry_bytes(&acc_k.replicas);
- break;
- case BCH_DISK_ACCOUNTING_dev_data_type:
- end = field_end(acc_k, dev_data_type);
- break;
- case BCH_DISK_ACCOUNTING_compression:
- end = field_end(acc_k, compression);
- break;
- case BCH_DISK_ACCOUNTING_snapshot:
- end = field_end(acc_k, snapshot);
- break;
- case BCH_DISK_ACCOUNTING_btree:
- end = field_end(acc_k, btree);
- break;
- case BCH_DISK_ACCOUNTING_rebalance_work:
- end = field_end(acc_k, rebalance_work);
- break;
- }
-
- bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)),
- c, accounting_key_junk_at_end,
- "junk at end of accounting key");
-
- bkey_fsck_err_on(bch2_accounting_counters(k.k) != bch2_accounting_type_nr_counters[acc_k.type],
- c, accounting_key_nr_counters_wrong,
- "accounting key with %u counters, should be %u",
- bch2_accounting_counters(k.k), bch2_accounting_type_nr_counters[acc_k.type]);
-fsck_err:
- return ret;
-}
-
-void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_pos *k)
-{
- if (k->type >= BCH_DISK_ACCOUNTING_TYPE_NR) {
- prt_printf(out, "unknown type %u", k->type);
- return;
- }
-
- prt_str(out, disk_accounting_type_strs[k->type]);
- prt_str(out, " ");
-
- switch (k->type) {
- case BCH_DISK_ACCOUNTING_nr_inodes:
- break;
- case BCH_DISK_ACCOUNTING_persistent_reserved:
- prt_printf(out, "replicas=%u", k->persistent_reserved.nr_replicas);
- break;
- case BCH_DISK_ACCOUNTING_replicas:
- bch2_replicas_entry_to_text(out, &k->replicas);
- break;
- case BCH_DISK_ACCOUNTING_dev_data_type:
- prt_printf(out, "dev=%u data_type=", k->dev_data_type.dev);
- bch2_prt_data_type(out, k->dev_data_type.data_type);
- break;
- case BCH_DISK_ACCOUNTING_compression:
- bch2_prt_compression_type(out, k->compression.type);
- break;
- case BCH_DISK_ACCOUNTING_snapshot:
- prt_printf(out, "id=%u", k->snapshot.id);
- break;
- case BCH_DISK_ACCOUNTING_btree:
- prt_str(out, "btree=");
- bch2_btree_id_to_text(out, k->btree.id);
- break;
- }
-}
-
-void bch2_accounting_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_s_c_accounting acc = bkey_s_c_to_accounting(k);
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, k.k->p);
-
- bch2_accounting_key_to_text(out, &acc_k);
-
- for (unsigned i = 0; i < bch2_accounting_counters(k.k); i++)
- prt_printf(out, " %lli", acc.v->d[i]);
-}
-
-void bch2_accounting_swab(struct bkey_s k)
-{
- for (u64 *p = (u64 *) k.v;
- p < (u64 *) bkey_val_end(k);
- p++)
- *p = swab64(*p);
-}
-
-static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r,
- struct disk_accounting_pos *acc)
-{
- unsafe_memcpy(r, &acc->replicas,
- replicas_entry_bytes(&acc->replicas),
- "variable length struct");
-}
-
-static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p)
-{
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, p);
-
- switch (acc_k.type) {
- case BCH_DISK_ACCOUNTING_replicas:
- __accounting_to_replicas(r, &acc_k);
- return true;
- default:
- return false;
- }
-}
-
-static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p)
-{
- union bch_replicas_padded r;
- return accounting_to_replicas(&r.e, p)
- ? bch2_mark_replicas(c, &r.e)
- : 0;
-}
-
-/*
- * Ensure accounting keys being updated are present in the superblock, when
- * applicable (i.e. replicas updates)
- */
-int bch2_accounting_update_sb(struct btree_trans *trans)
-{
- for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting);
- i != btree_trans_subbuf_top(trans, &trans->accounting);
- i = bkey_next(i)) {
- int ret = bch2_accounting_update_sb_one(trans->c, i->k.p);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a)
-{
- struct bch_accounting_mem *acc = &c->accounting;
-
- /* raced with another insert, already present: */
- if (eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
- accounting_pos_cmp, &a.k->p) < acc->k.nr)
- return 0;
-
- struct accounting_mem_entry n = {
- .pos = a.k->p,
- .bversion = a.k->bversion,
- .nr_counters = bch2_accounting_counters(a.k),
- .v[0] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64),
- sizeof(u64), GFP_KERNEL),
- };
-
- if (!n.v[0])
- goto err;
-
- if (acc->gc_running) {
- n.v[1] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64),
- sizeof(u64), GFP_KERNEL);
- if (!n.v[1])
- goto err;
- }
-
- if (darray_push(&acc->k, n))
- goto err;
-
- eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
- accounting_pos_cmp, NULL);
-
- if (trace_accounting_mem_insert_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_accounting_to_text(&buf, c, a.s_c);
- trace_accounting_mem_insert(c, buf.buf);
- printbuf_exit(&buf);
- }
- return 0;
-err:
- free_percpu(n.v[1]);
- free_percpu(n.v[0]);
- return bch_err_throw(c, ENOMEM_disk_accounting);
-}
-
-int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
- enum bch_accounting_mode mode)
-{
- union bch_replicas_padded r;
-
- if (mode != BCH_ACCOUNTING_read &&
- accounting_to_replicas(&r.e, a.k->p) &&
- !bch2_replicas_marked_locked(c, &r.e))
- return bch_err_throw(c, btree_insert_need_mark_replicas);
-
- percpu_up_read(&c->mark_lock);
- percpu_down_write(&c->mark_lock);
- int ret = __bch2_accounting_mem_insert(c, a);
- percpu_up_write(&c->mark_lock);
- percpu_down_read(&c->mark_lock);
- return ret;
-}
-
-int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounting a,
- enum bch_accounting_mode mode)
-{
- union bch_replicas_padded r;
-
- if (mode != BCH_ACCOUNTING_read &&
- accounting_to_replicas(&r.e, a.k->p) &&
- !bch2_replicas_marked_locked(c, &r.e))
- return bch_err_throw(c, btree_insert_need_mark_replicas);
-
- return __bch2_accounting_mem_insert(c, a);
-}
-
-static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e)
-{
- for (unsigned i = 0; i < e->nr_counters; i++)
- if (percpu_u64_get(e->v[0] + i) ||
- (e->v[1] &&
- percpu_u64_get(e->v[1] + i)))
- return false;
- return true;
-}
-
-void bch2_accounting_mem_gc(struct bch_fs *c)
-{
- struct bch_accounting_mem *acc = &c->accounting;
-
- percpu_down_write(&c->mark_lock);
- struct accounting_mem_entry *dst = acc->k.data;
-
- darray_for_each(acc->k, src) {
- if (accounting_mem_entry_is_zero(src)) {
- free_percpu(src->v[0]);
- free_percpu(src->v[1]);
- } else {
- *dst++ = *src;
- }
- }
-
- acc->k.nr = dst - acc->k.data;
- eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
- accounting_pos_cmp, NULL);
- percpu_up_write(&c->mark_lock);
-}
-
-/*
- * Read out accounting keys for replicas entries, as an array of
- * bch_replicas_usage entries.
- *
- * Note: this may be deprecated/removed at some point in the future and replaced
- * with something more general, it exists to support the ioctl used by the
- * 'bcachefs fs usage' command.
- */
-int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage)
-{
- struct bch_accounting_mem *acc = &c->accounting;
- int ret = 0;
-
- darray_init(usage);
-
- percpu_down_read(&c->mark_lock);
- darray_for_each(acc->k, i) {
- union {
- u8 bytes[struct_size_t(struct bch_replicas_usage, r.devs,
- BCH_BKEY_PTRS_MAX)];
- struct bch_replicas_usage r;
- } u;
- u.r.r.nr_devs = BCH_BKEY_PTRS_MAX;
-
- if (!accounting_to_replicas(&u.r.r, i->pos))
- continue;
-
- u64 sectors;
- bch2_accounting_mem_read_counters(acc, i - acc->k.data, &sectors, 1, false);
- u.r.sectors = sectors;
-
- ret = darray_make_room(usage, replicas_usage_bytes(&u.r));
- if (ret)
- break;
-
- memcpy(&darray_top(*usage), &u.r, replicas_usage_bytes(&u.r));
- usage->nr += replicas_usage_bytes(&u.r);
- }
- percpu_up_read(&c->mark_lock);
-
- if (ret)
- darray_exit(usage);
- return ret;
-}
-
-int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned accounting_types_mask)
-{
-
- struct bch_accounting_mem *acc = &c->accounting;
- int ret = 0;
-
- darray_init(out_buf);
-
- percpu_down_read(&c->mark_lock);
- darray_for_each(acc->k, i) {
- struct disk_accounting_pos a_p;
- bpos_to_disk_accounting_pos(&a_p, i->pos);
-
- if (!(accounting_types_mask & BIT(a_p.type)))
- continue;
-
- ret = darray_make_room(out_buf, sizeof(struct bkey_i_accounting) +
- sizeof(u64) * i->nr_counters);
- if (ret)
- break;
-
- struct bkey_i_accounting *a_out =
- bkey_accounting_init((void *) &darray_top(*out_buf));
- set_bkey_val_u64s(&a_out->k, i->nr_counters);
- a_out->k.p = i->pos;
- bch2_accounting_mem_read_counters(acc, i - acc->k.data,
- a_out->v.d, i->nr_counters, false);
-
- if (!bch2_accounting_key_is_zero(accounting_i_to_s_c(a_out)))
- out_buf->nr += bkey_bytes(&a_out->k);
- }
-
- percpu_up_read(&c->mark_lock);
-
- if (ret)
- darray_exit(out_buf);
- return ret;
-}
-
-static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc)
-{
- darray_for_each(acc->k, e) {
- free_percpu(e->v[gc]);
- e->v[gc] = NULL;
- }
-}
-
-int bch2_gc_accounting_start(struct bch_fs *c)
-{
- struct bch_accounting_mem *acc = &c->accounting;
- int ret = 0;
-
- percpu_down_write(&c->mark_lock);
- darray_for_each(acc->k, e) {
- e->v[1] = __alloc_percpu_gfp(e->nr_counters * sizeof(u64),
- sizeof(u64), GFP_KERNEL);
- if (!e->v[1]) {
- bch2_accounting_free_counters(acc, true);
- ret = bch_err_throw(c, ENOMEM_disk_accounting);
- break;
- }
- }
-
- acc->gc_running = !ret;
- percpu_up_write(&c->mark_lock);
-
- return ret;
-}
-
-int bch2_gc_accounting_done(struct bch_fs *c)
-{
- struct bch_accounting_mem *acc = &c->accounting;
- struct btree_trans *trans = bch2_trans_get(c);
- struct printbuf buf = PRINTBUF;
- struct bpos pos = POS_MIN;
- int ret = 0;
-
- percpu_down_write(&c->mark_lock);
- while (1) {
- unsigned idx = eytzinger0_find_ge(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
- accounting_pos_cmp, &pos);
-
- if (idx >= acc->k.nr)
- break;
-
- struct accounting_mem_entry *e = acc->k.data + idx;
- pos = bpos_successor(e->pos);
-
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, e->pos);
-
- if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
- continue;
-
- u64 src_v[BCH_ACCOUNTING_MAX_COUNTERS];
- u64 dst_v[BCH_ACCOUNTING_MAX_COUNTERS];
-
- unsigned nr = e->nr_counters;
- bch2_accounting_mem_read_counters(acc, idx, dst_v, nr, false);
- bch2_accounting_mem_read_counters(acc, idx, src_v, nr, true);
-
- if (memcmp(dst_v, src_v, nr * sizeof(u64))) {
- printbuf_reset(&buf);
- prt_str(&buf, "accounting mismatch for ");
- bch2_accounting_key_to_text(&buf, &acc_k);
-
- prt_str(&buf, ":\n got");
- for (unsigned j = 0; j < nr; j++)
- prt_printf(&buf, " %llu", dst_v[j]);
-
- prt_str(&buf, "\nshould be");
- for (unsigned j = 0; j < nr; j++)
- prt_printf(&buf, " %llu", src_v[j]);
-
- for (unsigned j = 0; j < nr; j++)
- src_v[j] -= dst_v[j];
-
- bch2_trans_unlock_long(trans);
-
- if (fsck_err(c, accounting_mismatch, "%s", buf.buf)) {
- percpu_up_write(&c->mark_lock);
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_disk_accounting_mod(trans, &acc_k, src_v, nr, false));
- percpu_down_write(&c->mark_lock);
- if (ret)
- goto err;
-
- if (!test_bit(BCH_FS_may_go_rw, &c->flags)) {
- memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
- struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;
-
- accounting_key_init(&k_i.k, &acc_k, src_v, nr);
- bch2_accounting_mem_mod_locked(trans,
- bkey_i_to_s_c_accounting(&k_i.k),
- BCH_ACCOUNTING_normal, true);
-
- preempt_disable();
- struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
- struct bch_fs_usage_base *src = &trans->fs_usage_delta;
- acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64));
- preempt_enable();
- }
- }
- }
- }
-err:
-fsck_err:
- percpu_up_write(&c->mark_lock);
- printbuf_exit(&buf);
- bch2_trans_put(trans);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
-
- if (k.k->type != KEY_TYPE_accounting)
- return 0;
-
- percpu_down_read(&c->mark_lock);
- int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k),
- BCH_ACCOUNTING_read, false);
- percpu_up_read(&c->mark_lock);
- return ret;
-}
-
-static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
- struct disk_accounting_pos *acc,
- u64 *v, unsigned nr)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- int ret = 0, invalid_dev = -1;
-
- switch (acc->type) {
- case BCH_DISK_ACCOUNTING_replicas: {
- union bch_replicas_padded r;
- __accounting_to_replicas(&r.e, acc);
-
- for (unsigned i = 0; i < r.e.nr_devs; i++)
- if (r.e.devs[i] != BCH_SB_MEMBER_INVALID &&
- !bch2_dev_exists(c, r.e.devs[i])) {
- invalid_dev = r.e.devs[i];
- goto invalid_device;
- }
-
- /*
- * All replicas entry checks except for invalid device are done
- * in bch2_accounting_validate
- */
- BUG_ON(bch2_replicas_entry_validate(&r.e, c, &buf));
-
- if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
- trans, accounting_replicas_not_marked,
- "accounting not marked in superblock replicas\n%s",
- (printbuf_reset(&buf),
- bch2_accounting_key_to_text(&buf, acc),
- buf.buf))) {
- /*
- * We're not RW yet and still single threaded, dropping
- * and retaking lock is ok:
- */
- percpu_up_write(&c->mark_lock);
- ret = bch2_mark_replicas(c, &r.e);
- if (ret)
- goto fsck_err;
- percpu_down_write(&c->mark_lock);
- }
- break;
- }
-
- case BCH_DISK_ACCOUNTING_dev_data_type:
- if (!bch2_dev_exists(c, acc->dev_data_type.dev)) {
- invalid_dev = acc->dev_data_type.dev;
- goto invalid_device;
- }
- break;
- }
-
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-invalid_device:
- if (fsck_err(trans, accounting_to_invalid_device,
- "accounting entry points to invalid device %i\n%s",
- invalid_dev,
- (printbuf_reset(&buf),
- bch2_accounting_key_to_text(&buf, acc),
- buf.buf))) {
- for (unsigned i = 0; i < nr; i++)
- v[i] = -v[i];
-
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_disk_accounting_mod(trans, acc, v, nr, false)) ?:
- -BCH_ERR_remove_disk_accounting_entry;
- } else {
- ret = bch_err_throw(c, remove_disk_accounting_entry);
- }
- goto fsck_err;
-}
-
-/*
- * At startup time, initialize the in memory accounting from the btree (and
- * journal)
- */
-int bch2_accounting_read(struct bch_fs *c)
-{
- struct bch_accounting_mem *acc = &c->accounting;
- struct btree_trans *trans = bch2_trans_get(c);
- struct printbuf buf = PRINTBUF;
-
- /*
- * We might run more than once if we rewind to start topology repair or
- * btree node scan - and those might cause us to get different results,
- * so we can't just skip if we've already run.
- *
- * Instead, zero out any accounting we have:
- */
- percpu_down_write(&c->mark_lock);
- darray_for_each(acc->k, e)
- percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters);
- for_each_member_device(c, ca)
- percpu_memset(ca->usage, 0, sizeof(*ca->usage));
- percpu_memset(c->usage, 0, sizeof(*c->usage));
- percpu_up_write(&c->mark_lock);
-
- struct btree_iter iter;
- bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
- iter.flags &= ~BTREE_ITER_with_journal;
- int ret = for_each_btree_key_continue(trans, iter,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
- struct bkey u;
- struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u);
-
- if (k.k->type != KEY_TYPE_accounting)
- continue;
-
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, k.k->p);
-
- if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
- break;
-
- if (!bch2_accounting_is_mem(&acc_k)) {
- struct disk_accounting_pos next;
- memset(&next, 0, sizeof(next));
- next.type = acc_k.type + 1;
- bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next));
- continue;
- }
-
- accounting_read_key(trans, k);
- }));
- if (ret)
- goto err;
-
- struct journal_keys *keys = &c->journal_keys;
- struct journal_key *dst = keys->data;
- move_gap(keys, keys->nr);
-
- darray_for_each(*keys, i) {
- if (i->k->k.type == KEY_TYPE_accounting) {
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, i->k->k.p);
-
- if (!bch2_accounting_is_mem(&acc_k))
- continue;
-
- struct bkey_s_c k = bkey_i_to_s_c(i->k);
- unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr,
- sizeof(acc->k.data[0]),
- accounting_pos_cmp, &k.k->p);
-
- bool applied = idx < acc->k.nr &&
- bversion_cmp(acc->k.data[idx].bversion, k.k->bversion) >= 0;
-
- if (applied)
- continue;
-
- if (i + 1 < &darray_top(*keys) &&
- i[1].k->k.type == KEY_TYPE_accounting &&
- !journal_key_cmp(i, i + 1)) {
- WARN_ON(bversion_cmp(i[0].k->k.bversion, i[1].k->k.bversion) >= 0);
-
- i[1].journal_seq = i[0].journal_seq;
-
- bch2_accounting_accumulate(bkey_i_to_accounting(i[1].k),
- bkey_s_c_to_accounting(k));
- continue;
- }
-
- ret = accounting_read_key(trans, k);
- if (ret)
- goto err;
- }
-
- *dst++ = *i;
- }
- keys->gap = keys->nr = dst - keys->data;
-
- percpu_down_write(&c->mark_lock);
-
- darray_for_each_reverse(acc->k, i) {
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, i->pos);
-
- u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
- memset(v, 0, sizeof(v));
-
- for (unsigned j = 0; j < i->nr_counters; j++)
- v[j] = percpu_u64_get(i->v[0] + j);
-
- /*
- * If the entry counters are zeroed, it should be treated as
- * nonexistent - it might point to an invalid device.
- *
- * Remove it, so that if it's re-added it gets re-marked in the
- * superblock:
- */
- ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
- ? -BCH_ERR_remove_disk_accounting_entry
- : bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters);
-
- if (ret == -BCH_ERR_remove_disk_accounting_entry) {
- free_percpu(i->v[0]);
- free_percpu(i->v[1]);
- darray_remove_item(&acc->k, i);
- ret = 0;
- continue;
- }
-
- if (ret)
- goto fsck_err;
- }
-
- eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
- accounting_pos_cmp, NULL);
-
- preempt_disable();
- struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage);
-
- for (unsigned i = 0; i < acc->k.nr; i++) {
- struct disk_accounting_pos k;
- bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos);
-
- u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
- bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);
-
- switch (k.type) {
- case BCH_DISK_ACCOUNTING_persistent_reserved:
- usage->reserved += v[0] * k.persistent_reserved.nr_replicas;
- break;
- case BCH_DISK_ACCOUNTING_replicas:
- fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]);
- break;
- case BCH_DISK_ACCOUNTING_dev_data_type: {
- guard(rcu)();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev);
- if (ca) {
- struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type];
- percpu_u64_set(&d->buckets, v[0]);
- percpu_u64_set(&d->sectors, v[1]);
- percpu_u64_set(&d->fragmented, v[2]);
-
- if (k.dev_data_type.data_type == BCH_DATA_sb ||
- k.dev_data_type.data_type == BCH_DATA_journal)
- usage->hidden += v[0] * ca->mi.bucket_size;
- }
- break;
- }
- }
- }
- preempt_enable();
-fsck_err:
- percpu_up_write(&c->mark_lock);
-err:
- printbuf_exit(&buf);
- bch2_trans_put(trans);
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev)
-{
- return bch2_trans_run(c,
- bch2_btree_write_buffer_flush_sync(trans) ?:
- for_each_btree_key_commit(trans, iter, BTREE_ID_accounting, POS_MIN,
- BTREE_ITER_all_snapshots, k, NULL, NULL, 0, ({
- struct disk_accounting_pos acc;
- bpos_to_disk_accounting_pos(&acc, k.k->p);
-
- acc.type == BCH_DISK_ACCOUNTING_dev_data_type &&
- acc.dev_data_type.dev == dev
- ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_accounting, k.k->p, 0)
- : 0;
- })) ?:
- bch2_btree_write_buffer_flush_sync(trans));
-}
-
-int bch2_dev_usage_init(struct bch_dev *ca, bool gc)
-{
- struct bch_fs *c = ca->fs;
- u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 };
-
- int ret = bch2_trans_do(c, ({
- bch2_disk_accounting_mod2(trans, gc,
- v, dev_data_type,
- .dev = ca->dev_idx,
- .data_type = BCH_DATA_free) ?:
- (!gc ? bch2_trans_commit(trans, NULL, NULL, 0) : 0);
- }));
- bch_err_fn(c, ret);
- return ret;
-}
-
-void bch2_verify_accounting_clean(struct bch_fs *c)
-{
- bool mismatch = false;
- struct bch_fs_usage_base base = {}, base_inmem = {};
-
- bch2_trans_run(c,
- for_each_btree_key(trans, iter,
- BTREE_ID_accounting, POS_MIN,
- BTREE_ITER_all_snapshots, k, ({
- u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
- struct bkey_s_c_accounting a = bkey_s_c_to_accounting(k);
- unsigned nr = bch2_accounting_counters(k.k);
-
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, k.k->p);
-
- if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
- break;
-
- if (!bch2_accounting_is_mem(&acc_k)) {
- struct disk_accounting_pos next;
- memset(&next, 0, sizeof(next));
- next.type = acc_k.type + 1;
- bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next));
- continue;
- }
-
- bch2_accounting_mem_read(c, k.k->p, v, nr);
-
- if (memcmp(a.v->d, v, nr * sizeof(u64))) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, " !=");
- for (unsigned j = 0; j < nr; j++)
- prt_printf(&buf, " %llu", v[j]);
-
- pr_err("%s", buf.buf);
- printbuf_exit(&buf);
- mismatch = true;
- }
-
- switch (acc_k.type) {
- case BCH_DISK_ACCOUNTING_persistent_reserved:
- base.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0];
- break;
- case BCH_DISK_ACCOUNTING_replicas:
- fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]);
- break;
- case BCH_DISK_ACCOUNTING_dev_data_type:
- {
- guard(rcu)(); /* scoped guard is a loop, and doesn't play nicely with continue */
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
- if (!ca)
- continue;
-
- v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets);
- v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors);
- v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented);
- }
-
- if (memcmp(a.v->d, v, 3 * sizeof(u64))) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, " in mem");
- for (unsigned j = 0; j < nr; j++)
- prt_printf(&buf, " %llu", v[j]);
-
- pr_err("dev accounting mismatch: %s", buf.buf);
- printbuf_exit(&buf);
- mismatch = true;
- }
- }
-
- 0;
- })));
-
- acc_u64s_percpu(&base_inmem.hidden, &c->usage->hidden, sizeof(base_inmem) / sizeof(u64));
-
-#define check(x) \
- if (base.x != base_inmem.x) { \
- pr_err("fs_usage_base.%s mismatch: %llu != %llu", #x, base.x, base_inmem.x); \
- mismatch = true; \
- }
-
- //check(hidden);
- check(btree);
- check(data);
- check(cached);
- check(reserved);
- check(nr_inodes);
-
- WARN_ON(mismatch);
-}
-
-void bch2_accounting_gc_free(struct bch_fs *c)
-{
- lockdep_assert_held(&c->mark_lock);
-
- struct bch_accounting_mem *acc = &c->accounting;
-
- bch2_accounting_free_counters(acc, true);
- acc->gc_running = false;
-}
-
-void bch2_fs_accounting_exit(struct bch_fs *c)
-{
- struct bch_accounting_mem *acc = &c->accounting;
-
- bch2_accounting_free_counters(acc, false);
- darray_exit(&acc->k);
-}
diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h
deleted file mode 100644
index d61abebf3e0b..000000000000
--- a/fs/bcachefs/disk_accounting.h
+++ /dev/null
@@ -1,301 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DISK_ACCOUNTING_H
-#define _BCACHEFS_DISK_ACCOUNTING_H
-
-#include "btree_update.h"
-#include "eytzinger.h"
-#include "sb-members.h"
-
-static inline void bch2_u64s_neg(u64 *v, unsigned nr)
-{
- for (unsigned i = 0; i < nr; i++)
- v[i] = -v[i];
-}
-
-static inline unsigned bch2_accounting_counters(const struct bkey *k)
-{
- return bkey_val_u64s(k) - offsetof(struct bch_accounting, d) / sizeof(u64);
-}
-
-static inline void bch2_accounting_neg(struct bkey_s_accounting a)
-{
- bch2_u64s_neg(a.v->d, bch2_accounting_counters(a.k));
-}
-
-static inline bool bch2_accounting_key_is_zero(struct bkey_s_c_accounting a)
-{
- for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++)
- if (a.v->d[i])
- return false;
- return true;
-}
-
-static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst,
- struct bkey_s_c_accounting src)
-{
- for (unsigned i = 0;
- i < min(bch2_accounting_counters(&dst->k),
- bch2_accounting_counters(src.k));
- i++)
- dst->v.d[i] += src.v->d[i];
-
- if (bversion_cmp(dst->k.bversion, src.k->bversion) < 0)
- dst->k.bversion = src.k->bversion;
-}
-
-static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage,
- enum bch_data_type data_type,
- s64 sectors)
-{
- switch (data_type) {
- case BCH_DATA_btree:
- fs_usage->btree += sectors;
- break;
- case BCH_DATA_user:
- case BCH_DATA_parity:
- fs_usage->data += sectors;
- break;
- case BCH_DATA_cached:
- fs_usage->cached += sectors;
- break;
- default:
- break;
- }
-}
-
-static inline void bpos_to_disk_accounting_pos(struct disk_accounting_pos *acc, struct bpos p)
-{
- BUILD_BUG_ON(sizeof(*acc) != sizeof(p));
-
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- acc->_pad = p;
-#else
- memcpy_swab(acc, &p, sizeof(p));
-#endif
-}
-
-static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *acc)
-{
- struct bpos p;
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- p = acc->_pad;
-#else
- memcpy_swab(&p, acc, sizeof(p));
-#endif
- return p;
-}
-
-int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *,
- s64 *, unsigned, bool);
-
-#define disk_accounting_key_init(_k, _type, ...) \
-do { \
- memset(&(_k), 0, sizeof(_k)); \
- (_k).type = BCH_DISK_ACCOUNTING_##_type; \
- (_k)._type = (struct bch_acct_##_type) { __VA_ARGS__ }; \
-} while (0)
-
-#define bch2_disk_accounting_mod2_nr(_trans, _gc, _v, _nr, ...) \
-({ \
- struct disk_accounting_pos pos; \
- disk_accounting_key_init(pos, __VA_ARGS__); \
- bch2_disk_accounting_mod(trans, &pos, _v, _nr, _gc); \
-})
-
-#define bch2_disk_accounting_mod2(_trans, _gc, _v, ...) \
- bch2_disk_accounting_mod2_nr(_trans, _gc, _v, ARRAY_SIZE(_v), __VA_ARGS__)
-
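/*
 * Illustrative usage sketch (not part of the original header; the helper name
 * is hypothetical): crediting sectors to a snapshot's usage counter via the
 * convenience wrappers above.
 */
static inline int example_acct_snapshot_sectors(struct btree_trans *trans,
						u32 snapshot_id, s64 sectors)
{
	s64 v[1] = { sectors };

	return bch2_disk_accounting_mod2(trans, false, v, snapshot,
					 .id = snapshot_id);
}
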
-int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool);
-
-int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *);
-void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-void bch2_accounting_swab(struct bkey_s);
-
-#define bch2_bkey_ops_accounting ((struct bkey_ops) { \
- .key_validate = bch2_accounting_validate, \
- .val_to_text = bch2_accounting_to_text, \
- .swab = bch2_accounting_swab, \
- .min_val_size = 8, \
-})
-
-int bch2_accounting_update_sb(struct btree_trans *);
-
-static inline int accounting_pos_cmp(const void *_l, const void *_r)
-{
- const struct bpos *l = _l, *r = _r;
-
- return bpos_cmp(*l, *r);
-}
-
-enum bch_accounting_mode {
- BCH_ACCOUNTING_normal,
- BCH_ACCOUNTING_gc,
- BCH_ACCOUNTING_read,
-};
-
-int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
-int bch2_accounting_mem_insert_locked(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
-void bch2_accounting_mem_gc(struct bch_fs *);
-
-static inline bool bch2_accounting_is_mem(struct disk_accounting_pos *acc)
-{
- return acc->type < BCH_DISK_ACCOUNTING_TYPE_NR &&
- acc->type != BCH_DISK_ACCOUNTING_inum;
-}
-
-/*
- * Update in memory counters so they match the btree update we're doing; called
- * from transaction commit path
- */
-static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
- struct bkey_s_c_accounting a,
- enum bch_accounting_mode mode,
- bool write_locked)
-{
- struct bch_fs *c = trans->c;
- struct bch_accounting_mem *acc = &c->accounting;
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, a.k->p);
- bool gc = mode == BCH_ACCOUNTING_gc;
-
- if (gc && !acc->gc_running)
- return 0;
-
- if (!bch2_accounting_is_mem(&acc_k))
- return 0;
-
- if (mode == BCH_ACCOUNTING_normal) {
- switch (acc_k.type) {
- case BCH_DISK_ACCOUNTING_persistent_reserved:
- trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0];
- break;
- case BCH_DISK_ACCOUNTING_replicas:
- fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]);
- break;
- case BCH_DISK_ACCOUNTING_dev_data_type: {
- guard(rcu)();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
- if (ca) {
- this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]);
- this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]);
- this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]);
- }
- break;
- }
- }
- }
-
- unsigned idx;
-
- while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
- accounting_pos_cmp, &a.k->p)) >= acc->k.nr) {
- int ret = 0;
- if (unlikely(write_locked))
- ret = bch2_accounting_mem_insert_locked(c, a, mode);
- else
- ret = bch2_accounting_mem_insert(c, a, mode);
- if (ret)
- return ret;
- }
-
- struct accounting_mem_entry *e = &acc->k.data[idx];
-
- EBUG_ON(bch2_accounting_counters(a.k) != e->nr_counters);
-
- for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++)
- this_cpu_add(e->v[gc][i], a.v->d[i]);
- return 0;
-}
-
-static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc)
-{
- percpu_down_read(&trans->c->mark_lock);
- int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal, false);
- percpu_up_read(&trans->c->mark_lock);
- return ret;
-}
-
-static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem *acc,
- unsigned idx, u64 *v, unsigned nr, bool gc)
-{
- memset(v, 0, sizeof(*v) * nr);
-
- if (unlikely(idx >= acc->k.nr))
- return;
-
- struct accounting_mem_entry *e = &acc->k.data[idx];
-
- nr = min_t(unsigned, nr, e->nr_counters);
-
- for (unsigned i = 0; i < nr; i++)
- v[i] = percpu_u64_get(e->v[gc] + i);
-}
-
-static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p,
- u64 *v, unsigned nr)
-{
- percpu_down_read(&c->mark_lock);
- struct bch_accounting_mem *acc = &c->accounting;
- unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
- accounting_pos_cmp, &p);
-
- bch2_accounting_mem_read_counters(acc, idx, v, nr, false);
- percpu_up_read(&c->mark_lock);
-}
-
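/*
 * Illustrative sketch (not part of the original header; the helper name is
 * hypothetical): reading the current value of the filesystem-wide nr_inodes
 * counter from the in-memory accounting.
 */
static inline u64 example_read_nr_inodes(struct bch_fs *c)
{
	struct disk_accounting_pos acc;
	memset(&acc, 0, sizeof(acc));
	acc.type = BCH_DISK_ACCOUNTING_nr_inodes;

	u64 v[1] = { 0 };
	bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), v, 1);
	return v[0];
}
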
-static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset)
-{
- EBUG_ON(!res->ref);
-
- return (struct bversion) {
- .hi = res->seq >> 32,
- .lo = (res->seq << 32) | (res->offset + offset),
- };
-}
-
-static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans,
- struct bkey_i_accounting *a,
- unsigned commit_flags)
-{
- u64 *base = (u64 *) btree_trans_subbuf_base(trans, &trans->accounting);
- a->k.bversion = journal_pos_to_bversion(&trans->journal_res, (u64 *) a - base);
-
- EBUG_ON(bversion_zero(a->k.bversion));
-
- return likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))
- ? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal, false)
- : 0;
-}
-
-static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans,
- struct bkey_i_accounting *a_i,
- unsigned commit_flags)
-{
- if (likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))) {
- struct bkey_s_accounting a = accounting_i_to_s(a_i);
-
- bch2_accounting_neg(a);
- bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal, false);
- bch2_accounting_neg(a);
- }
-}
-
-int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *);
-int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned);
-
-int bch2_gc_accounting_start(struct bch_fs *);
-int bch2_gc_accounting_done(struct bch_fs *);
-
-int bch2_accounting_read(struct bch_fs *);
-
-int bch2_dev_usage_remove(struct bch_fs *, unsigned);
-int bch2_dev_usage_init(struct bch_dev *, bool);
-
-void bch2_verify_accounting_clean(struct bch_fs *c);
-
-void bch2_accounting_gc_free(struct bch_fs *);
-void bch2_fs_accounting_exit(struct bch_fs *);
-
-#endif /* _BCACHEFS_DISK_ACCOUNTING_H */
diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h
deleted file mode 100644
index 8269af1dbe2a..000000000000
--- a/fs/bcachefs/disk_accounting_format.h
+++ /dev/null
@@ -1,225 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
-#define _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
-
-#include "replicas_format.h"
-
-/*
- * Disk accounting - KEY_TYPE_accounting - on disk format:
- *
- * Here, the key has considerably more structure than a typical key (bpos); an
- * accounting key is 'struct disk_accounting_pos', a union overlaid on a bpos.
- *
- * More specifically: a key is just a multiword integer (where word endianness
- * matches native byte order), so we're treating bpos as an opaque 20 byte
- * integer and mapping bch_accounting_key to that.
- *
- * This is a type-tagged union of all our various subtypes; a disk accounting
- * key can be device counters, replicas counters, et cetera - it's extensible.
- *
- * The value is a list of u64s or s64s; the number of counters is specific to a
- * given accounting type.
- *
- * Unlike with other key types, updates are _deltas_, and the deltas are not
- * resolved until the update to the underlying btree, done by btree write buffer
- * flush or journal replay.
- *
- * Journal replay in particular requires special handling. The journal tracks a
- * range of entries which may not yet have been applied to the btree - it does
- * not know definitively whether individual entries are dirty and still need to
- * be applied.
- *
- * To handle this, we use the version field of struct bkey, and give every
- * accounting update a unique version number - a total ordering in time; the
- * version number is derived from the key's position in the journal. Then
- * journal replay can compare the version number of the key from the journal
- * with the version number of the key in the btree to determine if a key needs
- * to be replayed.
- *
- * For this to work, we must maintain this strict time ordering of updates as
- * they are flushed to the btree, both via write buffer flush and via journal
- * replay. This has complications for the write buffer code while journal replay
- * is still in progress; the write buffer cannot flush any accounting keys to
- * the btree until journal replay has finished replaying its accounting keys, or
- * the (newer) version number of the keys from the write buffer will cause
- * updates from journal replay to be lost.
- */
-
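/*
 * Illustrative sketch of the replay rule described above (the real logic lives
 * in bch2_accounting_read(); this helper is hypothetical): an accounting key
 * from the journal only needs to be applied if its version is newer than the
 * version already reflected in memory.
 */
static inline bool example_journal_key_needs_replay(struct bversion mem_version,
						    struct bversion journal_version)
{
	/* already applied when the in-memory version is >= the journal key's */
	return bversion_cmp(mem_version, journal_version) < 0;
}
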
-struct bch_accounting {
- struct bch_val v;
- __u64 d[];
-};
-
-#define BCH_ACCOUNTING_MAX_COUNTERS 3
-
-#define BCH_DATA_TYPES() \
- x(free, 0) \
- x(sb, 1) \
- x(journal, 2) \
- x(btree, 3) \
- x(user, 4) \
- x(cached, 5) \
- x(parity, 6) \
- x(stripe, 7) \
- x(need_gc_gens, 8) \
- x(need_discard, 9) \
- x(unstriped, 10)
-
-enum bch_data_type {
-#define x(t, n) BCH_DATA_##t,
- BCH_DATA_TYPES()
-#undef x
- BCH_DATA_NR
-};
-
-static inline bool data_type_is_empty(enum bch_data_type type)
-{
- switch (type) {
- case BCH_DATA_free:
- case BCH_DATA_need_gc_gens:
- case BCH_DATA_need_discard:
- return true;
- default:
- return false;
- }
-}
-
-static inline bool data_type_is_hidden(enum bch_data_type type)
-{
- switch (type) {
- case BCH_DATA_sb:
- case BCH_DATA_journal:
- return true;
- default:
- return false;
- }
-}
-
-/*
- * field 1: name
- * field 2: id
- * field 3: number of counters (max 3)
- */
-
-#define BCH_DISK_ACCOUNTING_TYPES() \
- x(nr_inodes, 0, 1) \
- x(persistent_reserved, 1, 1) \
- x(replicas, 2, 1) \
- x(dev_data_type, 3, 3) \
- x(compression, 4, 3) \
- x(snapshot, 5, 1) \
- x(btree, 6, 1) \
- x(rebalance_work, 7, 1) \
- x(inum, 8, 3)
-
-enum disk_accounting_type {
-#define x(f, nr, ...) BCH_DISK_ACCOUNTING_##f = nr,
- BCH_DISK_ACCOUNTING_TYPES()
-#undef x
- BCH_DISK_ACCOUNTING_TYPE_NR,
-};
-
-/*
- * No subtypes - number of inodes in the entire filesystem
- *
- * XXX: perhaps we could add a per-subvolume counter?
- */
-struct bch_acct_nr_inodes {
-};
-
-/*
- * Tracks KEY_TYPE_reservation sectors, broken out by number of replicas for the
- * reservation:
- */
-struct bch_acct_persistent_reserved {
- __u8 nr_replicas;
-};
-
-/*
- * device, data type counter fields:
- * [
- * nr_buckets
- * live sectors (in buckets of that data type)
- * sectors of internal fragmentation
- * ]
- *
- * XXX: live sectors should've been done differently, you can have multiple data
- * types in the same bucket (user, stripe, cached) and this collapses them to
- * the bucket data type, and makes the internal fragmentation counter redundant
- */
-struct bch_acct_dev_data_type {
- __u8 dev;
- __u8 data_type;
-};
-
-/*
- * Compression type fields:
- * [
- * number of extents
- * uncompressed size
- * compressed size
- * ]
- *
- * From these we can derive the compression ratio and average extent size (fragmentation).
- */
-struct bch_acct_compression {
- __u8 type;
-};
-
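/*
 * Illustrative sketch (not part of the original header; the helper name is
 * hypothetical): deriving the compression ratio (in percent) and the average
 * extent size from the three counters described above.
 */
static inline void example_compression_stats(const u64 d[3],
					     u64 *ratio_pct,
					     u64 *avg_extent_sectors)
{
	u64 nr_extents   = d[0];
	u64 uncompressed = d[1];
	u64 compressed   = d[2];

	*ratio_pct	    = compressed  ? uncompressed * 100 / compressed : 0;
	*avg_extent_sectors = nr_extents ? uncompressed / nr_extents : 0;
}
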
-/*
- * On disk usage by snapshot id; counts same values as replicas counter, but
- * aggregated differently
- */
-struct bch_acct_snapshot {
- __u32 id;
-} __packed;
-
-struct bch_acct_btree {
- __u32 id;
-} __packed;
-
-/*
- * inum counter fields:
- * [
- * number of extents
- * sum of extent sizes - bkey size
- * this field is similar to inode.bi_sectors, except here extents in
- * different snapshots but the same inode number are all collapsed to the
- * same counter
- * sum of on disk size - same values tracked by replicas counters
- * ]
- *
- * This tracks on disk fragmentation.
- */
-struct bch_acct_inum {
- __u64 inum;
-} __packed;
-
-/*
- * Simple counter of the amount of data (on disk sectors) rebalance needs to
- * move, extents counted here are also in the rebalance_work btree.
- */
-struct bch_acct_rebalance_work {
-};
-
-struct disk_accounting_pos {
- union {
- struct {
- __u8 type;
- union {
- struct bch_acct_nr_inodes nr_inodes;
- struct bch_acct_persistent_reserved persistent_reserved;
- struct bch_replicas_entry_v1 replicas;
- struct bch_acct_dev_data_type dev_data_type;
- struct bch_acct_compression compression;
- struct bch_acct_snapshot snapshot;
- struct bch_acct_btree btree;
- struct bch_acct_rebalance_work rebalance_work;
- struct bch_acct_inum inum;
- } __packed;
- } __packed;
- struct bpos _pad;
- };
-};
-
-#endif /* _BCACHEFS_DISK_ACCOUNTING_FORMAT_H */
diff --git a/fs/bcachefs/disk_accounting_types.h b/fs/bcachefs/disk_accounting_types.h
deleted file mode 100644
index b1982131b206..000000000000
--- a/fs/bcachefs/disk_accounting_types.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DISK_ACCOUNTING_TYPES_H
-#define _BCACHEFS_DISK_ACCOUNTING_TYPES_H
-
-#include "darray.h"
-
-struct accounting_mem_entry {
- struct bpos pos;
- struct bversion bversion;
- unsigned nr_counters;
- u64 __percpu *v[2];
-};
-
-struct bch_accounting_mem {
- DARRAY(struct accounting_mem_entry) k;
- bool gc_running;
-};
-
-#endif /* _BCACHEFS_DISK_ACCOUNTING_TYPES_H */
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
deleted file mode 100644
index cde842ac1886..000000000000
--- a/fs/bcachefs/disk_groups.c
+++ /dev/null
@@ -1,591 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "disk_groups.h"
-#include "sb-members.h"
-#include "super-io.h"
-
-#include <linux/sort.h>
-
-static int group_cmp(const void *_l, const void *_r)
-{
- const struct bch_disk_group *l = _l;
- const struct bch_disk_group *r = _r;
-
- return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) -
- (BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?:
- ((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) -
- (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?:
- strncmp(l->label, r->label, sizeof(l->label));
-}
-
-static int bch2_sb_disk_groups_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_disk_groups *groups =
- field_to_type(f, disk_groups);
- struct bch_disk_group *g, *sorted = NULL;
- unsigned nr_groups = disk_groups_nr(groups);
- unsigned i, len;
- int ret = 0;
-
- for (i = 0; i < sb->nr_devices; i++) {
- struct bch_member m = bch2_sb_member_get(sb, i);
- unsigned group_id;
-
- if (!BCH_MEMBER_GROUP(&m))
- continue;
-
- group_id = BCH_MEMBER_GROUP(&m) - 1;
-
- if (group_id >= nr_groups) {
- prt_printf(err, "disk %u has invalid label %u (have %u)",
- i, group_id, nr_groups);
- return -BCH_ERR_invalid_sb_disk_groups;
- }
-
- if (BCH_GROUP_DELETED(&groups->entries[group_id])) {
- prt_printf(err, "disk %u has deleted label %u", i, group_id);
- return -BCH_ERR_invalid_sb_disk_groups;
- }
- }
-
- if (!nr_groups)
- return 0;
-
- for (i = 0; i < nr_groups; i++) {
- g = groups->entries + i;
-
- if (BCH_GROUP_DELETED(g))
- continue;
-
- len = strnlen(g->label, sizeof(g->label));
- if (!len) {
- prt_printf(err, "label %u empty", i);
- return -BCH_ERR_invalid_sb_disk_groups;
- }
- }
-
- sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL);
- if (!sorted)
- return -BCH_ERR_ENOMEM_disk_groups_validate;
-
- memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted));
- sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL);
-
- for (g = sorted; g + 1 < sorted + nr_groups; g++)
- if (!BCH_GROUP_DELETED(g) &&
- !group_cmp(&g[0], &g[1])) {
- prt_printf(err, "duplicate label %llu.%.*s",
- BCH_GROUP_PARENT(g),
- (int) sizeof(g->label), g->label);
- ret = -BCH_ERR_invalid_sb_disk_groups;
- goto err;
- }
-err:
- kfree(sorted);
- return ret;
-}
-
-static void bch2_sb_disk_groups_to_text(struct printbuf *out,
- struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_disk_groups *groups =
- field_to_type(f, disk_groups);
- struct bch_disk_group *g;
- unsigned nr_groups = disk_groups_nr(groups);
-
- for (g = groups->entries;
- g < groups->entries + nr_groups;
- g++) {
- if (g != groups->entries)
- prt_printf(out, " ");
-
- if (BCH_GROUP_DELETED(g))
- prt_printf(out, "[deleted]");
- else
- prt_printf(out, "[parent %llu name %s]",
- BCH_GROUP_PARENT(g), g->label);
- }
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = {
- .validate = bch2_sb_disk_groups_validate,
- .to_text = bch2_sb_disk_groups_to_text
-};
-
-int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
-{
- struct bch_sb_field_disk_groups *groups;
- struct bch_disk_groups_cpu *cpu_g, *old_g;
- unsigned i, g, nr_groups;
-
- lockdep_assert_held(&c->sb_lock);
-
- groups = bch2_sb_field_get(c->disk_sb.sb, disk_groups);
- nr_groups = disk_groups_nr(groups);
-
- if (!groups)
- return 0;
-
- cpu_g = kzalloc(struct_size(cpu_g, entries, nr_groups), GFP_KERNEL);
- if (!cpu_g)
- return bch_err_throw(c, ENOMEM_disk_groups_to_cpu);
-
- cpu_g->nr = nr_groups;
-
- for (i = 0; i < nr_groups; i++) {
- struct bch_disk_group *src = &groups->entries[i];
- struct bch_disk_group_cpu *dst = &cpu_g->entries[i];
-
- dst->deleted = BCH_GROUP_DELETED(src);
- dst->parent = BCH_GROUP_PARENT(src);
- memcpy(dst->label, src->label, sizeof(dst->label));
- }
-
- for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
- struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, i);
- struct bch_disk_group_cpu *dst;
-
- if (!bch2_member_alive(&m))
- continue;
-
- g = BCH_MEMBER_GROUP(&m);
- while (g) {
- dst = &cpu_g->entries[g - 1];
- __set_bit(i, dst->devs.d);
- g = dst->parent;
- }
- }
-
- old_g = rcu_dereference_protected(c->disk_groups,
- lockdep_is_held(&c->sb_lock));
- rcu_assign_pointer(c->disk_groups, cpu_g);
- if (old_g)
- kfree_rcu(old_g, rcu);
-
- return 0;
-}
-
-const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
-{
- struct target t = target_decode(target);
-
- guard(rcu)();
-
- switch (t.type) {
- case TARGET_NULL:
- return NULL;
- case TARGET_DEV: {
- struct bch_dev *ca = t.dev < c->sb.nr_devices
- ? rcu_dereference(c->devs[t.dev])
- : NULL;
- return ca ? &ca->self : NULL;
- }
- case TARGET_GROUP: {
- struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
-
- return g && t.group < g->nr && !g->entries[t.group].deleted
- ? &g->entries[t.group].devs
- : NULL;
- }
- default:
- BUG();
- }
-}
-
-bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
-{
- struct target t = target_decode(target);
-
- switch (t.type) {
- case TARGET_NULL:
- return false;
- case TARGET_DEV:
- return dev == t.dev;
- case TARGET_GROUP: {
- struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
- const struct bch_devs_mask *m =
- g && t.group < g->nr && !g->entries[t.group].deleted
- ? &g->entries[t.group].devs
- : NULL;
-
- return m ? test_bit(dev, m->d) : false;
- }
- default:
- BUG();
- }
-}
-
-static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups,
- unsigned parent,
- const char *name, unsigned namelen)
-{
- unsigned i, nr_groups = disk_groups_nr(groups);
-
- if (!namelen || namelen > BCH_SB_LABEL_SIZE)
- return -EINVAL;
-
- for (i = 0; i < nr_groups; i++) {
- struct bch_disk_group *g = groups->entries + i;
-
- if (BCH_GROUP_DELETED(g))
- continue;
-
- if (!BCH_GROUP_DELETED(g) &&
- BCH_GROUP_PARENT(g) == parent &&
- strnlen(g->label, sizeof(g->label)) == namelen &&
- !memcmp(name, g->label, namelen))
- return i;
- }
-
- return -1;
-}
-
-static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent,
- const char *name, unsigned namelen)
-{
- struct bch_sb_field_disk_groups *groups =
- bch2_sb_field_get(sb->sb, disk_groups);
- unsigned i, nr_groups = disk_groups_nr(groups);
- struct bch_disk_group *g;
-
- if (!namelen || namelen > BCH_SB_LABEL_SIZE)
- return -EINVAL;
-
- for (i = 0;
- i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]);
- i++)
- ;
-
- if (i == nr_groups) {
- unsigned u64s =
- (sizeof(struct bch_sb_field_disk_groups) +
- sizeof(struct bch_disk_group) * (nr_groups + 1)) /
- sizeof(u64);
-
- groups = bch2_sb_field_resize(sb, disk_groups, u64s);
- if (!groups)
- return -BCH_ERR_ENOSPC_disk_label_add;
-
- nr_groups = disk_groups_nr(groups);
- }
-
- BUG_ON(i >= nr_groups);
-
- g = &groups->entries[i];
-
- memcpy(g->label, name, namelen);
- if (namelen < sizeof(g->label))
- g->label[namelen] = '\0';
- SET_BCH_GROUP_DELETED(g, 0);
- SET_BCH_GROUP_PARENT(g, parent);
- SET_BCH_GROUP_DATA_ALLOWED(g, ~0);
-
- return i;
-}
-
-int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name)
-{
- struct bch_sb_field_disk_groups *groups =
- bch2_sb_field_get(sb->sb, disk_groups);
- int v = -1;
-
- do {
- const char *next = strchrnul(name, '.');
- unsigned len = next - name;
-
- if (*next == '.')
- next++;
-
- v = __bch2_disk_group_find(groups, v + 1, name, len);
- name = next;
- } while (*name && v >= 0);
-
- return v;
-}
-
-int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
-{
- struct bch_sb_field_disk_groups *groups;
- unsigned parent = 0;
- int v = -1;
-
- do {
- const char *next = strchrnul(name, '.');
- unsigned len = next - name;
-
- if (*next == '.')
- next++;
-
- groups = bch2_sb_field_get(sb->sb, disk_groups);
-
- v = __bch2_disk_group_find(groups, parent, name, len);
- if (v < 0)
- v = __bch2_disk_group_add(sb, parent, name, len);
- if (v < 0)
- return v;
-
- parent = v + 1;
- name = next;
- } while (*name && v >= 0);
-
- return v;
-}
-
-static void __bch2_disk_path_to_text(struct printbuf *out, struct bch_disk_groups_cpu *g,
- unsigned v)
-{
- u16 path[32];
- unsigned nr = 0;
-
- while (1) {
- if (nr == ARRAY_SIZE(path))
- goto invalid;
-
- if (v >= (g ? g->nr : 0))
- goto invalid;
-
- struct bch_disk_group_cpu *e = g->entries + v;
-
- if (e->deleted)
- goto invalid;
-
- path[nr++] = v;
-
- if (!e->parent)
- break;
-
- v = e->parent - 1;
- }
-
- while (nr) {
- struct bch_disk_group_cpu *e = g->entries + path[--nr];
-
- prt_printf(out, "%.*s", (int) sizeof(e->label), e->label);
- if (nr)
- prt_printf(out, ".");
- }
- return;
-invalid:
- prt_printf(out, "invalid label %u", v);
-}
-
-void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c)
-{
- bch2_printbuf_make_room(out, 4096);
-
- out->atomic++;
- guard(rcu)();
- struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
-
- for (unsigned i = 0; i < (g ? g->nr : 0); i++) {
- prt_printf(out, "%2u: ", i);
-
- if (g->entries[i].deleted) {
- prt_printf(out, "[deleted]");
- goto next;
- }
-
- __bch2_disk_path_to_text(out, g, i);
-
- prt_printf(out, " devs");
-
- for_each_member_device_rcu(c, ca, &g->entries[i].devs)
- prt_printf(out, " %s", ca->name);
-next:
- prt_newline(out);
- }
-
- out->atomic--;
-}
-
-void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
-{
- out->atomic++;
- guard(rcu)();
- __bch2_disk_path_to_text(out, rcu_dereference(c->disk_groups), v),
- --out->atomic;
-}
-
-void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
-{
- struct bch_sb_field_disk_groups *groups =
- bch2_sb_field_get(sb, disk_groups);
- struct bch_disk_group *g;
- unsigned nr = 0;
- u16 path[32];
-
- while (1) {
- if (nr == ARRAY_SIZE(path))
- goto inval;
-
- if (v >= disk_groups_nr(groups))
- goto inval;
-
- g = groups->entries + v;
-
- if (BCH_GROUP_DELETED(g))
- goto inval;
-
- path[nr++] = v;
-
- if (!BCH_GROUP_PARENT(g))
- break;
-
- v = BCH_GROUP_PARENT(g) - 1;
- }
-
- while (nr) {
- v = path[--nr];
- g = groups->entries + v;
-
- prt_printf(out, "%.*s", (int) sizeof(g->label), g->label);
- if (nr)
- prt_printf(out, ".");
- }
- return;
-inval:
- prt_printf(out, "invalid label %u", v);
-}
-
-int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
-{
- lockdep_assert_held(&c->sb_lock);
-
-
- if (!strlen(name) || !strcmp(name, "none")) {
- struct bch_member *mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
- SET_BCH_MEMBER_GROUP(mi, 0);
- } else {
- int v = bch2_disk_path_find_or_create(&c->disk_sb, name);
- if (v < 0)
- return v;
-
- struct bch_member *mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
- SET_BCH_MEMBER_GROUP(mi, v + 1);
- }
-
- return bch2_sb_disk_groups_to_cpu(c);
-}
-
-int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
-{
- int ret;
-
- mutex_lock(&c->sb_lock);
- ret = __bch2_dev_group_set(c, ca, name) ?:
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- return ret;
-}
-
-int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res,
- struct printbuf *err)
-{
- struct bch_dev *ca;
- int g;
-
- if (!val)
- return -EINVAL;
-
- if (!c)
- return -BCH_ERR_option_needs_open_fs;
-
- if (!strlen(val) || !strcmp(val, "none")) {
- *res = 0;
- return 0;
- }
-
- /* Is it a device? */
- ca = bch2_dev_lookup(c, val);
- if (!IS_ERR(ca)) {
- *res = dev_to_target(ca->dev_idx);
- bch2_dev_put(ca);
- return 0;
- }
-
- mutex_lock(&c->sb_lock);
- g = bch2_disk_path_find(&c->disk_sb, val);
- mutex_unlock(&c->sb_lock);
-
- if (g >= 0) {
- *res = group_to_target(g);
- return 0;
- }
-
- return -EINVAL;
-}
-
-void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
-{
- struct target t = target_decode(v);
-
- switch (t.type) {
- case TARGET_NULL:
- prt_printf(out, "none");
- return;
- case TARGET_DEV: {
- out->atomic++;
- guard(rcu)();
- struct bch_dev *ca = t.dev < c->sb.nr_devices
- ? rcu_dereference(c->devs[t.dev])
- : NULL;
-
- if (ca && ca->disk_sb.bdev)
- prt_printf(out, "/dev/%s", ca->name);
- else if (ca)
- prt_printf(out, "offline device %u", t.dev);
- else
- prt_printf(out, "invalid device %u", t.dev);
-
- out->atomic--;
- return;
- }
- case TARGET_GROUP:
- bch2_disk_path_to_text(out, c, t.group);
- return;
- default:
- BUG();
- }
-}
-
-static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
-{
- struct target t = target_decode(v);
-
- switch (t.type) {
- case TARGET_NULL:
- prt_printf(out, "none");
- break;
- case TARGET_DEV: {
- struct bch_member m = bch2_sb_member_get(sb, t.dev);
-
- if (bch2_member_exists(sb, t.dev)) {
- prt_printf(out, "Device ");
- pr_uuid(out, m.uuid.b);
- prt_printf(out, " (%u)", t.dev);
- } else {
- prt_printf(out, "Bad device %u", t.dev);
- }
- break;
- }
- case TARGET_GROUP:
- bch2_disk_path_to_text_sb(out, sb, t.group);
- break;
- default:
- BUG();
- }
-}
-
-void bch2_opt_target_to_text(struct printbuf *out,
- struct bch_fs *c,
- struct bch_sb *sb,
- u64 v)
-{
- if (c)
- bch2_target_to_text(out, c, v);
- else
- bch2_target_to_text_sb(out, sb, v);
-}
diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h
deleted file mode 100644
index 441826fff224..000000000000
--- a/fs/bcachefs/disk_groups.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DISK_GROUPS_H
-#define _BCACHEFS_DISK_GROUPS_H
-
-#include "disk_groups_types.h"
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups;
-
-static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
-{
- return groups
- ? (vstruct_end(&groups->field) -
- (void *) &groups->entries[0]) / sizeof(struct bch_disk_group)
- : 0;
-}
-
-struct target {
- enum {
- TARGET_NULL,
- TARGET_DEV,
- TARGET_GROUP,
- } type;
- union {
- unsigned dev;
- unsigned group;
- };
-};
-
-#define TARGET_DEV_START 1
-#define TARGET_GROUP_START (256 + TARGET_DEV_START)
-
-static inline u16 dev_to_target(unsigned dev)
-{
- return TARGET_DEV_START + dev;
-}
-
-static inline u16 group_to_target(unsigned group)
-{
- return TARGET_GROUP_START + group;
-}
-
-static inline struct target target_decode(unsigned target)
-{
- if (target >= TARGET_GROUP_START)
- return (struct target) {
- .type = TARGET_GROUP,
- .group = target - TARGET_GROUP_START
- };
-
- if (target >= TARGET_DEV_START)
- return (struct target) {
- .type = TARGET_DEV,
- .group = target - TARGET_DEV_START
- };
-
- return (struct target) { .type = TARGET_NULL };
-}
-
-const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
-
-static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c,
- enum bch_data_type data_type,
- u16 target)
-{
- struct bch_devs_mask devs = c->rw_devs[data_type];
- const struct bch_devs_mask *t = bch2_target_to_mask(c, target);
-
- if (t)
- bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX);
- return devs;
-}
-
-static inline bool bch2_target_accepts_data(struct bch_fs *c,
- enum bch_data_type data_type,
- u16 target)
-{
- struct bch_devs_mask rw_devs = target_rw_devs(c, data_type, target);
- return !bitmap_empty(rw_devs.d, BCH_SB_MEMBERS_MAX);
-}
-
-bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned);
-
-int bch2_disk_path_find(struct bch_sb_handle *, const char *);
-
-/* Exported for userspace bcachefs-tools: */
-int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
-
-void bch2_disk_path_to_text(struct printbuf *, struct bch_fs *, unsigned);
-void bch2_disk_path_to_text_sb(struct printbuf *, struct bch_sb *, unsigned);
-
-void bch2_target_to_text(struct printbuf *out, struct bch_fs *, unsigned);
-
-int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
-void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
-
-#define bch2_opt_target (struct bch_opt_fn) { \
- .parse = bch2_opt_target_parse, \
- .to_text = bch2_opt_target_to_text, \
-}
-
-int bch2_sb_disk_groups_to_cpu(struct bch_fs *);
-
-int __bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
-int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
-
-const char *bch2_sb_validate_disk_groups(struct bch_sb *,
- struct bch_sb_field *);
-
-void bch2_disk_groups_to_text(struct printbuf *, struct bch_fs *);
-
-#endif /* _BCACHEFS_DISK_GROUPS_H */
diff --git a/fs/bcachefs/disk_groups_format.h b/fs/bcachefs/disk_groups_format.h
deleted file mode 100644
index 698990bbf1d2..000000000000
--- a/fs/bcachefs/disk_groups_format.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DISK_GROUPS_FORMAT_H
-#define _BCACHEFS_DISK_GROUPS_FORMAT_H
-
-#define BCH_SB_LABEL_SIZE 32
-
-struct bch_disk_group {
- __u8 label[BCH_SB_LABEL_SIZE];
- __le64 flags[2];
-} __packed __aligned(8);
-
-LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1)
-LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6)
-LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24)
-
-struct bch_sb_field_disk_groups {
- struct bch_sb_field field;
- struct bch_disk_group entries[];
-} __packed __aligned(8);
-
-#endif /* _BCACHEFS_DISK_GROUPS_FORMAT_H */
diff --git a/fs/bcachefs/disk_groups_types.h b/fs/bcachefs/disk_groups_types.h
deleted file mode 100644
index a54ef085b13d..000000000000
--- a/fs/bcachefs/disk_groups_types.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DISK_GROUPS_TYPES_H
-#define _BCACHEFS_DISK_GROUPS_TYPES_H
-
-struct bch_disk_group_cpu {
- bool deleted;
- u16 parent;
- u8 label[BCH_SB_LABEL_SIZE];
- struct bch_devs_mask devs;
-};
-
-struct bch_disk_groups_cpu {
- struct rcu_head rcu;
- unsigned nr;
- struct bch_disk_group_cpu entries[] __counted_by(nr);
-};
-
-#endif /* _BCACHEFS_DISK_GROUPS_TYPES_H */
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
deleted file mode 100644
index 543dbba9b14f..000000000000
--- a/fs/bcachefs/ec.c
+++ /dev/null
@@ -1,2405 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/* erasure coding */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "bkey_buf.h"
-#include "bset.h"
-#include "btree_gc.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "disk_accounting.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "io_read.h"
-#include "io_write.h"
-#include "keylist.h"
-#include "lru.h"
-#include "recovery.h"
-#include "replicas.h"
-#include "super-io.h"
-#include "util.h"
-
-#include <linux/sort.h>
-#include <linux/string_choices.h>
-
-#ifdef __KERNEL__
-
-#include <linux/raid/pq.h>
-#include <linux/raid/xor.h>
-
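-/*
- * Reconstruct the block at @failed_idx as the XOR of all the other blocks:
- * standard RAID5-style recovery, also used by raid_gen() below to compute
- * the parity block.
- */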
-static void raid5_recov(unsigned disks, unsigned failed_idx,
- size_t size, void **data)
-{
- unsigned i = 2, nr;
-
- BUG_ON(failed_idx >= disks);
-
- swap(data[0], data[failed_idx]);
- memcpy(data[0], data[1], size);
-
- while (i < disks) {
- nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
- xor_blocks(nr, size, data[0], data + i);
- i += nr;
- }
-
- swap(data[0], data[failed_idx]);
-}
-
-static void raid_gen(int nd, int np, size_t size, void **v)
-{
- if (np >= 1)
- raid5_recov(nd + np, nd, size, v);
- if (np >= 2)
- raid6_call.gen_syndrome(nd + np, size, v);
- BUG_ON(np > 2);
-}
-
-static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
-{
- switch (nr) {
- case 0:
- break;
- case 1:
- if (ir[0] < nd + 1)
- raid5_recov(nd + 1, ir[0], size, v);
- else
- raid6_call.gen_syndrome(nd + np, size, v);
- break;
- case 2:
- if (ir[1] < nd) {
- /* data+data failure. */
- raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
- } else if (ir[0] < nd) {
- /* data + p/q failure */
-
- if (ir[1] == nd) /* data + p failure */
- raid6_datap_recov(nd + np, size, ir[0], v);
- else { /* data + q failure */
- raid5_recov(nd + 1, ir[0], size, v);
- raid6_call.gen_syndrome(nd + np, size, v);
- }
- } else {
- raid_gen(nd, np, size, v);
- }
- break;
- default:
- BUG();
- }
-}
-
-#else
-
-#include <raid/raid.h>
-
-#endif
-
-struct ec_bio {
- struct bch_dev *ca;
- struct ec_stripe_buf *buf;
- size_t idx;
- int rw;
- u64 submit_time;
- struct bio bio;
-};
-
-/* Stripes btree keys: */
-
-int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
- int ret = 0;
-
- bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) ||
- bpos_gt(k.k->p, POS(0, U32_MAX)),
- c, stripe_pos_bad,
- "stripe at bad pos");
-
- bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s),
- c, stripe_val_size_bad,
- "incorrect value size (%zu < %u)",
- bkey_val_u64s(k.k), stripe_val_u64s(s));
-
- bkey_fsck_err_on(s->csum_granularity_bits >= 64,
- c, stripe_csum_granularity_bad,
- "invalid csum granularity (%u >= 64)",
- s->csum_granularity_bits);
-
- ret = bch2_bkey_ptrs_validate(c, k, from);
-fsck_err:
- return ret;
-}
-
-void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v;
- struct bch_stripe s = {};
-
- memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k)));
-
- unsigned nr_data = s.nr_blocks - s.nr_redundant;
-
- prt_printf(out, "algo %u sectors %u blocks %u:%u csum ",
- s.algorithm,
- le16_to_cpu(s.sectors),
- nr_data,
- s.nr_redundant);
- bch2_prt_csum_type(out, s.csum_type);
- prt_str(out, " gran ");
- if (s.csum_granularity_bits < 64)
- prt_printf(out, "%llu", 1ULL << s.csum_granularity_bits);
- else
- prt_printf(out, "(invalid shift %u)", s.csum_granularity_bits);
-
- if (s.disk_label) {
- prt_str(out, " label");
- bch2_disk_path_to_text(out, c, s.disk_label - 1);
- }
-
- for (unsigned i = 0; i < s.nr_blocks; i++) {
- const struct bch_extent_ptr *ptr = sp->ptrs + i;
-
- if ((void *) ptr >= bkey_val_end(k))
- break;
-
- prt_char(out, ' ');
- bch2_extent_ptr_to_text(out, c, ptr);
-
- if (s.csum_type < BCH_CSUM_NR &&
- i < nr_data &&
- stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k))
- prt_printf(out, "#%u", stripe_blockcount_get(sp, i));
- }
-}
-
-/* Triggers: */
-
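-/*
- * Update the alloc info for the bucket backing one block of a stripe: sanity
- * check the current state, account parity sectors, and set or clear the
- * bucket's stripe back-reference depending on whether the stripe is being
- * created or deleted.
- */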
-static int __mark_stripe_bucket(struct btree_trans *trans,
- struct bch_dev *ca,
- struct bkey_s_c_stripe s,
- unsigned ptr_idx, bool deleting,
- struct bpos bucket,
- struct bch_alloc_v4 *a,
- enum btree_iter_update_trigger_flags flags)
-{
- const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
- unsigned nr_data = s.v->nr_blocks - s.v->nr_redundant;
- bool parity = ptr_idx >= nr_data;
- enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
- s64 sectors = parity ? le16_to_cpu(s.v->sectors) : 0;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- struct bch_fs *c = trans->c;
- if (deleting)
- sectors = -sectors;
-
- if (!deleting) {
- if (bch2_trans_inconsistent_on(a->stripe ||
- a->stripe_redundancy, trans,
- "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)\n%s",
- bucket.inode, bucket.offset, a->gen,
- bch2_data_type_str(a->data_type),
- a->dirty_sectors,
- a->stripe, s.k->p.offset,
- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = bch_err_throw(c, mark_stripe);
- goto err;
- }
-
- if (bch2_trans_inconsistent_on(parity && bch2_bucket_sectors_total(*a), trans,
- "bucket %llu:%llu gen %u data type %s dirty_sectors %u cached_sectors %u: data already in parity bucket\n%s",
- bucket.inode, bucket.offset, a->gen,
- bch2_data_type_str(a->data_type),
- a->dirty_sectors,
- a->cached_sectors,
- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = bch_err_throw(c, mark_stripe);
- goto err;
- }
- } else {
- if (bch2_trans_inconsistent_on(a->stripe != s.k->p.offset ||
- a->stripe_redundancy != s.v->nr_redundant, trans,
- "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe (got %u)\n%s",
- bucket.inode, bucket.offset, a->gen,
- a->stripe,
- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = bch_err_throw(c, mark_stripe);
- goto err;
- }
-
- if (bch2_trans_inconsistent_on(a->data_type != data_type, trans,
- "bucket %llu:%llu gen %u data type %s: wrong data type when stripe, should be %s\n%s",
- bucket.inode, bucket.offset, a->gen,
- bch2_data_type_str(a->data_type),
- bch2_data_type_str(data_type),
- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = bch_err_throw(c, mark_stripe);
- goto err;
- }
-
- if (bch2_trans_inconsistent_on(parity &&
- (a->dirty_sectors != -sectors ||
- a->cached_sectors), trans,
- "bucket %llu:%llu gen %u dirty_sectors %u cached_sectors %u: wrong sectors when deleting parity block of stripe\n%s",
- bucket.inode, bucket.offset, a->gen,
- a->dirty_sectors,
- a->cached_sectors,
- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = bch_err_throw(c, mark_stripe);
- goto err;
- }
- }
-
- if (sectors) {
- ret = bch2_bucket_ref_update(trans, ca, s.s_c, ptr, sectors, data_type,
- a->gen, a->data_type, &a->dirty_sectors);
- if (ret)
- goto err;
- }
-
- if (!deleting) {
- a->stripe = s.k->p.offset;
- a->stripe_redundancy = s.v->nr_redundant;
- alloc_data_type_set(a, data_type);
- } else {
- a->stripe = 0;
- a->stripe_redundancy = 0;
- alloc_data_type_set(a, BCH_DATA_user);
- }
-err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static int mark_stripe_bucket(struct btree_trans *trans,
- struct bkey_s_c_stripe s,
- unsigned ptr_idx, bool deleting,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev);
- if (unlikely(!ca)) {
- if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite))
- ret = bch_err_throw(c, mark_stripe);
- goto err;
- }
-
- struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
-
- if (flags & BTREE_TRIGGER_transactional) {
- struct extent_ptr_decoded p = {
- .ptr = *ptr,
- .crc = bch2_extent_crc_unpack(s.k, NULL),
- };
- struct bkey_i_backpointer bp;
- bch2_extent_ptr_to_bp(c, BTREE_ID_stripes, 0, s.s_c, p,
- (const union bch_extent_entry *) ptr, &bp);
-
- struct bkey_i_alloc_v4 *a =
- bch2_trans_start_alloc_update(trans, bucket, 0);
- ret = PTR_ERR_OR_ZERO(a) ?:
- __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags) ?:
- bch2_bucket_backpointer_mod(trans, s.s_c, &bp,
- !(flags & BTREE_TRIGGER_overwrite));
- if (ret)
- goto err;
- }
-
- if (flags & BTREE_TRIGGER_gc) {
- struct bucket *g = gc_bucket(ca, bucket.offset);
- if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n%s",
- ptr->dev,
- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = bch_err_throw(c, mark_stripe);
- goto err;
- }
-
- bucket_lock(g);
- struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
- ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags);
- alloc_to_bucket(g, new);
- bucket_unlock(g);
-
- if (!ret)
- ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
- }
-err:
- bch2_dev_put(ca);
- printbuf_exit(&buf);
- return ret;
-}
-
-static int mark_stripe_buckets(struct btree_trans *trans,
- struct bkey_s_c old, struct bkey_s_c new,
- enum btree_iter_update_trigger_flags flags)
-{
- const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
- ? bkey_s_c_to_stripe(old).v : NULL;
- const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
- ? bkey_s_c_to_stripe(new).v : NULL;
-
- BUG_ON(old_s && new_s && old_s->nr_blocks != new_s->nr_blocks);
-
- unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
-
- for (unsigned i = 0; i < nr_blocks; i++) {
- if (new_s && old_s &&
- !memcmp(&new_s->ptrs[i],
- &old_s->ptrs[i],
- sizeof(new_s->ptrs[i])))
- continue;
-
- if (new_s) {
- int ret = mark_stripe_bucket(trans,
- bkey_s_c_to_stripe(new), i, false, flags);
- if (ret)
- return ret;
- }
-
- if (old_s) {
- int ret = mark_stripe_bucket(trans,
- bkey_s_c_to_stripe(old), i, true, flags);
- if (ret)
- return ret;
- }
- }
-
- return 0;
-}
-
-int bch2_trigger_stripe(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_s _new,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bkey_s_c new = _new.s_c;
- struct bch_fs *c = trans->c;
- u64 idx = new.k->p.offset;
- const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
- ? bkey_s_c_to_stripe(old).v : NULL;
- const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
- ? bkey_s_c_to_stripe(new).v : NULL;
-
- if (unlikely(flags & BTREE_TRIGGER_check_repair))
- return bch2_check_fix_ptrs(trans, btree, level, _new.s_c, flags);
-
- BUG_ON(new_s && old_s &&
- (new_s->nr_blocks != old_s->nr_blocks ||
- new_s->nr_redundant != old_s->nr_redundant));
-
- if (flags & BTREE_TRIGGER_transactional) {
- int ret = bch2_lru_change(trans,
- BCH_LRU_STRIPE_FRAGMENTATION,
- idx,
- stripe_lru_pos(old_s),
- stripe_lru_pos(new_s));
- if (ret)
- return ret;
- }
-
- if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
- /*
- * If the pointers aren't changing, we don't need to do anything:
- */
- if (new_s && old_s &&
- new_s->nr_blocks == old_s->nr_blocks &&
- new_s->nr_redundant == old_s->nr_redundant &&
- !memcmp(old_s->ptrs, new_s->ptrs,
- new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
- return 0;
-
- struct gc_stripe *gc = NULL;
- if (flags & BTREE_TRIGGER_gc) {
- gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
- if (!gc) {
- bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx);
- return bch_err_throw(c, ENOMEM_mark_stripe);
- }
-
- /*
- * This will be wrong when we bring back runtime gc: we should
- * be unmarking the old key and then marking the new key
- *
- * Also: when we bring back runtime gc, locking
- */
- gc->alive = true;
- gc->sectors = le16_to_cpu(new_s->sectors);
- gc->nr_blocks = new_s->nr_blocks;
- gc->nr_redundant = new_s->nr_redundant;
-
- for (unsigned i = 0; i < new_s->nr_blocks; i++)
- gc->ptrs[i] = new_s->ptrs[i];
-
- /*
- * gc recalculates this field from stripe ptr
- * references:
- */
- memset(gc->block_sectors, 0, sizeof(gc->block_sectors));
- }
-
- if (new_s) {
- s64 sectors = (u64) le16_to_cpu(new_s->sectors) * new_s->nr_redundant;
-
- struct disk_accounting_pos acc;
- memset(&acc, 0, sizeof(acc));
- acc.type = BCH_DISK_ACCOUNTING_replicas;
- bch2_bkey_to_replicas(&acc.replicas, new);
- int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
- if (ret)
- return ret;
-
- if (gc)
- unsafe_memcpy(&gc->r.e, &acc.replicas,
- replicas_entry_bytes(&acc.replicas), "VLA");
- }
-
- if (old_s) {
- s64 sectors = -((s64) le16_to_cpu(old_s->sectors)) * old_s->nr_redundant;
-
- struct disk_accounting_pos acc;
- memset(&acc, 0, sizeof(acc));
- acc.type = BCH_DISK_ACCOUNTING_replicas;
- bch2_bkey_to_replicas(&acc.replicas, old);
- int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
- if (ret)
- return ret;
- }
-
- int ret = mark_stripe_buckets(trans, old, new, flags);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/* returns blocknr in stripe that we matched: */
-static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s,
- struct bkey_s_c k, unsigned *block)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
-
- bkey_for_each_ptr(ptrs, ptr)
- for (i = 0; i < nr_data; i++)
- if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr,
- le16_to_cpu(s->sectors))) {
- *block = i;
- return ptr;
- }
-
- return NULL;
-}
-
-static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
-
- bkey_extent_entry_for_each(ptrs, entry)
- if (extent_entry_type(entry) ==
- BCH_EXTENT_ENTRY_stripe_ptr &&
- entry->stripe_ptr.idx == idx)
- return true;
-
- return false;
-}
-
-/* Stripe bufs: */
-
-static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
-{
- if (buf->key.k.type == KEY_TYPE_stripe) {
- struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key);
- unsigned i;
-
- for (i = 0; i < s->v.nr_blocks; i++) {
- kvfree(buf->data[i]);
- buf->data[i] = NULL;
- }
- }
-}
-
-/* XXX: this is a non-mempoolified memory allocation: */
-static int ec_stripe_buf_init(struct bch_fs *c,
- struct ec_stripe_buf *buf,
- unsigned offset, unsigned size)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
- unsigned csum_granularity = 1U << v->csum_granularity_bits;
- unsigned end = offset + size;
- unsigned i;
-
- BUG_ON(end > le16_to_cpu(v->sectors));
-
- offset = round_down(offset, csum_granularity);
- end = min_t(unsigned, le16_to_cpu(v->sectors),
- round_up(end, csum_granularity));
-
- buf->offset = offset;
- buf->size = end - offset;
-
- memset(buf->valid, 0xFF, sizeof(buf->valid));
-
- for (i = 0; i < v->nr_blocks; i++) {
- buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL);
- if (!buf->data[i])
- goto err;
- }
-
- return 0;
-err:
- ec_stripe_buf_exit(buf);
- return bch_err_throw(c, ENOMEM_stripe_buf);
-}
-
-/* Checksumming: */
-
-static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf,
- unsigned block, unsigned offset)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
- unsigned csum_granularity = 1 << v->csum_granularity_bits;
- unsigned end = buf->offset + buf->size;
- unsigned len = min(csum_granularity, end - offset);
-
- BUG_ON(offset >= end);
- BUG_ON(offset < buf->offset);
- BUG_ON(offset & (csum_granularity - 1));
- BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
- (len & (csum_granularity - 1)));
-
- return bch2_checksum(NULL, v->csum_type,
- null_nonce(),
- buf->data[block] + ((offset - buf->offset) << 9),
- len << 9);
-}
-
-static void ec_generate_checksums(struct ec_stripe_buf *buf)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
- unsigned i, j, csums_per_device = stripe_csums_per_device(v);
-
- if (!v->csum_type)
- return;
-
- BUG_ON(buf->offset);
- BUG_ON(buf->size != le16_to_cpu(v->sectors));
-
- for (i = 0; i < v->nr_blocks; i++)
- for (j = 0; j < csums_per_device; j++)
- stripe_csum_set(v, i, j,
- ec_block_checksum(buf, i, j << v->csum_granularity_bits));
-}
-
-static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
- unsigned csum_granularity = 1 << v->csum_granularity_bits;
- unsigned i;
-
- if (!v->csum_type)
- return;
-
- for (i = 0; i < v->nr_blocks; i++) {
- unsigned offset = buf->offset;
- unsigned end = buf->offset + buf->size;
-
- if (!test_bit(i, buf->valid))
- continue;
-
- while (offset < end) {
- unsigned j = offset >> v->csum_granularity_bits;
- unsigned len = min(csum_granularity, end - offset);
- struct bch_csum want = stripe_csum_get(v, i, j);
- struct bch_csum got = ec_block_checksum(buf, i, offset);
-
- if (bch2_crc_cmp(want, got)) {
- struct bch_dev *ca = bch2_dev_tryget(c, v->ptrs[i].dev);
- if (ca) {
- struct printbuf err = PRINTBUF;
-
- prt_str(&err, "stripe ");
- bch2_csum_err_msg(&err, v->csum_type, want, got);
- prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i);
- bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
- bch_err_ratelimited(ca, "%s", err.buf);
- printbuf_exit(&err);
-
- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
- }
-
- clear_bit(i, buf->valid);
- break;
- }
-
- offset += len;
- }
- }
-}
-
-/* Erasure coding: */
-
-static void ec_generate_ec(struct ec_stripe_buf *buf)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
- unsigned nr_data = v->nr_blocks - v->nr_redundant;
- unsigned bytes = le16_to_cpu(v->sectors) << 9;
-
- raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
-}
-
-static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
-
- return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks);
-}
-
-static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
- unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0;
- unsigned nr_data = v->nr_blocks - v->nr_redundant;
- unsigned bytes = buf->size << 9;
-
- if (ec_nr_failed(buf) > v->nr_redundant) {
- bch_err_ratelimited(c,
- "error doing reconstruct read: unable to read enough blocks");
- return -1;
- }
-
- for (i = 0; i < nr_data; i++)
- if (!test_bit(i, buf->valid))
- failed[nr_failed++] = i;
-
- raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
- return 0;
-}
-
-/* IO: */
-
-static void ec_block_endio(struct bio *bio)
-{
- struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
- struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v;
- struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
- struct bch_dev *ca = ec_bio->ca;
- struct closure *cl = bio->bi_private;
- int rw = ec_bio->rw;
- unsigned ref = rw == READ
- ? BCH_DEV_READ_REF_ec_block
- : BCH_DEV_WRITE_REF_ec_block;
-
- bch2_account_io_completion(ca, bio_data_dir(bio),
- ec_bio->submit_time, !bio->bi_status);
-
- if (bio->bi_status) {
- bch_err_dev_ratelimited(ca, "erasure coding %s error: %s",
- str_write_read(bio_data_dir(bio)),
- bch2_blk_status_to_str(bio->bi_status));
- clear_bit(ec_bio->idx, ec_bio->buf->valid);
- }
-
- int stale = dev_ptr_stale(ca, ptr);
- if (stale) {
- bch_err_ratelimited(ca->fs,
- "error %s stripe: stale/invalid pointer (%i) after io",
- bio_data_dir(bio) == READ ? "reading from" : "writing to",
- stale);
- clear_bit(ec_bio->idx, ec_bio->buf->valid);
- }
-
- bio_put(&ec_bio->bio);
- enumerated_ref_put(&ca->io_ref[rw], ref);
- closure_put(cl);
-}
-
-static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
- blk_opf_t opf, unsigned idx, struct closure *cl)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
- unsigned offset = 0, bytes = buf->size << 9;
- struct bch_extent_ptr *ptr = &v->ptrs[idx];
- enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant
- ? BCH_DATA_user
- : BCH_DATA_parity;
- int rw = op_is_write(opf);
- unsigned ref = rw == READ
- ? BCH_DEV_READ_REF_ec_block
- : BCH_DEV_WRITE_REF_ec_block;
-
- struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw, ref);
- if (!ca) {
- clear_bit(idx, buf->valid);
- return;
- }
-
- int stale = dev_ptr_stale(ca, ptr);
- if (stale) {
- bch_err_ratelimited(c,
- "error %s stripe: stale pointer (%i)",
- rw == READ ? "reading from" : "writing to",
- stale);
- clear_bit(idx, buf->valid);
- return;
- }
-
-
- this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);
-
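-	/*
-	 * The block may be larger than a single bio can cover: split it into
-	 * bios of at most BIO_MAX_VECS pages, each holding a ref on the
-	 * closure and on the device's io_ref until it completes.
-	 */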
- while (offset < bytes) {
- unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS,
- DIV_ROUND_UP(bytes, PAGE_SIZE));
- unsigned b = min_t(size_t, bytes - offset,
- nr_iovecs << PAGE_SHIFT);
- struct ec_bio *ec_bio;
-
- ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev,
- nr_iovecs,
- opf,
- GFP_KERNEL,
- &c->ec_bioset),
- struct ec_bio, bio);
-
- ec_bio->ca = ca;
- ec_bio->buf = buf;
- ec_bio->idx = idx;
- ec_bio->rw = rw;
- ec_bio->submit_time = local_clock();
-
- ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9);
- ec_bio->bio.bi_end_io = ec_block_endio;
- ec_bio->bio.bi_private = cl;
-
- bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);
-
- closure_get(cl);
- enumerated_ref_get(&ca->io_ref[rw], ref);
-
- submit_bio(&ec_bio->bio);
-
- offset += b;
- }
-
- enumerated_ref_put(&ca->io_ref[rw], ref);
-}
-
-static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
- struct ec_stripe_buf *stripe)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
- POS(0, idx), BTREE_ITER_slots);
- ret = bkey_err(k);
- if (ret)
- goto err;
- if (k.k->type != KEY_TYPE_stripe) {
- ret = -ENOENT;
- goto err;
- }
- bkey_reassemble(&stripe->key, k);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/* recovery read path: */
-int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
- struct bkey_s_c orig_k)
-{
- struct bch_fs *c = trans->c;
- struct ec_stripe_buf *buf = NULL;
- struct closure cl;
- struct bch_stripe *v;
- unsigned i, offset;
- const char *msg = NULL;
- struct printbuf msgbuf = PRINTBUF;
- int ret = 0;
-
- closure_init_stack(&cl);
-
- BUG_ON(!rbio->pick.has_ec);
-
- buf = kzalloc(sizeof(*buf), GFP_NOFS);
- if (!buf)
- return bch_err_throw(c, ENOMEM_ec_read_extent);
-
- ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
- if (ret) {
- msg = "stripe not found";
- goto err;
- }
-
- v = &bkey_i_to_stripe(&buf->key)->v;
-
- if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
- msg = "pointer doesn't match stripe";
- goto err;
- }
-
- offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
- if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
- msg = "read is bigger than stripe";
- goto err;
- }
-
- ret = ec_stripe_buf_init(c, buf, offset, bio_sectors(&rbio->bio));
- if (ret) {
- msg = "-ENOMEM";
- goto err;
- }
-
- for (i = 0; i < v->nr_blocks; i++)
- ec_block_io(c, buf, REQ_OP_READ, i, &cl);
-
- closure_sync(&cl);
-
- if (ec_nr_failed(buf) > v->nr_redundant) {
- msg = "unable to read enough blocks";
- goto err;
- }
-
- ec_validate_checksums(c, buf);
-
- ret = ec_do_recov(c, buf);
- if (ret)
- goto err;
-
- memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
- buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
-out:
- ec_stripe_buf_exit(buf);
- kfree(buf);
- return ret;
-err:
- bch2_bkey_val_to_text(&msgbuf, c, orig_k);
- bch_err_ratelimited(c,
- "error doing reconstruct read: %s\n %s", msg, msgbuf.buf);
- printbuf_exit(&msgbuf);
- ret = bch_err_throw(c, stripe_reconstruct);
- goto out;
-}
-
-/* stripe bucket accounting: */
-
-static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
-{
- if (c->gc_pos.phase != GC_PHASE_not_running &&
- !genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
- return bch_err_throw(c, ENOMEM_ec_stripe_mem_alloc);
-
- return 0;
-}
-
-static int ec_stripe_mem_alloc(struct btree_trans *trans,
- struct btree_iter *iter)
-{
- return allocate_dropping_locks_errcode(trans,
- __ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp));
-}
-
-/*
- * Hash table of open stripes:
- * Stripes that are being created or modified are kept in a hash table, so that
- * stripe deletion can skip them.
- */
-
-static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx)
-{
- unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
- struct ec_stripe_new *s;
-
- hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash)
- if (s->idx == idx)
- return true;
- return false;
-}
-
-static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx)
-{
- bool ret = false;
-
- spin_lock(&c->ec_stripes_new_lock);
- ret = __bch2_stripe_is_open(c, idx);
- spin_unlock(&c->ec_stripes_new_lock);
-
- return ret;
-}
-
-static bool bch2_try_open_stripe(struct bch_fs *c,
- struct ec_stripe_new *s,
- u64 idx)
-{
- bool ret;
-
- spin_lock(&c->ec_stripes_new_lock);
- ret = !__bch2_stripe_is_open(c, idx);
- if (ret) {
- unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
-
- s->idx = idx;
- hlist_add_head(&s->hash, &c->ec_stripes_new[hash]);
- }
- spin_unlock(&c->ec_stripes_new_lock);
-
- return ret;
-}
-
-static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
-{
- BUG_ON(!s->idx);
-
- spin_lock(&c->ec_stripes_new_lock);
- hlist_del_init(&s->hash);
- spin_unlock(&c->ec_stripes_new_lock);
-
- s->idx = 0;
-}
-
-/* stripe deletion */
-
-static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
- BTREE_ID_stripes, POS(0, idx),
- BTREE_ITER_intent);
- int ret = bkey_err(k);
- if (ret)
- goto err;
-
- /*
- * We expect write buffer races here
- * Important: check stripe_is_open with stripe key locked:
- */
- if (k.k->type == KEY_TYPE_stripe &&
- !bch2_stripe_is_open(trans->c, idx) &&
- stripe_lru_pos(bkey_s_c_to_stripe(k).v) == 1)
- ret = bch2_btree_delete_at(trans, &iter, 0);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/*
- * XXX
- * can we kill this and delete stripes from the trigger?
- */
-static void ec_stripe_delete_work(struct work_struct *work)
-{
- struct bch_fs *c =
- container_of(work, struct bch_fs, ec_stripe_delete_work);
-
- bch2_trans_run(c,
- bch2_btree_write_buffer_tryflush(trans) ?:
- for_each_btree_key_max_commit(trans, lru_iter, BTREE_ID_lru,
- lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, 0),
- lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, LRU_TIME_MAX),
- 0, lru_k,
- NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc, ({
- ec_stripe_delete(trans, lru_k.k->p.offset);
- })));
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete);
-}
-
-void bch2_do_stripe_deletes(struct bch_fs *c)
-{
- if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_stripe_delete) &&
- !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work))
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete);
-}
-
-/* stripe creation: */
-
-static int ec_stripe_key_update(struct btree_trans *trans,
- struct bkey_i_stripe *old,
- struct bkey_i_stripe *new)
-{
- struct bch_fs *c = trans->c;
- bool create = !old;
-
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
- new->k.p, BTREE_ITER_intent);
- int ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (bch2_fs_inconsistent_on(k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe),
- c, "error %s stripe: got existing key type %s",
- create ? "creating" : "updating",
- bch2_bkey_types[k.k->type])) {
- ret = -EINVAL;
- goto err;
- }
-
- if (k.k->type == KEY_TYPE_stripe) {
- const struct bch_stripe *v = bkey_s_c_to_stripe(k).v;
-
- BUG_ON(old->v.nr_blocks != new->v.nr_blocks);
- BUG_ON(old->v.nr_blocks != v->nr_blocks);
-
- for (unsigned i = 0; i < new->v.nr_blocks; i++) {
- unsigned sectors = stripe_blockcount_get(v, i);
-
- if (!bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]) && sectors) {
- struct printbuf buf = PRINTBUF;
-
- prt_printf(&buf, "stripe changed nonempty block %u", i);
- prt_str(&buf, "\nold: ");
- bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, "\nnew: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new->k_i));
- bch2_fs_inconsistent(c, "%s", buf.buf);
- printbuf_exit(&buf);
- ret = -EINVAL;
- goto err;
- }
-
- /*
- * If the stripe ptr changed underneath us, it must have
-		 * been dev_remove_stripes() -> invalidate_stripe_to_dev()
- */
- if (!bch2_extent_ptr_eq(old->v.ptrs[i], v->ptrs[i])) {
- BUG_ON(v->ptrs[i].dev != BCH_SB_MEMBER_INVALID);
-
- if (bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]))
- new->v.ptrs[i].dev = BCH_SB_MEMBER_INVALID;
- }
-
- stripe_blockcount_set(&new->v, i, sectors);
- }
- }
-
- ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int ec_stripe_update_extent(struct btree_trans *trans,
- struct bch_dev *ca,
- struct bpos bucket, u8 gen,
- struct ec_stripe_buf *s,
- struct bkey_s_c_backpointer bp,
- struct bkey_buf *last_flushed)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- const struct bch_extent_ptr *ptr_c;
- struct bch_extent_ptr *ec_ptr = NULL;
- struct bch_extent_stripe_ptr stripe_ptr;
- struct bkey_i *n;
- int ret, dev, block;
-
- if (bp.v->level) {
- struct printbuf buf = PRINTBUF;
- struct btree_iter node_iter;
- struct btree *b;
-
- b = bch2_backpointer_get_node(trans, bp, &node_iter, last_flushed);
- bch2_trans_iter_exit(trans, &node_iter);
-
- if (!b)
- return 0;
-
- prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b);
- bch2_bkey_val_to_text(&buf, c, bp.s_c);
-
- bch2_fs_inconsistent(c, "%s", buf.buf);
- printbuf_exit(&buf);
- return bch_err_throw(c, erasure_coding_found_btree_node);
- }
-
- k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed);
- ret = bkey_err(k);
- if (ret)
- return ret;
- if (!k.k) {
- /*
- * extent no longer exists - we could flush the btree
- * write buffer and retry to verify, but no need:
- */
- return 0;
- }
-
- if (extent_has_stripe_ptr(k, s->key.k.p.offset))
- goto out;
-
- ptr_c = bkey_matches_stripe(v, k, &block);
- /*
- * It doesn't generally make sense to erasure code cached ptrs:
- * XXX: should we be incrementing a counter?
- */
- if (!ptr_c || ptr_c->cached)
- goto out;
-
- dev = v->ptrs[block].dev;
-
- n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr));
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- goto out;
-
- bkey_reassemble(n, k);
-
- bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, ptr->dev != dev);
- ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
- BUG_ON(!ec_ptr);
-
- stripe_ptr = (struct bch_extent_stripe_ptr) {
- .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
- .block = block,
- .redundancy = v->nr_redundant,
- .idx = s->key.k.p.offset,
- };
-
- __extent_entry_insert(n,
- (union bch_extent_entry *) ec_ptr,
- (union bch_extent_entry *) &stripe_ptr);
-
- ret = bch2_trans_update(trans, &iter, n, 0);
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s,
- unsigned block)
-{
- struct bch_fs *c = trans->c;
- struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
- struct bch_extent_ptr ptr = v->ptrs[block];
- int ret = 0;
-
- struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev);
- if (!ca)
- return bch_err_throw(c, ENOENT_dev_not_found);
-
- struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);
-
- struct bkey_buf last_flushed;
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- ret = for_each_btree_key_max_commit(trans, bp_iter, BTREE_ID_backpointers,
- bucket_pos_to_bp_start(ca, bucket_pos),
- bucket_pos_to_bp_end(ca, bucket_pos), 0, bp_k,
- NULL, NULL,
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc, ({
- if (bkey_ge(bp_k.k->p, bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket_pos), 0)))
- break;
-
- if (bp_k.k->type != KEY_TYPE_backpointer)
- continue;
-
- struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
- if (bp.v->btree_id == BTREE_ID_stripes)
- continue;
-
- ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s,
- bp, &last_flushed);
- }));
-
- bch2_bkey_buf_exit(&last_flushed, c);
- bch2_dev_put(ca);
- return ret;
-}
-
-static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
- unsigned nr_data = v->nr_blocks - v->nr_redundant;
-
- int ret = bch2_btree_write_buffer_flush_sync(trans);
- if (ret)
- goto err;
-
- for (unsigned i = 0; i < nr_data; i++) {
- ret = ec_stripe_update_bucket(trans, s, i);
- if (ret)
- break;
- }
-err:
- bch2_trans_put(trans);
- return ret;
-}
-
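-/*
- * Zero the unwritten tail of a stripe data bucket, both in the in-memory
- * stripe buffer and on disk (via blkdev_issue_zeroout()).
- */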
-static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
- struct ec_stripe_new *s,
- unsigned block,
- struct open_bucket *ob)
-{
- struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE,
- BCH_DEV_WRITE_REF_ec_bucket_zero);
- if (!ca) {
- s->err = bch_err_throw(c, erofs_no_writes);
- return;
- }
-
- unsigned offset = ca->mi.bucket_size - ob->sectors_free;
- memset(s->new_stripe.data[block] + (offset << 9),
- 0,
- ob->sectors_free << 9);
-
- int ret = blkdev_issue_zeroout(ca->disk_sb.bdev,
- ob->bucket * ca->mi.bucket_size + offset,
- ob->sectors_free,
- GFP_KERNEL, 0);
-
- enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_ec_bucket_zero);
-
- if (ret)
- s->err = ret;
-}
-
-void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s)
-{
- if (s->idx)
- bch2_stripe_close(c, s);
- kfree(s);
-}
-
-/*
- * data buckets of new stripe all written: create the stripe
- */
-static void ec_stripe_create(struct ec_stripe_new *s)
-{
- struct bch_fs *c = s->c;
- struct open_bucket *ob;
- struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
- unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
- int ret;
-
- BUG_ON(s->h->s == s);
-
- closure_sync(&s->iodone);
-
- if (!s->err) {
- for (i = 0; i < nr_data; i++)
- if (s->blocks[i]) {
- ob = c->open_buckets + s->blocks[i];
-
- if (ob->sectors_free)
- zero_out_rest_of_ec_bucket(c, s, i, ob);
- }
- }
-
- if (s->err) {
- if (!bch2_err_matches(s->err, EROFS))
- bch_err(c, "error creating stripe: error writing data buckets");
- ret = s->err;
- goto err;
- }
-
- if (s->have_existing_stripe) {
- ec_validate_checksums(c, &s->existing_stripe);
-
- if (ec_do_recov(c, &s->existing_stripe)) {
- bch_err(c, "error creating stripe: error reading existing stripe");
- ret = bch_err_throw(c, ec_block_read);
- goto err;
- }
-
- for (i = 0; i < nr_data; i++)
- if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i))
- swap(s->new_stripe.data[i],
- s->existing_stripe.data[i]);
-
- ec_stripe_buf_exit(&s->existing_stripe);
- }
-
- BUG_ON(!s->allocated);
- BUG_ON(!s->idx);
-
- ec_generate_ec(&s->new_stripe);
-
- ec_generate_checksums(&s->new_stripe);
-
- /* write p/q: */
- for (i = nr_data; i < v->nr_blocks; i++)
- ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone);
- closure_sync(&s->iodone);
-
- if (ec_nr_failed(&s->new_stripe)) {
- bch_err(c, "error creating stripe: error writing redundancy buckets");
- ret = bch_err_throw(c, ec_block_write);
- goto err;
- }
-
- ret = bch2_trans_commit_do(c, &s->res, NULL,
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc,
- ec_stripe_key_update(trans,
- s->have_existing_stripe
- ? bkey_i_to_stripe(&s->existing_stripe.key)
- : NULL,
- bkey_i_to_stripe(&s->new_stripe.key)));
- bch_err_msg(c, ret, "creating stripe key");
- if (ret) {
- goto err;
- }
-
- ret = ec_stripe_update_extents(c, &s->new_stripe);
- bch_err_msg(c, ret, "error updating extents");
- if (ret)
- goto err;
-err:
- trace_stripe_create(c, s->idx, ret);
-
- bch2_disk_reservation_put(c, &s->res);
-
- for (i = 0; i < v->nr_blocks; i++)
- if (s->blocks[i]) {
- ob = c->open_buckets + s->blocks[i];
-
- if (i < nr_data) {
- ob->ec = NULL;
- __bch2_open_bucket_put(c, ob);
- } else {
- bch2_open_bucket_put(c, ob);
- }
- }
-
- mutex_lock(&c->ec_stripe_new_lock);
- list_del(&s->list);
- mutex_unlock(&c->ec_stripe_new_lock);
- wake_up(&c->ec_stripe_new_wait);
-
- ec_stripe_buf_exit(&s->existing_stripe);
- ec_stripe_buf_exit(&s->new_stripe);
- closure_debug_destroy(&s->iodone);
-
- ec_stripe_new_put(c, s, STRIPE_REF_stripe);
-}
-
-static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c)
-{
- struct ec_stripe_new *s;
-
- mutex_lock(&c->ec_stripe_new_lock);
- list_for_each_entry(s, &c->ec_stripe_new_list, list)
- if (!atomic_read(&s->ref[STRIPE_REF_io]))
- goto out;
- s = NULL;
-out:
- mutex_unlock(&c->ec_stripe_new_lock);
-
- return s;
-}
-
-static void ec_stripe_create_work(struct work_struct *work)
-{
- struct bch_fs *c = container_of(work,
- struct bch_fs, ec_stripe_create_work);
- struct ec_stripe_new *s;
-
- while ((s = get_pending_stripe(c)))
- ec_stripe_create(s);
-
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create);
-}
-
-void bch2_ec_do_stripe_creates(struct bch_fs *c)
-{
- enumerated_ref_get(&c->writes, BCH_WRITE_REF_stripe_create);
-
- if (!queue_work(system_long_wq, &c->ec_stripe_create_work))
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create);
-}
-
-static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
-{
- struct ec_stripe_new *s = h->s;
-
- lockdep_assert_held(&h->lock);
-
- BUG_ON(!s->allocated && !s->err);
-
- h->s = NULL;
- s->pending = true;
-
- mutex_lock(&c->ec_stripe_new_lock);
- list_add(&s->list, &c->ec_stripe_new_list);
- mutex_unlock(&c->ec_stripe_new_lock);
-
- ec_stripe_new_put(c, s, STRIPE_REF_io);
-}
-
-static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int err)
-{
- h->s->err = err;
- ec_stripe_new_set_pending(c, h);
-}
-
-void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob, int err)
-{
- struct ec_stripe_new *s = ob->ec;
-
- s->err = err;
-}
-
-void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
-{
- struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
- if (!ob)
- return NULL;
-
- BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]);
-
- struct bch_dev *ca = ob_dev(c, ob);
- unsigned offset = ca->mi.bucket_size - ob->sectors_free;
-
- return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
-}
-
-static int unsigned_cmp(const void *_l, const void *_r)
-{
- unsigned l = *((const unsigned *) _l);
- unsigned r = *((const unsigned *) _r);
-
- return cmp_int(l, r);
-}
-
-/* pick most common bucket size: */
-static unsigned pick_blocksize(struct bch_fs *c,
- struct bch_devs_mask *devs)
-{
- unsigned nr = 0, sizes[BCH_SB_MEMBERS_MAX];
- struct {
- unsigned nr, size;
- } cur = { 0, 0 }, best = { 0, 0 };
-
- for_each_member_device_rcu(c, ca, devs)
- sizes[nr++] = ca->mi.bucket_size;
-
- sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);
-
- for (unsigned i = 0; i < nr; i++) {
- if (sizes[i] != cur.size) {
- if (cur.nr > best.nr)
- best = cur;
-
- cur.nr = 0;
- cur.size = sizes[i];
- }
-
- cur.nr++;
- }
-
- if (cur.nr > best.nr)
- best = cur;
-
- return best.size;
-}
-
-static bool may_create_new_stripe(struct bch_fs *c)
-{
- return false;
-}
-
-static void ec_stripe_key_init(struct bch_fs *c,
- struct bkey_i *k,
- unsigned nr_data,
- unsigned nr_parity,
- unsigned stripe_size,
- unsigned disk_label)
-{
- struct bkey_i_stripe *s = bkey_stripe_init(k);
- unsigned u64s;
-
- s->v.sectors = cpu_to_le16(stripe_size);
- s->v.algorithm = 0;
- s->v.nr_blocks = nr_data + nr_parity;
- s->v.nr_redundant = nr_parity;
- s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9);
- s->v.csum_type = BCH_CSUM_crc32c;
- s->v.disk_label = disk_label;
-
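-	/*
-	 * If the stripe value won't fit in a bkey, coarsen the checksum
-	 * granularity (halving the number of checksums each time) until it
-	 * does:
-	 */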
- while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
- BUG_ON(1 << s->v.csum_granularity_bits >=
- le16_to_cpu(s->v.sectors) ||
- s->v.csum_granularity_bits == U8_MAX);
- s->v.csum_granularity_bits++;
- }
-
- set_bkey_val_u64s(&s->k, u64s);
-}
-
-static struct ec_stripe_new *ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
-{
- struct ec_stripe_new *s;
-
- lockdep_assert_held(&h->lock);
-
- s = kzalloc(sizeof(*s), GFP_KERNEL);
- if (!s)
- return NULL;
-
- mutex_init(&s->lock);
- closure_init(&s->iodone, NULL);
- atomic_set(&s->ref[STRIPE_REF_stripe], 1);
- atomic_set(&s->ref[STRIPE_REF_io], 1);
- s->c = c;
- s->h = h;
- s->nr_data = min_t(unsigned, h->nr_active_devs,
- BCH_BKEY_PTRS_MAX) - h->redundancy;
- s->nr_parity = h->redundancy;
-
- ec_stripe_key_init(c, &s->new_stripe.key,
- s->nr_data, s->nr_parity,
- h->blocksize, h->disk_label);
- return s;
-}
-
-static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h)
-{
- struct bch_devs_mask devs = h->devs;
- unsigned nr_devs, nr_devs_with_durability;
-
- scoped_guard(rcu) {
- h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label
- ? group_to_target(h->disk_label - 1)
- : 0);
- nr_devs = dev_mask_nr(&h->devs);
-
- for_each_member_device_rcu(c, ca, &h->devs)
- if (!ca->mi.durability)
- __clear_bit(ca->dev_idx, h->devs.d);
- nr_devs_with_durability = dev_mask_nr(&h->devs);
-
- h->blocksize = pick_blocksize(c, &h->devs);
-
- h->nr_active_devs = 0;
- for_each_member_device_rcu(c, ca, &h->devs)
- if (ca->mi.bucket_size == h->blocksize)
- h->nr_active_devs++;
- }
-
- /*
- * If we only have redundancy + 1 devices, we're better off with just
- * replication:
- */
- h->insufficient_devs = h->nr_active_devs < h->redundancy + 2;
-
- if (h->insufficient_devs) {
- const char *err;
-
- if (nr_devs < h->redundancy + 2)
- err = NULL;
- else if (nr_devs_with_durability < h->redundancy + 2)
- err = "cannot use durability=0 devices";
- else
- err = "mismatched bucket sizes";
-
- if (err)
- bch_err(c, "insufficient devices available to create stripe (have %u, need %u): %s",
- h->nr_active_devs, h->redundancy + 2, err);
- }
-
- struct bch_devs_mask devs_leaving;
- bitmap_andnot(devs_leaving.d, devs.d, h->devs.d, BCH_SB_MEMBERS_MAX);
-
- if (h->s && !h->s->allocated && dev_mask_nr(&devs_leaving))
- ec_stripe_new_cancel(c, h, -EINTR);
-
- h->rw_devs_change_count = c->rw_devs_change_count;
-}
-
-static struct ec_stripe_head *
-ec_new_stripe_head_alloc(struct bch_fs *c, unsigned disk_label,
- unsigned algo, unsigned redundancy,
- enum bch_watermark watermark)
-{
- struct ec_stripe_head *h;
-
- h = kzalloc(sizeof(*h), GFP_KERNEL);
- if (!h)
- return NULL;
-
- mutex_init(&h->lock);
- BUG_ON(!mutex_trylock(&h->lock));
-
- h->disk_label = disk_label;
- h->algo = algo;
- h->redundancy = redundancy;
- h->watermark = watermark;
-
- list_add(&h->list, &c->ec_stripe_head_list);
- return h;
-}
-
-void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
-{
- if (h->s &&
- h->s->allocated &&
- bitmap_weight(h->s->blocks_allocated,
- h->s->nr_data) == h->s->nr_data)
- ec_stripe_new_set_pending(c, h);
-
- mutex_unlock(&h->lock);
-}
-
-static struct ec_stripe_head *
-__bch2_ec_stripe_head_get(struct btree_trans *trans,
- unsigned disk_label,
- unsigned algo,
- unsigned redundancy,
- enum bch_watermark watermark)
-{
- struct bch_fs *c = trans->c;
- struct ec_stripe_head *h;
- int ret;
-
- if (!redundancy)
- return NULL;
-
- ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock);
- if (ret)
- return ERR_PTR(ret);
-
- if (test_bit(BCH_FS_going_ro, &c->flags)) {
- h = ERR_PTR(-BCH_ERR_erofs_no_writes);
- goto err;
- }
-
- list_for_each_entry(h, &c->ec_stripe_head_list, list)
- if (h->disk_label == disk_label &&
- h->algo == algo &&
- h->redundancy == redundancy &&
- h->watermark == watermark) {
- ret = bch2_trans_mutex_lock(trans, &h->lock);
- if (ret) {
- h = ERR_PTR(ret);
- goto err;
- }
- goto found;
- }
-
- h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark);
- if (!h) {
- h = ERR_PTR(-BCH_ERR_ENOMEM_stripe_head_alloc);
- goto err;
- }
-found:
- if (h->rw_devs_change_count != c->rw_devs_change_count)
- ec_stripe_head_devs_update(c, h);
-
- if (h->insufficient_devs) {
- mutex_unlock(&h->lock);
- h = NULL;
- }
-err:
- mutex_unlock(&c->ec_stripe_head_lock);
- return h;
-}
-
-static int new_stripe_alloc_buckets(struct btree_trans *trans,
- struct alloc_request *req,
- struct ec_stripe_head *h, struct ec_stripe_new *s,
- struct closure *cl)
-{
- struct bch_fs *c = trans->c;
- struct open_bucket *ob;
- struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
- unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
- int ret = 0;
-
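-	/*
-	 * Save the caller's allocation request state; it is restored at the
-	 * end of this function, after allocating parity and data buckets with
-	 * stripe-specific constraints:
-	 */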
- req->scratch_data_type = req->data_type;
- req->scratch_ptrs = req->ptrs;
- req->scratch_nr_replicas = req->nr_replicas;
- req->scratch_nr_effective = req->nr_effective;
- req->scratch_have_cache = req->have_cache;
- req->scratch_devs_may_alloc = req->devs_may_alloc;
-
- req->devs_may_alloc = h->devs;
- req->have_cache = true;
-
- BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity);
- BUG_ON(v->nr_redundant != s->nr_parity);
-
-	/* We bypass the sector allocator which normally does this: */
- bitmap_and(req->devs_may_alloc.d, req->devs_may_alloc.d,
- c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
-
- for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) {
- /*
- * Note: we don't yet repair invalid blocks (failed/removed
- * devices) when reusing stripes - we still need a codepath to
- * walk backpointers and update all extents that point to that
- * block when updating the stripe
- */
- if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID)
- __clear_bit(v->ptrs[i].dev, req->devs_may_alloc.d);
-
- if (i < s->nr_data)
- nr_have_data++;
- else
- nr_have_parity++;
- }
-
- BUG_ON(nr_have_data > s->nr_data);
- BUG_ON(nr_have_parity > s->nr_parity);
-
- req->ptrs.nr = 0;
- if (nr_have_parity < s->nr_parity) {
- req->nr_replicas = s->nr_parity;
- req->nr_effective = nr_have_parity;
- req->data_type = BCH_DATA_parity;
-
- ret = bch2_bucket_alloc_set_trans(trans, req, &h->parity_stripe, cl);
-
- open_bucket_for_each(c, &req->ptrs, ob, i) {
- j = find_next_zero_bit(s->blocks_gotten,
- s->nr_data + s->nr_parity,
- s->nr_data);
- BUG_ON(j >= s->nr_data + s->nr_parity);
-
- s->blocks[j] = req->ptrs.v[i];
- v->ptrs[j] = bch2_ob_ptr(c, ob);
- __set_bit(j, s->blocks_gotten);
- }
-
- if (ret)
- goto err;
- }
-
- req->ptrs.nr = 0;
- if (nr_have_data < s->nr_data) {
- req->nr_replicas = s->nr_data;
- req->nr_effective = nr_have_data;
- req->data_type = BCH_DATA_user;
-
- ret = bch2_bucket_alloc_set_trans(trans, req, &h->block_stripe, cl);
-
- open_bucket_for_each(c, &req->ptrs, ob, i) {
- j = find_next_zero_bit(s->blocks_gotten,
- s->nr_data, 0);
- BUG_ON(j >= s->nr_data);
-
- s->blocks[j] = req->ptrs.v[i];
- v->ptrs[j] = bch2_ob_ptr(c, ob);
- __set_bit(j, s->blocks_gotten);
- }
-
- if (ret)
- goto err;
- }
-err:
- req->data_type = req->scratch_data_type;
- req->ptrs = req->scratch_ptrs;
- req->nr_replicas = req->scratch_nr_replicas;
- req->nr_effective = req->scratch_nr_effective;
- req->have_cache = req->scratch_have_cache;
- req->devs_may_alloc = req->scratch_devs_may_alloc;
- return ret;
-}
-
-static int __get_existing_stripe(struct btree_trans *trans,
- struct ec_stripe_head *head,
- struct ec_stripe_buf *stripe,
- u64 idx)
-{
- struct bch_fs *c = trans->c;
-
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
- BTREE_ID_stripes, POS(0, idx), 0);
- int ret = bkey_err(k);
- if (ret)
- goto err;
-
- /* We expect write buffer races here */
- if (k.k->type != KEY_TYPE_stripe)
- goto out;
-
- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
- if (stripe_lru_pos(s.v) <= 1)
- goto out;
-
- if (s.v->disk_label == head->disk_label &&
- s.v->algorithm == head->algo &&
- s.v->nr_redundant == head->redundancy &&
- le16_to_cpu(s.v->sectors) == head->blocksize &&
- bch2_try_open_stripe(c, head->s, idx)) {
- bkey_reassemble(&stripe->key, k);
- ret = 1;
- }
-out:
- bch2_set_btree_iter_dontneed(trans, &iter);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new *s)
-{
- struct bch_stripe *new_v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
- struct bch_stripe *existing_v = &bkey_i_to_stripe(&s->existing_stripe.key)->v;
- unsigned i;
-
- BUG_ON(existing_v->nr_redundant != s->nr_parity);
- s->nr_data = existing_v->nr_blocks -
- existing_v->nr_redundant;
-
- int ret = ec_stripe_buf_init(c, &s->existing_stripe, 0, le16_to_cpu(existing_v->sectors));
- if (ret) {
- bch2_stripe_close(c, s);
- return ret;
- }
-
- BUG_ON(s->existing_stripe.size != le16_to_cpu(existing_v->sectors));
-
- /*
- * Free buckets we initially allocated - they might conflict with
- * blocks from the stripe we're reusing:
- */
- for_each_set_bit(i, s->blocks_gotten, new_v->nr_blocks) {
- bch2_open_bucket_put(c, c->open_buckets + s->blocks[i]);
- s->blocks[i] = 0;
- }
- memset(s->blocks_gotten, 0, sizeof(s->blocks_gotten));
- memset(s->blocks_allocated, 0, sizeof(s->blocks_allocated));
-
- for (unsigned i = 0; i < existing_v->nr_blocks; i++) {
- if (stripe_blockcount_get(existing_v, i)) {
- __set_bit(i, s->blocks_gotten);
- __set_bit(i, s->blocks_allocated);
- }
-
- ec_block_io(c, &s->existing_stripe, READ, i, &s->iodone);
- }
-
- bkey_copy(&s->new_stripe.key, &s->existing_stripe.key);
- s->have_existing_stripe = true;
-
- return 0;
-}
-
-static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h,
- struct ec_stripe_new *s)
-{
- struct bch_fs *c = trans->c;
-
- /*
- * If we can't allocate a new stripe, and there's no stripes with empty
- * blocks for us to reuse, that means we have to wait on copygc:
- */
- if (may_create_new_stripe(c))
- return -1;
-
- struct btree_iter lru_iter;
- struct bkey_s_c lru_k;
- int ret = 0;
-
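-	/*
-	 * Scan the stripe fragmentation LRU for an existing stripe with empty
-	 * blocks to reuse; __get_existing_stripe() skips stripes that no
-	 * longer exist, don't match this stripe head, or can't be opened:
-	 */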
- for_each_btree_key_max_norestart(trans, lru_iter, BTREE_ID_lru,
- lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, 0),
- lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, LRU_TIME_MAX),
- 0, lru_k, ret) {
- ret = __get_existing_stripe(trans, h, &s->existing_stripe, lru_k.k->p.offset);
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &lru_iter);
- if (!ret)
- ret = bch_err_throw(c, stripe_alloc_blocked);
- if (ret == 1)
- ret = 0;
- if (ret)
- return ret;
-
- return init_new_stripe_from_existing(c, s);
-}
-
-static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h,
- struct ec_stripe_new *s)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bpos min_pos = POS(0, 1);
- struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
- int ret;
-
- if (!s->res.sectors) {
- ret = bch2_disk_reservation_get(c, &s->res,
- h->blocksize,
- s->nr_parity,
- BCH_DISK_RESERVATION_NOFAIL);
- if (ret)
- return ret;
- }
-
- /*
- * Allocate stripe slot
- * XXX: we're going to need a bitrange btree of free stripes
- */
- for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
- BTREE_ITER_slots|BTREE_ITER_intent, k, ret) {
- if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
- if (start_pos.offset) {
- start_pos = min_pos;
- bch2_btree_iter_set_pos(trans, &iter, start_pos);
- continue;
- }
-
- ret = bch_err_throw(c, ENOSPC_stripe_create);
- break;
- }
-
- if (bkey_deleted(k.k) &&
- bch2_try_open_stripe(c, s, k.k->p.offset))
- break;
- }
-
- c->ec_stripe_hint = iter.pos.offset;
-
- if (ret)
- goto err;
-
- ret = ec_stripe_mem_alloc(trans, &iter);
- if (ret) {
- bch2_stripe_close(c, s);
- goto err;
- }
-
- s->new_stripe.key.k.p = iter.pos;
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-err:
- bch2_disk_reservation_put(c, &s->res);
- goto out;
-}
-
-struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
- struct alloc_request *req,
- unsigned algo,
- struct closure *cl)
-{
- struct bch_fs *c = trans->c;
- unsigned redundancy = req->nr_replicas - 1;
- unsigned disk_label = 0;
- struct target t = target_decode(req->target);
- bool waiting = false;
- int ret;
-
- if (t.type == TARGET_GROUP) {
- if (t.group > U8_MAX) {
- bch_err(c, "cannot create a stripe when disk_label > U8_MAX");
- return NULL;
- }
- disk_label = t.group + 1; /* 0 == no label */
- }
-
- struct ec_stripe_head *h =
- __bch2_ec_stripe_head_get(trans, disk_label, algo,
- redundancy, req->watermark);
- if (IS_ERR_OR_NULL(h))
- return h;
-
- if (!h->s) {
- h->s = ec_new_stripe_alloc(c, h);
- if (!h->s) {
- ret = bch_err_throw(c, ENOMEM_ec_new_stripe_alloc);
- bch_err(c, "failed to allocate new stripe");
- goto err;
- }
-
- h->nr_created++;
- }
-
- struct ec_stripe_new *s = h->s;
-
- if (s->allocated)
- goto allocated;
-
- if (s->have_existing_stripe)
- goto alloc_existing;
-
- /* First, try to allocate a full stripe: */
- enum bch_watermark saved_watermark = BCH_WATERMARK_stripe;
- swap(req->watermark, saved_watermark);
- ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?:
- __bch2_ec_stripe_head_reserve(trans, h, s);
- swap(req->watermark, saved_watermark);
-
- if (!ret)
- goto allocate_buf;
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
- bch2_err_matches(ret, ENOMEM))
- goto err;
-
- /*
- * Not enough buckets available for a full stripe: we must reuse an
- * existing stripe:
- */
- while (1) {
- ret = __bch2_ec_stripe_head_reuse(trans, h, s);
- if (!ret)
- break;
- if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
- goto err;
-
- if (req->watermark == BCH_WATERMARK_copygc) {
- ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?:
- __bch2_ec_stripe_head_reserve(trans, h, s);
- if (ret)
- goto err;
- goto allocate_buf;
- }
-
- /* XXX freelist_wait? */
- closure_wait(&c->freelist_wait, cl);
- waiting = true;
- }
-
- if (waiting)
- closure_wake_up(&c->freelist_wait);
-alloc_existing:
- /*
- * Retry allocating buckets, with the watermark for this
- * particular write:
- */
- ret = new_stripe_alloc_buckets(trans, req, h, s, cl);
- if (ret)
- goto err;
-
-allocate_buf:
- ret = ec_stripe_buf_init(c, &s->new_stripe, 0, h->blocksize);
- if (ret)
- goto err;
-
- s->allocated = true;
-allocated:
- BUG_ON(!s->idx);
- BUG_ON(!s->new_stripe.data[0]);
- BUG_ON(trans->restarted);
- return h;
-err:
- bch2_ec_stripe_head_put(c, h);
- return ERR_PTR(ret);
-}
-
-/* device removal */
-
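-/*
- * Drop a device from a stripe: clear its pointer, re-account the stripe's
- * replicas, and refuse if too few good blocks would remain, unless the
- * appropriate BCH_FORCE_IF_DATA_* flags are passed.
- */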
-int bch2_invalidate_stripe_to_dev(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- unsigned dev_idx,
- unsigned flags)
-{
- if (k.k->type != KEY_TYPE_stripe)
- return 0;
-
- struct bch_fs *c = trans->c;
- struct bkey_i_stripe *s =
- bch2_bkey_make_mut_typed(trans, iter, &k, 0, stripe);
- int ret = PTR_ERR_OR_ZERO(s);
- if (ret)
- return ret;
-
- struct disk_accounting_pos acc;
-
- s64 sectors = 0;
- for (unsigned i = 0; i < s->v.nr_blocks; i++)
- sectors -= stripe_blockcount_get(&s->v, i);
-
- memset(&acc, 0, sizeof(acc));
- acc.type = BCH_DISK_ACCOUNTING_replicas;
- bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
- acc.replicas.data_type = BCH_DATA_user;
- ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
- if (ret)
- return ret;
-
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i));
-
- /* XXX: how much redundancy do we still have? check degraded flags */
-
- unsigned nr_good = 0;
-
- scoped_guard(rcu)
- bkey_for_each_ptr(ptrs, ptr) {
- if (ptr->dev == dev_idx)
- ptr->dev = BCH_SB_MEMBER_INVALID;
-
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed;
- }
-
- if (nr_good < s->v.nr_blocks && !(flags & BCH_FORCE_IF_DATA_DEGRADED))
- return bch_err_throw(c, remove_would_lose_data);
-
- unsigned nr_data = s->v.nr_blocks - s->v.nr_redundant;
-
- if (nr_good < nr_data && !(flags & BCH_FORCE_IF_DATA_LOST))
- return bch_err_throw(c, remove_would_lose_data);
-
- sectors = -sectors;
-
- memset(&acc, 0, sizeof(acc));
- acc.type = BCH_DISK_ACCOUNTING_replicas;
- bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
- acc.replicas.data_type = BCH_DATA_user;
- return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
-}
-
-static int bch2_invalidate_stripe_to_dev_from_alloc(struct btree_trans *trans, struct bkey_s_c k_a,
- unsigned flags)
-{
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert);
-
- if (!a->stripe)
- return 0;
-
- if (a->stripe_sectors) {
- struct bch_fs *c = trans->c;
- bch_err(c, "trying to invalidate device in stripe when bucket has stripe data");
- return bch_err_throw(c, invalidate_stripe_to_dev);
- }
-
- struct btree_iter iter;
- struct bkey_s_c_stripe s =
- bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe),
- BTREE_ITER_slots, stripe);
- int ret = bkey_err(s);
- if (ret)
- return ret;
-
- ret = bch2_invalidate_stripe_to_dev(trans, &iter, s.s_c, k_a.k->p.inode, flags);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx, unsigned flags)
-{
- int ret = bch2_trans_run(c,
- for_each_btree_key_max_commit(trans, iter,
- BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX),
- BTREE_ITER_intent, k,
- NULL, NULL, 0, ({
- bch2_invalidate_stripe_to_dev_from_alloc(trans, k, flags);
- })));
- bch_err_fn(c, ret);
- return ret;
-}
-
-/* startup/shutdown */
-
-static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
-{
- struct ec_stripe_head *h;
- struct open_bucket *ob;
- unsigned i;
-
- mutex_lock(&c->ec_stripe_head_lock);
- list_for_each_entry(h, &c->ec_stripe_head_list, list) {
- mutex_lock(&h->lock);
- if (!h->s)
- goto unlock;
-
- if (!ca)
- goto found;
-
- for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) {
- if (!h->s->blocks[i])
- continue;
-
- ob = c->open_buckets + h->s->blocks[i];
- if (ob->dev == ca->dev_idx)
- goto found;
- }
- goto unlock;
-found:
- ec_stripe_new_cancel(c, h, -BCH_ERR_erofs_no_writes);
-unlock:
- mutex_unlock(&h->lock);
- }
- mutex_unlock(&c->ec_stripe_head_lock);
-}
-
-void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
-{
- __bch2_ec_stop(c, ca);
-}
-
-void bch2_fs_ec_stop(struct bch_fs *c)
-{
- __bch2_ec_stop(c, NULL);
-}
-
-static bool bch2_fs_ec_flush_done(struct bch_fs *c)
-{
- sched_annotate_sleep();
-
- mutex_lock(&c->ec_stripe_new_lock);
- bool ret = list_empty(&c->ec_stripe_new_list);
- mutex_unlock(&c->ec_stripe_new_lock);
-
- return ret;
-}
-
-void bch2_fs_ec_flush(struct bch_fs *c)
-{
- wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c));
-}
-
-int bch2_stripes_read(struct bch_fs *c)
-{
- return 0;
-}
-
-static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c,
- struct ec_stripe_new *s)
-{
- prt_printf(out, "\tidx %llu blocks %u+%u allocated %u ref %u %u %s obs",
- s->idx, s->nr_data, s->nr_parity,
- bitmap_weight(s->blocks_allocated, s->nr_data),
- atomic_read(&s->ref[STRIPE_REF_io]),
- atomic_read(&s->ref[STRIPE_REF_stripe]),
- bch2_watermarks[s->h->watermark]);
-
- struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
- unsigned i;
- for_each_set_bit(i, s->blocks_gotten, v->nr_blocks)
- prt_printf(out, " %u", s->blocks[i]);
- prt_newline(out);
- bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&s->new_stripe.key));
- prt_newline(out);
-}
-
-void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
-{
- struct ec_stripe_head *h;
- struct ec_stripe_new *s;
-
- mutex_lock(&c->ec_stripe_head_lock);
- list_for_each_entry(h, &c->ec_stripe_head_list, list) {
- prt_printf(out, "disk label %u algo %u redundancy %u %s nr created %llu:\n",
- h->disk_label, h->algo, h->redundancy,
- bch2_watermarks[h->watermark],
- h->nr_created);
-
- if (h->s)
- bch2_new_stripe_to_text(out, c, h->s);
- }
- mutex_unlock(&c->ec_stripe_head_lock);
-
- prt_printf(out, "in flight:\n");
-
- mutex_lock(&c->ec_stripe_new_lock);
- list_for_each_entry(s, &c->ec_stripe_new_list, list)
- bch2_new_stripe_to_text(out, c, s);
- mutex_unlock(&c->ec_stripe_new_lock);
-}
-
-void bch2_fs_ec_exit(struct bch_fs *c)
-{
- struct ec_stripe_head *h;
- unsigned i;
-
- while (1) {
- mutex_lock(&c->ec_stripe_head_lock);
- h = list_pop_entry(&c->ec_stripe_head_list, struct ec_stripe_head, list);
- mutex_unlock(&c->ec_stripe_head_lock);
-
- if (!h)
- break;
-
- if (h->s) {
- for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++)
- BUG_ON(h->s->blocks[i]);
-
- kfree(h->s);
- }
- kfree(h);
- }
-
- BUG_ON(!list_empty(&c->ec_stripe_new_list));
-
- bioset_exit(&c->ec_bioset);
-}
-
-void bch2_fs_ec_init_early(struct bch_fs *c)
-{
- spin_lock_init(&c->ec_stripes_new_lock);
-
- INIT_LIST_HEAD(&c->ec_stripe_head_list);
- mutex_init(&c->ec_stripe_head_lock);
-
- INIT_LIST_HEAD(&c->ec_stripe_new_list);
- mutex_init(&c->ec_stripe_new_lock);
- init_waitqueue_head(&c->ec_stripe_new_wait);
-
- INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
- INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
-}
-
-int bch2_fs_ec_init(struct bch_fs *c)
-{
- return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
- BIOSET_NEED_BVECS);
-}
-
-static int bch2_check_stripe_to_lru_ref(struct btree_trans *trans,
- struct bkey_s_c k,
- struct bkey_buf *last_flushed)
-{
- if (k.k->type != KEY_TYPE_stripe)
- return 0;
-
- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-
- u64 lru_idx = stripe_lru_pos(s.v);
- if (lru_idx) {
- int ret = bch2_lru_check_set(trans, BCH_LRU_STRIPE_FRAGMENTATION,
- k.k->p.offset, lru_idx, k, last_flushed);
- if (ret)
- return ret;
- }
- return 0;
-}
-
-int bch2_check_stripe_to_lru_refs(struct bch_fs *c)
-{
- struct bkey_buf last_flushed;
-
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_stripes,
- POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_check_stripe_to_lru_ref(trans, k, &last_flushed)));
-
- bch2_bkey_buf_exit(&last_flushed, c);
- bch_err_fn(c, ret);
- return ret;
-}
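
As a rough, illustrative-only sketch (not taken from the deleted sources) of the degraded/lost-data check that bch2_invalidate_stripe_to_dev() performs above: removing a device is refused unless the caller passes the matching force flag, where "degraded" means fewer healthy blocks than the stripe has in total, and "lost" means fewer than its data blocks. Names prefixed demo_ are made up for the sketch.

/* minimal standalone model of the check; flag values are assumptions */
#include <stdio.h>
#include <stdbool.h>

#define DEMO_FORCE_IF_DATA_DEGRADED	(1U << 0)
#define DEMO_FORCE_IF_DATA_LOST		(1U << 1)

static bool demo_stripe_removal_ok(unsigned nr_blocks, unsigned nr_redundant,
				   unsigned nr_good, unsigned flags)
{
	unsigned nr_data = nr_blocks - nr_redundant;

	if (nr_good < nr_blocks && !(flags & DEMO_FORCE_IF_DATA_DEGRADED))
		return false;		/* stripe would be left degraded */
	if (nr_good < nr_data && !(flags & DEMO_FORCE_IF_DATA_LOST))
		return false;		/* stripe could no longer be reconstructed */
	return true;
}

int main(void)
{
	/* 4+2 stripe with 5 healthy blocks: degraded but still reconstructable */
	printf("%d\n", demo_stripe_removal_ok(6, 2, 5, 0));				/* 0 */
	printf("%d\n", demo_stripe_removal_ok(6, 2, 5, DEMO_FORCE_IF_DATA_DEGRADED));	/* 1 */
	printf("%d\n", demo_stripe_removal_ok(6, 2, 3, DEMO_FORCE_IF_DATA_DEGRADED));	/* 0 */
	return 0;
}
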
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
deleted file mode 100644
index 548048adf0d5..000000000000
--- a/fs/bcachefs/ec.h
+++ /dev/null
@@ -1,309 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EC_H
-#define _BCACHEFS_EC_H
-
-#include "ec_types.h"
-#include "buckets_types.h"
-#include "extents_types.h"
-
-int bch2_stripe_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
- struct bkey_s_c);
-int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_stripe ((struct bkey_ops) { \
- .key_validate = bch2_stripe_validate, \
- .val_to_text = bch2_stripe_to_text, \
- .swab = bch2_ptr_swab, \
- .trigger = bch2_trigger_stripe, \
- .min_val_size = 8, \
-})
-
-static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
-{
- return DIV_ROUND_UP(le16_to_cpu(s->sectors),
- 1 << s->csum_granularity_bits);
-}
-
-static inline unsigned stripe_csum_offset(const struct bch_stripe *s,
- unsigned dev, unsigned csum_idx)
-{
- EBUG_ON(s->csum_type >= BCH_CSUM_NR);
-
- unsigned csum_bytes = bch_crc_bytes[s->csum_type];
-
- return sizeof(struct bch_stripe) +
- sizeof(struct bch_extent_ptr) * s->nr_blocks +
- (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes;
-}
-
-static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s,
- unsigned idx)
-{
- return stripe_csum_offset(s, s->nr_blocks, 0) +
- sizeof(u16) * idx;
-}
-
-static inline unsigned stripe_blockcount_get(const struct bch_stripe *s,
- unsigned idx)
-{
- return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx));
-}
-
-static inline void stripe_blockcount_set(struct bch_stripe *s,
- unsigned idx, unsigned v)
-{
- __le16 *p = (void *) s + stripe_blockcount_offset(s, idx);
-
- *p = cpu_to_le16(v);
-}
-
-static inline unsigned stripe_val_u64s(const struct bch_stripe *s)
-{
- return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks),
- sizeof(u64));
-}
-
-static inline void *stripe_csum(struct bch_stripe *s,
- unsigned block, unsigned csum_idx)
-{
- EBUG_ON(block >= s->nr_blocks);
- EBUG_ON(csum_idx >= stripe_csums_per_device(s));
-
- return (void *) s + stripe_csum_offset(s, block, csum_idx);
-}
-
-static inline struct bch_csum stripe_csum_get(struct bch_stripe *s,
- unsigned block, unsigned csum_idx)
-{
- struct bch_csum csum = { 0 };
-
- memcpy(&csum, stripe_csum(s, block, csum_idx), bch_crc_bytes[s->csum_type]);
- return csum;
-}
-
-static inline void stripe_csum_set(struct bch_stripe *s,
- unsigned block, unsigned csum_idx,
- struct bch_csum csum)
-{
- memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]);
-}
-
-#define STRIPE_LRU_POS_EMPTY 1
-
-static inline u64 stripe_lru_pos(const struct bch_stripe *s)
-{
- if (!s)
- return 0;
-
- unsigned nr_data = s->nr_blocks - s->nr_redundant, blocks_empty = 0;
-
- for (unsigned i = 0; i < nr_data; i++)
- blocks_empty += !stripe_blockcount_get(s, i);
-
- /* Will be picked up by the stripe_delete worker */
- if (blocks_empty == nr_data)
- return STRIPE_LRU_POS_EMPTY;
-
- if (!blocks_empty)
- return 0;
-
- /* invert: more blocks empty = reuse first */
- return LRU_TIME_MAX - blocks_empty;
-}
-
-static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr,
- const struct bch_extent_ptr *data_ptr,
- unsigned sectors)
-{
- return (data_ptr->dev == stripe_ptr->dev ||
- data_ptr->dev == BCH_SB_MEMBER_INVALID ||
- stripe_ptr->dev == BCH_SB_MEMBER_INVALID) &&
- data_ptr->gen == stripe_ptr->gen &&
- data_ptr->offset >= stripe_ptr->offset &&
- data_ptr->offset < stripe_ptr->offset + sectors;
-}
-
-static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s,
- struct extent_ptr_decoded p)
-{
- unsigned nr_data = s->nr_blocks - s->nr_redundant;
-
- BUG_ON(!p.has_ec);
-
- if (p.ec.block >= nr_data)
- return false;
-
- return __bch2_ptr_matches_stripe(&s->ptrs[p.ec.block], &p.ptr,
- le16_to_cpu(s->sectors));
-}
-
-static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m,
- struct extent_ptr_decoded p)
-{
- unsigned nr_data = m->nr_blocks - m->nr_redundant;
-
- BUG_ON(!p.has_ec);
-
- if (p.ec.block >= nr_data)
- return false;
-
- return __bch2_ptr_matches_stripe(&m->ptrs[p.ec.block], &p.ptr,
- m->sectors);
-}
-
-static inline void gc_stripe_unlock(struct gc_stripe *s)
-{
- BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);
-
- clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &s->lock);
- smp_mb__after_atomic();
- wake_up_bit((void *) &s->lock, BUCKET_LOCK_BITNR);
-}
-
-static inline void gc_stripe_lock(struct gc_stripe *s)
-{
- wait_on_bit_lock((void *) &s->lock, BUCKET_LOCK_BITNR,
- TASK_UNINTERRUPTIBLE);
-}
-
-struct bch_read_bio;
-
-struct ec_stripe_buf {
- /* might not be buffering the entire stripe: */
- unsigned offset;
- unsigned size;
- unsigned long valid[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
-
- void *data[BCH_BKEY_PTRS_MAX];
-
- __BKEY_PADDED(key, 255);
-};
-
-struct ec_stripe_head;
-
-enum ec_stripe_ref {
- STRIPE_REF_io,
- STRIPE_REF_stripe,
- STRIPE_REF_NR
-};
-
-struct ec_stripe_new {
- struct bch_fs *c;
- struct ec_stripe_head *h;
- struct mutex lock;
- struct list_head list;
-
- struct hlist_node hash;
- u64 idx;
-
- struct closure iodone;
-
- atomic_t ref[STRIPE_REF_NR];
-
- int err;
-
- u8 nr_data;
- u8 nr_parity;
- bool allocated;
- bool pending;
- bool have_existing_stripe;
-
- unsigned long blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
- unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
- open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX];
- struct disk_reservation res;
-
- struct ec_stripe_buf new_stripe;
- struct ec_stripe_buf existing_stripe;
-};
-
-struct ec_stripe_head {
- struct list_head list;
- struct mutex lock;
-
- unsigned disk_label;
- unsigned algo;
- unsigned redundancy;
- enum bch_watermark watermark;
- bool insufficient_devs;
-
- unsigned long rw_devs_change_count;
-
- u64 nr_created;
-
- struct bch_devs_mask devs;
- unsigned nr_active_devs;
-
- unsigned blocksize;
-
- struct dev_stripe_state block_stripe;
- struct dev_stripe_state parity_stripe;
-
- struct ec_stripe_new *s;
-};
-
-int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey_s_c);
-
-void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
-
-void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int);
-
-int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *);
-
-void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *);
-
-struct alloc_request;
-struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *,
- struct alloc_request *, unsigned, struct closure *);
-
-void bch2_do_stripe_deletes(struct bch_fs *);
-void bch2_ec_do_stripe_creates(struct bch_fs *);
-void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *);
-
-static inline void ec_stripe_new_get(struct ec_stripe_new *s,
- enum ec_stripe_ref ref)
-{
- atomic_inc(&s->ref[ref]);
-}
-
-static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s,
- enum ec_stripe_ref ref)
-{
- BUG_ON(atomic_read(&s->ref[ref]) <= 0);
-
- if (atomic_dec_and_test(&s->ref[ref]))
- switch (ref) {
- case STRIPE_REF_stripe:
- bch2_ec_stripe_new_free(c, s);
- break;
- case STRIPE_REF_io:
- bch2_ec_do_stripe_creates(c);
- break;
- default:
- BUG();
- }
-}
-
-int bch2_invalidate_stripe_to_dev(struct btree_trans *, struct btree_iter *,
- struct bkey_s_c, unsigned, unsigned);
-int bch2_dev_remove_stripes(struct bch_fs *, unsigned, unsigned);
-
-void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
-void bch2_fs_ec_stop(struct bch_fs *);
-void bch2_fs_ec_flush(struct bch_fs *);
-
-int bch2_stripes_read(struct bch_fs *);
-
-void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_fs_ec_exit(struct bch_fs *);
-void bch2_fs_ec_init_early(struct bch_fs *);
-int bch2_fs_ec_init(struct bch_fs *);
-
-int bch2_check_stripe_to_lru_refs(struct bch_fs *);
-
-#endif /* _BCACHEFS_EC_H */
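
A rough worked example (not part of the deleted header) of the variable-length bch_stripe layout that stripe_csum_offset() and stripe_blockcount_offset() above walk: pointers, then a [block][csum index] checksum array, then per-block u16 sector counts. The struct sizes and stripe parameters below are assumptions chosen only to make the arithmetic concrete.

/* standalone sketch: recomputes the offsets for example parameters */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned sizeof_stripe	= 16;	/* assumed sizeof(struct bch_stripe) */
	unsigned sizeof_ptr	= 8;	/* assumed sizeof(struct bch_extent_ptr) */
	unsigned nr_blocks	= 6;	/* e.g. 4 data + 2 parity */
	unsigned sectors	= 256;	/* sectors covered per block */
	unsigned csum_gran_bits	= 6;	/* checksum granularity: 64 sectors */
	unsigned csum_bytes	= 4;	/* e.g. a 32-bit crc */

	unsigned csums_per_dev	= DIV_ROUND_UP(sectors, 1U << csum_gran_bits);
	unsigned csum_start	= sizeof_stripe + sizeof_ptr * nr_blocks;
	unsigned blockcounts	= csum_start + nr_blocks * csums_per_dev * csum_bytes;
	unsigned val_bytes	= blockcounts + (unsigned) sizeof(unsigned short) * nr_blocks;

	printf("checksums start at byte %u (%u per block)\n", csum_start, csums_per_dev);
	printf("block sector counts start at byte %u\n", blockcounts);
	printf("total value size: %u bytes (%u u64s)\n",
	       val_bytes, DIV_ROUND_UP(val_bytes, 8));
	return 0;
}
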
diff --git a/fs/bcachefs/ec_format.h b/fs/bcachefs/ec_format.h
deleted file mode 100644
index b9770f24f213..000000000000
--- a/fs/bcachefs/ec_format.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EC_FORMAT_H
-#define _BCACHEFS_EC_FORMAT_H
-
-struct bch_stripe {
- struct bch_val v;
- __le16 sectors;
- __u8 algorithm;
- __u8 nr_blocks;
- __u8 nr_redundant;
-
- __u8 csum_granularity_bits;
- __u8 csum_type;
-
- /*
- * XXX: targets should be 16 bits - fix this if we ever do a stripe_v2
- *
- * we can manage with this because this only needs to point to a
- * disk label, not a target:
- */
- __u8 disk_label;
-
- /*
- * Variable length sections:
- * - Pointers
- * - Checksums
- * 2D array of [stripe block/device][csum block], with checksum block
- * size given by csum_granularity_bits
- * - Block sector counts: per-block array of u16s
- *
- * XXX:
- * Either checksums should have come last, or we should have included a
- * checksum_size field (the size in bytes of the checksum itself, not
- * the blocksize the checksum covers).
- *
- * Currently we aren't able to access the block sector counts if the
- * checksum type is unknown.
- */
-
- struct bch_extent_ptr ptrs[];
-} __packed __aligned(8);
-
-#endif /* _BCACHEFS_EC_FORMAT_H */
diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
deleted file mode 100644
index 809446c78951..000000000000
--- a/fs/bcachefs/ec_types.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EC_TYPES_H
-#define _BCACHEFS_EC_TYPES_H
-
-#include "bcachefs_format.h"
-
-union bch_replicas_padded {
- u8 bytes[struct_size_t(struct bch_replicas_entry_v1,
- devs, BCH_BKEY_PTRS_MAX)];
- struct bch_replicas_entry_v1 e;
-};
-
-struct stripe {
- size_t heap_idx;
- u16 sectors;
- u8 algorithm;
- u8 nr_blocks;
- u8 nr_redundant;
- u8 blocks_nonempty;
- u8 disk_label;
-};
-
-struct gc_stripe {
- u8 lock;
- unsigned alive:1; /* does a corresponding key exist in stripes btree? */
- u16 sectors;
- u8 nr_blocks;
- u8 nr_redundant;
- u16 block_sectors[BCH_BKEY_PTRS_MAX];
- struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX];
-
- union bch_replicas_padded r;
-};
-
-#endif /* _BCACHEFS_EC_TYPES_H */
diff --git a/fs/bcachefs/enumerated_ref.c b/fs/bcachefs/enumerated_ref.c
deleted file mode 100644
index 56ab430f209f..000000000000
--- a/fs/bcachefs/enumerated_ref.c
+++ /dev/null
@@ -1,144 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "enumerated_ref.h"
-#include "util.h"
-
-#include <linux/completion.h>
-
-#ifdef ENUMERATED_REF_DEBUG
-void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx)
-{
- BUG_ON(idx >= ref->nr);
- atomic_long_inc(&ref->refs[idx]);
-}
-
-bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
-{
- BUG_ON(idx >= ref->nr);
- return atomic_long_inc_not_zero(&ref->refs[idx]);
-}
-
-bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
-{
- BUG_ON(idx >= ref->nr);
- return !ref->dying &&
- atomic_long_inc_not_zero(&ref->refs[idx]);
-}
-
-void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx)
-{
- BUG_ON(idx >= ref->nr);
- long v = atomic_long_dec_return(&ref->refs[idx]);
-
- BUG_ON(v < 0);
- if (v)
- return;
-
- for (unsigned i = 0; i < ref->nr; i++)
- if (atomic_long_read(&ref->refs[i]))
- return;
-
- if (ref->stop_fn)
- ref->stop_fn(ref);
- complete(&ref->stop_complete);
-}
-#endif
-
-#ifndef ENUMERATED_REF_DEBUG
-static void enumerated_ref_kill_cb(struct percpu_ref *percpu_ref)
-{
- struct enumerated_ref *ref =
- container_of(percpu_ref, struct enumerated_ref, ref);
-
- if (ref->stop_fn)
- ref->stop_fn(ref);
- complete(&ref->stop_complete);
-}
-#endif
-
-void enumerated_ref_stop_async(struct enumerated_ref *ref)
-{
- reinit_completion(&ref->stop_complete);
-
-#ifndef ENUMERATED_REF_DEBUG
- percpu_ref_kill(&ref->ref);
-#else
- ref->dying = true;
- for (unsigned i = 0; i < ref->nr; i++)
- enumerated_ref_put(ref, i);
-#endif
-}
-
-void enumerated_ref_stop(struct enumerated_ref *ref,
- const char * const names[])
-{
- enumerated_ref_stop_async(ref);
- while (!wait_for_completion_timeout(&ref->stop_complete, HZ * 10)) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "Waited for 10 seconds to shutdown enumerated ref\n");
- prt_str(&buf, "Outstanding refs:\n");
- enumerated_ref_to_text(&buf, ref, names);
- printk(KERN_ERR "%s", buf.buf);
- printbuf_exit(&buf);
- }
-}
-
-void enumerated_ref_start(struct enumerated_ref *ref)
-{
-#ifndef ENUMERATED_REF_DEBUG
- percpu_ref_reinit(&ref->ref);
-#else
- ref->dying = false;
- for (unsigned i = 0; i < ref->nr; i++) {
- BUG_ON(atomic_long_read(&ref->refs[i]));
- atomic_long_inc(&ref->refs[i]);
- }
-#endif
-}
-
-void enumerated_ref_exit(struct enumerated_ref *ref)
-{
-#ifndef ENUMERATED_REF_DEBUG
- percpu_ref_exit(&ref->ref);
-#else
- kfree(ref->refs);
- ref->refs = NULL;
- ref->nr = 0;
-#endif
-}
-
-int enumerated_ref_init(struct enumerated_ref *ref, unsigned nr,
- void (*stop_fn)(struct enumerated_ref *))
-{
- init_completion(&ref->stop_complete);
- ref->stop_fn = stop_fn;
-
-#ifndef ENUMERATED_REF_DEBUG
- return percpu_ref_init(&ref->ref, enumerated_ref_kill_cb,
- PERCPU_REF_INIT_DEAD, GFP_KERNEL);
-#else
- ref->refs = kzalloc(sizeof(ref->refs[0]) * nr, GFP_KERNEL);
- if (!ref->refs)
- return -ENOMEM;
-
- ref->nr = nr;
- return 0;
-#endif
-}
-
-void enumerated_ref_to_text(struct printbuf *out,
- struct enumerated_ref *ref,
- const char * const names[])
-{
-#ifdef ENUMERATED_REF_DEBUG
- bch2_printbuf_tabstop_push(out, 32);
-
- for (unsigned i = 0; i < ref->nr; i++)
- prt_printf(out, "%s\t%li\n", names[i],
- atomic_long_read(&ref->refs[i]));
-#else
- prt_str(out, "(not in debug mode)\n");
-#endif
-}
diff --git a/fs/bcachefs/enumerated_ref.h b/fs/bcachefs/enumerated_ref.h
deleted file mode 100644
index ec01cf59ef80..000000000000
--- a/fs/bcachefs/enumerated_ref.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ENUMERATED_REF_H
-#define _BCACHEFS_ENUMERATED_REF_H
-
-#include "enumerated_ref_types.h"
-
-/*
- * A refcount where the users are enumerated: in debug mode, we create separate
- * refcounts for each user, to make leaks and refcount errors easy to track
- * down:
- */
-
-#ifdef ENUMERATED_REF_DEBUG
-void enumerated_ref_get(struct enumerated_ref *, unsigned);
-bool __enumerated_ref_tryget(struct enumerated_ref *, unsigned);
-bool enumerated_ref_tryget(struct enumerated_ref *, unsigned);
-void enumerated_ref_put(struct enumerated_ref *, unsigned);
-#else
-
-static inline void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx)
-{
- percpu_ref_get(&ref->ref);
-}
-
-static inline bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
-{
- return percpu_ref_tryget(&ref->ref);
-}
-
-static inline bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
-{
- return percpu_ref_tryget_live(&ref->ref);
-}
-
-static inline void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx)
-{
- percpu_ref_put(&ref->ref);
-}
-#endif
-
-static inline bool enumerated_ref_is_zero(struct enumerated_ref *ref)
-{
-#ifndef ENUMERATED_REF_DEBUG
- return percpu_ref_is_zero(&ref->ref);
-#else
- for (unsigned i = 0; i < ref->nr; i++)
- if (atomic_long_read(&ref->refs[i]))
- return false;
- return true;
-#endif
-}
-
-void enumerated_ref_stop_async(struct enumerated_ref *);
-void enumerated_ref_stop(struct enumerated_ref *, const char * const[]);
-void enumerated_ref_start(struct enumerated_ref *);
-
-void enumerated_ref_exit(struct enumerated_ref *);
-int enumerated_ref_init(struct enumerated_ref *, unsigned,
- void (*stop_fn)(struct enumerated_ref *));
-
-struct printbuf;
-void enumerated_ref_to_text(struct printbuf *,
- struct enumerated_ref *,
- const char * const[]);
-
-#endif /* _BCACHEFS_ENUMERATED_REF_H */
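
To illustrate the idea described in the comment above, here is a minimal standalone sketch (not from the deleted sources): the same get/put API can either feed one shared counter or, in a debug build, one counter per enumerated user, so a leaked reference immediately names the user that took it. All demo_ names are invented for the sketch.

#include <stdio.h>

enum demo_ref_user { DEMO_REF_io, DEMO_REF_gc, DEMO_REF_NR };

static long demo_refs[DEMO_REF_NR];	/* "debug mode": one counter per user */

static void demo_ref_get(enum demo_ref_user u) { demo_refs[u]++; }
static void demo_ref_put(enum demo_ref_user u) { demo_refs[u]--; }

int main(void)
{
	demo_ref_get(DEMO_REF_io);
	demo_ref_get(DEMO_REF_gc);
	demo_ref_put(DEMO_REF_gc);

	/* at shutdown, any nonzero counter points at the leaking user */
	for (int i = 0; i < DEMO_REF_NR; i++)
		if (demo_refs[i])
			printf("leaked ref: user %d holds %ld\n", i, demo_refs[i]);
	return 0;
}
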
diff --git a/fs/bcachefs/enumerated_ref_types.h b/fs/bcachefs/enumerated_ref_types.h
deleted file mode 100644
index 0e6076f466d3..000000000000
--- a/fs/bcachefs/enumerated_ref_types.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ENUMERATED_REF_TYPES_H
-#define _BCACHEFS_ENUMERATED_REF_TYPES_H
-
-#include <linux/percpu-refcount.h>
-
-struct enumerated_ref {
-#ifdef ENUMERATED_REF_DEBUG
- unsigned nr;
- bool dying;
- atomic_long_t *refs;
-#else
- struct percpu_ref ref;
-#endif
- void (*stop_fn)(struct enumerated_ref *);
- struct completion stop_complete;
-};
-
-#endif /* _BCACHEFS_ENUMERATED_REF_TYPES_H */
diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c
deleted file mode 100644
index c39cf304c681..000000000000
--- a/fs/bcachefs/errcode.c
+++ /dev/null
@@ -1,73 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "errcode.h"
-#include "trace.h"
-
-#include <linux/errname.h>
-
-static const char * const bch2_errcode_strs[] = {
-#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = #err,
- BCH_ERRCODES()
-#undef x
- NULL
-};
-
-static const unsigned bch2_errcode_parents[] = {
-#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class,
- BCH_ERRCODES()
-#undef x
-};
-
-__attribute__((const))
-const char *bch2_err_str(int err)
-{
- const char *errstr;
-
- err = abs(err);
-
- BUG_ON(err >= BCH_ERR_MAX);
-
- if (err >= BCH_ERR_START)
- errstr = bch2_errcode_strs[err - BCH_ERR_START];
- else if (err)
- errstr = errname(err);
- else
- errstr = "(No error)";
- return errstr ?: "(Invalid error)";
-}
-
-__attribute__((const))
-bool __bch2_err_matches(int err, int class)
-{
- err = abs(err);
- class = abs(class);
-
- BUG_ON(err >= BCH_ERR_MAX);
- BUG_ON(class >= BCH_ERR_MAX);
-
- while (err >= BCH_ERR_START && err != class)
- err = bch2_errcode_parents[err - BCH_ERR_START];
-
- return err == class;
-}
-
-int __bch2_err_class(int bch_err)
-{
- int std_err = -bch_err;
- BUG_ON((unsigned) std_err >= BCH_ERR_MAX);
-
- while (std_err >= BCH_ERR_START && bch2_errcode_parents[std_err - BCH_ERR_START])
- std_err = bch2_errcode_parents[std_err - BCH_ERR_START];
-
- trace_error_downcast(bch_err, std_err, _RET_IP_);
-
- return -std_err;
-}
-
-const char *bch2_blk_status_to_str(blk_status_t status)
-{
- if (status == BLK_STS_REMOVED)
- return "device removed";
- return blk_status_to_str(status);
-}
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
deleted file mode 100644
index acc3b7b67704..000000000000
--- a/fs/bcachefs/errcode.h
+++ /dev/null
@@ -1,387 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ERRCODE_H
-#define _BCACHEFS_ERRCODE_H
-
-#define BCH_ERRCODES() \
- x(ERANGE, ERANGE_option_too_small) \
- x(ERANGE, ERANGE_option_too_big) \
- x(EINVAL, injected) \
- x(BCH_ERR_injected, injected_fs_start) \
- x(EINVAL, mount_option) \
- x(BCH_ERR_mount_option, option_name) \
- x(BCH_ERR_mount_option, option_value) \
- x(BCH_ERR_mount_option, option_not_bool) \
- x(ENOMEM, ENOMEM_stripe_buf) \
- x(ENOMEM, ENOMEM_replicas_table) \
- x(ENOMEM, ENOMEM_cpu_replicas) \
- x(ENOMEM, ENOMEM_replicas_gc) \
- x(ENOMEM, ENOMEM_disk_groups_validate) \
- x(ENOMEM, ENOMEM_disk_groups_to_cpu) \
- x(ENOMEM, ENOMEM_mark_snapshot) \
- x(ENOMEM, ENOMEM_mark_stripe) \
- x(ENOMEM, ENOMEM_mark_stripe_ptr) \
- x(ENOMEM, ENOMEM_btree_key_cache_create) \
- x(ENOMEM, ENOMEM_btree_key_cache_fill) \
- x(ENOMEM, ENOMEM_btree_key_cache_insert) \
- x(ENOMEM, ENOMEM_trans_kmalloc) \
- x(ENOMEM, ENOMEM_trans_log_msg) \
- x(ENOMEM, ENOMEM_do_encrypt) \
- x(ENOMEM, ENOMEM_ec_read_extent) \
- x(ENOMEM, ENOMEM_ec_stripe_mem_alloc) \
- x(ENOMEM, ENOMEM_ec_new_stripe_alloc) \
- x(ENOMEM, ENOMEM_fs_btree_cache_init) \
- x(ENOMEM, ENOMEM_fs_btree_key_cache_init) \
- x(ENOMEM, ENOMEM_fs_counters_init) \
- x(ENOMEM, ENOMEM_fs_btree_write_buffer_init) \
- x(ENOMEM, ENOMEM_io_clock_init) \
- x(ENOMEM, ENOMEM_blacklist_table_init) \
- x(ENOMEM, ENOMEM_sb_realloc_injected) \
- x(ENOMEM, ENOMEM_sb_bio_realloc) \
- x(ENOMEM, ENOMEM_sb_buf_realloc) \
- x(ENOMEM, ENOMEM_sb_journal_validate) \
- x(ENOMEM, ENOMEM_sb_journal_v2_validate) \
- x(ENOMEM, ENOMEM_journal_entry_add) \
- x(ENOMEM, ENOMEM_journal_read_buf_realloc) \
- x(ENOMEM, ENOMEM_btree_interior_update_worker_init)\
- x(ENOMEM, ENOMEM_btree_interior_update_pool_init) \
- x(ENOMEM, ENOMEM_bio_read_init) \
- x(ENOMEM, ENOMEM_bio_read_split_init) \
- x(ENOMEM, ENOMEM_bio_write_init) \
- x(ENOMEM, ENOMEM_bio_bounce_pages_init) \
- x(ENOMEM, ENOMEM_writepage_bioset_init) \
- x(ENOMEM, ENOMEM_dio_read_bioset_init) \
- x(ENOMEM, ENOMEM_dio_write_bioset_init) \
- x(ENOMEM, ENOMEM_nocow_flush_bioset_init) \
- x(ENOMEM, ENOMEM_promote_table_init) \
- x(ENOMEM, ENOMEM_async_obj_init) \
- x(ENOMEM, ENOMEM_compression_bounce_read_init) \
- x(ENOMEM, ENOMEM_compression_bounce_write_init) \
- x(ENOMEM, ENOMEM_compression_workspace_init) \
- x(ENOMEM, ENOMEM_backpointer_mismatches_bitmap) \
- x(EIO, compression_workspace_not_initialized) \
- x(ENOMEM, ENOMEM_bucket_gens) \
- x(ENOMEM, ENOMEM_buckets_nouse) \
- x(ENOMEM, ENOMEM_usage_init) \
- x(ENOMEM, ENOMEM_btree_node_read_all_replicas) \
- x(ENOMEM, ENOMEM_btree_node_reclaim) \
- x(ENOMEM, ENOMEM_btree_node_mem_alloc) \
- x(ENOMEM, ENOMEM_btree_cache_cannibalize_lock) \
- x(ENOMEM, ENOMEM_buckets_waiting_for_journal_init)\
- x(ENOMEM, ENOMEM_buckets_waiting_for_journal_set) \
- x(ENOMEM, ENOMEM_set_nr_journal_buckets) \
- x(ENOMEM, ENOMEM_dev_journal_init) \
- x(ENOMEM, ENOMEM_journal_pin_fifo) \
- x(ENOMEM, ENOMEM_journal_buf) \
- x(ENOMEM, ENOMEM_gc_start) \
- x(ENOMEM, ENOMEM_gc_alloc_start) \
- x(ENOMEM, ENOMEM_gc_reflink_start) \
- x(ENOMEM, ENOMEM_gc_gens) \
- x(ENOMEM, ENOMEM_gc_repair_key) \
- x(ENOMEM, ENOMEM_fsck_extent_ends_at) \
- x(ENOMEM, ENOMEM_fsck_add_nlink) \
- x(ENOMEM, ENOMEM_journal_key_insert) \
- x(ENOMEM, ENOMEM_journal_keys_sort) \
- x(ENOMEM, ENOMEM_read_superblock_clean) \
- x(ENOMEM, ENOMEM_fs_alloc) \
- x(ENOMEM, ENOMEM_fs_name_alloc) \
- x(ENOMEM, ENOMEM_fs_other_alloc) \
- x(ENOMEM, ENOMEM_dev_alloc) \
- x(ENOMEM, ENOMEM_disk_accounting) \
- x(ENOMEM, ENOMEM_stripe_head_alloc) \
- x(ENOMEM, ENOMEM_journal_read_bucket) \
- x(ENOSPC, ENOSPC_disk_reservation) \
- x(ENOSPC, ENOSPC_bucket_alloc) \
- x(ENOSPC, ENOSPC_disk_label_add) \
- x(ENOSPC, ENOSPC_stripe_create) \
- x(ENOSPC, ENOSPC_inode_create) \
- x(ENOSPC, ENOSPC_str_hash_create) \
- x(ENOSPC, ENOSPC_snapshot_create) \
- x(ENOSPC, ENOSPC_subvolume_create) \
- x(ENOSPC, ENOSPC_sb) \
- x(ENOSPC, ENOSPC_sb_journal) \
- x(ENOSPC, ENOSPC_sb_journal_seq_blacklist) \
- x(ENOSPC, ENOSPC_sb_quota) \
- x(ENOSPC, ENOSPC_sb_replicas) \
- x(ENOSPC, ENOSPC_sb_members) \
- x(ENOSPC, ENOSPC_sb_members_v2) \
- x(ENOSPC, ENOSPC_sb_crypt) \
- x(ENOSPC, ENOSPC_sb_downgrade) \
- x(ENOSPC, ENOSPC_btree_slot) \
- x(ENOSPC, ENOSPC_snapshot_tree) \
- x(ENOENT, ENOENT_bkey_type_mismatch) \
- x(ENOENT, ENOENT_str_hash_lookup) \
- x(ENOENT, ENOENT_str_hash_set_must_replace) \
- x(ENOENT, ENOENT_inode) \
- x(ENOENT, ENOENT_not_subvol) \
- x(ENOENT, ENOENT_not_directory) \
- x(ENOENT, ENOENT_directory_dead) \
- x(ENOENT, ENOENT_subvolume) \
- x(ENOENT, ENOENT_snapshot_tree) \
- x(ENOENT, ENOENT_dirent_doesnt_match_inode) \
- x(ENOENT, ENOENT_dev_not_found) \
- x(ENOENT, ENOENT_dev_bucket_not_found) \
- x(ENOENT, ENOENT_dev_idx_not_found) \
- x(ENOENT, ENOENT_inode_no_backpointer) \
- x(ENOENT, ENOENT_no_snapshot_tree_subvol) \
- x(ENOENT, btree_node_dying) \
- x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \
- x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \
- x(EEXIST, EEXIST_str_hash_set) \
- x(EEXIST, EEXIST_discard_in_flight_add) \
- x(EEXIST, EEXIST_subvolume_create) \
- x(ENOSPC, open_buckets_empty) \
- x(ENOSPC, freelist_empty) \
- x(BCH_ERR_freelist_empty, no_buckets_found) \
- x(0, transaction_restart) \
- x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \
- x(BCH_ERR_transaction_restart, transaction_restart_relock) \
- x(BCH_ERR_transaction_restart, transaction_restart_relock_path) \
- x(BCH_ERR_transaction_restart, transaction_restart_relock_path_intent) \
- x(BCH_ERR_transaction_restart, transaction_restart_too_many_iters) \
- x(BCH_ERR_transaction_restart, transaction_restart_lock_node_reused) \
- x(BCH_ERR_transaction_restart, transaction_restart_fill_relock) \
- x(BCH_ERR_transaction_restart, transaction_restart_fill_mem_alloc_fail)\
- x(BCH_ERR_transaction_restart, transaction_restart_mem_realloced) \
- x(BCH_ERR_transaction_restart, transaction_restart_in_traverse_all) \
- x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock) \
- x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock_write)\
- x(BCH_ERR_transaction_restart, transaction_restart_deadlock_recursion_limit)\
- x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \
- x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \
- x(BCH_ERR_transaction_restart, transaction_restart_key_cache_raced) \
- x(BCH_ERR_transaction_restart, transaction_restart_split_race) \
- x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \
- x(BCH_ERR_transaction_restart, transaction_restart_nested) \
- x(BCH_ERR_transaction_restart, transaction_restart_commit) \
- x(0, no_btree_node) \
- x(BCH_ERR_no_btree_node, no_btree_node_relock) \
- x(BCH_ERR_no_btree_node, no_btree_node_upgrade) \
- x(BCH_ERR_no_btree_node, no_btree_node_drop) \
- x(BCH_ERR_no_btree_node, no_btree_node_lock_root) \
- x(BCH_ERR_no_btree_node, no_btree_node_up) \
- x(BCH_ERR_no_btree_node, no_btree_node_down) \
- x(BCH_ERR_no_btree_node, no_btree_node_init) \
- x(BCH_ERR_no_btree_node, no_btree_node_cached) \
- x(BCH_ERR_no_btree_node, no_btree_node_srcu_reset) \
- x(0, btree_insert_fail) \
- x(BCH_ERR_btree_insert_fail, btree_insert_btree_node_full) \
- x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \
- x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \
- x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \
- x(0, backpointer_to_overwritten_btree_node) \
- x(0, journal_reclaim_would_deadlock) \
- x(EINVAL, fsck) \
- x(BCH_ERR_fsck, fsck_ask) \
- x(BCH_ERR_fsck, fsck_fix) \
- x(BCH_ERR_fsck, fsck_delete_bkey) \
- x(BCH_ERR_fsck, fsck_ignore) \
- x(BCH_ERR_fsck, fsck_errors_not_fixed) \
- x(BCH_ERR_fsck, fsck_repair_unimplemented) \
- x(BCH_ERR_fsck, fsck_repair_impossible) \
- x(EINVAL, recovery_will_run) \
- x(BCH_ERR_recovery_will_run, restart_recovery) \
- x(BCH_ERR_recovery_will_run, cannot_rewind_recovery) \
- x(BCH_ERR_recovery_will_run, recovery_pass_will_run) \
- x(0, data_update_done) \
- x(0, bkey_was_deleted) \
- x(BCH_ERR_data_update_done, data_update_done_would_block) \
- x(BCH_ERR_data_update_done, data_update_done_unwritten) \
- x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \
- x(BCH_ERR_data_update_done, data_update_done_no_snapshot) \
- x(BCH_ERR_data_update_done, data_update_done_no_dev_refs) \
- x(BCH_ERR_data_update_done, data_update_done_no_rw_devs) \
- x(EINVAL, device_state_not_allowed) \
- x(EINVAL, member_info_missing) \
- x(EINVAL, mismatched_block_size) \
- x(EINVAL, block_size_too_small) \
- x(EINVAL, bucket_size_too_small) \
- x(EINVAL, device_size_too_small) \
- x(EINVAL, device_size_too_big) \
- x(EINVAL, device_not_a_member_of_filesystem) \
- x(EINVAL, device_has_been_removed) \
- x(EINVAL, device_splitbrain) \
- x(EINVAL, device_already_online) \
- x(EINVAL, filesystem_uuid_already_open) \
- x(EINVAL, insufficient_devices_to_start) \
- x(EINVAL, invalid) \
- x(EINVAL, internal_fsck_err) \
- x(EINVAL, opt_parse_error) \
- x(EINVAL, remove_with_metadata_missing_unimplemented)\
- x(EINVAL, remove_would_lose_data) \
- x(EINVAL, no_resize_with_buckets_nouse) \
- x(EINVAL, inode_unpack_error) \
- x(EINVAL, inode_not_unlinked) \
- x(EINVAL, inode_has_child_snapshot) \
- x(EINVAL, varint_decode_error) \
- x(EINVAL, erasure_coding_found_btree_node) \
- x(EINVAL, option_negative) \
- x(EOPNOTSUPP, may_not_use_incompat_feature) \
- x(EROFS, erofs_trans_commit) \
- x(EROFS, erofs_no_writes) \
- x(EROFS, erofs_journal_err) \
- x(EROFS, erofs_sb_err) \
- x(EROFS, erofs_unfixed_errors) \
- x(EROFS, erofs_norecovery) \
- x(EROFS, erofs_nochanges) \
- x(EROFS, erofs_no_alloc_info) \
- x(EROFS, erofs_filesystem_full) \
- x(EROFS, insufficient_devices) \
- x(0, operation_blocked) \
- x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \
- x(BCH_ERR_operation_blocked, journal_res_blocked) \
- x(BCH_ERR_journal_res_blocked, journal_blocked) \
- x(BCH_ERR_journal_res_blocked, journal_max_in_flight) \
- x(BCH_ERR_journal_res_blocked, journal_max_open) \
- x(BCH_ERR_journal_res_blocked, journal_full) \
- x(BCH_ERR_journal_res_blocked, journal_pin_full) \
- x(BCH_ERR_journal_res_blocked, journal_buf_enomem) \
- x(BCH_ERR_journal_res_blocked, journal_stuck) \
- x(BCH_ERR_journal_res_blocked, journal_retry_open) \
- x(BCH_ERR_journal_res_blocked, bucket_alloc_blocked) \
- x(BCH_ERR_journal_res_blocked, stripe_alloc_blocked) \
- x(BCH_ERR_invalid, invalid_sb) \
- x(BCH_ERR_invalid_sb, invalid_sb_magic) \
- x(BCH_ERR_invalid_sb, invalid_sb_version) \
- x(BCH_ERR_invalid_sb, invalid_sb_features) \
- x(BCH_ERR_invalid_sb, invalid_sb_too_big) \
- x(BCH_ERR_invalid_sb, invalid_sb_csum_type) \
- x(BCH_ERR_invalid_sb, invalid_sb_csum) \
- x(BCH_ERR_invalid_sb, invalid_sb_block_size) \
- x(BCH_ERR_invalid_sb, invalid_sb_uuid) \
- x(BCH_ERR_invalid_sb, invalid_sb_offset) \
- x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \
- x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \
- x(BCH_ERR_invalid_sb, invalid_sb_time_precision) \
- x(BCH_ERR_invalid_sb, invalid_sb_field_size) \
- x(BCH_ERR_invalid_sb, invalid_sb_layout) \
- x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_type) \
- x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_nr_superblocks) \
- x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_superblocks_overlap) \
- x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_sb_max_size_bits) \
- x(BCH_ERR_invalid_sb, invalid_sb_members_missing) \
- x(BCH_ERR_invalid_sb, invalid_sb_members) \
- x(BCH_ERR_invalid_sb, invalid_sb_disk_groups) \
- x(BCH_ERR_invalid_sb, invalid_sb_replicas) \
- x(BCH_ERR_invalid_sb, invalid_replicas_entry) \
- x(BCH_ERR_invalid_sb, invalid_sb_journal) \
- x(BCH_ERR_invalid_sb, invalid_sb_journal_seq_blacklist) \
- x(BCH_ERR_invalid_sb, invalid_sb_crypt) \
- x(BCH_ERR_invalid_sb, invalid_sb_clean) \
- x(BCH_ERR_invalid_sb, invalid_sb_quota) \
- x(BCH_ERR_invalid_sb, invalid_sb_errors) \
- x(BCH_ERR_invalid_sb, invalid_sb_opt_compression) \
- x(BCH_ERR_invalid_sb, invalid_sb_ext) \
- x(BCH_ERR_invalid_sb, invalid_sb_downgrade) \
- x(BCH_ERR_invalid, invalid_bkey) \
- x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
- x(EROFS, journal_shutdown) \
- x(EIO, journal_flush_err) \
- x(EIO, journal_write_err) \
- x(EIO, btree_node_read_err) \
- x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \
- x(EIO, sb_not_downgraded) \
- x(EIO, btree_node_write_all_failed) \
- x(EIO, btree_node_read_error) \
- x(EIO, btree_need_topology_repair) \
- x(EIO, bucket_ref_update) \
- x(EIO, trigger_alloc) \
- x(EIO, trigger_pointer) \
- x(EIO, trigger_stripe_pointer) \
- x(EIO, metadata_bucket_inconsistency) \
- x(EIO, mark_stripe) \
- x(EIO, stripe_reconstruct) \
- x(EIO, key_type_error) \
- x(EIO, extent_poisoned) \
- x(EIO, missing_indirect_extent) \
- x(EIO, invalidate_stripe_to_dev) \
- x(EIO, no_encryption_key) \
- x(EIO, insufficient_journal_devices) \
- x(EIO, device_offline) \
- x(EIO, EIO_fault_injected) \
- x(EIO, ec_block_read) \
- x(EIO, ec_block_write) \
- x(EIO, recompute_checksum) \
- x(EIO, decompress) \
- x(BCH_ERR_decompress, decompress_exceeded_max_encoded_extent) \
- x(BCH_ERR_decompress, decompress_lz4) \
- x(BCH_ERR_decompress, decompress_gzip) \
- x(BCH_ERR_decompress, decompress_zstd_src_len_bad) \
- x(BCH_ERR_decompress, decompress_zstd) \
- x(EIO, data_write) \
- x(BCH_ERR_data_write, data_write_io) \
- x(BCH_ERR_data_write, data_write_csum) \
- x(BCH_ERR_data_write, data_write_invalid_ptr) \
- x(BCH_ERR_data_write, data_write_misaligned) \
- x(BCH_ERR_decompress, data_read) \
- x(BCH_ERR_data_read, no_device_to_read_from) \
- x(BCH_ERR_data_read, no_devices_valid) \
- x(BCH_ERR_data_read, data_read_io_err) \
- x(BCH_ERR_data_read, data_read_csum_err) \
- x(BCH_ERR_data_read, data_read_retry) \
- x(BCH_ERR_data_read_retry, data_read_retry_avoid) \
- x(BCH_ERR_data_read_retry_avoid,data_read_retry_device_offline) \
- x(BCH_ERR_data_read_retry_avoid,data_read_retry_io_err) \
- x(BCH_ERR_data_read_retry_avoid,data_read_retry_ec_reconstruct_err) \
- x(BCH_ERR_data_read_retry_avoid,data_read_retry_csum_err) \
- x(BCH_ERR_data_read_retry, data_read_retry_csum_err_maybe_userspace)\
- x(BCH_ERR_data_read, data_read_decompress_err) \
- x(BCH_ERR_data_read, data_read_decrypt_err) \
- x(BCH_ERR_data_read, data_read_ptr_stale_race) \
- x(BCH_ERR_data_read_retry, data_read_ptr_stale_retry) \
- x(BCH_ERR_data_read, data_read_no_encryption_key) \
- x(BCH_ERR_data_read, data_read_buffer_too_small) \
- x(BCH_ERR_data_read, data_read_key_overwritten) \
- x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
- x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
- x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
- x(BCH_ERR_btree_node_read_err, btree_node_read_err_bad_node) \
- x(BCH_ERR_btree_node_read_err, btree_node_read_err_incompatible) \
- x(0, nopromote) \
- x(BCH_ERR_nopromote, nopromote_may_not) \
- x(BCH_ERR_nopromote, nopromote_already_promoted) \
- x(BCH_ERR_nopromote, nopromote_unwritten) \
- x(BCH_ERR_nopromote, nopromote_congested) \
- x(BCH_ERR_nopromote, nopromote_in_flight) \
- x(BCH_ERR_nopromote, nopromote_no_writes) \
- x(BCH_ERR_nopromote, nopromote_enomem) \
- x(0, invalid_snapshot_node) \
- x(0, option_needs_open_fs) \
- x(0, remove_disk_accounting_entry)
-
-enum bch_errcode {
- BCH_ERR_START = 2048,
-#define x(class, err) BCH_ERR_##err,
- BCH_ERRCODES()
-#undef x
- BCH_ERR_MAX
-};
-
-__attribute__((const)) const char *bch2_err_str(int);
-
-__attribute__((const)) bool __bch2_err_matches(int, int);
-
-__attribute__((const))
-static inline bool _bch2_err_matches(int err, int class)
-{
- return err < 0 && __bch2_err_matches(err, class);
-}
-
-#define bch2_err_matches(_err, _class) \
-({ \
- BUILD_BUG_ON(!__builtin_constant_p(_class)); \
- unlikely(_bch2_err_matches(_err, _class)); \
-})
-
-int __bch2_err_class(int);
-
-static inline long bch2_err_class(long err)
-{
- return err < 0 ? __bch2_err_class(err) : err;
-}
-
-#define BLK_STS_REMOVED ((__force blk_status_t)128)
-
-#include <linux/blk_types.h>
-const char *bch2_blk_status_to_str(blk_status_t);
-
-#endif /* _BCACHEFS_ERRCODE_H */
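
As a hedged sketch (not part of the deleted header) of the parent walk that bch2_err_matches()/__bch2_err_matches() above rely on: every private error code names a parent class, and matching follows parents until a standard errno is reached. The two-entry table and demo_ names below are invented for the example.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define DEMO_ERR_START	2048
enum {
	DEMO_ERR_freelist_empty = DEMO_ERR_START,
	DEMO_ERR_no_buckets_found,
	DEMO_ERR_MAX
};

static const int demo_parents[] = {
	[DEMO_ERR_freelist_empty   - DEMO_ERR_START] = ENOSPC,
	[DEMO_ERR_no_buckets_found - DEMO_ERR_START] = DEMO_ERR_freelist_empty,
};

static int demo_err_matches(int err, int class)
{
	err	= abs(err);
	class	= abs(class);

	/* walk up the class hierarchy until we hit a standard errno */
	while (err >= DEMO_ERR_START && err != class)
		err = demo_parents[err - DEMO_ERR_START];

	return err == class;
}

int main(void)
{
	printf("%d\n", demo_err_matches(-DEMO_ERR_no_buckets_found, ENOSPC));	/* 1 */
	printf("%d\n", demo_err_matches(-DEMO_ERR_no_buckets_found, EIO));	/* 0 */
	return 0;
}
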
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
deleted file mode 100644
index 267e73d9d7e6..000000000000
--- a/fs/bcachefs/error.c
+++ /dev/null
@@ -1,771 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "error.h"
-#include "journal.h"
-#include "namei.h"
-#include "recovery_passes.h"
-#include "super.h"
-#include "thread_with_file.h"
-
-#define FSCK_ERR_RATELIMIT_NR 10
-
-void __bch2_log_msg_start(const char *fs_or_dev_name, struct printbuf *out)
-{
- printbuf_indent_add_nextline(out, 2);
-
-#ifdef BCACHEFS_LOG_PREFIX
- prt_printf(out, "bcachefs (%s): ", fs_or_dev_name);
-#endif
-}
-
-bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out)
-{
- set_bit(BCH_FS_error, &c->flags);
-
- switch (c->opts.errors) {
- case BCH_ON_ERROR_continue:
- return false;
- case BCH_ON_ERROR_fix_safe:
- case BCH_ON_ERROR_ro:
- bch2_fs_emergency_read_only2(c, out);
- return true;
- case BCH_ON_ERROR_panic:
- bch2_print_str(c, KERN_ERR, out->buf);
- panic(bch2_fmt(c, "panic after error"));
- return true;
- default:
- BUG();
- }
-}
-
-bool bch2_inconsistent_error(struct bch_fs *c)
-{
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
-
- printbuf_indent_add_nextline(&buf, 2);
-
- bool ret = __bch2_inconsistent_error(c, &buf);
- if (ret)
- bch_err(c, "%s", buf.buf);
- printbuf_exit(&buf);
- return ret;
-}
-
-__printf(3, 0)
-static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *trans,
- const char *fmt, va_list args)
-{
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
-
- bch2_log_msg_start(c, &buf);
-
- prt_vprintf(&buf, fmt, args);
- prt_newline(&buf);
-
- if (trans)
- bch2_trans_updates_to_text(&buf, trans);
- bool ret = __bch2_inconsistent_error(c, &buf);
- bch2_print_str(c, KERN_ERR, buf.buf);
-
- printbuf_exit(&buf);
- return ret;
-}
-
-bool bch2_fs_inconsistent(struct bch_fs *c, const char *fmt, ...)
-{
- va_list args;
- va_start(args, fmt);
- bool ret = bch2_fs_trans_inconsistent(c, NULL, fmt, args);
- va_end(args);
- return ret;
-}
-
-bool bch2_trans_inconsistent(struct btree_trans *trans, const char *fmt, ...)
-{
- va_list args;
- va_start(args, fmt);
- bool ret = bch2_fs_trans_inconsistent(trans->c, trans, fmt, args);
- va_end(args);
- return ret;
-}
-
-int __bch2_topology_error(struct bch_fs *c, struct printbuf *out)
-{
- prt_printf(out, "btree topology error: ");
-
- set_bit(BCH_FS_topology_error, &c->flags);
- if (!test_bit(BCH_FS_in_recovery, &c->flags)) {
- __bch2_inconsistent_error(c, out);
- return bch_err_throw(c, btree_need_topology_repair);
- } else {
- return bch2_run_explicit_recovery_pass(c, out, BCH_RECOVERY_PASS_check_topology, 0) ?:
- bch_err_throw(c, btree_need_topology_repair);
- }
-}
-
-int bch2_fs_topology_error(struct bch_fs *c, const char *fmt, ...)
-{
- struct printbuf buf = PRINTBUF;
-
- bch2_log_msg_start(c, &buf);
-
- va_list args;
- va_start(args, fmt);
- prt_vprintf(&buf, fmt, args);
- va_end(args);
-
- int ret = __bch2_topology_error(c, &buf);
- bch2_print_str(c, KERN_ERR, buf.buf);
-
- printbuf_exit(&buf);
- return ret;
-}
-
-void bch2_fatal_error(struct bch_fs *c)
-{
- if (bch2_fs_emergency_read_only(c))
- bch_err(c, "fatal error - emergency read only");
-}
-
-void bch2_io_error_work(struct work_struct *work)
-{
- struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work);
- struct bch_fs *c = ca->fs;
-
- /* XXX: if it's reads or checksums that are failing, set it to failed */
-
- down_write(&c->state_lock);
- unsigned long write_errors_start = READ_ONCE(ca->write_errors_start);
-
- if (write_errors_start &&
- time_after(jiffies,
- write_errors_start + c->opts.write_error_timeout * HZ)) {
- if (ca->mi.state >= BCH_MEMBER_STATE_ro)
- goto out;
-
- bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
- BCH_FORCE_IF_DEGRADED);
- struct printbuf buf = PRINTBUF;
- __bch2_log_msg_start(ca->name, &buf);
-
- prt_printf(&buf, "writes erroring for %u seconds, setting %s ro",
- c->opts.write_error_timeout,
- dev ? "device" : "filesystem");
- if (!dev)
- bch2_fs_emergency_read_only2(c, &buf);
-
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- }
-out:
- up_write(&c->state_lock);
-}
-
-void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type)
-{
- atomic64_inc(&ca->errors[type]);
-
- if (type == BCH_MEMBER_ERROR_write && !ca->write_errors_start)
- ca->write_errors_start = jiffies;
-
- queue_work(system_long_wq, &ca->io_error_work);
-}
-
-enum ask_yn {
- YN_NO,
- YN_YES,
- YN_ALLNO,
- YN_ALLYES,
-};
-
-static enum ask_yn parse_yn_response(char *buf)
-{
- buf = strim(buf);
-
- if (strlen(buf) == 1)
- switch (buf[0]) {
- case 'n':
- return YN_NO;
- case 'y':
- return YN_YES;
- case 'N':
- return YN_ALLNO;
- case 'Y':
- return YN_ALLYES;
- }
- return -1;
-}
-
-#ifdef __KERNEL__
-static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans)
-{
- struct stdio_redirect *stdio = c->stdio;
-
- if (c->stdio_filter && c->stdio_filter != current)
- stdio = NULL;
-
- if (!stdio)
- return YN_NO;
-
- if (trans)
- bch2_trans_unlock(trans);
-
- unsigned long unlock_long_at = trans ? jiffies + HZ * 2 : 0;
- darray_char line = {};
- int ret;
-
- do {
- unsigned long t;
- bch2_print(c, " (y,n, or Y,N for all errors of this type) ");
-rewait:
- t = unlock_long_at
- ? max_t(long, unlock_long_at - jiffies, 0)
- : MAX_SCHEDULE_TIMEOUT;
-
- int r = bch2_stdio_redirect_readline_timeout(stdio, &line, t);
- if (r == -ETIME) {
- bch2_trans_unlock_long(trans);
- unlock_long_at = 0;
- goto rewait;
- }
-
- if (r < 0) {
- ret = YN_NO;
- break;
- }
-
- darray_last(line) = '\0';
- } while ((ret = parse_yn_response(line.data)) < 0);
-
- darray_exit(&line);
- return ret;
-}
-#else
-
-#include "tools-util.h"
-
-static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans)
-{
- char *buf = NULL;
- size_t buflen = 0;
- int ret;
-
- do {
- fputs(" (y,n, or Y,N for all errors of this type) ", stdout);
- fflush(stdout);
-
- if (getline(&buf, &buflen, stdin) < 0)
- die("error reading from standard input");
- } while ((ret = parse_yn_response(buf)) < 0);
-
- free(buf);
- return ret;
-}
-
-#endif
-
-static struct fsck_err_state *fsck_err_get(struct bch_fs *c,
- enum bch_sb_error_id id)
-{
- struct fsck_err_state *s;
-
- list_for_each_entry(s, &c->fsck_error_msgs, list)
- if (s->id == id) {
- /*
- * move it to the head of the list: repeated fsck errors
- * are common
- */
- list_move(&s->list, &c->fsck_error_msgs);
- return s;
- }
-
- s = kzalloc(sizeof(*s), GFP_NOFS);
- if (!s) {
- if (!c->fsck_alloc_msgs_err)
- bch_err(c, "kmalloc err, cannot ratelimit fsck errs");
- c->fsck_alloc_msgs_err = true;
- return NULL;
- }
-
- INIT_LIST_HEAD(&s->list);
- s->id = id;
- list_add(&s->list, &c->fsck_error_msgs);
- return s;
-}
-
-/* s/fix?/fixing/ s/recreate?/recreating/ */
-static void prt_actioning(struct printbuf *out, const char *action)
-{
- unsigned len = strlen(action);
-
- BUG_ON(action[len - 1] != '?');
- --len;
-
- if (action[len - 1] == 'e')
- --len;
-
- prt_bytes(out, action, len);
- prt_str(out, "ing");
-}
-
-static const u8 fsck_flags_extra[] = {
-#define x(t, n, flags) [BCH_FSCK_ERR_##t] = flags,
- BCH_SB_ERRS()
-#undef x
-};
-
-static int do_fsck_ask_yn(struct bch_fs *c,
- struct btree_trans *trans,
- struct printbuf *question,
- const char *action)
-{
- prt_str(question, ", ");
- prt_str(question, action);
-
- if (bch2_fs_stdio_redirect(c))
- bch2_print(c, "%s", question->buf);
- else
- bch2_print_str(c, KERN_ERR, question->buf);
-
- int ask = bch2_fsck_ask_yn(c, trans);
-
- if (trans) {
- int ret = bch2_trans_relock(trans);
- if (ret)
- return ret;
- }
-
- return ask;
-}
-
-static struct fsck_err_state *count_fsck_err_locked(struct bch_fs *c,
- enum bch_sb_error_id id, const char *msg,
- bool *repeat, bool *print, bool *suppress)
-{
- bch2_sb_error_count(c, id);
-
- struct fsck_err_state *s = fsck_err_get(c, id);
- if (s) {
- /*
- * We may be called multiple times for the same error on
- * transaction restart - this memoizes instead of asking the user
- * multiple times for the same error:
- */
- if (s->last_msg && !strcmp(msg, s->last_msg)) {
- *repeat = true;
- *print = false;
- return s;
- }
-
- kfree(s->last_msg);
- s->last_msg = kstrdup(msg, GFP_KERNEL);
-
- if (c->opts.ratelimit_errors &&
- s->nr >= FSCK_ERR_RATELIMIT_NR) {
- if (s->nr == FSCK_ERR_RATELIMIT_NR)
- *suppress = true;
- else
- *print = false;
- }
-
- s->nr++;
- }
- return s;
-}
-
-bool __bch2_count_fsck_err(struct bch_fs *c,
- enum bch_sb_error_id id, struct printbuf *msg)
-{
- bch2_sb_error_count(c, id);
-
- mutex_lock(&c->fsck_error_msgs_lock);
- bool print = true, repeat = false, suppress = false;
-
- count_fsck_err_locked(c, id, msg->buf, &repeat, &print, &suppress);
- mutex_unlock(&c->fsck_error_msgs_lock);
-
- if (suppress)
- prt_printf(msg, "Ratelimiting new instances of previous error\n");
-
- return print && !repeat;
-}
-
-int bch2_fsck_err_opt(struct bch_fs *c,
- enum bch_fsck_flags flags,
- enum bch_sb_error_id err)
-{
- if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra)))
- flags |= fsck_flags_extra[err];
-
- if (test_bit(BCH_FS_in_fsck, &c->flags)) {
- if (!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE)))
- return bch_err_throw(c, fsck_repair_unimplemented);
-
- switch (c->opts.fix_errors) {
- case FSCK_FIX_exit:
- return bch_err_throw(c, fsck_errors_not_fixed);
- case FSCK_FIX_yes:
- if (flags & FSCK_CAN_FIX)
- return bch_err_throw(c, fsck_fix);
- fallthrough;
- case FSCK_FIX_no:
- if (flags & FSCK_CAN_IGNORE)
- return bch_err_throw(c, fsck_ignore);
- return bch_err_throw(c, fsck_errors_not_fixed);
- case FSCK_FIX_ask:
- if (flags & FSCK_AUTOFIX)
- return bch_err_throw(c, fsck_fix);
- return bch_err_throw(c, fsck_ask);
- default:
- BUG();
- }
- } else {
- if ((flags & FSCK_AUTOFIX) &&
- (c->opts.errors == BCH_ON_ERROR_continue ||
- c->opts.errors == BCH_ON_ERROR_fix_safe))
- return bch_err_throw(c, fsck_fix);
-
- if (c->opts.errors == BCH_ON_ERROR_continue &&
- (flags & FSCK_CAN_IGNORE))
- return bch_err_throw(c, fsck_ignore);
- return bch_err_throw(c, fsck_errors_not_fixed);
- }
-}
-
-int __bch2_fsck_err(struct bch_fs *c,
- struct btree_trans *trans,
- enum bch_fsck_flags flags,
- enum bch_sb_error_id err,
- const char *fmt, ...)
-{
- va_list args;
- struct printbuf buf = PRINTBUF, *out = &buf;
- int ret = 0;
- const char *action_orig = "fix?", *action = action_orig;
-
- might_sleep();
-
- if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra)))
- flags |= fsck_flags_extra[err];
-
- if (!c)
- c = trans->c;
-
- /*
- * Ugly: if there's a transaction in the current task it has to be
- * passed in to unlock if we prompt for user input.
- *
- * But, plumbing a transaction and transaction restarts into
- * bkey_validate() is problematic.
- *
- * So:
- * - make all bkey errors AUTOFIX, they're simple anyways (we just
- * delete the key)
- * - and we don't need to warn if we're not prompting
- */
- WARN_ON((flags & FSCK_CAN_FIX) &&
- !(flags & FSCK_AUTOFIX) &&
- !trans &&
- bch2_current_has_btree_trans(c));
-
- if (test_bit(err, c->sb.errors_silent))
- return flags & FSCK_CAN_FIX
- ? bch_err_throw(c, fsck_fix)
- : bch_err_throw(c, fsck_ignore);
-
- printbuf_indent_add_nextline(out, 2);
-
-#ifdef BCACHEFS_LOG_PREFIX
- if (strncmp(fmt, "bcachefs", 8))
- prt_printf(out, bch2_log_msg(c, ""));
-#endif
-
- va_start(args, fmt);
- prt_vprintf(out, fmt, args);
- va_end(args);
-
- /* Custom fix/continue/recreate/etc.? */
- if (out->buf[out->pos - 1] == '?') {
- const char *p = strrchr(out->buf, ',');
- if (p) {
- out->pos = p - out->buf;
- action = kstrdup(p + 2, GFP_KERNEL);
- if (!action) {
- ret = -ENOMEM;
- goto err;
- }
- }
- }
-
- mutex_lock(&c->fsck_error_msgs_lock);
- bool repeat = false, print = true, suppress = false;
- bool inconsistent = false, exiting = false;
- struct fsck_err_state *s =
- count_fsck_err_locked(c, err, buf.buf, &repeat, &print, &suppress);
- if (repeat) {
- ret = s->ret;
- goto err_unlock;
- }
-
- if ((flags & FSCK_AUTOFIX) &&
- (c->opts.errors == BCH_ON_ERROR_continue ||
- c->opts.errors == BCH_ON_ERROR_fix_safe)) {
- prt_str(out, ", ");
- if (flags & FSCK_CAN_FIX) {
- prt_actioning(out, action);
- ret = bch_err_throw(c, fsck_fix);
- } else {
- prt_str(out, ", continuing");
- ret = bch_err_throw(c, fsck_ignore);
- }
-
- goto print;
- } else if (!test_bit(BCH_FS_in_fsck, &c->flags)) {
- if (c->opts.errors != BCH_ON_ERROR_continue ||
- !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) {
- prt_str_indented(out, ", shutting down\n"
- "error not marked as autofix and not in fsck\n"
- "run fsck, and forward to devs so error can be marked for self-healing");
- inconsistent = true;
- print = true;
- ret = bch_err_throw(c, fsck_errors_not_fixed);
- } else if (flags & FSCK_CAN_FIX) {
- prt_str(out, ", ");
- prt_actioning(out, action);
- ret = bch_err_throw(c, fsck_fix);
- } else {
- prt_str(out, ", continuing");
- ret = bch_err_throw(c, fsck_ignore);
- }
- } else if (c->opts.fix_errors == FSCK_FIX_exit) {
- prt_str(out, ", exiting");
- ret = bch_err_throw(c, fsck_errors_not_fixed);
- } else if (flags & FSCK_CAN_FIX) {
- int fix = s && s->fix
- ? s->fix
- : c->opts.fix_errors;
-
- if (fix == FSCK_FIX_ask) {
- print = false;
-
- ret = do_fsck_ask_yn(c, trans, out, action);
- if (ret < 0)
- goto err_unlock;
-
- if (ret >= YN_ALLNO && s)
- s->fix = ret == YN_ALLNO
- ? FSCK_FIX_no
- : FSCK_FIX_yes;
-
- ret = ret & 1
- ? bch_err_throw(c, fsck_fix)
- : bch_err_throw(c, fsck_ignore);
- } else if (fix == FSCK_FIX_yes ||
- (c->opts.nochanges &&
- !(flags & FSCK_CAN_IGNORE))) {
- prt_str(out, ", ");
- prt_actioning(out, action);
- ret = bch_err_throw(c, fsck_fix);
- } else {
- prt_str(out, ", not ");
- prt_actioning(out, action);
- ret = bch_err_throw(c, fsck_ignore);
- }
- } else {
- if (flags & FSCK_CAN_IGNORE) {
- prt_str(out, ", continuing");
- ret = bch_err_throw(c, fsck_ignore);
- } else {
- prt_str(out, " (repair unimplemented)");
- ret = bch_err_throw(c, fsck_repair_unimplemented);
- }
- }
-
- if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) &&
- (c->opts.fix_errors == FSCK_FIX_exit ||
- !(flags & FSCK_CAN_IGNORE)))
- ret = bch_err_throw(c, fsck_errors_not_fixed);
-
- if (test_bit(BCH_FS_in_fsck, &c->flags) &&
- (!bch2_err_matches(ret, BCH_ERR_fsck_fix) &&
- !bch2_err_matches(ret, BCH_ERR_fsck_ignore))) {
- exiting = true;
- print = true;
- }
-print:
- prt_newline(out);
-
- if (inconsistent)
- __bch2_inconsistent_error(c, out);
- else if (exiting)
- prt_printf(out, "Unable to continue, halting\n");
- else if (suppress)
- prt_printf(out, "Ratelimiting new instances of previous error\n");
-
- if (print) {
- /* possibly strip an empty line, from printbuf_indent_add */
- while (out->pos && out->buf[out->pos - 1] == ' ')
- --out->pos;
- printbuf_nul_terminate(out);
-
- if (bch2_fs_stdio_redirect(c))
- bch2_print(c, "%s", out->buf);
- else
- bch2_print_str(c, KERN_ERR, out->buf);
- }
-
- if (s)
- s->ret = ret;
-
- if (trans &&
- !(flags & FSCK_ERR_NO_LOG) &&
- ret == -BCH_ERR_fsck_fix)
- ret = bch2_trans_log_str(trans, bch2_sb_error_strs[err]) ?: ret;
-err_unlock:
- mutex_unlock(&c->fsck_error_msgs_lock);
-err:
-	/*
-	 * We don't yet track whether the filesystem currently has errors for
-	 * log_fsck_err()s: that would require tracking, for every error type,
-	 * which recovery pass corrects it, in order to get the fsck exit
-	 * status right:
-	 */
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- /* nothing */
- } else if (bch2_err_matches(ret, BCH_ERR_fsck_fix)) {
- set_bit(BCH_FS_errors_fixed, &c->flags);
- } else {
- set_bit(BCH_FS_errors_not_fixed, &c->flags);
- set_bit(BCH_FS_error, &c->flags);
- }
-
- if (action != action_orig)
- kfree(action);
- printbuf_exit(&buf);
-
- BUG_ON(!ret);
- return ret;
-}
-
-static const char * const bch2_bkey_validate_contexts[] = {
-#define x(n) #n,
- BKEY_VALIDATE_CONTEXTS()
-#undef x
- NULL
-};
-
-int __bch2_bkey_fsck_err(struct bch_fs *c,
- struct bkey_s_c k,
- struct bkey_validate_context from,
- enum bch_sb_error_id err,
- const char *fmt, ...)
-{
- if (from.flags & BCH_VALIDATE_silent)
- return bch_err_throw(c, fsck_delete_bkey);
-
- unsigned fsck_flags = 0;
- if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) {
- if (test_bit(err, c->sb.errors_silent))
- return bch_err_throw(c, fsck_delete_bkey);
-
- fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX;
- }
- if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra)))
- fsck_flags |= fsck_flags_extra[err];
-
- struct printbuf buf = PRINTBUF;
- prt_printf(&buf, "invalid bkey in %s",
- bch2_bkey_validate_contexts[from.from]);
-
- if (from.from == BKEY_VALIDATE_journal)
- prt_printf(&buf, " journal seq=%llu offset=%u",
- from.journal_seq, from.journal_offset);
-
- prt_str(&buf, " btree=");
- bch2_btree_id_to_text(&buf, from.btree);
- prt_printf(&buf, " level=%u: ", from.level);
-
- bch2_bkey_val_to_text(&buf, c, k);
- prt_newline(&buf);
-
- va_list args;
- va_start(args, fmt);
- prt_vprintf(&buf, fmt, args);
- va_end(args);
-
- int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s, delete?", buf.buf);
- printbuf_exit(&buf);
- return ret;
-}
-
-static void __bch2_flush_fsck_errs(struct bch_fs *c, bool print)
-{
- struct fsck_err_state *s, *n;
-
- mutex_lock(&c->fsck_error_msgs_lock);
-
- list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) {
- if (print && s->ratelimited && s->last_msg)
- bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg);
-
- list_del(&s->list);
- kfree(s->last_msg);
- kfree(s);
- }
-
- mutex_unlock(&c->fsck_error_msgs_lock);
-}
-
-void bch2_flush_fsck_errs(struct bch_fs *c)
-{
- __bch2_flush_fsck_errs(c, true);
-}
-
-void bch2_free_fsck_errs(struct bch_fs *c)
-{
- __bch2_flush_fsck_errs(c, false);
-}
-
-int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
- subvol_inum inum, u64 offset)
-{
- u32 restart_count = trans->restart_count;
- int ret = 0;
-
- if (inum.subvol) {
- ret = bch2_inum_to_path(trans, inum, out);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- return ret;
- }
- if (!inum.subvol || ret)
- prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum);
- prt_printf(out, " offset %llu: ", offset);
-
- return trans_was_restarted(trans, restart_count);
-}
-
-void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out,
- subvol_inum inum, u64 offset)
-{
- bch2_trans_do(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset));
-}
-
-int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
- struct bpos pos)
-{
- int ret = bch2_inum_snapshot_to_path(trans, pos.inode, pos.snapshot, NULL, out);
- if (ret)
- return ret;
-
- prt_printf(out, " offset %llu: ", pos.offset << 8);
- return 0;
-}
-
-void bch2_inum_snap_offset_err_msg(struct bch_fs *c, struct printbuf *out,
- struct bpos pos)
-{
- bch2_trans_do(c, bch2_inum_snap_offset_err_msg_trans(trans, out, pos));
-}
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
deleted file mode 100644
index 0c3c3a24fc6f..000000000000
--- a/fs/bcachefs/error.h
+++ /dev/null
@@ -1,258 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ERROR_H
-#define _BCACHEFS_ERROR_H
-
-#include <linux/list.h>
-#include <linux/printk.h>
-#include "bkey_types.h"
-#include "sb-errors.h"
-
-struct bch_dev;
-struct bch_fs;
-struct work_struct;
-
-/*
- * XXX: separate out errors that indicate on disk data is inconsistent, and flag
- * superblock as such
- */
-
-/* Error messages: */
-
-void __bch2_log_msg_start(const char *, struct printbuf *);
-
-static inline void bch2_log_msg_start(struct bch_fs *c, struct printbuf *out)
-{
- __bch2_log_msg_start(c->name, out);
-}
-
-/*
- * Inconsistency errors: The on disk data is inconsistent. If these occur during
- * initial recovery, they don't indicate a bug in the running code - we walk all
- * the metadata before modifying anything. If they occur at runtime, they
- * indicate either a bug in the running code or (less likely) data is being
- * silently corrupted under us.
- *
- * XXX: audit all inconsistent errors and make sure they're all recoverable, in
- * BCH_ON_ERROR_CONTINUE mode
- */
-
-bool __bch2_inconsistent_error(struct bch_fs *, struct printbuf *);
-bool bch2_inconsistent_error(struct bch_fs *);
-__printf(2, 3)
-bool bch2_fs_inconsistent(struct bch_fs *, const char *, ...);
-
-#define bch2_fs_inconsistent_on(cond, ...) \
-({ \
- bool _ret = unlikely(!!(cond)); \
- if (_ret) \
- bch2_fs_inconsistent(__VA_ARGS__); \
- _ret; \
-})
-
-__printf(2, 3)
-bool bch2_trans_inconsistent(struct btree_trans *, const char *, ...);
-
-#define bch2_trans_inconsistent_on(cond, ...) \
-({ \
- bool _ret = unlikely(!!(cond)); \
- if (_ret) \
- bch2_trans_inconsistent(__VA_ARGS__); \
- _ret; \
-})
-
-int __bch2_topology_error(struct bch_fs *, struct printbuf *);
-__printf(2, 3)
-int bch2_fs_topology_error(struct bch_fs *, const char *, ...);
-
-/*
- * Fsck errors: inconsistency errors we detect at mount time, and should ideally
- * be able to repair:
- */
-
-struct fsck_err_state {
- struct list_head list;
- enum bch_sb_error_id id;
- u64 nr;
- bool ratelimited;
- int ret;
- int fix;
- char *last_msg;
-};
-
-#define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err)
-
-bool __bch2_count_fsck_err(struct bch_fs *, enum bch_sb_error_id, struct printbuf *);
-#define bch2_count_fsck_err(_c, _err, ...) \
- __bch2_count_fsck_err(_c, BCH_FSCK_ERR_##_err, __VA_ARGS__)
-
-int bch2_fsck_err_opt(struct bch_fs *,
- enum bch_fsck_flags,
- enum bch_sb_error_id);
-
-__printf(5, 6) __cold
-int __bch2_fsck_err(struct bch_fs *, struct btree_trans *,
- enum bch_fsck_flags,
- enum bch_sb_error_id,
- const char *, ...);
-#define bch2_fsck_err(c, _flags, _err_type, ...) \
- __bch2_fsck_err(type_is(c, struct bch_fs *) ? (struct bch_fs *) c : NULL,\
- type_is(c, struct btree_trans *) ? (struct btree_trans *) c : NULL,\
- _flags, BCH_FSCK_ERR_##_err_type, __VA_ARGS__)
-
-void bch2_flush_fsck_errs(struct bch_fs *);
-void bch2_free_fsck_errs(struct bch_fs *);
-
-#define fsck_err_wrap(_do) \
-({ \
- int _ret = _do; \
- if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) && \
- !bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) { \
- ret = _ret; \
- goto fsck_err; \
- } \
- \
- bch2_err_matches(_ret, BCH_ERR_fsck_fix); \
-})
-
-#define __fsck_err(...) fsck_err_wrap(bch2_fsck_err(__VA_ARGS__))
-
-/* These macros return true if error should be fixed: */
-
-/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
-
-#define __fsck_err_on(cond, c, _flags, _err_type, ...) \
-({ \
- might_sleep(); \
- \
- if (type_is(c, struct bch_fs *)) \
- WARN_ON(bch2_current_has_btree_trans((struct bch_fs *) c));\
- \
- (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false);\
-})
-
-#define mustfix_fsck_err(c, _err_type, ...) \
- __fsck_err(c, FSCK_CAN_FIX, _err_type, __VA_ARGS__)
-
-#define mustfix_fsck_err_on(cond, c, _err_type, ...) \
- __fsck_err_on(cond, c, FSCK_CAN_FIX, _err_type, __VA_ARGS__)
-
-#define fsck_err(c, _err_type, ...) \
- __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
-
-#define fsck_err_on(cond, c, _err_type, ...) \
- __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
-
-#define log_fsck_err(c, _err_type, ...) \
- __fsck_err(c, FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
-
-#define log_fsck_err_on(cond, ...) \
-({ \
- bool _ret = unlikely(!!(cond)); \
- if (_ret) \
- log_fsck_err(__VA_ARGS__); \
- _ret; \
-})
-
-enum bch_validate_flags;
-__printf(5, 6)
-int __bch2_bkey_fsck_err(struct bch_fs *,
- struct bkey_s_c,
- struct bkey_validate_context from,
- enum bch_sb_error_id,
- const char *, ...);
-
-/*
- * for now, bkey fsck errors are always handled by deleting the entire key -
- * this will change at some point
- */
-#define bkey_fsck_err(c, _err_type, _err_msg, ...) \
-do { \
- int _ret = __bch2_bkey_fsck_err(c, k, from, \
- BCH_FSCK_ERR_##_err_type, \
- _err_msg, ##__VA_ARGS__); \
- if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) && \
- !bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) \
- ret = _ret; \
- ret = bch_err_throw(c, fsck_delete_bkey); \
- goto fsck_err; \
-} while (0)
-
-#define bkey_fsck_err_on(cond, ...) \
-do { \
- if (unlikely(cond)) \
- bkey_fsck_err(__VA_ARGS__); \
-} while (0)
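A minimal sketch of the calling convention these macros assume, modelled on bch2_btree_ptr_validate() in extents.c further down this diff; the caller must declare an `int ret` and provide an `fsck_err:` label, and the function name example_validate is invented purely for illustration:

/* sketch only -- mirrors bch2_btree_ptr_validate() below, not part of the tree */
int example_validate(struct bch_fs *c, struct bkey_s_c k,
		     struct bkey_validate_context from)
{
	int ret = 0;

	/* a failed check logs the error, sets ret, and jumps to fsck_err: */
	bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX,
			 c, btree_ptr_val_too_big,
			 "value too big (%zu > %u)",
			 bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
fsck_err:
	return ret;
}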
-
-/*
- * Fatal errors: these don't indicate a bug, but we can't continue running in RW
- * mode - pretty much just due to metadata IO errors:
- */
-
-void bch2_fatal_error(struct bch_fs *);
-
-#define bch2_fs_fatal_error(c, _msg, ...) \
-do { \
- bch_err(c, "%s(): fatal error " _msg, __func__, ##__VA_ARGS__); \
- bch2_fatal_error(c); \
-} while (0)
-
-#define bch2_fs_fatal_err_on(cond, c, ...) \
-({ \
- bool _ret = unlikely(!!(cond)); \
- \
- if (_ret) \
- bch2_fs_fatal_error(c, __VA_ARGS__); \
- _ret; \
-})
-
-/*
- * IO errors: either recoverable metadata IO (because we have replicas), or data
- * IO - we need to log it and print out a message, but we don't (necessarily)
- * want to shut down the fs:
- */
-
-void bch2_io_error_work(struct work_struct *);
-
-/* Does the error handling without logging a message */
-void bch2_io_error(struct bch_dev *, enum bch_member_error_type);
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-void bch2_latency_acct(struct bch_dev *, u64, int);
-#else
-static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
-#endif
-
-static inline void bch2_account_io_success_fail(struct bch_dev *ca,
- enum bch_member_error_type type,
- bool success)
-{
- if (likely(success)) {
- if (type == BCH_MEMBER_ERROR_write &&
- ca->write_errors_start)
- ca->write_errors_start = 0;
- } else {
- bch2_io_error(ca, type);
- }
-}
-
-static inline void bch2_account_io_completion(struct bch_dev *ca,
- enum bch_member_error_type type,
- u64 submit_time, bool success)
-{
- if (unlikely(!ca))
- return;
-
- if (type != BCH_MEMBER_ERROR_checksum)
- bch2_latency_acct(ca, submit_time, type);
-
- bch2_account_io_success_fail(ca, type, success);
-}
-
-int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64);
-
-void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64);
-
-int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *, struct printbuf *, struct bpos);
-void bch2_inum_snap_offset_err_msg(struct bch_fs *, struct printbuf *, struct bpos);
-
-#endif /* _BCACHEFS_ERROR_H */
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
deleted file mode 100644
index e76e58a568bf..000000000000
--- a/fs/bcachefs/extent_update.c
+++ /dev/null
@@ -1,155 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "buckets.h"
-#include "debug.h"
-#include "extents.h"
-#include "extent_update.h"
-
-/*
- * This counts the number of iterators to the alloc & ec btrees we'll need when
- * inserting/removing this extent:
- */
-static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- unsigned ret = 0, lru = 0;
-
- bkey_extent_entry_for_each(ptrs, entry) {
- switch (__extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_ptr:
- /* Might also be updating LRU btree */
- if (entry->ptr.cached)
- lru++;
-
- fallthrough;
- case BCH_EXTENT_ENTRY_stripe_ptr:
- ret++;
- }
- }
-
- /*
- * Updating keys in the alloc btree may also update keys in the
- * freespace or discard btrees:
- */
- return lru + ret * 2;
-}
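As an invented worked example: an extent with two pointers, one of them cached and no stripe pointers, gives ret = 2 and lru = 1 above, so this function reports 1 + 2 * 2 = 5 iterators.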
-
-#define EXTENT_ITERS_MAX 64
-
-static int count_iters_for_insert(struct btree_trans *trans,
- struct bkey_s_c k,
- unsigned offset,
- struct bpos *end,
- unsigned *nr_iters)
-{
- int ret = 0, ret2 = 0;
-
- if (*nr_iters >= EXTENT_ITERS_MAX) {
- *end = bpos_min(*end, k.k->p);
- ret = 1;
- }
-
- switch (k.k->type) {
- case KEY_TYPE_extent:
- case KEY_TYPE_reflink_v:
- *nr_iters += bch2_bkey_nr_alloc_ptrs(k);
-
- if (*nr_iters >= EXTENT_ITERS_MAX) {
- *end = bpos_min(*end, k.k->p);
- ret = 1;
- }
-
- break;
- case KEY_TYPE_reflink_p: {
- struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- u64 idx = REFLINK_P_IDX(p.v);
- unsigned sectors = bpos_min(*end, p.k->p).offset -
- bkey_start_offset(p.k);
- struct btree_iter iter;
- struct bkey_s_c r_k;
-
- for_each_btree_key_norestart(trans, iter,
- BTREE_ID_reflink, POS(0, idx + offset),
- BTREE_ITER_slots, r_k, ret2) {
- if (bkey_ge(bkey_start_pos(r_k.k), POS(0, idx + sectors)))
- break;
-
- /* extent_update_to_keys(), for the reflink_v update */
- *nr_iters += 1;
-
- *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k);
-
- if (*nr_iters >= EXTENT_ITERS_MAX) {
- struct bpos pos = bkey_start_pos(k.k);
- pos.offset += min_t(u64, k.k->size,
- r_k.k->p.offset - idx);
-
- *end = bpos_min(*end, pos);
- ret = 1;
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-
- break;
- }
- }
-
- return ret2 ?: ret;
-}
-
-int bch2_extent_atomic_end(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos *end)
-{
- unsigned nr_iters = 0;
-
- struct btree_iter copy;
- bch2_trans_copy_iter(trans, &copy, iter);
-
- int ret = bch2_btree_iter_traverse(trans, &copy);
- if (ret)
- goto err;
-
- struct bkey_s_c k;
- for_each_btree_key_max_continue_norestart(trans, copy, *end, 0, k, ret) {
- unsigned offset = 0;
-
- if (bkey_gt(iter->pos, bkey_start_pos(k.k)))
- offset = iter->pos.offset - bkey_start_offset(k.k);
-
- ret = count_iters_for_insert(trans, k, offset, end, &nr_iters);
- if (ret)
- break;
- }
-err:
- bch2_trans_iter_exit(trans, &copy);
- return ret < 0 ? ret : 0;
-}
-
-int bch2_extent_trim_atomic(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *k)
-{
- struct bpos end = k->k.p;
- int ret = bch2_extent_atomic_end(trans, iter, &end);
- if (ret)
- return ret;
-
- /* tracepoint */
-
- if (bpos_lt(end, k->k.p)) {
- if (trace_extent_trim_atomic_enabled()) {
- CLASS(printbuf, buf)();
- bch2_bpos_to_text(&buf, end);
- prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k));
- trace_extent_trim_atomic(trans->c, buf.buf);
- }
- bch2_cut_back(end, k);
- }
- return 0;
-}
diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h
deleted file mode 100644
index 34467db53f45..000000000000
--- a/fs/bcachefs/extent_update.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EXTENT_UPDATE_H
-#define _BCACHEFS_EXTENT_UPDATE_H
-
-#include "bcachefs.h"
-
-int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *,
- struct bpos *);
-int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *,
- struct bkey_i *);
-
-#endif /* _BCACHEFS_EXTENT_UPDATE_H */
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
deleted file mode 100644
index 83cbd77dcb9c..000000000000
--- a/fs/bcachefs/extents.c
+++ /dev/null
@@ -1,1735 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
- *
- * Code for managing the extent btree and dynamically updating the writeback
- * dirty sector count.
- */
-
-#include "bcachefs.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-#include "btree_gc.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "compress.h"
-#include "debug.h"
-#include "disk_groups.h"
-#include "error.h"
-#include "extents.h"
-#include "inode.h"
-#include "journal.h"
-#include "rebalance.h"
-#include "replicas.h"
-#include "super.h"
-#include "super-io.h"
-#include "trace.h"
-#include "util.h"
-
-static const char * const bch2_extent_flags_strs[] = {
-#define x(n, v) [BCH_EXTENT_FLAG_##n] = #n,
- BCH_EXTENT_FLAGS()
-#undef x
- NULL,
-};
-
-static unsigned bch2_crc_field_size_max[] = {
- [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
- [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
- [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX,
-};
-
-static void bch2_extent_crc_pack(union bch_extent_crc *,
- struct bch_extent_crc_unpacked,
- enum bch_extent_entry_type);
-
-void bch2_io_failures_to_text(struct printbuf *out,
- struct bch_fs *c,
- struct bch_io_failures *failed)
-{
- static const char * const error_types[] = {
- "btree validate", "io", "checksum", "ec reconstruct", NULL
- };
-
- for (struct bch_dev_io_failures *f = failed->devs;
- f < failed->devs + failed->nr;
- f++) {
- unsigned errflags =
- ((!!f->failed_btree_validate) << 0) |
- ((!!f->failed_io) << 1) |
- ((!!f->failed_csum_nr) << 2) |
- ((!!f->failed_ec) << 3);
-
- bch2_printbuf_make_room(out, 1024);
- out->atomic++;
- scoped_guard(rcu) {
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, f->dev);
- if (ca)
- prt_str(out, ca->name);
- else
- prt_printf(out, "(invalid device %u)", f->dev);
- }
- --out->atomic;
-
- prt_char(out, ' ');
-
- if (!errflags) {
- prt_str(out, "no error - confused");
- } else if (is_power_of_2(errflags)) {
- prt_bitflags(out, error_types, errflags);
- prt_str(out, " error");
- } else {
- prt_str(out, "errors: ");
- prt_bitflags(out, error_types, errflags);
- }
- prt_newline(out);
- }
-}
-
-struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
- unsigned dev)
-{
- struct bch_dev_io_failures *i;
-
- for (i = f->devs; i < f->devs + f->nr; i++)
- if (i->dev == dev)
- return i;
-
- return NULL;
-}
-
-void bch2_mark_io_failure(struct bch_io_failures *failed,
- struct extent_ptr_decoded *p,
- bool csum_error)
-{
- struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev);
-
- if (!f) {
- BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
-
- f = &failed->devs[failed->nr++];
- memset(f, 0, sizeof(*f));
- f->dev = p->ptr.dev;
- }
-
- if (p->do_ec_reconstruct)
- f->failed_ec = true;
- else if (!csum_error)
- f->failed_io = true;
- else
- f->failed_csum_nr++;
-}
-
-void bch2_mark_btree_validate_failure(struct bch_io_failures *failed,
- unsigned dev)
-{
- struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, dev);
-
- if (!f) {
- BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
-
- f = &failed->devs[failed->nr++];
- memset(f, 0, sizeof(*f));
- f->dev = dev;
- }
-
- f->failed_btree_validate = true;
-}
-
-static inline u64 dev_latency(struct bch_dev *ca)
-{
- return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX;
-}
-
-static inline int dev_failed(struct bch_dev *ca)
-{
- return !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
-}
-
-/*
- * returns true if p1 is better than p2:
- */
-static inline bool ptr_better(struct bch_fs *c,
- const struct extent_ptr_decoded p1,
- u64 p1_latency,
- struct bch_dev *ca1,
- const struct extent_ptr_decoded p2,
- u64 p2_latency)
-{
- struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
-
- int failed_delta = dev_failed(ca1) - dev_failed(ca2);
- if (unlikely(failed_delta))
- return failed_delta < 0;
-
- if (static_branch_unlikely(&bch2_force_reconstruct_read))
- return p1.do_ec_reconstruct > p2.do_ec_reconstruct;
-
- if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct))
- return p1.do_ec_reconstruct < p2.do_ec_reconstruct;
-
- int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr;
- if (unlikely(crc_retry_delta))
- return crc_retry_delta < 0;
-
- /* Pick at random, biased in favor of the faster device: */
-
- return bch2_get_random_u64_below(p1_latency + p2_latency) > p1_latency;
-}
-
-/*
- * This picks a non-stale pointer to read from. If @dev is nonnegative, only
- * pointers on that device are considered. Returns 1 if a pointer was picked,
- * 0 if the extent is unwritten or has no dirty pointers, and a negative error
- * code otherwise.
- */
-int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
- struct bch_io_failures *failed,
- struct extent_ptr_decoded *pick,
- int dev)
-{
- bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false;
- bool have_dirty_ptrs = false, have_pick = false;
-
- if (k.k->type == KEY_TYPE_error)
- return bch_err_throw(c, key_type_error);
-
- rcu_read_lock();
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- u64 pick_latency;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- have_dirty_ptrs |= !p.ptr.cached;
-
- /*
- * Unwritten extent: no need to actually read, treat it as a
- * hole and return 0s:
- */
- if (p.ptr.unwritten) {
- rcu_read_unlock();
- return 0;
- }
-
- /* Are we being asked to read from a specific device? */
- if (dev >= 0 && p.ptr.dev != dev)
- continue;
-
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
-
- if (unlikely(!ca && p.ptr.dev != BCH_SB_MEMBER_INVALID)) {
- rcu_read_unlock();
- int ret = bch2_dev_missing_bkey(c, k, p.ptr.dev);
- if (ret)
- return ret;
- rcu_read_lock();
- }
-
- if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
- continue;
-
- struct bch_dev_io_failures *f =
- unlikely(failed) ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
- if (unlikely(f)) {
- p.crc_retry_nr = f->failed_csum_nr;
- p.has_ec &= ~f->failed_ec;
-
- if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) {
- have_io_errors |= f->failed_io;
- have_io_errors |= f->failed_btree_validate;
- have_io_errors |= f->failed_ec;
- }
- have_csum_errors |= !!f->failed_csum_nr;
-
- if (p.has_ec && (f->failed_io || f->failed_csum_nr))
- p.do_ec_reconstruct = true;
- else if (f->failed_io ||
- f->failed_btree_validate ||
- f->failed_csum_nr > c->opts.checksum_err_retry_nr)
- continue;
- }
-
- have_missing_devs |= ca && !bch2_dev_is_online(ca);
-
- if (!ca || !bch2_dev_is_online(ca)) {
- if (!p.has_ec)
- continue;
- p.do_ec_reconstruct = true;
- }
-
- if (static_branch_unlikely(&bch2_force_reconstruct_read) && p.has_ec)
- p.do_ec_reconstruct = true;
-
- u64 p_latency = dev_latency(ca);
- /*
- * Square the latencies, to bias more in favor of the faster
- * device - we never want to stop issuing reads to the slower
- * device altogether, so that we can update our latency numbers:
- */
- p_latency *= p_latency;
-
- if (!have_pick ||
- ptr_better(c,
- p, p_latency, ca,
- *pick, pick_latency)) {
- *pick = p;
- pick_latency = p_latency;
- have_pick = true;
- }
- }
- rcu_read_unlock();
-
- if (have_pick)
- return 1;
- if (!have_dirty_ptrs)
- return 0;
- if (have_missing_devs)
- return bch_err_throw(c, no_device_to_read_from);
- if (have_csum_errors)
- return bch_err_throw(c, data_read_csum_err);
- if (have_io_errors)
- return bch_err_throw(c, data_read_io_err);
-
- /*
- * If we get here, we have pointers (bkey_ptrs_validate() ensures that),
- * but they don't point to valid devices:
- */
- return bch_err_throw(c, no_devices_valid);
-}
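The read path above squares each device's moving-average latency and then chooses at random with probability proportional to the other device's squared latency, so the faster device wins most reads while the slower one still sees enough traffic to keep its latency estimate current. A self-contained userspace sketch of that weighting, with rand() standing in for bch2_get_random_u64_below() (illustrative only, not kernel code):

#include <stdint.h>
#include <stdlib.h>

/* Returns 1 if the first replica should service the read, 2 otherwise.
 * With lat1 = 1 and lat2 = 3 the weights are 1 and 9, so replica 1 wins
 * roughly 90% of the time; rand() is only a stand-in here. */
static int pick_replica(uint64_t lat1, uint64_t lat2)
{
	uint64_t w1 = lat1 * lat1, w2 = lat2 * lat2;
	uint64_t r = (uint64_t) rand() % (w1 + w2);	/* roughly uniform in [0, w1 + w2) */

	return r >= w1 ? 1 : 2;
}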
-
-/* KEY_TYPE_btree_ptr: */
-
-int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX,
- c, btree_ptr_val_too_big,
- "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
-
- ret = bch2_bkey_ptrs_validate(c, k, from);
-fsck_err:
- return ret;
-}
-
-void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- bch2_bkey_ptrs_to_text(out, c, k);
-}
-
-int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
- int ret = 0;
-
- bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX,
- c, btree_ptr_v2_val_too_big,
- "value too big (%zu > %zu)",
- bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
-
- bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p),
- c, btree_ptr_v2_min_key_bad,
- "min_key > key");
-
- if ((from.flags & BCH_VALIDATE_write) &&
- c->sb.version_min >= bcachefs_metadata_version_btree_ptr_sectors_written)
- bkey_fsck_err_on(!bp.v->sectors_written,
- c, btree_ptr_v2_written_0,
- "sectors_written == 0");
-
- ret = bch2_bkey_ptrs_validate(c, k, from);
-fsck_err:
- return ret;
-}
-
-void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
-
- prt_printf(out, "seq %llx written %u min_key %s",
- le64_to_cpu(bp.v->seq),
- le16_to_cpu(bp.v->sectors_written),
- BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : "");
-
- bch2_bpos_to_text(out, bp.v->min_key);
- prt_printf(out, " ");
- bch2_bkey_ptrs_to_text(out, c, k);
-}
-
-void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
- unsigned big_endian, int write,
- struct bkey_s k)
-{
- struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k);
-
- compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key);
-
- if (version < bcachefs_metadata_version_inode_btree_change &&
- btree_id_is_extents(btree_id) &&
- !bkey_eq(bp.v->min_key, POS_MIN))
- bp.v->min_key = write
- ? bpos_nosnap_predecessor(bp.v->min_key)
- : bpos_nosnap_successor(bp.v->min_key);
-}
-
-/* KEY_TYPE_extent: */
-
-bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
-{
- struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l);
- struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r);
- union bch_extent_entry *en_l;
- const union bch_extent_entry *en_r;
- struct extent_ptr_decoded lp, rp;
- bool use_right_ptr;
-
- en_l = l_ptrs.start;
- en_r = r_ptrs.start;
- while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
- if (extent_entry_type(en_l) != extent_entry_type(en_r))
- return false;
-
- en_l = extent_entry_next(en_l);
- en_r = extent_entry_next(en_r);
- }
-
- if (en_l < l_ptrs.end || en_r < r_ptrs.end)
- return false;
-
- en_l = l_ptrs.start;
- en_r = r_ptrs.start;
- lp.crc = bch2_extent_crc_unpack(l.k, NULL);
- rp.crc = bch2_extent_crc_unpack(r.k, NULL);
-
- guard(rcu)();
-
- while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) &&
- __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) {
- if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size !=
- rp.ptr.offset + rp.crc.offset ||
- lp.ptr.dev != rp.ptr.dev ||
- lp.ptr.gen != rp.ptr.gen ||
- lp.ptr.unwritten != rp.ptr.unwritten ||
- lp.has_ec != rp.has_ec)
- return false;
-
- /* Extents may not straddle buckets: */
- struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev);
- bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr);
-
- if (!same_bucket)
- return false;
-
- if (lp.has_ec != rp.has_ec ||
- (lp.has_ec &&
- (lp.ec.block != rp.ec.block ||
- lp.ec.redundancy != rp.ec.redundancy ||
- lp.ec.idx != rp.ec.idx)))
- return false;
-
- if (lp.crc.compression_type != rp.crc.compression_type ||
- lp.crc.nonce != rp.crc.nonce)
- return false;
-
- if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <=
- lp.crc.uncompressed_size) {
- /* can use left extent's crc entry */
- } else if (lp.crc.live_size <= rp.crc.offset) {
- /* can use right extent's crc entry */
- } else {
- /* check if checksums can be merged: */
- if (lp.crc.csum_type != rp.crc.csum_type ||
- lp.crc.nonce != rp.crc.nonce ||
- crc_is_compressed(lp.crc) ||
- !bch2_checksum_mergeable(lp.crc.csum_type))
- return false;
-
- if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size ||
- rp.crc.offset)
- return false;
-
- if (lp.crc.csum_type &&
- lp.crc.uncompressed_size +
- rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9))
- return false;
- }
-
- en_l = extent_entry_next(en_l);
- en_r = extent_entry_next(en_r);
- }
-
- en_l = l_ptrs.start;
- en_r = r_ptrs.start;
- while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
- if (extent_entry_is_crc(en_l)) {
- struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
- struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
-
- if (crc_l.uncompressed_size + crc_r.uncompressed_size >
- bch2_crc_field_size_max[extent_entry_type(en_l)])
- return false;
- }
-
- en_l = extent_entry_next(en_l);
- en_r = extent_entry_next(en_r);
- }
-
- use_right_ptr = false;
- en_l = l_ptrs.start;
- en_r = r_ptrs.start;
- while (en_l < l_ptrs.end) {
- if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr &&
- use_right_ptr)
- en_l->ptr = en_r->ptr;
-
- if (extent_entry_is_crc(en_l)) {
- struct bch_extent_crc_unpacked crc_l =
- bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
- struct bch_extent_crc_unpacked crc_r =
- bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
-
- use_right_ptr = false;
-
- if (crc_l.offset + crc_l.live_size + crc_r.live_size <=
- crc_l.uncompressed_size) {
- /* can use left extent's crc entry */
- } else if (crc_l.live_size <= crc_r.offset) {
- /* can use right extent's crc entry */
- crc_r.offset -= crc_l.live_size;
- bch2_extent_crc_pack(entry_to_crc(en_l), crc_r,
- extent_entry_type(en_l));
- use_right_ptr = true;
- } else {
- crc_l.csum = bch2_checksum_merge(crc_l.csum_type,
- crc_l.csum,
- crc_r.csum,
- crc_r.uncompressed_size << 9);
-
- crc_l.uncompressed_size += crc_r.uncompressed_size;
- crc_l.compressed_size += crc_r.compressed_size;
- bch2_extent_crc_pack(entry_to_crc(en_l), crc_l,
- extent_entry_type(en_l));
- }
- }
-
- en_l = extent_entry_next(en_l);
- en_r = extent_entry_next(en_r);
- }
-
- bch2_key_resize(l.k, l.k->size + r.k->size);
- return true;
-}
-
-/* KEY_TYPE_reservation: */
-
-int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
- int ret = 0;
-
- bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX,
- c, reservation_key_nr_replicas_invalid,
- "invalid nr_replicas (%u)", r.v->nr_replicas);
-fsck_err:
- return ret;
-}
-
-void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
-
- prt_printf(out, "generation %u replicas %u",
- le32_to_cpu(r.v->generation),
- r.v->nr_replicas);
-}
-
-bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
-{
- struct bkey_s_reservation l = bkey_s_to_reservation(_l);
- struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r);
-
- if (l.v->generation != r.v->generation ||
- l.v->nr_replicas != r.v->nr_replicas)
- return false;
-
- bch2_key_resize(l.k, l.k->size + r.k->size);
- return true;
-}
-
-/* Extent checksum entries: */
-
-/* returns true if not equal */
-static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
- struct bch_extent_crc_unpacked r)
-{
- return (l.csum_type != r.csum_type ||
- l.compression_type != r.compression_type ||
- l.compressed_size != r.compressed_size ||
- l.uncompressed_size != r.uncompressed_size ||
- l.offset != r.offset ||
- l.live_size != r.live_size ||
- l.nonce != r.nonce ||
- bch2_crc_cmp(l.csum, r.csum));
-}
-
-static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
- struct bch_extent_crc_unpacked n)
-{
- return !crc_is_compressed(u) &&
- u.csum_type &&
- u.uncompressed_size > u.live_size &&
- bch2_csum_type_is_encryption(u.csum_type) ==
- bch2_csum_type_is_encryption(n.csum_type);
-}
-
-bool bch2_can_narrow_extent_crcs(struct bkey_s_c k,
- struct bch_extent_crc_unpacked n)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct bch_extent_crc_unpacked crc;
- const union bch_extent_entry *i;
-
- if (!n.csum_type)
- return false;
-
- bkey_for_each_crc(k.k, ptrs, crc, i)
- if (can_narrow_crc(crc, n))
- return true;
-
- return false;
-}
-
-/*
- * We're writing another replica for this extent, so while we've got the data in
- * memory we'll be computing a new checksum for the currently live data.
- *
- * If there are other replicas we aren't moving, and they are checksummed but
- * not compressed, we can modify them to point to only the data that is
- * currently live (so that readers won't have to bounce) while we've got the
- * checksum we need:
- */
-bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n)
-{
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
- struct bch_extent_crc_unpacked u;
- struct extent_ptr_decoded p;
- union bch_extent_entry *i;
- bool ret = false;
-
- /* Find a checksum entry that covers only live data: */
- if (!n.csum_type) {
- bkey_for_each_crc(&k->k, ptrs, u, i)
- if (!crc_is_compressed(u) &&
- u.csum_type &&
- u.live_size == u.uncompressed_size) {
- n = u;
- goto found;
- }
- return false;
- }
-found:
- BUG_ON(crc_is_compressed(n));
- BUG_ON(n.offset);
- BUG_ON(n.live_size != k->k.size);
-
-restart_narrow_pointers:
- ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
-
- bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
- if (can_narrow_crc(p.crc, n)) {
- bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr);
- p.ptr.offset += p.crc.offset;
- p.crc = n;
- bch2_extent_ptr_decoded_append(k, &p);
- ret = true;
- goto restart_narrow_pointers;
- }
-
- return ret;
-}
-
-static void bch2_extent_crc_pack(union bch_extent_crc *dst,
- struct bch_extent_crc_unpacked src,
- enum bch_extent_entry_type type)
-{
-#define common_fields(_src) \
- .type = BIT(type), \
- .csum_type = _src.csum_type, \
- .compression_type = _src.compression_type, \
- ._compressed_size = _src.compressed_size - 1, \
- ._uncompressed_size = _src.uncompressed_size - 1, \
- .offset = _src.offset
-
- switch (type) {
- case BCH_EXTENT_ENTRY_crc32:
- dst->crc32 = (struct bch_extent_crc32) {
- common_fields(src),
- .csum = (u32 __force) *((__le32 *) &src.csum.lo),
- };
- break;
- case BCH_EXTENT_ENTRY_crc64:
- dst->crc64 = (struct bch_extent_crc64) {
- common_fields(src),
- .nonce = src.nonce,
- .csum_lo = (u64 __force) src.csum.lo,
- .csum_hi = (u64 __force) *((__le16 *) &src.csum.hi),
- };
- break;
- case BCH_EXTENT_ENTRY_crc128:
- dst->crc128 = (struct bch_extent_crc128) {
- common_fields(src),
- .nonce = src.nonce,
- .csum = src.csum,
- };
- break;
- default:
- BUG();
- }
-#undef common_fields
-}
-
-void bch2_extent_crc_append(struct bkey_i *k,
- struct bch_extent_crc_unpacked new)
-{
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
- union bch_extent_crc *crc = (void *) ptrs.end;
- enum bch_extent_entry_type type;
-
- if (bch_crc_bytes[new.csum_type] <= 4 &&
- new.uncompressed_size <= CRC32_SIZE_MAX &&
- new.nonce <= CRC32_NONCE_MAX)
- type = BCH_EXTENT_ENTRY_crc32;
- else if (bch_crc_bytes[new.csum_type] <= 10 &&
- new.uncompressed_size <= CRC64_SIZE_MAX &&
- new.nonce <= CRC64_NONCE_MAX)
- type = BCH_EXTENT_ENTRY_crc64;
- else if (bch_crc_bytes[new.csum_type] <= 16 &&
- new.uncompressed_size <= CRC128_SIZE_MAX &&
- new.nonce <= CRC128_NONCE_MAX)
- type = BCH_EXTENT_ENTRY_crc128;
- else
- BUG();
-
- bch2_extent_crc_pack(crc, new, type);
-
- k->k.u64s += extent_entry_u64s(ptrs.end);
-
- EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX);
-}
-
-/* Generic code for keys with pointers: */
-
-unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k)
-{
- return bch2_bkey_devs(k).nr;
-}
-
-unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
-{
- return k.k->type == KEY_TYPE_reservation
- ? bkey_s_c_to_reservation(k).v->nr_replicas
- : bch2_bkey_dirty_devs(k).nr;
-}
-
-unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k)
-{
- unsigned ret = 0;
-
- if (k.k->type == KEY_TYPE_reservation) {
- ret = bkey_s_c_to_reservation(k).v->nr_replicas;
- } else {
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- ret += !p.ptr.cached && !crc_is_compressed(p.crc);
- }
-
- return ret;
-}
-
-unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned ret = 0;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (!p.ptr.cached && crc_is_compressed(p.crc))
- ret += p.crc.compressed_size;
-
- return ret;
-}
-
-bool bch2_bkey_is_incompressible(struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct bch_extent_crc_unpacked crc;
-
- bkey_for_each_crc(k.k, ptrs, crc, entry)
- if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
- return true;
- return false;
-}
-
-unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p = { 0 };
- unsigned replicas = 0;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (p.ptr.cached)
- continue;
-
- if (p.has_ec)
- replicas += p.ec.redundancy;
-
- replicas++;
-
- }
-
- return replicas;
-}
-
-static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent_ptr_decoded *p)
-{
- if (p->ptr.cached)
- return 0;
-
- return p->has_ec
- ? p->ec.redundancy + 1
- : ca->mi.durability;
-}
-
-unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
-{
- struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);
-
- return ca ? __extent_ptr_durability(ca, p) : 0;
-}
-
-unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
-{
- struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);
-
- if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed)
- return 0;
-
- return __extent_ptr_durability(ca, p);
-}
-
-unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned durability = 0;
-
- guard(rcu)();
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- durability += bch2_extent_ptr_durability(c, &p);
- return durability;
-}
-
-static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned durability = 0;
-
- guard(rcu)();
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev])
- durability += bch2_extent_ptr_durability(c, &p);
- return durability;
-}
-
-void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry)
-{
- union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
- union bch_extent_entry *next = extent_entry_next(entry);
-
- memmove_u64s(entry, next, (u64 *) end - (u64 *) next);
- k->k.u64s -= extent_entry_u64s(entry);
-}
-
-void bch2_extent_ptr_decoded_append(struct bkey_i *k,
- struct extent_ptr_decoded *p)
-{
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
- struct bch_extent_crc_unpacked crc =
- bch2_extent_crc_unpack(&k->k, NULL);
- union bch_extent_entry *pos;
-
- if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
- pos = ptrs.start;
- goto found;
- }
-
- bkey_for_each_crc(&k->k, ptrs, crc, pos)
- if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
- pos = extent_entry_next(pos);
- goto found;
- }
-
- bch2_extent_crc_append(k, p->crc);
- pos = bkey_val_end(bkey_i_to_s(k));
-found:
- p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
- __extent_entry_insert(k, pos, to_entry(&p->ptr));
-
- if (p->has_ec) {
- p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
- __extent_entry_insert(k, pos, to_entry(&p->ec));
- }
-}
-
-static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
- union bch_extent_entry *entry)
-{
- union bch_extent_entry *i = ptrs.start;
-
- if (i == entry)
- return NULL;
-
- while (extent_entry_next(i) != entry)
- i = extent_entry_next(i);
- return i;
-}
-
-/*
- * Drops the pointer entry, along with any preceding crc or stripe_ptr entries
- * that applied only to it:
- */
-void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr)
-{
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
- union bch_extent_entry *entry = to_entry(ptr), *next;
- bool drop_crc = true;
-
- if (k.k->type == KEY_TYPE_stripe) {
- ptr->dev = BCH_SB_MEMBER_INVALID;
- return;
- }
-
- EBUG_ON(ptr < &ptrs.start->ptr ||
- ptr >= &ptrs.end->ptr);
- EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
-
- for (next = extent_entry_next(entry);
- next != ptrs.end;
- next = extent_entry_next(next)) {
- if (extent_entry_is_crc(next)) {
- break;
- } else if (extent_entry_is_ptr(next)) {
- drop_crc = false;
- break;
- }
- }
-
- extent_entry_drop(k, entry);
-
- while ((entry = extent_entry_prev(ptrs, entry))) {
- if (extent_entry_is_ptr(entry))
- break;
-
- if ((extent_entry_is_crc(entry) && drop_crc) ||
- extent_entry_is_stripe_ptr(entry))
- extent_entry_drop(k, entry);
- }
-}
-
-void bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr)
-{
- if (k.k->type != KEY_TYPE_stripe) {
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k.s_c);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (p.ptr.dev == ptr->dev && p.has_ec) {
- ptr->dev = BCH_SB_MEMBER_INVALID;
- return;
- }
- }
-
- bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
-
- bch2_bkey_drop_ptr_noerror(k, ptr);
-
- /*
- * If we deleted all the dirty pointers and there's still cached
- * pointers, we could set the cached pointers to dirty if they're not
- * stale - but to do that correctly we'd need to grab an open_bucket
- * reference so that we don't race with bucket reuse:
- */
- if (have_dirty &&
- !bch2_bkey_dirty_devs(k.s_c).nr) {
- k.k->type = KEY_TYPE_error;
- set_bkey_val_u64s(k.k, 0);
- } else if (!bch2_bkey_nr_ptrs(k.s_c)) {
- k.k->type = KEY_TYPE_deleted;
- set_bkey_val_u64s(k.k, 0);
- }
-}
-
-void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
-{
- bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
-}
-
-void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
-{
- bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->dev == dev);
-}
-
-const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(ptrs, ptr)
- if (ptr->dev == dev)
- return ptr;
-
- return NULL;
-}
-
-bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct bch_dev *ca;
-
- guard(rcu)();
- bkey_for_each_ptr(ptrs, ptr)
- if (bch2_dev_in_target(c, ptr->dev, target) &&
- (ca = bch2_dev_rcu(c, ptr->dev)) &&
- (!ptr->cached ||
- !dev_ptr_stale_rcu(ca, ptr)))
- return true;
-
- return false;
-}
-
-bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
- struct bch_extent_ptr m, u64 offset)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (p.ptr.dev == m.dev &&
- p.ptr.gen == m.gen &&
- (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) ==
- (s64) m.offset - offset)
- return true;
-
- return false;
-}
-
-/*
- * Returns true if two extents refer to the same data:
- */
-bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
-{
- if (k1.k->type != k2.k->type)
- return false;
-
- if (bkey_extent_is_direct_data(k1.k)) {
- struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1);
- struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2);
- const union bch_extent_entry *entry1, *entry2;
- struct extent_ptr_decoded p1, p2;
-
- if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2))
- return false;
-
- bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1)
- bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
- if (p1.ptr.dev == p2.ptr.dev &&
- p1.ptr.gen == p2.ptr.gen &&
-
- /*
- * This checks that the two pointers point
- * to the same region on disk - adjusting
- * for the difference in where the extents
- * start, since one may have been trimmed:
- */
- (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
- (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) &&
-
- /*
- * This additionally checks that the
- * extents overlap on disk, since the
- * previous check may trigger spuriously
- * when one extent is immediately partially
- * overwritten with another extent (so that
- * on disk they are adjacent) and
- * compression is in use:
- */
- ((p1.ptr.offset >= p2.ptr.offset &&
- p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) ||
- (p2.ptr.offset >= p1.ptr.offset &&
- p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size)))
- return true;
-
- return false;
- } else {
- /* KEY_TYPE_deleted, etc. */
- return true;
- }
-}
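An invented numeric illustration of the two checks above: extent 1 covers file sectors [0,8) stored at disk offset 100 with compressed_size 8, while extent 2 covers [8,16) stored at disk offset 108. The first check computes 100 + 0 - 0 == 108 + 0 - 8, a spurious match; the overlap check then sees that disk ranges [100,108) and [108,116) are merely adjacent rather than overlapping, so the extents are correctly reported as different data.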
-
-struct bch_extent_ptr *
-bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2)
-{
- struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2);
- union bch_extent_entry *entry2;
- struct extent_ptr_decoded p2;
-
- bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
- if (p1.ptr.dev == p2.ptr.dev &&
- p1.ptr.gen == p2.ptr.gen &&
- (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
- (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
- return &entry2->ptr;
-
- return NULL;
-}
-
-static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts,
- struct bch_extent_ptr *ptr)
-{
- unsigned target = opts->promote_target ?: opts->foreground_target;
-
- if (target && !bch2_dev_in_target(c, ptr->dev, target))
- return false;
-
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
-
- return ca && bch2_dev_is_healthy(ca) && !dev_ptr_stale_rcu(ca, ptr);
-}
-
-void bch2_extent_ptr_set_cached(struct bch_fs *c,
- struct bch_io_opts *opts,
- struct bkey_s k,
- struct bch_extent_ptr *ptr)
-{
- struct bkey_ptrs ptrs;
- union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- bool have_cached_ptr;
- unsigned drop_dev = ptr->dev;
-
- guard(rcu)();
-restart_drop_ptrs:
- ptrs = bch2_bkey_ptrs(k);
- have_cached_ptr = false;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- /*
- * Check if it's erasure coded - stripes can't contain cached
- * data. Possibly something we can fix in the future?
- */
- if (&entry->ptr == ptr && p.has_ec)
- goto drop;
-
- if (p.ptr.cached) {
- if (have_cached_ptr || !want_cached_ptr(c, opts, &p.ptr)) {
- bch2_bkey_drop_ptr_noerror(k, &entry->ptr);
- ptr = NULL;
- goto restart_drop_ptrs;
- }
-
- have_cached_ptr = true;
- }
- }
-
- if (!ptr)
- bkey_for_each_ptr(ptrs, ptr2)
- if (ptr2->dev == drop_dev)
- ptr = ptr2;
-
- if (have_cached_ptr || !want_cached_ptr(c, opts, ptr))
- goto drop;
-
- ptr->cached = true;
- return;
-drop:
- bch2_bkey_drop_ptr_noerror(k, ptr);
-}
-
-/*
- * bch2_extent_normalize - clean up an extent, dropping stale pointers etc.
- *
- * Returns true if @k should be dropped entirely
- *
- * For existing keys, only called when btree nodes are being rewritten, not when
- * they're merely being compacted/resorted in memory.
- */
-bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
-{
- struct bch_dev *ca;
-
- guard(rcu)();
- bch2_bkey_drop_ptrs(k, ptr,
- ptr->cached &&
- (!(ca = bch2_dev_rcu(c, ptr->dev)) ||
- dev_ptr_stale_rcu(ca, ptr) > 0));
-
- return bkey_deleted(k.k);
-}
-
-/*
- * bch2_extent_normalize_by_opts - clean up an extent, dropping stale pointers etc.
- *
- * Like bch2_extent_normalize(), but also only keeps a single cached pointer on
- * the promote target.
- */
-bool bch2_extent_normalize_by_opts(struct bch_fs *c,
- struct bch_io_opts *opts,
- struct bkey_s k)
-{
- struct bkey_ptrs ptrs;
- bool have_cached_ptr;
-
- guard(rcu)();
-restart_drop_ptrs:
- ptrs = bch2_bkey_ptrs(k);
- have_cached_ptr = false;
-
- bkey_for_each_ptr(ptrs, ptr)
- if (ptr->cached) {
- if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) {
- bch2_bkey_drop_ptr(k, ptr);
- goto restart_drop_ptrs;
- }
- have_cached_ptr = true;
- }
-
- return bkey_deleted(k.k);
-}
-
-void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr)
-{
- out->atomic++;
- guard(rcu)();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
- if (!ca) {
- prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
- (u64) ptr->offset, ptr->gen,
- ptr->cached ? " cached" : "");
- } else {
- u32 offset;
- u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
-
- prt_printf(out, "ptr: %u:%llu:%u gen %u",
- ptr->dev, b, offset, ptr->gen);
- if (ca->mi.durability != 1)
- prt_printf(out, " d=%u", ca->mi.durability);
- if (ptr->cached)
- prt_str(out, " cached");
- if (ptr->unwritten)
- prt_str(out, " unwritten");
- int stale = dev_ptr_stale_rcu(ca, ptr);
- if (stale > 0)
- prt_printf(out, " stale");
- else if (stale)
- prt_printf(out, " invalid");
- }
- --out->atomic;
-}
-
-void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_crc_unpacked *crc)
-{
- prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ",
- crc->compressed_size,
- crc->uncompressed_size,
- crc->offset, crc->nonce);
- bch2_prt_csum_type(out, crc->csum_type);
- prt_printf(out, " %0llx:%0llx ", crc->csum.hi, crc->csum.lo);
- prt_str(out, " compress ");
- bch2_prt_compression_type(out, crc->compression_type);
-}
-
-static void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c,
- const struct bch_extent_rebalance *r)
-{
- prt_str(out, "rebalance:");
-
- prt_printf(out, " replicas=%u", r->data_replicas);
- if (r->data_replicas_from_inode)
- prt_str(out, " (inode)");
-
- prt_str(out, " checksum=");
- bch2_prt_csum_opt(out, r->data_checksum);
- if (r->data_checksum_from_inode)
- prt_str(out, " (inode)");
-
- if (r->background_compression || r->background_compression_from_inode) {
- prt_str(out, " background_compression=");
- bch2_compression_opt_to_text(out, r->background_compression);
-
- if (r->background_compression_from_inode)
- prt_str(out, " (inode)");
- }
-
- if (r->background_target || r->background_target_from_inode) {
- prt_str(out, " background_target=");
- if (c)
- bch2_target_to_text(out, c, r->background_target);
- else
- prt_printf(out, "%u", r->background_target);
-
- if (r->background_target_from_inode)
- prt_str(out, " (inode)");
- }
-
- if (r->promote_target || r->promote_target_from_inode) {
- prt_str(out, " promote_target=");
- if (c)
- bch2_target_to_text(out, c, r->promote_target);
- else
- prt_printf(out, "%u", r->promote_target);
-
- if (r->promote_target_from_inode)
- prt_str(out, " (inode)");
- }
-
- if (r->erasure_code || r->erasure_code_from_inode) {
- prt_printf(out, " ec=%u", r->erasure_code);
- if (r->erasure_code_from_inode)
- prt_str(out, " (inode)");
- }
-}
-
-void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- bool first = true;
-
- if (c)
- prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k));
-
- bkey_extent_entry_for_each(ptrs, entry) {
- if (!first)
- prt_printf(out, " ");
-
- switch (__extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_ptr:
- bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry));
- break;
-
- case BCH_EXTENT_ENTRY_crc32:
- case BCH_EXTENT_ENTRY_crc64:
- case BCH_EXTENT_ENTRY_crc128: {
- struct bch_extent_crc_unpacked crc =
- bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
-
- bch2_extent_crc_unpacked_to_text(out, &crc);
- break;
- }
- case BCH_EXTENT_ENTRY_stripe_ptr: {
- const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr;
-
- prt_printf(out, "ec: idx %llu block %u",
- (u64) ec->idx, ec->block);
- break;
- }
- case BCH_EXTENT_ENTRY_rebalance:
- bch2_extent_rebalance_to_text(out, c, &entry->rebalance);
- break;
-
- case BCH_EXTENT_ENTRY_flags:
- prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags);
- break;
-
- default:
- prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
- return;
- }
-
- first = false;
- }
-}
-
-static int extent_ptr_validate(struct bch_fs *c,
- struct bkey_s_c k,
- struct bkey_validate_context from,
- const struct bch_extent_ptr *ptr,
- unsigned size_ondisk,
- bool metadata)
-{
- int ret = 0;
-
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- bkey_for_each_ptr(ptrs, ptr2)
- bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev,
- c, ptr_to_duplicate_device,
- "multiple pointers to same device (%u)", ptr->dev);
-
- /* bad pointers are repaired by check_fix_ptrs(): */
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
- if (!ca) {
- rcu_read_unlock();
- return 0;
- }
- u32 bucket_offset;
- u64 bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
- unsigned first_bucket = ca->mi.first_bucket;
- u64 nbuckets = ca->mi.nbuckets;
- unsigned bucket_size = ca->mi.bucket_size;
- rcu_read_unlock();
-
- bkey_fsck_err_on(bucket >= nbuckets,
- c, ptr_after_last_bucket,
- "pointer past last bucket (%llu > %llu)", bucket, nbuckets);
- bkey_fsck_err_on(bucket < first_bucket,
- c, ptr_before_first_bucket,
- "pointer before first bucket (%llu < %u)", bucket, first_bucket);
- bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size,
- c, ptr_spans_multiple_buckets,
- "pointer spans multiple buckets (%u + %u > %u)",
- bucket_offset, size_ondisk, bucket_size);
-fsck_err:
- return ret;
-}
-
-int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct bch_extent_crc_unpacked crc;
- unsigned size_ondisk = k.k->size;
- unsigned nonce = UINT_MAX;
- unsigned nr_ptrs = 0;
- bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false;
- int ret = 0;
-
- if (bkey_is_btree_ptr(k.k))
- size_ondisk = btree_sectors(c);
-
- bkey_extent_entry_for_each(ptrs, entry) {
- bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX,
- c, extent_ptrs_invalid_entry,
- "invalid extent entry type (got %u, max %u)",
- __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
-
- bkey_fsck_err_on(bkey_is_btree_ptr(k.k) &&
- !extent_entry_is_ptr(entry),
- c, btree_ptr_has_non_ptr,
- "has non ptr field");
-
- switch (extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_ptr:
- ret = extent_ptr_validate(c, k, from, &entry->ptr, size_ondisk, false);
- if (ret)
- return ret;
-
- bkey_fsck_err_on(entry->ptr.cached && have_ec,
- c, ptr_cached_and_erasure_coded,
- "cached, erasure coded ptr");
-
- if (!entry->ptr.unwritten)
- have_written = true;
- else
- have_unwritten = true;
-
- have_ec = false;
- crc_since_last_ptr = false;
- nr_ptrs++;
- break;
- case BCH_EXTENT_ENTRY_crc32:
- case BCH_EXTENT_ENTRY_crc64:
- case BCH_EXTENT_ENTRY_crc128:
- crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
-
- bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type),
- c, ptr_crc_csum_type_unknown,
- "invalid checksum type");
- bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR,
- c, ptr_crc_compression_type_unknown,
- "invalid compression type");
-
- bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size,
- c, ptr_crc_uncompressed_size_too_small,
- "checksum offset + key size > uncompressed size");
- bkey_fsck_err_on(crc_is_encoded(crc) &&
- (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
- (from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)),
- c, ptr_crc_uncompressed_size_too_big,
- "too large encoded extent");
- bkey_fsck_err_on(!crc_is_compressed(crc) &&
- crc.compressed_size != crc.uncompressed_size,
- c, ptr_crc_uncompressed_size_mismatch,
- "not compressed but compressed != uncompressed size");
-
- if (bch2_csum_type_is_encryption(crc.csum_type)) {
- if (nonce == UINT_MAX)
- nonce = crc.offset + crc.nonce;
- else if (nonce != crc.offset + crc.nonce)
- bkey_fsck_err(c, ptr_crc_nonce_mismatch,
- "incorrect nonce");
- }
-
- bkey_fsck_err_on(crc_since_last_ptr,
- c, ptr_crc_redundant,
- "redundant crc entry");
- crc_since_last_ptr = true;
-
- size_ondisk = crc.compressed_size;
- break;
- case BCH_EXTENT_ENTRY_stripe_ptr:
- bkey_fsck_err_on(have_ec,
- c, ptr_stripe_redundant,
- "redundant stripe entry");
- have_ec = true;
- break;
- case BCH_EXTENT_ENTRY_rebalance: {
- /*
- * this shouldn't be a fsck error, for forward
- * compatibility; the rebalance code should just refetch
- * the compression opt if it's unknown
- */
-#if 0
- const struct bch_extent_rebalance *r = &entry->rebalance;
-
- if (!bch2_compression_opt_valid(r->compression)) {
- struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
- prt_printf(err, "invalid compression opt %u:%u",
- opt.type, opt.level);
- return bch_err_throw(c, invalid_bkey);
- }
-#endif
- break;
- }
- case BCH_EXTENT_ENTRY_flags:
- bkey_fsck_err_on(entry != ptrs.start,
- c, extent_flags_not_at_start,
- "extent flags entry not at start");
- break;
- }
- }
-
- bkey_fsck_err_on(!nr_ptrs,
- c, extent_ptrs_no_ptrs,
- "no ptrs");
- bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX,
- c, extent_ptrs_too_many_ptrs,
- "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX);
- bkey_fsck_err_on(have_written && have_unwritten,
- c, extent_ptrs_written_and_unwritten,
- "extent with unwritten and written ptrs");
- bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten,
- c, extent_ptrs_unwritten,
- "has unwritten ptrs");
- bkey_fsck_err_on(crc_since_last_ptr,
- c, extent_ptrs_redundant_crc,
- "redundant crc entry");
- bkey_fsck_err_on(have_ec,
- c, extent_ptrs_redundant_stripe,
- "redundant stripe entry");
-fsck_err:
- return ret;
-}
-
-void bch2_ptr_swab(struct bkey_s k)
-{
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
- union bch_extent_entry *entry;
- u64 *d;
-
- for (d = (u64 *) ptrs.start;
- d != (u64 *) ptrs.end;
- d++)
- *d = swab64(*d);
-
- for (entry = ptrs.start;
- entry < ptrs.end;
- entry = extent_entry_next(entry)) {
- switch (__extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_ptr:
- break;
- case BCH_EXTENT_ENTRY_crc32:
- entry->crc32.csum = swab32(entry->crc32.csum);
- break;
- case BCH_EXTENT_ENTRY_crc64:
- entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
- entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
- break;
- case BCH_EXTENT_ENTRY_crc128:
- entry->crc128.csum.hi = (__force __le64)
- swab64((__force u64) entry->crc128.csum.hi);
- entry->crc128.csum.lo = (__force __le64)
- swab64((__force u64) entry->crc128.csum.lo);
- break;
- case BCH_EXTENT_ENTRY_stripe_ptr:
- break;
- case BCH_EXTENT_ENTRY_rebalance:
- break;
- default:
- /* Bad entry type: will be caught by validate() */
- return;
- }
- }
-}
-
-int bch2_bkey_extent_flags_set(struct bch_fs *c, struct bkey_i *k, u64 flags)
-{
- int ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_flags);
- if (ret)
- return ret;
-
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
-
- if (ptrs.start != ptrs.end &&
- extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) {
- ptrs.start->flags.flags = flags;
- } else {
- struct bch_extent_flags f = {
- .type = BIT(BCH_EXTENT_ENTRY_flags),
- .flags = flags,
- };
- __extent_entry_insert(k, ptrs.start, (union bch_extent_entry *) &f);
- }
-
- return 0;
-}
-
-/* Generic extent code: */
-
-int bch2_cut_front_s(struct bpos where, struct bkey_s k)
-{
- unsigned new_val_u64s = bkey_val_u64s(k.k);
- int val_u64s_delta;
- u64 sub;
-
- if (bkey_le(where, bkey_start_pos(k.k)))
- return 0;
-
- EBUG_ON(bkey_gt(where, k.k->p));
-
- sub = where.offset - bkey_start_offset(k.k);
-
- k.k->size -= sub;
-
- if (!k.k->size) {
- k.k->type = KEY_TYPE_deleted;
- new_val_u64s = 0;
- }
-
- switch (k.k->type) {
- case KEY_TYPE_extent:
- case KEY_TYPE_reflink_v: {
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
- union bch_extent_entry *entry;
- bool seen_crc = false;
-
- bkey_extent_entry_for_each(ptrs, entry) {
- switch (extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_ptr:
- if (!seen_crc)
- entry->ptr.offset += sub;
- break;
- case BCH_EXTENT_ENTRY_crc32:
- entry->crc32.offset += sub;
- break;
- case BCH_EXTENT_ENTRY_crc64:
- entry->crc64.offset += sub;
- break;
- case BCH_EXTENT_ENTRY_crc128:
- entry->crc128.offset += sub;
- break;
- case BCH_EXTENT_ENTRY_stripe_ptr:
- case BCH_EXTENT_ENTRY_rebalance:
- case BCH_EXTENT_ENTRY_flags:
- break;
- }
-
- if (extent_entry_is_crc(entry))
- seen_crc = true;
- }
-
- break;
- }
- case KEY_TYPE_reflink_p: {
- struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k);
-
- SET_REFLINK_P_IDX(p.v, REFLINK_P_IDX(p.v) + sub);
- break;
- }
- case KEY_TYPE_inline_data:
- case KEY_TYPE_indirect_inline_data: {
- void *p = bkey_inline_data_p(k);
- unsigned bytes = bkey_inline_data_bytes(k.k);
-
- sub = min_t(u64, sub << 9, bytes);
-
- memmove(p, p + sub, bytes - sub);
-
- new_val_u64s -= sub >> 3;
- break;
- }
- }
-
- val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
- BUG_ON(val_u64s_delta < 0);
-
- set_bkey_val_u64s(k.k, new_val_u64s);
- memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
- return -val_u64s_delta;
-}
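A hedged usage sketch of the front-trim path above (the variables inum and k are illustrative, not from the source); for a plain, uncompressed extent the pointer offset is simply advanced, as the switch in bch2_cut_front_s() shows:

	/*
	 * Hedged example: extent covering sectors [0, 128) with one pointer at
	 * device offset 1000 and no crc entry; k is an assumed struct bkey_i *.
	 */
	bch2_cut_front(POS(inum, 64), k);

	/*
	 * Afterwards: bkey_start_offset(&k->k) == 64, k->k.size == 64, and the
	 * pointer's ->offset was bumped from 1000 to 1064, since without
	 * checksum or compression we can point directly at the live data.
	 */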
-
-int bch2_cut_back_s(struct bpos where, struct bkey_s k)
-{
- unsigned new_val_u64s = bkey_val_u64s(k.k);
- int val_u64s_delta;
- u64 len = 0;
-
- if (bkey_ge(where, k.k->p))
- return 0;
-
- EBUG_ON(bkey_lt(where, bkey_start_pos(k.k)));
-
- len = where.offset - bkey_start_offset(k.k);
-
- k.k->p.offset = where.offset;
- k.k->size = len;
-
- if (!len) {
- k.k->type = KEY_TYPE_deleted;
- new_val_u64s = 0;
- }
-
- switch (k.k->type) {
- case KEY_TYPE_inline_data:
- case KEY_TYPE_indirect_inline_data:
- new_val_u64s = (bkey_inline_data_offset(k.k) +
- min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3;
- break;
- }
-
- val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
- BUG_ON(val_u64s_delta < 0);
-
- set_bkey_val_u64s(k.k, new_val_u64s);
- memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
- return -val_u64s_delta;
-}
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
deleted file mode 100644
index b8590e51b76e..000000000000
--- a/fs/bcachefs/extents.h
+++ /dev/null
@@ -1,768 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EXTENTS_H
-#define _BCACHEFS_EXTENTS_H
-
-#include "bcachefs.h"
-#include "bkey.h"
-#include "extents_types.h"
-
-struct bch_fs;
-struct btree_trans;
-
-/* extent entries: */
-
-#define extent_entry_last(_e) \
- ((typeof(&(_e).v->start[0])) bkey_val_end(_e))
-
-#define entry_to_ptr(_entry) \
-({ \
- EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \
- \
- __builtin_choose_expr( \
- type_is_exact(_entry, const union bch_extent_entry *), \
- (const struct bch_extent_ptr *) (_entry), \
- (struct bch_extent_ptr *) (_entry)); \
-})
-
-/* downcast, preserves const */
-#define to_entry(_entry) \
-({ \
- BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \
- !type_is(_entry, struct bch_extent_ptr *) && \
- !type_is(_entry, struct bch_extent_stripe_ptr *)); \
- \
- __builtin_choose_expr( \
- (type_is_exact(_entry, const union bch_extent_crc *) || \
- type_is_exact(_entry, const struct bch_extent_ptr *) ||\
- type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\
- (const union bch_extent_entry *) (_entry), \
- (union bch_extent_entry *) (_entry)); \
-})
-
-#define extent_entry_next(_entry) \
- ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
-
-#define extent_entry_next_safe(_entry, _end) \
- (likely(__extent_entry_type(_entry) < BCH_EXTENT_ENTRY_MAX) \
- ? extent_entry_next(_entry) \
- : _end)
-
-static inline unsigned
-__extent_entry_type(const union bch_extent_entry *e)
-{
- return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX;
-}
-
-static inline enum bch_extent_entry_type
-extent_entry_type(const union bch_extent_entry *e)
-{
- int ret = __ffs(e->type);
-
- EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX);
-
- return ret;
-}
-
-static inline size_t extent_entry_bytes(const union bch_extent_entry *entry)
-{
- switch (extent_entry_type(entry)) {
-#define x(f, n) \
- case BCH_EXTENT_ENTRY_##f: \
- return sizeof(struct bch_extent_##f);
- BCH_EXTENT_ENTRY_TYPES()
-#undef x
- default:
- BUG();
- }
-}
-
-static inline size_t extent_entry_u64s(const union bch_extent_entry *entry)
-{
- return extent_entry_bytes(entry) / sizeof(u64);
-}
-
-static inline void __extent_entry_insert(struct bkey_i *k,
- union bch_extent_entry *dst,
- union bch_extent_entry *new)
-{
- union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
-
- memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new),
- dst, (u64 *) end - (u64 *) dst);
- k->k.u64s += extent_entry_u64s(new);
- memcpy_u64s_small(dst, new, extent_entry_u64s(new));
-}
-
-static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
-{
- union bch_extent_entry *next = extent_entry_next(entry);
-
- /* stripes have ptrs, but their layout doesn't work with this code */
- BUG_ON(k.k->type == KEY_TYPE_stripe);
-
- memmove_u64s_down(entry, next,
- (u64 *) bkey_val_end(k) - (u64 *) next);
- k.k->u64s -= (u64 *) next - (u64 *) entry;
-}
-
-static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
-{
- return __extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
-}
-
-static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e)
-{
- return __extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr;
-}
-
-static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
-{
- switch (__extent_entry_type(e)) {
- case BCH_EXTENT_ENTRY_crc32:
- case BCH_EXTENT_ENTRY_crc64:
- case BCH_EXTENT_ENTRY_crc128:
- return true;
- default:
- return false;
- }
-}
-
-union bch_extent_crc {
- u8 type;
- struct bch_extent_crc32 crc32;
- struct bch_extent_crc64 crc64;
- struct bch_extent_crc128 crc128;
-};
-
-#define __entry_to_crc(_entry) \
- __builtin_choose_expr( \
- type_is_exact(_entry, const union bch_extent_entry *), \
- (const union bch_extent_crc *) (_entry), \
- (union bch_extent_crc *) (_entry))
-
-#define entry_to_crc(_entry) \
-({ \
- EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \
- \
- __entry_to_crc(_entry); \
-})
-
-static inline struct bch_extent_crc_unpacked
-bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
-{
-#define common_fields(_crc) \
- .csum_type = _crc.csum_type, \
- .compression_type = _crc.compression_type, \
- .compressed_size = _crc._compressed_size + 1, \
- .uncompressed_size = _crc._uncompressed_size + 1, \
- .offset = _crc.offset, \
- .live_size = k->size
-
- if (!crc)
- return (struct bch_extent_crc_unpacked) {
- .compressed_size = k->size,
- .uncompressed_size = k->size,
- .live_size = k->size,
- };
-
- switch (extent_entry_type(to_entry(crc))) {
- case BCH_EXTENT_ENTRY_crc32: {
- struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
- common_fields(crc->crc32),
- };
-
- *((__le32 *) &ret.csum.lo) = (__le32 __force) crc->crc32.csum;
- return ret;
- }
- case BCH_EXTENT_ENTRY_crc64: {
- struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
- common_fields(crc->crc64),
- .nonce = crc->crc64.nonce,
- .csum.lo = (__force __le64) crc->crc64.csum_lo,
- };
-
- *((__le16 *) &ret.csum.hi) = (__le16 __force) crc->crc64.csum_hi;
-
- return ret;
- }
- case BCH_EXTENT_ENTRY_crc128: {
- struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
- common_fields(crc->crc128),
- .nonce = crc->crc128.nonce,
- .csum = crc->crc128.csum,
- };
-
- return ret;
- }
- default:
- BUG();
- }
-#undef common_fields
-}
-
-static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc)
-{
- return (crc.compression_type != BCH_COMPRESSION_TYPE_none &&
- crc.compression_type != BCH_COMPRESSION_TYPE_incompressible);
-}
-
-static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc)
-{
- return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc);
-}
-
-void bch2_extent_crc_unpacked_to_text(struct printbuf *, struct bch_extent_crc_unpacked *);
-
-/* bkey_ptrs: generically over any key type that has ptrs */
-
-struct bkey_ptrs_c {
- const union bch_extent_entry *start;
- const union bch_extent_entry *end;
-};
-
-struct bkey_ptrs {
- union bch_extent_entry *start;
- union bch_extent_entry *end;
-};
-
-static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
-{
- switch (k.k->type) {
- case KEY_TYPE_btree_ptr: {
- struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k);
-
- return (struct bkey_ptrs_c) {
- to_entry(&e.v->start[0]),
- to_entry(extent_entry_last(e))
- };
- }
- case KEY_TYPE_extent: {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
-
- return (struct bkey_ptrs_c) {
- e.v->start,
- extent_entry_last(e)
- };
- }
- case KEY_TYPE_stripe: {
- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-
- return (struct bkey_ptrs_c) {
- to_entry(&s.v->ptrs[0]),
- to_entry(&s.v->ptrs[s.v->nr_blocks]),
- };
- }
- case KEY_TYPE_reflink_v: {
- struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
-
- return (struct bkey_ptrs_c) {
- r.v->start,
- bkey_val_end(r),
- };
- }
- case KEY_TYPE_btree_ptr_v2: {
- struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k);
-
- return (struct bkey_ptrs_c) {
- to_entry(&e.v->start[0]),
- to_entry(extent_entry_last(e))
- };
- }
- default:
- return (struct bkey_ptrs_c) { NULL, NULL };
- }
-}
-
-static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
-{
- struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c);
-
- return (struct bkey_ptrs) {
- (void *) p.start,
- (void *) p.end
- };
-}
-
-#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \
- for ((_entry) = (_start); \
- (_entry) < (_end); \
- (_entry) = extent_entry_next_safe(_entry, _end))
-
-#define __bkey_ptr_next(_ptr, _end) \
-({ \
- typeof(_end) _entry; \
- \
- __bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry) \
- if (extent_entry_is_ptr(_entry)) \
- break; \
- \
- _entry < (_end) ? entry_to_ptr(_entry) : NULL; \
-})
-
-#define bkey_extent_entry_for_each_from(_p, _entry, _start) \
- __bkey_extent_entry_for_each_from(_start, (_p).end, _entry)
-
-#define bkey_extent_entry_for_each(_p, _entry) \
- bkey_extent_entry_for_each_from(_p, _entry, _p.start)
-
-#define __bkey_for_each_ptr(_start, _end, _ptr) \
- for (typeof(_start) (_ptr) = (_start); \
- ((_ptr) = __bkey_ptr_next(_ptr, _end)); \
- (_ptr)++)
-
-#define bkey_ptr_next(_p, _ptr) \
- __bkey_ptr_next(_ptr, (_p).end)
-
-#define bkey_for_each_ptr(_p, _ptr) \
- __bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr)
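A hedged usage sketch of the pointer-iteration macro above; count_dirty_ptrs() is a hypothetical helper, but the loop body mirrors what bch2_bkey_dirty_devs() below does:

static unsigned count_dirty_ptrs(struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	unsigned nr = 0;

	bkey_for_each_ptr(ptrs, ptr)
		if (!ptr->cached)
			nr++;

	return nr;
}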
-
-#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry) \
-({ \
- __label__ out; \
- \
- (_ptr).has_ec = false; \
- (_ptr).do_ec_reconstruct = false; \
- (_ptr).crc_retry_nr = 0; \
- \
- __bkey_extent_entry_for_each_from(_entry, _end, _entry) \
- switch (__extent_entry_type(_entry)) { \
- case BCH_EXTENT_ENTRY_ptr: \
- (_ptr).ptr = _entry->ptr; \
- goto out; \
- case BCH_EXTENT_ENTRY_crc32: \
- case BCH_EXTENT_ENTRY_crc64: \
- case BCH_EXTENT_ENTRY_crc128: \
- (_ptr).crc = bch2_extent_crc_unpack(_k, \
- entry_to_crc(_entry)); \
- break; \
- case BCH_EXTENT_ENTRY_stripe_ptr: \
- (_ptr).ec = _entry->stripe_ptr; \
- (_ptr).has_ec = true; \
- break; \
- default: \
- /* nothing */ \
- break; \
- } \
-out: \
- _entry < (_end); \
-})
-
-#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry) \
- for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \
- (_entry) = _start; \
- __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \
- (_entry) = extent_entry_next_safe(_entry, _end))
-
-#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \
- __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \
- _ptr, _entry)
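A hedged sketch of the decoded iteration: each pointer is paired with the crc entry that governs it, roughly the shape of bch2_bkey_sectors_compressed() declared further down (the helper name here is illustrative):

static unsigned count_compressed_sectors(struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned ret = 0;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
		if (!p.ptr.cached && crc_is_compressed(p.crc))
			ret += p.crc.compressed_size;

	return ret;
}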
-
-#define bkey_crc_next(_k, _end, _crc, _iter) \
-({ \
- __bkey_extent_entry_for_each_from(_iter, _end, _iter) \
- if (extent_entry_is_crc(_iter)) { \
- (_crc) = bch2_extent_crc_unpack(_k, \
- entry_to_crc(_iter)); \
- break; \
- } \
- \
- (_iter) < (_end); \
-})
-
-#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \
- for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \
- (_iter) = (_start); \
- bkey_crc_next(_k, _end, _crc, _iter); \
- (_iter) = extent_entry_next(_iter))
-
-#define bkey_for_each_crc(_k, _p, _crc, _iter) \
- __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter)
-
-/* Iterate over pointers in KEY_TYPE_extent: */
-
-#define extent_ptr_next(_e, _ptr) \
- __bkey_ptr_next(_ptr, extent_entry_last(_e))
-
-#define extent_for_each_ptr(_e, _ptr) \
- __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr)
-
-#define extent_for_each_ptr_decode(_e, _ptr, _entry) \
- __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \
- extent_entry_last(_e), _ptr, _entry)
-
-/* utility code common to all keys with pointers: */
-
-void bch2_io_failures_to_text(struct printbuf *, struct bch_fs *,
- struct bch_io_failures *);
-struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *,
- unsigned);
-void bch2_mark_io_failure(struct bch_io_failures *,
- struct extent_ptr_decoded *, bool);
-void bch2_mark_btree_validate_failure(struct bch_io_failures *, unsigned);
-int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
- struct bch_io_failures *,
- struct extent_ptr_decoded *, int);
-
-/* KEY_TYPE_btree_ptr: */
-
-int bch2_btree_ptr_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
- struct bkey_s_c);
-
-int bch2_btree_ptr_v2_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
- int, struct bkey_s);
-
-#define bch2_bkey_ops_btree_ptr ((struct bkey_ops) { \
- .key_validate = bch2_btree_ptr_validate, \
- .val_to_text = bch2_btree_ptr_to_text, \
- .swab = bch2_ptr_swab, \
- .trigger = bch2_trigger_extent, \
-})
-
-#define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \
- .key_validate = bch2_btree_ptr_v2_validate, \
- .val_to_text = bch2_btree_ptr_v2_to_text, \
- .swab = bch2_ptr_swab, \
- .compat = bch2_btree_ptr_v2_compat, \
- .trigger = bch2_trigger_extent, \
- .min_val_size = 40, \
-})
-
-/* KEY_TYPE_extent: */
-
-bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-
-#define bch2_bkey_ops_extent ((struct bkey_ops) { \
- .key_validate = bch2_bkey_ptrs_validate, \
- .val_to_text = bch2_bkey_ptrs_to_text, \
- .swab = bch2_ptr_swab, \
- .key_normalize = bch2_extent_normalize, \
- .key_merge = bch2_extent_merge, \
- .trigger = bch2_trigger_extent, \
-})
-
-/* KEY_TYPE_reservation: */
-
-int bch2_reservation_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-
-#define bch2_bkey_ops_reservation ((struct bkey_ops) { \
- .key_validate = bch2_reservation_validate, \
- .val_to_text = bch2_reservation_to_text, \
- .key_merge = bch2_reservation_merge, \
- .trigger = bch2_trigger_reservation, \
- .min_val_size = 8, \
-})
-
-/* Extent checksum entries: */
-
-bool bch2_can_narrow_extent_crcs(struct bkey_s_c,
- struct bch_extent_crc_unpacked);
-bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked);
-void bch2_extent_crc_append(struct bkey_i *,
- struct bch_extent_crc_unpacked);
-
-/* Generic code for keys with pointers: */
-
-static inline bool bkey_is_btree_ptr(const struct bkey *k)
-{
- switch (k->type) {
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_btree_ptr_v2:
- return true;
- default:
- return false;
- }
-}
-
-static inline bool bkey_extent_is_direct_data(const struct bkey *k)
-{
- switch (k->type) {
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_btree_ptr_v2:
- case KEY_TYPE_extent:
- case KEY_TYPE_reflink_v:
- return true;
- default:
- return false;
- }
-}
-
-static inline bool bkey_extent_is_inline_data(const struct bkey *k)
-{
- return k->type == KEY_TYPE_inline_data ||
- k->type == KEY_TYPE_indirect_inline_data;
-}
-
-static inline unsigned bkey_inline_data_offset(const struct bkey *k)
-{
- switch (k->type) {
- case KEY_TYPE_inline_data:
- return sizeof(struct bch_inline_data);
- case KEY_TYPE_indirect_inline_data:
- return sizeof(struct bch_indirect_inline_data);
- default:
- BUG();
- }
-}
-
-static inline unsigned bkey_inline_data_bytes(const struct bkey *k)
-{
- return bkey_val_bytes(k) - bkey_inline_data_offset(k);
-}
-
-#define bkey_inline_data_p(_k) (((void *) (_k).v) + bkey_inline_data_offset((_k).k))
-
-static inline bool bkey_extent_is_data(const struct bkey *k)
-{
- return bkey_extent_is_direct_data(k) ||
- bkey_extent_is_inline_data(k) ||
- k->type == KEY_TYPE_reflink_p;
-}
-
-/*
- * Should extent be counted under inode->i_sectors?
- */
-static inline bool bkey_extent_is_allocation(const struct bkey *k)
-{
- switch (k->type) {
- case KEY_TYPE_extent:
- case KEY_TYPE_reservation:
- case KEY_TYPE_reflink_p:
- case KEY_TYPE_reflink_v:
- case KEY_TYPE_inline_data:
- case KEY_TYPE_indirect_inline_data:
- case KEY_TYPE_error:
- return true;
- default:
- return false;
- }
-}
-
-static inline bool bkey_extent_is_unwritten(struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(ptrs, ptr)
- if (ptr->unwritten)
- return true;
- return false;
-}
-
-static inline bool bkey_extent_is_reservation(struct bkey_s_c k)
-{
- return k.k->type == KEY_TYPE_reservation ||
- bkey_extent_is_unwritten(k);
-}
-
-static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
-{
- struct bch_devs_list ret = (struct bch_devs_list) { 0 };
- struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(p, ptr)
- ret.data[ret.nr++] = ptr->dev;
-
- return ret;
-}
-
-static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
-{
- struct bch_devs_list ret = (struct bch_devs_list) { 0 };
- struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(p, ptr)
- if (!ptr->cached)
- ret.data[ret.nr++] = ptr->dev;
-
- return ret;
-}
-
-static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
-{
- struct bch_devs_list ret = (struct bch_devs_list) { 0 };
- struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(p, ptr)
- if (ptr->cached)
- ret.data[ret.nr++] = ptr->dev;
-
- return ret;
-}
-
-unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
-unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
-unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
-bool bch2_bkey_is_incompressible(struct bkey_s_c);
-unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
-
-unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c);
-unsigned bch2_extent_ptr_desired_durability(struct bch_fs *, struct extent_ptr_decoded *);
-unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *);
-unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
-
-const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned);
-
-static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev)
-{
- return (void *) bch2_bkey_has_device_c(k.s_c, dev);
-}
-
-bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
-
-void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *);
-
-static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr)
-{
- struct bch_extent_ptr *dest;
-
- EBUG_ON(bch2_bkey_has_device(bkey_i_to_s(k), ptr.dev));
-
- switch (k->k.type) {
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_btree_ptr_v2:
- case KEY_TYPE_extent:
- EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX);
-
- ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
- dest = (struct bch_extent_ptr *)((void *) &k->v + bkey_val_bytes(&k->k));
- *dest = ptr;
- k->k.u64s++;
- break;
- default:
- BUG();
- }
-}
-
-void bch2_extent_ptr_decoded_append(struct bkey_i *,
- struct extent_ptr_decoded *);
-void bch2_bkey_drop_ptr_noerror(struct bkey_s, struct bch_extent_ptr *);
-void bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *);
-
-void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
-void bch2_bkey_drop_device(struct bkey_s, unsigned);
-
-#define bch2_bkey_drop_ptrs_noerror(_k, _ptr, _cond) \
-do { \
- __label__ _again; \
- struct bkey_ptrs _ptrs; \
-_again: \
- _ptrs = bch2_bkey_ptrs(_k); \
- \
- bkey_for_each_ptr(_ptrs, _ptr) \
- if (_cond) { \
- bch2_bkey_drop_ptr_noerror(_k, _ptr); \
- goto _again; \
- } \
-} while (0)
-
-#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \
-do { \
- __label__ _again; \
- struct bkey_ptrs _ptrs; \
-_again: \
- _ptrs = bch2_bkey_ptrs(_k); \
- \
- bkey_for_each_ptr(_ptrs, _ptr) \
- if (_cond) { \
- bch2_bkey_drop_ptr(_k, _ptr); \
- goto _again; \
- } \
-} while (0)
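A hedged usage sketch of bch2_bkey_drop_ptrs(): drop every cached pointer on one device; insert and dev_idx are assumed caller-side variables. The goto-restart inside the macro is what makes dropping while iterating safe, since a drop shifts the remaining entries:

	struct bkey_s k = bkey_i_to_s(insert);

	bch2_bkey_drop_ptrs(k, ptr, ptr->cached && ptr->dev == dev_idx);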
-
-bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
- struct bch_extent_ptr, u64);
-bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c);
-struct bch_extent_ptr *
-bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s);
-
-void bch2_extent_ptr_set_cached(struct bch_fs *, struct bch_io_opts *,
- struct bkey_s, struct bch_extent_ptr *);
-
-bool bch2_extent_normalize_by_opts(struct bch_fs *, struct bch_io_opts *, struct bkey_s);
-bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
-
-void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *);
-void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
- struct bkey_s_c);
-int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-
-static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1,
- struct bch_extent_ptr ptr2)
-{
- return (ptr1.cached == ptr2.cached &&
- ptr1.unwritten == ptr2.unwritten &&
- ptr1.offset == ptr2.offset &&
- ptr1.dev == ptr2.dev &&
- ptr1.gen == ptr2.gen);
-}
-
-void bch2_ptr_swab(struct bkey_s);
-
-/* Generic extent code: */
-
-enum bch_extent_overlap {
- BCH_EXTENT_OVERLAP_ALL = 0,
- BCH_EXTENT_OVERLAP_BACK = 1,
- BCH_EXTENT_OVERLAP_FRONT = 2,
- BCH_EXTENT_OVERLAP_MIDDLE = 3,
-};
-
-/* Returns how k overlaps with m */
-static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
- const struct bkey *m)
-{
- int cmp1 = bkey_lt(k->p, m->p);
- int cmp2 = bkey_gt(bkey_start_pos(k), bkey_start_pos(m));
-
- return (cmp1 << 1) + cmp2;
-}
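A worked example of the two-bit encoding above (positions are illustrative):

	/*
	 * k = [0, 8), m = [4, 12):
	 *   cmp1 = bkey_lt(k->p, m->p)                          = 1  (8 < 12)
	 *   cmp2 = bkey_gt(bkey_start_pos(k), bkey_start_pos(m)) = 0  (0 > 4 is false)
	 *   (cmp1 << 1) + cmp2 = 2 = BCH_EXTENT_OVERLAP_FRONT,
	 * i.e. k overlaps the front of m.
	 */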
-
-int bch2_cut_front_s(struct bpos, struct bkey_s);
-int bch2_cut_back_s(struct bpos, struct bkey_s);
-
-static inline void bch2_cut_front(struct bpos where, struct bkey_i *k)
-{
- bch2_cut_front_s(where, bkey_i_to_s(k));
-}
-
-static inline void bch2_cut_back(struct bpos where, struct bkey_i *k)
-{
- bch2_cut_back_s(where, bkey_i_to_s(k));
-}
-
-/**
- * bch2_key_resize - adjust size of @k
- *
- * bkey_start_offset(k) will be preserved; only where the extent ends is modified
- */
-static inline void bch2_key_resize(struct bkey *k, unsigned new_size)
-{
- k->p.offset -= k->size;
- k->p.offset += new_size;
- k->size = new_size;
-}
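For example (hedged, illustrative numbers):

	/*
	 * k->p.offset == 32, k->size == 16   ->  covers [16, 32)
	 * bch2_key_resize(k, 8);
	 * k->p.offset == 24, k->size == 8    ->  covers [16, 24)
	 *
	 * The start stays put; only the end moves.
	 */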
-
-static inline u64 bch2_bkey_extent_ptrs_flags(struct bkey_ptrs_c ptrs)
-{
- if (ptrs.start != ptrs.end &&
- extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags)
- return ptrs.start->flags.flags;
- return 0;
-}
-
-static inline u64 bch2_bkey_extent_flags(struct bkey_s_c k)
-{
- return bch2_bkey_extent_ptrs_flags(bch2_bkey_ptrs_c(k));
-}
-
-int bch2_bkey_extent_flags_set(struct bch_fs *, struct bkey_i *, u64);
-
-#endif /* _BCACHEFS_EXTENTS_H */
diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h
deleted file mode 100644
index 74c0252cbd98..000000000000
--- a/fs/bcachefs/extents_format.h
+++ /dev/null
@@ -1,304 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EXTENTS_FORMAT_H
-#define _BCACHEFS_EXTENTS_FORMAT_H
-
-/*
- * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
- * preceded by checksum/compression information (bch_extent_crc32 or
- * bch_extent_crc64).
- *
- * One major determining factor in the format of extents is how we handle and
- * represent extents that have been partially overwritten and thus trimmed:
- *
- * If an extent is not checksummed or compressed, when the extent is trimmed we
- * don't have to remember the extent we originally allocated and wrote: we can
- * merely adjust ptr->offset to point to the start of the data that is currently
- * live. The size field in struct bkey records the current (live) size of the
- * extent, and is also used to mean "size of region on disk that we point to" in
- * this case.
- *
- * Thus an extent that is not checksummed or compressed will consist only of a
- * list of bch_extent_ptrs, with none of the fields in
- * bch_extent_crc32/bch_extent_crc64.
- *
- * When an extent is checksummed or compressed, it's not possible to read only
- * the data that is currently live: we have to read the entire extent that was
- * originally written, and then return only the part of the extent that is
- * currently live.
- *
- * Thus, in addition to the current size of the extent in struct bkey, we need
- * to store the size of the originally allocated space - this is the
- * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
- * when the extent is trimmed, instead of modifying the offset field of the
- * pointer, we keep a second smaller offset field - "offset into the original
- * extent of the currently live region".
- *
- * The other major determining factor is replication and data migration:
- *
- * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
- * write, we will initially write all the replicas in the same format, with the
- * same checksum type and compression format - however, when copygc runs later (or
- * tiering/cache promotion, anything that moves data), it is not in general
- * going to rewrite all the pointers at once - one of the replicas may be in a
- * bucket on one device that has very little fragmentation while another lives
- * in a bucket that has become heavily fragmented, and thus is being rewritten
- * sooner than the rest.
- *
- * Thus it will only move a subset of the pointers (or in the case of
- * tiering/cache promotion perhaps add a single pointer without dropping any
- * current pointers), and if the extent has been partially overwritten it must
- * write only the currently live portion (or copygc would not be able to reduce
- * fragmentation!) - which necessitates a different bch_extent_crc format for
- * the new pointer.
- *
- * But in the interests of space efficiency, we don't want to store one
- * bch_extent_crc for each pointer if we don't have to.
- *
- * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
- * bch_extent_ptrs appended arbitrarily one after the other. We determine the
- * type of a given entry with a scheme similar to utf8 (except we're encoding a
- * type, not a size), encoding the type in the position of the first set bit:
- *
- * bch_extent_ptr - 0b1
- * bch_extent_crc32 - 0b10
- * bch_extent_crc64 - 0b100
- *
- * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
- * bch_extent_crc64 is the least constrained).
- *
- * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
- * until the next bch_extent_crc32/64.
- *
- * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
- * is neither checksummed nor compressed.
- */
-
-#define BCH_EXTENT_ENTRY_TYPES() \
- x(ptr, 0) \
- x(crc32, 1) \
- x(crc64, 2) \
- x(crc128, 3) \
- x(stripe_ptr, 4) \
- x(rebalance, 5) \
- x(flags, 6)
-#define BCH_EXTENT_ENTRY_MAX 7
-
-enum bch_extent_entry_type {
-#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
- BCH_EXTENT_ENTRY_TYPES()
-#undef x
-};
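A hedged illustration of the first-set-bit encoding: the low byte of an entry doubles as its type tag, and __ffs() of it yields the enum value above. decode_entry_type() is a hypothetical helper; the real decoder is __extent_entry_type() in extents.h:

static inline enum bch_extent_entry_type decode_entry_type(__u8 tag)
{
	/* 0b1 -> ptr (0), 0b10 -> crc32 (1), 0b100 -> crc64 (2), ... */
	return __ffs(tag);
}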
-
-/* Compressed/uncompressed size are stored biased by 1: */
-struct bch_extent_crc32 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u32 type:2,
- _compressed_size:7,
- _uncompressed_size:7,
- offset:7,
- _unused:1,
- csum_type:4,
- compression_type:4;
- __u32 csum;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u32 csum;
- __u32 compression_type:4,
- csum_type:4,
- _unused:1,
- offset:7,
- _uncompressed_size:7,
- _compressed_size:7,
- type:2;
-#endif
-} __packed __aligned(8);
-
-#define CRC32_SIZE_MAX (1U << 7)
-#define CRC32_NONCE_MAX 0
-
-struct bch_extent_crc64 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:3,
- _compressed_size:9,
- _uncompressed_size:9,
- offset:9,
- nonce:10,
- csum_type:4,
- compression_type:4,
- csum_hi:16;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 csum_hi:16,
- compression_type:4,
- csum_type:4,
- nonce:10,
- offset:9,
- _uncompressed_size:9,
- _compressed_size:9,
- type:3;
-#endif
- __u64 csum_lo;
-} __packed __aligned(8);
-
-#define CRC64_SIZE_MAX (1U << 9)
-#define CRC64_NONCE_MAX ((1U << 10) - 1)
-
-struct bch_extent_crc128 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:4,
- _compressed_size:13,
- _uncompressed_size:13,
- offset:13,
- nonce:13,
- csum_type:4,
- compression_type:4;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 compression_type:4,
- csum_type:4,
- nonce:13,
- offset:13,
- _uncompressed_size:13,
- _compressed_size:13,
- type:4;
-#endif
- struct bch_csum csum;
-} __packed __aligned(8);
-
-#define CRC128_SIZE_MAX (1U << 13)
-#define CRC128_NONCE_MAX ((1U << 13) - 1)
-
-/*
- * @unwritten - pointer hasn't been written to, just reserved
- */
-struct bch_extent_ptr {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:1,
- cached:1,
- unused:1,
- unwritten:1,
- offset:44, /* 8 petabytes */
- dev:8,
- gen:8;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 gen:8,
- dev:8,
- offset:44,
- unwritten:1,
- unused:1,
- cached:1,
- type:1;
-#endif
-} __packed __aligned(8);
-
-struct bch_extent_stripe_ptr {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:5,
- block:8,
- redundancy:4,
- idx:47;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 idx:47,
- redundancy:4,
- block:8,
- type:5;
-#endif
-};
-
-#define BCH_EXTENT_FLAGS() \
- x(poisoned, 0)
-
-enum bch_extent_flags_e {
-#define x(n, v) BCH_EXTENT_FLAG_##n = v,
- BCH_EXTENT_FLAGS()
-#undef x
-};
-
-struct bch_extent_flags {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:7,
- flags:57;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 flags:57,
- type:7;
-#endif
-};
-
-/* bch_extent_rebalance: */
-#include "rebalance_format.h"
-
-union bch_extent_entry {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
- unsigned long type;
-#elif __BITS_PER_LONG == 32
- struct {
- unsigned long pad;
- unsigned long type;
- };
-#else
-#error edit for your odd byteorder.
-#endif
-
-#define x(f, n) struct bch_extent_##f f;
- BCH_EXTENT_ENTRY_TYPES()
-#undef x
-};
-
-struct bch_btree_ptr {
- struct bch_val v;
-
- __u64 _data[0];
- struct bch_extent_ptr start[];
-} __packed __aligned(8);
-
-struct bch_btree_ptr_v2 {
- struct bch_val v;
-
- __u64 mem_ptr;
- __le64 seq;
- __le16 sectors_written;
- __le16 flags;
- struct bpos min_key;
- __u64 _data[0];
- struct bch_extent_ptr start[];
-} __packed __aligned(8);
-
-LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1);
-
-struct bch_extent {
- struct bch_val v;
-
- __u64 _data[0];
- union bch_extent_entry start[];
-} __packed __aligned(8);
-
-/* Maximum size (in u64s) a single pointer could be: */
-#define BKEY_EXTENT_PTR_U64s_MAX\
- ((sizeof(struct bch_extent_crc128) + \
- sizeof(struct bch_extent_ptr)) / sizeof(__u64))
-
-/* Maximum possible size of an entire extent value: */
-#define BKEY_EXTENT_VAL_U64s_MAX \
- (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
-
-/* Maximum possible size of an entire extent, key + value: */
-#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
-
-/* Btree pointers don't carry around checksums: */
-#define BKEY_BTREE_PTR_VAL_U64s_MAX \
- ((sizeof(struct bch_btree_ptr_v2) + \
- sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
-#define BKEY_BTREE_PTR_U64s_MAX \
- (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
-
-struct bch_reservation {
- struct bch_val v;
-
- __le32 generation;
- __u8 nr_replicas;
- __u8 pad[3];
-} __packed __aligned(8);
-
-struct bch_inline_data {
- struct bch_val v;
- u8 data[];
-};
-
-#endif /* _BCACHEFS_EXTENTS_FORMAT_H */
diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h
deleted file mode 100644
index b23ce4a373c0..000000000000
--- a/fs/bcachefs/extents_types.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EXTENTS_TYPES_H
-#define _BCACHEFS_EXTENTS_TYPES_H
-
-#include "bcachefs_format.h"
-
-struct bch_extent_crc_unpacked {
- u32 compressed_size;
- u32 uncompressed_size;
- u32 live_size;
-
- u8 csum_type;
- u8 compression_type;
-
- u16 offset;
-
- u16 nonce;
-
- struct bch_csum csum;
-};
-
-struct extent_ptr_decoded {
- bool has_ec;
- bool do_ec_reconstruct;
- u8 crc_retry_nr;
- struct bch_extent_crc_unpacked crc;
- struct bch_extent_ptr ptr;
- struct bch_extent_stripe_ptr ec;
-};
-
-struct bch_io_failures {
- u8 nr;
- struct bch_dev_io_failures {
- u8 dev;
- unsigned failed_csum_nr:6,
- failed_io:1,
- failed_btree_validate:1,
- failed_ec:1;
- } devs[BCH_REPLICAS_MAX + 1];
-};
-
-#endif /* _BCACHEFS_EXTENTS_TYPES_H */
diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c
deleted file mode 100644
index 0e742555cb0a..000000000000
--- a/fs/bcachefs/eytzinger.c
+++ /dev/null
@@ -1,315 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "eytzinger.h"
-
-/**
- * is_aligned - is this pointer & size okay for word-wide copying?
- * @base: pointer to data
- * @size: size of each element
- * @align: required alignment (typically 4 or 8)
- *
- * Returns true if elements can be copied using word loads and stores.
- * The size must be a multiple of the alignment, and the base address must
- * also be aligned unless CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is set.
- *
- * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
- * to "if ((a | b) & mask)", so we do that by hand.
- */
-__attribute_const__ __always_inline
-static bool is_aligned(const void *base, size_t size, unsigned char align)
-{
- unsigned char lsbits = (unsigned char)size;
-
- (void)base;
-#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
- lsbits |= (unsigned char)(uintptr_t)base;
-#endif
- return (lsbits & (align - 1)) == 0;
-}
-
-/**
- * swap_words_32 - swap two elements in 32-bit chunks
- * @a: pointer to the first element to swap
- * @b: pointer to the second element to swap
- * @n: element size (must be a multiple of 4)
- *
- * Exchange the two objects in memory. This exploits base+index addressing,
- * which basically all CPUs have, to minimize loop overhead computations.
- *
- * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
- * bottom of the loop, even though the zero flag is still valid from the
- * subtract (since the intervening mov instructions don't alter the flags).
- * Gcc 8.1.0 doesn't have that problem.
- */
-static void swap_words_32(void *a, void *b, size_t n)
-{
- do {
- u32 t = *(u32 *)(a + (n -= 4));
- *(u32 *)(a + n) = *(u32 *)(b + n);
- *(u32 *)(b + n) = t;
- } while (n);
-}
-
-/**
- * swap_words_64 - swap two elements in 64-bit chunks
- * @a: pointer to the first element to swap
- * @b: pointer to the second element to swap
- * @n: element size (must be a multiple of 8)
- *
- * Exchange the two objects in memory. This exploits base+index
- * addressing, which basically all CPUs have, to minimize loop overhead
- * computations.
- *
- * We'd like to use 64-bit loads if possible. If they're not, emulating
- * one requires base+index+4 addressing which x86 has but most other
- * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads,
- * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
- * x32 ABI). Are there any cases the kernel needs to worry about?
- */
-static void swap_words_64(void *a, void *b, size_t n)
-{
- do {
-#ifdef CONFIG_64BIT
- u64 t = *(u64 *)(a + (n -= 8));
- *(u64 *)(a + n) = *(u64 *)(b + n);
- *(u64 *)(b + n) = t;
-#else
- /* Use two 32-bit transfers to avoid base+index+4 addressing */
- u32 t = *(u32 *)(a + (n -= 4));
- *(u32 *)(a + n) = *(u32 *)(b + n);
- *(u32 *)(b + n) = t;
-
- t = *(u32 *)(a + (n -= 4));
- *(u32 *)(a + n) = *(u32 *)(b + n);
- *(u32 *)(b + n) = t;
-#endif
- } while (n);
-}
-
-/**
- * swap_bytes - swap two elements a byte at a time
- * @a: pointer to the first element to swap
- * @b: pointer to the second element to swap
- * @n: element size
- *
- * This is the fallback if alignment doesn't allow using larger chunks.
- */
-static void swap_bytes(void *a, void *b, size_t n)
-{
- do {
- char t = ((char *)a)[--n];
- ((char *)a)[n] = ((char *)b)[n];
- ((char *)b)[n] = t;
- } while (n);
-}
-
-/*
- * The values are arbitrary as long as they can't be confused with
- * a pointer, but small integers make for the smallest compare
- * instructions.
- */
-#define SWAP_WORDS_64 (swap_r_func_t)0
-#define SWAP_WORDS_32 (swap_r_func_t)1
-#define SWAP_BYTES (swap_r_func_t)2
-#define SWAP_WRAPPER (swap_r_func_t)3
-
-struct wrapper {
- cmp_func_t cmp;
- swap_func_t swap_func;
-};
-
-/*
- * The function pointer is last to make tail calls most efficient if the
- * compiler decides not to inline this function.
- */
-static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv)
-{
- if (swap_func == SWAP_WRAPPER) {
- ((const struct wrapper *)priv)->swap_func(a, b, (int)size);
- return;
- }
-
- if (swap_func == SWAP_WORDS_64)
- swap_words_64(a, b, size);
- else if (swap_func == SWAP_WORDS_32)
- swap_words_32(a, b, size);
- else if (swap_func == SWAP_BYTES)
- swap_bytes(a, b, size);
- else
- swap_func(a, b, (int)size, priv);
-}
-
-#define _CMP_WRAPPER ((cmp_r_func_t)0L)
-
-static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv)
-{
- if (cmp == _CMP_WRAPPER)
- return ((const struct wrapper *)priv)->cmp(a, b);
- return cmp(a, b, priv);
-}
-
-static inline int eytzinger1_do_cmp(void *base1, size_t n, size_t size,
- cmp_r_func_t cmp_func, const void *priv,
- size_t l, size_t r)
-{
- return do_cmp(base1 + inorder_to_eytzinger1(l, n) * size,
- base1 + inorder_to_eytzinger1(r, n) * size,
- cmp_func, priv);
-}
-
-static inline void eytzinger1_do_swap(void *base1, size_t n, size_t size,
- swap_r_func_t swap_func, const void *priv,
- size_t l, size_t r)
-{
- do_swap(base1 + inorder_to_eytzinger1(l, n) * size,
- base1 + inorder_to_eytzinger1(r, n) * size,
- size, swap_func, priv);
-}
-
-static void eytzinger1_sort_r(void *base1, size_t n, size_t size,
- cmp_r_func_t cmp_func,
- swap_r_func_t swap_func,
- const void *priv)
-{
- unsigned i, j, k;
-
- /* called from 'sort' without swap function, let's pick the default */
- if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func)
- swap_func = NULL;
-
- if (!swap_func) {
- if (is_aligned(base1, size, 8))
- swap_func = SWAP_WORDS_64;
- else if (is_aligned(base1, size, 4))
- swap_func = SWAP_WORDS_32;
- else
- swap_func = SWAP_BYTES;
- }
-
- /* heapify */
- for (i = n / 2; i >= 1; --i) {
- /* Find the sift-down path all the way to the leaves. */
- for (j = i; k = j * 2, k < n;)
- j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
-
- /* Special case for the last leaf with no sibling. */
- if (j * 2 == n)
- j *= 2;
-
- /* Backtrack to the correct location. */
- while (j != i && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, i, j) >= 0)
- j /= 2;
-
- /* Shift the element into its correct place. */
- for (k = j; j != i;) {
- j /= 2;
- eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k);
- }
- }
-
- /* sort */
- for (i = n; i > 1; --i) {
- eytzinger1_do_swap(base1, n, size, swap_func, priv, 1, i);
-
- /* Find the sift-down path all the way to the leaves. */
- for (j = 1; k = j * 2, k + 1 < i;)
- j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
-
- /* Special case for the last leaf with no sibling. */
- if (j * 2 + 1 == i)
- j *= 2;
-
- /* Backtrack to the correct location. */
- while (j >= 1 && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, 1, j) >= 0)
- j /= 2;
-
- /* Shift the element into its correct place. */
- for (k = j; j > 1;) {
- j /= 2;
- eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k);
- }
- }
-}
-
-void eytzinger0_sort_r(void *base, size_t n, size_t size,
- cmp_r_func_t cmp_func,
- swap_r_func_t swap_func,
- const void *priv)
-{
- void *base1 = base - size;
-
- return eytzinger1_sort_r(base1, n, size, cmp_func, swap_func, priv);
-}
-
-void eytzinger0_sort(void *base, size_t n, size_t size,
- cmp_func_t cmp_func,
- swap_func_t swap_func)
-{
- struct wrapper w = {
- .cmp = cmp_func,
- .swap_func = swap_func,
- };
-
- return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
-}
-
-#if 0
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/ktime.h>
-
-static u64 cmp_count;
-
-static int mycmp(const void *a, const void *b)
-{
- u32 _a = *(u32 *)a;
- u32 _b = *(u32 *)b;
-
- cmp_count++;
- if (_a < _b)
- return -1;
- else if (_a > _b)
- return 1;
- else
- return 0;
-}
-
-static int test(void)
-{
- size_t N, i;
- ktime_t start, end;
- s64 delta;
- u32 *arr;
-
- for (N = 10000; N <= 100000; N += 10000) {
- arr = kmalloc_array(N, sizeof(u32), GFP_KERNEL);
- cmp_count = 0;
-
- for (i = 0; i < N; i++)
- arr[i] = get_random_u32();
-
- start = ktime_get();
- eytzinger0_sort(arr, N, sizeof(u32), mycmp, NULL);
- end = ktime_get();
-
- delta = ktime_us_delta(end, start);
- printk(KERN_INFO "time: %lld\n", delta);
- printk(KERN_INFO "comparisons: %lld\n", cmp_count);
-
- u32 prev = 0;
-
- eytzinger0_for_each(i, N) {
- if (prev > arr[i])
- goto err;
- prev = arr[i];
- }
-
- kfree(arr);
- }
- return 0;
-
-err:
- kfree(arr);
- return -1;
-}
-#endif
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
deleted file mode 100644
index 643c1f716061..000000000000
--- a/fs/bcachefs/eytzinger.h
+++ /dev/null
@@ -1,300 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _EYTZINGER_H
-#define _EYTZINGER_H
-
-#include <linux/bitops.h>
-#include <linux/log2.h>
-
-#ifdef EYTZINGER_DEBUG
-#include <linux/bug.h>
-#define EYTZINGER_BUG_ON(cond) BUG_ON(cond)
-#else
-#define EYTZINGER_BUG_ON(cond)
-#endif
-
-/*
- * Traversal for trees in eytzinger layout - a full binary tree laid out in an
- * array.
- *
- * Consider using an eytzinger tree any time you would otherwise be doing binary
- * search over an array. Binary search is a worst case scenario for branch
- * prediction and prefetching, but in an eytzinger tree every node's children
- * are adjacent in memory, thus we can prefetch children before knowing the
- * result of the comparison, assuming multiple nodes fit on a cacheline.
- *
- * Two variants are provided, for one based indexing and zero based indexing.
- *
- * Zero based indexing is more convenient, but one based indexing has better
- * alignment and thus better performance because each new level of the tree
- * starts at a power of two, and thus if element 0 was cacheline aligned, each
- * new level will be as well.
- */
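A hedged illustration of the one-based layout described above, for a sorted array of seven elements:

/*
 *   eytzinger1 index:  1  2  3  4  5  6  7
 *   stored value:      4  2  6  1  3  5  7
 *
 * Index 1 is the root and indices 2i / 2i+1 are the children of i, so an
 * in-order walk (eytzinger1_first() followed by eytzinger1_next()) visits
 * the values in sorted order 1..7.
 */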
-
-static inline unsigned eytzinger1_child(unsigned i, unsigned child)
-{
- EYTZINGER_BUG_ON(child > 1);
-
- return (i << 1) + child;
-}
-
-static inline unsigned eytzinger1_left_child(unsigned i)
-{
- return eytzinger1_child(i, 0);
-}
-
-static inline unsigned eytzinger1_right_child(unsigned i)
-{
- return eytzinger1_child(i, 1);
-}
-
-static inline unsigned eytzinger1_first(unsigned size)
-{
- return size ? rounddown_pow_of_two(size) : 0;
-}
-
-static inline unsigned eytzinger1_last(unsigned size)
-{
- return rounddown_pow_of_two(size + 1) - 1;
-}
-
-static inline unsigned eytzinger1_next(unsigned i, unsigned size)
-{
- EYTZINGER_BUG_ON(i == 0 || i > size);
-
- if (eytzinger1_right_child(i) <= size) {
- i = eytzinger1_right_child(i);
-
- i <<= __fls(size) - __fls(i);
- i >>= i > size;
- } else {
- i >>= ffz(i) + 1;
- }
-
- return i;
-}
-
-static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
-{
- EYTZINGER_BUG_ON(i == 0 || i > size);
-
- if (eytzinger1_left_child(i) <= size) {
- i = eytzinger1_left_child(i) + 1;
-
- i <<= __fls(size) - __fls(i);
- i -= 1;
- i >>= i > size;
- } else {
- i >>= __ffs(i) + 1;
- }
-
- return i;
-}
-
-static inline unsigned eytzinger1_extra(unsigned size)
-{
- return size
- ? (size + 1 - rounddown_pow_of_two(size)) << 1
- : 0;
-}
-
-static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
- unsigned extra)
-{
- unsigned b = __fls(i);
- unsigned shift = __fls(size) - b;
- int s;
-
- EYTZINGER_BUG_ON(!i || i > size);
-
- i ^= 1U << b;
- i <<= 1;
- i |= 1;
- i <<= shift;
-
- /*
- * sign bit trick:
- *
- * if (i > extra)
- * i -= (i - extra) >> 1;
- */
- s = extra - i;
- i += (s >> 1) & (s >> 31);
-
- return i;
-}
-
-static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
- unsigned extra)
-{
- unsigned shift;
- int s;
-
- EYTZINGER_BUG_ON(!i || i > size);
-
- /*
- * sign bit trick:
- *
- * if (i > extra)
- * i += i - extra;
- */
- s = extra - i;
- i -= s & (s >> 31);
-
- shift = __ffs(i);
-
- i >>= shift + 1;
- i |= 1U << (__fls(size) - shift);
-
- return i;
-}
-
-static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size)
-{
- return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size));
-}
-
-static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
-{
- return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size));
-}
-
-#define eytzinger1_for_each(_i, _size) \
- for (unsigned (_i) = eytzinger1_first((_size)); \
- (_i) != 0; \
- (_i) = eytzinger1_next((_i), (_size)))
-
-/* Zero based indexing version: */
-
-static inline unsigned eytzinger0_child(unsigned i, unsigned child)
-{
- EYTZINGER_BUG_ON(child > 1);
-
- return (i << 1) + 1 + child;
-}
-
-static inline unsigned eytzinger0_left_child(unsigned i)
-{
- return eytzinger0_child(i, 0);
-}
-
-static inline unsigned eytzinger0_right_child(unsigned i)
-{
- return eytzinger0_child(i, 1);
-}
-
-static inline unsigned eytzinger0_first(unsigned size)
-{
- return eytzinger1_first(size) - 1;
-}
-
-static inline unsigned eytzinger0_last(unsigned size)
-{
- return eytzinger1_last(size) - 1;
-}
-
-static inline unsigned eytzinger0_next(unsigned i, unsigned size)
-{
- return eytzinger1_next(i + 1, size) - 1;
-}
-
-static inline unsigned eytzinger0_prev(unsigned i, unsigned size)
-{
- return eytzinger1_prev(i + 1, size) - 1;
-}
-
-static inline unsigned eytzinger0_extra(unsigned size)
-{
- return eytzinger1_extra(size);
-}
-
-static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size,
- unsigned extra)
-{
- return __eytzinger1_to_inorder(i + 1, size, extra) - 1;
-}
-
-static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size,
- unsigned extra)
-{
- return __inorder_to_eytzinger1(i + 1, size, extra) - 1;
-}
-
-static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size)
-{
- return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size));
-}
-
-static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
-{
- return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size));
-}
-
-#define eytzinger0_for_each(_i, _size) \
- for (unsigned (_i) = eytzinger0_first((_size)); \
- (_i) != -1; \
- (_i) = eytzinger0_next((_i), (_size)))
-
-#define eytzinger0_for_each_prev(_i, _size) \
- for (unsigned (_i) = eytzinger0_last((_size)); \
- (_i) != -1; \
- (_i) = eytzinger0_prev((_i), (_size)))
-
-/* return greatest node <= @search, or -1 if not found */
-static inline int eytzinger0_find_le(void *base, size_t nr, size_t size,
- cmp_func_t cmp, const void *search)
-{
- void *base1 = base - size;
- unsigned n = 1;
-
- while (n <= nr)
- n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0);
- n >>= __ffs(n) + 1;
- return n - 1;
-}
-
-/* return smallest node > @search, or -1 if not found */
-static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size,
- cmp_func_t cmp, const void *search)
-{
- void *base1 = base - size;
- unsigned n = 1;
-
- while (n <= nr)
- n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0);
- n >>= __ffs(n + 1) + 1;
- return n - 1;
-}
-
-/* return smallest node >= @search, or -1 if not found */
-static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size,
- cmp_func_t cmp, const void *search)
-{
- void *base1 = base - size;
- unsigned n = 1;
-
- while (n <= nr)
- n = eytzinger1_child(n, cmp(base1 + n * size, search) < 0);
- n >>= __ffs(n + 1) + 1;
- return n - 1;
-}
-
-#define eytzinger0_find(base, nr, size, _cmp, search) \
-({ \
- size_t _size = (size); \
- void *_base1 = (void *)(base) - _size; \
- const void *_search = (search); \
- size_t _nr = (nr); \
- size_t _i = 1; \
- int _res; \
- \
- while (_i <= _nr && \
- (_res = _cmp(_search, _base1 + _i * _size))) \
- _i = eytzinger1_child(_i, _res > 0); \
- _i - 1; \
-})
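A hedged usage sketch: put an array of u32s into eytzinger0 order with eytzinger0_sort(), then look up the greatest element <= a key. cmp_u32() and lookup_le_u32() are illustrative helpers, not names from the source:

static int cmp_u32(const void *a, const void *b)
{
	u32 x = *(const u32 *)a, y = *(const u32 *)b;

	return x < y ? -1 : x > y ? 1 : 0;
}

static int lookup_le_u32(u32 *arr, size_t nr, u32 key)
{
	/*
	 * arr must already be in eytzinger0 order, e.g. via
	 * eytzinger0_sort(arr, nr, sizeof(arr[0]), cmp_u32, NULL);
	 */
	return eytzinger0_find_le(arr, nr, sizeof(arr[0]), cmp_u32, &key);
}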
-
-void eytzinger0_sort_r(void *, size_t, size_t,
- cmp_r_func_t, swap_r_func_t, const void *);
-void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t);
-
-#endif /* _EYTZINGER_H */
diff --git a/fs/bcachefs/fast_list.c b/fs/bcachefs/fast_list.c
deleted file mode 100644
index 2faec143eb31..000000000000
--- a/fs/bcachefs/fast_list.c
+++ /dev/null
@@ -1,156 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/*
- * Fast, unordered lists
- *
- * Supports add, remove, and iterate
- *
- * Underneath, they're a radix tree and an IDA, with a percpu buffer for slot
- * allocation and freeing.
- *
- * This means that adding, removing, and iterating over items are lockless,
- * except when refilling/emptying the percpu slot buffers.
- */
-
-#include "fast_list.h"
-
-struct fast_list_pcpu {
- u32 nr;
- u32 entries[31];
-};
-
-static int fast_list_alloc_idx(struct fast_list *l, gfp_t gfp)
-{
- int idx = ida_alloc_range(&l->slots_allocated, 1, INT_MAX, gfp);
- if (unlikely(idx < 0))
- return 0;
-
- if (unlikely(!genradix_ptr_alloc_inlined(&l->items, idx, gfp))) {
- ida_free(&l->slots_allocated, idx);
- return 0;
- }
-
- return idx;
-}
-
-/**
- * fast_list_get_idx - get a slot in a fast_list
- * @l: list to get slot in
- *
- * This allocates a slot in the radix tree without storing to it, so that we can
- * take the potential memory allocation failure early and do the list add later
- * when we can't take an allocation failure.
- *
- * Returns: positive integer on success, -ENOMEM on failure
- */
-int fast_list_get_idx(struct fast_list *l)
-{
- unsigned long flags;
- int idx;
-retry:
- local_irq_save(flags);
- struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer);
-
- if (unlikely(!lp->nr)) {
- u32 entries[16], nr = 0;
-
- local_irq_restore(flags);
- while (nr < ARRAY_SIZE(entries) &&
- (idx = fast_list_alloc_idx(l, GFP_KERNEL)))
- entries[nr++] = idx;
- local_irq_save(flags);
-
- lp = this_cpu_ptr(l->buffer);
-
- while (nr && lp->nr < ARRAY_SIZE(lp->entries))
- lp->entries[lp->nr++] = entries[--nr];
-
- if (unlikely(nr)) {
- local_irq_restore(flags);
- while (nr)
- ida_free(&l->slots_allocated, entries[--nr]);
- goto retry;
- }
-
- if (unlikely(!lp->nr)) {
- local_irq_restore(flags);
- return -ENOMEM;
- }
- }
-
- idx = lp->entries[--lp->nr];
- local_irq_restore(flags);
-
- return idx;
-}
-
-/**
- * fast_list_add - add an item to a fast_list
- * @l: list
- * @item: item to add
- *
- * Allocates a slot in the radix tree and stores to it and then returns the
- * slot index, which must be passed to fast_list_remove().
- *
- * Returns: positive integer on success, -ENOMEM on failure
- */
-int fast_list_add(struct fast_list *l, void *item)
-{
- int idx = fast_list_get_idx(l);
- if (idx < 0)
- return idx;
-
- *genradix_ptr_inlined(&l->items, idx) = item;
- return idx;
-}
-
-/**
- * fast_list_remove - remove an item from a fast_list
- * @l: list
- * @idx: item's slot index
- *
- * Zeroes out the slot in the radix tree and frees the slot for future
- * fast_list_add() operations.
- */
-void fast_list_remove(struct fast_list *l, unsigned idx)
-{
- u32 entries[16], nr = 0;
- unsigned long flags;
-
- if (!idx)
- return;
-
- *genradix_ptr_inlined(&l->items, idx) = NULL;
-
- local_irq_save(flags);
- struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer);
-
- if (unlikely(lp->nr == ARRAY_SIZE(lp->entries)))
- while (nr < ARRAY_SIZE(entries))
- entries[nr++] = lp->entries[--lp->nr];
-
- lp->entries[lp->nr++] = idx;
- local_irq_restore(flags);
-
- if (unlikely(nr))
- while (nr)
- ida_free(&l->slots_allocated, entries[--nr]);
-}
-
-void fast_list_exit(struct fast_list *l)
-{
- /* XXX: warn if list isn't empty */
- free_percpu(l->buffer);
- ida_destroy(&l->slots_allocated);
- genradix_free(&l->items);
-}
-
-int fast_list_init(struct fast_list *l)
-{
- genradix_init(&l->items);
- ida_init(&l->slots_allocated);
- l->buffer = alloc_percpu(*l->buffer);
- if (!l->buffer)
- return -ENOMEM;
- return 0;
-}
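A hedged usage sketch of the API above; struct my_obj and its fast_list_idx member are assumptions made for illustration, not names from the source:

struct my_obj {
	unsigned	fast_list_idx;
	/* ... */
};

static int my_obj_track(struct fast_list *l, struct my_obj *obj)
{
	int idx = fast_list_add(l, obj);
	if (idx < 0)
		return idx;

	/* remember the slot: fast_list_remove() needs it later */
	obj->fast_list_idx = idx;
	return 0;
}

static void my_obj_untrack(struct fast_list *l, struct my_obj *obj)
{
	fast_list_remove(l, obj->fast_list_idx);
	obj->fast_list_idx = 0;
}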
diff --git a/fs/bcachefs/fast_list.h b/fs/bcachefs/fast_list.h
deleted file mode 100644
index 73c9bf591fd6..000000000000
--- a/fs/bcachefs/fast_list.h
+++ /dev/null
@@ -1,41 +0,0 @@
-#ifndef _LINUX_FAST_LIST_H
-#define _LINUX_FAST_LIST_H
-
-#include <linux/generic-radix-tree.h>
-#include <linux/idr.h>
-#include <linux/percpu.h>
-
-struct fast_list_pcpu;
-
-struct fast_list {
- GENRADIX(void *) items;
- struct ida slots_allocated;
- struct fast_list_pcpu __percpu
- *buffer;
-};
-
-static inline void *fast_list_iter_peek(struct genradix_iter *iter,
- struct fast_list *list)
-{
- void **p;
- while ((p = genradix_iter_peek(iter, &list->items)) && !*p)
- genradix_iter_advance(iter, &list->items);
-
- return p ? *p : NULL;
-}
-
-#define fast_list_for_each_from(_list, _iter, _i, _start) \
- for (_iter = genradix_iter_init(&(_list)->items, _start); \
- (_i = fast_list_iter_peek(&(_iter), _list)) != NULL; \
- genradix_iter_advance(&(_iter), &(_list)->items))
-
-#define fast_list_for_each(_list, _iter, _i) \
- fast_list_for_each_from(_list, _iter, _i, 0)
-
-int fast_list_get_idx(struct fast_list *l);
-int fast_list_add(struct fast_list *l, void *item);
-void fast_list_remove(struct fast_list *l, unsigned idx);
-void fast_list_exit(struct fast_list *l);
-int fast_list_init(struct fast_list *l);
-
-#endif /* _LINUX_FAST_LIST_H */
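For reference, a minimal usage sketch of the fast_list API deleted above (a hypothetical caller, not code from this tree; do_something() is a placeholder consumer and error handling is abbreviated):

static int fast_list_example(struct fast_list *list, void *obj)
{
	struct genradix_iter iter;
	void *i;

	int ret = fast_list_init(list);
	if (ret)
		return ret;

	/* fast_list_add() stores obj and returns a slot index (> 0) */
	int idx = fast_list_add(list, obj);
	if (idx < 0) {
		fast_list_exit(list);
		return idx;
	}

	/* iteration skips empty slots via fast_list_iter_peek() */
	fast_list_for_each(list, iter, i)
		do_something(i);	/* placeholder */

	fast_list_remove(list, idx);
	fast_list_exit(list);
	return 0;
}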
diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h
deleted file mode 100644
index d8153fe27037..000000000000
--- a/fs/bcachefs/fifo.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FIFO_H
-#define _BCACHEFS_FIFO_H
-
-#include "util.h"
-
-#define FIFO(type) \
-struct { \
- size_t front, back, size, mask; \
- type *data; \
-}
-
-#define DECLARE_FIFO(type, name) FIFO(type) name
-
-#define fifo_buf_size(fifo) \
- ((fifo)->size \
- ? roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]) \
- : 0)
-
-#define init_fifo(fifo, _size, _gfp) \
-({ \
- (fifo)->front = (fifo)->back = 0; \
- (fifo)->size = (_size); \
- (fifo)->mask = (fifo)->size \
- ? roundup_pow_of_two((fifo)->size) - 1 \
- : 0; \
- (fifo)->data = kvmalloc(fifo_buf_size(fifo), (_gfp)); \
-})
-
-#define free_fifo(fifo) \
-do { \
- kvfree((fifo)->data); \
- (fifo)->data = NULL; \
-} while (0)
-
-#define fifo_swap(l, r) \
-do { \
- swap((l)->front, (r)->front); \
- swap((l)->back, (r)->back); \
- swap((l)->size, (r)->size); \
- swap((l)->mask, (r)->mask); \
- swap((l)->data, (r)->data); \
-} while (0)
-
-#define fifo_move(dest, src) \
-do { \
- typeof(*((dest)->data)) _t; \
- while (!fifo_full(dest) && \
- fifo_pop(src, _t)) \
- fifo_push(dest, _t); \
-} while (0)
-
-#define fifo_used(fifo) (((fifo)->back - (fifo)->front))
-#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo))
-
-#define fifo_empty(fifo) ((fifo)->front == (fifo)->back)
-#define fifo_full(fifo) (fifo_used(fifo) == (fifo)->size)
-
-#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask])
-#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask])
-
-#define fifo_entry_idx_abs(fifo, p) \
- ((((p) >= &fifo_peek_front(fifo) \
- ? (fifo)->front : (fifo)->back) & ~(fifo)->mask) + \
- (((p) - (fifo)->data)))
-
-#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask)
-#define fifo_idx_entry(fifo, i) ((fifo)->data[((fifo)->front + (i)) & (fifo)->mask])
-
-#define fifo_push_back_ref(f) \
- (fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask])
-
-#define fifo_push_front_ref(f) \
- (fifo_full((f)) ? NULL : &(f)->data[--(f)->front & (f)->mask])
-
-#define fifo_push_back(fifo, new) \
-({ \
- typeof((fifo)->data) _r = fifo_push_back_ref(fifo); \
- if (_r) \
- *_r = (new); \
- _r != NULL; \
-})
-
-#define fifo_push_front(fifo, new) \
-({ \
- typeof((fifo)->data) _r = fifo_push_front_ref(fifo); \
- if (_r) \
- *_r = (new); \
- _r != NULL; \
-})
-
-#define fifo_pop_front(fifo, i) \
-({ \
- bool _r = !fifo_empty((fifo)); \
- if (_r) \
- (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask]; \
- _r; \
-})
-
-#define fifo_pop_back(fifo, i) \
-({ \
- bool _r = !fifo_empty((fifo)); \
- if (_r) \
- (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \
- _r; \
-})
-
-#define fifo_push_ref(fifo) fifo_push_back_ref(fifo)
-#define fifo_push(fifo, i) fifo_push_back(fifo, (i))
-#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i))
-#define fifo_peek(fifo) fifo_peek_front(fifo)
-
-#define fifo_for_each_entry(_entry, _fifo, _iter) \
- for (typecheck(typeof((_fifo)->front), _iter), \
- (_iter) = (_fifo)->front; \
- ((_iter != (_fifo)->back) && \
- (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \
- (_iter)++)
-
-#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \
- for (typecheck(typeof((_fifo)->front), _iter), \
- (_iter) = (_fifo)->front; \
- ((_iter != (_fifo)->back) && \
- (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \
- (_iter)++)
-
-#endif /* _BCACHEFS_FIFO_H */
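For reference, a minimal sketch of how the FIFO macros above are typically used (a hypothetical caller, not code from this tree; assumes kernel context for u64, GFP_KERNEL and pr_info()):

struct fifo_example {
	DECLARE_FIFO(u64, pin);	/* struct { front, back, size, mask; u64 *data; } pin */
};

static int fifo_example_run(struct fifo_example *j)
{
	u64 v;

	/* init_fifo() evaluates to the kvmalloc() result, so NULL means -ENOMEM */
	if (!init_fifo(&j->pin, 8, GFP_KERNEL))
		return -ENOMEM;

	while (fifo_push(&j->pin, 42))	/* returns false once fifo_full() */
		;

	while (fifo_pop(&j->pin, v))	/* pops from the front until fifo_empty() */
		pr_info("%llu\n", v);

	free_fifo(&j->pin);
	return 0;
}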
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
deleted file mode 100644
index 1c54b9b5bd69..000000000000
--- a/fs/bcachefs/fs-io-buffered.c
+++ /dev/null
@@ -1,1109 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "fs-io.h"
-#include "fs-io-buffered.h"
-#include "fs-io-direct.h"
-#include "fs-io-pagecache.h"
-#include "io_read.h"
-#include "io_write.h"
-
-#include <linux/backing-dev.h>
-#include <linux/pagemap.h>
-#include <linux/writeback.h>
-
-static inline bool bio_full(struct bio *bio, unsigned len)
-{
- if (bio->bi_vcnt >= bio->bi_max_vecs)
- return true;
- if (bio->bi_iter.bi_size > UINT_MAX - len)
- return true;
- return false;
-}
-
-/* readpage(s): */
-
-static void bch2_readpages_end_io(struct bio *bio)
-{
- struct folio_iter fi;
-
- bio_for_each_folio_all(fi, bio)
- folio_end_read(fi.folio, bio->bi_status == BLK_STS_OK);
-
- bio_put(bio);
-}
-
-struct readpages_iter {
- struct address_space *mapping;
- unsigned idx;
- folios folios;
-};
-
-static int readpages_iter_init(struct readpages_iter *iter,
- struct readahead_control *ractl)
-{
- struct folio *folio;
-
- *iter = (struct readpages_iter) { ractl->mapping };
-
- while ((folio = __readahead_folio(ractl))) {
- if (!bch2_folio_create(folio, GFP_KERNEL) ||
- darray_push(&iter->folios, folio)) {
- bch2_folio_release(folio);
- ractl->_nr_pages += folio_nr_pages(folio);
- ractl->_index -= folio_nr_pages(folio);
- return iter->folios.nr ? 0 : -ENOMEM;
- }
-
- folio_put(folio);
- }
-
- return 0;
-}
-
-static inline struct folio *readpage_iter_peek(struct readpages_iter *iter)
-{
- if (iter->idx >= iter->folios.nr)
- return NULL;
- return iter->folios.data[iter->idx];
-}
-
-static inline void readpage_iter_advance(struct readpages_iter *iter)
-{
- iter->idx++;
-}
-
-static bool extent_partial_reads_expensive(struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct bch_extent_crc_unpacked crc;
- const union bch_extent_entry *i;
-
- bkey_for_each_crc(k.k, ptrs, crc, i)
- if (crc.csum_type || crc.compression_type)
- return true;
- return false;
-}
-
-static int readpage_bio_extend(struct btree_trans *trans,
- struct readpages_iter *iter,
- struct bio *bio,
- unsigned sectors_this_extent,
- bool get_more)
-{
- /* Don't hold btree locks while allocating memory: */
- bch2_trans_unlock(trans);
-
- while (bio_sectors(bio) < sectors_this_extent &&
- bio->bi_vcnt < bio->bi_max_vecs) {
- struct folio *folio = readpage_iter_peek(iter);
- int ret;
-
- if (folio) {
- readpage_iter_advance(iter);
- } else {
- pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT;
-
- if (!get_more)
- break;
-
- unsigned sectors_remaining = sectors_this_extent - bio_sectors(bio);
-
- if (sectors_remaining < PAGE_SECTORS << mapping_min_folio_order(iter->mapping))
- break;
-
- unsigned order = ilog2(rounddown_pow_of_two(sectors_remaining) / PAGE_SECTORS);
-
- /* ensure proper alignment */
- order = min(order, __ffs(folio_offset|BIT(31)));
-
- folio = xa_load(&iter->mapping->i_pages, folio_offset);
- if (folio && !xa_is_value(folio))
- break;
-
- folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), order);
- if (!folio)
- break;
-
- if (!__bch2_folio_create(folio, GFP_KERNEL)) {
- folio_put(folio);
- break;
- }
-
- ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL);
- if (ret) {
- __bch2_folio_release(folio);
- folio_put(folio);
- break;
- }
-
- folio_put(folio);
- }
-
- BUG_ON(folio_sector(folio) != bio_end_sector(bio));
-
- BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0));
- }
-
- return bch2_trans_relock(trans);
-}
-
-static void bchfs_read(struct btree_trans *trans,
- struct bch_read_bio *rbio,
- subvol_inum inum,
- struct readpages_iter *readpages_iter)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_buf sk;
- int flags = BCH_READ_retry_if_stale|
- BCH_READ_may_promote;
- int ret = 0;
-
- rbio->subvol = inum.subvol;
-
- bch2_bkey_buf_init(&sk);
- bch2_trans_begin(trans);
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- POS(inum.inum, rbio->bio.bi_iter.bi_sector),
- BTREE_ITER_slots);
- while (1) {
- struct bkey_s_c k;
- unsigned bytes, sectors;
- s64 offset_into_extent;
- enum btree_id data_btree = BTREE_ID_extents;
-
- bch2_trans_begin(trans);
-
- u32 snapshot;
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
-
- bch2_btree_iter_set_pos(trans, &iter,
- POS(inum.inum, rbio->bio.bi_iter.bi_sector));
-
- k = bch2_btree_iter_peek_slot(trans, &iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- offset_into_extent = iter.pos.offset -
- bkey_start_offset(k.k);
- sectors = k.k->size - offset_into_extent;
-
- bch2_bkey_buf_reassemble(&sk, c, k);
-
- ret = bch2_read_indirect_extent(trans, &data_btree,
- &offset_into_extent, &sk);
- if (ret)
- goto err;
-
- k = bkey_i_to_s_c(sk.k);
-
- sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
-
- if (readpages_iter) {
- ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
- extent_partial_reads_expensive(k));
- if (ret)
- goto err;
- }
-
- bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
- swap(rbio->bio.bi_iter.bi_size, bytes);
-
- if (rbio->bio.bi_iter.bi_size == bytes)
- flags |= BCH_READ_last_fragment;
-
- bch2_bio_page_state_set(&rbio->bio, k);
-
- bch2_read_extent(trans, rbio, iter.pos,
- data_btree, k, offset_into_extent, flags);
- /*
- * Careful: there's a landmine here if bch2_read_extent() ever
- * starts returning transaction restarts.
- *
- * We've changed rbio->bi_iter.bi_size to be "bytes we can read
- * from this extent" with the swap call, and we restore it
- * below. That restore needs to come before checking for
- * errors.
- *
- * But unlike __bch2_read(), we use the rbio bvec iter, not one
- * on the stack, so we can't do the restore right after the
- * bch2_read_extent() call: we don't own that iterator anymore
- * if BCH_READ_last_fragment is set, since we may have submitted
- * that rbio instead of cloning it.
- */
-
- if (flags & BCH_READ_last_fragment)
- break;
-
- swap(rbio->bio.bi_iter.bi_size, bytes);
- bio_advance(&rbio->bio, bytes);
-err:
- if (ret &&
- !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret) {
- struct printbuf buf = PRINTBUF;
- lockrestart_do(trans,
- bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9));
- prt_printf(&buf, "read error %i from btree lookup", ret);
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
-
- rbio->bio.bi_status = BLK_STS_IOERR;
- bio_endio(&rbio->bio);
- }
-
- bch2_bkey_buf_exit(&sk, c);
-}
-
-void bch2_readahead(struct readahead_control *ractl)
-{
- struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_io_opts opts;
- struct folio *folio;
- struct readpages_iter readpages_iter;
- struct blk_plug plug;
-
- bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
- int ret = readpages_iter_init(&readpages_iter, ractl);
- if (ret)
- return;
-
- /*
- * Besides being a general performance optimization, plugging helps with
- * avoiding btree transaction srcu warnings - submitting a bio can
- * block, and we don't want to do that with the transaction locked.
- *
- * However, plugged bios are submitted when we schedule; we ideally
- * would have our own scheduler hook to call unlock_long() before
- * scheduling.
- */
- blk_start_plug(&plug);
- bch2_pagecache_add_get(inode);
-
- struct btree_trans *trans = bch2_trans_get(c);
- while ((folio = readpage_iter_peek(&readpages_iter))) {
- unsigned n = min_t(unsigned,
- readpages_iter.folios.nr -
- readpages_iter.idx,
- BIO_MAX_VECS);
- struct bch_read_bio *rbio =
- rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
- GFP_KERNEL, &c->bio_read),
- c,
- opts,
- bch2_readpages_end_io);
-
- readpage_iter_advance(&readpages_iter);
-
- rbio->bio.bi_iter.bi_sector = folio_sector(folio);
- BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
-
- bchfs_read(trans, rbio, inode_inum(inode),
- &readpages_iter);
- bch2_trans_unlock(trans);
- }
- bch2_trans_put(trans);
-
- bch2_pagecache_add_put(inode);
- blk_finish_plug(&plug);
- darray_exit(&readpages_iter.folios);
-}
-
-static void bch2_read_single_folio_end_io(struct bio *bio)
-{
- complete(bio->bi_private);
-}
-
-int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
-{
- struct bch_inode_info *inode = to_bch_ei(mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_read_bio *rbio;
- struct bch_io_opts opts;
- struct blk_plug plug;
- int ret;
- DECLARE_COMPLETION_ONSTACK(done);
-
- BUG_ON(folio_test_uptodate(folio));
- BUG_ON(folio_test_dirty(folio));
-
- if (!bch2_folio_create(folio, GFP_KERNEL))
- return -ENOMEM;
-
- bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
- rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
- c,
- opts,
- bch2_read_single_folio_end_io);
- rbio->bio.bi_private = &done;
- rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
- rbio->bio.bi_iter.bi_sector = folio_sector(folio);
- BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
-
- blk_start_plug(&plug);
- bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0));
- blk_finish_plug(&plug);
- wait_for_completion(&done);
-
- ret = blk_status_to_errno(rbio->bio.bi_status);
- bio_put(&rbio->bio);
-
- if (ret < 0)
- return ret;
-
- folio_mark_uptodate(folio);
- return 0;
-}
-
-int bch2_read_folio(struct file *file, struct folio *folio)
-{
- int ret;
-
- ret = bch2_read_single_folio(folio, folio->mapping);
- folio_unlock(folio);
- return bch2_err_class(ret);
-}
-
-/* writepages: */
-
-struct bch_writepage_io {
- struct bch_inode_info *inode;
-
- /* must be last: */
- struct bch_write_op op;
-};
-
-struct bch_writepage_state {
- struct bch_writepage_io *io;
- struct bch_io_opts opts;
- struct bch_folio_sector *tmp;
- unsigned tmp_sectors;
- struct blk_plug plug;
-};
-
-/*
- * Determine when a writepage io is full. We have to limit writepage bios to a
- * single page per bvec (i.e. 1MB with 4k pages) because that is the limit to
- * what the bounce path in bch2_write_extent() can handle. In theory we could
- * loosen this restriction for non-bounce I/O, but we don't have that context
- * here. Ideally, we can up this limit and make it configurable in the future
- * when the bounce path can be enhanced to accommodate larger source bios.
- */
-static inline bool bch_io_full(struct bch_writepage_io *io, unsigned len)
-{
- struct bio *bio = &io->op.wbio.bio;
- return bio_full(bio, len) ||
- (bio->bi_iter.bi_size + len > BIO_MAX_VECS * PAGE_SIZE);
-}
-
-static void bch2_writepage_io_done(struct bch_write_op *op)
-{
- struct bch_writepage_io *io =
- container_of(op, struct bch_writepage_io, op);
- struct bch_fs *c = io->op.c;
- struct bio *bio = &io->op.wbio.bio;
- struct folio_iter fi;
- unsigned i;
-
- if (io->op.error) {
- set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
-
- bio_for_each_folio_all(fi, bio) {
- struct bch_folio *s;
-
- mapping_set_error(fi.folio->mapping, -EIO);
-
- s = __bch2_folio(fi.folio);
- spin_lock(&s->lock);
- for (i = 0; i < folio_sectors(fi.folio); i++)
- s->s[i].nr_replicas = 0;
- spin_unlock(&s->lock);
- }
- }
-
- if (io->op.flags & BCH_WRITE_wrote_data_inline) {
- bio_for_each_folio_all(fi, bio) {
- struct bch_folio *s;
-
- s = __bch2_folio(fi.folio);
- spin_lock(&s->lock);
- for (i = 0; i < folio_sectors(fi.folio); i++)
- s->s[i].nr_replicas = 0;
- spin_unlock(&s->lock);
- }
- }
-
- /*
- * racing with fallocate can cause us to add fewer sectors than
- * expected - but we shouldn't add more sectors than expected:
- */
- WARN_ON_ONCE(io->op.i_sectors_delta > 0);
-
- /*
- * (error (due to going RO) halfway through a page can screw that up
- * slightly)
- * XXX wtf?
- BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS);
- */
-
- /*
- * The writeback flag is effectively our ref on the inode -
- * fixup i_blocks before calling folio_end_writeback:
- */
- bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
-
- bio_for_each_folio_all(fi, bio) {
- struct bch_folio *s = __bch2_folio(fi.folio);
-
- if (atomic_dec_and_test(&s->write_count))
- folio_end_writeback(fi.folio);
- }
-
- bio_put(&io->op.wbio.bio);
-}
-
-static void bch2_writepage_do_io(struct bch_writepage_state *w)
-{
- struct bch_writepage_io *io = w->io;
-
- w->io = NULL;
- closure_call(&io->op.cl, bch2_write, NULL, NULL);
-}
-
-/*
- * Get a bch_writepage_io and add @page to it - appending to an existing one if
- * possible, else allocating a new one:
- */
-static void bch2_writepage_io_alloc(struct bch_fs *c,
- struct writeback_control *wbc,
- struct bch_writepage_state *w,
- struct bch_inode_info *inode,
- u64 sector,
- unsigned nr_replicas)
-{
- struct bch_write_op *op;
-
- w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS,
- REQ_OP_WRITE,
- GFP_KERNEL,
- &c->writepage_bioset),
- struct bch_writepage_io, op.wbio.bio);
-
- w->io->inode = inode;
- op = &w->io->op;
- bch2_write_op_init(op, c, w->opts);
- op->target = w->opts.foreground_target;
- op->nr_replicas = nr_replicas;
- op->res.nr_replicas = nr_replicas;
- op->write_point = writepoint_hashed(inode->ei_last_dirtied);
- op->subvol = inode->ei_inum.subvol;
- op->pos = POS(inode->v.i_ino, sector);
- op->end_io = bch2_writepage_io_done;
- op->devs_need_flush = &inode->ei_devs_need_flush;
- op->wbio.bio.bi_iter.bi_sector = sector;
- op->wbio.bio.bi_opf = wbc_to_write_flags(wbc);
-}
-
-static int __bch2_writepage(struct folio *folio,
- struct writeback_control *wbc,
- void *data)
-{
- struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_writepage_state *w = data;
- struct bch_folio *s;
- unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX;
- loff_t i_size = i_size_read(&inode->v);
- int ret;
-
- EBUG_ON(!folio_test_uptodate(folio));
-
- /* Is the folio fully inside i_size? */
- if (folio_end_pos(folio) <= i_size)
- goto do_io;
-
- /* Is the folio fully outside i_size? (truncate in progress) */
- if (folio_pos(folio) >= i_size) {
- folio_unlock(folio);
- return 0;
- }
-
- /*
- * The folio straddles i_size. It must be zeroed out on each and every
- * writepage invocation because it may be mmapped. "A file is mapped
- * in multiples of the folio size. For a file that is not a multiple of
- * the folio size, the remaining memory is zeroed when mapped, and
- * writes to that region are not written out to the file."
- */
- folio_zero_segment(folio,
- i_size - folio_pos(folio),
- folio_size(folio));
-do_io:
- f_sectors = folio_sectors(folio);
- s = bch2_folio(folio);
-
- if (f_sectors > w->tmp_sectors) {
- kfree(w->tmp);
- w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), GFP_NOFS|__GFP_NOFAIL);
- w->tmp_sectors = f_sectors;
- }
-
- /*
- * Things get really hairy with errors during writeback:
- */
- ret = bch2_get_folio_disk_reservation(c, inode, folio, false);
- BUG_ON(ret);
-
- /* Before unlocking the page, get copy of reservations: */
- spin_lock(&s->lock);
- memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors);
-
- for (i = 0; i < f_sectors; i++) {
- if (s->s[i].state < SECTOR_dirty)
- continue;
-
- nr_replicas_this_write =
- min_t(unsigned, nr_replicas_this_write,
- s->s[i].nr_replicas +
- s->s[i].replicas_reserved);
- }
-
- for (i = 0; i < f_sectors; i++) {
- if (s->s[i].state < SECTOR_dirty)
- continue;
-
- s->s[i].nr_replicas = w->opts.compression
- ? 0 : nr_replicas_this_write;
-
- s->s[i].replicas_reserved = 0;
- bch2_folio_sector_set(folio, s, i, SECTOR_allocated);
- }
- spin_unlock(&s->lock);
-
- BUG_ON(atomic_read(&s->write_count));
- atomic_set(&s->write_count, 1);
-
- BUG_ON(folio_test_writeback(folio));
- folio_start_writeback(folio);
-
- folio_unlock(folio);
-
- offset = 0;
- while (1) {
- unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0;
- u64 sector;
-
- while (offset < f_sectors &&
- w->tmp[offset].state < SECTOR_dirty)
- offset++;
-
- if (offset == f_sectors)
- break;
-
- while (offset + sectors < f_sectors &&
- w->tmp[offset + sectors].state >= SECTOR_dirty) {
- reserved_sectors += w->tmp[offset + sectors].replicas_reserved;
- dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty;
- sectors++;
- }
- BUG_ON(!sectors);
-
- sector = folio_sector(folio) + offset;
-
- if (w->io &&
- (w->io->op.res.nr_replicas != nr_replicas_this_write ||
- bch_io_full(w->io, sectors << 9) ||
- bio_end_sector(&w->io->op.wbio.bio) != sector))
- bch2_writepage_do_io(w);
-
- if (!w->io)
- bch2_writepage_io_alloc(c, wbc, w, inode, sector,
- nr_replicas_this_write);
-
- atomic_inc(&s->write_count);
-
- BUG_ON(inode != w->io->inode);
- BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio,
- sectors << 9, offset << 9));
-
- w->io->op.res.sectors += reserved_sectors;
- w->io->op.i_sectors_delta -= dirty_sectors;
- w->io->op.new_i_size = i_size;
-
- offset += sectors;
- }
-
- if (atomic_dec_and_test(&s->write_count))
- folio_end_writeback(folio);
-
- return 0;
-}
-
-int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
-{
- struct bch_fs *c = mapping->host->i_sb->s_fs_info;
- struct bch_writepage_state *w = kzalloc(sizeof(*w), GFP_NOFS|__GFP_NOFAIL);
-
- bch2_inode_opts_get(&w->opts, c, &to_bch_ei(mapping->host)->ei_inode);
-
- blk_start_plug(&w->plug);
- int ret = write_cache_pages(mapping, wbc, __bch2_writepage, w);
- if (w->io)
- bch2_writepage_do_io(w);
- blk_finish_plug(&w->plug);
- kfree(w->tmp);
- kfree(w);
- return bch2_err_class(ret);
-}
-
-/* buffered writes: */
-
-int bch2_write_begin(const struct kiocb *iocb, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
-{
- struct bch_inode_info *inode = to_bch_ei(mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch2_folio_reservation *res;
- struct folio *folio;
- unsigned offset;
- int ret = -ENOMEM;
-
- res = kmalloc(sizeof(*res), GFP_KERNEL);
- if (!res)
- return -ENOMEM;
-
- bch2_folio_reservation_init(c, inode, res);
- *fsdata = res;
-
- bch2_pagecache_add_get(inode);
-
- folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
- FGP_WRITEBEGIN | fgf_set_order(len),
- mapping_gfp_mask(mapping));
- if (IS_ERR(folio))
- goto err_unlock;
-
- offset = pos - folio_pos(folio);
- len = min_t(size_t, len, folio_end_pos(folio) - pos);
-
- if (folio_test_uptodate(folio))
- goto out;
-
- /* If we're writing the entire folio, we don't need to read it in first: */
- if (!offset && len == folio_size(folio))
- goto out;
-
- if (!offset && pos + len >= inode->v.i_size) {
- folio_zero_segment(folio, len, folio_size(folio));
- flush_dcache_folio(folio);
- goto out;
- }
-
- if (folio_pos(folio) >= inode->v.i_size) {
- folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio));
- flush_dcache_folio(folio);
- goto out;
- }
-readpage:
- ret = bch2_read_single_folio(folio, mapping);
- if (ret)
- goto err;
-out:
- ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
- if (ret)
- goto err;
-
- ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len);
- if (ret) {
- if (!folio_test_uptodate(folio)) {
- /*
- * If the folio hasn't been read in, we won't know if we
- * actually need a reservation - we don't actually need
- * to read here, we just need to check if the folio is
- * fully backed by uncompressed data:
- */
- goto readpage;
- }
-
- goto err;
- }
-
- *foliop = folio;
- return 0;
-err:
- folio_unlock(folio);
- folio_put(folio);
-err_unlock:
- bch2_pagecache_add_put(inode);
- kfree(res);
- *fsdata = NULL;
- return bch2_err_class(ret);
-}
-
-int bch2_write_end(const struct kiocb *iocb, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
-{
- struct bch_inode_info *inode = to_bch_ei(mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch2_folio_reservation *res = fsdata;
- unsigned offset = pos - folio_pos(folio);
-
- lockdep_assert_held(&inode->v.i_rwsem);
- BUG_ON(offset + copied > folio_size(folio));
-
- if (unlikely(copied < len && !folio_test_uptodate(folio))) {
- /*
- * The folio needs to be read in, but that would destroy
- * our partial write - simplest thing is to just force
- * userspace to redo the write:
- */
- folio_zero_range(folio, 0, folio_size(folio));
- flush_dcache_folio(folio);
- copied = 0;
- }
-
- spin_lock(&inode->v.i_lock);
- if (pos + copied > inode->v.i_size)
- i_size_write(&inode->v, pos + copied);
- spin_unlock(&inode->v.i_lock);
-
- if (copied) {
- if (!folio_test_uptodate(folio))
- folio_mark_uptodate(folio);
-
- bch2_set_folio_dirty(c, inode, folio, res, offset, copied);
-
- inode->ei_last_dirtied = (unsigned long) current;
- }
-
- folio_unlock(folio);
- folio_put(folio);
- bch2_pagecache_add_put(inode);
-
- bch2_folio_reservation_put(c, inode, res);
- kfree(res);
-
- return copied;
-}
-
-static noinline void folios_trunc(folios *fs, struct folio **fi)
-{
- while (fs->data + fs->nr > fi) {
- struct folio *f = darray_pop(fs);
-
- folio_unlock(f);
- folio_put(f);
- }
-}
-
-static int __bch2_buffered_write(struct bch_inode_info *inode,
- struct address_space *mapping,
- struct iov_iter *iter,
- loff_t pos, unsigned len)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch2_folio_reservation res;
- folios fs;
- struct folio *f;
- unsigned copied = 0, f_offset, f_copied;
- u64 end = pos + len, f_pos, f_len;
- loff_t last_folio_pos = inode->v.i_size;
- int ret = 0;
-
- BUG_ON(!len);
-
- bch2_folio_reservation_init(c, inode, &res);
- darray_init(&fs);
-
- ret = bch2_filemap_get_contig_folios_d(mapping, pos, end,
- FGP_WRITEBEGIN | fgf_set_order(len),
- mapping_gfp_mask(mapping), &fs);
- if (ret)
- goto out;
-
- BUG_ON(!fs.nr);
-
- f = darray_first(fs);
- if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
- ret = bch2_read_single_folio(f, mapping);
- if (ret)
- goto out;
- }
-
- f = darray_last(fs);
- end = min(end, folio_end_pos(f));
- last_folio_pos = folio_pos(f);
- if (end != folio_end_pos(f) && !folio_test_uptodate(f)) {
- if (end >= inode->v.i_size) {
- folio_zero_range(f, 0, folio_size(f));
- } else {
- ret = bch2_read_single_folio(f, mapping);
- if (ret)
- goto out;
- }
- }
-
- ret = bch2_folio_set(c, inode_inum(inode), fs.data, fs.nr);
- if (ret)
- goto out;
-
- f_pos = pos;
- f_offset = pos - folio_pos(darray_first(fs));
- darray_for_each(fs, fi) {
- ssize_t f_reserved;
-
- f = *fi;
- f_len = min(end, folio_end_pos(f)) - f_pos;
- f_reserved = bch2_folio_reservation_get_partial(c, inode, f, &res, f_offset, f_len);
-
- if (unlikely(f_reserved != f_len)) {
- if (f_reserved < 0) {
- if (f == darray_first(fs)) {
- ret = f_reserved;
- goto out;
- }
-
- folios_trunc(&fs, fi);
- end = min(end, folio_end_pos(darray_last(fs)));
- } else {
- if (!folio_test_uptodate(f)) {
- ret = bch2_read_single_folio(f, mapping);
- if (ret)
- goto out;
- }
-
- folios_trunc(&fs, fi + 1);
- end = f_pos + f_reserved;
- }
-
- break;
- }
-
- f_pos = folio_end_pos(f);
- f_offset = 0;
- }
-
- if (mapping_writably_mapped(mapping))
- darray_for_each(fs, fi)
- flush_dcache_folio(*fi);
-
- f_pos = pos;
- f_offset = pos - folio_pos(darray_first(fs));
- darray_for_each(fs, fi) {
- f = *fi;
- f_len = min(end, folio_end_pos(f)) - f_pos;
- f_copied = copy_folio_from_iter_atomic(f, f_offset, f_len, iter);
- if (!f_copied) {
- folios_trunc(&fs, fi);
- break;
- }
-
- if (!folio_test_uptodate(f) &&
- f_copied != folio_size(f) &&
- pos + copied + f_copied < inode->v.i_size) {
- iov_iter_revert(iter, f_copied);
- folio_zero_range(f, 0, folio_size(f));
- folios_trunc(&fs, fi);
- break;
- }
-
- flush_dcache_folio(f);
- copied += f_copied;
-
- if (f_copied != f_len) {
- folios_trunc(&fs, fi + 1);
- break;
- }
-
- f_pos = folio_end_pos(f);
- f_offset = 0;
- }
-
- if (!copied)
- goto out;
-
- end = pos + copied;
-
- spin_lock(&inode->v.i_lock);
- if (end > inode->v.i_size)
- i_size_write(&inode->v, end);
- spin_unlock(&inode->v.i_lock);
-
- f_pos = pos;
- f_offset = pos - folio_pos(darray_first(fs));
- darray_for_each(fs, fi) {
- f = *fi;
- f_len = min(end, folio_end_pos(f)) - f_pos;
-
- if (!folio_test_uptodate(f))
- folio_mark_uptodate(f);
-
- bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len);
-
- f_pos = folio_end_pos(f);
- f_offset = 0;
- }
-
- inode->ei_last_dirtied = (unsigned long) current;
-out:
- darray_for_each(fs, fi) {
- folio_unlock(*fi);
- folio_put(*fi);
- }
-
- /*
- * If the last folio added to the mapping starts beyond current EOF, we
- * performed a short write but left around at least one post-EOF folio.
- * Clean up the mapping before we return.
- */
- if (last_folio_pos >= inode->v.i_size)
- truncate_pagecache(&inode->v, inode->v.i_size);
-
- darray_exit(&fs);
- bch2_folio_reservation_put(c, inode, &res);
-
- return copied ?: ret;
-}
-
-static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct bch_inode_info *inode = file_bch_inode(file);
- loff_t pos = iocb->ki_pos;
- ssize_t written = 0;
- int ret = 0;
-
- bch2_pagecache_add_get(inode);
-
- do {
- unsigned offset = pos & (PAGE_SIZE - 1);
- unsigned bytes = iov_iter_count(iter);
-again:
- /*
- * Bring in the user page that we will copy from _first_.
- * Otherwise there's a nasty deadlock on copying from the
- * same page as we're writing to, without it being marked
- * up-to-date.
- *
- * Not only is this an optimisation, but it is also required
- * to check that the address is actually valid, when atomic
- * usercopies are used, below.
- */
- if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
- bytes = min_t(unsigned long, iov_iter_count(iter),
- PAGE_SIZE - offset);
-
- if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
- ret = -EFAULT;
- break;
- }
- }
-
- if (unlikely(fatal_signal_pending(current))) {
- ret = -EINTR;
- break;
- }
-
- ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
- if (unlikely(ret < 0))
- break;
-
- cond_resched();
-
- if (unlikely(ret == 0)) {
- /*
- * If we were unable to copy any data at all, we must
- * fall back to a single segment length write.
- *
- * If we didn't fallback here, we could livelock
- * because not all segments in the iov can be copied at
- * once without a pagefault.
- */
- bytes = min_t(unsigned long, PAGE_SIZE - offset,
- iov_iter_single_seg_count(iter));
- goto again;
- }
- pos += ret;
- written += ret;
- ret = 0;
-
- balance_dirty_pages_ratelimited(mapping);
- } while (iov_iter_count(iter));
-
- bch2_pagecache_add_put(inode);
-
- return written ? written : ret;
-}
-
-ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
- struct file *file = iocb->ki_filp;
- struct bch_inode_info *inode = file_bch_inode(file);
- ssize_t ret;
-
- if (iocb->ki_flags & IOCB_DIRECT) {
- ret = bch2_direct_write(iocb, from);
- goto out;
- }
-
- inode_lock(&inode->v);
-
- ret = generic_write_checks(iocb, from);
- if (ret <= 0)
- goto unlock;
-
- ret = file_remove_privs(file);
- if (ret)
- goto unlock;
-
- ret = file_update_time(file);
- if (ret)
- goto unlock;
-
- ret = bch2_buffered_write(iocb, from);
- if (likely(ret > 0))
- iocb->ki_pos += ret;
-unlock:
- inode_unlock(&inode->v);
-
- if (ret > 0)
- ret = generic_write_sync(iocb, ret);
-out:
- return bch2_err_class(ret);
-}
-
-void bch2_fs_fs_io_buffered_exit(struct bch_fs *c)
-{
- bioset_exit(&c->writepage_bioset);
-}
-
-int bch2_fs_fs_io_buffered_init(struct bch_fs *c)
-{
- if (bioset_init(&c->writepage_bioset,
- 4, offsetof(struct bch_writepage_io, op.wbio.bio),
- BIOSET_NEED_BVECS))
- return -BCH_ERR_ENOMEM_writepage_bioset_init;
-
- return 0;
-}
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h
deleted file mode 100644
index 14de91c27656..000000000000
--- a/fs/bcachefs/fs-io-buffered.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_IO_BUFFERED_H
-#define _BCACHEFS_FS_IO_BUFFERED_H
-
-#ifndef NO_BCACHEFS_FS
-
-int bch2_read_single_folio(struct folio *, struct address_space *);
-int bch2_read_folio(struct file *, struct folio *);
-
-int bch2_writepages(struct address_space *, struct writeback_control *);
-void bch2_readahead(struct readahead_control *);
-
-int bch2_write_begin(const struct kiocb *, struct address_space *, loff_t pos,
- unsigned len, struct folio **, void **);
-int bch2_write_end(const struct kiocb *, struct address_space *, loff_t,
- unsigned len, unsigned copied, struct folio *, void *);
-
-ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
-
-void bch2_fs_fs_io_buffered_exit(struct bch_fs *);
-int bch2_fs_fs_io_buffered_init(struct bch_fs *);
-#else
-static inline void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) {}
-static inline int bch2_fs_fs_io_buffered_init(struct bch_fs *c) { return 0; }
-#endif
-
-#endif /* _BCACHEFS_FS_IO_BUFFERED_H */
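For reference, the buffered-I/O hooks declared above reach the VFS through an address_space_operations table defined in fs.c; a sketch of the relevant wiring (field names follow the standard VFS address_space_operations, other fields are omitted, and the exact table in fs.c may differ):

static const struct address_space_operations bch_address_space_operations = {
	.read_folio	= bch2_read_folio,
	.readahead	= bch2_readahead,
	.writepages	= bch2_writepages,
	.write_begin	= bch2_write_begin,
	.write_end	= bch2_write_end,
	/* dirty_folio, invalidate_folio, etc. omitted */
};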
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
deleted file mode 100644
index 1f5154d9676b..000000000000
--- a/fs/bcachefs/fs-io-direct.c
+++ /dev/null
@@ -1,704 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "enumerated_ref.h"
-#include "fs.h"
-#include "fs-io.h"
-#include "fs-io-direct.h"
-#include "fs-io-pagecache.h"
-#include "io_read.h"
-#include "io_write.h"
-
-#include <linux/kthread.h>
-#include <linux/pagemap.h>
-#include <linux/prefetch.h>
-#include <linux/task_io_accounting_ops.h>
-
-/* O_DIRECT reads */
-
-struct dio_read {
- struct closure cl;
- struct kiocb *req;
- long ret;
- bool should_dirty;
- struct bch_read_bio rbio;
-};
-
-static void bio_check_or_release(struct bio *bio, bool check_dirty)
-{
- if (check_dirty) {
- bio_check_pages_dirty(bio);
- } else {
- bio_release_pages(bio, false);
- bio_put(bio);
- }
-}
-
-static CLOSURE_CALLBACK(bch2_dio_read_complete)
-{
- closure_type(dio, struct dio_read, cl);
-
- dio->req->ki_complete(dio->req, dio->ret);
- bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
-}
-
-static void bch2_direct_IO_read_endio(struct bio *bio)
-{
- struct dio_read *dio = bio->bi_private;
-
- if (bio->bi_status)
- dio->ret = blk_status_to_errno(bio->bi_status);
-
- closure_put(&dio->cl);
-}
-
-static void bch2_direct_IO_read_split_endio(struct bio *bio)
-{
- struct dio_read *dio = bio->bi_private;
- bool should_dirty = dio->should_dirty;
-
- bch2_direct_IO_read_endio(bio);
- bio_check_or_release(bio, should_dirty);
-}
-
-static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
-{
- struct file *file = req->ki_filp;
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_io_opts opts;
- struct dio_read *dio;
- struct bio *bio;
- struct blk_plug plug;
- loff_t offset = req->ki_pos;
- bool sync = is_sync_kiocb(req);
- bool split = false;
- size_t shorten;
- ssize_t ret;
-
- bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
- /* bios must be 512 byte aligned: */
- if ((offset|iter->count) & (SECTOR_SIZE - 1))
- return -EINVAL;
-
- ret = min_t(loff_t, iter->count,
- max_t(loff_t, 0, i_size_read(&inode->v) - offset));
-
- if (!ret)
- return ret;
-
- shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
- if (shorten >= iter->count)
- shorten = 0;
- iter->count -= shorten;
-
- bio = bio_alloc_bioset(NULL,
- bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
- REQ_OP_READ,
- GFP_KERNEL,
- &c->dio_read_bioset);
-
- dio = container_of(bio, struct dio_read, rbio.bio);
- closure_init(&dio->cl, NULL);
-
- /*
- * this is a _really_ horrible hack just to avoid an atomic sub at the
- * end:
- */
- if (!sync) {
- set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL);
- atomic_set(&dio->cl.remaining,
- CLOSURE_REMAINING_INITIALIZER -
- CLOSURE_RUNNING +
- CLOSURE_DESTRUCTOR);
- } else {
- atomic_set(&dio->cl.remaining,
- CLOSURE_REMAINING_INITIALIZER + 1);
- dio->cl.closure_get_happened = true;
- }
-
- dio->req = req;
- dio->ret = ret;
- /*
- * This is one of the sketchier things I've encountered: we have to skip
- * the dirtying of requests that are internal from the kernel (i.e. from
- * loopback), because we'll deadlock on page_lock.
- */
- dio->should_dirty = iter_is_iovec(iter);
-
- blk_start_plug(&plug);
-
- goto start;
- while (iter->count) {
- split = true;
-
- bio = bio_alloc_bioset(NULL,
- bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
- REQ_OP_READ,
- GFP_KERNEL,
- &c->bio_read);
-start:
- bio->bi_opf = REQ_OP_READ|REQ_SYNC;
- bio->bi_iter.bi_sector = offset >> 9;
- bio->bi_private = dio;
-
- ret = bio_iov_iter_get_pages(bio, iter);
- if (ret < 0) {
- /* XXX: fault inject this path */
- bio->bi_status = BLK_STS_RESOURCE;
- bio_endio(bio);
- break;
- }
-
- offset += bio->bi_iter.bi_size;
-
- if (dio->should_dirty)
- bio_set_pages_dirty(bio);
-
- if (iter->count)
- closure_get(&dio->cl);
-
- struct bch_read_bio *rbio =
- rbio_init(bio,
- c,
- opts,
- split
- ? bch2_direct_IO_read_split_endio
- : bch2_direct_IO_read_endio);
-
- bch2_read(c, rbio, inode_inum(inode));
- }
-
- blk_finish_plug(&plug);
-
- iter->count += shorten;
-
- if (sync) {
- closure_sync(&dio->cl);
- closure_debug_destroy(&dio->cl);
- ret = dio->ret;
- bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
- return ret;
- } else {
- return -EIOCBQUEUED;
- }
-}
-
-ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct file *file = iocb->ki_filp;
- struct bch_inode_info *inode = file_bch_inode(file);
- struct address_space *mapping = file->f_mapping;
- size_t count = iov_iter_count(iter);
- ssize_t ret = 0;
-
- if (!count)
- return 0; /* skip atime */
-
- if (iocb->ki_flags & IOCB_DIRECT) {
- struct blk_plug plug;
-
- if (unlikely(mapping->nrpages)) {
- ret = filemap_write_and_wait_range(mapping,
- iocb->ki_pos,
- iocb->ki_pos + count - 1);
- if (ret < 0)
- goto out;
- }
-
- file_accessed(file);
-
- blk_start_plug(&plug);
- ret = bch2_direct_IO_read(iocb, iter);
- blk_finish_plug(&plug);
-
- if (ret >= 0)
- iocb->ki_pos += ret;
- } else {
- bch2_pagecache_add_get(inode);
- ret = filemap_read(iocb, iter, ret);
- bch2_pagecache_add_put(inode);
- }
-out:
- return bch2_err_class(ret);
-}
-
-/* O_DIRECT writes */
-
-struct dio_write {
- struct kiocb *req;
- struct address_space *mapping;
- struct bch_inode_info *inode;
- struct mm_struct *mm;
- const struct iovec *iov;
- unsigned loop:1,
- extending:1,
- sync:1,
- flush:1;
- struct quota_res quota_res;
- u64 written;
-
- struct iov_iter iter;
- struct iovec inline_vecs[2];
-
- /* must be last: */
- struct bch_write_op op;
-};
-
-static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
- u64 offset, u64 size,
- unsigned nr_replicas, bool compressed)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- u64 end = offset + size;
- u32 snapshot;
- bool ret = true;
- int err;
-retry:
- bch2_trans_begin(trans);
-
- err = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (err)
- goto err;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
- SPOS(inum.inum, offset, snapshot),
- BTREE_ITER_slots, k, err) {
- if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end)))
- break;
-
- if (k.k->p.snapshot != snapshot ||
- nr_replicas > bch2_bkey_replicas(c, k) ||
- (!compressed && bch2_bkey_sectors_compressed(k))) {
- ret = false;
- break;
- }
- }
-
- offset = iter.pos.offset;
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(err, BCH_ERR_transaction_restart))
- goto retry;
- bch2_trans_put(trans);
-
- return err ? false : ret;
-}
-
-static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio)
-{
- struct bch_fs *c = dio->op.c;
- struct bch_inode_info *inode = dio->inode;
- struct bio *bio = &dio->op.wbio.bio;
-
- return bch2_check_range_allocated(c, inode_inum(inode),
- dio->op.pos.offset, bio_sectors(bio),
- dio->op.opts.data_replicas,
- dio->op.opts.compression != 0);
-}
-
-static void bch2_dio_write_loop_async(struct bch_write_op *);
-static __always_inline long bch2_dio_write_done(struct dio_write *dio);
-
-/*
- * We're going to return -EIOCBQUEUED, but we haven't finished consuming the
- * iov_iter yet, so we need to stash a copy of the iovec: it might be on the
- * caller's stack, we're not guaranteed that it will live for the duration of
- * the IO:
- */
-static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
-{
- struct iovec *iov = dio->inline_vecs;
-
- /*
- * iov_iter has a single embedded iovec - nothing to do:
- */
- if (iter_is_ubuf(&dio->iter))
- return 0;
-
- /*
- * We don't currently handle non-iovec iov_iters here - return an error,
- * and we'll fall back to doing the IO synchronously:
- */
- if (!iter_is_iovec(&dio->iter))
- return -1;
-
- if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
- dio->iov = iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov),
- GFP_KERNEL);
- if (unlikely(!iov))
- return -ENOMEM;
- }
-
- memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov));
- dio->iter.__iov = iov;
- return 0;
-}
-
-static CLOSURE_CALLBACK(bch2_dio_write_flush_done)
-{
- closure_type(dio, struct dio_write, op.cl);
- struct bch_fs *c = dio->op.c;
-
- closure_debug_destroy(cl);
-
- dio->op.error = bch2_journal_error(&c->journal);
-
- bch2_dio_write_done(dio);
-}
-
-static noinline void bch2_dio_write_flush(struct dio_write *dio)
-{
- struct bch_fs *c = dio->op.c;
- struct bch_inode_unpacked inode;
- int ret;
-
- dio->flush = 0;
-
- closure_init(&dio->op.cl, NULL);
-
- if (!dio->op.error) {
- ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode);
- if (ret) {
- dio->op.error = ret;
- } else {
- bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq,
- &dio->op.cl);
- bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl);
- }
- }
-
- if (dio->sync) {
- closure_sync(&dio->op.cl);
- closure_debug_destroy(&dio->op.cl);
- } else {
- continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL);
- }
-}
-
-static __always_inline long bch2_dio_write_done(struct dio_write *dio)
-{
- struct bch_fs *c = dio->op.c;
- struct kiocb *req = dio->req;
- struct bch_inode_info *inode = dio->inode;
- bool sync = dio->sync;
- long ret;
-
- if (unlikely(dio->flush)) {
- bch2_dio_write_flush(dio);
- if (!sync)
- return -EIOCBQUEUED;
- }
-
- bch2_pagecache_block_put(inode);
-
- kfree(dio->iov);
-
- ret = dio->op.error ?: ((long) dio->written << 9);
- bio_put(&dio->op.wbio.bio);
-
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write);
-
- /* inode->i_dio_count is our ref on inode and thus bch_fs */
- inode_dio_end(&inode->v);
-
- if (ret < 0)
- ret = bch2_err_class(ret);
-
- if (!sync) {
- req->ki_complete(req, ret);
- ret = -EIOCBQUEUED;
- }
- return ret;
-}
-
-static __always_inline void bch2_dio_write_end(struct dio_write *dio)
-{
- struct bch_fs *c = dio->op.c;
- struct kiocb *req = dio->req;
- struct bch_inode_info *inode = dio->inode;
- struct bio *bio = &dio->op.wbio.bio;
-
- req->ki_pos += (u64) dio->op.written << 9;
- dio->written += dio->op.written;
-
- if (dio->extending) {
- spin_lock(&inode->v.i_lock);
- if (req->ki_pos > inode->v.i_size)
- i_size_write(&inode->v, req->ki_pos);
- spin_unlock(&inode->v.i_lock);
- }
-
- if (dio->op.i_sectors_delta || dio->quota_res.sectors) {
- mutex_lock(&inode->ei_quota_lock);
- __bch2_i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta);
- __bch2_quota_reservation_put(c, inode, &dio->quota_res);
- mutex_unlock(&inode->ei_quota_lock);
- }
-
- bio_release_pages(bio, false);
-
- if (unlikely(dio->op.error))
- set_bit(EI_INODE_ERROR, &inode->ei_flags);
-}
-
-static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
-{
- struct bch_fs *c = dio->op.c;
- struct kiocb *req = dio->req;
- struct address_space *mapping = dio->mapping;
- struct bch_inode_info *inode = dio->inode;
- struct bch_io_opts opts;
- struct bio *bio = &dio->op.wbio.bio;
- unsigned unaligned, iter_count;
- bool sync = dio->sync, dropped_locks;
- long ret;
-
- bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
- while (1) {
- iter_count = dio->iter.count;
-
- EBUG_ON(current->faults_disabled_mapping);
- current->faults_disabled_mapping = mapping;
-
- ret = bio_iov_iter_get_pages(bio, &dio->iter);
-
- dropped_locks = fdm_dropped_locks();
-
- current->faults_disabled_mapping = NULL;
-
- /*
- * If the fault handler returned an error but also signalled
- * that it dropped & retook ei_pagecache_lock, we just need to
- * re-shoot down the page cache and retry:
- */
- if (dropped_locks && ret)
- ret = 0;
-
- if (unlikely(ret < 0))
- goto err;
-
- if (unlikely(dropped_locks)) {
- ret = bch2_write_invalidate_inode_pages_range(mapping,
- req->ki_pos,
- req->ki_pos + iter_count - 1);
- if (unlikely(ret))
- goto err;
-
- if (!bio->bi_iter.bi_size)
- continue;
- }
-
- unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
- bio->bi_iter.bi_size -= unaligned;
- iov_iter_revert(&dio->iter, unaligned);
-
- if (!bio->bi_iter.bi_size) {
- /*
- * bio_iov_iter_get_pages was only able to get <
- * blocksize worth of pages:
- */
- ret = -EFAULT;
- goto err;
- }
-
- bch2_write_op_init(&dio->op, c, opts);
- dio->op.end_io = sync
- ? NULL
- : bch2_dio_write_loop_async;
- dio->op.target = dio->op.opts.foreground_target;
- dio->op.write_point = writepoint_hashed((unsigned long) current);
- dio->op.nr_replicas = dio->op.opts.data_replicas;
- dio->op.subvol = inode->ei_inum.subvol;
- dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
- dio->op.devs_need_flush = &inode->ei_devs_need_flush;
-
- if (sync)
- dio->op.flags |= BCH_WRITE_sync;
- dio->op.flags |= BCH_WRITE_check_enospc;
-
- ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
- bio_sectors(bio), true);
- if (unlikely(ret))
- goto err;
-
- ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
- dio->op.opts.data_replicas, 0);
- if (unlikely(ret) &&
- !bch2_dio_write_check_allocated(dio))
- goto err;
-
- task_io_account_write(bio->bi_iter.bi_size);
-
- if (unlikely(dio->iter.count) &&
- !dio->sync &&
- !dio->loop &&
- bch2_dio_write_copy_iov(dio))
- dio->sync = sync = true;
-
- dio->loop = true;
- closure_call(&dio->op.cl, bch2_write, NULL, NULL);
-
- if (!sync)
- return -EIOCBQUEUED;
-
- bch2_dio_write_end(dio);
-
- if (likely(!dio->iter.count) || dio->op.error)
- break;
-
- bio_reset(bio, NULL, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
- }
-out:
- return bch2_dio_write_done(dio);
-err:
- dio->op.error = ret;
-
- bio_release_pages(bio, false);
-
- bch2_quota_reservation_put(c, inode, &dio->quota_res);
- goto out;
-}
-
-static noinline __cold void bch2_dio_write_continue(struct dio_write *dio)
-{
- struct mm_struct *mm = dio->mm;
-
- bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE);
-
- if (mm)
- kthread_use_mm(mm);
- bch2_dio_write_loop(dio);
- if (mm)
- kthread_unuse_mm(mm);
-}
-
-static void bch2_dio_write_loop_async(struct bch_write_op *op)
-{
- struct dio_write *dio = container_of(op, struct dio_write, op);
-
- bch2_dio_write_end(dio);
-
- if (likely(!dio->iter.count) || dio->op.error)
- bch2_dio_write_done(dio);
- else
- bch2_dio_write_continue(dio);
-}
-
-ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
-{
- struct file *file = req->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct dio_write *dio;
- struct bio *bio;
- bool locked = true, extending;
- ssize_t ret;
-
- prefetch(&c->opts);
- prefetch((void *) &c->opts + 64);
- prefetch(&inode->ei_inode);
- prefetch((void *) &inode->ei_inode + 64);
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_dio_write))
- return -EROFS;
-
- inode_lock(&inode->v);
-
- ret = generic_write_checks(req, iter);
- if (unlikely(ret <= 0))
- goto err_put_write_ref;
-
- ret = file_remove_privs(file);
- if (unlikely(ret))
- goto err_put_write_ref;
-
- ret = file_update_time(file);
- if (unlikely(ret))
- goto err_put_write_ref;
-
- if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) {
- ret = -EINVAL;
- goto err_put_write_ref;
- }
-
- inode_dio_begin(&inode->v);
- bch2_pagecache_block_get(inode);
-
- extending = req->ki_pos + iter->count > inode->v.i_size;
- if (!extending) {
- inode_unlock(&inode->v);
- locked = false;
- }
-
- bio = bio_alloc_bioset(NULL,
- bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
- REQ_OP_WRITE | REQ_SYNC | REQ_IDLE,
- GFP_KERNEL,
- &c->dio_write_bioset);
- dio = container_of(bio, struct dio_write, op.wbio.bio);
- dio->req = req;
- dio->mapping = mapping;
- dio->inode = inode;
- dio->mm = current->mm;
- dio->iov = NULL;
- dio->loop = false;
- dio->extending = extending;
- dio->sync = is_sync_kiocb(req) || extending;
- dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled;
- dio->quota_res.sectors = 0;
- dio->written = 0;
- dio->iter = *iter;
- dio->op.c = c;
-
- if (unlikely(mapping->nrpages)) {
- ret = bch2_write_invalidate_inode_pages_range(mapping,
- req->ki_pos,
- req->ki_pos + iter->count - 1);
- if (unlikely(ret))
- goto err_put_bio;
- }
-
- ret = bch2_dio_write_loop(dio);
-out:
- if (locked)
- inode_unlock(&inode->v);
- return ret;
-err_put_bio:
- bch2_pagecache_block_put(inode);
- bio_put(bio);
- inode_dio_end(&inode->v);
-err_put_write_ref:
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write);
- goto out;
-}
-
-void bch2_fs_fs_io_direct_exit(struct bch_fs *c)
-{
- bioset_exit(&c->dio_write_bioset);
- bioset_exit(&c->dio_read_bioset);
-}
-
-int bch2_fs_fs_io_direct_init(struct bch_fs *c)
-{
- if (bioset_init(&c->dio_read_bioset,
- 4, offsetof(struct dio_read, rbio.bio),
- BIOSET_NEED_BVECS))
- return -BCH_ERR_ENOMEM_dio_read_bioset_init;
-
- if (bioset_init(&c->dio_write_bioset,
- 4, offsetof(struct dio_write, op.wbio.bio),
- BIOSET_NEED_BVECS))
- return -BCH_ERR_ENOMEM_dio_write_bioset_init;
-
- return 0;
-}
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io-direct.h b/fs/bcachefs/fs-io-direct.h
deleted file mode 100644
index 814621ec7f81..000000000000
--- a/fs/bcachefs/fs-io-direct.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_IO_DIRECT_H
-#define _BCACHEFS_FS_IO_DIRECT_H
-
-#ifndef NO_BCACHEFS_FS
-ssize_t bch2_direct_write(struct kiocb *, struct iov_iter *);
-ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *);
-
-void bch2_fs_fs_io_direct_exit(struct bch_fs *);
-int bch2_fs_fs_io_direct_init(struct bch_fs *);
-#else
-static inline void bch2_fs_fs_io_direct_exit(struct bch_fs *c) {}
-static inline int bch2_fs_fs_io_direct_init(struct bch_fs *c) { return 0; }
-#endif
-
-#endif /* _BCACHEFS_FS_IO_DIRECT_H */
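For reference, these entry points (together with bch2_write_iter from fs-io-buffered.h) are reached via the file_operations table in fs.c; a sketch with standard VFS field names (other fields omitted, and the exact table in fs.c may differ):

static const struct file_operations bch_file_operations = {
	.read_iter	= bch2_read_iter,
	.write_iter	= bch2_write_iter,
	/* llseek, mmap, fsync, etc. omitted */
};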
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
deleted file mode 100644
index c2cc405822f2..000000000000
--- a/fs/bcachefs/fs-io-pagecache.c
+++ /dev/null
@@ -1,827 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "btree_iter.h"
-#include "extents.h"
-#include "fs-io.h"
-#include "fs-io-pagecache.h"
-#include "subvolume.h"
-
-#include <linux/pagevec.h>
-#include <linux/writeback.h>
-
-int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
- loff_t start, u64 end,
- fgf_t fgp_flags, gfp_t gfp,
- folios *fs)
-{
- struct folio *f;
- u64 pos = start;
- int ret = 0;
-
- while (pos < end) {
- if ((u64) pos >= (u64) start + (1ULL << 20))
- fgp_flags &= ~FGP_CREAT;
-
- ret = darray_make_room_gfp(fs, 1, gfp & GFP_KERNEL);
- if (ret)
- break;
-
- f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp);
- if (IS_ERR(f))
- break;
-
- BUG_ON(fs->nr && folio_pos(f) != pos);
-
- pos = folio_end_pos(f);
- darray_push(fs, f);
- }
-
- if (!fs->nr && !ret && (fgp_flags & FGP_CREAT))
- ret = -ENOMEM;
-
- return fs->nr ? 0 : ret;
-}
-
-/* pagecache_block must be held */
-int bch2_write_invalidate_inode_pages_range(struct address_space *mapping,
- loff_t start, loff_t end)
-{
- int ret;
-
- /*
- * XXX: the way this is currently implemented, we can spin if a process
- * is continually redirtying a specific page
- */
- do {
- if (!mapping->nrpages)
- return 0;
-
- ret = filemap_write_and_wait_range(mapping, start, end);
- if (ret)
- break;
-
- if (!mapping->nrpages)
- return 0;
-
- ret = invalidate_inode_pages2_range(mapping,
- start >> PAGE_SHIFT,
- end >> PAGE_SHIFT);
- } while (ret == -EBUSY);
-
- return ret;
-}
-
-#if 0
-/* Useful for debug tracing: */
-static const char * const bch2_folio_sector_states[] = {
-#define x(n) #n,
- BCH_FOLIO_SECTOR_STATE()
-#undef x
- NULL
-};
-#endif
-
-static inline enum bch_folio_sector_state
-folio_sector_dirty(enum bch_folio_sector_state state)
-{
- switch (state) {
- case SECTOR_unallocated:
- return SECTOR_dirty;
- case SECTOR_reserved:
- return SECTOR_dirty_reserved;
- default:
- return state;
- }
-}
-
-static inline enum bch_folio_sector_state
-folio_sector_undirty(enum bch_folio_sector_state state)
-{
- switch (state) {
- case SECTOR_dirty:
- return SECTOR_unallocated;
- case SECTOR_dirty_reserved:
- return SECTOR_reserved;
- default:
- return state;
- }
-}
-
-static inline enum bch_folio_sector_state
-folio_sector_reserve(enum bch_folio_sector_state state)
-{
- switch (state) {
- case SECTOR_unallocated:
- return SECTOR_reserved;
- case SECTOR_dirty:
- return SECTOR_dirty_reserved;
- default:
- return state;
- }
-}
-
-/* for newly allocated folios: */
-struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp)
-{
- struct bch_folio *s;
-
- s = kzalloc(sizeof(*s) +
- sizeof(struct bch_folio_sector) *
- folio_sectors(folio), gfp);
- if (!s)
- return NULL;
-
- spin_lock_init(&s->lock);
- folio_attach_private(folio, s);
- return s;
-}
-
-struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp)
-{
- return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp);
-}
-
-static unsigned bkey_to_sector_state(struct bkey_s_c k)
-{
- if (bkey_extent_is_reservation(k))
- return SECTOR_reserved;
- if (bkey_extent_is_allocation(k.k))
- return SECTOR_allocated;
- return SECTOR_unallocated;
-}
-
-static void __bch2_folio_set(struct folio *folio,
- unsigned pg_offset, unsigned pg_len,
- unsigned nr_ptrs, unsigned state)
-{
- struct bch_folio *s = bch2_folio(folio);
- unsigned i, sectors = folio_sectors(folio);
-
- BUG_ON(pg_offset >= sectors);
- BUG_ON(pg_offset + pg_len > sectors);
-
- spin_lock(&s->lock);
-
- for (i = pg_offset; i < pg_offset + pg_len; i++) {
- s->s[i].nr_replicas = nr_ptrs;
- bch2_folio_sector_set(folio, s, i, state);
- }
-
- if (i == sectors)
- s->uptodate = true;
-
- spin_unlock(&s->lock);
-}
-
-/*
- * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the
- * extents btree:
- */
-int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
- struct folio **fs, unsigned nr_folios)
-{
- u64 offset = folio_sector(fs[0]);
- bool need_set = false;
-
- for (unsigned folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
- struct bch_folio *s = bch2_folio_create(fs[folio_idx], GFP_KERNEL);
- if (!s)
- return -ENOMEM;
-
- need_set |= !s->uptodate;
- }
-
- if (!need_set)
- return 0;
-
- unsigned folio_idx = 0;
-
- return bch2_trans_run(c,
- for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents,
- POS(inum.inum, offset),
- POS(inum.inum, U64_MAX),
- inum.subvol, BTREE_ITER_slots, k, ({
- unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
- unsigned state = bkey_to_sector_state(k);
-
- while (folio_idx < nr_folios) {
- struct folio *folio = fs[folio_idx];
- u64 folio_start = folio_sector(folio);
- u64 folio_end = folio_end_sector(folio);
- unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) -
- folio_start;
- unsigned folio_len = min(k.k->p.offset, folio_end) -
- folio_offset - folio_start;
-
- BUG_ON(k.k->p.offset < folio_start);
- BUG_ON(bkey_start_offset(k.k) > folio_end);
-
- if (!bch2_folio(folio)->uptodate)
- __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
-
- if (k.k->p.offset < folio_end)
- break;
- folio_idx++;
- }
-
- if (folio_idx == nr_folios)
- break;
- 0;
- })));
-}
-
-void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
-{
- struct bvec_iter iter;
- struct folio_vec fv;
- unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
- ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
- unsigned state = bkey_to_sector_state(k);
-
- bio_for_each_folio(fv, bio, iter)
- __bch2_folio_set(fv.fv_folio,
- fv.fv_offset >> 9,
- fv.fv_len >> 9,
- nr_ptrs, state);
-}
-
-void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode,
- u64 start, u64 end)
-{
- pgoff_t index = start >> PAGE_SECTORS_SHIFT;
- pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
- struct folio_batch fbatch;
- unsigned i, j;
-
- if (end <= start)
- return;
-
- folio_batch_init(&fbatch);
-
- while (filemap_get_folios(inode->v.i_mapping,
- &index, end_index, &fbatch)) {
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
- struct folio *folio = fbatch.folios[i];
- u64 folio_start = folio_sector(folio);
- u64 folio_end = folio_end_sector(folio);
- unsigned folio_offset = max(start, folio_start) - folio_start;
- unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
- struct bch_folio *s;
-
- BUG_ON(end <= folio_start);
-
- folio_lock(folio);
- s = bch2_folio(folio);
-
- if (s) {
- spin_lock(&s->lock);
- for (j = folio_offset; j < folio_offset + folio_len; j++)
- s->s[j].nr_replicas = 0;
- spin_unlock(&s->lock);
- }
-
- folio_unlock(folio);
- }
- folio_batch_release(&fbatch);
- cond_resched();
- }
-}
-
-int bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
- u64 *start, u64 end,
- bool nonblocking)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- pgoff_t index = *start >> PAGE_SECTORS_SHIFT;
- pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
- struct folio_batch fbatch;
- s64 i_sectors_delta = 0;
- int ret = 0;
-
- if (end <= *start)
- return 0;
-
- folio_batch_init(&fbatch);
-
- while (filemap_get_folios(inode->v.i_mapping,
- &index, end_index, &fbatch)) {
- for (unsigned i = 0; i < folio_batch_count(&fbatch); i++) {
- struct folio *folio = fbatch.folios[i];
-
- if (!nonblocking)
- folio_lock(folio);
- else if (!folio_trylock(folio)) {
- folio_batch_release(&fbatch);
- ret = -EAGAIN;
- break;
- }
-
- u64 folio_start = folio_sector(folio);
- u64 folio_end = folio_end_sector(folio);
-
- BUG_ON(end <= folio_start);
-
- *start = min(end, folio_end);
-
- struct bch_folio *s = bch2_folio(folio);
- if (s) {
- unsigned folio_offset = max(*start, folio_start) - folio_start;
- unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
-
- spin_lock(&s->lock);
- for (unsigned j = folio_offset; j < folio_offset + folio_len; j++) {
- i_sectors_delta -= s->s[j].state == SECTOR_dirty;
- bch2_folio_sector_set(folio, s, j,
- folio_sector_reserve(s->s[j].state));
- }
- spin_unlock(&s->lock);
- }
-
- folio_unlock(folio);
- }
- folio_batch_release(&fbatch);
- cond_resched();
- }
-
- bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
- return ret;
-}
-
-static inline unsigned sectors_to_reserve(struct bch_folio_sector *s,
- unsigned nr_replicas)
-{
- return max(0, (int) nr_replicas -
- s->nr_replicas -
- s->replicas_reserved);
-}
-
-int bch2_get_folio_disk_reservation(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct folio *folio, bool check_enospc)
-{
- struct bch_folio *s = bch2_folio_create(folio, 0);
- unsigned nr_replicas = inode_nr_replicas(c, inode);
- struct disk_reservation disk_res = { 0 };
- unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0;
- int ret;
-
- if (!s)
- return -ENOMEM;
-
- for (i = 0; i < sectors; i++)
- disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);
-
- if (!disk_res_sectors)
- return 0;
-
- ret = bch2_disk_reservation_get(c, &disk_res,
- disk_res_sectors, 1,
- !check_enospc
- ? BCH_DISK_RESERVATION_NOFAIL
- : 0);
- if (unlikely(ret))
- return ret;
-
- for (i = 0; i < sectors; i++)
- s->s[i].replicas_reserved +=
- sectors_to_reserve(&s->s[i], nr_replicas);
-
- return 0;
-}
-
-void bch2_folio_reservation_put(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct bch2_folio_reservation *res)
-{
- bch2_disk_reservation_put(c, &res->disk);
- bch2_quota_reservation_put(c, inode, &res->quota);
-}
-
-static int __bch2_folio_reservation_get(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct folio *folio,
- struct bch2_folio_reservation *res,
- size_t offset, size_t len,
- bool partial)
-{
- struct bch_folio *s = bch2_folio_create(folio, 0);
- unsigned i, disk_sectors = 0, quota_sectors = 0;
- struct disk_reservation disk_res = {};
- size_t reserved = len;
- int ret;
-
- if (!s)
- return -ENOMEM;
-
- BUG_ON(!s->uptodate);
-
- for (i = round_down(offset, block_bytes(c)) >> 9;
- i < round_up(offset + len, block_bytes(c)) >> 9;
- i++) {
- disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas);
- quota_sectors += s->s[i].state == SECTOR_unallocated;
- }
-
- if (disk_sectors) {
- ret = bch2_disk_reservation_add(c, &disk_res, disk_sectors,
- partial ? BCH_DISK_RESERVATION_PARTIAL : 0);
- if (unlikely(ret))
- return ret;
-
- if (unlikely(disk_res.sectors != disk_sectors)) {
- disk_sectors = quota_sectors = 0;
-
- for (i = round_down(offset, block_bytes(c)) >> 9;
- i < round_up(offset + len, block_bytes(c)) >> 9;
- i++) {
- disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas);
- if (disk_sectors > disk_res.sectors) {
- /*
- * Make sure to get a reservation that's
- * aligned to the filesystem blocksize:
- */
- unsigned reserved_offset = round_down(i << 9, block_bytes(c));
- reserved = clamp(reserved_offset, offset, offset + len) - offset;
-
- if (!reserved) {
- bch2_disk_reservation_put(c, &disk_res);
- return bch_err_throw(c, ENOSPC_disk_reservation);
- }
- break;
- }
- quota_sectors += s->s[i].state == SECTOR_unallocated;
- }
- }
- }
-
- if (quota_sectors) {
- ret = bch2_quota_reservation_add(c, inode, &res->quota, quota_sectors, true);
- if (unlikely(ret)) {
- bch2_disk_reservation_put(c, &disk_res);
- return ret;
- }
- }
-
- res->disk.sectors += disk_res.sectors;
- return partial ? reserved : 0;
-}
-
-int bch2_folio_reservation_get(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct folio *folio,
- struct bch2_folio_reservation *res,
- size_t offset, size_t len)
-{
- return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, false);
-}
-
-ssize_t bch2_folio_reservation_get_partial(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct folio *folio,
- struct bch2_folio_reservation *res,
- size_t offset, size_t len)
-{
- return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, true);
-}
-
-static void bch2_clear_folio_bits(struct folio *folio)
-{
- struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_folio *s = bch2_folio(folio);
- struct disk_reservation disk_res = { 0 };
- int i, sectors = folio_sectors(folio), dirty_sectors = 0;
-
- if (!s)
- return;
-
- EBUG_ON(!folio_test_locked(folio));
- EBUG_ON(folio_test_writeback(folio));
-
- for (i = 0; i < sectors; i++) {
- disk_res.sectors += s->s[i].replicas_reserved;
- s->s[i].replicas_reserved = 0;
-
- dirty_sectors -= s->s[i].state == SECTOR_dirty;
- bch2_folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state));
- }
-
- bch2_disk_reservation_put(c, &disk_res);
-
- bch2_i_sectors_acct(c, inode, NULL, dirty_sectors);
-
- bch2_folio_release(folio);
-}
-
-void bch2_set_folio_dirty(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct folio *folio,
- struct bch2_folio_reservation *res,
- unsigned offset, unsigned len)
-{
- struct bch_folio *s = bch2_folio(folio);
- unsigned i, dirty_sectors = 0;
-
- WARN_ON((u64) folio_pos(folio) + offset + len >
- round_up((u64) i_size_read(&inode->v), block_bytes(c)));
-
- BUG_ON(!s->uptodate);
-
- spin_lock(&s->lock);
-
- for (i = round_down(offset, block_bytes(c)) >> 9;
- i < round_up(offset + len, block_bytes(c)) >> 9;
- i++) {
- unsigned sectors = sectors_to_reserve(&s->s[i],
- res->disk.nr_replicas);
-
- /*
- * This can happen if we race with the error path in
- * bch2_writepage_io_done():
- */
- sectors = min_t(unsigned, sectors, res->disk.sectors);
-
- s->s[i].replicas_reserved += sectors;
- res->disk.sectors -= sectors;
-
- dirty_sectors += s->s[i].state == SECTOR_unallocated;
-
- bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state));
- }
-
- spin_unlock(&s->lock);
-
- bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors);
-
- if (!folio_test_dirty(folio))
- filemap_dirty_folio(inode->v.i_mapping, folio);
-}
-
-vm_fault_t bch2_page_fault(struct vm_fault *vmf)
-{
- struct file *file = vmf->vma->vm_file;
- struct address_space *mapping = file->f_mapping;
- struct address_space *fdm = faults_disabled_mapping();
- struct bch_inode_info *inode = file_bch_inode(file);
- vm_fault_t ret;
-
- if (fdm == mapping)
- return VM_FAULT_SIGBUS;
-
- /* Lock ordering: */
- if (fdm > mapping) {
- struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
-
- if (bch2_pagecache_add_tryget(inode))
- goto got_lock;
-
- bch2_pagecache_block_put(fdm_host);
-
- bch2_pagecache_add_get(inode);
- bch2_pagecache_add_put(inode);
-
- bch2_pagecache_block_get(fdm_host);
-
- /* Signal that lock has been dropped: */
- set_fdm_dropped_locks();
- return VM_FAULT_SIGBUS;
- }
-
- bch2_pagecache_add_get(inode);
-got_lock:
- ret = filemap_fault(vmf);
- bch2_pagecache_add_put(inode);
-
- return ret;
-}
-
-vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
-{
- struct folio *folio = page_folio(vmf->page);
- struct file *file = vmf->vma->vm_file;
- struct bch_inode_info *inode = file_bch_inode(file);
- struct address_space *mapping = file->f_mapping;
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch2_folio_reservation res;
- vm_fault_t ret;
-
- loff_t file_offset = round_down(vmf->pgoff << PAGE_SHIFT, block_bytes(c));
- unsigned offset = file_offset - folio_pos(folio);
- unsigned len = max(PAGE_SIZE, block_bytes(c));
-
- BUG_ON(offset + len > folio_size(folio));
-
- bch2_folio_reservation_init(c, inode, &res);
-
- sb_start_pagefault(inode->v.i_sb);
- file_update_time(file);
-
- /*
- * Not strictly necessary, but helps avoid dio writes livelocking in
- * bch2_write_invalidate_inode_pages_range() - can drop this if/when we get
- * a bch2_write_invalidate_inode_pages_range() that works without dropping
- * the page lock before invalidating the page
- */
- bch2_pagecache_add_get(inode);
-
- folio_lock(folio);
- u64 isize = i_size_read(&inode->v);
-
- if (folio->mapping != mapping || file_offset >= isize) {
- folio_unlock(folio);
- ret = VM_FAULT_NOPAGE;
- goto out;
- }
-
- len = min_t(unsigned, len, isize - file_offset);
-
- if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?:
- bch2_folio_reservation_get(c, inode, folio, &res, offset, len)) {
- folio_unlock(folio);
- ret = VM_FAULT_SIGBUS;
- goto out;
- }
-
- bch2_set_folio_dirty(c, inode, folio, &res, offset, len);
- bch2_folio_reservation_put(c, inode, &res);
-
- folio_wait_stable(folio);
- ret = VM_FAULT_LOCKED;
-out:
- bch2_pagecache_add_put(inode);
- sb_end_pagefault(inode->v.i_sb);
-
- return ret;
-}
-
-void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length)
-{
- if (offset || length < folio_size(folio))
- return;
-
- bch2_clear_folio_bits(folio);
-}
-
-bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask)
-{
- if (folio_test_dirty(folio) || folio_test_writeback(folio))
- return false;
-
- bch2_clear_folio_bits(folio);
- return true;
-}
-
-/* fseek: */
-
-static int folio_data_offset(struct folio *folio, loff_t pos,
- unsigned min_replicas)
-{
- struct bch_folio *s = bch2_folio(folio);
- unsigned i, sectors = folio_sectors(folio);
-
- if (s)
- for (i = folio_pos_to_s(folio, pos); i < sectors; i++)
- if (s->s[i].state >= SECTOR_dirty &&
- s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas)
- return i << SECTOR_SHIFT;
-
- return -1;
-}
-
-loff_t bch2_seek_pagecache_data(struct inode *vinode,
- loff_t start_offset,
- loff_t end_offset,
- unsigned min_replicas,
- bool nonblock)
-{
- struct folio_batch fbatch;
- pgoff_t start_index = start_offset >> PAGE_SHIFT;
- pgoff_t end_index = end_offset >> PAGE_SHIFT;
- pgoff_t index = start_index;
- unsigned i;
- loff_t ret;
- int offset;
-
- folio_batch_init(&fbatch);
-
- while (filemap_get_folios(vinode->i_mapping,
- &index, end_index, &fbatch)) {
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
- struct folio *folio = fbatch.folios[i];
-
- if (!nonblock) {
- folio_lock(folio);
- } else if (!folio_trylock(folio)) {
- folio_batch_release(&fbatch);
- return -EAGAIN;
- }
-
- offset = folio_data_offset(folio,
- max(folio_pos(folio), start_offset),
- min_replicas);
- if (offset >= 0) {
- ret = clamp(folio_pos(folio) + offset,
- start_offset, end_offset);
- folio_unlock(folio);
- folio_batch_release(&fbatch);
- return ret;
- }
- folio_unlock(folio);
- }
- folio_batch_release(&fbatch);
- cond_resched();
- }
-
- return end_offset;
-}
-
-/*
- * Search for a hole in a folio.
- *
- * The filemap layer returns -ENOENT if no folio exists, so reuse the same error
- * code to indicate a pagecache hole exists at the returned offset. Otherwise
- * return 0 if the folio is filled with data, or an error code. This function
- * can return -EAGAIN if nonblock is specified.
- */
-static int folio_hole_offset(struct address_space *mapping, loff_t *offset,
- unsigned min_replicas, bool nonblock)
-{
- struct folio *folio;
- struct bch_folio *s;
- unsigned i, sectors;
- int ret = -ENOENT;
-
- folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT,
- FGP_LOCK|(nonblock ? FGP_NOWAIT : 0), 0);
- if (IS_ERR(folio))
- return PTR_ERR(folio);
-
- s = bch2_folio(folio);
- if (!s)
- goto unlock;
-
- sectors = folio_sectors(folio);
- for (i = folio_pos_to_s(folio, *offset); i < sectors; i++)
- if (s->s[i].state < SECTOR_dirty ||
- s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) {
- *offset = max(*offset,
- folio_pos(folio) + (i << SECTOR_SHIFT));
- goto unlock;
- }
-
- *offset = folio_end_pos(folio);
- ret = 0;
-unlock:
- folio_unlock(folio);
- folio_put(folio);
- return ret;
-}
-
-loff_t bch2_seek_pagecache_hole(struct inode *vinode,
- loff_t start_offset,
- loff_t end_offset,
- unsigned min_replicas,
- bool nonblock)
-{
- struct address_space *mapping = vinode->i_mapping;
- loff_t offset = start_offset;
- loff_t ret = 0;
-
- while (!ret && offset < end_offset)
- ret = folio_hole_offset(mapping, &offset, min_replicas, nonblock);
-
- if (ret && ret != -ENOENT)
- return ret;
- return min(offset, end_offset);
-}
-
-int bch2_clamp_data_hole(struct inode *inode,
- u64 *hole_start,
- u64 *hole_end,
- unsigned min_replicas,
- bool nonblock)
-{
- loff_t ret;
-
- ret = bch2_seek_pagecache_hole(inode,
- *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
- if (ret < 0)
- return ret;
-
- *hole_start = ret;
-
- if (*hole_start == *hole_end)
- return 0;
-
- ret = bch2_seek_pagecache_data(inode,
- *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
- if (ret < 0)
- return ret;
-
- *hole_end = ret;
- return 0;
-}
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h
deleted file mode 100644
index fad911cf5068..000000000000
--- a/fs/bcachefs/fs-io-pagecache.h
+++ /dev/null
@@ -1,176 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_IO_PAGECACHE_H
-#define _BCACHEFS_FS_IO_PAGECACHE_H
-
-#include <linux/pagemap.h>
-
-typedef DARRAY(struct folio *) folios;
-
-int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t,
- u64, fgf_t, gfp_t, folios *);
-int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t);
-
-/*
- * Use u64 for the end pos and sector helpers because if the folio covers the
- * max supported range of the mapping, the start offset of the next folio
- * overflows loff_t. This breaks much of the range based processing in the
- * buffered write path.
- */
-static inline u64 folio_end_pos(struct folio *folio)
-{
- return folio_pos(folio) + folio_size(folio);
-}
-
-static inline size_t folio_sectors(struct folio *folio)
-{
- return PAGE_SECTORS << folio_order(folio);
-}
-
-static inline loff_t folio_sector(struct folio *folio)
-{
- return folio_pos(folio) >> 9;
-}
-
-static inline u64 folio_end_sector(struct folio *folio)
-{
- return folio_end_pos(folio) >> 9;
-}
-
-#define BCH_FOLIO_SECTOR_STATE() \
- x(unallocated) \
- x(reserved) \
- x(dirty) \
- x(dirty_reserved) \
- x(allocated)
-
-enum bch_folio_sector_state {
-#define x(n) SECTOR_##n,
- BCH_FOLIO_SECTOR_STATE()
-#undef x
-};
-
-struct bch_folio_sector {
- /* Uncompressed, fully allocated replicas (or on disk reservation): */
- u8 nr_replicas:4,
- /* Owns a PAGE_SECTORS * replicas_reserved sized in-memory reservation: */
- replicas_reserved:4;
- u8 state;
-};
-
-struct bch_folio {
- spinlock_t lock;
- atomic_t write_count;
- /*
- * Is the sector state up to date with the btree?
- * (Not the data itself)
- */
- bool uptodate;
- struct bch_folio_sector s[];
-};
-
-/* Helper for when we need to add debug instrumentation: */
-static inline void bch2_folio_sector_set(struct folio *folio,
- struct bch_folio *s,
- unsigned i, unsigned n)
-{
- s->s[i].state = n;
-}
-
-/* file offset (to folio offset) to bch_folio_sector index */
-static inline int folio_pos_to_s(struct folio *folio, loff_t pos)
-{
- u64 f_offset = pos - folio_pos(folio);
-
- BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio));
- return f_offset >> SECTOR_SHIFT;
-}
-
-/* for releasing a folio's private bch_folio state: */
-static inline void __bch2_folio_release(struct folio *folio)
-{
- kfree(folio_detach_private(folio));
-}
-
-static inline void bch2_folio_release(struct folio *folio)
-{
- EBUG_ON(!folio_test_locked(folio));
- __bch2_folio_release(folio);
-}
-
-static inline struct bch_folio *__bch2_folio(struct folio *folio)
-{
- return folio_get_private(folio);
-}
-
-static inline struct bch_folio *bch2_folio(struct folio *folio)
-{
- EBUG_ON(!folio_test_locked(folio));
-
- return __bch2_folio(folio);
-}
-
-struct bch_folio *__bch2_folio_create(struct folio *, gfp_t);
-struct bch_folio *bch2_folio_create(struct folio *, gfp_t);
-
-struct bch2_folio_reservation {
- struct disk_reservation disk;
- struct quota_res quota;
-};
-
-static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
-{
- /* XXX: this should not be open coded */
- return inode->ei_inode.bi_data_replicas
- ? inode->ei_inode.bi_data_replicas - 1
- : c->opts.data_replicas;
-}
-
-static inline void bch2_folio_reservation_init(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct bch2_folio_reservation *res)
-{
- memset(res, 0, sizeof(*res));
-
- res->disk.nr_replicas = inode_nr_replicas(c, inode);
-}
-
-int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned);
-void bch2_bio_page_state_set(struct bio *, struct bkey_s_c);
-
-void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64);
-int bch2_mark_pagecache_reserved(struct bch_inode_info *, u64 *, u64, bool);
-
-int bch2_get_folio_disk_reservation(struct bch_fs *,
- struct bch_inode_info *,
- struct folio *, bool);
-
-void bch2_folio_reservation_put(struct bch_fs *,
- struct bch_inode_info *,
- struct bch2_folio_reservation *);
-int bch2_folio_reservation_get(struct bch_fs *,
- struct bch_inode_info *,
- struct folio *,
- struct bch2_folio_reservation *,
- size_t, size_t);
-ssize_t bch2_folio_reservation_get_partial(struct bch_fs *,
- struct bch_inode_info *,
- struct folio *,
- struct bch2_folio_reservation *,
- size_t, size_t);
-
-void bch2_set_folio_dirty(struct bch_fs *,
- struct bch_inode_info *,
- struct folio *,
- struct bch2_folio_reservation *,
- unsigned, unsigned);
-
-vm_fault_t bch2_page_fault(struct vm_fault *);
-vm_fault_t bch2_page_mkwrite(struct vm_fault *);
-void bch2_invalidate_folio(struct folio *, size_t, size_t);
-bool bch2_release_folio(struct folio *, gfp_t);
-
-loff_t bch2_seek_pagecache_data(struct inode *, loff_t, loff_t, unsigned, bool);
-loff_t bch2_seek_pagecache_hole(struct inode *, loff_t, loff_t, unsigned, bool);
-int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool);
-
-#endif /* _BCACHEFS_FS_IO_PAGECACHE_H */
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
deleted file mode 100644
index a233f45875e9..000000000000
--- a/fs/bcachefs/fs-io.c
+++ /dev/null
@@ -1,1102 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "clock.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "extents.h"
-#include "extent_update.h"
-#include "fs.h"
-#include "fs-io.h"
-#include "fs-io-buffered.h"
-#include "fs-io-pagecache.h"
-#include "fsck.h"
-#include "inode.h"
-#include "journal.h"
-#include "io_misc.h"
-#include "keylist.h"
-#include "quota.h"
-#include "reflink.h"
-#include "trace.h"
-
-#include <linux/aio.h>
-#include <linux/backing-dev.h>
-#include <linux/falloc.h>
-#include <linux/migrate.h>
-#include <linux/mmu_context.h>
-#include <linux/pagevec.h>
-#include <linux/rmap.h>
-#include <linux/sched/signal.h>
-#include <linux/task_io_accounting_ops.h>
-#include <linux/uio.h>
-
-#include <trace/events/writeback.h>
-
-struct nocow_flush {
- struct closure *cl;
- struct bch_dev *ca;
- struct bio bio;
-};
-
-static void nocow_flush_endio(struct bio *_bio)
-{
-
- struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio);
-
- closure_put(bio->cl);
- enumerated_ref_put(&bio->ca->io_ref[WRITE],
- BCH_DEV_WRITE_REF_nocow_flush);
- bio_put(&bio->bio);
-}
-
-void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct closure *cl)
-{
- struct nocow_flush *bio;
- struct bch_dev *ca;
- struct bch_devs_mask devs;
- unsigned dev;
-
- dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX);
- if (dev == BCH_SB_MEMBERS_MAX)
- return;
-
- devs = inode->ei_devs_need_flush;
- memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
-
- for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) {
- scoped_guard(rcu) {
- ca = rcu_dereference(c->devs[dev]);
- if (ca && !enumerated_ref_tryget(&ca->io_ref[WRITE],
- BCH_DEV_WRITE_REF_nocow_flush))
- ca = NULL;
- }
-
- if (!ca)
- continue;
-
- bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0,
- REQ_OP_WRITE|REQ_PREFLUSH,
- GFP_KERNEL,
- &c->nocow_flush_bioset),
- struct nocow_flush, bio);
- bio->cl = cl;
- bio->ca = ca;
- bio->bio.bi_end_io = nocow_flush_endio;
- closure_bio_submit(&bio->bio, cl);
- }
-}
-
-static int bch2_inode_flush_nocow_writes(struct bch_fs *c,
- struct bch_inode_info *inode)
-{
- struct closure cl;
-
- closure_init_stack(&cl);
- bch2_inode_flush_nocow_writes_async(c, inode, &cl);
- closure_sync(&cl);
-
- return 0;
-}
-
-/* i_size updates: */
-
-struct inode_new_size {
- loff_t new_size;
- u64 now;
- unsigned fields;
-};
-
-static int inode_set_size(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct inode_new_size *s = p;
-
- bi->bi_size = s->new_size;
- if (s->fields & ATTR_ATIME)
- bi->bi_atime = s->now;
- if (s->fields & ATTR_MTIME)
- bi->bi_mtime = s->now;
- if (s->fields & ATTR_CTIME)
- bi->bi_ctime = s->now;
-
- return 0;
-}
-
-int __must_check bch2_write_inode_size(struct bch_fs *c,
- struct bch_inode_info *inode,
- loff_t new_size, unsigned fields)
-{
- struct inode_new_size s = {
- .new_size = new_size,
- .now = bch2_current_time(c),
- .fields = fields,
- };
-
- return bch2_write_inode(c, inode, inode_set_size, &s, fields);
-}
-
-void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
- struct quota_res *quota_res, s64 sectors)
-{
- if (unlikely((s64) inode->v.i_blocks + sectors < 0)) {
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
- prt_printf(&buf, "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
- inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
- inode->ei_inode.bi_sectors);
-
- bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_underflow, &buf);
- if (print)
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
-
- if (sectors < 0)
- sectors = -inode->v.i_blocks;
- else
- sectors = 0;
- }
-
- inode->v.i_blocks += sectors;
-
-#ifdef CONFIG_BCACHEFS_QUOTA
- if (quota_res &&
- !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) &&
- sectors > 0) {
- BUG_ON(sectors > quota_res->sectors);
- BUG_ON(sectors > inode->ei_quota_reserved);
-
- quota_res->sectors -= sectors;
- inode->ei_quota_reserved -= sectors;
- } else {
- bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
- }
-#endif
-}
-
-/* fsync: */
-
-static int bch2_get_inode_journal_seq_trans(struct btree_trans *trans, subvol_inum inum,
- u64 *seq)
-{
- struct printbuf buf = PRINTBUF;
- struct bch_inode_unpacked u;
- struct btree_iter iter;
- int ret = bch2_inode_peek(trans, &iter, &u, inum, 0);
- if (ret)
- return ret;
-
- u64 cur_seq = journal_cur_seq(&trans->c->journal);
- *seq = min(cur_seq, u.bi_journal_seq);
-
- if (fsck_err_on(u.bi_journal_seq > cur_seq,
- trans, inode_journal_seq_in_future,
- "inode journal seq in future (currently at %llu)\n%s",
- cur_seq,
- (bch2_inode_unpacked_to_text(&buf, &u),
- buf.buf))) {
- u.bi_journal_seq = cur_seq;
- ret = bch2_inode_write(trans, &iter, &u);
- }
-fsck_err:
- bch2_trans_iter_exit(trans, &iter);
- printbuf_exit(&buf);
- return ret;
-}
-
-/*
- * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an
- * insert trigger: look up the btree inode instead
- */
-static int bch2_flush_inode(struct bch_fs *c,
- struct bch_inode_info *inode)
-{
- if (c->opts.journal_flush_disabled)
- return 0;
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fsync))
- return -EROFS;
-
- u64 seq;
- int ret = bch2_trans_commit_do(c, NULL, NULL, 0,
- bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?:
- bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?:
- bch2_inode_flush_nocow_writes(c, inode);
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_fsync);
- return ret;
-}
-
-int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
-{
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret, err;
-
- trace_bch2_fsync(file, datasync);
-
- ret = file_write_and_wait_range(file, start, end);
- if (ret)
- goto out;
- ret = sync_inode_metadata(&inode->v, 1);
- if (ret)
- goto out;
- ret = bch2_flush_inode(c, inode);
-out:
- ret = bch2_err_class(ret);
- if (ret == -EROFS)
- ret = -EIO;
-
- err = file_check_and_advance_wb_err(file);
- if (!ret)
- ret = err;
-
- return ret;
-}
-
-/* truncate: */
-
-static inline int range_has_data(struct bch_fs *c, u32 subvol,
- struct bpos start,
- struct bpos end)
-{
- return bch2_trans_run(c,
- for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, start, end,
- subvol, 0, k, ({
- bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k);
- })));
-}
-
-static int __bch2_truncate_folio(struct bch_inode_info *inode,
- pgoff_t index, loff_t start, loff_t end)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct address_space *mapping = inode->v.i_mapping;
- struct bch_folio *s;
- unsigned start_offset;
- unsigned end_offset;
- unsigned i;
- struct folio *folio;
- s64 i_sectors_delta = 0;
- int ret = 0;
- u64 end_pos;
-
- folio = filemap_lock_folio(mapping, index);
- if (IS_ERR_OR_NULL(folio)) {
- /*
- * XXX: we're doing two index lookups when we end up reading the
- * folio
- */
- ret = range_has_data(c, inode->ei_inum.subvol,
- POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)),
- POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS));
- if (ret <= 0)
- return ret;
-
- folio = __filemap_get_folio(mapping, index,
- FGP_LOCK|FGP_CREAT, GFP_KERNEL);
- if (IS_ERR(folio)) {
- ret = -ENOMEM;
- goto out;
- }
- }
-
- BUG_ON(start >= folio_end_pos(folio));
- BUG_ON(end <= folio_pos(folio));
-
- start_offset = max(start, folio_pos(folio)) - folio_pos(folio);
- end_offset = min_t(u64, end, folio_end_pos(folio)) - folio_pos(folio);
-
- /* Folio boundary? Nothing to do */
- if (start_offset == 0 &&
- end_offset == folio_size(folio)) {
- ret = 0;
- goto unlock;
- }
-
- s = bch2_folio_create(folio, 0);
- if (!s) {
- ret = -ENOMEM;
- goto unlock;
- }
-
- if (!folio_test_uptodate(folio)) {
- ret = bch2_read_single_folio(folio, mapping);
- if (ret)
- goto unlock;
- }
-
- ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
- if (ret)
- goto unlock;
-
- for (i = round_up(start_offset, block_bytes(c)) >> 9;
- i < round_down(end_offset, block_bytes(c)) >> 9;
- i++) {
- s->s[i].nr_replicas = 0;
-
- i_sectors_delta -= s->s[i].state == SECTOR_dirty;
- bch2_folio_sector_set(folio, s, i, SECTOR_unallocated);
- }
-
- bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
-
- /*
- * Caller needs to know whether this folio will be written out by
- * writeback - doing an i_size update if necessary - or whether it will
- * be responsible for the i_size update.
- *
- * Note that we shouldn't ever see a folio beyond EOF, but check and
- * warn if so. This has been observed to be caused by failure to clean up
- * folios after a short write, and there's still a chance reclaim will fix
- * things up.
- */
- WARN_ON_ONCE(folio_pos(folio) >= inode->v.i_size);
- end_pos = folio_end_pos(folio);
- if (inode->v.i_size > folio_pos(folio))
- end_pos = min_t(u64, inode->v.i_size, end_pos);
- ret = s->s[folio_pos_to_s(folio, end_pos - 1)].state >= SECTOR_dirty;
-
- folio_zero_segment(folio, start_offset, end_offset);
-
- /*
- * Bit of a hack - we don't want truncate to fail due to -ENOSPC.
- *
- * XXX: because we aren't currently tracking whether the folio has actual
- * data in it (vs. just 0s, or only partially written), this is wrong. ick.
- */
- BUG_ON(bch2_get_folio_disk_reservation(c, inode, folio, false));
-
- /*
- * This removes any writeable userspace mappings; we need to force
- * .page_mkwrite to be called again before any mmapped writes, to
- * redirty the full page:
- */
- folio_mkclean(folio);
- filemap_dirty_folio(mapping, folio);
-unlock:
- folio_unlock(folio);
- folio_put(folio);
-out:
- return ret;
-}
-
-static int bch2_truncate_folio(struct bch_inode_info *inode, loff_t from)
-{
- return __bch2_truncate_folio(inode, from >> PAGE_SHIFT,
- from, ANYSINT_MAX(loff_t));
-}
-
-static int bch2_truncate_folios(struct bch_inode_info *inode,
- loff_t start, loff_t end)
-{
- int ret = __bch2_truncate_folio(inode, start >> PAGE_SHIFT,
- start, end);
-
- if (ret >= 0 &&
- start >> PAGE_SHIFT != end >> PAGE_SHIFT)
- ret = __bch2_truncate_folio(inode,
- (end - 1) >> PAGE_SHIFT,
- start, end);
- return ret;
-}
-
-static int bch2_extend(struct mnt_idmap *idmap,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *inode_u,
- struct iattr *iattr)
-{
- struct address_space *mapping = inode->v.i_mapping;
- int ret;
-
- /*
- * sync appends:
- *
- * this has to be done _before_ extending i_size:
- */
- ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX);
- if (ret)
- return ret;
-
- truncate_setsize(&inode->v, iattr->ia_size);
-
- return bch2_setattr_nonsize(idmap, inode, iattr);
-}
-
-int bchfs_truncate(struct mnt_idmap *idmap,
- struct bch_inode_info *inode, struct iattr *iattr)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct address_space *mapping = inode->v.i_mapping;
- struct bch_inode_unpacked inode_u;
- s64 i_sectors_delta = 0;
- int ret = 0;
-
- /*
- * If the truncate call will change the size of the file, the
- * cmtimes should be updated. If the size will not change, we
- * do not need to update the cmtimes.
- */
- if (iattr->ia_size != inode->v.i_size) {
- if (!(iattr->ia_valid & ATTR_MTIME))
- ktime_get_coarse_real_ts64(&iattr->ia_mtime);
- if (!(iattr->ia_valid & ATTR_CTIME))
- ktime_get_coarse_real_ts64(&iattr->ia_ctime);
- iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME;
- }
-
- inode_dio_wait(&inode->v);
- bch2_pagecache_block_get(inode);
-
- ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u);
- if (ret)
- goto err;
-
- /*
- * check this before the next assertion; on filesystem error our normal
- * invariants are a bit broken (truncate has to truncate the page cache
- * before the inode).
- */
- ret = bch2_journal_error(&c->journal);
- if (ret)
- goto err;
-
- WARN_ONCE(!test_bit(EI_INODE_ERROR, &inode->ei_flags) &&
- inode->v.i_size < inode_u.bi_size,
- "truncate spotted in mem i_size < btree i_size: %llu < %llu\n",
- (u64) inode->v.i_size, inode_u.bi_size);
-
- if (iattr->ia_size > inode->v.i_size) {
- ret = bch2_extend(idmap, inode, &inode_u, iattr);
- goto err;
- }
-
- iattr->ia_valid &= ~ATTR_SIZE;
-
- ret = bch2_truncate_folio(inode, iattr->ia_size);
- if (unlikely(ret < 0))
- goto err;
- ret = 0;
-
- truncate_setsize(&inode->v, iattr->ia_size);
-
- /*
- * When extending, we're going to write the new i_size to disk
- * immediately so we need to flush anything above the current on disk
- * i_size first:
- *
- * Also, when extending we need to flush the page that i_size currently
- * straddles - if it's mapped to userspace, we need to ensure that
- * userspace has to redirty it and call .mkwrite -> set_page_dirty
- * again to allocate the part of the page that was extended.
- */
- if (iattr->ia_size > inode_u.bi_size)
- ret = filemap_write_and_wait_range(mapping,
- inode_u.bi_size,
- iattr->ia_size - 1);
- else if (iattr->ia_size & (PAGE_SIZE - 1))
- ret = filemap_write_and_wait_range(mapping,
- round_down(iattr->ia_size, PAGE_SIZE),
- iattr->ia_size - 1);
- if (ret)
- goto err;
-
- ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size, &i_sectors_delta);
- bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
-
- if (unlikely(ret)) {
- /*
- * If we error here, VFS caches are now inconsistent with btree
- */
- set_bit(EI_INODE_ERROR, &inode->ei_flags);
- goto err;
- }
-
- if (unlikely(!inode->v.i_size && inode->v.i_blocks &&
- !bch2_journal_error(&c->journal))) {
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
- prt_printf(&buf,
- "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
- inode->v.i_ino, (u64) inode->v.i_blocks,
- inode->ei_inode.bi_sectors);
-
- bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_not_zero_at_truncate, &buf);
- if (print)
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- }
-
- ret = bch2_setattr_nonsize(idmap, inode, iattr);
-err:
- bch2_pagecache_block_put(inode);
- return bch2_err_class(ret);
-}
-
-/* fallocate: */
-
-static int inode_update_times_fn(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi, void *p)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
-
- bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
- return 0;
-}
-
-static noinline long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- u64 end = offset + len;
- u64 block_start = round_up(offset, block_bytes(c));
- u64 block_end = round_down(end, block_bytes(c));
- bool truncated_last_page;
- int ret = 0;
-
- ret = bch2_truncate_folios(inode, offset, end);
- if (unlikely(ret < 0))
- goto err;
-
- truncated_last_page = ret;
-
- truncate_pagecache_range(&inode->v, offset, end - 1);
-
- if (block_start < block_end) {
- s64 i_sectors_delta = 0;
-
- ret = bch2_fpunch(c, inode_inum(inode),
- block_start >> 9, block_end >> 9,
- &i_sectors_delta);
- bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
- }
-
- mutex_lock(&inode->ei_update_lock);
- if (end >= inode->v.i_size && !truncated_last_page) {
- ret = bch2_write_inode_size(c, inode, inode->v.i_size,
- ATTR_MTIME|ATTR_CTIME);
- } else {
- ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
- ATTR_MTIME|ATTR_CTIME);
- }
- mutex_unlock(&inode->ei_update_lock);
-err:
- return ret;
-}
-
-static noinline long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
- loff_t offset, loff_t len,
- bool insert)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct address_space *mapping = inode->v.i_mapping;
- s64 i_sectors_delta = 0;
- int ret = 0;
-
- if ((offset | len) & (block_bytes(c) - 1))
- return -EINVAL;
-
- if (insert) {
- if (offset >= inode->v.i_size)
- return -EINVAL;
- } else {
- if (offset + len >= inode->v.i_size)
- return -EINVAL;
- }
-
- ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
- if (ret)
- return ret;
-
- if (insert)
- i_size_write(&inode->v, inode->v.i_size + len);
-
- ret = bch2_fcollapse_finsert(c, inode_inum(inode), offset >> 9, len >> 9,
- insert, &i_sectors_delta);
- if (!ret && !insert)
- i_size_write(&inode->v, inode->v.i_size - len);
- bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
-
- return ret;
-}
-
-static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
- u64 start_sector, u64 end_sector)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bpos end_pos = POS(inode->v.i_ino, end_sector);
- struct bch_io_opts opts;
- int ret = 0;
-
- bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- POS(inode->v.i_ino, start_sector),
- BTREE_ITER_slots|BTREE_ITER_intent);
-
- while (!ret) {
- s64 i_sectors_delta = 0;
- struct quota_res quota_res = { 0 };
- struct bkey_s_c k;
- unsigned sectors;
- bool is_allocation;
- u64 hole_start, hole_end;
- u32 snapshot;
-
- bch2_trans_begin(trans);
-
- if (bkey_ge(iter.pos, end_pos))
- break;
-
- ret = bch2_subvolume_get_snapshot(trans,
- inode->ei_inum.subvol, &snapshot);
- if (ret)
- goto bkey_err;
-
- bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
-
- k = bch2_btree_iter_peek_slot(trans, &iter);
- if ((ret = bkey_err(k)))
- goto bkey_err;
-
- hole_start = iter.pos.offset;
- hole_end = bpos_min(k.k->p, end_pos).offset;
- is_allocation = bkey_extent_is_allocation(k.k);
-
- /* already reserved */
- if (bkey_extent_is_reservation(k) &&
- bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) {
- bch2_btree_iter_advance(trans, &iter);
- continue;
- }
-
- if (bkey_extent_is_data(k.k) &&
- !(mode & FALLOC_FL_ZERO_RANGE)) {
- bch2_btree_iter_advance(trans, &iter);
- continue;
- }
-
- if (!(mode & FALLOC_FL_ZERO_RANGE)) {
- /*
- * Lock ordering - can't be holding btree locks while
- * blocking on a folio lock:
- */
- if (bch2_clamp_data_hole(&inode->v,
- &hole_start,
- &hole_end,
- opts.data_replicas, true)) {
- ret = drop_locks_do(trans,
- (bch2_clamp_data_hole(&inode->v,
- &hole_start,
- &hole_end,
- opts.data_replicas, false), 0));
- if (ret)
- goto bkey_err;
- }
- bch2_btree_iter_set_pos(trans, &iter, POS(iter.pos.inode, hole_start));
-
- if (ret)
- goto bkey_err;
-
- if (hole_start == hole_end)
- continue;
- }
-
- sectors = hole_end - hole_start;
-
- if (!is_allocation) {
- ret = bch2_quota_reservation_add(c, inode,
- &quota_res, sectors, true);
- if (unlikely(ret))
- goto bkey_err;
- }
-
- ret = bch2_extent_fallocate(trans, inode_inum(inode), &iter,
- sectors, opts, &i_sectors_delta,
- writepoint_hashed((unsigned long) current));
- if (ret)
- goto bkey_err;
-
- bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
-
- if (bch2_mark_pagecache_reserved(inode, &hole_start,
- iter.pos.offset, true)) {
- ret = drop_locks_do(trans,
- bch2_mark_pagecache_reserved(inode, &hole_start,
- iter.pos.offset, false));
- if (ret)
- goto bkey_err;
- }
-bkey_err:
- bch2_quota_reservation_put(c, inode, &quota_res);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- ret = 0;
- }
-
- if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) {
- struct quota_res quota_res = { 0 };
- s64 i_sectors_delta = 0;
-
- bch2_fpunch_at(trans, &iter, inode_inum(inode),
- end_sector, &i_sectors_delta);
- bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
- bch2_quota_reservation_put(c, inode, &quota_res);
- }
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return ret;
-}
-
-static noinline long bchfs_fallocate(struct bch_inode_info *inode, int mode,
- loff_t offset, loff_t len)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- u64 end = offset + len;
- u64 block_start = round_down(offset, block_bytes(c));
- u64 block_end = round_up(end, block_bytes(c));
- bool truncated_last_page = false;
- int ret, ret2 = 0;
-
- if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
- ret = inode_newsize_ok(&inode->v, end);
- if (ret)
- return ret;
- }
-
- if (mode & FALLOC_FL_ZERO_RANGE) {
- ret = bch2_truncate_folios(inode, offset, end);
- if (unlikely(ret < 0))
- return ret;
-
- truncated_last_page = ret;
-
- truncate_pagecache_range(&inode->v, offset, end - 1);
-
- block_start = round_up(offset, block_bytes(c));
- block_end = round_down(end, block_bytes(c));
- }
-
- ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9);
-
- /*
- * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update,
- * so that the VFS cache i_size is consistent with the btree i_size:
- */
- if (ret &&
- !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)))
- return ret;
-
- if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size)
- end = inode->v.i_size;
-
- if (end >= inode->v.i_size &&
- (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) ||
- !(mode & FALLOC_FL_KEEP_SIZE))) {
- spin_lock(&inode->v.i_lock);
- i_size_write(&inode->v, end);
- spin_unlock(&inode->v.i_lock);
-
- mutex_lock(&inode->ei_update_lock);
- ret2 = bch2_write_inode_size(c, inode, end, 0);
- mutex_unlock(&inode->ei_update_lock);
- }
-
- return ret ?: ret2;
-}
-
-long bch2_fallocate_dispatch(struct file *file, int mode,
- loff_t offset, loff_t len)
-{
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- long ret;
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fallocate))
- return -EROFS;
-
- inode_lock(&inode->v);
- inode_dio_wait(&inode->v);
- bch2_pagecache_block_get(inode);
-
- ret = file_modified(file);
- if (ret)
- goto err;
-
- if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
- ret = bchfs_fallocate(inode, mode, offset, len);
- else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
- ret = bchfs_fpunch(inode, offset, len);
- else if (mode == FALLOC_FL_INSERT_RANGE)
- ret = bchfs_fcollapse_finsert(inode, offset, len, true);
- else if (mode == FALLOC_FL_COLLAPSE_RANGE)
- ret = bchfs_fcollapse_finsert(inode, offset, len, false);
- else
- ret = -EOPNOTSUPP;
-err:
- bch2_pagecache_block_put(inode);
- inode_unlock(&inode->v);
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_fallocate);
-
- return bch2_err_class(ret);
-}
-
-/*
- * Take a quota reservation for unallocated blocks in a given file range.
- * Does not check the pagecache.
- */
-static int quota_reserve_range(struct bch_inode_info *inode,
- struct quota_res *res,
- u64 start, u64 end)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- u64 sectors = end - start;
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_in_subvolume_max(trans, iter,
- BTREE_ID_extents,
- POS(inode->v.i_ino, start),
- POS(inode->v.i_ino, end - 1),
- inode->ei_inum.subvol, 0, k, ({
- if (bkey_extent_is_allocation(k.k)) {
- u64 s = min(end, k.k->p.offset) -
- max(start, bkey_start_offset(k.k));
- BUG_ON(s > sectors);
- sectors -= s;
- }
-
- 0;
- })));
-
- return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true);
-}
-
-loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
- struct file *file_dst, loff_t pos_dst,
- loff_t len, unsigned remap_flags)
-{
- struct bch_inode_info *src = file_bch_inode(file_src);
- struct bch_inode_info *dst = file_bch_inode(file_dst);
- struct bch_fs *c = src->v.i_sb->s_fs_info;
- struct quota_res quota_res = { 0 };
- s64 i_sectors_delta = 0;
- u64 aligned_len;
- loff_t ret = 0;
-
- if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
- return -EINVAL;
-
- if ((pos_src & (block_bytes(c) - 1)) ||
- (pos_dst & (block_bytes(c) - 1)))
- return -EINVAL;
-
- if (src == dst &&
- abs(pos_src - pos_dst) < len)
- return -EINVAL;
-
- lock_two_nondirectories(&src->v, &dst->v);
- bch2_lock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
-
- inode_dio_wait(&src->v);
- inode_dio_wait(&dst->v);
-
- ret = generic_remap_file_range_prep(file_src, pos_src,
- file_dst, pos_dst,
- &len, remap_flags);
- if (ret < 0 || len == 0)
- goto err;
-
- aligned_len = round_up((u64) len, block_bytes(c));
-
- ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping,
- pos_dst, pos_dst + len - 1);
- if (ret)
- goto err;
-
- ret = quota_reserve_range(dst, &quota_res, pos_dst >> 9,
- (pos_dst + aligned_len) >> 9);
- if (ret)
- goto err;
-
- if (!(remap_flags & REMAP_FILE_DEDUP))
- file_update_time(file_dst);
-
- bch2_mark_pagecache_unallocated(src, pos_src >> 9,
- (pos_src + aligned_len) >> 9);
-
- /*
- * XXX: we'd like to be telling bch2_remap_range() if we have
- * permission to write to the source file, and thus if io path option
- * changes should be propagated through the copy, but we need the
- * mnt_idmap from the pathwalk, which is awkward
- */
- ret = bch2_remap_range(c,
- inode_inum(dst), pos_dst >> 9,
- inode_inum(src), pos_src >> 9,
- aligned_len >> 9,
- pos_dst + len, &i_sectors_delta,
- false);
- if (ret < 0)
- goto err;
-
- /*
- * due to alignment, we might have remapped slightly more than requested
- */
- ret = min((u64) ret << 9, (u64) len);
-
- bch2_i_sectors_acct(c, dst, &quota_res, i_sectors_delta);
-
- spin_lock(&dst->v.i_lock);
- if (pos_dst + ret > dst->v.i_size)
- i_size_write(&dst->v, pos_dst + ret);
- spin_unlock(&dst->v.i_lock);
-
- if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) ||
- IS_SYNC(file_inode(file_dst)))
- ret = bch2_flush_inode(c, dst);
-err:
- bch2_quota_reservation_put(c, dst, &quota_res);
- bch2_unlock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
- unlock_two_nondirectories(&src->v, &dst->v);
-
- return bch2_err_class(ret);
-}
-
-/* fseek: */
-
-static loff_t bch2_seek_data(struct file *file, u64 offset)
-{
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- subvol_inum inum = inode_inum(inode);
- u64 isize, next_data = MAX_LFS_FILESIZE;
-
- isize = i_size_read(&inode->v);
- if (offset >= isize)
- return -ENXIO;
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents,
- POS(inode->v.i_ino, offset >> 9),
- POS(inode->v.i_ino, U64_MAX),
- inum.subvol, 0, k, ({
- if (bkey_extent_is_data(k.k)) {
- next_data = max(offset, bkey_start_offset(k.k) << 9);
- break;
- } else if (k.k->p.offset >> 9 > isize)
- break;
- 0;
- })));
- if (ret)
- return ret;
-
- if (next_data > offset)
- next_data = bch2_seek_pagecache_data(&inode->v,
- offset, next_data, 0, false);
-
- if (next_data >= isize)
- return -ENXIO;
-
- return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
-}
-
-static loff_t bch2_seek_hole(struct file *file, u64 offset)
-{
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- subvol_inum inum = inode_inum(inode);
- u64 isize, next_hole = MAX_LFS_FILESIZE;
-
- isize = i_size_read(&inode->v);
- if (offset >= isize)
- return -ENXIO;
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents,
- POS(inode->v.i_ino, offset >> 9),
- POS(inode->v.i_ino, U64_MAX),
- inum.subvol, BTREE_ITER_slots, k, ({
- if (k.k->p.inode != inode->v.i_ino ||
- !bkey_extent_is_data(k.k)) {
- loff_t start_offset = k.k->p.inode == inode->v.i_ino
- ? max(offset, bkey_start_offset(k.k) << 9)
- : offset;
- loff_t end_offset = k.k->p.inode == inode->v.i_ino
- ? MAX_LFS_FILESIZE
- : k.k->p.offset << 9;
-
- /*
- * Found a hole in the btree, now make sure it's
- * a hole in the pagecache. We might have to
- * keep searching if this hole is entirely dirty
- * in the page cache:
- */
- bch2_trans_unlock(trans);
- loff_t pagecache_hole = bch2_seek_pagecache_hole(&inode->v,
- start_offset, end_offset, 0, false);
- if (pagecache_hole < end_offset) {
- next_hole = pagecache_hole;
- break;
- }
- } else {
- offset = max(offset, bkey_start_offset(k.k) << 9);
- }
- 0;
- })));
- if (ret)
- return ret;
-
- if (next_hole > isize)
- next_hole = isize;
-
- return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE);
-}
-
-loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
-{
- loff_t ret;
-
- switch (whence) {
- case SEEK_SET:
- case SEEK_CUR:
- case SEEK_END:
- ret = generic_file_llseek(file, offset, whence);
- break;
- case SEEK_DATA:
- ret = bch2_seek_data(file, offset);
- break;
- case SEEK_HOLE:
- ret = bch2_seek_hole(file, offset);
- break;
- default:
- ret = -EINVAL;
- break;
- }
-
- return bch2_err_class(ret);
-}
-
-void bch2_fs_fsio_exit(struct bch_fs *c)
-{
- bioset_exit(&c->nocow_flush_bioset);
-}
-
-int bch2_fs_fsio_init(struct bch_fs *c)
-{
- if (bioset_init(&c->nocow_flush_bioset,
- 1, offsetof(struct nocow_flush, bio), 0))
- return -BCH_ERR_ENOMEM_nocow_flush_bioset_init;
-
- return 0;
-}
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h
deleted file mode 100644
index ca70346e68dc..000000000000
--- a/fs/bcachefs/fs-io.h
+++ /dev/null
@@ -1,184 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_IO_H
-#define _BCACHEFS_FS_IO_H
-
-#ifndef NO_BCACHEFS_FS
-
-#include "buckets.h"
-#include "fs.h"
-#include "io_write_types.h"
-#include "quota.h"
-
-#include <linux/uio.h>
-
-struct folio_vec {
- struct folio *fv_folio;
- size_t fv_offset;
- size_t fv_len;
-};
-
-static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv)
-{
-
- struct folio *folio = page_folio(bv.bv_page);
- size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) +
- bv.bv_offset;
- size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len);
-
- return (struct folio_vec) {
- .fv_folio = folio,
- .fv_offset = offset,
- .fv_len = len,
- };
-}
-
-static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio,
- struct bvec_iter iter)
-{
- return biovec_to_foliovec(bio_iter_iovec(bio, iter));
-}
-
-#define __bio_for_each_folio(bvl, bio, iter, start) \
- for (iter = (start); \
- (iter).bi_size && \
- ((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \
- bio_advance_iter_single((bio), &(iter), (bvl).fv_len))
-
-/**
- * bio_for_each_folio - iterate over folios within a bio
- *
- * Like other non-_all versions, this iterates over what bio->bi_iter currently
- * points to. This version is for drivers, where the bio may have previously
- * been split or cloned.
- */
-#define bio_for_each_folio(bvl, bio, iter) \
- __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter)
-
-struct quota_res {
- u64 sectors;
-};
-
-#ifdef CONFIG_BCACHEFS_QUOTA
-
-static inline void __bch2_quota_reservation_put(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res)
-{
- BUG_ON(res->sectors > inode->ei_quota_reserved);
-
- bch2_quota_acct(c, inode->ei_qid, Q_SPC,
- -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC);
- inode->ei_quota_reserved -= res->sectors;
- res->sectors = 0;
-}
-
-static inline void bch2_quota_reservation_put(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res)
-{
- if (res->sectors) {
- mutex_lock(&inode->ei_quota_lock);
- __bch2_quota_reservation_put(c, inode, res);
- mutex_unlock(&inode->ei_quota_lock);
- }
-}
-
-static inline int bch2_quota_reservation_add(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res,
- u64 sectors,
- bool check_enospc)
-{
- int ret;
-
- if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags))
- return 0;
-
- mutex_lock(&inode->ei_quota_lock);
- ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
- check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK);
- if (likely(!ret)) {
- inode->ei_quota_reserved += sectors;
- res->sectors += sectors;
- }
- mutex_unlock(&inode->ei_quota_lock);
-
- return ret;
-}
-
-#else
-
-static inline void __bch2_quota_reservation_put(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res) {}
-
-static inline void bch2_quota_reservation_put(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res) {}
-
-static inline int bch2_quota_reservation_add(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res,
- unsigned sectors,
- bool check_enospc)
-{
- return 0;
-}
-
-#endif
-
-void __bch2_i_sectors_acct(struct bch_fs *, struct bch_inode_info *,
- struct quota_res *, s64);
-
-static inline void bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
- struct quota_res *quota_res, s64 sectors)
-{
- if (sectors) {
- mutex_lock(&inode->ei_quota_lock);
- __bch2_i_sectors_acct(c, inode, quota_res, sectors);
- mutex_unlock(&inode->ei_quota_lock);
- }
-}
-
-static inline struct address_space *faults_disabled_mapping(void)
-{
- return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
-}
-
-static inline void set_fdm_dropped_locks(void)
-{
- current->faults_disabled_mapping =
- (void *) (((unsigned long) current->faults_disabled_mapping)|1);
-}
-
-static inline bool fdm_dropped_locks(void)
-{
- return ((unsigned long) current->faults_disabled_mapping) & 1;
-}
-
-void bch2_inode_flush_nocow_writes_async(struct bch_fs *,
- struct bch_inode_info *, struct closure *);
-
-int __must_check bch2_write_inode_size(struct bch_fs *,
- struct bch_inode_info *,
- loff_t, unsigned);
-
-int bch2_fsync(struct file *, loff_t, loff_t, int);
-
-int bchfs_truncate(struct mnt_idmap *,
- struct bch_inode_info *, struct iattr *);
-long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
-
-loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
- loff_t, loff_t, unsigned);
-
-loff_t bch2_llseek(struct file *, loff_t, int);
-
-void bch2_fs_fsio_exit(struct bch_fs *);
-int bch2_fs_fsio_init(struct bch_fs *);
-#else
-static inline void bch2_fs_fsio_exit(struct bch_fs *c) {}
-static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; }
-#endif
-
-#endif /* _BCACHEFS_FS_IO_H */
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
deleted file mode 100644
index 4e72e654da96..000000000000
--- a/fs/bcachefs/fs-ioctl.c
+++ /dev/null
@@ -1,442 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "chardev.h"
-#include "dirent.h"
-#include "fs.h"
-#include "fs-ioctl.h"
-#include "namei.h"
-#include "quota.h"
-
-#include <linux/compat.h>
-#include <linux/fsnotify.h>
-#include <linux/mount.h>
-#include <linux/namei.h>
-#include <linux/security.h>
-#include <linux/writeback.h>
-
-#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32)
-#define FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */
-#define FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */
-#define FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
-
-static int bch2_reinherit_attrs_fn(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct bch_inode_info *dir = p;
-
- return !bch2_reinherit_attrs(bi, &dir->ei_inode);
-}
-
-static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
- struct file *file,
- struct bch_inode_info *src,
- const char __user *name)
-{
- struct bch_hash_info hash = bch2_hash_info_init(c, &src->ei_inode);
- struct bch_inode_info *dst;
- struct inode *vinode = NULL;
- char *kname = NULL;
- struct qstr qstr;
- int ret = 0;
- subvol_inum inum;
-
- kname = kmalloc(BCH_NAME_MAX, GFP_KERNEL);
- if (!kname)
- return -ENOMEM;
-
- ret = strncpy_from_user(kname, name, BCH_NAME_MAX);
- if (unlikely(ret < 0))
- goto err1;
-
- qstr.len = ret;
- qstr.name = kname;
-
- ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum);
- if (ret)
- goto err1;
-
- vinode = bch2_vfs_inode_get(c, inum);
- ret = PTR_ERR_OR_ZERO(vinode);
- if (ret)
- goto err1;
-
- dst = to_bch_ei(vinode);
-
- ret = mnt_want_write_file(file);
- if (ret)
- goto err2;
-
- bch2_lock_inodes(INODE_UPDATE_LOCK, src, dst);
-
- if (inode_attr_changing(src, dst, Inode_opt_project)) {
- ret = bch2_fs_quota_transfer(c, dst,
- src->ei_qid,
- 1 << QTYP_PRJ,
- KEY_TYPE_QUOTA_PREALLOC);
- if (ret)
- goto err3;
- }
-
- ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0);
-err3:
- bch2_unlock_inodes(INODE_UPDATE_LOCK, src, dst);
-
- /* return true if we did work */
- if (ret >= 0)
- ret = !ret;
-
- mnt_drop_write_file(file);
-err2:
- iput(vinode);
-err1:
- kfree(kname);
-
- return ret;
-}
-
-static int bch2_ioc_getversion(struct bch_inode_info *inode, u32 __user *arg)
-{
- return put_user(inode->v.i_generation, arg);
-}
-
-static int bch2_ioc_getlabel(struct bch_fs *c, char __user *user_label)
-{
- int ret;
- size_t len;
- char label[BCH_SB_LABEL_SIZE];
-
- BUILD_BUG_ON(BCH_SB_LABEL_SIZE >= FSLABEL_MAX);
-
- mutex_lock(&c->sb_lock);
- memcpy(label, c->disk_sb.sb->label, BCH_SB_LABEL_SIZE);
- mutex_unlock(&c->sb_lock);
-
- len = strnlen(label, BCH_SB_LABEL_SIZE);
- if (len == BCH_SB_LABEL_SIZE) {
- bch_warn(c,
- "label is too long, return the first %zu bytes",
- --len);
- }
-
- ret = copy_to_user(user_label, label, len);
-
- return ret ? -EFAULT : 0;
-}
-
-static int bch2_ioc_setlabel(struct bch_fs *c,
- struct file *file,
- struct bch_inode_info *inode,
- const char __user *user_label)
-{
- int ret;
- char label[BCH_SB_LABEL_SIZE];
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (copy_from_user(label, user_label, sizeof(label)))
- return -EFAULT;
-
- if (strnlen(label, BCH_SB_LABEL_SIZE) == BCH_SB_LABEL_SIZE) {
- bch_err(c,
- "unable to set label with more than %d bytes",
- BCH_SB_LABEL_SIZE - 1);
- return -EINVAL;
- }
-
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
- mutex_lock(&c->sb_lock);
- strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE);
- ret = bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- mnt_drop_write_file(file);
- return ret;
-}
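The get/set label handlers above are reached from userspace through the generic FS_IOC_GETFSLABEL/FS_IOC_SETFSLABEL ioctls. A minimal, hedged sketch of that usage, assuming only <linux/fs.h> (which defines the request numbers and FSLABEL_MAX); relabel() is a hypothetical helper, not part of the code above:

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int relabel(int fd, const char *new_label)
{
	char label[FSLABEL_MAX] = "";	/* zeroed so the returned label is NUL-terminated */

	if (ioctl(fd, FS_IOC_GETFSLABEL, label))
		return -1;
	printf("old label: %s\n", label);

	memset(label, 0, sizeof(label));
	strncpy(label, new_label, sizeof(label) - 1);
	return ioctl(fd, FS_IOC_SETFSLABEL, label);	/* needs CAP_SYS_ADMIN */
}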
-
-static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
-{
- u32 flags;
- int ret = 0;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (get_user(flags, arg))
- return -EFAULT;
-
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
-
- prt_printf(&buf, "shutdown by ioctl type %u", flags);
-
- switch (flags) {
- case FSOP_GOING_FLAGS_DEFAULT:
- ret = bdev_freeze(c->vfs_sb->s_bdev);
- if (ret)
- break;
- bch2_journal_flush(&c->journal);
- bch2_fs_emergency_read_only2(c, &buf);
- bdev_thaw(c->vfs_sb->s_bdev);
- break;
- case FSOP_GOING_FLAGS_LOGFLUSH:
- bch2_journal_flush(&c->journal);
- fallthrough;
- case FSOP_GOING_FLAGS_NOLOGFLUSH:
- bch2_fs_emergency_read_only2(c, &buf);
- break;
- default:
- ret = -EINVAL;
- goto noprint;
- }
-
- bch2_print_str(c, KERN_ERR, buf.buf);
-noprint:
- printbuf_exit(&buf);
- return ret;
-}
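bch2_ioc_goingdown() mirrors the XFS shutdown ioctl. A hypothetical userspace sketch that drives it; FS_IOC_GOINGDOWN is re-declared with the same _IOR('X', 125, ...) encoding used above, shutdown_fs() is an illustrative name, and error handling is trimmed:

#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#define FS_IOC_GOINGDOWN _IOR('X', 125, uint32_t)

/* flags: 0 = freeze + flush journal, 1 = flush journal only, 2 = no flush */
int shutdown_fs(const char *mountpoint, uint32_t flags)
{
	int fd = open(mountpoint, O_RDONLY);
	if (fd < 0)
		return -1;

	int ret = ioctl(fd, FS_IOC_GOINGDOWN, &flags);	/* requires CAP_SYS_ADMIN */
	close(fd);
	return ret;
}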
-
-static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
- struct bch_ioctl_subvolume arg)
-{
- struct inode *dir;
- struct bch_inode_info *inode;
- struct user_namespace *s_user_ns;
- struct dentry *dst_dentry;
- struct path src_path, dst_path;
- int how = LOOKUP_FOLLOW;
- int error;
- subvol_inum snapshot_src = { 0 };
- unsigned lookup_flags = 0;
- unsigned create_flags = BCH_CREATE_SUBVOL;
-
- if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE|
- BCH_SUBVOL_SNAPSHOT_RO))
- return -EINVAL;
-
- if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
- (arg.src_ptr ||
- (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)))
- return -EINVAL;
-
- if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
- create_flags |= BCH_CREATE_SNAPSHOT;
-
- if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)
- create_flags |= BCH_CREATE_SNAPSHOT_RO;
-
- if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) {
-		/* sync_inodes_sb() requires that s_umount be held */
- down_read(&c->vfs_sb->s_umount);
- sync_inodes_sb(c->vfs_sb);
- up_read(&c->vfs_sb->s_umount);
- }
-
- if (arg.src_ptr) {
- error = user_path_at(arg.dirfd,
- (const char __user *)(unsigned long)arg.src_ptr,
- how, &src_path);
- if (error)
- goto err1;
-
- if (src_path.dentry->d_sb->s_fs_info != c) {
- path_put(&src_path);
- error = -EXDEV;
- goto err1;
- }
-
- snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode));
- }
-
- dst_dentry = user_path_create(arg.dirfd,
- (const char __user *)(unsigned long)arg.dst_ptr,
- &dst_path, lookup_flags);
- error = PTR_ERR_OR_ZERO(dst_dentry);
- if (error)
- goto err2;
-
- if (dst_dentry->d_sb->s_fs_info != c) {
- error = -EXDEV;
- goto err3;
- }
-
- if (dst_dentry->d_inode) {
- error = bch_err_throw(c, EEXIST_subvolume_create);
- goto err3;
- }
-
- dir = dst_path.dentry->d_inode;
- if (IS_DEADDIR(dir)) {
- error = bch_err_throw(c, ENOENT_directory_dead);
- goto err3;
- }
-
- s_user_ns = dir->i_sb->s_user_ns;
- if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
- !kgid_has_mapping(s_user_ns, current_fsgid())) {
- error = -EOVERFLOW;
- goto err3;
- }
-
- error = inode_permission(file_mnt_idmap(filp),
- dir, MAY_WRITE | MAY_EXEC);
- if (error)
- goto err3;
-
- if (!IS_POSIXACL(dir))
- arg.mode &= ~current_umask();
-
- error = security_path_mkdir(&dst_path, dst_dentry, arg.mode);
- if (error)
- goto err3;
-
- if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
- !arg.src_ptr)
- snapshot_src.subvol = inode_inum(to_bch_ei(dir)).subvol;
-
- down_write(&c->snapshot_create_lock);
- inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir),
- dst_dentry, arg.mode|S_IFDIR,
- 0, snapshot_src, create_flags);
- up_write(&c->snapshot_create_lock);
-
- error = PTR_ERR_OR_ZERO(inode);
- if (error)
- goto err3;
-
- d_instantiate(dst_dentry, &inode->v);
- fsnotify_mkdir(dir, dst_dentry);
-err3:
- done_path_create(&dst_path, dst_dentry);
-err2:
- if (arg.src_ptr)
- path_put(&src_path);
-err1:
- return error;
-}
-
-static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
- struct bch_ioctl_subvolume arg)
-{
- const char __user *name = (void __user *)(unsigned long)arg.dst_ptr;
- struct path path;
- struct inode *dir;
- struct dentry *victim;
- int ret = 0;
-
- if (arg.flags)
- return -EINVAL;
-
- victim = user_path_locked_at(arg.dirfd, name, &path);
- if (IS_ERR(victim))
- return PTR_ERR(victim);
-
- dir = d_inode(path.dentry);
- if (victim->d_sb->s_fs_info != c) {
- ret = -EXDEV;
- goto err;
- }
-
- ret = inode_permission(file_mnt_idmap(filp), d_inode(victim), MAY_WRITE) ?:
- __bch2_unlink(dir, victim, true);
- if (!ret) {
- fsnotify_rmdir(dir, victim);
- d_invalidate(victim);
- }
-err:
- inode_unlock(dir);
- dput(victim);
- path_put(&path);
- return ret;
-}
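For illustration, a hedged sketch of calling the subvolume-create path above from userspace. The layout of struct bch_ioctl_subvolume and the BCH_* request/flag values live in the bcachefs ioctl header, which is not shown here, so the field names below are assumptions grounded only in how the handler reads them:

#include <sys/ioctl.h>
#include "bcachefs_ioctl.h"	/* assumed: provides struct bch_ioctl_subvolume and BCH_* values */

int snapshot_subvol(int dirfd, const char *src, const char *dst)
{
	struct bch_ioctl_subvolume arg = {
		.flags   = BCH_SUBVOL_SNAPSHOT_CREATE | BCH_SUBVOL_SNAPSHOT_RO,
		.dirfd   = dirfd,			/* paths are resolved relative to this fd */
		.mode    = 0755,
		.dst_ptr = (unsigned long)dst,		/* where the snapshot appears */
		.src_ptr = (unsigned long)src,		/* 0: snapshot the subvolume containing dst */
	};

	return ioctl(dirfd, BCH_IOCTL_SUBVOLUME_CREATE, &arg);
}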
-
-long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
-{
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- long ret;
-
- switch (cmd) {
- case BCHFS_IOC_REINHERIT_ATTRS:
- ret = bch2_ioc_reinherit_attrs(c, file, inode,
- (void __user *) arg);
- break;
-
- case FS_IOC_GETVERSION:
- ret = bch2_ioc_getversion(inode, (u32 __user *) arg);
- break;
-
- case FS_IOC_SETVERSION:
- ret = -ENOTTY;
- break;
-
- case FS_IOC_GETFSLABEL:
- ret = bch2_ioc_getlabel(c, (void __user *) arg);
- break;
-
- case FS_IOC_SETFSLABEL:
- ret = bch2_ioc_setlabel(c, file, inode, (const void __user *) arg);
- break;
-
- case FS_IOC_GOINGDOWN:
- ret = bch2_ioc_goingdown(c, (u32 __user *) arg);
- break;
-
- case BCH_IOCTL_SUBVOLUME_CREATE: {
- struct bch_ioctl_subvolume i;
-
- ret = copy_from_user(&i, (void __user *) arg, sizeof(i))
- ? -EFAULT
- : bch2_ioctl_subvolume_create(c, file, i);
- break;
- }
-
- case BCH_IOCTL_SUBVOLUME_DESTROY: {
- struct bch_ioctl_subvolume i;
-
- ret = copy_from_user(&i, (void __user *) arg, sizeof(i))
- ? -EFAULT
- : bch2_ioctl_subvolume_destroy(c, file, i);
- break;
- }
-
- default:
- ret = bch2_fs_ioctl(c, cmd, (void __user *) arg);
- break;
- }
-
- return bch2_err_class(ret);
-}
-
-#ifdef CONFIG_COMPAT
-long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg)
-{
- /* These are just misnamed, they actually get/put from/to user an int */
- switch (cmd) {
- case FS_IOC32_GETFLAGS:
- cmd = FS_IOC_GETFLAGS;
- break;
- case FS_IOC32_SETFLAGS:
- cmd = FS_IOC_SETFLAGS;
- break;
- case FS_IOC32_GETVERSION:
- cmd = FS_IOC_GETVERSION;
- break;
- case FS_IOC_GETFSLABEL:
- case FS_IOC_SETFSLABEL:
- break;
- default:
- return -ENOIOCTLCMD;
- }
- return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
-}
-#endif
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h
deleted file mode 100644
index a657e4994b71..000000000000
--- a/fs/bcachefs/fs-ioctl.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_IOCTL_H
-#define _BCACHEFS_FS_IOCTL_H
-
-long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long);
-long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long);
-
-#endif /* _BCACHEFS_FS_IOCTL_H */
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
deleted file mode 100644
index 687af0eea0c2..000000000000
--- a/fs/bcachefs/fs.c
+++ /dev/null
@@ -1,2768 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "acl.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "chardev.h"
-#include "dirent.h"
-#include "errcode.h"
-#include "extents.h"
-#include "fs.h"
-#include "fs-io.h"
-#include "fs-ioctl.h"
-#include "fs-io-buffered.h"
-#include "fs-io-direct.h"
-#include "fs-io-pagecache.h"
-#include "fsck.h"
-#include "inode.h"
-#include "io_read.h"
-#include "journal.h"
-#include "keylist.h"
-#include "namei.h"
-#include "quota.h"
-#include "rebalance.h"
-#include "snapshot.h"
-#include "super.h"
-#include "xattr.h"
-#include "trace.h"
-
-#include <linux/aio.h>
-#include <linux/backing-dev.h>
-#include <linux/exportfs.h>
-#include <linux/fiemap.h>
-#include <linux/fileattr.h>
-#include <linux/fs_context.h>
-#include <linux/module.h>
-#include <linux/pagemap.h>
-#include <linux/posix_acl.h>
-#include <linux/random.h>
-#include <linux/seq_file.h>
-#include <linux/siphash.h>
-#include <linux/statfs.h>
-#include <linux/string.h>
-#include <linux/xattr.h>
-
-static struct kmem_cache *bch2_inode_cache;
-
-static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
- struct bch_inode_info *,
- struct bch_inode_unpacked *,
- struct bch_subvolume *);
-
-/* Set VFS inode flags from bcachefs inode: */
-static inline void bch2_inode_flags_to_vfs(struct bch_fs *c, struct bch_inode_info *inode)
-{
- static const __maybe_unused unsigned bch_flags_to_vfs[] = {
- [__BCH_INODE_sync] = S_SYNC,
- [__BCH_INODE_immutable] = S_IMMUTABLE,
- [__BCH_INODE_append] = S_APPEND,
- [__BCH_INODE_noatime] = S_NOATIME,
- };
-
- set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
-
- if (bch2_inode_casefold(c, &inode->ei_inode))
- inode->v.i_flags |= S_CASEFOLD;
- else
- inode->v.i_flags &= ~S_CASEFOLD;
-}
-
-void bch2_inode_update_after_write(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- unsigned fields)
-{
- struct bch_fs *c = trans->c;
-
- BUG_ON(bi->bi_inum != inode->v.i_ino);
-
- bch2_assert_pos_locked(trans, BTREE_ID_inodes, POS(0, bi->bi_inum));
-
- set_nlink(&inode->v, bch2_inode_nlink_get(bi));
- i_uid_write(&inode->v, bi->bi_uid);
- i_gid_write(&inode->v, bi->bi_gid);
- inode->v.i_mode = bi->bi_mode;
-
- if (fields & ATTR_SIZE)
- i_size_write(&inode->v, bi->bi_size);
-
- if (fields & ATTR_ATIME)
- inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime));
- if (fields & ATTR_MTIME)
- inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime));
- if (fields & ATTR_CTIME)
- inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime));
-
- inode->ei_inode = *bi;
-
- bch2_inode_flags_to_vfs(c, inode);
-}
-
-int __must_check bch2_write_inode(struct bch_fs *c,
- struct bch_inode_info *inode,
- inode_set_fn set,
- void *p, unsigned fields)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = {};
- struct bch_inode_unpacked inode_u;
- int ret;
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent);
- if (ret)
- goto err;
-
- struct bch_extent_rebalance old_r = bch2_inode_rebalance_opts_get(c, &inode_u);
-
- ret = (set ? set(trans, inode, &inode_u, p) : 0);
- if (ret)
- goto err;
-
- struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u);
- bool rebalance_changed = memcmp(&old_r, &new_r, sizeof(new_r));
-
- if (rebalance_changed) {
- ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum);
- if (ret)
- goto err;
- }
-
- ret = bch2_inode_write(trans, &iter, &inode_u) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-
- /*
- * the btree node lock protects inode->ei_inode, not ei_update_lock;
- * this is important for inode updates via bchfs_write_index_update
- */
- if (!ret)
- bch2_inode_update_after_write(trans, inode, &inode_u, fields);
-err:
- bch2_trans_iter_exit(trans, &iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- if (rebalance_changed)
- bch2_rebalance_wakeup(c);
-
- bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
- "%s: inode %llu:%llu not found when updating",
- bch2_err_str(ret),
- inode_inum(inode).subvol,
- inode_inum(inode).inum);
-
- bch2_trans_put(trans);
- return ret < 0 ? ret : 0;
-}
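bch2_write_inode() is one instance of the transaction-restart pattern used throughout this file. Reduced to its skeleton, with do_updates() as a hypothetical stand-in for whatever btree updates the caller performs, the idiom looks roughly like this sketch:

static int example_trans_update(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	int ret;
retry:
	bch2_trans_begin(trans);

	ret = do_updates(trans) ?:			/* hypothetical update step */
	      bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;				/* locks were dropped; redo the whole transaction */

	bch2_trans_put(trans);
	return ret;
}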
-
-int bch2_fs_quota_transfer(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct bch_qid new_qid,
- unsigned qtypes,
- enum quota_acct_mode mode)
-{
- unsigned i;
- int ret;
-
- qtypes &= enabled_qtypes(c);
-
- for (i = 0; i < QTYP_NR; i++)
- if (new_qid.q[i] == inode->ei_qid.q[i])
- qtypes &= ~(1U << i);
-
- if (!qtypes)
- return 0;
-
- mutex_lock(&inode->ei_quota_lock);
-
- ret = bch2_quota_transfer(c, qtypes, new_qid,
- inode->ei_qid,
- inode->v.i_blocks +
- inode->ei_quota_reserved,
- mode);
- if (!ret)
- for (i = 0; i < QTYP_NR; i++)
- if (qtypes & (1 << i))
- inode->ei_qid.q[i] = new_qid.q[i];
-
- mutex_unlock(&inode->ei_quota_lock);
-
- return ret;
-}
-
-static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed)
-{
- const subvol_inum *inum = data;
- siphash_key_t k = { .key[0] = seed };
-
- return siphash_2u64(inum->subvol, inum->inum, &k);
-}
-
-static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed)
-{
- const struct bch_inode_info *inode = data;
-
- return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed);
-}
-
-static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg,
- const void *obj)
-{
- const struct bch_inode_info *inode = obj;
- const subvol_inum *v = arg->key;
-
- return !subvol_inum_eq(inode->ei_inum, *v);
-}
-
-static const struct rhashtable_params bch2_vfs_inodes_params = {
- .head_offset = offsetof(struct bch_inode_info, hash),
- .key_offset = offsetof(struct bch_inode_info, ei_inum),
- .key_len = sizeof(subvol_inum),
- .hashfn = bch2_vfs_inode_hash_fn,
- .obj_hashfn = bch2_vfs_inode_obj_hash_fn,
- .obj_cmpfn = bch2_vfs_inode_cmp_fn,
- .automatic_shrinking = true,
-};
-
-static const struct rhashtable_params bch2_vfs_inodes_by_inum_params = {
- .head_offset = offsetof(struct bch_inode_info, by_inum_hash),
- .key_offset = offsetof(struct bch_inode_info, ei_inum.inum),
- .key_len = sizeof(u64),
- .automatic_shrinking = true,
-};
-
-int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p)
-{
- struct bch_fs *c = trans->c;
- struct rhltable *ht = &c->vfs_inodes_by_inum_table;
- u64 inum = p.offset;
- DARRAY(u32) subvols;
- int ret = 0;
-
- if (!test_bit(BCH_FS_started, &c->flags))
- return false;
-
- darray_init(&subvols);
-restart_from_top:
-
- /*
- * Tweaked version of __rhashtable_lookup(); we need to get a list of
- * subvolumes in which the given inode number is open.
- *
- * For this to work, we don't include the subvolume ID in the key that
- * we hash - all inodes with the same inode number regardless of
- * subvolume will hash to the same slot.
- *
- * This will be less than ideal if the same file is ever open
- * simultaneously in many different snapshots:
- */
- rcu_read_lock();
- struct rhash_lock_head __rcu *const *bkt;
- struct rhash_head *he;
- unsigned int hash;
- struct bucket_table *tbl = rht_dereference_rcu(ht->ht.tbl, &ht->ht);
-restart:
- hash = rht_key_hashfn(&ht->ht, tbl, &inum, bch2_vfs_inodes_by_inum_params);
- bkt = rht_bucket(tbl, hash);
- do {
- struct bch_inode_info *inode;
-
- rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) {
- if (inode->ei_inum.inum == inum) {
- ret = darray_push_gfp(&subvols, inode->ei_inum.subvol,
- GFP_NOWAIT|__GFP_NOWARN);
- if (ret) {
- rcu_read_unlock();
- ret = darray_make_room(&subvols, 1);
- if (ret)
- goto err;
- subvols.nr = 0;
- goto restart_from_top;
- }
- }
- }
- /* An object might have been moved to a different hash chain,
- * while we walk along it - better check and retry.
- */
- } while (he != RHT_NULLS_MARKER(bkt));
-
- /* Ensure we see any new tables. */
- smp_rmb();
-
- tbl = rht_dereference_rcu(tbl->future_tbl, &ht->ht);
- if (unlikely(tbl))
- goto restart;
- rcu_read_unlock();
-
- darray_for_each(subvols, i) {
- u32 snap;
- ret = bch2_subvolume_get_snapshot(trans, *i, &snap);
- if (ret)
- goto err;
-
- ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot);
- if (ret)
- break;
- }
-err:
- darray_exit(&subvols);
- return ret;
-}
-
-static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
-{
- return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
-}
-
-static void __wait_on_freeing_inode(struct bch_fs *c,
- struct bch_inode_info *inode,
- subvol_inum inum)
-{
- wait_queue_head_t *wq;
- struct wait_bit_queue_entry wait;
-
- wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW);
- prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
- spin_unlock(&inode->v.i_lock);
-
- if (__bch2_inode_hash_find(c, inum) == inode)
- schedule_timeout(HZ * 10);
- finish_wait(wq, &wait.wq_entry);
-}
-
-static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans,
- subvol_inum inum)
-{
- struct bch_inode_info *inode;
-repeat:
- inode = __bch2_inode_hash_find(c, inum);
- if (inode) {
- spin_lock(&inode->v.i_lock);
- if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) {
- spin_unlock(&inode->v.i_lock);
- return NULL;
- }
- if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) {
- if (!trans) {
- __wait_on_freeing_inode(c, inode, inum);
- } else {
- int ret = drop_locks_do(trans,
- (__wait_on_freeing_inode(c, inode, inum), 0));
- if (ret)
- return ERR_PTR(ret);
- }
- goto repeat;
- }
- __iget(&inode->v);
- spin_unlock(&inode->v.i_lock);
- }
-
- return inode;
-}
-
-static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode)
-{
- spin_lock(&inode->v.i_lock);
- bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags);
- spin_unlock(&inode->v.i_lock);
-
- if (remove) {
- int ret = rhltable_remove(&c->vfs_inodes_by_inum_table,
- &inode->by_inum_hash, bch2_vfs_inodes_by_inum_params);
- BUG_ON(ret);
-
- ret = rhashtable_remove_fast(&c->vfs_inodes_table,
- &inode->hash, bch2_vfs_inodes_params);
- BUG_ON(ret);
- inode->v.i_hash.pprev = NULL;
- /*
- * This pairs with the bch2_inode_hash_find() ->
- * __wait_on_freeing_inode() path
- */
- inode_wake_up_bit(&inode->v, __I_NEW);
- }
-}
-
-static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
- struct btree_trans *trans,
- struct bch_inode_info *inode)
-{
- struct bch_inode_info *old = inode;
-
- set_bit(EI_INODE_HASHED, &inode->ei_flags);
-retry:
- if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table,
- &inode->ei_inum,
- &inode->hash,
- bch2_vfs_inodes_params))) {
- old = bch2_inode_hash_find(c, trans, inode->ei_inum);
- if (!old)
- goto retry;
-
- clear_bit(EI_INODE_HASHED, &inode->ei_flags);
-
- /*
- * bcachefs doesn't use I_NEW; we have no use for it since we
- * only insert fully created inodes in the inode hash table. But
- * discard_new_inode() expects it to be set...
- */
- inode->v.i_state |= I_NEW;
- /*
- * We don't want bch2_evict_inode() to delete the inode on disk,
- * we just raced and had another inode in cache. Normally new
- * inodes don't have nlink == 0 - except tmpfiles do...
- */
- set_nlink(&inode->v, 1);
- discard_new_inode(&inode->v);
- return old;
- } else {
- int ret = rhltable_insert(&c->vfs_inodes_by_inum_table,
- &inode->by_inum_hash,
- bch2_vfs_inodes_by_inum_params);
- BUG_ON(ret);
-
- inode_fake_hash(&inode->v);
-
- inode_sb_list_add(&inode->v);
-
- mutex_lock(&c->vfs_inodes_lock);
- list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
- mutex_unlock(&c->vfs_inodes_lock);
- return inode;
- }
-}
-
-#define memalloc_flags_do(_flags, _do) \
-({ \
- unsigned _saved_flags = memalloc_flags_save(_flags); \
- typeof(_do) _ret = _do; \
- memalloc_noreclaim_restore(_saved_flags); \
- _ret; \
-})
-
-static struct inode *bch2_alloc_inode(struct super_block *sb)
-{
- BUG();
-}
-
-static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c, gfp_t gfp)
-{
- struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb,
- bch2_inode_cache, gfp);
- if (!inode)
- return NULL;
-
- inode_init_once(&inode->v);
- mutex_init(&inode->ei_update_lock);
- two_state_lock_init(&inode->ei_pagecache_lock);
- INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
- inode->ei_flags = 0;
- mutex_init(&inode->ei_quota_lock);
- memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
-
- if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v, gfp))) {
- kmem_cache_free(bch2_inode_cache, inode);
- return NULL;
- }
-
- return inode;
-}
-
-/*
- * Allocate a new inode, dropping/retaking btree locks if necessary:
- */
-static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
-{
- struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWAIT);
-
- if (unlikely(!inode)) {
- int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c, GFP_NOFS)) ? 0 : -ENOMEM);
- if (ret && inode) {
- __destroy_inode(&inode->v);
- kmem_cache_free(bch2_inode_cache, inode);
- }
- if (ret)
- return ERR_PTR(ret);
- }
-
- return inode;
-}
-
-static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *trans,
- subvol_inum inum,
- struct bch_inode_unpacked *bi,
- struct bch_subvolume *subvol)
-{
- struct bch_inode_info *inode = bch2_new_inode(trans);
- if (IS_ERR(inode))
- return inode;
-
- bch2_vfs_inode_init(trans, inum, inode, bi, subvol);
-
- return bch2_inode_hash_insert(trans->c, trans, inode);
-}
-
-struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
-{
- struct bch_inode_info *inode = bch2_inode_hash_find(c, NULL, inum);
- if (inode)
- return &inode->v;
-
- struct btree_trans *trans = bch2_trans_get(c);
-
- struct bch_inode_unpacked inode_u;
- struct bch_subvolume subvol;
- int ret = lockrestart_do(trans,
- bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
- bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
- PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
- bch2_trans_put(trans);
-
- return ret ? ERR_PTR(ret) : &inode->v;
-}
-
-struct bch_inode_info *
-__bch2_create(struct mnt_idmap *idmap,
- struct bch_inode_info *dir, struct dentry *dentry,
- umode_t mode, dev_t rdev, subvol_inum snapshot_src,
- unsigned flags)
-{
- struct bch_fs *c = dir->v.i_sb->s_fs_info;
- struct btree_trans *trans;
- struct bch_inode_unpacked dir_u;
- struct bch_inode_info *inode;
- struct bch_inode_unpacked inode_u;
- struct posix_acl *default_acl = NULL, *acl = NULL;
- subvol_inum inum;
- struct bch_subvolume subvol;
- u64 journal_seq = 0;
- kuid_t kuid;
- kgid_t kgid;
- int ret;
-
- /*
- * preallocate acls + vfs inode before btree transaction, so that
- * nothing can fail after the transaction succeeds:
- */
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
- ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
- if (ret)
- return ERR_PTR(ret);
-#endif
- inode = __bch2_new_inode(c, GFP_NOFS);
- if (unlikely(!inode)) {
- inode = ERR_PTR(-ENOMEM);
- goto err;
- }
-
- bch2_inode_init_early(c, &inode_u);
-
- if (!(flags & BCH_CREATE_TMPFILE))
- mutex_lock(&dir->ei_update_lock);
-
- trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
-
- kuid = mapped_fsuid(idmap, i_user_ns(&dir->v));
- kgid = mapped_fsgid(idmap, i_user_ns(&dir->v));
- ret = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?:
- bch2_create_trans(trans,
- inode_inum(dir), &dir_u, &inode_u,
- !(flags & BCH_CREATE_TMPFILE)
- ? &dentry->d_name : NULL,
- from_kuid(i_user_ns(&dir->v), kuid),
- from_kgid(i_user_ns(&dir->v), kgid),
- mode, rdev,
- default_acl, acl, snapshot_src, flags) ?:
- bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
- KEY_TYPE_QUOTA_PREALLOC);
- if (unlikely(ret))
- goto err_before_quota;
-
- inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol;
- inum.inum = inode_u.bi_inum;
-
- ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
- bch2_trans_commit(trans, NULL, &journal_seq, 0);
- if (unlikely(ret)) {
- bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
- KEY_TYPE_QUOTA_WARN);
-err_before_quota:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
- goto err_trans;
- }
-
- if (!(flags & BCH_CREATE_TMPFILE)) {
- bch2_inode_update_after_write(trans, dir, &dir_u,
- ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
- mutex_unlock(&dir->ei_update_lock);
- }
-
- bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
-
- set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
- set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
-
- /*
- * we must insert the new inode into the inode cache before calling
- * bch2_trans_exit() and dropping locks, else we could race with another
- * thread pulling the inode in and modifying it:
- *
- * also, calling bch2_inode_hash_insert() without passing in the
- * transaction object is sketchy - if we could ever end up in
- * __wait_on_freeing_inode(), we'd risk deadlock.
- *
- * But that shouldn't be possible, since we still have the inode locked
- * that we just created, and we _really_ can't take a transaction
- * restart here.
- */
- inode = bch2_inode_hash_insert(c, NULL, inode);
- bch2_trans_put(trans);
-err:
- posix_acl_release(default_acl);
- posix_acl_release(acl);
- return inode;
-err_trans:
- if (!(flags & BCH_CREATE_TMPFILE))
- mutex_unlock(&dir->ei_update_lock);
-
- bch2_trans_put(trans);
- make_bad_inode(&inode->v);
- iput(&inode->v);
- inode = ERR_PTR(ret);
- goto err;
-}
-
-/* methods */
-
-static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
- subvol_inum dir, struct bch_hash_info *dir_hash_info,
- const struct qstr *name)
-{
- struct bch_fs *c = trans->c;
- subvol_inum inum = {};
- struct printbuf buf = PRINTBUF;
-
- struct qstr lookup_name;
- int ret = bch2_maybe_casefold(trans, dir_hash_info, name, &lookup_name);
- if (ret)
- return ERR_PTR(ret);
-
- struct btree_iter dirent_iter = {};
- struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
- dir_hash_info, dir, &lookup_name, 0);
- ret = bkey_err(k);
- if (ret)
- return ERR_PTR(ret);
-
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-
- ret = bch2_dirent_read_target(trans, dir, d, &inum);
- if (ret > 0)
- ret = -ENOENT;
- if (ret)
- goto err;
-
- struct bch_inode_info *inode = bch2_inode_hash_find(c, trans, inum);
- if (inode)
- goto out;
-
- /*
- * Note: if check/repair needs it, we commit before
- * bch2_inode_hash_init_insert(), as after that point we can't take a
- * restart - not in the top level loop with a commit_do(), like we
- * usually do:
- */
-
- struct bch_subvolume subvol;
- struct bch_inode_unpacked inode_u;
- ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
- bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
- bch2_check_dirent_target(trans, &dirent_iter, d, &inode_u, false) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
- PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
-
- /*
- * don't remove it: check_inodes might find another inode that points
- * back to this dirent
- */
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
- c, "dirent to missing inode:\n%s",
- (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf));
- if (ret)
- goto err;
-out:
- bch2_trans_iter_exit(trans, &dirent_iter);
- printbuf_exit(&buf);
- return inode;
-err:
- inode = ERR_PTR(ret);
- goto out;
-}
-
-static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
- unsigned int flags)
-{
- struct bch_fs *c = vdir->i_sb->s_fs_info;
- struct bch_inode_info *dir = to_bch_ei(vdir);
- struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
-
- struct bch_inode_info *inode;
- bch2_trans_do(c,
- PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
- &hash, &dentry->d_name)));
- if (IS_ERR(inode))
- inode = NULL;
-
- if (!inode && IS_CASEFOLDED(vdir)) {
- /*
- * Do not cache a negative dentry in casefolded directories
- * as it would need to be invalidated in the following situation:
- * - Lookup file "blAH" in a casefolded directory
- * - Creation of file "BLAH" in a casefolded directory
- * - Lookup file "blAH" in a casefolded directory
- * which would fail if we had a negative dentry.
- *
- * We should come back to this when VFS has a method to handle
- * this edgecase.
- */
- return NULL;
- }
-
- return d_splice_alias(&inode->v, dentry);
-}
-
-static int bch2_mknod(struct mnt_idmap *idmap,
- struct inode *vdir, struct dentry *dentry,
- umode_t mode, dev_t rdev)
-{
- struct bch_inode_info *inode =
- __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev,
- (subvol_inum) { 0 }, 0);
-
- if (IS_ERR(inode))
- return bch2_err_class(PTR_ERR(inode));
-
- d_instantiate(dentry, &inode->v);
- return 0;
-}
-
-static int bch2_create(struct mnt_idmap *idmap,
- struct inode *vdir, struct dentry *dentry,
- umode_t mode, bool excl)
-{
- return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0);
-}
-
-static int __bch2_link(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct bch_inode_info *dir,
- struct dentry *dentry)
-{
- struct bch_inode_unpacked dir_u, inode_u;
- int ret;
-
- mutex_lock(&inode->ei_update_lock);
- struct btree_trans *trans = bch2_trans_get(c);
-
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_link_trans(trans,
- inode_inum(dir), &dir_u,
- inode_inum(inode), &inode_u,
- &dentry->d_name));
-
- if (likely(!ret)) {
- bch2_inode_update_after_write(trans, dir, &dir_u,
- ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
- bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
- }
-
- bch2_trans_put(trans);
- mutex_unlock(&inode->ei_update_lock);
- return ret;
-}
-
-static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
- struct dentry *dentry)
-{
- struct bch_fs *c = vdir->i_sb->s_fs_info;
- struct bch_inode_info *dir = to_bch_ei(vdir);
- struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
- int ret;
-
- lockdep_assert_held(&inode->v.i_rwsem);
-
- ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
- bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
- __bch2_link(c, inode, dir, dentry);
- if (unlikely(ret))
- return bch2_err_class(ret);
-
- ihold(&inode->v);
- d_instantiate(dentry, &inode->v);
- return 0;
-}
-
-int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
- bool deleting_snapshot)
-{
- struct bch_fs *c = vdir->i_sb->s_fs_info;
- struct bch_inode_info *dir = to_bch_ei(vdir);
- struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
- struct bch_inode_unpacked dir_u, inode_u;
- int ret;
-
- bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
-
- struct btree_trans *trans = bch2_trans_get(c);
-
- ret = commit_do(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
- bch2_unlink_trans(trans,
- inode_inum(dir), &dir_u,
- &inode_u, &dentry->d_name,
- deleting_snapshot));
- if (unlikely(ret))
- goto err;
-
- bch2_inode_update_after_write(trans, dir, &dir_u,
- ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
- bch2_inode_update_after_write(trans, inode, &inode_u,
- ATTR_MTIME);
-
- if (inode_u.bi_subvol) {
- /*
- * Subvolume deletion is asynchronous, but we still want to tell
- * the VFS that it's been deleted here:
- */
- set_nlink(&inode->v, 0);
- }
-
- if (IS_CASEFOLDED(vdir))
- d_invalidate(dentry);
-err:
- bch2_trans_put(trans);
- bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
-
- return ret;
-}
-
-static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
-{
-	struct bch_inode_info *dir = to_bch_ei(vdir);
- struct bch_fs *c = dir->v.i_sb->s_fs_info;
-
- int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
- __bch2_unlink(vdir, dentry, false);
- return bch2_err_class(ret);
-}
-
-static int bch2_symlink(struct mnt_idmap *idmap,
- struct inode *vdir, struct dentry *dentry,
- const char *symname)
-{
- struct bch_fs *c = vdir->i_sb->s_fs_info;
- struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
- int ret;
-
- inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
- (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
- if (IS_ERR(inode))
- return bch2_err_class(PTR_ERR(inode));
-
- inode_lock(&inode->v);
- ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
- inode_unlock(&inode->v);
-
- if (unlikely(ret))
- goto err;
-
- ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
- if (unlikely(ret))
- goto err;
-
- ret = __bch2_link(c, inode, dir, dentry);
- if (unlikely(ret))
- goto err;
-
- d_instantiate(dentry, &inode->v);
- return 0;
-err:
- iput(&inode->v);
- return bch2_err_class(ret);
-}
-
-static struct dentry *bch2_mkdir(struct mnt_idmap *idmap,
- struct inode *vdir, struct dentry *dentry, umode_t mode)
-{
- return ERR_PTR(bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0));
-}
-
-static int bch2_rename2(struct mnt_idmap *idmap,
- struct inode *src_vdir, struct dentry *src_dentry,
- struct inode *dst_vdir, struct dentry *dst_dentry,
- unsigned flags)
-{
- struct bch_fs *c = src_vdir->i_sb->s_fs_info;
- struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
- struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
- struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
- struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
- struct bch_inode_unpacked dst_dir_u, src_dir_u;
- struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u;
- struct btree_trans *trans;
- enum bch_rename_mode mode = flags & RENAME_EXCHANGE
- ? BCH_RENAME_EXCHANGE
- : dst_dentry->d_inode
- ? BCH_RENAME_OVERWRITE : BCH_RENAME;
- bool whiteout = !!(flags & RENAME_WHITEOUT);
- int ret;
-
- if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT))
- return -EINVAL;
-
- if (mode == BCH_RENAME_OVERWRITE) {
- ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
- 0, LLONG_MAX);
- if (ret)
- return ret;
- }
-
- bch2_lock_inodes(INODE_UPDATE_LOCK,
- src_dir,
- dst_dir,
- src_inode,
- dst_inode);
-
- trans = bch2_trans_get(c);
-
- ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?:
- bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol);
- if (ret)
- goto err_tx_restart;
-
- if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
- ret = bch2_fs_quota_transfer(c, src_inode,
- dst_dir->ei_qid,
- 1 << QTYP_PRJ,
- KEY_TYPE_QUOTA_PREALLOC);
- if (ret)
- goto err;
- }
-
- if (mode == BCH_RENAME_EXCHANGE &&
- inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
- ret = bch2_fs_quota_transfer(c, dst_inode,
- src_dir->ei_qid,
- 1 << QTYP_PRJ,
- KEY_TYPE_QUOTA_PREALLOC);
- if (ret)
- goto err;
- }
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_rename_trans(trans,
- inode_inum(src_dir), &src_dir_u,
- inode_inum(dst_dir), &dst_dir_u,
- &src_inode_u,
- &dst_inode_u,
- &src_dentry->d_name,
- &dst_dentry->d_name,
- mode);
- if (unlikely(ret))
- goto err_tx_restart;
-
- if (whiteout) {
- whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u));
- ret = PTR_ERR_OR_ZERO(whiteout_inode_u);
- if (unlikely(ret))
- goto err_tx_restart;
- bch2_inode_init_early(c, whiteout_inode_u);
-
- ret = bch2_create_trans(trans,
- inode_inum(src_dir), &src_dir_u,
- whiteout_inode_u,
- &src_dentry->d_name,
- from_kuid(i_user_ns(&src_dir->v), current_fsuid()),
- from_kgid(i_user_ns(&src_dir->v), current_fsgid()),
- S_IFCHR|WHITEOUT_MODE, 0,
- NULL, NULL, (subvol_inum) { 0 }, 0) ?:
- bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1,
- KEY_TYPE_QUOTA_PREALLOC);
- if (unlikely(ret))
- goto err_tx_restart;
- }
-
- ret = bch2_trans_commit(trans, NULL, NULL, 0);
- if (unlikely(ret)) {
-err_tx_restart:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
- goto err;
- }
-
- BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
- BUG_ON(dst_inode &&
- dst_inode->v.i_ino != dst_inode_u.bi_inum);
-
- bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
- ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
-
- if (src_dir != dst_dir)
- bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
- ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
-
- bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
- ATTR_CTIME);
-
- if (dst_inode)
- bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u,
- ATTR_CTIME);
-err:
- bch2_trans_put(trans);
-
- bch2_fs_quota_transfer(c, src_inode,
- bch_qid(&src_inode->ei_inode),
- 1 << QTYP_PRJ,
- KEY_TYPE_QUOTA_NOCHECK);
- if (dst_inode)
- bch2_fs_quota_transfer(c, dst_inode,
- bch_qid(&dst_inode->ei_inode),
- 1 << QTYP_PRJ,
- KEY_TYPE_QUOTA_NOCHECK);
-
- bch2_unlock_inodes(INODE_UPDATE_LOCK,
- src_dir,
- dst_dir,
- src_inode,
- dst_inode);
-
- return bch2_err_class(ret);
-}
-
-static void bch2_setattr_copy(struct mnt_idmap *idmap,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- struct iattr *attr)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- unsigned int ia_valid = attr->ia_valid;
- kuid_t kuid;
- kgid_t kgid;
-
- if (ia_valid & ATTR_UID) {
- kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
- bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid);
- }
- if (ia_valid & ATTR_GID) {
- kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
- bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid);
- }
-
- if (ia_valid & ATTR_SIZE)
- bi->bi_size = attr->ia_size;
-
- if (ia_valid & ATTR_ATIME)
- bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
- if (ia_valid & ATTR_MTIME)
- bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
- if (ia_valid & ATTR_CTIME)
- bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);
-
- if (ia_valid & ATTR_MODE) {
- umode_t mode = attr->ia_mode;
- kgid_t gid = ia_valid & ATTR_GID
- ? kgid
- : inode->v.i_gid;
-
- if (!in_group_or_capable(idmap, &inode->v,
- make_vfsgid(idmap, i_user_ns(&inode->v), gid)))
- mode &= ~S_ISGID;
- bi->bi_mode = mode;
- }
-}
-
-int bch2_setattr_nonsize(struct mnt_idmap *idmap,
- struct bch_inode_info *inode,
- struct iattr *attr)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_qid qid;
- struct btree_trans *trans;
- struct btree_iter inode_iter = {};
- struct bch_inode_unpacked inode_u;
- struct posix_acl *acl = NULL;
- kuid_t kuid;
- kgid_t kgid;
- int ret;
-
- mutex_lock(&inode->ei_update_lock);
-
- qid = inode->ei_qid;
-
- if (attr->ia_valid & ATTR_UID) {
- kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
- qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid);
- }
-
- if (attr->ia_valid & ATTR_GID) {
- kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
- qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid);
- }
-
- ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
- KEY_TYPE_QUOTA_PREALLOC);
- if (ret)
- goto err;
-
- trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
- kfree(acl);
- acl = NULL;
-
- ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
- BTREE_ITER_intent);
- if (ret)
- goto btree_err;
-
- bch2_setattr_copy(idmap, inode, &inode_u, attr);
-
- if (attr->ia_valid & ATTR_MODE) {
- ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u,
- inode_u.bi_mode, &acl);
- if (ret)
- goto btree_err;
- }
-
- ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
-btree_err:
- bch2_trans_iter_exit(trans, &inode_iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
- if (unlikely(ret))
- goto err_trans;
-
- bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid);
-
- if (acl)
- set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
-err_trans:
- bch2_trans_put(trans);
-err:
- mutex_unlock(&inode->ei_update_lock);
-
- return bch2_err_class(ret);
-}
-
-static int bch2_getattr(struct mnt_idmap *idmap,
- const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned query_flags)
-{
- struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v);
- vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v);
-
- stat->dev = inode->v.i_sb->s_dev;
- stat->ino = inode->v.i_ino;
- stat->mode = inode->v.i_mode;
- stat->nlink = inode->v.i_nlink;
- stat->uid = vfsuid_into_kuid(vfsuid);
- stat->gid = vfsgid_into_kgid(vfsgid);
- stat->rdev = inode->v.i_rdev;
- stat->size = i_size_read(&inode->v);
- stat->atime = inode_get_atime(&inode->v);
- stat->mtime = inode_get_mtime(&inode->v);
- stat->ctime = inode_get_ctime(&inode->v);
- stat->blksize = block_bytes(c);
- stat->blocks = inode->v.i_blocks;
-
- stat->subvol = inode->ei_inum.subvol;
- stat->result_mask |= STATX_SUBVOL;
-
- if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) {
- stat->result_mask |= STATX_DIOALIGN;
- /*
- * this is incorrect; we should be tracking this in superblock,
- * and checking the alignment of open devices
- */
- stat->dio_mem_align = SECTOR_SIZE;
- stat->dio_offset_align = block_bytes(c);
- }
-
- if (request_mask & STATX_BTIME) {
- stat->result_mask |= STATX_BTIME;
- stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
- }
-
- if (inode->ei_inode.bi_flags & BCH_INODE_immutable)
- stat->attributes |= STATX_ATTR_IMMUTABLE;
- stat->attributes_mask |= STATX_ATTR_IMMUTABLE;
-
- if (inode->ei_inode.bi_flags & BCH_INODE_append)
- stat->attributes |= STATX_ATTR_APPEND;
- stat->attributes_mask |= STATX_ATTR_APPEND;
-
- if (inode->ei_inode.bi_flags & BCH_INODE_nodump)
- stat->attributes |= STATX_ATTR_NODUMP;
- stat->attributes_mask |= STATX_ATTR_NODUMP;
-
- return 0;
-}
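The extra fields bch2_getattr() fills in surface through statx(2). A hypothetical userspace sketch, assuming headers new enough to define STATX_SUBVOL and struct statx.stx_subvol; print_subvol_and_btime() is an illustrative name:

#include <stdio.h>
#include <fcntl.h>
#include <sys/stat.h>

int print_subvol_and_btime(const char *path)
{
	struct statx stx;

	if (statx(AT_FDCWD, path, 0, STATX_BTIME | STATX_SUBVOL, &stx))
		return -1;

	if (stx.stx_mask & STATX_SUBVOL)
		printf("subvol %llu\n", (unsigned long long)stx.stx_subvol);
	if (stx.stx_mask & STATX_BTIME)
		printf("btime  %lld\n", (long long)stx.stx_btime.tv_sec);
	if (stx.stx_attributes & STATX_ATTR_IMMUTABLE)
		printf("immutable\n");
	return 0;
}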
-
-static int bch2_setattr(struct mnt_idmap *idmap,
- struct dentry *dentry, struct iattr *iattr)
-{
- struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret;
-
- lockdep_assert_held(&inode->v.i_rwsem);
-
- ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
- setattr_prepare(idmap, dentry, iattr);
- if (ret)
- return ret;
-
- return iattr->ia_valid & ATTR_SIZE
- ? bchfs_truncate(idmap, inode, iattr)
- : bch2_setattr_nonsize(idmap, inode, iattr);
-}
-
-static int bch2_tmpfile(struct mnt_idmap *idmap,
- struct inode *vdir, struct file *file, umode_t mode)
-{
- struct bch_inode_info *inode =
- __bch2_create(idmap, to_bch_ei(vdir),
- file->f_path.dentry, mode, 0,
- (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
-
- if (IS_ERR(inode))
- return bch2_err_class(PTR_ERR(inode));
-
- d_mark_tmpfile(file, &inode->v);
- d_instantiate(file->f_path.dentry, &inode->v);
- return finish_open_simple(file, 0);
-}
-
-struct bch_fiemap_extent {
- struct bkey_buf kbuf;
- unsigned flags;
-};
-
-static int bch2_fill_extent(struct bch_fs *c,
- struct fiemap_extent_info *info,
- struct bch_fiemap_extent *fe)
-{
- struct bkey_s_c k = bkey_i_to_s_c(fe->kbuf.k);
- unsigned flags = fe->flags;
-
- BUG_ON(!k.k->size);
-
- if (bkey_extent_is_direct_data(k.k)) {
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- int ret;
-
- if (k.k->type == KEY_TYPE_reflink_v)
- flags |= FIEMAP_EXTENT_SHARED;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- int flags2 = 0;
- u64 offset = p.ptr.offset;
-
- if (p.ptr.unwritten)
- flags2 |= FIEMAP_EXTENT_UNWRITTEN;
-
- if (p.crc.compression_type)
- flags2 |= FIEMAP_EXTENT_ENCODED;
- else
- offset += p.crc.offset;
-
- if ((offset & (block_sectors(c) - 1)) ||
- (k.k->size & (block_sectors(c) - 1)))
- flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
-
- ret = fiemap_fill_next_extent(info,
- bkey_start_offset(k.k) << 9,
- offset << 9,
- k.k->size << 9, flags|flags2);
- if (ret)
- return ret;
- }
-
- return 0;
- } else if (bkey_extent_is_inline_data(k.k)) {
- return fiemap_fill_next_extent(info,
- bkey_start_offset(k.k) << 9,
- 0, k.k->size << 9,
- flags|
- FIEMAP_EXTENT_DATA_INLINE);
- } else if (k.k->type == KEY_TYPE_reservation) {
- return fiemap_fill_next_extent(info,
- bkey_start_offset(k.k) << 9,
- 0, k.k->size << 9,
- flags|
- FIEMAP_EXTENT_DELALLOC|
- FIEMAP_EXTENT_UNWRITTEN);
- } else {
- BUG();
- }
-}
-
-/*
- * Scan a range of an inode for data in pagecache.
- *
- * Intended to be retryable, so don't modify the output params until success is
- * imminent.
- */
-static int
-bch2_fiemap_hole_pagecache(struct inode *vinode, u64 *start, u64 *end,
- bool nonblock)
-{
- loff_t dstart, dend;
-
- dstart = bch2_seek_pagecache_data(vinode, *start, *end, 0, nonblock);
- if (dstart < 0)
- return dstart;
-
- if (dstart == *end) {
- *start = dstart;
- return 0;
- }
-
- dend = bch2_seek_pagecache_hole(vinode, dstart, *end, 0, nonblock);
- if (dend < 0)
- return dend;
-
- /* race */
- BUG_ON(dstart == dend);
-
- *start = dstart;
- *end = dend;
- return 0;
-}
-
-/*
- * Scan a range of pagecache that corresponds to a file mapping hole in the
- * extent btree. If data is found, fake up an extent key so it looks like a
- * delalloc extent to the rest of the fiemap processing code.
- */
-static int
-bch2_next_fiemap_pagecache_extent(struct btree_trans *trans, struct bch_inode_info *inode,
- u64 start, u64 end, struct bch_fiemap_extent *cur)
-{
- struct bch_fs *c = trans->c;
- struct bkey_i_extent *delextent;
- struct bch_extent_ptr ptr = {};
- loff_t dstart = start << 9, dend = end << 9;
- int ret;
-
- /*
- * We hold btree locks here so we cannot block on folio locks without
- * dropping trans locks first. Run a nonblocking scan for the common
- * case of no folios over holes and fall back on failure.
- *
- * Note that dropping locks like this is technically racy against
- * writeback inserting to the extent tree, but a non-sync fiemap scan is
- * fundamentally racy with writeback anyways. Therefore, just report the
- * range as delalloc regardless of whether we have to cycle trans locks.
- */
- ret = bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, true);
- if (ret == -EAGAIN)
- ret = drop_locks_do(trans,
- bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, false));
- if (ret < 0)
- return ret;
-
- /*
- * Create a fake extent key in the buffer. We have to add a dummy extent
- * pointer for the fill code to add an extent entry. It's explicitly
- * zeroed to reflect delayed allocation (i.e. phys offset 0).
- */
- bch2_bkey_buf_realloc(&cur->kbuf, c, sizeof(*delextent) / sizeof(u64));
- delextent = bkey_extent_init(cur->kbuf.k);
- delextent->k.p = POS(inode->ei_inum.inum, dend >> 9);
- delextent->k.size = (dend - dstart) >> 9;
- bch2_bkey_append_ptr(&delextent->k_i, ptr);
-
- cur->flags = FIEMAP_EXTENT_DELALLOC;
-
- return 0;
-}
-
-static int bch2_next_fiemap_extent(struct btree_trans *trans,
- struct bch_inode_info *inode,
- u64 start, u64 end,
- struct bch_fiemap_extent *cur)
-{
- u32 snapshot;
- int ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot);
- if (ret)
- return ret;
-
- struct btree_iter iter;
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- SPOS(inode->ei_inum.inum, start, snapshot), 0);
-
- struct bkey_s_c k =
- bch2_btree_iter_peek_max(trans, &iter, POS(inode->ei_inum.inum, end));
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- u64 pagecache_end = k.k ? max(start, bkey_start_offset(k.k)) : end;
-
- ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, pagecache_end, cur);
- if (ret)
- goto err;
-
- struct bpos pagecache_start = bkey_start_pos(&cur->kbuf.k->k);
-
- /*
- * Does the pagecache or the btree take precedence?
- *
- * It _should_ be the pagecache, so that we correctly report delalloc
- * extents when dirty in the pagecache (we're COW, after all).
- *
- * But we'd have to add per-sector writeback tracking to
- * bch_folio_state, otherwise we report delalloc extents for clean
- * cached data in the pagecache.
- *
- * We should do this, but even then fiemap won't report stable mappings:
- * on bcachefs data moves around in the background (copygc, rebalance)
- * and we don't provide a way for userspace to lock that out.
- */
- if (k.k &&
- bkey_le(bpos_max(iter.pos, bkey_start_pos(k.k)),
- pagecache_start)) {
- bch2_bkey_buf_reassemble(&cur->kbuf, trans->c, k);
- bch2_cut_front(iter.pos, cur->kbuf.k);
- bch2_cut_back(POS(inode->ei_inum.inum, end), cur->kbuf.k);
- cur->flags = 0;
- } else if (k.k) {
- bch2_cut_back(bkey_start_pos(k.k), cur->kbuf.k);
- }
-
- if (cur->kbuf.k->k.type == KEY_TYPE_reflink_p) {
- unsigned sectors = cur->kbuf.k->k.size;
- s64 offset_into_extent = 0;
- enum btree_id data_btree = BTREE_ID_extents;
- ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent,
- &cur->kbuf);
- if (ret)
- goto err;
-
- struct bkey_i *k = cur->kbuf.k;
- sectors = min_t(unsigned, sectors, k->k.size - offset_into_extent);
-
- bch2_cut_front(POS(k->k.p.inode,
- bkey_start_offset(&k->k) + offset_into_extent),
- k);
- bch2_key_resize(&k->k, sectors);
- k->k.p = iter.pos;
- k->k.p.offset += k->k.size;
- }
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
- u64 start, u64 len)
-{
- struct bch_fs *c = vinode->i_sb->s_fs_info;
- struct bch_inode_info *ei = to_bch_ei(vinode);
- struct btree_trans *trans;
- struct bch_fiemap_extent cur, prev;
- int ret = 0;
-
- ret = fiemap_prep(&ei->v, info, start, &len, 0);
- if (ret)
- return ret;
-
- if (start + len < start)
- return -EINVAL;
-
- start >>= 9;
- u64 end = (start + len) >> 9;
-
- bch2_bkey_buf_init(&cur.kbuf);
- bch2_bkey_buf_init(&prev.kbuf);
- bkey_init(&prev.kbuf.k->k);
-
- trans = bch2_trans_get(c);
-
- while (start < end) {
- ret = lockrestart_do(trans,
- bch2_next_fiemap_extent(trans, ei, start, end, &cur));
- if (ret)
- goto err;
-
- BUG_ON(bkey_start_offset(&cur.kbuf.k->k) < start);
- BUG_ON(cur.kbuf.k->k.p.offset > end);
-
- if (bkey_start_offset(&cur.kbuf.k->k) == end)
- break;
-
- start = cur.kbuf.k->k.p.offset;
-
- if (!bkey_deleted(&prev.kbuf.k->k)) {
- bch2_trans_unlock(trans);
- ret = bch2_fill_extent(c, info, &prev);
- if (ret)
- goto err;
- }
-
- bch2_bkey_buf_copy(&prev.kbuf, c, cur.kbuf.k);
- prev.flags = cur.flags;
- }
-
- if (!bkey_deleted(&prev.kbuf.k->k)) {
- bch2_trans_unlock(trans);
- prev.flags |= FIEMAP_EXTENT_LAST;
- ret = bch2_fill_extent(c, info, &prev);
- }
-err:
- bch2_trans_put(trans);
- bch2_bkey_buf_exit(&cur.kbuf, c);
- bch2_bkey_buf_exit(&prev.kbuf, c);
-
- return bch2_err_class(ret < 0 ? ret : 0);
-}
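The fiemap path above is exercised with the standard FS_IOC_FIEMAP ioctl; extents fabricated by bch2_next_fiemap_pagecache_extent() for dirty pagecache come back with FIEMAP_EXTENT_DELALLOC set. A minimal userspace sketch (dump_extents() is an illustrative helper):

#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int dump_extents(int fd)
{
	unsigned nr = 32;
	struct fiemap *fm = calloc(1, sizeof(*fm) + nr * sizeof(struct fiemap_extent));
	if (!fm)
		return -1;

	fm->fm_start        = 0;
	fm->fm_length       = FIEMAP_MAX_OFFSET;
	fm->fm_extent_count = nr;

	int ret = ioctl(fd, FS_IOC_FIEMAP, fm);
	if (!ret)
		for (unsigned i = 0; i < fm->fm_mapped_extents; i++)
			printf("%llu+%llu %s\n",
			       (unsigned long long)fm->fm_extents[i].fe_logical,
			       (unsigned long long)fm->fm_extents[i].fe_length,
			       fm->fm_extents[i].fe_flags & FIEMAP_EXTENT_DELALLOC
			       ? "delalloc" : "allocated");
	free(fm);
	return ret;
}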
-
-static const struct vm_operations_struct bch_vm_ops = {
- .fault = bch2_page_fault,
- .map_pages = filemap_map_pages,
- .page_mkwrite = bch2_page_mkwrite,
-};
-
-static int bch2_mmap_prepare(struct vm_area_desc *desc)
-{
- file_accessed(desc->file);
-
- desc->vm_ops = &bch_vm_ops;
- return 0;
-}
-
-/* Directories: */
-
-static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
-{
- return generic_file_llseek_size(file, offset, whence,
- S64_MAX, S64_MAX);
-}
-
-static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
-{
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
-
- if (!dir_emit_dots(file, ctx))
- return 0;
-
- int ret = bch2_readdir(c, inode_inum(inode), &hash, ctx);
-
- bch_err_fn(c, ret);
- return bch2_err_class(ret);
-}
-
-static int bch2_open(struct inode *vinode, struct file *file)
-{
- if (file->f_flags & (O_WRONLY|O_RDWR)) {
- struct bch_inode_info *inode = to_bch_ei(vinode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
-
- int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol);
- if (ret)
- return ret;
- }
-
- file->f_mode |= FMODE_CAN_ODIRECT;
-
- return generic_file_open(vinode, file);
-}
-
-/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
-static const __maybe_unused unsigned bch_flags_to_uflags[] = {
- [__BCH_INODE_sync] = FS_SYNC_FL,
- [__BCH_INODE_immutable] = FS_IMMUTABLE_FL,
- [__BCH_INODE_append] = FS_APPEND_FL,
- [__BCH_INODE_nodump] = FS_NODUMP_FL,
- [__BCH_INODE_noatime] = FS_NOATIME_FL,
-};
-
-/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
-static const __maybe_unused unsigned bch_flags_to_xflags[] = {
- [__BCH_INODE_sync] = FS_XFLAG_SYNC,
- [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE,
- [__BCH_INODE_append] = FS_XFLAG_APPEND,
- [__BCH_INODE_nodump] = FS_XFLAG_NODUMP,
- [__BCH_INODE_noatime] = FS_XFLAG_NOATIME,
-};
-
-static int bch2_fileattr_get(struct dentry *dentry,
- struct file_kattr *fa)
-{
- struct bch_inode_info *inode = to_bch_ei(d_inode(dentry));
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
-
- fileattr_fill_xflags(fa, map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags));
-
- if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project))
- fa->fsx_xflags |= FS_XFLAG_PROJINHERIT;
-
- if (bch2_inode_casefold(c, &inode->ei_inode))
- fa->flags |= FS_CASEFOLD_FL;
-
- fa->fsx_projid = inode->ei_qid.q[QTYP_PRJ];
- return 0;
-}
-
-struct flags_set {
- unsigned mask;
- unsigned flags;
- unsigned projid;
- bool set_project;
- bool set_casefold;
- bool casefold;
-};
-
-static int fssetxattr_inode_update_fn(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct bch_fs *c = trans->c;
- struct flags_set *s = p;
-
- /*
- * We're relying on btree locking here for exclusion with other ioctl
- * calls - use the flags in the btree (@bi), not inode->i_flags:
- */
- if (!S_ISREG(bi->bi_mode) &&
- !S_ISDIR(bi->bi_mode) &&
- (s->flags & (BCH_INODE_nodump|BCH_INODE_noatime)) != s->flags)
- return -EINVAL;
-
- if (s->casefold != bch2_inode_casefold(c, bi)) {
- int ret = bch2_inode_set_casefold(trans, inode_inum(inode), bi, s->casefold);
- if (ret)
- return ret;
- }
-
- if (s->set_project) {
- bi->bi_project = s->projid;
- bi->bi_fields_set |= BIT(Inode_opt_project);
- }
-
- bi->bi_flags &= ~s->mask;
- bi->bi_flags |= s->flags;
-
- bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v));
- return 0;
-}
-
-static int bch2_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry,
- struct file_kattr *fa)
-{
- struct bch_inode_info *inode = to_bch_ei(d_inode(dentry));
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct flags_set s = {};
- int ret;
-
- if (fa->fsx_valid) {
- fa->fsx_xflags &= ~FS_XFLAG_PROJINHERIT;
-
- s.mask = map_defined(bch_flags_to_xflags);
- s.flags |= map_flags_rev(bch_flags_to_xflags, fa->fsx_xflags);
- if (fa->fsx_xflags)
- return -EOPNOTSUPP;
-
- if (fa->fsx_projid >= U32_MAX)
- return -EINVAL;
-
- /*
- * inode fields accessible via the xattr interface are stored with a +1
- * bias, so that 0 means unset:
- */
- if ((inode->ei_inode.bi_project ||
- fa->fsx_projid) &&
- inode->ei_inode.bi_project != fa->fsx_projid + 1) {
- s.projid = fa->fsx_projid + 1;
- s.set_project = true;
- }
- }
-
- if (fa->flags_valid) {
- s.mask = map_defined(bch_flags_to_uflags);
-
- s.set_casefold = true;
- s.casefold = (fa->flags & FS_CASEFOLD_FL) != 0;
- fa->flags &= ~FS_CASEFOLD_FL;
-
- s.flags |= map_flags_rev(bch_flags_to_uflags, fa->flags);
- if (fa->flags)
- return -EOPNOTSUPP;
- }
-
- mutex_lock(&inode->ei_update_lock);
- ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
- (s.set_project
- ? bch2_set_projid(c, inode, fa->fsx_projid)
- : 0) ?:
- bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
- ATTR_CTIME);
- mutex_unlock(&inode->ei_update_lock);
-
- return bch2_err_class(ret);
-}
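The fileattr handlers above back the classic inode-flag ioctls (what chattr/lsattr use). A minimal sketch that toggles the append-only bit, using only definitions from <linux/fs.h>; set_append_only() is an illustrative name:

#include <sys/ioctl.h>
#include <linux/fs.h>

int set_append_only(int fd, int enable)
{
	long flags;

	if (ioctl(fd, FS_IOC_GETFLAGS, &flags))
		return -1;

	if (enable)
		flags |= FS_APPEND_FL;
	else
		flags &= ~FS_APPEND_FL;

	return ioctl(fd, FS_IOC_SETFLAGS, &flags);
}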
-
-static const struct file_operations bch_file_operations = {
- .open = bch2_open,
- .llseek = bch2_llseek,
- .read_iter = bch2_read_iter,
- .write_iter = bch2_write_iter,
- .mmap_prepare = bch2_mmap_prepare,
- .get_unmapped_area = thp_get_unmapped_area,
- .fsync = bch2_fsync,
- .splice_read = filemap_splice_read,
- .splice_write = iter_file_splice_write,
- .fallocate = bch2_fallocate_dispatch,
- .unlocked_ioctl = bch2_fs_file_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = bch2_compat_fs_ioctl,
-#endif
- .remap_file_range = bch2_remap_file_range,
-};
-
-static const struct inode_operations bch_file_inode_operations = {
- .getattr = bch2_getattr,
- .setattr = bch2_setattr,
- .fiemap = bch2_fiemap,
- .listxattr = bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
- .get_inode_acl = bch2_get_acl,
- .set_acl = bch2_set_acl,
-#endif
- .fileattr_get = bch2_fileattr_get,
- .fileattr_set = bch2_fileattr_set,
-};
-
-static const struct inode_operations bch_dir_inode_operations = {
- .lookup = bch2_lookup,
- .create = bch2_create,
- .link = bch2_link,
- .unlink = bch2_unlink,
- .symlink = bch2_symlink,
- .mkdir = bch2_mkdir,
- .rmdir = bch2_unlink,
- .mknod = bch2_mknod,
- .rename = bch2_rename2,
- .getattr = bch2_getattr,
- .setattr = bch2_setattr,
- .tmpfile = bch2_tmpfile,
- .listxattr = bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
- .get_inode_acl = bch2_get_acl,
- .set_acl = bch2_set_acl,
-#endif
- .fileattr_get = bch2_fileattr_get,
- .fileattr_set = bch2_fileattr_set,
-};
-
-static const struct file_operations bch_dir_file_operations = {
- .llseek = bch2_dir_llseek,
- .read = generic_read_dir,
- .iterate_shared = bch2_vfs_readdir,
- .fsync = bch2_fsync,
- .unlocked_ioctl = bch2_fs_file_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = bch2_compat_fs_ioctl,
-#endif
-};
-
-static const struct inode_operations bch_symlink_inode_operations = {
- .get_link = page_get_link,
- .getattr = bch2_getattr,
- .setattr = bch2_setattr,
- .listxattr = bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
- .get_inode_acl = bch2_get_acl,
- .set_acl = bch2_set_acl,
-#endif
- .fileattr_get = bch2_fileattr_get,
- .fileattr_set = bch2_fileattr_set,
-};
-
-static const struct inode_operations bch_special_inode_operations = {
- .getattr = bch2_getattr,
- .setattr = bch2_setattr,
- .listxattr = bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
- .get_inode_acl = bch2_get_acl,
- .set_acl = bch2_set_acl,
-#endif
- .fileattr_get = bch2_fileattr_get,
- .fileattr_set = bch2_fileattr_set,
-};
-
-static const struct address_space_operations bch_address_space_operations = {
- .read_folio = bch2_read_folio,
- .writepages = bch2_writepages,
- .readahead = bch2_readahead,
- .dirty_folio = filemap_dirty_folio,
- .write_begin = bch2_write_begin,
- .write_end = bch2_write_end,
- .invalidate_folio = bch2_invalidate_folio,
- .release_folio = bch2_release_folio,
-#ifdef CONFIG_MIGRATION
- .migrate_folio = filemap_migrate_folio,
-#endif
- .error_remove_folio = generic_error_remove_folio,
-};
-
-struct bcachefs_fid {
- u64 inum;
- u32 subvol;
- u32 gen;
-} __packed;
-
-struct bcachefs_fid_with_parent {
- struct bcachefs_fid fid;
- struct bcachefs_fid dir;
-} __packed;
-
-static int bcachefs_fid_valid(int fh_len, int fh_type)
-{
- switch (fh_type) {
- case FILEID_BCACHEFS_WITHOUT_PARENT:
- return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
- case FILEID_BCACHEFS_WITH_PARENT:
- return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
- default:
- return false;
- }
-}
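The exportfs interface measures file handle length in 32-bit words, which is why the check above divides sizeof() by sizeof(u32); with the packed layouts above that works out to 4 words for a bare fid and 8 for a fid with its parent. A minimal compile-time sketch of that arithmetic (illustrative, not from the original file):

	_Static_assert(sizeof(struct bcachefs_fid) / sizeof(u32) == 4,
		       "bare fid encodes to 4 x 32-bit words");
	_Static_assert(sizeof(struct bcachefs_fid_with_parent) / sizeof(u32) == 8,
		       "fid with parent encodes to 8 x 32-bit words");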
-
-static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
-{
- return (struct bcachefs_fid) {
- .inum = inode->ei_inum.inum,
- .subvol = inode->ei_inum.subvol,
- .gen = inode->ei_inode.bi_generation,
- };
-}
-
-static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
- struct inode *vdir)
-{
- struct bch_inode_info *inode = to_bch_ei(vinode);
- struct bch_inode_info *dir = to_bch_ei(vdir);
- int min_len;
-
- if (!S_ISDIR(inode->v.i_mode) && dir) {
- struct bcachefs_fid_with_parent *fid = (void *) fh;
-
- min_len = sizeof(*fid) / sizeof(u32);
- if (*len < min_len) {
- *len = min_len;
- return FILEID_INVALID;
- }
-
- fid->fid = bch2_inode_to_fid(inode);
- fid->dir = bch2_inode_to_fid(dir);
-
- *len = min_len;
- return FILEID_BCACHEFS_WITH_PARENT;
- } else {
- struct bcachefs_fid *fid = (void *) fh;
-
- min_len = sizeof(*fid) / sizeof(u32);
- if (*len < min_len) {
- *len = min_len;
- return FILEID_INVALID;
- }
- *fid = bch2_inode_to_fid(inode);
-
- *len = min_len;
- return FILEID_BCACHEFS_WITHOUT_PARENT;
- }
-}
-
-static struct inode *bch2_nfs_get_inode(struct super_block *sb,
- struct bcachefs_fid fid)
-{
- struct bch_fs *c = sb->s_fs_info;
- struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
- .subvol = fid.subvol,
- .inum = fid.inum,
- });
- if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
- iput(vinode);
- vinode = ERR_PTR(-ESTALE);
- }
- return vinode;
-}
-
-static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
- int fh_len, int fh_type)
-{
- struct bcachefs_fid *fid = (void *) _fid;
-
- if (!bcachefs_fid_valid(fh_len, fh_type))
- return NULL;
-
- return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
-}
-
-static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
- int fh_len, int fh_type)
-{
- struct bcachefs_fid_with_parent *fid = (void *) _fid;
-
- if (!bcachefs_fid_valid(fh_len, fh_type) ||
- fh_type != FILEID_BCACHEFS_WITH_PARENT)
- return NULL;
-
- return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
-}
-
-static struct dentry *bch2_get_parent(struct dentry *child)
-{
- struct bch_inode_info *inode = to_bch_ei(child->d_inode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- subvol_inum parent_inum = {
- .subvol = inode->ei_inode.bi_parent_subvol ?:
- inode->ei_inum.subvol,
- .inum = inode->ei_inode.bi_dir,
- };
-
- return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
-}
-
-static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
-{
- struct bch_inode_info *inode = to_bch_ei(child->d_inode);
- struct bch_inode_info *dir = to_bch_ei(parent->d_inode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_trans *trans;
- struct btree_iter iter1;
- struct btree_iter iter2;
- struct bkey_s_c k;
- struct bkey_s_c_dirent d;
- struct bch_inode_unpacked inode_u;
- subvol_inum target;
- u32 snapshot;
- struct qstr dirent_name;
- unsigned name_len = 0;
- int ret;
-
- if (!S_ISDIR(dir->v.i_mode))
- return -EINVAL;
-
- trans = bch2_trans_get(c);
-
- bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents,
- POS(dir->ei_inode.bi_inum, 0), 0);
- bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents,
- POS(dir->ei_inode.bi_inum, 0), 0);
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- bch2_btree_iter_set_snapshot(trans, &iter1, snapshot);
- bch2_btree_iter_set_snapshot(trans, &iter2, snapshot);
-
- ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
- if (ret)
- goto err;
-
- if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
- bch2_btree_iter_set_pos(trans, &iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
-
- k = bch2_btree_iter_peek_slot(trans, &iter1);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (k.k->type != KEY_TYPE_dirent) {
- ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
- goto err;
- }
-
- d = bkey_s_c_to_dirent(k);
- ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
- if (ret > 0)
- ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
- if (ret)
- goto err;
-
- if (subvol_inum_eq(target, inode->ei_inum))
- goto found;
- } else {
- /*
-		 * File with multiple hardlinks where our backref points to the
-		 * wrong directory - fall back to a linear search:
- */
- for_each_btree_key_continue_norestart(trans, iter2, 0, k, ret) {
- if (k.k->p.inode > dir->ei_inode.bi_inum)
- break;
-
- if (k.k->type != KEY_TYPE_dirent)
- continue;
-
- d = bkey_s_c_to_dirent(k);
- ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
- if (ret < 0)
- break;
- if (ret)
- continue;
-
- if (subvol_inum_eq(target, inode->ei_inum))
- goto found;
- }
- }
-
- ret = -ENOENT;
- goto err;
-found:
- dirent_name = bch2_dirent_get_name(d);
-
- name_len = min_t(unsigned, dirent_name.len, NAME_MAX);
- memcpy(name, dirent_name.name, name_len);
- name[name_len] = '\0';
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_iter_exit(trans, &iter1);
- bch2_trans_iter_exit(trans, &iter2);
- bch2_trans_put(trans);
-
- return ret;
-}
-
-static const struct export_operations bch_export_ops = {
- .encode_fh = bch2_encode_fh,
- .fh_to_dentry = bch2_fh_to_dentry,
- .fh_to_parent = bch2_fh_to_parent,
- .get_parent = bch2_get_parent,
- .get_name = bch2_get_name,
-};
-
-static void bch2_vfs_inode_init(struct btree_trans *trans,
- subvol_inum inum,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- struct bch_subvolume *subvol)
-{
- inode->v.i_ino = inum.inum;
- inode->ei_inum = inum;
- inode->ei_inode.bi_inum = inum.inum;
- bch2_inode_update_after_write(trans, inode, bi, ~0);
-
- inode->v.i_blocks = bi->bi_sectors;
- inode->v.i_rdev = bi->bi_dev;
- inode->v.i_generation = bi->bi_generation;
- inode->v.i_size = bi->bi_size;
-
- inode->ei_flags = 0;
- inode->ei_quota_reserved = 0;
- inode->ei_qid = bch_qid(bi);
-
- if (BCH_SUBVOLUME_SNAP(subvol))
- set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
-
- inode->v.i_mapping->a_ops = &bch_address_space_operations;
-
- switch (inode->v.i_mode & S_IFMT) {
- case S_IFREG:
- inode->v.i_op = &bch_file_inode_operations;
- inode->v.i_fop = &bch_file_operations;
- break;
- case S_IFDIR:
- inode->v.i_op = &bch_dir_inode_operations;
- inode->v.i_fop = &bch_dir_file_operations;
- break;
- case S_IFLNK:
- inode_nohighmem(&inode->v);
- inode->v.i_op = &bch_symlink_inode_operations;
- break;
- default:
- init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
- inode->v.i_op = &bch_special_inode_operations;
- break;
- }
-
- mapping_set_folio_min_order(inode->v.i_mapping,
- get_order(trans->c->opts.block_size));
-}
-
-static void bch2_free_inode(struct inode *vinode)
-{
- kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode));
-}
-
-static int inode_update_times_fn(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
-
- bi->bi_atime = timespec_to_bch2_time(c, inode_get_atime(&inode->v));
- bi->bi_mtime = timespec_to_bch2_time(c, inode_get_mtime(&inode->v));
- bi->bi_ctime = timespec_to_bch2_time(c, inode_get_ctime(&inode->v));
-
- return 0;
-}
-
-static int bch2_vfs_write_inode(struct inode *vinode,
- struct writeback_control *wbc)
-{
- struct bch_fs *c = vinode->i_sb->s_fs_info;
- struct bch_inode_info *inode = to_bch_ei(vinode);
- int ret;
-
- mutex_lock(&inode->ei_update_lock);
- ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
- ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
- mutex_unlock(&inode->ei_update_lock);
-
- return bch2_err_class(ret);
-}
-
-static void bch2_evict_inode(struct inode *vinode)
-{
- struct bch_fs *c = vinode->i_sb->s_fs_info;
- struct bch_inode_info *inode = to_bch_ei(vinode);
- bool delete = !inode->v.i_nlink && !is_bad_inode(&inode->v);
-
- /*
-	 * evict() has waited for outstanding writeback; we'll do no more IO
-	 * through this inode, so it's safe to remove it from the VFS inode
-	 * hash table here.
-	 *
-	 * Do that now so that other threads aren't blocked from pulling it
-	 * back in; there's no reason for them to be:
- */
- if (!delete)
- bch2_inode_hash_remove(c, inode);
-
- truncate_inode_pages_final(&inode->v.i_data);
-
- clear_inode(&inode->v);
-
- BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
-
- if (delete) {
- bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
- KEY_TYPE_QUOTA_WARN);
- bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
- KEY_TYPE_QUOTA_WARN);
- int ret = bch2_inode_rm(c, inode_inum(inode));
- if (ret && !bch2_err_matches(ret, EROFS)) {
- bch_err_msg(c, ret, "VFS incorrectly tried to delete inode %llu:%llu",
- inode->ei_inum.subvol,
- inode->ei_inum.inum);
- bch2_sb_error_count(c, BCH_FSCK_ERR_vfs_bad_inode_rm);
- }
-
- /*
- * If we are deleting, we need it present in the vfs hash table
- * so that fsck can check if unlinked inodes are still open:
- */
- bch2_inode_hash_remove(c, inode);
- }
-
- mutex_lock(&c->vfs_inodes_lock);
- list_del_init(&inode->ei_vfs_inode_list);
- mutex_unlock(&c->vfs_inodes_lock);
-}
-
-void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
-{
- struct bch_inode_info *inode;
- DARRAY(struct bch_inode_info *) grabbed;
- bool clean_pass = false, this_pass_clean;
-
- /*
- * Initially, we scan for inodes without I_DONTCACHE, then mark them to
- * be pruned with d_mark_dontcache().
- *
- * Once we've had a clean pass where we didn't find any inodes without
- * I_DONTCACHE, we wait for them to be freed:
- */
-
- darray_init(&grabbed);
- darray_make_room(&grabbed, 1024);
-again:
- cond_resched();
- this_pass_clean = true;
-
- mutex_lock(&c->vfs_inodes_lock);
- list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
- if (!snapshot_list_has_id(s, inode->ei_inum.subvol))
- continue;
-
- if (!(inode->v.i_state & I_DONTCACHE) &&
- !(inode->v.i_state & I_FREEING) &&
- igrab(&inode->v)) {
- this_pass_clean = false;
-
- if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) {
- iput(&inode->v);
- break;
- }
- } else if (clean_pass && this_pass_clean) {
- struct wait_bit_queue_entry wqe;
- struct wait_queue_head *wq_head;
-
- wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW);
- prepare_to_wait_event(wq_head, &wqe.wq_entry,
- TASK_UNINTERRUPTIBLE);
- mutex_unlock(&c->vfs_inodes_lock);
-
- schedule();
- finish_wait(wq_head, &wqe.wq_entry);
- goto again;
- }
- }
- mutex_unlock(&c->vfs_inodes_lock);
-
- darray_for_each(grabbed, i) {
- inode = *i;
- d_mark_dontcache(&inode->v);
- d_prune_aliases(&inode->v);
- iput(&inode->v);
- }
- grabbed.nr = 0;
-
- if (!clean_pass || !this_pass_clean) {
- clean_pass = this_pass_clean;
- goto again;
- }
-
- darray_exit(&grabbed);
-}
-
-static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
- struct super_block *sb = dentry->d_sb;
- struct bch_fs *c = sb->s_fs_info;
- struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
- unsigned shift = sb->s_blocksize_bits - 9;
- /*
- * this assumes inodes take up 64 bytes, which is a decent average
- * number:
- */
- u64 avail_inodes = ((usage.capacity - usage.used) << 3);
-
- buf->f_type = BCACHEFS_STATFS_MAGIC;
- buf->f_bsize = sb->s_blocksize;
- buf->f_blocks = usage.capacity >> shift;
- buf->f_bfree = usage.free >> shift;
- buf->f_bavail = avail_factor(usage.free) >> shift;
-
- buf->f_files = usage.nr_inodes + avail_inodes;
- buf->f_ffree = avail_inodes;
-
- buf->f_fsid = uuid_to_fsid(c->sb.user_uuid.b);
- buf->f_namelen = BCH_NAME_MAX;
-
- return 0;
-}
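A worked example of the free-inode estimate above (a sketch, assuming usage.capacity and usage.used are in 512-byte sectors, consistent with the sector-to-block shift used for f_blocks): shifting the free sector count left by 3 multiplies it by 8, i.e. one inode per 64 bytes of free space.

	u64 free_sectors = (1ULL << 30) >> 9;	/* 1 GiB free = 2097152 sectors */
	u64 avail_inodes = free_sectors << 3;	/* 16777216, one inode per 64 bytes */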
-
-static int bch2_sync_fs(struct super_block *sb, int wait)
-{
- struct bch_fs *c = sb->s_fs_info;
- int ret;
-
- trace_bch2_sync_fs(sb, wait);
-
- if (c->opts.journal_flush_disabled)
- return 0;
-
- if (!wait) {
- bch2_journal_flush_async(&c->journal, NULL);
- return 0;
- }
-
- ret = bch2_journal_flush(&c->journal);
- return bch2_err_class(ret);
-}
-
-static struct bch_fs *bch2_path_to_fs(const char *path)
-{
- struct bch_fs *c;
- dev_t dev;
- int ret;
-
- ret = lookup_bdev(path, &dev);
- if (ret)
- return ERR_PTR(ret);
-
- c = bch2_dev_to_fs(dev);
- if (c)
- closure_put(&c->cl);
- return c ?: ERR_PTR(-ENOENT);
-}
-
-static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
-{
- struct bch_fs *c = root->d_sb->s_fs_info;
- bool first = true;
-
- guard(rcu)();
- for_each_online_member_rcu(c, ca) {
- if (!first)
- seq_putc(seq, ':');
- first = false;
- seq_puts(seq, ca->disk_sb.sb_name);
- }
-
- return 0;
-}
-
-static int bch2_show_options(struct seq_file *seq, struct dentry *root)
-{
- struct bch_fs *c = root->d_sb->s_fs_info;
- struct printbuf buf = PRINTBUF;
-
- bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb,
- OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE);
- printbuf_nul_terminate(&buf);
- seq_printf(seq, ",%s", buf.buf);
-
- int ret = buf.allocation_failure ? -ENOMEM : 0;
- printbuf_exit(&buf);
- return ret;
-}
-
-static void bch2_put_super(struct super_block *sb)
-{
- struct bch_fs *c = sb->s_fs_info;
-
- __bch2_fs_stop(c);
-}
-
-/*
- * bcachefs doesn't currently integrate intwrite freeze protection, but the
- * internal write references serve the same purpose. Therefore reuse the
- * read-only transition code to perform the quiesce. The caveat is that we don't
- * currently have the ability to block tasks that want a write reference while
- * the superblock is frozen. This is fine for now, but we should either add
- * blocking support or find a way to integrate sb_start_intwrite() and friends.
- */
-static int bch2_freeze(struct super_block *sb)
-{
- struct bch_fs *c = sb->s_fs_info;
-
- down_write(&c->state_lock);
- bch2_fs_read_only(c);
- up_write(&c->state_lock);
- return 0;
-}
-
-static int bch2_unfreeze(struct super_block *sb)
-{
- struct bch_fs *c = sb->s_fs_info;
- int ret;
-
- if (test_bit(BCH_FS_emergency_ro, &c->flags))
- return 0;
-
- down_write(&c->state_lock);
- ret = bch2_fs_read_write(c);
- up_write(&c->state_lock);
- return ret;
-}
-
-static const struct super_operations bch_super_operations = {
- .alloc_inode = bch2_alloc_inode,
- .free_inode = bch2_free_inode,
- .write_inode = bch2_vfs_write_inode,
- .evict_inode = bch2_evict_inode,
- .sync_fs = bch2_sync_fs,
- .statfs = bch2_statfs,
- .show_devname = bch2_show_devname,
- .show_options = bch2_show_options,
- .put_super = bch2_put_super,
- .freeze_fs = bch2_freeze,
- .unfreeze_fs = bch2_unfreeze,
-};
-
-static int bch2_set_super(struct super_block *s, void *data)
-{
- s->s_fs_info = data;
- return 0;
-}
-
-static int bch2_noset_super(struct super_block *s, void *data)
-{
- return -EBUSY;
-}
-
-typedef DARRAY(struct bch_fs *) darray_fs;
-
-static int bch2_test_super(struct super_block *s, void *data)
-{
- struct bch_fs *c = s->s_fs_info;
- darray_fs *d = data;
-
- if (!c)
- return false;
-
- darray_for_each(*d, i)
- if (c != *i)
- return false;
- return true;
-}
-
-static int bch2_fs_get_tree(struct fs_context *fc)
-{
- struct bch_fs *c;
- struct super_block *sb;
- struct inode *vinode;
- struct bch2_opts_parse *opts_parse = fc->fs_private;
- struct bch_opts opts = opts_parse->opts;
- darray_const_str devs;
- darray_fs devs_to_fs = {};
- int ret;
-
- opt_set(opts, read_only, (fc->sb_flags & SB_RDONLY) != 0);
- opt_set(opts, nostart, true);
-
- if (!fc->source || strlen(fc->source) == 0)
- return -EINVAL;
-
- ret = bch2_split_devs(fc->source, &devs);
- if (ret)
- return ret;
-
- darray_for_each(devs, i) {
- ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i));
- if (ret)
- goto err;
- }
-
- sb = sget(fc->fs_type, bch2_test_super, bch2_noset_super, fc->sb_flags|SB_NOSEC, &devs_to_fs);
- if (!IS_ERR(sb))
- goto got_sb;
-
- c = bch2_fs_open(&devs, &opts);
- ret = PTR_ERR_OR_ZERO(c);
- if (ret)
- goto err;
-
- if (opt_defined(opts, discard))
- set_bit(BCH_FS_discard_mount_opt_set, &c->flags);
-
- /* Some options can't be parsed until after the fs is started: */
- opts = bch2_opts_empty();
- ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf, false);
- if (ret)
- goto err_stop_fs;
-
- bch2_opts_apply(&c->opts, opts);
-
- ret = bch2_fs_start(c);
- if (ret)
- goto err_stop_fs;
-
- /*
- * We might be doing a RO mount because other options required it, or we
- * have no alloc info and it's a small image with no room to regenerate
- * it
- */
- if (c->opts.read_only)
- fc->sb_flags |= SB_RDONLY;
-
- sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c);
- ret = PTR_ERR_OR_ZERO(sb);
- if (ret)
- goto err_stop_fs;
-got_sb:
- c = sb->s_fs_info;
-
- if (sb->s_root) {
- if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) {
- ret = -EBUSY;
- goto err_put_super;
- }
- goto out;
- }
-
- sb->s_blocksize = block_bytes(c);
- sb->s_blocksize_bits = ilog2(block_bytes(c));
- sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_op = &bch_super_operations;
- sb->s_export_op = &bch_export_ops;
-#ifdef CONFIG_BCACHEFS_QUOTA
- sb->s_qcop = &bch2_quotactl_operations;
- sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
-#endif
- sb->s_xattr = bch2_xattr_handlers;
- sb->s_magic = BCACHEFS_STATFS_MAGIC;
- sb->s_time_gran = c->sb.nsec_per_time_unit;
- sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
- sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec);
- super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid));
-
- if (c->sb.multi_device)
- super_set_sysfs_name_uuid(sb);
- else
- strscpy(sb->s_sysfs_name, c->name, sizeof(sb->s_sysfs_name));
-
- sb->s_shrink->seeks = 0;
- c->vfs_sb = sb;
- strscpy(sb->s_id, c->name, sizeof(sb->s_id));
-
- ret = super_setup_bdi(sb);
- if (ret)
- goto err_put_super;
-
- sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
-
- scoped_guard(rcu) {
- for_each_online_member_rcu(c, ca) {
- struct block_device *bdev = ca->disk_sb.bdev;
-
- /* XXX: create an anonymous device for multi device filesystems */
- sb->s_bdev = bdev;
- sb->s_dev = bdev->bd_dev;
- break;
- }
- }
-
- c->dev = sb->s_dev;
-
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
- if (c->opts.acl)
- sb->s_flags |= SB_POSIXACL;
-#endif
-
- sb->s_shrink->seeks = 0;
-
-#ifdef CONFIG_UNICODE
- if (bch2_fs_casefold_enabled(c))
- sb->s_encoding = c->cf_encoding;
- generic_set_sb_d_ops(sb);
-#endif
-
- vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
- ret = PTR_ERR_OR_ZERO(vinode);
- bch_err_msg(c, ret, "mounting: error getting root inode");
- if (ret)
- goto err_put_super;
-
- sb->s_root = d_make_root(vinode);
- if (!sb->s_root) {
- bch_err(c, "error mounting: error allocating root dentry");
- ret = -ENOMEM;
- goto err_put_super;
- }
-
- sb->s_flags |= SB_ACTIVE;
-out:
- fc->root = dget(sb->s_root);
-err:
- darray_exit(&devs_to_fs);
- bch2_darray_str_exit(&devs);
- if (ret)
- pr_err("error: %s", bch2_err_str(ret));
- /*
-	 * On an inconsistency error in recovery we might see an -EROFS-derived
-	 * error code (from the journal), but we don't want to return that to
- * userspace as that causes util-linux to retry the mount RO - which is
- * confusing:
- */
- if (bch2_err_matches(ret, EROFS) && ret != -EROFS)
- ret = -EIO;
- return bch2_err_class(ret);
-
-err_stop_fs:
- bch2_fs_stop(c);
- goto err;
-
-err_put_super:
- if (!sb->s_root)
- __bch2_fs_stop(c);
- deactivate_locked_super(sb);
- goto err;
-}
-
-static void bch2_kill_sb(struct super_block *sb)
-{
- struct bch_fs *c = sb->s_fs_info;
-
- generic_shutdown_super(sb);
- bch2_fs_free(c);
-}
-
-static void bch2_fs_context_free(struct fs_context *fc)
-{
- struct bch2_opts_parse *opts = fc->fs_private;
-
- if (opts) {
- printbuf_exit(&opts->parse_later);
- kfree(opts);
- }
-}
-
-static int bch2_fs_parse_param(struct fs_context *fc,
- struct fs_parameter *param)
-{
- /*
- * the "source" param, i.e., the name of the device(s) to mount,
- * is handled by the VFS layer.
- */
- if (!strcmp(param->key, "source"))
- return -ENOPARAM;
-
- struct bch2_opts_parse *opts = fc->fs_private;
- struct bch_fs *c = NULL;
-
- /* for reconfigure, we already have a struct bch_fs */
- if (fc->root)
- c = fc->root->d_sb->s_fs_info;
-
- int ret = bch2_parse_one_mount_opt(c, &opts->opts,
- &opts->parse_later, param->key,
- param->string);
- if (ret)
- pr_err("Error parsing option %s: %s", param->key, bch2_err_str(ret));
-
- return bch2_err_class(ret);
-}
-
-static int bch2_fs_reconfigure(struct fs_context *fc)
-{
- struct super_block *sb = fc->root->d_sb;
- struct bch2_opts_parse *opts = fc->fs_private;
- struct bch_fs *c = sb->s_fs_info;
- int ret = 0;
-
- opt_set(opts->opts, read_only, (fc->sb_flags & SB_RDONLY) != 0);
-
- if (opts->opts.read_only != c->opts.read_only) {
- down_write(&c->state_lock);
-
- if (opts->opts.read_only) {
- bch2_fs_read_only(c);
-
- sb->s_flags |= SB_RDONLY;
- } else {
- ret = bch2_fs_read_write(c);
- if (ret) {
- bch_err(c, "error going rw: %i", ret);
- up_write(&c->state_lock);
- ret = -EINVAL;
- goto err;
- }
-
- sb->s_flags &= ~SB_RDONLY;
- }
-
- c->opts.read_only = opts->opts.read_only;
-
- up_write(&c->state_lock);
- }
-
- if (opt_defined(opts->opts, errors))
- c->opts.errors = opts->opts.errors;
-err:
- return bch2_err_class(ret);
-}
-
-static const struct fs_context_operations bch2_context_ops = {
- .free = bch2_fs_context_free,
- .parse_param = bch2_fs_parse_param,
- .get_tree = bch2_fs_get_tree,
- .reconfigure = bch2_fs_reconfigure,
-};
-
-static int bch2_init_fs_context(struct fs_context *fc)
-{
- struct bch2_opts_parse *opts = kzalloc(sizeof(*opts), GFP_KERNEL);
-
- if (!opts)
- return -ENOMEM;
-
- opts->parse_later = PRINTBUF;
-
- fc->ops = &bch2_context_ops;
- fc->fs_private = opts;
-
- return 0;
-}
-
-void bch2_fs_vfs_exit(struct bch_fs *c)
-{
- if (c->vfs_inodes_by_inum_table.ht.tbl)
- rhltable_destroy(&c->vfs_inodes_by_inum_table);
- if (c->vfs_inodes_table.tbl)
- rhashtable_destroy(&c->vfs_inodes_table);
-}
-
-int bch2_fs_vfs_init(struct bch_fs *c)
-{
- return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params) ?:
- rhltable_init(&c->vfs_inodes_by_inum_table, &bch2_vfs_inodes_by_inum_params);
-}
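The chain above relies on the GNU a ?: b extension, which evaluates to a when a is nonzero (an error) and to b otherwise, so initialization stops at the first failing step. A self-contained sketch with hypothetical helpers (not part of the original file):

	static int init_table_a(void) { return 0; }		/* succeeds */
	static int init_table_b(void) { return -ENOMEM; }	/* fails */

	static int sketch_init_chain(void)
	{
		/* init_table_a() returns 0, so evaluation falls through and
		 * the chain returns init_table_b()'s -ENOMEM */
		return init_table_a() ?: init_table_b();
	}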
-
-static struct file_system_type bcache_fs_type = {
- .owner = THIS_MODULE,
- .name = "bcachefs",
- .init_fs_context = bch2_init_fs_context,
- .kill_sb = bch2_kill_sb,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_LBS,
-};
-
-MODULE_ALIAS_FS("bcachefs");
-
-void bch2_vfs_exit(void)
-{
- unregister_filesystem(&bcache_fs_type);
- kmem_cache_destroy(bch2_inode_cache);
-}
-
-int __init bch2_vfs_init(void)
-{
- int ret = -ENOMEM;
-
- bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT |
- SLAB_ACCOUNT);
- if (!bch2_inode_cache)
- goto err;
-
- ret = register_filesystem(&bcache_fs_type);
- if (ret)
- goto err;
-
- return 0;
-err:
- bch2_vfs_exit();
- return ret;
-}
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
deleted file mode 100644
index dd2198541455..000000000000
--- a/fs/bcachefs/fs.h
+++ /dev/null
@@ -1,215 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_H
-#define _BCACHEFS_FS_H
-
-#include "inode.h"
-#include "opts.h"
-#include "str_hash.h"
-#include "quota_types.h"
-#include "two_state_shared_lock.h"
-
-#include <linux/seqlock.h>
-#include <linux/stat.h>
-
-struct bch_inode_info {
- struct inode v;
- struct rhash_head hash;
- struct rhlist_head by_inum_hash;
- subvol_inum ei_inum;
-
- struct list_head ei_vfs_inode_list;
- unsigned long ei_flags;
-
- struct mutex ei_update_lock;
- u64 ei_quota_reserved;
- unsigned long ei_last_dirtied;
- two_state_lock_t ei_pagecache_lock;
-
- struct mutex ei_quota_lock;
- struct bch_qid ei_qid;
-
- /*
- * When we've been doing nocow writes we'll need to issue flushes to the
- * underlying block devices
- *
- * XXX: a device may have had a flush issued by some other codepath. It
- * would be better to keep for each device a sequence number that's
-	 * incremented when we issue a cache flush, and track here the sequence
- * number that needs flushing.
- */
- struct bch_devs_mask ei_devs_need_flush;
-
- /* copy of inode in btree: */
- struct bch_inode_unpacked ei_inode;
-};
-
-#define bch2_pagecache_add_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 0)
-#define bch2_pagecache_add_tryget(i) bch2_two_state_trylock(&i->ei_pagecache_lock, 0)
-#define bch2_pagecache_add_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 0)
-
-#define bch2_pagecache_block_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 1)
-#define bch2_pagecache_block_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 1)
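A minimal usage sketch of the wrappers above (hypothetical call sites; inode stands for a struct bch_inode_info pointer): paths that add pages to the page cache take the shared "add" side, while operations that must exclude them, such as truncate or fallocate, take the exclusive "block" side.

	bch2_pagecache_add_get(inode);
	/* ... insert and dirty folios ... */
	bch2_pagecache_add_put(inode);

	bch2_pagecache_block_get(inode);
	/* ... invalidate the range, excluded from the add side ... */
	bch2_pagecache_block_put(inode);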
-
-static inline subvol_inum inode_inum(struct bch_inode_info *inode)
-{
- return inode->ei_inum;
-}
-
-/*
- * Set if we've gotten a btree error for this inode, and thus the vfs inode and
- * btree inode may be inconsistent:
- */
-#define EI_INODE_ERROR 0
-
-/*
- * Set if the inode is in a snapshot subvolume - we don't do quota accounting in
- * those:
- */
-#define EI_INODE_SNAPSHOT 1
-#define EI_INODE_HASHED 2
-
-#define to_bch_ei(_inode) \
- container_of_or_null(_inode, struct bch_inode_info, v)
-
-static inline int ptrcmp(void *l, void *r)
-{
- return cmp_int(l, r);
-}
-
-enum bch_inode_lock_op {
- INODE_PAGECACHE_BLOCK = (1U << 0),
- INODE_UPDATE_LOCK = (1U << 1),
-};
-
-#define bch2_lock_inodes(_locks, ...) \
-do { \
- struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \
- unsigned i; \
- \
- bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \
- \
- for (i = 1; i < ARRAY_SIZE(a); i++) \
- if (a[i] != a[i - 1]) { \
- if ((_locks) & INODE_PAGECACHE_BLOCK) \
- bch2_pagecache_block_get(a[i]);\
- if ((_locks) & INODE_UPDATE_LOCK) \
- mutex_lock_nested(&a[i]->ei_update_lock, i);\
- } \
-} while (0)
-
-#define bch2_unlock_inodes(_locks, ...) \
-do { \
- struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \
- unsigned i; \
- \
- bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \
- \
- for (i = 1; i < ARRAY_SIZE(a); i++) \
- if (a[i] != a[i - 1]) { \
- if ((_locks) & INODE_PAGECACHE_BLOCK) \
- bch2_pagecache_block_put(a[i]);\
- if ((_locks) & INODE_UPDATE_LOCK) \
- mutex_unlock(&a[i]->ei_update_lock); \
- } \
-} while (0)
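Because both macros sort their arguments by pointer and skip duplicates, callers that lock several inodes always acquire them in a consistent order, avoiding lock-order deadlocks. A minimal usage sketch, with src_dir and dst_dir as hypothetical struct bch_inode_info pointers:

	bch2_lock_inodes(INODE_UPDATE_LOCK, src_dir, dst_dir);
	/* ... update both inodes under their ei_update_lock ... */
	bch2_unlock_inodes(INODE_UPDATE_LOCK, src_dir, dst_dir);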
-
-static inline struct bch_inode_info *file_bch_inode(struct file *file)
-{
- return to_bch_ei(file_inode(file));
-}
-
-static inline bool inode_attr_changing(struct bch_inode_info *dir,
- struct bch_inode_info *inode,
- enum inode_opt_id id)
-{
- return !(inode->ei_inode.bi_fields_set & (1 << id)) &&
- bch2_inode_opt_get(&dir->ei_inode, id) !=
- bch2_inode_opt_get(&inode->ei_inode, id);
-}
-
-static inline bool inode_attrs_changing(struct bch_inode_info *dir,
- struct bch_inode_info *inode)
-{
- unsigned id;
-
- for (id = 0; id < Inode_opt_nr; id++)
- if (inode_attr_changing(dir, inode, id))
- return true;
-
- return false;
-}
-
-struct bch_inode_unpacked;
-
-#ifndef NO_BCACHEFS_FS
-
-struct bch_inode_info *
-__bch2_create(struct mnt_idmap *, struct bch_inode_info *,
- struct dentry *, umode_t, dev_t, subvol_inum, unsigned);
-
-int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p);
-
-int bch2_fs_quota_transfer(struct bch_fs *,
- struct bch_inode_info *,
- struct bch_qid,
- unsigned,
- enum quota_acct_mode);
-
-static inline int bch2_set_projid(struct bch_fs *c,
- struct bch_inode_info *inode,
- u32 projid)
-{
- struct bch_qid qid = inode->ei_qid;
-
- qid.q[QTYP_PRJ] = projid;
-
- return bch2_fs_quota_transfer(c, inode, qid,
- 1 << QTYP_PRJ,
- KEY_TYPE_QUOTA_PREALLOC);
-}
-
-struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum);
-
-/* returns 0 if we want to do the update, or error is passed up */
-typedef int (*inode_set_fn)(struct btree_trans *,
- struct bch_inode_info *,
- struct bch_inode_unpacked *, void *);
-
-void bch2_inode_update_after_write(struct btree_trans *,
- struct bch_inode_info *,
- struct bch_inode_unpacked *,
- unsigned);
-int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
- inode_set_fn, void *, unsigned);
-
-int bch2_setattr_nonsize(struct mnt_idmap *,
- struct bch_inode_info *,
- struct iattr *);
-int __bch2_unlink(struct inode *, struct dentry *, bool);
-
-void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *);
-
-void bch2_fs_vfs_exit(struct bch_fs *);
-int bch2_fs_vfs_init(struct bch_fs *);
-
-void bch2_vfs_exit(void);
-int bch2_vfs_init(void);
-
-#else
-
-#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); })
-
-static inline int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { return 0; }
-
-static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
- snapshot_id_list *s) {}
-
-static inline void bch2_fs_vfs_exit(struct bch_fs *c) {}
-static inline int bch2_fs_vfs_init(struct bch_fs *c) { return 0; }
-
-static inline void bch2_vfs_exit(void) {}
-static inline int bch2_vfs_init(void) { return 0; }
-
-#endif /* NO_BCACHEFS_FS */
-
-#endif /* _BCACHEFS_FS_H */
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
deleted file mode 100644
index 15c1e890d299..000000000000
--- a/fs/bcachefs/fsck.c
+++ /dev/null
@@ -1,3363 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bcachefs_ioctl.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "darray.h"
-#include "dirent.h"
-#include "error.h"
-#include "fs.h"
-#include "fsck.h"
-#include "inode.h"
-#include "io_misc.h"
-#include "keylist.h"
-#include "namei.h"
-#include "recovery_passes.h"
-#include "snapshot.h"
-#include "super.h"
-#include "thread_with_file.h"
-#include "xattr.h"
-
-#include <linux/bsearch.h>
-#include <linux/dcache.h> /* struct qstr */
-
-static int dirent_points_to_inode_nowarn(struct bch_fs *c,
- struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *inode)
-{
- if (d.v->d_type == DT_SUBVOL
- ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
- : le64_to_cpu(d.v->d_inum) == inode->bi_inum)
- return 0;
- return bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
-}
-
-static void dirent_inode_mismatch_msg(struct printbuf *out,
- struct bch_fs *c,
- struct bkey_s_c_dirent dirent,
- struct bch_inode_unpacked *inode)
-{
- prt_str(out, "inode points to dirent that does not point back:");
- prt_newline(out);
- bch2_bkey_val_to_text(out, c, dirent.s_c);
- prt_newline(out);
- bch2_inode_unpacked_to_text(out, inode);
-}
-
-static int dirent_points_to_inode(struct bch_fs *c,
- struct bkey_s_c_dirent dirent,
- struct bch_inode_unpacked *inode)
-{
- int ret = dirent_points_to_inode_nowarn(c, dirent, inode);
- if (ret) {
- struct printbuf buf = PRINTBUF;
- dirent_inode_mismatch_msg(&buf, c, dirent, inode);
- bch_warn(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
- return ret;
-}
-
-/*
- * XXX: this is handling transaction restarts without returning
- * -BCH_ERR_transaction_restart_nested; this is not how we do things anymore:
- */
-static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
- u32 snapshot)
-{
- u64 sectors = 0;
-
- int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
- SPOS(inum, 0, snapshot),
- POS(inum, U64_MAX),
- 0, k, ({
- if (bkey_extent_is_allocation(k.k))
- sectors += k.k->size;
- 0;
- }));
-
- return ret ?: sectors;
-}
-
-static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
- u32 snapshot)
-{
- u64 subdirs = 0;
-
- int ret = for_each_btree_key_max(trans, iter, BTREE_ID_dirents,
- SPOS(inum, 0, snapshot),
- POS(inum, U64_MAX),
- 0, k, ({
- if (k.k->type == KEY_TYPE_dirent &&
- bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
- subdirs++;
- 0;
- }));
-
- return ret ?: subdirs;
-}
-
-static int subvol_lookup(struct btree_trans *trans, u32 subvol,
- u32 *snapshot, u64 *inum)
-{
- struct bch_subvolume s;
- int ret = bch2_subvolume_get(trans, subvol, false, &s);
-
- *snapshot = le32_to_cpu(s.snapshot);
- *inum = le64_to_cpu(s.inode);
- return ret;
-}
-
-static int lookup_dirent_in_snapshot(struct btree_trans *trans,
- struct bch_hash_info hash_info,
- subvol_inum dir, struct qstr *name,
- u64 *target, unsigned *type, u32 snapshot)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
- &hash_info, dir, name, 0, snapshot);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
- *target = le64_to_cpu(d.v->d_inum);
- *type = d.v->d_type;
- bch2_trans_iter_exit(trans, &iter);
- return 0;
-}
-
-/*
- * Find any subvolume associated with a tree of snapshots
- * We can't rely on master_subvol - it might have been deleted.
- */
-static int find_snapshot_tree_subvol(struct btree_trans *trans,
- u32 tree_id, u32 *subvol)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) {
- if (k.k->type != KEY_TYPE_snapshot)
- continue;
-
- struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
- if (le32_to_cpu(s.v->tree) != tree_id)
- continue;
-
- if (s.v->subvol) {
- *subvol = le32_to_cpu(s.v->subvol);
- goto found;
- }
- }
- ret = bch_err_throw(trans->c, ENOENT_no_snapshot_tree_subvol);
-found:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/* Get lost+found, create if it doesn't exist: */
-static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
- struct bch_inode_unpacked *lostfound,
- u64 reattaching_inum)
-{
- struct bch_fs *c = trans->c;
- struct qstr lostfound_str = QSTR("lost+found");
- struct btree_iter lostfound_iter = {};
- u64 inum = 0;
- unsigned d_type = 0;
- int ret;
-
- struct bch_snapshot_tree st;
- ret = bch2_snapshot_tree_lookup(trans,
- bch2_snapshot_tree(c, snapshot), &st);
- if (ret)
- return ret;
-
- u32 subvolid;
- ret = find_snapshot_tree_subvol(trans,
- bch2_snapshot_tree(c, snapshot), &subvolid);
- bch_err_msg(c, ret, "finding subvol associated with snapshot tree %u",
- bch2_snapshot_tree(c, snapshot));
- if (ret)
- return ret;
-
- struct bch_subvolume subvol;
- ret = bch2_subvolume_get(trans, subvolid, false, &subvol);
- bch_err_msg(c, ret, "looking up subvol %u for snapshot %u", subvolid, snapshot);
- if (ret)
- return ret;
-
- if (!subvol.inode) {
- struct btree_iter iter;
- struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_subvolumes, POS(0, subvolid),
- 0, subvolume);
- ret = PTR_ERR_OR_ZERO(subvol);
- if (ret)
- return ret;
-
- subvol->v.inode = cpu_to_le64(reattaching_inum);
- bch2_trans_iter_exit(trans, &iter);
- }
-
- subvol_inum root_inum = {
- .subvol = subvolid,
- .inum = le64_to_cpu(subvol.inode)
- };
-
- struct bch_inode_unpacked root_inode;
- struct bch_hash_info root_hash_info;
- ret = bch2_inode_find_by_inum_snapshot(trans, root_inum.inum, snapshot, &root_inode, 0);
- bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
- root_inum.inum, subvolid);
- if (ret)
- return ret;
-
- root_hash_info = bch2_hash_info_init(c, &root_inode);
-
- ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
- &lostfound_str, &inum, &d_type, snapshot);
- if (bch2_err_matches(ret, ENOENT))
- goto create_lostfound;
-
- bch_err_fn(c, ret);
- if (ret)
- return ret;
-
- if (d_type != DT_DIR) {
- bch_err(c, "error looking up lost+found: not a directory");
- return bch_err_throw(c, ENOENT_not_directory);
- }
-
- /*
-	 * The bch2_check_dirents pass has already run; dangling dirents
-	 * shouldn't exist here:
- */
- ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, lostfound, 0);
- bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
- inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
- return ret;
-
-create_lostfound:
- /*
- * we always create lost+found in the root snapshot; we don't want
- * different branches of the snapshot tree to have different lost+found
- */
- snapshot = le32_to_cpu(st.root_snapshot);
- /*
- * XXX: we could have a nicer log message here if we had a nice way to
- * walk backpointers to print a path
- */
- struct printbuf path = PRINTBUF;
- ret = bch2_inum_to_path(trans, root_inum, &path);
- if (ret)
- goto err;
-
- bch_notice(c, "creating %s/lost+found in subvol %llu snapshot %u",
- path.buf, root_inum.subvol, snapshot);
- printbuf_exit(&path);
-
- u64 now = bch2_current_time(c);
- u64 cpu = raw_smp_processor_id();
-
- bch2_inode_init_early(c, lostfound);
- bch2_inode_init_late(c, lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
- lostfound->bi_dir = root_inode.bi_inum;
- lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot);
-
- root_inode.bi_nlink++;
-
- ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
- if (ret)
- goto err;
-
- bch2_btree_iter_set_snapshot(trans, &lostfound_iter, snapshot);
- ret = bch2_btree_iter_traverse(trans, &lostfound_iter);
- if (ret)
- goto err;
-
- ret = bch2_dirent_create_snapshot(trans,
- 0, root_inode.bi_inum, snapshot, &root_hash_info,
- mode_to_type(lostfound->bi_mode),
- &lostfound_str,
- lostfound->bi_inum,
- &lostfound->bi_dir_offset,
- BTREE_UPDATE_internal_snapshot_node|
- STR_HASH_must_create) ?:
- bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
- BTREE_UPDATE_internal_snapshot_node);
-err:
- bch_err_msg(c, ret, "creating lost+found");
- bch2_trans_iter_exit(trans, &lostfound_iter);
- return ret;
-}
-
-static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
-{
- if (inode->bi_inum == BCACHEFS_ROOT_INO &&
- inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
- return false;
-
- /*
- * Subvolume roots are special: older versions of subvolume roots may be
-	 * disconnected; it's only the newest version that matters.
- *
- * We only keep a single dirent pointing to a subvolume root, i.e.
- * older versions of snapshots will not have a different dirent pointing
- * to the same subvolume root.
- *
- * This is because dirents that point to subvolumes are only visible in
- * the parent subvolume - versioning is not needed - and keeping them
- * around would break fsck, because when we're crossing subvolumes we
-	 * don't have a consistent snapshot ID to check the inode <-> dirent
- * relationships.
- *
- * Thus, a subvolume root that's been renamed after a snapshot will have
- * a disconnected older version - that's expected.
- *
- * Note that taking a snapshot always updates the root inode (to update
- * the dirent backpointer), so a subvolume root inode with
- * BCH_INODE_has_child_snapshot is never visible.
- */
- if (inode->bi_subvol &&
- (inode->bi_flags & BCH_INODE_has_child_snapshot))
- return false;
-
- return !bch2_inode_has_backpointer(inode) &&
- !(inode->bi_flags & BCH_INODE_unlinked);
-}
-
-static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents,
- SPOS(d_pos.inode, d_pos.offset, snapshot),
- BTREE_ITER_intent|
- BTREE_ITER_with_updates);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (bpos_eq(k.k->p, d_pos)) {
- /*
-		 * bch2_btree_delete_at() doesn't work because the update path doesn't
- * internally use BTREE_ITER_with_updates yet
- */
- struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
- ret = PTR_ERR_OR_ZERO(k);
- if (ret)
- goto err;
-
- bkey_init(&k->k);
- k->k.type = KEY_TYPE_whiteout;
- k->k.p = iter.pos;
- ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node);
- }
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
-{
- struct bch_fs *c = trans->c;
- struct bch_inode_unpacked lostfound;
- char name_buf[20];
- int ret;
-
- u32 dirent_snapshot = inode->bi_snapshot;
- if (inode->bi_subvol) {
- inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
-
- struct btree_iter subvol_iter;
- struct bkey_i_subvolume *subvol =
- bch2_bkey_get_mut_typed(trans, &subvol_iter,
- BTREE_ID_subvolumes, POS(0, inode->bi_subvol),
- 0, subvolume);
- ret = PTR_ERR_OR_ZERO(subvol);
- if (ret)
- return ret;
-
- subvol->v.fs_path_parent = BCACHEFS_ROOT_SUBVOL;
- bch2_trans_iter_exit(trans, &subvol_iter);
-
- u64 root_inum;
- ret = subvol_lookup(trans, inode->bi_parent_subvol,
- &dirent_snapshot, &root_inum);
- if (ret)
- return ret;
-
- snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
- } else {
- snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
- }
-
- ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
- if (ret)
- return ret;
-
- bch_verbose(c, "got lostfound inum %llu", lostfound.bi_inum);
-
- lostfound.bi_nlink += S_ISDIR(inode->bi_mode);
-
- /* ensure lost+found inode is also present in inode snapshot */
- if (!inode->bi_subvol) {
- BUG_ON(!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, lostfound.bi_snapshot));
- lostfound.bi_snapshot = inode->bi_snapshot;
- }
-
- ret = __bch2_fsck_write_inode(trans, &lostfound);
- if (ret)
- return ret;
-
- struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound);
- struct qstr name = QSTR(name_buf);
-
- inode->bi_dir = lostfound.bi_inum;
-
- ret = bch2_dirent_create_snapshot(trans,
- inode->bi_parent_subvol, lostfound.bi_inum,
- dirent_snapshot,
- &dir_hash,
- inode_d_type(inode),
- &name,
- inode->bi_subvol ?: inode->bi_inum,
- &inode->bi_dir_offset,
- BTREE_UPDATE_internal_snapshot_node|
- STR_HASH_must_create);
- if (ret) {
- bch_err_msg(c, ret, "error creating dirent");
- return ret;
- }
-
- ret = __bch2_fsck_write_inode(trans, inode);
- if (ret)
- return ret;
-
- {
- CLASS(printbuf, buf)();
- ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum,
- inode->bi_snapshot, NULL, &buf);
- if (ret)
- return ret;
-
- bch_info(c, "reattached at %s", buf.buf);
- }
-
- /*
-	 * Fix up inodes in child snapshots: if they should also be reattached,
-	 * update the backpointer field; if they should not be, we need to emit
-	 * whiteouts for the dirent we just created.
- */
- if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) {
- snapshot_id_list whiteouts_done;
- struct btree_iter iter;
- struct bkey_s_c k;
-
- darray_init(&whiteouts_done);
-
- for_each_btree_key_reverse_norestart(trans, iter,
- BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1),
- BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) {
- if (k.k->p.offset != inode->bi_inum)
- break;
-
- if (!bkey_is_inode(k.k) ||
- !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) ||
- snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot))
- continue;
-
- struct bch_inode_unpacked child_inode;
- ret = bch2_inode_unpack(k, &child_inode);
- if (ret)
- break;
-
- if (!inode_should_reattach(&child_inode)) {
- ret = maybe_delete_dirent(trans,
- SPOS(lostfound.bi_inum, inode->bi_dir_offset,
- dirent_snapshot),
- k.k->p.snapshot);
- if (ret)
- break;
-
- ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot);
- if (ret)
- break;
- } else {
- iter.snapshot = k.k->p.snapshot;
- child_inode.bi_dir = inode->bi_dir;
- child_inode.bi_dir_offset = inode->bi_dir_offset;
-
- ret = bch2_inode_write_flags(trans, &iter, &child_inode,
- BTREE_UPDATE_internal_snapshot_node);
- if (ret)
- break;
- }
- }
- darray_exit(&whiteouts_done);
- bch2_trans_iter_exit(trans, &iter);
- }
-
- return ret;
-}
-
-static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos pos)
-{
- return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
-}
-
-static int remove_backpointer(struct btree_trans *trans,
- struct bch_inode_unpacked *inode)
-{
- if (!bch2_inode_has_backpointer(inode))
- return 0;
-
- u32 snapshot = inode->bi_snapshot;
-
- if (inode->bi_parent_subvol) {
- int ret = bch2_subvolume_get_snapshot(trans, inode->bi_parent_subvol, &snapshot);
- if (ret)
- return ret;
- }
-
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter,
- SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
- int ret = bkey_err(d) ?:
- dirent_points_to_inode(c, d, inode) ?:
- bch2_fsck_remove_dirent(trans, d.k->p);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
-{
- struct bch_fs *c = trans->c;
-
- struct bch_inode_unpacked inode;
- int ret = bch2_inode_find_by_inum_trans(trans,
- (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
- &inode);
- if (ret)
- return ret;
-
- ret = remove_backpointer(trans, &inode);
- if (!bch2_err_matches(ret, ENOENT))
- bch_err_msg(c, ret, "removing dirent");
- if (ret)
- return ret;
-
- ret = reattach_inode(trans, &inode);
- bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
- return ret;
-}
-
-static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
-{
- struct bch_fs *c = trans->c;
-
- if (!bch2_snapshot_is_leaf(c, snapshotid)) {
- bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
- return bch_err_throw(c, fsck_repair_unimplemented);
- }
-
- /*
- * If inum isn't set, that means we're being called from check_dirents,
- * not check_inodes - the root of this subvolume doesn't exist or we
- * would have found it there:
- */
- if (!inum) {
- struct btree_iter inode_iter = {};
- struct bch_inode_unpacked new_inode;
- u64 cpu = raw_smp_processor_id();
-
- bch2_inode_init_early(c, &new_inode);
- bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
-
- new_inode.bi_subvol = subvolid;
-
- int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
- bch2_btree_iter_traverse(trans, &inode_iter) ?:
- bch2_inode_write(trans, &inode_iter, &new_inode);
- bch2_trans_iter_exit(trans, &inode_iter);
- if (ret)
- return ret;
-
- inum = new_inode.bi_inum;
- }
-
- bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);
-
- struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
- int ret = PTR_ERR_OR_ZERO(new_subvol);
- if (ret)
- return ret;
-
- bkey_subvolume_init(&new_subvol->k_i);
- new_subvol->k.p.offset = subvolid;
- new_subvol->v.snapshot = cpu_to_le32(snapshotid);
- new_subvol->v.inode = cpu_to_le64(inum);
- ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
- if (ret)
- return ret;
-
- struct btree_iter iter;
- struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_snapshots, POS(0, snapshotid),
- 0, snapshot);
- ret = PTR_ERR_OR_ZERO(s);
- bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
- if (ret)
- return ret;
-
- u32 snapshot_tree = le32_to_cpu(s->v.tree);
-
- s->v.subvol = cpu_to_le32(subvolid);
- SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
- bch2_trans_iter_exit(trans, &iter);
-
- struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
- 0, snapshot_tree);
- ret = PTR_ERR_OR_ZERO(st);
- bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
- if (ret)
- return ret;
-
- if (!st->v.master_subvol)
- st->v.master_subvol = cpu_to_le32(subvolid);
-
- bch2_trans_iter_exit(trans, &iter);
- return 0;
-}
-
-static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum)
-{
- struct bch_fs *c = trans->c;
- unsigned i_mode = S_IFREG;
- u64 i_size = 0;
-
- switch (btree) {
- case BTREE_ID_extents: {
- struct btree_iter iter = {};
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
- struct bkey_s_c k = bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum, 0));
- bch2_trans_iter_exit(trans, &iter);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- i_size = k.k->p.offset << 9;
- break;
- }
- case BTREE_ID_dirents:
- i_mode = S_IFDIR;
- break;
- case BTREE_ID_xattrs:
- break;
- default:
- BUG();
- }
-
- struct bch_inode_unpacked new_inode;
- bch2_inode_init_early(c, &new_inode);
- bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
- new_inode.bi_size = i_size;
- new_inode.bi_inum = inum;
- new_inode.bi_snapshot = snapshot;
-
- return __bch2_fsck_write_inode(trans, &new_inode);
-}
-
-static inline void snapshots_seen_exit(struct snapshots_seen *s)
-{
- darray_exit(&s->ids);
-}
-
-static inline void snapshots_seen_init(struct snapshots_seen *s)
-{
- memset(s, 0, sizeof(*s));
-}
-
-static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
-{
- u32 *i;
- __darray_for_each(s->ids, i) {
- if (*i == id)
- return 0;
- if (*i > id)
- break;
- }
-
- int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
- if (ret)
- bch_err(c, "error reallocating snapshots_seen table (size %zu)",
- s->ids.size);
- return ret;
-}
-
-static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
- enum btree_id btree_id, struct bpos pos)
-{
- if (!bkey_eq(s->pos, pos))
- s->ids.nr = 0;
- s->pos = pos;
-
- return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
-}
-
-/**
- * key_visible_in_snapshot - returns true if @id is a descendant of @ancestor,
- * and @ancestor hasn't been overwritten in @seen
- *
- * @c: filesystem handle
- * @seen: list of snapshot ids already seen at current position
- * @id: descendent snapshot id
- * @ancestor: ancestor snapshot id
- *
- * Returns: whether key in @ancestor snapshot is visible in @id snapshot
- */
-static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
- u32 id, u32 ancestor)
-{
- EBUG_ON(id > ancestor);
-
- if (id == ancestor)
- return true;
-
- if (!bch2_snapshot_is_ancestor(c, id, ancestor))
- return false;
-
- /*
-	 * We know that @id is a descendant of @ancestor; we're checking if
-	 * we've seen a key that overwrote @ancestor - i.e. also a descendant of
-	 * @ancestor and with @id as a descendant.
- *
- * But we already know that we're scanning IDs between @id and @ancestor
- * numerically, since snapshot ID lists are kept sorted, so if we find
- * an id that's an ancestor of @id we're done:
- */
- darray_for_each_reverse(seen->ids, i)
- if (*i != ancestor && bch2_snapshot_is_ancestor(c, id, *i))
- return false;
-
- return true;
-}
-
-/**
- * ref_visible - given a key with snapshot id @src that points to a key with
- * snapshot id @dst, test whether there is some snapshot in which @dst is
- * visible.
- *
- * @c: filesystem handle
- * @s: list of snapshot IDs already seen at @src
- * @src: snapshot ID of src key
- * @dst: snapshot ID of dst key
- * Returns: true if there is some snapshot in which @dst is visible
- *
- * Assumes we're visiting @src keys in natural key order
- */
-static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
- u32 src, u32 dst)
-{
- return dst <= src
- ? key_visible_in_snapshot(c, s, dst, src)
- : bch2_snapshot_is_ancestor(c, src, dst);
-}
-
-static int ref_visible2(struct bch_fs *c,
- u32 src, struct snapshots_seen *src_seen,
- u32 dst, struct snapshots_seen *dst_seen)
-{
- if (dst > src) {
- swap(dst, src);
- swap(dst_seen, src_seen);
- }
- return key_visible_in_snapshot(c, src_seen, dst, src);
-}
-
-#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \
- for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \
- (_i)->inode.bi_snapshot <= (_snapshot); _i++) \
- if (key_visible_in_snapshot(_c, _s, _i->inode.bi_snapshot, _snapshot))
-
-struct inode_walker_entry {
- struct bch_inode_unpacked inode;
- bool whiteout;
- u64 count;
- u64 i_size;
-};
-
-struct inode_walker {
- bool first_this_inode;
- bool have_inodes;
- bool recalculate_sums;
- struct bpos last_pos;
-
- DARRAY(struct inode_walker_entry) inodes;
- snapshot_id_list deletes;
-};
-
-static void inode_walker_exit(struct inode_walker *w)
-{
- darray_exit(&w->inodes);
- darray_exit(&w->deletes);
-}
-
-static struct inode_walker inode_walker_init(void)
-{
- return (struct inode_walker) { 0, };
-}
-
-static int add_inode(struct bch_fs *c, struct inode_walker *w,
- struct bkey_s_c inode)
-{
- int ret = darray_push(&w->inodes, ((struct inode_walker_entry) {
- .whiteout = !bkey_is_inode(inode.k),
- }));
- if (ret)
- return ret;
-
- struct inode_walker_entry *n = &darray_last(w->inodes);
- if (!n->whiteout) {
- return bch2_inode_unpack(inode, &n->inode);
- } else {
- n->inode.bi_inum = inode.k->p.offset;
- n->inode.bi_snapshot = inode.k->p.snapshot;
- return 0;
- }
-}
-
-static int get_inodes_all_snapshots(struct btree_trans *trans,
- struct inode_walker *w, u64 inum)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- /*
- * We no longer have inodes for w->last_pos; clear this to avoid
- * screwing up check_i_sectors/check_subdir_count if we take a
- * transaction restart here:
- */
- w->have_inodes = false;
- w->recalculate_sums = false;
- w->inodes.nr = 0;
-
- for_each_btree_key_max_norestart(trans, iter,
- BTREE_ID_inodes, POS(0, inum), SPOS(0, inum, U32_MAX),
- BTREE_ITER_all_snapshots, k, ret) {
- ret = add_inode(c, w, k);
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret)
- return ret;
-
- w->first_this_inode = true;
- w->have_inodes = true;
- return 0;
-}
-
-static int get_visible_inodes(struct btree_trans *trans,
- struct inode_walker *w,
- struct snapshots_seen *s,
- u64 inum)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- w->inodes.nr = 0;
- w->deletes.nr = 0;
-
- for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot),
- BTREE_ITER_all_snapshots, k, ret) {
- if (k.k->p.offset != inum)
- break;
-
- if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
- continue;
-
- if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot))
- continue;
-
- ret = bkey_is_inode(k.k)
- ? add_inode(c, w, k)
- : snapshot_list_add(c, &w->deletes, k.k->p.snapshot);
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
-}
-
-static struct inode_walker_entry *
-lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
-
- struct inode_walker_entry *i = darray_find_p(w->inodes, i,
- bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot));
-
- if (!i)
- return NULL;
-
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- if (fsck_err_on(k.k->p.snapshot != i->inode.bi_snapshot,
- trans, snapshot_key_missing_inode_snapshot,
- "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
- "unexpected because we should always update the inode when we update a key in that inode\n"
- "%s",
- w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot,
- (bch2_bkey_val_to_text(&buf, c, k),
- buf.buf))) {
- if (!i->whiteout) {
- struct bch_inode_unpacked new = i->inode;
- new.bi_snapshot = k.k->p.snapshot;
- ret = __bch2_fsck_write_inode(trans, &new);
- } else {
- struct bkey_i whiteout;
- bkey_init(&whiteout.k);
- whiteout.k.type = KEY_TYPE_whiteout;
- whiteout.k.p = SPOS(0, i->inode.bi_inum, k.k->p.snapshot);
- ret = bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
- &whiteout,
- BTREE_UPDATE_internal_snapshot_node);
- }
-
- if (ret)
- goto fsck_err;
-
- ret = bch2_trans_commit(trans, NULL, NULL, 0);
- if (ret)
- goto fsck_err;
-
- struct inode_walker_entry new_entry = *i;
-
- new_entry.inode.bi_snapshot = k.k->p.snapshot;
- new_entry.count = 0;
- new_entry.i_size = 0;
-
- while (i > w->inodes.data && i[-1].inode.bi_snapshot > k.k->p.snapshot)
- --i;
-
- size_t pos = i - w->inodes.data;
- ret = darray_insert_item(&w->inodes, pos, new_entry);
- if (ret)
- goto fsck_err;
-
- ret = bch_err_throw(c, transaction_restart_nested);
- goto fsck_err;
- }
-
- printbuf_exit(&buf);
- return i;
-fsck_err:
- printbuf_exit(&buf);
- return ERR_PTR(ret);
-}
-
-static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
- struct inode_walker *w,
- struct bkey_s_c k)
-{
- if (w->last_pos.inode != k.k->p.inode) {
- int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
- if (ret)
- return ERR_PTR(ret);
- }
-
- w->last_pos = k.k->p;
-
- return lookup_inode_for_snapshot(trans, w, k);
-}
-
-/*
- * Prefer to delete the first one, since that will be the one at the wrong
- * offset:
- * return value: 0 -> delete k1, 1 -> delete k2
- */
-int bch2_fsck_update_backpointers(struct btree_trans *trans,
- struct snapshots_seen *s,
- const struct bch_hash_desc desc,
- struct bch_hash_info *hash_info,
- struct bkey_i *new)
-{
- if (new->k.type != KEY_TYPE_dirent)
- return 0;
-
- struct bkey_i_dirent *d = bkey_i_to_dirent(new);
- struct inode_walker target = inode_walker_init();
- int ret = 0;
-
- if (d->v.d_type == DT_SUBVOL) {
- bch_err(trans->c, "%s does not support DT_SUBVOL", __func__);
- ret = -BCH_ERR_fsck_repair_unimplemented;
- } else {
- ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum));
- if (ret)
- goto err;
-
- darray_for_each(target.inodes, i) {
- i->inode.bi_dir_offset = d->k.p.offset;
- ret = __bch2_fsck_write_inode(trans, &i->inode);
- if (ret)
- goto err;
- }
- }
-err:
- inode_walker_exit(&target);
- return ret;
-}
-
-static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_inode_unpacked *inode,
- u32 *snapshot)
-{
- if (inode->bi_subvol) {
- u64 inum;
- int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
- if (ret)
- return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
- }
-
- return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
-}
-
-static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
- int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int check_inode_dirent_inode(struct btree_trans *trans,
- struct bch_inode_unpacked *inode,
- bool *write_inode)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
-
- u32 inode_snapshot = inode->bi_snapshot;
- struct btree_iter dirent_iter = {};
- struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
- int ret = bkey_err(d);
- if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
-
- if ((ret || dirent_points_to_inode_nowarn(c, d, inode)) &&
- inode->bi_subvol &&
- (inode->bi_flags & BCH_INODE_has_child_snapshot)) {
- /* Older version of a renamed subvolume root: we won't have a
- * correct dirent for it. That's expected, see
- * inode_should_reattach().
- *
- * We don't clear the backpointer field when doing the rename
- * because there might be arbitrarily many versions in older
- * snapshots.
- */
- inode->bi_dir = 0;
- inode->bi_dir_offset = 0;
- *write_inode = true;
- goto out;
- }
-
- if (fsck_err_on(ret,
- trans, inode_points_to_missing_dirent,
- "inode points to missing dirent\n%s",
- (bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) ||
- fsck_err_on(!ret && dirent_points_to_inode_nowarn(c, d, inode),
- trans, inode_points_to_wrong_dirent,
- "%s",
- (printbuf_reset(&buf),
- dirent_inode_mismatch_msg(&buf, c, d, inode),
- buf.buf))) {
- /*
- * We just clear the backpointer fields for now. If we find a
- * dirent that points to this inode in check_dirents(), we'll
- * update it then; then when we get to check_path() if the
- * backpointer is still 0 we'll reattach it.
- */
- inode->bi_dir = 0;
- inode->bi_dir_offset = 0;
- *write_inode = true;
- }
-out:
- ret = 0;
-fsck_err:
- bch2_trans_iter_exit(trans, &dirent_iter);
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int check_inode(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- struct bch_inode_unpacked *snapshot_root,
- struct snapshots_seen *s)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- struct bch_inode_unpacked u;
- bool do_update = false;
- int ret;
-
- ret = bch2_check_key_has_snapshot(trans, iter, k);
- if (ret < 0)
- goto err;
- if (ret)
- return 0;
-
- ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
- if (ret)
- goto err;
-
- if (!bkey_is_inode(k.k))
- return 0;
-
- ret = bch2_inode_unpack(k, &u);
- if (ret)
- goto err;
-
- if (snapshot_root->bi_inum != u.bi_inum) {
- ret = bch2_inode_find_snapshot_root(trans, u.bi_inum, snapshot_root);
- if (ret)
- goto err;
- }
-
- if (u.bi_hash_seed != snapshot_root->bi_hash_seed ||
- INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root)) {
- ret = bch2_repair_inode_hash_info(trans, snapshot_root);
- BUG_ON(ret == -BCH_ERR_fsck_repair_unimplemented);
- if (ret)
- goto err;
- }
-
- ret = bch2_check_inode_has_case_insensitive(trans, &u, &s->ids, &do_update);
- if (ret)
- goto err;
-
- if (bch2_inode_has_backpointer(&u)) {
- ret = check_inode_dirent_inode(trans, &u, &do_update);
- if (ret)
- goto err;
- }
-
- if (fsck_err_on(bch2_inode_has_backpointer(&u) &&
- (u.bi_flags & BCH_INODE_unlinked),
- trans, inode_unlinked_but_has_dirent,
- "inode unlinked but has dirent\n%s",
- (printbuf_reset(&buf),
- bch2_inode_unpacked_to_text(&buf, &u),
- buf.buf))) {
- u.bi_flags &= ~BCH_INODE_unlinked;
- do_update = true;
- }
-
- if (S_ISDIR(u.bi_mode) && (u.bi_flags & BCH_INODE_unlinked)) {
- /* Check for this early so that check_unreachable_inode() will reattach it */
-
- ret = bch2_empty_dir_snapshot(trans, k.k->p.offset, 0, k.k->p.snapshot);
- if (ret && ret != -BCH_ERR_ENOTEMPTY_dir_not_empty)
- goto err;
-
- fsck_err_on(ret, trans, inode_dir_unlinked_but_not_empty,
- "dir unlinked but not empty\n%s",
- (printbuf_reset(&buf),
- bch2_inode_unpacked_to_text(&buf, &u),
- buf.buf));
- u.bi_flags &= ~BCH_INODE_unlinked;
- do_update = true;
- ret = 0;
- }
-
- if (fsck_err_on(S_ISDIR(u.bi_mode) && u.bi_size,
- trans, inode_dir_has_nonzero_i_size,
- "directory %llu:%u with nonzero i_size %lli",
- u.bi_inum, u.bi_snapshot, u.bi_size)) {
- u.bi_size = 0;
- do_update = true;
- }
-
- ret = bch2_inode_has_child_snapshots(trans, k.k->p);
- if (ret < 0)
- goto err;
-
- if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot),
- trans, inode_has_child_snapshots_wrong,
- "inode has_child_snapshots flag wrong (should be %u)\n%s",
- ret,
- (printbuf_reset(&buf),
- bch2_inode_unpacked_to_text(&buf, &u),
- buf.buf))) {
- if (ret)
- u.bi_flags |= BCH_INODE_has_child_snapshot;
- else
- u.bi_flags &= ~BCH_INODE_has_child_snapshot;
- do_update = true;
- }
- ret = 0;
-
- if ((u.bi_flags & BCH_INODE_unlinked) &&
- !(u.bi_flags & BCH_INODE_has_child_snapshot)) {
- if (!test_bit(BCH_FS_started, &c->flags)) {
- /*
- * If we're not in online fsck, don't delete unlinked
- * inodes, just make sure they're on the deleted list.
- *
- * They might be referred to by a logged operation -
- * i.e. we might have crashed in the middle of a
- * truncate on an unlinked but open file - so we want to
- * let the delete_dead_inodes kill it after resuming
- * logged ops.
- */
- ret = check_inode_deleted_list(trans, k.k->p);
- if (ret < 0)
- goto err_noprint;
-
- fsck_err_on(!ret,
- trans, unlinked_inode_not_on_deleted_list,
- "inode %llu:%u unlinked, but not on deleted list",
- u.bi_inum, k.k->p.snapshot);
-
- ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1);
- if (ret)
- goto err;
- } else {
- ret = bch2_inode_or_descendents_is_open(trans, k.k->p);
- if (ret < 0)
- goto err;
-
- if (fsck_err_on(!ret,
- trans, inode_unlinked_and_not_open,
- "inode %llu:%u unlinked and not open",
- u.bi_inum, u.bi_snapshot)) {
- ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
- bch_err_msg(c, ret, "in fsck deleting inode");
- goto err_noprint;
- }
- ret = 0;
- }
- }
-
- if (fsck_err_on(u.bi_parent_subvol &&
- (u.bi_subvol == 0 ||
- u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
- trans, inode_bi_parent_nonzero,
- "inode %llu:%u has subvol %u but nonzero parent subvol %u",
- u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
- u.bi_parent_subvol = 0;
- do_update = true;
- }
-
- if (u.bi_subvol) {
- struct bch_subvolume s;
-
- ret = bch2_subvolume_get(trans, u.bi_subvol, false, &s);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
- ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
- goto do_update;
- }
-
- if (fsck_err_on(ret,
- trans, inode_bi_subvol_missing,
- "inode %llu:%u bi_subvol points to missing subvolume %u",
- u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
- fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
- !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
- k.k->p.snapshot),
- trans, inode_bi_subvol_wrong,
- "inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
- u.bi_inum, k.k->p.snapshot, u.bi_subvol,
- le64_to_cpu(s.inode),
- le32_to_cpu(s.snapshot))) {
- u.bi_subvol = 0;
- u.bi_parent_subvol = 0;
- do_update = true;
- }
- }
-
- if (fsck_err_on(u.bi_journal_seq > journal_cur_seq(&c->journal),
- trans, inode_journal_seq_in_future,
- "inode journal seq in future (currently at %llu)\n%s",
- journal_cur_seq(&c->journal),
- (printbuf_reset(&buf),
- bch2_inode_unpacked_to_text(&buf, &u),
- buf.buf))) {
- u.bi_journal_seq = journal_cur_seq(&c->journal);
- do_update = true;
- }
-do_update:
- if (do_update) {
- ret = __bch2_fsck_write_inode(trans, &u);
- bch_err_msg(c, ret, "in fsck updating inode");
- if (ret)
- goto err_noprint;
- }
-err:
-fsck_err:
- bch_err_fn(c, ret);
-err_noprint:
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_check_inodes(struct bch_fs *c)
-{
- struct bch_inode_unpacked snapshot_root = {};
- struct snapshots_seen s;
-
- snapshots_seen_init(&s);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
- POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_inode(trans, &iter, k, &snapshot_root, &s)));
-
- snapshots_seen_exit(&s);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int find_oldest_inode_needs_reattach(struct btree_trans *trans,
- struct bch_inode_unpacked *inode)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- /*
- * We look for inodes to reattach in natural key order, leaves first,
- * but we should do the reattach at the oldest version that needs to be
- * reattached:
- */
- for_each_btree_key_norestart(trans, iter,
- BTREE_ID_inodes,
- SPOS(0, inode->bi_inum, inode->bi_snapshot + 1),
- BTREE_ITER_all_snapshots, k, ret) {
- if (k.k->p.offset != inode->bi_inum)
- break;
-
- if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot))
- continue;
-
- if (!bkey_is_inode(k.k))
- break;
-
- struct bch_inode_unpacked parent_inode;
- ret = bch2_inode_unpack(k, &parent_inode);
- if (ret)
- break;
-
- if (!inode_should_reattach(&parent_inode))
- break;
-
- *inode = parent_inode;
- }
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
-}
-
-static int check_unreachable_inode(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- if (!bkey_is_inode(k.k))
- return 0;
-
- struct bch_inode_unpacked inode;
- ret = bch2_inode_unpack(k, &inode);
- if (ret)
- return ret;
-
- if (!inode_should_reattach(&inode))
- return 0;
-
- ret = find_oldest_inode_needs_reattach(trans, &inode);
- if (ret)
- return ret;
-
- if (fsck_err(trans, inode_unreachable,
- "unreachable inode:\n%s",
- (bch2_inode_unpacked_to_text(&buf, &inode),
- buf.buf)))
- ret = reattach_inode(trans, &inode);
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-/*
- * Reattach unreachable (but not unlinked) inodes
- *
- * Run after check_inodes() and check_dirents(), so we know that inode
- * backpointer fields point to valid dirents, and every inode that has a dirent
- * that points to it has its backpointer field set - so we're just looking for
- * non-unlinked inodes without backpointers:
- *
- * XXX: this is racy w.r.t. hardlink removal in online fsck
- */
-int bch2_check_unreachable_inodes(struct bch_fs *c)
-{
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
- POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_unreachable_inode(trans, &iter, k)));
- bch_err_fn(c, ret);
- return ret;
-}
-
-static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
-{
- switch (btree) {
- case BTREE_ID_extents:
- return S_ISREG(mode) || S_ISLNK(mode);
- case BTREE_ID_dirents:
- return S_ISDIR(mode);
- case BTREE_ID_xattrs:
- return true;
- default:
- BUG();
- }
-}
-
-static int check_key_has_inode(struct btree_trans *trans,
- struct btree_iter *iter,
- struct inode_walker *inode,
- struct inode_walker_entry *i,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- struct btree_iter iter2 = {};
- int ret = PTR_ERR_OR_ZERO(i);
- if (ret)
- return ret;
-
- if (k.k->type == KEY_TYPE_whiteout)
- goto out;
-
- bool have_inode = i && !i->whiteout;
-
- if (!have_inode && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes)))
- goto reconstruct;
-
- if (have_inode && btree_matches_i_mode(iter->btree_id, i->inode.bi_mode))
- goto out;
-
- prt_printf(&buf, ", ");
-
- bool have_old_inode = false;
- darray_for_each(inode->inodes, i2)
- if (!i2->whiteout &&
- bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i2->inode.bi_snapshot) &&
- btree_matches_i_mode(iter->btree_id, i2->inode.bi_mode)) {
- prt_printf(&buf, "but found good inode in older snapshot\n");
- bch2_inode_unpacked_to_text(&buf, &i2->inode);
- prt_newline(&buf);
- have_old_inode = true;
- break;
- }
-
- struct bkey_s_c k2;
- unsigned nr_keys = 0;
-
- prt_printf(&buf, "found keys:\n");
-
- for_each_btree_key_max_norestart(trans, iter2, iter->btree_id,
- SPOS(k.k->p.inode, 0, k.k->p.snapshot),
- POS(k.k->p.inode, U64_MAX),
- 0, k2, ret) {
- nr_keys++;
- if (nr_keys <= 10) {
- bch2_bkey_val_to_text(&buf, c, k2);
- prt_newline(&buf);
- }
- if (nr_keys >= 100)
- break;
- }
-
- if (ret)
- goto err;
-
- if (nr_keys > 100)
- prt_printf(&buf, "found > %u keys for this missing inode\n", nr_keys);
- else if (nr_keys > 10)
- prt_printf(&buf, "found %u keys for this missing inode\n", nr_keys);
-
- if (!have_inode) {
- if (fsck_err_on(!have_inode,
- trans, key_in_missing_inode,
- "key in missing inode%s", buf.buf)) {
- /*
- * Maybe a deletion that raced with data move, or something
- * weird like that? But if we know the inode was deleted, or
- * it's just a few keys, we can safely delete them.
- *
- * If it's many keys, we should probably recreate the inode
- */
- if (have_old_inode || nr_keys <= 2)
- goto delete;
- else
- goto reconstruct;
- }
- } else {
- /*
- * not autofix, this one would be a giant wtf - bit error in the
- * inode corrupting i_mode?
- *
- * may want to try repairing inode instead of deleting
- */
- if (fsck_err_on(!btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
- trans, key_in_wrong_inode_type,
- "key for wrong inode mode %o%s",
- i->inode.bi_mode, buf.buf))
- goto delete;
- }
-out:
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &iter2);
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-delete:
- /*
- * XXX: print out more info
- * count up extents for this inode, check if we have a different inode in
- * an older snapshot version, perhaps decide if we want to reconstitute
- */
- ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
- goto out;
-reconstruct:
- ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
- if (ret)
- goto err;
-
- inode->last_pos.inode--;
- ret = bch_err_throw(c, transaction_restart_nested);
- goto out;
-}
-
-static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
-{
- struct bch_fs *c = trans->c;
- int ret = 0;
- s64 count2;
-
- darray_for_each(w->inodes, i) {
- if (i->inode.bi_sectors == i->count)
- continue;
-
- count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->inode.bi_snapshot);
-
- if (w->recalculate_sums)
- i->count = count2;
-
- if (i->count != count2) {
- bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
- w->last_pos.inode, i->inode.bi_snapshot, i->count, count2);
- i->count = count2;
- }
-
- if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
- trans, inode_i_sectors_wrong,
- "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
- w->last_pos.inode, i->inode.bi_snapshot,
- i->inode.bi_sectors, i->count)) {
- i->inode.bi_sectors = i->count;
- ret = bch2_fsck_write_inode(trans, &i->inode);
- if (ret)
- break;
- }
- }
-fsck_err:
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
-{
- u32 restart_count = trans->restart_count;
- return check_i_sectors_notnested(trans, w) ?:
- trans_was_restarted(trans, restart_count);
-}
-
-struct extent_end {
- u32 snapshot;
- u64 offset;
- struct snapshots_seen seen;
-};
-
-struct extent_ends {
- struct bpos last_pos;
- DARRAY(struct extent_end) e;
-};
-
-static void extent_ends_reset(struct extent_ends *extent_ends)
-{
- darray_for_each(extent_ends->e, i)
- snapshots_seen_exit(&i->seen);
- extent_ends->e.nr = 0;
-}
-
-static void extent_ends_exit(struct extent_ends *extent_ends)
-{
- extent_ends_reset(extent_ends);
- darray_exit(&extent_ends->e);
-}
-
-static void extent_ends_init(struct extent_ends *extent_ends)
-{
- memset(extent_ends, 0, sizeof(*extent_ends));
-}
-
-static int extent_ends_at(struct bch_fs *c,
- struct extent_ends *extent_ends,
- struct snapshots_seen *seen,
- struct bkey_s_c k)
-{
- struct extent_end *i, n = (struct extent_end) {
- .offset = k.k->p.offset,
- .snapshot = k.k->p.snapshot,
- .seen = *seen,
- };
-
- n.seen.ids.data = kmemdup(seen->ids.data,
- sizeof(seen->ids.data[0]) * seen->ids.size,
- GFP_KERNEL);
- if (!n.seen.ids.data)
- return bch_err_throw(c, ENOMEM_fsck_extent_ends_at);
-
- __darray_for_each(extent_ends->e, i) {
- if (i->snapshot == k.k->p.snapshot) {
- snapshots_seen_exit(&i->seen);
- *i = n;
- return 0;
- }
-
- if (i->snapshot >= k.k->p.snapshot)
- break;
- }
-
- return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
-}
-
-static int overlapping_extents_found(struct btree_trans *trans,
- enum btree_id btree,
- struct bpos pos1, struct snapshots_seen *pos1_seen,
- struct bkey pos2,
- bool *fixed,
- struct extent_end *extent_end)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- struct btree_iter iter1, iter2 = {};
- struct bkey_s_c k1, k2;
- int ret;
-
- BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));
-
- bch2_trans_iter_init(trans, &iter1, btree, pos1,
- BTREE_ITER_all_snapshots|
- BTREE_ITER_not_extents);
- k1 = bch2_btree_iter_peek_max(trans, &iter1, POS(pos1.inode, U64_MAX));
- ret = bkey_err(k1);
- if (ret)
- goto err;
-
- prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, c, k1);
-
- if (!bpos_eq(pos1, k1.k->p)) {
- prt_str(&buf, "\nwanted\n ");
- bch2_bpos_to_text(&buf, pos1);
- prt_str(&buf, "\n");
- bch2_bkey_to_text(&buf, &pos2);
-
- bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
- __func__, buf.buf);
- ret = bch_err_throw(c, internal_fsck_err);
- goto err;
- }
-
- bch2_trans_copy_iter(trans, &iter2, &iter1);
-
- while (1) {
- bch2_btree_iter_advance(trans, &iter2);
-
- k2 = bch2_btree_iter_peek_max(trans, &iter2, POS(pos1.inode, U64_MAX));
- ret = bkey_err(k2);
- if (ret)
- goto err;
-
- if (bpos_ge(k2.k->p, pos2.p))
- break;
- }
-
- prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, c, k2);
-
- if (bpos_gt(k2.k->p, pos2.p) ||
- pos2.size != k2.k->size) {
- bch_err(c, "%s: error finding seconding overlapping extent when repairing%s",
- __func__, buf.buf);
- ret = bch_err_throw(c, internal_fsck_err);
- goto err;
- }
-
- prt_printf(&buf, "\noverwriting %s extent",
- pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
-
- if (fsck_err(trans, extent_overlapping,
- "overlapping extents%s", buf.buf)) {
- struct btree_iter *old_iter = &iter1;
- struct disk_reservation res = { 0 };
-
- if (pos1.snapshot < pos2.p.snapshot) {
- old_iter = &iter2;
- swap(k1, k2);
- }
-
- trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);
-
- ret = bch2_trans_update_extent_overwrite(trans, old_iter,
- BTREE_UPDATE_internal_snapshot_node,
- k1, k2) ?:
- bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
- bch2_disk_reservation_put(c, &res);
-
- bch_info(c, "repair ret %s", bch2_err_str(ret));
-
- if (ret)
- goto err;
-
- *fixed = true;
-
- if (pos1.snapshot == pos2.p.snapshot) {
- /*
- * We overwrote the first extent, and did the overwrite
- * in the same snapshot:
- */
- extent_end->offset = bkey_start_offset(&pos2);
- } else if (pos1.snapshot > pos2.p.snapshot) {
- /*
- * We overwrote the first extent in pos2's snapshot:
- */
- ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
- } else {
- /*
- * We overwrote the second extent - restart
- * check_extent() from the top:
- */
- ret = bch_err_throw(c, transaction_restart_nested);
- }
- }
-fsck_err:
-err:
- bch2_trans_iter_exit(trans, &iter2);
- bch2_trans_iter_exit(trans, &iter1);
- printbuf_exit(&buf);
- return ret;
-}
-
-static int check_overlapping_extents(struct btree_trans *trans,
- struct snapshots_seen *seen,
- struct extent_ends *extent_ends,
- struct bkey_s_c k,
- struct btree_iter *iter,
- bool *fixed)
-{
- struct bch_fs *c = trans->c;
- int ret = 0;
-
- /* transaction restart, running again */
- if (bpos_eq(extent_ends->last_pos, k.k->p))
- return 0;
-
- if (extent_ends->last_pos.inode != k.k->p.inode)
- extent_ends_reset(extent_ends);
-
- darray_for_each(extent_ends->e, i) {
- if (i->offset <= bkey_start_offset(k.k))
- continue;
-
- if (!ref_visible2(c,
- k.k->p.snapshot, seen,
- i->snapshot, &i->seen))
- continue;
-
- ret = overlapping_extents_found(trans, iter->btree_id,
- SPOS(iter->pos.inode,
- i->offset,
- i->snapshot),
- &i->seen,
- *k.k, fixed, i);
- if (ret)
- goto err;
- }
-
- extent_ends->last_pos = k.k->p;
-err:
- return ret;
-}
-
-static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct bch_extent_crc_unpacked crc;
- const union bch_extent_entry *i;
- unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
-
- bkey_for_each_crc(k.k, ptrs, crc, i)
- if (crc_is_encoded(crc) &&
- crc.uncompressed_size > encoded_extent_max_sectors) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, k);
- bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf);
- printbuf_exit(&buf);
- }
-
- return 0;
-}
-
-static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k,
- struct inode_walker *inode,
- struct snapshots_seen *s,
- struct extent_ends *extent_ends,
- struct disk_reservation *res)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- ret = bch2_check_key_has_snapshot(trans, iter, k);
- if (ret) {
- ret = ret < 0 ? ret : 0;
- goto out;
- }
-
- if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) {
- ret = check_i_sectors(trans, inode);
- if (ret)
- goto err;
- }
-
- ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
- if (ret)
- goto err;
-
- struct inode_walker_entry *extent_i = walk_inode(trans, inode, k);
- ret = PTR_ERR_OR_ZERO(extent_i);
- if (ret)
- goto err;
-
- ret = check_key_has_inode(trans, iter, inode, extent_i, k);
- if (ret)
- goto err;
-
- if (k.k->type != KEY_TYPE_whiteout) {
- ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
- &inode->recalculate_sums);
- if (ret)
- goto err;
-
- /*
- * Check inodes in reverse order, from oldest snapshots to
- * newest, starting from the inode that matches this extent's
- * snapshot. If we didn't have one, iterate over all inodes:
- */
- for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
- inode->inodes.data && i >= inode->inodes.data;
- --i) {
- if (i->inode.bi_snapshot > k.k->p.snapshot ||
- !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot))
- continue;
-
- u64 last_block = round_up(i->inode.bi_size, block_bytes(c)) >> 9;
-
- if (fsck_err_on(k.k->p.offset > last_block &&
- !bkey_extent_is_reservation(k),
- trans, extent_past_end_of_inode,
- "extent type past end of inode %llu:%u, i_size %llu\n%s",
- i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = snapshots_seen_add_inorder(c, s, i->inode.bi_snapshot) ?:
- bch2_fpunch_snapshot(trans,
- SPOS(i->inode.bi_inum,
- last_block,
- i->inode.bi_snapshot),
- POS(i->inode.bi_inum, U64_MAX));
- if (ret)
- goto err;
-
- iter->k.type = KEY_TYPE_whiteout;
- break;
- }
- }
- }
-
- ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc);
- if (ret)
- goto err;
-
- if (bkey_extent_is_allocation(k.k)) {
- for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
- inode->inodes.data && i >= inode->inodes.data;
- --i) {
- if (i->whiteout ||
- i->inode.bi_snapshot > k.k->p.snapshot ||
- !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot))
- continue;
-
- i->count += k.k->size;
- }
- }
-
- if (k.k->type != KEY_TYPE_whiteout) {
- ret = extent_ends_at(c, extent_ends, s, k);
- if (ret)
- goto err;
- }
-out:
-err:
-fsck_err:
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-/*
- * Walk extents: verify that extents have a corresponding S_ISREG inode, and
- * that i_size and i_sectors are consistent
- */
-int bch2_check_extents(struct bch_fs *c)
-{
- struct inode_walker w = inode_walker_init();
- struct snapshots_seen s;
- struct extent_ends extent_ends;
- struct disk_reservation res = { 0 };
-
- snapshots_seen_init(&s);
- extent_ends_init(&extent_ends);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_extents,
- POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
- bch2_disk_reservation_put(c, &res);
- check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?:
- check_extent_overbig(trans, &iter, k);
- })) ?:
- check_i_sectors_notnested(trans, &w));
-
- bch2_disk_reservation_put(c, &res);
- extent_ends_exit(&extent_ends);
- inode_walker_exit(&w);
- snapshots_seen_exit(&s);
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_check_indirect_extents(struct bch_fs *c)
-{
- struct disk_reservation res = { 0 };
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
- POS_MIN,
- BTREE_ITER_prefetch, k,
- &res, NULL,
- BCH_TRANS_COMMIT_no_enospc, ({
- bch2_disk_reservation_put(c, &res);
- check_extent_overbig(trans, &iter, k);
- })));
-
- bch2_disk_reservation_put(c, &res);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
-{
- struct bch_fs *c = trans->c;
- int ret = 0;
- s64 count2;
-
- darray_for_each(w->inodes, i) {
- if (i->inode.bi_nlink == i->count)
- continue;
-
- count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->inode.bi_snapshot);
- if (count2 < 0)
- return count2;
-
- if (i->count != count2) {
- bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
- w->last_pos.inode, i->inode.bi_snapshot, i->count, count2);
- i->count = count2;
- if (i->inode.bi_nlink == i->count)
- continue;
- }
-
- if (i->inode.bi_nlink != i->count) {
- CLASS(printbuf, buf)();
-
- lockrestart_do(trans,
- bch2_inum_snapshot_to_path(trans, w->last_pos.inode,
- i->inode.bi_snapshot, NULL, &buf));
-
- if (fsck_err_on(i->inode.bi_nlink != i->count,
- trans, inode_dir_wrong_nlink,
- "directory with wrong i_nlink: got %u, should be %llu\n%s",
- i->inode.bi_nlink, i->count, buf.buf)) {
- i->inode.bi_nlink = i->count;
- ret = bch2_fsck_write_inode(trans, &i->inode);
- if (ret)
- break;
- }
- }
- }
-fsck_err:
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w)
-{
- u32 restart_count = trans->restart_count;
- return check_subdir_count_notnested(trans, w) ?:
- trans_was_restarted(trans, restart_count);
-}
-
-/* find a subvolume that's a descendant of @snapshot: */
-static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
- if (k.k->type != KEY_TYPE_subvolume)
- continue;
-
- struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
- if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
- bch2_trans_iter_exit(trans, &iter);
- *subvolid = k.k->p.offset;
- goto found;
- }
- }
- if (!ret)
- ret = -ENOENT;
-found:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-noinline_for_stack
-static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c_dirent d)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter subvol_iter = {};
- struct bch_inode_unpacked subvol_root;
- u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
- u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
- u32 parent_snapshot;
- u32 new_parent_subvol = 0;
- u64 parent_inum;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
- if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
-
- if (ret ||
- (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
- int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
- if (ret2 && !bch2_err_matches(ret, ENOENT))
- return ret2;
- }
-
- if (ret &&
- !new_parent_subvol &&
- (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
- /*
- * Couldn't find a subvol for dirent's snapshot - but we lost
- * subvols, so we need to reconstruct:
- */
- ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
- if (ret)
- return ret;
-
- parent_snapshot = d.k->p.snapshot;
- }
-
- if (fsck_err_on(ret,
- trans, dirent_to_missing_parent_subvol,
- "dirent parent_subvol points to missing subvolume\n%s",
- (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
- fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
- trans, dirent_not_visible_in_parent_subvol,
- "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
- parent_snapshot,
- (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
- if (!new_parent_subvol) {
- bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
- return bch_err_throw(c, fsck_repair_unimplemented);
- }
-
- struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
- ret = PTR_ERR_OR_ZERO(new_dirent);
- if (ret)
- goto err;
-
- new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
- }
-
- struct bkey_s_c_subvolume s =
- bch2_bkey_get_iter_typed(trans, &subvol_iter,
- BTREE_ID_subvolumes, POS(0, target_subvol),
- 0, subvolume);
- ret = bkey_err(s.s_c);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- if (ret) {
- if (fsck_err(trans, dirent_to_missing_subvol,
- "dirent points to missing subvolume\n%s",
- (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
- return bch2_fsck_remove_dirent(trans, d.k->p);
- ret = 0;
- goto out;
- }
-
- if (le32_to_cpu(s.v->fs_path_parent) != parent_subvol) {
- printbuf_reset(&buf);
-
- prt_printf(&buf, "subvol with wrong fs_path_parent, should be be %u\n",
- parent_subvol);
-
- ret = bch2_inum_to_path(trans, (subvol_inum) { s.k->p.offset,
- le64_to_cpu(s.v->inode) }, &buf);
- if (ret)
- goto err;
- prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, c, s.s_c);
-
- if (fsck_err(trans, subvol_fs_path_parent_wrong, "%s", buf.buf)) {
- struct bkey_i_subvolume *n =
- bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- goto err;
-
- n->v.fs_path_parent = cpu_to_le32(parent_subvol);
- }
- }
-
- u64 target_inum = le64_to_cpu(s.v->inode);
- u32 target_snapshot = le32_to_cpu(s.v->snapshot);
-
- ret = bch2_inode_find_by_inum_snapshot(trans, target_inum, target_snapshot,
- &subvol_root, 0);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- if (ret) {
- bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
- ret = bch_err_throw(c, fsck_repair_unimplemented);
- goto err;
- }
-
- if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
- trans, inode_bi_parent_wrong,
- "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
- target_inum,
- subvol_root.bi_parent_subvol, parent_subvol)) {
- subvol_root.bi_parent_subvol = parent_subvol;
- subvol_root.bi_snapshot = le32_to_cpu(s.v->snapshot);
- ret = __bch2_fsck_write_inode(trans, &subvol_root);
- if (ret)
- goto err;
- }
-
- ret = bch2_check_dirent_target(trans, iter, d, &subvol_root, true);
- if (ret)
- goto err;
-out:
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &subvol_iter);
- printbuf_exit(&buf);
- return ret;
-}
-
-static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k,
- struct bch_hash_info *hash_info,
- struct inode_walker *dir,
- struct inode_walker *target,
- struct snapshots_seen *s,
- bool *need_second_pass)
-{
- struct bch_fs *c = trans->c;
- struct inode_walker_entry *i;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- ret = bch2_check_key_has_snapshot(trans, iter, k);
- if (ret) {
- ret = ret < 0 ? ret : 0;
- goto out;
- }
-
- ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
- if (ret)
- goto err;
-
- if (k.k->type == KEY_TYPE_whiteout)
- goto out;
-
- if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) {
- ret = check_subdir_dirents_count(trans, dir);
- if (ret)
- goto err;
- }
-
- i = walk_inode(trans, dir, k);
- ret = PTR_ERR_OR_ZERO(i);
- if (ret < 0)
- goto err;
-
- ret = check_key_has_inode(trans, iter, dir, i, k);
- if (ret)
- goto err;
-
- if (!i || i->whiteout)
- goto out;
-
- if (dir->first_this_inode)
- *hash_info = bch2_hash_info_init(c, &i->inode);
- dir->first_this_inode = false;
-
- hash_info->cf_encoding = bch2_inode_casefold(c, &i->inode) ? c->cf_encoding : NULL;
-
- ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info,
- iter, k, need_second_pass);
- if (ret < 0)
- goto err;
- if (ret) {
- /* dirent has been deleted */
- ret = 0;
- goto out;
- }
-
- if (k.k->type != KEY_TYPE_dirent)
- goto out;
-
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-
- /* check casefold */
- if (fsck_err_on(d.v->d_casefold != !!hash_info->cf_encoding,
- trans, dirent_casefold_mismatch,
- "dirent casefold does not match dir casefold\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k),
- buf.buf))) {
- subvol_inum dir_inum = { .subvol = d.v->d_type == DT_SUBVOL
- ? le32_to_cpu(d.v->d_parent_subvol)
- : 0,
- };
- u64 target = d.v->d_type == DT_SUBVOL
- ? le32_to_cpu(d.v->d_child_subvol)
- : le64_to_cpu(d.v->d_inum);
- struct qstr name = bch2_dirent_get_name(d);
-
- struct bkey_i_dirent *new_d =
- bch2_dirent_create_key(trans, hash_info, dir_inum,
- d.v->d_type, &name, NULL, target);
- ret = PTR_ERR_OR_ZERO(new_d);
- if (ret)
- goto out;
-
- new_d->k.p.inode = d.k->p.inode;
- new_d->k.p.snapshot = d.k->p.snapshot;
-
- struct btree_iter dup_iter = {};
- ret = bch2_hash_delete_at(trans,
- bch2_dirent_hash_desc, hash_info, iter,
- BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_str_hash_repair_key(trans, s,
- &bch2_dirent_hash_desc, hash_info,
- iter, bkey_i_to_s_c(&new_d->k_i),
- &dup_iter, bkey_s_c_null,
- need_second_pass);
- goto out;
- }
-
- if (d.v->d_type == DT_SUBVOL) {
- ret = check_dirent_to_subvol(trans, iter, d);
- if (ret)
- goto err;
- } else {
- ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
- if (ret)
- goto err;
-
- if (fsck_err_on(!target->inodes.nr,
- trans, dirent_to_missing_inode,
- "dirent points to missing inode:\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k),
- buf.buf))) {
- ret = bch2_fsck_remove_dirent(trans, d.k->p);
- if (ret)
- goto err;
- }
-
- darray_for_each(target->inodes, i) {
- ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true);
- if (ret)
- goto err;
- }
-
- darray_for_each(target->deletes, i)
- if (fsck_err_on(!snapshot_list_has_id(&s->ids, *i),
- trans, dirent_to_overwritten_inode,
- "dirent points to inode overwritten in snapshot %u:\n%s",
- *i,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k),
- buf.buf))) {
- struct btree_iter delete_iter;
- bch2_trans_iter_init(trans, &delete_iter,
- BTREE_ID_dirents,
- SPOS(k.k->p.inode, k.k->p.offset, *i),
- BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(trans, &delete_iter) ?:
- bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
- hash_info,
- &delete_iter,
- BTREE_UPDATE_internal_snapshot_node);
- bch2_trans_iter_exit(trans, &delete_iter);
- if (ret)
- goto err;
-
- }
- }
-
- ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
- if (ret)
- goto err;
-
- for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) {
- if (d.v->d_type == DT_DIR)
- i->count++;
- i->i_size += bkey_bytes(d.k);
- }
-out:
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-/*
- * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
- * validate d_type
- */
-int bch2_check_dirents(struct bch_fs *c)
-{
- struct inode_walker dir = inode_walker_init();
- struct inode_walker target = inode_walker_init();
- struct snapshots_seen s;
- struct bch_hash_info hash_info;
- bool need_second_pass = false, did_second_pass = false;
- int ret;
-
- snapshots_seen_init(&s);
-again:
- ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
- POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s,
- &need_second_pass)) ?:
- check_subdir_count_notnested(trans, &dir));
-
- if (!ret && need_second_pass && !did_second_pass) {
- bch_info(c, "check_dirents requires second pass");
- swap(did_second_pass, need_second_pass);
- goto again;
- }
-
- if (!ret && need_second_pass) {
- bch_err(c, "dirents not repairing");
- ret = -EINVAL;
- }
-
- snapshots_seen_exit(&s);
- inode_walker_exit(&dir);
- inode_walker_exit(&target);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k,
- struct bch_hash_info *hash_info,
- struct inode_walker *inode)
-{
- struct bch_fs *c = trans->c;
-
- int ret = bch2_check_key_has_snapshot(trans, iter, k);
- if (ret < 0)
- return ret;
- if (ret)
- return 0;
-
- struct inode_walker_entry *i = walk_inode(trans, inode, k);
- ret = PTR_ERR_OR_ZERO(i);
- if (ret)
- return ret;
-
- ret = check_key_has_inode(trans, iter, inode, i, k);
- if (ret)
- return ret;
-
- if (!i || i->whiteout)
- return 0;
-
- if (inode->first_this_inode)
- *hash_info = bch2_hash_info_init(c, &i->inode);
- inode->first_this_inode = false;
-
- bool need_second_pass = false;
- return bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info,
- iter, k, &need_second_pass);
-}
-
-/*
- * Walk xattrs: verify that they all have a corresponding inode
- */
-int bch2_check_xattrs(struct bch_fs *c)
-{
- struct inode_walker inode = inode_walker_init();
- struct bch_hash_info hash_info;
- int ret = 0;
-
- ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
- POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
- k,
- NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
- check_xattr(trans, &iter, k, &hash_info, &inode)));
-
- inode_walker_exit(&inode);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int check_root_trans(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- struct bch_inode_unpacked root_inode;
- u32 snapshot;
- u64 inum;
- int ret;
-
- ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
- if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
-
- if (mustfix_fsck_err_on(ret, trans, root_subvol_missing,
- "root subvol missing")) {
- struct bkey_i_subvolume *root_subvol =
- bch2_trans_kmalloc(trans, sizeof(*root_subvol));
- ret = PTR_ERR_OR_ZERO(root_subvol);
- if (ret)
- goto err;
-
- snapshot = U32_MAX;
- inum = BCACHEFS_ROOT_INO;
-
- bkey_subvolume_init(&root_subvol->k_i);
- root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
- root_subvol->v.flags = 0;
- root_subvol->v.snapshot = cpu_to_le32(snapshot);
- root_subvol->v.inode = cpu_to_le64(inum);
- ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
- bch_err_msg(c, ret, "writing root subvol");
- if (ret)
- goto err;
- }
-
- ret = bch2_inode_find_by_inum_snapshot(trans, BCACHEFS_ROOT_INO, snapshot,
- &root_inode, 0);
- if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
-
- if (mustfix_fsck_err_on(ret,
- trans, root_dir_missing,
- "root directory missing") ||
- mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
- trans, root_inode_not_dir,
- "root inode not a directory")) {
- bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
- 0, NULL);
- root_inode.bi_inum = inum;
- root_inode.bi_snapshot = snapshot;
-
- ret = __bch2_fsck_write_inode(trans, &root_inode);
- bch_err_msg(c, ret, "writing root inode");
- }
-err:
-fsck_err:
- return ret;
-}
-
-/* Get root directory, create if it doesn't exist: */
-int bch2_check_root(struct bch_fs *c)
-{
- int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_root_trans(trans));
- bch_err_fn(c, ret);
- return ret;
-}
-
-static bool darray_u32_has(darray_u32 *d, u32 v)
-{
- darray_for_each(*d, i)
- if (*i == v)
- return true;
- return false;
-}
-
-static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter parent_iter = {};
- darray_u32 subvol_path = {};
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- if (k.k->type != KEY_TYPE_subvolume)
- return 0;
-
- subvol_inum start = {
- .subvol = k.k->p.offset,
- .inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode),
- };
-
- while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
- ret = darray_push(&subvol_path, k.k->p.offset);
- if (ret)
- goto err;
-
- struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
-
- struct bch_inode_unpacked subvol_root;
- ret = bch2_inode_find_by_inum_trans(trans,
- (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
- &subvol_root);
- if (ret)
- break;
-
- u32 parent = le32_to_cpu(s.v->fs_path_parent);
-
- if (darray_u32_has(&subvol_path, parent)) {
- printbuf_reset(&buf);
- prt_printf(&buf, "subvolume loop: ");
-
- ret = bch2_inum_to_path(trans, start, &buf);
- if (ret)
- goto err;
-
- if (fsck_err(trans, subvol_loop, "%s", buf.buf))
- ret = reattach_subvol(trans, s);
- break;
- }
-
- bch2_trans_iter_exit(trans, &parent_iter);
- bch2_trans_iter_init(trans, &parent_iter,
- BTREE_ID_subvolumes, POS(0, parent), 0);
- k = bch2_btree_iter_peek_slot(trans, &parent_iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
- trans, subvol_unreachable,
- "unreachable subvolume %s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, s.s_c),
- buf.buf))) {
- ret = reattach_subvol(trans, s);
- break;
- }
- }
-fsck_err:
-err:
- printbuf_exit(&buf);
- darray_exit(&subvol_path);
- bch2_trans_iter_exit(trans, &parent_iter);
- return ret;
-}
-
-int bch2_check_subvolume_structure(struct bch_fs *c)
-{
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_subvol_path(trans, &iter, k)));
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int bch2_bi_depth_renumber_one(struct btree_trans *trans,
- u64 inum, u32 snapshot,
- u32 new_depth)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, inum, snapshot), 0);
-
- struct bch_inode_unpacked inode;
- int ret = bkey_err(k) ?:
- !bkey_is_inode(k.k) ? -BCH_ERR_ENOENT_inode
- : bch2_inode_unpack(k, &inode);
- if (ret)
- goto err;
-
- if (inode.bi_depth != new_depth) {
- inode.bi_depth = new_depth;
- ret = __bch2_fsck_write_inode(trans, &inode) ?:
- bch2_trans_commit(trans, NULL, NULL, 0);
- }
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int bch2_bi_depth_renumber(struct btree_trans *trans, darray_u64 *path,
- u32 snapshot, u32 new_bi_depth)
-{
- u32 restart_count = trans->restart_count;
- int ret = 0;
-
- darray_for_each_reverse(*path, i) {
- ret = nested_lockrestart_do(trans,
- bch2_bi_depth_renumber_one(trans, *i, snapshot, new_bi_depth));
- bch_err_fn(trans->c, ret);
- if (ret)
- break;
-
- new_bi_depth++;
- }
-
- return ret ?: trans_was_restarted(trans, restart_count);
-}
-
-static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter inode_iter = {};
- darray_u64 path = {};
- struct printbuf buf = PRINTBUF;
- u32 snapshot = inode_k.k->p.snapshot;
- bool redo_bi_depth = false;
- u32 min_bi_depth = U32_MAX;
- int ret = 0;
-
- struct bpos start = inode_k.k->p;
-
- struct bch_inode_unpacked inode;
- ret = bch2_inode_unpack(inode_k, &inode);
- if (ret)
- return ret;
-
- /*
- * If we're running full fsck, check_dirents() will have already run,
- * and we shouldn't see any missing backpointers here - otherwise that's
- * handled separately, by check_unreachable_inodes
- */
- while (!inode.bi_subvol &&
- bch2_inode_has_backpointer(&inode)) {
- struct btree_iter dirent_iter;
- struct bkey_s_c_dirent d;
-
- d = dirent_get_by_pos(trans, &dirent_iter,
- SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot));
- ret = bkey_err(d.s_c);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto out;
-
- if (!ret && (ret = dirent_points_to_inode(c, d, &inode)))
- bch2_trans_iter_exit(trans, &dirent_iter);
-
- if (bch2_err_matches(ret, ENOENT)) {
- printbuf_reset(&buf);
- bch2_bkey_val_to_text(&buf, c, inode_k);
- bch_err(c, "unreachable inode in check_directory_structure: %s\n%s",
- bch2_err_str(ret), buf.buf);
- goto out;
- }
-
- bch2_trans_iter_exit(trans, &dirent_iter);
-
- ret = darray_push(&path, inode.bi_inum);
- if (ret)
- return ret;
-
- bch2_trans_iter_exit(trans, &inode_iter);
- inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
- SPOS(0, inode.bi_dir, snapshot), 0);
-
- struct bch_inode_unpacked parent_inode;
- ret = bkey_err(inode_k) ?:
- !bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
- : bch2_inode_unpack(inode_k, &parent_inode);
- if (ret) {
- /* Should have been caught in dirents pass */
- bch_err_msg(c, ret, "error looking up parent directory");
- goto out;
- }
-
- min_bi_depth = parent_inode.bi_depth;
-
- if (parent_inode.bi_depth < inode.bi_depth &&
- min_bi_depth < U16_MAX)
- break;
-
- inode = parent_inode;
- redo_bi_depth = true;
-
- if (darray_find(path, inode.bi_inum)) {
- printbuf_reset(&buf);
- prt_printf(&buf, "directory structure loop in snapshot %u: ",
- snapshot);
-
- ret = bch2_inum_snapshot_to_path(trans, start.offset, start.snapshot, NULL, &buf);
- if (ret)
- goto out;
-
- if (c->opts.verbose) {
- prt_newline(&buf);
- darray_for_each(path, i)
- prt_printf(&buf, "%llu ", *i);
- }
-
- if (fsck_err(trans, dir_loop, "%s", buf.buf)) {
- ret = remove_backpointer(trans, &inode);
- bch_err_msg(c, ret, "removing dirent");
- if (ret)
- goto out;
-
- ret = reattach_inode(trans, &inode);
- bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
- }
-
- goto out;
- }
- }
-
- if (inode.bi_subvol)
- min_bi_depth = 0;
-
- if (redo_bi_depth)
- ret = bch2_bi_depth_renumber(trans, &path, snapshot, min_bi_depth);
-out:
-fsck_err:
- bch2_trans_iter_exit(trans, &inode_iter);
- darray_exit(&path);
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-/*
- * Check for loops in the directory structure: all other connectivity issues
- * have been fixed by prior passes
- */
-int bch2_check_directory_structure(struct bch_fs *c)
-{
- int ret = bch2_trans_run(c,
- for_each_btree_key_reverse_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
- BTREE_ITER_intent|
- BTREE_ITER_prefetch|
- BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
- if (!S_ISDIR(bkey_inode_mode(k)))
- continue;
-
- if (bch2_inode_flags(k) & BCH_INODE_unlinked)
- continue;
-
- check_path_loop(trans, k);
- })));
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-struct nlink_table {
- size_t nr;
- size_t size;
-
- struct nlink {
- u64 inum;
- u32 snapshot;
- u32 count;
- } *d;
-};
-
-static int add_nlink(struct bch_fs *c, struct nlink_table *t,
- u64 inum, u32 snapshot)
-{
- if (t->nr == t->size) {
- size_t new_size = max_t(size_t, 128UL, t->size * 2);
- void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
-
- if (!d) {
- bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
- new_size);
- return bch_err_throw(c, ENOMEM_fsck_add_nlink);
- }
-
- if (t->d)
- memcpy(d, t->d, t->size * sizeof(t->d[0]));
- kvfree(t->d);
-
- t->d = d;
- t->size = new_size;
- }
-
- t->d[t->nr++] = (struct nlink) {
- .inum = inum,
- .snapshot = snapshot,
- };
-
- return 0;
-}
-
-static int nlink_cmp(const void *_l, const void *_r)
-{
- const struct nlink *l = _l;
- const struct nlink *r = _r;
-
- return cmp_int(l->inum, r->inum);
-}
-
-static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
- struct nlink_table *links,
- u64 range_start, u64 range_end, u64 inum, u32 snapshot)
-{
- struct nlink *link, key = {
- .inum = inum, .snapshot = U32_MAX,
- };
-
- if (inum < range_start || inum >= range_end)
- return;
-
- link = __inline_bsearch(&key, links->d, links->nr,
- sizeof(links->d[0]), nlink_cmp);
- if (!link)
- return;
-
- while (link > links->d && link[0].inum == link[-1].inum)
- --link;
-
- for (; link < links->d + links->nr && link->inum == inum; link++)
- if (ref_visible(c, s, snapshot, link->snapshot)) {
- link->count++;
- if (link->snapshot >= snapshot)
- break;
- }
-}
-
-noinline_for_stack
-static int check_nlinks_find_hardlinks(struct bch_fs *c,
- struct nlink_table *t,
- u64 start, u64 *end)
-{
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_inodes,
- POS(0, start),
- BTREE_ITER_intent|
- BTREE_ITER_prefetch|
- BTREE_ITER_all_snapshots, k, ({
- if (!bkey_is_inode(k.k))
- continue;
-
- /* Should never fail, checked by bch2_inode_invalid: */
- struct bch_inode_unpacked u;
- _ret3 = bch2_inode_unpack(k, &u);
- if (_ret3)
- break;
-
- /*
- * Backpointer and directory structure checks are sufficient for
- * directories, since they can't have hardlinks:
- */
- if (S_ISDIR(u.bi_mode))
- continue;
-
- /*
- * Previous passes ensured that bi_nlink is nonzero if
- * it had multiple hardlinks:
- */
- if (!u.bi_nlink)
- continue;
-
- ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
- if (ret) {
- *end = k.k->p.offset;
- ret = 0;
- break;
- }
- 0;
- })));
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-noinline_for_stack
-static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
- u64 range_start, u64 range_end)
-{
- struct snapshots_seen s;
-
- snapshots_seen_init(&s);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
- BTREE_ITER_intent|
- BTREE_ITER_prefetch|
- BTREE_ITER_all_snapshots, k, ({
- ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
- if (ret)
- break;
-
- if (k.k->type == KEY_TYPE_dirent) {
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-
- if (d.v->d_type != DT_DIR &&
- d.v->d_type != DT_SUBVOL)
- inc_link(c, &s, links, range_start, range_end,
- le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
- }
- 0;
- })));
-
- snapshots_seen_exit(&s);
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k,
- struct nlink_table *links,
- size_t *idx, u64 range_end)
-{
- struct bch_inode_unpacked u;
- struct nlink *link = &links->d[*idx];
- int ret = 0;
-
- if (k.k->p.offset >= range_end)
- return 1;
-
- if (!bkey_is_inode(k.k))
- return 0;
-
- ret = bch2_inode_unpack(k, &u);
- if (ret)
- return ret;
-
- if (S_ISDIR(u.bi_mode))
- return 0;
-
- if (!u.bi_nlink)
- return 0;
-
- while ((cmp_int(link->inum, k.k->p.offset) ?:
- cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
- BUG_ON(*idx == links->nr);
- link = &links->d[++*idx];
- }
-
- if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
- trans, inode_wrong_nlink,
- "inode %llu type %s has wrong i_nlink (%u, should be %u)",
- u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
- bch2_inode_nlink_get(&u), link->count)) {
- bch2_inode_nlink_set(&u, link->count);
- ret = __bch2_fsck_write_inode(trans, &u);
- }
-fsck_err:
- return ret;
-}
-
-noinline_for_stack
-static int check_nlinks_update_hardlinks(struct bch_fs *c,
- struct nlink_table *links,
- u64 range_start, u64 range_end)
-{
- size_t idx = 0;
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
- POS(0, range_start),
- BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
- if (ret < 0) {
- bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
- return ret;
- }
-
- return 0;
-}
-
-int bch2_check_nlinks(struct bch_fs *c)
-{
- struct nlink_table links = { 0 };
- u64 this_iter_range_start, next_iter_range_start = 0;
- int ret = 0;
-
- do {
- this_iter_range_start = next_iter_range_start;
- next_iter_range_start = U64_MAX;
-
- ret = check_nlinks_find_hardlinks(c, &links,
- this_iter_range_start,
- &next_iter_range_start);
-
- ret = check_nlinks_walk_dirents(c, &links,
- this_iter_range_start,
- next_iter_range_start);
- if (ret)
- break;
-
- ret = check_nlinks_update_hardlinks(c, &links,
- this_iter_range_start,
- next_iter_range_start);
- if (ret)
- break;
-
- links.nr = 0;
- } while (next_iter_range_start != U64_MAX);
-
- kvfree(links.d);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bkey_s_c_reflink_p p;
- struct bkey_i_reflink_p *u;
-
- if (k.k->type != KEY_TYPE_reflink_p)
- return 0;
-
- p = bkey_s_c_to_reflink_p(k);
-
- if (!p.v->front_pad && !p.v->back_pad)
- return 0;
-
- u = bch2_trans_kmalloc(trans, sizeof(*u));
- int ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- return ret;
-
- bkey_reassemble(&u->k_i, k);
- u->v.front_pad = 0;
- u->v.back_pad = 0;
-
- return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun);
-}
-
-int bch2_fix_reflink_p(struct bch_fs *c)
-{
- if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
- return 0;
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_extents, POS_MIN,
- BTREE_ITER_intent|BTREE_ITER_prefetch|
- BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- fix_reflink_p_key(trans, &iter, k)));
- bch_err_fn(c, ret);
- return ret;
-}
-
-#ifndef NO_BCACHEFS_CHARDEV
-
-struct fsck_thread {
- struct thread_with_stdio thr;
- struct bch_fs *c;
- struct bch_opts opts;
-};
-
-static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
-{
- struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
- kfree(thr);
-}
-
-static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
-{
- struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
- struct bch_fs *c = thr->c;
-
- int ret = PTR_ERR_OR_ZERO(c);
- if (ret)
- return ret;
-
- ret = bch2_fs_start(thr->c);
- if (ret)
- goto err;
-
- if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
- bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
- ret |= 1;
- }
- if (test_bit(BCH_FS_error, &c->flags)) {
- bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
- ret |= 4;
- }
-err:
- bch2_fs_stop(c);
- return ret;
-}
-
-static const struct thread_with_stdio_ops bch2_offline_fsck_ops = {
- .exit = bch2_fsck_thread_exit,
- .fn = bch2_fsck_offline_thread_fn,
-};
-
-long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
-{
- struct bch_ioctl_fsck_offline arg;
- struct fsck_thread *thr = NULL;
- darray_const_str devs = {};
- long ret = 0;
-
- if (copy_from_user(&arg, user_arg, sizeof(arg)))
- return -EFAULT;
-
- if (arg.flags)
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- for (size_t i = 0; i < arg.nr_devs; i++) {
- u64 dev_u64;
- ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64));
- if (ret)
- goto err;
-
- char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX);
- ret = PTR_ERR_OR_ZERO(dev_str);
- if (ret)
- goto err;
-
- ret = darray_push(&devs, dev_str);
- if (ret) {
- kfree(dev_str);
- goto err;
- }
- }
-
- thr = kzalloc(sizeof(*thr), GFP_KERNEL);
- if (!thr) {
- ret = -ENOMEM;
- goto err;
- }
-
- thr->opts = bch2_opts_empty();
-
- if (arg.opts) {
- char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
- ret = PTR_ERR_OR_ZERO(optstr) ?:
- bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr, false);
- if (!IS_ERR(optstr))
- kfree(optstr);
-
- if (ret)
- goto err;
- }
-
- opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
- opt_set(thr->opts, read_only, 1);
- opt_set(thr->opts, ratelimit_errors, 0);
-
- /* We need request_key() to be called before we punt to kthread: */
- opt_set(thr->opts, nostart, true);
-
- bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops);
-
- thr->c = bch2_fs_open(&devs, &thr->opts);
-
- if (!IS_ERR(thr->c) &&
- thr->c->opts.errors == BCH_ON_ERROR_panic)
- thr->c->opts.errors = BCH_ON_ERROR_ro;
-
- ret = __bch2_run_thread_with_stdio(&thr->thr);
-out:
- darray_for_each(devs, i)
- kfree(*i);
- darray_exit(&devs);
- return ret;
-err:
- if (thr)
- bch2_fsck_thread_exit(&thr->thr);
- pr_err("ret %s", bch2_err_str(ret));
- goto out;
-}
-
-static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
-{
- struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
- struct bch_fs *c = thr->c;
-
- c->stdio_filter = current;
- c->stdio = &thr->thr.stdio;
-
- /*
- * XXX: can we figure out a way to do this without mucking with c->opts?
- */
- unsigned old_fix_errors = c->opts.fix_errors;
- if (opt_defined(thr->opts, fix_errors))
- c->opts.fix_errors = thr->opts.fix_errors;
- else
- c->opts.fix_errors = FSCK_FIX_ask;
-
- c->opts.fsck = true;
- set_bit(BCH_FS_in_fsck, &c->flags);
-
- int ret = bch2_run_online_recovery_passes(c, ~0ULL);
-
- clear_bit(BCH_FS_in_fsck, &c->flags);
- bch_err_fn(c, ret);
-
- c->stdio = NULL;
- c->stdio_filter = NULL;
- c->opts.fix_errors = old_fix_errors;
-
- up(&c->recovery.run_lock);
- bch2_ro_ref_put(c);
- return ret;
-}
-
-static const struct thread_with_stdio_ops bch2_online_fsck_ops = {
- .exit = bch2_fsck_thread_exit,
- .fn = bch2_fsck_online_thread_fn,
-};
-
-long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg)
-{
- struct fsck_thread *thr = NULL;
- long ret = 0;
-
- if (arg.flags)
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (!bch2_ro_ref_tryget(c))
- return -EROFS;
-
- if (down_trylock(&c->recovery.run_lock)) {
- bch2_ro_ref_put(c);
- return -EAGAIN;
- }
-
- thr = kzalloc(sizeof(*thr), GFP_KERNEL);
- if (!thr) {
- ret = -ENOMEM;
- goto err;
- }
-
- thr->c = c;
- thr->opts = bch2_opts_empty();
-
- if (arg.opts) {
- char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
-
- ret = PTR_ERR_OR_ZERO(optstr) ?:
- bch2_parse_mount_opts(c, &thr->opts, NULL, optstr, false);
- if (!IS_ERR(optstr))
- kfree(optstr);
-
- if (ret)
- goto err;
- }
-
- ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
-err:
- if (ret < 0) {
- bch_err_fn(c, ret);
- if (thr)
- bch2_fsck_thread_exit(&thr->thr);
- up(&c->recovery.run_lock);
- bch2_ro_ref_put(c);
- }
- return ret;
-}
-
-#endif /* NO_BCACHEFS_CHARDEV */
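Note: the offline fsck thread above reports its result with fsck(8)-style status bits: 1 when errors were found and fixed, 4 when errors remain. A minimal, self-contained sketch of how a caller might decode such a status follows; decode_fsck_status() is a hypothetical helper used purely for illustration and is not part of the interface removed here.

#include <stdio.h>

/* Hypothetical helper: interpret fsck(8)-style status bits as used above. */
static void decode_fsck_status(int status)
{
	if (status & 1)
		printf("errors were found and fixed\n");
	if (status & 4)
		printf("errors remain uncorrected\n");
	if (!(status & (1 | 4)))
		printf("filesystem is clean\n");
}

int main(void)
{
	decode_fsck_status(1);	/* e.g. the "errors fixed" case */
	return 0;
}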
diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h
deleted file mode 100644
index e5fe7cf7b251..000000000000
--- a/fs/bcachefs/fsck.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FSCK_H
-#define _BCACHEFS_FSCK_H
-
-#include "str_hash.h"
-
-/* records snapshot IDs of overwrites at @pos */
-struct snapshots_seen {
- struct bpos pos;
- snapshot_id_list ids;
-};
-
-int bch2_fsck_update_backpointers(struct btree_trans *,
- struct snapshots_seen *,
- const struct bch_hash_desc,
- struct bch_hash_info *,
- struct bkey_i *);
-
-int bch2_check_inodes(struct bch_fs *);
-int bch2_check_extents(struct bch_fs *);
-int bch2_check_indirect_extents(struct bch_fs *);
-int bch2_check_dirents(struct bch_fs *);
-int bch2_check_xattrs(struct bch_fs *);
-int bch2_check_root(struct bch_fs *);
-int bch2_check_subvolume_structure(struct bch_fs *);
-int bch2_check_unreachable_inodes(struct bch_fs *);
-int bch2_check_directory_structure(struct bch_fs *);
-int bch2_check_nlinks(struct bch_fs *);
-int bch2_fix_reflink_p(struct bch_fs *);
-
-long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *);
-long bch2_ioctl_fsck_online(struct bch_fs *, struct bch_ioctl_fsck_online);
-
-#endif /* _BCACHEFS_FSCK_H */
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
deleted file mode 100644
index ef4cc7395b86..000000000000
--- a/fs/bcachefs/inode.c
+++ /dev/null
@@ -1,1566 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_key_cache.h"
-#include "btree_write_buffer.h"
-#include "bkey_methods.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "compress.h"
-#include "dirent.h"
-#include "disk_accounting.h"
-#include "error.h"
-#include "extents.h"
-#include "extent_update.h"
-#include "fs.h"
-#include "inode.h"
-#include "namei.h"
-#include "opts.h"
-#include "str_hash.h"
-#include "snapshot.h"
-#include "subvolume.h"
-#include "varint.h"
-
-#include <linux/random.h>
-
-#include <linux/unaligned.h>
-
-#define x(name, ...) #name,
-const char * const bch2_inode_opts[] = {
- BCH_INODE_OPTS()
- NULL,
-};
-
-static const char * const bch2_inode_flag_strs[] = {
- BCH_INODE_FLAGS()
- NULL
-};
-#undef x
-
-static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos);
-static int may_delete_deleted_inum(struct btree_trans *, subvol_inum);
-
-static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
-
-static int inode_decode_field(const u8 *in, const u8 *end,
- u64 out[2], unsigned *out_bits)
-{
- __be64 be[2] = { 0, 0 };
- unsigned bytes, shift;
- u8 *p;
-
- if (in >= end)
- return -BCH_ERR_inode_unpack_error;
-
- if (!*in)
- return -BCH_ERR_inode_unpack_error;
-
- /*
- * position of highest set bit indicates number of bytes:
- * shift = number of bits to remove in high byte:
- */
- shift = 8 - __fls(*in); /* 1 <= shift <= 8 */
- bytes = byte_table[shift - 1];
-
- if (in + bytes > end)
- return -BCH_ERR_inode_unpack_error;
-
- p = (u8 *) be + 16 - bytes;
- memcpy(p, in, bytes);
- *p ^= (1 << 8) >> shift;
-
- out[0] = be64_to_cpu(be[0]);
- out[1] = be64_to_cpu(be[1]);
- *out_bits = out[0] ? 64 + fls64(out[0]) : fls64(out[1]);
-
- return bytes;
-}
-
-static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed,
- const struct bch_inode_unpacked *inode)
-{
- struct bkey_i_inode_v3 *k = &packed->inode;
- u8 *out = k->v.fields;
- u8 *end = (void *) &packed[1];
- u8 *last_nonzero_field = out;
- unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
- unsigned bytes;
- int ret;
-
- bkey_inode_v3_init(&packed->inode.k_i);
- packed->inode.k.p.offset = inode->bi_inum;
- packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq);
- packed->inode.v.bi_hash_seed = inode->bi_hash_seed;
- packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags);
- packed->inode.v.bi_sectors = cpu_to_le64(inode->bi_sectors);
- packed->inode.v.bi_size = cpu_to_le64(inode->bi_size);
- packed->inode.v.bi_version = cpu_to_le64(inode->bi_version);
- SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode);
- SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR);
-
-#define x(_name, _bits) \
- nr_fields++; \
- \
- if (inode->_name) { \
- ret = bch2_varint_encode_fast(out, inode->_name); \
- out += ret; \
- \
- if (_bits > 64) \
- *out++ = 0; \
- \
- last_nonzero_field = out; \
- last_nonzero_fieldnr = nr_fields; \
- } else { \
- *out++ = 0; \
- \
- if (_bits > 64) \
- *out++ = 0; \
- }
-
- BCH_INODE_FIELDS_v3()
-#undef x
- BUG_ON(out > end);
-
- out = last_nonzero_field;
- nr_fields = last_nonzero_fieldnr;
-
- bytes = out - (u8 *) &packed->inode.v;
- set_bkey_val_bytes(&packed->inode.k, bytes);
- memset_u64s_tail(&packed->inode.v, 0, bytes);
-
- SET_INODEv3_NR_FIELDS(&k->v, nr_fields);
-
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
- struct bch_inode_unpacked unpacked;
-
- ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked);
- BUG_ON(ret);
- BUG_ON(unpacked.bi_inum != inode->bi_inum);
- BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed);
- BUG_ON(unpacked.bi_sectors != inode->bi_sectors);
- BUG_ON(unpacked.bi_size != inode->bi_size);
- BUG_ON(unpacked.bi_version != inode->bi_version);
- BUG_ON(unpacked.bi_mode != inode->bi_mode);
-
-#define x(_name, _bits) if (unpacked._name != inode->_name) \
- panic("unpacked %llu should be %llu", \
- (u64) unpacked._name, (u64) inode->_name);
- BCH_INODE_FIELDS_v3()
-#undef x
- }
-}
-
-void bch2_inode_pack(struct bkey_inode_buf *packed,
- const struct bch_inode_unpacked *inode)
-{
- bch2_inode_pack_inlined(packed, inode);
-}
-
-static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
- struct bch_inode_unpacked *unpacked)
-{
- const u8 *in = inode.v->fields;
- const u8 *end = bkey_val_end(inode);
- u64 field[2];
- unsigned fieldnr = 0, field_bits;
- int ret;
-
-#define x(_name, _bits) \
- if (fieldnr++ == INODEv1_NR_FIELDS(inode.v)) { \
- unsigned offset = offsetof(struct bch_inode_unpacked, _name);\
- memset((void *) unpacked + offset, 0, \
- sizeof(*unpacked) - offset); \
- return 0; \
- } \
- \
- ret = inode_decode_field(in, end, field, &field_bits); \
- if (ret < 0) \
- return ret; \
- \
- if (field_bits > sizeof(unpacked->_name) * 8) \
- return -BCH_ERR_inode_unpack_error; \
- \
- unpacked->_name = field[1]; \
- in += ret;
-
- BCH_INODE_FIELDS_v2()
-#undef x
-
- /* XXX: signal if there were more fields than expected? */
- return 0;
-}
-
-static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked,
- const u8 *in, const u8 *end,
- unsigned nr_fields)
-{
- unsigned fieldnr = 0;
- int ret;
- u64 v[2];
-
-#define x(_name, _bits) \
- if (fieldnr < nr_fields) { \
- ret = bch2_varint_decode_fast(in, end, &v[0]); \
- if (ret < 0) \
- return ret; \
- in += ret; \
- \
- if (_bits > 64) { \
- ret = bch2_varint_decode_fast(in, end, &v[1]); \
- if (ret < 0) \
- return ret; \
- in += ret; \
- } else { \
- v[1] = 0; \
- } \
- } else { \
- v[0] = v[1] = 0; \
- } \
- \
- unpacked->_name = v[0]; \
- if (v[1] || v[0] != unpacked->_name) \
- return -BCH_ERR_inode_unpack_error; \
- fieldnr++;
-
- BCH_INODE_FIELDS_v2()
-#undef x
-
- /* XXX: signal if there were more fields than expected? */
- return 0;
-}
-
-static int bch2_inode_unpack_v3(struct bkey_s_c k,
- struct bch_inode_unpacked *unpacked)
-{
- struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
- const u8 *in = inode.v->fields;
- const u8 *end = bkey_val_end(inode);
- unsigned nr_fields = INODEv3_NR_FIELDS(inode.v);
- unsigned fieldnr = 0;
- int ret;
- u64 v[2];
-
- unpacked->bi_inum = inode.k->p.offset;
- unpacked->bi_snapshot = inode.k->p.snapshot;
- unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
- unpacked->bi_hash_seed = inode.v->bi_hash_seed;
- unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags);
- unpacked->bi_sectors = le64_to_cpu(inode.v->bi_sectors);
- unpacked->bi_size = le64_to_cpu(inode.v->bi_size);
- unpacked->bi_version = le64_to_cpu(inode.v->bi_version);
- unpacked->bi_mode = INODEv3_MODE(inode.v);
-
-#define x(_name, _bits) \
- if (fieldnr < nr_fields) { \
- ret = bch2_varint_decode_fast(in, end, &v[0]); \
- if (ret < 0) \
- return ret; \
- in += ret; \
- \
- if (_bits > 64) { \
- ret = bch2_varint_decode_fast(in, end, &v[1]); \
- if (ret < 0) \
- return ret; \
- in += ret; \
- } else { \
- v[1] = 0; \
- } \
- } else { \
- v[0] = v[1] = 0; \
- } \
- \
- unpacked->_name = v[0]; \
- if (v[1] || v[0] != unpacked->_name) \
- return -BCH_ERR_inode_unpack_error; \
- fieldnr++;
-
- BCH_INODE_FIELDS_v3()
-#undef x
-
- /* XXX: signal if there were more fields than expected? */
- return 0;
-}
-
-static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
- struct bch_inode_unpacked *unpacked)
-{
- memset(unpacked, 0, sizeof(*unpacked));
-
- switch (k.k->type) {
- case KEY_TYPE_inode: {
- struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
-
- unpacked->bi_inum = inode.k->p.offset;
- unpacked->bi_snapshot = inode.k->p.snapshot;
- unpacked->bi_journal_seq= 0;
- unpacked->bi_hash_seed = inode.v->bi_hash_seed;
- unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
- unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
-
- if (INODEv1_NEW_VARINT(inode.v)) {
- return bch2_inode_unpack_v2(unpacked, inode.v->fields,
- bkey_val_end(inode),
- INODEv1_NR_FIELDS(inode.v));
- } else {
- return bch2_inode_unpack_v1(inode, unpacked);
- }
- break;
- }
- case KEY_TYPE_inode_v2: {
- struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
-
- unpacked->bi_inum = inode.k->p.offset;
- unpacked->bi_snapshot = inode.k->p.snapshot;
- unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
- unpacked->bi_hash_seed = inode.v->bi_hash_seed;
- unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags);
- unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
-
- return bch2_inode_unpack_v2(unpacked, inode.v->fields,
- bkey_val_end(inode),
- INODEv2_NR_FIELDS(inode.v));
- }
- default:
- BUG();
- }
-}
-
-int bch2_inode_unpack(struct bkey_s_c k,
- struct bch_inode_unpacked *unpacked)
-{
- return likely(k.k->type == KEY_TYPE_inode_v3)
- ? bch2_inode_unpack_v3(k, unpacked)
- : bch2_inode_unpack_slowpath(k, unpacked);
-}
-
-int __bch2_inode_peek(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_inode_unpacked *inode,
- subvol_inum inum, unsigned flags,
- bool warn)
-{
- u32 snapshot;
- int ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn);
- if (ret)
- return ret;
-
- struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes,
- SPOS(0, inum.inum, snapshot),
- flags|BTREE_ITER_cached);
- ret = bkey_err(k);
- if (ret)
- return ret;
-
- ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
- if (ret)
- goto err;
-
- ret = bch2_inode_unpack(k, inode);
- if (ret)
- goto err;
-
- return 0;
-err:
- if (warn)
- bch_err_msg(trans->c, ret, "looking up inum %llu:%llu:", inum.subvol, inum.inum);
- bch2_trans_iter_exit(trans, iter);
- return ret;
-}
-
-int bch2_inode_find_by_inum_snapshot(struct btree_trans *trans,
- u64 inode_nr, u32 snapshot,
- struct bch_inode_unpacked *inode,
- unsigned flags)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, inode_nr, snapshot), flags);
- int ret = bkey_err(k);
- if (ret)
- goto err;
-
- ret = bkey_is_inode(k.k)
- ? bch2_inode_unpack(k, inode)
- : -BCH_ERR_ENOENT_inode;
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans,
- subvol_inum inum,
- struct bch_inode_unpacked *inode)
-{
- struct btree_iter iter;
- int ret;
-
- ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0);
- if (!ret)
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
- subvol_inum inum,
- struct bch_inode_unpacked *inode)
-{
- struct btree_iter iter;
- int ret;
-
- ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
- if (!ret)
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
- struct bch_inode_unpacked *inode)
-{
- return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode));
-}
-
-int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum,
- struct bch_inode_unpacked *root)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes,
- SPOS(0, inum, U32_MAX),
- BTREE_ITER_all_snapshots, k, ret) {
- if (k.k->p.offset != inum)
- break;
- if (bkey_is_inode(k.k)) {
- ret = bch2_inode_unpack(k, root);
- goto out;
- }
- }
- /* We're only called when we know we have an inode for @inum */
- BUG_ON(!ret);
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_inode_write_flags(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_inode_unpacked *inode,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bkey_inode_buf *inode_p;
-
- inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
- if (IS_ERR(inode_p))
- return PTR_ERR(inode_p);
-
- bch2_inode_pack_inlined(inode_p, inode);
- inode_p->inode.k.p.snapshot = iter->snapshot;
- return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags);
-}
-
-int __bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
-{
- struct bkey_inode_buf *inode_p =
- bch2_trans_kmalloc(trans, sizeof(*inode_p));
-
- if (IS_ERR(inode_p))
- return PTR_ERR(inode_p);
-
- bch2_inode_pack(inode_p, inode);
- inode_p->inode.k.p.snapshot = inode->bi_snapshot;
-
- return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
- &inode_p->inode.k_i,
- BTREE_UPDATE_internal_snapshot_node);
-}
-
-int bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
-{
- int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- __bch2_fsck_write_inode(trans, inode));
- bch_err_fn(trans->c, ret);
- return ret;
-}
-
-struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
-{
- struct bch_inode_unpacked u;
- struct bkey_inode_buf *inode_p;
- int ret;
-
- if (!bkey_is_inode(&k->k))
- return ERR_PTR(-ENOENT);
-
- inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
- if (IS_ERR(inode_p))
- return ERR_CAST(inode_p);
-
- ret = bch2_inode_unpack(bkey_i_to_s_c(k), &u);
- if (ret)
- return ERR_PTR(ret);
-
- bch2_inode_pack(inode_p, &u);
- return &inode_p->inode.k_i;
-}
-
-static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bch_inode_unpacked unpacked;
- int ret = 0;
-
- bkey_fsck_err_on(k.k->p.inode,
- c, inode_pos_inode_nonzero,
- "nonzero k.p.inode");
-
- bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX,
- c, inode_pos_blockdev_range,
- "fs inode in blockdev range");
-
- bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked),
- c, inode_unpack_error,
- "invalid variable length fields");
-
- bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1,
- c, inode_checksum_type_invalid,
-			 "invalid data checksum type (%u >= %u)",
- unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1);
-
- bkey_fsck_err_on(unpacked.bi_compression &&
- !bch2_compression_opt_valid(unpacked.bi_compression - 1),
- c, inode_compression_type_invalid,
- "invalid compression opt %u", unpacked.bi_compression - 1);
-
- bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) &&
- unpacked.bi_nlink != 0,
- c, inode_unlinked_but_nlink_nonzero,
- "flagged as unlinked but bi_nlink != 0");
-
- bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode),
- c, inode_subvol_root_but_not_dir,
- "subvolume root but not a directory");
-fsck_err:
- return ret;
-}
-
-int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
- int ret = 0;
-
- bkey_fsck_err_on(INODEv1_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
- c, inode_str_hash_invalid,
- "invalid str hash type (%llu >= %u)",
- INODEv1_STR_HASH(inode.v), BCH_STR_HASH_NR);
-
- ret = __bch2_inode_validate(c, k, from);
-fsck_err:
- return ret;
-}
-
-int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
- int ret = 0;
-
- bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
- c, inode_str_hash_invalid,
- "invalid str hash type (%llu >= %u)",
- INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR);
-
- ret = __bch2_inode_validate(c, k, from);
-fsck_err:
- return ret;
-}
-
-int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
- int ret = 0;
-
- bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL ||
- INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k),
- c, inode_v3_fields_start_bad,
- "invalid fields_start (got %llu, min %u max %zu)",
- INODEv3_FIELDS_START(inode.v),
- INODEv3_FIELDS_START_INITIAL,
- bkey_val_u64s(inode.k));
-
- bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
- c, inode_str_hash_invalid,
- "invalid str hash type (%llu >= %u)",
- INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR);
-
- ret = __bch2_inode_validate(c, k, from);
-fsck_err:
- return ret;
-}
-
-static void __bch2_inode_unpacked_to_text(struct printbuf *out,
- struct bch_inode_unpacked *inode)
-{
- prt_printf(out, "\n");
- printbuf_indent_add(out, 2);
- prt_printf(out, "mode=%o\n", inode->bi_mode);
-
- prt_str(out, "flags=");
- prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
- prt_printf(out, "(%x)\n", inode->bi_flags);
-
- prt_printf(out, "journal_seq=%llu\n", inode->bi_journal_seq);
- prt_printf(out, "hash_seed=%llx\n", inode->bi_hash_seed);
- prt_printf(out, "hash_type=");
- bch2_prt_str_hash_type(out, INODE_STR_HASH(inode));
- prt_newline(out);
- prt_printf(out, "bi_size=%llu\n", inode->bi_size);
- prt_printf(out, "bi_sectors=%llu\n", inode->bi_sectors);
- prt_printf(out, "bi_version=%llu\n", inode->bi_version);
-
-#define x(_name, _bits) \
- prt_printf(out, #_name "=%llu\n", (u64) inode->_name);
- BCH_INODE_FIELDS_v3()
-#undef x
-
- bch2_printbuf_strip_trailing_newline(out);
- printbuf_indent_sub(out, 2);
-}
-
-void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
-{
- prt_printf(out, "inum: %llu:%u ", inode->bi_inum, inode->bi_snapshot);
- __bch2_inode_unpacked_to_text(out, inode);
-}
-
-void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
- struct bch_inode_unpacked inode;
-
- if (bch2_inode_unpack(k, &inode)) {
- prt_printf(out, "(unpack error)");
- return;
- }
-
- __bch2_inode_unpacked_to_text(out, &inode);
-}
-
-static inline u64 bkey_inode_flags(struct bkey_s_c k)
-{
- switch (k.k->type) {
- case KEY_TYPE_inode:
- return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
- case KEY_TYPE_inode_v2:
- return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
- case KEY_TYPE_inode_v3:
- return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
- default:
- return 0;
- }
-}
-
-static inline void bkey_inode_flags_set(struct bkey_s k, u64 f)
-{
- switch (k.k->type) {
- case KEY_TYPE_inode:
- bkey_s_to_inode(k).v->bi_flags = cpu_to_le32(f);
- return;
- case KEY_TYPE_inode_v2:
- bkey_s_to_inode_v2(k).v->bi_flags = cpu_to_le64(f);
- return;
- case KEY_TYPE_inode_v3:
- bkey_s_to_inode_v3(k).v->bi_flags = cpu_to_le64(f);
- return;
- default:
- BUG();
- }
-}
-
-static inline bool bkey_is_unlinked_inode(struct bkey_s_c k)
-{
-	unsigned f = bkey_inode_flags(k);
-
- return (f & BCH_INODE_unlinked) && !(f & BCH_INODE_has_child_snapshot);
-}
-
-static struct bkey_s_c
-bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter,
- enum btree_id btree, struct bpos pos,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct bkey_s_c k;
- int ret = 0;
-
- for_each_btree_key_max_norestart(trans, *iter, btree,
- bpos_successor(pos),
- SPOS(pos.inode, pos.offset, U32_MAX),
- flags|BTREE_ITER_all_snapshots, k, ret)
- if (bch2_snapshot_is_ancestor(c, pos.snapshot, k.k->p.snapshot))
- return k;
-
- bch2_trans_iter_exit(trans, iter);
- return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
-}
-
-static struct bkey_s_c
-bch2_inode_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos pos, unsigned flags)
-{
- struct bkey_s_c k;
-again:
- k = bch2_bkey_get_iter_snapshot_parent(trans, iter, BTREE_ID_inodes, pos, flags);
- if (!k.k ||
- bkey_err(k) ||
- bkey_is_inode(k.k))
- return k;
-
- bch2_trans_iter_exit(trans, iter);
- pos = k.k->p;
- goto again;
-}
-
-int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- for_each_btree_key_max_norestart(trans, iter,
- BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos),
- BTREE_ITER_all_snapshots|
- BTREE_ITER_with_updates, k, ret)
- if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) &&
- bkey_is_inode(k.k)) {
- ret = 1;
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int update_inode_has_children(struct btree_trans *trans,
- struct bkey_s k,
- bool have_child)
-{
- if (!have_child) {
- int ret = bch2_inode_has_child_snapshots(trans, k.k->p);
- if (ret)
- return ret < 0 ? ret : 0;
- }
-
- u64 f = bkey_inode_flags(k.s_c);
- if (have_child != !!(f & BCH_INODE_has_child_snapshot))
- bkey_inode_flags_set(k, f ^ BCH_INODE_has_child_snapshot);
-
- return 0;
-}
-
-static int update_parent_inode_has_children(struct btree_trans *trans, struct bpos pos,
- bool have_child)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_inode_get_iter_snapshot_parent(trans,
- &iter, pos, BTREE_ITER_with_updates);
- int ret = bkey_err(k);
- if (ret)
- return ret;
- if (!k.k)
- return 0;
-
- if (!have_child) {
- ret = bch2_inode_has_child_snapshots(trans, k.k->p);
- if (ret) {
- ret = ret < 0 ? ret : 0;
- goto err;
- }
- }
-
- u64 f = bkey_inode_flags(k);
- if (have_child != !!(f & BCH_INODE_has_child_snapshot)) {
- struct bkey_i *update = bch2_bkey_make_mut(trans, &iter, &k,
- BTREE_UPDATE_internal_snapshot_node);
- ret = PTR_ERR_OR_ZERO(update);
- if (ret)
- goto err;
-
- bkey_inode_flags_set(bkey_i_to_s(update), f ^ BCH_INODE_has_child_snapshot);
- }
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_trigger_inode(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old,
- struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
-
- if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
- BUG_ON(!trans->journal_res.seq);
- bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
- }
-
- s64 nr[1] = { bkey_is_inode(new.k) - bkey_is_inode(old.k) };
- if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr[0]) {
- int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, nr, nr_inodes);
- if (ret)
- return ret;
- }
-
- if (flags & BTREE_TRIGGER_transactional) {
- int unlinked_delta = (int) bkey_is_unlinked_inode(new.s_c) -
- (int) bkey_is_unlinked_inode(old);
- if (unlinked_delta) {
- int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes,
- new.k->p, unlinked_delta > 0);
- if (ret)
- return ret;
- }
-
- /*
- * If we're creating or deleting an inode at this snapshot ID,
- * and there might be an inode in a parent snapshot ID, we might
- * need to set or clear the has_child_snapshot flag on the
- * parent.
- */
- int deleted_delta = (int) bkey_is_inode(new.k) -
- (int) bkey_is_inode(old.k);
- if (deleted_delta &&
- bch2_snapshot_parent(c, new.k->p.snapshot)) {
- int ret = update_parent_inode_has_children(trans, new.k->p,
- deleted_delta > 0);
- if (ret)
- return ret;
- }
-
- /*
- * When an inode is first updated in a new snapshot, we may need
- * to clear has_child_snapshot
- */
- if (deleted_delta > 0) {
- int ret = update_inode_has_children(trans, new, false);
- if (ret)
- return ret;
- }
- }
-
- return 0;
-}
-
-int bch2_inode_generation_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(k.k->p.inode,
- c, inode_pos_inode_nonzero,
- "nonzero k.p.inode");
-fsck_err:
- return ret;
-}
-
-void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k);
-
- prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation));
-}
-
-int bch2_inode_alloc_cursor_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(k.k->p.inode != LOGGED_OPS_INUM_inode_cursors,
- c, inode_alloc_cursor_inode_bad,
- "k.p.inode bad");
-fsck_err:
- return ret;
-}
-
-void bch2_inode_alloc_cursor_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_inode_alloc_cursor i = bkey_s_c_to_inode_alloc_cursor(k);
-
- prt_printf(out, "idx %llu generation %llu",
- le64_to_cpu(i.v->idx),
- le64_to_cpu(i.v->gen));
-}
-
-void bch2_inode_init_early(struct bch_fs *c,
- struct bch_inode_unpacked *inode_u)
-{
- enum bch_str_hash_type str_hash =
- bch2_str_hash_opt_to_type(c, c->opts.str_hash);
-
- memset(inode_u, 0, sizeof(*inode_u));
-
- SET_INODE_STR_HASH(inode_u, str_hash);
- get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed));
-}
-
-void bch2_inode_init_late(struct bch_fs *c,
- struct bch_inode_unpacked *inode_u, u64 now,
- uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
- struct bch_inode_unpacked *parent)
-{
- inode_u->bi_mode = mode;
- inode_u->bi_uid = uid;
- inode_u->bi_gid = gid;
- inode_u->bi_dev = rdev;
- inode_u->bi_atime = now;
- inode_u->bi_mtime = now;
- inode_u->bi_ctime = now;
- inode_u->bi_otime = now;
-
- if (parent && parent->bi_mode & S_ISGID) {
- inode_u->bi_gid = parent->bi_gid;
- if (S_ISDIR(mode))
- inode_u->bi_mode |= S_ISGID;
- }
-
- if (parent) {
-#define x(_name, ...) inode_u->bi_##_name = parent->bi_##_name;
- BCH_INODE_OPTS()
-#undef x
- }
-
- if (!S_ISDIR(mode))
- inode_u->bi_casefold = 0;
-
- if (bch2_inode_casefold(c, inode_u))
- inode_u->bi_flags |= BCH_INODE_has_case_insensitive;
-}
-
-void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
- uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
- struct bch_inode_unpacked *parent)
-{
- bch2_inode_init_early(c, inode_u);
- bch2_inode_init_late(c, inode_u, bch2_current_time(c),
- uid, gid, mode, rdev, parent);
-}
-
-static struct bkey_i_inode_alloc_cursor *
-bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max)
-{
- struct bch_fs *c = trans->c;
-
- u64 cursor_idx = c->opts.inodes_32bit ? 0 : cpu + 1;
-
- cursor_idx &= ~(~0ULL << c->opts.shard_inode_numbers_bits);
-
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
- BTREE_ID_logged_ops,
- POS(LOGGED_OPS_INUM_inode_cursors, cursor_idx),
- BTREE_ITER_cached);
- int ret = bkey_err(k);
- if (ret)
- return ERR_PTR(ret);
-
- struct bkey_i_inode_alloc_cursor *cursor =
- k.k->type == KEY_TYPE_inode_alloc_cursor
- ? bch2_bkey_make_mut_typed(trans, &iter, &k, 0, inode_alloc_cursor)
- : bch2_bkey_alloc(trans, &iter, 0, inode_alloc_cursor);
- ret = PTR_ERR_OR_ZERO(cursor);
- if (ret)
- goto err;
-
- if (c->opts.inodes_32bit) {
- *min = BLOCKDEV_INODE_MAX;
- *max = INT_MAX;
- } else {
- cursor->v.bits = c->opts.shard_inode_numbers_bits;
-
- unsigned bits = 63 - c->opts.shard_inode_numbers_bits;
-
- *min = max(cpu << bits, (u64) INT_MAX + 1);
- *max = (cpu << bits) | ~(ULLONG_MAX << bits);
- }
-
- if (le64_to_cpu(cursor->v.idx) < *min)
- cursor->v.idx = cpu_to_le64(*min);
-
- if (le64_to_cpu(cursor->v.idx) >= *max) {
- cursor->v.idx = cpu_to_le64(*min);
- le32_add_cpu(&cursor->v.gen, 1);
- }
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret ? ERR_PTR(ret) : cursor;
-}
-
-/*
- * This just finds an empty slot:
- */
-int bch2_inode_create(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_inode_unpacked *inode_u,
- u32 snapshot, u64 cpu)
-{
- u64 min, max;
- struct bkey_i_inode_alloc_cursor *cursor =
- bch2_inode_alloc_cursor_get(trans, cpu, &min, &max);
- int ret = PTR_ERR_OR_ZERO(cursor);
- if (ret)
- return ret;
-
- u64 start = le64_to_cpu(cursor->v.idx);
- u64 pos = start;
-
- bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos),
- BTREE_ITER_all_snapshots|
- BTREE_ITER_intent);
- struct bkey_s_c k;
-again:
- while ((k = bch2_btree_iter_peek(trans, iter)).k &&
- !(ret = bkey_err(k)) &&
- bkey_lt(k.k->p, POS(0, max))) {
- if (pos < iter->pos.offset)
- goto found_slot;
-
- /*
- * We don't need to iterate over keys in every snapshot once
- * we've found just one:
- */
- pos = iter->pos.offset + 1;
- bch2_btree_iter_set_pos(trans, iter, POS(0, pos));
- }
-
- if (!ret && pos < max)
- goto found_slot;
-
- if (!ret && start == min)
- ret = bch_err_throw(trans->c, ENOSPC_inode_create);
-
- if (ret) {
- bch2_trans_iter_exit(trans, iter);
- return ret;
- }
-
- /* Retry from start */
- pos = start = min;
- bch2_btree_iter_set_pos(trans, iter, POS(0, pos));
- le32_add_cpu(&cursor->v.gen, 1);
- goto again;
-found_slot:
- bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, snapshot));
- k = bch2_btree_iter_peek_slot(trans, iter);
- ret = bkey_err(k);
- if (ret) {
- bch2_trans_iter_exit(trans, iter);
- return ret;
- }
-
- inode_u->bi_inum = k.k->p.offset;
- inode_u->bi_generation = le64_to_cpu(cursor->v.gen);
- cursor->v.idx = cpu_to_le64(k.k->p.offset + 1);
- return 0;
-}
-
-static int bch2_inode_delete_keys(struct btree_trans *trans,
- subvol_inum inum, enum btree_id id)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_i delete;
- struct bpos end = POS(inum.inum, U64_MAX);
- u32 snapshot;
- int ret = 0;
-
- /*
- * We're never going to be deleting partial extents, no need to use an
- * extent iterator:
- */
- bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
- BTREE_ITER_intent);
-
- while (1) {
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
-
- k = bch2_btree_iter_peek_max(trans, &iter, end);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (!k.k)
- break;
-
- bkey_init(&delete.k);
- delete.k.p = iter.pos;
-
- if (iter.flags & BTREE_ITER_is_extents)
- bch2_key_resize(&delete.k,
- bpos_min(end, k.k->p).offset -
- iter.pos.offset);
-
- ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
-err:
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- break;
- }
-
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = {};
- struct bkey_s_c k;
- u32 snapshot;
- int ret;
-
- ret = lockrestart_do(trans, may_delete_deleted_inum(trans, inum));
- if (ret)
- goto err2;
-
- /*
- * If this was a directory, there shouldn't be any real dirents left -
- * but there could be whiteouts (from hash collisions) that we should
- * delete:
- *
- * XXX: the dirent code ideally would delete whiteouts when they're no
- * longer needed
- */
- ret = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?:
- bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?:
- bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents);
- if (ret)
- goto err2;
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, inum.inum, snapshot),
- BTREE_ITER_intent|BTREE_ITER_cached);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (!bkey_is_inode(k.k)) {
- bch2_fs_inconsistent(c,
- "inode %llu:%u not found when deleting",
- inum.inum, snapshot);
- ret = bch_err_throw(c, ENOENT_inode);
- goto err;
- }
-
- ret = bch2_btree_delete_at(trans, &iter, 0) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
-err:
- bch2_trans_iter_exit(trans, &iter);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- if (ret)
- goto err2;
-
- ret = delete_ancestor_snapshot_inodes(trans, SPOS(0, inum.inum, snapshot));
-err2:
- bch2_trans_put(trans);
- return ret;
-}
-
-int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
-{
- if (bi->bi_flags & BCH_INODE_unlinked)
- bi->bi_flags &= ~BCH_INODE_unlinked;
- else {
- if (bi->bi_nlink == U32_MAX)
- return -EINVAL;
-
- bi->bi_nlink++;
- }
-
- return 0;
-}
-
-void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi)
-{
- if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_unlinked)) {
- bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero",
- bi->bi_inum);
- return;
- }
-
- if (bi->bi_flags & BCH_INODE_unlinked) {
- bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum);
- return;
- }
-
- if (bi->bi_nlink)
- bi->bi_nlink--;
- else
- bi->bi_flags |= BCH_INODE_unlinked;
-}
-
-struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode)
-{
- struct bch_opts ret = { 0 };
-#define x(_name, _bits) \
- if (inode->bi_##_name) \
- opt_set(ret, _name, inode->bi_##_name - 1);
- BCH_INODE_OPTS()
-#undef x
- return ret;
-}
-
-void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
- struct bch_inode_unpacked *inode)
-{
-#define x(_name, _bits) \
- if ((inode)->bi_##_name) { \
- opts->_name = inode->bi_##_name - 1; \
- opts->_name##_from_inode = true; \
- } else { \
- opts->_name = c->opts._name; \
- opts->_name##_from_inode = false; \
- }
- BCH_INODE_OPTS()
-#undef x
-
- bch2_io_opts_fixups(opts);
-}
-
-int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts)
-{
- struct bch_inode_unpacked inode;
- int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode));
-
- if (ret)
- return ret;
-
- bch2_inode_opts_get(opts, trans->c, &inode);
- return 0;
-}
-
-int bch2_inode_set_casefold(struct btree_trans *trans, subvol_inum inum,
- struct bch_inode_unpacked *bi, unsigned v)
-{
- struct bch_fs *c = trans->c;
-
-#ifndef CONFIG_UNICODE
- bch_err(c, "Cannot use casefolding on a kernel without CONFIG_UNICODE");
- return -EOPNOTSUPP;
-#endif
-
- if (c->opts.casefold_disabled)
- return -EOPNOTSUPP;
-
- int ret = 0;
- /* Not supported on individual files. */
- if (!S_ISDIR(bi->bi_mode))
- return -EOPNOTSUPP;
-
- /*
- * Make sure the dir is empty, as otherwise we'd need to
- * rehash everything and update the dirent keys.
- */
- ret = bch2_empty_dir_trans(trans, inum);
- if (ret < 0)
- return ret;
-
- ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_casefolding);
- if (ret)
- return ret;
-
- bch2_check_set_feature(c, BCH_FEATURE_casefolding);
-
- bi->bi_casefold = v + 1;
- bi->bi_fields_set |= BIT(Inode_opt_casefold);
-
- return bch2_maybe_propagate_has_case_insensitive(trans, inum, bi);
-}
-
-static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter = {};
- struct bkey_i_inode_generation delete;
- struct bch_inode_unpacked inode_u;
- struct bkey_s_c k;
- int ret;
-
- do {
- ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
- SPOS(inum, 0, snapshot),
- SPOS(inum, U64_MAX, snapshot),
- 0, NULL) ?:
- bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
- SPOS(inum, 0, snapshot),
- SPOS(inum, U64_MAX, snapshot),
- 0, NULL) ?:
- bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
- SPOS(inum, 0, snapshot),
- SPOS(inum, U64_MAX, snapshot),
- 0, NULL);
- } while (ret == -BCH_ERR_transaction_restart_nested);
- if (ret)
- goto err;
-retry:
- bch2_trans_begin(trans);
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, inum, snapshot), BTREE_ITER_intent);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (!bkey_is_inode(k.k)) {
- bch2_fs_inconsistent(c,
- "inode %llu:%u not found when deleting",
- inum, snapshot);
- ret = bch_err_throw(c, ENOENT_inode);
- goto err;
- }
-
- bch2_inode_unpack(k, &inode_u);
-
- /* Subvolume root? */
- if (inode_u.bi_subvol)
- bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum);
-
- bkey_inode_generation_init(&delete.k_i);
- delete.k.p = iter.pos;
- delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
-
- ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
-err:
- bch2_trans_iter_exit(trans, &iter);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- return ret ?: -BCH_ERR_transaction_restart_nested;
-}
-
-/*
- * After deleting an inode, there may be versions in older snapshots that should
- * also be deleted - if they're not referenced by sibling snapshots and not open
- * in other subvolumes:
- */
-static int delete_ancestor_snapshot_inodes(struct btree_trans *trans, struct bpos pos)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-next_parent:
- ret = lockrestart_do(trans,
- bkey_err(k = bch2_inode_get_iter_snapshot_parent(trans, &iter, pos, 0)));
- if (ret || !k.k)
- return ret;
-
- bool unlinked = bkey_is_unlinked_inode(k);
- pos = k.k->p;
- bch2_trans_iter_exit(trans, &iter);
-
- if (!unlinked)
- return 0;
-
- ret = lockrestart_do(trans, bch2_inode_or_descendents_is_open(trans, pos));
- if (ret)
- return ret < 0 ? ret : 0;
-
- ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot);
- if (ret)
- return ret;
- goto next_parent;
-}
-
-int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
-{
- return __bch2_inode_rm_snapshot(trans, inum, snapshot) ?:
- delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot));
-}
-
-static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos,
- bool from_deleted_inodes)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter inode_iter;
- struct bkey_s_c k;
- struct bch_inode_unpacked inode;
- struct printbuf buf = PRINTBUF;
- int ret;
-
- k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached);
- ret = bkey_err(k);
- if (ret)
- return ret;
-
- ret = bkey_is_inode(k.k) ? 0 : bch_err_throw(c, ENOENT_inode);
- if (fsck_err_on(from_deleted_inodes && ret,
- trans, deleted_inode_missing,
- "nonexistent inode %llu:%u in deleted_inodes btree",
- pos.offset, pos.snapshot))
- goto delete;
- if (ret)
- goto out;
-
- ret = bch2_inode_unpack(k, &inode);
- if (ret)
- goto out;
-
- if (S_ISDIR(inode.bi_mode)) {
- ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot);
- if (fsck_err_on(from_deleted_inodes &&
- bch2_err_matches(ret, ENOTEMPTY),
- trans, deleted_inode_is_dir,
- "non empty directory %llu:%u in deleted_inodes btree",
- pos.offset, pos.snapshot))
- goto delete;
- if (ret)
- goto out;
- }
-
- ret = inode.bi_flags & BCH_INODE_unlinked ? 0 : bch_err_throw(c, inode_not_unlinked);
- if (fsck_err_on(from_deleted_inodes && ret,
- trans, deleted_inode_not_unlinked,
- "non-deleted inode %llu:%u in deleted_inodes btree",
- pos.offset, pos.snapshot))
- goto delete;
- if (ret)
- goto out;
-
- ret = !(inode.bi_flags & BCH_INODE_has_child_snapshot)
- ? 0 : bch_err_throw(c, inode_has_child_snapshot);
-
- if (fsck_err_on(from_deleted_inodes && ret,
- trans, deleted_inode_has_child_snapshots,
- "inode with child snapshots %llu:%u in deleted_inodes btree",
- pos.offset, pos.snapshot))
- goto delete;
- if (ret)
- goto out;
-
- ret = bch2_inode_has_child_snapshots(trans, k.k->p);
- if (ret < 0)
- goto out;
-
- if (ret) {
- if (fsck_err(trans, inode_has_child_snapshots_wrong,
- "inode has_child_snapshots flag wrong (should be set)\n%s",
- (printbuf_reset(&buf),
- bch2_inode_unpacked_to_text(&buf, &inode),
- buf.buf))) {
- inode.bi_flags |= BCH_INODE_has_child_snapshot;
- ret = __bch2_fsck_write_inode(trans, &inode);
- if (ret)
- goto out;
- }
-
- if (!from_deleted_inodes) {
- ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
- bch_err_throw(c, inode_has_child_snapshot);
- goto out;
- }
-
- goto delete;
-
- }
-
- if (from_deleted_inodes) {
- if (test_bit(BCH_FS_clean_recovery, &c->flags) &&
- !fsck_err(trans, deleted_inode_but_clean,
- "filesystem marked as clean but have deleted inode %llu:%u",
- pos.offset, pos.snapshot)) {
- ret = 0;
- goto out;
- }
-
- ret = 1;
- }
-out:
-fsck_err:
- bch2_trans_iter_exit(trans, &inode_iter);
- printbuf_exit(&buf);
- return ret;
-delete:
- ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false);
- goto out;
-}
-
-static int may_delete_deleted_inum(struct btree_trans *trans, subvol_inum inum)
-{
- u32 snapshot;
-
- return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
- may_delete_deleted_inode(trans, SPOS(0, inum.inum, snapshot), false);
-}
-
-int bch2_delete_dead_inodes(struct bch_fs *c)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- int ret;
-
- /*
- * if we ran check_inodes() unlinked inodes will have already been
- * cleaned up but the write buffer will be out of sync; therefore we
-	 * always need a write buffer flush
- */
- ret = bch2_btree_write_buffer_flush_sync(trans);
- if (ret)
- goto err;
-
- /*
- * Weird transaction restart handling here because on successful delete,
- * bch2_inode_rm_snapshot() will return a nested transaction restart,
- * but we can't retry because the btree write buffer won't have been
- * flushed and we'd spin:
- */
- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
- ret = may_delete_deleted_inode(trans, k.k->p, true);
- if (ret > 0) {
- bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u",
- k.k->p.offset, k.k->p.snapshot);
-
- ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
- /*
- * We don't want to loop here: a transaction restart
- * error here means we handled a transaction restart and
- * we're actually done, but if we loop we'll retry the
- * same key because the write buffer hasn't been flushed
- * yet
- */
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- ret = 0;
- continue;
- }
- }
-
- ret;
- }));
-err:
- bch2_trans_put(trans);
- bch_err_fn(c, ret);
- return ret;
-}
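For reference, bch2_inode_alloc_cursor_get() above shards the inode-number space per CPU when 32-bit inode numbers are disabled: each cursor owns a contiguous range derived from shard_inode_numbers_bits. Below is a standalone sketch of that range arithmetic; shard_range() and its example values are illustrative stand-ins, and nothing here calls into bcachefs.

#include <stdint.h>
#include <stdio.h>

/*
 * Standalone illustration of the shard range computation used by the
 * inode allocation cursor: 'shard_bits' stands in for the
 * shard_inode_numbers_bits option; values are examples only.
 */
static void shard_range(uint64_t cpu, unsigned shard_bits,
			uint64_t *min, uint64_t *max)
{
	unsigned bits = 63 - shard_bits;

	*min = cpu << bits;
	if (*min < (uint64_t) INT32_MAX + 1)	/* stay above the 32-bit inode range */
		*min = (uint64_t) INT32_MAX + 1;
	*max = (cpu << bits) | ~(~0ULL << bits);
}

int main(void)
{
	uint64_t min, max;

	shard_range(3, 4, &min, &max);	/* CPU 3, 4 shard bits: example values */
	printf("range: %llu..%llu\n",
	       (unsigned long long) min, (unsigned long long) max);
	return 0;
}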
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
deleted file mode 100644
index b8ec3e628d90..000000000000
--- a/fs/bcachefs/inode.h
+++ /dev/null
@@ -1,319 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_INODE_H
-#define _BCACHEFS_INODE_H
-
-#include "bkey.h"
-#include "bkey_methods.h"
-#include "opts.h"
-#include "snapshot.h"
-
-extern const char * const bch2_inode_opts[];
-
-int bch2_inode_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-int bch2_inode_v2_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-int __bch2_inode_has_child_snapshots(struct btree_trans *, struct bpos);
-
-static inline int bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos)
-{
- return bch2_snapshot_is_leaf(trans->c, pos.snapshot) <= 0
- ? __bch2_inode_has_child_snapshots(trans, pos)
- : 0;
-}
-
-int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_inode ((struct bkey_ops) { \
- .key_validate = bch2_inode_validate, \
- .val_to_text = bch2_inode_to_text, \
- .trigger = bch2_trigger_inode, \
- .min_val_size = 16, \
-})
-
-#define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \
- .key_validate = bch2_inode_v2_validate, \
- .val_to_text = bch2_inode_to_text, \
- .trigger = bch2_trigger_inode, \
- .min_val_size = 32, \
-})
-
-#define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \
- .key_validate = bch2_inode_v3_validate, \
- .val_to_text = bch2_inode_to_text, \
- .trigger = bch2_trigger_inode, \
- .min_val_size = 48, \
-})
-
-static inline bool bkey_is_inode(const struct bkey *k)
-{
- return k->type == KEY_TYPE_inode ||
- k->type == KEY_TYPE_inode_v2 ||
- k->type == KEY_TYPE_inode_v3;
-}
-
-int bch2_inode_generation_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \
- .key_validate = bch2_inode_generation_validate, \
- .val_to_text = bch2_inode_generation_to_text, \
- .min_val_size = 8, \
-})
-
-int bch2_inode_alloc_cursor_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_inode_alloc_cursor_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_inode_alloc_cursor ((struct bkey_ops) { \
- .key_validate = bch2_inode_alloc_cursor_validate, \
- .val_to_text = bch2_inode_alloc_cursor_to_text, \
- .min_val_size = 16, \
-})
-
-#if 0
-typedef struct {
- u64 lo;
- u32 hi;
-} __packed __aligned(4) u96;
-#endif
-typedef u64 u96;
-
-struct bch_inode_unpacked {
- u64 bi_inum;
- u32 bi_snapshot;
- u64 bi_journal_seq;
- __le64 bi_hash_seed;
- u64 bi_size;
- u64 bi_sectors;
- u64 bi_version;
- u32 bi_flags;
- u16 bi_mode;
-
-#define x(_name, _bits) u##_bits _name;
- BCH_INODE_FIELDS_v3()
-#undef x
-};
-BITMASK(INODE_STR_HASH, struct bch_inode_unpacked, bi_flags, 20, 24);
-
-struct bkey_inode_buf {
- struct bkey_i_inode_v3 inode;
-
-#define x(_name, _bits) + 8 + _bits / 8
- u8 _pad[0 + BCH_INODE_FIELDS_v3()];
-#undef x
-};
-
-void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
-int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *);
-struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *);
-
-void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
-
-int __bch2_inode_peek(struct btree_trans *, struct btree_iter *,
- struct bch_inode_unpacked *, subvol_inum, unsigned, bool);
-
-static inline int bch2_inode_peek_nowarn(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_inode_unpacked *inode,
- subvol_inum inum, unsigned flags)
-{
- return __bch2_inode_peek(trans, iter, inode, inum, flags, false);
-}
-
-static inline int bch2_inode_peek(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_inode_unpacked *inode,
- subvol_inum inum, unsigned flags)
-{
- return __bch2_inode_peek(trans, iter, inode, inum, flags, true);
-}
-
-int bch2_inode_find_by_inum_snapshot(struct btree_trans *, u64, u32,
- struct bch_inode_unpacked *, unsigned);
-int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *,
- subvol_inum,
- struct bch_inode_unpacked *);
-int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum,
- struct bch_inode_unpacked *);
-int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum,
- struct bch_inode_unpacked *);
-
-int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum,
- struct bch_inode_unpacked *root);
-
-int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *,
- struct bch_inode_unpacked *, enum btree_iter_update_trigger_flags);
-
-static inline int bch2_inode_write(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_inode_unpacked *inode)
-{
- return bch2_inode_write_flags(trans, iter, inode, 0);
-}
-
-int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *);
-int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *);
-
-void bch2_inode_init_early(struct bch_fs *,
- struct bch_inode_unpacked *);
-void bch2_inode_init_late(struct bch_fs *, struct bch_inode_unpacked *, u64,
- uid_t, gid_t, umode_t, dev_t,
- struct bch_inode_unpacked *);
-void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
- uid_t, gid_t, umode_t, dev_t,
- struct bch_inode_unpacked *);
-
-int bch2_inode_create(struct btree_trans *, struct btree_iter *,
- struct bch_inode_unpacked *, u32, u64);
-
-int bch2_inode_rm(struct bch_fs *, subvol_inum);
-
-#define inode_opt_get(_c, _inode, _name) \
- ((_inode)->bi_##_name ? (_inode)->bi_##_name - 1 : (_c)->opts._name)
-
-static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode,
- enum inode_opt_id id, u64 v)
-{
- switch (id) {
-#define x(_name, ...) \
- case Inode_opt_##_name: \
- inode->bi_##_name = v; \
- break;
- BCH_INODE_OPTS()
-#undef x
- default:
- BUG();
- }
-}
-
-static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode,
- enum inode_opt_id id)
-{
- switch (id) {
-#define x(_name, ...) \
- case Inode_opt_##_name: \
- return inode->bi_##_name;
- BCH_INODE_OPTS()
-#undef x
- default:
- BUG();
- }
-}
-
-static inline u8 mode_to_type(umode_t mode)
-{
- return (mode >> 12) & 15;
-}
-
-static inline u8 inode_d_type(struct bch_inode_unpacked *inode)
-{
- return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode);
-}
-
-static inline u32 bch2_inode_flags(struct bkey_s_c k)
-{
- switch (k.k->type) {
- case KEY_TYPE_inode:
- return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
- case KEY_TYPE_inode_v2:
- return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
- case KEY_TYPE_inode_v3:
- return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
- default:
- return 0;
- }
-}
-
-static inline unsigned bkey_inode_mode(struct bkey_s_c k)
-{
- switch (k.k->type) {
- case KEY_TYPE_inode:
- return le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode);
- case KEY_TYPE_inode_v2:
- return le16_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_mode);
- case KEY_TYPE_inode_v3:
- return INODEv3_MODE(bkey_s_c_to_inode_v3(k).v);
- default:
- return 0;
- }
-}
-
-static inline bool bch2_inode_casefold(struct bch_fs *c, const struct bch_inode_unpacked *bi)
-{
- /* inode opts are stored with a +1 bias: 0 means "unset, use fs opt" */
- return bi->bi_casefold
- ? bi->bi_casefold - 1
- : c->opts.casefold;
-}
-
-static inline bool bch2_inode_has_backpointer(const struct bch_inode_unpacked *bi)
-{
- return bi->bi_dir || bi->bi_dir_offset;
-}
-
-/* i_nlink: */
-
-static inline unsigned nlink_bias(umode_t mode)
-{
- return S_ISDIR(mode) ? 2 : 1;
-}
-
-static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi)
-{
- return bi->bi_flags & BCH_INODE_unlinked
- ? 0
- : bi->bi_nlink + nlink_bias(bi->bi_mode);
-}
-
-static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
- unsigned nlink)
-{
- if (nlink) {
- bi->bi_nlink = nlink - nlink_bias(bi->bi_mode);
- bi->bi_flags &= ~BCH_INODE_unlinked;
- } else {
- bi->bi_nlink = 0;
- bi->bi_flags |= BCH_INODE_unlinked;
- }
-}
-
-int bch2_inode_nlink_inc(struct bch_inode_unpacked *);
-void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
-
-struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
-void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
- struct bch_inode_unpacked *);
-int bch2_inum_opts_get(struct btree_trans *, subvol_inum, struct bch_io_opts *);
-int bch2_inode_set_casefold(struct btree_trans *, subvol_inum,
- struct bch_inode_unpacked *, unsigned);
-
-#include "rebalance.h"
-
-static inline struct bch_extent_rebalance
-bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode)
-{
- struct bch_io_opts io_opts;
- bch2_inode_opts_get(&io_opts, c, inode);
- return io_opts_to_rebalance_opts(c, &io_opts);
-}
-
-#define BCACHEFS_ROOT_SUBVOL_INUM \
- ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO })
-
-static inline bool subvol_inum_eq(subvol_inum a, subvol_inum b)
-{
- return a.subvol == b.subvol && a.inum == b.inum;
-}
-
-int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32);
-int bch2_delete_dead_inodes(struct bch_fs *);
-
-#endif /* _BCACHEFS_INODE_H */
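The nlink helpers above store link counts with a bias: bi_nlink omits the implicit links (2 for a directory, i.e. "." plus the dirent in its parent, 1 otherwise), and a count of zero is represented by the BCH_INODE_unlinked flag instead. A self-contained sketch of that convention follows; struct demo_inode and its field names are illustrative stand-ins, not the real unpacked inode.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the real unpacked inode; field names are illustrative. */
struct demo_inode {
	unsigned nlink;		/* biased on-disk count */
	bool	 unlinked;	/* corresponds to BCH_INODE_unlinked */
	bool	 is_dir;
};

static unsigned nlink_bias(bool is_dir)
{
	return is_dir ? 2 : 1;	/* directories: "." plus the parent dirent */
}

static unsigned demo_nlink_get(const struct demo_inode *i)
{
	return i->unlinked ? 0 : i->nlink + nlink_bias(i->is_dir);
}

static void demo_nlink_set(struct demo_inode *i, unsigned nlink)
{
	if (nlink) {
		i->nlink    = nlink - nlink_bias(i->is_dir);
		i->unlinked = false;
	} else {
		i->nlink    = 0;
		i->unlinked = true;
	}
}

int main(void)
{
	struct demo_inode dir = { .is_dir = true };

	demo_nlink_set(&dir, 2);	/* an empty directory */
	printf("vfs nlink = %u, stored = %u\n", demo_nlink_get(&dir), dir.nlink);
	return 0;
}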
diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h
deleted file mode 100644
index 1f00938b1bdc..000000000000
--- a/fs/bcachefs/inode_format.h
+++ /dev/null
@@ -1,185 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_INODE_FORMAT_H
-#define _BCACHEFS_INODE_FORMAT_H
-
-#define BLOCKDEV_INODE_MAX 4096
-#define BCACHEFS_ROOT_INO 4096
-
-struct bch_inode {
- struct bch_val v;
-
- __le64 bi_hash_seed;
- __le32 bi_flags;
- __le16 bi_mode;
- __u8 fields[];
-} __packed __aligned(8);
-
-struct bch_inode_v2 {
- struct bch_val v;
-
- __le64 bi_journal_seq;
- __le64 bi_hash_seed;
- __le64 bi_flags;
- __le16 bi_mode;
- __u8 fields[];
-} __packed __aligned(8);
-
-struct bch_inode_v3 {
- struct bch_val v;
-
- __le64 bi_journal_seq;
- __le64 bi_hash_seed;
- __le64 bi_flags;
- __le64 bi_sectors;
- __le64 bi_size;
- __le64 bi_version;
- __u8 fields[];
-} __packed __aligned(8);
-
-#define INODEv3_FIELDS_START_INITIAL 6
-#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64))
-
-struct bch_inode_generation {
- struct bch_val v;
-
- __le32 bi_generation;
- __le32 pad;
-} __packed __aligned(8);
-
-/*
- * bi_subvol and bi_parent_subvol are only set for subvolume roots:
- */
-
-#define BCH_INODE_FIELDS_v2() \
- x(bi_atime, 96) \
- x(bi_ctime, 96) \
- x(bi_mtime, 96) \
- x(bi_otime, 96) \
- x(bi_size, 64) \
- x(bi_sectors, 64) \
- x(bi_uid, 32) \
- x(bi_gid, 32) \
- x(bi_nlink, 32) \
- x(bi_generation, 32) \
- x(bi_dev, 32) \
- x(bi_data_checksum, 8) \
- x(bi_compression, 8) \
- x(bi_project, 32) \
- x(bi_background_compression, 8) \
- x(bi_data_replicas, 8) \
- x(bi_promote_target, 16) \
- x(bi_foreground_target, 16) \
- x(bi_background_target, 16) \
- x(bi_erasure_code, 16) \
- x(bi_fields_set, 16) \
- x(bi_dir, 64) \
- x(bi_dir_offset, 64) \
- x(bi_subvol, 32) \
- x(bi_parent_subvol, 32)
-
-#define BCH_INODE_FIELDS_v3() \
- x(bi_atime, 96) \
- x(bi_ctime, 96) \
- x(bi_mtime, 96) \
- x(bi_otime, 96) \
- x(bi_uid, 32) \
- x(bi_gid, 32) \
- x(bi_nlink, 32) \
- x(bi_generation, 32) \
- x(bi_dev, 32) \
- x(bi_data_checksum, 8) \
- x(bi_compression, 8) \
- x(bi_project, 32) \
- x(bi_background_compression, 8) \
- x(bi_data_replicas, 8) \
- x(bi_promote_target, 16) \
- x(bi_foreground_target, 16) \
- x(bi_background_target, 16) \
- x(bi_erasure_code, 16) \
- x(bi_fields_set, 16) \
- x(bi_dir, 64) \
- x(bi_dir_offset, 64) \
- x(bi_subvol, 32) \
- x(bi_parent_subvol, 32) \
- x(bi_nocow, 8) \
- x(bi_depth, 32) \
- x(bi_inodes_32bit, 8) \
- x(bi_casefold, 8)
-
-/* subset of BCH_INODE_FIELDS */
-#define BCH_INODE_OPTS() \
- x(data_checksum, 8) \
- x(compression, 8) \
- x(project, 32) \
- x(background_compression, 8) \
- x(data_replicas, 8) \
- x(promote_target, 16) \
- x(foreground_target, 16) \
- x(background_target, 16) \
- x(erasure_code, 16) \
- x(nocow, 8) \
- x(inodes_32bit, 8) \
- x(casefold, 8)
-
-enum inode_opt_id {
-#define x(name, ...) \
- Inode_opt_##name,
- BCH_INODE_OPTS()
-#undef x
- Inode_opt_nr,
-};
-
-/*
- * BCH_INODE_has_case_insensitive is set if any descendant is case insensitive -
- * for overlayfs
- */
-#define BCH_INODE_FLAGS() \
- x(sync, 0) \
- x(immutable, 1) \
- x(append, 2) \
- x(nodump, 3) \
- x(noatime, 4) \
- x(i_size_dirty, 5) \
- x(i_sectors_dirty, 6) \
- x(unlinked, 7) \
- x(backptr_untrusted, 8) \
- x(has_child_snapshot, 9) \
- x(has_case_insensitive, 10)
-
-/* bits 20+ reserved for packed fields below: */
-
-enum bch_inode_flags {
-#define x(t, n) BCH_INODE_##t = 1U << n,
- BCH_INODE_FLAGS()
-#undef x
-};
-
-enum __bch_inode_flags {
-#define x(t, n) __BCH_INODE_##t = n,
- BCH_INODE_FLAGS()
-#undef x
-};
-
-LE32_BITMASK(INODEv1_STR_HASH, struct bch_inode, bi_flags, 20, 24);
-LE32_BITMASK(INODEv1_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
-LE32_BITMASK(INODEv1_NEW_VARINT,struct bch_inode, bi_flags, 31, 32);
-
-LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
-LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31);
-
-LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24);
-LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31);
-
-LE64_BITMASK(INODEv3_FIELDS_START,
- struct bch_inode_v3, bi_flags, 31, 36);
-LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52);
-
-struct bch_inode_alloc_cursor {
- struct bch_val v;
- __u8 bits;
- __u8 pad;
- __le32 gen;
- __le64 idx;
-};
-
-#endif /* _BCACHEFS_INODE_FORMAT_H */
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
deleted file mode 100644
index 07023667a475..000000000000
--- a/fs/bcachefs/io_misc.c
+++ /dev/null
@@ -1,570 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * io_misc.c - fallocate, fpunch, truncate:
- */
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "clock.h"
-#include "error.h"
-#include "extents.h"
-#include "extent_update.h"
-#include "inode.h"
-#include "io_misc.h"
-#include "io_write.h"
-#include "logged_ops.h"
-#include "rebalance.h"
-#include "subvolume.h"
-
-/* Overwrites whatever was present with zeroes: */
-int bch2_extent_fallocate(struct btree_trans *trans,
- subvol_inum inum,
- struct btree_iter *iter,
- u64 sectors,
- struct bch_io_opts opts,
- s64 *i_sectors_delta,
- struct write_point_specifier write_point)
-{
- struct bch_fs *c = trans->c;
- struct disk_reservation disk_res = { 0 };
- struct closure cl;
- struct open_buckets open_buckets = { 0 };
- struct bkey_s_c k;
- struct bkey_buf old, new;
- unsigned sectors_allocated = 0, new_replicas;
- bool unwritten = opts.nocow &&
- c->sb.version >= bcachefs_metadata_version_unwritten_extents;
- int ret;
-
- bch2_bkey_buf_init(&old);
- bch2_bkey_buf_init(&new);
- closure_init_stack(&cl);
-
- k = bch2_btree_iter_peek_slot(trans, iter);
- ret = bkey_err(k);
- if (ret)
- return ret;
-
- sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset);
- new_replicas = max(0, (int) opts.data_replicas -
- (int) bch2_bkey_nr_ptrs_fully_allocated(k));
-
- /*
- * Get a disk reservation before (in the nocow case) calling
- * into the allocator:
- */
- ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
- if (unlikely(ret))
- goto err_noprint;
-
- bch2_bkey_buf_reassemble(&old, c, k);
-
- if (!unwritten) {
- struct bkey_i_reservation *reservation;
-
- bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64));
- reservation = bkey_reservation_init(new.k);
- reservation->k.p = iter->pos;
- bch2_key_resize(&reservation->k, sectors);
- reservation->v.nr_replicas = opts.data_replicas;
- } else {
- struct bkey_i_extent *e;
- struct bch_devs_list devs_have;
- struct write_point *wp;
-
- devs_have.nr = 0;
-
- bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX);
-
- e = bkey_extent_init(new.k);
- e->k.p = iter->pos;
-
- ret = bch2_alloc_sectors_start_trans(trans,
- opts.foreground_target,
- false,
- write_point,
- &devs_have,
- opts.data_replicas,
- opts.data_replicas,
- BCH_WATERMARK_normal, 0, &cl, &wp);
- if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
- ret = bch_err_throw(c, transaction_restart_nested);
- if (ret)
- goto err;
-
- sectors = min_t(u64, sectors, wp->sectors_free);
- sectors_allocated = sectors;
-
- bch2_key_resize(&e->k, sectors);
-
- bch2_open_bucket_get(c, wp, &open_buckets);
- bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
- bch2_alloc_sectors_done(c, wp);
-
- extent_for_each_ptr(extent_i_to_s(e), ptr)
- ptr->unwritten = true;
- }
-
- ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
- 0, i_sectors_delta, true);
-err:
- if (!ret && sectors_allocated)
- bch2_increment_clock(c, sectors_allocated, WRITE);
- if (should_print_err(ret)) {
- struct printbuf buf = PRINTBUF;
- lockrestart_do(trans,
- bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9));
- prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret));
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-err_noprint:
- bch2_open_buckets_put(c, &open_buckets);
- bch2_disk_reservation_put(c, &disk_res);
- bch2_bkey_buf_exit(&new, c);
- bch2_bkey_buf_exit(&old, c);
-
- if (closure_nr_remaining(&cl) != 1) {
- bch2_trans_unlock_long(trans);
- bch2_wait_on_allocator(c, &cl);
- }
-
- return ret;
-}
-
-/* For fsck */
-int bch2_fpunch_snapshot(struct btree_trans *trans, struct bpos start, struct bpos end)
-{
- u32 restart_count = trans->restart_count;
- struct bch_fs *c = trans->c;
- struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0);
- unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
- struct bkey_i delete;
-
- int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
- start, end, 0, k,
- &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
- bkey_init(&delete.k);
- delete.k.p = iter.pos;
-
- /* create the biggest key we can */
- bch2_key_resize(&delete.k, max_sectors);
- bch2_cut_back(end, &delete);
-
- bch2_extent_trim_atomic(trans, &iter, &delete) ?:
- bch2_trans_update(trans, &iter, &delete, 0);
- }));
-
- bch2_disk_reservation_put(c, &disk_res);
- return ret ?: trans_was_restarted(trans, restart_count);
-}
-
-/*
- * Returns -BCH_ERR_transaction_restart if we had to drop locks:
- */
-int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
- subvol_inum inum, u64 end,
- s64 *i_sectors_delta)
-{
- struct bch_fs *c = trans->c;
- unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
- struct bpos end_pos = POS(inum.inum, end);
- struct bkey_s_c k;
- int ret = 0, ret2 = 0;
- u32 snapshot;
-
- while (!ret ||
- bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- struct disk_reservation disk_res =
- bch2_disk_reservation_init(c, 0);
- struct bkey_i delete;
-
- if (ret)
- ret2 = ret;
-
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- continue;
-
- bch2_btree_iter_set_snapshot(trans, iter, snapshot);
-
- /*
- * peek_max() doesn't have ideal semantics for extents:
- */
- k = bch2_btree_iter_peek_max(trans, iter, end_pos);
- if (!k.k)
- break;
-
- ret = bkey_err(k);
- if (ret)
- continue;
-
- bkey_init(&delete.k);
- delete.k.p = iter->pos;
-
- /* create the biggest key we can */
- bch2_key_resize(&delete.k, max_sectors);
- bch2_cut_back(end_pos, &delete);
-
- ret = bch2_extent_update(trans, inum, iter, &delete,
- &disk_res, 0, i_sectors_delta, false);
- bch2_disk_reservation_put(c, &disk_res);
- }
-
- return ret ?: ret2;
-}
-
-int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
- s64 *i_sectors_delta)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- POS(inum.inum, start),
- BTREE_ITER_intent);
-
- ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta);
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- ret = 0;
-
- return ret;
-}
-
-/* truncate: */
-
-void bch2_logged_op_truncate_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_s_c_logged_op_truncate op = bkey_s_c_to_logged_op_truncate(k);
-
- prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol));
- prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum));
- prt_printf(out, " new_i_size=%llu", le64_to_cpu(op.v->new_i_size));
-}
-
-static int truncate_set_isize(struct btree_trans *trans,
- subvol_inum inum,
- u64 new_i_size,
- bool warn)
-{
- struct btree_iter iter = {};
- struct bch_inode_unpacked inode_u;
- int ret;
-
- ret = __bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent, warn) ?:
- (inode_u.bi_size = new_i_size, 0) ?:
- bch2_inode_write(trans, &iter, &inode_u);
-
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
- struct bkey_i *op_k,
- u64 *i_sectors_delta)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter fpunch_iter;
- struct bkey_i_logged_op_truncate *op = bkey_i_to_logged_op_truncate(op_k);
- subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
- u64 new_i_size = le64_to_cpu(op->v.new_i_size);
- bool warn_errors = i_sectors_delta != NULL;
- int ret;
-
- ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- truncate_set_isize(trans, inum, new_i_size, i_sectors_delta != NULL));
- if (ret)
- goto err;
-
- bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents,
- POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9),
- BTREE_ITER_intent);
- ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta);
- bch2_trans_iter_exit(trans, &fpunch_iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- ret = 0;
-err:
- if (warn_errors)
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_resume_logged_op_truncate(struct btree_trans *trans, struct bkey_i *op_k)
-{
- return __bch2_resume_logged_op_truncate(trans, op_k, NULL);
-}
-
-int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sectors_delta)
-{
- struct bkey_i_logged_op_truncate op;
-
- bkey_logged_op_truncate_init(&op.k_i);
- op.v.subvol = cpu_to_le32(inum.subvol);
- op.v.inum = cpu_to_le64(inum.inum);
- op.v.new_i_size = cpu_to_le64(new_i_size);
-
- /*
- * Logged ops aren't atomic w.r.t. snapshot creation: creating a
- * snapshot while they're in progress, then crashing, will result in the
- * resume only proceeding in one of the snapshots
- */
- down_read(&c->snapshot_create_lock);
- struct btree_trans *trans = bch2_trans_get(c);
- int ret = bch2_logged_op_start(trans, &op.k_i);
- if (ret)
- goto out;
- ret = __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta);
- ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret;
-out:
- bch2_trans_put(trans);
- up_read(&c->snapshot_create_lock);
-
- return ret;
-}
-
-/* finsert/fcollapse: */
-
-void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_s_c_logged_op_finsert op = bkey_s_c_to_logged_op_finsert(k);
-
- prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol));
- prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum));
- prt_printf(out, " dst_offset=%lli", le64_to_cpu(op.v->dst_offset));
- prt_printf(out, " src_offset=%llu", le64_to_cpu(op.v->src_offset));
-}
-
-static int adjust_i_size(struct btree_trans *trans, subvol_inum inum,
- u64 offset, s64 len, bool warn)
-{
- struct btree_iter iter;
- struct bch_inode_unpacked inode_u;
- int ret;
-
- offset <<= 9;
- len <<= 9;
-
- ret = __bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent, warn);
- if (ret)
- return ret;
-
- if (len > 0) {
- if (MAX_LFS_FILESIZE - inode_u.bi_size < len) {
- ret = -EFBIG;
- goto err;
- }
-
- if (offset >= inode_u.bi_size) {
- ret = -EINVAL;
- goto err;
- }
- }
-
- inode_u.bi_size += len;
- inode_u.bi_mtime = inode_u.bi_ctime = bch2_current_time(trans->c);
-
- ret = bch2_inode_write(trans, &iter, &inode_u);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
- struct bkey_i *op_k,
- u64 *i_sectors_delta)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k);
- subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
- struct bch_io_opts opts;
- u64 dst_offset = le64_to_cpu(op->v.dst_offset);
- u64 src_offset = le64_to_cpu(op->v.src_offset);
- s64 shift = dst_offset - src_offset;
- u64 len = abs(shift);
- u64 pos = le64_to_cpu(op->v.pos);
- bool insert = shift > 0;
- u32 snapshot;
- bool warn_errors = i_sectors_delta != NULL;
- int ret = 0;
-
- ret = bch2_inum_opts_get(trans, inum, &opts);
- if (ret)
- return ret;
-
- /*
- * check for missing subvolume before fpunch, as in the resume path we don't
- * want it to be a fatal error
- */
- ret = lockrestart_do(trans, __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn_errors));
- if (ret)
- return ret;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- POS(inum.inum, 0),
- BTREE_ITER_intent);
-
- switch (op->v.state) {
-case LOGGED_OP_FINSERT_start:
- op->v.state = LOGGED_OP_FINSERT_shift_extents;
-
- if (insert) {
- ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- adjust_i_size(trans, inum, src_offset, len, warn_errors) ?:
- bch2_logged_op_update(trans, &op->k_i));
- if (ret)
- goto err;
- } else {
- bch2_btree_iter_set_pos(trans, &iter, POS(inum.inum, src_offset));
-
- ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta);
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto err;
-
- ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_logged_op_update(trans, &op->k_i));
- }
-
- fallthrough;
-case LOGGED_OP_FINSERT_shift_extents:
- while (1) {
- struct disk_reservation disk_res =
- bch2_disk_reservation_init(c, 0);
- struct bkey_i delete, *copy;
- struct bkey_s_c k;
- struct bpos src_pos = POS(inum.inum, src_offset);
-
- bch2_trans_begin(trans);
-
- ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot,
- warn_errors);
- if (ret)
- goto btree_err;
-
- bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
- bch2_btree_iter_set_pos(trans, &iter, SPOS(inum.inum, pos, snapshot));
-
- k = insert
- ? bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum.inum, 0))
- : bch2_btree_iter_peek_max(trans, &iter, POS(inum.inum, U64_MAX));
- if ((ret = bkey_err(k)))
- goto btree_err;
-
- if (!k.k ||
- k.k->p.inode != inum.inum ||
- bkey_le(k.k->p, POS(inum.inum, src_offset)))
- break;
-
- copy = bch2_bkey_make_mut_noupdate(trans, k);
- if ((ret = PTR_ERR_OR_ZERO(copy)))
- goto btree_err;
-
- if (insert &&
- bkey_lt(bkey_start_pos(k.k), src_pos)) {
- bch2_cut_front(src_pos, copy);
-
- /* Splitting compressed extent? */
- bch2_disk_reservation_add(c, &disk_res,
- copy->k.size *
- bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy)),
- BCH_DISK_RESERVATION_NOFAIL);
- }
-
- bkey_init(&delete.k);
- delete.k.p = copy->k.p;
- delete.k.p.snapshot = snapshot;
- delete.k.size = copy->k.size;
-
- copy->k.p.offset += shift;
- copy->k.p.snapshot = snapshot;
-
- op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
-
- ret = bch2_bkey_set_needs_rebalance(c, &opts, copy) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
- bch2_logged_op_update(trans, &op->k_i) ?:
- bch2_trans_commit(trans, &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc);
-btree_err:
- bch2_disk_reservation_put(c, &disk_res);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- goto err;
-
- pos = le64_to_cpu(op->v.pos);
- }
-
- op->v.state = LOGGED_OP_FINSERT_finish;
-
- if (!insert) {
- ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- adjust_i_size(trans, inum, src_offset, shift, warn_errors) ?:
- bch2_logged_op_update(trans, &op->k_i));
- } else {
- /* We need an inode update to update bi_journal_seq for fsync: */
- ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- adjust_i_size(trans, inum, 0, 0, warn_errors) ?:
- bch2_logged_op_update(trans, &op->k_i));
- }
-
- break;
-case LOGGED_OP_FINSERT_finish:
- break;
- }
-err:
- bch2_trans_iter_exit(trans, &iter);
- if (warn_errors)
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_resume_logged_op_finsert(struct btree_trans *trans, struct bkey_i *op_k)
-{
- return __bch2_resume_logged_op_finsert(trans, op_k, NULL);
-}
-
-int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum,
- u64 offset, u64 len, bool insert,
- s64 *i_sectors_delta)
-{
- struct bkey_i_logged_op_finsert op;
- s64 shift = insert ? len : -len;
-
- bkey_logged_op_finsert_init(&op.k_i);
- op.v.subvol = cpu_to_le32(inum.subvol);
- op.v.inum = cpu_to_le64(inum.inum);
- op.v.dst_offset = cpu_to_le64(offset + shift);
- op.v.src_offset = cpu_to_le64(offset);
- op.v.pos = cpu_to_le64(insert ? U64_MAX : offset);
-
- /*
- * Logged ops aren't atomic w.r.t. snapshot creation: creating a
- * snapshot while they're in progress, then crashing, will result in the
- * resume only proceeding in one of the snapshots
- */
- down_read(&c->snapshot_create_lock);
- struct btree_trans *trans = bch2_trans_get(c);
- int ret = bch2_logged_op_start(trans, &op.k_i);
- if (ret)
- goto out;
- ret = __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta);
- ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret;
-out:
- bch2_trans_put(trans);
- up_read(&c->snapshot_create_lock);
-
- return ret;
-}
diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h
deleted file mode 100644
index b93e4d4b3c0c..000000000000
--- a/fs/bcachefs/io_misc.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IO_MISC_H
-#define _BCACHEFS_IO_MISC_H
-
-int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *,
- u64, struct bch_io_opts, s64 *,
- struct write_point_specifier);
-
-int bch2_fpunch_snapshot(struct btree_trans *, struct bpos, struct bpos);
-int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
- subvol_inum, u64, s64 *);
-int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
-
-void bch2_logged_op_truncate_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_logged_op_truncate ((struct bkey_ops) { \
- .val_to_text = bch2_logged_op_truncate_to_text, \
- .min_val_size = 24, \
-})
-
-int bch2_resume_logged_op_truncate(struct btree_trans *, struct bkey_i *);
-
-int bch2_truncate(struct bch_fs *, subvol_inum, u64, u64 *);
-
-void bch2_logged_op_finsert_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_logged_op_finsert ((struct bkey_ops) { \
- .val_to_text = bch2_logged_op_finsert_to_text, \
- .min_val_size = 24, \
-})
-
-int bch2_resume_logged_op_finsert(struct btree_trans *, struct bkey_i *);
-
-int bch2_fcollapse_finsert(struct bch_fs *, subvol_inum, u64, u64, bool, s64 *);
-
-#endif /* _BCACHEFS_IO_MISC_H */
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
deleted file mode 100644
index e0874ad9a6cf..000000000000
--- a/fs/bcachefs/io_read.c
+++ /dev/null
@@ -1,1543 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Some low level IO code, and hacks for various block layer limitations
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "async_objs.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "clock.h"
-#include "compress.h"
-#include "data_update.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "io_read.h"
-#include "io_misc.h"
-#include "io_write.h"
-#include "reflink.h"
-#include "subvolume.h"
-#include "trace.h"
-
-#include <linux/moduleparam.h>
-#include <linux/random.h>
-#include <linux/sched/mm.h>
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-static unsigned bch2_read_corrupt_ratio;
-module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
-MODULE_PARM_DESC(read_corrupt_ratio, "");
-#endif
-
-static bool bch2_poison_extents_on_checksum_error;
-module_param_named(poison_extents_on_checksum_error,
- bch2_poison_extents_on_checksum_error, bool, 0644);
-MODULE_PARM_DESC(poison_extents_on_checksum_error,
- "Extents with checksum errors are marked as poisoned - unsafe without read fua support");
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-
-static bool bch2_target_congested(struct bch_fs *c, u16 target)
-{
- const struct bch_devs_mask *devs;
- unsigned d, nr = 0, total = 0;
- u64 now = local_clock(), last;
- s64 congested;
- struct bch_dev *ca;
-
- if (!target)
- return false;
-
- guard(rcu)();
- devs = bch2_target_to_mask(c, target) ?:
- &c->rw_devs[BCH_DATA_user];
-
- for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
- ca = rcu_dereference(c->devs[d]);
- if (!ca)
- continue;
-
- congested = atomic_read(&ca->congested);
- last = READ_ONCE(ca->congested_last);
- if (time_after64(now, last))
- congested -= (now - last) >> 12;
-
- total += max(congested, 0LL);
- nr++;
- }
-
- return get_random_u32_below(nr * CONGESTED_MAX) < total;
-}
-
-#else
-
-static bool bch2_target_congested(struct bch_fs *c, u16 target)
-{
- return false;
-}
-
-#endif
-
-/* Cache promotion on read */
-
-static const struct rhashtable_params bch_promote_params = {
- .head_offset = offsetof(struct promote_op, hash),
- .key_offset = offsetof(struct promote_op, pos),
- .key_len = sizeof(struct bpos),
- .automatic_shrinking = true,
-};
-
-static inline bool have_io_error(struct bch_io_failures *failed)
-{
- return failed && failed->nr;
-}
-
-static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio)
-{
- EBUG_ON(rbio->split);
-
- return rbio->data_update
- ? container_of(rbio, struct data_update, rbio)
- : NULL;
-}
-
-static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
-{
- struct data_update *u = rbio_data_update(orig);
- if (!u)
- return false;
-
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
- unsigned i = 0;
- bkey_for_each_ptr(ptrs, ptr) {
- if (ptr->dev == dev &&
- u->data_opts.rewrite_ptrs & BIT(i))
- return true;
- i++;
- }
-
- return false;
-}
-
-static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
- struct bpos pos,
- struct bch_io_opts opts,
- unsigned flags,
- struct bch_io_failures *failed)
-{
- if (!have_io_error(failed)) {
- BUG_ON(!opts.promote_target);
-
- if (!(flags & BCH_READ_may_promote))
- return bch_err_throw(c, nopromote_may_not);
-
- if (bch2_bkey_has_target(c, k, opts.promote_target))
- return bch_err_throw(c, nopromote_already_promoted);
-
- if (bkey_extent_is_unwritten(k))
- return bch_err_throw(c, nopromote_unwritten);
-
- if (bch2_target_congested(c, opts.promote_target))
- return bch_err_throw(c, nopromote_congested);
- }
-
- if (rhashtable_lookup_fast(&c->promote_table, &pos,
- bch_promote_params))
- return bch_err_throw(c, nopromote_in_flight);
-
- return 0;
-}
-
-static noinline void promote_free(struct bch_read_bio *rbio)
-{
- struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
- struct bch_fs *c = rbio->c;
-
- int ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
- bch_promote_params);
- BUG_ON(ret);
-
- async_object_list_del(c, promote, op->list_idx);
- async_object_list_del(c, rbio, rbio->list_idx);
-
- bch2_data_update_exit(&op->write);
-
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote);
- kfree_rcu(op, rcu);
-}
-
-static void promote_done(struct bch_write_op *wop)
-{
- struct promote_op *op = container_of(wop, struct promote_op, write.op);
- struct bch_fs *c = op->write.rbio.c;
-
- bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time);
- promote_free(&op->write.rbio);
-}
-
-static void promote_start_work(struct work_struct *work)
-{
- struct promote_op *op = container_of(work, struct promote_op, work);
-
- bch2_data_update_read_done(&op->write);
-}
-
-static noinline void promote_start(struct bch_read_bio *rbio)
-{
- struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
-
- trace_and_count(op->write.op.c, io_read_promote, &rbio->bio);
-
- INIT_WORK(&op->work, promote_start_work);
- queue_work(rbio->c->write_ref_wq, &op->work);
-}
-
-static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
- enum btree_id btree_id,
- struct bkey_s_c k,
- struct bpos pos,
- struct extent_ptr_decoded *pick,
- unsigned sectors,
- struct bch_read_bio *orig,
- struct bch_io_failures *failed)
-{
- struct bch_fs *c = trans->c;
- int ret;
-
- struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait };
-
- if (!have_io_error(failed)) {
- update_opts.target = orig->opts.promote_target;
- update_opts.extra_replicas = 1;
- update_opts.write_flags |= BCH_WRITE_cached;
- update_opts.write_flags |= BCH_WRITE_only_specified_devs;
- } else {
- update_opts.target = orig->opts.foreground_target;
-
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- unsigned ptr_bit = 1;
- bkey_for_each_ptr(ptrs, ptr) {
- if (bch2_dev_io_failures(failed, ptr->dev) &&
- !ptr_being_rewritten(orig, ptr->dev))
- update_opts.rewrite_ptrs |= ptr_bit;
- ptr_bit <<= 1;
- }
-
- if (!update_opts.rewrite_ptrs)
- return NULL;
- }
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_promote))
- return ERR_PTR(-BCH_ERR_nopromote_no_writes);
-
- struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL);
- if (!op) {
- ret = bch_err_throw(c, nopromote_enomem);
- goto err_put;
- }
-
- op->start_time = local_clock();
- op->pos = pos;
-
- if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
- bch_promote_params)) {
- ret = bch_err_throw(c, nopromote_in_flight);
- goto err;
- }
-
- ret = async_object_list_add(c, promote, op, &op->list_idx);
- if (ret < 0)
- goto err_remove_hash;
-
- ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
- writepoint_hashed((unsigned long) current),
- &orig->opts,
- update_opts,
- btree_id, k);
- op->write.type = BCH_DATA_UPDATE_promote;
- /*
- * possible errors: -BCH_ERR_nocow_lock_blocked,
- * -BCH_ERR_ENOSPC_disk_reservation:
- */
- if (ret)
- goto err_remove_list;
-
- rbio_init_fragment(&op->write.rbio.bio, orig);
- op->write.rbio.bounce = true;
- op->write.rbio.promote = true;
- op->write.op.end_io = promote_done;
-
- return &op->write.rbio;
-err_remove_list:
- async_object_list_del(c, promote, op->list_idx);
-err_remove_hash:
- BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
- bch_promote_params));
-err:
- bio_free_pages(&op->write.op.wbio.bio);
- /* We may have added to the rhashtable and thus need rcu freeing: */
- kfree_rcu(op, rcu);
-err_put:
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote);
- return ERR_PTR(ret);
-}
-
-noinline
-static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
- struct bvec_iter iter,
- struct bkey_s_c k,
- struct extent_ptr_decoded *pick,
- unsigned flags,
- struct bch_read_bio *orig,
- bool *bounce,
- bool *read_full,
- struct bch_io_failures *failed)
-{
- /*
- * We're in the retry path, but we don't know what to repair yet, and we
- * don't want to do a promote here:
- */
- if (failed && !failed->nr)
- return NULL;
-
- struct bch_fs *c = trans->c;
- /*
- * if failed != NULL we're not actually doing a promote, we're
- * recovering from an io/checksum error
- */
- bool promote_full = (have_io_error(failed) ||
- *read_full ||
- READ_ONCE(c->opts.promote_whole_extents));
- /* data might have to be decompressed in the write path: */
- unsigned sectors = promote_full
- ? max(pick->crc.compressed_size, pick->crc.live_size)
- : bvec_iter_sectors(iter);
- struct bpos pos = promote_full
- ? bkey_start_pos(k.k)
- : POS(k.k->p.inode, iter.bi_sector);
- int ret;
-
- ret = should_promote(c, k, pos, orig->opts, flags, failed);
- if (ret)
- goto nopromote;
-
- struct bch_read_bio *promote =
- __promote_alloc(trans,
- k.k->type == KEY_TYPE_reflink_v
- ? BTREE_ID_reflink
- : BTREE_ID_extents,
- k, pos, pick, sectors, orig, failed);
- if (!promote)
- return NULL;
-
- ret = PTR_ERR_OR_ZERO(promote);
- if (ret)
- goto nopromote;
-
- *bounce = true;
- *read_full = promote_full;
-
- if (have_io_error(failed))
- orig->self_healing = true;
-
- return promote;
-nopromote:
- trace_io_read_nopromote(c, ret);
- return NULL;
-}
-
-void bch2_promote_op_to_text(struct printbuf *out, struct promote_op *op)
-{
- if (!op->write.read_done) {
- prt_printf(out, "parent read: %px\n", op->write.rbio.parent);
- printbuf_indent_add(out, 2);
- bch2_read_bio_to_text(out, op->write.rbio.parent);
- printbuf_indent_sub(out, 2);
- }
-
- bch2_data_update_to_text(out, &op->write);
-}
-
-/* Read */
-
-static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
- struct bch_read_bio *rbio, struct bpos read_pos)
-{
- int ret = lockrestart_do(trans,
- bch2_inum_offset_err_msg_trans(trans, out,
- (subvol_inum) { rbio->subvol, read_pos.inode },
- read_pos.offset << 9));
- if (ret)
- return ret;
-
- if (rbio->data_update)
- prt_str(out, "(internal move) ");
-
- return 0;
-}
-
-static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
- struct bch_read_bio *rbio, struct bpos read_pos)
-{
- bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
-}
-
-enum rbio_context {
- RBIO_CONTEXT_NULL,
- RBIO_CONTEXT_HIGHPRI,
- RBIO_CONTEXT_UNBOUND,
-};
-
-static inline struct bch_read_bio *
-bch2_rbio_parent(struct bch_read_bio *rbio)
-{
- return rbio->split ? rbio->parent : rbio;
-}
-
-__always_inline
-static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
- enum rbio_context context,
- struct workqueue_struct *wq)
-{
- if (context <= rbio->context) {
- fn(&rbio->work);
- } else {
- rbio->work.func = fn;
- rbio->context = context;
- queue_work(wq, &rbio->work);
- }
-}
-
-static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
-{
- BUG_ON(rbio->bounce && !rbio->split);
-
- if (rbio->have_ioref) {
- struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
- enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read);
- }
-
- if (rbio->split) {
- struct bch_read_bio *parent = rbio->parent;
-
- if (unlikely(rbio->promote)) {
- if (!rbio->bio.bi_status)
- promote_start(rbio);
- else
- promote_free(rbio);
- } else {
- async_object_list_del(rbio->c, rbio, rbio->list_idx);
-
- if (rbio->bounce)
- bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
-
- bio_put(&rbio->bio);
- }
-
- rbio = parent;
- }
-
- return rbio;
-}
-
-/*
- * Only called on a top level bch_read_bio to complete an entire read request,
- * not a split:
- */
-static void bch2_rbio_done(struct bch_read_bio *rbio)
-{
- if (rbio->start_time)
- bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
- rbio->start_time);
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
- if (rbio->list_idx)
- async_object_list_del(rbio->c, rbio, rbio->list_idx);
-#endif
- bio_endio(&rbio->bio);
-}
-
-static void get_rbio_extent(struct btree_trans *trans,
- struct bch_read_bio *rbio,
- struct bkey_buf *sk)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = lockrestart_do(trans,
- bkey_err(k = bch2_bkey_get_iter(trans, &iter,
- rbio->data_btree, rbio->data_pos, 0)));
- if (ret)
- return;
-
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- bkey_for_each_ptr(ptrs, ptr)
- if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr)) {
- bch2_bkey_buf_reassemble(sk, trans->c, k);
- break;
- }
-
- bch2_trans_iter_exit(trans, &iter);
-}
-
-static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
- enum btree_id btree, struct bkey_s_c read_k)
-{
- if (!bch2_poison_extents_on_checksum_error)
- return 0;
-
- struct bch_fs *c = trans->c;
-
- struct data_update *u = rbio_data_update(rbio);
- if (u)
- read_k = bkey_i_to_s_c(u->k.k);
-
- u64 flags = bch2_bkey_extent_flags(read_k);
- if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
- return 0;
-
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(read_k.k),
- BTREE_ITER_intent);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (!bkey_and_val_eq(k, read_k))
- goto out;
-
- struct bkey_i *new = bch2_trans_kmalloc(trans,
- bkey_bytes(k.k) + sizeof(struct bch_extent_flags));
- ret = PTR_ERR_OR_ZERO(new) ?:
- (bkey_reassemble(new, k), 0) ?:
- bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?:
- bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_trans_commit(trans, NULL, NULL, 0);
-
- /*
- * Propagate key change back to data update path, in particular so it
- * knows the extent has been poisoned and it's safe to change the
- * checksum
- */
- if (u && !ret)
- bch2_bkey_buf_copy(&u->k, c, new);
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
- struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter,
- struct bch_io_failures *failed,
- unsigned flags)
-{
- struct data_update *u = container_of(rbio, struct data_update, rbio);
-retry:
- bch2_trans_begin(trans);
-
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = lockrestart_do(trans,
- bkey_err(k = bch2_bkey_get_iter(trans, &iter,
- u->btree_id, bkey_start_pos(&u->k.k->k),
- 0)));
- if (ret)
- goto err;
-
- if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
- /* extent we wanted to read no longer exists: */
- rbio->ret = bch_err_throw(trans->c, data_read_key_overwritten);
- goto err;
- }
-
- ret = __bch2_read_extent(trans, rbio, bvec_iter,
- bkey_start_pos(&u->k.k->k),
- u->btree_id,
- bkey_i_to_s_c(u->k.k),
- 0, failed, flags, -1);
-err:
- bch2_trans_iter_exit(trans, &iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
- bch2_err_matches(ret, BCH_ERR_data_read_retry))
- goto retry;
-
- if (ret) {
- rbio->bio.bi_status = BLK_STS_IOERR;
- rbio->ret = ret;
- }
-
- BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1);
- return ret;
-}
-
-static void bch2_rbio_retry(struct work_struct *work)
-{
- struct bch_read_bio *rbio =
- container_of(work, struct bch_read_bio, work);
- struct bch_fs *c = rbio->c;
- struct bvec_iter iter = rbio->bvec_iter;
- unsigned flags = rbio->flags;
- subvol_inum inum = {
- .subvol = rbio->subvol,
- .inum = rbio->read_pos.inode,
- };
- struct bch_io_failures failed = { .nr = 0 };
-
- struct btree_trans *trans = bch2_trans_get(c);
-
- struct bkey_buf sk;
- bch2_bkey_buf_init(&sk);
- bkey_init(&sk.k->k);
-
- trace_io_read_retry(&rbio->bio);
- this_cpu_add(c->counters[BCH_COUNTER_io_read_retry],
- bvec_iter_sectors(rbio->bvec_iter));
-
- get_rbio_extent(trans, rbio, &sk);
-
- if (!bkey_deleted(&sk.k->k) &&
- bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
- bch2_mark_io_failure(&failed, &rbio->pick,
- rbio->ret == -BCH_ERR_data_read_retry_csum_err);
-
- if (!rbio->split) {
- rbio->bio.bi_status = 0;
- rbio->ret = 0;
- }
-
- unsigned subvol = rbio->subvol;
- struct bpos read_pos = rbio->read_pos;
-
- rbio = bch2_rbio_free(rbio);
-
- flags |= BCH_READ_in_retry;
- flags &= ~BCH_READ_may_promote;
- flags &= ~BCH_READ_last_fragment;
- flags |= BCH_READ_must_clone;
-
- int ret = rbio->data_update
- ? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
- : __bch2_read(trans, rbio, iter, inum, &failed, &sk, flags);
-
- if (ret) {
- rbio->ret = ret;
- rbio->bio.bi_status = BLK_STS_IOERR;
- }
-
- if (failed.nr || ret) {
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
-
- lockrestart_do(trans,
- bch2_inum_offset_err_msg_trans(trans, &buf,
- (subvol_inum) { subvol, read_pos.inode },
- read_pos.offset << 9));
- if (rbio->data_update)
- prt_str(&buf, "(internal move) ");
-
- prt_str(&buf, "data read error, ");
- if (!ret) {
- prt_str(&buf, "successful retry");
- if (rbio->self_healing)
- prt_str(&buf, ", self healing");
- } else
- prt_str(&buf, bch2_err_str(ret));
- prt_newline(&buf);
-
-
- if (!bkey_deleted(&sk.k->k)) {
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(sk.k));
- prt_newline(&buf);
- }
-
- bch2_io_failures_to_text(&buf, c, &failed);
-
- bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- }
-
- bch2_rbio_done(rbio);
- bch2_bkey_buf_exit(&sk, c);
- bch2_trans_put(trans);
-}
-
-static void bch2_rbio_error(struct bch_read_bio *rbio,
- int ret, blk_status_t blk_error)
-{
- BUG_ON(ret >= 0);
-
- rbio->ret = ret;
- rbio->bio.bi_status = blk_error;
-
- bch2_rbio_parent(rbio)->saw_error = true;
-
- if (rbio->flags & BCH_READ_in_retry)
- return;
-
- if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) {
- bch2_rbio_punt(rbio, bch2_rbio_retry,
- RBIO_CONTEXT_UNBOUND, system_unbound_wq);
- } else {
- rbio = bch2_rbio_free(rbio);
-
- rbio->ret = ret;
- rbio->bio.bi_status = blk_error;
-
- bch2_rbio_done(rbio);
- }
-}
-
-static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
- struct bch_read_bio *rbio)
-{
- struct bch_fs *c = rbio->c;
- u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
- struct bch_extent_crc_unpacked new_crc;
- struct btree_iter iter;
- struct bkey_i *new;
- struct bkey_s_c k;
- int ret = 0;
-
- if (crc_is_compressed(rbio->pick.crc))
- return 0;
-
- k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
- BTREE_ITER_slots|BTREE_ITER_intent);
- if ((ret = bkey_err(k)))
- goto out;
-
- if (bversion_cmp(k.k->bversion, rbio->version) ||
- !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
- goto out;
-
- /* Extent was merged? */
- if (bkey_start_offset(k.k) < data_offset ||
- k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
- goto out;
-
- if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
- rbio->pick.crc, NULL, &new_crc,
- bkey_start_offset(k.k) - data_offset, k.k->size,
- rbio->pick.crc.csum_type)) {
- bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
- ret = 0;
- goto out;
- }
-
- /*
- * going to be temporarily appending another checksum entry:
- */
- new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
- sizeof(struct bch_extent_crc128));
- if ((ret = PTR_ERR_OR_ZERO(new)))
- goto out;
-
- bkey_reassemble(new, k);
-
- if (!bch2_bkey_narrow_crcs(new, new_crc))
- goto out;
-
- ret = bch2_trans_update(trans, &iter, new,
- BTREE_UPDATE_internal_snapshot_node);
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
-{
- bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- __bch2_rbio_narrow_crcs(trans, rbio));
-}
-
-static void bch2_read_decompress_err(struct work_struct *work)
-{
- struct bch_read_bio *rbio =
- container_of(work, struct bch_read_bio, work);
- struct bch_fs *c = rbio->c;
- struct printbuf buf = PRINTBUF;
-
- bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
- prt_str(&buf, "decompression error");
-
- struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
- if (ca)
- bch_err_ratelimited(ca, "%s", buf.buf);
- else
- bch_err_ratelimited(c, "%s", buf.buf);
-
- bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR);
- printbuf_exit(&buf);
-}
-
-static void bch2_read_decrypt_err(struct work_struct *work)
-{
- struct bch_read_bio *rbio =
- container_of(work, struct bch_read_bio, work);
- struct bch_fs *c = rbio->c;
- struct printbuf buf = PRINTBUF;
-
- bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
- prt_str(&buf, "decrypt error");
-
- struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
- if (ca)
- bch_err_ratelimited(ca, "%s", buf.buf);
- else
- bch_err_ratelimited(c, "%s", buf.buf);
-
- bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR);
- printbuf_exit(&buf);
-}
-
-/* Inner part that may run in process context */
-static void __bch2_read_endio(struct work_struct *work)
-{
- struct bch_read_bio *rbio =
- container_of(work, struct bch_read_bio, work);
- struct bch_fs *c = rbio->c;
- struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
- struct bch_read_bio *parent = bch2_rbio_parent(rbio);
- struct bio *src = &rbio->bio;
- struct bio *dst = &parent->bio;
- struct bvec_iter dst_iter = rbio->bvec_iter;
- struct bch_extent_crc_unpacked crc = rbio->pick.crc;
- struct nonce nonce = extent_nonce(rbio->version, crc);
- unsigned nofs_flags;
- struct bch_csum csum;
- int ret;
-
- nofs_flags = memalloc_nofs_save();
-
- /* Reset iterator for checksumming and copying bounced data: */
- if (rbio->bounce) {
- src->bi_iter.bi_size = crc.compressed_size << 9;
- src->bi_iter.bi_idx = 0;
- src->bi_iter.bi_bvec_done = 0;
- } else {
- src->bi_iter = rbio->bvec_iter;
- }
-
- bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio);
-
- csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
- bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;
-
- /*
- * Checksum error: if the bio wasn't bounced, we may have been
- * reading into buffers owned by userspace (that userspace can
- * scribble over) - retry the read, bouncing it this time:
- */
- if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
- rbio->flags |= BCH_READ_must_bounce;
- bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace,
- BLK_STS_IOERR);
- goto out;
- }
-
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
-
- if (!csum_good)
- goto csum_err;
-
- /*
- * XXX
- * We need to rework the narrow_crcs path to deliver the read completion
- * first, and then punt to a different workqueue, otherwise we're
- * holding up reads while doing btree updates which is bad for memory
- * reclaim.
- */
- if (unlikely(rbio->narrow_crcs))
- bch2_rbio_narrow_crcs(rbio);
-
- if (likely(!parent->data_update)) {
- /* Adjust crc to point to subset of data we want: */
- crc.offset += rbio->offset_into_extent;
- crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
-
- if (crc_is_compressed(crc)) {
- ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
- if (ret)
- goto decrypt_err;
-
- if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
- !c->opts.no_data_io)
- goto decompression_err;
- } else {
- /* don't need to decrypt the entire bio: */
- nonce = nonce_add(nonce, crc.offset << 9);
- bio_advance(src, crc.offset << 9);
-
- BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
- src->bi_iter.bi_size = dst_iter.bi_size;
-
- ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
- if (ret)
- goto decrypt_err;
-
- if (rbio->bounce) {
- struct bvec_iter src_iter = src->bi_iter;
-
- bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
- }
- }
- } else {
- if (rbio->split)
- rbio->parent->pick = rbio->pick;
-
- if (rbio->bounce) {
- struct bvec_iter src_iter = src->bi_iter;
-
- bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
- }
- }
-
- if (rbio->promote) {
- /*
- * Re-encrypt data we decrypted, so it's consistent with
- * rbio->crc:
- */
- ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
- if (ret)
- goto decrypt_err;
- }
-
- if (likely(!(rbio->flags & BCH_READ_in_retry))) {
- rbio = bch2_rbio_free(rbio);
- bch2_rbio_done(rbio);
- }
-out:
- memalloc_nofs_restore(nofs_flags);
- return;
-csum_err:
- bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
- goto out;
-decompression_err:
- bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
- goto out;
-decrypt_err:
- bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
- goto out;
-}
-
-static void bch2_read_endio(struct bio *bio)
-{
- struct bch_read_bio *rbio =
- container_of(bio, struct bch_read_bio, bio);
- struct bch_fs *c = rbio->c;
- struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
- struct workqueue_struct *wq = NULL;
- enum rbio_context context = RBIO_CONTEXT_NULL;
-
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
- rbio->submit_time, !bio->bi_status);
-
- if (!rbio->split)
- rbio->bio.bi_end_io = rbio->end_io;
-
- if (unlikely(bio->bi_status)) {
- bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
- return;
- }
-
- if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) ||
- (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
- trace_and_count(c, io_read_reuse_race, &rbio->bio);
-
- if (rbio->flags & BCH_READ_retry_if_stale)
- bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN);
- else
- bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN);
- return;
- }
-
- if (rbio->narrow_crcs ||
- rbio->promote ||
- crc_is_compressed(rbio->pick.crc) ||
- bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
- context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
- else if (rbio->pick.crc.csum_type)
- context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
-
- bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
-}
-
-static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
- struct bch_dev *ca,
- struct bkey_s_c k,
- struct bch_extent_ptr ptr)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct printbuf buf = PRINTBUF;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
- PTR_BUCKET_POS(ca, &ptr),
- BTREE_ITER_cached);
-
- int gen = bucket_gen_get(ca, iter.pos.offset);
- if (gen >= 0) {
- prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
- printbuf_indent_add(&buf, 2);
-
- bch2_bkey_val_to_text(&buf, c, k);
- prt_newline(&buf);
-
- prt_printf(&buf, "memory gen: %u", gen);
-
- ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(trans, &iter)));
- if (!ret) {
- prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, c, k);
- }
- } else {
- prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n",
- iter.pos.inode, iter.pos.offset);
- printbuf_indent_add(&buf, 2);
-
- prt_printf(&buf, "first bucket %u nbuckets %llu\n",
- ca->mi.first_bucket, ca->mi.nbuckets);
-
- bch2_bkey_val_to_text(&buf, c, k);
- prt_newline(&buf);
- }
-
- bch2_fs_inconsistent(c, "%s", buf.buf);
-
- bch2_trans_iter_exit(trans, &iter);
- printbuf_exit(&buf);
-}
-
-int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
- struct bvec_iter iter, struct bpos read_pos,
- enum btree_id data_btree, struct bkey_s_c k,
- unsigned offset_into_extent,
- struct bch_io_failures *failed, unsigned flags, int dev)
-{
- struct bch_fs *c = trans->c;
- struct extent_ptr_decoded pick;
- struct bch_read_bio *rbio = NULL;
- bool bounce = false, read_full = false, narrow_crcs = false;
- struct bpos data_pos = bkey_start_pos(k.k);
- struct data_update *u = rbio_data_update(orig);
- int ret = 0;
-
- if (bkey_extent_is_inline_data(k.k)) {
- unsigned bytes = min_t(unsigned, iter.bi_size,
- bkey_inline_data_bytes(k.k));
-
- swap(iter.bi_size, bytes);
- memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
- swap(iter.bi_size, bytes);
- bio_advance_iter(&orig->bio, &iter, bytes);
- zero_fill_bio_iter(&orig->bio, iter);
- this_cpu_add(c->counters[BCH_COUNTER_io_read_inline],
- bvec_iter_sectors(iter));
- goto out_read_done;
- }
-
- if ((bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) &&
- !orig->data_update)
- return bch_err_throw(c, extent_poisoned);
-retry_pick:
- ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);
-
- /* hole or reservation - just zero fill: */
- if (!ret)
- goto hole;
-
- if (unlikely(ret < 0)) {
- if (ret == -BCH_ERR_data_read_csum_err) {
- int ret2 = maybe_poison_extent(trans, orig, data_btree, k);
- if (ret2) {
- ret = ret2;
- goto err;
- }
-
- trace_and_count(c, io_read_fail_and_poison, &orig->bio);
- }
-
- struct printbuf buf = PRINTBUF;
- bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
- prt_printf(&buf, "%s\n ", bch2_err_str(ret));
- bch2_bkey_val_to_text(&buf, c, k);
-
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- goto err;
- }
-
- if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) &&
- !c->chacha20_key_set) {
- struct printbuf buf = PRINTBUF;
- bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
- prt_printf(&buf, "attempting to read encrypted data without encryption key\n ");
- bch2_bkey_val_to_text(&buf, c, k);
-
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- ret = bch_err_throw(c, data_read_no_encryption_key);
- goto err;
- }
-
- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
- BCH_DEV_READ_REF_io_read);
-
- /*
- * Stale dirty pointers are treated as IO errors, but @failed isn't
- * allocated unless we're in the retry path - so if we're not in the
- * retry path, don't check here, it'll be caught in bch2_read_endio()
- * and we'll end up in the retry path:
- */
- if ((flags & BCH_READ_in_retry) &&
- !pick.ptr.cached &&
- ca &&
- unlikely(dev_ptr_stale(ca, &pick.ptr))) {
- read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
- bch2_mark_io_failure(failed, &pick, false);
- enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read);
- goto retry_pick;
- }
-
- if (likely(!u)) {
- if (!(flags & BCH_READ_last_fragment) ||
- bio_flagged(&orig->bio, BIO_CHAIN))
- flags |= BCH_READ_must_clone;
-
- narrow_crcs = !(flags & BCH_READ_in_retry) &&
- bch2_can_narrow_extent_crcs(k, pick.crc);
-
- if (narrow_crcs && (flags & BCH_READ_user_mapped))
- flags |= BCH_READ_must_bounce;
-
- EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
-
- if (crc_is_compressed(pick.crc) ||
- (pick.crc.csum_type != BCH_CSUM_none &&
- (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
- (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
- (flags & BCH_READ_user_mapped)) ||
- (flags & BCH_READ_must_bounce)))) {
- read_full = true;
- bounce = true;
- }
- } else {
- /*
- * can happen if we retry, and the extent we were going to read
- * has been merged in the meantime:
- */
- if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
- if (ca)
- enumerated_ref_put(&ca->io_ref[READ],
- BCH_DEV_READ_REF_io_read);
- rbio->ret = bch_err_throw(c, data_read_buffer_too_small);
- goto out_read_done;
- }
-
- iter.bi_size = pick.crc.compressed_size << 9;
- read_full = true;
- }
-
- if (orig->opts.promote_target || have_io_error(failed))
- rbio = promote_alloc(trans, iter, k, &pick, flags, orig,
- &bounce, &read_full, failed);
-
- if (!read_full) {
- EBUG_ON(crc_is_compressed(pick.crc));
- EBUG_ON(pick.crc.csum_type &&
- (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
- bvec_iter_sectors(iter) != pick.crc.live_size ||
- pick.crc.offset ||
- offset_into_extent));
-
- data_pos.offset += offset_into_extent;
- pick.ptr.offset += pick.crc.offset +
- offset_into_extent;
- offset_into_extent = 0;
- pick.crc.compressed_size = bvec_iter_sectors(iter);
- pick.crc.uncompressed_size = bvec_iter_sectors(iter);
- pick.crc.offset = 0;
- pick.crc.live_size = bvec_iter_sectors(iter);
- }
-
- if (rbio) {
- /*
- * promote already allocated bounce rbio:
- * promote needs to allocate a bio big enough for uncompressing
- * data in the write path, but we're not going to use it all
- * here:
- */
- EBUG_ON(rbio->bio.bi_iter.bi_size <
- pick.crc.compressed_size << 9);
- rbio->bio.bi_iter.bi_size =
- pick.crc.compressed_size << 9;
- } else if (bounce) {
- unsigned sectors = pick.crc.compressed_size;
-
- rbio = rbio_init_fragment(bio_alloc_bioset(NULL,
- DIV_ROUND_UP(sectors, PAGE_SECTORS),
- 0,
- GFP_NOFS,
- &c->bio_read_split),
- orig);
-
- bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
- rbio->bounce = true;
- } else if (flags & BCH_READ_must_clone) {
- /*
- * Have to clone if there were any splits, due to error
- * reporting issues (if a split errored, and retrying didn't
- * work, when it reports the error to its parent (us) we don't
- * know if the error was from our bio, and we should retry, or
- * from the whole bio, in which case we don't want to retry and
- * lose the error)
- */
- rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
- &c->bio_read_split),
- orig);
- rbio->bio.bi_iter = iter;
- } else {
- rbio = orig;
- rbio->bio.bi_iter = iter;
- EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
- }
-
- EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
-
- rbio->submit_time = local_clock();
- if (!rbio->split)
- rbio->end_io = orig->bio.bi_end_io;
- rbio->bvec_iter = iter;
- rbio->offset_into_extent= offset_into_extent;
- rbio->flags = flags;
- rbio->have_ioref = ca != NULL;
- rbio->narrow_crcs = narrow_crcs;
- rbio->ret = 0;
- rbio->context = 0;
- rbio->pick = pick;
- rbio->subvol = orig->subvol;
- rbio->read_pos = read_pos;
- rbio->data_btree = data_btree;
- rbio->data_pos = data_pos;
- rbio->version = k.k->bversion;
- INIT_WORK(&rbio->work, NULL);
-
- rbio->bio.bi_opf = orig->bio.bi_opf;
- rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
- rbio->bio.bi_end_io = bch2_read_endio;
-
- async_object_list_add(c, rbio, rbio, &rbio->list_idx);
-
- if (rbio->bounce)
- trace_and_count(c, io_read_bounce, &rbio->bio);
-
- if (!u)
- this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
- else
- this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio));
- bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
-
- /*
- * If it's being moved internally, we don't want to flag it as a cache
- * hit:
- */
- if (ca && pick.ptr.cached && !u)
- bch2_bucket_io_time_reset(trans, pick.ptr.dev,
- PTR_BUCKET_NR(ca, &pick.ptr), READ);
-
- if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) {
- bio_inc_remaining(&orig->bio);
- trace_and_count(c, io_read_split, &orig->bio);
- }
-
- /*
- * Unlock the iterator while the btree node's lock is still in
- * cache, before doing the IO:
- */
- if (!(flags & BCH_READ_in_retry))
- bch2_trans_unlock(trans);
- else
- bch2_trans_unlock_long(trans);
-
- if (likely(!rbio->pick.do_ec_reconstruct)) {
- if (unlikely(!rbio->have_ioref)) {
- bch2_rbio_error(rbio,
- -BCH_ERR_data_read_retry_device_offline,
- BLK_STS_IOERR);
- goto out;
- }
-
- this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
- bio_sectors(&rbio->bio));
- bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
-
- if (unlikely(c->opts.no_data_io)) {
- if (likely(!(flags & BCH_READ_in_retry)))
- bio_endio(&rbio->bio);
- } else {
- if (likely(!(flags & BCH_READ_in_retry)))
- submit_bio(&rbio->bio);
- else
- submit_bio_wait(&rbio->bio);
- }
-
- /*
- * We just submitted IO which may block, we expect relock fail
- * events and shouldn't count them:
- */
- trans->notrace_relock_fail = true;
- } else {
- /* Attempting reconstruct read: */
- if (bch2_ec_read_extent(trans, rbio, k)) {
- bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err,
- BLK_STS_IOERR);
- goto out;
- }
-
- if (likely(!(flags & BCH_READ_in_retry)))
- bio_endio(&rbio->bio);
- }
-out:
- if (likely(!(flags & BCH_READ_in_retry))) {
- return 0;
- } else {
- bch2_trans_unlock(trans);
-
- int ret;
-
- rbio->context = RBIO_CONTEXT_UNBOUND;
- bch2_read_endio(&rbio->bio);
-
- ret = rbio->ret;
- rbio = bch2_rbio_free(rbio);
-
- if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
- bch2_mark_io_failure(failed, &pick,
- ret == -BCH_ERR_data_read_retry_csum_err);
-
- return ret;
- }
-
-err:
- if (flags & BCH_READ_in_retry)
- return ret;
-
- orig->bio.bi_status = BLK_STS_IOERR;
- orig->ret = ret;
- goto out_read_done;
-
-hole:
- this_cpu_add(c->counters[BCH_COUNTER_io_read_hole],
- bvec_iter_sectors(iter));
- /*
- * won't normally happen in the data update (bch2_move_extent()) path,
- * but if we retry and the extent we wanted to read no longer exists we
- * have to signal that:
- */
- if (u)
- orig->ret = bch_err_throw(c, data_read_key_overwritten);
-
- zero_fill_bio_iter(&orig->bio, iter);
-out_read_done:
- if ((flags & BCH_READ_last_fragment) &&
- !(flags & BCH_READ_in_retry))
- bch2_rbio_done(orig);
- return 0;
-}
-
-int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter, subvol_inum inum,
- struct bch_io_failures *failed,
- struct bkey_buf *prev_read,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_buf sk;
- struct bkey_s_c k;
- enum btree_id data_btree;
- int ret;
-
- EBUG_ON(rbio->data_update);
-
- bch2_bkey_buf_init(&sk);
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- POS(inum.inum, bvec_iter.bi_sector),
- BTREE_ITER_slots);
-
- while (1) {
- data_btree = BTREE_ID_extents;
-
- bch2_trans_begin(trans);
-
- u32 snapshot;
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
-
- bch2_btree_iter_set_pos(trans, &iter,
- POS(inum.inum, bvec_iter.bi_sector));
-
- k = bch2_btree_iter_peek_slot(trans, &iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- s64 offset_into_extent = iter.pos.offset -
- bkey_start_offset(k.k);
- unsigned sectors = k.k->size - offset_into_extent;
-
- bch2_bkey_buf_reassemble(&sk, c, k);
-
- ret = bch2_read_indirect_extent(trans, &data_btree,
- &offset_into_extent, &sk);
- if (ret)
- goto err;
-
- k = bkey_i_to_s_c(sk.k);
-
- if (unlikely(flags & BCH_READ_in_retry)) {
- if (!bkey_and_val_eq(k, bkey_i_to_s_c(prev_read->k)))
- failed->nr = 0;
- bch2_bkey_buf_copy(prev_read, c, sk.k);
- }
-
- /*
- * With indirect extents, the amount of data to read is the min
- * of the original extent and the indirect extent:
- */
- sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
-
- unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
- swap(bvec_iter.bi_size, bytes);
-
- if (bvec_iter.bi_size == bytes)
- flags |= BCH_READ_last_fragment;
-
- ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
- data_btree, k,
- offset_into_extent, failed, flags, -1);
- swap(bvec_iter.bi_size, bytes);
-
- if (ret)
- goto err;
-
- if (flags & BCH_READ_last_fragment)
- break;
-
- bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
-err:
- if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace)
- flags |= BCH_READ_must_bounce;
-
- if (ret &&
- !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
- !bch2_err_matches(ret, BCH_ERR_data_read_retry))
- break;
- }
-
- if (unlikely(ret)) {
- if (ret != -BCH_ERR_extent_poisoned) {
- struct printbuf buf = PRINTBUF;
- lockrestart_do(trans,
- bch2_inum_offset_err_msg_trans(trans, &buf, inum,
- bvec_iter.bi_sector << 9));
- prt_printf(&buf, "data read error: %s", bch2_err_str(ret));
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-
- rbio->bio.bi_status = BLK_STS_IOERR;
- rbio->ret = ret;
-
- if (!(flags & BCH_READ_in_retry))
- bch2_rbio_done(rbio);
- }
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_bkey_buf_exit(&sk, c);
- return ret;
-}
-
-static const char * const bch2_read_bio_flags[] = {
-#define x(n) #n,
- BCH_READ_FLAGS()
-#undef x
- NULL
-};
-
-void bch2_read_bio_to_text(struct printbuf *out, struct bch_read_bio *rbio)
-{
- u64 now = local_clock();
- prt_printf(out, "start_time:\t%llu\n", rbio->start_time ? now - rbio->start_time : 0);
- prt_printf(out, "submit_time:\t%llu\n", rbio->submit_time ? now - rbio->submit_time : 0);
-
- if (!rbio->split)
- prt_printf(out, "end_io:\t%ps\n", rbio->end_io);
- else
- prt_printf(out, "parent:\t%px\n", rbio->parent);
-
- prt_printf(out, "bi_end_io:\t%ps\n", rbio->bio.bi_end_io);
-
- prt_printf(out, "promote:\t%u\n", rbio->promote);
- prt_printf(out, "bounce:\t%u\n", rbio->bounce);
- prt_printf(out, "split:\t%u\n", rbio->split);
- prt_printf(out, "have_ioref:\t%u\n", rbio->have_ioref);
- prt_printf(out, "narrow_crcs:\t%u\n", rbio->narrow_crcs);
- prt_printf(out, "context:\t%u\n", rbio->context);
-
- int ret = READ_ONCE(rbio->ret);
- if (ret < 0)
- prt_printf(out, "ret:\t%s\n", bch2_err_str(ret));
- else
- prt_printf(out, "ret:\t%i\n", ret);
-
- prt_printf(out, "flags:\t");
- bch2_prt_bitflags(out, bch2_read_bio_flags, rbio->flags);
- prt_newline(out);
-
- bch2_bio_to_text(out, &rbio->bio);
-}
-
-void bch2_fs_io_read_exit(struct bch_fs *c)
-{
- if (c->promote_table.tbl)
- rhashtable_destroy(&c->promote_table);
- bioset_exit(&c->bio_read_split);
- bioset_exit(&c->bio_read);
- mempool_exit(&c->bio_bounce_pages);
-}
-
-int bch2_fs_io_read_init(struct bch_fs *c)
-{
- if (mempool_init_page_pool(&c->bio_bounce_pages,
- max_t(unsigned,
- c->opts.btree_node_size,
- c->opts.encoded_extent_max) /
- PAGE_SIZE, 0))
- return bch_err_throw(c, ENOMEM_bio_bounce_pages_init);
-
- if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
- BIOSET_NEED_BVECS))
- return bch_err_throw(c, ENOMEM_bio_read_init);
-
- if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
- BIOSET_NEED_BVECS))
- return bch_err_throw(c, ENOMEM_bio_read_split_init);
-
- if (rhashtable_init(&c->promote_table, &bch_promote_params))
- return bch_err_throw(c, ENOMEM_promote_table_init);
-
- return 0;
-}
diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h
deleted file mode 100644
index 9c5ddbf861b3..000000000000
--- a/fs/bcachefs/io_read.h
+++ /dev/null
@@ -1,216 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IO_READ_H
-#define _BCACHEFS_IO_READ_H
-
-#include "bkey_buf.h"
-#include "btree_iter.h"
-#include "extents_types.h"
-#include "reflink.h"
-
-struct bch_read_bio {
- struct bch_fs *c;
- u64 start_time;
- u64 submit_time;
-
- /*
- * Reads will often have to be split, and if the extent being read from
- * was checksummed or compressed we'll also have to allocate bounce
- * buffers and copy the data back into the original bio.
- *
- * If we didn't have to split, we have to save and restore the original
- * bi_end_io - @split below indicates which:
- */
- union {
- struct bch_read_bio *parent;
- bio_end_io_t *end_io;
- };
-
- /*
- * Saved copy of bio->bi_iter, from submission time - allows us to
- * resubmit on IO error, and also to copy data back to the original bio
- * when we're bouncing:
- */
- struct bvec_iter bvec_iter;
-
- unsigned offset_into_extent;
-
- u16 flags;
- union {
- struct {
- u16 data_update:1,
- promote:1,
- bounce:1,
- split:1,
- have_ioref:1,
- narrow_crcs:1,
- saw_error:1,
- self_healing:1,
- context:2;
- };
- u16 _state;
- };
- s16 ret;
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
- unsigned list_idx;
-#endif
-
- struct extent_ptr_decoded pick;
-
- /*
- * pos we read from - different from data_pos for indirect extents:
- */
- u32 subvol;
- struct bpos read_pos;
-
- /*
- * start pos of data we read (may not be pos of data we want) - for
- * promote, narrow extents paths:
- */
- enum btree_id data_btree;
- struct bpos data_pos;
- struct bversion version;
-
- struct bch_io_opts opts;
-
- struct work_struct work;
-
- struct bio bio;
-};
-
-#define to_rbio(_bio) container_of((_bio), struct bch_read_bio, bio)
-
-struct bch_devs_mask;
-struct cache_promote_op;
-struct extent_ptr_decoded;
-
-static inline int bch2_read_indirect_extent(struct btree_trans *trans,
- enum btree_id *data_btree,
- s64 *offset_into_extent,
- struct bkey_buf *extent)
-{
- if (extent->k->k.type != KEY_TYPE_reflink_p)
- return 0;
-
- *data_btree = BTREE_ID_reflink;
-
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter,
- offset_into_extent,
- bkey_i_to_s_c_reflink_p(extent->k),
- true, 0);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (bkey_deleted(k.k)) {
- bch2_trans_iter_exit(trans, &iter);
- return bch_err_throw(c, missing_indirect_extent);
- }
-
- bch2_bkey_buf_reassemble(extent, c, k);
- bch2_trans_iter_exit(trans, &iter);
- return 0;
-}
-
-#define BCH_READ_FLAGS() \
- x(retry_if_stale) \
- x(may_promote) \
- x(user_mapped) \
- x(last_fragment) \
- x(must_bounce) \
- x(must_clone) \
- x(in_retry)
-
-enum __bch_read_flags {
-#define x(n) __BCH_READ_##n,
- BCH_READ_FLAGS()
-#undef x
-};
-
-enum bch_read_flags {
-#define x(n) BCH_READ_##n = BIT(__BCH_READ_##n),
- BCH_READ_FLAGS()
-#undef x
-};
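The two enums above are generated by the BCH_READ_FLAGS() x-macro; expanded by hand they come out roughly as the following sketch (approximate preprocessor output, not literal source):

	enum __bch_read_flags {
		__BCH_READ_retry_if_stale,	/* 0 */
		__BCH_READ_may_promote,		/* 1 */
		__BCH_READ_user_mapped,		/* 2 */
		__BCH_READ_last_fragment,	/* 3 */
		__BCH_READ_must_bounce,		/* 4 */
		__BCH_READ_must_clone,		/* 5 */
		__BCH_READ_in_retry,		/* 6 */
	};

	enum bch_read_flags {
		BCH_READ_retry_if_stale	= BIT(__BCH_READ_retry_if_stale),
		BCH_READ_may_promote	= BIT(__BCH_READ_may_promote),
		/* ... one BIT() constant per flag above ... */
		BCH_READ_in_retry	= BIT(__BCH_READ_in_retry),
	};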
-
-int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *,
- struct bvec_iter, struct bpos, enum btree_id,
- struct bkey_s_c, unsigned,
- struct bch_io_failures *, unsigned, int);
-
-static inline void bch2_read_extent(struct btree_trans *trans,
- struct bch_read_bio *rbio, struct bpos read_pos,
- enum btree_id data_btree, struct bkey_s_c k,
- unsigned offset_into_extent, unsigned flags)
-{
- int ret = __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
- data_btree, k, offset_into_extent, NULL, flags, -1);
- /* __bch2_read_extent only returns errors if BCH_READ_in_retry is set */
- WARN(ret, "unhandled error from __bch2_read_extent()");
-}
-
-int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter,
- subvol_inum,
- struct bch_io_failures *, struct bkey_buf *, unsigned flags);
-
-static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
- subvol_inum inum)
-{
- BUG_ON(rbio->_state);
-
- rbio->subvol = inum.subvol;
-
- bch2_trans_run(c,
- __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, NULL,
- BCH_READ_retry_if_stale|
- BCH_READ_may_promote|
- BCH_READ_user_mapped));
-}
-
-static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio,
- struct bch_read_bio *orig)
-{
- struct bch_read_bio *rbio = to_rbio(bio);
-
- rbio->c = orig->c;
- rbio->_state = 0;
- rbio->flags = 0;
- rbio->ret = 0;
- rbio->split = true;
- rbio->parent = orig;
- rbio->opts = orig->opts;
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
- rbio->list_idx = 0;
-#endif
- return rbio;
-}
-
-static inline struct bch_read_bio *rbio_init(struct bio *bio,
- struct bch_fs *c,
- struct bch_io_opts opts,
- bio_end_io_t end_io)
-{
- struct bch_read_bio *rbio = to_rbio(bio);
-
- rbio->start_time = local_clock();
- rbio->c = c;
- rbio->_state = 0;
- rbio->flags = 0;
- rbio->ret = 0;
- rbio->opts = opts;
- rbio->bio.bi_end_io = end_io;
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
- rbio->list_idx = 0;
-#endif
- return rbio;
-}
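A minimal sketch of how a caller might combine rbio_init() and bch2_read() from this header. Only those two helpers are taken from the declarations above; the example_read() wrapper, and the assumption that the bio was allocated from c->bio_read with its pages and bi_iter already set up, are illustrative:

	/*
	 * Illustrative sketch only: read a file range into a bio that embeds
	 * a struct bch_read_bio (i.e. one allocated from c->bio_read) and
	 * whose data pages and bi_iter were set up by the caller.
	 */
	static void example_read(struct bch_fs *c, struct bio *bio,
				 subvol_inum inum, struct bch_io_opts opts,
				 bio_end_io_t *end_io)
	{
		struct bch_read_bio *rbio = rbio_init(bio, c, opts, end_io);

		/* bi_iter.bi_sector is the file offset being read, in sectors */
		bch2_read(c, rbio, inum);
	}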
-
-struct promote_op;
-void bch2_promote_op_to_text(struct printbuf *, struct promote_op *);
-void bch2_read_bio_to_text(struct printbuf *, struct bch_read_bio *);
-
-void bch2_fs_io_read_exit(struct bch_fs *);
-int bch2_fs_io_read_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_IO_READ_H */
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
deleted file mode 100644
index 88b1eec8eff3..000000000000
--- a/fs/bcachefs/io_write.c
+++ /dev/null
@@ -1,1780 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "async_objs.h"
-#include "bkey_buf.h"
-#include "bset.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "clock.h"
-#include "compress.h"
-#include "debug.h"
-#include "ec.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "extent_update.h"
-#include "inode.h"
-#include "io_write.h"
-#include "journal.h"
-#include "keylist.h"
-#include "move.h"
-#include "nocow_locking.h"
-#include "rebalance.h"
-#include "subvolume.h"
-#include "super.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/blkdev.h>
-#include <linux/prefetch.h>
-#include <linux/random.h>
-#include <linux/sched/mm.h>
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-static unsigned bch2_write_corrupt_ratio;
-module_param_named(write_corrupt_ratio, bch2_write_corrupt_ratio, uint, 0644);
-MODULE_PARM_DESC(write_corrupt_ratio, "");
-#endif
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-
-static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
- u64 now, int rw)
-{
- u64 latency_capable =
- ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
- /* ideally we'd be taking into account the device's variance here: */
- u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
- s64 latency_over = io_latency - latency_threshold;
-
- if (latency_threshold && latency_over > 0) {
- /*
- * bump up congested by approximately latency_over * 4 /
- * latency_threshold - we don't need much accuracy here so don't
- * bother with the divide:
- */
- if (atomic_read(&ca->congested) < CONGESTED_MAX)
- atomic_add(latency_over >>
- max_t(int, ilog2(latency_threshold) - 2, 0),
- &ca->congested);
-
- ca->congested_last = now;
- } else if (atomic_read(&ca->congested) > 0) {
- atomic_dec(&ca->congested);
- }
-}
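To make the thresholding above concrete: for reads, latency_threshold is four times the device's "capable" latency, so with a capable latency of roughly 2ms an IO that completes in 12ms has latency_over of about 4ms and bumps the congested counter by roughly latency_over * 4 / latency_threshold, i.e. about 2. The shift-based form only approximates the divide, as the comment notes, so the actual increment can differ by up to a factor of two.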
-
-void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
-{
- atomic64_t *latency = &ca->cur_latency[rw];
- u64 now = local_clock();
- u64 io_latency = time_after64(now, submit_time)
- ? now - submit_time
- : 0;
- u64 old, new;
-
- old = atomic64_read(latency);
- do {
- /*
- * If the io latency was reasonably close to the current
- * latency, skip doing the update and atomic operation - most of
- * the time:
- */
- if (abs((int) (old - io_latency)) < (old >> 1) &&
- now & ~(~0U << 5))
- break;
-
- new = ewma_add(old, io_latency, 5);
- } while (!atomic64_try_cmpxchg(latency, &old, new));
-
- bch2_congested_acct(ca, io_latency, now, rw);
-
- __bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now);
-}
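Two details of the cmpxchg loop above are easy to miss: the update is skipped when the new sample is within 50% of the current average and the low five bits of the clock are non-zero, which holds roughly 31 times out of 32, so only about one in 32 "close" samples pays for the atomic; and ewma_add(old, io_latency, 5) folds the sample into an exponentially weighted moving average with a weight parameter of 5 (the helper itself is defined elsewhere in the tree and is not shown in this diff).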
-
-#endif
-
-/* Allocate, free from mempool: */
-
-void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
-{
- struct bvec_iter_all iter;
- struct bio_vec *bv;
-
- bio_for_each_segment_all(bv, bio, iter)
- if (bv->bv_page != ZERO_PAGE(0))
- mempool_free(bv->bv_page, &c->bio_bounce_pages);
- bio->bi_vcnt = 0;
-}
-
-static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
-{
- struct page *page;
-
- if (likely(!*using_mempool)) {
- page = alloc_page(GFP_NOFS);
- if (unlikely(!page)) {
- mutex_lock(&c->bio_bounce_pages_lock);
- *using_mempool = true;
- goto pool_alloc;
-
- }
- } else {
-pool_alloc:
- page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS);
- }
-
- return page;
-}
-
-void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
- size_t size)
-{
- bool using_mempool = false;
-
- while (size) {
- struct page *page = __bio_alloc_page_pool(c, &using_mempool);
- unsigned len = min_t(size_t, PAGE_SIZE, size);
-
- BUG_ON(!bio_add_page(bio, page, len, 0));
- size -= len;
- }
-
- if (using_mempool)
- mutex_unlock(&c->bio_bounce_pages_lock);
-}
-
-/* Extent update path: */
-
-int bch2_sum_sector_overwrites(struct btree_trans *trans,
- struct btree_iter *extent_iter,
- struct bkey_i *new,
- bool *usage_increasing,
- s64 *i_sectors_delta,
- s64 *disk_sectors_delta)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c old;
- unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
- bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
- int ret = 0;
-
- *usage_increasing = false;
- *i_sectors_delta = 0;
- *disk_sectors_delta = 0;
-
- bch2_trans_copy_iter(trans, &iter, extent_iter);
-
- for_each_btree_key_max_continue_norestart(trans, iter,
- new->k.p, BTREE_ITER_slots, old, ret) {
- s64 sectors = min(new->k.p.offset, old.k->p.offset) -
- max(bkey_start_offset(&new->k),
- bkey_start_offset(old.k));
-
- *i_sectors_delta += sectors *
- (bkey_extent_is_allocation(&new->k) -
- bkey_extent_is_allocation(old.k));
-
- *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new));
- *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot
- ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old)
- : 0;
-
- if (!*usage_increasing &&
- (new->k.p.snapshot != old.k->p.snapshot ||
- new_replicas > bch2_bkey_replicas(c, old) ||
- (!new_compressed && bch2_bkey_sectors_compressed(old))))
- *usage_increasing = true;
-
- if (bkey_ge(old.k->p, new->k.p))
- break;
- }
-
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
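A worked example of the accounting above, with made-up numbers: if the new key is an 8-sector allocated extent carrying two pointers and it fully overlaps an empty slot, i_sectors_delta comes out to 8 * (1 - 0) = 8, disk_sectors_delta to 8 * 2 - 0 = 16, and usage_increasing is set because the new key has more replicas than the hole. If instead the old key were an uncompressed, fully allocated two-pointer extent in the same snapshot, both deltas would net to zero and usage_increasing would stay false.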
-
-static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
- struct btree_iter *extent_iter,
- u64 new_i_size,
- s64 i_sectors_delta)
-{
- /*
- * Crazy performance optimization:
- * Every extent update needs to also update the inode: the inode trigger
- * will set bi->journal_seq to the journal sequence number of this
- * transaction - for fsync.
- *
- * But if that's the only reason we're updating the inode (we're not
- * updating bi_size or bi_sectors), then we don't need the inode update
- * to be journalled - if we crash, the bi_journal_seq update will be
- * lost, but that's fine.
- */
- unsigned inode_update_flags = BTREE_UPDATE_nojournal;
-
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0,
- extent_iter->pos.inode,
- extent_iter->snapshot),
- BTREE_ITER_intent|
- BTREE_ITER_cached);
- int ret = bkey_err(k);
- if (unlikely(ret))
- return ret;
-
- /*
- * varint_decode_fast(), in the inode .invalid method, reads up to 7
- * bytes past the end of the buffer:
- */
- struct bkey_i *k_mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + 8);
- ret = PTR_ERR_OR_ZERO(k_mut);
- if (unlikely(ret))
- goto err;
-
- bkey_reassemble(k_mut, k);
-
- if (unlikely(k_mut->k.type != KEY_TYPE_inode_v3)) {
- k_mut = bch2_inode_to_v3(trans, k_mut);
- ret = PTR_ERR_OR_ZERO(k_mut);
- if (unlikely(ret))
- goto err;
- }
-
- struct bkey_i_inode_v3 *inode = bkey_i_to_inode_v3(k_mut);
-
- if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) &&
- new_i_size > le64_to_cpu(inode->v.bi_size)) {
- inode->v.bi_size = cpu_to_le64(new_i_size);
- inode_update_flags = 0;
- }
-
- if (i_sectors_delta) {
- s64 bi_sectors = le64_to_cpu(inode->v.bi_sectors);
- if (unlikely(bi_sectors + i_sectors_delta < 0)) {
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
- prt_printf(&buf, "inode %llu i_sectors underflow: %lli + %lli < 0",
- extent_iter->pos.inode, bi_sectors, i_sectors_delta);
-
- bool print = bch2_count_fsck_err(c, inode_i_sectors_underflow, &buf);
- if (print)
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
-
- if (i_sectors_delta < 0)
- i_sectors_delta = -bi_sectors;
- else
- i_sectors_delta = 0;
- }
-
- le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
- inode_update_flags = 0;
- }
-
- /*
-	 * extent, dirent and xattr updates require that an inode update also
- * happens - to ensure that if a key exists in one of those btrees with
- * a given snapshot ID an inode is also present - so we may have to skip
- * the nojournal optimization:
- */
- if (inode->k.p.snapshot != iter.snapshot) {
- inode->k.p.snapshot = iter.snapshot;
- inode_update_flags = 0;
- }
-
- ret = bch2_trans_update(trans, &iter, &inode->k_i,
- BTREE_UPDATE_internal_snapshot_node|
- inode_update_flags);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_extent_update(struct btree_trans *trans,
- subvol_inum inum,
- struct btree_iter *iter,
- struct bkey_i *k,
- struct disk_reservation *disk_res,
- u64 new_i_size,
- s64 *i_sectors_delta_total,
- bool check_enospc)
-{
- struct bpos next_pos;
- bool usage_increasing;
- s64 i_sectors_delta = 0, disk_sectors_delta = 0;
- int ret;
-
- /*
-	 * This traverses the iterator for us without changing iter->path->pos to
- * search_key() (which is pos + 1 for extents): we want there to be a
- * path already traversed at iter->pos because
- * bch2_trans_extent_update() will use it to attempt extent merging
- */
- ret = __bch2_btree_iter_traverse(trans, iter);
- if (ret)
- return ret;
-
- ret = bch2_extent_trim_atomic(trans, iter, k);
- if (ret)
- return ret;
-
- next_pos = k->k.p;
-
- ret = bch2_sum_sector_overwrites(trans, iter, k,
- &usage_increasing,
- &i_sectors_delta,
- &disk_sectors_delta);
- if (ret)
- return ret;
-
- if (disk_res &&
- disk_sectors_delta > (s64) disk_res->sectors) {
- ret = bch2_disk_reservation_add(trans->c, disk_res,
- disk_sectors_delta - disk_res->sectors,
- !check_enospc || !usage_increasing
- ? BCH_DISK_RESERVATION_NOFAIL : 0);
- if (ret)
- return ret;
- }
-
- /*
- * Note:
- * We always have to do an inode update - even when i_size/i_sectors
- * aren't changing - for fsync to work properly; fsync relies on
- * inode->bi_journal_seq which is updated by the trigger code:
- */
- ret = bch2_extent_update_i_size_sectors(trans, iter,
- min(k->k.p.offset << 9, new_i_size),
- i_sectors_delta) ?:
- bch2_trans_update(trans, iter, k, 0) ?:
- bch2_trans_commit(trans, disk_res, NULL,
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc);
- if (unlikely(ret))
- return ret;
-
- if (i_sectors_delta_total)
- *i_sectors_delta_total += i_sectors_delta;
- bch2_btree_iter_set_pos(trans, iter, next_pos);
- return 0;
-}
-
-static int bch2_write_index_default(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct bkey_buf sk;
- struct keylist *keys = &op->insert_keys;
- struct bkey_i *k = bch2_keylist_front(keys);
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- subvol_inum inum = {
- .subvol = op->subvol,
- .inum = k->k.p.inode,
- };
- int ret;
-
- BUG_ON(!inum.subvol);
-
- bch2_bkey_buf_init(&sk);
-
- do {
- bch2_trans_begin(trans);
-
- k = bch2_keylist_front(keys);
- bch2_bkey_buf_copy(&sk, c, k);
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol,
- &sk.k->k.p.snapshot);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- bkey_start_pos(&sk.k->k),
- BTREE_ITER_slots|BTREE_ITER_intent);
-
- ret = bch2_extent_update(trans, inum, &iter, sk.k,
- &op->res,
- op->new_i_size, &op->i_sectors_delta,
- op->flags & BCH_WRITE_check_enospc);
- bch2_trans_iter_exit(trans, &iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
-
- if (bkey_ge(iter.pos, k->k.p))
- bch2_keylist_pop_front(&op->insert_keys);
- else
- bch2_cut_front(iter.pos, k);
- } while (!bch2_keylist_empty(keys));
-
- bch2_trans_put(trans);
- bch2_bkey_buf_exit(&sk, c);
-
- return ret;
-}
-
-/* Writes */
-
-void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, ...)
-{
- struct printbuf buf = PRINTBUF;
-
- if (op->subvol) {
- bch2_inum_offset_err_msg(op->c, &buf,
- (subvol_inum) { op->subvol, op->pos.inode, },
- offset << 9);
- } else {
- struct bpos pos = op->pos;
- pos.offset = offset;
- bch2_inum_snap_offset_err_msg(op->c, &buf, pos);
- }
-
- prt_str(&buf, "write error: ");
-
- va_list args;
- va_start(args, fmt);
- prt_vprintf(&buf, fmt, args);
- va_end(args);
-
- if (op->flags & BCH_WRITE_move) {
- struct data_update *u = container_of(op, struct data_update, op);
-
- prt_printf(&buf, "\n from internal move ");
- bch2_bkey_val_to_text(&buf, op->c, bkey_i_to_s_c(u->k.k));
- }
-
- bch_err_ratelimited(op->c, "%s", buf.buf);
- printbuf_exit(&buf);
-}
-
-void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
- enum bch_data_type type,
- const struct bkey_i *k,
- bool nocow)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
- struct bch_write_bio *n;
- unsigned ref_rw = type == BCH_DATA_btree ? READ : WRITE;
- unsigned ref_idx = type == BCH_DATA_btree
- ? BCH_DEV_READ_REF_btree_node_write
- : BCH_DEV_WRITE_REF_io_write;
-
- BUG_ON(c->opts.nochanges);
-
- const struct bch_extent_ptr *last = NULL;
- bkey_for_each_ptr(ptrs, ptr)
- last = ptr;
-
- bkey_for_each_ptr(ptrs, ptr) {
- /*
- * XXX: btree writes should be using io_ref[WRITE], but we
- * aren't retrying failed btree writes yet (due to device
- * removal/ro):
- */
- struct bch_dev *ca = nocow
- ? bch2_dev_have_ref(c, ptr->dev)
- : bch2_dev_get_ioref(c, ptr->dev, ref_rw, ref_idx);
-
- if (ptr != last) {
- n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set));
-
- n->bio.bi_end_io = wbio->bio.bi_end_io;
- n->bio.bi_private = wbio->bio.bi_private;
- n->parent = wbio;
- n->split = true;
- n->bounce = false;
- n->put_bio = true;
- n->bio.bi_opf = wbio->bio.bi_opf;
- bio_inc_remaining(&wbio->bio);
- } else {
- n = wbio;
- n->split = false;
- }
-
- n->c = c;
- n->dev = ptr->dev;
- n->have_ioref = ca != NULL;
- n->nocow = nocow;
- n->submit_time = local_clock();
- n->inode_offset = bkey_start_offset(&k->k);
- if (nocow)
- n->nocow_bucket = PTR_BUCKET_NR(ca, ptr);
- n->bio.bi_iter.bi_sector = ptr->offset;
-
- if (likely(n->have_ioref)) {
- this_cpu_add(ca->io_done->sectors[WRITE][type],
- bio_sectors(&n->bio));
-
- bio_set_dev(&n->bio, ca->disk_sb.bdev);
-
- if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) {
- bio_endio(&n->bio);
- continue;
- }
-
- submit_bio(&n->bio);
- } else {
- n->bio.bi_status = BLK_STS_REMOVED;
- bio_endio(&n->bio);
- }
- }
-}
-
-static void __bch2_write(struct bch_write_op *);
-
-static void bch2_write_done(struct closure *cl)
-{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bch_fs *c = op->c;
-
- EBUG_ON(op->open_buckets.nr);
-
- bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
- bch2_disk_reservation_put(c, &op->res);
-
- if (!(op->flags & BCH_WRITE_move))
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_write);
- bch2_keylist_free(&op->insert_keys, op->inline_keys);
-
- EBUG_ON(cl->parent);
- closure_debug_destroy(cl);
- async_object_list_del(c, write_op, op->list_idx);
- if (op->end_io)
- op->end_io(op);
-}
-
-static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct keylist *keys = &op->insert_keys;
- struct bkey_i *src, *dst = keys->keys, *n;
-
- for (src = keys->keys; src != keys->top; src = n) {
- n = bkey_next(src);
-
- if (bkey_extent_is_direct_data(&src->k)) {
- bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
- test_bit(ptr->dev, op->failed.d));
-
- if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
- return bch_err_throw(c, data_write_io);
- }
-
- if (dst != src)
- memmove_u64s_down(dst, src, src->k.u64s);
- dst = bkey_next(dst);
- }
-
- keys->top = dst;
- return 0;
-}
-
-/**
- * __bch2_write_index - after a write, update index to point to new data
- * @op: bch_write_op to process
- */
-static void __bch2_write_index(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct keylist *keys = &op->insert_keys;
- unsigned dev;
- int ret = 0;
-
- if (unlikely(op->flags & BCH_WRITE_io_error)) {
- ret = bch2_write_drop_io_error_ptrs(op);
- if (ret)
- goto err;
- }
-
- if (!bch2_keylist_empty(keys)) {
- u64 sectors_start = keylist_sectors(keys);
-
- ret = !(op->flags & BCH_WRITE_move)
- ? bch2_write_index_default(op)
- : bch2_data_update_index_update(op);
-
- BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
- BUG_ON(keylist_sectors(keys) && !ret);
-
- op->written += sectors_start - keylist_sectors(keys);
-
- if (unlikely(ret && !bch2_err_matches(ret, EROFS))) {
- struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
-
- bch2_write_op_error(op, bkey_start_offset(&insert->k),
- "btree update error: %s", bch2_err_str(ret));
- }
-
- if (ret)
- goto err;
- }
-out:
-	/* If a bucket wasn't written, we can't erasure code it: */
- for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
- bch2_open_bucket_write_error(c, &op->open_buckets, dev, -BCH_ERR_data_write_io);
-
- bch2_open_buckets_put(c, &op->open_buckets);
- return;
-err:
- keys->top = keys->keys;
- op->error = ret;
- op->flags |= BCH_WRITE_submitted;
- goto out;
-}
-
-static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
-{
- if (state != wp->state) {
- struct task_struct *p = current;
- u64 now = ktime_get_ns();
- u64 runtime = p->se.sum_exec_runtime +
- (now - p->se.exec_start);
-
- if (state == WRITE_POINT_runnable)
- wp->last_runtime = runtime;
- else if (wp->state == WRITE_POINT_runnable)
- wp->time[WRITE_POINT_running] += runtime - wp->last_runtime;
-
- if (wp->last_state_change &&
- time_after64(now, wp->last_state_change))
- wp->time[wp->state] += now - wp->last_state_change;
- wp->state = state;
- wp->last_state_change = now;
- }
-}
-
-static inline void wp_update_state(struct write_point *wp, bool running)
-{
- enum write_point_state state;
-
- state = running ? WRITE_POINT_runnable:
- !list_empty(&wp->writes) ? WRITE_POINT_waiting_io
- : WRITE_POINT_stopped;
-
- __wp_update_state(wp, state);
-}
-
-static CLOSURE_CALLBACK(bch2_write_index)
-{
- closure_type(op, struct bch_write_op, cl);
- struct write_point *wp = op->wp;
- struct workqueue_struct *wq = index_update_wq(op);
- unsigned long flags;
-
- if ((op->flags & BCH_WRITE_submitted) &&
- (op->flags & BCH_WRITE_move))
- bch2_bio_free_pages_pool(op->c, &op->wbio.bio);
-
- spin_lock_irqsave(&wp->writes_lock, flags);
- if (wp->state == WRITE_POINT_waiting_io)
- __wp_update_state(wp, WRITE_POINT_waiting_work);
- list_add_tail(&op->wp_list, &wp->writes);
- spin_unlock_irqrestore (&wp->writes_lock, flags);
-
- queue_work(wq, &wp->index_update_work);
-}
-
-static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp)
-{
- op->wp = wp;
-
- if (wp->state == WRITE_POINT_stopped) {
- spin_lock_irq(&wp->writes_lock);
- __wp_update_state(wp, WRITE_POINT_waiting_io);
- spin_unlock_irq(&wp->writes_lock);
- }
-}
-
-void bch2_write_point_do_index_updates(struct work_struct *work)
-{
- struct write_point *wp =
- container_of(work, struct write_point, index_update_work);
- struct bch_write_op *op;
-
- while (1) {
- spin_lock_irq(&wp->writes_lock);
- op = list_pop_entry(&wp->writes, struct bch_write_op, wp_list);
- wp_update_state(wp, op != NULL);
- spin_unlock_irq(&wp->writes_lock);
-
- if (!op)
- break;
-
- op->flags |= BCH_WRITE_in_worker;
-
- __bch2_write_index(op);
-
- if (!(op->flags & BCH_WRITE_submitted))
- __bch2_write(op);
- else
- bch2_write_done(&op->cl);
- }
-}
-
-static void bch2_write_endio(struct bio *bio)
-{
- struct closure *cl = bio->bi_private;
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bch_write_bio *wbio = to_wbio(bio);
- struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
- struct bch_fs *c = wbio->c;
- struct bch_dev *ca = wbio->have_ioref
- ? bch2_dev_have_ref(c, wbio->dev)
- : NULL;
-
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
- wbio->submit_time, !bio->bi_status);
-
- if (unlikely(bio->bi_status)) {
- if (ca)
- bch_err_inum_offset_ratelimited(ca,
- op->pos.inode,
- wbio->inode_offset << 9,
- "data write error: %s",
- bch2_blk_status_to_str(bio->bi_status));
- else
- bch_err_inum_offset_ratelimited(c,
- op->pos.inode,
- wbio->inode_offset << 9,
- "data write error: %s",
- bch2_blk_status_to_str(bio->bi_status));
- set_bit(wbio->dev, op->failed.d);
- op->flags |= BCH_WRITE_io_error;
- }
-
- if (wbio->nocow) {
- bch2_bucket_nocow_unlock(&c->nocow_locks,
- POS(ca->dev_idx, wbio->nocow_bucket),
- BUCKET_NOCOW_LOCK_UPDATE);
- set_bit(wbio->dev, op->devs_need_flush->d);
- }
-
- if (wbio->have_ioref)
- enumerated_ref_put(&ca->io_ref[WRITE],
- BCH_DEV_WRITE_REF_io_write);
-
- if (wbio->bounce)
- bch2_bio_free_pages_pool(c, bio);
-
- if (wbio->put_bio)
- bio_put(bio);
-
- if (parent)
- bio_endio(&parent->bio);
- else
- closure_put(cl);
-}
-
-static void init_append_extent(struct bch_write_op *op,
- struct write_point *wp,
- struct bversion version,
- struct bch_extent_crc_unpacked crc)
-{
- struct bkey_i_extent *e;
-
- op->pos.offset += crc.uncompressed_size;
-
- e = bkey_extent_init(op->insert_keys.top);
- e->k.p = op->pos;
- e->k.size = crc.uncompressed_size;
- e->k.bversion = version;
-
- if (crc.csum_type ||
- crc.compression_type ||
- crc.nonce)
- bch2_extent_crc_append(&e->k_i, crc);
-
- bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
- op->flags & BCH_WRITE_cached);
-
- if (!(op->flags & BCH_WRITE_move))
- bch2_bkey_set_needs_rebalance(op->c, &op->opts, &e->k_i);
-
- bch2_keylist_push(&op->insert_keys);
-}
-
-static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
- struct write_point *wp,
- struct bio *src,
- bool *page_alloc_failed,
- void *buf)
-{
- struct bch_write_bio *wbio;
- struct bio *bio;
- unsigned output_available =
- min(wp->sectors_free << 9, src->bi_iter.bi_size);
- unsigned pages = DIV_ROUND_UP(output_available +
- (buf
- ? ((unsigned long) buf & (PAGE_SIZE - 1))
- : 0), PAGE_SIZE);
-
- pages = min(pages, BIO_MAX_VECS);
-
- bio = bio_alloc_bioset(NULL, pages, 0,
- GFP_NOFS, &c->bio_write);
- wbio = wbio_init(bio);
- wbio->put_bio = true;
- /* copy WRITE_SYNC flag */
- wbio->bio.bi_opf = src->bi_opf;
-
- if (buf) {
- bch2_bio_map(bio, buf, output_available);
- return bio;
- }
-
- wbio->bounce = true;
-
- /*
- * We can't use mempool for more than c->sb.encoded_extent_max
- * worth of pages, but we'd like to allocate more if we can:
- */
- bch2_bio_alloc_pages_pool(c, bio,
- min_t(unsigned, output_available,
- c->opts.encoded_extent_max));
-
- if (bio->bi_iter.bi_size < output_available)
- *page_alloc_failed =
- bch2_bio_alloc_pages(bio,
- output_available -
- bio->bi_iter.bi_size,
- GFP_NOFS) != 0;
-
- return bio;
-}
-
-static int bch2_write_rechecksum(struct bch_fs *c,
- struct bch_write_op *op,
- unsigned new_csum_type)
-{
- struct bio *bio = &op->wbio.bio;
- struct bch_extent_crc_unpacked new_crc;
-
- /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
-
- if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
- bch2_csum_type_is_encryption(new_csum_type))
- new_csum_type = op->crc.csum_type;
-
- int ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
- NULL, &new_crc,
- op->crc.offset, op->crc.live_size,
- new_csum_type);
- if (ret)
- return ret;
-
- bio_advance(bio, op->crc.offset << 9);
- bio->bi_iter.bi_size = op->crc.live_size << 9;
- op->crc = new_crc;
- return 0;
-}
-
-static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
-{
- struct bch_fs *c = op->c;
- struct bio *bio = &op->wbio.bio;
- struct bch_csum csum;
- int ret = 0;
-
- BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
-
- /* Can we just write the entire extent as is? */
- if (op->crc.uncompressed_size == op->crc.live_size &&
- op->crc.uncompressed_size <= c->opts.encoded_extent_max >> 9 &&
- op->crc.compressed_size <= wp->sectors_free &&
- (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
- op->incompressible)) {
- if (!crc_is_compressed(op->crc) &&
- op->csum_type != op->crc.csum_type) {
- ret = bch2_write_rechecksum(c, op, op->csum_type);
- if (ret)
- return ret;
- }
-
- return 1;
- }
-
- /*
- * If the data is compressed and we couldn't write the entire extent as
- * is, we have to decompress it:
- */
- if (crc_is_compressed(op->crc)) {
- /* Last point we can still verify checksum: */
- struct nonce nonce = extent_nonce(op->version, op->crc);
- csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio);
- if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
- goto csum_err;
-
- if (bch2_csum_type_is_encryption(op->crc.csum_type)) {
- ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio);
- if (ret)
- return ret;
-
- op->crc.csum_type = 0;
- op->crc.csum = (struct bch_csum) { 0, 0 };
- }
-
- ret = bch2_bio_uncompress_inplace(op, bio);
- if (ret)
- return ret;
- }
-
- /*
- * No longer have compressed data after this point - data might be
- * encrypted:
- */
-
- /*
- * If the data is checksummed and we're only writing a subset,
- * rechecksum and adjust bio to point to currently live data:
- */
- if (op->crc.live_size != op->crc.uncompressed_size ||
- op->crc.csum_type != op->csum_type) {
- ret = bch2_write_rechecksum(c, op, op->csum_type);
- if (ret)
- return ret;
- }
-
- /*
- * If we want to compress the data, it has to be decrypted:
- */
- if (bch2_csum_type_is_encryption(op->crc.csum_type) &&
- (op->compression_opt || op->crc.csum_type != op->csum_type)) {
- struct nonce nonce = extent_nonce(op->version, op->crc);
- csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio);
- if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
- goto csum_err;
-
- ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio);
- if (ret)
- return ret;
-
- op->crc.csum_type = 0;
- op->crc.csum = (struct bch_csum) { 0, 0 };
- }
-
- return 0;
-csum_err:
- bch2_write_op_error(op, op->pos.offset,
- "error verifying existing checksum while moving existing data (memory corruption?)\n"
- " expected %0llx:%0llx got %0llx:%0llx type %s",
- op->crc.csum.hi,
- op->crc.csum.lo,
- csum.hi,
- csum.lo,
- op->crc.csum_type < BCH_CSUM_NR
- ? __bch2_csum_types[op->crc.csum_type]
- : "(unknown)");
- return bch_err_throw(c, data_write_csum);
-}
-
-static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
- struct bio **_dst)
-{
- struct bch_fs *c = op->c;
- struct bio *src = &op->wbio.bio, *dst = src;
- struct bvec_iter saved_iter;
- void *ec_buf;
- unsigned total_output = 0, total_input = 0;
- bool bounce = false;
- bool page_alloc_failed = false;
- int ret, more = 0;
-
- if (op->incompressible)
- op->compression_opt = 0;
-
- BUG_ON(!bio_sectors(src));
-
- ec_buf = bch2_writepoint_ec_buf(c, wp);
-
- if (unlikely(op->flags & BCH_WRITE_data_encoded)) {
- ret = bch2_write_prep_encoded_data(op, wp);
- if (ret < 0)
- goto err;
- if (ret) {
- if (ec_buf) {
- dst = bch2_write_bio_alloc(c, wp, src,
- &page_alloc_failed,
- ec_buf);
- bio_copy_data(dst, src);
- bounce = true;
- }
- init_append_extent(op, wp, op->version, op->crc);
- goto do_write;
- }
- }
-
- if (ec_buf ||
- op->compression_opt ||
- (op->csum_type &&
- !(op->flags & BCH_WRITE_pages_stable)) ||
- (bch2_csum_type_is_encryption(op->csum_type) &&
- !(op->flags & BCH_WRITE_pages_owned))) {
- dst = bch2_write_bio_alloc(c, wp, src,
- &page_alloc_failed,
- ec_buf);
- bounce = true;
- }
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- unsigned write_corrupt_ratio = READ_ONCE(bch2_write_corrupt_ratio);
- if (!bounce && write_corrupt_ratio) {
- dst = bch2_write_bio_alloc(c, wp, src,
- &page_alloc_failed,
- ec_buf);
- bounce = true;
- }
-#endif
- saved_iter = dst->bi_iter;
-
- do {
- struct bch_extent_crc_unpacked crc = { 0 };
- struct bversion version = op->version;
- size_t dst_len = 0, src_len = 0;
-
- if (page_alloc_failed &&
- dst->bi_iter.bi_size < (wp->sectors_free << 9) &&
- dst->bi_iter.bi_size < c->opts.encoded_extent_max)
- break;
-
- BUG_ON(op->compression_opt &&
- (op->flags & BCH_WRITE_data_encoded) &&
- bch2_csum_type_is_encryption(op->crc.csum_type));
- BUG_ON(op->compression_opt && !bounce);
-
- crc.compression_type = op->incompressible
- ? BCH_COMPRESSION_TYPE_incompressible
- : op->compression_opt
- ? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
- op->compression_opt)
- : 0;
- if (!crc_is_compressed(crc)) {
- dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
- dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
-
- if (op->csum_type)
- dst_len = min_t(unsigned, dst_len,
- c->opts.encoded_extent_max);
-
- if (bounce) {
- swap(dst->bi_iter.bi_size, dst_len);
- bio_copy_data(dst, src);
- swap(dst->bi_iter.bi_size, dst_len);
- }
-
- src_len = dst_len;
- }
-
- BUG_ON(!src_len || !dst_len);
-
- if (bch2_csum_type_is_encryption(op->csum_type)) {
- if (bversion_zero(version)) {
- version.lo = atomic64_inc_return(&c->key_version);
- } else {
- crc.nonce = op->nonce;
- op->nonce += src_len >> 9;
- }
- }
-
- if ((op->flags & BCH_WRITE_data_encoded) &&
- !crc_is_compressed(crc) &&
- bch2_csum_type_is_encryption(op->crc.csum_type) ==
- bch2_csum_type_is_encryption(op->csum_type)) {
- u8 compression_type = crc.compression_type;
- u16 nonce = crc.nonce;
- /*
- * Note: when we're using rechecksum(), we need to be
- * checksumming @src because it has all the data our
- * existing checksum covers - if we bounced (because we
- * were trying to compress), @dst will only have the
- * part of the data the new checksum will cover.
- *
- * But normally we want to be checksumming post bounce,
- * because part of the reason for bouncing is so the
- * data can't be modified (by userspace) while it's in
- * flight.
- */
- ret = bch2_rechecksum_bio(c, src, version, op->crc,
- &crc, &op->crc,
- src_len >> 9,
- bio_sectors(src) - (src_len >> 9),
- op->csum_type);
- if (ret)
- goto err;
- /*
-	 * bch2_rechecksum_bio() sets compression_type on crc from op->crc,
- * this isn't always correct as sometimes we're changing
- * an extent from uncompressed to incompressible.
- */
- crc.compression_type = compression_type;
- crc.nonce = nonce;
- } else {
- if ((op->flags & BCH_WRITE_data_encoded) &&
- (ret = bch2_rechecksum_bio(c, src, version, op->crc,
- NULL, &op->crc,
- src_len >> 9,
- bio_sectors(src) - (src_len >> 9),
- op->crc.csum_type)))
- goto err;
-
- crc.compressed_size = dst_len >> 9;
- crc.uncompressed_size = src_len >> 9;
- crc.live_size = src_len >> 9;
-
- swap(dst->bi_iter.bi_size, dst_len);
- ret = bch2_encrypt_bio(c, op->csum_type,
- extent_nonce(version, crc), dst);
- if (ret)
- goto err;
-
- crc.csum = bch2_checksum_bio(c, op->csum_type,
- extent_nonce(version, crc), dst);
- crc.csum_type = op->csum_type;
- swap(dst->bi_iter.bi_size, dst_len);
- }
-
- init_append_extent(op, wp, version, crc);
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- if (write_corrupt_ratio) {
- swap(dst->bi_iter.bi_size, dst_len);
- bch2_maybe_corrupt_bio(dst, write_corrupt_ratio);
- swap(dst->bi_iter.bi_size, dst_len);
- }
-#endif
-
- if (dst != src)
- bio_advance(dst, dst_len);
- bio_advance(src, src_len);
- total_output += dst_len;
- total_input += src_len;
- } while (dst->bi_iter.bi_size &&
- src->bi_iter.bi_size &&
- wp->sectors_free &&
- !bch2_keylist_realloc(&op->insert_keys,
- op->inline_keys,
- ARRAY_SIZE(op->inline_keys),
- BKEY_EXTENT_U64s_MAX));
-
- more = src->bi_iter.bi_size != 0;
-
- dst->bi_iter = saved_iter;
-
- if (dst == src && more) {
- BUG_ON(total_output != total_input);
-
- dst = bio_split(src, total_input >> 9,
- GFP_NOFS, &c->bio_write);
- wbio_init(dst)->put_bio = true;
- /* copy WRITE_SYNC flag */
- dst->bi_opf = src->bi_opf;
- }
-
- dst->bi_iter.bi_size = total_output;
-do_write:
- *_dst = dst;
- return more;
-err:
- if (to_wbio(dst)->bounce)
- bch2_bio_free_pages_pool(c, dst);
- if (to_wbio(dst)->put_bio)
- bio_put(dst);
-
- return ret;
-}
-
-static bool bch2_extent_is_writeable(struct bch_write_op *op,
- struct bkey_s_c k)
-{
- struct bch_fs *c = op->c;
- struct bkey_s_c_extent e;
- struct extent_ptr_decoded p;
- const union bch_extent_entry *entry;
- unsigned replicas = 0;
-
- if (k.k->type != KEY_TYPE_extent)
- return false;
-
- e = bkey_s_c_to_extent(k);
-
- guard(rcu)();
- extent_for_each_ptr_decode(e, p, entry) {
- if (crc_is_encoded(p.crc) || p.has_ec)
- return false;
-
- replicas += bch2_extent_ptr_durability(c, &p);
- }
-
- return replicas >= op->opts.data_replicas;
-}
-
-static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *orig,
- struct bkey_s_c k,
- u64 new_i_size)
-{
- if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) {
- /* trace this */
- return 0;
- }
-
- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
- int ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- return ret;
-
- bch2_cut_front(bkey_start_pos(&orig->k), new);
- bch2_cut_back(orig->k.p, new);
-
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
- bkey_for_each_ptr(ptrs, ptr)
- ptr->unwritten = 0;
-
- /*
- * Note that we're not calling bch2_subvol_get_snapshot() in this path -
- * that was done when we kicked off the write, and here it's important
- * that we update the extent that we wrote to - even if a snapshot has
- * since been created. The write is still outstanding, so we're ok
- * w.r.t. snapshot atomicity:
- */
- return bch2_extent_update_i_size_sectors(trans, iter,
- min(new->k.p.offset << 9, new_i_size), 0) ?:
- bch2_trans_update(trans, iter, new,
- BTREE_UPDATE_internal_snapshot_node);
-}
-
-static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct btree_trans *trans = bch2_trans_get(c);
- int ret = 0;
-
- for_each_keylist_key(&op->insert_keys, orig) {
- ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
- bkey_start_pos(&orig->k), orig->k.p,
- BTREE_ITER_intent, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
- bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
- }));
- if (ret)
- break;
- }
-
- bch2_trans_put(trans);
-
- if (ret && !bch2_err_matches(ret, EROFS)) {
- struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
- bch2_write_op_error(op, bkey_start_offset(&insert->k),
- "btree update error: %s", bch2_err_str(ret));
- }
-
- if (ret)
- op->error = ret;
-}
-
-static void __bch2_nocow_write_done(struct bch_write_op *op)
-{
- if (unlikely(op->flags & BCH_WRITE_io_error)) {
- op->error = bch_err_throw(op->c, data_write_io);
- } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten))
- bch2_nocow_write_convert_unwritten(op);
-}
-
-static CLOSURE_CALLBACK(bch2_nocow_write_done)
-{
- closure_type(op, struct bch_write_op, cl);
-
- __bch2_nocow_write_done(op);
- bch2_write_done(cl);
-}
-
-struct bucket_to_lock {
- struct bpos b;
- unsigned gen;
- struct nocow_lock_bucket *l;
-};
-
-static void bch2_nocow_write(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets;
- u32 snapshot;
- struct bucket_to_lock *stale_at;
- int stale, ret;
-
- if (op->flags & BCH_WRITE_move)
- return;
-
- darray_init(&buckets);
- trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, op->subvol, &snapshot);
- if (unlikely(ret))
- goto err;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- SPOS(op->pos.inode, op->pos.offset, snapshot),
- BTREE_ITER_slots);
- while (1) {
- struct bio *bio = &op->wbio.bio;
-
- buckets.nr = 0;
-
- ret = bch2_trans_relock(trans);
- if (ret)
- break;
-
- k = bch2_btree_iter_peek_slot(trans, &iter);
- ret = bkey_err(k);
- if (ret)
- break;
-
- /* fall back to normal cow write path? */
- if (unlikely(k.k->p.snapshot != snapshot ||
- !bch2_extent_is_writeable(op, k)))
- break;
-
- if (bch2_keylist_realloc(&op->insert_keys,
- op->inline_keys,
- ARRAY_SIZE(op->inline_keys),
- k.k->u64s))
- break;
-
- /* Get iorefs before dropping btree locks: */
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE,
- BCH_DEV_WRITE_REF_io_write);
- if (unlikely(!ca))
- goto err_get_ioref;
-
- struct bpos b = PTR_BUCKET_POS(ca, ptr);
- struct nocow_lock_bucket *l =
- bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b));
- prefetch(l);
-
- /* XXX allocating memory with btree locks held - rare */
- darray_push_gfp(&buckets, ((struct bucket_to_lock) {
- .b = b, .gen = ptr->gen, .l = l,
- }), GFP_KERNEL|__GFP_NOFAIL);
-
- if (ptr->unwritten)
- op->flags |= BCH_WRITE_convert_unwritten;
- }
-
- /* Unlock before taking nocow locks, doing IO: */
- bkey_reassemble(op->insert_keys.top, k);
- bch2_trans_unlock(trans);
-
- bch2_cut_front(op->pos, op->insert_keys.top);
- if (op->flags & BCH_WRITE_convert_unwritten)
- bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
-
- darray_for_each(buckets, i) {
- struct bch_dev *ca = bch2_dev_have_ref(c, i->b.inode);
-
- __bch2_bucket_nocow_lock(&c->nocow_locks, i->l,
- bucket_to_u64(i->b),
- BUCKET_NOCOW_LOCK_UPDATE);
-
- int gen = bucket_gen_get(ca, i->b.offset);
- stale = gen < 0 ? gen : gen_after(gen, i->gen);
- if (unlikely(stale)) {
- stale_at = i;
- goto err_bucket_stale;
- }
- }
-
- bio = &op->wbio.bio;
- if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) {
- bio = bio_split(bio, k.k->p.offset - op->pos.offset,
- GFP_KERNEL, &c->bio_write);
- wbio_init(bio)->put_bio = true;
- bio->bi_opf = op->wbio.bio.bi_opf;
- } else {
- op->flags |= BCH_WRITE_submitted;
- }
-
- op->pos.offset += bio_sectors(bio);
- op->written += bio_sectors(bio);
-
- bio->bi_end_io = bch2_write_endio;
- bio->bi_private = &op->cl;
- bio->bi_opf |= REQ_OP_WRITE;
- closure_get(&op->cl);
-
- bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
- op->insert_keys.top, true);
-
- bch2_keylist_push(&op->insert_keys);
- if (op->flags & BCH_WRITE_submitted)
- break;
- bch2_btree_iter_advance(trans, &iter);
- }
-out:
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_put(trans);
- darray_exit(&buckets);
-
- if (ret) {
- bch2_write_op_error(op, op->pos.offset,
- "%s(): btree lookup error: %s", __func__, bch2_err_str(ret));
- op->error = ret;
- op->flags |= BCH_WRITE_submitted;
- }
-
-	/* fall back to cow write path? */
- if (!(op->flags & BCH_WRITE_submitted)) {
- closure_sync(&op->cl);
- __bch2_nocow_write_done(op);
- op->insert_keys.top = op->insert_keys.keys;
- } else if (op->flags & BCH_WRITE_sync) {
- closure_sync(&op->cl);
- bch2_nocow_write_done(&op->cl.work);
- } else {
- /*
- * XXX
- * needs to run out of process context because ei_quota_lock is
- * a mutex
- */
- continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op));
- }
- return;
-err_get_ioref:
- darray_for_each(buckets, i)
- enumerated_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE],
- BCH_DEV_WRITE_REF_io_write);
-
- /* Fall back to COW path: */
- goto out;
-err_bucket_stale:
- darray_for_each(buckets, i) {
- bch2_bucket_nocow_unlock(&c->nocow_locks, i->b, BUCKET_NOCOW_LOCK_UPDATE);
- if (i == stale_at)
- break;
- }
-
- struct printbuf buf = PRINTBUF;
- if (bch2_fs_inconsistent_on(stale < 0, c,
- "pointer to invalid bucket in nocow path on device %llu\n %s",
- stale_at->b.inode,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = bch_err_throw(c, data_write_invalid_ptr);
- } else {
- /* We can retry this: */
- ret = bch_err_throw(c, transaction_restart);
- }
- printbuf_exit(&buf);
-
- goto err_get_ioref;
-}
-
-static void __bch2_write(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct write_point *wp = NULL;
- struct bio *bio = NULL;
- unsigned nofs_flags;
- int ret;
-
- nofs_flags = memalloc_nofs_save();
-
- if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
- bch2_nocow_write(op);
- if (op->flags & BCH_WRITE_submitted)
- goto out_nofs_restore;
- }
-again:
- memset(&op->failed, 0, sizeof(op->failed));
-
- do {
- struct bkey_i *key_to_write;
- unsigned key_to_write_offset = op->insert_keys.top_p -
- op->insert_keys.keys_p;
-
- /* +1 for possible cache device: */
- if (op->open_buckets.nr + op->nr_replicas + 1 >
- ARRAY_SIZE(op->open_buckets.v))
- break;
-
- if (bch2_keylist_realloc(&op->insert_keys,
- op->inline_keys,
- ARRAY_SIZE(op->inline_keys),
- BKEY_EXTENT_U64s_MAX))
- break;
-
- /*
- * The copygc thread is now global, which means it's no longer
- * freeing up space on specific disks, which means that
- * allocations for specific disks may hang arbitrarily long:
- */
- ret = bch2_trans_run(c, lockrestart_do(trans,
- bch2_alloc_sectors_start_trans(trans,
- op->target,
- op->opts.erasure_code && !(op->flags & BCH_WRITE_cached),
- op->write_point,
- &op->devs_have,
- op->nr_replicas,
- op->nr_replicas_required,
- op->watermark,
- op->flags,
- &op->cl, &wp)));
- if (unlikely(ret)) {
- if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
- break;
-
- goto err;
- }
-
- EBUG_ON(!wp);
-
- bch2_open_bucket_get(c, wp, &op->open_buckets);
- ret = bch2_write_extent(op, wp, &bio);
-
- bch2_alloc_sectors_done_inlined(c, wp);
-err:
- if (ret <= 0) {
- op->flags |= BCH_WRITE_submitted;
-
- if (unlikely(ret < 0)) {
- if (!(op->flags & BCH_WRITE_alloc_nowait))
- bch2_write_op_error(op, op->pos.offset,
- "%s(): %s", __func__, bch2_err_str(ret));
- op->error = ret;
- break;
- }
- }
-
- bio->bi_end_io = bch2_write_endio;
- bio->bi_private = &op->cl;
- bio->bi_opf |= REQ_OP_WRITE;
-
- closure_get(bio->bi_private);
-
- key_to_write = (void *) (op->insert_keys.keys_p +
- key_to_write_offset);
-
- bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
- key_to_write, false);
- } while (ret);
-
- /*
- * Sync or no?
- *
-	 * If we're running asynchronously, we may still want to block
- * synchronously here if we weren't able to submit all of the IO at
- * once, as that signals backpressure to the caller.
- */
- if ((op->flags & BCH_WRITE_sync) ||
- (!(op->flags & BCH_WRITE_submitted) &&
- !(op->flags & BCH_WRITE_in_worker))) {
- bch2_wait_on_allocator(c, &op->cl);
-
- __bch2_write_index(op);
-
- if (!(op->flags & BCH_WRITE_submitted))
- goto again;
- bch2_write_done(&op->cl);
- } else {
- bch2_write_queue(op, wp);
- continue_at(&op->cl, bch2_write_index, NULL);
- }
-out_nofs_restore:
- memalloc_nofs_restore(nofs_flags);
-}
-
-static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
-{
- struct bio *bio = &op->wbio.bio;
- struct bvec_iter iter;
- struct bkey_i_inline_data *id;
- unsigned sectors;
- int ret;
-
- memset(&op->failed, 0, sizeof(op->failed));
-
- op->flags |= BCH_WRITE_wrote_data_inline;
- op->flags |= BCH_WRITE_submitted;
-
- bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
-
- ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
- ARRAY_SIZE(op->inline_keys),
- BKEY_U64s + DIV_ROUND_UP(data_len, 8));
- if (ret) {
- op->error = ret;
- goto err;
- }
-
- sectors = bio_sectors(bio);
- op->pos.offset += sectors;
-
- id = bkey_inline_data_init(op->insert_keys.top);
- id->k.p = op->pos;
- id->k.bversion = op->version;
- id->k.size = sectors;
-
- iter = bio->bi_iter;
- iter.bi_size = data_len;
- memcpy_from_bio(id->v.data, bio, iter);
-
- while (data_len & 7)
- id->v.data[data_len++] = '\0';
- set_bkey_val_bytes(&id->k, data_len);
- bch2_keylist_push(&op->insert_keys);
-
- __bch2_write_index(op);
-err:
- bch2_write_done(&op->cl);
-}
-
-/**
- * bch2_write() - handle a write to a cache device or flash only volume
- * @cl: &bch_write_op->cl
- *
- * This is the starting point for any data to end up in a cache device; it could
- * be from a normal write, or a writeback write, or a write to a flash only
- * volume - it's also used by the moving garbage collector to compact data in
- * mostly empty buckets.
- *
- * It first writes the data to the cache, creating a list of keys to be inserted
- * (if the data won't fit in a single open bucket, there will be multiple keys);
- * after the data is written it calls bch_journal, and after the keys have been
- * added to the next journal write they're inserted into the btree.
- *
- * If op->discard is true, instead of inserting the data it invalidates the
- * region of the cache represented by op->bio and op->inode.
- */
-CLOSURE_CALLBACK(bch2_write)
-{
- closure_type(op, struct bch_write_op, cl);
- struct bio *bio = &op->wbio.bio;
- struct bch_fs *c = op->c;
- unsigned data_len;
-
- EBUG_ON(op->cl.parent);
- BUG_ON(!op->nr_replicas);
- BUG_ON(!op->write_point.v);
- BUG_ON(bkey_eq(op->pos, POS_MAX));
-
- async_object_list_add(c, write_op, op, &op->list_idx);
-
- if (op->flags & BCH_WRITE_only_specified_devs)
- op->flags |= BCH_WRITE_alloc_nowait;
-
- op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas);
- op->start_time = local_clock();
- bch2_keylist_init(&op->insert_keys, op->inline_keys);
- wbio_init(bio)->put_bio = false;
-
- if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) {
- bch2_write_op_error(op, op->pos.offset, "misaligned write");
- op->error = bch_err_throw(c, data_write_misaligned);
- goto err;
- }
-
- if (c->opts.nochanges) {
- op->error = bch_err_throw(c, erofs_no_writes);
- goto err;
- }
-
- if (!(op->flags & BCH_WRITE_move) &&
- !enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_write)) {
- op->error = bch_err_throw(c, erofs_no_writes);
- goto err;
- }
-
- if (!(op->flags & BCH_WRITE_move))
- this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
- bch2_increment_clock(c, bio_sectors(bio), WRITE);
-
- data_len = min_t(u64, bio->bi_iter.bi_size,
- op->new_i_size - (op->pos.offset << 9));
-
- if (c->opts.inline_data &&
- data_len <= min(block_bytes(c) / 2, 1024U)) {
- bch2_write_data_inline(op, data_len);
- return;
- }
-
- __bch2_write(op);
- return;
-err:
- bch2_disk_reservation_put(c, &op->res);
-
- closure_debug_destroy(&op->cl);
- async_object_list_del(c, write_op, op->list_idx);
- if (op->end_io)
- op->end_io(op);
-}
-
-static const char * const bch2_write_flags[] = {
-#define x(f) #f,
- BCH_WRITE_FLAGS()
-#undef x
- NULL
-};
-
-void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
-{
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 32);
-
- prt_printf(out, "pos:\t");
- bch2_bpos_to_text(out, op->pos);
- prt_newline(out);
- printbuf_indent_add(out, 2);
-
- prt_printf(out, "started:\t");
- bch2_pr_time_units(out, local_clock() - op->start_time);
- prt_newline(out);
-
- prt_printf(out, "flags:\t");
- prt_bitflags(out, bch2_write_flags, op->flags);
- prt_newline(out);
-
- prt_printf(out, "nr_replicas:\t%u\n", op->nr_replicas);
- prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required);
-
- prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl));
- prt_printf(out, "ret\t%s\n", bch2_err_str(op->error));
-
- printbuf_indent_sub(out, 2);
-}
-
-void bch2_fs_io_write_exit(struct bch_fs *c)
-{
- bioset_exit(&c->replica_set);
- bioset_exit(&c->bio_write);
-}
-
-int bch2_fs_io_write_init(struct bch_fs *c)
-{
- if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) ||
- bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0))
- return bch_err_throw(c, ENOMEM_bio_write_init);
-
- return 0;
-}
diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h
deleted file mode 100644
index 2c0a8f35ee1f..000000000000
--- a/fs/bcachefs/io_write.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IO_WRITE_H
-#define _BCACHEFS_IO_WRITE_H
-
-#include "checksum.h"
-#include "io_write_types.h"
-
-#define to_wbio(_bio) \
- container_of((_bio), struct bch_write_bio, bio)
-
-void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
-void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
-
-void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
- enum bch_data_type, const struct bkey_i *, bool);
-
-__printf(3, 4)
-void bch2_write_op_error(struct bch_write_op *op, u64, const char *, ...);
-
-static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
-{
- return op->watermark == BCH_WATERMARK_copygc
- ? op->c->copygc_wq
- : op->c->btree_update_wq;
-}
-
-int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
- struct bkey_i *, bool *, s64 *, s64 *);
-int bch2_extent_update(struct btree_trans *, subvol_inum,
- struct btree_iter *, struct bkey_i *,
- struct disk_reservation *, u64, s64 *, bool);
-
-static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
- struct bch_io_opts opts)
-{
- op->c = c;
- op->end_io = NULL;
- op->flags = 0;
- op->written = 0;
- op->error = 0;
- op->csum_type = bch2_data_checksum_type(c, opts);
- op->compression_opt = opts.compression;
- op->nr_replicas = 0;
- op->nr_replicas_required = c->opts.data_replicas_required;
- op->watermark = BCH_WATERMARK_normal;
- op->incompressible = 0;
- op->open_buckets.nr = 0;
- op->devs_have.nr = 0;
- op->target = 0;
- op->opts = opts;
- op->subvol = 0;
- op->pos = POS_MAX;
- op->version = ZERO_VERSION;
- op->write_point = (struct write_point_specifier) { 0 };
- op->res = (struct disk_reservation) { 0 };
- op->new_i_size = U64_MAX;
- op->i_sectors_delta = 0;
- op->devs_need_flush = NULL;
-}
-
-CLOSURE_CALLBACK(bch2_write);
-void bch2_write_point_do_index_updates(struct work_struct *);
-
-static inline struct bch_write_bio *wbio_init(struct bio *bio)
-{
- struct bch_write_bio *wbio = to_wbio(bio);
-
- memset(&wbio->wbio, 0, sizeof(wbio->wbio));
- return wbio;
-}
-
-void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *);
-
-void bch2_fs_io_write_exit(struct bch_fs *);
-int bch2_fs_io_write_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_IO_WRITE_H */
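Taken together, bch2_write_op_init() and the CLOSURE_CALLBACK(bch2_write) declaration above are the whole caller-side contract of the write path: fill in a struct bch_write_op, make sure nr_replicas, write_point and pos are set (bch2_write() asserts on all three), then kick the op's closure. A hedged sketch of that glue follows; it is not code from the tree, and writepoint_hashed() and opts.data_replicas are assumptions based on how other bcachefs callers typically look:

static void example_submit_write(struct bch_fs *c, struct bch_write_op *op,
				 subvol_inum inum, u64 sector,
				 struct bch_io_opts opts,
				 void (*done)(struct bch_write_op *))
{
	/* op->wbio.bio is assumed to already describe the data pages */
	bch2_write_op_init(op, c, opts);		/* fill in defaults */

	op->nr_replicas	= opts.data_replicas;		/* must be nonzero */
	op->subvol	= inum.subvol;
	op->pos		= POS(inum.inum, sector);	/* must not be POS_MAX */
	op->write_point	= writepoint_hashed((unsigned long) current);
	op->end_io	= done;				/* completion callback */

	/* runs bch2_write(); completion is signalled through op->end_io */
	closure_call(&op->cl, bch2_write, NULL, NULL);
}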
diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h
deleted file mode 100644
index 5da4eb8bb6f6..000000000000
--- a/fs/bcachefs/io_write_types.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IO_WRITE_TYPES_H
-#define _BCACHEFS_IO_WRITE_TYPES_H
-
-#include "alloc_types.h"
-#include "btree_types.h"
-#include "buckets_types.h"
-#include "extents_types.h"
-#include "keylist_types.h"
-#include "opts.h"
-#include "super_types.h"
-
-#include <linux/llist.h>
-#include <linux/workqueue.h>
-
-#define BCH_WRITE_FLAGS() \
- x(alloc_nowait) \
- x(cached) \
- x(data_encoded) \
- x(pages_stable) \
- x(pages_owned) \
- x(only_specified_devs) \
- x(wrote_data_inline) \
- x(check_enospc) \
- x(sync) \
- x(move) \
- x(in_worker) \
- x(submitted) \
- x(io_error) \
- x(convert_unwritten)
-
-enum __bch_write_flags {
-#define x(f) __BCH_WRITE_##f,
- BCH_WRITE_FLAGS()
-#undef x
-};
-
-enum bch_write_flags {
-#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f),
- BCH_WRITE_FLAGS()
-#undef x
-};
-
-struct bch_write_bio {
- struct_group(wbio,
- struct bch_fs *c;
- struct bch_write_bio *parent;
-
- u64 submit_time;
- u64 inode_offset;
- u64 nocow_bucket;
-
- struct bch_devs_list failed;
- u8 dev;
-
- unsigned split:1,
- bounce:1,
- put_bio:1,
- have_ioref:1,
- nocow:1,
- used_mempool:1,
- first_btree_write:1;
- );
-
- struct bio bio;
-};
-
-struct bch_write_op {
- struct closure cl;
- struct bch_fs *c;
- void (*end_io)(struct bch_write_op *);
- u64 start_time;
-
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
- unsigned list_idx;
-#endif
-
- unsigned written; /* sectors */
- u16 flags;
- s16 error; /* dio write path expects it to hold -ERESTARTSYS... */
-
- unsigned compression_opt:8;
- unsigned csum_type:4;
- unsigned nr_replicas:4;
- unsigned nr_replicas_required:4;
- unsigned watermark:3;
- unsigned incompressible:1;
- unsigned stripe_waited:1;
-
- struct bch_devs_list devs_have;
- u16 target;
- u16 nonce;
- struct bch_io_opts opts;
-
- u32 subvol;
- struct bpos pos;
- struct bversion version;
-
- /* For BCH_WRITE_data_encoded: */
- struct bch_extent_crc_unpacked crc;
-
- struct write_point_specifier write_point;
-
- struct write_point *wp;
- struct list_head wp_list;
-
- struct disk_reservation res;
-
- struct open_buckets open_buckets;
-
- u64 new_i_size;
- s64 i_sectors_delta;
-
- struct bch_devs_mask failed;
-
- struct keylist insert_keys;
- u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2];
-
- /*
- * Bitmask of devices that have had nocow writes issued to them since
- * last flush:
- */
- struct bch_devs_mask *devs_need_flush;
-
- /* Must be last: */
- struct bch_write_bio wbio;
-};
-
-#endif /* _BCACHEFS_IO_WRITE_TYPES_H */
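BCH_WRITE_FLAGS() above is an x-macro list: the same list is expanded once into sequential indices (__BCH_WRITE_*), once into single-bit flags (BCH_WRITE_*), and once more in io_write.c into the printable bch2_write_flags[] name table used by bch2_write_op_to_text(). A small self-contained illustration of the same pattern, with invented names (this is not bcachefs code, just the idiom):

#include <stdio.h>

/* Single source of truth for the flag names (illustrative list). */
#define EXAMPLE_FLAGS()		\
	x(cached)		\
	x(sync)			\
	x(move)

/* First expansion: sequential indices 0, 1, 2, ... */
enum __example_flags {
#define x(f) __EXAMPLE_##f,
	EXAMPLE_FLAGS()
#undef x
	__EXAMPLE_NR,
};

/* Second expansion: one bit per flag, derived from the index above. */
enum example_flags {
#define x(f) EXAMPLE_##f = 1U << __EXAMPLE_##f,
	EXAMPLE_FLAGS()
#undef x
};

/* Third expansion: printable names, in the same order as the indices. */
static const char * const example_flag_names[] = {
#define x(f) #f,
	EXAMPLE_FLAGS()
#undef x
	NULL
};

int main(void)
{
	unsigned flags = EXAMPLE_cached | EXAMPLE_move;

	for (unsigned i = 0; i < __EXAMPLE_NR; i++)
		if (flags & (1U << i))
			printf("%s\n", example_flag_names[i]);
	return 0;
}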
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
deleted file mode 100644
index ddfeb0dafc9d..000000000000
--- a/fs/bcachefs/journal.c
+++ /dev/null
@@ -1,1832 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * bcachefs journalling code, for btree insertions
- *
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_methods.h"
-#include "btree_gc.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "journal_reclaim.h"
-#include "journal_sb.h"
-#include "journal_seq_blacklist.h"
-#include "trace.h"
-
-static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
-{
- return seq > j->seq_ondisk;
-}
-
-static bool __journal_entry_is_open(union journal_res_state state)
-{
- return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
-}
-
-static inline unsigned nr_unwritten_journal_entries(struct journal *j)
-{
- return atomic64_read(&j->seq) - j->seq_ondisk;
-}
-
-static bool journal_entry_is_open(struct journal *j)
-{
- return __journal_entry_is_open(j->reservations);
-}
-
-static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq)
-{
- union journal_res_state s = READ_ONCE(j->reservations);
- unsigned i = seq & JOURNAL_BUF_MASK;
- struct journal_buf *buf = j->buf + i;
-
- prt_printf(out, "seq:\t%llu\n", seq);
- printbuf_indent_add(out, 2);
-
- if (!buf->write_started)
- prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i & JOURNAL_STATE_BUF_MASK));
-
- struct closure *cl = &buf->io;
- int r = atomic_read(&cl->remaining);
- prt_printf(out, "io:\t%pS r %i\n", cl->fn, r & CLOSURE_REMAINING_MASK);
-
- if (buf->data) {
- prt_printf(out, "size:\t");
- prt_human_readable_u64(out, vstruct_bytes(buf->data));
- prt_newline(out);
- }
-
- prt_printf(out, "expires:\t%li jiffies\n", buf->expires - jiffies);
-
- prt_printf(out, "flags:\t");
- if (buf->noflush)
- prt_str(out, "noflush ");
- if (buf->must_flush)
- prt_str(out, "must_flush ");
- if (buf->separate_flush)
- prt_str(out, "separate_flush ");
- if (buf->need_flush_to_write_buffer)
- prt_str(out, "need_flush_to_write_buffer ");
- if (buf->write_started)
- prt_str(out, "write_started ");
- if (buf->write_allocated)
- prt_str(out, "write_allocated ");
- if (buf->write_done)
- prt_str(out, "write_done");
- prt_newline(out);
-
- printbuf_indent_sub(out, 2);
-}
-
-static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
-{
- lockdep_assert_held(&j->lock);
- out->atomic++;
-
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 24);
-
- for (u64 seq = journal_last_unwritten_seq(j);
- seq <= journal_cur_seq(j);
- seq++)
- bch2_journal_buf_to_text(out, j, seq);
- prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed");
-
- --out->atomic;
-}
-
-static inline struct journal_buf *
-journal_seq_to_buf(struct journal *j, u64 seq)
-{
- struct journal_buf *buf = NULL;
-
- EBUG_ON(seq > journal_cur_seq(j));
-
- if (journal_seq_unwritten(j, seq))
- buf = j->buf + (seq & JOURNAL_BUF_MASK);
- return buf;
-}
-
-static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
-{
- for (unsigned i = 0; i < ARRAY_SIZE(p->unflushed); i++)
- INIT_LIST_HEAD(&p->unflushed[i]);
- for (unsigned i = 0; i < ARRAY_SIZE(p->flushed); i++)
- INIT_LIST_HEAD(&p->flushed[i]);
- atomic_set(&p->count, count);
- p->devs.nr = 0;
-}
-
-/*
- * Detect stuck journal conditions and trigger shutdown. Technically the journal
- * can end up stuck for a variety of reasons, such as a blocked I/O, journal
- * reservation lockup, etc. Since this is a fatal error with potentially
- * unpredictable characteristics, we want to be fairly conservative before we
- * decide to shut things down.
- *
- * Consider the journal stuck when it appears full with no ability to commit
- * btree transactions, to discard journal buckets, nor acquire priority
- * (reserved watermark) reservation.
- */
-static inline bool
-journal_error_check_stuck(struct journal *j, int error, unsigned flags)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- bool stuck = false;
- struct printbuf buf = PRINTBUF;
-
- buf.atomic++;
-
- if (!(error == -BCH_ERR_journal_full ||
- error == -BCH_ERR_journal_pin_full) ||
- nr_unwritten_journal_entries(j) ||
- (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim)
- return stuck;
-
- spin_lock(&j->lock);
-
- if (j->can_discard) {
- spin_unlock(&j->lock);
- return stuck;
- }
-
- stuck = true;
-
- /*
- * The journal shutdown path will set ->err_seq, but do it here first to
- * serialize against concurrent failures and avoid duplicate error
- * reports.
- */
- if (j->err_seq) {
- spin_unlock(&j->lock);
- return stuck;
- }
- j->err_seq = journal_cur_seq(j);
-
- __bch2_journal_debug_to_text(&buf, j);
- spin_unlock(&j->lock);
-	prt_printf(&buf, bch2_fmt(c, "Journal stuck! Have a pre-reservation but journal full (error %s)"),
- bch2_err_str(error));
- bch2_print_str(c, KERN_ERR, buf.buf);
-
- printbuf_reset(&buf);
- bch2_journal_pins_to_text(&buf, j);
- bch_err(c, "Journal pins:\n%s", buf.buf);
- printbuf_exit(&buf);
-
- bch2_fatal_error(c);
- dump_stack();
-
- return stuck;
-}
-
-void bch2_journal_do_writes(struct journal *j)
-{
- for (u64 seq = journal_last_unwritten_seq(j);
- seq <= journal_cur_seq(j);
- seq++) {
- unsigned idx = seq & JOURNAL_BUF_MASK;
- struct journal_buf *w = j->buf + idx;
-
- if (w->write_started && !w->write_allocated)
- break;
- if (w->write_started)
- continue;
-
- if (!journal_state_seq_count(j, j->reservations, seq)) {
- j->seq_write_started = seq;
- w->write_started = true;
- closure_call(&w->io, bch2_journal_write, j->wq, NULL);
- }
-
- break;
- }
-}
-
-/*
- * Final processing when the last reference of a journal buffer has been
- * dropped. Drop the pin list reference acquired at journal entry open and write
- * the buffer, if requested.
- */
-void bch2_journal_buf_put_final(struct journal *j, u64 seq)
-{
- lockdep_assert_held(&j->lock);
-
- if (__bch2_journal_pin_put(j, seq))
- bch2_journal_reclaim_fast(j);
- bch2_journal_do_writes(j);
-
- /*
- * for __bch2_next_write_buffer_flush_journal_buf(), when quiescing an
- * open journal entry
- */
- wake_up(&j->wait);
-}
-
-/*
- * Close the currently-open journal entry:
- *
- * We don't close a journal_buf until the next journal_buf is finished writing,
- * and can be opened again - this also initializes the next journal_buf:
- */
-static void __journal_entry_close(struct journal *j, unsigned closed_val, bool trace)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *buf = journal_cur_buf(j);
- union journal_res_state old, new;
- unsigned sectors;
-
- BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL &&
- closed_val != JOURNAL_ENTRY_ERROR_VAL);
-
- lockdep_assert_held(&j->lock);
-
- old.v = atomic64_read(&j->reservations.counter);
- do {
- new.v = old.v;
- new.cur_entry_offset = closed_val;
-
- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL ||
- old.cur_entry_offset == new.cur_entry_offset)
- return;
- } while (!atomic64_try_cmpxchg(&j->reservations.counter,
- &old.v, new.v));
-
- if (!__journal_entry_is_open(old))
- return;
-
- if (old.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL)
- old.cur_entry_offset = j->cur_entry_offset_if_blocked;
-
- /* Close out old buffer: */
- buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
-
- if (trace_journal_entry_close_enabled() && trace) {
- struct printbuf pbuf = PRINTBUF;
- pbuf.atomic++;
-
- prt_str(&pbuf, "entry size: ");
- prt_human_readable_u64(&pbuf, vstruct_bytes(buf->data));
- prt_newline(&pbuf);
- bch2_prt_task_backtrace(&pbuf, current, 1, GFP_NOWAIT);
- trace_journal_entry_close(c, pbuf.buf);
- printbuf_exit(&pbuf);
- }
-
- sectors = vstruct_blocks_plus(buf->data, c->block_bits,
- buf->u64s_reserved) << c->block_bits;
- if (unlikely(sectors > buf->sectors)) {
- struct printbuf err = PRINTBUF;
- err.atomic++;
-
- prt_printf(&err, "journal entry overran reserved space: %u > %u\n",
- sectors, buf->sectors);
- prt_printf(&err, "buf u64s %u u64s reserved %u cur_entry_u64s %u block_bits %u\n",
- le32_to_cpu(buf->data->u64s), buf->u64s_reserved,
- j->cur_entry_u64s,
- c->block_bits);
- prt_printf(&err, "fatal error - emergency read only");
- bch2_journal_halt_locked(j);
-
- bch_err(c, "%s", err.buf);
- printbuf_exit(&err);
- return;
- }
-
- buf->sectors = sectors;
-
- /*
- * We have to set last_seq here, _before_ opening a new journal entry:
- *
-	 * A thread may replace an old pin with a new pin on its current
-	 * journal reservation - the expectation being that the journal will
- * contain either what the old pin protected or what the new pin
- * protects.
- *
- * After the old pin is dropped journal_last_seq() won't include the old
- * pin, so we can only write the updated last_seq on the entry that
- * contains whatever the new pin protects.
- *
- * Restated, we can _not_ update last_seq for a given entry if there
- * could be a newer entry open with reservations/pins that have been
- * taken against it.
- *
-	 * Hence, we want to update/set last_seq on the current journal entry right
- * before we open a new one:
- */
- buf->last_seq = journal_last_seq(j);
- buf->data->last_seq = cpu_to_le64(buf->last_seq);
- BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq));
-
- cancel_delayed_work(&j->write_work);
-
- bch2_journal_space_available(j);
-
- __bch2_journal_buf_put(j, le64_to_cpu(buf->data->seq));
-}
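__journal_entry_close() above, and journal_entry_open(), __bch2_journal_block() and bch2_journal_unblock() further down, all update the packed reservation word with the same lock-free idiom: read the 64-bit state, compute a new value, and retry the compare-and-exchange until no other CPU changed the word in between. A standalone sketch of that idiom, recast with C11 atomics since atomic64_try_cmpxchg() is kernel-only (the layout and values here are illustrative, not the real journal_res_state):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Packed state word, loosely modeled on union journal_res_state. */
union res_state {
	struct {
		uint32_t cur_entry_offset;
		uint32_t idx;
	};
	uint64_t v;
};

/* Close the entry; returns 1 if this caller did the closing. */
static int entry_close(_Atomic uint64_t *state, uint32_t closed_val)
{
	union res_state old, new;

	old.v = atomic_load(state);
	do {
		new.v = old.v;
		new.cur_entry_offset = closed_val;

		if (old.cur_entry_offset == closed_val)
			return 0;	/* already closed, nothing to do */
	} while (!atomic_compare_exchange_weak(state, &old.v, new.v));

	return 1;
}

int main(void)
{
	_Atomic uint64_t state = 0;

	printf("closed: %d\n", entry_close(&state, UINT32_MAX));
	printf("closed again: %d\n", entry_close(&state, UINT32_MAX));
	return 0;
}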
-
-void bch2_journal_halt_locked(struct journal *j)
-{
- lockdep_assert_held(&j->lock);
-
- __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true);
- if (!j->err_seq)
- j->err_seq = journal_cur_seq(j);
- journal_wake(j);
-}
-
-void bch2_journal_halt(struct journal *j)
-{
- spin_lock(&j->lock);
- bch2_journal_halt_locked(j);
- spin_unlock(&j->lock);
-}
-
-static bool journal_entry_want_write(struct journal *j)
-{
- bool ret = !journal_entry_is_open(j) ||
- journal_cur_seq(j) == journal_last_unwritten_seq(j);
-
- /* Don't close it yet if we already have a write in flight: */
- if (ret)
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
- else if (nr_unwritten_journal_entries(j)) {
- struct journal_buf *buf = journal_cur_buf(j);
-
- if (!buf->flush_time) {
- buf->flush_time = local_clock() ?: 1;
- buf->expires = jiffies;
- }
- }
-
- return ret;
-}
-
-bool bch2_journal_entry_close(struct journal *j)
-{
- bool ret;
-
- spin_lock(&j->lock);
- ret = journal_entry_want_write(j);
- spin_unlock(&j->lock);
-
- return ret;
-}
-
-/*
- * should _only_ be called from journal_res_get() - when we actually want a
- * journal reservation - an open journal entry means the journal is dirty:
- */
-static int journal_entry_open(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *buf = j->buf +
- ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK);
- union journal_res_state old, new;
- int u64s;
-
- lockdep_assert_held(&j->lock);
- BUG_ON(journal_entry_is_open(j));
- BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
-
- if (j->blocked)
- return bch_err_throw(c, journal_blocked);
-
- if (j->cur_entry_error)
- return j->cur_entry_error;
-
- int ret = bch2_journal_error(j);
- if (unlikely(ret))
- return ret;
-
- if (!fifo_free(&j->pin))
- return bch_err_throw(c, journal_pin_full);
-
- if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf))
- return bch_err_throw(c, journal_max_in_flight);
-
- if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR)
- return bch_err_throw(c, journal_max_open);
-
- if (unlikely(journal_cur_seq(j) >= JOURNAL_SEQ_MAX)) {
- bch_err(c, "cannot start: journal seq overflow");
- if (bch2_fs_emergency_read_only_locked(c))
- bch_err(c, "fatal error - emergency read only");
- return bch_err_throw(c, journal_shutdown);
- }
-
- if (!j->free_buf && !buf->data)
- return bch_err_throw(c, journal_buf_enomem); /* will retry after write completion frees up a buf */
-
- BUG_ON(!j->cur_entry_sectors);
-
- if (!buf->data) {
- swap(buf->data, j->free_buf);
- swap(buf->buf_size, j->free_buf_size);
- }
-
- buf->expires =
- (journal_cur_seq(j) == j->flushed_seq_ondisk
- ? jiffies
- : j->last_flush_write) +
- msecs_to_jiffies(c->opts.journal_flush_delay);
-
- buf->u64s_reserved = j->entry_u64s_reserved;
- buf->disk_sectors = j->cur_entry_sectors;
- buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9);
-
- u64s = (int) (buf->sectors << 9) / sizeof(u64) -
- journal_entry_overhead(j);
- u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
-
- if (u64s <= (ssize_t) j->early_journal_entries.nr)
- return bch_err_throw(c, journal_full);
-
- if (fifo_empty(&j->pin) && j->reclaim_thread)
- wake_up_process(j->reclaim_thread);
-
- /*
- * The fifo_push() needs to happen at the same time as j->seq is
- * incremented for journal_last_seq() to be calculated correctly
- */
- atomic64_inc(&j->seq);
- journal_pin_list_init(fifo_push_ref(&j->pin), 1);
-
- if (unlikely(bch2_journal_seq_is_blacklisted(c, journal_cur_seq(j), false))) {
- bch_err(c, "attempting to open blacklisted journal seq %llu",
- journal_cur_seq(j));
- if (bch2_fs_emergency_read_only_locked(c))
- bch_err(c, "fatal error - emergency read only");
- return bch_err_throw(c, journal_shutdown);
- }
-
- BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
-
- BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf);
-
- bkey_extent_init(&buf->key);
- buf->noflush = false;
- buf->must_flush = false;
- buf->separate_flush = false;
- buf->flush_time = 0;
- buf->need_flush_to_write_buffer = true;
- buf->write_started = false;
- buf->write_allocated = false;
- buf->write_done = false;
-
- memset(buf->data, 0, sizeof(*buf->data));
- buf->data->seq = cpu_to_le64(journal_cur_seq(j));
- buf->data->u64s = 0;
-
- if (j->early_journal_entries.nr) {
- memcpy(buf->data->_data, j->early_journal_entries.data,
- j->early_journal_entries.nr * sizeof(u64));
- le32_add_cpu(&buf->data->u64s, j->early_journal_entries.nr);
- }
-
- /*
- * Must be set before marking the journal entry as open:
- */
- j->cur_entry_u64s = u64s;
-
- old.v = atomic64_read(&j->reservations.counter);
- do {
- new.v = old.v;
-
- BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL);
-
- new.idx++;
- BUG_ON(journal_state_count(new, new.idx));
- BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_STATE_BUF_MASK));
-
- journal_state_inc(&new);
-
- /* Handle any already added entries */
- new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
- } while (!atomic64_try_cmpxchg(&j->reservations.counter,
- &old.v, new.v));
-
- if (nr_unwritten_journal_entries(j) == 1)
- mod_delayed_work(j->wq,
- &j->write_work,
- msecs_to_jiffies(c->opts.journal_flush_delay));
- journal_wake(j);
-
- if (j->early_journal_entries.nr)
- darray_exit(&j->early_journal_entries);
- return 0;
-}
-
-static bool journal_quiesced(struct journal *j)
-{
- bool ret = atomic64_read(&j->seq) == j->seq_ondisk;
-
- if (!ret)
- bch2_journal_entry_close(j);
- return ret;
-}
-
-static void journal_quiesce(struct journal *j)
-{
- wait_event(j->wait, journal_quiesced(j));
-}
-
-static void journal_write_work(struct work_struct *work)
-{
- struct journal *j = container_of(work, struct journal, write_work.work);
-
- spin_lock(&j->lock);
- if (__journal_entry_is_open(j->reservations)) {
- long delta = journal_cur_buf(j)->expires - jiffies;
-
- if (delta > 0)
- mod_delayed_work(j->wq, &j->write_work, delta);
- else
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
- }
- spin_unlock(&j->lock);
-}
-
-static void journal_buf_prealloc(struct journal *j)
-{
- if (j->free_buf &&
- j->free_buf_size >= j->buf_size_want)
- return;
-
- unsigned buf_size = j->buf_size_want;
-
- spin_unlock(&j->lock);
- void *buf = kvmalloc(buf_size, GFP_NOFS);
- spin_lock(&j->lock);
-
- if (buf &&
- (!j->free_buf ||
- buf_size > j->free_buf_size)) {
- swap(buf, j->free_buf);
- swap(buf_size, j->free_buf_size);
- }
-
- if (unlikely(buf)) {
- spin_unlock(&j->lock);
- /* kvfree can sleep */
- kvfree(buf);
- spin_lock(&j->lock);
- }
-}
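journal_buf_prealloc() above follows a pattern worth calling out: the allocation is done with j->lock dropped (kvmalloc() can sleep, spinlocks cannot be held across it), the lock is retaken, the new buffer is installed only if it is still an improvement, and whichever buffer lost is freed with the lock dropped again. A hedged standalone sketch of the same shape, with a pthread mutex standing in for the spinlock (names and types invented for the example):

#include <pthread.h>
#include <stdlib.h>

/* Toy stand-in for the journal's free_buf / free_buf_size pair. */
struct buf_cache {
	pthread_mutex_t	lock;
	void		*buf;
	size_t		size;
};

static void buf_prealloc(struct buf_cache *c, size_t want)
{
	pthread_mutex_lock(&c->lock);
	int have_enough = c->buf && c->size >= want;
	pthread_mutex_unlock(&c->lock);

	if (have_enough)
		return;

	void *new = malloc(want);	/* allocate with the lock dropped */

	pthread_mutex_lock(&c->lock);
	if (new && (!c->buf || want > c->size)) {
		void *tmp = c->buf;	/* install; old buffer becomes the loser */

		c->buf	= new;
		c->size	= want;
		new	= tmp;
	}
	pthread_mutex_unlock(&c->lock);

	free(new);			/* free the loser outside the lock */
}

int main(void)
{
	struct buf_cache c = { .lock = PTHREAD_MUTEX_INITIALIZER };

	buf_prealloc(&c, 4096);
	buf_prealloc(&c, 8192);
	free(c.buf);
	return 0;
}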
-
-static int __journal_res_get(struct journal *j, struct journal_res *res,
- unsigned flags)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *buf;
- bool can_discard;
- int ret;
-retry:
- if (journal_res_get_fast(j, res, flags))
- return 0;
-
- ret = bch2_journal_error(j);
- if (unlikely(ret))
- return ret;
-
- if (j->blocked)
- return bch_err_throw(c, journal_blocked);
-
- if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
- ret = bch_err_throw(c, journal_full);
- can_discard = j->can_discard;
- goto out;
- }
-
- if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) {
- ret = bch_err_throw(c, journal_max_in_flight);
- goto out;
- }
-
- spin_lock(&j->lock);
-
- journal_buf_prealloc(j);
-
- /*
-	 * Recheck after taking the lock, so we don't race with another thread
-	 * that just did journal_entry_open(), and don't call
-	 * bch2_journal_entry_close() unnecessarily
- */
- if (journal_res_get_fast(j, res, flags)) {
- ret = 0;
- goto unlock;
- }
-
- /*
- * If we couldn't get a reservation because the current buf filled up,
- * and we had room for a bigger entry on disk, signal that we want to
- * realloc the journal bufs:
- */
- buf = journal_cur_buf(j);
- if (journal_entry_is_open(j) &&
- buf->buf_size >> 9 < buf->disk_sectors &&
- buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
- j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
-
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false);
- ret = journal_entry_open(j) ?: -BCH_ERR_journal_retry_open;
-unlock:
- can_discard = j->can_discard;
- spin_unlock(&j->lock);
-out:
- if (likely(!ret))
- return 0;
- if (ret == -BCH_ERR_journal_retry_open)
- goto retry;
-
- if (journal_error_check_stuck(j, ret, flags))
- ret = bch_err_throw(c, journal_stuck);
-
- if (ret == -BCH_ERR_journal_max_in_flight &&
- track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) &&
- trace_journal_entry_full_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_printbuf_make_room(&buf, 4096);
-
- spin_lock(&j->lock);
- prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
- bch2_journal_bufs_to_text(&buf, j);
- spin_unlock(&j->lock);
-
- trace_journal_entry_full(c, buf.buf);
- printbuf_exit(&buf);
- count_event(c, journal_entry_full);
- }
-
- if (ret == -BCH_ERR_journal_max_open &&
- track_event_change(&c->times[BCH_TIME_blocked_journal_max_open], true) &&
- trace_journal_entry_full_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_printbuf_make_room(&buf, 4096);
-
- spin_lock(&j->lock);
- prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
- bch2_journal_bufs_to_text(&buf, j);
- spin_unlock(&j->lock);
-
- trace_journal_entry_full(c, buf.buf);
- printbuf_exit(&buf);
- count_event(c, journal_entry_full);
- }
-
- /*
- * Journal is full - can't rely on reclaim from work item due to
- * freezing:
- */
- if ((ret == -BCH_ERR_journal_full ||
- ret == -BCH_ERR_journal_pin_full) &&
- !(flags & JOURNAL_RES_GET_NONBLOCK)) {
- if (can_discard) {
- bch2_journal_do_discards(j);
- goto retry;
- }
-
- if (mutex_trylock(&j->reclaim_lock)) {
- bch2_journal_reclaim(j);
- mutex_unlock(&j->reclaim_lock);
- }
- }
-
- return ret;
-}
-
-static unsigned max_dev_latency(struct bch_fs *c)
-{
- u64 nsecs = 0;
-
- guard(rcu)();
- for_each_rw_member_rcu(c, ca)
- nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration);
-
- return nsecs_to_jiffies(nsecs);
-}
-
-/*
- * Essentially the entry function to the journaling code. When bcachefs is doing
- * a btree insert, it calls this function to get the current journal write.
- * The journal write is the structure used to set up journal writes. The calling
- * function will then add its keys to the structure, queuing them for the next
- * write.
- *
- * To ensure forward progress, the current task must not be holding any
- * btree node write locks.
- */
-int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
- unsigned flags,
- struct btree_trans *trans)
-{
- int ret;
-
- if (closure_wait_event_timeout(&j->async_wait,
- !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) ||
- (flags & JOURNAL_RES_GET_NONBLOCK),
- HZ))
- return ret;
-
- if (trans)
- bch2_trans_unlock_long(trans);
-
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- int remaining_wait = max(max_dev_latency(c) * 2, HZ * 10);
-
- remaining_wait = max(0, remaining_wait - HZ);
-
- if (closure_wait_event_timeout(&j->async_wait,
- !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) ||
- (flags & JOURNAL_RES_GET_NONBLOCK),
- remaining_wait))
- return ret;
-
- struct printbuf buf = PRINTBUF;
- bch2_journal_debug_to_text(&buf, j);
- bch2_print_str(c, KERN_ERR, buf.buf);
- prt_printf(&buf, bch2_fmt(c, "Journal stuck? Waited for 10 seconds, err %s"), bch2_err_str(ret));
- printbuf_exit(&buf);
-
- closure_wait_event(&j->async_wait,
- !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) ||
- (flags & JOURNAL_RES_GET_NONBLOCK));
- return ret;
-}
-
-/* journal_entry_res: */
-
-void bch2_journal_entry_res_resize(struct journal *j,
- struct journal_entry_res *res,
- unsigned new_u64s)
-{
- union journal_res_state state;
- int d = new_u64s - res->u64s;
-
- spin_lock(&j->lock);
-
- j->entry_u64s_reserved += d;
- if (d <= 0)
- goto out;
-
- j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d);
- state = READ_ONCE(j->reservations);
-
- if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL &&
- state.cur_entry_offset > j->cur_entry_u64s) {
- j->cur_entry_u64s += d;
- /*
- * Not enough room in current journal entry, have to flush it:
- */
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
- } else {
- journal_cur_buf(j)->u64s_reserved += d;
- }
-out:
- spin_unlock(&j->lock);
- res->u64s += d;
-}
-
-/* journal flushing: */
-
-/**
- * bch2_journal_flush_seq_async - wait for a journal entry to be written
- * @j: journal object
- * @seq: seq to flush
- * @parent: closure object to wait with
- * Returns: 1 if @seq has already been flushed, 0 if @seq is being flushed,
- * -BCH_ERR_journal_flush_err if @seq will never be flushed
- *
- * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if
- * necessary
- */
-int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
- struct closure *parent)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *buf;
- int ret = 0;
-
- if (seq <= j->flushed_seq_ondisk)
- return 1;
-
- spin_lock(&j->lock);
-
- if (WARN_ONCE(seq > journal_cur_seq(j),
- "requested to flush journal seq %llu, but currently at %llu",
- seq, journal_cur_seq(j)))
- goto out;
-
- /* Recheck under lock: */
- if (j->err_seq && seq >= j->err_seq) {
- ret = bch_err_throw(c, journal_flush_err);
- goto out;
- }
-
- if (seq <= j->flushed_seq_ondisk) {
- ret = 1;
- goto out;
- }
-
- /* if seq was written, but not flushed - flush a newer one instead */
- seq = max(seq, journal_last_unwritten_seq(j));
-
-recheck_need_open:
- if (seq > journal_cur_seq(j)) {
- struct journal_res res = { 0 };
-
- if (journal_entry_is_open(j))
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
-
- spin_unlock(&j->lock);
-
- /*
- * We're called from bch2_journal_flush_seq() -> wait_event();
- * but this might block. We won't usually block, so we won't
- * livelock:
- */
- sched_annotate_sleep();
- ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL);
- if (ret)
- return ret;
-
- seq = res.seq;
- buf = journal_seq_to_buf(j, seq);
- buf->must_flush = true;
-
- if (!buf->flush_time) {
- buf->flush_time = local_clock() ?: 1;
- buf->expires = jiffies;
- }
-
- if (parent && !closure_wait(&buf->wait, parent))
- BUG();
-
- bch2_journal_res_put(j, &res);
-
- spin_lock(&j->lock);
- goto want_write;
- }
-
- /*
-	 * if the write was kicked off without a flush, or if we promised it
- * wouldn't be a flush, flush the next sequence number instead
- */
- buf = journal_seq_to_buf(j, seq);
- if (buf->noflush) {
- seq++;
- goto recheck_need_open;
- }
-
- buf->must_flush = true;
- j->flushing_seq = max(j->flushing_seq, seq);
-
- if (parent && !closure_wait(&buf->wait, parent))
- BUG();
-want_write:
- if (seq == journal_cur_seq(j))
- journal_entry_want_write(j);
-out:
- spin_unlock(&j->lock);
- return ret;
-}
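A hedged usage sketch for the kernel-doc above (caller-side glue assumed for illustration, not code from the tree): the three return cases map naturally onto a stack closure passed as @parent.

static int example_wait_for_flush(struct journal *j, u64 seq)
{
	struct closure cl;
	int ret;

	closure_init_stack(&cl);

	ret = bch2_journal_flush_seq_async(j, seq, &cl);
	if (ret < 0)
		return ret;	/* @seq will never be flushed */

	/* ret == 0: write in flight, wait for it; ret == 1: no-op wait */
	closure_sync(&cl);
	return 0;
}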
-
-int bch2_journal_flush_seq(struct journal *j, u64 seq, unsigned task_state)
-{
- u64 start_time = local_clock();
- int ret, ret2;
-
- /*
- * Don't update time_stats when @seq is already flushed:
- */
- if (seq <= j->flushed_seq_ondisk)
- return 0;
-
- ret = wait_event_state(j->wait,
- (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)),
- task_state);
-
- if (!ret)
- bch2_time_stats_update(j->flush_seq_time, start_time);
-
- return ret ?: ret2 < 0 ? ret2 : 0;
-}
-
-/*
- * bch2_journal_flush_async - if there is an open journal entry, or a journal
- * write still in flight, write it and wait for the write to complete
- */
-void bch2_journal_flush_async(struct journal *j, struct closure *parent)
-{
- bch2_journal_flush_seq_async(j, atomic64_read(&j->seq), parent);
-}
-
-int bch2_journal_flush(struct journal *j)
-{
- return bch2_journal_flush_seq(j, atomic64_read(&j->seq), TASK_UNINTERRUPTIBLE);
-}
-
-/*
- * bch2_journal_noflush_seq - ask the journal not to issue any flushes in the
- * sequence number range [start, end)
- */
-bool bch2_journal_noflush_seq(struct journal *j, u64 start, u64 end)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- u64 unwritten_seq;
- bool ret = false;
-
- if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush)))
- return false;
-
- if (c->journal.flushed_seq_ondisk >= start)
- return false;
-
- spin_lock(&j->lock);
- if (c->journal.flushed_seq_ondisk >= start)
- goto out;
-
- for (unwritten_seq = journal_last_unwritten_seq(j);
- unwritten_seq < end;
- unwritten_seq++) {
- struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq);
-
-		/* journal flush already in flight, or flush requested */
- if (buf->must_flush)
- goto out;
-
- buf->noflush = true;
- }
-
- ret = true;
-out:
- spin_unlock(&j->lock);
- return ret;
-}
-
-static int __bch2_journal_meta(struct journal *j)
-{
- struct journal_res res = {};
- int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL);
- if (ret)
- return ret;
-
- struct journal_buf *buf = j->buf + (res.seq & JOURNAL_BUF_MASK);
- buf->must_flush = true;
-
- if (!buf->flush_time) {
- buf->flush_time = local_clock() ?: 1;
- buf->expires = jiffies;
- }
-
- bch2_journal_res_put(j, &res);
-
- return bch2_journal_flush_seq(j, res.seq, TASK_UNINTERRUPTIBLE);
-}
-
-int bch2_journal_meta(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_journal))
- return bch_err_throw(c, erofs_no_writes);
-
- int ret = __bch2_journal_meta(j);
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_journal);
- return ret;
-}
-
-/* block/unlock the journal: */
-
-void bch2_journal_unblock(struct journal *j)
-{
- spin_lock(&j->lock);
- if (!--j->blocked &&
- j->cur_entry_offset_if_blocked < JOURNAL_ENTRY_CLOSED_VAL &&
- j->reservations.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) {
- union journal_res_state old, new;
-
- old.v = atomic64_read(&j->reservations.counter);
- do {
- new.v = old.v;
- new.cur_entry_offset = j->cur_entry_offset_if_blocked;
- } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v));
- }
- spin_unlock(&j->lock);
-
- journal_wake(j);
-}
-
-static void __bch2_journal_block(struct journal *j)
-{
- if (!j->blocked++) {
- union journal_res_state old, new;
-
- old.v = atomic64_read(&j->reservations.counter);
- do {
- j->cur_entry_offset_if_blocked = old.cur_entry_offset;
-
- if (j->cur_entry_offset_if_blocked >= JOURNAL_ENTRY_CLOSED_VAL)
- break;
-
- new.v = old.v;
- new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL;
- } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v));
-
- if (old.cur_entry_offset < JOURNAL_ENTRY_BLOCKED_VAL)
- journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset);
- }
-}
-
-void bch2_journal_block(struct journal *j)
-{
- spin_lock(&j->lock);
- __bch2_journal_block(j);
- spin_unlock(&j->lock);
-
- journal_quiesce(j);
-}
-
-static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j,
- u64 max_seq, bool *blocked)
-{
- struct journal_buf *ret = NULL;
-
-	/* We're inside wait_event(), but using mutex_lock(): */
- sched_annotate_sleep();
- mutex_lock(&j->buf_lock);
- spin_lock(&j->lock);
- max_seq = min(max_seq, journal_cur_seq(j));
-
- for (u64 seq = journal_last_unwritten_seq(j);
- seq <= max_seq;
- seq++) {
- unsigned idx = seq & JOURNAL_BUF_MASK;
- struct journal_buf *buf = j->buf + idx;
-
- if (buf->need_flush_to_write_buffer) {
- union journal_res_state s;
- s.v = atomic64_read_acquire(&j->reservations.counter);
-
- unsigned open = seq == journal_cur_seq(j) && __journal_entry_is_open(s);
-
- if (open && !*blocked) {
- __bch2_journal_block(j);
- s.v = atomic64_read_acquire(&j->reservations.counter);
- *blocked = true;
- }
-
- ret = journal_state_count(s, idx & JOURNAL_STATE_BUF_MASK) > open
- ? ERR_PTR(-EAGAIN)
- : buf;
- break;
- }
- }
-
- spin_unlock(&j->lock);
- if (IS_ERR_OR_NULL(ret))
- mutex_unlock(&j->buf_lock);
- return ret;
-}
-
-struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j,
- u64 max_seq, bool *blocked)
-{
- struct journal_buf *ret;
- *blocked = false;
-
- wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j,
- max_seq, blocked)) != ERR_PTR(-EAGAIN));
- if (IS_ERR_OR_NULL(ret) && *blocked)
- bch2_journal_unblock(j);
-
- return ret;
-}
-
-/* allocate journal on a device: */
-
-static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr,
- bool new_fs, struct closure *cl)
-{
- struct bch_fs *c = ca->fs;
- struct journal_device *ja = &ca->journal;
- u64 *new_bucket_seq = NULL, *new_buckets = NULL;
- struct open_bucket **ob = NULL;
- long *bu = NULL;
- unsigned i, pos, nr_got = 0, nr_want = nr - ja->nr;
- int ret = 0;
-
- BUG_ON(nr <= ja->nr);
-
- bu = kcalloc(nr_want, sizeof(*bu), GFP_KERNEL);
- ob = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL);
- new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL);
- new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL);
- if (!bu || !ob || !new_buckets || !new_bucket_seq) {
- ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets);
- goto err_free;
- }
-
- for (nr_got = 0; nr_got < nr_want; nr_got++) {
- enum bch_watermark watermark = new_fs
- ? BCH_WATERMARK_btree
- : BCH_WATERMARK_normal;
-
- ob[nr_got] = bch2_bucket_alloc(c, ca, watermark,
- BCH_DATA_journal, cl);
- ret = PTR_ERR_OR_ZERO(ob[nr_got]);
- if (ret)
- break;
-
- if (!new_fs) {
- ret = bch2_trans_run(c,
- bch2_trans_mark_metadata_bucket(trans, ca,
- ob[nr_got]->bucket, BCH_DATA_journal,
- ca->mi.bucket_size, BTREE_TRIGGER_transactional));
- if (ret) {
- bch2_open_bucket_put(c, ob[nr_got]);
- bch_err_msg(c, ret, "marking new journal buckets");
- break;
- }
- }
-
- bu[nr_got] = ob[nr_got]->bucket;
- }
-
- if (!nr_got)
- goto err_free;
-
- /* Don't return an error if we successfully allocated some buckets: */
- ret = 0;
-
- if (c) {
- bch2_journal_flush_all_pins(&c->journal);
- bch2_journal_block(&c->journal);
- mutex_lock(&c->sb_lock);
- }
-
- memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
- memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64));
-
- BUG_ON(ja->discard_idx > ja->nr);
-
- pos = ja->discard_idx ?: ja->nr;
-
- memmove(new_buckets + pos + nr_got,
- new_buckets + pos,
- sizeof(new_buckets[0]) * (ja->nr - pos));
- memmove(new_bucket_seq + pos + nr_got,
- new_bucket_seq + pos,
- sizeof(new_bucket_seq[0]) * (ja->nr - pos));
-
- for (i = 0; i < nr_got; i++) {
- new_buckets[pos + i] = bu[i];
- new_bucket_seq[pos + i] = 0;
- }
-
- nr = ja->nr + nr_got;
-
- ret = bch2_journal_buckets_to_sb(c, ca, new_buckets, nr);
- if (ret)
- goto err_unblock;
-
- bch2_write_super(c);
-
- /* Commit: */
- if (c)
- spin_lock(&c->journal.lock);
-
- swap(new_buckets, ja->buckets);
- swap(new_bucket_seq, ja->bucket_seq);
- ja->nr = nr;
-
- if (pos <= ja->discard_idx)
- ja->discard_idx = (ja->discard_idx + nr_got) % ja->nr;
- if (pos <= ja->dirty_idx_ondisk)
- ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + nr_got) % ja->nr;
- if (pos <= ja->dirty_idx)
- ja->dirty_idx = (ja->dirty_idx + nr_got) % ja->nr;
- if (pos <= ja->cur_idx)
- ja->cur_idx = (ja->cur_idx + nr_got) % ja->nr;
-
- if (c)
- spin_unlock(&c->journal.lock);
-err_unblock:
- if (c) {
- bch2_journal_unblock(&c->journal);
- mutex_unlock(&c->sb_lock);
- }
-
- if (ret && !new_fs)
- for (i = 0; i < nr_got; i++)
- bch2_trans_run(c,
- bch2_trans_mark_metadata_bucket(trans, ca,
- bu[i], BCH_DATA_free, 0,
- BTREE_TRIGGER_transactional));
-err_free:
- for (i = 0; i < nr_got; i++)
- bch2_open_bucket_put(c, ob[i]);
-
- kfree(new_bucket_seq);
- kfree(new_buckets);
- kfree(ob);
- kfree(bu);
- return ret;
-}
-
-static int bch2_set_nr_journal_buckets_loop(struct bch_fs *c, struct bch_dev *ca,
- unsigned nr, bool new_fs)
-{
- struct journal_device *ja = &ca->journal;
- int ret = 0;
-
- struct closure cl;
- closure_init_stack(&cl);
-
- /* don't handle reducing nr of buckets yet: */
- if (nr < ja->nr)
- return 0;
-
- while (!ret && ja->nr < nr) {
- struct disk_reservation disk_res = { 0, 0, 0 };
-
- /*
- * note: journal buckets aren't really counted as _sectors_ used yet, so
- * we don't need the disk reservation to avoid the BUG_ON() in buckets.c
- * when space used goes up without a reservation - but we do need the
- * reservation to ensure we'll actually be able to allocate:
- *
- * XXX: that's not right, disk reservations only ensure a
- * filesystem-wide allocation will succeed, this is a device
- * specific allocation - we can hang here:
- */
- if (!new_fs) {
- ret = bch2_disk_reservation_get(c, &disk_res,
- bucket_to_sector(ca, nr - ja->nr), 1, 0);
- if (ret)
- break;
- }
-
- ret = bch2_set_nr_journal_buckets_iter(ca, nr, new_fs, &cl);
-
- if (ret == -BCH_ERR_bucket_alloc_blocked ||
- ret == -BCH_ERR_open_buckets_empty)
- ret = 0; /* wait and retry */
-
- bch2_disk_reservation_put(c, &disk_res);
- bch2_wait_on_allocator(c, &cl);
- }
-
- return ret;
-}
-
-/*
- * Allocate more journal space at runtime - not currently making use of it, but
- * the code works:
- */
-int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
- unsigned nr)
-{
- down_write(&c->state_lock);
- int ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, false);
- up_write(&c->state_lock);
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_dev_journal_bucket_delete(struct bch_dev *ca, u64 b)
-{
- struct bch_fs *c = ca->fs;
- struct journal *j = &c->journal;
- struct journal_device *ja = &ca->journal;
-
- guard(mutex)(&c->sb_lock);
- unsigned pos;
- for (pos = 0; pos < ja->nr; pos++)
- if (ja->buckets[pos] == b)
- break;
-
- if (pos == ja->nr) {
- bch_err(ca, "journal bucket %llu not found when deleting", b);
- return -EINVAL;
- }
-
-	u64 *new_buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
- if (!new_buckets)
- return bch_err_throw(c, ENOMEM_set_nr_journal_buckets);
-
- memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
- memmove(&new_buckets[pos],
- &new_buckets[pos + 1],
- (ja->nr - 1 - pos) * sizeof(new_buckets[0]));
-
- int ret = bch2_journal_buckets_to_sb(c, ca, ja->buckets, ja->nr - 1) ?:
- bch2_write_super(c);
- if (ret) {
- kfree(new_buckets);
- return ret;
- }
-
- scoped_guard(spinlock, &j->lock) {
- if (pos < ja->discard_idx)
- --ja->discard_idx;
- if (pos < ja->dirty_idx_ondisk)
- --ja->dirty_idx_ondisk;
- if (pos < ja->dirty_idx)
- --ja->dirty_idx;
- if (pos < ja->cur_idx)
- --ja->cur_idx;
-
- ja->nr--;
-
- memmove(&ja->buckets[pos],
- &ja->buckets[pos + 1],
- (ja->nr - pos) * sizeof(ja->buckets[0]));
-
- memmove(&ja->bucket_seq[pos],
- &ja->bucket_seq[pos + 1],
- (ja->nr - pos) * sizeof(ja->bucket_seq[0]));
-
- bch2_journal_space_available(j);
- }
-
- kfree(new_buckets);
- return 0;
-}
-
-int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
-{
- struct bch_fs *c = ca->fs;
-
- if (!(ca->mi.data_allowed & BIT(BCH_DATA_journal)))
- return 0;
-
- if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
- bch_err(c, "cannot allocate journal, filesystem is an unresized image file");
- return bch_err_throw(c, erofs_filesystem_full);
- }
-
- unsigned nr;
- int ret;
-
- if (dynamic_fault("bcachefs:add:journal_alloc")) {
- ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets);
- goto err;
- }
-
- /* 1/128th of the device by default: */
- nr = ca->mi.nbuckets >> 7;
-
- /*
- * clamp journal size to 8192 buckets or 8GB (in sectors), whichever
- * is smaller:
- */
- nr = clamp_t(unsigned, nr,
- BCH_JOURNAL_BUCKETS_MIN,
- min(1 << 13,
- (1 << 24) / ca->mi.bucket_size));
-
- ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, new_fs);
-err:
- bch_err_fn(ca, ret);
- return ret;
-}
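A hedged worked example of the sizing in bch2_dev_journal_alloc() above (device numbers assumed for illustration): with 2 MiB buckets (4096 sectors) the cap is min(1 << 13, (1 << 24) / 4096) = 4096 buckets, i.e. 8 GiB of journal, and a 1 TiB device has 524288 such buckets, so the 1/128th default of 524288 >> 7 = 4096 buckets lands exactly on that cap.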
-
-int bch2_fs_journal_alloc(struct bch_fs *c)
-{
- for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_journal_alloc) {
- if (ca->journal.nr)
- continue;
-
- int ret = bch2_dev_journal_alloc(ca, true);
- if (ret) {
- enumerated_ref_put(&ca->io_ref[READ],
- BCH_DEV_READ_REF_fs_journal_alloc);
- return ret;
- }
- }
-
- return 0;
-}
-
-/* startup/shutdown: */
-
-static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
-{
- bool ret = false;
- u64 seq;
-
- spin_lock(&j->lock);
- for (seq = journal_last_unwritten_seq(j);
- seq <= journal_cur_seq(j) && !ret;
- seq++) {
- struct journal_buf *buf = journal_seq_to_buf(j, seq);
-
- if (bch2_bkey_has_device_c(bkey_i_to_s_c(&buf->key), dev_idx))
- ret = true;
- }
- spin_unlock(&j->lock);
-
- return ret;
-}
-
-void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
-{
- wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx));
-}
-
-void bch2_fs_journal_stop(struct journal *j)
-{
- if (!test_bit(JOURNAL_running, &j->flags))
- return;
-
- bch2_journal_reclaim_stop(j);
- bch2_journal_flush_all_pins(j);
-
- wait_event(j->wait, bch2_journal_entry_close(j));
-
- /*
- * Always write a new journal entry, to make sure the clock hands are up
- * to date (and match the superblock)
- */
- __bch2_journal_meta(j);
-
- journal_quiesce(j);
- cancel_delayed_work_sync(&j->write_work);
-
- WARN(!bch2_journal_error(j) &&
- test_bit(JOURNAL_replay_done, &j->flags) &&
- j->last_empty_seq != journal_cur_seq(j),
- "journal shutdown error: cur seq %llu but last empty seq %llu",
- journal_cur_seq(j), j->last_empty_seq);
-
- if (!bch2_journal_error(j))
- clear_bit(JOURNAL_running, &j->flags);
-}
-
-int bch2_fs_journal_start(struct journal *j, u64 last_seq, u64 cur_seq)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_entry_pin_list *p;
- struct journal_replay *i, **_i;
- struct genradix_iter iter;
- bool had_entries = false;
-
-	/*
-	 * XXX pick most recent non-blacklisted sequence number
-	 */
-
- cur_seq = max(cur_seq, bch2_journal_last_blacklisted_seq(c));
-
- if (cur_seq >= JOURNAL_SEQ_MAX) {
- bch_err(c, "cannot start: journal seq overflow");
- return -EINVAL;
- }
-
- /* Clean filesystem? */
- if (!last_seq)
- last_seq = cur_seq;
-
- u64 nr = cur_seq - last_seq;
-
- /*
- * Extra fudge factor, in case we crashed when the journal pin fifo was
- * nearly or completely full. We'll need to be able to open additional
- * journal entries (at least a few) in order for journal replay to get
- * going:
- */
- nr += nr / 4;
-
- nr = max(nr, JOURNAL_PIN);
- init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
- if (!j->pin.data) {
- bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
- return bch_err_throw(c, ENOMEM_journal_pin_fifo);
- }
-
- j->replay_journal_seq = last_seq;
- j->replay_journal_seq_end = cur_seq;
- j->last_seq_ondisk = last_seq;
- j->flushed_seq_ondisk = cur_seq - 1;
- j->seq_write_started = cur_seq - 1;
- j->seq_ondisk = cur_seq - 1;
- j->pin.front = last_seq;
- j->pin.back = cur_seq;
- atomic64_set(&j->seq, cur_seq - 1);
-
- u64 seq;
- fifo_for_each_entry_ptr(p, &j->pin, seq)
- journal_pin_list_init(p, 1);
-
- genradix_for_each(&c->journal_entries, iter, _i) {
- i = *_i;
-
- if (journal_replay_ignore(i))
- continue;
-
- seq = le64_to_cpu(i->j.seq);
- BUG_ON(seq >= cur_seq);
-
- if (seq < last_seq)
- continue;
-
- if (journal_entry_empty(&i->j))
- j->last_empty_seq = le64_to_cpu(i->j.seq);
-
- p = journal_seq_pin(j, seq);
-
- p->devs.nr = 0;
- darray_for_each(i->ptrs, ptr)
- bch2_dev_list_add_dev(&p->devs, ptr->dev);
-
- had_entries = true;
- }
-
- if (!had_entries)
- j->last_empty_seq = cur_seq - 1; /* to match j->seq */
-
- spin_lock(&j->lock);
- j->last_flush_write = jiffies;
-
- j->reservations.idx = journal_cur_seq(j);
-
- c->last_bucket_seq_cleanup = journal_cur_seq(j);
- spin_unlock(&j->lock);
-
- return 0;
-}
-
-void bch2_journal_set_replay_done(struct journal *j)
-{
- /*
- * journal_space_available must happen before setting JOURNAL_running
- * JOURNAL_running must happen before JOURNAL_replay_done
- */
- spin_lock(&j->lock);
- bch2_journal_space_available(j);
-
- set_bit(JOURNAL_need_flush_write, &j->flags);
- set_bit(JOURNAL_running, &j->flags);
- set_bit(JOURNAL_replay_done, &j->flags);
- spin_unlock(&j->lock);
-}
-
-/* init/exit: */
-
-void bch2_dev_journal_exit(struct bch_dev *ca)
-{
- struct journal_device *ja = &ca->journal;
-
- for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
- kfree(ja->bio[i]);
- ja->bio[i] = NULL;
- }
-
- kfree(ja->buckets);
- kfree(ja->bucket_seq);
- ja->buckets = NULL;
- ja->bucket_seq = NULL;
-}
-
-int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
-{
- struct bch_fs *c = ca->fs;
- struct journal_device *ja = &ca->journal;
- struct bch_sb_field_journal *journal_buckets =
- bch2_sb_field_get(sb, journal);
- struct bch_sb_field_journal_v2 *journal_buckets_v2 =
- bch2_sb_field_get(sb, journal_v2);
-
- ja->nr = 0;
-
- if (journal_buckets_v2) {
- unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
-
- for (unsigned i = 0; i < nr; i++)
- ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr);
- } else if (journal_buckets) {
- ja->nr = bch2_nr_journal_buckets(journal_buckets);
- }
-
- ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
- if (!ja->bucket_seq)
- return bch_err_throw(c, ENOMEM_dev_journal_init);
-
- unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
-
- for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
- ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
- nr_bvecs), GFP_KERNEL);
- if (!ja->bio[i])
- return bch_err_throw(c, ENOMEM_dev_journal_init);
-
- ja->bio[i]->ca = ca;
- ja->bio[i]->buf_idx = i;
- bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0);
- }
-
- ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
- if (!ja->buckets)
- return bch_err_throw(c, ENOMEM_dev_journal_init);
-
- if (journal_buckets_v2) {
- unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
- unsigned dst = 0;
-
- for (unsigned i = 0; i < nr; i++)
- for (unsigned j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
- ja->buckets[dst++] =
- le64_to_cpu(journal_buckets_v2->d[i].start) + j;
- } else if (journal_buckets) {
- for (unsigned i = 0; i < ja->nr; i++)
- ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
- }
-
- return 0;
-}
-
-void bch2_fs_journal_exit(struct journal *j)
-{
- if (j->wq)
- destroy_workqueue(j->wq);
-
- darray_exit(&j->early_journal_entries);
-
- for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
- kvfree(j->buf[i].data);
- kvfree(j->free_buf);
- free_fifo(&j->pin);
-}
-
-void bch2_fs_journal_init_early(struct journal *j)
-{
- static struct lock_class_key res_key;
-
- mutex_init(&j->buf_lock);
- spin_lock_init(&j->lock);
- spin_lock_init(&j->err_lock);
- init_waitqueue_head(&j->wait);
- INIT_DELAYED_WORK(&j->write_work, journal_write_work);
- init_waitqueue_head(&j->reclaim_wait);
- init_waitqueue_head(&j->pin_flush_wait);
- mutex_init(&j->reclaim_lock);
- mutex_init(&j->discard_lock);
-
- lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
-
- atomic64_set(&j->reservations.counter,
- ((union journal_res_state)
- { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
-}
-
-int bch2_fs_journal_init(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
- j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN;
- j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL);
- if (!j->free_buf)
- return bch_err_throw(c, ENOMEM_journal_buf);
-
- for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
- j->buf[i].idx = i;
-
- j->wq = alloc_workqueue("bcachefs_journal",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512);
- if (!j->wq)
- return bch_err_throw(c, ENOMEM_fs_other_alloc);
- return 0;
-}
-
-/* debug: */
-
-static const char * const bch2_journal_flags_strs[] = {
-#define x(n) #n,
- JOURNAL_FLAGS()
-#undef x
- NULL
-};
-
-void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- union journal_res_state s;
- unsigned long now = jiffies;
- u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes;
-
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, 28);
- out->atomic++;
-
- guard(rcu)();
- s = READ_ONCE(j->reservations);
-
- prt_printf(out, "flags:\t");
- prt_bitflags(out, bch2_journal_flags_strs, j->flags);
- prt_newline(out);
- prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size);
- prt_printf(out, "seq:\t%llu\n", journal_cur_seq(j));
- prt_printf(out, "seq_ondisk:\t%llu\n", j->seq_ondisk);
- prt_printf(out, "last_seq:\t%llu\n", journal_last_seq(j));
- prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk);
- prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
- prt_printf(out, "watermark:\t%s\n", bch2_watermarks[j->watermark]);
- prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
- prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes);
- prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes);
- prt_printf(out, "average write size:\t");
- prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0);
- prt_newline(out);
- prt_printf(out, "free buf:\t%u\n", j->free_buf ? j->free_buf_size : 0);
- prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
- prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim);
- prt_printf(out, "reclaim kicked:\t%u\n", j->reclaim_kicked);
- prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
- ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
- prt_printf(out, "blocked:\t%u\n", j->blocked);
- prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
- prt_printf(out, "current entry error:\t%s\n", bch2_err_str(j->cur_entry_error));
- prt_printf(out, "current entry:\t");
-
- switch (s.cur_entry_offset) {
- case JOURNAL_ENTRY_ERROR_VAL:
- prt_printf(out, "error\n");
- break;
- case JOURNAL_ENTRY_CLOSED_VAL:
- prt_printf(out, "closed\n");
- break;
- case JOURNAL_ENTRY_BLOCKED_VAL:
- prt_printf(out, "blocked\n");
- break;
- default:
- prt_printf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s);
- break;
- }
-
- prt_printf(out, "unwritten entries:\n");
- bch2_journal_bufs_to_text(out, j);
-
- prt_printf(out, "space:\n");
- printbuf_indent_add(out, 2);
- prt_printf(out, "discarded\t%u:%u\n",
- j->space[journal_space_discarded].next_entry,
- j->space[journal_space_discarded].total);
- prt_printf(out, "clean ondisk\t%u:%u\n",
- j->space[journal_space_clean_ondisk].next_entry,
- j->space[journal_space_clean_ondisk].total);
- prt_printf(out, "clean\t%u:%u\n",
- j->space[journal_space_clean].next_entry,
- j->space[journal_space_clean].total);
- prt_printf(out, "total\t%u:%u\n",
- j->space[journal_space_total].next_entry,
- j->space[journal_space_total].total);
- printbuf_indent_sub(out, 2);
-
- for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
- if (!ca->mi.durability)
- continue;
-
- struct journal_device *ja = &ca->journal;
-
- if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d))
- continue;
-
- if (!ja->nr)
- continue;
-
- prt_printf(out, "dev %u:\n", ca->dev_idx);
- prt_printf(out, "durability %u:\n", ca->mi.durability);
- printbuf_indent_add(out, 2);
- prt_printf(out, "nr\t%u\n", ja->nr);
- prt_printf(out, "bucket size\t%u\n", ca->mi.bucket_size);
- prt_printf(out, "available\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free);
- prt_printf(out, "discard_idx\t%u\n", ja->discard_idx);
- prt_printf(out, "dirty_ondisk\t%u (seq %llu)\n",ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]);
- prt_printf(out, "dirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]);
- prt_printf(out, "cur_idx\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]);
- printbuf_indent_sub(out, 2);
- }
-
- prt_printf(out, "replicas want %u need %u\n", c->opts.metadata_replicas, c->opts.metadata_replicas_required);
-
- --out->atomic;
-}
-
-void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
-{
- spin_lock(&j->lock);
- __bch2_journal_debug_to_text(out, j);
- spin_unlock(&j->lock);
-}
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
deleted file mode 100644
index 977907038d98..000000000000
--- a/fs/bcachefs/journal.h
+++ /dev/null
@@ -1,465 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_JOURNAL_H
-#define _BCACHEFS_JOURNAL_H
-
-/*
- * THE JOURNAL:
- *
- * The primary purpose of the journal is to log updates (insertions) to the
- * b-tree, to avoid having to do synchronous updates to the b-tree on disk.
- *
- * Without the journal, the b-tree is always internally consistent on
- * disk - and in fact, in the earliest incarnations bcache didn't have a journal
- * but did handle unclean shutdowns by doing all index updates synchronously
- * (with coalescing).
- *
- * Updates to interior nodes still happen synchronously and without the journal
- * (for simplicity) - this may change eventually but updates to interior nodes
- * are rare enough it's not a huge priority.
- *
- * This means the journal is relatively separate from the b-tree; it consists of
- * just a list of keys and journal replay consists of just redoing those
- * insertions in same order that they appear in the journal.
- *
- * PERSISTENCE:
- *
- * For synchronous updates (where we're waiting on the index update to hit
- * disk), the journal entry will be written out immediately (or as soon as
- * possible, if the write for the previous journal entry was still in flight).
- *
- * Synchronous updates are specified by passing a closure (@flush_cl) to
- * bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter
- * down to the journalling code. That closure will wait on the journal write to
- * complete (via closure_wait()).
- *
- * If the index update wasn't synchronous, the journal entry will be
- * written out after 10 ms have elapsed, by default (the delay_ms field
- * in struct journal).
- *
- * JOURNAL ENTRIES:
- *
- * A journal entry is variable size (struct jset), it's got a fixed length
- * header and then a variable number of struct jset_entry entries.
- *
- * Journal entries are identified by monotonically increasing 64 bit sequence
- * numbers - jset->seq; other places in the code refer to this sequence number.
- *
- * A jset_entry entry contains one or more bkeys (which is what gets inserted
- * into the b-tree). We need a container to indicate which b-tree the key is
- * for; also, the roots of the various b-trees are stored in jset_entry entries
- * (one for each b-tree) - this lets us add new b-tree types without changing
- * the on disk format.
- *
- * We also keep some things in the journal header that are logically part of the
- * superblock - all the things that are frequently updated. This is for future
- * bcache on raw flash support; the superblock (which will become another
- * journal) can't be moved or wear leveled, so it contains just enough
- * information to find the main journal, and the superblock only has to be
- * rewritten when we want to move/wear level the main journal.
- *
- * JOURNAL LAYOUT ON DISK:
- *
- * The journal is written to a ringbuffer of buckets (which is kept in the
- * superblock); the individual buckets are not necessarily contiguous on disk
- * which means that journal entries are not allowed to span buckets, but also
- * that we can resize the journal at runtime if desired (unimplemented).
- *
- * The journal buckets exist in the same pool as all the other buckets that are
- * managed by the allocator and garbage collection - garbage collection marks
- * the journal buckets as metadata buckets.
- *
- * OPEN/DIRTY JOURNAL ENTRIES:
- *
- * Open/dirty journal entries are journal entries that contain b-tree updates
- * that have not yet been written out to the b-tree on disk. We have to track
- * which journal entries are dirty, and we also have to avoid wrapping around
- * the journal and overwriting old but still dirty journal entries with new
- * journal entries.
- *
- * On disk, this is represented with the "last_seq" field of struct jset;
- * last_seq is the first sequence number that journal replay has to replay.
- *
- * To avoid overwriting dirty journal entries on disk, we keep a mapping (in
- * journal_device->seq) from each journal bucket to the highest sequence number
- * of any journal entry it contains. Then, by comparing that against last_seq we
- * can determine whether that journal bucket contains dirty journal entries or
- * not.
- *
- * To track which journal entries are dirty, we maintain a fifo of refcounts
- * (where each entry corresponds to a specific sequence number) - when a ref
- * goes to 0, that journal entry is no longer dirty.
- *
- * Journalling of index updates is done at the same time as the b-tree itself is
- * being modified (see btree_insert_key()); when we add the key to the journal
- * the pending b-tree write takes a ref on the journal entry the key was added
- * to. If a pending b-tree write would need to take refs on multiple dirty
- * journal entries, it only keeps the ref on the oldest one (since a newer
- * journal entry will still be replayed if an older entry was dirty).
- *
- * JOURNAL FILLING UP:
- *
- * There are two ways the journal could fill up; either we could run out of
- * space to write to, or we could have too many open journal entries and run out
- * of room in the fifo of refcounts. Since those refcounts are decremented
- * without any locking we can't safely resize that fifo, so we handle it the
- * same way.
- *
- * If the journal fills up, we start flushing dirty btree nodes until we can
- * allocate space for a journal write again - preferentially flushing btree
- * nodes that are pinning the oldest journal entries first.
- */
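A brief worked illustration of the dirty-bucket check described above, with invented numbers:

/*
 * Example (illustrative only): suppose last_seq == 10 and three journal
 * buckets have highest-contained sequence numbers 12, 7 and 15.  The bucket
 * whose newest entry is seq 7 holds only entries older than last_seq, so it
 * contains no dirty entries and may be reused; the buckets holding seqs 12
 * and 15 are still dirty and must not be overwritten until last_seq advances
 * past them.
 */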
-
-#include <linux/hash.h>
-
-#include "journal_types.h"
-
-struct bch_fs;
-
-static inline void journal_wake(struct journal *j)
-{
- wake_up(&j->wait);
- closure_wake_up(&j->async_wait);
-}
-
-/* Sequence number of oldest dirty journal entry */
-
-static inline u64 journal_last_seq(struct journal *j)
-{
- return j->pin.front;
-}
-
-static inline u64 journal_cur_seq(struct journal *j)
-{
- return atomic64_read(&j->seq);
-}
-
-static inline u64 journal_last_unwritten_seq(struct journal *j)
-{
- return j->seq_ondisk + 1;
-}
-
-static inline struct journal_buf *journal_cur_buf(struct journal *j)
-{
- unsigned idx = (journal_cur_seq(j) &
- JOURNAL_BUF_MASK &
- ~JOURNAL_STATE_BUF_MASK) + j->reservations.idx;
-
- return j->buf + idx;
-}
-
-static inline int journal_state_count(union journal_res_state s, int idx)
-{
- switch (idx) {
- case 0: return s.buf0_count;
- case 1: return s.buf1_count;
- case 2: return s.buf2_count;
- case 3: return s.buf3_count;
- }
- BUG();
-}
-
-static inline int journal_state_seq_count(struct journal *j,
- union journal_res_state s, u64 seq)
-{
- if (journal_cur_seq(j) - seq < JOURNAL_STATE_BUF_NR)
- return journal_state_count(s, seq & JOURNAL_STATE_BUF_MASK);
- else
- return 0;
-}
-
-static inline void journal_state_inc(union journal_res_state *s)
-{
- s->buf0_count += s->idx == 0;
- s->buf1_count += s->idx == 1;
- s->buf2_count += s->idx == 2;
- s->buf3_count += s->idx == 3;
-}
-
-/*
- * Amount of space that will be taken up by some keys in the journal (i.e.
- * including the jset_entry header)
- */
-static inline unsigned jset_u64s(unsigned u64s)
-{
- return u64s + sizeof(struct jset_entry) / sizeof(u64);
-}
-
-static inline int journal_entry_overhead(struct journal *j)
-{
- return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved;
-}
-
-static inline struct jset_entry *
-bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
-{
- struct jset *jset = buf->data;
- struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s));
-
- memset(entry, 0, sizeof(*entry));
- entry->u64s = cpu_to_le16(u64s);
-
- le32_add_cpu(&jset->u64s, jset_u64s(u64s));
-
- return entry;
-}
-
-static inline struct jset_entry *
-journal_res_entry(struct journal *j, struct journal_res *res)
-{
- return vstruct_idx(j->buf[res->seq & JOURNAL_BUF_MASK].data, res->offset);
-}
-
-static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type,
- enum btree_id id, unsigned level,
- unsigned u64s)
-{
- entry->u64s = cpu_to_le16(u64s);
- entry->btree_id = id;
- entry->level = level;
- entry->type = type;
- entry->pad[0] = 0;
- entry->pad[1] = 0;
- entry->pad[2] = 0;
- return jset_u64s(u64s);
-}
-
-static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type,
- enum btree_id id, unsigned level,
- const void *data, unsigned u64s)
-{
- unsigned ret = journal_entry_init(entry, type, id, level, u64s);
-
- memcpy_u64s_small(entry->_data, data, u64s);
- return ret;
-}
-
-static inline struct jset_entry *
-bch2_journal_add_entry(struct journal *j, struct journal_res *res,
- unsigned type, enum btree_id id,
- unsigned level, unsigned u64s)
-{
- struct jset_entry *entry = journal_res_entry(j, res);
- unsigned actual = journal_entry_init(entry, type, id, level, u64s);
-
- EBUG_ON(!res->ref);
- EBUG_ON(actual > res->u64s);
-
- res->offset += actual;
- res->u64s -= actual;
- return entry;
-}
-
-static inline bool journal_entry_empty(struct jset *j)
-{
- if (j->seq != j->last_seq)
- return false;
-
- vstruct_for_each(j, i)
- if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s)
- return false;
- return true;
-}
-
-/*
- * Drop a reference on a buffer index and return the new reservation state;
- * the caller checks whether that buffer's count has hit zero.
- */
-static inline union journal_res_state journal_state_buf_put(struct journal *j, unsigned idx)
-{
- union journal_res_state s;
-
- s.v = atomic64_sub_return(((union journal_res_state) {
- .buf0_count = idx == 0,
- .buf1_count = idx == 1,
- .buf2_count = idx == 2,
- .buf3_count = idx == 3,
- }).v, &j->reservations.counter);
- return s;
-}
-
-bool bch2_journal_entry_close(struct journal *);
-void bch2_journal_do_writes(struct journal *);
-void bch2_journal_buf_put_final(struct journal *, u64);
-
-static inline void __bch2_journal_buf_put(struct journal *j, u64 seq)
-{
- unsigned idx = seq & JOURNAL_STATE_BUF_MASK;
- union journal_res_state s;
-
- s = journal_state_buf_put(j, idx);
- if (!journal_state_count(s, idx))
- bch2_journal_buf_put_final(j, seq);
-}
-
-static inline void bch2_journal_buf_put(struct journal *j, u64 seq)
-{
- unsigned idx = seq & JOURNAL_STATE_BUF_MASK;
- union journal_res_state s;
-
- s = journal_state_buf_put(j, idx);
- if (!journal_state_count(s, idx)) {
- spin_lock(&j->lock);
- bch2_journal_buf_put_final(j, seq);
- spin_unlock(&j->lock);
- } else if (unlikely(s.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL))
- wake_up(&j->wait);
-}
-
-/*
- * This function releases the journal reservation so that other threads can
- * then proceed to add their keys as well.
- */
-static inline void bch2_journal_res_put(struct journal *j,
- struct journal_res *res)
-{
- if (!res->ref)
- return;
-
- lock_release(&j->res_map, _THIS_IP_);
-
- while (res->u64s)
- bch2_journal_add_entry(j, res,
- BCH_JSET_ENTRY_btree_keys,
- 0, 0, 0);
-
- bch2_journal_buf_put(j, res->seq);
-
- res->ref = 0;
-}
-
-int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
- unsigned, struct btree_trans *);
-
-/* First bits for BCH_WATERMARK: */
-enum journal_res_flags {
- __JOURNAL_RES_GET_NONBLOCK = BCH_WATERMARK_BITS,
- __JOURNAL_RES_GET_CHECK,
-};
-
-#define JOURNAL_RES_GET_NONBLOCK (1 << __JOURNAL_RES_GET_NONBLOCK)
-#define JOURNAL_RES_GET_CHECK (1 << __JOURNAL_RES_GET_CHECK)
-
-static inline int journal_res_get_fast(struct journal *j,
- struct journal_res *res,
- unsigned flags)
-{
- union journal_res_state old, new;
-
- old.v = atomic64_read(&j->reservations.counter);
- do {
- new.v = old.v;
-
- /*
- * Check if there is still room in the current journal
-		 * entry; smp_rmb() guarantees that reads from reservations.counter
- * occur before accessing cur_entry_u64s:
- */
- smp_rmb();
- if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)
- return 0;
-
- EBUG_ON(!journal_state_count(new, new.idx));
-
- if ((flags & BCH_WATERMARK_MASK) < j->watermark)
- return 0;
-
- new.cur_entry_offset += res->u64s;
- journal_state_inc(&new);
-
- /*
- * If the refcount would overflow, we have to wait:
- * XXX - tracepoint this:
- */
- if (!journal_state_count(new, new.idx))
- return 0;
-
- if (flags & JOURNAL_RES_GET_CHECK)
- return 1;
- } while (!atomic64_try_cmpxchg(&j->reservations.counter,
- &old.v, new.v));
-
- res->ref = true;
- res->offset = old.cur_entry_offset;
- res->seq = journal_cur_seq(j);
- res->seq -= (res->seq - old.idx) & JOURNAL_STATE_BUF_MASK;
- return 1;
-}
-
-static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res,
- unsigned u64s, unsigned flags,
- struct btree_trans *trans)
-{
- int ret;
-
- EBUG_ON(res->ref);
- EBUG_ON(!test_bit(JOURNAL_running, &j->flags));
-
- res->u64s = u64s;
-
- if (journal_res_get_fast(j, res, flags))
- goto out;
-
- ret = bch2_journal_res_get_slowpath(j, res, flags, trans);
- if (ret)
- return ret;
-out:
- if (!(flags & JOURNAL_RES_GET_CHECK)) {
- lock_acquire_shared(&j->res_map, 0,
- (flags & JOURNAL_RES_GET_NONBLOCK) != 0,
- NULL, _THIS_IP_);
- EBUG_ON(!res->ref);
- BUG_ON(!res->seq);
- }
- return 0;
-}
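To make the reservation flow concrete, the sketch below (illustrative only, not part of the original file; the function name is hypothetical, and watermark selection and error handling are elided) strings together the helpers declared in this header to journal a single key:

static int example_journal_one_key(struct journal *j, struct btree_trans *trans,
				   enum btree_id btree, struct bkey_i *k)
{
	struct journal_res res = {};

	/* Reserve room for the key plus its jset_entry header: */
	int ret = bch2_journal_res_get(j, &res, jset_u64s(k->k.u64s), 0, trans);
	if (ret)
		return ret;

	/* Copy the key into the open journal entry as a btree_keys entry: */
	struct jset_entry *entry =
		bch2_journal_add_entry(j, &res, BCH_JSET_ENTRY_btree_keys,
				       btree, 0, k->k.u64s);
	memcpy_u64s_small(entry->_data, k, k->k.u64s);

	/*
	 * Drop the reservation; the entry is now dirty.  A synchronous caller
	 * would then wait for res.seq to reach disk, e.g. via
	 * bch2_journal_flush_seq().
	 */
	bch2_journal_res_put(j, &res);
	return 0;
}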
-
-/* journal_entry_res: */
-
-void bch2_journal_entry_res_resize(struct journal *,
- struct journal_entry_res *,
- unsigned);
-
-int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
-void bch2_journal_flush_async(struct journal *, struct closure *);
-
-int bch2_journal_flush_seq(struct journal *, u64, unsigned);
-int bch2_journal_flush(struct journal *);
-bool bch2_journal_noflush_seq(struct journal *, u64, u64);
-int bch2_journal_meta(struct journal *);
-
-void bch2_journal_halt_locked(struct journal *);
-void bch2_journal_halt(struct journal *);
-
-static inline int bch2_journal_error(struct journal *j)
-{
- return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL
- ? -BCH_ERR_journal_shutdown : 0;
-}
-
-struct bch_dev;
-
-void bch2_journal_unblock(struct journal *);
-void bch2_journal_block(struct journal *);
-struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u64, bool *);
-
-void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
-void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
-
-int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned);
-int bch2_dev_journal_bucket_delete(struct bch_dev *, u64);
-
-int bch2_dev_journal_alloc(struct bch_dev *, bool);
-int bch2_fs_journal_alloc(struct bch_fs *);
-
-void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
-
-void bch2_fs_journal_stop(struct journal *);
-int bch2_fs_journal_start(struct journal *, u64, u64);
-void bch2_journal_set_replay_done(struct journal *);
-
-void bch2_dev_journal_exit(struct bch_dev *);
-int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
-void bch2_fs_journal_exit(struct journal *);
-void bch2_fs_journal_init_early(struct journal *);
-int bch2_fs_journal_init(struct journal *);
-
-#endif /* _BCACHEFS_JOURNAL_H */
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
deleted file mode 100644
index 9e028dbcc3d0..000000000000
--- a/fs/bcachefs/journal_io.c
+++ /dev/null
@@ -1,2242 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "btree_io.h"
-#include "btree_update_interior.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "disk_groups.h"
-#include "error.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "journal_reclaim.h"
-#include "journal_seq_blacklist.h"
-#include "replicas.h"
-#include "sb-clean.h"
-#include "trace.h"
-
-#include <linux/ioprio.h>
-#include <linux/string_choices.h>
-#include <linux/sched/sysctl.h>
-
-void bch2_journal_pos_from_member_info_set(struct bch_fs *c)
-{
- lockdep_assert_held(&c->sb_lock);
-
- for_each_member_device(c, ca) {
- struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
-
- m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx);
- m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free);
- }
-}
-
-void bch2_journal_pos_from_member_info_resume(struct bch_fs *c)
-{
- mutex_lock(&c->sb_lock);
- for_each_member_device(c, ca) {
- struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx);
-
- unsigned idx = le32_to_cpu(m.last_journal_bucket);
- if (idx < ca->journal.nr)
- ca->journal.cur_idx = idx;
- unsigned offset = le32_to_cpu(m.last_journal_bucket_offset);
- if (offset <= ca->mi.bucket_size)
- ca->journal.sectors_free = ca->mi.bucket_size - offset;
- }
- mutex_unlock(&c->sb_lock);
-}
-
-static void bch2_journal_ptr_to_text(struct printbuf *out, struct bch_fs *c, struct journal_ptr *p)
-{
- struct bch_dev *ca = bch2_dev_tryget_noerror(c, p->dev);
- prt_printf(out, "%s %u:%u:%u (sector %llu)",
- ca ? ca->name : "(invalid dev)",
- p->dev, p->bucket, p->bucket_offset, p->sector);
- bch2_dev_put(ca);
-}
-
-void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct journal_replay *j)
-{
- darray_for_each(j->ptrs, i) {
- if (i != j->ptrs.data)
- prt_printf(out, " ");
- bch2_journal_ptr_to_text(out, c, i);
- }
-}
-
-static void bch2_journal_datetime_to_text(struct printbuf *out, struct jset *j)
-{
- for_each_jset_entry_type(entry, j, BCH_JSET_ENTRY_datetime) {
- struct jset_entry_datetime *datetime =
- container_of(entry, struct jset_entry_datetime, entry);
- bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
- break;
- }
-}
-
-static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
- struct journal_replay *j)
-{
- prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));
- bch2_journal_datetime_to_text(out, &j->j);
- prt_char(out, ' ');
- bch2_journal_ptrs_to_text(out, c, j);
-}
-
-static struct nonce journal_nonce(const struct jset *jset)
-{
- return (struct nonce) {{
- [0] = 0,
- [1] = ((__le32 *) &jset->seq)[0],
- [2] = ((__le32 *) &jset->seq)[1],
- [3] = BCH_NONCE_JOURNAL,
- }};
-}
-
-static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum)
-{
- if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) {
- *csum = (struct bch_csum) {};
- return false;
- }
-
- *csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
- return !bch2_crc_cmp(j->csum, *csum);
-}
-
-static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
-{
- return (seq - c->journal_entries_base_seq) & (~0U >> 1);
-}
-
-static void __journal_replay_free(struct bch_fs *c,
- struct journal_replay *i)
-{
- struct journal_replay **p =
- genradix_ptr(&c->journal_entries,
- journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)));
-
- BUG_ON(*p != i);
- *p = NULL;
- kvfree(i);
-}
-
-static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted)
-{
- if (blacklisted)
- i->ignore_blacklisted = true;
- else
- i->ignore_not_dirty = true;
-
- if (!c->opts.read_entire_journal)
- __journal_replay_free(c, i);
-}
-
-struct journal_list {
- struct closure cl;
- u64 last_seq;
- struct mutex lock;
- int ret;
-};
-
-#define JOURNAL_ENTRY_ADD_OK 0
-#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5
-
-/*
- * Given a journal entry we just read, add it to the list of journal entries to
- * be replayed:
- */
-static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
- struct journal_ptr entry_ptr,
- struct journal_list *jlist, struct jset *j)
-{
- struct genradix_iter iter;
- struct journal_replay **_i, *i, *dup;
- size_t bytes = vstruct_bytes(j);
- u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
- struct printbuf buf = PRINTBUF;
- int ret = JOURNAL_ENTRY_ADD_OK;
-
- if (last_seq && c->opts.journal_rewind)
- last_seq = min(last_seq, c->opts.journal_rewind);
-
- if (!c->journal.oldest_seq_found_ondisk ||
- le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk)
- c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq);
-
- /* Is this entry older than the range we need? */
- if (!c->opts.read_entire_journal &&
- le64_to_cpu(j->seq) < jlist->last_seq)
- return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
-
- /*
- * genradixes are indexed by a ulong, not a u64, so we can't index them
-	 * by sequence number directly; assume instead that they will all fall
-	 * within +-2 billion of the first one we find.
- */
- if (!c->journal_entries_base_seq)
- c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);
-
- /* Drop entries we don't need anymore */
- if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
- genradix_for_each_from(&c->journal_entries, iter, _i,
- journal_entry_radix_idx(c, jlist->last_seq)) {
- i = *_i;
-
- if (journal_replay_ignore(i))
- continue;
-
- if (le64_to_cpu(i->j.seq) >= last_seq)
- break;
-
- journal_replay_free(c, i, false);
- }
- }
-
- jlist->last_seq = max(jlist->last_seq, last_seq);
-
- _i = genradix_ptr_alloc(&c->journal_entries,
- journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
- GFP_KERNEL);
- if (!_i)
- return bch_err_throw(c, ENOMEM_journal_entry_add);
-
- /*
- * Duplicate journal entries? If so we want the one that didn't have a
- * checksum error:
- */
- dup = *_i;
- if (dup) {
- bool identical = bytes == vstruct_bytes(&dup->j) &&
- !memcmp(j, &dup->j, bytes);
- bool not_identical = !identical &&
- entry_ptr.csum_good &&
- dup->csum_good;
-
- bool same_device = false;
- darray_for_each(dup->ptrs, ptr)
- if (ptr->dev == ca->dev_idx)
- same_device = true;
-
- ret = darray_push(&dup->ptrs, entry_ptr);
- if (ret)
- goto out;
-
- bch2_journal_replay_to_text(&buf, c, dup);
-
- fsck_err_on(same_device,
- c, journal_entry_dup_same_device,
- "duplicate journal entry on same device\n%s",
- buf.buf);
-
- fsck_err_on(not_identical,
- c, journal_entry_replicas_data_mismatch,
- "found duplicate but non identical journal entries\n%s",
- buf.buf);
-
- if (entry_ptr.csum_good && !identical)
- goto replace;
-
- goto out;
- }
-replace:
- i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
- if (!i)
- return bch_err_throw(c, ENOMEM_journal_entry_add);
-
- darray_init(&i->ptrs);
- i->csum_good = entry_ptr.csum_good;
- i->ignore_blacklisted = false;
- i->ignore_not_dirty = false;
- unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
-
- if (dup) {
- /* The first ptr should represent the jset we kept: */
- darray_for_each(dup->ptrs, ptr)
- darray_push(&i->ptrs, *ptr);
- __journal_replay_free(c, dup);
- } else {
- darray_push(&i->ptrs, entry_ptr);
- }
-
- *_i = i;
-out:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
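A hedged illustration of the base-sequence offset used above (values invented; the helper name is hypothetical): once journal_entries_base_seq has been set, consecutive sequence numbers land in consecutive radix slots.

static void example_radix_idx(struct bch_fs *c)
{
	/* Suppose the first journal entry found on disk has seq 1000000: */
	u64 first_seq = 1000000;

	c->journal_entries_base_seq = max_t(s64, 1, first_seq - S32_MAX);

	/* Consecutive sequence numbers map to consecutive radix indices: */
	BUG_ON(journal_entry_radix_idx(c, first_seq + 1) !=
	       journal_entry_radix_idx(c, first_seq) + 1);
}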
-
-/* this fills in a range with empty jset_entries: */
-static void journal_entry_null_range(void *start, void *end)
-{
- struct jset_entry *entry;
-
- for (entry = start; entry != end; entry = vstruct_next(entry))
- memset(entry, 0, sizeof(*entry));
-}
-
-#define JOURNAL_ENTRY_REREAD 5
-#define JOURNAL_ENTRY_NONE 6
-#define JOURNAL_ENTRY_BAD 7
-
-static void journal_entry_err_msg(struct printbuf *out,
- u32 version,
- struct jset *jset,
- struct jset_entry *entry)
-{
- prt_str(out, "invalid journal entry, version=");
- bch2_version_to_text(out, version);
-
- if (entry) {
- prt_str(out, " type=");
- bch2_prt_jset_entry_type(out, entry->type);
- }
-
- if (!jset) {
- prt_printf(out, " in superblock");
- } else {
-
- prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq));
-
- if (entry)
- prt_printf(out, " offset=%zi/%u",
- (u64 *) entry - jset->_data,
- le32_to_cpu(jset->u64s));
- }
-
- prt_str(out, ": ");
-}
-
-#define journal_entry_err(c, version, jset, entry, _err, msg, ...) \
-({ \
- struct printbuf _buf = PRINTBUF; \
- \
- journal_entry_err_msg(&_buf, version, jset, entry); \
- prt_printf(&_buf, msg, ##__VA_ARGS__); \
- \
- switch (from.flags & BCH_VALIDATE_write) { \
- case READ: \
- mustfix_fsck_err(c, _err, "%s", _buf.buf); \
- break; \
- case WRITE: \
- bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \
- if (bch2_fs_inconsistent(c, \
- "corrupt metadata before write: %s\n", _buf.buf)) {\
- ret = bch_err_throw(c, fsck_errors_not_fixed); \
- goto fsck_err; \
- } \
- break; \
- } \
- \
- printbuf_exit(&_buf); \
- true; \
-})
-
-#define journal_entry_err_on(cond, ...) \
- ((cond) ? journal_entry_err(__VA_ARGS__) : false)
-
-#define FSCK_DELETED_KEY 5
-
-static int journal_validate_key(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- struct bkey_i *k,
- struct bkey_validate_context from,
- unsigned version, int big_endian)
-{
- enum bch_validate_flags flags = from.flags;
- int write = flags & BCH_VALIDATE_write;
- void *next = vstruct_next(entry);
- int ret = 0;
-
- if (journal_entry_err_on(!k->k.u64s,
- c, version, jset, entry,
- journal_entry_bkey_u64s_0,
- "k->u64s 0")) {
- entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
- journal_entry_null_range(vstruct_next(entry), next);
- return FSCK_DELETED_KEY;
- }
-
- if (journal_entry_err_on((void *) bkey_next(k) >
- (void *) vstruct_next(entry),
- c, version, jset, entry,
- journal_entry_bkey_past_end,
- "extends past end of journal entry")) {
- entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
- journal_entry_null_range(vstruct_next(entry), next);
- return FSCK_DELETED_KEY;
- }
-
- if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
- c, version, jset, entry,
- journal_entry_bkey_bad_format,
- "bad format %u", k->k.format)) {
- le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
- memmove(k, bkey_next(k), next - (void *) bkey_next(k));
- journal_entry_null_range(vstruct_next(entry), next);
- return FSCK_DELETED_KEY;
- }
-
- if (!write)
- bch2_bkey_compat(from.level, from.btree, version, big_endian,
- write, NULL, bkey_to_packed(k));
-
- ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), from);
- if (ret == -BCH_ERR_fsck_delete_bkey) {
- le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
- memmove(k, bkey_next(k), next - (void *) bkey_next(k));
- journal_entry_null_range(vstruct_next(entry), next);
- return FSCK_DELETED_KEY;
- }
- if (ret)
- goto fsck_err;
-
- if (write)
- bch2_bkey_compat(from.level, from.btree, version, big_endian,
- write, NULL, bkey_to_packed(k));
-fsck_err:
- return ret;
-}
-
-static int journal_entry_btree_keys_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- struct bkey_i *k = entry->start;
-
- from.level = entry->level;
- from.btree = entry->btree_id;
-
- while (k != vstruct_last(entry)) {
- int ret = journal_validate_key(c, jset, entry, k, from, version, big_endian);
- if (ret == FSCK_DELETED_KEY)
- continue;
- else if (ret)
- return ret;
-
- k = bkey_next(k);
- }
-
- return 0;
-}
-
-static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- bool first = true;
-
- jset_entry_for_each_key(entry, k) {
- /* We may be called on entries that haven't been validated: */
- if (!k->k.u64s)
- break;
-
- if (!first) {
- prt_newline(out);
- bch2_prt_jset_entry_type(out, entry->type);
- prt_str(out, ": ");
- }
- bch2_btree_id_level_to_text(out, entry->btree_id, entry->level);
- prt_char(out, ' ');
- bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
- first = false;
- }
-}
-
-static int journal_entry_btree_root_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- struct bkey_i *k = entry->start;
- int ret = 0;
-
- from.root = true;
- from.level = entry->level + 1;
- from.btree = entry->btree_id;
-
- if (journal_entry_err_on(!entry->u64s ||
- le16_to_cpu(entry->u64s) != k->k.u64s,
- c, version, jset, entry,
- journal_entry_btree_root_bad_size,
- "invalid btree root journal entry: wrong number of keys")) {
- void *next = vstruct_next(entry);
- /*
- * we don't want to null out this jset_entry,
- * just the contents, so that later we can tell
- * we were _supposed_ to have a btree root
- */
- entry->u64s = 0;
- journal_entry_null_range(vstruct_next(entry), next);
- return 0;
- }
-
- ret = journal_validate_key(c, jset, entry, k, from, version, big_endian);
- if (ret == FSCK_DELETED_KEY)
- ret = 0;
-fsck_err:
- return ret;
-}
-
-static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- journal_entry_btree_keys_to_text(out, c, entry);
-}
-
-static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- /* obsolete, don't care: */
- return 0;
-}
-
-static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
-}
-
-static int journal_entry_blacklist_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
- c, version, jset, entry,
- journal_entry_blacklist_bad_size,
- "invalid journal seq blacklist entry: bad size")) {
- journal_entry_null_range(entry, vstruct_next(entry));
- }
-fsck_err:
- return ret;
-}
-
-static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- struct jset_entry_blacklist *bl =
- container_of(entry, struct jset_entry_blacklist, entry);
-
- prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq));
-}
-
-static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- struct jset_entry_blacklist_v2 *bl_entry;
- int ret = 0;
-
- if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
- c, version, jset, entry,
- journal_entry_blacklist_v2_bad_size,
- "invalid journal seq blacklist entry: bad size")) {
- journal_entry_null_range(entry, vstruct_next(entry));
- goto out;
- }
-
- bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
-
- if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
- le64_to_cpu(bl_entry->end),
- c, version, jset, entry,
- journal_entry_blacklist_v2_start_past_end,
- "invalid journal seq blacklist entry: start > end")) {
- journal_entry_null_range(entry, vstruct_next(entry));
- }
-out:
-fsck_err:
- return ret;
-}
-
-static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- struct jset_entry_blacklist_v2 *bl =
- container_of(entry, struct jset_entry_blacklist_v2, entry);
-
- prt_printf(out, "start=%llu end=%llu",
- le64_to_cpu(bl->start),
- le64_to_cpu(bl->end));
-}
-
-static int journal_entry_usage_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- struct jset_entry_usage *u =
- container_of(entry, struct jset_entry_usage, entry);
- unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
- int ret = 0;
-
- if (journal_entry_err_on(bytes < sizeof(*u),
- c, version, jset, entry,
- journal_entry_usage_bad_size,
- "invalid journal entry usage: bad size")) {
- journal_entry_null_range(entry, vstruct_next(entry));
- return ret;
- }
-
-fsck_err:
- return ret;
-}
-
-static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- struct jset_entry_usage *u =
- container_of(entry, struct jset_entry_usage, entry);
-
- prt_str(out, "type=");
- bch2_prt_fs_usage_type(out, u->entry.btree_id);
- prt_printf(out, " v=%llu", le64_to_cpu(u->v));
-}
-
-static int journal_entry_data_usage_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- struct jset_entry_data_usage *u =
- container_of(entry, struct jset_entry_data_usage, entry);
- unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
- struct printbuf err = PRINTBUF;
- int ret = 0;
-
- if (journal_entry_err_on(bytes < sizeof(*u) ||
- bytes < sizeof(*u) + u->r.nr_devs,
- c, version, jset, entry,
- journal_entry_data_usage_bad_size,
- "invalid journal entry usage: bad size")) {
- journal_entry_null_range(entry, vstruct_next(entry));
- goto out;
- }
-
- if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err),
- c, version, jset, entry,
- journal_entry_data_usage_bad_size,
- "invalid journal entry usage: %s", err.buf)) {
- journal_entry_null_range(entry, vstruct_next(entry));
- goto out;
- }
-out:
-fsck_err:
- printbuf_exit(&err);
- return ret;
-}
-
-static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- struct jset_entry_data_usage *u =
- container_of(entry, struct jset_entry_data_usage, entry);
-
- bch2_replicas_entry_to_text(out, &u->r);
- prt_printf(out, "=%llu", le64_to_cpu(u->v));
-}
-
-static int journal_entry_clock_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- struct jset_entry_clock *clock =
- container_of(entry, struct jset_entry_clock, entry);
- unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
- int ret = 0;
-
- if (journal_entry_err_on(bytes != sizeof(*clock),
- c, version, jset, entry,
- journal_entry_clock_bad_size,
- "bad size")) {
- journal_entry_null_range(entry, vstruct_next(entry));
- return ret;
- }
-
- if (journal_entry_err_on(clock->rw > 1,
- c, version, jset, entry,
- journal_entry_clock_bad_rw,
- "bad rw")) {
- journal_entry_null_range(entry, vstruct_next(entry));
- return ret;
- }
-
-fsck_err:
- return ret;
-}
-
-static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- struct jset_entry_clock *clock =
- container_of(entry, struct jset_entry_clock, entry);
-
- prt_printf(out, "%s=%llu", str_write_read(clock->rw), le64_to_cpu(clock->time));
-}
-
-static int journal_entry_dev_usage_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- struct jset_entry_dev_usage *u =
- container_of(entry, struct jset_entry_dev_usage, entry);
- unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
- unsigned expected = sizeof(*u);
- int ret = 0;
-
- if (journal_entry_err_on(bytes < expected,
- c, version, jset, entry,
- journal_entry_dev_usage_bad_size,
- "bad size (%u < %u)",
- bytes, expected)) {
- journal_entry_null_range(entry, vstruct_next(entry));
- return ret;
- }
-
- if (journal_entry_err_on(u->pad,
- c, version, jset, entry,
- journal_entry_dev_usage_bad_pad,
- "bad pad")) {
- journal_entry_null_range(entry, vstruct_next(entry));
- return ret;
- }
-
-fsck_err:
- return ret;
-}
-
-static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- struct jset_entry_dev_usage *u =
- container_of(entry, struct jset_entry_dev_usage, entry);
- unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
-
- if (vstruct_bytes(entry) < sizeof(*u))
- return;
-
- prt_printf(out, "dev=%u", le32_to_cpu(u->dev));
-
- printbuf_indent_add(out, 2);
- for (i = 0; i < nr_types; i++) {
- prt_newline(out);
- bch2_prt_data_type(out, i);
- prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
- le64_to_cpu(u->d[i].buckets),
- le64_to_cpu(u->d[i].sectors),
- le64_to_cpu(u->d[i].fragmented));
- }
- printbuf_indent_sub(out, 2);
-}
-
-static int journal_entry_log_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- return 0;
-}
-
-static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
-
- prt_printf(out, "%.*s", jset_entry_log_msg_bytes(l), l->d);
-}
-
-static int journal_entry_overwrite_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- from.flags = 0;
- return journal_entry_btree_keys_validate(c, jset, entry,
- version, big_endian, from);
-}
-
-static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- journal_entry_btree_keys_to_text(out, c, entry);
-}
-
-static int journal_entry_log_bkey_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- from.flags = 0;
- return journal_entry_btree_keys_validate(c, jset, entry,
- version, big_endian, from);
-}
-
-static void journal_entry_log_bkey_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- journal_entry_btree_keys_to_text(out, c, entry);
-}
-
-static int journal_entry_write_buffer_keys_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- return journal_entry_btree_keys_validate(c, jset, entry,
- version, big_endian, from);
-}
-
-static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- journal_entry_btree_keys_to_text(out, c, entry);
-}
-
-static int journal_entry_datetime_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- unsigned bytes = vstruct_bytes(entry);
- unsigned expected = 16;
- int ret = 0;
-
- if (journal_entry_err_on(vstruct_bytes(entry) < expected,
- c, version, jset, entry,
- journal_entry_dev_usage_bad_size,
- "bad size (%u < %u)",
- bytes, expected)) {
- journal_entry_null_range(entry, vstruct_next(entry));
- return ret;
- }
-fsck_err:
- return ret;
-}
-
-static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- struct jset_entry_datetime *datetime =
- container_of(entry, struct jset_entry_datetime, entry);
-
- bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
-}
-
-struct jset_entry_ops {
- int (*validate)(struct bch_fs *, struct jset *,
- struct jset_entry *, unsigned, int,
- struct bkey_validate_context);
- void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
-};
-
-static const struct jset_entry_ops bch2_jset_entry_ops[] = {
-#define x(f, nr) \
- [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \
- .validate = journal_entry_##f##_validate, \
- .to_text = journal_entry_##f##_to_text, \
- },
- BCH_JSET_ENTRY_TYPES()
-#undef x
-};
-
-int bch2_journal_entry_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- return entry->type < BCH_JSET_ENTRY_NR
- ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
- version, big_endian, from)
- : 0;
-}
-
-void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- bch2_prt_jset_entry_type(out, entry->type);
-
- if (entry->type < BCH_JSET_ENTRY_NR) {
- prt_str(out, ": ");
- bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
- }
-}
-
-static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
- enum bch_validate_flags flags)
-{
- struct bkey_validate_context from = {
- .flags = flags,
- .from = BKEY_VALIDATE_journal,
- .journal_seq = le64_to_cpu(jset->seq),
- };
-
- unsigned version = le32_to_cpu(jset->version);
- int ret = 0;
-
- vstruct_for_each(jset, entry) {
- from.journal_offset = (u64 *) entry - jset->_data;
-
- if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset),
- c, version, jset, entry,
- journal_entry_past_jset_end,
- "journal entry extends past end of jset")) {
- jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
- break;
- }
-
- ret = bch2_journal_entry_validate(c, jset, entry, version,
- JSET_BIG_ENDIAN(jset), from);
- if (ret)
- break;
- }
-fsck_err:
- return ret;
-}
-
-static int jset_validate(struct bch_fs *c,
- struct bch_dev *ca,
- struct jset *jset, u64 sector,
- enum bch_validate_flags flags)
-{
- struct bkey_validate_context from = {
- .flags = flags,
- .from = BKEY_VALIDATE_journal,
- .journal_seq = le64_to_cpu(jset->seq),
- };
- int ret = 0;
-
- if (le64_to_cpu(jset->magic) != jset_magic(c))
- return JOURNAL_ENTRY_NONE;
-
- unsigned version = le32_to_cpu(jset->version);
- if (journal_entry_err_on(!bch2_version_compatible(version),
- c, version, jset, NULL,
- jset_unsupported_version,
- "%s sector %llu seq %llu: incompatible journal entry version %u.%u",
- ca ? ca->name : c->name,
- sector, le64_to_cpu(jset->seq),
- BCH_VERSION_MAJOR(version),
- BCH_VERSION_MINOR(version))) {
- /* don't try to continue: */
- return -EINVAL;
- }
-
- if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
- c, version, jset, NULL,
- jset_unknown_csum,
- "%s sector %llu seq %llu: journal entry with unknown csum type %llu",
- ca ? ca->name : c->name,
- sector, le64_to_cpu(jset->seq),
- JSET_CSUM_TYPE(jset)))
- ret = JOURNAL_ENTRY_BAD;
-
- /* last_seq is ignored when JSET_NO_FLUSH is true */
- if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
- le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
- c, version, jset, NULL,
- jset_last_seq_newer_than_seq,
- "invalid journal entry: last_seq > seq (%llu > %llu)",
- le64_to_cpu(jset->last_seq),
- le64_to_cpu(jset->seq))) {
- jset->last_seq = jset->seq;
- return JOURNAL_ENTRY_BAD;
- }
-
- ret = jset_validate_entries(c, jset, flags);
-fsck_err:
- return ret;
-}
-
-static int jset_validate_early(struct bch_fs *c,
- struct bch_dev *ca,
- struct jset *jset, u64 sector,
- unsigned bucket_sectors_left,
- unsigned sectors_read)
-{
- struct bkey_validate_context from = {
- .from = BKEY_VALIDATE_journal,
- .journal_seq = le64_to_cpu(jset->seq),
- };
- int ret = 0;
-
- if (le64_to_cpu(jset->magic) != jset_magic(c))
- return JOURNAL_ENTRY_NONE;
-
- unsigned version = le32_to_cpu(jset->version);
- if (journal_entry_err_on(!bch2_version_compatible(version),
- c, version, jset, NULL,
- jset_unsupported_version,
- "%s sector %llu seq %llu: unknown journal entry version %u.%u",
- ca ? ca->name : c->name,
- sector, le64_to_cpu(jset->seq),
- BCH_VERSION_MAJOR(version),
- BCH_VERSION_MINOR(version))) {
- /* don't try to continue: */
- return -EINVAL;
- }
-
- size_t bytes = vstruct_bytes(jset);
- if (bytes > (sectors_read << 9) &&
- sectors_read < bucket_sectors_left)
- return JOURNAL_ENTRY_REREAD;
-
- if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
- c, version, jset, NULL,
- jset_past_bucket_end,
- "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
- ca ? ca->name : c->name,
- sector, le64_to_cpu(jset->seq), bytes))
- le32_add_cpu(&jset->u64s,
- -((bytes - (bucket_sectors_left << 9)) / 8));
-fsck_err:
- return ret;
-}
-
-struct journal_read_buf {
- void *data;
- size_t size;
-};
-
-static int journal_read_buf_realloc(struct bch_fs *c, struct journal_read_buf *b,
- size_t new_size)
-{
- void *n;
-
- /* the bios are sized for this many pages, max: */
- if (new_size > JOURNAL_ENTRY_SIZE_MAX)
- return bch_err_throw(c, ENOMEM_journal_read_buf_realloc);
-
- new_size = roundup_pow_of_two(new_size);
- n = kvmalloc(new_size, GFP_KERNEL);
- if (!n)
- return bch_err_throw(c, ENOMEM_journal_read_buf_realloc);
-
- kvfree(b->data);
- b->data = n;
- b->size = new_size;
- return 0;
-}
-
-static int journal_read_bucket(struct bch_dev *ca,
- struct journal_read_buf *buf,
- struct journal_list *jlist,
- unsigned bucket)
-{
- struct bch_fs *c = ca->fs;
- struct journal_device *ja = &ca->journal;
- struct jset *j = NULL;
- unsigned sectors, sectors_read = 0;
- u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
- end = offset + ca->mi.bucket_size;
- bool saw_bad = false, csum_good;
- int ret = 0;
-
- pr_debug("reading %u", bucket);
-
- while (offset < end) {
- if (!sectors_read) {
- struct bio *bio;
- unsigned nr_bvecs;
-reread:
- sectors_read = min_t(unsigned,
- end - offset, buf->size >> 9);
- nr_bvecs = buf_pages(buf->data, sectors_read << 9);
-
- bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
- if (!bio)
- return bch_err_throw(c, ENOMEM_journal_read_bucket);
- bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);
-
- bio->bi_iter.bi_sector = offset;
- bch2_bio_map(bio, buf->data, sectors_read << 9);
-
- u64 submit_time = local_clock();
- ret = submit_bio_wait(bio);
- kfree(bio);
-
- if (!ret && bch2_meta_read_fault("journal"))
- ret = bch_err_throw(c, EIO_fault_injected);
-
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
- submit_time, !ret);
-
- if (ret) {
- bch_err_dev_ratelimited(ca,
- "journal read error: sector %llu", offset);
- /*
- * We don't error out of the recovery process
- * here, since the relevant journal entry may be
- * found on a different device, and missing or
- * no journal entries will be handled later
- */
- return 0;
- }
-
- j = buf->data;
- }
-
- ret = jset_validate_early(c, ca, j, offset,
- end - offset, sectors_read);
- switch (ret) {
- case 0:
- sectors = vstruct_sectors(j, c->block_bits);
- break;
- case JOURNAL_ENTRY_REREAD:
- if (vstruct_bytes(j) > buf->size) {
- ret = journal_read_buf_realloc(c, buf,
- vstruct_bytes(j));
- if (ret)
- return ret;
- }
- goto reread;
- case JOURNAL_ENTRY_NONE:
- if (!saw_bad)
- return 0;
- /*
- * On checksum error we don't really trust the size
- * field of the journal entry we read, so try reading
-			 * again at the next block boundary:
- */
- sectors = block_sectors(c);
- goto next_block;
- default:
- return ret;
- }
-
- if (le64_to_cpu(j->seq) > ja->highest_seq_found) {
- ja->highest_seq_found = le64_to_cpu(j->seq);
- ja->cur_idx = bucket;
- ja->sectors_free = ca->mi.bucket_size -
- bucket_remainder(ca, offset) - sectors;
- }
-
- /*
- * This happens sometimes if we don't have discards on -
- * when we've partially overwritten a bucket with new
- * journal entries. We don't need the rest of the
- * bucket:
- */
- if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
- return 0;
-
- ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
-
- struct bch_csum csum;
- csum_good = jset_csum_good(c, j, &csum);
-
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
-
- if (!csum_good) {
- /*
- * Don't print an error here, we'll print the error
- * later if we need this journal entry
- */
- saw_bad = true;
- }
-
- ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
- j->encrypted_start,
- vstruct_end(j) - (void *) j->encrypted_start);
- bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret));
-
- mutex_lock(&jlist->lock);
- ret = journal_entry_add(c, ca, (struct journal_ptr) {
- .csum_good = csum_good,
- .csum = csum,
- .dev = ca->dev_idx,
- .bucket = bucket,
- .bucket_offset = offset -
- bucket_to_sector(ca, ja->buckets[bucket]),
- .sector = offset,
- }, jlist, j);
- mutex_unlock(&jlist->lock);
-
- switch (ret) {
- case JOURNAL_ENTRY_ADD_OK:
- break;
- case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
- break;
- default:
- return ret;
- }
-next_block:
- pr_debug("next");
- offset += sectors;
- sectors_read -= sectors;
- j = ((void *) j) + (sectors << 9);
- }
-
- return 0;
-}
-
-static CLOSURE_CALLBACK(bch2_journal_read_device)
-{
- closure_type(ja, struct journal_device, read);
- struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
- struct bch_fs *c = ca->fs;
- struct journal_list *jlist =
- container_of(cl->parent, struct journal_list, cl);
- struct journal_read_buf buf = { NULL, 0 };
- unsigned i;
- int ret = 0;
-
- if (!ja->nr)
- goto out;
-
- ret = journal_read_buf_realloc(c, &buf, PAGE_SIZE);
- if (ret)
- goto err;
-
- pr_debug("%u journal buckets", ja->nr);
-
- for (i = 0; i < ja->nr; i++) {
- ret = journal_read_bucket(ca, &buf, jlist, i);
- if (ret)
- goto err;
- }
-
- /*
- * Set dirty_idx to indicate the entire journal is full and needs to be
- * reclaimed - journal reclaim will immediately reclaim whatever isn't
- * pinned when it first runs:
- */
- ja->discard_idx = ja->dirty_idx_ondisk =
- ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
-out:
- bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
- kvfree(buf.data);
- enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_journal_read);
- closure_return(cl);
- return;
-err:
- mutex_lock(&jlist->lock);
- jlist->ret = ret;
- mutex_unlock(&jlist->lock);
- goto out;
-}
-
-noinline_for_stack
-static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_replay *j)
-{
- struct printbuf buf = PRINTBUF;
- enum bch_csum_type csum_type = JSET_CSUM_TYPE(&j->j);
- bool have_good = false;
-
- prt_printf(&buf, "invalid journal checksum(s) at seq %llu ", le64_to_cpu(j->j.seq));
- bch2_journal_datetime_to_text(&buf, &j->j);
- prt_newline(&buf);
-
- darray_for_each(j->ptrs, ptr)
- if (!ptr->csum_good) {
- bch2_journal_ptr_to_text(&buf, c, ptr);
- prt_char(&buf, ' ');
- bch2_csum_to_text(&buf, csum_type, ptr->csum);
- prt_newline(&buf);
- } else {
- have_good = true;
- }
-
- prt_printf(&buf, "should be ");
- bch2_csum_to_text(&buf, csum_type, j->j.csum);
-
- if (have_good)
- prt_printf(&buf, "\n(had good copy on another device)");
-
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
-}
-
-noinline_for_stack
-static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 end_seq)
-{
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- struct genradix_iter radix_iter;
- struct journal_replay *i, **_i, *prev = NULL;
- u64 seq = start_seq;
-
- genradix_for_each(&c->journal_entries, radix_iter, _i) {
- i = *_i;
-
- if (journal_replay_ignore(i))
- continue;
-
- BUG_ON(seq > le64_to_cpu(i->j.seq));
-
- while (seq < le64_to_cpu(i->j.seq)) {
- while (seq < le64_to_cpu(i->j.seq) &&
- bch2_journal_seq_is_blacklisted(c, seq, false))
- seq++;
-
- if (seq == le64_to_cpu(i->j.seq))
- break;
-
- u64 missing_start = seq;
-
- while (seq < le64_to_cpu(i->j.seq) &&
- !bch2_journal_seq_is_blacklisted(c, seq, false))
- seq++;
-
- u64 missing_end = seq - 1;
-
- printbuf_reset(&buf);
- prt_printf(&buf, "journal entries %llu-%llu missing! (replaying %llu-%llu)",
- missing_start, missing_end,
- start_seq, end_seq);
-
- prt_printf(&buf, "\nprev at ");
- if (prev) {
- bch2_journal_ptrs_to_text(&buf, c, prev);
- prt_printf(&buf, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
- } else
- prt_printf(&buf, "(none)");
-
- prt_printf(&buf, "\nnext at ");
- bch2_journal_ptrs_to_text(&buf, c, i);
- prt_printf(&buf, ", continue?");
-
- fsck_err(c, journal_entries_missing, "%s", buf.buf);
- }
-
- prev = i;
- seq++;
- }
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_journal_read(struct bch_fs *c,
- u64 *last_seq,
- u64 *blacklist_seq,
- u64 *start_seq)
-{
- struct journal_list jlist;
- struct journal_replay *i, **_i;
- struct genradix_iter radix_iter;
- struct printbuf buf = PRINTBUF;
- bool degraded = false, last_write_torn = false;
- u64 seq;
- int ret = 0;
-
- closure_init_stack(&jlist.cl);
- mutex_init(&jlist.lock);
- jlist.last_seq = 0;
- jlist.ret = 0;
-
- for_each_member_device(c, ca) {
- if (!c->opts.fsck &&
- !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
- continue;
-
- if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
- ca->mi.state == BCH_MEMBER_STATE_ro) &&
- enumerated_ref_tryget(&ca->io_ref[READ],
- BCH_DEV_READ_REF_journal_read))
- closure_call(&ca->journal.read,
- bch2_journal_read_device,
- system_unbound_wq,
- &jlist.cl);
- else
- degraded = true;
- }
-
- while (closure_sync_timeout(&jlist.cl, sysctl_hung_task_timeout_secs * HZ / 2))
- ;
-
- if (jlist.ret)
- return jlist.ret;
-
- *last_seq = 0;
- *start_seq = 0;
- *blacklist_seq = 0;
-
- /*
-	 * Find the most recent flush entry, and ignore newer non-flush entries -
- * those entries will be blacklisted:
- */
- genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
- i = *_i;
-
- if (journal_replay_ignore(i))
- continue;
-
- if (!*start_seq)
- *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1;
-
- if (JSET_NO_FLUSH(&i->j)) {
- i->ignore_blacklisted = true;
- continue;
- }
-
- if (!last_write_torn && !i->csum_good) {
- last_write_torn = true;
- i->ignore_blacklisted = true;
- continue;
- }
-
- struct bkey_validate_context from = {
- .from = BKEY_VALIDATE_journal,
- .journal_seq = le64_to_cpu(i->j.seq),
- };
- if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
- c, le32_to_cpu(i->j.version), &i->j, NULL,
- jset_last_seq_newer_than_seq,
- "invalid journal entry: last_seq > seq (%llu > %llu)",
- le64_to_cpu(i->j.last_seq),
- le64_to_cpu(i->j.seq)))
- i->j.last_seq = i->j.seq;
-
- *last_seq = le64_to_cpu(i->j.last_seq);
- *blacklist_seq = le64_to_cpu(i->j.seq) + 1;
- break;
- }
-
- if (!*start_seq) {
- bch_info(c, "journal read done, but no entries found");
- return 0;
- }
-
- if (!*last_seq) {
- fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes,
- "journal read done, but no entries found after dropping non-flushes");
- return 0;
- }
-
- printbuf_reset(&buf);
- prt_printf(&buf, "journal read done, replaying entries %llu-%llu",
- *last_seq, *blacklist_seq - 1);
-
- /*
-	 * Drop blacklisted entries and entries older than last_seq (or the start of
-	 * the journal rewind):
- */
- u64 drop_before = *last_seq;
- if (c->opts.journal_rewind) {
- drop_before = min(drop_before, c->opts.journal_rewind);
- prt_printf(&buf, " (rewinding from %llu)", c->opts.journal_rewind);
- }
-
- *last_seq = drop_before;
- if (*start_seq != *blacklist_seq)
- prt_printf(&buf, " (unflushed %llu-%llu)", *blacklist_seq, *start_seq - 1);
- bch_info(c, "%s", buf.buf);
- genradix_for_each(&c->journal_entries, radix_iter, _i) {
- i = *_i;
-
- if (journal_replay_ignore(i))
- continue;
-
- seq = le64_to_cpu(i->j.seq);
- if (seq < drop_before) {
- journal_replay_free(c, i, false);
- continue;
- }
-
- if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
- fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
- jset_seq_blacklisted,
- "found blacklisted journal entry %llu", seq);
- i->ignore_blacklisted = true;
- }
- }
-
- ret = bch2_journal_check_for_missing(c, drop_before, *blacklist_seq - 1);
- if (ret)
- goto err;
-
- genradix_for_each(&c->journal_entries, radix_iter, _i) {
- union bch_replicas_padded replicas = {
- .e.data_type = BCH_DATA_journal,
- .e.nr_devs = 0,
- .e.nr_required = 1,
- };
-
- i = *_i;
- if (journal_replay_ignore(i))
- continue;
-
- /*
- * Don't print checksum errors until we know we're going to use
- * a given journal entry:
- */
- darray_for_each(i->ptrs, ptr)
- if (!ptr->csum_good) {
- bch2_journal_print_checksum_error(c, i);
- break;
- }
-
- ret = jset_validate(c,
- bch2_dev_have_ref(c, i->ptrs.data[0].dev),
- &i->j,
- i->ptrs.data[0].sector,
- READ);
- if (ret)
- goto err;
-
- darray_for_each(i->ptrs, ptr)
- replicas_entry_add_dev(&replicas.e, ptr->dev);
-
- bch2_replicas_entry_sort(&replicas.e);
-
- printbuf_reset(&buf);
- bch2_replicas_entry_to_text(&buf, &replicas.e);
-
- if (!degraded &&
- !bch2_replicas_marked(c, &replicas.e) &&
- (le64_to_cpu(i->j.seq) == *last_seq ||
- fsck_err(c, journal_entry_replicas_not_marked,
- "superblock not marked as containing replicas for journal entry %llu\n%s",
- le64_to_cpu(i->j.seq), buf.buf))) {
- ret = bch2_mark_replicas(c, &replicas.e);
- if (ret)
- goto err;
- }
- }
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-/* journal write: */
-
-static void journal_advance_devs_to_next_bucket(struct journal *j,
- struct dev_alloc_list *devs,
- unsigned sectors, __le64 seq)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
- guard(rcu)();
- darray_for_each(*devs, i) {
- struct bch_dev *ca = rcu_dereference(c->devs[*i]);
- if (!ca)
- continue;
-
- struct journal_device *ja = &ca->journal;
-
- if (sectors > ja->sectors_free &&
- sectors <= ca->mi.bucket_size &&
- bch2_journal_dev_buckets_available(j, ja,
- journal_space_discarded)) {
- ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
- ja->sectors_free = ca->mi.bucket_size;
-
- /*
- * ja->bucket_seq[ja->cur_idx] must always have
- * something sensible:
- */
- ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq);
- }
- }
-}
-
-static void __journal_write_alloc(struct journal *j,
- struct journal_buf *w,
- struct dev_alloc_list *devs,
- unsigned sectors,
- unsigned *replicas,
- unsigned replicas_want)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
- darray_for_each(*devs, i) {
- struct bch_dev *ca = bch2_dev_get_ioref(c, *i, WRITE,
- BCH_DEV_WRITE_REF_journal_write);
- if (!ca)
- continue;
-
- struct journal_device *ja = &ca->journal;
-
- /*
- * Check that we can use this device, and aren't already using
- * it:
- */
- if (!ca->mi.durability ||
- ca->mi.state != BCH_MEMBER_STATE_rw ||
- !ja->nr ||
- bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) ||
- sectors > ja->sectors_free) {
- enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
- continue;
- }
-
- bch2_dev_stripe_increment(ca, &j->wp.stripe);
-
- bch2_bkey_append_ptr(&w->key,
- (struct bch_extent_ptr) {
- .offset = bucket_to_sector(ca,
- ja->buckets[ja->cur_idx]) +
- ca->mi.bucket_size -
- ja->sectors_free,
- .dev = ca->dev_idx,
- });
-
- ja->sectors_free -= sectors;
- ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
-
- *replicas += ca->mi.durability;
-
- if (*replicas >= replicas_want)
- break;
- }
-}
-
-static int journal_write_alloc(struct journal *j, struct journal_buf *w,
- unsigned *replicas)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_devs_mask devs;
- struct dev_alloc_list devs_sorted;
- unsigned sectors = vstruct_sectors(w->data, c->block_bits);
- unsigned target = c->opts.metadata_target ?:
- c->opts.foreground_target;
- unsigned replicas_want = READ_ONCE(c->opts.metadata_replicas);
- unsigned replicas_need = min_t(unsigned, replicas_want,
- READ_ONCE(c->opts.metadata_replicas_required));
- bool advance_done = false;
-
-retry_target:
- devs = target_rw_devs(c, BCH_DATA_journal, target);
- bch2_dev_alloc_list(c, &j->wp.stripe, &devs, &devs_sorted);
-retry_alloc:
- __journal_write_alloc(j, w, &devs_sorted, sectors, replicas, replicas_want);
-
- if (likely(*replicas >= replicas_want))
- goto done;
-
- if (!advance_done) {
- journal_advance_devs_to_next_bucket(j, &devs_sorted, sectors, w->data->seq);
- advance_done = true;
- goto retry_alloc;
- }
-
- if (*replicas < replicas_want && target) {
- /* Retry from all devices: */
- target = 0;
- advance_done = false;
- goto retry_target;
- }
-done:
- BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
-
-#if 0
- /*
- * XXX: we need a way to alert the user when we go degraded for any
- * reason
- */
- if (*replicas < min(replicas_want,
- dev_mask_nr(&c->rw_devs[BCH_DATA_free]))) {
- }
-#endif
-
- return *replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices;
-}
-
-static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
- /* we aren't holding j->lock: */
- unsigned new_size = READ_ONCE(j->buf_size_want);
- void *new_buf;
-
- if (buf->buf_size >= new_size)
- return;
-
- size_t btree_write_buffer_size = new_size / 64;
-
- if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
- return;
-
- new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
- if (!new_buf)
- return;
-
- memcpy(new_buf, buf->data, buf->buf_size);
-
- spin_lock(&j->lock);
- swap(buf->data, new_buf);
- swap(buf->buf_size, new_size);
- spin_unlock(&j->lock);
-
- kvfree(new_buf);
-}
-
-static CLOSURE_CALLBACK(journal_write_done)
-{
- closure_type(w, struct journal_buf, io);
- struct journal *j = container_of(w, struct journal, buf[w->idx]);
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- union bch_replicas_padded replicas;
- u64 seq = le64_to_cpu(w->data->seq);
- int err = 0;
-
- bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
- ? j->flush_write_time
- : j->noflush_write_time, j->write_start_time);
-
- if (!w->devs_written.nr) {
- err = bch_err_throw(c, journal_write_err);
- } else {
- bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
- w->devs_written);
- err = bch2_mark_replicas(c, &replicas.e);
- }
-
- if (err && !bch2_journal_error(j)) {
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
-
- if (err == -BCH_ERR_journal_write_err)
- prt_printf(&buf, "unable to write journal to sufficient devices\n");
- else
- prt_printf(&buf, "journal write error marking replicas: %s\n",
- bch2_err_str(err));
-
- bch2_fs_emergency_read_only2(c, &buf);
-
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- }
-
- closure_debug_destroy(cl);
-
- spin_lock(&j->lock);
- if (seq >= j->pin.front)
- journal_seq_pin(j, seq)->devs = w->devs_written;
- if (err && (!j->err_seq || seq < j->err_seq))
- j->err_seq = seq;
- w->write_done = true;
-
- if (!j->free_buf || j->free_buf_size < w->buf_size) {
- swap(j->free_buf, w->data);
- swap(j->free_buf_size, w->buf_size);
- }
-
- if (w->data) {
- void *buf = w->data;
- w->data = NULL;
- w->buf_size = 0;
-
- spin_unlock(&j->lock);
- kvfree(buf);
- spin_lock(&j->lock);
- }
-
- bool completed = false;
- bool do_discards = false;
-
- for (seq = journal_last_unwritten_seq(j);
- seq <= journal_cur_seq(j);
- seq++) {
- w = j->buf + (seq & JOURNAL_BUF_MASK);
- if (!w->write_done)
- break;
-
- if (!j->err_seq && !w->noflush) {
- j->flushed_seq_ondisk = seq;
- j->last_seq_ondisk = w->last_seq;
-
- closure_wake_up(&c->freelist_wait);
- bch2_reset_alloc_cursors(c);
- do_discards = true;
- }
-
- j->seq_ondisk = seq;
-
- /*
- * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
- * more buckets:
- *
- * Must come before signaling write completion, for
- * bch2_fs_journal_stop():
- */
- if (j->watermark != BCH_WATERMARK_stripe)
- journal_reclaim_kick(&c->journal);
-
- closure_wake_up(&w->wait);
- completed = true;
- }
-
- if (completed) {
- bch2_journal_reclaim_fast(j);
- bch2_journal_space_available(j);
-
- track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false);
-
- journal_wake(j);
- }
-
- if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
- j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
- struct journal_buf *buf = journal_cur_buf(j);
- long delta = buf->expires - jiffies;
-
- /*
- * We don't close a journal entry to write it while there are
- * previous entries still in flight - the current journal entry
- * might want to be written now:
- */
- mod_delayed_work(j->wq, &j->write_work, max(0L, delta));
- }
-
- /*
- * We don't typically trigger journal writes from here - the next journal
- * write will be triggered immediately after the previous one is
- * allocated, in bch2_journal_write() - but the journal write error path
- * is special:
- */
- bch2_journal_do_writes(j);
- spin_unlock(&j->lock);
-
- if (do_discards)
- bch2_do_discards(c);
-}
-
-static void journal_write_endio(struct bio *bio)
-{
- struct journal_bio *jbio = container_of(bio, struct journal_bio, bio);
- struct bch_dev *ca = jbio->ca;
- struct journal *j = &ca->fs->journal;
- struct journal_buf *w = j->buf + jbio->buf_idx;
-
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
- jbio->submit_time, !bio->bi_status);
-
- if (bio->bi_status) {
- bch_err_dev_ratelimited(ca,
- "error writing journal entry %llu: %s",
- le64_to_cpu(w->data->seq),
- bch2_blk_status_to_str(bio->bi_status));
-
- unsigned long flags;
- spin_lock_irqsave(&j->err_lock, flags);
- bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
- spin_unlock_irqrestore(&j->err_lock, flags);
- }
-
- closure_put(&w->io);
- enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
-}
-
-static CLOSURE_CALLBACK(journal_write_submit)
-{
- closure_type(w, struct journal_buf, io);
- struct journal *j = container_of(w, struct journal, buf[w->idx]);
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- unsigned sectors = vstruct_sectors(w->data, c->block_bits);
-
- extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
- struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
-
- this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
- sectors);
-
- struct journal_device *ja = &ca->journal;
- struct journal_bio *jbio = ja->bio[w->idx];
- struct bio *bio = &jbio->bio;
-
- jbio->submit_time = local_clock();
-
- bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
- bio->bi_iter.bi_sector = ptr->offset;
- bio->bi_end_io = journal_write_endio;
- bio->bi_private = ca;
- bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0);
-
- BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
- ca->prev_journal_sector = bio->bi_iter.bi_sector;
-
- if (!JSET_NO_FLUSH(w->data))
- bio->bi_opf |= REQ_FUA;
- if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
- bio->bi_opf |= REQ_PREFLUSH;
-
- bch2_bio_map(bio, w->data, sectors << 9);
-
- trace_and_count(c, journal_write, bio);
- closure_bio_submit(bio, cl);
-
- ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
- }
-
- continue_at(cl, journal_write_done, j->wq);
-}
-
-static CLOSURE_CALLBACK(journal_write_preflush)
-{
- closure_type(w, struct journal_buf, io);
- struct journal *j = container_of(w, struct journal, buf[w->idx]);
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
- /*
- * Wait for previous journal writes to complete; they won't necessarily
- * be flushed if they're still in flight
- */
- if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
- spin_lock(&j->lock);
- if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
- closure_wait(&j->async_wait, cl);
- spin_unlock(&j->lock);
- continue_at(cl, journal_write_preflush, j->wq);
- return;
- }
- spin_unlock(&j->lock);
- }
-
- if (w->separate_flush) {
- for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_write) {
- enumerated_ref_get(&ca->io_ref[WRITE],
- BCH_DEV_WRITE_REF_journal_write);
-
- struct journal_device *ja = &ca->journal;
- struct bio *bio = &ja->bio[w->idx]->bio;
- bio_reset(bio, ca->disk_sb.bdev,
- REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
- bio->bi_end_io = journal_write_endio;
- bio->bi_private = ca;
- closure_bio_submit(bio, cl);
- }
-
- continue_at(cl, journal_write_submit, j->wq);
- } else {
- /*
- * no need to punt to another work item if we're not waiting on
- * preflushes
- */
- journal_write_submit(&cl->work);
- }
-}
-
-static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct jset_entry *start, *end;
- struct jset *jset = w->data;
- struct journal_keys_to_wb wb = { NULL };
- unsigned u64s;
- unsigned long btree_roots_have = 0;
- u64 seq = le64_to_cpu(jset->seq);
- int ret;
-
- /*
- * Simple compaction, dropping empty jset_entries (from journal
- * reservations that weren't fully used) and merging jset_entries that
- * can be.
- *
- * If we wanted to be really fancy here, we could sort all the keys in
- * the jset and drop keys that were overwritten - probably not worth it:
- */
- vstruct_for_each(jset, i) {
- unsigned u64s = le16_to_cpu(i->u64s);
-
- /* Empty entry: */
- if (!u64s)
- continue;
-
- /*
- * New btree roots are set by journalling them; when the journal
- * entry gets written we have to propagate them to
- * c->btree_roots
- *
- * But, every journal entry we write has to contain all the
- * btree roots (at least for now); so after we copy btree roots
- * to c->btree_roots we have to get any missing btree roots and
- * add them to this journal entry:
- */
- switch (i->type) {
- case BCH_JSET_ENTRY_btree_root:
- bch2_journal_entry_to_btree_root(c, i);
- __set_bit(i->btree_id, &btree_roots_have);
- break;
- case BCH_JSET_ENTRY_write_buffer_keys:
- EBUG_ON(!w->need_flush_to_write_buffer);
-
- if (!wb.wb)
- bch2_journal_keys_to_write_buffer_start(c, &wb, seq);
-
- jset_entry_for_each_key(i, k) {
- ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k);
- if (ret) {
- bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s",
- bch2_err_str(ret));
- bch2_journal_keys_to_write_buffer_end(c, &wb);
- return ret;
- }
- }
- i->type = BCH_JSET_ENTRY_btree_keys;
- break;
- }
- }
-
- if (wb.wb) {
- ret = bch2_journal_keys_to_write_buffer_end(c, &wb);
- if (ret) {
- bch2_fs_fatal_error(c, "error flushing journal keys to btree write buffer: %s",
- bch2_err_str(ret));
- return ret;
- }
- }
-
- spin_lock(&c->journal.lock);
- w->need_flush_to_write_buffer = false;
- spin_unlock(&c->journal.lock);
-
- start = end = vstruct_last(jset);
-
- end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
-
- struct jset_entry_datetime *d =
- container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry);
- d->entry.type = BCH_JSET_ENTRY_datetime;
- d->seconds = cpu_to_le64(ktime_get_real_seconds());
-
- bch2_journal_super_entries_add_common(c, &end, seq);
- u64s = (u64 *) end - (u64 *) start;
-
- WARN_ON(u64s > j->entry_u64s_reserved);
-
- le32_add_cpu(&jset->u64s, u64s);
-
- unsigned sectors = vstruct_sectors(jset, c->block_bits);
-
- if (sectors > w->sectors) {
- bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
- vstruct_bytes(jset), w->sectors << 9,
- u64s, w->u64s_reserved, j->entry_u64s_reserved);
- return -EINVAL;
- }
-
- return 0;
-}
-
-static int bch2_journal_write_checksum(struct journal *j, struct journal_buf *w)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct jset *jset = w->data;
- u64 seq = le64_to_cpu(jset->seq);
- bool validate_before_checksum = false;
- int ret = 0;
-
- jset->magic = cpu_to_le64(jset_magic(c));
- jset->version = cpu_to_le32(c->sb.version);
-
- SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
- SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
-
- if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
- j->last_empty_seq = seq;
-
- if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
- validate_before_checksum = true;
-
- if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
- validate_before_checksum = true;
-
- if (validate_before_checksum &&
- (ret = jset_validate(c, NULL, jset, 0, WRITE)))
- return ret;
-
- ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
- jset->encrypted_start,
- vstruct_end(jset) - (void *) jset->encrypted_start);
- if (bch2_fs_fatal_err_on(ret, c, "encrypting journal entry: %s", bch2_err_str(ret)))
- return ret;
-
- jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
- journal_nonce(jset), jset);
-
- if (!validate_before_checksum &&
- (ret = jset_validate(c, NULL, jset, 0, WRITE)))
- return ret;
-
- unsigned sectors = vstruct_sectors(jset, c->block_bits);
- unsigned bytes = vstruct_bytes(jset);
- memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
- return 0;
-}
-
-static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- int error = bch2_journal_error(j);
-
- /*
- * If the journal is in an error state - we did an emergency shutdown -
- * we prefer to continue doing journal writes. We just mark them as
- * noflush so they'll never be used, but they'll still be visible to the
- * list_journal tool - this helps in debugging.
- *
- * There's a caveat: the first journal write after marking the
- * superblock dirty must always be a flush write, because on startup
- * from a clean shutdown we didn't necessarily read the journal and the
- * new journal write might overwrite whatever was in the journal
- * previously - we can't leave the journal without any flush writes in
- * it.
- *
- * So if we're in an error state, and we're still starting up, we don't
- * write anything at all.
- */
- if (error && test_bit(JOURNAL_need_flush_write, &j->flags))
- return error;
-
- if (error ||
- w->noflush ||
- (!w->must_flush &&
- time_before(jiffies, j->last_flush_write +
- msecs_to_jiffies(c->opts.journal_flush_delay)) &&
- test_bit(JOURNAL_may_skip_flush, &j->flags))) {
- w->noflush = true;
- SET_JSET_NO_FLUSH(w->data, true);
- w->data->last_seq = 0;
- w->last_seq = 0;
-
- j->nr_noflush_writes++;
- } else {
- w->must_flush = true;
- j->last_flush_write = jiffies;
- j->nr_flush_writes++;
- clear_bit(JOURNAL_need_flush_write, &j->flags);
- }
-
- return 0;
-}
-
-CLOSURE_CALLBACK(bch2_journal_write)
-{
- closure_type(w, struct journal_buf, io);
- struct journal *j = container_of(w, struct journal, buf[w->idx]);
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- union bch_replicas_padded replicas;
- unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_free]);
- int ret;
-
- BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
- BUG_ON(!w->write_started);
- BUG_ON(w->write_allocated);
- BUG_ON(w->write_done);
-
- j->write_start_time = local_clock();
-
- spin_lock(&j->lock);
- if (nr_rw_members > 1)
- w->separate_flush = true;
-
- ret = bch2_journal_write_pick_flush(j, w);
- spin_unlock(&j->lock);
-
- if (unlikely(ret))
- goto err;
-
- mutex_lock(&j->buf_lock);
- journal_buf_realloc(j, w);
-
- ret = bch2_journal_write_prep(j, w);
- mutex_unlock(&j->buf_lock);
-
- if (unlikely(ret))
- goto err;
-
- unsigned replicas_allocated = 0;
- while (1) {
- ret = journal_write_alloc(j, w, &replicas_allocated);
- if (!ret || !j->can_discard)
- break;
-
- bch2_journal_do_discards(j);
- }
-
- if (unlikely(ret))
- goto err_allocate_write;
-
- ret = bch2_journal_write_checksum(j, w);
- if (unlikely(ret))
- goto err;
-
- spin_lock(&j->lock);
- /*
- * write is allocated, no longer need to account for it in
- * bch2_journal_space_available():
- */
- w->sectors = 0;
- w->write_allocated = true;
- j->entry_bytes_written += vstruct_bytes(w->data);
-
- /*
- * journal entry has been compacted and allocated, recalculate space
- * available:
- */
- bch2_journal_space_available(j);
- bch2_journal_do_writes(j);
- spin_unlock(&j->lock);
-
- w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
-
- /*
- * Mark journal replicas before we submit the write to guarantee
- * recovery will find the journal entries after a crash.
- */
- bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
- w->devs_written);
- ret = bch2_mark_replicas(c, &replicas.e);
- if (ret)
- goto err;
-
- if (c->opts.nochanges)
- goto no_io;
-
- if (!JSET_NO_FLUSH(w->data))
- continue_at(cl, journal_write_preflush, j->wq);
- else
- continue_at(cl, journal_write_submit, j->wq);
- return;
-err_allocate_write:
- if (!bch2_journal_error(j)) {
- struct printbuf buf = PRINTBUF;
-
- bch2_journal_debug_to_text(&buf, j);
- prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"),
- le64_to_cpu(w->data->seq),
- vstruct_sectors(w->data, c->block_bits),
- bch2_err_str(ret));
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- }
-err:
- bch2_fatal_error(c);
-no_io:
- extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
- struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
- enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
- }
-
- continue_at(cl, journal_write_done, j->wq);
-}
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
deleted file mode 100644
index 6fa82c4050fe..000000000000
--- a/fs/bcachefs/journal_io.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_JOURNAL_IO_H
-#define _BCACHEFS_JOURNAL_IO_H
-
-#include "darray.h"
-
-void bch2_journal_pos_from_member_info_set(struct bch_fs *);
-void bch2_journal_pos_from_member_info_resume(struct bch_fs *);
-
-struct journal_ptr {
- bool csum_good;
- struct bch_csum csum;
- u8 dev;
- u32 bucket;
- u32 bucket_offset;
- u64 sector;
-};
-
-/*
- * Only used for holding the journal entries we read in btree_journal_read()
- * during cache_registration
- */
-struct journal_replay {
- DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs;
-
- bool csum_good;
- bool ignore_blacklisted;
- bool ignore_not_dirty;
- /* must be last: */
- struct jset j;
-};
-
-static inline bool journal_replay_ignore(struct journal_replay *i)
-{
- return !i || i->ignore_blacklisted || i->ignore_not_dirty;
-}
-
-static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
- struct jset_entry *entry, unsigned type)
-{
- while (entry < vstruct_last(jset)) {
- if (entry->type == type)
- return entry;
-
- entry = vstruct_next(entry);
- }
-
- return NULL;
-}
-
-#define for_each_jset_entry_type(entry, jset, type) \
- for (struct jset_entry *entry = (jset)->start; \
- (entry = __jset_entry_type_next(jset, entry, type)); \
- entry = vstruct_next(entry))
-
-#define jset_entry_for_each_key(_e, _k) \
- for (struct bkey_i *_k = (_e)->start; \
- _k < vstruct_last(_e); \
- _k = bkey_next(_k))
-
-#define for_each_jset_key(k, entry, jset) \
- for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)\
- jset_entry_for_each_key(entry, k)
-
-int bch2_journal_entry_validate(struct bch_fs *, struct jset *,
- struct jset_entry *, unsigned, int,
- struct bkey_validate_context);
-void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
- struct jset_entry *);
-
-void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *,
- struct journal_replay *);
-
-int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *);
-
-CLOSURE_CALLBACK(bch2_journal_write);
-
-static inline struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
-{
- struct jset_entry *entry = *end;
- unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
-
- memset(entry, 0, u64s * sizeof(u64));
- /*
- * The u64s field counts from the start of data, ignoring the shared
- * fields.
- */
- entry->u64s = cpu_to_le16(u64s - 1);
-
- *end = vstruct_next(*end);
- return entry;
-}
-
-#endif /* _BCACHEFS_JOURNAL_IO_H */
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
deleted file mode 100644
index 0042d43b8e57..000000000000
--- a/fs/bcachefs/journal_reclaim.c
+++ /dev/null
@@ -1,1037 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_key_cache.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "errcode.h"
-#include "error.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "journal_reclaim.h"
-#include "replicas.h"
-#include "sb-members.h"
-#include "trace.h"
-
-#include <linux/kthread.h>
-#include <linux/sched/mm.h>
-
-static bool __should_discard_bucket(struct journal *, struct journal_device *);
-
-/* Free space calculations: */
-
-static unsigned journal_space_from(struct journal_device *ja,
- enum journal_space_from from)
-{
- switch (from) {
- case journal_space_discarded:
- return ja->discard_idx;
- case journal_space_clean_ondisk:
- return ja->dirty_idx_ondisk;
- case journal_space_clean:
- return ja->dirty_idx;
- default:
- BUG();
- }
-}
-
-unsigned bch2_journal_dev_buckets_available(struct journal *j,
- struct journal_device *ja,
- enum journal_space_from from)
-{
- if (!ja->nr)
- return 0;
-
- unsigned available = (journal_space_from(ja, from) -
- ja->cur_idx - 1 + ja->nr) % ja->nr;
-
- /*
- * Don't use the last bucket unless writing the new last_seq
- * will make another bucket available:
- */
- if (available && ja->dirty_idx_ondisk == ja->dirty_idx)
- --available;
-
- return available;
-}
-
-void bch2_journal_set_watermark(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- bool low_on_space = j->space[journal_space_clean].total * 4 <=
- j->space[journal_space_total].total;
- bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4;
- bool low_on_wb = bch2_btree_write_buffer_must_wait(c);
- unsigned watermark = low_on_space || low_on_pin || low_on_wb
- ? BCH_WATERMARK_reclaim
- : BCH_WATERMARK_stripe;
-
- if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) ||
- track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) ||
- track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb))
- trace_and_count(c, journal_full, c);
-
- mod_bit(JOURNAL_space_low, &j->flags, low_on_space || low_on_pin);
-
- swap(watermark, j->watermark);
- if (watermark > j->watermark)
- journal_wake(j);
-}
-
-static struct journal_space
-journal_dev_space_available(struct journal *j, struct bch_dev *ca,
- enum journal_space_from from)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_device *ja = &ca->journal;
- unsigned sectors, buckets, unwritten;
- unsigned bucket_size_aligned = round_down(ca->mi.bucket_size, block_sectors(c));
- u64 seq;
-
- if (from == journal_space_total)
- return (struct journal_space) {
- .next_entry = bucket_size_aligned,
- .total = bucket_size_aligned * ja->nr,
- };
-
- buckets = bch2_journal_dev_buckets_available(j, ja, from);
- sectors = round_down(ja->sectors_free, block_sectors(c));
-
- /*
- * Note that we don't allocate the space for a journal entry
- * until we write it out - thus, account for it here:
- */
- for (seq = journal_last_unwritten_seq(j);
- seq <= journal_cur_seq(j);
- seq++) {
- unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors;
-
- if (!unwritten)
- continue;
-
- /* entry won't fit on this device, skip: */
- if (unwritten > bucket_size_aligned)
- continue;
-
- if (unwritten >= sectors) {
- if (!buckets) {
- sectors = 0;
- break;
- }
-
- buckets--;
- sectors = bucket_size_aligned;
- }
-
- sectors -= unwritten;
- }
-
- if (sectors < ca->mi.bucket_size && buckets) {
- buckets--;
- sectors = bucket_size_aligned;
- }
-
- return (struct journal_space) {
- .next_entry = sectors,
- .total = sectors + buckets * bucket_size_aligned,
- };
-}
-
-static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want,
- enum journal_space_from from)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- unsigned pos, nr_devs = 0;
- struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX];
- unsigned min_bucket_size = U32_MAX;
-
- BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
-
- for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
- if (!ca->journal.nr ||
- !ca->mi.durability)
- continue;
-
- min_bucket_size = min(min_bucket_size, ca->mi.bucket_size);
-
- space = journal_dev_space_available(j, ca, from);
- if (!space.next_entry)
- continue;
-
- for (pos = 0; pos < nr_devs; pos++)
- if (space.total > dev_space[pos].total)
- break;
-
- array_insert_item(dev_space, nr_devs, pos, space);
- }
-
- if (nr_devs < nr_devs_want)
- return (struct journal_space) { 0, 0 };
-
- /*
- * It's possible for bucket size to be misaligned w.r.t. the filesystem
- * block size:
- */
- min_bucket_size = round_down(min_bucket_size, block_sectors(c));
-
- /*
- * We sorted largest to smallest, and we want the smallest out of the
- * @nr_devs_want largest devices:
- */
- space = dev_space[nr_devs_want - 1];
- space.next_entry = min(space.next_entry, min_bucket_size);
- return space;
-}
-
-void bch2_journal_space_available(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- unsigned clean, clean_ondisk, total;
- unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
- j->buf[1].buf_size >> 9);
- unsigned nr_online = 0, nr_devs_want;
- bool can_discard = false;
- int ret = 0;
-
- lockdep_assert_held(&j->lock);
- guard(rcu)();
-
- for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
- struct journal_device *ja = &ca->journal;
-
- if (!ja->nr)
- continue;
-
- while (ja->dirty_idx != ja->cur_idx &&
- ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j))
- ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;
-
- while (ja->dirty_idx_ondisk != ja->dirty_idx &&
- ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
- ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
-
- can_discard |= __should_discard_bucket(j, ja);
-
- max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
- nr_online++;
- }
-
- j->can_discard = can_discard;
-
- if (nr_online < metadata_replicas_required(c)) {
- if (!(c->sb.features & BIT_ULL(BCH_FEATURE_small_image))) {
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
- prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n"
- "rw journal devs:", nr_online, metadata_replicas_required(c));
-
- for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal])
- prt_printf(&buf, " %s", ca->name);
-
- bch_err(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
- ret = bch_err_throw(c, insufficient_journal_devices);
- goto out;
- }
-
- nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);
-
- for (unsigned i = 0; i < journal_space_nr; i++)
- j->space[i] = __journal_space_available(j, nr_devs_want, i);
-
- clean_ondisk = j->space[journal_space_clean_ondisk].total;
- clean = j->space[journal_space_clean].total;
- total = j->space[journal_space_total].total;
-
- if (!j->space[journal_space_discarded].next_entry)
- ret = bch_err_throw(c, journal_full);
-
- if ((j->space[journal_space_clean_ondisk].next_entry <
- j->space[journal_space_clean_ondisk].total) &&
- (clean - clean_ondisk <= total / 8) &&
- (clean_ondisk * 2 > clean))
- set_bit(JOURNAL_may_skip_flush, &j->flags);
- else
- clear_bit(JOURNAL_may_skip_flush, &j->flags);
-
- bch2_journal_set_watermark(j);
-out:
- j->cur_entry_sectors = !ret
- ? j->space[journal_space_discarded].next_entry
- : 0;
- j->cur_entry_error = ret;
-
- if (!ret)
- journal_wake(j);
-}
-
-/* Discards - last part of journal reclaim: */
-
-static bool __should_discard_bucket(struct journal *j, struct journal_device *ja)
-{
- unsigned min_free = max(4, ja->nr / 8);
-
- return bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) <
- min_free &&
- ja->discard_idx != ja->dirty_idx_ondisk;
-}
-
-static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
-{
- spin_lock(&j->lock);
- bool ret = __should_discard_bucket(j, ja);
- spin_unlock(&j->lock);
-
- return ret;
-}
-
-/*
- * Advance ja->discard_idx as long as it points to buckets that are no longer
- * dirty, issuing discards if necessary:
- */
-void bch2_journal_do_discards(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
- mutex_lock(&j->discard_lock);
-
- for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_do_discards) {
- struct journal_device *ja = &ca->journal;
-
- while (should_discard_bucket(j, ja)) {
- if (!c->opts.nochanges &&
- bch2_discard_opt_enabled(c, ca) &&
- bdev_max_discard_sectors(ca->disk_sb.bdev))
- blkdev_issue_discard(ca->disk_sb.bdev,
- bucket_to_sector(ca,
- ja->buckets[ja->discard_idx]),
- ca->mi.bucket_size, GFP_NOFS);
-
- spin_lock(&j->lock);
- ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
-
- bch2_journal_space_available(j);
- spin_unlock(&j->lock);
- }
- }
-
- mutex_unlock(&j->discard_lock);
-}
-
-/*
- * Journal entry pinning - machinery for holding a reference on a given journal
- * entry, holding it open to ensure it gets replayed during recovery:
- */
-
-void bch2_journal_reclaim_fast(struct journal *j)
-{
- bool popped = false;
-
- lockdep_assert_held(&j->lock);
-
- /*
- * Unpin journal entries whose reference counts reached zero, meaning
- * all btree nodes got written out
- */
- while (!fifo_empty(&j->pin) &&
- j->pin.front <= j->seq_ondisk &&
- !atomic_read(&fifo_peek_front(&j->pin).count)) {
- j->pin.front++;
- popped = true;
- }
-
- if (popped) {
- bch2_journal_space_available(j);
- __closure_wake_up(&j->reclaim_flush_wait);
- }
-}
-
-bool __bch2_journal_pin_put(struct journal *j, u64 seq)
-{
- struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
-
- return atomic_dec_and_test(&pin_list->count);
-}
-
-void bch2_journal_pin_put(struct journal *j, u64 seq)
-{
- if (__bch2_journal_pin_put(j, seq)) {
- spin_lock(&j->lock);
- bch2_journal_reclaim_fast(j);
- spin_unlock(&j->lock);
- }
-}
-
-static inline bool __journal_pin_drop(struct journal *j,
- struct journal_entry_pin *pin)
-{
- struct journal_entry_pin_list *pin_list;
-
- if (!journal_pin_active(pin))
- return false;
-
- if (j->flush_in_progress == pin)
- j->flush_in_progress_dropped = true;
-
- pin_list = journal_seq_pin(j, pin->seq);
- pin->seq = 0;
- list_del_init(&pin->list);
-
- if (j->reclaim_flush_wait.list.first)
- __closure_wake_up(&j->reclaim_flush_wait);
-
- /*
- * Unpinning a journal entry may make journal_next_bucket() succeed, if
- * writing a new last_seq will now make another bucket available:
- */
- return atomic_dec_and_test(&pin_list->count) &&
- pin_list == &fifo_peek_front(&j->pin);
-}
-
-void bch2_journal_pin_drop(struct journal *j,
- struct journal_entry_pin *pin)
-{
- spin_lock(&j->lock);
- if (__journal_pin_drop(j, pin))
- bch2_journal_reclaim_fast(j);
- spin_unlock(&j->lock);
-}
-
-static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin,
- journal_pin_flush_fn fn)
-{
- if (fn == bch2_btree_node_flush0 ||
- fn == bch2_btree_node_flush1) {
- unsigned idx = fn == bch2_btree_node_flush1;
- struct btree *b = container_of(pin, struct btree, writes[idx].journal);
-
- return JOURNAL_PIN_TYPE_btree0 - b->c.level;
- } else if (fn == bch2_btree_key_cache_journal_flush)
- return JOURNAL_PIN_TYPE_key_cache;
- else
- return JOURNAL_PIN_TYPE_other;
-}
-
-static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn,
- enum journal_pin_type type)
-{
- struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
-
- /*
- * flush_fn is how we identify journal pins in debugfs, so must always
- * exist, even if it doesn't do anything:
- */
- BUG_ON(!flush_fn);
-
- atomic_inc(&pin_list->count);
- pin->seq = seq;
- pin->flush = flush_fn;
-
- if (list_empty(&pin_list->unflushed[type]) &&
- j->reclaim_flush_wait.list.first)
- __closure_wake_up(&j->reclaim_flush_wait);
-
- list_add(&pin->list, &pin_list->unflushed[type]);
-}
-
-void bch2_journal_pin_copy(struct journal *j,
- struct journal_entry_pin *dst,
- struct journal_entry_pin *src,
- journal_pin_flush_fn flush_fn)
-{
- spin_lock(&j->lock);
-
- u64 seq = READ_ONCE(src->seq);
-
- if (seq < journal_last_seq(j)) {
- /*
- * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on
- * the src pin - with the pin dropped, the entry to pin might no
- * longer to exist, but that means there's no longer anything to
- * longer exist, but that means there's no longer anything to
- */
- spin_unlock(&j->lock);
- return;
- }
-
- bool reclaim = __journal_pin_drop(j, dst);
-
- bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(dst, flush_fn));
-
- if (reclaim)
- bch2_journal_reclaim_fast(j);
-
- /*
- * If the journal is currently full, we might want to call flush_fn
- * immediately:
- */
- if (seq == journal_last_seq(j))
- journal_wake(j);
- spin_unlock(&j->lock);
-}
-
-void bch2_journal_pin_set(struct journal *j, u64 seq,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- spin_lock(&j->lock);
-
- BUG_ON(seq < journal_last_seq(j));
-
- bool reclaim = __journal_pin_drop(j, pin);
-
- bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn));
-
- if (reclaim)
- bch2_journal_reclaim_fast(j);
- /*
- * If the journal is currently full, we might want to call flush_fn
- * immediately:
- */
- if (seq == journal_last_seq(j))
- journal_wake(j);
-
- spin_unlock(&j->lock);
-}
-
-/**
- * bch2_journal_pin_flush: ensure journal pin callback is no longer running
- * @j: journal object
- * @pin: pin to flush
- */
-void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
-{
- BUG_ON(journal_pin_active(pin));
-
- wait_event(j->pin_flush_wait, j->flush_in_progress != pin);
-}
-
-/*
- * Journal reclaim: flush references to open journal entries to reclaim space in
- * the journal
- *
- * May be done by the journal code in the background as needed to free up space
- * for more journal entries, or as part of doing a clean shutdown, or to migrate
- * data off of a specific device:
- */
-
-static struct journal_entry_pin *
-journal_get_next_pin(struct journal *j,
- u64 seq_to_flush,
- unsigned allowed_below_seq,
- unsigned allowed_above_seq,
- u64 *seq)
-{
- struct journal_entry_pin_list *pin_list;
- struct journal_entry_pin *ret = NULL;
-
- fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) {
- if (*seq > seq_to_flush && !allowed_above_seq)
- break;
-
- for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++)
- if (((BIT(i) & allowed_below_seq) && *seq <= seq_to_flush) ||
- (BIT(i) & allowed_above_seq)) {
- ret = list_first_entry_or_null(&pin_list->unflushed[i],
- struct journal_entry_pin, list);
- if (ret)
- return ret;
- }
- }
-
- return NULL;
-}
-
-/* returns true if we did work */
-static size_t journal_flush_pins(struct journal *j,
- u64 seq_to_flush,
- unsigned allowed_below_seq,
- unsigned allowed_above_seq,
- unsigned min_any,
- unsigned min_key_cache)
-{
- struct journal_entry_pin *pin;
- size_t nr_flushed = 0;
- journal_pin_flush_fn flush_fn;
- u64 seq;
- int err;
-
- lockdep_assert_held(&j->reclaim_lock);
-
- while (1) {
- unsigned allowed_above = allowed_above_seq;
- unsigned allowed_below = allowed_below_seq;
-
- if (min_any) {
- allowed_above |= ~0;
- allowed_below |= ~0;
- }
-
- if (min_key_cache) {
- allowed_above |= BIT(JOURNAL_PIN_TYPE_key_cache);
- allowed_below |= BIT(JOURNAL_PIN_TYPE_key_cache);
- }
-
- cond_resched();
-
- j->last_flushed = jiffies;
-
- spin_lock(&j->lock);
- pin = journal_get_next_pin(j, seq_to_flush,
- allowed_below,
- allowed_above, &seq);
- if (pin) {
- BUG_ON(j->flush_in_progress);
- j->flush_in_progress = pin;
- j->flush_in_progress_dropped = false;
- flush_fn = pin->flush;
- }
- spin_unlock(&j->lock);
-
- if (!pin)
- break;
-
- if (min_key_cache && pin->flush == bch2_btree_key_cache_journal_flush)
- min_key_cache--;
-
- if (min_any)
- min_any--;
-
- err = flush_fn(j, pin, seq);
-
- spin_lock(&j->lock);
- /* Pin might have been dropped or rearmed: */
- if (likely(!err && !j->flush_in_progress_dropped))
- list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]);
- j->flush_in_progress = NULL;
- j->flush_in_progress_dropped = false;
- spin_unlock(&j->lock);
-
- wake_up(&j->pin_flush_wait);
-
- if (err)
- break;
-
- nr_flushed++;
- }
-
- return nr_flushed;
-}
-
-static u64 journal_seq_to_flush(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- u64 seq_to_flush = 0;
-
- guard(spinlock)(&j->lock);
- guard(rcu)();
-
- for_each_rw_member_rcu(c, ca) {
- struct journal_device *ja = &ca->journal;
- unsigned nr_buckets, bucket_to_flush;
-
- if (!ja->nr)
- continue;
-
- /* Try to keep the journal at most half full: */
- nr_buckets = ja->nr / 2;
-
- bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
- seq_to_flush = max(seq_to_flush,
- ja->bucket_seq[bucket_to_flush]);
- }
-
- /* Also flush if the pin fifo is more than half full */
- return max_t(s64, seq_to_flush,
- (s64) journal_cur_seq(j) -
- (j->pin.size >> 1));
-}
-
-/**
- * __bch2_journal_reclaim - free up journal buckets
- * @j: journal object
- * @direct: direct or background reclaim?
- * @kicked: requested to run since we last ran?
- *
- * Background journal reclaim writes out btree nodes. It should be run
- * early enough so that we never completely run out of journal buckets.
- *
- * High watermarks for triggering background reclaim:
- * - FIFO has fewer than 512 entries left
- * - fewer than 25% journal buckets free
- *
- * Background reclaim runs until low watermarks are reached:
- * - FIFO has more than 1024 entries left
- * - more than 50% journal buckets free
- *
- * As long as a reclaim can complete in the time it takes to fill up
- * 512 journal entries or 25% of all journal buckets, then
- * journal_next_bucket() should not stall.
- */
-static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct btree_cache *bc = &c->btree_cache;
- bool kthread = (current->flags & PF_KTHREAD) != 0;
- u64 seq_to_flush;
- size_t min_nr, min_key_cache, nr_flushed;
- unsigned flags;
- int ret = 0;
-
- /*
- * We can't invoke memory reclaim while holding the reclaim_lock -
- * journal reclaim is required to make progress for memory reclaim
- * (cleaning the caches), so we can't get stuck in memory reclaim while
- * we're holding the reclaim lock:
- */
- lockdep_assert_held(&j->reclaim_lock);
- flags = memalloc_noreclaim_save();
-
- do {
- if (kthread && kthread_should_stop())
- break;
-
- ret = bch2_journal_error(j);
- if (ret)
- break;
-
- /* XXX shove journal discards off to another thread */
- bch2_journal_do_discards(j);
-
- seq_to_flush = journal_seq_to_flush(j);
- min_nr = 0;
-
- /*
- * If it's been longer than j->reclaim_delay_ms since we last flushed,
- * make sure to flush at least one journal pin:
- */
- if (time_after(jiffies, j->last_flushed +
- msecs_to_jiffies(c->opts.journal_reclaim_delay)))
- min_nr = 1;
-
- if (j->watermark != BCH_WATERMARK_stripe)
- min_nr = 1;
-
- size_t btree_cache_live = bc->live[0].nr + bc->live[1].nr;
- if (atomic_long_read(&bc->nr_dirty) * 2 > btree_cache_live)
- min_nr = 1;
-
- min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
-
- trace_and_count(c, journal_reclaim_start, c,
- direct, kicked,
- min_nr, min_key_cache,
- atomic_long_read(&bc->nr_dirty), btree_cache_live,
- atomic_long_read(&c->btree_key_cache.nr_dirty),
- atomic_long_read(&c->btree_key_cache.nr_keys));
-
- nr_flushed = journal_flush_pins(j, seq_to_flush,
- ~0, 0,
- min_nr, min_key_cache);
-
- if (direct)
- j->nr_direct_reclaim += nr_flushed;
- else
- j->nr_background_reclaim += nr_flushed;
- trace_and_count(c, journal_reclaim_finish, c, nr_flushed);
-
- if (nr_flushed)
- wake_up(&j->reclaim_wait);
- } while ((min_nr || min_key_cache) && nr_flushed && !direct);
-
- memalloc_noreclaim_restore(flags);
-
- return ret;
-}
-
-int bch2_journal_reclaim(struct journal *j)
-{
- return __bch2_journal_reclaim(j, true, true);
-}
-
-static int bch2_journal_reclaim_thread(void *arg)
-{
- struct journal *j = arg;
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- unsigned long delay, now;
- bool journal_empty;
- int ret = 0;
-
- set_freezable();
-
- j->last_flushed = jiffies;
-
- while (!ret && !kthread_should_stop()) {
- bool kicked = j->reclaim_kicked;
-
- j->reclaim_kicked = false;
-
- mutex_lock(&j->reclaim_lock);
- ret = __bch2_journal_reclaim(j, false, kicked);
- mutex_unlock(&j->reclaim_lock);
-
- now = jiffies;
- delay = msecs_to_jiffies(c->opts.journal_reclaim_delay);
- j->next_reclaim = j->last_flushed + delay;
-
- if (!time_in_range(j->next_reclaim, now, now + delay))
- j->next_reclaim = now + delay;
-
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
- if (kthread_should_stop())
- break;
- if (j->reclaim_kicked)
- break;
-
- spin_lock(&j->lock);
- journal_empty = fifo_empty(&j->pin);
- spin_unlock(&j->lock);
-
- long timeout = j->next_reclaim - jiffies;
-
- if (journal_empty)
- schedule();
- else if (timeout > 0)
- schedule_timeout(timeout);
- else
- break;
- }
- __set_current_state(TASK_RUNNING);
- }
-
- return 0;
-}
-
-void bch2_journal_reclaim_stop(struct journal *j)
-{
- struct task_struct *p = j->reclaim_thread;
-
- j->reclaim_thread = NULL;
-
- if (p) {
- kthread_stop(p);
- put_task_struct(p);
- }
-}
-
-int bch2_journal_reclaim_start(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct task_struct *p;
- int ret;
-
- if (j->reclaim_thread)
- return 0;
-
- p = kthread_create(bch2_journal_reclaim_thread, j,
- "bch-reclaim/%s", c->name);
- ret = PTR_ERR_OR_ZERO(p);
- bch_err_msg(c, ret, "creating journal reclaim thread");
- if (ret)
- return ret;
-
- get_task_struct(p);
- j->reclaim_thread = p;
- wake_up_process(p);
- return 0;
-}
-
-static bool journal_pins_still_flushing(struct journal *j, u64 seq_to_flush,
- unsigned types)
-{
- struct journal_entry_pin_list *pin_list;
- u64 seq;
-
- spin_lock(&j->lock);
- fifo_for_each_entry_ptr(pin_list, &j->pin, seq) {
- if (seq > seq_to_flush)
- break;
-
- for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++)
- if ((BIT(i) & types) &&
- (!list_empty(&pin_list->unflushed[i]) ||
- !list_empty(&pin_list->flushed[i]))) {
- spin_unlock(&j->lock);
- return true;
- }
- }
- spin_unlock(&j->lock);
-
- return false;
-}
-
-static bool journal_flush_pins_or_still_flushing(struct journal *j, u64 seq_to_flush,
- unsigned types)
-{
- return journal_flush_pins(j, seq_to_flush, types, 0, 0, 0) ||
- journal_pins_still_flushing(j, seq_to_flush, types);
-}
-
-static int journal_flush_done(struct journal *j, u64 seq_to_flush,
- bool *did_work)
-{
- int ret = 0;
-
- ret = bch2_journal_error(j);
- if (ret)
- return ret;
-
- mutex_lock(&j->reclaim_lock);
-
- for (int type = JOURNAL_PIN_TYPE_NR - 1;
- type >= 0;
- --type)
- if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) {
- *did_work = true;
- goto unlock;
- }
-
- if (seq_to_flush > journal_cur_seq(j))
- bch2_journal_entry_close(j);
-
- spin_lock(&j->lock);
- /*
- * If journal replay hasn't completed, the unreplayed journal entries
- * hold refs on their corresponding sequence numbers
- */
- ret = !test_bit(JOURNAL_replay_done, &j->flags) ||
- journal_last_seq(j) > seq_to_flush ||
- !fifo_used(&j->pin);
-
- spin_unlock(&j->lock);
-unlock:
- mutex_unlock(&j->reclaim_lock);
-
- return ret;
-}
-
-bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
-{
- /* time_stats this */
- bool did_work = false;
-
- if (!test_bit(JOURNAL_running, &j->flags))
- return false;
-
- closure_wait_event(&j->reclaim_flush_wait,
- journal_flush_done(j, seq_to_flush, &did_work));
-
- return did_work;
-}
-
-int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_entry_pin_list *p;
- u64 iter, seq = 0;
- int ret = 0;
-
- spin_lock(&j->lock);
- fifo_for_each_entry_ptr(p, &j->pin, iter)
- if (dev_idx >= 0
- ? bch2_dev_list_has_dev(p->devs, dev_idx)
- : p->devs.nr < c->opts.metadata_replicas)
- seq = iter;
- spin_unlock(&j->lock);
-
- bch2_journal_flush_pins(j, seq);
-
- ret = bch2_journal_error(j);
- if (ret)
- return ret;
-
- mutex_lock(&c->replicas_gc_lock);
- bch2_replicas_gc_start(c, 1 << BCH_DATA_journal);
-
- /*
- * Now that we've populated replicas_gc, write to the journal to mark
- * active journal devices. This handles the case where the journal might
- * be empty. Otherwise we could clear all journal replicas and
- * temporarily put the fs into an unrecoverable state. Journal recovery
- * expects to find devices marked for journal data on unclean mount.
- */
- ret = bch2_journal_meta(&c->journal);
- if (ret)
- goto err;
-
- seq = 0;
- spin_lock(&j->lock);
- while (!ret) {
- union bch_replicas_padded replicas;
-
- seq = max(seq, journal_last_seq(j));
- if (seq >= j->pin.back)
- break;
- bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
- journal_seq_pin(j, seq)->devs);
- seq++;
-
- if (replicas.e.nr_devs) {
- spin_unlock(&j->lock);
- ret = bch2_mark_replicas(c, &replicas.e);
- spin_lock(&j->lock);
- }
- }
- spin_unlock(&j->lock);
-err:
- ret = bch2_replicas_gc_end(c, ret);
- mutex_unlock(&c->replicas_gc_lock);
-
- return ret;
-}
-
-bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
-{
- struct journal_entry_pin_list *pin_list;
- struct journal_entry_pin *pin;
-
- spin_lock(&j->lock);
- if (!test_bit(JOURNAL_running, &j->flags)) {
- spin_unlock(&j->lock);
- return true;
- }
-
- *seq = max(*seq, j->pin.front);
-
- if (*seq >= j->pin.back) {
- spin_unlock(&j->lock);
- return true;
- }
-
- out->atomic++;
-
- pin_list = journal_seq_pin(j, *seq);
-
- prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count));
- printbuf_indent_add(out, 2);
-
- prt_printf(out, "unflushed:\n");
- for (unsigned i = 0; i < ARRAY_SIZE(pin_list->unflushed); i++)
- list_for_each_entry(pin, &pin_list->unflushed[i], list)
- prt_printf(out, "\t%px %ps\n", pin, pin->flush);
-
- prt_printf(out, "flushed:\n");
- for (unsigned i = 0; i < ARRAY_SIZE(pin_list->flushed); i++)
- list_for_each_entry(pin, &pin_list->flushed[i], list)
- prt_printf(out, "\t%px %ps\n", pin, pin->flush);
-
- printbuf_indent_sub(out, 2);
-
- --out->atomic;
- spin_unlock(&j->lock);
-
- return false;
-}
-
-void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
-{
- u64 seq = 0;
-
- while (!bch2_journal_seq_pins_to_text(out, j, &seq))
- seq++;
-}
diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h
deleted file mode 100644
index 0a73d7134e1c..000000000000
--- a/fs/bcachefs/journal_reclaim.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_JOURNAL_RECLAIM_H
-#define _BCACHEFS_JOURNAL_RECLAIM_H
-
-#define JOURNAL_PIN (32 * 1024)
-
-static inline void journal_reclaim_kick(struct journal *j)
-{
- struct task_struct *p = READ_ONCE(j->reclaim_thread);
-
- j->reclaim_kicked = true;
- if (p)
- wake_up_process(p);
-}
-
-unsigned bch2_journal_dev_buckets_available(struct journal *,
- struct journal_device *,
- enum journal_space_from);
-void bch2_journal_set_watermark(struct journal *);
-void bch2_journal_space_available(struct journal *);
-
-static inline bool journal_pin_active(struct journal_entry_pin *pin)
-{
- return pin->seq != 0;
-}
-
-static inline struct journal_entry_pin_list *
-journal_seq_pin(struct journal *j, u64 seq)
-{
- EBUG_ON(seq < j->pin.front || seq >= j->pin.back);
-
- return &j->pin.data[seq & j->pin.mask];
-}
-
-void bch2_journal_reclaim_fast(struct journal *);
-bool __bch2_journal_pin_put(struct journal *, u64);
-void bch2_journal_pin_put(struct journal *, u64);
-void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
-
-void bch2_journal_pin_set(struct journal *, u64, struct journal_entry_pin *,
- journal_pin_flush_fn);
-
-static inline void bch2_journal_pin_add(struct journal *j, u64 seq,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- if (unlikely(!journal_pin_active(pin) || pin->seq > seq))
- bch2_journal_pin_set(j, seq, pin, flush_fn);
-}
-
-void bch2_journal_pin_copy(struct journal *,
- struct journal_entry_pin *,
- struct journal_entry_pin *,
- journal_pin_flush_fn);
-
-static inline void bch2_journal_pin_update(struct journal *j, u64 seq,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- if (unlikely(!journal_pin_active(pin) || pin->seq < seq))
- bch2_journal_pin_set(j, seq, pin, flush_fn);
-}
-
-void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *);
-
-void bch2_journal_do_discards(struct journal *);
-int bch2_journal_reclaim(struct journal *);
-
-void bch2_journal_reclaim_stop(struct journal *);
-int bch2_journal_reclaim_start(struct journal *);
-
-bool bch2_journal_flush_pins(struct journal *, u64);
-
-static inline bool bch2_journal_flush_all_pins(struct journal *j)
-{
- return bch2_journal_flush_pins(j, U64_MAX);
-}
-
-int bch2_journal_flush_device_pins(struct journal *, int);
-
-void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
-bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
-
-#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */
diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c
deleted file mode 100644
index 0cb9b93f13e7..000000000000
--- a/fs/bcachefs/journal_sb.c
+++ /dev/null
@@ -1,232 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "journal_sb.h"
-#include "darray.h"
-
-#include <linux/sort.h>
-
-/* BCH_SB_FIELD_journal: */
-
-static int u64_cmp(const void *_l, const void *_r)
-{
- const u64 *l = _l;
- const u64 *r = _r;
-
- return cmp_int(*l, *r);
-}
-
-static int bch2_sb_journal_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_journal *journal = field_to_type(f, journal);
- struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
- int ret = -BCH_ERR_invalid_sb_journal;
- unsigned nr;
- unsigned i;
- u64 *b;
-
- nr = bch2_nr_journal_buckets(journal);
- if (!nr)
- return 0;
-
- b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL);
- if (!b)
- return -BCH_ERR_ENOMEM_sb_journal_validate;
-
- for (i = 0; i < nr; i++)
- b[i] = le64_to_cpu(journal->buckets[i]);
-
- sort(b, nr, sizeof(u64), u64_cmp, NULL);
-
- if (!b[0]) {
- prt_printf(err, "journal bucket at sector 0");
- goto err;
- }
-
- if (b[0] < le16_to_cpu(m.first_bucket)) {
- prt_printf(err, "journal bucket %llu before first bucket %u",
- b[0], le16_to_cpu(m.first_bucket));
- goto err;
- }
-
- if (b[nr - 1] >= le64_to_cpu(m.nbuckets)) {
- prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)",
- b[nr - 1], le64_to_cpu(m.nbuckets));
- goto err;
- }
-
- for (i = 0; i + 1 < nr; i++)
- if (b[i] == b[i + 1]) {
- prt_printf(err, "duplicate journal buckets %llu", b[i]);
- goto err;
- }
-
- ret = 0;
-err:
- kfree(b);
- return ret;
-}
-
-static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_journal *journal = field_to_type(f, journal);
- unsigned i, nr = bch2_nr_journal_buckets(journal);
-
- prt_printf(out, "Buckets: ");
- for (i = 0; i < nr; i++)
- prt_printf(out, " %llu", le64_to_cpu(journal->buckets[i]));
- prt_newline(out);
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_journal = {
- .validate = bch2_sb_journal_validate,
- .to_text = bch2_sb_journal_to_text,
-};
-
-struct u64_range {
- u64 start;
- u64 end;
-};
-
-static int u64_range_cmp(const void *_l, const void *_r)
-{
- const struct u64_range *l = _l;
- const struct u64_range *r = _r;
-
- return cmp_int(l->start, r->start);
-}
-
-static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
- struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
- int ret = -BCH_ERR_invalid_sb_journal;
- u64 sum = 0;
- unsigned nr;
- unsigned i;
- struct u64_range *b;
-
- nr = bch2_sb_field_journal_v2_nr_entries(journal);
- if (!nr)
- return 0;
-
- b = kmalloc_array(nr, sizeof(*b), GFP_KERNEL);
- if (!b)
- return -BCH_ERR_ENOMEM_sb_journal_v2_validate;
-
- for (i = 0; i < nr; i++) {
- b[i].start = le64_to_cpu(journal->d[i].start);
- b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr);
-
- if (b[i].end <= b[i].start) {
- prt_printf(err, "journal buckets entry with bad nr: %llu+%llu",
- le64_to_cpu(journal->d[i].start),
- le64_to_cpu(journal->d[i].nr));
- goto err;
- }
-
- sum += le64_to_cpu(journal->d[i].nr);
- }
-
- sort(b, nr, sizeof(*b), u64_range_cmp, NULL);
-
- if (!b[0].start) {
- prt_printf(err, "journal bucket at sector 0");
- goto err;
- }
-
- if (b[0].start < le16_to_cpu(m.first_bucket)) {
- prt_printf(err, "journal bucket %llu before first bucket %u",
- b[0].start, le16_to_cpu(m.first_bucket));
- goto err;
- }
-
- if (b[nr - 1].end > le64_to_cpu(m.nbuckets)) {
- prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)",
- b[nr - 1].end - 1, le64_to_cpu(m.nbuckets));
- goto err;
- }
-
- for (i = 0; i + 1 < nr; i++) {
- if (b[i].end > b[i + 1].start) {
- prt_printf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu",
- b[i].start, b[i].end, b[i + 1].start, b[i + 1].end);
- goto err;
- }
- }
-
- if (sum > UINT_MAX) {
- prt_printf(err, "too many journal buckets: %llu > %u", sum, UINT_MAX);
- goto err;
- }
-
- ret = 0;
-err:
- kfree(b);
- return ret;
-}
-
-static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
- unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal);
-
- prt_printf(out, "Buckets: ");
- for (i = 0; i < nr; i++)
- prt_printf(out, " %llu-%llu",
- le64_to_cpu(journal->d[i].start),
- le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr));
- prt_newline(out);
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = {
- .validate = bch2_sb_journal_v2_validate,
- .to_text = bch2_sb_journal_v2_to_text,
-};
-
-int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca,
- u64 *buckets, unsigned nr)
-{
- struct bch_sb_field_journal_v2 *j;
- unsigned i, dst = 0, nr_compacted = 1;
-
- if (c)
- lockdep_assert_held(&c->sb_lock);
-
- if (!nr) {
- bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
- bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2);
- return 0;
- }
-
- for (i = 0; i + 1 < nr; i++)
- if (buckets[i] + 1 != buckets[i + 1])
- nr_compacted++;
-
- j = bch2_sb_field_resize(&ca->disk_sb, journal_v2,
- (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64));
- if (!j)
- return bch_err_throw(c, ENOSPC_sb_journal);
-
- bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
-
- j->d[dst].start = cpu_to_le64(buckets[0]);
- j->d[dst].nr = cpu_to_le64(1);
-
- for (i = 1; i < nr; i++) {
- if (buckets[i] == buckets[i - 1] + 1) {
- le64_add_cpu(&j->d[dst].nr, 1);
- } else {
- dst++;
- j->d[dst].start = cpu_to_le64(buckets[i]);
- j->d[dst].nr = cpu_to_le64(1);
- }
- }
-
- BUG_ON(dst + 1 != nr_compacted);
- return 0;
-}
diff --git a/fs/bcachefs/journal_sb.h b/fs/bcachefs/journal_sb.h
deleted file mode 100644
index ba40a7e8d90a..000000000000
--- a/fs/bcachefs/journal_sb.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#include "super-io.h"
-#include "vstructs.h"
-
-static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j)
-{
- return j
- ? (__le64 *) vstruct_end(&j->field) - j->buckets
- : 0;
-}
-
-static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_journal_v2 *j)
-{
- if (!j)
- return 0;
-
- return (struct bch_sb_field_journal_v2_entry *) vstruct_end(&j->field) - &j->d[0];
-}
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_journal;
-extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2;
-
-int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *, u64 *, unsigned);
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
deleted file mode 100644
index af4fe416d9ec..000000000000
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ /dev/null
@@ -1,264 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "eytzinger.h"
-#include "journal.h"
-#include "journal_seq_blacklist.h"
-#include "super-io.h"
-
-/*
- * journal_seq_blacklist machinery:
- *
- * To guarantee order of btree updates after a crash, we need to detect when a
- * btree node entry (bset) is newer than the newest journal entry that was
- * successfully written, and ignore it - effectively ignoring any btree updates
- * that didn't make it into the journal.
- *
- * If we didn't do this, we might have two btree nodes, a and b, both with
- * updates that weren't written to the journal yet: if b was updated after a,
- * but b was flushed and not a - oops; on recovery we'll find that the updates
- * to b happened, but not the updates to a that happened before it.
- *
- * Ignoring bsets that are newer than the newest journal entry is always safe,
- * because everything they contain will also have been journalled - and must
- * still be present in the journal on disk until a journal entry has been
- * written _after_ that bset was written.
- *
- * To accomplish this, bsets record the newest journal sequence number they
- * contain updates for; then, on startup, the btree code queries the journal
- * code to ask "Is this sequence number newer than the newest journal entry? If
- * so, ignore it."
- *
- * When this happens, we must blacklist that journal sequence number: the
- * journal must not write any entries with that sequence number, and it must
- * record that it was blacklisted so that a) on recovery we don't think we have
- * missing journal entries and b) so that the btree code continues to ignore
- * that bset, until that btree node is rewritten.
- */
-
-static unsigned sb_blacklist_u64s(unsigned nr)
-{
- struct bch_sb_field_journal_seq_blacklist *bl;
-
- return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64);
-}
-
-int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
-{
- struct bch_sb_field_journal_seq_blacklist *bl;
- unsigned i = 0, nr;
- int ret = 0;
-
- mutex_lock(&c->sb_lock);
- bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
- nr = blacklist_nr_entries(bl);
-
- while (i < nr) {
- struct journal_seq_blacklist_entry *e =
- bl->start + i;
-
- if (end < le64_to_cpu(e->start))
- break;
-
- if (start > le64_to_cpu(e->end)) {
- i++;
- continue;
- }
-
- /*
- * Entry is contiguous or overlapping with new entry: merge it
- * with new entry, and delete:
- */
-
- start = min(start, le64_to_cpu(e->start));
- end = max(end, le64_to_cpu(e->end));
- array_remove_item(bl->start, nr, i);
- }
-
- bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
- sb_blacklist_u64s(nr + 1));
- if (!bl) {
- ret = bch_err_throw(c, ENOSPC_sb_journal_seq_blacklist);
- goto out;
- }
-
- array_insert_item(bl->start, nr, i, ((struct journal_seq_blacklist_entry) {
- .start = cpu_to_le64(start),
- .end = cpu_to_le64(end),
- }));
- c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3);
-
- ret = bch2_write_super(c);
-out:
- mutex_unlock(&c->sb_lock);
-
- return ret ?: bch2_blacklist_table_initialize(c);
-}
-
-static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r)
-{
- const struct journal_seq_blacklist_table_entry *l = _l;
- const struct journal_seq_blacklist_table_entry *r = _r;
-
- return cmp_int(l->start, r->start);
-}
-
-bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq,
- bool dirty)
-{
- struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
- struct journal_seq_blacklist_table_entry search = { .start = seq };
- int idx;
-
- if (!t)
- return false;
-
- idx = eytzinger0_find_le(t->entries, t->nr,
- sizeof(t->entries[0]),
- journal_seq_blacklist_table_cmp,
- &search);
- if (idx < 0)
- return false;
-
- BUG_ON(t->entries[idx].start > seq);
-
- if (seq >= t->entries[idx].end)
- return false;
-
- if (dirty)
- t->entries[idx].dirty = true;
- return true;
-}
-
-u64 bch2_journal_last_blacklisted_seq(struct bch_fs *c)
-{
- struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
-
- if (!t || !t->nr)
- return 0;
-
- return t->entries[eytzinger0_last(t->nr)].end - 1;
-}
-
-int bch2_blacklist_table_initialize(struct bch_fs *c)
-{
- struct bch_sb_field_journal_seq_blacklist *bl =
- bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
- struct journal_seq_blacklist_table *t;
- unsigned i, nr = blacklist_nr_entries(bl);
-
- if (!bl)
- return 0;
-
- t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL);
- if (!t)
- return bch_err_throw(c, ENOMEM_blacklist_table_init);
-
- t->nr = nr;
-
- for (i = 0; i < nr; i++) {
- t->entries[i].start = le64_to_cpu(bl->start[i].start);
- t->entries[i].end = le64_to_cpu(bl->start[i].end);
- }
-
- eytzinger0_sort(t->entries,
- t->nr,
- sizeof(t->entries[0]),
- journal_seq_blacklist_table_cmp,
- NULL);
-
- kfree(c->journal_seq_blacklist_table);
- c->journal_seq_blacklist_table = t;
- return 0;
-}
-
-static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_journal_seq_blacklist *bl =
- field_to_type(f, journal_seq_blacklist);
- unsigned i, nr = blacklist_nr_entries(bl);
-
- for (i = 0; i < nr; i++) {
- struct journal_seq_blacklist_entry *e = bl->start + i;
-
- if (le64_to_cpu(e->start) >=
- le64_to_cpu(e->end)) {
- prt_printf(err, "entry %u start >= end (%llu >= %llu)",
- i, le64_to_cpu(e->start), le64_to_cpu(e->end));
- return -BCH_ERR_invalid_sb_journal_seq_blacklist;
- }
-
- if (i + 1 < nr &&
- le64_to_cpu(e[0].end) >
- le64_to_cpu(e[1].start)) {
- prt_printf(err, "entry %u out of order with next entry (%llu > %llu)",
- i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start));
- return -BCH_ERR_invalid_sb_journal_seq_blacklist;
- }
- }
-
- return 0;
-}
-
-static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out,
- struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_journal_seq_blacklist *bl =
- field_to_type(f, journal_seq_blacklist);
- struct journal_seq_blacklist_entry *i;
- unsigned nr = blacklist_nr_entries(bl);
-
- for (i = bl->start; i < bl->start + nr; i++) {
- if (i != bl->start)
- prt_printf(out, " ");
-
- prt_printf(out, "%llu-%llu",
- le64_to_cpu(i->start),
- le64_to_cpu(i->end));
- }
- prt_newline(out);
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = {
- .validate = bch2_sb_journal_seq_blacklist_validate,
- .to_text = bch2_sb_journal_seq_blacklist_to_text
-};
-
-bool bch2_blacklist_entries_gc(struct bch_fs *c)
-{
- struct journal_seq_blacklist_entry *src, *dst;
-
- struct bch_sb_field_journal_seq_blacklist *bl =
- bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
- if (!bl)
- return false;
-
- unsigned nr = blacklist_nr_entries(bl);
- dst = bl->start;
-
- struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
- BUG_ON(nr != t->nr);
-
- src = bl->start;
- eytzinger0_for_each(i, nr) {
- BUG_ON(t->entries[i].start != le64_to_cpu(src->start));
- BUG_ON(t->entries[i].end != le64_to_cpu(src->end));
-
- if (t->entries[i].dirty || t->entries[i].end >= c->journal.oldest_seq_found_ondisk)
- *dst++ = *src;
- src++;
- }
-
- unsigned new_nr = dst - bl->start;
- if (new_nr == nr)
- return false;
-
- bch_verbose(c, "nr blacklist entries was %u, now %u", nr, new_nr);
-
- bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
- new_nr ? sb_blacklist_u64s(new_nr) : 0);
- BUG_ON(new_nr && !bl);
- return true;
-}
diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h
deleted file mode 100644
index f06942ccfcdd..000000000000
--- a/fs/bcachefs/journal_seq_blacklist.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
-#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
-
-static inline unsigned
-blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl)
-{
- return bl
- ? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) /
- sizeof(struct journal_seq_blacklist_entry))
- : 0;
-}
-
-bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool);
-u64 bch2_journal_last_blacklisted_seq(struct bch_fs *);
-int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64);
-int bch2_blacklist_table_initialize(struct bch_fs *);
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist;
-
-bool bch2_blacklist_entries_gc(struct bch_fs *);
-
-#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */
diff --git a/fs/bcachefs/journal_seq_blacklist_format.h b/fs/bcachefs/journal_seq_blacklist_format.h
deleted file mode 100644
index 2566b12dbc04..000000000000
--- a/fs/bcachefs/journal_seq_blacklist_format.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H
-#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H
-
-struct journal_seq_blacklist_entry {
- __le64 start;
- __le64 end;
-};
-
-struct bch_sb_field_journal_seq_blacklist {
- struct bch_sb_field field;
- struct journal_seq_blacklist_entry start[];
-};
-
-#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H */
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
deleted file mode 100644
index 51104bbb99da..000000000000
--- a/fs/bcachefs/journal_types.h
+++ /dev/null
@@ -1,342 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_JOURNAL_TYPES_H
-#define _BCACHEFS_JOURNAL_TYPES_H
-
-#include <linux/cache.h>
-#include <linux/workqueue.h>
-
-#include "alloc_types.h"
-#include "super_types.h"
-#include "fifo.h"
-
-/* btree write buffer steals 8 bits for its own purposes: */
-#define JOURNAL_SEQ_MAX ((1ULL << 56) - 1)
-
-#define JOURNAL_STATE_BUF_BITS 2
-#define JOURNAL_STATE_BUF_NR (1U << JOURNAL_STATE_BUF_BITS)
-#define JOURNAL_STATE_BUF_MASK (JOURNAL_STATE_BUF_NR - 1)
-
-#define JOURNAL_BUF_BITS 4
-#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS)
-#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1)
-
-/*
- * We put JOURNAL_BUF_NR of these in struct journal; we use them for writes to
- * the journal that are being staged or in flight.
- */
-struct journal_buf {
- struct closure io;
- struct jset *data;
-
- __BKEY_PADDED(key, BCH_REPLICAS_MAX);
- struct bch_devs_list devs_written;
-
- struct closure_waitlist wait;
- u64 last_seq; /* copy of data->last_seq */
- long expires;
- u64 flush_time;
-
- unsigned buf_size; /* size in bytes of @data */
- unsigned sectors; /* maximum size for current entry */
- unsigned disk_sectors; /* maximum size entry could have been, if
- buf_size was bigger */
- unsigned u64s_reserved;
- bool noflush:1; /* write has already been kicked off, and was noflush */
- bool must_flush:1; /* something wants a flush */
- bool separate_flush:1;
- bool need_flush_to_write_buffer:1;
- bool write_started:1;
- bool write_allocated:1;
- bool write_done:1;
- u8 idx;
-};
-
-/*
- * Something that makes a journal entry dirty - i.e. a btree node that has to be
- * flushed:
- */
-
-enum journal_pin_type {
- JOURNAL_PIN_TYPE_btree3,
- JOURNAL_PIN_TYPE_btree2,
- JOURNAL_PIN_TYPE_btree1,
- JOURNAL_PIN_TYPE_btree0,
- JOURNAL_PIN_TYPE_key_cache,
- JOURNAL_PIN_TYPE_other,
- JOURNAL_PIN_TYPE_NR,
-};
-
-struct journal_entry_pin_list {
- struct list_head unflushed[JOURNAL_PIN_TYPE_NR];
- struct list_head flushed[JOURNAL_PIN_TYPE_NR];
- atomic_t count;
- struct bch_devs_list devs;
-};
-
-struct journal;
-struct journal_entry_pin;
-typedef int (*journal_pin_flush_fn)(struct journal *j,
- struct journal_entry_pin *, u64);
-
-struct journal_entry_pin {
- struct list_head list;
- journal_pin_flush_fn flush;
- u64 seq;
-};
-
-struct journal_res {
- bool ref;
- u16 u64s;
- u32 offset;
- u64 seq;
-};
-
-union journal_res_state {
- struct {
- atomic64_t counter;
- };
-
- struct {
- u64 v;
- };
-
- struct {
- u64 cur_entry_offset:22,
- idx:2,
- buf0_count:10,
- buf1_count:10,
- buf2_count:10,
- buf3_count:10;
- };
-};
-
-/* bytes: */
-#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */
-#define JOURNAL_ENTRY_SIZE_MAX (4U << 22) /* 16M */
-
-/*
- * We stash some journal state as sentinel values in cur_entry_offset:
- * note - cur_entry_offset is in units of u64s
- */
-#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 22) - 1)
-
-#define JOURNAL_ENTRY_BLOCKED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 2)
-#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1)
-#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX)
-
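/*
 * Editorial sketch: offsets at or above the first sentinel are state markers
 * rather than real entry offsets, so a check like the following (hypothetical
 * helper name) distinguishes the two.
 */
static inline bool example_offset_is_sentinel(u64 cur_entry_offset)
{
	return cur_entry_offset >= JOURNAL_ENTRY_BLOCKED_VAL;
}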
-struct journal_space {
-	/* Units of 512 byte sectors: */
- unsigned next_entry; /* How big the next journal entry can be */
- unsigned total;
-};
-
-enum journal_space_from {
- journal_space_discarded,
- journal_space_clean_ondisk,
- journal_space_clean,
- journal_space_total,
- journal_space_nr,
-};
-
-#define JOURNAL_FLAGS() \
- x(replay_done) \
- x(running) \
- x(may_skip_flush) \
- x(need_flush_write) \
- x(space_low)
-
-enum journal_flags {
-#define x(n) JOURNAL_##n,
- JOURNAL_FLAGS()
-#undef x
-};
-
-struct journal_bio {
- struct bch_dev *ca;
- unsigned buf_idx;
- u64 submit_time;
-
- struct bio bio;
-};
-
-/* Embedded in struct bch_fs */
-struct journal {
- /* Fastpath stuff up front: */
- struct {
-
- union journal_res_state reservations;
- enum bch_watermark watermark;
-
- } __aligned(SMP_CACHE_BYTES);
-
- unsigned long flags;
-
- /* Max size of current journal entry */
- unsigned cur_entry_u64s;
- unsigned cur_entry_sectors;
-
- /* Reserved space in journal entry to be used just prior to write */
- unsigned entry_u64s_reserved;
-
-
- /*
- * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
- * insufficient devices:
- */
- int cur_entry_error;
- unsigned cur_entry_offset_if_blocked;
-
- unsigned buf_size_want;
- /*
- * We may queue up some things to be journalled (log messages) before
- * the journal has actually started - stash them here:
- */
- darray_u64 early_journal_entries;
-
- /*
-	 * Protects journal_buf->data when accessing without a journal
- * reservation: for synchronization between the btree write buffer code
- * and the journal write path:
- */
- struct mutex buf_lock;
- /*
- * Two journal entries -- one is currently open for new entries, the
- * other is possibly being written out.
- */
- struct journal_buf buf[JOURNAL_BUF_NR];
- void *free_buf;
- unsigned free_buf_size;
-
- spinlock_t lock;
-
- /* if nonzero, we may not open a new journal entry: */
- unsigned blocked;
-
- /* Used when waiting because the journal was full */
- wait_queue_head_t wait;
- struct closure_waitlist async_wait;
- struct closure_waitlist reclaim_flush_wait;
-
- struct delayed_work write_work;
- struct workqueue_struct *wq;
-
- /* Sequence number of most recent journal entry (last entry in @pin) */
- atomic64_t seq;
-
- u64 seq_write_started;
- /* seq, last_seq from the most recent journal entry successfully written */
- u64 seq_ondisk;
- u64 flushed_seq_ondisk;
- u64 flushing_seq;
- u64 last_seq_ondisk;
- u64 err_seq;
- u64 last_empty_seq;
- u64 oldest_seq_found_ondisk;
-
- /*
- * FIFO of journal entries whose btree updates have not yet been
- * written out.
- *
- * Each entry is a reference count. The position in the FIFO is the
- * entry's sequence number relative to @seq.
- *
- * The journal entry itself holds a reference count, put when the
- * journal entry is written out. Each btree node modified by the journal
- * entry also holds a reference count, put when the btree node is
- * written.
- *
- * When a reference count reaches zero, the journal entry is no longer
- * needed. When all journal entries in the oldest journal bucket are no
- * longer needed, the bucket can be discarded and reused.
- */
- struct {
- u64 front, back, size, mask;
- struct journal_entry_pin_list *data;
- } pin;
-
- struct journal_space space[journal_space_nr];
-
- u64 replay_journal_seq;
- u64 replay_journal_seq_end;
-
- struct write_point wp;
- spinlock_t err_lock;
-
- struct mutex reclaim_lock;
- /*
- * Used for waiting until journal reclaim has freed up space in the
- * journal:
- */
- wait_queue_head_t reclaim_wait;
- struct task_struct *reclaim_thread;
- bool reclaim_kicked;
- unsigned long next_reclaim;
- u64 nr_direct_reclaim;
- u64 nr_background_reclaim;
-
- unsigned long last_flushed;
- struct journal_entry_pin *flush_in_progress;
- bool flush_in_progress_dropped;
- wait_queue_head_t pin_flush_wait;
-
- /* protects advancing ja->discard_idx: */
- struct mutex discard_lock;
- bool can_discard;
-
- unsigned long last_flush_write;
-
- u64 write_start_time;
-
- u64 nr_flush_writes;
- u64 nr_noflush_writes;
- u64 entry_bytes_written;
-
- struct bch2_time_stats *flush_write_time;
- struct bch2_time_stats *noflush_write_time;
- struct bch2_time_stats *flush_seq_time;
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map res_map;
-#endif
-} __aligned(SMP_CACHE_BYTES);
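/*
 * Editorial sketch (not from the deleted header): given the pin FIFO described
 * in the comment inside struct journal above, one natural lookup of the pin
 * list for a journal sequence number is to mask it into the ring buffer.  The
 * helper name is hypothetical.
 */
static inline struct journal_entry_pin_list *
example_journal_seq_pin(struct journal *j, u64 seq)
{
	return &j->pin.data[seq & j->pin.mask];
}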
-
-/*
- * Embedded in struct bch_dev. First three fields refer to the array of journal
- * buckets, in bch_sb.
- */
-struct journal_device {
- /*
- * For each journal bucket, contains the max sequence number of the
- * journal writes it contains - so we know when a bucket can be reused.
- */
- u64 *bucket_seq;
-
- unsigned sectors_free;
-
- /*
- * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx:
- */
- unsigned discard_idx; /* Next bucket to discard */
- unsigned dirty_idx_ondisk;
- unsigned dirty_idx;
- unsigned cur_idx; /* Journal bucket we're currently writing to */
- unsigned nr;
-
- u64 *buckets;
-
- /* Bio for journal reads/writes to this device */
- struct journal_bio *bio[JOURNAL_BUF_NR];
-
- /* for bch_journal_read_device */
- struct closure read;
- u64 highest_seq_found;
-};
-
-/*
- * journal_entry_res - reserve space in every journal entry:
- */
-struct journal_entry_res {
- unsigned u64s;
-};
-
-#endif /* _BCACHEFS_JOURNAL_TYPES_H */
diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c
deleted file mode 100644
index 1b828bddd11b..000000000000
--- a/fs/bcachefs/keylist.c
+++ /dev/null
@@ -1,50 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey.h"
-#include "keylist.h"
-
-int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s,
- size_t nr_inline_u64s, size_t new_u64s)
-{
- size_t oldsize = bch2_keylist_u64s(l);
- size_t newsize = oldsize + new_u64s;
- u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p;
- u64 *new_keys;
-
- newsize = roundup_pow_of_two(newsize);
-
- if (newsize <= nr_inline_u64s ||
- (old_buf && roundup_pow_of_two(oldsize) == newsize))
- return 0;
-
- new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOFS);
- if (!new_keys)
- return -ENOMEM;
-
- if (!old_buf)
- memcpy_u64s(new_keys, inline_u64s, oldsize);
-
- l->keys_p = new_keys;
- l->top_p = new_keys + oldsize;
-
- return 0;
-}
-
-void bch2_keylist_pop_front(struct keylist *l)
-{
- l->top_p -= bch2_keylist_front(l)->k.u64s;
-
- memmove_u64s_down(l->keys,
- bkey_next(l->keys),
- bch2_keylist_u64s(l));
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_verify_keylist_sorted(struct keylist *l)
-{
- for_each_keylist_key(l, k)
- BUG_ON(bkey_next(k) != l->top &&
- bpos_ge(k->k.p, bkey_next(k)->k.p));
-}
-#endif
diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h
deleted file mode 100644
index e687e0e9aede..000000000000
--- a/fs/bcachefs/keylist.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_KEYLIST_H
-#define _BCACHEFS_KEYLIST_H
-
-#include "keylist_types.h"
-
-int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t);
-void bch2_keylist_pop_front(struct keylist *);
-
-static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys)
-{
- l->top_p = l->keys_p = inline_keys;
-}
-
-static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys)
-{
- if (l->keys_p != inline_keys)
- kfree(l->keys_p);
-}
-
-static inline void bch2_keylist_push(struct keylist *l)
-{
- l->top = bkey_next(l->top);
-}
-
-static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k)
-{
- bkey_copy(l->top, k);
- bch2_keylist_push(l);
-}
-
-static inline bool bch2_keylist_empty(struct keylist *l)
-{
- return l->top == l->keys;
-}
-
-static inline size_t bch2_keylist_u64s(struct keylist *l)
-{
- return l->top_p - l->keys_p;
-}
-
-static inline size_t bch2_keylist_bytes(struct keylist *l)
-{
- return bch2_keylist_u64s(l) * sizeof(u64);
-}
-
-static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
-{
- return l->keys;
-}
-
-#define for_each_keylist_key(_keylist, _k) \
- for (struct bkey_i *_k = (_keylist)->keys; \
- _k != (_keylist)->top; \
- _k = bkey_next(_k))
-
-static inline u64 keylist_sectors(struct keylist *keys)
-{
- u64 ret = 0;
-
- for_each_keylist_key(keys, k)
- ret += k->k.size;
- return ret;
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_verify_keylist_sorted(struct keylist *);
-#else
-static inline void bch2_verify_keylist_sorted(struct keylist *l) {}
-#endif
-
-#endif /* _BCACHEFS_KEYLIST_H */
diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h
deleted file mode 100644
index 4b3ff7d8a875..000000000000
--- a/fs/bcachefs/keylist_types.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_KEYLIST_TYPES_H
-#define _BCACHEFS_KEYLIST_TYPES_H
-
-struct keylist {
- union {
- struct bkey_i *keys;
- u64 *keys_p;
- };
- union {
- struct bkey_i *top;
- u64 *top_p;
- };
-};
-
-#endif /* _BCACHEFS_KEYLIST_TYPES_H */
diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c
deleted file mode 100644
index 75f27ec26f85..000000000000
--- a/fs/bcachefs/logged_ops.c
+++ /dev/null
@@ -1,119 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "error.h"
-#include "io_misc.h"
-#include "logged_ops.h"
-#include "super.h"
-
-struct bch_logged_op_fn {
- u8 type;
- int (*resume)(struct btree_trans *, struct bkey_i *);
-};
-
-static const struct bch_logged_op_fn logged_op_fns[] = {
-#define x(n) { \
- .type = KEY_TYPE_logged_op_##n, \
- .resume = bch2_resume_logged_op_##n, \
-},
- BCH_LOGGED_OPS()
-#undef x
-};
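/*
 * For reference (editorial expansion sketch): with BCH_LOGGED_OPS() listing
 * truncate and finsert, the x-macro table above expands to:
 *
 *	{ .type = KEY_TYPE_logged_op_truncate, .resume = bch2_resume_logged_op_truncate },
 *	{ .type = KEY_TYPE_logged_op_finsert,  .resume = bch2_resume_logged_op_finsert  },
 */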
-
-static const struct bch_logged_op_fn *logged_op_fn(enum bch_bkey_type type)
-{
- for (unsigned i = 0; i < ARRAY_SIZE(logged_op_fns); i++)
- if (logged_op_fns[i].type == type)
- return logged_op_fns + i;
- return NULL;
-}
-
-static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- u32 restart_count = trans->restart_count;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- fsck_err_on(test_bit(BCH_FS_clean_recovery, &c->flags),
- trans, logged_op_but_clean,
- "filesystem marked as clean but have logged op\n%s",
- (bch2_bkey_val_to_text(&buf, c, k),
- buf.buf));
-
- struct bkey_buf sk;
- bch2_bkey_buf_init(&sk);
- bch2_bkey_buf_reassemble(&sk, c, k);
-
- const struct bch_logged_op_fn *fn = logged_op_fn(sk.k->k.type);
- if (fn)
- fn->resume(trans, sk.k);
-
- ret = bch2_logged_op_finish(trans, sk.k);
-
- bch2_bkey_buf_exit(&sk, c);
-fsck_err:
- printbuf_exit(&buf);
- return ret ?: trans_was_restarted(trans, restart_count);
-}
-
-int bch2_resume_logged_ops(struct bch_fs *c)
-{
- int ret = bch2_trans_run(c,
- for_each_btree_key_max(trans, iter,
- BTREE_ID_logged_ops,
- POS(LOGGED_OPS_INUM_logged_ops, 0),
- POS(LOGGED_OPS_INUM_logged_ops, U64_MAX),
- BTREE_ITER_prefetch, k,
- resume_logged_op(trans, &iter, k)));
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
-{
- struct btree_iter iter;
- int ret = bch2_bkey_get_empty_slot(trans, &iter,
- BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM_logged_ops, U64_MAX));
- if (ret)
- return ret;
-
- k->k.p = iter.pos;
-
- ret = bch2_trans_update(trans, &iter, k, 0);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
-{
- return commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- __bch2_logged_op_start(trans, k));
-}
-
-int bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
-{
- int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0));
- /*
- * This needs to be a fatal error because we've left an unfinished
- * operation in the logged ops btree.
- *
- * We should only ever see an error here if the filesystem has already
- * been shut down, but make sure of that here:
- */
- if (ret) {
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
- bch2_fs_fatal_error(c, "deleting logged operation %s: %s",
- buf.buf, bch2_err_str(ret));
- printbuf_exit(&buf);
- }
-
- return ret;
-}
diff --git a/fs/bcachefs/logged_ops.h b/fs/bcachefs/logged_ops.h
deleted file mode 100644
index 30ae9ef737dd..000000000000
--- a/fs/bcachefs/logged_ops.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_LOGGED_OPS_H
-#define _BCACHEFS_LOGGED_OPS_H
-
-#include "bkey.h"
-
-#define BCH_LOGGED_OPS() \
- x(truncate) \
- x(finsert)
-
-static inline int bch2_logged_op_update(struct btree_trans *trans, struct bkey_i *op)
-{
- return bch2_btree_insert_nonextent(trans, BTREE_ID_logged_ops, op, 0);
-}
-
-int bch2_resume_logged_ops(struct bch_fs *);
-int bch2_logged_op_start(struct btree_trans *, struct bkey_i *);
-int bch2_logged_op_finish(struct btree_trans *, struct bkey_i *);
-
-#endif /* _BCACHEFS_LOGGED_OPS_H */
diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h
deleted file mode 100644
index cfb67c95d4c8..000000000000
--- a/fs/bcachefs/logged_ops_format.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H
-#define _BCACHEFS_LOGGED_OPS_FORMAT_H
-
-enum logged_ops_inums {
- LOGGED_OPS_INUM_logged_ops,
- LOGGED_OPS_INUM_inode_cursors,
-};
-
-struct bch_logged_op_truncate {
- struct bch_val v;
- __le32 subvol;
- __le32 pad;
- __le64 inum;
- __le64 new_i_size;
-};
-
-enum logged_op_finsert_state {
- LOGGED_OP_FINSERT_start,
- LOGGED_OP_FINSERT_shift_extents,
- LOGGED_OP_FINSERT_finish,
-};
-
-struct bch_logged_op_finsert {
- struct bch_val v;
- __u8 state;
- __u8 pad[3];
- __le32 subvol;
- __le64 inum;
- __le64 dst_offset;
- __le64 src_offset;
- __le64 pos;
-};
-
-#endif /* _BCACHEFS_LOGGED_OPS_FORMAT_H */
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
deleted file mode 100644
index 57b5b3263b08..000000000000
--- a/fs/bcachefs/lru.c
+++ /dev/null
@@ -1,223 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "bkey_buf.h"
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "ec.h"
-#include "error.h"
-#include "lru.h"
-#include "recovery.h"
-
-/* KEY_TYPE_lru is obsolete: */
-int bch2_lru_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(!lru_pos_time(k.k->p),
- c, lru_entry_at_time_0,
- "lru entry at time=0");
-fsck_err:
- return ret;
-}
-
-void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- const struct bch_lru *lru = bkey_s_c_to_lru(k).v;
-
- prt_printf(out, "idx %llu", le64_to_cpu(lru->idx));
-}
-
-void bch2_lru_pos_to_text(struct printbuf *out, struct bpos lru)
-{
- prt_printf(out, "%llu:%llu -> %llu:%llu",
- lru_pos_id(lru),
- lru_pos_time(lru),
- u64_to_bucket(lru.offset).inode,
- u64_to_bucket(lru.offset).offset);
-}
-
-static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id,
- u64 dev_bucket, u64 time, bool set)
-{
- return time
- ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru,
- lru_pos(lru_id, dev_bucket, time), set)
- : 0;
-}
-
-int bch2_lru_del(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
-{
- return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_deleted);
-}
-
-int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
-{
- return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set);
-}
-
-int __bch2_lru_change(struct btree_trans *trans,
- u16 lru_id, u64 dev_bucket,
- u64 old_time, u64 new_time)
-{
- if (old_time == new_time)
- return 0;
-
- return bch2_lru_del(trans, lru_id, dev_bucket, old_time) ?:
- bch2_lru_set(trans, lru_id, dev_bucket, new_time);
-}
-
-static const char * const bch2_lru_types[] = {
-#define x(n) #n,
- BCH_LRU_TYPES()
-#undef x
- NULL
-};
-
-int bch2_lru_check_set(struct btree_trans *trans,
- u16 lru_id,
- u64 dev_bucket,
- u64 time,
- struct bkey_s_c referring_k,
- struct bkey_buf *last_flushed)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- struct btree_iter lru_iter;
- struct bkey_s_c lru_k =
- bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
- lru_pos(lru_id, dev_bucket, time), 0);
- int ret = bkey_err(lru_k);
- if (ret)
- return ret;
-
- if (lru_k.k->type != KEY_TYPE_set) {
- ret = bch2_btree_write_buffer_maybe_flush(trans, referring_k, last_flushed);
- if (ret)
- goto err;
-
- if (fsck_err(trans, alloc_key_to_missing_lru_entry,
- "missing %s lru entry\n%s",
- bch2_lru_types[lru_type(lru_k)],
- (bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) {
- ret = bch2_lru_set(trans, lru_id, dev_bucket, time);
- if (ret)
- goto err;
- }
- }
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &lru_iter);
- printbuf_exit(&buf);
- return ret;
-}
-
-static struct bbpos lru_pos_to_bp(struct bkey_s_c lru_k)
-{
- enum bch_lru_type type = lru_type(lru_k);
-
- switch (type) {
- case BCH_LRU_read:
- case BCH_LRU_fragmentation:
- return BBPOS(BTREE_ID_alloc, u64_to_bucket(lru_k.k->p.offset));
- case BCH_LRU_stripes:
- return BBPOS(BTREE_ID_stripes, POS(0, lru_k.k->p.offset));
- default:
- BUG();
- }
-}
-
-static u64 bkey_lru_type_idx(struct bch_fs *c,
- enum bch_lru_type type,
- struct bkey_s_c k)
-{
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a;
-
- switch (type) {
- case BCH_LRU_read:
- a = bch2_alloc_to_v4(k, &a_convert);
- return alloc_lru_idx_read(*a);
- case BCH_LRU_fragmentation: {
- a = bch2_alloc_to_v4(k, &a_convert);
-
- guard(rcu)();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode);
- return ca
- ? alloc_lru_idx_fragmentation(*a, ca)
- : 0;
- }
- case BCH_LRU_stripes:
- return k.k->type == KEY_TYPE_stripe
- ? stripe_lru_pos(bkey_s_c_to_stripe(k).v)
- : 0;
- default:
- BUG();
- }
-}
-
-static int bch2_check_lru_key(struct btree_trans *trans,
- struct btree_iter *lru_iter,
- struct bkey_s_c lru_k,
- struct bkey_buf *last_flushed)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf1 = PRINTBUF;
- struct printbuf buf2 = PRINTBUF;
-
- struct bbpos bp = lru_pos_to_bp(lru_k);
-
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, bp.btree, bp.pos, 0);
- int ret = bkey_err(k);
- if (ret)
- goto err;
-
- enum bch_lru_type type = lru_type(lru_k);
- u64 idx = bkey_lru_type_idx(c, type, k);
-
- if (lru_pos_time(lru_k.k->p) != idx) {
- ret = bch2_btree_write_buffer_maybe_flush(trans, lru_k, last_flushed);
- if (ret)
- goto err;
-
- if (fsck_err(trans, lru_entry_bad,
- "incorrect lru entry: lru %s time %llu\n"
- "%s\n"
- "for %s",
- bch2_lru_types[type],
- lru_pos_time(lru_k.k->p),
- (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf),
- (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf)))
- ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false);
- }
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &iter);
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
- return ret;
-}
-
-int bch2_check_lrus(struct bch_fs *c)
-{
- struct bkey_buf last_flushed;
-
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_check_lru_key(trans, &iter, k, &last_flushed)));
-
- bch2_bkey_buf_exit(&last_flushed, c);
- bch_err_fn(c, ret);
- return ret;
-
-}
diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h
deleted file mode 100644
index 8abd0aa2083a..000000000000
--- a/fs/bcachefs/lru.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_LRU_H
-#define _BCACHEFS_LRU_H
-
-static inline u64 lru_pos_id(struct bpos pos)
-{
- return pos.inode >> LRU_TIME_BITS;
-}
-
-static inline u64 lru_pos_time(struct bpos pos)
-{
- return pos.inode & ~(~0ULL << LRU_TIME_BITS);
-}
-
-static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time)
-{
- struct bpos pos = POS(((u64) lru_id << LRU_TIME_BITS)|time, dev_bucket);
-
- EBUG_ON(time > LRU_TIME_MAX);
- EBUG_ON(lru_pos_id(pos) != lru_id);
- EBUG_ON(lru_pos_time(pos) != time);
- EBUG_ON(pos.offset != dev_bucket);
-
- return pos;
-}
-
-static inline enum bch_lru_type lru_type(struct bkey_s_c l)
-{
- u16 lru_id = l.k->p.inode >> 48;
-
- switch (lru_id) {
- case BCH_LRU_BUCKET_FRAGMENTATION:
- return BCH_LRU_fragmentation;
- case BCH_LRU_STRIPE_FRAGMENTATION:
- return BCH_LRU_stripes;
- default:
- return BCH_LRU_read;
- }
-}
-
-int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context);
-void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-void bch2_lru_pos_to_text(struct printbuf *, struct bpos);
-
-#define bch2_bkey_ops_lru ((struct bkey_ops) { \
- .key_validate = bch2_lru_validate, \
- .val_to_text = bch2_lru_to_text, \
- .min_val_size = 8, \
-})
-
-int bch2_lru_del(struct btree_trans *, u16, u64, u64);
-int bch2_lru_set(struct btree_trans *, u16, u64, u64);
-int __bch2_lru_change(struct btree_trans *, u16, u64, u64, u64);
-
-static inline int bch2_lru_change(struct btree_trans *trans,
- u16 lru_id, u64 dev_bucket,
- u64 old_time, u64 new_time)
-{
- return old_time != new_time
- ? __bch2_lru_change(trans, lru_id, dev_bucket, old_time, new_time)
- : 0;
-}
-
-struct bkey_buf;
-int bch2_lru_check_set(struct btree_trans *, u16, u64, u64, struct bkey_s_c, struct bkey_buf *);
-
-int bch2_check_lrus(struct bch_fs *);
-
-#endif /* _BCACHEFS_LRU_H */
diff --git a/fs/bcachefs/lru_format.h b/fs/bcachefs/lru_format.h
deleted file mode 100644
index b7392ad8e41f..000000000000
--- a/fs/bcachefs/lru_format.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_LRU_FORMAT_H
-#define _BCACHEFS_LRU_FORMAT_H
-
-struct bch_lru {
- struct bch_val v;
- __le64 idx;
-} __packed __aligned(8);
-
-#define BCH_LRU_TYPES() \
- x(read) \
- x(fragmentation) \
- x(stripes)
-
-enum bch_lru_type {
-#define x(n) BCH_LRU_##n,
- BCH_LRU_TYPES()
-#undef x
-};
-
-#define BCH_LRU_BUCKET_FRAGMENTATION ((1U << 16) - 1)
-#define BCH_LRU_STRIPE_FRAGMENTATION ((1U << 16) - 2)
-
-#define LRU_TIME_BITS 48
-#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1)
-
-#endif /* _BCACHEFS_LRU_FORMAT_H */
diff --git a/fs/bcachefs/mean_and_variance.c b/fs/bcachefs/mean_and_variance.c
deleted file mode 100644
index 0ea9f30803a2..000000000000
--- a/fs/bcachefs/mean_and_variance.c
+++ /dev/null
@@ -1,173 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Functions for incremental mean and variance.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * Copyright © 2022 Daniel B. Hill
- *
- * Author: Daniel B. Hill <daniel@gluo.nz>
- *
- * Description:
- *
- * This includes some incremental algorithms for mean and variance calculation.
- *
- * Derived from the paper: https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf
- *
- * Create a struct; for the weighted variant, pass the same weight (weight = 2^k) to every call.
- *
- * Use mean_and_variance[_weighted]_update() on the struct to update its state.
- *
- * Use the mean_and_variance[_weighted]_get_* functions to calculate the mean and variance; some computation
- * is deferred to these functions for performance reasons.
- *
- * see lib/math/mean_and_variance_test.c for examples of usage.
- *
- * DO NOT access the mean and variance fields of the weighted variants directly.
- * DO NOT change the weight after calling update.
- */
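/*
 * Minimal usage sketch (editorial; values mirror mean_and_variance_basic_test()
 * in mean_and_variance_test.c):
 *
 *	struct mean_and_variance s = {};
 *
 *	mean_and_variance_update(&s, 2);
 *	mean_and_variance_update(&s, 2);
 *	mean_and_variance_update(&s, 4);
 *	mean_and_variance_update(&s, 4);
 *
 *	mean_and_variance_get_mean(s);		returns 3
 *	mean_and_variance_get_variance(s);	returns 1
 */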
-
-#include <linux/bug.h>
-#include <linux/compiler.h>
-#include <linux/export.h>
-#include <linux/limits.h>
-#include <linux/math.h>
-#include <linux/math64.h>
-#include <linux/module.h>
-
-#include "mean_and_variance.h"
-
-u128_u u128_div(u128_u n, u64 d)
-{
- u128_u r;
- u64 rem;
- u64 hi = u128_hi(n);
- u64 lo = u128_lo(n);
- u64 h = hi & ((u64) U32_MAX << 32);
- u64 l = (hi & (u64) U32_MAX) << 32;
-
- r = u128_shl(u64_to_u128(div64_u64_rem(h, d, &rem)), 64);
- r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l + (rem << 32), d, &rem)), 32));
- r = u128_add(r, u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem)));
- return r;
-}
-EXPORT_SYMBOL_GPL(u128_div);
-
-/**
- * mean_and_variance_get_mean() - get mean from @s
- * @s: mean and variance number of samples and their sums
- */
-s64 mean_and_variance_get_mean(struct mean_and_variance s)
-{
- return s.n ? div64_u64(s.sum, s.n) : 0;
-}
-EXPORT_SYMBOL_GPL(mean_and_variance_get_mean);
-
-/**
- * mean_and_variance_get_variance() - get variance from @s1
- * @s1: mean and variance number of samples and sums
- *
- * see linked pdf equation 12.
- */
-u64 mean_and_variance_get_variance(struct mean_and_variance s1)
-{
- if (s1.n) {
- u128_u s2 = u128_div(s1.sum_squares, s1.n);
- u64 s3 = abs(mean_and_variance_get_mean(s1));
-
- return u128_lo(u128_sub(s2, u128_square(s3)));
- } else {
- return 0;
- }
-}
-EXPORT_SYMBOL_GPL(mean_and_variance_get_variance);
-
-/**
- * mean_and_variance_get_stddev() - get standard deviation from @s
- * @s: mean and variance number of samples and their sums
- */
-u32 mean_and_variance_get_stddev(struct mean_and_variance s)
-{
- return int_sqrt64(mean_and_variance_get_variance(s));
-}
-EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev);
-
-/**
- * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update()
- * @s: mean and variance number of samples and their sums
- * @x: new value to include in the &mean_and_variance_weighted
- * @initted: caller must track whether this is the first use or not
- * @weight: ewma weight
- *
- * see linked pdf: function derived from equations 140-143 where alpha = 2^w.
- * values are stored bitshifted for performance and added precision.
- */
-void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
- s64 x, bool initted, u8 weight)
-{
- // previous weighted variance.
- u8 w = weight;
- u64 var_w0 = s->variance;
- // new value weighted.
- s64 x_w = x << w;
- s64 diff_w = x_w - s->mean;
- s64 diff = fast_divpow2(diff_w, w);
- // new mean weighted.
- s64 u_w1 = s->mean + diff;
-
- if (!initted) {
- s->mean = x_w;
- s->variance = 0;
- } else {
- s->mean = u_w1;
- s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w;
- }
-}
-EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
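/*
 * Editorial restatement of the update above in un-shifted terms, with
 * alpha = 1/2^weight:
 *
 *	mean'     = mean + alpha * (x - mean)
 *	variance' = (1 - alpha) * (variance + alpha * (x - mean)^2)
 */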
-
-/**
- * mean_and_variance_weighted_get_mean() - get mean from @s
- * @s: mean and variance number of samples and their sums
- * @weight: ewma weight
- */
-s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
- u8 weight)
-{
- return fast_divpow2(s.mean, weight);
-}
-EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
-
-/**
- * mean_and_variance_weighted_get_variance() - get variance from @s
- * @s: mean and variance number of samples and their sums
- * @weight: ewma weight
- */
-u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
- u8 weight)
-{
-	// always positive, don't need fast_divpow2
- return s.variance >> weight;
-}
-EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
-
-/**
- * mean_and_variance_weighted_get_stddev() - get standard deviation from @s
- * @s: mean and variance number of samples and their sums
- * @weight: ewma weight
- */
-u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
- u8 weight)
-{
- return int_sqrt64(mean_and_variance_weighted_get_variance(s, weight));
-}
-EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev);
-
-MODULE_AUTHOR("Daniel B. Hill");
-MODULE_LICENSE("GPL");
diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h
deleted file mode 100644
index 47e4a3c3d26e..000000000000
--- a/fs/bcachefs/mean_and_variance.h
+++ /dev/null
@@ -1,203 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef MEAN_AND_VARIANCE_H_
-#define MEAN_AND_VARIANCE_H_
-
-#include <linux/types.h>
-#include <linux/limits.h>
-#include <linux/math.h>
-#include <linux/math64.h>
-
-#define SQRT_U64_MAX 4294967295ULL
-
-/*
- * u128_u: u128 user mode, because not all architectures support a real int128
- * type
- *
- * We don't use this version in userspace, because in userspace we link with
- * Rust and rustc has issues with u128.
- */
-
-#if defined(__SIZEOF_INT128__) && defined(__KERNEL__) && !defined(CONFIG_PARISC)
-
-typedef struct {
- unsigned __int128 v;
-} __aligned(16) u128_u;
-
-static inline u128_u u64_to_u128(u64 a)
-{
- return (u128_u) { .v = a };
-}
-
-static inline u64 u128_lo(u128_u a)
-{
- return a.v;
-}
-
-static inline u64 u128_hi(u128_u a)
-{
- return a.v >> 64;
-}
-
-static inline u128_u u128_add(u128_u a, u128_u b)
-{
- a.v += b.v;
- return a;
-}
-
-static inline u128_u u128_sub(u128_u a, u128_u b)
-{
- a.v -= b.v;
- return a;
-}
-
-static inline u128_u u128_shl(u128_u a, s8 shift)
-{
- a.v <<= shift;
- return a;
-}
-
-static inline u128_u u128_square(u64 a)
-{
- u128_u b = u64_to_u128(a);
-
- b.v *= b.v;
- return b;
-}
-
-#else
-
-typedef struct {
- u64 hi, lo;
-} __aligned(16) u128_u;
-
-/* conversions */
-
-static inline u128_u u64_to_u128(u64 a)
-{
- return (u128_u) { .lo = a };
-}
-
-static inline u64 u128_lo(u128_u a)
-{
- return a.lo;
-}
-
-static inline u64 u128_hi(u128_u a)
-{
- return a.hi;
-}
-
-/* arithmetic */
-
-static inline u128_u u128_add(u128_u a, u128_u b)
-{
- u128_u c;
-
- c.lo = a.lo + b.lo;
- c.hi = a.hi + b.hi + (c.lo < a.lo);
- return c;
-}
-
-static inline u128_u u128_sub(u128_u a, u128_u b)
-{
- u128_u c;
-
- c.lo = a.lo - b.lo;
- c.hi = a.hi - b.hi - (c.lo > a.lo);
- return c;
-}
-
-static inline u128_u u128_shl(u128_u i, s8 shift)
-{
- u128_u r;
-
- r.lo = i.lo << (shift & 63);
- if (shift < 64)
- r.hi = (i.hi << (shift & 63)) | (i.lo >> (-shift & 63));
- else {
- r.hi = i.lo << (-shift & 63);
- r.lo = 0;
- }
- return r;
-}
-
-static inline u128_u u128_square(u64 i)
-{
- u128_u r;
- u64 h = i >> 32, l = i & U32_MAX;
-
- r = u128_shl(u64_to_u128(h*h), 64);
- r = u128_add(r, u128_shl(u64_to_u128(h*l), 32));
- r = u128_add(r, u128_shl(u64_to_u128(l*h), 32));
- r = u128_add(r, u64_to_u128(l*l));
- return r;
-}
-
-#endif
-
-static inline u128_u u64s_to_u128(u64 hi, u64 lo)
-{
- u128_u c = u64_to_u128(hi);
-
- c = u128_shl(c, 64);
- c = u128_add(c, u64_to_u128(lo));
- return c;
-}
-
-u128_u u128_div(u128_u n, u64 d);
-
-struct mean_and_variance {
- s64 n;
- s64 sum;
- u128_u sum_squares;
-};
-
-/* exponentially weighted variant */
-struct mean_and_variance_weighted {
- s64 mean;
- u64 variance;
-};
-
-/**
- * fast_divpow2() - fast approximation for n / (1 << d)
- * @n: numerator
- * @d: the power of 2 denominator.
- *
- * note: this rounds towards 0.
- */
-static inline s64 fast_divpow2(s64 n, u8 d)
-{
- return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d;
-}
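/*
 * Worked example (editorial): fast_divpow2(-7, 1) computes (-7 + 1) >> 1 = -3,
 * i.e. -7/2 rounded towards zero, whereas a plain arithmetic shift -7 >> 1
 * would give -4.
 */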
-
-/**
- * mean_and_variance_update() - update a mean_and_variance struct @s with a new sample @v
- * @s: the mean_and_variance to update.
- * @v: the new sample.
- *
- * see linked pdf equation 12.
- */
-static inline void
-mean_and_variance_update(struct mean_and_variance *s, s64 v)
-{
- s->n++;
- s->sum += v;
- s->sum_squares = u128_add(s->sum_squares, u128_square(abs(v)));
-}
-
-s64 mean_and_variance_get_mean(struct mean_and_variance s);
-u64 mean_and_variance_get_variance(struct mean_and_variance s1);
-u32 mean_and_variance_get_stddev(struct mean_and_variance s);
-
-void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
- s64 v, bool initted, u8 weight);
-
-s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
- u8 weight);
-u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
- u8 weight);
-u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
- u8 weight);
-
-#endif // MEAN_AND_VARIANCE_H_
diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c
deleted file mode 100644
index e9d9c0212e44..000000000000
--- a/fs/bcachefs/mean_and_variance_test.c
+++ /dev/null
@@ -1,221 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <kunit/test.h>
-
-#include "mean_and_variance.h"
-
-#define MAX_SQR (SQRT_U64_MAX*SQRT_U64_MAX)
-
-static void mean_and_variance_basic_test(struct kunit *test)
-{
- struct mean_and_variance s = {};
-
- mean_and_variance_update(&s, 2);
- mean_and_variance_update(&s, 2);
-
- KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 2);
- KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 0);
- KUNIT_EXPECT_EQ(test, s.n, 2);
-
- mean_and_variance_update(&s, 4);
- mean_and_variance_update(&s, 4);
-
- KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 3);
- KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 1);
- KUNIT_EXPECT_EQ(test, s.n, 4);
-}
-
-/*
- * Test values computed using a spreadsheet from the pseudocode at the bottom:
- * https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf
- */
-
-static void mean_and_variance_weighted_test(struct kunit *test)
-{
- struct mean_and_variance_weighted s = { };
-
- mean_and_variance_weighted_update(&s, 10, false, 2);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 10);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0);
-
- mean_and_variance_weighted_update(&s, 20, true, 2);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 12);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18);
-
- mean_and_variance_weighted_update(&s, 30, true, 2);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 16);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72);
-
- s = (struct mean_and_variance_weighted) { };
-
- mean_and_variance_weighted_update(&s, -10, false, 2);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -10);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0);
-
- mean_and_variance_weighted_update(&s, -20, true, 2);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -12);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18);
-
- mean_and_variance_weighted_update(&s, -30, true, 2);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -16);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72);
-}
-
-static void mean_and_variance_weighted_advanced_test(struct kunit *test)
-{
- struct mean_and_variance_weighted s = { };
- bool initted = false;
- s64 i;
-
- for (i = 10; i <= 100; i += 10) {
- mean_and_variance_weighted_update(&s, i, initted, 8);
- initted = true;
- }
-
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), 11);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107);
-
- s = (struct mean_and_variance_weighted) { };
- initted = false;
-
- for (i = -10; i >= -100; i -= 10) {
- mean_and_variance_weighted_update(&s, i, initted, 8);
- initted = true;
- }
-
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), -11);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107);
-}
-
-static void do_mean_and_variance_test(struct kunit *test,
- s64 initial_value,
- s64 initial_n,
- s64 n,
- unsigned weight,
- s64 *data,
- s64 *mean,
- s64 *stddev,
- s64 *weighted_mean,
- s64 *weighted_stddev)
-{
- struct mean_and_variance mv = {};
- struct mean_and_variance_weighted vw = { };
-
- for (unsigned i = 0; i < initial_n; i++) {
- mean_and_variance_update(&mv, initial_value);
- mean_and_variance_weighted_update(&vw, initial_value, false, weight);
-
- KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), initial_value);
- KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), 0);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), initial_value);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),0);
- }
-
- for (unsigned i = 0; i < n; i++) {
- mean_and_variance_update(&mv, data[i]);
- mean_and_variance_weighted_update(&vw, data[i], true, weight);
-
- KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), mean[i]);
- KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), stddev[i]);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), weighted_mean[i]);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),weighted_stddev[i]);
- }
-
- KUNIT_EXPECT_EQ(test, mv.n, initial_n + n);
-}
-
-/* Test behaviour with a single outlier, then back to steady state: */
-static void mean_and_variance_test_1(struct kunit *test)
-{
- s64 d[] = { 100, 10, 10, 10, 10, 10, 10 };
- s64 mean[] = { 22, 21, 20, 19, 18, 17, 16 };
- s64 stddev[] = { 32, 29, 28, 27, 26, 25, 24 };
- s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 };
- s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 };
-
- do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
- d, mean, stddev, weighted_mean, weighted_stddev);
-}
-
-/* Test behaviour where we switch from one steady state to another: */
-static void mean_and_variance_test_2(struct kunit *test)
-{
- s64 d[] = { 100, 100, 100, 100, 100 };
- s64 mean[] = { 22, 32, 40, 46, 50 };
- s64 stddev[] = { 32, 39, 42, 44, 45 };
- s64 weighted_mean[] = { 32, 49, 61, 71, 78 };
- s64 weighted_stddev[] = { 38, 44, 44, 41, 38 };
-
- do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
- d, mean, stddev, weighted_mean, weighted_stddev);
-}
-
-static void mean_and_variance_fast_divpow2(struct kunit *test)
-{
- s64 i;
- u8 d;
-
- for (i = 0; i < 100; i++) {
- d = 0;
- KUNIT_EXPECT_EQ(test, fast_divpow2(i, d), div_u64(i, 1LLU << d));
- KUNIT_EXPECT_EQ(test, abs(fast_divpow2(-i, d)), div_u64(i, 1LLU << d));
- for (d = 1; d < 32; d++) {
- KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(i, d)),
- div_u64(i, 1 << d), "%lld %u", i, d);
- KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(-i, d)),
- div_u64(i, 1 << d), "%lld %u", -i, d);
- }
- }
-}
-
-static void mean_and_variance_u128_basic_test(struct kunit *test)
-{
- u128_u a = u64s_to_u128(0, U64_MAX);
- u128_u a1 = u64s_to_u128(0, 1);
- u128_u b = u64s_to_u128(1, 0);
- u128_u c = u64s_to_u128(0, 1LLU << 63);
- u128_u c2 = u64s_to_u128(U64_MAX, U64_MAX);
-
- KUNIT_EXPECT_EQ(test, u128_hi(u128_add(a, a1)), 1);
- KUNIT_EXPECT_EQ(test, u128_lo(u128_add(a, a1)), 0);
- KUNIT_EXPECT_EQ(test, u128_hi(u128_add(a1, a)), 1);
- KUNIT_EXPECT_EQ(test, u128_lo(u128_add(a1, a)), 0);
-
- KUNIT_EXPECT_EQ(test, u128_lo(u128_sub(b, a1)), U64_MAX);
- KUNIT_EXPECT_EQ(test, u128_hi(u128_sub(b, a1)), 0);
-
- KUNIT_EXPECT_EQ(test, u128_hi(u128_shl(c, 1)), 1);
- KUNIT_EXPECT_EQ(test, u128_lo(u128_shl(c, 1)), 0);
-
- KUNIT_EXPECT_EQ(test, u128_hi(u128_square(U64_MAX)), U64_MAX - 1);
- KUNIT_EXPECT_EQ(test, u128_lo(u128_square(U64_MAX)), 1);
-
- KUNIT_EXPECT_EQ(test, u128_lo(u128_div(b, 2)), 1LLU << 63);
-
- KUNIT_EXPECT_EQ(test, u128_hi(u128_div(c2, 2)), U64_MAX >> 1);
- KUNIT_EXPECT_EQ(test, u128_lo(u128_div(c2, 2)), U64_MAX);
-
- KUNIT_EXPECT_EQ(test, u128_hi(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U32_MAX >> 1);
- KUNIT_EXPECT_EQ(test, u128_lo(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U64_MAX << 31);
-}
-
-static struct kunit_case mean_and_variance_test_cases[] = {
- KUNIT_CASE(mean_and_variance_fast_divpow2),
- KUNIT_CASE(mean_and_variance_u128_basic_test),
- KUNIT_CASE(mean_and_variance_basic_test),
- KUNIT_CASE(mean_and_variance_weighted_test),
- KUNIT_CASE(mean_and_variance_weighted_advanced_test),
- KUNIT_CASE(mean_and_variance_test_1),
- KUNIT_CASE(mean_and_variance_test_2),
- {}
-};
-
-static struct kunit_suite mean_and_variance_test_suite = {
- .name = "mean and variance tests",
- .test_cases = mean_and_variance_test_cases
-};
-
-kunit_test_suite(mean_and_variance_test_suite);
-
-MODULE_AUTHOR("Daniel B. Hill");
-MODULE_DESCRIPTION("bcachefs filesystem mean and variance unit tests");
-MODULE_LICENSE("GPL");
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
deleted file mode 100644
index f296cce95338..000000000000
--- a/fs/bcachefs/migrate.c
+++ /dev/null
@@ -1,277 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Code for moving data off a device.
- */
-
-#include "bcachefs.h"
-#include "backpointers.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "ec.h"
-#include "errcode.h"
-#include "extents.h"
-#include "io_write.h"
-#include "journal.h"
-#include "keylist.h"
-#include "migrate.h"
-#include "move.h"
-#include "progress.h"
-#include "replicas.h"
-#include "super-io.h"
-
-static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
- unsigned dev_idx, unsigned flags, bool metadata)
-{
- unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
- unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST;
- unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED;
- unsigned nr_good;
-
- bch2_bkey_drop_device(k, dev_idx);
-
- nr_good = bch2_bkey_durability(c, k.s_c);
- if ((!nr_good && !(flags & lost)) ||
- (nr_good < replicas && !(flags & degraded)))
- return bch_err_throw(c, remove_would_lose_data);
-
- return 0;
-}
-
-static int drop_btree_ptrs(struct btree_trans *trans, struct btree_iter *iter,
- struct btree *b, unsigned dev_idx, unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct bkey_buf k;
-
- bch2_bkey_buf_init(&k);
- bch2_bkey_buf_copy(&k, c, &b->key);
-
- int ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, true) ?:
- bch2_btree_node_update_key(trans, iter, b, k.k, 0, false);
-
- bch_err_fn(c, ret);
- bch2_bkey_buf_exit(&k, c);
- return ret;
-}
-
-static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- unsigned dev_idx,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct bkey_i *n;
- int ret;
-
- if (!bch2_bkey_has_device_c(k, dev_idx))
- return 0;
-
- n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_internal_snapshot_node);
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- return ret;
-
- ret = drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, false);
- if (ret)
- return ret;
-
- /*
- * If the new extent no longer has any pointers, bch2_extent_normalize()
- * will do the appropriate thing with it (turning it into a
- * KEY_TYPE_error key, or just a discard if it was a cached extent)
- */
- bch2_extent_normalize(c, bkey_i_to_s(n));
-
- /*
- * Since we're not inserting through an extent iterator
- * (BTREE_ITER_all_snapshots iterators aren't extent iterators),
-	 * we aren't using the extent overwrite path to delete; we're
- * just using the normal key deletion path:
- */
- if (bkey_deleted(&n->k))
- n->k.size = 0;
- return 0;
-}
-
-static int bch2_dev_btree_drop_key(struct btree_trans *trans,
- struct bkey_s_c_backpointer bp,
- unsigned dev_idx,
- struct bkey_buf *last_flushed,
- unsigned flags)
-{
- struct btree_iter iter;
- struct btree *b = bch2_backpointer_get_node(trans, bp, &iter, last_flushed);
- int ret = PTR_ERR_OR_ZERO(b);
- if (ret)
- return ret == -BCH_ERR_backpointer_to_overwritten_btree_node ? 0 : ret;
-
- ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags);
-
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int bch2_dev_usrdata_drop(struct bch_fs *c,
- struct progress_indicator_state *progress,
- unsigned dev_idx, unsigned flags)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- enum btree_id id;
- int ret = 0;
-
- for (id = 0; id < BTREE_ID_NR; id++) {
- if (!btree_type_has_ptrs(id))
- continue;
-
- ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
- bch2_progress_update_iter(trans, progress, &iter, "dropping user data");
- bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags);
- }));
- if (ret)
- break;
- }
-
- bch2_trans_put(trans);
-
- return ret;
-}
-
-static int bch2_dev_metadata_drop(struct bch_fs *c,
- struct progress_indicator_state *progress,
- unsigned dev_idx, unsigned flags)
-{
- struct btree_trans *trans;
- struct btree_iter iter;
- struct closure cl;
- struct btree *b;
- struct bkey_buf k;
- unsigned id;
- int ret;
-
- /* don't handle this yet: */
- if (flags & BCH_FORCE_IF_METADATA_LOST)
- return bch_err_throw(c, remove_with_metadata_missing_unimplemented);
-
- trans = bch2_trans_get(c);
- bch2_bkey_buf_init(&k);
- closure_init_stack(&cl);
-
- for (id = 0; id < BTREE_ID_NR; id++) {
- bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
- BTREE_ITER_prefetch);
-retry:
- ret = 0;
- while (bch2_trans_begin(trans),
- (b = bch2_btree_iter_peek_node(trans, &iter)) &&
- !(ret = PTR_ERR_OR_ZERO(b))) {
- bch2_progress_update_iter(trans, progress, &iter, "dropping metadata");
-
- if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx))
- goto next;
-
- ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- ret = 0;
- continue;
- }
-
- if (ret)
- break;
-next:
- bch2_btree_iter_next_node(trans, &iter);
- }
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret)
- goto err;
- }
-
- bch2_btree_interior_updates_flush(c);
- ret = 0;
-err:
- bch2_bkey_buf_exit(&k, c);
- bch2_trans_put(trans);
-
- BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
-
- return ret;
-}
-
-static int data_drop_bp(struct btree_trans *trans, unsigned dev_idx,
- struct bkey_s_c_backpointer bp, struct bkey_buf *last_flushed,
- unsigned flags)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent,
- last_flushed);
- int ret = bkey_err(k);
- if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
- return 0;
- if (ret)
- return ret;
-
- if (!k.k || !bch2_bkey_has_device_c(k, dev_idx))
- goto out;
-
- /*
- * XXX: pass flags arg to invalidate_stripe_to_dev and handle it
- * properly
- */
-
- if (bkey_is_btree_ptr(k.k))
- ret = bch2_dev_btree_drop_key(trans, bp, dev_idx, last_flushed, flags);
- else if (k.k->type == KEY_TYPE_stripe)
- ret = bch2_invalidate_stripe_to_dev(trans, &iter, k, dev_idx, flags);
- else
- ret = bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags);
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, unsigned flags)
-{
- struct btree_trans *trans = bch2_trans_get(c);
-
- struct bkey_buf last_flushed;
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- int ret = bch2_btree_write_buffer_flush_sync(trans) ?:
- for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers,
- POS(dev_idx, 0),
- POS(dev_idx, U64_MAX), 0, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
- if (k.k->type != KEY_TYPE_backpointer)
- continue;
-
- data_drop_bp(trans, dev_idx, bkey_s_c_to_backpointer(k),
- &last_flushed, flags);
-
- }));
-
- bch2_bkey_buf_exit(&last_flushed, trans->c);
- bch2_trans_put(trans);
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, unsigned flags)
-{
- struct progress_indicator_state progress;
- bch2_progress_init(&progress, c,
- BIT_ULL(BTREE_ID_extents)|
- BIT_ULL(BTREE_ID_reflink));
-
- return bch2_dev_usrdata_drop(c, &progress, dev_idx, flags) ?:
- bch2_dev_metadata_drop(c, &progress, dev_idx, flags);
-}
diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h
deleted file mode 100644
index 30018140711b..000000000000
--- a/fs/bcachefs/migrate.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_MIGRATE_H
-#define _BCACHEFS_MIGRATE_H
-
-int bch2_dev_data_drop_by_backpointers(struct bch_fs *, unsigned, unsigned);
-int bch2_dev_data_drop(struct bch_fs *, unsigned, unsigned);
-
-#endif /* _BCACHEFS_MIGRATE_H */
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
deleted file mode 100644
index eec591e947bd..000000000000
--- a/fs/bcachefs/move.c
+++ /dev/null
@@ -1,1494 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "bkey_buf.h"
-#include "btree_gc.h"
-#include "btree_io.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_write_buffer.h"
-#include "compress.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "errcode.h"
-#include "error.h"
-#include "inode.h"
-#include "io_read.h"
-#include "io_write.h"
-#include "journal_reclaim.h"
-#include "keylist.h"
-#include "move.h"
-#include "rebalance.h"
-#include "reflink.h"
-#include "replicas.h"
-#include "snapshot.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/ioprio.h>
-#include <linux/kthread.h>
-
-const char * const bch2_data_ops_strs[] = {
-#define x(t, n, ...) [n] = #t,
- BCH_DATA_OPS()
-#undef x
- NULL
-};
-
-struct evacuate_bucket_arg {
- struct bpos bucket;
- int gen;
- struct data_update_opts data_opts;
-};
-
-static bool evacuate_bucket_pred(struct bch_fs *, void *,
- enum btree_id, struct bkey_s_c,
- struct bch_io_opts *,
- struct data_update_opts *);
-
-static noinline void
-trace_io_move2(struct bch_fs *c, struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, k);
- prt_newline(&buf);
- bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
- trace_io_move(c, buf.buf);
- printbuf_exit(&buf);
-}
-
-static noinline void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k)
-{
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, k);
- trace_io_move_read(c, buf.buf);
- printbuf_exit(&buf);
-}
-
-static noinline void
-trace_io_move_pred2(struct bch_fs *c, struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts,
- move_pred_fn pred, void *_arg, bool p)
-{
- struct printbuf buf = PRINTBUF;
-
- prt_printf(&buf, "%ps: %u", pred, p);
-
- if (pred == evacuate_bucket_pred) {
- struct evacuate_bucket_arg *arg = _arg;
- prt_printf(&buf, " gen=%u", arg->gen);
- }
-
- prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, c, k);
- prt_newline(&buf);
- bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
- trace_io_move_pred(c, buf.buf);
- printbuf_exit(&buf);
-}
-
-static noinline void
-trace_io_move_evacuate_bucket2(struct bch_fs *c, struct bpos bucket, int gen)
-{
- struct printbuf buf = PRINTBUF;
-
- prt_printf(&buf, "bucket: ");
- bch2_bpos_to_text(&buf, bucket);
- prt_printf(&buf, " gen: %i\n", gen);
-
- trace_io_move_evacuate_bucket(c, buf.buf);
- printbuf_exit(&buf);
-}
-
-struct moving_io {
- struct list_head read_list;
- struct list_head io_list;
- struct move_bucket *b;
- struct closure cl;
- bool read_completed;
-
- unsigned read_sectors;
- unsigned write_sectors;
-
- struct data_update write;
-};
-
-static void move_free(struct moving_io *io)
-{
- struct moving_context *ctxt = io->write.ctxt;
-
- if (io->b)
- atomic_dec(&io->b->count);
-
- mutex_lock(&ctxt->lock);
- list_del(&io->io_list);
- wake_up(&ctxt->wait);
- mutex_unlock(&ctxt->lock);
-
- if (!io->write.data_opts.scrub) {
- bch2_data_update_exit(&io->write);
- } else {
- bch2_bio_free_pages_pool(io->write.op.c, &io->write.op.wbio.bio);
- kfree(io->write.bvecs);
- }
- kfree(io);
-}
-
-static void move_write_done(struct bch_write_op *op)
-{
- struct moving_io *io = container_of(op, struct moving_io, write.op);
- struct bch_fs *c = op->c;
- struct moving_context *ctxt = io->write.ctxt;
-
- if (op->error) {
- if (trace_io_move_write_fail_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_write_op_to_text(&buf, op);
- trace_io_move_write_fail(c, buf.buf);
- printbuf_exit(&buf);
- }
- this_cpu_inc(c->counters[BCH_COUNTER_io_move_write_fail]);
-
- ctxt->write_error = true;
- }
-
- atomic_sub(io->write_sectors, &ctxt->write_sectors);
- atomic_dec(&ctxt->write_ios);
- move_free(io);
- closure_put(&ctxt->cl);
-}
-
-static void move_write(struct moving_io *io)
-{
- struct bch_fs *c = io->write.op.c;
- struct moving_context *ctxt = io->write.ctxt;
- struct bch_read_bio *rbio = &io->write.rbio;
-
- if (ctxt->stats) {
- if (rbio->bio.bi_status)
- atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
- &ctxt->stats->sectors_error_uncorrected);
- else if (rbio->saw_error)
- atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
- &ctxt->stats->sectors_error_corrected);
- }
-
- /*
- * If the extent has been bitrotted, we're going to have to give it a
- * new checksum in order to move it - but the poison bit will ensure
- * that userspace still gets the appropriate error.
- */
- if (unlikely(rbio->ret == -BCH_ERR_data_read_csum_err &&
- (bch2_bkey_extent_flags(bkey_i_to_s_c(io->write.k.k)) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)))) {
- struct bch_extent_crc_unpacked crc = rbio->pick.crc;
- struct nonce nonce = extent_nonce(rbio->version, crc);
-
- rbio->pick.crc.csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type,
- nonce, &rbio->bio);
- rbio->ret = 0;
- }
-
- if (unlikely(rbio->ret || io->write.data_opts.scrub)) {
- move_free(io);
- return;
- }
-
- if (trace_io_move_write_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
- trace_io_move_write(c, buf.buf);
- printbuf_exit(&buf);
- }
-
- closure_get(&io->write.ctxt->cl);
- atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
- atomic_inc(&io->write.ctxt->write_ios);
-
- bch2_data_update_read_done(&io->write);
-}
-
-struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
-{
- struct moving_io *io =
- list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);
-
- return io && io->read_completed ? io : NULL;
-}
-
-static void move_read_endio(struct bio *bio)
-{
- struct moving_io *io = container_of(bio, struct moving_io, write.rbio.bio);
- struct moving_context *ctxt = io->write.ctxt;
-
- atomic_sub(io->read_sectors, &ctxt->read_sectors);
- atomic_dec(&ctxt->read_ios);
- io->read_completed = true;
-
- wake_up(&ctxt->wait);
- closure_put(&ctxt->cl);
-}
-
-void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
-{
- struct moving_io *io;
-
- while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
- bch2_trans_unlock_long(ctxt->trans);
- list_del(&io->read_list);
- move_write(io);
- }
-}
-
-void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
-{
- unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
-
- move_ctxt_wait_event(ctxt,
- !atomic_read(&ctxt->write_sectors) ||
- atomic_read(&ctxt->write_sectors) != sectors_pending);
-}
-
-void bch2_moving_ctxt_flush_all(struct moving_context *ctxt)
-{
- move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
- bch2_trans_unlock_long(ctxt->trans);
- closure_sync(&ctxt->cl);
-}
-
-void bch2_moving_ctxt_exit(struct moving_context *ctxt)
-{
- struct bch_fs *c = ctxt->trans->c;
-
- bch2_moving_ctxt_flush_all(ctxt);
-
- EBUG_ON(atomic_read(&ctxt->write_sectors));
- EBUG_ON(atomic_read(&ctxt->write_ios));
- EBUG_ON(atomic_read(&ctxt->read_sectors));
- EBUG_ON(atomic_read(&ctxt->read_ios));
-
- mutex_lock(&c->moving_context_lock);
- list_del(&ctxt->list);
- mutex_unlock(&c->moving_context_lock);
-
- /*
- * Generally, releasing a transaction within a transaction restart means
- * an unhandled transaction restart: but this can happen legitimately
- * within the move code, e.g. when bch2_move_ratelimit() tells us to
- * exit before we've retried
- */
- bch2_trans_begin(ctxt->trans);
- bch2_trans_put(ctxt->trans);
- memset(ctxt, 0, sizeof(*ctxt));
-}
-
-void bch2_moving_ctxt_init(struct moving_context *ctxt,
- struct bch_fs *c,
- struct bch_ratelimit *rate,
- struct bch_move_stats *stats,
- struct write_point_specifier wp,
- bool wait_on_copygc)
-{
- memset(ctxt, 0, sizeof(*ctxt));
-
- ctxt->trans = bch2_trans_get(c);
- ctxt->fn = (void *) _RET_IP_;
- ctxt->rate = rate;
- ctxt->stats = stats;
- ctxt->wp = wp;
- ctxt->wait_on_copygc = wait_on_copygc;
-
- closure_init_stack(&ctxt->cl);
-
- mutex_init(&ctxt->lock);
- INIT_LIST_HEAD(&ctxt->reads);
- INIT_LIST_HEAD(&ctxt->ios);
- init_waitqueue_head(&ctxt->wait);
-
- mutex_lock(&c->moving_context_lock);
- list_add(&ctxt->list, &c->moving_context_list);
- mutex_unlock(&c->moving_context_lock);
-}
-
-void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
-{
- trace_move_data(c, stats);
-}
-
-void bch2_move_stats_init(struct bch_move_stats *stats, const char *name)
-{
- memset(stats, 0, sizeof(*stats));
- stats->data_type = BCH_DATA_user;
- scnprintf(stats->name, sizeof(stats->name), "%s", name);
-}
-
-int bch2_move_extent(struct moving_context *ctxt,
- struct move_bucket *bucket_in_flight,
- struct btree_iter *iter,
- struct bkey_s_c k,
- struct bch_io_opts io_opts,
- struct data_update_opts data_opts)
-{
- struct btree_trans *trans = ctxt->trans;
- struct bch_fs *c = trans->c;
- int ret = -ENOMEM;
-
- if (trace_io_move_enabled())
- trace_io_move2(c, k, &io_opts, &data_opts);
- this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
-
- if (ctxt->stats)
- ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
-
- bch2_data_update_opts_normalize(k, &data_opts);
-
- if (!data_opts.rewrite_ptrs &&
- !data_opts.extra_replicas &&
- !data_opts.scrub) {
- if (data_opts.kill_ptrs)
- return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts);
- return 0;
- }
-
- struct moving_io *io = allocate_dropping_locks(trans, ret,
- kzalloc(sizeof(struct moving_io), _gfp));
- if (!io)
- goto err;
-
- if (ret)
- goto err_free;
-
- INIT_LIST_HEAD(&io->io_list);
- io->write.ctxt = ctxt;
- io->read_sectors = k.k->size;
- io->write_sectors = k.k->size;
-
- if (!data_opts.scrub) {
- ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
- &io_opts, data_opts, iter->btree_id, k);
- if (ret)
- goto err_free;
-
- io->write.op.end_io = move_write_done;
- } else {
- bch2_bkey_buf_init(&io->write.k);
- bch2_bkey_buf_reassemble(&io->write.k, c, k);
-
- io->write.op.c = c;
- io->write.data_opts = data_opts;
-
- bch2_trans_unlock(trans);
-
- ret = bch2_data_update_bios_init(&io->write, c, &io_opts);
- if (ret)
- goto err_free;
- }
-
- io->write.rbio.bio.bi_end_io = move_read_endio;
- io->write.rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
-
- if (ctxt->rate)
- bch2_ratelimit_increment(ctxt->rate, k.k->size);
-
- if (ctxt->stats) {
- atomic64_inc(&ctxt->stats->keys_moved);
- atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
- }
-
- if (bucket_in_flight) {
- io->b = bucket_in_flight;
- atomic_inc(&io->b->count);
- }
-
- if (trace_io_move_read_enabled())
- trace_io_move_read2(c, k);
-
- mutex_lock(&ctxt->lock);
- atomic_add(io->read_sectors, &ctxt->read_sectors);
- atomic_inc(&ctxt->read_ios);
-
- list_add_tail(&io->read_list, &ctxt->reads);
- list_add_tail(&io->io_list, &ctxt->ios);
- mutex_unlock(&ctxt->lock);
-
- /*
- * dropped by move_read_endio() - guards against use after free of
- * ctxt when doing wakeup
- */
- closure_get(&ctxt->cl);
- __bch2_read_extent(trans, &io->write.rbio,
- io->write.rbio.bio.bi_iter,
- bkey_start_pos(k.k),
- iter->btree_id, k, 0,
- NULL,
- BCH_READ_last_fragment,
- data_opts.scrub ? data_opts.read_dev : -1);
- return 0;
-err_free:
- kfree(io);
-err:
- if (bch2_err_matches(ret, EROFS) ||
- bch2_err_matches(ret, BCH_ERR_transaction_restart))
- return ret;
-
- count_event(c, io_move_start_fail);
-
- if (trace_io_move_start_fail_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, ": ");
- prt_str(&buf, bch2_err_str(ret));
- trace_io_move_start_fail(c, buf.buf);
- printbuf_exit(&buf);
- }
-
- if (bch2_err_matches(ret, BCH_ERR_data_update_done))
- return 0;
- return ret;
-}
-
-struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
- struct per_snapshot_io_opts *io_opts,
- struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */
- struct btree_iter *extent_iter,
- struct bkey_s_c extent_k)
-{
- struct bch_fs *c = trans->c;
- u32 restart_count = trans->restart_count;
- struct bch_io_opts *opts_ret = &io_opts->fs_io_opts;
- int ret = 0;
-
- if (extent_iter->min_depth)
- return opts_ret;
-
- if (extent_k.k->type == KEY_TYPE_reflink_v)
- goto out;
-
- if (io_opts->cur_inum != extent_pos.inode) {
- io_opts->d.nr = 0;
-
- ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_pos.inode),
- BTREE_ITER_all_snapshots, k, ({
- if (k.k->p.offset != extent_pos.inode)
- break;
-
- if (!bkey_is_inode(k.k))
- continue;
-
- struct bch_inode_unpacked inode;
- _ret3 = bch2_inode_unpack(k, &inode);
- if (_ret3)
- break;
-
- struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
- bch2_inode_opts_get(&e.io_opts, trans->c, &inode);
-
- darray_push(&io_opts->d, e);
- }));
- io_opts->cur_inum = extent_pos.inode;
- }
-
- ret = ret ?: trans_was_restarted(trans, restart_count);
- if (ret)
- return ERR_PTR(ret);
-
- if (extent_k.k->p.snapshot)
- darray_for_each(io_opts->d, i)
- if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) {
- opts_ret = &i->io_opts;
- break;
- }
-out:
- ret = bch2_get_update_rebalance_opts(trans, opts_ret, extent_iter, extent_k);
- if (ret)
- return ERR_PTR(ret);
- return opts_ret;
-}
-
-int bch2_move_get_io_opts_one(struct btree_trans *trans,
- struct bch_io_opts *io_opts,
- struct btree_iter *extent_iter,
- struct bkey_s_c extent_k)
-{
- struct bch_fs *c = trans->c;
-
- *io_opts = bch2_opts_to_inode_opts(c->opts);
-
- /* reflink btree? */
- if (!extent_k.k->p.inode)
- goto out;
-
- struct btree_iter inode_iter;
- struct bkey_s_c inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
- SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
- BTREE_ITER_cached);
- int ret = bkey_err(inode_k);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- return ret;
-
- if (!ret && bkey_is_inode(inode_k.k)) {
- struct bch_inode_unpacked inode;
- bch2_inode_unpack(inode_k, &inode);
- bch2_inode_opts_get(io_opts, c, &inode);
- }
- bch2_trans_iter_exit(trans, &inode_iter);
- /* seem to be spinning here? */
-out:
- return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k);
-}
-
-int bch2_move_ratelimit(struct moving_context *ctxt)
-{
- struct bch_fs *c = ctxt->trans->c;
- bool is_kthread = current->flags & PF_KTHREAD;
- u64 delay;
-
- if (ctxt->wait_on_copygc && c->copygc_running) {
- bch2_moving_ctxt_flush_all(ctxt);
- wait_event_killable(c->copygc_running_wq,
- !c->copygc_running ||
- (is_kthread && kthread_should_stop()));
- }
-
- do {
- delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;
-
- if (is_kthread && kthread_should_stop())
- return 1;
-
- if (delay)
- move_ctxt_wait_event_timeout(ctxt,
- freezing(current) ||
- (is_kthread && kthread_should_stop()),
- delay);
-
- if (unlikely(freezing(current))) {
- bch2_moving_ctxt_flush_all(ctxt);
- try_to_freeze();
- }
- } while (delay);
-
- /*
- * XXX: these limits really ought to be per device, SSDs and hard drives
- * will want different limits
- */
- move_ctxt_wait_event(ctxt,
- atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
- atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
- atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
- atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);
-
- return 0;
-}
-
-/*
- * Move requires non extents iterators, and there's also no need for it to
- * signal indirect_extent_missing_error:
- */
-static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c_reflink_p p)
-{
- if (unlikely(REFLINK_P_ERROR(p.v)))
- return bkey_s_c_null;
-
- struct bpos reflink_pos = POS(0, REFLINK_P_IDX(p.v));
-
- bch2_trans_iter_init(trans, iter,
- BTREE_ID_reflink, reflink_pos,
- BTREE_ITER_not_extents);
-
- struct bkey_s_c k = bch2_btree_iter_peek(trans, iter);
- if (!k.k || bkey_err(k)) {
- bch2_trans_iter_exit(trans, iter);
- return k;
- }
-
- if (bkey_lt(reflink_pos, bkey_start_pos(k.k))) {
- bch2_trans_iter_exit(trans, iter);
- return bkey_s_c_null;
- }
-
- return k;
-}
-
-int bch2_move_data_btree(struct moving_context *ctxt,
- struct bpos start,
- struct bpos end,
- move_pred_fn pred, void *arg,
- enum btree_id btree_id, unsigned level)
-{
- struct btree_trans *trans = ctxt->trans;
- struct bch_fs *c = trans->c;
- struct per_snapshot_io_opts snapshot_io_opts;
- struct bch_io_opts *io_opts;
- struct bkey_buf sk;
- struct btree_iter iter, reflink_iter = {};
- struct bkey_s_c k;
- struct data_update_opts data_opts;
- /*
- * If we're moving a single file, also process reflinked data it points
- * to (this includes propagating changed io_opts from the inode to the
- * extent):
- */
- bool walk_indirect = start.inode == end.inode;
- int ret = 0, ret2;
-
- per_snapshot_io_opts_init(&snapshot_io_opts, c);
- bch2_bkey_buf_init(&sk);
-
- if (ctxt->stats) {
- ctxt->stats->data_type = BCH_DATA_user;
- ctxt->stats->pos = BBPOS(btree_id, start);
- }
-
-retry_root:
- bch2_trans_begin(trans);
-
- if (level == bch2_btree_id_root(c, btree_id)->level + 1) {
- bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level - 1,
- BTREE_ITER_prefetch|
- BTREE_ITER_not_extents|
- BTREE_ITER_all_snapshots);
- struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
- ret = PTR_ERR_OR_ZERO(b);
- if (ret)
- goto root_err;
-
- if (b != btree_node_root(c, b)) {
- bch2_trans_iter_exit(trans, &iter);
- goto retry_root;
- }
-
- k = bkey_i_to_s_c(&b->key);
-
- io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts,
- iter.pos, &iter, k);
- ret = PTR_ERR_OR_ZERO(io_opts);
- if (ret)
- goto root_err;
-
- memset(&data_opts, 0, sizeof(data_opts));
- if (!pred(c, arg, iter.btree_id, k, io_opts, &data_opts))
- goto out;
-
-
- if (!data_opts.scrub)
- ret = bch2_btree_node_rewrite_pos(trans, btree_id, level,
- k.k->p, data_opts.target, 0);
- else
- ret = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev);
-
-root_err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- bch2_trans_iter_exit(trans, &iter);
- goto retry_root;
- }
-
- goto out;
- }
-
- bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level,
- BTREE_ITER_prefetch|
- BTREE_ITER_not_extents|
- BTREE_ITER_all_snapshots);
-
- if (ctxt->rate)
- bch2_ratelimit_reset(ctxt->rate);
-
- while (!bch2_move_ratelimit(ctxt)) {
- struct btree_iter *extent_iter = &iter;
-
- bch2_trans_begin(trans);
-
- k = bch2_btree_iter_peek(trans, &iter);
- if (!k.k)
- break;
-
- ret = bkey_err(k);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
-
- if (bkey_gt(bkey_start_pos(k.k), end))
- break;
-
- if (ctxt->stats)
- ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
-
- if (walk_indirect &&
- k.k->type == KEY_TYPE_reflink_p &&
- REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) {
- struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
-
- bch2_trans_iter_exit(trans, &reflink_iter);
- k = bch2_lookup_indirect_extent_for_move(trans, &reflink_iter, p);
- ret = bkey_err(k);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
-
- if (!k.k)
- goto next_nondata;
-
- /*
- * XXX: reflink pointers may point to multiple indirect
- * extents, so don't advance past the entire reflink
- * pointer - need to fixup iter->k
- */
- extent_iter = &reflink_iter;
- }
-
- if (!bkey_extent_is_direct_data(k.k))
- goto next_nondata;
-
- io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts,
- iter.pos, extent_iter, k);
- ret = PTR_ERR_OR_ZERO(io_opts);
- if (ret)
- continue;
-
- memset(&data_opts, 0, sizeof(data_opts));
- if (!pred(c, arg, extent_iter->btree_id, k, io_opts, &data_opts))
- goto next;
-
- /*
- * The iterator gets unlocked by __bch2_read_extent - need to
- * save a copy of @k elsewhere:
- */
- bch2_bkey_buf_reassemble(&sk, c, k);
- k = bkey_i_to_s_c(sk.k);
-
- if (!level)
- ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts);
- else if (!data_opts.scrub)
- ret2 = bch2_btree_node_rewrite_pos(trans, btree_id, level,
- k.k->p, data_opts.target, 0);
- else
- ret2 = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev);
-
- if (ret2) {
- if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
- continue;
-
- if (bch2_err_matches(ret2, ENOMEM)) {
- /* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(ctxt);
- continue;
- }
-
- /* XXX signal failure */
- goto next;
- }
-next:
- if (ctxt->stats)
- atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
-next_nondata:
- if (!bch2_btree_iter_advance(trans, &iter))
- break;
- }
-out:
- bch2_trans_iter_exit(trans, &reflink_iter);
- bch2_trans_iter_exit(trans, &iter);
- bch2_bkey_buf_exit(&sk, c);
- per_snapshot_io_opts_exit(&snapshot_io_opts);
-
- return ret;
-}
-
-int __bch2_move_data(struct moving_context *ctxt,
- struct bbpos start,
- struct bbpos end,
- move_pred_fn pred, void *arg)
-{
- struct bch_fs *c = ctxt->trans->c;
- enum btree_id id;
- int ret = 0;
-
- for (id = start.btree;
- id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
- id++) {
- ctxt->stats->pos = BBPOS(id, POS_MIN);
-
- if (!btree_type_has_ptrs(id) ||
- !bch2_btree_id_root(c, id)->b)
- continue;
-
- ret = bch2_move_data_btree(ctxt,
- id == start.btree ? start.pos : POS_MIN,
- id == end.btree ? end.pos : POS_MAX,
- pred, arg, id, 0);
- if (ret)
- break;
- }
-
- return ret;
-}
-
-int bch2_move_data(struct bch_fs *c,
- struct bbpos start,
- struct bbpos end,
- struct bch_ratelimit *rate,
- struct bch_move_stats *stats,
- struct write_point_specifier wp,
- bool wait_on_copygc,
- move_pred_fn pred, void *arg)
-{
- struct moving_context ctxt;
-
- bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
- int ret = __bch2_move_data(&ctxt, start, end, pred, arg);
- bch2_moving_ctxt_exit(&ctxt);
-
- return ret;
-}
-
-static int __bch2_move_data_phys(struct moving_context *ctxt,
- struct move_bucket *bucket_in_flight,
- unsigned dev,
- u64 bucket_start,
- u64 bucket_end,
- unsigned data_types,
- bool copygc,
- move_pred_fn pred, void *arg)
-{
- struct btree_trans *trans = ctxt->trans;
- struct bch_fs *c = trans->c;
- bool is_kthread = current->flags & PF_KTHREAD;
- struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
- struct btree_iter iter = {}, bp_iter = {};
- struct bkey_buf sk;
- struct bkey_s_c k;
- struct bkey_buf last_flushed;
- u64 check_mismatch_done = bucket_start;
- int ret = 0;
-
- struct bch_dev *ca = bch2_dev_tryget(c, dev);
- if (!ca)
- return 0;
-
- bucket_end = min(bucket_end, ca->mi.nbuckets);
-
- struct bpos bp_start = bucket_pos_to_bp_start(ca, POS(dev, bucket_start));
- struct bpos bp_end = bucket_pos_to_bp_end(ca, POS(dev, bucket_end));
-
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
- bch2_bkey_buf_init(&sk);
-
- /*
- * We're not run in a context that handles transaction restarts:
- */
- bch2_trans_begin(trans);
-
- bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_start, 0);
-
- ret = bch2_btree_write_buffer_tryflush(trans);
- if (!bch2_err_matches(ret, EROFS))
- bch_err_msg(c, ret, "flushing btree write buffer");
- if (ret)
- goto err;
-
- while (!(ret = bch2_move_ratelimit(ctxt))) {
- if (is_kthread && kthread_should_stop())
- break;
-
- bch2_trans_begin(trans);
-
- k = bch2_btree_iter_peek(trans, &bp_iter);
- ret = bkey_err(k);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- goto err;
-
- if (!k.k || bkey_gt(k.k->p, bp_end))
- break;
-
- if (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) {
- while (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) {
- bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++,
- copygc, &last_flushed);
- }
- continue;
- }
-
- if (k.k->type != KEY_TYPE_backpointer)
- goto next;
-
- struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
-
- if (ctxt->stats)
- ctxt->stats->offset = bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
-
- if (!(data_types & BIT(bp.v->data_type)))
- goto next;
-
- if (!bp.v->level && bp.v->btree_id == BTREE_ID_stripes)
- goto next;
-
- k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed);
- ret = bkey_err(k);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- goto err;
- if (!k.k)
- goto next;
-
- if (!bp.v->level) {
- ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k);
- if (ret) {
- bch2_trans_iter_exit(trans, &iter);
- continue;
- }
- }
-
- struct data_update_opts data_opts = {};
- bool p = pred(c, arg, bp.v->btree_id, k, &io_opts, &data_opts);
-
- if (trace_io_move_pred_enabled())
- trace_io_move_pred2(c, k, &io_opts, &data_opts,
- pred, arg, p);
-
- if (!p) {
- bch2_trans_iter_exit(trans, &iter);
- goto next;
- }
-
- if (data_opts.scrub &&
- !bch2_dev_idx_is_online(c, data_opts.read_dev)) {
- bch2_trans_iter_exit(trans, &iter);
- ret = bch_err_throw(c, device_offline);
- break;
- }
-
- bch2_bkey_buf_reassemble(&sk, c, k);
- k = bkey_i_to_s_c(sk.k);
-
- /* move_extent will drop locks */
- unsigned sectors = bp.v->bucket_len;
-
- if (!bp.v->level)
- ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts);
- else if (!data_opts.scrub)
- ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level,
- k.k->p, data_opts.target, 0);
- else
- ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev);
-
- bch2_trans_iter_exit(trans, &iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret == -ENOMEM) {
- /* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(ctxt);
- continue;
- }
- if (ret)
- goto err;
-
- if (ctxt->stats)
- atomic64_add(sectors, &ctxt->stats->sectors_seen);
-next:
- bch2_btree_iter_advance(trans, &bp_iter);
- }
-
- while (check_mismatch_done < bucket_end)
- bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++,
- copygc, &last_flushed);
-err:
- bch2_trans_iter_exit(trans, &bp_iter);
- bch2_bkey_buf_exit(&sk, c);
- bch2_bkey_buf_exit(&last_flushed, c);
- bch2_dev_put(ca);
- return ret;
-}
-
-int bch2_move_data_phys(struct bch_fs *c,
- unsigned dev,
- u64 start,
- u64 end,
- unsigned data_types,
- struct bch_ratelimit *rate,
- struct bch_move_stats *stats,
- struct write_point_specifier wp,
- bool wait_on_copygc,
- move_pred_fn pred, void *arg)
-{
- struct moving_context ctxt;
-
- bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans));
-
- bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
- if (ctxt.stats) {
- ctxt.stats->phys = true;
- ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys;
- }
-
- int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end,
- data_types, false, pred, arg);
- bch2_moving_ctxt_exit(&ctxt);
-
- return ret;
-}
-
-static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg,
- enum btree_id btree, struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- struct evacuate_bucket_arg *arg = _arg;
-
- *data_opts = arg->data_opts;
-
- unsigned i = 0;
- bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
- if (ptr->dev == arg->bucket.inode &&
- (arg->gen < 0 || arg->gen == ptr->gen) &&
- !ptr->cached)
- data_opts->rewrite_ptrs |= BIT(i);
- i++;
- }
-
- return data_opts->rewrite_ptrs != 0;
-}
-
-int bch2_evacuate_bucket(struct moving_context *ctxt,
- struct move_bucket *bucket_in_flight,
- struct bpos bucket, int gen,
- struct data_update_opts data_opts)
-{
- struct bch_fs *c = ctxt->trans->c;
- struct evacuate_bucket_arg arg = { bucket, gen, data_opts, };
-
- count_event(c, io_move_evacuate_bucket);
- if (trace_io_move_evacuate_bucket_enabled())
- trace_io_move_evacuate_bucket2(c, bucket, gen);
-
- return __bch2_move_data_phys(ctxt, bucket_in_flight,
- bucket.inode,
- bucket.offset,
- bucket.offset + 1,
- ~0,
- true,
- evacuate_bucket_pred, &arg);
-}
-
-typedef bool (*move_btree_pred)(struct bch_fs *, void *,
- struct btree *, struct bch_io_opts *,
- struct data_update_opts *);
-
-static int bch2_move_btree(struct bch_fs *c,
- struct bbpos start,
- struct bbpos end,
- move_btree_pred pred, void *arg,
- struct bch_move_stats *stats)
-{
- bool kthread = (current->flags & PF_KTHREAD) != 0;
- struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
- struct moving_context ctxt;
- struct btree_trans *trans;
- struct btree_iter iter;
- struct btree *b;
- enum btree_id btree;
- struct data_update_opts data_opts;
- int ret = 0;
-
- bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
- writepoint_ptr(&c->btree_write_point),
- true);
- trans = ctxt.trans;
-
- stats->data_type = BCH_DATA_btree;
-
- for (btree = start.btree;
- btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
- btree ++) {
- stats->pos = BBPOS(btree, POS_MIN);
-
- if (!bch2_btree_id_root(c, btree)->b)
- continue;
-
- bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0,
- BTREE_ITER_prefetch);
-retry:
- ret = 0;
- while (bch2_trans_begin(trans),
- (b = bch2_btree_iter_peek_node(trans, &iter)) &&
- !(ret = PTR_ERR_OR_ZERO(b))) {
- if (kthread && kthread_should_stop())
- break;
-
- if ((cmp_int(btree, end.btree) ?:
- bpos_cmp(b->key.k.p, end.pos)) > 0)
- break;
-
- stats->pos = BBPOS(iter.btree_id, iter.pos);
-
- if (!pred(c, arg, b, &io_opts, &data_opts))
- goto next;
-
- ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0) ?: ret;
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
-next:
- bch2_btree_iter_next_node(trans, &iter);
- }
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_iter_exit(trans, &iter);
-
- if (kthread && kthread_should_stop())
- break;
- }
-
- bch_err_fn(c, ret);
- bch2_moving_ctxt_exit(&ctxt);
- bch2_btree_interior_updates_flush(c);
-
- return ret;
-}
-
-static bool rereplicate_pred(struct bch_fs *c, void *arg,
- enum btree_id btree, struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- unsigned nr_good = bch2_bkey_durability(c, k);
- unsigned replicas = bkey_is_btree_ptr(k.k)
- ? c->opts.metadata_replicas
- : io_opts->data_replicas;
-
- guard(rcu)();
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- unsigned i = 0;
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- if (!ptr->cached &&
- (!ca || !ca->mi.durability))
- data_opts->kill_ptrs |= BIT(i);
- i++;
- }
-
- if (!data_opts->kill_ptrs &&
- (!nr_good || nr_good >= replicas))
- return false;
-
- data_opts->target = 0;
- data_opts->extra_replicas = replicas - nr_good;
- data_opts->btree_insert_flags = 0;
- return true;
-}
-
-static bool migrate_pred(struct bch_fs *c, void *arg,
- enum btree_id btree, struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct bch_ioctl_data *op = arg;
- unsigned i = 0;
-
- data_opts->rewrite_ptrs = 0;
- data_opts->target = 0;
- data_opts->extra_replicas = 0;
- data_opts->btree_insert_flags = 0;
-
- bkey_for_each_ptr(ptrs, ptr) {
- if (ptr->dev == op->migrate.dev)
- data_opts->rewrite_ptrs |= 1U << i;
- i++;
- }
-
- return data_opts->rewrite_ptrs != 0;
-}
-
-static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
- struct btree *b,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- return rereplicate_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), io_opts, data_opts);
-}
-
-/*
- * Ancient versions of bcachefs produced packed formats which could represent
- * keys that the in memory format cannot represent; this checks for those
- * formats so we can get rid of them.
- */
-static bool bformat_needs_redo(struct bkey_format *f)
-{
- for (unsigned i = 0; i < f->nr_fields; i++)
- if (bch2_bkey_format_field_overflows(f, i))
- return true;
-
- return false;
-}
-
-static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
- struct btree *b,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- if (b->version_ondisk != c->sb.version ||
- btree_node_need_rewrite(b) ||
- bformat_needs_redo(&b->format)) {
- data_opts->target = 0;
- data_opts->extra_replicas = 0;
- data_opts->btree_insert_flags = 0;
- return true;
- }
-
- return false;
-}
-
-int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
-{
- int ret;
-
- ret = bch2_move_btree(c,
- BBPOS_MIN,
- BBPOS_MAX,
- rewrite_old_nodes_pred, c, stats);
- if (!ret) {
- mutex_lock(&c->sb_lock);
- c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
- c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
- c->disk_sb.sb->version_min = c->disk_sb.sb->version;
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- }
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
- enum btree_id btree, struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- unsigned durability = bch2_bkey_durability(c, k);
- unsigned replicas = bkey_is_btree_ptr(k.k)
- ? c->opts.metadata_replicas
- : io_opts->data_replicas;
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned i = 0;
-
- guard(rcu)();
- bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
- unsigned d = bch2_extent_ptr_durability(c, &p);
-
- if (d && durability - d >= replicas) {
- data_opts->kill_ptrs |= BIT(i);
- durability -= d;
- }
-
- i++;
- }
-
- return data_opts->kill_ptrs != 0;
-}
-
-static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
- struct btree *b,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- return drop_extra_replicas_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key),
- io_opts, data_opts);
-}
-
-static bool scrub_pred(struct bch_fs *c, void *_arg,
- enum btree_id btree, struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- struct bch_ioctl_data *arg = _arg;
-
- if (k.k->type != KEY_TYPE_btree_ptr_v2) {
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (p.ptr.dev == arg->migrate.dev) {
- if (!p.crc.csum_type)
- return false;
- break;
- }
- }
-
- data_opts->scrub = true;
- data_opts->read_dev = arg->migrate.dev;
- return true;
-}
-
-int bch2_data_job(struct bch_fs *c,
- struct bch_move_stats *stats,
- struct bch_ioctl_data op)
-{
- struct bbpos start = BBPOS(op.start_btree, op.start_pos);
- struct bbpos end = BBPOS(op.end_btree, op.end_pos);
- int ret = 0;
-
- if (op.op >= BCH_DATA_OP_NR)
- return -EINVAL;
-
- bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);
-
- switch (op.op) {
- case BCH_DATA_OP_scrub:
- /*
- * prevent tests from spuriously failing, make sure we see all
- * btree nodes that need to be repaired
- */
- bch2_btree_interior_updates_flush(c);
-
- ret = bch2_move_data_phys(c, op.scrub.dev, 0, U64_MAX,
- op.scrub.data_types,
- NULL,
- stats,
- writepoint_hashed((unsigned long) current),
- false,
- scrub_pred, &op) ?: ret;
- break;
-
- case BCH_DATA_OP_rereplicate:
- stats->data_type = BCH_DATA_journal;
- ret = bch2_journal_flush_device_pins(&c->journal, -1);
- ret = bch2_move_btree(c, start, end,
- rereplicate_btree_pred, c, stats) ?: ret;
- ret = bch2_move_data(c, start, end,
- NULL,
- stats,
- writepoint_hashed((unsigned long) current),
- true,
- rereplicate_pred, c) ?: ret;
- ret = bch2_replicas_gc2(c) ?: ret;
- break;
- case BCH_DATA_OP_migrate:
- if (op.migrate.dev >= c->sb.nr_devices)
- return -EINVAL;
-
- stats->data_type = BCH_DATA_journal;
- ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
- ret = bch2_move_data_phys(c, op.migrate.dev, 0, U64_MAX,
- ~0,
- NULL,
- stats,
- writepoint_hashed((unsigned long) current),
- true,
- migrate_pred, &op) ?: ret;
- bch2_btree_interior_updates_flush(c);
- ret = bch2_replicas_gc2(c) ?: ret;
- break;
- case BCH_DATA_OP_rewrite_old_nodes:
- ret = bch2_scan_old_btree_nodes(c, stats);
- break;
- case BCH_DATA_OP_drop_extra_replicas:
- ret = bch2_move_btree(c, start, end,
- drop_extra_replicas_btree_pred, c, stats) ?: ret;
- ret = bch2_move_data(c, start, end, NULL, stats,
- writepoint_hashed((unsigned long) current),
- true,
- drop_extra_replicas_pred, c) ?: ret;
- ret = bch2_replicas_gc2(c) ?: ret;
- break;
- default:
- ret = -EINVAL;
- }
-
- bch2_move_stats_exit(stats, c);
- return ret;
-}
-
-void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
-{
- prt_printf(out, "%s: data type==", stats->name);
- bch2_prt_data_type(out, stats->data_type);
- prt_str(out, " pos=");
- bch2_bbpos_to_text(out, stats->pos);
- prt_newline(out);
- printbuf_indent_add(out, 2);
-
- prt_printf(out, "keys moved:\t%llu\n", atomic64_read(&stats->keys_moved));
- prt_printf(out, "keys raced:\t%llu\n", atomic64_read(&stats->keys_raced));
- prt_printf(out, "bytes seen:\t");
- prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
- prt_newline(out);
-
- prt_printf(out, "bytes moved:\t");
- prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
- prt_newline(out);
-
- prt_printf(out, "bytes raced:\t");
- prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
- prt_newline(out);
-
- printbuf_indent_sub(out, 2);
-}
-
-static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
-{
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 32);
-
- bch2_move_stats_to_text(out, ctxt->stats);
- printbuf_indent_add(out, 2);
-
- prt_printf(out, "reads: ios %u/%u sectors %u/%u\n",
- atomic_read(&ctxt->read_ios),
- c->opts.move_ios_in_flight,
- atomic_read(&ctxt->read_sectors),
- c->opts.move_bytes_in_flight >> 9);
-
- prt_printf(out, "writes: ios %u/%u sectors %u/%u\n",
- atomic_read(&ctxt->write_ios),
- c->opts.move_ios_in_flight,
- atomic_read(&ctxt->write_sectors),
- c->opts.move_bytes_in_flight >> 9);
-
- printbuf_indent_add(out, 2);
-
- mutex_lock(&ctxt->lock);
- struct moving_io *io;
- list_for_each_entry(io, &ctxt->ios, io_list)
- bch2_data_update_inflight_to_text(out, &io->write);
- mutex_unlock(&ctxt->lock);
-
- printbuf_indent_sub(out, 4);
-}
-
-void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
-{
- struct moving_context *ctxt;
-
- mutex_lock(&c->moving_context_lock);
- list_for_each_entry(ctxt, &c->moving_context_list, list)
- bch2_moving_ctxt_to_text(out, c, ctxt);
- mutex_unlock(&c->moving_context_lock);
-}
-
-void bch2_fs_move_init(struct bch_fs *c)
-{
- INIT_LIST_HEAD(&c->moving_context_list);
- mutex_init(&c->moving_context_lock);
-}
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
deleted file mode 100644
index 86b80499ac55..000000000000
--- a/fs/bcachefs/move.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_MOVE_H
-#define _BCACHEFS_MOVE_H
-
-#include "bbpos.h"
-#include "bcachefs_ioctl.h"
-#include "btree_iter.h"
-#include "buckets.h"
-#include "data_update.h"
-#include "move_types.h"
-
-struct bch_read_bio;
-
-struct moving_context {
- struct btree_trans *trans;
- struct list_head list;
- void *fn;
-
- struct bch_ratelimit *rate;
- struct bch_move_stats *stats;
- struct write_point_specifier wp;
- bool wait_on_copygc;
- bool write_error;
-
- /* For waiting on outstanding reads and writes: */
- struct closure cl;
-
- struct mutex lock;
- struct list_head reads;
- struct list_head ios;
-
- /* in flight sectors: */
- atomic_t read_sectors;
- atomic_t write_sectors;
- atomic_t read_ios;
- atomic_t write_ios;
-
- wait_queue_head_t wait;
-};
-
-#define move_ctxt_wait_event_timeout(_ctxt, _cond, _timeout) \
-({ \
- int _ret = 0; \
- while (true) { \
- bool cond_finished = false; \
- bch2_moving_ctxt_do_pending_writes(_ctxt); \
- \
- if (_cond) \
- break; \
- bch2_trans_unlock_long((_ctxt)->trans); \
- _ret = __wait_event_timeout((_ctxt)->wait, \
- bch2_moving_ctxt_next_pending_write(_ctxt) || \
- (cond_finished = (_cond)), _timeout); \
- if (_ret || ( cond_finished)) \
- break; \
- } \
- _ret; \
-})
-
-#define move_ctxt_wait_event(_ctxt, _cond) \
-do { \
- bool cond_finished = false; \
- bch2_moving_ctxt_do_pending_writes(_ctxt); \
- \
- if (_cond) \
- break; \
- bch2_trans_unlock_long((_ctxt)->trans); \
- __wait_event((_ctxt)->wait, \
- bch2_moving_ctxt_next_pending_write(_ctxt) || \
- (cond_finished = (_cond))); \
- if (cond_finished) \
- break; \
-} while (1)
-
-typedef bool (*move_pred_fn)(struct bch_fs *, void *, enum btree_id, struct bkey_s_c,
- struct bch_io_opts *, struct data_update_opts *);
-
-extern const char * const bch2_data_ops_strs[];
-
-void bch2_moving_ctxt_exit(struct moving_context *);
-void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
- struct bch_ratelimit *, struct bch_move_stats *,
- struct write_point_specifier, bool);
-struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *);
-void bch2_moving_ctxt_do_pending_writes(struct moving_context *);
-void bch2_moving_ctxt_flush_all(struct moving_context *);
-void bch2_move_ctxt_wait_for_io(struct moving_context *);
-int bch2_move_ratelimit(struct moving_context *);
-
-/* Inodes in different snapshots may have different IO options: */
-struct snapshot_io_opts_entry {
- u32 snapshot;
- struct bch_io_opts io_opts;
-};
-
-struct per_snapshot_io_opts {
- u64 cur_inum;
- struct bch_io_opts fs_io_opts;
- DARRAY(struct snapshot_io_opts_entry) d;
-};
-
-static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c)
-{
- memset(io_opts, 0, sizeof(*io_opts));
- io_opts->fs_io_opts = bch2_opts_to_inode_opts(c->opts);
-}
-
-static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts)
-{
- darray_exit(&io_opts->d);
-}
-
-int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *,
- struct btree_iter *, struct bkey_s_c);
-
-int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
-
-int bch2_move_extent(struct moving_context *,
- struct move_bucket *,
- struct btree_iter *,
- struct bkey_s_c,
- struct bch_io_opts,
- struct data_update_opts);
-
-struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *,
- struct per_snapshot_io_opts *, struct bpos,
- struct btree_iter *, struct bkey_s_c);
-
-int bch2_move_data_btree(struct moving_context *, struct bpos, struct bpos,
- move_pred_fn, void *, enum btree_id, unsigned);
-int __bch2_move_data(struct moving_context *,
- struct bbpos,
- struct bbpos,
- move_pred_fn, void *);
-int bch2_move_data(struct bch_fs *,
- struct bbpos start,
- struct bbpos end,
- struct bch_ratelimit *,
- struct bch_move_stats *,
- struct write_point_specifier,
- bool,
- move_pred_fn, void *);
-
-int bch2_move_data_phys(struct bch_fs *, unsigned, u64, u64, unsigned,
- struct bch_ratelimit *, struct bch_move_stats *,
- struct write_point_specifier, bool,
- move_pred_fn, void *);
-
-int bch2_evacuate_bucket(struct moving_context *,
- struct move_bucket *,
- struct bpos, int,
- struct data_update_opts);
-int bch2_data_job(struct bch_fs *,
- struct bch_move_stats *,
- struct bch_ioctl_data);
-
-void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *);
-void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *);
-void bch2_move_stats_init(struct bch_move_stats *, const char *);
-
-void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_fs_move_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_MOVE_H */
diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h
deleted file mode 100644
index c5c62cd600de..000000000000
--- a/fs/bcachefs/move_types.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_MOVE_TYPES_H
-#define _BCACHEFS_MOVE_TYPES_H
-
-#include "bbpos_types.h"
-#include "bcachefs_ioctl.h"
-
-struct bch_move_stats {
- char name[32];
- bool phys;
- enum bch_ioctl_data_event_ret ret;
-
- union {
- struct {
- enum bch_data_type data_type;
- struct bbpos pos;
- };
- struct {
- unsigned dev;
- u64 offset;
- };
- };
-
- atomic64_t keys_moved;
- atomic64_t keys_raced;
- atomic64_t sectors_seen;
- atomic64_t sectors_moved;
- atomic64_t sectors_raced;
- atomic64_t sectors_error_corrected;
- atomic64_t sectors_error_uncorrected;
-};
-
-struct move_bucket_key {
- struct bpos bucket;
- unsigned gen;
-};
-
-struct move_bucket {
- struct move_bucket *next;
- struct rhash_head hash;
- struct move_bucket_key k;
- unsigned sectors;
- atomic_t count;
-};
-
-#endif /* _BCACHEFS_MOVE_TYPES_H */
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
deleted file mode 100644
index 5e6de91a8763..000000000000
--- a/fs/bcachefs/movinggc.c
+++ /dev/null
@@ -1,476 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Moving/copying garbage collector
- *
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "clock.h"
-#include "errcode.h"
-#include "error.h"
-#include "lru.h"
-#include "move.h"
-#include "movinggc.h"
-#include "trace.h"
-
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/math64.h>
-#include <linux/sched/task.h>
-#include <linux/wait.h>
-
-struct buckets_in_flight {
- struct rhashtable *table;
- struct move_bucket *first;
- struct move_bucket *last;
- size_t nr;
- size_t sectors;
-
- DARRAY(struct move_bucket *) to_evacuate;
-};
-
-static const struct rhashtable_params bch_move_bucket_params = {
- .head_offset = offsetof(struct move_bucket, hash),
- .key_offset = offsetof(struct move_bucket, k),
- .key_len = sizeof(struct move_bucket_key),
- .automatic_shrinking = true,
-};
-
-static void move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket *b)
-{
- if (!list->first)
- list->first = b;
- else
- list->last->next = b;
-
- list->last = b;
- list->nr++;
- list->sectors += b->sectors;
-}
-
-static int bch2_bucket_is_movable(struct btree_trans *trans,
- struct move_bucket *b, u64 time)
-{
- struct bch_fs *c = trans->c;
-
- if (bch2_bucket_is_open(c, b->k.bucket.inode, b->k.bucket.offset))
- return 0;
-
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
- b->k.bucket, BTREE_ITER_cached);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- struct bch_dev *ca = bch2_dev_bucket_tryget(c, k.k->p);
- if (!ca)
- goto out;
-
- if (bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b->k.bucket.offset))
- goto out;
-
- if (ca->mi.state != BCH_MEMBER_STATE_rw ||
- !bch2_dev_is_online(ca))
- goto out;
-
- struct bch_alloc_v4 _a;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
- b->k.gen = a->gen;
- b->sectors = bch2_bucket_sectors_dirty(*a);
- u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
-
- ret = lru_idx && lru_idx <= time;
-out:
- bch2_dev_put(ca);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static void move_bucket_free(struct buckets_in_flight *list,
- struct move_bucket *b)
-{
- int ret = rhashtable_remove_fast(list->table, &b->hash,
- bch_move_bucket_params);
- BUG_ON(ret);
- kfree(b);
-}
-
-static void move_buckets_wait(struct moving_context *ctxt,
- struct buckets_in_flight *list,
- bool flush)
-{
- struct move_bucket *i;
-
- while ((i = list->first)) {
- if (flush)
- move_ctxt_wait_event(ctxt, !atomic_read(&i->count));
-
- if (atomic_read(&i->count))
- break;
-
- list->first = i->next;
- if (!list->first)
- list->last = NULL;
-
- list->nr--;
- list->sectors -= i->sectors;
-
- move_bucket_free(list, i);
- }
-
- bch2_trans_unlock_long(ctxt->trans);
-}
-
-static bool bucket_in_flight(struct buckets_in_flight *list,
- struct move_bucket_key k)
-{
- return rhashtable_lookup_fast(list->table, &k, bch_move_bucket_params);
-}
-
-static int bch2_copygc_get_buckets(struct moving_context *ctxt,
- struct buckets_in_flight *buckets_in_flight)
-{
- struct btree_trans *trans = ctxt->trans;
- struct bch_fs *c = trans->c;
- size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4);
- size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
- int ret;
-
- move_buckets_wait(ctxt, buckets_in_flight, false);
-
- ret = bch2_btree_write_buffer_tryflush(trans);
- if (bch2_err_matches(ret, EROFS))
- return ret;
-
- if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret)))
- return ret;
-
- ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru,
- lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0),
- lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX),
- 0, k, ({
- struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) };
- int ret2 = 0;
-
- saw++;
-
- ret2 = bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p));
- if (ret2 < 0)
- goto err;
-
- if (!ret2)
- not_movable++;
- else if (bucket_in_flight(buckets_in_flight, b.k))
- in_flight++;
- else {
- struct move_bucket *b_i = kmalloc(sizeof(*b_i), GFP_KERNEL);
- ret2 = b_i ? 0 : -ENOMEM;
- if (ret2)
- goto err;
-
- *b_i = b;
-
- ret2 = darray_push(&buckets_in_flight->to_evacuate, b_i);
- if (ret2) {
- kfree(b_i);
- goto err;
- }
-
- ret2 = rhashtable_lookup_insert_fast(buckets_in_flight->table, &b_i->hash,
- bch_move_bucket_params);
- BUG_ON(ret2);
-
- sectors += b.sectors;
- }
-
- ret2 = buckets_in_flight->to_evacuate.nr >= nr_to_get;
-err:
- ret2;
- }));
-
- pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i",
- buckets_in_flight->nr, buckets_in_flight->sectors,
- saw, in_flight, not_movable, buckets_in_flight->to_evacuate.nr, sectors, nr_to_get, ret);
-
- return ret < 0 ? ret : 0;
-}
-
-noinline
-static int bch2_copygc(struct moving_context *ctxt,
- struct buckets_in_flight *buckets_in_flight,
- bool *did_work)
-{
- struct btree_trans *trans = ctxt->trans;
- struct bch_fs *c = trans->c;
- struct data_update_opts data_opts = {
- .btree_insert_flags = BCH_WATERMARK_copygc,
- };
- u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen);
- u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved);
- int ret = 0;
-
- ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight);
- if (ret)
- goto err;
-
- darray_for_each(buckets_in_flight->to_evacuate, i) {
- if (kthread_should_stop() || freezing(current))
- break;
-
- struct move_bucket *b = *i;
- *i = NULL;
-
- move_bucket_in_flight_add(buckets_in_flight, b);
-
- ret = bch2_evacuate_bucket(ctxt, b, b->k.bucket, b->k.gen, data_opts);
- if (ret)
- goto err;
-
- *did_work = true;
- }
-err:
- /* no entries in LRU btree found, or got to end: */
- if (bch2_err_matches(ret, ENOENT))
- ret = 0;
-
- if (ret < 0 && !bch2_err_matches(ret, EROFS))
- bch_err_msg(c, ret, "from bch2_move_data()");
-
- sectors_seen = atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen;
- sectors_moved = atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved;
- trace_and_count(c, copygc, c, buckets_in_flight->to_evacuate.nr, sectors_seen, sectors_moved);
-
- darray_for_each(buckets_in_flight->to_evacuate, i)
- if (*i)
- move_bucket_free(buckets_in_flight, *i);
- darray_exit(&buckets_in_flight->to_evacuate);
- return ret;
-}
-
-static u64 bch2_copygc_dev_wait_amount(struct bch_dev *ca)
-{
- struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca);
- struct bch_dev_usage usage;
-
- for (unsigned i = 0; i < BCH_DATA_NR; i++)
- usage.buckets[i] = usage_full.d[i].buckets;
-
- s64 fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) *
- ca->mi.bucket_size) >> 1);
- s64 fragmented = 0;
-
- for (unsigned i = 0; i < BCH_DATA_NR; i++)
- if (data_type_movable(i))
- fragmented += usage_full.d[i].fragmented;
-
- return max(0LL, fragmented_allowed - fragmented);
-}
-
-/*
- * Copygc runs when the amount of fragmented data is above some arbitrary
- * threshold:
- *
- * The threshold at the limit - when the device is full - is the amount of space
- * we reserved in bch2_recalc_capacity; we can't have more than that amount of
- * disk space stranded due to fragmentation and store everything we have
- * promised to store.
- *
- * But we don't want to be running copygc unnecessarily when the device still
- * has plenty of free space - rather, we want copygc to smoothly run every so
- * often and continually reduce the amount of fragmented space as the device
- * fills up. So, we increase the threshold by half the current free space.
- */
-u64 bch2_copygc_wait_amount(struct bch_fs *c)
-{
- u64 wait = U64_MAX;
-
- guard(rcu)();
- for_each_rw_member_rcu(c, ca)
- wait = min(wait, bch2_copygc_dev_wait_amount(ca));
- return wait;
-}
-
-void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
-{
- printbuf_tabstop_push(out, 32);
- prt_printf(out, "running:\t%u\n", c->copygc_running);
- prt_printf(out, "copygc_wait:\t%llu\n", c->copygc_wait);
- prt_printf(out, "copygc_wait_at:\t%llu\n", c->copygc_wait_at);
-
- prt_printf(out, "Currently waiting for:\t");
- prt_human_readable_u64(out, max(0LL, c->copygc_wait -
- atomic64_read(&c->io_clock[WRITE].now)) << 9);
- prt_newline(out);
-
- prt_printf(out, "Currently waiting since:\t");
- prt_human_readable_u64(out, max(0LL,
- atomic64_read(&c->io_clock[WRITE].now) -
- c->copygc_wait_at) << 9);
- prt_newline(out);
-
- bch2_printbuf_make_room(out, 4096);
-
- struct task_struct *t;
- out->atomic++;
- scoped_guard(rcu) {
- prt_printf(out, "Currently calculated wait:\n");
- for_each_rw_member_rcu(c, ca) {
- prt_printf(out, " %s:\t", ca->name);
- prt_human_readable_u64(out, bch2_copygc_dev_wait_amount(ca));
- prt_newline(out);
- }
-
- t = rcu_dereference(c->copygc_thread);
- if (t)
- get_task_struct(t);
- }
- --out->atomic;
-
- if (t) {
- bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
- put_task_struct(t);
- }
-}
-
-static int bch2_copygc_thread(void *arg)
-{
- struct bch_fs *c = arg;
- struct moving_context ctxt;
- struct bch_move_stats move_stats;
- struct io_clock *clock = &c->io_clock[WRITE];
- struct buckets_in_flight buckets = {};
- u64 last, wait;
-
- buckets.table = kzalloc(sizeof(*buckets.table), GFP_KERNEL);
- int ret = !buckets.table
- ? -ENOMEM
- : rhashtable_init(buckets.table, &bch_move_bucket_params);
- bch_err_msg(c, ret, "allocating copygc buckets in flight");
- if (ret)
- goto err;
-
- set_freezable();
-
- /*
- * Data move operations can't run until after check_snapshots has
- * completed, and bch2_snapshot_is_ancestor() is available.
- */
- kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots ||
- kthread_should_stop());
-
- bch2_move_stats_init(&move_stats, "copygc");
- bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
- writepoint_ptr(&c->copygc_write_point),
- false);
-
- while (!ret && !kthread_should_stop()) {
- bool did_work = false;
-
- bch2_trans_unlock_long(ctxt.trans);
- cond_resched();
-
- if (!c->opts.copygc_enabled) {
- move_buckets_wait(&ctxt, &buckets, true);
- kthread_wait_freezable(c->opts.copygc_enabled ||
- kthread_should_stop());
- }
-
- if (unlikely(freezing(current))) {
- move_buckets_wait(&ctxt, &buckets, true);
- __refrigerator(false);
- continue;
- }
-
- last = atomic64_read(&clock->now);
- wait = bch2_copygc_wait_amount(c);
-
- if (wait > clock->max_slop) {
- c->copygc_wait_at = last;
- c->copygc_wait = last + wait;
- move_buckets_wait(&ctxt, &buckets, true);
- trace_and_count(c, copygc_wait, c, wait, last + wait);
- bch2_kthread_io_clock_wait(clock, last + wait,
- MAX_SCHEDULE_TIMEOUT);
- continue;
- }
-
- c->copygc_wait = 0;
-
- c->copygc_running = true;
- ret = bch2_copygc(&ctxt, &buckets, &did_work);
- c->copygc_running = false;
-
- wake_up(&c->copygc_running_wq);
-
- if (!wait && !did_work) {
- u64 min_member_capacity = bch2_min_rw_member_capacity(c);
-
- if (min_member_capacity == U64_MAX)
- min_member_capacity = 128 * 2048;
-
- move_buckets_wait(&ctxt, &buckets, true);
- bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6),
- MAX_SCHEDULE_TIMEOUT);
- }
- }
-
- move_buckets_wait(&ctxt, &buckets, true);
- rhashtable_destroy(buckets.table);
- bch2_moving_ctxt_exit(&ctxt);
- bch2_move_stats_exit(&move_stats, c);
-err:
- kfree(buckets.table);
- return ret;
-}
-
-void bch2_copygc_stop(struct bch_fs *c)
-{
- if (c->copygc_thread) {
- kthread_stop(c->copygc_thread);
- put_task_struct(c->copygc_thread);
- }
- c->copygc_thread = NULL;
-}
-
-int bch2_copygc_start(struct bch_fs *c)
-{
- struct task_struct *t;
- int ret;
-
- if (c->copygc_thread)
- return 0;
-
- if (c->opts.nochanges)
- return 0;
-
- if (bch2_fs_init_fault("copygc_start"))
- return -ENOMEM;
-
- t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name);
- ret = PTR_ERR_OR_ZERO(t);
- bch_err_msg(c, ret, "creating copygc thread");
- if (ret)
- return ret;
-
- get_task_struct(t);
-
- c->copygc_thread = t;
- wake_up_process(c->copygc_thread);
-
- return 0;
-}
-
-void bch2_fs_copygc_init(struct bch_fs *c)
-{
- init_waitqueue_head(&c->copygc_running_wq);
- c->copygc_running = false;
-}
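
The copygc threshold described in the comment removed above (the reserved space plus half the device's free space) can be illustrated outside of bcachefs' own types. The following standalone C sketch is illustrative only — the function name and the simplified inputs (reserved, free and fragmented sector counts) are assumptions, not the removed bch2_copygc_dev_wait_amount() signature:

/*
 * Illustrative sketch of the copygc wait calculation; simplified inputs,
 * not bcachefs code.
 */
#include <stdint.h>
#include <stdio.h>

static int64_t copygc_wait_amount(uint64_t reserved, uint64_t free,
				  uint64_t fragmented)
{
	/* Allowed fragmentation: the copygc reserve plus half the free space */
	int64_t fragmented_allowed = (int64_t)(reserved + (free >> 1));

	/* Copygc should run once fragmentation exceeds the allowance */
	int64_t wait = fragmented_allowed - (int64_t)fragmented;
	return wait > 0 ? wait : 0;
}

int main(void)
{
	/* 1 GiB reserved, 10 GiB free, 2 GiB fragmented, all in 512-byte sectors */
	printf("%lld\n", (long long)copygc_wait_amount(1ULL << 21, 10ULL << 21, 2ULL << 21));
	return 0;
}

A zero result means the device has crossed the threshold and copygc should run now; otherwise it is roughly how many more sectors may be written before copygc needs to wake up, which is how bch2_copygc_thread() above uses the value when comparing it against clock->max_slop on the write io clock.
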
diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h
deleted file mode 100644
index f615910d6f98..000000000000
--- a/fs/bcachefs/movinggc.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_MOVINGGC_H
-#define _BCACHEFS_MOVINGGC_H
-
-u64 bch2_copygc_wait_amount(struct bch_fs *);
-void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *);
-
-static inline void bch2_copygc_wakeup(struct bch_fs *c)
-{
- guard(rcu)();
- struct task_struct *p = rcu_dereference(c->copygc_thread);
- if (p)
- wake_up_process(p);
-}
-
-void bch2_copygc_stop(struct bch_fs *);
-int bch2_copygc_start(struct bch_fs *);
-void bch2_fs_copygc_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_MOVINGGC_H */
diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c
deleted file mode 100644
index c3f87c59922d..000000000000
--- a/fs/bcachefs/namei.c
+++ /dev/null
@@ -1,1034 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "acl.h"
-#include "btree_update.h"
-#include "dirent.h"
-#include "inode.h"
-#include "namei.h"
-#include "subvolume.h"
-#include "xattr.h"
-
-#include <linux/posix_acl.h>
-
-static inline subvol_inum parent_inum(subvol_inum inum, struct bch_inode_unpacked *inode)
-{
- return (subvol_inum) {
- .subvol = inode->bi_parent_subvol ?: inum.subvol,
- .inum = inode->bi_dir,
- };
-}
-
-static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode)
-{
- return S_ISDIR(inode->bi_mode) && !inode->bi_subvol;
-}
-
-int bch2_create_trans(struct btree_trans *trans,
- subvol_inum dir,
- struct bch_inode_unpacked *dir_u,
- struct bch_inode_unpacked *new_inode,
- const struct qstr *name,
- uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
- struct posix_acl *default_acl,
- struct posix_acl *acl,
- subvol_inum snapshot_src,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter dir_iter = {};
- struct btree_iter inode_iter = {};
- subvol_inum new_inum = dir;
- u64 now = bch2_current_time(c);
- u64 cpu = raw_smp_processor_id();
- u64 dir_target;
- u32 snapshot;
- unsigned dir_type = mode_to_type(mode);
- int ret;
-
- ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
- if (ret)
- goto err;
-
- ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir,
- BTREE_ITER_intent|BTREE_ITER_with_updates);
- if (ret)
- goto err;
-
- if (!(flags & BCH_CREATE_SNAPSHOT)) {
- /* Normal create path - allocate a new inode: */
- bch2_inode_init_late(c, new_inode, now, uid, gid, mode, rdev, dir_u);
-
- if (flags & BCH_CREATE_TMPFILE)
- new_inode->bi_flags |= BCH_INODE_unlinked;
-
- ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu);
- if (ret)
- goto err;
-
- snapshot_src = (subvol_inum) { 0 };
- } else {
- /*
- * Creating a snapshot - we're not allocating a new inode, but
- * we do have to lookup the root inode of the subvolume we're
- * snapshotting and update it (in the new snapshot):
- */
-
- if (!snapshot_src.inum) {
- /* Inode wasn't specified, just snapshot: */
- struct bch_subvolume s;
- ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, &s);
- if (ret)
- goto err;
-
- snapshot_src.inum = le64_to_cpu(s.inode);
- }
-
- ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src,
- BTREE_ITER_intent);
- if (ret)
- goto err;
-
- if (new_inode->bi_subvol != snapshot_src.subvol) {
- /* Not a subvolume root: */
- ret = -EINVAL;
- goto err;
- }
-
- /*
- * If we're not root, we have to own the subvolume being
- * snapshotted:
- */
- if (uid && new_inode->bi_uid != uid) {
- ret = -EPERM;
- goto err;
- }
-
- flags |= BCH_CREATE_SUBVOL;
- }
-
- new_inum.inum = new_inode->bi_inum;
- dir_target = new_inode->bi_inum;
-
- if (flags & BCH_CREATE_SUBVOL) {
- u32 new_subvol, dir_snapshot;
-
- ret = bch2_subvolume_create(trans, new_inode->bi_inum,
- dir.subvol,
- snapshot_src.subvol,
- &new_subvol, &snapshot,
- (flags & BCH_CREATE_SNAPSHOT_RO) != 0);
- if (ret)
- goto err;
-
- new_inode->bi_parent_subvol = dir.subvol;
- new_inode->bi_subvol = new_subvol;
- new_inum.subvol = new_subvol;
- dir_target = new_subvol;
- dir_type = DT_SUBVOL;
-
- ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot);
- if (ret)
- goto err;
-
- bch2_btree_iter_set_snapshot(trans, &dir_iter, dir_snapshot);
- ret = bch2_btree_iter_traverse(trans, &dir_iter);
- if (ret)
- goto err;
- }
-
- if (!(flags & BCH_CREATE_SNAPSHOT)) {
- if (default_acl) {
- ret = bch2_set_acl_trans(trans, new_inum, new_inode,
- default_acl, ACL_TYPE_DEFAULT);
- if (ret)
- goto err;
- }
-
- if (acl) {
- ret = bch2_set_acl_trans(trans, new_inum, new_inode,
- acl, ACL_TYPE_ACCESS);
- if (ret)
- goto err;
- }
- }
-
- if (!(flags & BCH_CREATE_TMPFILE)) {
- struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u);
- u64 dir_offset;
-
- if (is_subdir_for_nlink(new_inode))
- dir_u->bi_nlink++;
- dir_u->bi_mtime = dir_u->bi_ctime = now;
-
- ret = bch2_dirent_create(trans, dir, &dir_hash,
- dir_type,
- name,
- dir_target,
- &dir_offset,
- STR_HASH_must_create|BTREE_ITER_with_updates) ?:
- bch2_inode_write(trans, &dir_iter, dir_u);
- if (ret)
- goto err;
-
- new_inode->bi_dir = dir_u->bi_inum;
- new_inode->bi_dir_offset = dir_offset;
- }
-
- if (S_ISDIR(mode)) {
- ret = bch2_maybe_propagate_has_case_insensitive(trans,
- (subvol_inum) {
- new_inode->bi_subvol ?: dir.subvol,
- new_inode->bi_inum },
- new_inode);
- if (ret)
- goto err;
- }
-
- if (S_ISDIR(mode) &&
- !new_inode->bi_subvol)
- new_inode->bi_depth = dir_u->bi_depth + 1;
-
- inode_iter.flags &= ~BTREE_ITER_all_snapshots;
- bch2_btree_iter_set_snapshot(trans, &inode_iter, snapshot);
-
- ret = bch2_btree_iter_traverse(trans, &inode_iter) ?:
- bch2_inode_write(trans, &inode_iter, new_inode);
-err:
- bch2_trans_iter_exit(trans, &inode_iter);
- bch2_trans_iter_exit(trans, &dir_iter);
- return ret;
-}
-
-int bch2_link_trans(struct btree_trans *trans,
- subvol_inum dir, struct bch_inode_unpacked *dir_u,
- subvol_inum inum, struct bch_inode_unpacked *inode_u,
- const struct qstr *name)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter dir_iter = {};
- struct btree_iter inode_iter = {};
- struct bch_hash_info dir_hash;
- u64 now = bch2_current_time(c);
- u64 dir_offset = 0;
- int ret;
-
- if (dir.subvol != inum.subvol)
- return -EXDEV;
-
- ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent);
- if (ret)
- return ret;
-
- inode_u->bi_ctime = now;
- ret = bch2_inode_nlink_inc(inode_u);
- if (ret)
- goto err;
-
- ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent);
- if (ret)
- goto err;
-
- if (bch2_reinherit_attrs(inode_u, dir_u)) {
- ret = -EXDEV;
- goto err;
- }
-
- dir_u->bi_mtime = dir_u->bi_ctime = now;
-
- dir_hash = bch2_hash_info_init(c, dir_u);
-
- ret = bch2_dirent_create(trans, dir, &dir_hash,
- mode_to_type(inode_u->bi_mode),
- name, inum.inum,
- &dir_offset,
- STR_HASH_must_create);
- if (ret)
- goto err;
-
- inode_u->bi_dir = dir.inum;
- inode_u->bi_dir_offset = dir_offset;
-
- ret = bch2_inode_write(trans, &dir_iter, dir_u) ?:
- bch2_inode_write(trans, &inode_iter, inode_u);
-err:
- bch2_trans_iter_exit(trans, &dir_iter);
- bch2_trans_iter_exit(trans, &inode_iter);
- return ret;
-}
-
-int bch2_unlink_trans(struct btree_trans *trans,
- subvol_inum dir,
- struct bch_inode_unpacked *dir_u,
- struct bch_inode_unpacked *inode_u,
- const struct qstr *name,
- bool deleting_subvol)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter dir_iter = {};
- struct btree_iter dirent_iter = {};
- struct btree_iter inode_iter = {};
- struct bch_hash_info dir_hash;
- subvol_inum inum;
- u64 now = bch2_current_time(c);
- struct bkey_s_c k;
- int ret;
-
- ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent);
- if (ret)
- goto err;
-
- dir_hash = bch2_hash_info_init(c, dir_u);
-
- ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
- name, &inum, BTREE_ITER_intent);
- if (ret)
- goto err;
-
- ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum,
- BTREE_ITER_intent);
- if (ret)
- goto err;
-
- if (!deleting_subvol && S_ISDIR(inode_u->bi_mode)) {
- ret = bch2_empty_dir_trans(trans, inum);
- if (ret)
- goto err;
- }
-
- if (deleting_subvol && !inode_u->bi_subvol) {
- ret = bch_err_throw(c, ENOENT_not_subvol);
- goto err;
- }
-
- if (inode_u->bi_subvol) {
- /* Recursive subvolume destroy not allowed (yet?) */
- ret = bch2_subvol_has_children(trans, inode_u->bi_subvol);
- if (ret)
- goto err;
- }
-
- if (deleting_subvol || inode_u->bi_subvol) {
- ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol);
- if (ret)
- goto err;
-
- k = bch2_btree_iter_peek_slot(trans, &dirent_iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- /*
- * If we're deleting a subvolume, we need to really delete the
- * dirent, not just emit a whiteout in the current snapshot:
- */
- bch2_btree_iter_set_snapshot(trans, &dirent_iter, k.k->p.snapshot);
- ret = bch2_btree_iter_traverse(trans, &dirent_iter);
- if (ret)
- goto err;
- } else {
- bch2_inode_nlink_dec(trans, inode_u);
- }
-
- if (inode_u->bi_dir == dirent_iter.pos.inode &&
- inode_u->bi_dir_offset == dirent_iter.pos.offset) {
- inode_u->bi_dir = 0;
- inode_u->bi_dir_offset = 0;
- }
-
- dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now;
- dir_u->bi_nlink -= is_subdir_for_nlink(inode_u);
-
- ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
- &dir_hash, &dirent_iter,
- BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_inode_write(trans, &dir_iter, dir_u) ?:
- bch2_inode_write(trans, &inode_iter, inode_u);
-err:
- bch2_trans_iter_exit(trans, &inode_iter);
- bch2_trans_iter_exit(trans, &dirent_iter);
- bch2_trans_iter_exit(trans, &dir_iter);
- return ret;
-}
-
-bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
- struct bch_inode_unpacked *src_u)
-{
- u64 src, dst;
- unsigned id;
- bool ret = false;
-
- for (id = 0; id < Inode_opt_nr; id++) {
- if (!S_ISDIR(dst_u->bi_mode) && id == Inode_opt_casefold)
- continue;
-
- /* Skip attributes that were explicitly set on this inode */
- if (dst_u->bi_fields_set & (1 << id))
- continue;
-
- src = bch2_inode_opt_get(src_u, id);
- dst = bch2_inode_opt_get(dst_u, id);
-
- if (src == dst)
- continue;
-
- bch2_inode_opt_set(dst_u, id, src);
- ret = true;
- }
-
- return ret;
-}
-
-static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_parent)
-{
- struct btree_iter iter;
- struct bkey_i_subvolume *s =
- bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_subvolumes, POS(0, subvol),
- BTREE_ITER_cached, subvolume);
- int ret = PTR_ERR_OR_ZERO(s);
- if (ret)
- return ret;
-
- s->v.fs_path_parent = cpu_to_le32(new_parent);
- bch2_trans_iter_exit(trans, &iter);
- return 0;
-}
-
-int bch2_rename_trans(struct btree_trans *trans,
- subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u,
- subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u,
- struct bch_inode_unpacked *src_inode_u,
- struct bch_inode_unpacked *dst_inode_u,
- const struct qstr *src_name,
- const struct qstr *dst_name,
- enum bch_rename_mode mode)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter src_dir_iter = {};
- struct btree_iter dst_dir_iter = {};
- struct btree_iter src_inode_iter = {};
- struct btree_iter dst_inode_iter = {};
- struct bch_hash_info src_hash, dst_hash;
- subvol_inum src_inum, dst_inum;
- u64 src_offset, dst_offset;
- u64 now = bch2_current_time(c);
- int ret;
-
- ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir,
- BTREE_ITER_intent);
- if (ret)
- goto err;
-
- src_hash = bch2_hash_info_init(c, src_dir_u);
-
- if (!subvol_inum_eq(dst_dir, src_dir)) {
- ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir,
- BTREE_ITER_intent);
- if (ret)
- goto err;
-
- dst_hash = bch2_hash_info_init(c, dst_dir_u);
- } else {
- dst_dir_u = src_dir_u;
- dst_hash = src_hash;
- }
-
- ret = bch2_dirent_rename(trans,
- src_dir, &src_hash,
- dst_dir, &dst_hash,
- src_name, &src_inum, &src_offset,
- dst_name, &dst_inum, &dst_offset,
- mode);
- if (ret)
- goto err;
-
- ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum,
- BTREE_ITER_intent);
- if (ret)
- goto err;
-
- if (dst_inum.inum) {
- ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum,
- BTREE_ITER_intent);
- if (ret)
- goto err;
- }
-
- if (src_inode_u->bi_subvol &&
- dst_dir.subvol != src_inode_u->bi_parent_subvol) {
- ret = subvol_update_parent(trans, src_inode_u->bi_subvol, dst_dir.subvol);
- if (ret)
- goto err;
- }
-
- if (mode == BCH_RENAME_EXCHANGE &&
- dst_inode_u->bi_subvol &&
- src_dir.subvol != dst_inode_u->bi_parent_subvol) {
- ret = subvol_update_parent(trans, dst_inode_u->bi_subvol, src_dir.subvol);
- if (ret)
- goto err;
- }
-
- /* Can't move across subvolumes, unless it's a subvolume root: */
- if (src_dir.subvol != dst_dir.subvol &&
- (!src_inode_u->bi_subvol ||
- (dst_inum.inum && !dst_inode_u->bi_subvol))) {
- ret = -EXDEV;
- goto err;
- }
-
- if (src_inode_u->bi_parent_subvol)
- src_inode_u->bi_parent_subvol = dst_dir.subvol;
-
- if ((mode == BCH_RENAME_EXCHANGE) &&
- dst_inode_u->bi_parent_subvol)
- dst_inode_u->bi_parent_subvol = src_dir.subvol;
-
- src_inode_u->bi_dir = dst_dir_u->bi_inum;
- src_inode_u->bi_dir_offset = dst_offset;
-
- if (mode == BCH_RENAME_EXCHANGE) {
- dst_inode_u->bi_dir = src_dir_u->bi_inum;
- dst_inode_u->bi_dir_offset = src_offset;
- }
-
- if (mode == BCH_RENAME_OVERWRITE &&
- dst_inode_u->bi_dir == dst_dir_u->bi_inum &&
- dst_inode_u->bi_dir_offset == src_offset) {
- dst_inode_u->bi_dir = 0;
- dst_inode_u->bi_dir_offset = 0;
- }
-
- if (mode == BCH_RENAME_OVERWRITE) {
- if (S_ISDIR(src_inode_u->bi_mode) !=
- S_ISDIR(dst_inode_u->bi_mode)) {
- ret = -ENOTDIR;
- goto err;
- }
-
- if (S_ISDIR(dst_inode_u->bi_mode)) {
- ret = bch2_empty_dir_trans(trans, dst_inum);
- if (ret)
- goto err;
- }
- }
-
- if (!subvol_inum_eq(dst_dir, src_dir)) {
- if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) &&
- S_ISDIR(src_inode_u->bi_mode)) {
- ret = -EXDEV;
- goto err;
- }
-
- if (mode == BCH_RENAME_EXCHANGE &&
- bch2_reinherit_attrs(dst_inode_u, src_dir_u) &&
- S_ISDIR(dst_inode_u->bi_mode)) {
- ret = -EXDEV;
- goto err;
- }
-
- ret = bch2_maybe_propagate_has_case_insensitive(trans, src_inum, src_inode_u) ?:
- (mode == BCH_RENAME_EXCHANGE
- ? bch2_maybe_propagate_has_case_insensitive(trans, dst_inum, dst_inode_u)
- : 0);
- if (ret)
- goto err;
-
- if (is_subdir_for_nlink(src_inode_u)) {
- src_dir_u->bi_nlink--;
- dst_dir_u->bi_nlink++;
- }
-
- if (S_ISDIR(src_inode_u->bi_mode) &&
- !src_inode_u->bi_subvol)
- src_inode_u->bi_depth = dst_dir_u->bi_depth + 1;
-
- if (mode == BCH_RENAME_EXCHANGE &&
- S_ISDIR(dst_inode_u->bi_mode) &&
- !dst_inode_u->bi_subvol)
- dst_inode_u->bi_depth = src_dir_u->bi_depth + 1;
- }
-
- if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) {
- dst_dir_u->bi_nlink--;
- src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE;
- }
-
- if (mode == BCH_RENAME_OVERWRITE)
- bch2_inode_nlink_dec(trans, dst_inode_u);
-
- src_dir_u->bi_mtime = now;
- src_dir_u->bi_ctime = now;
-
- if (src_dir.inum != dst_dir.inum) {
- dst_dir_u->bi_mtime = now;
- dst_dir_u->bi_ctime = now;
- }
-
- src_inode_u->bi_ctime = now;
-
- if (dst_inum.inum)
- dst_inode_u->bi_ctime = now;
-
- ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?:
- (src_dir.inum != dst_dir.inum
- ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u)
- : 0) ?:
- bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?:
- (dst_inum.inum
- ? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u)
- : 0);
-err:
- bch2_trans_iter_exit(trans, &dst_inode_iter);
- bch2_trans_iter_exit(trans, &src_inode_iter);
- bch2_trans_iter_exit(trans, &dst_dir_iter);
- bch2_trans_iter_exit(trans, &src_dir_iter);
- return ret;
-}
-
-/* inum_to_path */
-
-static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n)
-{
- bch2_printbuf_make_room(out, n);
-
- unsigned can_print = min(n, printbuf_remaining(out));
-
- b += n;
-
- for (unsigned i = 0; i < can_print; i++)
- out->buf[out->pos++] = *((char *) --b);
-
- printbuf_nul_terminate(out);
-}
-
-static inline void prt_str_reversed(struct printbuf *out, const char *s)
-{
- prt_bytes_reversed(out, s, strlen(s));
-}
-
-static inline void reverse_bytes(void *b, size_t n)
-{
- char *e = b + n, *s = b;
-
- while (s < e) {
- --e;
- swap(*s, *e);
- s++;
- }
-}
-
-static int __bch2_inum_to_path(struct btree_trans *trans,
- u32 subvol, u64 inum, u32 snapshot,
- struct printbuf *path)
-{
- unsigned orig_pos = path->pos;
- int ret = 0;
- DARRAY(subvol_inum) inums = {};
-
- if (!snapshot) {
- ret = bch2_subvolume_get_snapshot(trans, subvol, &snapshot);
- if (ret)
- goto disconnected;
- }
-
- while (true) {
- subvol_inum n = (subvol_inum) { subvol ?: snapshot, inum };
-
- if (darray_find_p(inums, i, i->subvol == n.subvol && i->inum == n.inum)) {
- prt_str_reversed(path, "(loop)");
- break;
- }
-
- ret = darray_push(&inums, n);
- if (ret)
- goto err;
-
- struct bch_inode_unpacked inode;
- ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, &inode, 0);
- if (ret)
- goto disconnected;
-
- if (inode.bi_subvol == BCACHEFS_ROOT_SUBVOL &&
- inode.bi_inum == BCACHEFS_ROOT_INO)
- break;
-
- if (!inode.bi_dir && !inode.bi_dir_offset) {
- ret = bch_err_throw(trans->c, ENOENT_inode_no_backpointer);
- goto disconnected;
- }
-
- inum = inode.bi_dir;
- if (inode.bi_parent_subvol) {
- subvol = inode.bi_parent_subvol;
- ret = bch2_subvolume_get_snapshot(trans, inode.bi_parent_subvol, &snapshot);
- if (ret)
- goto disconnected;
- }
-
- struct btree_iter d_iter;
- struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter,
- BTREE_ID_dirents, SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot),
- 0, dirent);
- ret = bkey_err(d.s_c);
- if (ret)
- goto disconnected;
-
- struct qstr dirent_name = bch2_dirent_get_name(d);
-
- prt_bytes_reversed(path, dirent_name.name, dirent_name.len);
-
- prt_char(path, '/');
-
- bch2_trans_iter_exit(trans, &d_iter);
- }
-
- if (orig_pos == path->pos)
- prt_char(path, '/');
-out:
- ret = path->allocation_failure ? -ENOMEM : 0;
- if (ret)
- goto err;
-
- reverse_bytes(path->buf + orig_pos, path->pos - orig_pos);
- darray_exit(&inums);
- return 0;
-err:
- darray_exit(&inums);
- return ret;
-disconnected:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto err;
-
- prt_str_reversed(path, "(disconnected)");
- goto out;
-}
-
-int bch2_inum_to_path(struct btree_trans *trans,
- subvol_inum inum,
- struct printbuf *path)
-{
- return __bch2_inum_to_path(trans, inum.subvol, inum.inum, 0, path);
-}
-
-int bch2_inum_snapshot_to_path(struct btree_trans *trans, u64 inum, u32 snapshot,
- snapshot_id_list *snapshot_overwrites,
- struct printbuf *path)
-{
- return __bch2_inum_to_path(trans, 0, inum, snapshot, path);
-}
-
-/* fsck */
-
-static int bch2_check_dirent_inode_dirent(struct btree_trans *trans,
- struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *target,
- bool in_fsck)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- struct btree_iter bp_iter = {};
- int ret = 0;
-
- if (inode_points_to_dirent(target, d))
- return 0;
-
- if (!bch2_inode_has_backpointer(target)) {
- fsck_err_on(S_ISDIR(target->bi_mode),
- trans, inode_dir_missing_backpointer,
- "directory with missing backpointer\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, d.s_c),
- prt_printf(&buf, "\n"),
- bch2_inode_unpacked_to_text(&buf, target),
- buf.buf));
-
- fsck_err_on(target->bi_flags & BCH_INODE_unlinked,
- trans, inode_unlinked_but_has_dirent,
- "inode unlinked but has dirent\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, d.s_c),
- prt_printf(&buf, "\n"),
- bch2_inode_unpacked_to_text(&buf, target),
- buf.buf));
-
- target->bi_flags &= ~BCH_INODE_unlinked;
- target->bi_dir = d.k->p.inode;
- target->bi_dir_offset = d.k->p.offset;
- return __bch2_fsck_write_inode(trans, target);
- }
-
- struct bkey_s_c_dirent bp_dirent =
- bch2_bkey_get_iter_typed(trans, &bp_iter, BTREE_ID_dirents,
- SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot),
- 0, dirent);
- ret = bkey_err(bp_dirent);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- bool backpointer_exists = !ret;
- ret = 0;
-
- if (!backpointer_exists) {
- if (fsck_err(trans, inode_wrong_backpointer,
- "inode %llu:%u has wrong backpointer:\n"
- "got %llu:%llu\n"
- "should be %llu:%llu",
- target->bi_inum, target->bi_snapshot,
- target->bi_dir,
- target->bi_dir_offset,
- d.k->p.inode,
- d.k->p.offset)) {
- target->bi_dir = d.k->p.inode;
- target->bi_dir_offset = d.k->p.offset;
- ret = __bch2_fsck_write_inode(trans, target);
- }
- } else {
- printbuf_reset(&buf);
- bch2_bkey_val_to_text(&buf, c, d.s_c);
- prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
-
- if (S_ISDIR(target->bi_mode) || target->bi_subvol) {
- /*
- * XXX: verify connectivity of the other dirent
- * up to the root before removing this one
- *
- * Additionally, bch2_lookup would need to cope with the
- * dirent it found being removed - or should we remove
- * the other one, even though the inode points to it?
- */
- if (in_fsck) {
- if (fsck_err(trans, inode_dir_multiple_links,
- "%s %llu:%u with multiple links\n%s",
- S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
- target->bi_inum, target->bi_snapshot, buf.buf))
- ret = bch2_fsck_remove_dirent(trans, d.k->p);
- } else {
- bch2_fs_inconsistent(c,
- "%s %llu:%u with multiple links\n%s",
- S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
- target->bi_inum, target->bi_snapshot, buf.buf);
- }
-
- goto out;
- } else {
- /*
- * hardlinked file with nlink 0:
- * We're just adjusting nlink here so check_nlinks() will pick
- * it up, it ignores inodes with nlink 0
- */
- if (fsck_err_on(!target->bi_nlink,
- trans, inode_multiple_links_but_nlink_0,
- "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
- target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
- target->bi_nlink++;
- target->bi_flags &= ~BCH_INODE_unlinked;
- ret = __bch2_fsck_write_inode(trans, target);
- if (ret)
- goto err;
- }
- }
- }
-out:
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &bp_iter);
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-int __bch2_check_dirent_target(struct btree_trans *trans,
- struct btree_iter *dirent_iter,
- struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *target,
- bool in_fsck)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- ret = bch2_check_dirent_inode_dirent(trans, d, target, in_fsck);
- if (ret)
- goto err;
-
- if (fsck_err_on(d.v->d_type != inode_d_type(target),
- trans, dirent_d_type_wrong,
- "incorrect d_type: got %s, should be %s:\n%s",
- bch2_d_type_str(d.v->d_type),
- bch2_d_type_str(inode_d_type(target)),
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
- struct bkey_i_dirent *n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- goto err;
-
- bkey_reassemble(&n->k_i, d.s_c);
- n->v.d_type = inode_d_type(target);
- if (n->v.d_type == DT_SUBVOL) {
- n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
- n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
- } else {
- n->v.d_inum = cpu_to_le64(target->bi_inum);
- }
-
- ret = bch2_trans_update(trans, dirent_iter, &n->k_i,
- BTREE_UPDATE_internal_snapshot_node);
- if (ret)
- goto err;
- }
-err:
-fsck_err:
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-/*
- * BCH_INODE_has_case_insensitive:
- * We have to track whether directories have any descendant directory that is
- * casefolded - for overlayfs:
- */
-
-static int bch2_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum)
-{
- struct btree_iter iter = {};
- int ret = 0;
-
- while (true) {
- struct bch_inode_unpacked inode;
- ret = bch2_inode_peek(trans, &iter, &inode, inum,
- BTREE_ITER_intent|BTREE_ITER_with_updates);
- if (ret)
- break;
-
- if (inode.bi_flags & BCH_INODE_has_case_insensitive)
- break;
-
- inode.bi_flags |= BCH_INODE_has_case_insensitive;
- ret = bch2_inode_write(trans, &iter, &inode);
- if (ret)
- break;
-
- bch2_trans_iter_exit(trans, &iter);
- if (subvol_inum_eq(inum, BCACHEFS_ROOT_SUBVOL_INUM))
- break;
-
- inum = parent_inum(inum, &inode);
- }
-
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum,
- struct bch_inode_unpacked *inode)
-{
- if (!bch2_inode_casefold(trans->c, inode))
- return 0;
-
- inode->bi_flags |= BCH_INODE_has_case_insensitive;
-
- return bch2_propagate_has_case_insensitive(trans, parent_inum(inum, inode));
-}
-
-int bch2_check_inode_has_case_insensitive(struct btree_trans *trans,
- struct bch_inode_unpacked *inode,
- snapshot_id_list *snapshot_overwrites,
- bool *do_update)
-{
- struct printbuf buf = PRINTBUF;
- bool repairing_parents = false;
- int ret = 0;
-
- if (!S_ISDIR(inode->bi_mode)) {
- /*
- * Old versions set bi_casefold for non dirs, but that's
- * unnecessary and wasteful
- */
- if (inode->bi_casefold) {
- inode->bi_casefold = 0;
- *do_update = true;
- }
- return 0;
- }
-
- if (trans->c->sb.version < bcachefs_metadata_version_inode_has_case_insensitive)
- return 0;
-
- if (bch2_inode_casefold(trans->c, inode) &&
- !(inode->bi_flags & BCH_INODE_has_case_insensitive)) {
- prt_printf(&buf, "casefolded dir with has_case_insensitive not set\ninum %llu:%u ",
- inode->bi_inum, inode->bi_snapshot);
-
- ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum, inode->bi_snapshot,
- snapshot_overwrites, &buf);
- if (ret)
- goto err;
-
- if (fsck_err(trans, inode_has_case_insensitive_not_set, "%s", buf.buf)) {
- inode->bi_flags |= BCH_INODE_has_case_insensitive;
- *do_update = true;
- }
- }
-
- if (!(inode->bi_flags & BCH_INODE_has_case_insensitive))
- goto out;
-
- struct bch_inode_unpacked dir = *inode;
- u32 snapshot = dir.bi_snapshot;
-
- while (!(dir.bi_inum == BCACHEFS_ROOT_INO &&
- dir.bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
- if (dir.bi_parent_subvol) {
- ret = bch2_subvolume_get_snapshot(trans, dir.bi_parent_subvol, &snapshot);
- if (ret)
- goto err;
-
- snapshot_overwrites = NULL;
- }
-
- ret = bch2_inode_find_by_inum_snapshot(trans, dir.bi_dir, snapshot, &dir, 0);
- if (ret)
- goto err;
-
- if (!(dir.bi_flags & BCH_INODE_has_case_insensitive)) {
- prt_printf(&buf, "parent of casefolded dir with has_case_insensitive not set\n");
-
- ret = bch2_inum_snapshot_to_path(trans, dir.bi_inum, dir.bi_snapshot,
- snapshot_overwrites, &buf);
- if (ret)
- goto err;
-
- if (fsck_err(trans, inode_parent_has_case_insensitive_not_set, "%s", buf.buf)) {
- dir.bi_flags |= BCH_INODE_has_case_insensitive;
- ret = __bch2_fsck_write_inode(trans, &dir);
- if (ret)
- goto err;
- }
- }
-
- /*
- * We only need to check the first parent, unless we find an
- * inconsistency
- */
- if (!repairing_parents)
- break;
- }
-out:
-err:
-fsck_err:
- printbuf_exit(&buf);
- if (ret)
- return ret;
-
- if (repairing_parents) {
- return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
- -BCH_ERR_transaction_restart_nested;
- }
-
- return 0;
-}
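
The path assembly used by __bch2_inum_to_path() above — append each name reversed while walking from the inode toward the root via the bi_dir backpointers, then reverse the whole buffer once — is easy to miss on first read. Below is a minimal standalone sketch of the same trick; the helper names are hypothetical and it uses a plain char buffer instead of a printbuf:

/* Standalone illustration of build-reversed-then-reverse path assembly. */
#include <stdio.h>
#include <string.h>

static void append_reversed(char *buf, size_t *pos, const char *s)
{
	for (size_t n = strlen(s); n--; )
		buf[(*pos)++] = s[n];
}

static void reverse_bytes(char *b, size_t n)
{
	for (char *e = b + n; b < --e; b++) {
		char tmp = *b; *b = *e; *e = tmp;
	}
}

int main(void)
{
	/* Components are discovered child-first, as when walking bi_dir backpointers */
	const char *components[] = { "file", "dir", "home" };
	char buf[64];
	size_t pos = 0;

	for (unsigned i = 0; i < 3; i++) {
		append_reversed(buf, &pos, components[i]);
		buf[pos++] = '/';
	}
	reverse_bytes(buf, pos);
	buf[pos] = '\0';

	printf("%s\n", buf);	/* prints "/home/dir/file" */
	return 0;
}

Building in reverse avoids having to know the final path length up front or repeatedly shifting already-emitted components toward the front of the buffer.
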
diff --git a/fs/bcachefs/namei.h b/fs/bcachefs/namei.h
deleted file mode 100644
index ae6ebc2d0785..000000000000
--- a/fs/bcachefs/namei.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_NAMEI_H
-#define _BCACHEFS_NAMEI_H
-
-#include "dirent.h"
-
-struct posix_acl;
-
-#define BCH_CREATE_TMPFILE (1U << 0)
-#define BCH_CREATE_SUBVOL (1U << 1)
-#define BCH_CREATE_SNAPSHOT (1U << 2)
-#define BCH_CREATE_SNAPSHOT_RO (1U << 3)
-
-int bch2_create_trans(struct btree_trans *, subvol_inum,
- struct bch_inode_unpacked *,
- struct bch_inode_unpacked *,
- const struct qstr *,
- uid_t, gid_t, umode_t, dev_t,
- struct posix_acl *,
- struct posix_acl *,
- subvol_inum, unsigned);
-
-int bch2_link_trans(struct btree_trans *,
- subvol_inum, struct bch_inode_unpacked *,
- subvol_inum, struct bch_inode_unpacked *,
- const struct qstr *);
-
-int bch2_unlink_trans(struct btree_trans *, subvol_inum,
- struct bch_inode_unpacked *,
- struct bch_inode_unpacked *,
- const struct qstr *, bool);
-
-int bch2_rename_trans(struct btree_trans *,
- subvol_inum, struct bch_inode_unpacked *,
- subvol_inum, struct bch_inode_unpacked *,
- struct bch_inode_unpacked *,
- struct bch_inode_unpacked *,
- const struct qstr *,
- const struct qstr *,
- enum bch_rename_mode);
-
-bool bch2_reinherit_attrs(struct bch_inode_unpacked *,
- struct bch_inode_unpacked *);
-
-int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *);
-int bch2_inum_snapshot_to_path(struct btree_trans *, u64, u32,
- snapshot_id_list *, struct printbuf *);
-
-int __bch2_check_dirent_target(struct btree_trans *,
- struct btree_iter *,
- struct bkey_s_c_dirent,
- struct bch_inode_unpacked *, bool);
-
-static inline bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
- struct bkey_s_c_dirent d)
-{
- return inode->bi_dir == d.k->p.inode &&
- inode->bi_dir_offset == d.k->p.offset;
-}
-
-static inline int bch2_check_dirent_target(struct btree_trans *trans,
- struct btree_iter *dirent_iter,
- struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *target,
- bool in_fsck)
-{
- if (likely(inode_points_to_dirent(target, d) &&
- d.v->d_type == inode_d_type(target)))
- return 0;
-
- return __bch2_check_dirent_target(trans, dirent_iter, d, target, in_fsck);
-}
-
-int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *, subvol_inum,
- struct bch_inode_unpacked *);
-int bch2_check_inode_has_case_insensitive(struct btree_trans *, struct bch_inode_unpacked *,
- snapshot_id_list *, bool *);
-
-#endif /* _BCACHEFS_NAMEI_H */
diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c
deleted file mode 100644
index 962218fa68ec..000000000000
--- a/fs/bcachefs/nocow_locking.c
+++ /dev/null
@@ -1,142 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_methods.h"
-#include "nocow_locking.h"
-#include "util.h"
-
-#include <linux/closure.h>
-
-bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos bucket)
-{
- u64 dev_bucket = bucket_to_u64(bucket);
- struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(l->b); i++)
- if (l->b[i] == dev_bucket && atomic_read(&l->l[i]))
- return true;
- return false;
-}
-
-#define sign(v) (v < 0 ? -1 : v > 0 ? 1 : 0)
-
-void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket, int flags)
-{
- u64 dev_bucket = bucket_to_u64(bucket);
- struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
- int lock_val = flags ? 1 : -1;
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(l->b); i++)
- if (l->b[i] == dev_bucket) {
- int v = atomic_sub_return(lock_val, &l->l[i]);
-
- BUG_ON(v && sign(v) != lock_val);
- if (!v)
- closure_wake_up(&l->wait);
- return;
- }
-
- BUG();
-}
-
-bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l,
- u64 dev_bucket, int flags)
-{
- int v, lock_val = flags ? 1 : -1;
- unsigned i;
-
- spin_lock(&l->lock);
-
- for (i = 0; i < ARRAY_SIZE(l->b); i++)
- if (l->b[i] == dev_bucket)
- goto got_entry;
-
- for (i = 0; i < ARRAY_SIZE(l->b); i++)
- if (!atomic_read(&l->l[i])) {
- l->b[i] = dev_bucket;
- goto take_lock;
- }
-fail:
- spin_unlock(&l->lock);
- return false;
-got_entry:
- v = atomic_read(&l->l[i]);
- if (lock_val > 0 ? v < 0 : v > 0)
- goto fail;
-take_lock:
- v = atomic_read(&l->l[i]);
- /* Overflow? */
- if (v && sign(v + lock_val) != sign(v))
- goto fail;
-
- atomic_add(lock_val, &l->l[i]);
- spin_unlock(&l->lock);
- return true;
-}
-
-void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
- struct nocow_lock_bucket *l,
- u64 dev_bucket, int flags)
-{
- if (!__bch2_bucket_nocow_trylock(l, dev_bucket, flags)) {
- struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks);
- u64 start_time = local_clock();
-
- __closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags));
- bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
- }
-}
-
-void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_table *t)
-
-{
- unsigned i, nr_zero = 0;
- struct nocow_lock_bucket *l;
-
- for (l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) {
- unsigned v = 0;
-
- for (i = 0; i < ARRAY_SIZE(l->l); i++)
- v |= atomic_read(&l->l[i]);
-
- if (!v) {
- nr_zero++;
- continue;
- }
-
- if (nr_zero)
- prt_printf(out, "(%u empty entries)\n", nr_zero);
- nr_zero = 0;
-
- for (i = 0; i < ARRAY_SIZE(l->l); i++) {
- int v = atomic_read(&l->l[i]);
- if (v) {
- bch2_bpos_to_text(out, u64_to_bucket(l->b[i]));
- prt_printf(out, ": %s %u ", v < 0 ? "copy" : "update", abs(v));
- }
- }
- prt_newline(out);
- }
-
- if (nr_zero)
- prt_printf(out, "(%u empty entries)\n", nr_zero);
-}
-
-void bch2_fs_nocow_locking_exit(struct bch_fs *c)
-{
- struct bucket_nocow_lock_table *t = &c->nocow_locks;
-
- for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++)
- for (unsigned j = 0; j < ARRAY_SIZE(l->l); j++)
- BUG_ON(atomic_read(&l->l[j]));
-}
-
-void bch2_fs_nocow_locking_init_early(struct bch_fs *c)
-{
- struct bucket_nocow_lock_table *t = &c->nocow_locks;
-
- for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++)
- spin_lock_init(&l->lock);
-}
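
The locking convention in the file removed above is worth spelling out: each bucket slot holds a signed count, incremented by holders that pass BUCKET_NOCOW_LOCK_UPDATE and decremented by the others, so holders of the same kind share the lock while the two kinds exclude each other. The sketch below is an illustration of that signed-counter scheme only, with no atomics or waitlists:

/*
 * Illustration of a two-sided shared lock built on a signed counter:
 * positive counts are "update" holders, negative counts are "copy" holders,
 * and the two sides exclude each other (cf. __bch2_bucket_nocow_trylock).
 */
#include <stdbool.h>
#include <stdio.h>

static bool trylock(int *count, bool update)
{
	/* Refuse if the other side already holds the lock */
	if (update ? *count < 0 : *count > 0)
		return false;

	*count += update ? 1 : -1;
	return true;
}

static void unlock(int *count, bool update)
{
	*count -= update ? 1 : -1;
}

int main(void)
{
	int count = 0;

	printf("%d\n", trylock(&count, false));	/* copy lock:    1 */
	printf("%d\n", trylock(&count, false));	/* another copy: 1 (shared) */
	printf("%d\n", trylock(&count, true));	/* update lock:  0 (blocked) */
	unlock(&count, false);
	unlock(&count, false);
	printf("%d\n", trylock(&count, true));	/* update lock:  1 */
	return 0;
}

The real __bch2_bucket_nocow_trylock() additionally refuses a lock that would overflow the counter past zero in the other direction, which is what its sign(v + lock_val) != sign(v) check is for.
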
diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h
deleted file mode 100644
index 48b8a003c0d2..000000000000
--- a/fs/bcachefs/nocow_locking.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_NOCOW_LOCKING_H
-#define _BCACHEFS_NOCOW_LOCKING_H
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "nocow_locking_types.h"
-
-#include <linux/hash.h>
-
-static inline struct nocow_lock_bucket *bucket_nocow_lock(struct bucket_nocow_lock_table *t,
- u64 dev_bucket)
-{
- unsigned h = hash_64(dev_bucket, BUCKET_NOCOW_LOCKS_BITS);
-
- return t->l + (h & (BUCKET_NOCOW_LOCKS - 1));
-}
-
-#define BUCKET_NOCOW_LOCK_UPDATE (1 << 0)
-
-bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *, struct bpos);
-void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *, struct bpos, int);
-bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *, u64, int);
-void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *,
- struct nocow_lock_bucket *, u64, int);
-
-static inline void bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
- struct bpos bucket, int flags)
-{
- u64 dev_bucket = bucket_to_u64(bucket);
- struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
-
- __bch2_bucket_nocow_lock(t, l, dev_bucket, flags);
-}
-
-static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t,
- struct bpos bucket, int flags)
-{
- u64 dev_bucket = bucket_to_u64(bucket);
- struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
-
- return __bch2_bucket_nocow_trylock(l, dev_bucket, flags);
-}
-
-void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *);
-
-void bch2_fs_nocow_locking_exit(struct bch_fs *);
-void bch2_fs_nocow_locking_init_early(struct bch_fs *);
-
-#endif /* _BCACHEFS_NOCOW_LOCKING_H */
diff --git a/fs/bcachefs/nocow_locking_types.h b/fs/bcachefs/nocow_locking_types.h
deleted file mode 100644
index bd12bf677924..000000000000
--- a/fs/bcachefs/nocow_locking_types.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_NOCOW_LOCKING_TYPES_H
-#define _BCACHEFS_NOCOW_LOCKING_TYPES_H
-
-#define BUCKET_NOCOW_LOCKS_BITS 10
-#define BUCKET_NOCOW_LOCKS (1U << BUCKET_NOCOW_LOCKS_BITS)
-
-struct nocow_lock_bucket {
- struct closure_waitlist wait;
- spinlock_t lock;
- u64 b[4];
- atomic_t l[4];
-} __aligned(SMP_CACHE_BYTES);
-
-struct bucket_nocow_lock_table {
- struct nocow_lock_bucket l[BUCKET_NOCOW_LOCKS];
-};
-
-#endif /* _BCACHEFS_NOCOW_LOCKING_TYPES_H */
-
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
deleted file mode 100644
index b1cf88905b81..000000000000
--- a/fs/bcachefs/opts.c
+++ /dev/null
@@ -1,844 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/kernel.h>
-#include <linux/fs_parser.h>
-
-#include "bcachefs.h"
-#include "compress.h"
-#include "disk_groups.h"
-#include "error.h"
-#include "movinggc.h"
-#include "opts.h"
-#include "rebalance.h"
-#include "recovery_passes.h"
-#include "super-io.h"
-#include "util.h"
-
-#define x(t, n, ...) [n] = #t,
-
-const char * const bch2_error_actions[] = {
- BCH_ERROR_ACTIONS()
- NULL
-};
-
-const char * const bch2_degraded_actions[] = {
- BCH_DEGRADED_ACTIONS()
- NULL
-};
-
-const char * const bch2_fsck_fix_opts[] = {
- BCH_FIX_ERRORS_OPTS()
- NULL
-};
-
-const char * const bch2_version_upgrade_opts[] = {
- BCH_VERSION_UPGRADE_OPTS()
- NULL
-};
-
-const char * const bch2_sb_features[] = {
- BCH_SB_FEATURES()
- NULL
-};
-
-const char * const bch2_sb_compat[] = {
- BCH_SB_COMPAT()
- NULL
-};
-
-const char * const __bch2_btree_ids[] = {
- BCH_BTREE_IDS()
- NULL
-};
-
-const char * const __bch2_csum_types[] = {
- BCH_CSUM_TYPES()
- NULL
-};
-
-const char * const __bch2_csum_opts[] = {
- BCH_CSUM_OPTS()
- NULL
-};
-
-const char * const __bch2_compression_types[] = {
- BCH_COMPRESSION_TYPES()
- NULL
-};
-
-const char * const bch2_compression_opts[] = {
- BCH_COMPRESSION_OPTS()
- NULL
-};
-
-const char * const __bch2_str_hash_types[] = {
- BCH_STR_HASH_TYPES()
- NULL
-};
-
-const char * const bch2_str_hash_opts[] = {
- BCH_STR_HASH_OPTS()
- NULL
-};
-
-const char * const __bch2_data_types[] = {
- BCH_DATA_TYPES()
- NULL
-};
-
-const char * const bch2_member_states[] = {
- BCH_MEMBER_STATES()
- NULL
-};
-
-static const char * const __bch2_jset_entry_types[] = {
- BCH_JSET_ENTRY_TYPES()
- NULL
-};
-
-static const char * const __bch2_fs_usage_types[] = {
- BCH_FS_USAGE_TYPES()
- NULL
-};
-
-#undef x
-
-static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[],
- unsigned nr, const char *type, unsigned idx)
-{
- if (idx < nr)
- prt_str(out, opts[idx]);
- else
- prt_printf(out, "(unknown %s %u)", type, idx);
-}
-
-#define PRT_STR_OPT_BOUNDSCHECKED(name, type) \
-void bch2_prt_##name(struct printbuf *out, type t) \
-{ \
- prt_str_opt_boundscheck(out, __bch2_##name##s, ARRAY_SIZE(__bch2_##name##s) - 1, #name, t);\
-}
-
-PRT_STR_OPT_BOUNDSCHECKED(jset_entry_type, enum bch_jset_entry_type);
-PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type);
-PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type);
-PRT_STR_OPT_BOUNDSCHECKED(csum_opt, enum bch_csum_opt);
-PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type);
-PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type);
-PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type);
-
-static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res,
- struct printbuf *err)
-{
- if (!val) {
- *res = FSCK_FIX_yes;
- } else {
- int ret = match_string(bch2_fsck_fix_opts, -1, val);
-
- if (ret < 0 && err)
- prt_str(err, "fix_errors: invalid selection");
- if (ret < 0)
- return ret;
- *res = ret;
- }
-
- return 0;
-}
-
-static void bch2_opt_fix_errors_to_text(struct printbuf *out,
- struct bch_fs *c,
- struct bch_sb *sb,
- u64 v)
-{
- prt_str(out, bch2_fsck_fix_opts[v]);
-}
-
-#define bch2_opt_fix_errors (struct bch_opt_fn) { \
- .parse = bch2_opt_fix_errors_parse, \
- .to_text = bch2_opt_fix_errors_to_text, \
-}
-
-const char * const bch2_d_types[BCH_DT_MAX] = {
- [DT_UNKNOWN] = "unknown",
- [DT_FIFO] = "fifo",
- [DT_CHR] = "chr",
- [DT_DIR] = "dir",
- [DT_BLK] = "blk",
- [DT_REG] = "reg",
- [DT_LNK] = "lnk",
- [DT_SOCK] = "sock",
- [DT_WHT] = "whiteout",
- [DT_SUBVOL] = "subvol",
-};
-
-void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
-{
-#define x(_name, ...) \
- if (opt_defined(src, _name)) \
- opt_set(*dst, _name, src._name);
-
- BCH_OPTS()
-#undef x
-}
-
-bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id)
-{
- switch (id) {
-#define x(_name, ...) \
- case Opt_##_name: \
- return opt_defined(*opts, _name);
- BCH_OPTS()
-#undef x
- default:
- BUG();
- }
-}
-
-u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id)
-{
- switch (id) {
-#define x(_name, ...) \
- case Opt_##_name: \
- return opts->_name;
- BCH_OPTS()
-#undef x
- default:
- BUG();
- }
-}
-
-void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v)
-{
- switch (id) {
-#define x(_name, ...) \
- case Opt_##_name: \
- opt_set(*opts, _name, v); \
- break;
- BCH_OPTS()
-#undef x
- default:
- BUG();
- }
-}
-
-/* dummy option, for options that aren't stored in the superblock */
-typedef u64 (*sb_opt_get_fn)(const struct bch_sb *);
-typedef void (*sb_opt_set_fn)(struct bch_sb *, u64);
-typedef u64 (*member_opt_get_fn)(const struct bch_member *);
-typedef void (*member_opt_set_fn)(struct bch_member *, u64);
-
-__maybe_unused static const sb_opt_get_fn BCH2_NO_SB_OPT = NULL;
-__maybe_unused static const sb_opt_set_fn SET_BCH2_NO_SB_OPT = NULL;
-__maybe_unused static const member_opt_get_fn BCH2_NO_MEMBER_OPT = NULL;
-__maybe_unused static const member_opt_set_fn SET_BCH2_NO_MEMBER_OPT = NULL;
-
-#define type_compatible_or_null(_p, _type) \
- __builtin_choose_expr( \
- __builtin_types_compatible_p(typeof(_p), typeof(_type)), _p, NULL)
-
-const struct bch_option bch2_opt_table[] = {
-#define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2
-#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \
- .min = _min, .max = _max
-#define OPT_STR(_choices) .type = BCH_OPT_STR, \
- .min = 0, .max = ARRAY_SIZE(_choices) - 1, \
- .choices = _choices
-#define OPT_STR_NOLIMIT(_choices) .type = BCH_OPT_STR, \
- .min = 0, .max = U64_MAX, \
- .choices = _choices
-#define OPT_BITFIELD(_choices) .type = BCH_OPT_BITFIELD, \
- .choices = _choices
-#define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn
-
-#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \
- [Opt_##_name] = { \
- .attr.name = #_name, \
- .attr.mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \
- .flags = _flags, \
- .hint = _hint, \
- .help = _help, \
- .get_sb = type_compatible_or_null(_sb_opt, *BCH2_NO_SB_OPT), \
- .set_sb = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_SB_OPT), \
- .get_member = type_compatible_or_null(_sb_opt, *BCH2_NO_MEMBER_OPT), \
- .set_member = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_MEMBER_OPT),\
- _type \
- },
-
- BCH_OPTS()
-#undef x
-};
-
-int bch2_opt_lookup(const char *name)
-{
- const struct bch_option *i;
-
- for (i = bch2_opt_table;
- i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table);
- i++)
- if (!strcmp(name, i->attr.name))
- return i - bch2_opt_table;
-
- return -1;
-}
-
-struct opt_synonym {
- const char *s1, *s2;
-};
-
-static const struct opt_synonym bch2_opt_synonyms[] = {
- { "quota", "usrquota" },
-};
-
-static int bch2_mount_opt_lookup(const char *name)
-{
- const struct opt_synonym *i;
-
- for (i = bch2_opt_synonyms;
- i < bch2_opt_synonyms + ARRAY_SIZE(bch2_opt_synonyms);
- i++)
- if (!strcmp(name, i->s1))
- name = i->s2;
-
- return bch2_opt_lookup(name);
-}
-
-struct opt_val_synonym {
- const char *opt, *v1, *v2;
-};
-
-static const struct opt_val_synonym bch2_opt_val_synonyms[] = {
- { "degraded", "true", "yes" },
- { "degraded", "false", "no" },
- { "degraded", "1", "yes" },
- { "degraded", "0", "no" },
-};
-
-static const char *bch2_opt_val_synonym_lookup(const char *opt, const char *val)
-{
- const struct opt_val_synonym *i;
-
- for (i = bch2_opt_val_synonyms;
- i < bch2_opt_val_synonyms + ARRAY_SIZE(bch2_opt_val_synonyms);
- i++)
- if (!strcmp(opt, i->opt) && !strcmp(val, i->v1))
- return i->v2;
-
- return val;
-}
-
-int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err)
-{
- if (v < opt->min) {
- if (err)
- prt_printf(err, "%s: too small (min %llu)",
- opt->attr.name, opt->min);
- return -BCH_ERR_ERANGE_option_too_small;
- }
-
- if (opt->max && v >= opt->max) {
- if (err)
- prt_printf(err, "%s: too big (max %llu)",
- opt->attr.name, opt->max);
- return -BCH_ERR_ERANGE_option_too_big;
- }
-
- if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) {
- if (err)
- prt_printf(err, "%s: not a multiple of 512",
- opt->attr.name);
- return -BCH_ERR_opt_parse_error;
- }
-
- if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) {
- if (err)
- prt_printf(err, "%s: must be a power of two",
- opt->attr.name);
- return -BCH_ERR_opt_parse_error;
- }
-
- if (opt->fn.validate)
- return opt->fn.validate(v, err);
-
- return 0;
-}
-
-int bch2_opt_parse(struct bch_fs *c,
- const struct bch_option *opt,
- const char *val, u64 *res,
- struct printbuf *err)
-{
- ssize_t ret;
-
- if (err)
- printbuf_indent_add_nextline(err, 2);
-
- switch (opt->type) {
- case BCH_OPT_BOOL:
- if (!val)
- val = "1";
-
- ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool);
- if (ret != -BCH_ERR_option_not_bool) {
- *res = ret;
- } else {
- if (err)
- prt_printf(err, "%s: must be bool", opt->attr.name);
- return ret;
- }
- break;
- case BCH_OPT_UINT:
- if (!val) {
- prt_printf(err, "%s: required value",
- opt->attr.name);
- return -EINVAL;
- }
-
- if (*val != '-') {
- ret = opt->flags & OPT_HUMAN_READABLE
- ? bch2_strtou64_h(val, res)
- : kstrtou64(val, 10, res);
- } else {
- prt_printf(err, "%s: must be a non-negative number", opt->attr.name);
- return -BCH_ERR_option_negative;
- }
-
- if (ret < 0) {
- if (err)
- prt_printf(err, "%s: must be a number",
- opt->attr.name);
- return ret;
- }
- break;
- case BCH_OPT_STR:
- if (!val) {
- prt_printf(err, "%s: required value",
- opt->attr.name);
- return -EINVAL;
- }
-
- ret = match_string(opt->choices, -1, val);
- if (ret < 0) {
- if (err)
- prt_printf(err, "%s: invalid selection",
- opt->attr.name);
- return ret;
- }
-
- *res = ret;
- break;
- case BCH_OPT_BITFIELD: {
- s64 v = bch2_read_flag_list(val, opt->choices);
- if (v < 0)
- return v;
- *res = v;
- break;
- }
- case BCH_OPT_FN:
- ret = opt->fn.parse(c, val, res, err);
-
- if (ret == -BCH_ERR_option_needs_open_fs)
- return ret;
-
- if (ret < 0) {
- if (err)
- prt_printf(err, "%s: parse error",
- opt->attr.name);
- return ret;
- }
- }
-
- return bch2_opt_validate(opt, *res, err);
-}
-
-void bch2_opt_to_text(struct printbuf *out,
- struct bch_fs *c, struct bch_sb *sb,
- const struct bch_option *opt, u64 v,
- unsigned flags)
-{
- if (flags & OPT_SHOW_MOUNT_STYLE) {
- if (opt->type == BCH_OPT_BOOL) {
- prt_printf(out, "%s%s",
- v ? "" : "no",
- opt->attr.name);
- return;
- }
-
- prt_printf(out, "%s=", opt->attr.name);
- }
-
- switch (opt->type) {
- case BCH_OPT_BOOL:
- case BCH_OPT_UINT:
- if (opt->flags & OPT_HUMAN_READABLE)
- prt_human_readable_u64(out, v);
- else
- prt_printf(out, "%lli", v);
- break;
- case BCH_OPT_STR:
- if (v < opt->min || v >= opt->max)
- prt_printf(out, "(invalid option %lli)", v);
- else if (flags & OPT_SHOW_FULL_LIST)
- prt_string_option(out, opt->choices, v);
- else
- prt_str(out, opt->choices[v]);
- break;
- case BCH_OPT_BITFIELD:
- prt_bitflags(out, opt->choices, v);
- break;
- case BCH_OPT_FN:
- opt->fn.to_text(out, c, sb, v);
- break;
- default:
- BUG();
- }
-}
-
-void bch2_opts_to_text(struct printbuf *out,
- struct bch_opts opts,
- struct bch_fs *c, struct bch_sb *sb,
- unsigned show_mask, unsigned hide_mask,
- unsigned flags)
-{
- bool first = true;
-
- for (enum bch_opt_id i = 0; i < bch2_opts_nr; i++) {
- const struct bch_option *opt = &bch2_opt_table[i];
-
- if ((opt->flags & hide_mask) || !(opt->flags & show_mask))
- continue;
-
- u64 v = bch2_opt_get_by_id(&opts, i);
- if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
- continue;
-
- if (!first)
- prt_char(out, ',');
- first = false;
-
- bch2_opt_to_text(out, c, sb, opt, v, flags);
- }
-}
-
-int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, enum bch_opt_id id, u64 v)
-{
- int ret = 0;
-
- switch (id) {
- case Opt_state:
- if (ca)
- return bch2_dev_set_state(c, ca, v, BCH_FORCE_IF_DEGRADED);
- break;
-
- case Opt_compression:
- case Opt_background_compression:
- ret = bch2_check_set_has_compressed_data(c, v);
- break;
- case Opt_erasure_code:
- if (v)
- bch2_check_set_feature(c, BCH_FEATURE_ec);
- break;
- default:
- break;
- }
-
- return ret;
-}
-
-int bch2_opts_hooks_pre_set(struct bch_fs *c)
-{
- for (unsigned i = 0; i < bch2_opts_nr; i++) {
- int ret = bch2_opt_hook_pre_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i));
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum,
- struct bch_opts *new_opts, enum bch_opt_id id)
-{
- switch (id) {
- case Opt_foreground_target:
- if (new_opts->foreground_target &&
- !new_opts->background_target)
- bch2_set_rebalance_needs_scan(c, inum);
- break;
- case Opt_compression:
- if (new_opts->compression &&
- !new_opts->background_compression)
- bch2_set_rebalance_needs_scan(c, inum);
- break;
- case Opt_background_target:
- if (new_opts->background_target)
- bch2_set_rebalance_needs_scan(c, inum);
- break;
- case Opt_background_compression:
- if (new_opts->background_compression)
- bch2_set_rebalance_needs_scan(c, inum);
- break;
- case Opt_rebalance_enabled:
- bch2_rebalance_wakeup(c);
- break;
- case Opt_copygc_enabled:
- bch2_copygc_wakeup(c);
- break;
- case Opt_discard:
- if (!ca) {
- mutex_lock(&c->sb_lock);
- for_each_member_device(c, ca) {
- struct bch_member *m =
- bch2_members_v2_get_mut(ca->disk_sb.sb, ca->dev_idx);
- SET_BCH_MEMBER_DISCARD(m, c->opts.discard);
- }
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- }
- break;
- case Opt_version_upgrade:
- /*
- * XXX: in the future we'll likely want to do compatible
- * upgrades at runtime as well, but right now there's nothing
- * that does that:
- */
- if (new_opts->version_upgrade == BCH_VERSION_UPGRADE_incompatible)
- bch2_sb_upgrade_incompat(c);
- break;
- default:
- break;
- }
-}
-
-int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
- struct printbuf *parse_later,
- const char *name, const char *val)
-{
- struct printbuf err = PRINTBUF;
- u64 v;
- int ret, id;
-
- id = bch2_mount_opt_lookup(name);
-
- /* Check for the form "noopt", negation of a boolean opt: */
- if (id < 0 &&
- !val &&
- !strncmp("no", name, 2)) {
- id = bch2_mount_opt_lookup(name + 2);
- val = "0";
- }
-
- /* Unknown options are ignored: */
- if (id < 0)
- return 0;
-
- /* must have a value for synonym lookup - but OPT_FN is weird */
- if (!val && bch2_opt_table[id].type != BCH_OPT_FN)
- val = "1";
-
- val = bch2_opt_val_synonym_lookup(name, val);
-
- if (!(bch2_opt_table[id].flags & OPT_MOUNT))
- goto bad_opt;
-
- if (id == Opt_acl &&
- !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL))
- goto bad_opt;
-
- if ((id == Opt_usrquota ||
- id == Opt_grpquota) &&
- !IS_ENABLED(CONFIG_BCACHEFS_QUOTA))
- goto bad_opt;
-
- ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err);
- if (ret == -BCH_ERR_option_needs_open_fs) {
- ret = 0;
-
- if (parse_later) {
- prt_printf(parse_later, "%s=%s,", name, val);
- if (parse_later->allocation_failure)
- ret = -ENOMEM;
- }
-
- goto out;
- }
-
- if (ret < 0)
- goto bad_val;
-
- if (opts)
- bch2_opt_set_by_id(opts, id, v);
-
- ret = 0;
-out:
- printbuf_exit(&err);
- return ret;
-bad_opt:
- ret = -BCH_ERR_option_name;
- goto out;
-bad_val:
- ret = -BCH_ERR_option_value;
- goto out;
-}
-
-int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
- struct printbuf *parse_later, char *options,
- bool ignore_unknown)
-{
- char *copied_opts, *copied_opts_start;
- char *opt, *name, *val;
- int ret = 0;
-
- if (!options)
- return 0;
-
- /*
- * sys_fsconfig() is now occasionally providing us with option lists
- * starting with a comma - weird.
- */
- if (*options == ',')
- options++;
-
- copied_opts = kstrdup(options, GFP_KERNEL);
- if (!copied_opts)
- return -ENOMEM;
- copied_opts_start = copied_opts;
-
- while ((opt = strsep(&copied_opts, ",")) != NULL) {
- if (!*opt)
- continue;
-
- name = strsep(&opt, "=");
- val = opt;
-
- ret = bch2_parse_one_mount_opt(c, opts, parse_later, name, val);
- if (ret == -BCH_ERR_option_name && ignore_unknown)
- ret = 0;
- if (ret) {
- pr_err("Error parsing option %s: %s", name, bch2_err_str(ret));
- break;
- }
- }
-
- kfree(copied_opts_start);
- return ret;
-}
-
-u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id, int dev_idx)
-{
- const struct bch_option *opt = bch2_opt_table + id;
- u64 v;
-
- if (dev_idx < 0) {
- v = opt->get_sb(sb);
- } else {
- if (WARN(!bch2_member_exists(sb, dev_idx),
- "tried to set device option %s on nonexistent device %i",
- opt->attr.name, dev_idx))
- return 0;
-
- struct bch_member m = bch2_sb_member_get(sb, dev_idx);
- v = opt->get_member(&m);
- }
-
- if (opt->flags & OPT_SB_FIELD_ONE_BIAS)
- --v;
-
- if (opt->flags & OPT_SB_FIELD_ILOG2)
- v = 1ULL << v;
-
- if (opt->flags & OPT_SB_FIELD_SECTORS)
- v <<= 9;
-
- return v;
-}
-
-/*
- * Initial options from superblock - here we don't want any options undefined,
- * any options the superblock doesn't specify are set to 0:
- */
-int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb)
-{
- for (unsigned id = 0; id < bch2_opts_nr; id++) {
- const struct bch_option *opt = bch2_opt_table + id;
-
- if (opt->get_sb)
- bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id, -1));
- }
-
- return 0;
-}
-
-bool __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx,
- const struct bch_option *opt, u64 v)
-{
- bool changed = false;
-
- if (opt->flags & OPT_SB_FIELD_SECTORS)
- v >>= 9;
-
- if (opt->flags & OPT_SB_FIELD_ILOG2)
- v = ilog2(v);
-
- if (opt->flags & OPT_SB_FIELD_ONE_BIAS)
- v++;
-
- if ((opt->flags & OPT_FS) && opt->set_sb && dev_idx < 0) {
- changed = v != opt->get_sb(sb);
-
- opt->set_sb(sb, v);
- }
-
- if ((opt->flags & OPT_DEVICE) && opt->set_member && dev_idx >= 0) {
- if (WARN(!bch2_member_exists(sb, dev_idx),
- "tried to set device option %s on nonexistent device %i",
- opt->attr.name, dev_idx))
- return false;
-
- struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx);
- changed = v != opt->get_member(m);
- opt->set_member(m, v);
- }
-
- return changed;
-}
-
-bool bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca,
- const struct bch_option *opt, u64 v)
-{
- mutex_lock(&c->sb_lock);
- bool changed = __bch2_opt_set_sb(c->disk_sb.sb, ca ? ca->dev_idx : -1, opt, v);
- if (changed)
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- return changed;
-}
-
-/* io opts: */
-
-struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
-{
- struct bch_io_opts opts = {
-#define x(_name, _bits) ._name = src._name,
- BCH_INODE_OPTS()
-#undef x
- };
-
- bch2_io_opts_fixups(&opts);
- return opts;
-}
-
-bool bch2_opt_is_inode_opt(enum bch_opt_id id)
-{
- static const enum bch_opt_id inode_opt_list[] = {
-#define x(_name, _bits) Opt_##_name,
- BCH_INODE_OPTS()
-#undef x
- };
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++)
- if (inode_opt_list[i] == id)
- return true;
-
- return false;
-}
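
bch2_opt_from_sb() and __bch2_opt_set_sb() above apply inverse transforms whenever an option's on-disk encoding differs from its in-memory value (OPT_SB_FIELD_SECTORS, OPT_SB_FIELD_ILOG2, OPT_SB_FIELD_ONE_BIAS). The round-trip below is an illustrative sketch that treats each encoding in isolation — the real code can combine them on one field and uses the kernel's ilog2() rather than the __builtin_clzll() stand-in:

/*
 * Illustration of the three superblock field encodings used above:
 * SECTORS  - stored as a sector count (value >> 9)
 * ILOG2    - stored as log2 of the value
 * ONE_BIAS - stored as value + 1, so 0 can mean "unset/default"
 */
#include <stdint.h>
#include <stdio.h>

enum { FIELD_SECTORS, FIELD_ILOG2, FIELD_ONE_BIAS };

static uint64_t encode(int kind, uint64_t v)
{
	switch (kind) {
	case FIELD_SECTORS:	return v >> 9;
	case FIELD_ILOG2:	return 63 - __builtin_clzll(v);	/* ilog2 */
	case FIELD_ONE_BIAS:	return v + 1;
	}
	return v;
}

static uint64_t decode(int kind, uint64_t v)
{
	switch (kind) {
	case FIELD_SECTORS:	return v << 9;
	case FIELD_ILOG2:	return 1ULL << v;
	case FIELD_ONE_BIAS:	return v - 1;
	}
	return v;
}

int main(void)
{
	uint64_t values[] = { 1 << 20, 4096, 7 };
	int kinds[]       = { FIELD_SECTORS, FIELD_ILOG2, FIELD_ONE_BIAS };

	for (int i = 0; i < 3; i++) {
		uint64_t sb = encode(kinds[i], values[i]);
		printf("value %llu -> sb field %llu -> back %llu\n",
		       (unsigned long long)values[i],
		       (unsigned long long)sb,
		       (unsigned long long)decode(kinds[i], sb));
	}
	return 0;
}
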
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
deleted file mode 100644
index 63f8e254495c..000000000000
--- a/fs/bcachefs/opts.h
+++ /dev/null
@@ -1,693 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_OPTS_H
-#define _BCACHEFS_OPTS_H
-
-#include <linux/bug.h>
-#include <linux/log2.h>
-#include <linux/string.h>
-#include <linux/sysfs.h>
-#include "bcachefs_format.h"
-
-struct bch_fs;
-
-extern const char * const bch2_error_actions[];
-extern const char * const bch2_degraded_actions[];
-extern const char * const bch2_fsck_fix_opts[];
-extern const char * const bch2_version_upgrade_opts[];
-extern const char * const bch2_sb_features[];
-extern const char * const bch2_sb_compat[];
-extern const char * const __bch2_btree_ids[];
-extern const char * const __bch2_csum_types[];
-extern const char * const __bch2_csum_opts[];
-extern const char * const __bch2_compression_types[];
-extern const char * const bch2_compression_opts[];
-extern const char * const __bch2_str_hash_types[];
-extern const char * const bch2_str_hash_opts[];
-extern const char * const __bch2_data_types[];
-extern const char * const bch2_member_states[];
-extern const char * const bch2_d_types[];
-
-void bch2_prt_jset_entry_type(struct printbuf *, enum bch_jset_entry_type);
-void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type);
-void bch2_prt_data_type(struct printbuf *, enum bch_data_type);
-void bch2_prt_csum_opt(struct printbuf *, enum bch_csum_opt);
-void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type);
-void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type);
-void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type);
-
-static inline const char *bch2_d_type_str(unsigned d_type)
-{
- return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)";
-}
-
-/*
- * Mount options; we also store defaults in the superblock.
- *
- * Also exposed via sysfs: if an option is writeable, and it's also stored in
- * the superblock, changing it via sysfs (currently? might change this) also
- * updates the superblock.
- *
- * We store options as signed integers, where -1 means undefined. This means we
- * can pass the mount options to bch2_fs_alloc() as a whole struct, and then only
- * apply the options from that struct that are defined.
- */
-
-/* When can be set: */
-enum opt_flags {
- OPT_FS = BIT(0), /* Filesystem option */
- OPT_DEVICE = BIT(1), /* Device option */
- OPT_INODE = BIT(2), /* Inode option */
- OPT_FORMAT = BIT(3), /* May be specified at format time */
- OPT_MOUNT = BIT(4), /* May be specified at mount time */
- OPT_RUNTIME = BIT(5), /* May be specified at runtime */
- OPT_HUMAN_READABLE = BIT(6),
- OPT_MUST_BE_POW_2 = BIT(7), /* Must be power of 2 */
- OPT_SB_FIELD_SECTORS = BIT(8), /* Superblock field is >> 9 of actual value */
- OPT_SB_FIELD_ILOG2 = BIT(9), /* Superblock field is ilog2 of actual value */
- OPT_SB_FIELD_ONE_BIAS = BIT(10), /* 0 means default value */
- OPT_HIDDEN = BIT(11),
-};
-
-enum opt_type {
- BCH_OPT_BOOL,
- BCH_OPT_UINT,
- BCH_OPT_STR,
- BCH_OPT_BITFIELD,
- BCH_OPT_FN,
-};
-
-struct bch_opt_fn {
- int (*parse)(struct bch_fs *, const char *, u64 *, struct printbuf *);
- void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
- int (*validate)(u64, struct printbuf *);
-};
-
-/**
- * x(name, shortopt, type, in mem type, mode, sb_opt)
- *
- * @name - name of mount option, sysfs attribute, and struct bch_opts
- * member
- *
- * @mode - when opt may be set
- *
- * @sb_opt - name of corresponding superblock option
- *
- * @type - one of OPT_BOOL, OPT_UINT, OPT_STR, OPT_BITFIELD, OPT_FN
- */
-
-/*
- * XXX: add fields for
- * - default value
- * - helptext
- */
-
-#ifdef __KERNEL__
-#define RATELIMIT_ERRORS_DEFAULT true
-#else
-#define RATELIMIT_ERRORS_DEFAULT false
-#endif
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define BCACHEFS_VERBOSE_DEFAULT true
-#else
-#define BCACHEFS_VERBOSE_DEFAULT false
-#endif
-
-#define BCH_FIX_ERRORS_OPTS() \
- x(exit, 0) \
- x(yes, 1) \
- x(no, 2) \
- x(ask, 3)
-
-enum fsck_err_opts {
-#define x(t, n) FSCK_FIX_##t,
- BCH_FIX_ERRORS_OPTS()
-#undef x
-};
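
For illustration, the x() macro above expands once per BCH_FIX_ERRORS_OPTS() entry, so the enum is equivalent to:

enum fsck_err_opts {
	FSCK_FIX_exit,	/* 0 */
	FSCK_FIX_yes,	/* 1 */
	FSCK_FIX_no,	/* 2 */
	FSCK_FIX_ask,	/* 3 */
};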
-
-#define BCH_OPTS() \
- x(block_size, u16, \
- OPT_FS|OPT_FORMAT| \
- OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \
- OPT_UINT(512, 1U << 16), \
- BCH_SB_BLOCK_SIZE, 4 << 10, \
- "size", NULL) \
- x(btree_node_size, u32, \
- OPT_FS|OPT_FORMAT| \
- OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \
- OPT_UINT(512, 1U << 20), \
- BCH_SB_BTREE_NODE_SIZE, 256 << 10, \
- "size", "Btree node size, default 256k") \
- x(errors, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_STR(bch2_error_actions), \
- BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \
- NULL, "Action to take on filesystem error") \
- x(write_error_timeout, u16, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(1, 300), \
- BCH_SB_WRITE_ERROR_TIMEOUT, 30, \
- NULL, "Number of consecutive write errors allowed before kicking out a device")\
- x(metadata_replicas, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(1, BCH_REPLICAS_MAX), \
- BCH_SB_META_REPLICAS_WANT, 1, \
- "#", "Number of metadata replicas") \
- x(data_replicas, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(1, BCH_REPLICAS_MAX), \
- BCH_SB_DATA_REPLICAS_WANT, 1, \
- "#", "Number of data replicas") \
- x(metadata_replicas_required, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT, \
- OPT_UINT(1, BCH_REPLICAS_MAX), \
- BCH_SB_META_REPLICAS_REQ, 1, \
- "#", NULL) \
- x(data_replicas_required, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT, \
- OPT_UINT(1, BCH_REPLICAS_MAX), \
- BCH_SB_DATA_REPLICAS_REQ, 1, \
- "#", NULL) \
- x(encoded_extent_max, u32, \
- OPT_FS|OPT_FORMAT| \
- OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS|OPT_SB_FIELD_ILOG2,\
- OPT_UINT(4096, 2U << 20), \
- BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10, \
- "size", "Maximum size of checksummed/compressed extents")\
- x(metadata_checksum, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_STR(__bch2_csum_opts), \
- BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
- NULL, NULL) \
- x(data_checksum, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_STR(__bch2_csum_opts), \
- BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
- NULL, NULL) \
- x(checksum_err_retry_nr, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(0, 32), \
- BCH_SB_CSUM_ERR_RETRY_NR, 3, \
- NULL, NULL) \
- x(compression, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_FN(bch2_opt_compression), \
- BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \
- NULL, NULL) \
- x(background_compression, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_FN(bch2_opt_compression), \
- BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \
- NULL, NULL) \
- x(str_hash, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_STR(bch2_str_hash_opts), \
- BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \
- NULL, "Hash function for directory entries and xattrs")\
- x(metadata_target, u16, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_FN(bch2_opt_target), \
- BCH_SB_METADATA_TARGET, 0, \
- "(target)", "Device or label for metadata writes") \
- x(foreground_target, u16, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_FN(bch2_opt_target), \
- BCH_SB_FOREGROUND_TARGET, 0, \
- "(target)", "Device or label for foreground writes") \
- x(background_target, u16, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_FN(bch2_opt_target), \
- BCH_SB_BACKGROUND_TARGET, 0, \
- "(target)", "Device or label to move data to in the background")\
- x(promote_target, u16, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_FN(bch2_opt_target), \
- BCH_SB_PROMOTE_TARGET, 0, \
- "(target)", "Device or label to promote data to on read") \
- x(erasure_code, u16, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_SB_ERASURE_CODE, false, \
- NULL, "Enable erasure coding (DO NOT USE YET)") \
- x(casefold, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT, \
- OPT_BOOL(), \
- BCH_SB_CASEFOLD, false, \
- NULL, "Dirent lookups are casefolded") \
- x(casefold_disabled, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Disable casefolding filesystem wide") \
- x(inodes_32bit, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_SB_INODE_32BIT, true, \
- NULL, "Constrain inode numbers to 32 bits") \
- x(shard_inode_numbers_bits, u8, \
- OPT_FS|OPT_FORMAT, \
- OPT_UINT(0, 8), \
- BCH_SB_SHARD_INUMS_NBITS, 0, \
- NULL, "Shard new inode numbers by CPU id") \
- x(inodes_use_key_cache, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH_SB_INODES_USE_KEY_CACHE, true, \
- NULL, "Use the btree key cache for the inodes btree") \
- x(btree_node_mem_ptr_optimization, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, true, \
- NULL, "Stash pointer to in memory btree node in btree ptr")\
- x(gc_reserve_percent, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(5, 21), \
- BCH_SB_GC_RESERVE, 8, \
- "%", "Percentage of disk space to reserve for copygc")\
- x(gc_reserve_bytes, u64, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME| \
- OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \
- OPT_UINT(0, U64_MAX), \
- BCH_SB_GC_RESERVE_BYTES, 0, \
- "%", "Amount of disk space to reserve for copygc\n" \
- "Takes precedence over gc_reserve_percent if set")\
- x(root_reserve_percent, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT, \
- OPT_UINT(0, 100), \
- BCH_SB_ROOT_RESERVE, 0, \
- "%", "Percentage of disk space to reserve for superuser")\
- x(wide_macs, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_SB_128_BIT_MACS, false, \
- NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\
- x(inline_data, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, true, \
- NULL, "Enable inline data extents") \
- x(promote_whole_extents, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_SB_PROMOTE_WHOLE_EXTENTS, true, \
- NULL, "Promote whole extents, instead of just part being read")\
- x(acl, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH_SB_POSIX_ACL, true, \
- NULL, "Enable POSIX acls") \
- x(usrquota, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH_SB_USRQUOTA, false, \
- NULL, "Enable user quotas") \
- x(grpquota, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH_SB_GRPQUOTA, false, \
- NULL, "Enable group quotas") \
- x(prjquota, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH_SB_PRJQUOTA, false, \
- NULL, "Enable project quotas") \
- x(degraded, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_STR(bch2_degraded_actions), \
- BCH_SB_DEGRADED_ACTION, BCH_DEGRADED_ask, \
- NULL, "Allow mounting in degraded mode") \
- x(no_splitbrain_check, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Don't kick drives out when splitbrain detected")\
- x(verbose, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, BCACHEFS_VERBOSE_DEFAULT, \
- NULL, "Extra debugging information during mount/recovery")\
- x(journal_flush_delay, u32, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(1, U32_MAX), \
- BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \
- NULL, "Delay in milliseconds before automatic journal commits")\
- x(journal_flush_disabled, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_SB_JOURNAL_FLUSH_DISABLED,false, \
- NULL, "Disable journal flush on sync/fsync\n" \
- "If enabled, writes can be lost, but only since the\n"\
- "last journal write (default 1 second)") \
- x(journal_reclaim_delay, u32, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(0, U32_MAX), \
- BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \
- NULL, "Delay in milliseconds before automatic journal reclaim")\
- x(move_bytes_in_flight, u32, \
- OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(1024, U32_MAX), \
- BCH2_NO_SB_OPT, 1U << 20, \
-	  NULL,	"Maximum amount of IO to keep in flight by the move path")\
- x(move_ios_in_flight, u32, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(1, 1024), \
- BCH2_NO_SB_OPT, 32, \
- NULL, "Maximum number of IOs to keep in flight by the move path")\
- x(fsck, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Run fsck on mount") \
- x(fsck_memory_usage_percent, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_UINT(20, 70), \
- BCH2_NO_SB_OPT, 50, \
- NULL, "Maximum percentage of system ram fsck is allowed to pin")\
- x(fix_errors, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_FN(bch2_opt_fix_errors), \
- BCH2_NO_SB_OPT, FSCK_FIX_exit, \
- NULL, "Fix errors during fsck without asking") \
- x(ratelimit_errors, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \
- NULL, "Ratelimit error messages during fsck") \
- x(nochanges, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Super read only mode - no writes at all will be issued,\n"\
- "even if we have to replay the journal") \
- x(norecovery, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Exit recovery immediately prior to journal replay")\
- x(journal_rewind, u64, \
- OPT_FS|OPT_MOUNT, \
- OPT_UINT(0, U64_MAX), \
- BCH2_NO_SB_OPT, 0, \
- NULL, "Rewind journal") \
- x(recovery_passes, u64, \
- OPT_FS|OPT_MOUNT, \
- OPT_BITFIELD(bch2_recovery_passes), \
- BCH2_NO_SB_OPT, 0, \
- NULL, "Recovery passes to run explicitly") \
- x(recovery_passes_exclude, u64, \
- OPT_FS|OPT_MOUNT, \
- OPT_BITFIELD(bch2_recovery_passes), \
- BCH2_NO_SB_OPT, 0, \
- NULL, "Recovery passes to exclude") \
- x(recovery_pass_last, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_STR_NOLIMIT(bch2_recovery_passes), \
- BCH2_NO_SB_OPT, 0, \
- NULL, "Exit recovery after specified pass") \
- x(retain_recovery_info, u8, \
- 0, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Don't free journal entries/keys, scanned btree nodes after startup")\
- x(read_entire_journal, u8, \
- 0, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Read all journal entries, not just dirty ones")\
- x(read_journal_only, u8, \
- 0, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Only read the journal, skip the rest of recovery")\
- x(journal_transaction_names, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \
- NULL, "Log transaction function names in journal") \
- x(allocator_stuck_timeout, u16, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(0, U16_MAX), \
- BCH_SB_ALLOCATOR_STUCK_TIMEOUT, 30, \
- NULL, "Default timeout in seconds for stuck allocator messages")\
- x(noexcl, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Don't open device in exclusive mode") \
- x(direct_io, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, true, \
- NULL, "Use O_DIRECT (userspace only)") \
- x(sb, u64, \
- OPT_MOUNT, \
- OPT_UINT(0, S64_MAX), \
- BCH2_NO_SB_OPT, BCH_SB_SECTOR, \
- "offset", "Sector offset of superblock") \
- x(read_only, u8, \
- OPT_FS|OPT_MOUNT|OPT_HIDDEN, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, NULL) \
- x(nostart, u8, \
- 0, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Don\'t start filesystem, only open devices") \
- x(reconstruct_alloc, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Reconstruct alloc btree") \
- x(version_upgrade, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_STR(bch2_version_upgrade_opts), \
- BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \
- NULL, "Set superblock to latest version,\n" \
- "allowing any new features to be used") \
- x(stdio, u64, \
- 0, \
- OPT_UINT(0, S64_MAX), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Pointer to a struct stdio_redirect") \
- x(project, u8, \
- OPT_INODE, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, NULL) \
- x(nocow, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
- OPT_BOOL(), \
- BCH_SB_NOCOW, false, \
- NULL, "Nocow mode: Writes will be done in place when possible.\n"\
-		"Snapshots and reflink will still cause writes to be COW\n"\
- "Implicitly disables data checksumming, compression and encryption")\
- x(nocow_enabled, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, true, \
- NULL, "Enable nocow mode: enables runtime locking in\n"\
-		"the data move path, needed if nocow will ever be in use\n")\
- x(copygc_enabled, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, true, \
- NULL, "Enable copygc: disable for debugging, or to\n"\
- "quiet the system when doing performance testing\n")\
- x(rebalance_enabled, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, true, \
- NULL, "Enable rebalance: disable for debugging, or to\n"\
- "quiet the system when doing performance testing\n")\
- x(rebalance_on_ac_only, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_SB_REBALANCE_AC_ONLY, false, \
- NULL, "Enable rebalance while on mains power only\n") \
- x(auto_snapshot_deletion, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, true, \
- NULL, "Enable automatic snapshot deletion: disable for debugging, or to\n"\
- "quiet the system when doing performance testing\n")\
- x(no_data_io, u8, \
- OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Skip submit_bio() for data reads and writes, " \
- "for performance testing purposes") \
- x(state, u64, \
- OPT_DEVICE|OPT_RUNTIME, \
- OPT_STR(bch2_member_states), \
- BCH_MEMBER_STATE, BCH_MEMBER_STATE_rw, \
- "state", "rw,ro,failed,spare") \
- x(bucket_size, u32, \
- OPT_DEVICE|OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \
- OPT_UINT(0, S64_MAX), \
- BCH_MEMBER_BUCKET_SIZE, 0, \
- "size", "Specifies the bucket size; must be greater than the btree node size")\
- x(durability, u8, \
- OPT_DEVICE|OPT_RUNTIME|OPT_SB_FIELD_ONE_BIAS, \
- OPT_UINT(0, BCH_REPLICAS_MAX), \
- BCH_MEMBER_DURABILITY, 1, \
- "n", "Data written to this device will be considered\n"\
- "to have already been replicated n times") \
- x(data_allowed, u8, \
- OPT_DEVICE, \
- OPT_BITFIELD(__bch2_data_types), \
- BCH_MEMBER_DATA_ALLOWED, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\
- "types", "Allowed data types for this device: journal, btree, and/or user")\
- x(discard, u8, \
- OPT_MOUNT|OPT_FS|OPT_DEVICE|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_MEMBER_DISCARD, true, \
- NULL, "Enable discard/TRIM support") \
- x(btree_node_prefetch, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, true, \
- NULL, "BTREE_ITER_prefetch causes btree nodes to be\n"\
- " prefetched sequentially")
-
-struct bch_opts {
-#define x(_name, _bits, ...) unsigned _name##_defined:1;
- BCH_OPTS()
-#undef x
-
-#define x(_name, _bits, ...) _bits _name;
- BCH_OPTS()
-#undef x
-};
-
-struct bch2_opts_parse {
- struct bch_opts opts;
-
- /* to save opts that can't be parsed before the FS is opened: */
- struct printbuf parse_later;
-};
-
-static const __maybe_unused struct bch_opts bch2_opts_default = {
-#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \
- ._name##_defined = true, \
- ._name = _default, \
-
- BCH_OPTS()
-#undef x
-};
-
-#define opt_defined(_opts, _name) ((_opts)._name##_defined)
-
-#define opt_get(_opts, _name) \
- (opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name)
-
-#define opt_set(_opts, _name, _v) \
-do { \
- (_opts)._name##_defined = true; \
- (_opts)._name = _v; \
-} while (0)
-
-static inline struct bch_opts bch2_opts_empty(void)
-{
- return (struct bch_opts) { 0 };
-}
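
opt_defined(), opt_get() and opt_set() above are how callers work with a struct bch_opts: anything not explicitly set reads back from bch2_opts_default. A small usage sketch (hypothetical function; the option names are real BCH_OPTS() entries):

static void opts_usage_sketch(void)
{
	struct bch_opts opts = bch2_opts_empty();

	opt_set(opts, data_replicas, 2);

	/* set explicitly above: opt_defined() is true, opt_get() returns 2 */
	bool have_data_replicas = opt_defined(opts, data_replicas);
	u64  nr_data_replicas	= opt_get(opts, data_replicas);

	/* never set: opt_get() falls back to the compiled-in default (1) */
	u64  nr_meta_replicas	= opt_get(opts, metadata_replicas);

	(void) have_data_replicas; (void) nr_data_replicas; (void) nr_meta_replicas;
}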
-
-void bch2_opts_apply(struct bch_opts *, struct bch_opts);
-
-enum bch_opt_id {
-#define x(_name, ...) Opt_##_name,
- BCH_OPTS()
-#undef x
- bch2_opts_nr
-};
-
-struct bch_fs;
-struct printbuf;
-
-struct bch_option {
- struct attribute attr;
- enum opt_type type;
- enum opt_flags flags;
- u64 min, max;
-
- const char * const *choices;
-
- struct bch_opt_fn fn;
-
- const char *hint;
- const char *help;
-
- u64 (*get_sb)(const struct bch_sb *);
- void (*set_sb)(struct bch_sb *, u64);
-
- u64 (*get_member)(const struct bch_member *);
- void (*set_member)(struct bch_member *, u64);
-
-};
-
-extern const struct bch_option bch2_opt_table[];
-
-bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
-u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
-void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
-
-u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id, int);
-int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *);
-bool __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64);
-
-struct bch_dev;
-bool bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64);
-
-int bch2_opt_lookup(const char *);
-int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *);
-int bch2_opt_parse(struct bch_fs *, const struct bch_option *,
- const char *, u64 *, struct printbuf *);
-
-#define OPT_SHOW_FULL_LIST (1 << 0)
-#define OPT_SHOW_MOUNT_STYLE (1 << 1)
-
-void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *,
- const struct bch_option *, u64, unsigned);
-void bch2_opts_to_text(struct printbuf *,
- struct bch_opts,
- struct bch_fs *, struct bch_sb *,
- unsigned, unsigned, unsigned);
-
-int bch2_opt_hook_pre_set(struct bch_fs *, struct bch_dev *, enum bch_opt_id, u64);
-int bch2_opts_hooks_pre_set(struct bch_fs *);
-void bch2_opt_hook_post_set(struct bch_fs *, struct bch_dev *, u64,
- struct bch_opts *, enum bch_opt_id);
-
-int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *,
- struct printbuf *, const char *, const char *);
-int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *,
- char *, bool);
-
-/* inode opts: */
-
-struct bch_io_opts {
-#define x(_name, _bits) u##_bits _name;
- BCH_INODE_OPTS()
-#undef x
-#define x(_name, _bits) u64 _name##_from_inode:1;
- BCH_INODE_OPTS()
-#undef x
-};
-
-static inline void bch2_io_opts_fixups(struct bch_io_opts *opts)
-{
- if (!opts->background_target)
- opts->background_target = opts->foreground_target;
- if (!opts->background_compression)
- opts->background_compression = opts->compression;
- if (opts->nocow) {
- opts->compression = opts->background_compression = 0;
- opts->data_checksum = 0;
- opts->erasure_code = 0;
- }
-}
-
-struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
-bool bch2_opt_is_inode_opt(enum bch_opt_id);
-
-#endif /* _BCACHEFS_OPTS_H */
diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c
deleted file mode 100644
index 3302bbc78a09..000000000000
--- a/fs/bcachefs/printbuf.c
+++ /dev/null
@@ -1,528 +0,0 @@
-// SPDX-License-Identifier: LGPL-2.1+
-/* Copyright (C) 2022 Kent Overstreet */
-
-#include <linux/bitmap.h>
-#include <linux/err.h>
-#include <linux/export.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/string_helpers.h>
-
-#include "printbuf.h"
-
-static inline unsigned __printbuf_linelen(struct printbuf *buf, unsigned pos)
-{
- return pos - buf->last_newline;
-}
-
-static inline unsigned printbuf_linelen(struct printbuf *buf)
-{
- return __printbuf_linelen(buf, buf->pos);
-}
-
-/*
- * Returns spaces from start of line, if set, or 0 if unset:
- */
-static inline unsigned cur_tabstop(struct printbuf *buf)
-{
- return buf->cur_tabstop < buf->nr_tabstops
- ? buf->_tabstops[buf->cur_tabstop]
- : 0;
-}
-
-int bch2_printbuf_make_room(struct printbuf *out, unsigned extra)
-{
- /* Reserved space for terminating nul: */
- extra += 1;
-
- if (out->pos + extra <= out->size)
- return 0;
-
- if (!out->heap_allocated) {
- out->overflow = true;
- return 0;
- }
-
- unsigned new_size = roundup_pow_of_two(out->size + extra);
-
- /* Sanity check... */
- if (new_size > PAGE_SIZE << MAX_PAGE_ORDER) {
- out->allocation_failure = true;
- out->overflow = true;
- return -ENOMEM;
- }
-
- /*
- * Note: output buffer must be freeable with kfree(), it's not required
- * that the user use printbuf_exit().
- */
- char *buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT);
-
- if (!buf) {
- out->allocation_failure = true;
- out->overflow = true;
- return -ENOMEM;
- }
-
- out->buf = buf;
- out->size = new_size;
- return 0;
-}
-
-static void printbuf_advance_pos(struct printbuf *out, unsigned len)
-{
- out->pos += min(len, printbuf_remaining(out));
-}
-
-static void printbuf_insert_spaces(struct printbuf *out, unsigned pos, unsigned nr)
-{
- unsigned move = out->pos - pos;
-
- bch2_printbuf_make_room(out, nr);
-
- if (pos + nr < out->size)
- memmove(out->buf + pos + nr,
- out->buf + pos,
- min(move, out->size - 1 - pos - nr));
-
- if (pos < out->size)
- memset(out->buf + pos, ' ', min(nr, out->size - pos));
-
- printbuf_advance_pos(out, nr);
- printbuf_nul_terminate_reserved(out);
-}
-
-static void __printbuf_do_indent(struct printbuf *out, unsigned pos)
-{
- while (true) {
- int pad;
- unsigned len = out->pos - pos;
- char *p = out->buf + pos;
- char *n = memscan(p, '\n', len);
- if (cur_tabstop(out)) {
- n = min(n, (char *) memscan(p, '\r', len));
- n = min(n, (char *) memscan(p, '\t', len));
- }
-
- pos = n - out->buf;
- if (pos == out->pos)
- break;
-
- switch (*n) {
- case '\n':
- pos++;
- out->last_newline = pos;
-
- printbuf_insert_spaces(out, pos, out->indent);
-
- pos = min(pos + out->indent, out->pos);
- out->last_field = pos;
- out->cur_tabstop = 0;
- break;
- case '\r':
- memmove(n, n + 1, out->pos - pos);
- --out->pos;
- pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos);
- if (pad > 0) {
- printbuf_insert_spaces(out, out->last_field, pad);
- pos += pad;
- }
-
- out->last_field = pos;
- out->cur_tabstop++;
- break;
- case '\t':
- pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos) - 1;
- if (pad > 0) {
- *n = ' ';
- printbuf_insert_spaces(out, pos, pad - 1);
- pos += pad;
- } else {
- memmove(n, n + 1, out->pos - pos);
- --out->pos;
- }
-
- out->last_field = pos;
- out->cur_tabstop++;
- break;
- }
- }
-}
-
-static inline void printbuf_do_indent(struct printbuf *out, unsigned pos)
-{
- if (out->has_indent_or_tabstops && !out->suppress_indent_tabstop_handling)
- __printbuf_do_indent(out, pos);
-}
-
-void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args)
-{
- int len;
-
- do {
- va_list args2;
-
- va_copy(args2, args);
- len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args2);
- va_end(args2);
- } while (len > printbuf_remaining(out) &&
- !bch2_printbuf_make_room(out, len));
-
- unsigned indent_pos = out->pos;
- printbuf_advance_pos(out, len);
- printbuf_do_indent(out, indent_pos);
-}
-
-void bch2_prt_printf(struct printbuf *out, const char *fmt, ...)
-{
- va_list args;
- int len;
-
- do {
- va_start(args, fmt);
- len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args);
- va_end(args);
- } while (len > printbuf_remaining(out) &&
- !bch2_printbuf_make_room(out, len));
-
- unsigned indent_pos = out->pos;
- printbuf_advance_pos(out, len);
- printbuf_do_indent(out, indent_pos);
-}
-
-/**
- * bch2_printbuf_str() - returns printbuf's buf as a C string, guaranteed to be
- * null terminated
- * @buf: printbuf to terminate
- * Returns: Printbuf contents, as a nul terminated C string
- */
-const char *bch2_printbuf_str(const struct printbuf *buf)
-{
- /*
- * If we've written to a printbuf then it's guaranteed to be a null
- * terminated string - but if we haven't, then we might not have
- * allocated a buffer at all:
- */
- return buf->pos
- ? buf->buf
- : "";
-}
-
-/**
- * bch2_printbuf_exit() - exit a printbuf, freeing memory it owns and poisoning it
- * against accidental use.
- * @buf: printbuf to exit
- */
-void bch2_printbuf_exit(struct printbuf *buf)
-{
- if (buf->heap_allocated) {
- kfree(buf->buf);
- buf->buf = ERR_PTR(-EINTR); /* poison value */
- }
-}
-
-void bch2_printbuf_tabstops_reset(struct printbuf *buf)
-{
- buf->nr_tabstops = 0;
-}
-
-void bch2_printbuf_tabstop_pop(struct printbuf *buf)
-{
- if (buf->nr_tabstops)
- --buf->nr_tabstops;
-}
-
-/*
- * bch2_printbuf_tabstop_push() - add a tabstop, n spaces from the previous tabstop
- *
- * @buf: printbuf to control
- * @spaces: number of spaces from previous tabstop
- *
- * In the future this function may allocate memory if setting more than
- * PRINTBUF_INLINE_TABSTOPS or setting tabstops more than 255 spaces from start
- * of line.
- */
-int bch2_printbuf_tabstop_push(struct printbuf *buf, unsigned spaces)
-{
- unsigned prev_tabstop = buf->nr_tabstops
- ? buf->_tabstops[buf->nr_tabstops - 1]
- : 0;
-
- if (WARN_ON(buf->nr_tabstops >= ARRAY_SIZE(buf->_tabstops)))
- return -EINVAL;
-
- buf->_tabstops[buf->nr_tabstops++] = prev_tabstop + spaces;
- buf->has_indent_or_tabstops = true;
- return 0;
-}
-
-/**
- * bch2_printbuf_indent_add() - add to the current indent level
- *
- * @buf: printbuf to control
- * @spaces: number of spaces to add to the current indent level
- *
- * Subsequent lines, and the current line if the output position is at the start
- * of the current line, will be indented by @spaces more spaces.
- */
-void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces)
-{
- if (WARN_ON_ONCE(buf->indent + spaces < buf->indent))
- spaces = 0;
-
- buf->indent += spaces;
- prt_chars(buf, ' ', spaces);
-
- buf->has_indent_or_tabstops = true;
-}
-
-/**
- * bch2_printbuf_indent_add_nextline() - add to the current indent level for
- * subsequent lines
- *
- * @buf: printbuf to control
- * @spaces: number of spaces to add to the current indent level
- *
- * Subsequent lines - not the current line - will be indented by @spaces more
- * spaces.
- */
-void bch2_printbuf_indent_add_nextline(struct printbuf *buf, unsigned spaces)
-{
- if (WARN_ON_ONCE(buf->indent + spaces < buf->indent))
- spaces = 0;
-
- buf->indent += spaces;
- buf->has_indent_or_tabstops = true;
-}
-
-/**
- * bch2_printbuf_indent_sub() - subtract from the current indent level
- *
- * @buf: printbuf to control
- * @spaces: number of spaces to subtract from the current indent level
- *
- * Subsequent lines, and the current line if the output position is at the start
- * of the current line, will be indented by @spaces less spaces.
- */
-void bch2_printbuf_indent_sub(struct printbuf *buf, unsigned spaces)
-{
- if (WARN_ON_ONCE(spaces > buf->indent))
- spaces = buf->indent;
-
- if (buf->last_newline + buf->indent == buf->pos) {
- buf->pos -= spaces;
- printbuf_nul_terminate(buf);
- }
- buf->indent -= spaces;
-
- if (!buf->indent && !buf->nr_tabstops)
- buf->has_indent_or_tabstops = false;
-}
-
-void bch2_prt_newline(struct printbuf *buf)
-{
- bch2_printbuf_make_room(buf, 1 + buf->indent);
-
- __prt_char_reserved(buf, '\n');
-
- buf->last_newline = buf->pos;
-
- __prt_chars_reserved(buf, ' ', buf->indent);
-
- printbuf_nul_terminate_reserved(buf);
-
- buf->last_field = buf->pos;
- buf->cur_tabstop = 0;
-}
-
-void bch2_printbuf_strip_trailing_newline(struct printbuf *out)
-{
- for (int p = out->pos - 1; p >= 0; --p) {
- if (out->buf[p] == '\n') {
- out->pos = p;
- break;
- }
- if (out->buf[p] != ' ')
- break;
- }
-
- printbuf_nul_terminate_reserved(out);
-}
-
-static void __prt_tab(struct printbuf *out)
-{
- int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out));
-
- prt_chars(out, ' ', spaces);
-
- out->last_field = out->pos;
- out->cur_tabstop++;
-}
-
-/**
- * bch2_prt_tab() - Advance printbuf to the next tabstop
- * @out: printbuf to control
- *
- * Advance output to the next tabstop by printing spaces.
- */
-void bch2_prt_tab(struct printbuf *out)
-{
- if (WARN_ON(!cur_tabstop(out)))
- return;
-
- __prt_tab(out);
-}
-
-static void __prt_tab_rjust(struct printbuf *buf)
-{
- int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf);
- if (pad > 0)
- printbuf_insert_spaces(buf, buf->last_field, pad);
-
- buf->last_field = buf->pos;
- buf->cur_tabstop++;
-}
-
-/**
- * bch2_prt_tab_rjust - Advance printbuf to the next tabstop, right justifying
- * previous output
- *
- * @buf: printbuf to control
- *
- * Advance output to the next tabstop by inserting spaces immediately after the
- * previous tabstop, right justifying previously outputted text.
- */
-void bch2_prt_tab_rjust(struct printbuf *buf)
-{
- if (WARN_ON(!cur_tabstop(buf)))
- return;
-
- __prt_tab_rjust(buf);
-}
-
-/**
- * bch2_prt_bytes_indented() - Print an array of chars, handling embedded control characters
- *
- * @out: output printbuf
- * @str: string to print
- * @count: number of bytes to print
- *
- * The following control characters are handled as follows:
- * \n: prt_newline newline that obeys current indent level
- * \t: prt_tab advance to next tabstop
- * \r: prt_tab_rjust advance to next tabstop, with right justification
- */
-void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned count)
-{
- unsigned indent_pos = out->pos;
- prt_bytes(out, str, count);
- printbuf_do_indent(out, indent_pos);
-}
-
-/**
- * bch2_prt_human_readable_u64() - Print out a u64 in human readable units
- * @out: output printbuf
- * @v: integer to print
- *
- * Units of 2^10 (default) or 10^3 are controlled via @out->si_units
- */
-void bch2_prt_human_readable_u64(struct printbuf *out, u64 v)
-{
- bch2_printbuf_make_room(out, 10);
- unsigned len = string_get_size(v, 1, !out->si_units,
- out->buf + out->pos,
- printbuf_remaining_size(out));
- printbuf_advance_pos(out, len);
-}
-
-/**
- * bch2_prt_human_readable_s64() - Print out a s64 in human readable units
- * @out: output printbuf
- * @v: integer to print
- *
- * Units of 2^10 (default) or 10^3 are controlled via @out->si_units
- */
-void bch2_prt_human_readable_s64(struct printbuf *out, s64 v)
-{
- if (v < 0)
- prt_char(out, '-');
- bch2_prt_human_readable_u64(out, abs(v));
-}
-
-/**
- * bch2_prt_units_u64() - Print out a u64 according to printbuf unit options
- * @out: output printbuf
- * @v: integer to print
- *
- * Units are either raw (default), or human readable units (controlled via
- * @buf->human_readable_units)
- */
-void bch2_prt_units_u64(struct printbuf *out, u64 v)
-{
- if (out->human_readable_units)
- bch2_prt_human_readable_u64(out, v);
- else
- bch2_prt_printf(out, "%llu", v);
-}
-
-/**
- * bch2_prt_units_s64() - Print out a s64 according to printbuf unit options
- * @out: output printbuf
- * @v: integer to print
- *
- * Units are either raw (default), or human readable units (controlled via
- * @buf->human_readable_units)
- */
-void bch2_prt_units_s64(struct printbuf *out, s64 v)
-{
- if (v < 0)
- prt_char(out, '-');
- bch2_prt_units_u64(out, abs(v));
-}
-
-void bch2_prt_string_option(struct printbuf *out,
- const char * const list[],
- size_t selected)
-{
- for (size_t i = 0; list[i]; i++)
- bch2_prt_printf(out, i == selected ? "[%s] " : "%s ", list[i]);
-}
-
-void bch2_prt_bitflags(struct printbuf *out,
- const char * const list[], u64 flags)
-{
- unsigned bit, nr = 0;
- bool first = true;
-
- while (list[nr])
- nr++;
-
- while (flags && (bit = __ffs64(flags)) < nr) {
- if (!first)
- bch2_prt_printf(out, ",");
- first = false;
- bch2_prt_printf(out, "%s", list[bit]);
- flags ^= BIT_ULL(bit);
- }
-}
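
bch2_prt_bitflags() walks the set bits of @flags and prints the matching names from the NULL-terminated @list as a comma-separated string. A hypothetical caller:

static void bitflags_sketch(struct printbuf *out)
{
	static const char * const names[] = { "journal", "btree", "user", NULL };

	/* bits 0 and 2 are set, so this appends "journal,user" */
	bch2_prt_bitflags(out, names, BIT_ULL(0) | BIT_ULL(2));
}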
-
-void bch2_prt_bitflags_vector(struct printbuf *out,
- const char * const list[],
- unsigned long *v, unsigned nr)
-{
- bool first = true;
- unsigned i;
-
- for (i = 0; i < nr; i++)
- if (!list[i]) {
-			nr = i;
- break;
- }
-
- for_each_set_bit(i, v, nr) {
- if (!first)
- bch2_prt_printf(out, ",");
- first = false;
- bch2_prt_printf(out, "%s", list[i]);
- }
-}
diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h
deleted file mode 100644
index 8f4e28d440ac..000000000000
--- a/fs/bcachefs/printbuf.h
+++ /dev/null
@@ -1,298 +0,0 @@
-/* SPDX-License-Identifier: LGPL-2.1+ */
-/* Copyright (C) 2022 Kent Overstreet */
-
-#ifndef _BCACHEFS_PRINTBUF_H
-#define _BCACHEFS_PRINTBUF_H
-
-/*
- * Printbufs: Simple strings for printing to, with optional heap allocation
- *
- * This code has provisions for use in userspace, to aid in making other code
- * portable between kernelspace and userspace.
- *
- * Basic example:
- * struct printbuf buf = PRINTBUF;
- *
- * prt_printf(&buf, "foo=");
- * foo_to_text(&buf, foo);
- * printk("%s", buf.buf);
- * printbuf_exit(&buf);
- *
- * Or
- * struct printbuf buf = PRINTBUF_EXTERN(char_buf, char_buf_size)
- *
- * We can now write pretty printers instead of writing code that dumps
- * everything to the kernel log buffer, and then those pretty-printers can be
- * used by other code that outputs to kernel log, sysfs, debugfs, etc.
- *
- * Memory allocation: Outputting to a printbuf may allocate memory. This
- * allocation is done with GFP_KERNEL, by default: use the newer
- * memalloc_*_(save|restore) functions as needed.
- *
- * Since no equivalent yet exists for GFP_ATOMIC/GFP_NOWAIT, memory allocations
- * will be done with GFP_NOWAIT if printbuf->atomic is nonzero.
- *
- * It's allowed to grab the output buffer and free it later with kfree() instead
- * of using printbuf_exit(), if the user just needs a heap allocated string at
- * the end.
- *
- * Memory allocation failures: We don't return errors directly, because on
- * memory allocation failure we usually don't want to bail out and unwind - we
- * want to print what we've got, on a best-effort basis. But code that does want
- * to return -ENOMEM may check printbuf.allocation_failure.
- *
- * Indenting, tabstops:
- *
- * To aid in writing multi-line pretty printers spread across multiple
- * functions, printbufs track the current indent level.
- *
- * printbuf_indent_add() and printbuf_indent_sub() increase and decrease the current indent
- * level, respectively.
- *
- * To use tabstops, set printbuf->tabstops[]; they are in units of spaces, from
- * start of line. Once set, prt_tab() will output spaces up to the next tabstop.
- * prt_tab_rjust() will also advance the current line of text up to the next
- * tabstop, but it does so by shifting text since the previous tabstop up to the
- * next tabstop - right justifying it.
- *
- * Make sure you use prt_newline() instead of \n in the format string for indent
- * level and tabstops to work corretly.
- *
- * Output units: printbuf->units exists to tell pretty-printers how to output
- * numbers: a raw value (e.g. directly from a superblock field), as bytes, or as
- * human readable bytes. prt_units() obeys it.
- */
-
-#include <linux/kernel.h>
-#include <linux/string.h>
-
-enum printbuf_si {
- PRINTBUF_UNITS_2, /* use binary powers of 2^10 */
- PRINTBUF_UNITS_10, /* use powers of 10^3 (standard SI) */
-};
-
-#define PRINTBUF_INLINE_TABSTOPS 6
-
-struct printbuf {
- char *buf;
- unsigned size;
- unsigned pos;
- unsigned last_newline;
- unsigned last_field;
- unsigned indent;
- /*
-	 * If nonzero, allocations will be done with GFP_NOWAIT:
- */
- u8 atomic;
- bool allocation_failure:1;
- bool heap_allocated:1;
- bool overflow:1;
- enum printbuf_si si_units:1;
- bool human_readable_units:1;
- bool has_indent_or_tabstops:1;
- bool suppress_indent_tabstop_handling:1;
- u8 nr_tabstops;
-
- /*
-	 * Do not modify directly: use bch2_printbuf_tabstop_push(),
-	 * bch2_printbuf_tabstop_pop()
- */
- u8 cur_tabstop;
- u8 _tabstops[PRINTBUF_INLINE_TABSTOPS];
-};
-
-int bch2_printbuf_make_room(struct printbuf *, unsigned);
-__printf(2, 3) void bch2_prt_printf(struct printbuf *out, const char *fmt, ...);
-__printf(2, 0) void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list);
-const char *bch2_printbuf_str(const struct printbuf *);
-void bch2_printbuf_exit(struct printbuf *);
-
-void bch2_printbuf_tabstops_reset(struct printbuf *);
-void bch2_printbuf_tabstop_pop(struct printbuf *);
-int bch2_printbuf_tabstop_push(struct printbuf *, unsigned);
-
-void bch2_printbuf_indent_add(struct printbuf *, unsigned);
-void bch2_printbuf_indent_add_nextline(struct printbuf *, unsigned);
-void bch2_printbuf_indent_sub(struct printbuf *, unsigned);
-
-void bch2_prt_newline(struct printbuf *);
-void bch2_printbuf_strip_trailing_newline(struct printbuf *);
-void bch2_prt_tab(struct printbuf *);
-void bch2_prt_tab_rjust(struct printbuf *);
-
-void bch2_prt_bytes_indented(struct printbuf *, const char *, unsigned);
-void bch2_prt_human_readable_u64(struct printbuf *, u64);
-void bch2_prt_human_readable_s64(struct printbuf *, s64);
-void bch2_prt_units_u64(struct printbuf *, u64);
-void bch2_prt_units_s64(struct printbuf *, s64);
-void bch2_prt_string_option(struct printbuf *, const char * const[], size_t);
-void bch2_prt_bitflags(struct printbuf *, const char * const[], u64);
-void bch2_prt_bitflags_vector(struct printbuf *, const char * const[],
- unsigned long *, unsigned);
-
-/* Initializer for a heap allocated printbuf: */
-#define PRINTBUF ((struct printbuf) { .heap_allocated = true })
-
-/* Initializer for a printbuf that points to an external buffer: */
-#define PRINTBUF_EXTERN(_buf, _size) \
-((struct printbuf) { \
- .buf = _buf, \
- .size = _size, \
-})
-
-static inline struct printbuf bch2_printbuf_init(void)
-{
- return PRINTBUF;
-}
-
-DEFINE_CLASS(printbuf, struct printbuf,
- bch2_printbuf_exit(&_T), bch2_printbuf_init(), void)
-
-/*
- * Returns size remaining of output buffer:
- */
-static inline unsigned printbuf_remaining_size(struct printbuf *out)
-{
- if (WARN_ON(out->size && out->pos >= out->size))
- out->pos = out->size - 1;
- return out->size - out->pos;
-}
-
-/*
- * Returns number of characters we can print to the output buffer - i.e.
- * excluding the terminating nul:
- */
-static inline unsigned printbuf_remaining(struct printbuf *out)
-{
- return out->size ? printbuf_remaining_size(out) - 1 : 0;
-}
-
-static inline unsigned printbuf_written(struct printbuf *out)
-{
- return out->size ? min(out->pos, out->size - 1) : 0;
-}
-
-static inline void printbuf_nul_terminate_reserved(struct printbuf *out)
-{
- if (WARN_ON(out->size && out->pos >= out->size))
- out->pos = out->size - 1;
- if (out->size)
- out->buf[out->pos] = 0;
-}
-
-static inline void printbuf_nul_terminate(struct printbuf *out)
-{
- bch2_printbuf_make_room(out, 1);
- printbuf_nul_terminate_reserved(out);
-}
-
-/* Doesn't call bch2_printbuf_make_room(), doesn't nul terminate: */
-static inline void __prt_char_reserved(struct printbuf *out, char c)
-{
- if (printbuf_remaining(out))
- out->buf[out->pos++] = c;
-}
-
-/* Doesn't nul terminate: */
-static inline void __prt_char(struct printbuf *out, char c)
-{
- bch2_printbuf_make_room(out, 1);
- __prt_char_reserved(out, c);
-}
-
-static inline void prt_char(struct printbuf *out, char c)
-{
- bch2_printbuf_make_room(out, 2);
- __prt_char_reserved(out, c);
- printbuf_nul_terminate_reserved(out);
-}
-
-static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n)
-{
- unsigned can_print = min(n, printbuf_remaining(out));
-
- for (unsigned i = 0; i < can_print; i++)
- out->buf[out->pos++] = c;
-}
-
-static inline void prt_chars(struct printbuf *out, char c, unsigned n)
-{
- bch2_printbuf_make_room(out, n);
- __prt_chars_reserved(out, c, n);
- printbuf_nul_terminate_reserved(out);
-}
-
-static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n)
-{
- bch2_printbuf_make_room(out, n);
-
- unsigned can_print = min(n, printbuf_remaining(out));
-
- for (unsigned i = 0; i < can_print; i++)
- out->buf[out->pos++] = ((char *) b)[i];
-
- printbuf_nul_terminate(out);
-}
-
-static inline void prt_str(struct printbuf *out, const char *str)
-{
- prt_bytes(out, str, strlen(str));
-}
-
-static inline void prt_str_indented(struct printbuf *out, const char *str)
-{
- bch2_prt_bytes_indented(out, str, strlen(str));
-}
-
-static inline void prt_hex_byte(struct printbuf *out, u8 byte)
-{
- bch2_printbuf_make_room(out, 3);
- __prt_char_reserved(out, hex_asc_hi(byte));
- __prt_char_reserved(out, hex_asc_lo(byte));
- printbuf_nul_terminate_reserved(out);
-}
-
-static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte)
-{
- bch2_printbuf_make_room(out, 3);
- __prt_char_reserved(out, hex_asc_upper_hi(byte));
- __prt_char_reserved(out, hex_asc_upper_lo(byte));
- printbuf_nul_terminate_reserved(out);
-}
-
-static inline void printbuf_reset_keep_tabstops(struct printbuf *buf)
-{
- buf->pos = 0;
- buf->allocation_failure = 0;
- buf->last_newline = 0;
- buf->last_field = 0;
- buf->indent = 0;
- buf->cur_tabstop = 0;
-}
-
-/**
- * printbuf_reset - re-use a printbuf without freeing and re-initializing it:
- */
-static inline void printbuf_reset(struct printbuf *buf)
-{
- printbuf_reset_keep_tabstops(buf);
- buf->nr_tabstops = 0;
-}
-
-/**
- * printbuf_atomic_inc - mark as entering an atomic section
- */
-static inline void printbuf_atomic_inc(struct printbuf *buf)
-{
- buf->atomic++;
-}
-
-/**
- * printbuf_atomic_dec - mark as leaving an atomic section
- */
-static inline void printbuf_atomic_dec(struct printbuf *buf)
-{
- buf->atomic--;
-}
-
-#endif /* _BCACHEFS_PRINTBUF_H */
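
As the header comment above describes, once tabstops are configured '\t' advances output to the next tabstop and '\r' right-justifies the text written since the previous one. A hedged sketch of two-column output, assuming it is built alongside the declarations above (the function is hypothetical):

static void tabstop_sketch(void)
{
	struct printbuf buf = PRINTBUF;

	/* first column 24 spaces wide, second column 10 spaces further along */
	bch2_printbuf_tabstop_push(&buf, 24);
	bch2_printbuf_tabstop_push(&buf, 10);

	/* label padded to the first tabstop, value right-justified at the second */
	bch2_prt_printf(&buf, "nr buckets\t%u\r\n", 1024);
	bch2_prt_printf(&buf, "bucket size\t%u\r\n", 512);

	pr_info("%s", bch2_printbuf_str(&buf));
	bch2_printbuf_exit(&buf);
}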
diff --git a/fs/bcachefs/progress.c b/fs/bcachefs/progress.c
deleted file mode 100644
index d09898566abe..000000000000
--- a/fs/bcachefs/progress.c
+++ /dev/null
@@ -1,61 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "bbpos.h"
-#include "disk_accounting.h"
-#include "progress.h"
-
-void bch2_progress_init(struct progress_indicator_state *s,
- struct bch_fs *c,
- u64 btree_id_mask)
-{
- memset(s, 0, sizeof(*s));
-
- s->next_print = jiffies + HZ * 10;
-
- for (unsigned i = 0; i < BTREE_ID_NR; i++) {
- if (!(btree_id_mask & BIT_ULL(i)))
- continue;
-
- struct disk_accounting_pos acc;
- disk_accounting_key_init(acc, btree, .id = i);
-
- u64 v;
- bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
- s->nodes_total += div64_ul(v, btree_sectors(c));
- }
-}
-
-static inline bool progress_update_p(struct progress_indicator_state *s)
-{
- bool ret = time_after_eq(jiffies, s->next_print);
-
- if (ret)
- s->next_print = jiffies + HZ * 10;
- return ret;
-}
-
-void bch2_progress_update_iter(struct btree_trans *trans,
- struct progress_indicator_state *s,
- struct btree_iter *iter,
- const char *msg)
-{
- struct bch_fs *c = trans->c;
- struct btree *b = path_l(btree_iter_path(trans, iter))->b;
-
- s->nodes_seen += b != s->last_node;
- s->last_node = b;
-
- if (progress_update_p(s)) {
- struct printbuf buf = PRINTBUF;
- unsigned percent = s->nodes_total
- ? div64_u64(s->nodes_seen * 100, s->nodes_total)
- : 0;
-
- prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ",
- msg, percent, s->nodes_seen, s->nodes_total);
- bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos));
-
- bch_info(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-}
diff --git a/fs/bcachefs/progress.h b/fs/bcachefs/progress.h
deleted file mode 100644
index 23fb1811f943..000000000000
--- a/fs/bcachefs/progress.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_PROGRESS_H
-#define _BCACHEFS_PROGRESS_H
-
-/*
- * Lame progress indicators
- *
- * We don't like to use these because they print to the dmesg console, which is
- * spammy - we much prefer to be wired up to a userspace program (e.g. via
- * thread_with_file) and have it print the progress indicator.
- *
- * But some code is old and doesn't support that, or runs in a context where
- * that's not yet practical (mount).
- */
-
-struct progress_indicator_state {
- unsigned long next_print;
- u64 nodes_seen;
- u64 nodes_total;
- struct btree *last_node;
-};
-
-void bch2_progress_init(struct progress_indicator_state *, struct bch_fs *, u64);
-void bch2_progress_update_iter(struct btree_trans *,
- struct progress_indicator_state *,
- struct btree_iter *,
- const char *);
-
-#endif /* _BCACHEFS_PROGRESS_H */
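
Typical use, as a sketch: a pass initializes the indicator with a bitmask of the btrees it will walk, then calls bch2_progress_update_iter() once per key from inside its btree iteration loop; the update function rate-limits itself to one log line every ten seconds. The iteration loop is elided and the function name is hypothetical:

static void progress_usage_sketch(struct btree_trans *trans,
				  struct btree_iter *iter)
{
	struct progress_indicator_state progress;

	bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents));

	/* ... inside the pass's btree iteration loop, once per key: */
	bch2_progress_update_iter(trans, &progress, iter, "example pass");
}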
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
deleted file mode 100644
index f241efb1fb50..000000000000
--- a/fs/bcachefs/quota.c
+++ /dev/null
@@ -1,892 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "btree_update.h"
-#include "errcode.h"
-#include "error.h"
-#include "inode.h"
-#include "quota.h"
-#include "snapshot.h"
-#include "super-io.h"
-
-static const char * const bch2_quota_types[] = {
- "user",
- "group",
- "project",
-};
-
-static const char * const bch2_quota_counters[] = {
- "space",
- "inodes",
-};
-
-static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_quota *q = field_to_type(f, quota);
-
- if (vstruct_bytes(&q->field) < sizeof(*q)) {
- prt_printf(err, "wrong size (got %zu should be %zu)",
- vstruct_bytes(&q->field), sizeof(*q));
- return -BCH_ERR_invalid_sb_quota;
- }
-
- return 0;
-}
-
-static void bch2_sb_quota_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_quota *q = field_to_type(f, quota);
- unsigned qtyp, counter;
-
- for (qtyp = 0; qtyp < ARRAY_SIZE(q->q); qtyp++) {
- prt_printf(out, "%s: flags %llx",
- bch2_quota_types[qtyp],
- le64_to_cpu(q->q[qtyp].flags));
-
- for (counter = 0; counter < Q_COUNTERS; counter++)
- prt_printf(out, " %s timelimit %u warnlimit %u",
- bch2_quota_counters[counter],
- le32_to_cpu(q->q[qtyp].c[counter].timelimit),
- le32_to_cpu(q->q[qtyp].c[counter].warnlimit));
-
- prt_newline(out);
- }
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_quota = {
- .validate = bch2_sb_quota_validate,
- .to_text = bch2_sb_quota_to_text,
-};
-
-int bch2_quota_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(k.k->p.inode >= QTYP_NR,
- c, quota_type_invalid,
- "invalid quota type (%llu >= %u)",
- k.k->p.inode, QTYP_NR);
-fsck_err:
- return ret;
-}
-
-void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_quota dq = bkey_s_c_to_quota(k);
- unsigned i;
-
- for (i = 0; i < Q_COUNTERS; i++)
- prt_printf(out, "%s hardlimit %llu softlimit %llu",
- bch2_quota_counters[i],
- le64_to_cpu(dq.v->c[i].hardlimit),
- le64_to_cpu(dq.v->c[i].softlimit));
-}
-
-#ifdef CONFIG_BCACHEFS_QUOTA
-
-#include <linux/cred.h>
-#include <linux/fs.h>
-#include <linux/quota.h>
-
-static void qc_info_to_text(struct printbuf *out, struct qc_info *i)
-{
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, 20);
-
- prt_printf(out, "i_fieldmask\t%x\n", i->i_fieldmask);
- prt_printf(out, "i_flags\t%u\n", i->i_flags);
- prt_printf(out, "i_spc_timelimit\t%u\n", i->i_spc_timelimit);
- prt_printf(out, "i_ino_timelimit\t%u\n", i->i_ino_timelimit);
- prt_printf(out, "i_rt_spc_timelimit\t%u\n", i->i_rt_spc_timelimit);
- prt_printf(out, "i_spc_warnlimit\t%u\n", i->i_spc_warnlimit);
- prt_printf(out, "i_ino_warnlimit\t%u\n", i->i_ino_warnlimit);
- prt_printf(out, "i_rt_spc_warnlimit\t%u\n", i->i_rt_spc_warnlimit);
-}
-
-static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q)
-{
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, 20);
-
- prt_printf(out, "d_fieldmask\t%x\n", q->d_fieldmask);
- prt_printf(out, "d_spc_hardlimit\t%llu\n", q->d_spc_hardlimit);
- prt_printf(out, "d_spc_softlimit\t%llu\n", q->d_spc_softlimit);
-	prt_printf(out, "d_ino_hardlimit\t%llu\n", q->d_ino_hardlimit);
- prt_printf(out, "d_ino_softlimit\t%llu\n", q->d_ino_softlimit);
- prt_printf(out, "d_space\t%llu\n", q->d_space);
- prt_printf(out, "d_ino_count\t%llu\n", q->d_ino_count);
- prt_printf(out, "d_ino_timer\t%llu\n", q->d_ino_timer);
- prt_printf(out, "d_spc_timer\t%llu\n", q->d_spc_timer);
- prt_printf(out, "d_ino_warns\t%i\n", q->d_ino_warns);
- prt_printf(out, "d_spc_warns\t%i\n", q->d_spc_warns);
-}
-
-static inline unsigned __next_qtype(unsigned i, unsigned qtypes)
-{
- qtypes >>= i;
- return qtypes ? i + __ffs(qtypes) : QTYP_NR;
-}
-
-#define for_each_set_qtype(_c, _i, _q, _qtypes) \
- for (_i = 0; \
- (_i = __next_qtype(_i, _qtypes), \
- _q = &(_c)->quotas[_i], \
- _i < QTYP_NR); \
- _i++)
-
-static bool ignore_hardlimit(struct bch_memquota_type *q)
-{
- if (capable(CAP_SYS_RESOURCE))
- return true;
-#if 0
- struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];
-
- return capable(CAP_SYS_RESOURCE) &&
- (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD ||
- !(info->dqi_flags & DQF_ROOT_SQUASH));
-#endif
- return false;
-}
-
-enum quota_msg {
- SOFTWARN, /* Softlimit reached */
- SOFTLONGWARN, /* Grace time expired */
- HARDWARN, /* Hardlimit reached */
-
- HARDBELOW, /* Usage got below inode hardlimit */
- SOFTBELOW, /* Usage got below inode softlimit */
-};
-
-static int quota_nl[][Q_COUNTERS] = {
- [HARDWARN][Q_SPC] = QUOTA_NL_BHARDWARN,
- [SOFTLONGWARN][Q_SPC] = QUOTA_NL_BSOFTLONGWARN,
- [SOFTWARN][Q_SPC] = QUOTA_NL_BSOFTWARN,
- [HARDBELOW][Q_SPC] = QUOTA_NL_BHARDBELOW,
- [SOFTBELOW][Q_SPC] = QUOTA_NL_BSOFTBELOW,
-
- [HARDWARN][Q_INO] = QUOTA_NL_IHARDWARN,
- [SOFTLONGWARN][Q_INO] = QUOTA_NL_ISOFTLONGWARN,
- [SOFTWARN][Q_INO] = QUOTA_NL_ISOFTWARN,
- [HARDBELOW][Q_INO] = QUOTA_NL_IHARDBELOW,
- [SOFTBELOW][Q_INO] = QUOTA_NL_ISOFTBELOW,
-};
-
-struct quota_msgs {
- u8 nr;
- struct {
- u8 qtype;
- u8 msg;
- } m[QTYP_NR * Q_COUNTERS];
-};
-
-static void prepare_msg(unsigned qtype,
- enum quota_counters counter,
- struct quota_msgs *msgs,
- enum quota_msg msg_type)
-{
- BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m));
-
- msgs->m[msgs->nr].qtype = qtype;
- msgs->m[msgs->nr].msg = quota_nl[msg_type][counter];
- msgs->nr++;
-}
-
-static void prepare_warning(struct memquota_counter *qc,
- unsigned qtype,
- enum quota_counters counter,
- struct quota_msgs *msgs,
- enum quota_msg msg_type)
-{
- if (qc->warning_issued & (1 << msg_type))
- return;
-
- prepare_msg(qtype, counter, msgs, msg_type);
-}
-
-static void flush_warnings(struct bch_qid qid,
- struct super_block *sb,
- struct quota_msgs *msgs)
-{
- unsigned i;
-
- for (i = 0; i < msgs->nr; i++)
- quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, qid.q[i]),
- sb->s_dev, msgs->m[i].msg);
-}
-
-static int bch2_quota_check_limit(struct bch_fs *c,
- unsigned qtype,
- struct bch_memquota *mq,
- struct quota_msgs *msgs,
- enum quota_counters counter,
- s64 v,
- enum quota_acct_mode mode)
-{
- struct bch_memquota_type *q = &c->quotas[qtype];
- struct memquota_counter *qc = &mq->c[counter];
- u64 n = qc->v + v;
-
- BUG_ON((s64) n < 0);
-
- if (mode == KEY_TYPE_QUOTA_NOCHECK)
- return 0;
-
- if (v <= 0) {
- if (n < qc->hardlimit &&
- (qc->warning_issued & (1 << HARDWARN))) {
- qc->warning_issued &= ~(1 << HARDWARN);
- prepare_msg(qtype, counter, msgs, HARDBELOW);
- }
-
- if (n < qc->softlimit &&
- (qc->warning_issued & (1 << SOFTWARN))) {
- qc->warning_issued &= ~(1 << SOFTWARN);
- prepare_msg(qtype, counter, msgs, SOFTBELOW);
- }
-
- qc->warning_issued = 0;
- return 0;
- }
-
- if (qc->hardlimit &&
- qc->hardlimit < n &&
- !ignore_hardlimit(q)) {
- prepare_warning(qc, qtype, counter, msgs, HARDWARN);
- return -EDQUOT;
- }
-
- if (qc->softlimit &&
- qc->softlimit < n) {
- if (qc->timer == 0) {
- qc->timer = ktime_get_real_seconds() + q->limits[counter].timelimit;
- prepare_warning(qc, qtype, counter, msgs, SOFTWARN);
- } else if (ktime_get_real_seconds() >= qc->timer &&
- !ignore_hardlimit(q)) {
- prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN);
- return -EDQUOT;
- }
- }
-
- return 0;
-}
-
-int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
- enum quota_counters counter, s64 v,
- enum quota_acct_mode mode)
-{
- unsigned qtypes = enabled_qtypes(c);
- struct bch_memquota_type *q;
- struct bch_memquota *mq[QTYP_NR];
- struct quota_msgs msgs;
- unsigned i;
- int ret = 0;
-
- memset(&msgs, 0, sizeof(msgs));
-
- for_each_set_qtype(c, i, q, qtypes) {
- mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_KERNEL);
- if (!mq[i])
- return -ENOMEM;
- }
-
- for_each_set_qtype(c, i, q, qtypes)
- mutex_lock_nested(&q->lock, i);
-
- for_each_set_qtype(c, i, q, qtypes) {
- ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode);
- if (ret)
- goto err;
- }
-
- for_each_set_qtype(c, i, q, qtypes)
- mq[i]->c[counter].v += v;
-err:
- for_each_set_qtype(c, i, q, qtypes)
- mutex_unlock(&q->lock);
-
- flush_warnings(qid, c->vfs_sb, &msgs);
-
- return ret;
-}
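
bch2_quota_acct() takes a signed delta: a positive value charges usage against every enabled quota type for @qid (and, in a checking mode, fails with -EDQUOT once a hard limit or an expired soft-limit grace period is hit), while a negative value releases it. A hedged caller sketch using KEY_TYPE_QUOTA_NOCHECK, the mode visible in this file (hypothetical function):

static int quota_charge_sketch(struct bch_fs *c, struct bch_inode_unpacked *u,
			       s64 sectors)
{
	/*
	 * Charge @sectors of space against the inode's quotas; releasing it
	 * later is the same call with a negative delta.
	 */
	return bch2_quota_acct(c, bch_qid(u), Q_SPC, sectors,
			       KEY_TYPE_QUOTA_NOCHECK);
}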
-
-static void __bch2_quota_transfer(struct bch_memquota *src_q,
- struct bch_memquota *dst_q,
- enum quota_counters counter, s64 v)
-{
- BUG_ON(v > src_q->c[counter].v);
- BUG_ON(v + dst_q->c[counter].v < v);
-
- src_q->c[counter].v -= v;
- dst_q->c[counter].v += v;
-}
-
-int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
- struct bch_qid dst,
- struct bch_qid src, u64 space,
- enum quota_acct_mode mode)
-{
- struct bch_memquota_type *q;
- struct bch_memquota *src_q[3], *dst_q[3];
- struct quota_msgs msgs;
- unsigned i;
- int ret = 0;
-
- qtypes &= enabled_qtypes(c);
-
- memset(&msgs, 0, sizeof(msgs));
-
- for_each_set_qtype(c, i, q, qtypes) {
- src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_KERNEL);
- dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_KERNEL);
- if (!src_q[i] || !dst_q[i])
- return -ENOMEM;
- }
-
- for_each_set_qtype(c, i, q, qtypes)
- mutex_lock_nested(&q->lock, i);
-
- for_each_set_qtype(c, i, q, qtypes) {
- ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC,
- dst_q[i]->c[Q_SPC].v + space,
- mode);
- if (ret)
- goto err;
-
- ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO,
- dst_q[i]->c[Q_INO].v + 1,
- mode);
- if (ret)
- goto err;
- }
-
- for_each_set_qtype(c, i, q, qtypes) {
- __bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space);
- __bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1);
- }
-
-err:
- for_each_set_qtype(c, i, q, qtypes)
- mutex_unlock(&q->lock);
-
- flush_warnings(dst, c->vfs_sb, &msgs);
-
- return ret;
-}
-
-static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k,
- struct qc_dqblk *qdq)
-{
- struct bkey_s_c_quota dq;
- struct bch_memquota_type *q;
- struct bch_memquota *mq;
- unsigned i;
-
- BUG_ON(k.k->p.inode >= QTYP_NR);
-
- if (!((1U << k.k->p.inode) & enabled_qtypes(c)))
- return 0;
-
- switch (k.k->type) {
- case KEY_TYPE_quota:
- dq = bkey_s_c_to_quota(k);
- q = &c->quotas[k.k->p.inode];
-
- mutex_lock(&q->lock);
- mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL);
- if (!mq) {
- mutex_unlock(&q->lock);
- return -ENOMEM;
- }
-
- for (i = 0; i < Q_COUNTERS; i++) {
- mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit);
- mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit);
- }
-
- if (qdq && qdq->d_fieldmask & QC_SPC_TIMER)
- mq->c[Q_SPC].timer = qdq->d_spc_timer;
- if (qdq && qdq->d_fieldmask & QC_SPC_WARNS)
- mq->c[Q_SPC].warns = qdq->d_spc_warns;
- if (qdq && qdq->d_fieldmask & QC_INO_TIMER)
- mq->c[Q_INO].timer = qdq->d_ino_timer;
- if (qdq && qdq->d_fieldmask & QC_INO_WARNS)
- mq->c[Q_INO].warns = qdq->d_ino_warns;
-
- mutex_unlock(&q->lock);
- }
-
- return 0;
-}
-
-void bch2_fs_quota_exit(struct bch_fs *c)
-{
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(c->quotas); i++)
- genradix_free(&c->quotas[i].table);
-}
-
-void bch2_fs_quota_init(struct bch_fs *c)
-{
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(c->quotas); i++)
- mutex_init(&c->quotas[i].lock);
-}
-
-static struct bch_sb_field_quota *bch2_sb_get_or_create_quota(struct bch_sb_handle *sb)
-{
- struct bch_sb_field_quota *sb_quota = bch2_sb_field_get(sb->sb, quota);
-
- if (sb_quota)
- return sb_quota;
-
- sb_quota = bch2_sb_field_resize(sb, quota, sizeof(*sb_quota) / sizeof(u64));
- if (sb_quota) {
- unsigned qtype, qc;
-
- for (qtype = 0; qtype < QTYP_NR; qtype++)
- for (qc = 0; qc < Q_COUNTERS; qc++)
- sb_quota->q[qtype].c[qc].timelimit =
- cpu_to_le32(7 * 24 * 60 * 60);
- }
-
- return sb_quota;
-}
-
-static void bch2_sb_quota_read(struct bch_fs *c)
-{
- struct bch_sb_field_quota *sb_quota;
- unsigned i, j;
-
- sb_quota = bch2_sb_field_get(c->disk_sb.sb, quota);
- if (!sb_quota)
- return;
-
- for (i = 0; i < QTYP_NR; i++) {
- struct bch_memquota_type *q = &c->quotas[i];
-
- for (j = 0; j < Q_COUNTERS; j++) {
- q->limits[j].timelimit =
- le32_to_cpu(sb_quota->q[i].c[j].timelimit);
- q->limits[j].warnlimit =
- le32_to_cpu(sb_quota->q[i].c[j].warnlimit);
- }
- }
-}
-
-static int bch2_fs_quota_read_inode(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bch_inode_unpacked u;
- struct bch_snapshot_tree s_t;
- u32 tree = bch2_snapshot_tree(c, k.k->p.snapshot);
-
- int ret = bch2_snapshot_tree_lookup(trans, tree, &s_t);
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
- "%s: snapshot tree %u not found", __func__, tree);
- if (ret)
- return ret;
-
- if (!s_t.master_subvol)
- goto advance;
-
- ret = bch2_inode_find_by_inum_nowarn_trans(trans,
- (subvol_inum) {
- le32_to_cpu(s_t.master_subvol),
- k.k->p.offset,
- }, &u);
- /*
- * Inode might be deleted in this snapshot - the easiest way to handle
- * that is to just skip it here:
- */
- if (bch2_err_matches(ret, ENOENT))
- goto advance;
-
- if (ret)
- return ret;
-
- bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors,
- KEY_TYPE_QUOTA_NOCHECK);
- bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
- KEY_TYPE_QUOTA_NOCHECK);
-advance:
- bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos));
- return 0;
-}
-
-int bch2_fs_quota_read(struct bch_fs *c)
-{
-
- mutex_lock(&c->sb_lock);
- struct bch_sb_field_quota *sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
- if (!sb_quota) {
- mutex_unlock(&c->sb_lock);
- return bch_err_throw(c, ENOSPC_sb_quota);
- }
-
- bch2_sb_quota_read(c);
- mutex_unlock(&c->sb_lock);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_quotas, POS_MIN,
- BTREE_ITER_prefetch, k,
- __bch2_quota_set(c, k, NULL)) ?:
- for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- bch2_fs_quota_read_inode(trans, &iter, k)));
- bch_err_fn(c, ret);
- return ret;
-}
-
-/* Enable/disable/delete quotas for an entire filesystem: */
-
-static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
-{
- struct bch_fs *c = sb->s_fs_info;
- struct bch_sb_field_quota *sb_quota;
- int ret = 0;
-
- if (sb->s_flags & SB_RDONLY)
- return -EROFS;
-
- /* Accounting must be enabled at mount time: */
- if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT))
- return -EINVAL;
-
- /* Can't enable enforcement without accounting: */
- if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota)
- return -EINVAL;
-
- if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota)
- return -EINVAL;
-
- if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota)
- return -EINVAL;
-
- mutex_lock(&c->sb_lock);
- sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
- if (!sb_quota) {
- ret = bch_err_throw(c, ENOSPC_sb_quota);
- goto unlock;
- }
-
- if (uflags & FS_QUOTA_UDQ_ENFD)
- SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true);
-
- if (uflags & FS_QUOTA_GDQ_ENFD)
- SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true);
-
- if (uflags & FS_QUOTA_PDQ_ENFD)
- SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true);
-
- bch2_write_super(c);
-unlock:
- mutex_unlock(&c->sb_lock);
-
- return bch2_err_class(ret);
-}
-
-static int bch2_quota_disable(struct super_block *sb, unsigned uflags)
-{
- struct bch_fs *c = sb->s_fs_info;
-
- if (sb->s_flags & SB_RDONLY)
- return -EROFS;
-
- mutex_lock(&c->sb_lock);
- if (uflags & FS_QUOTA_UDQ_ENFD)
- SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false);
-
- if (uflags & FS_QUOTA_GDQ_ENFD)
- SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false);
-
- if (uflags & FS_QUOTA_PDQ_ENFD)
- SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false);
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- return 0;
-}
-
-static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
-{
- struct bch_fs *c = sb->s_fs_info;
- int ret;
-
- if (sb->s_flags & SB_RDONLY)
- return -EROFS;
-
- if (uflags & FS_USER_QUOTA) {
- if (c->opts.usrquota)
- return -EINVAL;
-
- ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
- POS(QTYP_USR, 0),
- POS(QTYP_USR, U64_MAX),
- 0, NULL);
- if (ret)
- return ret;
- }
-
- if (uflags & FS_GROUP_QUOTA) {
- if (c->opts.grpquota)
- return -EINVAL;
-
- ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
- POS(QTYP_GRP, 0),
- POS(QTYP_GRP, U64_MAX),
- 0, NULL);
- if (ret)
- return ret;
- }
-
- if (uflags & FS_PROJ_QUOTA) {
- if (c->opts.prjquota)
- return -EINVAL;
-
- ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
- POS(QTYP_PRJ, 0),
- POS(QTYP_PRJ, U64_MAX),
- 0, NULL);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/*
- * Return quota status information, such as enforcement status, quota file
- * inode numbers, etc.
- */
-static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state)
-{
- struct bch_fs *c = sb->s_fs_info;
- unsigned qtypes = enabled_qtypes(c);
- unsigned i;
-
- memset(state, 0, sizeof(*state));
-
- for (i = 0; i < QTYP_NR; i++) {
- state->s_state[i].flags |= QCI_SYSFILE;
-
- if (!(qtypes & (1 << i)))
- continue;
-
- state->s_state[i].flags |= QCI_ACCT_ENABLED;
-
- state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit;
- state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit;
-
- state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit;
- state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit;
- }
-
- return 0;
-}
-
-/*
- * Adjust quota timers & warnings
- */
-static int bch2_quota_set_info(struct super_block *sb, int type,
- struct qc_info *info)
-{
- struct bch_fs *c = sb->s_fs_info;
- struct bch_sb_field_quota *sb_quota;
- int ret = 0;
-
- if (0) {
- struct printbuf buf = PRINTBUF;
-
- qc_info_to_text(&buf, info);
- pr_info("setting:\n%s", buf.buf);
- printbuf_exit(&buf);
- }
-
- if (sb->s_flags & SB_RDONLY)
- return -EROFS;
-
- if (type >= QTYP_NR)
- return -EINVAL;
-
- if (!((1 << type) & enabled_qtypes(c)))
- return -ESRCH;
-
- if (info->i_fieldmask &
- ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS))
- return -EINVAL;
-
- mutex_lock(&c->sb_lock);
- sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
- if (!sb_quota) {
- ret = bch_err_throw(c, ENOSPC_sb_quota);
- goto unlock;
- }
-
- if (info->i_fieldmask & QC_SPC_TIMER)
- sb_quota->q[type].c[Q_SPC].timelimit =
- cpu_to_le32(info->i_spc_timelimit);
-
- if (info->i_fieldmask & QC_SPC_WARNS)
- sb_quota->q[type].c[Q_SPC].warnlimit =
- cpu_to_le32(info->i_spc_warnlimit);
-
- if (info->i_fieldmask & QC_INO_TIMER)
- sb_quota->q[type].c[Q_INO].timelimit =
- cpu_to_le32(info->i_ino_timelimit);
-
- if (info->i_fieldmask & QC_INO_WARNS)
- sb_quota->q[type].c[Q_INO].warnlimit =
- cpu_to_le32(info->i_ino_warnlimit);
-
- bch2_sb_quota_read(c);
-
- bch2_write_super(c);
-unlock:
- mutex_unlock(&c->sb_lock);
-
- return bch2_err_class(ret);
-}
-
-/* Get/set individual quotas: */
-
-static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src)
-{
- dst->d_space = src->c[Q_SPC].v << 9;
- dst->d_spc_hardlimit = src->c[Q_SPC].hardlimit << 9;
- dst->d_spc_softlimit = src->c[Q_SPC].softlimit << 9;
- dst->d_spc_timer = src->c[Q_SPC].timer;
- dst->d_spc_warns = src->c[Q_SPC].warns;
-
- dst->d_ino_count = src->c[Q_INO].v;
- dst->d_ino_hardlimit = src->c[Q_INO].hardlimit;
- dst->d_ino_softlimit = src->c[Q_INO].softlimit;
- dst->d_ino_timer = src->c[Q_INO].timer;
- dst->d_ino_warns = src->c[Q_INO].warns;
-}
-
-static int bch2_get_quota(struct super_block *sb, struct kqid kqid,
- struct qc_dqblk *qdq)
-{
- struct bch_fs *c = sb->s_fs_info;
- struct bch_memquota_type *q = &c->quotas[kqid.type];
- qid_t qid = from_kqid(&init_user_ns, kqid);
- struct bch_memquota *mq;
-
- memset(qdq, 0, sizeof(*qdq));
-
- mutex_lock(&q->lock);
- mq = genradix_ptr(&q->table, qid);
- if (mq)
- __bch2_quota_get(qdq, mq);
- mutex_unlock(&q->lock);
-
- return 0;
-}
-
-static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid,
- struct qc_dqblk *qdq)
-{
- struct bch_fs *c = sb->s_fs_info;
- struct bch_memquota_type *q = &c->quotas[kqid->type];
- qid_t qid = from_kqid(&init_user_ns, *kqid);
- struct genradix_iter iter;
- struct bch_memquota *mq;
- int ret = 0;
-
- mutex_lock(&q->lock);
-
- genradix_for_each_from(&q->table, iter, mq, qid)
- if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) {
- __bch2_quota_get(qdq, mq);
- *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos);
- goto found;
- }
-
- ret = -ENOENT;
-found:
- mutex_unlock(&q->lock);
- return bch2_err_class(ret);
-}
-
-static int bch2_set_quota_trans(struct btree_trans *trans,
- struct bkey_i_quota *new_quota,
- struct qc_dqblk *qdq)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_quotas, new_quota->k.p,
- BTREE_ITER_slots|BTREE_ITER_intent);
- ret = bkey_err(k);
- if (unlikely(ret))
- return ret;
-
- if (k.k->type == KEY_TYPE_quota)
- new_quota->v = *bkey_s_c_to_quota(k).v;
-
- if (qdq->d_fieldmask & QC_SPC_SOFT)
- new_quota->v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9);
- if (qdq->d_fieldmask & QC_SPC_HARD)
- new_quota->v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9);
-
- if (qdq->d_fieldmask & QC_INO_SOFT)
- new_quota->v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit);
- if (qdq->d_fieldmask & QC_INO_HARD)
- new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit);
-
- ret = bch2_trans_update(trans, &iter, &new_quota->k_i, 0);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int bch2_set_quota(struct super_block *sb, struct kqid qid,
- struct qc_dqblk *qdq)
-{
- struct bch_fs *c = sb->s_fs_info;
- struct bkey_i_quota new_quota;
- int ret;
-
- if (0) {
- struct printbuf buf = PRINTBUF;
-
- qc_dqblk_to_text(&buf, qdq);
- pr_info("setting:\n%s", buf.buf);
- printbuf_exit(&buf);
- }
-
- if (sb->s_flags & SB_RDONLY)
- return -EROFS;
-
- bkey_quota_init(&new_quota.k_i);
- new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
-
- ret = bch2_trans_commit_do(c, NULL, NULL, 0,
- bch2_set_quota_trans(trans, &new_quota, qdq)) ?:
- __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq);
-
- return bch2_err_class(ret);
-}
-
-const struct quotactl_ops bch2_quotactl_operations = {
- .quota_enable = bch2_quota_enable,
- .quota_disable = bch2_quota_disable,
- .rm_xquota = bch2_quota_remove,
-
- .get_state = bch2_quota_get_state,
- .set_info = bch2_quota_set_info,
-
- .get_dqblk = bch2_get_quota,
- .get_nextdqblk = bch2_get_next_quota,
- .set_dqblk = bch2_set_quota,
-};
-
-#endif /* CONFIG_BCACHEFS_QUOTA */
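The quotactl_ops table above is the interface the VFS quota code drives for Q_GETQUOTA, Q_SETQUOTA and friends; a filesystem normally wires it up while filling in the super_block at mount time. A minimal sketch of that hookup follows (the function name bch2_fill_super_sketch is illustrative and not taken from this patch):

/* Sketch: hook the quotactl table and supported quota types into the VFS. */
static int bch2_fill_super_sketch(struct super_block *sb, struct bch_fs *c)
{
	sb->s_fs_info		= c;
	sb->s_qcop		= &bch2_quotactl_operations;
	sb->s_quota_types	= QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
	return 0;
}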
diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h
deleted file mode 100644
index 1551800ff44c..000000000000
--- a/fs/bcachefs/quota.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_QUOTA_H
-#define _BCACHEFS_QUOTA_H
-
-#include "inode.h"
-#include "quota_types.h"
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
-
-int bch2_quota_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_quota ((struct bkey_ops) { \
- .key_validate = bch2_quota_validate, \
- .val_to_text = bch2_quota_to_text, \
- .min_val_size = 32, \
-})
-
-static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u)
-{
- return (struct bch_qid) {
- .q[QTYP_USR] = u->bi_uid,
- .q[QTYP_GRP] = u->bi_gid,
- .q[QTYP_PRJ] = u->bi_project ? u->bi_project - 1 : 0,
- };
-}
-
-static inline unsigned enabled_qtypes(struct bch_fs *c)
-{
- return ((c->opts.usrquota << QTYP_USR)|
- (c->opts.grpquota << QTYP_GRP)|
- (c->opts.prjquota << QTYP_PRJ));
-}
-
-#ifdef CONFIG_BCACHEFS_QUOTA
-
-int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters,
- s64, enum quota_acct_mode);
-
-int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid,
- struct bch_qid, u64, enum quota_acct_mode);
-
-void bch2_fs_quota_exit(struct bch_fs *);
-void bch2_fs_quota_init(struct bch_fs *);
-int bch2_fs_quota_read(struct bch_fs *);
-
-extern const struct quotactl_ops bch2_quotactl_operations;
-
-#else
-
-static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
- enum quota_counters counter, s64 v,
- enum quota_acct_mode mode)
-{
- return 0;
-}
-
-static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
- struct bch_qid dst,
- struct bch_qid src, u64 space,
- enum quota_acct_mode mode)
-{
- return 0;
-}
-
-static inline void bch2_fs_quota_exit(struct bch_fs *c) {}
-static inline void bch2_fs_quota_init(struct bch_fs *c) {}
-static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; }
-
-#endif
-
-#endif /* _BCACHEFS_QUOTA_H */
diff --git a/fs/bcachefs/quota_format.h b/fs/bcachefs/quota_format.h
deleted file mode 100644
index dc34347ef6c7..000000000000
--- a/fs/bcachefs/quota_format.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_QUOTA_FORMAT_H
-#define _BCACHEFS_QUOTA_FORMAT_H
-
-/* KEY_TYPE_quota: */
-
-enum quota_types {
- QTYP_USR = 0,
- QTYP_GRP = 1,
- QTYP_PRJ = 2,
- QTYP_NR = 3,
-};
-
-enum quota_counters {
- Q_SPC = 0,
- Q_INO = 1,
- Q_COUNTERS = 2,
-};
-
-struct bch_quota_counter {
- __le64 hardlimit;
- __le64 softlimit;
-};
-
-struct bch_quota {
- struct bch_val v;
- struct bch_quota_counter c[Q_COUNTERS];
-} __packed __aligned(8);
-
-/* BCH_SB_FIELD_quota: */
-
-struct bch_sb_quota_counter {
- __le32 timelimit;
- __le32 warnlimit;
-};
-
-struct bch_sb_quota_type {
- __le64 flags;
- struct bch_sb_quota_counter c[Q_COUNTERS];
-};
-
-struct bch_sb_field_quota {
- struct bch_sb_field field;
- struct bch_sb_quota_type q[QTYP_NR];
-} __packed __aligned(8);
-
-#endif /* _BCACHEFS_QUOTA_FORMAT_H */
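The bkey_ops for quota in quota.h set .min_val_size = 32, which follows from this layout: Q_COUNTERS (2) counters of two __le64 fields each, 16 bytes per counter. A compile-time check of that relationship could look like the sketch below (bcachefs does not necessarily carry such an assert):

/* Sketch: the on-disk quota value is Q_COUNTERS * 16 = 32 bytes. */
static_assert(sizeof(struct bch_quota_counter) == 2 * sizeof(__le64));
static_assert(Q_COUNTERS * sizeof(struct bch_quota_counter) == 32);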
diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h
deleted file mode 100644
index 6a136083d389..000000000000
--- a/fs/bcachefs/quota_types.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_QUOTA_TYPES_H
-#define _BCACHEFS_QUOTA_TYPES_H
-
-#include <linux/generic-radix-tree.h>
-
-struct bch_qid {
- u32 q[QTYP_NR];
-};
-
-enum quota_acct_mode {
- KEY_TYPE_QUOTA_PREALLOC,
- KEY_TYPE_QUOTA_WARN,
- KEY_TYPE_QUOTA_NOCHECK,
-};
-
-struct memquota_counter {
- u64 v;
- u64 hardlimit;
- u64 softlimit;
- s64 timer;
- int warns;
- int warning_issued;
-};
-
-struct bch_memquota {
- struct memquota_counter c[Q_COUNTERS];
-};
-
-typedef GENRADIX(struct bch_memquota) bch_memquota_table;
-
-struct quota_limit {
- u32 timelimit;
- u32 warnlimit;
-};
-
-struct bch_memquota_type {
- struct quota_limit limits[Q_COUNTERS];
- bch_memquota_table table;
- struct mutex lock;
-};
-
-#endif /* _BCACHEFS_QUOTA_TYPES_H */
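These are the in-memory types that the accounting entry points in quota.c operate on; a caller charging space and an inode against a file's quotas might look like the sketch below (inode_u and the 8-sector figure are illustrative):

/* Sketch: charge 8 sectors and one inode, with limits enforced. */
int ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_SPC, 8,
			  KEY_TYPE_QUOTA_PREALLOC) ?:
	  bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
			  KEY_TYPE_QUOTA_PREALLOC);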
diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c
deleted file mode 100644
index b1438be9d690..000000000000
--- a/fs/bcachefs/rcu_pending.c
+++ /dev/null
@@ -1,666 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#define pr_fmt(fmt) "%s() " fmt "\n", __func__
-
-#include <linux/generic-radix-tree.h>
-#include <linux/mm.h>
-#include <linux/percpu.h>
-#include <linux/slab.h>
-#include <linux/srcu.h>
-#include <linux/vmalloc.h>
-
-#include "rcu_pending.h"
-#include "darray.h"
-#include "util.h"
-
-#define static_array_for_each(_a, _i) \
- for (typeof(&(_a)[0]) _i = _a; \
- _i < (_a) + ARRAY_SIZE(_a); \
- _i++)
-
-enum rcu_pending_special {
- RCU_PENDING_KVFREE = 1,
- RCU_PENDING_CALL_RCU = 2,
-};
-
-#define RCU_PENDING_KVFREE_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE)
-#define RCU_PENDING_CALL_RCU_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU)
-
-#ifdef __KERNEL__
-typedef unsigned long rcu_gp_poll_state_t;
-
-static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r)
-{
- return l == r;
-}
-#else
-typedef struct urcu_gp_poll_state rcu_gp_poll_state_t;
-
-static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r)
-{
- return l.grace_period_id == r.grace_period_id;
-}
-#endif
-
-static inline rcu_gp_poll_state_t __get_state_synchronize_rcu(struct srcu_struct *ssp)
-{
- return ssp
- ? get_state_synchronize_srcu(ssp)
- : get_state_synchronize_rcu();
-}
-
-static inline rcu_gp_poll_state_t __start_poll_synchronize_rcu(struct srcu_struct *ssp)
-{
- return ssp
- ? start_poll_synchronize_srcu(ssp)
- : start_poll_synchronize_rcu();
-}
-
-static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, rcu_gp_poll_state_t cookie)
-{
- return ssp
- ? poll_state_synchronize_srcu(ssp, cookie)
- : poll_state_synchronize_rcu(cookie);
-}
-
-static inline void __rcu_barrier(struct srcu_struct *ssp)
-{
- return ssp
- ? srcu_barrier(ssp)
- : rcu_barrier();
-}
-
-static inline void __call_rcu(struct srcu_struct *ssp, struct rcu_head *rhp,
- rcu_callback_t func)
-{
- if (ssp)
- call_srcu(ssp, rhp, func);
- else
- call_rcu(rhp, func);
-}
-
-struct rcu_pending_seq {
- /*
- * We're using a radix tree like a vector - we're just pushing elements
- * onto the end; we're using a radix tree instead of an actual vector to
- * avoid reallocation overhead
- */
- GENRADIX(struct rcu_head *) objs;
- size_t nr;
- struct rcu_head **cursor;
- rcu_gp_poll_state_t seq;
-};
-
-struct rcu_pending_list {
- struct rcu_head *head;
- struct rcu_head *tail;
- rcu_gp_poll_state_t seq;
-};
-
-struct rcu_pending_pcpu {
- struct rcu_pending *parent;
- spinlock_t lock;
- int cpu;
-
- /*
- * We can't bound the number of unprocessed gp sequence numbers, and we
- * can't efficiently merge radix trees for expired grace periods, so we
- * need darray/vector:
- */
- DARRAY_PREALLOCATED(struct rcu_pending_seq, 4) objs;
-
- /* Third entry is for expired objects: */
- struct rcu_pending_list lists[NUM_ACTIVE_RCU_POLL_OLDSTATE + 1];
-
- struct rcu_head cb;
- bool cb_armed;
- struct work_struct work;
-};
-
-static bool __rcu_pending_has_pending(struct rcu_pending_pcpu *p)
-{
- if (p->objs.nr)
- return true;
-
- static_array_for_each(p->lists, i)
- if (i->head)
- return true;
-
- return false;
-}
-
-static void rcu_pending_list_merge(struct rcu_pending_list *l1,
- struct rcu_pending_list *l2)
-{
-#ifdef __KERNEL__
- if (!l1->head)
- l1->head = l2->head;
- else
- l1->tail->next = l2->head;
-#else
- if (!l1->head)
- l1->head = l2->head;
- else
- l1->tail->next.next = (void *) l2->head;
-#endif
-
- l1->tail = l2->tail;
- l2->head = l2->tail = NULL;
-}
-
-static void rcu_pending_list_add(struct rcu_pending_list *l,
- struct rcu_head *n)
-{
-#ifdef __KERNEL__
- if (!l->head)
- l->head = n;
- else
- l->tail->next = n;
- l->tail = n;
- n->next = NULL;
-#else
- if (!l->head)
- l->head = n;
- else
- l->tail->next.next = (void *) n;
- l->tail = n;
- n->next.next = NULL;
-#endif
-}
-
-static void merge_expired_lists(struct rcu_pending_pcpu *p)
-{
- struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
-
- for (struct rcu_pending_list *i = p->lists; i < expired; i++)
- if (i->head && __poll_state_synchronize_rcu(p->parent->srcu, i->seq))
- rcu_pending_list_merge(expired, i);
-}
-
-#ifndef __KERNEL__
-static inline void kfree_bulk(size_t nr, void ** p)
-{
- while (nr--)
- kfree(*p);
-}
-#endif
-
-static noinline void __process_finished_items(struct rcu_pending *pending,
- struct rcu_pending_pcpu *p,
- unsigned long flags)
-{
- struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
- struct rcu_pending_seq objs = {};
- struct rcu_head *list = NULL;
-
- if (p->objs.nr &&
- __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) {
- objs = p->objs.data[0];
- darray_remove_item(&p->objs, p->objs.data);
- }
-
- merge_expired_lists(p);
-
- list = expired->head;
- expired->head = expired->tail = NULL;
-
- spin_unlock_irqrestore(&p->lock, flags);
-
- switch ((ulong) pending->process) {
- case RCU_PENDING_KVFREE:
- for (size_t i = 0; i < objs.nr; ) {
- size_t nr_this_node = min(GENRADIX_NODE_SIZE / sizeof(void *), objs.nr - i);
-
- kfree_bulk(nr_this_node, (void **) genradix_ptr(&objs.objs, i));
- i += nr_this_node;
- }
- genradix_free(&objs.objs);
-
- while (list) {
- struct rcu_head *obj = list;
-#ifdef __KERNEL__
- list = obj->next;
-#else
- list = (void *) obj->next.next;
-#endif
-
- /*
- * low bit of pointer indicates whether rcu_head needs
- * to be freed - kvfree_rcu_mightsleep()
- */
- BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0);
-
- void *ptr = (void *)(((unsigned long) obj->func) & ~1UL);
- bool free_head = ((unsigned long) obj->func) & 1UL;
-
- kvfree(ptr);
- if (free_head)
- kfree(obj);
- }
-
- break;
-
- case RCU_PENDING_CALL_RCU:
- for (size_t i = 0; i < objs.nr; i++) {
- struct rcu_head *obj = *genradix_ptr(&objs.objs, i);
- obj->func(obj);
- }
- genradix_free(&objs.objs);
-
- while (list) {
- struct rcu_head *obj = list;
-#ifdef __KERNEL__
- list = obj->next;
-#else
- list = (void *) obj->next.next;
-#endif
- obj->func(obj);
- }
- break;
-
- default:
- for (size_t i = 0; i < objs.nr; i++)
- pending->process(pending, *genradix_ptr(&objs.objs, i));
- genradix_free(&objs.objs);
-
- while (list) {
- struct rcu_head *obj = list;
-#ifdef __KERNEL__
- list = obj->next;
-#else
- list = (void *) obj->next.next;
-#endif
- pending->process(pending, obj);
- }
- break;
- }
-}
-
-static bool process_finished_items(struct rcu_pending *pending,
- struct rcu_pending_pcpu *p,
- unsigned long flags)
-{
- /*
- * XXX: we should grab the gp seq once and avoid multiple function
- * calls; this is called from the __rcu_pending_enqueue() fastpath in
- * may_sleep==true mode
- */
- if ((p->objs.nr && __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) ||
- (p->lists[0].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[0].seq)) ||
- (p->lists[1].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[1].seq)) ||
- p->lists[2].head) {
- __process_finished_items(pending, p, flags);
- return true;
- }
-
- return false;
-}
-
-static void rcu_pending_work(struct work_struct *work)
-{
- struct rcu_pending_pcpu *p =
- container_of(work, struct rcu_pending_pcpu, work);
- struct rcu_pending *pending = p->parent;
- unsigned long flags;
-
- do {
- spin_lock_irqsave(&p->lock, flags);
- } while (process_finished_items(pending, p, flags));
-
- spin_unlock_irqrestore(&p->lock, flags);
-}
-
-static void rcu_pending_rcu_cb(struct rcu_head *rcu)
-{
- struct rcu_pending_pcpu *p = container_of(rcu, struct rcu_pending_pcpu, cb);
-
- schedule_work_on(p->cpu, &p->work);
-
- unsigned long flags;
- spin_lock_irqsave(&p->lock, flags);
- if (__rcu_pending_has_pending(p)) {
- spin_unlock_irqrestore(&p->lock, flags);
- __call_rcu(p->parent->srcu, &p->cb, rcu_pending_rcu_cb);
- } else {
- p->cb_armed = false;
- spin_unlock_irqrestore(&p->lock, flags);
- }
-}
-
-static __always_inline struct rcu_pending_seq *
-get_object_radix(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq)
-{
- darray_for_each_reverse(p->objs, objs)
- if (rcu_gp_poll_cookie_eq(objs->seq, seq))
- return objs;
-
- if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC))
- return NULL;
-
- return &darray_last(p->objs);
-}
-
-static noinline bool
-rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq,
- struct rcu_head *head, void *ptr,
- unsigned long *flags)
-{
- if (ptr) {
- if (!head) {
- /*
- * kvfree_rcu_mightsleep(): we weren't passed an
- * rcu_head, but we need one: use the low bit of the
- * pointer to free to flag that the head needs to be
- * freed as well:
- */
- ptr = (void *)(((unsigned long) ptr)|1UL);
- head = kmalloc(sizeof(*head), __GFP_NOWARN);
- if (!head) {
- spin_unlock_irqrestore(&p->lock, *flags);
- head = kmalloc(sizeof(*head), GFP_KERNEL|__GFP_NOFAIL);
- /*
- * dropped lock, did GFP_KERNEL allocation,
- * check for gp expiration
- */
- if (unlikely(__poll_state_synchronize_rcu(p->parent->srcu, seq))) {
- kvfree(--ptr);
- kfree(head);
- spin_lock_irqsave(&p->lock, *flags);
- return false;
- }
- }
- }
-
- head->func = ptr;
- }
-again:
- for (struct rcu_pending_list *i = p->lists;
- i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
- if (rcu_gp_poll_cookie_eq(i->seq, seq)) {
- rcu_pending_list_add(i, head);
- return false;
- }
- }
-
- for (struct rcu_pending_list *i = p->lists;
- i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
- if (!i->head) {
- i->seq = seq;
- rcu_pending_list_add(i, head);
- return true;
- }
- }
-
- merge_expired_lists(p);
- goto again;
-}
-
-/*
- * __rcu_pending_enqueue: enqueue a pending RCU item, to be processed (via
- * pending->process) once a grace period elapses.
- *
- * Attempt to enqueue items onto a radix tree; if memory allocation fails, fall
- * back to a linked list.
- *
- * - If @ptr is NULL, we're enqueuing an item for a generic @pending with a
- * process callback
- *
- * - If @ptr and @head are both not NULL, we're kvfree_rcu()
- *
- * - If @ptr is not NULL and @head is, we're kvfree_rcu_mightsleep()
- *
- * - If @may_sleep is true, will do GFP_KERNEL memory allocations and process
- * expired items.
- */
-static __always_inline void
-__rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head,
- void *ptr, bool may_sleep)
-{
-
- struct rcu_pending_pcpu *p;
- struct rcu_pending_seq *objs;
- struct genradix_node *new_node = NULL;
- unsigned long flags;
- bool start_gp = false;
-
- BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN));
-
- /* We could technically be scheduled before taking the lock and end up
- * using a different CPU's rcu_pending_pcpu: that's ok, it needs a lock
- * anyway
- *
- * And we have to do it this way to avoid breaking PREEMPT_RT, which
- * redefines how spinlocks work:
- */
- p = raw_cpu_ptr(pending->p);
- spin_lock_irqsave(&p->lock, flags);
- rcu_gp_poll_state_t seq = __get_state_synchronize_rcu(pending->srcu);
-restart:
- if (may_sleep &&
- unlikely(process_finished_items(pending, p, flags)))
- goto check_expired;
-
- /*
- * In kvfree_rcu() mode, the radix tree is only for slab pointers so
- * that we can do kfree_bulk() - vmalloc pointers always use the linked
- * list:
- */
- if (ptr && unlikely(is_vmalloc_addr(ptr)))
- goto list_add;
-
- objs = get_object_radix(p, seq);
- if (unlikely(!objs))
- goto list_add;
-
- if (unlikely(!objs->cursor)) {
- /*
- * New radix tree nodes must be added under @p->lock because the
- * tree root is in a darray that can be resized (typically,
- * genradix supports concurrent unlocked allocation of new
- * nodes) - hence preallocation and the retry loop:
- */
- objs->cursor = genradix_ptr_alloc_preallocated_inlined(&objs->objs,
- objs->nr, &new_node, GFP_ATOMIC|__GFP_NOWARN);
- if (unlikely(!objs->cursor)) {
- if (may_sleep) {
- spin_unlock_irqrestore(&p->lock, flags);
-
- gfp_t gfp = GFP_KERNEL;
- if (!head)
- gfp |= __GFP_NOFAIL;
-
- new_node = genradix_alloc_node(gfp);
- if (!new_node)
- may_sleep = false;
- goto check_expired;
- }
-list_add:
- start_gp = rcu_pending_enqueue_list(p, seq, head, ptr, &flags);
- goto start_gp;
- }
- }
-
- *objs->cursor++ = ptr ?: head;
- /* zero cursor if we hit the end of a radix tree node: */
- if (!(((ulong) objs->cursor) & (GENRADIX_NODE_SIZE - 1)))
- objs->cursor = NULL;
- start_gp = !objs->nr;
- objs->nr++;
-start_gp:
- if (unlikely(start_gp)) {
- /*
- * We only have one callback (ideally, we would have one for
- * every outstanding grace period) - so if our callback is
- * already in flight, we may still have to start a grace period
- * (since we used get_state() above, not start_poll())
- */
- if (!p->cb_armed) {
- p->cb_armed = true;
- __call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb);
- } else {
- __start_poll_synchronize_rcu(pending->srcu);
- }
- }
- spin_unlock_irqrestore(&p->lock, flags);
-free_node:
- if (new_node)
- genradix_free_node(new_node);
- return;
-check_expired:
- if (unlikely(__poll_state_synchronize_rcu(pending->srcu, seq))) {
- switch ((ulong) pending->process) {
- case RCU_PENDING_KVFREE:
- kvfree(ptr);
- break;
- case RCU_PENDING_CALL_RCU:
- head->func(head);
- break;
- default:
- pending->process(pending, head);
- break;
- }
- goto free_node;
- }
-
- p = raw_cpu_ptr(pending->p);
- spin_lock_irqsave(&p->lock, flags);
- goto restart;
-}
-
-void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj)
-{
- __rcu_pending_enqueue(pending, obj, NULL, true);
-}
-
-static struct rcu_head *rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu *p)
-{
- struct rcu_head *ret = NULL;
-
- spin_lock_irq(&p->lock);
- darray_for_each(p->objs, objs)
- if (objs->nr) {
- ret = *genradix_ptr(&objs->objs, --objs->nr);
- objs->cursor = NULL;
- if (!objs->nr)
- genradix_free(&objs->objs);
- goto out;
- }
-
- static_array_for_each(p->lists, i)
- if (i->head) {
- ret = i->head;
-#ifdef __KERNEL__
- i->head = ret->next;
-#else
- i->head = (void *) ret->next.next;
-#endif
- if (!i->head)
- i->tail = NULL;
- goto out;
- }
-out:
- spin_unlock_irq(&p->lock);
-
- return ret;
-}
-
-struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending)
-{
- return rcu_pending_pcpu_dequeue(raw_cpu_ptr(pending->p));
-}
-
-struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending)
-{
- struct rcu_head *ret = rcu_pending_dequeue(pending);
-
- if (ret)
- return ret;
-
- int cpu;
- for_each_possible_cpu(cpu) {
- ret = rcu_pending_pcpu_dequeue(per_cpu_ptr(pending->p, cpu));
- if (ret)
- break;
- }
- return ret;
-}
-
-static bool rcu_pending_has_pending_or_armed(struct rcu_pending *pending)
-{
- int cpu;
- for_each_possible_cpu(cpu) {
- struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
- spin_lock_irq(&p->lock);
- if (__rcu_pending_has_pending(p) || p->cb_armed) {
- spin_unlock_irq(&p->lock);
- return true;
- }
- spin_unlock_irq(&p->lock);
- }
-
- return false;
-}
-
-void rcu_pending_exit(struct rcu_pending *pending)
-{
- int cpu;
-
- if (!pending->p)
- return;
-
- while (rcu_pending_has_pending_or_armed(pending)) {
- __rcu_barrier(pending->srcu);
-
- for_each_possible_cpu(cpu) {
- struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
- flush_work(&p->work);
- }
- }
-
- for_each_possible_cpu(cpu) {
- struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
- flush_work(&p->work);
- }
-
- for_each_possible_cpu(cpu) {
- struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
-
- static_array_for_each(p->lists, i)
- WARN_ON(i->head);
- WARN_ON(p->objs.nr);
- darray_exit(&p->objs);
- }
- free_percpu(pending->p);
-}
-
-/**
- * rcu_pending_init() - initialize an rcu_pending
- *
- * @pending: Object to init
- * @srcu: May optionally be used with an srcu_struct; if NULL, uses normal
- * RCU flavor
- * @process: Callback function invoked on objects once their RCU barriers
- * have completed; if NULL, kvfree() is used.
- */
-int rcu_pending_init(struct rcu_pending *pending,
- struct srcu_struct *srcu,
- rcu_pending_process_fn process)
-{
- pending->p = alloc_percpu(struct rcu_pending_pcpu);
- if (!pending->p)
- return -ENOMEM;
-
- int cpu;
- for_each_possible_cpu(cpu) {
- struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
- p->parent = pending;
- p->cpu = cpu;
- spin_lock_init(&p->lock);
- darray_init(&p->objs);
- INIT_WORK(&p->work, rcu_pending_work);
- }
-
- pending->srcu = srcu;
- pending->process = process;
-
- return 0;
-}
diff --git a/fs/bcachefs/rcu_pending.h b/fs/bcachefs/rcu_pending.h
deleted file mode 100644
index 71a2f4ddaade..000000000000
--- a/fs/bcachefs/rcu_pending.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_RCU_PENDING_H
-#define _LINUX_RCU_PENDING_H
-
-#include <linux/rcupdate.h>
-
-struct rcu_pending;
-typedef void (*rcu_pending_process_fn)(struct rcu_pending *, struct rcu_head *);
-
-struct rcu_pending_pcpu;
-
-struct rcu_pending {
- struct rcu_pending_pcpu __percpu *p;
- struct srcu_struct *srcu;
- rcu_pending_process_fn process;
-};
-
-void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj);
-struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending);
-struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending);
-
-void rcu_pending_exit(struct rcu_pending *pending);
-int rcu_pending_init(struct rcu_pending *pending,
- struct srcu_struct *srcu,
- rcu_pending_process_fn process);
-
-#endif /* _LINUX_RCU_PENDING_H */
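The header above covers the full life cycle: rcu_pending_init() with an optional srcu_struct and a process callback, rcu_pending_enqueue() for each object, and rcu_pending_exit() to drain. A hedged usage sketch follows (struct foo and its helpers are invented for illustration):

/* Sketch: typical rcu_pending usage with a process callback (needs <linux/slab.h>). */
struct foo {
	struct rcu_head	rcu;
	/* payload ... */
};

static struct rcu_pending foo_pending;

static void foo_process(struct rcu_pending *pending, struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct foo, rcu));
}

static int foo_init(void)
{
	/* NULL srcu_struct: use regular RCU grace periods */
	return rcu_pending_init(&foo_pending, NULL, foo_process);
}

static void foo_retire(struct foo *f)
{
	/* freed via foo_process() once a grace period has elapsed */
	rcu_pending_enqueue(&foo_pending, &f->rcu);
}

static void foo_exit(void)
{
	rcu_pending_exit(&foo_pending);
}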
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
deleted file mode 100644
index 1c345b86b1c0..000000000000
--- a/fs/bcachefs/rebalance.c
+++ /dev/null
@@ -1,889 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "clock.h"
-#include "compress.h"
-#include "disk_groups.h"
-#include "errcode.h"
-#include "error.h"
-#include "inode.h"
-#include "io_write.h"
-#include "move.h"
-#include "rebalance.h"
-#include "subvolume.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/sched/cputime.h>
-
-/* bch_extent_rebalance: */
-
-static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs)
-{
- const union bch_extent_entry *entry;
-
- bkey_extent_entry_for_each(ptrs, entry)
- if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
- return &entry->rebalance;
-
- return NULL;
-}
-
-static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
-{
- return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k));
-}
-
-static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c,
- struct bch_io_opts *opts,
- struct bkey_s_c k,
- struct bkey_ptrs_c ptrs)
-{
- if (!opts->background_compression)
- return 0;
-
- unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned ptr_bit = 1;
- unsigned rewrite_ptrs = 0;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
- p.ptr.unwritten)
- return 0;
-
- if (!p.ptr.cached && p.crc.compression_type != compression_type)
- rewrite_ptrs |= ptr_bit;
- ptr_bit <<= 1;
- }
-
- return rewrite_ptrs;
-}
-
-static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c,
- struct bch_io_opts *opts,
- struct bkey_ptrs_c ptrs)
-{
- if (!opts->background_target ||
- !bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target))
- return 0;
-
- unsigned ptr_bit = 1;
- unsigned rewrite_ptrs = 0;
-
- guard(rcu)();
- bkey_for_each_ptr(ptrs, ptr) {
- if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target))
- rewrite_ptrs |= ptr_bit;
- ptr_bit <<= 1;
- }
-
- return rewrite_ptrs;
-}
-
-static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
- struct bch_io_opts *opts,
- struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
- return 0;
-
- return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) |
- bch2_bkey_ptrs_need_move(c, opts, ptrs);
-}
-
-u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs);
- if (!opts)
- return 0;
-
- if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
- return 0;
-
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- u64 sectors = 0;
-
- if (opts->background_compression) {
- unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
- p.ptr.unwritten) {
- sectors = 0;
- goto incompressible;
- }
-
- if (!p.ptr.cached && p.crc.compression_type != compression_type)
- sectors += p.crc.compressed_size;
- }
- }
-incompressible:
- if (opts->background_target) {
- guard(rcu)();
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (!p.ptr.cached &&
- !bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
- sectors += p.crc.compressed_size;
- }
-
- return sectors;
-}
-
-static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts,
- struct bkey_s_c k)
-{
- if (!bkey_extent_is_direct_data(k.k))
- return 0;
-
- const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k);
-
- if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) {
- struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, opts);
- return old == NULL || memcmp(old, &new, sizeof(new));
- } else {
- return old != NULL;
- }
-}
-
-int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts,
- struct bkey_i *_k)
-{
- if (!bkey_extent_is_direct_data(&_k->k))
- return 0;
-
- struct bkey_s k = bkey_i_to_s(_k);
- struct bch_extent_rebalance *old =
- (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
-
- if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k.s_c)) {
- if (!old) {
- old = bkey_val_end(k);
- k.k->u64s += sizeof(*old) / sizeof(u64);
- }
-
- *old = io_opts_to_rebalance_opts(c, opts);
- } else {
- if (old)
- extent_entry_drop(k, (union bch_extent_entry *) old);
- }
-
- return 0;
-}
-
-int bch2_get_update_rebalance_opts(struct btree_trans *trans,
- struct bch_io_opts *io_opts,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- BUG_ON(iter->flags & BTREE_ITER_is_extents);
- BUG_ON(iter->flags & BTREE_ITER_filter_snapshots);
-
- const struct bch_extent_rebalance *r = k.k->type == KEY_TYPE_reflink_v
- ? bch2_bkey_rebalance_opts(k) : NULL;
- if (r) {
-#define x(_name) \
- if (r->_name##_from_inode) { \
- io_opts->_name = r->_name; \
- io_opts->_name##_from_inode = true; \
- }
- BCH_REBALANCE_OPTS()
-#undef x
- }
-
- if (!bch2_bkey_rebalance_needs_update(trans->c, io_opts, k))
- return 0;
-
- struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8);
- int ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- return ret;
-
- bkey_reassemble(n, k);
-
- /* On successful transaction commit, @k was invalidated: */
-
- return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?:
- bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_trans_commit(trans, NULL, NULL, 0) ?:
- -BCH_ERR_transaction_restart_nested;
-}
-
-#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1)
-
-static const char * const bch2_rebalance_state_strs[] = {
-#define x(t) #t,
- BCH_REBALANCE_STATES()
- NULL
-#undef x
-};
-
-int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_i_cookie *cookie;
- u64 v;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
- SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
- BTREE_ITER_intent);
- k = bch2_btree_iter_peek_slot(trans, &iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- v = k.k->type == KEY_TYPE_cookie
- ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
- : 0;
-
- cookie = bch2_trans_kmalloc(trans, sizeof(*cookie));
- ret = PTR_ERR_OR_ZERO(cookie);
- if (ret)
- goto err;
-
- bkey_cookie_init(&cookie->k_i);
- cookie->k.p = iter.pos;
- cookie->v.cookie = cpu_to_le64(v + 1);
-
- ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
-{
- int ret = bch2_trans_commit_do(c, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
- bch2_set_rebalance_needs_scan_trans(trans, inum));
- bch2_rebalance_wakeup(c);
- return ret;
-}
-
-int bch2_set_fs_needs_rebalance(struct bch_fs *c)
-{
- return bch2_set_rebalance_needs_scan(c, 0);
-}
-
-static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- u64 v;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
- SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
- BTREE_ITER_intent);
- k = bch2_btree_iter_peek_slot(trans, &iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- v = k.k->type == KEY_TYPE_cookie
- ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
- : 0;
-
- if (v == cookie)
- ret = bch2_btree_delete_at(trans, &iter, 0);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
- struct btree_iter *work_iter)
-{
- return !kthread_should_stop()
- ? bch2_btree_iter_peek(trans, work_iter)
- : bkey_s_c_null;
-}
-
-static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k))
- return 0;
-
- struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
- int ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- return ret;
-
- extent_entry_drop(bkey_i_to_s(n),
- (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
- return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-}
-
-static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
- struct bpos work_pos,
- struct btree_iter *extent_iter,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- struct bch_fs *c = trans->c;
-
- bch2_trans_iter_exit(trans, extent_iter);
- bch2_trans_iter_init(trans, extent_iter,
- work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
- work_pos,
- BTREE_ITER_all_snapshots);
- struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, extent_iter);
- if (bkey_err(k))
- return k;
-
- int ret = bch2_move_get_io_opts_one(trans, io_opts, extent_iter, k);
- if (ret)
- return bkey_s_c_err(ret);
-
- memset(data_opts, 0, sizeof(*data_opts));
- data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
- data_opts->target = io_opts->background_target;
- data_opts->write_flags |= BCH_WRITE_only_specified_devs;
-
- if (!data_opts->rewrite_ptrs) {
- /*
- * Is the device we would want to write to offline? Did the devices in the
- * target change?
- *
- * We'll now need a full scan before this extent is picked up
- * again:
- */
- int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k);
- if (ret)
- return bkey_s_c_err(ret);
- return bkey_s_c_null;
- }
-
- if (trace_rebalance_extent_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, k);
- prt_newline(&buf);
-
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- unsigned p = bch2_bkey_ptrs_need_compress(c, io_opts, k, ptrs);
- if (p) {
- prt_str(&buf, "compression=");
- bch2_compression_opt_to_text(&buf, io_opts->background_compression);
- prt_str(&buf, " ");
- bch2_prt_u64_base2(&buf, p);
- prt_newline(&buf);
- }
-
- p = bch2_bkey_ptrs_need_move(c, io_opts, ptrs);
- if (p) {
- prt_str(&buf, "move=");
- bch2_target_to_text(&buf, c, io_opts->background_target);
- prt_str(&buf, " ");
- bch2_prt_u64_base2(&buf, p);
- prt_newline(&buf);
- }
-
- trace_rebalance_extent(c, buf.buf);
- printbuf_exit(&buf);
- }
-
- return k;
-}
-
-noinline_for_stack
-static int do_rebalance_extent(struct moving_context *ctxt,
- struct bpos work_pos,
- struct btree_iter *extent_iter)
-{
- struct btree_trans *trans = ctxt->trans;
- struct bch_fs *c = trans->c;
- struct bch_fs_rebalance *r = &trans->c->rebalance;
- struct data_update_opts data_opts;
- struct bch_io_opts io_opts;
- struct bkey_s_c k;
- struct bkey_buf sk;
- int ret;
-
- ctxt->stats = &r->work_stats;
- r->state = BCH_REBALANCE_working;
-
- bch2_bkey_buf_init(&sk);
-
- ret = bkey_err(k = next_rebalance_extent(trans, work_pos,
- extent_iter, &io_opts, &data_opts));
- if (ret || !k.k)
- goto out;
-
- atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
-
- /*
- * The iterator gets unlocked by __bch2_read_extent - need to
- * save a copy of @k elsewhere:
- */
- bch2_bkey_buf_reassemble(&sk, c, k);
- k = bkey_i_to_s_c(sk.k);
-
- ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts);
- if (ret) {
- if (bch2_err_matches(ret, ENOMEM)) {
- /* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(ctxt);
- ret = bch_err_throw(c, transaction_restart_nested);
- }
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto out;
-
- /* skip it and continue, XXX signal failure */
- ret = 0;
- }
-out:
- bch2_bkey_buf_exit(&sk, c);
- return ret;
-}
-
-static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
-{
- struct btree_trans *trans = ctxt->trans;
- struct bch_fs *c = trans->c;
- struct bch_fs_rebalance *r = &trans->c->rebalance;
-
- bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
- ctxt->stats = &r->scan_stats;
-
- if (!inum) {
- r->scan_start = BBPOS_MIN;
- r->scan_end = BBPOS_MAX;
- } else {
- r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0));
- r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX));
- }
-
- r->state = BCH_REBALANCE_scanning;
-
- struct per_snapshot_io_opts snapshot_io_opts;
- per_snapshot_io_opts_init(&snapshot_io_opts, c);
-
- int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
- r->scan_start.pos, r->scan_end.pos,
- BTREE_ITER_all_snapshots|
- BTREE_ITER_not_extents|
- BTREE_ITER_prefetch, k, ({
- ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
-
- struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans,
- &snapshot_io_opts, iter.pos, &iter, k);
- PTR_ERR_OR_ZERO(io_opts);
- })) ?:
- commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_clear_rebalance_needs_scan(trans, inum, cookie));
-
- per_snapshot_io_opts_exit(&snapshot_io_opts);
- bch2_move_stats_exit(&r->scan_stats, trans->c);
-
- /*
- * Ensure that the rebalance_work entries we created are seen by the
- * next iteration of do_rebalance(), so we don't end up stuck in
- * rebalance_wait():
- */
- atomic64_inc(&r->scan_stats.sectors_seen);
- bch2_btree_write_buffer_flush_sync(trans);
-
- return ret;
-}
-
-static void rebalance_wait(struct bch_fs *c)
-{
- struct bch_fs_rebalance *r = &c->rebalance;
- struct io_clock *clock = &c->io_clock[WRITE];
- u64 now = atomic64_read(&clock->now);
- u64 min_member_capacity = bch2_min_rw_member_capacity(c);
-
- if (min_member_capacity == U64_MAX)
- min_member_capacity = 128 * 2048;
-
- r->wait_iotime_end = now + (min_member_capacity >> 6);
-
- if (r->state != BCH_REBALANCE_waiting) {
- r->wait_iotime_start = now;
- r->wait_wallclock_start = ktime_get_real_ns();
- r->state = BCH_REBALANCE_waiting;
- }
-
- bch2_kthread_io_clock_wait_once(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
-}
-
-static bool bch2_rebalance_enabled(struct bch_fs *c)
-{
- return c->opts.rebalance_enabled &&
- !(c->opts.rebalance_on_ac_only &&
- c->rebalance.on_battery);
-}
-
-static int do_rebalance(struct moving_context *ctxt)
-{
- struct btree_trans *trans = ctxt->trans;
- struct bch_fs *c = trans->c;
- struct bch_fs_rebalance *r = &c->rebalance;
- struct btree_iter rebalance_work_iter, extent_iter = {};
- struct bkey_s_c k;
- u32 kick = r->kick;
- int ret = 0;
-
- bch2_trans_begin(trans);
-
- bch2_move_stats_init(&r->work_stats, "rebalance_work");
- bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
-
- bch2_trans_iter_init(trans, &rebalance_work_iter,
- BTREE_ID_rebalance_work, POS_MIN,
- BTREE_ITER_all_snapshots);
-
- while (!bch2_move_ratelimit(ctxt)) {
- if (!bch2_rebalance_enabled(c)) {
- bch2_moving_ctxt_flush_all(ctxt);
- kthread_wait_freezable(bch2_rebalance_enabled(c) ||
- kthread_should_stop());
- }
-
- if (kthread_should_stop())
- break;
-
- bch2_trans_begin(trans);
-
- ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret || !k.k)
- break;
-
- ret = k.k->type == KEY_TYPE_cookie
- ? do_rebalance_scan(ctxt, k.k->p.inode,
- le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie))
- : do_rebalance_extent(ctxt, k.k->p, &extent_iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
-
- bch2_btree_iter_advance(trans, &rebalance_work_iter);
- }
-
- bch2_trans_iter_exit(trans, &extent_iter);
- bch2_trans_iter_exit(trans, &rebalance_work_iter);
- bch2_move_stats_exit(&r->scan_stats, c);
-
- if (!ret &&
- !kthread_should_stop() &&
- !atomic64_read(&r->work_stats.sectors_seen) &&
- !atomic64_read(&r->scan_stats.sectors_seen) &&
- kick == r->kick) {
- bch2_moving_ctxt_flush_all(ctxt);
- bch2_trans_unlock_long(trans);
- rebalance_wait(c);
- }
-
- if (!bch2_err_matches(ret, EROFS))
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int bch2_rebalance_thread(void *arg)
-{
- struct bch_fs *c = arg;
- struct bch_fs_rebalance *r = &c->rebalance;
- struct moving_context ctxt;
-
- set_freezable();
-
- /*
- * Data move operations can't run until after check_snapshots has
- * completed, and bch2_snapshot_is_ancestor() is available.
- */
- kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots ||
- kthread_should_stop());
-
- bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
- writepoint_ptr(&c->rebalance_write_point),
- true);
-
- while (!kthread_should_stop() && !do_rebalance(&ctxt))
- ;
-
- bch2_moving_ctxt_exit(&ctxt);
-
- return 0;
-}
-
-void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
-{
- printbuf_tabstop_push(out, 32);
-
- struct bch_fs_rebalance *r = &c->rebalance;
-
- /* print pending work */
- struct disk_accounting_pos acc;
- disk_accounting_key_init(acc, rebalance_work);
- u64 v;
- bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
-
- prt_printf(out, "pending work:\t");
- prt_human_readable_u64(out, v << 9);
- prt_printf(out, "\n\n");
-
- prt_str(out, bch2_rebalance_state_strs[r->state]);
- prt_newline(out);
- printbuf_indent_add(out, 2);
-
- switch (r->state) {
- case BCH_REBALANCE_waiting: {
- u64 now = atomic64_read(&c->io_clock[WRITE].now);
-
- prt_printf(out, "io wait duration:\t");
- bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9);
- prt_newline(out);
-
- prt_printf(out, "io wait remaining:\t");
- bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9);
- prt_newline(out);
-
- prt_printf(out, "duration waited:\t");
- bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
- prt_newline(out);
- break;
- }
- case BCH_REBALANCE_working:
- bch2_move_stats_to_text(out, &r->work_stats);
- break;
- case BCH_REBALANCE_scanning:
- bch2_move_stats_to_text(out, &r->scan_stats);
- break;
- }
- prt_newline(out);
-
- struct task_struct *t;
- scoped_guard(rcu) {
- t = rcu_dereference(c->rebalance.thread);
- if (t)
- get_task_struct(t);
- }
-
- if (t) {
- bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
- put_task_struct(t);
- }
-
- printbuf_indent_sub(out, 2);
-}
-
-void bch2_rebalance_stop(struct bch_fs *c)
-{
- struct task_struct *p;
-
- c->rebalance.pd.rate.rate = UINT_MAX;
- bch2_ratelimit_reset(&c->rebalance.pd.rate);
-
- p = rcu_dereference_protected(c->rebalance.thread, 1);
- c->rebalance.thread = NULL;
-
- if (p) {
- /* for synchronizing with bch2_rebalance_wakeup() */
- synchronize_rcu();
-
- kthread_stop(p);
- put_task_struct(p);
- }
-}
-
-int bch2_rebalance_start(struct bch_fs *c)
-{
- struct task_struct *p;
- int ret;
-
- if (c->rebalance.thread)
- return 0;
-
- if (c->opts.nochanges)
- return 0;
-
- p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
- ret = PTR_ERR_OR_ZERO(p);
- bch_err_msg(c, ret, "creating rebalance thread");
- if (ret)
- return ret;
-
- get_task_struct(p);
- rcu_assign_pointer(c->rebalance.thread, p);
- wake_up_process(p);
- return 0;
-}
-
-#ifdef CONFIG_POWER_SUPPLY
-#include <linux/power_supply.h>
-
-static int bch2_rebalance_power_notifier(struct notifier_block *nb,
- unsigned long event, void *data)
-{
- struct bch_fs *c = container_of(nb, struct bch_fs, rebalance.power_notifier);
-
- c->rebalance.on_battery = !power_supply_is_system_supplied();
- bch2_rebalance_wakeup(c);
- return NOTIFY_OK;
-}
-#endif
-
-void bch2_fs_rebalance_exit(struct bch_fs *c)
-{
-#ifdef CONFIG_POWER_SUPPLY
- power_supply_unreg_notifier(&c->rebalance.power_notifier);
-#endif
-}
-
-int bch2_fs_rebalance_init(struct bch_fs *c)
-{
- struct bch_fs_rebalance *r = &c->rebalance;
-
- bch2_pd_controller_init(&r->pd);
-
-#ifdef CONFIG_POWER_SUPPLY
- r->power_notifier.notifier_call = bch2_rebalance_power_notifier;
- int ret = power_supply_reg_notifier(&r->power_notifier);
- if (ret)
- return ret;
-
- r->on_battery = !power_supply_is_system_supplied();
-#endif
- return 0;
-}
-
-static int check_rebalance_work_one(struct btree_trans *trans,
- struct btree_iter *extent_iter,
- struct btree_iter *rebalance_iter,
- struct bkey_buf *last_flushed)
-{
- struct bch_fs *c = trans->c;
- struct bkey_s_c extent_k, rebalance_k;
- struct printbuf buf = PRINTBUF;
-
- int ret = bkey_err(extent_k = bch2_btree_iter_peek(trans, extent_iter)) ?:
- bkey_err(rebalance_k = bch2_btree_iter_peek(trans, rebalance_iter));
- if (ret)
- return ret;
-
- if (!extent_k.k &&
- extent_iter->btree_id == BTREE_ID_reflink &&
- (!rebalance_k.k ||
- rebalance_k.k->p.inode >= BCACHEFS_ROOT_INO)) {
- bch2_trans_iter_exit(trans, extent_iter);
- bch2_trans_iter_init(trans, extent_iter,
- BTREE_ID_extents, POS_MIN,
- BTREE_ITER_prefetch|
- BTREE_ITER_all_snapshots);
- return bch_err_throw(c, transaction_restart_nested);
- }
-
- if (!extent_k.k && !rebalance_k.k)
- return 1;
-
- int cmp = bpos_cmp(extent_k.k ? extent_k.k->p : SPOS_MAX,
- rebalance_k.k ? rebalance_k.k->p : SPOS_MAX);
-
- struct bkey deleted;
- bkey_init(&deleted);
-
- if (cmp < 0) {
- deleted.p = extent_k.k->p;
- rebalance_k.k = &deleted;
- } else if (cmp > 0) {
- deleted.p = rebalance_k.k->p;
- extent_k.k = &deleted;
- }
-
- bool should_have_rebalance =
- bch2_bkey_sectors_need_rebalance(c, extent_k) != 0;
- bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set;
-
- if (should_have_rebalance != have_rebalance) {
- ret = bch2_btree_write_buffer_maybe_flush(trans, extent_k, last_flushed);
- if (ret)
- return ret;
-
- bch2_bkey_val_to_text(&buf, c, extent_k);
- }
-
- if (fsck_err_on(!should_have_rebalance && have_rebalance,
- trans, rebalance_work_incorrectly_set,
- "rebalance work incorrectly set\n%s", buf.buf)) {
- ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
- extent_k.k->p, false);
- if (ret)
- goto err;
- }
-
- if (fsck_err_on(should_have_rebalance && !have_rebalance,
- trans, rebalance_work_incorrectly_unset,
- "rebalance work incorrectly unset\n%s", buf.buf)) {
- ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
- extent_k.k->p, true);
- if (ret)
- goto err;
- }
-
- if (cmp <= 0)
- bch2_btree_iter_advance(trans, extent_iter);
- if (cmp >= 0)
- bch2_btree_iter_advance(trans, rebalance_iter);
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_check_rebalance_work(struct bch_fs *c)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter rebalance_iter, extent_iter;
- int ret = 0;
-
- bch2_trans_iter_init(trans, &extent_iter,
- BTREE_ID_reflink, POS_MIN,
- BTREE_ITER_prefetch);
- bch2_trans_iter_init(trans, &rebalance_iter,
- BTREE_ID_rebalance_work, POS_MIN,
- BTREE_ITER_prefetch);
-
- struct bkey_buf last_flushed;
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- while (!ret) {
- bch2_trans_begin(trans);
-
- ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- ret = 0;
- }
-
- bch2_bkey_buf_exit(&last_flushed, c);
- bch2_trans_iter_exit(trans, &extent_iter);
- bch2_trans_iter_exit(trans, &rebalance_iter);
- bch2_trans_put(trans);
- return ret < 0 ? ret : 0;
-}
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
deleted file mode 100644
index 7a565ea7dbfc..000000000000
--- a/fs/bcachefs/rebalance.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REBALANCE_H
-#define _BCACHEFS_REBALANCE_H
-
-#include "compress.h"
-#include "disk_groups.h"
-#include "opts.h"
-#include "rebalance_types.h"
-
-static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c,
- struct bch_io_opts *opts)
-{
- struct bch_extent_rebalance r = {
- .type = BIT(BCH_EXTENT_ENTRY_rebalance),
-#define x(_name) \
- ._name = opts->_name, \
- ._name##_from_inode = opts->_name##_from_inode,
- BCH_REBALANCE_OPTS()
-#undef x
- };
-
- if (r.background_target &&
- !bch2_target_accepts_data(c, BCH_DATA_user, r.background_target))
- r.background_target = 0;
-
- return r;
-};
-
-u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c);
-int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *);
-int bch2_get_update_rebalance_opts(struct btree_trans *,
- struct bch_io_opts *,
- struct btree_iter *,
- struct bkey_s_c);
-
-int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64);
-int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum);
-int bch2_set_fs_needs_rebalance(struct bch_fs *);
-
-static inline void bch2_rebalance_wakeup(struct bch_fs *c)
-{
- c->rebalance.kick++;
- guard(rcu)();
- struct task_struct *p = rcu_dereference(c->rebalance.thread);
- if (p)
- wake_up_process(p);
-}
-
-void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_rebalance_stop(struct bch_fs *);
-int bch2_rebalance_start(struct bch_fs *);
-
-void bch2_fs_rebalance_exit(struct bch_fs *);
-int bch2_fs_rebalance_init(struct bch_fs *);
-
-int bch2_check_rebalance_work(struct bch_fs *);
-
-#endif /* _BCACHEFS_REBALANCE_H */
diff --git a/fs/bcachefs/rebalance_format.h b/fs/bcachefs/rebalance_format.h
deleted file mode 100644
index ff9a1342a22b..000000000000
--- a/fs/bcachefs/rebalance_format.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REBALANCE_FORMAT_H
-#define _BCACHEFS_REBALANCE_FORMAT_H
-
-struct bch_extent_rebalance {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:6,
- unused:3,
-
- promote_target_from_inode:1,
- erasure_code_from_inode:1,
- data_checksum_from_inode:1,
- background_compression_from_inode:1,
- data_replicas_from_inode:1,
- background_target_from_inode:1,
-
- promote_target:16,
- erasure_code:1,
- data_checksum:4,
- data_replicas:4,
- background_compression:8, /* enum bch_compression_opt */
- background_target:16;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 background_target:16,
- background_compression:8,
- data_replicas:4,
- data_checksum:4,
- erasure_code:1,
- promote_target:16,
-
- background_target_from_inode:1,
- data_replicas_from_inode:1,
- background_compression_from_inode:1,
- data_checksum_from_inode:1,
- erasure_code_from_inode:1,
- promote_target_from_inode:1,
-
- unused:3,
- type:6;
-#endif
-};
-
-/* subset of BCH_INODE_OPTS */
-#define BCH_REBALANCE_OPTS() \
- x(data_checksum) \
- x(background_compression) \
- x(data_replicas) \
- x(promote_target) \
- x(background_target) \
- x(erasure_code)
-
-#endif /* _BCACHEFS_REBALANCE_FORMAT_H */
-
diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h
deleted file mode 100644
index c659da149fa3..000000000000
--- a/fs/bcachefs/rebalance_types.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REBALANCE_TYPES_H
-#define _BCACHEFS_REBALANCE_TYPES_H
-
-#include "bbpos_types.h"
-#include "move_types.h"
-
-#define BCH_REBALANCE_STATES() \
- x(waiting) \
- x(working) \
- x(scanning)
-
-enum bch_rebalance_states {
-#define x(t) BCH_REBALANCE_##t,
- BCH_REBALANCE_STATES()
-#undef x
-};
-
-struct bch_fs_rebalance {
- struct task_struct __rcu *thread;
- u32 kick;
- struct bch_pd_controller pd;
-
- enum bch_rebalance_states state;
- u64 wait_iotime_start;
- u64 wait_iotime_end;
- u64 wait_wallclock_start;
-
- struct bch_move_stats work_stats;
-
- struct bbpos scan_start;
- struct bbpos scan_end;
- struct bch_move_stats scan_stats;
-
- bool on_battery;
-#ifdef CONFIG_POWER_SUPPLY
- struct notifier_block power_notifier;
-#endif
-};
-
-#endif /* _BCACHEFS_REBALANCE_TYPES_H */
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
deleted file mode 100644
index c94debb12d2f..000000000000
--- a/fs/bcachefs/recovery.c
+++ /dev/null
@@ -1,1306 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "bkey_buf.h"
-#include "btree_journal_iter.h"
-#include "btree_node_scan.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_io.h"
-#include "buckets.h"
-#include "dirent.h"
-#include "disk_accounting.h"
-#include "errcode.h"
-#include "error.h"
-#include "journal_io.h"
-#include "journal_reclaim.h"
-#include "journal_seq_blacklist.h"
-#include "logged_ops.h"
-#include "move.h"
-#include "movinggc.h"
-#include "namei.h"
-#include "quota.h"
-#include "rebalance.h"
-#include "recovery.h"
-#include "recovery_passes.h"
-#include "replicas.h"
-#include "sb-clean.h"
-#include "sb-downgrade.h"
-#include "snapshot.h"
-#include "super-io.h"
-
-#include <linux/sort.h>
-#include <linux/stat.h>
-
-int bch2_btree_lost_data(struct bch_fs *c,
- struct printbuf *msg,
- enum btree_id btree)
-{
- u64 b = BIT_ULL(btree);
- int ret = 0;
-
- mutex_lock(&c->sb_lock);
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-
- if (!(c->sb.btrees_lost_data & b)) {
- prt_printf(msg, "flagging btree ");
- bch2_btree_id_to_text(msg, btree);
- prt_printf(msg, " lost data\n");
-
- ext->btrees_lost_data |= cpu_to_le64(b);
- }
-
- /* Once we have runtime self healing for topology errors we won't need this: */
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret;
-
- /* Btree node accounting will be off: */
- __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret;
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- /*
- * These are much more minor, and don't need to be corrected right away,
- * but in debug mode we want the next fsck run to be clean:
- */
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_lrus, 0) ?: ret;
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents, 0) ?: ret;
-#endif
-
- switch (btree) {
- case BTREE_ID_alloc:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
-
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
- goto out;
- case BTREE_ID_backpointers:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_btree_backpointers, 0) ?: ret;
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_extents_to_backpointers, 0) ?: ret;
- goto out;
- case BTREE_ID_need_discard:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
- goto out;
- case BTREE_ID_freespace:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
- goto out;
- case BTREE_ID_bucket_gens:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
- goto out;
- case BTREE_ID_lru:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
- goto out;
- case BTREE_ID_accounting:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret;
- goto out;
- case BTREE_ID_snapshots:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots, 0) ?: ret;
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret;
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret;
- goto out;
- default:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret;
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret;
- goto out;
- }
-out:
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- return ret;
-}
-
-static void kill_btree(struct bch_fs *c, enum btree_id btree)
-{
- bch2_btree_id_root(c, btree)->alive = false;
- bch2_shoot_down_journal_keys(c, btree, 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
-}
-
-/* for -o reconstruct_alloc: */
-void bch2_reconstruct_alloc(struct bch_fs *c)
-{
- mutex_lock(&c->sb_lock);
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-
- __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required);
- __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info, ext->recovery_passes_required);
- __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus, ext->recovery_passes_required);
- __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, ext->recovery_passes_required);
- __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, ext->recovery_passes_required);
-
- __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent);
-
- __set_bit_le64(BCH_FSCK_ERR_dev_usage_buckets_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_dev_usage_sectors_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_dev_usage_fragmented_wrong, ext->errors_silent);
-
- __set_bit_le64(BCH_FSCK_ERR_fs_usage_btree_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_fs_usage_cached_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_fs_usage_replicas_wrong, ext->errors_silent);
-
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_to_missing_lru_entry, ext->errors_silent);
-
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
- c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
-
- c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
-
- c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info));
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- for (unsigned i = 0; i < btree_id_nr_alive(c); i++)
- if (btree_id_is_alloc(i))
- kill_btree(c, i);
-}
-
-/*
- * Btree node pointers have a field to stash a pointer to the in memory btree
- * node; we need to zero out this field when reading in btree nodes, or when
- * reading in keys from the journal:
- */
-static void zero_out_btree_mem_ptr(struct journal_keys *keys)
-{
- darray_for_each(*keys, i)
- if (i->k->k.type == KEY_TYPE_btree_ptr_v2)
- bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0;
-}
-
-/* journal replay: */
-
-static void replay_now_at(struct journal *j, u64 seq)
-{
- BUG_ON(seq < j->replay_journal_seq);
-
- seq = min(seq, j->replay_journal_seq_end);
-
- while (j->replay_journal_seq < seq)
- bch2_journal_pin_put(j, j->replay_journal_seq++);
-}
-
-static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
- struct journal_key *k)
-{
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
- BTREE_MAX_DEPTH, k->level,
- BTREE_ITER_intent);
- int ret = bch2_btree_iter_traverse(trans, &iter);
- if (ret)
- goto out;
-
- struct bkey u;
- struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u);
-
- /* Has this delta already been applied to the btree? */
- if (bversion_cmp(old.k->bversion, k->k->k.bversion) >= 0) {
- ret = 0;
- goto out;
- }
-
- struct bkey_i *new = k->k;
- if (old.k->type == KEY_TYPE_accounting) {
- new = bch2_bkey_make_mut_noupdate(trans, bkey_i_to_s_c(k->k));
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto out;
-
- bch2_accounting_accumulate(bkey_i_to_accounting(new),
- bkey_s_c_to_accounting(old));
- }
-
- trans->journal_res.seq = k->journal_seq;
-
- ret = bch2_trans_update(trans, &iter, new, BTREE_TRIGGER_norun);
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int bch2_journal_replay_key(struct btree_trans *trans,
- struct journal_key *k)
-{
- struct btree_iter iter;
- unsigned iter_flags =
- BTREE_ITER_intent|
- BTREE_ITER_not_extents;
- unsigned update_flags = BTREE_TRIGGER_norun;
- int ret;
-
- if (k->overwritten)
- return 0;
-
- trans->journal_res.seq = k->journal_seq;
-
- /*
- * BTREE_UPDATE_key_cache_reclaim disables key cache lookup/update to
- * keep the key cache coherent with the underlying btree. Nothing
- * besides the allocator is doing updates yet so we don't need key cache
- * coherency for non-alloc btrees, and key cache fills for snapshots
- * btrees use BTREE_ITER_filter_snapshots, which isn't available until
- * the snapshots recovery pass runs.
- */
- if (!k->level && k->btree_id == BTREE_ID_alloc)
- iter_flags |= BTREE_ITER_cached;
- else
- update_flags |= BTREE_UPDATE_key_cache_reclaim;
-
- bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
- BTREE_MAX_DEPTH, k->level,
- iter_flags);
- ret = bch2_btree_iter_traverse(trans, &iter);
- if (ret)
- goto out;
-
- struct btree_path *path = btree_iter_path(trans, &iter);
- if (unlikely(!btree_path_node(path, k->level))) {
- struct bch_fs *c = trans->c;
-
- CLASS(printbuf, buf)();
- prt_str(&buf, "btree=");
- bch2_btree_id_to_text(&buf, k->btree_id);
- prt_printf(&buf, " level=%u ", k->level);
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k->k));
-
- if (!(c->recovery.passes_complete & (BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes)|
- BIT_ULL(BCH_RECOVERY_PASS_check_topology)))) {
- bch_err(c, "have key in journal replay for btree depth that does not exist, confused\n%s",
- buf.buf);
- ret = -EINVAL;
- }
-
- if (!k->allocated) {
- bch_notice(c, "dropping key in journal replay for depth that does not exist because we're recovering from scan\n%s",
- buf.buf);
- k->overwritten = true;
- goto out;
- }
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
- BTREE_MAX_DEPTH, 0, iter_flags);
- ret = bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_btree_increase_depth(trans, iter.path, 0) ?:
- -BCH_ERR_transaction_restart_nested;
- goto out;
- }
-
- /* Must be checked with btree locked: */
- if (k->overwritten)
- goto out;
-
- if (k->k->k.type == KEY_TYPE_accounting) {
- struct bkey_i *n = bch2_trans_subbuf_alloc(trans, &trans->accounting, k->k->k.u64s);
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- goto out;
-
- bkey_copy(n, k->k);
- goto out;
- }
-
- ret = bch2_trans_update(trans, &iter, k->k, update_flags);
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int journal_sort_seq_cmp(const void *_l, const void *_r)
-{
- const struct journal_key *l = *((const struct journal_key **)_l);
- const struct journal_key *r = *((const struct journal_key **)_r);
-
- /*
- * Map 0 to U64_MAX, so that keys with journal_seq == 0 come last
- *
- * journal_seq == 0 means that the key comes from early repair, and
- * should be inserted last so as to avoid overflowing the journal
- */
- return cmp_int(l->journal_seq - 1, r->journal_seq - 1);
-}
-
-int bch2_journal_replay(struct bch_fs *c)
-{
- struct journal_keys *keys = &c->journal_keys;
- DARRAY(struct journal_key *) keys_sorted = { 0 };
- struct journal *j = &c->journal;
- u64 start_seq = c->journal_replay_seq_start;
- u64 end_seq = c->journal_replay_seq_end;
- struct btree_trans *trans = NULL;
- bool immediate_flush = false;
- int ret = 0;
-
- if (keys->nr) {
- ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)",
- keys->nr, start_seq, end_seq);
- if (ret)
- goto err;
- }
-
- BUG_ON(!atomic_read(&keys->ref));
-
- move_gap(keys, keys->nr);
- trans = bch2_trans_get(c);
-
- /*
- * Replay accounting keys first: we can't allow the write buffer to
- * flush accounting keys until we're done
- */
- darray_for_each(*keys, k) {
- if (!(k->k->k.type == KEY_TYPE_accounting && !k->allocated))
- continue;
-
- cond_resched();
-
- ret = commit_do(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_journal_reclaim|
- BCH_TRANS_COMMIT_skip_accounting_apply|
- BCH_TRANS_COMMIT_no_journal_res|
- BCH_WATERMARK_reclaim,
- bch2_journal_replay_accounting_key(trans, k));
- if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret)))
- goto err;
-
- k->overwritten = true;
- }
-
- set_bit(BCH_FS_accounting_replay_done, &c->flags);
-
- /*
- * First, attempt to replay keys in sorted order. This is more
- * efficient - better locality of btree access - but some might fail if
- * that would cause a journal deadlock.
- */
- darray_for_each(*keys, k) {
- cond_resched();
-
- /*
- * k->allocated means the key wasn't read in from the journal,
- * rather it was from early repair code
- */
- if (k->allocated)
- immediate_flush = true;
-
- /* Skip fastpath if we're low on space in the journal */
- ret = c->journal.watermark ? -1 :
- commit_do(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_journal_reclaim|
- BCH_TRANS_COMMIT_skip_accounting_apply|
- (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
- bch2_journal_replay_key(trans, k));
- BUG_ON(!ret && !k->overwritten && k->k->k.type != KEY_TYPE_accounting);
- if (ret) {
- ret = darray_push(&keys_sorted, k);
- if (ret)
- goto err;
- }
- }
-
- bch2_trans_unlock_long(trans);
- /*
- * Now, replay any remaining keys in the order in which they appear in
- * the journal, unpinning those journal entries as we go:
- */
- sort_nonatomic(keys_sorted.data, keys_sorted.nr,
- sizeof(keys_sorted.data[0]),
- journal_sort_seq_cmp, NULL);
-
- darray_for_each(keys_sorted, kp) {
- cond_resched();
-
- struct journal_key *k = *kp;
-
- if (k->journal_seq)
- replay_now_at(j, k->journal_seq);
- else
- replay_now_at(j, j->replay_journal_seq_end);
-
- ret = commit_do(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_skip_accounting_apply|
- (!k->allocated
- ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim
- : 0),
- bch2_journal_replay_key(trans, k));
- if (ret) {
- struct printbuf buf = PRINTBUF;
- bch2_btree_id_level_to_text(&buf, k->btree_id, k->level);
- bch_err_msg(c, ret, "while replaying key at %s:", buf.buf);
- printbuf_exit(&buf);
- goto err;
- }
-
- BUG_ON(k->btree_id != BTREE_ID_accounting && !k->overwritten);
- }
-
- /*
- * We need to put our btree_trans before calling flush_all_pins(), since
- * that will use a btree_trans internally
- */
- bch2_trans_put(trans);
- trans = NULL;
-
- if (!c->opts.retain_recovery_info &&
- c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay)
- bch2_journal_keys_put_initial(c);
-
- replay_now_at(j, j->replay_journal_seq_end);
- j->replay_journal_seq = 0;
-
- bch2_journal_set_replay_done(j);
-
- /* if we did any repair, flush it immediately */
- if (immediate_flush) {
- bch2_journal_flush_all_pins(&c->journal);
- ret = bch2_journal_meta(&c->journal);
- }
-
- if (keys->nr)
- bch2_journal_log_msg(c, "journal replay finished");
-err:
- if (trans)
- bch2_trans_put(trans);
- darray_exit(&keys_sorted);
- bch_err_fn(c, ret);
- return ret;
-}
-
-/* journal replay early: */
-
-static int journal_replay_entry_early(struct bch_fs *c,
- struct jset_entry *entry)
-{
- int ret = 0;
-
- switch (entry->type) {
- case BCH_JSET_ENTRY_btree_root: {
-
- if (unlikely(!entry->u64s))
- return 0;
-
- if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX,
- c, invalid_btree_id,
- "invalid btree id %u (max %u)",
- entry->btree_id, BTREE_ID_NR_MAX))
- return 0;
-
- while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) {
- ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL });
- if (ret)
- return ret;
- }
-
- struct btree_root *r = bch2_btree_id_root(c, entry->btree_id);
-
- r->level = entry->level;
- bkey_copy(&r->key, (struct bkey_i *) entry->start);
- r->error = 0;
- r->alive = true;
- break;
- }
- case BCH_JSET_ENTRY_usage: {
- struct jset_entry_usage *u =
- container_of(entry, struct jset_entry_usage, entry);
-
- switch (entry->btree_id) {
- case BCH_FS_USAGE_key_version:
- atomic64_set(&c->key_version, le64_to_cpu(u->v));
- break;
- }
- break;
- }
- case BCH_JSET_ENTRY_blacklist: {
- struct jset_entry_blacklist *bl_entry =
- container_of(entry, struct jset_entry_blacklist, entry);
-
- ret = bch2_journal_seq_blacklist_add(c,
- le64_to_cpu(bl_entry->seq),
- le64_to_cpu(bl_entry->seq) + 1);
- break;
- }
- case BCH_JSET_ENTRY_blacklist_v2: {
- struct jset_entry_blacklist_v2 *bl_entry =
- container_of(entry, struct jset_entry_blacklist_v2, entry);
-
- ret = bch2_journal_seq_blacklist_add(c,
- le64_to_cpu(bl_entry->start),
- le64_to_cpu(bl_entry->end) + 1);
- break;
- }
- case BCH_JSET_ENTRY_clock: {
- struct jset_entry_clock *clock =
- container_of(entry, struct jset_entry_clock, entry);
-
- atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time));
- }
- }
-fsck_err:
- return ret;
-}
-
-static int journal_replay_early(struct bch_fs *c,
- struct bch_sb_field_clean *clean)
-{
- if (clean) {
- for (struct jset_entry *entry = clean->start;
- entry != vstruct_end(&clean->field);
- entry = vstruct_next(entry)) {
- int ret = journal_replay_entry_early(c, entry);
- if (ret)
- return ret;
- }
- } else {
- struct genradix_iter iter;
- struct journal_replay *i, **_i;
-
- genradix_for_each(&c->journal_entries, iter, _i) {
- i = *_i;
-
- if (journal_replay_ignore(i))
- continue;
-
- vstruct_for_each(&i->j, entry) {
- int ret = journal_replay_entry_early(c, entry);
- if (ret)
- return ret;
- }
- }
- }
-
- return 0;
-}
-
-/* sb clean section: */
-
-static int read_btree_roots(struct bch_fs *c)
-{
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
- struct btree_root *r = bch2_btree_id_root(c, i);
-
- if (!r->alive)
- continue;
-
- printbuf_reset(&buf);
- bch2_btree_id_level_to_text(&buf, i, r->level);
-
- if (mustfix_fsck_err_on((ret = r->error),
- c, btree_root_bkey_invalid,
- "invalid btree root %s",
- buf.buf) ||
- mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)),
- c, btree_root_read_error,
- "error reading btree root %s: %s",
- buf.buf, bch2_err_str(ret))) {
- if (btree_id_is_alloc(i))
- r->error = 0;
- ret = 0;
- }
- }
-
- for (unsigned i = 0; i < BTREE_ID_NR; i++) {
- struct btree_root *r = bch2_btree_id_root(c, i);
-
- if (!r->b && !r->error) {
- r->alive = false;
- r->level = 0;
- bch2_btree_root_alloc_fake(c, i, 0);
- }
- }
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static bool check_version_upgrade(struct bch_fs *c)
-{
- unsigned latest_version = bcachefs_metadata_version_current;
- unsigned latest_compatible = min(latest_version,
- bch2_latest_compatible_version(c->sb.version));
- unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
- unsigned new_version = 0;
- bool ret = false;
-
- if (old_version < bcachefs_metadata_required_upgrade_below) {
- if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible ||
- latest_compatible < bcachefs_metadata_required_upgrade_below)
- new_version = latest_version;
- else
- new_version = latest_compatible;
- } else {
- switch (c->opts.version_upgrade) {
- case BCH_VERSION_UPGRADE_compatible:
- new_version = latest_compatible;
- break;
- case BCH_VERSION_UPGRADE_incompatible:
- new_version = latest_version;
- break;
- case BCH_VERSION_UPGRADE_none:
- new_version = min(old_version, latest_version);
- break;
- }
- }
-
- if (new_version > old_version) {
- struct printbuf buf = PRINTBUF;
-
- if (old_version < bcachefs_metadata_required_upgrade_below)
- prt_str(&buf, "Version upgrade required:\n");
-
- if (old_version != c->sb.version) {
- prt_str(&buf, "Version upgrade from ");
- bch2_version_to_text(&buf, c->sb.version_upgrade_complete);
- prt_str(&buf, " to ");
- bch2_version_to_text(&buf, c->sb.version);
- prt_str(&buf, " incomplete\n");
- }
-
- prt_printf(&buf, "Doing %s version upgrade from ",
- BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version)
- ? "incompatible" : "compatible");
- bch2_version_to_text(&buf, old_version);
- prt_str(&buf, " to ");
- bch2_version_to_text(&buf, new_version);
- prt_newline(&buf);
-
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- __le64 passes = ext->recovery_passes_required[0];
- bch2_sb_set_upgrade(c, old_version, new_version);
- passes = ext->recovery_passes_required[0] & ~passes;
-
- if (passes) {
- prt_str(&buf, " running recovery passes: ");
- prt_bitflags(&buf, bch2_recovery_passes,
- bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
- }
-
- bch_notice(c, "%s", buf.buf);
- printbuf_exit(&buf);
-
- ret = true;
- }
-
- if (new_version > c->sb.version_incompat_allowed &&
- c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "Now allowing incompatible features up to ");
- bch2_version_to_text(&buf, new_version);
- prt_str(&buf, ", previously allowed up to ");
- bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
- prt_newline(&buf);
-
- bch_notice(c, "%s", buf.buf);
- printbuf_exit(&buf);
-
- ret = true;
- }
-
- if (ret)
- bch2_sb_upgrade(c, new_version,
- c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible);
-
- return ret;
-}
-
-int bch2_fs_recovery(struct bch_fs *c)
-{
- struct bch_sb_field_clean *clean = NULL;
- struct jset *last_journal_entry = NULL;
- u64 last_seq = 0, blacklist_seq, journal_seq;
- int ret = 0;
-
- if (c->sb.clean) {
- clean = bch2_read_superblock_clean(c);
- ret = PTR_ERR_OR_ZERO(clean);
- if (ret)
- goto err;
-
- bch_info(c, "recovering from clean shutdown, journal seq %llu",
- le64_to_cpu(clean->journal_seq));
- } else {
- bch_info(c, "recovering from unclean shutdown");
- }
-
- if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
- bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
- ret = -EINVAL;
- goto err;
- }
-
- if (!c->sb.clean &&
- !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
- bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
- ret = -EINVAL;
- goto err;
- }
-
- if (c->opts.norecovery) {
- c->opts.recovery_pass_last = c->opts.recovery_pass_last
- ? min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read)
- : BCH_RECOVERY_PASS_snapshots_read;
- c->opts.nochanges = true;
- }
-
- if (c->opts.nochanges)
- c->opts.read_only = true;
-
- if (c->opts.journal_rewind) {
- bch_info(c, "rewinding journal, fsck required");
- c->opts.fsck = true;
- }
-
- if (go_rw_in_recovery(c)) {
- /*
- * start workqueues/kworkers early - kthread creation checks for
- * pending signals, which is _very_ annoying
- */
- ret = bch2_fs_init_rw(c);
- if (ret)
- goto err;
- }
-
- mutex_lock(&c->sb_lock);
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- bool write_sb = false;
-
- if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) {
- ext->recovery_passes_required[0] |=
- cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology)));
- write_sb = true;
- }
-
- u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
- if (sb_passes) {
- struct printbuf buf = PRINTBUF;
- prt_str(&buf, "superblock requires following recovery passes to be run:\n ");
- prt_bitflags(&buf, bch2_recovery_passes, sb_passes);
- bch_info(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-
- if (bch2_check_version_downgrade(c)) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "Version downgrade required:");
-
- __le64 passes = ext->recovery_passes_required[0];
- bch2_sb_set_downgrade(c,
- BCH_VERSION_MINOR(bcachefs_metadata_version_current),
- BCH_VERSION_MINOR(c->sb.version));
- passes = ext->recovery_passes_required[0] & ~passes;
- if (passes) {
- prt_str(&buf, "\n running recovery passes: ");
- prt_bitflags(&buf, bch2_recovery_passes,
- bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
- }
-
- bch_info(c, "%s", buf.buf);
- printbuf_exit(&buf);
- write_sb = true;
- }
-
- if (check_version_upgrade(c))
- write_sb = true;
-
- c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
-
- if (c->sb.version_upgrade_complete < bcachefs_metadata_version_autofix_errors) {
- SET_BCH_SB_ERROR_ACTION(c->disk_sb.sb, BCH_ON_ERROR_fix_safe);
- write_sb = true;
- }
-
- if (write_sb)
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- if (c->sb.clean)
- set_bit(BCH_FS_clean_recovery, &c->flags);
- if (c->opts.fsck)
- set_bit(BCH_FS_in_fsck, &c->flags);
- set_bit(BCH_FS_in_recovery, &c->flags);
-
- ret = bch2_blacklist_table_initialize(c);
- if (ret) {
- bch_err(c, "error initializing blacklist table");
- goto err;
- }
-
- bch2_journal_pos_from_member_info_resume(c);
-
- if (!c->sb.clean || c->opts.retain_recovery_info) {
- struct genradix_iter iter;
- struct journal_replay **i;
-
- bch_verbose(c, "starting journal read");
- ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &journal_seq);
- if (ret)
- goto err;
-
- /*
- * note: cmd_list_journal needs the blacklist table fully up to date so
- * it can asterisk ignored journal entries:
- */
- if (c->opts.read_journal_only)
- goto out;
-
- genradix_for_each_reverse(&c->journal_entries, iter, i)
- if (!journal_replay_ignore(*i)) {
- last_journal_entry = &(*i)->j;
- break;
- }
-
- if (mustfix_fsck_err_on(c->sb.clean &&
- last_journal_entry &&
- !journal_entry_empty(last_journal_entry), c,
- clean_but_journal_not_empty,
- "filesystem marked clean but journal not empty")) {
- c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
- SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
- c->sb.clean = false;
- }
-
- if (!last_journal_entry) {
- fsck_err_on(!c->sb.clean, c,
- dirty_but_no_journal_entries,
- "no journal entries found");
- if (clean)
- goto use_clean;
-
- genradix_for_each_reverse(&c->journal_entries, iter, i)
- if (*i) {
- last_journal_entry = &(*i)->j;
- (*i)->ignore_blacklisted = false;
- (*i)->ignore_not_dirty = false;
- /*
- * This was probably a NO_FLUSH entry,
- * so last_seq was garbage - but we know
- * we're only using a single journal
- * entry, set it here:
- */
- (*i)->j.last_seq = (*i)->j.seq;
- break;
- }
- }
-
- ret = bch2_journal_keys_sort(c);
- if (ret)
- goto err;
-
- if (c->sb.clean && last_journal_entry) {
- ret = bch2_verify_superblock_clean(c, &clean,
- last_journal_entry);
- if (ret)
- goto err;
- }
- } else {
-use_clean:
- if (!clean) {
- bch_err(c, "no superblock clean section found");
- ret = bch_err_throw(c, fsck_repair_impossible);
- goto err;
-
- }
- blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
- }
-
- c->journal_replay_seq_start = last_seq;
- c->journal_replay_seq_end = blacklist_seq - 1;
-
- zero_out_btree_mem_ptr(&c->journal_keys);
-
- ret = journal_replay_early(c, clean);
- if (ret)
- goto err;
-
- ret = bch2_fs_resize_on_mount(c);
- if (ret) {
- up_write(&c->state_lock);
- goto err;
- }
-
- if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
- bch_info(c, "filesystem is an unresized image file, mounting ro");
- c->opts.read_only = true;
- }
-
- if (!c->opts.read_only &&
- (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) {
- bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate");
-
- bch2_reconstruct_alloc(c);
- } else if (c->opts.reconstruct_alloc) {
- bch2_journal_log_msg(c, "dropping alloc info");
- bch_info(c, "dropping and reconstructing all alloc info");
-
- bch2_reconstruct_alloc(c);
- }
-
- if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) {
- /* We can't go RW to fix errors without alloc info */
- if (c->opts.fix_errors == FSCK_FIX_yes ||
- c->opts.fix_errors == FSCK_FIX_ask)
- c->opts.fix_errors = FSCK_FIX_no;
- if (c->opts.errors == BCH_ON_ERROR_fix_safe)
- c->opts.errors = BCH_ON_ERROR_continue;
- }
-
- /*
- * After an unclean shutdown, skip the next few journal sequence
- * numbers as they may have been referenced by btree writes that
- * happened before their corresponding journal writes - those btree
- * writes need to be ignored, by skipping and blacklisting the next few
- * journal sequence numbers:
- */
- if (!c->sb.clean)
- journal_seq += JOURNAL_BUF_NR * 4;
-
- if (blacklist_seq != journal_seq) {
- ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu",
- blacklist_seq, journal_seq) ?:
- bch2_journal_seq_blacklist_add(c,
- blacklist_seq, journal_seq);
- if (ret) {
- bch_err_msg(c, ret, "error creating new journal seq blacklist entry");
- goto err;
- }
- }
-
- ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu",
- journal_seq, last_seq, blacklist_seq - 1) ?:
- bch2_fs_journal_start(&c->journal, last_seq, journal_seq);
- if (ret)
- goto err;
-
- /*
- * Skip past versions that might have possibly been used (as nonces),
- * but hadn't had their pointers written:
- */
- if (c->sb.encryption_type && !c->sb.clean)
- atomic64_add(1 << 16, &c->key_version);
-
- ret = read_btree_roots(c);
- if (ret)
- goto err;
-
- set_bit(BCH_FS_btree_running, &c->flags);
-
- ret = bch2_sb_set_upgrade_extra(c);
- if (ret)
- goto err;
-
- ret = bch2_run_recovery_passes(c, 0);
- if (ret)
- goto err;
-
- /*
- * Normally set by the appropriate recovery pass: when cleared, this
- * indicates we're in early recovery and btree updates should be done by
- * being applied to the journal replay keys. _Must_ be cleared before
- * multithreaded use:
- */
- set_bit(BCH_FS_may_go_rw, &c->flags);
- clear_bit(BCH_FS_in_fsck, &c->flags);
-
- /* in case we don't run journal replay, i.e. norecovery mode */
- set_bit(BCH_FS_accounting_replay_done, &c->flags);
-
- bch2_async_btree_node_rewrites_flush(c);
-
- /* fsync if we fixed errors */
- if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
- bch2_journal_flush_all_pins(&c->journal);
- bch2_journal_meta(&c->journal);
- }
-
- /* If we fixed errors, verify that fs is actually clean now: */
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
- test_bit(BCH_FS_errors_fixed, &c->flags) &&
- !test_bit(BCH_FS_errors_not_fixed, &c->flags) &&
- !test_bit(BCH_FS_error, &c->flags)) {
- bch2_flush_fsck_errs(c);
-
- bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean");
- clear_bit(BCH_FS_errors_fixed, &c->flags);
-
- ret = bch2_run_recovery_passes(c,
- BCH_RECOVERY_PASS_check_alloc_info);
- if (ret)
- goto err;
-
- if (test_bit(BCH_FS_errors_fixed, &c->flags) ||
- test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
- bch_err(c, "Second fsck run was not clean");
- set_bit(BCH_FS_errors_not_fixed, &c->flags);
- }
-
- set_bit(BCH_FS_errors_fixed, &c->flags);
- }
-
- if (enabled_qtypes(c)) {
- bch_verbose(c, "reading quotas");
- ret = bch2_fs_quota_read(c);
- if (ret)
- goto err;
- bch_verbose(c, "quotas done");
- }
-
- mutex_lock(&c->sb_lock);
- ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- write_sb = false;
-
- if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) {
- SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, le16_to_cpu(c->disk_sb.sb->version));
- write_sb = true;
- }
-
- if (!test_bit(BCH_FS_error, &c->flags) &&
- !(c->disk_sb.sb->compat[0] & cpu_to_le64(1ULL << BCH_COMPAT_alloc_info))) {
- c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
- write_sb = true;
- }
-
- if (!test_bit(BCH_FS_error, &c->flags) &&
- !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent))) {
- memset(ext->errors_silent, 0, sizeof(ext->errors_silent));
- write_sb = true;
- }
-
- if (c->opts.fsck &&
- !test_bit(BCH_FS_error, &c->flags) &&
- c->recovery.pass_done == BCH_RECOVERY_PASS_NR - 1 &&
- ext->btrees_lost_data) {
- ext->btrees_lost_data = 0;
- write_sb = true;
- }
-
- if (c->opts.fsck &&
- !test_bit(BCH_FS_error, &c->flags) &&
- !test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
- SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
- SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0);
- write_sb = true;
- }
-
- if (bch2_blacklist_entries_gc(c))
- write_sb = true;
-
- if (write_sb)
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
- c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) {
- struct bch_move_stats stats;
-
- bch2_move_stats_init(&stats, "recovery");
-
- struct printbuf buf = PRINTBUF;
- bch2_version_to_text(&buf, c->sb.version_min);
- bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf);
- printbuf_exit(&buf);
-
- ret = bch2_fs_read_write_early(c) ?:
- bch2_scan_old_btree_nodes(c, &stats);
- if (ret)
- goto err;
- bch_info(c, "scanning for old btree nodes done");
- }
-
- ret = 0;
-out:
- bch2_flush_fsck_errs(c);
-
- if (!ret &&
- test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) &&
- !c->opts.nochanges) {
- bch2_fs_read_write_early(c);
- bch2_delete_dead_snapshots_async(c);
- }
-
- bch_err_fn(c, ret);
-final_out:
- if (!IS_ERR(clean))
- kfree(clean);
- return ret;
-err:
-fsck_err:
- {
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
-
- prt_printf(&buf, "error in recovery: %s\n", bch2_err_str(ret));
- bch2_fs_emergency_read_only2(c, &buf);
-
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- }
- goto final_out;
-}
-
-int bch2_fs_initialize(struct bch_fs *c)
-{
- struct bch_inode_unpacked root_inode, lostfound_inode;
- struct bkey_inode_buf packed_inode;
- struct qstr lostfound = QSTR("lost+found");
- struct bch_member *m;
- int ret;
-
- bch_notice(c, "initializing new filesystem");
- set_bit(BCH_FS_new_fs, &c->flags);
-
- mutex_lock(&c->sb_lock);
- c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
- c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
-
- bch2_check_version_downgrade(c);
-
- if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) {
- bch2_sb_upgrade(c, bcachefs_metadata_version_current, false);
- SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
- bch2_write_super(c);
- }
-
- for_each_member_device(c, ca) {
- m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
- SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, false);
- ca->mi = bch2_mi_to_cpu(m);
- }
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- set_bit(BCH_FS_btree_running, &c->flags);
- set_bit(BCH_FS_may_go_rw, &c->flags);
-
- for (unsigned i = 0; i < BTREE_ID_NR; i++)
- bch2_btree_root_alloc_fake(c, i, 0);
-
- ret = bch2_fs_journal_alloc(c);
- if (ret)
- goto err;
-
- /*
- * journal_res_get() will crash if called before this has
- * set up the journal.pin FIFO and journal.cur pointer:
- */
- ret = bch2_fs_journal_start(&c->journal, 1, 1);
- if (ret)
- goto err;
-
- ret = bch2_fs_read_write_early(c);
- if (ret)
- goto err;
-
- set_bit(BCH_FS_accounting_replay_done, &c->flags);
- bch2_journal_set_replay_done(&c->journal);
-
- for_each_member_device(c, ca) {
- ret = bch2_dev_usage_init(ca, false);
- if (ret) {
- bch2_dev_put(ca);
- goto err;
- }
- }
-
- /*
- * Write out the superblock and journal buckets, now that we can do
- * btree updates
- */
- bch_verbose(c, "marking superblocks");
- ret = bch2_trans_mark_dev_sbs(c);
- bch_err_msg(c, ret, "marking superblocks");
- if (ret)
- goto err;
-
- ret = bch2_fs_freespace_init(c);
- if (ret)
- goto err;
-
- ret = bch2_initialize_subvolumes(c);
- if (ret)
- goto err;
-
- bch_verbose(c, "reading snapshots table");
- ret = bch2_snapshots_read(c);
- if (ret)
- goto err;
- bch_verbose(c, "reading snapshots done");
-
- bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL);
- root_inode.bi_inum = BCACHEFS_ROOT_INO;
- root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
- bch2_inode_pack(&packed_inode, &root_inode);
- packed_inode.inode.k.p.snapshot = U32_MAX;
-
- ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0, 0);
- bch_err_msg(c, ret, "creating root directory");
- if (ret)
- goto err;
-
- bch2_inode_init_early(c, &lostfound_inode);
-
- ret = bch2_trans_commit_do(c, NULL, NULL, 0,
- bch2_create_trans(trans,
- BCACHEFS_ROOT_SUBVOL_INUM,
- &root_inode, &lostfound_inode,
- &lostfound,
- 0, 0, S_IFDIR|0700, 0,
- NULL, NULL, (subvol_inum) { 0 }, 0));
- bch_err_msg(c, ret, "creating lost+found");
- if (ret)
- goto err;
-
- c->recovery.pass_done = BCH_RECOVERY_PASS_NR - 1;
-
- bch2_copygc_wakeup(c);
- bch2_rebalance_wakeup(c);
-
- if (enabled_qtypes(c)) {
- ret = bch2_fs_quota_read(c);
- if (ret)
- goto err;
- }
-
- ret = bch2_journal_flush(&c->journal);
- bch_err_msg(c, ret, "writing first journal entry");
- if (ret)
- goto err;
-
- mutex_lock(&c->sb_lock);
- SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
- SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- c->recovery.curr_pass = BCH_RECOVERY_PASS_NR;
- return 0;
-err:
- bch_err_fn(c, ret);
- return ret;
-}
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
deleted file mode 100644
index c023f52fc2d6..000000000000
--- a/fs/bcachefs/recovery.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_RECOVERY_H
-#define _BCACHEFS_RECOVERY_H
-
-int bch2_btree_lost_data(struct bch_fs *, struct printbuf *, enum btree_id);
-void bch2_reconstruct_alloc(struct bch_fs *);
-
-int bch2_journal_replay(struct bch_fs *);
-
-int bch2_fs_recovery(struct bch_fs *);
-int bch2_fs_initialize(struct bch_fs *);
-
-#endif /* _BCACHEFS_RECOVERY_H */
diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
deleted file mode 100644
index 6a039e011064..000000000000
--- a/fs/bcachefs/recovery_passes.c
+++ /dev/null
@@ -1,646 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "backpointers.h"
-#include "btree_gc.h"
-#include "btree_node_scan.h"
-#include "disk_accounting.h"
-#include "ec.h"
-#include "fsck.h"
-#include "inode.h"
-#include "journal.h"
-#include "lru.h"
-#include "logged_ops.h"
-#include "movinggc.h"
-#include "rebalance.h"
-#include "recovery.h"
-#include "recovery_passes.h"
-#include "snapshot.h"
-#include "subvolume.h"
-#include "super.h"
-#include "super-io.h"
-
-const char * const bch2_recovery_passes[] = {
-#define x(_fn, ...) #_fn,
- BCH_RECOVERY_PASSES()
-#undef x
- NULL
-};
-
-static const u8 passes_to_stable_map[] = {
-#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
- BCH_RECOVERY_PASSES()
-#undef x
-};
-
-static const u8 passes_from_stable_map[] = {
-#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
- BCH_RECOVERY_PASSES()
-#undef x
-};
-
-static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass)
-{
- return passes_to_stable_map[pass];
-}
-
-u64 bch2_recovery_passes_to_stable(u64 v)
-{
- u64 ret = 0;
- for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++)
- if (v & BIT_ULL(i))
- ret |= BIT_ULL(passes_to_stable_map[i]);
- return ret;
-}
-
-static enum bch_recovery_pass bch2_recovery_pass_from_stable(enum bch_recovery_pass_stable pass)
-{
- return pass < ARRAY_SIZE(passes_from_stable_map)
- ? passes_from_stable_map[pass]
- : 0;
-}
-
-u64 bch2_recovery_passes_from_stable(u64 v)
-{
- u64 ret = 0;
- for (unsigned i = 0; i < ARRAY_SIZE(passes_from_stable_map); i++)
- if (v & BIT_ULL(i))
- ret |= BIT_ULL(passes_from_stable_map[i]);
- return ret;
-}
-
-static int bch2_sb_recovery_passes_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- return 0;
-}
-
-static void bch2_sb_recovery_passes_to_text(struct printbuf *out,
- struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_recovery_passes *r =
- field_to_type(f, recovery_passes);
- unsigned nr = recovery_passes_nr_entries(r);
-
- if (out->nr_tabstops < 1)
- printbuf_tabstop_push(out, 32);
- if (out->nr_tabstops < 2)
- printbuf_tabstop_push(out, 16);
-
- prt_printf(out, "Pass\tLast run\tLast runtime\n");
-
- for (struct recovery_pass_entry *i = r->start; i < r->start + nr; i++) {
- if (!i->last_run)
- continue;
-
- unsigned idx = i - r->start;
-
- prt_printf(out, "%s\t", bch2_recovery_passes[bch2_recovery_pass_from_stable(idx)]);
-
- bch2_prt_datetime(out, le64_to_cpu(i->last_run));
- prt_tab(out);
-
- bch2_pr_time_units(out, le32_to_cpu(i->last_runtime) * NSEC_PER_SEC);
-
- if (BCH_RECOVERY_PASS_NO_RATELIMIT(i))
- prt_str(out, " (no ratelimit)");
-
- prt_newline(out);
- }
-}
-
-static struct recovery_pass_entry *bch2_sb_recovery_pass_entry(struct bch_fs *c,
- enum bch_recovery_pass pass)
-{
- enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass);
-
- lockdep_assert_held(&c->sb_lock);
-
- struct bch_sb_field_recovery_passes *r =
- bch2_sb_field_get(c->disk_sb.sb, recovery_passes);
-
- if (stable >= recovery_passes_nr_entries(r)) {
- unsigned u64s = struct_size(r, start, stable + 1) / sizeof(u64);
-
- r = bch2_sb_field_resize(&c->disk_sb, recovery_passes, u64s);
- if (!r) {
- bch_err(c, "error creating recovery_passes sb section");
- return NULL;
- }
- }
-
- return r->start + stable;
-}
-
-static void bch2_sb_recovery_pass_complete(struct bch_fs *c,
- enum bch_recovery_pass pass,
- s64 start_time)
-{
- guard(mutex)(&c->sb_lock);
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- __clear_bit_le64(bch2_recovery_pass_to_stable(pass),
- ext->recovery_passes_required);
-
- struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass);
- if (e) {
- s64 end_time = ktime_get_real_seconds();
- e->last_run = cpu_to_le64(end_time);
- e->last_runtime = cpu_to_le32(max(0, end_time - start_time));
- SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, false);
- }
-
- bch2_write_super(c);
-}
-
-void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *c,
- enum bch_recovery_pass pass)
-{
- guard(mutex)(&c->sb_lock);
-
- struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass);
- if (e && !BCH_RECOVERY_PASS_NO_RATELIMIT(e)) {
- SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, true);
- bch2_write_super(c);
- }
-}
-
-static bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recovery_pass pass)
-{
- enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass);
- bool ret = false;
-
- lockdep_assert_held(&c->sb_lock);
-
- struct bch_sb_field_recovery_passes *r =
- bch2_sb_field_get(c->disk_sb.sb, recovery_passes);
-
- if (stable < recovery_passes_nr_entries(r)) {
- struct recovery_pass_entry *i = r->start + stable;
-
- /*
- * Ratelimit if the last runtime was more than 1% of the time
- * since we last ran
- */
- ret = (u64) le32_to_cpu(i->last_runtime) * 100 >
- ktime_get_real_seconds() - le64_to_cpu(i->last_run);
-
- if (BCH_RECOVERY_PASS_NO_RATELIMIT(i))
- ret = false;
- }
-
- return ret;
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes = {
- .validate = bch2_sb_recovery_passes_validate,
- .to_text = bch2_sb_recovery_passes_to_text
-};
-
-/* Fake recovery pass, so that scan_for_btree_nodes isn't 0: */
-static int bch2_recovery_pass_empty(struct bch_fs *c)
-{
- return 0;
-}
-
-static int bch2_set_may_go_rw(struct bch_fs *c)
-{
- struct journal_keys *keys = &c->journal_keys;
-
- /*
- * After we go RW, the journal keys buffer can't be modified (except for
- * setting journal_key->overwritten): it will be accessed by multiple
- * threads
- */
- move_gap(keys, keys->nr);
-
- set_bit(BCH_FS_may_go_rw, &c->flags);
-
- if (go_rw_in_recovery(c)) {
- if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) {
- bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate");
- bch2_reconstruct_alloc(c);
- }
-
- return bch2_fs_read_write_early(c);
- }
- return 0;
-}
-
-/*
- * Make sure root inode is readable while we're still in recovery and can rewind
- * for repair:
- */
-static int bch2_lookup_root_inode(struct bch_fs *c)
-{
- subvol_inum inum = BCACHEFS_ROOT_SUBVOL_INUM;
- struct bch_inode_unpacked inode_u;
- struct bch_subvolume subvol;
-
- return bch2_trans_do(c,
- bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
- bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
-}
-
-struct recovery_pass_fn {
- int (*fn)(struct bch_fs *);
- unsigned when;
-};
-
-static struct recovery_pass_fn recovery_pass_fns[] = {
-#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when },
- BCH_RECOVERY_PASSES()
-#undef x
-};
-
-static u64 bch2_recovery_passes_match(unsigned flags)
-{
- u64 ret = 0;
-
- for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
- if (recovery_pass_fns[i].when & flags)
- ret |= BIT_ULL(i);
- return ret;
-}
-
-u64 bch2_fsck_recovery_passes(void)
-{
- return bch2_recovery_passes_match(PASS_FSCK);
-}
-
-static void bch2_run_async_recovery_passes(struct bch_fs *c)
-{
- if (!down_trylock(&c->recovery.run_lock))
- return;
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_async_recovery_passes))
- goto unlock;
-
- if (queue_work(system_long_wq, &c->recovery.work))
- return;
-
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes);
-unlock:
- up(&c->recovery.run_lock);
-}
-
-static bool recovery_pass_needs_set(struct bch_fs *c,
- enum bch_recovery_pass pass,
- enum bch_run_recovery_pass_flags *flags)
-{
- struct bch_fs_recovery *r = &c->recovery;
-
- /*
- * Never run scan_for_btree_nodes persistently: check_topology will run
- * it if required
- */
- if (pass == BCH_RECOVERY_PASS_scan_for_btree_nodes)
- *flags |= RUN_RECOVERY_PASS_nopersistent;
-
- if ((*flags & RUN_RECOVERY_PASS_ratelimit) &&
- !bch2_recovery_pass_want_ratelimit(c, pass))
- *flags &= ~RUN_RECOVERY_PASS_ratelimit;
-
- /*
- * If RUN_RECOVERY_PASS_nopersistent is set, we don't want to do
- * anything if the pass has already run: nopersistent passes mean we need
- * a prior pass to run before we continue to repair, and we don't expect
- * that pass to fix the damage we encountered.
- *
- * Otherwise, we run run_explicit_recovery_pass when we find damage, so
- * it should run again even if it's already run:
- */
- bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags);
- bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent);
- bool rewind = in_recovery &&
- r->curr_pass > pass &&
- !(r->passes_complete & BIT_ULL(pass));
-
- if (persistent
- ? !(c->sb.recovery_passes_required & BIT_ULL(pass))
- : !((r->passes_to_run|r->passes_complete) & BIT_ULL(pass)))
- return true;
-
- if (!(*flags & RUN_RECOVERY_PASS_ratelimit) &&
- (r->passes_ratelimiting & BIT_ULL(pass)))
- return true;
-
- if (rewind)
- return true;
-
- return false;
-}
-
-/*
- * For when we need to rewind recovery passes and run a pass we skipped:
- */
-int __bch2_run_explicit_recovery_pass(struct bch_fs *c,
- struct printbuf *out,
- enum bch_recovery_pass pass,
- enum bch_run_recovery_pass_flags flags)
-{
- struct bch_fs_recovery *r = &c->recovery;
- int ret = 0;
-
- lockdep_assert_held(&c->sb_lock);
-
- bch2_printbuf_make_room(out, 1024);
- out->atomic++;
-
- unsigned long lockflags;
- spin_lock_irqsave(&r->lock, lockflags);
-
- if (!recovery_pass_needs_set(c, pass, &flags))
- goto out;
-
- bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags);
- bool rewind = in_recovery &&
- r->curr_pass > pass &&
- !(r->passes_complete & BIT_ULL(pass));
- bool ratelimit = flags & RUN_RECOVERY_PASS_ratelimit;
-
- if (!(flags & RUN_RECOVERY_PASS_nopersistent)) {
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required);
- }
-
- if (pass < BCH_RECOVERY_PASS_set_may_go_rw &&
- (!in_recovery || r->curr_pass >= BCH_RECOVERY_PASS_set_may_go_rw)) {
- prt_printf(out, "need recovery pass %s (%u), but already rw\n",
- bch2_recovery_passes[pass], pass);
- ret = bch_err_throw(c, cannot_rewind_recovery);
- goto out;
- }
-
- if (ratelimit)
- r->passes_ratelimiting |= BIT_ULL(pass);
- else
- r->passes_ratelimiting &= ~BIT_ULL(pass);
-
- if (in_recovery && !ratelimit) {
- prt_printf(out, "running recovery pass %s (%u), currently at %s (%u)%s\n",
- bch2_recovery_passes[pass], pass,
- bch2_recovery_passes[r->curr_pass], r->curr_pass,
- rewind ? " - rewinding" : "");
-
- r->passes_to_run |= BIT_ULL(pass);
-
- if (rewind) {
- r->next_pass = pass;
- r->passes_complete &= (1ULL << pass) >> 1;
- ret = bch_err_throw(c, restart_recovery);
- }
- } else {
- prt_printf(out, "scheduling recovery pass %s (%u)%s\n",
- bch2_recovery_passes[pass], pass,
- ratelimit ? " - ratelimiting" : "");
-
- struct recovery_pass_fn *p = recovery_pass_fns + pass;
- if (p->when & PASS_ONLINE)
- bch2_run_async_recovery_passes(c);
- }
-out:
- spin_unlock_irqrestore(&r->lock, lockflags);
- --out->atomic;
- return ret;
-}
-
-int bch2_run_explicit_recovery_pass(struct bch_fs *c,
- struct printbuf *out,
- enum bch_recovery_pass pass,
- enum bch_run_recovery_pass_flags flags)
-{
- int ret = 0;
-
- if (recovery_pass_needs_set(c, pass, &flags)) {
- guard(mutex)(&c->sb_lock);
- ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags);
- bch2_write_super(c);
- }
-
- return ret;
-}
-
-/*
- * Returns 0 if @pass has run recently, otherwise one of
- * -BCH_ERR_restart_recovery
- * -BCH_ERR_recovery_pass_will_run
- */
-int bch2_require_recovery_pass(struct bch_fs *c,
- struct printbuf *out,
- enum bch_recovery_pass pass)
-{
- if (test_bit(BCH_FS_in_recovery, &c->flags) &&
- c->recovery.passes_complete & BIT_ULL(pass))
- return 0;
-
- guard(mutex)(&c->sb_lock);
-
- if (bch2_recovery_pass_want_ratelimit(c, pass))
- return 0;
-
- enum bch_run_recovery_pass_flags flags = 0;
- int ret = 0;
-
- if (recovery_pass_needs_set(c, pass, &flags)) {
- ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags);
- bch2_write_super(c);
- }
-
- return ret ?: bch_err_throw(c, recovery_pass_will_run);
-}
-
-int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
-{
- enum bch_run_recovery_pass_flags flags = 0;
-
- if (!recovery_pass_needs_set(c, pass, &flags))
- return 0;
-
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
-
- mutex_lock(&c->sb_lock);
- int ret = __bch2_run_explicit_recovery_pass(c, &buf, pass,
- RUN_RECOVERY_PASS_nopersistent);
- mutex_unlock(&c->sb_lock);
-
- bch2_print_str(c, KERN_NOTICE, buf.buf);
- printbuf_exit(&buf);
- return ret;
-}
-
-static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
-{
- struct bch_fs_recovery *r = &c->recovery;
- struct recovery_pass_fn *p = recovery_pass_fns + pass;
-
- if (!(p->when & PASS_SILENT))
- bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
- bch2_recovery_passes[pass]);
-
- s64 start_time = ktime_get_real_seconds();
- int ret = p->fn(c);
-
- r->passes_to_run &= ~BIT_ULL(pass);
-
- if (ret) {
- r->passes_failing |= BIT_ULL(pass);
- return ret;
- }
-
- r->passes_failing = 0;
-
- if (!test_bit(BCH_FS_error, &c->flags))
- bch2_sb_recovery_pass_complete(c, pass, start_time);
-
- if (!(p->when & PASS_SILENT))
- bch2_print(c, KERN_CONT " done\n");
-
- return 0;
-}
-
-static int __bch2_run_recovery_passes(struct bch_fs *c, u64 orig_passes_to_run,
- bool online)
-{
- struct bch_fs_recovery *r = &c->recovery;
- int ret = 0;
-
- spin_lock_irq(&r->lock);
-
- if (online)
- orig_passes_to_run &= bch2_recovery_passes_match(PASS_ONLINE);
-
- if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))
- orig_passes_to_run &= ~bch2_recovery_passes_match(PASS_ALLOC);
-
- /*
- * A failed recovery pass will be retried after another pass succeeds -
- * but not this iteration.
- *
- * This is because some passes depend on repair done by other passes: we
- * may want to retry, but we don't want to loop on failing passes.
- */
-
- orig_passes_to_run &= ~r->passes_failing;
-
- r->passes_to_run = orig_passes_to_run;
-
- while (r->passes_to_run) {
- unsigned prev_done = r->pass_done;
- unsigned pass = __ffs64(r->passes_to_run);
- r->curr_pass = pass;
- r->next_pass = r->curr_pass + 1;
- r->passes_to_run &= ~BIT_ULL(pass);
-
- spin_unlock_irq(&r->lock);
-
- int ret2 = bch2_run_recovery_pass(c, pass) ?:
- bch2_journal_flush(&c->journal);
-
- spin_lock_irq(&r->lock);
-
- if (r->next_pass < r->curr_pass) {
- /* Rewind: */
- r->passes_to_run |= orig_passes_to_run & (~0ULL << r->next_pass);
- } else if (!ret2) {
- r->pass_done = max(r->pass_done, pass);
- r->passes_complete |= BIT_ULL(pass);
- } else {
- ret = ret2;
- }
-
- if (ret && !online)
- break;
-
- if (prev_done <= BCH_RECOVERY_PASS_check_snapshots &&
- r->pass_done > BCH_RECOVERY_PASS_check_snapshots) {
- bch2_copygc_wakeup(c);
- bch2_rebalance_wakeup(c);
- }
- }
-
- clear_bit(BCH_FS_in_recovery, &c->flags);
- spin_unlock_irq(&r->lock);
-
- return ret;
-}
-
-static void bch2_async_recovery_passes_work(struct work_struct *work)
-{
- struct bch_fs *c = container_of(work, struct bch_fs, recovery.work);
- struct bch_fs_recovery *r = &c->recovery;
-
- __bch2_run_recovery_passes(c,
- c->sb.recovery_passes_required & ~r->passes_ratelimiting,
- true);
-
- up(&r->run_lock);
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes);
-}
-
-int bch2_run_online_recovery_passes(struct bch_fs *c, u64 passes)
-{
- return __bch2_run_recovery_passes(c, c->sb.recovery_passes_required|passes, true);
-}
-
-int bch2_run_recovery_passes(struct bch_fs *c, enum bch_recovery_pass from)
-{
- u64 passes =
- bch2_recovery_passes_match(PASS_ALWAYS) |
- (!c->sb.clean ? bch2_recovery_passes_match(PASS_UNCLEAN) : 0) |
- (c->opts.fsck ? bch2_recovery_passes_match(PASS_FSCK) : 0) |
- c->opts.recovery_passes |
- c->sb.recovery_passes_required;
-
- if (c->opts.recovery_pass_last)
- passes &= BIT_ULL(c->opts.recovery_pass_last + 1) - 1;
-
- /*
- * We can't allow set_may_go_rw to be excluded; that would cause us to
- * use the journal replay keys for updates where it's not expected.
- */
- c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw;
- passes &= ~c->opts.recovery_passes_exclude;
-
- passes &= ~(BIT_ULL(from) - 1);
-
- down(&c->recovery.run_lock);
- int ret = __bch2_run_recovery_passes(c, passes, false);
- up(&c->recovery.run_lock);
-
- return ret;
-}
-
-static void prt_passes(struct printbuf *out, const char *msg, u64 passes)
-{
- prt_printf(out, "%s:\t", msg);
- prt_bitflags(out, bch2_recovery_passes, passes);
- prt_newline(out);
-}
-
-void bch2_recovery_pass_status_to_text(struct printbuf *out, struct bch_fs *c)
-{
- struct bch_fs_recovery *r = &c->recovery;
-
- printbuf_tabstop_push(out, 32);
- prt_passes(out, "Scheduled passes", c->sb.recovery_passes_required);
- prt_passes(out, "Scheduled online passes", c->sb.recovery_passes_required &
- bch2_recovery_passes_match(PASS_ONLINE));
- prt_passes(out, "Complete passes", r->passes_complete);
- prt_passes(out, "Failing passes", r->passes_failing);
-
- if (r->curr_pass) {
- prt_printf(out, "Current pass:\t%s\n", bch2_recovery_passes[r->curr_pass]);
- prt_passes(out, "Current passes", r->passes_to_run);
- }
-}
-
-void bch2_fs_recovery_passes_init(struct bch_fs *c)
-{
- spin_lock_init(&c->recovery.lock);
- sema_init(&c->recovery.run_lock, 1);
-
- INIT_WORK(&c->recovery.work, bch2_async_recovery_passes_work);
-}
diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h
deleted file mode 100644
index 2117f0ce1922..000000000000
--- a/fs/bcachefs/recovery_passes.h
+++ /dev/null
@@ -1,48 +0,0 @@
-#ifndef _BCACHEFS_RECOVERY_PASSES_H
-#define _BCACHEFS_RECOVERY_PASSES_H
-
-extern const char * const bch2_recovery_passes[];
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes;
-
-u64 bch2_recovery_passes_to_stable(u64 v);
-u64 bch2_recovery_passes_from_stable(u64 v);
-
-u64 bch2_fsck_recovery_passes(void);
-
-void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *, enum bch_recovery_pass);
-
-enum bch_run_recovery_pass_flags {
- RUN_RECOVERY_PASS_nopersistent = BIT(0),
- RUN_RECOVERY_PASS_ratelimit = BIT(1),
-};
-
-static inline bool go_rw_in_recovery(struct bch_fs *c)
-{
- return (c->journal_keys.nr ||
- !c->opts.read_only ||
- !c->sb.clean ||
- c->opts.recovery_passes ||
- (c->opts.fsck && !(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))));
-}
-
-int bch2_run_print_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass);
-
-int __bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *,
- enum bch_recovery_pass,
- enum bch_run_recovery_pass_flags);
-int bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *,
- enum bch_recovery_pass,
- enum bch_run_recovery_pass_flags);
-
-int bch2_require_recovery_pass(struct bch_fs *, struct printbuf *,
- enum bch_recovery_pass);
-
-int bch2_run_online_recovery_passes(struct bch_fs *, u64);
-int bch2_run_recovery_passes(struct bch_fs *, enum bch_recovery_pass);
-
-void bch2_recovery_pass_status_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_fs_recovery_passes_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_RECOVERY_PASSES_H */
diff --git a/fs/bcachefs/recovery_passes_format.h b/fs/bcachefs/recovery_passes_format.h
deleted file mode 100644
index b63c20558d3d..000000000000
--- a/fs/bcachefs/recovery_passes_format.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_RECOVERY_PASSES_FORMAT_H
-#define _BCACHEFS_RECOVERY_PASSES_FORMAT_H
-
-#define PASS_SILENT BIT(0)
-#define PASS_FSCK BIT(1)
-#define PASS_UNCLEAN BIT(2)
-#define PASS_ALWAYS BIT(3)
-#define PASS_ONLINE BIT(4)
-#define PASS_ALLOC BIT(5)
-#define PASS_FSCK_ALLOC (PASS_FSCK|PASS_ALLOC)
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define PASS_FSCK_DEBUG BIT(1)
-#else
-#define PASS_FSCK_DEBUG 0
-#endif
-
-/*
- * Passes may be reordered, but the second field is a persistent identifier and
- * must never change:
- */
-#define BCH_RECOVERY_PASSES() \
- x(recovery_pass_empty, 41, PASS_SILENT) \
- x(scan_for_btree_nodes, 37, 0) \
- x(check_topology, 4, 0) \
- x(accounting_read, 39, PASS_ALWAYS) \
- x(alloc_read, 0, PASS_ALWAYS) \
- x(stripes_read, 1, 0) \
- x(initialize_subvolumes, 2, 0) \
- x(snapshots_read, 3, PASS_ALWAYS) \
- x(check_allocations, 5, PASS_FSCK_ALLOC) \
- x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \
- x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \
- x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \
- x(journal_replay, 9, PASS_ALWAYS) \
- x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK_ALLOC) \
- x(check_lrus, 11, PASS_ONLINE|PASS_FSCK_ALLOC) \
- x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK_ALLOC) \
- x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \
- x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK_ALLOC) \
- x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK_ALLOC) \
- x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \
- x(bucket_gens_init, 17, 0) \
- x(reconstruct_snapshots, 38, 0) \
- x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \
- x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \
- x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \
- x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \
- x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \
- x(fs_upgrade_for_subvolumes, 22, 0) \
- x(check_inodes, 24, PASS_FSCK) \
- x(check_extents, 25, PASS_FSCK) \
- x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \
- x(check_dirents, 27, PASS_FSCK) \
- x(check_xattrs, 28, PASS_FSCK) \
- x(check_root, 29, PASS_ONLINE|PASS_FSCK) \
- x(check_unreachable_inodes, 40, PASS_FSCK) \
- x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \
- x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
- x(check_nlinks, 31, PASS_FSCK) \
- x(check_rebalance_work, 43, PASS_ONLINE|PASS_FSCK) \
- x(resume_logged_ops, 23, PASS_ALWAYS) \
- x(delete_dead_inodes, 32, PASS_ALWAYS) \
- x(fix_reflink_p, 33, 0) \
- x(set_fs_needs_rebalance, 34, 0) \
- x(lookup_root_inode, 42, PASS_ALWAYS|PASS_SILENT)
-
-/* We normally enumerate recovery passes in the order we run them: */
-enum bch_recovery_pass {
-#define x(n, id, when) BCH_RECOVERY_PASS_##n,
- BCH_RECOVERY_PASSES()
-#undef x
- BCH_RECOVERY_PASS_NR
-};
-
-/* But we also need stable identifiers that can be used in the superblock */
-enum bch_recovery_pass_stable {
-#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id,
- BCH_RECOVERY_PASSES()
-#undef x
-};
-
-struct recovery_pass_entry {
- __le64 last_run;
- __le32 last_runtime;
- __le32 flags;
-};
-
-LE32_BITMASK(BCH_RECOVERY_PASS_NO_RATELIMIT, struct recovery_pass_entry, flags, 0, 1)
-
-struct bch_sb_field_recovery_passes {
- struct bch_sb_field field;
- struct recovery_pass_entry start[];
-};
-
-static inline unsigned
-recovery_passes_nr_entries(struct bch_sb_field_recovery_passes *r)
-{
- return r
- ? ((vstruct_end(&r->field) - (void *) &r->start[0]) /
- sizeof(struct recovery_pass_entry))
- : 0;
-}
-
-#endif /* _BCACHEFS_RECOVERY_PASSES_FORMAT_H */
diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h
deleted file mode 100644
index aa9526938cc3..000000000000
--- a/fs/bcachefs/recovery_passes_types.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H
-#define _BCACHEFS_RECOVERY_PASSES_TYPES_H
-
-struct bch_fs_recovery {
- /*
- * Two different uses:
- * "Has this fsck pass?" - i.e. should this type of error be an
- * emergency read-only
- * And, in certain situations fsck will rewind to an earlier pass: used
- * for signaling to the toplevel code which pass we want to run now.
- */
- enum bch_recovery_pass curr_pass;
- enum bch_recovery_pass next_pass;
- /* never rewinds version of curr_pass */
- enum bch_recovery_pass pass_done;
- u64 passes_to_run;
- /* bitmask of recovery passes that we actually ran */
- u64 passes_complete;
- u64 passes_failing;
- u64 passes_ratelimiting;
- spinlock_t lock;
- struct semaphore run_lock;
- struct work_struct work;
-};
-
-#endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
deleted file mode 100644
index 92b90cfe622b..000000000000
--- a/fs/bcachefs/reflink.c
+++ /dev/null
@@ -1,865 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "extents.h"
-#include "inode.h"
-#include "io_misc.h"
-#include "io_write.h"
-#include "rebalance.h"
-#include "reflink.h"
-#include "subvolume.h"
-#include "super-io.h"
-
-#include <linux/sched/signal.h>
-
-static inline bool bkey_extent_is_reflink_data(const struct bkey *k)
-{
- switch (k->type) {
- case KEY_TYPE_reflink_v:
- case KEY_TYPE_indirect_inline_data:
- return true;
- default:
- return false;
- }
-}
-
-static inline unsigned bkey_type_to_indirect(const struct bkey *k)
-{
- switch (k->type) {
- case KEY_TYPE_extent:
- return KEY_TYPE_reflink_v;
- case KEY_TYPE_inline_data:
- return KEY_TYPE_indirect_inline_data;
- default:
- return 0;
- }
-}
-
-/* reflink pointers */
-
-int bch2_reflink_p_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- int ret = 0;
-
- bkey_fsck_err_on(REFLINK_P_IDX(p.v) < le32_to_cpu(p.v->front_pad),
- c, reflink_p_front_pad_bad,
- "idx < front_pad (%llu < %u)",
- REFLINK_P_IDX(p.v), le32_to_cpu(p.v->front_pad));
-fsck_err:
- return ret;
-}
-
-void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
-
- prt_printf(out, "idx %llu front_pad %u back_pad %u",
- REFLINK_P_IDX(p.v),
- le32_to_cpu(p.v->front_pad),
- le32_to_cpu(p.v->back_pad));
-
- if (REFLINK_P_ERROR(p.v))
- prt_str(out, " error");
-}
-
-bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
-{
- struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l);
- struct bkey_s_c_reflink_p r = bkey_s_c_to_reflink_p(_r);
-
- /*
- * Disabled for now, the triggers code needs to be reworked for merging
- * of reflink pointers to work:
- */
- return false;
-
- if (REFLINK_P_IDX(l.v) + l.k->size != REFLINK_P_IDX(r.v))
- return false;
-
- if (REFLINK_P_ERROR(l.v) != REFLINK_P_ERROR(r.v))
- return false;
-
- bch2_key_resize(l.k, l.k->size + r.k->size);
- return true;
-}
-
-/* indirect extents */
-
-int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, REFLINK_P_IDX_MAX)),
- c, reflink_v_pos_bad,
- "indirect extent above maximum position 0:%llu",
- REFLINK_P_IDX_MAX);
-
- ret = bch2_bkey_ptrs_validate(c, k, from);
-fsck_err:
- return ret;
-}
-
-void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
-
- prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount));
-
- bch2_bkey_ptrs_to_text(out, c, k);
-}
-
-#if 0
-Currently disabled, needs to be debugged:
-
-bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
-{
- struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l);
- struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r);
-
- return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r);
-}
-#endif
-
-/* indirect inline data */
-
-int bch2_indirect_inline_data_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- return 0;
-}
-
-void bch2_indirect_inline_data_to_text(struct printbuf *out,
- struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k);
- unsigned datalen = bkey_inline_data_bytes(k.k);
-
- prt_printf(out, "refcount %llu datalen %u: %*phN",
- le64_to_cpu(d.v->refcount), datalen,
- min(datalen, 32U), d.v->data);
-}
-
-/* lookup */
-
-static int bch2_indirect_extent_not_missing(struct btree_trans *trans, struct bkey_s_c_reflink_p p,
- bool should_commit)
-{
- struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p);
- int ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- return ret;
-
- SET_REFLINK_P_ERROR(&new->v, false);
- ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun);
- if (ret)
- return ret;
-
- if (!should_commit)
- return 0;
-
- return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
- -BCH_ERR_transaction_restart_nested;
-}
-
-static int bch2_indirect_extent_missing_error(struct btree_trans *trans,
- struct bkey_s_c_reflink_p p,
- u64 missing_start, u64 missing_end,
- bool should_commit)
-{
- if (REFLINK_P_ERROR(p.v))
- return 0;
-
- struct bch_fs *c = trans->c;
- u64 live_start = REFLINK_P_IDX(p.v);
- u64 live_end = REFLINK_P_IDX(p.v) + p.k->size;
- u64 refd_start = live_start - le32_to_cpu(p.v->front_pad);
- u64 refd_end = live_end + le32_to_cpu(p.v->back_pad);
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- BUG_ON(missing_start < refd_start);
- BUG_ON(missing_end > refd_end);
-
- struct bpos missing_pos = bkey_start_pos(p.k);
- missing_pos.offset += missing_start - live_start;
-
- prt_printf(&buf, "pointer to missing indirect extent in ");
- ret = bch2_inum_snap_offset_err_msg_trans(trans, &buf, missing_pos);
- if (ret)
- goto err;
-
- prt_printf(&buf, "-%llu\n", (missing_pos.offset + (missing_end - missing_start)) << 9);
- bch2_bkey_val_to_text(&buf, c, p.s_c);
-
- prt_printf(&buf, "\nmissing reflink btree range %llu-%llu",
- missing_start, missing_end);
-
- if (fsck_err(trans, reflink_p_to_missing_reflink_v, "%s", buf.buf)) {
- struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p);
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto err;
-
- /*
- * Is the missing range not actually needed?
- *
- * p.v->idx refers to the data that we actually want, but if the
- * indirect extent we point to was bigger, front_pad and back_pad
- * indicate the range we took a reference on.
- */
-
- if (missing_end <= live_start) {
- new->v.front_pad = cpu_to_le32(live_start - missing_end);
- } else if (missing_start >= live_end) {
- new->v.back_pad = cpu_to_le32(missing_start - live_end);
- } else {
- struct bpos new_start = bkey_start_pos(&new->k);
- struct bpos new_end = new->k.p;
-
- if (missing_start > live_start)
- new_start.offset += missing_start - live_start;
- if (missing_end < live_end)
- new_end.offset -= live_end - missing_end;
-
- bch2_cut_front(new_start, &new->k_i);
- bch2_cut_back(new_end, &new->k_i);
-
- SET_REFLINK_P_ERROR(&new->v, true);
- }
-
- ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun);
- if (ret)
- goto err;
-
- if (should_commit)
- ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
- -BCH_ERR_transaction_restart_nested;
- }
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-/*
- * This is used from the read path, which doesn't expect to have to do a
- * transaction commit, and from triggers, which should not be doing a commit:
- */
-struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans,
- struct btree_iter *iter,
- s64 *offset_into_extent,
- struct bkey_s_c_reflink_p p,
- bool should_commit,
- unsigned iter_flags)
-{
- BUG_ON(*offset_into_extent < -((s64) le32_to_cpu(p.v->front_pad)));
- BUG_ON(*offset_into_extent >= p.k->size + le32_to_cpu(p.v->back_pad));
-
- u64 reflink_offset = REFLINK_P_IDX(p.v) + *offset_into_extent;
-
- struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_reflink,
- POS(0, reflink_offset), iter_flags);
- if (bkey_err(k))
- return k;
-
- if (unlikely(!bkey_extent_is_reflink_data(k.k))) {
- u64 missing_end = min(k.k->p.offset,
- REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad));
- BUG_ON(reflink_offset == missing_end);
-
- int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset,
- missing_end, should_commit);
- if (ret) {
- bch2_trans_iter_exit(trans, iter);
- return bkey_s_c_err(ret);
- }
- } else if (unlikely(REFLINK_P_ERROR(p.v))) {
- int ret = bch2_indirect_extent_not_missing(trans, p, should_commit);
- if (ret) {
- bch2_trans_iter_exit(trans, iter);
- return bkey_s_c_err(ret);
- }
- }
-
- *offset_into_extent = reflink_offset - bkey_start_offset(k.k);
- return k;
-}
-
-/* reflink pointer trigger */
-
-static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
- struct bkey_s_c_reflink_p p, u64 *idx,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
-
- s64 offset_into_extent = *idx - REFLINK_P_IDX(p.v);
- struct btree_iter iter;
- struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, &offset_into_extent, p, false,
- BTREE_ITER_intent|
- BTREE_ITER_with_updates);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (!bkey_refcount_c(k)) {
- if (!(flags & BTREE_TRIGGER_overwrite))
- ret = bch_err_throw(c, missing_indirect_extent);
- goto next;
- }
-
- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto err;
-
- __le64 *refcount = bkey_refcount(bkey_i_to_s(new));
- if (!*refcount && (flags & BTREE_TRIGGER_overwrite)) {
- bch2_bkey_val_to_text(&buf, c, p.s_c);
- prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, c, k);
- log_fsck_err(trans, reflink_refcount_underflow,
- "indirect extent refcount underflow while marking\n%s",
- buf.buf);
- goto next;
- }
-
- if (flags & BTREE_TRIGGER_insert) {
- struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
- u64 pad;
-
- pad = max_t(s64, le32_to_cpu(v->front_pad),
- REFLINK_P_IDX(v) - bkey_start_offset(&new->k));
- BUG_ON(pad > U32_MAX);
- v->front_pad = cpu_to_le32(pad);
-
- pad = max_t(s64, le32_to_cpu(v->back_pad),
- new->k.p.offset - p.k->size - REFLINK_P_IDX(v));
- BUG_ON(pad > U32_MAX);
- v->back_pad = cpu_to_le32(pad);
- }
-
- le64_add_cpu(refcount, !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1);
-
- bch2_btree_iter_set_pos_to_extent_start(&iter);
- ret = bch2_trans_update(trans, &iter, new, 0);
- if (ret)
- goto err;
-next:
- *idx = k.k->p.offset;
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &iter);
- printbuf_exit(&buf);
- return ret;
-}
-
-static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans,
- struct bkey_s_c_reflink_p p, u64 *idx,
- enum btree_iter_update_trigger_flags flags,
- size_t r_idx)
-{
- struct bch_fs *c = trans->c;
- struct reflink_gc *r;
- int add = !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1;
- u64 next_idx = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad);
- s64 ret = 0;
- struct printbuf buf = PRINTBUF;
-
- if (r_idx >= c->reflink_gc_nr)
- goto not_found;
-
- r = genradix_ptr(&c->reflink_gc_table, r_idx);
- next_idx = min(next_idx, r->offset - r->size);
- if (*idx < next_idx)
- goto not_found;
-
- BUG_ON((s64) r->refcount + add < 0);
-
- if (flags & BTREE_TRIGGER_gc)
- r->refcount += add;
- *idx = r->offset;
- return 0;
-not_found:
- if (flags & BTREE_TRIGGER_check_repair) {
- ret = bch2_indirect_extent_missing_error(trans, p, *idx, next_idx, false);
- if (ret)
- goto err;
- }
-
- *idx = next_idx;
-err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static int __trigger_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level, struct bkey_s_c k,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- int ret = 0;
-
- u64 idx = REFLINK_P_IDX(p.v) - le32_to_cpu(p.v->front_pad);
- u64 end = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad);
-
- if (flags & BTREE_TRIGGER_transactional) {
- while (idx < end && !ret)
- ret = trans_trigger_reflink_p_segment(trans, p, &idx, flags);
- }
-
- if (flags & (BTREE_TRIGGER_check_repair|BTREE_TRIGGER_gc)) {
- size_t l = 0, r = c->reflink_gc_nr;
-
- while (l < r) {
- size_t m = l + (r - l) / 2;
- struct reflink_gc *ref = genradix_ptr(&c->reflink_gc_table, m);
- if (ref->offset <= idx)
- l = m + 1;
- else
- r = m;
- }
-
- while (idx < end && !ret)
- ret = gc_trigger_reflink_p_segment(trans, p, &idx, flags, l++);
- }
-
- return ret;
-}
-
-int bch2_trigger_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old,
- struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- if ((flags & BTREE_TRIGGER_transactional) &&
- (flags & BTREE_TRIGGER_insert)) {
- struct bch_reflink_p *v = bkey_s_to_reflink_p(new).v;
-
- v->front_pad = v->back_pad = 0;
- }
-
- return trigger_run_overwrite_then_insert(__trigger_reflink_p, trans, btree_id, level, old, new, flags);
-}
-
-/* indirect extent trigger */
-
-static inline void
-check_indirect_extent_deleting(struct bkey_s new,
- enum btree_iter_update_trigger_flags *flags)
-{
- if ((*flags & BTREE_TRIGGER_insert) && !*bkey_refcount(new)) {
- new.k->type = KEY_TYPE_deleted;
- new.k->size = 0;
- set_bkey_val_u64s(new.k, 0);
- *flags &= ~BTREE_TRIGGER_insert;
- }
-}
-
-int bch2_trigger_reflink_v(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- if ((flags & BTREE_TRIGGER_transactional) &&
- (flags & BTREE_TRIGGER_insert))
- check_indirect_extent_deleting(new, &flags);
-
- return bch2_trigger_extent(trans, btree_id, level, old, new, flags);
-}
-
-int bch2_trigger_indirect_inline_data(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- check_indirect_extent_deleting(new, &flags);
-
- return 0;
-}
-
-/* create */
-
-static int bch2_make_extent_indirect(struct btree_trans *trans,
- struct btree_iter *extent_iter,
- struct bkey_i *orig,
- bool reflink_p_may_update_opts_field)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter reflink_iter = {};
- struct bkey_s_c k;
- struct bkey_i *r_v;
- struct bkey_i_reflink_p *r_p;
- __le64 *refcount;
- int ret;
-
- if (orig->k.type == KEY_TYPE_inline_data)
- bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data);
-
- bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX,
- BTREE_ITER_intent);
- k = bch2_btree_iter_peek_prev(trans, &reflink_iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- /*
- * XXX: we're assuming that 56 bits will be enough for the life of the
- * filesystem: we need to implement wraparound, with a cursor in the
- * logged ops btree:
- */
- if (bkey_ge(reflink_iter.pos, POS(0, REFLINK_P_IDX_MAX - orig->k.size)))
- return -ENOSPC;
-
- r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k));
- ret = PTR_ERR_OR_ZERO(r_v);
- if (ret)
- goto err;
-
- bkey_init(&r_v->k);
- r_v->k.type = bkey_type_to_indirect(&orig->k);
- r_v->k.p = reflink_iter.pos;
- bch2_key_resize(&r_v->k, orig->k.size);
- r_v->k.bversion = orig->k.bversion;
-
- set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k));
-
- refcount = bkey_refcount(bkey_i_to_s(r_v));
- *refcount = 0;
- memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k));
-
- ret = bch2_trans_update(trans, &reflink_iter, r_v, 0);
- if (ret)
- goto err;
-
- /*
- * orig is in a bkey_buf which statically allocates 5 64s for the val,
- * so we know it will be big enough:
- */
- orig->k.type = KEY_TYPE_reflink_p;
- r_p = bkey_i_to_reflink_p(orig);
- set_bkey_val_bytes(&r_p->k, sizeof(r_p->v));
-
- /* FORTIFY_SOURCE is broken here, and doesn't provide unsafe_memset() */
-#if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE)
- __underlying_memset(&r_p->v, 0, sizeof(r_p->v));
-#else
- memset(&r_p->v, 0, sizeof(r_p->v));
-#endif
-
- SET_REFLINK_P_IDX(&r_p->v, bkey_start_offset(&r_v->k));
-
- if (reflink_p_may_update_opts_field)
- SET_REFLINK_P_MAY_UPDATE_OPTIONS(&r_p->v, true);
-
- ret = bch2_trans_update(trans, extent_iter, &r_p->k_i,
- BTREE_UPDATE_internal_snapshot_node);
-err:
- bch2_trans_iter_exit(trans, &reflink_iter);
-
- return ret;
-}
-
-static struct bkey_s_c get_next_src(struct btree_trans *trans,
- struct btree_iter *iter, struct bpos end)
-{
- struct bkey_s_c k;
- int ret;
-
- for_each_btree_key_max_continue_norestart(trans, *iter, end, 0, k, ret) {
- if (bkey_extent_is_unwritten(k))
- continue;
-
- if (bkey_extent_is_data(k.k))
- return k;
- }
-
- if (bkey_ge(iter->pos, end))
- bch2_btree_iter_set_pos(trans, iter, end);
- return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
-}
-
-s64 bch2_remap_range(struct bch_fs *c,
- subvol_inum dst_inum, u64 dst_offset,
- subvol_inum src_inum, u64 src_offset,
- u64 remap_sectors,
- u64 new_i_size, s64 *i_sectors_delta,
- bool may_change_src_io_path_opts)
-{
- struct btree_trans *trans;
- struct btree_iter dst_iter, src_iter;
- struct bkey_s_c src_k;
- struct bkey_buf new_dst, new_src;
- struct bpos dst_start = POS(dst_inum.inum, dst_offset);
- struct bpos src_start = POS(src_inum.inum, src_offset);
- struct bpos dst_end = dst_start, src_end = src_start;
- struct bch_io_opts opts;
- struct bpos src_want;
- u64 dst_done = 0;
- u32 dst_snapshot, src_snapshot;
- bool reflink_p_may_update_opts_field =
- !bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts);
- int ret = 0, ret2 = 0;
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_reflink))
- return bch_err_throw(c, erofs_no_writes);
-
- bch2_check_set_feature(c, BCH_FEATURE_reflink);
-
- dst_end.offset += remap_sectors;
- src_end.offset += remap_sectors;
-
- bch2_bkey_buf_init(&new_dst);
- bch2_bkey_buf_init(&new_src);
- trans = bch2_trans_get(c);
-
- ret = bch2_inum_opts_get(trans, src_inum, &opts);
- if (ret)
- goto err;
-
- bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start,
- BTREE_ITER_intent);
- bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start,
- BTREE_ITER_intent);
-
- while ((ret == 0 ||
- bch2_err_matches(ret, BCH_ERR_transaction_restart)) &&
- bkey_lt(dst_iter.pos, dst_end)) {
- struct disk_reservation disk_res = { 0 };
-
- bch2_trans_begin(trans);
-
- if (fatal_signal_pending(current)) {
- ret = -EINTR;
- break;
- }
-
- ret = bch2_subvolume_get_snapshot(trans, src_inum.subvol,
- &src_snapshot);
- if (ret)
- continue;
-
- bch2_btree_iter_set_snapshot(trans, &src_iter, src_snapshot);
-
- ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol,
- &dst_snapshot);
- if (ret)
- continue;
-
- bch2_btree_iter_set_snapshot(trans, &dst_iter, dst_snapshot);
-
- if (dst_inum.inum < src_inum.inum) {
- /* Avoid some lock cycle transaction restarts */
- ret = bch2_btree_iter_traverse(trans, &dst_iter);
- if (ret)
- continue;
- }
-
- dst_done = dst_iter.pos.offset - dst_start.offset;
- src_want = POS(src_start.inode, src_start.offset + dst_done);
- bch2_btree_iter_set_pos(trans, &src_iter, src_want);
-
- src_k = get_next_src(trans, &src_iter, src_end);
- ret = bkey_err(src_k);
- if (ret)
- continue;
-
- if (bkey_lt(src_want, src_iter.pos)) {
- ret = bch2_fpunch_at(trans, &dst_iter, dst_inum,
- min(dst_end.offset,
- dst_iter.pos.offset +
- src_iter.pos.offset - src_want.offset),
- i_sectors_delta);
- continue;
- }
-
- if (src_k.k->type != KEY_TYPE_reflink_p) {
- bch2_btree_iter_set_pos_to_extent_start(&src_iter);
-
- bch2_bkey_buf_reassemble(&new_src, c, src_k);
- src_k = bkey_i_to_s_c(new_src.k);
-
- ret = bch2_make_extent_indirect(trans, &src_iter,
- new_src.k,
- reflink_p_may_update_opts_field);
- if (ret)
- continue;
-
- BUG_ON(src_k.k->type != KEY_TYPE_reflink_p);
- }
-
- if (src_k.k->type == KEY_TYPE_reflink_p) {
- struct bkey_s_c_reflink_p src_p =
- bkey_s_c_to_reflink_p(src_k);
- struct bkey_i_reflink_p *dst_p =
- bkey_reflink_p_init(new_dst.k);
-
- u64 offset = REFLINK_P_IDX(src_p.v) +
- (src_want.offset -
- bkey_start_offset(src_k.k));
-
- SET_REFLINK_P_IDX(&dst_p->v, offset);
-
- if (reflink_p_may_update_opts_field &&
- may_change_src_io_path_opts &&
- REFLINK_P_MAY_UPDATE_OPTIONS(src_p.v))
- SET_REFLINK_P_MAY_UPDATE_OPTIONS(&dst_p->v, true);
- } else {
- BUG();
- }
-
- new_dst.k->k.p = dst_iter.pos;
- bch2_key_resize(&new_dst.k->k,
- min(src_k.k->p.offset - src_want.offset,
- dst_end.offset - dst_iter.pos.offset));
-
- ret = bch2_bkey_set_needs_rebalance(c, &opts, new_dst.k) ?:
- bch2_extent_update(trans, dst_inum, &dst_iter,
- new_dst.k, &disk_res,
- new_i_size, i_sectors_delta,
- true);
- bch2_disk_reservation_put(c, &disk_res);
- }
- bch2_trans_iter_exit(trans, &dst_iter);
- bch2_trans_iter_exit(trans, &src_iter);
-
- BUG_ON(!ret && !bkey_eq(dst_iter.pos, dst_end));
- BUG_ON(bkey_gt(dst_iter.pos, dst_end));
-
- dst_done = dst_iter.pos.offset - dst_start.offset;
- new_i_size = min(dst_iter.pos.offset << 9, new_i_size);
-
- do {
- struct bch_inode_unpacked inode_u;
- struct btree_iter inode_iter = {};
-
- bch2_trans_begin(trans);
-
- ret2 = bch2_inode_peek(trans, &inode_iter, &inode_u,
- dst_inum, BTREE_ITER_intent);
-
- if (!ret2 &&
- inode_u.bi_size < new_i_size) {
- inode_u.bi_size = new_i_size;
- ret2 = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
- }
-
- bch2_trans_iter_exit(trans, &inode_iter);
- } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart));
-err:
- bch2_trans_put(trans);
- bch2_bkey_buf_exit(&new_src, c);
- bch2_bkey_buf_exit(&new_dst, c);
-
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_reflink);
-
- return dst_done ?: ret ?: ret2;
-}
-
-/* fsck */
-
-static int bch2_gc_write_reflink_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- size_t *idx)
-{
- struct bch_fs *c = trans->c;
- const __le64 *refcount = bkey_refcount_c(k);
- struct printbuf buf = PRINTBUF;
- struct reflink_gc *r;
- int ret = 0;
-
- if (!refcount)
- return 0;
-
- while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) &&
- r->offset < k.k->p.offset)
- ++*idx;
-
- if (!r ||
- r->offset != k.k->p.offset ||
- r->size != k.k->size) {
- bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
- return -EINVAL;
- }
-
- if (fsck_err_on(r->refcount != le64_to_cpu(*refcount),
- trans, reflink_v_refcount_wrong,
- "reflink key has wrong refcount:\n"
- "%s\n"
- "should be %u",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf),
- r->refcount)) {
- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto out;
-
- if (!r->refcount)
- new->k.type = KEY_TYPE_deleted;
- else
- *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount);
- ret = bch2_trans_update(trans, iter, new, 0);
- }
-out:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_gc_reflink_done(struct bch_fs *c)
-{
- size_t idx = 0;
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_reflink, POS_MIN,
- BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_gc_write_reflink_key(trans, &iter, k, &idx)));
- c->reflink_gc_nr = 0;
- return ret;
-}
-
-int bch2_gc_reflink_start(struct bch_fs *c)
-{
- c->reflink_gc_nr = 0;
-
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
- BTREE_ITER_prefetch, k, ({
- const __le64 *refcount = bkey_refcount_c(k);
-
- if (!refcount)
- continue;
-
- struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table,
- c->reflink_gc_nr++, GFP_KERNEL);
- if (!r) {
- ret = bch_err_throw(c, ENOMEM_gc_reflink_start);
- break;
- }
-
- r->offset = k.k->p.offset;
- r->size = k.k->size;
- r->refcount = 0;
- 0;
- })));
-
- bch_err_fn(c, ret);
- return ret;
-}
diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h
deleted file mode 100644
index 1632780bdf18..000000000000
--- a/fs/bcachefs/reflink.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REFLINK_H
-#define _BCACHEFS_REFLINK_H
-
-int bch2_reflink_p_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \
- .key_validate = bch2_reflink_p_validate, \
- .val_to_text = bch2_reflink_p_to_text, \
- .key_merge = bch2_reflink_p_merge, \
- .trigger = bch2_trigger_reflink_p, \
- .min_val_size = 16, \
-})
-
-int bch2_reflink_v_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \
- .key_validate = bch2_reflink_v_validate, \
- .val_to_text = bch2_reflink_v_to_text, \
- .swab = bch2_ptr_swab, \
- .trigger = bch2_trigger_reflink_v, \
- .min_val_size = 8, \
-})
-
-int bch2_indirect_inline_data_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_indirect_inline_data_to_text(struct printbuf *,
- struct bch_fs *, struct bkey_s_c);
-int bch2_trigger_indirect_inline_data(struct btree_trans *,
- enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \
- .key_validate = bch2_indirect_inline_data_validate, \
- .val_to_text = bch2_indirect_inline_data_to_text, \
- .trigger = bch2_trigger_indirect_inline_data, \
- .min_val_size = 8, \
-})
-
-static inline const __le64 *bkey_refcount_c(struct bkey_s_c k)
-{
- switch (k.k->type) {
- case KEY_TYPE_reflink_v:
- return &bkey_s_c_to_reflink_v(k).v->refcount;
- case KEY_TYPE_indirect_inline_data:
- return &bkey_s_c_to_indirect_inline_data(k).v->refcount;
- default:
- return NULL;
- }
-}
-
-static inline __le64 *bkey_refcount(struct bkey_s k)
-{
- switch (k.k->type) {
- case KEY_TYPE_reflink_v:
- return &bkey_s_to_reflink_v(k).v->refcount;
- case KEY_TYPE_indirect_inline_data:
- return &bkey_s_to_indirect_inline_data(k).v->refcount;
- default:
- return NULL;
- }
-}
-
-struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *, struct btree_iter *,
- s64 *, struct bkey_s_c_reflink_p,
- bool, unsigned);
-
-s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64,
- subvol_inum, u64, u64, u64, s64 *,
- bool);
-
-int bch2_gc_reflink_done(struct bch_fs *);
-int bch2_gc_reflink_start(struct bch_fs *);
-
-#endif /* _BCACHEFS_REFLINK_H */
diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h
deleted file mode 100644
index 92995e4f898e..000000000000
--- a/fs/bcachefs/reflink_format.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REFLINK_FORMAT_H
-#define _BCACHEFS_REFLINK_FORMAT_H
-
-struct bch_reflink_p {
- struct bch_val v;
- __le64 idx_flags;
- /*
- * A reflink pointer might point to an indirect extent which is then
- * later split (by copygc or rebalance). If we only pointed to part of
- * the original indirect extent, and then one of the fragments is
- * outside the range we point to, we'd leak a refcount: so when creating
- * reflink pointers, we need to store pad values to remember the full
- * range we were taking a reference on.
- */
- __le32 front_pad;
- __le32 back_pad;
-} __packed __aligned(8);
-
-LE64_BITMASK(REFLINK_P_IDX, struct bch_reflink_p, idx_flags, 0, 56);
-LE64_BITMASK(REFLINK_P_ERROR, struct bch_reflink_p, idx_flags, 56, 57);
-LE64_BITMASK(REFLINK_P_MAY_UPDATE_OPTIONS,
- struct bch_reflink_p, idx_flags, 57, 58);
-
-struct bch_reflink_v {
- struct bch_val v;
- __le64 refcount;
- union bch_extent_entry start[0];
- __u64 _data[];
-} __packed __aligned(8);
-
-struct bch_indirect_inline_data {
- struct bch_val v;
- __le64 refcount;
- u8 data[];
-};
-
-#endif /* _BCACHEFS_REFLINK_FORMAT_H */
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
deleted file mode 100644
index 8383bd7fdb3f..000000000000
--- a/fs/bcachefs/replicas.c
+++ /dev/null
@@ -1,918 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "buckets.h"
-#include "disk_accounting.h"
-#include "journal.h"
-#include "replicas.h"
-#include "super-io.h"
-
-#include <linux/sort.h>
-
-static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
- struct bch_replicas_cpu *);
-
-/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
-static int bch2_memcmp(const void *l, const void *r, const void *priv)
-{
- size_t size = (size_t) priv;
- return memcmp(l, r, size);
-}
-
-/* Replicas tracking - in memory: */
-
-static void verify_replicas_entry(struct bch_replicas_entry_v1 *e)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- BUG_ON(!e->nr_devs);
- BUG_ON(e->nr_required > 1 &&
- e->nr_required >= e->nr_devs);
-
- for (unsigned i = 0; i + 1 < e->nr_devs; i++)
- BUG_ON(e->devs[i] >= e->devs[i + 1]);
-#endif
-}
-
-void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
-{
- bubble_sort(e->devs, e->nr_devs, u8_cmp);
-}
-
-static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
-{
- eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
- bch2_memcmp, NULL, (void *)(size_t)r->entry_size);
-}
-
-static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
- struct bch_replicas_entry_v0 *e)
-{
- bch2_prt_data_type(out, e->data_type);
-
- prt_printf(out, ": %u [", e->nr_devs);
- for (unsigned i = 0; i < e->nr_devs; i++)
- prt_printf(out, i ? " %u" : "%u", e->devs[i]);
- prt_printf(out, "]");
-}
-
-void bch2_replicas_entry_to_text(struct printbuf *out,
- struct bch_replicas_entry_v1 *e)
-{
- bch2_prt_data_type(out, e->data_type);
-
- prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs);
- for (unsigned i = 0; i < e->nr_devs; i++)
- prt_printf(out, i ? " %u" : "%u", e->devs[i]);
- prt_printf(out, "]");
-}
-
-static int bch2_replicas_entry_sb_validate(struct bch_replicas_entry_v1 *r,
- struct bch_sb *sb,
- struct printbuf *err)
-{
- if (!r->nr_devs) {
- prt_printf(err, "no devices in entry ");
- goto bad;
- }
-
- if (r->nr_required > 1 &&
- r->nr_required >= r->nr_devs) {
- prt_printf(err, "bad nr_required in entry ");
- goto bad;
- }
-
- for (unsigned i = 0; i < r->nr_devs; i++)
- if (r->devs[i] != BCH_SB_MEMBER_INVALID &&
- !bch2_member_exists(sb, r->devs[i])) {
- prt_printf(err, "invalid device %u in entry ", r->devs[i]);
- goto bad;
- }
-
- return 0;
-bad:
- bch2_replicas_entry_to_text(err, r);
- return -BCH_ERR_invalid_replicas_entry;
-}
-
-int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
- struct bch_fs *c,
- struct printbuf *err)
-{
- if (!r->nr_devs) {
- prt_printf(err, "no devices in entry ");
- goto bad;
- }
-
- if (r->nr_required > 1 &&
- r->nr_required >= r->nr_devs) {
- prt_printf(err, "bad nr_required in entry ");
- goto bad;
- }
-
- for (unsigned i = 0; i < r->nr_devs; i++)
- if (r->devs[i] != BCH_SB_MEMBER_INVALID &&
- !bch2_dev_exists(c, r->devs[i])) {
- prt_printf(err, "invalid device %u in entry ", r->devs[i]);
- goto bad;
- }
-
- return 0;
-bad:
- bch2_replicas_entry_to_text(err, r);
- return bch_err_throw(c, invalid_replicas_entry);
-}
-
-void bch2_cpu_replicas_to_text(struct printbuf *out,
- struct bch_replicas_cpu *r)
-{
- struct bch_replicas_entry_v1 *e;
- bool first = true;
-
- for_each_cpu_replicas_entry(r, e) {
- if (!first)
- prt_printf(out, " ");
- first = false;
-
- bch2_replicas_entry_to_text(out, e);
- }
-}
-
-static void extent_to_replicas(struct bkey_s_c k,
- struct bch_replicas_entry_v1 *r)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- r->nr_required = 1;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (p.ptr.cached)
- continue;
-
- if (!p.has_ec)
- replicas_entry_add_dev(r, p.ptr.dev);
- else
- r->nr_required = 0;
- }
-}
-
-static void stripe_to_replicas(struct bkey_s_c k,
- struct bch_replicas_entry_v1 *r)
-{
- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
- const struct bch_extent_ptr *ptr;
-
- r->nr_required = s.v->nr_blocks - s.v->nr_redundant;
-
- for (ptr = s.v->ptrs;
- ptr < s.v->ptrs + s.v->nr_blocks;
- ptr++)
- replicas_entry_add_dev(r, ptr->dev);
-}
-
-void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e,
- struct bkey_s_c k)
-{
- e->nr_devs = 0;
-
- switch (k.k->type) {
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_btree_ptr_v2:
- e->data_type = BCH_DATA_btree;
- extent_to_replicas(k, e);
- break;
- case KEY_TYPE_extent:
- case KEY_TYPE_reflink_v:
- e->data_type = BCH_DATA_user;
- extent_to_replicas(k, e);
- break;
- case KEY_TYPE_stripe:
- e->data_type = BCH_DATA_parity;
- stripe_to_replicas(k, e);
- break;
- }
-
- bch2_replicas_entry_sort(e);
-}
-
-void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
- enum bch_data_type data_type,
- struct bch_devs_list devs)
-{
- BUG_ON(!data_type ||
- data_type == BCH_DATA_sb ||
- data_type >= BCH_DATA_NR);
-
- e->data_type = data_type;
- e->nr_devs = 0;
- e->nr_required = 1;
-
- darray_for_each(devs, i)
- replicas_entry_add_dev(e, *i);
-
- bch2_replicas_entry_sort(e);
-}
-
-static struct bch_replicas_cpu
-cpu_replicas_add_entry(struct bch_fs *c,
- struct bch_replicas_cpu *old,
- struct bch_replicas_entry_v1 *new_entry)
-{
- struct bch_replicas_cpu new = {
- .nr = old->nr + 1,
- .entry_size = max_t(unsigned, old->entry_size,
- replicas_entry_bytes(new_entry)),
- };
-
- new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL);
- if (!new.entries)
- return new;
-
- for (unsigned i = 0; i < old->nr; i++)
- memcpy(cpu_replicas_entry(&new, i),
- cpu_replicas_entry(old, i),
- old->entry_size);
-
- memcpy(cpu_replicas_entry(&new, old->nr),
- new_entry,
- replicas_entry_bytes(new_entry));
-
- bch2_cpu_replicas_sort(&new);
- return new;
-}
-
-static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
- struct bch_replicas_entry_v1 *search)
-{
- int idx, entry_size = replicas_entry_bytes(search);
-
- if (unlikely(entry_size > r->entry_size))
- return -1;
-
-#define entry_cmp(_l, _r) memcmp(_l, _r, entry_size)
- idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
- entry_cmp, search);
-#undef entry_cmp
-
- return idx < r->nr ? idx : -1;
-}
-
-int bch2_replicas_entry_idx(struct bch_fs *c,
- struct bch_replicas_entry_v1 *search)
-{
- bch2_replicas_entry_sort(search);
-
- return __replicas_entry_idx(&c->replicas, search);
-}
-
-static bool __replicas_has_entry(struct bch_replicas_cpu *r,
- struct bch_replicas_entry_v1 *search)
-{
- return __replicas_entry_idx(r, search) >= 0;
-}
-
-bool bch2_replicas_marked_locked(struct bch_fs *c,
- struct bch_replicas_entry_v1 *search)
-{
- verify_replicas_entry(search);
-
- return !search->nr_devs ||
- (__replicas_has_entry(&c->replicas, search) &&
- (likely((!c->replicas_gc.entries)) ||
- __replicas_has_entry(&c->replicas_gc, search)));
-}
-
-bool bch2_replicas_marked(struct bch_fs *c,
- struct bch_replicas_entry_v1 *search)
-{
- percpu_down_read(&c->mark_lock);
- bool ret = bch2_replicas_marked_locked(c, search);
- percpu_up_read(&c->mark_lock);
-
- return ret;
-}
-
-noinline
-static int bch2_mark_replicas_slowpath(struct bch_fs *c,
- struct bch_replicas_entry_v1 *new_entry)
-{
- struct bch_replicas_cpu new_r, new_gc;
- int ret = 0;
-
- verify_replicas_entry(new_entry);
-
- memset(&new_r, 0, sizeof(new_r));
- memset(&new_gc, 0, sizeof(new_gc));
-
- mutex_lock(&c->sb_lock);
-
- if (c->replicas_gc.entries &&
- !__replicas_has_entry(&c->replicas_gc, new_entry)) {
- new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry);
- if (!new_gc.entries) {
- ret = bch_err_throw(c, ENOMEM_cpu_replicas);
- goto err;
- }
- }
-
- if (!__replicas_has_entry(&c->replicas, new_entry)) {
- new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry);
- if (!new_r.entries) {
- ret = bch_err_throw(c, ENOMEM_cpu_replicas);
- goto err;
- }
-
- ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
- if (ret)
- goto err;
- }
-
- if (!new_r.entries &&
- !new_gc.entries)
- goto out;
-
- /* allocations done, now commit: */
-
- if (new_r.entries)
- bch2_write_super(c);
-
- /* don't update in memory replicas until changes are persistent */
- percpu_down_write(&c->mark_lock);
- if (new_r.entries)
- swap(c->replicas, new_r);
- if (new_gc.entries)
- swap(new_gc, c->replicas_gc);
- percpu_up_write(&c->mark_lock);
-out:
- mutex_unlock(&c->sb_lock);
-
- kfree(new_r.entries);
- kfree(new_gc.entries);
-
- return ret;
-err:
- bch_err_msg(c, ret, "adding replicas entry");
- goto out;
-}
-
-int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
-{
- return likely(bch2_replicas_marked(c, r))
- ? 0 : bch2_mark_replicas_slowpath(c, r);
-}
-
-/*
- * Old replicas_gc mechanism: only used for journal replicas entries now, should
- * die at some point:
- */
-
-int bch2_replicas_gc_end(struct bch_fs *c, int ret)
-{
- lockdep_assert_held(&c->replicas_gc_lock);
-
- mutex_lock(&c->sb_lock);
- percpu_down_write(&c->mark_lock);
-
- ret = ret ?:
- bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc);
- if (!ret)
- swap(c->replicas, c->replicas_gc);
-
- kfree(c->replicas_gc.entries);
- c->replicas_gc.entries = NULL;
-
- percpu_up_write(&c->mark_lock);
-
- if (!ret)
- bch2_write_super(c);
-
- mutex_unlock(&c->sb_lock);
-
- return ret;
-}
-
-int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
-{
- struct bch_replicas_entry_v1 *e;
- unsigned i = 0;
-
- lockdep_assert_held(&c->replicas_gc_lock);
-
- mutex_lock(&c->sb_lock);
- BUG_ON(c->replicas_gc.entries);
-
- c->replicas_gc.nr = 0;
- c->replicas_gc.entry_size = 0;
-
- for_each_cpu_replicas_entry(&c->replicas, e) {
- /* Preserve unknown data types */
- if (e->data_type >= BCH_DATA_NR ||
- !((1 << e->data_type) & typemask)) {
- c->replicas_gc.nr++;
- c->replicas_gc.entry_size =
- max_t(unsigned, c->replicas_gc.entry_size,
- replicas_entry_bytes(e));
- }
- }
-
- c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
- c->replicas_gc.entry_size,
- GFP_KERNEL);
- if (!c->replicas_gc.entries) {
- mutex_unlock(&c->sb_lock);
- bch_err(c, "error allocating c->replicas_gc");
- return bch_err_throw(c, ENOMEM_replicas_gc);
- }
-
- for_each_cpu_replicas_entry(&c->replicas, e)
- if (e->data_type >= BCH_DATA_NR ||
- !((1 << e->data_type) & typemask))
- memcpy(cpu_replicas_entry(&c->replicas_gc, i++),
- e, c->replicas_gc.entry_size);
-
- bch2_cpu_replicas_sort(&c->replicas_gc);
- mutex_unlock(&c->sb_lock);
-
- return 0;
-}
-
-/*
- * New much simpler mechanism for clearing out unneeded replicas entries - drop
- * replicas entries that have 0 sectors used.
- *
- * However, we don't track sector counts for journal usage, so this doesn't drop
- * any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism
- * is retained for that.
- */
-int bch2_replicas_gc2(struct bch_fs *c)
-{
- struct bch_replicas_cpu new = { 0 };
- unsigned nr;
- int ret = 0;
-
- bch2_accounting_mem_gc(c);
-retry:
- nr = READ_ONCE(c->replicas.nr);
- new.entry_size = READ_ONCE(c->replicas.entry_size);
- new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL);
- if (!new.entries) {
- bch_err(c, "error allocating c->replicas_gc");
- return bch_err_throw(c, ENOMEM_replicas_gc);
- }
-
- mutex_lock(&c->sb_lock);
- percpu_down_write(&c->mark_lock);
-
- if (nr != c->replicas.nr ||
- new.entry_size != c->replicas.entry_size) {
- percpu_up_write(&c->mark_lock);
- mutex_unlock(&c->sb_lock);
- kfree(new.entries);
- goto retry;
- }
-
- for (unsigned i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry_v1 *e =
- cpu_replicas_entry(&c->replicas, i);
-
- struct disk_accounting_pos k = {
- .type = BCH_DISK_ACCOUNTING_replicas,
- };
-
- unsafe_memcpy(&k.replicas, e, replicas_entry_bytes(e),
- "embedded variable length struct");
-
- struct bpos p = disk_accounting_pos_to_bpos(&k);
-
- struct bch_accounting_mem *acc = &c->accounting;
- bool kill = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
- accounting_pos_cmp, &p) >= acc->k.nr;
-
- if (e->data_type == BCH_DATA_journal || !kill)
- memcpy(cpu_replicas_entry(&new, new.nr++),
- e, new.entry_size);
- }
-
- bch2_cpu_replicas_sort(&new);
-
- ret = bch2_cpu_replicas_to_sb_replicas(c, &new);
-
- if (!ret)
- swap(c->replicas, new);
-
- kfree(new.entries);
-
- percpu_up_write(&c->mark_lock);
-
- if (!ret)
- bch2_write_super(c);
-
- mutex_unlock(&c->sb_lock);
-
- return ret;
-}
-
-/* Replicas tracking - superblock: */
-
-static int
-__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
- struct bch_replicas_cpu *cpu_r)
-{
- struct bch_replicas_entry_v1 *e, *dst;
- unsigned nr = 0, entry_size = 0, idx = 0;
-
- for_each_replicas_entry(sb_r, e) {
- entry_size = max_t(unsigned, entry_size,
- replicas_entry_bytes(e));
- nr++;
- }
-
- cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
- if (!cpu_r->entries)
- return -BCH_ERR_ENOMEM_cpu_replicas;
-
- cpu_r->nr = nr;
- cpu_r->entry_size = entry_size;
-
- for_each_replicas_entry(sb_r, e) {
- dst = cpu_replicas_entry(cpu_r, idx++);
- memcpy(dst, e, replicas_entry_bytes(e));
- bch2_replicas_entry_sort(dst);
- }
-
- return 0;
-}
-
-static int
-__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
- struct bch_replicas_cpu *cpu_r)
-{
- struct bch_replicas_entry_v0 *e;
- unsigned nr = 0, entry_size = 0, idx = 0;
-
- for_each_replicas_entry(sb_r, e) {
- entry_size = max_t(unsigned, entry_size,
- replicas_entry_bytes(e));
- nr++;
- }
-
- entry_size += sizeof(struct bch_replicas_entry_v1) -
- sizeof(struct bch_replicas_entry_v0);
-
- cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
- if (!cpu_r->entries)
- return -BCH_ERR_ENOMEM_cpu_replicas;
-
- cpu_r->nr = nr;
- cpu_r->entry_size = entry_size;
-
- for_each_replicas_entry(sb_r, e) {
- struct bch_replicas_entry_v1 *dst =
- cpu_replicas_entry(cpu_r, idx++);
-
- dst->data_type = e->data_type;
- dst->nr_devs = e->nr_devs;
- dst->nr_required = 1;
- memcpy(dst->devs, e->devs, e->nr_devs);
- bch2_replicas_entry_sort(dst);
- }
-
- return 0;
-}
-
-int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
-{
- struct bch_sb_field_replicas *sb_v1;
- struct bch_sb_field_replicas_v0 *sb_v0;
- struct bch_replicas_cpu new_r = { 0, 0, NULL };
- int ret = 0;
-
- if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas)))
- ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r);
- else if ((sb_v0 = bch2_sb_field_get(c->disk_sb.sb, replicas_v0)))
- ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r);
- if (ret)
- return ret;
-
- bch2_cpu_replicas_sort(&new_r);
-
- percpu_down_write(&c->mark_lock);
- swap(c->replicas, new_r);
- percpu_up_write(&c->mark_lock);
-
- kfree(new_r.entries);
-
- return 0;
-}
-
-static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
- struct bch_replicas_cpu *r)
-{
- struct bch_sb_field_replicas_v0 *sb_r;
- struct bch_replicas_entry_v0 *dst;
- struct bch_replicas_entry_v1 *src;
- size_t bytes;
-
- bytes = sizeof(struct bch_sb_field_replicas);
-
- for_each_cpu_replicas_entry(r, src)
- bytes += replicas_entry_bytes(src) - 1;
-
- sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0,
- DIV_ROUND_UP(bytes, sizeof(u64)));
- if (!sb_r)
- return bch_err_throw(c, ENOSPC_sb_replicas);
-
- bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
- sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0);
-
- memset(&sb_r->entries, 0,
- vstruct_end(&sb_r->field) -
- (void *) &sb_r->entries);
-
- dst = sb_r->entries;
- for_each_cpu_replicas_entry(r, src) {
- dst->data_type = src->data_type;
- dst->nr_devs = src->nr_devs;
- memcpy(dst->devs, src->devs, src->nr_devs);
-
- dst = replicas_entry_next(dst);
-
- BUG_ON((void *) dst > vstruct_end(&sb_r->field));
- }
-
- return 0;
-}
-
-static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
- struct bch_replicas_cpu *r)
-{
- struct bch_sb_field_replicas *sb_r;
- struct bch_replicas_entry_v1 *dst, *src;
- bool need_v1 = false;
- size_t bytes;
-
- bytes = sizeof(struct bch_sb_field_replicas);
-
- for_each_cpu_replicas_entry(r, src) {
- bytes += replicas_entry_bytes(src);
- if (src->nr_required != 1)
- need_v1 = true;
- }
-
- if (!need_v1)
- return bch2_cpu_replicas_to_sb_replicas_v0(c, r);
-
- sb_r = bch2_sb_field_resize(&c->disk_sb, replicas,
- DIV_ROUND_UP(bytes, sizeof(u64)));
- if (!sb_r)
- return bch_err_throw(c, ENOSPC_sb_replicas);
-
- bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
- sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas);
-
- memset(&sb_r->entries, 0,
- vstruct_end(&sb_r->field) -
- (void *) &sb_r->entries);
-
- dst = sb_r->entries;
- for_each_cpu_replicas_entry(r, src) {
- memcpy(dst, src, replicas_entry_bytes(src));
-
- dst = replicas_entry_next(dst);
-
- BUG_ON((void *) dst > vstruct_end(&sb_r->field));
- }
-
- return 0;
-}
-
-static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
- struct bch_sb *sb,
- struct printbuf *err)
-{
- unsigned i;
-
- sort_r(cpu_r->entries,
- cpu_r->nr,
- cpu_r->entry_size,
- bch2_memcmp, NULL,
- (void *)(size_t)cpu_r->entry_size);
-
- for (i = 0; i < cpu_r->nr; i++) {
- struct bch_replicas_entry_v1 *e =
- cpu_replicas_entry(cpu_r, i);
-
- int ret = bch2_replicas_entry_sb_validate(e, sb, err);
- if (ret)
- return ret;
-
- if (i + 1 < cpu_r->nr) {
- struct bch_replicas_entry_v1 *n =
- cpu_replicas_entry(cpu_r, i + 1);
-
- BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);
-
- if (!memcmp(e, n, cpu_r->entry_size)) {
- prt_printf(err, "duplicate replicas entry ");
- bch2_replicas_entry_to_text(err, e);
- return -BCH_ERR_invalid_sb_replicas;
- }
- }
- }
-
- return 0;
-}
-
-static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
- struct bch_replicas_cpu cpu_r;
- int ret;
-
- ret = __bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r);
- if (ret)
- return ret;
-
- ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
- kfree(cpu_r.entries);
- return ret;
-}
-
-static void bch2_sb_replicas_to_text(struct printbuf *out,
- struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_replicas *r = field_to_type(f, replicas);
- struct bch_replicas_entry_v1 *e;
- bool first = true;
-
- for_each_replicas_entry(r, e) {
- if (!first)
- prt_printf(out, " ");
- first = false;
-
- bch2_replicas_entry_to_text(out, e);
- }
- prt_newline(out);
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
- .validate = bch2_sb_replicas_validate,
- .to_text = bch2_sb_replicas_to_text,
-};
-
-static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
- struct bch_replicas_cpu cpu_r;
- int ret;
-
- ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r);
- if (ret)
- return ret;
-
- ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
- kfree(cpu_r.entries);
- return ret;
-}
-
-static void bch2_sb_replicas_v0_to_text(struct printbuf *out,
- struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
- struct bch_replicas_entry_v0 *e;
- bool first = true;
-
- for_each_replicas_entry(sb_r, e) {
- if (!first)
- prt_printf(out, " ");
- first = false;
-
- bch2_replicas_entry_v0_to_text(out, e);
- }
- prt_newline(out);
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
- .validate = bch2_sb_replicas_v0_validate,
- .to_text = bch2_sb_replicas_v0_to_text,
-};
-
-/* Query replicas: */
-
-bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
- unsigned flags, bool print)
-{
- struct bch_replicas_entry_v1 *e;
- bool ret = true;
-
- percpu_down_read(&c->mark_lock);
- for_each_cpu_replicas_entry(&c->replicas, e) {
- unsigned nr_online = 0, nr_failed = 0, dflags = 0;
- bool metadata = e->data_type < BCH_DATA_user;
-
- if (e->data_type == BCH_DATA_cached)
- continue;
-
- scoped_guard(rcu)
- for (unsigned i = 0; i < e->nr_devs; i++) {
- if (e->devs[i] == BCH_SB_MEMBER_INVALID) {
- nr_failed++;
- continue;
- }
-
- nr_online += test_bit(e->devs[i], devs.d);
-
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]);
- nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
- }
-
- if (nr_online + nr_failed == e->nr_devs)
- continue;
-
- if (nr_online < e->nr_required)
- dflags |= metadata
- ? BCH_FORCE_IF_METADATA_LOST
- : BCH_FORCE_IF_DATA_LOST;
-
- if (nr_online < e->nr_devs)
- dflags |= metadata
- ? BCH_FORCE_IF_METADATA_DEGRADED
- : BCH_FORCE_IF_DATA_DEGRADED;
-
- if (dflags & ~flags) {
- if (print) {
- struct printbuf buf = PRINTBUF;
-
- bch2_replicas_entry_to_text(&buf, e);
- bch_err(c, "insufficient devices online (%u) for replicas entry %s",
- nr_online, buf.buf);
- printbuf_exit(&buf);
- }
- ret = false;
- break;
- }
-
- }
- percpu_up_read(&c->mark_lock);
-
- return ret;
-}
-
-unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
-{
- struct bch_sb_field_replicas *replicas;
- struct bch_sb_field_replicas_v0 *replicas_v0;
- unsigned data_has = 0;
-
- replicas = bch2_sb_field_get(sb, replicas);
- replicas_v0 = bch2_sb_field_get(sb, replicas_v0);
-
- if (replicas) {
- struct bch_replicas_entry_v1 *r;
-
- for_each_replicas_entry(replicas, r) {
- if (r->data_type >= sizeof(data_has) * 8)
- continue;
-
- for (unsigned i = 0; i < r->nr_devs; i++)
- if (r->devs[i] == dev)
- data_has |= 1 << r->data_type;
- }
-
- } else if (replicas_v0) {
- struct bch_replicas_entry_v0 *r;
-
- for_each_replicas_entry_v0(replicas_v0, r) {
- if (r->data_type >= sizeof(data_has) * 8)
- continue;
-
- for (unsigned i = 0; i < r->nr_devs; i++)
- if (r->devs[i] == dev)
- data_has |= 1 << r->data_type;
- }
- }
-
-
- return data_has;
-}
-
-unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
-{
- mutex_lock(&c->sb_lock);
- unsigned ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx);
- mutex_unlock(&c->sb_lock);
-
- return ret;
-}
-
-void bch2_fs_replicas_exit(struct bch_fs *c)
-{
- kfree(c->replicas.entries);
- kfree(c->replicas_gc.entries);
-}
diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
deleted file mode 100644
index 5aba2c1ce133..000000000000
--- a/fs/bcachefs/replicas.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REPLICAS_H
-#define _BCACHEFS_REPLICAS_H
-
-#include "bkey.h"
-#include "eytzinger.h"
-#include "replicas_types.h"
-
-void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *);
-void bch2_replicas_entry_to_text(struct printbuf *,
- struct bch_replicas_entry_v1 *);
-int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *,
- struct bch_fs *, struct printbuf *);
-void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
-
-static inline struct bch_replicas_entry_v1 *
-cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
-{
- return (void *) r->entries + r->entry_size * i;
-}
-
-int bch2_replicas_entry_idx(struct bch_fs *,
- struct bch_replicas_entry_v1 *);
-
-void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *,
- enum bch_data_type,
- struct bch_devs_list);
-
-bool bch2_replicas_marked_locked(struct bch_fs *,
- struct bch_replicas_entry_v1 *);
-bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry_v1 *);
-int bch2_mark_replicas(struct bch_fs *,
- struct bch_replicas_entry_v1 *);
-
-void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *, struct bkey_s_c);
-
-static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e,
- unsigned dev)
-{
- e->data_type = BCH_DATA_cached;
- e->nr_devs = 1;
- e->nr_required = 1;
- e->devs[0] = dev;
-}
-
-bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask,
- unsigned, bool);
-
-unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned);
-unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
-
-int bch2_replicas_gc_end(struct bch_fs *, int);
-int bch2_replicas_gc_start(struct bch_fs *, unsigned);
-int bch2_replicas_gc2(struct bch_fs *);
-
-#define for_each_cpu_replicas_entry(_r, _i) \
- for (_i = (_r)->entries; \
- (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
- _i = (void *) (_i) + (_r)->entry_size)
-
-/* iterate over superblock replicas - used by userspace tools: */
-
-#define replicas_entry_next(_i) \
- ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i)))
-
-#define for_each_replicas_entry(_r, _i) \
- for (_i = (_r)->entries; \
- (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
- (_i) = replicas_entry_next(_i))
-
-#define for_each_replicas_entry_v0(_r, _i) \
- for (_i = (_r)->entries; \
- (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
- (_i) = replicas_entry_next(_i))
-
-int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_replicas;
-extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0;
-
-void bch2_fs_replicas_exit(struct bch_fs *);
-
-#endif /* _BCACHEFS_REPLICAS_H */
diff --git a/fs/bcachefs/replicas_format.h b/fs/bcachefs/replicas_format.h
deleted file mode 100644
index b7eff904acdb..000000000000
--- a/fs/bcachefs/replicas_format.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REPLICAS_FORMAT_H
-#define _BCACHEFS_REPLICAS_FORMAT_H
-
-struct bch_replicas_entry_v0 {
- __u8 data_type;
- __u8 nr_devs;
- __u8 devs[] __counted_by(nr_devs);
-} __packed;
-
-struct bch_sb_field_replicas_v0 {
- struct bch_sb_field field;
- struct bch_replicas_entry_v0 entries[];
-} __packed __aligned(8);
-
-struct bch_replicas_entry_v1 {
- __u8 data_type;
- __u8 nr_devs;
- __u8 nr_required;
- __u8 devs[] __counted_by(nr_devs);
-} __packed;
-
-struct bch_sb_field_replicas {
- struct bch_sb_field field;
- struct bch_replicas_entry_v1 entries[];
-} __packed __aligned(8);
-
-#define replicas_entry_bytes(_i) \
- (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
-
-#define replicas_entry_add_dev(e, d) ({ \
- (e)->nr_devs++; \
- (e)->devs[(e)->nr_devs - 1] = (d); \
-})
-
-#endif /* _BCACHEFS_REPLICAS_FORMAT_H */
diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h
deleted file mode 100644
index fed71c861fe7..000000000000
--- a/fs/bcachefs/replicas_types.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REPLICAS_TYPES_H
-#define _BCACHEFS_REPLICAS_TYPES_H
-
-struct bch_replicas_cpu {
- unsigned nr;
- unsigned entry_size;
- struct bch_replicas_entry_v1 *entries;
-};
-
-#endif /* _BCACHEFS_REPLICAS_TYPES_H */
diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c
deleted file mode 100644
index 59c8770e4a0e..000000000000
--- a/fs/bcachefs/sb-clean.c
+++ /dev/null
@@ -1,340 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_update_interior.h"
-#include "buckets.h"
-#include "error.h"
-#include "journal_io.h"
-#include "replicas.h"
-#include "sb-clean.h"
-#include "super-io.h"
-
-/*
- * BCH_SB_FIELD_clean:
- *
- * Btree roots, and a few other things, are recovered from the journal after an
- * unclean shutdown - but after a clean shutdown, to avoid having to read the
- * journal, we can store them in the superblock.
- *
- * bch_sb_field_clean simply contains a list of journal entries, stored exactly
- * as they would be in the journal:
- */
-
-int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean,
- int write)
-{
- struct bkey_validate_context from = {
- .flags = write,
- .from = BKEY_VALIDATE_superblock,
- };
- struct jset_entry *entry;
- int ret;
-
- for (entry = clean->start;
- entry < (struct jset_entry *) vstruct_end(&clean->field);
- entry = vstruct_next(entry)) {
- if (vstruct_end(entry) > vstruct_end(&clean->field)) {
- bch_err(c, "journal entry (u64s %u) overran end of superblock clean section (u64s %u) by %zu",
- le16_to_cpu(entry->u64s), le32_to_cpu(clean->field.u64s),
- (u64 *) vstruct_end(entry) - (u64 *) vstruct_end(&clean->field));
- bch2_sb_error_count(c, BCH_FSCK_ERR_sb_clean_entry_overrun);
- return -BCH_ERR_fsck_repair_unimplemented;
- }
-
- ret = bch2_journal_entry_validate(c, NULL, entry,
- le16_to_cpu(c->disk_sb.sb->version),
- BCH_SB_BIG_ENDIAN(c->disk_sb.sb),
- from);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static struct bkey_i *btree_root_find(struct bch_fs *c,
- struct bch_sb_field_clean *clean,
- struct jset *j,
- enum btree_id id, unsigned *level)
-{
- struct bkey_i *k;
- struct jset_entry *entry, *start, *end;
-
- if (clean) {
- start = clean->start;
- end = vstruct_end(&clean->field);
- } else {
- start = j->start;
- end = vstruct_last(j);
- }
-
- for (entry = start; entry < end; entry = vstruct_next(entry))
- if (entry->type == BCH_JSET_ENTRY_btree_root &&
- entry->btree_id == id)
- goto found;
-
- return NULL;
-found:
- if (!entry->u64s)
- return ERR_PTR(-EINVAL);
-
- k = entry->start;
- *level = entry->level;
- return k;
-}
-
-int bch2_verify_superblock_clean(struct bch_fs *c,
- struct bch_sb_field_clean **cleanp,
- struct jset *j)
-{
- unsigned i;
- struct bch_sb_field_clean *clean = *cleanp;
- struct printbuf buf1 = PRINTBUF;
- struct printbuf buf2 = PRINTBUF;
- int ret = 0;
-
- if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
- sb_clean_journal_seq_mismatch,
- "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
- le64_to_cpu(clean->journal_seq),
- le64_to_cpu(j->seq))) {
- kfree(clean);
- *cleanp = NULL;
- return 0;
- }
-
- for (i = 0; i < BTREE_ID_NR; i++) {
- struct bkey_i *k1, *k2;
- unsigned l1 = 0, l2 = 0;
-
- k1 = btree_root_find(c, clean, NULL, i, &l1);
- k2 = btree_root_find(c, NULL, j, i, &l2);
-
- if (!k1 && !k2)
- continue;
-
- printbuf_reset(&buf1);
- printbuf_reset(&buf2);
-
- if (k1)
- bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1));
- else
- prt_printf(&buf1, "(none)");
-
- if (k2)
- bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2));
- else
- prt_printf(&buf2, "(none)");
-
- mustfix_fsck_err_on(!k1 || !k2 ||
- IS_ERR(k1) ||
- IS_ERR(k2) ||
- k1->k.u64s != k2->k.u64s ||
- memcmp(k1, k2, bkey_bytes(&k1->k)) ||
- l1 != l2, c,
- sb_clean_btree_root_mismatch,
- "superblock btree root %u doesn't match journal after clean shutdown\n"
- "sb: l=%u %s\n"
- "journal: l=%u %s\n", i,
- l1, buf1.buf,
- l2, buf2.buf);
- }
-fsck_err:
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
- return ret;
-}
-
-struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c)
-{
- struct bch_sb_field_clean *clean, *sb_clean;
- int ret;
-
- mutex_lock(&c->sb_lock);
- sb_clean = bch2_sb_field_get(c->disk_sb.sb, clean);
-
- if (fsck_err_on(!sb_clean, c,
- sb_clean_missing,
- "superblock marked clean but clean section not present")) {
- SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
- c->sb.clean = false;
- mutex_unlock(&c->sb_lock);
- return ERR_PTR(-BCH_ERR_invalid_sb_clean);
- }
-
- clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
- GFP_KERNEL);
- if (!clean) {
- mutex_unlock(&c->sb_lock);
- return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean);
- }
-
- ret = bch2_sb_clean_validate_late(c, clean, READ);
- if (ret) {
- kfree(clean);
- mutex_unlock(&c->sb_lock);
- return ERR_PTR(ret);
- }
-
- mutex_unlock(&c->sb_lock);
-
- return clean;
-fsck_err:
- mutex_unlock(&c->sb_lock);
- return ERR_PTR(ret);
-}
-
-void bch2_journal_super_entries_add_common(struct bch_fs *c,
- struct jset_entry **end,
- u64 journal_seq)
-{
- {
- struct jset_entry_usage *u =
- container_of(jset_entry_init(end, sizeof(*u)),
- struct jset_entry_usage, entry);
-
- u->entry.type = BCH_JSET_ENTRY_usage;
- u->entry.btree_id = BCH_FS_USAGE_key_version;
- u->v = cpu_to_le64(atomic64_read(&c->key_version));
- }
-
- for (unsigned i = 0; i < 2; i++) {
- struct jset_entry_clock *clock =
- container_of(jset_entry_init(end, sizeof(*clock)),
- struct jset_entry_clock, entry);
-
- clock->entry.type = BCH_JSET_ENTRY_clock;
- clock->rw = i;
- clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now));
- }
-}
-
-static int bch2_sb_clean_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_clean *clean = field_to_type(f, clean);
-
- if (vstruct_bytes(&clean->field) < sizeof(*clean)) {
- prt_printf(err, "wrong size (got %zu should be %zu)",
- vstruct_bytes(&clean->field), sizeof(*clean));
- return -BCH_ERR_invalid_sb_clean;
- }
-
- for (struct jset_entry *entry = clean->start;
- entry != vstruct_end(&clean->field);
- entry = vstruct_next(entry)) {
- if ((void *) vstruct_next(entry) > vstruct_end(&clean->field)) {
- prt_str(err, "entry type ");
- bch2_prt_jset_entry_type(err, entry->type);
- prt_str(err, " overruns end of section");
- return -BCH_ERR_invalid_sb_clean;
- }
- }
-
- return 0;
-}
-
-static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_clean *clean = field_to_type(f, clean);
- struct jset_entry *entry;
-
- prt_printf(out, "flags: %x\n", le32_to_cpu(clean->flags));
- prt_printf(out, "journal_seq: %llu\n", le64_to_cpu(clean->journal_seq));
-
- for (entry = clean->start;
- entry != vstruct_end(&clean->field);
- entry = vstruct_next(entry)) {
- if ((void *) vstruct_next(entry) > vstruct_end(&clean->field))
- break;
-
- if (entry->type == BCH_JSET_ENTRY_btree_keys &&
- !entry->u64s)
- continue;
-
- bch2_journal_entry_to_text(out, NULL, entry);
- prt_newline(out);
- }
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_clean = {
- .validate = bch2_sb_clean_validate,
- .to_text = bch2_sb_clean_to_text,
-};
-
-int bch2_fs_mark_dirty(struct bch_fs *c)
-{
- int ret;
-
- /*
- * Unconditionally write superblock, to verify it hasn't changed before
- * we go rw:
- */
-
- mutex_lock(&c->sb_lock);
- SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
- c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS);
-
- ret = bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- return ret;
-}
-
-void bch2_fs_mark_clean(struct bch_fs *c)
-{
- struct bch_sb_field_clean *sb_clean;
- struct jset_entry *entry;
- unsigned u64s;
- int ret;
-
- mutex_lock(&c->sb_lock);
- if (BCH_SB_CLEAN(c->disk_sb.sb))
- goto out;
-
- SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
-
- c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
- c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata);
- c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates));
- c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled));
-
- u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;
-
- sb_clean = bch2_sb_field_resize(&c->disk_sb, clean, u64s);
- if (!sb_clean) {
- bch_err(c, "error resizing superblock while setting filesystem clean");
- goto out;
- }
-
- sb_clean->flags = 0;
- sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq));
-
- /* Trying to catch outstanding bug: */
- BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
-
- entry = sb_clean->start;
- bch2_journal_super_entries_add_common(c, &entry, 0);
- entry = bch2_btree_roots_to_journal_entries(c, entry, 0);
- BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
-
- memset(entry, 0,
- vstruct_end(&sb_clean->field) - (void *) entry);
-
- /*
- * this should be in the write path, and we should be validating every
- * superblock section:
- */
- ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE);
- if (ret) {
- bch_err(c, "error writing marking filesystem clean: validate error");
- goto out;
- }
-
- bch2_journal_pos_from_member_info_set(c);
-
- bch2_write_super(c);
-out:
- mutex_unlock(&c->sb_lock);
-}
diff --git a/fs/bcachefs/sb-clean.h b/fs/bcachefs/sb-clean.h
deleted file mode 100644
index 71caef281239..000000000000
--- a/fs/bcachefs/sb-clean.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_CLEAN_H
-#define _BCACHEFS_SB_CLEAN_H
-
-int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int);
-int bch2_verify_superblock_clean(struct bch_fs *, struct bch_sb_field_clean **,
- struct jset *);
-struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *);
-void bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry **, u64);
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_clean;
-
-int bch2_fs_mark_dirty(struct bch_fs *);
-void bch2_fs_mark_clean(struct bch_fs *);
-
-#endif /* _BCACHEFS_SB_CLEAN_H */
diff --git a/fs/bcachefs/sb-counters.c b/fs/bcachefs/sb-counters.c
deleted file mode 100644
index 2b4b8445d418..000000000000
--- a/fs/bcachefs/sb-counters.c
+++ /dev/null
@@ -1,147 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "super-io.h"
-#include "sb-counters.h"
-
-/* BCH_SB_FIELD_counters */
-
-static const u8 counters_to_stable_map[] = {
-#define x(n, id, ...) [BCH_COUNTER_##n] = BCH_COUNTER_STABLE_##n,
- BCH_PERSISTENT_COUNTERS()
-#undef x
-};
-
-const char * const bch2_counter_names[] = {
-#define x(t, n, ...) (#t),
- BCH_PERSISTENT_COUNTERS()
-#undef x
- NULL
-};
-
-static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs)
-{
- if (!ctrs)
- return 0;
-
- return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0];
-}
-
-static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- return 0;
-}
-
-static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_counters *ctrs = field_to_type(f, counters);
- unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
-
- for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
- unsigned stable = counters_to_stable_map[i];
- if (stable < nr)
- prt_printf(out, "%s \t%llu\n",
- bch2_counter_names[i],
- le64_to_cpu(ctrs->d[stable]));
- }
-}
-
-int bch2_sb_counters_to_cpu(struct bch_fs *c)
-{
- struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
- unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
-
- for (unsigned i = 0; i < BCH_COUNTER_NR; i++)
- c->counters_on_mount[i] = 0;
-
- for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
- unsigned stable = counters_to_stable_map[i];
- if (stable < nr) {
- u64 v = le64_to_cpu(ctrs->d[stable]);
- percpu_u64_set(&c->counters[i], v);
- c->counters_on_mount[i] = v;
- }
- }
-
- return 0;
-}
-
-int bch2_sb_counters_from_cpu(struct bch_fs *c)
-{
- struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
- struct bch_sb_field_counters *ret;
- unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
-
- if (nr < BCH_COUNTER_NR) {
- ret = bch2_sb_field_resize(&c->disk_sb, counters,
- sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR);
- if (ret) {
- ctrs = ret;
- nr = bch2_sb_counter_nr_entries(ctrs);
- }
- }
-
- for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
- unsigned stable = counters_to_stable_map[i];
- if (stable < nr)
- ctrs->d[stable] = cpu_to_le64(percpu_u64_get(&c->counters[i]));
- }
-
- return 0;
-}
-
-void bch2_fs_counters_exit(struct bch_fs *c)
-{
- free_percpu(c->counters);
-}
-
-int bch2_fs_counters_init(struct bch_fs *c)
-{
- c->counters = __alloc_percpu(sizeof(u64) * BCH_COUNTER_NR, sizeof(u64));
- if (!c->counters)
- return -BCH_ERR_ENOMEM_fs_counters_init;
-
- return bch2_sb_counters_to_cpu(c);
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_counters = {
- .validate = bch2_sb_counters_validate,
- .to_text = bch2_sb_counters_to_text,
-};
-
-#ifndef NO_BCACHEFS_CHARDEV
-long bch2_ioctl_query_counters(struct bch_fs *c,
- struct bch_ioctl_query_counters __user *user_arg)
-{
- struct bch_ioctl_query_counters arg;
- int ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg));
- if (ret)
- return ret;
-
- if ((arg.flags & ~BCH_IOCTL_QUERY_COUNTERS_MOUNT) ||
- arg.pad)
- return -EINVAL;
-
- arg.nr = min(arg.nr, BCH_COUNTER_NR);
- ret = put_user(arg.nr, &user_arg->nr);
- if (ret)
- return ret;
-
- for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
- unsigned stable = counters_to_stable_map[i];
-
- if (stable < arg.nr) {
- u64 v = !(arg.flags & BCH_IOCTL_QUERY_COUNTERS_MOUNT)
- ? percpu_u64_get(&c->counters[i])
- : c->counters_on_mount[i];
-
- ret = put_user(v, &user_arg->d[stable]);
- if (ret)
- return ret;
- }
- }
-
- return 0;
-}
-#endif
diff --git a/fs/bcachefs/sb-counters.h b/fs/bcachefs/sb-counters.h
deleted file mode 100644
index a4329ad8dd1b..000000000000
--- a/fs/bcachefs/sb-counters.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_COUNTERS_H
-#define _BCACHEFS_SB_COUNTERS_H
-
-#include "bcachefs.h"
-#include "super-io.h"
-
-int bch2_sb_counters_to_cpu(struct bch_fs *);
-int bch2_sb_counters_from_cpu(struct bch_fs *);
-
-void bch2_fs_counters_exit(struct bch_fs *);
-int bch2_fs_counters_init(struct bch_fs *);
-
-extern const char * const bch2_counter_names[];
-extern const struct bch_sb_field_ops bch_sb_field_ops_counters;
-
-long bch2_ioctl_query_counters(struct bch_fs *,
- struct bch_ioctl_query_counters __user *);
-
-#endif // _BCACHEFS_SB_COUNTERS_H
diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h
deleted file mode 100644
index b868702a431a..000000000000
--- a/fs/bcachefs/sb-counters_format.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H
-#define _BCACHEFS_SB_COUNTERS_FORMAT_H
-
-enum counters_flags {
- TYPE_COUNTER = BIT(0), /* event counters */
- TYPE_SECTORS = BIT(1), /* amount counters, the unit is sectors */
-};
-
-#define BCH_PERSISTENT_COUNTERS() \
- x(io_read, 0, TYPE_SECTORS) \
- x(io_read_inline, 80, TYPE_SECTORS) \
- x(io_read_hole, 81, TYPE_SECTORS) \
- x(io_read_promote, 30, TYPE_COUNTER) \
- x(io_read_bounce, 31, TYPE_COUNTER) \
- x(io_read_split, 33, TYPE_COUNTER) \
- x(io_read_reuse_race, 34, TYPE_COUNTER) \
- x(io_read_retry, 32, TYPE_COUNTER) \
- x(io_read_fail_and_poison, 82, TYPE_COUNTER) \
- x(io_write, 1, TYPE_SECTORS) \
- x(io_move, 2, TYPE_SECTORS) \
- x(io_move_read, 35, TYPE_SECTORS) \
- x(io_move_write, 36, TYPE_SECTORS) \
- x(io_move_finish, 37, TYPE_SECTORS) \
- x(io_move_fail, 38, TYPE_COUNTER) \
- x(io_move_write_fail, 82, TYPE_COUNTER) \
- x(io_move_start_fail, 39, TYPE_COUNTER) \
- x(io_move_created_rebalance, 83, TYPE_COUNTER) \
- x(io_move_evacuate_bucket, 84, TYPE_COUNTER) \
- x(bucket_invalidate, 3, TYPE_COUNTER) \
- x(bucket_discard, 4, TYPE_COUNTER) \
- x(bucket_discard_fast, 79, TYPE_COUNTER) \
- x(bucket_alloc, 5, TYPE_COUNTER) \
- x(bucket_alloc_fail, 6, TYPE_COUNTER) \
- x(btree_cache_scan, 7, TYPE_COUNTER) \
- x(btree_cache_reap, 8, TYPE_COUNTER) \
- x(btree_cache_cannibalize, 9, TYPE_COUNTER) \
- x(btree_cache_cannibalize_lock, 10, TYPE_COUNTER) \
- x(btree_cache_cannibalize_lock_fail, 11, TYPE_COUNTER) \
- x(btree_cache_cannibalize_unlock, 12, TYPE_COUNTER) \
- x(btree_node_write, 13, TYPE_COUNTER) \
- x(btree_node_read, 14, TYPE_COUNTER) \
- x(btree_node_compact, 15, TYPE_COUNTER) \
- x(btree_node_merge, 16, TYPE_COUNTER) \
- x(btree_node_split, 17, TYPE_COUNTER) \
- x(btree_node_rewrite, 18, TYPE_COUNTER) \
- x(btree_node_alloc, 19, TYPE_COUNTER) \
- x(btree_node_free, 20, TYPE_COUNTER) \
- x(btree_node_set_root, 21, TYPE_COUNTER) \
- x(btree_path_relock_fail, 22, TYPE_COUNTER) \
- x(btree_path_upgrade_fail, 23, TYPE_COUNTER) \
- x(btree_reserve_get_fail, 24, TYPE_COUNTER) \
- x(journal_entry_full, 25, TYPE_COUNTER) \
- x(journal_full, 26, TYPE_COUNTER) \
- x(journal_reclaim_finish, 27, TYPE_COUNTER) \
- x(journal_reclaim_start, 28, TYPE_COUNTER) \
- x(journal_write, 29, TYPE_COUNTER) \
- x(copygc, 40, TYPE_COUNTER) \
- x(copygc_wait, 41, TYPE_COUNTER) \
- x(gc_gens_end, 42, TYPE_COUNTER) \
- x(gc_gens_start, 43, TYPE_COUNTER) \
- x(trans_blocked_journal_reclaim, 44, TYPE_COUNTER) \
- x(trans_restart_btree_node_reused, 45, TYPE_COUNTER) \
- x(trans_restart_btree_node_split, 46, TYPE_COUNTER) \
- x(trans_restart_fault_inject, 47, TYPE_COUNTER) \
- x(trans_restart_iter_upgrade, 48, TYPE_COUNTER) \
- x(trans_restart_journal_preres_get, 49, TYPE_COUNTER) \
- x(trans_restart_journal_reclaim, 50, TYPE_COUNTER) \
- x(trans_restart_journal_res_get, 51, TYPE_COUNTER) \
- x(trans_restart_key_cache_key_realloced, 52, TYPE_COUNTER) \
- x(trans_restart_key_cache_raced, 53, TYPE_COUNTER) \
- x(trans_restart_mark_replicas, 54, TYPE_COUNTER) \
- x(trans_restart_mem_realloced, 55, TYPE_COUNTER) \
- x(trans_restart_memory_allocation_failure, 56, TYPE_COUNTER) \
- x(trans_restart_relock, 57, TYPE_COUNTER) \
- x(trans_restart_relock_after_fill, 58, TYPE_COUNTER) \
- x(trans_restart_relock_key_cache_fill, 59, TYPE_COUNTER) \
- x(trans_restart_relock_next_node, 60, TYPE_COUNTER) \
- x(trans_restart_relock_parent_for_fill, 61, TYPE_COUNTER) \
- x(trans_restart_relock_path, 62, TYPE_COUNTER) \
- x(trans_restart_relock_path_intent, 63, TYPE_COUNTER) \
- x(trans_restart_too_many_iters, 64, TYPE_COUNTER) \
- x(trans_restart_traverse, 65, TYPE_COUNTER) \
- x(trans_restart_upgrade, 66, TYPE_COUNTER) \
- x(trans_restart_would_deadlock, 67, TYPE_COUNTER) \
- x(trans_restart_would_deadlock_write, 68, TYPE_COUNTER) \
- x(trans_restart_injected, 69, TYPE_COUNTER) \
- x(trans_restart_key_cache_upgrade, 70, TYPE_COUNTER) \
- x(trans_traverse_all, 71, TYPE_COUNTER) \
- x(transaction_commit, 72, TYPE_COUNTER) \
- x(write_super, 73, TYPE_COUNTER) \
- x(trans_restart_would_deadlock_recursion_limit, 74, TYPE_COUNTER) \
- x(trans_restart_write_buffer_flush, 75, TYPE_COUNTER) \
- x(trans_restart_split_race, 76, TYPE_COUNTER) \
- x(write_buffer_flush_slowpath, 77, TYPE_COUNTER) \
- x(write_buffer_flush_sync, 78, TYPE_COUNTER)
-
-enum bch_persistent_counters {
-#define x(t, n, ...) BCH_COUNTER_##t,
- BCH_PERSISTENT_COUNTERS()
-#undef x
- BCH_COUNTER_NR
-};
-
-enum bch_persistent_counters_stable {
-#define x(t, n, ...) BCH_COUNTER_STABLE_##t = n,
- BCH_PERSISTENT_COUNTERS()
-#undef x
- BCH_COUNTER_STABLE_NR
-};
-
-struct bch_sb_field_counters {
- struct bch_sb_field field;
- __le64 d[];
-};
-
-#endif /* _BCACHEFS_SB_COUNTERS_FORMAT_H */
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
deleted file mode 100644
index 1506d05e0665..000000000000
--- a/fs/bcachefs/sb-downgrade.c
+++ /dev/null
@@ -1,457 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/*
- * Superblock section that contains a list of recovery passes to run when
- * downgrading past a given version
- */
-
-#include "bcachefs.h"
-#include "darray.h"
-#include "recovery_passes.h"
-#include "sb-downgrade.h"
-#include "sb-errors.h"
-#include "super-io.h"
-
-#define RECOVERY_PASS_ALL_FSCK BIT_ULL(63)
-
-/*
- * Upgrade, downgrade tables - run certain recovery passes, fix certain errors
- *
- * x(version, recovery_passes, errors...)
- */
-#define UPGRADE_TABLE() \
- x(snapshot_2, \
- RECOVERY_PASS_ALL_FSCK, \
- BCH_FSCK_ERR_subvol_root_wrong_bi_subvol, \
- BCH_FSCK_ERR_subvol_not_master_and_not_snapshot) \
- x(backpointers, \
- RECOVERY_PASS_ALL_FSCK) \
- x(inode_v3, \
- RECOVERY_PASS_ALL_FSCK) \
- x(unwritten_extents, \
- RECOVERY_PASS_ALL_FSCK) \
- x(bucket_gens, \
- BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)| \
- RECOVERY_PASS_ALL_FSCK) \
- x(lru_v2, \
- RECOVERY_PASS_ALL_FSCK) \
- x(fragmentation_lru, \
- RECOVERY_PASS_ALL_FSCK) \
- x(no_bps_in_alloc_keys, \
- RECOVERY_PASS_ALL_FSCK) \
- x(snapshot_trees, \
- RECOVERY_PASS_ALL_FSCK) \
- x(snapshot_skiplists, \
- BIT_ULL(BCH_RECOVERY_PASS_check_snapshots), \
- BCH_FSCK_ERR_snapshot_bad_depth, \
- BCH_FSCK_ERR_snapshot_bad_skiplist) \
- x(deleted_inodes, \
- BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
- BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list) \
- x(rebalance_work, \
- BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) \
- x(subvolume_fs_parent, \
- BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \
- BCH_FSCK_ERR_subvol_fs_path_parent_wrong) \
- x(btree_subvolume_children, \
- BIT_ULL(BCH_RECOVERY_PASS_check_subvols), \
- BCH_FSCK_ERR_subvol_children_not_set) \
- x(mi_btree_bitmap, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_btree_bitmap_not_marked) \
- x(disk_accounting_v2, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_bkey_version_in_future, \
- BCH_FSCK_ERR_dev_usage_buckets_wrong, \
- BCH_FSCK_ERR_dev_usage_sectors_wrong, \
- BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
- BCH_FSCK_ERR_accounting_mismatch) \
- x(disk_accounting_v3, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_bkey_version_in_future, \
- BCH_FSCK_ERR_dev_usage_buckets_wrong, \
- BCH_FSCK_ERR_dev_usage_sectors_wrong, \
- BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
- BCH_FSCK_ERR_accounting_mismatch, \
- BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
- BCH_FSCK_ERR_accounting_key_replicas_nr_required_bad, \
- BCH_FSCK_ERR_accounting_key_replicas_devs_unsorted, \
- BCH_FSCK_ERR_accounting_key_junk_at_end) \
- x(disk_accounting_inum, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_accounting_mismatch) \
- x(rebalance_work_acct_fix, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_accounting_mismatch) \
- x(inode_has_child_snapshots, \
- BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
- BCH_FSCK_ERR_inode_has_child_snapshots_wrong) \
- x(backpointer_bucket_gen, \
- BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
- BCH_FSCK_ERR_backpointer_to_missing_ptr, \
- BCH_FSCK_ERR_ptr_to_missing_backpointer) \
- x(disk_accounting_big_endian, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_accounting_mismatch, \
- BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
- BCH_FSCK_ERR_accounting_key_junk_at_end) \
- x(cached_backpointers, \
- BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
- BCH_FSCK_ERR_ptr_to_missing_backpointer) \
- x(stripe_backpointers, \
- BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
- BCH_FSCK_ERR_ptr_to_missing_backpointer) \
- x(inode_has_case_insensitive, \
- BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
- BCH_FSCK_ERR_inode_has_case_insensitive_not_set, \
- BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set)
-
-#define DOWNGRADE_TABLE() \
- x(bucket_stripe_sectors, \
- 0) \
- x(disk_accounting_v2, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_dev_usage_buckets_wrong, \
- BCH_FSCK_ERR_dev_usage_sectors_wrong, \
- BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
- BCH_FSCK_ERR_fs_usage_hidden_wrong, \
- BCH_FSCK_ERR_fs_usage_btree_wrong, \
- BCH_FSCK_ERR_fs_usage_data_wrong, \
- BCH_FSCK_ERR_fs_usage_cached_wrong, \
- BCH_FSCK_ERR_fs_usage_reserved_wrong, \
- BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \
- BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \
- BCH_FSCK_ERR_fs_usage_replicas_wrong, \
- BCH_FSCK_ERR_bkey_version_in_future) \
- x(disk_accounting_v3, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_dev_usage_buckets_wrong, \
- BCH_FSCK_ERR_dev_usage_sectors_wrong, \
- BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
- BCH_FSCK_ERR_fs_usage_hidden_wrong, \
- BCH_FSCK_ERR_fs_usage_btree_wrong, \
- BCH_FSCK_ERR_fs_usage_data_wrong, \
- BCH_FSCK_ERR_fs_usage_cached_wrong, \
- BCH_FSCK_ERR_fs_usage_reserved_wrong, \
- BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \
- BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \
- BCH_FSCK_ERR_fs_usage_replicas_wrong, \
- BCH_FSCK_ERR_accounting_replicas_not_marked, \
- BCH_FSCK_ERR_bkey_version_in_future) \
- x(rebalance_work_acct_fix, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_accounting_mismatch, \
- BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
- BCH_FSCK_ERR_accounting_key_junk_at_end) \
- x(backpointer_bucket_gen, \
- BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
- BCH_FSCK_ERR_backpointer_bucket_offset_wrong, \
- BCH_FSCK_ERR_backpointer_to_missing_ptr, \
- BCH_FSCK_ERR_ptr_to_missing_backpointer) \
- x(disk_accounting_big_endian, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_accounting_mismatch, \
- BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
- BCH_FSCK_ERR_accounting_key_junk_at_end)
-
-struct upgrade_downgrade_entry {
- u64 recovery_passes;
- u16 version;
- u16 nr_errors;
- const u16 *errors;
-};
-
-#define x(ver, passes, ...) static const u16 upgrade_##ver##_errors[] = { __VA_ARGS__ };
-UPGRADE_TABLE()
-#undef x
-
-static const struct upgrade_downgrade_entry upgrade_table[] = {
-#define x(ver, passes, ...) { \
- .recovery_passes = passes, \
- .version = bcachefs_metadata_version_##ver,\
- .nr_errors = ARRAY_SIZE(upgrade_##ver##_errors), \
- .errors = upgrade_##ver##_errors, \
-},
-UPGRADE_TABLE()
-#undef x
-};
-
-static int have_stripes(struct bch_fs *c)
-{
- if (IS_ERR_OR_NULL(c->btree_roots_known[BTREE_ID_stripes].b))
- return 0;
-
- return !btree_node_fake(c->btree_roots_known[BTREE_ID_stripes].b);
-}
-
-int bch2_sb_set_upgrade_extra(struct bch_fs *c)
-{
- unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
- unsigned new_version = c->sb.version;
- bool write_sb = false;
- int ret = 0;
-
- mutex_lock(&c->sb_lock);
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-
- if (old_version < bcachefs_metadata_version_bucket_stripe_sectors &&
- new_version >= bcachefs_metadata_version_bucket_stripe_sectors &&
- (ret = have_stripes(c) > 0)) {
- __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_sectors_wrong, ext->errors_silent);
- write_sb = true;
- }
-
- if (write_sb)
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- return ret < 0 ? ret : 0;
-}
-
-void bch2_sb_set_upgrade(struct bch_fs *c,
- unsigned old_version,
- unsigned new_version)
-{
- lockdep_assert_held(&c->sb_lock);
-
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-
- for (const struct upgrade_downgrade_entry *i = upgrade_table;
- i < upgrade_table + ARRAY_SIZE(upgrade_table);
- i++)
- if (i->version > old_version && i->version <= new_version) {
- u64 passes = i->recovery_passes;
-
- if (passes & RECOVERY_PASS_ALL_FSCK)
- passes |= bch2_fsck_recovery_passes();
- passes &= ~RECOVERY_PASS_ALL_FSCK;
-
- ext->recovery_passes_required[0] |=
- cpu_to_le64(bch2_recovery_passes_to_stable(passes));
-
- for (const u16 *e = i->errors; e < i->errors + i->nr_errors; e++)
- __set_bit_le64(*e, ext->errors_silent);
- }
-}
-
-#define x(ver, passes, ...) static const u16 downgrade_##ver##_errors[] = { __VA_ARGS__ };
-DOWNGRADE_TABLE()
-#undef x
-
-static const struct upgrade_downgrade_entry downgrade_table[] = {
-#define x(ver, passes, ...) { \
- .recovery_passes = passes, \
- .version = bcachefs_metadata_version_##ver,\
- .nr_errors = ARRAY_SIZE(downgrade_##ver##_errors), \
- .errors = downgrade_##ver##_errors, \
-},
-DOWNGRADE_TABLE()
-#undef x
-};
-
-static int downgrade_table_extra(struct bch_fs *c, darray_char *table)
-{
- unsigned dst_offset = table->nr;
- struct bch_sb_field_downgrade_entry *dst = (void *) &darray_top(*table);
- unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors);
- int ret = 0;
-
- unsigned nr_errors = le16_to_cpu(dst->nr_errors);
-
- switch (le16_to_cpu(dst->version)) {
- case bcachefs_metadata_version_bucket_stripe_sectors:
- if (have_stripes(c)) {
- bytes += sizeof(dst->errors[0]) * 2;
-
- ret = darray_make_room(table, bytes);
- if (ret)
- return ret;
-
- dst = (void *) &table->data[dst_offset];
- dst->nr_errors = cpu_to_le16(nr_errors + 1);
-
- /* open coded __set_bit_le64, as dst is packed and
- * dst->recovery_passes is misaligned */
- unsigned b = BCH_RECOVERY_PASS_STABLE_check_allocations;
- dst->recovery_passes[b / 64] |= cpu_to_le64(BIT_ULL(b % 64));
-
- dst->errors[nr_errors++] = cpu_to_le16(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong);
- }
- break;
- }
-
- return ret;
-}
-
-static inline const struct bch_sb_field_downgrade_entry *
-downgrade_entry_next_c(const struct bch_sb_field_downgrade_entry *e)
-{
- return (void *) &e->errors[le16_to_cpu(e->nr_errors)];
-}
-
-#define for_each_downgrade_entry(_d, _i) \
- for (const struct bch_sb_field_downgrade_entry *_i = (_d)->entries; \
- (void *) _i < vstruct_end(&(_d)->field) && \
- (void *) &_i->errors[0] <= vstruct_end(&(_d)->field) && \
- (void *) downgrade_entry_next_c(_i) <= vstruct_end(&(_d)->field); \
- _i = downgrade_entry_next_c(_i))
-
-static int bch2_sb_downgrade_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_downgrade *e = field_to_type(f, downgrade);
-
- for (const struct bch_sb_field_downgrade_entry *i = e->entries;
- (void *) i < vstruct_end(&e->field);
- i = downgrade_entry_next_c(i)) {
- /*
- * Careful: sb_field_downgrade_entry is only 2 byte aligned, but
- * section sizes are 8 byte aligned - an empty entry spanning
- * the end of the section is allowed (and ignored):
- */
- if ((void *) &i->errors[0] > vstruct_end(&e->field))
- break;
-
- if (flags & BCH_VALIDATE_write &&
- (void *) downgrade_entry_next_c(i) > vstruct_end(&e->field)) {
- prt_printf(err, "downgrade entry overruns end of superblock section");
- return -BCH_ERR_invalid_sb_downgrade;
- }
-
- if (BCH_VERSION_MAJOR(le16_to_cpu(i->version)) !=
- BCH_VERSION_MAJOR(le16_to_cpu(sb->version))) {
- prt_printf(err, "downgrade entry with mismatched major version (%u != %u)",
- BCH_VERSION_MAJOR(le16_to_cpu(i->version)),
- BCH_VERSION_MAJOR(le16_to_cpu(sb->version)));
- return -BCH_ERR_invalid_sb_downgrade;
- }
- }
-
- return 0;
-}
-
-static void bch2_sb_downgrade_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_downgrade *e = field_to_type(f, downgrade);
-
- if (out->nr_tabstops <= 1)
- printbuf_tabstop_push(out, 16);
-
- for_each_downgrade_entry(e, i) {
- prt_str(out, "version:\t");
- bch2_version_to_text(out, le16_to_cpu(i->version));
- prt_newline(out);
-
- prt_str(out, "recovery passes:\t");
- prt_bitflags(out, bch2_recovery_passes,
- bch2_recovery_passes_from_stable(le64_to_cpu(i->recovery_passes[0])));
- prt_newline(out);
-
- prt_str(out, "errors:\t");
- bool first = true;
- for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) {
- if (!first)
- prt_char(out, ',');
- first = false;
- bch2_sb_error_id_to_text(out, le16_to_cpu(i->errors[j]));
- }
- prt_newline(out);
- }
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_downgrade = {
- .validate = bch2_sb_downgrade_validate,
- .to_text = bch2_sb_downgrade_to_text,
-};
-
-int bch2_sb_downgrade_update(struct bch_fs *c)
-{
- if (!test_bit(BCH_FS_btree_running, &c->flags))
- return 0;
-
- darray_char table = {};
- int ret = 0;
-
- for (const struct upgrade_downgrade_entry *src = downgrade_table;
- src < downgrade_table + ARRAY_SIZE(downgrade_table);
- src++) {
- if (BCH_VERSION_MAJOR(src->version) != BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version)))
- continue;
-
- if (src->version < c->sb.version_incompat)
- continue;
-
- struct bch_sb_field_downgrade_entry *dst;
- unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * src->nr_errors;
-
- ret = darray_make_room(&table, bytes);
- if (ret)
- goto out;
-
- dst = (void *) &darray_top(table);
- dst->version = cpu_to_le16(src->version);
- dst->recovery_passes[0] = cpu_to_le64(bch2_recovery_passes_to_stable(src->recovery_passes));
- dst->recovery_passes[1] = 0;
- dst->nr_errors = cpu_to_le16(src->nr_errors);
- for (unsigned i = 0; i < src->nr_errors; i++)
- dst->errors[i] = cpu_to_le16(src->errors[i]);
-
- ret = downgrade_table_extra(c, &table);
- if (ret)
- goto out;
-
- if (!dst->recovery_passes[0] &&
- !dst->recovery_passes[1] &&
- !dst->nr_errors)
- continue;
-
- table.nr += sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors);
- }
-
- struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade);
-
- unsigned sb_u64s = DIV_ROUND_UP(sizeof(*d) + table.nr, sizeof(u64));
-
- if (d && le32_to_cpu(d->field.u64s) > sb_u64s)
- goto out;
-
- d = bch2_sb_field_resize(&c->disk_sb, downgrade, sb_u64s);
- if (!d) {
- ret = bch_err_throw(c, ENOSPC_sb_downgrade);
- goto out;
- }
-
- memcpy(d->entries, table.data, table.nr);
- memset_u64s_tail(d->entries, 0, table.nr);
-out:
- darray_exit(&table);
- return ret;
-}
-
-void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_minor)
-{
- struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade);
- if (!d)
- return;
-
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-
- for_each_downgrade_entry(d, i) {
- unsigned minor = BCH_VERSION_MINOR(le16_to_cpu(i->version));
- if (new_minor < minor && minor <= old_minor) {
- ext->recovery_passes_required[0] |= i->recovery_passes[0];
- ext->recovery_passes_required[1] |= i->recovery_passes[1];
-
- for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) {
- unsigned e = le16_to_cpu(i->errors[j]);
- if (e < BCH_FSCK_ERR_MAX)
- __set_bit(e, c->sb.errors_silent);
- if (e < sizeof(ext->errors_silent) * 8)
- __set_bit_le64(e, ext->errors_silent);
- }
- }
- }
-}
diff --git a/fs/bcachefs/sb-downgrade.h b/fs/bcachefs/sb-downgrade.h
deleted file mode 100644
index 095b7cc9bb47..000000000000
--- a/fs/bcachefs/sb-downgrade.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_DOWNGRADE_H
-#define _BCACHEFS_SB_DOWNGRADE_H
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_downgrade;
-
-int bch2_sb_downgrade_update(struct bch_fs *);
-void bch2_sb_set_upgrade(struct bch_fs *, unsigned, unsigned);
-int bch2_sb_set_upgrade_extra(struct bch_fs *);
-void bch2_sb_set_downgrade(struct bch_fs *, unsigned, unsigned);
-
-#endif /* _BCACHEFS_SB_DOWNGRADE_H */
diff --git a/fs/bcachefs/sb-downgrade_format.h b/fs/bcachefs/sb-downgrade_format.h
deleted file mode 100644
index cffd932be3ec..000000000000
--- a/fs/bcachefs/sb-downgrade_format.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_DOWNGRADE_FORMAT_H
-#define _BCACHEFS_SB_DOWNGRADE_FORMAT_H
-
-struct bch_sb_field_downgrade_entry {
- __le16 version;
- __le64 recovery_passes[2];
- __le16 nr_errors;
- __le16 errors[] __counted_by(nr_errors);
-} __packed __aligned(2);
-
-struct bch_sb_field_downgrade {
- struct bch_sb_field field;
- struct bch_sb_field_downgrade_entry entries[];
-};
-
-#endif /* _BCACHEFS_SB_DOWNGRADE_FORMAT_H */
diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c
deleted file mode 100644
index 48853efdc105..000000000000
--- a/fs/bcachefs/sb-errors.c
+++ /dev/null
@@ -1,198 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "sb-errors.h"
-#include "super-io.h"
-
-const char * const bch2_sb_error_strs[] = {
-#define x(t, n, ...) [n] = #t,
- BCH_SB_ERRS()
-#undef x
-};
-
-void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id)
-{
- if (id < BCH_FSCK_ERR_MAX)
- prt_str(out, bch2_sb_error_strs[id]);
- else
- prt_printf(out, "(unknown error %u)", id);
-}
-
-static inline unsigned bch2_sb_field_errors_nr_entries(struct bch_sb_field_errors *e)
-{
- return bch2_sb_field_nr_entries(e);
-}
-
-static inline unsigned bch2_sb_field_errors_u64s(unsigned nr)
-{
- return (sizeof(struct bch_sb_field_errors) +
- sizeof(struct bch_sb_field_error_entry) * nr) / sizeof(u64);
-}
-
-static int bch2_sb_errors_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_errors *e = field_to_type(f, errors);
- unsigned i, nr = bch2_sb_field_errors_nr_entries(e);
-
- for (i = 0; i < nr; i++) {
- if (!BCH_SB_ERROR_ENTRY_NR(&e->entries[i])) {
- prt_printf(err, "entry with count 0 (id ");
- bch2_sb_error_id_to_text(err, BCH_SB_ERROR_ENTRY_ID(&e->entries[i]));
- prt_printf(err, ")");
- return -BCH_ERR_invalid_sb_errors;
- }
-
- if (i + 1 < nr &&
- BCH_SB_ERROR_ENTRY_ID(&e->entries[i]) >=
- BCH_SB_ERROR_ENTRY_ID(&e->entries[i + 1])) {
- prt_printf(err, "entries out of order");
- return -BCH_ERR_invalid_sb_errors;
- }
- }
-
- return 0;
-}
-
-static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_errors *e = field_to_type(f, errors);
- unsigned i, nr = bch2_sb_field_errors_nr_entries(e);
-
- if (out->nr_tabstops <= 1)
- printbuf_tabstop_push(out, 16);
-
- for (i = 0; i < nr; i++) {
- bch2_sb_error_id_to_text(out, BCH_SB_ERROR_ENTRY_ID(&e->entries[i]));
- prt_tab(out);
- prt_u64(out, BCH_SB_ERROR_ENTRY_NR(&e->entries[i]));
- prt_tab(out);
- bch2_prt_datetime(out, le64_to_cpu(e->entries[i].last_error_time));
- prt_newline(out);
- }
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_errors = {
- .validate = bch2_sb_errors_validate,
- .to_text = bch2_sb_errors_to_text,
-};
-
-void bch2_fs_errors_to_text(struct printbuf *out, struct bch_fs *c)
-{
- if (out->nr_tabstops < 1)
- printbuf_tabstop_push(out, 48);
- if (out->nr_tabstops < 2)
- printbuf_tabstop_push(out, 8);
- if (out->nr_tabstops < 3)
- printbuf_tabstop_push(out, 16);
-
- guard(mutex)(&c->fsck_error_counts_lock);
-
- bch_sb_errors_cpu *e = &c->fsck_error_counts;
- darray_for_each(*e, i) {
- bch2_sb_error_id_to_text(out, i->id);
- prt_tab(out);
- prt_u64(out, i->nr);
- prt_tab(out);
- bch2_prt_datetime(out, i->last_error_time);
- prt_newline(out);
- }
-}
-
-void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err)
-{
- bch_sb_errors_cpu *e = &c->fsck_error_counts;
- struct bch_sb_error_entry_cpu n = {
- .id = err,
- .nr = 1,
- .last_error_time = ktime_get_real_seconds()
- };
- unsigned i;
-
- mutex_lock(&c->fsck_error_counts_lock);
- for (i = 0; i < e->nr; i++) {
- if (err == e->data[i].id) {
- e->data[i].nr++;
- e->data[i].last_error_time = n.last_error_time;
- goto out;
- }
- if (err < e->data[i].id)
- break;
- }
-
- if (darray_make_room(e, 1))
- goto out;
-
- darray_insert_item(e, i, n);
-out:
- mutex_unlock(&c->fsck_error_counts_lock);
-}
-
-void bch2_sb_errors_from_cpu(struct bch_fs *c)
-{
- bch_sb_errors_cpu *src = &c->fsck_error_counts;
- struct bch_sb_field_errors *dst;
- unsigned i;
-
- mutex_lock(&c->fsck_error_counts_lock);
-
- dst = bch2_sb_field_resize(&c->disk_sb, errors,
- bch2_sb_field_errors_u64s(src->nr));
-
- if (!dst)
- goto err;
-
- for (i = 0; i < src->nr; i++) {
- SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id);
- SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr);
- dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time);
- }
-
-err:
- mutex_unlock(&c->fsck_error_counts_lock);
-}
-
-static int bch2_sb_errors_to_cpu(struct bch_fs *c)
-{
- struct bch_sb_field_errors *src = bch2_sb_field_get(c->disk_sb.sb, errors);
- bch_sb_errors_cpu *dst = &c->fsck_error_counts;
- unsigned i, nr = bch2_sb_field_errors_nr_entries(src);
- int ret;
-
- if (!nr)
- return 0;
-
- mutex_lock(&c->fsck_error_counts_lock);
- ret = darray_make_room(dst, nr);
- if (ret)
- goto err;
-
- dst->nr = nr;
-
- for (i = 0; i < nr; i++) {
- dst->data[i].id = BCH_SB_ERROR_ENTRY_ID(&src->entries[i]);
- dst->data[i].nr = BCH_SB_ERROR_ENTRY_NR(&src->entries[i]);
- dst->data[i].last_error_time = le64_to_cpu(src->entries[i].last_error_time);
- }
-err:
- mutex_unlock(&c->fsck_error_counts_lock);
-
- return ret;
-}
-
-void bch2_fs_sb_errors_exit(struct bch_fs *c)
-{
- darray_exit(&c->fsck_error_counts);
-}
-
-void bch2_fs_sb_errors_init_early(struct bch_fs *c)
-{
- mutex_init(&c->fsck_error_counts_lock);
- darray_init(&c->fsck_error_counts);
-}
-
-int bch2_fs_sb_errors_init(struct bch_fs *c)
-{
- return bch2_sb_errors_to_cpu(c);
-}
diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h
deleted file mode 100644
index e86267264692..000000000000
--- a/fs/bcachefs/sb-errors.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_ERRORS_H
-#define _BCACHEFS_SB_ERRORS_H
-
-#include "sb-errors_types.h"
-
-extern const char * const bch2_sb_error_strs[];
-
-void bch2_sb_error_id_to_text(struct printbuf *, enum bch_sb_error_id);
-void bch2_fs_errors_to_text(struct printbuf *, struct bch_fs *);
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_errors;
-
-void bch2_sb_error_count(struct bch_fs *, enum bch_sb_error_id);
-
-void bch2_sb_errors_from_cpu(struct bch_fs *);
-
-void bch2_fs_sb_errors_exit(struct bch_fs *);
-void bch2_fs_sb_errors_init_early(struct bch_fs *);
-int bch2_fs_sb_errors_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_SB_ERRORS_H */
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
deleted file mode 100644
index d154b7651d28..000000000000
--- a/fs/bcachefs/sb-errors_format.h
+++ /dev/null
@@ -1,353 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_ERRORS_FORMAT_H
-#define _BCACHEFS_SB_ERRORS_FORMAT_H
-
-enum bch_fsck_flags {
- FSCK_CAN_FIX = BIT(0),
- FSCK_CAN_IGNORE = BIT(1),
- FSCK_AUTOFIX = BIT(2),
- FSCK_ERR_NO_LOG = BIT(3),
-};
-
-#define BCH_SB_ERRS() \
- x(clean_but_journal_not_empty, 0, 0) \
- x(dirty_but_no_journal_entries, 1, 0) \
- x(dirty_but_no_journal_entries_post_drop_nonflushes, 2, 0) \
- x(sb_clean_journal_seq_mismatch, 3, 0) \
- x(sb_clean_btree_root_mismatch, 4, 0) \
- x(sb_clean_missing, 5, 0) \
- x(jset_unsupported_version, 6, 0) \
- x(jset_unknown_csum, 7, 0) \
- x(jset_last_seq_newer_than_seq, 8, 0) \
- x(jset_past_bucket_end, 9, 0) \
- x(jset_seq_blacklisted, 10, 0) \
- x(journal_entries_missing, 11, 0) \
- x(journal_entry_replicas_not_marked, 12, FSCK_AUTOFIX) \
- x(journal_entry_past_jset_end, 13, 0) \
- x(journal_entry_replicas_data_mismatch, 14, 0) \
- x(journal_entry_bkey_u64s_0, 15, 0) \
- x(journal_entry_bkey_past_end, 16, 0) \
- x(journal_entry_bkey_bad_format, 17, 0) \
- x(journal_entry_bkey_invalid, 18, 0) \
- x(journal_entry_btree_root_bad_size, 19, 0) \
- x(journal_entry_blacklist_bad_size, 20, 0) \
- x(journal_entry_blacklist_v2_bad_size, 21, 0) \
- x(journal_entry_blacklist_v2_start_past_end, 22, 0) \
- x(journal_entry_usage_bad_size, 23, 0) \
- x(journal_entry_data_usage_bad_size, 24, 0) \
- x(journal_entry_clock_bad_size, 25, 0) \
- x(journal_entry_clock_bad_rw, 26, 0) \
- x(journal_entry_dev_usage_bad_size, 27, 0) \
- x(journal_entry_dev_usage_bad_dev, 28, 0) \
- x(journal_entry_dev_usage_bad_pad, 29, 0) \
- x(btree_node_unreadable, 30, 0) \
- x(btree_node_fault_injected, 31, 0) \
- x(btree_node_bad_magic, 32, 0) \
- x(btree_node_bad_seq, 33, 0) \
- x(btree_node_unsupported_version, 34, 0) \
- x(btree_node_bset_older_than_sb_min, 35, 0) \
- x(btree_node_bset_newer_than_sb, 36, 0) \
- x(btree_node_data_missing, 37, FSCK_AUTOFIX) \
- x(btree_node_bset_after_end, 38, 0) \
- x(btree_node_replicas_sectors_written_mismatch, 39, 0) \
- x(btree_node_replicas_data_mismatch, 40, 0) \
- x(bset_unknown_csum, 41, 0) \
- x(bset_bad_csum, 42, 0) \
- x(bset_past_end_of_btree_node, 43, 0) \
- x(bset_wrong_sector_offset, 44, 0) \
- x(bset_empty, 45, 0) \
- x(bset_bad_seq, 46, 0) \
- x(bset_blacklisted_journal_seq, 47, FSCK_AUTOFIX) \
- x(first_bset_blacklisted_journal_seq, 48, FSCK_AUTOFIX) \
- x(btree_node_bad_btree, 49, 0) \
- x(btree_node_bad_level, 50, 0) \
- x(btree_node_bad_min_key, 51, 0) \
- x(btree_node_bad_max_key, 52, 0) \
- x(btree_node_bad_format, 53, 0) \
- x(btree_node_bkey_past_bset_end, 54, 0) \
- x(btree_node_bkey_bad_format, 55, 0) \
- x(btree_node_bad_bkey, 56, 0) \
- x(btree_node_bkey_out_of_order, 57, FSCK_AUTOFIX) \
- x(btree_root_bkey_invalid, 58, FSCK_AUTOFIX) \
- x(btree_root_read_error, 59, FSCK_AUTOFIX) \
- x(btree_root_bad_min_key, 60, 0) \
- x(btree_root_bad_max_key, 61, 0) \
- x(btree_node_read_error, 62, FSCK_AUTOFIX) \
- x(btree_node_topology_bad_min_key, 63, FSCK_AUTOFIX) \
- x(btree_node_topology_bad_max_key, 64, FSCK_AUTOFIX) \
- x(btree_node_topology_overwritten_by_prev_node, 65, FSCK_AUTOFIX) \
- x(btree_node_topology_overwritten_by_next_node, 66, FSCK_AUTOFIX) \
- x(btree_node_topology_interior_node_empty, 67, FSCK_AUTOFIX) \
- x(fs_usage_hidden_wrong, 68, FSCK_AUTOFIX) \
- x(fs_usage_btree_wrong, 69, FSCK_AUTOFIX) \
- x(fs_usage_data_wrong, 70, FSCK_AUTOFIX) \
- x(fs_usage_cached_wrong, 71, FSCK_AUTOFIX) \
- x(fs_usage_reserved_wrong, 72, FSCK_AUTOFIX) \
- x(fs_usage_persistent_reserved_wrong, 73, FSCK_AUTOFIX) \
- x(fs_usage_nr_inodes_wrong, 74, FSCK_AUTOFIX) \
- x(fs_usage_replicas_wrong, 75, FSCK_AUTOFIX) \
- x(dev_usage_buckets_wrong, 76, FSCK_AUTOFIX) \
- x(dev_usage_sectors_wrong, 77, FSCK_AUTOFIX) \
- x(dev_usage_fragmented_wrong, 78, FSCK_AUTOFIX) \
- x(dev_usage_buckets_ec_wrong, 79, FSCK_AUTOFIX) \
- x(bkey_version_in_future, 80, 0) \
- x(bkey_u64s_too_small, 81, 0) \
- x(bkey_invalid_type_for_btree, 82, 0) \
- x(bkey_extent_size_zero, 83, 0) \
- x(bkey_extent_size_greater_than_offset, 84, 0) \
- x(bkey_size_nonzero, 85, 0) \
- x(bkey_snapshot_nonzero, 86, 0) \
- x(bkey_snapshot_zero, 87, 0) \
- x(bkey_at_pos_max, 88, 0) \
- x(bkey_before_start_of_btree_node, 89, 0) \
- x(bkey_after_end_of_btree_node, 90, 0) \
- x(bkey_val_size_nonzero, 91, 0) \
- x(bkey_val_size_too_small, 92, 0) \
- x(alloc_v1_val_size_bad, 93, 0) \
- x(alloc_v2_unpack_error, 94, 0) \
- x(alloc_v3_unpack_error, 95, 0) \
- x(alloc_v4_val_size_bad, 96, 0) \
- x(alloc_v4_backpointers_start_bad, 97, 0) \
- x(alloc_key_data_type_bad, 98, 0) \
- x(alloc_key_empty_but_have_data, 99, 0) \
- x(alloc_key_dirty_sectors_0, 100, 0) \
- x(alloc_key_data_type_inconsistency, 101, 0) \
- x(alloc_key_to_missing_dev_bucket, 102, 0) \
- x(alloc_key_cached_inconsistency, 103, 0) \
- x(alloc_key_cached_but_read_time_zero, 104, FSCK_AUTOFIX) \
- x(alloc_key_to_missing_lru_entry, 105, FSCK_AUTOFIX) \
- x(alloc_key_data_type_wrong, 106, FSCK_AUTOFIX) \
- x(alloc_key_gen_wrong, 107, FSCK_AUTOFIX) \
- x(alloc_key_dirty_sectors_wrong, 108, FSCK_AUTOFIX) \
- x(alloc_key_cached_sectors_wrong, 109, FSCK_AUTOFIX) \
- x(alloc_key_stripe_wrong, 110, FSCK_AUTOFIX) \
- x(alloc_key_stripe_redundancy_wrong, 111, FSCK_AUTOFIX) \
- x(alloc_key_journal_seq_in_future, 298, FSCK_AUTOFIX) \
- x(bucket_sector_count_overflow, 112, 0) \
- x(bucket_metadata_type_mismatch, 113, 0) \
- x(need_discard_key_wrong, 114, FSCK_AUTOFIX) \
- x(freespace_key_wrong, 115, FSCK_AUTOFIX) \
- x(freespace_hole_missing, 116, FSCK_AUTOFIX) \
- x(bucket_gens_val_size_bad, 117, 0) \
- x(bucket_gens_key_wrong, 118, FSCK_AUTOFIX) \
- x(bucket_gens_hole_wrong, 119, FSCK_AUTOFIX) \
- x(bucket_gens_to_invalid_dev, 120, FSCK_AUTOFIX) \
- x(bucket_gens_to_invalid_buckets, 121, FSCK_AUTOFIX) \
- x(bucket_gens_nonzero_for_invalid_buckets, 122, FSCK_AUTOFIX) \
- x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \
- x(need_discard_freespace_key_bad, 124, FSCK_AUTOFIX) \
- x(discarding_bucket_not_in_need_discard_btree, 291, 0) \
- x(backpointer_bucket_offset_wrong, 125, 0) \
- x(backpointer_level_bad, 294, 0) \
- x(backpointer_dev_bad, 297, 0) \
- x(backpointer_to_missing_device, 126, FSCK_AUTOFIX) \
- x(backpointer_to_missing_alloc, 127, FSCK_AUTOFIX) \
- x(backpointer_to_missing_ptr, 128, FSCK_AUTOFIX) \
- x(lru_entry_at_time_0, 129, FSCK_AUTOFIX) \
- x(lru_entry_to_invalid_bucket, 130, FSCK_AUTOFIX) \
- x(lru_entry_bad, 131, FSCK_AUTOFIX) \
- x(btree_ptr_val_too_big, 132, 0) \
- x(btree_ptr_v2_val_too_big, 133, 0) \
- x(btree_ptr_has_non_ptr, 134, 0) \
- x(extent_ptrs_invalid_entry, 135, 0) \
- x(extent_ptrs_no_ptrs, 136, 0) \
- x(extent_ptrs_too_many_ptrs, 137, 0) \
- x(extent_ptrs_redundant_crc, 138, 0) \
- x(extent_ptrs_redundant_stripe, 139, 0) \
- x(extent_ptrs_unwritten, 140, 0) \
- x(extent_ptrs_written_and_unwritten, 141, 0) \
- x(ptr_to_invalid_device, 142, 0) \
- x(ptr_to_duplicate_device, 143, 0) \
- x(ptr_after_last_bucket, 144, 0) \
- x(ptr_before_first_bucket, 145, 0) \
- x(ptr_spans_multiple_buckets, 146, 0) \
- x(ptr_to_missing_backpointer, 147, FSCK_AUTOFIX) \
- x(ptr_to_missing_alloc_key, 148, FSCK_AUTOFIX) \
- x(ptr_to_missing_replicas_entry, 149, FSCK_AUTOFIX) \
- x(ptr_to_missing_stripe, 150, 0) \
- x(ptr_to_incorrect_stripe, 151, 0) \
- x(ptr_gen_newer_than_bucket_gen, 152, FSCK_AUTOFIX) \
- x(ptr_too_stale, 153, 0) \
- x(stale_dirty_ptr, 154, FSCK_AUTOFIX) \
- x(ptr_bucket_data_type_mismatch, 155, 0) \
- x(ptr_cached_and_erasure_coded, 156, 0) \
- x(ptr_crc_uncompressed_size_too_small, 157, 0) \
- x(ptr_crc_uncompressed_size_too_big, 161, 0) \
- x(ptr_crc_uncompressed_size_mismatch, 300, 0) \
- x(ptr_crc_csum_type_unknown, 158, 0) \
- x(ptr_crc_compression_type_unknown, 159, 0) \
- x(ptr_crc_redundant, 160, 0) \
- x(ptr_crc_nonce_mismatch, 162, 0) \
- x(ptr_stripe_redundant, 163, 0) \
- x(extent_flags_not_at_start, 306, 0) \
- x(reservation_key_nr_replicas_invalid, 164, 0) \
- x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \
- x(reflink_v_pos_bad, 292, 0) \
- x(reflink_p_to_missing_reflink_v, 166, FSCK_AUTOFIX) \
- x(reflink_refcount_underflow, 293, 0) \
- x(stripe_pos_bad, 167, 0) \
- x(stripe_val_size_bad, 168, 0) \
- x(stripe_csum_granularity_bad, 290, 0) \
- x(stripe_sector_count_wrong, 169, 0) \
- x(snapshot_tree_pos_bad, 170, 0) \
- x(snapshot_tree_to_missing_snapshot, 171, 0) \
- x(snapshot_tree_to_missing_subvol, 172, 0) \
- x(snapshot_tree_to_wrong_subvol, 173, 0) \
- x(snapshot_tree_to_snapshot_subvol, 174, 0) \
- x(snapshot_pos_bad, 175, 0) \
- x(snapshot_parent_bad, 176, 0) \
- x(snapshot_children_not_normalized, 177, 0) \
- x(snapshot_child_duplicate, 178, 0) \
- x(snapshot_child_bad, 179, 0) \
- x(snapshot_skiplist_not_normalized, 180, 0) \
- x(snapshot_skiplist_bad, 181, 0) \
- x(snapshot_should_not_have_subvol, 182, 0) \
- x(snapshot_to_bad_snapshot_tree, 183, FSCK_AUTOFIX) \
- x(snapshot_bad_depth, 184, 0) \
- x(snapshot_bad_skiplist, 185, 0) \
- x(subvol_pos_bad, 186, 0) \
- x(subvol_not_master_and_not_snapshot, 187, FSCK_AUTOFIX) \
- x(subvol_to_missing_root, 188, 0) \
- x(subvol_root_wrong_bi_subvol, 189, FSCK_AUTOFIX) \
- x(bkey_in_missing_snapshot, 190, 0) \
- x(bkey_in_deleted_snapshot, 315, FSCK_AUTOFIX) \
- x(inode_pos_inode_nonzero, 191, 0) \
- x(inode_pos_blockdev_range, 192, 0) \
- x(inode_alloc_cursor_inode_bad, 301, 0) \
- x(inode_unpack_error, 193, 0) \
- x(inode_str_hash_invalid, 194, 0) \
- x(inode_v3_fields_start_bad, 195, 0) \
- x(inode_snapshot_mismatch, 196, 0) \
- x(snapshot_key_missing_inode_snapshot, 314, FSCK_AUTOFIX) \
- x(inode_unlinked_but_clean, 197, 0) \
- x(inode_unlinked_but_nlink_nonzero, 198, 0) \
- x(inode_unlinked_and_not_open, 281, 0) \
- x(inode_unlinked_but_has_dirent, 285, 0) \
- x(inode_checksum_type_invalid, 199, 0) \
- x(inode_compression_type_invalid, 200, 0) \
- x(inode_subvol_root_but_not_dir, 201, 0) \
- x(inode_i_size_dirty_but_clean, 202, FSCK_AUTOFIX) \
- x(inode_i_sectors_dirty_but_clean, 203, FSCK_AUTOFIX) \
- x(inode_i_sectors_wrong, 204, FSCK_AUTOFIX) \
- x(inode_dir_wrong_nlink, 205, FSCK_AUTOFIX) \
- x(inode_dir_multiple_links, 206, FSCK_AUTOFIX) \
- x(inode_dir_missing_backpointer, 284, FSCK_AUTOFIX) \
- x(inode_dir_unlinked_but_not_empty, 286, FSCK_AUTOFIX) \
- x(inode_dir_has_nonzero_i_size, 319, FSCK_AUTOFIX) \
- x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \
- x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \
- x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \
- x(inode_has_child_snapshots_wrong, 287, FSCK_AUTOFIX) \
- x(inode_unreachable, 210, FSCK_AUTOFIX) \
- x(inode_journal_seq_in_future, 299, FSCK_AUTOFIX) \
- x(inode_i_sectors_underflow, 312, FSCK_AUTOFIX) \
- x(inode_has_case_insensitive_not_set, 316, FSCK_AUTOFIX) \
- x(inode_parent_has_case_insensitive_not_set, 317, FSCK_AUTOFIX) \
- x(vfs_inode_i_blocks_underflow, 311, FSCK_AUTOFIX) \
- x(vfs_inode_i_blocks_not_zero_at_truncate, 313, FSCK_AUTOFIX) \
- x(vfs_bad_inode_rm, 320, 0) \
- x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \
- x(deleted_inode_missing, 212, FSCK_AUTOFIX) \
- x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \
- x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \
- x(deleted_inode_has_child_snapshots, 288, FSCK_AUTOFIX) \
- x(extent_overlapping, 215, 0) \
- x(key_in_missing_inode, 216, FSCK_AUTOFIX) \
- x(key_in_wrong_inode_type, 217, 0) \
- x(extent_past_end_of_inode, 218, FSCK_AUTOFIX) \
- x(dirent_empty_name, 219, 0) \
- x(dirent_val_too_big, 220, 0) \
- x(dirent_name_too_long, 221, 0) \
- x(dirent_name_embedded_nul, 222, 0) \
- x(dirent_name_dot_or_dotdot, 223, 0) \
- x(dirent_name_has_slash, 224, 0) \
- x(dirent_d_type_wrong, 225, FSCK_AUTOFIX) \
- x(inode_bi_parent_wrong, 226, 0) \
- x(dirent_in_missing_dir_inode, 227, 0) \
- x(dirent_in_non_dir_inode, 228, 0) \
- x(dirent_to_missing_inode, 229, FSCK_AUTOFIX) \
- x(dirent_to_overwritten_inode, 302, 0) \
- x(dirent_to_missing_subvol, 230, 0) \
- x(dirent_to_itself, 231, 0) \
- x(dirent_casefold_mismatch, 318, FSCK_AUTOFIX) \
- x(quota_type_invalid, 232, 0) \
- x(xattr_val_size_too_small, 233, 0) \
- x(xattr_val_size_too_big, 234, 0) \
- x(xattr_invalid_type, 235, 0) \
- x(xattr_name_invalid_chars, 236, 0) \
- x(xattr_in_missing_inode, 237, 0) \
- x(root_subvol_missing, 238, 0) \
- x(root_dir_missing, 239, 0) \
- x(root_inode_not_dir, 240, 0) \
- x(dir_loop, 241, 0) \
- x(hash_table_key_duplicate, 242, FSCK_AUTOFIX) \
- x(hash_table_key_wrong_offset, 243, FSCK_AUTOFIX) \
- x(unlinked_inode_not_on_deleted_list, 244, FSCK_AUTOFIX) \
- x(reflink_p_front_pad_bad, 245, 0) \
- x(journal_entry_dup_same_device, 246, 0) \
- x(inode_bi_subvol_missing, 247, 0) \
- x(inode_bi_subvol_wrong, 248, 0) \
- x(inode_points_to_missing_dirent, 249, FSCK_AUTOFIX) \
- x(inode_points_to_wrong_dirent, 250, FSCK_AUTOFIX) \
- x(inode_bi_parent_nonzero, 251, 0) \
- x(dirent_to_missing_parent_subvol, 252, 0) \
- x(dirent_not_visible_in_parent_subvol, 253, 0) \
- x(subvol_fs_path_parent_wrong, 254, 0) \
- x(subvol_root_fs_path_parent_nonzero, 255, 0) \
- x(subvol_children_not_set, 256, 0) \
- x(subvol_children_bad, 257, 0) \
- x(subvol_loop, 258, 0) \
- x(subvol_unreachable, 259, FSCK_AUTOFIX) \
- x(btree_node_bkey_bad_u64s, 260, 0) \
- x(btree_node_topology_empty_interior_node, 261, 0) \
- x(btree_ptr_v2_min_key_bad, 262, 0) \
- x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \
- x(snapshot_node_missing, 264, FSCK_AUTOFIX) \
- x(dup_backpointer_to_bad_csum_extent, 265, 0) \
- x(btree_bitmap_not_marked, 266, FSCK_AUTOFIX) \
- x(sb_clean_entry_overrun, 267, 0) \
- x(btree_ptr_v2_written_0, 268, 0) \
- x(subvol_snapshot_bad, 269, 0) \
- x(subvol_inode_bad, 270, 0) \
- x(subvol_missing, 308, FSCK_AUTOFIX) \
- x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \
- x(accounting_mismatch, 272, FSCK_AUTOFIX) \
- x(accounting_replicas_not_marked, 273, 0) \
- x(accounting_to_invalid_device, 289, 0) \
- x(invalid_btree_id, 274, FSCK_AUTOFIX) \
- x(alloc_key_io_time_bad, 275, 0) \
- x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \
- x(accounting_key_junk_at_end, 277, FSCK_AUTOFIX) \
- x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \
- x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \
- x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \
- x(accounting_key_version_0, 282, FSCK_AUTOFIX) \
- x(accounting_key_nr_counters_wrong, 307, FSCK_AUTOFIX) \
- x(logged_op_but_clean, 283, FSCK_AUTOFIX) \
- x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \
- x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \
- x(directory_size_mismatch, 303, FSCK_AUTOFIX) \
- x(dirent_cf_name_too_big, 304, 0) \
- x(dirent_stray_data_after_cf_name, 305, 0) \
- x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \
- x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \
- x(MAX, 321, 0)
-
-enum bch_sb_error_id {
-#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
- BCH_SB_ERRS()
-#undef x
-};
-
-struct bch_sb_field_errors {
- struct bch_sb_field field;
- struct bch_sb_field_error_entry {
- __le64 v;
- __le64 last_error_time;
- } entries[];
-};
-
-LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16);
-LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64);
-
-#endif /* _BCACHEFS_SB_ERRORS_FORMAT_H */
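For reference, each x(name, nr, flags) row in the table above is an x-macro entry: the numeric id is what is stored on disk, so it stays stable even when rows are reordered, and the flags argument is consumed by other expansions. A minimal, self-contained sketch of how the enum above is generated from two such rows (the macro and enum names here are illustrative, not the real ones):

#define BCH_SB_ERRS_EXAMPLE()					\
	x(btree_node_bkey_out_of_order,	57, FSCK_AUTOFIX)	\
	x(btree_root_bkey_invalid,	58, FSCK_AUTOFIX)

enum bch_sb_error_id_example {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
	BCH_SB_ERRS_EXAMPLE()
#undef x
};

/* expands to:
 *	BCH_FSCK_ERR_btree_node_bkey_out_of_order = 57,
 *	BCH_FSCK_ERR_btree_root_bkey_invalid = 58,
 */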
diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h
deleted file mode 100644
index 40325239c3b0..000000000000
--- a/fs/bcachefs/sb-errors_types.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_ERRORS_TYPES_H
-#define _BCACHEFS_SB_ERRORS_TYPES_H
-
-#include "darray.h"
-
-struct bch_sb_error_entry_cpu {
- u64 id:16,
- nr:48;
- u64 last_error_time;
-};
-
-typedef DARRAY(struct bch_sb_error_entry_cpu) bch_sb_errors_cpu;
-
-#endif /* _BCACHEFS_SB_ERRORS_TYPES_H */
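Illustrative only: the cpu-side entry above mirrors the packed on-disk struct bch_sb_field_error_entry, whose v word carries the error id in bits 0-15 and the occurrence count in bits 16-63 (per the BCH_SB_ERROR_ENTRY_ID/NR bitmasks). A hedged sketch of the unpacking; the helper name is hypothetical:

/* Hypothetical helper: unpack one on-disk error entry into its cpu form. */
static struct bch_sb_error_entry_cpu
sb_error_entry_to_cpu(const struct bch_sb_field_error_entry *e)
{
	u64 v = le64_to_cpu(e->v);

	return (struct bch_sb_error_entry_cpu) {
		.id			= v & GENMASK_ULL(15, 0),	/* bits 0-15 */
		.nr			= v >> 16,			/* bits 16-63 */
		.last_error_time	= le64_to_cpu(e->last_error_time),
	};
}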
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
deleted file mode 100644
index 6245e342a8a8..000000000000
--- a/fs/bcachefs/sb-members.c
+++ /dev/null
@@ -1,606 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "disk_groups.h"
-#include "error.h"
-#include "opts.h"
-#include "recovery_passes.h"
-#include "replicas.h"
-#include "sb-members.h"
-#include "super-io.h"
-
-int bch2_dev_missing_bkey(struct bch_fs *c, struct bkey_s_c k, unsigned dev)
-{
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
-
- prt_printf(&buf, "pointer to nonexistent device %u in key\n", dev);
- bch2_bkey_val_to_text(&buf, c, k);
-
- bool print = bch2_count_fsck_err(c, ptr_to_invalid_device, &buf);
-
- int ret = bch2_run_explicit_recovery_pass(c, &buf,
- BCH_RECOVERY_PASS_check_allocations, 0);
-
- if (print)
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- return ret;
-}
-
-void bch2_dev_missing_atomic(struct bch_fs *c, unsigned dev)
-{
- if (dev != BCH_SB_MEMBER_INVALID)
- bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev);
-}
-
-void bch2_dev_bucket_missing(struct bch_dev *ca, u64 bucket)
-{
- bch2_fs_inconsistent(ca->fs,
- "pointer to nonexistent bucket %llu on device %s (valid range %u-%llu)",
- bucket, ca->name, ca->mi.first_bucket, ca->mi.nbuckets);
-}
-
-#define x(t, n, ...) [n] = #t,
-static const char * const bch2_iops_measurements[] = {
- BCH_IOPS_MEASUREMENTS()
- NULL
-};
-
-char * const bch2_member_error_strs[] = {
- BCH_MEMBER_ERROR_TYPES()
- NULL
-};
-#undef x
-
-/* Code for bch_sb_field_members_v1: */
-
-struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i)
-{
- return __bch2_members_v2_get_mut(bch2_sb_field_get(sb, members_v2), i);
-}
-
-static struct bch_member members_v2_get(struct bch_sb_field_members_v2 *mi, int i)
-{
- struct bch_member ret, *p = __bch2_members_v2_get_mut(mi, i);
- memset(&ret, 0, sizeof(ret));
- memcpy(&ret, p, min_t(size_t, le16_to_cpu(mi->member_bytes), sizeof(ret)));
- return ret;
-}
-
-static struct bch_member *members_v1_get_mut(struct bch_sb_field_members_v1 *mi, int i)
-{
- return (void *) mi->_members + (i * BCH_MEMBER_V1_BYTES);
-}
-
-static struct bch_member members_v1_get(struct bch_sb_field_members_v1 *mi, int i)
-{
- struct bch_member ret, *p = members_v1_get_mut(mi, i);
- memset(&ret, 0, sizeof(ret));
- memcpy(&ret, p, min_t(size_t, BCH_MEMBER_V1_BYTES, sizeof(ret)));
- return ret;
-}
-
-struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i)
-{
- struct bch_sb_field_members_v2 *mi2 = bch2_sb_field_get(sb, members_v2);
- if (mi2)
- return members_v2_get(mi2, i);
- struct bch_sb_field_members_v1 *mi1 = bch2_sb_field_get(sb, members_v1);
- return members_v1_get(mi1, i);
-}
-
-static int sb_members_v2_resize_entries(struct bch_fs *c)
-{
- struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
-
- if (le16_to_cpu(mi->member_bytes) < sizeof(struct bch_member)) {
- unsigned u64s = DIV_ROUND_UP((sizeof(*mi) + sizeof(mi->_members[0]) *
- c->disk_sb.sb->nr_devices), 8);
-
- mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
- if (!mi)
- return bch_err_throw(c, ENOSPC_sb_members_v2);
-
- for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) {
- void *dst = (void *) mi->_members + (i * sizeof(struct bch_member));
- memmove(dst, __bch2_members_v2_get_mut(mi, i), le16_to_cpu(mi->member_bytes));
- memset(dst + le16_to_cpu(mi->member_bytes),
- 0, (sizeof(struct bch_member) - le16_to_cpu(mi->member_bytes)));
- }
- mi->member_bytes = cpu_to_le16(sizeof(struct bch_member));
- }
- return 0;
-}
-
-int bch2_sb_members_v2_init(struct bch_fs *c)
-{
- struct bch_sb_field_members_v1 *mi1;
- struct bch_sb_field_members_v2 *mi2;
-
- if (!bch2_sb_field_get(c->disk_sb.sb, members_v2)) {
- mi2 = bch2_sb_field_resize(&c->disk_sb, members_v2,
- DIV_ROUND_UP(sizeof(*mi2) +
- sizeof(struct bch_member) * c->sb.nr_devices,
- sizeof(u64)));
- mi1 = bch2_sb_field_get(c->disk_sb.sb, members_v1);
- memcpy(&mi2->_members[0], &mi1->_members[0],
- BCH_MEMBER_V1_BYTES * c->sb.nr_devices);
- memset(&mi2->pad[0], 0, sizeof(mi2->pad));
- mi2->member_bytes = cpu_to_le16(BCH_MEMBER_V1_BYTES);
- }
-
- return sb_members_v2_resize_entries(c);
-}
-
-int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb)
-{
- struct bch_sb_field_members_v1 *mi1;
- struct bch_sb_field_members_v2 *mi2;
-
- if (BCH_SB_VERSION_INCOMPAT(disk_sb->sb) > bcachefs_metadata_version_extent_flags) {
- bch2_sb_field_resize(disk_sb, members_v1, 0);
- return 0;
- }
-
- mi1 = bch2_sb_field_resize(disk_sb, members_v1,
- DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES *
- disk_sb->sb->nr_devices, sizeof(u64)));
- if (!mi1)
- return -BCH_ERR_ENOSPC_sb_members;
-
- mi2 = bch2_sb_field_get(disk_sb->sb, members_v2);
-
- for (unsigned i = 0; i < disk_sb->sb->nr_devices; i++)
- memcpy(members_v1_get_mut(mi1, i), __bch2_members_v2_get_mut(mi2, i), BCH_MEMBER_V1_BYTES);
-
- return 0;
-}
-
-static int validate_member(struct printbuf *err,
- struct bch_member m,
- struct bch_sb *sb,
- int i)
-{
- if (le64_to_cpu(m.nbuckets) > BCH_MEMBER_NBUCKETS_MAX) {
- prt_printf(err, "device %u: too many buckets (got %llu, max %u)",
- i, le64_to_cpu(m.nbuckets), BCH_MEMBER_NBUCKETS_MAX);
- return -BCH_ERR_invalid_sb_members;
- }
-
- if (le64_to_cpu(m.nbuckets) -
- le16_to_cpu(m.first_bucket) < BCH_MIN_NR_NBUCKETS) {
- prt_printf(err, "device %u: not enough buckets (got %llu, max %u)",
- i, le64_to_cpu(m.nbuckets), BCH_MIN_NR_NBUCKETS);
- return -BCH_ERR_invalid_sb_members;
- }
-
- if (le16_to_cpu(m.bucket_size) <
- le16_to_cpu(sb->block_size)) {
- prt_printf(err, "device %u: bucket size %u smaller than block size %u",
- i, le16_to_cpu(m.bucket_size), le16_to_cpu(sb->block_size));
- return -BCH_ERR_invalid_sb_members;
- }
-
- if (le16_to_cpu(m.bucket_size) <
- BCH_SB_BTREE_NODE_SIZE(sb)) {
- prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu",
- i, le16_to_cpu(m.bucket_size), BCH_SB_BTREE_NODE_SIZE(sb));
- return -BCH_ERR_invalid_sb_members;
- }
-
- if (m.btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX) {
- prt_printf(err, "device %u: invalid btree_bitmap_shift %u", i, m.btree_bitmap_shift);
- return -BCH_ERR_invalid_sb_members;
- }
-
- if (BCH_MEMBER_FREESPACE_INITIALIZED(&m) &&
- sb->features[0] & cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info))) {
- prt_printf(err, "device %u: freespace initialized but fs has no alloc info", i);
- return -BCH_ERR_invalid_sb_members;
- }
-
- return 0;
-}
-
-static void member_to_text(struct printbuf *out,
- struct bch_member m,
- struct bch_sb_field_disk_groups *gi,
- struct bch_sb *sb,
- int i)
-{
- unsigned data_have = bch2_sb_dev_has_data(sb, i);
- u64 bucket_size = le16_to_cpu(m.bucket_size);
- u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size;
-
- if (!bch2_member_alive(&m))
- return;
-
- prt_printf(out, "Device:\t%u\n", i);
-
- printbuf_indent_add(out, 2);
-
- prt_printf(out, "Label:\t");
- if (BCH_MEMBER_GROUP(&m))
- bch2_disk_path_to_text_sb(out, sb,
- BCH_MEMBER_GROUP(&m) - 1);
- else
- prt_printf(out, "(none)");
- prt_newline(out);
-
- prt_printf(out, "UUID:\t");
- pr_uuid(out, m.uuid.b);
- prt_newline(out);
-
- prt_printf(out, "Size:\t");
- prt_units_u64(out, device_size << 9);
- prt_newline(out);
-
- for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++)
- prt_printf(out, "%s errors:\t%llu\n", bch2_member_error_strs[i], le64_to_cpu(m.errors[i]));
-
- for (unsigned i = 0; i < BCH_IOPS_NR; i++)
- prt_printf(out, "%s iops:\t%u\n", bch2_iops_measurements[i], le32_to_cpu(m.iops[i]));
-
- prt_printf(out, "Bucket size:\t");
- prt_units_u64(out, bucket_size << 9);
- prt_newline(out);
-
- prt_printf(out, "First bucket:\t%u\n", le16_to_cpu(m.first_bucket));
- prt_printf(out, "Buckets:\t%llu\n", le64_to_cpu(m.nbuckets));
-
- prt_printf(out, "Last mount:\t");
- if (m.last_mount)
- bch2_prt_datetime(out, le64_to_cpu(m.last_mount));
- else
- prt_printf(out, "(never)");
- prt_newline(out);
-
- prt_printf(out, "Last superblock write:\t%llu\n", le64_to_cpu(m.seq));
-
- prt_printf(out, "State:\t%s\n",
- BCH_MEMBER_STATE(&m) < BCH_MEMBER_STATE_NR
- ? bch2_member_states[BCH_MEMBER_STATE(&m)]
- : "unknown");
-
- prt_printf(out, "Data allowed:\t");
- if (BCH_MEMBER_DATA_ALLOWED(&m))
- prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m));
- else
- prt_printf(out, "(none)");
- prt_newline(out);
-
- prt_printf(out, "Has data:\t");
- if (data_have)
- prt_bitflags(out, __bch2_data_types, data_have);
- else
- prt_printf(out, "(none)");
- prt_newline(out);
-
- prt_printf(out, "Btree allocated bitmap blocksize:\t");
- if (m.btree_bitmap_shift < 64)
- prt_units_u64(out, 1ULL << m.btree_bitmap_shift);
- else
- prt_printf(out, "(invalid shift %u)", m.btree_bitmap_shift);
- prt_newline(out);
-
- prt_printf(out, "Btree allocated bitmap:\t");
- bch2_prt_u64_base2_nbits(out, le64_to_cpu(m.btree_allocated_bitmap), 64);
- prt_newline(out);
-
- prt_printf(out, "Durability:\t%llu\n", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1);
-
- prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(&m));
- prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(&m));
- prt_printf(out, "Resize on mount:\t%llu\n", BCH_MEMBER_RESIZE_ON_MOUNT(&m));
-
- printbuf_indent_sub(out, 2);
-}
-
-static int bch2_sb_members_v1_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1);
- unsigned i;
-
- if ((void *) members_v1_get_mut(mi, sb->nr_devices) > vstruct_end(&mi->field)) {
- prt_printf(err, "too many devices for section size");
- return -BCH_ERR_invalid_sb_members;
- }
-
- for (i = 0; i < sb->nr_devices; i++) {
- struct bch_member m = members_v1_get(mi, i);
-
- int ret = validate_member(err, m, sb, i);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1);
- struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups);
-
- if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) {
- prt_printf(out, "field ends before start of entries");
- return;
- }
-
- unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / sizeof(mi->_members[0]);
- if (nr != sb->nr_devices)
- prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices);
-
- for (unsigned i = 0; i < min(sb->nr_devices, nr); i++)
- member_to_text(out, members_v1_get(mi, i), gi, sb, i);
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_members_v1 = {
- .validate = bch2_sb_members_v1_validate,
- .to_text = bch2_sb_members_v1_to_text,
-};
-
-static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2);
- struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups);
-
- if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) {
- prt_printf(out, "field ends before start of entries");
- return;
- }
-
- if (!le16_to_cpu(mi->member_bytes)) {
- prt_printf(out, "member_bytes 0");
- return;
- }
-
- unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / le16_to_cpu(mi->member_bytes);
- if (nr != sb->nr_devices)
- prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices);
-
- /*
- * We call to_text() on superblock sections that haven't passed
- * validate, so we can't trust sb->nr_devices.
- */
-
- for (unsigned i = 0; i < min(sb->nr_devices, nr); i++)
- member_to_text(out, members_v2_get(mi, i), gi, sb, i);
-}
-
-static int bch2_sb_members_v2_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2);
- size_t mi_bytes = (void *) __bch2_members_v2_get_mut(mi, sb->nr_devices) -
- (void *) mi;
-
- if (mi_bytes > vstruct_bytes(&mi->field)) {
- prt_printf(err, "section too small (%zu > %zu)",
- mi_bytes, vstruct_bytes(&mi->field));
- return -BCH_ERR_invalid_sb_members;
- }
-
- for (unsigned i = 0; i < sb->nr_devices; i++) {
- int ret = validate_member(err, members_v2_get(mi, i), sb, i);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_members_v2 = {
- .validate = bch2_sb_members_v2_validate,
- .to_text = bch2_sb_members_v2_to_text,
-};
-
-void bch2_sb_members_from_cpu(struct bch_fs *c)
-{
- struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
-
- guard(rcu)();
- for_each_member_device_rcu(c, ca, NULL) {
- struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx);
-
- for (unsigned e = 0; e < BCH_MEMBER_ERROR_NR; e++)
- m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e]));
- }
-}
-
-void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
- struct bch_member m;
-
- mutex_lock(&ca->fs->sb_lock);
- m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx);
- mutex_unlock(&ca->fs->sb_lock);
-
- printbuf_tabstop_push(out, 12);
-
- prt_str(out, "IO errors since filesystem creation");
- prt_newline(out);
-
- printbuf_indent_add(out, 2);
- for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++)
- prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], atomic64_read(&ca->errors[i]));
- printbuf_indent_sub(out, 2);
-
- prt_str(out, "IO errors since ");
- bch2_pr_time_units(out, (ktime_get_real_seconds() - le64_to_cpu(m.errors_reset_time)) * NSEC_PER_SEC);
- prt_str(out, " ago");
- prt_newline(out);
-
- printbuf_indent_add(out, 2);
- for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++)
- prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i],
- atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i]));
- printbuf_indent_sub(out, 2);
-}
-
-void bch2_dev_errors_reset(struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
- struct bch_member *m;
-
- mutex_lock(&c->sb_lock);
- m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
- for (unsigned i = 0; i < ARRAY_SIZE(m->errors_at_reset); i++)
- m->errors_at_reset[i] = cpu_to_le64(atomic64_read(&ca->errors[i]));
- m->errors_reset_time = cpu_to_le64(ktime_get_real_seconds());
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-}
-
-/*
- * Per member "range has btree nodes" bitmap:
- *
- * This is so that if we ever have to run the btree node scan to repair we don't
- * have to scan full devices:
- */
-
-bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k)
-{
- guard(rcu)();
- bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- if (ca &&
- !bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c)))
- return false;
- }
- return true;
-}
-
-static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev,
- u64 start, unsigned sectors)
-{
- struct bch_member *m = __bch2_members_v2_get_mut(mi, dev);
- u64 bitmap = le64_to_cpu(m->btree_allocated_bitmap);
-
- u64 end = start + sectors;
-
- int resize = ilog2(roundup_pow_of_two(end)) - (m->btree_bitmap_shift + 6);
- if (resize > 0) {
- u64 new_bitmap = 0;
-
- for (unsigned i = 0; i < 64; i++)
- if (bitmap & BIT_ULL(i))
- new_bitmap |= BIT_ULL(i >> resize);
- bitmap = new_bitmap;
- m->btree_bitmap_shift += resize;
- }
-
- BUG_ON(m->btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX);
- BUG_ON(end > 64ULL << m->btree_bitmap_shift);
-
- for (unsigned bit = start >> m->btree_bitmap_shift;
- (u64) bit << m->btree_bitmap_shift < end;
- bit++)
- bitmap |= BIT_ULL(bit);
-
- m->btree_allocated_bitmap = cpu_to_le64(bitmap);
-}
-
-void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k)
-{
- lockdep_assert_held(&c->sb_lock);
-
- struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
- bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
- if (!bch2_member_exists(c->disk_sb.sb, ptr->dev))
- continue;
-
- __bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c));
- }
-}
-
-unsigned bch2_sb_nr_devices(const struct bch_sb *sb)
-{
- unsigned nr = 0;
-
- for (unsigned i = 0; i < sb->nr_devices; i++)
- nr += bch2_member_exists((struct bch_sb *) sb, i);
- return nr;
-}
-
-int bch2_sb_member_alloc(struct bch_fs *c)
-{
- unsigned dev_idx = c->sb.nr_devices;
- struct bch_sb_field_members_v2 *mi;
- unsigned nr_devices;
- unsigned u64s;
- int best = -1;
- u64 best_last_mount = 0;
- unsigned nr_deleted = 0;
-
- if (dev_idx < BCH_SB_MEMBERS_MAX)
- goto have_slot;
-
- for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) {
- /* eventually BCH_SB_MEMBERS_MAX will be raised */
- if (dev_idx == BCH_SB_MEMBER_INVALID)
- continue;
-
- struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
-
- nr_deleted += uuid_equal(&m.uuid, &BCH_SB_MEMBER_DELETED_UUID);
-
- if (!bch2_is_zero(&m.uuid, sizeof(m.uuid)))
- continue;
-
- u64 last_mount = le64_to_cpu(m.last_mount);
- if (best < 0 || last_mount < best_last_mount) {
- best = dev_idx;
- best_last_mount = last_mount;
- }
- }
- if (best >= 0) {
- dev_idx = best;
- goto have_slot;
- }
-
- if (nr_deleted)
- bch_err(c, "unable to allocate new member, but have %u deleted: run fsck",
- nr_deleted);
-
- return -BCH_ERR_ENOSPC_sb_members;
-have_slot:
- nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
-
- mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
- u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) +
- le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64));
-
- mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
- if (!mi)
- return -BCH_ERR_ENOSPC_sb_members;
-
- c->disk_sb.sb->nr_devices = nr_devices;
- return dev_idx;
-}
-
-void bch2_sb_members_clean_deleted(struct bch_fs *c)
-{
- mutex_lock(&c->sb_lock);
- bool write_sb = false;
-
- for (unsigned i = 0; i < c->sb.nr_devices; i++) {
- struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, i);
-
- if (uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID)) {
- memset(&m->uuid, 0, sizeof(m->uuid));
- write_sb = true;
- }
- }
-
- if (write_sb)
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-}
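A note on the coarsening step in __bch2_dev_btree_bitmap_mark() above: when a newly marked range ends past what 64 bits at the current granularity can cover, every set bit i is folded into bit i >> resize and btree_bitmap_shift grows, trading precision for range. A standalone restatement of just that step (simplified, no superblock types; the function name is illustrative):

/* Grow the granularity of a 64-bit allocation bitmap until it can cover
 * sector addresses up to 'end'; bit i then covers [i << shift, (i + 1) << shift).
 */
static void bitmap_coarsen(u64 *bitmap, unsigned *shift, u64 end)
{
	int resize = ilog2(roundup_pow_of_two(end)) - (*shift + 6);

	if (resize > 0) {
		u64 new_bitmap = 0;

		for (unsigned i = 0; i < 64; i++)
			if (*bitmap & BIT_ULL(i))
				new_bitmap |= BIT_ULL(i >> resize);

		*bitmap = new_bitmap;
		*shift += resize;
	}
}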
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
deleted file mode 100644
index 8d8a8a857648..000000000000
--- a/fs/bcachefs/sb-members.h
+++ /dev/null
@@ -1,377 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_MEMBERS_H
-#define _BCACHEFS_SB_MEMBERS_H
-
-#include "darray.h"
-#include "bkey_types.h"
-#include "enumerated_ref.h"
-
-extern char * const bch2_member_error_strs[];
-
-static inline struct bch_member *
-__bch2_members_v2_get_mut(struct bch_sb_field_members_v2 *mi, unsigned i)
-{
- return (void *) mi->_members + (i * le16_to_cpu(mi->member_bytes));
-}
-
-int bch2_sb_members_v2_init(struct bch_fs *c);
-int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb);
-struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i);
-struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i);
-
-static inline bool bch2_dev_is_online(struct bch_dev *ca)
-{
- return !enumerated_ref_is_zero(&ca->io_ref[READ]);
-}
-
-static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned);
-
-static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev)
-{
- guard(rcu)();
- struct bch_dev *ca = bch2_dev_rcu(c, dev);
- return ca && bch2_dev_is_online(ca);
-}
-
-static inline bool bch2_dev_is_healthy(struct bch_dev *ca)
-{
- return bch2_dev_is_online(ca) &&
- ca->mi.state != BCH_MEMBER_STATE_failed;
-}
-
-static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
-{
- return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
-}
-
-static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
- unsigned dev)
-{
- darray_for_each(devs, i)
- if (*i == dev)
- return true;
- return false;
-}
-
-static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
- unsigned dev)
-{
- darray_for_each(*devs, i)
- if (*i == dev) {
- darray_remove_item(devs, i);
- return;
- }
-}
-
-static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
- unsigned dev)
-{
- if (!bch2_dev_list_has_dev(*devs, dev)) {
- BUG_ON(devs->nr >= ARRAY_SIZE(devs->data));
- devs->data[devs->nr++] = dev;
- }
-}
-
-static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
-{
- return (struct bch_devs_list) { .nr = 1, .data[0] = dev };
-}
-
-static inline struct bch_dev *__bch2_next_dev_idx(struct bch_fs *c, unsigned idx,
- const struct bch_devs_mask *mask)
-{
- struct bch_dev *ca = NULL;
-
- while ((idx = mask
- ? find_next_bit(mask->d, c->sb.nr_devices, idx)
- : idx) < c->sb.nr_devices &&
- !(ca = rcu_dereference_check(c->devs[idx],
- lockdep_is_held(&c->state_lock))))
- idx++;
-
- return ca;
-}
-
-static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev *ca,
- const struct bch_devs_mask *mask)
-{
- return __bch2_next_dev_idx(c, ca ? ca->dev_idx + 1 : 0, mask);
-}
-
-#define for_each_member_device_rcu(_c, _ca, _mask) \
- for (struct bch_dev *_ca = NULL; \
- (_ca = __bch2_next_dev((_c), _ca, (_mask)));)
-
-#define for_each_online_member_rcu(_c, _ca) \
- for_each_member_device_rcu(_c, _ca, &(_c)->online_devs)
-
-#define for_each_rw_member_rcu(_c, _ca) \
- for_each_member_device_rcu(_c, _ca, &(_c)->rw_devs[BCH_DATA_free])
-
-static inline void bch2_dev_get(struct bch_dev *ca)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- BUG_ON(atomic_long_inc_return(&ca->ref) <= 1L);
-#else
- percpu_ref_get(&ca->ref);
-#endif
-}
-
-static inline void __bch2_dev_put(struct bch_dev *ca)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- long r = atomic_long_dec_return(&ca->ref);
- if (r < (long) !ca->dying)
- panic("bch_dev->ref underflow, last put: %pS\n", (void *) ca->last_put);
- ca->last_put = _THIS_IP_;
- if (!r)
- complete(&ca->ref_completion);
-#else
- percpu_ref_put(&ca->ref);
-#endif
-}
-
-static inline void bch2_dev_put(struct bch_dev *ca)
-{
- if (ca)
- __bch2_dev_put(ca);
-}
-
-static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca)
-{
- guard(rcu)();
- bch2_dev_put(ca);
- if ((ca = __bch2_next_dev(c, ca, NULL)))
- bch2_dev_get(ca);
- return ca;
-}
-
-/*
- * If you break early, you must drop your ref on the current device
- */
-#define __for_each_member_device(_c, _ca) \
- for (; (_ca = bch2_get_next_dev(_c, _ca));)
-
-#define for_each_member_device(_c, _ca) \
- for (struct bch_dev *_ca = NULL; \
- (_ca = bch2_get_next_dev(_c, _ca));)
-
-static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
- struct bch_dev *ca,
- unsigned state_mask,
- int rw, unsigned ref_idx)
-{
- guard(rcu)();
- if (ca)
- enumerated_ref_put(&ca->io_ref[rw], ref_idx);
-
- while ((ca = __bch2_next_dev(c, ca, NULL)) &&
- (!((1 << ca->mi.state) & state_mask) ||
- !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx)))
- ;
-
- return ca;
-}
-
-#define __for_each_online_member(_c, _ca, state_mask, rw, ref_idx) \
- for (struct bch_dev *_ca = NULL; \
- (_ca = bch2_get_next_online_dev(_c, _ca, state_mask, rw, ref_idx));)
-
-#define for_each_online_member(c, ca, ref_idx) \
- __for_each_online_member(c, ca, ~0, READ, ref_idx)
-
-#define for_each_rw_member(c, ca, ref_idx) \
- __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), WRITE, ref_idx)
-
-#define for_each_readable_member(c, ca, ref_idx) \
- __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro), READ, ref_idx)
-
-static inline bool bch2_dev_exists(const struct bch_fs *c, unsigned dev)
-{
- return dev < c->sb.nr_devices && c->devs[dev];
-}
-
-static inline bool bucket_valid(const struct bch_dev *ca, u64 b)
-{
- return b - ca->mi.first_bucket < ca->mi.nbuckets_minus_first;
-}
-
-static inline struct bch_dev *bch2_dev_have_ref(const struct bch_fs *c, unsigned dev)
-{
- EBUG_ON(!bch2_dev_exists(c, dev));
-
- return rcu_dereference_check(c->devs[dev], 1);
-}
-
-static inline struct bch_dev *bch2_dev_locked(struct bch_fs *c, unsigned dev)
-{
- EBUG_ON(!bch2_dev_exists(c, dev));
-
- return rcu_dereference_protected(c->devs[dev],
- lockdep_is_held(&c->sb_lock) ||
- lockdep_is_held(&c->state_lock));
-}
-
-static inline struct bch_dev *bch2_dev_rcu_noerror(struct bch_fs *c, unsigned dev)
-{
- return c && dev < c->sb.nr_devices
- ? rcu_dereference(c->devs[dev])
- : NULL;
-}
-
-int bch2_dev_missing_bkey(struct bch_fs *, struct bkey_s_c, unsigned);
-
-void bch2_dev_missing_atomic(struct bch_fs *, unsigned);
-
-static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev)
-{
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
- if (unlikely(!ca))
- bch2_dev_missing_atomic(c, dev);
- return ca;
-}
-
-static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev)
-{
- guard(rcu)();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
- if (ca)
- bch2_dev_get(ca);
- return ca;
-}
-
-static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev)
-{
- struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev);
- if (unlikely(!ca))
- bch2_dev_missing_atomic(c, dev);
- return ca;
-}
-
-static inline struct bch_dev *bch2_dev_bucket_tryget_noerror(struct bch_fs *c, struct bpos bucket)
-{
- struct bch_dev *ca = bch2_dev_tryget_noerror(c, bucket.inode);
- if (ca && unlikely(!bucket_valid(ca, bucket.offset))) {
- bch2_dev_put(ca);
- ca = NULL;
- }
- return ca;
-}
-
-void bch2_dev_bucket_missing(struct bch_dev *, u64);
-
-static inline struct bch_dev *bch2_dev_bucket_tryget(struct bch_fs *c, struct bpos bucket)
-{
- struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode);
- if (ca && unlikely(!bucket_valid(ca, bucket.offset))) {
- bch2_dev_bucket_missing(ca, bucket.offset);
- bch2_dev_put(ca);
- ca = NULL;
- }
- return ca;
-}
-
-static inline struct bch_dev *bch2_dev_iterate_noerror(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx)
-{
- if (ca && ca->dev_idx == dev_idx)
- return ca;
- bch2_dev_put(ca);
- return bch2_dev_tryget_noerror(c, dev_idx);
-}
-
-static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx)
-{
- if (ca && ca->dev_idx == dev_idx)
- return ca;
- bch2_dev_put(ca);
- return bch2_dev_tryget(c, dev_idx);
-}
-
-static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev,
- int rw, unsigned ref_idx)
-{
- might_sleep();
-
- guard(rcu)();
- struct bch_dev *ca = bch2_dev_rcu(c, dev);
- if (!ca || !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx))
- return NULL;
-
- if (ca->mi.state == BCH_MEMBER_STATE_rw ||
- (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))
- return ca;
-
- enumerated_ref_put(&ca->io_ref[rw], ref_idx);
- return NULL;
-}
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1;
-extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2;
-
-static inline bool bch2_member_alive(struct bch_member *m)
-{
- return !bch2_is_zero(&m->uuid, sizeof(m->uuid)) &&
- !uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID);
-}
-
-static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev)
-{
- if (dev < sb->nr_devices) {
- struct bch_member m = bch2_sb_member_get(sb, dev);
- return bch2_member_alive(&m);
- }
- return false;
-}
-
-unsigned bch2_sb_nr_devices(const struct bch_sb *);
-
-static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
-{
- return (struct bch_member_cpu) {
- .nbuckets = le64_to_cpu(mi->nbuckets),
- .nbuckets_minus_first = le64_to_cpu(mi->nbuckets) -
- le16_to_cpu(mi->first_bucket),
- .first_bucket = le16_to_cpu(mi->first_bucket),
- .bucket_size = le16_to_cpu(mi->bucket_size),
- .group = BCH_MEMBER_GROUP(mi),
- .state = BCH_MEMBER_STATE(mi),
- .discard = BCH_MEMBER_DISCARD(mi),
- .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi),
- .durability = BCH_MEMBER_DURABILITY(mi)
- ? BCH_MEMBER_DURABILITY(mi) - 1
- : 1,
- .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
- .resize_on_mount = BCH_MEMBER_RESIZE_ON_MOUNT(mi),
- .valid = bch2_member_alive(mi),
- .btree_bitmap_shift = mi->btree_bitmap_shift,
- .btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap),
- };
-}
-
-void bch2_sb_members_from_cpu(struct bch_fs *);
-
-void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *);
-void bch2_dev_errors_reset(struct bch_dev *);
-
-static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 start, unsigned sectors)
-{
- u64 end = start + sectors;
-
- if (end > 64ULL << ca->mi.btree_bitmap_shift)
- return false;
-
- for (unsigned bit = start >> ca->mi.btree_bitmap_shift;
- (u64) bit << ca->mi.btree_bitmap_shift < end;
- bit++)
- if (!(ca->mi.btree_allocated_bitmap & BIT_ULL(bit)))
- return false;
- return true;
-}
-
-bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c);
-void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c);
-
-int bch2_sb_member_alloc(struct bch_fs *);
-void bch2_sb_members_clean_deleted(struct bch_fs *);
-
-#endif /* _BCACHEFS_SB_MEMBERS_H */
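A usage note on the iteration helpers above: for_each_member_device() holds a reference on the current device for each pass of the loop, so (per the comment on __for_each_member_device) anything that leaves the loop early still owns that reference and must eventually drop it. A hedged sketch; the find-by-index wrapper is hypothetical:

/* Hypothetical: return the device with the given index, with a reference held.
 * The caller is responsible for bch2_dev_put() on the result.
 */
static struct bch_dev *example_find_dev(struct bch_fs *c, unsigned want_idx)
{
	for_each_member_device(c, ca)
		if (ca->dev_idx == want_idx)
			return ca;		/* keep the ref taken by the loop */

	return NULL;				/* loop ran to completion: no ref held */
}

Exiting the loop normally leaves nothing to drop, because bch2_get_next_dev() only returns NULL after putting the previous device.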
diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h
deleted file mode 100644
index fb72ad730518..000000000000
--- a/fs/bcachefs/sb-members_format.h
+++ /dev/null
@@ -1,128 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_MEMBERS_FORMAT_H
-#define _BCACHEFS_SB_MEMBERS_FORMAT_H
-
-/*
- * We refer to members with bitmasks in various places - but we need to get rid
- * of this limit:
- */
-#define BCH_SB_MEMBERS_MAX 64
-
-/*
- * Sentinel value - indicates a device that does not exist
- */
-#define BCH_SB_MEMBER_INVALID 255
-
-#define BCH_SB_MEMBER_DELETED_UUID \
- UUID_INIT(0xffffffff, 0xffff, 0xffff, \
- 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)
-
-#define BCH_MIN_NR_NBUCKETS (1 << 6)
-
-#define BCH_IOPS_MEASUREMENTS() \
- x(seqread, 0) \
- x(seqwrite, 1) \
- x(randread, 2) \
- x(randwrite, 3)
-
-enum bch_iops_measurement {
-#define x(t, n) BCH_IOPS_##t = n,
- BCH_IOPS_MEASUREMENTS()
-#undef x
- BCH_IOPS_NR
-};
-
-#define BCH_MEMBER_ERROR_TYPES() \
- x(read, 0) \
- x(write, 1) \
- x(checksum, 2)
-
-enum bch_member_error_type {
-#define x(t, n) BCH_MEMBER_ERROR_##t = n,
- BCH_MEMBER_ERROR_TYPES()
-#undef x
- BCH_MEMBER_ERROR_NR
-};
-
-struct bch_member {
- __uuid_t uuid;
- __le64 nbuckets; /* device size */
- __le16 first_bucket; /* index of first bucket used */
- __le16 bucket_size; /* sectors */
- __u8 btree_bitmap_shift;
- __u8 pad[3];
- __le64 last_mount; /* time_t */
-
- __le64 flags;
- __le32 iops[4];
- __le64 errors[BCH_MEMBER_ERROR_NR];
- __le64 errors_at_reset[BCH_MEMBER_ERROR_NR];
- __le64 errors_reset_time;
- __le64 seq;
- __le64 btree_allocated_bitmap;
- /*
- * On recovery from a clean shutdown we don't normally read the journal,
- * but we still want to resume writing from where we left off so we
- * don't overwrite more than is necessary, for list journal debugging:
- */
- __le32 last_journal_bucket;
- __le32 last_journal_bucket_offset;
-};
-
-/*
- * btree_allocated_bitmap must be able to cover any sector address that fits in a
- * u64; the bitmap itself has 64 bits, so the maximum shift is 64 - ilog2(64) = 58
- */
-#define BCH_MI_BTREE_BITMAP_SHIFT_MAX 58
-
-/*
- * This limit comes from the bucket_gens array - it's a single allocation, and
- * kernel allocation are limited to INT_MAX
- */
-#define BCH_MEMBER_NBUCKETS_MAX (INT_MAX - 64)
-
-#define BCH_MEMBER_V1_BYTES 56
-
-LE16_BITMASK(BCH_MEMBER_BUCKET_SIZE, struct bch_member, bucket_size, 0, 16)
-LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4)
-/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
-LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15)
-LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20)
-LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28)
-LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30)
-LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
- struct bch_member, flags, 30, 31)
-LE64_BITMASK(BCH_MEMBER_RESIZE_ON_MOUNT,
- struct bch_member, flags, 31, 32)
-
-#if 0
-LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20);
-LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
-#endif
-
-#define BCH_MEMBER_STATES() \
- x(rw, 0) \
- x(ro, 1) \
- x(failed, 2) \
- x(spare, 3)
-
-enum bch_member_state {
-#define x(t, n) BCH_MEMBER_STATE_##t = n,
- BCH_MEMBER_STATES()
-#undef x
- BCH_MEMBER_STATE_NR
-};
-
-struct bch_sb_field_members_v1 {
- struct bch_sb_field field;
- struct bch_member _members[]; //Members are now variable size
-};
-
-struct bch_sb_field_members_v2 {
- struct bch_sb_field field;
- __le16 member_bytes; //size of single member entry
- u8 pad[6];
- struct bch_member _members[];
-};
-
-#endif /* _BCACHEFS_SB_MEMBERS_FORMAT_H */
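One detail of the flags layout above that is easy to misread: BCH_MEMBER_DURABILITY is stored off by one, so a raw value of 0 means "unset, assume durability 1" and any nonzero value v means durability v - 1; this is how bch2_mi_to_cpu() and member_to_text() decode it. A small sketch of that decode (the helper name is illustrative):

/* Decode the on-disk durability field: 0 = unset (default to 1), else value - 1. */
static inline unsigned member_durability(const struct bch_member *m)
{
	u64 v = BCH_MEMBER_DURABILITY(m);

	return v ? v - 1 : 1;
}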
diff --git a/fs/bcachefs/sb-members_types.h b/fs/bcachefs/sb-members_types.h
deleted file mode 100644
index d6443e186872..000000000000
--- a/fs/bcachefs/sb-members_types.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_MEMBERS_TYPES_H
-#define _BCACHEFS_SB_MEMBERS_TYPES_H
-
-struct bch_member_cpu {
- u64 nbuckets; /* device size */
- u64 nbuckets_minus_first;
- u16 first_bucket; /* index of first bucket used */
- u16 bucket_size; /* sectors */
- u16 group;
- u8 state;
- u8 discard;
- u8 data_allowed;
- u8 durability;
- u8 freespace_initialized;
- u8 resize_on_mount;
- u8 valid;
- u8 btree_bitmap_shift;
- u64 btree_allocated_bitmap;
-};
-
-#endif /* _BCACHEFS_SB_MEMBERS_TYPES_H */
diff --git a/fs/bcachefs/seqmutex.h b/fs/bcachefs/seqmutex.h
deleted file mode 100644
index c4b3d8d3f414..000000000000
--- a/fs/bcachefs/seqmutex.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SEQMUTEX_H
-#define _BCACHEFS_SEQMUTEX_H
-
-#include <linux/mutex.h>
-
-struct seqmutex {
- struct mutex lock;
- u32 seq;
-};
-
-#define seqmutex_init(_lock) mutex_init(&(_lock)->lock)
-
-static inline bool seqmutex_trylock(struct seqmutex *lock)
-{
- return mutex_trylock(&lock->lock);
-}
-
-static inline void seqmutex_lock(struct seqmutex *lock)
-{
- mutex_lock(&lock->lock);
- lock->seq++;
-}
-
-static inline u32 seqmutex_unlock(struct seqmutex *lock)
-{
- u32 seq = lock->seq;
- mutex_unlock(&lock->lock);
- return seq;
-}
-
-static inline bool seqmutex_relock(struct seqmutex *lock, u32 seq)
-{
- if (lock->seq != seq || !mutex_trylock(&lock->lock))
- return false;
-
- if (lock->seq != seq) {
- mutex_unlock(&lock->lock);
- return false;
- }
-
- return true;
-}
-
-#endif /* _BCACHEFS_SEQMUTEX_H */
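The sequence counter above exists so a caller can drop the mutex, do work that cannot be done under it, and then cheaply detect whether anyone else took the lock in the meantime: seqmutex_relock() succeeds only if the sequence number is unchanged and the trylock succeeds. A hedged usage sketch; the walk being protected is hypothetical:

/* Hypothetical pattern: walk a structure protected by a seqmutex, restarting
 * from scratch if the lock was taken by someone else while we were away.
 */
static void example_walk(struct seqmutex *lock)
{
	u32 seq;
restart:
	seqmutex_lock(lock);
	/* ... walk part of the protected structure, remember our position ... */
	seq = seqmutex_unlock(lock);

	/* ... sleep, allocate, or otherwise work outside the lock ... */

	if (!seqmutex_relock(lock, seq))
		goto restart;		/* someone took the lock: saved position is stale */

	/* ... resume from the saved position ... */
	seqmutex_unlock(lock);
}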
diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c
deleted file mode 100644
index a1cc44e66c7e..000000000000
--- a/fs/bcachefs/siphash.c
+++ /dev/null
@@ -1,173 +0,0 @@
-// SPDX-License-Identifier: BSD-3-Clause
-/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */
-
-/*-
- * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the author may not be used to endorse or promote
- * products derived from this software without specific prior written
- * permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d
- * are the number of compression rounds and the number of finalization rounds.
- * A compression round is identical to a finalization round and this round
- * function is called SipRound. Given a 128-bit key k and a (possibly empty)
- * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m).
- *
- * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18,
- * by Jean-Philippe Aumasson and Daniel J. Bernstein,
- * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa
- * https://131002.net/siphash/siphash.pdf
- * https://131002.net/siphash/
- */
-
-#include <asm/byteorder.h>
-#include <linux/unaligned.h>
-#include <linux/bitops.h>
-#include <linux/string.h>
-
-#include "siphash.h"
-
-static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds)
-{
- while (rounds--) {
- ctx->v[0] += ctx->v[1];
- ctx->v[2] += ctx->v[3];
- ctx->v[1] = rol64(ctx->v[1], 13);
- ctx->v[3] = rol64(ctx->v[3], 16);
-
- ctx->v[1] ^= ctx->v[0];
- ctx->v[3] ^= ctx->v[2];
- ctx->v[0] = rol64(ctx->v[0], 32);
-
- ctx->v[2] += ctx->v[1];
- ctx->v[0] += ctx->v[3];
- ctx->v[1] = rol64(ctx->v[1], 17);
- ctx->v[3] = rol64(ctx->v[3], 21);
-
- ctx->v[1] ^= ctx->v[2];
- ctx->v[3] ^= ctx->v[0];
- ctx->v[2] = rol64(ctx->v[2], 32);
- }
-}
-
-static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds)
-{
- u64 m = get_unaligned_le64(ptr);
-
- ctx->v[3] ^= m;
- SipHash_Rounds(ctx, rounds);
- ctx->v[0] ^= m;
-}
-
-void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key)
-{
- u64 k0, k1;
-
- k0 = le64_to_cpu(key->k0);
- k1 = le64_to_cpu(key->k1);
-
- ctx->v[0] = 0x736f6d6570736575ULL ^ k0;
- ctx->v[1] = 0x646f72616e646f6dULL ^ k1;
- ctx->v[2] = 0x6c7967656e657261ULL ^ k0;
- ctx->v[3] = 0x7465646279746573ULL ^ k1;
-
- memset(ctx->buf, 0, sizeof(ctx->buf));
- ctx->bytes = 0;
-}
-
-void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf,
- const void *src, size_t len)
-{
- const u8 *ptr = src;
- size_t left, used;
-
- if (len == 0)
- return;
-
- used = ctx->bytes % sizeof(ctx->buf);
- ctx->bytes += len;
-
- if (used > 0) {
- left = sizeof(ctx->buf) - used;
-
- if (len >= left) {
- memcpy(&ctx->buf[used], ptr, left);
- SipHash_CRounds(ctx, ctx->buf, rc);
- len -= left;
- ptr += left;
- } else {
- memcpy(&ctx->buf[used], ptr, len);
- return;
- }
- }
-
- while (len >= sizeof(ctx->buf)) {
- SipHash_CRounds(ctx, ptr, rc);
- len -= sizeof(ctx->buf);
- ptr += sizeof(ctx->buf);
- }
-
- if (len > 0)
- memcpy(&ctx->buf[used], ptr, len);
-}
-
-void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf)
-{
- u64 r;
-
- r = SipHash_End(ctx, rc, rf);
-
- *((__le64 *) dst) = cpu_to_le64(r);
-}
-
-u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
-{
- u64 r;
- size_t left, used;
-
- used = ctx->bytes % sizeof(ctx->buf);
- left = sizeof(ctx->buf) - used;
- memset(&ctx->buf[used], 0, left - 1);
- ctx->buf[7] = ctx->bytes;
-
- SipHash_CRounds(ctx, ctx->buf, rc);
- ctx->v[2] ^= 0xff;
- SipHash_Rounds(ctx, rf);
-
- r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]);
- memset(ctx, 0, sizeof(*ctx));
- return r;
-}
-
-u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len)
-{
- SIPHASH_CTX ctx;
-
- SipHash_Init(&ctx, key);
- SipHash_Update(&ctx, rc, rf, src, len);
- return SipHash_End(&ctx, rc, rf);
-}
diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h
deleted file mode 100644
index 3dfaf34a43b2..000000000000
--- a/fs/bcachefs/siphash.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause */
-/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */
-/*-
- * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the author may not be used to endorse or promote
- * products derived from this software without specific prior written
- * permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-/*
- * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions)
- * optimized for speed on short messages returning a 64bit hash/digest value.
- *
- * The number of rounds is defined during the initialization:
- * SipHash24_Init() for the fast and reasonably strong version
- * SipHash48_Init() for the strong version (half as fast)
- *
- * SIPHASH_CTX ctx;
- * SIPHASH_KEY key;
- * SipHash24_Init(&ctx, &key);
- * SipHash24_Update(&ctx, pointer_to_string, length_of_string);
- * SipHash24_Final(output, &ctx);
- */
-
-#ifndef _SIPHASH_H_
-#define _SIPHASH_H_
-
-#include <linux/types.h>
-
-#define SIPHASH_BLOCK_LENGTH 8
-#define SIPHASH_KEY_LENGTH 16
-#define SIPHASH_DIGEST_LENGTH 8
-
-typedef struct _SIPHASH_CTX {
- u64 v[4];
- u8 buf[SIPHASH_BLOCK_LENGTH];
- u32 bytes;
-} SIPHASH_CTX;
-
-typedef struct {
- __le64 k0;
- __le64 k1;
-} SIPHASH_KEY;
-
-void SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *);
-void SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t);
-u64 SipHash_End(SIPHASH_CTX *, int, int);
-void SipHash_Final(void *, SIPHASH_CTX *, int, int);
-u64 SipHash(const SIPHASH_KEY *, int, int, const void *, size_t);
-
-#define SipHash24_Init(_c, _k) SipHash_Init((_c), (_k))
-#define SipHash24_Update(_c, _p, _l) SipHash_Update((_c), 2, 4, (_p), (_l))
-#define SipHash24_End(_d) SipHash_End((_d), 2, 4)
-#define SipHash24_Final(_d, _c) SipHash_Final((_d), (_c), 2, 4)
-#define SipHash24(_k, _p, _l) SipHash((_k), 2, 4, (_p), (_l))
-
-#define SipHash48_Init(_c, _k) SipHash_Init((_c), (_k))
-#define SipHash48_Update(_c, _p, _l) SipHash_Update((_c), 4, 8, (_p), (_l))
-#define SipHash48_End(_d) SipHash_End((_d), 4, 8)
-#define SipHash48_Final(_d, _c) SipHash_Final((_d), (_c), 4, 8)
-#define SipHash48(_k, _p, _l) SipHash((_k), 4, 8, (_p), (_l))
-
-#endif /* _SIPHASH_H_ */
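For completeness, the one-shot macros above pin (c, d) to (2, 4) or (4, 8) rounds. A hedged sketch of hashing a buffer with SipHash-2-4 through this API; the key bytes are placeholders, not a real key:

/* Illustrative only: hash a buffer with SipHash-2-4 using the one-shot helper. */
static u64 example_hash(const void *buf, size_t len)
{
	SIPHASH_KEY key = {
		.k0 = cpu_to_le64(0x0706050403020100ULL),	/* placeholder key material */
		.k1 = cpu_to_le64(0x0f0e0d0c0b0a0908ULL),
	};

	return SipHash24(&key, buf, len);
}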
diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c
deleted file mode 100644
index 538c324f4765..000000000000
--- a/fs/bcachefs/six.c
+++ /dev/null
@@ -1,878 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/export.h>
-#include <linux/log2.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/rcupdate.h>
-#include <linux/sched.h>
-#include <linux/sched/clock.h>
-#include <linux/sched/rt.h>
-#include <linux/sched/task.h>
-#include <linux/slab.h>
-
-#include <trace/events/lock.h>
-
-#include "six.h"
-
-#ifdef DEBUG
-#define EBUG_ON(cond) BUG_ON(cond)
-#else
-#define EBUG_ON(cond) do {} while (0)
-#endif
-
-#define six_acquire(l, t, r, ip) lock_acquire(l, 0, t, r, 1, NULL, ip)
-#define six_release(l, ip) lock_release(l, ip)
-
-static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type);
-
-#define SIX_LOCK_HELD_read_OFFSET 0
-#define SIX_LOCK_HELD_read ~(~0U << 26)
-#define SIX_LOCK_HELD_intent (1U << 26)
-#define SIX_LOCK_HELD_write (1U << 27)
-#define SIX_LOCK_WAITING_read (1U << (28 + SIX_LOCK_read))
-#define SIX_LOCK_WAITING_write (1U << (28 + SIX_LOCK_write))
-#define SIX_LOCK_NOSPIN (1U << 31)
-
-struct six_lock_vals {
- /* Value we add to the lock in order to take the lock: */
- u32 lock_val;
-
- /* If the lock has this value (used as a mask), taking the lock fails: */
- u32 lock_fail;
-
- /* Mask that indicates lock is held for this type: */
- u32 held_mask;
-
- /* Waitlist we wakeup when releasing the lock: */
- enum six_lock_type unlock_wakeup;
-};
-
-static const struct six_lock_vals l[] = {
- [SIX_LOCK_read] = {
- .lock_val = 1U << SIX_LOCK_HELD_read_OFFSET,
- .lock_fail = SIX_LOCK_HELD_write,
- .held_mask = SIX_LOCK_HELD_read,
- .unlock_wakeup = SIX_LOCK_write,
- },
- [SIX_LOCK_intent] = {
- .lock_val = SIX_LOCK_HELD_intent,
- .lock_fail = SIX_LOCK_HELD_intent,
- .held_mask = SIX_LOCK_HELD_intent,
- .unlock_wakeup = SIX_LOCK_intent,
- },
- [SIX_LOCK_write] = {
- .lock_val = SIX_LOCK_HELD_write,
- .lock_fail = SIX_LOCK_HELD_read,
- .held_mask = SIX_LOCK_HELD_write,
- .unlock_wakeup = SIX_LOCK_read,
- },
-};
-
-static inline void six_set_bitmask(struct six_lock *lock, u32 mask)
-{
- if ((atomic_read(&lock->state) & mask) != mask)
- atomic_or(mask, &lock->state);
-}
-
-static inline void six_clear_bitmask(struct six_lock *lock, u32 mask)
-{
- if (atomic_read(&lock->state) & mask)
- atomic_and(~mask, &lock->state);
-}
-
-static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
- u32 old, struct task_struct *owner)
-{
- if (type != SIX_LOCK_intent)
- return;
-
- if (!(old & SIX_LOCK_HELD_intent)) {
- EBUG_ON(lock->owner);
- lock->owner = owner;
- } else {
- EBUG_ON(lock->owner != current);
- }
-}
-
-static inline unsigned pcpu_read_count(struct six_lock *lock)
-{
- unsigned read_count = 0;
- int cpu;
-
- for_each_possible_cpu(cpu)
- read_count += *per_cpu_ptr(lock->readers, cpu);
- return read_count;
-}
-
-/*
- * __do_six_trylock() - main trylock routine
- *
- * Returns 1 on success, 0 on failure
- *
- * In percpu reader mode, a failed trylock may cause a spurious trylock failure
- * for anoter thread taking the competing lock type, and we may havve to do a
- * wakeup: when a wakeup is required, we return -1 - wakeup_type.
- */
-static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type,
- struct task_struct *task, bool try)
-{
- int ret;
- u32 old;
-
- EBUG_ON(type == SIX_LOCK_write && lock->owner != task);
- EBUG_ON(type == SIX_LOCK_write &&
- (try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write)));
-
- /*
- * Percpu reader mode:
- *
- * The basic idea behind this algorithm is that you can implement a lock
- * between two threads without any atomics, just memory barriers:
- *
- * For two threads you'll need two variables, one variable for "thread a
- * has the lock" and another for "thread b has the lock".
- *
- * To take the lock, a thread sets its variable indicating that it holds
- * the lock, then issues a full memory barrier, then reads from the
- * other thread's variable to check if the other thread thinks it has
- * the lock. If we raced, we backoff and retry/sleep.
- *
- * Failure to take the lock may cause a spurious trylock failure in
- * another thread, because we temporarily set the lock to indicate that
-	 * we held it. This would be a problem for a thread in six_lock() that
-	 * is calling trylock after adding itself to the waitlist and prior to
-	 * sleeping.
- *
- * Therefore, if we fail to get the lock, and there were waiters of the
- * type we conflict with, we will have to issue a wakeup.
- *
- * Since we may be called under wait_lock (and by the wakeup code
- * itself), we return that the wakeup has to be done instead of doing it
- * here.
- */
- if (type == SIX_LOCK_read && lock->readers) {
- preempt_disable();
- this_cpu_inc(*lock->readers); /* signal that we own lock */
-
- smp_mb();
-
- old = atomic_read(&lock->state);
- ret = !(old & l[type].lock_fail);
-
- this_cpu_sub(*lock->readers, !ret);
- preempt_enable();
-
- if (!ret) {
- smp_mb();
- if (atomic_read(&lock->state) & SIX_LOCK_WAITING_write)
- ret = -1 - SIX_LOCK_write;
- }
- } else if (type == SIX_LOCK_write && lock->readers) {
- if (try)
- atomic_add(SIX_LOCK_HELD_write, &lock->state);
-
- /*
-		 * Make sure the atomic_add happens before pcpu_read_count, and that
-		 * six_set_bitmask in the slow path happens before pcpu_read_count.
- *
- * Paired with the smp_mb() in read lock fast path (per-cpu mode)
- * and the one before atomic_read in read unlock path.
- */
- smp_mb();
- ret = !pcpu_read_count(lock);
-
- if (try && !ret) {
- old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state);
- if (old & SIX_LOCK_WAITING_read)
- ret = -1 - SIX_LOCK_read;
- }
- } else {
- old = atomic_read(&lock->state);
- do {
- ret = !(old & l[type].lock_fail);
- if (!ret || (type == SIX_LOCK_write && !try)) {
- smp_mb();
- break;
- }
- } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, old + l[type].lock_val));
-
- EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask));
- }
-
- if (ret > 0)
- six_set_owner(lock, type, old, task);
-
- EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 &&
- (atomic_read(&lock->state) & SIX_LOCK_HELD_write));
-
- return ret;
-}
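-
-/*
- * Illustrative sketch of the two-variable scheme described in the percpu
- * reader comment above, reduced to two threads. This is not part of the six
- * lock algorithm itself and the names (me/other) are hypothetical; it only
- * assumes the usual kernel barrier primitives:
- */
-static inline bool __maybe_unused two_thread_trylock(int *me, int *other)
-{
-	WRITE_ONCE(*me, 1);	/* "I think I hold the lock" */
-	smp_mb();		/* order the store before reading the other side */
-	if (READ_ONCE(*other)) {
-		/* Raced with the other thread: back off, then retry or sleep */
-		WRITE_ONCE(*me, 0);
-		return false;
-	}
-	return true;
-}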
-
-static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type)
-{
- struct six_lock_waiter *w, *next;
- struct task_struct *task;
- bool saw_one;
- int ret;
-again:
- ret = 0;
- saw_one = false;
- raw_spin_lock(&lock->wait_lock);
-
- list_for_each_entry_safe(w, next, &lock->wait_list, list) {
- if (w->lock_want != lock_type)
- continue;
-
- if (saw_one && lock_type != SIX_LOCK_read)
- goto unlock;
- saw_one = true;
-
- ret = __do_six_trylock(lock, lock_type, w->task, false);
- if (ret <= 0)
- goto unlock;
-
- /*
- * Similar to percpu_rwsem_wake_function(), we need to guard
- * against the wakee noticing w->lock_acquired, returning, and
- * then exiting before we do the wakeup:
- */
- task = get_task_struct(w->task);
- __list_del(w->list.prev, w->list.next);
- /*
- * The release barrier here ensures the ordering of the
- * __list_del before setting w->lock_acquired; @w is on the
- * stack of the thread doing the waiting and will be reused
- * after it sees w->lock_acquired with no other locking:
- * pairs with smp_load_acquire() in six_lock_slowpath()
- */
- smp_store_release(&w->lock_acquired, true);
- wake_up_process(task);
- put_task_struct(task);
- }
-
- six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type);
-unlock:
- raw_spin_unlock(&lock->wait_lock);
-
- if (ret < 0) {
- lock_type = -ret - 1;
- goto again;
- }
-}
-
-__always_inline
-static void six_lock_wakeup(struct six_lock *lock, u32 state,
- enum six_lock_type lock_type)
-{
- if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read))
- return;
-
- if (!(state & (SIX_LOCK_WAITING_read << lock_type)))
- return;
-
- __six_lock_wakeup(lock, lock_type);
-}
-
-__always_inline
-static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try)
-{
- int ret;
-
- ret = __do_six_trylock(lock, type, current, try);
- if (ret < 0)
- __six_lock_wakeup(lock, -ret - 1);
-
- return ret > 0;
-}
-
-/**
- * six_trylock_ip - attempt to take a six lock without blocking
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * Return: true on success, false on failure.
- */
-bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
-{
- if (!do_six_trylock(lock, type, true))
- return false;
-
- if (type != SIX_LOCK_write)
- six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip);
- return true;
-}
-EXPORT_SYMBOL_GPL(six_trylock_ip);
-
-/**
- * six_relock_ip - attempt to re-take a lock that was held previously
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @seq: lock sequence number obtained from six_lock_seq() while lock was
- * held previously
- * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * Return: true on success, false on failure.
- */
-bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
- unsigned seq, unsigned long ip)
-{
- if (six_lock_seq(lock) != seq || !six_trylock_ip(lock, type, ip))
- return false;
-
- if (six_lock_seq(lock) != seq) {
- six_unlock_ip(lock, type, ip);
- return false;
- }
-
- return true;
-}
-EXPORT_SYMBOL_GPL(six_relock_ip);
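-
-/*
- * A hedged sketch (not lifted from bcachefs) of the unlock/relock pattern the
- * sequence number enables; six_lock_read()'s should_sleep_fn is left NULL:
- */
-static void __maybe_unused relock_pattern_example(struct six_lock *lock)
-{
-	u32 seq;
-
-	six_lock_read(lock, NULL, NULL);
-	seq = six_lock_seq(lock);
-	six_unlock_read(lock);
-
-	/* ... do work that may block, e.g. IO or memory allocation ... */
-
-	if (six_relock_read(lock, seq)) {
-		/* As if the lock was never dropped: the protected state is still valid */
-		six_unlock_read(lock);
-	}
-	/* else: a write lock intervened; re-lookup and retry */
-}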
-
-#ifdef CONFIG_BCACHEFS_SIX_OPTIMISTIC_SPIN
-
-static inline bool six_owner_running(struct six_lock *lock)
-{
- /*
-	 * When there's no owner, we might have been preempted between the owner
-	 * acquiring the lock and setting the owner field. If we're an RT task,
-	 * that will live-lock because we won't let the owner complete.
- */
- guard(rcu)();
- struct task_struct *owner = READ_ONCE(lock->owner);
- return owner ? owner_on_cpu(owner) : !rt_or_dl_task(current);
-}
-
-static inline bool six_optimistic_spin(struct six_lock *lock,
- struct six_lock_waiter *wait,
- enum six_lock_type type)
-{
- unsigned loop = 0;
- u64 end_time;
-
- if (type == SIX_LOCK_write)
- return false;
-
- if (lock->wait_list.next != &wait->list)
- return false;
-
- if (atomic_read(&lock->state) & SIX_LOCK_NOSPIN)
- return false;
-
- preempt_disable();
- end_time = sched_clock() + 10 * NSEC_PER_USEC;
-
- while (!need_resched() && six_owner_running(lock)) {
- /*
- * Ensures that writes to the waitlist entry happen after we see
- * wait->lock_acquired: pairs with the smp_store_release in
- * __six_lock_wakeup
- */
- if (smp_load_acquire(&wait->lock_acquired)) {
- preempt_enable();
- return true;
- }
-
- if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) {
- six_set_bitmask(lock, SIX_LOCK_NOSPIN);
- break;
- }
-
- /*
- * The cpu_relax() call is a compiler barrier which forces
- * everything in this loop to be re-loaded. We don't need
- * memory barriers as we'll eventually observe the right
- * values at the cost of a few extra spins.
- */
- cpu_relax();
- }
-
- preempt_enable();
- return false;
-}
-
-#else /* CONFIG_BCACHEFS_SIX_OPTIMISTIC_SPIN */
-
-static inline bool six_optimistic_spin(struct six_lock *lock,
- struct six_lock_waiter *wait,
- enum six_lock_type type)
-{
- return false;
-}
-
-#endif
-
-noinline
-static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type,
- struct six_lock_waiter *wait,
- six_lock_should_sleep_fn should_sleep_fn, void *p,
- unsigned long ip)
-{
- int ret = 0;
-
- if (type == SIX_LOCK_write) {
- EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
- atomic_add(SIX_LOCK_HELD_write, &lock->state);
- smp_mb__after_atomic();
- }
-
- trace_contention_begin(lock, 0);
- lock_contended(&lock->dep_map, ip);
-
- wait->task = current;
- wait->lock_want = type;
- wait->lock_acquired = false;
-
- raw_spin_lock(&lock->wait_lock);
- six_set_bitmask(lock, SIX_LOCK_WAITING_read << type);
- /*
- * Retry taking the lock after taking waitlist lock, in case we raced
- * with an unlock:
- */
- ret = __do_six_trylock(lock, type, current, false);
- if (ret <= 0) {
- wait->start_time = local_clock();
-
- if (!list_empty(&lock->wait_list)) {
- struct six_lock_waiter *last =
- list_last_entry(&lock->wait_list,
- struct six_lock_waiter, list);
-
- if (time_before_eq64(wait->start_time, last->start_time))
- wait->start_time = last->start_time + 1;
- }
-
- list_add_tail(&wait->list, &lock->wait_list);
- }
- raw_spin_unlock(&lock->wait_lock);
-
- if (unlikely(ret > 0)) {
- ret = 0;
- goto out;
- }
-
- if (unlikely(ret < 0)) {
- __six_lock_wakeup(lock, -ret - 1);
- ret = 0;
- }
-
- if (six_optimistic_spin(lock, wait, type))
- goto out;
-
- while (1) {
- set_current_state(TASK_UNINTERRUPTIBLE);
-
- /*
- * Ensures that writes to the waitlist entry happen after we see
- * wait->lock_acquired: pairs with the smp_store_release in
- * __six_lock_wakeup
- */
- if (smp_load_acquire(&wait->lock_acquired))
- break;
-
- ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
- if (unlikely(ret)) {
- bool acquired;
-
- /*
- * If should_sleep_fn() returns an error, we are
- * required to return that error even if we already
- * acquired the lock - should_sleep_fn() might have
- * modified external state (e.g. when the deadlock cycle
- * detector in bcachefs issued a transaction restart)
- */
- raw_spin_lock(&lock->wait_lock);
- acquired = wait->lock_acquired;
- if (!acquired)
- list_del(&wait->list);
- raw_spin_unlock(&lock->wait_lock);
-
- if (unlikely(acquired)) {
- do_six_unlock_type(lock, type);
- } else if (type == SIX_LOCK_write) {
- six_clear_bitmask(lock, SIX_LOCK_HELD_write);
- six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read);
- }
- break;
- }
-
- schedule();
- }
-
- __set_current_state(TASK_RUNNING);
-out:
- trace_contention_end(lock, 0);
-
- return ret;
-}
-
-/**
- * six_lock_ip_waiter - take a lock, with full waitlist interface
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @wait: pointer to wait object, which will be added to lock's waitlist
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- * to scheduling
- * @p: passed through to @should_sleep_fn
- * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * This is the most general six_lock() variant, with parameters to support full
- * cycle detection for deadlock avoidance.
- *
- * The code calling this function must implement tracking of held locks, and the
- * @wait object should be embedded into the struct that tracks held locks -
- * which must also be accessible in a thread-safe way.
- *
- * @should_sleep_fn should invoke the cycle detector; it should walk each
- * lock's waiters, and for each waiter recursively walk their held locks.
- *
- * When this function must block, @wait will be added to @lock's waitlist before
- * calling trylock, and before calling @should_sleep_fn, and @wait will not be
- * removed from the lock waitlist until the lock has been successfully acquired,
- * or we abort.
- *
- * @wait.start_time will be monotonically increasing for any given waitlist, and
- * thus may be used as a loop cursor.
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
- struct six_lock_waiter *wait,
- six_lock_should_sleep_fn should_sleep_fn, void *p,
- unsigned long ip)
-{
- int ret;
-
- wait->start_time = 0;
-
- if (type != SIX_LOCK_write)
- six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip);
-
- ret = do_six_trylock(lock, type, true) ? 0
- : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip);
-
- if (ret && type != SIX_LOCK_write)
- six_release(&lock->dep_map, ip);
- if (!ret)
- lock_acquired(&lock->dep_map, ip);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(six_lock_ip_waiter);
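-
-/*
- * Hedged sketch of how a caller might embed the wait object in its own
- * held-locks tracking, per the comment above. The structure, its fields and
- * example_should_sleep() are hypothetical, not bcachefs's actual code:
- */
-struct held_locks_example {
-	struct six_lock_waiter	wait;		/* visible to other threads' cycle detectors */
-	struct six_lock		*held[4];	/* locks this thread currently holds */
-	unsigned		nr_held;
-};
-
-static int example_should_sleep(struct six_lock *lock, void *p)
-{
-	/*
-	 * A real callback would walk lock->wait_list and, for each waiter,
-	 * recursively walk its held locks; returning nonzero aborts the lock
-	 * attempt (e.g. to restart a transaction).
-	 */
-	return 0;
-}
-
-static int __maybe_unused example_take_intent(struct held_locks_example *h,
-					      struct six_lock *lock)
-{
-	return six_lock_ip_waiter(lock, SIX_LOCK_intent, &h->wait,
-				  example_should_sleep, h, _THIS_IP_);
-}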
-
-__always_inline
-static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type)
-{
- u32 state;
-
- if (type == SIX_LOCK_intent)
- lock->owner = NULL;
-
- if (type == SIX_LOCK_read &&
- lock->readers) {
- smp_mb(); /* unlock barrier */
- this_cpu_dec(*lock->readers);
- smp_mb(); /* between unlocking and checking for waiters */
- state = atomic_read(&lock->state);
- } else {
- u32 v = l[type].lock_val;
-
- if (type != SIX_LOCK_read)
- v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN;
-
- EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask));
- state = atomic_sub_return_release(v, &lock->state);
- }
-
- six_lock_wakeup(lock, state, l[type].unlock_wakeup);
-}
-
-/**
- * six_unlock_ip - drop a six lock
- * @lock: lock to unlock
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * When a lock is held multiple times (because six_lock_increment() was used),
- * this decrements the 'lock held' counter by one.
- *
- * For example:
- * six_lock_read(&foo->lock); read count 1
- * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2
- * six_unlock_read(&foo->lock);			read count 1
- * six_unlock_read(&foo->lock);			read count 0
- */
-void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
-{
- EBUG_ON(type == SIX_LOCK_write &&
- !(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
- EBUG_ON((type == SIX_LOCK_write ||
- type == SIX_LOCK_intent) &&
- lock->owner != current);
-
- if (type != SIX_LOCK_write)
- six_release(&lock->dep_map, ip);
-
- if (type == SIX_LOCK_intent &&
- lock->intent_lock_recurse) {
- --lock->intent_lock_recurse;
- return;
- }
-
- if (type == SIX_LOCK_write &&
- lock->write_lock_recurse) {
- --lock->write_lock_recurse;
- return;
- }
-
- if (type == SIX_LOCK_write)
- lock->seq++;
-
- do_six_unlock_type(lock, type);
-}
-EXPORT_SYMBOL_GPL(six_unlock_ip);
-
-/**
- * six_lock_downgrade - convert an intent lock to a read lock
- * @lock: lock to downgrade
- *
- * @lock will have read count incremented and intent count decremented
- */
-void six_lock_downgrade(struct six_lock *lock)
-{
- six_lock_increment(lock, SIX_LOCK_read);
- six_unlock_intent(lock);
-}
-EXPORT_SYMBOL_GPL(six_lock_downgrade);
-
-/**
- * six_lock_tryupgrade - attempt to convert read lock to an intent lock
- * @lock: lock to upgrade
- *
- * On success, @lock will have intent count incremented and read count
- * decremented
- *
- * Return: true on success, false on failure
- */
-bool six_lock_tryupgrade(struct six_lock *lock)
-{
- u32 old = atomic_read(&lock->state), new;
-
- do {
- new = old;
-
- if (new & SIX_LOCK_HELD_intent)
- return false;
-
- if (!lock->readers) {
- EBUG_ON(!(new & SIX_LOCK_HELD_read));
- new -= l[SIX_LOCK_read].lock_val;
- }
-
- new |= SIX_LOCK_HELD_intent;
- } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, new));
-
- if (lock->readers)
- this_cpu_dec(*lock->readers);
-
- six_set_owner(lock, SIX_LOCK_intent, old, current);
-
- return true;
-}
-EXPORT_SYMBOL_GPL(six_lock_tryupgrade);
-
-/**
- * six_trylock_convert - attempt to convert a held lock from one type to another
- * @lock: lock to convert
- * @from: SIX_LOCK_read or SIX_LOCK_intent
- * @to: SIX_LOCK_read or SIX_LOCK_intent
- *
- * On success, @lock will be held for @to instead of @from: the @to held count
- * is incremented and the @from held count is decremented
- *
- * Return: true on success, false on failure
- */
-bool six_trylock_convert(struct six_lock *lock,
- enum six_lock_type from,
- enum six_lock_type to)
-{
- EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write);
-
- if (to == from)
- return true;
-
- if (to == SIX_LOCK_read) {
- six_lock_downgrade(lock);
- return true;
- } else {
- return six_lock_tryupgrade(lock);
- }
-}
-EXPORT_SYMBOL_GPL(six_trylock_convert);
-
-/**
- * six_lock_increment - increase held lock count on a lock that is already held
- * @lock: lock to increment
- * @type: SIX_LOCK_read or SIX_LOCK_intent
- *
- * @lock must already be held, with a lock type that is greater than or equal to
- * @type
- *
- * A corresponding six_unlock_type() call will be required for @lock to be fully
- * unlocked.
- */
-void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
-{
- six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_);
-
- /* XXX: assert already locked, and that we don't overflow: */
-
- switch (type) {
- case SIX_LOCK_read:
- if (lock->readers) {
- this_cpu_inc(*lock->readers);
- } else {
- EBUG_ON(!(atomic_read(&lock->state) &
- (SIX_LOCK_HELD_read|
- SIX_LOCK_HELD_intent)));
- atomic_add(l[type].lock_val, &lock->state);
- }
- break;
- case SIX_LOCK_write:
- lock->write_lock_recurse++;
- fallthrough;
- case SIX_LOCK_intent:
- EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
- lock->intent_lock_recurse++;
- break;
- }
-}
-EXPORT_SYMBOL_GPL(six_lock_increment);
-
-/**
- * six_lock_wakeup_all - wake up all waiters on @lock
- * @lock: lock to wake up waiters for
- *
- * Waking up waiters will cause them to re-run should_sleep_fn, which may then
- * abort the lock operation.
- *
- * This function is never needed in a bug-free program; it's only useful in
- * debug code, e.g. to determine if a cycle detector is at fault.
- */
-void six_lock_wakeup_all(struct six_lock *lock)
-{
- u32 state = atomic_read(&lock->state);
- struct six_lock_waiter *w;
-
- six_lock_wakeup(lock, state, SIX_LOCK_read);
- six_lock_wakeup(lock, state, SIX_LOCK_intent);
- six_lock_wakeup(lock, state, SIX_LOCK_write);
-
- raw_spin_lock(&lock->wait_lock);
- list_for_each_entry(w, &lock->wait_list, list)
- wake_up_process(w->task);
- raw_spin_unlock(&lock->wait_lock);
-}
-EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
-
-/**
- * six_lock_counts - return held lock counts, for each lock type
- * @lock: lock to return counters for
- *
- * Return: the number of times a lock is held for read, intent and write.
- */
-struct six_lock_count six_lock_counts(struct six_lock *lock)
-{
- struct six_lock_count ret;
-
- ret.n[SIX_LOCK_read] = !lock->readers
- ? atomic_read(&lock->state) & SIX_LOCK_HELD_read
- : pcpu_read_count(lock);
- ret.n[SIX_LOCK_intent] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) +
- lock->intent_lock_recurse;
- ret.n[SIX_LOCK_write] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(six_lock_counts);
-
-/**
- * six_lock_readers_add - directly manipulate reader count of a lock
- * @lock: lock to add/subtract readers for
- * @nr: reader count to add/subtract
- *
- * When an upper layer is implementing lock reentrancy, we may have both read
- * and intent locks on the same lock.
- *
- * When we need to take a write lock, the read locks will cause self-deadlock,
- * because six locks themselves do not track which read locks are held by the
- * current thread and which are held by a different thread - they do no
- * per-thread tracking of held locks.
- *
- * The upper layer that is tracking held locks may, however, if trylock() has
- * failed, count up its own read locks, subtract them, take the write lock, and
- * then re-add them (see the illustrative sketch after this function).
- *
- * As in any other situation when taking a write lock, @lock must be held for
- * intent one (or more) times, so @lock will never be left unlocked.
- */
-void six_lock_readers_add(struct six_lock *lock, int nr)
-{
- if (lock->readers) {
- this_cpu_add(*lock->readers, nr);
- } else {
- EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0);
- /* reader count starts at bit 0 */
- atomic_add(nr, &lock->state);
- }
-}
-EXPORT_SYMBOL_GPL(six_lock_readers_add);
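-
-/*
- * Hedged sketch of the subtract/take-write/re-add pattern described above;
- * nr_read_held is whatever count the upper layer tracks, and is hypothetical:
- */
-static int __maybe_unused example_write_lock_with_own_readers(struct six_lock *lock,
-							       unsigned nr_read_held)
-{
-	int ret;
-
-	/* Drop this thread's own read counts so they can't deadlock against us: */
-	six_lock_readers_add(lock, -(int) nr_read_held);
-	/* Intent is already held, so the lock can't be freed under us: */
-	ret = six_lock_write(lock, NULL, NULL);
-	/* Restore the read counts so the eventual unlocks balance: */
-	six_lock_readers_add(lock, (int) nr_read_held);
-	return ret;
-}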
-
-/**
- * six_lock_exit - release resources held by a lock prior to freeing
- * @lock: lock to exit
- *
- * When a lock was initialized in percpu mode (SIX_LOCK_INIT_PCPU), this is
- * required to free the percpu read counts.
- */
-void six_lock_exit(struct six_lock *lock)
-{
- WARN_ON(lock->readers && pcpu_read_count(lock));
- WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read);
-
- free_percpu(lock->readers);
- lock->readers = NULL;
-}
-EXPORT_SYMBOL_GPL(six_lock_exit);
-
-void __six_lock_init(struct six_lock *lock, const char *name,
- struct lock_class_key *key, enum six_lock_init_flags flags,
- gfp_t gfp)
-{
- atomic_set(&lock->state, 0);
- raw_spin_lock_init(&lock->wait_lock);
- INIT_LIST_HEAD(&lock->wait_list);
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- debug_check_no_locks_freed((void *) lock, sizeof(*lock));
- lockdep_init_map(&lock->dep_map, name, key, 0);
-#endif
-
- /*
- * Don't assume that we have real percpu variables available in
- * userspace:
- */
-#ifdef __KERNEL__
- if (flags & SIX_LOCK_INIT_PCPU) {
- /*
- * We don't return an error here on memory allocation failure
- * since percpu is an optimization, and locks will work with the
- * same semantics in non-percpu mode: callers can check for
- * failure if they wish by checking lock->readers, but generally
- * will not want to treat it as an error.
- */
- lock->readers = alloc_percpu_gfp(unsigned, gfp);
- }
-#endif
-}
-EXPORT_SYMBOL_GPL(__six_lock_init);
diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h
deleted file mode 100644
index 59b851cf8bac..000000000000
--- a/fs/bcachefs/six.h
+++ /dev/null
@@ -1,388 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _LINUX_SIX_H
-#define _LINUX_SIX_H
-
-/**
- * DOC: SIX locks overview
- *
- * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores
- * but with an additional state: read/shared, intent, exclusive/write
- *
- * The purpose of the intent state is to allow for greater concurrency on tree
- * structures without deadlocking. In general, a read can't be upgraded to a
- * write lock without deadlocking, so an operation that updates multiple nodes
- * will have to take write locks for the full duration of the operation.
- *
- * But by adding an intent state, which is exclusive with other intent locks but
- * not with readers, we can take intent locks at the start of the operation,
- * and then take write locks only for the actual update to each individual
- * node, without deadlocking.
- *
- * Example usage:
- * six_lock_read(&foo->lock);
- * six_unlock_read(&foo->lock);
- *
- * An intent lock must be held before taking a write lock:
- * six_lock_intent(&foo->lock);
- * six_lock_write(&foo->lock);
- * six_unlock_write(&foo->lock);
- * six_unlock_intent(&foo->lock);
- *
- * Other operations:
- * six_trylock_read()
- * six_trylock_intent()
- * six_trylock_write()
- *
- * six_lock_downgrade() convert from intent to read
- * six_lock_tryupgrade() attempt to convert from read to intent, may fail
- *
- * There are also interfaces that take the lock type as an enum:
- *
- * six_lock_type(&foo->lock, SIX_LOCK_read);
- * six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent)
- * six_lock_type(&foo->lock, SIX_LOCK_write);
- * six_unlock_type(&foo->lock, SIX_LOCK_write);
- * six_unlock_type(&foo->lock, SIX_LOCK_intent);
- *
- * Lock sequence numbers - unlock(), relock():
- *
- * Locks embed sequence numbers, which are incremented on write lock/unlock.
- * This allows locks to be dropped and then retaken iff the state they protect
- * hasn't changed; this makes it much easier to avoid holding locks while e.g.
- * doing IO or allocating memory.
- *
- * Example usage:
- * six_lock_read(&foo->lock);
- * u32 seq = six_lock_seq(&foo->lock);
- * six_unlock_read(&foo->lock);
- *
- * some_operation_that_may_block();
- *
- * if (six_relock_read(&foo->lock, seq)) { ... }
- *
- * If the relock operation succeeds, it is as if the lock was never unlocked.
- *
- * Reentrancy:
- *
- * Six locks are not by themselves reentrant, but have counters for both the
- * read and intent states that can be used to provide reentrancy by an upper
- * layer that tracks held locks. If a lock is known to already be held in the
- * read or intent state, six_lock_increment() can be used to bump the "lock
- * held in this state" counter, increasing the number of unlock calls that
- * will be required to fully unlock it.
- *
- * Example usage:
- * six_lock_read(&foo->lock);
- * six_lock_increment(&foo->lock, SIX_LOCK_read);
- * six_unlock_read(&foo->lock);
- * six_unlock_read(&foo->lock);
- * foo->lock is now fully unlocked.
- *
- * Since the intent state supersedes read, it's legal to increment the read
- * counter when holding an intent lock, but not the reverse.
- *
- * A write lock may also be taken recursively, via six_lock_increment(..,
- * SIX_LOCK_write), provided the lock is already held for write.
- *
- * should_sleep_fn:
- *
- * There is a six_lock() variant that takes a function pointer that is called
- * immediately prior to schedule() when blocking, and may return an error to
- * abort.
- *
- * One possible use for this feature is when objects being locked are part of
- * a cache and may be reused, and lock ordering is based on a property of the
- * object that will change when the object is reused - i.e. logical key order.
- *
- * If looking up an object in the cache may race with object reuse, and lock
- * ordering is required to prevent deadlock, object reuse may change the
- * correct lock order for that object and cause a deadlock. should_sleep_fn
- * can be used to check if the object is still the object we want and avoid
- * this deadlock.
- *
- * Wait list entry interface:
- *
- * There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a
- * wait list entry. By embedding six_lock_waiter into another object, and by
- * traversing lock waitlists, it is then possible for an upper layer to
- * implement full cycle detection for deadlock avoidance.
- *
- * should_sleep_fn should be used for invoking the cycle detector, walking the
- * graph of held locks to check for a deadlock. The upper layer must track
- * held locks for each thread, and each thread's held locks must be reachable
- * from its six_lock_waiter object.
- *
- * six_lock_waiter() will add the wait object to the waitlist before retrying
- * taking the lock and before calling should_sleep_fn, and the wait object will not
- * be removed from the waitlist until either the lock has been successfully
- * acquired, or we aborted because should_sleep_fn returned an error.
- *
- * Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will
- * have timestamps in strictly ascending order - this is so the timestamp can
- * be used as a cursor for lock graph traversal.
- */
-
-#include <linux/lockdep.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-
-enum six_lock_type {
- SIX_LOCK_read,
- SIX_LOCK_intent,
- SIX_LOCK_write,
-};
-
-struct six_lock {
- atomic_t state;
- u32 seq;
- unsigned intent_lock_recurse;
- unsigned write_lock_recurse;
- struct task_struct *owner;
- unsigned __percpu *readers;
- raw_spinlock_t wait_lock;
- struct list_head wait_list;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map dep_map;
-#endif
-};
-
-struct six_lock_waiter {
- struct list_head list;
- struct task_struct *task;
- enum six_lock_type lock_want;
- bool lock_acquired;
- u64 start_time;
-};
-
-typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *);
-
-void six_lock_exit(struct six_lock *lock);
-
-enum six_lock_init_flags {
- SIX_LOCK_INIT_PCPU = 1U << 0,
-};
-
-void __six_lock_init(struct six_lock *lock, const char *name,
- struct lock_class_key *key, enum six_lock_init_flags flags,
- gfp_t gfp);
-
-/**
- * six_lock_init - initialize a six lock
- * @lock: lock to initialize
- * @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU
- * @gfp: allocation flags for the percpu reader counts, i.e. GFP_KERNEL
- */
-#define six_lock_init(lock, flags, gfp) \
-do { \
- static struct lock_class_key __key; \
- \
- __six_lock_init((lock), #lock, &__key, flags, gfp); \
-} while (0)
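-
-/*
- * Example usage (a hedged sketch; "foo" is hypothetical):
- *
- *	struct foo { struct six_lock lock; } *f = ...;
- *
- *	six_lock_init(&f->lock, 0, GFP_KERNEL);
- *
- * Pass SIX_LOCK_INIT_PCPU as flags to request percpu reader counts; the gfp
- * argument is only the allocation context used for those counts.
- */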
-
-/**
- * six_lock_seq - obtain current lock sequence number
- * @lock: six_lock to obtain sequence number for
- *
- * @lock should be held for read or intent, and not write
- *
- * By saving the lock sequence number, we can unlock @lock and then (typically
- * after some blocking operation) attempt to relock it: the relock will succeed
- * if the sequence number hasn't changed, meaning no write locks have been taken
- * and state corresponding to what @lock protects is still valid.
- */
-static inline u32 six_lock_seq(const struct six_lock *lock)
-{
- return lock->seq;
-}
-
-bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
-
-/**
- * six_trylock_type - attempt to take a six lock without blocking
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- *
- * Return: true on success, false on failure.
- */
-static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
-{
- return six_trylock_ip(lock, type, _THIS_IP_);
-}
-
-int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
- struct six_lock_waiter *wait,
- six_lock_should_sleep_fn should_sleep_fn, void *p,
- unsigned long ip);
-
-/**
- * six_lock_waiter - take a lock, with full waitlist interface
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @wait: pointer to wait object, which will be added to lock's waitlist
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- * to scheduling
- * @p: passed through to @should_sleep_fn
- *
- * This is a convenience wrapper around six_lock_ip_waiter(), see that function
- * for full documentation.
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type,
- struct six_lock_waiter *wait,
- six_lock_should_sleep_fn should_sleep_fn, void *p)
-{
- return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_);
-}
-
-/**
- * six_lock_ip - take a six lock
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- * to scheduling
- * @p: passed through to @should_sleep_fn
- * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type,
- six_lock_should_sleep_fn should_sleep_fn, void *p,
- unsigned long ip)
-{
- struct six_lock_waiter wait;
-
- return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip);
-}
-
-/**
- * six_lock_type - take a six lock
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- * to scheduling
- * @p: passed through to @should_sleep_fn
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type,
- six_lock_should_sleep_fn should_sleep_fn, void *p)
-{
- struct six_lock_waiter wait;
-
- return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_);
-}
-
-bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
- unsigned seq, unsigned long ip);
-
-/**
- * six_relock_type - attempt to re-take a lock that was held previously
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @seq: lock sequence number obtained from six_lock_seq() while lock was
- * held previously
- *
- * Return: true on success, false on failure.
- */
-static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
- unsigned seq)
-{
- return six_relock_ip(lock, type, seq, _THIS_IP_);
-}
-
-void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
-
-/**
- * six_unlock_type - drop a six lock
- * @lock: lock to unlock
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- *
- * When a lock is held multiple times (because six_lock_increment() was used),
- * this decrements the 'lock held' counter by one.
- *
- * For example:
- * six_lock_read(&foo->lock); read count 1
- * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2
- * six_unlock_read(&foo->lock);			read count 1
- * six_unlock_read(&foo->lock);			read count 0
- */
-static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
-{
- six_unlock_ip(lock, type, _THIS_IP_);
-}
-
-#define __SIX_LOCK(type) \
-static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\
-{ \
- return six_trylock_ip(lock, SIX_LOCK_##type, ip); \
-} \
- \
-static inline bool six_trylock_##type(struct six_lock *lock) \
-{ \
- return six_trylock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \
-} \
- \
-static inline int six_lock_ip_waiter_##type(struct six_lock *lock, \
- struct six_lock_waiter *wait, \
- six_lock_should_sleep_fn should_sleep_fn, void *p,\
- unsigned long ip) \
-{ \
- return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\
-} \
- \
-static inline int six_lock_ip_##type(struct six_lock *lock, \
- six_lock_should_sleep_fn should_sleep_fn, void *p, \
- unsigned long ip) \
-{ \
- return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\
-} \
- \
-static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\
-{ \
- return six_relock_ip(lock, SIX_LOCK_##type, seq, ip); \
-} \
- \
-static inline bool six_relock_##type(struct six_lock *lock, u32 seq) \
-{ \
- return six_relock_ip(lock, SIX_LOCK_##type, seq, _THIS_IP_); \
-} \
- \
-static inline int six_lock_##type(struct six_lock *lock, \
- six_lock_should_sleep_fn fn, void *p)\
-{ \
- return six_lock_ip_##type(lock, fn, p, _THIS_IP_); \
-} \
- \
-static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip) \
-{ \
- six_unlock_ip(lock, SIX_LOCK_##type, ip); \
-} \
- \
-static inline void six_unlock_##type(struct six_lock *lock) \
-{ \
- six_unlock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \
-}
-
-__SIX_LOCK(read)
-__SIX_LOCK(intent)
-__SIX_LOCK(write)
-#undef __SIX_LOCK
-
-void six_lock_downgrade(struct six_lock *);
-bool six_lock_tryupgrade(struct six_lock *);
-bool six_trylock_convert(struct six_lock *, enum six_lock_type,
- enum six_lock_type);
-
-void six_lock_increment(struct six_lock *, enum six_lock_type);
-
-void six_lock_wakeup_all(struct six_lock *);
-
-struct six_lock_count {
- unsigned n[3];
-};
-
-struct six_lock_count six_lock_counts(struct six_lock *);
-void six_lock_readers_add(struct six_lock *, int);
-
-#endif /* _LINUX_SIX_H */
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
deleted file mode 100644
index 4c43d2a2c1f5..000000000000
--- a/fs/bcachefs/snapshot.c
+++ /dev/null
@@ -1,2043 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bbpos.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_key_cache.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "enumerated_ref.h"
-#include "errcode.h"
-#include "error.h"
-#include "fs.h"
-#include "recovery_passes.h"
-#include "snapshot.h"
-
-#include <linux/random.h>
-
-/*
- * Snapshot trees:
- *
- * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
- * exist to provide a stable identifier for the whole lifetime of a snapshot
- * tree.
- */
-
-void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);
-
- prt_printf(out, "subvol %u root snapshot %u",
- le32_to_cpu(t.v->master_subvol),
- le32_to_cpu(t.v->root_snapshot));
-}
-
-int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
- bkey_lt(k.k->p, POS(0, 1)),
- c, snapshot_tree_pos_bad,
- "bad pos");
-fsck_err:
- return ret;
-}
-
-int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
- struct bch_snapshot_tree *s)
-{
- int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
- BTREE_ITER_with_updates, snapshot_tree, s);
-
- if (bch2_err_matches(ret, ENOENT))
- ret = bch_err_throw(trans->c, ENOENT_snapshot_tree);
- return ret;
-}
-
-struct bkey_i_snapshot_tree *
-__bch2_snapshot_tree_create(struct btree_trans *trans)
-{
- struct btree_iter iter;
- int ret = bch2_bkey_get_empty_slot(trans, &iter,
- BTREE_ID_snapshot_trees, POS(0, U32_MAX));
- struct bkey_i_snapshot_tree *s_t;
-
- if (ret == -BCH_ERR_ENOSPC_btree_slot)
- ret = bch_err_throw(trans->c, ENOSPC_snapshot_tree);
- if (ret)
- return ERR_PTR(ret);
-
- s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
- ret = PTR_ERR_OR_ZERO(s_t);
- bch2_trans_iter_exit(trans, &iter);
- return ret ? ERR_PTR(ret) : s_t;
-}
-
-static int bch2_snapshot_tree_create(struct btree_trans *trans,
- u32 root_id, u32 subvol_id, u32 *tree_id)
-{
- struct bkey_i_snapshot_tree *n_tree =
- __bch2_snapshot_tree_create(trans);
-
- if (IS_ERR(n_tree))
- return PTR_ERR(n_tree);
-
- n_tree->v.master_subvol = cpu_to_le32(subvol_id);
- n_tree->v.root_snapshot = cpu_to_le32(root_id);
- *tree_id = n_tree->k.p.offset;
- return 0;
-}
-
-/* Snapshot nodes: */
-
-static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
-{
- while (id && id < ancestor) {
- const struct snapshot_t *s = __snapshot_t(t, id);
- id = s ? s->parent : 0;
- }
- return id == ancestor;
-}
-
-static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
-{
- guard(rcu)();
- return __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
-}
-
-static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
-{
- const struct snapshot_t *s = __snapshot_t(t, id);
- if (!s)
- return 0;
-
- if (s->skip[2] <= ancestor)
- return s->skip[2];
- if (s->skip[1] <= ancestor)
- return s->skip[1];
- if (s->skip[0] <= ancestor)
- return s->skip[0];
- return s->parent;
-}
-
-static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
-{
- const struct snapshot_t *s = __snapshot_t(t, id);
- if (!s)
- return false;
-
- return test_bit(ancestor - id - 1, s->is_ancestor);
-}
-
-bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- u32 orig_id = id;
-#endif
-
- guard(rcu)();
- struct snapshot_table *t = rcu_dereference(c->snapshots);
-
- if (unlikely(c->recovery.pass_done < BCH_RECOVERY_PASS_check_snapshots))
- return __bch2_snapshot_is_ancestor_early(t, id, ancestor);
-
- if (likely(ancestor >= IS_ANCESTOR_BITMAP))
- while (id && id < ancestor - IS_ANCESTOR_BITMAP)
- id = get_ancestor_below(t, id, ancestor);
-
- bool ret = id && id < ancestor
- ? test_ancestor_bitmap(t, id, ancestor)
- : id == ancestor;
-
- EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, orig_id, ancestor));
- return ret;
-}
-
-static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
-{
- size_t idx = U32_MAX - id;
- struct snapshot_table *new, *old;
-
- size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
- size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);
-
- if (unlikely(new_bytes > INT_MAX))
- return NULL;
-
- new = kvzalloc(new_bytes, GFP_KERNEL);
- if (!new)
- return NULL;
-
- new->nr = new_size;
-
- old = rcu_dereference_protected(c->snapshots, true);
- if (old)
- memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr);
-
- rcu_assign_pointer(c->snapshots, new);
- kvfree_rcu(old, rcu);
-
- return &rcu_dereference_protected(c->snapshots,
- lockdep_is_held(&c->snapshot_table_lock))->s[idx];
-}
-
-static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
-{
- size_t idx = U32_MAX - id;
- struct snapshot_table *table =
- rcu_dereference_protected(c->snapshots,
- lockdep_is_held(&c->snapshot_table_lock));
-
- lockdep_assert_held(&c->snapshot_table_lock);
-
- if (likely(table && idx < table->nr))
- return &table->s[idx];
-
- return __snapshot_t_mut(c, id);
-}
-
-void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
-
- if (BCH_SNAPSHOT_SUBVOL(s.v))
- prt_str(out, "subvol ");
- if (BCH_SNAPSHOT_WILL_DELETE(s.v))
- prt_str(out, "will_delete ");
- if (BCH_SNAPSHOT_DELETED(s.v))
- prt_str(out, "deleted ");
-
- prt_printf(out, "parent %10u children %10u %10u subvol %u tree %u",
- le32_to_cpu(s.v->parent),
- le32_to_cpu(s.v->children[0]),
- le32_to_cpu(s.v->children[1]),
- le32_to_cpu(s.v->subvol),
- le32_to_cpu(s.v->tree));
-
- if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
- prt_printf(out, " depth %u skiplist %u %u %u",
- le32_to_cpu(s.v->depth),
- le32_to_cpu(s.v->skip[0]),
- le32_to_cpu(s.v->skip[1]),
- le32_to_cpu(s.v->skip[2]));
-}
-
-int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_snapshot s;
- u32 i, id;
- int ret = 0;
-
- bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
- bkey_lt(k.k->p, POS(0, 1)),
- c, snapshot_pos_bad,
- "bad pos");
-
- s = bkey_s_c_to_snapshot(k);
-
- id = le32_to_cpu(s.v->parent);
- bkey_fsck_err_on(id && id <= k.k->p.offset,
- c, snapshot_parent_bad,
- "bad parent node (%u <= %llu)",
- id, k.k->p.offset);
-
- bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]),
- c, snapshot_children_not_normalized,
- "children not normalized");
-
- bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1],
- c, snapshot_child_duplicate,
- "duplicate child nodes");
-
- for (i = 0; i < 2; i++) {
- id = le32_to_cpu(s.v->children[i]);
-
- bkey_fsck_err_on(id >= k.k->p.offset,
- c, snapshot_child_bad,
- "bad child node (%u >= %llu)",
- id, k.k->p.offset);
- }
-
- if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
- bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
- le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]),
- c, snapshot_skiplist_not_normalized,
- "skiplist not normalized");
-
- for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
- id = le32_to_cpu(s.v->skip[i]);
-
- bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent),
- c, snapshot_skiplist_bad,
- "bad skiplist node %u", id);
- }
- }
-fsck_err:
- return ret;
-}
-
-static int bch2_snapshot_table_make_room(struct bch_fs *c, u32 id)
-{
- mutex_lock(&c->snapshot_table_lock);
- int ret = snapshot_t_mut(c, id)
- ? 0
- : bch_err_throw(c, ENOMEM_mark_snapshot);
- mutex_unlock(&c->snapshot_table_lock);
- return ret;
-}
-
-static int __bch2_mark_snapshot(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- struct snapshot_t *t;
- u32 id = new.k->p.offset;
- int ret = 0;
-
- mutex_lock(&c->snapshot_table_lock);
-
- t = snapshot_t_mut(c, id);
- if (!t) {
- ret = bch_err_throw(c, ENOMEM_mark_snapshot);
- goto err;
- }
-
- if (new.k->type == KEY_TYPE_snapshot) {
- struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
-
- t->state = !BCH_SNAPSHOT_DELETED(s.v)
- ? SNAPSHOT_ID_live
- : SNAPSHOT_ID_deleted;
- t->parent = le32_to_cpu(s.v->parent);
- t->children[0] = le32_to_cpu(s.v->children[0]);
- t->children[1] = le32_to_cpu(s.v->children[1]);
- t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
- t->tree = le32_to_cpu(s.v->tree);
-
- if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
- t->depth = le32_to_cpu(s.v->depth);
- t->skip[0] = le32_to_cpu(s.v->skip[0]);
- t->skip[1] = le32_to_cpu(s.v->skip[1]);
- t->skip[2] = le32_to_cpu(s.v->skip[2]);
- } else {
- t->depth = 0;
- t->skip[0] = 0;
- t->skip[1] = 0;
- t->skip[2] = 0;
- }
-
- u32 parent = id;
-
- while ((parent = bch2_snapshot_parent_early(c, parent)) &&
- parent - id - 1 < IS_ANCESTOR_BITMAP)
- __set_bit(parent - id - 1, t->is_ancestor);
-
- if (BCH_SNAPSHOT_WILL_DELETE(s.v)) {
- set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
- if (c->recovery.pass_done > BCH_RECOVERY_PASS_delete_dead_snapshots)
- bch2_delete_dead_snapshots_async(c);
- }
- } else {
- memset(t, 0, sizeof(*t));
- }
-err:
- mutex_unlock(&c->snapshot_table_lock);
- return ret;
-}
-
-int bch2_mark_snapshot(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags);
-}
-
-int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
- struct bch_snapshot *s)
-{
- return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
- BTREE_ITER_with_updates, snapshot, s);
-}
-
-/* fsck: */
-
-static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
-{
- return snapshot_t(c, id)->children[child];
-}
-
-static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
-{
- return bch2_snapshot_child(c, id, 0);
-}
-
-static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
-{
- return bch2_snapshot_child(c, id, 1);
-}
-
-static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
-{
- u32 n, parent;
-
- n = bch2_snapshot_left_child(c, id);
- if (n)
- return n;
-
- while ((parent = bch2_snapshot_parent(c, id))) {
- n = bch2_snapshot_right_child(c, parent);
- if (n && n != id)
- return n;
- id = parent;
- }
-
- return 0;
-}
-
-u32 bch2_snapshot_oldest_subvol(struct bch_fs *c, u32 snapshot_root,
- snapshot_id_list *skip)
-{
- guard(rcu)();
- u32 id, subvol = 0, s;
-retry:
- id = snapshot_root;
- while (id && bch2_snapshot_exists(c, id)) {
- if (!(skip && snapshot_list_has_id(skip, id))) {
- s = snapshot_t(c, id)->subvol;
-
- if (s && (!subvol || s < subvol))
- subvol = s;
- }
- id = bch2_snapshot_tree_next(c, id);
- if (id == snapshot_root)
- break;
- }
-
- if (!subvol && skip) {
- skip = NULL;
- goto retry;
- }
-
- return subvol;
-}
-
-static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
- u32 snapshot_root, u32 *subvol_id)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- bool found = false;
- int ret;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
- 0, k, ret) {
- if (k.k->type != KEY_TYPE_subvolume)
- continue;
-
- struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
- if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
- continue;
- if (!BCH_SUBVOLUME_SNAP(s.v)) {
- *subvol_id = s.k->p.offset;
- found = true;
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (!ret && !found) {
- struct bkey_i_subvolume *u;
-
- *subvol_id = bch2_snapshot_oldest_subvol(c, snapshot_root, NULL);
-
- u = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_subvolumes, POS(0, *subvol_id),
- 0, subvolume);
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- return ret;
-
- SET_BCH_SUBVOLUME_SNAP(&u->v, false);
- }
-
- return ret;
-}
-
-static int check_snapshot_tree(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bkey_s_c_snapshot_tree st;
- struct bch_snapshot s;
- struct bch_subvolume subvol;
- struct printbuf buf = PRINTBUF;
- struct btree_iter snapshot_iter = {};
- u32 root_id;
- int ret;
-
- if (k.k->type != KEY_TYPE_snapshot_tree)
- return 0;
-
- st = bkey_s_c_to_snapshot_tree(k);
- root_id = le32_to_cpu(st.v->root_snapshot);
-
- struct bkey_s_c_snapshot snapshot_k =
- bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots,
- POS(0, root_id), 0, snapshot);
- ret = bkey_err(snapshot_k);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- if (!ret)
- bkey_val_copy(&s, snapshot_k);
-
- if (fsck_err_on(ret ||
- root_id != bch2_snapshot_root(c, root_id) ||
- st.k->p.offset != le32_to_cpu(s.tree),
- trans, snapshot_tree_to_missing_snapshot,
- "snapshot tree points to missing/incorrect snapshot:\n%s",
- (bch2_bkey_val_to_text(&buf, c, st.s_c),
- prt_newline(&buf),
- ret
- ? prt_printf(&buf, "(%s)", bch2_err_str(ret))
- : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c),
- buf.buf))) {
- ret = bch2_btree_delete_at(trans, iter, 0);
- goto err;
- }
-
- if (!st.v->master_subvol)
- goto out;
-
- ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- if (fsck_err_on(ret,
- trans, snapshot_tree_to_missing_subvol,
- "snapshot tree points to missing subvolume:\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
- fsck_err_on(!bch2_snapshot_is_ancestor(c,
- le32_to_cpu(subvol.snapshot),
- root_id),
- trans, snapshot_tree_to_wrong_subvol,
- "snapshot tree points to subvolume that does not point to snapshot in this tree:\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
- fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
- trans, snapshot_tree_to_snapshot_subvol,
- "snapshot tree points to snapshot subvolume:\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
- struct bkey_i_snapshot_tree *u;
- u32 subvol_id;
-
- ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
- bch_err_fn(c, ret);
-
- if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
- ret = 0;
- goto err;
- }
-
- if (ret)
- goto err;
-
- u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- goto err;
-
- u->v.master_subvol = cpu_to_le32(subvol_id);
- st = snapshot_tree_i_to_s_c(u);
- }
-out:
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &snapshot_iter);
- printbuf_exit(&buf);
- return ret;
-}
-
-/*
- * For each snapshot_tree, make sure it points to the root of a snapshot tree
- * and that snapshot entry points back to it, or delete it.
- *
- * And, make sure it points to a subvolume within that snapshot tree, or correct
- * it to point to the oldest subvolume within that snapshot tree.
- */
-int bch2_check_snapshot_trees(struct bch_fs *c)
-{
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_snapshot_trees, POS_MIN,
- BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_snapshot_tree(trans, &iter, k)));
- bch_err_fn(c, ret);
- return ret;
-}
-
-/*
- * Look up snapshot tree for @tree_id and find root,
- * make sure @snap_id is a descendant:
- */
-static int snapshot_tree_ptr_good(struct btree_trans *trans,
- u32 snap_id, u32 tree_id)
-{
- struct bch_snapshot_tree s_t;
- int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
-
- if (bch2_err_matches(ret, ENOENT))
- return 0;
- if (ret)
- return ret;
-
- return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
-}
-
-u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
-{
- if (!id)
- return 0;
-
- guard(rcu)();
- const struct snapshot_t *s = snapshot_t(c, id);
- return s->parent
- ? bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth))
- : id;
-}
-
-static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
-{
- unsigned i;
-
- for (i = 0; i < 3; i++)
- if (!s.parent) {
- if (s.skip[i])
- return false;
- } else {
- if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i])))
- return false;
- }
-
- return true;
-}
-
-/*
- * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
- * its snapshot_tree pointer is correct (allocate new one if necessary), then
- * update this node's pointer to root node's pointer:
- */
-static int snapshot_tree_ptr_repair(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- struct bch_snapshot *s)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter root_iter;
- struct bch_snapshot_tree s_t;
- struct bkey_s_c_snapshot root;
- struct bkey_i_snapshot *u;
- u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
- int ret;
-
- root = bch2_bkey_get_iter_typed(trans, &root_iter,
- BTREE_ID_snapshots, POS(0, root_id),
- BTREE_ITER_with_updates, snapshot);
- ret = bkey_err(root);
- if (ret)
- goto err;
-
- tree_id = le32_to_cpu(root.v->tree);
-
- ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
- if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
-
- if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
- u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
- ret = PTR_ERR_OR_ZERO(u) ?:
- bch2_snapshot_tree_create(trans, root_id,
- bch2_snapshot_oldest_subvol(c, root_id, NULL),
- &tree_id);
- if (ret)
- goto err;
-
- u->v.tree = cpu_to_le32(tree_id);
- if (k.k->p.offset == root_id)
- *s = u->v;
- }
-
- if (k.k->p.offset != root_id) {
- u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- goto err;
-
- u->v.tree = cpu_to_le32(tree_id);
- *s = u->v;
- }
-err:
- bch2_trans_iter_exit(trans, &root_iter);
- return ret;
-}
-
-static int check_snapshot(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bch_snapshot s;
- struct bch_subvolume subvol;
- struct bch_snapshot v;
- struct bkey_i_snapshot *u;
- u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
- u32 real_depth;
- struct printbuf buf = PRINTBUF;
- u32 i, id;
- int ret = 0;
-
- if (k.k->type != KEY_TYPE_snapshot)
- return 0;
-
- memset(&s, 0, sizeof(s));
- memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));
-
- if (BCH_SNAPSHOT_DELETED(&s))
- return 0;
-
- id = le32_to_cpu(s.parent);
- if (id) {
- ret = bch2_snapshot_lookup(trans, id, &v);
- if (bch2_err_matches(ret, ENOENT))
- bch_err(c, "snapshot with nonexistent parent:\n %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- if (ret)
- goto err;
-
- if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
- le32_to_cpu(v.children[1]) != k.k->p.offset) {
- bch_err(c, "snapshot parent %u missing pointer to child %llu",
- id, k.k->p.offset);
- ret = -EINVAL;
- goto err;
- }
- }
-
- for (i = 0; i < 2 && s.children[i]; i++) {
- id = le32_to_cpu(s.children[i]);
-
- ret = bch2_snapshot_lookup(trans, id, &v);
- if (bch2_err_matches(ret, ENOENT))
- bch_err(c, "snapshot node %llu has nonexistent child %u",
- k.k->p.offset, id);
- if (ret)
- goto err;
-
- if (le32_to_cpu(v.parent) != k.k->p.offset) {
- bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
- id, le32_to_cpu(v.parent), k.k->p.offset);
- ret = -EINVAL;
- goto err;
- }
- }
-
- bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
- !BCH_SNAPSHOT_WILL_DELETE(&s);
-
- if (should_have_subvol) {
- id = le32_to_cpu(s.subvol);
- ret = bch2_subvolume_get(trans, id, false, &subvol);
- if (bch2_err_matches(ret, ENOENT))
- bch_err(c, "snapshot points to nonexistent subvolume:\n %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- if (ret)
- goto err;
-
- if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
- bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
- k.k->p.offset);
- ret = -EINVAL;
- goto err;
- }
- } else {
- if (fsck_err_on(s.subvol,
- trans, snapshot_should_not_have_subvol,
- "snapshot should not point to subvol:\n%s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- goto err;
-
- u->v.subvol = 0;
- s = u->v;
- }
- }
-
- ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
- if (ret < 0)
- goto err;
-
- if (fsck_err_on(!ret,
- trans, snapshot_to_bad_snapshot_tree,
- "snapshot points to missing/incorrect tree:\n%s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
- if (ret)
- goto err;
- }
- ret = 0;
-
- real_depth = bch2_snapshot_depth(c, parent_id);
-
- if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
- trans, snapshot_bad_depth,
- "snapshot with incorrect depth field, should be %u:\n%s",
- real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- goto err;
-
- u->v.depth = cpu_to_le32(real_depth);
- s = u->v;
- }
-
- ret = snapshot_skiplist_good(trans, k.k->p.offset, s);
- if (ret < 0)
- goto err;
-
- if (fsck_err_on(!ret,
- trans, snapshot_bad_skiplist,
- "snapshot with bad skiplist field:\n%s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- goto err;
-
- for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
- u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));
-
- bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
- s = u->v;
- }
- ret = 0;
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_check_snapshots(struct bch_fs *c)
-{
- /*
- * We iterate backwards as checking/fixing the depth field requires that
- * the parent's depth already be correct:
- */
- int ret = bch2_trans_run(c,
- for_each_btree_key_reverse_commit(trans, iter,
- BTREE_ID_snapshots, POS_MAX,
- BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_snapshot(trans, &iter, k)));
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int check_snapshot_exists(struct btree_trans *trans, u32 id)
-{
- struct bch_fs *c = trans->c;
-
- /* Do we need to reconstruct the snapshot_tree entry as well? */
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
- u32 tree_id = 0;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN,
- 0, k, ret) {
- if (k.k->type == KEY_TYPE_snapshot_tree &&
- le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) {
- tree_id = k.k->p.offset;
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret)
- return ret;
-
- if (!tree_id) {
- ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
- if (ret)
- return ret;
- }
-
- struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
- ret = PTR_ERR_OR_ZERO(snapshot);
- if (ret)
- return ret;
-
- bkey_snapshot_init(&snapshot->k_i);
- snapshot->k.p = POS(0, id);
- snapshot->v.tree = cpu_to_le32(tree_id);
- snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c));
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
- 0, k, ret) {
- if (k.k->type == KEY_TYPE_subvolume &&
- le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) {
- snapshot->v.subvol = cpu_to_le32(k.k->p.offset);
- SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true);
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-
- return bch2_snapshot_table_make_room(c, id) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0);
-}
-
-/* Figure out which snapshot nodes belong in the same tree: */
-struct snapshot_tree_reconstruct {
- enum btree_id btree;
- struct bpos cur_pos;
- snapshot_id_list cur_ids;
- DARRAY(snapshot_id_list) trees;
-};
-
-static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
-{
- darray_for_each(r->trees, i)
- darray_exit(i);
- darray_exit(&r->trees);
- darray_exit(&r->cur_ids);
-}
-
-static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
-{
- return r->btree == BTREE_ID_inodes
- ? r->cur_pos.offset == pos.offset
- : r->cur_pos.inode == pos.inode;
-}
-
-static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
-{
- return darray_find_p(*l, i, snapshot_list_has_id(r, *i)) != NULL;
-}
-
-static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
-{
- bool first = true;
- darray_for_each(*s, i) {
- if (!first)
- prt_char(out, ' ');
- first = false;
- prt_printf(out, "%u", *i);
- }
-}
-
-static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
-{
- if (r->cur_ids.nr) {
- darray_for_each(r->trees, i)
- if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
- int ret = snapshot_list_merge(c, i, &r->cur_ids);
- if (ret)
- return ret;
- goto out;
- }
- darray_push(&r->trees, r->cur_ids);
- darray_init(&r->cur_ids);
- }
-out:
- r->cur_ids.nr = 0;
- return 0;
-}
-
-static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
-{
- if (!same_snapshot(r, pos))
- snapshot_tree_reconstruct_next(c, r);
- r->cur_pos = pos;
- return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
-}
-
-int bch2_reconstruct_snapshots(struct bch_fs *c)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct printbuf buf = PRINTBUF;
- struct snapshot_tree_reconstruct r = {};
- int ret = 0;
-
- for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
- if (btree_type_has_snapshots(btree)) {
- r.btree = btree;
-
- ret = for_each_btree_key(trans, iter, btree, POS_MIN,
- BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({
- get_snapshot_trees(c, &r, k.k->p);
- }));
- if (ret)
- goto err;
-
- snapshot_tree_reconstruct_next(c, &r);
- }
- }
-
- darray_for_each(r.trees, t) {
- printbuf_reset(&buf);
- snapshot_id_list_to_text(&buf, t);
-
- darray_for_each(*t, id) {
- if (fsck_err_on(bch2_snapshot_id_state(c, *id) == SNAPSHOT_ID_empty,
- trans, snapshot_node_missing,
- "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
- if (t->nr > 1) {
- bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
- ret = bch_err_throw(c, fsck_repair_unimplemented);
- goto err;
- }
-
- ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_snapshot_exists(trans, *id));
- if (ret)
- goto err;
- }
- }
- }
-fsck_err:
-err:
- bch2_trans_put(trans);
- snapshot_tree_reconstruct_exit(&r);
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-int __bch2_check_key_has_snapshot(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
- enum snapshot_id_state state = bch2_snapshot_id_state(c, k.k->p.snapshot);
-
- /* Snapshot was definitively deleted, this error is marked autofix */
- if (fsck_err_on(state == SNAPSHOT_ID_deleted,
- trans, bkey_in_deleted_snapshot,
- "key in deleted snapshot %s, delete?",
- (bch2_btree_id_to_text(&buf, iter->btree_id),
- prt_char(&buf, ' '),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- ret = bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_internal_snapshot_node) ?: 1;
-
- if (state == SNAPSHOT_ID_empty) {
- /*
- * Snapshot missing: we should have caught this with btree_lost_data and
- * kicked off reconstruct_snapshots, so if we end up here we have no
- * idea what happened.
- *
- * Do not delete unless we know that subvolumes and snapshots
- * are consistent:
- *
- * XXX:
- *
- * We could be smarter here, and instead of using the generic
- * recovery pass ratelimiting, track if there have been any
- * changes to the snapshots or inodes btrees since those passes
- * last ran.
- */
- ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_snapshots) ?: ret;
- ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_subvols) ?: ret;
-
- if (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots))
- ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret;
-
- unsigned repair_flags = FSCK_CAN_IGNORE | (!ret ? FSCK_CAN_FIX : 0);
-
- if (__fsck_err(trans, repair_flags, bkey_in_missing_snapshot,
- "key in missing snapshot %s, delete?",
- (bch2_btree_id_to_text(&buf, iter->btree_id),
- prt_char(&buf, ' '),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_internal_snapshot_node) ?: 1;
- }
- }
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-int __bch2_get_snapshot_overwrites(struct btree_trans *trans,
- enum btree_id btree, struct bpos pos,
- snapshot_id_list *s)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- for_each_btree_key_reverse_norestart(trans, iter, btree, bpos_predecessor(pos),
- BTREE_ITER_all_snapshots, k, ret) {
- if (!bkey_eq(k.k->p, pos))
- break;
-
- if (!bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) ||
- snapshot_list_has_ancestor(c, s, k.k->p.snapshot))
- continue;
-
- ret = snapshot_list_add(c, s, k.k->p.snapshot);
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
- if (ret)
- darray_exit(s);
-
- return ret;
-}
-
-/*
- * Mark a snapshot as deleted, for future cleanup:
- */
-int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
-{
- struct btree_iter iter;
- struct bkey_i_snapshot *s =
- bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_snapshots, POS(0, id),
- 0, snapshot);
- int ret = PTR_ERR_OR_ZERO(s);
- if (unlikely(ret)) {
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
- trans->c, "missing snapshot %u", id);
- return ret;
- }
-
- /* already deleted? */
- if (BCH_SNAPSHOT_WILL_DELETE(&s->v))
- goto err;
-
- SET_BCH_SNAPSHOT_WILL_DELETE(&s->v, true);
- SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
- s->v.subvol = 0;
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
-{
- if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
- swap(s->children[0], s->children[1]);
-}
-
-static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter, p_iter = {};
- struct btree_iter c_iter = {};
- struct btree_iter tree_iter = {};
- u32 parent_id, child_id;
- unsigned i;
- int ret = 0;
-
- struct bkey_i_snapshot *s =
- bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
- BTREE_ITER_intent, snapshot);
- ret = PTR_ERR_OR_ZERO(s);
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
- "missing snapshot %u", id);
-
- if (ret)
- goto err;
-
- BUG_ON(BCH_SNAPSHOT_DELETED(&s->v));
- BUG_ON(s->v.children[1]);
-
- parent_id = le32_to_cpu(s->v.parent);
- child_id = le32_to_cpu(s->v.children[0]);
-
- if (parent_id) {
- struct bkey_i_snapshot *parent;
-
- parent = bch2_bkey_get_mut_typed(trans, &p_iter,
- BTREE_ID_snapshots, POS(0, parent_id),
- 0, snapshot);
- ret = PTR_ERR_OR_ZERO(parent);
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
- "missing snapshot %u", parent_id);
- if (unlikely(ret))
- goto err;
-
- /* find entry in parent->children for node being deleted */
- for (i = 0; i < 2; i++)
- if (le32_to_cpu(parent->v.children[i]) == id)
- break;
-
- if (bch2_fs_inconsistent_on(i == 2, c,
- "snapshot %u missing child pointer to %u",
- parent_id, id))
- goto err;
-
- parent->v.children[i] = cpu_to_le32(child_id);
-
- normalize_snapshot_child_pointers(&parent->v);
- }
-
- if (child_id) {
- struct bkey_i_snapshot *child;
-
- child = bch2_bkey_get_mut_typed(trans, &c_iter,
- BTREE_ID_snapshots, POS(0, child_id),
- 0, snapshot);
- ret = PTR_ERR_OR_ZERO(child);
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
- "missing snapshot %u", child_id);
- if (unlikely(ret))
- goto err;
-
- child->v.parent = cpu_to_le32(parent_id);
-
- if (!child->v.parent) {
- child->v.skip[0] = 0;
- child->v.skip[1] = 0;
- child->v.skip[2] = 0;
- }
- }
-
- if (!parent_id) {
- /*
- * We're deleting the root of a snapshot tree: update the
- * snapshot_tree entry to point to the new root, or delete it if
- * this is the last snapshot ID in this tree:
- */
- struct bkey_i_snapshot_tree *s_t;
-
- BUG_ON(s->v.children[1]);
-
- s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
- BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s->v.tree)),
- 0, snapshot_tree);
- ret = PTR_ERR_OR_ZERO(s_t);
- if (ret)
- goto err;
-
- if (s->v.children[0]) {
- s_t->v.root_snapshot = s->v.children[0];
- } else {
- s_t->k.type = KEY_TYPE_deleted;
- set_bkey_val_u64s(&s_t->k, 0);
- }
- }
-
- if (!bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)) {
- SET_BCH_SNAPSHOT_DELETED(&s->v, true);
- s->v.parent = 0;
- s->v.children[0] = 0;
- s->v.children[1] = 0;
- s->v.subvol = 0;
- s->v.tree = 0;
- s->v.depth = 0;
- s->v.skip[0] = 0;
- s->v.skip[1] = 0;
- s->v.skip[2] = 0;
- } else {
- s->k.type = KEY_TYPE_deleted;
- set_bkey_val_u64s(&s->k, 0);
- }
-err:
- bch2_trans_iter_exit(trans, &tree_iter);
- bch2_trans_iter_exit(trans, &p_iter);
- bch2_trans_iter_exit(trans, &c_iter);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
- u32 *new_snapids,
- u32 *snapshot_subvols,
- unsigned nr_snapids)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_i_snapshot *n;
- struct bkey_s_c k;
- unsigned i, j;
- u32 depth = bch2_snapshot_depth(c, parent);
- int ret;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
- POS_MIN, BTREE_ITER_intent);
- k = bch2_btree_iter_peek(trans, &iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- for (i = 0; i < nr_snapids; i++) {
- k = bch2_btree_iter_prev_slot(trans, &iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (!k.k || !k.k->p.offset) {
- ret = bch_err_throw(c, ENOSPC_snapshot_create);
- goto err;
- }
-
- n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- goto err;
-
- n->v.flags = 0;
- n->v.parent = cpu_to_le32(parent);
- n->v.subvol = cpu_to_le32(snapshot_subvols[i]);
- n->v.tree = cpu_to_le32(tree);
- n->v.depth = cpu_to_le32(depth);
- n->v.btime.lo = cpu_to_le64(bch2_current_time(c));
- n->v.btime.hi = 0;
-
- for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
- n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));
-
- bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
- SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
-
- ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
- bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
- if (ret)
- goto err;
-
- new_snapids[i] = iter.pos.offset;
- }
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
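
A rough standalone sketch of how skip[] entries like the ones filled in above behave: each is an ancestor of the parent chosen at a random distance, and the three entries are then kept sorted so later lookups can take a long jump without overshooting. The helper names and the flat array-indexed table are made up for illustration, not bcachefs code:

#include <stdint.h>
#include <stdlib.h>

struct node { uint32_t parent; uint32_t depth; uint32_t skip[3]; };

/* walk up to n levels towards the root via parent pointers */
static uint32_t nth_ancestor(const struct node *tbl, uint32_t id, uint32_t n)
{
	while (n-- && id)
		id = tbl[id].parent;
	return id;
}

static int cmp_u32(const void *l, const void *r)
{
	uint32_t a = *(const uint32_t *) l, b = *(const uint32_t *) r;

	return a < b ? -1 : a > b;
}

static void init_skiplist(struct node *tbl, uint32_t id, uint32_t parent)
{
	uint32_t depth = parent ? tbl[parent].depth + 1 : 0;

	for (int i = 0; i < 3; i++)
		tbl[id].skip[i] = depth > 1
			? nth_ancestor(tbl, parent, rand() % (depth - 1))
			: parent;

	qsort(tbl[id].skip, 3, sizeof(uint32_t), cmp_u32);

	tbl[id].parent = parent;
	tbl[id].depth  = depth;
}
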
-
-/*
- * Create new snapshot IDs as children of an existing snapshot ID:
- */
-static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
- u32 *new_snapids,
- u32 *snapshot_subvols,
- unsigned nr_snapids)
-{
- struct btree_iter iter;
- struct bkey_i_snapshot *n_parent;
- int ret = 0;
-
- n_parent = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_snapshots, POS(0, parent),
- 0, snapshot);
- ret = PTR_ERR_OR_ZERO(n_parent);
- if (unlikely(ret)) {
- if (bch2_err_matches(ret, ENOENT))
- bch_err(trans->c, "snapshot %u not found", parent);
- return ret;
- }
-
- if (n_parent->v.children[0] || n_parent->v.children[1]) {
- bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
- ret = -EINVAL;
- goto err;
- }
-
- ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
- new_snapids, snapshot_subvols, nr_snapids);
- if (ret)
- goto err;
-
- n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
- n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
- n_parent->v.subvol = 0;
- SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/*
- * Create a snapshot node that is the root of a new tree:
- */
-static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
- u32 *new_snapids,
- u32 *snapshot_subvols,
- unsigned nr_snapids)
-{
- struct bkey_i_snapshot_tree *n_tree;
- int ret;
-
- n_tree = __bch2_snapshot_tree_create(trans);
- ret = PTR_ERR_OR_ZERO(n_tree) ?:
- create_snapids(trans, 0, n_tree->k.p.offset,
- new_snapids, snapshot_subvols, nr_snapids);
- if (ret)
- return ret;
-
- n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]);
- n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]);
- return 0;
-}
-
-int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
- u32 *new_snapids,
- u32 *snapshot_subvols,
- unsigned nr_snapids)
-{
- BUG_ON((parent == 0) != (nr_snapids == 1));
- BUG_ON((parent != 0) != (nr_snapids == 2));
-
- return parent
- ? bch2_snapshot_node_create_children(trans, parent,
- new_snapids, snapshot_subvols, nr_snapids)
- : bch2_snapshot_node_create_tree(trans,
- new_snapids, snapshot_subvols, nr_snapids);
-
-}
-
-/*
- * If we have an unlinked inode in an internal snapshot node, and the inode
- * really has been deleted in all child snapshots, how does this get cleaned up?
- *
- * first there is the problem of how keys that have been overwritten in all
- * child snapshots get deleted (unimplemented?), but inodes may perhaps be
- * special?
- *
- * also: unlinked inode in internal snapshot appears to not be getting deleted
- * correctly if inode doesn't exist in leaf snapshots
- *
- * solution:
- *
- * for a key in an interior snapshot node that needs work to be done that
- * requires it to be mutated: iterate over all descendant leaf nodes and copy
- * that key to snapshot leaf nodes, where we can mutate it
- */
-
-static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id)
-{
- struct snapshot_interior_delete *i = darray_find_p(*l, i, i->id == id);
- return i ? i->live_child : 0;
-}
-
-static unsigned __live_child(struct snapshot_table *t, u32 id,
- snapshot_id_list *delete_leaves,
- interior_delete_list *delete_interior)
-{
- struct snapshot_t *s = __snapshot_t(t, id);
- if (!s)
- return 0;
-
- for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++)
- if (s->children[i] &&
- !snapshot_list_has_id(delete_leaves, s->children[i]) &&
- !interior_delete_has_id(delete_interior, s->children[i]))
- return s->children[i];
-
- for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) {
- u32 live_child = s->children[i]
- ? __live_child(t, s->children[i], delete_leaves, delete_interior)
- : 0;
- if (live_child)
- return live_child;
- }
-
- return 0;
-}
-
-static unsigned live_child(struct bch_fs *c, u32 id)
-{
- struct snapshot_delete *d = &c->snapshot_delete;
-
- guard(rcu)();
- return __live_child(rcu_dereference(c->snapshots), id,
- &d->delete_leaves, &d->delete_interior);
-}
-
-static bool snapshot_id_dying(struct snapshot_delete *d, unsigned id)
-{
- return snapshot_list_has_id(&d->delete_leaves, id) ||
- interior_delete_has_id(&d->delete_interior, id) != 0;
-}
-
-static int delete_dead_snapshots_process_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct snapshot_delete *d = &trans->c->snapshot_delete;
-
- if (snapshot_list_has_id(&d->delete_leaves, k.k->p.snapshot))
- return bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_internal_snapshot_node);
-
- u32 live_child = interior_delete_has_id(&d->delete_interior, k.k->p.snapshot);
- if (live_child) {
- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
- int ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- return ret;
-
- new->k.p.snapshot = live_child;
-
- struct btree_iter dst_iter;
- struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter,
- iter->btree_id, new->k.p,
- BTREE_ITER_all_snapshots|
- BTREE_ITER_intent);
- ret = bkey_err(dst_k);
- if (ret)
- return ret;
-
- ret = (bkey_deleted(dst_k.k)
- ? bch2_trans_update(trans, &dst_iter, new,
- BTREE_UPDATE_internal_snapshot_node)
- : 0) ?:
- bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_internal_snapshot_node);
- bch2_trans_iter_exit(trans, &dst_iter);
- return ret;
- }
-
- return 0;
-}
-
-static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree_iter *iter, u64 *prev_inum)
-{
- struct bch_fs *c = trans->c;
- struct snapshot_delete *d = &c->snapshot_delete;
-
- u64 inum = iter->btree_id != BTREE_ID_inodes
- ? iter->pos.inode
- : iter->pos.offset;
-
- if (*prev_inum == inum)
- return false;
-
- *prev_inum = inum;
-
- bool ret = !snapshot_list_has_id(&d->deleting_from_trees,
- bch2_snapshot_tree(c, iter->pos.snapshot));
- if (unlikely(ret)) {
- struct bpos pos = iter->pos;
- pos.snapshot = 0;
- if (iter->btree_id != BTREE_ID_inodes)
- pos.offset = U64_MAX;
- bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(pos));
- }
-
- return ret;
-}
-
-static int delete_dead_snapshot_keys_v1(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- struct snapshot_delete *d = &c->snapshot_delete;
-
- for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) {
- struct disk_reservation res = { 0 };
- u64 prev_inum = 0;
-
- d->pos.pos = POS_MIN;
-
- if (!btree_type_has_snapshots(d->pos.btree))
- continue;
-
- int ret = for_each_btree_key_commit(trans, iter,
- d->pos.btree, POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
- d->pos.pos = iter.pos;
-
- if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
- continue;
-
- delete_dead_snapshots_process_key(trans, &iter, k);
- }));
-
- bch2_disk_reservation_put(c, &res);
-
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static int delete_dead_snapshot_keys_range(struct btree_trans *trans, enum btree_id btree,
- struct bpos start, struct bpos end)
-{
- struct bch_fs *c = trans->c;
- struct snapshot_delete *d = &c->snapshot_delete;
- struct disk_reservation res = { 0 };
-
- d->pos.btree = btree;
- d->pos.pos = POS_MIN;
-
- int ret = for_each_btree_key_max_commit(trans, iter,
- btree, start, end,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
- d->pos.pos = iter.pos;
- delete_dead_snapshots_process_key(trans, &iter, k);
- }));
-
- bch2_disk_reservation_put(c, &res);
- return ret;
-}
-
-static int delete_dead_snapshot_keys_v2(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- struct snapshot_delete *d = &c->snapshot_delete;
- struct disk_reservation res = { 0 };
- u64 prev_inum = 0;
- int ret = 0;
-
- struct btree_iter iter;
- bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
-
- while (1) {
- struct bkey_s_c k;
- ret = lockrestart_do(trans,
- bkey_err(k = bch2_btree_iter_peek(trans, &iter)));
- if (ret)
- break;
-
- if (!k.k)
- break;
-
- d->pos.btree = iter.btree_id;
- d->pos.pos = iter.pos;
-
- if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
- continue;
-
- if (snapshot_id_dying(d, k.k->p.snapshot)) {
- struct bpos start = POS(k.k->p.offset, 0);
- struct bpos end = POS(k.k->p.offset, U64_MAX);
-
- ret = delete_dead_snapshot_keys_range(trans, BTREE_ID_extents, start, end) ?:
- delete_dead_snapshot_keys_range(trans, BTREE_ID_dirents, start, end) ?:
- delete_dead_snapshot_keys_range(trans, BTREE_ID_xattrs, start, end);
- if (ret)
- break;
-
- bch2_btree_iter_set_pos(trans, &iter, POS(0, k.k->p.offset + 1));
- } else {
- bch2_btree_iter_advance(trans, &iter);
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret)
- goto err;
-
- prev_inum = 0;
- ret = for_each_btree_key_commit(trans, iter,
- BTREE_ID_inodes, POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
- d->pos.btree = iter.btree_id;
- d->pos.pos = iter.pos;
-
- if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
- continue;
-
- delete_dead_snapshots_process_key(trans, &iter, k);
- }));
-err:
- bch2_disk_reservation_put(c, &res);
- return ret;
-}
-
-/*
- * For a given snapshot, if it doesn't have a subvolume that points to it, and
- * it doesn't have child snapshot nodes - it's now redundant and we can mark it
- * as deleted.
- */
-static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k)
-{
- if (k.k->type != KEY_TYPE_snapshot)
- return 0;
-
- struct bch_fs *c = trans->c;
- struct snapshot_delete *d = &c->snapshot_delete;
- struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
- unsigned live_children = 0;
- int ret = 0;
-
- if (BCH_SNAPSHOT_SUBVOL(s.v))
- return 0;
-
- if (BCH_SNAPSHOT_DELETED(s.v))
- return 0;
-
- mutex_lock(&d->progress_lock);
- for (unsigned i = 0; i < 2; i++) {
- u32 child = le32_to_cpu(s.v->children[i]);
-
- live_children += child &&
- !snapshot_list_has_id(&d->delete_leaves, child);
- }
-
- u32 tree = bch2_snapshot_tree(c, s.k->p.offset);
-
- if (live_children == 0) {
- ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
- snapshot_list_add(c, &d->delete_leaves, s.k->p.offset);
- } else if (live_children == 1) {
- struct snapshot_interior_delete n = {
- .id = s.k->p.offset,
- .live_child = live_child(c, s.k->p.offset),
- };
-
- if (!n.live_child) {
- bch_err(c, "error finding live child of snapshot %u", n.id);
- ret = -EINVAL;
- } else {
- ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
- darray_push(&d->delete_interior, n);
- }
- }
- mutex_unlock(&d->progress_lock);
-
- return ret;
-}
-
-static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
- interior_delete_list *skip)
-{
- guard(rcu)();
- while (interior_delete_has_id(skip, id))
- id = __bch2_snapshot_parent(c, id);
-
- while (n--) {
- do {
- id = __bch2_snapshot_parent(c, id);
- } while (interior_delete_has_id(skip, id));
- }
-
- return id;
-}
-
-static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
- struct btree_iter *iter, struct bkey_s_c k,
- interior_delete_list *deleted)
-{
- struct bch_fs *c = trans->c;
- u32 nr_deleted_ancestors = 0;
- struct bkey_i_snapshot *s;
- int ret;
-
- if (!bch2_snapshot_exists(c, k.k->p.offset))
- return 0;
-
- if (k.k->type != KEY_TYPE_snapshot)
- return 0;
-
- if (interior_delete_has_id(deleted, k.k->p.offset))
- return 0;
-
- s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
- ret = PTR_ERR_OR_ZERO(s);
- if (ret)
- return ret;
-
- darray_for_each(*deleted, i)
- nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id);
-
- if (!nr_deleted_ancestors)
- return 0;
-
- le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);
-
- if (!s->v.depth) {
- s->v.skip[0] = 0;
- s->v.skip[1] = 0;
- s->v.skip[2] = 0;
- } else {
- u32 depth = le32_to_cpu(s->v.depth);
- u32 parent = bch2_snapshot_parent(c, s->k.p.offset);
-
- for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
- u32 id = le32_to_cpu(s->v.skip[j]);
-
- if (interior_delete_has_id(deleted, id)) {
- id = bch2_snapshot_nth_parent_skip(c,
- parent,
- depth > 1
- ? get_random_u32_below(depth - 1)
- : 0,
- deleted);
- s->v.skip[j] = cpu_to_le32(id);
- }
- }
-
- bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
- }
-
- return bch2_trans_update(trans, iter, &s->k_i, 0);
-}
-
-static void bch2_snapshot_delete_nodes_to_text(struct printbuf *out, struct snapshot_delete *d)
-{
- prt_printf(out, "deleting from trees");
- darray_for_each(d->deleting_from_trees, i)
- prt_printf(out, " %u", *i);
-
- prt_printf(out, "deleting leaves");
- darray_for_each(d->delete_leaves, i)
- prt_printf(out, " %u", *i);
- prt_newline(out);
-
- prt_printf(out, "interior");
- darray_for_each(d->delete_interior, i)
- prt_printf(out, " %u->%u", i->id, i->live_child);
- prt_newline(out);
-}
-
-int __bch2_delete_dead_snapshots(struct bch_fs *c)
-{
- struct snapshot_delete *d = &c->snapshot_delete;
- int ret = 0;
-
- if (!mutex_trylock(&d->lock))
- return 0;
-
- if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
- goto out_unlock;
-
- struct btree_trans *trans = bch2_trans_get(c);
-
- /*
- * For every snapshot node: If we have no live children and it's not
- * pointed to by a subvolume, delete it:
- */
- d->running = true;
- d->pos = BBPOS_MIN;
-
- ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k,
- check_should_delete_snapshot(trans, k));
- if (!bch2_err_matches(ret, EROFS))
- bch_err_msg(c, ret, "walking snapshots");
- if (ret)
- goto err;
-
- if (!d->delete_leaves.nr && !d->delete_interior.nr)
- goto err;
-
- {
- struct printbuf buf = PRINTBUF;
- bch2_snapshot_delete_nodes_to_text(&buf, d);
-
- ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf));
- printbuf_exit(&buf);
- if (ret)
- goto err;
- }
-
- ret = !bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)
- ? delete_dead_snapshot_keys_v2(trans)
- : delete_dead_snapshot_keys_v1(trans);
- if (!bch2_err_matches(ret, EROFS))
- bch_err_msg(c, ret, "deleting keys from dying snapshots");
- if (ret)
- goto err;
-
- darray_for_each(d->delete_leaves, i) {
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_snapshot_node_delete(trans, *i));
- if (!bch2_err_matches(ret, EROFS))
- bch_err_msg(c, ret, "deleting snapshot %u", *i);
- if (ret)
- goto err;
- }
-
- /*
- * Fixing children of deleted snapshots can't be done completely
- * atomically; if we crash between here and when we delete the interior
- * nodes, some depth fields will be off:
- */
- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
- BTREE_ITER_intent, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &d->delete_interior));
- if (ret)
- goto err;
-
- darray_for_each(d->delete_interior, i) {
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_snapshot_node_delete(trans, i->id));
- if (!bch2_err_matches(ret, EROFS))
- bch_err_msg(c, ret, "deleting snapshot %u", i->id);
- if (ret)
- goto err;
- }
-err:
- mutex_lock(&d->progress_lock);
- darray_exit(&d->deleting_from_trees);
- darray_exit(&d->delete_interior);
- darray_exit(&d->delete_leaves);
- d->running = false;
- mutex_unlock(&d->progress_lock);
- bch2_trans_put(trans);
-
- bch2_recovery_pass_set_no_ratelimit(c, BCH_RECOVERY_PASS_check_snapshots);
-out_unlock:
- mutex_unlock(&d->lock);
- if (!bch2_err_matches(ret, EROFS))
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_delete_dead_snapshots(struct bch_fs *c)
-{
- if (!c->opts.auto_snapshot_deletion)
- return 0;
-
- return __bch2_delete_dead_snapshots(c);
-}
-
-void bch2_delete_dead_snapshots_work(struct work_struct *work)
-{
- struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete.work);
-
- set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);
-
- bch2_delete_dead_snapshots(c);
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots);
-}
-
-void bch2_delete_dead_snapshots_async(struct bch_fs *c)
-{
- if (!c->opts.auto_snapshot_deletion)
- return;
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_delete_dead_snapshots))
- return;
-
- BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));
-
- if (!queue_work(system_long_wq, &c->snapshot_delete.work))
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots);
-}
-
-void bch2_snapshot_delete_status_to_text(struct printbuf *out, struct bch_fs *c)
-{
- struct snapshot_delete *d = &c->snapshot_delete;
-
- if (!d->running) {
- prt_str(out, "(not running)");
- return;
- }
-
- mutex_lock(&d->progress_lock);
- bch2_snapshot_delete_nodes_to_text(out, d);
-
- bch2_bbpos_to_text(out, d->pos);
- mutex_unlock(&d->progress_lock);
-}
-
-int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
- enum btree_id id,
- struct bpos pos)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- for_each_btree_key_reverse_norestart(trans, iter, id, bpos_predecessor(pos),
- BTREE_ITER_not_extents|
- BTREE_ITER_all_snapshots,
- k, ret) {
- if (!bkey_eq(pos, k.k->p))
- break;
-
- if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
- ret = 1;
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
-}
-
-static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap)
-{
- /* If there's one child, it's redundant and keys will be moved to the child */
- return !!snap.v->children[0] + !!snap.v->children[1] == 1;
-}
-
-static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
-{
- if (k.k->type != KEY_TYPE_snapshot)
- return 0;
-
- struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k);
- if (BCH_SNAPSHOT_WILL_DELETE(snap.v) ||
- interior_snapshot_needs_delete(snap))
- set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags);
-
- return 0;
-}
-
-int bch2_snapshots_read(struct bch_fs *c)
-{
- /*
- * Initializing the is_ancestor bitmaps requires ancestors to already be
- * initialized - so mark in reverse:
- */
- int ret = bch2_trans_run(c,
- for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots,
- POS_MAX, 0, k,
- __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
- bch2_check_snapshot_needs_deletion(trans, k)));
- bch_err_fn(c, ret);
-
- /*
- * It's important that we check if we need to reconstruct snapshots
- * before going RW, so we mark that pass as required in the superblock -
- * otherwise, we could end up deleting keys with missing snapshot nodes
- * instead
- */
- BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
- test_bit(BCH_FS_may_go_rw, &c->flags));
-
- return ret;
-}
-
-void bch2_fs_snapshots_exit(struct bch_fs *c)
-{
- kvfree(rcu_dereference_protected(c->snapshots, true));
-}
-
-void bch2_fs_snapshots_init_early(struct bch_fs *c)
-{
- INIT_WORK(&c->snapshot_delete.work, bch2_delete_dead_snapshots_work);
- mutex_init(&c->snapshot_delete.lock);
- mutex_init(&c->snapshot_delete.progress_lock);
- mutex_init(&c->snapshots_unlinked_lock);
-}
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
deleted file mode 100644
index 6766bf673ed9..000000000000
--- a/fs/bcachefs/snapshot.h
+++ /dev/null
@@ -1,275 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SNAPSHOT_H
-#define _BCACHEFS_SNAPSHOT_H
-
-void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_snapshot_tree_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-
-#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \
- .key_validate = bch2_snapshot_tree_validate, \
- .val_to_text = bch2_snapshot_tree_to_text, \
- .min_val_size = 8, \
-})
-
-struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *);
-
-int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *);
-
-void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_snapshot_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \
- .key_validate = bch2_snapshot_validate, \
- .val_to_text = bch2_snapshot_to_text, \
- .trigger = bch2_mark_snapshot, \
- .min_val_size = 24, \
-})
-
-static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id)
-{
- u32 idx = U32_MAX - id;
-
- return likely(t && idx < t->nr)
- ? &t->s[idx]
- : NULL;
-}
-
-static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
-{
- return __snapshot_t(rcu_dereference(c->snapshots), id);
-}
-
-static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
-{
- guard(rcu)();
- const struct snapshot_t *s = snapshot_t(c, id);
- return s ? s->tree : 0;
-}
-
-static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
-{
- const struct snapshot_t *s = snapshot_t(c, id);
- return s ? s->parent : 0;
-}
-
-static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
-{
- guard(rcu)();
- return __bch2_snapshot_parent_early(c, id);
-}
-
-static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id)
-{
- const struct snapshot_t *s = snapshot_t(c, id);
- if (!s)
- return 0;
-
- u32 parent = s->parent;
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
- parent &&
- s->depth != snapshot_t(c, parent)->depth + 1)
- panic("id %u depth=%u parent %u depth=%u\n",
- id, snapshot_t(c, id)->depth,
- parent, snapshot_t(c, parent)->depth);
-
- return parent;
-}
-
-static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
-{
- guard(rcu)();
- return __bch2_snapshot_parent(c, id);
-}
-
-static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n)
-{
- guard(rcu)();
- while (n--)
- id = __bch2_snapshot_parent(c, id);
- return id;
-}
-
-u32 bch2_snapshot_oldest_subvol(struct bch_fs *, u32, snapshot_id_list *);
-u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32);
-
-static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
-{
- guard(rcu)();
-
- u32 parent;
- while ((parent = __bch2_snapshot_parent(c, id)))
- id = parent;
- return id;
-}
-
-static inline enum snapshot_id_state __bch2_snapshot_id_state(struct bch_fs *c, u32 id)
-{
- const struct snapshot_t *s = snapshot_t(c, id);
- return s ? s->state : SNAPSHOT_ID_empty;
-}
-
-static inline enum snapshot_id_state bch2_snapshot_id_state(struct bch_fs *c, u32 id)
-{
- guard(rcu)();
- return __bch2_snapshot_id_state(c, id);
-}
-
-static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id)
-{
- return bch2_snapshot_id_state(c, id) == SNAPSHOT_ID_live;
-}
-
-static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
-{
- guard(rcu)();
- const struct snapshot_t *s = snapshot_t(c, id);
- return s ? s->children[0] : -BCH_ERR_invalid_snapshot_node;
-}
-
-static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
-{
- int ret = bch2_snapshot_is_internal_node(c, id);
- if (ret < 0)
- return ret;
- return !ret;
-}
-
-static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent)
-{
- guard(rcu)();
- return parent ? snapshot_t(c, parent)->depth + 1 : 0;
-}
-
-bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32);
-
-static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
-{
- return id == ancestor
- ? true
- : __bch2_snapshot_is_ancestor(c, id, ancestor);
-}
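
A usage sketch (the helper name is hypothetical, not from the tree): with the convention above, a key written in snapshot key_snap should be visible when reading with snapshot read_snap iff key_snap is an ancestor of, or equal to, read_snap:

static inline bool example_key_visible(struct bch_fs *c,
				       u32 key_snap, u32 read_snap)
{
	return bch2_snapshot_is_ancestor(c, read_snap, key_snap);
}
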
-
-static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id)
-{
- guard(rcu)();
- const struct snapshot_t *t = snapshot_t(c, id);
- return t && (t->children[0]|t->children[1]) != 0;
-}
-
-static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
-{
- return darray_find(*s, id) != NULL;
-}
-
-static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id)
-{
- darray_for_each(*s, i)
- if (bch2_snapshot_is_ancestor(c, id, *i))
- return true;
- return false;
-}
-
-static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id)
-{
- BUG_ON(snapshot_list_has_id(s, id));
- int ret = darray_push(s, id);
- if (ret)
- bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
- return ret;
-}
-
-static inline int snapshot_list_add_nodup(struct bch_fs *c, snapshot_id_list *s, u32 id)
-{
- int ret = snapshot_list_has_id(s, id)
- ? 0
- : darray_push(s, id);
- if (ret)
- bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
- return ret;
-}
-
-static inline int snapshot_list_merge(struct bch_fs *c, snapshot_id_list *dst, snapshot_id_list *src)
-{
- darray_for_each(*src, i) {
- int ret = snapshot_list_add_nodup(c, dst, *i);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
- struct bch_snapshot *s);
-int bch2_snapshot_get_subvol(struct btree_trans *, u32,
- struct bch_subvolume *);
-
-/* only exported for tests: */
-int bch2_snapshot_node_create(struct btree_trans *, u32,
- u32 *, u32 *, unsigned);
-
-int bch2_check_snapshot_trees(struct bch_fs *);
-int bch2_check_snapshots(struct bch_fs *);
-int bch2_reconstruct_snapshots(struct bch_fs *);
-
-int __bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c);
-
-static inline int bch2_check_key_has_snapshot(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- return likely(bch2_snapshot_exists(trans->c, k.k->p.snapshot))
- ? 0
- : __bch2_check_key_has_snapshot(trans, iter, k);
-}
-
-int __bch2_get_snapshot_overwrites(struct btree_trans *,
- enum btree_id, struct bpos,
- snapshot_id_list *);
-
-/*
- * Get a list of snapshot IDs that have overwritten a given key:
- */
-static inline int bch2_get_snapshot_overwrites(struct btree_trans *trans,
- enum btree_id btree, struct bpos pos,
- snapshot_id_list *s)
-{
- darray_init(s);
-
- return bch2_snapshot_has_children(trans->c, pos.snapshot)
- ? __bch2_get_snapshot_overwrites(trans, btree, pos, s)
- : 0;
-
-}
-
-int bch2_snapshot_node_set_deleted(struct btree_trans *, u32);
-
-int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos);
-
-static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
- enum btree_id id,
- struct bpos pos)
-{
- if (!btree_type_has_snapshots(id) ||
- bch2_snapshot_is_leaf(trans->c, pos.snapshot) > 0)
- return 0;
-
- return __bch2_key_has_snapshot_overwrites(trans, id, pos);
-}
-
-int __bch2_delete_dead_snapshots(struct bch_fs *);
-int bch2_delete_dead_snapshots(struct bch_fs *);
-void bch2_delete_dead_snapshots_work(struct work_struct *);
-void bch2_delete_dead_snapshots_async(struct bch_fs *);
-void bch2_snapshot_delete_status_to_text(struct printbuf *, struct bch_fs *);
-
-int bch2_snapshots_read(struct bch_fs *);
-void bch2_fs_snapshots_exit(struct bch_fs *);
-void bch2_fs_snapshots_init_early(struct bch_fs *);
-
-#endif /* _BCACHEFS_SNAPSHOT_H */
diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h
deleted file mode 100644
index 9bccae1f3590..000000000000
--- a/fs/bcachefs/snapshot_format.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SNAPSHOT_FORMAT_H
-#define _BCACHEFS_SNAPSHOT_FORMAT_H
-
-struct bch_snapshot {
- struct bch_val v;
- __le32 flags;
- __le32 parent;
- __le32 children[2];
- __le32 subvol;
- /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */
- __le32 tree;
- __le32 depth;
- __le32 skip[3];
- bch_le128 btime;
-};
-
-LE32_BITMASK(BCH_SNAPSHOT_WILL_DELETE, struct bch_snapshot, flags, 0, 1)
-/* True if a subvolume points to this snapshot node: */
-LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
-LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 2, 3)
-
-/*
- * Snapshot trees:
- *
- * The snapshot_trees btree gives us a persistent identifier for each tree of
- * bch_snapshot nodes, and allows us to record and easily find the root/master
- * subvolume that other snapshots were created from:
- */
-struct bch_snapshot_tree {
- struct bch_val v;
- __le32 master_subvol;
- __le32 root_snapshot;
-};
-
-#endif /* _BCACHEFS_SNAPSHOT_FORMAT_H */
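
All on-disk fields above are little-endian, and the LE32_BITMASK() definitions carve the flags word into small bitfields with generated get/set helpers. A minimal standalone sketch of what such a helper pair amounts to, using BCH_SNAPSHOT_SUBVOL's [1, 2) bit range as the model (names are hypothetical, host assumed little-endian for brevity):

#include <stdint.h>

struct disk_node { uint32_t flags; };		/* stored as __le32 on disk */

static inline uint32_t NODE_HAS_SUBVOL(const struct disk_node *n)
{
	return (n->flags >> 1) & 1;		/* would be le32_to_cpu() in-kernel */
}

static inline void SET_NODE_HAS_SUBVOL(struct disk_node *n, uint32_t v)
{
	uint32_t f = n->flags;

	f &= ~(1U << 1);
	f |= (v & 1) << 1;
	n->flags = f;				/* would be cpu_to_le32() in-kernel */
}
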
diff --git a/fs/bcachefs/snapshot_types.h b/fs/bcachefs/snapshot_types.h
deleted file mode 100644
index 0ab698f13e5c..000000000000
--- a/fs/bcachefs/snapshot_types.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SNAPSHOT_TYPES_H
-#define _BCACHEFS_SNAPSHOT_TYPES_H
-
-#include "bbpos_types.h"
-#include "darray.h"
-#include "subvolume_types.h"
-
-typedef DARRAY(u32) snapshot_id_list;
-
-#define IS_ANCESTOR_BITMAP 128
-
-struct snapshot_t {
- enum snapshot_id_state {
- SNAPSHOT_ID_empty,
- SNAPSHOT_ID_live,
- SNAPSHOT_ID_deleted,
- } state;
- u32 parent;
- u32 skip[3];
- u32 depth;
- u32 children[2];
- u32 subvol; /* Nonzero only if a subvolume points to this node: */
- u32 tree;
- unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)];
-};
-
-struct snapshot_table {
- struct rcu_head rcu;
- size_t nr;
-#ifndef RUST_BINDGEN
- DECLARE_FLEX_ARRAY(struct snapshot_t, s);
-#else
- struct snapshot_t s[0];
-#endif
-};
-
-struct snapshot_interior_delete {
- u32 id;
- u32 live_child;
-};
-typedef DARRAY(struct snapshot_interior_delete) interior_delete_list;
-
-struct snapshot_delete {
- struct mutex lock;
- struct work_struct work;
-
- struct mutex progress_lock;
- snapshot_id_list deleting_from_trees;
- snapshot_id_list delete_leaves;
- interior_delete_list delete_interior;
-
- bool running;
- struct bbpos pos;
-};
-
-#endif /* _BCACHEFS_SNAPSHOT_TYPES_H */
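
A standalone sketch of how a table of nodes like snapshot_t above supports fast ancestor queries: walk upward, at each step jumping to the highest skip target that does not pass the node we are looking for, then compare. A flat array indexed by id stands in for the real table; this is not the bcachefs lookup, which additionally keeps the is_ancestor[] bitmap so nearby queries need no walk at all:

#include <stdbool.h>
#include <stdint.h>

struct node {
	uint32_t parent;	/* 0 == root; only the root has parent == 0 */
	uint32_t depth;		/* root has depth 0 */
	uint32_t skip[3];	/* assorted strict ancestors, 0 if unused */
};

static bool is_ancestor(const struct node *tbl, uint32_t id, uint32_t ancestor)
{
	while (id && tbl[id].depth > tbl[ancestor].depth) {
		uint32_t next = tbl[id].parent;

		/* prefer the highest skip target that doesn't pass the goal */
		for (int i = 0; i < 3; i++) {
			uint32_t s = tbl[id].skip[i];

			if (s &&
			    tbl[s].depth >= tbl[ancestor].depth &&
			    tbl[s].depth < tbl[next].depth)
				next = s;
		}
		id = next;
	}

	return id == ancestor;
}
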
diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c
deleted file mode 100644
index 3e9f59226bdf..000000000000
--- a/fs/bcachefs/str_hash.c
+++ /dev/null
@@ -1,400 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "btree_update.h"
-#include "dirent.h"
-#include "fsck.h"
-#include "str_hash.h"
-#include "subvolume.h"
-
-static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d)
-{
- if (d.v->d_type == DT_SUBVOL) {
- struct bch_subvolume subvol;
- int ret = bch2_subvolume_get(trans, le32_to_cpu(d.v->d_child_subvol),
- false, &subvol);
- if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
- return !ret;
- } else {
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- ret = bkey_is_inode(k.k);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
- }
-}
-
-static int bch2_fsck_rename_dirent(struct btree_trans *trans,
- struct snapshots_seen *s,
- const struct bch_hash_desc desc,
- struct bch_hash_info *hash_info,
- struct bkey_s_c_dirent old,
- bool *updated_before_k_pos)
-{
- struct bch_fs *c = trans->c;
- struct qstr old_name = bch2_dirent_get_name(old);
- struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64));
- int ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- return ret;
-
- bkey_dirent_init(&new->k_i);
- dirent_copy_target(new, old);
- new->k.p = old.k->p;
-
- char *renamed_buf = bch2_trans_kmalloc(trans, old_name.len + 20);
- ret = PTR_ERR_OR_ZERO(renamed_buf);
- if (ret)
- return ret;
-
- for (unsigned i = 0; i < 1000; i++) {
- new->k.u64s = BKEY_U64s_MAX;
-
- struct qstr renamed_name = (struct qstr) QSTR_INIT(renamed_buf,
- sprintf(renamed_buf, "%.*s.fsck_renamed-%u",
- old_name.len, old_name.name, i));
-
- ret = bch2_dirent_init_name(c, new, hash_info, &renamed_name, NULL);
- if (ret)
- return ret;
-
- ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
- (subvol_inum) { 0, old.k->p.inode },
- old.k->p.snapshot, &new->k_i,
- BTREE_UPDATE_internal_snapshot_node|
- STR_HASH_must_create);
- if (ret && !bch2_err_matches(ret, EEXIST))
- break;
- if (!ret) {
- if (bpos_lt(new->k.p, old.k->p))
- *updated_before_k_pos = true;
- break;
- }
- }
-
- ret = ret ?: bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static noinline int hash_pick_winner(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- struct bch_hash_info *hash_info,
- struct bkey_s_c k1,
- struct bkey_s_c k2)
-{
- if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) &&
- !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k)))
- return 0;
-
- switch (desc.btree_id) {
- case BTREE_ID_dirents: {
- int ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k1));
- if (ret < 0)
- return ret;
- if (!ret)
- return 0;
-
- ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k2));
- if (ret < 0)
- return ret;
- if (!ret)
- return 1;
- return 2;
- }
- default:
- return 0;
- }
-}
-
-/*
- * str_hash lookups across snapshots break in wild ways if hash_info in
- * different snapshot versions doesn't match - so if we find one mismatch, check
- * them all
- */
-int bch2_repair_inode_hash_info(struct btree_trans *trans,
- struct bch_inode_unpacked *snapshot_root)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct printbuf buf = PRINTBUF;
- bool need_commit = false;
- int ret = 0;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes,
- POS(0, snapshot_root->bi_inum),
- BTREE_ITER_all_snapshots, k, ret) {
- if (bpos_ge(k.k->p, SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot)))
- break;
- if (!bkey_is_inode(k.k))
- continue;
-
- struct bch_inode_unpacked inode;
- ret = bch2_inode_unpack(k, &inode);
- if (ret)
- break;
-
- if (inode.bi_hash_seed == snapshot_root->bi_hash_seed &&
- INODE_STR_HASH(&inode) == INODE_STR_HASH(snapshot_root)) {
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct bch_hash_info hash1 = bch2_hash_info_init(c, snapshot_root);
- struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode);
-
- BUG_ON(hash1.type != hash2.type ||
- memcmp(&hash1.siphash_key,
- &hash2.siphash_key,
- sizeof(hash1.siphash_key)));
-#endif
- continue;
- }
-
- printbuf_reset(&buf);
- prt_printf(&buf, "inode %llu hash info in snapshots %u %u don't match\n",
- snapshot_root->bi_inum,
- inode.bi_snapshot,
- snapshot_root->bi_snapshot);
-
- bch2_prt_str_hash_type(&buf, INODE_STR_HASH(&inode));
- prt_printf(&buf, " %llx\n", inode.bi_hash_seed);
-
- bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root));
- prt_printf(&buf, " %llx", snapshot_root->bi_hash_seed);
-
- if (fsck_err(trans, inode_snapshot_mismatch, "%s", buf.buf)) {
- inode.bi_hash_seed = snapshot_root->bi_hash_seed;
- SET_INODE_STR_HASH(&inode, INODE_STR_HASH(snapshot_root));
-
- ret = __bch2_fsck_write_inode(trans, &inode);
- if (ret)
- break;
- need_commit = true;
- }
- }
-
- if (ret)
- goto err;
-
- if (!need_commit) {
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
-
- prt_printf(&buf, "inode %llu hash info mismatch with root, but mismatch not found\n",
- snapshot_root->bi_inum);
-
- prt_printf(&buf, "root snapshot %u ", snapshot_root->bi_snapshot);
- bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root));
- prt_printf(&buf, " %llx\n", snapshot_root->bi_hash_seed);
-#if 0
- prt_printf(&buf, "vs snapshot %u ", hash_info->inum_snapshot);
- bch2_prt_str_hash_type(&buf, hash_info->type);
- prt_printf(&buf, " %llx %llx", hash_info->siphash_key.k0, hash_info->siphash_key.k1);
-#endif
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- ret = bch_err_throw(c, fsck_repair_unimplemented);
- goto err;
- }
-
- ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
- -BCH_ERR_transaction_restart_nested;
-err:
-fsck_err:
- printbuf_exit(&buf);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/*
- * All versions of the same inode in different snapshots must have the same hash
- * seed/type: verify that the hash info we're using matches the root
- */
-static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum,
- struct bch_hash_info *hash_info)
-{
- struct bch_inode_unpacked snapshot_root;
- int ret = bch2_inode_find_snapshot_root(trans, inum, &snapshot_root);
- if (ret)
- return ret;
-
- struct bch_hash_info hash_root = bch2_hash_info_init(trans->c, &snapshot_root);
- if (hash_info->type != hash_root.type ||
- memcmp(&hash_info->siphash_key,
- &hash_root.siphash_key,
- sizeof(hash_root.siphash_key)))
- ret = bch2_repair_inode_hash_info(trans, &snapshot_root);
-
- return ret;
-}
-
-/* Put a str_hash key in its proper location, checking for duplicates */
-int bch2_str_hash_repair_key(struct btree_trans *trans,
- struct snapshots_seen *s,
- const struct bch_hash_desc *desc,
- struct bch_hash_info *hash_info,
- struct btree_iter *k_iter, struct bkey_s_c k,
- struct btree_iter *dup_iter, struct bkey_s_c dup_k,
- bool *updated_before_k_pos)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- bool free_snapshots_seen = false;
- int ret = 0;
-
- if (!s) {
- s = bch2_trans_kmalloc(trans, sizeof(*s));
- ret = PTR_ERR_OR_ZERO(s);
- if (ret)
- goto out;
-
- s->pos = k_iter->pos;
- darray_init(&s->ids);
-
- ret = bch2_get_snapshot_overwrites(trans, desc->btree_id, k_iter->pos, &s->ids);
- if (ret)
- goto out;
-
- free_snapshots_seen = true;
- }
-
- if (!dup_k.k) {
- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto out;
-
- dup_k = bch2_hash_set_or_get_in_snapshot(trans, dup_iter, *desc, hash_info,
- (subvol_inum) { 0, new->k.p.inode },
- new->k.p.snapshot, new,
- STR_HASH_must_create|
- BTREE_ITER_with_updates|
- BTREE_UPDATE_internal_snapshot_node);
- ret = bkey_err(dup_k);
- if (ret)
- goto out;
- if (dup_k.k)
- goto duplicate_entries;
-
- if (bpos_lt(new->k.p, k.k->p))
- *updated_before_k_pos = true;
-
- ret = bch2_insert_snapshot_whiteouts(trans, desc->btree_id,
- k_iter->pos, new->k.p) ?:
- bch2_hash_delete_at(trans, *desc, hash_info, k_iter,
- BTREE_ITER_with_updates|
- BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
- -BCH_ERR_transaction_restart_commit;
- } else {
-duplicate_entries:
- ret = hash_pick_winner(trans, *desc, hash_info, k, dup_k);
- if (ret < 0)
- goto out;
-
- if (!fsck_err(trans, hash_table_key_duplicate,
- "duplicate hash table keys%s:\n%s",
- ret != 2 ? "" : ", both point to valid inodes",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k),
- prt_newline(&buf),
- bch2_bkey_val_to_text(&buf, c, dup_k),
- buf.buf)))
- goto out;
-
- switch (ret) {
- case 0:
- ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0);
- break;
- case 1:
- ret = bch2_hash_delete_at(trans, *desc, hash_info, dup_iter, 0);
- break;
- case 2:
- ret = bch2_fsck_rename_dirent(trans, s, *desc, hash_info,
- bkey_s_c_to_dirent(k),
- updated_before_k_pos) ?:
- bch2_hash_delete_at(trans, *desc, hash_info, k_iter,
- BTREE_ITER_with_updates);
- goto out;
- }
-
- ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
- -BCH_ERR_transaction_restart_commit;
- }
-out:
-fsck_err:
- bch2_trans_iter_exit(trans, dup_iter);
- printbuf_exit(&buf);
- if (free_snapshots_seen)
- darray_exit(&s->ids);
- return ret;
-}
-
-int __bch2_str_hash_check_key(struct btree_trans *trans,
- struct snapshots_seen *s,
- const struct bch_hash_desc *desc,
- struct bch_hash_info *hash_info,
- struct btree_iter *k_iter, struct bkey_s_c hash_k,
- bool *updated_before_k_pos)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter = {};
- struct printbuf buf = PRINTBUF;
- struct bkey_s_c k;
- int ret = 0;
-
- u64 hash = desc->hash_bkey(hash_info, hash_k);
- if (hash_k.k->p.offset < hash)
- goto bad_hash;
-
- for_each_btree_key_norestart(trans, iter, desc->btree_id,
- SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
- BTREE_ITER_slots|
- BTREE_ITER_with_updates, k, ret) {
- if (bkey_eq(k.k->p, hash_k.k->p))
- break;
-
- if (k.k->type == desc->key_type &&
- !desc->cmp_bkey(k, hash_k)) {
- ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode,
- hash_info) ?:
- bch2_str_hash_repair_key(trans, s, desc, hash_info,
- k_iter, hash_k,
- &iter, k, updated_before_k_pos);
- break;
- }
-
- if (bkey_deleted(k.k))
- goto bad_hash;
- }
- bch2_trans_iter_exit(trans, &iter);
-out:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-bad_hash:
- bch2_trans_iter_exit(trans, &iter);
- /*
- * Before doing any repair, check hash_info itself:
- */
- ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, hash_info);
- if (ret)
- goto out;
-
- if (fsck_err(trans, hash_table_key_wrong_offset,
- "hash table key at wrong offset: should be at %llu\n%s",
- hash,
- (bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf)))
- ret = bch2_str_hash_repair_key(trans, s, desc, hash_info,
- k_iter, hash_k,
- &iter, bkey_s_c_null,
- updated_before_k_pos);
- goto out;
-}
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
deleted file mode 100644
index 8979ac2d7a3b..000000000000
--- a/fs/bcachefs/str_hash.h
+++ /dev/null
@@ -1,431 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_STR_HASH_H
-#define _BCACHEFS_STR_HASH_H
-
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "checksum.h"
-#include "error.h"
-#include "inode.h"
-#include "siphash.h"
-#include "subvolume.h"
-#include "super.h"
-
-#include <linux/crc32c.h>
-#include <crypto/sha2.h>
-
-static inline enum bch_str_hash_type
-bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
-{
- switch (opt) {
- case BCH_STR_HASH_OPT_crc32c:
- return BCH_STR_HASH_crc32c;
- case BCH_STR_HASH_OPT_crc64:
- return BCH_STR_HASH_crc64;
- case BCH_STR_HASH_OPT_siphash:
- return c->sb.features & (1ULL << BCH_FEATURE_new_siphash)
- ? BCH_STR_HASH_siphash
- : BCH_STR_HASH_siphash_old;
- default:
- BUG();
- }
-}
-
-struct bch_hash_info {
- u32 inum_snapshot;
- u8 type;
- struct unicode_map *cf_encoding;
- /*
- * For crc32 or crc64 string hashes the first key value of
- * the siphash_key (k0) is used as the key.
- */
- SIPHASH_KEY siphash_key;
-};
-
-static inline struct bch_hash_info
-bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
-{
- struct bch_hash_info info = {
- .inum_snapshot = bi->bi_snapshot,
- .type = INODE_STR_HASH(bi),
- .cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL,
- .siphash_key = { .k0 = bi->bi_hash_seed }
- };
-
- if (unlikely(info.type == BCH_STR_HASH_siphash_old)) {
- u8 digest[SHA256_DIGEST_SIZE];
-
- sha256((const u8 *)&bi->bi_hash_seed,
- sizeof(bi->bi_hash_seed), digest);
- memcpy(&info.siphash_key, digest, sizeof(info.siphash_key));
- }
-
- return info;
-}
-
-struct bch_str_hash_ctx {
- union {
- u32 crc32c;
- u64 crc64;
- SIPHASH_CTX siphash;
- };
-};
-
-static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx,
- const struct bch_hash_info *info)
-{
- switch (info->type) {
- case BCH_STR_HASH_crc32c:
- ctx->crc32c = crc32c(~0, &info->siphash_key.k0,
- sizeof(info->siphash_key.k0));
- break;
- case BCH_STR_HASH_crc64:
- ctx->crc64 = crc64_be(~0, &info->siphash_key.k0,
- sizeof(info->siphash_key.k0));
- break;
- case BCH_STR_HASH_siphash_old:
- case BCH_STR_HASH_siphash:
- SipHash24_Init(&ctx->siphash, &info->siphash_key);
- break;
- default:
- BUG();
- }
-}
-
-static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx,
- const struct bch_hash_info *info,
- const void *data, size_t len)
-{
- switch (info->type) {
- case BCH_STR_HASH_crc32c:
- ctx->crc32c = crc32c(ctx->crc32c, data, len);
- break;
- case BCH_STR_HASH_crc64:
- ctx->crc64 = crc64_be(ctx->crc64, data, len);
- break;
- case BCH_STR_HASH_siphash_old:
- case BCH_STR_HASH_siphash:
- SipHash24_Update(&ctx->siphash, data, len);
- break;
- default:
- BUG();
- }
-}
-
-static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx,
- const struct bch_hash_info *info)
-{
- switch (info->type) {
- case BCH_STR_HASH_crc32c:
- return ctx->crc32c;
- case BCH_STR_HASH_crc64:
- return ctx->crc64 >> 1;
- case BCH_STR_HASH_siphash_old:
- case BCH_STR_HASH_siphash:
- return SipHash24_End(&ctx->siphash) >> 1;
- default:
- BUG();
- }
-}
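
A usage sketch (hypothetical caller, not from the tree) of the context API above: a hash_key implementation for a name-keyed btree feeds the name through init/update/end with the inode's hash_info, so the same seed and hash type are applied everywhere:

static u64 example_name_hash(const struct bch_hash_info *info,
			     const struct qstr *name)
{
	struct bch_str_hash_ctx ctx;

	bch2_str_hash_init(&ctx, info);
	bch2_str_hash_update(&ctx, info, name->name, name->len);
	return bch2_str_hash_end(&ctx, info);
}
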
-
-struct bch_hash_desc {
- enum btree_id btree_id;
- u8 key_type;
-
- u64 (*hash_key)(const struct bch_hash_info *, const void *);
- u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c);
- bool (*cmp_key)(struct bkey_s_c, const void *);
- bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c);
- bool (*is_visible)(subvol_inum inum, struct bkey_s_c);
-};
-
-static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k)
-{
- return k.k->type == desc.key_type &&
- (!desc.is_visible ||
- !inum.inum ||
- desc.is_visible(inum, k));
-}
-
-static __always_inline struct bkey_s_c
-bch2_hash_lookup_in_snapshot(struct btree_trans *trans,
- struct btree_iter *iter,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- subvol_inum inum, const void *key,
- enum btree_iter_update_trigger_flags flags,
- u32 snapshot)
-{
- struct bkey_s_c k;
- int ret;
-
- for_each_btree_key_max_norestart(trans, *iter, desc.btree_id,
- SPOS(inum.inum, desc.hash_key(info, key), snapshot),
- POS(inum.inum, U64_MAX),
- BTREE_ITER_slots|flags, k, ret) {
- if (is_visible_key(desc, inum, k)) {
- if (!desc.cmp_key(k, key))
- return k;
- } else if (k.k->type == KEY_TYPE_hash_whiteout) {
- ;
- } else {
- /* hole, not found */
- break;
- }
- }
- bch2_trans_iter_exit(trans, iter);
-
- return bkey_s_c_err(ret ?: -BCH_ERR_ENOENT_str_hash_lookup);
-}
-
-static __always_inline struct bkey_s_c
-bch2_hash_lookup(struct btree_trans *trans,
- struct btree_iter *iter,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- subvol_inum inum, const void *key,
- enum btree_iter_update_trigger_flags flags)
-{
- u32 snapshot;
- int ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- return bkey_s_c_err(ret);
-
- return bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot);
-}
-
-static __always_inline int
-bch2_hash_hole(struct btree_trans *trans,
- struct btree_iter *iter,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- subvol_inum inum, const void *key)
-{
- struct bkey_s_c k;
- u32 snapshot;
- int ret;
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- return ret;
-
- for_each_btree_key_max_norestart(trans, *iter, desc.btree_id,
- SPOS(inum.inum, desc.hash_key(info, key), snapshot),
- POS(inum.inum, U64_MAX),
- BTREE_ITER_slots|BTREE_ITER_intent, k, ret)
- if (!is_visible_key(desc, inum, k))
- return 0;
- bch2_trans_iter_exit(trans, iter);
-
- return ret ?: -BCH_ERR_ENOSPC_str_hash_create;
-}
-
-static __always_inline
-int bch2_hash_needs_whiteout(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct btree_iter *start)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- bch2_trans_copy_iter(trans, &iter, start);
-
- bch2_btree_iter_advance(trans, &iter);
-
- for_each_btree_key_continue_norestart(trans, iter, BTREE_ITER_slots, k, ret) {
- if (k.k->type != desc.key_type &&
- k.k->type != KEY_TYPE_hash_whiteout)
- break;
-
- if (k.k->type == desc.key_type &&
- desc.hash_bkey(info, k) <= start->pos.offset) {
- ret = 1;
- break;
- }
- }
-
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static __always_inline
-struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans,
- struct btree_iter *iter,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- subvol_inum inum, u32 snapshot,
- struct bkey_i *insert,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter slot = {};
- struct bkey_s_c k;
- bool found = false;
- int ret;
-
- for_each_btree_key_max_norestart(trans, *iter, desc.btree_id,
- SPOS(insert->k.p.inode,
- desc.hash_bkey(info, bkey_i_to_s_c(insert)),
- snapshot),
- POS(insert->k.p.inode, U64_MAX),
- BTREE_ITER_slots|BTREE_ITER_intent|flags, k, ret) {
- if (is_visible_key(desc, inum, k)) {
- if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
- goto found;
-
- /* hash collision: */
- continue;
- }
-
- if (!slot.path && !(flags & STR_HASH_must_replace))
- bch2_trans_copy_iter(trans, &slot, iter);
-
- if (k.k->type != KEY_TYPE_hash_whiteout)
- goto not_found;
- }
-
- if (!ret)
- ret = bch_err_throw(c, ENOSPC_str_hash_create);
-out:
- bch2_trans_iter_exit(trans, &slot);
- bch2_trans_iter_exit(trans, iter);
- return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
-found:
- found = true;
-not_found:
- if (found && (flags & STR_HASH_must_create)) {
- bch2_trans_iter_exit(trans, &slot);
- return k;
- } else if (!found && (flags & STR_HASH_must_replace)) {
- ret = bch_err_throw(c, ENOENT_str_hash_set_must_replace);
- } else {
- if (!found && slot.path)
- swap(*iter, slot);
-
- insert->k.p = iter->pos;
- ret = bch2_trans_update(trans, iter, insert, flags);
- }
-
- goto out;
-}
-
-static __always_inline
-int bch2_hash_set_in_snapshot(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- subvol_inum inum, u32 snapshot,
- struct bkey_i *insert,
- enum btree_iter_update_trigger_flags flags)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, info, inum,
- snapshot, insert, flags);
- int ret = bkey_err(k);
- if (ret)
- return ret;
- if (k.k) {
- bch2_trans_iter_exit(trans, &iter);
- return bch_err_throw(trans->c, EEXIST_str_hash_set);
- }
-
- return 0;
-}
-
-static __always_inline
-int bch2_hash_set(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- subvol_inum inum,
- struct bkey_i *insert,
- enum btree_iter_update_trigger_flags flags)
-{
- insert->k.p.inode = inum.inum;
-
- u32 snapshot;
- return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
- bch2_hash_set_in_snapshot(trans, desc, info, inum,
- snapshot, insert, flags);
-}
-
-static __always_inline
-int bch2_hash_delete_at(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct btree_iter *iter,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bkey_i *delete;
- int ret;
-
- delete = bch2_trans_kmalloc(trans, sizeof(*delete));
- ret = PTR_ERR_OR_ZERO(delete);
- if (ret)
- return ret;
-
- ret = bch2_hash_needs_whiteout(trans, desc, info, iter);
- if (ret < 0)
- return ret;
-
- bkey_init(&delete->k);
- delete->k.p = iter->pos;
- delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted;
-
- return bch2_trans_update(trans, iter, delete, flags);
-}
-
-static __always_inline
-int bch2_hash_delete(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- subvol_inum inum, const void *key)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_hash_lookup(trans, &iter, desc, info, inum, key,
- BTREE_ITER_intent);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- ret = bch2_hash_delete_at(trans, desc, info, &iter, 0);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_repair_inode_hash_info(struct btree_trans *, struct bch_inode_unpacked *);
-
-struct snapshots_seen;
-int bch2_str_hash_repair_key(struct btree_trans *,
- struct snapshots_seen *,
- const struct bch_hash_desc *,
- struct bch_hash_info *,
- struct btree_iter *, struct bkey_s_c,
- struct btree_iter *, struct bkey_s_c,
- bool *);
-
-int __bch2_str_hash_check_key(struct btree_trans *,
- struct snapshots_seen *,
- const struct bch_hash_desc *,
- struct bch_hash_info *,
- struct btree_iter *, struct bkey_s_c,
- bool *);
-
-static inline int bch2_str_hash_check_key(struct btree_trans *trans,
- struct snapshots_seen *s,
- const struct bch_hash_desc *desc,
- struct bch_hash_info *hash_info,
- struct btree_iter *k_iter, struct bkey_s_c hash_k,
- bool *updated_before_k_pos)
-{
- if (hash_k.k->type != desc->key_type)
- return 0;
-
- if (likely(desc->hash_bkey(hash_info, hash_k) == hash_k.k->p.offset))
- return 0;
-
- return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k,
- updated_before_k_pos);
-}
-
-#endif /* _BCACHEFS_STR_HASH_H */
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
deleted file mode 100644
index 020587449123..000000000000
--- a/fs/bcachefs/subvolume.c
+++ /dev/null
@@ -1,752 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_key_cache.h"
-#include "btree_update.h"
-#include "enumerated_ref.h"
-#include "errcode.h"
-#include "error.h"
-#include "fs.h"
-#include "recovery_passes.h"
-#include "snapshot.h"
-#include "subvolume.h"
-
-#include <linux/random.h>
-
-static int bch2_subvolume_delete(struct btree_trans *, u32);
-
-static int bch2_subvolume_missing(struct bch_fs *c, u32 subvolid)
-{
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
-
- prt_printf(&buf, "missing subvolume %u", subvolid);
- bool print = bch2_count_fsck_err(c, subvol_missing, &buf);
-
- int ret = bch2_run_explicit_recovery_pass(c, &buf,
- BCH_RECOVERY_PASS_check_inodes, 0);
- if (print)
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- return ret;
-}
-
-static struct bpos subvolume_children_pos(struct bkey_s_c k)
-{
- if (k.k->type != KEY_TYPE_subvolume)
- return POS_MIN;
-
- struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
- if (!s.v->fs_path_parent)
- return POS_MIN;
- return POS(le32_to_cpu(s.v->fs_path_parent), s.k->p.offset);
-}
-
-static int check_subvol(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bkey_s_c_subvolume subvol;
- struct btree_iter subvol_children_iter = {};
- struct bch_snapshot snapshot;
- struct printbuf buf = PRINTBUF;
- unsigned snapid;
- int ret = 0;
-
- if (k.k->type != KEY_TYPE_subvolume)
- return 0;
-
- subvol = bkey_s_c_to_subvolume(k);
- snapid = le32_to_cpu(subvol.v->snapshot);
- ret = bch2_snapshot_lookup(trans, snapid, &snapshot);
-
- if (bch2_err_matches(ret, ENOENT))
- return bch2_run_print_explicit_recovery_pass(c,
- BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret;
- if (ret)
- return ret;
-
- if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
- ret = bch2_subvolume_delete(trans, iter->pos.offset);
- bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
- return ret ?: -BCH_ERR_transaction_restart_nested;
- }
-
- if (fsck_err_on(subvol.k->p.offset == BCACHEFS_ROOT_SUBVOL &&
- subvol.v->fs_path_parent,
- trans, subvol_root_fs_path_parent_nonzero,
- "root subvolume has nonzero fs_path_parent\n%s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- struct bkey_i_subvolume *n =
- bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- goto err;
-
- n->v.fs_path_parent = 0;
- }
-
- if (subvol.v->fs_path_parent) {
- struct bpos pos = subvolume_children_pos(k);
-
- struct bkey_s_c subvol_children_k =
- bch2_bkey_get_iter(trans, &subvol_children_iter,
- BTREE_ID_subvolume_children, pos, 0);
- ret = bkey_err(subvol_children_k);
- if (ret)
- goto err;
-
- if (fsck_err_on(subvol_children_k.k->type != KEY_TYPE_set,
- trans, subvol_children_not_set,
- "subvolume not set in subvolume_children btree at %llu:%llu\n%s",
- pos.inode, pos.offset,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, true);
- if (ret)
- goto err;
- }
- }
-
- struct bch_inode_unpacked inode;
- ret = bch2_inode_find_by_inum_nowarn_trans(trans,
- (subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) },
- &inode);
- if (!ret) {
- if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset,
- trans, subvol_root_wrong_bi_subvol,
- "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu",
- inode.bi_inum, inode.bi_snapshot,
- inode.bi_subvol, subvol.k->p.offset)) {
- inode.bi_subvol = subvol.k->p.offset;
- inode.bi_snapshot = le32_to_cpu(subvol.v->snapshot);
- ret = __bch2_fsck_write_inode(trans, &inode);
- if (ret)
- goto err;
- }
- } else if (bch2_err_matches(ret, ENOENT)) {
- if (fsck_err(trans, subvol_to_missing_root,
- "subvolume %llu points to missing subvolume root %llu:%u",
- k.k->p.offset, le64_to_cpu(subvol.v->inode),
- le32_to_cpu(subvol.v->snapshot))) {
- /*
- * Recreate - any contents that are still disconnected
- * will then get reattached under lost+found
- */
- bch2_inode_init_early(c, &inode);
- bch2_inode_init_late(c, &inode, bch2_current_time(c),
- 0, 0, S_IFDIR|0700, 0, NULL);
- inode.bi_inum = le64_to_cpu(subvol.v->inode);
- inode.bi_snapshot = le32_to_cpu(subvol.v->snapshot);
- inode.bi_subvol = k.k->p.offset;
- inode.bi_parent_subvol = le32_to_cpu(subvol.v->fs_path_parent);
- ret = __bch2_fsck_write_inode(trans, &inode);
- if (ret)
- goto err;
- }
- } else {
- goto err;
- }
-
- if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
- u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot));
- u32 snapshot_tree = bch2_snapshot_tree(c, snapshot_root);
-
- struct bch_snapshot_tree st;
- ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st);
-
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
- "%s: snapshot tree %u not found", __func__, snapshot_tree);
-
- if (ret)
- goto err;
-
- if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset,
- trans, subvol_not_master_and_not_snapshot,
-				"subvolume %llu is not set as a snapshot but is not the master subvolume",
- k.k->p.offset)) {
- struct bkey_i_subvolume *s =
- bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
- ret = PTR_ERR_OR_ZERO(s);
- if (ret)
- goto err;
-
- SET_BCH_SUBVOLUME_SNAP(&s->v, true);
- }
- }
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &subvol_children_iter);
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_check_subvols(struct bch_fs *c)
-{
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_subvol(trans, &iter, k)));
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int check_subvol_child(struct btree_trans *trans,
- struct btree_iter *child_iter,
- struct bkey_s_c child_k)
-{
- struct bch_subvolume s;
- int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, child_k.k->p.offset),
- 0, subvolume, &s);
- if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
-
- if (fsck_err_on(ret ||
- le32_to_cpu(s.fs_path_parent) != child_k.k->p.inode,
- trans, subvol_children_bad,
- "incorrect entry in subvolume_children btree %llu:%llu",
- child_k.k->p.inode, child_k.k->p.offset)) {
- ret = bch2_btree_delete_at(trans, child_iter, 0);
- if (ret)
- goto err;
- }
-err:
-fsck_err:
- return ret;
-}
-
-int bch2_check_subvol_children(struct bch_fs *c)
-{
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_subvol_child(trans, &iter, k)));
- bch_err_fn(c, ret);
-	return ret;
-}
-
-/* Subvolumes: */
-
-int bch2_subvolume_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_subvolume subvol = bkey_s_c_to_subvolume(k);
- int ret = 0;
-
- bkey_fsck_err_on(bkey_lt(k.k->p, SUBVOL_POS_MIN) ||
- bkey_gt(k.k->p, SUBVOL_POS_MAX),
- c, subvol_pos_bad,
- "invalid pos");
-
- bkey_fsck_err_on(!subvol.v->snapshot,
- c, subvol_snapshot_bad,
- "invalid snapshot");
-
- bkey_fsck_err_on(!subvol.v->inode,
- c, subvol_inode_bad,
- "invalid inode");
-fsck_err:
- return ret;
-}
-
-void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
-
- prt_printf(out, "root %llu snapshot id %u",
- le64_to_cpu(s.v->inode),
- le32_to_cpu(s.v->snapshot));
-
- if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, creation_parent)) {
- prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent));
- prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent));
- }
-
- if (BCH_SUBVOLUME_RO(s.v))
- prt_printf(out, " ro");
- if (BCH_SUBVOLUME_SNAP(s.v))
- prt_printf(out, " snapshot");
- if (BCH_SUBVOLUME_UNLINKED(s.v))
- prt_printf(out, " unlinked");
-}
-
-static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set)
-{
- return !bpos_eq(pos, POS_MIN)
- ? bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, set)
- : 0;
-}
-
-int bch2_subvolume_trigger(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- if (flags & BTREE_TRIGGER_transactional) {
- struct bpos children_pos_old = subvolume_children_pos(old);
- struct bpos children_pos_new = subvolume_children_pos(new.s_c);
-
- if (!bpos_eq(children_pos_old, children_pos_new)) {
- int ret = subvolume_children_mod(trans, children_pos_old, false) ?:
- subvolume_children_mod(trans, children_pos_new, true);
- if (ret)
- return ret;
- }
- }
-
- return 0;
-}
-
-int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol)
-{
- struct btree_iter iter;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0);
- struct bkey_s_c k = bch2_btree_iter_peek(trans, &iter);
- bch2_trans_iter_exit(trans, &iter);
-
- return bkey_err(k) ?: k.k && k.k->p.inode == subvol
- ? -BCH_ERR_ENOTEMPTY_subvol_not_empty
- : 0;
-}
-
-static __always_inline int
-bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol,
- bool inconsistent_if_not_found,
- struct bch_subvolume *s)
-{
- int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol),
- BTREE_ITER_cached|
- BTREE_ITER_with_updates, subvolume, s);
- if (bch2_err_matches(ret, ENOENT) && inconsistent_if_not_found)
- ret = bch2_subvolume_missing(trans->c, subvol) ?: ret;
- return ret;
-}
-
-int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol,
- bool inconsistent_if_not_found,
- struct bch_subvolume *s)
-{
- return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, s);
-}
-
-int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol)
-{
- struct bch_subvolume s;
- int ret = bch2_subvolume_get_inlined(trans, subvol, true, &s);
- if (ret)
- return ret;
-
- if (BCH_SUBVOLUME_RO(&s))
- return -EROFS;
- return 0;
-}
-
-int bch2_subvol_is_ro(struct bch_fs *c, u32 subvol)
-{
- return bch2_trans_do(c, bch2_subvol_is_ro_trans(trans, subvol));
-}
-
-int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
- struct bch_subvolume *subvol)
-{
- struct bch_snapshot snap;
-
- return bch2_snapshot_lookup(trans, snapshot, &snap) ?:
- bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, subvol);
-}
-
-int __bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
- u32 *snapid, bool warn)
-{
- struct btree_iter iter;
- struct bkey_s_c_subvolume subvol;
- int ret;
-
- subvol = bch2_bkey_get_iter_typed(trans, &iter,
- BTREE_ID_subvolumes, POS(0, subvolid),
- BTREE_ITER_cached|BTREE_ITER_with_updates,
- subvolume);
- ret = bkey_err(subvol);
-
-	if (warn && bch2_err_matches(ret, ENOENT))
- ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret;
-
- if (likely(!ret))
- *snapid = le32_to_cpu(subvol.v->snapshot);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
- u32 *snapid)
-{
- return __bch2_subvolume_get_snapshot(trans, subvolid, snapid, true);
-}
-
-static int bch2_subvolume_reparent(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- u32 old_parent, u32 new_parent)
-{
- struct bkey_i_subvolume *s;
- int ret;
-
- if (k.k->type != KEY_TYPE_subvolume)
- return 0;
-
- if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, creation_parent) &&
- le32_to_cpu(bkey_s_c_to_subvolume(k).v->creation_parent) != old_parent)
- return 0;
-
- s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume);
- ret = PTR_ERR_OR_ZERO(s);
- if (ret)
- return ret;
-
- s->v.creation_parent = cpu_to_le32(new_parent);
- return 0;
-}
-
-/*
- * Separate from the snapshot tree in the snapshots btree, we record the tree
- * structure of how snapshot subvolumes were created - the parent subvolume of
- * each snapshot subvolume.
- *
- * When a subvolume is deleted, we scan for child subvolumes and reparent them,
- * to avoid dangling references:
- */
-static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete)
-{
- struct bch_subvolume s;
-
- return lockrestart_do(trans,
- bch2_subvolume_get(trans, subvolid_to_delete, true, &s)) ?:
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_subvolume_reparent(trans, &iter, k,
- subvolid_to_delete, le32_to_cpu(s.creation_parent)));
-}
-
-/*
- * Delete subvolume, mark snapshot ID as deleted, queue up snapshot
- * deletion/cleanup:
- */
-static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
-{
- struct btree_iter subvol_iter = {}, snapshot_iter = {}, snapshot_tree_iter = {};
-
- struct bkey_s_c_subvolume subvol =
- bch2_bkey_get_iter_typed(trans, &subvol_iter,
- BTREE_ID_subvolumes, POS(0, subvolid),
- BTREE_ITER_cached|BTREE_ITER_intent,
- subvolume);
- int ret = bkey_err(subvol);
- if (bch2_err_matches(ret, ENOENT))
- ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret;
- if (ret)
- goto err;
-
- u32 snapid = le32_to_cpu(subvol.v->snapshot);
-
- struct bkey_s_c_snapshot snapshot =
- bch2_bkey_get_iter_typed(trans, &snapshot_iter,
- BTREE_ID_snapshots, POS(0, snapid),
- 0, snapshot);
- ret = bkey_err(snapshot);
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
- "missing snapshot %u", snapid);
- if (ret)
- goto err;
-
- u32 treeid = le32_to_cpu(snapshot.v->tree);
-
- struct bkey_s_c_snapshot_tree snapshot_tree =
- bch2_bkey_get_iter_typed(trans, &snapshot_tree_iter,
- BTREE_ID_snapshot_trees, POS(0, treeid),
- 0, snapshot_tree);
- ret = bkey_err(snapshot_tree);
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
- "missing snapshot tree %u", treeid);
- if (ret)
- goto err;
-
- if (le32_to_cpu(snapshot_tree.v->master_subvol) == subvolid) {
- struct bkey_i_snapshot_tree *snapshot_tree_mut =
- bch2_bkey_make_mut_typed(trans, &snapshot_tree_iter,
- &snapshot_tree.s_c,
- 0, snapshot_tree);
- ret = PTR_ERR_OR_ZERO(snapshot_tree_mut);
- if (ret)
- goto err;
-
- snapshot_tree_mut->v.master_subvol = 0;
- }
-
- ret = bch2_btree_delete_at(trans, &subvol_iter, 0) ?:
- bch2_snapshot_node_set_deleted(trans, snapid);
-err:
- bch2_trans_iter_exit(trans, &snapshot_tree_iter);
- bch2_trans_iter_exit(trans, &snapshot_iter);
- bch2_trans_iter_exit(trans, &subvol_iter);
- return ret;
-}
-
-static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
-{
- int ret = bch2_subvolumes_reparent(trans, subvolid) ?:
- commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- __bch2_subvolume_delete(trans, subvolid));
-
- bch2_recovery_pass_set_no_ratelimit(trans->c, BCH_RECOVERY_PASS_check_subvols);
- return ret;
-}
-
-static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
-{
- struct bch_fs *c = container_of(work, struct bch_fs,
- snapshot_wait_for_pagecache_and_delete_work);
- int ret = 0;
-
- while (!ret) {
- mutex_lock(&c->snapshots_unlinked_lock);
- snapshot_id_list s = c->snapshots_unlinked;
- darray_init(&c->snapshots_unlinked);
- mutex_unlock(&c->snapshots_unlinked_lock);
-
- if (!s.nr)
- break;
-
- bch2_evict_subvolume_inodes(c, &s);
-
- darray_for_each(s, id) {
- ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id));
- bch_err_msg(c, ret, "deleting subvolume %u", *id);
- if (ret)
- break;
- }
-
- darray_exit(&s);
- }
-
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache);
-}
-
-struct subvolume_unlink_hook {
- struct btree_trans_commit_hook h;
- u32 subvol;
-};
-
-static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans,
- struct btree_trans_commit_hook *_h)
-{
- struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h);
- struct bch_fs *c = trans->c;
- int ret = 0;
-
- mutex_lock(&c->snapshots_unlinked_lock);
- if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol))
- ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol);
- mutex_unlock(&c->snapshots_unlinked_lock);
-
- if (ret)
- return ret;
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache))
- return -EROFS;
-
- if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache);
- return 0;
-}
-
-int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
-{
- struct btree_iter iter;
- struct bkey_i_subvolume *n;
- struct subvolume_unlink_hook *h;
- int ret = 0;
-
- h = bch2_trans_kmalloc(trans, sizeof(*h));
- ret = PTR_ERR_OR_ZERO(h);
- if (ret)
- return ret;
-
- h->h.fn = bch2_subvolume_wait_for_pagecache_and_delete_hook;
- h->subvol = subvolid;
- bch2_trans_commit_hook(trans, &h->h);
-
- n = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_subvolumes, POS(0, subvolid),
- BTREE_ITER_cached, subvolume);
- ret = PTR_ERR_OR_ZERO(n);
- if (bch2_err_matches(ret, ENOENT))
- ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret;
- if (unlikely(ret))
- return ret;
-
- SET_BCH_SUBVOLUME_UNLINKED(&n->v, true);
- n->v.fs_path_parent = 0;
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
- u32 parent_subvolid,
- u32 src_subvolid,
- u32 *new_subvolid,
- u32 *new_snapshotid,
- bool ro)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter dst_iter, src_iter = {};
- struct bkey_i_subvolume *new_subvol = NULL;
- struct bkey_i_subvolume *src_subvol = NULL;
- u32 parent = 0, new_nodes[2], snapshot_subvols[2];
- int ret = 0;
-
- ret = bch2_bkey_get_empty_slot(trans, &dst_iter,
- BTREE_ID_subvolumes, POS(0, U32_MAX));
- if (ret == -BCH_ERR_ENOSPC_btree_slot)
- ret = bch_err_throw(c, ENOSPC_subvolume_create);
- if (ret)
- return ret;
-
- snapshot_subvols[0] = dst_iter.pos.offset;
- snapshot_subvols[1] = src_subvolid;
-
- if (src_subvolid) {
- /* Creating a snapshot: */
-
- src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter,
- BTREE_ID_subvolumes, POS(0, src_subvolid),
- BTREE_ITER_cached, subvolume);
- ret = PTR_ERR_OR_ZERO(src_subvol);
- if (bch2_err_matches(ret, ENOENT))
- ret = bch2_subvolume_missing(trans->c, src_subvolid) ?: ret;
- if (unlikely(ret))
- goto err;
-
- parent = le32_to_cpu(src_subvol->v.snapshot);
- }
-
- ret = bch2_snapshot_node_create(trans, parent, new_nodes,
- snapshot_subvols,
- src_subvolid ? 2 : 1);
- if (ret)
- goto err;
-
- if (src_subvolid) {
- src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]);
- ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0);
- if (ret)
- goto err;
- }
-
- new_subvol = bch2_bkey_alloc(trans, &dst_iter, 0, subvolume);
- ret = PTR_ERR_OR_ZERO(new_subvol);
- if (ret)
- goto err;
-
- new_subvol->v.flags = 0;
- new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]);
- new_subvol->v.inode = cpu_to_le64(inode);
- new_subvol->v.creation_parent = cpu_to_le32(src_subvolid);
- new_subvol->v.fs_path_parent = cpu_to_le32(parent_subvolid);
- new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c));
- new_subvol->v.otime.hi = 0;
-
- SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro);
- SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);
-
- *new_subvolid = new_subvol->k.p.offset;
- *new_snapshotid = new_nodes[0];
-err:
- bch2_trans_iter_exit(trans, &src_iter);
- bch2_trans_iter_exit(trans, &dst_iter);
- return ret;
-}
-
-int bch2_initialize_subvolumes(struct bch_fs *c)
-{
- struct bkey_i_snapshot_tree root_tree;
- struct bkey_i_snapshot root_snapshot;
- struct bkey_i_subvolume root_volume;
- int ret;
-
- bkey_snapshot_tree_init(&root_tree.k_i);
- root_tree.k.p.offset = 1;
- root_tree.v.master_subvol = cpu_to_le32(1);
- root_tree.v.root_snapshot = cpu_to_le32(U32_MAX);
-
- bkey_snapshot_init(&root_snapshot.k_i);
- root_snapshot.k.p.offset = U32_MAX;
- root_snapshot.v.flags = 0;
- root_snapshot.v.parent = 0;
- root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL);
- root_snapshot.v.tree = cpu_to_le32(1);
- SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true);
-
- bkey_subvolume_init(&root_volume.k_i);
- root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
- root_volume.v.flags = 0;
- root_volume.v.snapshot = cpu_to_le32(U32_MAX);
- root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO);
-
- ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0, 0) ?:
- bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0, 0) ?:
- bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0, 0);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_inode_unpacked inode;
- int ret;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0);
- ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (!bkey_is_inode(k.k)) {
- struct bch_fs *c = trans->c;
- bch_err(c, "root inode not found");
- ret = bch_err_throw(c, ENOENT_inode);
- goto err;
- }
-
- ret = bch2_inode_unpack(k, &inode);
- BUG_ON(ret);
-
- inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
-
- ret = bch2_inode_write(trans, &iter, &inode);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/* set bi_subvol on root inode */
-int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
-{
- int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- __bch2_fs_upgrade_for_subvolumes(trans));
- bch_err_fn(c, ret);
- return ret;
-}
-
-void bch2_fs_subvolumes_init_early(struct bch_fs *c)
-{
- INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work,
- bch2_subvolume_wait_for_pagecache_and_delete);
-}
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
deleted file mode 100644
index 075f55e25c70..000000000000
--- a/fs/bcachefs/subvolume.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SUBVOLUME_H
-#define _BCACHEFS_SUBVOLUME_H
-
-#include "darray.h"
-#include "subvolume_types.h"
-
-int bch2_check_subvols(struct bch_fs *);
-int bch2_check_subvol_children(struct bch_fs *);
-
-int bch2_subvolume_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_subvolume ((struct bkey_ops) { \
- .key_validate = bch2_subvolume_validate, \
- .val_to_text = bch2_subvolume_to_text, \
- .trigger = bch2_subvolume_trigger, \
- .min_val_size = 16, \
-})
-
-int bch2_subvol_has_children(struct btree_trans *, u32);
-int bch2_subvolume_get(struct btree_trans *, unsigned,
- bool, struct bch_subvolume *);
-int __bch2_subvolume_get_snapshot(struct btree_trans *, u32,
- u32 *, bool);
-int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
-
-int bch2_subvol_is_ro_trans(struct btree_trans *, u32);
-int bch2_subvol_is_ro(struct bch_fs *, u32);
-
-static inline struct bkey_s_c
-bch2_btree_iter_peek_in_subvolume_max_type(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos end, u32 subvolid, unsigned flags)
-{
- u32 snapshot;
- int ret = bch2_subvolume_get_snapshot(trans, subvolid, &snapshot);
- if (ret)
- return bkey_s_c_err(ret);
-
- bch2_btree_iter_set_snapshot(trans, iter, snapshot);
- return bch2_btree_iter_peek_max_type(trans, iter, end, flags);
-}
-
-#define for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \
- _end, _subvolid, _flags, _k, _do) \
-({ \
- struct bkey_s_c _k; \
- int _ret3 = 0; \
- \
- do { \
- _ret3 = lockrestart_do(_trans, ({ \
- (_k) = bch2_btree_iter_peek_in_subvolume_max_type(trans, &(_iter),\
- _end, _subvolid, (_flags)); \
- if (!(_k).k) \
- break; \
- \
- bkey_err(_k) ?: (_do); \
- })); \
- } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \
- \
- bch2_trans_iter_exit((_trans), &(_iter)); \
- _ret3; \
-})
-
-#define for_each_btree_key_in_subvolume_max(_trans, _iter, _btree_id, \
- _start, _end, _subvolid, _flags, _k, _do) \
-({ \
- struct btree_iter _iter; \
- bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- \
- for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \
- _end, _subvolid, _flags, _k, _do); \
-})
-
-int bch2_subvolume_unlink(struct btree_trans *, u32);
-int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool);
-
-int bch2_initialize_subvolumes(struct bch_fs *);
-int bch2_fs_upgrade_for_subvolumes(struct bch_fs *);
-
-void bch2_fs_subvolumes_init_early(struct bch_fs *);
-
-#endif /* _BCACHEFS_SUBVOLUME_H */
diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h
deleted file mode 100644
index e029df7ba89f..000000000000
--- a/fs/bcachefs/subvolume_format.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SUBVOLUME_FORMAT_H
-#define _BCACHEFS_SUBVOLUME_FORMAT_H
-
-#define SUBVOL_POS_MIN POS(0, 1)
-#define SUBVOL_POS_MAX POS(0, S32_MAX)
-#define BCACHEFS_ROOT_SUBVOL 1
-
-struct bch_subvolume {
- struct bch_val v;
- __le32 flags;
- __le32 snapshot;
- __le64 inode;
- /*
- * Snapshot subvolumes form a tree, separate from the snapshot nodes
- * tree - if this subvolume is a snapshot, this is the ID of the
- * subvolume it was created from:
- *
- * This is _not_ necessarily the subvolume of the directory containing
- * this subvolume:
- */
- __le32 creation_parent;
- __le32 fs_path_parent;
- bch_le128 otime;
-};
-
-LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1)
-/*
- * We need to know whether a subvolume is a snapshot so we can know whether we
- * can delete it (or whether it should just be rm -rf'd)
- */
-LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2)
-LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3)
-
-#endif /* _BCACHEFS_SUBVOLUME_FORMAT_H */
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
deleted file mode 100644
index 9d634b906dcd..000000000000
--- a/fs/bcachefs/subvolume_types.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SUBVOLUME_TYPES_H
-#define _BCACHEFS_SUBVOLUME_TYPES_H
-
-typedef struct {
- /* we can't have padding in this struct: */
- u64 subvol;
- u64 inum;
-} subvol_inum;
-
-#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
deleted file mode 100644
index 6c2e1d647403..000000000000
--- a/fs/bcachefs/super-io.c
+++ /dev/null
@@ -1,1562 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "checksum.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "error.h"
-#include "journal.h"
-#include "journal_sb.h"
-#include "journal_seq_blacklist.h"
-#include "recovery_passes.h"
-#include "replicas.h"
-#include "quota.h"
-#include "sb-clean.h"
-#include "sb-counters.h"
-#include "sb-downgrade.h"
-#include "sb-errors.h"
-#include "sb-members.h"
-#include "super-io.h"
-#include "super.h"
-#include "trace.h"
-#include "vstructs.h"
-
-#include <linux/backing-dev.h>
-#include <linux/sort.h>
-#include <linux/string_choices.h>
-
-struct bch2_metadata_version {
- u16 version;
- const char *name;
-};
-
-static const struct bch2_metadata_version bch2_metadata_versions[] = {
-#define x(n, v) { \
- .version = v, \
- .name = #n, \
-},
- BCH_METADATA_VERSIONS()
-#undef x
-};
-
-void bch2_version_to_text(struct printbuf *out, enum bcachefs_metadata_version v)
-{
- const char *str = "(unknown version)";
-
- for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++)
- if (bch2_metadata_versions[i].version == v) {
- str = bch2_metadata_versions[i].name;
- break;
- }
-
- prt_printf(out, "%u.%u: %s", BCH_VERSION_MAJOR(v), BCH_VERSION_MINOR(v), str);
-}
-
-enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version v)
-{
- if (!BCH_VERSION_MAJOR(v))
- return v;
-
- for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++)
- if (bch2_metadata_versions[i].version > v &&
- BCH_VERSION_MAJOR(bch2_metadata_versions[i].version) ==
- BCH_VERSION_MAJOR(v))
- v = bch2_metadata_versions[i].version;
-
- return v;
-}
-
-int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version)
-{
- int ret = ((c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) &&
- version <= c->sb.version_incompat_allowed)
- ? 0
- : -BCH_ERR_may_not_use_incompat_feature;
-
- mutex_lock(&c->sb_lock);
- if (!ret) {
- SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb,
- max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version));
- bch2_write_super(c);
- } else {
- darray_for_each(c->incompat_versions_requested, i)
- if (version == *i)
- goto out;
-
- darray_push(&c->incompat_versions_requested, version);
- struct printbuf buf = PRINTBUF;
- prt_str(&buf, "requested incompat feature ");
- bch2_version_to_text(&buf, version);
- prt_str(&buf, " currently not enabled, allowed up to ");
-		bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
- prt_printf(&buf, "\n set version_upgrade=incompat to enable");
-
- bch_notice(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-
-out:
- mutex_unlock(&c->sb_lock);
-
- return ret;
-}
-
-const char * const bch2_sb_fields[] = {
-#define x(name, nr) #name,
- BCH_SB_FIELDS()
-#undef x
- NULL
-};
-
-static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *,
- enum bch_validate_flags, struct printbuf *);
-
-struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb,
- enum bch_sb_field_type type)
-{
- /* XXX: need locking around superblock to access optional fields */
-
- vstruct_for_each(sb, f)
- if (le32_to_cpu(f->type) == type)
- return f;
- return NULL;
-}
-
-static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb,
- struct bch_sb_field *f,
- unsigned u64s)
-{
- unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0;
- unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s;
-
- BUG_ON(__vstruct_bytes(struct bch_sb, sb_u64s) > sb->buffer_size);
-
- if (!f && !u64s) {
- /* nothing to do: */
- } else if (!f) {
- f = vstruct_last(sb->sb);
- memset(f, 0, sizeof(u64) * u64s);
- f->u64s = cpu_to_le32(u64s);
- f->type = 0;
- } else {
- void *src, *dst;
-
- src = vstruct_end(f);
-
- if (u64s) {
- f->u64s = cpu_to_le32(u64s);
- dst = vstruct_end(f);
- } else {
- dst = f;
- }
-
- memmove(dst, src, vstruct_end(sb->sb) - src);
-
- if (dst > src)
- memset(src, 0, dst - src);
- }
-
- sb->sb->u64s = cpu_to_le32(sb_u64s);
-
- return u64s ? f : NULL;
-}
-
-void bch2_sb_field_delete(struct bch_sb_handle *sb,
- enum bch_sb_field_type type)
-{
- struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type);
-
- if (f)
- __bch2_sb_field_resize(sb, f, 0);
-}
-
-/* Superblock realloc/free: */
-
-void bch2_free_super(struct bch_sb_handle *sb)
-{
- kfree(sb->bio);
- if (!IS_ERR_OR_NULL(sb->s_bdev_file))
- bdev_fput(sb->s_bdev_file);
- kfree(sb->holder);
- kfree(sb->sb_name);
-
- kfree(sb->sb);
- memset(sb, 0, sizeof(*sb));
-}
-
-int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
-{
- size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s);
- size_t new_buffer_size;
- struct bch_sb *new_sb;
- struct bio *bio;
-
- if (sb->bdev)
- new_bytes = max_t(size_t, new_bytes, bdev_logical_block_size(sb->bdev));
-
- new_buffer_size = roundup_pow_of_two(new_bytes);
-
- if (sb->sb && sb->buffer_size >= new_buffer_size)
- return 0;
-
- if (sb->sb && sb->have_layout) {
- u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
-
- if (new_bytes > max_bytes) {
- struct printbuf buf = PRINTBUF;
-
- prt_bdevname(&buf, sb->bdev);
- prt_printf(&buf, ": superblock too big: want %zu but have %llu", new_bytes, max_bytes);
- pr_err("%s", buf.buf);
- printbuf_exit(&buf);
- return -BCH_ERR_ENOSPC_sb;
- }
- }
-
- if (sb->buffer_size >= new_buffer_size && sb->sb)
- return 0;
-
- if (dynamic_fault("bcachefs:add:super_realloc"))
- return -BCH_ERR_ENOMEM_sb_realloc_injected;
-
- new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO);
- if (!new_sb)
- return -BCH_ERR_ENOMEM_sb_buf_realloc;
-
- sb->sb = new_sb;
-
- if (sb->have_bio) {
- unsigned nr_bvecs = buf_pages(sb->sb, new_buffer_size);
-
- bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
- if (!bio)
- return -BCH_ERR_ENOMEM_sb_bio_realloc;
-
- bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0);
-
- kfree(sb->bio);
- sb->bio = bio;
- }
-
- sb->buffer_size = new_buffer_size;
-
- return 0;
-}
-
-struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb,
- enum bch_sb_field_type type,
- unsigned u64s)
-{
- struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type);
- ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
- ssize_t d = -old_u64s + u64s;
-
- if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
- return NULL;
-
- if (sb->fs_sb) {
- struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb);
-
- lockdep_assert_held(&c->sb_lock);
-
-		/* XXX: we're not checking that offline devices have enough space */
-
- for_each_online_member(c, ca, BCH_DEV_READ_REF_sb_field_resize) {
- struct bch_sb_handle *dev_sb = &ca->disk_sb;
-
- if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) {
- enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_sb_field_resize);
- return NULL;
- }
- }
- }
-
- f = bch2_sb_field_get_id(sb->sb, type);
- f = __bch2_sb_field_resize(sb, f, u64s);
- if (f)
- f->type = cpu_to_le32(type);
- return f;
-}
-
-struct bch_sb_field *bch2_sb_field_get_minsize_id(struct bch_sb_handle *sb,
- enum bch_sb_field_type type,
- unsigned u64s)
-{
- struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type);
-
- if (!f || le32_to_cpu(f->u64s) < u64s)
- f = bch2_sb_field_resize_id(sb, type, u64s);
- return f;
-}
-
-/* Superblock validate: */
-
-static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out)
-{
- u64 offset, prev_offset, max_sectors;
- unsigned i;
-
- BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
-
- if (!uuid_equal(&layout->magic, &BCACHE_MAGIC) &&
- !uuid_equal(&layout->magic, &BCHFS_MAGIC)) {
- prt_printf(out, "Not a bcachefs superblock layout");
- return -BCH_ERR_invalid_sb_layout;
- }
-
- if (layout->layout_type != 0) {
- prt_printf(out, "Invalid superblock layout type %u",
- layout->layout_type);
- return -BCH_ERR_invalid_sb_layout_type;
- }
-
- if (!layout->nr_superblocks) {
- prt_printf(out, "Invalid superblock layout: no superblocks");
- return -BCH_ERR_invalid_sb_layout_nr_superblocks;
- }
-
- if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) {
- prt_printf(out, "Invalid superblock layout: too many superblocks");
- return -BCH_ERR_invalid_sb_layout_nr_superblocks;
- }
-
- if (layout->sb_max_size_bits > BCH_SB_LAYOUT_SIZE_BITS_MAX) {
- prt_printf(out, "Invalid superblock layout: max_size_bits too high");
- return -BCH_ERR_invalid_sb_layout_sb_max_size_bits;
- }
-
- max_sectors = 1 << layout->sb_max_size_bits;
-
- prev_offset = le64_to_cpu(layout->sb_offset[0]);
-
- for (i = 1; i < layout->nr_superblocks; i++) {
- offset = le64_to_cpu(layout->sb_offset[i]);
-
- if (offset < prev_offset + max_sectors) {
- prt_printf(out, "Invalid superblock layout: superblocks overlap\n"
-			   " (sb %u ends at %llu, next starts at %llu)",
- i - 1, prev_offset + max_sectors, offset);
- return -BCH_ERR_invalid_sb_layout_superblocks_overlap;
- }
- prev_offset = offset;
- }
-
- return 0;
-}
-
-static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out)
-{
- u16 version = le16_to_cpu(sb->version);
- u16 version_min = le16_to_cpu(sb->version_min);
-
- if (!bch2_version_compatible(version)) {
- prt_str(out, "Unsupported superblock version ");
- bch2_version_to_text(out, version);
- prt_str(out, " (min ");
- bch2_version_to_text(out, bcachefs_metadata_version_min);
- prt_str(out, ", max ");
- bch2_version_to_text(out, bcachefs_metadata_version_current);
- prt_str(out, ")");
- return -BCH_ERR_invalid_sb_version;
- }
-
- if (!bch2_version_compatible(version_min)) {
- prt_str(out, "Unsupported superblock version_min ");
- bch2_version_to_text(out, version_min);
- prt_str(out, " (min ");
- bch2_version_to_text(out, bcachefs_metadata_version_min);
- prt_str(out, ", max ");
- bch2_version_to_text(out, bcachefs_metadata_version_current);
- prt_str(out, ")");
- return -BCH_ERR_invalid_sb_version;
- }
-
- if (version_min > version) {
- prt_str(out, "Bad minimum version ");
- bch2_version_to_text(out, version_min);
- prt_str(out, ", greater than version field ");
- bch2_version_to_text(out, version);
- return -BCH_ERR_invalid_sb_version;
- }
-
- return 0;
-}
-
-int bch2_sb_validate(struct bch_sb *sb, u64 read_offset,
- enum bch_validate_flags flags, struct printbuf *out)
-{
- enum bch_opt_id opt_id;
- int ret;
-
- ret = bch2_sb_compatible(sb, out);
- if (ret)
- return ret;
-
- u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR);
- unsigned incompat_bit = 0;
- if (incompat)
- incompat_bit = __ffs64(incompat);
- else if (sb->features[1])
- incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1]));
-
- if (incompat_bit) {
- prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)",
- incompat_bit,
- bch2_sb_features[BCH_FEATURE_NR - 1],
- BCH_FEATURE_NR - 1);
- return -BCH_ERR_invalid_sb_features;
- }
-
- if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) ||
- BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) {
- prt_str(out, "Filesystem has incompatible version ");
- bch2_version_to_text(out, le16_to_cpu(sb->version));
- prt_str(out, ", current version ");
- bch2_version_to_text(out, bcachefs_metadata_version_current);
- return -BCH_ERR_invalid_sb_features;
- }
-
- if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) {
- prt_printf(out, "Bad user UUID (got zeroes)");
- return -BCH_ERR_invalid_sb_uuid;
- }
-
- if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) {
- prt_printf(out, "Bad internal UUID (got zeroes)");
- return -BCH_ERR_invalid_sb_uuid;
- }
-
- if (!(flags & BCH_VALIDATE_write) &&
- le64_to_cpu(sb->offset) != read_offset) {
- prt_printf(out, "Bad sb offset (got %llu, read from %llu)",
- le64_to_cpu(sb->offset), read_offset);
- return -BCH_ERR_invalid_sb_offset;
- }
-
- if (!sb->nr_devices ||
- sb->nr_devices > BCH_SB_MEMBERS_MAX) {
- prt_printf(out, "Bad number of member devices %u (max %u)",
- sb->nr_devices, BCH_SB_MEMBERS_MAX);
- return -BCH_ERR_invalid_sb_too_many_members;
- }
-
- if (sb->dev_idx >= sb->nr_devices) {
- prt_printf(out, "Bad dev_idx (got %u, nr_devices %u)",
- sb->dev_idx, sb->nr_devices);
- return -BCH_ERR_invalid_sb_dev_idx;
- }
-
- if (!sb->time_precision ||
- le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) {
- prt_printf(out, "Invalid time precision: %u (min 1, max %lu)",
- le32_to_cpu(sb->time_precision), NSEC_PER_SEC);
- return -BCH_ERR_invalid_sb_time_precision;
- }
-
- /* old versions didn't know to downgrade this field */
- if (BCH_SB_VERSION_INCOMPAT_ALLOWED(sb) > le16_to_cpu(sb->version))
- SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, le16_to_cpu(sb->version));
-
- if (BCH_SB_VERSION_INCOMPAT(sb) > BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)) {
- prt_printf(out, "Invalid version_incompat ");
- bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb));
- prt_str(out, " > incompat_allowed ");
- bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb));
- if (flags & BCH_VALIDATE_write)
- return -BCH_ERR_invalid_sb_version;
- else
- SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, BCH_SB_VERSION_INCOMPAT(sb));
- }
-
- if (sb->nr_devices > 1)
- SET_BCH_SB_MULTI_DEVICE(sb, true);
-
- if (!flags) {
- /*
-		 * We've been seeing a bug where these are getting inexplicably
-		 * zeroed, so we're now validating them, but we have to be
-		 * careful not to prevent people's filesystems from mounting:
- */
- if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb))
- SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000);
- if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb))
- SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000);
-
- if (!BCH_SB_VERSION_UPGRADE_COMPLETE(sb))
- SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb, le16_to_cpu(sb->version));
-
- if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2 &&
- !BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb))
- SET_BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb, 30);
-
- if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2)
- SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true);
-
- if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb))
- SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30);
-
- if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_extent_flags &&
- !BCH_SB_CSUM_ERR_RETRY_NR(sb))
- SET_BCH_SB_CSUM_ERR_RETRY_NR(sb, 3);
- }
-
-#ifdef __KERNEL__
- if (!BCH_SB_SHARD_INUMS_NBITS(sb))
- SET_BCH_SB_SHARD_INUMS_NBITS(sb, ilog2(roundup_pow_of_two(num_online_cpus())));
-#endif
-
- for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) {
- const struct bch_option *opt = bch2_opt_table + opt_id;
-
- if (opt->get_sb) {
- u64 v = bch2_opt_from_sb(sb, opt_id, -1);
-
- prt_printf(out, "Invalid option ");
- ret = bch2_opt_validate(opt, v, out);
- if (ret)
- return ret;
-
- printbuf_reset(out);
- }
- }
-
- /* validate layout */
- ret = validate_sb_layout(&sb->layout, out);
- if (ret)
- return ret;
-
- vstruct_for_each(sb, f) {
- if (!f->u64s) {
- prt_printf(out, "Invalid superblock: optional field with size 0 (type %u)",
- le32_to_cpu(f->type));
- return -BCH_ERR_invalid_sb_field_size;
- }
-
- if (vstruct_next(f) > vstruct_last(sb)) {
- prt_printf(out, "Invalid superblock: optional field extends past end of superblock (type %u)",
- le32_to_cpu(f->type));
- return -BCH_ERR_invalid_sb_field_size;
- }
- }
-
- struct bch_sb_field *mi =
- bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v2) ?:
- bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v1);
-
- /* members must be validated first: */
- if (!mi) {
- prt_printf(out, "Invalid superblock: member info area missing");
- return -BCH_ERR_invalid_sb_members_missing;
- }
-
- ret = bch2_sb_field_validate(sb, mi, flags, out);
- if (ret)
- return ret;
-
- vstruct_for_each(sb, f) {
- if (le32_to_cpu(f->type) == BCH_SB_FIELD_members_v1)
- continue;
-
- ret = bch2_sb_field_validate(sb, f, flags, out);
- if (ret)
- return ret;
- }
-
- if ((flags & BCH_VALIDATE_write) &&
- bch2_sb_member_get(sb, sb->dev_idx).seq != sb->seq) {
- prt_printf(out, "Invalid superblock: member seq %llu != sb seq %llu",
- le64_to_cpu(bch2_sb_member_get(sb, sb->dev_idx).seq),
- le64_to_cpu(sb->seq));
- return -BCH_ERR_invalid_sb_members_missing;
- }
-
- return 0;
-}
-
-/* device open: */
-
-static unsigned long le_ulong_to_cpu(unsigned long v)
-{
- return sizeof(unsigned long) == 8
- ? le64_to_cpu(v)
- : le32_to_cpu(v);
-}
-
-static void le_bitvector_to_cpu(unsigned long *dst, unsigned long *src, unsigned nr)
-{
- BUG_ON(nr & (BITS_PER_TYPE(long) - 1));
-
- for (unsigned i = 0; i < BITS_TO_LONGS(nr); i++)
- dst[i] = le_ulong_to_cpu(src[i]);
-}
-
-static void bch2_sb_update(struct bch_fs *c)
-{
- struct bch_sb *src = c->disk_sb.sb;
-
- lockdep_assert_held(&c->sb_lock);
-
- c->sb.uuid = src->uuid;
- c->sb.user_uuid = src->user_uuid;
- c->sb.version = le16_to_cpu(src->version);
- c->sb.version_incompat = BCH_SB_VERSION_INCOMPAT(src);
- c->sb.version_incompat_allowed
- = BCH_SB_VERSION_INCOMPAT_ALLOWED(src);
- c->sb.version_min = le16_to_cpu(src->version_min);
- c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src);
- c->sb.nr_devices = src->nr_devices;
- c->sb.clean = BCH_SB_CLEAN(src);
- c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src);
-
- c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision);
- c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit;
-
- /* XXX this is wrong, we need a 96 or 128 bit integer type */
- c->sb.time_base_lo = div_u64(le64_to_cpu(src->time_base_lo),
- c->sb.nsec_per_time_unit);
- c->sb.time_base_hi = le32_to_cpu(src->time_base_hi);
-
- c->sb.features = le64_to_cpu(src->features[0]);
- c->sb.compat = le64_to_cpu(src->compat[0]);
- c->sb.multi_device = BCH_SB_MULTI_DEVICE(src);
-
- memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent));
-
- struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext);
- if (ext) {
- c->sb.recovery_passes_required =
- bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
-
- le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent,
- sizeof(c->sb.errors_silent) * 8);
- c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data);
- }
-
- for_each_member_device(c, ca) {
- struct bch_member m = bch2_sb_member_get(src, ca->dev_idx);
- ca->mi = bch2_mi_to_cpu(&m);
- }
-}
-
-static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
-{
- struct bch_sb_field *src_f, *dst_f;
- struct bch_sb *dst = dst_handle->sb;
- unsigned i;
-
- dst->version = src->version;
- dst->version_min = src->version_min;
- dst->seq = src->seq;
- dst->uuid = src->uuid;
- dst->user_uuid = src->user_uuid;
- memcpy(dst->label, src->label, sizeof(dst->label));
-
- dst->block_size = src->block_size;
- dst->nr_devices = src->nr_devices;
-
- dst->time_base_lo = src->time_base_lo;
- dst->time_base_hi = src->time_base_hi;
- dst->time_precision = src->time_precision;
- dst->write_time = src->write_time;
-
- memcpy(dst->flags, src->flags, sizeof(dst->flags));
- memcpy(dst->features, src->features, sizeof(dst->features));
- memcpy(dst->compat, src->compat, sizeof(dst->compat));
-
- for (i = 0; i < BCH_SB_FIELD_NR; i++) {
- int d;
-
- if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS)
- continue;
-
- src_f = bch2_sb_field_get_id(src, i);
- dst_f = bch2_sb_field_get_id(dst, i);
-
- d = (src_f ? le32_to_cpu(src_f->u64s) : 0) -
- (dst_f ? le32_to_cpu(dst_f->u64s) : 0);
- if (d > 0) {
- int ret = bch2_sb_realloc(dst_handle,
- le32_to_cpu(dst_handle->sb->u64s) + d);
-
- if (ret)
- return ret;
-
- dst = dst_handle->sb;
- dst_f = bch2_sb_field_get_id(dst, i);
- }
-
- dst_f = __bch2_sb_field_resize(dst_handle, dst_f,
- src_f ? le32_to_cpu(src_f->u64s) : 0);
-
- if (src_f)
- memcpy(dst_f, src_f, vstruct_bytes(src_f));
- }
-
- return 0;
-}
-
-int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
-{
- int ret;
-
- lockdep_assert_held(&c->sb_lock);
-
- ret = bch2_sb_realloc(&c->disk_sb, 0) ?:
- __copy_super(&c->disk_sb, src) ?:
- bch2_sb_replicas_to_cpu_replicas(c) ?:
- bch2_sb_disk_groups_to_cpu(c);
- if (ret)
- return ret;
-
- bch2_sb_update(c);
- return 0;
-}
-
-int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
-{
- return __copy_super(&ca->disk_sb, c->disk_sb.sb);
-}
-
-/* read superblock: */
-
-static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err)
-{
- size_t bytes;
- int ret;
-reread:
- bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
- sb->bio->bi_iter.bi_sector = offset;
- bch2_bio_map(sb->bio, sb->sb, sb->buffer_size);
-
- ret = submit_bio_wait(sb->bio);
- if (ret) {
- prt_printf(err, "IO error: %i", ret);
- return ret;
- }
-
- if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) &&
- !uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) {
- prt_str(err, "Not a bcachefs superblock (got magic ");
- pr_uuid(err, sb->sb->magic.b);
- prt_str(err, ")");
- return -BCH_ERR_invalid_sb_magic;
- }
-
- ret = bch2_sb_compatible(sb->sb, err);
- if (ret)
- return ret;
-
- bytes = vstruct_bytes(sb->sb);
-
- u64 sb_size = 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits);
- if (bytes > sb_size) {
- prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %llu)",
- bytes, sb_size);
- return -BCH_ERR_invalid_sb_too_big;
- }
-
- if (bytes > sb->buffer_size) {
- ret = bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s));
- if (ret)
- return ret;
- goto reread;
- }
-
- enum bch_csum_type csum_type = BCH_SB_CSUM_TYPE(sb->sb);
- if (csum_type >= BCH_CSUM_NR ||
- bch2_csum_type_is_encryption(csum_type)) {
- prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb));
- return -BCH_ERR_invalid_sb_csum_type;
- }
-
- /* XXX: verify MACs */
- struct bch_csum csum = csum_vstruct(NULL, csum_type, null_nonce(), sb->sb);
- if (bch2_crc_cmp(csum, sb->sb->csum)) {
- bch2_csum_err_msg(err, csum_type, sb->sb->csum, csum);
- return -BCH_ERR_invalid_sb_csum;
- }
-
- sb->seq = le64_to_cpu(sb->sb->seq);
-
- return 0;
-}
-
-static int __bch2_read_super(const char *path, struct bch_opts *opts,
- struct bch_sb_handle *sb, bool ignore_notbchfs_msg)
-{
- u64 offset = opt_get(*opts, sb);
- struct bch_sb_layout layout;
- struct printbuf err = PRINTBUF;
- struct printbuf err2 = PRINTBUF;
- __le64 *i;
- int ret;
-#ifndef __KERNEL__
-retry:
-#endif
- memset(sb, 0, sizeof(*sb));
- sb->mode = BLK_OPEN_READ;
- sb->have_bio = true;
- sb->holder = kzalloc(sizeof(*sb->holder), GFP_KERNEL);
- if (!sb->holder)
- return -ENOMEM;
-
- sb->sb_name = kstrdup(path, GFP_KERNEL);
- if (!sb->sb_name) {
- ret = -ENOMEM;
- prt_printf(&err, "error allocating memory for sb_name");
- goto err;
- }
-
-#ifndef __KERNEL__
- if (opt_get(*opts, direct_io) == false)
- sb->mode |= BLK_OPEN_BUFFERED;
-#endif
-
- if (!opt_get(*opts, noexcl))
- sb->mode |= BLK_OPEN_EXCL;
-
- if (!opt_get(*opts, nochanges))
- sb->mode |= BLK_OPEN_WRITE;
-
- sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
- if (IS_ERR(sb->s_bdev_file) &&
- PTR_ERR(sb->s_bdev_file) == -EACCES &&
- opt_get(*opts, read_only)) {
- sb->mode &= ~BLK_OPEN_WRITE;
-
- sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
- if (!IS_ERR(sb->s_bdev_file))
- opt_set(*opts, nochanges, true);
- }
-
- if (IS_ERR(sb->s_bdev_file)) {
- ret = PTR_ERR(sb->s_bdev_file);
- prt_printf(&err, "error opening %s: %s", path, bch2_err_str(ret));
- goto err;
- }
- sb->bdev = file_bdev(sb->s_bdev_file);
-
- ret = bch2_sb_realloc(sb, 0);
- if (ret) {
- prt_printf(&err, "error allocating memory for superblock");
- goto err;
- }
-
- if (bch2_fs_init_fault("read_super")) {
- prt_printf(&err, "dynamic fault");
- ret = -EFAULT;
- goto err;
- }
-
- ret = read_one_super(sb, offset, &err);
- if (!ret)
- goto got_super;
-
- if (opt_defined(*opts, sb))
- goto err;
-
- prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n",
- path, err.buf);
- if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg)
- bch2_print_opts(opts, KERN_INFO "%s", err2.buf);
- else
- bch2_print_opts(opts, KERN_ERR "%s", err2.buf);
-
- printbuf_exit(&err2);
- printbuf_reset(&err);
-
- /*
- * Error reading primary superblock - read location of backup
- * superblocks:
- */
- bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
- sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR;
- /*
- * use sb buffer to read layout, since sb buffer is page aligned but
- * layout won't be:
- */
- bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout));
-
- ret = submit_bio_wait(sb->bio);
- if (ret) {
- prt_printf(&err, "IO error: %i", ret);
- goto err;
- }
-
- memcpy(&layout, sb->sb, sizeof(layout));
- ret = validate_sb_layout(&layout, &err);
- if (ret)
- goto err;
-
- for (i = layout.sb_offset;
- i < layout.sb_offset + layout.nr_superblocks; i++) {
- offset = le64_to_cpu(*i);
-
- if (offset == opt_get(*opts, sb)) {
- ret = -BCH_ERR_invalid;
- continue;
- }
-
- ret = read_one_super(sb, offset, &err);
- if (!ret)
- goto got_super;
- }
-
- goto err;
-
-got_super:
- if (le16_to_cpu(sb->sb->block_size) << 9 <
- bdev_logical_block_size(sb->bdev) &&
- opt_get(*opts, direct_io)) {
-#ifndef __KERNEL__
- opt_set(*opts, direct_io, false);
- bch2_free_super(sb);
- goto retry;
-#endif
- prt_printf(&err, "block size (%u) smaller than device block size (%u)",
- le16_to_cpu(sb->sb->block_size) << 9,
- bdev_logical_block_size(sb->bdev));
- ret = -BCH_ERR_block_size_too_small;
- goto err;
- }
-
- sb->have_layout = true;
-
- ret = bch2_sb_validate(sb->sb, offset, 0, &err);
- if (ret) {
- bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
- path, err.buf);
- goto err_no_print;
- }
-out:
- printbuf_exit(&err);
- return ret;
-err:
- bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error reading superblock: %s\n",
- path, err.buf);
-err_no_print:
- bch2_free_super(sb);
- goto out;
-}
-
-int bch2_read_super(const char *path, struct bch_opts *opts,
- struct bch_sb_handle *sb)
-{
- return __bch2_read_super(path, opts, sb, false);
-}
-
-/* provide a silenced version for mount.bcachefs */
-
-int bch2_read_super_silent(const char *path, struct bch_opts *opts,
- struct bch_sb_handle *sb)
-{
- return __bch2_read_super(path, opts, sb, true);
-}
-
-/* write superblock: */
-
-static void write_super_endio(struct bio *bio)
-{
- struct bch_dev *ca = bio->bi_private;
-
- bch2_account_io_success_fail(ca, bio_data_dir(bio), !bio->bi_status);
-
- /* XXX: return errors directly */
-
- if (bio->bi_status) {
- bch_err_dev_ratelimited(ca, "superblock %s error: %s",
- str_write_read(bio_data_dir(bio)),
- bch2_blk_status_to_str(bio->bi_status));
- ca->sb_write_error = 1;
- }
-
- closure_put(&ca->fs->sb_write);
- enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
-}
-
-static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
-{
- struct bch_sb *sb = ca->disk_sb.sb;
- struct bio *bio = ca->disk_sb.bio;
-
- memset(ca->sb_read_scratch, 0, BCH_SB_READ_SCRATCH_BUF_SIZE);
-
- bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
- bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]);
- bio->bi_end_io = write_super_endio;
- bio->bi_private = ca;
- bch2_bio_map(bio, ca->sb_read_scratch, BCH_SB_READ_SCRATCH_BUF_SIZE);
-
- this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio));
-
- enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
- closure_bio_submit(bio, &c->sb_write);
-}
-
-static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
-{
- struct bch_sb *sb = ca->disk_sb.sb;
- struct bio *bio = ca->disk_sb.bio;
-
- sb->offset = sb->layout.sb_offset[idx];
-
- SET_BCH_SB_CSUM_TYPE(sb, bch2_csum_opt_to_type(c->opts.metadata_checksum, false));
- sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
- null_nonce(), sb);
-
- bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
- bio->bi_iter.bi_sector = le64_to_cpu(sb->offset);
- bio->bi_end_io = write_super_endio;
- bio->bi_private = ca;
- bch2_bio_map(bio, sb,
- roundup((size_t) vstruct_bytes(sb),
- bdev_logical_block_size(ca->disk_sb.bdev)));
-
- this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb],
- bio_sectors(bio));
-
- enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
- closure_bio_submit(bio, &c->sb_write);
-}
-
-int bch2_write_super(struct bch_fs *c)
-{
- struct closure *cl = &c->sb_write;
- struct printbuf err = PRINTBUF;
- unsigned sb = 0, nr_wrote;
- struct bch_devs_mask sb_written;
- bool wrote, can_mount_without_written, can_mount_with_written;
- unsigned degraded_flags = BCH_FORCE_IF_DEGRADED;
- DARRAY(struct bch_dev *) online_devices = {};
- int ret = 0;
-
- trace_and_count(c, write_super, c, _RET_IP_);
-
- if (c->opts.degraded == BCH_DEGRADED_very)
- degraded_flags |= BCH_FORCE_IF_LOST;
-
- lockdep_assert_held(&c->sb_lock);
-
- closure_init_stack(cl);
- memset(&sb_written, 0, sizeof(sb_written));
-
- /*
- * Note: we do writes to RO devices here, and we might want to change
- * that in the future.
- *
- * For now, we expect to be able to call write_super() when we're not
- * yet RW:
- */
- for_each_online_member(c, ca, BCH_DEV_READ_REF_write_super) {
- ret = darray_push(&online_devices, ca);
- if (bch2_fs_fatal_err_on(ret, c, "%s: error allocating online devices", __func__)) {
- enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
- goto out;
- }
- enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
- }
-
- /* Make sure we're using the new magic numbers: */
- c->disk_sb.sb->magic = BCHFS_MAGIC;
- c->disk_sb.sb->layout.magic = BCHFS_MAGIC;
-
- le64_add_cpu(&c->disk_sb.sb->seq, 1);
-
- struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
- darray_for_each(online_devices, ca)
- __bch2_members_v2_get_mut(mi, (*ca)->dev_idx)->seq = c->disk_sb.sb->seq;
- c->disk_sb.sb->write_time = cpu_to_le64(ktime_get_real_seconds());
-
- if (test_bit(BCH_FS_error, &c->flags))
- SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1);
- if (test_bit(BCH_FS_topology_error, &c->flags))
- SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1);
-
- SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN);
-
- bch2_sb_counters_from_cpu(c);
- bch2_sb_members_from_cpu(c);
- bch2_sb_members_cpy_v2_v1(&c->disk_sb);
- bch2_sb_errors_from_cpu(c);
- bch2_sb_downgrade_update(c);
-
- darray_for_each(online_devices, ca)
- bch2_sb_from_fs(c, (*ca));
-
- darray_for_each(online_devices, ca) {
- printbuf_reset(&err);
-
- ret = bch2_sb_validate((*ca)->disk_sb.sb, 0, BCH_VALIDATE_write, &err);
- if (ret) {
- bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
- goto out;
- }
- }
-
- if (c->opts.nochanges)
- goto out;
-
- /*
- * Defer writing the superblock until filesystem initialization is
- * complete - don't write out a partly initialized superblock:
- */
- if (!BCH_SB_INITIALIZED(c->disk_sb.sb))
- goto out;
-
- if (le16_to_cpu(c->disk_sb.sb->version) > bcachefs_metadata_version_current) {
- struct printbuf buf = PRINTBUF;
- prt_printf(&buf, "attempting to write superblock that wasn't version downgraded (");
- bch2_version_to_text(&buf, le16_to_cpu(c->disk_sb.sb->version));
- prt_str(&buf, " > ");
- bch2_version_to_text(&buf, bcachefs_metadata_version_current);
- prt_str(&buf, ")");
- bch2_fs_fatal_error(c, ": %s", buf.buf);
- printbuf_exit(&buf);
- ret = bch_err_throw(c, sb_not_downgraded);
- goto out;
- }
-
- darray_for_each(online_devices, ca) {
- __set_bit((*ca)->dev_idx, sb_written.d);
- (*ca)->sb_write_error = 0;
- }
-
- darray_for_each(online_devices, ca)
- read_back_super(c, *ca);
- closure_sync(cl);
-
- darray_for_each(online_devices, cap) {
- struct bch_dev *ca = *cap;
-
- if (ca->sb_write_error)
- continue;
-
- if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) {
- struct printbuf buf = PRINTBUF;
- prt_char(&buf, ' ');
- prt_bdevname(&buf, ca->disk_sb.bdev);
- prt_printf(&buf,
- ": Superblock write was silently dropped! (seq %llu expected %llu)",
- le64_to_cpu(ca->sb_read_scratch->seq),
- ca->disk_sb.seq);
-
- if (c->opts.errors != BCH_ON_ERROR_continue &&
- c->opts.errors != BCH_ON_ERROR_fix_safe) {
- ret = bch_err_throw(c, erofs_sb_err);
- bch2_fs_fatal_error(c, "%s", buf.buf);
- } else {
- bch_err(c, "%s", buf.buf);
- }
-
- printbuf_exit(&buf);
- }
-
- if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) {
- struct printbuf buf = PRINTBUF;
- prt_char(&buf, ' ');
- prt_bdevname(&buf, ca->disk_sb.bdev);
- prt_printf(&buf,
- ": Superblock modified by another process (seq %llu expected %llu)",
- le64_to_cpu(ca->sb_read_scratch->seq),
- ca->disk_sb.seq);
- bch2_fs_fatal_error(c, "%s", buf.buf);
- printbuf_exit(&buf);
- ret = bch_err_throw(c, erofs_sb_err);
- }
- }
-
- if (ret)
- goto out;
-
- do {
- wrote = false;
- darray_for_each(online_devices, cap) {
- struct bch_dev *ca = *cap;
- if (!ca->sb_write_error &&
- sb < ca->disk_sb.sb->layout.nr_superblocks) {
- write_one_super(c, ca, sb);
- wrote = true;
- }
- }
- closure_sync(cl);
- sb++;
- } while (wrote);
-
- darray_for_each(online_devices, cap) {
- struct bch_dev *ca = *cap;
- if (ca->sb_write_error)
- __clear_bit(ca->dev_idx, sb_written.d);
- else
- ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq);
- }
-
- nr_wrote = dev_mask_nr(&sb_written);
-
- can_mount_with_written =
- bch2_have_enough_devs(c, sb_written, degraded_flags, false);
-
- for (unsigned i = 0; i < ARRAY_SIZE(sb_written.d); i++)
- sb_written.d[i] = ~sb_written.d[i];
-
- can_mount_without_written =
- bch2_have_enough_devs(c, sb_written, degraded_flags, false);
-
- /*
- * If we would be able to mount _without_ the devices we successfully
- * wrote superblocks to, we weren't able to write to enough devices:
- *
- * Exception: if we can mount without the successes because we haven't
- * written anything (new filesystem), we continue if we'd be able to
- * mount with the devices we did successfully write to:
- */
- if (bch2_fs_fatal_err_on(!nr_wrote ||
- !can_mount_with_written ||
- (can_mount_without_written &&
- !can_mount_with_written), c,
- ": Unable to write superblock to sufficient devices (from %ps)",
- (void *) _RET_IP_))
- ret = bch_err_throw(c, erofs_sb_err);
-out:
- /* Make new options visible after they're persistent: */
- bch2_sb_update(c);
- darray_for_each(online_devices, ca)
- enumerated_ref_put(&(*ca)->io_ref[READ], BCH_DEV_READ_REF_write_super);
- darray_exit(&online_devices);
- printbuf_exit(&err);
- return ret;
-}
-
-void __bch2_check_set_feature(struct bch_fs *c, unsigned feat)
-{
- mutex_lock(&c->sb_lock);
- if (!(c->sb.features & (1ULL << feat))) {
- c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat);
-
- bch2_write_super(c);
- }
- mutex_unlock(&c->sb_lock);
-}
-
-/* Downgrade if superblock is at a higher version than currently supported: */
-bool bch2_check_version_downgrade(struct bch_fs *c)
-{
- bool ret = bcachefs_metadata_version_current < c->sb.version;
-
- lockdep_assert_held(&c->sb_lock);
-
- /*
- * Downgrade, if superblock is at a higher version than currently
- * supported:
- *
- * c->sb will be checked before we write the superblock, so update it as
- * well:
- */
- if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current)
- SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
- if (BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb) > bcachefs_metadata_version_current)
- SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, bcachefs_metadata_version_current);
- if (c->sb.version > bcachefs_metadata_version_current)
- c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
- if (c->sb.version_min > bcachefs_metadata_version_current)
- c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current);
- c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1);
- return ret;
-}
-
-void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat)
-{
- lockdep_assert_held(&c->sb_lock);
-
- if (BCH_VERSION_MAJOR(new_version) >
- BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version)))
- bch2_sb_field_resize(&c->disk_sb, downgrade, 0);
-
- c->disk_sb.sb->version = cpu_to_le16(new_version);
-
- if (incompat) {
- c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
- SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb,
- max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version));
- }
-}
-
-void bch2_sb_upgrade_incompat(struct bch_fs *c)
-{
- mutex_lock(&c->sb_lock);
- if (c->sb.version == c->sb.version_incompat_allowed)
- goto unlock;
-
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "Now allowing incompatible features up to ");
- bch2_version_to_text(&buf, c->sb.version);
- prt_str(&buf, ", previously allowed up to ");
- bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
- prt_newline(&buf);
-
- bch_notice(c, "%s", buf.buf);
- printbuf_exit(&buf);
-
- c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
- SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb,
- max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), c->sb.version));
- bch2_write_super(c);
-unlock:
- mutex_unlock(&c->sb_lock);
-}
-
-static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- if (vstruct_bytes(f) < 88) {
- prt_printf(err, "field too small (%zu < %u)", vstruct_bytes(f), 88);
- return -BCH_ERR_invalid_sb_ext;
- }
-
- return 0;
-}
-
-static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_ext *e = field_to_type(f, ext);
-
- prt_printf(out, "Recovery passes required:\t");
- prt_bitflags(out, bch2_recovery_passes,
- bch2_recovery_passes_from_stable(le64_to_cpu(e->recovery_passes_required[0])));
- prt_newline(out);
-
- unsigned long *errors_silent = kmalloc(sizeof(e->errors_silent), GFP_KERNEL);
- if (errors_silent) {
- le_bitvector_to_cpu(errors_silent, (void *) e->errors_silent, sizeof(e->errors_silent) * 8);
-
- prt_printf(out, "Errors to silently fix:\t");
- prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent,
- min(BCH_FSCK_ERR_MAX, sizeof(e->errors_silent) * 8));
- prt_newline(out);
-
- kfree(errors_silent);
- }
-
- prt_printf(out, "Btrees with missing data:\t");
- prt_bitflags(out, __bch2_btree_ids, le64_to_cpu(e->btrees_lost_data));
- prt_newline(out);
-}
-
-static const struct bch_sb_field_ops bch_sb_field_ops_ext = {
- .validate = bch2_sb_ext_validate,
- .to_text = bch2_sb_ext_to_text,
-};
-
-static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
-#define x(f, nr) \
- [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f,
- BCH_SB_FIELDS()
-#undef x
-};
-
-static const struct bch_sb_field_ops bch2_sb_field_null_ops;
-
-static const struct bch_sb_field_ops *bch2_sb_field_type_ops(unsigned type)
-{
- return likely(type < ARRAY_SIZE(bch2_sb_field_ops))
- ? bch2_sb_field_ops[type]
- : &bch2_sb_field_null_ops;
-}
-
-static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- unsigned type = le32_to_cpu(f->type);
- struct printbuf field_err = PRINTBUF;
- const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type);
- int ret;
-
- ret = ops->validate ? ops->validate(sb, f, flags, &field_err) : 0;
- if (ret) {
- prt_printf(err, "Invalid superblock section %s: %s",
- bch2_sb_fields[type], field_err.buf);
- prt_newline(err);
- bch2_sb_field_to_text(err, sb, f);
- }
-
- printbuf_exit(&field_err);
- return ret;
-}
-
-void __bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- unsigned type = le32_to_cpu(f->type);
- const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type);
-
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 32);
-
- if (ops->to_text)
- ops->to_text(out, sb, f);
-}
-
-void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- unsigned type = le32_to_cpu(f->type);
-
- if (type < BCH_SB_FIELD_NR)
- prt_printf(out, "%s", bch2_sb_fields[type]);
- else
- prt_printf(out, "(unknown field %u)", type);
-
- prt_printf(out, " (size %zu):", vstruct_bytes(f));
- prt_newline(out);
-
- __bch2_sb_field_to_text(out, sb, f);
-}
-
-void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l)
-{
- unsigned i;
-
- prt_printf(out, "Type: %u", l->layout_type);
- prt_newline(out);
-
- prt_str(out, "Superblock max size: ");
- prt_units_u64(out, 512 << l->sb_max_size_bits);
- prt_newline(out);
-
- prt_printf(out, "Nr superblocks: %u", l->nr_superblocks);
- prt_newline(out);
-
- prt_str(out, "Offsets: ");
- for (i = 0; i < l->nr_superblocks; i++) {
- if (i)
- prt_str(out, ", ");
- prt_printf(out, "%llu", le64_to_cpu(l->sb_offset[i]));
- }
- prt_newline(out);
-}
-
-void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
- bool print_layout, unsigned fields)
-{
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 44);
-
- prt_printf(out, "External UUID:\t");
- pr_uuid(out, sb->user_uuid.b);
- prt_newline(out);
-
- prt_printf(out, "Internal UUID:\t");
- pr_uuid(out, sb->uuid.b);
- prt_newline(out);
-
- prt_printf(out, "Magic number:\t");
- pr_uuid(out, sb->magic.b);
- prt_newline(out);
-
- prt_printf(out, "Device index:\t%u\n", sb->dev_idx);
-
- prt_printf(out, "Label:\t");
- if (!strlen(sb->label))
- prt_printf(out, "(none)");
- else
- prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label);
- prt_newline(out);
-
- prt_printf(out, "Version:\t");
- bch2_version_to_text(out, le16_to_cpu(sb->version));
- prt_newline(out);
-
- prt_printf(out, "Incompatible features allowed:\t");
- bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb));
- prt_newline(out);
-
- prt_printf(out, "Incompatible features in use:\t");
- bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb));
- prt_newline(out);
-
- prt_printf(out, "Version upgrade complete:\t");
- bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb));
- prt_newline(out);
-
- prt_printf(out, "Oldest version on disk:\t");
- bch2_version_to_text(out, le16_to_cpu(sb->version_min));
- prt_newline(out);
-
- prt_printf(out, "Created:\t");
- if (sb->time_base_lo)
- bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
- else
- prt_printf(out, "(not set)");
- prt_newline(out);
-
- prt_printf(out, "Sequence number:\t");
- prt_printf(out, "%llu", le64_to_cpu(sb->seq));
- prt_newline(out);
-
- prt_printf(out, "Time of last write:\t");
- bch2_prt_datetime(out, le64_to_cpu(sb->write_time));
- prt_newline(out);
-
- prt_printf(out, "Superblock size:\t");
- prt_units_u64(out, vstruct_bytes(sb));
- prt_str(out, "/");
- prt_units_u64(out, 512ULL << sb->layout.sb_max_size_bits);
- prt_newline(out);
-
- prt_printf(out, "Clean:\t%llu\n", BCH_SB_CLEAN(sb));
- prt_printf(out, "Devices:\t%u\n", bch2_sb_nr_devices(sb));
-
- prt_printf(out, "Sections:\t");
- u64 fields_have = 0;
- vstruct_for_each(sb, f)
- fields_have |= 1 << le32_to_cpu(f->type);
- prt_bitflags(out, bch2_sb_fields, fields_have);
- prt_newline(out);
-
- prt_printf(out, "Features:\t");
- prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0]));
- prt_newline(out);
-
- prt_printf(out, "Compat features:\t");
- prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0]));
- prt_newline(out);
-
- prt_newline(out);
- prt_printf(out, "Options:");
- prt_newline(out);
- printbuf_indent_add(out, 2);
- {
- enum bch_opt_id id;
-
- for (id = 0; id < bch2_opts_nr; id++) {
- const struct bch_option *opt = bch2_opt_table + id;
-
- if (opt->get_sb) {
- u64 v = bch2_opt_from_sb(sb, id, -1);
-
- prt_printf(out, "%s:\t", opt->attr.name);
- bch2_opt_to_text(out, NULL, sb, opt, v,
- OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST);
- prt_newline(out);
- }
- }
- }
-
- printbuf_indent_sub(out, 2);
-
- if (print_layout) {
- prt_newline(out);
- prt_printf(out, "layout:");
- prt_newline(out);
- printbuf_indent_add(out, 2);
- bch2_sb_layout_to_text(out, &sb->layout);
- printbuf_indent_sub(out, 2);
- }
-
- vstruct_for_each(sb, f)
- if (fields & (1 << le32_to_cpu(f->type))) {
- prt_newline(out);
- bch2_sb_field_to_text(out, sb, f);
- }
-}
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
deleted file mode 100644
index a3b7a90f2533..000000000000
--- a/fs/bcachefs/super-io.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SUPER_IO_H
-#define _BCACHEFS_SUPER_IO_H
-
-#include "extents.h"
-#include "eytzinger.h"
-#include "super_types.h"
-#include "super.h"
-#include "sb-members.h"
-
-#include <asm/byteorder.h>
-
-#define BCH_SB_READ_SCRATCH_BUF_SIZE 4096
-
-static inline bool bch2_version_compatible(u16 version)
-{
- return BCH_VERSION_MAJOR(version) <= BCH_VERSION_MAJOR(bcachefs_metadata_version_current) &&
- version >= bcachefs_metadata_version_min;
-}
-
-void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version);
-enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version);
-
-int bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version);
-
-static inline int bch2_request_incompat_feature(struct bch_fs *c,
- enum bcachefs_metadata_version version)
-{
- return likely(version <= c->sb.version_incompat)
- ? 0
- : bch2_set_version_incompat(c, version);
-}
-
-static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f)
-{
- return le32_to_cpu(f->u64s) * sizeof(u64);
-}
-
-#define field_to_type(_f, _name) \
- container_of_or_null(_f, struct bch_sb_field_##_name, field)
-
-struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *, enum bch_sb_field_type);
-#define bch2_sb_field_get(_sb, _name) \
- field_to_type(bch2_sb_field_get_id(_sb, BCH_SB_FIELD_##_name), _name)
-
-struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *,
- enum bch_sb_field_type, unsigned);
-#define bch2_sb_field_resize(_sb, _name, _u64s) \
- field_to_type(bch2_sb_field_resize_id(_sb, BCH_SB_FIELD_##_name, _u64s), _name)
-
-struct bch_sb_field *bch2_sb_field_get_minsize_id(struct bch_sb_handle *,
- enum bch_sb_field_type, unsigned);
-#define bch2_sb_field_get_minsize(_sb, _name, _u64s) \
- field_to_type(bch2_sb_field_get_minsize_id(_sb, BCH_SB_FIELD_##_name, _u64s), _name)
-
-#define bch2_sb_field_nr_entries(_f) \
- (_f ? ((bch2_sb_field_bytes(&_f->field) - sizeof(*_f)) / \
- sizeof(_f->entries[0])) \
- : 0)
-
-void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type);
-
-extern const char * const bch2_sb_fields[];
-
-struct bch_sb_field_ops {
- int (*validate)(struct bch_sb *, struct bch_sb_field *,
- enum bch_validate_flags, struct printbuf *);
- void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *);
-};
-
-static inline __le64 bch2_sb_magic(struct bch_fs *c)
-{
- __le64 ret;
-
- memcpy(&ret, &c->sb.uuid, sizeof(ret));
- return ret;
-}
-
-static inline __u64 jset_magic(struct bch_fs *c)
-{
- return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC);
-}
-
-static inline __u64 bset_magic(struct bch_fs *c)
-{
- return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC);
-}
-
-int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *);
-int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
-
-void bch2_free_super(struct bch_sb_handle *);
-int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
-
-int bch2_sb_validate(struct bch_sb *, u64, enum bch_validate_flags, struct printbuf *);
-
-int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
-int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *);
-int bch2_write_super(struct bch_fs *);
-void __bch2_check_set_feature(struct bch_fs *, unsigned);
-
-static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat)
-{
- if (!(c->sb.features & (1ULL << feat)))
- __bch2_check_set_feature(c, feat);
-}
-
-bool bch2_check_version_downgrade(struct bch_fs *);
-void bch2_sb_upgrade(struct bch_fs *, unsigned, bool);
-void bch2_sb_upgrade_incompat(struct bch_fs *);
-
-void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
- struct bch_sb_field *);
-void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
- struct bch_sb_field *);
-void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *);
-void bch2_sb_to_text(struct printbuf *, struct bch_sb *, bool, unsigned);
-
-#endif /* _BCACHEFS_SUPER_IO_H */
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
deleted file mode 100644
index c46b1053a02c..000000000000
--- a/fs/bcachefs/super.c
+++ /dev/null
@@ -1,2547 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * bcachefs setup/teardown code, and some metadata io - read a superblock and
- * figure out what to do with it.
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "async_objs.h"
-#include "backpointers.h"
-#include "bkey_sort.h"
-#include "btree_cache.h"
-#include "btree_gc.h"
-#include "btree_journal_iter.h"
-#include "btree_key_cache.h"
-#include "btree_node_scan.h"
-#include "btree_update_interior.h"
-#include "btree_io.h"
-#include "btree_write_buffer.h"
-#include "buckets_waiting_for_journal.h"
-#include "chardev.h"
-#include "checksum.h"
-#include "clock.h"
-#include "compress.h"
-#include "debug.h"
-#include "disk_accounting.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "enumerated_ref.h"
-#include "errcode.h"
-#include "error.h"
-#include "fs.h"
-#include "fs-io.h"
-#include "fs-io-buffered.h"
-#include "fs-io-direct.h"
-#include "fsck.h"
-#include "inode.h"
-#include "io_read.h"
-#include "io_write.h"
-#include "journal.h"
-#include "journal_reclaim.h"
-#include "journal_seq_blacklist.h"
-#include "move.h"
-#include "migrate.h"
-#include "movinggc.h"
-#include "nocow_locking.h"
-#include "quota.h"
-#include "rebalance.h"
-#include "recovery.h"
-#include "recovery_passes.h"
-#include "replicas.h"
-#include "sb-clean.h"
-#include "sb-counters.h"
-#include "sb-errors.h"
-#include "sb-members.h"
-#include "snapshot.h"
-#include "subvolume.h"
-#include "super.h"
-#include "super-io.h"
-#include "sysfs.h"
-#include "thread_with_file.h"
-#include "trace.h"
-
-#include <linux/backing-dev.h>
-#include <linux/blkdev.h>
-#include <linux/debugfs.h>
-#include <linux/device.h>
-#include <linux/idr.h>
-#include <linux/module.h>
-#include <linux/percpu.h>
-#include <linux/random.h>
-#include <linux/sysfs.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
-MODULE_DESCRIPTION("bcachefs filesystem");
-
-typedef DARRAY(struct bch_sb_handle) bch_sb_handles;
-
-#define x(n) #n,
-const char * const bch2_fs_flag_strs[] = {
- BCH_FS_FLAGS()
- NULL
-};
-
-const char * const bch2_write_refs[] = {
- BCH_WRITE_REFS()
- NULL
-};
-
-const char * const bch2_dev_read_refs[] = {
- BCH_DEV_READ_REFS()
- NULL
-};
-
-const char * const bch2_dev_write_refs[] = {
- BCH_DEV_WRITE_REFS()
- NULL
-};
-#undef x
-
-static void __bch2_print_str(struct bch_fs *c, const char *prefix,
- const char *str)
-{
-#ifdef __KERNEL__
- struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
-
- if (unlikely(stdio)) {
- bch2_stdio_redirect_printf(stdio, true, "%s", str);
- return;
- }
-#endif
- bch2_print_string_as_lines(KERN_ERR, str);
-}
-
-void bch2_print_str(struct bch_fs *c, const char *prefix, const char *str)
-{
- __bch2_print_str(c, prefix, str);
-}
-
-__printf(2, 0)
-static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args)
-{
-#ifdef __KERNEL__
- if (unlikely(stdio)) {
- if (fmt[0] == KERN_SOH[0])
- fmt += 2;
-
- bch2_stdio_redirect_vprintf(stdio, true, fmt, args);
- return;
- }
-#endif
- vprintk(fmt, args);
-}
-
-void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...)
-{
- struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio;
-
- va_list args;
- va_start(args, fmt);
- bch2_print_maybe_redirect(stdio, fmt, args);
- va_end(args);
-}
-
-void __bch2_print(struct bch_fs *c, const char *fmt, ...)
-{
- struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
-
- va_list args;
- va_start(args, fmt);
- bch2_print_maybe_redirect(stdio, fmt, args);
- va_end(args);
-}
-
-#define KTYPE(type) \
-static const struct attribute_group type ## _group = { \
- .attrs = type ## _files \
-}; \
- \
-static const struct attribute_group *type ## _groups[] = { \
- &type ## _group, \
- NULL \
-}; \
- \
-static const struct kobj_type type ## _ktype = { \
- .release = type ## _release, \
- .sysfs_ops = &type ## _sysfs_ops, \
- .default_groups = type ## _groups \
-}
-
-static void bch2_fs_release(struct kobject *);
-static void bch2_dev_release(struct kobject *);
-static void bch2_fs_counters_release(struct kobject *k)
-{
-}
-
-static void bch2_fs_internal_release(struct kobject *k)
-{
-}
-
-static void bch2_fs_opts_dir_release(struct kobject *k)
-{
-}
-
-static void bch2_fs_time_stats_release(struct kobject *k)
-{
-}
-
-KTYPE(bch2_fs);
-KTYPE(bch2_fs_counters);
-KTYPE(bch2_fs_internal);
-KTYPE(bch2_fs_opts_dir);
-KTYPE(bch2_fs_time_stats);
-KTYPE(bch2_dev);
-
-static struct kset *bcachefs_kset;
-static LIST_HEAD(bch_fs_list);
-static DEFINE_MUTEX(bch_fs_list_lock);
-
-DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait);
-
-static void bch2_dev_unlink(struct bch_dev *);
-static void bch2_dev_free(struct bch_dev *);
-static int bch2_dev_alloc(struct bch_fs *, unsigned);
-static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
-static void bch2_dev_io_ref_stop(struct bch_dev *, int);
-static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
-
-struct bch_fs *bch2_dev_to_fs(dev_t dev)
-{
- guard(mutex)(&bch_fs_list_lock);
- guard(rcu)();
-
- struct bch_fs *c;
- list_for_each_entry(c, &bch_fs_list, list)
- for_each_member_device_rcu(c, ca, NULL)
- if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) {
- closure_get(&c->cl);
- return c;
- }
- return NULL;
-}
-
-static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid)
-{
- struct bch_fs *c;
-
- lockdep_assert_held(&bch_fs_list_lock);
-
- list_for_each_entry(c, &bch_fs_list, list)
- if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid)))
- return c;
-
- return NULL;
-}
-
-struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
-{
- struct bch_fs *c;
-
- mutex_lock(&bch_fs_list_lock);
- c = __bch2_uuid_to_fs(uuid);
- if (c)
- closure_get(&c->cl);
- mutex_unlock(&bch_fs_list_lock);
-
- return c;
-}
-
-/* Filesystem RO/RW: */
-
-/*
- * For startup/shutdown of RW stuff, the dependencies are:
- *
- * - foreground writes depend on copygc and rebalance (to free up space)
- *
- * - copygc and rebalance depend on mark and sweep gc (they actually probably
- * don't because they either reserve ahead of time or don't block if
- * allocations fail, but allocations can require mark and sweep gc to run
- * because of generation number wraparound)
- *
- * - all of the above depends on the allocator threads
- *
- * - allocator depends on the journal (when it rewrites prios and gens)
- */
-
-static void __bch2_fs_read_only(struct bch_fs *c)
-{
- unsigned clean_passes = 0;
- u64 seq = 0;
-
- bch2_fs_ec_stop(c);
- bch2_open_buckets_stop(c, NULL, true);
- bch2_rebalance_stop(c);
- bch2_copygc_stop(c);
- bch2_fs_ec_flush(c);
-
- bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu",
- journal_cur_seq(&c->journal));
-
- do {
- clean_passes++;
-
- if (bch2_btree_interior_updates_flush(c) ||
- bch2_btree_write_buffer_flush_going_ro(c) ||
- bch2_journal_flush_all_pins(&c->journal) ||
- bch2_btree_flush_all_writes(c) ||
- seq != atomic64_read(&c->journal.seq)) {
- seq = atomic64_read(&c->journal.seq);
- clean_passes = 0;
- }
- } while (clean_passes < 2);
-
- bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu",
- journal_cur_seq(&c->journal));
-
- if (test_bit(JOURNAL_replay_done, &c->journal.flags) &&
- !test_bit(BCH_FS_emergency_ro, &c->flags))
- set_bit(BCH_FS_clean_shutdown, &c->flags);
-
- bch2_fs_journal_stop(&c->journal);
-
- bch_info(c, "%sclean shutdown complete, journal seq %llu",
- test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un",
- c->journal.seq_ondisk);
-
- /*
- * After stopping journal:
- */
- for_each_member_device(c, ca) {
- bch2_dev_io_ref_stop(ca, WRITE);
- bch2_dev_allocator_remove(c, ca);
- }
-}
-
-static void bch2_writes_disabled(struct enumerated_ref *writes)
-{
- struct bch_fs *c = container_of(writes, struct bch_fs, writes);
-
- set_bit(BCH_FS_write_disable_complete, &c->flags);
- wake_up(&bch2_read_only_wait);
-}
-
-void bch2_fs_read_only(struct bch_fs *c)
-{
- if (!test_bit(BCH_FS_rw, &c->flags)) {
- bch2_journal_reclaim_stop(&c->journal);
- return;
- }
-
- BUG_ON(test_bit(BCH_FS_write_disable_complete, &c->flags));
-
- bch_verbose(c, "going read-only");
-
- /*
- * Block new foreground-end write operations from starting - any new
- * writes will return -EROFS:
- */
- set_bit(BCH_FS_going_ro, &c->flags);
- enumerated_ref_stop_async(&c->writes);
-
- /*
- * If we're not doing an emergency shutdown, we want to wait on
- * outstanding writes to complete so they don't see spurious errors due
- * to shutting down the allocator:
- *
- * If we are doing an emergency shutdown outstanding writes may
- * hang until we shutdown the allocator so we don't want to wait
- * on outstanding writes before shutting everything down - but
- * we do need to wait on them before returning and signalling
- * that going RO is complete:
- */
- wait_event(bch2_read_only_wait,
- test_bit(BCH_FS_write_disable_complete, &c->flags) ||
- test_bit(BCH_FS_emergency_ro, &c->flags));
-
- bool writes_disabled = test_bit(BCH_FS_write_disable_complete, &c->flags);
- if (writes_disabled)
- bch_verbose(c, "finished waiting for writes to stop");
-
- __bch2_fs_read_only(c);
-
- wait_event(bch2_read_only_wait,
- test_bit(BCH_FS_write_disable_complete, &c->flags));
-
- if (!writes_disabled)
- bch_verbose(c, "finished waiting for writes to stop");
-
- clear_bit(BCH_FS_write_disable_complete, &c->flags);
- clear_bit(BCH_FS_going_ro, &c->flags);
- clear_bit(BCH_FS_rw, &c->flags);
-
- if (!bch2_journal_error(&c->journal) &&
- !test_bit(BCH_FS_error, &c->flags) &&
- !test_bit(BCH_FS_emergency_ro, &c->flags) &&
- test_bit(BCH_FS_started, &c->flags) &&
- test_bit(BCH_FS_clean_shutdown, &c->flags) &&
- c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) {
- BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
- BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty));
- BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
- BUG_ON(c->btree_write_buffer.inc.keys.nr);
- BUG_ON(c->btree_write_buffer.flushing.keys.nr);
- bch2_verify_accounting_clean(c);
-
- bch_verbose(c, "marking filesystem clean");
- bch2_fs_mark_clean(c);
- } else {
- /* Make sure error counts/counters are persisted */
- mutex_lock(&c->sb_lock);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- bch_verbose(c, "done going read-only, filesystem not clean");
- }
-}
-
-static void bch2_fs_read_only_work(struct work_struct *work)
-{
- struct bch_fs *c =
- container_of(work, struct bch_fs, read_only_work);
-
- down_write(&c->state_lock);
- bch2_fs_read_only(c);
- up_write(&c->state_lock);
-}
-
-static void bch2_fs_read_only_async(struct bch_fs *c)
-{
- queue_work(system_long_wq, &c->read_only_work);
-}
-
-bool bch2_fs_emergency_read_only(struct bch_fs *c)
-{
- bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);
-
- bch2_journal_halt(&c->journal);
- bch2_fs_read_only_async(c);
-
- wake_up(&bch2_read_only_wait);
- return ret;
-}
-
-static bool __bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out,
- bool locked)
-{
- bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);
-
- if (!locked)
- bch2_journal_halt(&c->journal);
- else
- bch2_journal_halt_locked(&c->journal);
- bch2_fs_read_only_async(c);
- wake_up(&bch2_read_only_wait);
-
- if (ret)
- prt_printf(out, "emergency read only at seq %llu\n",
- journal_cur_seq(&c->journal));
-
- return ret;
-}
-
-bool bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out)
-{
- return __bch2_fs_emergency_read_only2(c, out, false);
-}
-
-bool bch2_fs_emergency_read_only_locked(struct bch_fs *c)
-{
- bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);
-
- bch2_journal_halt_locked(&c->journal);
- bch2_fs_read_only_async(c);
-
- wake_up(&bch2_read_only_wait);
- return ret;
-}
-
-static int __bch2_fs_read_write(struct bch_fs *c, bool early)
-{
- int ret;
-
- BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));
-
- if (WARN_ON(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))
- return bch_err_throw(c, erofs_no_alloc_info);
-
- if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) {
- bch_err(c, "cannot go rw, unfixed btree errors");
- return bch_err_throw(c, erofs_unfixed_errors);
- }
-
- if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
- bch_err(c, "cannot go rw, filesystem is an unresized image file");
- return bch_err_throw(c, erofs_filesystem_full);
- }
-
- if (test_bit(BCH_FS_rw, &c->flags))
- return 0;
-
- bch_info(c, "going read-write");
-
- ret = bch2_fs_init_rw(c);
- if (ret)
- goto err;
-
- ret = bch2_sb_members_v2_init(c);
- if (ret)
- goto err;
-
- clear_bit(BCH_FS_clean_shutdown, &c->flags);
-
- scoped_guard(rcu)
- for_each_online_member_rcu(c, ca)
- if (ca->mi.state == BCH_MEMBER_STATE_rw) {
- bch2_dev_allocator_add(c, ca);
- enumerated_ref_start(&ca->io_ref[WRITE]);
- }
-
- bch2_recalc_capacity(c);
-
- /*
- * First journal write must be a flush write: after a clean shutdown we
- * don't read the journal, so the first journal write may end up
- * overwriting whatever was there previously, and there must always be
- * at least one non-flush write in the journal or recovery will fail:
- */
- spin_lock(&c->journal.lock);
- set_bit(JOURNAL_need_flush_write, &c->journal.flags);
- set_bit(JOURNAL_running, &c->journal.flags);
- bch2_journal_space_available(&c->journal);
- spin_unlock(&c->journal.lock);
-
- ret = bch2_fs_mark_dirty(c);
- if (ret)
- goto err;
-
- ret = bch2_journal_reclaim_start(&c->journal);
- if (ret)
- goto err;
-
- set_bit(BCH_FS_rw, &c->flags);
- set_bit(BCH_FS_was_rw, &c->flags);
-
- enumerated_ref_start(&c->writes);
-
- ret = bch2_copygc_start(c);
- if (ret) {
- bch_err_msg(c, ret, "error starting copygc thread");
- goto err;
- }
-
- ret = bch2_rebalance_start(c);
- if (ret) {
- bch_err_msg(c, ret, "error starting rebalance thread");
- goto err;
- }
-
- bch2_do_discards(c);
- bch2_do_invalidates(c);
- bch2_do_stripe_deletes(c);
- bch2_do_pending_node_rewrites(c);
- return 0;
-err:
- if (test_bit(BCH_FS_rw, &c->flags))
- bch2_fs_read_only(c);
- else
- __bch2_fs_read_only(c);
- return ret;
-}
-
-int bch2_fs_read_write(struct bch_fs *c)
-{
- if (c->opts.recovery_pass_last &&
- c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay)
- return bch_err_throw(c, erofs_norecovery);
-
- if (c->opts.nochanges)
- return bch_err_throw(c, erofs_nochanges);
-
- if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))
- return bch_err_throw(c, erofs_no_alloc_info);
-
- return __bch2_fs_read_write(c, false);
-}
-
-int bch2_fs_read_write_early(struct bch_fs *c)
-{
- down_write(&c->state_lock);
- int ret = __bch2_fs_read_write(c, true);
- up_write(&c->state_lock);
-
- return ret;
-}
-
-/* Filesystem startup/shutdown: */
-
-static void __bch2_fs_free(struct bch_fs *c)
-{
- for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++)
- bch2_time_stats_exit(&c->times[i]);
-
-#ifdef CONFIG_UNICODE
- utf8_unload(c->cf_encoding);
-#endif
-
- bch2_find_btree_nodes_exit(&c->found_btree_nodes);
- bch2_free_pending_node_rewrites(c);
- bch2_free_fsck_errs(c);
- bch2_fs_vfs_exit(c);
- bch2_fs_snapshots_exit(c);
- bch2_fs_sb_errors_exit(c);
- bch2_fs_replicas_exit(c);
- bch2_fs_rebalance_exit(c);
- bch2_fs_quota_exit(c);
- bch2_fs_nocow_locking_exit(c);
- bch2_fs_journal_exit(&c->journal);
- bch2_fs_fs_io_direct_exit(c);
- bch2_fs_fs_io_buffered_exit(c);
- bch2_fs_fsio_exit(c);
- bch2_fs_io_write_exit(c);
- bch2_fs_io_read_exit(c);
- bch2_fs_encryption_exit(c);
- bch2_fs_ec_exit(c);
- bch2_fs_counters_exit(c);
- bch2_fs_compress_exit(c);
- bch2_io_clock_exit(&c->io_clock[WRITE]);
- bch2_io_clock_exit(&c->io_clock[READ]);
- bch2_fs_buckets_waiting_for_journal_exit(c);
- bch2_fs_btree_write_buffer_exit(c);
- bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
- bch2_fs_btree_iter_exit(c);
- bch2_fs_btree_interior_update_exit(c);
- bch2_fs_btree_cache_exit(c);
- bch2_fs_accounting_exit(c);
- bch2_fs_async_obj_exit(c);
- bch2_journal_keys_put_initial(c);
- bch2_find_btree_nodes_exit(&c->found_btree_nodes);
-
- BUG_ON(atomic_read(&c->journal_keys.ref));
- percpu_free_rwsem(&c->mark_lock);
- if (c->online_reserved) {
- u64 v = percpu_u64_get(c->online_reserved);
- WARN(v, "online_reserved not 0 at shutdown: %lli", v);
- free_percpu(c->online_reserved);
- }
-
- darray_exit(&c->incompat_versions_requested);
- darray_exit(&c->btree_roots_extra);
- free_percpu(c->pcpu);
- free_percpu(c->usage);
- mempool_exit(&c->large_bkey_pool);
- mempool_exit(&c->btree_bounce_pool);
- bioset_exit(&c->btree_bio);
- mempool_exit(&c->fill_iter);
- enumerated_ref_exit(&c->writes);
- kfree(rcu_dereference_protected(c->disk_groups, 1));
- kfree(c->journal_seq_blacklist_table);
-
- if (c->write_ref_wq)
- destroy_workqueue(c->write_ref_wq);
- if (c->btree_write_submit_wq)
- destroy_workqueue(c->btree_write_submit_wq);
- if (c->btree_read_complete_wq)
- destroy_workqueue(c->btree_read_complete_wq);
- if (c->copygc_wq)
- destroy_workqueue(c->copygc_wq);
- if (c->btree_write_complete_wq)
- destroy_workqueue(c->btree_write_complete_wq);
- if (c->btree_update_wq)
- destroy_workqueue(c->btree_update_wq);
-
- bch2_free_super(&c->disk_sb);
- kvfree(c);
- module_put(THIS_MODULE);
-}
-
-static void bch2_fs_release(struct kobject *kobj)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
-
- __bch2_fs_free(c);
-}
-
-void __bch2_fs_stop(struct bch_fs *c)
-{
- bch_verbose(c, "shutting down");
-
- set_bit(BCH_FS_stopping, &c->flags);
-
- down_write(&c->state_lock);
- bch2_fs_read_only(c);
- up_write(&c->state_lock);
-
- for (unsigned i = 0; i < c->sb.nr_devices; i++) {
- struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);
- if (ca)
- bch2_dev_io_ref_stop(ca, READ);
- }
-
- for_each_member_device(c, ca)
- bch2_dev_unlink(ca);
-
- if (c->kobj.state_in_sysfs)
- kobject_del(&c->kobj);
-
- bch2_fs_debug_exit(c);
- bch2_fs_chardev_exit(c);
-
- bch2_ro_ref_put(c);
- wait_event(c->ro_ref_wait, !refcount_read(&c->ro_ref));
-
- kobject_put(&c->counters_kobj);
- kobject_put(&c->time_stats);
- kobject_put(&c->opts_dir);
- kobject_put(&c->internal);
-
- /* btree prefetch might have kicked off reads in the background: */
- bch2_btree_flush_all_reads(c);
-
- for_each_member_device(c, ca)
- cancel_work_sync(&ca->io_error_work);
-
- cancel_work_sync(&c->read_only_work);
-}
-
-void bch2_fs_free(struct bch_fs *c)
-{
- mutex_lock(&bch_fs_list_lock);
- list_del(&c->list);
- mutex_unlock(&bch_fs_list_lock);
-
- closure_sync(&c->cl);
- closure_debug_destroy(&c->cl);
-
- for (unsigned i = 0; i < c->sb.nr_devices; i++) {
- struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);
-
- if (ca) {
- EBUG_ON(atomic_long_read(&ca->ref) != 1);
- bch2_dev_io_ref_stop(ca, READ);
- bch2_free_super(&ca->disk_sb);
- bch2_dev_free(ca);
- }
- }
-
- bch_verbose(c, "shutdown complete");
-
- kobject_put(&c->kobj);
-}
-
-void bch2_fs_stop(struct bch_fs *c)
-{
- __bch2_fs_stop(c);
- bch2_fs_free(c);
-}
-
-static int bch2_fs_online(struct bch_fs *c)
-{
- int ret = 0;
-
- lockdep_assert_held(&bch_fs_list_lock);
-
- if (c->sb.multi_device &&
- __bch2_uuid_to_fs(c->sb.uuid)) {
- bch_err(c, "filesystem UUID already open");
- return bch_err_throw(c, filesystem_uuid_already_open);
- }
-
- ret = bch2_fs_chardev_init(c);
- if (ret) {
- bch_err(c, "error creating character device");
- return ret;
- }
-
- bch2_fs_debug_init(c);
-
- ret = (c->sb.multi_device
- ? kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b)
- : kobject_add(&c->kobj, NULL, "%s", c->name)) ?:
- kobject_add(&c->internal, &c->kobj, "internal") ?:
- kobject_add(&c->opts_dir, &c->kobj, "options") ?:
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
- kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
-#endif
- kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
- bch2_opts_create_sysfs_files(&c->opts_dir, OPT_FS);
- if (ret) {
- bch_err(c, "error creating sysfs objects");
- return ret;
- }
-
- down_write(&c->state_lock);
-
- for_each_member_device(c, ca) {
- ret = bch2_dev_sysfs_online(c, ca);
- if (ret) {
- bch_err(c, "error creating sysfs objects");
- bch2_dev_put(ca);
- goto err;
- }
- }
-
- BUG_ON(!list_empty(&c->list));
- list_add(&c->list, &bch_fs_list);
-err:
- up_write(&c->state_lock);
- return ret;
-}
-
-int bch2_fs_init_rw(struct bch_fs *c)
-{
- if (test_bit(BCH_FS_rw_init_done, &c->flags))
- return 0;
-
- if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
- !(c->btree_write_complete_wq = alloc_workqueue("bcachefs_btree_write_complete",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
- !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
- !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_submit",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
- !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
- WQ_FREEZABLE, 0)))
- return bch_err_throw(c, ENOMEM_fs_other_alloc);
-
- int ret = bch2_fs_btree_interior_update_init(c) ?:
- bch2_fs_btree_write_buffer_init(c) ?:
- bch2_fs_fs_io_buffered_init(c) ?:
- bch2_fs_io_write_init(c) ?:
- bch2_fs_journal_init(&c->journal);
- if (ret)
- return ret;
-
- set_bit(BCH_FS_rw_init_done, &c->flags);
- return 0;
-}
-
-static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts,
- bch_sb_handles *sbs)
-{
- struct bch_fs *c;
- struct printbuf name = PRINTBUF;
- unsigned i, iter_size;
- int ret = 0;
-
- c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
- if (!c) {
- c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc);
- goto out;
- }
-
- c->stdio = (void *)(unsigned long) opts->stdio;
-
- __module_get(THIS_MODULE);
-
- closure_init(&c->cl, NULL);
-
- c->kobj.kset = bcachefs_kset;
- kobject_init(&c->kobj, &bch2_fs_ktype);
- kobject_init(&c->internal, &bch2_fs_internal_ktype);
- kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
- kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
- kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype);
-
- c->minor = -1;
- c->disk_sb.fs_sb = true;
-
- init_rwsem(&c->state_lock);
- mutex_init(&c->sb_lock);
- mutex_init(&c->replicas_gc_lock);
- mutex_init(&c->btree_root_lock);
- INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);
-
- refcount_set(&c->ro_ref, 1);
- init_waitqueue_head(&c->ro_ref_wait);
-
- for (i = 0; i < BCH_TIME_STAT_NR; i++)
- bch2_time_stats_init(&c->times[i]);
-
- bch2_fs_allocator_background_init(c);
- bch2_fs_allocator_foreground_init(c);
- bch2_fs_btree_cache_init_early(&c->btree_cache);
- bch2_fs_btree_gc_init_early(c);
- bch2_fs_btree_interior_update_init_early(c);
- bch2_fs_btree_iter_init_early(c);
- bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
- bch2_fs_btree_write_buffer_init_early(c);
- bch2_fs_copygc_init(c);
- bch2_fs_ec_init_early(c);
- bch2_fs_journal_init_early(&c->journal);
- bch2_fs_journal_keys_init(c);
- bch2_fs_move_init(c);
- bch2_fs_nocow_locking_init_early(c);
- bch2_fs_quota_init(c);
- bch2_fs_recovery_passes_init(c);
- bch2_fs_sb_errors_init_early(c);
- bch2_fs_snapshots_init_early(c);
- bch2_fs_subvolumes_init_early(c);
-
- INIT_LIST_HEAD(&c->list);
-
- mutex_init(&c->bio_bounce_pages_lock);
- mutex_init(&c->snapshot_table_lock);
- init_rwsem(&c->snapshot_create_lock);
-
- spin_lock_init(&c->btree_write_error_lock);
-
- INIT_LIST_HEAD(&c->journal_iters);
-
- INIT_LIST_HEAD(&c->fsck_error_msgs);
- mutex_init(&c->fsck_error_msgs_lock);
-
- seqcount_init(&c->usage_lock);
-
- sema_init(&c->io_in_flight, 128);
-
- INIT_LIST_HEAD(&c->vfs_inodes_list);
- mutex_init(&c->vfs_inodes_lock);
-
- c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write];
- c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write];
- c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
-
- mutex_init(&c->sectors_available_lock);
-
- ret = percpu_init_rwsem(&c->mark_lock);
- if (ret)
- goto err;
-
- mutex_lock(&c->sb_lock);
- ret = bch2_sb_to_fs(c, sb);
- mutex_unlock(&c->sb_lock);
-
- if (ret)
- goto err;
-
- /* Compat: */
- if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
- !BCH_SB_JOURNAL_FLUSH_DELAY(sb))
- SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000);
-
- if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
- !BCH_SB_JOURNAL_RECLAIM_DELAY(sb))
- SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100);
-
- c->opts = bch2_opts_default;
- ret = bch2_opts_from_sb(&c->opts, sb);
- if (ret)
- goto err;
-
- bch2_opts_apply(&c->opts, *opts);
-
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
- c->opts.block_size > PAGE_SIZE) {
- bch_err(c, "cannot mount bs > ps filesystem without CONFIG_TRANSPARENT_HUGEPAGE");
- ret = -EINVAL;
- goto err;
- }
-
- c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
- if (c->opts.inodes_use_key_cache)
- c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes;
- c->btree_key_cache_btrees |= 1U << BTREE_ID_logged_ops;
-
- c->block_bits = ilog2(block_sectors(c));
- c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);
-
- if (bch2_fs_init_fault("fs_alloc")) {
- bch_err(c, "fs_alloc fault injected");
- ret = -EFAULT;
- goto err;
- }
-
- if (c->sb.multi_device)
- pr_uuid(&name, c->sb.user_uuid.b);
- else
- prt_bdevname(&name, sbs->data[0].bdev);
-
- ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
- if (ret)
- goto err;
-
- strscpy(c->name, name.buf, sizeof(c->name));
- printbuf_exit(&name);
-
- iter_size = sizeof(struct sort_iter) +
- (btree_blocks(c) + 1) * 2 *
- sizeof(struct sort_iter_set);
-
- if (!(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) ||
- enumerated_ref_init(&c->writes, BCH_WRITE_REF_NR,
- bch2_writes_disabled) ||
- mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
- bioset_init(&c->btree_bio, 1,
- max(offsetof(struct btree_read_bio, bio),
- offsetof(struct btree_write_bio, wbio.bio)),
- BIOSET_NEED_BVECS) ||
- !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
- !(c->usage = alloc_percpu(struct bch_fs_usage_base)) ||
- !(c->online_reserved = alloc_percpu(u64)) ||
- mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
- c->opts.btree_node_size) ||
- mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) {
- ret = bch_err_throw(c, ENOMEM_fs_other_alloc);
- goto err;
- }
-
- ret =
- bch2_fs_async_obj_init(c) ?:
- bch2_fs_btree_cache_init(c) ?:
- bch2_fs_btree_iter_init(c) ?:
- bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
- bch2_fs_buckets_waiting_for_journal_init(c) ?:
- bch2_io_clock_init(&c->io_clock[READ]) ?:
- bch2_io_clock_init(&c->io_clock[WRITE]) ?:
- bch2_fs_compress_init(c) ?:
- bch2_fs_counters_init(c) ?:
- bch2_fs_ec_init(c) ?:
- bch2_fs_encryption_init(c) ?:
- bch2_fs_fsio_init(c) ?:
- bch2_fs_fs_io_direct_init(c) ?:
- bch2_fs_io_read_init(c) ?:
- bch2_fs_rebalance_init(c) ?:
- bch2_fs_sb_errors_init(c) ?:
- bch2_fs_vfs_init(c);
- if (ret)
- goto err;
-
- if (go_rw_in_recovery(c)) {
- /*
- * start workqueues/kworkers early - kthread creation checks for
- * pending signals, which is _very_ annoying
- */
- ret = bch2_fs_init_rw(c);
- if (ret)
- goto err;
- }
-
-#ifdef CONFIG_UNICODE
- if (bch2_fs_casefold_enabled(c)) {
- /* Default encoding until we can potentially have more as an option. */
- c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING);
- if (IS_ERR(c->cf_encoding)) {
- printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u",
- unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
- unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
- unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
- ret = -EINVAL;
- goto err;
- }
- }
-#else
- if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) {
- printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n");
- ret = -EINVAL;
- goto err;
- }
-#endif
-
- for (i = 0; i < c->sb.nr_devices; i++) {
- if (!bch2_member_exists(c->disk_sb.sb, i))
- continue;
- ret = bch2_dev_alloc(c, i);
- if (ret)
- goto err;
- }
-
- bch2_journal_entry_res_resize(&c->journal,
- &c->btree_root_journal_res,
- BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX));
- bch2_journal_entry_res_resize(&c->journal,
- &c->clock_journal_res,
- (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2);
-
- mutex_lock(&bch_fs_list_lock);
- ret = bch2_fs_online(c);
- mutex_unlock(&bch_fs_list_lock);
-
- if (ret)
- goto err;
-out:
- return c;
-err:
- bch2_fs_free(c);
- c = ERR_PTR(ret);
- goto out;
-}
-
-noinline_for_stack
-static void print_mount_opts(struct bch_fs *c)
-{
- enum bch_opt_id i;
- CLASS(printbuf, p)();
- bch2_log_msg_start(c, &p);
-
- prt_str(&p, "starting version ");
- bch2_version_to_text(&p, c->sb.version);
-
- bool first = true;
- for (i = 0; i < bch2_opts_nr; i++) {
- const struct bch_option *opt = &bch2_opt_table[i];
- u64 v = bch2_opt_get_by_id(&c->opts, i);
-
- if (!(opt->flags & OPT_MOUNT))
- continue;
-
- if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
- continue;
-
- prt_str(&p, first ? " opts=" : ",");
- first = false;
- bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE);
- }
-
- if (c->sb.version_incompat_allowed != c->sb.version) {
- prt_printf(&p, "\nallowing incompatible features above ");
- bch2_version_to_text(&p, c->sb.version_incompat_allowed);
- }
-
- if (c->opts.verbose) {
- prt_printf(&p, "\nfeatures: ");
- prt_bitflags(&p, bch2_sb_features, c->sb.features);
- }
-
- if (c->sb.multi_device) {
- prt_printf(&p, "\nwith devices");
- for_each_online_member(c, ca, BCH_DEV_READ_REF_bch2_online_devs) {
- prt_char(&p, ' ');
- prt_str(&p, ca->name);
- }
- }
-
- bch2_print_str(c, KERN_INFO, p.buf);
-}
-
-static bool bch2_fs_may_start(struct bch_fs *c)
-{
- struct bch_dev *ca;
- unsigned flags = 0;
-
- switch (c->opts.degraded) {
- case BCH_DEGRADED_very:
- flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST;
- break;
- case BCH_DEGRADED_yes:
- flags |= BCH_FORCE_IF_DEGRADED;
- break;
- default:
- mutex_lock(&c->sb_lock);
- for (unsigned i = 0; i < c->disk_sb.sb->nr_devices; i++) {
- if (!bch2_member_exists(c->disk_sb.sb, i))
- continue;
-
- ca = bch2_dev_locked(c, i);
-
- if (!bch2_dev_is_online(ca) &&
- (ca->mi.state == BCH_MEMBER_STATE_rw ||
- ca->mi.state == BCH_MEMBER_STATE_ro)) {
- mutex_unlock(&c->sb_lock);
- return false;
- }
- }
- mutex_unlock(&c->sb_lock);
- break;
- }
-
- return bch2_have_enough_devs(c, c->online_devs, flags, true);
-}
-
-int bch2_fs_start(struct bch_fs *c)
-{
- time64_t now = ktime_get_real_seconds();
- int ret = 0;
-
- print_mount_opts(c);
-
- if (c->cf_encoding)
- bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u",
- unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
- unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
- unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
-
- if (!bch2_fs_may_start(c))
- return bch_err_throw(c, insufficient_devices_to_start);
-
- down_write(&c->state_lock);
- mutex_lock(&c->sb_lock);
-
- BUG_ON(test_bit(BCH_FS_started, &c->flags));
-
- if (!bch2_sb_field_get_minsize(&c->disk_sb, ext,
- sizeof(struct bch_sb_field_ext) / sizeof(u64))) {
- mutex_unlock(&c->sb_lock);
- up_write(&c->state_lock);
- ret = bch_err_throw(c, ENOSPC_sb);
- goto err;
- }
-
- ret = bch2_sb_members_v2_init(c);
- if (ret) {
- mutex_unlock(&c->sb_lock);
- up_write(&c->state_lock);
- goto err;
- }
-
- scoped_guard(rcu)
- for_each_online_member_rcu(c, ca)
- bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
- cpu_to_le64(now);
-
- /*
- * Don't write superblock yet: recovery might have to downgrade
- */
- mutex_unlock(&c->sb_lock);
-
- scoped_guard(rcu)
- for_each_online_member_rcu(c, ca)
- if (ca->mi.state == BCH_MEMBER_STATE_rw)
- bch2_dev_allocator_add(c, ca);
- bch2_recalc_capacity(c);
- up_write(&c->state_lock);
-
- c->recovery_task = current;
- ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
- ? bch2_fs_recovery(c)
- : bch2_fs_initialize(c);
- c->recovery_task = NULL;
-
- if (ret)
- goto err;
-
- ret = bch2_opts_hooks_pre_set(c);
- if (ret)
- goto err;
-
- if (bch2_fs_init_fault("fs_start")) {
- ret = bch_err_throw(c, injected_fs_start);
- goto err;
- }
-
- set_bit(BCH_FS_started, &c->flags);
- wake_up(&c->ro_ref_wait);
-
- down_write(&c->state_lock);
- if (c->opts.read_only)
- bch2_fs_read_only(c);
- else if (!test_bit(BCH_FS_rw, &c->flags))
- ret = bch2_fs_read_write(c);
- up_write(&c->state_lock);
-
-err:
- if (ret)
- bch_err_msg(c, ret, "starting filesystem");
- else
- bch_verbose(c, "done starting filesystem");
- return ret;
-}
-
-static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
-{
- struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
-
- if (le16_to_cpu(sb->block_size) != block_sectors(c))
- return bch_err_throw(c, mismatched_block_size);
-
- if (le16_to_cpu(m.bucket_size) <
- BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
- return bch_err_throw(c, bucket_size_too_small);
-
- return 0;
-}
-
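-/*
- * Check that a device's superblock is consistent with the filesystem it's
- * being attached to: same UUID, still a member, matching block size - and
- * detect split brain via mismatched seq/write_time fields:
- */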
-static int bch2_dev_in_fs(struct bch_sb_handle *fs,
- struct bch_sb_handle *sb,
- struct bch_opts *opts)
-{
- if (fs == sb)
- return 0;
-
- if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid))
- return -BCH_ERR_device_not_a_member_of_filesystem;
-
- if (!bch2_member_exists(fs->sb, sb->sb->dev_idx))
- return -BCH_ERR_device_has_been_removed;
-
- if (fs->sb->block_size != sb->sb->block_size)
- return -BCH_ERR_mismatched_block_size;
-
- if (le16_to_cpu(fs->sb->version) < bcachefs_metadata_version_member_seq ||
- le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_member_seq)
- return 0;
-
- if (fs->sb->seq == sb->sb->seq &&
- fs->sb->write_time != sb->sb->write_time) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "Split brain detected between ");
- prt_bdevname(&buf, sb->bdev);
- prt_str(&buf, " and ");
- prt_bdevname(&buf, fs->bdev);
- prt_char(&buf, ':');
- prt_newline(&buf);
- prt_printf(&buf, "seq=%llu but write_time different, got", le64_to_cpu(sb->sb->seq));
- prt_newline(&buf);
-
- prt_bdevname(&buf, fs->bdev);
- prt_char(&buf, ' ');
- bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));
- prt_newline(&buf);
-
- prt_bdevname(&buf, sb->bdev);
- prt_char(&buf, ' ');
- bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));
- prt_newline(&buf);
-
- if (!opts->no_splitbrain_check)
- prt_printf(&buf, "Not using older sb");
-
- pr_err("%s", buf.buf);
- printbuf_exit(&buf);
-
- if (!opts->no_splitbrain_check)
- return -BCH_ERR_device_splitbrain;
- }
-
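-	/*
-	 * The filesystem's record of this member's seq is behind the seq the
-	 * member itself reports: the member superblock was written more
-	 * recently than the rest of the filesystem saw:
-	 */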
- struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx);
- u64 seq_from_fs = le64_to_cpu(m.seq);
- u64 seq_from_member = le64_to_cpu(sb->sb->seq);
-
- if (seq_from_fs && seq_from_fs < seq_from_member) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "Split brain detected between ");
- prt_bdevname(&buf, sb->bdev);
- prt_str(&buf, " and ");
- prt_bdevname(&buf, fs->bdev);
- prt_char(&buf, ':');
- prt_newline(&buf);
-
- prt_bdevname(&buf, fs->bdev);
- prt_str(&buf, " believes seq of ");
- prt_bdevname(&buf, sb->bdev);
- prt_printf(&buf, " to be %llu, but ", seq_from_fs);
- prt_bdevname(&buf, sb->bdev);
- prt_printf(&buf, " has %llu\n", seq_from_member);
-
- if (!opts->no_splitbrain_check) {
- prt_str(&buf, "Not using ");
- prt_bdevname(&buf, sb->bdev);
- }
-
- pr_err("%s", buf.buf);
- printbuf_exit(&buf);
-
- if (!opts->no_splitbrain_check)
- return -BCH_ERR_device_splitbrain;
- }
-
- return 0;
-}
-
-/* Device startup/shutdown: */
-
-static void bch2_dev_io_ref_stop(struct bch_dev *ca, int rw)
-{
- if (rw == READ)
- clear_bit(ca->dev_idx, ca->fs->online_devs.d);
-
- if (!enumerated_ref_is_zero(&ca->io_ref[rw]))
- enumerated_ref_stop(&ca->io_ref[rw],
- rw == READ
- ? bch2_dev_read_refs
- : bch2_dev_write_refs);
-}
-
-static void bch2_dev_release(struct kobject *kobj)
-{
- struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
-
- kfree(ca);
-}
-
-static void bch2_dev_free(struct bch_dev *ca)
-{
- WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE]));
- WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[READ]));
-
- cancel_work_sync(&ca->io_error_work);
-
- bch2_dev_unlink(ca);
-
- if (ca->kobj.state_in_sysfs)
- kobject_del(&ca->kobj);
-
- bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch);
- bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty);
-
- bch2_free_super(&ca->disk_sb);
- bch2_dev_allocator_background_exit(ca);
- bch2_dev_journal_exit(ca);
-
- free_percpu(ca->io_done);
- bch2_dev_buckets_free(ca);
- kfree(ca->sb_read_scratch);
-
- bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]);
- bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);
-
- enumerated_ref_exit(&ca->io_ref[WRITE]);
- enumerated_ref_exit(&ca->io_ref[READ]);
-#ifndef CONFIG_BCACHEFS_DEBUG
- percpu_ref_exit(&ca->ref);
-#endif
- kobject_put(&ca->kobj);
-}
-
-static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
-{
- lockdep_assert_held(&c->state_lock);
-
- if (enumerated_ref_is_zero(&ca->io_ref[READ]))
- return;
-
- __bch2_dev_read_only(c, ca);
-
- bch2_dev_io_ref_stop(ca, READ);
-
- bch2_dev_unlink(ca);
-
- bch2_free_super(&ca->disk_sb);
- bch2_dev_journal_exit(ca);
-}
-
-#ifndef CONFIG_BCACHEFS_DEBUG
-static void bch2_dev_ref_complete(struct percpu_ref *ref)
-{
- struct bch_dev *ca = container_of(ref, struct bch_dev, ref);
-
- complete(&ca->ref_completion);
-}
-#endif
-
-static void bch2_dev_unlink(struct bch_dev *ca)
-{
- struct kobject *b;
-
- /*
- * This is racy w.r.t. the underlying block device being hot-removed,
- * which removes it from sysfs.
- *
- * It'd be lovely if we had a way to handle this race, but the sysfs
- * code doesn't appear to provide a good method and block/holder.c is
- * susceptible as well:
- */
- if (ca->kobj.state_in_sysfs &&
- ca->disk_sb.bdev &&
- (b = bdev_kobj(ca->disk_sb.bdev))->state_in_sysfs) {
- sysfs_remove_link(b, "bcachefs");
- sysfs_remove_link(&ca->kobj, "block");
- }
-}
-
-static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
-{
- int ret;
-
- if (!c->kobj.state_in_sysfs)
- return 0;
-
- if (!ca->kobj.state_in_sysfs) {
- ret = kobject_add(&ca->kobj, &c->kobj, "dev-%u", ca->dev_idx) ?:
- bch2_opts_create_sysfs_files(&ca->kobj, OPT_DEVICE);
- if (ret)
- return ret;
- }
-
- if (ca->disk_sb.bdev) {
- struct kobject *block = bdev_kobj(ca->disk_sb.bdev);
-
- ret = sysfs_create_link(block, &ca->kobj, "bcachefs");
- if (ret)
- return ret;
-
- ret = sysfs_create_link(&ca->kobj, block, "block");
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
- struct bch_member *member)
-{
- struct bch_dev *ca;
- unsigned i;
-
- ca = kzalloc(sizeof(*ca), GFP_KERNEL);
- if (!ca)
- return NULL;
-
- kobject_init(&ca->kobj, &bch2_dev_ktype);
- init_completion(&ca->ref_completion);
-
- INIT_WORK(&ca->io_error_work, bch2_io_error_work);
-
- bch2_time_stats_quantiles_init(&ca->io_latency[READ]);
- bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]);
-
- ca->mi = bch2_mi_to_cpu(member);
-
- for (i = 0; i < ARRAY_SIZE(member->errors); i++)
- atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i]));
-
- ca->uuid = member->uuid;
-
- ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
- ca->mi.bucket_size / btree_sectors(c));
-
-#ifndef CONFIG_BCACHEFS_DEBUG
- if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL))
- goto err;
-#else
- atomic_long_set(&ca->ref, 1);
-#endif
-
- mutex_init(&ca->bucket_backpointer_mismatch.lock);
- mutex_init(&ca->bucket_backpointer_empty.lock);
-
- bch2_dev_allocator_background_init(ca);
-
- if (enumerated_ref_init(&ca->io_ref[READ], BCH_DEV_READ_REF_NR, NULL) ||
- enumerated_ref_init(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_NR, NULL) ||
- !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) ||
- bch2_dev_buckets_alloc(c, ca) ||
- !(ca->io_done = alloc_percpu(*ca->io_done)))
- goto err;
-
- return ca;
-err:
- bch2_dev_free(ca);
- return NULL;
-}
-
-static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,
- unsigned dev_idx)
-{
- ca->dev_idx = dev_idx;
- __set_bit(ca->dev_idx, ca->self.d);
-
- if (!ca->name[0])
- scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
-
- ca->fs = c;
- rcu_assign_pointer(c->devs[ca->dev_idx], ca);
-
- if (bch2_dev_sysfs_online(c, ca))
- pr_warn("error creating sysfs objects");
-}
-
-static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
-{
- struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
- struct bch_dev *ca = NULL;
-
- if (bch2_fs_init_fault("dev_alloc"))
- goto err;
-
- ca = __bch2_dev_alloc(c, &member);
- if (!ca)
- goto err;
-
- ca->fs = c;
-
- bch2_dev_attach(c, ca, dev_idx);
- return 0;
-err:
- return bch_err_throw(c, ENOMEM_dev_alloc);
-}
-
-static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
-{
-	int ret;
-
- if (bch2_dev_is_online(ca)) {
- bch_err(ca, "already have device online in slot %u",
- sb->sb->dev_idx);
- return bch_err_throw(ca->fs, device_already_online);
- }
-
- if (get_capacity(sb->bdev->bd_disk) <
- ca->mi.bucket_size * ca->mi.nbuckets) {
- bch_err(ca, "cannot online: device too small");
- return bch_err_throw(ca->fs, device_size_too_small);
- }
-
- BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[READ]));
- BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE]));
-
- ret = bch2_dev_journal_init(ca, sb->sb);
- if (ret)
- return ret;
-
- struct printbuf name = PRINTBUF;
- prt_bdevname(&name, sb->bdev);
- strscpy(ca->name, name.buf, sizeof(ca->name));
- printbuf_exit(&name);
-
- /* Commit: */
- ca->disk_sb = *sb;
- memset(sb, 0, sizeof(*sb));
-
- /*
- * Stash pointer to the filesystem for blk_holder_ops - note that once
- * attached to a filesystem, we will always close the block device
- * before tearing down the filesystem object.
- */
- ca->disk_sb.holder->c = ca->fs;
-
- ca->dev = ca->disk_sb.bdev->bd_dev;
-
- enumerated_ref_start(&ca->io_ref[READ]);
-
- return 0;
-}
-
-static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
-{
- struct bch_dev *ca;
- int ret;
-
- lockdep_assert_held(&c->state_lock);
-
- if (le64_to_cpu(sb->sb->seq) >
- le64_to_cpu(c->disk_sb.sb->seq))
- bch2_sb_to_fs(c, sb->sb);
-
- BUG_ON(!bch2_dev_exists(c, sb->sb->dev_idx));
-
- ca = bch2_dev_locked(c, sb->sb->dev_idx);
-
- ret = __bch2_dev_attach_bdev(ca, sb);
- if (ret)
- return ret;
-
- set_bit(ca->dev_idx, c->online_devs.d);
-
- bch2_dev_sysfs_online(c, ca);
-
- bch2_rebalance_wakeup(c);
- return 0;
-}
-
-/* Device management: */
-
-/*
- * Note: this function is also used by the error paths - when a particular
- * device sees an error, we call it to determine whether we can just set the
- * device RO, or - if this function returns false - we'll set the whole
- * filesystem RO:
- *
- * XXX: maybe we should be more explicit about whether we're changing state
- * because we got an error or what have you?
- */
-bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
- enum bch_member_state new_state, int flags)
-{
- struct bch_devs_mask new_online_devs;
- int nr_rw = 0, required;
-
- lockdep_assert_held(&c->state_lock);
-
- switch (new_state) {
- case BCH_MEMBER_STATE_rw:
- return true;
- case BCH_MEMBER_STATE_ro:
- if (ca->mi.state != BCH_MEMBER_STATE_rw)
- return true;
-
- /* do we have enough devices to write to? */
- for_each_member_device(c, ca2)
- if (ca2 != ca)
- nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw;
-
- required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
- ? c->opts.metadata_replicas
- : metadata_replicas_required(c),
- !(flags & BCH_FORCE_IF_DATA_DEGRADED)
- ? c->opts.data_replicas
- : data_replicas_required(c));
-
- return nr_rw >= required;
- case BCH_MEMBER_STATE_failed:
- case BCH_MEMBER_STATE_spare:
- if (ca->mi.state != BCH_MEMBER_STATE_rw &&
- ca->mi.state != BCH_MEMBER_STATE_ro)
- return true;
-
- /* do we have enough devices to read from? */
- new_online_devs = c->online_devs;
- __clear_bit(ca->dev_idx, new_online_devs.d);
-
- return bch2_have_enough_devs(c, new_online_devs, flags, false);
- default:
- BUG();
- }
-}
-
-static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
-{
- bch2_dev_io_ref_stop(ca, WRITE);
-
- /*
- * The allocator thread itself allocates btree nodes, so stop it first:
- */
- bch2_dev_allocator_remove(c, ca);
- bch2_recalc_capacity(c);
- bch2_dev_journal_stop(&c->journal, ca);
-}
-
-static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
-{
- lockdep_assert_held(&c->state_lock);
-
- BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw);
-
- bch2_dev_allocator_add(c, ca);
- bch2_recalc_capacity(c);
-
- if (enumerated_ref_is_zero(&ca->io_ref[WRITE]))
- enumerated_ref_start(&ca->io_ref[WRITE]);
-
- bch2_dev_do_discards(ca);
-}
-
-int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
- enum bch_member_state new_state, int flags)
-{
- struct bch_member *m;
- int ret = 0;
-
- if (ca->mi.state == new_state)
- return 0;
-
- if (!bch2_dev_state_allowed(c, ca, new_state, flags))
- return bch_err_throw(c, device_state_not_allowed);
-
- if (new_state != BCH_MEMBER_STATE_rw)
- __bch2_dev_read_only(c, ca);
-
- bch_notice(ca, "%s", bch2_member_states[new_state]);
-
- mutex_lock(&c->sb_lock);
- m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
- SET_BCH_MEMBER_STATE(m, new_state);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- if (new_state == BCH_MEMBER_STATE_rw)
- __bch2_dev_read_write(c, ca);
-
- bch2_rebalance_wakeup(c);
-
- return ret;
-}
-
-int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
- enum bch_member_state new_state, int flags)
-{
- int ret;
-
- down_write(&c->state_lock);
- ret = __bch2_dev_set_state(c, ca, new_state, flags);
- up_write(&c->state_lock);
-
- return ret;
-}
-
-/* Device add/removal: */
-
-int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
-{
- struct bch_member *m;
- unsigned dev_idx = ca->dev_idx, data;
- bool fast_device_removal = !bch2_request_incompat_feature(c,
- bcachefs_metadata_version_fast_device_removal);
- int ret;
-
- down_write(&c->state_lock);
-
- /*
- * We consume a reference to ca->ref, regardless of whether we succeed
- * or fail:
- */
- bch2_dev_put(ca);
-
- if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
- bch_err(ca, "Cannot remove without losing data");
- ret = bch_err_throw(c, device_state_not_allowed);
- goto err;
- }
-
- __bch2_dev_read_only(c, ca);
-
- ret = fast_device_removal
- ? bch2_dev_data_drop_by_backpointers(c, ca->dev_idx, flags)
- : (bch2_dev_data_drop(c, ca->dev_idx, flags) ?:
- bch2_dev_remove_stripes(c, ca->dev_idx, flags));
- if (ret)
- goto err;
-
- /* Check if device still has data before blowing away alloc info */
- struct bch_dev_usage usage = bch2_dev_usage_read(ca);
- for (unsigned i = 0; i < BCH_DATA_NR; i++)
- if (!data_type_is_empty(i) &&
- !data_type_is_hidden(i) &&
- usage.buckets[i]) {
- bch_err(ca, "Remove failed: still has data (%s, %llu buckets)",
- __bch2_data_types[i], usage.buckets[i]);
- ret = -EBUSY;
- goto err;
- }
-
- ret = bch2_dev_remove_alloc(c, ca);
- bch_err_msg(ca, ret, "bch2_dev_remove_alloc()");
- if (ret)
- goto err;
-
- /*
- * We need to flush the entire journal to get rid of keys that reference
- * the device being removed before removing the superblock entry
- */
- bch2_journal_flush_all_pins(&c->journal);
-
- /*
- * this is really just needed for the bch2_replicas_gc_(start|end)
- * calls, and could be cleaned up:
- */
- ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
- bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()");
- if (ret)
- goto err;
-
- ret = bch2_journal_flush(&c->journal);
- bch_err_msg(ca, ret, "bch2_journal_flush()");
- if (ret)
- goto err;
-
- ret = bch2_replicas_gc2(c);
- bch_err_msg(ca, ret, "bch2_replicas_gc2()");
- if (ret)
- goto err;
-
- data = bch2_dev_has_data(c, ca);
- if (data) {
- struct printbuf data_has = PRINTBUF;
-
- prt_bitflags(&data_has, __bch2_data_types, data);
- bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);
- printbuf_exit(&data_has);
- ret = -EBUSY;
- goto err;
- }
-
- __bch2_dev_offline(c, ca);
-
- mutex_lock(&c->sb_lock);
- rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
- mutex_unlock(&c->sb_lock);
-
-#ifndef CONFIG_BCACHEFS_DEBUG
- percpu_ref_kill(&ca->ref);
-#else
- ca->dying = true;
- bch2_dev_put(ca);
-#endif
- wait_for_completion(&ca->ref_completion);
-
- bch2_dev_free(ca);
-
- /*
- * Free this device's slot in the bch_member array - all pointers to
- * this device must be gone:
- */
- mutex_lock(&c->sb_lock);
- m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
-
- if (fast_device_removal)
- m->uuid = BCH_SB_MEMBER_DELETED_UUID;
- else
- memset(&m->uuid, 0, sizeof(m->uuid));
-
- bch2_write_super(c);
-
- mutex_unlock(&c->sb_lock);
- up_write(&c->state_lock);
- return 0;
-err:
- if (test_bit(BCH_FS_rw, &c->flags) &&
- ca->mi.state == BCH_MEMBER_STATE_rw &&
- !enumerated_ref_is_zero(&ca->io_ref[READ]))
- __bch2_dev_read_write(c, ca);
- up_write(&c->state_lock);
- return ret;
-}
-
-/* Add new device to running filesystem: */
-int bch2_dev_add(struct bch_fs *c, const char *path)
-{
- struct bch_opts opts = bch2_opts_empty();
- struct bch_sb_handle sb = {};
- struct bch_dev *ca = NULL;
- struct printbuf errbuf = PRINTBUF;
- struct printbuf label = PRINTBUF;
- int ret = 0;
-
- ret = bch2_read_super(path, &opts, &sb);
- bch_err_msg(c, ret, "reading super");
- if (ret)
- goto err;
-
- struct bch_member dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
-
- if (BCH_MEMBER_GROUP(&dev_mi)) {
- bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
- if (label.allocation_failure) {
- ret = -ENOMEM;
- goto err;
- }
- }
-
- if (list_empty(&c->list)) {
- mutex_lock(&bch_fs_list_lock);
- if (__bch2_uuid_to_fs(c->sb.uuid))
- ret = bch_err_throw(c, filesystem_uuid_already_open);
- else
- list_add(&c->list, &bch_fs_list);
- mutex_unlock(&bch_fs_list_lock);
-
- if (ret) {
- bch_err(c, "filesystem UUID already open");
- goto err;
- }
- }
-
- ret = bch2_dev_may_add(sb.sb, c);
- if (ret)
- goto err;
-
- ca = __bch2_dev_alloc(c, &dev_mi);
- if (!ca) {
- ret = -ENOMEM;
- goto err;
- }
-
- ret = __bch2_dev_attach_bdev(ca, &sb);
- if (ret)
- goto err;
-
- down_write(&c->state_lock);
- mutex_lock(&c->sb_lock);
- SET_BCH_SB_MULTI_DEVICE(c->disk_sb.sb, true);
-
- ret = bch2_sb_from_fs(c, ca);
- bch_err_msg(c, ret, "setting up new superblock");
- if (ret)
- goto err_unlock;
-
- if (dynamic_fault("bcachefs:add:no_slot"))
- goto err_unlock;
-
- ret = bch2_sb_member_alloc(c);
- if (ret < 0) {
- bch_err_msg(c, ret, "setting up new superblock");
- goto err_unlock;
- }
- unsigned dev_idx = ret;
- ret = 0;
-
- /* success: */
-
- dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds());
- *bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi;
-
- ca->disk_sb.sb->dev_idx = dev_idx;
- bch2_dev_attach(c, ca, dev_idx);
-
- if (BCH_MEMBER_GROUP(&dev_mi)) {
- ret = __bch2_dev_group_set(c, ca, label.buf);
- bch_err_msg(c, ret, "creating new label");
- if (ret)
- goto err_unlock;
- }
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- if (test_bit(BCH_FS_started, &c->flags)) {
- ret = bch2_dev_usage_init(ca, false);
- if (ret)
- goto err_late;
-
- ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
- bch_err_msg(ca, ret, "marking new superblock");
- if (ret)
- goto err_late;
-
- ret = bch2_fs_freespace_init(c);
- bch_err_msg(ca, ret, "initializing free space");
- if (ret)
- goto err_late;
-
- if (ca->mi.state == BCH_MEMBER_STATE_rw)
- __bch2_dev_read_write(c, ca);
-
- ret = bch2_dev_journal_alloc(ca, false);
- bch_err_msg(c, ret, "allocating journal");
- if (ret)
- goto err_late;
- }
-
- /*
-	 * We just changed the superblock UUID; invalidate the cache and send a
- * uevent to update /dev/disk/by-uuid
- */
- invalidate_bdev(ca->disk_sb.bdev);
-
-	char uuid_str[sizeof("UUID=") + 36];
- snprintf(uuid_str, sizeof(uuid_str), "UUID=%pUb", &c->sb.uuid);
-
- char *envp[] = {
- "CHANGE=uuid",
- uuid_str,
- NULL,
- };
- kobject_uevent_env(&ca->disk_sb.bdev->bd_device.kobj, KOBJ_CHANGE, envp);
-
- up_write(&c->state_lock);
-out:
- printbuf_exit(&label);
- printbuf_exit(&errbuf);
- bch_err_fn(c, ret);
- return ret;
-
-err_unlock:
- mutex_unlock(&c->sb_lock);
- up_write(&c->state_lock);
-err:
- if (ca)
- bch2_dev_free(ca);
- bch2_free_super(&sb);
- goto out;
-err_late:
- up_write(&c->state_lock);
- ca = NULL;
- goto err;
-}
-
-/* Hot add existing device to running filesystem: */
-int bch2_dev_online(struct bch_fs *c, const char *path)
-{
- struct bch_opts opts = bch2_opts_empty();
- struct bch_sb_handle sb = { NULL };
- struct bch_dev *ca;
- unsigned dev_idx;
- int ret;
-
- down_write(&c->state_lock);
-
- ret = bch2_read_super(path, &opts, &sb);
- if (ret) {
- up_write(&c->state_lock);
- return ret;
- }
-
- dev_idx = sb.sb->dev_idx;
-
- ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts);
- bch_err_msg(c, ret, "bringing %s online", path);
- if (ret)
- goto err;
-
- ret = bch2_dev_attach_bdev(c, &sb);
- if (ret)
- goto err;
-
- ca = bch2_dev_locked(c, dev_idx);
-
- ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
- bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);
- if (ret)
- goto err;
-
- if (ca->mi.state == BCH_MEMBER_STATE_rw)
- __bch2_dev_read_write(c, ca);
-
- if (!ca->mi.freespace_initialized) {
- ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
- bch_err_msg(ca, ret, "initializing free space");
- if (ret)
- goto err;
- }
-
- if (!ca->journal.nr) {
- ret = bch2_dev_journal_alloc(ca, false);
- bch_err_msg(ca, ret, "allocating journal");
- if (ret)
- goto err;
- }
-
- mutex_lock(&c->sb_lock);
- bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
- cpu_to_le64(ktime_get_real_seconds());
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- up_write(&c->state_lock);
- return 0;
-err:
- up_write(&c->state_lock);
- bch2_free_super(&sb);
- return ret;
-}
-
-int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
-{
- down_write(&c->state_lock);
-
- if (!bch2_dev_is_online(ca)) {
- bch_err(ca, "Already offline");
- up_write(&c->state_lock);
- return 0;
- }
-
- if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
- bch_err(ca, "Cannot offline required disk");
- up_write(&c->state_lock);
- return bch_err_throw(c, device_state_not_allowed);
- }
-
- __bch2_dev_offline(c, ca);
-
- up_write(&c->state_lock);
- return 0;
-}
-
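-/*
- * Account the newly added buckets as free, then create freespace btree
- * entries for them:
- */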
-static int __bch2_dev_resize_alloc(struct bch_dev *ca, u64 old_nbuckets, u64 new_nbuckets)
-{
- struct bch_fs *c = ca->fs;
- u64 v[3] = { new_nbuckets - old_nbuckets, 0, 0 };
-
- return bch2_trans_commit_do(ca->fs, NULL, NULL, 0,
- bch2_disk_accounting_mod2(trans, false, v, dev_data_type,
- .dev = ca->dev_idx,
- .data_type = BCH_DATA_free)) ?:
- bch2_dev_freespace_init(c, ca, old_nbuckets, new_nbuckets);
-}
-
-int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
-{
- struct bch_member *m;
- u64 old_nbuckets;
- int ret = 0;
-
- down_write(&c->state_lock);
- old_nbuckets = ca->mi.nbuckets;
-
- if (nbuckets < ca->mi.nbuckets) {
- bch_err(ca, "Cannot shrink yet");
- ret = -EINVAL;
- goto err;
- }
-
- if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) {
- bch_err(ca, "New device size too big (%llu greater than max %u)",
- nbuckets, BCH_MEMBER_NBUCKETS_MAX);
- ret = bch_err_throw(c, device_size_too_big);
- goto err;
- }
-
- if (bch2_dev_is_online(ca) &&
- get_capacity(ca->disk_sb.bdev->bd_disk) <
- ca->mi.bucket_size * nbuckets) {
- bch_err(ca, "New size larger than device");
- ret = bch_err_throw(c, device_size_too_small);
- goto err;
- }
-
- ret = bch2_dev_buckets_resize(c, ca, nbuckets);
- bch_err_msg(ca, ret, "resizing buckets");
- if (ret)
- goto err;
-
- ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
- if (ret)
- goto err;
-
- mutex_lock(&c->sb_lock);
- m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
- m->nbuckets = cpu_to_le64(nbuckets);
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- if (ca->mi.freespace_initialized) {
- ret = __bch2_dev_resize_alloc(ca, old_nbuckets, nbuckets);
- if (ret)
- goto err;
- }
-
- bch2_recalc_capacity(c);
-err:
- up_write(&c->state_lock);
- return ret;
-}
-
-int bch2_fs_resize_on_mount(struct bch_fs *c)
-{
- for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_resize_on_mount) {
- u64 old_nbuckets = ca->mi.nbuckets;
- u64 new_nbuckets = div64_u64(get_capacity(ca->disk_sb.bdev->bd_disk),
- ca->mi.bucket_size);
-
- if (ca->mi.resize_on_mount &&
- new_nbuckets > ca->mi.nbuckets) {
- bch_info(ca, "resizing to size %llu", new_nbuckets * ca->mi.bucket_size);
- int ret = bch2_dev_buckets_resize(c, ca, new_nbuckets);
- bch_err_fn(ca, ret);
- if (ret) {
- enumerated_ref_put(&ca->io_ref[READ],
- BCH_DEV_READ_REF_fs_resize_on_mount);
- up_write(&c->state_lock);
- return ret;
- }
-
- mutex_lock(&c->sb_lock);
- struct bch_member *m =
- bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
- m->nbuckets = cpu_to_le64(new_nbuckets);
- SET_BCH_MEMBER_RESIZE_ON_MOUNT(m, false);
-
- c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_small_image));
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- if (ca->mi.freespace_initialized) {
- ret = __bch2_dev_resize_alloc(ca, old_nbuckets, new_nbuckets);
- if (ret) {
- enumerated_ref_put(&ca->io_ref[READ],
- BCH_DEV_READ_REF_fs_resize_on_mount);
- up_write(&c->state_lock);
- return ret;
- }
- }
- }
- }
- return 0;
-}
-
-/* return with ref on ca->ref: */
-struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
-{
- if (!strncmp(name, "/dev/", strlen("/dev/")))
- name += strlen("/dev/");
-
- for_each_member_device(c, ca)
- if (!strcmp(name, ca->name))
- return ca;
- return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
-}
-
-/* blk_holder_ops: */
-
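-/*
- * Look up the filesystem a block device belongs to via its holder; returns
- * with a read-only ref held (or NULL if the filesystem is going away), after
- * waiting for startup to complete:
- */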
-static struct bch_fs *bdev_get_fs(struct block_device *bdev)
- __releases(&bdev->bd_holder_lock)
-{
- struct bch_sb_handle_holder *holder = bdev->bd_holder;
- struct bch_fs *c = holder->c;
-
- if (c && !bch2_ro_ref_tryget(c))
- c = NULL;
-
- mutex_unlock(&bdev->bd_holder_lock);
-
- if (c)
- wait_event(c->ro_ref_wait, test_bit(BCH_FS_started, &c->flags));
- return c;
-}
-
-/* returns with ref on ca->ref */
-static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev)
-{
- for_each_member_device(c, ca)
- if (ca->disk_sb.bdev == bdev)
- return ca;
- return NULL;
-}
-
-static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
-{
- struct bch_fs *c = bdev_get_fs(bdev);
- if (!c)
- return;
-
- struct super_block *sb = c->vfs_sb;
- if (sb) {
- /*
- * Not necessary, c->ro_ref guards against the filesystem being
- * unmounted - we only take this to avoid a warning in
- * sync_filesystem:
- */
- down_read(&sb->s_umount);
- }
-
- down_write(&c->state_lock);
- struct bch_dev *ca = bdev_to_bch_dev(c, bdev);
- if (!ca)
- goto unlock;
-
- bool dev = bch2_dev_state_allowed(c, ca,
- BCH_MEMBER_STATE_failed,
- BCH_FORCE_IF_DEGRADED);
-
- if (!dev && sb) {
- if (!surprise)
- sync_filesystem(sb);
- shrink_dcache_sb(sb);
- evict_inodes(sb);
- }
-
- struct printbuf buf = PRINTBUF;
- __bch2_log_msg_start(ca->name, &buf);
-
- prt_printf(&buf, "offline from block layer");
-
- if (dev) {
- __bch2_dev_offline(c, ca);
- } else {
- bch2_journal_flush(&c->journal);
- bch2_fs_emergency_read_only2(c, &buf);
- }
-
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
-
- bch2_dev_put(ca);
-unlock:
- if (sb)
- up_read(&sb->s_umount);
- up_write(&c->state_lock);
- bch2_ro_ref_put(c);
-}
-
-static void bch2_fs_bdev_sync(struct block_device *bdev)
-{
- struct bch_fs *c = bdev_get_fs(bdev);
- if (!c)
- return;
-
- struct super_block *sb = c->vfs_sb;
- if (sb) {
- /*
- * Not necessary, c->ro_ref guards against the filesystem being
- * unmounted - we only take this to avoid a warning in
- * sync_filesystem:
- */
- down_read(&sb->s_umount);
- sync_filesystem(sb);
- up_read(&sb->s_umount);
- }
-
- bch2_ro_ref_put(c);
-}
-
-const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
- .mark_dead = bch2_fs_bdev_mark_dead,
- .sync = bch2_fs_bdev_sync,
-};
-
-/* Filesystem open: */
-
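-/* Order superblocks by seq, then write time; used to pick the newest copy: */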
-static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r)
-{
- return cmp_int(le64_to_cpu(l->seq), le64_to_cpu(r->seq)) ?:
- cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time));
-}
-
-struct bch_fs *bch2_fs_open(darray_const_str *devices,
- struct bch_opts *opts)
-{
- bch_sb_handles sbs = {};
- struct bch_fs *c = NULL;
- struct bch_sb_handle *best = NULL;
- struct printbuf errbuf = PRINTBUF;
- int ret = 0;
-
- if (!try_module_get(THIS_MODULE))
- return ERR_PTR(-ENODEV);
-
- if (!devices->nr) {
- ret = -EINVAL;
- goto err;
- }
-
- ret = darray_make_room(&sbs, devices->nr);
- if (ret)
- goto err;
-
- darray_for_each(*devices, i) {
- struct bch_sb_handle sb = { NULL };
-
- ret = bch2_read_super(*i, opts, &sb);
- if (ret)
- goto err;
-
- BUG_ON(darray_push(&sbs, sb));
- }
-
- if (opts->nochanges && !opts->read_only) {
- ret = bch_err_throw(c, erofs_nochanges);
- goto err_print;
- }
-
- darray_for_each(sbs, sb)
- if (!best || sb_cmp(sb->sb, best->sb) > 0)
- best = sb;
-
- darray_for_each_reverse(sbs, sb) {
- ret = bch2_dev_in_fs(best, sb, opts);
-
- if (ret == -BCH_ERR_device_has_been_removed ||
- ret == -BCH_ERR_device_splitbrain) {
- bch2_free_super(sb);
- darray_remove_item(&sbs, sb);
- best -= best > sb;
- ret = 0;
- continue;
- }
-
- if (ret)
- goto err_print;
- }
-
- c = bch2_fs_alloc(best->sb, opts, &sbs);
- ret = PTR_ERR_OR_ZERO(c);
- if (ret)
- goto err;
-
- down_write(&c->state_lock);
- darray_for_each(sbs, sb) {
- ret = bch2_dev_attach_bdev(c, sb);
- if (ret) {
- up_write(&c->state_lock);
- goto err;
- }
- }
- up_write(&c->state_lock);
-
- if (!c->opts.nostart) {
- ret = bch2_fs_start(c);
- if (ret)
- goto err;
- }
-out:
- darray_for_each(sbs, sb)
- bch2_free_super(sb);
- darray_exit(&sbs);
- printbuf_exit(&errbuf);
- module_put(THIS_MODULE);
- return c;
-err_print:
- pr_err("bch_fs_open err opening %s: %s",
- devices->data[0], bch2_err_str(ret));
-err:
- if (!IS_ERR_OR_NULL(c))
- bch2_fs_stop(c);
- c = ERR_PTR(ret);
- goto out;
-}
-
-/* Global interfaces/init */
-
-static void bcachefs_exit(void)
-{
- bch2_debug_exit();
- bch2_vfs_exit();
- bch2_chardev_exit();
- bch2_btree_key_cache_exit();
- if (bcachefs_kset)
- kset_unregister(bcachefs_kset);
-}
-
-static int __init bcachefs_init(void)
-{
- bch2_bkey_pack_test();
-
- if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
- bch2_btree_key_cache_init() ||
- bch2_chardev_init() ||
- bch2_vfs_init() ||
- bch2_debug_init())
- goto err;
-
- return 0;
-err:
- bcachefs_exit();
- return -ENOMEM;
-}
-
-#define BCH_DEBUG_PARAM(name, description) DEFINE_STATIC_KEY_FALSE(bch2_##name);
-BCH_DEBUG_PARAMS_ALL()
-#undef BCH_DEBUG_PARAM
-
-static int bch2_param_set_static_key_t(const char *val, const struct kernel_param *kp)
-{
- /* Match bool exactly, by re-using it. */
- struct static_key *key = kp->arg;
- struct kernel_param boolkp = *kp;
- bool v;
- int ret;
-
- boolkp.arg = &v;
-
- ret = param_set_bool(val, &boolkp);
- if (ret)
- return ret;
- if (v)
- static_key_enable(key);
- else
- static_key_disable(key);
- return 0;
-}
-
-static int bch2_param_get_static_key_t(char *buffer, const struct kernel_param *kp)
-{
- struct static_key *key = kp->arg;
-	return sprintf(buffer, "%c\n", static_key_enabled(key) ? 'Y' : 'N');
-}
-
-static const struct kernel_param_ops bch2_param_ops_static_key_t = {
- .flags = KERNEL_PARAM_OPS_FL_NOARG,
- .set = bch2_param_set_static_key_t,
- .get = bch2_param_get_static_key_t,
-};
-
-#define BCH_DEBUG_PARAM(name, description) \
- module_param_cb(name, &bch2_param_ops_static_key_t, &bch2_##name.key, 0644);\
- __MODULE_PARM_TYPE(name, "static_key_t"); \
- MODULE_PARM_DESC(name, description);
-BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
-__maybe_unused
-static unsigned bch2_metadata_version = bcachefs_metadata_version_current;
-module_param_named(version, bch2_metadata_version, uint, 0444);
-
-module_exit(bcachefs_exit);
-module_init(bcachefs_init);
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
deleted file mode 100644
index e90bab9afe78..000000000000
--- a/fs/bcachefs/super.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SUPER_H
-#define _BCACHEFS_SUPER_H
-
-#include "extents.h"
-
-#include "bcachefs_ioctl.h"
-
-#include <linux/math64.h>
-
-extern const char * const bch2_fs_flag_strs[];
-extern const char * const bch2_write_refs[];
-extern const char * const bch2_dev_read_refs[];
-extern const char * const bch2_dev_write_refs[];
-
-struct bch_fs *bch2_dev_to_fs(dev_t);
-struct bch_fs *bch2_uuid_to_fs(__uuid_t);
-
-bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *,
- enum bch_member_state, int);
-int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *,
- enum bch_member_state, int);
-int bch2_dev_set_state(struct bch_fs *, struct bch_dev *,
- enum bch_member_state, int);
-
-int bch2_dev_fail(struct bch_dev *, int);
-int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int);
-int bch2_dev_add(struct bch_fs *, const char *);
-int bch2_dev_online(struct bch_fs *, const char *);
-int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int);
-int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64);
-struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *);
-
-bool bch2_fs_emergency_read_only(struct bch_fs *);
-bool bch2_fs_emergency_read_only2(struct bch_fs *, struct printbuf *);
-
-bool bch2_fs_emergency_read_only_locked(struct bch_fs *);
-void bch2_fs_read_only(struct bch_fs *);
-
-int bch2_fs_read_write(struct bch_fs *);
-int bch2_fs_read_write_early(struct bch_fs *);
-
-int bch2_fs_resize_on_mount(struct bch_fs *);
-
-void __bch2_fs_stop(struct bch_fs *);
-void bch2_fs_free(struct bch_fs *);
-void bch2_fs_stop(struct bch_fs *);
-
-int bch2_fs_init_rw(struct bch_fs *);
-int bch2_fs_start(struct bch_fs *);
-struct bch_fs *bch2_fs_open(darray_const_str *, struct bch_opts *);
-
-extern const struct blk_holder_ops bch2_sb_handle_bdev_ops;
-
-#endif /* _BCACHEFS_SUPER_H */
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
deleted file mode 100644
index 3a899f799d1d..000000000000
--- a/fs/bcachefs/super_types.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SUPER_TYPES_H
-#define _BCACHEFS_SUPER_TYPES_H
-
-struct bch_fs;
-
-struct bch_sb_handle_holder {
- struct bch_fs *c;
-};
-
-struct bch_sb_handle {
- struct bch_sb *sb;
- struct file *s_bdev_file;
- struct block_device *bdev;
- char *sb_name;
- struct bio *bio;
- struct bch_sb_handle_holder *holder;
- size_t buffer_size;
- blk_mode_t mode;
- unsigned have_layout:1;
- unsigned have_bio:1;
- unsigned fs_sb:1;
- u64 seq;
-};
-
-struct bch_devs_mask {
- unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)];
-};
-
-struct bch_devs_list {
- u8 nr;
- u8 data[BCH_BKEY_PTRS_MAX];
-};
-
-#endif /* _BCACHEFS_SUPER_TYPES_H */
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
deleted file mode 100644
index 05848375cea2..000000000000
--- a/fs/bcachefs/sysfs.c
+++ /dev/null
@@ -1,914 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * bcache sysfs interfaces
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#ifndef NO_BCACHEFS_SYSFS
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "sysfs.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_key_cache.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_gc.h"
-#include "buckets.h"
-#include "clock.h"
-#include "compress.h"
-#include "disk_accounting.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "inode.h"
-#include "journal.h"
-#include "journal_reclaim.h"
-#include "keylist.h"
-#include "move.h"
-#include "movinggc.h"
-#include "nocow_locking.h"
-#include "opts.h"
-#include "rebalance.h"
-#include "recovery_passes.h"
-#include "replicas.h"
-#include "sb-errors.h"
-#include "super-io.h"
-#include "tests.h"
-
-#include <linux/blkdev.h>
-#include <linux/sort.h>
-#include <linux/sched/clock.h>
-
-#include "util.h"
-
-#define SYSFS_OPS(type) \
-const struct sysfs_ops type ## _sysfs_ops = { \
- .show = type ## _show, \
- .store = type ## _store \
-}
-
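-/*
- * SHOW() wraps a fn_to_text() helper that prints into a printbuf; the result
- * is newline terminated and at most a page is copied to the sysfs buffer:
- */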
-#define SHOW(fn) \
-static ssize_t fn ## _to_text(struct printbuf *, \
- struct kobject *, struct attribute *); \
- \
-static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
- char *buf) \
-{ \
- struct printbuf out = PRINTBUF; \
- ssize_t ret = fn ## _to_text(&out, kobj, attr); \
- \
- if (out.pos && out.buf[out.pos - 1] != '\n') \
- prt_newline(&out); \
- \
- if (!ret && out.allocation_failure) \
- ret = -ENOMEM; \
- \
- if (!ret) { \
- ret = min_t(size_t, out.pos, PAGE_SIZE - 1); \
- memcpy(buf, out.buf, ret); \
- } \
- printbuf_exit(&out); \
- return bch2_err_class(ret); \
-} \
- \
-static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\
- struct attribute *attr)
-
-#define STORE(fn) \
-static ssize_t fn ## _store_inner(struct kobject *, struct attribute *,\
- const char *, size_t); \
- \
-static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
- const char *buf, size_t size) \
-{ \
- return bch2_err_class(fn##_store_inner(kobj, attr, buf, size)); \
-} \
- \
-static ssize_t fn ## _store_inner(struct kobject *kobj, struct attribute *attr,\
- const char *buf, size_t size)
-
-#define __sysfs_attribute(_name, _mode) \
- static struct attribute sysfs_##_name = \
- { .name = #_name, .mode = _mode }
-
-#define write_attribute(n) __sysfs_attribute(n, 0200)
-#define read_attribute(n) __sysfs_attribute(n, 0444)
-#define rw_attribute(n) __sysfs_attribute(n, 0644)
-
-#define sysfs_printf(file, fmt, ...) \
-do { \
- if (attr == &sysfs_ ## file) \
- prt_printf(out, fmt "\n", __VA_ARGS__); \
-} while (0)
-
-#define sysfs_print(file, var) \
-do { \
- if (attr == &sysfs_ ## file) \
- snprint(out, var); \
-} while (0)
-
-#define sysfs_hprint(file, val) \
-do { \
- if (attr == &sysfs_ ## file) \
- prt_human_readable_s64(out, val); \
-} while (0)
-
-#define sysfs_strtoul(file, var) \
-do { \
- if (attr == &sysfs_ ## file) \
- return strtoul_safe(buf, var) ?: (ssize_t) size; \
-} while (0)
-
-#define sysfs_strtoul_clamp(file, var, min, max) \
-do { \
- if (attr == &sysfs_ ## file) \
- return strtoul_safe_clamp(buf, var, min, max) \
- ?: (ssize_t) size; \
-} while (0)
-
-#define strtoul_or_return(cp) \
-({ \
- unsigned long _v; \
- int _r = kstrtoul(cp, 10, &_v); \
- if (_r) \
- return _r; \
- _v; \
-})
-
-write_attribute(trigger_gc);
-write_attribute(trigger_discards);
-write_attribute(trigger_invalidates);
-write_attribute(trigger_journal_commit);
-write_attribute(trigger_journal_flush);
-write_attribute(trigger_journal_writes);
-write_attribute(trigger_btree_cache_shrink);
-write_attribute(trigger_btree_key_cache_shrink);
-write_attribute(trigger_btree_updates);
-write_attribute(trigger_freelist_wakeup);
-write_attribute(trigger_recalc_capacity);
-write_attribute(trigger_delete_dead_snapshots);
-write_attribute(trigger_emergency_read_only);
-read_attribute(gc_gens_pos);
-
-read_attribute(uuid);
-read_attribute(minor);
-read_attribute(flags);
-read_attribute(first_bucket);
-read_attribute(nbuckets);
-read_attribute(io_done);
-read_attribute(io_errors);
-write_attribute(io_errors_reset);
-
-read_attribute(io_latency_read);
-read_attribute(io_latency_write);
-read_attribute(io_latency_stats_read);
-read_attribute(io_latency_stats_write);
-read_attribute(congested);
-
-read_attribute(btree_write_stats);
-
-read_attribute(btree_cache_size);
-read_attribute(compression_stats);
-read_attribute(errors);
-read_attribute(journal_debug);
-read_attribute(btree_cache);
-read_attribute(btree_key_cache);
-read_attribute(btree_reserve_cache);
-read_attribute(open_buckets);
-read_attribute(open_buckets_partial);
-read_attribute(nocow_lock_table);
-
-read_attribute(read_refs);
-read_attribute(write_refs);
-
-read_attribute(internal_uuid);
-read_attribute(disk_groups);
-
-read_attribute(has_data);
-read_attribute(alloc_debug);
-read_attribute(usage_base);
-
-#define x(t, n, ...) read_attribute(t);
-BCH_PERSISTENT_COUNTERS()
-#undef x
-
-rw_attribute(label);
-
-read_attribute(copy_gc_wait);
-
-sysfs_pd_controller_attribute(rebalance);
-read_attribute(rebalance_status);
-read_attribute(snapshot_delete_status);
-read_attribute(recovery_status);
-
-read_attribute(new_stripes);
-
-read_attribute(io_timers_read);
-read_attribute(io_timers_write);
-
-read_attribute(moving_ctxts);
-
-#ifdef CONFIG_BCACHEFS_TESTS
-write_attribute(perf_test);
-#endif /* CONFIG_BCACHEFS_TESTS */
-
-#define x(_name) \
- static struct attribute sysfs_time_stat_##_name = \
- { .name = #_name, .mode = 0644 };
- BCH_TIME_STATS()
-#undef x
-
-static size_t bch2_btree_cache_size(struct bch_fs *c)
-{
- struct btree_cache *bc = &c->btree_cache;
- size_t ret = 0;
- struct btree *b;
-
- mutex_lock(&bc->lock);
- list_for_each_entry(b, &bc->live[0].list, list)
- ret += btree_buf_bytes(b);
- list_for_each_entry(b, &bc->live[1].list, list)
- ret += btree_buf_bytes(b);
- list_for_each_entry(b, &bc->freeable, list)
- ret += btree_buf_bytes(b);
- mutex_unlock(&bc->lock);
- return ret;
-}
-
-static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
-{
- printbuf_tabstop_push(out, 12);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 24);
- prt_printf(out, "type\tcompressed\runcompressed\raverage extent size\r\n");
-
- for (unsigned i = 1; i < BCH_COMPRESSION_TYPE_NR; i++) {
- struct disk_accounting_pos a;
- disk_accounting_key_init(a, compression, .type = i);
- struct bpos p = disk_accounting_pos_to_bpos(&a);
- u64 v[3];
- bch2_accounting_mem_read(c, p, v, ARRAY_SIZE(v));
-
- u64 nr_extents = v[0];
- u64 sectors_uncompressed = v[1];
- u64 sectors_compressed = v[2];
-
- bch2_prt_compression_type(out, i);
- prt_tab(out);
-
- prt_human_readable_u64(out, sectors_compressed << 9);
- prt_tab_rjust(out);
-
- prt_human_readable_u64(out, sectors_uncompressed << 9);
- prt_tab_rjust(out);
-
- prt_human_readable_u64(out, nr_extents
- ? div64_u64(sectors_uncompressed << 9, nr_extents)
- : 0);
- prt_tab_rjust(out);
- prt_newline(out);
- }
-
- return 0;
-}
-
-static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
-{
- bch2_btree_id_to_text(out, c->gc_gens_btree);
- prt_printf(out, ": ");
- bch2_bpos_to_text(out, c->gc_gens_pos);
- prt_printf(out, "\n");
-}
-
-static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c)
-{
- struct bch_fs_usage_base b = {};
-
- acc_u64s_percpu(&b.hidden, &c->usage->hidden, sizeof(b) / sizeof(u64));
-
- prt_printf(out, "hidden:\t\t%llu\n", b.hidden);
- prt_printf(out, "btree:\t\t%llu\n", b.btree);
- prt_printf(out, "data:\t\t%llu\n", b.data);
- prt_printf(out, "cached:\t%llu\n", b.cached);
- prt_printf(out, "reserved:\t\t%llu\n", b.reserved);
- prt_printf(out, "nr_inodes:\t%llu\n", b.nr_inodes);
-}
-
-SHOW(bch2_fs)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
-
- sysfs_print(minor, c->minor);
- sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b);
-
- if (attr == &sysfs_flags)
- prt_bitflags(out, bch2_fs_flag_strs, c->flags);
-
- sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c));
-
- if (attr == &sysfs_btree_write_stats)
- bch2_btree_write_stats_to_text(out, c);
-
- if (attr == &sysfs_gc_gens_pos)
- bch2_gc_gens_pos_to_text(out, c);
-
- sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */
-
- if (attr == &sysfs_copy_gc_wait)
- bch2_copygc_wait_to_text(out, c);
-
- if (attr == &sysfs_rebalance_status)
- bch2_rebalance_status_to_text(out, c);
-
- if (attr == &sysfs_snapshot_delete_status)
- bch2_snapshot_delete_status_to_text(out, c);
-
- if (attr == &sysfs_recovery_status)
- bch2_recovery_pass_status_to_text(out, c);
-
- /* Debugging: */
-
- if (attr == &sysfs_journal_debug)
- bch2_journal_debug_to_text(out, &c->journal);
-
- if (attr == &sysfs_btree_cache)
- bch2_btree_cache_to_text(out, &c->btree_cache);
-
- if (attr == &sysfs_btree_key_cache)
- bch2_btree_key_cache_to_text(out, &c->btree_key_cache);
-
- if (attr == &sysfs_btree_reserve_cache)
- bch2_btree_reserve_cache_to_text(out, c);
-
- if (attr == &sysfs_open_buckets)
- bch2_open_buckets_to_text(out, c, NULL);
-
- if (attr == &sysfs_open_buckets_partial)
- bch2_open_buckets_partial_to_text(out, c);
-
- if (attr == &sysfs_compression_stats)
- bch2_compression_stats_to_text(out, c);
-
- if (attr == &sysfs_errors)
- bch2_fs_errors_to_text(out, c);
-
- if (attr == &sysfs_new_stripes)
- bch2_new_stripes_to_text(out, c);
-
- if (attr == &sysfs_io_timers_read)
- bch2_io_timers_to_text(out, &c->io_clock[READ]);
-
- if (attr == &sysfs_io_timers_write)
- bch2_io_timers_to_text(out, &c->io_clock[WRITE]);
-
- if (attr == &sysfs_moving_ctxts)
- bch2_fs_moving_ctxts_to_text(out, c);
-
- if (attr == &sysfs_write_refs)
- enumerated_ref_to_text(out, &c->writes, bch2_write_refs);
-
- if (attr == &sysfs_nocow_lock_table)
- bch2_nocow_locks_to_text(out, &c->nocow_locks);
-
- if (attr == &sysfs_disk_groups)
- bch2_disk_groups_to_text(out, c);
-
- if (attr == &sysfs_alloc_debug)
- bch2_fs_alloc_debug_to_text(out, c);
-
- if (attr == &sysfs_usage_base)
- bch2_fs_usage_base_to_text(out, c);
-
- return 0;
-}
-
-STORE(bch2_fs)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
-
- sysfs_pd_controller_store(rebalance, &c->rebalance.pd);
-
- /* Debugging: */
-
- if (!test_bit(BCH_FS_started, &c->flags))
- return -EPERM;
-
- /* Debugging: */
-
- if (attr == &sysfs_trigger_btree_updates)
- queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs))
- return -EROFS;
-
- if (attr == &sysfs_trigger_btree_cache_shrink) {
- struct btree_cache *bc = &c->btree_cache;
- struct shrink_control sc;
-
- sc.gfp_mask = GFP_KERNEL;
- sc.nr_to_scan = strtoul_or_return(buf);
- bc->live[0].shrink->scan_objects(bc->live[0].shrink, &sc);
- }
-
- if (attr == &sysfs_trigger_btree_key_cache_shrink) {
- struct shrink_control sc;
-
- sc.gfp_mask = GFP_KERNEL;
- sc.nr_to_scan = strtoul_or_return(buf);
- c->btree_key_cache.shrink->scan_objects(c->btree_key_cache.shrink, &sc);
- }
-
- if (attr == &sysfs_trigger_gc)
- bch2_gc_gens(c);
-
- if (attr == &sysfs_trigger_discards)
- bch2_do_discards(c);
-
- if (attr == &sysfs_trigger_invalidates)
- bch2_do_invalidates(c);
-
- if (attr == &sysfs_trigger_journal_commit)
- bch2_journal_flush(&c->journal);
-
- if (attr == &sysfs_trigger_journal_flush) {
- bch2_journal_flush_all_pins(&c->journal);
- bch2_journal_meta(&c->journal);
- }
-
- if (attr == &sysfs_trigger_journal_writes)
- bch2_journal_do_writes(&c->journal);
-
- if (attr == &sysfs_trigger_freelist_wakeup)
- closure_wake_up(&c->freelist_wait);
-
- if (attr == &sysfs_trigger_recalc_capacity) {
- down_read(&c->state_lock);
- bch2_recalc_capacity(c);
- up_read(&c->state_lock);
- }
-
- if (attr == &sysfs_trigger_delete_dead_snapshots)
- __bch2_delete_dead_snapshots(c);
-
- if (attr == &sysfs_trigger_emergency_read_only) {
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
-
- prt_printf(&buf, "shutdown by sysfs\n");
- bch2_fs_emergency_read_only2(c, &buf);
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- }
-
-#ifdef CONFIG_BCACHEFS_TESTS
- if (attr == &sysfs_perf_test) {
- char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
- char *test = strsep(&p, " \t\n");
- char *nr_str = strsep(&p, " \t\n");
- char *threads_str = strsep(&p, " \t\n");
- unsigned threads;
- u64 nr;
- int ret = -EINVAL;
-
- if (threads_str &&
- !(ret = kstrtouint(threads_str, 10, &threads)) &&
- !(ret = bch2_strtoull_h(nr_str, &nr)))
- ret = bch2_btree_perf_test(c, test, nr, threads);
- kfree(tmp);
-
- if (ret)
- size = ret;
- }
-#endif
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs);
- return size;
-}
-SYSFS_OPS(bch2_fs);
-
-struct attribute *bch2_fs_files[] = {
- &sysfs_minor,
- &sysfs_btree_cache_size,
- &sysfs_btree_write_stats,
-
- &sysfs_rebalance_status,
- &sysfs_snapshot_delete_status,
- &sysfs_recovery_status,
-
- &sysfs_compression_stats,
- &sysfs_errors,
-
-#ifdef CONFIG_BCACHEFS_TESTS
- &sysfs_perf_test,
-#endif
- NULL
-};
-
-/* counters dir */
-
-SHOW(bch2_fs_counters)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, counters_kobj);
- u64 counter = 0;
- u64 counter_since_mount = 0;
-
- printbuf_tabstop_push(out, 32);
-
- #define x(t, n, f, ...) \
- if (attr == &sysfs_##t) { \
- counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\
- counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\
- if (f & TYPE_SECTORS) { \
- counter <<= 9; \
- counter_since_mount <<= 9; \
- } \
- \
- prt_printf(out, "since mount:\t"); \
- (f & TYPE_COUNTER) ? prt_u64(out, counter_since_mount) :\
- prt_human_readable_u64(out, counter_since_mount); \
- prt_newline(out); \
- \
- prt_printf(out, "since filesystem creation:\t"); \
- (f & TYPE_COUNTER) ? prt_u64(out, counter) : \
- prt_human_readable_u64(out, counter); \
- prt_newline(out); \
- }
- BCH_PERSISTENT_COUNTERS()
- #undef x
- return 0;
-}
-
-STORE(bch2_fs_counters) {
- return 0;
-}
-
-SYSFS_OPS(bch2_fs_counters);
-
-struct attribute *bch2_fs_counters_files[] = {
-#define x(t, ...) \
- &sysfs_##t,
- BCH_PERSISTENT_COUNTERS()
-#undef x
- NULL
-};
-/* internal dir - just a wrapper */
-
-SHOW(bch2_fs_internal)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
-
- return bch2_fs_to_text(out, &c->kobj, attr);
-}
-
-STORE(bch2_fs_internal)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
-
- return bch2_fs_store(&c->kobj, attr, buf, size);
-}
-SYSFS_OPS(bch2_fs_internal);
-
-struct attribute *bch2_fs_internal_files[] = {
- &sysfs_flags,
- &sysfs_journal_debug,
- &sysfs_btree_cache,
- &sysfs_btree_key_cache,
- &sysfs_btree_reserve_cache,
- &sysfs_new_stripes,
- &sysfs_open_buckets,
- &sysfs_open_buckets_partial,
- &sysfs_write_refs,
- &sysfs_nocow_lock_table,
- &sysfs_io_timers_read,
- &sysfs_io_timers_write,
-
- &sysfs_trigger_gc,
- &sysfs_trigger_discards,
- &sysfs_trigger_invalidates,
- &sysfs_trigger_journal_commit,
- &sysfs_trigger_journal_flush,
- &sysfs_trigger_journal_writes,
- &sysfs_trigger_btree_cache_shrink,
- &sysfs_trigger_btree_key_cache_shrink,
- &sysfs_trigger_btree_updates,
- &sysfs_trigger_freelist_wakeup,
- &sysfs_trigger_recalc_capacity,
- &sysfs_trigger_delete_dead_snapshots,
- &sysfs_trigger_emergency_read_only,
-
- &sysfs_gc_gens_pos,
-
- &sysfs_copy_gc_wait,
-
- sysfs_pd_controller_files(rebalance),
-
- &sysfs_moving_ctxts,
-
- &sysfs_internal_uuid,
-
- &sysfs_disk_groups,
- &sysfs_alloc_debug,
- &sysfs_usage_base,
- NULL
-};
-
-/* options */
-
-static ssize_t sysfs_opt_show(struct bch_fs *c,
- struct bch_dev *ca,
- enum bch_opt_id id,
- struct printbuf *out)
-{
- const struct bch_option *opt = bch2_opt_table + id;
- u64 v;
-
- if (opt->flags & OPT_FS) {
- v = bch2_opt_get_by_id(&c->opts, id);
- } else if ((opt->flags & OPT_DEVICE) && opt->get_member) {
- v = bch2_opt_from_sb(c->disk_sb.sb, id, ca->dev_idx);
- } else {
- return -EINVAL;
- }
-
- bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST);
- prt_char(out, '\n');
- return 0;
-}
-
-static ssize_t sysfs_opt_store(struct bch_fs *c,
- struct bch_dev *ca,
- enum bch_opt_id id,
- const char *buf, size_t size)
-{
- const struct bch_option *opt = bch2_opt_table + id;
- int ret = 0;
-
- /*
- * We don't need to take c->writes for correctness, but it eliminates an
- * unsightly error message in the dmesg log when we're RO:
- */
- if (unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs)))
- return -EROFS;
-
- char *tmp = kstrdup(buf, GFP_KERNEL);
- if (!tmp) {
- ret = -ENOMEM;
- goto err;
- }
-
- u64 v;
- ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL) ?:
- bch2_opt_hook_pre_set(c, ca, id, v);
- kfree(tmp);
-
- if (ret < 0)
- goto err;
-
- bool is_sb = opt->get_sb || opt->get_member;
- bool changed = false;
-
- if (is_sb) {
- changed = bch2_opt_set_sb(c, ca, opt, v);
- } else if (!ca) {
- changed = bch2_opt_get_by_id(&c->opts, id) != v;
- } else {
- /* device options that aren't superblock options aren't
- * supported */
- BUG();
- }
-
- if (!ca)
- bch2_opt_set_by_id(&c->opts, id, v);
-
- if (changed)
- bch2_opt_hook_post_set(c, ca, 0, &c->opts, id);
-
- ret = size;
-err:
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs);
- return ret;
-}
-
-SHOW(bch2_fs_opts_dir)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
- int id = bch2_opt_lookup(attr->name);
- if (id < 0)
- return 0;
-
- return sysfs_opt_show(c, NULL, id, out);
-}
-
-STORE(bch2_fs_opts_dir)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
- int id = bch2_opt_lookup(attr->name);
- if (id < 0)
- return 0;
-
- return sysfs_opt_store(c, NULL, id, buf, size);
-}
-SYSFS_OPS(bch2_fs_opts_dir);
-
-struct attribute *bch2_fs_opts_dir_files[] = { NULL };
-
-int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type)
-{
- for (const struct bch_option *i = bch2_opt_table;
- i < bch2_opt_table + bch2_opts_nr;
- i++) {
- if (i->flags & OPT_HIDDEN)
- continue;
- if (!(i->flags & type))
- continue;
-
- int ret = sysfs_create_file(kobj, &i->attr);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/* time stats */
-
-SHOW(bch2_fs_time_stats)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
-
-#define x(name) \
- if (attr == &sysfs_time_stat_##name) \
- bch2_time_stats_to_text(out, &c->times[BCH_TIME_##name]);
- BCH_TIME_STATS()
-#undef x
-
- return 0;
-}
-
-STORE(bch2_fs_time_stats)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
-
-#define x(name) \
- if (attr == &sysfs_time_stat_##name) \
- bch2_time_stats_reset(&c->times[BCH_TIME_##name]);
- BCH_TIME_STATS()
-#undef x
- return size;
-}
-SYSFS_OPS(bch2_fs_time_stats);
-
-struct attribute *bch2_fs_time_stats_files[] = {
-#define x(name) \
- &sysfs_time_stat_##name,
- BCH_TIME_STATS()
-#undef x
- NULL
-};
-
-static const char * const bch2_rw[] = {
- "read",
- "write",
- NULL
-};
-
-static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca)
-{
- int rw, i;
-
- for (rw = 0; rw < 2; rw++) {
- prt_printf(out, "%s:\n", bch2_rw[rw]);
-
- for (i = 1; i < BCH_DATA_NR; i++)
- prt_printf(out, "%-12s:%12llu\n",
- bch2_data_type_str(i),
- percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9);
- }
-}
-
-SHOW(bch2_dev)
-{
- struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
- struct bch_fs *c = ca->fs;
-
- sysfs_printf(uuid, "%pU\n", ca->uuid.b);
-
- sysfs_print(first_bucket, ca->mi.first_bucket);
- sysfs_print(nbuckets, ca->mi.nbuckets);
-
- if (attr == &sysfs_label) {
- if (ca->mi.group)
- bch2_disk_path_to_text(out, c, ca->mi.group - 1);
- prt_char(out, '\n');
- }
-
- if (attr == &sysfs_has_data) {
- prt_bitflags(out, __bch2_data_types, bch2_dev_has_data(c, ca));
- prt_char(out, '\n');
- }
-
- if (attr == &sysfs_io_done)
- dev_io_done_to_text(out, ca);
-
- if (attr == &sysfs_io_errors)
- bch2_dev_io_errors_to_text(out, ca);
-
- sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ]));
- sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE]));
-
- if (attr == &sysfs_io_latency_stats_read)
- bch2_time_stats_to_text(out, &ca->io_latency[READ].stats);
-
- if (attr == &sysfs_io_latency_stats_write)
- bch2_time_stats_to_text(out, &ca->io_latency[WRITE].stats);
-
- sysfs_printf(congested, "%u%%",
- clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
- * 100 / CONGESTED_MAX);
-
- if (attr == &sysfs_alloc_debug)
- bch2_dev_alloc_debug_to_text(out, ca);
-
- if (attr == &sysfs_open_buckets)
- bch2_open_buckets_to_text(out, c, ca);
-
- int opt_id = bch2_opt_lookup(attr->name);
- if (opt_id >= 0)
- return sysfs_opt_show(c, ca, opt_id, out);
-
- if (attr == &sysfs_read_refs)
- enumerated_ref_to_text(out, &ca->io_ref[READ], bch2_dev_read_refs);
-
- if (attr == &sysfs_write_refs)
- enumerated_ref_to_text(out, &ca->io_ref[WRITE], bch2_dev_write_refs);
-
- return 0;
-}
-
-STORE(bch2_dev)
-{
- struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
- struct bch_fs *c = ca->fs;
-
- if (attr == &sysfs_label) {
- char *tmp;
- int ret;
-
- tmp = kstrdup(buf, GFP_KERNEL);
- if (!tmp)
- return -ENOMEM;
-
- ret = bch2_dev_group_set(c, ca, strim(tmp));
- kfree(tmp);
- if (ret)
- return ret;
- }
-
- if (attr == &sysfs_io_errors_reset)
- bch2_dev_errors_reset(ca);
-
- int opt_id = bch2_opt_lookup(attr->name);
- if (opt_id >= 0)
- return sysfs_opt_store(c, ca, opt_id, buf, size);
-
- return size;
-}
-SYSFS_OPS(bch2_dev);
-
-struct attribute *bch2_dev_files[] = {
- &sysfs_uuid,
- &sysfs_first_bucket,
- &sysfs_nbuckets,
-
- /* settings: */
- &sysfs_label,
-
- &sysfs_has_data,
- &sysfs_io_done,
- &sysfs_io_errors,
- &sysfs_io_errors_reset,
-
- &sysfs_io_latency_read,
- &sysfs_io_latency_write,
- &sysfs_io_latency_stats_read,
- &sysfs_io_latency_stats_write,
- &sysfs_congested,
-
- /* debug: */
- &sysfs_alloc_debug,
- &sysfs_open_buckets,
-
- &sysfs_read_refs,
- &sysfs_write_refs,
- NULL
-};
-
-#endif /* _BCACHEFS_SYSFS_H_ */
diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h
deleted file mode 100644
index 303e0433c702..000000000000
--- a/fs/bcachefs/sysfs.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SYSFS_H_
-#define _BCACHEFS_SYSFS_H_
-
-#include <linux/sysfs.h>
-
-#ifndef NO_BCACHEFS_SYSFS
-
-struct attribute;
-struct sysfs_ops;
-
-extern struct attribute *bch2_fs_files[];
-extern struct attribute *bch2_fs_counters_files[];
-extern struct attribute *bch2_fs_internal_files[];
-extern struct attribute *bch2_fs_opts_dir_files[];
-extern struct attribute *bch2_fs_time_stats_files[];
-extern struct attribute *bch2_dev_files[];
-
-extern const struct sysfs_ops bch2_fs_sysfs_ops;
-extern const struct sysfs_ops bch2_fs_counters_sysfs_ops;
-extern const struct sysfs_ops bch2_fs_internal_sysfs_ops;
-extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
-extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
-extern const struct sysfs_ops bch2_dev_sysfs_ops;
-
-int bch2_opts_create_sysfs_files(struct kobject *, unsigned);
-
-#else
-
-static struct attribute *bch2_fs_files[] = {};
-static struct attribute *bch2_fs_counters_files[] = {};
-static struct attribute *bch2_fs_internal_files[] = {};
-static struct attribute *bch2_fs_opts_dir_files[] = {};
-static struct attribute *bch2_fs_time_stats_files[] = {};
-static struct attribute *bch2_dev_files[] = {};
-
-static const struct sysfs_ops bch2_fs_sysfs_ops;
-static const struct sysfs_ops bch2_fs_counters_sysfs_ops;
-static const struct sysfs_ops bch2_fs_internal_sysfs_ops;
-static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
-static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
-static const struct sysfs_ops bch2_dev_sysfs_ops;
-
-static inline int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type)
-{ return 0; }
-
-#endif /* NO_BCACHEFS_SYSFS */
-
-#endif /* _BCACHEFS_SYSFS_H_ */
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
deleted file mode 100644
index 782a05fe7656..000000000000
--- a/fs/bcachefs/tests.c
+++ /dev/null
@@ -1,891 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifdef CONFIG_BCACHEFS_TESTS
-
-#include "bcachefs.h"
-#include "btree_update.h"
-#include "journal_reclaim.h"
-#include "snapshot.h"
-#include "tests.h"
-
-#include "linux/kthread.h"
-#include "linux/random.h"
-
-static void delete_test_keys(struct bch_fs *c)
-{
- int ret;
-
- ret = bch2_btree_delete_range(c, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX),
- POS(0, U64_MAX),
- 0, NULL);
- BUG_ON(ret);
-
- ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX),
- POS(0, U64_MAX),
- 0, NULL);
- BUG_ON(ret);
-}
-
-/* unit tests */
-
-static int test_delete(struct bch_fs *c, u64 nr)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_i_cookie k;
- int ret;
-
- bkey_cookie_init(&k.k_i);
- k.k.p.snapshot = U32_MAX;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p,
- BTREE_ITER_intent);
-
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_trans_update(trans, &iter, &k.k_i, 0));
- bch_err_msg(c, ret, "update error");
- if (ret)
- goto err;
-
- pr_info("deleting once");
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_btree_delete_at(trans, &iter, 0));
- bch_err_msg(c, ret, "delete error (first)");
- if (ret)
- goto err;
-
- pr_info("deleting twice");
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_btree_delete_at(trans, &iter, 0));
- bch_err_msg(c, ret, "delete error (second)");
- if (ret)
- goto err;
-err:
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return ret;
-}
-
-static int test_delete_written(struct bch_fs *c, u64 nr)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_i_cookie k;
- int ret;
-
- bkey_cookie_init(&k.k_i);
- k.k.p.snapshot = U32_MAX;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p,
- BTREE_ITER_intent);
-
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_trans_update(trans, &iter, &k.k_i, 0));
- bch_err_msg(c, ret, "update error");
- if (ret)
- goto err;
-
- bch2_trans_unlock(trans);
- bch2_journal_flush_all_pins(&c->journal);
-
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_btree_delete_at(trans, &iter, 0));
- bch_err_msg(c, ret, "delete error");
- if (ret)
- goto err;
-err:
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return ret;
-}
-
-static int test_iterate(struct bch_fs *c, u64 nr)
-{
- u64 i;
- int ret = 0;
-
- delete_test_keys(c);
-
- pr_info("inserting test keys");
-
- for (i = 0; i < nr; i++) {
- struct bkey_i_cookie ck;
-
- bkey_cookie_init(&ck.k_i);
- ck.k.p.offset = i;
- ck.k.p.snapshot = U32_MAX;
-
- ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0, 0);
- bch_err_msg(c, ret, "insert error");
- if (ret)
- return ret;
- }
-
- pr_info("iterating forwards");
- i = 0;
-
- ret = bch2_trans_run(c,
- for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- 0, k, ({
- BUG_ON(k.k->p.offset != i++);
- 0;
- })));
- bch_err_msg(c, ret, "error iterating forwards");
- if (ret)
- return ret;
-
- BUG_ON(i != nr);
-
- pr_info("iterating backwards");
-
- ret = bch2_trans_run(c,
- for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs,
- SPOS(0, U64_MAX, U32_MAX), 0, k, ({
- BUG_ON(k.k->p.offset != --i);
- 0;
- })));
- bch_err_msg(c, ret, "error iterating backwards");
- if (ret)
- return ret;
-
- BUG_ON(i);
- return 0;
-}
-
-static int test_iterate_extents(struct bch_fs *c, u64 nr)
-{
- u64 i;
- int ret = 0;
-
- delete_test_keys(c);
-
- pr_info("inserting test extents");
-
- for (i = 0; i < nr; i += 8) {
- struct bkey_i_cookie ck;
-
- bkey_cookie_init(&ck.k_i);
- ck.k.p.offset = i + 8;
- ck.k.p.snapshot = U32_MAX;
- ck.k.size = 8;
-
- ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0, 0);
- bch_err_msg(c, ret, "insert error");
- if (ret)
- return ret;
- }
-
- pr_info("iterating forwards");
- i = 0;
-
- ret = bch2_trans_run(c,
- for_each_btree_key_max(trans, iter, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- 0, k, ({
- BUG_ON(bkey_start_offset(k.k) != i);
- i = k.k->p.offset;
- 0;
- })));
- bch_err_msg(c, ret, "error iterating forwards");
- if (ret)
- return ret;
-
- BUG_ON(i != nr);
-
- pr_info("iterating backwards");
-
- ret = bch2_trans_run(c,
- for_each_btree_key_reverse(trans, iter, BTREE_ID_extents,
- SPOS(0, U64_MAX, U32_MAX), 0, k, ({
- BUG_ON(k.k->p.offset != i);
- i = bkey_start_offset(k.k);
- 0;
- })));
- bch_err_msg(c, ret, "error iterating backwards");
- if (ret)
- return ret;
-
- BUG_ON(i);
- return 0;
-}
-
-static int test_iterate_slots(struct bch_fs *c, u64 nr)
-{
- u64 i;
- int ret = 0;
-
- delete_test_keys(c);
-
- pr_info("inserting test keys");
-
- for (i = 0; i < nr; i++) {
- struct bkey_i_cookie ck;
-
- bkey_cookie_init(&ck.k_i);
- ck.k.p.offset = i * 2;
- ck.k.p.snapshot = U32_MAX;
-
- ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0, 0);
- bch_err_msg(c, ret, "insert error");
- if (ret)
- return ret;
- }
-
- pr_info("iterating forwards");
- i = 0;
-
- ret = bch2_trans_run(c,
- for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- 0, k, ({
- BUG_ON(k.k->p.offset != i);
- i += 2;
- 0;
- })));
- bch_err_msg(c, ret, "error iterating forwards");
- if (ret)
- return ret;
-
- BUG_ON(i != nr * 2);
-
- pr_info("iterating forwards by slots");
- i = 0;
-
- ret = bch2_trans_run(c,
- for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- BTREE_ITER_slots, k, ({
- if (i >= nr * 2)
- break;
-
- BUG_ON(k.k->p.offset != i);
- BUG_ON(bkey_deleted(k.k) != (i & 1));
-
- i++;
- 0;
- })));
- bch_err_msg(c, ret, "error iterating forwards by slots");
- return ret;
-}
-
-static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
-{
- u64 i;
- int ret = 0;
-
- delete_test_keys(c);
-
- pr_info("inserting test keys");
-
- for (i = 0; i < nr; i += 16) {
- struct bkey_i_cookie ck;
-
- bkey_cookie_init(&ck.k_i);
- ck.k.p.offset = i + 16;
- ck.k.p.snapshot = U32_MAX;
- ck.k.size = 8;
-
- ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0, 0);
- bch_err_msg(c, ret, "insert error");
- if (ret)
- return ret;
- }
-
- pr_info("iterating forwards");
- i = 0;
-
- ret = bch2_trans_run(c,
- for_each_btree_key_max(trans, iter, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- 0, k, ({
- BUG_ON(bkey_start_offset(k.k) != i + 8);
- BUG_ON(k.k->size != 8);
- i += 16;
- 0;
- })));
- bch_err_msg(c, ret, "error iterating forwards");
- if (ret)
- return ret;
-
- BUG_ON(i != nr);
-
- pr_info("iterating forwards by slots");
- i = 0;
-
- ret = bch2_trans_run(c,
- for_each_btree_key_max(trans, iter, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- BTREE_ITER_slots, k, ({
- if (i == nr)
- break;
- BUG_ON(bkey_deleted(k.k) != !(i % 16));
-
- BUG_ON(bkey_start_offset(k.k) != i);
- BUG_ON(k.k->size != 8);
- i = k.k->p.offset;
- 0;
- })));
- bch_err_msg(c, ret, "error iterating forwards by slots");
- return ret;
-}
-
-/*
- * XXX: we really want to make sure we've got a btree with depth > 0 for these
- * tests
- */
-static int test_peek_end(struct bch_fs *c, u64 nr)
-{
- delete_test_keys(c);
-
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), 0);
-
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
- BUG_ON(k.k);
-
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
- BUG_ON(k.k);
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return 0;
-}
-
-static int test_peek_end_extents(struct bch_fs *c, u64 nr)
-{
- delete_test_keys(c);
-
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX), 0);
-
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
- BUG_ON(k.k);
-
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
- BUG_ON(k.k);
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return 0;
-}
-
-/* extent unit tests */
-
-static u64 test_version;
-
-static int insert_test_extent(struct bch_fs *c,
- u64 start, u64 end)
-{
- struct bkey_i_cookie k;
- int ret;
-
- bkey_cookie_init(&k.k_i);
- k.k_i.k.p.offset = end;
- k.k_i.k.p.snapshot = U32_MAX;
- k.k_i.k.size = end - start;
- k.k_i.k.bversion.lo = test_version++;
-
- ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0, 0);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int __test_extent_overwrite(struct bch_fs *c,
- u64 e1_start, u64 e1_end,
- u64 e2_start, u64 e2_end)
-{
- int ret;
-
- ret = insert_test_extent(c, e1_start, e1_end) ?:
- insert_test_extent(c, e2_start, e2_end);
-
- delete_test_keys(c);
- return ret;
-}
-
-static int test_extent_overwrite_front(struct bch_fs *c, u64 nr)
-{
- return __test_extent_overwrite(c, 0, 64, 0, 32) ?:
- __test_extent_overwrite(c, 8, 64, 0, 32);
-}
-
-static int test_extent_overwrite_back(struct bch_fs *c, u64 nr)
-{
- return __test_extent_overwrite(c, 0, 64, 32, 64) ?:
- __test_extent_overwrite(c, 0, 64, 32, 72);
-}
-
-static int test_extent_overwrite_middle(struct bch_fs *c, u64 nr)
-{
- return __test_extent_overwrite(c, 0, 64, 32, 40);
-}
-
-static int test_extent_overwrite_all(struct bch_fs *c, u64 nr)
-{
- return __test_extent_overwrite(c, 32, 64, 0, 64) ?:
- __test_extent_overwrite(c, 32, 64, 0, 128) ?:
- __test_extent_overwrite(c, 32, 64, 32, 64) ?:
- __test_extent_overwrite(c, 32, 64, 32, 128);
-}
-
-static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, u32 len, u32 snapid)
-{
- struct bkey_i_cookie k;
- int ret;
-
- bkey_cookie_init(&k.k_i);
- k.k_i.k.p.inode = inum;
- k.k_i.k.p.offset = start + len;
- k.k_i.k.p.snapshot = snapid;
- k.k_i.k.size = len;
-
- ret = bch2_trans_commit_do(c, NULL, NULL, 0,
- bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i,
- BTREE_UPDATE_internal_snapshot_node));
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int test_extent_create_overlapping(struct bch_fs *c, u64 inum)
-{
- return insert_test_overlapping_extent(c, inum, 0, 16, U32_MAX - 2) ?: /* overwrite entire */
- insert_test_overlapping_extent(c, inum, 2, 8, U32_MAX - 2) ?:
- insert_test_overlapping_extent(c, inum, 4, 4, U32_MAX) ?:
- insert_test_overlapping_extent(c, inum, 32, 8, U32_MAX - 2) ?: /* overwrite front/back */
- insert_test_overlapping_extent(c, inum, 36, 8, U32_MAX) ?:
- insert_test_overlapping_extent(c, inum, 60, 8, U32_MAX - 2) ?:
- insert_test_overlapping_extent(c, inum, 64, 8, U32_MAX);
-}
-
-/* snapshot unit tests */
-
-/* Test skipping over keys in unrelated snapshots: */
-static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
-{
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_i_cookie cookie;
- int ret;
-
- bkey_cookie_init(&cookie.k_i);
- cookie.k.p.snapshot = snapid_hi;
- ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0);
- if (ret)
- return ret;
-
- trans = bch2_trans_get(c);
- bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
- SPOS(0, 0, snapid_lo), 0);
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
-
- BUG_ON(k.k->p.snapshot != U32_MAX);
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return ret;
-}
-
-static int test_snapshots(struct bch_fs *c, u64 nr)
-{
- struct bkey_i_cookie cookie;
- u32 snapids[2];
- u32 snapid_subvols[2] = { 1, 1 };
- int ret;
-
- bkey_cookie_init(&cookie.k_i);
- cookie.k.p.snapshot = U32_MAX;
- ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0);
- if (ret)
- return ret;
-
- ret = bch2_trans_commit_do(c, NULL, NULL, 0,
- bch2_snapshot_node_create(trans, U32_MAX,
- snapids,
- snapid_subvols,
- 2));
- if (ret)
- return ret;
-
- if (snapids[0] > snapids[1])
- swap(snapids[0], snapids[1]);
-
- ret = test_snapshot_filter(c, snapids[0], snapids[1]);
- bch_err_msg(c, ret, "from test_snapshot_filter");
- return ret;
-}
-
-/* perf tests */
-
-static u64 test_rand(void)
-{
- u64 v;
-
- get_random_bytes(&v, sizeof(v));
- return v;
-}
-
-static int rand_insert(struct bch_fs *c, u64 nr)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct bkey_i_cookie k;
- int ret = 0;
- u64 i;
-
- for (i = 0; i < nr; i++) {
- bkey_cookie_init(&k.k_i);
- k.k.p.offset = test_rand();
- k.k.p.snapshot = U32_MAX;
-
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k.k_i, 0));
- if (ret)
- break;
- }
-
- bch2_trans_put(trans);
- return ret;
-}
-
-static int rand_insert_multi(struct bch_fs *c, u64 nr)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct bkey_i_cookie k[8];
- int ret = 0;
- unsigned j;
- u64 i;
-
- for (i = 0; i < nr; i += ARRAY_SIZE(k)) {
- for (j = 0; j < ARRAY_SIZE(k); j++) {
- bkey_cookie_init(&k[j].k_i);
- k[j].k.p.offset = test_rand();
- k[j].k.p.snapshot = U32_MAX;
- }
-
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[7].k_i, 0));
- if (ret)
- break;
- }
-
- bch2_trans_put(trans);
- return ret;
-}
-
-static int rand_lookup(struct bch_fs *c, u64 nr)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
- u64 i;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), 0);
-
- for (i = 0; i < nr; i++) {
- bch2_btree_iter_set_pos(trans, &iter, SPOS(0, test_rand(), U32_MAX));
-
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(trans, &iter)));
- ret = bkey_err(k);
- if (ret)
- break;
- }
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return ret;
-}
-
-static int rand_mixed_trans(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i_cookie *cookie,
- u64 i, u64 pos)
-{
- struct bkey_s_c k;
- int ret;
-
- bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, U32_MAX));
-
- k = bch2_btree_iter_peek(trans, iter);
- ret = bkey_err(k);
- bch_err_msg(trans->c, ret, "lookup error");
- if (ret)
- return ret;
-
- if (!(i & 3) && k.k) {
- bkey_cookie_init(&cookie->k_i);
- cookie->k.p = iter->pos;
- ret = bch2_trans_update(trans, iter, &cookie->k_i, 0);
- }
-
- return ret;
-}
-
-static int rand_mixed(struct bch_fs *c, u64 nr)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_i_cookie cookie;
- int ret = 0;
- u64 i, rand;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), 0);
-
- for (i = 0; i < nr; i++) {
- rand = test_rand();
- ret = commit_do(trans, NULL, NULL, 0,
- rand_mixed_trans(trans, &iter, &cookie, i, rand));
- if (ret)
- break;
- }
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return ret;
-}
-
-static int __do_delete(struct btree_trans *trans, struct bpos pos)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
- BTREE_ITER_intent);
- k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX));
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (!k.k)
- goto err;
-
- ret = bch2_btree_delete_at(trans, &iter, 0);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int rand_delete(struct bch_fs *c, u64 nr)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- int ret = 0;
- u64 i;
-
- for (i = 0; i < nr; i++) {
- struct bpos pos = SPOS(0, test_rand(), U32_MAX);
-
- ret = commit_do(trans, NULL, NULL, 0,
- __do_delete(trans, pos));
- if (ret)
- break;
- }
-
- bch2_trans_put(trans);
- return ret;
-}
-
-static int seq_insert(struct bch_fs *c, u64 nr)
-{
- struct bkey_i_cookie insert;
-
- bkey_cookie_init(&insert.k_i);
-
- return bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX),
- BTREE_ITER_slots|BTREE_ITER_intent, k,
- NULL, NULL, 0, ({
- if (iter.pos.offset >= nr)
- break;
- insert.k.p = iter.pos;
- bch2_trans_update(trans, &iter, &insert.k_i, 0);
- })));
-}
-
-static int seq_lookup(struct bch_fs *c, u64 nr)
-{
- return bch2_trans_run(c,
- for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- 0, k,
- 0));
-}
-
-static int seq_overwrite(struct bch_fs *c, u64 nr)
-{
- return bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX),
- BTREE_ITER_intent, k,
- NULL, NULL, 0, ({
- struct bkey_i_cookie u;
-
- bkey_reassemble(&u.k_i, k);
- bch2_trans_update(trans, &iter, &u.k_i, 0);
- })));
-}
-
-static int seq_delete(struct bch_fs *c, u64 nr)
-{
- return bch2_btree_delete_range(c, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX),
- POS(0, U64_MAX),
- 0, NULL);
-}
-
-typedef int (*perf_test_fn)(struct bch_fs *, u64);
-
-struct test_job {
- struct bch_fs *c;
- u64 nr;
- unsigned nr_threads;
- perf_test_fn fn;
-
- atomic_t ready;
- wait_queue_head_t ready_wait;
-
- atomic_t done;
- struct completion done_completion;
-
- u64 start;
- u64 finish;
- int ret;
-};
-
-static int btree_perf_test_thread(void *data)
-{
- struct test_job *j = data;
- int ret;
-
- if (atomic_dec_and_test(&j->ready)) {
- wake_up(&j->ready_wait);
- j->start = sched_clock();
- } else {
- wait_event(j->ready_wait, !atomic_read(&j->ready));
- }
-
- ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads));
- if (ret) {
- bch_err(j->c, "%ps: error %s", j->fn, bch2_err_str(ret));
- j->ret = ret;
- }
-
- if (atomic_dec_and_test(&j->done)) {
- j->finish = sched_clock();
- complete(&j->done_completion);
- }
-
- return 0;
-}
-
-int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
- u64 nr, unsigned nr_threads)
-{
- struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads };
- char name_buf[20];
- struct printbuf nr_buf = PRINTBUF;
- struct printbuf per_sec_buf = PRINTBUF;
- unsigned i;
- u64 time;
-
- if (nr == 0 || nr_threads == 0) {
- pr_err("nr of iterations or threads is not allowed to be 0");
- return -EINVAL;
- }
-
- atomic_set(&j.ready, nr_threads);
- init_waitqueue_head(&j.ready_wait);
-
- atomic_set(&j.done, nr_threads);
- init_completion(&j.done_completion);
-
-#define perf_test(_test) \
- if (!strcmp(testname, #_test)) j.fn = _test
-
- perf_test(rand_insert);
- perf_test(rand_insert_multi);
- perf_test(rand_lookup);
- perf_test(rand_mixed);
- perf_test(rand_delete);
-
- perf_test(seq_insert);
- perf_test(seq_lookup);
- perf_test(seq_overwrite);
- perf_test(seq_delete);
-
- /* a unit test, not a perf test: */
- perf_test(test_delete);
- perf_test(test_delete_written);
- perf_test(test_iterate);
- perf_test(test_iterate_extents);
- perf_test(test_iterate_slots);
- perf_test(test_iterate_slots_extents);
- perf_test(test_peek_end);
- perf_test(test_peek_end_extents);
-
- perf_test(test_extent_overwrite_front);
- perf_test(test_extent_overwrite_back);
- perf_test(test_extent_overwrite_middle);
- perf_test(test_extent_overwrite_all);
- perf_test(test_extent_create_overlapping);
-
- perf_test(test_snapshots);
-
- if (!j.fn) {
- pr_err("unknown test %s", testname);
- return -EINVAL;
- }
-
- //pr_info("running test %s:", testname);
-
- if (nr_threads == 1)
- btree_perf_test_thread(&j);
- else
- for (i = 0; i < nr_threads; i++)
- kthread_run(btree_perf_test_thread, &j,
- "bcachefs perf test[%u]", i);
-
- while (wait_for_completion_interruptible(&j.done_completion))
- ;
-
- time = j.finish - j.start;
-
- scnprintf(name_buf, sizeof(name_buf), "%s:", testname);
- prt_human_readable_u64(&nr_buf, nr);
- prt_human_readable_u64(&per_sec_buf, div64_u64(nr * NSEC_PER_SEC, time));
- printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n",
- name_buf, nr_buf.buf, nr_threads,
- div_u64(time, NSEC_PER_SEC),
- div_u64(time * nr_threads, nr),
- per_sec_buf.buf);
- printbuf_exit(&per_sec_buf);
- printbuf_exit(&nr_buf);
- return j.ret;
-}
-
-#endif /* CONFIG_BCACHEFS_TESTS */
diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h
deleted file mode 100644
index c73b18aea7e0..000000000000
--- a/fs/bcachefs/tests.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_TEST_H
-#define _BCACHEFS_TEST_H
-
-struct bch_fs;
-
-#ifdef CONFIG_BCACHEFS_TESTS
-
-int bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned);
-
-#else
-
-#endif /* CONFIG_BCACHEFS_TESTS */
-
-#endif /* _BCACHEFS_TEST_H */
diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c
deleted file mode 100644
index 314a24d15d4e..000000000000
--- a/fs/bcachefs/thread_with_file.c
+++ /dev/null
@@ -1,494 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "thread_with_file.h"
-
-#include <linux/anon_inodes.h>
-#include <linux/file.h>
-#include <linux/kthread.h>
-#include <linux/pagemap.h>
-#include <linux/poll.h>
-#include <linux/sched/sysctl.h>
-
-void bch2_thread_with_file_exit(struct thread_with_file *thr)
-{
- if (thr->task) {
- kthread_stop(thr->task);
- put_task_struct(thr->task);
- }
-}
-
-int bch2_run_thread_with_file(struct thread_with_file *thr,
- const struct file_operations *fops,
- int (*fn)(void *))
-{
- struct file *file = NULL;
- int ret, fd = -1;
- unsigned fd_flags = O_CLOEXEC;
-
- if (fops->read && fops->write)
- fd_flags |= O_RDWR;
- else if (fops->read)
- fd_flags |= O_RDONLY;
- else if (fops->write)
- fd_flags |= O_WRONLY;
-
- char name[TASK_COMM_LEN];
- get_task_comm(name, current);
-
- thr->ret = 0;
- thr->task = kthread_create(fn, thr, "%s", name);
- ret = PTR_ERR_OR_ZERO(thr->task);
- if (ret)
- return ret;
-
- ret = get_unused_fd_flags(fd_flags);
- if (ret < 0)
- goto err;
- fd = ret;
-
- file = anon_inode_getfile(name, fops, thr, fd_flags);
- ret = PTR_ERR_OR_ZERO(file);
- if (ret)
- goto err;
-
- get_task_struct(thr->task);
- wake_up_process(thr->task);
- fd_install(fd, file);
- return fd;
-err:
- if (fd >= 0)
- put_unused_fd(fd);
- if (thr->task)
- kthread_stop(thr->task);
- return ret;
-}
-
-/* stdio_redirect */
-
-static bool stdio_redirect_has_more_input(struct stdio_redirect *stdio, size_t seen)
-{
- return stdio->input.buf.nr > seen || stdio->done;
-}
-
-static bool stdio_redirect_has_input(struct stdio_redirect *stdio)
-{
- return stdio_redirect_has_more_input(stdio, 0);
-}
-
-static bool stdio_redirect_has_output(struct stdio_redirect *stdio)
-{
- return stdio->output.buf.nr || stdio->done;
-}
-
-#define STDIO_REDIRECT_BUFSIZE 4096
-
-static bool stdio_redirect_has_input_space(struct stdio_redirect *stdio)
-{
- return stdio->input.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done;
-}
-
-static bool stdio_redirect_has_output_space(struct stdio_redirect *stdio)
-{
- return stdio->output.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done;
-}
-
-static void stdio_buf_init(struct stdio_buf *buf)
-{
- spin_lock_init(&buf->lock);
- init_waitqueue_head(&buf->wait);
- darray_init(&buf->buf);
-}
-
-/* thread_with_stdio */
-
-static void thread_with_stdio_done(struct thread_with_stdio *thr)
-{
- thr->thr.done = true;
- thr->stdio.done = true;
- wake_up(&thr->stdio.input.wait);
- wake_up(&thr->stdio.output.wait);
-}
-
-static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf,
- size_t len, loff_t *ppos)
-{
- struct thread_with_stdio *thr =
- container_of(file->private_data, struct thread_with_stdio, thr);
- struct stdio_buf *buf = &thr->stdio.output;
- size_t copied = 0, b;
- int ret = 0;
-
- if (!(file->f_flags & O_NONBLOCK)) {
- ret = wait_event_interruptible(buf->wait, stdio_redirect_has_output(&thr->stdio));
- if (ret)
- return ret;
- } else if (!stdio_redirect_has_output(&thr->stdio))
- return -EAGAIN;
-
- while (len && buf->buf.nr) {
- if (fault_in_writeable(ubuf, len) == len) {
- ret = -EFAULT;
- break;
- }
-
- spin_lock_irq(&buf->lock);
- b = min_t(size_t, len, buf->buf.nr);
-
- if (b && !copy_to_user_nofault(ubuf, buf->buf.data, b)) {
- ubuf += b;
- len -= b;
- copied += b;
- buf->buf.nr -= b;
- memmove(buf->buf.data,
- buf->buf.data + b,
- buf->buf.nr);
- }
- spin_unlock_irq(&buf->lock);
- }
-
- return copied ?: ret;
-}
-
-static int thread_with_stdio_release(struct inode *inode, struct file *file)
-{
- struct thread_with_stdio *thr =
- container_of(file->private_data, struct thread_with_stdio, thr);
-
- thread_with_stdio_done(thr);
- bch2_thread_with_file_exit(&thr->thr);
- darray_exit(&thr->stdio.input.buf);
- darray_exit(&thr->stdio.output.buf);
- thr->ops->exit(thr);
- return 0;
-}
-
-static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf,
- size_t len, loff_t *ppos)
-{
- struct thread_with_stdio *thr =
- container_of(file->private_data, struct thread_with_stdio, thr);
- struct stdio_buf *buf = &thr->stdio.input;
- size_t copied = 0;
- ssize_t ret = 0;
-
- while (len) {
- if (thr->thr.done) {
- ret = -EPIPE;
- break;
- }
-
- size_t b = len - fault_in_readable(ubuf, len);
- if (!b) {
- ret = -EFAULT;
- break;
- }
-
- spin_lock(&buf->lock);
- size_t makeroom = b;
- if (!buf->waiting_for_line || memchr(buf->buf.data, '\n', buf->buf.nr))
- makeroom = min_t(ssize_t, makeroom,
- max_t(ssize_t, STDIO_REDIRECT_BUFSIZE - buf->buf.nr,
- 0));
- darray_make_room_gfp(&buf->buf, makeroom, GFP_NOWAIT);
-
- b = min(len, darray_room(buf->buf));
-
- if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) {
- buf->buf.nr += b;
- ubuf += b;
- len -= b;
- copied += b;
- }
- spin_unlock(&buf->lock);
-
- if (b) {
- wake_up(&buf->wait);
- } else {
- if ((file->f_flags & O_NONBLOCK)) {
- ret = -EAGAIN;
- break;
- }
-
- ret = wait_event_interruptible(buf->wait,
- stdio_redirect_has_input_space(&thr->stdio));
- if (ret)
- break;
- }
- }
-
- return copied ?: ret;
-}
-
-static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_struct *wait)
-{
- struct thread_with_stdio *thr =
- container_of(file->private_data, struct thread_with_stdio, thr);
-
- poll_wait(file, &thr->stdio.output.wait, wait);
- poll_wait(file, &thr->stdio.input.wait, wait);
-
- __poll_t mask = 0;
-
- if (stdio_redirect_has_output(&thr->stdio))
- mask |= EPOLLIN;
- if (stdio_redirect_has_input_space(&thr->stdio))
- mask |= EPOLLOUT;
- if (thr->thr.done)
- mask |= EPOLLHUP|EPOLLERR;
- return mask;
-}
-
-static __poll_t thread_with_stdout_poll(struct file *file, struct poll_table_struct *wait)
-{
- struct thread_with_stdio *thr =
- container_of(file->private_data, struct thread_with_stdio, thr);
-
- poll_wait(file, &thr->stdio.output.wait, wait);
-
- __poll_t mask = 0;
-
- if (stdio_redirect_has_output(&thr->stdio))
- mask |= EPOLLIN;
- if (thr->thr.done)
- mask |= EPOLLHUP|EPOLLERR;
- return mask;
-}
-
-static int thread_with_stdio_flush(struct file *file, fl_owner_t id)
-{
- struct thread_with_stdio *thr =
- container_of(file->private_data, struct thread_with_stdio, thr);
-
- return thr->thr.ret;
-}
-
-static long thread_with_stdio_ioctl(struct file *file, unsigned int cmd, unsigned long p)
-{
- struct thread_with_stdio *thr =
- container_of(file->private_data, struct thread_with_stdio, thr);
-
- if (thr->ops->unlocked_ioctl)
- return thr->ops->unlocked_ioctl(thr, cmd, p);
- return -ENOTTY;
-}
-
-static const struct file_operations thread_with_stdio_fops = {
- .read = thread_with_stdio_read,
- .write = thread_with_stdio_write,
- .poll = thread_with_stdio_poll,
- .flush = thread_with_stdio_flush,
- .release = thread_with_stdio_release,
- .unlocked_ioctl = thread_with_stdio_ioctl,
-};
-
-static const struct file_operations thread_with_stdout_fops = {
- .read = thread_with_stdio_read,
- .poll = thread_with_stdout_poll,
- .flush = thread_with_stdio_flush,
- .release = thread_with_stdio_release,
- .unlocked_ioctl = thread_with_stdio_ioctl,
-};
-
-static int thread_with_stdio_fn(void *arg)
-{
- struct thread_with_stdio *thr = arg;
-
- thr->thr.ret = thr->ops->fn(thr);
-
- thread_with_stdio_done(thr);
- return 0;
-}
-
-void bch2_thread_with_stdio_init(struct thread_with_stdio *thr,
- const struct thread_with_stdio_ops *ops)
-{
- stdio_buf_init(&thr->stdio.input);
- stdio_buf_init(&thr->stdio.output);
- thr->ops = ops;
-}
-
-int __bch2_run_thread_with_stdio(struct thread_with_stdio *thr)
-{
- return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn);
-}
-
-int bch2_run_thread_with_stdio(struct thread_with_stdio *thr,
- const struct thread_with_stdio_ops *ops)
-{
- bch2_thread_with_stdio_init(thr, ops);
-
- return __bch2_run_thread_with_stdio(thr);
-}
-
-int bch2_run_thread_with_stdout(struct thread_with_stdio *thr,
- const struct thread_with_stdio_ops *ops)
-{
- stdio_buf_init(&thr->stdio.input);
- stdio_buf_init(&thr->stdio.output);
- thr->ops = ops;
-
- return bch2_run_thread_with_file(&thr->thr, &thread_with_stdout_fops, thread_with_stdio_fn);
-}
-EXPORT_SYMBOL_GPL(bch2_run_thread_with_stdout);
-
-int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len)
-{
- struct stdio_buf *buf = &stdio->input;
-
- /*
- * we're waiting on user input (or for the file descriptor to be
- * closed), so we don't want a hung task warning:
- */
- do {
- wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio),
- sysctl_hung_task_timeout_secs * HZ / 2);
- } while (!stdio_redirect_has_input(stdio));
-
- if (stdio->done)
- return -1;
-
- spin_lock(&buf->lock);
- int ret = min(len, buf->buf.nr);
- buf->buf.nr -= ret;
- memcpy(ubuf, buf->buf.data, ret);
- memmove(buf->buf.data,
- buf->buf.data + ret,
- buf->buf.nr);
- spin_unlock(&buf->lock);
-
- wake_up(&buf->wait);
- return ret;
-}
-
-int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *stdio,
- darray_char *line,
- unsigned long timeout)
-{
- unsigned long until = jiffies + timeout, t;
- struct stdio_buf *buf = &stdio->input;
- size_t seen = 0;
-again:
- t = timeout != MAX_SCHEDULE_TIMEOUT
- ? max_t(long, until - jiffies, 0)
- : timeout;
-
- t = min(t, sysctl_hung_task_timeout_secs * HZ / 2);
-
- wait_event_timeout(buf->wait, stdio_redirect_has_more_input(stdio, seen), t);
-
- if (stdio->done)
- return -1;
-
- spin_lock(&buf->lock);
- seen = buf->buf.nr;
- char *n = memchr(buf->buf.data, '\n', seen);
-
- if (!n && timeout != MAX_SCHEDULE_TIMEOUT && time_after_eq(jiffies, until)) {
- spin_unlock(&buf->lock);
- return -ETIME;
- }
-
- if (!n) {
- buf->waiting_for_line = true;
- spin_unlock(&buf->lock);
- goto again;
- }
-
- size_t b = n + 1 - buf->buf.data;
- if (b > line->size) {
- spin_unlock(&buf->lock);
- int ret = darray_resize(line, b);
- if (ret)
- return ret;
- seen = 0;
- goto again;
- }
-
- buf->buf.nr -= b;
- memcpy(line->data, buf->buf.data, b);
- memmove(buf->buf.data,
- buf->buf.data + b,
- buf->buf.nr);
- line->nr = b;
-
- buf->waiting_for_line = false;
- spin_unlock(&buf->lock);
-
- wake_up(&buf->wait);
- return 0;
-}
-
-int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, darray_char *line)
-{
- return bch2_stdio_redirect_readline_timeout(stdio, line, MAX_SCHEDULE_TIMEOUT);
-}
-
-__printf(3, 0)
-static ssize_t bch2_darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args)
-{
- ssize_t ret;
-
- do {
- va_list args2;
- size_t len;
-
- va_copy(args2, args);
- len = vsnprintf(out->data + out->nr, darray_room(*out), fmt, args2);
- va_end(args2);
-
- if (len + 1 <= darray_room(*out)) {
- out->nr += len;
- return len;
- }
-
- ret = darray_make_room_gfp(out, len + 1, gfp);
- } while (ret == 0);
-
- return ret;
-}
-
-ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking,
- const char *fmt, va_list args)
-{
- struct stdio_buf *buf = &stdio->output;
- unsigned long flags;
- ssize_t ret;
-again:
- if (stdio->done)
- return -EPIPE;
-
- spin_lock_irqsave(&buf->lock, flags);
- ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args);
- spin_unlock_irqrestore(&buf->lock, flags);
-
- if (ret < 0) {
- if (nonblocking)
- return -EAGAIN;
-
- ret = wait_event_interruptible(buf->wait,
- stdio_redirect_has_output_space(stdio));
- if (ret)
- return ret;
- goto again;
- }
-
- wake_up(&buf->wait);
- return ret;
-}
-
-ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking,
- const char *fmt, ...)
-{
- va_list args;
- ssize_t ret;
-
- va_start(args, fmt);
- ret = bch2_stdio_redirect_vprintf(stdio, nonblocking, fmt, args);
- va_end(args);
-
- return ret;
-}
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h
deleted file mode 100644
index 72497b921911..000000000000
--- a/fs/bcachefs/thread_with_file.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_THREAD_WITH_FILE_H
-#define _BCACHEFS_THREAD_WITH_FILE_H
-
-#include "thread_with_file_types.h"
-
-/*
- * Thread with file: Run a kthread and connect it to a file descriptor, so that
- * it can be interacted with via fd read/write methods and closing the file
- * descriptor stops the kthread.
- *
- * We have two different APIs:
- *
- * thread_with_file, the low level version.
- * You get to define the full file_operations, including your release function,
- * which means that you must call bch2_thread_with_file_exit() from your
- * .release method
- *
- * thread_with_stdio, the higher level version
- * This implements full piping of input and output, including .poll.
- *
- * Notes on behaviour:
- * - kthread shutdown behaves like writing or reading from a pipe that has been
- * closed
- * - Input and output buffers are 4096 bytes, although buffers may in some
- * situations slightly exceed that limit so as to avoid chopping off a
- * message in the middle in nonblocking mode.
- * - Input/output buffers are lazily allocated, with GFP_NOWAIT allocations -
- * should be fine but might change in future revisions.
- * - Output buffer may grow past 4096 bytes to deal with messages that are
- * bigger than 4096 bytes
- * - Writing may be done blocking or nonblocking; in nonblocking mode, we only
- * drop entire messages.
- *
- * To write, use stdio_redirect_printf()
- * To read, use stdio_redirect_read() or stdio_redirect_readline()
- */
-
-struct task_struct;
-
-struct thread_with_file {
- struct task_struct *task;
- int ret;
- bool done;
-};
-
-void bch2_thread_with_file_exit(struct thread_with_file *);
-int bch2_run_thread_with_file(struct thread_with_file *,
- const struct file_operations *,
- int (*fn)(void *));
-
-struct thread_with_stdio;
-
-struct thread_with_stdio_ops {
- void (*exit)(struct thread_with_stdio *);
- int (*fn)(struct thread_with_stdio *);
- long (*unlocked_ioctl)(struct thread_with_stdio *, unsigned int, unsigned long);
-};
-
-struct thread_with_stdio {
- struct thread_with_file thr;
- struct stdio_redirect stdio;
- const struct thread_with_stdio_ops *ops;
-};
-
-void bch2_thread_with_stdio_init(struct thread_with_stdio *,
- const struct thread_with_stdio_ops *);
-int __bch2_run_thread_with_stdio(struct thread_with_stdio *);
-int bch2_run_thread_with_stdio(struct thread_with_stdio *,
- const struct thread_with_stdio_ops *);
-int bch2_run_thread_with_stdout(struct thread_with_stdio *,
- const struct thread_with_stdio_ops *);
-int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t);
-
-int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *, darray_char *, unsigned long);
-int bch2_stdio_redirect_readline(struct stdio_redirect *, darray_char *);
-
-__printf(3, 0) ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list);
-__printf(3, 4) ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...);
-
-#endif /* _BCACHEFS_THREAD_WITH_FILE_H */
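
A hedged usage sketch of the thread_with_stdio API described in the header comment above (not part of this diff; the my_* names are hypothetical, while the ops layout and entry points are taken from the declarations in the header):

#include "thread_with_file.h"

static void my_exit(struct thread_with_stdio *thr)
{
	/* release whatever structure the thread_with_stdio was embedded in */
}

static int my_fn(struct thread_with_stdio *thr)
{
	/* runs in the kthread; output is piped to the file descriptor */
	bch2_stdio_redirect_printf(&thr->stdio, false, "hello from the kthread\n");
	return 0;
}

static const struct thread_with_stdio_ops my_ops = {
	.exit	= my_exit,
	.fn	= my_fn,
};

static int my_start(struct thread_with_stdio *thr)
{
	/* returns the new file descriptor; closing it stops the kthread */
	return bch2_run_thread_with_stdio(thr, &my_ops);
}

Userspace then reads the thread's output from the returned fd, and the .exit hook runs when the fd is released.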
diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h
deleted file mode 100644
index f4d484d44f63..000000000000
--- a/fs/bcachefs/thread_with_file_types.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H
-#define _BCACHEFS_THREAD_WITH_FILE_TYPES_H
-
-#include "darray.h"
-
-struct stdio_buf {
- spinlock_t lock;
- wait_queue_head_t wait;
- darray_char buf;
- bool waiting_for_line;
-};
-
-struct stdio_redirect {
- struct stdio_buf input;
- struct stdio_buf output;
- bool done;
-};
-
-#endif /* _BCACHEFS_THREAD_WITH_FILE_TYPES_H */
diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c
deleted file mode 100644
index 2c34fe4be912..000000000000
--- a/fs/bcachefs/time_stats.c
+++ /dev/null
@@ -1,191 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/jiffies.h>
-#include <linux/module.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/time.h>
-#include <linux/spinlock.h>
-
-#include "eytzinger.h"
-#include "time_stats.h"
-
-/* disable automatic switching to percpu mode */
-#define TIME_STATS_NONPCPU ((unsigned long) 1)
-
-static const struct time_unit time_units[] = {
- { "ns", 1 },
- { "us", NSEC_PER_USEC },
- { "ms", NSEC_PER_MSEC },
- { "s", NSEC_PER_SEC },
- { "m", (u64) NSEC_PER_SEC * 60},
- { "h", (u64) NSEC_PER_SEC * 3600},
- { "d", (u64) NSEC_PER_SEC * 3600 * 24},
- { "w", (u64) NSEC_PER_SEC * 3600 * 24 * 7},
- { "y", (u64) NSEC_PER_SEC * ((3600 * 24 * 7 * 365) + (3600 * (24 / 4) * 7))}, /* 365.25d */
- { "eon", U64_MAX },
-};
-
-const struct time_unit *bch2_pick_time_units(u64 ns)
-{
- const struct time_unit *u;
-
- for (u = time_units;
- u + 1 < time_units + ARRAY_SIZE(time_units) &&
- ns >= u[1].nsecs << 1;
- u++)
- ;
-
- return u;
-}
-
-static void quantiles_update(struct quantiles *q, u64 v)
-{
- unsigned i = 0;
-
- while (i < ARRAY_SIZE(q->entries)) {
- struct quantile_entry *e = q->entries + i;
-
- if (unlikely(!e->step)) {
- e->m = v;
- e->step = max_t(unsigned, v / 2, 1024);
- } else if (e->m > v) {
- e->m = e->m >= e->step
- ? e->m - e->step
- : 0;
- } else if (e->m < v) {
- e->m = e->m + e->step > e->m
- ? e->m + e->step
- : U32_MAX;
- }
-
- if ((e->m > v ? e->m - v : v - e->m) < e->step)
- e->step = max_t(unsigned, e->step / 2, 1);
-
- if (v >= e->m)
- break;
-
- i = eytzinger0_child(i, v > e->m);
- }
-}
-
-static inline void time_stats_update_one(struct bch2_time_stats *stats,
- u64 start, u64 end)
-{
- u64 duration, freq;
- bool initted = stats->last_event != 0;
-
- if (time_after64(end, start)) {
- struct quantiles *quantiles = time_stats_to_quantiles(stats);
-
- duration = end - start;
- mean_and_variance_update(&stats->duration_stats, duration);
- mean_and_variance_weighted_update(&stats->duration_stats_weighted,
- duration, initted, TIME_STATS_MV_WEIGHT);
- stats->max_duration = max(stats->max_duration, duration);
- stats->min_duration = min(stats->min_duration, duration);
- stats->total_duration += duration;
-
- if (quantiles)
- quantiles_update(quantiles, duration);
- }
-
- if (stats->last_event && time_after64(end, stats->last_event)) {
- freq = end - stats->last_event;
- mean_and_variance_update(&stats->freq_stats, freq);
- mean_and_variance_weighted_update(&stats->freq_stats_weighted,
- freq, initted, TIME_STATS_MV_WEIGHT);
- stats->max_freq = max(stats->max_freq, freq);
- stats->min_freq = min(stats->min_freq, freq);
- }
-
- stats->last_event = end;
-}
-
-void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
- struct time_stat_buffer *b)
-{
- for (struct time_stat_buffer_entry *i = b->entries;
- i < b->entries + ARRAY_SIZE(b->entries);
- i++)
- time_stats_update_one(stats, i->start, i->end);
- b->nr = 0;
-}
-
-static noinline void time_stats_clear_buffer(struct bch2_time_stats *stats,
- struct time_stat_buffer *b)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&stats->lock, flags);
- __bch2_time_stats_clear_buffer(stats, b);
- spin_unlock_irqrestore(&stats->lock, flags);
-}
-
-void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
-{
- unsigned long flags;
-
- if ((unsigned long) stats->buffer <= TIME_STATS_NONPCPU) {
- spin_lock_irqsave(&stats->lock, flags);
- time_stats_update_one(stats, start, end);
-
- if (!stats->buffer &&
- mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 &&
- stats->duration_stats.n > 1024)
- stats->buffer =
- alloc_percpu_gfp(struct time_stat_buffer,
- GFP_ATOMIC);
- spin_unlock_irqrestore(&stats->lock, flags);
- } else {
- struct time_stat_buffer *b;
-
- preempt_disable();
- b = this_cpu_ptr(stats->buffer);
-
- BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
- b->entries[b->nr++] = (struct time_stat_buffer_entry) {
- .start = start,
- .end = end
- };
-
- if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
- time_stats_clear_buffer(stats, b);
- preempt_enable();
- }
-}
-
-void bch2_time_stats_reset(struct bch2_time_stats *stats)
-{
- spin_lock_irq(&stats->lock);
- unsigned offset = offsetof(struct bch2_time_stats, min_duration);
- memset((void *) stats + offset, 0, sizeof(*stats) - offset);
-
- if ((unsigned long) stats->buffer > TIME_STATS_NONPCPU) {
- int cpu;
- for_each_possible_cpu(cpu)
- per_cpu_ptr(stats->buffer, cpu)->nr = 0;
- }
- spin_unlock_irq(&stats->lock);
-}
-
-void bch2_time_stats_exit(struct bch2_time_stats *stats)
-{
- if ((unsigned long) stats->buffer > TIME_STATS_NONPCPU)
- free_percpu(stats->buffer);
- stats->buffer = NULL;
-}
-
-void bch2_time_stats_init(struct bch2_time_stats *stats)
-{
- memset(stats, 0, sizeof(*stats));
- stats->min_duration = U64_MAX;
- stats->min_freq = U64_MAX;
- spin_lock_init(&stats->lock);
-}
-
-void bch2_time_stats_init_no_pcpu(struct bch2_time_stats *stats)
-{
- bch2_time_stats_init(stats);
- stats->buffer = (struct time_stat_buffer __percpu *) TIME_STATS_NONPCPU;
-}
diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h
deleted file mode 100644
index eddb0985bab4..000000000000
--- a/fs/bcachefs/time_stats.h
+++ /dev/null
@@ -1,161 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * bch2_time_stats - collect statistics on events that have a duration, with nicely
- * formatted textual output on demand
- *
- * - percpu buffering of event collection: cheap enough to shotgun
- * everywhere without worrying about overhead
- *
- * tracks:
- * - number of events
- * - maximum event duration ever seen
- * - sum of all event durations
- * - average event duration, standard and weighted
- * - standard deviation of event durations, standard and weighted
- * and analogous statistics for the frequency of events
- *
- * We provide both mean and weighted mean (exponentially weighted), and standard
- * deviation and weighted standard deviation, to give an efficient-to-compute
- * view of current behaviour versus average behaviour - "did this event source
- * just become wonky, or is this typical?".
- *
- * Particularly useful for tracking down latency issues.
- */
-#ifndef _BCACHEFS_TIME_STATS_H
-#define _BCACHEFS_TIME_STATS_H
-
-#include <linux/sched/clock.h>
-#include <linux/spinlock_types.h>
-#include <linux/string.h>
-
-#include "mean_and_variance.h"
-
-struct time_unit {
- const char *name;
- u64 nsecs;
-};
-
-/*
- * given a nanosecond value, pick the preferred time units for printing:
- */
-const struct time_unit *bch2_pick_time_units(u64 ns);
-
-/*
- * quantiles - do not use:
- *
- * Only enabled if bch2_time_stats->have_quantiles has been manually set - don't
- * use in new code.
- */
-
-#define NR_QUANTILES 15
-#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
-#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
-#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES)
-
-struct quantiles {
- struct quantile_entry {
- u64 m;
- u64 step;
- } entries[NR_QUANTILES];
-};
-
-struct time_stat_buffer {
- unsigned nr;
- struct time_stat_buffer_entry {
- u64 start;
- u64 end;
- } entries[31];
-};
-
-struct bch2_time_stats {
- spinlock_t lock;
- bool have_quantiles;
- struct time_stat_buffer __percpu *buffer;
- /* all fields are in nanoseconds */
- u64 min_duration;
- u64 max_duration;
- u64 total_duration;
- u64 max_freq;
- u64 min_freq;
- u64 last_event;
- u64 last_event_start;
-
- struct mean_and_variance duration_stats;
- struct mean_and_variance freq_stats;
-
-/* default weight for weighted mean and variance calculations */
-#define TIME_STATS_MV_WEIGHT 8
-
- struct mean_and_variance_weighted duration_stats_weighted;
- struct mean_and_variance_weighted freq_stats_weighted;
-};
-
-struct bch2_time_stats_quantiles {
- struct bch2_time_stats stats;
- struct quantiles quantiles;
-};
-
-static inline struct quantiles *time_stats_to_quantiles(struct bch2_time_stats *stats)
-{
- return stats->have_quantiles
- ? &container_of(stats, struct bch2_time_stats_quantiles, stats)->quantiles
- : NULL;
-}
-
-void __bch2_time_stats_clear_buffer(struct bch2_time_stats *, struct time_stat_buffer *);
-void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
-
-/**
- * bch2_time_stats_update - collect a new event being tracked
- *
- * @stats - bch2_time_stats to update
- * @start - start time of event, recorded with local_clock()
- *
- * The end time of the event will be the current time
- */
-static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
-{
- __bch2_time_stats_update(stats, start, local_clock());
-}
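
A hedged sketch of the intended call pattern (not part of this diff; my_stats and the surrounding functions are hypothetical, only the bch2_time_stats_* calls come from this header): record the start time with local_clock(), then account the completed event:

#include "time_stats.h"

static struct bch2_time_stats my_stats;

static void my_setup(void)
{
	bch2_time_stats_init(&my_stats);
}

static void my_timed_operation(void)
{
	u64 start = local_clock();

	/* ... the work being measured ... */

	/* the end time is taken internally via local_clock() */
	bch2_time_stats_update(&my_stats, start);
}

static void my_teardown(void)
{
	bch2_time_stats_exit(&my_stats);
}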
-
-/**
- * track_event_change - track state change events
- *
- * @stats - bch2_time_stats to update
- * @v - new state, true or false
- *
- * Use this when tracking time stats for state changes, e.g. resource X becoming
- * blocked/unblocked.
- */
-static inline bool track_event_change(struct bch2_time_stats *stats, bool v)
-{
- if (v != !!stats->last_event_start) {
- if (!v) {
- bch2_time_stats_update(stats, stats->last_event_start);
- stats->last_event_start = 0;
- } else {
- stats->last_event_start = local_clock() ?: 1;
- return true;
- }
- }
-
- return false;
-}
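
For the state-change case, a hedged sketch (my_blocked_stats and the caller are hypothetical): flip the flag on each transition, and the blocked duration is accounted when the resource unblocks:

static struct bch2_time_stats my_blocked_stats;

static void my_resource_set_blocked(bool blocked)
{
	/*
	 * true records a start timestamp; false accounts the elapsed
	 * blocked time via bch2_time_stats_update():
	 */
	track_event_change(&my_blocked_stats, blocked);
}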
-
-void bch2_time_stats_reset(struct bch2_time_stats *);
-void bch2_time_stats_exit(struct bch2_time_stats *);
-void bch2_time_stats_init(struct bch2_time_stats *);
-void bch2_time_stats_init_no_pcpu(struct bch2_time_stats *);
-
-static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq)
-{
- bch2_time_stats_exit(&statq->stats);
-}
-static inline void bch2_time_stats_quantiles_init(struct bch2_time_stats_quantiles *statq)
-{
- bch2_time_stats_init(&statq->stats);
- statq->stats.have_quantiles = true;
- memset(&statq->quantiles, 0, sizeof(statq->quantiles));
-}
-
-#endif /* _BCACHEFS_TIME_STATS_H */
diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c
deleted file mode 100644
index dfad1d06633d..000000000000
--- a/fs/bcachefs/trace.c
+++ /dev/null
@@ -1,18 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "alloc_types.h"
-#include "buckets.h"
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "btree_key_cache.h"
-#include "btree_locking.h"
-#include "btree_update_interior.h"
-#include "keylist.h"
-#include "move_types.h"
-#include "opts.h"
-#include "six.h"
-
-#include <linux/blktrace_api.h>
-
-#define CREATE_TRACE_POINTS
-#include "trace.h"
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
deleted file mode 100644
index 9c5a9c551f03..000000000000
--- a/fs/bcachefs/trace.h
+++ /dev/null
@@ -1,1883 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM bcachefs
-
-#if !defined(_TRACE_BCACHEFS_H) || defined(TRACE_HEADER_MULTI_READ)
-
-#include <linux/tracepoint.h>
-
-#define TRACE_BPOS_entries(name) \
- __field(u64, name##_inode ) \
- __field(u64, name##_offset ) \
- __field(u32, name##_snapshot )
-
-#define TRACE_BPOS_assign(dst, src) \
- __entry->dst##_inode = (src).inode; \
- __entry->dst##_offset = (src).offset; \
- __entry->dst##_snapshot = (src).snapshot
-
-DECLARE_EVENT_CLASS(bpos,
- TP_PROTO(const struct bpos *p),
- TP_ARGS(p),
-
- TP_STRUCT__entry(
- TRACE_BPOS_entries(p)
- ),
-
- TP_fast_assign(
- TRACE_BPOS_assign(p, *p);
- ),
-
- TP_printk("%llu:%llu:%u", __entry->p_inode, __entry->p_offset, __entry->p_snapshot)
-);
-
-DECLARE_EVENT_CLASS(fs_str,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __string(str, str )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __assign_str(str);
- ),
-
- TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str))
-);
-
-DECLARE_EVENT_CLASS(trans_str,
- TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str),
- TP_ARGS(trans, caller_ip, str),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __string(str, str )
- ),
-
- TP_fast_assign(
- __entry->dev = trans->c->dev;
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __assign_str(str);
- ),
-
- TP_printk("%d,%d %s %pS %s",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->trans_fn, (void *) __entry->caller_ip, __get_str(str))
-);
-
-DECLARE_EVENT_CLASS(trans_str_nocaller,
- TP_PROTO(struct btree_trans *trans, const char *str),
- TP_ARGS(trans, str),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __array(char, trans_fn, 32 )
- __string(str, str )
- ),
-
- TP_fast_assign(
- __entry->dev = trans->c->dev;
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __assign_str(str);
- ),
-
- TP_printk("%d,%d %s %s",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->trans_fn, __get_str(str))
-);
-
-DECLARE_EVENT_CLASS(btree_node_nofs,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u8, level )
- __field(u8, btree_id )
- TRACE_BPOS_entries(pos)
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->level = b->c.level;
- __entry->btree_id = b->c.btree_id;
- TRACE_BPOS_assign(pos, b->key.k.p);
- ),
-
- TP_printk("%d,%d %u %s %llu:%llu:%u",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->level,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
-);
-
-DECLARE_EVENT_CLASS(btree_node,
- TP_PROTO(struct btree_trans *trans, struct btree *b),
- TP_ARGS(trans, b),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __array(char, trans_fn, 32 )
- __field(u8, level )
- __field(u8, btree_id )
- TRACE_BPOS_entries(pos)
- ),
-
- TP_fast_assign(
- __entry->dev = trans->c->dev;
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->level = b->c.level;
- __entry->btree_id = b->c.btree_id;
- TRACE_BPOS_assign(pos, b->key.k.p);
- ),
-
- TP_printk("%d,%d %s %u %s %llu:%llu:%u",
- MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn,
- __entry->level,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
-);
-
-DECLARE_EVENT_CLASS(bch_fs,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- ),
-
- TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev))
-);
-
-DECLARE_EVENT_CLASS(btree_trans,
- TP_PROTO(struct btree_trans *trans),
- TP_ARGS(trans),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __array(char, trans_fn, 32 )
- ),
-
- TP_fast_assign(
- __entry->dev = trans->c->dev;
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- ),
-
- TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn)
-);
-
-DECLARE_EVENT_CLASS(bio,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(sector_t, sector )
- __field(unsigned int, nr_sector )
- __array(char, rwbs, 6 )
- ),
-
- TP_fast_assign(
- __entry->dev = bio->bi_bdev ? bio_dev(bio) : 0;
- __entry->sector = bio->bi_iter.bi_sector;
- __entry->nr_sector = bio->bi_iter.bi_size >> 9;
- blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
- ),
-
- TP_printk("%d,%d %s %llu + %u",
- MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
- (unsigned long long)__entry->sector, __entry->nr_sector)
-);
-
-/* errors */
-
-TRACE_EVENT(error_throw,
- TP_PROTO(struct bch_fs *c, int bch_err, unsigned long ip),
- TP_ARGS(c, bch_err, ip),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(int, err )
- __array(char, err_str, 32 )
- __array(char, ip, 32 )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->err = bch_err;
- strscpy(__entry->err_str, bch2_err_str(bch_err), sizeof(__entry->err_str));
- snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
- ),
-
- TP_printk("%d,%d %s ret %s", MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ip, __entry->err_str)
-);
-
-TRACE_EVENT(error_downcast,
- TP_PROTO(int bch_err, int std_err, unsigned long ip),
- TP_ARGS(bch_err, std_err, ip),
-
- TP_STRUCT__entry(
- __array(char, bch_err, 32 )
- __array(char, std_err, 32 )
- __array(char, ip, 32 )
- ),
-
- TP_fast_assign(
- strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err));
- strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err));
- snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
- ),
-
- TP_printk("%s ret %s -> %s %s", __entry->ip,
- __entry->bch_err, __entry->std_err, __entry->ip)
-);
-
-/* disk_accounting.c */
-
-TRACE_EVENT(accounting_mem_insert,
- TP_PROTO(struct bch_fs *c, const char *acc),
- TP_ARGS(c, acc),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(unsigned, new_nr )
- __string(acc, acc )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->new_nr = c->accounting.k.nr;
- __assign_str(acc);
- ),
-
- TP_printk("%d,%d entries %u added %s",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->new_nr,
- __get_str(acc))
-);
-
-/* fs.c: */
-TRACE_EVENT(bch2_sync_fs,
- TP_PROTO(struct super_block *sb, int wait),
-
- TP_ARGS(sb, wait),
-
- TP_STRUCT__entry(
- __field( dev_t, dev )
- __field( int, wait )
-
- ),
-
- TP_fast_assign(
- __entry->dev = sb->s_dev;
- __entry->wait = wait;
- ),
-
- TP_printk("dev %d,%d wait %d",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->wait)
-);
-
-/* fs-io.c: */
-TRACE_EVENT(bch2_fsync,
- TP_PROTO(struct file *file, int datasync),
-
- TP_ARGS(file, datasync),
-
- TP_STRUCT__entry(
- __field( dev_t, dev )
- __field( ino_t, ino )
- __field( ino_t, parent )
- __field( int, datasync )
- ),
-
- TP_fast_assign(
- struct dentry *dentry = file->f_path.dentry;
-
- __entry->dev = dentry->d_sb->s_dev;
- __entry->ino = d_inode(dentry)->i_ino;
- __entry->parent = d_inode(dentry->d_parent)->i_ino;
- __entry->datasync = datasync;
- ),
-
- TP_printk("dev %d,%d ino %lu parent %lu datasync %d ",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- (unsigned long) __entry->ino,
- (unsigned long) __entry->parent, __entry->datasync)
-);
-
-/* super-io.c: */
-TRACE_EVENT(write_super,
- TP_PROTO(struct bch_fs *c, unsigned long ip),
- TP_ARGS(c, ip),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(unsigned long, ip )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->ip = ip;
- ),
-
- TP_printk("%d,%d for %pS",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- (void *) __entry->ip)
-);
-
-/* io.c: */
-
-DEFINE_EVENT(bio, io_read_promote,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio)
-);
-
-TRACE_EVENT(io_read_nopromote,
- TP_PROTO(struct bch_fs *c, int ret),
- TP_ARGS(c, ret),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __array(char, ret, 32 )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret));
- ),
-
- TP_printk("%d,%d ret %s",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ret)
-);
-
-DEFINE_EVENT(bio, io_read_bounce,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bio, io_read_split,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bio, io_read_retry,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bio, io_read_reuse_race,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bio, io_read_fail_and_poison,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio)
-);
-
-/* ec.c */
-
-TRACE_EVENT(stripe_create,
- TP_PROTO(struct bch_fs *c, u64 idx, int ret),
- TP_ARGS(c, idx, ret),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u64, idx )
- __field(int, ret )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->idx = idx;
- __entry->ret = ret;
- ),
-
- TP_printk("%d,%d idx %llu ret %i",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->idx,
- __entry->ret)
-);
-
-/* Journal */
-
-DEFINE_EVENT(bch_fs, journal_full,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DEFINE_EVENT(fs_str, journal_entry_full,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, journal_entry_close,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(bio, journal_write,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio)
-);
-
-TRACE_EVENT(journal_reclaim_start,
- TP_PROTO(struct bch_fs *c, bool direct, bool kicked,
- u64 min_nr, u64 min_key_cache,
- u64 btree_cache_dirty, u64 btree_cache_total,
- u64 btree_key_cache_dirty, u64 btree_key_cache_total),
- TP_ARGS(c, direct, kicked, min_nr, min_key_cache,
- btree_cache_dirty, btree_cache_total,
- btree_key_cache_dirty, btree_key_cache_total),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(bool, direct )
- __field(bool, kicked )
- __field(u64, min_nr )
- __field(u64, min_key_cache )
- __field(u64, btree_cache_dirty )
- __field(u64, btree_cache_total )
- __field(u64, btree_key_cache_dirty )
- __field(u64, btree_key_cache_total )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->direct = direct;
- __entry->kicked = kicked;
- __entry->min_nr = min_nr;
- __entry->min_key_cache = min_key_cache;
- __entry->btree_cache_dirty = btree_cache_dirty;
- __entry->btree_cache_total = btree_cache_total;
- __entry->btree_key_cache_dirty = btree_key_cache_dirty;
- __entry->btree_key_cache_total = btree_key_cache_total;
- ),
-
- TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu btree cache %llu/%llu key cache %llu/%llu",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->direct,
- __entry->kicked,
- __entry->min_nr,
- __entry->min_key_cache,
- __entry->btree_cache_dirty,
- __entry->btree_cache_total,
- __entry->btree_key_cache_dirty,
- __entry->btree_key_cache_total)
-);
-
-TRACE_EVENT(journal_reclaim_finish,
- TP_PROTO(struct bch_fs *c, u64 nr_flushed),
- TP_ARGS(c, nr_flushed),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u64, nr_flushed )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->nr_flushed = nr_flushed;
- ),
-
- TP_printk("%d,%d flushed %llu",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->nr_flushed)
-);
-
-/* bset.c: */
-
-DEFINE_EVENT(bpos, bkey_pack_pos_fail,
- TP_PROTO(const struct bpos *p),
- TP_ARGS(p)
-);
-
-/* Btree cache: */
-
-TRACE_EVENT(btree_cache_scan,
- TP_PROTO(long nr_to_scan, long can_free, long ret),
- TP_ARGS(nr_to_scan, can_free, ret),
-
- TP_STRUCT__entry(
- __field(long, nr_to_scan )
- __field(long, can_free )
- __field(long, ret )
- ),
-
- TP_fast_assign(
- __entry->nr_to_scan = nr_to_scan;
- __entry->can_free = can_free;
- __entry->ret = ret;
- ),
-
- TP_printk("scanned for %li nodes, can free %li, ret %li",
- __entry->nr_to_scan, __entry->can_free, __entry->ret)
-);
-
-DEFINE_EVENT(btree_node_nofs, btree_cache_reap,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
-);
-
-DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock_fail,
- TP_PROTO(struct btree_trans *trans),
- TP_ARGS(trans)
-);
-
-DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock,
- TP_PROTO(struct btree_trans *trans),
- TP_ARGS(trans)
-);
-
-DEFINE_EVENT(btree_trans, btree_cache_cannibalize,
- TP_PROTO(struct btree_trans *trans),
- TP_ARGS(trans)
-);
-
-DEFINE_EVENT(btree_trans, btree_cache_cannibalize_unlock,
- TP_PROTO(struct btree_trans *trans),
- TP_ARGS(trans)
-);
-
-/* Btree */
-
-DEFINE_EVENT(btree_node, btree_node_read,
- TP_PROTO(struct btree_trans *trans, struct btree *b),
- TP_ARGS(trans, b)
-);
-
-TRACE_EVENT(btree_node_write,
- TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors),
- TP_ARGS(b, bytes, sectors),
-
- TP_STRUCT__entry(
- __field(enum btree_node_type, type)
- __field(unsigned, bytes )
- __field(unsigned, sectors )
- ),
-
- TP_fast_assign(
- __entry->type = btree_node_type(b);
- __entry->bytes = bytes;
- __entry->sectors = sectors;
- ),
-
- TP_printk("bkey type %u bytes %u sectors %u",
- __entry->type, __entry->bytes, __entry->sectors)
-);
-
-DEFINE_EVENT(btree_node, btree_node_alloc,
- TP_PROTO(struct btree_trans *trans, struct btree *b),
- TP_ARGS(trans, b)
-);
-
-DEFINE_EVENT(btree_node, btree_node_free,
- TP_PROTO(struct btree_trans *trans, struct btree *b),
- TP_ARGS(trans, b)
-);
-
-TRACE_EVENT(btree_reserve_get_fail,
- TP_PROTO(const char *trans_fn,
- unsigned long caller_ip,
- size_t required,
- int ret),
- TP_ARGS(trans_fn, caller_ip, required, ret),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(size_t, required )
- __array(char, ret, 32 )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->required = required;
- strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret));
- ),
-
- TP_printk("%s %pS required %zu ret %s",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- __entry->required,
- __entry->ret)
-);
-
-DEFINE_EVENT(btree_node, btree_node_compact,
- TP_PROTO(struct btree_trans *trans, struct btree *b),
- TP_ARGS(trans, b)
-);
-
-DEFINE_EVENT(btree_node, btree_node_merge,
- TP_PROTO(struct btree_trans *trans, struct btree *b),
- TP_ARGS(trans, b)
-);
-
-DEFINE_EVENT(btree_node, btree_node_split,
- TP_PROTO(struct btree_trans *trans, struct btree *b),
- TP_ARGS(trans, b)
-);
-
-DEFINE_EVENT(btree_node, btree_node_rewrite,
- TP_PROTO(struct btree_trans *trans, struct btree *b),
- TP_ARGS(trans, b)
-);
-
-DEFINE_EVENT(btree_node, btree_node_set_root,
- TP_PROTO(struct btree_trans *trans, struct btree *b),
- TP_ARGS(trans, b)
-);
-
-TRACE_EVENT(btree_path_relock_fail,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path,
- unsigned level),
- TP_ARGS(trans, caller_ip, path, level),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(u8, btree_id )
- __field(u8, level )
- __field(u8, path_idx)
- TRACE_BPOS_entries(pos)
- __array(char, node, 24 )
- __field(u8, self_read_count )
- __field(u8, self_intent_count)
- __field(u8, read_count )
- __field(u8, intent_count )
- __field(u32, iter_lock_seq )
- __field(u32, node_lock_seq )
- ),
-
- TP_fast_assign(
- struct btree *b = btree_path_node(path, level);
- struct six_lock_count c;
-
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->btree_id = path->btree_id;
- __entry->level = level;
- __entry->path_idx = path - trans->paths;
- TRACE_BPOS_assign(pos, path->pos);
-
- c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
- __entry->self_read_count = c.n[SIX_LOCK_read];
- __entry->self_intent_count = c.n[SIX_LOCK_intent];
-
- if (IS_ERR(b)) {
- strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node));
- } else {
- c = six_lock_counts(&path->l[level].b->c.lock);
- __entry->read_count = c.n[SIX_LOCK_read];
- __entry->intent_count = c.n[SIX_LOCK_intent];
- scnprintf(__entry->node, sizeof(__entry->node), "%px", &b->c);
- }
- __entry->iter_lock_seq = path->l[level].lock_seq;
- __entry->node_lock_seq = is_btree_node(path, level)
- ? six_lock_seq(&path->l[level].b->c.lock)
- : 0;
- ),
-
- TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- __entry->path_idx,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot,
- __entry->level,
- __entry->node,
- __entry->self_read_count,
- __entry->self_intent_count,
- __entry->read_count,
- __entry->intent_count,
- __entry->iter_lock_seq,
- __entry->node_lock_seq)
-);
-
-TRACE_EVENT(btree_path_upgrade_fail,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path,
- unsigned level),
- TP_ARGS(trans, caller_ip, path, level),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(u8, btree_id )
- __field(u8, level )
- __field(u8, path_idx)
- TRACE_BPOS_entries(pos)
- __field(u8, locked )
- __field(u8, self_read_count )
- __field(u8, self_intent_count)
- __field(u8, read_count )
- __field(u8, intent_count )
- __field(u32, iter_lock_seq )
- __field(u32, node_lock_seq )
- ),
-
- TP_fast_assign(
- struct six_lock_count c;
-
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->btree_id = path->btree_id;
- __entry->level = level;
- __entry->path_idx = path - trans->paths;
- TRACE_BPOS_assign(pos, path->pos);
- __entry->locked = btree_node_locked(path, level);
-
- c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
- __entry->self_read_count = c.n[SIX_LOCK_read];
- __entry->self_intent_count = c.n[SIX_LOCK_intent];
- c = six_lock_counts(&path->l[level].b->c.lock);
- __entry->read_count = c.n[SIX_LOCK_read];
- __entry->intent_count = c.n[SIX_LOCK_intent];
- __entry->iter_lock_seq = path->l[level].lock_seq;
- __entry->node_lock_seq = is_btree_node(path, level)
- ? six_lock_seq(&path->l[level].b->c.lock)
- : 0;
- ),
-
- TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- __entry->path_idx,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot,
- __entry->level,
- __entry->locked,
- __entry->self_read_count,
- __entry->self_intent_count,
- __entry->read_count,
- __entry->intent_count,
- __entry->iter_lock_seq,
- __entry->node_lock_seq)
-);
-
-/* Garbage collection */
-
-DEFINE_EVENT(bch_fs, gc_gens_start,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_fs, gc_gens_end,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-/* Allocator */
-
-DEFINE_EVENT(fs_str, bucket_alloc,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, bucket_alloc_fail,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DECLARE_EVENT_CLASS(discard_buckets_class,
- TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
- u64 need_journal_commit, u64 discarded, const char *err),
- TP_ARGS(c, seen, open, need_journal_commit, discarded, err),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u64, seen )
- __field(u64, open )
- __field(u64, need_journal_commit )
- __field(u64, discarded )
- __array(char, err, 16 )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->seen = seen;
- __entry->open = open;
- __entry->need_journal_commit = need_journal_commit;
- __entry->discarded = discarded;
- strscpy(__entry->err, err, sizeof(__entry->err));
- ),
-
- TP_printk("%d,%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->seen,
- __entry->open,
- __entry->need_journal_commit,
- __entry->discarded,
- __entry->err)
-);
-
-DEFINE_EVENT(discard_buckets_class, discard_buckets,
- TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
- u64 need_journal_commit, u64 discarded, const char *err),
- TP_ARGS(c, seen, open, need_journal_commit, discarded, err)
-);
-
-DEFINE_EVENT(discard_buckets_class, discard_buckets_fast,
- TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
- u64 need_journal_commit, u64 discarded, const char *err),
- TP_ARGS(c, seen, open, need_journal_commit, discarded, err)
-);
-
-TRACE_EVENT(bucket_invalidate,
- TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors),
- TP_ARGS(c, dev, bucket, sectors),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u32, dev_idx )
- __field(u32, sectors )
- __field(u64, bucket )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->dev_idx = dev;
- __entry->sectors = sectors;
- __entry->bucket = bucket;
- ),
-
- TP_printk("%d:%d invalidated %u:%llu cached sectors %u",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->dev_idx, __entry->bucket,
- __entry->sectors)
-);
-
-/* Moving IO */
-
-DEFINE_EVENT(fs_str, io_move,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_read,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_write,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_finish,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_fail,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_write_fail,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_start_fail,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-TRACE_EVENT(move_data,
- TP_PROTO(struct bch_fs *c,
- struct bch_move_stats *stats),
- TP_ARGS(c, stats),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u64, keys_moved )
- __field(u64, keys_raced )
- __field(u64, sectors_seen )
- __field(u64, sectors_moved )
- __field(u64, sectors_raced )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->keys_moved = atomic64_read(&stats->keys_moved);
- __entry->keys_raced = atomic64_read(&stats->keys_raced);
- __entry->sectors_seen = atomic64_read(&stats->sectors_seen);
- __entry->sectors_moved = atomic64_read(&stats->sectors_moved);
- __entry->sectors_raced = atomic64_read(&stats->sectors_raced);
- ),
-
- TP_printk("%d,%d keys moved %llu raced %llu "
- "sectors seen %llu moved %llu raced %llu",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->keys_moved,
- __entry->keys_raced,
- __entry->sectors_seen,
- __entry->sectors_moved,
- __entry->sectors_raced)
-);
-
-TRACE_EVENT(copygc,
- TP_PROTO(struct bch_fs *c,
- u64 buckets,
- u64 sectors_seen,
- u64 sectors_moved),
- TP_ARGS(c, buckets, sectors_seen, sectors_moved),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u64, buckets )
- __field(u64, sectors_seen )
- __field(u64, sectors_moved )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->buckets = buckets;
- __entry->sectors_seen = sectors_seen;
- __entry->sectors_moved = sectors_moved;
- ),
-
- TP_printk("%d,%d buckets %llu sectors seen %llu moved %llu",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->buckets,
- __entry->sectors_seen,
- __entry->sectors_moved)
-);
-
-TRACE_EVENT(copygc_wait,
- TP_PROTO(struct bch_fs *c,
- u64 wait_amount, u64 until),
- TP_ARGS(c, wait_amount, until),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u64, wait_amount )
- __field(u64, until )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->wait_amount = wait_amount;
- __entry->until = until;
- ),
-
- TP_printk("%d,%u waiting for %llu sectors until %llu",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->wait_amount, __entry->until)
-);
-
-/* btree transactions: */
-
-DECLARE_EVENT_CLASS(transaction_event,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- ),
-
- TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
-);
-
-DEFINE_EVENT(transaction_event, transaction_commit,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
-);
-
-DEFINE_EVENT(transaction_event, trans_restart_injected,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
-);
-
-TRACE_EVENT(trans_restart_split_race,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree *b),
- TP_ARGS(trans, caller_ip, b),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(u8, level )
- __field(u16, written )
- __field(u16, blocks )
- __field(u16, u64s_remaining )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->level = b->c.level;
- __entry->written = b->written;
- __entry->blocks = btree_blocks(trans->c);
- __entry->u64s_remaining = bch2_btree_keys_u64s_remaining(b);
- ),
-
- TP_printk("%s %pS l=%u written %u/%u u64s remaining %u",
- __entry->trans_fn, (void *) __entry->caller_ip,
- __entry->level,
- __entry->written, __entry->blocks,
- __entry->u64s_remaining)
-);
-
-TRACE_EVENT(trans_blocked_journal_reclaim,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
-
- __field(unsigned long, key_cache_nr_keys )
- __field(unsigned long, key_cache_nr_dirty )
- __field(long, must_wait )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->key_cache_nr_keys = atomic_long_read(&trans->c->btree_key_cache.nr_keys);
- __entry->key_cache_nr_dirty = atomic_long_read(&trans->c->btree_key_cache.nr_dirty);
- __entry->must_wait = __bch2_btree_key_cache_must_wait(trans->c);
- ),
-
- TP_printk("%s %pS key cache keys %lu dirty %lu must_wait %li",
- __entry->trans_fn, (void *) __entry->caller_ip,
- __entry->key_cache_nr_keys,
- __entry->key_cache_nr_dirty,
- __entry->must_wait)
-);
-
-#if 0
-/* todo: bring back dynamic fault injection */
-DEFINE_EVENT(transaction_event, trans_restart_fault_inject,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
-);
-#endif
-
-DEFINE_EVENT(transaction_event, trans_traverse_all,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
-);
-
-DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
-);
-
-DEFINE_EVENT(trans_str, trans_restart_too_many_iters,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- const char *paths),
- TP_ARGS(trans, caller_ip, paths)
-);
-
-DECLARE_EVENT_CLASS(transaction_restart_iter,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(u8, btree_id )
- TRACE_BPOS_entries(pos)
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->btree_id = path->btree_id;
- TRACE_BPOS_assign(pos, path->pos);
- ),
-
- TP_printk("%s %pS btree %s pos %llu:%llu:%u",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(fs_str, trans_restart_upgrade,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(trans_str, trans_restart_relock,
- TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str),
- TP_ARGS(trans, caller_ip, str)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(trans_str_nocaller, trans_restart_would_deadlock,
- TP_PROTO(struct btree_trans *trans,
- const char *cycle),
- TP_ARGS(trans, cycle)
-);
-
-DEFINE_EVENT(transaction_event, trans_restart_would_deadlock_recursion_limit,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
-);
-
-TRACE_EVENT(trans_restart_would_deadlock_write,
- TP_PROTO(struct btree_trans *trans),
- TP_ARGS(trans),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- ),
-
- TP_printk("%s", __entry->trans_fn)
-);
-
-TRACE_EVENT(trans_restart_mem_realloced,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- unsigned long bytes),
- TP_ARGS(trans, caller_ip, bytes),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(unsigned long, bytes )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->bytes = bytes;
- ),
-
- TP_printk("%s %pS bytes %lu",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- __entry->bytes)
-);
-
-DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
-);
-
-TRACE_EVENT(path_downgrade,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path,
- unsigned old_locks_want),
- TP_ARGS(trans, caller_ip, path, old_locks_want),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(unsigned, old_locks_want )
- __field(unsigned, new_locks_want )
- __field(unsigned, btree )
- TRACE_BPOS_entries(pos)
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->old_locks_want = old_locks_want;
- __entry->new_locks_want = path->locks_want;
- __entry->btree = path->btree_id;
- TRACE_BPOS_assign(pos, path->pos);
- ),
-
- TP_printk("%s %pS locks_want %u -> %u %s %llu:%llu:%u",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- __entry->old_locks_want,
- __entry->new_locks_want,
- bch2_btree_id_str(__entry->btree),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot)
-);
-
-TRACE_EVENT(key_cache_fill,
- TP_PROTO(struct btree_trans *trans, const char *key),
- TP_ARGS(trans, key),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __string(key, key )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __assign_str(key);
- ),
-
- TP_printk("%s %s", __entry->trans_fn, __get_str(key))
-);
-
-TRACE_EVENT(write_buffer_flush,
- TP_PROTO(struct btree_trans *trans, size_t nr, size_t skipped, size_t fast, size_t size),
- TP_ARGS(trans, nr, skipped, fast, size),
-
- TP_STRUCT__entry(
- __field(size_t, nr )
- __field(size_t, skipped )
- __field(size_t, fast )
- __field(size_t, size )
- ),
-
- TP_fast_assign(
- __entry->nr = nr;
- __entry->skipped = skipped;
- __entry->fast = fast;
- __entry->size = size;
- ),
-
- TP_printk("%zu/%zu skipped %zu fast %zu",
- __entry->nr, __entry->size, __entry->skipped, __entry->fast)
-);
-
-TRACE_EVENT(write_buffer_flush_sync,
- TP_PROTO(struct btree_trans *trans, unsigned long caller_ip),
- TP_ARGS(trans, caller_ip),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- ),
-
- TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
-);
-
-TRACE_EVENT(write_buffer_flush_slowpath,
- TP_PROTO(struct btree_trans *trans, size_t slowpath, size_t total),
- TP_ARGS(trans, slowpath, total),
-
- TP_STRUCT__entry(
- __field(size_t, slowpath )
- __field(size_t, total )
- ),
-
- TP_fast_assign(
- __entry->slowpath = slowpath;
- __entry->total = total;
- ),
-
- TP_printk("%zu/%zu", __entry->slowpath, __entry->total)
-);
-
-TRACE_EVENT(write_buffer_maybe_flush,
- TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *key),
- TP_ARGS(trans, caller_ip, key),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __string(key, key )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __assign_str(key);
- ),
-
- TP_printk("%s %pS %s", __entry->trans_fn, (void *) __entry->caller_ip, __get_str(key))
-);
-
-DEFINE_EVENT(fs_str, rebalance_extent,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, data_update,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_pred,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_created_rebalance,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_evacuate_bucket,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, extent_trim_atomic,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, btree_iter_peek_slot,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, __btree_iter_peek,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, btree_iter_peek_max,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, btree_iter_peek_prev_min,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-#ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS
-
-TRACE_EVENT(update_by_path,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path,
- struct btree_insert_entry *i, bool overwrite),
- TP_ARGS(trans, path, i, overwrite),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(btree_path_idx_t, path_idx )
- __field(u8, btree_id )
- TRACE_BPOS_entries(pos)
- __field(u8, overwrite )
- __field(btree_path_idx_t, update_idx )
- __field(btree_path_idx_t, nr_updates )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->path_idx = path - trans->paths;
- __entry->btree_id = path->btree_id;
- TRACE_BPOS_assign(pos, path->pos);
- __entry->overwrite = overwrite;
- __entry->update_idx = i - trans->updates;
- __entry->nr_updates = trans->nr_updates;
- ),
-
- TP_printk("%s path %3u btree %s pos %llu:%llu:%u overwrite %u update %u/%u",
- __entry->trans_fn,
- __entry->path_idx,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot,
- __entry->overwrite,
- __entry->update_idx,
- __entry->nr_updates)
-);
-
-TRACE_EVENT(btree_path_lock,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_bkey_cached_common *b),
- TP_ARGS(trans, caller_ip, b),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(u8, btree_id )
- __field(u8, level )
- __array(char, node, 24 )
- __field(u32, lock_seq )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->btree_id = b->btree_id;
- __entry->level = b->level;
-
- scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
- __entry->lock_seq = six_lock_seq(&b->lock);
- ),
-
- TP_printk("%s %pS\nbtree %s level %u node %s lock seq %u",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- bch2_btree_id_str(__entry->btree_id),
- __entry->level,
- __entry->node,
- __entry->lock_seq)
-);
-
-DECLARE_EVENT_CLASS(btree_path_ev,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path),
- TP_ARGS(trans, path),
-
- TP_STRUCT__entry(
- __field(u16, idx )
- __field(u8, ref )
- __field(u8, btree_id )
- TRACE_BPOS_entries(pos)
- ),
-
- TP_fast_assign(
- __entry->idx = path - trans->paths;
- __entry->ref = path->ref;
- __entry->btree_id = path->btree_id;
- TRACE_BPOS_assign(pos, path->pos);
- ),
-
- TP_printk("path %3u ref %u btree %s pos %llu:%llu:%u",
- __entry->idx, __entry->ref,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot)
-);
-
-DEFINE_EVENT(btree_path_ev, btree_path_get_ll,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path),
- TP_ARGS(trans, path)
-);
-
-DEFINE_EVENT(btree_path_ev, btree_path_put_ll,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path),
- TP_ARGS(trans, path)
-);
-
-DEFINE_EVENT(btree_path_ev, btree_path_should_be_locked,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path),
- TP_ARGS(trans, path)
-);
-
-TRACE_EVENT(btree_path_alloc,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path),
- TP_ARGS(trans, path),
-
- TP_STRUCT__entry(
- __field(btree_path_idx_t, idx )
- __field(u8, locks_want )
- __field(u8, btree_id )
- TRACE_BPOS_entries(pos)
- ),
-
- TP_fast_assign(
- __entry->idx = path - trans->paths;
- __entry->locks_want = path->locks_want;
- __entry->btree_id = path->btree_id;
- TRACE_BPOS_assign(pos, path->pos);
- ),
-
- TP_printk("path %3u btree %s locks_want %u pos %llu:%llu:%u",
- __entry->idx,
- bch2_btree_id_str(__entry->btree_id),
- __entry->locks_want,
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot)
-);
-
-TRACE_EVENT(btree_path_get,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos),
- TP_ARGS(trans, path, new_pos),
-
- TP_STRUCT__entry(
- __field(btree_path_idx_t, idx )
- __field(u8, ref )
- __field(u8, preserve )
- __field(u8, locks_want )
- __field(u8, btree_id )
- TRACE_BPOS_entries(old_pos)
- TRACE_BPOS_entries(new_pos)
- ),
-
- TP_fast_assign(
- __entry->idx = path - trans->paths;
- __entry->ref = path->ref;
- __entry->preserve = path->preserve;
- __entry->locks_want = path->locks_want;
- __entry->btree_id = path->btree_id;
- TRACE_BPOS_assign(old_pos, path->pos);
- TRACE_BPOS_assign(new_pos, *new_pos);
- ),
-
- TP_printk(" path %3u ref %u preserve %u btree %s locks_want %u pos %llu:%llu:%u -> %llu:%llu:%u",
- __entry->idx,
- __entry->ref,
- __entry->preserve,
- bch2_btree_id_str(__entry->btree_id),
- __entry->locks_want,
- __entry->old_pos_inode,
- __entry->old_pos_offset,
- __entry->old_pos_snapshot,
- __entry->new_pos_inode,
- __entry->new_pos_offset,
- __entry->new_pos_snapshot)
-);
-
-DECLARE_EVENT_CLASS(btree_path_clone,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
- TP_ARGS(trans, path, new),
-
- TP_STRUCT__entry(
- __field(btree_path_idx_t, idx )
- __field(u8, new_idx )
- __field(u8, btree_id )
- __field(u8, ref )
- __field(u8, preserve )
- TRACE_BPOS_entries(pos)
- ),
-
- TP_fast_assign(
- __entry->idx = path - trans->paths;
- __entry->new_idx = new - trans->paths;
- __entry->btree_id = path->btree_id;
- __entry->ref = path->ref;
- __entry->preserve = path->preserve;
- TRACE_BPOS_assign(pos, path->pos);
- ),
-
- TP_printk(" path %3u ref %u preserve %u btree %s %llu:%llu:%u -> %u",
- __entry->idx,
- __entry->ref,
- __entry->preserve,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot,
- __entry->new_idx)
-);
-
-DEFINE_EVENT(btree_path_clone, btree_path_clone,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
- TP_ARGS(trans, path, new)
-);
-
-DEFINE_EVENT(btree_path_clone, btree_path_save_pos,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
- TP_ARGS(trans, path, new)
-);
-
-DECLARE_EVENT_CLASS(btree_path_traverse,
- TP_PROTO(struct btree_trans *trans,
- struct btree_path *path),
- TP_ARGS(trans, path),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(btree_path_idx_t, idx )
- __field(u8, ref )
- __field(u8, preserve )
- __field(u8, should_be_locked )
- __field(u8, btree_id )
- __field(u8, level )
- TRACE_BPOS_entries(pos)
- __field(u8, locks_want )
- __field(u8, nodes_locked )
- __array(char, node0, 24 )
- __array(char, node1, 24 )
- __array(char, node2, 24 )
- __array(char, node3, 24 )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-
- __entry->idx = path - trans->paths;
- __entry->ref = path->ref;
- __entry->preserve = path->preserve;
- __entry->btree_id = path->btree_id;
- __entry->level = path->level;
- TRACE_BPOS_assign(pos, path->pos);
-
- __entry->locks_want = path->locks_want;
- __entry->nodes_locked = path->nodes_locked;
- struct btree *b = path->l[0].b;
- if (IS_ERR(b))
- strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
- else
- scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c);
- b = path->l[1].b;
- if (IS_ERR(b))
- strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node1));
- else
- scnprintf(__entry->node1, sizeof(__entry->node1), "%px", &b->c);
- b = path->l[2].b;
- if (IS_ERR(b))
- strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node2));
- else
- scnprintf(__entry->node2, sizeof(__entry->node2), "%px", &b->c);
- b = path->l[3].b;
- if (IS_ERR(b))
- strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node3));
- else
- scnprintf(__entry->node3, sizeof(__entry->node3), "%px", &b->c);
- ),
-
- TP_printk("%s\npath %3u ref %u preserve %u btree %s %llu:%llu:%u level %u locks_want %u\n"
- "locks %u %u %u %u node %s %s %s %s",
- __entry->trans_fn,
- __entry->idx,
- __entry->ref,
- __entry->preserve,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot,
- __entry->level,
- __entry->locks_want,
- (__entry->nodes_locked >> 6) & 3,
- (__entry->nodes_locked >> 4) & 3,
- (__entry->nodes_locked >> 2) & 3,
- (__entry->nodes_locked >> 0) & 3,
- __entry->node3,
- __entry->node2,
- __entry->node1,
- __entry->node0)
-);
-
-DEFINE_EVENT(btree_path_traverse, btree_path_traverse_start,
- TP_PROTO(struct btree_trans *trans,
- struct btree_path *path),
- TP_ARGS(trans, path)
-);
-
-DEFINE_EVENT(btree_path_traverse, btree_path_traverse_end,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path),
- TP_ARGS(trans, path)
-);
-
-TRACE_EVENT(btree_path_set_pos,
- TP_PROTO(struct btree_trans *trans,
- struct btree_path *path,
- struct bpos *new_pos),
- TP_ARGS(trans, path, new_pos),
-
- TP_STRUCT__entry(
- __field(btree_path_idx_t, idx )
- __field(u8, ref )
- __field(u8, preserve )
- __field(u8, btree_id )
- TRACE_BPOS_entries(old_pos)
- TRACE_BPOS_entries(new_pos)
- __field(u8, locks_want )
- __field(u8, nodes_locked )
- __array(char, node0, 24 )
- __array(char, node1, 24 )
- __array(char, node2, 24 )
- __array(char, node3, 24 )
- ),
-
- TP_fast_assign(
- __entry->idx = path - trans->paths;
- __entry->ref = path->ref;
- __entry->preserve = path->preserve;
- __entry->btree_id = path->btree_id;
- TRACE_BPOS_assign(old_pos, path->pos);
- TRACE_BPOS_assign(new_pos, *new_pos);
-
- __entry->nodes_locked = path->nodes_locked;
- struct btree *b = path->l[0].b;
- if (IS_ERR(b))
- strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
- else
- scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c);
- b = path->l[1].b;
- if (IS_ERR(b))
- strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node1));
- else
- scnprintf(__entry->node1, sizeof(__entry->node1), "%px", &b->c);
- b = path->l[2].b;
- if (IS_ERR(b))
- strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node2));
- else
- scnprintf(__entry->node2, sizeof(__entry->node2), "%px", &b->c);
- b = path->l[3].b;
- if (IS_ERR(b))
- strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node3));
- else
- scnprintf(__entry->node3, sizeof(__entry->node3), "%px", &b->c);
- ),
-
- TP_printk("\npath %3u ref %u preserve %u btree %s %llu:%llu:%u -> %llu:%llu:%u\n"
- "locks %u %u %u %u node %s %s %s %s",
- __entry->idx,
- __entry->ref,
- __entry->preserve,
- bch2_btree_id_str(__entry->btree_id),
- __entry->old_pos_inode,
- __entry->old_pos_offset,
- __entry->old_pos_snapshot,
- __entry->new_pos_inode,
- __entry->new_pos_offset,
- __entry->new_pos_snapshot,
- (__entry->nodes_locked >> 6) & 3,
- (__entry->nodes_locked >> 4) & 3,
- (__entry->nodes_locked >> 2) & 3,
- (__entry->nodes_locked >> 0) & 3,
- __entry->node3,
- __entry->node2,
- __entry->node1,
- __entry->node0)
-);
-
-TRACE_EVENT(btree_path_free,
- TP_PROTO(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup),
- TP_ARGS(trans, path, dup),
-
- TP_STRUCT__entry(
- __field(btree_path_idx_t, idx )
- __field(u8, preserve )
- __field(u8, should_be_locked)
- __field(s8, dup )
- __field(u8, dup_locked )
- ),
-
- TP_fast_assign(
- __entry->idx = path;
- __entry->preserve = trans->paths[path].preserve;
- __entry->should_be_locked = trans->paths[path].should_be_locked;
- __entry->dup = dup ? dup - trans->paths : -1;
- __entry->dup_locked = dup ? btree_node_locked(dup, dup->level) : 0;
- ),
-
- TP_printk(" path %3u %c %c dup %2i locked %u", __entry->idx,
- __entry->preserve ? 'P' : ' ',
- __entry->should_be_locked ? 'S' : ' ',
- __entry->dup,
- __entry->dup_locked)
-);
-
-#else /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */
-#ifndef _TRACE_BCACHEFS_H
-
-static inline void trace_update_by_path(struct btree_trans *trans, struct btree_path *path,
- struct btree_insert_entry *i, bool overwrite) {}
-static inline void trace_btree_path_lock(struct btree_trans *trans, unsigned long caller_ip, struct btree_bkey_cached_common *b) {}
-static inline void trace_btree_path_get_ll(struct btree_trans *trans, struct btree_path *path) {}
-static inline void trace_btree_path_put_ll(struct btree_trans *trans, struct btree_path *path) {}
-static inline void trace_btree_path_should_be_locked(struct btree_trans *trans, struct btree_path *path) {}
-static inline void trace_btree_path_alloc(struct btree_trans *trans, struct btree_path *path) {}
-static inline void trace_btree_path_get(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {}
-static inline void trace_btree_path_clone(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {}
-static inline void trace_btree_path_save_pos(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {}
-static inline void trace_btree_path_traverse_start(struct btree_trans *trans, struct btree_path *path) {}
-static inline void trace_btree_path_traverse_end(struct btree_trans *trans, struct btree_path *path) {}
-static inline void trace_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {}
-static inline void trace_btree_path_free(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup) {}
-
-#endif
-#endif /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */
-
-#define _TRACE_BCACHEFS_H
-#endif /* _TRACE_BCACHEFS_H */
-
-/* This part must be outside protection */
-#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH ../../fs/bcachefs
-
-#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_FILE trace
-
-#include <trace/define_trace.h>
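For reference, the DECLARE_EVENT_CLASS()/TRACE_EVENT()/DEFINE_EVENT() definitions above expand into trace_<name>() helpers whose arguments follow the TP_PROTO() of each event. A minimal sketch of hypothetical call sites (the wrapper function and the use of _RET_IP_ are assumptions for illustration, not taken from this patch):

static void example_trace_call_sites(struct bch_fs *c, struct btree_trans *trans)
{
	/* TRACE_EVENT(journal_reclaim_finish): TP_PROTO(struct bch_fs *c, u64 nr_flushed) */
	trace_journal_reclaim_finish(c, 128);

	/* TRACE_EVENT(write_super): TP_PROTO(struct bch_fs *c, unsigned long ip) */
	trace_write_super(c, _RET_IP_);

	/* DEFINE_EVENT(transaction_event, trans_traverse_all): trans + caller ip */
	trace_trans_traverse_all(trans, _RET_IP_);
}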
diff --git a/fs/bcachefs/two_state_shared_lock.c b/fs/bcachefs/two_state_shared_lock.c
deleted file mode 100644
index 9764c2e6a910..000000000000
--- a/fs/bcachefs/two_state_shared_lock.c
+++ /dev/null
@@ -1,8 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "two_state_shared_lock.h"
-
-void __bch2_two_state_lock(two_state_lock_t *lock, int s)
-{
- __wait_event(lock->wait, bch2_two_state_trylock(lock, s));
-}
diff --git a/fs/bcachefs/two_state_shared_lock.h b/fs/bcachefs/two_state_shared_lock.h
deleted file mode 100644
index 7f647846b511..000000000000
--- a/fs/bcachefs/two_state_shared_lock.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_TWO_STATE_LOCK_H
-#define _BCACHEFS_TWO_STATE_LOCK_H
-
-#include <linux/atomic.h>
-#include <linux/sched.h>
-#include <linux/wait.h>
-
-#include "util.h"
-
-/*
- * Two-state lock - can be taken for add or block - both states are shared,
- * like read side of rwsem, but conflict with other state:
- */
-typedef struct {
- atomic_long_t v;
- wait_queue_head_t wait;
-} two_state_lock_t;
-
-static inline void two_state_lock_init(two_state_lock_t *lock)
-{
- atomic_long_set(&lock->v, 0);
- init_waitqueue_head(&lock->wait);
-}
-
-static inline void bch2_two_state_unlock(two_state_lock_t *lock, int s)
-{
- long i = s ? 1 : -1;
-
- EBUG_ON(atomic_long_read(&lock->v) == 0);
-
- if (atomic_long_sub_return_release(i, &lock->v) == 0)
- wake_up_all(&lock->wait);
-}
-
-static inline bool bch2_two_state_trylock(two_state_lock_t *lock, int s)
-{
- long i = s ? 1 : -1;
- long old;
-
- old = atomic_long_read(&lock->v);
- do {
- if (i > 0 ? old < 0 : old > 0)
- return false;
- } while (!atomic_long_try_cmpxchg_acquire(&lock->v, &old, old + i));
-
- return true;
-}
-
-void __bch2_two_state_lock(two_state_lock_t *, int);
-
-static inline void bch2_two_state_lock(two_state_lock_t *lock, int s)
-{
- if (!bch2_two_state_trylock(lock, s))
- __bch2_two_state_lock(lock, s);
-}
-
-#endif /* _BCACHEFS_TWO_STATE_LOCK_H */
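To make the two-state semantics concrete, a minimal usage sketch (the lock name and the work in between are hypothetical): any number of holders may share one state, but the two states exclude each other, much like two read-side classes of an rwsem that conflict with one another.

static two_state_lock_t example_lock;

static void example_two_state_usage(void)
{
	two_state_lock_init(&example_lock);

	/* state 0: shared with other state-0 holders, excludes state-1 holders */
	bch2_two_state_lock(&example_lock, 0);
	/* ... work that must not run concurrently with the other state ... */
	bch2_two_state_unlock(&example_lock, 0);

	/* state 1: shared with other state-1 holders, excludes state-0 holders */
	bch2_two_state_lock(&example_lock, 1);
	bch2_two_state_unlock(&example_lock, 1);
}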
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
deleted file mode 100644
index df9a6071fe18..000000000000
--- a/fs/bcachefs/util.c
+++ /dev/null
@@ -1,1047 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * random utility code, for bcache but in theory not specific to bcache
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <linux/console.h>
-#include <linux/ctype.h>
-#include <linux/debugfs.h>
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/log2.h>
-#include <linux/math64.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/random.h>
-#include <linux/seq_file.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/sched/clock.h>
-
-#include "eytzinger.h"
-#include "mean_and_variance.h"
-#include "util.h"
-
-static const char si_units[] = "?kMGTPEZY";
-
-/* string_get_size units: */
-static const char *const units_2[] = {
- "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"
-};
-static const char *const units_10[] = {
- "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"
-};
-
-static int parse_u64(const char *cp, u64 *res)
-{
- const char *start = cp;
- u64 v = 0;
-
- if (!isdigit(*cp))
- return -EINVAL;
-
- do {
- if (v > U64_MAX / 10)
- return -ERANGE;
- v *= 10;
- if (v > U64_MAX - (*cp - '0'))
- return -ERANGE;
- v += *cp - '0';
- cp++;
- } while (isdigit(*cp));
-
- *res = v;
- return cp - start;
-}
-
-static int bch2_pow(u64 n, u64 p, u64 *res)
-{
- *res = 1;
-
- while (p--) {
- if (*res > div64_u64(U64_MAX, n))
- return -ERANGE;
- *res *= n;
- }
- return 0;
-}
-
-static int parse_unit_suffix(const char *cp, u64 *res)
-{
- const char *start = cp;
- u64 base = 1024;
- unsigned u;
- int ret;
-
- if (*cp == ' ')
- cp++;
-
- for (u = 1; u < strlen(si_units); u++)
- if (*cp == si_units[u]) {
- cp++;
- goto got_unit;
- }
-
- for (u = 0; u < ARRAY_SIZE(units_2); u++)
- if (!strncmp(cp, units_2[u], strlen(units_2[u]))) {
- cp += strlen(units_2[u]);
- goto got_unit;
- }
-
- for (u = 0; u < ARRAY_SIZE(units_10); u++)
- if (!strncmp(cp, units_10[u], strlen(units_10[u]))) {
- cp += strlen(units_10[u]);
- base = 1000;
- goto got_unit;
- }
-
- *res = 1;
- return 0;
-got_unit:
- ret = bch2_pow(base, u, res);
- if (ret)
- return ret;
-
- return cp - start;
-}
-
-#define parse_or_ret(cp, _f) \
-do { \
- int _ret = _f; \
- if (_ret < 0) \
- return _ret; \
- cp += _ret; \
-} while (0)
-
-static int __bch2_strtou64_h(const char *cp, u64 *res)
-{
- const char *start = cp;
- u64 v = 0, b, f_n = 0, f_d = 1;
- int ret;
-
- parse_or_ret(cp, parse_u64(cp, &v));
-
- if (*cp == '.') {
- cp++;
- ret = parse_u64(cp, &f_n);
- if (ret < 0)
- return ret;
- cp += ret;
-
- ret = bch2_pow(10, ret, &f_d);
- if (ret)
- return ret;
- }
-
- parse_or_ret(cp, parse_unit_suffix(cp, &b));
-
- if (v > div64_u64(U64_MAX, b))
- return -ERANGE;
- v *= b;
-
- if (f_n > div64_u64(U64_MAX, b))
- return -ERANGE;
-
- f_n = div64_u64(f_n * b, f_d);
- if (v + f_n < v)
- return -ERANGE;
- v += f_n;
-
- *res = v;
- return cp - start;
-}
-
-static int __bch2_strtoh(const char *cp, u64 *res,
- u64 t_max, bool t_signed)
-{
- bool positive = *cp != '-';
- u64 v = 0;
-
- if (*cp == '+' || *cp == '-')
- cp++;
-
- parse_or_ret(cp, __bch2_strtou64_h(cp, &v));
-
- if (*cp == '\n')
- cp++;
- if (*cp)
- return -EINVAL;
-
- if (positive) {
- if (v > t_max)
- return -ERANGE;
- } else {
- if (v && !t_signed)
- return -ERANGE;
-
- if (v > t_max + 1)
- return -ERANGE;
- v = -v;
- }
-
- *res = v;
- return 0;
-}
-
-#define STRTO_H(name, type) \
-int bch2_ ## name ## _h(const char *cp, type *res) \
-{ \
- u64 v = 0; \
- int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \
- ANYSINT_MAX(type) != ((type) ~0ULL)); \
- *res = v; \
- return ret; \
-}
-
-STRTO_H(strtoint, int)
-STRTO_H(strtouint, unsigned int)
-STRTO_H(strtoll, long long)
-STRTO_H(strtoull, unsigned long long)
-STRTO_H(strtou64, u64)
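A few hypothetical inputs for the human-readable parsers generated above (the expected values follow from parse_u64(), the decimal-fraction handling and parse_unit_suffix(); the example function itself is not from this patch, and error returns are ignored for brevity):

static void example_strtoh_usage(void)
{
	u64 v;

	bch2_strtou64_h("512", &v);	/* v == 512 */
	bch2_strtou64_h("1k", &v);	/* v == 1024: single-letter SI suffix, base 1024 */
	bch2_strtou64_h("1.5G", &v);	/* v == 1610612736: 1.5 * 1024^3 */
	bch2_strtou64_h("16KiB", &v);	/* v == 16384: units_2 suffix */
}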
-
-u64 bch2_read_flag_list(const char *opt, const char * const list[])
-{
- u64 ret = 0;
- char *p, *s, *d = kstrdup(opt, GFP_KERNEL);
-
- if (!d)
- return -ENOMEM;
-
- s = strim(d);
-
- while ((p = strsep(&s, ",;"))) {
- int flag = match_string(list, -1, p);
-
- if (flag < 0) {
- ret = -1;
- break;
- }
-
- ret |= BIT_ULL(flag);
- }
-
- kfree(d);
-
- return ret;
-}
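A hypothetical caller of bch2_read_flag_list() (the flag-name list is made up for illustration): each name found in the comma- or semicolon-separated string sets the corresponding bit, and an unrecognized name fails the whole parse.

static const char * const example_flag_names[] = {
	"journal", "btree", "data", NULL
};

static void example_flag_list_usage(void)
{
	/* "journal" is bit 0, "data" is bit 2, so this yields 0b101 == 5 */
	u64 flags = bch2_read_flag_list("journal,data", example_flag_names);

	/* an unknown name makes the whole call return (u64) -1 */
	u64 bad = bch2_read_flag_list("journal,bogus", example_flag_names);

	(void) flags;
	(void) bad;
}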
-
-bool bch2_is_zero(const void *_p, size_t n)
-{
- const char *p = _p;
- size_t i;
-
- for (i = 0; i < n; i++)
- if (p[i])
- return false;
- return true;
-}
-
-void bch2_prt_u64_base2_nbits(struct printbuf *out, u64 v, unsigned nr_bits)
-{
- while (nr_bits)
- prt_char(out, '0' + ((v >> --nr_bits) & 1));
-}
-
-void bch2_prt_u64_base2(struct printbuf *out, u64 v)
-{
- bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1);
-}
-
-static bool string_is_spaces(const char *str)
-{
- while (*str) {
- if (*str != ' ')
- return false;
- str++;
- }
- return true;
-}
-
-void bch2_print_string_as_lines(const char *prefix, const char *lines)
-{
- bool locked = false;
- const char *p;
-
- if (!lines) {
- printk("%s (null)\n", prefix);
- return;
- }
-
- locked = console_trylock();
-
- while (*lines) {
- p = strchrnul(lines, '\n');
- if (!*p && string_is_spaces(lines))
- break;
-
- printk("%s%.*s\n", prefix, (int) (p - lines), lines);
- if (!*p)
- break;
- lines = p + 1;
- }
- if (locked)
- console_unlock();
-}
-
-int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr,
- gfp_t gfp)
-{
-#ifdef CONFIG_STACKTRACE
- unsigned nr_entries = 0;
-
- stack->nr = 0;
- int ret = darray_make_room_gfp(stack, 32, gfp);
- if (ret)
- return ret;
-
- if (!down_read_trylock(&task->signal->exec_update_lock))
- return -1;
-
- do {
- nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, skipnr + 1);
- } while (nr_entries == stack->size &&
- !(ret = darray_make_room_gfp(stack, stack->size * 2, gfp)));
-
- stack->nr = nr_entries;
- up_read(&task->signal->exec_update_lock);
-
- return ret;
-#else
- return 0;
-#endif
-}
-
-void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack)
-{
- darray_for_each(*stack, i) {
- prt_printf(out, "[<0>] %pB", (void *) *i);
- prt_newline(out);
- }
-}
-
-int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr, gfp_t gfp)
-{
- bch_stacktrace stack = { 0 };
- int ret = bch2_save_backtrace(&stack, task, skipnr + 1, gfp);
-
- bch2_prt_backtrace(out, &stack);
- darray_exit(&stack);
- return ret;
-}
-
-#ifndef __KERNEL__
-#include <time.h>
-void bch2_prt_datetime(struct printbuf *out, time64_t sec)
-{
- time_t t = sec;
- char buf[64];
- ctime_r(&t, buf);
- strim(buf);
- prt_str(out, buf);
-}
-#else
-void bch2_prt_datetime(struct printbuf *out, time64_t sec)
-{
- char buf[64];
- snprintf(buf, sizeof(buf), "%ptT", &sec);
- prt_u64(out, sec);
-}
-#endif
-
-void bch2_pr_time_units(struct printbuf *out, u64 ns)
-{
- const struct time_unit *u = bch2_pick_time_units(ns);
-
- prt_printf(out, "%llu %s", div64_u64(ns, u->nsecs), u->name);
-}
-
-static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
-{
- const struct time_unit *u = bch2_pick_time_units(ns);
-
- prt_printf(out, "%llu \r%s", div64_u64(ns, u->nsecs), u->name);
-}
-
-static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns)
-{
- prt_printf(out, "%s\t", name);
- bch2_pr_time_units_aligned(out, ns);
- prt_newline(out);
-}
-
-#define TABSTOP_SIZE 12
-
-void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
-{
- struct quantiles *quantiles = time_stats_to_quantiles(stats);
- s64 f_mean = 0, d_mean = 0;
- u64 f_stddev = 0, d_stddev = 0;
-
- if (stats->buffer) {
- int cpu;
-
- spin_lock_irq(&stats->lock);
- for_each_possible_cpu(cpu)
- __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
- spin_unlock_irq(&stats->lock);
- }
-
- /*
- * avoid divide by zero
- */
- if (stats->freq_stats.n) {
- f_mean = mean_and_variance_get_mean(stats->freq_stats);
- f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
- d_mean = mean_and_variance_get_mean(stats->duration_stats);
- d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
- }
-
- printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE);
- prt_printf(out, "count:\t%llu\n", stats->duration_stats.n);
- printbuf_tabstop_pop(out);
-
- printbuf_tabstops_reset(out);
-
- printbuf_tabstop_push(out, out->indent + 20);
- printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
- printbuf_tabstop_push(out, 0);
- printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
-
- prt_printf(out, "\tsince mount\r\trecent\r\n");
-
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, out->indent + 20);
- printbuf_tabstop_push(out, TABSTOP_SIZE);
- printbuf_tabstop_push(out, 2);
- printbuf_tabstop_push(out, TABSTOP_SIZE);
-
- prt_printf(out, "duration of events\n");
- printbuf_indent_add(out, 2);
-
- pr_name_and_units(out, "min:", stats->min_duration);
- pr_name_and_units(out, "max:", stats->max_duration);
- pr_name_and_units(out, "total:", stats->total_duration);
-
- prt_printf(out, "mean:\t");
- bch2_pr_time_units_aligned(out, d_mean);
- prt_tab(out);
- bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
- prt_newline(out);
-
- prt_printf(out, "stddev:\t");
- bch2_pr_time_units_aligned(out, d_stddev);
- prt_tab(out);
- bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
-
- printbuf_indent_sub(out, 2);
- prt_newline(out);
-
- prt_printf(out, "time between events\n");
- printbuf_indent_add(out, 2);
-
- pr_name_and_units(out, "min:", stats->min_freq);
- pr_name_and_units(out, "max:", stats->max_freq);
-
- prt_printf(out, "mean:\t");
- bch2_pr_time_units_aligned(out, f_mean);
- prt_tab(out);
- bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
- prt_newline(out);
-
- prt_printf(out, "stddev:\t");
- bch2_pr_time_units_aligned(out, f_stddev);
- prt_tab(out);
- bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
-
- printbuf_indent_sub(out, 2);
- prt_newline(out);
-
- printbuf_tabstops_reset(out);
-
- if (quantiles) {
- int i = eytzinger0_first(NR_QUANTILES);
- const struct time_unit *u =
- bch2_pick_time_units(quantiles->entries[i].m);
- u64 last_q = 0;
-
- prt_printf(out, "quantiles (%s):\t", u->name);
- eytzinger0_for_each(j, NR_QUANTILES) {
- bool is_last = eytzinger0_next(j, NR_QUANTILES) == -1;
-
- u64 q = max(quantiles->entries[j].m, last_q);
- prt_printf(out, "%llu ", div64_u64(q, u->nsecs));
- if (is_last)
- prt_newline(out);
- last_q = q;
- }
- }
-}
-
-/* ratelimit: */
-
-/**
- * bch2_ratelimit_delay() - return how long to delay until the next time to do
- * some work
- * @d: the struct bch_ratelimit to update
- * Returns: the amount of time to delay by, in jiffies
- */
-u64 bch2_ratelimit_delay(struct bch_ratelimit *d)
-{
- u64 now = local_clock();
-
- return time_after64(d->next, now)
- ? nsecs_to_jiffies(d->next - now)
- : 0;
-}
-
-/**
- * bch2_ratelimit_increment() - increment @d by the amount of work done
- * @d: the struct bch_ratelimit to update
- * @done: the amount of work done, in arbitrary units
- */
-void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done)
-{
- u64 now = local_clock();
-
- d->next += div_u64(done * NSEC_PER_SEC, d->rate);
-
- if (time_before64(now + NSEC_PER_SEC, d->next))
- d->next = now + NSEC_PER_SEC;
-
- if (time_after64(now - NSEC_PER_SEC * 2, d->next))
- d->next = now - NSEC_PER_SEC * 2;
-}
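A minimal sketch of the intended consumer pattern for the two helpers above (more_work_to_do() and do_work() are placeholders, not real bcachefs functions): sleep for the returned number of jiffies, do a batch of work, then account it so the next delay is computed against the configured rate.

static void example_rate_limited_loop(struct bch_ratelimit *d)
{
	while (more_work_to_do()) {
		u64 delay = bch2_ratelimit_delay(d);

		if (delay)
			schedule_timeout_interruptible(delay);

		/* do_work() returns the amount of work done, in the same units as d->rate */
		bch2_ratelimit_increment(d, do_work());
	}
}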
-
-/* pd controller: */
-
-/*
- * Updates pd_controller. Attempts to scale input values to units per second.
- * @target: desired value
- * @actual: current value
- *
- * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing
- * it makes actual go down.
- */
-void bch2_pd_controller_update(struct bch_pd_controller *pd,
- s64 target, s64 actual, int sign)
-{
- s64 proportional, derivative, change;
-
- unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ;
-
- if (seconds_since_update == 0)
- return;
-
- pd->last_update = jiffies;
-
- proportional = actual - target;
- proportional *= seconds_since_update;
- proportional = div_s64(proportional, pd->p_term_inverse);
-
- derivative = actual - pd->last_actual;
- derivative = div_s64(derivative, seconds_since_update);
- derivative = ewma_add(pd->smoothed_derivative, derivative,
- (pd->d_term / seconds_since_update) ?: 1);
- derivative = derivative * pd->d_term;
- derivative = div_s64(derivative, pd->p_term_inverse);
-
- change = proportional + derivative;
-
- /* Don't increase rate if not keeping up */
- if (change > 0 &&
- pd->backpressure &&
- time_after64(local_clock(),
- pd->rate.next + NSEC_PER_MSEC))
- change = 0;
-
- change *= (sign * -1);
-
- pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change,
- 1, UINT_MAX);
-
- pd->last_actual = actual;
- pd->last_derivative = derivative;
- pd->last_proportional = proportional;
- pd->last_change = change;
- pd->last_target = target;
-}
-
-void bch2_pd_controller_init(struct bch_pd_controller *pd)
-{
- pd->rate.rate = 1024;
- pd->last_update = jiffies;
- pd->p_term_inverse = 6000;
- pd->d_term = 30;
- pd->d_smooth = pd->d_term;
- pd->backpressure = 1;
-}
-
-void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_controller *pd)
-{
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 20);
-
- prt_printf(out, "rate:\t");
- prt_human_readable_s64(out, pd->rate.rate);
- prt_newline(out);
-
- prt_printf(out, "target:\t");
- prt_human_readable_u64(out, pd->last_target);
- prt_newline(out);
-
- prt_printf(out, "actual:\t");
- prt_human_readable_u64(out, pd->last_actual);
- prt_newline(out);
-
- prt_printf(out, "proportional:\t");
- prt_human_readable_s64(out, pd->last_proportional);
- prt_newline(out);
-
- prt_printf(out, "derivative:\t");
- prt_human_readable_s64(out, pd->last_derivative);
- prt_newline(out);
-
- prt_printf(out, "change:\t");
- prt_human_readable_s64(out, pd->last_change);
- prt_newline(out);
-
- prt_printf(out, "next io:\t%llims\n", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC));
-}
-
-/* misc: */
-
-void bch2_bio_map(struct bio *bio, void *base, size_t size)
-{
- while (size) {
- struct page *page = is_vmalloc_addr(base)
- ? vmalloc_to_page(base)
- : virt_to_page(base);
- unsigned offset = offset_in_page(base);
- unsigned len = min_t(size_t, PAGE_SIZE - offset, size);
-
- BUG_ON(!bio_add_page(bio, page, len, offset));
- size -= len;
- base += len;
- }
-}
-
-int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
-{
- while (size) {
- struct page *page = alloc_pages(gfp_mask, 0);
- unsigned len = min_t(size_t, PAGE_SIZE, size);
-
- if (!page)
- return -ENOMEM;
-
- if (unlikely(!bio_add_page(bio, page, len, 0))) {
- __free_page(page);
- break;
- }
-
- size -= len;
- }
-
- return 0;
-}
-
-u64 bch2_get_random_u64_below(u64 ceil)
-{
- if (ceil <= U32_MAX)
- return __get_random_u32_below(ceil);
-
- /* this is the same (clever) algorithm as in __get_random_u32_below() */
- u64 rand = get_random_u64();
- u64 mult = ceil * rand;
-
- if (unlikely(mult < ceil)) {
- u64 bound;
- div64_u64_rem(-ceil, ceil, &bound);
- while (unlikely(mult < bound)) {
- rand = get_random_u64();
- mult = ceil * rand;
- }
- }
-
- return mul_u64_u64_shr(ceil, rand, 64);
-}
-
-void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src)
-{
- struct bio_vec bv;
- struct bvec_iter iter;
-
- __bio_for_each_segment(bv, dst, iter, dst_iter) {
- void *dstp = kmap_local_page(bv.bv_page);
-
- memcpy(dstp + bv.bv_offset, src, bv.bv_len);
- kunmap_local(dstp);
-
- src += bv.bv_len;
- }
-}
-
-void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
-{
- struct bio_vec bv;
- struct bvec_iter iter;
-
- __bio_for_each_segment(bv, src, iter, src_iter) {
- void *srcp = kmap_local_page(bv.bv_page);
-
- memcpy(dst, srcp + bv.bv_offset, bv.bv_len);
- kunmap_local(srcp);
-
- dst += bv.bv_len;
- }
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_corrupt_bio(struct bio *bio)
-{
- struct bvec_iter iter;
- struct bio_vec bv;
- unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64));
-
- bio_for_each_segment(bv, bio, iter) {
- unsigned u64s = bv.bv_len / sizeof(u64);
-
- if (offset < u64s) {
- u64 *segment = bvec_kmap_local(&bv);
- segment[offset] = get_random_u64();
- kunmap_local(segment);
- return;
- }
- offset -= u64s;
- }
-}
-#endif
-
-void bch2_bio_to_text(struct printbuf *out, struct bio *bio)
-{
- prt_printf(out, "bi_remaining:\t%u\n",
- atomic_read(&bio->__bi_remaining));
- prt_printf(out, "bi_end_io:\t%ps\n",
- bio->bi_end_io);
- prt_printf(out, "bi_status:\t%u\n",
- bio->bi_status);
-}
-
-#if 0
-void eytzinger1_test(void)
-{
- unsigned inorder, size;
-
- pr_info("1 based eytzinger test:\n");
-
- for (size = 2;
- size < 65536;
- size++) {
- unsigned extra = eytzinger1_extra(size);
-
- if (!(size % 4096))
- pr_info("tree size %u\n", size);
-
- inorder = 1;
- eytzinger1_for_each(eytz, size) {
- BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz);
- BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder);
- BUG_ON(eytz != eytzinger1_last(size) &&
- eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz);
-
- inorder++;
- }
- BUG_ON(inorder - 1 != size);
- }
-}
-
-void eytzinger0_test(void)
-{
-
- unsigned inorder, size;
-
- pr_info("0 based eytzinger test:\n");
-
- for (size = 1;
- size < 65536;
- size++) {
- unsigned extra = eytzinger0_extra(size);
-
- if (!(size % 4096))
- pr_info("tree size %u\n", size);
-
- inorder = 0;
- eytzinger0_for_each(eytz, size) {
- BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz);
- BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder);
- BUG_ON(eytz != eytzinger0_last(size) &&
- eytzinger0_prev(eytzinger0_next(eytz, size), size) != eytz);
-
- inorder++;
- }
- BUG_ON(inorder != size);
-
- inorder = size - 1;
- eytzinger0_for_each_prev(eytz, size) {
- BUG_ON(eytz != eytzinger0_first(size) &&
- eytzinger0_next(eytzinger0_prev(eytz, size), size) != eytz);
-
- inorder--;
- }
- BUG_ON(inorder != -1);
- }
-}
-
-static inline int cmp_u16(const void *_l, const void *_r)
-{
- const u16 *l = _l, *r = _r;
-
- return (*l > *r) - (*r > *l);
-}
-
-static void eytzinger0_find_test_le(u16 *test_array, unsigned nr, u16 search)
-{
- int r, s;
- bool bad;
-
- r = eytzinger0_find_le(test_array, nr,
- sizeof(test_array[0]),
- cmp_u16, &search);
- if (r >= 0) {
- if (test_array[r] > search) {
- bad = true;
- } else {
- s = eytzinger0_next(r, nr);
- bad = s >= 0 && test_array[s] <= search;
- }
- } else {
- s = eytzinger0_last(nr);
- bad = s >= 0 && test_array[s] <= search;
- }
-
- if (bad) {
- s = -1;
- eytzinger0_for_each_prev(j, nr) {
- if (test_array[j] <= search) {
- s = j;
- break;
- }
- }
-
- eytzinger0_for_each(j, nr)
- pr_info("[%3u] = %12u\n", j, test_array[j]);
- pr_info("find_le(%12u) = %3i should be %3i\n",
- search, r, s);
- BUG();
- }
-}
-
-static void eytzinger0_find_test_gt(u16 *test_array, unsigned nr, u16 search)
-{
- int r, s;
- bool bad;
-
- r = eytzinger0_find_gt(test_array, nr,
- sizeof(test_array[0]),
- cmp_u16, &search);
- if (r >= 0) {
- if (test_array[r] <= search) {
- bad = true;
- } else {
- s = eytzinger0_prev(r, nr);
- bad = s >= 0 && test_array[s] > search;
- }
- } else {
- s = eytzinger0_first(nr);
- bad = s >= 0 && test_array[s] > search;
- }
-
- if (bad) {
- s = -1;
- eytzinger0_for_each(j, nr) {
- if (test_array[j] > search) {
- s = j;
- break;
- }
- }
-
- eytzinger0_for_each(j, nr)
- pr_info("[%3u] = %12u\n", j, test_array[j]);
- pr_info("find_gt(%12u) = %3i should be %3i\n",
- search, r, s);
- BUG();
- }
-}
-
-static void eytzinger0_find_test_ge(u16 *test_array, unsigned nr, u16 search)
-{
- int r, s;
- bool bad;
-
- r = eytzinger0_find_ge(test_array, nr,
- sizeof(test_array[0]),
- cmp_u16, &search);
- if (r >= 0) {
- if (test_array[r] < search) {
- bad = true;
- } else {
- s = eytzinger0_prev(r, nr);
- bad = s >= 0 && test_array[s] >= search;
- }
- } else {
- s = eytzinger0_first(nr);
- bad = s >= 0 && test_array[s] >= search;
- }
-
- if (bad) {
- s = -1;
- eytzinger0_for_each(j, nr) {
- if (test_array[j] >= search) {
- s = j;
- break;
- }
- }
-
- eytzinger0_for_each(j, nr)
- pr_info("[%3u] = %12u\n", j, test_array[j]);
- pr_info("find_ge(%12u) = %3i should be %3i\n",
- search, r, s);
- BUG();
- }
-}
-
-static void eytzinger0_find_test_eq(u16 *test_array, unsigned nr, u16 search)
-{
- unsigned r;
- int s;
- bool bad;
-
- r = eytzinger0_find(test_array, nr,
- sizeof(test_array[0]),
- cmp_u16, &search);
-
- if (r < nr) {
- bad = test_array[r] != search;
- } else {
- s = eytzinger0_find_le(test_array, nr,
- sizeof(test_array[0]),
- cmp_u16, &search);
- bad = s >= 0 && test_array[s] == search;
- }
-
- if (bad) {
- eytzinger0_for_each(j, nr)
- pr_info("[%3u] = %12u\n", j, test_array[j]);
- pr_info("find(%12u) = %3i is incorrect\n",
- search, r);
- BUG();
- }
-}
-
-static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search)
-{
- eytzinger0_find_test_le(test_array, nr, search);
- eytzinger0_find_test_gt(test_array, nr, search);
- eytzinger0_find_test_ge(test_array, nr, search);
- eytzinger0_find_test_eq(test_array, nr, search);
-}
-
-void eytzinger0_find_test(void)
-{
- unsigned i, nr, allocated = 1 << 12;
- u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL);
-
- for (nr = 1; nr < allocated; nr++) {
- u16 prev = 0;
-
- pr_info("testing %u elems\n", nr);
-
- get_random_bytes(test_array, nr * sizeof(test_array[0]));
- eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL);
-
- /* verify array is sorted correctly: */
- eytzinger0_for_each(j, nr) {
- BUG_ON(test_array[j] < prev);
- prev = test_array[j];
- }
-
- for (i = 0; i < U16_MAX; i += 1 << 12)
- eytzinger0_find_test_val(test_array, nr, i);
-
- for (i = 0; i < nr; i++) {
- eytzinger0_find_test_val(test_array, nr, test_array[i] - 1);
- eytzinger0_find_test_val(test_array, nr, test_array[i]);
- eytzinger0_find_test_val(test_array, nr, test_array[i] + 1);
- }
- }
-
- kfree(test_array);
-}
-#endif
-
-/*
- * Accumulate percpu counters onto one cpu's copy - only valid when concurrent
- * access to the percpu counters is excluded by the caller's locking
- */
-u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr)
-{
- u64 *ret;
- int cpu;
-
- /* access to pcpu vars has to be blocked by other locking */
- preempt_disable();
- ret = this_cpu_ptr(p);
- preempt_enable();
-
- for_each_possible_cpu(cpu) {
- u64 *i = per_cpu_ptr(p, cpu);
-
- if (i != ret) {
- acc_u64s(ret, i, nr);
- memset(i, 0, nr * sizeof(u64));
- }
- }
-
- return ret;
-}
-
-void bch2_darray_str_exit(darray_const_str *d)
-{
- darray_for_each(*d, i)
- kfree(*i);
- darray_exit(d);
-}
-
-int bch2_split_devs(const char *_dev_name, darray_const_str *ret)
-{
- darray_init(ret);
-
- char *dev_name, *s, *orig;
-
- dev_name = orig = kstrdup(_dev_name, GFP_KERNEL);
- if (!dev_name)
- return -ENOMEM;
-
- while ((s = strsep(&dev_name, ":"))) {
- char *p = kstrdup(s, GFP_KERNEL);
- if (!p)
- goto err;
-
- if (darray_push(ret, p)) {
- kfree(p);
- goto err;
- }
- }
-
- kfree(orig);
- return 0;
-err:
- bch2_darray_str_exit(ret);
- kfree(orig);
- return -ENOMEM;
-}
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
deleted file mode 100644
index 6488f098d140..000000000000
--- a/fs/bcachefs/util.h
+++ /dev/null
@@ -1,782 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_UTIL_H
-#define _BCACHEFS_UTIL_H
-
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <linux/closure.h>
-#include <linux/errno.h>
-#include <linux/freezer.h>
-#include <linux/kernel.h>
-#include <linux/min_heap.h>
-#include <linux/sched/clock.h>
-#include <linux/llist.h>
-#include <linux/log2.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/random.h>
-#include <linux/ratelimit.h>
-#include <linux/slab.h>
-#include <linux/sort.h>
-#include <linux/vmalloc.h>
-#include <linux/workqueue.h>
-
-#include "mean_and_variance.h"
-
-#include "darray.h"
-#include "time_stats.h"
-
-struct closure;
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define EBUG_ON(cond) BUG_ON(cond)
-#else
-#define EBUG_ON(cond)
-#endif
-
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-#define CPU_BIG_ENDIAN 0
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-#define CPU_BIG_ENDIAN 1
-#endif
-
-/* type hackery */
-
-#define type_is_exact(_val, _type) \
- __builtin_types_compatible_p(typeof(_val), _type)
-
-#define type_is(_val, _type) \
- (__builtin_types_compatible_p(typeof(_val), _type) || \
- __builtin_types_compatible_p(typeof(_val), const _type))
-
-/* Userspace doesn't align allocations as nicely as the kernel allocators: */
-static inline size_t buf_pages(void *p, size_t len)
-{
- return DIV_ROUND_UP(len +
- ((unsigned long) p & (PAGE_SIZE - 1)),
- PAGE_SIZE);
-}
-
-static inline void *bch2_kvmalloc_noprof(size_t n, gfp_t flags)
-{
- void *p = unlikely(n >= INT_MAX)
- ? vmalloc_noprof(n)
- : kvmalloc_noprof(n, flags & ~__GFP_ZERO);
- if (p && (flags & __GFP_ZERO))
- memset(p, 0, n);
- return p;
-}
-#define bch2_kvmalloc(...) alloc_hooks(bch2_kvmalloc_noprof(__VA_ARGS__))
-
-#define init_heap(heap, _size, gfp) \
-({ \
- (heap)->nr = 0; \
- (heap)->size = (_size); \
- (heap)->data = kvmalloc((heap)->size * sizeof((heap)->data[0]),\
- (gfp)); \
-})
-
-#define free_heap(heap) \
-do { \
- kvfree((heap)->data); \
- (heap)->data = NULL; \
-} while (0)
-
-#define ANYSINT_MAX(t) \
- ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
-
-#include "printbuf.h"
-
-#define prt_vprintf(_out, ...) bch2_prt_vprintf(_out, __VA_ARGS__)
-#define prt_printf(_out, ...) bch2_prt_printf(_out, __VA_ARGS__)
-#define printbuf_str(_buf) bch2_printbuf_str(_buf)
-#define printbuf_exit(_buf) bch2_printbuf_exit(_buf)
-
-#define printbuf_tabstops_reset(_buf) bch2_printbuf_tabstops_reset(_buf)
-#define printbuf_tabstop_pop(_buf) bch2_printbuf_tabstop_pop(_buf)
-#define printbuf_tabstop_push(_buf, _n) bch2_printbuf_tabstop_push(_buf, _n)
-
-#define printbuf_indent_add(_out, _n) bch2_printbuf_indent_add(_out, _n)
-#define printbuf_indent_add_nextline(_out, _n) bch2_printbuf_indent_add_nextline(_out, _n)
-#define printbuf_indent_sub(_out, _n) bch2_printbuf_indent_sub(_out, _n)
-
-#define prt_newline(_out) bch2_prt_newline(_out)
-#define prt_tab(_out) bch2_prt_tab(_out)
-#define prt_tab_rjust(_out) bch2_prt_tab_rjust(_out)
-
-#define prt_bytes_indented(...) bch2_prt_bytes_indented(__VA_ARGS__)
-#define prt_u64(_out, _v) prt_printf(_out, "%llu", (u64) (_v))
-#define prt_human_readable_u64(...) bch2_prt_human_readable_u64(__VA_ARGS__)
-#define prt_human_readable_s64(...) bch2_prt_human_readable_s64(__VA_ARGS__)
-#define prt_units_u64(...) bch2_prt_units_u64(__VA_ARGS__)
-#define prt_units_s64(...) bch2_prt_units_s64(__VA_ARGS__)
-#define prt_string_option(...) bch2_prt_string_option(__VA_ARGS__)
-#define prt_bitflags(...) bch2_prt_bitflags(__VA_ARGS__)
-#define prt_bitflags_vector(...) bch2_prt_bitflags_vector(__VA_ARGS__)
-
-void bch2_pr_time_units(struct printbuf *, u64);
-void bch2_prt_datetime(struct printbuf *, time64_t);
-
-#ifdef __KERNEL__
-static inline void uuid_unparse_lower(u8 *uuid, char *out)
-{
- sprintf(out, "%pUb", uuid);
-}
-#else
-#include <uuid/uuid.h>
-#endif
-
-static inline void pr_uuid(struct printbuf *out, u8 *uuid)
-{
- char uuid_str[40];
-
- uuid_unparse_lower(uuid, uuid_str);
- prt_printf(out, "%s", uuid_str);
-}
-
-int bch2_strtoint_h(const char *, int *);
-int bch2_strtouint_h(const char *, unsigned int *);
-int bch2_strtoll_h(const char *, long long *);
-int bch2_strtoull_h(const char *, unsigned long long *);
-int bch2_strtou64_h(const char *, u64 *);
-
-static inline int bch2_strtol_h(const char *cp, long *res)
-{
-#if BITS_PER_LONG == 32
- return bch2_strtoint_h(cp, (int *) res);
-#else
- return bch2_strtoll_h(cp, (long long *) res);
-#endif
-}
-
-static inline int bch2_strtoul_h(const char *cp, long *res)
-{
-#if BITS_PER_LONG == 32
- return bch2_strtouint_h(cp, (unsigned int *) res);
-#else
- return bch2_strtoull_h(cp, (unsigned long long *) res);
-#endif
-}
-
-#define strtoi_h(cp, res) \
- ( type_is(*res, int) ? bch2_strtoint_h(cp, (void *) res)\
- : type_is(*res, long) ? bch2_strtol_h(cp, (void *) res)\
- : type_is(*res, long long) ? bch2_strtoll_h(cp, (void *) res)\
- : type_is(*res, unsigned) ? bch2_strtouint_h(cp, (void *) res)\
- : type_is(*res, unsigned long) ? bch2_strtoul_h(cp, (void *) res)\
- : type_is(*res, unsigned long long) ? bch2_strtoull_h(cp, (void *) res)\
- : -EINVAL)
-
-#define strtoul_safe(cp, var) \
-({ \
- unsigned long _v; \
- int _r = kstrtoul(cp, 10, &_v); \
- if (!_r) \
- var = _v; \
- _r; \
-})
-
-#define strtoul_safe_clamp(cp, var, min, max) \
-({ \
- unsigned long _v; \
- int _r = kstrtoul(cp, 10, &_v); \
- if (!_r) \
- var = clamp_t(typeof(var), _v, min, max); \
- _r; \
-})
-
-#define strtoul_safe_restrict(cp, var, min, max) \
-({ \
- unsigned long _v; \
- int _r = kstrtoul(cp, 10, &_v); \
- if (!_r && _v >= min && _v <= max) \
- var = _v; \
- else \
- _r = -EINVAL; \
- _r; \
-})
-
-#define snprint(out, var) \
- prt_printf(out, \
- type_is(var, int) ? "%i\n" \
- : type_is(var, unsigned) ? "%u\n" \
- : type_is(var, long) ? "%li\n" \
- : type_is(var, unsigned long) ? "%lu\n" \
- : type_is(var, s64) ? "%lli\n" \
- : type_is(var, u64) ? "%llu\n" \
- : type_is(var, char *) ? "%s\n" \
- : "%i\n", var)
-
-bool bch2_is_zero(const void *, size_t);
-
-u64 bch2_read_flag_list(const char *, const char * const[]);
-
-void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned);
-void bch2_prt_u64_base2(struct printbuf *, u64);
-
-void bch2_print_string_as_lines(const char *, const char *);
-
-typedef DARRAY(unsigned long) bch_stacktrace;
-int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t);
-void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *);
-int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *, unsigned, gfp_t);
-
-static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev)
-{
-#ifdef __KERNEL__
- prt_printf(out, "%pg", bdev);
-#else
- prt_str(out, bdev->name);
-#endif
-}
-
-void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
-
-#define ewma_add(ewma, val, weight) \
-({ \
- typeof(ewma) _ewma = (ewma); \
- typeof(weight) _weight = (weight); \
- \
- (((_ewma << _weight) - _ewma) + (val)) >> _weight; \
-})
-
-struct bch_ratelimit {
- /* Next time we want to do some work, in nanoseconds */
- u64 next;
-
- /*
-	 * Rate at which we want to do work, in units per second.
- * The units here correspond to the units passed to
- * bch2_ratelimit_increment()
- */
- unsigned rate;
-};
-
-static inline void bch2_ratelimit_reset(struct bch_ratelimit *d)
-{
- d->next = local_clock();
-}
-
-u64 bch2_ratelimit_delay(struct bch_ratelimit *);
-void bch2_ratelimit_increment(struct bch_ratelimit *, u64);
-
-struct bch_pd_controller {
- struct bch_ratelimit rate;
- unsigned long last_update;
-
- s64 last_actual;
- s64 smoothed_derivative;
-
- unsigned p_term_inverse;
- unsigned d_smooth;
- unsigned d_term;
-
- /* for exporting to sysfs (no effect on behavior) */
- s64 last_derivative;
- s64 last_proportional;
- s64 last_change;
- s64 last_target;
-
- /*
- * If true, the rate will not increase if bch2_ratelimit_delay()
- * is not being called often enough.
- */
- bool backpressure;
-};
-
-void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int);
-void bch2_pd_controller_init(struct bch_pd_controller *);
-void bch2_pd_controller_debug_to_text(struct printbuf *, struct bch_pd_controller *);
-
-#define sysfs_pd_controller_attribute(name) \
- rw_attribute(name##_rate); \
- rw_attribute(name##_rate_bytes); \
- rw_attribute(name##_rate_d_term); \
- rw_attribute(name##_rate_p_term_inverse); \
- read_attribute(name##_rate_debug)
-
-#define sysfs_pd_controller_files(name) \
- &sysfs_##name##_rate, \
- &sysfs_##name##_rate_bytes, \
- &sysfs_##name##_rate_d_term, \
- &sysfs_##name##_rate_p_term_inverse, \
- &sysfs_##name##_rate_debug
-
-#define sysfs_pd_controller_show(name, var) \
-do { \
- sysfs_hprint(name##_rate, (var)->rate.rate); \
- sysfs_print(name##_rate_bytes, (var)->rate.rate); \
- sysfs_print(name##_rate_d_term, (var)->d_term); \
- sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \
- \
- if (attr == &sysfs_##name##_rate_debug) \
- bch2_pd_controller_debug_to_text(out, var); \
-} while (0)
-
-#define sysfs_pd_controller_store(name, var) \
-do { \
- sysfs_strtoul_clamp(name##_rate, \
- (var)->rate.rate, 1, UINT_MAX); \
- sysfs_strtoul_clamp(name##_rate_bytes, \
- (var)->rate.rate, 1, UINT_MAX); \
- sysfs_strtoul(name##_rate_d_term, (var)->d_term); \
- sysfs_strtoul_clamp(name##_rate_p_term_inverse, \
- (var)->p_term_inverse, 1, INT_MAX); \
-} while (0)
-
-#define container_of_or_null(ptr, type, member) \
-({ \
- typeof(ptr) _ptr = ptr; \
- _ptr ? container_of(_ptr, type, member) : NULL; \
-})
-
-static inline struct list_head *list_pop(struct list_head *head)
-{
- if (list_empty(head))
- return NULL;
-
- struct list_head *ret = head->next;
- list_del_init(ret);
- return ret;
-}
-
-#define list_pop_entry(head, type, member) \
- container_of_or_null(list_pop(head), type, member)
-
-/* Does linear interpolation between powers of two */
-static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
-{
- unsigned fract = x & ~(~0 << fract_bits);
-
- x >>= fract_bits;
- x = 1 << x;
- x += (x * fract) >> fract_bits;
-
- return x;
-}
-
-void bch2_bio_map(struct bio *bio, void *base, size_t);
-int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t);
-
-#define closure_bio_submit(bio, cl) \
-do { \
- closure_get(cl); \
- submit_bio(bio); \
-} while (0)
-
-#define kthread_wait(cond) \
-({ \
- int _ret = 0; \
- \
- while (1) { \
- set_current_state(TASK_INTERRUPTIBLE); \
- if (kthread_should_stop()) { \
- _ret = -1; \
- break; \
- } \
- \
- if (cond) \
- break; \
- \
- schedule(); \
- } \
- set_current_state(TASK_RUNNING); \
- _ret; \
-})
-
-#define kthread_wait_freezable(cond) \
-({ \
- int _ret = 0; \
- while (1) { \
- set_current_state(TASK_INTERRUPTIBLE); \
- if (kthread_should_stop()) { \
- _ret = -1; \
- break; \
- } \
- \
- if (cond) \
- break; \
- \
- schedule(); \
- try_to_freeze(); \
- } \
- set_current_state(TASK_RUNNING); \
- _ret; \
-})
-
-u64 bch2_get_random_u64_below(u64);
-
-void memcpy_to_bio(struct bio *, struct bvec_iter, const void *);
-void memcpy_from_bio(void *, struct bio *, struct bvec_iter);
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_corrupt_bio(struct bio *);
-
-static inline void bch2_maybe_corrupt_bio(struct bio *bio, unsigned ratio)
-{
- if (ratio && !get_random_u32_below(ratio))
- bch2_corrupt_bio(bio);
-}
-#else
-#define bch2_maybe_corrupt_bio(...) do {} while (0)
-#endif
-
-void bch2_bio_to_text(struct printbuf *, struct bio *);
-
-static inline void memcpy_u64s_small(void *dst, const void *src,
- unsigned u64s)
-{
- u64 *d = dst;
- const u64 *s = src;
-
- while (u64s--)
- *d++ = *s++;
-}
-
-static inline void __memcpy_u64s(void *dst, const void *src,
- unsigned u64s)
-{
-#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN)
- long d0, d1, d2;
-
- asm volatile("rep ; movsq"
- : "=&c" (d0), "=&D" (d1), "=&S" (d2)
- : "0" (u64s), "1" (dst), "2" (src)
- : "memory");
-#else
- u64 *d = dst;
- const u64 *s = src;
-
- while (u64s--)
- *d++ = *s++;
-#endif
-}
-
-static inline void memcpy_u64s(void *dst, const void *src,
- unsigned u64s)
-{
- EBUG_ON(!(dst >= src + u64s * sizeof(u64) ||
- dst + u64s * sizeof(u64) <= src));
-
- __memcpy_u64s(dst, src, u64s);
-}
-
-static inline void __memmove_u64s_down(void *dst, const void *src,
- unsigned u64s)
-{
- __memcpy_u64s(dst, src, u64s);
-}
-
-static inline void memmove_u64s_down(void *dst, const void *src,
- unsigned u64s)
-{
- EBUG_ON(dst > src);
-
- __memmove_u64s_down(dst, src, u64s);
-}
-
-static inline void __memmove_u64s_down_small(void *dst, const void *src,
- unsigned u64s)
-{
- memcpy_u64s_small(dst, src, u64s);
-}
-
-static inline void memmove_u64s_down_small(void *dst, const void *src,
- unsigned u64s)
-{
- EBUG_ON(dst > src);
-
- __memmove_u64s_down_small(dst, src, u64s);
-}
-
-static inline void __memmove_u64s_up_small(void *_dst, const void *_src,
- unsigned u64s)
-{
- u64 *dst = (u64 *) _dst + u64s;
- u64 *src = (u64 *) _src + u64s;
-
- while (u64s--)
- *--dst = *--src;
-}
-
-static inline void memmove_u64s_up_small(void *dst, const void *src,
- unsigned u64s)
-{
- EBUG_ON(dst < src);
-
- __memmove_u64s_up_small(dst, src, u64s);
-}
-
-static inline void __memmove_u64s_up(void *_dst, const void *_src,
- unsigned u64s)
-{
- u64 *dst = (u64 *) _dst + u64s - 1;
- u64 *src = (u64 *) _src + u64s - 1;
-
-#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN)
- long d0, d1, d2;
-
- asm volatile("std ;\n"
- "rep ; movsq\n"
- "cld ;\n"
- : "=&c" (d0), "=&D" (d1), "=&S" (d2)
- : "0" (u64s), "1" (dst), "2" (src)
- : "memory");
-#else
- while (u64s--)
- *dst-- = *src--;
-#endif
-}
-
-static inline void memmove_u64s_up(void *dst, const void *src,
- unsigned u64s)
-{
- EBUG_ON(dst < src);
-
- __memmove_u64s_up(dst, src, u64s);
-}
-
-static inline void memmove_u64s(void *dst, const void *src,
- unsigned u64s)
-{
- if (dst < src)
- __memmove_u64s_down(dst, src, u64s);
- else
- __memmove_u64s_up(dst, src, u64s);
-}
-
-/* Set the last few bytes up to a u64 boundary given an offset into a buffer. */
-static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
-{
- unsigned rem = round_up(bytes, sizeof(u64)) - bytes;
-
- memset(s + bytes, c, rem);
-}
-
-/* just the memmove, doesn't update @_nr */
-#define __array_insert_item(_array, _nr, _pos) \
- memmove(&(_array)[(_pos) + 1], \
- &(_array)[(_pos)], \
- sizeof((_array)[0]) * ((_nr) - (_pos)))
-
-#define array_insert_item(_array, _nr, _pos, _new_item) \
-do { \
- __array_insert_item(_array, _nr, _pos); \
- (_nr)++; \
- (_array)[(_pos)] = (_new_item); \
-} while (0)
-
-#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \
-do { \
- (_nr) -= (_nr_to_remove); \
- memmove(&(_array)[(_pos)], \
- &(_array)[(_pos) + (_nr_to_remove)], \
- sizeof((_array)[0]) * ((_nr) - (_pos))); \
-} while (0)
-
-#define array_remove_item(_array, _nr, _pos) \
- array_remove_items(_array, _nr, _pos, 1)
-
-static inline void __move_gap(void *array, size_t element_size,
- size_t nr, size_t size,
- size_t old_gap, size_t new_gap)
-{
- size_t gap_end = old_gap + size - nr;
-
- if (new_gap < old_gap) {
- size_t move = old_gap - new_gap;
-
- memmove(array + element_size * (gap_end - move),
- array + element_size * (old_gap - move),
- element_size * move);
- } else if (new_gap > old_gap) {
- size_t move = new_gap - old_gap;
-
- memmove(array + element_size * old_gap,
- array + element_size * gap_end,
- element_size * move);
- }
-}
-
-/* Move the gap in a gap buffer: */
-#define move_gap(_d, _new_gap) \
-do { \
- BUG_ON(_new_gap > (_d)->nr); \
- BUG_ON((_d)->gap > (_d)->nr); \
- \
- __move_gap((_d)->data, sizeof((_d)->data[0]), \
- (_d)->nr, (_d)->size, (_d)->gap, _new_gap); \
- (_d)->gap = _new_gap; \
-} while (0)
-
-#define bubble_sort(_base, _nr, _cmp) \
-do { \
- ssize_t _i, _last; \
- bool _swapped = true; \
- \
- for (_last= (ssize_t) (_nr) - 1; _last > 0 && _swapped; --_last) {\
- _swapped = false; \
- for (_i = 0; _i < _last; _i++) \
- if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) { \
- swap((_base)[_i], (_base)[_i + 1]); \
- _swapped = true; \
- } \
- } \
-} while (0)
-
-#define per_cpu_sum(_p) \
-({ \
- TYPEOF_UNQUAL(*_p) _ret = 0; \
- \
- int cpu; \
- for_each_possible_cpu(cpu) \
- _ret += *per_cpu_ptr(_p, cpu); \
- _ret; \
-})
-
-static inline u64 percpu_u64_get(u64 __percpu *src)
-{
- return per_cpu_sum(src);
-}
-
-static inline void percpu_u64_set(u64 __percpu *dst, u64 src)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- *per_cpu_ptr(dst, cpu) = 0;
- this_cpu_write(*dst, src);
-}
-
-static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr)
-{
- for (unsigned i = 0; i < nr; i++)
- acc[i] += src[i];
-}
-
-static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src,
- unsigned nr)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- acc_u64s(acc, per_cpu_ptr(src, cpu), nr);
-}
-
-static inline void percpu_memset(void __percpu *p, int c, size_t bytes)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- memset(per_cpu_ptr(p, cpu), c, bytes);
-}
-
-u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned);
-
-static inline int u8_cmp(u8 l, u8 r)
-{
- return cmp_int(l, r);
-}
-
-static inline int cmp_le32(__le32 l, __le32 r)
-{
- return cmp_int(le32_to_cpu(l), le32_to_cpu(r));
-}
-
-#include <linux/uuid.h>
-
-static inline bool qstr_eq(const struct qstr l, const struct qstr r)
-{
- return l.len == r.len && !memcmp(l.name, r.name, l.len);
-}
-
-void bch2_darray_str_exit(darray_const_str *);
-int bch2_split_devs(const char *, darray_const_str *);
-
-#ifdef __KERNEL__
-
-__must_check
-static inline int copy_to_user_errcode(void __user *to, const void *from, unsigned long n)
-{
- return copy_to_user(to, from, n) ? -EFAULT : 0;
-}
-
-__must_check
-static inline int copy_from_user_errcode(void *to, const void __user *from, unsigned long n)
-{
- return copy_from_user(to, from, n) ? -EFAULT : 0;
-}
-
-#endif
-
-static inline void mod_bit(long nr, volatile unsigned long *addr, bool v)
-{
- if (v)
- set_bit(nr, addr);
- else
- clear_bit(nr, addr);
-}
-
-static inline void __set_bit_le64(size_t bit, __le64 *addr)
-{
- addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64));
-}
-
-static inline void __clear_bit_le64(size_t bit, __le64 *addr)
-{
- addr[bit / 64] &= ~cpu_to_le64(BIT_ULL(bit % 64));
-}
-
-static inline bool test_bit_le64(size_t bit, __le64 *addr)
-{
- return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0;
-}
-
-static inline void memcpy_swab(void *_dst, void *_src, size_t len)
-{
- u8 *dst = _dst + len;
- u8 *src = _src;
-
- while (len--)
- *--dst = *src++;
-}
-
-#define set_flags(_map, _in, _out) \
-do { \
- unsigned _i; \
- \
- for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
- if ((_in) & (1 << _i)) \
- (_out) |= _map[_i]; \
- else \
- (_out) &= ~_map[_i]; \
-} while (0)
-
-#define map_flags(_map, _in) \
-({ \
- unsigned _out = 0; \
- \
- set_flags(_map, _in, _out); \
- _out; \
-})
-
-#define map_flags_rev(_map, _in) \
-({ \
- unsigned _i, _out = 0; \
- \
- for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
- if ((_in) & _map[_i]) { \
- (_out) |= 1 << _i; \
- (_in) &= ~_map[_i]; \
- } \
- (_out); \
-})
-
-#define map_defined(_map) \
-({ \
- unsigned _in = ~0; \
- \
- map_flags_rev(_map, _in); \
-})
-
-#endif /* _BCACHEFS_UTIL_H */
diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c
deleted file mode 100644
index 6620ecae26af..000000000000
--- a/fs/bcachefs/varint.c
+++ /dev/null
@@ -1,130 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/bitops.h>
-#include <linux/math.h>
-#include <linux/string.h>
-#include <linux/unaligned.h>
-
-#ifdef CONFIG_VALGRIND
-#include <valgrind/memcheck.h>
-#endif
-
-#include "errcode.h"
-#include "varint.h"
-
-/**
- * bch2_varint_encode - encode a variable length integer
- * @out: destination to encode to
- * @v: unsigned integer to encode
- * Returns: size in bytes of the encoded integer - at most 9 bytes
- */
-int bch2_varint_encode(u8 *out, u64 v)
-{
- unsigned bits = fls64(v|1);
- unsigned bytes = DIV_ROUND_UP(bits, 7);
- __le64 v_le;
-
- if (likely(bytes < 9)) {
- v <<= bytes;
- v |= ~(~0 << (bytes - 1));
- v_le = cpu_to_le64(v);
- memcpy(out, &v_le, bytes);
- } else {
- *out++ = 255;
- bytes = 9;
- put_unaligned_le64(v, out);
- }
-
- return bytes;
-}
-
-/**
- * bch2_varint_decode - decode a variable length integer
- * @in: varint to decode
- * @end: end of buffer to decode from
- * @out: on success, decoded integer
- * Returns: size in bytes of the decoded integer - or a negative error code on
- * failure (would have read past the end of the buffer)
- */
-int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
-{
- unsigned bytes = likely(in < end)
- ? ffz(*in & 255) + 1
- : 1;
- u64 v;
-
- if (unlikely(in + bytes > end))
- return -BCH_ERR_varint_decode_error;
-
- if (likely(bytes < 9)) {
- __le64 v_le = 0;
-
- memcpy(&v_le, in, bytes);
- v = le64_to_cpu(v_le);
- v >>= bytes;
- } else {
- v = get_unaligned_le64(++in);
- }
-
- *out = v;
- return bytes;
-}
-
-/**
- * bch2_varint_encode_fast - fast version of bch2_varint_encode
- * @out: destination to encode to
- * @v: unsigned integer to encode
- * Returns: size in bytes of the encoded integer - at most 9 bytes
- *
- * This version assumes it's always safe to write 8 bytes to @out, even if the
- * encoded integer would be smaller.
- */
-int bch2_varint_encode_fast(u8 *out, u64 v)
-{
- unsigned bits = fls64(v|1);
- unsigned bytes = DIV_ROUND_UP(bits, 7);
-
- if (likely(bytes < 9)) {
- v <<= bytes;
- v |= ~(~0U << (bytes - 1));
- } else {
- *out++ = 255;
- bytes = 9;
- }
-
- put_unaligned_le64(v, out);
- return bytes;
-}
-
-/**
- * bch2_varint_decode_fast - fast version of bch2_varint_decode
- * @in: varint to decode
- * @end: end of buffer to decode from
- * @out: on success, decoded integer
- * Returns: size in bytes of the decoded integer - or a negative error code on
- * failure (would have read past the end of the buffer)
- *
- * This version assumes that it is safe to read at most 8 bytes past the end of
- * @end (we still return an error if the varint extends past @end).
- */
-int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out)
-{
-#ifdef CONFIG_VALGRIND
- VALGRIND_MAKE_MEM_DEFINED(in, 8);
-#endif
- u64 v = get_unaligned_le64(in);
- unsigned bytes = ffz(*in) + 1;
-
- if (unlikely(in + bytes > end))
- return -BCH_ERR_varint_decode_error;
-
- if (likely(bytes < 9)) {
- v >>= bytes;
- v &= ~(~0ULL << (7 * bytes));
- } else {
- v = get_unaligned_le64(++in);
- }
-
- *out = v;
- return bytes;
-}
diff --git a/fs/bcachefs/varint.h b/fs/bcachefs/varint.h
deleted file mode 100644
index 92a182fb3d7a..000000000000
--- a/fs/bcachefs/varint.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_VARINT_H
-#define _BCACHEFS_VARINT_H
-
-int bch2_varint_encode(u8 *, u64);
-int bch2_varint_decode(const u8 *, const u8 *, u64 *);
-
-int bch2_varint_encode_fast(u8 *, u64);
-int bch2_varint_decode_fast(const u8 *, const u8 *, u64 *);
-
-#endif /* _BCACHEFS_VARINT_H */
diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h
deleted file mode 100644
index 2ad338e282da..000000000000
--- a/fs/bcachefs/vstructs.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _VSTRUCTS_H
-#define _VSTRUCTS_H
-
-#include "util.h"
-
-/*
- * NOTE: we can't differentiate between __le64 and u64 with type_is - this
- * assumes u64 is little endian:
- */
-#define __vstruct_u64s(_s) \
-({ \
- ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \
- : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \
- : type_is((_s)->u64s, u16) ? le16_to_cpu((__force __le16) (_s)->u64s) \
- : ((__force u8) ((_s)->u64s))); \
-})
-
-#define __vstruct_bytes(_type, _u64s) \
-({ \
- BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \
- \
- (size_t) (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \
-})
-
-#define vstruct_bytes(_s) \
- __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s))
-
-#define __vstruct_blocks(_type, _sector_block_bits, _u64s) \
- (round_up(__vstruct_bytes(_type, _u64s), \
- 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits)))
-
-#define vstruct_blocks(_s, _sector_block_bits) \
- __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s))
-
-#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s) \
- __vstruct_blocks(typeof(*(_s)), _sector_block_bits, \
- __vstruct_u64s(_s) + (_u64s))
-
-#define vstruct_sectors(_s, _sector_block_bits) \
- (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9)
-
-#define vstruct_next(_s) \
- ((typeof(_s)) ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
-#define vstruct_last(_s) \
- ((typeof(&(_s)->start[0])) ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
-#define vstruct_end(_s) \
- ((void *) ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
-
-#define vstruct_for_each(_s, _i) \
- for (typeof(&(_s)->start[0]) _i = (_s)->start; \
- _i < vstruct_last(_s); \
- _i = vstruct_next(_i))
-
-#define vstruct_for_each_safe(_s, _i) \
- for (typeof(&(_s)->start[0]) _next, _i = (_s)->start; \
- _i < vstruct_last(_s) && (_next = vstruct_next(_i), true); \
- _i = _next)
-
-#define vstruct_idx(_s, _idx) \
- ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx)))
-
-#endif /* _VSTRUCTS_H */
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
deleted file mode 100644
index 627f153798c6..000000000000
--- a/fs/bcachefs/xattr.c
+++ /dev/null
@@ -1,642 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "acl.h"
-#include "bkey_methods.h"
-#include "btree_update.h"
-#include "extents.h"
-#include "fs.h"
-#include "rebalance.h"
-#include "str_hash.h"
-#include "xattr.h"
-
-#include <linux/dcache.h>
-#include <linux/posix_acl_xattr.h>
-#include <linux/xattr.h>
-
-static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned);
-
-static u64 bch2_xattr_hash(const struct bch_hash_info *info,
- const struct xattr_search_key *key)
-{
- struct bch_str_hash_ctx ctx;
-
- bch2_str_hash_init(&ctx, info);
- bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type));
- bch2_str_hash_update(&ctx, info, key->name.name, key->name.len);
-
- return bch2_str_hash_end(&ctx, info);
-}
-
-static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key)
-{
- return bch2_xattr_hash(info, key);
-}
-
-static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
-{
- struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k);
-
- return bch2_xattr_hash(info,
- &X_SEARCH(x.v->x_type, x.v->x_name_and_value, x.v->x_name_len));
-}
-
-static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r)
-{
- struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l);
- const struct xattr_search_key *r = _r;
-
- return l.v->x_type != r->type ||
- l.v->x_name_len != r->name.len ||
- memcmp(l.v->x_name_and_value, r->name.name, r->name.len);
-}
-
-static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
-{
- struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l);
- struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r);
-
- return l.v->x_type != r.v->x_type ||
- l.v->x_name_len != r.v->x_name_len ||
- memcmp(l.v->x_name_and_value, r.v->x_name_and_value, r.v->x_name_len);
-}
-
-const struct bch_hash_desc bch2_xattr_hash_desc = {
- .btree_id = BTREE_ID_xattrs,
- .key_type = KEY_TYPE_xattr,
- .hash_key = xattr_hash_key,
- .hash_bkey = xattr_hash_bkey,
- .cmp_key = xattr_cmp_key,
- .cmp_bkey = xattr_cmp_bkey,
-};
-
-int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
- unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len,
- le16_to_cpu(xattr.v->x_val_len));
- int ret = 0;
-
- bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s,
- c, xattr_val_size_too_small,
- "value too small (%zu < %u)",
- bkey_val_u64s(k.k), val_u64s);
-
- /* XXX why +4 ? */
- val_u64s = xattr_val_u64s(xattr.v->x_name_len,
- le16_to_cpu(xattr.v->x_val_len) + 4);
-
- bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s,
- c, xattr_val_size_too_big,
- "value too big (%zu > %u)",
- bkey_val_u64s(k.k), val_u64s);
-
- bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type),
- c, xattr_invalid_type,
- "invalid type (%u)", xattr.v->x_type);
-
- bkey_fsck_err_on(memchr(xattr.v->x_name_and_value, '\0', xattr.v->x_name_len),
- c, xattr_name_invalid_chars,
- "xattr name has invalid characters");
-fsck_err:
- return ret;
-}
-
-void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- const struct xattr_handler *handler;
- struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
-
- handler = bch2_xattr_type_to_handler(xattr.v->x_type);
- if (handler && handler->prefix)
- prt_printf(out, "%s", handler->prefix);
- else if (handler)
- prt_printf(out, "(type %u)", xattr.v->x_type);
- else
- prt_printf(out, "(unknown type %u)", xattr.v->x_type);
-
- unsigned name_len = xattr.v->x_name_len;
- unsigned val_len = le16_to_cpu(xattr.v->x_val_len);
- unsigned max_name_val_bytes = bkey_val_bytes(xattr.k) -
- offsetof(struct bch_xattr, x_name_and_value);
-
- val_len = min_t(int, val_len, max_name_val_bytes - name_len);
- name_len = min(name_len, max_name_val_bytes);
-
- prt_printf(out, "%.*s:%.*s",
- name_len, xattr.v->x_name_and_value,
- val_len, (char *) xattr_val(xattr.v));
-
- if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS ||
- xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT) {
- prt_char(out, ' ');
- bch2_acl_to_text(out, xattr_val(xattr.v),
- le16_to_cpu(xattr.v->x_val_len));
- }
-}
-
-static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode,
- const char *name, void *buffer, size_t size, int type)
-{
- struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode);
- struct xattr_search_key search = X_SEARCH(type, name, strlen(name));
- struct btree_iter iter;
- struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash,
- inode_inum(inode), &search, 0);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
- ret = le16_to_cpu(xattr.v->x_val_len);
- if (buffer) {
- if (ret > size)
- ret = -ERANGE;
- else
- memcpy(buffer, xattr_val(xattr.v), ret);
- }
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
- struct bch_inode_unpacked *inode_u,
- const struct bch_hash_info *hash_info,
- const char *name, const void *value, size_t size,
- int type, int flags)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter inode_iter = {};
- int ret;
-
- ret = bch2_subvol_is_ro_trans(trans, inum.subvol) ?:
- bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent);
- if (ret)
- return ret;
-
- /*
-	 * Besides the ctime update, updates to extents, dirents and xattrs also
-	 * require an inode update - to ensure that if a key exists in one of those
-	 * btrees with a given snapshot ID, an inode is also present
- */
- inode_u->bi_ctime = bch2_current_time(c);
-
- ret = bch2_inode_write(trans, &inode_iter, inode_u);
- bch2_trans_iter_exit(trans, &inode_iter);
-
- if (ret)
- return ret;
-
- if (value) {
- struct bkey_i_xattr *xattr;
- unsigned namelen = strlen(name);
- unsigned u64s = BKEY_U64s +
- xattr_val_u64s(namelen, size);
-
- if (u64s > U8_MAX)
- return -ERANGE;
-
- xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
- if (IS_ERR(xattr))
- return PTR_ERR(xattr);
-
- bkey_xattr_init(&xattr->k_i);
- xattr->k.u64s = u64s;
- xattr->v.x_type = type;
- xattr->v.x_name_len = namelen;
- xattr->v.x_val_len = cpu_to_le16(size);
- memcpy(xattr->v.x_name_and_value, name, namelen);
- memcpy(xattr_val(&xattr->v), value, size);
-
- ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
- inum, &xattr->k_i,
- (flags & XATTR_CREATE ? STR_HASH_must_create : 0)|
- (flags & XATTR_REPLACE ? STR_HASH_must_replace : 0));
- } else {
- struct xattr_search_key search =
- X_SEARCH(type, name, strlen(name));
-
- ret = bch2_hash_delete(trans, bch2_xattr_hash_desc,
- hash_info, inum, &search);
- }
-
- if (bch2_err_matches(ret, ENOENT))
- ret = flags & XATTR_REPLACE ? -ENODATA : 0;
-
- return ret;
-}
-
-struct xattr_buf {
- char *buf;
- size_t len;
- size_t used;
-};
-
-static int __bch2_xattr_emit(const char *prefix,
- const char *name, size_t name_len,
- struct xattr_buf *buf)
-{
- const size_t prefix_len = strlen(prefix);
- const size_t total_len = prefix_len + name_len + 1;
-
- if (buf->buf) {
- if (buf->used + total_len > buf->len)
- return -ERANGE;
-
- memcpy(buf->buf + buf->used, prefix, prefix_len);
- memcpy(buf->buf + buf->used + prefix_len,
- name, name_len);
- buf->buf[buf->used + prefix_len + name_len] = '\0';
- }
-
- buf->used += total_len;
- return 0;
-}
-
-static inline const char *bch2_xattr_prefix(unsigned type, struct dentry *dentry)
-{
- const struct xattr_handler *handler = bch2_xattr_type_to_handler(type);
-
- if (!xattr_handler_can_list(handler, dentry))
- return NULL;
-
- return xattr_prefix(handler);
-}
-
-static int bch2_xattr_emit(struct dentry *dentry,
- const struct bch_xattr *xattr,
- struct xattr_buf *buf)
-{
- const char *prefix;
-
- prefix = bch2_xattr_prefix(xattr->x_type, dentry);
- if (!prefix)
- return 0;
-
- return __bch2_xattr_emit(prefix, xattr->x_name_and_value, xattr->x_name_len, buf);
-}
-
-static int bch2_xattr_list_bcachefs(struct bch_fs *c,
- struct bch_inode_unpacked *inode,
- struct xattr_buf *buf,
- bool all)
-{
- const char *prefix = all ? "bcachefs_effective." : "bcachefs.";
- unsigned id;
- int ret = 0;
- u64 v;
-
- for (id = 0; id < Inode_opt_nr; id++) {
- v = bch2_inode_opt_get(inode, id);
- if (!v)
- continue;
-
- if (!all &&
- !(inode->bi_fields_set & (1 << id)))
- continue;
-
- ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id],
- strlen(bch2_inode_opts[id]), buf);
- if (ret)
- break;
- }
-
- return ret;
-}
-
-ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
-{
- struct bch_fs *c = dentry->d_sb->s_fs_info;
- struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
- struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
- u64 offset = 0, inum = inode->ei_inode.bi_inum;
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_xattrs,
- POS(inum, offset),
- POS(inum, U64_MAX),
- inode->ei_inum.subvol, 0, k, ({
- if (k.k->type != KEY_TYPE_xattr)
- continue;
-
- bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf);
- }))) ?:
- bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false) ?:
- bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true);
-
- return ret ? bch2_err_class(ret) : buf.used;
-}
-
-static int bch2_xattr_get_handler(const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *vinode,
- const char *name, void *buffer, size_t size)
-{
- struct bch_inode_info *inode = to_bch_ei(vinode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret = bch2_trans_do(c,
- bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags));
-
- if (ret < 0 && bch2_err_matches(ret, ENOENT))
- ret = -ENODATA;
-
- return bch2_err_class(ret);
-}
-
-static int bch2_xattr_set_handler(const struct xattr_handler *handler,
- struct mnt_idmap *idmap,
- struct dentry *dentry, struct inode *vinode,
- const char *name, const void *value,
- size_t size, int flags)
-{
- struct bch_inode_info *inode = to_bch_ei(vinode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
- struct bch_inode_unpacked inode_u;
- int ret;
-
- ret = bch2_trans_run(c,
- commit_do(trans, NULL, NULL, 0,
- bch2_xattr_set(trans, inode_inum(inode), &inode_u,
- &hash, name, value, size,
- handler->flags, flags)) ?:
- (bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME), 0));
-
- return bch2_err_class(ret);
-}
-
-static const struct xattr_handler bch_xattr_user_handler = {
- .prefix = XATTR_USER_PREFIX,
- .get = bch2_xattr_get_handler,
- .set = bch2_xattr_set_handler,
- .flags = KEY_TYPE_XATTR_INDEX_USER,
-};
-
-static bool bch2_xattr_trusted_list(struct dentry *dentry)
-{
- return capable(CAP_SYS_ADMIN);
-}
-
-static const struct xattr_handler bch_xattr_trusted_handler = {
- .prefix = XATTR_TRUSTED_PREFIX,
- .list = bch2_xattr_trusted_list,
- .get = bch2_xattr_get_handler,
- .set = bch2_xattr_set_handler,
- .flags = KEY_TYPE_XATTR_INDEX_TRUSTED,
-};
-
-static const struct xattr_handler bch_xattr_security_handler = {
- .prefix = XATTR_SECURITY_PREFIX,
- .get = bch2_xattr_get_handler,
- .set = bch2_xattr_set_handler,
- .flags = KEY_TYPE_XATTR_INDEX_SECURITY,
-};
-
-#ifndef NO_BCACHEFS_FS
-
-static int opt_to_inode_opt(int id)
-{
- switch (id) {
-#define x(name, ...) \
- case Opt_##name: return Inode_opt_##name;
- BCH_INODE_OPTS()
-#undef x
- default:
- return -1;
- }
-}
-
-static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *vinode,
- const char *name, void *buffer, size_t size,
- bool all)
-{
- struct bch_inode_info *inode = to_bch_ei(vinode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_opts opts =
- bch2_inode_opts_to_opts(&inode->ei_inode);
- const struct bch_option *opt;
- int id, inode_opt_id;
- struct printbuf out = PRINTBUF;
- int ret;
- u64 v;
-
- id = bch2_opt_lookup(name);
- if (id < 0 || !bch2_opt_is_inode_opt(id))
- return -EINVAL;
-
- inode_opt_id = opt_to_inode_opt(id);
- if (inode_opt_id < 0)
- return -EINVAL;
-
- opt = bch2_opt_table + id;
-
- if (!bch2_opt_defined_by_id(&opts, id))
- return -ENODATA;
-
- if (!all &&
- !(inode->ei_inode.bi_fields_set & (1 << inode_opt_id)))
- return -ENODATA;
-
- v = bch2_opt_get_by_id(&opts, id);
- bch2_opt_to_text(&out, c, c->disk_sb.sb, opt, v, 0);
-
- ret = out.pos;
-
- if (out.allocation_failure) {
- ret = -ENOMEM;
- } else if (buffer) {
- if (out.pos > size)
- ret = -ERANGE;
- else
- memcpy(buffer, out.buf, out.pos);
- }
-
- printbuf_exit(&out);
- return ret;
-}
-
-static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *vinode,
- const char *name, void *buffer, size_t size)
-{
- return __bch2_xattr_bcachefs_get(handler, dentry, vinode,
- name, buffer, size, false);
-}
-
-struct inode_opt_set {
- int id;
- u64 v;
- bool defined;
-};
-
-static int inode_opt_set_fn(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct inode_opt_set *s = p;
-
- if (s->id == Inode_opt_casefold) {
- int ret = bch2_inode_set_casefold(trans, inode_inum(inode), bi, s->v);
- if (ret)
- return ret;
- }
-
- if (s->defined)
- bi->bi_fields_set |= 1U << s->id;
- else
- bi->bi_fields_set &= ~(1U << s->id);
-
- bch2_inode_opt_set(bi, s->id, s->v);
-
- return 0;
-}
-
-static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
- struct mnt_idmap *idmap,
- struct dentry *dentry, struct inode *vinode,
- const char *name, const void *value,
- size_t size, int flags)
-{
- struct bch_inode_info *inode = to_bch_ei(vinode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- const struct bch_option *opt;
- char *buf;
- struct inode_opt_set s;
- int opt_id, inode_opt_id, ret;
-
- opt_id = bch2_opt_lookup(name);
- if (opt_id < 0)
- return -EINVAL;
-
- opt = bch2_opt_table + opt_id;
-
- inode_opt_id = opt_to_inode_opt(opt_id);
- if (inode_opt_id < 0)
- return -EINVAL;
-
- s.id = inode_opt_id;
-
- if (value) {
- u64 v = 0;
-
- buf = kmalloc(size + 1, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
- memcpy(buf, value, size);
- buf[size] = '\0';
-
- ret = bch2_opt_parse(c, opt, buf, &v, NULL);
- kfree(buf);
-
- if (ret < 0)
- goto err_class_exit;
-
- ret = bch2_opt_hook_pre_set(c, NULL, opt_id, v);
- if (ret < 0)
- goto err_class_exit;
-
- s.v = v + 1;
- s.defined = true;
- } else {
- /*
-		 * Check if this option was set on the parent - if so, switch
-		 * back to inheriting from the parent:
- *
- * rename() also has to deal with keeping inherited options up
- * to date - see bch2_reinherit_attrs()
- */
- spin_lock(&dentry->d_lock);
- if (!IS_ROOT(dentry)) {
- struct bch_inode_info *dir =
- to_bch_ei(d_inode(dentry->d_parent));
-
- s.v = bch2_inode_opt_get(&dir->ei_inode, inode_opt_id);
- } else {
- s.v = 0;
- }
- spin_unlock(&dentry->d_lock);
-
- s.defined = false;
- }
-
- mutex_lock(&inode->ei_update_lock);
- if (inode_opt_id == Inode_opt_project) {
- /*
- * inode fields accessible via the xattr interface are stored
- * with a +1 bias, so that 0 means unset:
- */
- ret = bch2_set_projid(c, inode, s.v ? s.v - 1 : 0);
- if (ret)
- goto err;
- }
-
- ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0);
-err:
- mutex_unlock(&inode->ei_update_lock);
-err_class_exit:
- return bch2_err_class(ret);
-}
-
-static const struct xattr_handler bch_xattr_bcachefs_handler = {
- .prefix = "bcachefs.",
- .get = bch2_xattr_bcachefs_get,
- .set = bch2_xattr_bcachefs_set,
-};
-
-static int bch2_xattr_bcachefs_get_effective(
- const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *vinode,
- const char *name, void *buffer, size_t size)
-{
- return __bch2_xattr_bcachefs_get(handler, dentry, vinode,
- name, buffer, size, true);
-}
-
-/* Noop - xattrs in the bcachefs_effective namespace are inherited */
-static int bch2_xattr_bcachefs_set_effective(const struct xattr_handler *handler,
- struct mnt_idmap *idmap,
- struct dentry *dentry, struct inode *vinode,
- const char *name, const void *value,
- size_t size, int flags)
-{
- return 0;
-}
-
-static const struct xattr_handler bch_xattr_bcachefs_effective_handler = {
- .prefix = "bcachefs_effective.",
- .get = bch2_xattr_bcachefs_get_effective,
- .set = bch2_xattr_bcachefs_set_effective,
-};
-
-#endif /* NO_BCACHEFS_FS */
-
-const struct xattr_handler * const bch2_xattr_handlers[] = {
- &bch_xattr_user_handler,
- &bch_xattr_trusted_handler,
- &bch_xattr_security_handler,
-#ifndef NO_BCACHEFS_FS
- &bch_xattr_bcachefs_handler,
- &bch_xattr_bcachefs_effective_handler,
-#endif
- NULL
-};
-
-static const struct xattr_handler *bch_xattr_handler_map[] = {
- [KEY_TYPE_XATTR_INDEX_USER] = &bch_xattr_user_handler,
- [KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS] =
- &nop_posix_acl_access,
- [KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT] =
- &nop_posix_acl_default,
- [KEY_TYPE_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler,
- [KEY_TYPE_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler,
-};
-
-static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type)
-{
- return type < ARRAY_SIZE(bch_xattr_handler_map)
- ? bch_xattr_handler_map[type]
- : NULL;
-}
diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h
deleted file mode 100644
index 1139bf345f70..000000000000
--- a/fs/bcachefs/xattr.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_XATTR_H
-#define _BCACHEFS_XATTR_H
-
-#include "str_hash.h"
-
-extern const struct bch_hash_desc bch2_xattr_hash_desc;
-
-int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_xattr ((struct bkey_ops) { \
- .key_validate = bch2_xattr_validate, \
- .val_to_text = bch2_xattr_to_text, \
- .min_val_size = 8, \
-})
-
-static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
-{
- return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name_and_value) +
- name_len + val_len, sizeof(u64));
-}
-
-#define xattr_val(_xattr) \
- ((void *) (_xattr)->x_name_and_value + (_xattr)->x_name_len)
-
-struct xattr_search_key {
- u8 type;
- struct qstr name;
-};
-
-#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \
- { .type = _type, .name = QSTR_INIT(_name, _len) })
-
-struct dentry;
-struct xattr_handler;
-struct bch_hash_info;
-struct bch_inode_info;
-
-/* Exported for cmd_migrate.c in tools: */
-int bch2_xattr_set(struct btree_trans *, subvol_inum,
- struct bch_inode_unpacked *, const struct bch_hash_info *,
- const char *, const void *, size_t, int, int);
-
-ssize_t bch2_xattr_list(struct dentry *, char *, size_t);
-
-extern const struct xattr_handler * const bch2_xattr_handlers[];
-
-#endif /* _BCACHEFS_XATTR_H */
diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h
deleted file mode 100644
index 4121b78d9a92..000000000000
--- a/fs/bcachefs/xattr_format.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_XATTR_FORMAT_H
-#define _BCACHEFS_XATTR_FORMAT_H
-
-#define KEY_TYPE_XATTR_INDEX_USER 0
-#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1
-#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2
-#define KEY_TYPE_XATTR_INDEX_TRUSTED 3
-#define KEY_TYPE_XATTR_INDEX_SECURITY 4
-
-struct bch_xattr {
- struct bch_val v;
- __u8 x_type;
- __u8 x_name_len;
- __le16 x_val_len;
- /*
-	 * x_name_and_value contains the name and value counted by
- * x_name_len + x_val_len. The introduction of
- * __counted_by(x_name_len) previously caused a false positive
- * detection of an out of bounds write.
- */
- __u8 x_name_and_value[];
-} __packed __aligned(8);
-
-#endif /* _BCACHEFS_XATTR_FORMAT_H */
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 264fba0d44bd..e4653bb99946 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -103,6 +103,21 @@ static struct linux_binfmt elf_format = {
#define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE))
+static inline void elf_coredump_set_mm_eflags(struct mm_struct *mm, u32 flags)
+{
+#ifdef CONFIG_ARCH_HAS_ELF_CORE_EFLAGS
+ mm->saved_e_flags = flags;
+#endif
+}
+
+static inline u32 elf_coredump_get_mm_eflags(struct mm_struct *mm, u32 flags)
+{
+#ifdef CONFIG_ARCH_HAS_ELF_CORE_EFLAGS
+ flags = mm->saved_e_flags;
+#endif
+ return flags;
+}
+
/*
* We need to explicitly zero any trailing portion of the page that follows
* p_filesz when it ends before the page ends (e.g. bss), otherwise this
@@ -1290,6 +1305,8 @@ out_free_interp:
mm->end_data = end_data;
mm->start_stack = bprm->p;
+ elf_coredump_set_mm_eflags(mm, elf_ex->e_flags);
+
/**
* DOC: "brk" handling
*
@@ -1804,6 +1821,8 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_thread_core_info *t;
struct elf_prpsinfo *psinfo;
struct core_thread *ct;
+ u16 machine;
+ u32 flags;
psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
if (!psinfo)
@@ -1831,30 +1850,37 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
return 0;
}
- /*
- * Initialize the ELF file header.
- */
- fill_elf_header(elf, phdrs,
- view->e_machine, view->e_flags);
+ machine = view->e_machine;
+ flags = view->e_flags;
#else
view = NULL;
info->thread_notes = 2;
- fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS);
+ machine = ELF_ARCH;
+ flags = ELF_CORE_EFLAGS;
#endif
/*
+ * Override ELF e_flags with value taken from process,
+ * if arch needs that.
+ */
+ flags = elf_coredump_get_mm_eflags(dump_task->mm, flags);
+
+ /*
+ * Initialize the ELF file header.
+ */
+ fill_elf_header(elf, phdrs, machine, flags);
+
+ /*
* Allocate a structure for each thread.
*/
- info->thread = kzalloc(offsetof(struct elf_thread_core_info,
- notes[info->thread_notes]),
- GFP_KERNEL);
+ info->thread = kzalloc(struct_size(info->thread, notes, info->thread_notes),
+ GFP_KERNEL);
if (unlikely(!info->thread))
return 0;
info->thread->task = dump_task;
for (ct = dump_task->signal->core_state->dumper.next; ct; ct = ct->next) {
- t = kzalloc(offsetof(struct elf_thread_core_info,
- notes[info->thread_notes]),
+ t = kzalloc(struct_size(t, notes, info->thread_notes),
GFP_KERNEL);
if (unlikely(!t))
return 0;
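
Editorial note: the fill_note_info() hunks above replace the open-coded kzalloc(offsetof(struct elf_thread_core_info, notes[info->thread_notes]), ...) with struct_size(), which computes the size of a structure plus its trailing flexible array and saturates on overflow so the allocation fails instead of being undersized. A minimal, self-contained sketch of the same idiom; the struct and helper names below are illustrative, not taken from binfmt_elf.c:

	#include <linux/overflow.h>
	#include <linux/slab.h>
	#include <linux/types.h>

	struct item {
		unsigned long payload;
	};

	struct item_set {
		unsigned int count;
		struct item items[];	/* flexible array member */
	};

	static struct item_set *item_set_alloc(unsigned int n)
	{
		/*
		 * struct_size(set, items, n) == sizeof(*set) + n * sizeof(set->items[0]),
		 * with overflow checking (saturates to SIZE_MAX, making kzalloc fail).
		 */
		struct item_set *set = kzalloc(struct_size(set, items, n), GFP_KERNEL);

		if (set)
			set->count = n;
		return set;
	}
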
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index ea95c90c8474..4438637c8900 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -62,6 +62,7 @@ config BTRFS_FS_RUN_SANITY_TESTS
config BTRFS_DEBUG
bool "Btrfs debugging support"
depends on BTRFS_FS
+ select REF_TRACKER if STACKTRACE_SUPPORT
help
Enable run-time debugging support for the btrfs filesystem.
@@ -117,14 +118,3 @@ config BTRFS_EXPERIMENTAL
- large folio support
If unsure, say N.
-
-config BTRFS_FS_REF_VERIFY
- bool "Btrfs with the ref verify tool compiled in"
- depends on BTRFS_FS
- default n
- help
- Enable run-time extent reference verification instrumentation. This
- is meant to be used by btrfs developers for tracking down extent
- reference problems or verifying they didn't break something.
-
- If unsure, say N.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 2d5f0482678b..743d7677b175 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -36,7 +36,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
lru_cache.o raid-stripe-tree.o fiemap.o direct-io.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
-btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
+btrfs-$(CONFIG_BTRFS_DEBUG) += ref-verify.o
btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o
btrfs-$(CONFIG_FS_VERITY) += verity.o
diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c
index 861c7d92c437..1248aa2535d3 100644
--- a/fs/btrfs/accessors.c
+++ b/fs/btrfs/accessors.c
@@ -44,7 +44,7 @@ static __always_inline void memcpy_split_src(char *dest, const char *src1,
* gives us all the type checking.
*
* The extent buffer pages stored in the array folios may not form a contiguous
- * phyusical range, but the API functions assume the linear offset to the range
+ * physical range, but the API functions assume the linear offset to the range
* from 0 to metadata node size.
*/
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 6a450be293b1..2ab550a1e715 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -859,7 +859,7 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
free_pref(ref);
return PTR_ERR(eb);
}
- if (!extent_buffer_uptodate(eb)) {
+ if (unlikely(!extent_buffer_uptodate(eb))) {
free_pref(ref);
free_extent_buffer(eb);
return -EIO;
@@ -1062,7 +1062,7 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx,
iref = (struct btrfs_extent_inline_ref *)ptr;
type = btrfs_get_extent_inline_ref_type(leaf, iref,
BTRFS_REF_TYPE_ANY);
- if (type == BTRFS_REF_TYPE_INVALID)
+ if (unlikely(type == BTRFS_REF_TYPE_INVALID))
return -EUCLEAN;
offset = btrfs_extent_inline_ref_offset(leaf, iref);
@@ -1422,7 +1422,7 @@ again:
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* Key with offset -1 found, there would have to exist an extent
* item with such offset, but this is out of the valid range.
@@ -1614,7 +1614,7 @@ again:
ret = PTR_ERR(eb);
goto out;
}
- if (!extent_buffer_uptodate(eb)) {
+ if (unlikely(!extent_buffer_uptodate(eb))) {
free_extent_buffer(eb);
ret = -EIO;
goto out;
@@ -1652,7 +1652,7 @@ again:
* case.
*/
ASSERT(eie);
- if (!eie) {
+ if (unlikely(!eie)) {
ret = -EUCLEAN;
goto out;
}
@@ -1690,7 +1690,7 @@ out:
* @ctx->bytenr and @ctx->extent_item_pos. The bytenr of the found leaves are
* added to the ulist at @ctx->refs, and that ulist is allocated by this
* function. The caller should free the ulist with free_leaf_list() if
- * @ctx->ignore_extent_item_pos is false, otherwise a fimple ulist_free() is
+ * @ctx->ignore_extent_item_pos is false, otherwise a simple ulist_free() is
* enough.
*
* Returns 0 on success and < 0 on error. On error @ctx->refs is not allocated.
@@ -2215,7 +2215,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* Key with offset -1 found, there would have to exist an extent
* item with such offset, but this is out of the valid range.
@@ -2312,7 +2312,7 @@ static int get_extent_inline_ref(unsigned long *ptr,
*out_eiref = (struct btrfs_extent_inline_ref *)(*ptr);
*out_type = btrfs_get_extent_inline_ref_type(eb, *out_eiref,
BTRFS_REF_TYPE_ANY);
- if (*out_type == BTRFS_REF_TYPE_INVALID)
+ if (unlikely(*out_type == BTRFS_REF_TYPE_INVALID))
return -EUCLEAN;
*ptr += btrfs_extent_inline_ref_size(*out_type);
@@ -2868,7 +2868,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* Key with offset -1 found, there would have to exist an extent
* item with such offset, but this is out of the valid range.
@@ -2876,7 +2876,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
ret = -EUCLEAN;
goto release;
}
- if (path->slots[0] == 0) {
+ if (unlikely(path->slots[0] == 0)) {
DEBUG_WARN();
ret = -EUCLEAN;
goto release;
@@ -3457,7 +3457,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
if (ret < 0)
goto out;
/* No extra backref? This means the tree block is corrupted */
- if (ret > 0) {
+ if (unlikely(ret > 0)) {
ret = -EUCLEAN;
goto out;
}
@@ -3500,7 +3500,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
((unsigned long)iter->cur_ptr);
type = btrfs_get_extent_inline_ref_type(eb, iref,
BTRFS_REF_TYPE_BLOCK);
- if (type == BTRFS_REF_TYPE_INVALID) {
+ if (unlikely(type == BTRFS_REF_TYPE_INVALID)) {
ret = -EUCLEAN;
goto out;
}
@@ -3612,7 +3612,7 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
}
/* Sanity check, we shouldn't have any unchecked nodes */
- if (!upper->checked) {
+ if (unlikely(!upper->checked)) {
DEBUG_WARN("we should not have any unchecked nodes");
return -EUCLEAN;
}
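
Editorial note: most of the backref.c changes above only wrap error-path checks in unlikely(), telling the compiler the condition is expected to be false so the common path stays straight-line code. A hedged, generic illustration of the pattern, with hypothetical names not taken from btrfs:

	#include <linux/compiler.h>
	#include <linux/errno.h>
	#include <linux/types.h>

	struct record;						/* hypothetical */
	bool rec_checksum_ok(const struct record *rec);		/* hypothetical */
	int handle_record(const struct record *rec);		/* hypothetical */

	static int process_record(const struct record *rec)
	{
		/* Corruption is never expected; keep the hot path unbranched. */
		if (unlikely(!rec_checksum_ok(rec)))
			return -EUCLEAN;

		return handle_record(rec);
	}
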
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 34b0193a181c..25d51c246070 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -190,7 +190,7 @@ struct btrfs_backref_share_check_ctx {
* It's very common to have several file extent items that point to the
* same extent (bytenr) but with different offsets and lengths. This
* typically happens for COW writes, partial writes into prealloc
- * extents, NOCOW writes after snapshoting a root, hole punching or
+ * extents, NOCOW writes after snapshotting a root, hole punching or
* reflinking within the same file (less common perhaps).
* So keep a small cache with the lookup results for the extent pointed
* by the last few file extent items. This cache is checked, with a
@@ -414,7 +414,7 @@ struct btrfs_backref_cache {
/*
* Whether this cache is for relocation
*
- * Reloction backref cache require more info for reloc root compared
+ * Relocation backref cache require more info for reloc root compared
* to generic backref cache.
*/
bool is_reloc;
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 50b5fc1c06d7..21df48e6c4fa 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -93,6 +93,7 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
refcount_inc(&orig_bbio->ordered->refs);
bbio->ordered = orig_bbio->ordered;
}
+ bbio->csum_search_commit_root = orig_bbio->csum_search_commit_root;
atomic_inc(&orig_bbio->pending_ios);
return bbio;
}
@@ -166,7 +167,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
int mirror = repair_bbio->mirror_num;
if (repair_bbio->bio.bi_status ||
- !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) {
+ !btrfs_data_csum_ok(repair_bbio, dev, 0, bvec_phys(bv))) {
bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
repair_bbio->bio.bi_iter = repair_bbio->saved_iter;
@@ -203,18 +204,21 @@ done:
*/
static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
u32 bio_offset,
- struct bio_vec *bv,
+ phys_addr_t paddr,
struct btrfs_failed_bio *fbio)
{
struct btrfs_inode *inode = failed_bbio->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct folio *folio = page_folio(phys_to_page(paddr));
const u32 sectorsize = fs_info->sectorsize;
+ const u32 foff = offset_in_folio(folio, paddr);
const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT);
struct btrfs_bio *repair_bbio;
struct bio *repair_bio;
int num_copies;
int mirror;
+ ASSERT(foff + sectorsize <= folio_size(folio));
btrfs_debug(fs_info, "repair read error: read error at %llu",
failed_bbio->file_offset + bio_offset);
@@ -237,7 +241,7 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS,
&btrfs_repair_bioset);
repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector;
- __bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset);
+ bio_add_folio_nofail(repair_bio, folio, sectorsize, foff);
repair_bbio = btrfs_bio(repair_bio);
btrfs_bio_init(repair_bbio, fs_info, NULL, fbio);
@@ -258,6 +262,7 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de
struct bvec_iter *iter = &bbio->saved_iter;
blk_status_t status = bbio->bio.bi_status;
struct btrfs_failed_bio *fbio = NULL;
+ phys_addr_t paddr;
u32 offset = 0;
/* Read-repair requires the inode field to be set by the submitter. */
@@ -275,17 +280,11 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de
/* Clear the I/O error. A failed repair will reset it. */
bbio->bio.bi_status = BLK_STS_OK;
- while (iter->bi_size) {
- struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter);
-
- bv.bv_len = min(bv.bv_len, sectorsize);
- if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv))
- fbio = repair_one_sector(bbio, offset, &bv, fbio);
-
- bio_advance_iter_single(&bbio->bio, iter, sectorsize);
+ btrfs_bio_for_each_block(paddr, &bbio->bio, iter, fs_info->sectorsize) {
+ if (status || !btrfs_data_csum_ok(bbio, dev, offset, paddr))
+ fbio = repair_one_sector(bbio, offset, paddr, fbio);
offset += sectorsize;
}
-
if (bbio->csum != bbio->csum_inline)
kfree(bbio->csum);
@@ -780,11 +779,38 @@ end_bbio:
return true;
}
+static void assert_bbio_alignment(struct btrfs_bio *bbio)
+{
+#ifdef CONFIG_BTRFS_ASSERT
+ struct btrfs_fs_info *fs_info = bbio->fs_info;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+ const u32 blocksize = fs_info->sectorsize;
+
+ /* Metadata has no extra bs > ps alignment requirement. */
+ if (!is_data_bbio(bbio))
+ return;
+
+ bio_for_each_bvec(bvec, &bbio->bio, iter)
+ ASSERT(IS_ALIGNED(bvec.bv_offset, blocksize) &&
+ IS_ALIGNED(bvec.bv_len, blocksize),
+ "root=%llu inode=%llu logical=%llu length=%u index=%u bv_offset=%u bv_len=%u",
+ btrfs_root_id(bbio->inode->root),
+ btrfs_ino(bbio->inode),
+ bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT,
+ bbio->bio.bi_iter.bi_size, iter.bi_idx,
+ bvec.bv_offset,
+ bvec.bv_len);
+#endif
+}
+
void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num)
{
/* If bbio->inode is not populated, its file_offset must be 0. */
ASSERT(bbio->inode || bbio->file_offset == 0);
+ assert_bbio_alignment(bbio);
+
while (!btrfs_submit_chunk(bbio, mirror_num))
;
}
@@ -823,8 +849,8 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
if (ret < 0)
goto out_counter_dec;
- if (!smap.dev->bdev ||
- !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) {
+ if (unlikely(!smap.dev->bdev ||
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state))) {
ret = -EIO;
goto out_counter_dec;
}
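
Editorial note: the read-repair path above stops passing struct bio_vec around and instead walks the bio one filesystem block at a time, handing each block's physical address to the csum and repair helpers; repair_one_sector() then recovers the folio with page_folio(phys_to_page(paddr)) and re-adds exactly one block via bio_add_folio_nofail(). The btrfs_bio_for_each_block() helper itself is defined elsewhere in the series; the loop below is only a rough sketch of the equivalent iteration built from the existing bvec iterator API, not the real macro:

	#include <linux/bio.h>

	static void bio_for_each_block_example(struct bio *bio, struct bvec_iter *iter,
					       u32 blocksize)
	{
		/*
		 * Assumes every bvec is blocksize-aligned, which the new
		 * assert_bbio_alignment() check enforces for data bios.
		 */
		while (iter->bi_size) {
			struct bio_vec bv = bio_iter_iovec(bio, *iter);
			phys_addr_t paddr = bvec_phys(&bv);

			/* hand the block at 'paddr' to csum check / repair here */
			(void)paddr;

			bio_advance_iter_single(bio, iter, blocksize);
		}
	}
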
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index dc2eb43b7097..00883aea55d7 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -82,6 +82,8 @@ struct btrfs_bio {
/* Save the first error status of split bio. */
blk_status_t status;
+ /* Use the commit root to look up csums (data read bio only). */
+ bool csum_search_commit_root;
/*
* This member must come last, bio_alloc_bioset will allocate enough
* bytes for entire btrfs_bio but relies on bio being last.
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 9bf282d2453c..5322ef2ae015 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1358,7 +1358,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
* data in this block group. That check should be done by relocation routine,
* not this function.
*/
-static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
+static int inc_block_group_ro(struct btrfs_block_group *cache, bool force)
{
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
@@ -1795,7 +1795,14 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
bg1 = list_entry(a, struct btrfs_block_group, bg_list);
bg2 = list_entry(b, struct btrfs_block_group, bg_list);
- return bg1->used > bg2->used;
+ /*
+ * Some other task may be updating the ->used field concurrently, but it
+ * is not serious if we get a stale value or load/store tearing issues,
+ * as sorting the list of block groups to reclaim is not critical and an
+ * occasional imperfect order is ok. So silence KCSAN and avoid the
+ * overhead of locking or any other synchronization.
+ */
+ return data_race(bg1->used > bg2->used);
}
static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info)
@@ -1964,7 +1971,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
* called, which is where we will transfer a reserved extent's
* size from the "reserved" counter to the "used" counter - this
* happens when running delayed references. When we relocate the
- * chunk below, relocation first flushes dellaloc, waits for
+ * chunk below, relocation first flushes delalloc, waits for
* ordered extent completion (which is where we create delayed
* references for data extents) and commits the current
* transaction (which runs delayed references), and only after
@@ -2031,7 +2038,7 @@ void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
btrfs_reclaim_sweep(fs_info);
spin_lock(&fs_info->unused_bgs_lock);
if (!list_empty(&fs_info->reclaim_bgs))
- queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
+ queue_work(system_dfl_wq, &fs_info->reclaim_bgs_work);
spin_unlock(&fs_info->unused_bgs_lock);
}
@@ -2064,7 +2071,7 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key
return -ENOENT;
}
- if (map->start != key->objectid || map->chunk_len != key->offset) {
+ if (unlikely(map->start != key->objectid || map->chunk_len != key->offset)) {
btrfs_err(fs_info,
"block group %llu len %llu mismatch with chunk %llu len %llu",
key->objectid, key->offset, map->start, map->chunk_len);
@@ -2077,7 +2084,7 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key
flags = btrfs_stack_block_group_flags(&bg) &
BTRFS_BLOCK_GROUP_TYPE_MASK;
- if (flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ if (unlikely(flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK))) {
btrfs_err(fs_info,
"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
key->objectid, key->offset, flags,
@@ -2238,7 +2245,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
return ret;
/* Shouldn't have super stripes in sequential zones */
- if (zoned && nr) {
+ if (unlikely(zoned && nr)) {
kfree(logical);
btrfs_err(fs_info,
"zoned: block group %llu must not contain super block",
@@ -2329,7 +2336,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
break;
bg = btrfs_lookup_block_group(fs_info, map->start);
- if (!bg) {
+ if (unlikely(!bg)) {
btrfs_err(fs_info,
"chunk start=%llu len=%llu doesn't have corresponding block group",
map->start, map->chunk_len);
@@ -2337,9 +2344,9 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
btrfs_free_chunk_map(map);
break;
}
- if (bg->start != map->start || bg->length != map->chunk_len ||
- (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
- (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ if (unlikely(bg->start != map->start || bg->length != map->chunk_len ||
+ (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
+ (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK))) {
btrfs_err(fs_info,
"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
map->start, map->chunk_len,
@@ -2832,7 +2839,7 @@ next:
* space or none at all (due to no need to COW, extent buffers
* were already COWed in the current transaction and still
* unwritten, tree heights lower than the maximum possible
- * height, etc). For data we generally reserve the axact amount
+ * height, etc). For data we generally reserve the exact amount
* of space we are going to allocate later, the exception is
* when using compression, as we must reserve space based on the
* uncompressed data size, because the compression is only done
@@ -3241,7 +3248,7 @@ again:
*/
BTRFS_I(inode)->generation = 0;
ret = btrfs_update_inode(trans, BTRFS_I(inode));
- if (ret) {
+ if (unlikely(ret)) {
/*
* So theoretically we could recover from this, simply set the
* super cache generation to 0 so we know to invalidate the
@@ -3988,7 +3995,7 @@ static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans
struct btrfs_space_info *sys_space_info;
sys_space_info = btrfs_find_space_info(trans->fs_info, sys_flags);
- if (!sys_space_info) {
+ if (unlikely(!sys_space_info)) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -4002,17 +4009,17 @@ static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans
}
ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- } else if (ret) {
+ } else if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
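
Editorial note: reclaim_bgs_cmp() above wraps its comparison in data_race() rather than taking a lock. The annotation tells KCSAN that this racy read is intentional and harmless: a stale or torn value only makes the reclaim sort order slightly imperfect. A generic sketch of the idiom, with hypothetical names:

	#include <linux/compiler.h>
	#include <linux/types.h>

	struct counter {
		u64 used;	/* updated concurrently elsewhere */
	};

	/* Best-effort comparison for sorting; an occasional stale value is fine. */
	static int counter_cmp(const struct counter *a, const struct counter *b)
	{
		return data_race(a->used > b->used);
	}
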
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index a8bb8429c966..9172104a5889 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -63,7 +63,7 @@ enum btrfs_discard_state {
* CHUNK_ALLOC_FORCE means it must try to allocate one
*
* CHUNK_ALLOC_FORCE_FOR_EXTENT like CHUNK_ALLOC_FORCE but called from
- * find_free_extent() that also activaes the zone
+ * find_free_extent() that also activates the zone
*/
enum btrfs_chunk_alloc_enum {
CHUNK_ALLOC_NO_FORCE,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index b99fb0273292..af373d50a901 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -248,7 +248,7 @@ struct btrfs_inode {
u64 new_delalloc_bytes;
/*
* The offset of the last dir index key that was logged.
- * This is used only for directories.
+ * This is used only for directories. Protected by 'log_mutex'.
*/
u64 last_dir_index_offset;
};
@@ -338,6 +338,11 @@ struct btrfs_inode {
struct list_head delayed_iput;
struct rw_semaphore i_mmap_lock;
+
+#ifdef CONFIG_FS_VERITY
+ struct fsverity_info *i_verity_info;
+#endif
+
struct inode vfs_inode;
};
@@ -532,9 +537,9 @@ static inline void btrfs_set_inode_mapping_order(struct btrfs_inode *inode)
/* We only allow BITS_PER_LONGS blocks for each bitmap. */
#ifdef CONFIG_BTRFS_EXPERIMENTAL
- mapping_set_folio_order_range(inode->vfs_inode.i_mapping, 0,
- ilog2(((BITS_PER_LONG << inode->root->fs_info->sectorsize_bits)
- >> PAGE_SHIFT)));
+ mapping_set_folio_order_range(inode->vfs_inode.i_mapping,
+ inode->root->fs_info->block_min_order,
+ inode->root->fs_info->block_max_order);
#endif
}
@@ -542,10 +547,12 @@ static inline void btrfs_set_inode_mapping_order(struct btrfs_inode *inode)
#define CSUM_FMT "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes) size, bytes
-int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum,
- const u8 * const csum_expected);
+void btrfs_calculate_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr,
+ u8 *dest);
+int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum,
+ const u8 * const csum_expected);
bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
- u32 bio_offset, struct bio_vec *bv);
+ u32 bio_offset, phys_addr_t paddr);
noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len,
struct btrfs_file_extent *file_extent,
bool nowait);
@@ -558,7 +565,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
const struct fscrypt_str *name);
int btrfs_add_link(struct btrfs_trans_handle *trans,
struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
- const struct fscrypt_str *name, int add_backref, u64 index);
+ const struct fscrypt_str *name, bool add_backref, u64 index);
int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry);
int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end);
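
Editorial note: btrfs_set_inode_mapping_order() above now takes the allowed folio order range straight from fs_info (block_min_order/block_max_order) instead of recomputing it from sectorsize, so inodes on a block size > page size filesystem get a non-zero minimum order. mapping_set_folio_order_range() simply constrains which folio orders the page cache may use for that mapping; a hedged, generic use looks like this:

	#include <linux/pagemap.h>

	/*
	 * Allow the page cache to use folios between order 'min' and 'max'
	 * (inclusive) for this mapping.  With min > 0 every folio is at least
	 * PAGE_SIZE << min bytes, which is how a block size larger than the
	 * page size can be backed by a single folio.
	 */
	static void example_set_order_range(struct address_space *mapping,
					    unsigned int min, unsigned int max)
	{
		mapping_set_folio_order_range(mapping, min, max);
	}
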
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index d09d622016ef..bacad18357b3 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -90,19 +90,19 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len)
}
static int compression_compress_pages(int type, struct list_head *ws,
- struct address_space *mapping, u64 start,
+ struct btrfs_inode *inode, u64 start,
struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out)
{
switch (type) {
case BTRFS_COMPRESS_ZLIB:
- return zlib_compress_folios(ws, mapping, start, folios,
+ return zlib_compress_folios(ws, inode, start, folios,
out_folios, total_in, total_out);
case BTRFS_COMPRESS_LZO:
- return lzo_compress_folios(ws, mapping, start, folios,
+ return lzo_compress_folios(ws, inode, start, folios,
out_folios, total_in, total_out);
case BTRFS_COMPRESS_ZSTD:
- return zstd_compress_folios(ws, mapping, start, folios,
+ return zstd_compress_folios(ws, inode, start, folios,
out_folios, total_in, total_out);
case BTRFS_COMPRESS_NONE:
default:
@@ -223,10 +223,14 @@ static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_co
/*
* Common wrappers for page allocation from compression wrappers
*/
-struct folio *btrfs_alloc_compr_folio(void)
+struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info)
{
struct folio *folio = NULL;
+ /* For bs > ps cases, no cached folio pool for now. */
+ if (fs_info->block_min_order)
+ goto alloc;
+
spin_lock(&compr_pool.lock);
if (compr_pool.count > 0) {
folio = list_first_entry(&compr_pool.list, struct folio, lru);
@@ -238,13 +242,18 @@ struct folio *btrfs_alloc_compr_folio(void)
if (folio)
return folio;
- return folio_alloc(GFP_NOFS, 0);
+alloc:
+ return folio_alloc(GFP_NOFS, fs_info->block_min_order);
}
void btrfs_free_compr_folio(struct folio *folio)
{
bool do_free = false;
+ /* The folio is from bs > ps fs, no cached pool for now. */
+ if (folio_order(folio))
+ goto free;
+
spin_lock(&compr_pool.lock);
if (compr_pool.count > compr_pool.thresh) {
do_free = true;
@@ -257,6 +266,7 @@ void btrfs_free_compr_folio(struct folio *folio)
if (!do_free)
return;
+free:
ASSERT(folio_ref_count(folio) == 1);
folio_put(folio);
}
@@ -344,16 +354,19 @@ static void end_bbio_compressed_write(struct btrfs_bio *bbio)
static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb)
{
+ struct btrfs_fs_info *fs_info = cb->bbio.fs_info;
struct bio *bio = &cb->bbio.bio;
u32 offset = 0;
while (offset < cb->compressed_len) {
+ struct folio *folio;
int ret;
- u32 len = min_t(u32, cb->compressed_len - offset, PAGE_SIZE);
+ u32 len = min_t(u32, cb->compressed_len - offset,
+ btrfs_min_folio_size(fs_info));
+ folio = cb->compressed_folios[offset >> (PAGE_SHIFT + fs_info->block_min_order)];
/* Maximum compressed extent is smaller than bio size limit. */
- ret = bio_add_folio(bio, cb->compressed_folios[offset >> PAGE_SHIFT],
- len, 0);
+ ret = bio_add_folio(bio, folio, len, 0);
ASSERT(ret);
offset += len;
}
@@ -443,6 +456,10 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (fs_info->sectorsize < PAGE_SIZE)
return 0;
+ /* For bs > ps cases, we don't support readahead for compressed folios for now. */
+ if (fs_info->block_min_order)
+ return 0;
+
end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
while (cur < compressed_end) {
@@ -602,17 +619,19 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
cb->compressed_len = compressed_len;
cb->compress_type = btrfs_extent_map_compression(em);
cb->orig_bbio = bbio;
+ cb->bbio.csum_search_commit_root = bbio->csum_search_commit_root;
btrfs_free_extent_map(em);
- cb->nr_folios = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
+ cb->nr_folios = DIV_ROUND_UP(compressed_len, btrfs_min_folio_size(fs_info));
cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct folio *), GFP_NOFS);
if (!cb->compressed_folios) {
status = BLK_STS_RESOURCE;
goto out_free_bio;
}
- ret = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios);
+ ret = btrfs_alloc_folio_array(cb->nr_folios, fs_info->block_min_order,
+ cb->compressed_folios);
if (ret) {
status = BLK_STS_RESOURCE;
goto out_free_compressed_pages;
@@ -687,8 +706,6 @@ struct heuristic_ws {
struct list_head list;
};
-static struct workspace_manager heuristic_wsm;
-
static void free_heuristic_ws(struct list_head *ws)
{
struct heuristic_ws *workspace;
@@ -701,7 +718,7 @@ static void free_heuristic_ws(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *alloc_heuristic_ws(void)
+static struct list_head *alloc_heuristic_ws(struct btrfs_fs_info *fs_info)
{
struct heuristic_ws *ws;
@@ -728,11 +745,9 @@ fail:
return ERR_PTR(-ENOMEM);
}
-const struct btrfs_compress_op btrfs_heuristic_compress = {
- .workspace_manager = &heuristic_wsm,
-};
+const struct btrfs_compress_levels btrfs_heuristic_compress = { 0 };
-static const struct btrfs_compress_op * const btrfs_compress_op[] = {
+static const struct btrfs_compress_levels * const btrfs_compress_levels[] = {
/* The heuristic is represented as compression type 0 */
&btrfs_heuristic_compress,
&btrfs_zlib_compress,
@@ -740,13 +755,13 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = {
&btrfs_zstd_compress,
};
-static struct list_head *alloc_workspace(int type, int level)
+static struct list_head *alloc_workspace(struct btrfs_fs_info *fs_info, int type, int level)
{
switch (type) {
- case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws();
- case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level);
- case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace();
- case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level);
+ case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(fs_info);
+ case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(fs_info, level);
+ case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(fs_info);
+ case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(fs_info, level);
default:
/*
* This can't happen, the type is validated several times
@@ -772,44 +787,58 @@ static void free_workspace(int type, struct list_head *ws)
}
}
-static void btrfs_init_workspace_manager(int type)
+static int alloc_workspace_manager(struct btrfs_fs_info *fs_info,
+ enum btrfs_compression_type type)
{
- struct workspace_manager *wsm;
+ struct workspace_manager *gwsm;
struct list_head *workspace;
- wsm = btrfs_compress_op[type]->workspace_manager;
- INIT_LIST_HEAD(&wsm->idle_ws);
- spin_lock_init(&wsm->ws_lock);
- atomic_set(&wsm->total_ws, 0);
- init_waitqueue_head(&wsm->ws_wait);
+ ASSERT(fs_info->compr_wsm[type] == NULL);
+ gwsm = kzalloc(sizeof(*gwsm), GFP_KERNEL);
+ if (!gwsm)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&gwsm->idle_ws);
+ spin_lock_init(&gwsm->ws_lock);
+ atomic_set(&gwsm->total_ws, 0);
+ init_waitqueue_head(&gwsm->ws_wait);
+ fs_info->compr_wsm[type] = gwsm;
/*
* Preallocate one workspace for each compression type so we can
* guarantee forward progress in the worst case
*/
- workspace = alloc_workspace(type, 0);
+ workspace = alloc_workspace(fs_info, type, 0);
if (IS_ERR(workspace)) {
- btrfs_warn(NULL,
- "cannot preallocate compression workspace, will try later");
+ btrfs_warn(fs_info,
+ "cannot preallocate compression workspace for %s, will try later",
+ btrfs_compress_type2str(type));
} else {
- atomic_set(&wsm->total_ws, 1);
- wsm->free_ws = 1;
- list_add(workspace, &wsm->idle_ws);
+ atomic_set(&gwsm->total_ws, 1);
+ gwsm->free_ws = 1;
+ list_add(workspace, &gwsm->idle_ws);
}
+ return 0;
}
-static void btrfs_cleanup_workspace_manager(int type)
+static void free_workspace_manager(struct btrfs_fs_info *fs_info,
+ enum btrfs_compression_type type)
{
- struct workspace_manager *wsman;
struct list_head *ws;
+ struct workspace_manager *gwsm = fs_info->compr_wsm[type];
- wsman = btrfs_compress_op[type]->workspace_manager;
- while (!list_empty(&wsman->idle_ws)) {
- ws = wsman->idle_ws.next;
+ /* ZSTD uses its own workspace manager, should not enter here. */
+ ASSERT(type != BTRFS_COMPRESS_ZSTD && type < BTRFS_NR_COMPRESS_TYPES);
+ if (!gwsm)
+ return;
+ fs_info->compr_wsm[type] = NULL;
+ while (!list_empty(&gwsm->idle_ws)) {
+ ws = gwsm->idle_ws.next;
list_del(ws);
free_workspace(type, ws);
- atomic_dec(&wsman->total_ws);
+ atomic_dec(&gwsm->total_ws);
}
+ kfree(gwsm);
}
/*
@@ -818,9 +847,9 @@ static void btrfs_cleanup_workspace_manager(int type)
* Preallocation makes a forward progress guarantee and we do not return
* errors.
*/
-struct list_head *btrfs_get_workspace(int type, int level)
+struct list_head *btrfs_get_workspace(struct btrfs_fs_info *fs_info, int type, int level)
{
- struct workspace_manager *wsm;
+ struct workspace_manager *wsm = fs_info->compr_wsm[type];
struct list_head *workspace;
int cpus = num_online_cpus();
unsigned nofs_flag;
@@ -830,7 +859,7 @@ struct list_head *btrfs_get_workspace(int type, int level)
wait_queue_head_t *ws_wait;
int *free_ws;
- wsm = btrfs_compress_op[type]->workspace_manager;
+ ASSERT(wsm);
idle_ws = &wsm->idle_ws;
ws_lock = &wsm->ws_lock;
total_ws = &wsm->total_ws;
@@ -866,7 +895,7 @@ again:
* context of btrfs_compress_bio/btrfs_compress_pages
*/
nofs_flag = memalloc_nofs_save();
- workspace = alloc_workspace(type, level);
+ workspace = alloc_workspace(fs_info, type, level);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(workspace)) {
@@ -889,7 +918,7 @@ again:
/* no burst */ 1);
if (__ratelimit(&_rs))
- btrfs_warn(NULL,
+ btrfs_warn(fs_info,
"no compression workspaces, low memory, retrying");
}
goto again;
@@ -897,13 +926,13 @@ again:
return workspace;
}
-static struct list_head *get_workspace(int type, int level)
+static struct list_head *get_workspace(struct btrfs_fs_info *fs_info, int type, int level)
{
switch (type) {
- case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(type, level);
- case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(level);
- case BTRFS_COMPRESS_LZO: return btrfs_get_workspace(type, level);
- case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(level);
+ case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(fs_info, type, level);
+ case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(fs_info, level);
+ case BTRFS_COMPRESS_LZO: return btrfs_get_workspace(fs_info, type, level);
+ case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(fs_info, level);
default:
/*
* This can't happen, the type is validated several times
@@ -917,21 +946,21 @@ static struct list_head *get_workspace(int type, int level)
* put a workspace struct back on the list or free it if we have enough
* idle ones sitting around
*/
-void btrfs_put_workspace(int type, struct list_head *ws)
+void btrfs_put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws)
{
- struct workspace_manager *wsm;
+ struct workspace_manager *gwsm = fs_info->compr_wsm[type];
struct list_head *idle_ws;
spinlock_t *ws_lock;
atomic_t *total_ws;
wait_queue_head_t *ws_wait;
int *free_ws;
- wsm = btrfs_compress_op[type]->workspace_manager;
- idle_ws = &wsm->idle_ws;
- ws_lock = &wsm->ws_lock;
- total_ws = &wsm->total_ws;
- ws_wait = &wsm->ws_wait;
- free_ws = &wsm->free_ws;
+ ASSERT(gwsm);
+ idle_ws = &gwsm->idle_ws;
+ ws_lock = &gwsm->ws_lock;
+ total_ws = &gwsm->total_ws;
+ ws_wait = &gwsm->ws_wait;
+ free_ws = &gwsm->free_ws;
spin_lock(ws_lock);
if (*free_ws <= num_online_cpus()) {
@@ -948,13 +977,13 @@ wake:
cond_wake_up(ws_wait);
}
-static void put_workspace(int type, struct list_head *ws)
+static void put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws)
{
switch (type) {
- case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(type, ws);
- case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(type, ws);
- case BTRFS_COMPRESS_LZO: return btrfs_put_workspace(type, ws);
- case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(ws);
+ case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(fs_info, type, ws);
+ case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(fs_info, type, ws);
+ case BTRFS_COMPRESS_LZO: return btrfs_put_workspace(fs_info, type, ws);
+ case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(fs_info, ws);
default:
/*
* This can't happen, the type is validated several times
@@ -970,12 +999,12 @@ static void put_workspace(int type, struct list_head *ws)
*/
static int btrfs_compress_set_level(unsigned int type, int level)
{
- const struct btrfs_compress_op *ops = btrfs_compress_op[type];
+ const struct btrfs_compress_levels *levels = btrfs_compress_levels[type];
if (level == 0)
- level = ops->default_level;
+ level = levels->default_level;
else
- level = clamp(level, ops->min_level, ops->max_level);
+ level = clamp(level, levels->min_level, levels->max_level);
return level;
}
@@ -985,9 +1014,9 @@ static int btrfs_compress_set_level(unsigned int type, int level)
*/
bool btrfs_compress_level_valid(unsigned int type, int level)
{
- const struct btrfs_compress_op *ops = btrfs_compress_op[type];
+ const struct btrfs_compress_levels *levels = btrfs_compress_levels[type];
- return ops->min_level <= level && level <= ops->max_level;
+ return levels->min_level <= level && level <= levels->max_level;
}
/* Wrapper around find_get_page(), with extra error message. */
@@ -1022,44 +1051,46 @@ int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
* - compression algo are 0-3
* - the level are bits 4-7
*
- * @out_pages is an in/out parameter, holds maximum number of pages to allocate
- * and returns number of actually allocated pages
+ * @out_folios is an in/out parameter, holds maximum number of folios to allocate
+ * and returns number of actually allocated folios
*
* @total_in is used to return the number of bytes actually read. It
* may be smaller than the input length if we had to exit early because we
- * ran out of room in the pages array or because we cross the
+ * ran out of room in the folios array or because we cross the
* max_out threshold.
*
* @total_out is an in/out parameter, must be set to the input length and will
* be also used to return the total number of compressed bytes
*/
-int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping,
+int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
const unsigned long orig_len = *total_out;
struct list_head *workspace;
int ret;
level = btrfs_compress_set_level(type, level);
- workspace = get_workspace(type, level);
- ret = compression_compress_pages(type, workspace, mapping, start, folios,
+ workspace = get_workspace(fs_info, type, level);
+ ret = compression_compress_pages(type, workspace, inode, start, folios,
out_folios, total_in, total_out);
/* The total read-in bytes should be no larger than the input. */
ASSERT(*total_in <= orig_len);
- put_workspace(type, workspace);
+ put_workspace(fs_info, type, workspace);
return ret;
}
static int btrfs_decompress_bio(struct compressed_bio *cb)
{
+ struct btrfs_fs_info *fs_info = cb_to_fs_info(cb);
struct list_head *workspace;
int ret;
int type = cb->compress_type;
- workspace = get_workspace(type, 0);
+ workspace = get_workspace(fs_info, type, 0);
ret = compression_decompress_bio(workspace, cb);
- put_workspace(type, workspace);
+ put_workspace(fs_info, type, workspace);
if (!ret)
zero_fill_bio(&cb->orig_bbio->bio);
@@ -1080,20 +1111,50 @@ int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio,
int ret;
/*
- * The full destination page range should not exceed the page size.
+ * The full destination folio range should not exceed the folio size.
* And the @destlen should not exceed sectorsize, as this is only called for
* inline file extents, which should not exceed sectorsize.
*/
- ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize);
+ ASSERT(dest_pgoff + destlen <= folio_size(dest_folio) && destlen <= sectorsize);
- workspace = get_workspace(type, 0);
+ workspace = get_workspace(fs_info, type, 0);
ret = compression_decompress(type, workspace, data_in, dest_folio,
dest_pgoff, srclen, destlen);
- put_workspace(type, workspace);
+ put_workspace(fs_info, type, workspace);
return ret;
}
+int btrfs_alloc_compress_wsm(struct btrfs_fs_info *fs_info)
+{
+ int ret;
+
+ ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_NONE);
+ if (ret < 0)
+ goto error;
+ ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_ZLIB);
+ if (ret < 0)
+ goto error;
+ ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_LZO);
+ if (ret < 0)
+ goto error;
+ ret = zstd_alloc_workspace_manager(fs_info);
+ if (ret < 0)
+ goto error;
+ return 0;
+error:
+ btrfs_free_compress_wsm(fs_info);
+ return ret;
+}
+
+void btrfs_free_compress_wsm(struct btrfs_fs_info *fs_info)
+{
+ free_workspace_manager(fs_info, BTRFS_COMPRESS_NONE);
+ free_workspace_manager(fs_info, BTRFS_COMPRESS_ZLIB);
+ free_workspace_manager(fs_info, BTRFS_COMPRESS_LZO);
+ zstd_free_workspace_manager(fs_info);
+}
+
int __init btrfs_init_compress(void)
{
if (bioset_init(&btrfs_compressed_bioset, BIO_POOL_SIZE,
@@ -1105,11 +1166,6 @@ int __init btrfs_init_compress(void)
if (!compr_pool.shrinker)
return -ENOMEM;
- btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
- btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
- btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
- zstd_init_workspace_manager();
-
spin_lock_init(&compr_pool.lock);
INIT_LIST_HEAD(&compr_pool.list);
compr_pool.count = 0;
@@ -1130,10 +1186,6 @@ void __cold btrfs_exit_compress(void)
btrfs_compr_pool_scan(NULL, NULL);
shrinker_free(compr_pool.shrinker);
- btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE);
- btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
- btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
- zstd_cleanup_workspace_manager();
bioset_exit(&btrfs_compressed_bioset);
}
@@ -1256,7 +1308,7 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
#define ENTROPY_LVL_HIGH (80)
/*
- * For increasead precision in shannon_entropy calculation,
+ * For increased precision in shannon_entropy calculation,
* let's do pow(n, M) to save more digits after comma:
*
* - maximum int bit length is 64
@@ -1542,7 +1594,8 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
*/
int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end)
{
- struct list_head *ws_list = get_workspace(0, 0);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct list_head *ws_list = get_workspace(fs_info, 0, 0);
struct heuristic_ws *ws;
u32 i;
u8 byte;
@@ -1611,30 +1664,34 @@ int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end)
}
out:
- put_workspace(0, ws_list);
+ put_workspace(fs_info, 0, ws_list);
return ret;
}
/*
- * Convert the compression suffix (eg. after "zlib" starting with ":") to
- * level, unrecognized string will set the default level. Negative level
- * numbers are allowed.
+ * Convert the compression suffix (eg. after "zlib" starting with ":") to level.
+ *
+ * If the resulting level exceeds the algo's supported levels, it will be clamped.
+ *
+ * Return <0 if no valid string can be found.
+ * Return 0 if everything is fine.
*/
-int btrfs_compress_str2level(unsigned int type, const char *str)
+int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret)
{
int level = 0;
int ret;
- if (!type)
+ if (!type) {
+ *level_ret = btrfs_compress_set_level(type, level);
return 0;
+ }
if (str[0] == ':') {
ret = kstrtoint(str + 1, 10, &level);
if (ret)
- level = 0;
+ return ret;
}
- level = btrfs_compress_set_level(type, level);
-
- return level;
+ *level_ret = btrfs_compress_set_level(type, level);
+ return 0;
}
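
Editorial note: with the change above, btrfs_compress_str2level() no longer returns the level directly. It returns 0 or a negative errno and stores the clamped level through an out parameter, so callers can reject a malformed suffix (e.g. "zlib:abc") instead of silently falling back to the default. An illustrative caller, hypothetical and not part of the patch, assuming compression.h is included:

	static int parse_compress_option(unsigned int type, const char *suffix,
					 int *level_ret)
	{
		int ret = btrfs_compress_str2level(type, suffix, level_ret);

		if (ret < 0)
			return ret;	/* suffix present but not a valid integer */

		/* *level_ret is now clamped to the algorithm's supported range. */
		return 0;
	}
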
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 1b38e707bbd9..eba188a9e3bb 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -75,6 +75,11 @@ struct compressed_bio {
struct btrfs_bio bbio;
};
+static inline struct btrfs_fs_info *cb_to_fs_info(const struct compressed_bio *cb)
+{
+ return cb->bbio.fs_info;
+}
+
/* @range_end must be exclusive. */
static inline u32 btrfs_calc_input_length(struct folio *folio, u64 range_end, u64 cur)
{
@@ -84,11 +89,14 @@ static inline u32 btrfs_calc_input_length(struct folio *folio, u64 range_end, u6
return min(range_end, folio_end(folio)) - cur;
}
+int btrfs_alloc_compress_wsm(struct btrfs_fs_info *fs_info);
+void btrfs_free_compress_wsm(struct btrfs_fs_info *fs_info);
+
int __init btrfs_init_compress(void);
void __cold btrfs_exit_compress(void);
bool btrfs_compress_level_valid(unsigned int type, int level);
-int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping,
+int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio,
@@ -102,21 +110,11 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
bool writeback);
void btrfs_submit_compressed_read(struct btrfs_bio *bbio);
-int btrfs_compress_str2level(unsigned int type, const char *str);
+int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret);
-struct folio *btrfs_alloc_compr_folio(void);
+struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info);
void btrfs_free_compr_folio(struct folio *folio);
-enum btrfs_compression_type {
- BTRFS_COMPRESS_NONE = 0,
- BTRFS_COMPRESS_ZLIB = 1,
- BTRFS_COMPRESS_LZO = 2,
- BTRFS_COMPRESS_ZSTD = 3,
- BTRFS_NR_COMPRESS_TYPES = 4,
-
- BTRFS_DEFRAG_DONT_COMPRESS,
-};
-
struct workspace_manager {
struct list_head idle_ws;
spinlock_t ws_lock;
@@ -128,11 +126,10 @@ struct workspace_manager {
wait_queue_head_t ws_wait;
};
-struct list_head *btrfs_get_workspace(int type, int level);
-void btrfs_put_workspace(int type, struct list_head *ws);
+struct list_head *btrfs_get_workspace(struct btrfs_fs_info *fs_info, int type, int level);
+void btrfs_put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws);
-struct btrfs_compress_op {
- struct workspace_manager *workspace_manager;
+struct btrfs_compress_levels {
/* Maximum level supported by the compression algorithm */
int min_level;
int max_level;
@@ -142,10 +139,10 @@ struct btrfs_compress_op {
/* The heuristic workspaces are managed via the 0th workspace manager */
#define BTRFS_NR_WORKSPACE_MANAGERS BTRFS_NR_COMPRESS_TYPES
-extern const struct btrfs_compress_op btrfs_heuristic_compress;
-extern const struct btrfs_compress_op btrfs_zlib_compress;
-extern const struct btrfs_compress_op btrfs_lzo_compress;
-extern const struct btrfs_compress_op btrfs_zstd_compress;
+extern const struct btrfs_compress_levels btrfs_heuristic_compress;
+extern const struct btrfs_compress_levels btrfs_zlib_compress;
+extern const struct btrfs_compress_levels btrfs_lzo_compress;
+extern const struct btrfs_compress_levels btrfs_zstd_compress;
const char* btrfs_compress_type2str(enum btrfs_compression_type type);
bool btrfs_compress_is_valid_type(const char *str, size_t len);
@@ -155,39 +152,39 @@ int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end);
int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
struct folio **in_folio_ret);
-int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
+int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zlib_decompress(struct list_head *ws, const u8 *data_in,
struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
-struct list_head *zlib_alloc_workspace(unsigned int level);
+struct list_head *zlib_alloc_workspace(struct btrfs_fs_info *fs_info, unsigned int level);
void zlib_free_workspace(struct list_head *ws);
-struct list_head *zlib_get_workspace(unsigned int level);
+struct list_head *zlib_get_workspace(struct btrfs_fs_info *fs_info, unsigned int level);
-int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
+int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int lzo_decompress(struct list_head *ws, const u8 *data_in,
struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
-struct list_head *lzo_alloc_workspace(void);
+struct list_head *lzo_alloc_workspace(struct btrfs_fs_info *fs_info);
void lzo_free_workspace(struct list_head *ws);
-int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
+int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zstd_decompress(struct list_head *ws, const u8 *data_in,
struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
-void zstd_init_workspace_manager(void);
-void zstd_cleanup_workspace_manager(void);
-struct list_head *zstd_alloc_workspace(int level);
+int zstd_alloc_workspace_manager(struct btrfs_fs_info *fs_info);
+void zstd_free_workspace_manager(struct btrfs_fs_info *fs_info);
+struct list_head *zstd_alloc_workspace(struct btrfs_fs_info *fs_info, int level);
void zstd_free_workspace(struct list_head *ws);
-struct list_head *zstd_get_workspace(int level);
-void zstd_put_workspace(struct list_head *ws);
+struct list_head *zstd_get_workspace(struct btrfs_fs_info *fs_info, int level);
+void zstd_put_workspace(struct btrfs_fs_info *fs_info, struct list_head *ws);
#endif
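
Editorial note: struct btrfs_compress_op is reduced to struct btrfs_compress_levels above; the per-type workspace_manager pointer moves into fs_info->compr_wsm[], so the remaining const data is just the level bounds consumed by btrfs_compress_set_level() and btrfs_compress_level_valid(). As an illustration of how such a table is used (the values below are hypothetical, the real tables live in zlib.c/lzo.c/zstd.c):

	#include <linux/minmax.h>
	#include "compression.h"

	static const struct btrfs_compress_levels example_levels = {
		.min_level = 1,
		.max_level = 9,
		.default_level = 3,
	};

	/* Clamping as done by btrfs_compress_set_level(). */
	static int example_set_level(int level)
	{
		if (level == 0)
			return example_levels.default_level;
		return clamp(level, example_levels.min_level, example_levels.max_level);
	}
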
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 74e6d7f3d266..561658aca018 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -30,10 +30,10 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *ins_key, struct btrfs_path *path,
- int data_size, int extend);
+ int data_size, bool extend);
static int push_node_left(struct btrfs_trans_handle *trans,
struct extent_buffer *dst,
- struct extent_buffer *src, int empty);
+ struct extent_buffer *src, bool empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
struct extent_buffer *dst_buf,
struct extent_buffer *src_buf);
@@ -293,11 +293,11 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
ret = btrfs_inc_ref(trans, root, cow, 1);
- if (ret)
+ if (unlikely(ret))
btrfs_abort_transaction(trans, ret);
} else {
ret = btrfs_inc_ref(trans, root, cow, 0);
- if (ret)
+ if (unlikely(ret))
btrfs_abort_transaction(trans, ret);
}
if (ret) {
@@ -536,14 +536,14 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
ret = btrfs_reloc_cow_block(trans, root, buf, cow);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
@@ -556,7 +556,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
parent_start = buf->start;
ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
@@ -567,7 +567,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
parent_start, last_ref);
free_extent_buffer(buf);
add_root_to_dirty_list(root);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
@@ -575,7 +575,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
WARN_ON(trans->transid != btrfs_header_generation(parent));
ret = btrfs_tree_mod_log_insert_key(parent, parent_slot,
BTRFS_MOD_LOG_KEY_REPLACE);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
@@ -586,14 +586,14 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(trans, parent);
if (last_ref) {
ret = btrfs_tree_mod_log_free_eb(buf);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
}
ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
parent_start, last_ref);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
@@ -613,15 +613,12 @@ error_unlock_cow:
return ret;
}
-static inline int should_cow_block(const struct btrfs_trans_handle *trans,
- const struct btrfs_root *root,
- const struct extent_buffer *buf)
+static inline bool should_cow_block(const struct btrfs_trans_handle *trans,
+ const struct btrfs_root *root,
+ const struct extent_buffer *buf)
{
if (btrfs_is_testing(root->fs_info))
- return 0;
-
- /* Ensure we can see the FORCE_COW bit */
- smp_mb__before_atomic();
+ return false;
/*
* We do not need to cow a block if
@@ -634,13 +631,25 @@ static inline int should_cow_block(const struct btrfs_trans_handle *trans,
* after we've finished copying src root, we must COW the shared
* block to ensure the metadata consistency.
*/
- if (btrfs_header_generation(buf) == trans->transid &&
- !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
- !(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
- btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
- !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
- return 0;
- return 1;
+
+ if (btrfs_header_generation(buf) != trans->transid)
+ return true;
+
+ if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN))
+ return true;
+
+ /* Ensure we can see the FORCE_COW bit. */
+ smp_mb__before_atomic();
+ if (test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
+ return true;
+
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
+ return false;
+
+ if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
+ return true;
+
+ return false;
}
/*
@@ -844,7 +853,7 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
&check);
if (IS_ERR(eb))
return eb;
- if (!extent_buffer_uptodate(eb)) {
+ if (unlikely(!extent_buffer_uptodate(eb))) {
free_extent_buffer(eb);
return ERR_PTR(-EIO);
}
@@ -913,7 +922,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
}
ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_tree_unlock(child);
free_extent_buffer(child);
btrfs_abort_transaction(trans, ret);
@@ -935,7 +944,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
/* once for the root ptr */
free_extent_buffer_stale(mid);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1010,7 +1019,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
right, 0, 1);
free_extent_buffer_stale(right);
right = NULL;
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1019,7 +1028,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_node_key(right, &right_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
BTRFS_MOD_LOG_KEY_REPLACE);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1071,7 +1080,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
free_extent_buffer_stale(mid);
mid = NULL;
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1081,7 +1090,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_node_key(mid, &mid_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot,
BTRFS_MOD_LOG_KEY_REPLACE);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1186,7 +1195,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
btrfs_node_key(mid, &disk_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot,
BTRFS_MOD_LOG_KEY_REPLACE);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_tree_unlock(left);
free_extent_buffer(left);
btrfs_abort_transaction(trans, ret);
@@ -1246,7 +1255,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
btrfs_node_key(right, &disk_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
BTRFS_MOD_LOG_KEY_REPLACE);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_tree_unlock(right);
free_extent_buffer(right);
btrfs_abort_transaction(trans, ret);
@@ -1484,13 +1493,13 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
reada_for_search(fs_info, p, parent_level, slot, key->objectid);
/* first we do an atomic uptodate check */
- if (btrfs_buffer_uptodate(tmp, check.transid, 1) > 0) {
+ if (btrfs_buffer_uptodate(tmp, check.transid, true) > 0) {
/*
* Do extra check for first_key, eb can be stale due to
* being cached, read from scrub, or have multiple
* parents (shared tree blocks).
*/
- if (btrfs_verify_level_key(tmp, &check)) {
+ if (unlikely(btrfs_verify_level_key(tmp, &check))) {
ret = -EUCLEAN;
goto out;
}
@@ -1571,7 +1580,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
* and give up so that our caller doesn't loop forever
* on our EAGAINs.
*/
- if (!extent_buffer_uptodate(tmp)) {
+ if (unlikely(!extent_buffer_uptodate(tmp))) {
ret = -EIO;
goto out;
}
@@ -1752,7 +1761,7 @@ out:
* The root may have failed to write out at some point, and thus is no
* longer valid, return an error in this case.
*/
- if (!extent_buffer_uptodate(b)) {
+ if (unlikely(!extent_buffer_uptodate(b))) {
if (root_lock)
btrfs_tree_unlock_rw(b, root_lock);
free_extent_buffer(b);
@@ -2260,7 +2269,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
again:
b = btrfs_get_old_root(root, time_seq);
- if (!b) {
+ if (unlikely(!b)) {
ret = -EIO;
goto done;
}
@@ -2686,7 +2695,7 @@ static bool check_sibling_keys(const struct extent_buffer *left,
*/
static int push_node_left(struct btrfs_trans_handle *trans,
struct extent_buffer *dst,
- struct extent_buffer *src, int empty)
+ struct extent_buffer *src, bool empty)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int push_items = 0;
@@ -2722,13 +2731,13 @@ static int push_node_left(struct btrfs_trans_handle *trans,
push_items = min(src_nritems - 8, push_items);
/* dst is the left eb, src is the middle eb */
- if (check_sibling_keys(dst, src)) {
+ if (unlikely(check_sibling_keys(dst, src))) {
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret);
return ret;
}
ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -2796,7 +2805,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
push_items = max_push;
/* dst is the right eb, src is the middle eb */
- if (check_sibling_keys(src, dst)) {
+ if (unlikely(check_sibling_keys(src, dst))) {
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret);
return ret;
@@ -2813,7 +2822,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
push_items);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -2883,7 +2892,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
btrfs_clear_buffer_dirty(trans, c);
ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1);
- if (ret2 < 0)
+ if (unlikely(ret2 < 0))
btrfs_abort_transaction(trans, ret2);
btrfs_tree_unlock(c);
free_extent_buffer(c);
@@ -2928,7 +2937,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans,
if (level) {
ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
slot, nritems - slot);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -2941,7 +2950,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans,
if (level) {
ret = btrfs_tree_mod_log_insert_key(lower, slot,
BTRFS_MOD_LOG_KEY_ADD);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -3017,7 +3026,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
ASSERT(btrfs_header_level(c) == level);
ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_tree_unlock(split);
free_extent_buffer(split);
btrfs_abort_transaction(trans, ret);
@@ -3086,7 +3095,7 @@ int btrfs_leaf_free_space(const struct extent_buffer *leaf)
int ret;
ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_crit(fs_info,
"leaf free space ret %d, leaf data size %lu, used %d nritems %d",
ret,
@@ -3102,7 +3111,7 @@ int btrfs_leaf_free_space(const struct extent_buffer *leaf)
*/
static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
- int data_size, int empty,
+ int data_size, bool empty,
struct extent_buffer *right,
int free_space, u32 left_nritems,
u32 min_slot)
@@ -3239,7 +3248,7 @@ out_unlock:
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path,
int min_data_size, int data_size,
- int empty, u32 min_slot)
+ bool empty, u32 min_slot)
{
struct extent_buffer *left = path->nodes[0];
struct extent_buffer *right;
@@ -3278,7 +3287,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (left_nritems == 0)
goto out_unlock;
- if (check_sibling_keys(left, right)) {
+ if (unlikely(check_sibling_keys(left, right))) {
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret);
btrfs_tree_unlock(right);
@@ -3316,7 +3325,7 @@ out_unlock:
*/
static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
struct btrfs_path *path, int data_size,
- int empty, struct extent_buffer *left,
+ bool empty, struct extent_buffer *left,
int free_space, u32 right_nritems,
u32 max_slot)
{
@@ -3494,7 +3503,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
goto out;
}
- if (check_sibling_keys(left, right)) {
+ if (unlikely(check_sibling_keys(left, right))) {
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3642,7 +3651,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const struct btrfs_key *ins_key,
struct btrfs_path *path, int data_size,
- int extend)
+ bool extend)
{
struct btrfs_disk_key disk_key;
struct extent_buffer *l;
@@ -4075,7 +4084,7 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans,
btrfs_set_item_size(leaf, slot, new_size);
btrfs_mark_buffer_dirty(trans, leaf);
- if (btrfs_leaf_free_space(leaf) < 0) {
+ if (unlikely(btrfs_leaf_free_space(leaf) < 0)) {
btrfs_print_leaf(leaf);
BUG();
}
@@ -4108,7 +4117,7 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
old_data = btrfs_item_data_end(leaf, slot);
BUG_ON(slot < 0);
- if (slot >= nritems) {
+ if (unlikely(slot >= nritems)) {
btrfs_print_leaf(leaf);
btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
slot, nritems);
@@ -4135,7 +4144,7 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
btrfs_set_item_size(leaf, slot, old_size + data_size);
btrfs_mark_buffer_dirty(trans, leaf);
- if (btrfs_leaf_free_space(leaf) < 0) {
+ if (unlikely(btrfs_leaf_free_space(leaf) < 0)) {
btrfs_print_leaf(leaf);
BUG();
}
@@ -4183,7 +4192,7 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans,
data_end = leaf_data_end(leaf);
total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
- if (btrfs_leaf_free_space(leaf) < total_size) {
+ if (unlikely(btrfs_leaf_free_space(leaf) < total_size)) {
btrfs_print_leaf(leaf);
btrfs_crit(fs_info, "not enough freespace need %u have %d",
total_size, btrfs_leaf_free_space(leaf));
@@ -4193,7 +4202,7 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans,
if (slot != nritems) {
unsigned int old_data = btrfs_item_data_end(leaf, slot);
- if (old_data < data_end) {
+ if (unlikely(old_data < data_end)) {
btrfs_print_leaf(leaf);
btrfs_crit(fs_info,
"item at slot %d with data offset %u beyond data end of leaf %u",
@@ -4232,7 +4241,7 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans,
btrfs_set_header_nritems(leaf, nritems + batch->nr);
btrfs_mark_buffer_dirty(trans, leaf);
- if (btrfs_leaf_free_space(leaf) < 0) {
+ if (unlikely(btrfs_leaf_free_space(leaf) < 0)) {
btrfs_print_leaf(leaf);
BUG();
}
@@ -4374,7 +4383,7 @@ int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (level) {
ret = btrfs_tree_mod_log_insert_move(parent, slot,
slot + 1, nritems - slot - 1);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -4387,7 +4396,7 @@ int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
} else if (level) {
ret = btrfs_tree_mod_log_insert_key(parent, slot,
BTRFS_MOD_LOG_KEY_REMOVE);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
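
These ctree.c hunks consistently wrap error checks whose failure path calls btrfs_abort_transaction() in unlikely(). As a minimal userspace sketch of what the annotation buys — assuming the usual kernel definition of unlikely() as __builtin_expect(), and a hypothetical do_tree_mod() helper standing in for functions like btrfs_tree_mod_log_insert_key() — not part of the patch:

/*
 * Minimal userspace sketch of the unlikely() annotation applied above.
 * do_tree_mod() is a hypothetical stand-in; in the kernel, unlikely()
 * expands to __builtin_expect(), so the compiler lays the abort branch
 * out of the hot path.
 */
#include <stdio.h>

#define unlikely(x)	__builtin_expect(!!(x), 0)

static int do_tree_mod(int fail)
{
	return fail ? -5 /* -EIO */ : 0;
}

int main(void)
{
	int ret = do_tree_mod(0);

	/* Error branches that end in btrfs_abort_transaction() are cold. */
	if (unlikely(ret < 0)) {
		fprintf(stderr, "abort: %d\n", ret);
		return 1;
	}
	printf("ok\n");
	return 0;
}
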
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 738179a5e170..7b277934f66f 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -153,7 +153,7 @@ void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh)
}
/*
- * Pick the defragable inode that we want, if it doesn't exist, we will get the
+ * Pick the defraggable inode that we want, if it doesn't exist, we will get the
* next one.
*/
static struct inode_defrag *btrfs_pick_defrag_inode(
@@ -924,7 +924,7 @@ again:
folio_put(folio);
goto again;
}
- if (!folio_test_uptodate(folio)) {
+ if (unlikely(!folio_test_uptodate(folio))) {
folio_unlock(folio);
folio_put(folio);
return ERR_PTR(-EIO);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 0f8d8e275143..41e37f7f67cc 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -57,6 +57,7 @@ static inline void btrfs_init_delayed_node(
delayed_node->root = root;
delayed_node->inode_id = inode_id;
refcount_set(&delayed_node->refs, 0);
+ btrfs_delayed_node_ref_tracker_dir_init(delayed_node);
delayed_node->ins_root = RB_ROOT_CACHED;
delayed_node->del_root = RB_ROOT_CACHED;
mutex_init(&delayed_node->mutex);
@@ -65,7 +66,8 @@ static inline void btrfs_init_delayed_node(
}
static struct btrfs_delayed_node *btrfs_get_delayed_node(
- struct btrfs_inode *btrfs_inode)
+ struct btrfs_inode *btrfs_inode,
+ struct btrfs_ref_tracker *tracker)
{
struct btrfs_root *root = btrfs_inode->root;
u64 ino = btrfs_ino(btrfs_inode);
@@ -74,6 +76,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
node = READ_ONCE(btrfs_inode->delayed_node);
if (node) {
refcount_inc(&node->refs);
+ btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_NOFS);
return node;
}
@@ -83,6 +86,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
if (node) {
if (btrfs_inode->delayed_node) {
refcount_inc(&node->refs); /* can be accessed */
+ btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC);
BUG_ON(btrfs_inode->delayed_node != node);
xa_unlock(&root->delayed_nodes);
return node;
@@ -106,6 +110,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
*/
if (refcount_inc_not_zero(&node->refs)) {
refcount_inc(&node->refs);
+ btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC);
+ btrfs_delayed_node_ref_tracker_alloc(node, &node->inode_cache_tracker,
+ GFP_ATOMIC);
btrfs_inode->delayed_node = node;
} else {
node = NULL;
@@ -126,7 +133,8 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
* Return the delayed node, or error pointer on failure.
*/
static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
- struct btrfs_inode *btrfs_inode)
+ struct btrfs_inode *btrfs_inode,
+ struct btrfs_ref_tracker *tracker)
{
struct btrfs_delayed_node *node;
struct btrfs_root *root = btrfs_inode->root;
@@ -135,7 +143,7 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
void *ptr;
again:
- node = btrfs_get_delayed_node(btrfs_inode);
+ node = btrfs_get_delayed_node(btrfs_inode, tracker);
if (node)
return node;
@@ -144,12 +152,10 @@ again:
return ERR_PTR(-ENOMEM);
btrfs_init_delayed_node(node, root, ino);
- /* Cached in the inode and can be accessed. */
- refcount_set(&node->refs, 2);
-
/* Allocate and reserve the slot, from now it can return a NULL from xa_load(). */
ret = xa_reserve(&root->delayed_nodes, ino, GFP_NOFS);
if (ret == -ENOMEM) {
+ btrfs_delayed_node_ref_tracker_dir_exit(node);
kmem_cache_free(delayed_node_cache, node);
return ERR_PTR(-ENOMEM);
}
@@ -158,6 +164,7 @@ again:
if (ptr) {
/* Somebody inserted it, go back and read it. */
xa_unlock(&root->delayed_nodes);
+ btrfs_delayed_node_ref_tracker_dir_exit(node);
kmem_cache_free(delayed_node_cache, node);
node = NULL;
goto again;
@@ -166,6 +173,12 @@ again:
ASSERT(xa_err(ptr) != -EINVAL);
ASSERT(xa_err(ptr) != -ENOMEM);
ASSERT(ptr == NULL);
+
+ /* Cached in the inode and can be accessed. */
+ refcount_set(&node->refs, 2);
+ btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC);
+ btrfs_delayed_node_ref_tracker_alloc(node, &node->inode_cache_tracker, GFP_ATOMIC);
+
btrfs_inode->delayed_node = node;
xa_unlock(&root->delayed_nodes);
@@ -191,6 +204,8 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
list_add_tail(&node->n_list, &root->node_list);
list_add_tail(&node->p_list, &root->prepare_list);
refcount_inc(&node->refs); /* inserted into list */
+ btrfs_delayed_node_ref_tracker_alloc(node, &node->node_list_tracker,
+ GFP_ATOMIC);
root->nodes++;
set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
}
@@ -204,6 +219,7 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
spin_lock(&root->lock);
if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
root->nodes--;
+ btrfs_delayed_node_ref_tracker_free(node, &node->node_list_tracker);
refcount_dec(&node->refs); /* not in the list */
list_del_init(&node->n_list);
if (!list_empty(&node->p_list))
@@ -214,22 +230,26 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
}
static struct btrfs_delayed_node *btrfs_first_delayed_node(
- struct btrfs_delayed_root *delayed_root)
+ struct btrfs_delayed_root *delayed_root,
+ struct btrfs_ref_tracker *tracker)
{
struct btrfs_delayed_node *node;
spin_lock(&delayed_root->lock);
node = list_first_entry_or_null(&delayed_root->node_list,
struct btrfs_delayed_node, n_list);
- if (node)
+ if (node) {
refcount_inc(&node->refs);
+ btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC);
+ }
spin_unlock(&delayed_root->lock);
return node;
}
static struct btrfs_delayed_node *btrfs_next_delayed_node(
- struct btrfs_delayed_node *node)
+ struct btrfs_delayed_node *node,
+ struct btrfs_ref_tracker *tracker)
{
struct btrfs_delayed_root *delayed_root;
struct list_head *p;
@@ -249,6 +269,7 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node(
next = list_entry(p, struct btrfs_delayed_node, n_list);
refcount_inc(&next->refs);
+ btrfs_delayed_node_ref_tracker_alloc(next, tracker, GFP_ATOMIC);
out:
spin_unlock(&delayed_root->lock);
@@ -257,7 +278,7 @@ out:
static void __btrfs_release_delayed_node(
struct btrfs_delayed_node *delayed_node,
- int mod)
+ int mod, struct btrfs_ref_tracker *tracker)
{
struct btrfs_delayed_root *delayed_root;
@@ -273,6 +294,7 @@ static void __btrfs_release_delayed_node(
btrfs_dequeue_delayed_node(delayed_root, delayed_node);
mutex_unlock(&delayed_node->mutex);
+ btrfs_delayed_node_ref_tracker_free(delayed_node, tracker);
if (refcount_dec_and_test(&delayed_node->refs)) {
struct btrfs_root *root = delayed_node->root;
@@ -282,17 +304,20 @@ static void __btrfs_release_delayed_node(
* back up. We can delete it now.
*/
ASSERT(refcount_read(&delayed_node->refs) == 0);
+ btrfs_delayed_node_ref_tracker_dir_exit(delayed_node);
kmem_cache_free(delayed_node_cache, delayed_node);
}
}
-static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node)
+static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node,
+ struct btrfs_ref_tracker *tracker)
{
- __btrfs_release_delayed_node(node, 0);
+ __btrfs_release_delayed_node(node, 0, tracker);
}
static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
- struct btrfs_delayed_root *delayed_root)
+ struct btrfs_delayed_root *delayed_root,
+ struct btrfs_ref_tracker *tracker)
{
struct btrfs_delayed_node *node;
@@ -302,6 +327,7 @@ static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
if (node) {
list_del_init(&node->p_list);
refcount_inc(&node->refs);
+ btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC);
}
spin_unlock(&delayed_root->lock);
@@ -309,9 +335,10 @@ static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
}
static inline void btrfs_release_prepared_delayed_node(
- struct btrfs_delayed_node *node)
+ struct btrfs_delayed_node *node,
+ struct btrfs_ref_tracker *tracker)
{
- __btrfs_release_delayed_node(node, 1);
+ __btrfs_release_delayed_node(node, 1, tracker);
}
static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len,
@@ -711,8 +738,8 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
u32 *ins_sizes;
int i = 0;
- ins_data = kmalloc(batch.nr * sizeof(u32) +
- batch.nr * sizeof(struct btrfs_key), GFP_NOFS);
+ ins_data = kmalloc_array(batch.nr,
+ sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS);
if (!ins_data) {
ret = -ENOMEM;
goto out;
@@ -1011,7 +1038,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
* transaction, because we could leave the inode with the
* improper counts behind.
*/
- if (ret != -ENOENT)
+ if (unlikely(ret != -ENOENT))
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1039,7 +1066,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto err_out;
}
@@ -1126,6 +1153,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_root *delayed_root;
struct btrfs_delayed_node *curr_node, *prev_node;
+ struct btrfs_ref_tracker curr_delayed_node_tracker, prev_delayed_node_tracker;
struct btrfs_path *path;
struct btrfs_block_rsv *block_rsv;
int ret = 0;
@@ -1143,17 +1171,18 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
delayed_root = fs_info->delayed_root;
- curr_node = btrfs_first_delayed_node(delayed_root);
+ curr_node = btrfs_first_delayed_node(delayed_root, &curr_delayed_node_tracker);
while (curr_node && (!count || nr--)) {
ret = __btrfs_commit_inode_delayed_items(trans, path,
curr_node);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
prev_node = curr_node;
- curr_node = btrfs_next_delayed_node(curr_node);
+ prev_delayed_node_tracker = curr_delayed_node_tracker;
+ curr_node = btrfs_next_delayed_node(curr_node, &curr_delayed_node_tracker);
/*
* See the comment below about releasing path before releasing
* node. If the commit of delayed items was successful the path
@@ -1161,7 +1190,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
* point to locked extent buffers (a leaf at the very least).
*/
ASSERT(path->nodes[0] == NULL);
- btrfs_release_delayed_node(prev_node);
+ btrfs_release_delayed_node(prev_node, &prev_delayed_node_tracker);
}
/*
@@ -1174,7 +1203,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
btrfs_free_path(path);
if (curr_node)
- btrfs_release_delayed_node(curr_node);
+ btrfs_release_delayed_node(curr_node, &curr_delayed_node_tracker);
trans->block_rsv = block_rsv;
return ret;
@@ -1193,7 +1222,9 @@ int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr)
int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode)
{
- struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+ struct btrfs_ref_tracker delayed_node_tracker;
+ struct btrfs_delayed_node *delayed_node =
+ btrfs_get_delayed_node(inode, &delayed_node_tracker);
BTRFS_PATH_AUTO_FREE(path);
struct btrfs_block_rsv *block_rsv;
int ret;
@@ -1204,14 +1235,14 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
mutex_lock(&delayed_node->mutex);
if (!delayed_node->count) {
mutex_unlock(&delayed_node->mutex);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return 0;
}
mutex_unlock(&delayed_node->mutex);
path = btrfs_alloc_path();
if (!path) {
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return -ENOMEM;
}
@@ -1220,7 +1251,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
trans->block_rsv = block_rsv;
return ret;
@@ -1230,18 +1261,20 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_trans_handle *trans;
- struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+ struct btrfs_ref_tracker delayed_node_tracker;
+ struct btrfs_delayed_node *delayed_node;
struct btrfs_path *path;
struct btrfs_block_rsv *block_rsv;
int ret;
+ delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
if (!delayed_node)
return 0;
mutex_lock(&delayed_node->mutex);
if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
mutex_unlock(&delayed_node->mutex);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return 0;
}
mutex_unlock(&delayed_node->mutex);
@@ -1275,7 +1308,7 @@ trans_out:
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
out:
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return ret;
}
@@ -1289,7 +1322,8 @@ void btrfs_remove_delayed_node(struct btrfs_inode *inode)
return;
inode->delayed_node = NULL;
- btrfs_release_delayed_node(delayed_node);
+
+ btrfs_release_delayed_node(delayed_node, &delayed_node->inode_cache_tracker);
}
struct btrfs_async_delayed_work {
@@ -1305,6 +1339,7 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work)
struct btrfs_trans_handle *trans;
struct btrfs_path *path;
struct btrfs_delayed_node *delayed_node = NULL;
+ struct btrfs_ref_tracker delayed_node_tracker;
struct btrfs_root *root;
struct btrfs_block_rsv *block_rsv;
int total_done = 0;
@@ -1321,7 +1356,8 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work)
BTRFS_DELAYED_BACKGROUND / 2)
break;
- delayed_node = btrfs_first_prepared_delayed_node(delayed_root);
+ delayed_node = btrfs_first_prepared_delayed_node(delayed_root,
+ &delayed_node_tracker);
if (!delayed_node)
break;
@@ -1330,7 +1366,8 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work)
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
btrfs_release_path(path);
- btrfs_release_prepared_delayed_node(delayed_node);
+ btrfs_release_prepared_delayed_node(delayed_node,
+ &delayed_node_tracker);
total_done++;
continue;
}
@@ -1345,7 +1382,8 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work)
btrfs_btree_balance_dirty_nodelay(root->fs_info);
btrfs_release_path(path);
- btrfs_release_prepared_delayed_node(delayed_node);
+ btrfs_release_prepared_delayed_node(delayed_node,
+ &delayed_node_tracker);
total_done++;
} while ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK)
@@ -1377,10 +1415,15 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info)
{
- struct btrfs_delayed_node *node = btrfs_first_delayed_node(fs_info->delayed_root);
+ struct btrfs_ref_tracker delayed_node_tracker;
+ struct btrfs_delayed_node *node;
- if (WARN_ON(node))
+ node = btrfs_first_delayed_node(fs_info->delayed_root, &delayed_node_tracker);
+ if (WARN_ON(node)) {
+ btrfs_delayed_node_ref_tracker_free(node,
+ &delayed_node_tracker);
refcount_dec(&node->refs);
+ }
}
static bool could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
@@ -1454,13 +1497,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = trans->fs_info;
const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info);
struct btrfs_delayed_node *delayed_node;
+ struct btrfs_ref_tracker delayed_node_tracker;
struct btrfs_delayed_item *delayed_item;
struct btrfs_dir_item *dir_item;
bool reserve_leaf_space;
u32 data_len;
int ret;
- delayed_node = btrfs_get_or_create_delayed_node(dir);
+ delayed_node = btrfs_get_or_create_delayed_node(dir, &delayed_node_tracker);
if (IS_ERR(delayed_node))
return PTR_ERR(delayed_node);
@@ -1536,7 +1580,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
mutex_unlock(&delayed_node->mutex);
release_node:
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return ret;
}
@@ -1591,10 +1635,11 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir, u64 index)
{
struct btrfs_delayed_node *node;
+ struct btrfs_ref_tracker delayed_node_tracker;
struct btrfs_delayed_item *item;
int ret;
- node = btrfs_get_or_create_delayed_node(dir);
+ node = btrfs_get_or_create_delayed_node(dir, &delayed_node_tracker);
if (IS_ERR(node))
return PTR_ERR(node);
@@ -1635,14 +1680,16 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
}
mutex_unlock(&node->mutex);
end:
- btrfs_release_delayed_node(node);
+ btrfs_release_delayed_node(node, &delayed_node_tracker);
return ret;
}
int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode)
{
- struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+ struct btrfs_ref_tracker delayed_node_tracker;
+ struct btrfs_delayed_node *delayed_node;
+ delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
if (!delayed_node)
return -ENOENT;
@@ -1652,12 +1699,12 @@ int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode)
* is updated now. So we needn't lock the delayed node.
*/
if (!delayed_node->index_cnt) {
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return -EINVAL;
}
inode->index_cnt = delayed_node->index_cnt;
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return 0;
}
@@ -1668,8 +1715,9 @@ bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode,
{
struct btrfs_delayed_node *delayed_node;
struct btrfs_delayed_item *item;
+ struct btrfs_ref_tracker delayed_node_tracker;
- delayed_node = btrfs_get_delayed_node(inode);
+ delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
if (!delayed_node)
return false;
@@ -1704,6 +1752,7 @@ bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode,
* insert/delete delayed items in this period. So we also needn't
* requeue or dequeue this delayed node.
*/
+ btrfs_delayed_node_ref_tracker_free(delayed_node, &delayed_node_tracker);
refcount_dec(&delayed_node->refs);
return true;
@@ -1843,19 +1892,19 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_delayed_node *delayed_node;
+ struct btrfs_ref_tracker delayed_node_tracker;
struct btrfs_inode_item *inode_item;
struct inode *vfs_inode = &inode->vfs_inode;
- delayed_node = btrfs_get_delayed_node(inode);
+ delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
if (!delayed_node)
return -ENOENT;
mutex_lock(&delayed_node->mutex);
if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
mutex_unlock(&delayed_node->mutex);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return -ENOENT;
}
@@ -1864,8 +1913,6 @@ int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev)
i_uid_write(vfs_inode, btrfs_stack_inode_uid(inode_item));
i_gid_write(vfs_inode, btrfs_stack_inode_gid(inode_item));
btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item));
- btrfs_inode_set_file_extent_range(inode, 0,
- round_up(i_size_read(vfs_inode), fs_info->sectorsize));
vfs_inode->i_mode = btrfs_stack_inode_mode(inode_item);
set_nlink(vfs_inode, btrfs_stack_inode_nlink(inode_item));
inode_set_bytes(vfs_inode, btrfs_stack_inode_nbytes(inode_item));
@@ -1895,7 +1942,7 @@ int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev)
inode->index_cnt = (u64)-1;
mutex_unlock(&delayed_node->mutex);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return 0;
}
@@ -1904,9 +1951,10 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
{
struct btrfs_root *root = inode->root;
struct btrfs_delayed_node *delayed_node;
+ struct btrfs_ref_tracker delayed_node_tracker;
int ret = 0;
- delayed_node = btrfs_get_or_create_delayed_node(inode);
+ delayed_node = btrfs_get_or_create_delayed_node(inode, &delayed_node_tracker);
if (IS_ERR(delayed_node))
return PTR_ERR(delayed_node);
@@ -1926,7 +1974,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
atomic_inc(&root->fs_info->delayed_root->items);
release_node:
mutex_unlock(&delayed_node->mutex);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return ret;
}
@@ -1934,6 +1982,7 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_delayed_node *delayed_node;
+ struct btrfs_ref_tracker delayed_node_tracker;
/*
* we don't do delayed inode updates during log recovery because it
@@ -1943,7 +1992,7 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode)
if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
return -EAGAIN;
- delayed_node = btrfs_get_or_create_delayed_node(inode);
+ delayed_node = btrfs_get_or_create_delayed_node(inode, &delayed_node_tracker);
if (IS_ERR(delayed_node))
return PTR_ERR(delayed_node);
@@ -1970,7 +2019,7 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode)
atomic_inc(&fs_info->delayed_root->items);
release_node:
mutex_unlock(&delayed_node->mutex);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return 0;
}
@@ -2014,19 +2063,21 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
{
struct btrfs_delayed_node *delayed_node;
+ struct btrfs_ref_tracker delayed_node_tracker;
- delayed_node = btrfs_get_delayed_node(inode);
+ delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
if (!delayed_node)
return;
__btrfs_kill_delayed_node(delayed_node);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
}
void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
{
unsigned long index = 0;
struct btrfs_delayed_node *delayed_nodes[8];
+ struct btrfs_ref_tracker delayed_node_trackers[8];
while (1) {
struct btrfs_delayed_node *node;
@@ -2045,6 +2096,9 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
* about to be removed from the tree in the loop below
*/
if (refcount_inc_not_zero(&node->refs)) {
+ btrfs_delayed_node_ref_tracker_alloc(node,
+ &delayed_node_trackers[count],
+ GFP_ATOMIC);
delayed_nodes[count] = node;
count++;
}
@@ -2056,7 +2110,9 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
for (int i = 0; i < count; i++) {
__btrfs_kill_delayed_node(delayed_nodes[i]);
- btrfs_release_delayed_node(delayed_nodes[i]);
+ btrfs_release_delayed_node(delayed_nodes[i],
+ &delayed_node_trackers[i]);
+ btrfs_delayed_node_ref_tracker_dir_print(delayed_nodes[i]);
}
}
}
@@ -2064,14 +2120,17 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info)
{
struct btrfs_delayed_node *curr_node, *prev_node;
+ struct btrfs_ref_tracker curr_delayed_node_tracker, prev_delayed_node_tracker;
- curr_node = btrfs_first_delayed_node(fs_info->delayed_root);
+ curr_node = btrfs_first_delayed_node(fs_info->delayed_root,
+ &curr_delayed_node_tracker);
while (curr_node) {
__btrfs_kill_delayed_node(curr_node);
prev_node = curr_node;
- curr_node = btrfs_next_delayed_node(curr_node);
- btrfs_release_delayed_node(prev_node);
+ prev_delayed_node_tracker = curr_delayed_node_tracker;
+ curr_node = btrfs_next_delayed_node(curr_node, &curr_delayed_node_tracker);
+ btrfs_release_delayed_node(prev_node, &prev_delayed_node_tracker);
}
}
@@ -2081,8 +2140,9 @@ void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
{
struct btrfs_delayed_node *node;
struct btrfs_delayed_item *item;
+ struct btrfs_ref_tracker delayed_node_tracker;
- node = btrfs_get_delayed_node(inode);
+ node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
if (!node)
return;
@@ -2140,6 +2200,7 @@ void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
* delete delayed items.
*/
ASSERT(refcount_read(&node->refs) > 1);
+ btrfs_delayed_node_ref_tracker_free(node, &delayed_node_tracker);
refcount_dec(&node->refs);
}
@@ -2150,8 +2211,9 @@ void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
struct btrfs_delayed_node *node;
struct btrfs_delayed_item *item;
struct btrfs_delayed_item *next;
+ struct btrfs_ref_tracker delayed_node_tracker;
- node = btrfs_get_delayed_node(inode);
+ node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
if (!node)
return;
@@ -2183,5 +2245,6 @@ void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
* delete delayed items.
*/
ASSERT(refcount_read(&node->refs) > 1);
+ btrfs_delayed_node_ref_tracker_free(node, &delayed_node_tracker);
refcount_dec(&node->refs);
}
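
Throughout delayed-inode.c the change follows one discipline: every place that takes a delayed node reference now also owns a struct btrfs_ref_tracker, and the matching release hands that same tracker back before dropping the refcount. A self-contained userspace analogue (not kernel code, with hypothetical node_get()/node_put() helpers standing in for btrfs_get_delayed_node() and btrfs_release_delayed_node()) sketches why a leaked reference then shows up as an unfreed tracker slot:

/*
 * Userspace analogue of the reference/tracker pairing used above: each taken
 * reference records a token, the matching release must return the same token,
 * so a missing release leaves a token allocated and easy to report.
 */
#include <stdio.h>
#include <stdbool.h>

#define MAX_REFS 8

struct node {
	int refs;
	bool token_used[MAX_REFS];	/* stand-in for ref_tracker slots */
};

/* Hypothetical analogue of btrfs_get_delayed_node(): take a ref, record a token. */
static int node_get(struct node *n)
{
	for (int i = 0; i < MAX_REFS; i++) {
		if (!n->token_used[i]) {
			n->token_used[i] = true;
			n->refs++;
			return i;
		}
	}
	return -1;
}

/* Hypothetical analogue of btrfs_release_delayed_node(): free the token, drop the ref. */
static void node_put(struct node *n, int token)
{
	n->token_used[token] = false;
	n->refs--;
}

int main(void)
{
	struct node n = { 0 };
	int t1 = node_get(&n);

	node_get(&n);		/* second reference, deliberately never released */
	node_put(&n, t1);

	for (int i = 0; i < MAX_REFS; i++)
		if (n.token_used[i])
			printf("leaked reference, token %d (refs=%d)\n", i, n.refs);
	return 0;
}
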
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index e6e763ad2d42..0d949edc0caf 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -16,6 +16,7 @@
#include <linux/fs.h>
#include <linux/atomic.h>
#include <linux/refcount.h>
+#include <linux/ref_tracker.h>
#include "ctree.h"
struct btrfs_disk_key;
@@ -44,6 +45,22 @@ struct btrfs_delayed_root {
wait_queue_head_t wait;
};
+struct btrfs_ref_tracker_dir {
+#ifdef CONFIG_BTRFS_DEBUG
+ struct ref_tracker_dir dir;
+#else
+ struct {} tracker;
+#endif
+};
+
+struct btrfs_ref_tracker {
+#ifdef CONFIG_BTRFS_DEBUG
+ struct ref_tracker *tracker;
+#else
+ struct {} tracker;
+#endif
+};
+
#define BTRFS_DELAYED_NODE_IN_LIST 0
#define BTRFS_DELAYED_NODE_INODE_DIRTY 1
#define BTRFS_DELAYED_NODE_DEL_IREF 2
@@ -78,6 +95,12 @@ struct btrfs_delayed_node {
* actual number of leaves we end up using. Protected by @mutex.
*/
u32 index_item_leaves;
+ /* Track all references to this delayed node. */
+ struct btrfs_ref_tracker_dir ref_dir;
+ /* Track delayed node reference stored in node list. */
+ struct btrfs_ref_tracker node_list_tracker;
+ /* Track delayed node reference stored in inode cache. */
+ struct btrfs_ref_tracker inode_cache_tracker;
};
struct btrfs_delayed_item {
@@ -169,4 +192,74 @@ void __cold btrfs_delayed_inode_exit(void);
/* for debugging */
void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info);
+#define BTRFS_DELAYED_NODE_REF_TRACKER_QUARANTINE_COUNT 16
+#define BTRFS_DELAYED_NODE_REF_TRACKER_DISPLAY_LIMIT 16
+
+#ifdef CONFIG_BTRFS_DEBUG
+static inline void btrfs_delayed_node_ref_tracker_dir_init(struct btrfs_delayed_node *node)
+{
+ if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER))
+ return;
+
+ ref_tracker_dir_init(&node->ref_dir.dir,
+ BTRFS_DELAYED_NODE_REF_TRACKER_QUARANTINE_COUNT,
+ "delayed_node");
+}
+
+static inline void btrfs_delayed_node_ref_tracker_dir_exit(struct btrfs_delayed_node *node)
+{
+ if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER))
+ return;
+
+ ref_tracker_dir_exit(&node->ref_dir.dir);
+}
+
+static inline void btrfs_delayed_node_ref_tracker_dir_print(struct btrfs_delayed_node *node)
+{
+ if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER))
+ return;
+
+ ref_tracker_dir_print(&node->ref_dir.dir,
+ BTRFS_DELAYED_NODE_REF_TRACKER_DISPLAY_LIMIT);
+}
+
+static inline int btrfs_delayed_node_ref_tracker_alloc(struct btrfs_delayed_node *node,
+ struct btrfs_ref_tracker *tracker,
+ gfp_t gfp)
+{
+ if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER))
+ return 0;
+
+ return ref_tracker_alloc(&node->ref_dir.dir, &tracker->tracker, gfp);
+}
+
+static inline int btrfs_delayed_node_ref_tracker_free(struct btrfs_delayed_node *node,
+ struct btrfs_ref_tracker *tracker)
+{
+ if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER))
+ return 0;
+
+ return ref_tracker_free(&node->ref_dir.dir, &tracker->tracker);
+}
+#else
+static inline void btrfs_delayed_node_ref_tracker_dir_init(struct btrfs_delayed_node *node) { }
+
+static inline void btrfs_delayed_node_ref_tracker_dir_exit(struct btrfs_delayed_node *node) { }
+
+static inline void btrfs_delayed_node_ref_tracker_dir_print(struct btrfs_delayed_node *node) { }
+
+static inline int btrfs_delayed_node_ref_tracker_alloc(struct btrfs_delayed_node *node,
+ struct btrfs_ref_tracker *tracker,
+ gfp_t gfp)
+{
+ return 0;
+}
+
+static inline int btrfs_delayed_node_ref_tracker_free(struct btrfs_delayed_node *node,
+ struct btrfs_ref_tracker *tracker)
+{
+ return 0;
+}
+#endif
+
#endif
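
When CONFIG_BTRFS_DEBUG is off, the wrapper structs above keep only an anonymous empty struct member and the helpers become empty inline stubs, so the tracking adds no runtime or memory cost to production builds. A small standalone sketch (assuming a GNU-C compiler, as the kernel requires; the struct name mirrors the patch but the program is illustrative only) shows that such a wrapper occupies zero bytes:

/* Standalone check of the zero-size trick used by the !CONFIG_BTRFS_DEBUG case. */
#include <stdio.h>

struct btrfs_ref_tracker {
	struct {} tracker;	/* GNU C extension: an empty struct has size 0 */
};

struct example {
	unsigned long refs;
	struct btrfs_ref_tracker node_list_tracker;	/* adds no bytes */
};

int main(void)
{
	/* Prints 0 for the tracker; example stays the size of its refs member. */
	printf("tracker size: %zu, example size: %zu\n",
	       sizeof(struct btrfs_ref_tracker), sizeof(struct example));
	return 0;
}
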
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index ca382c5b186f..481802efaa14 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -895,7 +895,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
}
/*
- * Initialize the structure which represents a modification to a an extent.
+ * Initialize the structure which represents a modification to an extent.
*
* @fs_info: Internal to the mounted filesystem mount structure.
*
@@ -952,7 +952,7 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root,
bool skip_qgroup)
{
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+#ifdef CONFIG_BTRFS_DEBUG
/* If @real_root not set, use @root as fallback */
generic_ref->real_root = mod_root ?: generic_ref->ref_root;
#endif
@@ -969,7 +969,7 @@ void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root,
void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset,
u64 mod_root, bool skip_qgroup)
{
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+#ifdef CONFIG_BTRFS_DEBUG
/* If @real_root not set, use @root as fallback */
generic_ref->real_root = mod_root ?: generic_ref->ref_root;
#endif
@@ -1251,7 +1251,6 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
{
struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs;
struct btrfs_fs_info *fs_info = trans->fs_info;
- bool testing = btrfs_is_testing(fs_info);
spin_lock(&delayed_refs->lock);
while (true) {
@@ -1281,7 +1280,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
spin_unlock(&delayed_refs->lock);
mutex_unlock(&head->mutex);
- if (!testing && pin_bytes) {
+ if (!btrfs_is_testing(fs_info) && pin_bytes) {
struct btrfs_block_group *bg;
bg = btrfs_lookup_block_group(fs_info, head->bytenr);
@@ -1312,14 +1311,14 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
btrfs_error_unpin_extent_range(fs_info, head->bytenr,
head->bytenr + head->num_bytes - 1);
}
- if (!testing)
+ if (!btrfs_is_testing(fs_info))
btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
btrfs_put_delayed_ref_head(head);
cond_resched();
spin_lock(&delayed_refs->lock);
}
- if (!testing)
+ if (!btrfs_is_testing(fs_info))
btrfs_qgroup_destroy_extent_records(trans);
spin_unlock(&delayed_refs->lock);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 552ec4fa645d..5ce940532144 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -276,10 +276,6 @@ struct btrfs_ref {
*/
bool skip_qgroup;
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
- /* Through which root is this modification. */
- u64 real_root;
-#endif
u64 bytenr;
u64 num_bytes;
u64 owning_root;
@@ -296,6 +292,11 @@ struct btrfs_ref {
struct btrfs_data_ref data_ref;
struct btrfs_tree_ref tree_ref;
};
+
+#ifdef CONFIG_BTRFS_DEBUG
+ /* Through which root is this modification. */
+ u64 real_root;
+#endif
};
extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 4675bcd5f92e..a4eaef60549e 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -98,7 +98,7 @@ no_valid_dev_replace_entry_found:
* We don't have a replace item or it's corrupted. If there is
* a replace target, fail the mount.
*/
- if (btrfs_find_device(fs_info->fs_devices, &args)) {
+ if (unlikely(btrfs_find_device(fs_info->fs_devices, &args))) {
btrfs_err(fs_info,
"found replace target device without a valid replace item");
return -EUCLEAN;
@@ -158,7 +158,7 @@ no_valid_dev_replace_entry_found:
* We don't have an active replace item but if there is a
* replace target, fail the mount.
*/
- if (btrfs_find_device(fs_info->fs_devices, &args)) {
+ if (unlikely(btrfs_find_device(fs_info->fs_devices, &args))) {
btrfs_err(fs_info,
"replace without active item, run 'device scan --forget' on the target device");
ret = -EUCLEAN;
@@ -177,8 +177,7 @@ no_valid_dev_replace_entry_found:
* allow 'btrfs dev replace_cancel' if src/tgt device is
* missing
*/
- if (!dev_replace->srcdev &&
- !btrfs_test_opt(fs_info, DEGRADED)) {
+ if (unlikely(!dev_replace->srcdev && !btrfs_test_opt(fs_info, DEGRADED))) {
ret = -EIO;
btrfs_warn(fs_info,
"cannot mount because device replace operation is ongoing and");
@@ -186,8 +185,7 @@ no_valid_dev_replace_entry_found:
"srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
src_devid);
}
- if (!dev_replace->tgtdev &&
- !btrfs_test_opt(fs_info, DEGRADED)) {
+ if (unlikely(!dev_replace->tgtdev && !btrfs_test_opt(fs_info, DEGRADED))) {
ret = -EIO;
btrfs_warn(fs_info,
"cannot mount because device replace operation is ongoing and");
@@ -637,7 +635,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
- DEBUG_WARN("unexpected STARTED ot SUSPENDED dev-replace state");
+ DEBUG_WARN("unexpected STARTED or SUSPENDED dev-replace state");
ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
up_write(&dev_replace->rwsem);
goto leave;
diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c
index fe9a4bd7e6e6..802d4dbe5b38 100644
--- a/fs/btrfs/direct-io.c
+++ b/fs/btrfs/direct-io.c
@@ -786,6 +786,18 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
if (iov_iter_alignment(iter) & blocksize_mask)
return -EINVAL;
+ /*
+ * For bs > ps support, we heavily rely on large folios to make sure no
+ * block will cross large folio boundaries.
+ *
+ * But memory provided by direct IO is only virtually contiguous, not
+ * physically contiguous, and will break the btrfs' large folio requirement.
+ *
+ * So for bs > ps support, all direct IOs should fallback to buffered ones.
+ */
+ if (fs_info->sectorsize > PAGE_SIZE)
+ return -EINVAL;
+
return 0;
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 70fc4e7cc5a0..9247a58894de 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -116,7 +116,7 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result)
* detect blocks that either didn't get written at all or got written
* in the wrong place.
*/
-int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, int atomic)
+int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, bool atomic)
{
if (!extent_buffer_uptodate(eb))
return 0;
@@ -370,21 +370,21 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
ASSERT(check);
found_start = btrfs_header_bytenr(eb);
- if (found_start != eb->start) {
+ if (unlikely(found_start != eb->start)) {
btrfs_err_rl(fs_info,
"bad tree block start, mirror %u want %llu have %llu",
eb->read_mirror, eb->start, found_start);
ret = -EIO;
goto out;
}
- if (check_tree_block_fsid(eb)) {
+ if (unlikely(check_tree_block_fsid(eb))) {
btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u",
eb->start, eb->read_mirror);
ret = -EIO;
goto out;
}
found_level = btrfs_header_level(eb);
- if (found_level >= BTRFS_MAX_LEVEL) {
+ if (unlikely(found_level >= BTRFS_MAX_LEVEL)) {
btrfs_err(fs_info,
"bad tree block level, mirror %u level %d on logical %llu",
eb->read_mirror, btrfs_header_level(eb), eb->start);
@@ -404,13 +404,13 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
CSUM_FMT_VALUE(csum_size, result),
btrfs_header_level(eb),
ignore_csum ? ", ignored" : "");
- if (!ignore_csum) {
+ if (unlikely(!ignore_csum)) {
ret = -EUCLEAN;
goto out;
}
}
- if (found_level != check->level) {
+ if (unlikely(found_level != check->level)) {
btrfs_err(fs_info,
"level verify failed on logical %llu mirror %u wanted %u found %u",
eb->start, eb->read_mirror, check->level, found_level);
@@ -639,7 +639,6 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
u64 objectid, gfp_t flags)
{
struct btrfs_root *root;
- bool dummy = btrfs_is_testing(fs_info);
root = kzalloc(sizeof(*root), flags);
if (!root)
@@ -696,7 +695,7 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
root->log_transid_committed = -1;
btrfs_set_root_last_log_commit(root, 0);
root->anon_dev = 0;
- if (!dummy) {
+ if (!btrfs_is_testing(fs_info)) {
btrfs_extent_io_tree_init(fs_info, &root->dirty_log_pages,
IO_TREE_ROOT_DIRTY_LOG_PAGES);
btrfs_extent_io_tree_init(fs_info, &root->log_csum_range,
@@ -1047,7 +1046,7 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
root->node = NULL;
goto fail;
}
- if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
+ if (unlikely(!btrfs_buffer_uptodate(root->node, generation, false))) {
ret = -EIO;
goto fail;
}
@@ -1056,10 +1055,10 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
* For real fs, and not log/reloc trees, root owner must
* match its root node owner
*/
- if (!btrfs_is_testing(fs_info) &&
- btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
- btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
- btrfs_root_id(root) != btrfs_header_owner(root->node)) {
+ if (unlikely(!btrfs_is_testing(fs_info) &&
+ btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
+ btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
+ btrfs_root_id(root) != btrfs_header_owner(root->node))) {
btrfs_crit(fs_info,
"root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu",
btrfs_root_id(root), root->node->start,
@@ -1248,6 +1247,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
if (fs_info->fs_devices)
btrfs_close_devices(fs_info->fs_devices);
+ btrfs_free_compress_wsm(fs_info);
percpu_counter_destroy(&fs_info->stats_read_blocks);
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes);
@@ -1958,7 +1958,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
{
u32 max_active = fs_info->thread_pool_size;
unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
- unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE;
+ unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU;
fs_info->workers =
btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16);
@@ -2058,7 +2058,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
u64 bytenr = btrfs_super_log_root(disk_super);
int level = btrfs_super_log_root_level(disk_super);
- if (fs_devices->rw_devices == 0) {
+ if (unlikely(fs_devices->rw_devices == 0)) {
btrfs_warn(fs_info, "log replay required on RO media");
return -EIO;
}
@@ -2079,7 +2079,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
btrfs_put_root(log_tree_root);
return ret;
}
- if (!extent_buffer_uptodate(log_tree_root->node)) {
+ if (unlikely(!extent_buffer_uptodate(log_tree_root->node))) {
btrfs_err(fs_info, "failed to read log tree");
btrfs_put_root(log_tree_root);
return -EIO;
@@ -2087,10 +2087,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
/* returns with log_tree_root freed on success */
ret = btrfs_recover_log_trees(log_tree_root);
+ btrfs_put_root(log_tree_root);
if (ret) {
btrfs_handle_fs_error(fs_info, ret,
"Failed to recover log tree");
- btrfs_put_root(log_tree_root);
return ret;
}
@@ -2324,7 +2324,7 @@ static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info,
const u32 sectorsize = btrfs_super_sectorsize(sb);
u32 sys_array_size = btrfs_super_sys_array_size(sb);
- if (sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
+ if (unlikely(sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)) {
btrfs_err(fs_info, "system chunk array too big %u > %u",
sys_array_size, BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
return -EUCLEAN;
@@ -2342,12 +2342,12 @@ static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info,
disk_key = (struct btrfs_disk_key *)(sb->sys_chunk_array + cur);
len = sizeof(*disk_key);
- if (cur + len > sys_array_size)
+ if (unlikely(cur + len > sys_array_size))
goto short_read;
cur += len;
btrfs_disk_key_to_cpu(&key, disk_key);
- if (key.type != BTRFS_CHUNK_ITEM_KEY) {
+ if (unlikely(key.type != BTRFS_CHUNK_ITEM_KEY)) {
btrfs_err(fs_info,
"unexpected item type %u in sys_array at offset %u",
key.type, cur);
@@ -2355,10 +2355,10 @@ static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info,
}
chunk = (struct btrfs_chunk *)(sb->sys_chunk_array + cur);
num_stripes = btrfs_stack_chunk_num_stripes(chunk);
- if (cur + btrfs_chunk_item_size(num_stripes) > sys_array_size)
+ if (unlikely(cur + btrfs_chunk_item_size(num_stripes) > sys_array_size))
goto short_read;
type = btrfs_stack_chunk_type(chunk);
- if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) {
+ if (unlikely(!(type & BTRFS_BLOCK_GROUP_SYSTEM))) {
btrfs_err(fs_info,
"invalid chunk type %llu in sys_array at offset %u",
type, cur);
@@ -2438,21 +2438,7 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
- /*
- * We only support at most 3 sectorsizes: 4K, PAGE_SIZE, MIN_BLOCKSIZE.
- *
- * For 4K page sized systems with non-debug builds, all 3 matches (4K).
- * For 4K page sized systems with debug builds, there are two block sizes
- * supported. (4K and 2K)
- *
- * We can support 16K sectorsize with 64K page size without problem,
- * but such sectorsize/pagesize combination doesn't make much sense.
- * 4K will be our future standard, PAGE_SIZE is supported from the very
- * beginning.
- */
- if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K &&
- sectorsize != PAGE_SIZE &&
- sectorsize != BTRFS_MIN_BLOCKSIZE)) {
+ if (!btrfs_supported_blocksize(sectorsize)) {
btrfs_err(fs_info,
"sectorsize %llu not yet supported for page size %lu",
sectorsize, PAGE_SIZE);
@@ -2619,13 +2605,13 @@ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
ret = btrfs_validate_super(fs_info, sb, -1);
if (ret < 0)
goto out;
- if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
+ if (unlikely(!btrfs_supported_super_csum(btrfs_super_csum_type(sb)))) {
ret = -EUCLEAN;
btrfs_err(fs_info, "invalid csum type, has %u want %u",
btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
goto out;
}
- if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
+ if (unlikely(btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP)) {
ret = -EUCLEAN;
btrfs_err(fs_info,
"invalid incompat flags, has 0x%llx valid mask 0x%llx",
@@ -2655,7 +2641,7 @@ static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int lev
root->node = NULL;
return ret;
}
- if (!extent_buffer_uptodate(root->node)) {
+ if (unlikely(!extent_buffer_uptodate(root->node))) {
free_extent_buffer(root->node);
root->node = NULL;
return -EIO;
@@ -3256,18 +3242,24 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount)
}
/*
- * Subpage runtime limitation on v1 cache.
+ * Subpage/bs > ps runtime limitation on v1 cache.
*
- * V1 space cache still has some hard codeed PAGE_SIZE usage, while
+ * V1 space cache still has some hard coded PAGE_SIZE usage, while
* we're already defaulting to v2 cache, no need to bother v1 as it's
* going to be deprecated anyway.
*/
- if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
+ if (fs_info->sectorsize != PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
btrfs_warn(fs_info,
"v1 space cache is not supported for page size %lu with sectorsize %u",
PAGE_SIZE, fs_info->sectorsize);
return -EINVAL;
}
+ if (fs_info->sectorsize > PAGE_SIZE && btrfs_fs_incompat(fs_info, RAID56)) {
+ btrfs_err(fs_info,
+ "RAID56 is not supported for page size %lu with sectorsize %u",
+ PAGE_SIZE, fs_info->sectorsize);
+ return -EINVAL;
+ }
/* This can be called by remount, we need to protect the super block. */
spin_lock(&fs_info->super_lock);
@@ -3396,10 +3388,16 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
fs_info->nodesize_bits = ilog2(nodesize);
fs_info->sectorsize = sectorsize;
fs_info->sectorsize_bits = ilog2(sectorsize);
+ fs_info->block_min_order = ilog2(round_up(sectorsize, PAGE_SIZE) >> PAGE_SHIFT);
+ fs_info->block_max_order = ilog2((BITS_PER_LONG << fs_info->sectorsize_bits) >> PAGE_SHIFT);
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
fs_info->stripesize = stripesize;
fs_info->fs_devices->fs_info = fs_info;
+ if (fs_info->sectorsize > PAGE_SIZE)
+ btrfs_warn(fs_info,
+ "support for block size %u with page size %zu is experimental, some features may be missing",
+ fs_info->sectorsize, PAGE_SIZE);
/*
* Handle the space caching options appropriately now that we have the
* super block loaded and validated.
@@ -3421,6 +3419,9 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
*/
fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize);
+ ret = btrfs_alloc_compress_wsm(fs_info);
+ if (ret)
+ goto fail_sb_buffer;
ret = btrfs_init_workqueues(fs_info);
if (ret)
goto fail_sb_buffer;
@@ -3468,7 +3469,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
* below in btrfs_init_dev_replace().
*/
btrfs_free_extra_devids(fs_devices);
- if (!fs_devices->latest_dev->bdev) {
+ if (unlikely(!fs_devices->latest_dev->bdev)) {
btrfs_err(fs_info, "failed to read devices");
ret = -EIO;
goto fail_tree_roots;
@@ -3962,7 +3963,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
* Checks last_flush_error of disks in order to determine the device
* state.
*/
- if (errors_wait && !btrfs_check_rw_degradable(info, NULL))
+ if (unlikely(errors_wait && !btrfs_check_rw_degradable(info, NULL)))
return -EIO;
return 0;
@@ -4064,7 +4065,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
ret = btrfs_validate_write_super(fs_info, sb);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
btrfs_handle_fs_error(fs_info, -EUCLEAN,
"unexpected superblock corruption detected");
@@ -4075,7 +4076,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
if (ret)
total_errors++;
}
- if (total_errors > max_errors) {
+ if (unlikely(total_errors > max_errors)) {
btrfs_err(fs_info, "%d errors while writing supers",
total_errors);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -4100,7 +4101,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
total_errors++;
}
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- if (total_errors > max_errors) {
+ if (unlikely(total_errors > max_errors)) {
btrfs_handle_fs_error(fs_info, -EIO,
"%d errors while writing supers",
total_errors);
@@ -4880,7 +4881,7 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root)
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
return ret;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* Key with offset -1 found, there would have to exist a root
* with such id, but this is out of valid range.
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 864a55a96226..57920f2c6fe4 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -106,8 +106,7 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
void btrfs_put_root(struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
struct extent_buffer *buf);
-int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
- int atomic);
+int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, bool atomic);
int btrfs_read_extent_buffer(struct extent_buffer *buf,
const struct btrfs_tree_parent_check *check);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 7fc8a3200b40..d062ac521051 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -174,7 +174,7 @@ struct dentry *btrfs_get_parent(struct dentry *child)
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto fail;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* Key with offset of -1 found, there would have to exist an
* inode with such number or a root with such id.
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index 66361325f6dc..bb2ca1c9c7b0 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -1237,7 +1237,7 @@ hit_next:
state = next_search_state(inserted_state, end);
/*
* If there's a next state, whether contiguous or not, we don't
- * need to unlock and start search agian. If it's not contiguous
+ * need to unlock and start search again. If it's not contiguous
* we will end up here and try to allocate a prealloc state and insert.
*/
if (state)
@@ -1664,7 +1664,7 @@ out:
*/
u64 btrfs_count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end, u64 max_bytes,
- u32 bits, int contig,
+ u32 bits, bool contig,
struct extent_state **cached_state)
{
struct extent_state *state = NULL;
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index 36facca37973..6f07b965e8da 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -163,7 +163,7 @@ void __cold btrfs_extent_state_free_cachep(void);
u64 btrfs_count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end,
- u64 max_bytes, u32 bits, int contig,
+ u64 max_bytes, u32 bits, bool contig,
struct extent_state **cached_state);
void btrfs_free_extent_state(struct extent_state *state);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 97d517cdf2df..dc4ca98c3780 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -325,7 +325,7 @@ search_again:
/*
* is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
- * is_data == BTRFS_REF_TYPE_DATA, data type is requiried,
+ * is_data == BTRFS_REF_TYPE_DATA, data type is required,
* is_data == BTRFS_REF_TYPE_ANY, either type is OK.
*/
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
@@ -879,7 +879,7 @@ again:
ptr += btrfs_extent_inline_ref_size(type);
continue;
}
- if (type == BTRFS_REF_TYPE_INVALID) {
+ if (unlikely(type == BTRFS_REF_TYPE_INVALID)) {
ret = -EUCLEAN;
goto out;
}
@@ -1210,7 +1210,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
* We're adding refs to a tree block we already own, this
* should not happen at all.
*/
- if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ if (unlikely(owner < BTRFS_FIRST_FREE_OBJECTID)) {
btrfs_print_leaf(path->nodes[0]);
btrfs_crit(trans->fs_info,
"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u",
@@ -2157,7 +2157,7 @@ again:
delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
ret = __btrfs_run_delayed_refs(trans, min_bytes);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -2355,7 +2355,7 @@ static noinline int check_committed_ref(struct btrfs_inode *inode,
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* Key with offset -1 found, there would have to exist an extent
* item with such offset, but this is out of the valid range.
@@ -2457,7 +2457,7 @@ out:
static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
- int full_backref, int inc)
+ bool full_backref, bool inc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 parent;
@@ -2543,15 +2543,15 @@ fail:
}
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref)
+ struct extent_buffer *buf, bool full_backref)
{
- return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
+ return __btrfs_mod_ref(trans, root, buf, full_backref, true);
}
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref)
+ struct extent_buffer *buf, bool full_backref)
{
- return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
+ return __btrfs_mod_ref(trans, root, buf, full_backref, false);
}
static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
@@ -2760,7 +2760,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
btrfs_put_block_group(cache);
total_unpinned = 0;
cache = btrfs_lookup_block_group(fs_info, start);
- if (cache == NULL) {
+ if (unlikely(cache == NULL)) {
/* Logic error, something removed the block group. */
ret = -EUCLEAN;
goto out;
@@ -2982,26 +2982,26 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
csum_root = btrfs_csum_root(trans->fs_info, bytenr);
ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
}
ret = btrfs_record_squota_delta(trans->fs_info, delta);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -3115,7 +3115,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
- if (!is_data && refs_to_drop != 1) {
+ if (unlikely(!is_data && refs_to_drop != 1)) {
btrfs_crit(info,
"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
node->bytenr, refs_to_drop);
@@ -3162,7 +3162,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
if (!found_extent) {
- if (iref) {
+ if (unlikely(iref)) {
abort_and_dump(trans, path,
"invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref",
path->slots[0]);
@@ -3172,7 +3172,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
/* Must be SHARED_* item, remove the backref first */
ret = remove_extent_backref(trans, extent_root, path,
NULL, refs_to_drop, is_data);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3221,7 +3221,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
"umm, got %d back from search, was looking for %llu, slot %d",
ret, bytenr, path->slots[0]);
}
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3254,7 +3254,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
key.type == BTRFS_EXTENT_ITEM_KEY) {
struct btrfs_tree_block_info *bi;
- if (item_size < sizeof(*ei) + sizeof(*bi)) {
+ if (unlikely(item_size < sizeof(*ei) + sizeof(*bi))) {
abort_and_dump(trans, path,
"invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu",
key.objectid, key.type, key.offset,
@@ -3268,7 +3268,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
refs = btrfs_extent_refs(leaf, ei);
- if (refs < refs_to_drop) {
+ if (unlikely(refs < refs_to_drop)) {
abort_and_dump(trans, path,
"trying to drop %d refs but we only have %llu for bytenr %llu slot %u",
refs_to_drop, refs, bytenr, path->slots[0]);
@@ -3285,7 +3285,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
* be updated by remove_extent_backref
*/
if (iref) {
- if (!found_extent) {
+ if (unlikely(!found_extent)) {
abort_and_dump(trans, path,
"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u",
path->slots[0]);
@@ -3298,7 +3298,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (found_extent) {
ret = remove_extent_backref(trans, extent_root, path,
iref, refs_to_drop, is_data);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3314,8 +3314,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
/* In this branch refs == 1 */
if (found_extent) {
- if (is_data && refs_to_drop !=
- extent_data_ref_count(path, iref)) {
+ if (unlikely(is_data && refs_to_drop !=
+ extent_data_ref_count(path, iref))) {
abort_and_dump(trans, path,
"invalid refs_to_drop, current refs %u refs_to_drop %u slot %u",
extent_data_ref_count(path, iref),
@@ -3324,7 +3324,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
goto out;
}
if (iref) {
- if (path->slots[0] != extent_slot) {
+ if (unlikely(path->slots[0] != extent_slot)) {
abort_and_dump(trans, path,
"invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref",
key.objectid, key.type,
@@ -3339,7 +3339,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
* | extent_slot ||extent_slot + 1|
* [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
*/
- if (path->slots[0] != extent_slot + 1) {
+ if (unlikely(path->slots[0] != extent_slot + 1)) {
abort_and_dump(trans, path,
"invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM",
path->slots[0]);
@@ -3363,7 +3363,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -4297,7 +4297,8 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
}
static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
- struct find_free_extent_ctl *ffe_ctl)
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_space_info *space_info)
{
if (ffe_ctl->for_treelog) {
spin_lock(&fs_info->treelog_bg_lock);
@@ -4315,12 +4316,13 @@ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
spin_lock(&fs_info->zone_active_bgs_lock);
list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
/*
- * No lock is OK here because avail is monotinically
+ * No lock is OK here because avail is monotonically
* decreasing, and this is just a hint.
*/
u64 avail = block_group->zone_capacity - block_group->alloc_offset;
if (block_group_bits(block_group, ffe_ctl->flags) &&
+ block_group->space_info == space_info &&
avail >= ffe_ctl->num_bytes) {
ffe_ctl->hint_byte = block_group->start;
break;
@@ -4342,7 +4344,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
return prepare_allocation_clustered(fs_info, ffe_ctl,
space_info, ins);
case BTRFS_EXTENT_ALLOC_ZONED:
- return prepare_allocation_zoned(fs_info, ffe_ctl);
+ return prepare_allocation_zoned(fs_info, ffe_ctl, space_info);
default:
BUG();
}
@@ -5061,7 +5063,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (IS_ERR(buf))
return buf;
- if (check_eb_lock_owner(buf)) {
+ if (unlikely(check_eb_lock_owner(buf))) {
free_extent_buffer(buf);
return ERR_PTR(-EUCLEAN);
}
@@ -5470,17 +5472,17 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
if (!(wc->flags[level] & flag)) {
ASSERT(path->locks[level]);
ret = btrfs_inc_ref(trans, root, eb, 1);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
ret = btrfs_dec_ref(trans, root, eb, 0);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
ret = btrfs_set_disk_extent_flags(trans, eb, flag);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -5582,7 +5584,7 @@ static int check_next_block_uptodate(struct btrfs_trans_handle *trans,
generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]);
- if (btrfs_buffer_uptodate(next, generation, 0))
+ if (btrfs_buffer_uptodate(next, generation, false))
return 0;
check.level = level - 1;
@@ -5611,7 +5613,7 @@ static int check_next_block_uptodate(struct btrfs_trans_handle *trans,
* If we are UPDATE_BACKREF then we will not, we need to update our backrefs.
*
* If we are DROP_REFERENCE this will figure out if we need to drop our current
- * reference, skipping it if we dropped it from a previous incompleted drop, or
+ * reference, skipping it if we dropped it from a previous uncompleted drop, or
* dropping it if we still have a reference to it.
*/
static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -5636,7 +5638,7 @@ static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_r
ref.parent = path->nodes[level]->start;
} else {
ASSERT(btrfs_root_id(root) == btrfs_header_owner(path->nodes[level]));
- if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level])) {
+ if (unlikely(btrfs_root_id(root) != btrfs_header_owner(path->nodes[level]))) {
btrfs_err(root->fs_info, "mismatched block owner");
return -EIO;
}
@@ -5758,7 +5760,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
/*
* We have to walk down into this node, and if we're currently at the
- * DROP_REFERNCE stage and this block is shared then we need to switch
+ * DROP_REFERENCE stage and this block is shared then we need to switch
* to the UPDATE_BACKREF stage in order to convert to FULL_BACKREF.
*/
if (wc->stage == DROP_REFERENCE && wc->refs[level - 1] > 1) {
@@ -5772,7 +5774,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
level--;
ASSERT(level == btrfs_header_level(next));
- if (level != btrfs_header_level(next)) {
+ if (unlikely(level != btrfs_header_level(next))) {
btrfs_err(root->fs_info, "mismatched level");
ret = -EIO;
goto out_unlock;
@@ -5883,7 +5885,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
}
} else {
ret = btrfs_dec_ref(trans, root, eb, 0);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -5908,13 +5910,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (eb == root->node) {
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
parent = eb->start;
- else if (btrfs_root_id(root) != btrfs_header_owner(eb))
+ else if (unlikely(btrfs_root_id(root) != btrfs_header_owner(eb)))
goto owner_mismatch;
} else {
if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
parent = path->nodes[level + 1]->start;
- else if (btrfs_root_id(root) !=
- btrfs_header_owner(path->nodes[level + 1]))
+ else if (unlikely(btrfs_root_id(root) !=
+ btrfs_header_owner(path->nodes[level + 1])))
goto owner_mismatch;
}
@@ -6049,9 +6051,9 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
* also make sure backrefs for the shared block and all lower level
* blocks are properly updated.
*
- * If called with for_reloc == 0, may exit early with -EAGAIN
+ * If called with for_reloc not set, may exit early with -EAGAIN
*/
-int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
+int btrfs_drop_snapshot(struct btrfs_root *root, bool update_ref, bool for_reloc)
{
const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID);
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -6178,13 +6180,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
while (1) {
ret = walk_down_tree(trans, root, path, wc);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
break;
}
ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -6211,7 +6213,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
ret = btrfs_update_root(trans, tree_root,
&root->root_key,
root_item);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -6247,7 +6249,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
goto out_end_trans;
ret = btrfs_del_root(trans, &root->root_key);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -6255,7 +6257,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
if (!is_reloc_root) {
ret = btrfs_find_root(tree_root, &root->root_key, path,
NULL, NULL);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
} else if (ret > 0) {
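prepare_allocation_zoned() now receives the target space_info so that an active zone belonging to a different space_info can never be chosen as an allocation hint. A rough user-space sketch of the selection loop, with simplified stand-in types instead of the real btrfs structures:

#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

struct space_info;            /* opaque stand-in for btrfs_space_info */

struct block_group {          /* simplified stand-in for btrfs_block_group */
	struct space_info *space_info;
	uint64_t flags;
	uint64_t start;
	uint64_t zone_capacity;
	uint64_t alloc_offset;
};

/* Pick a hint only from an active zone that matches both the allocation
 * profile and the space_info being allocated from, and still has room. */
bool pick_zoned_hint(struct block_group *bgs, size_t nr,
		     struct space_info *target, uint64_t profile,
		     uint64_t num_bytes, uint64_t *hint_byte)
{
	for (size_t i = 0; i < nr; i++) {
		uint64_t avail = bgs[i].zone_capacity - bgs[i].alloc_offset;

		if ((bgs[i].flags & profile) &&
		    bgs[i].space_info == target &&
		    avail >= num_bytes) {
			*hint_byte = bgs[i].start;
			return true;
		}
	}
	return false;
}

As in the kernel loop, reading avail without a lock is tolerable because the value only shrinks and the result is merely a hint.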
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
index 82d3a82dc712..e970ac42a871 100644
--- a/fs/btrfs/extent-tree.h
+++ b/fs/btrfs/extent-tree.h
@@ -140,9 +140,9 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes,
u64 min_alloc_size, u64 empty_size, u64 hint_byte,
struct btrfs_key *ins, int is_data, int delalloc);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref);
+ struct extent_buffer *buf, bool full_backref);
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref);
+ struct extent_buffer *buf, bool full_backref);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, u64 flags);
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
@@ -155,8 +155,7 @@ int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans,
const struct extent_buffer *eb);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref);
-int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref,
- int for_reloc);
+int btrfs_drop_snapshot(struct btrfs_root *root, bool update_ref, bool for_reloc);
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *node,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 835b0deef9bb..c123a3ef154a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -101,6 +101,26 @@ struct btrfs_bio_ctrl {
enum btrfs_compression_type compress_type;
u32 len_to_oe_boundary;
blk_opf_t opf;
+ /*
+ * For data read bios, we attempt to optimize csum lookups if the extent
+ * generation is older than the current one. To make this possible, we
+ * need to track the maximum generation of an extent in a bio_ctrl to
+ * make the decision when submitting the bio.
+ *
+ * The pattern between do_readpage(), submit_one_bio() and
+ * submit_extent_folio() is quite subtle, so tracking this is tricky.
+ *
+ * As we process extent E, we might submit a bio with existing built up
+ * extents before adding E to a new bio, or we might just add E to the
+ * bio. As a result, E's generation could apply to the current bio or
+ * to the next one, so we need to be careful to update the bio_ctrl's
+ * generation with E's only when we are sure E is added to bio_ctrl->bbio
+ * in submit_extent_folio().
+ *
+ * See the comment in btrfs_lookup_bio_sums() for more detail on the
+ * need for this optimization.
+ */
+ u64 generation;
btrfs_bio_end_io_t end_io_func;
struct writeback_control *wbc;
@@ -111,8 +131,46 @@ struct btrfs_bio_ctrl {
*/
unsigned long submit_bitmap;
struct readahead_control *ractl;
+
+ /*
+ * The start offset of the last used extent map by a read operation.
+ *
+ * This is for proper compressed read merge.
+ * U64_MAX means we are starting the read and have made no progress yet.
+ *
+ * The current btrfs_bio_is_contig() only uses disk_bytenr as
+ * the condition to check if the read can be merged with previous
+ * bio, which is not correct. E.g. two file extents pointing to the
+ * same extent but with different offset.
+ *
+ * So here we need to do extra checks to only merge reads that are
+ * covered by the same extent map.
+ * Just extent_map::start will be enough, as they are unique
+ * inside the same inode.
+ */
+ u64 last_em_start;
};
+/*
+ * Helper to set the csum search commit root option for a bio_ctrl's bbio
+ * before submitting the bio.
+ *
+ * Only for use by submit_one_bio().
+ */
+static void bio_set_csum_search_commit_root(struct btrfs_bio_ctrl *bio_ctrl)
+{
+ struct btrfs_bio *bbio = bio_ctrl->bbio;
+
+ ASSERT(bbio);
+
+ if (!(btrfs_op(&bbio->bio) == BTRFS_MAP_READ && is_data_inode(bbio->inode)))
+ return;
+
+ bio_ctrl->bbio->csum_search_commit_root =
+ (bio_ctrl->generation &&
+ bio_ctrl->generation < btrfs_get_fs_generation(bbio->inode->root->fs_info));
+}
+
static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
{
struct btrfs_bio *bbio = bio_ctrl->bbio;
@@ -123,6 +181,8 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
/* Caller should ensure the bio has at least some range added */
ASSERT(bbio->bio.bi_iter.bi_size);
+ bio_set_csum_search_commit_root(bio_ctrl);
+
if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ &&
bio_ctrl->compress_type != BTRFS_COMPRESS_NONE)
btrfs_submit_compressed_read(bbio);
@@ -131,6 +191,12 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
/* The bbio is owned by the end_io handler now */
bio_ctrl->bbio = NULL;
+ /*
+ * We used the generation to decide whether to lookup csums in the
+ * commit_root or not when we called bio_set_csum_search_commit_root()
+ * above. Now, reset the generation for the next bio.
+ */
+ bio_ctrl->generation = 0;
}
/*
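The new generation field drives a single decision at submit time: whether btrfs_lookup_bio_sums() may search the commit root. Ignoring the read/data-inode guards, the predicate computed by bio_set_csum_search_commit_root() reduces to the following sketch (plain C, not kernel code):

#include <stdbool.h>
#include <stdint.h>

/*
 * Sketch of the decision: only bios whose newest extent comes from an
 * already-committed transaction (generation strictly older than the
 * current one) can safely use the commit root for csum lookup.
 */
bool csum_search_commit_root(uint64_t bio_max_extent_gen,
			     uint64_t current_fs_generation)
{
	return bio_max_extent_gen != 0 &&
	       bio_max_extent_gen < current_fs_generation;
}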
@@ -327,6 +393,13 @@ again:
/* step one, find a bunch of delalloc bytes starting at start */
delalloc_start = *start;
delalloc_end = 0;
+
+ /*
+ * If @max_bytes is smaller than a block, btrfs_find_delalloc_range() can
+ * return early without handling any dirty ranges.
+ */
+ ASSERT(max_bytes >= fs_info->sectorsize);
+
found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
max_bytes, &cached_state);
if (!found || delalloc_end <= *start || delalloc_start > orig_end) {
@@ -352,18 +425,19 @@ again:
if (delalloc_end + 1 - delalloc_start > max_bytes)
delalloc_end = delalloc_start + max_bytes - 1;
- /* step two, lock all the folioss after the folios that has start */
+ /* step two, lock all the folios after the folios that has start */
ret = lock_delalloc_folios(inode, locked_folio, delalloc_start,
delalloc_end);
ASSERT(!ret || ret == -EAGAIN);
if (ret == -EAGAIN) {
- /* some of the folios are gone, lets avoid looping by
- * shortening the size of the delalloc range we're searching
+ /*
+ * Some of the folios are gone, lets avoid looping by
+ * shortening the size of the delalloc range we're searching.
*/
btrfs_free_extent_state(cached_state);
cached_state = NULL;
if (!loops) {
- max_bytes = PAGE_SIZE;
+ max_bytes = fs_info->sectorsize;
loops = 1;
goto again;
} else {
@@ -552,6 +626,7 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
* Populate every free slot in a provided array with folios using GFP_NOFS.
*
* @nr_folios: number of folios to allocate
+ * @order: the order of the folios to be allocated
* @folio_array: the array to fill with folios; any existing non-NULL entries in
* the array will be skipped
*
@@ -559,12 +634,13 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
* -ENOMEM otherwise, the partially allocated folios would be freed and
* the array slots zeroed
*/
-int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array)
+int btrfs_alloc_folio_array(unsigned int nr_folios, unsigned int order,
+ struct folio **folio_array)
{
for (int i = 0; i < nr_folios; i++) {
if (folio_array[i])
continue;
- folio_array[i] = folio_alloc(GFP_NOFS, 0);
+ folio_array[i] = folio_alloc(GFP_NOFS, order);
if (!folio_array[i])
goto error;
}
@@ -573,6 +649,7 @@ error:
for (int i = 0; i < nr_folios; i++) {
if (folio_array[i])
folio_put(folio_array[i]);
+ folio_array[i] = NULL;
}
return -ENOMEM;
}
@@ -701,15 +778,18 @@ static void alloc_new_bio(struct btrfs_inode *inode,
* @size: portion of page that we want to write to
* @pg_offset: offset of the new bio or to check whether we are adding
* a contiguous page to the previous one
+ * @read_em_generation: generation of the extent_map we are submitting
+ * (only used for read)
*
* This will either add the page into the existing @bio_ctrl->bbio, or allocate a
* new one in @bio_ctrl->bbio.
- * The mirror number for this IO should already be initizlied in
+ * The mirror number for this IO should already be initialized in
* @bio_ctrl->mirror_num.
*/
static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
u64 disk_bytenr, struct folio *folio,
- size_t size, unsigned long pg_offset)
+ size_t size, unsigned long pg_offset,
+ u64 read_em_generation)
{
struct btrfs_inode *inode = folio_to_inode(folio);
loff_t file_offset = folio_pos(folio) + pg_offset;
@@ -740,6 +820,11 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
submit_one_bio(bio_ctrl);
continue;
}
+ /*
+ * Now that the folio is definitely added to the bio, include its
+ * generation in the max generation calculation.
+ */
+ bio_ctrl->generation = max(bio_ctrl->generation, read_em_generation);
bio_ctrl->next_file_offset += len;
if (bio_ctrl->wbc)
@@ -909,7 +994,7 @@ static void btrfs_readahead_expand(struct readahead_control *ractl,
* return 0 on success, otherwise return error
*/
static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
- struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start)
+ struct btrfs_bio_ctrl *bio_ctrl)
{
struct inode *inode = folio->mapping->host;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
@@ -942,6 +1027,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
bool force_bio_submit = false;
u64 disk_bytenr;
u64 block_start;
+ u64 em_gen;
ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
if (cur >= last_byte) {
@@ -1019,13 +1105,13 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
* non-optimal behavior (submitting 2 bios for the same extent).
*/
if (compress_type != BTRFS_COMPRESS_NONE &&
- prev_em_start && *prev_em_start != (u64)-1 &&
- *prev_em_start != em->start)
+ bio_ctrl->last_em_start != U64_MAX &&
+ bio_ctrl->last_em_start != em->start)
force_bio_submit = true;
- if (prev_em_start)
- *prev_em_start = em->start;
+ bio_ctrl->last_em_start = em->start;
+ em_gen = em->generation;
btrfs_free_extent_map(em);
em = NULL;
@@ -1049,7 +1135,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
if (force_bio_submit)
submit_one_bio(bio_ctrl);
submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize,
- pg_offset);
+ pg_offset, em_gen);
}
return 0;
}
@@ -1238,12 +1324,15 @@ int btrfs_read_folio(struct file *file, struct folio *folio)
const u64 start = folio_pos(folio);
const u64 end = start + folio_size(folio) - 1;
struct extent_state *cached_state = NULL;
- struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ };
+ struct btrfs_bio_ctrl bio_ctrl = {
+ .opf = REQ_OP_READ,
+ .last_em_start = U64_MAX,
+ };
struct extent_map *em_cached = NULL;
int ret;
lock_extents_for_read(inode, start, end, &cached_state);
- ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL);
+ ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl);
btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
btrfs_free_extent_map(em_cached);
@@ -1512,7 +1601,7 @@ out:
/*
* Return 0 if we have submitted or queued the sector for submission.
- * Return <0 for critical errors.
+ * Return <0 for critical errors, and the sector will have its dirty flag cleared.
*
* Caller should make sure filepos < i_size and handle filepos >= i_size case.
*/
@@ -1535,8 +1624,17 @@ static int submit_one_sector(struct btrfs_inode *inode,
ASSERT(filepos < i_size);
em = btrfs_get_extent(inode, NULL, filepos, sectorsize);
- if (IS_ERR(em))
+ if (IS_ERR(em)) {
+ /*
+ * When submission failed, we should still clear the folio dirty.
+ * Or the folio will be written back again but without any
+ * ordered extent.
+ */
+ btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
+ btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize);
+ btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize);
return PTR_ERR(em);
+ }
extent_offset = filepos - em->start;
em_end = btrfs_extent_map_end(em);
@@ -1571,7 +1669,7 @@ static int submit_one_sector(struct btrfs_inode *inode,
ASSERT(folio_test_writeback(folio));
submit_extent_folio(bio_ctrl, disk_bytenr, folio,
- sectorsize, filepos - folio_pos(folio));
+ sectorsize, filepos - folio_pos(folio), 0);
return 0;
}
@@ -1592,7 +1690,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
struct btrfs_fs_info *fs_info = inode->root->fs_info;
unsigned long range_bitmap = 0;
bool submitted_io = false;
- bool error = false;
+ int found_error = 0;
const u64 folio_start = folio_pos(folio);
const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
u64 cur;
@@ -1609,8 +1707,12 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
folio_unlock(folio);
return 1;
}
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_folio_clear_dirty(fs_info, folio, start, len);
+ btrfs_folio_set_writeback(fs_info, folio, start, len);
+ btrfs_folio_clear_writeback(fs_info, folio, start, len);
return ret;
+ }
for (cur = start; cur < start + len; cur += fs_info->sectorsize)
set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap);
@@ -1652,7 +1754,8 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
*/
btrfs_mark_ordered_io_finished(inode, folio, cur,
fs_info->sectorsize, false);
- error = true;
+ if (!found_error)
+ found_error = ret;
continue;
}
submitted_io = true;
@@ -1666,14 +1769,14 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
* Here we set writeback and clear for the range. If the full folio
* is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag.
*
- * If we hit any error, the corresponding sector will still be dirty
- * thus no need to clear PAGECACHE_TAG_DIRTY.
+ * If we hit any error, the corresponding sector will have its dirty
+ * flag cleared and writeback finished, thus no need to handle the error case.
*/
- if (!submitted_io && !error) {
+ if (!submitted_io && !found_error) {
btrfs_folio_set_writeback(fs_info, folio, start, len);
btrfs_folio_clear_writeback(fs_info, folio, start, len);
}
- return ret;
+ return found_error;
}
/*
@@ -1813,6 +1916,7 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e
xas_load(&xas);
xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
+ xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
xas_unlock_irqrestore(&xas, flags);
btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
@@ -2133,7 +2237,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
* @fs_info: The fs_info for this file system.
* @start: The offset of the range to start waiting on writeback.
* @end: The end of the range, inclusive. This is meant to be used in
- * conjuction with wait_marked_extents, so this will usually be
+ * conjunction with wait_marked_extents, so this will usually be
* the_next_eb->start - 1.
*/
void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start,
@@ -2403,7 +2507,7 @@ retry:
* In above case, [32K, 96K) is asynchronously submitted
* for compression, and [124K, 128K) needs to be written back.
*
- * If we didn't wait wrtiteback for page 64K, [128K, 128K)
+ * If we didn't wait writeback for page 64K, [128K, 128K)
* won't be submitted as the page still has writeback flag
* and will be skipped in the next check.
*
@@ -2569,7 +2673,8 @@ void btrfs_readahead(struct readahead_control *rac)
{
struct btrfs_bio_ctrl bio_ctrl = {
.opf = REQ_OP_READ | REQ_RAHEAD,
- .ractl = rac
+ .ractl = rac,
+ .last_em_start = U64_MAX,
};
struct folio *folio;
struct btrfs_inode *inode = BTRFS_I(rac->mapping->host);
@@ -2577,12 +2682,11 @@ void btrfs_readahead(struct readahead_control *rac)
const u64 end = start + readahead_length(rac) - 1;
struct extent_state *cached_state = NULL;
struct extent_map *em_cached = NULL;
- u64 prev_em_start = (u64)-1;
lock_extents_for_read(inode, start, end, &cached_state);
while ((folio = readahead_folio(rac)) != NULL)
- btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start);
+ btrfs_do_readpage(folio, &em_cached, &bio_ctrl);
btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
@@ -2887,7 +2991,7 @@ static void cleanup_extent_buffer_folios(struct extent_buffer *eb)
{
const int num_folios = num_extent_folios(eb);
- /* We canont use num_extent_folios() as loop bound as eb->folios changes. */
+ /* We cannot use num_extent_folios() as loop bound as eb->folios changes. */
for (int i = 0; i < num_folios; i++) {
ASSERT(eb->folios[i]);
detach_extent_buffer_folio(eb, eb->folios[i]);
@@ -3134,29 +3238,30 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info,
*/
static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
{
- if (!IS_ALIGNED(start, fs_info->sectorsize)) {
+ const u32 nodesize = fs_info->nodesize;
+
+ if (unlikely(!IS_ALIGNED(start, fs_info->sectorsize))) {
btrfs_err(fs_info, "bad tree block start %llu", start);
return true;
}
- if (fs_info->nodesize < PAGE_SIZE && !IS_ALIGNED(start, fs_info->nodesize)) {
+ if (unlikely(nodesize < PAGE_SIZE && !IS_ALIGNED(start, nodesize))) {
btrfs_err(fs_info,
"tree block is not nodesize aligned, start %llu nodesize %u",
- start, fs_info->nodesize);
+ start, nodesize);
return true;
}
- if (fs_info->nodesize >= PAGE_SIZE &&
- !PAGE_ALIGNED(start)) {
+ if (unlikely(nodesize >= PAGE_SIZE && !PAGE_ALIGNED(start))) {
btrfs_err(fs_info,
"tree block is not page aligned, start %llu nodesize %u",
- start, fs_info->nodesize);
+ start, nodesize);
return true;
}
- if (!IS_ALIGNED(start, fs_info->nodesize) &&
- !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags)) {
+ if (unlikely(!IS_ALIGNED(start, nodesize) &&
+ !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags))) {
btrfs_warn(fs_info,
"tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance",
- start, fs_info->nodesize);
+ start, nodesize);
}
return false;
}
@@ -3775,7 +3880,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num,
return ret;
wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE);
- if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
+ if (unlikely(!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)))
return -EIO;
return 0;
}
@@ -4331,15 +4436,18 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
unsigned long end = index + (PAGE_SIZE >> fs_info->nodesize_bits) - 1;
int ret;
- xa_lock_irq(&fs_info->buffer_tree);
+ rcu_read_lock();
xa_for_each_range(&fs_info->buffer_tree, index, eb, start, end) {
/*
* The same as try_release_extent_buffer(), to ensure the eb
* won't disappear out from under us.
*/
spin_lock(&eb->refs_lock);
+ rcu_read_unlock();
+
if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
spin_unlock(&eb->refs_lock);
+ rcu_read_lock();
continue;
}
@@ -4358,11 +4466,10 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
* check the folio private at the end. And
* release_extent_buffer() will release the refs_lock.
*/
- xa_unlock_irq(&fs_info->buffer_tree);
release_extent_buffer(eb);
- xa_lock_irq(&fs_info->buffer_tree);
+ rcu_read_lock();
}
- xa_unlock_irq(&fs_info->buffer_tree);
+ rcu_read_unlock();
/*
* Finally to check if we have cleared folio private, as if we have
@@ -4375,7 +4482,6 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
ret = 0;
spin_unlock(&folio->mapping->i_private_lock);
return ret;
-
}
int try_release_extent_buffer(struct folio *folio)
@@ -4450,7 +4556,7 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
if (IS_ERR(eb))
return;
- if (btrfs_buffer_uptodate(eb, gen, 1)) {
+ if (btrfs_buffer_uptodate(eb, gen, true)) {
free_extent_buffer(eb);
return;
}
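Folding prev_em_start into the bio_ctrl as last_em_start (initialized to U64_MAX) lets btrfs_read_folio() and btrfs_readahead() share one merge rule for compressed extents. Sketched in isolation, with the sentinel spelled out as a local constant:

#include <stdbool.h>
#include <stdint.h>

#define NO_EM_SEEN UINT64_MAX   /* the U64_MAX "no progress yet" sentinel */

/*
 * Sketch: a read bio must be flushed before adding a compressed block that
 * is backed by a different extent map than the previous one, even if the
 * disk_bytenr happens to be contiguous (two file extents can point into
 * the same compressed extent at different offsets).
 */
bool must_submit_first(bool compressed, uint64_t last_em_start,
		       uint64_t this_em_start)
{
	return compressed &&
	       last_em_start != NO_EM_SEEN &&
	       last_em_start != this_em_start;
}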
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 61130786b9a3..5fcbfe44218c 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -366,7 +366,8 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
bool nofail);
-int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array);
+int btrfs_alloc_folio_array(unsigned int nr_folios, unsigned int order,
+ struct folio **folio_array);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode,
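btrfs_alloc_folio_array() now takes a folio order (groundwork for block sizes larger than the page size) and its error path clears every slot it frees, so callers never see stale pointers. The fill-empty-slots / roll-back-on-failure pattern, as a plain libc sketch:

#include <stdlib.h>

/*
 * Sketch of the pattern used by btrfs_alloc_folio_array(): existing entries
 * are kept, and on error every freed slot is reset to NULL (the error-path
 * fix in this change).
 */
int alloc_buffer_array(unsigned int nr, size_t size, void **array)
{
	for (unsigned int i = 0; i < nr; i++) {
		if (array[i])
			continue;
		array[i] = malloc(size);
		if (!array[i])
			goto error;
	}
	return 0;
error:
	for (unsigned int i = 0; i < nr; i++) {
		free(array[i]);
		array[i] = NULL;
	}
	return -1;
}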
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 57f52585a6dd..7e38c23a0c1c 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -460,7 +460,7 @@ void btrfs_clear_em_logging(struct btrfs_inode *inode, struct extent_map *em)
static inline void setup_extent_mapping(struct btrfs_inode *inode,
struct extent_map *em,
- int modified)
+ bool modified)
{
refcount_inc(&em->refs);
@@ -486,7 +486,7 @@ static inline void setup_extent_mapping(struct btrfs_inode *inode,
* taken, or a reference dropped if the merge attempt was successful.
*/
static int add_extent_mapping(struct btrfs_inode *inode,
- struct extent_map *em, int modified)
+ struct extent_map *em, bool modified)
{
struct extent_map_tree *tree = &inode->extent_tree;
struct btrfs_root *root = inode->root;
@@ -509,7 +509,7 @@ static int add_extent_mapping(struct btrfs_inode *inode,
}
static struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
- u64 start, u64 len, int strict)
+ u64 start, u64 len, bool strict)
{
struct extent_map *em;
struct rb_node *rb_node;
@@ -548,7 +548,7 @@ static struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
struct extent_map *btrfs_lookup_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len)
{
- return lookup_extent_mapping(tree, start, len, 1);
+ return lookup_extent_mapping(tree, start, len, true);
}
/*
@@ -566,7 +566,7 @@ struct extent_map *btrfs_lookup_extent_mapping(struct extent_map_tree *tree,
struct extent_map *btrfs_search_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len)
{
- return lookup_extent_mapping(tree, start, len, 0);
+ return lookup_extent_mapping(tree, start, len, false);
}
/*
@@ -594,7 +594,7 @@ void btrfs_remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *e
static void replace_extent_mapping(struct btrfs_inode *inode,
struct extent_map *cur,
struct extent_map *new,
- int modified)
+ bool modified)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_map_tree *tree = &inode->extent_tree;
@@ -670,7 +670,7 @@ static noinline int merge_extent_mapping(struct btrfs_inode *inode,
em->len = end - start;
if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
em->offset += start_diff;
- return add_extent_mapping(inode, em, 0);
+ return add_extent_mapping(inode, em, false);
}
/*
@@ -707,7 +707,7 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode,
if (em->disk_bytenr == EXTENT_MAP_INLINE)
ASSERT(em->start == 0);
- ret = add_extent_mapping(inode, em, 0);
+ ret = add_extent_mapping(inode, em, false);
/* it is possible that someone inserted the extent into the tree
* while we had the lock dropped. It is also possible that
* an overlapping map exists in the tree
@@ -1057,7 +1057,7 @@ int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pr
btrfs_lock_extent(&inode->io_tree, start, start + len - 1, NULL);
write_lock(&em_tree->lock);
em = btrfs_lookup_extent_mapping(em_tree, start, len);
- if (!em) {
+ if (unlikely(!em)) {
ret = -EIO;
goto out_unlock;
}
@@ -1082,7 +1082,7 @@ int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pr
split_pre->flags = flags;
split_pre->generation = em->generation;
- replace_extent_mapping(inode, em, split_pre, 1);
+ replace_extent_mapping(inode, em, split_pre, true);
/*
* Now we only have an extent_map at:
@@ -1098,7 +1098,7 @@ int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pr
split_mid->ram_bytes = split_mid->len;
split_mid->flags = flags;
split_mid->generation = em->generation;
- add_extent_mapping(inode, split_mid, 1);
+ add_extent_mapping(inode, split_mid, true);
/* Once for us */
btrfs_free_extent_map(em);
@@ -1372,7 +1372,7 @@ void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
if (atomic64_cmpxchg(&fs_info->em_shrinker_nr_to_scan, 0, nr_to_scan) != 0)
return;
- queue_work(system_unbound_wq, &fs_info->em_shrinker_work);
+ queue_work(system_dfl_wq, &fs_info->em_shrinker_work);
}
void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c
index 7935586a9dbd..f2eaaef8422b 100644
--- a/fs/btrfs/fiemap.c
+++ b/fs/btrfs/fiemap.c
@@ -153,7 +153,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
if (cache_end > offset) {
if (offset == cache->offset) {
/*
- * We cached a dealloc range (found in the io tree) for
+ * We cached a delalloc range (found in the io tree) for
* a hole or prealloc extent and we have now found a
* file extent item for the same offset. What we have
* now is more recent and up to date, so discard what
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index c09fbc257634..a42e6d54e7cd 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -397,6 +397,36 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
path->skip_locking = 1;
}
+ /*
+ * If we are searching for a csum of an extent from a past
+ * transaction, we can search in the commit root and reduce
+ * lock contention on the csum tree extent buffers.
+ *
+ * This is important because that lock is an rwsem which gets
+ * pretty heavy write load under memory pressure and sustained
+ * csum overwrites, unlike the commit_root_sem. (Memory pressure
+ * makes us writeback the nodes multiple times per transaction,
+ * which makes us cow them each time, taking the write lock.)
+ *
+ * Due to how rwsem is implemented, there is a possible
+ * priority inversion where the readers holding the lock don't
+ * get scheduled (say they're in a cgroup stuck in heavy reclaim)
+ * which then blocks writers, including transaction commit. By
+ * using a semaphore with fewer writers (only a commit switching
+ * the roots), we make this issue less likely.
+ *
+ * Note that we don't rely on btrfs_search_slot to lock the
+ * commit root csum. We call search_slot multiple times, which would
+ * create a potential race where a commit comes in between searches
+ * while we are not holding the commit_root_sem, and we get csums
+ * from across transactions.
+ */
+ if (bbio->csum_search_commit_root) {
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ down_read(&fs_info->commit_root_sem);
+ }
+
while (bio_offset < orig_len) {
int count;
u64 cur_disk_bytenr = orig_disk_bytenr + bio_offset;
@@ -442,6 +472,8 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
bio_offset += count * sectorsize;
}
+ if (bbio->csum_search_commit_root)
+ up_read(&fs_info->commit_root_sem);
return ret;
}
@@ -743,12 +775,10 @@ int btrfs_csum_one_bio(struct btrfs_bio *bbio)
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
struct bio *bio = &bbio->bio;
struct btrfs_ordered_sum *sums;
- char *data;
- struct bvec_iter iter;
- struct bio_vec bvec;
+ struct bvec_iter iter = bio->bi_iter;
+ phys_addr_t paddr;
+ const u32 blocksize = fs_info->sectorsize;
int index;
- unsigned int blockcount;
- int i;
unsigned nofs_flag;
nofs_flag = memalloc_nofs_save();
@@ -767,21 +797,9 @@ int btrfs_csum_one_bio(struct btrfs_bio *bbio)
shash->tfm = fs_info->csum_shash;
- bio_for_each_segment(bvec, bio, iter) {
- blockcount = BTRFS_BYTES_TO_BLKS(fs_info,
- bvec.bv_len + fs_info->sectorsize
- - 1);
-
- for (i = 0; i < blockcount; i++) {
- data = bvec_kmap_local(&bvec);
- crypto_shash_digest(shash,
- data + (i * fs_info->sectorsize),
- fs_info->sectorsize,
- sums->sums + index);
- kunmap_local(data);
- index += fs_info->csum_size;
- }
-
+ btrfs_bio_for_each_block(paddr, bio, &iter, blocksize) {
+ btrfs_calculate_block_csum(fs_info, paddr, sums->sums + index);
+ index += fs_info->csum_size;
}
bbio->sums = sums;
@@ -993,7 +1011,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
* item changed size or key
*/
ret = btrfs_split_item(trans, root, path, &key, offset);
- if (ret && ret != -EAGAIN) {
+ if (unlikely(ret && ret != -EAGAIN)) {
btrfs_abort_transaction(trans, ret);
break;
}
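The rewritten btrfs_csum_one_bio() stops walking bio_vecs and splitting them into blocks by hand; it iterates the bio one block at a time and checksums each block directly. The equivalent flat-buffer loop, with a toy checksum standing in for crypto_shash_digest():

#include <stdint.h>
#include <stddef.h>

/* Dummy stand-in for the real crypto_shash_digest() call. */
static uint32_t toy_csum(const uint8_t *data, size_t len)
{
	uint32_t sum = 0;

	for (size_t i = 0; i < len; i++)
		sum = sum * 31 + data[i];
	return sum;
}

/*
 * Sketch of the per-block loop: one checksum per blocksize chunk, appended
 * to the sums array in order, mirroring btrfs_bio_for_each_block().
 */
size_t csum_blocks(const uint8_t *buf, size_t len, uint32_t blocksize,
		   uint32_t *sums)
{
	size_t index = 0;

	for (size_t off = 0; off + blocksize <= len; off += blocksize)
		sums[index++] = toy_csum(buf + off, blocksize);
	return index;
}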
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 204674934795..7efd1f8a1912 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -327,7 +327,7 @@ next_slot:
args->start - extent_offset,
0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -426,7 +426,7 @@ delete_extent_item:
key.offset - extent_offset,
0, false);
ret = btrfs_free_extent(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -443,7 +443,7 @@ delete_extent_item:
ret = btrfs_del_items(trans, root, path, del_slot,
del_nr);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -587,21 +587,20 @@ again:
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.objectid != ino ||
- key.type != BTRFS_EXTENT_DATA_KEY) {
+ if (unlikely(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
}
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
+ if (unlikely(btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC)) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
}
extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
- if (key.offset > start || extent_end < end) {
+ if (unlikely(key.offset > start || extent_end < end)) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -676,7 +675,7 @@ again:
btrfs_release_path(path);
goto again;
}
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -704,7 +703,7 @@ again:
ref.ref_root = btrfs_root_id(root);
btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -712,7 +711,7 @@ again:
if (split == start) {
key.offset = start;
} else {
- if (start != key.offset) {
+ if (unlikely(start != key.offset)) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -744,7 +743,7 @@ again:
del_slot = path->slots[0] + 1;
del_nr++;
ret = btrfs_free_extent(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -762,7 +761,7 @@ again:
del_slot = path->slots[0];
del_nr++;
ret = btrfs_free_extent(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -783,7 +782,7 @@ again:
extent_end - key.offset);
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -815,7 +814,7 @@ static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64
if (ret)
return ret;
folio_lock(folio);
- if (!folio_test_uptodate(folio)) {
+ if (unlikely(!folio_test_uptodate(folio))) {
folio_unlock(folio);
return -EIO;
}
@@ -970,7 +969,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
* Return:
* > 0 If we can nocow, and updates @write_bytes.
* 0 If we can't do a nocow write.
- * -EAGAIN If we can't do a nocow write because snapshoting of the inode's
+ * -EAGAIN If we can't do a nocow write because snapshotting of the inode's
* root is in progress or because we are in a non-blocking IO
* context and need to block (@nowait is true).
* < 0 If an error happened.
@@ -2460,9 +2459,9 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
* got EOPNOTSUPP via prealloc then we messed up and
* need to abort.
*/
- if (ret &&
- (ret != -EOPNOTSUPP ||
- (extent_info && extent_info->is_new_extent)))
+ if (unlikely(ret &&
+ (ret != -EOPNOTSUPP ||
+ (extent_info && extent_info->is_new_extent))))
btrfs_abort_transaction(trans, ret);
break;
}
@@ -2473,7 +2472,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
cur_offset < ino_size) {
ret = fill_holes(trans, inode, path, cur_offset,
drop_args.drop_end);
- if (ret) {
+ if (unlikely(ret)) {
/*
* If we failed then we didn't insert our hole
* entries for the area we dropped, so now the
@@ -2493,7 +2492,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
ret = btrfs_inode_clear_file_extent_range(inode,
cur_offset,
drop_args.drop_end - cur_offset);
- if (ret) {
+ if (unlikely(ret)) {
/*
* We couldn't clear our area, so we could
* presumably adjust up and corrupt the fs, so
@@ -2512,7 +2511,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
ret = btrfs_insert_replace_extent(trans, inode, path,
extent_info, replace_len,
drop_args.bytes_found);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -2607,7 +2606,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
cur_offset < drop_args.drop_end) {
ret = fill_holes(trans, inode, path, cur_offset,
drop_args.drop_end);
- if (ret) {
+ if (unlikely(ret)) {
/* Same comment as above. */
btrfs_abort_transaction(trans, ret);
goto out_trans;
@@ -2616,7 +2615,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
/* See the comment in the loop above for the reasoning here. */
ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
drop_args.drop_end - cur_offset);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_trans;
}
@@ -2626,7 +2625,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
ret = btrfs_insert_replace_extent(trans, inode, path,
extent_info, extent_info->data_len,
drop_args.bytes_found);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_trans;
}
@@ -3345,7 +3344,7 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end
* We could also use the extent map tree to find such delalloc that is
* being flushed, but using the ordered extents tree is more efficient
* because it's usually much smaller as ordered extents are removed from
- * the tree once they complete. With the extent maps, we mau have them
+ * the tree once they complete. With the extent maps, we may have them
* in the extent map tree for a very long time, and they were either
* created by previous writes or loaded by read operations.
*/
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 5d8d1570a5c9..ab873bd67192 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2282,7 +2282,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
* If this block group has some small extents we don't want to
* use up all of our free slots in the cache with them, we want
* to reserve them to larger extents, however if we have plenty
- * of cache left then go ahead an dadd them, no sense in adding
+ * of cache left then go ahead and add them, no sense in adding
* the overhead of a bitmap if we don't have to.
*/
if (info->bytes <= fs_info->sectorsize * 8) {
@@ -3829,7 +3829,7 @@ out_unlock:
/*
* If we break out of trimming a bitmap prematurely, we should reset the
- * trimming bit. In a rather contrieved case, it's possible to race here so
+ * trimming bit. In a rather contrived case, it's possible to race here so
* reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
*
* start = start of bitmap
@@ -4142,7 +4142,7 @@ int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool act
if (!active) {
set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
ret = cleanup_free_space_cache_v1(fs_info, trans);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
goto out;
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index eba7f22ae49c..dad0b492a663 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -137,12 +137,12 @@ static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans,
if (ret < 0)
return ret;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
DEBUG_WARN();
return -EIO;
}
- if (p->slots[0] == 0) {
+ if (unlikely(p->slots[0] == 0)) {
DEBUG_WARN("no previous slot found");
return -EIO;
}
@@ -218,7 +218,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
bitmap_size = free_space_bitmap_size(fs_info, block_group->length);
bitmap = alloc_bitmap(bitmap_size);
- if (!bitmap) {
+ if (unlikely(!bitmap)) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -233,7 +233,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
while (!done) {
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -271,7 +271,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
}
ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -293,7 +293,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
expected_extent_count = btrfs_free_space_extent_count(leaf, info);
btrfs_release_path(path);
- if (extent_count != expected_extent_count) {
+ if (unlikely(extent_count != expected_extent_count)) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
block_group->start, extent_count,
@@ -320,7 +320,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, root, path, &key,
data_size);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -361,7 +361,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans,
bitmap_size = free_space_bitmap_size(fs_info, block_group->length);
bitmap = alloc_bitmap(bitmap_size);
- if (!bitmap) {
+ if (unlikely(!bitmap)) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -376,7 +376,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans,
while (!done) {
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -420,7 +420,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans,
}
ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -454,7 +454,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans,
key.offset = (end_bit - start_bit) * fs_info->sectorsize;
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -465,7 +465,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans,
start_bit = find_next_bit_le(bitmap, nrbits, end_bit);
}
- if (extent_count != expected_extent_count) {
+ if (unlikely(extent_count != expected_extent_count)) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
block_group->start, extent_count,
@@ -848,14 +848,14 @@ int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans,
return 0;
path = btrfs_alloc_path();
- if (!path) {
+ if (unlikely(!path)) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out;
}
block_group = btrfs_lookup_block_group(trans->fs_info, start);
- if (!block_group) {
+ if (unlikely(!block_group)) {
DEBUG_WARN("no block group found for start=%llu", start);
ret = -ENOENT;
btrfs_abort_transaction(trans, ret);
@@ -1030,14 +1030,14 @@ int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans,
return 0;
path = btrfs_alloc_path();
- if (!path) {
+ if (unlikely(!path)) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out;
}
block_group = btrfs_lookup_block_group(trans->fs_info, start);
- if (!block_group) {
+ if (unlikely(!block_group)) {
DEBUG_WARN("no block group found for start=%llu", start);
ret = -ENOENT;
btrfs_abort_transaction(trans, ret);
@@ -1185,7 +1185,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
goto out_clear;
}
ret = btrfs_global_root_insert(free_space_root);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_put_root(free_space_root);
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
@@ -1197,7 +1197,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
block_group = rb_entry(node, struct btrfs_block_group,
cache_node);
ret = populate_free_space_tree(trans, block_group);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
goto out_clear;
@@ -1290,14 +1290,14 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info)
btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);
ret = clear_free_space_tree(trans, free_space_root);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
}
ret = btrfs_del_root(trans, &free_space_root->root_key);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
@@ -1315,7 +1315,7 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info)
ret = btrfs_free_tree_block(trans, btrfs_root_id(free_space_root),
free_space_root->node, 0, 1);
btrfs_put_root(free_space_root);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
@@ -1344,7 +1344,7 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)
set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
ret = clear_free_space_tree(trans, free_space_root);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
@@ -1362,7 +1362,7 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)
goto next;
ret = populate_free_space_tree(trans, block_group);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
@@ -1422,7 +1422,7 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
if (!path) {
path = btrfs_alloc_path();
- if (!path) {
+ if (unlikely(!path)) {
btrfs_abort_transaction(trans, -ENOMEM);
return -ENOMEM;
}
@@ -1430,7 +1430,7 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
}
ret = add_new_free_space_info(trans, block_group, path);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1481,7 +1481,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans,
}
path = btrfs_alloc_path();
- if (!path) {
+ if (unlikely(!path)) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -1496,7 +1496,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans,
while (!done) {
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1527,7 +1527,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans,
}
ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1611,7 +1611,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
extent_count++;
}
- if (extent_count != expected_extent_count) {
+ if (unlikely(extent_count != expected_extent_count)) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
block_group->start, extent_count,
@@ -1672,7 +1672,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
extent_count++;
}
- if (extent_count != expected_extent_count) {
+ if (unlikely(extent_count != expected_extent_count)) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
block_group->start, extent_count,
diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c
index b2bb86f8d7cf..feb0a2faa837 100644
--- a/fs/btrfs/fs.c
+++ b/fs/btrfs/fs.c
@@ -55,6 +55,54 @@ size_t __attribute_const__ btrfs_get_num_csums(void)
}
/*
+ * We support the following block sizes for all systems:
+ *
+ * - 4K
+ * This is the most common block size. For PAGE_SIZE > 4K cases the subpage
+ * mode is used.
+ *
+ * - PAGE_SIZE
+ * The straightforward block size to support.
+ *
+ * And extra support for the following block sizes based on the kernel config:
+ *
+ * - MIN_BLOCKSIZE
+ * This is either 4K (regular builds) or 2K (debug builds)
+ * This allows testing subpage routines on x86_64.
+ */
+bool __attribute_const__ btrfs_supported_blocksize(u32 blocksize)
+{
+ /* @blocksize should be validated first. */
+ ASSERT(is_power_of_2(blocksize) && blocksize >= BTRFS_MIN_BLOCKSIZE &&
+ blocksize <= BTRFS_MAX_BLOCKSIZE);
+
+ if (blocksize == PAGE_SIZE || blocksize == SZ_4K || blocksize == BTRFS_MIN_BLOCKSIZE)
+ return true;
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ /*
+ * Support for bs > ps is done by specifying a minimal folio order for
+ * the filemap, thus implying large data folios.
+ * For HIGHMEM systems, we cannot always access the content of a (large)
+ * folio in one go, but must go through it page by page.
+ *
+ * A lot of features don't implement a proper page-sized loop for large
+ * folios; this includes:
+ *
+ * - compression
+ * - verity
+ * - encoded write
+ *
+ * Considering HIGHMEM is such a pain to deal with and it's going
+ * to be deprecated eventually, just reject HIGHMEM && bs > ps cases.
+ */
+ if (IS_ENABLED(CONFIG_HIGHMEM) && blocksize > PAGE_SIZE)
+ return false;
+ return true;
+#endif
+ return false;
+}
+
+/*
* Start exclusive operation @type, return true on success.
*/
bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
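As a usage illustration for the new helper above — a minimal sketch, not part of the patch; the function name and call site are assumptions:

	/* Hypothetical validation helper built on btrfs_supported_blocksize(). */
	static int example_validate_sectorsize(u32 sectorsize)
	{
		/* Range and power-of-two checks first, as the ASSERT above requires. */
		if (!is_power_of_2(sectorsize) ||
		    sectorsize < BTRFS_MIN_BLOCKSIZE ||
		    sectorsize > BTRFS_MAX_BLOCKSIZE)
			return -EINVAL;

		/* Now ask whether this kernel/config combination supports it. */
		if (!btrfs_supported_blocksize(sectorsize))
			return -EINVAL;

		return 0;
	}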
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 8cc07cc70b12..814bbc9417d2 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -59,6 +59,8 @@ struct btrfs_space_info;
#define BTRFS_MIN_BLOCKSIZE (SZ_4K)
#endif
+#define BTRFS_MAX_BLOCKSIZE (SZ_64K)
+
#define BTRFS_MAX_EXTENT_SIZE SZ_128M
#define BTRFS_OLDEST_GENERATION 0ULL
@@ -102,6 +104,8 @@ enum {
BTRFS_FS_STATE_RO,
/* Track if a transaction abort has been reported on this filesystem */
BTRFS_FS_STATE_TRANS_ABORTED,
+ /* Track if log replay has failed. */
+ BTRFS_FS_STATE_LOG_REPLAY_ABORTED,
/*
* Bio operations should be blocked on this filesystem because a source
* or target device is being destroyed as part of a device replace
@@ -243,6 +247,7 @@ enum {
BTRFS_MOUNT_NOSPACECACHE = (1ULL << 30),
BTRFS_MOUNT_IGNOREMETACSUMS = (1ULL << 31),
BTRFS_MOUNT_IGNORESUPERFLAGS = (1ULL << 32),
+ BTRFS_MOUNT_REF_TRACKER = (1ULL << 33),
};
/*
@@ -280,7 +285,7 @@ enum {
#ifdef CONFIG_BTRFS_EXPERIMENTAL
/*
- * Features under developmen like Extent tree v2 support is enabled
+ * Features under development, like Extent tree v2 support, are enabled
* only under CONFIG_BTRFS_EXPERIMENTAL
*/
#define BTRFS_FEATURE_INCOMPAT_SUPP \
@@ -303,6 +308,16 @@ enum {
#define BTRFS_WARNING_COMMIT_INTERVAL (300)
#define BTRFS_DEFAULT_MAX_INLINE (2048)
+enum btrfs_compression_type {
+ BTRFS_COMPRESS_NONE = 0,
+ BTRFS_COMPRESS_ZLIB = 1,
+ BTRFS_COMPRESS_LZO = 2,
+ BTRFS_COMPRESS_ZSTD = 3,
+ BTRFS_NR_COMPRESS_TYPES = 4,
+
+ BTRFS_DEFRAG_DONT_COMPRESS,
+};
+
struct btrfs_dev_replace {
/* See #define above */
u64 replace_state;
@@ -505,6 +520,9 @@ struct btrfs_fs_info {
u64 last_trans_log_full_commit;
unsigned long long mount_opt;
+ /* Compression related structures. */
+ void *compr_wsm[BTRFS_NR_COMPRESS_TYPES];
+
int compress_type;
int compress_level;
u32 commit_interval;
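The btrfs_compression_type enum added earlier in this header is sized by BTRFS_NR_COMPRESS_TYPES, which also sizes the new compr_wsm[] array above; a hedged illustration of the implied per-type indexing (the accessor name is made up, not from the patch):

	/* Hypothetical accessor: per-compression-type workspace pointer. */
	static inline void *example_compr_wsm(struct btrfs_fs_info *fs_info, int type)
	{
		ASSERT(type >= 0 && type < BTRFS_NR_COMPRESS_TYPES);
		return fs_info->compr_wsm[type];
	}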
@@ -809,6 +827,8 @@ struct btrfs_fs_info {
u32 sectorsize;
/* ilog2 of sectorsize, use to avoid 64bit division */
u32 sectorsize_bits;
+ u32 block_min_order;
+ u32 block_max_order;
u32 csum_size;
u32 csums_per_leaf;
u32 stripesize;
@@ -878,12 +898,10 @@ struct btrfs_fs_info {
struct lockdep_map btrfs_trans_pending_ordered_map;
struct lockdep_map btrfs_ordered_extent_map;
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+#ifdef CONFIG_BTRFS_DEBUG
spinlock_t ref_verify_lock;
struct rb_root block_tree;
-#endif
-#ifdef CONFIG_BTRFS_DEBUG
struct kobject *debug_kobj;
struct list_head allocated_roots;
@@ -905,6 +923,12 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
return mapping_gfp_constraint(mapping, ~__GFP_FS);
}
+/* Return the minimal folio size of the fs. */
+static inline unsigned int btrfs_min_folio_size(struct btrfs_fs_info *fs_info)
+{
+ return 1U << (PAGE_SHIFT + fs_info->block_min_order);
+}
+
static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info)
{
return READ_ONCE(fs_info->generation);
@@ -997,6 +1021,7 @@ static inline unsigned int btrfs_blocks_per_folio(const struct btrfs_fs_info *fs
return folio_size(folio) >> fs_info->sectorsize_bits;
}
+bool __attribute_const__ btrfs_supported_blocksize(u32 blocksize);
bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation type);
bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
@@ -1107,9 +1132,9 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
#define EXPORT_FOR_TESTS
-static inline int btrfs_is_testing(const struct btrfs_fs_info *fs_info)
+static inline bool btrfs_is_testing(const struct btrfs_fs_info *fs_info)
{
- return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+ return unlikely(test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state));
}
void btrfs_test_destroy_inode(struct inode *inode);
@@ -1118,9 +1143,9 @@ void btrfs_test_destroy_inode(struct inode *inode);
#define EXPORT_FOR_TESTS static
-static inline int btrfs_is_testing(const struct btrfs_fs_info *fs_info)
+static inline bool btrfs_is_testing(const struct btrfs_fs_info *fs_info)
{
- return 0;
+ return false;
}
#endif
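A quick worked example of the new block_min_order field and btrfs_min_folio_size() helper above — the concrete numbers and the order derivation are illustrative assumptions for a 4K-page kernel, not values taken from this patch:

	/* Assumed: 16K block size on a 4K-page kernel (bs > ps case). */
	u32 blocksize = SZ_16K;
	u32 block_min_order = ilog2(blocksize) - PAGE_SHIFT;		/* 14 - 12 = 2 */
	unsigned int min_folio = 1U << (PAGE_SHIFT + block_min_order);	/* 16K */

With a minimal folio of one full block, the filemap only hands out folios that cover whole blocks, which is what the bs > ps comments in fs.c above rely on.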
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index f06cf701ae5a..1bd73b80f9fa 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -137,7 +137,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
*/
extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
ref_objectid, name);
- if (!extref) {
+ if (unlikely(!extref)) {
btrfs_abort_transaction(trans, -ENOENT);
return -ENOENT;
}
@@ -627,7 +627,7 @@ delete:
if (control->clear_extent_range) {
ret = btrfs_inode_clear_file_extent_range(control->inode,
clear_start, clear_len);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -666,7 +666,7 @@ delete:
btrfs_init_data_ref(&ref, control->ino, extent_offset,
btrfs_root_id(root), false);
ret = btrfs_free_extent(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -684,7 +684,7 @@ delete:
ret = btrfs_del_items(trans, root, path,
pending_del_slot,
pending_del_nr);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -720,7 +720,7 @@ out:
int ret2;
ret2 = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr);
- if (ret2) {
+ if (unlikely(ret2)) {
btrfs_abort_transaction(trans, ret2);
ret = ret2;
}
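The error-path conversions in inode-item.c (and across the series) all share one shape; a generic sketch with a placeholder callee, not a real btrfs API:

	ret = some_tree_operation(trans, path);		/* placeholder, not a real function */
	if (unlikely(ret)) {
		/* Abort paths are cold; unlikely() keeps them out of the hot layout. */
		btrfs_abort_transaction(trans, ret);
		goto out;
	}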
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b77dd22b8cdb..ced87c9e4682 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -72,6 +72,9 @@
#include "raid-stripe-tree.h"
#include "fiemap.h"
+#define COW_FILE_RANGE_KEEP_LOCKED (1UL << 0)
+#define COW_FILE_RANGE_NO_INLINE (1UL << 1)
+
struct btrfs_iget_args {
u64 ino;
struct btrfs_root *root;
@@ -367,7 +370,7 @@ int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
}
/*
- * Unock inode i_rwsem.
+ * Unlock inode i_rwsem.
*
* ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
* to decide whether the lock acquired is shared or exclusive.
@@ -401,10 +404,12 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
while (index <= end_index) {
folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
- index++;
- if (IS_ERR(folio))
+ if (IS_ERR(folio)) {
+ index++;
continue;
+ }
+ index = folio_end(folio) >> PAGE_SHIFT;
/*
* Here we just clear all Ordered bits for every page in the
* range, then btrfs_mark_ordered_io_finished() will handle
@@ -629,7 +634,7 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode,
drop_args.replace_extent = true;
drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -637,7 +642,7 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode,
ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
size, compressed_size, compress_type,
compressed_folio, update_i_size);
- if (ret && ret != -ENOSPC) {
+ if (unlikely(ret && ret != -ENOSPC)) {
btrfs_abort_transaction(trans, ret);
goto out;
} else if (ret == -ENOSPC) {
@@ -647,7 +652,7 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode,
btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
ret = btrfs_update_inode(trans, inode);
- if (ret && ret != -ENOSPC) {
+ if (unlikely(ret && ret != -ENOSPC)) {
btrfs_abort_transaction(trans, ret);
goto out;
} else if (ret == -ENOSPC) {
@@ -849,6 +854,8 @@ static void compress_file_range(struct btrfs_work *work)
struct btrfs_inode *inode = async_chunk->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct address_space *mapping = inode->vfs_inode.i_mapping;
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
+ const u32 min_folio_size = btrfs_min_folio_size(fs_info);
u64 blocksize = fs_info->sectorsize;
u64 start = async_chunk->start;
u64 end = async_chunk->end;
@@ -859,7 +866,7 @@ static void compress_file_range(struct btrfs_work *work)
unsigned long nr_folios;
unsigned long total_compressed = 0;
unsigned long total_in = 0;
- unsigned int poff;
+ unsigned int loff;
int i;
int compress_type = fs_info->compress_type;
int compress_level = fs_info->compress_level;
@@ -897,8 +904,8 @@ static void compress_file_range(struct btrfs_work *work)
actual_end = min_t(u64, i_size, end + 1);
again:
folios = NULL;
- nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
- nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED_PAGES);
+ nr_folios = (end >> min_folio_shift) - (start >> min_folio_shift) + 1;
+ nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED >> min_folio_shift);
/*
* we don't want to send crud past the end of i_size through
@@ -954,18 +961,18 @@ again:
/* Compression level is applied here. */
ret = btrfs_compress_folios(compress_type, compress_level,
- mapping, start, folios, &nr_folios, &total_in,
+ inode, start, folios, &nr_folios, &total_in,
&total_compressed);
if (ret)
goto mark_incompressible;
/*
- * Zero the tail end of the last page, as we might be sending it down
+ * Zero the tail end of the last folio, as we might be sending it down
* to disk.
*/
- poff = offset_in_page(total_compressed);
- if (poff)
- folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff);
+ loff = (total_compressed & (min_folio_size - 1));
+ if (loff)
+ folio_zero_range(folios[nr_folios - 1], loff, min_folio_size - loff);
/*
* Try to create an inline extent.
@@ -1243,18 +1250,18 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
* locked_folio is the folio that writepage had locked already. We use
* it to make sure we don't do extra locks or unlocks.
*
- * When this function fails, it unlocks all pages except @locked_folio.
+ * When this function fails, it unlocks all folios except @locked_folio.
*
* When this function successfully creates an inline extent, it returns 1 and
- * unlocks all pages including locked_folio and starts I/O on them.
- * (In reality inline extents are limited to a single page, so locked_folio is
- * the only page handled anyway).
+ * unlocks all folios including locked_folio and starts I/O on them.
+ * (In reality inline extents are limited to a single block, so locked_folio is
+ * the only folio handled anyway).
*
- * When this function succeed and creates a normal extent, the page locking
+ * When this function succeeds and creates a normal extent, the folio locking
* status depends on the passed in flags:
*
- * - If @keep_locked is set, all pages are kept locked.
- * - Else all pages except for @locked_folio are unlocked.
+ * - If COW_FILE_RANGE_KEEP_LOCKED flag is set, all folios are kept locked.
+ * - Else all folios except for @locked_folio are unlocked.
*
* When a failure happens in the second or later iteration of the
* while-loop, the ordered extents created in previous iterations are cleaned up.
@@ -1262,7 +1269,7 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
static noinline int cow_file_range(struct btrfs_inode *inode,
struct folio *locked_folio, u64 start,
u64 end, u64 *done_offset,
- bool keep_locked, bool no_inline)
+ unsigned long flags)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1290,7 +1297,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
- if (!no_inline) {
+ if (!(flags & COW_FILE_RANGE_NO_INLINE)) {
/* lets try to make an inline extent */
ret = cow_file_range_inline(inode, locked_folio, start, end, 0,
BTRFS_COMPRESS_NONE, NULL, false);
@@ -1318,7 +1325,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* Do set the Ordered (Private2) bit so we know this page was properly
* setup for writepage.
*/
- page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
+ page_ops = ((flags & COW_FILE_RANGE_KEEP_LOCKED) ? 0 : PAGE_UNLOCK);
page_ops |= PAGE_SET_ORDERED;
/*
@@ -1529,10 +1536,11 @@ out_unlock:
btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size,
end - start - cur_alloc_size + 1, NULL);
}
- btrfs_err_rl(fs_info,
- "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
- __func__, btrfs_root_id(inode->root),
- btrfs_ino(inode), orig_start, end + 1 - orig_start, ret);
+ btrfs_err(fs_info,
+"%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu cur_alloc_size=%llu: %d",
+ __func__, btrfs_root_id(inode->root),
+ btrfs_ino(inode), orig_start, end + 1 - orig_start,
+ start, cur_alloc_size, ret);
return ret;
}
@@ -1685,7 +1693,7 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode,
while (start <= end) {
ret = cow_file_range(inode, locked_folio, start, end,
- &done_offset, true, false);
+ &done_offset, COW_FILE_RANGE_KEEP_LOCKED);
if (ret)
return ret;
extent_write_locked_range(&inode->vfs_inode, locked_folio,
@@ -1766,9 +1774,15 @@ static int fallback_to_cow(struct btrfs_inode *inode,
* Don't try to create inline extents, as a mix of inline extent that
* is written out and unlocked directly and a normal NOCOW extent
* doesn't work.
+ *
+ * And here we do not unlock the folio after a successful run.
+ * The folios will be unlocked after everything is finished, or by error handling.
+ *
+ * This is to ensure error handling won't need to clear dirty/ordered flags without
+ * a locked folio, which can race with writeback.
*/
- ret = cow_file_range(inode, locked_folio, start, end, NULL, false,
- true);
+ ret = cow_file_range(inode, locked_folio, start, end, NULL,
+ COW_FILE_RANGE_NO_INLINE | COW_FILE_RANGE_KEEP_LOCKED);
ASSERT(ret != 1);
return ret;
}
@@ -1911,61 +1925,14 @@ static int can_nocow_file_extent(struct btrfs_path *path,
return ret < 0 ? ret : can_nocow;
}
-/*
- * Cleanup the dirty folios which will never be submitted due to error.
- *
- * When running a delalloc range, we may need to split the ranges (due to
- * fragmentation or NOCOW). If we hit an error in the later part, we will error
- * out and previously successfully executed range will never be submitted, thus
- * we have to cleanup those folios by clearing their dirty flag, starting and
- * finishing the writeback.
- */
-static void cleanup_dirty_folios(struct btrfs_inode *inode,
- struct folio *locked_folio,
- u64 start, u64 end, int error)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct address_space *mapping = inode->vfs_inode.i_mapping;
- pgoff_t start_index = start >> PAGE_SHIFT;
- pgoff_t end_index = end >> PAGE_SHIFT;
- u32 len;
-
- ASSERT(end + 1 - start < U32_MAX);
- ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
- IS_ALIGNED(end + 1, fs_info->sectorsize));
- len = end + 1 - start;
-
- /*
- * Handle the locked folio first.
- * The btrfs_folio_clamp_*() helpers can handle range out of the folio case.
- */
- btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len);
-
- for (pgoff_t index = start_index; index <= end_index; index++) {
- struct folio *folio;
-
- /* Already handled at the beginning. */
- if (index == locked_folio->index)
- continue;
- folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS);
- /* Cache already dropped, no need to do any cleanup. */
- if (IS_ERR(folio))
- continue;
- btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len);
- folio_unlock(folio);
- folio_put(folio);
- }
- mapping_set_error(mapping, error);
-}
-
static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio,
struct extent_state **cached,
struct can_nocow_file_extent_args *nocow_args,
u64 file_pos, bool is_prealloc)
{
struct btrfs_ordered_extent *ordered;
- u64 len = nocow_args->file_extent.num_bytes;
- u64 end = file_pos + len - 1;
+ const u64 len = nocow_args->file_extent.num_bytes;
+ const u64 end = file_pos + len - 1;
int ret = 0;
btrfs_lock_extent(&inode->io_tree, file_pos, end, cached);
@@ -1976,8 +1943,8 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio
em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent,
BTRFS_ORDERED_PREALLOC);
if (IS_ERR(em)) {
- btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached);
- return PTR_ERR(em);
+ ret = PTR_ERR(em);
+ goto error;
}
btrfs_free_extent_map(em);
}
@@ -1989,8 +1956,8 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio
if (IS_ERR(ordered)) {
if (is_prealloc)
btrfs_drop_extent_map_range(inode, file_pos, end, false);
- btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached);
- return PTR_ERR(ordered);
+ ret = PTR_ERR(ordered);
+ goto error;
}
if (btrfs_is_data_reloc_root(inode->root))
@@ -2002,23 +1969,30 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio
ret = btrfs_reloc_clone_csums(ordered);
btrfs_put_ordered_extent(ordered);
+ if (ret < 0)
+ goto error;
extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_CLEAR_DATA_RESV,
- PAGE_UNLOCK | PAGE_SET_ORDERED);
- /*
- * On error, we need to cleanup the ordered extents we created.
- *
- * We do not clear the folio Dirty flags because they are set and
- * cleaered by the caller.
- */
- if (ret < 0)
- btrfs_cleanup_ordered_extents(inode, file_pos, end);
+ PAGE_SET_ORDERED);
+ return ret;
+
+error:
+ btrfs_cleanup_ordered_extents(inode, file_pos, len);
+ extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_CLEAR_DATA_RESV,
+ PAGE_UNLOCK | PAGE_START_WRITEBACK |
+ PAGE_END_WRITEBACK);
+ btrfs_err(inode->root->fs_info,
+ "%s failed, root=%lld inode=%llu start=%llu len=%llu: %d",
+ __func__, btrfs_root_id(inode->root), btrfs_ino(inode),
+ file_pos, len, ret);
return ret;
}
/*
- * when nowcow writeback call back. This checks for snapshots or COW copies
+ * Called when nocow writeback runs. This checks for snapshots or COW copies
* of the extents that exist in the file, and COWs the file as required.
*
* If no cow copies or snapshots exist, we write directly to the existing
@@ -2035,13 +2009,23 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
/*
* If not 0, represents the inclusive end of the last fallback_to_cow()
* range. Only for error handling.
+ *
+ * The same applies to nocow_end; it avoids double cleanup of a range
+ * already cleaned by nocow_one_range().
*/
u64 cow_end = 0;
+ u64 nocow_end = 0;
u64 cur_offset = start;
int ret;
bool check_prev = true;
u64 ino = btrfs_ino(inode);
struct can_nocow_file_extent_args nocow_args = { 0 };
+ /* The range that has ordered extent(s). */
+ u64 oe_cleanup_start;
+ u64 oe_cleanup_len = 0;
+ /* The range that is untouched. */
+ u64 untouched_start;
+ u64 untouched_len = 0;
/*
* Normally on a zoned device we're only doing COW writes, but in case
@@ -2205,8 +2189,10 @@ must_cow:
&nocow_args, cur_offset,
extent_type == BTRFS_FILE_EXTENT_PREALLOC);
btrfs_dec_nocow_writers(nocow_bg);
- if (ret < 0)
+ if (ret < 0) {
+ nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1;
goto error;
+ }
cur_offset = extent_end;
}
btrfs_release_path(path);
@@ -2223,86 +2209,105 @@ must_cow:
cow_start = (u64)-1;
}
- btrfs_free_path(path);
- return 0;
-
-error:
/*
- * There are several error cases:
- *
- * 1) Failed without falling back to COW
- * start cur_offset end
- * |/////////////| |
- *
- * In this case, cow_start should be (u64)-1.
- *
- * For range [start, cur_offset) the folios are already unlocked (except
- * @locked_folio), EXTENT_DELALLOC already removed.
- * Need to clear the dirty flags and finish the ordered extents.
- *
- * 2) Failed with error before calling fallback_to_cow()
- *
- * start cow_start end
- * |/////////////| |
- *
- * In this case, only @cow_start is set, @cur_offset is between
- * [cow_start, end)
+ * Everything finished without an error, so we can unlock the folios now.
*
- * It's mostly the same as case 1), just replace @cur_offset with
- * @cow_start.
- *
- * 3) Failed with error from fallback_to_cow()
- *
- * start cow_start cow_end end
- * |/////////////|-----------| |
- *
- * In this case, both @cow_start and @cow_end is set.
- *
- * For range [start, cow_start) it's the same as case 1).
- * But for range [cow_start, cow_end), all the cleanup is handled by
- * cow_file_range(), we should not touch anything in that range.
- *
- * So for all above cases, if @cow_start is set, cleanup ordered extents
- * for range [start, @cow_start), other wise cleanup range [start, @cur_offset).
+ * No need to touch the io tree range nor set folio ordered flag, as
+ * fallback_to_cow() and nocow_one_range() have already handled them.
*/
- if (cow_start != (u64)-1)
- cur_offset = cow_start;
+ extent_clear_unlock_delalloc(inode, start, end, locked_folio, NULL, 0, PAGE_UNLOCK);
- if (cur_offset > start) {
- btrfs_cleanup_ordered_extents(inode, start, cur_offset - start);
- cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret);
- }
+ btrfs_free_path(path);
+ return 0;
- /*
- * If an error happened while a COW region is outstanding, cur_offset
- * needs to be reset to @cow_end + 1 to skip the COW range, as
- * cow_file_range() will do the proper cleanup at error.
- */
- if (cow_end)
- cur_offset = cow_end + 1;
+error:
+ if (cow_start == (u64)-1) {
+ /*
+ * case a)
+ * start cur_offset end
+ * | OE cleanup | Untouched |
+ *
+ * We finished a fallback_to_cow() or nocow_one_range() call,
+ * but failed to check the next range.
+ *
+ * or
+ * start cur_offset nocow_end end
+ * | OE cleanup | Skip | Untouched |
+ *
+ * nocow_one_range() failed and the range [cur_offset, nocow_end] is
+ * already cleaned up.
+ */
+ oe_cleanup_start = start;
+ oe_cleanup_len = cur_offset - start;
+ if (nocow_end)
+ untouched_start = nocow_end + 1;
+ else
+ untouched_start = cur_offset;
+ untouched_len = end + 1 - untouched_start;
+ } else if (cow_start != (u64)-1 && cow_end == 0) {
+ /*
+ * case b)
+ * start cow_start cur_offset end
+ * | OE cleanup | Untouched |
+ *
+ * We got a range that needs COW, but failed before hitting the next NOCOW
+ * range, thus [cow_start, cur_offset) doesn't yet have any OE.
+ */
+ oe_cleanup_start = start;
+ oe_cleanup_len = cow_start - start;
+ untouched_start = cow_start;
+ untouched_len = end + 1 - untouched_start;
+ } else {
+ /*
+ * case c)
+ * start cow_start cow_end end
+ * | OE cleanup | Skip | Untouched |
+ *
+ * fallback_to_cow() failed, and since it does the cleanup for its own
+ * range, we shouldn't touch the range [cow_start, cow_end].
+ */
+ ASSERT(cow_start != (u64)-1 && cow_end != 0);
+ oe_cleanup_start = start;
+ oe_cleanup_len = cow_start - start;
+ untouched_start = cow_end + 1;
+ untouched_len = end + 1 - untouched_start;
+ }
+
+ if (oe_cleanup_len) {
+ const u64 oe_cleanup_end = oe_cleanup_start + oe_cleanup_len - 1;
+ btrfs_cleanup_ordered_extents(inode, oe_cleanup_start, oe_cleanup_len);
+ extent_clear_unlock_delalloc(inode, oe_cleanup_start, oe_cleanup_end,
+ locked_folio, NULL,
+ EXTENT_LOCKED | EXTENT_DELALLOC,
+ PAGE_UNLOCK | PAGE_START_WRITEBACK |
+ PAGE_END_WRITEBACK);
+ }
- /*
- * We need to lock the extent here because we're clearing DELALLOC and
- * we're not locked at this point.
- */
- if (cur_offset < end) {
+ if (untouched_len) {
struct extent_state *cached = NULL;
+ const u64 untouched_end = untouched_start + untouched_len - 1;
- btrfs_lock_extent(&inode->io_tree, cur_offset, end, &cached);
- extent_clear_unlock_delalloc(inode, cur_offset, end,
+ /*
+ * We need to lock the extent here because we're clearing DELALLOC and
+ * we're not locked at this point.
+ */
+ btrfs_lock_extent(&inode->io_tree, untouched_start, untouched_end, &cached);
+ extent_clear_unlock_delalloc(inode, untouched_start, untouched_end,
locked_folio, &cached,
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK);
- btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL);
+ btrfs_qgroup_free_data(inode, NULL, untouched_start, untouched_len, NULL);
}
btrfs_free_path(path);
- btrfs_err_rl(fs_info,
- "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
- __func__, btrfs_root_id(inode->root),
- btrfs_ino(inode), start, end + 1 - start, ret);
+ btrfs_err(fs_info,
+"%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu oe_cleanup=%llu oe_cleanup_len=%llu untouched_start=%llu untouched_len=%llu: %d",
+ __func__, btrfs_root_id(inode->root), btrfs_ino(inode),
+ start, end + 1 - start, cur_offset, oe_cleanup_start, oe_cleanup_len,
+ untouched_start, untouched_len, ret);
return ret;
}
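To make the three error layouts above concrete, a worked sketch of case b) with made-up values (none of these numbers come from the patch):

	/* Delalloc range [0, 1M); a COW region started at 256K and we failed
	 * before fallback_to_cow() ran, so cow_end is still 0 (case b). */
	u64 start = 0, end = SZ_1M - 1;
	u64 cow_start = SZ_256K;

	u64 oe_cleanup_start = start;				/* 0 */
	u64 oe_cleanup_len = cow_start - start;			/* 256K, has ordered extents */
	u64 untouched_start = cow_start;			/* 256K */
	u64 untouched_len = end + 1 - untouched_start;		/* 768K, untouched */

The first range gets btrfs_cleanup_ordered_extents() plus the unlock/writeback page flags, while the untouched range only has its io tree locked, its delalloc and accounting bits cleared, and its qgroup reservation freed.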
@@ -2347,8 +2352,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol
ret = run_delalloc_cow(inode, locked_folio, start, end, wbc,
true);
else
- ret = cow_file_range(inode, locked_folio, start, end, NULL,
- false, false);
+ ret = cow_file_range(inode, locked_folio, start, end, NULL, 0);
return ret;
}
@@ -2984,7 +2988,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
* If we dropped an inline extent here, we know the range where it is
* was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
* number of bytes only for that range containing the inline extent.
- * The remaining of the range will be processed when clearning the
+ * The remainder of the range will be processed when clearing the
* EXTENT_DELALLOC_BIT bit through the ordered extent completion.
*/
if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
@@ -3100,14 +3104,15 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
if (!freespace_inode)
btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);
- if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
+ if (unlikely(test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags))) {
ret = -EIO;
goto out;
}
- if (btrfs_is_zoned(fs_info))
- btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
- ordered_extent->disk_num_bytes);
+ ret = btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes);
+ if (ret)
+ goto out;
if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
truncated = true;
@@ -3145,7 +3150,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
trans->block_rsv = &inode->block_rsv;
ret = btrfs_insert_raid_extent(trans, ordered_extent);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3153,7 +3158,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
/* Logic error */
ASSERT(list_empty(&ordered_extent->list));
- if (!list_empty(&ordered_extent->list)) {
+ if (unlikely(!list_empty(&ordered_extent->list))) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3161,7 +3166,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
btrfs_inode_safe_disk_i_size_write(inode, 0);
ret = btrfs_update_inode_fallback(trans, inode);
- if (ret) {
+ if (unlikely(ret)) {
/* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
}
@@ -3188,20 +3193,20 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
ordered_extent->disk_num_bytes);
}
}
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = btrfs_unpin_extent_cache(inode, ordered_extent->file_offset,
ordered_extent->num_bytes, trans->transid);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = add_pending_csums(trans, &ordered_extent->list);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3219,7 +3224,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
btrfs_inode_safe_disk_i_size_write(inode, 0);
ret = btrfs_update_inode_fallback(trans, inode);
- if (ret) { /* -ENOMEM or corruption */
+ if (unlikely(ret)) { /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3325,21 +3330,47 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
return btrfs_finish_one_ordered(ordered);
}
+void btrfs_calculate_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr,
+ u8 *dest)
+{
+ struct folio *folio = page_folio(phys_to_page(paddr));
+ const u32 blocksize = fs_info->sectorsize;
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+
+ shash->tfm = fs_info->csum_shash;
+ /* The full block must be inside the folio. */
+ ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio));
+
+ if (folio_test_partial_kmap(folio)) {
+ size_t cur = paddr;
+
+ crypto_shash_init(shash);
+ while (cur < paddr + blocksize) {
+ void *kaddr;
+ size_t len = min(paddr + blocksize - cur,
+ PAGE_SIZE - offset_in_page(cur));
+
+ kaddr = kmap_local_folio(folio, offset_in_folio(folio, cur));
+ crypto_shash_update(shash, kaddr, len);
+ kunmap_local(kaddr);
+ cur += len;
+ }
+ crypto_shash_final(shash, dest);
+ } else {
+ crypto_shash_digest(shash, phys_to_virt(paddr), blocksize, dest);
+ }
+}
/*
* Verify the checksum for a single block without any extra action that depends
* on the type of I/O.
*
* @paddr must point to a full block contained within a single folio.
*/
-int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum,
- const u8 * const csum_expected)
+int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum,
+ const u8 * const csum_expected)
{
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
-
- shash->tfm = fs_info->csum_shash;
- crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
-
- if (memcmp(csum, csum_expected, fs_info->csum_size))
+ btrfs_calculate_block_csum(fs_info, paddr, csum);
+ if (unlikely(memcmp(csum, csum_expected, fs_info->csum_size) != 0))
return -EIO;
return 0;
}
@@ -3358,17 +3389,16 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum
* Return %true if the sector is ok or had no checksum to start with, else %false.
*/
bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
- u32 bio_offset, struct bio_vec *bv)
+ u32 bio_offset, phys_addr_t paddr)
{
struct btrfs_inode *inode = bbio->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ const u32 blocksize = fs_info->sectorsize;
+ struct folio *folio;
u64 file_offset = bbio->file_offset + bio_offset;
- u64 end = file_offset + bv->bv_len - 1;
+ u64 end = file_offset + blocksize - 1;
u8 *csum_expected;
u8 csum[BTRFS_CSUM_SIZE];
- void *kaddr;
-
- ASSERT(bv->bv_len == fs_info->sectorsize);
if (!bbio->csum)
return true;
@@ -3384,12 +3414,8 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) *
fs_info->csum_size;
- kaddr = bvec_kmap_local(bv);
- if (btrfs_check_sector_csum(fs_info, kaddr, csum, csum_expected)) {
- kunmap_local(kaddr);
+ if (btrfs_check_block_csum(fs_info, paddr, csum, csum_expected))
goto zeroit;
- }
- kunmap_local(kaddr);
return true;
zeroit:
@@ -3397,7 +3423,9 @@ zeroit:
bbio->mirror_num);
if (dev)
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
- memzero_bvec(bv);
+ folio = page_folio(phys_to_page(paddr));
+ ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio));
+ folio_zero_range(folio, offset_in_folio(folio, paddr), blocksize);
return false;
}
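Callers of the paddr-based helpers above presumably derive the physical address from the bio_vec they are iterating; a hedged fragment of that conversion (the surrounding loop and the error reaction are illustrative, not taken from the patch):

	/* Illustrative: physical address of the block backing this bio_vec. */
	phys_addr_t paddr = page_to_phys(bv->bv_page) + bv->bv_offset;

	if (!btrfs_data_csum_ok(bbio, dev, bio_offset, paddr))
		bio->bi_status = BLK_STS_IOERR;	/* example reaction only */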
@@ -3511,7 +3539,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans,
int ret;
ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
- if (ret && ret != -EEXIST) {
+ if (unlikely(ret && ret != -EEXIST)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -3883,10 +3911,6 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path
bool filled = false;
int first_xattr_slot;
- ret = btrfs_init_file_extent_tree(inode);
- if (ret)
- goto out;
-
ret = btrfs_fill_inode(inode, &rdev);
if (!ret)
filled = true;
@@ -3918,8 +3942,6 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path
i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item));
i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item));
btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
- btrfs_inode_set_file_extent_range(inode, 0,
- round_up(i_size_read(vfs_inode), fs_info->sectorsize));
inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime),
btrfs_timespec_nsec(leaf, &inode_item->atime));
@@ -3951,6 +3973,11 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path
btrfs_set_inode_mapping_order(inode);
cache_index:
+ ret = btrfs_init_file_extent_tree(inode);
+ if (ret)
+ goto out;
+ btrfs_inode_set_file_extent_range(inode, 0,
+ round_up(i_size_read(vfs_inode), fs_info->sectorsize));
/*
* If we were modified in the current generation and evicted from memory
* and then re-read we need to do a full sync since we don't have any
@@ -4187,6 +4214,23 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
return ret;
}
+static void update_time_after_link_or_unlink(struct btrfs_inode *dir)
+{
+ struct timespec64 now;
+
+ /*
+ * If we are replaying a log tree, we do not want to update the mtime
+ * and ctime of the parent directory with the current time, since the
+ * log replay procedure is responsible for setting them to their correct
+ * values (the ones it had when the fsync was done).
+ */
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &dir->root->fs_info->flags))
+ return;
+
+ now = inode_set_ctime_current(&dir->vfs_inode);
+ inode_set_mtime_to_ts(&dir->vfs_inode, now);
+}
+
/*
* unlink helper that gets used here in inode.c and in the tree logging
* recovery code. It remove a link in a directory with a given name, and
@@ -4244,7 +4288,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
}
ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_crit(fs_info,
"failed to delete reference to %.*s, root %llu inode %llu parent %llu",
name->len, name->name, btrfs_root_id(root), ino, dir_ino);
@@ -4256,7 +4300,7 @@ skip_backref:
rename_ctx->index = index;
ret = btrfs_delete_delayed_dir_index(trans, dir, index);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -4287,7 +4331,7 @@ skip_backref:
inode_inc_iversion(&inode->vfs_inode);
inode_set_ctime_current(&inode->vfs_inode);
inode_inc_iversion(&dir->vfs_inode);
- inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
+ update_time_after_link_or_unlink(dir);
return btrfs_update_inode(trans, dir);
}
@@ -4411,7 +4455,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
btrfs_dir_item_key_to_cpu(leaf, di, &key);
WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
ret = btrfs_delete_one_dir_name(trans, root, path, di);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -4442,14 +4486,14 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
ret = btrfs_del_root_ref(trans, objectid,
btrfs_root_id(root), dir_ino,
&index, &fname.disk_name);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
}
ret = btrfs_delete_delayed_dir_index(trans, dir, index);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -4507,7 +4551,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret < 0)
return ret;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* Key with offset -1 found, there would have to exist a root
* with such id, but this is out of valid range.
@@ -4538,7 +4582,7 @@ static void btrfs_prune_dentries(struct btrfs_root *root)
inode = btrfs_find_first_inode(root, min_ino);
while (inode) {
- if (atomic_read(&inode->vfs_inode.i_count) > 1)
+ if (icount_read(&inode->vfs_inode) > 1)
d_prune_aliases(&inode->vfs_inode);
min_ino = btrfs_ino(inode) + 1;
@@ -4621,13 +4665,13 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
btrfs_record_snapshot_destroy(trans, dir);
ret = btrfs_unlink_subvol(trans, dir, dentry);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
ret = btrfs_record_root_in_trans(trans, dest);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -4641,7 +4685,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
ret = btrfs_insert_orphan_item(trans,
fs_info->tree_root,
btrfs_root_id(dest));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -4649,7 +4693,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest));
- if (ret && ret != -ENOENT) {
+ if (unlikely(ret && ret != -ENOENT)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -4658,7 +4702,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
dest->root_item.received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
btrfs_root_id(dest));
- if (ret && ret != -ENOENT) {
+ if (unlikely(ret && ret != -ENOENT)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -4798,7 +4842,7 @@ again:
folio_put(folio);
goto again;
}
- if (!folio_test_uptodate(folio)) {
+ if (unlikely(!folio_test_uptodate(folio))) {
ret = -EIO;
goto out_unlock;
}
@@ -4886,7 +4930,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 e
goto out;
/*
- * Skip the truncatioin if the range in the target block is already aligned.
+ * Skip the truncation if the range in the target block is already aligned.
* The seemingly complex check will also handle the same block case.
*/
if (in_head_block && !IS_ALIGNED(start, blocksize))
@@ -4942,7 +4986,7 @@ again:
folio_put(folio);
goto again;
}
- if (!folio_test_uptodate(folio)) {
+ if (unlikely(!folio_test_uptodate(folio))) {
ret = -EIO;
goto out_unlock;
}
@@ -5062,7 +5106,7 @@ static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len)
drop_args.drop_cache = true;
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
@@ -5582,8 +5626,8 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
}
btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
- if (location->type != BTRFS_INODE_ITEM_KEY &&
- location->type != BTRFS_ROOT_ITEM_KEY) {
+ if (unlikely(location->type != BTRFS_INODE_ITEM_KEY &&
+ location->type != BTRFS_ROOT_ITEM_KEY)) {
ret = -EUCLEAN;
btrfs_warn(root->fs_info,
"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
@@ -5677,7 +5721,17 @@ static void btrfs_del_inode_from_root(struct btrfs_inode *inode)
bool empty = false;
xa_lock(&root->inodes);
- entry = __xa_erase(&root->inodes, btrfs_ino(inode));
+ /*
+ * This btrfs_inode is being freed and has already been unhashed at this
+ * point. It's possible that another btrfs_inode has already been
+ * allocated for the same inode and inserted itself into the root, so
+ * don't delete it in that case.
+ *
+ * Note that this shouldn't need to allocate memory, so the gfp flags
+ * don't really matter.
+ */
+ entry = __xa_cmpxchg(&root->inodes, btrfs_ino(inode), inode, NULL,
+ GFP_ATOMIC);
if (entry == inode)
empty = xa_empty(&root->inodes);
xa_unlock(&root->inodes);
@@ -5864,7 +5918,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
return ERR_CAST(inode);
/* Do extra check against inode mode with di_type */
- if (btrfs_inode_type(inode) != di_type) {
+ if (unlikely(btrfs_inode_type(inode) != di_type)) {
btrfs_crit(fs_info,
"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
inode->vfs_inode.i_mode, btrfs_inode_type(inode),
@@ -6451,6 +6505,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
if (!args->subvol)
btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir));
+ btrfs_set_inode_mapping_order(BTRFS_I(inode));
if (S_ISREG(inode->i_mode)) {
if (btrfs_test_opt(fs_info, NODATASUM))
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
@@ -6458,7 +6513,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
BTRFS_INODE_NODATASUM;
btrfs_update_inode_mapping_flags(BTRFS_I(inode));
- btrfs_set_inode_mapping_order(BTRFS_I(inode));
}
ret = btrfs_insert_inode_locked(inode);
@@ -6505,7 +6559,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]);
batch.nr = args->orphan ? 1 : 2;
ret = btrfs_insert_empty_items(trans, root, path, &batch);
- if (ret != 0) {
+ if (unlikely(ret != 0)) {
btrfs_abort_transaction(trans, ret);
goto discard;
}
@@ -6582,7 +6636,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
*/
if (!args->subvol) {
ret = btrfs_init_inode_security(trans, args);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto discard;
}
@@ -6602,14 +6656,14 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
if (args->orphan) {
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto discard;
}
} else {
ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
0, BTRFS_I(inode)->dir_index);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto discard;
}
@@ -6640,7 +6694,7 @@ out:
*/
int btrfs_add_link(struct btrfs_trans_handle *trans,
struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
- const struct fscrypt_str *name, int add_backref, u64 index)
+ const struct fscrypt_str *name, bool add_backref, u64 index)
{
int ret = 0;
struct btrfs_key key;
@@ -6673,7 +6727,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
btrfs_inode_type(inode), index);
if (ret == -EEXIST || ret == -EOVERFLOW)
goto fail_dir_item;
- else if (ret) {
+ else if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -6681,15 +6735,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
name->len * 2);
inode_inc_iversion(&parent_inode->vfs_inode);
- /*
- * If we are replaying a log tree, we do not want to update the mtime
- * and ctime of the parent directory with the current time, since the
- * log replay procedure is responsible for setting them to their correct
- * values (the ones it had when the fsync was done).
- */
- if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
- inode_set_mtime_to_ts(&parent_inode->vfs_inode,
- inode_set_ctime_current(&parent_inode->vfs_inode));
+ update_time_after_link_or_unlink(parent_inode);
ret = btrfs_update_inode(trans, parent_inode);
if (ret)
@@ -6794,7 +6840,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
struct fscrypt_name fname;
u64 index;
int ret;
- int drop_inode = 0;
/* do not allow sys_link's with other subvols of the same device */
if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root))
@@ -6826,44 +6871,44 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
/* There are several dir indexes for this inode, clear the cache. */
BTRFS_I(inode)->dir_index = 0ULL;
- inc_nlink(inode);
inode_inc_iversion(inode);
inode_set_ctime_current(inode);
- ihold(inode);
set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
&fname.disk_name, 1, index);
+ if (ret)
+ goto fail;
- if (ret) {
- drop_inode = 1;
- } else {
- struct dentry *parent = dentry->d_parent;
+ /* Link added now we update the inode item with the new link count. */
+ inc_nlink(inode);
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
- ret = btrfs_update_inode(trans, BTRFS_I(inode));
- if (ret)
+ if (inode->i_nlink == 1) {
+ /*
+ * If the new hard link count is 1, it's a file created with the
+ * open(2) O_TMPFILE flag.
+ */
+ ret = btrfs_orphan_del(trans, BTRFS_I(inode));
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
goto fail;
- if (inode->i_nlink == 1) {
- /*
- * If new hard link count is 1, it's a file created
- * with open(2) O_TMPFILE flag.
- */
- ret = btrfs_orphan_del(trans, BTRFS_I(inode));
- if (ret)
- goto fail;
}
- d_instantiate(dentry, inode);
- btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
}
+ /* Grab a reference for the new dentry passed to d_instantiate(). */
+ ihold(inode);
+ d_instantiate(dentry, inode);
+ btrfs_log_new_name(trans, old_dentry, NULL, 0, dentry->d_parent);
+
fail:
fscrypt_free_filename(&fname);
if (trans)
btrfs_end_transaction(trans);
- if (drop_inode) {
- inode_dec_link_count(inode);
- iput(inode);
- }
btrfs_btree_balance_dirty(fs_info);
return ret;
}
@@ -7057,7 +7102,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
/* Only regular file could have regular/prealloc extent */
- if (!S_ISREG(inode->vfs_inode.i_mode)) {
+ if (unlikely(!S_ISREG(inode->vfs_inode.i_mode))) {
ret = -EUCLEAN;
btrfs_crit(fs_info,
"regular/prealloc extent found for non-regular inode %llu",
@@ -7134,7 +7179,7 @@ not_found:
insert:
ret = 0;
btrfs_release_path(path);
- if (em->start > start || btrfs_extent_map_end(em) <= start) {
+ if (unlikely(em->start > start || btrfs_extent_map_end(em) <= start)) {
btrfs_err(fs_info,
"bad extent! em: [%llu %llu] passed [%llu %llu]",
em->start, em->len, start, len);
@@ -7819,6 +7864,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->last_sub_trans = 0;
ei->logged_trans = 0;
ei->delalloc_bytes = 0;
+ /* new_delalloc_bytes and last_dir_index_offset are in a union. */
ei->new_delalloc_bytes = 0;
ei->defrag_bytes = 0;
ei->disk_i_size = 0;
@@ -7953,7 +7999,7 @@ int btrfs_drop_inode(struct inode *inode)
if (btrfs_root_refs(&root->root_item) == 0)
return 1;
else
- return generic_drop_inode(inode);
+ return inode_generic_drop(inode);
}
static void init_once(void *foo)
@@ -7961,6 +8007,9 @@ static void init_once(void *foo)
struct btrfs_inode *ei = foo;
inode_init_once(&ei->vfs_inode);
+#ifdef CONFIG_FS_VERITY
+ ei->i_verity_info = NULL;
+#endif
}
void __cold btrfs_destroy_cachep(void)
@@ -8162,7 +8211,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
btrfs_ino(BTRFS_I(old_dir)),
new_idx);
if (ret) {
- if (need_abort)
+ if (unlikely(need_abort))
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8210,7 +8259,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* src is a subvolume */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8218,12 +8267,12 @@ static int btrfs_rename_exchange(struct inode *old_dir,
ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(old_dentry->d_inode),
old_name, &old_rename_ctx);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8232,7 +8281,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* dest is a subvolume */
if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8240,12 +8289,12 @@ static int btrfs_rename_exchange(struct inode *old_dir,
ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(new_dentry->d_inode),
new_name, &new_rename_ctx);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
ret = btrfs_update_inode(trans, BTRFS_I(new_inode));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8253,14 +8302,14 @@ static int btrfs_rename_exchange(struct inode *old_dir,
ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
new_name, 0, old_idx);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
old_name, 0, new_idx);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8501,7 +8550,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8509,12 +8558,12 @@ static int btrfs_rename(struct mnt_idmap *idmap,
ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
&old_fname.disk_name, &rename_ctx);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8525,7 +8574,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8534,7 +8583,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(d_inode(new_dentry)),
&new_fname.disk_name);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8542,7 +8591,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
if (new_inode->i_nlink == 0) {
ret = btrfs_orphan_add(trans,
BTRFS_I(d_inode(new_dentry)));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8551,7 +8600,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
&new_fname.disk_name, 0, index);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8565,7 +8614,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
if (flags & RENAME_WHITEOUT) {
ret = btrfs_create_new_inode(trans, &whiteout_args);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
} else {
@@ -8859,7 +8908,7 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
goto out;
path = btrfs_alloc_path();
- if (!path) {
+ if (unlikely(!path)) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
discard_new_inode(inode);
@@ -8871,7 +8920,7 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
key.offset = 0;
datasize = btrfs_file_extent_calc_inline_size(name_len);
ret = btrfs_insert_empty_item(trans, root, path, &key, datasize);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_free_path(path);
discard_new_inode(inode);
@@ -9084,7 +9133,7 @@ next:
ret = btrfs_update_inode(trans, BTRFS_I(inode));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
if (own_trans)
btrfs_end_transaction(trans);
@@ -9252,7 +9301,7 @@ static ssize_t btrfs_encoded_read_inline(
ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
extent_start, 0);
if (ret) {
- if (ret > 0) {
+ if (unlikely(ret > 0)) {
/* The extent item disappeared? */
return -EIO;
}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 7e13de2bdcbf..a454b5ba2097 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -376,13 +376,13 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
if (comp) {
ret = btrfs_set_prop(trans, inode, "btrfs.compression",
comp, strlen(comp), 0);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
} else {
ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL, 0, 0);
- if (ret && ret != -ENODATA) {
+ if (unlikely(ret && ret != -ENODATA)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -633,7 +633,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
btrfs_clear_buffer_dirty(trans, leaf);
btrfs_tree_unlock(leaf);
ret2 = btrfs_free_tree_block(trans, objectid, leaf, 0, 1);
- if (ret2 < 0)
+ if (unlikely(ret2 < 0))
btrfs_abort_transaction(trans, ret2);
free_extent_buffer(leaf);
goto out;
@@ -654,14 +654,14 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
/* ... and new_root is owned by new_inode_args.inode now. */
ret = btrfs_record_root_in_trans(trans, new_root);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = btrfs_uuid_tree_add(trans, root_item->uuid,
BTRFS_UUID_KEY_SUBVOL, objectid);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -669,7 +669,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
btrfs_record_new_subvolume(trans, BTRFS_I(dir));
ret = btrfs_create_new_inode(trans, &new_inode_args);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -957,7 +957,7 @@ static noinline int btrfs_mksnapshot(struct dentry *parent,
/*
* Force new buffered writes to reserve space even when NOCOW is
- * possible. This is to avoid later writeback (running dealloc) to
+ * possible. This is to avoid later writeback (running delalloc) to
* fallback to COW mode and unexpectedly fail with ENOSPC.
*/
btrfs_drew_read_lock(&root->snapshot_lock);
@@ -1251,7 +1251,7 @@ out:
}
static noinline int btrfs_ioctl_snap_create(struct file *file,
- void __user *arg, int subvol)
+ void __user *arg, bool subvol)
{
struct btrfs_ioctl_vol_args *vol_args;
int ret;
@@ -2133,7 +2133,7 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
ret = btrfs_next_leaf(fs_info->tree_root, path);
if (ret < 0) {
goto out;
- } else if (ret > 0) {
+ } else if (unlikely(ret > 0)) {
ret = -EUCLEAN;
goto out;
}
@@ -2216,7 +2216,7 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root,
ret = btrfs_next_leaf(root, path);
if (ret < 0) {
goto out;
- } else if (ret > 0) {
+ } else if (unlikely(ret > 0)) {
ret = -EUCLEAN;
goto out;
}
@@ -2245,7 +2245,7 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root,
ret = btrfs_next_item(root, path);
if (ret < 0) {
goto out;
- } else if (ret > 0) {
+ } else if (unlikely(ret > 0)) {
ret = -EUCLEAN;
goto out;
}
@@ -4008,7 +4008,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
btrfs_root_id(root));
- if (ret && ret != -ENOENT) {
+ if (unlikely(ret && ret != -ENOENT)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
goto out;
@@ -4032,7 +4032,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
ret = btrfs_uuid_tree_add(trans, sa->uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
btrfs_root_id(root));
- if (ret < 0 && ret != -EEXIST) {
+ if (unlikely(ret < 0 && ret != -EEXIST)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
goto out;
@@ -4418,6 +4418,10 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
goto out_acct;
}
+ if (fs_info->sectorsize > PAGE_SIZE) {
+ ret = -ENOTTY;
+ goto out_acct;
+ }
if (compat) {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
struct btrfs_ioctl_encoded_io_args_32 args32;
@@ -4509,6 +4513,7 @@ out_acct:
static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat)
{
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(file->f_inode);
struct btrfs_ioctl_encoded_io_args args;
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
@@ -4522,6 +4527,11 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
goto out_acct;
}
+ if (fs_info->sectorsize > PAGE_SIZE) {
+ ret = -ENOTTY;
+ goto out_acct;
+ }
+
if (!(file->f_mode & FMODE_WRITE)) {
ret = -EBADF;
goto out_acct;
@@ -4780,14 +4790,14 @@ out_fail:
static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
+ struct file *file = cmd->file;
+ struct btrfs_inode *inode = BTRFS_I(file->f_inode);
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags);
size_t copy_end;
int ret;
u64 disk_bytenr, disk_io_size;
- struct file *file;
- struct btrfs_inode *inode;
- struct btrfs_fs_info *fs_info;
- struct extent_io_tree *io_tree;
loff_t pos;
struct kiocb kiocb;
struct extent_state *cached_state = NULL;
@@ -4803,10 +4813,11 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
ret = -EPERM;
goto out_acct;
}
- file = cmd->file;
- inode = BTRFS_I(file->f_inode);
- fs_info = inode->root->fs_info;
- io_tree = &inode->io_tree;
+ if (fs_info->sectorsize > PAGE_SIZE) {
+ ret = -ENOTTY;
+ goto out_acct;
+ }
+
sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr));
if (issue_flags & IO_URING_F_COMPAT) {
@@ -4933,9 +4944,10 @@ out_acct:
static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
+ struct file *file = cmd->file;
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(file->f_inode);
loff_t pos;
struct kiocb kiocb;
- struct file *file;
ssize_t ret;
void __user *sqe_addr;
struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
@@ -4948,8 +4960,11 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu
ret = -EPERM;
goto out_acct;
}
+ if (fs_info->sectorsize > PAGE_SIZE) {
+ ret = -ENOTTY;
+ goto out_acct;
+ }
- file = cmd->file;
sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr));
if (!(file->f_mode & FMODE_WRITE)) {
@@ -5223,13 +5238,13 @@ long btrfs_ioctl(struct file *file, unsigned int
case FITRIM:
return btrfs_ioctl_fitrim(fs_info, argp);
case BTRFS_IOC_SNAP_CREATE:
- return btrfs_ioctl_snap_create(file, argp, 0);
+ return btrfs_ioctl_snap_create(file, argp, false);
case BTRFS_IOC_SNAP_CREATE_V2:
- return btrfs_ioctl_snap_create_v2(file, argp, 0);
+ return btrfs_ioctl_snap_create_v2(file, argp, false);
case BTRFS_IOC_SUBVOL_CREATE:
- return btrfs_ioctl_snap_create(file, argp, 1);
+ return btrfs_ioctl_snap_create(file, argp, true);
case BTRFS_IOC_SUBVOL_CREATE_V2:
- return btrfs_ioctl_snap_create_v2(file, argp, 1);
+ return btrfs_ioctl_snap_create_v2(file, argp, true);
case BTRFS_IOC_SNAP_DESTROY:
return btrfs_ioctl_snap_destroy(file, argp, false);
case BTRFS_IOC_SNAP_DESTROY_V2:
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index a3e6d9616e60..0035851d72b0 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -361,7 +361,7 @@ void btrfs_drew_read_lock(struct btrfs_drew_lock *lock)
atomic_inc(&lock->readers);
/*
- * Ensure the pending reader count is perceieved BEFORE this reader
+ * Ensure the pending reader count is perceived BEFORE this reader
* goes to sleep in case of active writers. This guarantees new writers
* won't be allowed and that the current reader will be woken up when
* the last active writer finishes its jobs.
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index af29df98ac14..a4673e7d95d7 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -74,7 +74,7 @@ enum btrfs_lock_nesting {
BTRFS_NESTING_NEW_ROOT,
/*
- * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so
+ * We are limited to MAX_LOCKDEP_SUBCLASSES number of subclasses, so
* add this in here and add a static_assert to keep us from going over
* the limit. As of this writing we're limited to 8, and we're
* definitely using 8, hence this check to keep us from messing up in
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index d403641889ca..4758f66da449 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -58,9 +58,6 @@
* 0x1000 | SegHdr N+1| Data payload N+1 ... |
*/
-#define WORKSPACE_BUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE))
-#define WORKSPACE_CBUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE))
-
struct workspace {
void *mem;
void *buf; /* where decompressed data goes */
@@ -68,7 +65,14 @@ struct workspace {
struct list_head list;
};
-static struct workspace_manager wsm;
+static u32 workspace_buf_length(const struct btrfs_fs_info *fs_info)
+{
+ return lzo1x_worst_compress(fs_info->sectorsize);
+}
+static u32 workspace_cbuf_length(const struct btrfs_fs_info *fs_info)
+{
+ return lzo1x_worst_compress(fs_info->sectorsize);
+}
void lzo_free_workspace(struct list_head *ws)
{
@@ -80,7 +84,7 @@ void lzo_free_workspace(struct list_head *ws)
kfree(workspace);
}
-struct list_head *lzo_alloc_workspace(void)
+struct list_head *lzo_alloc_workspace(struct btrfs_fs_info *fs_info)
{
struct workspace *workspace;
@@ -89,8 +93,8 @@ struct list_head *lzo_alloc_workspace(void)
return ERR_PTR(-ENOMEM);
workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL | __GFP_NOWARN);
- workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL | __GFP_NOWARN);
- workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL | __GFP_NOWARN);
+ workspace->buf = kvmalloc(workspace_buf_length(fs_info), GFP_KERNEL | __GFP_NOWARN);
+ workspace->cbuf = kvmalloc(workspace_cbuf_length(fs_info), GFP_KERNEL | __GFP_NOWARN);
if (!workspace->mem || !workspace->buf || !workspace->cbuf)
goto fail;
@@ -128,19 +132,21 @@ static inline size_t read_compress_length(const char *buf)
*
* Will allocate new pages when needed.
*/
-static int copy_compressed_data_to_page(char *compressed_data,
+static int copy_compressed_data_to_page(struct btrfs_fs_info *fs_info,
+ char *compressed_data,
size_t compressed_size,
struct folio **out_folios,
unsigned long max_nr_folio,
- u32 *cur_out,
- const u32 sectorsize)
+ u32 *cur_out)
{
+ const u32 sectorsize = fs_info->sectorsize;
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
u32 sector_bytes_left;
u32 orig_out;
struct folio *cur_folio;
char *kaddr;
- if ((*cur_out / PAGE_SIZE) >= max_nr_folio)
+ if ((*cur_out >> min_folio_shift) >= max_nr_folio)
return -E2BIG;
/*
@@ -149,18 +155,17 @@ static int copy_compressed_data_to_page(char *compressed_data,
*/
ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize);
- cur_folio = out_folios[*cur_out / PAGE_SIZE];
+ cur_folio = out_folios[*cur_out >> min_folio_shift];
/* Allocate a new page */
if (!cur_folio) {
- cur_folio = btrfs_alloc_compr_folio();
+ cur_folio = btrfs_alloc_compr_folio(fs_info);
if (!cur_folio)
return -ENOMEM;
- out_folios[*cur_out / PAGE_SIZE] = cur_folio;
+ out_folios[*cur_out >> min_folio_shift] = cur_folio;
}
- kaddr = kmap_local_folio(cur_folio, 0);
- write_compress_length(kaddr + offset_in_page(*cur_out),
- compressed_size);
+ kaddr = kmap_local_folio(cur_folio, offset_in_folio(cur_folio, *cur_out));
+ write_compress_length(kaddr, compressed_size);
*cur_out += LZO_LEN;
orig_out = *cur_out;
@@ -172,20 +177,20 @@ static int copy_compressed_data_to_page(char *compressed_data,
kunmap_local(kaddr);
- if ((*cur_out / PAGE_SIZE) >= max_nr_folio)
+ if ((*cur_out >> min_folio_shift) >= max_nr_folio)
return -E2BIG;
- cur_folio = out_folios[*cur_out / PAGE_SIZE];
+ cur_folio = out_folios[*cur_out >> min_folio_shift];
/* Allocate a new page */
if (!cur_folio) {
- cur_folio = btrfs_alloc_compr_folio();
+ cur_folio = btrfs_alloc_compr_folio(fs_info);
if (!cur_folio)
return -ENOMEM;
- out_folios[*cur_out / PAGE_SIZE] = cur_folio;
+ out_folios[*cur_out >> min_folio_shift] = cur_folio;
}
kaddr = kmap_local_folio(cur_folio, 0);
- memcpy(kaddr + offset_in_page(*cur_out),
+ memcpy(kaddr + offset_in_folio(cur_folio, *cur_out),
compressed_data + *cur_out - orig_out, copy_len);
*cur_out += copy_len;
@@ -209,12 +214,15 @@ out:
return 0;
}
-int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
+int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct workspace *workspace = list_entry(ws, struct workspace, list);
- const u32 sectorsize = inode_to_fs_info(mapping->host)->sectorsize;
+ const u32 sectorsize = fs_info->sectorsize;
+ const u32 min_folio_size = btrfs_min_folio_size(fs_info);
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
struct folio *folio_in = NULL;
char *sizes_ptr;
const unsigned long max_nr_folio = *out_folios;
@@ -263,9 +271,9 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
goto out;
}
- ret = copy_compressed_data_to_page(workspace->cbuf, out_len,
+ ret = copy_compressed_data_to_page(fs_info, workspace->cbuf, out_len,
folios, max_nr_folio,
- &cur_out, sectorsize);
+ &cur_out);
if (ret < 0)
goto out;
@@ -280,8 +288,8 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
goto out;
}
- /* Check if we have reached page boundary */
- if (PAGE_ALIGNED(cur_in)) {
+ /* Check if we have reached folio boundary. */
+ if (IS_ALIGNED(cur_in, min_folio_size)) {
folio_put(folio_in);
folio_in = NULL;
}
@@ -298,7 +306,7 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
out:
if (folio_in)
folio_put(folio_in);
- *out_folios = DIV_ROUND_UP(cur_out, PAGE_SIZE);
+ *out_folios = DIV_ROUND_UP(cur_out, min_folio_size);
return ret;
}
@@ -310,15 +318,16 @@ out:
static void copy_compressed_segment(struct compressed_bio *cb,
char *dest, u32 len, u32 *cur_in)
{
+ struct btrfs_fs_info *fs_info = cb_to_fs_info(cb);
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
u32 orig_in = *cur_in;
while (*cur_in < orig_in + len) {
- struct folio *cur_folio;
- u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in),
- orig_in + len - *cur_in);
+ struct folio *cur_folio = cb->compressed_folios[*cur_in >> min_folio_shift];
+ u32 copy_len = min_t(u32, orig_in + len - *cur_in,
+ folio_size(cur_folio) - offset_in_folio(cur_folio, *cur_in));
ASSERT(copy_len);
- cur_folio = cb->compressed_folios[*cur_in / PAGE_SIZE];
memcpy_from_folio(dest + *cur_in - orig_in, cur_folio,
offset_in_folio(cur_folio, *cur_in), copy_len);
@@ -332,6 +341,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
struct workspace *workspace = list_entry(ws, struct workspace, list);
const struct btrfs_fs_info *fs_info = cb->bbio.inode->root->fs_info;
const u32 sectorsize = fs_info->sectorsize;
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
char *kaddr;
int ret;
/* Compressed data length, can be unaligned */
@@ -378,14 +388,14 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
*/
ASSERT(cur_in / sectorsize ==
(cur_in + LZO_LEN - 1) / sectorsize);
- cur_folio = cb->compressed_folios[cur_in / PAGE_SIZE];
+ cur_folio = cb->compressed_folios[cur_in >> min_folio_shift];
ASSERT(cur_folio);
kaddr = kmap_local_folio(cur_folio, 0);
- seg_len = read_compress_length(kaddr + offset_in_page(cur_in));
+ seg_len = read_compress_length(kaddr + offset_in_folio(cur_folio, cur_in));
kunmap_local(kaddr);
cur_in += LZO_LEN;
- if (unlikely(seg_len > WORKSPACE_CBUF_LENGTH)) {
+ if (unlikely(seg_len > workspace_cbuf_length(fs_info))) {
struct btrfs_inode *inode = cb->bbio.inode;
/*
@@ -445,19 +455,19 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
const u32 sectorsize = fs_info->sectorsize;
size_t in_len;
size_t out_len;
- size_t max_segment_len = WORKSPACE_BUF_LENGTH;
+ size_t max_segment_len = workspace_buf_length(fs_info);
int ret = 0;
- if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2)
+ if (unlikely(srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2))
return -EUCLEAN;
in_len = read_compress_length(data_in);
- if (in_len != srclen)
+ if (unlikely(in_len != srclen))
return -EUCLEAN;
data_in += LZO_LEN;
in_len = read_compress_length(data_in);
- if (in_len != srclen - LZO_LEN * 2) {
+ if (unlikely(in_len != srclen - LZO_LEN * 2)) {
ret = -EUCLEAN;
goto out;
}
@@ -487,8 +497,7 @@ out:
return ret;
}
-const struct btrfs_compress_op btrfs_lzo_compress = {
- .workspace_manager = &wsm,
+const struct btrfs_compress_levels btrfs_lzo_compress = {
.max_level = 1,
.default_level = 1,
};
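A quick worked example of the folio indexing introduced above, assuming block_min_order is the folio order needed for the minimum folio to cover one block: with 16K blocks on 4K pages, block_min_order is 2, so min_folio_shift = PAGE_SHIFT + 2 = 14 and an output offset cur_out of 40960 bytes selects out_folios[40960 >> 14] = out_folios[2]. The compressed folio array is therefore indexed in minimum-folio units (16K here) rather than PAGE_SIZE units; for block size <= page size (block_min_order 0) the shift reduces to PAGE_SHIFT, matching the old cur_out / PAGE_SIZE behaviour.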
diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c
index 363fd28c0268..a0cf8effe008 100644
--- a/fs/btrfs/messages.c
+++ b/fs/btrfs/messages.c
@@ -18,6 +18,7 @@ static const char fs_state_chars[] = {
[BTRFS_FS_STATE_REMOUNTING] = 'M',
[BTRFS_FS_STATE_RO] = 0,
[BTRFS_FS_STATE_TRANS_ABORTED] = 'A',
+ [BTRFS_FS_STATE_LOG_REPLAY_ABORTED] = 'O',
[BTRFS_FS_STATE_DEV_REPLACING] = 'R',
[BTRFS_FS_STATE_DUMMY_FS_INFO] = 0,
[BTRFS_FS_STATE_NO_DATA_CSUMS] = 'C',
diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h
index 022ebc89af85..4416c165644f 100644
--- a/fs/btrfs/messages.h
+++ b/fs/btrfs/messages.h
@@ -4,7 +4,6 @@
#define BTRFS_MESSAGES_H
#include <linux/types.h>
-#include <linux/types.h>
#include <linux/printk.h>
#include <linux/bug.h>
diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h
index ff5eac84d819..60f9b000d644 100644
--- a/fs/btrfs/misc.h
+++ b/fs/btrfs/misc.h
@@ -11,6 +11,7 @@
#include <linux/pagemap.h>
#include <linux/math64.h>
#include <linux/rbtree.h>
+#include <linux/bio.h>
/*
* Enumerate bits using enum autoincrement. Define the @name as the n-th bit.
@@ -20,6 +21,54 @@
name = (1U << __ ## name ## _BIT), \
__ ## name ## _SEQ = __ ## name ## _BIT
+static inline phys_addr_t bio_iter_phys(struct bio *bio, struct bvec_iter *iter)
+{
+ struct bio_vec bv = bio_iter_iovec(bio, *iter);
+
+ return bvec_phys(&bv);
+}
+
+/*
+ * Iterate a bio in btrfs block size steps.
+ *
+ * This handles large folios and highmem.
+ *
+ * @paddr: Physical memory address of each iteration
+ * @bio: The bio to iterate
+ * @iter: The bvec_iter (pointer) to use.
+ * @blocksize: The block size to advance by on each iteration.
+ *
+ * This requires all folios in the bio to cover at least one block.
+ */
+#define btrfs_bio_for_each_block(paddr, bio, iter, blocksize) \
+ for (; (iter)->bi_size && \
+ (paddr = bio_iter_phys((bio), (iter)), 1); \
+ bio_advance_iter_single((bio), (iter), (blocksize)))
+
+/* Initialize a bvec_iter to the size of the specified bio. */
+static inline struct bvec_iter init_bvec_iter_for_bio(struct bio *bio)
+{
+ struct bio_vec *bvec;
+ u32 bio_size = 0;
+ int i;
+
+ bio_for_each_bvec_all(bvec, bio, i)
+ bio_size += bvec->bv_len;
+
+ return (struct bvec_iter) {
+ .bi_sector = 0,
+ .bi_size = bio_size,
+ .bi_idx = 0,
+ .bi_bvec_done = 0,
+ };
+}
+
+#define btrfs_bio_for_each_block_all(paddr, bio, blocksize) \
+ for (struct bvec_iter iter = init_bvec_iter_for_bio(bio); \
+ (iter).bi_size && \
+ (paddr = bio_iter_phys((bio), &(iter)), 1); \
+ bio_advance_iter_single((bio), &(iter), (blocksize)))
+
static inline void cond_wake_up(struct wait_queue_head *wq)
{
/*
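The iterators added above are exercised by the raid56.c hunks later in this patch. A minimal usage sketch (walk_bio_blocks() is hypothetical, not part of the patch), assuming only the helpers defined here plus a bio whose folios each cover at least one block:

static void walk_bio_blocks(struct btrfs_fs_info *fs_info, struct bio *bio)
{
	struct bvec_iter iter = bio->bi_iter;
	phys_addr_t paddr;

	/* Visit each filesystem block of the bio by its physical address. */
	btrfs_bio_for_each_block(paddr, bio, &iter, fs_info->sectorsize) {
		/* paddr covers one fs_info->sectorsize chunk of bio data. */
		pr_debug("block at %pa\n", &paddr);
	}
}

The _all variant builds a fresh bvec_iter over the whole bio via init_bvec_iter_for_bio(), mirroring how bio_for_each_segment_all() was used on non-cloned bios (see set_bio_pages_uptodate() in the raid56.c changes below).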
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 74e38da9bd39..62b993fae54f 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -6,12 +6,19 @@
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
+#include "file-item.h"
#include "print-tree.h"
#include "accessors.h"
#include "tree-checker.h"
#include "volumes.h"
#include "raid-stripe-tree.h"
+/*
+ * Large enough buffer size for the stringification of any key type yet short
+ * enough to use the stack and avoid allocations.
+ */
+#define KEY_TYPE_BUF_SIZE 32
+
struct root_name_map {
u64 id;
const char *name;
@@ -227,21 +234,209 @@ static void print_eb_refs_lock(const struct extent_buffer *eb)
#endif
}
+static void print_timespec(const struct extent_buffer *eb,
+ struct btrfs_timespec *timespec,
+ const char *prefix, const char *suffix)
+{
+ const u64 secs = btrfs_timespec_sec(eb, timespec);
+ const u32 nsecs = btrfs_timespec_nsec(eb, timespec);
+
+ pr_info("%s%llu.%u%s", prefix, secs, nsecs, suffix);
+}
+
+static void print_inode_item(const struct extent_buffer *eb, int i)
+{
+ struct btrfs_inode_item *ii = btrfs_item_ptr(eb, i, struct btrfs_inode_item);
+
+ pr_info("\t\tinode generation %llu transid %llu size %llu nbytes %llu\n",
+ btrfs_inode_generation(eb, ii), btrfs_inode_transid(eb, ii),
+ btrfs_inode_size(eb, ii), btrfs_inode_nbytes(eb, ii));
+ pr_info("\t\tblock group %llu mode %o links %u uid %u gid %u\n",
+ btrfs_inode_block_group(eb, ii), btrfs_inode_mode(eb, ii),
+ btrfs_inode_nlink(eb, ii), btrfs_inode_uid(eb, ii),
+ btrfs_inode_gid(eb, ii));
+ pr_info("\t\trdev %llu sequence %llu flags 0x%llx\n",
+ btrfs_inode_rdev(eb, ii), btrfs_inode_sequence(eb, ii),
+ btrfs_inode_flags(eb, ii));
+ print_timespec(eb, &ii->atime, "\t\tatime ", "\n");
+ print_timespec(eb, &ii->ctime, "\t\tctime ", "\n");
+ print_timespec(eb, &ii->mtime, "\t\tmtime ", "\n");
+ print_timespec(eb, &ii->otime, "\t\totime ", "\n");
+}
+
+static void print_dir_item(const struct extent_buffer *eb, int i)
+{
+ const u32 size = btrfs_item_size(eb, i);
+ struct btrfs_dir_item *di = btrfs_item_ptr(eb, i, struct btrfs_dir_item);
+ u32 cur = 0;
+
+ while (cur < size) {
+ const u32 name_len = btrfs_dir_name_len(eb, di);
+ const u32 data_len = btrfs_dir_data_len(eb, di);
+ const u32 len = sizeof(*di) + name_len + data_len;
+ struct btrfs_key location;
+
+ btrfs_dir_item_key_to_cpu(eb, di, &location);
+ pr_info("\t\tlocation key (%llu %u %llu) type %d\n",
+ location.objectid, location.type, location.offset,
+ btrfs_dir_ftype(eb, di));
+ pr_info("\t\ttransid %llu data_len %u name_len %u\n",
+ btrfs_dir_transid(eb, di), data_len, name_len);
+ di = (struct btrfs_dir_item *)((char *)di + len);
+ cur += len;
+ }
+}
+
+static void print_inode_ref_item(const struct extent_buffer *eb, int i)
+{
+ const u32 size = btrfs_item_size(eb, i);
+ struct btrfs_inode_ref *ref = btrfs_item_ptr(eb, i, struct btrfs_inode_ref);
+ u32 cur = 0;
+
+ while (cur < size) {
+ const u64 index = btrfs_inode_ref_index(eb, ref);
+ const u32 name_len = btrfs_inode_ref_name_len(eb, ref);
+ const u32 len = sizeof(*ref) + name_len;
+
+ pr_info("\t\tindex %llu name_len %u\n", index, name_len);
+ ref = (struct btrfs_inode_ref *)((char *)ref + len);
+ cur += len;
+ }
+}
+
+static void print_inode_extref_item(const struct extent_buffer *eb, int i)
+{
+ const u32 size = btrfs_item_size(eb, i);
+ struct btrfs_inode_extref *extref;
+ u32 cur = 0;
+
+ extref = btrfs_item_ptr(eb, i, struct btrfs_inode_extref);
+ while (cur < size) {
+ const u64 index = btrfs_inode_extref_index(eb, extref);
+ const u32 name_len = btrfs_inode_extref_name_len(eb, extref);
+ const u64 parent = btrfs_inode_extref_parent(eb, extref);
+ const u32 len = sizeof(*extref) + name_len;
+
+ pr_info("\t\tindex %llu parent %llu name_len %u\n",
+ index, parent, name_len);
+ extref = (struct btrfs_inode_extref *)((char *)extref + len);
+ cur += len;
+ }
+}
+
+static void print_dir_log_index_item(const struct extent_buffer *eb, int i)
+{
+ struct btrfs_dir_log_item *dlog;
+
+ dlog = btrfs_item_ptr(eb, i, struct btrfs_dir_log_item);
+ pr_info("\t\tdir log end %llu\n", btrfs_dir_log_end(eb, dlog));
+}
+
+static void print_extent_csum(const struct extent_buffer *eb, int i)
+{
+ const struct btrfs_fs_info *fs_info = eb->fs_info;
+ const u32 size = btrfs_item_size(eb, i);
+ const u32 csum_bytes = (size / fs_info->csum_size) * fs_info->sectorsize;
+ struct btrfs_key key;
+
+ btrfs_item_key_to_cpu(eb, &key, i);
+ pr_info("\t\trange start %llu end %llu length %u\n",
+ key.offset, key.offset + csum_bytes, csum_bytes);
+}
+
+static void print_file_extent_item(const struct extent_buffer *eb, int i)
+{
+ struct btrfs_file_extent_item *fi;
+
+ fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+ pr_info("\t\tgeneration %llu type %hhu\n",
+ btrfs_file_extent_generation(eb, fi),
+ btrfs_file_extent_type(eb, fi));
+
+ if (btrfs_file_extent_type(eb, fi) == BTRFS_FILE_EXTENT_INLINE) {
+ pr_info("\t\tinline extent data size %u ram_bytes %llu compression %hhu\n",
+ btrfs_file_extent_inline_item_len(eb, i),
+ btrfs_file_extent_ram_bytes(eb, fi),
+ btrfs_file_extent_compression(eb, fi));
+ return;
+ }
+
+ pr_info("\t\textent data disk bytenr %llu nr %llu\n",
+ btrfs_file_extent_disk_bytenr(eb, fi),
+ btrfs_file_extent_disk_num_bytes(eb, fi));
+ pr_info("\t\textent data offset %llu nr %llu ram %llu\n",
+ btrfs_file_extent_offset(eb, fi),
+ btrfs_file_extent_num_bytes(eb, fi),
+ btrfs_file_extent_ram_bytes(eb, fi));
+ pr_info("\t\textent compression %hhu\n",
+ btrfs_file_extent_compression(eb, fi));
+}
+
+static void key_type_string(const struct btrfs_key *key, char *buf, int buf_size)
+{
+ static const char *key_to_str[256] = {
+ [BTRFS_INODE_ITEM_KEY] = "INODE_ITEM",
+ [BTRFS_INODE_REF_KEY] = "INODE_REF",
+ [BTRFS_INODE_EXTREF_KEY] = "INODE_EXTREF",
+ [BTRFS_DIR_ITEM_KEY] = "DIR_ITEM",
+ [BTRFS_DIR_INDEX_KEY] = "DIR_INDEX",
+ [BTRFS_DIR_LOG_ITEM_KEY] = "DIR_LOG_ITEM",
+ [BTRFS_DIR_LOG_INDEX_KEY] = "DIR_LOG_INDEX",
+ [BTRFS_XATTR_ITEM_KEY] = "XATTR_ITEM",
+ [BTRFS_VERITY_DESC_ITEM_KEY] = "VERITY_DESC_ITEM",
+ [BTRFS_VERITY_MERKLE_ITEM_KEY] = "VERITY_MERKLE_ITEM",
+ [BTRFS_ORPHAN_ITEM_KEY] = "ORPHAN_ITEM",
+ [BTRFS_ROOT_ITEM_KEY] = "ROOT_ITEM",
+ [BTRFS_ROOT_REF_KEY] = "ROOT_REF",
+ [BTRFS_ROOT_BACKREF_KEY] = "ROOT_BACKREF",
+ [BTRFS_EXTENT_ITEM_KEY] = "EXTENT_ITEM",
+ [BTRFS_METADATA_ITEM_KEY] = "METADATA_ITEM",
+ [BTRFS_TREE_BLOCK_REF_KEY] = "TREE_BLOCK_REF",
+ [BTRFS_SHARED_BLOCK_REF_KEY] = "SHARED_BLOCK_REF",
+ [BTRFS_EXTENT_DATA_REF_KEY] = "EXTENT_DATA_REF",
+ [BTRFS_SHARED_DATA_REF_KEY] = "SHARED_DATA_REF",
+ [BTRFS_EXTENT_OWNER_REF_KEY] = "EXTENT_OWNER_REF",
+ [BTRFS_EXTENT_CSUM_KEY] = "EXTENT_CSUM",
+ [BTRFS_EXTENT_DATA_KEY] = "EXTENT_DATA",
+ [BTRFS_BLOCK_GROUP_ITEM_KEY] = "BLOCK_GROUP_ITEM",
+ [BTRFS_FREE_SPACE_INFO_KEY] = "FREE_SPACE_INFO",
+ [BTRFS_FREE_SPACE_EXTENT_KEY] = "FREE_SPACE_EXTENT",
+ [BTRFS_FREE_SPACE_BITMAP_KEY] = "FREE_SPACE_BITMAP",
+ [BTRFS_CHUNK_ITEM_KEY] = "CHUNK_ITEM",
+ [BTRFS_DEV_ITEM_KEY] = "DEV_ITEM",
+ [BTRFS_DEV_EXTENT_KEY] = "DEV_EXTENT",
+ [BTRFS_TEMPORARY_ITEM_KEY] = "TEMPORARY_ITEM",
+ [BTRFS_DEV_REPLACE_KEY] = "DEV_REPLACE",
+ [BTRFS_STRING_ITEM_KEY] = "STRING_ITEM",
+ [BTRFS_QGROUP_STATUS_KEY] = "QGROUP_STATUS",
+ [BTRFS_QGROUP_RELATION_KEY] = "QGROUP_RELATION",
+ [BTRFS_QGROUP_INFO_KEY] = "QGROUP_INFO",
+ [BTRFS_QGROUP_LIMIT_KEY] = "QGROUP_LIMIT",
+ [BTRFS_PERSISTENT_ITEM_KEY] = "PERSISTENT_ITEM",
+ [BTRFS_UUID_KEY_SUBVOL] = "UUID_KEY_SUBVOL",
+ [BTRFS_UUID_KEY_RECEIVED_SUBVOL] = "UUID_KEY_RECEIVED_SUBVOL",
+ [BTRFS_RAID_STRIPE_KEY] = "RAID_STRIPE",
+ };
+
+ if (key->type == 0 && key->objectid == BTRFS_FREE_SPACE_OBJECTID)
+ scnprintf(buf, buf_size, "UNTYPED");
+ else if (key_to_str[key->type])
+ scnprintf(buf, buf_size, "%s", key_to_str[key->type]);
+ else
+ scnprintf(buf, buf_size, "UNKNOWN.%d", key->type);
+}
+
void btrfs_print_leaf(const struct extent_buffer *l)
{
struct btrfs_fs_info *fs_info;
int i;
u32 type, nr;
struct btrfs_root_item *ri;
- struct btrfs_dir_item *di;
- struct btrfs_inode_item *ii;
struct btrfs_block_group_item *bi;
- struct btrfs_file_extent_item *fi;
struct btrfs_extent_data_ref *dref;
struct btrfs_shared_data_ref *sref;
struct btrfs_dev_extent *dev_extent;
struct btrfs_key key;
- struct btrfs_key found_key;
if (!l)
return;
@@ -255,25 +450,35 @@ void btrfs_print_leaf(const struct extent_buffer *l)
btrfs_leaf_free_space(l), btrfs_header_owner(l));
print_eb_refs_lock(l);
for (i = 0 ; i < nr ; i++) {
+ char key_buf[KEY_TYPE_BUF_SIZE];
+
btrfs_item_key_to_cpu(l, &key, i);
type = key.type;
- pr_info("\titem %d key (%llu %u %llu) itemoff %d itemsize %d\n",
- i, key.objectid, type, key.offset,
+ key_type_string(&key, key_buf, KEY_TYPE_BUF_SIZE);
+
+ pr_info("\titem %d key (%llu %s %llu) itemoff %d itemsize %d\n",
+ i, key.objectid, key_buf, key.offset,
btrfs_item_offset(l, i), btrfs_item_size(l, i));
switch (type) {
case BTRFS_INODE_ITEM_KEY:
- ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
- pr_info("\t\tinode generation %llu size %llu mode %o\n",
- btrfs_inode_generation(l, ii),
- btrfs_inode_size(l, ii),
- btrfs_inode_mode(l, ii));
+ print_inode_item(l, i);
+ break;
+ case BTRFS_INODE_REF_KEY:
+ print_inode_ref_item(l, i);
+ break;
+ case BTRFS_INODE_EXTREF_KEY:
+ print_inode_extref_item(l, i);
break;
case BTRFS_DIR_ITEM_KEY:
- di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
- btrfs_dir_item_key_to_cpu(l, di, &found_key);
- pr_info("\t\tdir oid %llu flags %u\n",
- found_key.objectid,
- btrfs_dir_flags(l, di));
+ case BTRFS_DIR_INDEX_KEY:
+ case BTRFS_XATTR_ITEM_KEY:
+ print_dir_item(l, i);
+ break;
+ case BTRFS_DIR_LOG_INDEX_KEY:
+ print_dir_log_index_item(l, i);
+ break;
+ case BTRFS_EXTENT_CSUM_KEY:
+ print_extent_csum(l, i);
break;
case BTRFS_ROOT_ITEM_KEY:
ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
@@ -303,24 +508,7 @@ void btrfs_print_leaf(const struct extent_buffer *l)
btrfs_shared_data_ref_count(l, sref));
break;
case BTRFS_EXTENT_DATA_KEY:
- fi = btrfs_item_ptr(l, i,
- struct btrfs_file_extent_item);
- pr_info("\t\tgeneration %llu type %hhu\n",
- btrfs_file_extent_generation(l, fi),
- btrfs_file_extent_type(l, fi));
- if (btrfs_file_extent_type(l, fi) ==
- BTRFS_FILE_EXTENT_INLINE) {
- pr_info("\t\tinline extent data size %llu\n",
- btrfs_file_extent_ram_bytes(l, fi));
- break;
- }
- pr_info("\t\textent data disk bytenr %llu nr %llu\n",
- btrfs_file_extent_disk_bytenr(l, fi),
- btrfs_file_extent_disk_num_bytes(l, fi));
- pr_info("\t\textent data offset %llu nr %llu ram %llu\n",
- btrfs_file_extent_offset(l, fi),
- btrfs_file_extent_num_bytes(l, fi),
- btrfs_file_extent_ram_bytes(l, fi));
+ print_file_extent_item(l, i);
break;
case BTRFS_BLOCK_GROUP_ITEM_KEY:
bi = btrfs_item_ptr(l, i,
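With key_type_string() in place, a leaf dump line that used to print the raw type number, e.g. "item 0 key (256 1 0) itemoff 16123 itemsize 160", now reads "item 0 key (256 INODE_ITEM 0) itemoff 16123 itemsize 160" (values illustrative), and is followed by the per-type detail added above, such as the generation/transid/size/nbytes lines from print_inode_item().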
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 1a5972178b3a..1175b8192cd7 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1069,7 +1069,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
}
path = btrfs_alloc_path();
- if (!path) {
+ if (unlikely(!path)) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out_free_root;
@@ -1081,7 +1081,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
sizeof(*ptr));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -1111,7 +1111,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
if (ret > 0)
goto out_add_root;
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -1129,7 +1129,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
/* We should not have a stray @prealloc pointer. */
ASSERT(prealloc == NULL);
prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
- if (!prealloc) {
+ if (unlikely(!prealloc)) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out_free_path;
@@ -1137,7 +1137,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
ret = add_qgroup_item(trans, quota_root,
found_key.offset);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -1145,13 +1145,13 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset);
prealloc = NULL;
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
ret = btrfs_search_slot_for_read(tree_root, &found_key,
path, 1, 0);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -1165,7 +1165,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
}
}
ret = btrfs_next_item(tree_root, path);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -1176,7 +1176,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
out_add_root:
btrfs_release_path(path);
ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -1190,7 +1190,7 @@ out_add_root:
qgroup = add_qgroup_rb(fs_info, prealloc, BTRFS_FS_TREE_OBJECTID);
prealloc = NULL;
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -1376,13 +1376,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
btrfs_free_qgroup_config(fs_info);
ret = btrfs_clean_quota_tree(trans, quota_root);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = btrfs_del_root(trans, &quota_root->root_key);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1453,9 +1453,9 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
struct btrfs_qgroup *src, int sign)
{
struct btrfs_qgroup *qgroup;
- struct btrfs_qgroup *cur;
LIST_HEAD(qgroup_list);
u64 num_bytes = src->excl;
+ u64 num_bytes_cmpr = src->excl_cmpr;
int ret = 0;
qgroup = find_qgroup_rb(fs_info, ref_root);
@@ -1463,15 +1463,16 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
goto out;
qgroup_iterator_add(&qgroup_list, qgroup);
- list_for_each_entry(cur, &qgroup_list, iterator) {
+ list_for_each_entry(qgroup, &qgroup_list, iterator) {
struct btrfs_qgroup_list *glist;
qgroup->rfer += sign * num_bytes;
- qgroup->rfer_cmpr += sign * num_bytes;
+ qgroup->rfer_cmpr += sign * num_bytes_cmpr;
WARN_ON(sign < 0 && qgroup->excl < num_bytes);
+ WARN_ON(sign < 0 && qgroup->excl_cmpr < num_bytes_cmpr);
qgroup->excl += sign * num_bytes;
- qgroup->excl_cmpr += sign * num_bytes;
+ qgroup->excl_cmpr += sign * num_bytes_cmpr;
if (sign > 0)
qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
@@ -2425,9 +2426,9 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
int i;
/* Level sanity check */
- if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
- root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
- root_level < cur_level) {
+ if (unlikely(cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
+ root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
+ root_level < cur_level)) {
btrfs_err_rl(fs_info,
"%s: bad levels, cur_level=%d root_level=%d",
__func__, cur_level, root_level);
@@ -2443,7 +2444,7 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
* dst_path->nodes[root_level] must be initialized before
* calling this function.
*/
- if (cur_level == root_level) {
+ if (unlikely(cur_level == root_level)) {
btrfs_err_rl(fs_info,
"%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
__func__, root_level, root_level, cur_level);
@@ -2529,7 +2530,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
return 0;
/* Wrong parameter order */
- if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
+ if (unlikely(btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb))) {
btrfs_err_rl(fs_info,
"%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
btrfs_header_generation(src_eb),
@@ -2537,7 +2538,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
return -EUCLEAN;
}
- if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
+ if (unlikely(!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb))) {
ret = -EIO;
goto out;
}
@@ -2728,7 +2729,7 @@ static void qgroup_iterator_nested_clean(struct list_head *head)
*/
static void qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
struct ulist *roots, struct list_head *qgroups,
- u64 seq, int update_old)
+ u64 seq, bool update_old)
{
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -4709,8 +4710,8 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root,
if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
- if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
- btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
+ if (unlikely(btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
+ btrfs_node_ptr_generation(reloc_parent, reloc_slot))) {
btrfs_err_rl(fs_info,
"%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
__func__,
@@ -4842,7 +4843,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
reloc_eb = NULL;
goto free_out;
}
- if (!extent_buffer_uptodate(reloc_eb)) {
+ if (unlikely(!extent_buffer_uptodate(reloc_eb))) {
ret = -EIO;
goto free_out;
}
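The __qgroup_excl_accounting() change above stops mixing compressed and uncompressed counters. A quick worked example with illustrative numbers: if the source qgroup has excl = 1 MiB but excl_cmpr = 256 KiB, attaching it to a parent must now add 1 MiB to the parent's rfer/excl and 256 KiB to its rfer_cmpr/excl_cmpr, whereas the previous code added the 1 MiB figure to all four counters.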
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index cab0b291088c..cc6f6095cc9f 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -67,7 +67,7 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *stripe_root = fs_info->stripe_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct extent_buffer *leaf;
u64 found_start;
@@ -260,7 +260,6 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
btrfs_release_path(path);
}
- btrfs_free_path(path);
return ret;
}
@@ -269,7 +268,7 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans,
struct btrfs_stripe_extent *stripe_extent,
const size_t item_size)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
int ret;
int slot;
@@ -288,7 +287,6 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans,
write_extent_buffer(leaf, stripe_extent, btrfs_item_ptr_offset(leaf, slot),
item_size);
- btrfs_free_path(path);
return ret;
}
@@ -306,7 +304,7 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
int ret;
stripe_extent = kzalloc(item_size, GFP_NOFS);
- if (!stripe_extent) {
+ if (unlikely(!stripe_extent)) {
btrfs_abort_transaction(trans, -ENOMEM);
btrfs_end_transaction(trans);
return -ENOMEM;
@@ -376,7 +374,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
struct btrfs_stripe_extent *stripe_extent;
struct btrfs_key stripe_key;
struct btrfs_key found_key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
const u64 end = logical + *length;
int num_stripes;
@@ -402,7 +400,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
ret = btrfs_search_slot(NULL, stripe_root, &stripe_key, path, 0, 0);
if (ret < 0)
- goto free_path;
+ return ret;
if (ret) {
if (path->slots[0] != 0)
path->slots[0]--;
@@ -459,8 +457,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
trace_btrfs_get_raid_extent_offset(fs_info, logical, *length,
stripe->physical, devid);
- ret = 0;
- goto free_path;
+ return 0;
}
/* If we're here, we haven't found the requested devid in the stripe. */
@@ -474,8 +471,6 @@ out:
logical, logical + *length, stripe->dev->devid,
btrfs_bg_type_to_raid_name(map_type));
}
-free_path:
- btrfs_free_path(path);
return ret;
}
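Several of the conversions in this patch switch to BTRFS_PATH_AUTO_FREE(). A minimal sketch of the pattern (lookup_example() is hypothetical; the assumption is that the macro declares a NULL-initialized path pointer with a __free(btrfs_free_path) cleanup attribute, so every return releases it):

static int lookup_example(struct btrfs_root *root, const struct btrfs_key *key)
{
	BTRFS_PATH_AUTO_FREE(path);
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;	/* path is freed automatically */

	/* ... read items from path->nodes[0] ... */
	return 0;		/* freed here as well */
}

This is why the explicit btrfs_free_path() calls and the free_path/out labels disappear in update_raid_extent_item(), btrfs_get_raid_extent_offset() and the later relocation.c and root-tree.c hunks.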
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 3ff2bedfb3a4..0135dceb7baa 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1167,7 +1167,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
/* Check if we have reached tolerance early. */
found_errors = get_rbio_veritical_errors(rbio, sector_nr,
NULL, NULL);
- if (found_errors > rbio->bioc->max_errors)
+ if (unlikely(found_errors > rbio->bioc->max_errors))
return -EIO;
return 0;
}
@@ -1208,17 +1208,16 @@ static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
const u32 sectorsize_bits = rbio->bioc->fs_info->sectorsize_bits;
struct bvec_iter iter = bio->bi_iter;
+ phys_addr_t paddr;
u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
rbio->bioc->full_stripe_logical;
- while (iter.bi_size) {
+ btrfs_bio_for_each_block(paddr, bio, &iter, sectorsize) {
unsigned int index = (offset >> sectorsize_bits);
struct sector_ptr *sector = &rbio->bio_sectors[index];
- struct bio_vec bv = bio_iter_iovec(bio, iter);
sector->has_paddr = true;
- sector->paddr = bvec_phys(&bv);
- bio_advance_iter_single(bio, &iter, sectorsize);
+ sector->paddr = paddr;
offset += sectorsize;
}
}
@@ -1511,22 +1510,17 @@ static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
*/
static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
{
- const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ const u32 blocksize = rbio->bioc->fs_info->sectorsize;
+ phys_addr_t paddr;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, iter_all) {
- struct sector_ptr *sector;
- phys_addr_t paddr = bvec_phys(bvec);
+ btrfs_bio_for_each_block_all(paddr, bio, blocksize) {
+ struct sector_ptr *sector = find_stripe_sector(rbio, paddr);
- for (u32 off = 0; off < bvec->bv_len; off += sectorsize) {
- sector = find_stripe_sector(rbio, paddr + off);
- ASSERT(sector);
- if (sector)
- sector->uptodate = 1;
- }
+ ASSERT(sector);
+ if (sector)
+ sector->uptodate = 1;
}
}
@@ -1573,8 +1567,7 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
{
struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
int total_sector_nr = get_bio_sector_nr(rbio, bio);
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ phys_addr_t paddr;
/* No data csum for the whole stripe, no need to verify. */
if (!rbio->csum_bitmap || !rbio->csum_buf)
@@ -1584,27 +1577,20 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors)
return;
- bio_for_each_segment_all(bvec, bio, iter_all) {
- void *kaddr;
-
- kaddr = bvec_kmap_local(bvec);
- for (u32 off = 0; off < bvec->bv_len;
- off += fs_info->sectorsize, total_sector_nr++) {
- u8 csum_buf[BTRFS_CSUM_SIZE];
- u8 *expected_csum = rbio->csum_buf +
- total_sector_nr * fs_info->csum_size;
- int ret;
+ btrfs_bio_for_each_block_all(paddr, bio, fs_info->sectorsize) {
+ u8 csum_buf[BTRFS_CSUM_SIZE];
+ u8 *expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size;
+ int ret;
- /* No csum for this sector, skip to the next sector. */
- if (!test_bit(total_sector_nr, rbio->csum_bitmap))
- continue;
+ /* No csum for this sector, skip to the next sector. */
+ if (!test_bit(total_sector_nr, rbio->csum_bitmap)) {
+ total_sector_nr++;
+ continue;
+ }
- ret = btrfs_check_sector_csum(fs_info, kaddr + off,
- csum_buf, expected_csum);
- if (ret < 0)
- set_bit(total_sector_nr, rbio->error_bitmap);
- }
- kunmap_local(kaddr);
+ ret = btrfs_check_block_csum(fs_info, paddr,
+ csum_buf, expected_csum);
+ if (ret < 0)
+ set_bit(total_sector_nr, rbio->error_bitmap);
+ total_sector_nr++;
}
}
@@ -1802,7 +1788,6 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio,
struct sector_ptr *sector;
u8 csum_buf[BTRFS_CSUM_SIZE];
u8 *csum_expected;
- void *kaddr;
int ret;
if (!rbio->csum_bitmap || !rbio->csum_buf)
@@ -1824,9 +1809,7 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio,
csum_expected = rbio->csum_buf +
(stripe_nr * rbio->stripe_nsectors + sector_nr) *
fs_info->csum_size;
- kaddr = kmap_local_sector(sector);
- ret = btrfs_check_sector_csum(fs_info, kaddr, csum_buf, csum_expected);
- kunmap_local(kaddr);
+ ret = btrfs_check_block_csum(fs_info, sector->paddr, csum_buf, csum_expected);
return ret;
}
@@ -1864,7 +1847,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
if (!found_errors)
return 0;
- if (found_errors > rbio->bioc->max_errors)
+ if (unlikely(found_errors > rbio->bioc->max_errors))
return -EIO;
/*
@@ -2416,7 +2399,7 @@ static void rmw_rbio(struct btrfs_raid_bio *rbio)
int found_errors;
found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL);
- if (found_errors > rbio->bioc->max_errors) {
+ if (unlikely(found_errors > rbio->bioc->max_errors)) {
ret = -EIO;
break;
}
@@ -2705,7 +2688,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
found_errors = get_rbio_veritical_errors(rbio, sector_nr,
&faila, &failb);
- if (found_errors > rbio->bioc->max_errors) {
+ if (unlikely(found_errors > rbio->bioc->max_errors)) {
ret = -EIO;
goto out;
}
@@ -2729,7 +2712,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
* data, so the capability of the repair is declined. (In the
* case of RAID5, we can not repair anything.)
*/
- if (dfail > rbio->bioc->max_errors - 1) {
+ if (unlikely(dfail > rbio->bioc->max_errors - 1)) {
ret = -EIO;
goto out;
}
@@ -2746,7 +2729,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
* scrubbing parity, luckily, use the other one to repair the
* data, or we can not repair the data stripe.
*/
- if (failp != rbio->scrubp) {
+ if (unlikely(failp != rbio->scrubp)) {
ret = -EIO;
goto out;
}
@@ -2837,7 +2820,7 @@ static void scrub_rbio(struct btrfs_raid_bio *rbio)
int found_errors;
found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL);
- if (found_errors > rbio->bioc->max_errors) {
+ if (unlikely(found_errors > rbio->bioc->max_errors)) {
ret = -EIO;
break;
}
@@ -2861,19 +2844,22 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
* This is for scrub call sites where we already have correct data contents.
* This allows us to avoid reading data stripes again.
*
- * Unfortunately here we have to do page copy, other than reusing the pages.
+ * Unfortunately here we have to do a folio copy, rather than reusing the pages.
* This is due to the fact rbio has its own page management for its cache.
*/
-void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio,
- struct page **data_pages, u64 data_logical)
+void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio,
+ struct folio **data_folios, u64 data_logical)
{
+ struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
const u64 offset_in_full_stripe = data_logical -
rbio->bioc->full_stripe_logical;
- const int page_index = offset_in_full_stripe >> PAGE_SHIFT;
- const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
- const u32 sectors_per_page = PAGE_SIZE / sectorsize;
+ unsigned int findex = 0;
+ unsigned int foffset = 0;
int ret;
+ /* We shouldn't hit RAID56 for bs > ps cases for now. */
+ ASSERT(fs_info->sectorsize <= PAGE_SIZE);
+
/*
* If we hit ENOMEM temporarily, but later at
* raid56_parity_submit_scrub_rbio() time it succeeded, we just do
@@ -2890,14 +2876,25 @@ void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio,
ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN));
ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT));
- for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) {
- struct page *dst = rbio->stripe_pages[page_nr + page_index];
- struct page *src = data_pages[page_nr];
+ for (unsigned int cur_off = offset_in_full_stripe;
+ cur_off < offset_in_full_stripe + BTRFS_STRIPE_LEN;
+ cur_off += PAGE_SIZE) {
+ const unsigned int pindex = cur_off >> PAGE_SHIFT;
+ void *kaddr;
+
+ kaddr = kmap_local_page(rbio->stripe_pages[pindex]);
+ memcpy_from_folio(kaddr, data_folios[findex], foffset, PAGE_SIZE);
+ kunmap_local(kaddr);
- memcpy_page(dst, 0, src, 0, PAGE_SIZE);
- for (int sector_nr = sectors_per_page * page_index;
- sector_nr < sectors_per_page * (page_index + 1);
- sector_nr++)
- rbio->stripe_sectors[sector_nr].uptodate = true;
+ foffset += PAGE_SIZE;
+ ASSERT(foffset <= folio_size(data_folios[findex]));
+ if (foffset == folio_size(data_folios[findex])) {
+ findex++;
+ foffset = 0;
+ }
}
+ for (unsigned int sector_nr = offset_in_full_stripe >> fs_info->sectorsize_bits;
+ sector_nr < (offset_in_full_stripe + BTRFS_STRIPE_LEN) >> fs_info->sectorsize_bits;
+ sector_nr++)
+ rbio->stripe_sectors[sector_nr].uptodate = true;
}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 0d7b4c2fb6ae..84c4d1d29c7a 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -201,8 +201,8 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
unsigned long *dbitmap, int stripe_nsectors);
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
-void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio,
- struct page **data_pages, u64 data_logical);
+void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio,
+ struct folio **data_folios, u64 data_logical);
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 3871c3a6c743..de4cb0f3fbd0 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -971,7 +971,7 @@ void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start,
int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *extent_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *eb;
int tree_block_level = 0;
u64 bytenr = 0, num_bytes = 0;
@@ -980,11 +980,18 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
if (!btrfs_test_opt(fs_info, REF_VERIFY))
return 0;
+ extent_root = btrfs_extent_root(fs_info, 0);
+ /* If the extent tree is damaged we cannot ignore it (IGNOREBADROOTS). */
+ if (IS_ERR(extent_root)) {
+ btrfs_warn(fs_info, "ref-verify: extent tree not available, disabling");
+ btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
+ return 0;
+ }
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- extent_root = btrfs_extent_root(fs_info, 0);
eb = btrfs_read_lock_root_node(extent_root);
level = btrfs_header_level(eb);
path->nodes[level] = eb;
@@ -1014,6 +1021,5 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
btrfs_free_ref_cache(fs_info);
btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
}
- btrfs_free_path(path);
return ret;
}
diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h
index 559bd25a2b7a..1ce544d53cc5 100644
--- a/fs/btrfs/ref-verify.h
+++ b/fs/btrfs/ref-verify.h
@@ -12,7 +12,7 @@
struct btrfs_fs_info;
struct btrfs_ref;
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+#ifdef CONFIG_BTRFS_DEBUG
#include <linux/spinlock.h>
@@ -53,6 +53,6 @@ static inline void btrfs_init_ref_verify(struct btrfs_fs_info *fs_info)
{
}
-#endif /* CONFIG_BTRFS_FS_REF_VERIFY */
+#endif /* CONFIG_BTRFS_DEBUG */
#endif
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index ce25ab7f0e99..5465a5eae9b2 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -23,7 +23,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
u64 endoff,
const u64 destoff,
const u64 olen,
- int no_time_update)
+ bool no_time_update)
{
int ret;
@@ -43,7 +43,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
}
ret = btrfs_update_inode(trans, BTRFS_I(inode));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
@@ -268,12 +268,12 @@ copy_inline_extent:
drop_args.end = aligned_end;
drop_args.drop_cache = true;
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -285,7 +285,7 @@ copy_inline_extent:
btrfs_update_inode_bytes(inode, datal, drop_args.bytes_found);
btrfs_set_inode_full_sync(inode);
ret = btrfs_inode_set_file_extent_range(inode, 0, aligned_end);
- if (ret)
+ if (unlikely(ret))
btrfs_abort_transaction(trans, ret);
out:
if (!ret && !trans) {
@@ -337,10 +337,10 @@ copy_to_page:
*/
static int btrfs_clone(struct inode *src, struct inode *inode,
const u64 off, const u64 olen, const u64 olen_aligned,
- const u64 destoff, int no_time_update)
+ const u64 destoff, bool no_time_update)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_trans_handle *trans;
char *buf = NULL;
@@ -611,7 +611,6 @@ process_slot:
}
out:
- btrfs_free_path(path);
kvfree(buf);
clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index e58151933844..8dd8de6b9fb8 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -602,6 +602,25 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
if (btrfs_root_id(root) == objectid) {
u64 commit_root_gen;
+ /*
+ * Relocation will wait for the cleaner thread, and any half-dropped
+ * subvolume will be fully cleaned up at mount time.
+ * So here we shouldn't hit a subvolume with non-zero drop_progress.
+ *
+ * If this isn't the case, error out since it can make us attempt to
+ * drop references for extents that were already dropped before.
+ */
+ if (unlikely(btrfs_disk_key_objectid(&root->root_item.drop_progress))) {
+ struct btrfs_key cpu_key;
+
+ btrfs_disk_key_to_cpu(&cpu_key, &root->root_item.drop_progress);
+ btrfs_err(fs_info,
+ "cannot relocate partially dropped subvolume %llu, drop progress key (%llu %u %llu)",
+ objectid, cpu_key.objectid, cpu_key.type, cpu_key.offset);
+ ret = -EUCLEAN;
+ goto fail;
+ }
+
/* called by btrfs_init_reloc_root */
ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
BTRFS_TREE_RELOC_OBJECTID);
@@ -802,7 +821,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
u64 bytenr, u64 num_bytes)
{
struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
int ret;
@@ -815,11 +834,9 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
ret = btrfs_lookup_file_extent(NULL, root, path,
btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
if (ret < 0)
- goto out;
- if (ret > 0) {
- ret = -ENOENT;
- goto out;
- }
+ return ret;
+ if (ret > 0)
+ return -ENOENT;
leaf = path->nodes[0];
fi = btrfs_item_ptr(leaf, path->slots[0],
@@ -830,16 +847,11 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
btrfs_file_extent_encryption(leaf, fi) ||
btrfs_file_extent_other_encoding(leaf, fi));
- if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
- ret = -EINVAL;
- goto out;
- }
+ if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi))
+ return -EINVAL;
*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
- ret = 0;
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
/*
@@ -955,7 +967,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
btrfs_init_data_ref(&ref, key.objectid, key.offset,
btrfs_root_id(root), false);
ret = btrfs_inc_extent_ref(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -969,7 +981,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
btrfs_init_data_ref(&ref, key.objectid, key.offset,
btrfs_root_id(root), false);
ret = btrfs_free_extent(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -1180,7 +1192,7 @@ again:
ref.ref_root = btrfs_root_id(src);
btrfs_init_tree_ref(&ref, level - 1, 0, true);
ret = btrfs_inc_extent_ref(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -1193,7 +1205,7 @@ again:
ref.ref_root = btrfs_root_id(dest);
btrfs_init_tree_ref(&ref, level - 1, 0, true);
ret = btrfs_inc_extent_ref(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -1207,7 +1219,7 @@ again:
ref.ref_root = btrfs_root_id(src);
btrfs_init_tree_ref(&ref, level - 1, 0, true);
ret = btrfs_free_extent(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -1221,7 +1233,7 @@ again:
ref.ref_root = btrfs_root_id(dest);
btrfs_init_tree_ref(&ref, level - 1, 0, true);
ret = btrfs_free_extent(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -1471,7 +1483,7 @@ static int clean_dirty_subvols(struct reloc_control *rc)
* ->reloc_root. If it fails however we must
* drop the ref ourselves.
*/
- ret2 = btrfs_drop_snapshot(reloc_root, 0, 1);
+ ret2 = btrfs_drop_snapshot(reloc_root, false, true);
if (ret2 < 0) {
btrfs_put_root(reloc_root);
if (!ret)
@@ -1481,7 +1493,7 @@ static int clean_dirty_subvols(struct reloc_control *rc)
btrfs_put_root(root);
} else {
/* Orphan reloc tree, just clean it up */
- ret2 = btrfs_drop_snapshot(root, 0, 1);
+ ret2 = btrfs_drop_snapshot(root, false, true);
if (ret2 < 0) {
btrfs_put_root(root);
if (!ret)
@@ -1772,7 +1784,7 @@ again:
list_add(&reloc_root->root_list, &reloc_roots);
btrfs_put_root(root);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
if (!err)
err = ret;
@@ -1941,7 +1953,7 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
DEBUG_WARN("error %ld reading root for reloc root", PTR_ERR(root));
return PTR_ERR(root);
}
- if (root->reloc_root != reloc_root) {
+ if (unlikely(root->reloc_root != reloc_root)) {
DEBUG_WARN("unexpected reloc root found");
btrfs_err(fs_info,
"root %llu has two reloc roots associated with it",
@@ -2012,7 +2024,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
if (!root)
return ERR_PTR(-ENOENT);
- if (next->new_bytenr) {
+ if (unlikely(next->new_bytenr)) {
/*
* We just created the reloc root, so we shouldn't have
* ->new_bytenr set yet. If it is then we have multiple roots
@@ -2071,7 +2083,7 @@ struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
* This can occur if we have incomplete extent refs leading all
* the way up a particular path, in this case return -EUCLEAN.
*/
- if (!root)
+ if (unlikely(!root))
return ERR_PTR(-EUCLEAN);
/* No other choice for non-shareable tree */
@@ -2258,7 +2270,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
bytenr = btrfs_node_blockptr(upper->eb, slot);
if (lowest) {
- if (bytenr != node->bytenr) {
+ if (unlikely(bytenr != node->bytenr)) {
btrfs_err(root->fs_info,
"lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
bytenr, node->bytenr, slot,
@@ -2313,7 +2325,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
if (!ret)
ret = btrfs_drop_subtree(trans, root, eb,
upper->eb);
- if (ret)
+ if (unlikely(ret))
btrfs_abort_transaction(trans, ret);
}
next:
@@ -2435,7 +2447,7 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info,
eb = read_tree_block(fs_info, block->bytenr, &check);
if (IS_ERR(eb))
return PTR_ERR(eb);
- if (!extent_buffer_uptodate(eb)) {
+ if (unlikely(!extent_buffer_uptodate(eb))) {
free_extent_buffer(eb);
return -EIO;
}
@@ -2500,7 +2512,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
* normal user in the case of corruption.
*/
ASSERT(node->new_bytenr == 0);
- if (node->new_bytenr) {
+ if (unlikely(node->new_bytenr)) {
btrfs_err(root->fs_info,
"bytenr %llu has improper references to it",
node->bytenr);
@@ -2820,7 +2832,7 @@ again:
if (!folio_test_uptodate(folio)) {
btrfs_read_folio(NULL, folio);
folio_lock(folio);
- if (!folio_test_uptodate(folio)) {
+ if (unlikely(!folio_test_uptodate(folio))) {
ret = -EIO;
goto release_folio;
}
@@ -3139,7 +3151,7 @@ static int __add_tree_block(struct reloc_control *rc,
struct rb_root *blocks)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
int ret;
bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
@@ -3167,7 +3179,7 @@ again:
path->skip_locking = 1;
ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret > 0 && skinny) {
if (path->slots[0]) {
@@ -3194,14 +3206,10 @@ again:
"tree block extent item (%llu) is not found in extent tree",
bytenr);
WARN_ON(1);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
- ret = add_tree_block(rc, &key, path, blocks);
-out:
- btrfs_free_path(path);
- return ret;
+ return add_tree_block(rc, &key, path, blocks);
}
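/*
 * Editor's sketch, not part of the patch: the BTRFS_PATH_AUTO_FREE()
 * conversions above rely on the scope-based cleanup helpers from
 * include/linux/cleanup.h. Assuming the usual DEFINE_FREE()/__free()
 * machinery, the declaration is roughly equivalent to:
 *
 *	DEFINE_FREE(btrfs_free_path, struct btrfs_path *, btrfs_free_path(_T))
 *	#define BTRFS_PATH_AUTO_FREE(path_name) \
 *		struct btrfs_path *path_name __free(btrfs_free_path) = NULL
 *
 * Every early "return" in a converted function then frees the path
 * automatically, which is why the "out:" labels and the explicit
 * btrfs_free_path() calls can be dropped.
 */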
static int delete_block_group_cache(struct btrfs_block_group *block_group,
@@ -3491,7 +3499,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
struct rb_root blocks = RB_ROOT;
struct btrfs_key key;
struct btrfs_trans_handle *trans = NULL;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_extent_item *ei;
u64 flags;
int ret;
@@ -3660,14 +3668,13 @@ out_free:
if (ret < 0 && !err)
err = ret;
btrfs_free_block_rsv(fs_info, rc->block_rsv);
- btrfs_free_path(path);
return err;
}
static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 objectid)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_inode_item *item;
struct extent_buffer *leaf;
int ret;
@@ -3678,7 +3685,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_inode(trans, root, path, objectid);
if (ret)
- goto out;
+ return ret;
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
@@ -3688,15 +3695,13 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
BTRFS_INODE_PREALLOC);
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
static void delete_orphan_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 objectid)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
int ret = 0;
@@ -3719,7 +3724,6 @@ static void delete_orphan_inode(struct btrfs_trans_handle *trans,
out:
if (ret)
btrfs_abort_transaction(trans, ret);
- btrfs_free_path(path);
}
/*
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index e22e6b06927a..d07eab70f759 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -85,7 +85,7 @@ int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key,
* Key with offset -1 found, there would have to exist a root
* with such id, but this is out of the valid range.
*/
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
ret = -EUCLEAN;
goto out;
}
@@ -130,7 +130,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
*item)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *l;
int ret;
int slot;
@@ -143,15 +143,15 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
if (ret < 0)
- goto out;
+ return ret;
- if (ret > 0) {
+ if (unlikely(ret > 0)) {
btrfs_crit(fs_info,
"unable to find root key (%llu %u %llu) in tree %llu",
key->objectid, key->type, key->offset, btrfs_root_id(root));
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
l = path->nodes[0];
@@ -168,22 +168,22 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_release_path(path);
ret = btrfs_search_slot(trans, root, key, path,
-1, 1);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
ret = btrfs_del_item(trans, root, path);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
btrfs_release_path(path);
ret = btrfs_insert_empty_item(trans, root, path,
key, sizeof(*item));
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
l = path->nodes[0];
slot = path->slots[0];
@@ -197,8 +197,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
write_extent_buffer(l, item, ptr, sizeof(*item));
-out:
- btrfs_free_path(path);
return ret;
}
@@ -216,7 +214,7 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *tree_root = fs_info->tree_root;
struct extent_buffer *leaf;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_root *root;
int err = 0;
@@ -309,7 +307,6 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
btrfs_put_root(root);
}
- btrfs_free_path(path);
return err;
}
@@ -318,7 +315,7 @@ int btrfs_del_root(struct btrfs_trans_handle *trans,
const struct btrfs_key *key)
{
struct btrfs_root *root = trans->fs_info->tree_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
path = btrfs_alloc_path();
@@ -326,17 +323,12 @@ int btrfs_del_root(struct btrfs_trans_handle *trans,
return -ENOMEM;
ret = btrfs_search_slot(trans, root, key, path, -1, 1);
if (ret < 0)
- goto out;
- if (ret != 0) {
+ return ret;
+ if (unlikely(ret > 0))
/* The root must exist but we did not find it by the key. */
- ret = -EUCLEAN;
- goto out;
- }
+ return -EUCLEAN;
- ret = btrfs_del_item(trans, root, path);
-out:
- btrfs_free_path(path);
- return ret;
+ return btrfs_del_item(trans, root, path);
}
int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
@@ -344,7 +336,7 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
const struct fscrypt_str *name)
{
struct btrfs_root *tree_root = trans->fs_info->tree_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root_ref *ref;
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -361,7 +353,7 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
again:
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
if (ret < 0) {
- goto out;
+ return ret;
} else if (ret == 0) {
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0],
@@ -369,18 +361,16 @@ again:
ptr = (unsigned long)(ref + 1);
if ((btrfs_root_ref_dirid(leaf, ref) != dirid) ||
(btrfs_root_ref_name_len(leaf, ref) != name->len) ||
- memcmp_extent_buffer(leaf, name->name, ptr, name->len)) {
- ret = -ENOENT;
- goto out;
- }
+ memcmp_extent_buffer(leaf, name->name, ptr, name->len))
+ return -ENOENT;
+
*sequence = btrfs_root_ref_sequence(leaf, ref);
ret = btrfs_del_item(trans, tree_root, path);
if (ret)
- goto out;
+ return ret;
} else {
- ret = -ENOENT;
- goto out;
+ return -ENOENT;
}
if (key.type == BTRFS_ROOT_BACKREF_KEY) {
@@ -391,8 +381,6 @@ again:
goto again;
}
-out:
- btrfs_free_path(path);
return ret;
}
@@ -418,7 +406,7 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
struct btrfs_root *tree_root = trans->fs_info->tree_root;
struct btrfs_key key;
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root_ref *ref;
struct extent_buffer *leaf;
unsigned long ptr;
@@ -433,9 +421,8 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
again:
ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
sizeof(*ref) + name->len);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
- btrfs_free_path(path);
return ret;
}
@@ -455,7 +442,6 @@ again:
goto again;
}
- btrfs_free_path(path);
return 0;
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 6776e6ab8d10..4691d0bdb2e8 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -113,7 +113,7 @@ enum {
/* Which blocks are covered by extent items. */
scrub_bitmap_nr_has_extent = 0,
- /* Which blocks are meteadata. */
+ /* Which blocks are metadata. */
scrub_bitmap_nr_is_metadata,
/*
@@ -130,7 +130,7 @@ enum {
scrub_bitmap_nr_last,
};
-#define SCRUB_STRIPE_PAGES (BTRFS_STRIPE_LEN / PAGE_SIZE)
+#define SCRUB_STRIPE_MAX_FOLIOS (BTRFS_STRIPE_LEN / PAGE_SIZE)
/*
* Represent one contiguous range with a length of BTRFS_STRIPE_LEN.
@@ -139,7 +139,7 @@ struct scrub_stripe {
struct scrub_ctx *sctx;
struct btrfs_block_group *bg;
- struct page *pages[SCRUB_STRIPE_PAGES];
+ struct folio *folios[SCRUB_STRIPE_MAX_FOLIOS];
struct scrub_sector_verification *sectors;
struct btrfs_device *dev;
@@ -206,7 +206,7 @@ struct scrub_ctx {
ktime_t throttle_deadline;
u64 throttle_sent;
- int is_dev_replace;
+ bool is_dev_replace;
u64 write_pointer;
struct mutex wr_lock;
@@ -339,10 +339,10 @@ static void release_scrub_stripe(struct scrub_stripe *stripe)
if (!stripe)
return;
- for (int i = 0; i < SCRUB_STRIPE_PAGES; i++) {
- if (stripe->pages[i])
- __free_page(stripe->pages[i]);
- stripe->pages[i] = NULL;
+ for (int i = 0; i < SCRUB_STRIPE_MAX_FOLIOS; i++) {
+ if (stripe->folios[i])
+ folio_put(stripe->folios[i]);
+ stripe->folios[i] = NULL;
}
kfree(stripe->sectors);
kfree(stripe->csums);
@@ -355,6 +355,7 @@ static void release_scrub_stripe(struct scrub_stripe *stripe)
static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
struct scrub_stripe *stripe)
{
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
int ret;
memset(stripe, 0, sizeof(*stripe));
@@ -367,7 +368,9 @@ static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
atomic_set(&stripe->pending_io, 0);
spin_lock_init(&stripe->write_error_lock);
- ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, false);
+ ASSERT(BTRFS_STRIPE_LEN >> min_folio_shift <= SCRUB_STRIPE_MAX_FOLIOS);
+ ret = btrfs_alloc_folio_array(BTRFS_STRIPE_LEN >> min_folio_shift,
+ fs_info->block_min_order, stripe->folios);
if (ret < 0)
goto error;
@@ -446,7 +449,7 @@ static void scrub_put_ctx(struct scrub_ctx *sctx)
}
static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
- struct btrfs_fs_info *fs_info, int is_dev_replace)
+ struct btrfs_fs_info *fs_info, bool is_dev_replace)
{
struct scrub_ctx *sctx;
int i;
@@ -585,7 +588,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
bool is_super, u64 logical, u64 physical)
{
struct btrfs_fs_info *fs_info = dev->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key found_key;
struct extent_buffer *eb;
struct btrfs_extent_item *ei;
@@ -612,7 +615,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
&flags);
if (ret < 0)
- goto out;
+ return;
swarn.extent_item_size = found_key.offset;
@@ -658,9 +661,6 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn);
}
-
-out:
- btrfs_free_path(path);
}
static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
@@ -687,13 +687,30 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
static void *scrub_stripe_get_kaddr(struct scrub_stripe *stripe, int sector_nr)
{
- u32 offset = (sector_nr << stripe->bg->fs_info->sectorsize_bits);
- const struct page *page = stripe->pages[offset >> PAGE_SHIFT];
+ struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
+ u32 offset = (sector_nr << fs_info->sectorsize_bits);
+ const struct folio *folio = stripe->folios[offset >> min_folio_shift];
- /* stripe->pages[] is allocated by us and no highmem is allowed. */
- ASSERT(page);
- ASSERT(!PageHighMem(page));
- return page_address(page) + offset_in_page(offset);
+ /* stripe->folios[] is allocated by us and no highmem is allowed. */
+ ASSERT(folio);
+ ASSERT(!folio_test_partial_kmap(folio));
+ return folio_address(folio) + offset_in_folio(folio, offset);
+}
+
+static phys_addr_t scrub_stripe_get_paddr(struct scrub_stripe *stripe, int sector_nr)
+{
+ struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
+ u32 offset = (sector_nr << fs_info->sectorsize_bits);
+ const struct folio *folio = stripe->folios[offset >> min_folio_shift];
+
+ /* stripe->folios[] is allocated by us and no highmem is allowed. */
+ ASSERT(folio);
+ ASSERT(!folio_test_partial_kmap(folio));
+ /* And the range must be contained inside the folio. */
+ ASSERT(offset_in_folio(folio, offset) + fs_info->sectorsize <= folio_size(folio));
+ return page_to_phys(folio_page(folio, 0)) + offset_in_folio(folio, offset);
}
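/*
 * Editor's sketch, not part of the patch: with the folio conversion a block is
 * located by shifting its byte offset down by min_folio_shift. Example values
 * only (4K pages, block_min_order = 2, i.e. 16K minimum folios, 4K blocks):
 */
static void locate_block_example(unsigned int sector_nr)
{
	const unsigned int sectorsize_bits = 12;		/* 4K blocks */
	const unsigned int min_folio_shift = 12 + 2;		/* 16K folios */
	unsigned int offset = sector_nr << sectorsize_bits;
	unsigned int folio_index = offset >> min_folio_shift;
	unsigned int off_in_folio = offset & ((1U << min_folio_shift) - 1);

	/* stripe->folios[folio_index] plus off_in_folio addresses the block */
	(void)folio_index;
	(void)off_in_folio;
}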
static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr)
@@ -788,7 +805,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
struct scrub_sector_verification *sector = &stripe->sectors[sector_nr];
const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
- void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr);
+ phys_addr_t paddr = scrub_stripe_get_paddr(stripe, sector_nr);
u8 csum_buf[BTRFS_CSUM_SIZE];
int ret;
@@ -833,7 +850,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
return;
}
- ret = btrfs_check_sector_csum(fs_info, kaddr, csum_buf, sector->csum);
+ ret = btrfs_check_block_csum(fs_info, paddr, csum_buf, sector->csum);
if (ret < 0) {
scrub_bitmap_set_bit_csum_error(stripe, sector_nr);
scrub_bitmap_set_bit_error(stripe, sector_nr);
@@ -1369,8 +1386,7 @@ static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *d
* Slice is divided into intervals when the IO is submitted, adjust by
* bwlimit and maximum of 64 intervals.
*/
- div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
- div = min_t(u32, 64, div);
+ div = clamp(bwlimit / (16 * 1024 * 1024), 1, 64);
/* Start new epoch, set deadline */
now = ktime_get();
@@ -1513,7 +1529,7 @@ static int find_first_extent_item(struct btrfs_root *extent_root,
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* Key with offset -1 found, there would have to exist an extent
* item with such offset, but this is out of the valid range.
@@ -1859,6 +1875,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_bio *bbio;
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits;
int mirror = stripe->mirror_num;
@@ -1871,7 +1888,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
return;
}
- bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
+ bbio = btrfs_bio_alloc(BTRFS_STRIPE_LEN >> min_folio_shift, REQ_OP_READ, fs_info,
scrub_read_endio, stripe);
bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT;
@@ -1970,7 +1987,7 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx)
* metadata, we should immediately abort.
*/
for (int i = 0; i < nr_stripes; i++) {
- if (stripe_has_metadata_error(&sctx->stripes[i])) {
+ if (unlikely(stripe_has_metadata_error(&sctx->stripes[i]))) {
ret = -EIO;
goto out;
}
@@ -2164,7 +2181,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
* As we may hit an empty data stripe while it's missing.
*/
bitmap_and(&error, &error, &has_extent, stripe->nr_sectors);
- if (!bitmap_empty(&error, stripe->nr_sectors)) {
+ if (unlikely(!bitmap_empty(&error, stripe->nr_sectors))) {
btrfs_err(fs_info,
"scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl",
full_stripe_start, i, stripe->nr_sectors,
@@ -2202,7 +2219,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
for (int i = 0; i < data_stripes; i++) {
stripe = &sctx->raid56_data_stripes[i];
- raid56_parity_cache_data_pages(rbio, stripe->pages,
+ raid56_parity_cache_data_folios(rbio, stripe->folios,
full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT));
}
raid56_parity_submit_scrub_rbio(rbio);
@@ -2586,7 +2603,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev, u64 start, u64 end)
{
struct btrfs_dev_extent *dev_extent = NULL;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_root *root = fs_info->dev_root;
u64 chunk_offset;
@@ -2858,8 +2875,8 @@ skip_unfreeze:
btrfs_put_block_group(cache);
if (ret)
break;
- if (sctx->is_dev_replace &&
- atomic64_read(&dev_replace->num_write_errors) > 0) {
+ if (unlikely(sctx->is_dev_replace &&
+ atomic64_read(&dev_replace->num_write_errors) > 0)) {
ret = -EIO;
break;
}
@@ -2872,8 +2889,6 @@ skip:
btrfs_release_path(path);
}
- btrfs_free_path(path);
-
return ret;
}
@@ -2889,13 +2904,13 @@ static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev,
if (ret < 0)
return ret;
ret = btrfs_check_super_csum(fs_info, sb);
- if (ret != 0) {
+ if (unlikely(ret != 0)) {
btrfs_err_rl(fs_info,
"scrub: super block at physical %llu devid %llu has bad csum",
physical, dev->devid);
return -EIO;
}
- if (btrfs_super_generation(sb) != generation) {
+ if (unlikely(btrfs_super_generation(sb) != generation)) {
btrfs_err_rl(fs_info,
"scrub: super block at physical %llu devid %llu has bad generation %llu expect %llu",
physical, dev->devid,
@@ -3013,7 +3028,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info)
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
u64 end, struct btrfs_scrub_progress *progress,
- int readonly, int is_dev_replace)
+ bool readonly, bool is_dev_replace)
{
struct btrfs_dev_lookup_args args = { .devid = devid };
struct scrub_ctx *sctx;
@@ -3065,8 +3080,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
}
mutex_lock(&fs_info->scrub_lock);
- if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
- test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
+ if (unlikely(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state))) {
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
ret = -EIO;
diff --git a/fs/btrfs/scrub.h b/fs/btrfs/scrub.h
index f0df597b75c7..aa68b6ebaf55 100644
--- a/fs/btrfs/scrub.h
+++ b/fs/btrfs/scrub.h
@@ -11,7 +11,7 @@ struct btrfs_scrub_progress;
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
u64 end, struct btrfs_scrub_progress *progress,
- int readonly, int is_dev_replace);
+ bool readonly, bool is_dev_replace);
void btrfs_scrub_pause(struct btrfs_fs_info *fs_info);
void btrfs_scrub_continue(struct btrfs_fs_info *fs_info);
int btrfs_scrub_cancel(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 7664025a5af4..9230e5066fc6 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -646,7 +646,7 @@ static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
ret = kernel_write(filp, buf + pos, len - pos, off);
if (ret < 0)
return ret;
- if (ret == 0)
+ if (unlikely(ret == 0))
return -EIO;
pos += ret;
}
@@ -909,7 +909,7 @@ static int get_inode_info(struct btrfs_root *root, u64 ino,
struct btrfs_inode_info *info)
{
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_inode_item *ii;
struct btrfs_key key;
@@ -924,11 +924,11 @@ static int get_inode_info(struct btrfs_root *root, u64 ino,
if (ret) {
if (ret > 0)
ret = -ENOENT;
- goto out;
+ return ret;
}
if (!info)
- goto out;
+ return 0;
ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
@@ -945,9 +945,7 @@ static int get_inode_info(struct btrfs_root *root, u64 ino,
*/
info->fileattr = btrfs_inode_flags(path->nodes[0], ii);
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen)
@@ -973,13 +971,13 @@ typedef int (*iterate_inode_ref_t)(u64 dir, struct fs_path *p, void *ctx);
* path must point to the INODE_REF or INODE_EXTREF when called.
*/
static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_key *found_key, int resolve,
+ struct btrfs_key *found_key, bool resolve,
iterate_inode_ref_t iterate, void *ctx)
{
struct extent_buffer *eb = path->nodes[0];
struct btrfs_inode_ref *iref;
struct btrfs_inode_extref *extref;
- struct btrfs_path *tmp_path;
+ BTRFS_PATH_AUTO_FREE(tmp_path);
struct fs_path *p;
u32 cur = 0;
u32 total;
@@ -1076,7 +1074,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
}
out:
- btrfs_free_path(tmp_path);
fs_path_free(p);
return ret;
}
@@ -1224,7 +1221,7 @@ static int get_inode_path(struct btrfs_root *root,
{
int ret;
struct btrfs_key key, found_key;
- struct btrfs_path *p;
+ BTRFS_PATH_AUTO_FREE(p);
p = alloc_path_for_send();
if (!p)
@@ -1238,28 +1235,20 @@ static int get_inode_path(struct btrfs_root *root,
ret = btrfs_search_slot_for_read(root, &key, p, 1, 0);
if (ret < 0)
- goto out;
- if (ret) {
- ret = 1;
- goto out;
- }
+ return ret;
+ if (ret)
+ return 1;
+
btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]);
if (found_key.objectid != ino ||
(found_key.type != BTRFS_INODE_REF_KEY &&
- found_key.type != BTRFS_INODE_EXTREF_KEY)) {
- ret = -ENOENT;
- goto out;
- }
+ found_key.type != BTRFS_INODE_EXTREF_KEY))
+ return -ENOENT;
- ret = iterate_inode_ref(root, p, &found_key, 1,
- __copy_first_ref, path);
+ ret = iterate_inode_ref(root, p, &found_key, true, __copy_first_ref, path);
if (ret < 0)
- goto out;
- ret = 0;
-
-out:
- btrfs_free_path(p);
- return ret;
+ return ret;
+ return 0;
}
struct backref_ctx {
@@ -1389,7 +1378,7 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx,
struct backref_ctx *bctx = ctx;
struct send_ctx *sctx = bctx->sctx;
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
- const u64 key = leaf_bytenr >> fs_info->sectorsize_bits;
+ const u64 key = leaf_bytenr >> fs_info->nodesize_bits;
struct btrfs_lru_cache_entry *raw_entry;
struct backref_cache_entry *entry;
@@ -1444,7 +1433,7 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids,
if (!new_entry)
return;
- new_entry->entry.key = leaf_bytenr >> fs_info->sectorsize_bits;
+ new_entry->entry.key = leaf_bytenr >> fs_info->nodesize_bits;
new_entry->entry.gen = 0;
new_entry->num_roots = 0;
ULIST_ITER_INIT(&uiter);
@@ -1716,7 +1705,7 @@ static int read_symlink(struct btrfs_root *root,
struct fs_path *dest)
{
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_file_extent_item *ei;
u8 type;
@@ -1733,21 +1722,20 @@ static int read_symlink(struct btrfs_root *root,
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- goto out;
- if (ret) {
+ return ret;
+ if (unlikely(ret)) {
/*
* An empty symlink inode. Can happen in rare error paths when
* creating a symlink (transaction committed before the inode
* eviction handler removed the symlink inode items and a crash
- * happened in between or the subvol was snapshoted in between).
+ * happened in between or the subvol was snapshotted in between).
* Print an informative message to dmesg/syslog so that the user
* can delete the symlink.
*/
btrfs_err(root->fs_info,
"Found empty symlink inode %llu at root %llu",
ino, btrfs_root_id(root));
- ret = -EIO;
- goto out;
+ return -EIO;
}
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -1758,7 +1746,7 @@ static int read_symlink(struct btrfs_root *root,
btrfs_crit(root->fs_info,
"send: found symlink extent that is not inline, ino %llu root %llu extent type %d",
ino, btrfs_root_id(root), type);
- goto out;
+ return ret;
}
compression = btrfs_file_extent_compression(path->nodes[0], ei);
if (unlikely(compression != BTRFS_COMPRESS_NONE)) {
@@ -1766,17 +1754,13 @@ static int read_symlink(struct btrfs_root *root,
btrfs_crit(root->fs_info,
"send: found symlink extent with compression, ino %llu root %llu compression type %d",
ino, btrfs_root_id(root), compression);
- goto out;
+ return ret;
}
off = btrfs_file_extent_inline_start(ei);
len = btrfs_file_extent_ram_bytes(path->nodes[0], ei);
- ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
-
-out:
- btrfs_free_path(path);
- return ret;
+ return fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
}
/*
@@ -1787,8 +1771,7 @@ static int gen_unique_name(struct send_ctx *sctx,
u64 ino, u64 gen,
struct fs_path *dest)
{
- int ret = 0;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_dir_item *di;
char tmp[64];
int len;
@@ -1811,10 +1794,9 @@ static int gen_unique_name(struct send_ctx *sctx,
path, BTRFS_FIRST_FREE_OBJECTID,
&tmp_name, 0);
btrfs_release_path(path);
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto out;
- }
+ if (IS_ERR(di))
+ return PTR_ERR(di);
+
if (di) {
/* not unique, try again */
idx++;
@@ -1823,7 +1805,6 @@ static int gen_unique_name(struct send_ctx *sctx,
if (!sctx->parent_root) {
/* unique */
- ret = 0;
break;
}
@@ -1831,10 +1812,9 @@ static int gen_unique_name(struct send_ctx *sctx,
path, BTRFS_FIRST_FREE_OBJECTID,
&tmp_name, 0);
btrfs_release_path(path);
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto out;
- }
+ if (IS_ERR(di))
+ return PTR_ERR(di);
+
if (di) {
/* not unique, try again */
idx++;
@@ -1844,11 +1824,7 @@ static int gen_unique_name(struct send_ctx *sctx,
break;
}
- ret = fs_path_add(dest, tmp, len);
-
-out:
- btrfs_free_path(path);
- return ret;
+ return fs_path_add(dest, tmp, len);
}
enum inode_state {
@@ -1960,7 +1936,7 @@ static int lookup_dir_item_inode(struct btrfs_root *root,
int ret = 0;
struct btrfs_dir_item *di;
struct btrfs_key key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len);
path = alloc_path_for_send();
@@ -1968,19 +1944,15 @@ static int lookup_dir_item_inode(struct btrfs_root *root,
return -ENOMEM;
di = btrfs_lookup_dir_item(NULL, root, path, dir, &name_str, 0);
- if (IS_ERR_OR_NULL(di)) {
- ret = di ? PTR_ERR(di) : -ENOENT;
- goto out;
- }
+ if (IS_ERR_OR_NULL(di))
+ return di ? PTR_ERR(di) : -ENOENT;
+
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
- if (key.type == BTRFS_ROOT_ITEM_KEY) {
- ret = -ENOENT;
- goto out;
- }
+ if (key.type == BTRFS_ROOT_ITEM_KEY)
+ return -ENOENT;
+
*found_inode = key.objectid;
-out:
- btrfs_free_path(path);
return ret;
}
@@ -1994,7 +1966,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
int ret;
struct btrfs_key key;
struct btrfs_key found_key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int len;
u64 parent_dir;
@@ -2008,16 +1980,14 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
if (ret < 0)
- goto out;
+ return ret;
if (!ret)
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
if (ret || found_key.objectid != ino ||
(found_key.type != BTRFS_INODE_REF_KEY &&
- found_key.type != BTRFS_INODE_EXTREF_KEY)) {
- ret = -ENOENT;
- goto out;
- }
+ found_key.type != BTRFS_INODE_EXTREF_KEY))
+ return -ENOENT;
if (found_key.type == BTRFS_INODE_REF_KEY) {
struct btrfs_inode_ref *iref;
@@ -2038,19 +2008,17 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref);
}
if (ret < 0)
- goto out;
+ return ret;
btrfs_release_path(path);
if (dir_gen) {
ret = get_inode_gen(root, parent_dir, dir_gen);
if (ret < 0)
- goto out;
+ return ret;
}
*dir = parent_dir;
-out:
- btrfs_free_path(path);
return ret;
}
@@ -2486,7 +2454,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
int ret;
struct btrfs_root *send_root = sctx->send_root;
struct btrfs_root *parent_root = sctx->parent_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_root_ref *ref;
struct extent_buffer *leaf;
@@ -2498,10 +2466,8 @@ static int send_subvol_begin(struct send_ctx *sctx)
return -ENOMEM;
name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
- if (!name) {
- btrfs_free_path(path);
+ if (!name)
return -ENOMEM;
- }
key.objectid = btrfs_root_id(send_root);
key.type = BTRFS_ROOT_BACKREF_KEY;
@@ -2564,7 +2530,6 @@ static int send_subvol_begin(struct send_ctx *sctx)
tlv_put_failure:
out:
- btrfs_free_path(path);
kfree(name);
return ret;
}
@@ -2715,7 +2680,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
int ret = 0;
struct fs_path *p = NULL;
struct btrfs_inode_item *ii;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *eb;
struct btrfs_key key;
int slot;
@@ -2759,7 +2724,6 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
tlv_put_failure:
out:
free_path_for_command(sctx, p);
- btrfs_free_path(path);
return ret;
}
@@ -2769,7 +2733,7 @@ out:
* processing an inode that is a directory and it just got renamed, and existing
* entries in the cache may refer to inodes that have the directory in their
* full path - in which case we would generate outdated paths (pre-rename)
- * for the inodes that the cache entries point to. Instead of prunning the
+ * for the inodes that the cache entries point to. Instead of pruning the
* cache when inserting, do it after we finish processing each inode at
* finish_inode_if_needed().
*/
@@ -2930,7 +2894,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
{
int ret = 0;
int iter_ret = 0;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_key di_key;
@@ -2970,7 +2934,6 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
if (iter_ret < 0)
ret = iter_ret;
- btrfs_free_path(path);
return ret;
}
@@ -3750,7 +3713,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
struct recorded_ref *parent_ref,
const bool is_orphan)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key di_key;
struct btrfs_dir_item *di;
@@ -3771,19 +3734,15 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len);
ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- ret = 0;
- goto out;
- }
+ if (ret < 0)
+ return ret;
+ if (ret > 0)
+ return 0;
di = btrfs_match_dir_item_name(path, parent_ref->name,
parent_ref->name_len);
- if (!di) {
- ret = 0;
- goto out;
- }
+ if (!di)
+ return 0;
/*
* di_key.objectid has the number of the inode that has a dentry in the
* parent directory with the same name that sctx->cur_ino is being
@@ -3793,26 +3752,22 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
* that it happens after that other inode is renamed.
*/
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key);
- if (di_key.type != BTRFS_INODE_ITEM_KEY) {
- ret = 0;
- goto out;
- }
+ if (di_key.type != BTRFS_INODE_ITEM_KEY)
+ return 0;
ret = get_inode_gen(sctx->parent_root, di_key.objectid, &left_gen);
if (ret < 0)
- goto out;
+ return ret;
ret = get_inode_gen(sctx->send_root, di_key.objectid, &right_gen);
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
- goto out;
+ return ret;
}
/* Different inode, no need to delay the rename of sctx->cur_ino */
- if (right_gen != left_gen) {
- ret = 0;
- goto out;
- }
+ if (right_gen != left_gen)
+ return 0;
wdm = get_waiting_dir_move(sctx, di_key.objectid);
if (wdm && !wdm->orphanized) {
@@ -3826,8 +3781,6 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
if (!ret)
ret = 1;
}
-out:
- btrfs_free_path(path);
return ret;
}
@@ -3877,7 +3830,7 @@ static int is_ancestor(struct btrfs_root *root,
bool free_fs_path = false;
int ret = 0;
int iter_ret = 0;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
if (!fs_path) {
@@ -3945,7 +3898,6 @@ static int is_ancestor(struct btrfs_root *root,
ret = iter_ret;
out:
- btrfs_free_path(path);
if (free_fs_path)
fs_path_free(fs_path);
return ret;
@@ -4756,8 +4708,8 @@ static int record_new_ref(struct send_ctx *sctx)
{
int ret;
- ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
- sctx->cmp_key, 0, record_new_ref_if_needed, sctx);
+ ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key,
+ false, record_new_ref_if_needed, sctx);
if (ret < 0)
return ret;
@@ -4768,9 +4720,8 @@ static int record_deleted_ref(struct send_ctx *sctx)
{
int ret;
- ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
- sctx->cmp_key, 0, record_deleted_ref_if_needed,
- sctx);
+ ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, sctx->cmp_key,
+ false, record_deleted_ref_if_needed, sctx);
if (ret < 0)
return ret;
@@ -4781,12 +4732,12 @@ static int record_changed_ref(struct send_ctx *sctx)
{
int ret;
- ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
- sctx->cmp_key, 0, record_new_ref_if_needed, sctx);
+ ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key,
+ false, record_new_ref_if_needed, sctx);
if (ret < 0)
return ret;
- ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
- sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx);
+ ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, sctx->cmp_key,
+ false, record_deleted_ref_if_needed, sctx);
if (ret < 0)
return ret;
@@ -4803,7 +4754,7 @@ static int process_all_refs(struct send_ctx *sctx,
int ret = 0;
int iter_ret = 0;
struct btrfs_root *root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key found_key;
iterate_inode_ref_t cb;
@@ -4822,8 +4773,7 @@ static int process_all_refs(struct send_ctx *sctx,
} else {
btrfs_err(sctx->send_root->fs_info,
"Wrong command %d in process_all_refs", cmd);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
key.objectid = sctx->cmp_key->objectid;
@@ -4835,15 +4785,14 @@ static int process_all_refs(struct send_ctx *sctx,
found_key.type != BTRFS_INODE_EXTREF_KEY))
break;
- ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
+ ret = iterate_inode_ref(root, path, &found_key, false, cb, sctx);
if (ret < 0)
- goto out;
+ return ret;
}
/* Catch error found during iteration */
- if (iter_ret < 0) {
- ret = iter_ret;
- goto out;
- }
+ if (iter_ret < 0)
+ return iter_ret;
+
btrfs_release_path(path);
/*
@@ -4851,10 +4800,7 @@ static int process_all_refs(struct send_ctx *sctx,
* re-creating this inode and will be rename'ing it into place once we
* rename the parent directory.
*/
- ret = process_recorded_refs(sctx, &pending_move);
-out:
- btrfs_free_path(path);
- return ret;
+ return process_recorded_refs(sctx, &pending_move);
}
static int send_set_xattr(struct send_ctx *sctx,
@@ -5080,7 +5026,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
int ret = 0;
int iter_ret = 0;
struct btrfs_root *root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key found_key;
@@ -5108,7 +5054,6 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
if (iter_ret < 0)
ret = iter_ret;
- btrfs_free_path(path);
return ret;
}
@@ -5254,7 +5199,7 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
if (!folio_test_uptodate(folio)) {
btrfs_read_folio(NULL, folio);
folio_lock(folio);
- if (!folio_test_uptodate(folio)) {
+ if (unlikely(!folio_test_uptodate(folio))) {
folio_unlock(folio);
btrfs_err(fs_info,
"send: IO error at offset %llu for inode %llu root %llu",
@@ -5656,7 +5601,14 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
ei = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) &&
+ /*
+ * Do not go through encoded read when block size > page size (bs > ps).
+ *
+ * Encoded send uses vmalloc'ed pages as its buffer, so we cannot
+ * ensure every folio is large enough to contain a block.
+ */
+ if (sctx->send_root->fs_info->sectorsize <= PAGE_SIZE &&
+ (sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) &&
btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) {
bool is_inline = (btrfs_file_extent_type(leaf, ei) ==
BTRFS_FILE_EXTENT_INLINE);
@@ -5766,7 +5718,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
*/
static int send_capabilities(struct send_ctx *sctx)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_dir_item *di;
struct extent_buffer *leaf;
unsigned long data_ptr;
@@ -5804,7 +5756,6 @@ static int send_capabilities(struct send_ctx *sctx)
strlen(XATTR_NAME_CAPS), buf, buf_len);
out:
kfree(buf);
- btrfs_free_path(path);
return ret;
}
@@ -5812,7 +5763,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
struct clone_root *clone_root, const u64 disk_byte,
u64 data_offset, u64 offset, u64 len)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
int ret;
struct btrfs_inode_info info;
@@ -5848,7 +5799,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
ret = get_inode_info(clone_root->root, clone_root->ino, &info);
btrfs_release_path(path);
if (ret < 0)
- goto out;
+ return ret;
clone_src_i_size = info.size;
/*
@@ -5878,7 +5829,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
key.offset = clone_root->offset;
ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret > 0 && path->slots[0] > 0) {
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
if (key.objectid == clone_root->ino &&
@@ -5899,7 +5850,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(clone_root->root, path);
if (ret < 0)
- goto out;
+ return ret;
else if (ret > 0)
break;
continue;
@@ -5936,7 +5887,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
ret = send_extent_data(sctx, dst_path, offset,
hole_len);
if (ret < 0)
- goto out;
+ return ret;
len -= hole_len;
if (len == 0)
@@ -6007,7 +5958,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
ret = send_clone(sctx, offset, slen,
clone_root);
if (ret < 0)
- goto out;
+ return ret;
}
ret = send_extent_data(sctx, dst_path,
offset + slen,
@@ -6041,7 +5992,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
}
if (ret < 0)
- goto out;
+ return ret;
len -= clone_len;
if (len == 0)
@@ -6072,8 +6023,6 @@ next:
ret = send_extent_data(sctx, dst_path, offset, len);
else
ret = 0;
-out:
- btrfs_free_path(path);
return ret;
}
@@ -6162,7 +6111,7 @@ static int is_extent_unchanged(struct send_ctx *sctx,
{
int ret = 0;
struct btrfs_key key;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *eb;
int slot;
struct btrfs_key found_key;
@@ -6188,10 +6137,9 @@ static int is_extent_unchanged(struct send_ctx *sctx,
ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
left_type = btrfs_file_extent_type(eb, ei);
- if (left_type != BTRFS_FILE_EXTENT_REG) {
- ret = 0;
- goto out;
- }
+ if (left_type != BTRFS_FILE_EXTENT_REG)
+ return 0;
+
left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
left_len = btrfs_file_extent_num_bytes(eb, ei);
left_offset = btrfs_file_extent_offset(eb, ei);
@@ -6223,11 +6171,9 @@ static int is_extent_unchanged(struct send_ctx *sctx,
key.offset = ekey->offset;
ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0);
if (ret < 0)
- goto out;
- if (ret) {
- ret = 0;
- goto out;
- }
+ return ret;
+ if (ret)
+ return 0;
/*
* Handle special case where the right side has no extents at all.
@@ -6236,11 +6182,9 @@ static int is_extent_unchanged(struct send_ctx *sctx,
slot = path->slots[0];
btrfs_item_key_to_cpu(eb, &found_key, slot);
if (found_key.objectid != key.objectid ||
- found_key.type != key.type) {
+ found_key.type != key.type)
/* If we're a hole then just pretend nothing changed */
- ret = (left_disknr) ? 0 : 1;
- goto out;
- }
+ return (left_disknr ? 0 : 1);
/*
* We're now on 2a, 2b or 7.
@@ -6250,10 +6194,8 @@ static int is_extent_unchanged(struct send_ctx *sctx,
ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
right_type = btrfs_file_extent_type(eb, ei);
if (right_type != BTRFS_FILE_EXTENT_REG &&
- right_type != BTRFS_FILE_EXTENT_INLINE) {
- ret = 0;
- goto out;
- }
+ right_type != BTRFS_FILE_EXTENT_INLINE)
+ return 0;
if (right_type == BTRFS_FILE_EXTENT_INLINE) {
right_len = btrfs_file_extent_ram_bytes(eb, ei);
@@ -6266,11 +6208,9 @@ static int is_extent_unchanged(struct send_ctx *sctx,
* Are we at extent 8? If yes, we know the extent is changed.
* This may only happen on the first iteration.
*/
- if (found_key.offset + right_len <= ekey->offset) {
+ if (found_key.offset + right_len <= ekey->offset)
/* If we're a hole just pretend nothing changed */
- ret = (left_disknr) ? 0 : 1;
- goto out;
- }
+ return (left_disknr ? 0 : 1);
/*
* We just wanted to see if when we have an inline extent, what
@@ -6280,10 +6220,8 @@ static int is_extent_unchanged(struct send_ctx *sctx,
* compressed extent representing data with a size matching
* the page size (currently the same as sector size).
*/
- if (right_type == BTRFS_FILE_EXTENT_INLINE) {
- ret = 0;
- goto out;
- }
+ if (right_type == BTRFS_FILE_EXTENT_INLINE)
+ return 0;
right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
right_offset = btrfs_file_extent_offset(eb, ei);
@@ -6303,17 +6241,15 @@ static int is_extent_unchanged(struct send_ctx *sctx,
*/
if (left_disknr != right_disknr ||
left_offset_fixed != right_offset ||
- left_gen != right_gen) {
- ret = 0;
- goto out;
- }
+ left_gen != right_gen)
+ return 0;
/*
* Go to the next extent.
*/
ret = btrfs_next_item(sctx->parent_root, path);
if (ret < 0)
- goto out;
+ return ret;
if (!ret) {
eb = path->nodes[0];
slot = path->slots[0];
@@ -6324,10 +6260,9 @@ static int is_extent_unchanged(struct send_ctx *sctx,
key.offset += right_len;
break;
}
- if (found_key.offset != key.offset + right_len) {
- ret = 0;
- goto out;
- }
+ if (found_key.offset != key.offset + right_len)
+ return 0;
+
key = found_key;
}
@@ -6340,15 +6275,12 @@ static int is_extent_unchanged(struct send_ctx *sctx,
else
ret = 0;
-
-out:
- btrfs_free_path(path);
return ret;
}
static int get_last_extent(struct send_ctx *sctx, u64 offset)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root *root = sctx->send_root;
struct btrfs_key key;
int ret;
@@ -6364,15 +6296,13 @@ static int get_last_extent(struct send_ctx *sctx, u64 offset)
key.offset = offset;
ret = btrfs_search_slot_for_read(root, &key, path, 0, 1);
if (ret < 0)
- goto out;
+ return ret;
ret = 0;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY)
- goto out;
+ return ret;
sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
-out:
- btrfs_free_path(path);
return ret;
}
@@ -6380,7 +6310,7 @@ static int range_is_hole_in_parent(struct send_ctx *sctx,
const u64 start,
const u64 end)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_root *root = sctx->parent_root;
u64 search_start = start;
@@ -6395,7 +6325,7 @@ static int range_is_hole_in_parent(struct send_ctx *sctx,
key.offset = search_start;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret > 0 && path->slots[0] > 0)
path->slots[0]--;
@@ -6408,8 +6338,8 @@ static int range_is_hole_in_parent(struct send_ctx *sctx,
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
- goto out;
- else if (ret > 0)
+ return ret;
+ if (ret > 0)
break;
continue;
}
@@ -6431,15 +6361,11 @@ static int range_is_hole_in_parent(struct send_ctx *sctx,
search_start = extent_end;
goto next;
}
- ret = 0;
- goto out;
+ return 0;
next:
path->slots[0]++;
}
- ret = 1;
-out:
- btrfs_free_path(path);
- return ret;
+ return 1;
}
static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
@@ -6547,7 +6473,7 @@ static int process_all_extents(struct send_ctx *sctx)
int ret = 0;
int iter_ret = 0;
struct btrfs_root *root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key found_key;
@@ -6574,11 +6500,10 @@ static int process_all_extents(struct send_ctx *sctx)
if (iter_ret < 0)
ret = iter_ret;
- btrfs_free_path(path);
return ret;
}
-static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end,
+static int process_recorded_refs_if_needed(struct send_ctx *sctx, bool at_end,
int *pending_move,
int *refs_processed)
{
@@ -6601,7 +6526,7 @@ out:
return ret;
}
-static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
+static int finish_inode_if_needed(struct send_ctx *sctx, bool at_end)
{
int ret = 0;
struct btrfs_inode_info info;
@@ -7036,7 +6961,7 @@ static int changed_ref(struct send_ctx *sctx,
{
int ret = 0;
- if (sctx->cur_ino != sctx->cmp_key->objectid) {
+ if (unlikely(sctx->cur_ino != sctx->cmp_key->objectid)) {
inconsistent_snapshot_error(sctx, result, "reference");
return -EIO;
}
@@ -7064,7 +6989,7 @@ static int changed_xattr(struct send_ctx *sctx,
{
int ret = 0;
- if (sctx->cur_ino != sctx->cmp_key->objectid) {
+ if (unlikely(sctx->cur_ino != sctx->cmp_key->objectid)) {
inconsistent_snapshot_error(sctx, result, "xattr");
return -EIO;
}
@@ -7304,7 +7229,7 @@ static int search_key_again(const struct send_ctx *sctx,
*/
ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
ASSERT(ret <= 0);
- if (ret > 0) {
+ if (unlikely(ret > 0)) {
btrfs_print_tree(path->nodes[path->lowest_level], false);
btrfs_err(root->fs_info,
"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d",
@@ -7324,7 +7249,7 @@ static int full_send_tree(struct send_ctx *sctx)
struct btrfs_root *send_root = sctx->send_root;
struct btrfs_key key;
struct btrfs_fs_info *fs_info = send_root->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
path = alloc_path_for_send();
if (!path)
@@ -7341,7 +7266,7 @@ static int full_send_tree(struct send_ctx *sctx)
ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret)
goto out_finish;
@@ -7351,7 +7276,7 @@ static int full_send_tree(struct send_ctx *sctx)
ret = changed_cb(path, NULL, &key,
BTRFS_COMPARE_TREE_NEW, sctx);
if (ret < 0)
- goto out;
+ return ret;
down_read(&fs_info->commit_root_sem);
if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
@@ -7370,14 +7295,14 @@ static int full_send_tree(struct send_ctx *sctx)
btrfs_release_path(path);
ret = search_key_again(sctx, send_root, path, &key);
if (ret < 0)
- goto out;
+ return ret;
} else {
up_read(&fs_info->commit_root_sem);
}
ret = btrfs_next_item(send_root, path);
if (ret < 0)
- goto out;
+ return ret;
if (ret) {
ret = 0;
break;
@@ -7385,11 +7310,7 @@ static int full_send_tree(struct send_ctx *sctx)
}
out_finish:
- ret = finish_inode_if_needed(sctx, 1);
-
-out:
- btrfs_free_path(path);
- return ret;
+ return finish_inode_if_needed(sctx, 1);
}
static int replace_node_with_clone(struct btrfs_path *path, int level)
@@ -7644,8 +7565,8 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
struct btrfs_fs_info *fs_info = left_root->fs_info;
int ret;
int cmp;
- struct btrfs_path *left_path = NULL;
- struct btrfs_path *right_path = NULL;
+ BTRFS_PATH_AUTO_FREE(left_path);
+ BTRFS_PATH_AUTO_FREE(right_path);
struct btrfs_key left_key;
struct btrfs_key right_key;
char *tmp_buf = NULL;
@@ -7918,8 +7839,6 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
out_unlock:
up_read(&fs_info->commit_root_sem);
out:
- btrfs_free_path(left_path);
- btrfs_free_path(right_path);
kvfree(tmp_buf);
return ret;
}
@@ -7986,7 +7905,7 @@ static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
}
/*
- * Make sure any existing dellaloc is flushed for any root used by a send
+ * Make sure any existing delalloc is flushed for any root used by a send
* operation so that we do not miss any data and we do not race with writeback
* finishing and changing a tree while send is using the tree. This could
* happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 0481c693ac2e..97452fb5d29b 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -479,7 +479,7 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
/*
* On the zoned mode, we always allocate one zone as one chunk.
- * Returning non-zone size alingned bytes here will result in
+ * Returning non-zone size aligned bytes here will result in
* less pressure for the async metadata reclaim process, and it
* will over-commit too much leading to ENOSPC. Align down to the
* zone size to avoid that.
@@ -1528,7 +1528,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
* turned into error mode due to a transaction abort when flushing space
* above, in that case fail with the abort error instead of returning
* success to the caller if we can steal from the global rsv - this is
- * just to have caller fail immeditelly instead of later when trying to
+ * just to have caller fail immediately instead of later when trying to
* modify the fs, making it easier to debug -ENOSPC problems.
*/
if (BTRFS_FS_ERROR(fs_info)) {
@@ -1830,7 +1830,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
space_info->flags,
orig_bytes, flush,
"enospc");
- queue_work(system_unbound_wq, async_work);
+ queue_work(system_dfl_wq, async_work);
}
} else {
list_add_tail(&ticket.list,
@@ -1847,7 +1847,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
need_preemptive_reclaim(fs_info, space_info)) {
trace_btrfs_trigger_flush(fs_info, space_info->flags,
orig_bytes, flush, "preempt");
- queue_work(system_unbound_wq,
+ queue_work(system_dfl_wq,
&fs_info->preempt_reclaim_work);
}
}
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index c9b3821957f7..5ca8d4db6722 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -448,8 +448,25 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
spin_lock_irqsave(&bfs->lock, flags);
bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+
+ /*
+ * Don't clear the TOWRITE tag when starting writeback on a still-dirty
+ * folio. Doing so can cause WB_SYNC_ALL writepages() to overlook it,
+ * assume writeback is complete, and exit too early — violating sync
+ * ordering guarantees.
+ */
if (!folio_test_writeback(folio))
- folio_start_writeback(folio);
+ __folio_start_writeback(folio, true);
+ if (!folio_test_dirty(folio)) {
+ struct address_space *mapping = folio_mapping(folio);
+ XA_STATE(xas, &mapping->i_pages, folio->index);
+ unsigned long flags;
+
+ xas_lock_irqsave(&xas, flags);
+ xas_load(&xas);
+ xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
+ xas_unlock_irqrestore(&xas, flags);
+ }
spin_unlock_irqrestore(&bfs->lock, flags);
}
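/*
 * Editor's note, not part of the patch: assuming the mm definition,
 * folio_start_writeback() is a wrapper that drops the TOWRITE tag:
 *
 *	static inline void folio_start_writeback(struct folio *folio)
 *	{
 *		__folio_start_writeback(folio, false);	// keep_write == false
 *	}
 *
 * Passing true above keeps the tag, and the explicit xas_clear_mark() only
 * removes it once the folio is no longer dirty, matching the new comment.
 */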
@@ -673,7 +690,7 @@ IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked,
\
GET_SUBPAGE_BITMAP(fs_info, folio, name, &bitmap); \
btrfs_warn(fs_info, \
- "dumpping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \
+ "dumping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \
start, len, folio_pos(folio), \
blocks_per_folio, &bitmap); \
}
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index ee0710eb13fd..ad0552db7c7d 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -13,7 +13,7 @@ struct address_space;
struct folio;
/*
- * Extra info for subpapge bitmap.
+ * Extra info for subpage bitmap.
*
* For subpage we pack all uptodate/dirty/writeback/ordered bitmaps into
* one larger bitmap.
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 68e35a3700ff..d6e496436539 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -88,6 +88,9 @@ struct btrfs_fs_context {
refcount_t refs;
};
+static void btrfs_emit_options(struct btrfs_fs_info *info,
+ struct btrfs_fs_context *old);
+
enum {
Opt_acl,
Opt_clear_cache,
@@ -130,9 +133,8 @@ enum {
Opt_enospc_debug,
#ifdef CONFIG_BTRFS_DEBUG
Opt_fragment, Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
-#endif
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
Opt_ref_verify,
+ Opt_ref_tracker,
#endif
Opt_err,
};
@@ -254,8 +256,7 @@ static const struct fs_parameter_spec btrfs_fs_parameters[] = {
fsparam_flag_no("enospc_debug", Opt_enospc_debug),
#ifdef CONFIG_BTRFS_DEBUG
fsparam_enum("fragment", Opt_fragment, btrfs_parameter_fragment),
-#endif
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ fsparam_flag("ref_tracker", Opt_ref_tracker),
fsparam_flag("ref_verify", Opt_ref_verify),
#endif
{}
@@ -273,6 +274,7 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx,
const struct fs_parameter *param, int opt)
{
const char *string = param->string;
+ int ret;
/*
* Provide the same semantics as older kernels that don't use fs
@@ -291,21 +293,30 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx,
btrfs_clear_opt(ctx->mount_opt, NODATASUM);
} else if (btrfs_match_compress_type(string, "zlib", true)) {
ctx->compress_type = BTRFS_COMPRESS_ZLIB;
- ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB,
- string + 4);
+ ret = btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, string + 4,
+ &ctx->compress_level);
+ if (ret < 0)
+ goto error;
btrfs_set_opt(ctx->mount_opt, COMPRESS);
btrfs_clear_opt(ctx->mount_opt, NODATACOW);
btrfs_clear_opt(ctx->mount_opt, NODATASUM);
- } else if (btrfs_match_compress_type(string, "lzo", false)) {
+ } else if (btrfs_match_compress_type(string, "lzo", true)) {
ctx->compress_type = BTRFS_COMPRESS_LZO;
- ctx->compress_level = 0;
+ ret = btrfs_compress_str2level(BTRFS_COMPRESS_LZO, string + 3,
+ &ctx->compress_level);
+ if (ret < 0)
+ goto error;
+ if (string[3] == ':' && string[4])
+ btrfs_warn(NULL, "Compression level ignored for LZO");
btrfs_set_opt(ctx->mount_opt, COMPRESS);
btrfs_clear_opt(ctx->mount_opt, NODATACOW);
btrfs_clear_opt(ctx->mount_opt, NODATASUM);
} else if (btrfs_match_compress_type(string, "zstd", true)) {
ctx->compress_type = BTRFS_COMPRESS_ZSTD;
- ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD,
- string + 4);
+ ret = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, string + 4,
+ &ctx->compress_level);
+ if (ret < 0)
+ goto error;
btrfs_set_opt(ctx->mount_opt, COMPRESS);
btrfs_clear_opt(ctx->mount_opt, NODATACOW);
btrfs_clear_opt(ctx->mount_opt, NODATASUM);
@@ -316,10 +327,14 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx,
btrfs_clear_opt(ctx->mount_opt, COMPRESS);
btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
} else {
- btrfs_err(NULL, "unrecognized compression value %s", string);
- return -EINVAL;
+ ret = -EINVAL;
+ goto error;
}
return 0;
+error:
+ btrfs_err(NULL, "failed to parse compression option '%s'", string);
+ return ret;
+
}
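/*
 * Editor's sketch, illustrative only and not the btrfs implementation: the
 * reworked btrfs_compress_str2level() reports parse failures instead of
 * silently returning a level, which is what lets malformed input such as
 * "zlib:abc" reach the new error label above. A rough shape of such a parser
 * (helper name and signature are hypothetical):
 */
static int parse_level_sketch(const char *str, int max_level, int *level_ret)
{
	int level;
	int ret;

	if (str[0] == '\0') {		/* plain "zstd" - use the default level */
		*level_ret = 0;
		return 0;
	}
	if (str[0] != ':')
		return -EINVAL;
	ret = kstrtoint(str + 1, 10, &level);	/* "zstd:3" and friends */
	if (ret < 0)
		return ret;
	*level_ret = min(level, max_level);
	return 0;
}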
static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
@@ -629,11 +644,12 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
return -EINVAL;
}
break;
-#endif
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
case Opt_ref_verify:
btrfs_set_opt(ctx->mount_opt, REF_VERIFY);
break;
+ case Opt_ref_tracker:
+ btrfs_set_opt(ctx->mount_opt, REF_TRACKER);
+ break;
#endif
default:
btrfs_err(NULL, "unrecognized mount option '%s'", param->key);
@@ -698,12 +714,9 @@ bool btrfs_check_options(const struct btrfs_fs_info *info,
if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) {
if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) {
- btrfs_info(info, "disk space caching is enabled");
btrfs_warn(info,
"space cache v1 is being deprecated and will be removed in a future release, please use -o space_cache=v2");
}
- if (btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE))
- btrfs_info(info, "using free-space-tree");
}
return ret;
@@ -912,7 +925,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
{
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_dir_item *di;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key location;
struct fscrypt_str name = FSTR_INIT("default", 7);
u64 dir_id;
@@ -929,7 +942,6 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
dir_id = btrfs_super_root_dir(fs_info->super_copy);
di = btrfs_lookup_dir_item(NULL, root, path, dir_id, &name, 0);
if (IS_ERR(di)) {
- btrfs_free_path(path);
return PTR_ERR(di);
}
if (!di) {
@@ -938,13 +950,11 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
* it's always been there, but don't freak out, just try and
* mount the top-level subvolume.
*/
- btrfs_free_path(path);
*objectid = BTRFS_FS_TREE_OBJECTID;
return 0;
}
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
- btrfs_free_path(path);
*objectid = location.objectid;
return 0;
}
@@ -980,6 +990,8 @@ static int btrfs_fill_super(struct super_block *sb,
return ret;
}
+ btrfs_emit_options(fs_info, NULL);
+
inode = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
@@ -1077,7 +1089,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_printf(seq, ",compress-force=%s", compress_type);
else
seq_printf(seq, ",compress=%s", compress_type);
- if (info->compress_level)
+ if (info->compress_level && info->compress_type != BTRFS_COMPRESS_LZO)
seq_printf(seq, ":%d", info->compress_level);
}
if (btrfs_test_opt(info, NOSSD))
@@ -1140,6 +1152,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
#endif
if (btrfs_test_opt(info, REF_VERIFY))
seq_puts(seq, ",ref_verify");
+ if (btrfs_test_opt(info, REF_TRACKER))
+ seq_puts(seq, ",ref_tracker");
seq_printf(seq, ",subvolid=%llu", btrfs_root_id(BTRFS_I(d_inode(dentry))->root));
subvol_name = btrfs_get_subvol_name_from_objectid(info,
btrfs_root_id(BTRFS_I(d_inode(dentry))->root));
@@ -1266,7 +1280,7 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
/*
- * We need to cleanup all defragable inodes if the autodefragment is
+ * We need to cleanup all defraggable inodes if the autodefragment is
* close or the filesystem is read only.
*/
if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
@@ -1437,7 +1451,7 @@ static void btrfs_emit_options(struct btrfs_fs_info *info,
{
btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
btrfs_info_if_set(info, old, DEGRADED, "allowing degraded mounts");
- btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
+ btrfs_info_if_set(info, old, NODATACOW, "setting nodatacow");
btrfs_info_if_set(info, old, SSD, "enabling ssd optimizations");
btrfs_info_if_set(info, old, SSD_SPREAD, "using spread ssd allocation scheme");
btrfs_info_if_set(info, old, NOBARRIER, "turning off barriers");
@@ -1459,10 +1473,11 @@ static void btrfs_emit_options(struct btrfs_fs_info *info,
btrfs_info_if_set(info, old, IGNOREMETACSUMS, "ignoring meta csums");
btrfs_info_if_set(info, old, IGNORESUPERFLAGS, "ignoring unknown super block flags");
+ btrfs_info_if_unset(info, old, NODATASUM, "setting datasum");
btrfs_info_if_unset(info, old, NODATACOW, "setting datacow");
btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations");
btrfs_info_if_unset(info, old, SSD_SPREAD, "not using spread ssd allocation scheme");
- btrfs_info_if_unset(info, old, NOBARRIER, "turning off barriers");
+ btrfs_info_if_unset(info, old, NOBARRIER, "turning on barriers");
btrfs_info_if_unset(info, old, NOTREELOG, "enabling tree log");
btrfs_info_if_unset(info, old, SPACE_CACHE, "disabling disk space caching");
btrfs_info_if_unset(info, old, FREE_SPACE_TREE, "disabling free space tree");
@@ -2257,10 +2272,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
device = btrfs_scan_one_device(vol->name, false);
if (IS_ERR_OR_NULL(device)) {
mutex_unlock(&uuid_mutex);
- if (IS_ERR(device))
- ret = PTR_ERR(device);
- else
- ret = 0;
+ ret = PTR_ERR_OR_ZERO(device);
break;
}
ret = !(device->fs_devices->num_devices ==
@@ -2313,14 +2325,14 @@ static int check_dev_super(struct btrfs_device *dev)
/* Verify the checksum. */
csum_type = btrfs_super_csum_type(sb);
- if (csum_type != btrfs_super_csum_type(fs_info->super_copy)) {
+ if (unlikely(csum_type != btrfs_super_csum_type(fs_info->super_copy))) {
btrfs_err(fs_info, "csum type changed, has %u expect %u",
csum_type, btrfs_super_csum_type(fs_info->super_copy));
ret = -EUCLEAN;
goto out;
}
- if (btrfs_check_super_csum(fs_info, sb)) {
+ if (unlikely(btrfs_check_super_csum(fs_info, sb))) {
btrfs_err(fs_info, "csum for on-disk super block no longer matches");
ret = -EUCLEAN;
goto out;
@@ -2332,7 +2344,7 @@ static int check_dev_super(struct btrfs_device *dev)
goto out;
last_trans = btrfs_get_last_trans_committed(fs_info);
- if (btrfs_super_generation(sb) != last_trans) {
+ if (unlikely(btrfs_super_generation(sb) != last_trans)) {
btrfs_err(fs_info, "transid mismatch, has %llu expect %llu",
btrfs_super_generation(sb), last_trans);
ret = -EUCLEAN;
@@ -2469,9 +2481,6 @@ static int __init btrfs_print_mod_info(void)
#ifdef CONFIG_BTRFS_ASSERT
", assert=on"
#endif
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
- ", ref-verify=on"
-#endif
#ifdef CONFIG_BLK_DEV_ZONED
", zoned=yes"
#else
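
The reworked btrfs_parse_compress() above folds level parsing into btrfs_compress_str2level() and routes every failure through a single error label. As a rough, self-contained illustration of the same "name[:level]" splitting, here is a userspace sketch; the accepted level range and the meaning of level 0 below are assumptions for the example, not the kernel's tables.

/*
 * Userspace sketch of "name[:level]" parsing, similar in spirit to the
 * btrfs_parse_compress()/btrfs_compress_str2level() flow above.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int parse_compress(const char *val, char *type, size_t type_size, int *level)
{
	const char *colon = strchr(val, ':');
	size_t name_len = colon ? (size_t)(colon - val) : strlen(val);

	if (name_len == 0 || name_len >= type_size)
		return -1;
	memcpy(type, val, name_len);
	type[name_len] = '\0';

	*level = 0;				/* 0 selects the default level */
	if (colon && colon[1] != '\0') {
		char *end = NULL;
		long lvl = strtol(colon + 1, &end, 10);

		if (*end != '\0')
			return -1;		/* trailing garbage after the level */
		if (lvl < 1 || lvl > 15)
			return -1;		/* assumed valid range for the example */
		*level = (int)lvl;
	}
	return 0;
}

int main(void)
{
	char type[16];
	int level;

	if (parse_compress("zstd:3", type, sizeof(type), &level) == 0)
		printf("type=%s level=%d\n", type, level);	/* type=zstd level=3 */
	return 0;
}
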
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 9d398f7a36ad..81f52c1f55ce 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -409,13 +409,17 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj,
char *buf)
{
ssize_t ret = 0;
+ bool has_output = false;
- if (BTRFS_MIN_BLOCKSIZE != SZ_4K && BTRFS_MIN_BLOCKSIZE != PAGE_SIZE)
- ret += sysfs_emit_at(buf, ret, "%u ", BTRFS_MIN_BLOCKSIZE);
- if (PAGE_SIZE > SZ_4K)
- ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K);
- ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE);
-
+ for (u32 cur = BTRFS_MIN_BLOCKSIZE; cur <= BTRFS_MAX_BLOCKSIZE; cur *= 2) {
+ if (!btrfs_supported_blocksize(cur))
+ continue;
+ if (has_output)
+ ret += sysfs_emit_at(buf, ret, " ");
+ ret += sysfs_emit_at(buf, ret, "%u", cur);
+ has_output = true;
+ }
+ ret += sysfs_emit_at(buf, ret, "\n");
return ret;
}
BTRFS_ATTR(static_feature, supported_sectorsizes,
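
The sysfs change above replaces the hard-coded size list with a loop over every power-of-two block size between the minimum and maximum, emitting only the supported ones separated by spaces. A minimal userspace sketch of that enumeration pattern follows; the 4K..64K range and the "supported" predicate are assumptions for the example, not the kernel's actual policy.

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define MIN_BLOCKSIZE	4096u
#define MAX_BLOCKSIZE	65536u

/* Pretend only 4K and an assumed 16K page size are usable. */
static bool blocksize_supported(uint32_t size)
{
	return size == 4096u || size == 16384u;
}

int main(void)
{
	bool has_output = false;

	/* Walk every power of two in the range, space-separating the output. */
	for (uint32_t cur = MIN_BLOCKSIZE; cur <= MAX_BLOCKSIZE; cur *= 2) {
		if (!blocksize_supported(cur))
			continue;
		if (has_output)
			putchar(' ');
		printf("%u", cur);
		has_output = true;
	}
	putchar('\n');	/* e.g. "4096 16384" */
	return 0;
}
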
diff --git a/fs/btrfs/tests/delayed-refs-tests.c b/fs/btrfs/tests/delayed-refs-tests.c
index 265370e79a54..e2248acb906b 100644
--- a/fs/btrfs/tests/delayed-refs-tests.c
+++ b/fs/btrfs/tests/delayed-refs-tests.c
@@ -997,12 +997,12 @@ int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize)
ret = simple_tests(&trans);
if (!ret) {
- test_msg("running delayed refs merg tests on metadata refs");
+ test_msg("running delayed refs merge tests on metadata refs");
ret = merge_tests(&trans, BTRFS_REF_METADATA);
}
if (!ret) {
- test_msg("running delayed refs merg tests on data refs");
+ test_msg("running delayed refs merge tests on data refs");
ret = merge_tests(&trans, BTRFS_REF_DATA);
}
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 3a86534c116f..42af6c737c6e 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -1095,7 +1095,7 @@ int btrfs_test_extent_map(void)
/*
* Test a chunk with 2 data stripes one of which
* intersects the physical address of the super block
- * is correctly recognised.
+ * is correctly recognized.
*/
.raid_type = BTRFS_BLOCK_GROUP_RAID1,
.physical_start = SZ_64M - SZ_4M,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c5c0d9cf1a80..89ae0c7a610a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -103,7 +103,7 @@ static struct kmem_cache *btrfs_trans_handle_cachep;
* | attached to transid N+1. |
* | |
* | To next stage: |
- * | Until all tree blocks are super blocks are |
+ * | Until all tree blocks and super blocks are |
* | written to block devices |
* V |
* Transaction N [[TRANS_STATE_COMPLETED]] V
@@ -404,7 +404,7 @@ loop:
*/
static int record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- int force)
+ bool force)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
@@ -1569,7 +1569,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
* qgroup counters could end up wrong.
*/
ret = btrfs_run_delayed_refs(trans, U64_MAX);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1641,7 +1641,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_root *parent_root;
struct btrfs_block_rsv *rsv;
struct btrfs_inode *parent_inode = pending->dir;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_dir_item *dir_item;
struct extent_buffer *tmp;
struct extent_buffer *old;
@@ -1694,10 +1694,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto clear_skip_qgroup;
}
- key.objectid = objectid;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
-
rsv = trans->block_rsv;
trans->block_rsv = &pending->block_rsv;
trans->bytes_reserved = trans->block_rsv->reserved;
@@ -1714,7 +1710,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* insert the directory item
*/
ret = btrfs_set_inode_index(parent_inode, &index);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1735,7 +1731,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ret = btrfs_create_qgroup(trans, objectid);
if (ret && ret != -EEXIST) {
- if (ret != -ENOTCONN || btrfs_qgroup_enabled(fs_info)) {
+ if (unlikely(ret != -ENOTCONN || btrfs_qgroup_enabled(fs_info))) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1748,13 +1744,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* snapshot
*/
ret = btrfs_run_delayed_items(trans);
- if (ret) { /* Transaction aborted */
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
ret = record_root_in_trans(trans, root, 0);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1789,7 +1785,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
old = btrfs_lock_root_node(root);
ret = btrfs_cow_block(trans, root, old, NULL, 0, &old,
BTRFS_NESTING_COW);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_tree_unlock(old);
free_extent_buffer(old);
btrfs_abort_transaction(trans, ret);
@@ -1800,21 +1796,23 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
/* clean up in any case */
btrfs_tree_unlock(old);
free_extent_buffer(old);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
/* see comments in should_cow_block() */
set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
- smp_wmb();
+ smp_mb__after_atomic();
btrfs_set_root_node(new_root_item, tmp);
/* record when the snapshot was created in key.offset */
+ key.objectid = objectid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = trans->transid;
ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
btrfs_tree_unlock(tmp);
free_extent_buffer(tmp);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1826,7 +1824,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_root_id(parent_root),
btrfs_ino(parent_inode), index,
&fname.disk_name);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1841,7 +1839,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
ret = btrfs_reloc_post_snapshot(trans, pending);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1864,7 +1862,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ret = btrfs_insert_dir_item(trans, &fname.disk_name,
parent_inode, &key, BTRFS_FT_DIR,
index);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1874,14 +1872,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
inode_set_mtime_to_ts(&parent_inode->vfs_inode,
inode_set_ctime_current(&parent_inode->vfs_inode));
ret = btrfs_update_inode_fallback(trans, parent_inode);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
ret = btrfs_uuid_tree_add(trans, new_root_item->uuid,
BTRFS_UUID_KEY_SUBVOL,
objectid);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1889,7 +1887,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ret = btrfs_uuid_tree_add(trans, new_root_item->received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
objectid);
- if (ret && ret != -EEXIST) {
+ if (unlikely(ret && ret != -EEXIST)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1907,7 +1905,6 @@ free_fname:
free_pending:
kfree(new_root_item);
pending->root_item = NULL;
- btrfs_free_path(path);
pending->path = NULL;
return ret;
@@ -2423,7 +2420,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* them.
*
* We needn't worry that this operation will corrupt the snapshots,
- * because all the tree which are snapshoted will be forced to COW
+ * because all the tree which are snapshotted will be forced to COW
* the nodes and leaves.
*/
ret = btrfs_run_delayed_items(trans);
@@ -2657,9 +2654,9 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
if (btrfs_header_backref_rev(root->node) <
BTRFS_MIXED_BACKREF_REV)
- ret = btrfs_drop_snapshot(root, 0, 0);
+ ret = btrfs_drop_snapshot(root, false, false);
else
- ret = btrfs_drop_snapshot(root, 1, 0);
+ ret = btrfs_drop_snapshot(root, true, false);
btrfs_put_root(root);
return (ret < 0) ? 0 : 1;
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 0f556f4de3f9..ca30b15ea452 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -183,6 +183,7 @@ static bool check_prev_ino(struct extent_buffer *leaf,
/* Only these key->types needs to be checked */
ASSERT(key->type == BTRFS_XATTR_ITEM_KEY ||
key->type == BTRFS_INODE_REF_KEY ||
+ key->type == BTRFS_INODE_EXTREF_KEY ||
key->type == BTRFS_DIR_INDEX_KEY ||
key->type == BTRFS_DIR_ITEM_KEY ||
key->type == BTRFS_EXTENT_DATA_KEY);
@@ -1209,7 +1210,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
/*
* For legacy root item, the members starting at generation_v2 will be
* all filled with 0.
- * And since we allow geneartion_v2 as 0, it will still pass the check.
+ * And since we allow generation_v2 as 0, it will still pass the check.
*/
read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot),
btrfs_item_size(leaf, slot));
@@ -1756,10 +1757,10 @@ static int check_inode_ref(struct extent_buffer *leaf,
while (ptr < end) {
u16 namelen;
- if (unlikely(ptr + sizeof(iref) > end)) {
+ if (unlikely(ptr + sizeof(*iref) > end)) {
inode_ref_err(leaf, slot,
"inode ref overflow, ptr %lu end %lu inode_ref_size %zu",
- ptr, end, sizeof(iref));
+ ptr, end, sizeof(*iref));
return -EUCLEAN;
}
@@ -1782,6 +1783,39 @@ static int check_inode_ref(struct extent_buffer *leaf,
return 0;
}
+static int check_inode_extref(struct extent_buffer *leaf,
+ struct btrfs_key *key, struct btrfs_key *prev_key,
+ int slot)
+{
+ unsigned long ptr = btrfs_item_ptr_offset(leaf, slot);
+ unsigned long end = ptr + btrfs_item_size(leaf, slot);
+
+ if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
+ return -EUCLEAN;
+
+ while (ptr < end) {
+ struct btrfs_inode_extref *extref = (struct btrfs_inode_extref *)ptr;
+ u16 namelen;
+
+ if (unlikely(ptr + sizeof(*extref) > end)) {
+ inode_ref_err(leaf, slot,
+ "inode extref overflow, ptr %lu end %lu inode_extref size %zu",
+ ptr, end, sizeof(*extref));
+ return -EUCLEAN;
+ }
+
+ namelen = btrfs_inode_extref_name_len(leaf, extref);
+ if (unlikely(ptr + sizeof(*extref) + namelen > end)) {
+ inode_ref_err(leaf, slot,
+ "inode extref overflow, ptr %lu end %lu namelen %u",
+ ptr, end, namelen);
+ return -EUCLEAN;
+ }
+ ptr += sizeof(*extref) + namelen;
+ }
+ return 0;
+}
+
static int check_raid_stripe_extent(const struct extent_buffer *leaf,
const struct btrfs_key *key, int slot)
{
@@ -1893,6 +1927,9 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf,
case BTRFS_INODE_REF_KEY:
ret = check_inode_ref(leaf, key, prev_key, slot);
break;
+ case BTRFS_INODE_EXTREF_KEY:
+ ret = check_inode_extref(leaf, key, prev_key, slot);
+ break;
case BTRFS_BLOCK_GROUP_ITEM_KEY:
ret = check_block_group_item(leaf, key, slot);
break;
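
The new check_inode_extref() above walks a leaf item as a sequence of variable-length records, verifying that both the fixed header and the trailing name fit inside the item before reading them. Below is a small userspace sketch of that bounds-checking pattern, using an invented record layout rather than the btrfs on-disk format.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct ref_hdr {
	uint16_t name_len;	/* length of the name that follows the header */
};

static int check_refs(const uint8_t *buf, size_t item_size)
{
	size_t ptr = 0;

	while (ptr < item_size) {
		struct ref_hdr hdr;

		if (ptr + sizeof(hdr) > item_size)
			return -1;	/* header would overflow the item */
		memcpy(&hdr, buf + ptr, sizeof(hdr));
		if (ptr + sizeof(hdr) + hdr.name_len > item_size)
			return -1;	/* name would overflow the item */
		printf("name: %.*s\n", hdr.name_len, buf + ptr + sizeof(hdr));
		ptr += sizeof(hdr) + hdr.name_len;
	}
	return 0;
}

int main(void)
{
	uint8_t buf[32];
	struct ref_hdr hdr = { .name_len = 3 };

	memcpy(buf, &hdr, sizeof(hdr));
	memcpy(buf + sizeof(hdr), "foo", 3);
	return check_refs(buf, sizeof(hdr) + 3) ? 1 : 0;
}
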
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 2186e87fb61b..6aad6b65522b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -27,6 +27,7 @@
#include "file-item.h"
#include "file.h"
#include "orphan.h"
+#include "print-tree.h"
#include "tree-checker.h"
#define MAX_CONFLICT_INODES 10
@@ -101,17 +102,134 @@ enum {
LOG_WALK_REPLAY_ALL,
};
+/*
+ * The walk control struct is used to pass state down the chain when processing
+ * the log tree. The stage field tells us which part of the log tree processing
+ * we are currently doing.
+ */
+struct walk_control {
+ /*
+ * Signal that we are freeing the metadata extents of a log tree.
+ * This is used at transaction commit time while freeing a log tree.
+ */
+ bool free;
+
+ /*
+ * Signal that we are pinning the metadata extents of a log tree and the
+ * data extents its leaves point to (if using mixed block groups).
+ * This happens in the first stage of log replay to ensure that during
+ * replay, while we are modifying subvolume trees, we don't overwrite
+ * the metadata extents of log trees.
+ */
+ bool pin;
+
+ /* What stage of the replay code we're currently in. */
+ int stage;
+
+ /*
+ * Ignore any items from the inode currently being processed. Needs
+ * to be set every time we find a BTRFS_INODE_ITEM_KEY.
+ */
+ bool ignore_cur_inode;
+
+ /*
+ * The root we are currently replaying to. This is NULL for the replay
+ * stage LOG_WALK_PIN_ONLY.
+ */
+ struct btrfs_root *root;
+
+ /* The log tree we are currently processing (not NULL for any stage). */
+ struct btrfs_root *log;
+
+ /* The transaction handle used for replaying all log trees. */
+ struct btrfs_trans_handle *trans;
+
+ /*
+ * The function that gets used to process blocks we find in the tree.
+ * Note the extent_buffer might not be up to date when it is passed in,
+ * and it must be checked or read if you need the data inside it.
+ */
+ int (*process_func)(struct extent_buffer *eb,
+ struct walk_control *wc, u64 gen, int level);
+
+ /*
+ * The following are used only when stage is >= LOG_WALK_REPLAY_INODES
+ * and by the replay_one_buffer() callback.
+ */
+
+ /* The current log leaf being processed. */
+ struct extent_buffer *log_leaf;
+ /* The key currently being processed in the log leaf. */
+ struct btrfs_key log_key;
+ /* The slot currently being processed in the log leaf. */
+ int log_slot;
+
+ /* A path used for searches and modifications to subvolume trees. */
+ struct btrfs_path *subvol_path;
+};
+
+static void do_abort_log_replay(struct walk_control *wc, const char *function,
+ unsigned int line, int error, const char *fmt, ...)
+{
+ struct btrfs_fs_info *fs_info = wc->trans->fs_info;
+ struct va_format vaf;
+ va_list args;
+
+ /*
+ * Do nothing if we already aborted, to avoid dumping leaves again which
+ * can be verbose. Further more, only the first call is useful since it
+ * is where we have a problem. Note that we do not use the flag
+ * BTRFS_FS_STATE_TRANS_ABORTED because log replay calls functions that
+ * are outside of tree-log.c that can abort transactions (such as
+ * btrfs_add_link() for example), so if that happens we still want to
+ * dump all log replay specific information below.
+ */
+ if (test_and_set_bit(BTRFS_FS_STATE_LOG_REPLAY_ABORTED, &fs_info->fs_state))
+ return;
+
+ btrfs_abort_transaction(wc->trans, error);
+
+ if (wc->subvol_path->nodes[0]) {
+ btrfs_crit(fs_info,
+ "subvolume (root %llu) leaf currently being processed:",
+ btrfs_root_id(wc->root));
+ btrfs_print_leaf(wc->subvol_path->nodes[0]);
+ }
+
+ if (wc->log_leaf) {
+ btrfs_crit(fs_info,
+ "log tree (for root %llu) leaf currently being processed (slot %d key %llu %u %llu):",
+ btrfs_root_id(wc->root), wc->log_slot,
+ wc->log_key.objectid, wc->log_key.type, wc->log_key.offset);
+ btrfs_print_leaf(wc->log_leaf);
+ }
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(fs_info,
+ "log replay failed in %s:%u for root %llu, stage %d, with error %d: %pV",
+ function, line, btrfs_root_id(wc->root), wc->stage, error, &vaf);
+
+ va_end(args);
+}
+
+/*
+ * Use this for aborting a transaction during log replay while we are down the
+ * call chain of replay_one_buffer(), so that we get a lot more useful
+ * information for debugging issues when compared to a plain call to
+ * btrfs_abort_transaction().
+ */
+#define btrfs_abort_log_replay(wc, error, fmt, args...) \
+ do_abort_log_replay((wc), __func__, __LINE__, (error), fmt, ##args)
+
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
int inode_only,
struct btrfs_log_ctx *ctx);
-static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, u64 objectid);
-static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *log,
- struct btrfs_path *path,
+static int link_to_fixup_dir(struct walk_control *wc, u64 objectid);
+static noinline int replay_dir_deletes(struct walk_control *wc,
u64 dirid, bool del_all);
static void wait_log_commit(struct btrfs_root *root, int transid);
@@ -300,53 +418,13 @@ void btrfs_end_log_trans(struct btrfs_root *root)
}
/*
- * the walk control struct is used to pass state down the chain when
- * processing the log tree. The stage field tells us which part
- * of the log tree processing we are currently doing. The others
- * are state fields used for that specific part
- */
-struct walk_control {
- /* should we free the extent on disk when done? This is used
- * at transaction commit time while freeing a log tree
- */
- int free;
-
- /* pin only walk, we record which extents on disk belong to the
- * log trees
- */
- int pin;
-
- /* what stage of the replay code we're currently in */
- int stage;
-
- /*
- * Ignore any items from the inode currently being processed. Needs
- * to be set every time we find a BTRFS_INODE_ITEM_KEY.
- */
- bool ignore_cur_inode;
-
- /* the root we are currently replaying */
- struct btrfs_root *replay_dest;
-
- /* the trans handle for the current replay */
- struct btrfs_trans_handle *trans;
-
- /* the function that gets used to process blocks we find in the
- * tree. Note the extent_buffer might not be up to date when it is
- * passed in, and it must be checked or read if you need the data
- * inside it
- */
- int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
- struct walk_control *wc, u64 gen, int level);
-};
-
-/*
* process_func used to pin down extents, write them or wait on them
*/
-static int process_one_buffer(struct btrfs_root *log,
- struct extent_buffer *eb,
+static int process_one_buffer(struct extent_buffer *eb,
struct walk_control *wc, u64 gen, int level)
{
+ struct btrfs_root *log = wc->log;
+ struct btrfs_trans_handle *trans = wc->trans;
struct btrfs_fs_info *fs_info = log->fs_info;
int ret = 0;
@@ -361,25 +439,36 @@ static int process_one_buffer(struct btrfs_root *log,
};
ret = btrfs_read_extent_buffer(eb, &check);
- if (ret)
+ if (unlikely(ret)) {
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(fs_info, ret, NULL);
return ret;
+ }
}
if (wc->pin) {
- ret = btrfs_pin_extent_for_log_replay(wc->trans, eb);
- if (ret)
+ ASSERT(trans != NULL);
+ ret = btrfs_pin_extent_for_log_replay(trans, eb);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
return ret;
+ }
- if (btrfs_buffer_uptodate(eb, gen, 0) &&
- btrfs_header_level(eb) == 0)
+ if (btrfs_buffer_uptodate(eb, gen, false) && level == 0) {
ret = btrfs_exclude_logged_extents(eb);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
}
return ret;
}
/*
- * Item overwrite used by log replay. The given eb, slot and key all refer to
- * the source data we are copying out.
+ * Item overwrite used by log replay. The given log tree leaf, slot and key
+ * from the walk_control structure all refer to the source data we are copying
+ * out.
*
* The given root is for the tree we are copying into, and path is a scratch
* path for use in this function (it should be released on entry and will be
@@ -391,12 +480,10 @@ static int process_one_buffer(struct btrfs_root *log,
*
* If the key isn't in the destination yet, a new item is inserted.
*/
-static int overwrite_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static int overwrite_item(struct walk_control *wc)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
int ret;
u32 item_size;
u64 saved_i_size = 0;
@@ -405,7 +492,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
unsigned long dst_ptr;
struct extent_buffer *dst_eb;
int dst_slot;
- bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
+ const bool is_inode_item = (wc->log_key.type == BTRFS_INODE_ITEM_KEY);
/*
* This is only used during log replay, so the root is always from a
@@ -416,16 +503,21 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
*/
ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
- item_size = btrfs_item_size(eb, slot);
- src_ptr = btrfs_item_ptr_offset(eb, slot);
+ item_size = btrfs_item_size(wc->log_leaf, wc->log_slot);
+ src_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot);
/* Look for the key in the destination tree. */
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
- if (ret < 0)
+ ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search subvolume tree for key (%llu %u %llu) root %llu",
+ wc->log_key.objectid, wc->log_key.type,
+ wc->log_key.offset, btrfs_root_id(root));
return ret;
+ }
- dst_eb = path->nodes[0];
- dst_slot = path->slots[0];
+ dst_eb = wc->subvol_path->nodes[0];
+ dst_slot = wc->subvol_path->slots[0];
if (ret == 0) {
char *src_copy;
@@ -435,16 +527,17 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
goto insert;
if (item_size == 0) {
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return 0;
}
src_copy = kmalloc(item_size, GFP_NOFS);
if (!src_copy) {
- btrfs_release_path(path);
+ btrfs_abort_log_replay(wc, -ENOMEM,
+ "failed to allocate memory for log leaf item");
return -ENOMEM;
}
- read_extent_buffer(eb, src_copy, src_ptr, item_size);
+ read_extent_buffer(wc->log_leaf, src_copy, src_ptr, item_size);
dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
ret = memcmp_extent_buffer(dst_eb, src_copy, dst_ptr, item_size);
@@ -456,7 +549,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
* sync
*/
if (ret == 0) {
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return 0;
}
@@ -464,7 +557,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
* We need to load the old nbytes into the inode so when we
* replay the extents we've logged we get the right nbytes.
*/
- if (inode_item) {
+ if (is_inode_item) {
struct btrfs_inode_item *item;
u64 nbytes;
u32 mode;
@@ -472,20 +565,20 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
item = btrfs_item_ptr(dst_eb, dst_slot,
struct btrfs_inode_item);
nbytes = btrfs_inode_nbytes(dst_eb, item);
- item = btrfs_item_ptr(eb, slot,
+ item = btrfs_item_ptr(wc->log_leaf, wc->log_slot,
struct btrfs_inode_item);
- btrfs_set_inode_nbytes(eb, item, nbytes);
+ btrfs_set_inode_nbytes(wc->log_leaf, item, nbytes);
/*
* If this is a directory we need to reset the i_size to
* 0 so that we can set it up properly when replaying
* the rest of the items in this log.
*/
- mode = btrfs_inode_mode(eb, item);
+ mode = btrfs_inode_mode(wc->log_leaf, item);
if (S_ISDIR(mode))
- btrfs_set_inode_size(eb, item, 0);
+ btrfs_set_inode_size(wc->log_leaf, item, 0);
}
- } else if (inode_item) {
+ } else if (is_inode_item) {
struct btrfs_inode_item *item;
u32 mode;
@@ -493,38 +586,41 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
* New inode, set nbytes to 0 so that the nbytes comes out
* properly when we replay the extents.
*/
- item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
- btrfs_set_inode_nbytes(eb, item, 0);
+ item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_inode_item);
+ btrfs_set_inode_nbytes(wc->log_leaf, item, 0);
/*
* If this is a directory we need to reset the i_size to 0 so
* that we can set it up properly when replaying the rest of
* the items in this log.
*/
- mode = btrfs_inode_mode(eb, item);
+ mode = btrfs_inode_mode(wc->log_leaf, item);
if (S_ISDIR(mode))
- btrfs_set_inode_size(eb, item, 0);
+ btrfs_set_inode_size(wc->log_leaf, item, 0);
}
insert:
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
/* try to insert the key into the destination tree */
- path->skip_release_on_error = 1;
- ret = btrfs_insert_empty_item(trans, root, path,
- key, item_size);
- path->skip_release_on_error = 0;
+ wc->subvol_path->skip_release_on_error = 1;
+ ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &wc->log_key, item_size);
+ wc->subvol_path->skip_release_on_error = 0;
- dst_eb = path->nodes[0];
- dst_slot = path->slots[0];
+ dst_eb = wc->subvol_path->nodes[0];
+ dst_slot = wc->subvol_path->slots[0];
/* make sure any existing item is the correct size */
if (ret == -EEXIST || ret == -EOVERFLOW) {
const u32 found_size = btrfs_item_size(dst_eb, dst_slot);
if (found_size > item_size)
- btrfs_truncate_item(trans, path, item_size, 1);
+ btrfs_truncate_item(trans, wc->subvol_path, item_size, 1);
else if (found_size < item_size)
- btrfs_extend_item(trans, path, item_size - found_size);
+ btrfs_extend_item(trans, wc->subvol_path, item_size - found_size);
} else if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to insert item for key (%llu %u %llu)",
+ wc->log_key.objectid, wc->log_key.type,
+ wc->log_key.offset);
return ret;
}
dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
@@ -538,15 +634,15 @@ insert:
* state of the tree found in the subvolume, and i_size is modified
* as it goes
*/
- if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
+ if (is_inode_item && ret == -EEXIST) {
struct btrfs_inode_item *src_item;
struct btrfs_inode_item *dst_item;
src_item = (struct btrfs_inode_item *)src_ptr;
dst_item = (struct btrfs_inode_item *)dst_ptr;
- if (btrfs_inode_generation(eb, src_item) == 0) {
- const u64 ino_size = btrfs_inode_size(eb, src_item);
+ if (btrfs_inode_generation(wc->log_leaf, src_item) == 0) {
+ const u64 ino_size = btrfs_inode_size(wc->log_leaf, src_item);
/*
* For regular files an ino_size == 0 is used only when
@@ -555,21 +651,21 @@ insert:
* case don't set the size of the inode in the fs/subvol
* tree, otherwise we would be throwing valid data away.
*/
- if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
+ if (S_ISREG(btrfs_inode_mode(wc->log_leaf, src_item)) &&
S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
ino_size != 0)
btrfs_set_inode_size(dst_eb, dst_item, ino_size);
goto no_copy;
}
- if (S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
+ if (S_ISDIR(btrfs_inode_mode(wc->log_leaf, src_item)) &&
S_ISDIR(btrfs_inode_mode(dst_eb, dst_item))) {
save_old_i_size = 1;
saved_i_size = btrfs_inode_size(dst_eb, dst_item);
}
}
- copy_extent_buffer(dst_eb, eb, dst_ptr, src_ptr, item_size);
+ copy_extent_buffer(dst_eb, wc->log_leaf, dst_ptr, src_ptr, item_size);
if (save_old_i_size) {
struct btrfs_inode_item *dst_item;
@@ -579,7 +675,7 @@ insert:
}
/* make sure the generation is filled in */
- if (key->type == BTRFS_INODE_ITEM_KEY) {
+ if (is_inode_item) {
struct btrfs_inode_item *dst_item;
dst_item = (struct btrfs_inode_item *)dst_ptr;
@@ -587,7 +683,7 @@ insert:
btrfs_set_inode_generation(dst_eb, dst_item, trans->transid);
}
no_copy:
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return 0;
}
@@ -618,292 +714,354 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
* The extent is inserted into the file, dropping any existing extents
* from the file that overlap the new one.
*/
-static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static noinline int replay_one_extent(struct walk_control *wc)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_fs_info *fs_info = root->fs_info;
int found_type;
u64 extent_end;
- u64 start = key->offset;
+ const u64 start = wc->log_key.offset;
u64 nbytes = 0;
+ u64 csum_start;
+ u64 csum_end;
+ LIST_HEAD(ordered_sums);
+ u64 offset;
+ unsigned long dest_offset;
+ struct btrfs_key ins;
struct btrfs_file_extent_item *item;
struct btrfs_inode *inode = NULL;
- unsigned long size;
int ret = 0;
- item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
- found_type = btrfs_file_extent_type(eb, item);
+ item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(wc->log_leaf, item);
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
- nbytes = btrfs_file_extent_num_bytes(eb, item);
- extent_end = start + nbytes;
-
- /*
- * We don't add to the inodes nbytes if we are prealloc or a
- * hole.
- */
- if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
- nbytes = 0;
+ extent_end = start + btrfs_file_extent_num_bytes(wc->log_leaf, item);
+ /* Holes don't take up space. */
+ if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) != 0)
+ nbytes = btrfs_file_extent_num_bytes(wc->log_leaf, item);
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
- size = btrfs_file_extent_ram_bytes(eb, item);
- nbytes = btrfs_file_extent_ram_bytes(eb, item);
- extent_end = ALIGN(start + size,
- fs_info->sectorsize);
+ nbytes = btrfs_file_extent_ram_bytes(wc->log_leaf, item);
+ extent_end = ALIGN(start + nbytes, fs_info->sectorsize);
} else {
- btrfs_err(fs_info,
- "unexpected extent type=%d root=%llu inode=%llu offset=%llu",
- found_type, btrfs_root_id(root), key->objectid, key->offset);
+ btrfs_abort_log_replay(wc, -EUCLEAN,
+ "unexpected extent type=%d root=%llu inode=%llu offset=%llu",
+ found_type, btrfs_root_id(root),
+ wc->log_key.objectid, wc->log_key.offset);
return -EUCLEAN;
}
- inode = btrfs_iget_logging(key->objectid, root);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
+ inode = btrfs_iget_logging(wc->log_key.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get inode %llu for root %llu",
+ wc->log_key.objectid, btrfs_root_id(root));
+ return ret;
+ }
/*
* first check to see if we already have this extent in the
* file. This must be done before the btrfs_drop_extents run
* so we don't try to drop this extent.
*/
- ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), start, 0);
+ ret = btrfs_lookup_file_extent(trans, root, wc->subvol_path,
+ btrfs_ino(inode), start, 0);
if (ret == 0 &&
(found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
+ struct extent_buffer *leaf = wc->subvol_path->nodes[0];
struct btrfs_file_extent_item existing;
unsigned long ptr;
- ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
- read_extent_buffer(path->nodes[0], &existing, ptr, sizeof(existing));
+ ptr = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]);
+ read_extent_buffer(leaf, &existing, ptr, sizeof(existing));
/*
* we already have a pointer to this exact extent,
* we don't have to do anything
*/
- if (memcmp_extent_buffer(eb, &existing, (unsigned long)item,
+ if (memcmp_extent_buffer(wc->log_leaf, &existing, (unsigned long)item,
sizeof(existing)) == 0) {
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
goto out;
}
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
/* drop any overlapping extents */
drop_args.start = start;
drop_args.end = extent_end;
drop_args.drop_cache = true;
+ drop_args.path = wc->subvol_path;
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
- if (ret)
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to drop extents for inode %llu range [%llu, %llu) root %llu",
+ wc->log_key.objectid, start, extent_end,
+ btrfs_root_id(root));
goto out;
+ }
- if (found_type == BTRFS_FILE_EXTENT_REG ||
- found_type == BTRFS_FILE_EXTENT_PREALLOC) {
- u64 offset;
- unsigned long dest_offset;
- struct btrfs_key ins;
-
- if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
- btrfs_fs_incompat(fs_info, NO_HOLES))
- goto update_inode;
-
- ret = btrfs_insert_empty_item(trans, root, path, key,
- sizeof(*item));
+ if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ /* inline extents are easy, we just overwrite them */
+ ret = overwrite_item(wc);
if (ret)
goto out;
- dest_offset = btrfs_item_ptr_offset(path->nodes[0],
- path->slots[0]);
- copy_extent_buffer(path->nodes[0], eb, dest_offset,
- (unsigned long)item, sizeof(*item));
+ goto update_inode;
+ }
- ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
- ins.type = BTRFS_EXTENT_ITEM_KEY;
- ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
- offset = key->offset - btrfs_file_extent_offset(eb, item);
+ /*
+ * If not an inline extent, it can only be a regular or prealloc one.
+ * We have checked that above and returned -EUCLEAN if not.
+ */
- /*
- * Manually record dirty extent, as here we did a shallow
- * file extent item copy and skip normal backref update,
- * but modifying extent tree all by ourselves.
- * So need to manually record dirty extent for qgroup,
- * as the owner of the file extent changed from log tree
- * (doesn't affect qgroup) to fs/file tree(affects qgroup)
- */
- ret = btrfs_qgroup_trace_extent(trans,
- btrfs_file_extent_disk_bytenr(eb, item),
- btrfs_file_extent_disk_num_bytes(eb, item));
- if (ret < 0)
- goto out;
+ /* A hole and NO_HOLES feature enabled, nothing else to do. */
+ if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0 &&
+ btrfs_fs_incompat(fs_info, NO_HOLES))
+ goto update_inode;
- if (ins.objectid > 0) {
- u64 csum_start;
- u64 csum_end;
- LIST_HEAD(ordered_sums);
+ ret = btrfs_insert_empty_item(trans, root, wc->subvol_path,
+ &wc->log_key, sizeof(*item));
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to insert item with key (%llu %u %llu) root %llu",
+ wc->log_key.objectid, wc->log_key.type,
+ wc->log_key.offset, btrfs_root_id(root));
+ goto out;
+ }
+ dest_offset = btrfs_item_ptr_offset(wc->subvol_path->nodes[0],
+ wc->subvol_path->slots[0]);
+ copy_extent_buffer(wc->subvol_path->nodes[0], wc->log_leaf, dest_offset,
+ (unsigned long)item, sizeof(*item));
- /*
- * is this extent already allocated in the extent
- * allocation tree? If so, just add a reference
- */
- ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
- ins.offset);
- if (ret < 0) {
- goto out;
- } else if (ret == 0) {
- struct btrfs_ref ref = {
- .action = BTRFS_ADD_DELAYED_REF,
- .bytenr = ins.objectid,
- .num_bytes = ins.offset,
- .owning_root = btrfs_root_id(root),
- .ref_root = btrfs_root_id(root),
- };
- btrfs_init_data_ref(&ref, key->objectid, offset,
- 0, false);
- ret = btrfs_inc_extent_ref(trans, &ref);
- if (ret)
- goto out;
- } else {
- /*
- * insert the extent pointer in the extent
- * allocation tree
- */
- ret = btrfs_alloc_logged_file_extent(trans,
- btrfs_root_id(root),
- key->objectid, offset, &ins);
- if (ret)
- goto out;
- }
- btrfs_release_path(path);
+ /*
+ * We have an explicit hole and NO_HOLES is not enabled. We have added
+ * the hole file extent item to the subvolume tree, so we don't have
+ * anything else to do other than update the file extent item range and
+ * update the inode item.
+ */
+ if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0) {
+ btrfs_release_path(wc->subvol_path);
+ goto update_inode;
+ }
- if (btrfs_file_extent_compression(eb, item)) {
- csum_start = ins.objectid;
- csum_end = csum_start + ins.offset;
- } else {
- csum_start = ins.objectid +
- btrfs_file_extent_offset(eb, item);
- csum_end = csum_start +
- btrfs_file_extent_num_bytes(eb, item);
- }
+ ins.objectid = btrfs_file_extent_disk_bytenr(wc->log_leaf, item);
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+ ins.offset = btrfs_file_extent_disk_num_bytes(wc->log_leaf, item);
+ offset = wc->log_key.offset - btrfs_file_extent_offset(wc->log_leaf, item);
- ret = btrfs_lookup_csums_list(root->log_root,
- csum_start, csum_end - 1,
- &ordered_sums, false);
- if (ret < 0)
- goto out;
- ret = 0;
- /*
- * Now delete all existing cums in the csum root that
- * cover our range. We do this because we can have an
- * extent that is completely referenced by one file
- * extent item and partially referenced by another
- * file extent item (like after using the clone or
- * extent_same ioctls). In this case if we end up doing
- * the replay of the one that partially references the
- * extent first, and we do not do the csum deletion
- * below, we can get 2 csum items in the csum tree that
- * overlap each other. For example, imagine our log has
- * the two following file extent items:
- *
- * key (257 EXTENT_DATA 409600)
- * extent data disk byte 12845056 nr 102400
- * extent data offset 20480 nr 20480 ram 102400
- *
- * key (257 EXTENT_DATA 819200)
- * extent data disk byte 12845056 nr 102400
- * extent data offset 0 nr 102400 ram 102400
- *
- * Where the second one fully references the 100K extent
- * that starts at disk byte 12845056, and the log tree
- * has a single csum item that covers the entire range
- * of the extent:
- *
- * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
- *
- * After the first file extent item is replayed, the
- * csum tree gets the following csum item:
- *
- * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
- *
- * Which covers the 20K sub-range starting at offset 20K
- * of our extent. Now when we replay the second file
- * extent item, if we do not delete existing csum items
- * that cover any of its blocks, we end up getting two
- * csum items in our csum tree that overlap each other:
- *
- * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
- * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
- *
- * Which is a problem, because after this anyone trying
- * to lookup up for the checksum of any block of our
- * extent starting at an offset of 40K or higher, will
- * end up looking at the second csum item only, which
- * does not contain the checksum for any block starting
- * at offset 40K or higher of our extent.
- */
- while (!list_empty(&ordered_sums)) {
- struct btrfs_ordered_sum *sums;
- struct btrfs_root *csum_root;
-
- sums = list_first_entry(&ordered_sums,
- struct btrfs_ordered_sum,
- list);
- csum_root = btrfs_csum_root(fs_info,
- sums->logical);
- if (!ret)
- ret = btrfs_del_csums(trans, csum_root,
- sums->logical,
- sums->len);
- if (!ret)
- ret = btrfs_csum_file_blocks(trans,
- csum_root,
- sums);
- list_del(&sums->list);
- kfree(sums);
- }
- if (ret)
- goto out;
- } else {
- btrfs_release_path(path);
+ /*
+ * Manually record dirty extent, as here we did a shallow file extent
+ * item copy and skip normal backref update, but modifying extent tree
+ * all by ourselves. So need to manually record dirty extent for qgroup,
+ * as the owner of the file extent changed from log tree (doesn't affect
+ * qgroup) to fs/file tree (affects qgroup).
+ */
+ ret = btrfs_qgroup_trace_extent(trans, ins.objectid, ins.offset);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to trace extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
+ ins.objectid, ins.offset,
+ wc->log_key.objectid, btrfs_root_id(root));
+ goto out;
+ }
+
+ /*
+ * Is this extent already allocated in the extent tree?
+ * If so, just add a reference.
+ */
+ ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to lookup data extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
+ ins.objectid, ins.offset,
+ wc->log_key.objectid, btrfs_root_id(root));
+ goto out;
+ } else if (ret == 0) {
+ struct btrfs_ref ref = {
+ .action = BTRFS_ADD_DELAYED_REF,
+ .bytenr = ins.objectid,
+ .num_bytes = ins.offset,
+ .owning_root = btrfs_root_id(root),
+ .ref_root = btrfs_root_id(root),
+ };
+
+ btrfs_init_data_ref(&ref, wc->log_key.objectid, offset, 0, false);
+ ret = btrfs_inc_extent_ref(trans, &ref);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to increment data extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
+ ins.objectid, ins.offset,
+ wc->log_key.objectid,
+ btrfs_root_id(root));
+ goto out;
}
- } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
- /* inline extents are easy, we just overwrite them */
- ret = overwrite_item(trans, root, path, eb, slot, key);
- if (ret)
+ } else {
+ /* Insert the extent pointer in the extent tree. */
+ ret = btrfs_alloc_logged_file_extent(trans, btrfs_root_id(root),
+ wc->log_key.objectid, offset, &ins);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to allocate logged data extent for bytenr %llu disk_num_bytes %llu offset %llu inode %llu root %llu",
+ ins.objectid, ins.offset, offset,
+ wc->log_key.objectid, btrfs_root_id(root));
goto out;
+ }
}
- ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start);
+ btrfs_release_path(wc->subvol_path);
+
+ if (btrfs_file_extent_compression(wc->log_leaf, item)) {
+ csum_start = ins.objectid;
+ csum_end = csum_start + ins.offset;
+ } else {
+ csum_start = ins.objectid + btrfs_file_extent_offset(wc->log_leaf, item);
+ csum_end = csum_start + btrfs_file_extent_num_bytes(wc->log_leaf, item);
+ }
+
+ ret = btrfs_lookup_csums_list(root->log_root, csum_start, csum_end - 1,
+ &ordered_sums, false);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookups csums for range [%llu, %llu) inode %llu root %llu",
+ csum_start, csum_end, wc->log_key.objectid,
+ btrfs_root_id(root));
+ goto out;
+ }
+ ret = 0;
+ /*
+ * Now delete all existing csums in the csum root that cover our range.
+ * We do this because we can have an extent that is completely
+ * referenced by one file extent item and partially referenced by
+ * another file extent item (like after using the clone or extent_same
+ * ioctls). In this case if we end up doing the replay of the one that
+ * partially references the extent first, and we do not do the csum
+ * deletion below, we can get 2 csum items in the csum tree that overlap
+ * each other. For example, imagine our log has the two following file
+ * extent items:
+ *
+ * key (257 EXTENT_DATA 409600)
+ * extent data disk byte 12845056 nr 102400
+ * extent data offset 20480 nr 20480 ram 102400
+ *
+ * key (257 EXTENT_DATA 819200)
+ * extent data disk byte 12845056 nr 102400
+ * extent data offset 0 nr 102400 ram 102400
+ *
+ * Where the second one fully references the 100K extent that starts at
+ * disk byte 12845056, and the log tree has a single csum item that
+ * covers the entire range of the extent:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+ *
+ * After the first file extent item is replayed, the csum tree gets the
+ * following csum item:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+ *
+ * Which covers the 20K sub-range starting at offset 20K of our extent.
+ * Now when we replay the second file extent item, if we do not delete
+ * existing csum items that cover any of its blocks, we end up getting
+ * two csum items in our csum tree that overlap each other:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+ * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+ *
+ * Which is a problem, because after this anyone trying to look up
+ * the checksum of any block of our extent starting at an offset of 40K
+ * or higher, will end up looking at the second csum item only, which
+ * does not contain the checksum for any block starting at offset 40K or
+ * higher of our extent.
+ */
+ while (!list_empty(&ordered_sums)) {
+ struct btrfs_ordered_sum *sums;
+ struct btrfs_root *csum_root;
+
+ sums = list_first_entry(&ordered_sums, struct btrfs_ordered_sum, list);
+ csum_root = btrfs_csum_root(fs_info, sums->logical);
+ if (!ret) {
+ ret = btrfs_del_csums(trans, csum_root, sums->logical,
+ sums->len);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to delete csums for range [%llu, %llu) inode %llu root %llu",
+ sums->logical,
+ sums->logical + sums->len,
+ wc->log_key.objectid,
+ btrfs_root_id(root));
+ }
+ if (!ret) {
+ ret = btrfs_csum_file_blocks(trans, csum_root, sums);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to add csums for range [%llu, %llu) inode %llu root %llu",
+ sums->logical,
+ sums->logical + sums->len,
+ wc->log_key.objectid,
+ btrfs_root_id(root));
+ }
+ list_del(&sums->list);
+ kfree(sums);
+ }
if (ret)
goto out;
update_inode:
+ ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to set file extent range [%llu, %llu) inode %llu root %llu",
+ start, extent_end, wc->log_key.objectid,
+ btrfs_root_id(root));
+ goto out;
+ }
+
btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found);
ret = btrfs_update_inode(trans, inode);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update inode %llu root %llu",
+ wc->log_key.objectid, btrfs_root_id(root));
out:
iput(&inode->vfs_inode);
return ret;
}
-static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
+static int unlink_inode_for_log_replay(struct walk_control *wc,
struct btrfs_inode *dir,
struct btrfs_inode *inode,
const struct fscrypt_str *name)
{
+ struct btrfs_trans_handle *trans = wc->trans;
int ret;
ret = btrfs_unlink_inode(trans, dir, inode, name);
- if (ret)
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to unlink inode %llu parent dir %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir), name->len,
+ name->name, btrfs_root_id(inode->root));
return ret;
+ }
/*
* Whenever we need to check if a name exists or not, we check the
* fs/subvolume tree. So after an unlink we must run delayed items, so
* that future checks for a name during log replay see that the name
* does not exists anymore.
*/
- return btrfs_run_delayed_items(trans);
+ ret = btrfs_run_delayed_items(trans);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+"failed to run delayed items current inode %llu parent dir %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir), name->len,
+ name->name, btrfs_root_id(inode->root));
+
+ return ret;
}
/*
@@ -914,39 +1072,44 @@ static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
* This is a helper function to do the unlink of a specific directory
* item
*/
-static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
+static noinline int drop_one_dir_item(struct walk_control *wc,
struct btrfs_inode *dir,
struct btrfs_dir_item *di)
{
struct btrfs_root *root = dir->root;
struct btrfs_inode *inode;
struct fscrypt_str name;
- struct extent_buffer *leaf;
+ struct extent_buffer *leaf = wc->subvol_path->nodes[0];
struct btrfs_key location;
int ret;
- leaf = path->nodes[0];
-
btrfs_dir_item_key_to_cpu(leaf, di, &location);
ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name);
- if (ret)
- return -ENOMEM;
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for dir %llu root %llu",
+ btrfs_ino(dir), btrfs_root_id(root));
+ return ret;
+ }
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
inode = btrfs_iget_logging(location.objectid, root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to open inode %llu parent dir %llu name %.*s root %llu",
+ location.objectid, btrfs_ino(dir),
+ name.len, name.name, btrfs_root_id(root));
inode = NULL;
goto out;
}
- ret = link_to_fixup_dir(trans, root, path, location.objectid);
+ ret = link_to_fixup_dir(wc, location.objectid);
if (ret)
goto out;
- ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &name);
out:
kfree(name.name);
if (inode)
@@ -1013,7 +1176,7 @@ static noinline int backref_in_log(struct btrfs_root *log,
u64 ref_objectid,
const struct fscrypt_str *name)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
path = btrfs_alloc_path();
@@ -1021,12 +1184,10 @@ static noinline int backref_in_log(struct btrfs_root *log,
return -ENOMEM;
ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
- if (ret < 0) {
- goto out;
- } else if (ret == 1) {
- ret = 0;
- goto out;
- }
+ if (ret < 0)
+ return ret;
+ if (ret == 1)
+ return 0;
if (key->type == BTRFS_INODE_EXTREF_KEY)
ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
@@ -1035,20 +1196,15 @@ static noinline int backref_in_log(struct btrfs_root *log,
else
ret = !!btrfs_find_name_in_backref(path->nodes[0],
path->slots[0], name);
-out:
- btrfs_free_path(path);
return ret;
}
-static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
- struct btrfs_root *log_root,
+static int unlink_refs_not_in_log(struct walk_control *wc,
struct btrfs_key *search_key,
struct btrfs_inode *dir,
- struct btrfs_inode *inode,
- u64 parent_objectid)
+ struct btrfs_inode *inode)
{
- struct extent_buffer *leaf = path->nodes[0];
+ struct extent_buffer *leaf = wc->subvol_path->nodes[0];
unsigned long ptr;
unsigned long ptr_end;
@@ -1057,8 +1213,8 @@ static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans,
* log. If so, we allow them to stay otherwise they must be unlinked as
* a conflict.
*/
- ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
- ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
+ ptr = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]);
+ ptr_end = ptr + btrfs_item_size(leaf, wc->subvol_path->slots[0]);
while (ptr < ptr_end) {
struct fscrypt_str victim_name;
struct btrfs_inode_ref *victim_ref;
@@ -1068,22 +1224,34 @@ static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans,
ret = read_alloc_one_name(leaf, (victim_ref + 1),
btrfs_inode_ref_name_len(leaf, victim_ref),
&victim_name);
- if (ret)
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for inode %llu parent dir %llu root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ btrfs_root_id(inode->root));
return ret;
+ }
- ret = backref_in_log(log_root, search_key, parent_objectid, &victim_name);
+ ret = backref_in_log(wc->log, search_key, btrfs_ino(dir), &victim_name);
if (ret) {
- kfree(victim_name.name);
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if backref is in log tree for inode %llu parent dir %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ victim_name.len, victim_name.name,
+ btrfs_root_id(inode->root));
+ kfree(victim_name.name);
return ret;
+ }
+ kfree(victim_name.name);
ptr = (unsigned long)(victim_ref + 1) + victim_name.len;
continue;
}
inc_nlink(&inode->vfs_inode);
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
- ret = unlink_inode_for_log_replay(trans, dir, inode, &victim_name);
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &victim_name);
kfree(victim_name.name);
if (ret)
return ret;
@@ -1093,64 +1261,64 @@ static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans,
return 0;
}
-static int unlink_extrefs_not_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
- struct btrfs_root *root,
- struct btrfs_root *log_root,
+static int unlink_extrefs_not_in_log(struct walk_control *wc,
struct btrfs_key *search_key,
- struct btrfs_inode *inode,
- u64 inode_objectid,
- u64 parent_objectid)
+ struct btrfs_inode *dir,
+ struct btrfs_inode *inode)
{
- struct extent_buffer *leaf = path->nodes[0];
- const unsigned long base = btrfs_item_ptr_offset(leaf, path->slots[0]);
- const u32 item_size = btrfs_item_size(leaf, path->slots[0]);
+ struct extent_buffer *leaf = wc->subvol_path->nodes[0];
+ const unsigned long base = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]);
+ const u32 item_size = btrfs_item_size(leaf, wc->subvol_path->slots[0]);
u32 cur_offset = 0;
while (cur_offset < item_size) {
+ struct btrfs_root *log_root = wc->log;
struct btrfs_inode_extref *extref;
- struct btrfs_inode *victim_parent;
struct fscrypt_str victim_name;
int ret;
extref = (struct btrfs_inode_extref *)(base + cur_offset);
victim_name.len = btrfs_inode_extref_name_len(leaf, extref);
- if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
+ if (btrfs_inode_extref_parent(leaf, extref) != btrfs_ino(dir))
goto next;
ret = read_alloc_one_name(leaf, &extref->name, victim_name.len,
&victim_name);
- if (ret)
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for inode %llu parent dir %llu root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ btrfs_root_id(inode->root));
return ret;
+ }
- search_key->objectid = inode_objectid;
+ search_key->objectid = btrfs_ino(inode);
search_key->type = BTRFS_INODE_EXTREF_KEY;
- search_key->offset = btrfs_extref_hash(parent_objectid,
+ search_key->offset = btrfs_extref_hash(btrfs_ino(dir),
victim_name.name,
victim_name.len);
- ret = backref_in_log(log_root, search_key, parent_objectid, &victim_name);
+ ret = backref_in_log(log_root, search_key, btrfs_ino(dir), &victim_name);
if (ret) {
- kfree(victim_name.name);
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if backref is in log tree for inode %llu parent dir %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ victim_name.len, victim_name.name,
+ btrfs_root_id(inode->root));
+ kfree(victim_name.name);
return ret;
+ }
+ kfree(victim_name.name);
next:
cur_offset += victim_name.len + sizeof(*extref);
continue;
}
- victim_parent = btrfs_iget_logging(parent_objectid, root);
- if (IS_ERR(victim_parent)) {
- kfree(victim_name.name);
- return PTR_ERR(victim_parent);
- }
-
inc_nlink(&inode->vfs_inode);
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
- ret = unlink_inode_for_log_replay(trans, victim_parent, inode,
- &victim_name);
- iput(&victim_parent->vfs_inode);
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &victim_name);
kfree(victim_name.name);
if (ret)
return ret;
@@ -1160,27 +1328,29 @@ next:
return 0;
}
-static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_root *log_root,
+static inline int __add_inode_ref(struct walk_control *wc,
struct btrfs_inode *dir,
struct btrfs_inode *inode,
- u64 inode_objectid, u64 parent_objectid,
u64 ref_index, struct fscrypt_str *name)
{
int ret;
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
struct btrfs_dir_item *di;
struct btrfs_key search_key;
struct btrfs_inode_extref *extref;
again:
/* Search old style refs */
- search_key.objectid = inode_objectid;
+ search_key.objectid = btrfs_ino(inode);
search_key.type = BTRFS_INODE_REF_KEY;
- search_key.offset = parent_objectid;
- ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ search_key.offset = btrfs_ino(dir);
+ ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0);
if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search subvolume tree for key (%llu %u %llu) root %llu",
+ search_key.objectid, search_key.type,
+ search_key.offset, btrfs_root_id(root));
return ret;
} else if (ret == 0) {
/*
@@ -1190,52 +1360,60 @@ again:
if (search_key.objectid == search_key.offset)
return 1;
- ret = unlink_refs_not_in_log(trans, path, log_root, &search_key,
- dir, inode, parent_objectid);
+ ret = unlink_refs_not_in_log(wc, &search_key, dir, inode);
if (ret == -EAGAIN)
goto again;
else if (ret)
return ret;
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
/* Same search but for extended refs */
- extref = btrfs_lookup_inode_extref(root, path, name, inode_objectid, parent_objectid);
+ extref = btrfs_lookup_inode_extref(root, wc->subvol_path, name,
+ btrfs_ino(inode), btrfs_ino(dir));
if (IS_ERR(extref)) {
return PTR_ERR(extref);
} else if (extref) {
- ret = unlink_extrefs_not_in_log(trans, path, root, log_root,
- &search_key, inode,
- inode_objectid, parent_objectid);
+ ret = unlink_extrefs_not_in_log(wc, &search_key, dir, inode);
if (ret == -EAGAIN)
goto again;
else if (ret)
return ret;
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
/* look for a conflicting sequence number */
- di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
+ di = btrfs_lookup_dir_index_item(trans, root, wc->subvol_path, btrfs_ino(dir),
ref_index, name, 0);
if (IS_ERR(di)) {
- return PTR_ERR(di);
+ ret = PTR_ERR(di);
+ btrfs_abort_log_replay(wc, ret,
+"failed to lookup dir index item for dir %llu ref_index %llu name %.*s root %llu",
+ btrfs_ino(dir), ref_index, name->len,
+ name->name, btrfs_root_id(root));
+ return ret;
} else if (di) {
- ret = drop_one_dir_item(trans, path, dir, di);
+ ret = drop_one_dir_item(wc, dir, di);
if (ret)
return ret;
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
/* look for a conflicting name */
- di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0);
+ di = btrfs_lookup_dir_item(trans, root, wc->subvol_path, btrfs_ino(dir), name, 0);
if (IS_ERR(di)) {
- return PTR_ERR(di);
+ ret = PTR_ERR(di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir item for dir %llu name %.*s root %llu",
+ btrfs_ino(dir), name->len, name->name,
+ btrfs_root_id(root));
+ return ret;
} else if (di) {
- ret = drop_one_dir_item(trans, path, dir, di);
+ ret = drop_one_dir_item(wc, dir, di);
if (ret)
return ret;
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return 0;
}
@@ -1288,63 +1466,79 @@ static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
* proper unlink of that name (that is, remove its entry from the inode
* reference item and both dir index keys).
*/
-static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_inode *inode,
- struct extent_buffer *log_eb,
- int log_slot,
- struct btrfs_key *key)
+static int unlink_old_inode_refs(struct walk_control *wc, struct btrfs_inode *inode)
{
+ struct btrfs_root *root = wc->root;
int ret;
unsigned long ref_ptr;
unsigned long ref_end;
struct extent_buffer *eb;
again:
- btrfs_release_path(path);
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ btrfs_release_path(wc->subvol_path);
+ ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0);
if (ret > 0) {
ret = 0;
goto out;
}
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search subvolume tree for key (%llu %u %llu) root %llu",
+ wc->log_key.objectid, wc->log_key.type,
+ wc->log_key.offset, btrfs_root_id(root));
goto out;
+ }
- eb = path->nodes[0];
- ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
- ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
+ eb = wc->subvol_path->nodes[0];
+ ref_ptr = btrfs_item_ptr_offset(eb, wc->subvol_path->slots[0]);
+ ref_end = ref_ptr + btrfs_item_size(eb, wc->subvol_path->slots[0]);
while (ref_ptr < ref_end) {
struct fscrypt_str name;
u64 parent_id;
- if (key->type == BTRFS_INODE_EXTREF_KEY) {
+ if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY) {
ret = extref_get_fields(eb, ref_ptr, &name,
NULL, &parent_id);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get extref details for inode %llu root %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root));
+ goto out;
+ }
} else {
- parent_id = key->offset;
+ parent_id = wc->log_key.offset;
ret = ref_get_fields(eb, ref_ptr, &name, NULL);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get ref details for inode %llu parent_id %llu root %llu",
+ btrfs_ino(inode), parent_id,
+ btrfs_root_id(root));
+ goto out;
+ }
}
- if (ret)
- goto out;
- if (key->type == BTRFS_INODE_EXTREF_KEY)
- ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
+ if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY)
+ ret = !!btrfs_find_name_in_ext_backref(wc->log_leaf, wc->log_slot,
parent_id, &name);
else
- ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name);
+ ret = !!btrfs_find_name_in_backref(wc->log_leaf, wc->log_slot,
+ &name);
if (!ret) {
struct btrfs_inode *dir;
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
dir = btrfs_iget_logging(parent_id, root);
if (IS_ERR(dir)) {
ret = PTR_ERR(dir);
kfree(name.name);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ parent_id, btrfs_root_id(root));
goto out;
}
- ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &name);
kfree(name.name);
iput(&dir->vfs_inode);
if (ret)
@@ -1354,56 +1548,51 @@ again:
kfree(name.name);
ref_ptr += name.len;
- if (key->type == BTRFS_INODE_EXTREF_KEY)
+ if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY)
ref_ptr += sizeof(struct btrfs_inode_extref);
else
ref_ptr += sizeof(struct btrfs_inode_ref);
}
ret = 0;
out:
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return ret;
}
/*
- * replay one inode back reference item found in the log tree.
- * eb, slot and key refer to the buffer and key found in the log tree.
- * root is the destination we are replaying into, and path is for temp
- * use by this function. (it should be released on return).
+ * Replay one inode back reference item found in the log tree.
+ * Path is for temporary use by this function (it should be released on return).
*/
-static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *log,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static noinline int add_inode_ref(struct walk_control *wc)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
struct btrfs_inode *dir = NULL;
struct btrfs_inode *inode = NULL;
unsigned long ref_ptr;
unsigned long ref_end;
struct fscrypt_str name = { 0 };
int ret;
- const bool is_extref_item = (key->type == BTRFS_INODE_EXTREF_KEY);
+ const bool is_extref_item = (wc->log_key.type == BTRFS_INODE_EXTREF_KEY);
u64 parent_objectid;
u64 inode_objectid;
u64 ref_index = 0;
int ref_struct_size;
- ref_ptr = btrfs_item_ptr_offset(eb, slot);
- ref_end = ref_ptr + btrfs_item_size(eb, slot);
+ ref_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot);
+ ref_end = ref_ptr + btrfs_item_size(wc->log_leaf, wc->log_slot);
if (is_extref_item) {
struct btrfs_inode_extref *r;
ref_struct_size = sizeof(struct btrfs_inode_extref);
r = (struct btrfs_inode_extref *)ref_ptr;
- parent_objectid = btrfs_inode_extref_parent(eb, r);
+ parent_objectid = btrfs_inode_extref_parent(wc->log_leaf, r);
} else {
ref_struct_size = sizeof(struct btrfs_inode_ref);
- parent_objectid = key->offset;
+ parent_objectid = wc->log_key.offset;
}
- inode_objectid = key->objectid;
+ inode_objectid = wc->log_key.objectid;
/*
* it is possible that we didn't log all the parent directories
@@ -1416,6 +1605,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
ret = PTR_ERR(dir);
if (ret == -ENOENT)
ret = 0;
+ else
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ parent_objectid, btrfs_root_id(root));
dir = NULL;
goto out;
}
@@ -1423,16 +1616,24 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
inode = btrfs_iget_logging(inode_objectid, root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ inode_objectid, btrfs_root_id(root));
inode = NULL;
goto out;
}
while (ref_ptr < ref_end) {
if (is_extref_item) {
- ret = extref_get_fields(eb, ref_ptr, &name,
+ ret = extref_get_fields(wc->log_leaf, ref_ptr, &name,
&ref_index, &parent_objectid);
- if (ret)
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get extref details for inode %llu root %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root));
goto out;
+ }
/*
* parent object can change from one array
* item to another.
@@ -1457,19 +1658,35 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
*/
ret = 0;
goto next;
+ } else {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ parent_objectid,
+ btrfs_root_id(root));
}
goto out;
}
}
} else {
- ret = ref_get_fields(eb, ref_ptr, &name, &ref_index);
- if (ret)
+ ret = ref_get_fields(wc->log_leaf, ref_ptr, &name, &ref_index);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get ref details for inode %llu parent_objectid %llu root %llu",
+ btrfs_ino(inode),
+ parent_objectid,
+ btrfs_root_id(root));
goto out;
+ }
}
- ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
- ref_index, &name);
+ ret = inode_in_dir(root, wc->subvol_path, btrfs_ino(dir),
+ btrfs_ino(inode), ref_index, &name);
if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if inode %llu is in dir %llu ref_index %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ ref_index, name.len, name.name,
+ btrfs_root_id(root));
goto out;
} else if (ret == 0) {
/*
@@ -1479,9 +1696,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* overwrite any existing back reference, and we don't
* want to create dangling pointers in the directory.
*/
- ret = __add_inode_ref(trans, root, path, log, dir, inode,
- inode_objectid, parent_objectid,
- ref_index, &name);
+ ret = __add_inode_ref(wc, dir, inode, ref_index, &name);
if (ret) {
if (ret == 1)
ret = 0;
@@ -1490,12 +1705,24 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
/* insert our name */
ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index);
- if (ret)
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to add link for inode %llu in dir %llu ref_index %llu name %.*s root %llu",
+ btrfs_ino(inode),
+ btrfs_ino(dir), ref_index,
+ name.len, name.name,
+ btrfs_root_id(root));
goto out;
+ }
ret = btrfs_update_inode(trans, inode);
- if (ret)
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update inode %llu root %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root));
goto out;
+ }
}
/* Else, ret == 1, we already have a perfect match, we're done. */
@@ -1517,14 +1744,14 @@ next:
* dir index entries exist for a name but there is no inode reference
* item with the same name.
*/
- ret = unlink_old_inode_refs(trans, root, path, inode, eb, slot, key);
+ ret = unlink_old_inode_refs(wc, inode);
if (ret)
goto out;
/* finally write the back reference in the inode */
- ret = overwrite_item(trans, root, path, eb, slot, key);
+ ret = overwrite_item(wc);
out:
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
kfree(name.name);
if (dir)
iput(&dir->vfs_inode);
@@ -1642,26 +1869,22 @@ process_slot:
* number of back refs found. If it goes down to zero, the iput
* will free the inode.
*/
-static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
+static noinline int fixup_inode_link_count(struct walk_control *wc,
struct btrfs_inode *inode)
{
+ struct btrfs_trans_handle *trans = wc->trans;
struct btrfs_root *root = inode->root;
- struct btrfs_path *path;
int ret;
u64 nlink = 0;
const u64 ino = btrfs_ino(inode);
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- ret = count_inode_refs(inode, path);
+ ret = count_inode_refs(inode, wc->subvol_path);
if (ret < 0)
goto out;
nlink = ret;
- ret = count_inode_extrefs(inode, path);
+ ret = count_inode_extrefs(inode, wc->subvol_path);
if (ret < 0)
goto out;
@@ -1680,7 +1903,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (inode->vfs_inode.i_nlink == 0) {
if (S_ISDIR(inode->vfs_inode.i_mode)) {
- ret = replay_dir_deletes(trans, root, NULL, path, ino, true);
+ ret = replay_dir_deletes(wc, ino, true);
if (ret)
goto out;
}
@@ -1690,13 +1913,11 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
}
out:
- btrfs_free_path(path);
+ btrfs_release_path(wc->subvol_path);
return ret;
}
-static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path)
+static noinline int fixup_inode_link_counts(struct walk_control *wc)
{
int ret;
struct btrfs_key key;
@@ -1705,48 +1926,50 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = (u64)-1;
while (1) {
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
struct btrfs_inode *inode;
- ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ ret = btrfs_search_slot(trans, root, &key, wc->subvol_path, -1, 1);
if (ret < 0)
break;
if (ret == 1) {
ret = 0;
- if (path->slots[0] == 0)
+ if (wc->subvol_path->slots[0] == 0)
break;
- path->slots[0]--;
+ wc->subvol_path->slots[0]--;
}
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &key, wc->subvol_path->slots[0]);
if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
key.type != BTRFS_ORPHAN_ITEM_KEY)
break;
- ret = btrfs_del_item(trans, root, path);
+ ret = btrfs_del_item(trans, root, wc->subvol_path);
if (ret)
break;
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
inode = btrfs_iget_logging(key.offset, root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
break;
}
- ret = fixup_inode_link_count(trans, inode);
+ ret = fixup_inode_link_count(wc, inode);
iput(&inode->vfs_inode);
if (ret)
break;
/*
* fixup on a directory may create new entries,
- * make sure we always look for the highset possible
+ * make sure we always look for the highest possible
* offset
*/
key.offset = (u64)-1;
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return ret;
}
@@ -1756,36 +1979,48 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
* count when replay is done. The link count is incremented here
* so the inode won't go away until we check it
*/
-static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- u64 objectid)
+static noinline int link_to_fixup_dir(struct walk_control *wc, u64 objectid)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
struct btrfs_key key;
int ret = 0;
struct btrfs_inode *inode;
struct inode *vfs_inode;
inode = btrfs_iget_logging(objectid, root);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ objectid, btrfs_root_id(root));
+ return ret;
+ }
vfs_inode = &inode->vfs_inode;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = objectid;
- ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+ ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &key, 0);
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
if (ret == 0) {
if (!vfs_inode->i_nlink)
set_nlink(vfs_inode, 1);
else
inc_nlink(vfs_inode);
ret = btrfs_update_inode(trans, inode);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update inode %llu root %llu",
+ objectid, btrfs_root_id(root));
} else if (ret == -EEXIST) {
ret = 0;
+ } else {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to insert fixup item for inode %llu root %llu",
+ objectid, btrfs_root_id(root));
}
iput(vfs_inode);
@@ -1826,9 +2061,8 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
return ret;
}
-static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
+static int delete_conflicting_dir_entry(struct walk_control *wc,
struct btrfs_inode *dir,
- struct btrfs_path *path,
struct btrfs_dir_item *dst_di,
const struct btrfs_key *log_key,
u8 log_flags,
@@ -1836,12 +2070,12 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
{
struct btrfs_key found_key;
- btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
+ btrfs_dir_item_key_to_cpu(wc->subvol_path->nodes[0], dst_di, &found_key);
/* The existing dentry points to the same inode, don't delete it. */
if (found_key.objectid == log_key->objectid &&
found_key.type == log_key->type &&
found_key.offset == log_key->offset &&
- btrfs_dir_flags(path->nodes[0], dst_di) == log_flags)
+ btrfs_dir_flags(wc->subvol_path->nodes[0], dst_di) == log_flags)
return 1;
/*
@@ -1851,7 +2085,7 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
if (!exists)
return 0;
- return drop_one_dir_item(trans, path, dir, dst_di);
+ return drop_one_dir_item(wc, dir, dst_di);
}
/*
@@ -1870,13 +2104,10 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
* Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
* non-existing inode) and 1 if the name was replayed.
*/
-static noinline int replay_one_name(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb,
- struct btrfs_dir_item *di,
- struct btrfs_key *key)
+static noinline int replay_one_name(struct walk_control *wc, struct btrfs_dir_item *di)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
struct fscrypt_str name = { 0 };
struct btrfs_dir_item *dir_dst_di;
struct btrfs_dir_item *index_dst_di;
@@ -1891,53 +2122,85 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
bool update_size = true;
bool name_added = false;
- dir = btrfs_iget_logging(key->objectid, root);
- if (IS_ERR(dir))
- return PTR_ERR(dir);
+ dir = btrfs_iget_logging(wc->log_key.objectid, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ wc->log_key.objectid, btrfs_root_id(root));
+ return ret;
+ }
- ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
- if (ret)
+ ret = read_alloc_one_name(wc->log_leaf, di + 1,
+ btrfs_dir_name_len(wc->log_leaf, di), &name);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for dir %llu root %llu",
+ btrfs_ino(dir), btrfs_root_id(root));
goto out;
+ }
- log_flags = btrfs_dir_flags(eb, di);
- btrfs_dir_item_key_to_cpu(eb, di, &log_key);
- ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
- btrfs_release_path(path);
- if (ret < 0)
+ log_flags = btrfs_dir_flags(wc->log_leaf, di);
+ btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &log_key);
+ ret = btrfs_lookup_inode(trans, root, wc->subvol_path, &log_key, 0);
+ btrfs_release_path(wc->subvol_path);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ log_key.objectid, btrfs_root_id(root));
goto out;
+ }
exists = (ret == 0);
ret = 0;
- dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
- &name, 1);
+ dir_dst_di = btrfs_lookup_dir_item(trans, root, wc->subvol_path,
+ wc->log_key.objectid, &name, 1);
if (IS_ERR(dir_dst_di)) {
ret = PTR_ERR(dir_dst_di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir item for dir %llu name %.*s root %llu",
+ wc->log_key.objectid, name.len, name.name,
+ btrfs_root_id(root));
goto out;
} else if (dir_dst_di) {
- ret = delete_conflicting_dir_entry(trans, dir, path, dir_dst_di,
+ ret = delete_conflicting_dir_entry(wc, dir, dir_dst_di,
&log_key, log_flags, exists);
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to delete conflicting entry for dir %llu name %.*s root %llu",
+ btrfs_ino(dir), name.len, name.name,
+ btrfs_root_id(root));
goto out;
+ }
dir_dst_matches = (ret == 1);
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
- index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
- key->objectid, key->offset,
- &name, 1);
+ index_dst_di = btrfs_lookup_dir_index_item(trans, root, wc->subvol_path,
+ wc->log_key.objectid,
+ wc->log_key.offset, &name, 1);
if (IS_ERR(index_dst_di)) {
ret = PTR_ERR(index_dst_di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir index item for dir %llu name %.*s root %llu",
+ wc->log_key.objectid, name.len, name.name,
+ btrfs_root_id(root));
goto out;
} else if (index_dst_di) {
- ret = delete_conflicting_dir_entry(trans, dir, path, index_dst_di,
+ ret = delete_conflicting_dir_entry(wc, dir, index_dst_di,
&log_key, log_flags, exists);
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to delete conflicting entry for dir %llu name %.*s root %llu",
+ btrfs_ino(dir), name.len, name.name,
+ btrfs_root_id(root));
goto out;
+ }
index_dst_matches = (ret == 1);
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
if (dir_dst_matches && index_dst_matches) {
ret = 0;
@@ -1951,9 +2214,13 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
*/
search_key.objectid = log_key.objectid;
search_key.type = BTRFS_INODE_REF_KEY;
- search_key.offset = key->objectid;
+ search_key.offset = wc->log_key.objectid;
ret = backref_in_log(root->log_root, &search_key, 0, &name);
if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if ref item is logged for inode %llu dir %llu name %.*s root %llu",
+ search_key.objectid, btrfs_ino(dir),
+ name.len, name.name, btrfs_root_id(root));
goto out;
} else if (ret) {
/* The dentry will be added later. */
@@ -1964,9 +2231,13 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
search_key.objectid = log_key.objectid;
search_key.type = BTRFS_INODE_EXTREF_KEY;
- search_key.offset = key->objectid;
- ret = backref_in_log(root->log_root, &search_key, key->objectid, &name);
+ search_key.offset = btrfs_extref_hash(wc->log_key.objectid, name.name, name.len);
+ ret = backref_in_log(root->log_root, &search_key, wc->log_key.objectid, &name);
if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if extref item is logged for inode %llu dir %llu name %.*s root %llu",
+ search_key.objectid, btrfs_ino(dir),
+ name.len, name.name, btrfs_root_id(root));
goto out;
} else if (ret) {
/* The dentry will be added later. */
@@ -1974,11 +2245,15 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
update_size = false;
goto out;
}
- btrfs_release_path(path);
- ret = insert_one_name(trans, root, key->objectid, key->offset,
+ ret = insert_one_name(trans, root, wc->log_key.objectid, wc->log_key.offset,
&name, &log_key);
- if (ret && ret != -ENOENT && ret != -EEXIST)
+ if (ret && ret != -ENOENT && ret != -EEXIST) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to insert name %.*s for inode %llu dir %llu root %llu",
+ name.len, name.name, log_key.objectid,
+ btrfs_ino(dir), btrfs_root_id(root));
goto out;
+ }
if (!ret)
name_added = true;
update_size = false;
@@ -1988,6 +2263,10 @@ out:
if (!ret && update_size) {
btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2);
ret = btrfs_update_inode(trans, dir);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update dir inode %llu root %llu",
+ btrfs_ino(dir), btrfs_root_id(root));
}
kfree(name.name);
iput(&dir->vfs_inode);
@@ -1997,20 +2276,16 @@ out:
}
/* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
-static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static noinline int replay_one_dir_item(struct walk_control *wc)
{
int ret;
struct btrfs_dir_item *di;
/* We only log dir index keys, which only contain a single dir item. */
- ASSERT(key->type == BTRFS_DIR_INDEX_KEY);
+ ASSERT(wc->log_key.type == BTRFS_DIR_INDEX_KEY);
- di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
- ret = replay_one_name(trans, root, path, eb, di, key);
+ di = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_dir_item);
+ ret = replay_one_name(wc, di);
if (ret < 0)
return ret;
@@ -2040,17 +2315,11 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
* to ever delete the parent directory has it would result in stale
* dentries that can never be deleted.
*/
- if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) {
- struct btrfs_path *fixup_path;
+ if (ret == 1 && btrfs_dir_ftype(wc->log_leaf, di) != BTRFS_FT_DIR) {
struct btrfs_key di_key;
- fixup_path = btrfs_alloc_path();
- if (!fixup_path)
- return -ENOMEM;
-
- btrfs_dir_item_key_to_cpu(eb, di, &di_key);
- ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
- btrfs_free_path(fixup_path);
+ btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &di_key);
+ ret = link_to_fixup_dir(wc, di_key.objectid);
}
return ret;
@@ -2143,13 +2412,13 @@ out:
* item is not in the log, the item is removed and the inode it points
* to is unlinked
*/
-static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *log,
- struct btrfs_path *path,
+static noinline int check_item_in_log(struct walk_control *wc,
struct btrfs_path *log_path,
struct btrfs_inode *dir,
- struct btrfs_key *dir_key)
+ struct btrfs_key *dir_key,
+ bool force_remove)
{
+ struct btrfs_trans_handle *trans = wc->trans;
struct btrfs_root *root = dir->root;
int ret;
struct extent_buffer *eb;
@@ -2167,21 +2436,31 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
*/
ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY);
- eb = path->nodes[0];
- slot = path->slots[0];
+ eb = wc->subvol_path->nodes[0];
+ slot = wc->subvol_path->slots[0];
di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
- if (ret)
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for dir %llu index %llu root %llu",
+ btrfs_ino(dir), dir_key->offset,
+ btrfs_root_id(root));
goto out;
+ }
- if (log) {
+ if (!force_remove) {
struct btrfs_dir_item *log_di;
- log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
+ log_di = btrfs_lookup_dir_index_item(trans, wc->log, log_path,
dir_key->objectid,
dir_key->offset, &name, 0);
if (IS_ERR(log_di)) {
ret = PTR_ERR(log_di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir index item for dir %llu index %llu name %.*s root %llu",
+ btrfs_ino(dir), dir_key->offset,
+ name.len, name.name,
+ btrfs_root_id(root));
goto out;
} else if (log_di) {
/* The dentry exists in the log, we have nothing to do. */
@@ -2191,28 +2470,31 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
}
btrfs_dir_item_key_to_cpu(eb, di, &location);
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
btrfs_release_path(log_path);
inode = btrfs_iget_logging(location.objectid, root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
inode = NULL;
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ location.objectid, btrfs_root_id(root));
goto out;
}
- ret = link_to_fixup_dir(trans, root, path, location.objectid);
+ ret = link_to_fixup_dir(wc, location.objectid);
if (ret)
goto out;
inc_nlink(&inode->vfs_inode);
- ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &name);
/*
* Unlike dir item keys, dir index keys can only have one name (entry) in
* them, as there are no key collisions since each key has a unique offset
* (an index number), so we're done.
*/
out:
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
btrfs_release_path(log_path);
kfree(name.name);
if (inode)
@@ -2220,59 +2502,67 @@ out:
return ret;
}
-static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *log,
- struct btrfs_path *path,
- const u64 ino)
+static int replay_xattr_deletes(struct walk_control *wc)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
+ struct btrfs_root *log = wc->log;
struct btrfs_key search_key;
- struct btrfs_path *log_path;
- int i;
+ BTRFS_PATH_AUTO_FREE(log_path);
+ const u64 ino = wc->log_key.objectid;
int nritems;
int ret;
log_path = btrfs_alloc_path();
- if (!log_path)
+ if (!log_path) {
+ btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
return -ENOMEM;
+ }
search_key.objectid = ino;
search_key.type = BTRFS_XATTR_ITEM_KEY;
search_key.offset = 0;
again:
- ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
- if (ret < 0)
+ ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search xattrs for inode %llu root %llu",
+ ino, btrfs_root_id(root));
goto out;
+ }
process_leaf:
- nritems = btrfs_header_nritems(path->nodes[0]);
- for (i = path->slots[0]; i < nritems; i++) {
+ nritems = btrfs_header_nritems(wc->subvol_path->nodes[0]);
+ for (int i = wc->subvol_path->slots[0]; i < nritems; i++) {
struct btrfs_key key;
struct btrfs_dir_item *di;
struct btrfs_dir_item *log_di;
u32 total_size;
u32 cur;
- btrfs_item_key_to_cpu(path->nodes[0], &key, i);
+ btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &key, i);
if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
ret = 0;
goto out;
}
- di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
- total_size = btrfs_item_size(path->nodes[0], i);
+ di = btrfs_item_ptr(wc->subvol_path->nodes[0], i, struct btrfs_dir_item);
+ total_size = btrfs_item_size(wc->subvol_path->nodes[0], i);
cur = 0;
while (cur < total_size) {
- u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
- u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
+ u16 name_len = btrfs_dir_name_len(wc->subvol_path->nodes[0], di);
+ u16 data_len = btrfs_dir_data_len(wc->subvol_path->nodes[0], di);
u32 this_len = sizeof(*di) + name_len + data_len;
char *name;
name = kmalloc(name_len, GFP_NOFS);
if (!name) {
ret = -ENOMEM;
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate memory for name of length %u",
+ name_len);
goto out;
}
- read_extent_buffer(path->nodes[0], name,
+ read_extent_buffer(wc->subvol_path->nodes[0], name,
(unsigned long)(di + 1), name_len);
log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
@@ -2280,40 +2570,59 @@ process_leaf:
btrfs_release_path(log_path);
if (!log_di) {
/* Doesn't exist in log tree, so delete it. */
- btrfs_release_path(path);
- di = btrfs_lookup_xattr(trans, root, path, ino,
+ btrfs_release_path(wc->subvol_path);
+ di = btrfs_lookup_xattr(trans, root, wc->subvol_path, ino,
name, name_len, -1);
- kfree(name);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup xattr with name %.*s for inode %llu root %llu",
+ name_len, name, ino,
+ btrfs_root_id(root));
+ kfree(name);
goto out;
}
ASSERT(di);
ret = btrfs_delete_one_dir_name(trans, root,
- path, di);
- if (ret)
+ wc->subvol_path, di);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to delete xattr with name %.*s for inode %llu root %llu",
+ name_len, name, ino,
+ btrfs_root_id(root));
+ kfree(name);
goto out;
- btrfs_release_path(path);
+ }
+ btrfs_release_path(wc->subvol_path);
+ kfree(name);
search_key = key;
goto again;
}
- kfree(name);
if (IS_ERR(log_di)) {
ret = PTR_ERR(log_di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup xattr in log tree with name %.*s for inode %llu root %llu",
+ name_len, name, ino,
+ btrfs_root_id(root));
+ kfree(name);
goto out;
}
+ kfree(name);
cur += this_len;
di = (struct btrfs_dir_item *)((char *)di + this_len);
}
}
- ret = btrfs_next_leaf(root, path);
+ ret = btrfs_next_leaf(root, wc->subvol_path);
if (ret > 0)
ret = 0;
else if (ret == 0)
goto process_leaf;
+ else
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get next leaf in subvolume root %llu",
+ btrfs_root_id(root));
out:
- btrfs_free_path(log_path);
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return ret;
}
@@ -2328,12 +2637,11 @@ out:
* Anything we don't find in the log is unlinked and removed from the
* directory.
*/
-static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *log,
- struct btrfs_path *path,
+static noinline int replay_dir_deletes(struct walk_control *wc,
u64 dirid, bool del_all)
{
+ struct btrfs_root *root = wc->root;
+ struct btrfs_root *log = (del_all ? NULL : wc->log);
u64 range_start;
u64 range_end;
int ret = 0;
@@ -2345,8 +2653,10 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
dir_key.objectid = dirid;
dir_key.type = BTRFS_DIR_INDEX_KEY;
log_path = btrfs_alloc_path();
- if (!log_path)
+ if (!log_path) {
+ btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
return -ENOMEM;
+ }
dir = btrfs_iget_logging(dirid, root);
/*
@@ -2358,6 +2668,10 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
ret = PTR_ERR(dir);
if (ret == -ENOENT)
ret = 0;
+ else
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ dirid, btrfs_root_id(root));
return ret;
}
@@ -2367,32 +2681,46 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
if (del_all)
range_end = (u64)-1;
else {
- ret = find_dir_range(log, path, dirid,
+ ret = find_dir_range(log, wc->subvol_path, dirid,
&range_start, &range_end);
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to find range for dir %llu in log tree root %llu",
+ dirid, btrfs_root_id(root));
goto out;
- else if (ret > 0)
+ } else if (ret > 0) {
break;
+ }
}
dir_key.offset = range_start;
while (1) {
int nritems;
- ret = btrfs_search_slot(NULL, root, &dir_key, path,
- 0, 0);
- if (ret < 0)
+ ret = btrfs_search_slot(NULL, root, &dir_key,
+ wc->subvol_path, 0, 0);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search root %llu for key (%llu %u %llu)",
+ btrfs_root_id(root),
+ dir_key.objectid, dir_key.type,
+ dir_key.offset);
goto out;
+ }
- nritems = btrfs_header_nritems(path->nodes[0]);
- if (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 1)
+ nritems = btrfs_header_nritems(wc->subvol_path->nodes[0]);
+ if (wc->subvol_path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, wc->subvol_path);
+ if (ret == 1) {
break;
- else if (ret < 0)
+ } else if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get next leaf in subvolume root %llu",
+ btrfs_root_id(root));
goto out;
+ }
}
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- path->slots[0]);
+ btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &found_key,
+ wc->subvol_path->slots[0]);
if (found_key.objectid != dirid ||
found_key.type != dir_key.type) {
ret = 0;
@@ -2402,23 +2730,21 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
if (found_key.offset > range_end)
break;
- ret = check_item_in_log(trans, log, path,
- log_path, dir,
- &found_key);
+ ret = check_item_in_log(wc, log_path, dir, &found_key, del_all);
if (ret)
goto out;
if (found_key.offset == (u64)-1)
break;
dir_key.offset = found_key.offset + 1;
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
if (range_end == (u64)-1)
break;
range_start = range_end + 1;
}
ret = 0;
out:
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
btrfs_free_path(log_path);
iput(&dir->vfs_inode);
return ret;
@@ -2435,7 +2761,7 @@ out:
* only in the log (references come from either directory items or inode
* back refs).
*/
-static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
+static int replay_one_buffer(struct extent_buffer *eb,
struct walk_control *wc, u64 gen, int level)
{
int nritems;
@@ -2443,33 +2769,44 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
.transid = gen,
.level = level
};
- struct btrfs_path *path;
- struct btrfs_root *root = wc->replay_dest;
- struct btrfs_key key;
- int i;
+ struct btrfs_root *root = wc->root;
+ struct btrfs_trans_handle *trans = wc->trans;
int ret;
- ret = btrfs_read_extent_buffer(eb, &check);
- if (ret)
- return ret;
-
- level = btrfs_header_level(eb);
-
if (level != 0)
return 0;
- path = btrfs_alloc_path();
- if (!path)
+ /*
+	 * Set to NULL since the leaf was not yet read, so in case we abort log
+	 * replay on error, we have no valid log tree leaf to dump.
+ */
+ wc->log_leaf = NULL;
+ ret = btrfs_read_extent_buffer(eb, &check);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to read log tree leaf %llu for root %llu",
+ eb->start, btrfs_root_id(root));
+ return ret;
+ }
+
+ ASSERT(wc->subvol_path == NULL);
+ wc->subvol_path = btrfs_alloc_path();
+ if (!wc->subvol_path) {
+ btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
return -ENOMEM;
+ }
+
+ wc->log_leaf = eb;
nritems = btrfs_header_nritems(eb);
- for (i = 0; i < nritems; i++) {
+ for (wc->log_slot = 0; wc->log_slot < nritems; wc->log_slot++) {
struct btrfs_inode_item *inode_item;
- btrfs_item_key_to_cpu(eb, &key, i);
+ btrfs_item_key_to_cpu(eb, &wc->log_key, wc->log_slot);
- if (key.type == BTRFS_INODE_ITEM_KEY) {
- inode_item = btrfs_item_ptr(eb, i, struct btrfs_inode_item);
+ if (wc->log_key.type == BTRFS_INODE_ITEM_KEY) {
+ inode_item = btrfs_item_ptr(eb, wc->log_slot,
+ struct btrfs_inode_item);
/*
* An inode with no links is either:
*
@@ -2498,22 +2835,20 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
}
/* Inode keys are done during the first stage. */
- if (key.type == BTRFS_INODE_ITEM_KEY &&
+ if (wc->log_key.type == BTRFS_INODE_ITEM_KEY &&
wc->stage == LOG_WALK_REPLAY_INODES) {
u32 mode;
- ret = replay_xattr_deletes(wc->trans, root, log, path, key.objectid);
+ ret = replay_xattr_deletes(wc);
if (ret)
break;
mode = btrfs_inode_mode(eb, inode_item);
if (S_ISDIR(mode)) {
- ret = replay_dir_deletes(wc->trans, root, log, path,
- key.objectid, false);
+ ret = replay_dir_deletes(wc, wc->log_key.objectid, false);
if (ret)
break;
}
- ret = overwrite_item(wc->trans, root, path,
- eb, i, &key);
+ ret = overwrite_item(wc);
if (ret)
break;
@@ -2530,9 +2865,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
struct btrfs_inode *inode;
u64 from;
- inode = btrfs_iget_logging(key.objectid, root);
+ inode = btrfs_iget_logging(wc->log_key.objectid, root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ wc->log_key.objectid,
+ btrfs_root_id(root));
break;
}
from = ALIGN(i_size_read(&inode->vfs_inode),
@@ -2540,21 +2879,31 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
drop_args.start = from;
drop_args.end = (u64)-1;
drop_args.drop_cache = true;
- ret = btrfs_drop_extents(wc->trans, root, inode,
- &drop_args);
- if (!ret) {
+ drop_args.path = wc->subvol_path;
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to drop extents for inode %llu root %llu offset %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root),
+ from);
+ } else {
inode_sub_bytes(&inode->vfs_inode,
drop_args.bytes_found);
/* Update the inode's nbytes. */
- ret = btrfs_update_inode(wc->trans, inode);
+ ret = btrfs_update_inode(trans, inode);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update inode %llu root %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root));
}
iput(&inode->vfs_inode);
if (ret)
break;
}
- ret = link_to_fixup_dir(wc->trans, root,
- path, key.objectid);
+ ret = link_to_fixup_dir(wc, wc->log_key.objectid);
if (ret)
break;
}
@@ -2562,10 +2911,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
if (wc->ignore_cur_inode)
continue;
- if (key.type == BTRFS_DIR_INDEX_KEY &&
+ if (wc->log_key.type == BTRFS_DIR_INDEX_KEY &&
wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
- ret = replay_one_dir_item(wc->trans, root, path,
- eb, i, &key);
+ ret = replay_one_dir_item(wc);
if (ret)
break;
}
@@ -2574,20 +2922,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
continue;
/* these keys are simply copied */
- if (key.type == BTRFS_XATTR_ITEM_KEY) {
- ret = overwrite_item(wc->trans, root, path,
- eb, i, &key);
+ if (wc->log_key.type == BTRFS_XATTR_ITEM_KEY) {
+ ret = overwrite_item(wc);
if (ret)
break;
- } else if (key.type == BTRFS_INODE_REF_KEY ||
- key.type == BTRFS_INODE_EXTREF_KEY) {
- ret = add_inode_ref(wc->trans, root, log, path,
- eb, i, &key);
+ } else if (wc->log_key.type == BTRFS_INODE_REF_KEY ||
+ wc->log_key.type == BTRFS_INODE_EXTREF_KEY) {
+ ret = add_inode_ref(wc);
if (ret)
break;
- } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
- ret = replay_one_extent(wc->trans, root, path,
- eb, i, &key);
+ } else if (wc->log_key.type == BTRFS_EXTENT_DATA_KEY) {
+ ret = replay_one_extent(wc);
if (ret)
break;
}
@@ -2598,37 +2943,16 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
* older kernel with such keys, ignore them.
*/
}
- btrfs_free_path(path);
+ btrfs_free_path(wc->subvol_path);
+ wc->subvol_path = NULL;
return ret;
}
-/*
- * Correctly adjust the reserved bytes occupied by a log tree extent buffer
- */
-static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
-{
- struct btrfs_block_group *cache;
-
- cache = btrfs_lookup_block_group(fs_info, start);
- if (!cache) {
- btrfs_err(fs_info, "unable to find block group for %llu", start);
- return;
- }
-
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- cache->reserved -= fs_info->nodesize;
- cache->space_info->bytes_reserved -= fs_info->nodesize;
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
-
- btrfs_put_block_group(cache);
-}
-
static int clean_log_buffer(struct btrfs_trans_handle *trans,
struct extent_buffer *eb)
{
- int ret;
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct btrfs_block_group *bg;
btrfs_tree_lock(eb);
btrfs_clear_buffer_dirty(trans, eb);
@@ -2636,22 +2960,38 @@ static int clean_log_buffer(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(eb);
if (trans) {
+ int ret;
+
ret = btrfs_pin_reserved_extent(trans, eb);
if (ret)
- return ret;
- } else {
- unaccount_log_buffer(eb->fs_info, eb->start);
+ btrfs_abort_transaction(trans, ret);
+ return ret;
}
+ bg = btrfs_lookup_block_group(fs_info, eb->start);
+ if (!bg) {
+ btrfs_err(fs_info, "unable to find block group for %llu", eb->start);
+ btrfs_handle_fs_error(fs_info, -ENOENT, NULL);
+ return -ENOENT;
+ }
+
+ spin_lock(&bg->space_info->lock);
+ spin_lock(&bg->lock);
+ bg->reserved -= fs_info->nodesize;
+ bg->space_info->bytes_reserved -= fs_info->nodesize;
+ spin_unlock(&bg->lock);
+ spin_unlock(&bg->space_info->lock);
+
+ btrfs_put_block_group(bg);
+
return 0;
}
-static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, int *level,
- struct walk_control *wc)
+static noinline int walk_down_log_tree(struct btrfs_path *path, int *level,
+ struct walk_control *wc)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_fs_info *fs_info = wc->log->fs_info;
u64 bytenr;
u64 ptr_gen;
struct extent_buffer *next;
@@ -2679,12 +3019,17 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
next = btrfs_find_create_tree_block(fs_info, bytenr,
btrfs_header_owner(cur),
*level - 1);
- if (IS_ERR(next))
- return PTR_ERR(next);
+ if (IS_ERR(next)) {
+ ret = PTR_ERR(next);
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(fs_info, ret, NULL);
+ return ret;
+ }
if (*level == 1) {
- ret = wc->process_func(root, next, wc, ptr_gen,
- *level - 1);
+ ret = wc->process_func(next, wc, ptr_gen, *level - 1);
if (ret) {
free_extent_buffer(next);
return ret;
@@ -2695,6 +3040,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
ret = btrfs_read_extent_buffer(next, &check);
if (ret) {
free_extent_buffer(next);
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(fs_info, ret, NULL);
return ret;
}
@@ -2710,6 +3059,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
ret = btrfs_read_extent_buffer(next, &check);
if (ret) {
free_extent_buffer(next);
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(fs_info, ret, NULL);
return ret;
}
@@ -2726,10 +3079,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
return 0;
}
-static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, int *level,
- struct walk_control *wc)
+static noinline int walk_up_log_tree(struct btrfs_path *path, int *level,
+ struct walk_control *wc)
{
int i;
int slot;
@@ -2743,14 +3094,14 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(*level == 0);
return 0;
} else {
- ret = wc->process_func(root, path->nodes[*level], wc,
+ ret = wc->process_func(path->nodes[*level], wc,
btrfs_header_generation(path->nodes[*level]),
*level);
if (ret)
return ret;
if (wc->free) {
- ret = clean_log_buffer(trans, path->nodes[*level]);
+ ret = clean_log_buffer(wc->trans, path->nodes[*level]);
if (ret)
return ret;
}
@@ -2767,13 +3118,13 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
* the tree freeing any blocks that have a ref count of zero after being
* decremented.
*/
-static int walk_log_tree(struct btrfs_trans_handle *trans,
- struct btrfs_root *log, struct walk_control *wc)
+static int walk_log_tree(struct walk_control *wc)
{
+ struct btrfs_root *log = wc->log;
int ret = 0;
int wret;
int level;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int orig_level;
path = btrfs_alloc_path();
@@ -2787,36 +3138,30 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
path->slots[level] = 0;
while (1) {
- wret = walk_down_log_tree(trans, log, path, &level, wc);
+ wret = walk_down_log_tree(path, &level, wc);
if (wret > 0)
break;
- if (wret < 0) {
- ret = wret;
- goto out;
- }
+ if (wret < 0)
+ return wret;
- wret = walk_up_log_tree(trans, log, path, &level, wc);
+ wret = walk_up_log_tree(path, &level, wc);
if (wret > 0)
break;
- if (wret < 0) {
- ret = wret;
- goto out;
- }
+ if (wret < 0)
+ return wret;
}
/* was the root node processed? if not, catch it here */
if (path->nodes[orig_level]) {
- ret = wc->process_func(log, path->nodes[orig_level], wc,
+ ret = wc->process_func(path->nodes[orig_level], wc,
btrfs_header_generation(path->nodes[orig_level]),
orig_level);
if (ret)
- goto out;
+ return ret;
if (wc->free)
- ret = clean_log_buffer(trans, path->nodes[orig_level]);
+ ret = clean_log_buffer(wc->trans, path->nodes[orig_level]);
}
-out:
- btrfs_free_path(path);
return ret;
}
@@ -3225,7 +3570,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
ret = write_all_supers(fs_info, 1);
mutex_unlock(&fs_info->tree_log_mutex);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_set_log_full_commit(trans);
btrfs_abort_transaction(trans, ret);
goto out_wake_log_root;
@@ -3277,12 +3622,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
{
int ret;
struct walk_control wc = {
- .free = 1,
- .process_func = process_one_buffer
+ .free = true,
+ .process_func = process_one_buffer,
+ .log = log,
+ .trans = trans,
};
if (log->node) {
- ret = walk_log_tree(trans, log, &wc);
+ ret = walk_log_tree(&wc);
if (ret) {
/*
* We weren't able to traverse the entire log tree, the
@@ -3345,6 +3692,31 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
return 0;
}
+static bool mark_inode_as_not_logged(const struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode)
+{
+ bool ret = false;
+
+ /*
+	 * Do this only if ->logged_trans is still 0, to prevent races with
+	 * concurrent logging: inode_logged() may see the inode as not logged,
+	 * the inode then gets logged by a concurrent task after inode_logged()
+	 * did not find it in the log tree, and we would end up setting
+	 * ->logged_trans to a value less than trans->transid after that task
+	 * has already set it to trans->transid. As a consequence, subsequent
+	 * rename, unlink and link operations may end up not logging new names
+	 * and not removing old names from the log.
+ */
+ spin_lock(&inode->lock);
+ if (inode->logged_trans == 0)
+ inode->logged_trans = trans->transid - 1;
+ else if (inode->logged_trans == trans->transid)
+ ret = true;
+ spin_unlock(&inode->lock);
+
+ return ret;
+}
+
/*
* Check if an inode was logged in the current transaction. This correctly deals
* with the case where the inode was logged but has a logged_trans of 0, which
@@ -3362,15 +3734,32 @@ static int inode_logged(const struct btrfs_trans_handle *trans,
struct btrfs_key key;
int ret;
- if (inode->logged_trans == trans->transid)
+ /*
+	 * Quick lockless check, since once ->logged_trans is set to the current
+ * transaction, we never set it to a lower value anywhere else.
+ */
+ if (data_race(inode->logged_trans) == trans->transid)
return 1;
/*
- * If logged_trans is not 0, then we know the inode logged was not logged
- * in this transaction, so we can return false right away.
+ * If logged_trans is not 0 and not trans->transid, then we know the
+ * inode was not logged in this transaction, so we can return false
+ * right away. We take the lock to avoid a race caused by load/store
+ * tearing with a concurrent btrfs_log_inode() call or a concurrent task
+ * in this function further below - an update to trans->transid can be
+	 * torn into two 32-bit updates for example, in which case we could
+ * see a positive value that is not trans->transid and assume the inode
+ * was not logged when it was.
*/
- if (inode->logged_trans > 0)
+ spin_lock(&inode->lock);
+ if (inode->logged_trans == trans->transid) {
+ spin_unlock(&inode->lock);
+ return 1;
+ } else if (inode->logged_trans > 0) {
+ spin_unlock(&inode->lock);
return 0;
+ }
+ spin_unlock(&inode->lock);
/*
* If no log tree was created for this root in this transaction, then
@@ -3379,10 +3768,8 @@ static int inode_logged(const struct btrfs_trans_handle *trans,
* transaction's ID, to avoid the search below in a future call in case
* a log tree gets created after this.
*/
- if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) {
- inode->logged_trans = trans->transid - 1;
- return 0;
- }
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state))
+ return mark_inode_as_not_logged(trans, inode);
/*
* We have a log tree and the inode's logged_trans is 0. We can't tell
@@ -3436,29 +3823,17 @@ static int inode_logged(const struct btrfs_trans_handle *trans,
* Set logged_trans to a value greater than 0 and less then the
* current transaction to avoid doing the search in future calls.
*/
- inode->logged_trans = trans->transid - 1;
- return 0;
+ return mark_inode_as_not_logged(trans, inode);
}
/*
* The inode was previously logged and then evicted, set logged_trans to
- * the current transacion's ID, to avoid future tree searches as long as
+ * the current transaction's ID, to avoid future tree searches as long as
* the inode is not evicted again.
*/
+ spin_lock(&inode->lock);
inode->logged_trans = trans->transid;
-
- /*
- * If it's a directory, then we must set last_dir_index_offset to the
- * maximum possible value, so that the next attempt to log the inode does
- * not skip checking if dir index keys found in modified subvolume tree
- * leaves have been logged before, otherwise it would result in attempts
- * to insert duplicate dir index keys in the log tree. This must be done
- * because last_dir_index_offset is an in-memory only field, not persisted
- * in the inode item or any other on-disk structure, so its value is lost
- * once the inode is evicted.
- */
- if (S_ISDIR(inode->vfs_inode.i_mode))
- inode->last_dir_index_offset = (u64)-1;
+ spin_unlock(&inode->lock);
return 1;
}
@@ -3524,13 +3899,13 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
const struct fscrypt_str *name,
struct btrfs_inode *dir, u64 index)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
ret = inode_logged(trans, dir, NULL);
if (ret == 0)
return;
- else if (ret < 0) {
+ if (ret < 0) {
btrfs_set_log_full_commit(trans);
return;
}
@@ -3544,7 +3919,7 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
ret = join_running_log_trans(root);
ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret);
if (WARN_ON(ret))
- goto out;
+ return;
mutex_lock(&dir->log_mutex);
@@ -3554,8 +3929,6 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
if (ret < 0)
btrfs_set_log_full_commit(trans);
btrfs_end_log_trans(root);
-out:
- btrfs_free_path(path);
}
/* see comments for btrfs_del_dir_entries_in_log */
@@ -3668,8 +4041,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
struct btrfs_key *ins_keys;
u32 *ins_sizes;
- ins_data = kmalloc(count * sizeof(u32) +
- count * sizeof(struct btrfs_key), GFP_NOFS);
+ ins_data = kmalloc_array(count, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS);
if (!ins_data)
return -ENOMEM;
@@ -4050,7 +4422,7 @@ done:
/*
* If the inode was logged before and it was evicted, then its
- * last_dir_index_offset is (u64)-1, so we don't the value of the last index
+ * last_dir_index_offset is 0, so we don't know the value of the last index
* key offset. If that's the case, search for it and update the inode. This
* is to avoid lookups in the log tree every time we try to insert a dir index
* key from a leaf changed in the current transaction, and to allow us to always
@@ -4066,7 +4438,7 @@ static int update_last_dir_index_offset(struct btrfs_inode *inode,
lockdep_assert_held(&inode->log_mutex);
- if (inode->last_dir_index_offset != (u64)-1)
+ if (inode->last_dir_index_offset != 0)
return 0;
if (!ctx->logged_before) {
@@ -4232,7 +4604,7 @@ static int truncate_inode_items(struct btrfs_trans_handle *trans,
static void fill_inode_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf,
struct btrfs_inode_item *item,
- struct inode *inode, int log_inode_only,
+ struct inode *inode, bool log_inode_only,
u64 logged_isize)
{
u64 flags;
@@ -4328,7 +4700,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
- 0, 0);
+ false, 0);
btrfs_release_path(path);
return 0;
}
@@ -4432,8 +4804,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
src = src_path->nodes[0];
- ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
- nr * sizeof(u32), GFP_NOFS);
+ ins_data = kmalloc_array(nr, sizeof(struct btrfs_key) + sizeof(u32), GFP_NOFS);
if (!ins_data)
return -ENOMEM;
@@ -4834,7 +5205,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
struct btrfs_key key;
const u64 i_size = i_size_read(&inode->vfs_inode);
const u64 ino = btrfs_ino(inode);
- struct btrfs_path *dst_path = NULL;
+ BTRFS_PATH_AUTO_FREE(dst_path);
bool dropped_extents = false;
u64 truncate_offset = i_size;
struct extent_buffer *leaf;
@@ -4952,7 +5323,6 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
start_slot, ins_nr, 1, 0, ctx);
out:
btrfs_release_path(path);
- btrfs_free_path(dst_path);
return ret;
}
@@ -5325,7 +5695,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
u64 *other_ino, u64 *other_parent)
{
int ret;
- struct btrfs_path *search_path;
+ BTRFS_PATH_AUTO_FREE(search_path);
char *name = NULL;
u32 name_len = 0;
u32 item_size = btrfs_item_size(eb, slot);
@@ -5410,7 +5780,6 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
}
ret = 0;
out:
- btrfs_free_path(search_path);
kfree(name);
return ret;
}
@@ -6138,8 +6507,7 @@ static int log_delayed_insertion_items(struct btrfs_trans_handle *trans,
if (!first)
return 0;
- ins_data = kmalloc(max_batch_size * sizeof(u32) +
- max_batch_size * sizeof(struct btrfs_key), GFP_NOFS);
+ ins_data = kmalloc_array(max_batch_size, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS);
if (!ins_data)
return -ENOMEM;
ins_sizes = (u32 *)ins_data;
@@ -6793,7 +7161,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_root *root = inode->root;
const u64 ino = btrfs_ino(inode);
@@ -6809,7 +7177,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
while (true) {
struct extent_buffer *leaf = path->nodes[0];
@@ -6821,8 +7189,8 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
- goto out;
- else if (ret > 0)
+ return ret;
+ if (ret > 0)
break;
continue;
}
@@ -6880,10 +7248,8 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
* at both parents and the old parent B would still
* exist.
*/
- if (IS_ERR(dir_inode)) {
- ret = PTR_ERR(dir_inode);
- goto out;
- }
+ if (IS_ERR(dir_inode))
+ return PTR_ERR(dir_inode);
if (!need_log_inode(trans, dir_inode)) {
btrfs_add_delayed_iput(dir_inode);
@@ -6896,14 +7262,11 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
ret = log_new_dir_dentries(trans, dir_inode, ctx);
btrfs_add_delayed_iput(dir_inode);
if (ret)
- goto out;
+ return ret;
}
path->slots[0]++;
}
- ret = 0;
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
static int log_new_ancestors(struct btrfs_trans_handle *trans,
@@ -7014,7 +7377,7 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
{
struct btrfs_root *root = inode->root;
const u64 ino = btrfs_ino(inode);
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key search_key;
int ret;
@@ -7035,7 +7398,7 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
again:
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret == 0)
path->slots[0]++;
@@ -7047,8 +7410,8 @@ again:
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
- goto out;
- else if (ret > 0)
+ return ret;
+ if (ret > 0)
break;
continue;
}
@@ -7065,10 +7428,8 @@ again:
* this loop, etc). So just return some error to fallback to
* a transaction commit.
*/
- if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
- ret = -EMLINK;
- goto out;
- }
+ if (found_key.type == BTRFS_INODE_EXTREF_KEY)
+ return -EMLINK;
/*
* Logging ancestors needs to do more searches on the fs/subvol
@@ -7080,14 +7441,11 @@ again:
ret = log_new_ancestors(trans, root, path, ctx);
if (ret)
- goto out;
+ return ret;
btrfs_release_path(path);
goto again;
}
- ret = 0;
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
/*
@@ -7267,10 +7625,12 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
}
wc.trans = trans;
- wc.pin = 1;
+ wc.pin = true;
+ wc.log = log_root_tree;
- ret = walk_log_tree(trans, log_root_tree, &wc);
- if (ret) {
+ ret = walk_log_tree(&wc);
+ wc.log = NULL;
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error;
}
@@ -7281,12 +7641,11 @@ again:
key.offset = (u64)-1;
while (1) {
- struct btrfs_root *log;
struct btrfs_key found_key;
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto error;
}
@@ -7301,20 +7660,19 @@ again:
if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
break;
- log = btrfs_read_tree_root(log_root_tree, &found_key);
- if (IS_ERR(log)) {
- ret = PTR_ERR(log);
+ wc.log = btrfs_read_tree_root(log_root_tree, &found_key);
+ if (IS_ERR(wc.log)) {
+ ret = PTR_ERR(wc.log);
+ wc.log = NULL;
btrfs_abort_transaction(trans, ret);
goto error;
}
- wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
- true);
- if (IS_ERR(wc.replay_dest)) {
- ret = PTR_ERR(wc.replay_dest);
- wc.replay_dest = NULL;
- if (ret != -ENOENT) {
- btrfs_put_root(log);
+ wc.root = btrfs_get_fs_root(fs_info, found_key.offset, true);
+ if (IS_ERR(wc.root)) {
+ ret = PTR_ERR(wc.root);
+ wc.root = NULL;
+ if (unlikely(ret != -ENOENT)) {
btrfs_abort_transaction(trans, ret);
goto error;
}
@@ -7330,33 +7688,34 @@ again:
* block from being modified, and we'll just bail for
* each subsequent pass.
*/
- ret = btrfs_pin_extent_for_log_replay(trans, log->node);
- if (ret) {
- btrfs_put_root(log);
+ ret = btrfs_pin_extent_for_log_replay(trans, wc.log->node);
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error;
}
goto next;
}
- wc.replay_dest->log_root = log;
- ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
- if (ret) {
+ wc.root->log_root = wc.log;
+ ret = btrfs_record_root_in_trans(trans, wc.root);
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto next;
}
- ret = walk_log_tree(trans, log, &wc);
- if (ret) {
+ ret = walk_log_tree(&wc);
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto next;
}
if (wc.stage == LOG_WALK_REPLAY_ALL) {
- struct btrfs_root *root = wc.replay_dest;
+ struct btrfs_root *root = wc.root;
- ret = fixup_inode_link_counts(trans, wc.replay_dest, path);
- if (ret) {
+ wc.subvol_path = path;
+ ret = fixup_inode_link_counts(&wc);
+ wc.subvol_path = NULL;
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto next;
}
@@ -7369,17 +7728,18 @@ again:
* could only happen during mount.
*/
ret = btrfs_init_root_free_objectid(root);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto next;
}
}
next:
- if (wc.replay_dest) {
- wc.replay_dest->log_root = NULL;
- btrfs_put_root(wc.replay_dest);
+ if (wc.root) {
+ wc.root->log_root = NULL;
+ btrfs_put_root(wc.root);
}
- btrfs_put_root(log);
+ btrfs_put_root(wc.log);
+ wc.log = NULL;
if (ret)
goto error;
@@ -7391,7 +7751,7 @@ next:
/* step one is to pin it all, step two is to replay just inodes */
if (wc.pin) {
- wc.pin = 0;
+ wc.pin = false;
wc.process_func = replay_one_buffer;
wc.stage = LOG_WALK_REPLAY_INODES;
goto again;
@@ -7409,14 +7769,13 @@ next:
if (ret)
return ret;
- log_root_tree->log_root = NULL;
clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
- btrfs_put_root(log_root_tree);
return 0;
error:
if (wc.trans)
btrfs_end_transaction(wc.trans);
+ btrfs_put_root(wc.log);
clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
btrfs_free_path(path);
return ret;
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index b7a96a005487..46bd8ca58670 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -487,12 +487,12 @@ static int rollback_verity(struct btrfs_inode *inode)
inode->ro_flags &= ~BTRFS_INODE_RO_VERITY;
btrfs_sync_inode_flags_to_i_flags(inode);
ret = btrfs_update_inode(trans, inode);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = del_orphan(trans, inode);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -676,11 +676,11 @@ int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size)
if (ret < 0)
return ret;
- if (item.reserved[0] != 0 || item.reserved[1] != 0)
+ if (unlikely(item.reserved[0] != 0 || item.reserved[1] != 0))
return -EUCLEAN;
true_size = btrfs_stack_verity_descriptor_size(&item);
- if (true_size > INT_MAX)
+ if (unlikely(true_size > INT_MAX))
return -EUCLEAN;
if (buf_size == 0)
@@ -802,6 +802,8 @@ static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf,
}
const struct fsverity_operations btrfs_verityops = {
+ .inode_info_offs = (int)offsetof(struct btrfs_inode, i_verity_info) -
+ (int)offsetof(struct btrfs_inode, vfs_inode),
.begin_enable_verity = btrfs_begin_enable_verity,
.end_enable_verity = btrfs_end_enable_verity,
.get_verity_descriptor = btrfs_get_verity_descriptor,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index fa7a929a0461..2bec544d8ba3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1377,8 +1377,8 @@ struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
}
/*
- * Make sure the last byte of label is properly NUL termiated. We use
- * '%s' to print the label, if not properly NUL termiated we can access
+ * Make sure the last byte of label is properly NUL terminated. We use
+ * '%s' to print the label, if not properly NUL terminated we can access
* beyond the label.
*/
if (super->label[0] && super->label[BTRFS_LABEL_SIZE - 1])
@@ -1911,7 +1911,7 @@ static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
if (ret < 0)
goto error;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/* Corruption */
btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
ret = -EUCLEAN;
@@ -2243,7 +2243,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
}
ret = btrfs_rm_dev_item(trans, device);
- if (ret) {
+ if (unlikely(ret)) {
/* Any error in dev item removal is critical */
btrfs_crit(fs_info,
"failed to remove device item for devid %llu: %d",
@@ -2722,6 +2722,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
goto error;
}
+ if (bdev_nr_bytes(file_bdev(bdev_file)) <= BTRFS_DEVICE_RANGE_RESERVED) {
+ ret = -EINVAL;
+ goto error;
+ }
+
if (fs_devices->seeding) {
seeding_dev = true;
down_write(&sb->s_umount);
@@ -2838,21 +2843,21 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
mutex_lock(&fs_info->chunk_mutex);
ret = init_first_rw_device(trans);
mutex_unlock(&fs_info->chunk_mutex);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_sysfs;
}
}
ret = btrfs_add_dev_item(trans, device);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_sysfs;
}
if (seeding_dev) {
ret = btrfs_finish_sprout(trans);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_sysfs;
}
@@ -3044,7 +3049,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
goto out;
- else if (ret > 0) { /* Logic error or corruption */
+ else if (unlikely(ret > 0)) { /* Logic error or corruption */
btrfs_err(fs_info, "failed to lookup chunk %llu when freeing",
chunk_offset);
btrfs_abort_transaction(trans, -ENOENT);
@@ -3053,7 +3058,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
}
ret = btrfs_del_item(trans, root, path);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_err(fs_info, "failed to delete chunk %llu item", chunk_offset);
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3278,7 +3283,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
ret = btrfs_free_dev_extent(trans, device,
map->stripes[i].physical,
&dev_extent_len);
- if (ret) {
+ if (unlikely(ret)) {
mutex_unlock(&fs_devices->device_list_mutex);
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3348,7 +3353,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
struct btrfs_space_info *space_info;
space_info = btrfs_find_space_info(fs_info, sys_flags);
- if (!space_info) {
+ if (unlikely(!space_info)) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3362,17 +3367,17 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
}
ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = remove_chunk_item(trans, map, chunk_offset);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- } else if (ret) {
+ } else if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3381,7 +3386,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3397,7 +3402,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
btrfs_trans_release_chunk_metadata(trans);
ret = btrfs_remove_block_group(trans, map);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3522,7 +3527,7 @@ again:
mutex_unlock(&fs_info->reclaim_bgs_lock);
goto error;
}
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* On the first search we would find chunk tree with
* offset -1, which is not possible. On subsequent
@@ -4264,7 +4269,7 @@ error:
* @flags: profile to validate
* @extended: if true @flags is treated as an extended profile
*/
-static int alloc_profile_is_valid(u64 flags, int extended)
+static int alloc_profile_is_valid(u64 flags, bool extended)
{
u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
BTRFS_BLOCK_GROUP_PROFILE_MASK);
@@ -4458,7 +4463,7 @@ out_overflow:
}
/*
- * Should be called with balance mutexe held
+ * Should be called with balance mutex held
*/
int btrfs_balance(struct btrfs_fs_info *fs_info,
struct btrfs_balance_control *bctl,
@@ -5036,7 +5041,7 @@ again:
/* Now btrfs_update_device() will change the on-disk size. */
ret = btrfs_update_device(trans, device);
btrfs_trans_release_chunk_metadata(trans);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
} else {
@@ -5696,7 +5701,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
item_size = btrfs_chunk_item_size(map->num_stripes);
chunk = kzalloc(item_size, GFP_NOFS);
- if (!chunk) {
+ if (unlikely(!chunk)) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -7481,7 +7486,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
/*
* Lockdep complains about possible circular locking dependency between
* a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores
- * used for freeze procection of a fs (struct super_block.s_writers),
+ * used for freeze protection of a fs (struct super_block.s_writers),
* which we take when starting a transaction, and extent buffers of the
* chunk tree if we call read_one_dev() while holding a lock on an
* extent buffer of the chunk tree. Since we are mounting the filesystem
@@ -7914,8 +7919,6 @@ int btrfs_bg_type_to_factor(u64 flags)
return btrfs_raid_array[index].ncopies;
}
-
-
static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
u64 chunk_offset, u64 devid,
u64 physical_offset, u64 physical_len)
@@ -7929,7 +7932,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
int i;
map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
- if (!map) {
+ if (unlikely(!map)) {
btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
physical_offset, devid);
@@ -7938,7 +7941,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
stripe_len = btrfs_calc_stripe_length(map);
- if (physical_len != stripe_len) {
+ if (unlikely(physical_len != stripe_len)) {
btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
physical_offset, devid, map->start, physical_len,
@@ -7958,8 +7961,8 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
devid, physical_offset, physical_len);
for (i = 0; i < map->num_stripes; i++) {
- if (map->stripes[i].dev->devid == devid &&
- map->stripes[i].physical == physical_offset) {
+ if (unlikely(map->stripes[i].dev->devid == devid &&
+ map->stripes[i].physical == physical_offset)) {
found = true;
if (map->verified_stripes >= map->num_stripes) {
btrfs_err(fs_info,
@@ -7972,7 +7975,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
break;
}
}
- if (!found) {
+ if (unlikely(!found)) {
btrfs_err(fs_info,
"dev extent physical offset %llu devid %llu has no corresponding chunk",
physical_offset, devid);
@@ -7981,13 +7984,13 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
/* Make sure no dev extent is beyond device boundary */
dev = btrfs_find_device(fs_info->fs_devices, &args);
- if (!dev) {
+ if (unlikely(!dev)) {
btrfs_err(fs_info, "failed to find devid %llu", devid);
ret = -EUCLEAN;
goto out;
}
- if (physical_offset + physical_len > dev->disk_total_bytes) {
+ if (unlikely(physical_offset + physical_len > dev->disk_total_bytes)) {
btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
devid, physical_offset, physical_len,
@@ -7999,8 +8002,8 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
if (dev->zone_info) {
u64 zone_size = dev->zone_info->zone_size;
- if (!IS_ALIGNED(physical_offset, zone_size) ||
- !IS_ALIGNED(physical_len, zone_size)) {
+ if (unlikely(!IS_ALIGNED(physical_offset, zone_size) ||
+ !IS_ALIGNED(physical_len, zone_size))) {
btrfs_err(fs_info,
"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
devid, physical_offset, physical_len);
@@ -8024,7 +8027,7 @@ static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
struct btrfs_chunk_map *map;
map = rb_entry(node, struct btrfs_chunk_map, rb_node);
- if (map->num_stripes != map->verified_stripes) {
+ if (unlikely(map->num_stripes != map->verified_stripes)) {
btrfs_err(fs_info,
"chunk %llu has missing dev extent, have %d expect %d",
map->start, map->verified_stripes, map->num_stripes);
@@ -8084,7 +8087,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
if (ret < 0)
goto out;
/* No dev extents at all? Not good */
- if (ret > 0) {
+ if (unlikely(ret > 0)) {
ret = -EUCLEAN;
goto out;
}
@@ -8109,7 +8112,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
physical_len = btrfs_dev_extent_length(leaf, dext);
/* Check if this dev extent overlaps with the previous one */
- if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
+ if (unlikely(devid == prev_devid && physical_offset < prev_dev_ext_end)) {
btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
devid, physical_offset, prev_dev_ext_end);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index a56e873a3029..2cbf8080eade 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -34,7 +34,7 @@ struct btrfs_zoned_device_info;
#define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G)
/*
- * Arbitratry maximum size of one discard request to limit potentially long time
+ * Arbitrary maximum size of one discard request to limit potentially long time
* spent in blkdev_issue_discard().
*/
#define BTRFS_MAX_DISCARD_CHUNK_SIZE (SZ_1G)
@@ -495,7 +495,7 @@ struct btrfs_discard_stripe {
};
/*
- * Context for IO subsmission for device stripe.
+ * Context for IO submission for device stripe.
*
* - Track the unfinished mirrors for mirror based profiles
* Mirror based profiles are SINGLE/DUP/RAID1/RAID10.
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 5292cd341f70..6caba8be7c84 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -34,11 +34,9 @@ struct workspace {
int level;
};
-static struct workspace_manager wsm;
-
-struct list_head *zlib_get_workspace(unsigned int level)
+struct list_head *zlib_get_workspace(struct btrfs_fs_info *fs_info, unsigned int level)
{
- struct list_head *ws = btrfs_get_workspace(BTRFS_COMPRESS_ZLIB, level);
+ struct list_head *ws = btrfs_get_workspace(fs_info, BTRFS_COMPRESS_ZLIB, level);
struct workspace *workspace = list_entry(ws, struct workspace, list);
workspace->level = level;
@@ -55,8 +53,25 @@ void zlib_free_workspace(struct list_head *ws)
kfree(workspace);
}
-struct list_head *zlib_alloc_workspace(unsigned int level)
+/*
+ * For s390 hardware acceleration, the buffer size should be at least
+ * ZLIB_DFLTCC_BUF_SIZE to achieve the best performance.
+ *
+ * But if bs > ps we can have folios large enough to satisfy the s390
+ * hardware requirement.
+ */
+static bool need_special_buffer(struct btrfs_fs_info *fs_info)
+{
+ if (!zlib_deflate_dfltcc_enabled())
+ return false;
+ if (btrfs_min_folio_size(fs_info) >= ZLIB_DFLTCC_BUF_SIZE)
+ return false;
+ return true;
+}
+
+struct list_head *zlib_alloc_workspace(struct btrfs_fs_info *fs_info, unsigned int level)
{
+ const u32 blocksize = fs_info->sectorsize;
struct workspace *workspace;
int workspacesize;
@@ -69,19 +84,15 @@ struct list_head *zlib_alloc_workspace(unsigned int level)
workspace->strm.workspace = kvzalloc(workspacesize, GFP_KERNEL | __GFP_NOWARN);
workspace->level = level;
workspace->buf = NULL;
- /*
- * In case of s390 zlib hardware support, allocate lager workspace
- * buffer. If allocator fails, fall back to a single page buffer.
- */
- if (zlib_deflate_dfltcc_enabled()) {
+ if (need_special_buffer(fs_info)) {
workspace->buf = kmalloc(ZLIB_DFLTCC_BUF_SIZE,
__GFP_NOMEMALLOC | __GFP_NORETRY |
__GFP_NOWARN | GFP_NOIO);
workspace->buf_size = ZLIB_DFLTCC_BUF_SIZE;
}
if (!workspace->buf) {
- workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
- workspace->buf_size = PAGE_SIZE;
+ workspace->buf = kmalloc(blocksize, GFP_KERNEL);
+ workspace->buf_size = blocksize;
}
if (!workspace->strm.workspace || !workspace->buf)
goto fail;
@@ -133,11 +144,15 @@ static int copy_data_into_buffer(struct address_space *mapping,
return 0;
}
-int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
+int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
+ const u32 min_folio_size = btrfs_min_folio_size(fs_info);
int ret;
char *data_in = NULL;
char *cfolio_out;
@@ -146,7 +161,8 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
struct folio *out_folio = NULL;
unsigned long len = *total_out;
unsigned long nr_dest_folios = *out_folios;
- const unsigned long max_out = nr_dest_folios * PAGE_SIZE;
+ const unsigned long max_out = nr_dest_folios << min_folio_shift;
+ const u32 blocksize = fs_info->sectorsize;
const u64 orig_end = start + len;
*out_folios = 0;
@@ -155,9 +171,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = zlib_deflateInit(&workspace->strm, workspace->level);
if (unlikely(ret != Z_OK)) {
- struct btrfs_inode *inode = BTRFS_I(mapping->host);
-
- btrfs_err(inode->root->fs_info,
+ btrfs_err(fs_info,
"zlib compression init failed, error %d root %llu inode %llu offset %llu",
ret, btrfs_root_id(inode->root), btrfs_ino(inode), start);
ret = -EIO;
@@ -167,7 +181,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
workspace->strm.total_in = 0;
workspace->strm.total_out = 0;
- out_folio = btrfs_alloc_compr_folio();
+ out_folio = btrfs_alloc_compr_folio(fs_info);
if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
@@ -179,7 +193,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
workspace->strm.next_in = workspace->buf;
workspace->strm.avail_in = 0;
workspace->strm.next_out = cfolio_out;
- workspace->strm.avail_out = PAGE_SIZE;
+ workspace->strm.avail_out = min_folio_size;
while (workspace->strm.total_in < len) {
/*
@@ -191,10 +205,11 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
unsigned int copy_length = min(bytes_left, workspace->buf_size);
/*
- * This can only happen when hardware zlib compression is
- * enabled.
+ * For s390 hardware accelerated zlib, when our folio is smaller
+ * than the copy_length, we need to fill the buffer so that
+ * we can take full advantage of hardware acceleration.
*/
- if (copy_length > PAGE_SIZE) {
+ if (need_special_buffer(fs_info)) {
ret = copy_data_into_buffer(mapping, workspace,
start, copy_length);
if (ret < 0)
@@ -225,9 +240,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH);
if (unlikely(ret != Z_OK)) {
- struct btrfs_inode *inode = BTRFS_I(mapping->host);
-
- btrfs_warn(inode->root->fs_info,
+ btrfs_warn(fs_info,
"zlib compression failed, error %d root %llu inode %llu offset %llu",
ret, btrfs_root_id(inode->root), btrfs_ino(inode),
start);
@@ -237,7 +250,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
}
/* we're making it bigger, give up */
- if (workspace->strm.total_in > 8192 &&
+ if (workspace->strm.total_in > blocksize * 2 &&
workspace->strm.total_in <
workspace->strm.total_out) {
ret = -E2BIG;
@@ -252,7 +265,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = -E2BIG;
goto out;
}
- out_folio = btrfs_alloc_compr_folio();
+ out_folio = btrfs_alloc_compr_folio(fs_info);
if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
@@ -260,7 +273,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
cfolio_out = folio_address(out_folio);
folios[nr_folios] = out_folio;
nr_folios++;
- workspace->strm.avail_out = PAGE_SIZE;
+ workspace->strm.avail_out = min_folio_size;
workspace->strm.next_out = cfolio_out;
}
/* we're all done */
@@ -278,7 +291,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = zlib_deflate(&workspace->strm, Z_FINISH);
if (ret == Z_STREAM_END)
break;
- if (ret != Z_OK && ret != Z_BUF_ERROR) {
+ if (unlikely(ret != Z_OK && ret != Z_BUF_ERROR)) {
zlib_deflateEnd(&workspace->strm);
ret = -EIO;
goto out;
@@ -288,7 +301,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = -E2BIG;
goto out;
}
- out_folio = btrfs_alloc_compr_folio();
+ out_folio = btrfs_alloc_compr_folio(fs_info);
if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
@@ -296,7 +309,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
cfolio_out = folio_address(out_folio);
folios[nr_folios] = out_folio;
nr_folios++;
- workspace->strm.avail_out = PAGE_SIZE;
+ workspace->strm.avail_out = min_folio_size;
workspace->strm.next_out = cfolio_out;
}
}
@@ -322,20 +335,22 @@ out:
int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
+ struct btrfs_fs_info *fs_info = cb_to_fs_info(cb);
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ const u32 min_folio_size = btrfs_min_folio_size(fs_info);
int ret = 0, ret2;
int wbits = MAX_WBITS;
char *data_in;
size_t total_out = 0;
unsigned long folio_in_index = 0;
size_t srclen = cb->compressed_len;
- unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
+ unsigned long total_folios_in = DIV_ROUND_UP(srclen, min_folio_size);
unsigned long buf_start;
struct folio **folios_in = cb->compressed_folios;
data_in = kmap_local_folio(folios_in[folio_in_index], 0);
workspace->strm.next_in = data_in;
- workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE);
+ workspace->strm.avail_in = min_t(size_t, srclen, min_folio_size);
workspace->strm.total_in = 0;
workspace->strm.total_out = 0;
@@ -396,7 +411,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
data_in = kmap_local_folio(folios_in[folio_in_index], 0);
workspace->strm.next_in = data_in;
tmp = srclen - workspace->strm.total_in;
- workspace->strm.avail_in = min(tmp, PAGE_SIZE);
+ workspace->strm.avail_in = min(tmp, min_folio_size);
}
}
if (unlikely(ret != Z_STREAM_END)) {
@@ -484,8 +499,7 @@ out:
return ret;
}
-const struct btrfs_compress_op btrfs_zlib_compress = {
- .workspace_manager = &wsm,
+const struct btrfs_compress_levels btrfs_zlib_compress = {
.min_level = 1,
.max_level = 9,
.default_level = BTRFS_ZLIB_DEFAULT_LEVEL,
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 245e813ecd78..e00036672f33 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -17,6 +17,7 @@
#include "accessors.h"
#include "bio.h"
#include "transaction.h"
+#include "sysfs.h"
/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES 4096
@@ -42,6 +43,9 @@
/* Number of superblock log zones */
#define BTRFS_NR_SB_LOG_ZONES 2
+/* Default number of max active zones when the device has no limits. */
+#define BTRFS_DEFAULT_MAX_ACTIVE_ZONES 128
+
/*
* Minimum of active zones we need:
*
@@ -270,7 +274,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
return ret;
}
*nr_zones = ret;
- if (!ret)
+ if (unlikely(!ret))
return -EIO;
/* Populate cache */
@@ -311,7 +315,7 @@ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
if (ret < 0)
return ret;
/* No dev extents at all? Not good */
- if (ret > 0)
+ if (unlikely(ret > 0))
return -EUCLEAN;
}
@@ -416,7 +420,10 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
if (!IS_ALIGNED(nr_sectors, zone_sectors))
zone_info->nr_zones++;
- max_active_zones = bdev_max_active_zones(bdev);
+ max_active_zones = min_not_zero(bdev_max_active_zones(bdev),
+ bdev_max_open_zones(bdev));
+ if (!max_active_zones && zone_info->nr_zones > BTRFS_DEFAULT_MAX_ACTIVE_ZONES)
+ max_active_zones = BTRFS_DEFAULT_MAX_ACTIVE_ZONES;
if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
btrfs_err(fs_info,
"zoned: %s: max active zones %u is too small, need at least %u active zones",
@@ -496,7 +503,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
}
- if (nreported != zone_info->nr_zones) {
+ if (unlikely(nreported != zone_info->nr_zones)) {
btrfs_err(device->fs_info,
"inconsistent number of zones on %s (%u/%u)",
rcu_dereference(device->name), nreported,
@@ -506,7 +513,12 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
}
if (max_active_zones) {
- if (nactive > max_active_zones) {
+ if (unlikely(nactive > max_active_zones)) {
+ if (bdev_max_active_zones(bdev) == 0) {
+ max_active_zones = 0;
+ zone_info->max_active_zones = 0;
+ goto validate;
+ }
btrfs_err(device->fs_info,
"zoned: %u active zones on %s exceeds max_active_zones %u",
nactive, rcu_dereference(device->name),
@@ -519,6 +531,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags);
}
+validate:
/* Validate superblock log */
nr_zones = BTRFS_NR_SB_LOG_ZONES;
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
@@ -537,7 +550,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
if (ret)
goto out;
- if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
+ if (unlikely(nr_zones != BTRFS_NR_SB_LOG_ZONES)) {
btrfs_err(device->fs_info,
"zoned: failed to read super block log zone info at devid %llu zone %u",
device->devid, sb_zone);
@@ -555,7 +568,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
ret = sb_write_pointer(device->bdev,
&zone_info->sb_zones[sb_pos], &sb_wp);
- if (ret != -ENOENT && ret) {
+ if (unlikely(ret != -ENOENT && ret)) {
btrfs_err(device->fs_info,
"zoned: super block log zone corrupted devid %llu zone %u",
device->devid, sb_zone);
@@ -888,7 +901,7 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
zones);
if (ret < 0)
return ret;
- if (ret != BTRFS_NR_SB_LOG_ZONES)
+ if (unlikely(ret != BTRFS_NR_SB_LOG_ZONES))
return -EIO;
return sb_log_location(bdev, zones, rw, bytenr_ret);
@@ -1240,7 +1253,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
root = btrfs_extent_root(fs_info, key.objectid);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
/* We should not find the exact match */
- if (!ret)
+ if (unlikely(!ret))
ret = -EUCLEAN;
if (ret < 0)
return ret;
@@ -1261,8 +1274,8 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
else
length = fs_info->nodesize;
- if (!(found_key.objectid >= cache->start &&
- found_key.objectid + length <= cache->start + cache->length)) {
+ if (unlikely(!(found_key.objectid >= cache->start &&
+ found_key.objectid + length <= cache->start + cache->length))) {
return -EUCLEAN;
}
*offset_ret = found_key.objectid + length - cache->start;
@@ -1344,7 +1357,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
return 0;
}
- if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
+ if (unlikely(zone.type == BLK_ZONE_TYPE_CONVENTIONAL)) {
btrfs_err(fs_info,
"zoned: unexpected conventional zone %llu on device %s (devid %llu)",
zone.start << SECTOR_SHIFT, rcu_dereference(device->name),
@@ -1386,7 +1399,7 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
struct zone_info *info,
unsigned long *active)
{
- if (info->alloc_offset == WP_MISSING_DEV) {
+ if (unlikely(info->alloc_offset == WP_MISSING_DEV)) {
btrfs_err(bg->fs_info,
"zoned: cannot recover write pointer for zone %llu",
info->physical);
@@ -1415,13 +1428,13 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);
- if (zone_info[0].alloc_offset == WP_MISSING_DEV) {
+ if (unlikely(zone_info[0].alloc_offset == WP_MISSING_DEV)) {
btrfs_err(bg->fs_info,
"zoned: cannot recover write pointer for zone %llu",
zone_info[0].physical);
return -EIO;
}
- if (zone_info[1].alloc_offset == WP_MISSING_DEV) {
+ if (unlikely(zone_info[1].alloc_offset == WP_MISSING_DEV)) {
btrfs_err(bg->fs_info,
"zoned: cannot recover write pointer for zone %llu",
zone_info[1].physical);
@@ -1434,14 +1447,14 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
if (zone_info[1].alloc_offset == WP_CONVENTIONAL)
zone_info[1].alloc_offset = last_alloc;
- if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) {
+ if (unlikely(zone_info[0].alloc_offset != zone_info[1].alloc_offset)) {
btrfs_err(bg->fs_info,
"zoned: write pointer offset mismatch of zones in DUP profile");
return -EIO;
}
if (test_bit(0, active) != test_bit(1, active)) {
- if (!btrfs_zone_activate(bg))
+ if (unlikely(!btrfs_zone_activate(bg)))
return -EIO;
} else if (test_bit(0, active)) {
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
@@ -1476,16 +1489,16 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
if (zone_info[i].alloc_offset == WP_CONVENTIONAL)
zone_info[i].alloc_offset = last_alloc;
- if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
- !btrfs_test_opt(fs_info, DEGRADED)) {
+ if (unlikely((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
+ !btrfs_test_opt(fs_info, DEGRADED))) {
btrfs_err(fs_info,
"zoned: write pointer offset mismatch of zones in %s profile",
btrfs_bg_type_to_raid_name(map->type));
return -EIO;
}
if (test_bit(0, active) != test_bit(i, active)) {
- if (!btrfs_test_opt(fs_info, DEGRADED) &&
- !btrfs_zone_activate(bg)) {
+ if (unlikely(!btrfs_test_opt(fs_info, DEGRADED) &&
+ !btrfs_zone_activate(bg))) {
return -EIO;
}
} else {
@@ -1541,7 +1554,7 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
}
if (test_bit(0, active) != test_bit(i, active)) {
- if (!btrfs_zone_activate(bg))
+ if (unlikely(!btrfs_zone_activate(bg)))
return -EIO;
} else {
if (test_bit(0, active))
@@ -1573,7 +1586,7 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
continue;
if (test_bit(0, active) != test_bit(i, active)) {
- if (!btrfs_zone_activate(bg))
+ if (unlikely(!btrfs_zone_activate(bg)))
return -EIO;
} else {
if (test_bit(0, active))
@@ -1630,7 +1643,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
return 0;
/* Sanity check */
- if (!IS_ALIGNED(length, fs_info->zone_size)) {
+ if (unlikely(!IS_ALIGNED(length, fs_info->zone_size))) {
btrfs_err(fs_info,
"zoned: block group %llu len %llu unaligned to zone size %llu",
logical, length, fs_info->zone_size);
@@ -1743,7 +1756,7 @@ out:
return -EINVAL;
}
- if (cache->alloc_offset > cache->zone_capacity) {
+ if (unlikely(cache->alloc_offset > cache->zone_capacity)) {
btrfs_err(fs_info,
"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
cache->alloc_offset, cache->zone_capacity,
@@ -2074,7 +2087,7 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
&mapped_length, &bioc, NULL, NULL);
- if (ret || !bioc || mapped_length < PAGE_SIZE) {
+ if (unlikely(ret || !bioc || mapped_length < PAGE_SIZE)) {
ret = -EIO;
goto out_put_bioc;
}
@@ -2132,7 +2145,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
if (physical_pos == wp)
return 0;
- if (physical_pos > wp)
+ if (unlikely(physical_pos > wp))
return -EUCLEAN;
length = wp - physical_pos;
@@ -2168,10 +2181,15 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
goto out_unlock;
}
- /* No space left */
- if (btrfs_zoned_bg_is_full(block_group)) {
- ret = false;
- goto out_unlock;
+ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) {
+ /* The caller should check if the block group is full. */
+ if (WARN_ON_ONCE(btrfs_zoned_bg_is_full(block_group))) {
+ ret = false;
+ goto out_unlock;
+ }
+ } else {
+ /* Since it is already written, it should have been active. */
+ WARN_ON_ONCE(block_group->meta_write_pointer != block_group->start);
}
for (i = 0; i < map->num_stripes; i++) {
@@ -2230,7 +2248,7 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
struct btrfs_fs_info *fs_info = block_group->fs_info;
const u64 end = block_group->start + block_group->length;
struct extent_buffer *eb;
- unsigned long index, start = (block_group->start >> fs_info->sectorsize_bits);
+ unsigned long index, start = (block_group->start >> fs_info->nodesize_bits);
rcu_read_lock();
xa_for_each_start(&fs_info->buffer_tree, index, eb, start) {
@@ -2245,6 +2263,40 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
rcu_read_unlock();
}
+static int call_zone_finish(struct btrfs_block_group *block_group,
+ struct btrfs_io_stripe *stripe)
+{
+ struct btrfs_device *device = stripe->dev;
+ const u64 physical = stripe->physical;
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ int ret;
+
+ if (!device->bdev)
+ return 0;
+
+ if (zinfo->max_active_zones == 0)
+ return 0;
+
+ if (btrfs_dev_is_sequential(device, physical)) {
+ unsigned int nofs_flags;
+
+ nofs_flags = memalloc_nofs_save();
+ ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
+ physical >> SECTOR_SHIFT,
+ zinfo->zone_size >> SECTOR_SHIFT);
+ memalloc_nofs_restore(nofs_flags);
+
+ if (ret)
+ return ret;
+ }
+
+ if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+ zinfo->reserved_active_zones++;
+ btrfs_dev_clear_active_zone(device, physical);
+
+ return 0;
+}
+
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
@@ -2329,31 +2381,12 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
down_read(&dev_replace->rwsem);
map = block_group->physical_map;
for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_device *device = map->stripes[i].dev;
- const u64 physical = map->stripes[i].physical;
- struct btrfs_zoned_device_info *zinfo = device->zone_info;
- unsigned int nofs_flags;
-
- if (!device->bdev)
- continue;
-
- if (zinfo->max_active_zones == 0)
- continue;
-
- nofs_flags = memalloc_nofs_save();
- ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
- physical >> SECTOR_SHIFT,
- zinfo->zone_size >> SECTOR_SHIFT);
- memalloc_nofs_restore(nofs_flags);
+ ret = call_zone_finish(block_group, &map->stripes[i]);
if (ret) {
up_read(&dev_replace->rwsem);
return ret;
}
-
- if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
- zinfo->reserved_active_zones++;
- btrfs_dev_clear_active_zone(device, physical);
}
up_read(&dev_replace->rwsem);
@@ -2431,16 +2464,17 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
return ret;
}
-void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
+int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
{
struct btrfs_block_group *block_group;
u64 min_alloc_bytes;
if (!btrfs_is_zoned(fs_info))
- return;
+ return 0;
block_group = btrfs_lookup_block_group(fs_info, logical);
- ASSERT(block_group);
+ if (WARN_ON_ONCE(!block_group))
+ return -ENOENT;
/* No MIXED_BG on zoned btrfs. */
if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
@@ -2457,16 +2491,21 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len
out:
btrfs_put_block_group(block_group);
+ return 0;
}
static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
{
+ int ret;
struct btrfs_block_group *bg =
container_of(work, struct btrfs_block_group, zone_finish_work);
wait_on_extent_buffer_writeback(bg->last_eb);
free_extent_buffer(bg->last_eb);
- btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
+ ret = do_zone_finish(bg, true);
+ if (ret)
+ btrfs_handle_fs_error(bg->fs_info, ret,
+ "Failed to finish block-group's zone");
btrfs_put_block_group(bg);
}
@@ -2488,7 +2527,7 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
refcount_inc(&eb->refs);
bg->last_eb = eb;
INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
- queue_work(system_unbound_wq, &bg->zone_finish_work);
+ queue_work(system_dfl_wq, &bg->zone_finish_work);
}
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
@@ -2504,12 +2543,12 @@ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
{
struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
- struct btrfs_space_info *space_info = data_sinfo->sub_group[0];
+ struct btrfs_space_info *space_info = data_sinfo;
struct btrfs_trans_handle *trans;
struct btrfs_block_group *bg;
struct list_head *bg_list;
u64 alloc_flags;
- bool initial = false;
+ bool first = true;
bool did_chunk_alloc = false;
int index;
int ret;
@@ -2523,21 +2562,52 @@ void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
if (sb_rdonly(fs_info->sb))
return;
- ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags);
index = btrfs_bg_flags_to_raid_index(alloc_flags);
- bg_list = &data_sinfo->block_groups[index];
+ /* Scan the data space_info to find empty block groups. Take the second one. */
again:
+ bg_list = &space_info->block_groups[index];
list_for_each_entry(bg, bg_list, list) {
- if (bg->used > 0)
+ if (bg->alloc_offset != 0)
continue;
- if (!initial) {
- initial = true;
+ if (first) {
+ first = false;
continue;
}
+ if (space_info == data_sinfo) {
+ /* Migrate the block group to the data relocation space_info. */
+ struct btrfs_space_info *reloc_sinfo = data_sinfo->sub_group[0];
+ int factor;
+
+ ASSERT(reloc_sinfo->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
+ factor = btrfs_bg_type_to_factor(bg->flags);
+
+ down_write(&space_info->groups_sem);
+ list_del_init(&bg->list);
+ /* We can assume this as we choose the second empty one. */
+ ASSERT(!list_empty(&space_info->block_groups[index]));
+ up_write(&space_info->groups_sem);
+
+ spin_lock(&space_info->lock);
+ space_info->total_bytes -= bg->length;
+ space_info->disk_total -= bg->length * factor;
+ space_info->disk_total -= bg->zone_unusable;
+ /* No allocation has ever happened. */
+ ASSERT(bg->used == 0);
+ /* No super block in a block group on the zoned setup. */
+ ASSERT(bg->bytes_super == 0);
+ spin_unlock(&space_info->lock);
+
+ bg->space_info = reloc_sinfo;
+ if (reloc_sinfo->block_group_kobjs[index] == NULL)
+ btrfs_sysfs_add_block_group_type(bg);
+
+ btrfs_add_bg_to_space_info(fs_info, bg);
+ }
+
fs_info->data_reloc_bg = bg->start;
set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &bg->runtime_flags);
btrfs_zone_activate(bg);
@@ -2552,11 +2622,18 @@ again:
if (IS_ERR(trans))
return;
+ /* Allocate new BG in the data relocation space_info. */
+ space_info = data_sinfo->sub_group[0];
+ ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
btrfs_end_transaction(trans);
if (ret == 1) {
+ /*
+ * We allocated a new block group in the data relocation space_info. We
+ * can take that one.
+ */
+ first = false;
did_chunk_alloc = true;
- bg_list = &space_info->block_groups[index];
goto again;
}
}
@@ -2650,7 +2727,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
spin_lock(&block_group->lock);
if (block_group->reserved || block_group->alloc_offset == 0 ||
- (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) ||
+ !(block_group->flags & BTRFS_BLOCK_GROUP_DATA) ||
test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
continue;
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 6e11533b8e14..17c5656580dd 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -83,7 +83,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
bool btrfs_zone_activate(struct btrfs_block_group *block_group);
int btrfs_zone_finish(struct btrfs_block_group *block_group);
bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags);
-void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
+int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
u64 length);
void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
struct extent_buffer *eb);
@@ -234,8 +234,11 @@ static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
return true;
}
-static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
- u64 logical, u64 length) { }
+static inline int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ return 0;
+}
static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
struct extent_buffer *eb) { }
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index ff0292615e1f..c9cddcfa337b 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -77,7 +77,6 @@ struct workspace {
*/
struct zstd_workspace_manager {
- const struct btrfs_compress_op *ops;
spinlock_t lock;
struct list_head lru_list;
struct list_head idle_ws[ZSTD_BTRFS_MAX_LEVEL];
@@ -86,8 +85,6 @@ struct zstd_workspace_manager {
struct timer_list timer;
};
-static struct zstd_workspace_manager wsm;
-
static size_t zstd_ws_mem_sizes[ZSTD_BTRFS_MAX_LEVEL];
static inline struct workspace *list_to_workspace(struct list_head *list)
@@ -112,19 +109,19 @@ static inline int clip_level(int level)
*/
static void zstd_reclaim_timer_fn(struct timer_list *timer)
{
+ struct zstd_workspace_manager *zwsm =
+ container_of(timer, struct zstd_workspace_manager, timer);
unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES;
struct list_head *pos, *next;
- ASSERT(timer == &wsm.timer);
-
- spin_lock(&wsm.lock);
+ spin_lock(&zwsm->lock);
- if (list_empty(&wsm.lru_list)) {
- spin_unlock(&wsm.lock);
+ if (list_empty(&zwsm->lru_list)) {
+ spin_unlock(&zwsm->lock);
return;
}
- list_for_each_prev_safe(pos, next, &wsm.lru_list) {
+ list_for_each_prev_safe(pos, next, &zwsm->lru_list) {
struct workspace *victim = container_of(pos, struct workspace,
lru_list);
int level;
@@ -141,15 +138,15 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
list_del(&victim->list);
zstd_free_workspace(&victim->list);
- if (list_empty(&wsm.idle_ws[level]))
- clear_bit(level, &wsm.active_map);
+ if (list_empty(&zwsm->idle_ws[level]))
+ clear_bit(level, &zwsm->active_map);
}
- if (!list_empty(&wsm.lru_list))
- mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
+ if (!list_empty(&zwsm->lru_list))
+ mod_timer(&zwsm->timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
- spin_unlock(&wsm.lock);
+ spin_unlock(&zwsm->lock);
}
/*
@@ -182,49 +179,56 @@ static void zstd_calc_ws_mem_sizes(void)
}
}
-void zstd_init_workspace_manager(void)
+int zstd_alloc_workspace_manager(struct btrfs_fs_info *fs_info)
{
+ struct zstd_workspace_manager *zwsm;
struct list_head *ws;
- int i;
+ ASSERT(fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD] == NULL);
+ zwsm = kzalloc(sizeof(*zwsm), GFP_KERNEL);
+ if (!zwsm)
+ return -ENOMEM;
zstd_calc_ws_mem_sizes();
+ spin_lock_init(&zwsm->lock);
+ init_waitqueue_head(&zwsm->wait);
+ timer_setup(&zwsm->timer, zstd_reclaim_timer_fn, 0);
- wsm.ops = &btrfs_zstd_compress;
- spin_lock_init(&wsm.lock);
- init_waitqueue_head(&wsm.wait);
- timer_setup(&wsm.timer, zstd_reclaim_timer_fn, 0);
-
- INIT_LIST_HEAD(&wsm.lru_list);
- for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++)
- INIT_LIST_HEAD(&wsm.idle_ws[i]);
+ INIT_LIST_HEAD(&zwsm->lru_list);
+ for (int i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++)
+ INIT_LIST_HEAD(&zwsm->idle_ws[i]);
+ fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD] = zwsm;
- ws = zstd_alloc_workspace(ZSTD_BTRFS_MAX_LEVEL);
+ ws = zstd_alloc_workspace(fs_info, ZSTD_BTRFS_MAX_LEVEL);
if (IS_ERR(ws)) {
btrfs_warn(NULL, "cannot preallocate zstd compression workspace");
} else {
- set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &wsm.active_map);
- list_add(ws, &wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]);
+ set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &zwsm->active_map);
+ list_add(ws, &zwsm->idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]);
}
+ return 0;
}
-void zstd_cleanup_workspace_manager(void)
+void zstd_free_workspace_manager(struct btrfs_fs_info *fs_info)
{
+ struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD];
struct workspace *workspace;
- int i;
- spin_lock_bh(&wsm.lock);
- for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) {
- while (!list_empty(&wsm.idle_ws[i])) {
- workspace = container_of(wsm.idle_ws[i].next,
+ if (!zwsm)
+ return;
+ fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD] = NULL;
+ spin_lock_bh(&zwsm->lock);
+ for (int i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) {
+ while (!list_empty(&zwsm->idle_ws[i])) {
+ workspace = container_of(zwsm->idle_ws[i].next,
struct workspace, list);
list_del(&workspace->list);
list_del(&workspace->lru_list);
zstd_free_workspace(&workspace->list);
}
}
- spin_unlock_bh(&wsm.lock);
-
- timer_delete_sync(&wsm.timer);
+ spin_unlock_bh(&zwsm->lock);
+ timer_delete_sync(&zwsm->timer);
+ kfree(zwsm);
}
/*
@@ -239,29 +243,31 @@ void zstd_cleanup_workspace_manager(void)
* offer the opportunity to reclaim the workspace in favor of allocating an
* appropriately sized one in the future.
*/
-static struct list_head *zstd_find_workspace(int level)
+static struct list_head *zstd_find_workspace(struct btrfs_fs_info *fs_info, int level)
{
+ struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD];
struct list_head *ws;
struct workspace *workspace;
int i = clip_level(level);
- spin_lock_bh(&wsm.lock);
- for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) {
- if (!list_empty(&wsm.idle_ws[i])) {
- ws = wsm.idle_ws[i].next;
+ ASSERT(zwsm);
+ spin_lock_bh(&zwsm->lock);
+ for_each_set_bit_from(i, &zwsm->active_map, ZSTD_BTRFS_MAX_LEVEL) {
+ if (!list_empty(&zwsm->idle_ws[i])) {
+ ws = zwsm->idle_ws[i].next;
workspace = list_to_workspace(ws);
list_del_init(ws);
/* keep its place if it's a lower level using this */
workspace->req_level = level;
if (clip_level(level) == workspace->level)
list_del(&workspace->lru_list);
- if (list_empty(&wsm.idle_ws[i]))
- clear_bit(i, &wsm.active_map);
- spin_unlock_bh(&wsm.lock);
+ if (list_empty(&zwsm->idle_ws[i]))
+ clear_bit(i, &zwsm->active_map);
+ spin_unlock_bh(&zwsm->lock);
return ws;
}
}
- spin_unlock_bh(&wsm.lock);
+ spin_unlock_bh(&zwsm->lock);
return NULL;
}
@@ -276,30 +282,33 @@ static struct list_head *zstd_find_workspace(int level)
* attempt to allocate a new workspace. If we fail to allocate one due to
* memory pressure, go to sleep waiting for the max level workspace to free up.
*/
-struct list_head *zstd_get_workspace(int level)
+struct list_head *zstd_get_workspace(struct btrfs_fs_info *fs_info, int level)
{
+ struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD];
struct list_head *ws;
unsigned int nofs_flag;
+ ASSERT(zwsm);
+
/* level == 0 means we can use any workspace */
if (!level)
level = 1;
again:
- ws = zstd_find_workspace(level);
+ ws = zstd_find_workspace(fs_info, level);
if (ws)
return ws;
nofs_flag = memalloc_nofs_save();
- ws = zstd_alloc_workspace(level);
+ ws = zstd_alloc_workspace(fs_info, level);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(ws)) {
DEFINE_WAIT(wait);
- prepare_to_wait(&wsm.wait, &wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(&zwsm->wait, &wait, TASK_UNINTERRUPTIBLE);
schedule();
- finish_wait(&wsm.wait, &wait);
+ finish_wait(&zwsm->wait, &wait);
goto again;
}
@@ -318,34 +327,36 @@ again:
* isn't set, it is also set here. Only the max level workspace tries and wakes
* up waiting workspaces.
*/
-void zstd_put_workspace(struct list_head *ws)
+void zstd_put_workspace(struct btrfs_fs_info *fs_info, struct list_head *ws)
{
+ struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD];
struct workspace *workspace = list_to_workspace(ws);
- spin_lock_bh(&wsm.lock);
+ ASSERT(zwsm);
+ spin_lock_bh(&zwsm->lock);
/* A node is only taken off the lru if we are the corresponding level */
if (clip_level(workspace->req_level) == workspace->level) {
/* Hide a max level workspace from reclaim */
- if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) {
+ if (list_empty(&zwsm->idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) {
INIT_LIST_HEAD(&workspace->lru_list);
} else {
workspace->last_used = jiffies;
- list_add(&workspace->lru_list, &wsm.lru_list);
- if (!timer_pending(&wsm.timer))
- mod_timer(&wsm.timer,
+ list_add(&workspace->lru_list, &zwsm->lru_list);
+ if (!timer_pending(&zwsm->timer))
+ mod_timer(&zwsm->timer,
jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
}
}
- set_bit(workspace->level, &wsm.active_map);
- list_add(&workspace->list, &wsm.idle_ws[workspace->level]);
+ set_bit(workspace->level, &zwsm->active_map);
+ list_add(&workspace->list, &zwsm->idle_ws[workspace->level]);
workspace->req_level = 0;
- spin_unlock_bh(&wsm.lock);
+ spin_unlock_bh(&zwsm->lock);
if (workspace->level == clip_level(ZSTD_BTRFS_MAX_LEVEL))
- cond_wake_up(&wsm.wait);
+ cond_wake_up(&zwsm->wait);
}
void zstd_free_workspace(struct list_head *ws)
@@ -357,8 +368,9 @@ void zstd_free_workspace(struct list_head *ws)
kfree(workspace);
}
-struct list_head *zstd_alloc_workspace(int level)
+struct list_head *zstd_alloc_workspace(struct btrfs_fs_info *fs_info, int level)
{
+ const u32 blocksize = fs_info->sectorsize;
struct workspace *workspace;
workspace = kzalloc(sizeof(*workspace), GFP_KERNEL);
@@ -371,7 +383,7 @@ struct list_head *zstd_alloc_workspace(int level)
workspace->req_level = level;
workspace->last_used = jiffies;
workspace->mem = kvmalloc(workspace->size, GFP_KERNEL | __GFP_NOWARN);
- workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ workspace->buf = kmalloc(blocksize, GFP_KERNEL);
if (!workspace->mem || !workspace->buf)
goto fail;
@@ -384,11 +396,13 @@ fail:
return ERR_PTR(-ENOMEM);
}
-int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
+int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
zstd_cstream *stream;
int ret = 0;
int nr_folios = 0;
@@ -399,7 +413,9 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
unsigned long len = *total_out;
const unsigned long nr_dest_folios = *out_folios;
const u64 orig_end = start + len;
- unsigned long max_out = nr_dest_folios * PAGE_SIZE;
+ const u32 blocksize = fs_info->sectorsize;
+ const u32 min_folio_size = btrfs_min_folio_size(fs_info);
+ unsigned long max_out = nr_dest_folios * min_folio_size;
unsigned int cur_len;
workspace->params = zstd_get_btrfs_parameters(workspace->req_level, len);
@@ -411,9 +427,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
stream = zstd_init_cstream(&workspace->params, len, workspace->mem,
workspace->size);
if (unlikely(!stream)) {
- struct btrfs_inode *inode = BTRFS_I(mapping->host);
-
- btrfs_err(inode->root->fs_info,
+ btrfs_err(fs_info,
"zstd compression init level %d failed, root %llu inode %llu offset %llu",
workspace->req_level, btrfs_root_id(inode->root),
btrfs_ino(inode), start);
@@ -431,7 +445,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
workspace->in_buf.size = cur_len;
/* Allocate and map in the output buffer */
- out_folio = btrfs_alloc_compr_folio();
+ out_folio = btrfs_alloc_compr_folio(fs_info);
if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
@@ -439,7 +453,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
folios[nr_folios++] = out_folio;
workspace->out_buf.dst = folio_address(out_folio);
workspace->out_buf.pos = 0;
- workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
+ workspace->out_buf.size = min_t(size_t, max_out, min_folio_size);
while (1) {
size_t ret2;
@@ -447,9 +461,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
ret2 = zstd_compress_stream(stream, &workspace->out_buf,
&workspace->in_buf);
if (unlikely(zstd_is_error(ret2))) {
- struct btrfs_inode *inode = BTRFS_I(mapping->host);
-
- btrfs_warn(inode->root->fs_info,
+ btrfs_warn(fs_info,
"zstd compression level %d failed, error %d root %llu inode %llu offset %llu",
workspace->req_level, zstd_get_error_code(ret2),
btrfs_root_id(inode->root), btrfs_ino(inode),
@@ -459,7 +471,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
}
/* Check to see if we are making it bigger */
- if (tot_in + workspace->in_buf.pos > 8192 &&
+ if (tot_in + workspace->in_buf.pos > blocksize * 2 &&
tot_in + workspace->in_buf.pos <
tot_out + workspace->out_buf.pos) {
ret = -E2BIG;
@@ -475,13 +487,13 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
/* Check if we need more output space */
if (workspace->out_buf.pos == workspace->out_buf.size) {
- tot_out += PAGE_SIZE;
- max_out -= PAGE_SIZE;
+ tot_out += min_folio_size;
+ max_out -= min_folio_size;
if (nr_folios == nr_dest_folios) {
ret = -E2BIG;
goto out;
}
- out_folio = btrfs_alloc_compr_folio();
+ out_folio = btrfs_alloc_compr_folio(fs_info);
if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
@@ -489,8 +501,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
folios[nr_folios++] = out_folio;
workspace->out_buf.dst = folio_address(out_folio);
workspace->out_buf.pos = 0;
- workspace->out_buf.size = min_t(size_t, max_out,
- PAGE_SIZE);
+ workspace->out_buf.size = min_t(size_t, max_out, min_folio_size);
}
/* We've reached the end of the input */
@@ -522,9 +533,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
ret2 = zstd_end_stream(stream, &workspace->out_buf);
if (unlikely(zstd_is_error(ret2))) {
- struct btrfs_inode *inode = BTRFS_I(mapping->host);
-
- btrfs_err(inode->root->fs_info,
+ btrfs_err(fs_info,
"zstd compression end level %d failed, error %d root %llu inode %llu offset %llu",
workspace->req_level, zstd_get_error_code(ret2),
btrfs_root_id(inode->root), btrfs_ino(inode),
@@ -542,13 +551,13 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
goto out;
}
- tot_out += PAGE_SIZE;
- max_out -= PAGE_SIZE;
+ tot_out += min_folio_size;
+ max_out -= min_folio_size;
if (nr_folios == nr_dest_folios) {
ret = -E2BIG;
goto out;
}
- out_folio = btrfs_alloc_compr_folio();
+ out_folio = btrfs_alloc_compr_folio(fs_info);
if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
@@ -556,7 +565,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
folios[nr_folios++] = out_folio;
workspace->out_buf.dst = folio_address(out_folio);
workspace->out_buf.pos = 0;
- workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
+ workspace->out_buf.size = min_t(size_t, max_out, min_folio_size);
}
if (tot_out >= tot_in) {
@@ -578,13 +587,16 @@ out:
int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
+ struct btrfs_fs_info *fs_info = cb_to_fs_info(cb);
struct workspace *workspace = list_entry(ws, struct workspace, list);
struct folio **folios_in = cb->compressed_folios;
size_t srclen = cb->compressed_len;
zstd_dstream *stream;
int ret = 0;
+ const u32 blocksize = fs_info->sectorsize;
+ const unsigned int min_folio_size = btrfs_min_folio_size(fs_info);
unsigned long folio_in_index = 0;
- unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
+ unsigned long total_folios_in = DIV_ROUND_UP(srclen, min_folio_size);
unsigned long buf_start;
unsigned long total_out = 0;
@@ -602,11 +614,11 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
workspace->in_buf.src = kmap_local_folio(folios_in[folio_in_index], 0);
workspace->in_buf.pos = 0;
- workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
+ workspace->in_buf.size = min_t(size_t, srclen, min_folio_size);
workspace->out_buf.dst = workspace->buf;
workspace->out_buf.pos = 0;
- workspace->out_buf.size = PAGE_SIZE;
+ workspace->out_buf.size = blocksize;
while (1) {
size_t ret2;
@@ -642,16 +654,16 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (workspace->in_buf.pos == workspace->in_buf.size) {
kunmap_local(workspace->in_buf.src);
folio_in_index++;
- if (folio_in_index >= total_folios_in) {
+ if (unlikely(folio_in_index >= total_folios_in)) {
workspace->in_buf.src = NULL;
ret = -EIO;
goto done;
}
- srclen -= PAGE_SIZE;
+ srclen -= min_folio_size;
workspace->in_buf.src =
kmap_local_folio(folios_in[folio_in_index], 0);
workspace->in_buf.pos = 0;
- workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
+ workspace->in_buf.size = min_t(size_t, srclen, min_folio_size);
}
}
ret = 0;
@@ -718,9 +730,7 @@ finish:
return ret;
}
-const struct btrfs_compress_op btrfs_zstd_compress = {
- /* ZSTD uses own workspace manager */
- .workspace_manager = NULL,
+const struct btrfs_compress_levels btrfs_zstd_compress = {
.min_level = ZSTD_BTRFS_MIN_LEVEL,
.max_level = ZSTD_BTRFS_MAX_LEVEL,
.default_level = ZSTD_BTRFS_DEFAULT_LEVEL,
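(Not part of the patch: a minimal caller-side sketch of the converted zstd API as it appears in the hunks above. The surrounding btrfs context (fs_info, level, nr_dest_folios) is assumed, and the alloc/put pairing is illustrative only.)

	struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD];
	const u32 min_folio_size = btrfs_min_folio_size(fs_info);
	unsigned long max_out = nr_dest_folios * min_folio_size;	/* was nr_dest_folios * PAGE_SIZE */
	struct list_head *ws;

	ASSERT(zwsm);					/* set up per filesystem, no longer a file-global wsm */
	ws = zstd_alloc_workspace(fs_info, level);	/* both alloc and put now take the fs_info */
	if (IS_ERR(ws))
		return PTR_ERR(ws);
	/* ... zstd_compress_folios(ws, inode, start, folios, ...) bounded by max_out ... */
	zstd_put_workspace(fs_info, ws);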
diff --git a/fs/buffer.c b/fs/buffer.c
index ead4dc85debd..6a8752f7bbed 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -157,8 +157,8 @@ static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
*/
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
- __end_buffer_read_notouch(bh, uptodate);
put_bh(bh);
+ __end_buffer_read_notouch(bh, uptodate);
}
EXPORT_SYMBOL(end_buffer_read_sync);
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 91dfd0231877..d1edb2ac3837 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -387,10 +387,9 @@ try_again:
cachefiles_io_error(cache, "Rename security error %d", ret);
} else {
struct renamedata rd = {
- .old_mnt_idmap = &nop_mnt_idmap,
+ .mnt_idmap = &nop_mnt_idmap,
.old_parent = dir,
.old_dentry = rep,
- .new_mnt_idmap = &nop_mnt_idmap,
.new_parent = cache->graveyard,
.new_dentry = grave,
};
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 8b202d789e93..322ed268f14a 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1264,7 +1264,9 @@ static inline int move_dirty_folio_in_page_array(struct address_space *mapping,
0,
gfp_flags);
if (IS_ERR(pages[index])) {
- if (PTR_ERR(pages[index]) == -EINVAL) {
+ int err = PTR_ERR(pages[index]);
+
+ if (err == -EINVAL) {
pr_err_client(cl, "inode->i_blkbits=%hhu\n",
inode->i_blkbits);
}
@@ -1273,7 +1275,7 @@ static inline int move_dirty_folio_in_page_array(struct address_space *mapping,
BUG_ON(ceph_wbc->locked_pages == 0);
pages[index] = NULL;
- return PTR_ERR(pages[index]);
+ return err;
}
} else {
pages[index] = &folio->page;
@@ -1687,6 +1689,7 @@ get_more_pages:
process_folio_batch:
rc = ceph_process_folio_batch(mapping, wbc, &ceph_wbc);
+ ceph_shift_unused_folios_left(&ceph_wbc.fbatch);
if (rc)
goto release_folios;
@@ -1695,8 +1698,6 @@ process_folio_batch:
goto release_folios;
if (ceph_wbc.processed_in_fbatch) {
- ceph_shift_unused_folios_left(&ceph_wbc.fbatch);
-
if (folio_batch_count(&ceph_wbc.fbatch) == 0 &&
ceph_wbc.locked_pages < ceph_wbc.max_pages) {
doutc(cl, "reached end fbatch, trying for more\n");
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
index cab722619207..7026e794813c 100644
--- a/fs/ceph/crypto.c
+++ b/fs/ceph/crypto.c
@@ -133,6 +133,8 @@ static const union fscrypt_policy *ceph_get_dummy_policy(struct super_block *sb)
}
static struct fscrypt_operations ceph_fscrypt_ops = {
+ .inode_info_offs = (int)offsetof(struct ceph_inode_info, i_crypt_info) -
+ (int)offsetof(struct ceph_inode_info, netfs.inode),
.needs_bounce_pages = 1,
.get_context = ceph_crypt_get_context,
.set_context = ceph_crypt_set_context,
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index fdd404fc8112..f3fe786b4143 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -55,8 +55,6 @@ static int mdsc_show(struct seq_file *s, void *p)
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
struct rb_node *rp;
- int pathlen = 0;
- u64 pathbase;
char *path;
mutex_lock(&mdsc->mutex);
@@ -81,8 +79,8 @@ static int mdsc_show(struct seq_file *s, void *p)
if (req->r_inode) {
seq_printf(s, " #%llx", ceph_ino(req->r_inode));
} else if (req->r_dentry) {
- path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen,
- &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0);
if (IS_ERR(path))
path = NULL;
spin_lock(&req->r_dentry->d_lock);
@@ -91,7 +89,7 @@ static int mdsc_show(struct seq_file *s, void *p)
req->r_dentry,
path ? path : "");
spin_unlock(&req->r_dentry->d_lock);
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
} else if (req->r_path1) {
seq_printf(s, " #%llx/%s", req->r_ino1.ino,
req->r_path1);
@@ -100,8 +98,8 @@ static int mdsc_show(struct seq_file *s, void *p)
}
if (req->r_old_dentry) {
- path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &pathlen,
- &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &path_info, 0);
if (IS_ERR(path))
path = NULL;
spin_lock(&req->r_old_dentry->d_lock);
@@ -111,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p)
req->r_old_dentry,
path ? path : "");
spin_unlock(&req->r_old_dentry->d_lock);
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
} else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) {
if (req->r_ino2.ino)
seq_printf(s, " #%llx/%s", req->r_ino2.ino,
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 8478e7e75df6..32973c62c1a2 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1271,10 +1271,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
/* If op failed, mark everyone involved for errors */
if (result) {
- int pathlen = 0;
- u64 base = 0;
- char *path = ceph_mdsc_build_path(mdsc, dentry, &pathlen,
- &base, 0);
+ struct ceph_path_info path_info = {0};
+ char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
/* mark error on parent + clear complete */
mapping_set_error(req->r_parent->i_mapping, result);
@@ -1288,8 +1286,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
mapping_set_error(req->r_old_inode->i_mapping, result);
pr_warn_client(cl, "failure path=(%llx)%s result=%d!\n",
- base, IS_ERR(path) ? "<<bad>>" : path, result);
- ceph_mdsc_free_path(path, pathlen);
+ path_info.vino.ino, IS_ERR(path) ? "<<bad>>" : path, result);
+ ceph_mdsc_free_path_info(&path_info);
}
out:
iput(req->r_old_inode);
@@ -1347,8 +1345,6 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
int err = -EROFS;
int op;
char *path;
- int pathlen;
- u64 pathbase;
if (ceph_snap(dir) == CEPH_SNAPDIR) {
/* rmdir .snap/foo is RMSNAP */
@@ -1367,14 +1363,15 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
if (!dn) {
try_async = false;
} else {
- path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0);
if (IS_ERR(path)) {
try_async = false;
err = 0;
} else {
err = ceph_mds_check_access(mdsc, path, MAY_WRITE);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dn);
 /* For non-EACCES cases, let the MDS do the auth check */
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index c02f100f8552..978acd3d4b32 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -368,8 +368,6 @@ int ceph_open(struct inode *inode, struct file *file)
int flags, fmode, wanted;
struct dentry *dentry;
char *path;
- int pathlen;
- u64 pathbase;
bool do_sync = false;
int mask = MAY_READ;
@@ -399,14 +397,15 @@ int ceph_open(struct inode *inode, struct file *file)
if (!dentry) {
do_sync = true;
} else {
- path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
if (IS_ERR(path)) {
do_sync = true;
err = 0;
} else {
err = ceph_mds_check_access(mdsc, path, mask);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dentry);
 /* For non-EACCES cases, let the MDS do the auth check */
@@ -614,15 +613,13 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
mapping_set_error(req->r_parent->i_mapping, result);
if (result) {
- int pathlen = 0;
- u64 base = 0;
- char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen,
- &base, 0);
+ struct ceph_path_info path_info = {0};
+ char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0);
pr_warn_client(cl,
"async create failure path=(%llx)%s result=%d!\n",
- base, IS_ERR(path) ? "<<bad>>" : path, result);
- ceph_mdsc_free_path(path, pathlen);
+ path_info.vino.ino, IS_ERR(path) ? "<<bad>>" : path, result);
+ ceph_mdsc_free_path_info(&path_info);
ceph_dir_clear_complete(req->r_parent);
if (!d_unhashed(dentry))
@@ -791,8 +788,6 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
int mask;
int err;
char *path;
- int pathlen;
- u64 pathbase;
doutc(cl, "%p %llx.%llx dentry %p '%pd' %s flags %d mode 0%o\n",
dir, ceph_vinop(dir), dentry, dentry,
@@ -814,7 +809,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
if (!dn) {
try_async = false;
} else {
- path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0);
if (IS_ERR(path)) {
try_async = false;
err = 0;
@@ -826,7 +822,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
mask |= MAY_WRITE;
err = ceph_mds_check_access(mdsc, path, mask);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dn);
 /* For non-EACCES cases, let the MDS do the auth check */
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index fc543075b827..949f0badc944 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -55,6 +55,52 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
return 0;
}
+/*
+ * Check if the parent inode matches the vino from directory reply info
+ */
+static inline bool ceph_vino_matches_parent(struct inode *parent,
+ struct ceph_vino vino)
+{
+ return ceph_ino(parent) == vino.ino && ceph_snap(parent) == vino.snap;
+}
+
+/*
+ * Validate that the directory inode referenced by @req->r_parent matches the
+ * inode number and snapshot id contained in the reply's directory record. If
+ * they do not match – which can theoretically happen if the parent dentry was
+ * moved between the time the request was issued and the reply arrived – fall
+ * back to looking up the correct inode in the inode cache.
+ *
+ * A reference is *always* returned. Callers that receive a different inode
+ * than the original @parent are responsible for dropping the extra reference
+ * once the reply has been processed.
+ */
+static struct inode *ceph_get_reply_dir(struct super_block *sb,
+ struct inode *parent,
+ struct ceph_mds_reply_info_parsed *rinfo)
+{
+ struct ceph_vino vino;
+
+ if (unlikely(!rinfo->diri.in))
+ return parent; /* nothing to compare against */
+
+ /* If we didn't have a cached parent inode to begin with, just bail out. */
+ if (!parent)
+ return NULL;
+
+ vino.ino = le64_to_cpu(rinfo->diri.in->ino);
+ vino.snap = le64_to_cpu(rinfo->diri.in->snapid);
+
+ if (likely(ceph_vino_matches_parent(parent, vino)))
+ return parent; /* matches – use the original reference */
+
+ /* Mismatch – this should be rare. Emit a WARN and obtain the correct inode. */
+ WARN_ONCE(1, "ceph: reply dir mismatch (parent valid %llx.%llx reply %llx.%llx)\n",
+ ceph_ino(parent), ceph_snap(parent), vino.ino, vino.snap);
+
+ return ceph_get_inode(sb, vino, NULL);
+}
+
/**
* ceph_new_inode - allocate a new inode in advance of an expected create
* @dir: parent directory for new inode
@@ -665,6 +711,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_work_mask = 0;
memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
#ifdef CONFIG_FS_ENCRYPTION
+ ci->i_crypt_info = NULL;
ci->fscrypt_auth = NULL;
ci->fscrypt_auth_len = 0;
#endif
@@ -1523,6 +1570,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
struct ceph_vino tvino, dvino;
struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
struct ceph_client *cl = fsc->client;
+ struct inode *parent_dir = NULL;
int err = 0;
doutc(cl, "%p is_dentry %d is_target %d\n", req,
@@ -1536,10 +1584,17 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
}
if (rinfo->head->is_dentry) {
- struct inode *dir = req->r_parent;
-
- if (dir) {
- err = ceph_fill_inode(dir, NULL, &rinfo->diri,
+ /*
+ * r_parent may be stale when R_PARENT_LOCKED is not set, so look up
+ * the correct directory inode from the reply instead.
+ */
+ parent_dir = ceph_get_reply_dir(sb, req->r_parent, rinfo);
+ if (unlikely(IS_ERR(parent_dir))) {
+ err = PTR_ERR(parent_dir);
+ goto done;
+ }
+ if (parent_dir) {
+ err = ceph_fill_inode(parent_dir, NULL, &rinfo->diri,
rinfo->dirfrag, session, -1,
&req->r_caps_reservation);
if (err < 0)
@@ -1548,14 +1603,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
WARN_ON_ONCE(1);
}
- if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
+ if (parent_dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
bool is_nokey = false;
struct qstr dname;
struct dentry *dn, *parent;
struct fscrypt_str oname = FSTR_INIT(NULL, 0);
- struct ceph_fname fname = { .dir = dir,
+ struct ceph_fname fname = { .dir = parent_dir,
.name = rinfo->dname,
.ctext = rinfo->altname,
.name_len = rinfo->dname_len,
@@ -1564,10 +1619,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
BUG_ON(!rinfo->head->is_target);
BUG_ON(req->r_dentry);
- parent = d_find_any_alias(dir);
+ parent = d_find_any_alias(parent_dir);
BUG_ON(!parent);
- err = ceph_fname_alloc_buffer(dir, &oname);
+ err = ceph_fname_alloc_buffer(parent_dir, &oname);
if (err < 0) {
dput(parent);
goto done;
@@ -1576,7 +1631,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
err = ceph_fname_to_usr(&fname, NULL, &oname, &is_nokey);
if (err < 0) {
dput(parent);
- ceph_fname_free_buffer(dir, &oname);
+ ceph_fname_free_buffer(parent_dir, &oname);
goto done;
}
dname.name = oname.name;
@@ -1595,7 +1650,7 @@ retry_lookup:
dname.len, dname.name, dn);
if (!dn) {
dput(parent);
- ceph_fname_free_buffer(dir, &oname);
+ ceph_fname_free_buffer(parent_dir, &oname);
err = -ENOMEM;
goto done;
}
@@ -1610,12 +1665,12 @@ retry_lookup:
ceph_snap(d_inode(dn)) != tvino.snap)) {
doutc(cl, " dn %p points to wrong inode %p\n",
dn, d_inode(dn));
- ceph_dir_clear_ordered(dir);
+ ceph_dir_clear_ordered(parent_dir);
d_delete(dn);
dput(dn);
goto retry_lookup;
}
- ceph_fname_free_buffer(dir, &oname);
+ ceph_fname_free_buffer(parent_dir, &oname);
req->r_dentry = dn;
dput(parent);
@@ -1794,6 +1849,9 @@ retry_lookup:
&dvino, ptvino);
}
done:
+ /* Drop extra ref from ceph_get_reply_dir() if it returned a new inode */
+ if (unlikely(!IS_ERR_OR_NULL(parent_dir) && parent_dir != req->r_parent))
+ iput(parent_dir);
doutc(cl, "done err=%d\n", err);
return err;
}
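(Aside: the reference-ownership rule established above, condensed. ceph_get_reply_dir() hands back either the original req->r_parent or a freshly looked-up inode; only in the latter case does the caller own an extra reference to drop. A minimal sketch using only names from this hunk:)

	parent_dir = ceph_get_reply_dir(sb, req->r_parent, rinfo);
	if (IS_ERR(parent_dir))
		return PTR_ERR(parent_dir);
	/* ... fill the dir inode and do the dentry work against parent_dir ... */
	if (!IS_ERR_OR_NULL(parent_dir) && parent_dir != req->r_parent)
		iput(parent_dir);	/* extra ref exists only when a different inode was returned */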
@@ -2487,22 +2545,21 @@ int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode,
int truncate_retry = 20; /* The RMW will take around 50ms */
struct dentry *dentry;
char *path;
- int pathlen;
- u64 pathbase;
bool do_sync = false;
dentry = d_find_alias(inode);
if (!dentry) {
do_sync = true;
} else {
- path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
if (IS_ERR(path)) {
do_sync = true;
err = 0;
} else {
err = ceph_mds_check_access(mdsc, path, MAY_WRITE);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dentry);
 /* For non-EACCES cases, let the MDS do the auth check */
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 0f497c39ff82..73da2648fa0f 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2221,7 +2221,7 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg)
int count;
dput(dentry);
d_prune_aliases(inode);
- count = atomic_read(&inode->i_count);
+ count = icount_read(inode);
if (count == 1)
(*remaining)--;
doutc(cl, "%p %llx.%llx cap %p pruned, count now %d\n",
@@ -2681,8 +2681,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
* ceph_mdsc_build_path - build a path string to a given dentry
* @mdsc: mds client
* @dentry: dentry to which path should be built
- * @plen: returned length of string
- * @pbase: returned base inode number
+ * @path_info: output path, length, base ino+snap, and freepath ownership flag
* @for_wire: is this path going to be sent to the MDS?
*
* Build a string that represents the path to the dentry. This is mostly called
@@ -2700,7 +2699,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
* foo/.snap/bar -> foo//bar
*/
char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
- int *plen, u64 *pbase, int for_wire)
+ struct ceph_path_info *path_info, int for_wire)
{
struct ceph_client *cl = mdsc->fsc->client;
struct dentry *cur;
@@ -2810,16 +2809,28 @@ retry:
return ERR_PTR(-ENAMETOOLONG);
}
- *pbase = base;
- *plen = PATH_MAX - 1 - pos;
+ /* Initialize the output structure */
+ memset(path_info, 0, sizeof(*path_info));
+
+ path_info->vino.ino = base;
+ path_info->pathlen = PATH_MAX - 1 - pos;
+ path_info->path = path + pos;
+ path_info->freepath = true;
+
+ /* Set snap from dentry if available */
+ if (d_inode(dentry))
+ path_info->vino.snap = ceph_snap(d_inode(dentry));
+ else
+ path_info->vino.snap = CEPH_NOSNAP;
+
doutc(cl, "on %p %d built %llx '%.*s'\n", dentry, d_count(dentry),
- base, *plen, path + pos);
+ base, PATH_MAX - 1 - pos, path + pos);
return path + pos;
}
static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
- struct inode *dir, const char **ppath, int *ppathlen,
- u64 *pino, bool *pfreepath, bool parent_locked)
+ struct inode *dir, struct ceph_path_info *path_info,
+ bool parent_locked)
{
char *path;
@@ -2828,41 +2839,47 @@ static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry
dir = d_inode_rcu(dentry->d_parent);
if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP &&
!IS_ENCRYPTED(dir)) {
- *pino = ceph_ino(dir);
+ path_info->vino.ino = ceph_ino(dir);
+ path_info->vino.snap = ceph_snap(dir);
rcu_read_unlock();
- *ppath = dentry->d_name.name;
- *ppathlen = dentry->d_name.len;
+ path_info->path = dentry->d_name.name;
+ path_info->pathlen = dentry->d_name.len;
+ path_info->freepath = false;
return 0;
}
rcu_read_unlock();
- path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
+ path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1);
if (IS_ERR(path))
return PTR_ERR(path);
- *ppath = path;
- *pfreepath = true;
+ /*
+ * ceph_mdsc_build_path already fills path_info, including snap handling.
+ */
return 0;
}
-static int build_inode_path(struct inode *inode,
- const char **ppath, int *ppathlen, u64 *pino,
- bool *pfreepath)
+static int build_inode_path(struct inode *inode, struct ceph_path_info *path_info)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct dentry *dentry;
char *path;
if (ceph_snap(inode) == CEPH_NOSNAP) {
- *pino = ceph_ino(inode);
- *ppathlen = 0;
+ path_info->vino.ino = ceph_ino(inode);
+ path_info->vino.snap = ceph_snap(inode);
+ path_info->pathlen = 0;
+ path_info->freepath = false;
return 0;
}
dentry = d_find_alias(inode);
- path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
+ path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1);
dput(dentry);
if (IS_ERR(path))
return PTR_ERR(path);
- *ppath = path;
- *pfreepath = true;
+ /*
+ * ceph_mdsc_build_path already fills path_info, including snap from dentry.
+ * Override with inode's snap since that's what this function is for.
+ */
+ path_info->vino.snap = ceph_snap(inode);
return 0;
}
@@ -2872,26 +2889,32 @@ static int build_inode_path(struct inode *inode,
*/
static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rinode,
struct dentry *rdentry, struct inode *rdiri,
- const char *rpath, u64 rino, const char **ppath,
- int *pathlen, u64 *ino, bool *freepath,
+ const char *rpath, u64 rino,
+ struct ceph_path_info *path_info,
bool parent_locked)
{
struct ceph_client *cl = mdsc->fsc->client;
int r = 0;
+ /* Initialize the output structure */
+ memset(path_info, 0, sizeof(*path_info));
+
if (rinode) {
- r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
+ r = build_inode_path(rinode, path_info);
doutc(cl, " inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
ceph_snap(rinode));
} else if (rdentry) {
- r = build_dentry_path(mdsc, rdentry, rdiri, ppath, pathlen, ino,
- freepath, parent_locked);
- doutc(cl, " dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath);
+ r = build_dentry_path(mdsc, rdentry, rdiri, path_info, parent_locked);
+ doutc(cl, " dentry %p %llx/%.*s\n", rdentry, path_info->vino.ino,
+ path_info->pathlen, path_info->path);
} else if (rpath || rino) {
- *ino = rino;
- *ppath = rpath;
- *pathlen = rpath ? strlen(rpath) : 0;
- doutc(cl, " path %.*s\n", *pathlen, rpath);
+ path_info->vino.ino = rino;
+ path_info->vino.snap = CEPH_NOSNAP;
+ path_info->path = rpath;
+ path_info->pathlen = rpath ? strlen(rpath) : 0;
+ path_info->freepath = false;
+
+ doutc(cl, " path %.*s\n", path_info->pathlen, rpath);
}
return r;
@@ -2968,11 +2991,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *msg;
struct ceph_mds_request_head_legacy *lhead;
- const char *path1 = NULL;
- const char *path2 = NULL;
- u64 ino1 = 0, ino2 = 0;
- int pathlen1 = 0, pathlen2 = 0;
- bool freepath1 = false, freepath2 = false;
+ struct ceph_path_info path_info1 = {0};
+ struct ceph_path_info path_info2 = {0};
struct dentry *old_dentry = NULL;
int len;
u16 releases;
@@ -2982,25 +3002,49 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
u16 request_head_version = mds_supported_head_version(session);
kuid_t caller_fsuid = req->r_cred->fsuid;
kgid_t caller_fsgid = req->r_cred->fsgid;
+ bool parent_locked = test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
- req->r_parent, req->r_path1, req->r_ino1.ino,
- &path1, &pathlen1, &ino1, &freepath1,
- test_bit(CEPH_MDS_R_PARENT_LOCKED,
- &req->r_req_flags));
+ req->r_parent, req->r_path1, req->r_ino1.ino,
+ &path_info1, parent_locked);
if (ret < 0) {
msg = ERR_PTR(ret);
goto out;
}
+ /*
+ * When the parent directory's i_rwsem is *not* locked, req->r_parent may
+ * have become stale (e.g. after a concurrent rename) between the time the
+ * dentry was looked up and now. If we detect that the stored r_parent
+ * does not match the inode number we just encoded for the request, switch
+ * to the correct inode so that the MDS receives a valid parent reference.
+ */
+ if (!parent_locked && req->r_parent && path_info1.vino.ino &&
+ ceph_ino(req->r_parent) != path_info1.vino.ino) {
+ struct inode *old_parent = req->r_parent;
+ struct inode *correct_dir = ceph_get_inode(mdsc->fsc->sb, path_info1.vino, NULL);
+ if (!IS_ERR(correct_dir)) {
+ WARN_ONCE(1, "ceph: r_parent mismatch (had %llx wanted %llx) - updating\n",
+ ceph_ino(old_parent), path_info1.vino.ino);
+ /*
+ * Transfer CEPH_CAP_PIN from the old parent to the new one.
+ * The pin was taken earlier in ceph_mdsc_submit_request().
+ */
+ ceph_put_cap_refs(ceph_inode(old_parent), CEPH_CAP_PIN);
+ iput(old_parent);
+ req->r_parent = correct_dir;
+ ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
+ }
+ }
+
/* If r_old_dentry is set, then assume that its parent is locked */
if (req->r_old_dentry &&
!(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED))
old_dentry = req->r_old_dentry;
ret = set_request_path_attr(mdsc, NULL, old_dentry,
- req->r_old_dentry_dir,
- req->r_path2, req->r_ino2.ino,
- &path2, &pathlen2, &ino2, &freepath2, true);
+ req->r_old_dentry_dir,
+ req->r_path2, req->r_ino2.ino,
+ &path_info2, true);
if (ret < 0) {
msg = ERR_PTR(ret);
goto out_free1;
@@ -3031,7 +3075,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
/* filepaths */
len += 2 * (1 + sizeof(u32) + sizeof(u64));
- len += pathlen1 + pathlen2;
+ len += path_info1.pathlen + path_info2.pathlen;
/* cap releases */
len += sizeof(struct ceph_mds_request_release) *
@@ -3039,9 +3083,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
!!req->r_old_inode_drop + !!req->r_old_dentry_drop);
if (req->r_dentry_drop)
- len += pathlen1;
+ len += path_info1.pathlen;
if (req->r_old_dentry_drop)
- len += pathlen2;
+ len += path_info2.pathlen;
/* MClientRequest tail */
@@ -3154,8 +3198,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
lhead->ino = cpu_to_le64(req->r_deleg_ino);
lhead->args = req->r_args;
- ceph_encode_filepath(&p, end, ino1, path1);
- ceph_encode_filepath(&p, end, ino2, path2);
+ ceph_encode_filepath(&p, end, path_info1.vino.ino, path_info1.path);
+ ceph_encode_filepath(&p, end, path_info2.vino.ino, path_info2.path);
/* make note of release offset, in case we need to replay */
req->r_request_release_offset = p - msg->front.iov_base;
@@ -3218,11 +3262,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
msg->hdr.data_off = cpu_to_le16(0);
out_free2:
- if (freepath2)
- ceph_mdsc_free_path((char *)path2, pathlen2);
+ ceph_mdsc_free_path_info(&path_info2);
out_free1:
- if (freepath1)
- ceph_mdsc_free_path((char *)path1, pathlen1);
+ ceph_mdsc_free_path_info(&path_info1);
out:
return msg;
out_err:
@@ -4579,24 +4621,20 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
struct ceph_pagelist *pagelist = recon_state->pagelist;
struct dentry *dentry;
struct ceph_cap *cap;
- char *path;
- int pathlen = 0, err;
- u64 pathbase;
+ struct ceph_path_info path_info = {0};
+ int err;
u64 snap_follows;
dentry = d_find_primary(inode);
if (dentry) {
/* set pathbase to parent dir when msg_version >= 2 */
- path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase,
+ char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info,
recon_state->msg_version >= 2);
dput(dentry);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out_err;
}
- } else {
- path = NULL;
- pathbase = 0;
}
spin_lock(&ci->i_ceph_lock);
@@ -4629,7 +4667,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
rec.v2.issued = cpu_to_le32(cap->issued);
rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
- rec.v2.pathbase = cpu_to_le64(pathbase);
+ rec.v2.pathbase = cpu_to_le64(path_info.vino.ino);
rec.v2.flock_len = (__force __le32)
((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
} else {
@@ -4644,7 +4682,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
ts = inode_get_atime(inode);
ceph_encode_timespec64(&rec.v1.atime, &ts);
rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
- rec.v1.pathbase = cpu_to_le64(pathbase);
+ rec.v1.pathbase = cpu_to_le64(path_info.vino.ino);
}
if (list_empty(&ci->i_cap_snaps)) {
@@ -4706,7 +4744,7 @@ encode_again:
sizeof(struct ceph_filelock);
rec.v2.flock_len = cpu_to_le32(struct_len);
- struct_len += sizeof(u32) + pathlen + sizeof(rec.v2);
+ struct_len += sizeof(u32) + path_info.pathlen + sizeof(rec.v2);
if (struct_v >= 2)
struct_len += sizeof(u64); /* snap_follows */
@@ -4730,7 +4768,7 @@ encode_again:
ceph_pagelist_encode_8(pagelist, 1);
ceph_pagelist_encode_32(pagelist, struct_len);
}
- ceph_pagelist_encode_string(pagelist, path, pathlen);
+ ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen);
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
ceph_locks_to_pagelist(flocks, pagelist,
num_fcntl_locks, num_flock_locks);
@@ -4741,17 +4779,17 @@ out_freeflocks:
} else {
err = ceph_pagelist_reserve(pagelist,
sizeof(u64) + sizeof(u32) +
- pathlen + sizeof(rec.v1));
+ path_info.pathlen + sizeof(rec.v1));
if (err)
goto out_err;
ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
- ceph_pagelist_encode_string(pagelist, path, pathlen);
+ ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen);
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
}
out_err:
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
if (!err)
recon_state->nr_caps++;
return err;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 3e2a6fa7c19a..0428a5eaf28c 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -617,14 +617,24 @@ extern int ceph_mds_check_access(struct ceph_mds_client *mdsc, char *tpath,
extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
-static inline void ceph_mdsc_free_path(char *path, int len)
+/*
+ * Structure to group path-related output parameters for build_*_path functions
+ */
+struct ceph_path_info {
+ const char *path;
+ int pathlen;
+ struct ceph_vino vino;
+ bool freepath;
+};
+
+static inline void ceph_mdsc_free_path_info(const struct ceph_path_info *path_info)
{
- if (!IS_ERR_OR_NULL(path))
- __putname(path - (PATH_MAX - 1 - len));
+ if (path_info && path_info->freepath && !IS_ERR_OR_NULL(path_info->path))
+ __putname((char *)path_info->path - (PATH_MAX - 1 - path_info->pathlen));
}
extern char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc,
- struct dentry *dentry, int *plen, u64 *base,
+ struct dentry *dentry, struct ceph_path_info *path_info,
int for_wire);
extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
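(Aside, not part of the patch: the calling convention the header change above establishes, as used by the updated ceph call sites. freepath is only set when ceph_mdsc_build_path() actually allocated the buffer, so the free helper is safe to call unconditionally. A minimal sketch:)

	struct ceph_path_info path_info = {0};
	char *path;

	path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
	if (IS_ERR(path)) {
		err = PTR_ERR(path);		/* or fall back, as the call sites above do */
	} else {
		/* path, path_info.pathlen and path_info.vino are all valid here */
		err = ceph_mds_check_access(mdsc, path, MAY_WRITE);
	}
	ceph_mdsc_free_path_info(&path_info);	/* no-op unless path_info.freepath is set */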
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index c3eb651862c5..db6c2db68f96 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -862,7 +862,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
fsc->inode_wq = alloc_workqueue("ceph-inode", WQ_UNBOUND, 0);
if (!fsc->inode_wq)
goto fail_client;
- fsc->cap_wq = alloc_workqueue("ceph-cap", 0, 1);
+ fsc->cap_wq = alloc_workqueue("ceph-cap", WQ_PERCPU, 1);
if (!fsc->cap_wq)
goto fail_inode_wq;
@@ -1042,7 +1042,7 @@ static const struct super_operations ceph_super_ops = {
.alloc_inode = ceph_alloc_inode,
.free_inode = ceph_free_inode,
.write_inode = ceph_write_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = ceph_evict_inode,
.sync_fs = ceph_sync_fs,
.put_super = ceph_put_super,
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index cf176aab0f82..25d8bacbcf44 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -463,6 +463,7 @@ struct ceph_inode_info {
unsigned long i_work_mask;
#ifdef CONFIG_FS_ENCRYPTION
+ struct fscrypt_inode_info *i_crypt_info;
u32 fscrypt_auth_len;
u32 fscrypt_file_len;
u8 *fscrypt_auth;
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 740f18b60c9d..456c4a2efb53 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -36,7 +36,7 @@ static void configfs_free_inode(struct inode *inode)
static const struct super_operations configfs_ops = {
.statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.free_inode = configfs_free_inode,
};
diff --git a/fs/coredump.c b/fs/coredump.c
index fedbead956ed..0d9a5d07a75d 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -345,7 +345,7 @@ static bool coredump_parse(struct core_name *cn, struct coredump_params *cprm,
was_space = false;
err = cn_printf(cn, "%c", '\0');
if (err)
- return err;
+ return false;
(*argv)[(*argc)++] = cn->used;
}
}
@@ -635,7 +635,7 @@ static int umh_coredump_setup(struct subprocess_info *info, struct cred *new)
/*
 * Usermode helpers are children of either
- * system_unbound_wq or of kthreadd. So we know that
+ * system_dfl_wq or of kthreadd. So we know that
* we're starting off with a clean file descriptor
* table. So we should always be able to use
* COREDUMP_PIDFD_NUMBER as our file descriptor value.
@@ -1466,11 +1466,15 @@ static int proc_dostring_coredump(const struct ctl_table *table, int write,
ssize_t retval;
char old_core_pattern[CORENAME_MAX_SIZE];
+ if (write)
+ return proc_dostring(table, write, buffer, lenp, ppos);
+
retval = strscpy(old_core_pattern, core_pattern, CORENAME_MAX_SIZE);
error = proc_dostring(table, write, buffer, lenp, ppos);
if (error)
return error;
+
if (!check_coredump_socket()) {
strscpy(core_pattern, old_core_pattern, retval + 1);
return -EINVAL;
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index b002e9b734f9..12daa85ed941 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -116,9 +116,18 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
inode_nohighmem(inode);
inode->i_data.a_ops = &cramfs_aops;
break;
- default:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFIFO:
+ case S_IFSOCK:
init_special_inode(inode, cramfs_inode->mode,
old_decode_dev(cramfs_inode->size));
+ break;
+ default:
+ printk(KERN_DEBUG "CRAMFS: Invalid file type 0%04o for inode %lu.\n",
+ inode->i_mode, inode->i_ino);
+ iget_failed(inode);
+ return ERR_PTR(-EIO);
}
inode->i_mode = cramfs_inode->mode;
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
index b5dfb0aa405a..464b54610fd3 100644
--- a/fs/crypto/Kconfig
+++ b/fs/crypto/Kconfig
@@ -2,10 +2,9 @@
config FS_ENCRYPTION
bool "FS Encryption (Per-file encryption)"
select CRYPTO
- select CRYPTO_HASH
- select CRYPTO_HKDF
select CRYPTO_SKCIPHER
select CRYPTO_LIB_SHA256
+ select CRYPTO_LIB_SHA512
select KEYS
help
Enable encryption of files and directories. This
@@ -32,8 +31,6 @@ config FS_ENCRYPTION_ALGS
select CRYPTO_CBC
select CRYPTO_CTS
select CRYPTO_ECB
- select CRYPTO_HMAC
- select CRYPTO_SHA512
select CRYPTO_XTS
config FS_ENCRYPTION_INLINE_CRYPT
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 486fcb2ecf13..5f5599020e94 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -113,7 +113,7 @@ out:
int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
sector_t pblk, unsigned int len)
{
- const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
const unsigned int du_bits = ci->ci_data_unit_bits;
const unsigned int du_size = 1U << du_bits;
const unsigned int du_per_page_bits = PAGE_SHIFT - du_bits;
@@ -148,7 +148,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
*/
for (i = 0; i < nr_pages; i++) {
pages[i] = fscrypt_alloc_bounce_page(i == 0 ? GFP_NOFS :
- GFP_NOWAIT | __GFP_NOWARN);
+ GFP_NOWAIT);
if (!pages[i])
break;
}
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index b6ccab524fde..07f9cbfe3ea4 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -173,7 +173,7 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio,
size_t len, size_t offs, gfp_t gfp_flags)
{
const struct inode *inode = folio->mapping->host;
- const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
const unsigned int du_bits = ci->ci_data_unit_bits;
const unsigned int du_size = 1U << du_bits;
struct page *ciphertext_page;
@@ -232,8 +232,9 @@ int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page,
{
if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units))
return -EOPNOTSUPP;
- return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_ENCRYPT,
- lblk_num, page, page, len, offs);
+ return fscrypt_crypt_data_unit(fscrypt_get_inode_info_raw(inode),
+ FS_ENCRYPT, lblk_num, page, page, len,
+ offs);
}
EXPORT_SYMBOL(fscrypt_encrypt_block_inplace);
@@ -255,7 +256,7 @@ int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len,
size_t offs)
{
const struct inode *inode = folio->mapping->host;
- const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
const unsigned int du_bits = ci->ci_data_unit_bits;
const unsigned int du_size = 1U << du_bits;
u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) +
@@ -305,8 +306,9 @@ int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page,
{
if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units))
return -EOPNOTSUPP;
- return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_DECRYPT,
- lblk_num, page, page, len, offs);
+ return fscrypt_crypt_data_unit(fscrypt_get_inode_info_raw(inode),
+ FS_DECRYPT, lblk_num, page, page, len,
+ offs);
}
EXPORT_SYMBOL(fscrypt_decrypt_block_inplace);
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index f9f6713e144f..8e4c213d418b 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -11,7 +11,6 @@
* This has not yet undergone a rigorous security audit.
*/
-#include <crypto/hash.h>
#include <crypto/sha2.h>
#include <crypto/skcipher.h>
#include <linux/export.h>
@@ -94,7 +93,7 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
u8 *out, unsigned int olen)
{
- const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm;
SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
union fscrypt_iv iv;
@@ -138,7 +137,7 @@ static int fname_decrypt(const struct inode *inode,
const struct fscrypt_str *iname,
struct fscrypt_str *oname)
{
- const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm;
SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
union fscrypt_iv iv;
@@ -274,8 +273,9 @@ bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
u32 max_len, u32 *encrypted_len_ret)
{
- return __fscrypt_fname_encrypted_size(&inode->i_crypt_info->ci_policy,
- orig_len, max_len,
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
+
+ return __fscrypt_fname_encrypted_size(&ci->ci_policy, orig_len, max_len,
encrypted_len_ret);
}
EXPORT_SYMBOL_GPL(fscrypt_fname_encrypted_size);
@@ -543,7 +543,7 @@ EXPORT_SYMBOL_GPL(fscrypt_match_name);
*/
u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name)
{
- const struct fscrypt_inode_info *ci = dir->i_crypt_info;
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(dir);
WARN_ON_ONCE(!ci->ci_dirhash_key_initialized);
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index d8b485b9881c..4e8e82a9ccf9 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -11,10 +11,10 @@
#ifndef _FSCRYPT_PRIVATE_H
#define _FSCRYPT_PRIVATE_H
+#include <crypto/sha2.h>
#include <linux/fscrypt.h>
#include <linux/minmax.h>
#include <linux/siphash.h>
-#include <crypto/hash.h>
#include <linux/blk-crypto.h>
#define CONST_STRLEN(str) (sizeof(str) - 1)
@@ -249,8 +249,8 @@ struct fscrypt_prepared_key {
* fscrypt_inode_info - the "encryption key" for an inode
*
* When an encrypted file's key is made available, an instance of this struct is
- * allocated and stored in ->i_crypt_info. Once created, it remains until the
- * inode is evicted.
+ * allocated and a pointer to it is stored in the file's in-memory inode. Once
+ * created, it remains until the inode is evicted.
*/
struct fscrypt_inode_info {
@@ -381,12 +381,8 @@ bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
u32 *encrypted_len_ret);
/* hkdf.c */
-struct fscrypt_hkdf {
- struct crypto_shash *hmac_tfm;
-};
-
-int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
- unsigned int master_key_size);
+void fscrypt_init_hkdf(struct hmac_sha512_key *hkdf, const u8 *master_key,
+ unsigned int master_key_size);
/*
* The list of contexts in which fscrypt uses HKDF. These values are used as
@@ -405,11 +401,9 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
#define HKDF_CONTEXT_KEY_IDENTIFIER_FOR_HW_WRAPPED_KEY \
8 /* info=<empty> */
-int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context,
- const u8 *info, unsigned int infolen,
- u8 *okm, unsigned int okmlen);
-
-void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf);
+void fscrypt_hkdf_expand(const struct hmac_sha512_key *hkdf, u8 context,
+ const u8 *info, unsigned int infolen,
+ u8 *okm, unsigned int okmlen);
/* inline_crypt.c */
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
@@ -517,7 +511,7 @@ struct fscrypt_master_key_secret {
* ->is_hw_wrapped=false, or by the "software secret" that hardware
* derived from this master key if ->is_hw_wrapped=true.
*/
- struct fscrypt_hkdf hkdf;
+ struct hmac_sha512_key hkdf;
/*
* True if this key is a hardware-wrapped key; false if this key is a
@@ -696,7 +690,7 @@ struct fscrypt_master_key *
fscrypt_find_master_key(struct super_block *sb,
const struct fscrypt_key_specifier *mk_spec);
-int fscrypt_get_test_dummy_key_identifier(
+void fscrypt_get_test_dummy_key_identifier(
u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]);
int fscrypt_add_test_dummy_key(struct super_block *sb,
@@ -732,8 +726,8 @@ void fscrypt_destroy_prepared_key(struct super_block *sb,
int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci,
const u8 *raw_key);
-int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci,
- const struct fscrypt_master_key *mk);
+void fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci,
+ const struct fscrypt_master_key *mk);
void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci,
const struct fscrypt_master_key *mk);
diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c
index b1ef506cd341..706f56d0076e 100644
--- a/fs/crypto/hkdf.c
+++ b/fs/crypto/hkdf.c
@@ -1,5 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
/*
+ * Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation
+ * Function"), aka RFC 5869. See also the original paper (Krawczyk 2010):
+ * "Cryptographic Extraction and Key Derivation: The HKDF Scheme".
+ *
* This is used to derive keys from the fscrypt master keys (or from the
* "software secrets" which hardware derives from the fscrypt master keys, in
* the case that the fscrypt master keys are hardware-wrapped keys).
@@ -7,10 +11,6 @@
* Copyright 2019 Google LLC
*/
-#include <crypto/hash.h>
-#include <crypto/hkdf.h>
-#include <crypto/sha2.h>
-
#include "fscrypt_private.h"
/*
@@ -24,7 +24,6 @@
* HKDF-SHA512 being much faster than HKDF-SHA256, as the longer digest size of
* SHA-512 causes HKDF-Expand to only need to do one iteration rather than two.
*/
-#define HKDF_HMAC_ALG "hmac(sha512)"
#define HKDF_HASHLEN SHA512_DIGEST_SIZE
/*
@@ -44,54 +43,24 @@
*/
/*
- * Compute HKDF-Extract using the given master key as the input keying material,
- * and prepare an HMAC transform object keyed by the resulting pseudorandom key.
- *
- * Afterwards, the keyed HMAC transform object can be used for HKDF-Expand many
- * times without having to recompute HKDF-Extract each time.
+ * Compute HKDF-Extract using 'master_key' as the input keying material, and
+ * prepare the resulting HMAC key in 'hkdf'. Afterwards, 'hkdf' can be used for
+ * HKDF-Expand many times without having to recompute HKDF-Extract each time.
*/
-int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
- unsigned int master_key_size)
+void fscrypt_init_hkdf(struct hmac_sha512_key *hkdf, const u8 *master_key,
+ unsigned int master_key_size)
{
- struct crypto_shash *hmac_tfm;
static const u8 default_salt[HKDF_HASHLEN];
u8 prk[HKDF_HASHLEN];
- int err;
-
- hmac_tfm = crypto_alloc_shash(HKDF_HMAC_ALG, 0, FSCRYPT_CRYPTOAPI_MASK);
- if (IS_ERR(hmac_tfm)) {
- fscrypt_err(NULL, "Error allocating " HKDF_HMAC_ALG ": %ld",
- PTR_ERR(hmac_tfm));
- return PTR_ERR(hmac_tfm);
- }
-
- if (WARN_ON_ONCE(crypto_shash_digestsize(hmac_tfm) != sizeof(prk))) {
- err = -EINVAL;
- goto err_free_tfm;
- }
-
- err = hkdf_extract(hmac_tfm, master_key, master_key_size,
- default_salt, HKDF_HASHLEN, prk);
- if (err)
- goto err_free_tfm;
-
- err = crypto_shash_setkey(hmac_tfm, prk, sizeof(prk));
- if (err)
- goto err_free_tfm;
- hkdf->hmac_tfm = hmac_tfm;
- goto out;
-
-err_free_tfm:
- crypto_free_shash(hmac_tfm);
-out:
+ hmac_sha512_usingrawkey(default_salt, sizeof(default_salt),
+ master_key, master_key_size, prk);
+ hmac_sha512_preparekey(hkdf, prk, sizeof(prk));
memzero_explicit(prk, sizeof(prk));
- return err;
}
/*
- * HKDF-Expand (RFC 5869 section 2.3). This expands the pseudorandom key, which
- * was already keyed into 'hkdf->hmac_tfm' by fscrypt_init_hkdf(), into 'okmlen'
+ * HKDF-Expand (RFC 5869 section 2.3). Expand the HMAC key 'hkdf' into 'okmlen'
* bytes of output keying material parameterized by the application-specific
* 'info' of length 'infolen' bytes, prefixed by "fscrypt\0" and the 'context'
* byte. This is thread-safe and may be called by multiple threads in parallel.
@@ -100,30 +69,32 @@ out:
* adds to its application-specific info strings to guarantee that it doesn't
* accidentally repeat an info string when using HKDF for different purposes.)
*/
-int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context,
- const u8 *info, unsigned int infolen,
- u8 *okm, unsigned int okmlen)
-{
- SHASH_DESC_ON_STACK(desc, hkdf->hmac_tfm);
- u8 *full_info;
- int err;
-
- full_info = kzalloc(infolen + 9, GFP_KERNEL);
- if (!full_info)
- return -ENOMEM;
- desc->tfm = hkdf->hmac_tfm;
-
- memcpy(full_info, "fscrypt\0", 8);
- full_info[8] = context;
- memcpy(full_info + 9, info, infolen);
-
- err = hkdf_expand(hkdf->hmac_tfm, full_info, infolen + 9,
- okm, okmlen);
- kfree_sensitive(full_info);
- return err;
-}
-
-void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf)
+void fscrypt_hkdf_expand(const struct hmac_sha512_key *hkdf, u8 context,
+ const u8 *info, unsigned int infolen,
+ u8 *okm, unsigned int okmlen)
{
- crypto_free_shash(hkdf->hmac_tfm);
+ struct hmac_sha512_ctx ctx;
+ u8 counter = 1;
+ u8 tmp[HKDF_HASHLEN];
+
+ WARN_ON_ONCE(okmlen > 255 * HKDF_HASHLEN);
+
+ for (unsigned int i = 0; i < okmlen; i += HKDF_HASHLEN) {
+ hmac_sha512_init(&ctx, hkdf);
+ if (i != 0)
+ hmac_sha512_update(&ctx, &okm[i - HKDF_HASHLEN],
+ HKDF_HASHLEN);
+ hmac_sha512_update(&ctx, "fscrypt\0", 8);
+ hmac_sha512_update(&ctx, &context, 1);
+ hmac_sha512_update(&ctx, info, infolen);
+ hmac_sha512_update(&ctx, &counter, 1);
+ if (okmlen - i < HKDF_HASHLEN) {
+ hmac_sha512_final(&ctx, tmp);
+ memcpy(&okm[i], tmp, okmlen - i);
+ memzero_explicit(tmp, sizeof(tmp));
+ } else {
+ hmac_sha512_final(&ctx, &okm[i]);
+ }
+ counter++;
+ }
}
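(Aside, not part of the patch: with the shash/HKDF crypto API calls replaced by the SHA-512 library, both helpers above are infallible, which is what lets the later keyring and keysetup hunks drop their error plumbing. A hedged caller sketch; master_key, master_key_size and nonce are stand-ins for whatever the real call site holds:)

	struct hmac_sha512_key hkdf;
	u8 okm[32];	/* illustrative output length; real callers pass mode->keysize etc. */

	/* One-time HKDF-Extract: turn the master key into a prepared HMAC key. */
	fscrypt_init_hkdf(&hkdf, master_key, master_key_size);

	/* HKDF-Expand, parameterized by "fscrypt\0" || context byte || info. */
	fscrypt_hkdf_expand(&hkdf, HKDF_CONTEXT_PER_FILE_ENC_KEY,
			    nonce, FSCRYPT_FILE_NONCE_SIZE, okm, sizeof(okm));

	/* Wipe derived key material once it has been consumed. */
	memzero_explicit(okm, sizeof(okm));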
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index e0b32ac841f7..b97de0d1430f 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -199,13 +199,13 @@ int fscrypt_prepare_setflags(struct inode *inode,
err = fscrypt_require_key(inode);
if (err)
return err;
- ci = inode->i_crypt_info;
+ ci = fscrypt_get_inode_info_raw(inode);
if (ci->ci_policy.version != FSCRYPT_POLICY_V2)
return -EINVAL;
mk = ci->ci_master_key;
down_read(&mk->mk_sem);
if (mk->mk_present)
- err = fscrypt_derive_dirhash_key(ci, mk);
+ fscrypt_derive_dirhash_key(ci, mk);
else
err = -ENOKEY;
up_read(&mk->mk_sem);
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index caaff809765b..5dee7c498bc8 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -263,7 +263,7 @@ int fscrypt_derive_sw_secret(struct super_block *sb,
bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode)
{
- return inode->i_crypt_info->ci_inlinecrypt;
+ return fscrypt_get_inode_info_raw(inode)->ci_inlinecrypt;
}
EXPORT_SYMBOL_GPL(__fscrypt_inode_uses_inline_crypto);
@@ -307,7 +307,7 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode,
if (!fscrypt_inode_uses_inline_crypto(inode))
return;
- ci = inode->i_crypt_info;
+ ci = fscrypt_get_inode_info_raw(inode);
fscrypt_generate_dun(ci, first_lblk, dun);
bio_crypt_set_ctx(bio, ci->ci_enc_key.blk_key, dun, gfp_mask);
@@ -385,22 +385,24 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode,
u64 next_lblk)
{
const struct bio_crypt_ctx *bc = bio->bi_crypt_context;
+ const struct fscrypt_inode_info *ci;
u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
if (!!bc != fscrypt_inode_uses_inline_crypto(inode))
return false;
if (!bc)
return true;
+ ci = fscrypt_get_inode_info_raw(inode);
/*
* Comparing the key pointers is good enough, as all I/O for each key
* uses the same pointer. I.e., there's currently no need to support
* merging requests where the keys are the same but the pointers differ.
*/
- if (bc->bc_key != inode->i_crypt_info->ci_enc_key.blk_key)
+ if (bc->bc_key != ci->ci_enc_key.blk_key)
return false;
- fscrypt_generate_dun(inode->i_crypt_info, next_lblk, next_dun);
+ fscrypt_generate_dun(ci, next_lblk, next_dun);
return bio_crypt_dun_is_contiguous(bc, bio->bi_iter.bi_size, next_dun);
}
EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio);
@@ -502,7 +504,7 @@ u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks)
if (nr_blocks <= 1)
return nr_blocks;
- ci = inode->i_crypt_info;
+ ci = fscrypt_get_inode_info_raw(inode);
if (!(fscrypt_policy_flags(&ci->ci_policy) &
FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32))
return nr_blocks;
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index 7557f6a88b8f..3adbd7167055 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -42,7 +42,6 @@ struct fscrypt_keyring {
static void wipe_master_key_secret(struct fscrypt_master_key_secret *secret)
{
- fscrypt_destroy_hkdf(&secret->hkdf);
memzero_explicit(secret, sizeof(*secret));
}
@@ -587,21 +586,17 @@ static int add_master_key(struct super_block *sb,
keyid_kdf_ctx =
HKDF_CONTEXT_KEY_IDENTIFIER_FOR_HW_WRAPPED_KEY;
}
- err = fscrypt_init_hkdf(&secret->hkdf, kdf_key, kdf_key_size);
+ fscrypt_init_hkdf(&secret->hkdf, kdf_key, kdf_key_size);
/*
* Now that the KDF context is initialized, the raw KDF key is
* no longer needed.
*/
memzero_explicit(kdf_key, kdf_key_size);
- if (err)
- return err;
/* Calculate the key identifier */
- err = fscrypt_hkdf_expand(&secret->hkdf, keyid_kdf_ctx, NULL, 0,
- key_spec->u.identifier,
- FSCRYPT_KEY_IDENTIFIER_SIZE);
- if (err)
- return err;
+ fscrypt_hkdf_expand(&secret->hkdf, keyid_kdf_ctx, NULL, 0,
+ key_spec->u.identifier,
+ FSCRYPT_KEY_IDENTIFIER_SIZE);
}
return do_add_master_key(sb, secret, key_spec);
}
@@ -835,24 +830,17 @@ fscrypt_get_test_dummy_secret(struct fscrypt_master_key_secret *secret)
memcpy(secret->bytes, test_key, sizeof(test_key));
}
-int fscrypt_get_test_dummy_key_identifier(
+void fscrypt_get_test_dummy_key_identifier(
u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE])
{
struct fscrypt_master_key_secret secret;
- int err;
fscrypt_get_test_dummy_secret(&secret);
-
- err = fscrypt_init_hkdf(&secret.hkdf, secret.bytes, secret.size);
- if (err)
- goto out;
- err = fscrypt_hkdf_expand(&secret.hkdf,
- HKDF_CONTEXT_KEY_IDENTIFIER_FOR_RAW_KEY,
- NULL, 0, key_identifier,
- FSCRYPT_KEY_IDENTIFIER_SIZE);
-out:
+ fscrypt_init_hkdf(&secret.hkdf, secret.bytes, secret.size);
+ fscrypt_hkdf_expand(&secret.hkdf,
+ HKDF_CONTEXT_KEY_IDENTIFIER_FOR_RAW_KEY, NULL, 0,
+ key_identifier, FSCRYPT_KEY_IDENTIFIER_SIZE);
wipe_master_key_secret(&secret);
- return err;
}
/**
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index 4f3b9ecbfe4e..4bd3918f50e3 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -253,11 +253,8 @@ static int setup_per_mode_enc_key(struct fscrypt_inode_info *ci,
sizeof(sb->s_uuid));
hkdf_infolen += sizeof(sb->s_uuid);
}
- err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf,
- hkdf_context, hkdf_info, hkdf_infolen,
- mode_key, mode->keysize);
- if (err)
- goto out_unlock;
+ fscrypt_hkdf_expand(&mk->mk_secret.hkdf, hkdf_context, hkdf_info,
+ hkdf_infolen, mode_key, mode->keysize);
err = fscrypt_prepare_key(prep_key, mode_key, ci);
memzero_explicit(mode_key, mode->keysize);
if (err)
@@ -278,36 +275,25 @@ out_unlock:
* as a pair of 64-bit words. Therefore, on big endian CPUs we have to do an
* endianness swap in order to get the same results as on little endian CPUs.
*/
-static int fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk,
- u8 context, const u8 *info,
- unsigned int infolen, siphash_key_t *key)
+static void fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk,
+ u8 context, const u8 *info,
+ unsigned int infolen, siphash_key_t *key)
{
- int err;
-
- err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, context, info, infolen,
- (u8 *)key, sizeof(*key));
- if (err)
- return err;
-
+ fscrypt_hkdf_expand(&mk->mk_secret.hkdf, context, info, infolen,
+ (u8 *)key, sizeof(*key));
BUILD_BUG_ON(sizeof(*key) != 16);
BUILD_BUG_ON(ARRAY_SIZE(key->key) != 2);
le64_to_cpus(&key->key[0]);
le64_to_cpus(&key->key[1]);
- return 0;
}
-int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci,
- const struct fscrypt_master_key *mk)
+void fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci,
+ const struct fscrypt_master_key *mk)
{
- int err;
-
- err = fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_DIRHASH_KEY,
- ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE,
- &ci->ci_dirhash_key);
- if (err)
- return err;
+ fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_DIRHASH_KEY,
+ ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE,
+ &ci->ci_dirhash_key);
ci->ci_dirhash_key_initialized = true;
- return 0;
}
void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci,
@@ -338,17 +324,12 @@ static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_inode_info *ci,
if (mk->mk_ino_hash_key_initialized)
goto unlock;
- err = fscrypt_derive_siphash_key(mk,
- HKDF_CONTEXT_INODE_HASH_KEY,
- NULL, 0, &mk->mk_ino_hash_key);
- if (err)
- goto unlock;
+ fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_INODE_HASH_KEY,
+ NULL, 0, &mk->mk_ino_hash_key);
/* pairs with smp_load_acquire() above */
smp_store_release(&mk->mk_ino_hash_key_initialized, true);
unlock:
mutex_unlock(&fscrypt_mode_key_setup_mutex);
- if (err)
- return err;
}
/*
@@ -402,13 +383,10 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci,
} else {
u8 derived_key[FSCRYPT_MAX_RAW_KEY_SIZE];
- err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf,
- HKDF_CONTEXT_PER_FILE_ENC_KEY,
- ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE,
- derived_key, ci->ci_mode->keysize);
- if (err)
- return err;
-
+ fscrypt_hkdf_expand(&mk->mk_secret.hkdf,
+ HKDF_CONTEXT_PER_FILE_ENC_KEY,
+ ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE,
+ derived_key, ci->ci_mode->keysize);
err = fscrypt_set_per_file_enc_key(ci, derived_key);
memzero_explicit(derived_key, ci->ci_mode->keysize);
}
@@ -416,11 +394,8 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci,
return err;
/* Derive a secret dirhash key for directories that need it. */
- if (need_dirhash_key) {
- err = fscrypt_derive_dirhash_key(ci, mk);
- if (err)
- return err;
- }
+ if (need_dirhash_key)
+ fscrypt_derive_dirhash_key(ci, mk);
return 0;
}
@@ -642,15 +617,16 @@ fscrypt_setup_encryption_info(struct inode *inode,
goto out;
/*
- * For existing inodes, multiple tasks may race to set ->i_crypt_info.
- * So use cmpxchg_release(). This pairs with the smp_load_acquire() in
- * fscrypt_get_inode_info(). I.e., here we publish ->i_crypt_info with
- * a RELEASE barrier so that other tasks can ACQUIRE it.
+ * For existing inodes, multiple tasks may race to set the inode's
+ * fscrypt info pointer. So use cmpxchg_release(). This pairs with the
+ * smp_load_acquire() in fscrypt_get_inode_info(). I.e., publish the
+ * pointer with a RELEASE barrier so that other tasks can ACQUIRE it.
*/
- if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) {
+ if (cmpxchg_release(fscrypt_inode_info_addr(inode), NULL, crypt_info) ==
+ NULL) {
/*
- * We won the race and set ->i_crypt_info to our crypt_info.
- * Now link it into the master key's inode list.
+ * We won the race and set the inode's fscrypt info to our
+ * crypt_info. Now link it into the master key's inode list.
*/
if (mk) {
crypt_info->ci_master_key = mk;
@@ -681,13 +657,13 @@ out:
* %false unless the operation being performed is needed in
* order for files (or directories) to be deleted.
*
- * Set up ->i_crypt_info, if it hasn't already been done.
+ * Set up the inode's encryption key, if it hasn't already been done.
*
- * Note: unless ->i_crypt_info is already set, this isn't %GFP_NOFS-safe. So
+ * Note: unless the key setup was already done, this isn't %GFP_NOFS-safe. So
* generally this shouldn't be called from within a filesystem transaction.
*
- * Return: 0 if ->i_crypt_info was set or was already set, *or* if the
- * encryption key is unavailable. (Use fscrypt_has_encryption_key() to
+ * Return: 0 if the key is now set up, *or* if it couldn't be set up because the
+ * needed master key is absent. (Use fscrypt_has_encryption_key() to
* distinguish these cases.) Also can return another -errno code.
*/
int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported)
@@ -741,9 +717,9 @@ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported)
* ->i_ino doesn't need to be set yet.
* @encrypt_ret: (output) set to %true if the new inode will be encrypted
*
- * If the directory is encrypted, set up its ->i_crypt_info in preparation for
+ * If the directory is encrypted, set up its encryption key in preparation for
* encrypting the name of the new file. Also, if the new inode will be
- * encrypted, set up its ->i_crypt_info and set *encrypt_ret=true.
+ * encrypted, set up its encryption key too and set *encrypt_ret=true.
*
* This isn't %GFP_NOFS-safe, and therefore it should be called before starting
* any filesystem transaction to create the inode. For this reason, ->i_ino
@@ -752,8 +728,8 @@ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported)
* This doesn't persist the new inode's encryption context. That still needs to
* be done later by calling fscrypt_set_context().
*
- * Return: 0 on success, -ENOKEY if the encryption key is missing, or another
- * -errno code
+ * Return: 0 on success, -ENOKEY if a key needs to be set up for @dir or @inode
+ * but the needed master key is absent, or another -errno code
*/
int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode,
bool *encrypt_ret)
@@ -800,8 +776,16 @@ EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode);
*/
void fscrypt_put_encryption_info(struct inode *inode)
{
- put_crypt_info(inode->i_crypt_info);
- inode->i_crypt_info = NULL;
+ /*
+ * Ideally we'd start with a lightweight IS_ENCRYPTED() check here
+ * before proceeding to retrieve and check the pointer. However, during
+ * inode creation, the fscrypt_inode_info is set before S_ENCRYPTED. If
+ * an error occurs, it needs to be cleaned up regardless.
+ */
+ struct fscrypt_inode_info **ci_addr = fscrypt_inode_info_addr(inode);
+
+ put_crypt_info(*ci_addr);
+ *ci_addr = NULL;
}
EXPORT_SYMBOL(fscrypt_put_encryption_info);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 6ad30ae07c06..bbb2f5ced988 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -727,7 +727,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir)
err = fscrypt_require_key(dir);
if (err)
return ERR_PTR(err);
- return &dir->i_crypt_info->ci_policy;
+ return &fscrypt_get_inode_info_raw(dir)->ci_policy;
}
return fscrypt_get_dummy_policy(dir->i_sb);
@@ -746,7 +746,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir)
*/
int fscrypt_context_for_new_inode(void *ctx, struct inode *inode)
{
- struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
BUILD_BUG_ON(sizeof(union fscrypt_context) !=
FSCRYPT_SET_CONTEXT_MAX_SIZE);
@@ -771,7 +771,7 @@ EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode);
*/
int fscrypt_set_context(struct inode *inode, void *fs_data)
{
- struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ struct fscrypt_inode_info *ci;
union fscrypt_context ctx;
int ctxsize;
@@ -783,6 +783,7 @@ int fscrypt_set_context(struct inode *inode, void *fs_data)
* This may be the first time the inode number is available, so do any
* delayed key setup that requires the inode number.
*/
+ ci = fscrypt_get_inode_info_raw(inode);
if (ci->ci_policy.version == FSCRYPT_POLICY_V2 &&
(ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32))
fscrypt_hash_inode_number(ci, ci->ci_master_key);
@@ -826,10 +827,8 @@ int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param,
policy->version = FSCRYPT_POLICY_V2;
policy->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
policy->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
- err = fscrypt_get_test_dummy_key_identifier(
+ fscrypt_get_test_dummy_key_identifier(
policy->v2.master_key_identifier);
- if (err)
- goto out;
} else {
err = -EINVAL;
goto out;
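
The fs/crypto hunks above replace direct inode->i_crypt_info dereferences with the fscrypt_inode_info_addr()/fscrypt_get_inode_info_raw() helpers, and the keysetup.c comment describes the one-time publish pattern behind them: the pointer is installed with cmpxchg_release() and read with smp_load_acquire(). A minimal illustrative sketch of that pattern, using hypothetical names rather than the real helpers (which are not shown in this diff):

	struct obj_info;

	/* reader side: pairs with the cmpxchg_release() in obj_publish_info() */
	static struct obj_info *obj_get_info(struct obj_info **slot)
	{
		return smp_load_acquire(slot);
	}

	/* setup side: only the first caller wins; a loser must free its copy */
	static bool obj_publish_info(struct obj_info **slot, struct obj_info *info)
	{
		return cmpxchg_release(slot, NULL, info) == NULL;
	}

The RELEASE on the publishing cmpxchg orders all initialization of *info before the pointer becomes visible; the ACQUIRE on the load orders the reader's subsequent accesses after it, so a non-NULL result always refers to fully initialized data.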
diff --git a/fs/dax.c b/fs/dax.c
index 4229513806be..20ecf652c129 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1743,6 +1743,9 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
loff_t done = 0;
int ret;
+ if (WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC))
+ return -EIO;
+
if (!iomi.len)
return 0;
diff --git a/fs/dcache.c b/fs/dcache.c
index 60046ae23d51..65cc11939654 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2509,8 +2509,8 @@ static inline unsigned start_dir_add(struct inode *dir)
{
preempt_disable_nested();
for (;;) {
- unsigned n = dir->i_dir_seq;
- if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
+ unsigned n = READ_ONCE(dir->i_dir_seq);
+ if (!(n & 1) && try_cmpxchg(&dir->i_dir_seq, &n, n + 1))
return n;
cpu_relax();
}
@@ -2922,6 +2922,7 @@ void d_exchange(struct dentry *dentry1, struct dentry *dentry2)
write_sequnlock(&rename_lock);
}
+EXPORT_SYMBOL(d_exchange);
/**
* d_ancestor - search for an ancestor
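
In the fs/dcache.c hunk above, the open-coded cmpxchg() retry loop becomes READ_ONCE() plus try_cmpxchg(). The point of the idiom is that try_cmpxchg(ptr, &old, new) writes the value it actually observed back into old on failure, so a failed attempt does not need a separate reload before retrying. A small illustrative sketch of the same even/odd sequence-bump pattern (not code from this diff):

	static unsigned int seq_begin_write(unsigned int *seq)
	{
		unsigned int n = READ_ONCE(*seq);

		for (;;) {
			/* proceed only when the count is even (no writer active) */
			if (!(n & 1) && try_cmpxchg(seq, &n, n + 1))
				return n;
			cpu_relax();
			/* re-read; needed when the odd check skipped try_cmpxchg() */
			n = READ_ONCE(*seq);
		}
	}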
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index a0357b0cf362..661a99a7dfbe 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -183,6 +183,9 @@ static int debugfs_reconfigure(struct fs_context *fc)
struct debugfs_fs_info *sb_opts = sb->s_fs_info;
struct debugfs_fs_info *new_opts = fc->s_fs_info;
+ if (!new_opts)
+ return 0;
+
sync_filesystem(sb);
/* structure copy of new mount options to sb */
@@ -282,10 +285,16 @@ static int debugfs_fill_super(struct super_block *sb, struct fs_context *fc)
static int debugfs_get_tree(struct fs_context *fc)
{
+ int err;
+
if (!(debugfs_allow & DEBUGFS_ALLOW_API))
return -EPERM;
- return get_tree_single(fc, debugfs_fill_super);
+ err = get_tree_single(fc, debugfs_fill_super);
+ if (err)
+ return err;
+
+ return debugfs_reconfigure(fc);
}
static void debugfs_free_fc(struct fs_context *fc)
@@ -353,7 +362,8 @@ struct dentry *debugfs_lookup(const char *name, struct dentry *parent)
}
EXPORT_SYMBOL_GPL(debugfs_lookup);
-static struct dentry *start_creating(const char *name, struct dentry *parent)
+static struct dentry *debugfs_start_creating(const char *name,
+ struct dentry *parent)
{
struct dentry *dentry;
int error;
@@ -419,7 +429,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
if (!(mode & S_IFMT))
mode |= S_IFREG;
BUG_ON(!S_ISREG(mode));
- dentry = start_creating(name, parent);
+ dentry = debugfs_start_creating(name, parent);
if (IS_ERR(dentry))
return dentry;
@@ -568,7 +578,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_file_size);
*/
struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
{
- struct dentry *dentry = start_creating(name, parent);
+ struct dentry *dentry = debugfs_start_creating(name, parent);
struct inode *inode;
if (IS_ERR(dentry))
@@ -615,7 +625,7 @@ struct dentry *debugfs_create_automount(const char *name,
debugfs_automount_t f,
void *data)
{
- struct dentry *dentry = start_creating(name, parent);
+ struct dentry *dentry = debugfs_start_creating(name, parent);
struct inode *inode;
if (IS_ERR(dentry))
@@ -678,7 +688,7 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
if (!link)
return ERR_PTR(-ENOMEM);
- dentry = start_creating(name, parent);
+ dentry = debugfs_start_creating(name, parent);
if (IS_ERR(dentry)) {
kfree(link);
return dentry;
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index a23fd524a6ee..a0d75b5c83c6 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -26,6 +26,7 @@
/*
* /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid (refers to <node>)
* /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight
+ * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/release_recover
* /config/dlm/<cluster>/comms/<comm>/nodeid (refers to <comm>)
* /config/dlm/<cluster>/comms/<comm>/local
* /config/dlm/<cluster>/comms/<comm>/addr (write only)
@@ -267,6 +268,7 @@ enum {
enum {
NODE_ATTR_NODEID = 0,
NODE_ATTR_WEIGHT,
+ NODE_ATTR_RELEASE_RECOVER,
};
struct dlm_clusters {
@@ -280,6 +282,8 @@ struct dlm_spaces {
struct dlm_space {
struct config_group group;
struct list_head members;
+ struct list_head members_gone;
+ int members_gone_count;
struct mutex members_lock;
int members_count;
struct dlm_nodes *nds;
@@ -310,6 +314,14 @@ struct dlm_node {
int weight;
int new;
int comm_seq; /* copy of cm->seq when nd->nodeid is set */
+ unsigned int release_recover;
+};
+
+struct dlm_member_gone {
+ int nodeid;
+ unsigned int release_recover;
+
+ struct list_head list; /* space->members_gone */
};
static struct configfs_group_operations clusters_ops = {
@@ -480,6 +492,7 @@ static struct config_group *make_space(struct config_group *g, const char *name)
configfs_add_default_group(&nds->ns_group, &sp->group);
INIT_LIST_HEAD(&sp->members);
+ INIT_LIST_HEAD(&sp->members_gone);
mutex_init(&sp->members_lock);
sp->members_count = 0;
sp->nds = nds;
@@ -587,10 +600,20 @@ static void drop_node(struct config_group *g, struct config_item *i)
{
struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
struct dlm_node *nd = config_item_to_node(i);
+ struct dlm_member_gone *mb_gone;
+
+ mb_gone = kzalloc(sizeof(*mb_gone), GFP_KERNEL);
+ if (!mb_gone)
+ return;
mutex_lock(&sp->members_lock);
list_del(&nd->list);
sp->members_count--;
+
+ mb_gone->nodeid = nd->nodeid;
+ mb_gone->release_recover = nd->release_recover;
+ list_add(&mb_gone->list, &sp->members_gone);
+ sp->members_gone_count++;
mutex_unlock(&sp->members_lock);
config_item_put(i);
@@ -815,12 +838,34 @@ static ssize_t node_weight_store(struct config_item *item, const char *buf,
return len;
}
+static ssize_t node_release_recover_show(struct config_item *item, char *buf)
+{
+ struct dlm_node *n = config_item_to_node(item);
+
+ return sprintf(buf, "%u\n", n->release_recover);
+}
+
+static ssize_t node_release_recover_store(struct config_item *item,
+ const char *buf, size_t len)
+{
+ struct dlm_node *n = config_item_to_node(item);
+ int rc;
+
+ rc = kstrtouint(buf, 0, &n->release_recover);
+ if (rc)
+ return rc;
+
+ return len;
+}
+
CONFIGFS_ATTR(node_, nodeid);
CONFIGFS_ATTR(node_, weight);
+CONFIGFS_ATTR(node_, release_recover);
static struct configfs_attribute *node_attrs[] = {
[NODE_ATTR_NODEID] = &node_attr_nodeid,
[NODE_ATTR_WEIGHT] = &node_attr_weight,
+ [NODE_ATTR_RELEASE_RECOVER] = &node_attr_release_recover,
NULL,
};
@@ -882,9 +927,10 @@ static void put_comm(struct dlm_comm *cm)
int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
int *count_out)
{
+ struct dlm_member_gone *mb_gone, *mb_safe;
+ struct dlm_config_node *nodes, *node;
struct dlm_space *sp;
struct dlm_node *nd;
- struct dlm_config_node *nodes, *node;
int rv, count;
sp = get_space(lsname);
@@ -898,7 +944,7 @@ int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
goto out;
}
- count = sp->members_count;
+ count = sp->members_count + sp->members_gone_count;
nodes = kcalloc(count, sizeof(struct dlm_config_node), GFP_NOFS);
if (!nodes) {
@@ -917,6 +963,20 @@ int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
nd->new = 0;
}
+	/* we delay removing nodes until here, as configfs does
+	 * not support additional attributes for rmdir().
+	 */
+ list_for_each_entry_safe(mb_gone, mb_safe, &sp->members_gone, list) {
+ node->nodeid = mb_gone->nodeid;
+ node->release_recover = mb_gone->release_recover;
+ node->gone = true;
+ node++;
+
+ list_del(&mb_gone->list);
+ sp->members_gone_count--;
+ kfree(mb_gone);
+ }
+
*count_out = count;
*nodes_out = nodes;
rv = 0;
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index 13a3d0b26194..4ebd45f75276 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -17,8 +17,10 @@
struct dlm_config_node {
int nodeid;
int weight;
+ bool gone;
int new;
uint32_t comm_seq;
+ unsigned int release_recover;
};
extern const struct rhashtable_params dlm_rhash_rsb_params;
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 6dd3a524cd35..be938fdf17d9 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -5576,7 +5576,7 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
if (rl->rl_status == DLM_LKSTS_CONVERT && middle_conversion(lkb)) {
/* We may need to adjust grmode depending on other granted locks. */
- log_limit(ls, "%s %x middle convert gr %d rq %d remote %d %x",
+ log_rinfo(ls, "%s %x middle convert gr %d rq %d remote %d %x",
__func__, lkb->lkb_id, lkb->lkb_grmode,
lkb->lkb_rqmode, lkb->lkb_nodeid, lkb->lkb_remid);
rsb_set_flag(r, RSB_RECOVER_CONVERT);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 1929327ffbe1..ddaa76558706 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -186,12 +186,17 @@ static struct kobj_type dlm_ktype = {
static struct kset *dlm_kset;
-static int do_uevent(struct dlm_ls *ls, int in)
+static int do_uevent(struct dlm_ls *ls, int in, unsigned int release_recover)
{
- if (in)
+ char message[512] = {};
+ char *envp[] = { message, NULL };
+
+ if (in) {
kobject_uevent(&ls->ls_kobj, KOBJ_ONLINE);
- else
- kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE);
+ } else {
+ snprintf(message, 511, "RELEASE_RECOVER=%u", release_recover);
+ kobject_uevent_env(&ls->ls_kobj, KOBJ_OFFLINE, envp);
+ }
log_rinfo(ls, "%s the lockspace group...", in ? "joining" : "leaving");
@@ -575,7 +580,7 @@ static int new_lockspace(const char *name, const char *cluster,
current lockspace members are (via configfs) and then tells the
lockspace to start running (via sysfs) in dlm_ls_start(). */
- error = do_uevent(ls, 1);
+ error = do_uevent(ls, 1, 0);
if (error < 0)
goto out_recoverd;
@@ -592,7 +597,7 @@ static int new_lockspace(const char *name, const char *cluster,
return 0;
out_members:
- do_uevent(ls, 0);
+ do_uevent(ls, 0, 0);
dlm_clear_members(ls);
kfree(ls->ls_node_array);
out_recoverd:
@@ -671,19 +676,20 @@ int dlm_new_user_lockspace(const char *name, const char *cluster,
This is because there may be LKBs queued as ASTs that have been unlinked
from their RSBs and are pending deletion once the AST has been delivered */
-static int lockspace_busy(struct dlm_ls *ls, int force)
+static int lockspace_busy(struct dlm_ls *ls, unsigned int release_option)
{
struct dlm_lkb *lkb;
unsigned long id;
int rv = 0;
read_lock_bh(&ls->ls_lkbxa_lock);
- if (force == 0) {
+ if (release_option == DLM_RELEASE_NO_LOCKS) {
xa_for_each(&ls->ls_lkbxa, id, lkb) {
rv = 1;
break;
}
- } else if (force == 1) {
+ } else if (release_option == DLM_RELEASE_UNUSED) {
+ /* TODO: handle this UNUSED option as NO_LOCKS in later patch */
xa_for_each(&ls->ls_lkbxa, id, lkb) {
if (lkb->lkb_nodeid == 0 &&
lkb->lkb_grmode != DLM_LOCK_IV) {
@@ -698,11 +704,11 @@ static int lockspace_busy(struct dlm_ls *ls, int force)
return rv;
}
-static int release_lockspace(struct dlm_ls *ls, int force)
+static int release_lockspace(struct dlm_ls *ls, unsigned int release_option)
{
int busy, rv;
- busy = lockspace_busy(ls, force);
+ busy = lockspace_busy(ls, release_option);
spin_lock_bh(&lslist_lock);
if (ls->ls_create_count == 1) {
@@ -730,8 +736,9 @@ static int release_lockspace(struct dlm_ls *ls, int force)
dlm_device_deregister(ls);
- if (force < 3 && dlm_user_daemon_available())
- do_uevent(ls, 0);
+ if (release_option != DLM_RELEASE_NO_EVENT &&
+ dlm_user_daemon_available())
+ do_uevent(ls, 0, (release_option == DLM_RELEASE_RECOVER));
dlm_recoverd_stop(ls);
@@ -782,25 +789,24 @@ static int release_lockspace(struct dlm_ls *ls, int force)
* lockspace must continue to function as usual, participating in recoveries,
* until this returns.
*
- * Force has 4 possible values:
- * 0 - don't destroy lockspace if it has any LKBs
- * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
- * 2 - destroy lockspace regardless of LKBs
- * 3 - destroy lockspace as part of a forced shutdown
+ * See DLM_RELEASE defines for release_option values and their meaning.
*/
-int dlm_release_lockspace(void *lockspace, int force)
+int dlm_release_lockspace(void *lockspace, unsigned int release_option)
{
struct dlm_ls *ls;
int error;
+ if (release_option > __DLM_RELEASE_MAX)
+ return -EINVAL;
+
ls = dlm_find_lockspace_local(lockspace);
if (!ls)
return -EINVAL;
dlm_put_lockspace(ls);
mutex_lock(&ls_lock);
- error = release_lockspace(ls, force);
+ error = release_lockspace(ls, release_option);
if (!error)
ls_count--;
if (!ls_count)
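
The DLM_RELEASE_* constants used above replace the old numeric force argument of dlm_release_lockspace(); their definitions live in the dlm headers and are not part of the hunks shown here. A plausible mapping, inferred from how the old force values are converted in this diff (treat the exact values and names' placement as an assumption):

	/* assumed layout of the new release options (header change not shown) */
	#define DLM_RELEASE_NO_LOCKS	0	/* was force == 0: don't destroy if any LKBs exist */
	#define DLM_RELEASE_UNUSED	1	/* was force == 1: allow remote LKBs, not local ones */
	#define DLM_RELEASE_NORMAL	2	/* was force == 2: destroy regardless of LKBs */
	#define DLM_RELEASE_NO_EVENT	3	/* was force == 3: forced shutdown, no offline uevent */
	#define DLM_RELEASE_RECOVER	4	/* new: leave and have remaining members recover this node's locks */
	#define __DLM_RELEASE_MAX	DLM_RELEASE_RECOVER

The new DLM_RELEASE_RECOVER case is what drives the RELEASE_RECOVER=1 uevent variable added in do_uevent() and, via configfs, the per-node release_recover attribute consumed in dlm_recover_members().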
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index e4373bce1bc2..9a0b6c2b6b01 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1703,7 +1703,7 @@ static int work_start(void)
return -ENOMEM;
}
- process_workqueue = alloc_workqueue("dlm_process", WQ_HIGHPRI | WQ_BH, 0);
+ process_workqueue = alloc_workqueue("dlm_process", WQ_HIGHPRI | WQ_BH | WQ_PERCPU, 0);
if (!process_workqueue) {
log_print("can't start dlm_process");
destroy_workqueue(io_workqueue);
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
index 4887c8a05318..a44d16da7187 100644
--- a/fs/dlm/main.c
+++ b/fs/dlm/main.c
@@ -52,7 +52,7 @@ static int __init init_dlm(void)
if (error)
goto out_user;
- dlm_wq = alloc_workqueue("dlm_wq", 0, 0);
+ dlm_wq = alloc_workqueue("dlm_wq", WQ_PERCPU, 0);
if (!dlm_wq) {
error = -ENOMEM;
goto out_plock;
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index b0864c93230f..c0f557a80a75 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -478,7 +478,8 @@ static void dlm_lsop_recover_prep(struct dlm_ls *ls)
ls->ls_ops->recover_prep(ls->ls_ops_arg);
}
-static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb)
+static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb,
+ unsigned int release_recover)
{
struct dlm_slot slot;
uint32_t seq;
@@ -495,7 +496,7 @@ static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb)
error = dlm_comm_seq(memb->nodeid, &seq, false);
- if (!error && seq == memb->comm_seq)
+ if (!release_recover && !error && seq == memb->comm_seq)
return;
slot.nodeid = memb->nodeid;
@@ -552,6 +553,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
struct dlm_member *memb, *safe;
struct dlm_config_node *node;
int i, error, neg = 0, low = -1;
+ unsigned int release_recover;
/* previously removed members that we've not finished removing need to
* count as a negative change so the "neg" recovery steps will happen
@@ -569,11 +571,21 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) {
node = find_config_node(rv, memb->nodeid);
- if (node && !node->new)
+ if (!node) {
+ log_error(ls, "remove member %d invalid",
+ memb->nodeid);
+ return -EFAULT;
+ }
+
+ if (!node->new && !node->gone)
continue;
- if (!node) {
- log_rinfo(ls, "remove member %d", memb->nodeid);
+ release_recover = 0;
+
+ if (node->gone) {
+ release_recover = node->release_recover;
+ log_rinfo(ls, "remove member %d%s", memb->nodeid,
+ release_recover ? " (release_recover)" : "");
} else {
/* removed and re-added */
log_rinfo(ls, "remove member %d comm_seq %u %u",
@@ -584,13 +596,16 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
list_move(&memb->list, &ls->ls_nodes_gone);
remove_remote_member(memb->nodeid);
ls->ls_num_nodes--;
- dlm_lsop_recover_slot(ls, memb);
+ dlm_lsop_recover_slot(ls, memb, release_recover);
}
/* add new members to ls_nodes */
for (i = 0; i < rv->nodes_count; i++) {
node = &rv->nodes[i];
+ if (node->gone)
+ continue;
+
if (dlm_is_member(ls, node->nodeid))
continue;
error = dlm_add_member(ls, node);
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index be4240f09abd..3ac020fb8139 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -842,7 +842,7 @@ static void recover_conversion(struct dlm_rsb *r)
*/
if (((lkb->lkb_grmode == DLM_LOCK_PR) && (other_grmode == DLM_LOCK_CW)) ||
((lkb->lkb_grmode == DLM_LOCK_CW) && (other_grmode == DLM_LOCK_PR))) {
- log_limit(ls, "%s %x gr %d rq %d, remote %d %x, other_lkid %u, other gr %d, set gr=NL",
+ log_rinfo(ls, "%s %x gr %d rq %d, remote %d %x, other_lkid %u, other gr %d, set gr=NL",
__func__, lkb->lkb_id, lkb->lkb_grmode,
lkb->lkb_rqmode, lkb->lkb_nodeid,
lkb->lkb_remid, other_lkid, other_grmode);
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 5cb3896be826..51daf4acbe31 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -425,7 +425,7 @@ static int device_create_lockspace(struct dlm_lspace_params *params)
dlm_put_lockspace(ls);
if (error)
- dlm_release_lockspace(lockspace, 0);
+ dlm_release_lockspace(lockspace, DLM_RELEASE_NO_LOCKS);
else
error = ls->ls_device.minor;
@@ -436,7 +436,7 @@ static int device_remove_lockspace(struct dlm_lspace_params *params)
{
dlm_lockspace_t *lockspace;
struct dlm_ls *ls;
- int error, force = 0;
+ int error, force = DLM_RELEASE_NO_LOCKS;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -446,7 +446,7 @@ static int device_remove_lockspace(struct dlm_lspace_params *params)
return -ENOENT;
if (params->flags & DLM_USER_LSFLG_FORCEFREE)
- force = 2;
+ force = DLM_RELEASE_NORMAL;
lockspace = ls;
dlm_put_lockspace(ls);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 72fbe1316ab8..abd954c6a14e 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -634,10 +634,9 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
goto out_lock;
}
- rd.old_mnt_idmap = &nop_mnt_idmap;
+ rd.mnt_idmap = &nop_mnt_idmap;
rd.old_parent = lower_old_dir_dentry;
rd.old_dentry = lower_old_dentry;
- rd.new_mnt_idmap = &nop_mnt_idmap;
rd.new_parent = lower_new_dir_dentry;
rd.new_dentry = lower_new_dentry;
rc = vfs_rename(&rd);
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index c4a139911356..1f4d8ce56667 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -127,7 +127,7 @@ static int efivarfs_unfreeze_fs(struct super_block *sb);
static const struct super_operations efivarfs_ops = {
.statfs = efivarfs_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.alloc_inode = efivarfs_alloc_inode,
.free_inode = efivarfs_free_inode,
.show_options = efivarfs_show_options,
@@ -152,6 +152,10 @@ static int efivarfs_d_compare(const struct dentry *dentry,
{
int guid = len - EFI_VARIABLE_GUID_LEN;
+ /* Parallel lookups may produce a temporarily invalid filename */
+ if (guid <= 0)
+ return 1;
+
if (name->len != len)
return 1;
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 7b26efc271ee..d81f3318417d 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -3,8 +3,18 @@
config EROFS_FS
tristate "EROFS filesystem support"
depends on BLOCK
+ select CACHEFILES if EROFS_FS_ONDEMAND
select CRC32
+ select CRYPTO if EROFS_FS_ZIP_ACCEL
+ select CRYPTO_DEFLATE if EROFS_FS_ZIP_ACCEL
select FS_IOMAP
+ select LZ4_DECOMPRESS if EROFS_FS_ZIP
+ select NETFS_SUPPORT if EROFS_FS_ONDEMAND
+ select XXHASH if EROFS_FS_XATTR
+ select XZ_DEC if EROFS_FS_ZIP_LZMA
+ select XZ_DEC_MICROLZMA if EROFS_FS_ZIP_LZMA
+ select ZLIB_INFLATE if EROFS_FS_ZIP_DEFLATE
+ select ZSTD_DECOMPRESS if EROFS_FS_ZIP_ZSTD
help
EROFS (Enhanced Read-Only File System) is a lightweight read-only
file system with modern designs (e.g. no buffer heads, inline
@@ -38,7 +48,6 @@ config EROFS_FS_DEBUG
config EROFS_FS_XATTR
bool "EROFS extended attributes"
depends on EROFS_FS
- select XXHASH
default y
help
Extended attributes are name:value pairs associated with inodes by
@@ -94,7 +103,6 @@ config EROFS_FS_BACKED_BY_FILE
config EROFS_FS_ZIP
bool "EROFS Data Compression Support"
depends on EROFS_FS
- select LZ4_DECOMPRESS
default y
help
Enable transparent compression support for EROFS file systems.
@@ -104,8 +112,6 @@ config EROFS_FS_ZIP
config EROFS_FS_ZIP_LZMA
bool "EROFS LZMA compressed data support"
depends on EROFS_FS_ZIP
- select XZ_DEC
- select XZ_DEC_MICROLZMA
help
Saying Y here includes support for reading EROFS file systems
containing LZMA compressed data, specifically called microLZMA. It
@@ -117,7 +123,6 @@ config EROFS_FS_ZIP_LZMA
config EROFS_FS_ZIP_DEFLATE
bool "EROFS DEFLATE compressed data support"
depends on EROFS_FS_ZIP
- select ZLIB_INFLATE
help
Saying Y here includes support for reading EROFS file systems
containing DEFLATE compressed data. It gives better compression
@@ -132,7 +137,6 @@ config EROFS_FS_ZIP_DEFLATE
config EROFS_FS_ZIP_ZSTD
bool "EROFS Zstandard compressed data support"
depends on EROFS_FS_ZIP
- select ZSTD_DECOMPRESS
help
Saying Y here includes support for reading EROFS file systems
containing Zstandard compressed data. It gives better compression
@@ -147,8 +151,6 @@ config EROFS_FS_ZIP_ZSTD
config EROFS_FS_ZIP_ACCEL
bool "EROFS hardware decompression support"
depends on EROFS_FS_ZIP
- select CRYPTO
- select CRYPTO_DEFLATE
help
Saying Y here includes hardware accelerator support for reading
EROFS file systems containing compressed data. It gives better
@@ -163,9 +165,7 @@ config EROFS_FS_ZIP_ACCEL
config EROFS_FS_ONDEMAND
bool "EROFS fscache-based on-demand read support (deprecated)"
depends on EROFS_FS
- select NETFS_SUPPORT
select FSCACHE
- select CACHEFILES
select CACHEFILES_ONDEMAND
help
This permits EROFS to use fscache-backed data blobs with on-demand
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 3b1ba571c728..8ca29962a3dd 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -475,6 +475,10 @@ static loff_t erofs_file_llseek(struct file *file, loff_t offset, int whence)
const struct file_operations erofs_file_fops = {
.llseek = erofs_file_llseek,
.read_iter = erofs_file_read_iter,
+ .unlocked_ioctl = erofs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = erofs_compat_ioctl,
+#endif
.mmap_prepare = erofs_file_mmap_prepare,
.get_unmapped_area = thp_get_unmapped_area,
.splice_read = filemap_splice_read,
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
index debf469ad6bd..32b4f5aa60c9 100644
--- a/fs/erofs/dir.c
+++ b/fs/erofs/dir.c
@@ -123,4 +123,8 @@ const struct file_operations erofs_dir_fops = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
.iterate_shared = erofs_readdir,
+ .unlocked_ioctl = erofs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = erofs_compat_ioctl,
+#endif
};
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 377ee12b8b96..3d5738f80072 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -12,10 +12,12 @@
/* to allow for x86 boot sectors and other oddities. */
#define EROFS_SUPER_OFFSET 1024
-#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
-#define EROFS_FEATURE_COMPAT_MTIME 0x00000002
-#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004
+#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
+#define EROFS_FEATURE_COMPAT_MTIME 0x00000002
+#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004
#define EROFS_FEATURE_COMPAT_SHARED_EA_IN_METABOX 0x00000008
+#define EROFS_FEATURE_COMPAT_PLAIN_XATTR_PFX 0x00000010
+
/*
* Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 9a2f59721522..cb780c095d28 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -5,6 +5,7 @@
* Copyright (C) 2021, Alibaba Cloud
*/
#include "xattr.h"
+#include <linux/compat.h>
#include <trace/events/erofs.h>
static int erofs_fill_symlink(struct inode *inode, void *kaddr,
@@ -213,10 +214,7 @@ static int erofs_fill_inode(struct inode *inode)
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_op = &erofs_generic_iops;
- if (erofs_inode_is_data_compressed(vi->datalayout))
- inode->i_fop = &generic_ro_fops;
- else
- inode->i_fop = &erofs_file_fops;
+ inode->i_fop = &erofs_file_fops;
break;
case S_IFDIR:
inode->i_op = &erofs_dir_iops;
@@ -341,6 +339,40 @@ int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
return 0;
}
+static int erofs_ioctl_get_volume_label(struct inode *inode, void __user *arg)
+{
+ struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+ int ret;
+
+ if (!sbi->volume_name)
+ ret = clear_user(arg, 1);
+ else
+ ret = copy_to_user(arg, sbi->volume_name,
+ strlen(sbi->volume_name));
+ return ret ? -EFAULT : 0;
+}
+
+long erofs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ void __user *argp = (void __user *)arg;
+
+ switch (cmd) {
+ case FS_IOC_GETFSLABEL:
+ return erofs_ioctl_get_volume_label(inode, argp);
+ default:
+ return -ENOTTY;
+ }
+}
+
+#ifdef CONFIG_COMPAT
+long erofs_compat_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ return erofs_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
+
const struct inode_operations erofs_generic_iops = {
.getattr = erofs_getattr,
.listxattr = erofs_listxattr,
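
The erofs_ioctl() hunk above wires up the generic FS_IOC_GETFSLABEL ioctl so userspace can read the volume label parsed from the on-disk superblock. A minimal userspace usage sketch (illustrative only; any file or directory on the mounted EROFS image can serve as the target):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>	/* FS_IOC_GETFSLABEL, FSLABEL_MAX */

	int main(int argc, char **argv)
	{
		char label[FSLABEL_MAX] = "";
		int fd = open(argc > 1 ? argv[1] : ".", O_RDONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (ioctl(fd, FS_IOC_GETFSLABEL, label) < 0) {
			perror("FS_IOC_GETFSLABEL");
			return 1;
		}
		printf("label: %s\n", label);
		close(fd);
		return 0;
	}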
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 4ccc5f0ee8df..f7f622836198 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -153,6 +153,7 @@ struct erofs_sb_info {
/* used for statfs, f_files - f_favail */
u64 inos;
+ char *volume_name;
u32 feature_compat;
u32 feature_incompat;
@@ -234,6 +235,7 @@ EROFS_FEATURE_FUNCS(metabox, incompat, INCOMPAT_METABOX)
EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER)
EROFS_FEATURE_FUNCS(shared_ea_in_metabox, compat, COMPAT_SHARED_EA_IN_METABOX)
+EROFS_FEATURE_FUNCS(plain_xattr_pfx, compat, COMPAT_PLAIN_XATTR_PFX)
static inline u64 erofs_nid_to_ino64(struct erofs_sb_info *sbi, erofs_nid_t nid)
{
@@ -535,6 +537,10 @@ static inline struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev) {
static inline void erofs_fscache_submit_bio(struct bio *bio) {}
#endif
+long erofs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
+long erofs_compat_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg);
+
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
#endif /* __EROFS_INTERNAL_H */
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index e1020aa60771..f3f8d8c066e4 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -174,6 +174,11 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
if (!erofs_is_fileio_mode(sbi)) {
dif->dax_dev = fs_dax_get_by_bdev(file_bdev(file),
&dif->dax_part_off, NULL, NULL);
+ if (!dif->dax_dev && test_opt(&sbi->opt, DAX_ALWAYS)) {
+ erofs_info(sb, "DAX unsupported by %s. Turning off DAX.",
+ dif->path);
+ clear_opt(&sbi->opt, DAX_ALWAYS);
+ }
} else if (!S_ISREG(file_inode(file)->i_mode)) {
fput(file);
return -EINVAL;
@@ -210,8 +215,13 @@ static int erofs_scan_devices(struct super_block *sb,
ondisk_extradevs, sbi->devs->extra_devices);
return -EINVAL;
}
- if (!ondisk_extradevs)
+ if (!ondisk_extradevs) {
+ if (test_opt(&sbi->opt, DAX_ALWAYS) && !sbi->dif0.dax_dev) {
+ erofs_info(sb, "DAX unsupported by block device. Turning off DAX.");
+ clear_opt(&sbi->opt, DAX_ALWAYS);
+ }
return 0;
+ }
if (!sbi->devs->extra_devices && !erofs_is_fscache_mode(sb))
sbi->devs->flatdev = true;
@@ -313,8 +323,8 @@ static int erofs_read_superblock(struct super_block *sb)
sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact));
if (erofs_sb_has_48bit(sbi) && dsb->rootnid_8b) {
sbi->root_nid = le64_to_cpu(dsb->rootnid_8b);
- sbi->dif0.blocks = (sbi->dif0.blocks << 32) |
- le16_to_cpu(dsb->rb.blocks_hi);
+ sbi->dif0.blocks = sbi->dif0.blocks |
+ ((u64)le16_to_cpu(dsb->rb.blocks_hi) << 32);
} else {
sbi->root_nid = le16_to_cpu(dsb->rb.rootnid_2b);
}
@@ -333,12 +343,18 @@ static int erofs_read_superblock(struct super_block *sb)
sbi->fixed_nsec = le32_to_cpu(dsb->fixed_nsec);
super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid));
+ if (dsb->volume_name[0]) {
+ sbi->volume_name = kstrndup(dsb->volume_name,
+ sizeof(dsb->volume_name), GFP_KERNEL);
+ if (!sbi->volume_name)
+ return -ENOMEM;
+ }
+
/* parse on-disk compression configurations */
ret = z_erofs_parse_cfgs(sb, dsb);
if (ret < 0)
goto out;
- /* handle multiple devices */
ret = erofs_scan_devices(sb, dsb);
if (erofs_sb_has_48bit(sbi))
@@ -671,14 +687,9 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
return invalfc(fc, "cannot use fsoffset in fscache mode");
}
- if (test_opt(&sbi->opt, DAX_ALWAYS)) {
- if (!sbi->dif0.dax_dev) {
- errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
- clear_opt(&sbi->opt, DAX_ALWAYS);
- } else if (sbi->blkszbits != PAGE_SHIFT) {
- errorfc(fc, "unsupported blocksize for DAX");
- clear_opt(&sbi->opt, DAX_ALWAYS);
- }
+ if (test_opt(&sbi->opt, DAX_ALWAYS) && sbi->blkszbits != PAGE_SHIFT) {
+ erofs_info(sb, "unsupported blocksize for DAX");
+ clear_opt(&sbi->opt, DAX_ALWAYS);
}
sb->s_time_gran = 1;
@@ -818,6 +829,7 @@ static void erofs_sb_free(struct erofs_sb_info *sbi)
kfree(sbi->domain_id);
if (sbi->dif0.file)
fput(sbi->dif0.file);
+ kfree(sbi->volume_name);
kfree(sbi);
}
@@ -1014,10 +1026,22 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
return 0;
}
+static void erofs_evict_inode(struct inode *inode)
+{
+#ifdef CONFIG_FS_DAX
+ if (IS_DAX(inode))
+ dax_break_layout_final(inode);
+#endif
+
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
+}
+
const struct super_operations erofs_sops = {
.put_super = erofs_put_super,
.alloc_inode = erofs_alloc_inode,
.free_inode = erofs_free_inode,
+ .evict_inode = erofs_evict_inode,
.statfs = erofs_statfs,
.show_options = erofs_show_options,
};
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index eaa9efd766ee..396536d9a862 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -482,6 +482,7 @@ int erofs_xattr_prefixes_init(struct super_block *sb)
erofs_off_t pos = (erofs_off_t)sbi->xattr_prefix_start << 2;
struct erofs_xattr_prefix_item *pfs;
int ret = 0, i, len;
+ bool plain = erofs_sb_has_plain_xattr_pfx(sbi);
if (!sbi->xattr_prefix_count)
return 0;
@@ -490,9 +491,15 @@ int erofs_xattr_prefixes_init(struct super_block *sb)
if (!pfs)
return -ENOMEM;
- if (sbi->packed_inode)
- buf.mapping = sbi->packed_inode->i_mapping;
- else
+ if (!plain) {
+ if (erofs_sb_has_metabox(sbi))
+ (void)erofs_init_metabuf(&buf, sb, true);
+ else if (sbi->packed_inode)
+ buf.mapping = sbi->packed_inode->i_mapping;
+ else
+ plain = true;
+ }
+ if (plain)
(void)erofs_init_metabuf(&buf, sb, false);
for (i = 0; i < sbi->xattr_prefix_count; i++) {
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 792f20888a8f..bc80cfe482f7 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -823,9 +823,6 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe)
}
rcu_read_unlock();
}
- } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
- DBG_BUGON(1);
- return -EFSCORRUPTED;
}
if (pcl) {
@@ -1432,6 +1429,16 @@ static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work)
}
#endif
+/* Use (kthread_)work in atomic contexts to minimize scheduling overhead */
+static inline bool z_erofs_in_atomic(void)
+{
+ if (IS_ENABLED(CONFIG_PREEMPTION) && rcu_preempt_depth())
+ return true;
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
+ return true;
+ return !preemptible();
+}
+
static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
int bios)
{
@@ -1446,8 +1453,7 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
if (atomic_add_return(bios, &io->pending_bios))
return;
- /* Use (kthread_)work and sync decompression for atomic contexts only */
- if (!in_task() || irqs_disabled() || rcu_read_lock_any_held()) {
+ if (z_erofs_in_atomic()) {
#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
struct kthread_worker *worker;
@@ -1826,7 +1832,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f,
map->m_la = end;
err = z_erofs_map_blocks_iter(inode, map,
EROFS_GET_BLOCKS_READMORE);
- if (err)
+ if (err || !(map->m_flags & EROFS_MAP_ENCODED))
return;
/* expand ra for the trailing edge if readahead */
@@ -1838,7 +1844,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f,
end = round_up(end, PAGE_SIZE);
} else {
end = round_up(map->m_la, PAGE_SIZE);
- if (!map->m_llen)
+ if (!(map->m_flags & EROFS_MAP_ENCODED) || !map->m_llen)
return;
}
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index a93efd95c555..e5581dbeb4c2 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -394,10 +394,10 @@ static int z_erofs_map_blocks_fo(struct inode *inode,
.map = map,
.in_mbox = erofs_inode_in_metabox(inode),
};
- int err = 0;
- unsigned int endoff, afmt;
+ unsigned int endoff;
unsigned long initial_lcn;
unsigned long long ofs, end;
+ int err;
ofs = flags & EROFS_GET_BLOCKS_FINDTAIL ? inode->i_size - 1 : map->m_la;
if (fragment && !(flags & EROFS_GET_BLOCKS_FINDTAIL) &&
@@ -462,8 +462,8 @@ static int z_erofs_map_blocks_fo(struct inode *inode,
map->m_pa = vi->z_fragmentoff;
map->m_plen = vi->z_idata_size;
if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) {
- erofs_err(sb, "invalid tail-packing pclustersize %llu",
- map->m_plen);
+ erofs_err(sb, "ztailpacking inline data across blocks @ nid %llu",
+ vi->nid);
err = -EFSCORRUPTED;
goto unmap_out;
}
@@ -482,20 +482,15 @@ static int z_erofs_map_blocks_fo(struct inode *inode,
err = -EFSCORRUPTED;
goto unmap_out;
}
- afmt = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER ?
- Z_EROFS_COMPRESSION_INTERLACED :
- Z_EROFS_COMPRESSION_SHIFTED;
+ if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER)
+ map->m_algorithmformat = Z_EROFS_COMPRESSION_INTERLACED;
+ else
+ map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
+ } else if (m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) {
+ map->m_algorithmformat = vi->z_algorithmtype[1];
} else {
- afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ?
- vi->z_algorithmtype[1] : vi->z_algorithmtype[0];
- if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) {
- erofs_err(sb, "inconsistent algorithmtype %u for nid %llu",
- afmt, vi->nid);
- err = -EFSCORRUPTED;
- goto unmap_out;
- }
+ map->m_algorithmformat = vi->z_algorithmtype[0];
}
- map->m_algorithmformat = afmt;
if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
((flags & EROFS_GET_BLOCKS_READMORE) &&
@@ -626,9 +621,9 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map)
{
struct erofs_inode *const vi = EROFS_I(inode);
struct super_block *const sb = inode->i_sb;
- int err, headnr;
- erofs_off_t pos;
struct z_erofs_map_header *h;
+ erofs_off_t pos;
+ int err = 0;
if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) {
/*
@@ -642,7 +637,6 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map)
if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE))
return -ERESTARTSYS;
- err = 0;
if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags))
goto out_unlock;
@@ -679,15 +673,6 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map)
else if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER)
vi->z_idata_size = le16_to_cpu(h->h_idata_size);
- headnr = 0;
- if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX ||
- vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) {
- erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel",
- headnr + 1, vi->z_algorithmtype[headnr], vi->nid);
- err = -EOPNOTSUPP;
- goto out_unlock;
- }
-
if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
@@ -726,6 +711,30 @@ out_unlock:
return err;
}
+static int z_erofs_map_sanity_check(struct inode *inode,
+ struct erofs_map_blocks *map)
+{
+ struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+
+ if (!(map->m_flags & EROFS_MAP_ENCODED))
+ return 0;
+ if (unlikely(map->m_algorithmformat >= Z_EROFS_COMPRESSION_RUNTIME_MAX)) {
+ erofs_err(inode->i_sb, "unknown algorithm %d @ pos %llu for nid %llu, please upgrade kernel",
+ map->m_algorithmformat, map->m_la, EROFS_I(inode)->nid);
+ return -EOPNOTSUPP;
+ }
+ if (unlikely(map->m_algorithmformat < Z_EROFS_COMPRESSION_MAX &&
+ !(sbi->available_compr_algs & (1 << map->m_algorithmformat)))) {
+ erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu",
+ map->m_algorithmformat, EROFS_I(inode)->nid);
+ return -EFSCORRUPTED;
+ }
+ if (unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE ||
+ map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE))
+ return -EOPNOTSUPP;
+ return 0;
+}
+
int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
int flags)
{
@@ -746,10 +755,8 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
else
err = z_erofs_map_blocks_fo(inode, map, flags);
}
- if (!err && (map->m_flags & EROFS_MAP_ENCODED) &&
- unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE ||
- map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE))
- err = -EOPNOTSUPP;
+ if (!err)
+ err = z_erofs_map_sanity_check(inode, map);
if (err)
map->m_llen = 0;
}
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index b22d6f819f78..ee7c4b683ec3 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -46,10 +46,10 @@
*
* 1) epnested_mutex (mutex)
* 2) ep->mtx (mutex)
- * 3) ep->lock (rwlock)
+ * 3) ep->lock (spinlock)
*
* The acquire order is the one listed above, from 1 to 3.
- * We need a rwlock (ep->lock) because we manipulate objects
+ * We need a spinlock (ep->lock) because we manipulate objects
* from inside the poll callback, that might be triggered from
* a wake_up() that in turn might be called from IRQ context.
* So we can't sleep inside the poll callback and hence we need
@@ -195,7 +195,7 @@ struct eventpoll {
struct list_head rdllist;
/* Lock which protects rdllist and ovflist */
- rwlock_t lock;
+ spinlock_t lock;
/* RB tree root used to store monitored fd structs */
struct rb_root_cached rbr;
@@ -741,10 +741,10 @@ static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)
* in a lockless way.
*/
lockdep_assert_irqs_enabled();
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
list_splice_init(&ep->rdllist, txlist);
WRITE_ONCE(ep->ovflist, NULL);
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
}
static void ep_done_scan(struct eventpoll *ep,
@@ -752,7 +752,7 @@ static void ep_done_scan(struct eventpoll *ep,
{
struct epitem *epi, *nepi;
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
/*
* During the time we spent inside the "sproc" callback, some
* other events might have been queued by the poll callback.
@@ -793,7 +793,7 @@ static void ep_done_scan(struct eventpoll *ep,
wake_up(&ep->wq);
}
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
}
static void ep_get(struct eventpoll *ep)
@@ -868,10 +868,10 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
rb_erase_cached(&epi->rbn, &ep->rbr);
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
wakeup_source_unregister(ep_wakeup_source(epi));
/*
@@ -1152,7 +1152,7 @@ static int ep_alloc(struct eventpoll **pep)
return -ENOMEM;
mutex_init(&ep->mtx);
- rwlock_init(&ep->lock);
+ spin_lock_init(&ep->lock);
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
INIT_LIST_HEAD(&ep->rdllist);
@@ -1240,99 +1240,9 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
#endif /* CONFIG_KCMP */
/*
- * Adds a new entry to the tail of the list in a lockless way, i.e.
- * multiple CPUs are allowed to call this function concurrently.
- *
- * Beware: it is necessary to prevent any other modifications of the
- * existing list until all changes are completed, in other words
- * concurrent list_add_tail_lockless() calls should be protected
- * with a read lock, where write lock acts as a barrier which
- * makes sure all list_add_tail_lockless() calls are fully
- * completed.
- *
- * Also an element can be locklessly added to the list only in one
- * direction i.e. either to the tail or to the head, otherwise
- * concurrent access will corrupt the list.
- *
- * Return: %false if element has been already added to the list, %true
- * otherwise.
- */
-static inline bool list_add_tail_lockless(struct list_head *new,
- struct list_head *head)
-{
- struct list_head *prev;
-
- /*
- * This is simple 'new->next = head' operation, but cmpxchg()
- * is used in order to detect that same element has been just
- * added to the list from another CPU: the winner observes
- * new->next == new.
- */
- if (!try_cmpxchg(&new->next, &new, head))
- return false;
-
- /*
- * Initially ->next of a new element must be updated with the head
- * (we are inserting to the tail) and only then pointers are atomically
- * exchanged. XCHG guarantees memory ordering, thus ->next should be
- * updated before pointers are actually swapped and pointers are
- * swapped before prev->next is updated.
- */
-
- prev = xchg(&head->prev, new);
-
- /*
- * It is safe to modify prev->next and new->prev, because a new element
- * is added only to the tail and new->next is updated before XCHG.
- */
-
- prev->next = new;
- new->prev = prev;
-
- return true;
-}
-
-/*
- * Chains a new epi entry to the tail of the ep->ovflist in a lockless way,
- * i.e. multiple CPUs are allowed to call this function concurrently.
- *
- * Return: %false if epi element has been already chained, %true otherwise.
- */
-static inline bool chain_epi_lockless(struct epitem *epi)
-{
- struct eventpoll *ep = epi->ep;
-
- /* Fast preliminary check */
- if (epi->next != EP_UNACTIVE_PTR)
- return false;
-
- /* Check that the same epi has not been just chained from another CPU */
- if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
- return false;
-
- /* Atomically exchange tail */
- epi->next = xchg(&ep->ovflist, epi);
-
- return true;
-}
-
-/*
* This is the callback that is passed to the wait queue wakeup
* mechanism. It is called by the stored file descriptors when they
* have events to report.
- *
- * This callback takes a read lock in order not to contend with concurrent
- * events from another file descriptor, thus all modifications to ->rdllist
- * or ->ovflist are lockless. Read lock is paired with the write lock from
- * ep_start/done_scan(), which stops all list modifications and guarantees
- * that lists state is seen correctly.
- *
- * Another thing worth to mention is that ep_poll_callback() can be called
- * concurrently for the same @epi from different CPUs if poll table was inited
- * with several wait queues entries. Plural wakeup from different CPUs of a
- * single wait queue is serialized by wq.lock, but the case when multiple wait
- * queues are used should be detected accordingly. This is detected using
- * cmpxchg() operation.
*/
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
@@ -1343,7 +1253,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
unsigned long flags;
int ewake = 0;
- read_lock_irqsave(&ep->lock, flags);
+ spin_lock_irqsave(&ep->lock, flags);
ep_set_busy_poll_napi_id(epi);
@@ -1372,12 +1282,15 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
* chained in ep->ovflist and requeued later on.
*/
if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
- if (chain_epi_lockless(epi))
+ if (epi->next == EP_UNACTIVE_PTR) {
+ epi->next = READ_ONCE(ep->ovflist);
+ WRITE_ONCE(ep->ovflist, epi);
ep_pm_stay_awake_rcu(epi);
+ }
} else if (!ep_is_linked(epi)) {
/* In the usual case, add event to ready list. */
- if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
- ep_pm_stay_awake_rcu(epi);
+ list_add_tail(&epi->rdllink, &ep->rdllist);
+ ep_pm_stay_awake_rcu(epi);
}
/*
@@ -1410,7 +1323,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
pwake++;
out_unlock:
- read_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
if (pwake)
@@ -1745,7 +1658,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
}
/* We have to drop the new item inside our item list to keep track of it */
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
/* record NAPI ID of new item if present */
ep_set_busy_poll_napi_id(epi);
@@ -1762,7 +1675,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
pwake++;
}
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
/* We have to call this outside the lock */
if (pwake)
@@ -1826,7 +1739,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
* list, push it inside.
*/
if (ep_item_poll(epi, &pt, 1)) {
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
if (!ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
@@ -1837,7 +1750,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
}
/* We have to call this outside the lock */
@@ -2089,7 +2002,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
init_wait(&wait);
wait.func = ep_autoremove_wake_function;
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
/*
* Barrierless variant, waitqueue_active() is called under
* the same lock on wakeup ep_poll_callback() side, so it
@@ -2108,7 +2021,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
if (!eavail)
__add_wait_queue_exclusive(&ep->wq, &wait);
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
if (!eavail)
timed_out = !ep_schedule_timeout(to) ||
@@ -2124,7 +2037,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
eavail = 1;
if (!list_empty_careful(&wait.entry)) {
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
/*
* If the thread timed out and is not on the wait queue,
* it means that the thread was woken up after its
@@ -2135,7 +2048,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
if (timed_out)
eavail = list_empty(&wait.entry);
__remove_wait_queue(&ep->wq, &wait);
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
}
}
}
diff --git a/fs/exec.c b/fs/exec.c
index 2a1e5e4042a1..4a89918b761f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -599,7 +599,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
unsigned long stack_top,
int executable_stack)
{
- unsigned long ret;
+ int ret;
unsigned long stack_shift;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = bprm->vma;
@@ -2048,7 +2048,7 @@ static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int writ
{
int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (!error)
+ if (!error && !write)
validate_coredump_safety();
return error;
}
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 0a056d97e640..cf0a0970c095 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -227,6 +227,8 @@ static bool ext4_has_stable_inodes(struct super_block *sb)
}
const struct fscrypt_operations ext4_cryptops = {
+ .inode_info_offs = (int)offsetof(struct ext4_inode_info, i_crypt_info) -
+ (int)offsetof(struct ext4_inode_info, vfs_inode),
.needs_bounce_pages = 1,
.has_32bit_inodes = 1,
.supports_subblock_data_units = 1,
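
The new inode_info_offs field above tells the fscrypt core where ext4 keeps its fscrypt_inode_info pointer relative to the embedded VFS inode, which is what allows fs/crypto to stop using a field in struct inode and switch to the accessor helpers seen earlier in this diff. A sketch of how such an accessor can be built from that offset (assumed implementation; the real helpers are not shown here):

	static inline struct fscrypt_inode_info **
	fscrypt_inode_info_addr(struct inode *inode)
	{
		/* inode is embedded in the fs-specific inode; the advertised
		 * offset points from it to the adjacent i_crypt_info field.
		 */
		return (void *)((char *)inode + inode->i_sb->s_cop->inode_info_offs);
	}

	static inline struct fscrypt_inode_info *
	fscrypt_get_inode_info_raw(const struct inode *inode)
	{
		/* "raw": no acquire barrier; callers know setup already ran */
		return *fscrypt_inode_info_addr((struct inode *)inode);
	}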
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 01a6e2de7fc3..6cb784a56b3b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1182,6 +1182,14 @@ struct ext4_inode_info {
__u32 i_csum_seed;
kprojid_t i_projid;
+
+#ifdef CONFIG_FS_ENCRYPTION
+ struct fscrypt_inode_info *i_crypt_info;
+#endif
+
+#ifdef CONFIG_FS_VERITY
+ struct fsverity_info *i_verity_info;
+#endif
};
/*
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index 383c6edea6dd..91185c40f755 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -393,6 +393,14 @@ static unsigned int ext4_getfsmap_find_sb(struct super_block *sb,
/* Reserved GDT blocks */
if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) {
len = le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
+
+ /*
+ * mkfs.ext4 can set s_reserved_gdt_blocks to 0 in some cases,
+ * so check for that.
+ */
+ if (!len)
+ return 0;
+
error = ext4_getfsmap_fill(meta_list, fsb, len,
EXT4_FMR_OWN_RESV_GDT);
if (error)
@@ -526,6 +534,7 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
ext4_group_t end_ag;
ext4_grpblk_t first_cluster;
ext4_grpblk_t last_cluster;
+ struct ext4_fsmap irec;
int error = 0;
bofs = le32_to_cpu(sbi->s_es->s_first_data_block);
@@ -609,10 +618,18 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
goto err;
}
- /* Report any gaps at the end of the bg */
+ /*
+ * The dummy record below will cause ext4_getfsmap_helper() to report
+ * any allocated blocks at the end of the range.
+ */
+ irec.fmr_device = 0;
+ irec.fmr_physical = end_fsb + 1;
+ irec.fmr_length = 0;
+ irec.fmr_owner = EXT4_FMR_OWN_FREE;
+ irec.fmr_flags = 0;
+
info->gfi_last = true;
- error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster + 1,
- 0, info);
+ error = ext4_getfsmap_helper(sb, info, &irec);
if (error)
goto err;
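
The fsmap code reports free space as the gaps between reverse-mapping records, so handing the helper a zero-length record placed just past the end of the queried range forces it to emit any trailing gap. A reduced model of that sentinel trick (the record layout and helper are made up for illustration):

#include <stdio.h>

struct rec { unsigned long long phys, len; };

/* Report the gap before the next record, then the record itself. */
static void emit(const struct rec *next, unsigned long long *next_free)
{
    if (next->phys > *next_free)
        printf("free: %llu +%llu\n", *next_free, next->phys - *next_free);
    if (next->len)
        printf("used: %llu +%llu\n", next->phys, next->len);
    if (next->phys + next->len > *next_free)
        *next_free = next->phys + next->len;
}

int main(void)
{
    unsigned long long next_free = 0, end = 100;
    struct rec a = { 10, 5 }, b = { 40, 2 };
    struct rec sentinel = { end + 1, 0 };  /* zero-length record past the end */

    emit(&a, &next_free);
    emit(&b, &next_free);
    emit(&sentinel, &next_free);  /* flushes the trailing gap 42..100 */
    return 0;
}
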
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index df4051613b29..ba4fd9aba1c1 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -252,10 +252,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
"nonexistent device\n", __func__, __LINE__);
return;
}
- if (atomic_read(&inode->i_count) > 1) {
+ if (icount_read(inode) > 1) {
ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d",
__func__, __LINE__, inode->i_ino,
- atomic_read(&inode->i_count));
+ icount_read(inode));
return;
}
if (inode->i_nlink) {
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 7de327fa7b1c..d45124318200 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -539,7 +539,7 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
int indirect_blks;
int blocks_to_boundary = 0;
int depth;
- int count = 0;
+ u64 count = 0;
ext4_fsblk_t first_block = 0;
trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
@@ -588,7 +588,7 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
count++;
/* Fill in size of a hole we found */
map->m_pblk = 0;
- map->m_len = min_t(unsigned int, map->m_len, count);
+ map->m_len = umin(map->m_len, count);
goto cleanup;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ed54c4d0f2f9..5b7a15db4953 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -146,7 +146,7 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
*/
int ext4_inode_is_fast_symlink(struct inode *inode)
{
- if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
+ if (!ext4_has_feature_ea_inode(inode->i_sb)) {
int ea_blocks = EXT4_I(inode)->i_file_acl ?
EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
@@ -3155,7 +3155,7 @@ retry:
folio_unlock(folio);
folio_put(folio);
/*
- * block_write_begin may have instantiated a few blocks
+ * ext4_block_write_begin may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
* i_size_read because we hold inode lock.
*/
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 5898d92ba19f..8b18802e83eb 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3995,7 +3995,7 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
list_splice_tail(&freed_data_list, &sbi->s_discard_list);
spin_unlock(&sbi->s_md_lock);
if (wake)
- queue_work(system_unbound_wq, &sbi->s_discard_work);
+ queue_work(system_dfl_wq, &sbi->s_discard_work);
} else {
list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
kmem_cache_free(ext4_free_data_cachep, entry);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index d83f91b62317..2cd36f59c9e3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2965,7 +2965,6 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir,
struct inode *inode)
{
struct buffer_head *dir_block = NULL;
- struct ext4_dir_entry_2 *de;
ext4_lblk_t block = 0;
int err;
@@ -2982,10 +2981,7 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir,
dir_block = ext4_append(handle, inode, &block);
if (IS_ERR(dir_block))
return PTR_ERR(dir_block);
- de = (struct ext4_dir_entry_2 *)dir_block->b_data;
err = ext4_init_dirblock(handle, inode, dir_block, dir->i_ino, NULL, 0);
- if (err)
- goto out;
out:
brelse(dir_block);
return err;
diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
index 7c7f792ad6ab..524d4658fa40 100644
--- a/fs/ext4/orphan.c
+++ b/fs/ext4/orphan.c
@@ -589,8 +589,9 @@ int ext4_init_orphan_info(struct super_block *sb)
}
oi->of_blocks = inode->i_size >> sb->s_blocksize_bits;
oi->of_csum_seed = EXT4_I(inode)->i_csum_seed;
- oi->of_binfo = kmalloc(oi->of_blocks*sizeof(struct ext4_orphan_block),
- GFP_KERNEL);
+ oi->of_binfo = kmalloc_array(oi->of_blocks,
+ sizeof(struct ext4_orphan_block),
+ GFP_KERNEL);
if (!oi->of_binfo) {
ret = -ENOMEM;
goto out_put;
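
kmalloc_array() exists because an open-coded n * size multiplication can wrap around and quietly allocate a much smaller buffer than intended; the helper refuses such requests instead. A userspace equivalent of the overflow guard:

#include <stdint.h>
#include <stdlib.h>

/* Roughly what kmalloc_array() guards against: n * size wrapping around. */
static void *alloc_array(size_t n, size_t size)
{
    if (size != 0 && n > SIZE_MAX / size)
        return NULL;            /* would overflow, refuse the request */
    return malloc(n * size);
}
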
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 3d8b0f6d2dea..39abfeec5f36 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -547,7 +547,7 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
* first page of the bio. Otherwise it can deadlock.
*/
if (io->io_bio)
- gfp_flags = GFP_NOWAIT | __GFP_NOWARN;
+ gfp_flags = GFP_NOWAIT;
retry_encrypt:
bounce_page = fscrypt_encrypt_pagecache_blocks(folio,
enc_bytes, 0, gfp_flags);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c7d39da7e733..7f2d4014d128 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -268,7 +268,7 @@ struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
{
struct buffer_head *bh = bdev_getblk(sb->s_bdev, block,
- sb->s_blocksize, GFP_NOWAIT | __GFP_NOWARN);
+ sb->s_blocksize, GFP_NOWAIT);
if (likely(bh)) {
if (trylock_buffer(bh))
@@ -1417,7 +1417,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
static int ext4_drop_inode(struct inode *inode)
{
- int drop = generic_drop_inode(inode);
+ int drop = inode_generic_drop(inode);
if (!drop)
drop = fscrypt_drop_inode(inode);
@@ -1470,6 +1470,12 @@ static void init_once(void *foo)
init_rwsem(&ei->i_data_sem);
inode_init_once(&ei->vfs_inode);
ext4_fc_init_inode(&ei->vfs_inode);
+#ifdef CONFIG_FS_ENCRYPTION
+ ei->i_crypt_info = NULL;
+#endif
+#ifdef CONFIG_FS_VERITY
+ ei->i_verity_info = NULL;
+#endif
}
static int __init init_inodecache(void)
@@ -1998,6 +2004,9 @@ int ext4_init_fs_context(struct fs_context *fc)
fc->fs_private = ctx;
fc->ops = &ext4_context_ops;
+ /* i_version is always enabled now */
+ fc->sb_flags |= SB_I_VERSION;
+
return 0;
}
@@ -2975,6 +2984,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
+ if (nodefs && sb->s_flags & SB_I_VERSION)
+ SEQ_OPTS_PUTS("i_version");
if (nodefs || sbi->s_stripe)
SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
if (nodefs || EXT4_MOUNT_DATA_FLAGS &
@@ -5314,9 +5325,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
- /* i_version is always enabled now */
- sb->s_flags |= SB_I_VERSION;
-
/* HSM events are allowed by default. */
sb->s_iflags |= SB_I_ALLOW_HSM;
@@ -5414,6 +5422,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
err = ext4_load_and_init_journal(sb, es, ctx);
if (err)
goto failed_mount3a;
+ if (bdev_read_only(sb->s_bdev))
+ needs_recovery = 0;
} else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) &&
ext4_has_feature_journal_needs_recovery(sb)) {
ext4_msg(sb, KERN_ERR, "required journal recovery "
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index d9203228ce97..b0acb0c50313 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -389,6 +389,8 @@ static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf,
}
const struct fsverity_operations ext4_verityops = {
+ .inode_info_offs = (int)offsetof(struct ext4_inode_info, i_verity_info) -
+ (int)offsetof(struct ext4_inode_info, vfs_inode),
.begin_enable_verity = ext4_begin_enable_verity,
.end_enable_verity = ext4_end_enable_verity,
.get_verity_descriptor = ext4_get_verity_descriptor,
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 46be7560548c..6e465bbc85ee 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -907,6 +907,12 @@ struct f2fs_inode_info {
unsigned int atomic_write_cnt;
loff_t original_i_size; /* original i_size before atomic write */
+#ifdef CONFIG_FS_ENCRYPTION
+ struct fscrypt_inode_info *i_crypt_info; /* filesystem encryption info */
+#endif
+#ifdef CONFIG_FS_VERITY
+ struct fsverity_info *i_verity_info; /* filesystem verity info */
+#endif
};
static inline void get_read_extent_info(struct extent_info *ext,
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index e16c4e2830c2..2619cbbd7d2d 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -480,6 +480,12 @@ static void init_once(void *foo)
struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo;
inode_init_once(&fi->vfs_inode);
+#ifdef CONFIG_FS_ENCRYPTION
+ fi->i_crypt_info = NULL;
+#endif
+#ifdef CONFIG_FS_VERITY
+ fi->i_verity_info = NULL;
+#endif
}
#ifdef CONFIG_QUOTA
@@ -1744,7 +1750,7 @@ static int f2fs_drop_inode(struct inode *inode)
if ((!inode_unhashed(inode) && inode->i_state & I_SYNC)) {
if (!inode->i_nlink && !is_bad_inode(inode)) {
/* to avoid evict_inode call simultaneously */
- atomic_inc(&inode->i_count);
+ __iget(inode);
spin_unlock(&inode->i_lock);
/* should remain fi->extent_tree for writepage */
@@ -1763,12 +1769,12 @@ static int f2fs_drop_inode(struct inode *inode)
sb_end_intwrite(inode->i_sb);
spin_lock(&inode->i_lock);
- atomic_dec(&inode->i_count);
+ iput(inode);
}
trace_f2fs_drop_inode(inode, 0);
return 0;
}
- ret = generic_drop_inode(inode);
+ ret = inode_generic_drop(inode);
if (!ret)
ret = fscrypt_drop_inode(inode);
trace_f2fs_drop_inode(inode, ret);
@@ -3570,6 +3576,8 @@ static struct block_device **f2fs_get_devices(struct super_block *sb,
}
static const struct fscrypt_operations f2fs_cryptops = {
+ .inode_info_offs = (int)offsetof(struct f2fs_inode_info, i_crypt_info) -
+ (int)offsetof(struct f2fs_inode_info, vfs_inode),
.needs_bounce_pages = 1,
.has_32bit_inodes = 1,
.supports_subblock_data_units = 1,
@@ -3581,7 +3589,7 @@ static const struct fscrypt_operations f2fs_cryptops = {
.has_stable_inodes = f2fs_has_stable_inodes,
.get_devices = f2fs_get_devices,
};
-#endif
+#endif /* CONFIG_FS_ENCRYPTION */
static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
u64 ino, u32 generation)
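
Replacing the raw atomic_inc()/atomic_dec() of i_count with __iget()/iput() keeps the reference counting on the documented paths: the increment happens while i_lock is already held, and the final reference is dropped through iput() so eviction cannot be skipped. A reduced userspace sketch of that get-under-the-lock, put-through-the-normal-path shape (all names are illustrative):

#include <pthread.h>
#include <stdlib.h>

struct obj {
    pthread_mutex_t lock;   /* plays the role of inode->i_lock */
    int count;              /* plays the role of inode->i_count */
};

/* Like __iget(): the caller already holds obj->lock. */
static void obj_get_locked(struct obj *o)
{
    o->count++;
}

/* Like iput(): takes the lock, drops the reference, tears down on zero. */
static void obj_put(struct obj *o)
{
    int last;

    pthread_mutex_lock(&o->lock);
    last = (--o->count == 0);
    pthread_mutex_unlock(&o->lock);
    if (last)
        free(o);            /* eviction/teardown would run here */
}
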
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index 2287f238ae09..f0ab9a3c7a82 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -287,6 +287,8 @@ static int f2fs_write_merkle_tree_block(struct inode *inode, const void *buf,
}
const struct fsverity_operations f2fs_verityops = {
+ .inode_info_offs = (int)offsetof(struct f2fs_inode_info, i_verity_info) -
+ (int)offsetof(struct f2fs_inode_info, vfs_inode),
.begin_enable_verity = f2fs_begin_enable_verity,
.end_enable_verity = f2fs_end_enable_verity,
.get_verity_descriptor = f2fs_get_verity_descriptor,
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 5598e4d57422..72f8433d9109 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -355,8 +355,7 @@ static bool rw_hint_valid(u64 hint)
}
}
-static long fcntl_get_rw_hint(struct file *file, unsigned int cmd,
- unsigned long arg)
+static long fcntl_get_rw_hint(struct file *file, unsigned long arg)
{
struct inode *inode = file_inode(file);
u64 __user *argp = (u64 __user *)arg;
@@ -367,8 +366,7 @@ static long fcntl_get_rw_hint(struct file *file, unsigned int cmd,
return 0;
}
-static long fcntl_set_rw_hint(struct file *file, unsigned int cmd,
- unsigned long arg)
+static long fcntl_set_rw_hint(struct file *file, unsigned long arg)
{
struct inode *inode = file_inode(file);
u64 __user *argp = (u64 __user *)arg;
@@ -547,10 +545,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
err = memfd_fcntl(filp, cmd, argi);
break;
case F_GET_RW_HINT:
- err = fcntl_get_rw_hint(filp, cmd, arg);
+ err = fcntl_get_rw_hint(filp, arg);
break;
case F_SET_RW_HINT:
- err = fcntl_set_rw_hint(filp, cmd, arg);
+ err = fcntl_set_rw_hint(filp, arg);
break;
default:
break;
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 7c236f64cdea..052f9c9368fb 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -11,6 +11,7 @@
#include <linux/personality.h>
#include <linux/uaccess.h>
#include <linux/compat.h>
+#include <linux/nsfs.h>
#include "internal.h"
#include "mount.h"
@@ -189,6 +190,11 @@ static int get_path_anchor(int fd, struct path *root)
return 0;
}
+ if (fd == FD_NSFS_ROOT) {
+ nsfs_get_root(root);
+ return 0;
+ }
+
return -EBADF;
}
@@ -208,6 +214,14 @@ static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
return 1;
/*
+ * Verify that the decoded dentry itself has a valid id mapping.
+ * In case the decoded dentry is the mountfd root itself, this
+ * verifies that the mountfd inode itself has a valid id mapping.
+ */
+ if (!privileged_wrt_inode_uidgid(user_ns, idmap, d_inode(dentry)))
+ return 0;
+
+ /*
* It's racy as we're not taking rename_lock but we're able to ignore
* permissions and we just need an approximation whether we were able
* to follow a path to the file.
@@ -402,7 +416,7 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh,
if (retval)
return retval;
- CLASS(get_unused_fd, fd)(O_CLOEXEC);
+ CLASS(get_unused_fd, fd)(open_flag);
if (fd < 0)
return fd;
diff --git a/fs/file.c b/fs/file.c
index 6d2275c3be9c..28743b742e3c 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -1330,7 +1330,10 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags)
err = expand_files(files, fd);
if (unlikely(err < 0))
goto out_unlock;
- return do_dup2(files, file, fd, flags);
+ err = do_dup2(files, file, fd, flags);
+ if (err < 0)
+ return err;
+ return 0;
out_unlock:
spin_unlock(&files->file_lock);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index cc57367fb641..2b35e80037fe 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -368,7 +368,8 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
}
struct inode_switch_wbs_context {
- struct rcu_work work;
+ /* List of queued switching contexts for the wb */
+ struct llist_node list;
/*
* Multiple inodes can be switched at once. The switching procedure
@@ -378,7 +379,6 @@ struct inode_switch_wbs_context {
* array embedded into struct inode_switch_wbs_context. Otherwise
* an inode could be left in a non-consistent state.
*/
- struct bdi_writeback *new_wb;
struct inode *inodes[];
};
@@ -445,22 +445,23 @@ static bool inode_do_switch_wbs(struct inode *inode,
* Transfer to @new_wb's IO list if necessary. If the @inode is dirty,
* the specific list @inode was on is ignored and the @inode is put on
* ->b_dirty which is always correct including from ->b_dirty_time.
- * The transfer preserves @inode->dirtied_when ordering. If the @inode
- * was clean, it means it was on the b_attached list, so move it onto
- * the b_attached list of @new_wb.
+ * If the @inode was clean, it means it was on the b_attached list, so
+ * move it onto the b_attached list of @new_wb.
*/
if (!list_empty(&inode->i_io_list)) {
inode->i_wb = new_wb;
if (inode->i_state & I_DIRTY_ALL) {
- struct inode *pos;
-
- list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
- if (time_after_eq(inode->dirtied_when,
- pos->dirtied_when))
- break;
+ /*
+ * We need to keep b_dirty list sorted by
+ * dirtied_time_when. However properly sorting the
+ * inode in the list gets too expensive when switching
+ * many inodes. So just attach inode at the end of the
+ * dirty list and clobber the dirtied_time_when.
+ */
+ inode->dirtied_time_when = jiffies;
inode_io_list_move_locked(inode, new_wb,
- pos->i_io_list.prev);
+ &new_wb->b_dirty);
} else {
inode_cgwb_move_to_attached(inode, new_wb);
}
@@ -486,13 +487,11 @@ skip_switch:
return switched;
}
-static void inode_switch_wbs_work_fn(struct work_struct *work)
+static void process_inode_switch_wbs(struct bdi_writeback *new_wb,
+ struct inode_switch_wbs_context *isw)
{
- struct inode_switch_wbs_context *isw =
- container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
- struct bdi_writeback *new_wb = isw->new_wb;
unsigned long nr_switched = 0;
struct inode **inodep;
@@ -502,6 +501,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
*/
down_read(&bdi->wb_switch_rwsem);
+ inodep = isw->inodes;
/*
* By the time control reaches here, RCU grace period has passed
* since I_WB_SWITCH assertion and all wb stat update transactions
@@ -512,6 +512,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
* gives us exclusion against all wb related operations on @inode
* including IO list manipulations and stat updates.
*/
+relock:
if (old_wb < new_wb) {
spin_lock(&old_wb->list_lock);
spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
@@ -520,10 +521,17 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
}
- for (inodep = isw->inodes; *inodep; inodep++) {
+ while (*inodep) {
WARN_ON_ONCE((*inodep)->i_wb != old_wb);
if (inode_do_switch_wbs(*inodep, old_wb, new_wb))
nr_switched++;
+ inodep++;
+ if (*inodep && need_resched()) {
+ spin_unlock(&new_wb->list_lock);
+ spin_unlock(&old_wb->list_lock);
+ cond_resched();
+ goto relock;
+ }
}
spin_unlock(&new_wb->list_lock);
@@ -543,6 +551,38 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
atomic_dec(&isw_nr_in_flight);
}
+void inode_switch_wbs_work_fn(struct work_struct *work)
+{
+ struct bdi_writeback *new_wb = container_of(work, struct bdi_writeback,
+ switch_work);
+ struct inode_switch_wbs_context *isw, *next_isw;
+ struct llist_node *list;
+
+ /*
+ * Grab our reference to wb so that it cannot get freed under us
+ * after we process all the isw items.
+ */
+ wb_get(new_wb);
+ while (1) {
+ list = llist_del_all(&new_wb->switch_wbs_ctxs);
+ /* Nothing to do? */
+ if (!list)
+ break;
+ /*
+ * In addition to synchronizing among switchers, I_WB_SWITCH
+ * tells the RCU protected stat update paths to grab the i_page
+ * lock so that stat transfer can synchronize against them.
+ * Let's continue after I_WB_SWITCH is guaranteed to be
+ * visible.
+ */
+ synchronize_rcu();
+
+ llist_for_each_entry_safe(isw, next_isw, list, list)
+ process_inode_switch_wbs(new_wb, isw);
+ }
+ wb_put(new_wb);
+}
+
static bool inode_prepare_wbs_switch(struct inode *inode,
struct bdi_writeback *new_wb)
{
@@ -572,6 +612,13 @@ static bool inode_prepare_wbs_switch(struct inode *inode,
return true;
}
+static void wb_queue_isw(struct bdi_writeback *wb,
+ struct inode_switch_wbs_context *isw)
+{
+ if (llist_add(&isw->list, &wb->switch_wbs_ctxs))
+ queue_work(isw_wq, &wb->switch_work);
+}
+
/**
* inode_switch_wbs - change the wb association of an inode
* @inode: target inode
@@ -585,6 +632,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
struct backing_dev_info *bdi = inode_to_bdi(inode);
struct cgroup_subsys_state *memcg_css;
struct inode_switch_wbs_context *isw;
+ struct bdi_writeback *new_wb = NULL;
/* noop if seems to be already in progress */
if (inode->i_state & I_WB_SWITCH)
@@ -609,40 +657,35 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
if (!memcg_css)
goto out_free;
- isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+ new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
css_put(memcg_css);
- if (!isw->new_wb)
+ if (!new_wb)
goto out_free;
- if (!inode_prepare_wbs_switch(inode, isw->new_wb))
+ if (!inode_prepare_wbs_switch(inode, new_wb))
goto out_free;
isw->inodes[0] = inode;
- /*
- * In addition to synchronizing among switchers, I_WB_SWITCH tells
- * the RCU protected stat update paths to grab the i_page
- * lock so that stat transfer can synchronize against them.
- * Let's continue after I_WB_SWITCH is guaranteed to be visible.
- */
- INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
- queue_rcu_work(isw_wq, &isw->work);
+ trace_inode_switch_wbs_queue(inode->i_wb, new_wb, 1);
+ wb_queue_isw(new_wb, isw);
return;
out_free:
atomic_dec(&isw_nr_in_flight);
- if (isw->new_wb)
- wb_put(isw->new_wb);
+ if (new_wb)
+ wb_put(new_wb);
kfree(isw);
}
-static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw,
+static bool isw_prepare_wbs_switch(struct bdi_writeback *new_wb,
+ struct inode_switch_wbs_context *isw,
struct list_head *list, int *nr)
{
struct inode *inode;
list_for_each_entry(inode, list, i_io_list) {
- if (!inode_prepare_wbs_switch(inode, isw->new_wb))
+ if (!inode_prepare_wbs_switch(inode, new_wb))
continue;
isw->inodes[*nr] = inode;
@@ -666,6 +709,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
{
struct cgroup_subsys_state *memcg_css;
struct inode_switch_wbs_context *isw;
+ struct bdi_writeback *new_wb;
int nr;
bool restart = false;
@@ -678,12 +722,12 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
for (memcg_css = wb->memcg_css->parent; memcg_css;
memcg_css = memcg_css->parent) {
- isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
- if (isw->new_wb)
+ new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
+ if (new_wb)
break;
}
- if (unlikely(!isw->new_wb))
- isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */
+ if (unlikely(!new_wb))
+ new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */
nr = 0;
spin_lock(&wb->list_lock);
@@ -695,27 +739,22 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
* bandwidth restrictions, as writeback of inode metadata is not
* accounted for.
*/
- restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr);
+ restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_attached, &nr);
if (!restart)
- restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr);
+ restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_dirty_time,
+ &nr);
spin_unlock(&wb->list_lock);
/* no attached inodes? bail out */
if (nr == 0) {
atomic_dec(&isw_nr_in_flight);
- wb_put(isw->new_wb);
+ wb_put(new_wb);
kfree(isw);
return restart;
}
- /*
- * In addition to synchronizing among switchers, I_WB_SWITCH tells
- * the RCU protected stat update paths to grab the i_page
- * lock so that stat transfer can synchronize against them.
- * Let's continue after I_WB_SWITCH is guaranteed to be visible.
- */
- INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
- queue_rcu_work(isw_wq, &isw->work);
+ trace_inode_switch_wbs_queue(wb, new_wb, nr);
+ wb_queue_isw(new_wb, isw);
return restart;
}
@@ -1123,7 +1162,7 @@ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
dirty = dirty * 10 / 8;
/* issue the writeback work */
- work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
+ work = kzalloc(sizeof(*work), GFP_NOWAIT);
if (work) {
work->nr_pages = dirty;
work->sync_mode = WB_SYNC_NONE;
@@ -1180,7 +1219,7 @@ void cgroup_writeback_umount(struct super_block *sb)
static int __init cgroup_writeback_init(void)
{
- isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
+ isw_wq = alloc_workqueue("inode_switch_wbs", WQ_PERCPU, 0);
if (!isw_wq)
return -ENOMEM;
return 0;
@@ -1767,7 +1806,7 @@ static int writeback_single_inode(struct inode *inode,
int ret = 0;
spin_lock(&inode->i_lock);
- if (!atomic_read(&inode->i_count))
+ if (!icount_read(inode))
WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
else
WARN_ON(inode->i_state & I_WILL_FREE);
@@ -2442,7 +2481,7 @@ static int dirtytime_interval_handler(const struct ctl_table *table, int write,
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write)
- mod_delayed_work(system_wq, &dirtytime_work, 0);
+ mod_delayed_work(system_percpu_wq, &dirtytime_work, 0);
return ret;
}
@@ -2608,10 +2647,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
wakeup_bdi = inode_io_list_move_locked(inode, wb,
dirty_list);
- spin_unlock(&wb->list_lock);
- spin_unlock(&inode->i_lock);
- trace_writeback_dirty_inode_enqueue(inode);
-
/*
* If this is the first dirty inode for this bdi,
* we have to wake-up the corresponding bdi thread
@@ -2621,6 +2656,11 @@ void __mark_inode_dirty(struct inode *inode, int flags)
if (wakeup_bdi &&
(wb->bdi->capabilities & BDI_CAP_WRITEBACK))
wb_wakeup_delayed(wb);
+
+ spin_unlock(&wb->list_lock);
+ spin_unlock(&inode->i_lock);
+ trace_writeback_dirty_inode_enqueue(inode);
+
return;
}
}
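
The writeback rework queues each switching context on a lock-free llist hanging off the destination wb and schedules that wb's work item only when the push finds the list empty; the worker then detaches the whole list in one go. A userspace sketch of the push/drain pattern with C11 atomics, where llist_push mirrors llist_add()'s was-empty return value and llist_take_all mirrors llist_del_all():

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct node { struct node *next; };

struct llist_head { _Atomic(struct node *) first; };

/* Push a node; returns true if the list was empty beforehand, i.e. the
 * caller is responsible for scheduling the consumer (as llist_add() does). */
static bool llist_push(struct llist_head *l, struct node *n)
{
    struct node *first = atomic_load(&l->first);

    do {
        n->next = first;
    } while (!atomic_compare_exchange_weak(&l->first, &first, n));
    return first == NULL;
}

/* Detach everything at once for the consumer (like llist_del_all()). */
static struct node *llist_take_all(struct llist_head *l)
{
    return atomic_exchange(&l->first, (struct node *)NULL);
}
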
diff --git a/fs/fsopen.c b/fs/fsopen.c
index 1aaf4cb2afb2..f645c99204eb 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -18,50 +18,56 @@
#include "internal.h"
#include "mount.h"
+static inline const char *fetch_message_locked(struct fc_log *log, size_t len,
+ bool *need_free)
+{
+ const char *p;
+ int index;
+
+ if (unlikely(log->head == log->tail))
+ return ERR_PTR(-ENODATA);
+
+ index = log->tail & (ARRAY_SIZE(log->buffer) - 1);
+ p = log->buffer[index];
+ if (unlikely(strlen(p) > len))
+ return ERR_PTR(-EMSGSIZE);
+
+ log->buffer[index] = NULL;
+ *need_free = log->need_free & (1 << index);
+ log->need_free &= ~(1 << index);
+ log->tail++;
+
+ return p;
+}
+
/*
* Allow the user to read back any error, warning or informational messages.
+ * Only one message is returned for each read(2) call.
*/
static ssize_t fscontext_read(struct file *file,
char __user *_buf, size_t len, loff_t *pos)
{
struct fs_context *fc = file->private_data;
- struct fc_log *log = fc->log.log;
- unsigned int logsize = ARRAY_SIZE(log->buffer);
- ssize_t ret;
- char *p;
+ ssize_t err;
+ const char *p __free(kfree) = NULL, *message;
bool need_free;
- int index, n;
+ int n;
- ret = mutex_lock_interruptible(&fc->uapi_mutex);
- if (ret < 0)
- return ret;
-
- if (log->head == log->tail) {
- mutex_unlock(&fc->uapi_mutex);
- return -ENODATA;
- }
-
- index = log->tail & (logsize - 1);
- p = log->buffer[index];
- need_free = log->need_free & (1 << index);
- log->buffer[index] = NULL;
- log->need_free &= ~(1 << index);
- log->tail++;
+ err = mutex_lock_interruptible(&fc->uapi_mutex);
+ if (err < 0)
+ return err;
+ message = fetch_message_locked(fc->log.log, len, &need_free);
mutex_unlock(&fc->uapi_mutex);
+ if (IS_ERR(message))
+ return PTR_ERR(message);
- ret = -EMSGSIZE;
- n = strlen(p);
- if (n > len)
- goto err_free;
- ret = -EFAULT;
- if (copy_to_user(_buf, p, n) != 0)
- goto err_free;
- ret = n;
-
-err_free:
if (need_free)
- kfree(p);
- return ret;
+ p = message;
+
+ n = strlen(message);
+ if (copy_to_user(_buf, message, n))
+ return -EFAULT;
+ return n;
}
static int fscontext_release(struct inode *inode, struct file *file)
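
The rewritten fscontext_read() leans on the kernel's __free(kfree) annotation, which is built on the compiler's cleanup attribute: once assigned, the pointer is freed automatically on every return path. A userspace analogue of the same idea (the auto_free macro name is invented for this example; the kernel's macro lives in linux/cleanup.h):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void free_charp(char **p)
{
    free(*p);
}
#define auto_free __attribute__((cleanup(free_charp)))

/* msg may or may not be heap-allocated; free it on all exits if need_free. */
static int print_message(char *msg, int need_free)
{
    char *to_free auto_free = NULL;

    if (need_free)
        to_free = msg;   /* freed when to_free goes out of scope */

    if (strlen(msg) > 32)
        return -1;       /* early return still frees when needed */
    printf("%s\n", msg);
    return 0;
}

int main(void)
{
    char literal[] = "world";

    print_message(strdup("hello"), 1);
    print_message(literal, 0);   /* not heap-allocated, not freed */
    return 0;
}
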
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index e80cd8f2c049..66a1ba8c56b5 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -119,7 +119,7 @@ void fuse_check_timeout(struct work_struct *work)
goto abort_conn;
out:
- queue_delayed_work(system_wq, &fc->timeout.work,
+ queue_delayed_work(system_percpu_wq, &fc->timeout.work,
fuse_timeout_timer_freq);
return;
@@ -1893,7 +1893,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode,
index = outarg->offset >> PAGE_SHIFT;
- while (num) {
+ while (num && ap->num_folios < num_pages) {
struct folio *folio;
unsigned int folio_offset;
unsigned int nr_bytes;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 2d817d7cab26..5c569c3cb53f 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1199,7 +1199,7 @@ static void fuse_fillattr(struct mnt_idmap *idmap, struct inode *inode,
if (attr->blksize != 0)
blkbits = ilog2(attr->blksize);
else
- blkbits = inode->i_sb->s_blocksize_bits;
+ blkbits = fc->blkbits;
stat->blksize = 1 << blkbits;
}
@@ -1377,6 +1377,7 @@ retry:
generic_fillattr(idmap, request_mask, inode, stat);
stat->mode = fi->orig_i_mode;
stat->ino = fi->orig_ino;
+ stat->blksize = 1 << fi->cached_i_blkbits;
if (test_bit(FUSE_I_BTIME, &fi->state)) {
stat->btime = fi->i_btime;
stat->result_mask |= STATX_BTIME;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 5525a4520b0f..4adcf09d4b01 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2960,7 +2960,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
.nodeid_out = ff_out->nodeid,
.fh_out = ff_out->fh,
.off_out = pos_out,
- .len = len,
+ .len = min_t(size_t, len, UINT_MAX & PAGE_MASK),
.flags = flags
};
struct fuse_write_out outarg;
@@ -3026,6 +3026,9 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
fc->no_copy_file_range = 1;
err = -EOPNOTSUPP;
}
+ if (!err && outarg.size > len)
+ err = -EIO;
+
if (err)
goto out;
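
Two guards are added around the copy_file_range request: the length is capped to a page-aligned value below UINT_MAX, presumably so the 32-bit size field in the reply can still represent the result, and a reply claiming to have copied more than was asked for is now rejected with -EIO rather than trusted. A small sketch of the clamp, assuming 4 KiB pages for the illustration:

#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE  4096UL
#define PAGE_MASK  (~(PAGE_SIZE - 1))

/* Cap a copy request to a page-aligned length that a 32-bit reply field
 * can acknowledge in full. */
static uint32_t clamp_copy_len(size_t len)
{
    size_t max = UINT32_MAX & PAGE_MASK;

    return (uint32_t)(len < max ? len : max);
}
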
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index ec248d13c8bf..cc428d04be3e 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -210,6 +210,12 @@ struct fuse_inode {
/** Reference to backing file in passthrough mode */
struct fuse_backing *fb;
#endif
+
+ /*
+ * The underlying inode->i_blkbits value will not be modified,
+ * so preserve the blocksize specified by the server.
+ */
+ u8 cached_i_blkbits;
};
/** FUSE inode state bits */
@@ -969,6 +975,14 @@ struct fuse_conn {
/* Request timeout (in jiffies). 0 = no timeout */
unsigned int req_timeout;
} timeout;
+
+ /*
+ * This is a workaround until fuse uses iomap for reads.
+ * For fuseblk servers, this represents the blocksize passed in at
+ * mount time and for regular fuse servers, this is equivalent to
+ * inode->i_blkbits.
+ */
+ u8 blkbits;
};
/*
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index ecb869e895ab..7485a41af892 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -289,10 +289,10 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
}
}
- if (attr->blksize != 0)
- inode->i_blkbits = ilog2(attr->blksize);
+ if (attr->blksize)
+ fi->cached_i_blkbits = ilog2(attr->blksize);
else
- inode->i_blkbits = inode->i_sb->s_blocksize_bits;
+ fi->cached_i_blkbits = fc->blkbits;
/*
* Don't set the sticky bit in i_mode, unless we want the VFS
@@ -1209,7 +1209,7 @@ static const struct super_operations fuse_super_operations = {
.free_inode = fuse_free_inode,
.evict_inode = fuse_evict_inode,
.write_inode = fuse_write_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.umount_begin = fuse_umount_begin,
.statfs = fuse_statfs,
.sync_fs = fuse_sync_fs,
@@ -1273,7 +1273,7 @@ static void set_request_timeout(struct fuse_conn *fc, unsigned int timeout)
{
fc->timeout.req_timeout = secs_to_jiffies(timeout);
INIT_DELAYED_WORK(&fc->timeout.work, fuse_check_timeout);
- queue_delayed_work(system_wq, &fc->timeout.work,
+ queue_delayed_work(system_percpu_wq, &fc->timeout.work,
fuse_timeout_timer_freq);
}
@@ -1810,10 +1810,21 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
err = -EINVAL;
if (!sb_set_blocksize(sb, ctx->blksize))
goto err;
+ /*
+ * This is a workaround until fuse hooks into iomap for reads.
+ * Use PAGE_SIZE for the blocksize: otherwise, if the writeback cache
+ * is enabled, buffered writes go through iomap and a read may
+ * overwrite partially written data when blocksize < PAGE_SIZE.
+ */
+ fc->blkbits = sb->s_blocksize_bits;
+ if (ctx->blksize != PAGE_SIZE &&
+ !sb_set_blocksize(sb, PAGE_SIZE))
+ goto err;
#endif
} else {
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
+ fc->blkbits = sb->s_blocksize_bits;
}
sb->s_subtype = ctx->subtype;
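
Caching only the log2 of the server-provided block size means a single byte (cached_i_blkbits) is enough to reconstruct stat->blksize as 1 << bits, while inode->i_blkbits itself is left untouched. A small illustration of that round trip, which is lossless only when blksize is a power of two:

#include <stdint.h>
#include <stdio.h>

/* ilog2() for a nonzero 32-bit value, like the kernel helper. */
static uint8_t blksize_to_bits(uint32_t blksize)
{
    return (uint8_t)(31 - __builtin_clz(blksize));
}

int main(void)
{
    uint32_t blksize = 4096;
    uint8_t bits = blksize_to_bits(blksize);   /* 12, fits easily in a u8 */

    printf("%u -> %u -> %u\n", blksize, bits, 1u << bits);
    return 0;
}
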
diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
index 607ef735ad4a..eb97ac009e75 100644
--- a/fs/fuse/passthrough.c
+++ b/fs/fuse/passthrough.c
@@ -237,6 +237,11 @@ int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map)
if (!file)
goto out;
+ /* read/write/splice/mmap passthrough only relevant for regular files */
+ res = d_is_dir(file->f_path.dentry) ? -EISDIR : -EINVAL;
+ if (!d_is_reg(file->f_path.dentry))
+ goto out_fput;
+
backing_sb = file_inode(file)->i_sb;
res = -ELOOP;
if (backing_sb->s_stack_depth >= fc->max_stack_depth)
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index c826e7ca49f5..76c8fd0bfc75 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -1016,7 +1016,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
if (kaddr)
*kaddr = fs->window_kaddr + offset;
if (pfn)
- *pfn = fs->window_phys_addr + offset;
+ *pfn = PHYS_PFN(fs->window_phys_addr + offset);
return nr_pages > max_nr_pages ? max_nr_pages : nr_pages;
}
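
The DAX direct_access callback is expected to hand back a page frame number, not a raw physical address; PHYS_PFN() is simply the address shifted down by PAGE_SHIFT. A tiny illustration, assuming 4 KiB pages:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT  12   /* 4 KiB pages assumed for the example */
#define PHYS_PFN(x) ((x) >> PAGE_SHIFT)

int main(void)
{
    uint64_t phys = 0x12345000ULL + 0x2000;

    printf("pfn = 0x%llx\n", (unsigned long long)PHYS_PFN(phys));  /* 0x12347 */
    return 0;
}
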
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 72d95185a39f..bc67fa058c84 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1442,6 +1442,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ int ret;
if (!(fl->c.flc_flags & FL_POSIX))
return -ENOLCK;
@@ -1450,14 +1451,20 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
locks_lock_file_wait(file, fl);
return -EIO;
}
- if (cmd == F_CANCELLK)
- return dlm_posix_cancel(ls->ls_dlm, ip->i_no_addr, file, fl);
- else if (IS_GETLK(cmd))
- return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl);
- else if (lock_is_unlock(fl))
- return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl);
- else
- return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl);
+ down_read(&ls->ls_sem);
+ ret = -ENODEV;
+ if (likely(ls->ls_dlm != NULL)) {
+ if (cmd == F_CANCELLK)
+ ret = dlm_posix_cancel(ls->ls_dlm, ip->i_no_addr, file, fl);
+ else if (IS_GETLK(cmd))
+ ret = dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl);
+ else if (lock_is_unlock(fl))
+ ret = dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl);
+ else
+ ret = dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl);
+ }
+ up_read(&ls->ls_sem);
+ return ret;
}
static void __flock_holder_uninit(struct file *file, struct gfs2_holder *fl_gh)
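
The gfs2_lock() change above is one instance of a pattern applied throughout this series: ls->ls_dlm is only dereferenced with ls->ls_sem held for reading, and callers fall back to -ENODEV once the unmount path has taken the semaphore for writing and cleared the pointer. A userspace sketch of that reader/writer-guarded pointer pattern using pthreads (types and helpers are illustrative):

#include <errno.h>
#include <pthread.h>
#include <stddef.h>

struct lockspace;                 /* opaque, stands in for ls->ls_dlm */

struct ls {
    pthread_rwlock_t sem;         /* plays the role of ls->ls_sem */
    struct lockspace *dlm;        /* NULL once the lockspace is released */
};

static int do_request(struct lockspace *dlm)
{
    (void)dlm;                    /* a real backend call would go here */
    return 0;
}

/* Readers take the semaphore shared, test for NULL, and return -ENODEV
 * instead of touching a released lockspace. */
static int guarded_request(struct ls *ls)
{
    int ret = -ENODEV;

    pthread_rwlock_rdlock(&ls->sem);
    if (ls->dlm != NULL)
        ret = do_request(ls->dlm);
    pthread_rwlock_unlock(&ls->sem);
    return ret;
}

/* The unmount path takes the semaphore exclusively before clearing it. */
static void guarded_release(struct ls *ls, void (*release)(struct lockspace *))
{
    pthread_rwlock_wrlock(&ls->sem);
    if (ls->dlm) {
        release(ls->dlm);
        ls->dlm = NULL;
    }
    pthread_rwlock_unlock(&ls->sem);
}
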
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index b6fd1cb17de7..b677c0e6b9ab 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -481,11 +481,9 @@ done:
/**
* do_promote - promote as many requests as possible on the current queue
* @gl: The glock
- *
- * Returns true on success (i.e., progress was made or there are no waiters).
*/
-static bool do_promote(struct gfs2_glock *gl)
+static void do_promote(struct gfs2_glock *gl)
{
struct gfs2_holder *gh, *current_gh;
@@ -496,13 +494,10 @@ static bool do_promote(struct gfs2_glock *gl)
if (!may_grant(gl, current_gh, gh)) {
/*
* If we get here, it means we may not grant this
- * holder for some reason. If this holder is at the
- * head of the list, it means we have a blocked holder
- * at the head, so return false.
+ * holder for some reason.
*/
- if (list_is_first(&gh->gh_list, &gl->gl_holders))
- return false;
- do_error(gl, 0);
+ if (current_gh)
+ do_error(gl, 0); /* Fail queued try locks */
break;
}
set_bit(HIF_HOLDER, &gh->gh_iflags);
@@ -511,7 +506,6 @@ static bool do_promote(struct gfs2_glock *gl)
if (!current_gh)
current_gh = gh;
}
- return true;
}
/**
@@ -646,8 +640,10 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
}
/* Fast path - we got what we asked for */
- if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags))
+ if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) {
+ clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
gfs2_demote_wake(gl);
+ }
if (gl->gl_state != LM_ST_UNLOCKED) {
if (glops->go_xmote_bh) {
int rv;
@@ -693,54 +689,33 @@ __acquires(&gl->gl_lockref.lock)
const struct gfs2_glock_operations *glops = gl->gl_ops;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
- unsigned int lck_flags = (unsigned int)(gh ? gh->gh_flags : 0);
int ret;
if (target != LM_ST_UNLOCKED && glock_blocked_by_withdraw(gl) &&
gh && !(gh->gh_flags & LM_FLAG_NOEXP))
goto skip_inval;
- lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP);
GLOCK_BUG_ON(gl, gl->gl_state == target);
GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target);
- if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
- glops->go_inval) {
- /*
- * If another process is already doing the invalidate, let that
- * finish first. The glock state machine will get back to this
- * holder again later.
- */
- if (test_and_set_bit(GLF_INVALIDATE_IN_PROGRESS,
- &gl->gl_flags))
- return;
- do_error(gl, 0); /* Fail queued try locks */
- }
- gl->gl_req = target;
- set_bit(GLF_BLOCKING, &gl->gl_flags);
- if ((gl->gl_req == LM_ST_UNLOCKED) ||
- (gl->gl_state == LM_ST_EXCLUSIVE) ||
- (lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB)))
- clear_bit(GLF_BLOCKING, &gl->gl_flags);
- if (!glops->go_inval && !glops->go_sync)
+ if (!glops->go_inval || !glops->go_sync)
goto skip_inval;
spin_unlock(&gl->gl_lockref.lock);
- if (glops->go_sync) {
- ret = glops->go_sync(gl);
- /* If we had a problem syncing (due to io errors or whatever,
- * we should not invalidate the metadata or tell dlm to
- * release the glock to other nodes.
- */
- if (ret) {
- if (cmpxchg(&sdp->sd_log_error, 0, ret)) {
- fs_err(sdp, "Error %d syncing glock \n", ret);
- gfs2_dump_glock(NULL, gl, true);
- }
- spin_lock(&gl->gl_lockref.lock);
- goto skip_inval;
+ ret = glops->go_sync(gl);
+ /* If we had a problem syncing (due to io errors or whatever,
+ * we should not invalidate the metadata or tell dlm to
+ * release the glock to other nodes.
+ */
+ if (ret) {
+ if (cmpxchg(&sdp->sd_log_error, 0, ret)) {
+ fs_err(sdp, "Error %d syncing glock\n", ret);
+ gfs2_dump_glock(NULL, gl, true);
}
+ spin_lock(&gl->gl_lockref.lock);
+ goto skip_inval;
}
- if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) {
+
+ if (target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) {
/*
* The call to go_sync should have cleared out the ail list.
* If there are still items, we have a problem. We ought to
@@ -755,12 +730,10 @@ __acquires(&gl->gl_lockref.lock)
gfs2_dump_glock(NULL, gl, true);
}
glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA);
- clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
}
spin_lock(&gl->gl_lockref.lock);
skip_inval:
- gl->gl_lockref.count++;
/*
* Check for an error encountered since we called go_sync and go_inval.
* If so, we can't withdraw from the glock code because the withdraw
@@ -803,38 +776,41 @@ skip_inval:
if (!test_bit(GLF_CANCELING, &gl->gl_flags))
clear_bit(GLF_LOCK, &gl->gl_flags);
clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
+ gl->gl_lockref.count++;
gfs2_glock_queue_work(gl, GL_GLOCK_DFT_HOLD);
return;
- } else {
- clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
}
}
if (ls->ls_ops->lm_lock) {
set_bit(GLF_PENDING_REPLY, &gl->gl_flags);
spin_unlock(&gl->gl_lockref.lock);
- ret = ls->ls_ops->lm_lock(gl, target, lck_flags);
+ ret = ls->ls_ops->lm_lock(gl, target, gh ? gh->gh_flags : 0);
spin_lock(&gl->gl_lockref.lock);
- if (ret == -EINVAL && gl->gl_target == LM_ST_UNLOCKED &&
- target == LM_ST_UNLOCKED &&
- test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) {
+ if (!ret) {
+ /* The operation will be completed asynchronously. */
+ gl->gl_lockref.count++;
+ return;
+ }
+ clear_bit(GLF_PENDING_REPLY, &gl->gl_flags);
+
+ if (ret == -ENODEV && gl->gl_target == LM_ST_UNLOCKED &&
+ target == LM_ST_UNLOCKED) {
/*
* The lockspace has been released and the lock has
* been unlocked implicitly.
*/
- } else if (ret) {
- fs_err(sdp, "lm_lock ret %d\n", ret);
- target = gl->gl_state | LM_OUT_ERROR;
} else {
- /* The operation will be completed asynchronously. */
+ fs_err(sdp, "lm_lock ret %d\n", ret);
+ GLOCK_BUG_ON(gl, !gfs2_withdrawing_or_withdrawn(sdp));
return;
}
- clear_bit(GLF_PENDING_REPLY, &gl->gl_flags);
}
/* Complete the operation now. */
finish_xmote(gl, target);
+ gl->gl_lockref.count++;
gfs2_glock_queue_work(gl, 0);
}
@@ -855,11 +831,20 @@ __acquires(&gl->gl_lockref.lock)
return;
set_bit(GLF_LOCK, &gl->gl_flags);
- /* While a demote is in progress, the GLF_LOCK flag must be set. */
+ /*
+ * The GLF_DEMOTE_IN_PROGRESS flag is only set intermittently during
+ * locking operations. We have just started a locking operation by
+ * setting the GLF_LOCK flag, so the GLF_DEMOTE_IN_PROGRESS flag must
+ * be cleared.
+ */
GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags));
- if (test_bit(GLF_DEMOTE, &gl->gl_flags) &&
- gl->gl_demote_state != gl->gl_state) {
+ if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
+ if (gl->gl_demote_state == gl->gl_state) {
+ gfs2_demote_wake(gl);
+ goto promote;
+ }
+
if (find_first_holder(gl))
goto out_unlock;
if (nonblock)
@@ -869,31 +854,31 @@ __acquires(&gl->gl_lockref.lock)
gl->gl_target = gl->gl_demote_state;
do_xmote(gl, NULL, gl->gl_target);
return;
- } else {
- if (test_bit(GLF_DEMOTE, &gl->gl_flags))
- gfs2_demote_wake(gl);
- if (do_promote(gl))
- goto out_unlock;
- gh = find_first_waiter(gl);
- if (!gh)
- goto out_unlock;
- gl->gl_target = gh->gh_state;
- if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
- do_error(gl, 0); /* Fail queued try locks */
- do_xmote(gl, gh, gl->gl_target);
- return;
}
+promote:
+ do_promote(gl);
+ if (find_first_holder(gl))
+ goto out_unlock;
+ gh = find_first_waiter(gl);
+ if (!gh)
+ goto out_unlock;
+ if (nonblock)
+ goto out_sched;
+ gl->gl_target = gh->gh_state;
+ if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
+ do_error(gl, 0); /* Fail queued try locks */
+ do_xmote(gl, gh, gl->gl_target);
+ return;
+
out_sched:
clear_bit(GLF_LOCK, &gl->gl_flags);
- smp_mb__after_atomic();
gl->gl_lockref.count++;
gfs2_glock_queue_work(gl, 0);
return;
out_unlock:
clear_bit(GLF_LOCK, &gl->gl_flags);
- smp_mb__after_atomic();
}
/**
@@ -1462,6 +1447,24 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
va_end(args);
}
+static bool gfs2_should_queue_trylock(struct gfs2_glock *gl,
+ struct gfs2_holder *gh)
+{
+ struct gfs2_holder *current_gh, *gh2;
+
+ current_gh = find_first_holder(gl);
+ if (current_gh && !may_grant(gl, current_gh, gh))
+ return false;
+
+ list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
+ if (test_bit(HIF_HOLDER, &gh2->gh_iflags))
+ continue;
+ if (!(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
+ return false;
+ }
+ return true;
+}
+
static inline bool pid_is_meaningful(const struct gfs2_holder *gh)
{
if (!(gh->gh_flags & GL_NOPID))
@@ -1480,27 +1483,20 @@ static inline bool pid_is_meaningful(const struct gfs2_holder *gh)
*/
static inline void add_to_queue(struct gfs2_holder *gh)
-__releases(&gl->gl_lockref.lock)
-__acquires(&gl->gl_lockref.lock)
{
struct gfs2_glock *gl = gh->gh_gl;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_holder *gh2;
- int try_futile = 0;
GLOCK_BUG_ON(gl, gh->gh_owner_pid == NULL);
if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags))
GLOCK_BUG_ON(gl, true);
- if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
- if (test_bit(GLF_LOCK, &gl->gl_flags)) {
- struct gfs2_holder *current_gh;
-
- current_gh = find_first_holder(gl);
- try_futile = !may_grant(gl, current_gh, gh);
- }
- if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
- goto fail;
+ if ((gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) &&
+ !gfs2_should_queue_trylock(gl, gh)) {
+ gh->gh_error = GLR_TRYFAILED;
+ gfs2_holder_wake(gh);
+ return;
}
list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
@@ -1512,15 +1508,6 @@ __acquires(&gl->gl_lockref.lock)
continue;
goto trap_recursive;
}
- list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
- if (try_futile &&
- !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
-fail:
- gh->gh_error = GLR_TRYFAILED;
- gfs2_holder_wake(gh);
- return;
- }
- }
trace_gfs2_glock_queue(gh, 1);
gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT);
gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT);
@@ -2321,8 +2308,6 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
*p++ = 'y';
if (test_bit(GLF_LFLUSH, gflags))
*p++ = 'f';
- if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags))
- *p++ = 'i';
if (test_bit(GLF_PENDING_REPLY, gflags))
*p++ = 'R';
if (test_bit(GLF_HAVE_REPLY, gflags))
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 9339a3bff6ee..d041b922b45e 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -68,6 +68,10 @@ enum {
* also be granted in SHARED. The preferred state is whichever is compatible
* with other granted locks, or the specified state if no other locks exist.
*
+ * In addition, when a lock is already held in EX mode locally, a SHARED or
+ * DEFERRED mode request with the LM_FLAG_ANY flag set will be granted.
+ * (The LM_FLAG_ANY flag is only used for SHARED mode requests currently.)
+ *
* LM_FLAG_NODE_SCOPE
* This holder agrees to share the lock within this node. In other words,
* the glock is held in EX mode according to DLM, but local holders on the
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index d4ad82f47eee..5a0ea416cfda 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -319,7 +319,6 @@ enum {
GLF_DEMOTE_IN_PROGRESS = 5,
GLF_DIRTY = 6,
GLF_LFLUSH = 7,
- GLF_INVALIDATE_IN_PROGRESS = 8,
GLF_HAVE_REPLY = 9,
GLF_INITIAL = 10,
GLF_HAVE_FROZEN_REPLY = 11,
@@ -376,7 +375,6 @@ struct gfs2_glock {
enum {
GIF_QD_LOCKED = 1,
GIF_SW_PAGED = 3,
- GIF_FREE_VFS_INODE = 5,
GIF_GLOP_PENDING = 6,
};
@@ -658,6 +656,8 @@ struct lm_lockstruct {
struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */
char *ls_lvb_bits;
+ struct rw_semaphore ls_sem;
+
spinlock_t ls_recover_spin; /* protects following fields */
unsigned long ls_recover_flags; /* DFL_ */
uint32_t ls_recover_mount; /* gen in first recover_done cb */
@@ -823,7 +823,6 @@ struct gfs2_sbd {
atomic_t sd_log_in_flight;
wait_queue_head_t sd_log_flush_wait;
int sd_log_error; /* First log error */
- wait_queue_head_t sd_withdraw_wait;
unsigned int sd_log_tail;
unsigned int sd_log_flush_tail;
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index cee5d199d2d8..4f00af7dd256 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -58,6 +58,7 @@ static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index,
/**
* gfs2_update_reply_times - Update locking statistics
* @gl: The glock to update
+ * @blocking: The operation may have been blocking
*
* This assumes that gl->gl_dstamp has been set earlier.
*
@@ -72,12 +73,12 @@ static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index,
* TRY_1CB flags are set are classified as non-blocking. All
* other DLM requests are counted as (potentially) blocking.
*/
-static inline void gfs2_update_reply_times(struct gfs2_glock *gl)
+static inline void gfs2_update_reply_times(struct gfs2_glock *gl,
+ bool blocking)
{
struct gfs2_pcpu_lkstats *lks;
const unsigned gltype = gl->gl_name.ln_type;
- unsigned index = test_bit(GLF_BLOCKING, &gl->gl_flags) ?
- GFS2_LKS_SRTTB : GFS2_LKS_SRTT;
+ unsigned index = blocking ? GFS2_LKS_SRTTB : GFS2_LKS_SRTT;
s64 rtt;
preempt_disable();
@@ -119,14 +120,18 @@ static inline void gfs2_update_request_times(struct gfs2_glock *gl)
static void gdlm_ast(void *arg)
{
struct gfs2_glock *gl = arg;
+ bool blocking;
unsigned ret;
+ blocking = test_bit(GLF_BLOCKING, &gl->gl_flags);
+ gfs2_update_reply_times(gl, blocking);
+ clear_bit(GLF_BLOCKING, &gl->gl_flags);
+
/* If the glock is dead, we only react to a dlm_unlock() reply. */
if (__lockref_is_dead(&gl->gl_lockref) &&
gl->gl_lksb.sb_status != -DLM_EUNLOCK)
return;
- gfs2_update_reply_times(gl);
BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED);
if ((gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID) && gl->gl_lksb.sb_lvbptr)
@@ -157,14 +162,6 @@ static void gdlm_ast(void *arg)
}
ret = gl->gl_req;
- if (gl->gl_lksb.sb_flags & DLM_SBF_ALTMODE) {
- if (gl->gl_req == LM_ST_SHARED)
- ret = LM_ST_DEFERRED;
- else if (gl->gl_req == LM_ST_DEFERRED)
- ret = LM_ST_SHARED;
- else
- BUG();
- }
/*
* The GLF_INITIAL flag is initially set for new glocks. Upon the
@@ -241,7 +238,7 @@ static bool down_conversion(int cur, int req)
}
static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags,
- const int cur, const int req)
+ const int req, bool blocking)
{
u32 lkf = 0;
@@ -256,15 +253,6 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags,
lkf |= DLM_LKF_NOQUEUEBAST;
}
- if (gfs_flags & LM_FLAG_ANY) {
- if (req == DLM_LOCK_PR)
- lkf |= DLM_LKF_ALTCW;
- else if (req == DLM_LOCK_CW)
- lkf |= DLM_LKF_ALTPR;
- else
- BUG();
- }
-
if (!test_bit(GLF_INITIAL, &gl->gl_flags)) {
lkf |= DLM_LKF_CONVERT;
@@ -274,7 +262,7 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags,
* "upward" lock conversions or else DLM will reject the
* request as invalid.
*/
- if (!down_conversion(cur, req))
+ if (blocking)
lkf |= DLM_LKF_QUECVT;
}
@@ -294,14 +282,20 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
unsigned int flags)
{
struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct;
+ bool blocking;
int cur, req;
u32 lkf;
char strname[GDLM_STRNAME_BYTES] = "";
int error;
+ gl->gl_req = req_state;
cur = make_mode(gl->gl_name.ln_sbd, gl->gl_state);
req = make_mode(gl->gl_name.ln_sbd, req_state);
- lkf = make_flags(gl, flags, cur, req);
+ blocking = !down_conversion(cur, req) &&
+ !(flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB));
+ lkf = make_flags(gl, flags, req, blocking);
+ if (blocking)
+ set_bit(GLF_BLOCKING, &gl->gl_flags);
gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT);
gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT);
if (test_bit(GLF_INITIAL, &gl->gl_flags)) {
@@ -318,8 +312,13 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
*/
again:
- error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname,
- GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
+ down_read(&ls->ls_sem);
+ error = -ENODEV;
+ if (likely(ls->ls_dlm != NULL)) {
+ error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname,
+ GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
+ }
+ up_read(&ls->ls_sem);
if (error == -EBUSY) {
msleep(20);
goto again;
@@ -341,17 +340,10 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
return;
}
- clear_bit(GLF_BLOCKING, &gl->gl_flags);
gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT);
gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT);
gfs2_update_request_times(gl);
- /* don't want to call dlm if we've unmounted the lock protocol */
- if (test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) {
- gfs2_glock_free(gl);
- return;
- }
-
/*
* When the lockspace is released, all remaining glocks will be
* unlocked automatically. This is more efficient than unlocking them
@@ -369,13 +361,23 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
flags |= DLM_LKF_VALBLK;
again:
- error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, flags,
- NULL, gl);
+ down_read(&ls->ls_sem);
+ error = -ENODEV;
+ if (likely(ls->ls_dlm != NULL)) {
+ error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, flags,
+ NULL, gl);
+ }
+ up_read(&ls->ls_sem);
if (error == -EBUSY) {
msleep(20);
goto again;
}
+ if (error == -ENODEV) {
+ gfs2_glock_free(gl);
+ return;
+ }
+
if (error) {
fs_err(sdp, "gdlm_unlock %x,%llx err=%d\n",
gl->gl_name.ln_type,
@@ -386,7 +388,12 @@ again:
static void gdlm_cancel(struct gfs2_glock *gl)
{
struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct;
- dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl);
+
+ down_read(&ls->ls_sem);
+ if (likely(ls->ls_dlm != NULL)) {
+ dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl);
+ }
+ up_read(&ls->ls_sem);
}
/*
@@ -567,7 +574,11 @@ static int sync_unlock(struct gfs2_sbd *sdp, struct dlm_lksb *lksb, char *name)
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
int error;
- error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls);
+ down_read(&ls->ls_sem);
+ error = -ENODEV;
+ if (likely(ls->ls_dlm != NULL))
+ error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls);
+ up_read(&ls->ls_sem);
if (error) {
fs_err(sdp, "%s lkid %x error %d\n",
name, lksb->sb_lkid, error);
@@ -594,9 +605,14 @@ static int sync_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags,
memset(strname, 0, GDLM_STRNAME_BYTES);
snprintf(strname, GDLM_STRNAME_BYTES, "%8x%16x", LM_TYPE_NONDISK, num);
- error = dlm_lock(ls->ls_dlm, mode, lksb, flags,
- strname, GDLM_STRNAME_BYTES - 1,
- 0, sync_wait_cb, ls, NULL);
+ down_read(&ls->ls_sem);
+ error = -ENODEV;
+ if (likely(ls->ls_dlm != NULL)) {
+ error = dlm_lock(ls->ls_dlm, mode, lksb, flags,
+ strname, GDLM_STRNAME_BYTES - 1,
+ 0, sync_wait_cb, ls, NULL);
+ }
+ up_read(&ls->ls_sem);
if (error) {
fs_err(sdp, "%s lkid %x flags %x mode %d error %d\n",
name, lksb->sb_lkid, flags, mode, error);
@@ -1323,6 +1339,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table)
*/
INIT_DELAYED_WORK(&sdp->sd_control_work, gfs2_control_func);
+ ls->ls_dlm = NULL;
spin_lock_init(&ls->ls_recover_spin);
ls->ls_recover_flags = 0;
ls->ls_recover_mount = 0;
@@ -1357,6 +1374,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table)
* create/join lockspace
*/
+ init_rwsem(&ls->ls_sem);
error = dlm_new_lockspace(fsname, cluster, flags, GDLM_LVB_SIZE,
&gdlm_lockspace_ops, sdp, &ops_result,
&ls->ls_dlm);
@@ -1400,7 +1418,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table)
return 0;
fail_release:
- dlm_release_lockspace(ls->ls_dlm, 2);
+ dlm_release_lockspace(ls->ls_dlm, DLM_RELEASE_NORMAL);
fail_free:
free_recover_size(ls);
fail:
@@ -1436,10 +1454,12 @@ static void gdlm_unmount(struct gfs2_sbd *sdp)
/* mounted_lock and control_lock will be purged in dlm recovery */
release:
+ down_write(&ls->ls_sem);
if (ls->ls_dlm) {
- dlm_release_lockspace(ls->ls_dlm, 2);
+ dlm_release_lockspace(ls->ls_dlm, DLM_RELEASE_NORMAL);
ls->ls_dlm = NULL;
}
+ up_write(&ls->ls_sem);
free_recover_size(ls);
}
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 0727f60ad028..9d65719353fa 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -151,7 +151,8 @@ static int __init init_gfs2_fs(void)
error = -ENOMEM;
gfs2_recovery_wq = alloc_workqueue("gfs2_recovery",
- WQ_MEM_RECLAIM | WQ_FREEZABLE, 0);
+ WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU,
+ 0);
if (!gfs2_recovery_wq)
goto fail_wq1;
@@ -160,7 +161,7 @@ static int __init init_gfs2_fs(void)
if (!gfs2_control_wq)
goto fail_wq2;
- gfs2_freeze_wq = alloc_workqueue("gfs2_freeze", 0, 0);
+ gfs2_freeze_wq = alloc_workqueue("gfs2_freeze", WQ_PERCPU, 0);
if (!gfs2_freeze_wq)
goto fail_wq3;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index efe99b732551..aa15183f9a16 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1193,13 +1193,15 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
error = -ENOMEM;
sdp->sd_glock_wq = alloc_workqueue("gfs2-glock/%s",
- WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 0,
+ WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE | WQ_PERCPU,
+ 0,
sdp->sd_fsname);
if (!sdp->sd_glock_wq)
goto fail_iput;
sdp->sd_delete_wq = alloc_workqueue("gfs2-delete/%s",
- WQ_MEM_RECLAIM | WQ_FREEZABLE, 0, sdp->sd_fsname);
+ WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU, 0,
+ sdp->sd_fsname);
if (!sdp->sd_delete_wq)
goto fail_glock_wq;
@@ -1754,7 +1756,7 @@ static void gfs2_evict_inodes(struct super_block *sb)
spin_unlock(&inode->i_lock);
continue;
}
- atomic_inc(&inode->i_count);
+ __iget(inode);
spin_unlock(&inode->i_lock);
spin_unlock(&sb->s_inode_list_lock);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index b42e2110084b..644b2d1e7276 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1050,7 +1050,7 @@ static int gfs2_drop_inode(struct inode *inode)
if (test_bit(SDF_EVICTING, &sdp->sd_flags))
return 1;
- return generic_drop_inode(inode);
+ return inode_generic_drop(inode);
}
/**
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 26036ffc3f33..1c2507a27318 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -52,7 +52,6 @@
{(1UL << GLF_DEMOTE_IN_PROGRESS), "p" }, \
{(1UL << GLF_DIRTY), "y" }, \
{(1UL << GLF_LFLUSH), "f" }, \
- {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \
{(1UL << GLF_PENDING_REPLY), "R" }, \
{(1UL << GLF_HAVE_REPLY), "r" }, \
{(1UL << GLF_INITIAL), "a" }, \
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 24864a66074b..56412f63f3bb 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -309,7 +309,7 @@ void gfs2_lm(struct gfs2_sbd *sdp, const char *fmt, ...)
va_end(args);
}
-int gfs2_withdraw(struct gfs2_sbd *sdp)
+void gfs2_withdraw(struct gfs2_sbd *sdp)
{
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
const struct lm_lockops *lm = ls->ls_ops;
@@ -322,7 +322,7 @@ int gfs2_withdraw(struct gfs2_sbd *sdp)
wait_on_bit(&sdp->sd_flags,
SDF_WITHDRAW_IN_PROG,
TASK_UNINTERRUPTIBLE);
- return -1;
+ return;
}
new = old | BIT(SDF_WITHDRAWN) | BIT(SDF_WITHDRAW_IN_PROG);
} while (unlikely(!try_cmpxchg(&sdp->sd_flags, &old, new)));
@@ -350,8 +350,6 @@ int gfs2_withdraw(struct gfs2_sbd *sdp)
if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
panic("GFS2: fsid=%s: panic requested\n", sdp->sd_fsname);
-
- return -1;
}
/*
@@ -473,46 +471,36 @@ void gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd,
/*
* gfs2_meta_check_ii - Flag a magic number consistency error and withdraw
- * Returns: -1 if this call withdrew the machine,
- * -2 if it was already withdrawn
*/
-int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
- const char *function, char *file,
- unsigned int line)
+void gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
+ const char *function, char *file,
+ unsigned int line)
{
- int me;
-
gfs2_lm(sdp,
"fatal: invalid metadata block - "
"bh = %llu (bad magic number), "
"function = %s, file = %s, line = %u\n",
(unsigned long long)bh->b_blocknr,
function, file, line);
- me = gfs2_withdraw(sdp);
- return (me) ? -1 : -2;
+ gfs2_withdraw(sdp);
}
/*
* gfs2_metatype_check_ii - Flag a metadata type consistency error and withdraw
- * Returns: -1 if this call withdrew the machine,
- * -2 if it was already withdrawn
*/
-int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
- u16 type, u16 t, const char *function,
- char *file, unsigned int line)
+void gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
+ u16 type, u16 t, const char *function,
+ char *file, unsigned int line)
{
- int me;
-
gfs2_lm(sdp,
"fatal: invalid metadata block - "
"bh = %llu (type: exp=%u, found=%u), "
"function = %s, file = %s, line = %u\n",
(unsigned long long)bh->b_blocknr, type, t,
function, file, line);
- me = gfs2_withdraw(sdp);
- return (me) ? -1 : -2;
+ gfs2_withdraw(sdp);
}
/*
@@ -521,14 +509,14 @@ int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
* 0 if it was already withdrawn
*/
-int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file,
- unsigned int line)
+void gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file,
+ unsigned int line)
{
gfs2_lm(sdp,
"fatal: I/O error - "
"function = %s, file = %s, line = %u\n",
function, file, line);
- return gfs2_withdraw(sdp);
+ gfs2_withdraw(sdp);
}
/*
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 27d03b641024..da0373b1e82b 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -91,9 +91,9 @@ void gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd,
gfs2_consist_rgrpd_i((rgd), __func__, __FILE__, __LINE__)
-int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
- const char *function,
- char *file, unsigned int line);
+void gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
+ const char *function,
+ char *file, unsigned int line);
static inline int gfs2_meta_check(struct gfs2_sbd *sdp,
struct buffer_head *bh)
@@ -108,10 +108,10 @@ static inline int gfs2_meta_check(struct gfs2_sbd *sdp,
return 0;
}
-int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
- u16 type, u16 t,
- const char *function,
- char *file, unsigned int line);
+void gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
+ u16 type, u16 t,
+ const char *function,
+ char *file, unsigned int line);
static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp,
struct buffer_head *bh,
@@ -122,12 +122,16 @@ static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp,
struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
u32 magic = be32_to_cpu(mh->mh_magic);
u16 t = be32_to_cpu(mh->mh_type);
- if (unlikely(magic != GFS2_MAGIC))
- return gfs2_meta_check_ii(sdp, bh, function,
- file, line);
- if (unlikely(t != type))
- return gfs2_metatype_check_ii(sdp, bh, type, t, function,
- file, line);
+ if (unlikely(magic != GFS2_MAGIC)) {
+ gfs2_meta_check_ii(sdp, bh, function,
+ file, line);
+ return -EIO;
+ }
+ if (unlikely(t != type)) {
+ gfs2_metatype_check_ii(sdp, bh, type, t, function,
+ file, line);
+ return -EIO;
+ }
return 0;
}
@@ -144,8 +148,8 @@ static inline void gfs2_metatype_set(struct buffer_head *bh, u16 type,
}
-int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function,
- char *file, unsigned int line);
+void gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function,
+ char *file, unsigned int line);
int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
bool verbose);
@@ -228,6 +232,6 @@ gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
__printf(2, 3)
void gfs2_lm(struct gfs2_sbd *sdp, const char *fmt, ...);
-int gfs2_withdraw(struct gfs2_sbd *sdp);
+void gfs2_withdraw(struct gfs2_sbd *sdp);
#endif /* __UTIL_DOT_H__ */
diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c
index 34e9804e0f36..c2f840c49e60 100644
--- a/fs/hfs/bfind.c
+++ b/fs/hfs/bfind.c
@@ -21,12 +21,12 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
fd->tree = tree;
fd->bnode = NULL;
- ptr = kmalloc(tree->max_key_len * 2 + 4, GFP_KERNEL);
+ ptr = kzalloc(tree->max_key_len * 2 + 4, GFP_KERNEL);
if (!ptr)
return -ENOMEM;
fd->search_key = ptr;
fd->key = ptr + tree->max_key_len + 2;
- hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n",
+ hfs_dbg("cnid %d, caller %ps\n",
tree->cnid, __builtin_return_address(0));
switch (tree->cnid) {
case HFS_CAT_CNID:
@@ -48,7 +48,7 @@ void hfs_find_exit(struct hfs_find_data *fd)
{
hfs_bnode_put(fd->bnode);
kfree(fd->search_key);
- hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n",
+ hfs_dbg("cnid %d, caller %ps\n",
fd->tree->cnid, __builtin_return_address(0));
mutex_unlock(&fd->tree->tree_lock);
fd->tree = NULL;
@@ -115,6 +115,12 @@ int hfs_brec_find(struct hfs_find_data *fd)
__be32 data;
int height, res;
+ fd->record = -1;
+ fd->keyoffset = -1;
+ fd->keylength = -1;
+ fd->entryoffset = -1;
+ fd->entrylength = -1;
+
tree = fd->tree;
if (fd->bnode)
hfs_bnode_put(fd->bnode);
diff --git a/fs/hfs/bitmap.c b/fs/hfs/bitmap.c
index 28307bc9ec1e..5e84833a4743 100644
--- a/fs/hfs/bitmap.c
+++ b/fs/hfs/bitmap.c
@@ -158,7 +158,7 @@ u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits)
}
}
- hfs_dbg(BITMAP, "alloc_bits: %u,%u\n", pos, *num_bits);
+ hfs_dbg("pos %u, num_bits %u\n", pos, *num_bits);
HFS_SB(sb)->free_ablocks -= *num_bits;
hfs_bitmap_dirty(sb);
out:
@@ -200,7 +200,7 @@ int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count)
if (!count)
return 0;
- hfs_dbg(BITMAP, "clear_bits: %u,%u\n", start, count);
+ hfs_dbg("start %u, count %u\n", start, count);
/* are all of the bits in range? */
if ((start + count) > HFS_SB(sb)->fs_ablocks)
return -2;
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index e8cd1a31f247..fcfffe75d84e 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -200,7 +200,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
{
struct page *src_page, *dst_page;
- hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len);
+ hfs_dbg("dst %u, src %u, len %u\n", dst, src, len);
if (!len)
return;
@@ -221,7 +221,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
struct page *page;
void *ptr;
- hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len);
+ hfs_dbg("dst %u, src %u, len %u\n", dst, src, len);
if (!len)
return;
@@ -243,16 +243,16 @@ void hfs_bnode_dump(struct hfs_bnode *node)
__be32 cnid;
int i, off, key_off;
- hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this);
+ hfs_dbg("node %d\n", node->this);
hfs_bnode_read(node, &desc, 0, sizeof(desc));
- hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n",
+ hfs_dbg("next %d, prev %d, type %d, height %d, num_recs %d\n",
be32_to_cpu(desc.next), be32_to_cpu(desc.prev),
desc.type, desc.height, be16_to_cpu(desc.num_recs));
off = node->tree->node_size - 2;
for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) {
key_off = hfs_bnode_read_u16(node, off);
- hfs_dbg_cont(BNODE_MOD, " %d", key_off);
+ hfs_dbg(" key_off %d", key_off);
if (i && node->type == HFS_NODE_INDEX) {
int tmp;
@@ -260,18 +260,18 @@ void hfs_bnode_dump(struct hfs_bnode *node)
tmp = (hfs_bnode_read_u8(node, key_off) | 1) + 1;
else
tmp = node->tree->max_key_len + 1;
- hfs_dbg_cont(BNODE_MOD, " (%d,%d",
- tmp, hfs_bnode_read_u8(node, key_off));
+ hfs_dbg(" (%d,%d",
+ tmp, hfs_bnode_read_u8(node, key_off));
hfs_bnode_read(node, &cnid, key_off + tmp, 4);
- hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid));
+ hfs_dbg(", cnid %d)", be32_to_cpu(cnid));
} else if (i && node->type == HFS_NODE_LEAF) {
int tmp;
tmp = hfs_bnode_read_u8(node, key_off);
- hfs_dbg_cont(BNODE_MOD, " (%d)", tmp);
+ hfs_dbg(" (%d)", tmp);
}
}
- hfs_dbg_cont(BNODE_MOD, "\n");
+ hfs_dbg("\n");
}
void hfs_bnode_unlink(struct hfs_bnode *node)
@@ -361,7 +361,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
node->this = cnid;
set_bit(HFS_BNODE_NEW, &node->flags);
atomic_set(&node->refcnt, 1);
- hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n",
+ hfs_dbg("cnid %d, node %d, refcnt 1\n",
node->tree->cnid, node->this);
init_waitqueue_head(&node->lock_wq);
spin_lock(&tree->hash_lock);
@@ -401,7 +401,7 @@ void hfs_bnode_unhash(struct hfs_bnode *node)
{
struct hfs_bnode **p;
- hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n",
+ hfs_dbg("cnid %d, node %d, refcnt %d\n",
node->tree->cnid, node->this, atomic_read(&node->refcnt));
for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)];
*p && *p != node; p = &(*p)->next_hash)
@@ -546,7 +546,7 @@ void hfs_bnode_get(struct hfs_bnode *node)
{
if (node) {
atomic_inc(&node->refcnt);
- hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n",
+ hfs_dbg("cnid %d, node %d, refcnt %d\n",
node->tree->cnid, node->this,
atomic_read(&node->refcnt));
}
@@ -559,7 +559,7 @@ void hfs_bnode_put(struct hfs_bnode *node)
struct hfs_btree *tree = node->tree;
int i;
- hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n",
+ hfs_dbg("cnid %d, node %d, refcnt %d\n",
node->tree->cnid, node->this,
atomic_read(&node->refcnt));
BUG_ON(!atomic_read(&node->refcnt));
diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c
index 896396554bcc..e49a141c87e5 100644
--- a/fs/hfs/brec.c
+++ b/fs/hfs/brec.c
@@ -94,7 +94,7 @@ again:
end_rec_off = tree->node_size - (node->num_recs + 1) * 2;
end_off = hfs_bnode_read_u16(node, end_rec_off);
end_rec_off -= 2;
- hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n",
+ hfs_dbg("rec %d, size %d, end_off %d, end_rec_off %d\n",
rec, size, end_off, end_rec_off);
if (size > end_rec_off - end_off) {
if (new_node)
@@ -179,6 +179,7 @@ int hfs_brec_remove(struct hfs_find_data *fd)
struct hfs_btree *tree;
struct hfs_bnode *node, *parent;
int end_off, rec_off, data_off, size;
+ int src, dst, len;
tree = fd->tree;
node = fd->bnode;
@@ -191,7 +192,7 @@ again:
mark_inode_dirty(tree->inode);
}
hfs_bnode_dump(node);
- hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n",
+ hfs_dbg("rec %d, len %d\n",
fd->record, fd->keylength + fd->entrylength);
if (!--node->num_recs) {
hfs_bnode_unlink(node);
@@ -208,10 +209,14 @@ again:
}
hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs);
- if (rec_off == end_off)
- goto skip;
size = fd->keylength + fd->entrylength;
+ if (rec_off == end_off) {
+ src = fd->keyoffset;
+ hfs_bnode_clear(node, src, size);
+ goto skip;
+ }
+
do {
data_off = hfs_bnode_read_u16(node, rec_off);
hfs_bnode_write_u16(node, rec_off + 2, data_off - size);
@@ -219,9 +224,23 @@ again:
} while (rec_off >= end_off);
/* fill hole */
- hfs_bnode_move(node, fd->keyoffset, fd->keyoffset + size,
- data_off - fd->keyoffset - size);
+ dst = fd->keyoffset;
+ src = fd->keyoffset + size;
+ len = data_off - src;
+
+ hfs_bnode_move(node, dst, src, len);
+
+ src = dst + len;
+ len = data_off - src;
+
+ hfs_bnode_clear(node, src, len);
+
skip:
+ /*
+ * Remove the obsolete offset to free space.
+ */
+ hfs_bnode_write_u16(node, end_off, 0);
+
hfs_bnode_dump(node);
if (!fd->record)
hfs_brec_update_parent(fd);
@@ -242,7 +261,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
if (IS_ERR(new_node))
return new_node;
hfs_bnode_get(node);
- hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n",
+ hfs_dbg("this %d, new %d, next %d\n",
node->this, new_node->this, node->next);
new_node->next = node->next;
new_node->prev = node->this;
@@ -378,7 +397,7 @@ again:
newkeylen = (hfs_bnode_read_u8(node, 14) | 1) + 1;
else
fd->keylength = newkeylen = tree->max_key_len + 1;
- hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n",
+ hfs_dbg("rec %d, keylength %d, newkeylen %d\n",
rec, fd->keylength, newkeylen);
rec_off = tree->node_size - (rec + 2) * 2;
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index e86e1e235658..22e62fe7448b 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -364,7 +364,7 @@ void hfs_bmap_free(struct hfs_bnode *node)
u32 nidx;
u8 *data, byte, m;
- hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this);
+ hfs_dbg("node %u\n", node->this);
tree = node->tree;
nidx = node->this;
node = hfs_bnode_find(tree, 0);
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index d63880e7d9d6..caebabb6642f 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -87,7 +87,7 @@ int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct i
int entry_size;
int err;
- hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n",
+ hfs_dbg("name %s, cnid %u, i_nlink %d\n",
str->name, cnid, inode->i_nlink);
if (dir->i_size >= HFS_MAX_VALENCE)
return -ENOSPC;
@@ -211,6 +211,124 @@ int hfs_cat_find_brec(struct super_block *sb, u32 cnid,
return hfs_brec_find(fd);
}
+static inline
+void hfs_set_next_unused_CNID(struct super_block *sb,
+ u32 deleted_cnid, u32 found_cnid)
+{
+ if (found_cnid < HFS_FIRSTUSER_CNID) {
+ atomic64_cmpxchg(&HFS_SB(sb)->next_id,
+ deleted_cnid + 1, HFS_FIRSTUSER_CNID);
+ } else {
+ atomic64_cmpxchg(&HFS_SB(sb)->next_id,
+ deleted_cnid + 1, found_cnid + 1);
+ }
+}
+
+/*
+ * hfs_correct_next_unused_CNID()
+ *
+ * Correct the next unused CNID of Catalog Tree.
+ */
+static
+int hfs_correct_next_unused_CNID(struct super_block *sb, u32 cnid)
+{
+ struct hfs_btree *cat_tree;
+ struct hfs_bnode *node;
+ s64 leaf_head;
+ s64 leaf_tail;
+ s64 node_id;
+
+ hfs_dbg("cnid %u, next_id %lld\n",
+ cnid, atomic64_read(&HFS_SB(sb)->next_id));
+
+ if ((cnid + 1) < atomic64_read(&HFS_SB(sb)->next_id)) {
+ /* next ID should be unchanged */
+ return 0;
+ }
+
+ cat_tree = HFS_SB(sb)->cat_tree;
+ leaf_head = cat_tree->leaf_head;
+ leaf_tail = cat_tree->leaf_tail;
+
+ if (leaf_head > leaf_tail) {
+ pr_err("node is corrupted: leaf_head %lld, leaf_tail %lld\n",
+ leaf_head, leaf_tail);
+ return -ERANGE;
+ }
+
+ node = hfs_bnode_find(cat_tree, leaf_tail);
+ if (IS_ERR(node)) {
+ pr_err("fail to find leaf node: node ID %lld\n",
+ leaf_tail);
+ return -ENOENT;
+ }
+
+ node_id = leaf_tail;
+
+ do {
+ int i;
+
+ if (node_id != leaf_tail) {
+ node = hfs_bnode_find(cat_tree, node_id);
+ if (IS_ERR(node))
+ return -ENOENT;
+ }
+
+ hfs_dbg("node %lld, leaf_tail %lld, leaf_head %lld\n",
+ node_id, leaf_tail, leaf_head);
+
+ hfs_bnode_dump(node);
+
+ for (i = node->num_recs - 1; i >= 0; i--) {
+ hfs_cat_rec rec;
+ u16 off, len, keylen;
+ int entryoffset;
+ int entrylength;
+ u32 found_cnid;
+
+ len = hfs_brec_lenoff(node, i, &off);
+ keylen = hfs_brec_keylen(node, i);
+ if (keylen == 0) {
+ pr_err("fail to get the keylen: "
+ "node_id %lld, record index %d\n",
+ node_id, i);
+ return -EINVAL;
+ }
+
+ entryoffset = off + keylen;
+ entrylength = len - keylen;
+
+ if (entrylength > sizeof(rec)) {
+ pr_err("unexpected record length: "
+ "entrylength %d\n",
+ entrylength);
+ return -EINVAL;
+ }
+
+ hfs_bnode_read(node, &rec, entryoffset, entrylength);
+
+ if (rec.type == HFS_CDR_DIR) {
+ found_cnid = be32_to_cpu(rec.dir.DirID);
+ hfs_dbg("found_cnid %u\n", found_cnid);
+ hfs_set_next_unused_CNID(sb, cnid, found_cnid);
+ hfs_bnode_put(node);
+ return 0;
+ } else if (rec.type == HFS_CDR_FIL) {
+ found_cnid = be32_to_cpu(rec.file.FlNum);
+ hfs_dbg("found_cnid %u\n", found_cnid);
+ hfs_set_next_unused_CNID(sb, cnid, found_cnid);
+ hfs_bnode_put(node);
+ return 0;
+ }
+ }
+
+ hfs_bnode_put(node);
+
+ node_id = node->prev;
+ } while (node_id >= leaf_head);
+
+ return -ENOENT;
+}
/*
* hfs_cat_delete()
@@ -225,7 +343,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str)
struct hfs_readdir_data *rd;
int res, type;
- hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
+ hfs_dbg("name %s, cnid %u\n", str ? str->name : NULL, cnid);
sb = dir->i_sb;
res = hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
if (res)
@@ -271,6 +389,11 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str)
dir->i_size--;
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
+
+ res = hfs_correct_next_unused_CNID(sb, cnid);
+ if (res)
+ goto out;
+
res = 0;
out:
hfs_find_exit(&fd);
@@ -294,7 +417,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name,
int entry_size, type;
int err;
- hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n",
+ hfs_dbg("cnid %u - (ino %lu, name %s) - (ino %lu, name %s)\n",
cnid, src_dir->i_ino, src_name->name,
dst_dir->i_ino, dst_name->name);
sb = src_dir->i_sb;
diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c
index 580c62981dbd..a097908b269d 100644
--- a/fs/hfs/extent.c
+++ b/fs/hfs/extent.c
@@ -209,12 +209,12 @@ static void hfs_dump_extent(struct hfs_extent *extent)
{
int i;
- hfs_dbg(EXTENT, " ");
+ hfs_dbg("extent: ");
for (i = 0; i < 3; i++)
- hfs_dbg_cont(EXTENT, " %u:%u",
- be16_to_cpu(extent[i].block),
- be16_to_cpu(extent[i].count));
- hfs_dbg_cont(EXTENT, "\n");
+ hfs_dbg(" block %u, count %u",
+ be16_to_cpu(extent[i].block),
+ be16_to_cpu(extent[i].count));
+ hfs_dbg("\n");
}
static int hfs_add_extent(struct hfs_extent *extent, u16 offset,
@@ -411,10 +411,11 @@ int hfs_extend_file(struct inode *inode)
goto out;
}
- hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len);
+ hfs_dbg("ino %lu, start %u, len %u\n", inode->i_ino, start, len);
if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) {
if (!HFS_I(inode)->first_blocks) {
- hfs_dbg(EXTENT, "first extents\n");
+ hfs_dbg("first_extent: start %u, len %u\n",
+ start, len);
/* no extents yet */
HFS_I(inode)->first_extents[0].block = cpu_to_be16(start);
HFS_I(inode)->first_extents[0].count = cpu_to_be16(len);
@@ -456,7 +457,7 @@ out:
return res;
insert_extent:
- hfs_dbg(EXTENT, "insert new extent\n");
+ hfs_dbg("insert new extent\n");
res = hfs_ext_write_extent(inode);
if (res)
goto out;
@@ -481,7 +482,7 @@ void hfs_file_truncate(struct inode *inode)
u32 size;
int res;
- hfs_dbg(INODE, "truncate: %lu, %Lu -> %Lu\n",
+ hfs_dbg("ino %lu, phys_size %llu -> i_size %llu\n",
inode->i_ino, (long long)HFS_I(inode)->phys_size,
inode->i_size);
if (inode->i_size > HFS_I(inode)->phys_size) {
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 7c5a7ecfa246..fff149af89da 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -9,12 +9,6 @@
#ifndef _LINUX_HFS_FS_H
#define _LINUX_HFS_FS_H
-#ifdef pr_fmt
-#undef pr_fmt
-#endif
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/mutex.h>
@@ -24,35 +18,10 @@
#include <asm/byteorder.h>
#include <linux/uaccess.h>
+#include <linux/hfs_common.h>
#include "hfs.h"
-#define DBG_BNODE_REFS 0x00000001
-#define DBG_BNODE_MOD 0x00000002
-#define DBG_CAT_MOD 0x00000004
-#define DBG_INODE 0x00000008
-#define DBG_SUPER 0x00000010
-#define DBG_EXTENT 0x00000020
-#define DBG_BITMAP 0x00000040
-
-//#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD|DBG_CAT_MOD|DBG_BITMAP)
-//#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE)
-//#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT)
-#define DBG_MASK (0)
-
-#define hfs_dbg(flg, fmt, ...) \
-do { \
- if (DBG_##flg & DBG_MASK) \
- printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \
-} while (0)
-
-#define hfs_dbg_cont(flg, fmt, ...) \
-do { \
- if (DBG_##flg & DBG_MASK) \
- pr_cont(fmt, ##__VA_ARGS__); \
-} while (0)
-
-
/*
* struct hfs_inode_info
*
@@ -112,13 +81,13 @@ struct hfs_sb_info {
the extents b-tree */
struct hfs_btree *cat_tree; /* Information about
the catalog b-tree */
- u32 file_count; /* The number of
+ atomic64_t file_count; /* The number of
regular files in
the filesystem */
- u32 folder_count; /* The number of
+ atomic64_t folder_count; /* The number of
directories in the
filesystem */
- u32 next_id; /* The next available
+ atomic64_t next_id; /* The next available
file id number */
u32 clumpablks; /* The number of allocation
blocks to try to add when
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index bf4cb7e78396..9cd449913dc8 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -183,6 +183,10 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t
{
struct super_block *sb = dir->i_sb;
struct inode *inode = new_inode(sb);
+ s64 next_id;
+ s64 file_count;
+ s64 folder_count;
+
if (!inode)
return NULL;
@@ -190,7 +194,9 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t
INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list);
spin_lock_init(&HFS_I(inode)->open_dir_lock);
hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name);
- inode->i_ino = HFS_SB(sb)->next_id++;
+ next_id = atomic64_inc_return(&HFS_SB(sb)->next_id);
+ BUG_ON(next_id > U32_MAX);
+ inode->i_ino = (u32)next_id;
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
@@ -202,7 +208,8 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t
HFS_I(inode)->tz_secondswest = sys_tz.tz_minuteswest * 60;
if (S_ISDIR(mode)) {
inode->i_size = 2;
- HFS_SB(sb)->folder_count++;
+ folder_count = atomic64_inc_return(&HFS_SB(sb)->folder_count);
+ BUG_ON(folder_count > U32_MAX);
if (dir->i_ino == HFS_ROOT_CNID)
HFS_SB(sb)->root_dirs++;
inode->i_op = &hfs_dir_inode_operations;
@@ -211,7 +218,8 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t
inode->i_mode &= ~HFS_SB(inode->i_sb)->s_dir_umask;
} else if (S_ISREG(mode)) {
HFS_I(inode)->clump_blocks = HFS_SB(sb)->clumpablks;
- HFS_SB(sb)->file_count++;
+ file_count = atomic64_inc_return(&HFS_SB(sb)->file_count);
+ BUG_ON(file_count > U32_MAX);
if (dir->i_ino == HFS_ROOT_CNID)
HFS_SB(sb)->root_files++;
inode->i_op = &hfs_file_inode_operations;
@@ -241,16 +249,19 @@ void hfs_delete_inode(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
- hfs_dbg(INODE, "delete_inode: %lu\n", inode->i_ino);
+ hfs_dbg("ino %lu\n", inode->i_ino);
if (S_ISDIR(inode->i_mode)) {
- HFS_SB(sb)->folder_count--;
+ BUG_ON(atomic64_read(&HFS_SB(sb)->folder_count) > U32_MAX);
+ atomic64_dec(&HFS_SB(sb)->folder_count);
if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID))
HFS_SB(sb)->root_dirs--;
set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags);
hfs_mark_mdb_dirty(sb);
return;
}
- HFS_SB(sb)->file_count--;
+
+ BUG_ON(atomic64_read(&HFS_SB(sb)->file_count) > U32_MAX);
+ atomic64_dec(&HFS_SB(sb)->file_count);
if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID))
HFS_SB(sb)->root_files--;
if (S_ISREG(inode->i_mode)) {
@@ -425,7 +436,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
hfs_cat_rec rec;
int res;
- hfs_dbg(INODE, "hfs_write_inode: %lu\n", inode->i_ino);
+ hfs_dbg("ino %lu\n", inode->i_ino);
res = hfs_ext_write_extent(inode);
if (res)
return res;
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 8082eb01127c..53f3fae60217 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -150,11 +150,11 @@ int hfs_mdb_get(struct super_block *sb)
/* These parameters are read from and written to the MDB */
HFS_SB(sb)->free_ablocks = be16_to_cpu(mdb->drFreeBks);
- HFS_SB(sb)->next_id = be32_to_cpu(mdb->drNxtCNID);
+ atomic64_set(&HFS_SB(sb)->next_id, be32_to_cpu(mdb->drNxtCNID));
HFS_SB(sb)->root_files = be16_to_cpu(mdb->drNmFls);
HFS_SB(sb)->root_dirs = be16_to_cpu(mdb->drNmRtDirs);
- HFS_SB(sb)->file_count = be32_to_cpu(mdb->drFilCnt);
- HFS_SB(sb)->folder_count = be32_to_cpu(mdb->drDirCnt);
+ atomic64_set(&HFS_SB(sb)->file_count, be32_to_cpu(mdb->drFilCnt));
+ atomic64_set(&HFS_SB(sb)->folder_count, be32_to_cpu(mdb->drDirCnt));
/* TRY to get the alternate (backup) MDB. */
sect = part_start + part_size - 2;
@@ -172,7 +172,7 @@ int hfs_mdb_get(struct super_block *sb)
pr_warn("continuing without an alternate MDB\n");
}
- HFS_SB(sb)->bitmap = kmalloc(8192, GFP_KERNEL);
+ HFS_SB(sb)->bitmap = kzalloc(8192, GFP_KERNEL);
if (!HFS_SB(sb)->bitmap)
goto out;
@@ -273,11 +273,17 @@ void hfs_mdb_commit(struct super_block *sb)
/* These parameters may have been modified, so write them back */
mdb->drLsMod = hfs_mtime();
mdb->drFreeBks = cpu_to_be16(HFS_SB(sb)->free_ablocks);
- mdb->drNxtCNID = cpu_to_be32(HFS_SB(sb)->next_id);
+ BUG_ON(atomic64_read(&HFS_SB(sb)->next_id) > U32_MAX);
+ mdb->drNxtCNID =
+ cpu_to_be32((u32)atomic64_read(&HFS_SB(sb)->next_id));
mdb->drNmFls = cpu_to_be16(HFS_SB(sb)->root_files);
mdb->drNmRtDirs = cpu_to_be16(HFS_SB(sb)->root_dirs);
- mdb->drFilCnt = cpu_to_be32(HFS_SB(sb)->file_count);
- mdb->drDirCnt = cpu_to_be32(HFS_SB(sb)->folder_count);
+ BUG_ON(atomic64_read(&HFS_SB(sb)->file_count) > U32_MAX);
+ mdb->drFilCnt =
+ cpu_to_be32((u32)atomic64_read(&HFS_SB(sb)->file_count));
+ BUG_ON(atomic64_read(&HFS_SB(sb)->folder_count) > U32_MAX);
+ mdb->drDirCnt =
+ cpu_to_be32((u32)atomic64_read(&HFS_SB(sb)->folder_count));
/* write MDB to disk */
mark_buffer_dirty(HFS_SB(sb)->mdb_bh);
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 388a318297ec..47f50fa555a4 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -319,6 +319,10 @@ static int hfs_fill_super(struct super_block *sb, struct fs_context *fc)
int silent = fc->sb_flags & SB_SILENT;
int res;
+ atomic64_set(&sbi->file_count, 0);
+ atomic64_set(&sbi->folder_count, 0);
+ atomic64_set(&sbi->next_id, 0);
+
/* load_nls_default does not fail */
if (sbi->nls_disk && !sbi->nls_io)
sbi->nls_io = load_nls_default();
diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c
index eeebe80c6be4..ba26980cc503 100644
--- a/fs/hfsplus/attributes.c
+++ b/fs/hfsplus/attributes.c
@@ -139,7 +139,7 @@ int hfsplus_find_attr(struct super_block *sb, u32 cnid,
{
int err = 0;
- hfs_dbg(ATTR_MOD, "find_attr: %s,%d\n", name ? name : NULL, cnid);
+ hfs_dbg("name %s, cnid %d\n", name ? name : NULL, cnid);
if (!HFSPLUS_SB(sb)->attr_tree) {
pr_err("attributes file doesn't exist\n");
@@ -201,7 +201,7 @@ int hfsplus_create_attr(struct inode *inode,
int entry_size;
int err;
- hfs_dbg(ATTR_MOD, "create_attr: %s,%ld\n",
+ hfs_dbg("name %s, ino %ld\n",
name ? name : NULL, inode->i_ino);
if (!HFSPLUS_SB(sb)->attr_tree) {
@@ -310,7 +310,7 @@ int hfsplus_delete_attr(struct inode *inode, const char *name)
struct super_block *sb = inode->i_sb;
struct hfs_find_data fd;
- hfs_dbg(ATTR_MOD, "delete_attr: %s,%ld\n",
+ hfs_dbg("name %s, ino %ld\n",
name ? name : NULL, inode->i_ino);
if (!HFSPLUS_SB(sb)->attr_tree) {
@@ -356,7 +356,7 @@ int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid)
int err = 0;
struct hfs_find_data fd;
- hfs_dbg(ATTR_MOD, "delete_all_attrs: %d\n", cnid);
+ hfs_dbg("cnid %d\n", cnid);
if (!HFSPLUS_SB(dir->i_sb)->attr_tree) {
pr_err("attributes file doesn't exist\n");
diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c
index 901e83d65d20..afc9c89e8c6a 100644
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -18,12 +18,12 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
fd->tree = tree;
fd->bnode = NULL;
- ptr = kmalloc(tree->max_key_len * 2 + 4, GFP_KERNEL);
+ ptr = kzalloc(tree->max_key_len * 2 + 4, GFP_KERNEL);
if (!ptr)
return -ENOMEM;
fd->search_key = ptr;
fd->key = ptr + tree->max_key_len + 2;
- hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n",
+ hfs_dbg("cnid %d, caller %ps\n",
tree->cnid, __builtin_return_address(0));
mutex_lock_nested(&tree->tree_lock,
hfsplus_btree_lock_class(tree));
@@ -34,7 +34,7 @@ void hfs_find_exit(struct hfs_find_data *fd)
{
hfs_bnode_put(fd->bnode);
kfree(fd->search_key);
- hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n",
+ hfs_dbg("cnid %d, caller %ps\n",
fd->tree->cnid, __builtin_return_address(0));
mutex_unlock(&fd->tree->tree_lock);
fd->tree = NULL;
@@ -158,6 +158,12 @@ int hfs_brec_find(struct hfs_find_data *fd, search_strategy_t do_key_compare)
__be32 data;
int height, res;
+ fd->record = -1;
+ fd->keyoffset = -1;
+ fd->keylength = -1;
+ fd->entryoffset = -1;
+ fd->entrylength = -1;
+
tree = fd->tree;
if (fd->bnode)
hfs_bnode_put(fd->bnode);
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index bd8dcea85588..1b3af8c87cad 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -31,7 +31,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size,
if (!len)
return size;
- hfs_dbg(BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len);
+ hfs_dbg("size %u, offset %u, len %u\n", size, offset, len);
mutex_lock(&sbi->alloc_mutex);
mapping = sbi->alloc_file->i_mapping;
page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL);
@@ -90,14 +90,14 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size,
else
end = pptr + ((size + 31) & (PAGE_CACHE_BITS - 1)) / 32;
}
- hfs_dbg(BITMAP, "bitmap full\n");
+ hfs_dbg("bitmap full\n");
start = size;
goto out;
found:
start = offset + (curr - pptr) * 32 + i;
if (start >= size) {
- hfs_dbg(BITMAP, "bitmap full\n");
+ hfs_dbg("bitmap full\n");
goto out;
}
/* do any partial u32 at the start */
@@ -155,7 +155,7 @@ done:
*max = offset + (curr - pptr) * 32 + i - start;
sbi->free_blocks -= *max;
hfsplus_mark_mdb_dirty(sb);
- hfs_dbg(BITMAP, "-> %u,%u\n", start, *max);
+ hfs_dbg("start %u, max %u\n", start, *max);
out:
mutex_unlock(&sbi->alloc_mutex);
return start;
@@ -174,7 +174,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
if (!count)
return 0;
- hfs_dbg(BITMAP, "block_free: %u,%u\n", offset, count);
+ hfs_dbg("offset %u, count %u\n", offset, count);
/* are all of the bits in range? */
if ((offset + count) > sbi->total_blocks)
return -ENOENT;
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 14f4995588ff..63e652ad1e0d 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -18,47 +18,6 @@
#include "hfsplus_fs.h"
#include "hfsplus_raw.h"
-static inline
-bool is_bnode_offset_valid(struct hfs_bnode *node, int off)
-{
- bool is_valid = off < node->tree->node_size;
-
- if (!is_valid) {
- pr_err("requested invalid offset: "
- "NODE: id %u, type %#x, height %u, "
- "node_size %u, offset %d\n",
- node->this, node->type, node->height,
- node->tree->node_size, off);
- }
-
- return is_valid;
-}
-
-static inline
-int check_and_correct_requested_length(struct hfs_bnode *node, int off, int len)
-{
- unsigned int node_size;
-
- if (!is_bnode_offset_valid(node, off))
- return 0;
-
- node_size = node->tree->node_size;
-
- if ((off + len) > node_size) {
- int new_len = (int)node_size - off;
-
- pr_err("requested length has been corrected: "
- "NODE: id %u, type %#x, height %u, "
- "node_size %u, offset %d, "
- "requested_len %d, corrected_len %d\n",
- node->this, node->type, node->height,
- node->tree->node_size, off, len, new_len);
-
- return new_len;
- }
-
- return len;
-}
/* Copy a specified range of bytes from the raw data of a node */
void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
@@ -214,7 +173,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
struct page **src_page, **dst_page;
int l;
- hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len);
+ hfs_dbg("dst %u, src %u, len %u\n", dst, src, len);
if (!len)
return;
@@ -272,7 +231,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
void *src_ptr, *dst_ptr;
int l;
- hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len);
+ hfs_dbg("dst %u, src %u, len %u\n", dst, src, len);
if (!len)
return;
@@ -392,16 +351,16 @@ void hfs_bnode_dump(struct hfs_bnode *node)
__be32 cnid;
int i, off, key_off;
- hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this);
+ hfs_dbg("node %d\n", node->this);
hfs_bnode_read(node, &desc, 0, sizeof(desc));
- hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n",
+ hfs_dbg("next %d, prev %d, type %d, height %d, num_recs %d\n",
be32_to_cpu(desc.next), be32_to_cpu(desc.prev),
desc.type, desc.height, be16_to_cpu(desc.num_recs));
off = node->tree->node_size - 2;
for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) {
key_off = hfs_bnode_read_u16(node, off);
- hfs_dbg(BNODE_MOD, " %d", key_off);
+ hfs_dbg(" key_off %d", key_off);
if (i && node->type == HFS_NODE_INDEX) {
int tmp;
@@ -410,17 +369,17 @@ void hfs_bnode_dump(struct hfs_bnode *node)
tmp = hfs_bnode_read_u16(node, key_off) + 2;
else
tmp = node->tree->max_key_len + 2;
- hfs_dbg_cont(BNODE_MOD, " (%d", tmp);
+ hfs_dbg(" (%d", tmp);
hfs_bnode_read(node, &cnid, key_off + tmp, 4);
- hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid));
+ hfs_dbg(", cnid %d)", be32_to_cpu(cnid));
} else if (i && node->type == HFS_NODE_LEAF) {
int tmp;
tmp = hfs_bnode_read_u16(node, key_off);
- hfs_dbg_cont(BNODE_MOD, " (%d)", tmp);
+ hfs_dbg(" (%d)", tmp);
}
}
- hfs_dbg_cont(BNODE_MOD, "\n");
+ hfs_dbg("\n");
}
void hfs_bnode_unlink(struct hfs_bnode *node)
@@ -456,7 +415,7 @@ void hfs_bnode_unlink(struct hfs_bnode *node)
/* move down? */
if (!node->prev && !node->next)
- hfs_dbg(BNODE_MOD, "hfs_btree_del_level\n");
+ hfs_dbg("btree delete level\n");
if (!node->parent) {
tree->root = 0;
tree->depth = 0;
@@ -511,7 +470,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
node->this = cnid;
set_bit(HFS_BNODE_NEW, &node->flags);
atomic_set(&node->refcnt, 1);
- hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n",
+ hfs_dbg("cnid %d, node %d, refcnt 1\n",
node->tree->cnid, node->this);
init_waitqueue_head(&node->lock_wq);
spin_lock(&tree->hash_lock);
@@ -551,7 +510,7 @@ void hfs_bnode_unhash(struct hfs_bnode *node)
{
struct hfs_bnode **p;
- hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n",
+ hfs_dbg("cnid %d, node %d, refcnt %d\n",
node->tree->cnid, node->this, atomic_read(&node->refcnt));
for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)];
*p && *p != node; p = &(*p)->next_hash)
@@ -697,7 +656,7 @@ void hfs_bnode_get(struct hfs_bnode *node)
{
if (node) {
atomic_inc(&node->refcnt);
- hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n",
+ hfs_dbg("cnid %d, node %d, refcnt %d\n",
node->tree->cnid, node->this,
atomic_read(&node->refcnt));
}
@@ -710,7 +669,7 @@ void hfs_bnode_put(struct hfs_bnode *node)
struct hfs_btree *tree = node->tree;
int i;
- hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n",
+ hfs_dbg("cnid %d, node %d, refcnt %d\n",
node->tree->cnid, node->this,
atomic_read(&node->refcnt));
BUG_ON(!atomic_read(&node->refcnt));
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index 1918544a7871..b4645102feec 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -92,7 +92,7 @@ again:
end_rec_off = tree->node_size - (node->num_recs + 1) * 2;
end_off = hfs_bnode_read_u16(node, end_rec_off);
end_rec_off -= 2;
- hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n",
+ hfs_dbg("rec %d, size %d, end_off %d, end_rec_off %d\n",
rec, size, end_off, end_rec_off);
if (size > end_rec_off - end_off) {
if (new_node)
@@ -193,7 +193,7 @@ again:
mark_inode_dirty(tree->inode);
}
hfs_bnode_dump(node);
- hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n",
+ hfs_dbg("rec %d, len %d\n",
fd->record, fd->keylength + fd->entrylength);
if (!--node->num_recs) {
hfs_bnode_unlink(node);
@@ -246,7 +246,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
if (IS_ERR(new_node))
return new_node;
hfs_bnode_get(node);
- hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n",
+ hfs_dbg("this %d - new %d - next %d\n",
node->this, new_node->this, node->next);
new_node->next = node->next;
new_node->prev = node->this;
@@ -383,7 +383,7 @@ again:
newkeylen = hfs_bnode_read_u16(node, 14) + 2;
else
fd->keylength = newkeylen = tree->max_key_len + 2;
- hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n",
+ hfs_dbg("rec %d, keylength %d, newkeylen %d\n",
rec, fd->keylength, newkeylen);
rec_off = tree->node_size - (rec + 2) * 2;
@@ -395,7 +395,7 @@ again:
end_off = hfs_bnode_read_u16(parent, end_rec_off);
if (end_rec_off - end_off < diff) {
- hfs_dbg(BNODE_MOD, "splitting index node\n");
+ hfs_dbg("splitting index node\n");
fd->bnode = parent;
new_node = hfs_bnode_split(fd);
if (IS_ERR(new_node))
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index 9e1732a2b92a..7cc5aea14572 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -393,6 +393,12 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
len = hfs_brec_lenoff(node, 2, &off16);
off = off16;
+ if (!is_bnode_offset_valid(node, off)) {
+ hfs_bnode_put(node);
+ return ERR_PTR(-EIO);
+ }
+ len = check_and_correct_requested_length(node, off, len);
+
off += node->page_offset;
pagep = node->page + (off >> PAGE_SHIFT);
data = kmap_local_page(*pagep);
@@ -428,7 +434,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
kunmap_local(data);
nidx = node->next;
if (!nidx) {
- hfs_dbg(BNODE_MOD, "create new bmap node\n");
+ hfs_dbg("create new bmap node\n");
next_node = hfs_bmap_new_bmap(node, idx);
} else
next_node = hfs_bnode_find(tree, nidx);
@@ -454,7 +460,7 @@ void hfs_bmap_free(struct hfs_bnode *node)
u32 nidx;
u8 *data, byte, m;
- hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this);
+ hfs_dbg("node %u\n", node->this);
BUG_ON(!node->this);
tree = node->tree;
nidx = node->this;
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 1995bafee839..02c1eee4a4b8 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -259,7 +259,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
int entry_size;
int err;
- hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n",
+ hfs_dbg("name %s, cnid %u, i_nlink %d\n",
str->name, cnid, inode->i_nlink);
err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
if (err)
@@ -336,7 +336,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str)
int err, off;
u16 type;
- hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
+ hfs_dbg("name %s, cnid %u\n", str ? str->name : NULL, cnid);
err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
if (err)
return err;
@@ -441,7 +441,7 @@ int hfsplus_rename_cat(u32 cnid,
int entry_size, type;
int err;
- hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n",
+ hfs_dbg("cnid %u - ino %lu, name %s - ino %lu, name %s\n",
cnid, src_dir->i_ino, src_name->name,
dst_dir->i_ino, dst_name->name);
err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd);
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 876bbb80fb4d..1b3e27a0d5e0 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -204,7 +204,7 @@ static int hfsplus_readdir(struct file *file, struct dir_context *ctx)
fd.entrylength);
type = be16_to_cpu(entry.type);
len = NLS_MAX_CHARSET_SIZE * HFSPLUS_MAX_STRLEN;
- err = hfsplus_uni2asc(sb, &fd.key->cat.name, strbuf, &len);
+ err = hfsplus_uni2asc_str(sb, &fd.key->cat.name, strbuf, &len);
if (err)
goto out;
if (type == HFSPLUS_FOLDER) {
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index b1699b3c246a..8e886514d27f 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -275,7 +275,7 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
mutex_unlock(&hip->extents_lock);
done:
- hfs_dbg(EXTENT, "get_block(%lu): %llu - %u\n",
+ hfs_dbg("ino %lu, iblock %llu - dblock %u\n",
inode->i_ino, (long long)iblock, dblock);
mask = (1 << sbi->fs_shift) - 1;
@@ -298,12 +298,12 @@ static void hfsplus_dump_extent(struct hfsplus_extent *extent)
{
int i;
- hfs_dbg(EXTENT, " ");
+ hfs_dbg("extent ");
for (i = 0; i < 8; i++)
- hfs_dbg_cont(EXTENT, " %u:%u",
- be32_to_cpu(extent[i].start_block),
- be32_to_cpu(extent[i].block_count));
- hfs_dbg_cont(EXTENT, "\n");
+ hfs_dbg(" start_block %u, block_count %u",
+ be32_to_cpu(extent[i].start_block),
+ be32_to_cpu(extent[i].block_count));
+ hfs_dbg("\n");
}
static int hfsplus_add_extent(struct hfsplus_extent *extent, u32 offset,
@@ -359,8 +359,7 @@ found:
if (count <= block_nr) {
err = hfsplus_block_free(sb, start, count);
if (err) {
- pr_err("can't free extent\n");
- hfs_dbg(EXTENT, " start: %u count: %u\n",
+ pr_err("can't free extent: start %u, count %u\n",
start, count);
}
extent->block_count = 0;
@@ -370,8 +369,7 @@ found:
count -= block_nr;
err = hfsplus_block_free(sb, start + count, block_nr);
if (err) {
- pr_err("can't free extent\n");
- hfs_dbg(EXTENT, " start: %u count: %u\n",
+ pr_err("can't free extent: start %u, count %u\n",
start, count);
}
extent->block_count = cpu_to_be32(count);
@@ -478,11 +476,12 @@ int hfsplus_file_extend(struct inode *inode, bool zeroout)
goto out;
}
- hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len);
+ hfs_dbg("ino %lu, start %u, len %u\n", inode->i_ino, start, len);
if (hip->alloc_blocks <= hip->first_blocks) {
if (!hip->first_blocks) {
- hfs_dbg(EXTENT, "first extents\n");
+ hfs_dbg("first_extent: start %u, len %u\n",
+ start, len);
/* no extents yet */
hip->first_extents[0].start_block = cpu_to_be32(start);
hip->first_extents[0].block_count = cpu_to_be32(len);
@@ -521,7 +520,7 @@ out:
return res;
insert_extent:
- hfs_dbg(EXTENT, "insert new extent\n");
+ hfs_dbg("insert new extent\n");
res = hfsplus_ext_write_extent_locked(inode);
if (res)
goto out;
@@ -546,7 +545,7 @@ void hfsplus_file_truncate(struct inode *inode)
u32 alloc_cnt, blk_cnt, start;
int res;
- hfs_dbg(INODE, "truncate: %lu, %llu -> %llu\n",
+ hfs_dbg("ino %lu, phys_size %llu -> i_size %llu\n",
inode->i_ino, (long long)hip->phys_size, inode->i_size);
if (inode->i_size > hip->phys_size) {
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 96a5c24813dd..89e8b19c127b 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -11,47 +11,14 @@
#ifndef _LINUX_HFSPLUS_FS_H
#define _LINUX_HFSPLUS_FS_H
-#ifdef pr_fmt
-#undef pr_fmt
-#endif
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
#include <linux/fs.h>
#include <linux/mutex.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/fs_context.h>
+#include <linux/hfs_common.h>
#include "hfsplus_raw.h"
-#define DBG_BNODE_REFS 0x00000001
-#define DBG_BNODE_MOD 0x00000002
-#define DBG_CAT_MOD 0x00000004
-#define DBG_INODE 0x00000008
-#define DBG_SUPER 0x00000010
-#define DBG_EXTENT 0x00000020
-#define DBG_BITMAP 0x00000040
-#define DBG_ATTR_MOD 0x00000080
-
-#if 0
-#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD)
-#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE)
-#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT)
-#endif
-#define DBG_MASK (0)
-
-#define hfs_dbg(flg, fmt, ...) \
-do { \
- if (DBG_##flg & DBG_MASK) \
- printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \
-} while (0)
-
-#define hfs_dbg_cont(flg, fmt, ...) \
-do { \
- if (DBG_##flg & DBG_MASK) \
- pr_cont(fmt, ##__VA_ARGS__); \
-} while (0)
-
/* Runtime config options */
#define HFSPLUS_DEF_CR_TYPE 0x3F3F3F3F /* '????' */
@@ -521,8 +488,12 @@ int hfsplus_strcasecmp(const struct hfsplus_unistr *s1,
const struct hfsplus_unistr *s2);
int hfsplus_strcmp(const struct hfsplus_unistr *s1,
const struct hfsplus_unistr *s2);
-int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr,
- char *astr, int *len_p);
+int hfsplus_uni2asc_str(struct super_block *sb,
+ const struct hfsplus_unistr *ustr, char *astr,
+ int *len_p);
+int hfsplus_uni2asc_xattr_str(struct super_block *sb,
+ const struct hfsplus_attr_unistr *ustr,
+ char *astr, int *len_p);
int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
int max_unistr_len, const char *astr, int len);
int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str);
@@ -577,6 +548,48 @@ hfsplus_btree_lock_class(struct hfs_btree *tree)
return class;
}
+static inline
+bool is_bnode_offset_valid(struct hfs_bnode *node, int off)
+{
+ bool is_valid = off < node->tree->node_size;
+
+ if (!is_valid) {
+ pr_err("requested invalid offset: "
+ "NODE: id %u, type %#x, height %u, "
+ "node_size %u, offset %d\n",
+ node->this, node->type, node->height,
+ node->tree->node_size, off);
+ }
+
+ return is_valid;
+}
+
+static inline
+int check_and_correct_requested_length(struct hfs_bnode *node, int off, int len)
+{
+ unsigned int node_size;
+
+ if (!is_bnode_offset_valid(node, off))
+ return 0;
+
+ node_size = node->tree->node_size;
+
+ if ((off + len) > node_size) {
+ int new_len = (int)node_size - off;
+
+ pr_err("requested length has been corrected: "
+ "NODE: id %u, type %#x, height %u, "
+ "node_size %u, offset %d, "
+ "requested_len %d, corrected_len %d\n",
+ node->this, node->type, node->height,
+ node->tree->node_size, off, len, new_len);
+
+ return new_len;
+ }
+
+ return len;
+}
+
/* compatibility */
#define hfsp_mt2ut(t) (struct timespec64){ .tv_sec = __hfsp_mt2ut(t) }
#define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec)
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 86351bdc8985..16bc4abc67e0 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -68,13 +68,26 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
if (!(inode->i_state & I_NEW))
return inode;
- INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list);
- spin_lock_init(&HFSPLUS_I(inode)->open_dir_lock);
- mutex_init(&HFSPLUS_I(inode)->extents_lock);
- HFSPLUS_I(inode)->flags = 0;
+ atomic_set(&HFSPLUS_I(inode)->opencnt, 0);
+ HFSPLUS_I(inode)->first_blocks = 0;
+ HFSPLUS_I(inode)->clump_blocks = 0;
+ HFSPLUS_I(inode)->alloc_blocks = 0;
+ HFSPLUS_I(inode)->cached_start = U32_MAX;
+ HFSPLUS_I(inode)->cached_blocks = 0;
+ memset(HFSPLUS_I(inode)->first_extents, 0, sizeof(hfsplus_extent_rec));
+ memset(HFSPLUS_I(inode)->cached_extents, 0, sizeof(hfsplus_extent_rec));
HFSPLUS_I(inode)->extent_state = 0;
+ mutex_init(&HFSPLUS_I(inode)->extents_lock);
HFSPLUS_I(inode)->rsrc_inode = NULL;
- atomic_set(&HFSPLUS_I(inode)->opencnt, 0);
+ HFSPLUS_I(inode)->create_date = 0;
+ HFSPLUS_I(inode)->linkid = 0;
+ HFSPLUS_I(inode)->flags = 0;
+ HFSPLUS_I(inode)->fs_blocks = 0;
+ HFSPLUS_I(inode)->userflags = 0;
+ HFSPLUS_I(inode)->subfolders = 0;
+ INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list);
+ spin_lock_init(&HFSPLUS_I(inode)->open_dir_lock);
+ HFSPLUS_I(inode)->phys_size = 0;
if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
inode->i_ino == HFSPLUS_ROOT_CNID) {
@@ -150,7 +163,7 @@ static int hfsplus_write_inode(struct inode *inode,
{
int err;
- hfs_dbg(INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
+ hfs_dbg("ino %lu\n", inode->i_ino);
err = hfsplus_ext_write_extent(inode);
if (err)
@@ -165,7 +178,7 @@ static int hfsplus_write_inode(struct inode *inode,
static void hfsplus_evict_inode(struct inode *inode)
{
- hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino);
+ hfs_dbg("ino %lu\n", inode->i_ino);
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
if (HFSPLUS_IS_RSRC(inode)) {
@@ -184,7 +197,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait)
if (!wait)
return 0;
- hfs_dbg(SUPER, "hfsplus_sync_fs\n");
+ hfs_dbg("starting...\n");
/*
* Explicitly write out the special metadata inodes.
@@ -215,6 +228,10 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait)
vhdr->folder_count = cpu_to_be32(sbi->folder_count);
vhdr->file_count = cpu_to_be32(sbi->file_count);
+ hfs_dbg("free_blocks %u, next_cnid %u, folder_count %u, file_count %u\n",
+ sbi->free_blocks, sbi->next_cnid,
+ sbi->folder_count, sbi->file_count);
+
if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) {
memcpy(sbi->s_backup_vhdr, sbi->s_vhdr, sizeof(*sbi->s_vhdr));
write_backup = 1;
@@ -240,6 +257,8 @@ out:
if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
blkdev_issue_flush(sb->s_bdev);
+ hfs_dbg("finished: err %d\n", error);
+
return error;
}
@@ -288,7 +307,7 @@ static void hfsplus_put_super(struct super_block *sb)
{
struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
- hfs_dbg(SUPER, "hfsplus_put_super\n");
+ hfs_dbg("starting...\n");
cancel_delayed_work_sync(&sbi->sync_work);
@@ -310,6 +329,8 @@ static void hfsplus_put_super(struct super_block *sb)
kfree(sbi->s_vhdr_buf);
kfree(sbi->s_backup_vhdr_buf);
call_rcu(&sbi->rcu, delayed_free);
+
+ hfs_dbg("finished\n");
}
static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -524,7 +545,7 @@ static int hfsplus_fill_super(struct super_block *sb, struct fs_context *fc)
if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
hfs_find_exit(&fd);
if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) {
- err = -EINVAL;
+ err = -EIO;
goto out_put_root;
}
inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id));
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index 36b6cf2a3abb..11e08a4a18b2 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -40,6 +40,18 @@ int hfsplus_strcasecmp(const struct hfsplus_unistr *s1,
p1 = s1->unicode;
p2 = s2->unicode;
+ if (len1 > HFSPLUS_MAX_STRLEN) {
+ len1 = HFSPLUS_MAX_STRLEN;
+ pr_err("invalid length %u has been corrected to %d\n",
+ be16_to_cpu(s1->length), len1);
+ }
+
+ if (len2 > HFSPLUS_MAX_STRLEN) {
+ len2 = HFSPLUS_MAX_STRLEN;
+ pr_err("invalid length %u has been corrected to %d\n",
+ be16_to_cpu(s2->length), len2);
+ }
+
while (1) {
c1 = c2 = 0;
@@ -74,6 +86,18 @@ int hfsplus_strcmp(const struct hfsplus_unistr *s1,
p1 = s1->unicode;
p2 = s2->unicode;
+ if (len1 > HFSPLUS_MAX_STRLEN) {
+ len1 = HFSPLUS_MAX_STRLEN;
+ pr_err("invalid length %u has been corrected to %d\n",
+ be16_to_cpu(s1->length), len1);
+ }
+
+ if (len2 > HFSPLUS_MAX_STRLEN) {
+ len2 = HFSPLUS_MAX_STRLEN;
+ pr_err("invalid length %u has been corrected to %d\n",
+ be16_to_cpu(s2->length), len2);
+ }
+
for (len = min(len1, len2); len > 0; len--) {
c1 = be16_to_cpu(*p1);
c2 = be16_to_cpu(*p2);
@@ -119,9 +143,8 @@ static u16 *hfsplus_compose_lookup(u16 *p, u16 cc)
return NULL;
}
-int hfsplus_uni2asc(struct super_block *sb,
- const struct hfsplus_unistr *ustr,
- char *astr, int *len_p)
+static int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr,
+ int max_len, char *astr, int *len_p)
{
const hfsplus_unichr *ip;
struct nls_table *nls = HFSPLUS_SB(sb)->nls;
@@ -134,8 +157,8 @@ int hfsplus_uni2asc(struct super_block *sb,
ip = ustr->unicode;
ustrlen = be16_to_cpu(ustr->length);
- if (ustrlen > HFSPLUS_MAX_STRLEN) {
- ustrlen = HFSPLUS_MAX_STRLEN;
+ if (ustrlen > max_len) {
+ ustrlen = max_len;
pr_err("invalid length %u has been corrected to %d\n",
be16_to_cpu(ustr->length), ustrlen);
}
@@ -256,6 +279,21 @@ out:
return res;
}
+inline int hfsplus_uni2asc_str(struct super_block *sb,
+ const struct hfsplus_unistr *ustr, char *astr,
+ int *len_p)
+{
+ return hfsplus_uni2asc(sb, ustr, HFSPLUS_MAX_STRLEN, astr, len_p);
+}
+
+inline int hfsplus_uni2asc_xattr_str(struct super_block *sb,
+ const struct hfsplus_attr_unistr *ustr,
+ char *astr, int *len_p)
+{
+ return hfsplus_uni2asc(sb, (const struct hfsplus_unistr *)ustr,
+ HFSPLUS_ATTR_MAX_STRLEN, astr, len_p);
+}
+
/*
* Convert one or more ASCII characters into a single unicode character.
* Returns the number of ASCII characters corresponding to the unicode char.
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index 18dc3d254d21..ece4d29c0ab9 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -64,7 +64,7 @@ static void hfsplus_init_header_node(struct inode *attr_file,
u32 used_bmp_bytes;
u64 tmp;
- hfs_dbg(ATTR_MOD, "init_hdr_attr_file: clump %u, node_size %u\n",
+ hfs_dbg("clump %u, node_size %u\n",
clump_size, node_size);
/* The end of the node contains list of record offsets */
@@ -132,7 +132,7 @@ static int hfsplus_create_attributes_file(struct super_block *sb)
struct page *page;
int old_state = HFSPLUS_EMPTY_ATTR_TREE;
- hfs_dbg(ATTR_MOD, "create_attr_file: ino %d\n", HFSPLUS_ATTR_CNID);
+ hfs_dbg("ino %d\n", HFSPLUS_ATTR_CNID);
check_attr_tree_state_again:
switch (atomic_read(&sbi->attr_tree_state)) {
@@ -735,9 +735,9 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size)
goto end_listxattr;
xattr_name_len = NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN;
- if (hfsplus_uni2asc(inode->i_sb,
- (const struct hfsplus_unistr *)&fd.key->attr.key_name,
- strbuf, &xattr_name_len)) {
+ if (hfsplus_uni2asc_xattr_str(inode->i_sb,
+ &fd.key->attr.key_name, strbuf,
+ &xattr_name_len)) {
pr_err("unicode conversion failed\n");
res = -EIO;
goto end_listxattr;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 01e516175bcd..1e1acf5775ab 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -261,7 +261,7 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root)
static const struct super_operations hostfs_sbops = {
.alloc_inode = hostfs_alloc_inode,
.free_inode = hostfs_free_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = hostfs_evict_inode,
.statfs = hostfs_statfs,
.show_options = hostfs_show_options,
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index a59e8fa630db..34008442ee26 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -184,7 +184,7 @@ void hpfs_write_inode(struct inode *i)
struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
struct inode *parent;
if (i->i_ino == hpfs_sb(i->i_sb)->sb_root) return;
- if (hpfs_inode->i_rddir_off && !atomic_read(&i->i_count)) {
+ if (hpfs_inode->i_rddir_off && !icount_read(i)) {
if (*hpfs_inode->i_rddir_off)
pr_err("write_inode: some position still there\n");
kfree(hpfs_inode->i_rddir_off);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 09d4baef29cf..be4be99304bc 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -517,14 +517,16 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,
/*
* If folio is mapped, it was faulted in after being
- * unmapped in caller. Unmap (again) while holding
- * the fault mutex. The mutex will prevent faults
- * until we finish removing the folio.
+ * unmapped in caller, or hugetlb_vmdelete_list() skipped
+ * unmapping it because it failed to grab the lock. Unmap (again)
+ * while holding the fault mutex. The mutex will prevent
+ * faults until we finish removing the folio. Hold folio
+ * lock to guarantee no concurrent migration.
*/
+ folio_lock(folio);
if (unlikely(folio_mapped(folio)))
hugetlb_unmap_file_folio(h, mapping, folio, index);
- folio_lock(folio);
/*
* We must remove the folio from page cache before removing
* the region/ reserve map (hugetlb_unreserve_pages). In
diff --git a/fs/init.c b/fs/init.c
index eef5124885e3..07f592ccdba8 100644
--- a/fs/init.c
+++ b/fs/init.c
@@ -149,7 +149,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev)
else if (!(S_ISBLK(mode) || S_ISCHR(mode)))
return -EINVAL;
- dentry = kern_path_create(AT_FDCWD, filename, &path, 0);
+ dentry = start_creating_path(AT_FDCWD, filename, &path, 0);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
@@ -158,7 +158,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev)
if (!error)
error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode,
dentry, mode, new_decode_dev(dev));
- done_path_create(&path, dentry);
+ end_creating_path(&path, dentry);
return error;
}
@@ -173,7 +173,7 @@ int __init init_link(const char *oldname, const char *newname)
if (error)
return error;
- new_dentry = kern_path_create(AT_FDCWD, newname, &new_path, 0);
+ new_dentry = start_creating_path(AT_FDCWD, newname, &new_path, 0);
error = PTR_ERR(new_dentry);
if (IS_ERR(new_dentry))
goto out;
@@ -191,7 +191,7 @@ int __init init_link(const char *oldname, const char *newname)
error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode,
new_dentry, NULL);
out_dput:
- done_path_create(&new_path, new_dentry);
+ end_creating_path(&new_path, new_dentry);
out:
path_put(&old_path);
return error;
@@ -203,14 +203,14 @@ int __init init_symlink(const char *oldname, const char *newname)
struct path path;
int error;
- dentry = kern_path_create(AT_FDCWD, newname, &path, 0);
+ dentry = start_creating_path(AT_FDCWD, newname, &path, 0);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
error = security_path_symlink(&path, dentry, oldname);
if (!error)
error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode,
dentry, oldname);
- done_path_create(&path, dentry);
+ end_creating_path(&path, dentry);
return error;
}
@@ -225,7 +225,8 @@ int __init init_mkdir(const char *pathname, umode_t mode)
struct path path;
int error;
- dentry = kern_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY);
+ dentry = start_creating_path(AT_FDCWD, pathname, &path,
+ LOOKUP_DIRECTORY);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
mode = mode_strip_umask(d_inode(path.dentry), mode);
@@ -236,7 +237,7 @@ int __init init_mkdir(const char *pathname, umode_t mode)
if (IS_ERR(dentry))
error = PTR_ERR(dentry);
}
- done_path_create(&path, dentry);
+ end_creating_path(&path, dentry);
return error;
}
diff --git a/fs/inode.c b/fs/inode.c
index 01ebdc40021e..ec9339024ac3 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -534,7 +534,7 @@ static void __inode_add_lru(struct inode *inode, bool rotate)
{
if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE))
return;
- if (atomic_read(&inode->i_count))
+ if (icount_read(inode))
return;
if (!(inode->i_sb->s_flags & SB_ACTIVE))
return;
@@ -550,11 +550,11 @@ static void __inode_add_lru(struct inode *inode, bool rotate)
struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe,
struct inode *inode, u32 bit)
{
- void *bit_address;
+ void *bit_address;
- bit_address = inode_state_wait_address(inode, bit);
- init_wait_var_entry(wqe, bit_address, 0);
- return __var_waitqueue(bit_address);
+ bit_address = inode_state_wait_address(inode, bit);
+ init_wait_var_entry(wqe, bit_address, 0);
+ return __var_waitqueue(bit_address);
}
EXPORT_SYMBOL(inode_bit_waitqueue);
@@ -871,11 +871,11 @@ void evict_inodes(struct super_block *sb)
again:
spin_lock(&sb->s_inode_list_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
- if (atomic_read(&inode->i_count))
+ if (icount_read(inode))
continue;
spin_lock(&inode->i_lock);
- if (atomic_read(&inode->i_count)) {
+ if (icount_read(inode)) {
spin_unlock(&inode->i_lock);
continue;
}
@@ -937,7 +937,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
* unreclaimable for a while. Remove them lazily here; iput,
* sync, or the last page cache deletion will requeue them.
*/
- if (atomic_read(&inode->i_count) ||
+ if (icount_read(inode) ||
(inode->i_state & ~I_REFERENCED) ||
!mapping_shrinkable(&inode->i_data)) {
list_lru_isolate(lru, &inode->i_lru);
@@ -1279,6 +1279,8 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
struct inode *old;
+ might_sleep();
+
again:
spin_lock(&inode_hash_lock);
old = find_inode(inode->i_sb, head, test, data, true);
@@ -1382,6 +1384,8 @@ struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval,
struct hlist_head *head = inode_hashtable + hash(sb, hashval);
struct inode *inode, *new;
+ might_sleep();
+
again:
inode = find_inode(sb, head, test, data, false);
if (inode) {
@@ -1422,6 +1426,9 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
{
struct hlist_head *head = inode_hashtable + hash(sb, ino);
struct inode *inode;
+
+ might_sleep();
+
again:
inode = find_inode_fast(sb, head, ino, false);
if (inode) {
@@ -1605,6 +1612,9 @@ struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
int (*test)(struct inode *, void *), void *data)
{
struct inode *inode;
+
+ might_sleep();
+
again:
inode = ilookup5_nowait(sb, hashval, test, data);
if (inode) {
@@ -1630,6 +1640,9 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
{
struct hlist_head *head = inode_hashtable + hash(sb, ino);
struct inode *inode;
+
+ might_sleep();
+
again:
inode = find_inode_fast(sb, head, ino, false);
@@ -1780,6 +1793,8 @@ int insert_inode_locked(struct inode *inode)
ino_t ino = inode->i_ino;
struct hlist_head *head = inode_hashtable + hash(sb, ino);
+ might_sleep();
+
while (1) {
struct inode *old = NULL;
spin_lock(&inode_hash_lock);
@@ -1826,6 +1841,8 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
{
struct inode *old;
+ might_sleep();
+
inode->i_state |= I_CREATING;
old = inode_insert5(inode, hashval, test, NULL, data);
@@ -1838,11 +1855,11 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
EXPORT_SYMBOL(insert_inode_locked4);
-int generic_delete_inode(struct inode *inode)
+int inode_just_drop(struct inode *inode)
{
return 1;
}
-EXPORT_SYMBOL(generic_delete_inode);
+EXPORT_SYMBOL(inode_just_drop);
/*
* Called when we're dropping the last reference
@@ -1866,7 +1883,7 @@ static void iput_final(struct inode *inode)
if (op->drop_inode)
drop = op->drop_inode(inode);
else
- drop = generic_drop_inode(inode);
+ drop = inode_generic_drop(inode);
if (!drop &&
!(inode->i_state & I_DONTCACHE) &&
@@ -1908,20 +1925,45 @@ static void iput_final(struct inode *inode)
*/
void iput(struct inode *inode)
{
- if (!inode)
+ might_sleep();
+ if (unlikely(!inode))
return;
- BUG_ON(inode->i_state & I_CLEAR);
+
retry:
- if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
- if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
- atomic_inc(&inode->i_count);
- spin_unlock(&inode->i_lock);
- trace_writeback_lazytime_iput(inode);
- mark_inode_dirty_sync(inode);
- goto retry;
- }
- iput_final(inode);
+ lockdep_assert_not_held(&inode->i_lock);
+ VFS_BUG_ON_INODE(inode->i_state & I_CLEAR, inode);
+ /*
+ * Note this assert is technically racy as if the count is bogusly
+ * equal to one, then two CPUs racing to further drop it can both
+ * conclude it's fine.
+ */
+ VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 1, inode);
+
+ if (atomic_add_unless(&inode->i_count, -1, 1))
+ return;
+
+ if ((inode->i_state & I_DIRTY_TIME) && inode->i_nlink) {
+ trace_writeback_lazytime_iput(inode);
+ mark_inode_dirty_sync(inode);
+ goto retry;
+ }
+
+ spin_lock(&inode->i_lock);
+ if (unlikely((inode->i_state & I_DIRTY_TIME) && inode->i_nlink)) {
+ spin_unlock(&inode->i_lock);
+ goto retry;
}
+
+ if (!atomic_dec_and_test(&inode->i_count)) {
+ spin_unlock(&inode->i_lock);
+ return;
+ }
+
+ /*
+ * iput_final() drops ->i_lock, we can't assert on it as the inode may
+ * be deallocated by the time the call returns.
+ */
+ iput_final(inode);
}
EXPORT_SYMBOL(iput);
@@ -2189,7 +2231,7 @@ static int __remove_privs(struct mnt_idmap *idmap,
return notify_change(idmap, dentry, &newattrs, NULL);
}
-int file_remove_privs_flags(struct file *file, unsigned int flags)
+static int file_remove_privs_flags(struct file *file, unsigned int flags)
{
struct dentry *dentry = file_dentry(file);
struct inode *inode = file_inode(file);
@@ -2214,7 +2256,6 @@ int file_remove_privs_flags(struct file *file, unsigned int flags)
inode_has_no_xattr(inode);
return error;
}
-EXPORT_SYMBOL_GPL(file_remove_privs_flags);
/**
* file_remove_privs - remove special file privileges (suid, capabilities)
@@ -2519,21 +2560,28 @@ void __init inode_init(void)
void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
{
inode->i_mode = mode;
- if (S_ISCHR(mode)) {
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFCHR:
inode->i_fop = &def_chr_fops;
inode->i_rdev = rdev;
- } else if (S_ISBLK(mode)) {
+ break;
+ case S_IFBLK:
if (IS_ENABLED(CONFIG_BLOCK))
inode->i_fop = &def_blk_fops;
inode->i_rdev = rdev;
- } else if (S_ISFIFO(mode))
+ break;
+ case S_IFIFO:
inode->i_fop = &pipefifo_fops;
- else if (S_ISSOCK(mode))
- ; /* leave it no_open_fops */
- else
+ break;
+ case S_IFSOCK:
+ /* leave it no_open_fops */
+ break;
+ default:
printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
" inode %s:%lu\n", mode, inode->i_sb->s_id,
inode->i_ino);
+ break;
+ }
}
EXPORT_SYMBOL(init_special_inode);
@@ -2911,10 +2959,18 @@ EXPORT_SYMBOL(mode_strip_sgid);
*
* TODO: add a proper inode dumping routine, this is a stub to get debug off the
* ground.
+ *
+ * TODO: handle getting to fs type with get_kernel_nofault()?
+ * See dump_mapping() above.
*/
void dump_inode(struct inode *inode, const char *reason)
{
- pr_warn("%s encountered for inode %px", reason, inode);
+ struct super_block *sb = inode->i_sb;
+
+ pr_warn("%s encountered for inode %px\n"
+ "fs %s mode %ho opflags 0x%hx flags 0x%x state 0x%x count %d\n",
+ reason, inode, sb->s_type->name, inode->i_mode, inode->i_opflags,
+ inode->i_flags, inode->i_state, atomic_read(&inode->i_count));
}
EXPORT_SYMBOL(dump_inode);
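generic_delete_inode() is renamed to inode_just_drop() and generic_drop_inode() to inode_generic_drop(); filesystems wire them up exactly as before. A sketch mirroring the kernfs_sops hunk further down (struct name illustrative):

static const struct super_operations example_sops = {
	.statfs     = simple_statfs,
	.drop_inode = inode_just_drop,	/* previously generic_delete_inode */
};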
diff --git a/fs/internal.h b/fs/internal.h
index 38e8aab27bbd..a33d18ee5b74 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -355,3 +355,4 @@ int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *attr);
void pidfs_get_root(struct path *path);
+void nsfs_get_root(struct path *path);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 0248cb8db2d3..1c152c2b1b67 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -41,7 +41,7 @@
*
* Returns 0 on success, -errno on error.
*/
-int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+static int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
int error = -ENOTTY;
@@ -54,7 +54,6 @@ int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
out:
return error;
}
-EXPORT_SYMBOL(vfs_ioctl);
static int ioctl_fibmap(struct file *filp, int __user *p)
{
@@ -426,7 +425,7 @@ static int ioctl_file_dedupe_range(struct file *file,
goto out;
}
- size = offsetof(struct file_dedupe_range, info[count]);
+ size = struct_size(same, info, count);
if (size > PAGE_SIZE) {
ret = -ENOMEM;
goto out;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index fd827398afd2..8b847a1e27f1 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -304,6 +304,9 @@ static int iomap_read_inline_data(const struct iomap_iter *iter,
size_t size = i_size_read(iter->inode) - iomap->offset;
size_t offset = offset_in_folio(folio, iomap->offset);
+ if (WARN_ON_ONCE(!iomap->inline_data))
+ return -EIO;
+
if (folio_test_uptodate(folio))
return 0;
@@ -894,7 +897,7 @@ static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
return true;
}
-static void iomap_write_end_inline(const struct iomap_iter *iter,
+static bool iomap_write_end_inline(const struct iomap_iter *iter,
struct folio *folio, loff_t pos, size_t copied)
{
const struct iomap *iomap = &iter->iomap;
@@ -903,12 +906,16 @@ static void iomap_write_end_inline(const struct iomap_iter *iter,
WARN_ON_ONCE(!folio_test_uptodate(folio));
BUG_ON(!iomap_inline_data_valid(iomap));
+ if (WARN_ON_ONCE(!iomap->inline_data))
+ return false;
+
flush_dcache_folio(folio);
addr = kmap_local_folio(folio, pos);
memcpy(iomap_inline_data(iomap, pos), addr, copied);
kunmap_local(addr);
mark_inode_dirty(iter->inode);
+ return true;
}
/*
@@ -921,10 +928,8 @@ static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied,
const struct iomap *srcmap = iomap_iter_srcmap(iter);
loff_t pos = iter->pos;
- if (srcmap->type == IOMAP_INLINE) {
- iomap_write_end_inline(iter, folio, pos, copied);
- return true;
- }
+ if (srcmap->type == IOMAP_INLINE)
+ return iomap_write_end_inline(iter, folio, pos, copied);
if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
size_t bh_written;
@@ -1396,6 +1401,9 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
/* warn about zeroing folios beyond eof that won't write back */
WARN_ON_ONCE(folio_pos(folio) > iter->inode->i_size);
+ trace_iomap_zero_iter(iter->inode, folio_pos(folio) + offset,
+ bytes);
+
folio_zero_range(folio, offset, bytes);
folio_mark_accessed(folio);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 6f25d4cfea9f..46aa85af13dc 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -363,14 +363,14 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
if (iomap->flags & IOMAP_F_SHARED)
dio->flags |= IOMAP_DIO_COW;
- if (iomap->flags & IOMAP_F_NEW) {
+ if (iomap->flags & IOMAP_F_NEW)
need_zeroout = true;
- } else if (iomap->type == IOMAP_MAPPED) {
- if (iomap_dio_can_use_fua(iomap, dio))
- bio_opf |= REQ_FUA;
- else
- dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
- }
+ else if (iomap->type == IOMAP_MAPPED &&
+ iomap_dio_can_use_fua(iomap, dio))
+ bio_opf |= REQ_FUA;
+
+ if (!(bio_opf & REQ_FUA))
+ dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
/*
* We can only do deferred completion for pure overwrites that
@@ -519,6 +519,9 @@ static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio)
loff_t pos = iomi->pos;
u64 copied;
+ if (WARN_ON_ONCE(!inline_data))
+ return -EIO;
+
if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap)))
return -EIO;
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index 6ad66e6ba653..a61c1dae4742 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -84,6 +84,7 @@ DEFINE_RANGE_EVENT(iomap_release_folio);
DEFINE_RANGE_EVENT(iomap_invalidate_folio);
DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail);
DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
+DEFINE_RANGE_EVENT(iomap_zero_iter);
#define IOMAP_TYPE_STRINGS \
{ IOMAP_HOLE, "HOLE" }, \
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index b3971e91e8eb..38861ca04899 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -285,6 +285,7 @@ restart:
retry:
if (batch_count)
__flush_batch(journal, &batch_count);
+ cond_resched();
spin_lock(&journal->j_list_lock);
goto restart;
}
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index a6c692cac616..9adf36e6364b 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -70,6 +70,24 @@ static struct kernfs_open_node *of_on(struct kernfs_open_file *of)
!list_empty(&of->list));
}
+/* Get active reference to kernfs node for an open file */
+static struct kernfs_open_file *kernfs_get_active_of(struct kernfs_open_file *of)
+{
+ /* Skip if file was already released */
+ if (unlikely(of->released))
+ return NULL;
+
+ if (!kernfs_get_active(of->kn))
+ return NULL;
+
+ return of;
+}
+
+static void kernfs_put_active_of(struct kernfs_open_file *of)
+{
+ return kernfs_put_active(of->kn);
+}
+
/**
* kernfs_deref_open_node_locked - Get kernfs_open_node corresponding to @kn
*
@@ -139,7 +157,7 @@ static void kernfs_seq_stop_active(struct seq_file *sf, void *v)
if (ops->seq_stop)
ops->seq_stop(sf, v);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
}
static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
@@ -152,7 +170,7 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
* the ops aren't called concurrently for the same open file.
*/
mutex_lock(&of->mutex);
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
return ERR_PTR(-ENODEV);
ops = kernfs_ops(of->kn);
@@ -238,7 +256,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
* the ops aren't called concurrently for the same open file.
*/
mutex_lock(&of->mutex);
- if (!kernfs_get_active(of->kn)) {
+ if (!kernfs_get_active_of(of)) {
len = -ENODEV;
mutex_unlock(&of->mutex);
goto out_free;
@@ -252,7 +270,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
else
len = -EINVAL;
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
mutex_unlock(&of->mutex);
if (len < 0)
@@ -323,7 +341,7 @@ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter)
* the ops aren't called concurrently for the same open file.
*/
mutex_lock(&of->mutex);
- if (!kernfs_get_active(of->kn)) {
+ if (!kernfs_get_active_of(of)) {
mutex_unlock(&of->mutex);
len = -ENODEV;
goto out_free;
@@ -335,7 +353,7 @@ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter)
else
len = -EINVAL;
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
mutex_unlock(&of->mutex);
if (len > 0)
@@ -357,13 +375,13 @@ static void kernfs_vma_open(struct vm_area_struct *vma)
if (!of->vm_ops)
return;
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
return;
if (of->vm_ops->open)
of->vm_ops->open(vma);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
}
static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf)
@@ -375,14 +393,14 @@ static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf)
if (!of->vm_ops)
return VM_FAULT_SIGBUS;
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
return VM_FAULT_SIGBUS;
ret = VM_FAULT_SIGBUS;
if (of->vm_ops->fault)
ret = of->vm_ops->fault(vmf);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
return ret;
}
@@ -395,7 +413,7 @@ static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf)
if (!of->vm_ops)
return VM_FAULT_SIGBUS;
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
return VM_FAULT_SIGBUS;
ret = 0;
@@ -404,7 +422,7 @@ static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf)
else
file_update_time(file);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
return ret;
}
@@ -418,14 +436,14 @@ static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr,
if (!of->vm_ops)
return -EINVAL;
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
return -EINVAL;
ret = -EINVAL;
if (of->vm_ops->access)
ret = of->vm_ops->access(vma, addr, buf, len, write);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
return ret;
}
@@ -455,7 +473,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
mutex_lock(&of->mutex);
rc = -ENODEV;
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
goto out_unlock;
ops = kernfs_ops(of->kn);
@@ -490,7 +508,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
}
vma->vm_ops = &kernfs_vm_ops;
out_put:
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
out_unlock:
mutex_unlock(&of->mutex);
@@ -852,7 +870,7 @@ static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)
struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry);
__poll_t ret;
- if (!kernfs_get_active(kn))
+ if (!kernfs_get_active_of(of))
return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
if (kn->attr.ops->poll)
@@ -860,7 +878,7 @@ static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)
else
ret = kernfs_generic_poll(of, wait);
- kernfs_put_active(kn);
+ kernfs_put_active_of(of);
return ret;
}
@@ -875,7 +893,7 @@ static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence)
* the ops aren't called concurrently for the same open file.
*/
mutex_lock(&of->mutex);
- if (!kernfs_get_active(of->kn)) {
+ if (!kernfs_get_active_of(of)) {
mutex_unlock(&of->mutex);
return -ENODEV;
}
@@ -886,7 +904,7 @@ static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence)
else
ret = generic_file_llseek(file, offset, whence);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
mutex_unlock(&of->mutex);
return ret;
}
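All kernfs file entry points now take the active reference through kernfs_get_active_of(), which additionally bails out once the open file has been released. A sketch of the common callback shape after this change (the helper being called is a placeholder):

static ssize_t example_kernfs_op(struct kernfs_open_file *of)
{
	ssize_t len;

	mutex_lock(&of->mutex);
	if (!kernfs_get_active_of(of)) {
		mutex_unlock(&of->mutex);
		return -ENODEV;	/* node deactivated or file already released */
	}
	len = 0;	/* ... call into the kernfs_ops handler here ... */
	kernfs_put_active_of(of);
	mutex_unlock(&of->mutex);
	return len;
}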
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 3c293a5a21b1..457f91c412d4 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -142,9 +142,9 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
struct kernfs_node *kn = kernfs_dentry_node(dentry);
struct kernfs_iattrs *attrs;
- attrs = kernfs_iattrs_noalloc(kn);
+ attrs = kernfs_iattrs(kn);
if (!attrs)
- return -ENODATA;
+ return -ENOMEM;
return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size);
}
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index e384a69fbece..76eaf64b9d9e 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -57,7 +57,7 @@ static int kernfs_statfs(struct dentry *dentry, struct kstatfs *buf)
const struct super_operations kernfs_sops = {
.statfs = kernfs_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = kernfs_evict_inode,
.show_options = kernfs_sop_show_options,
diff --git a/fs/locks.c b/fs/locks.c
index 559f02aa4172..04a3f0e20724 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2328,8 +2328,8 @@ out:
* To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX
* locks, the ->lock() interface may return asynchronously, before the lock has
* been granted or denied by the underlying filesystem, if (and only if)
- * lm_grant is set. Additionally EXPORT_OP_ASYNC_LOCK in export_operations
- * flags need to be set.
+ * lm_grant is set. Additionally FOP_ASYNC_LOCK in file_operations fop_flags
+ * need to be set.
*
* Callers expecting ->lock() to return asynchronously will only use F_SETLK,
* not F_SETLKW; they will set FL_SLEEP if (and only if) the request is for a
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index df9d11479caf..32db676127a9 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -492,8 +492,14 @@ void minix_set_inode(struct inode *inode, dev_t rdev)
inode->i_op = &minix_symlink_inode_operations;
inode_nohighmem(inode);
inode->i_mapping->a_ops = &minix_aops;
- } else
+ } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
init_special_inode(inode, inode->i_mode, rdev);
+ } else {
+ printk(KERN_DEBUG "MINIX-fs: Invalid file type 0%04o for inode %lu.\n",
+ inode->i_mode, inode->i_ino);
+ make_bad_inode(inode);
+ }
}
/*
diff --git a/fs/mount.h b/fs/mount.h
index 97737051a8b9..79c85639a7ba 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -17,11 +17,7 @@ struct mnt_namespace {
};
struct user_namespace *user_ns;
struct ucounts *ucounts;
- u64 seq; /* Sequence number to prevent loops */
- union {
- wait_queue_head_t poll;
- struct rcu_head mnt_ns_rcu;
- };
+ wait_queue_head_t poll;
u64 seq_origin; /* Sequence number of origin mount namespace */
u64 event;
#ifdef CONFIG_FSNOTIFY
@@ -30,8 +26,6 @@ struct mnt_namespace {
#endif
unsigned int nr_mounts; /* # of mounts in the namespace */
unsigned int pending_mounts;
- struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
- struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */
refcount_t passive; /* number references not pinning @mounts */
} __randomize_layout;
@@ -149,7 +143,7 @@ static inline void detach_mounts(struct dentry *dentry)
static inline void get_mnt_ns(struct mnt_namespace *ns)
{
- refcount_inc(&ns->ns.count);
+ ns_ref_inc(ns);
}
extern seqlock_t mount_lock;
@@ -173,7 +167,7 @@ static inline bool is_local_mountpoint(const struct dentry *dentry)
static inline bool is_anon_ns(struct mnt_namespace *ns)
{
- return ns->seq == 0;
+ return ns->ns.ns_id == 0;
}
static inline bool anon_ns_root(const struct mount *m)
diff --git a/fs/namei.c b/fs/namei.c
index cd43ff89fbaa..507ca0d7878d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1449,6 +1449,10 @@ static int follow_automount(struct path *path, int *count, unsigned lookup_flags
dentry->d_inode)
return -EISDIR;
+ /* No need to trigger automounts if mountpoint crossing is disabled. */
+ if (lookup_flags & LOOKUP_NO_XDEV)
+ return -EXDEV;
+
if (count && (*count)++ >= MAXSYMLINKS)
return -ELOOP;
@@ -1472,6 +1476,10 @@ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
/* Allow the filesystem to manage the transit without i_rwsem
* being held. */
if (flags & DCACHE_MANAGE_TRANSIT) {
+ if (lookup_flags & LOOKUP_NO_XDEV) {
+ ret = -EXDEV;
+ break;
+ }
ret = path->dentry->d_op->d_manage(path, false);
flags = smp_load_acquire(&path->dentry->d_flags);
if (ret < 0)
@@ -1489,6 +1497,10 @@ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
// here we know it's positive
flags = path->dentry->d_flags;
need_mntput = true;
+ if (unlikely(lookup_flags & LOOKUP_NO_XDEV)) {
+ ret = -EXDEV;
+ break;
+ }
continue;
}
}
@@ -1630,12 +1642,8 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
return -ECHILD;
}
ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
- if (jumped) {
- if (unlikely(nd->flags & LOOKUP_NO_XDEV))
- ret = -EXDEV;
- else
- nd->state |= ND_JUMPED;
- }
+ if (jumped)
+ nd->state |= ND_JUMPED;
if (unlikely(ret)) {
dput(path->dentry);
if (path->mnt != nd->path.mnt)
@@ -1827,6 +1835,20 @@ static struct dentry *lookup_slow(const struct qstr *name,
return res;
}
+static struct dentry *lookup_slow_killable(const struct qstr *name,
+ struct dentry *dir,
+ unsigned int flags)
+{
+ struct inode *inode = dir->d_inode;
+ struct dentry *res;
+
+ if (inode_lock_shared_killable(inode))
+ return ERR_PTR(-EINTR);
+ res = __lookup_slow(name, dir, flags);
+ inode_unlock_shared(inode);
+ return res;
+}
+
static inline int may_lookup(struct mnt_idmap *idmap,
struct nameidata *restrict nd)
{
@@ -2744,7 +2766,8 @@ static int filename_parentat(int dfd, struct filename *name,
}
/* does lookup, returns the object with parent locked */
-static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path)
+static struct dentry *__start_removing_path(int dfd, struct filename *name,
+ struct path *path)
{
struct path parent_path __free(path_put) = {};
struct dentry *d;
@@ -2756,18 +2779,42 @@ static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct
return ERR_PTR(error);
if (unlikely(type != LAST_NORM))
return ERR_PTR(-EINVAL);
+ /* don't fail immediately if it's r/o, at least try to report other errors */
+ error = mnt_want_write(parent_path.mnt);
inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT);
d = lookup_one_qstr_excl(&last, parent_path.dentry, 0);
- if (IS_ERR(d)) {
- inode_unlock(parent_path.dentry->d_inode);
- return d;
- }
+ if (IS_ERR(d))
+ goto unlock;
+ if (error)
+ goto fail;
path->dentry = no_free_ptr(parent_path.dentry);
path->mnt = no_free_ptr(parent_path.mnt);
return d;
+
+fail:
+ dput(d);
+ d = ERR_PTR(error);
+unlock:
+ inode_unlock(parent_path.dentry->d_inode);
+ if (!error)
+ mnt_drop_write(parent_path.mnt);
+ return d;
}
-struct dentry *kern_path_locked_negative(const char *name, struct path *path)
+/**
+ * kern_path_parent: lookup path returning parent and target
+ * @name: path name
+ * @path: path to store parent in
+ *
+ * The path @name should end with a normal component, not "." or ".." or "/".
+ * A lookup is performed and if successful the parent information
+ * is stored in @path and the dentry is returned.
+ *
+ * The dentry may be negative; the parent will be positive.
+ *
+ * Returns: dentry or error.
+ */
+struct dentry *kern_path_parent(const char *name, struct path *path)
{
struct path parent_path __free(path_put) = {};
struct filename *filename __free(putname) = getname_kernel(name);
@@ -2780,35 +2827,35 @@ struct dentry *kern_path_locked_negative(const char *name, struct path *path)
return ERR_PTR(error);
if (unlikely(type != LAST_NORM))
return ERR_PTR(-EINVAL);
- inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT);
- d = lookup_one_qstr_excl(&last, parent_path.dentry, LOOKUP_CREATE);
- if (IS_ERR(d)) {
- inode_unlock(parent_path.dentry->d_inode);
+
+ d = lookup_noperm_unlocked(&last, parent_path.dentry);
+ if (IS_ERR(d))
return d;
- }
path->dentry = no_free_ptr(parent_path.dentry);
path->mnt = no_free_ptr(parent_path.mnt);
return d;
}
-struct dentry *kern_path_locked(const char *name, struct path *path)
+struct dentry *start_removing_path(const char *name, struct path *path)
{
struct filename *filename = getname_kernel(name);
- struct dentry *res = __kern_path_locked(AT_FDCWD, filename, path);
+ struct dentry *res = __start_removing_path(AT_FDCWD, filename, path);
putname(filename);
return res;
}
-struct dentry *user_path_locked_at(int dfd, const char __user *name, struct path *path)
+struct dentry *start_removing_user_path_at(int dfd,
+ const char __user *name,
+ struct path *path)
{
struct filename *filename = getname(name);
- struct dentry *res = __kern_path_locked(dfd, filename, path);
+ struct dentry *res = __start_removing_path(dfd, filename, path);
putname(filename);
return res;
}
-EXPORT_SYMBOL(user_path_locked_at);
+EXPORT_SYMBOL(start_removing_user_path_at);
int kern_path(const char *name, unsigned int flags, struct path *path)
{
@@ -3011,6 +3058,47 @@ struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, struct qstr *name,
EXPORT_SYMBOL(lookup_one_unlocked);
/**
+ * lookup_one_positive_killable - lookup single pathname component
+ * @idmap: idmap of the mount the lookup is performed from
+ * @name: qstr olding pathname component to lookup
+ * @base: base directory to lookup from
+ *
+ * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns
+ * known positive or ERR_PTR(). This is what most of the users want.
+ *
+ * Note that pinned negative with unlocked parent _can_ become positive at any
+ * time, so callers of lookup_one_unlocked() need to be very careful; pinned
+ * positives have ->d_inode stable, so this one avoids such problems.
+ *
+ * This can be used for in-kernel filesystem clients such as file servers.
+ *
+ * It should be called without the parent i_rwsem held, and will take
+ * the i_rwsem itself if necessary. If a fatal signal is pending or
+ * delivered, it will return %-EINTR if the lock is needed.
+ */
+struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap,
+ struct qstr *name,
+ struct dentry *base)
+{
+ int err;
+ struct dentry *ret;
+
+ err = lookup_one_common(idmap, name, base);
+ if (err)
+ return ERR_PTR(err);
+
+ ret = lookup_dcache(name, base, 0);
+ if (!ret)
+ ret = lookup_slow_killable(name, base, 0);
+ if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
+ dput(ret);
+ ret = ERR_PTR(-ENOENT);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(lookup_one_positive_killable);
+
+/**
* lookup_one_positive_unlocked - lookup single pathname component
* @idmap: idmap of the mount the lookup is performed from
* @name: qstr holding pathname component to lookup
@@ -4114,7 +4202,6 @@ static struct dentry *filename_create(int dfd, struct filename *name,
unsigned int reval_flag = lookup_flags & LOOKUP_REVAL;
unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL;
int type;
- int err2;
int error;
error = filename_parentat(dfd, name, reval_flag, path, &last, &type);
@@ -4129,7 +4216,7 @@ static struct dentry *filename_create(int dfd, struct filename *name,
goto out;
/* don't fail immediately if it's r/o, at least try to report other errors */
- err2 = mnt_want_write(path->mnt);
+ error = mnt_want_write(path->mnt);
/*
* Do the final lookup. Suppress 'create' if there is a trailing
* '/', and a directory wasn't requested.
@@ -4142,25 +4229,24 @@ static struct dentry *filename_create(int dfd, struct filename *name,
if (IS_ERR(dentry))
goto unlock;
- if (unlikely(err2)) {
- error = err2;
+ if (unlikely(error))
goto fail;
- }
+
return dentry;
fail:
dput(dentry);
dentry = ERR_PTR(error);
unlock:
inode_unlock(path->dentry->d_inode);
- if (!err2)
+ if (!error)
mnt_drop_write(path->mnt);
out:
path_put(path);
return dentry;
}
-struct dentry *kern_path_create(int dfd, const char *pathname,
- struct path *path, unsigned int lookup_flags)
+struct dentry *start_creating_path(int dfd, const char *pathname,
+ struct path *path, unsigned int lookup_flags)
{
struct filename *filename = getname_kernel(pathname);
struct dentry *res = filename_create(dfd, filename, path, lookup_flags);
@@ -4168,9 +4254,9 @@ struct dentry *kern_path_create(int dfd, const char *pathname,
putname(filename);
return res;
}
-EXPORT_SYMBOL(kern_path_create);
+EXPORT_SYMBOL(start_creating_path);
-void done_path_create(struct path *path, struct dentry *dentry)
+void end_creating_path(struct path *path, struct dentry *dentry)
{
if (!IS_ERR(dentry))
dput(dentry);
@@ -4178,10 +4264,11 @@ void done_path_create(struct path *path, struct dentry *dentry)
mnt_drop_write(path->mnt);
path_put(path);
}
-EXPORT_SYMBOL(done_path_create);
+EXPORT_SYMBOL(end_creating_path);
-inline struct dentry *user_path_create(int dfd, const char __user *pathname,
- struct path *path, unsigned int lookup_flags)
+inline struct dentry *start_creating_user_path(
+ int dfd, const char __user *pathname,
+ struct path *path, unsigned int lookup_flags)
{
struct filename *filename = getname(pathname);
struct dentry *res = filename_create(dfd, filename, path, lookup_flags);
@@ -4189,7 +4276,7 @@ inline struct dentry *user_path_create(int dfd, const char __user *pathname,
putname(filename);
return res;
}
-EXPORT_SYMBOL(user_path_create);
+EXPORT_SYMBOL(start_creating_user_path);
/**
* vfs_mknod - create device node or file
@@ -4297,7 +4384,7 @@ retry:
break;
}
out2:
- done_path_create(&path, dentry);
+ end_creating_path(&path, dentry);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
@@ -4401,7 +4488,7 @@ retry:
if (IS_ERR(dentry))
error = PTR_ERR(dentry);
}
- done_path_create(&path, dentry);
+ end_creating_path(&path, dentry);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
@@ -4755,7 +4842,7 @@ retry:
if (!error)
error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode,
dentry, from->name);
- done_path_create(&path, dentry);
+ end_creating_path(&path, dentry);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
@@ -4828,7 +4915,7 @@ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap,
return -EPERM;
/*
* Updating the link count will likely cause i_uid and i_gid to
- * be writen back improperly if their true value is unknown to
+ * be written back improperly if their true value is unknown to
* the vfs.
*/
if (HAS_UNMAPPED_ID(idmap, inode))
@@ -4924,7 +5011,7 @@ retry:
error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode,
new_dentry, &delegated_inode);
out_dput:
- done_path_create(&new_path, new_dentry);
+ end_creating_path(&new_path, new_dentry);
if (delegated_inode) {
error = break_deleg_wait(&delegated_inode);
if (!error) {
@@ -5024,20 +5111,20 @@ int vfs_rename(struct renamedata *rd)
if (source == target)
return 0;
- error = may_delete(rd->old_mnt_idmap, old_dir, old_dentry, is_dir);
+ error = may_delete(rd->mnt_idmap, old_dir, old_dentry, is_dir);
if (error)
return error;
if (!target) {
- error = may_create(rd->new_mnt_idmap, new_dir, new_dentry);
+ error = may_create(rd->mnt_idmap, new_dir, new_dentry);
} else {
new_is_dir = d_is_dir(new_dentry);
if (!(flags & RENAME_EXCHANGE))
- error = may_delete(rd->new_mnt_idmap, new_dir,
+ error = may_delete(rd->mnt_idmap, new_dir,
new_dentry, is_dir);
else
- error = may_delete(rd->new_mnt_idmap, new_dir,
+ error = may_delete(rd->mnt_idmap, new_dir,
new_dentry, new_is_dir);
}
if (error)
@@ -5052,13 +5139,13 @@ int vfs_rename(struct renamedata *rd)
*/
if (new_dir != old_dir) {
if (is_dir) {
- error = inode_permission(rd->old_mnt_idmap, source,
+ error = inode_permission(rd->mnt_idmap, source,
MAY_WRITE);
if (error)
return error;
}
if ((flags & RENAME_EXCHANGE) && new_is_dir) {
- error = inode_permission(rd->new_mnt_idmap, target,
+ error = inode_permission(rd->mnt_idmap, target,
MAY_WRITE);
if (error)
return error;
@@ -5126,7 +5213,7 @@ int vfs_rename(struct renamedata *rd)
if (error)
goto out;
}
- error = old_dir->i_op->rename(rd->new_mnt_idmap, old_dir, old_dentry,
+ error = old_dir->i_op->rename(rd->mnt_idmap, old_dir, old_dentry,
new_dir, new_dentry, flags);
if (error)
goto out;
@@ -5269,10 +5356,9 @@ retry_deleg:
rd.old_parent = old_path.dentry;
rd.old_dentry = old_dentry;
- rd.old_mnt_idmap = mnt_idmap(old_path.mnt);
+ rd.mnt_idmap = mnt_idmap(old_path.mnt);
rd.new_parent = new_path.dentry;
rd.new_dentry = new_dentry;
- rd.new_mnt_idmap = mnt_idmap(new_path.mnt);
rd.delegated_inode = &delegated_inode;
rd.flags = flags;
error = vfs_rename(&rd);
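struct renamedata drops the separate old_mnt_idmap/new_mnt_idmap fields in favour of a single mnt_idmap. A caller sketch matching the do_renameat2() hunk above (a fragment; the surrounding lookups and variable names come from that hunk):

	struct renamedata rd = {
		.mnt_idmap	 = mnt_idmap(old_path.mnt),	/* one idmap for both ends */
		.old_parent	 = old_path.dentry,
		.old_dentry	 = old_dentry,
		.new_parent	 = new_path.dentry,
		.new_dentry	 = new_dentry,
		.delegated_inode = &delegated_inode,
		.flags		 = flags,
	};
	error = vfs_rename(&rd);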
diff --git a/fs/namespace.c b/fs/namespace.c
index ddfd4457d338..dc01b14c58cd 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -33,6 +33,7 @@
#include <linux/shmem_fs.h>
#include <linux/mnt_idmapping.h>
#include <linux/pidfs.h>
+#include <linux/nstree.h>
#include "pnode.h"
#include "internal.h"
@@ -65,6 +66,15 @@ static int __init set_mphash_entries(char *str)
}
__setup("mphash_entries=", set_mphash_entries);
+static char * __initdata initramfs_options;
+static int __init initramfs_options_setup(char *str)
+{
+ initramfs_options = str;
+ return 1;
+}
+
+__setup("initramfs_options=", initramfs_options_setup);
+
static u64 event;
static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC);
static DEFINE_IDA(mnt_group_ida);
@@ -80,13 +90,10 @@ static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */
-static DEFINE_SEQLOCK(mnt_ns_tree_lock);
#ifdef CONFIG_FSNOTIFY
LIST_HEAD(notify_list); /* protected by namespace_sem */
#endif
-static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
-static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */
enum mount_kattr_flags_t {
MOUNT_KATTR_RECURSE = (1 << 0),
@@ -119,59 +126,18 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
{
+ struct ns_common *ns;
+
if (!node)
return NULL;
- return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
-}
-
-static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
-{
- struct mnt_namespace *ns_a = node_to_mnt_ns(a);
- struct mnt_namespace *ns_b = node_to_mnt_ns(b);
- u64 seq_a = ns_a->seq;
- u64 seq_b = ns_b->seq;
-
- if (seq_a < seq_b)
- return -1;
- if (seq_a > seq_b)
- return 1;
- return 0;
-}
-
-static inline void mnt_ns_tree_write_lock(void)
-{
- write_seqlock(&mnt_ns_tree_lock);
-}
-
-static inline void mnt_ns_tree_write_unlock(void)
-{
- write_sequnlock(&mnt_ns_tree_lock);
-}
-
-static void mnt_ns_tree_add(struct mnt_namespace *ns)
-{
- struct rb_node *node, *prev;
-
- mnt_ns_tree_write_lock();
- node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
- /*
- * If there's no previous entry simply add it after the
- * head and if there is add it after the previous entry.
- */
- prev = rb_prev(&ns->mnt_ns_tree_node);
- if (!prev)
- list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list);
- else
- list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list);
- mnt_ns_tree_write_unlock();
-
- WARN_ON_ONCE(node);
+ ns = rb_entry(node, struct ns_common, ns_tree_node);
+ return container_of(ns, struct mnt_namespace, ns);
}
static void mnt_ns_release(struct mnt_namespace *ns)
{
/* keep alive for {list,stat}mount() */
- if (refcount_dec_and_test(&ns->passive)) {
+ if (ns && refcount_dec_and_test(&ns->passive)) {
fsnotify_mntns_delete(ns);
put_user_ns(ns->user_ns);
kfree(ns);
@@ -181,32 +147,16 @@ DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
static void mnt_ns_release_rcu(struct rcu_head *rcu)
{
- mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu));
+ mnt_ns_release(container_of(rcu, struct mnt_namespace, ns.ns_rcu));
}
static void mnt_ns_tree_remove(struct mnt_namespace *ns)
{
/* remove from global mount namespace list */
- if (!is_anon_ns(ns)) {
- mnt_ns_tree_write_lock();
- rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
- list_bidir_del_rcu(&ns->mnt_ns_list);
- mnt_ns_tree_write_unlock();
- }
-
- call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
-}
+ if (ns_tree_active(ns))
+ ns_tree_remove(ns);
-static int mnt_ns_find(const void *key, const struct rb_node *node)
-{
- const u64 mnt_ns_id = *(u64 *)key;
- const struct mnt_namespace *ns = node_to_mnt_ns(node);
-
- if (mnt_ns_id < ns->seq)
- return -1;
- if (mnt_ns_id > ns->seq)
- return 1;
- return 0;
+ call_rcu(&ns->ns.ns_rcu, mnt_ns_release_rcu);
}
/*
@@ -225,28 +175,21 @@ static int mnt_ns_find(const void *key, const struct rb_node *node)
*/
static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
{
- struct mnt_namespace *ns;
- struct rb_node *node;
- unsigned int seq;
+ struct mnt_namespace *mnt_ns;
+ struct ns_common *ns;
guard(rcu)();
- do {
- seq = read_seqbegin(&mnt_ns_tree_lock);
- node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
- if (node)
- break;
- } while (read_seqretry(&mnt_ns_tree_lock, seq));
-
- if (!node)
+ ns = ns_tree_lookup_rcu(mnt_ns_id, CLONE_NEWNS);
+ if (!ns)
return NULL;
/*
* The last reference count is put with RCU delay so we can
* unconditionally acquire a reference here.
*/
- ns = node_to_mnt_ns(node);
- refcount_inc(&ns->passive);
- return ns;
+ mnt_ns = container_of(ns, struct mnt_namespace, ns);
+ refcount_inc(&mnt_ns->passive);
+ return mnt_ns;
}
static inline void lock_mount_hash(void)
@@ -1017,7 +960,7 @@ static inline bool check_anonymous_mnt(struct mount *mnt)
return false;
seq = mnt->mnt_ns->seq_origin;
- return !seq || (seq == current->nsproxy->mnt_ns->seq);
+ return !seq || (seq == current->nsproxy->mnt_ns->ns.ns_id);
}
/*
@@ -1197,10 +1140,7 @@ static void commit_tree(struct mount *mnt)
if (!mnt_ns_attached(mnt)) {
for (struct mount *m = mnt; m; m = next_mnt(m, mnt))
- if (unlikely(mnt_ns_attached(m)))
- m = skip_mnt_tree(m);
- else
- mnt_add_to_ns(n, m);
+ mnt_add_to_ns(n, m);
n->nr_mounts += n->pending_mounts;
n->pending_mounts = 0;
}
@@ -2155,19 +2095,16 @@ struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous)
{
+ struct ns_common *ns;
+
guard(rcu)();
for (;;) {
- struct list_head *list;
-
- if (previous)
- list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list));
- else
- list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list));
- if (list_is_head(list, &mnt_ns_list))
- return ERR_PTR(-ENOENT);
+ ns = ns_tree_adjoined_rcu(mntns, previous);
+ if (IS_ERR(ns))
+ return ERR_CAST(ns);
- mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list);
+ mntns = to_mnt_ns(ns);
/*
* The last passive reference count is put with RCU
@@ -2182,7 +2119,7 @@ struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool pr
* the mount namespace and it might already be on its
* deathbed.
*/
- if (!refcount_inc_not_zero(&mntns->ns.count))
+ if (!ns_ref_get(mntns))
continue;
return mntns;
@@ -2207,7 +2144,7 @@ static bool mnt_ns_loop(struct dentry *dentry)
if (!mnt_ns)
return false;
- return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
+ return current->nsproxy->mnt_ns->ns.ns_id >= mnt_ns->ns.ns_id;
}
struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,
@@ -2458,7 +2395,7 @@ struct vfsmount *clone_private_mount(const struct path *path)
return ERR_PTR(-EINVAL);
}
- if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
+ if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
if (__has_locked_children(old_mnt, path->dentry))
@@ -2704,6 +2641,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
lock_mnt_tree(child);
q = __lookup_mnt(&child->mnt_parent->mnt,
child->mnt_mountpoint);
+ commit_tree(child);
if (q) {
struct mountpoint *mp = root.mp;
struct mount *r = child;
@@ -2713,7 +2651,6 @@ static int attach_recursive_mnt(struct mount *source_mnt,
mp = shorter;
mnt_change_mountpoint(r, mp, q);
}
- commit_tree(child);
}
unpin_mountpoint(&root);
unlock_mount_hash();
@@ -2862,6 +2799,19 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
return attach_recursive_mnt(mnt, p, mp);
}
+static int may_change_propagation(const struct mount *m)
+{
+ struct mnt_namespace *ns = m->mnt_ns;
+
+ // it must be mounted in some namespace
+ if (IS_ERR_OR_NULL(ns)) // is_mounted()
+ return -EINVAL;
+ // and the caller must be admin in userns of that namespace
+ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+ return 0;
+}
+
/*
* Sanity check the flags to change_mnt_propagation.
*/
@@ -2898,10 +2848,10 @@ static int do_change_type(struct path *path, int ms_flags)
return -EINVAL;
namespace_lock();
- if (!check_mnt(mnt)) {
- err = -EINVAL;
+ err = may_change_propagation(mnt);
+ if (err)
goto out_unlock;
- }
+
if (type == MS_SHARED) {
err = invent_group_ids(mnt, recurse);
if (err)
@@ -3070,7 +3020,7 @@ static struct file *open_detached_copy(struct path *path, bool recursive)
if (is_anon_ns(src_mnt_ns))
ns->seq_origin = src_mnt_ns->seq_origin;
else
- ns->seq_origin = src_mnt_ns->seq;
+ ns->seq_origin = src_mnt_ns->ns.ns_id;
}
mnt = __do_loopback(path, recursive);
@@ -3279,7 +3229,7 @@ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
* If you've mounted a non-root directory somewhere and want to do remount
* on it - tough luck.
*/
-static int do_remount(struct path *path, int ms_flags, int sb_flags,
+static int do_remount(struct path *path, int sb_flags,
int mnt_flags, void *data)
{
int err;
@@ -3347,18 +3297,11 @@ static int do_set_group(struct path *from_path, struct path *to_path)
namespace_lock();
- err = -EINVAL;
- /* To and From must be mounted */
- if (!is_mounted(&from->mnt))
- goto out;
- if (!is_mounted(&to->mnt))
- goto out;
-
- err = -EPERM;
- /* We should be allowed to modify mount namespaces of both mounts */
- if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN))
+ err = may_change_propagation(from);
+ if (err)
goto out;
- if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN))
+ err = may_change_propagation(to);
+ if (err)
goto out;
err = -EINVAL;
@@ -3724,8 +3667,10 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
int error;
error = security_sb_kern_mount(sb);
- if (!error && mount_too_revealing(sb, &mnt_flags))
+ if (!error && mount_too_revealing(sb, &mnt_flags)) {
+ errorfcp(fc, "VFS", "Mount too revealing");
error = -EPERM;
+ }
if (unlikely(error)) {
fc_drop_locked(fc);
@@ -4109,7 +4054,7 @@ int path_mount(const char *dev_name, struct path *path,
if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND))
return do_reconfigure_mnt(path, mnt_flags);
if (flags & MS_REMOUNT)
- return do_remount(path, flags, sb_flags, mnt_flags, data_page);
+ return do_remount(path, sb_flags, mnt_flags, data_page);
if (flags & MS_BIND)
return do_loopback(path, dev_name, flags & MS_REC);
if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
@@ -4148,20 +4093,11 @@ static void dec_mnt_namespaces(struct ucounts *ucounts)
static void free_mnt_ns(struct mnt_namespace *ns)
{
if (!is_anon_ns(ns))
- ns_free_inum(&ns->ns);
+ ns_common_free(ns);
dec_mnt_namespaces(ns->ucounts);
mnt_ns_tree_remove(ns);
}
-/*
- * Assign a sequence number so we can detect when we attempt to bind
- * mount a reference to an older mount namespace into the current
- * mount namespace, preventing reference counting loops. A 64bit
- * number incrementing at 10Ghz will take 12,427 years to wrap which
- * is effectively never, so we can ignore the possibility.
- */
-static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
-
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon)
{
struct mnt_namespace *new_ns;
@@ -4177,22 +4113,20 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
dec_mnt_namespaces(ucounts);
return ERR_PTR(-ENOMEM);
}
- if (!anon) {
- ret = ns_alloc_inum(&new_ns->ns);
- if (ret) {
- kfree(new_ns);
- dec_mnt_namespaces(ucounts);
- return ERR_PTR(ret);
- }
+
+ if (anon)
+ ret = ns_common_init_inum(new_ns, MNT_NS_ANON_INO);
+ else
+ ret = ns_common_init(new_ns);
+ if (ret) {
+ kfree(new_ns);
+ dec_mnt_namespaces(ucounts);
+ return ERR_PTR(ret);
}
- new_ns->ns.ops = &mntns_operations;
if (!anon)
- new_ns->seq = atomic64_inc_return(&mnt_ns_seq);
- refcount_set(&new_ns->ns.count, 1);
+ ns_tree_gen_id(&new_ns->ns);
refcount_set(&new_ns->passive, 1);
new_ns->mounts = RB_ROOT;
- INIT_LIST_HEAD(&new_ns->mnt_ns_list);
- RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
init_waitqueue_head(&new_ns->poll);
new_ns->user_ns = get_user_ns(user_ns);
new_ns->ucounts = ucounts;
@@ -4200,7 +4134,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
}
__latent_entropy
-struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
+struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
struct user_namespace *user_ns, struct fs_struct *new_fs)
{
struct mnt_namespace *new_ns;
@@ -4231,7 +4165,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
new = copy_tree(old, old->mnt.mnt_root, copy_flags);
if (IS_ERR(new)) {
namespace_unlock();
- ns_free_inum(&new_ns->ns);
+ ns_common_free(ns);
dec_mnt_namespaces(new_ns->ucounts);
mnt_ns_release(new_ns);
return ERR_CAST(new);
@@ -4278,7 +4212,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
if (pwdmnt)
mntput(pwdmnt);
- mnt_ns_tree_add(new_ns);
+ ns_tree_add_raw(new_ns);
return new_ns;
}
@@ -4441,7 +4375,7 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
ret = -EPERM;
if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) {
- pr_warn("VFS: Mount too revealing\n");
+ errorfcp(fc, "VFS", "Mount too revealing");
goto err_unlock;
}
@@ -4551,20 +4485,10 @@ SYSCALL_DEFINE5(move_mount,
if (flags & MOVE_MOUNT_SET_GROUP) mflags |= MNT_TREE_PROPAGATION;
if (flags & MOVE_MOUNT_BENEATH) mflags |= MNT_TREE_BENEATH;
- lflags = 0;
- if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW;
- if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT;
uflags = 0;
- if (flags & MOVE_MOUNT_F_EMPTY_PATH) uflags = AT_EMPTY_PATH;
- from_name = getname_maybe_null(from_pathname, uflags);
- if (IS_ERR(from_name))
- return PTR_ERR(from_name);
+ if (flags & MOVE_MOUNT_T_EMPTY_PATH)
+ uflags = AT_EMPTY_PATH;
- lflags = 0;
- if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW;
- if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT;
- uflags = 0;
- if (flags & MOVE_MOUNT_T_EMPTY_PATH) uflags = AT_EMPTY_PATH;
to_name = getname_maybe_null(to_pathname, uflags);
if (IS_ERR(to_name))
return PTR_ERR(to_name);
@@ -4577,11 +4501,24 @@ SYSCALL_DEFINE5(move_mount,
to_path = fd_file(f_to)->f_path;
path_get(&to_path);
} else {
+ lflags = 0;
+ if (flags & MOVE_MOUNT_T_SYMLINKS)
+ lflags |= LOOKUP_FOLLOW;
+ if (flags & MOVE_MOUNT_T_AUTOMOUNTS)
+ lflags |= LOOKUP_AUTOMOUNT;
ret = filename_lookup(to_dfd, to_name, lflags, &to_path, NULL);
if (ret)
return ret;
}
+ uflags = 0;
+ if (flags & MOVE_MOUNT_F_EMPTY_PATH)
+ uflags = AT_EMPTY_PATH;
+
+ from_name = getname_maybe_null(from_pathname, uflags);
+ if (IS_ERR(from_name))
+ return PTR_ERR(from_name);
+
if (!from_name && from_dfd >= 0) {
CLASS(fd_raw, f_from)(from_dfd);
if (fd_empty(f_from))
@@ -4590,6 +4527,11 @@ SYSCALL_DEFINE5(move_mount,
return vfs_move_mount(&fd_file(f_from)->f_path, &to_path, mflags);
}
+ lflags = 0;
+ if (flags & MOVE_MOUNT_F_SYMLINKS)
+ lflags |= LOOKUP_FOLLOW;
+ if (flags & MOVE_MOUNT_F_AUTOMOUNTS)
+ lflags |= LOOKUP_AUTOMOUNT;
ret = filename_lookup(from_dfd, from_name, lflags, &from_path, NULL);
if (ret)
return ret;
@@ -4996,7 +4938,7 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
return -EINVAL;
ns = get_proc_ns(file_inode(fd_file(f)));
- if (ns->ops->type != CLONE_NEWUSER)
+ if (ns->ns_type != CLONE_NEWUSER)
return -EINVAL;
/*
@@ -5176,7 +5118,8 @@ SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename,
int ret;
struct mount_kattr kattr = {};
- kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE;
+ if (flags & OPEN_TREE_CLONE)
+ kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE;
if (flags & AT_RECURSIVE)
kattr.kflags |= MOUNT_KATTR_RECURSE;
@@ -5388,7 +5331,7 @@ static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq)
static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns)
{
s->sm.mask |= STATMOUNT_MNT_NS_ID;
- s->sm.mnt_ns_id = ns->seq;
+ s->sm.mnt_ns_id = ns->ns.ns_id;
}
static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq)
@@ -5699,7 +5642,6 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
struct mnt_namespace *ns)
{
- struct path root __free(path_put) = {};
struct mount *m;
int err;
@@ -5711,7 +5653,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
if (!s->mnt)
return -ENOENT;
- err = grab_requested_root(ns, &root);
+ err = grab_requested_root(ns, &s->root);
if (err)
return err;
@@ -5720,7 +5662,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
* mounts to show users.
*/
m = real_mount(s->mnt);
- if (!is_path_reachable(m, m->mnt.mnt_root, &root) &&
+ if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) &&
!ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
@@ -5728,8 +5670,6 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
if (err)
return err;
- s->root = root;
-
/*
* Note that mount properties in mnt->mnt_flags, mnt->mnt_idmap
* can change concurrently as we only hold the read-side of the
@@ -5898,7 +5838,7 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq
return ERR_PTR(-EINVAL);
ns = get_proc_ns(file_inode(fd_file(f)));
- if (ns->ops->type != CLONE_NEWNS)
+ if (ns->ns_type != CLONE_NEWNS)
return ERR_PTR(-EINVAL);
mnt_ns = to_mnt_ns(ns);
@@ -5951,28 +5891,40 @@ retry:
if (!ret)
ret = copy_statmount_to_user(ks);
kvfree(ks->seq.buf);
+ path_put(&ks->root);
if (retry_statmount(ret, &seq_size))
goto retry;
return ret;
}
-static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id,
- u64 last_mnt_id, u64 *mnt_ids, size_t nr_mnt_ids,
- bool reverse)
+struct klistmount {
+ u64 last_mnt_id;
+ u64 mnt_parent_id;
+ u64 *kmnt_ids;
+ u32 nr_mnt_ids;
+ struct mnt_namespace *ns;
+ struct path root;
+};
+
+static ssize_t do_listmount(struct klistmount *kls, bool reverse)
{
- struct path root __free(path_put) = {};
+ struct mnt_namespace *ns = kls->ns;
+ u64 mnt_parent_id = kls->mnt_parent_id;
+ u64 last_mnt_id = kls->last_mnt_id;
+ u64 *mnt_ids = kls->kmnt_ids;
+ size_t nr_mnt_ids = kls->nr_mnt_ids;
struct path orig;
struct mount *r, *first;
ssize_t ret;
rwsem_assert_held(&namespace_sem);
- ret = grab_requested_root(ns, &root);
+ ret = grab_requested_root(ns, &kls->root);
if (ret)
return ret;
if (mnt_parent_id == LSMT_ROOT) {
- orig = root;
+ orig = kls->root;
} else {
orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns);
if (!orig.mnt)
@@ -5984,7 +5936,7 @@ static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id,
* Don't trigger audit denials. We just want to determine what
* mounts to show users.
*/
- if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &root) &&
+ if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &kls->root) &&
!ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
@@ -6017,14 +5969,45 @@ static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id,
return ret;
}
+static void __free_klistmount_free(const struct klistmount *kls)
+{
+ path_put(&kls->root);
+ kvfree(kls->kmnt_ids);
+ mnt_ns_release(kls->ns);
+}
+
+static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req *kreq,
+ size_t nr_mnt_ids)
+{
+
+ u64 last_mnt_id = kreq->param;
+
+ /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
+ if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET)
+ return -EINVAL;
+
+ kls->last_mnt_id = last_mnt_id;
+
+ kls->nr_mnt_ids = nr_mnt_ids;
+ kls->kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kls->kmnt_ids),
+ GFP_KERNEL_ACCOUNT);
+ if (!kls->kmnt_ids)
+ return -ENOMEM;
+
+ kls->ns = grab_requested_mnt_ns(kreq);
+ if (!kls->ns)
+ return -ENOENT;
+
+ kls->mnt_parent_id = kreq->mnt_id;
+ return 0;
+}
+
SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
u64 __user *, mnt_ids, size_t, nr_mnt_ids, unsigned int, flags)
{
- u64 *kmnt_ids __free(kvfree) = NULL;
+ struct klistmount kls __free(klistmount_free) = {};
const size_t maxcount = 1000000;
- struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
struct mnt_id_req kreq;
- u64 last_mnt_id;
ssize_t ret;
if (flags & ~LISTMOUNT_REVERSE)
@@ -6045,22 +6028,12 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
if (ret)
return ret;
- last_mnt_id = kreq.param;
- /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
- if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET)
- return -EINVAL;
-
- kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kmnt_ids),
- GFP_KERNEL_ACCOUNT);
- if (!kmnt_ids)
- return -ENOMEM;
-
- ns = grab_requested_mnt_ns(&kreq);
- if (!ns)
- return -ENOENT;
+ ret = prepare_klistmount(&kls, &kreq, nr_mnt_ids);
+ if (ret)
+ return ret;
- if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
- !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
+ if (kreq.mnt_ns_id && (kls.ns != current->nsproxy->mnt_ns) &&
+ !ns_capable_noaudit(kls.ns->user_ns, CAP_SYS_ADMIN))
return -ENOENT;
/*
@@ -6068,39 +6041,43 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
* listmount() doesn't care about any mount properties.
*/
scoped_guard(rwsem_read, &namespace_sem)
- ret = do_listmount(ns, kreq.mnt_id, last_mnt_id, kmnt_ids,
- nr_mnt_ids, (flags & LISTMOUNT_REVERSE));
+ ret = do_listmount(&kls, (flags & LISTMOUNT_REVERSE));
if (ret <= 0)
return ret;
- if (copy_to_user(mnt_ids, kmnt_ids, ret * sizeof(*mnt_ids)))
+ if (copy_to_user(mnt_ids, kls.kmnt_ids, ret * sizeof(*mnt_ids)))
return -EFAULT;
return ret;
}
+struct mnt_namespace init_mnt_ns = {
+ .ns.inum = ns_init_inum(&init_mnt_ns),
+ .ns.ops = &mntns_operations,
+ .user_ns = &init_user_ns,
+ .ns.__ns_ref = REFCOUNT_INIT(1),
+ .ns.ns_type = ns_common_type(&init_mnt_ns),
+ .passive = REFCOUNT_INIT(1),
+ .mounts = RB_ROOT,
+ .poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll),
+};
+
static void __init init_mount_tree(void)
{
struct vfsmount *mnt;
struct mount *m;
- struct mnt_namespace *ns;
struct path root;
- mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL);
+ mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options);
if (IS_ERR(mnt))
panic("Can't create rootfs");
- ns = alloc_mnt_ns(&init_user_ns, true);
- if (IS_ERR(ns))
- panic("Can't allocate initial namespace");
- ns->seq = atomic64_inc_return(&mnt_ns_seq);
- ns->ns.inum = PROC_MNT_INIT_INO;
m = real_mount(mnt);
- ns->root = m;
- ns->nr_mounts = 1;
- mnt_add_to_ns(ns, m);
- init_task.nsproxy->mnt_ns = ns;
- get_mnt_ns(ns);
+ init_mnt_ns.root = m;
+ init_mnt_ns.nr_mounts = 1;
+ mnt_add_to_ns(&init_mnt_ns, m);
+ init_task.nsproxy->mnt_ns = &init_mnt_ns;
+ get_mnt_ns(&init_mnt_ns);
root.mnt = mnt;
root.dentry = mnt->mnt_root;
@@ -6108,7 +6085,7 @@ static void __init init_mount_tree(void)
set_fs_pwd(current->fs, &root);
set_fs_root(current->fs, &root);
- mnt_ns_tree_add(ns);
+ ns_tree_add(&init_mnt_ns);
}
void __init mnt_init(void)
@@ -6148,7 +6125,7 @@ void __init mnt_init(void)
void put_mnt_ns(struct mnt_namespace *ns)
{
- if (!refcount_dec_and_test(&ns->ns.count))
+ if (!ns_ref_put(ns))
return;
namespace_lock();
emptied_ns = ns;
@@ -6397,7 +6374,6 @@ static struct user_namespace *mntns_owner(struct ns_common *ns)
const struct proc_ns_operations mntns_operations = {
.name = "mnt",
- .type = CLONE_NEWNS,
.get = mntns_get,
.put = mntns_put,
.install = mntns_install,
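A new initramfs_options= boot parameter is threaded through to the rootfs mount in init_mount_tree(). Assuming a tmpfs-backed rootfs, it could be used to pass tmpfs mount options from the kernel command line, for example (option string illustrative):

	initramfs_options=size=50%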
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 18b3dc74c70e..37ab6f28b5ad 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -369,7 +369,7 @@ void netfs_readahead(struct readahead_control *ractl)
return netfs_put_request(rreq, netfs_rreq_trace_put_return);
cleanup_free:
- return netfs_put_request(rreq, netfs_rreq_trace_put_failed);
+ return netfs_put_failed_request(rreq);
}
EXPORT_SYMBOL(netfs_readahead);
@@ -472,7 +472,7 @@ static int netfs_read_gaps(struct file *file, struct folio *folio)
return ret < 0 ? ret : 0;
discard:
- netfs_put_request(rreq, netfs_rreq_trace_put_discard);
+ netfs_put_failed_request(rreq);
alloc_error:
folio_unlock(folio);
return ret;
@@ -532,7 +532,7 @@ int netfs_read_folio(struct file *file, struct folio *folio)
return ret < 0 ? ret : 0;
discard:
- netfs_put_request(rreq, netfs_rreq_trace_put_discard);
+ netfs_put_failed_request(rreq);
alloc_error:
folio_unlock(folio);
return ret;
@@ -699,7 +699,7 @@ have_folio_no_wait:
return 0;
error_put:
- netfs_put_request(rreq, netfs_rreq_trace_put_failed);
+ netfs_put_failed_request(rreq);
error:
if (folio) {
folio_unlock(folio);
@@ -754,7 +754,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio,
return ret < 0 ? ret : 0;
error_put:
- netfs_put_request(rreq, netfs_rreq_trace_put_discard);
+ netfs_put_failed_request(rreq);
error:
_leave(" = %d", ret);
return ret;
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index f27ea5099a68..09394ac2c180 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -347,7 +347,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
folio_put(folio);
ret = filemap_write_and_wait_range(mapping, fpos, fpos + flen - 1);
if (ret < 0)
- goto error_folio_unlock;
+ goto out;
continue;
copied:
diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c
index a05e13472baf..a498ee8d6674 100644
--- a/fs/netfs/direct_read.c
+++ b/fs/netfs/direct_read.c
@@ -131,6 +131,7 @@ static ssize_t netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync)
if (rreq->len == 0) {
pr_err("Zero-sized read [R=%x]\n", rreq->debug_id);
+ netfs_put_request(rreq, netfs_rreq_trace_put_discard);
return -EIO;
}
@@ -205,7 +206,7 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i
if (user_backed_iter(iter)) {
ret = netfs_extract_user_iter(iter, rreq->len, &rreq->buffer.iter, 0);
if (ret < 0)
- goto out;
+ goto error_put;
rreq->direct_bv = (struct bio_vec *)rreq->buffer.iter.bvec;
rreq->direct_bv_count = ret;
rreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
@@ -238,6 +239,10 @@ out:
if (ret > 0)
orig_count -= ret;
return ret;
+
+error_put:
+ netfs_put_failed_request(rreq);
+ return ret;
}
EXPORT_SYMBOL(netfs_unbuffered_read_iter_locked);
diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
index a16660ab7f83..a9d1c3b2c084 100644
--- a/fs/netfs/direct_write.c
+++ b/fs/netfs/direct_write.c
@@ -57,7 +57,7 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
n = netfs_extract_user_iter(iter, len, &wreq->buffer.iter, 0);
if (n < 0) {
ret = n;
- goto out;
+ goto error_put;
}
wreq->direct_bv = (struct bio_vec *)wreq->buffer.iter.bvec;
wreq->direct_bv_count = n;
@@ -101,6 +101,10 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
out:
netfs_put_request(wreq, netfs_rreq_trace_put_return);
return ret;
+
+error_put:
+ netfs_put_failed_request(wreq);
+ return ret;
}
EXPORT_SYMBOL(netfs_unbuffered_write_iter_locked);
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index d4f16fefd965..4319611f5354 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -87,6 +87,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what);
void netfs_clear_subrequests(struct netfs_io_request *rreq);
void netfs_put_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what);
+void netfs_put_failed_request(struct netfs_io_request *rreq);
struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq);
static inline void netfs_see_request(struct netfs_io_request *rreq,
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
index 20748bcfbf59..486166460e17 100644
--- a/fs/netfs/misc.c
+++ b/fs/netfs/misc.c
@@ -321,7 +321,7 @@ void netfs_wake_collector(struct netfs_io_request *rreq)
{
if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags) &&
!test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) {
- queue_work(system_unbound_wq, &rreq->work);
+ queue_work(system_dfl_wq, &rreq->work);
} else {
trace_netfs_rreq(rreq, netfs_rreq_trace_wake_queue);
wake_up(&rreq->waitq);
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index e8c99738b5bb..b8c4918d3dcd 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -116,10 +116,8 @@ static void netfs_free_request_rcu(struct rcu_head *rcu)
netfs_stat_d(&netfs_n_rh_rreq);
}
-static void netfs_free_request(struct work_struct *work)
+static void netfs_deinit_request(struct netfs_io_request *rreq)
{
- struct netfs_io_request *rreq =
- container_of(work, struct netfs_io_request, cleanup_work);
struct netfs_inode *ictx = netfs_inode(rreq->inode);
unsigned int i;
@@ -149,6 +147,14 @@ static void netfs_free_request(struct work_struct *work)
if (atomic_dec_and_test(&ictx->io_count))
wake_up_var(&ictx->io_count);
+}
+
+static void netfs_free_request(struct work_struct *work)
+{
+ struct netfs_io_request *rreq =
+ container_of(work, struct netfs_io_request, cleanup_work);
+
+ netfs_deinit_request(rreq);
call_rcu(&rreq->rcu, netfs_free_request_rcu);
}
@@ -163,11 +169,29 @@ void netfs_put_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace
dead = __refcount_dec_and_test(&rreq->ref, &r);
trace_netfs_rreq_ref(debug_id, r - 1, what);
if (dead)
- WARN_ON(!queue_work(system_unbound_wq, &rreq->cleanup_work));
+ WARN_ON(!queue_work(system_dfl_wq, &rreq->cleanup_work));
}
}
/*
+ * Free a request (synchronously) that was just allocated but has
+ * failed before it could be submitted.
+ */
+void netfs_put_failed_request(struct netfs_io_request *rreq)
+{
+ int r = refcount_read(&rreq->ref);
+
+	/* New requests have two references (see
+	 * netfs_alloc_request()), and this function is only allowed on
+	 * new request objects.
+	 */
+ WARN_ON_ONCE(r != 2);
+
+ trace_netfs_rreq_ref(rreq->debug_id, r, netfs_rreq_trace_put_failed);
+ netfs_free_request(&rreq->cleanup_work);
+}
+
+/*
* Allocate and partially initialise an I/O request structure.
*/
struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq)
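A minimal userspace sketch of the failed-request teardown pattern added to fs/netfs/objects.c above: requests are born with two references, the normal put defers the real free to a workqueue, while the failure path frees synchronously because the object was never submitted and nobody else can hold it. The names (toy_request, toy_put_failed) are illustrative stand-ins, not kernel API.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_request {
	int ref;		/* stands in for rreq->ref */
	char *payload;
};

static struct toy_request *toy_alloc(void)
{
	struct toy_request *req = calloc(1, sizeof(*req));

	if (!req)
		return NULL;
	req->ref = 2;		/* one for the caller, one for completion */
	req->payload = malloc(64);
	return req;
}

static void toy_free(struct toy_request *req)
{
	free(req->payload);
	free(req);
}

/* Normal path: the last put hands the free off to deferred context. */
static void toy_put(struct toy_request *req)
{
	if (--req->ref == 0) {
		printf("deferred free\n");	/* the kernel queues cleanup_work here */
		toy_free(req);
	}
}

/* Failure path: only legal on a request nobody else has seen yet. */
static void toy_put_failed(struct toy_request *req)
{
	assert(req->ref == 2);	/* mirrors the WARN_ON_ONCE(r != 2) above */
	toy_free(req);
}

int main(void)
{
	struct toy_request *ok = toy_alloc();
	struct toy_request *bad = toy_alloc();

	if (!ok || !bad)
		return 1;
	toy_put(ok);		/* completion reference */
	toy_put(ok);		/* caller reference -> freed */
	toy_put_failed(bad);	/* setup failed before submission */
	return 0;
}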
diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c
index 3e804da1e1eb..a95e7aadafd0 100644
--- a/fs/netfs/read_collect.c
+++ b/fs/netfs/read_collect.c
@@ -281,8 +281,10 @@ reassess:
} else if (test_bit(NETFS_RREQ_SHORT_TRANSFER, &rreq->flags)) {
notes |= MADE_PROGRESS;
} else {
- if (!stream->failed)
+ if (!stream->failed) {
stream->transferred += transferred;
+ stream->transferred_valid = true;
+ }
if (front->transferred < front->len)
set_bit(NETFS_RREQ_SHORT_TRANSFER, &rreq->flags);
notes |= MADE_PROGRESS;
diff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c
index 8097bc069c1d..a1489aa29f78 100644
--- a/fs/netfs/read_pgpriv2.c
+++ b/fs/netfs/read_pgpriv2.c
@@ -118,7 +118,7 @@ static struct netfs_io_request *netfs_pgpriv2_begin_copy_to_cache(
return creq;
cancel_put:
- netfs_put_request(creq, netfs_rreq_trace_put_return);
+ netfs_put_failed_request(creq);
cancel:
rreq->copy_to_cache = ERR_PTR(-ENOBUFS);
clear_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags);
diff --git a/fs/netfs/read_single.c b/fs/netfs/read_single.c
index fa622a6cd56d..5c0dc4efc792 100644
--- a/fs/netfs/read_single.c
+++ b/fs/netfs/read_single.c
@@ -189,7 +189,7 @@ ssize_t netfs_read_single(struct inode *inode, struct file *file, struct iov_ite
return ret;
cleanup_free:
- netfs_put_request(rreq, netfs_rreq_trace_put_failed);
+ netfs_put_failed_request(rreq);
return ret;
}
EXPORT_SYMBOL(netfs_read_single);
diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c
index 0f3a36852a4d..cbf3d9194c7b 100644
--- a/fs/netfs/write_collect.c
+++ b/fs/netfs/write_collect.c
@@ -254,6 +254,7 @@ reassess_streams:
if (front->start + front->transferred > stream->collected_to) {
stream->collected_to = front->start + front->transferred;
stream->transferred = stream->collected_to - wreq->start;
+ stream->transferred_valid = true;
notes |= MADE_PROGRESS;
}
if (test_bit(NETFS_SREQ_FAILED, &front->flags)) {
@@ -356,6 +357,7 @@ bool netfs_write_collection(struct netfs_io_request *wreq)
{
struct netfs_inode *ictx = netfs_inode(wreq->inode);
size_t transferred;
+ bool transferred_valid = false;
int s;
_enter("R=%x", wreq->debug_id);
@@ -376,12 +378,16 @@ bool netfs_write_collection(struct netfs_io_request *wreq)
continue;
if (!list_empty(&stream->subrequests))
return false;
- if (stream->transferred < transferred)
+ if (stream->transferred_valid &&
+ stream->transferred < transferred) {
transferred = stream->transferred;
+ transferred_valid = true;
+ }
}
/* Okay, declare that all I/O is complete. */
- wreq->transferred = transferred;
+ if (transferred_valid)
+ wreq->transferred = transferred;
trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);
if (wreq->io_streams[1].active &&
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
index 50bee2c4130d..dd8743bc8d7f 100644
--- a/fs/netfs/write_issue.c
+++ b/fs/netfs/write_issue.c
@@ -118,12 +118,12 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
wreq->io_streams[0].prepare_write = ictx->ops->prepare_write;
wreq->io_streams[0].issue_write = ictx->ops->issue_write;
wreq->io_streams[0].collected_to = start;
- wreq->io_streams[0].transferred = LONG_MAX;
+ wreq->io_streams[0].transferred = 0;
wreq->io_streams[1].stream_nr = 1;
wreq->io_streams[1].source = NETFS_WRITE_TO_CACHE;
wreq->io_streams[1].collected_to = start;
- wreq->io_streams[1].transferred = LONG_MAX;
+ wreq->io_streams[1].transferred = 0;
if (fscache_resources_valid(&wreq->cache_resources)) {
wreq->io_streams[1].avail = true;
wreq->io_streams[1].active = true;
@@ -133,8 +133,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
return wreq;
nomem:
- wreq->error = -ENOMEM;
- netfs_put_request(wreq, netfs_rreq_trace_put_failed);
+ netfs_put_failed_request(wreq);
return ERR_PTR(-ENOMEM);
}
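A sketch of the collection logic implied by the write_issue.c/write_collect.c changes above: instead of seeding each stream's transferred count with a LONG_MAX sentinel, every stream carries an explicit validity flag and only streams that actually reported progress take part in the minimum. Field and function names below are illustrative, not the kernel layout.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct stream {
	size_t transferred;
	bool transferred_valid;
	bool active;
};

/* Take the minimum over streams that actually reported progress. */
static bool collect(const struct stream *streams, int n, size_t len,
		    size_t *out)
{
	size_t transferred = len;	/* upper bound: the request length */
	bool valid = false;

	for (int i = 0; i < n; i++) {
		if (!streams[i].active)
			continue;
		if (streams[i].transferred_valid &&
		    streams[i].transferred < transferred) {
			transferred = streams[i].transferred;
			valid = true;
		}
	}
	if (valid)
		*out = transferred;
	return valid;
}

int main(void)
{
	struct stream s[2] = {
		{ .transferred = 4096, .transferred_valid = true,  .active = true },
		{ .transferred = 0,    .transferred_valid = false, .active = true },
	};
	size_t done = 0;

	if (collect(s, 2, 8192, &done))
		printf("transferred=%zu\n", done);	/* 4096, not 0 */
	else
		printf("no stream reported progress\n");
	return 0;
}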
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 8fb4a950dd55..4e3dcc157a83 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -888,6 +888,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
if (fsinfo->xattr_support)
server->caps |= NFS_CAP_XATTR;
+ else
+ server->caps &= ~NFS_CAP_XATTR;
#endif
}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 86e36c630f09..8059ece82468 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -28,6 +28,7 @@
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/gfp.h>
+#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/compaction.h>
@@ -280,6 +281,37 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
}
EXPORT_SYMBOL_GPL(nfs_file_fsync);
+void nfs_truncate_last_folio(struct address_space *mapping, loff_t from,
+ loff_t to)
+{
+ struct folio *folio;
+
+ if (from >= to)
+ return;
+
+ folio = filemap_lock_folio(mapping, from >> PAGE_SHIFT);
+ if (IS_ERR(folio))
+ return;
+
+ if (folio_mkclean(folio))
+ folio_mark_dirty(folio);
+
+ if (folio_test_uptodate(folio)) {
+ loff_t fpos = folio_pos(folio);
+ size_t offset = from - fpos;
+ size_t end = folio_size(folio);
+
+ if (to - fpos < end)
+ end = to - fpos;
+ folio_zero_segment(folio, offset, end);
+ trace_nfs_size_truncate_folio(mapping->host, to);
+ }
+
+ folio_unlock(folio);
+ folio_put(folio);
+}
+EXPORT_SYMBOL_GPL(nfs_truncate_last_folio);
+
/*
* Decide whether a read/modify/write cycle may be more efficient
 * than a modify/write/read cycle when writing to a page in the
@@ -356,6 +388,7 @@ static int nfs_write_begin(const struct kiocb *iocb,
dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n",
file, mapping->host->i_ino, len, (long long) pos);
+ nfs_truncate_last_folio(mapping, i_size_read(mapping->host), pos);
fgp |= fgf_set_order(len);
start:
@@ -442,10 +475,11 @@ static void nfs_invalidate_folio(struct folio *folio, size_t offset,
dfprintk(PAGECACHE, "NFS: invalidate_folio(%lu, %zu, %zu)\n",
folio->index, offset, length);
- if (offset != 0 || length < folio_size(folio))
- return;
/* Cancel any unstarted writes on this page */
- nfs_wb_folio_cancel(inode, folio);
+ if (offset != 0 || length < folio_size(folio))
+ nfs_wb_folio(inode, folio);
+ else
+ nfs_wb_folio_cancel(inode, folio);
folio_wait_private_2(folio); /* [DEPRECATED] */
trace_nfs_invalidate_folio(inode, folio_pos(folio) + offset, length);
}
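A userspace sketch of the tail-zeroing arithmetic used by the nfs_truncate_last_folio() helper added above, ignoring the locking, dirtying and uptodate checks: when the file grows from the old size to a new one, the bytes of the page containing the old EOF between the old size and either the new size or the end of that page are zeroed so stale data is never exposed. PAGE_SIZE and the flat buffer stand in for the folio handling.

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096ul

/* Zero the bytes of the page holding the old EOF that the size change exposes. */
static void zero_last_page(char *page, unsigned long fpos,
			   unsigned long from, unsigned long to)
{
	size_t offset = from - fpos;		/* old EOF within this page */
	size_t end = PAGE_SIZE;			/* default: zero to page end */

	if (to - fpos < end)			/* new EOF lands in this page */
		end = to - fpos;
	memset(page + offset, 0, end - offset);	/* folio_zero_segment() analogue */
}

int main(void)
{
	char page[PAGE_SIZE];

	memset(page, 0xaa, sizeof(page));	/* pretend stale page contents */
	/* page covers [8192, 12288); the file grows from 9000 to 10000 */
	zero_last_page(page, 8192, 9000, 10000);
	printf("old EOF byte: %d, byte past new EOF: %d\n",
	       page[9000 - 8192], (unsigned char)page[10000 - 8192]);
	return 0;
}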
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 8dc921d83538..9edb5f9b0c4e 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -293,7 +293,7 @@ ff_lseg_match_mirrors(struct pnfs_layout_segment *l1,
struct pnfs_layout_segment *l2)
{
const struct nfs4_ff_layout_segment *fl1 = FF_LAYOUT_LSEG(l1);
- const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l1);
+ const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l2);
u32 i;
if (fl1->mirror_array_cnt != fl2->mirror_array_cnt)
@@ -773,8 +773,11 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
continue;
if (check_device &&
- nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node))
+ nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node)) {
+			// reinitialize the error state in case this is the last iteration
+ ds = ERR_PTR(-EINVAL);
continue;
+ }
*best_idx = idx;
break;
@@ -804,7 +807,7 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
struct nfs4_pnfs_ds *ds;
ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx);
- if (ds)
+ if (!IS_ERR(ds))
return ds;
return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx);
}
@@ -818,7 +821,7 @@ ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio,
ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx,
best_idx);
- if (ds || !pgio->pg_mirror_idx)
+ if (!IS_ERR(ds) || !pgio->pg_mirror_idx)
return ds;
return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx);
}
@@ -868,7 +871,7 @@ retry:
req->wb_nio = 0;
ds = ff_layout_get_ds_for_read(pgio, &ds_idx);
- if (!ds) {
+ if (IS_ERR(ds)) {
if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
goto out_mds;
pnfs_generic_pg_cleanup(pgio);
@@ -1072,11 +1075,13 @@ static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr)
{
u32 idx = hdr->pgio_mirror_idx + 1;
u32 new_idx = 0;
+ struct nfs4_pnfs_ds *ds;
- if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx))
- ff_layout_send_layouterror(hdr->lseg);
- else
+ ds = ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx);
+ if (IS_ERR(ds))
pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg);
+ else
+ ff_layout_send_layouterror(hdr->lseg);
pnfs_read_resend_pnfs(hdr, new_idx);
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 338ef77ae423..9bdaf7f38bed 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -108,7 +108,7 @@ u64 nfs_compat_user_ino64(u64 fileid)
int nfs_drop_inode(struct inode *inode)
{
- return NFS_STALE(inode) || generic_drop_inode(inode);
+ return NFS_STALE(inode) || inode_generic_drop(inode);
}
EXPORT_SYMBOL_GPL(nfs_drop_inode);
@@ -608,7 +608,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
inode->i_sb->s_id,
(unsigned long long)NFS_FILEID(inode),
nfs_display_fhandle_hash(fh),
- atomic_read(&inode->i_count));
+ icount_read(inode));
out:
return inode;
@@ -716,6 +716,7 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
{
struct inode *inode = d_inode(dentry);
struct nfs_fattr *fattr;
+ loff_t oldsize = i_size_read(inode);
int error = 0;
nfs_inc_stats(inode, NFSIOS_VFSSETATTR);
@@ -731,7 +732,7 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (error)
return error;
- if (attr->ia_size == i_size_read(inode))
+ if (attr->ia_size == oldsize)
attr->ia_valid &= ~ATTR_SIZE;
}
@@ -767,8 +768,10 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
trace_nfs_setattr_enter(inode);
/* Write all dirty data */
- if (S_ISREG(inode->i_mode))
+ if (S_ISREG(inode->i_mode)) {
+ nfs_file_block_o_direct(NFS_I(inode));
nfs_sync_inode(inode);
+ }
fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode));
if (fattr == NULL) {
@@ -777,8 +780,12 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
}
error = NFS_PROTO(inode)->setattr(dentry, fattr, attr);
- if (error == 0)
+ if (error == 0) {
+ if (attr->ia_valid & ATTR_SIZE)
+ nfs_truncate_last_folio(inode->i_mapping, oldsize,
+ attr->ia_size);
error = nfs_refresh_inode(inode, fattr);
+ }
nfs_free_fattr(fattr);
out:
trace_nfs_setattr_exit(inode, error);
@@ -2229,7 +2236,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%llx)\n",
__func__, inode->i_sb->s_id, inode->i_ino,
nfs_display_fhandle_hash(NFS_FH(inode)),
- atomic_read(&inode->i_count), fattr->valid);
+ icount_read(inode), fattr->valid);
if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) {
/* Only a mounted-on-fileid? Just exit */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 74d712b58423..c0a44f389f8f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -437,6 +437,8 @@ int nfs_file_release(struct inode *, struct file *);
int nfs_lock(struct file *, int, struct file_lock *);
int nfs_flock(struct file *, int, struct file_lock *);
int nfs_check_flags(int);
+void nfs_truncate_last_folio(struct address_space *mapping, loff_t from,
+ loff_t to);
/* inode.c */
extern struct workqueue_struct *nfsiod_workqueue;
@@ -530,6 +532,16 @@ static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi)
return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0;
}
+/* Must be called with exclusively locked inode->i_rwsem */
+static inline void nfs_file_block_o_direct(struct nfs_inode *nfsi)
+{
+ if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) {
+ clear_bit(NFS_INO_ODIRECT, &nfsi->flags);
+ inode_dio_wait(&nfsi->vfs_inode);
+ }
+}
+
+
/* namespace.c */
#define NFS_PATH_CANONICAL 1
extern char *nfs_path(char **p, struct dentry *dentry,
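A minimal sketch of the behaviour factored into the nfs_file_block_o_direct() inline above: with the inode exclusively locked, clear the direct-I/O mode flag and, if it was set, wait for all in-flight direct I/O to drain before buffered I/O proceeds. C11 atomics and a yield loop stand in for the inode flag bits and inode_dio_wait(); this is an analogue, not the kernel implementation.

#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool odirect_mode = true;	/* stands in for NFS_INO_ODIRECT */
static atomic_int dio_inflight;		/* stands in for the inode dio count */

/* Caller must already hold the file exclusively (i_rwsem in the kernel). */
static void block_o_direct(void)
{
	if (atomic_exchange(&odirect_mode, false)) {
		/* inode_dio_wait() analogue: wait out in-flight direct I/O */
		while (atomic_load(&dio_inflight) > 0)
			sched_yield();
	}
}

int main(void)
{
	block_o_direct();	/* nothing in flight in this single-threaded demo */
	printf("direct I/O drained, buffered path may proceed\n");
	return 0;
}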
diff --git a/fs/nfs/io.c b/fs/nfs/io.c
index 3388faf2acb9..d275b0a250bf 100644
--- a/fs/nfs/io.c
+++ b/fs/nfs/io.c
@@ -14,15 +14,6 @@
#include "internal.h"
-/* Call with exclusively locked inode->i_rwsem */
-static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode)
-{
- if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) {
- clear_bit(NFS_INO_ODIRECT, &nfsi->flags);
- inode_dio_wait(inode);
- }
-}
-
/**
* nfs_start_io_read - declare the file is being used for buffered reads
* @inode: file inode
@@ -57,7 +48,7 @@ nfs_start_io_read(struct inode *inode)
err = down_write_killable(&inode->i_rwsem);
if (err)
return err;
- nfs_block_o_direct(nfsi, inode);
+ nfs_file_block_o_direct(nfsi);
downgrade_write(&inode->i_rwsem);
return 0;
@@ -90,7 +81,7 @@ nfs_start_io_write(struct inode *inode)
err = down_write_killable(&inode->i_rwsem);
if (!err)
- nfs_block_o_direct(NFS_I(inode), inode);
+ nfs_file_block_o_direct(NFS_I(inode));
return err;
}
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index bd5fca285899..97abf62f109d 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -180,10 +180,8 @@ static void nfs_local_probe(struct nfs_client *clp)
return;
}
- if (nfs_client_is_local(clp)) {
- /* If already enabled, disable and re-enable */
- nfs_localio_disable_client(clp);
- }
+ if (nfs_client_is_local(clp))
+ return;
if (!nfs_uuid_begin(&clp->cl_uuid))
return;
@@ -244,7 +242,8 @@ __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
case -ENOMEM:
case -ENXIO:
case -ENOENT:
- /* Revalidate localio, will disable if unsupported */
+ /* Revalidate localio */
+ nfs_localio_disable_client(clp);
nfs_local_probe(clp);
}
}
@@ -453,12 +452,13 @@ static void nfs_local_call_read(struct work_struct *work)
nfs_local_iter_init(&iter, iocb, READ);
status = filp->f_op->read_iter(&iocb->kiocb, &iter);
+
+ revert_creds(save_cred);
+
if (status != -EIOCBQUEUED) {
nfs_local_read_done(iocb, status);
nfs_local_pgio_release(iocb);
}
-
- revert_creds(save_cred);
}
static int
@@ -648,14 +648,15 @@ static void nfs_local_call_write(struct work_struct *work)
file_start_write(filp);
status = filp->f_op->write_iter(&iocb->kiocb, &iter);
file_end_write(filp);
+
+ revert_creds(save_cred);
+ current->flags = old_flags;
+
if (status != -EIOCBQUEUED) {
nfs_local_write_done(iocb, status);
nfs_local_vfs_getattr(iocb);
nfs_local_pgio_release(iocb);
}
-
- revert_creds(save_cred);
- current->flags = old_flags;
}
static int
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 7f1ec9c67ff2..f9a3a1fbf44c 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -335,7 +335,7 @@ static int param_set_nfs_timeout(const char *val, const struct kernel_param *kp)
num *= HZ;
*((int *)kp->arg) = num;
if (!list_empty(&nfs_automount_list))
- mod_delayed_work(system_wq, &nfs_automount_task, num);
+ mod_delayed_work(system_percpu_wq, &nfs_automount_task, num);
} else {
*((int *)kp->arg) = -1*HZ;
cancel_delayed_work(&nfs_automount_task);
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 01c01f45358b..6a0b5871ba3b 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -114,6 +114,7 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
exception.inode = inode;
exception.state = lock->open_context->state;
+ nfs_file_block_o_direct(NFS_I(inode));
err = nfs_sync_inode(inode);
if (err)
goto out;
@@ -137,6 +138,7 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len)
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE],
};
struct inode *inode = file_inode(filep);
+ loff_t oldsize = i_size_read(inode);
int err;
if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE))
@@ -145,7 +147,11 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len)
inode_lock(inode);
err = nfs42_proc_fallocate(&msg, filep, offset, len);
- if (err == -EOPNOTSUPP)
+
+ if (err == 0)
+ nfs_truncate_last_folio(inode->i_mapping, oldsize,
+ offset + len);
+ else if (err == -EOPNOTSUPP)
NFS_SERVER(inode)->caps &= ~(NFS_CAP_ALLOCATE |
NFS_CAP_ZERO_RANGE);
@@ -183,6 +189,7 @@ int nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len)
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ZERO_RANGE],
};
struct inode *inode = file_inode(filep);
+ loff_t oldsize = i_size_read(inode);
int err;
if (!nfs_server_capable(inode, NFS_CAP_ZERO_RANGE))
@@ -191,9 +198,11 @@ int nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len)
inode_lock(inode);
err = nfs42_proc_fallocate(&msg, filep, offset, len);
- if (err == 0)
+ if (err == 0) {
+ nfs_truncate_last_folio(inode->i_mapping, oldsize,
+ offset + len);
truncate_pagecache_range(inode, offset, (offset + len) -1);
- if (err == -EOPNOTSUPP)
+ } else if (err == -EOPNOTSUPP)
NFS_SERVER(inode)->caps &= ~NFS_CAP_ZERO_RANGE;
inode_unlock(inode);
@@ -354,22 +363,27 @@ out:
/**
* nfs42_copy_dest_done - perform inode cache updates after clone/copy offload
- * @inode: pointer to destination inode
+ * @file: pointer to destination file
* @pos: destination offset
* @len: copy length
+ * @oldsize: length of the file prior to clone/copy
*
* Punch a hole in the inode page cache, so that the NFS client will
* know to retrieve new data.
* Update the file size if necessary, and then mark the inode as having
* invalid cached values for change attribute, ctime, mtime and space used.
*/
-static void nfs42_copy_dest_done(struct inode *inode, loff_t pos, loff_t len)
+static void nfs42_copy_dest_done(struct file *file, loff_t pos, loff_t len,
+ loff_t oldsize)
{
+ struct inode *inode = file_inode(file);
+ struct address_space *mapping = file->f_mapping;
loff_t newsize = pos + len;
loff_t end = newsize - 1;
- WARN_ON_ONCE(invalidate_inode_pages2_range(inode->i_mapping,
- pos >> PAGE_SHIFT, end >> PAGE_SHIFT));
+ nfs_truncate_last_folio(mapping, oldsize, pos);
+ WARN_ON_ONCE(invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
+ end >> PAGE_SHIFT));
spin_lock(&inode->i_lock);
if (newsize > i_size_read(inode))
@@ -402,6 +416,7 @@ static ssize_t _nfs42_proc_copy(struct file *src,
struct nfs_server *src_server = NFS_SERVER(src_inode);
loff_t pos_src = args->src_pos;
loff_t pos_dst = args->dst_pos;
+ loff_t oldsize_dst = i_size_read(dst_inode);
size_t count = args->count;
ssize_t status;
@@ -430,6 +445,7 @@ static ssize_t _nfs42_proc_copy(struct file *src,
return status;
}
+ nfs_file_block_o_direct(NFS_I(dst_inode));
status = nfs_sync_inode(dst_inode);
if (status)
return status;
@@ -475,7 +491,7 @@ static ssize_t _nfs42_proc_copy(struct file *src,
goto out;
}
- nfs42_copy_dest_done(dst_inode, pos_dst, res->write_res.count);
+ nfs42_copy_dest_done(dst, pos_dst, res->write_res.count, oldsize_dst);
nfs_invalidate_atime(src_inode);
status = res->write_res.count;
out:
@@ -1242,6 +1258,7 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
struct nfs42_clone_res res = {
.server = server,
};
+ loff_t oldsize_dst = i_size_read(dst_inode);
int status;
msg->rpc_argp = &args;
@@ -1276,7 +1293,7 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
/* a zero-length count means clone to EOF in src */
if (count == 0 && res.dst_fattr->valid & NFS_ATTR_FATTR_SIZE)
count = nfs_size_to_loff_t(res.dst_fattr->size) - dst_offset;
- nfs42_copy_dest_done(dst_inode, dst_offset, count);
+ nfs42_copy_dest_done(dst_f, dst_offset, count, oldsize_dst);
status = nfs_post_op_update_inode(dst_inode, res.dst_fattr);
}
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 1d6b5f4230c9..c9a0d1e420c6 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -278,9 +278,11 @@ static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off,
lock_two_nondirectories(src_inode, dst_inode);
/* flush all pending writes on both src and dst so that server
* has the latest data */
+ nfs_file_block_o_direct(NFS_I(src_inode));
ret = nfs_sync_inode(src_inode);
if (ret)
goto out_unlock;
+ nfs_file_block_o_direct(NFS_I(dst_inode));
ret = nfs_sync_inode(dst_inode);
if (ret)
goto out_unlock;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 7d2b67e06cc3..ce61253efd45 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4013,8 +4013,10 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
res.attr_bitmask[2];
}
memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
- server->caps &= ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS |
- NFS_CAP_SYMLINKS| NFS_CAP_SECURITY_LABEL);
+ server->caps &=
+ ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS |
+ NFS_CAP_SECURITY_LABEL | NFS_CAP_FS_LOCATIONS |
+ NFS_CAP_OPEN_XOR | NFS_CAP_DELEGTIME);
server->fattr_valid = NFS_ATTR_FATTR_V4;
if (res.attr_bitmask[0] & FATTR4_WORD0_ACL &&
res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL)
@@ -4092,7 +4094,6 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
};
int err;
- nfs_server_set_init_caps(server);
do {
err = nfs4_handle_exception(server,
_nfs4_server_capabilities(server, fhandle),
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index db3811af0796..18ae614e5a6c 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -122,7 +122,7 @@ nfs4_schedule_state_renewal(struct nfs_client *clp)
timeout = 5 * HZ;
dprintk("%s: requeueing work. Lease period = %ld\n",
__func__, (timeout + HZ - 1) / HZ);
- mod_delayed_work(system_wq, &clp->cl_renewd, timeout);
+ mod_delayed_work(system_percpu_wq, &clp->cl_renewd, timeout);
set_bit(NFS_CS_RENEWD, &clp->cl_res_state);
spin_unlock(&clp->cl_lock);
}
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 96b1323318c2..627115179795 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -272,6 +272,7 @@ DECLARE_EVENT_CLASS(nfs_update_size_class,
TP_ARGS(inode, new_size))
DEFINE_NFS_UPDATE_SIZE_EVENT(truncate);
+DEFINE_NFS_UPDATE_SIZE_EVENT(truncate_folio);
DEFINE_NFS_UPDATE_SIZE_EVENT(wcc);
DEFINE_NFS_UPDATE_SIZE_EVENT(update);
DEFINE_NFS_UPDATE_SIZE_EVENT(grow);
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 11968dcb7243..6e69ce43a13f 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -253,13 +253,14 @@ nfs_page_group_unlock(struct nfs_page *req)
nfs_page_clear_headlock(req);
}
-/*
- * nfs_page_group_sync_on_bit_locked
+/**
+ * nfs_page_group_sync_on_bit_locked - Test if all requests have @bit set
+ * @req: request in page group
+ * @bit: PG_* bit that is used to sync page group
*
* must be called with page group lock held
*/
-static bool
-nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit)
+bool nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit)
{
struct nfs_page *head = req->wb_head;
struct nfs_page *tmp;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index fa5c41d0989a..647c53d1418a 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -153,20 +153,10 @@ nfs_page_set_inode_ref(struct nfs_page *req, struct inode *inode)
}
}
-static int
-nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode)
+static void nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode)
{
- int ret;
-
- if (!test_bit(PG_REMOVE, &req->wb_flags))
- return 0;
- ret = nfs_page_group_lock(req);
- if (ret)
- return ret;
if (test_and_clear_bit(PG_REMOVE, &req->wb_flags))
nfs_page_set_inode_ref(req, inode);
- nfs_page_group_unlock(req);
- return 0;
}
/**
@@ -247,59 +237,17 @@ static void nfs_mapping_set_error(struct folio *folio, int error)
}
/*
- * nfs_page_group_search_locked
- * @head - head request of page group
- * @page_offset - offset into page
- *
- * Search page group with head @head to find a request that contains the
- * page offset @page_offset.
- *
- * Returns a pointer to the first matching nfs request, or NULL if no
- * match is found.
- *
- * Must be called with the page group lock held
- */
-static struct nfs_page *
-nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset)
-{
- struct nfs_page *req;
-
- req = head;
- do {
- if (page_offset >= req->wb_pgbase &&
- page_offset < (req->wb_pgbase + req->wb_bytes))
- return req;
-
- req = req->wb_this_page;
- } while (req != head);
-
- return NULL;
-}
-
-/*
- * nfs_page_group_covers_page
- * @head - head request of page group
+ * nfs_page_covers_folio
+ * @req: struct nfs_page
*
- * Return true if the page group with head @head covers the whole page,
- * returns false otherwise
+ * Return true if the request covers the whole folio.
+ * Note that the caller should ensure all subrequests have been joined
*/
static bool nfs_page_group_covers_page(struct nfs_page *req)
{
unsigned int len = nfs_folio_length(nfs_page_to_folio(req));
- struct nfs_page *tmp;
- unsigned int pos = 0;
-
- nfs_page_group_lock(req);
- for (;;) {
- tmp = nfs_page_group_search_locked(req->wb_head, pos);
- if (!tmp)
- break;
- pos = tmp->wb_pgbase + tmp->wb_bytes;
- }
-
- nfs_page_group_unlock(req);
- return pos >= len;
+ return req->wb_pgbase == 0 && req->wb_bytes == len;
}
/* We can set the PG_uptodate flag if we see that a write request
@@ -585,19 +533,18 @@ retry:
}
}
+ ret = nfs_page_group_lock(head);
+ if (ret < 0)
+ goto out_unlock;
+
/* Ensure that nobody removed the request before we locked it */
if (head != folio->private) {
+ nfs_page_group_unlock(head);
nfs_unlock_and_release_request(head);
goto retry;
}
- ret = nfs_cancel_remove_inode(head, inode);
- if (ret < 0)
- goto out_unlock;
-
- ret = nfs_page_group_lock(head);
- if (ret < 0)
- goto out_unlock;
+ nfs_cancel_remove_inode(head, inode);
/* lock each request in the page group */
for (subreq = head->wb_this_page;
@@ -786,7 +733,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
{
struct nfs_inode *nfsi = NFS_I(nfs_page_to_inode(req));
- if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) {
+ nfs_page_group_lock(req);
+ if (nfs_page_group_sync_on_bit_locked(req, PG_REMOVE)) {
struct folio *folio = nfs_page_to_folio(req->wb_head);
struct address_space *mapping = folio->mapping;
@@ -798,6 +746,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
}
spin_unlock(&mapping->i_private_lock);
}
+ nfs_page_group_unlock(req);
if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) {
atomic_long_dec(&nfsi->nrequests);
@@ -2054,6 +2003,7 @@ int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio)
* release it */
nfs_inode_remove_request(req);
nfs_unlock_and_release_request(req);
+ folio_cancel_dirty(folio);
}
return ret;
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 732abf6b92a5..85ca663c052c 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -113,7 +113,7 @@ static void
nfsd_file_schedule_laundrette(void)
{
if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags))
- queue_delayed_work(system_unbound_wq, &nfsd_filecache_laundrette,
+ queue_delayed_work(system_dfl_wq, &nfsd_filecache_laundrette,
NFSD_LAUNDRETTE_DELAY);
}
diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
index 4f6468eb2adf..cb237f1b902a 100644
--- a/fs/nfsd/localio.c
+++ b/fs/nfsd/localio.c
@@ -103,10 +103,11 @@ nfsd_open_local_fh(struct net *net, struct auth_domain *dom,
if (nfsd_file_get(new) == NULL)
goto again;
/*
- * Drop the ref we were going to install and the
- * one we were going to return.
+ * Drop the ref we were going to install (both file and
+ * net) and the one we were going to return (only file).
*/
nfsd_file_put(localio);
+ nfsd_net_put(net);
nfsd_file_put(localio);
localio = new;
}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 98ab55ba3ced..aa4a95713a48 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -470,7 +470,15 @@ static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap)
if (!iap->ia_valid)
return 0;
- iap->ia_valid |= ATTR_CTIME;
+ /*
+ * If ATTR_DELEG is set, then this is an update from a client that
+ * holds a delegation. If this is an update for only the atime, the
+ * ctime should not be changed. If the update contains the mtime
+ * too, then ATTR_CTIME should already be set.
+ */
+ if (!(iap->ia_valid & ATTR_DELEG))
+ iap->ia_valid |= ATTR_CTIME;
+
return notify_change(&nop_mnt_idmap, dentry, iap, NULL);
}
@@ -1943,10 +1951,9 @@ retry:
goto out_dput_old;
} else {
struct renamedata rd = {
- .old_mnt_idmap = &nop_mnt_idmap,
+ .mnt_idmap = &nop_mnt_idmap,
.old_parent = fdentry,
.old_dentry = odentry,
- .new_mnt_idmap = &nop_mnt_idmap,
.new_parent = tdentry,
.new_dentry = ndentry,
};
diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c
index 14868a3dd592..bc52afbfc5c7 100644
--- a/fs/nilfs2/sysfs.c
+++ b/fs/nilfs2/sysfs.c
@@ -1075,7 +1075,7 @@ void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs)
************************************************************************/
static ssize_t nilfs_feature_revision_show(struct kobject *kobj,
- struct attribute *attr, char *buf)
+ struct kobj_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%d.%d\n",
NILFS_CURRENT_REV, NILFS_MINOR_REV);
@@ -1087,7 +1087,7 @@ static const char features_readme_str[] =
"(1) revision\n\tshow current revision of NILFS file system driver.\n";
static ssize_t nilfs_feature_README_show(struct kobject *kobj,
- struct attribute *attr,
+ struct kobj_attribute *attr,
char *buf)
{
return sysfs_emit(buf, features_readme_str);
diff --git a/fs/nilfs2/sysfs.h b/fs/nilfs2/sysfs.h
index 78a87a016928..d370cd5cce3f 100644
--- a/fs/nilfs2/sysfs.h
+++ b/fs/nilfs2/sysfs.h
@@ -50,16 +50,16 @@ struct nilfs_sysfs_dev_subgroups {
struct completion sg_segments_kobj_unregister;
};
-#define NILFS_COMMON_ATTR_STRUCT(name) \
+#define NILFS_KOBJ_ATTR_STRUCT(name) \
struct nilfs_##name##_attr { \
struct attribute attr; \
- ssize_t (*show)(struct kobject *, struct attribute *, \
+ ssize_t (*show)(struct kobject *, struct kobj_attribute *, \
char *); \
- ssize_t (*store)(struct kobject *, struct attribute *, \
+ ssize_t (*store)(struct kobject *, struct kobj_attribute *, \
const char *, size_t); \
}
-NILFS_COMMON_ATTR_STRUCT(feature);
+NILFS_KOBJ_ATTR_STRUCT(feature);
#define NILFS_DEV_ATTR_STRUCT(name) \
struct nilfs_##name##_attr { \
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 079b868552c2..46bfc543f946 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -66,7 +66,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb)
* removed all zero refcount inodes, in any case. Test to
* be sure.
*/
- if (!atomic_read(&inode->i_count)) {
+ if (!icount_read(inode)) {
spin_unlock(&inode->i_lock);
continue;
}
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 798340db69d7..55a03bb05aa1 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -428,7 +428,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
conn->destroy_next = connector_destroy_list;
connector_destroy_list = conn;
spin_unlock(&destroy_lock);
- queue_work(system_unbound_wq, &connector_reaper_work);
+ queue_work(system_dfl_wq, &connector_reaper_work);
}
/*
* Note that we didn't update flags telling whether inode cares about
@@ -439,7 +439,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
spin_lock(&destroy_lock);
list_add(&mark->g_list, &destroy_list);
spin_unlock(&destroy_lock);
- queue_delayed_work(system_unbound_wq, &reaper_work,
+ queue_delayed_work(system_dfl_wq, &reaper_work,
FSNOTIFY_REAPER_DELAY);
}
EXPORT_SYMBOL_GPL(fsnotify_put_mark);
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 59aa801347a7..e7fd8a790aaa 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -13,12 +13,26 @@
#include <linux/nsfs.h>
#include <linux/uaccess.h>
#include <linux/mnt_namespace.h>
+#include <linux/ipc_namespace.h>
+#include <linux/time_namespace.h>
+#include <linux/utsname.h>
+#include <linux/exportfs.h>
+#include <linux/nstree.h>
+#include <net/net_namespace.h>
#include "mount.h"
#include "internal.h"
static struct vfsmount *nsfs_mnt;
+static struct path nsfs_root_path = {};
+
+void nsfs_get_root(struct path *path)
+{
+ *path = nsfs_root_path;
+ path_get(path);
+}
+
static long ns_ioctl(struct file *filp, unsigned int ioctl,
unsigned long arg);
static const struct file_operations ns_file_operations = {
@@ -139,7 +153,7 @@ static int copy_ns_info_to_user(const struct mnt_namespace *mnt_ns,
* the size value will be set to the size the kernel knows about.
*/
kinfo->size = min(usize, sizeof(*kinfo));
- kinfo->mnt_ns_id = mnt_ns->seq;
+ kinfo->mnt_ns_id = mnt_ns->ns.ns_id;
kinfo->nr_mounts = READ_ONCE(mnt_ns->nr_mounts);
/* Subtract the root mount of the mount namespace. */
if (kinfo->nr_mounts)
@@ -163,15 +177,18 @@ static bool nsfs_ioctl_valid(unsigned int cmd)
case NS_GET_TGID_FROM_PIDNS:
case NS_GET_PID_IN_PIDNS:
case NS_GET_TGID_IN_PIDNS:
- return (_IOC_TYPE(cmd) == _IOC_TYPE(cmd));
+ case NS_GET_ID:
+ return true;
}
/* Extensible ioctls require some extra handling. */
switch (_IOC_NR(cmd)) {
case _IOC_NR(NS_MNT_GET_INFO):
+ return extensible_ioctl_valid(cmd, NS_MNT_GET_INFO, MNT_NS_INFO_SIZE_VER0);
case _IOC_NR(NS_MNT_GET_NEXT):
+ return extensible_ioctl_valid(cmd, NS_MNT_GET_NEXT, MNT_NS_INFO_SIZE_VER0);
case _IOC_NR(NS_MNT_GET_PREV):
- return (_IOC_TYPE(cmd) == _IOC_TYPE(cmd));
+ return extensible_ioctl_valid(cmd, NS_MNT_GET_PREV, MNT_NS_INFO_SIZE_VER0);
}
return false;
@@ -202,26 +219,14 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
return -EINVAL;
return open_related_ns(ns, ns->ops->get_parent);
case NS_GET_NSTYPE:
- return ns->ops->type;
+ return ns->ns_type;
case NS_GET_OWNER_UID:
- if (ns->ops->type != CLONE_NEWUSER)
+ if (ns->ns_type != CLONE_NEWUSER)
return -EINVAL;
user_ns = container_of(ns, struct user_namespace, ns);
argp = (uid_t __user *) arg;
uid = from_kuid_munged(current_user_ns(), user_ns->owner);
return put_user(uid, argp);
- case NS_GET_MNTNS_ID: {
- __u64 __user *idp;
- __u64 id;
-
- if (ns->ops->type != CLONE_NEWNS)
- return -EINVAL;
-
- mnt_ns = container_of(ns, struct mnt_namespace, ns);
- idp = (__u64 __user *)arg;
- id = mnt_ns->seq;
- return put_user(id, idp);
- }
case NS_GET_PID_FROM_PIDNS:
fallthrough;
case NS_GET_TGID_FROM_PIDNS:
@@ -229,7 +234,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
case NS_GET_PID_IN_PIDNS:
fallthrough;
case NS_GET_TGID_IN_PIDNS: {
- if (ns->ops->type != CLONE_NEWPID)
+ if (ns->ns_type != CLONE_NEWPID)
return -EINVAL;
ret = -ESRCH;
@@ -267,6 +272,18 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
ret = -ESRCH;
return ret;
}
+ case NS_GET_MNTNS_ID:
+ if (ns->ns_type != CLONE_NEWNS)
+ return -EINVAL;
+ fallthrough;
+ case NS_GET_ID: {
+ __u64 __user *idp;
+ __u64 id;
+
+ idp = (__u64 __user *)arg;
+ id = ns->ns_id;
+ return put_user(id, idp);
+ }
}
/* extensible ioctls */
@@ -276,7 +293,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg;
size_t usize = _IOC_SIZE(ioctl);
- if (ns->ops->type != CLONE_NEWNS)
+ if (ns->ns_type != CLONE_NEWNS)
return -EINVAL;
if (!uinfo)
@@ -297,7 +314,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
struct file *f __free(fput) = NULL;
size_t usize = _IOC_SIZE(ioctl);
- if (ns->ops->type != CLONE_NEWNS)
+ if (ns->ns_type != CLONE_NEWNS)
return -EINVAL;
if (usize < MNT_NS_INFO_SIZE_VER0)
@@ -415,12 +432,164 @@ static const struct stashed_operations nsfs_stashed_ops = {
.put_data = nsfs_put_data,
};
+#define NSFS_FID_SIZE_U32_VER0 (NSFS_FILE_HANDLE_SIZE_VER0 / sizeof(u32))
+#define NSFS_FID_SIZE_U32_LATEST (NSFS_FILE_HANDLE_SIZE_LATEST / sizeof(u32))
+
+static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+ struct inode *parent)
+{
+ struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh;
+ struct ns_common *ns = inode->i_private;
+ int len = *max_len;
+
+ if (parent)
+ return FILEID_INVALID;
+
+ if (len < NSFS_FID_SIZE_U32_VER0) {
+ *max_len = NSFS_FID_SIZE_U32_LATEST;
+ return FILEID_INVALID;
+ } else if (len > NSFS_FID_SIZE_U32_LATEST) {
+ *max_len = NSFS_FID_SIZE_U32_LATEST;
+ }
+
+ fid->ns_id = ns->ns_id;
+ fid->ns_type = ns->ns_type;
+ fid->ns_inum = inode->i_ino;
+ return FILEID_NSFS;
+}
+
+static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
+ int fh_len, int fh_type)
+{
+ struct path path __free(path_put) = {};
+ struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh;
+ struct user_namespace *owning_ns = NULL;
+ struct ns_common *ns;
+ int ret;
+
+ if (fh_len < NSFS_FID_SIZE_U32_VER0)
+ return NULL;
+
+ /* Check that any trailing bytes are zero. */
+ if ((fh_len > NSFS_FID_SIZE_U32_LATEST) &&
+ memchr_inv((void *)fid + NSFS_FID_SIZE_U32_LATEST, 0,
+ fh_len - NSFS_FID_SIZE_U32_LATEST))
+ return NULL;
+
+ switch (fh_type) {
+ case FILEID_NSFS:
+ break;
+ default:
+ return NULL;
+ }
+
+ scoped_guard(rcu) {
+ ns = ns_tree_lookup_rcu(fid->ns_id, fid->ns_type);
+ if (!ns)
+ return NULL;
+
+ VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id);
+ VFS_WARN_ON_ONCE(ns->ns_type != fid->ns_type);
+ VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum);
+
+ if (!__ns_ref_get(ns))
+ return NULL;
+ }
+
+ switch (ns->ns_type) {
+#ifdef CONFIG_CGROUPS
+ case CLONE_NEWCGROUP:
+ if (!current_in_namespace(to_cg_ns(ns)))
+ owning_ns = to_cg_ns(ns)->user_ns;
+ break;
+#endif
+#ifdef CONFIG_IPC_NS
+ case CLONE_NEWIPC:
+ if (!current_in_namespace(to_ipc_ns(ns)))
+ owning_ns = to_ipc_ns(ns)->user_ns;
+ break;
+#endif
+ case CLONE_NEWNS:
+ if (!current_in_namespace(to_mnt_ns(ns)))
+ owning_ns = to_mnt_ns(ns)->user_ns;
+ break;
+#ifdef CONFIG_NET_NS
+ case CLONE_NEWNET:
+ if (!current_in_namespace(to_net_ns(ns)))
+ owning_ns = to_net_ns(ns)->user_ns;
+ break;
+#endif
+#ifdef CONFIG_PID_NS
+ case CLONE_NEWPID:
+ if (!current_in_namespace(to_pid_ns(ns))) {
+ owning_ns = to_pid_ns(ns)->user_ns;
+ } else if (!READ_ONCE(to_pid_ns(ns)->child_reaper)) {
+ ns->ops->put(ns);
+ return ERR_PTR(-EPERM);
+ }
+ break;
+#endif
+#ifdef CONFIG_TIME_NS
+ case CLONE_NEWTIME:
+ if (!current_in_namespace(to_time_ns(ns)))
+ owning_ns = to_time_ns(ns)->user_ns;
+ break;
+#endif
+#ifdef CONFIG_USER_NS
+ case CLONE_NEWUSER:
+ if (!current_in_namespace(to_user_ns(ns)))
+ owning_ns = to_user_ns(ns);
+ break;
+#endif
+#ifdef CONFIG_UTS_NS
+ case CLONE_NEWUTS:
+ if (!current_in_namespace(to_uts_ns(ns)))
+ owning_ns = to_uts_ns(ns)->user_ns;
+ break;
+#endif
+ default:
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ if (owning_ns && !ns_capable(owning_ns, CAP_SYS_ADMIN)) {
+ ns->ops->put(ns);
+ return ERR_PTR(-EPERM);
+ }
+
+ /* path_from_stashed() unconditionally consumes the reference. */
+ ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path);
+ if (ret)
+ return ERR_PTR(ret);
+
+ return no_free_ptr(path.dentry);
+}
+
+static int nsfs_export_permission(struct handle_to_path_ctx *ctx,
+ unsigned int oflags)
+{
+ /* nsfs_fh_to_dentry() performs all permission checks. */
+ return 0;
+}
+
+static struct file *nsfs_export_open(struct path *path, unsigned int oflags)
+{
+ return file_open_root(path, "", oflags, 0);
+}
+
+static const struct export_operations nsfs_export_operations = {
+ .encode_fh = nsfs_encode_fh,
+ .fh_to_dentry = nsfs_fh_to_dentry,
+ .open = nsfs_export_open,
+ .permission = nsfs_export_permission,
+};
+
static int nsfs_init_fs_context(struct fs_context *fc)
{
struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC);
if (!ctx)
return -ENOMEM;
ctx->ops = &nsfs_ops;
+ ctx->eops = &nsfs_export_operations;
ctx->dops = &ns_dentry_operations;
fc->s_fs_info = (void *)&nsfs_stashed_ops;
return 0;
@@ -438,4 +607,6 @@ void __init nsfs_init(void)
if (IS_ERR(nsfs_mnt))
panic("can't set nsfs up\n");
nsfs_mnt->mnt_sb->s_flags &= ~SB_NOUSER;
+ nsfs_root_path.mnt = nsfs_mnt;
+ nsfs_root_path.dentry = nsfs_mnt->mnt_root;
}
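A hedged usage sketch for the NS_GET_ID ioctl wired up in fs/nsfs.c above: the ioctl stores the namespace's 64-bit id through the pointer argument. It assumes uapi headers that already carry the NS_GET_ID definition from this series; the #ifdef keeps the example compilable against older headers.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/nsfs.h>

int main(void)
{
	uint64_t id = 0;
	int fd = open("/proc/self/ns/mnt", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
#ifdef NS_GET_ID
	if (ioctl(fd, NS_GET_ID, &id) == 0)
		printf("mount namespace id: %llu\n", (unsigned long long)id);
	else
		perror("NS_GET_ID");
#else
	printf("NS_GET_ID not defined by these uapi headers\n");
#endif
	close(fd);
	return 0;
}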
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 2018501b2249..2347a50f079b 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1876,7 +1876,8 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
dlm_debug_init(dlm);
snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name);
- dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM, 0);
+ dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM | WQ_PERCPU,
+ 0);
if (!dlm->dlm_worker) {
status = -ENOMEM;
mlog_errno(status);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 5130ec44e5e1..cccaa1d6fbba 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -547,7 +547,7 @@ static const struct super_operations dlmfs_ops = {
.alloc_inode = dlmfs_alloc_inode,
.free_inode = dlmfs_free_inode,
.evict_inode = dlmfs_evict_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
};
static const struct inode_operations dlmfs_file_inode_operations = {
@@ -595,7 +595,8 @@ static int __init init_dlmfs_fs(void)
}
cleanup_inode = 1;
- user_dlm_worker = alloc_workqueue("user_dlm", WQ_MEM_RECLAIM, 0);
+ user_dlm_worker = alloc_workqueue("user_dlm",
+ WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!user_dlm_worker) {
status = -ENOMEM;
goto bail;
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 930150ed5db1..ef147e8b3271 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -706,6 +706,8 @@ out:
* it not only handles the fiemap for inlined files, but also deals
* with the fast symlink, cause they have no difference for extent
* mapping per se.
+ *
+ * Must be called with ip_alloc_sem semaphore held.
*/
static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
struct fiemap_extent_info *fieinfo,
@@ -717,6 +719,7 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
u64 phys;
u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ lockdep_assert_held_read(&oi->ip_alloc_sem);
di = (struct ocfs2_dinode *)di_bh->b_data;
if (ocfs2_inode_is_fast_symlink(inode))
@@ -732,8 +735,11 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
phys += offsetof(struct ocfs2_dinode,
id2.i_data.id_data);
+ /* Release the ip_alloc_sem to prevent deadlock on page fault */
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count,
flags);
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
if (ret < 0)
return ret;
}
@@ -802,9 +808,11 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits;
phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits;
virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits;
-
+ /* Release the ip_alloc_sem to prevent deadlock on page fault */
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes,
len_bytes, fe_flags);
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
if (ret)
break;
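A userspace sketch of the lock-drop pattern added to ocfs2_fiemap()/ocfs2_fiemap_inline() above: the read lock protecting the extent data is released around fiemap_fill_next_extent(), since that call can fault on the user buffer and the fault path may need the same lock. pthread_rwlock_t stands in for ip_alloc_sem and fill_one() for fiemap_fill_next_extent(); it illustrates the ordering only.

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t alloc_sem = PTHREAD_RWLOCK_INITIALIZER;

/* May sleep or fault on a user buffer in the real code; must not hold alloc_sem. */
static int fill_one(unsigned long virt, unsigned long phys, unsigned long len)
{
	printf("extent: virt=%lu phys=%lu len=%lu\n", virt, phys, len);
	return 0;
}

static int walk_extents(void)
{
	int ret = 0;

	pthread_rwlock_rdlock(&alloc_sem);
	for (unsigned long i = 0; i < 3 && !ret; i++) {
		unsigned long virt = i * 4096, phys = 1000 + i, len = 4096;

		/* Drop the lock around the callback, as the patch does. */
		pthread_rwlock_unlock(&alloc_sem);
		ret = fill_one(virt, phys, len);
		pthread_rwlock_rdlock(&alloc_sem);
	}
	pthread_rwlock_unlock(&alloc_sem);
	return ret;
}

int main(void)
{
	return walk_extents();
}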
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 14bf440ea4df..6c4f78f473fb 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1281,6 +1281,9 @@ static void ocfs2_clear_inode(struct inode *inode)
* the journal is flushed before journal shutdown. Thus it is safe to
* have inodes get cleaned up after journal shutdown.
*/
+ if (!osb->journal)
+ return;
+
jbd2_journal_release_jbd_inode(osb->journal->j_journal,
&oi->ip_jinode);
}
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 8f732742b26e..267b50e8e42e 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4418,7 +4418,7 @@ int ocfs2_reflink_ioctl(struct inode *inode,
return error;
}
- new_dentry = user_path_create(AT_FDCWD, newname, &new_path, 0);
+ new_dentry = start_creating_user_path(AT_FDCWD, newname, &new_path, 0);
error = PTR_ERR(new_dentry);
if (IS_ERR(new_dentry)) {
mlog_errno(error);
@@ -4435,7 +4435,7 @@ int ocfs2_reflink_ioctl(struct inode *inode,
d_inode(new_path.dentry),
new_dentry, preserve);
out_dput:
- done_path_create(&new_path, new_dentry);
+ end_creating_path(&new_path, new_dentry);
out:
path_put(&old_path);
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 0f045e45fa0c..765105f1ff8a 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -952,7 +952,7 @@ static const struct dlm_lockspace_ops ocfs2_ls_ops = {
static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
{
version_unlock(conn);
- dlm_release_lockspace(conn->cc_lockspace, 2);
+ dlm_release_lockspace(conn->cc_lockspace, DLM_RELEASE_NORMAL);
conn->cc_lockspace = NULL;
ocfs2_live_connection_drop(conn->cc_private);
conn->cc_private = NULL;
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index f3da840758e7..b46100a4f529 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -306,7 +306,7 @@ static const struct super_operations orangefs_s_ops = {
.free_inode = orangefs_free_inode,
.destroy_inode = orangefs_destroy_inode,
.write_inode = orangefs_write_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.statfs = orangefs_statfs,
.show_options = orangefs_show_options,
};
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 70b8687dc45e..dbd63a74df4b 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -225,7 +225,7 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir,
struct ovl_cattr *attr)
{
struct dentry *ret;
- inode_lock(workdir->d_inode);
+ inode_lock_nested(workdir->d_inode, I_MUTEX_PARENT);
ret = ovl_create_real(ofs, workdir,
ovl_lookup_temp(ofs, workdir), attr);
inode_unlock(workdir->d_inode);
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index bb0d7ded8e76..4f84abaa0d68 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -361,10 +361,9 @@ static inline int ovl_do_rename(struct ovl_fs *ofs, struct dentry *olddir,
{
int err;
struct renamedata rd = {
- .old_mnt_idmap = ovl_upper_mnt_idmap(ofs),
+ .mnt_idmap = ovl_upper_mnt_idmap(ofs),
.old_parent = olddir,
.old_dentry = olddentry,
- .new_mnt_idmap = ovl_upper_mnt_idmap(ofs),
.new_parent = newdir,
.new_dentry = newdentry,
.flags = flags,
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index b65cdfce31ce..15cb06fa0c9a 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -270,26 +270,26 @@ static bool ovl_fill_merge(struct dir_context *ctx, const char *name,
static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd)
{
- int err;
+ int err = 0;
struct dentry *dentry, *dir = path->dentry;
const struct cred *old_cred;
old_cred = ovl_override_creds(rdd->dentry->d_sb);
- err = down_write_killable(&dir->d_inode->i_rwsem);
- if (!err) {
- while (rdd->first_maybe_whiteout) {
- struct ovl_cache_entry *p =
- rdd->first_maybe_whiteout;
- rdd->first_maybe_whiteout = p->next_maybe_whiteout;
- dentry = lookup_one(mnt_idmap(path->mnt),
- &QSTR_LEN(p->name, p->len), dir);
- if (!IS_ERR(dentry)) {
- p->is_whiteout = ovl_is_whiteout(dentry);
- dput(dentry);
- }
+ while (rdd->first_maybe_whiteout) {
+ struct ovl_cache_entry *p =
+ rdd->first_maybe_whiteout;
+ rdd->first_maybe_whiteout = p->next_maybe_whiteout;
+ dentry = lookup_one_positive_killable(mnt_idmap(path->mnt),
+ &QSTR_LEN(p->name, p->len),
+ dir);
+ if (!IS_ERR(dentry)) {
+ p->is_whiteout = ovl_is_whiteout(dentry);
+ dput(dentry);
+ } else if (PTR_ERR(dentry) == -EINTR) {
+ err = -EINTR;
+ break;
}
- inode_unlock(dir->d_inode);
}
ovl_revert_creds(old_cred);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index df85a76597e9..bd3d7ba8fb95 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -280,7 +280,7 @@ static const struct super_operations ovl_super_operations = {
.alloc_inode = ovl_alloc_inode,
.free_inode = ovl_free_inode,
.destroy_inode = ovl_destroy_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.put_super = ovl_put_super,
.sync_fs = ovl_sync_fs,
.statfs = ovl_statfs,
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index a33115e7384c..41033bac96cb 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -1552,7 +1552,8 @@ void ovl_copyattr(struct inode *inode)
int ovl_parent_lock(struct dentry *parent, struct dentry *child)
{
inode_lock_nested(parent->d_inode, I_MUTEX_PARENT);
- if (!child || child->d_parent == parent)
+ if (!child ||
+ (!d_unhashed(child) && child->d_parent == parent))
return 0;
inode_unlock(parent->d_inode);
diff --git a/fs/pidfs.c b/fs/pidfs.c
index edc35522d75c..c40c29c702e5 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -296,12 +296,12 @@ static __u32 pidfs_coredump_mask(unsigned long mm_flags)
static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
{
struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
+ struct task_struct *task __free(put_task) = NULL;
struct pid *pid = pidfd_pid(file);
size_t usize = _IOC_SIZE(cmd);
struct pidfd_info kinfo = {};
struct pidfs_exit_info *exit_info;
struct user_namespace *user_ns;
- struct task_struct *task;
struct pidfs_attr *attr;
const struct cred *c;
__u64 mask;
@@ -440,7 +440,7 @@ static bool pidfs_ioctl_valid(unsigned int cmd)
 * erroneously mistook the file descriptor for a pidfd.
* This is not perfect but will catch most cases.
*/
- return (_IOC_TYPE(cmd) == _IOC_TYPE(PIDFD_GET_INFO));
+ return extensible_ioctl_valid(cmd, PIDFD_GET_INFO, PIDFD_INFO_SIZE_VER0);
}
return false;
@@ -718,7 +718,7 @@ static void pidfs_evict_inode(struct inode *inode)
}
static const struct super_operations pidfs_sops = {
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = pidfs_evict_inode,
.statfs = simple_statfs,
};
diff --git a/fs/pipe.c b/fs/pipe.c
index 731622d0738d..42fead1efe52 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -458,7 +458,8 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from)
mutex_lock(&pipe->mutex);
if (!pipe->readers) {
- send_sig(SIGPIPE, current, 0);
+ if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0)
+ send_sig(SIGPIPE, current, 0);
ret = -EPIPE;
goto out;
}
@@ -498,7 +499,8 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from)
for (;;) {
if (!pipe->readers) {
- send_sig(SIGPIPE, current, 0);
+ if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0)
+ send_sig(SIGPIPE, current, 0);
if (!ret)
ret = -EPIPE;
break;
diff --git a/fs/pnode.c b/fs/pnode.c
index 81f7599bdac4..6f7d02f3fa98 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -111,7 +111,8 @@ void change_mnt_propagation(struct mount *mnt, int type)
return;
}
if (IS_MNT_SHARED(mnt)) {
- m = propagation_source(mnt);
+ if (type == MS_SLAVE || !hlist_empty(&mnt->mnt_slave_list))
+ m = propagation_source(mnt);
if (list_empty(&mnt->mnt_share)) {
mnt_release_group_id(mnt);
} else {
@@ -637,10 +638,11 @@ void propagate_umount(struct list_head *set)
}
// now to_umount consists of all acceptable candidates
- // deal with reparenting of remaining overmounts on those
+ // deal with reparenting of surviving overmounts on those
list_for_each_entry(m, &to_umount, mnt_list) {
- if (m->overmount)
- reparent(m->overmount);
+ struct mount *over = m->overmount;
+ if (over && !will_be_unmounted(over))
+ reparent(over);
}
// and fold them into the set
diff --git a/fs/proc/array.c b/fs/proc/array.c
index d6a0369caa93..69269745d73b 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -157,13 +157,11 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
unsigned int max_fds = 0;
rcu_read_lock();
- ppid = pid_alive(p) ?
- task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
-
tracer = ptrace_parent(p);
if (tracer)
tpid = task_pid_nr_ns(tracer, ns);
+ ppid = task_ppid_nr_ns(p, ns);
tgid = task_tgid_nr_ns(p, ns);
ngid = task_numa_group_id(p);
cred = get_task_cred(p);
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 76e800e38c8f..176281112273 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -367,6 +367,25 @@ static const struct inode_operations proc_dir_inode_operations = {
.setattr = proc_notify_change,
};
+static void pde_set_flags(struct proc_dir_entry *pde)
+{
+ const struct proc_ops *proc_ops = pde->proc_ops;
+
+ if (!proc_ops)
+ return;
+
+ if (proc_ops->proc_flags & PROC_ENTRY_PERMANENT)
+ pde->flags |= PROC_ENTRY_PERMANENT;
+ if (proc_ops->proc_read_iter)
+ pde->flags |= PROC_ENTRY_proc_read_iter;
+#ifdef CONFIG_COMPAT
+ if (proc_ops->proc_compat_ioctl)
+ pde->flags |= PROC_ENTRY_proc_compat_ioctl;
+#endif
+ if (proc_ops->proc_lseek)
+ pde->flags |= PROC_ENTRY_proc_lseek;
+}
+
/* returns the registered entry, or frees dp and returns NULL on failure */
struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
struct proc_dir_entry *dp)
@@ -374,6 +393,9 @@ struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
if (proc_alloc_inum(&dp->low_ino))
goto out_free_entry;
+ if (!S_ISDIR(dp->mode))
+ pde_set_flags(dp);
+
write_lock(&proc_subdir_lock);
dp->parent = dir;
if (pde_subdir_insert(dir, dp) == false) {
@@ -561,20 +583,6 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
return p;
}
-static void pde_set_flags(struct proc_dir_entry *pde)
-{
- if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT)
- pde->flags |= PROC_ENTRY_PERMANENT;
- if (pde->proc_ops->proc_read_iter)
- pde->flags |= PROC_ENTRY_proc_read_iter;
-#ifdef CONFIG_COMPAT
- if (pde->proc_ops->proc_compat_ioctl)
- pde->flags |= PROC_ENTRY_proc_compat_ioctl;
-#endif
- if (pde->proc_ops->proc_lseek)
- pde->flags |= PROC_ENTRY_proc_lseek;
-}
-
struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
struct proc_dir_entry *parent,
const struct proc_ops *proc_ops, void *data)
@@ -585,7 +593,6 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
if (!p)
return NULL;
p->proc_ops = proc_ops;
- pde_set_flags(p);
return proc_register(parent, p);
}
EXPORT_SYMBOL(proc_create_data);
@@ -636,7 +643,6 @@ struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode,
p->proc_ops = &proc_seq_ops;
p->seq_ops = ops;
p->state_size = state_size;
- pde_set_flags(p);
return proc_register(parent, p);
}
EXPORT_SYMBOL(proc_create_seq_private);
@@ -667,7 +673,6 @@ struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode,
return NULL;
p->proc_ops = &proc_single_ops;
p->single_show = show;
- pde_set_flags(p);
return proc_register(parent, p);
}
EXPORT_SYMBOL(proc_create_single_data);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 129490151be1..d9b7ef122343 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -187,7 +187,7 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root)
const struct super_operations proc_sops = {
.alloc_inode = proc_alloc_inode,
.free_inode = proc_free_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = proc_evict_inode,
.statfs = simple_statfs,
.show_options = proc_show_options,
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 4403a2e20c16..ea2b597fd92c 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -12,7 +12,7 @@
#include "internal.h"
-static const struct proc_ns_operations *ns_entries[] = {
+static const struct proc_ns_operations *const ns_entries[] = {
#ifdef CONFIG_NET_NS
&netns_operations,
#endif
@@ -117,7 +117,7 @@ static struct dentry *proc_ns_instantiate(struct dentry *dentry,
static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
{
struct task_struct *task = get_proc_task(file_inode(file));
- const struct proc_ns_operations **entry, **last;
+ const struct proc_ns_operations *const *entry, *const *last;
if (!task)
return -ENOENT;
@@ -151,7 +151,7 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
struct dentry *dentry, unsigned int flags)
{
struct task_struct *task = get_proc_task(dir);
- const struct proc_ns_operations **entry, **last;
+ const struct proc_ns_operations *const *entry, *const *last;
unsigned int len = dentry->d_name.len;
struct dentry *res = ERR_PTR(-ENOENT);
diff --git a/fs/proc/root.c b/fs/proc/root.c
index ed86ac710384..1e24e085c7d5 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -38,12 +38,14 @@ enum proc_param {
Opt_gid,
Opt_hidepid,
Opt_subset,
+ Opt_pidns,
};
static const struct fs_parameter_spec proc_fs_parameters[] = {
- fsparam_u32("gid", Opt_gid),
+ fsparam_u32("gid", Opt_gid),
fsparam_string("hidepid", Opt_hidepid),
fsparam_string("subset", Opt_subset),
+ fsparam_file_or_string("pidns", Opt_pidns),
{}
};
@@ -109,11 +111,66 @@ static int proc_parse_subset_param(struct fs_context *fc, char *value)
return 0;
}
+#ifdef CONFIG_PID_NS
+static int proc_parse_pidns_param(struct fs_context *fc,
+ struct fs_parameter *param,
+ struct fs_parse_result *result)
+{
+ struct proc_fs_context *ctx = fc->fs_private;
+ struct pid_namespace *target, *active = task_active_pid_ns(current);
+ struct ns_common *ns;
+ struct file *ns_filp __free(fput) = NULL;
+
+ switch (param->type) {
+ case fs_value_is_file:
+ /* came through fsconfig, steal the file reference */
+ ns_filp = no_free_ptr(param->file);
+ break;
+ case fs_value_is_string:
+ ns_filp = filp_open(param->string, O_RDONLY, 0);
+ break;
+ default:
+ WARN_ON_ONCE(true);
+ break;
+ }
+ if (!ns_filp)
+ ns_filp = ERR_PTR(-EBADF);
+ if (IS_ERR(ns_filp)) {
+ errorfc(fc, "could not get file from pidns argument");
+ return PTR_ERR(ns_filp);
+ }
+
+ if (!proc_ns_file(ns_filp))
+ return invalfc(fc, "pidns argument is not an nsfs file");
+ ns = get_proc_ns(file_inode(ns_filp));
+ if (ns->ns_type != CLONE_NEWPID)
+ return invalfc(fc, "pidns argument is not a pidns file");
+ target = container_of(ns, struct pid_namespace, ns);
+
+ /*
+ * pidns= is shorthand for joining the pidns to get a fsopen fd, so the
+ * permission model should be the same as pidns_install().
+ */
+ if (!ns_capable(target->user_ns, CAP_SYS_ADMIN)) {
+ errorfc(fc, "insufficient permissions to set pidns");
+ return -EPERM;
+ }
+ if (!pidns_is_ancestor(target, active))
+ return invalfc(fc, "cannot set pidns to non-descendant pidns");
+
+ put_pid_ns(ctx->pid_ns);
+ ctx->pid_ns = get_pid_ns(target);
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(ctx->pid_ns->user_ns);
+ return 0;
+}
+#endif /* CONFIG_PID_NS */
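+
+/*
+ * Illustrative usage of the new "pidns" option (not part of this change):
+ *
+ *   fsconfig(fs_fd, FSCONFIG_SET_STRING, "pidns", "/proc/<pid>/ns/pid", 0);
+ *
+ * or the equivalent mount string "-o pidns=/proc/<pid>/ns/pid", where fs_fd
+ * comes from fsopen("proc", 0) and <pid> is a placeholder.
+ */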
+
static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct proc_fs_context *ctx = fc->fs_private;
struct fs_parse_result result;
- int opt;
+ int opt, err;
opt = fs_parse(fc, proc_fs_parameters, param, &result);
if (opt < 0)
@@ -125,14 +182,38 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
break;
case Opt_hidepid:
- if (proc_parse_hidepid_param(fc, param))
- return -EINVAL;
+ err = proc_parse_hidepid_param(fc, param);
+ if (err)
+ return err;
break;
case Opt_subset:
- if (proc_parse_subset_param(fc, param->string) < 0)
- return -EINVAL;
+ err = proc_parse_subset_param(fc, param->string);
+ if (err)
+ return err;
+ break;
+
+ case Opt_pidns:
+#ifdef CONFIG_PID_NS
+ /*
+ * We would have to RCU-protect every proc_pid_ns() or
+ * proc_sb_info() access if we allowed this to be reconfigured
+ * for an existing procfs instance. Luckily, procfs instances
+ * are cheap to create, and mount-beneath would let you
+ * atomically replace an instance even with overmounts.
+ */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ errorfc(fc, "cannot reconfigure pidns for existing procfs");
+ return -EBUSY;
+ }
+ err = proc_parse_pidns_param(fc, param, &result);
+ if (err)
+ return err;
break;
+#else
+ errorfc(fc, "pidns mount flag not supported on this system");
+ return -EOPNOTSUPP;
+#endif
default:
return -EINVAL;
@@ -154,6 +235,11 @@ static void proc_apply_options(struct proc_fs_info *fs_info,
fs_info->hide_pid = ctx->hidepid;
if (ctx->mask & (1 << Opt_subset))
fs_info->pidonly = ctx->pidonly;
+ if (ctx->mask & (1 << Opt_pidns) &&
+ !WARN_ON_ONCE(fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)) {
+ put_pid_ns(fs_info->pid_ns);
+ fs_info->pid_ns = get_pid_ns(ctx->pid_ns);
+ }
}
static int proc_fill_super(struct super_block *s, struct fs_context *fc)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3d6d8a9f13fc..b26ae556b446 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -340,8 +340,8 @@ static int proc_maps_open(struct inode *inode, struct file *file,
priv->inode = inode;
priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
- if (IS_ERR_OR_NULL(priv->mm)) {
- int err = priv->mm ? PTR_ERR(priv->mm) : -ESRCH;
+ if (IS_ERR(priv->mm)) {
+ int err = PTR_ERR(priv->mm);
seq_release_private(inode, file);
return err;
@@ -1148,10 +1148,13 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
{
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = walk->vma;
- pte_t ptent = huge_ptep_get(walk->mm, addr, pte);
struct folio *folio = NULL;
bool present = false;
+ spinlock_t *ptl;
+ pte_t ptent;
+ ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
+ ptent = huge_ptep_get(walk->mm, addr, pte);
if (pte_present(ptent)) {
folio = page_folio(pte_page(ptent));
present = true;
@@ -1170,6 +1173,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
else
mss->private_hugetlb += huge_page_size(hstate_vma(vma));
}
+ spin_unlock(ptl);
return 0;
}
#else
@@ -2017,12 +2021,14 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
struct pagemapread *pm = walk->private;
struct vm_area_struct *vma = walk->vma;
u64 flags = 0, frame = 0;
+ spinlock_t *ptl;
int err = 0;
pte_t pte;
if (vma->vm_flags & VM_SOFTDIRTY)
flags |= PM_SOFT_DIRTY;
+ ptl = huge_pte_lock(hstate_vma(vma), walk->mm, ptep);
pte = huge_ptep_get(walk->mm, addr, ptep);
if (pte_present(pte)) {
struct folio *folio = page_folio(pte_page(pte));
@@ -2050,11 +2056,12 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
err = add_to_pagemap(&pme, pm);
if (err)
- return err;
+ break;
if (pm->show_pfn && (flags & PM_PRESENT))
frame++;
}
+ spin_unlock(ptl);
cond_resched();
return err;
@@ -2410,6 +2417,9 @@ static void pagemap_scan_backout_range(struct pagemap_scan_private *p,
{
struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
+ if (!p->vec_buf)
+ return;
+
if (cur_buf->start != addr)
cur_buf->end = addr;
else
@@ -3128,17 +3138,22 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end, struct mm_walk *walk)
{
- pte_t huge_pte = huge_ptep_get(walk->mm, addr, pte);
+ pte_t huge_pte;
struct numa_maps *md;
struct page *page;
+ spinlock_t *ptl;
+ ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
+ huge_pte = huge_ptep_get(walk->mm, addr, pte);
if (!pte_present(huge_pte))
- return 0;
+ goto out;
page = pte_page(huge_pte);
md = walk->private;
gather_stats(page, md, pte_dirty(huge_pte), 1);
+out:
+ spin_unlock(ptl);
return 0;
}
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 1a2e1185426c..b4e55c90f8dc 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -282,7 +282,7 @@ static int pstore_reconfigure(struct fs_context *fc)
static const struct super_operations pstore_ops = {
.statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = pstore_evict_inode,
.show_options = pstore_show_options,
};
diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c
index ceb5639a0629..eb61ba5bb964 100644
--- a/fs/pstore/zone.c
+++ b/fs/pstore/zone.c
@@ -43,7 +43,7 @@ struct psz_buffer {
*
* @magic: magic num for kmsg dump header
* @time: kmsg dump trigger time
- * @compressed: whether conpressed
+ * @compressed: whether compressed
* @counter: kmsg dump counter
* @reason: the kmsg dump reason (e.g. oops, panic, etc)
* @data: pointer to log data
@@ -214,7 +214,7 @@ static int psz_zone_write(struct pstore_zone *zone,
atomic_set(&zone->buffer->datalen, wlen + off);
}
- /* avoid to damage old records */
+ /* avoid damaging old records */
if (!is_on_panic() && !atomic_read(&pstore_zone_cxt.recovered))
goto dirty;
@@ -249,7 +249,7 @@ static int psz_zone_write(struct pstore_zone *zone,
return 0;
dirty:
- /* no need to mark dirty if going to try next zone */
+ /* no need to mark it dirty if going to try next zone */
if (wcnt == -ENOMSG)
return -ENOMSG;
atomic_set(&zone->dirty, true);
@@ -378,7 +378,7 @@ static int psz_kmsg_recover_meta(struct psz_context *cxt)
struct timespec64 time = { };
unsigned long i;
/*
- * Recover may on panic, we can't allocate any memory by kmalloc.
+	 * Recovery may happen on panic, so we can't allocate any memory with kmalloc.
* So, we use local array instead.
*/
char buffer_header[sizeof(*buf) + sizeof(*hdr)] = {0};
@@ -856,11 +856,11 @@ static int notrace psz_record_write(struct pstore_zone *zone,
/**
* psz_zone_write will set datalen as start + cnt.
- * It work if actual data length lesser than buffer size.
- * If data length greater than buffer size, pmsg will rewrite to
- * beginning of zone, which make buffer->datalen wrongly.
+	 * It works if the actual data length is less than the buffer size.
+ * If data length is greater than buffer size, pmsg will rewrite to
+ * the beginning of the zone, which makes buffer->datalen wrong.
* So we should reset datalen as buffer size once actual data length
- * greater than buffer size.
+ * is greater than buffer size.
*/
if (is_full_data) {
atomic_set(&zone->buffer->datalen, zone->buffer_size);
@@ -878,8 +878,9 @@ static int notrace psz_pstore_write(struct pstore_record *record)
atomic_set(&cxt->on_panic, 1);
/*
- * if on panic, do not write except panic records
- * Fix case that panic_write prints log which wakes up console backend.
+ * If on panic, do not write anything except panic records.
+ * Fix the case when panic_write prints log that wakes up
+ * console backend.
*/
if (is_on_panic() && record->type != PSTORE_TYPE_DMESG)
return -EBUSY;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index df4a9b348769..afa15a214538 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -881,7 +881,7 @@ void dqput(struct dquot *dquot)
put_releasing_dquots(dquot);
atomic_dec(&dquot->dq_count);
spin_unlock(&dq_list_lock);
- queue_delayed_work(system_unbound_wq, &quota_release_work, 1);
+ queue_delayed_work(system_dfl_wq, &quota_release_work, 1);
}
EXPORT_SYMBOL(dqput);
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index f8874c3b8c1e..41f9995da7ca 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -215,7 +215,7 @@ static int ramfs_show_options(struct seq_file *m, struct dentry *root)
static const struct super_operations ramfs_ops = {
.statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.show_options = ramfs_show_options,
};
diff --git a/fs/read_write.c b/fs/read_write.c
index c5b6265d984b..833bae068770 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1576,6 +1576,13 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
if (len == 0)
return 0;
+ /*
+ * Make sure return value doesn't overflow in 32bit compat mode. Also
+ * limit the size for all cases except when calling ->copy_file_range().
+ */
+ if (splice || !file_out->f_op->copy_file_range || in_compat_syscall())
+ len = min_t(size_t, MAX_RW_COUNT, len);
+
file_start_write(file_out);
/*
@@ -1589,9 +1596,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
len, flags);
} else if (!splice && file_in->f_op->remap_file_range && samesb) {
ret = file_in->f_op->remap_file_range(file_in, pos_in,
- file_out, pos_out,
- min_t(loff_t, MAX_RW_COUNT, len),
- REMAP_FILE_CAN_SHORTEN);
+ file_out, pos_out, len, REMAP_FILE_CAN_SHORTEN);
/* fallback to splice */
if (ret <= 0)
splice = true;
@@ -1624,8 +1629,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
* to splicing from input file, while file_start_write() is held on
* the output file on a different sb.
*/
- ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
- min_t(size_t, len, MAX_RW_COUNT), 0);
+ ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, len, 0);
done:
if (ret > 0) {
fsnotify_access(file_in);
diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c
index d98e0d2de09f..0d0ef54fc4de 100644
--- a/fs/resctrl/ctrlmondata.c
+++ b/fs/resctrl/ctrlmondata.c
@@ -473,12 +473,12 @@ ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of,
rdt_last_cmd_clear();
if (!strcmp(buf, "mbm_local_bytes")) {
- if (resctrl_arch_is_mbm_local_enabled())
+ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID))
rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID;
else
ret = -EINVAL;
} else if (!strcmp(buf, "mbm_total_bytes")) {
- if (resctrl_arch_is_mbm_total_enabled())
+ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID;
else
ret = -EINVAL;
@@ -563,10 +563,15 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
rr->r = r;
rr->d = d;
rr->first = first;
- rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid);
- if (IS_ERR(rr->arch_mon_ctx)) {
- rr->err = -EINVAL;
- return;
+ if (resctrl_arch_mbm_cntr_assign_enabled(r) &&
+ resctrl_is_mbm_event(evtid)) {
+ rr->is_mbm_cntr = true;
+ } else {
+ rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid);
+ if (IS_ERR(rr->arch_mon_ctx)) {
+ rr->err = -EINVAL;
+ return;
+ }
}
cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU);
@@ -582,7 +587,8 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
else
smp_call_on_cpu(cpu, smp_mon_event_count, rr, false);
- resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx);
+ if (rr->arch_mon_ctx)
+ resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx);
}
int rdtgroup_mondata_show(struct seq_file *m, void *arg)
@@ -625,11 +631,11 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
*/
list_for_each_entry(d, &r->mon_domains, hdr.list) {
if (d->ci_id == domid) {
- rr.ci_id = d->ci_id;
cpu = cpumask_any(&d->hdr.cpu_mask);
ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
if (!ci)
continue;
+ rr.ci = ci;
mon_event_read(&rr, r, NULL, rdtgrp,
&ci->shared_cpu_map, evtid, false);
goto checkresult;
@@ -653,10 +659,16 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
checkresult:
+ /*
+ * -ENOENT is a special case, set only when "mbm_event" counter assignment
+ * mode is enabled and no counter has been assigned.
+ */
if (rr.err == -EIO)
seq_puts(m, "Error\n");
else if (rr.err == -EINVAL)
seq_puts(m, "Unavailable\n");
+ else if (rr.err == -ENOENT)
+ seq_puts(m, "Unassigned\n");
else
seq_printf(m, "%llu\n", rr.val);
diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h
index 0a1eedba2b03..cf1fd82dc5a9 100644
--- a/fs/resctrl/internal.h
+++ b/fs/resctrl/internal.h
@@ -52,19 +52,31 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
}
/**
- * struct mon_evt - Entry in the event list of a resource
+ * struct mon_evt - Properties of a monitor event
* @evtid: event id
+ * @rid: resource id for this event
* @name: name of the event
+ * @evt_cfg: Event configuration value that represents the
+ * memory transactions (e.g., READS_TO_LOCAL_MEM,
+ * READS_TO_REMOTE_MEM) being tracked by @evtid.
+ * Only valid if @evtid is an MBM event.
* @configurable: true if the event is configurable
- * @list: entry in &rdt_resource->evt_list
+ * @enabled: true if the event is enabled
*/
struct mon_evt {
enum resctrl_event_id evtid;
+ enum resctrl_res_level rid;
char *name;
+ u32 evt_cfg;
bool configurable;
- struct list_head list;
+ bool enabled;
};
+extern struct mon_evt mon_event_all[QOS_NUM_EVENTS];
+
+#define for_each_mon_event(mevt) for (mevt = &mon_event_all[QOS_FIRST_EVENT]; \
+ mevt < &mon_event_all[QOS_NUM_EVENTS]; mevt++)
+
/**
* struct mon_data - Monitoring details for each event file.
* @list: Member of the global @mon_data_kn_priv_list list.
@@ -98,7 +110,9 @@ struct mon_data {
* domains in @r sharing L3 @ci.id
* @evtid: Which monitor event to read.
* @first: Initialize MBM counter when true.
- * @ci_id: Cacheinfo id for L3. Only set when @d is NULL. Used when summing domains.
+ * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains.
+ * @is_mbm_cntr: true if "mbm_event" counter assignment mode is enabled and
+ *		 @evtid is an MBM event.
* @err: Error encountered when reading counter.
* @val: Returned value of event counter. If @rgrp is a parent resource group,
* @val includes the sum of event counts from its child resource groups.
@@ -112,7 +126,8 @@ struct rmid_read {
struct rdt_mon_domain *d;
enum resctrl_event_id evtid;
bool first;
- unsigned int ci_id;
+ struct cacheinfo *ci;
+ bool is_mbm_cntr;
int err;
u64 val;
void *arch_mon_ctx;
@@ -226,6 +241,8 @@ struct rdtgroup {
#define RFTYPE_DEBUG BIT(10)
+#define RFTYPE_ASSIGN_CONFIG BIT(11)
+
#define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL)
#define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON)
@@ -375,6 +392,41 @@ bool closid_allocated(unsigned int closid);
int resctrl_find_cleanest_closid(void);
+void *rdt_kn_parent_priv(struct kernfs_node *kn);
+
+int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, struct seq_file *s, void *v);
+
+ssize_t resctrl_mbm_assign_mode_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off);
+
+void resctrl_bmec_files_show(struct rdt_resource *r, struct kernfs_node *l3_mon_kn,
+ bool show);
+
+int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, void *v);
+
+int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s,
+ void *v);
+
+void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp);
+
+void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp);
+
+int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v);
+
+ssize_t event_filter_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
+ loff_t off);
+
+int resctrl_mbm_assign_on_mkdir_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v);
+
+ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off);
+
+int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v);
+
+ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
+ loff_t off);
+
#ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK
int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp);
diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c
index f5637855c3ac..4076336fbba6 100644
--- a/fs/resctrl/monitor.c
+++ b/fs/resctrl/monitor.c
@@ -336,7 +336,7 @@ void free_rmid(u32 closid, u32 rmid)
entry = __rmid_entry(idx);
- if (resctrl_arch_is_llc_occupancy_enabled())
+ if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID))
add_rmid_to_limbo(entry);
else
list_add_tail(&entry->list, &rmid_free_lru);
@@ -346,28 +346,97 @@ static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid,
u32 rmid, enum resctrl_event_id evtid)
{
u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
+ struct mbm_state *state;
- switch (evtid) {
- case QOS_L3_MBM_TOTAL_EVENT_ID:
- return &d->mbm_total[idx];
- case QOS_L3_MBM_LOCAL_EVENT_ID:
- return &d->mbm_local[idx];
- default:
+ if (!resctrl_is_mbm_event(evtid))
return NULL;
+
+ state = d->mbm_states[MBM_STATE_IDX(evtid)];
+
+ return state ? &state[idx] : NULL;
+}
+
+/*
+ * mbm_cntr_get() - Return the counter ID for the matching @evtid and @rdtgrp.
+ *
+ * Return:
+ * Valid counter ID on success, or -ENOENT on failure.
+ */
+static int mbm_cntr_get(struct rdt_resource *r, struct rdt_mon_domain *d,
+ struct rdtgroup *rdtgrp, enum resctrl_event_id evtid)
+{
+ int cntr_id;
+
+ if (!r->mon.mbm_cntr_assignable)
+ return -ENOENT;
+
+ if (!resctrl_is_mbm_event(evtid))
+ return -ENOENT;
+
+ for (cntr_id = 0; cntr_id < r->mon.num_mbm_cntrs; cntr_id++) {
+ if (d->cntr_cfg[cntr_id].rdtgrp == rdtgrp &&
+ d->cntr_cfg[cntr_id].evtid == evtid)
+ return cntr_id;
+ }
+
+ return -ENOENT;
+}
+
+/*
+ * mbm_cntr_alloc() - Initialize and return a new counter ID in the domain @d.
+ * Caller must ensure that the specified event is not assigned already.
+ *
+ * Return:
+ * Valid counter ID on success, or -ENOSPC on failure.
+ */
+static int mbm_cntr_alloc(struct rdt_resource *r, struct rdt_mon_domain *d,
+ struct rdtgroup *rdtgrp, enum resctrl_event_id evtid)
+{
+ int cntr_id;
+
+ for (cntr_id = 0; cntr_id < r->mon.num_mbm_cntrs; cntr_id++) {
+ if (!d->cntr_cfg[cntr_id].rdtgrp) {
+ d->cntr_cfg[cntr_id].rdtgrp = rdtgrp;
+ d->cntr_cfg[cntr_id].evtid = evtid;
+ return cntr_id;
+ }
}
+
+ return -ENOSPC;
}
-static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
+/*
+ * mbm_cntr_free() - Clear the counter ID configuration details in the domain @d.
+ */
+static void mbm_cntr_free(struct rdt_mon_domain *d, int cntr_id)
+{
+ memset(&d->cntr_cfg[cntr_id], 0, sizeof(*d->cntr_cfg));
+}
+
+static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr)
{
int cpu = smp_processor_id();
+ u32 closid = rdtgrp->closid;
+ u32 rmid = rdtgrp->mon.rmid;
struct rdt_mon_domain *d;
- struct cacheinfo *ci;
+ int cntr_id = -ENOENT;
struct mbm_state *m;
int err, ret;
u64 tval = 0;
+ if (rr->is_mbm_cntr) {
+ cntr_id = mbm_cntr_get(rr->r, rr->d, rdtgrp, rr->evtid);
+ if (cntr_id < 0) {
+ rr->err = -ENOENT;
+ return -EINVAL;
+ }
+ }
+
if (rr->first) {
- resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid);
+ if (rr->is_mbm_cntr)
+ resctrl_arch_reset_cntr(rr->r, rr->d, closid, rmid, cntr_id, rr->evtid);
+ else
+ resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid);
m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
if (m)
memset(m, 0, sizeof(struct mbm_state));
@@ -378,8 +447,12 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
/* Reading a single domain, must be on a CPU in that domain. */
if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask))
return -EINVAL;
- rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid,
- rr->evtid, &tval, rr->arch_mon_ctx);
+ if (rr->is_mbm_cntr)
+ rr->err = resctrl_arch_cntr_read(rr->r, rr->d, closid, rmid, cntr_id,
+ rr->evtid, &tval);
+ else
+ rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid,
+ rr->evtid, &tval, rr->arch_mon_ctx);
if (rr->err)
return rr->err;
@@ -389,8 +462,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
}
/* Summing domains that share a cache, must be on a CPU for that cache. */
- ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
- if (!ci || ci->id != rr->ci_id)
+ if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map))
return -EINVAL;
/*
@@ -402,10 +474,14 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
*/
ret = -EINVAL;
list_for_each_entry(d, &rr->r->mon_domains, hdr.list) {
- if (d->ci_id != rr->ci_id)
+ if (d->ci_id != rr->ci->id)
continue;
- err = resctrl_arch_rmid_read(rr->r, d, closid, rmid,
- rr->evtid, &tval, rr->arch_mon_ctx);
+ if (rr->is_mbm_cntr)
+ err = resctrl_arch_cntr_read(rr->r, d, closid, rmid, cntr_id,
+ rr->evtid, &tval);
+ else
+ err = resctrl_arch_rmid_read(rr->r, d, closid, rmid,
+ rr->evtid, &tval, rr->arch_mon_ctx);
if (!err) {
rr->val += tval;
ret = 0;
@@ -421,8 +497,8 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
/*
* mbm_bw_count() - Update bw count from values previously read by
* __mon_event_count().
- * @closid: The closid used to identify the cached mbm_state.
- * @rmid: The rmid used to identify the cached mbm_state.
+ * @rdtgrp: resctrl group associated with the CLOSID and RMID to identify
+ * the cached mbm_state.
* @rr: The struct rmid_read populated by __mon_event_count().
*
* Supporting function to calculate the memory bandwidth
@@ -430,9 +506,11 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
* __mon_event_count() is compared with the chunks value from the previous
* invocation. This must be called once per second to maintain values in MBps.
*/
-static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
+static void mbm_bw_count(struct rdtgroup *rdtgrp, struct rmid_read *rr)
{
u64 cur_bw, bytes, cur_bytes;
+ u32 closid = rdtgrp->closid;
+ u32 rmid = rdtgrp->mon.rmid;
struct mbm_state *m;
m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
@@ -461,7 +539,7 @@ void mon_event_count(void *info)
rdtgrp = rr->rgrp;
- ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr);
+ ret = __mon_event_count(rdtgrp, rr);
/*
* For Ctrl groups read data from child monitor groups and
@@ -472,8 +550,7 @@ void mon_event_count(void *info)
if (rdtgrp->type == RDTCTRL_GROUP) {
list_for_each_entry(entry, head, mon.crdtgrp_list) {
- if (__mon_event_count(entry->closid, entry->mon.rmid,
- rr) == 0)
+ if (__mon_event_count(entry, rr) == 0)
ret = 0;
}
}
@@ -604,44 +681,49 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
}
static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d,
- u32 closid, u32 rmid, enum resctrl_event_id evtid)
+ struct rdtgroup *rdtgrp, enum resctrl_event_id evtid)
{
struct rmid_read rr = {0};
rr.r = r;
rr.d = d;
rr.evtid = evtid;
- rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
- if (IS_ERR(rr.arch_mon_ctx)) {
- pr_warn_ratelimited("Failed to allocate monitor context: %ld",
- PTR_ERR(rr.arch_mon_ctx));
- return;
+ if (resctrl_arch_mbm_cntr_assign_enabled(r)) {
+ rr.is_mbm_cntr = true;
+ } else {
+ rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
+ if (IS_ERR(rr.arch_mon_ctx)) {
+ pr_warn_ratelimited("Failed to allocate monitor context: %ld",
+ PTR_ERR(rr.arch_mon_ctx));
+ return;
+ }
}
- __mon_event_count(closid, rmid, &rr);
+ __mon_event_count(rdtgrp, &rr);
/*
* If the software controller is enabled, compute the
* bandwidth for this event id.
*/
if (is_mba_sc(NULL))
- mbm_bw_count(closid, rmid, &rr);
+ mbm_bw_count(rdtgrp, &rr);
- resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
+ if (rr.arch_mon_ctx)
+ resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
}
static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
- u32 closid, u32 rmid)
+ struct rdtgroup *rdtgrp)
{
/*
* This is protected from concurrent reads from user as both
* the user and overflow handler hold the global mutex.
*/
- if (resctrl_arch_is_mbm_total_enabled())
- mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID);
+ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
+ mbm_update_one_event(r, d, rdtgrp, QOS_L3_MBM_TOTAL_EVENT_ID);
- if (resctrl_arch_is_mbm_local_enabled())
- mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID);
+ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID))
+ mbm_update_one_event(r, d, rdtgrp, QOS_L3_MBM_LOCAL_EVENT_ID);
}
/*
@@ -714,11 +796,11 @@ void mbm_handle_overflow(struct work_struct *work)
d = container_of(work, struct rdt_mon_domain, mbm_over.work);
list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
- mbm_update(r, d, prgrp->closid, prgrp->mon.rmid);
+ mbm_update(r, d, prgrp);
head = &prgrp->mon.crdtgrp_list;
list_for_each_entry(crgrp, head, mon.crdtgrp_list)
- mbm_update(r, d, crgrp->closid, crgrp->mon.rmid);
+ mbm_update(r, d, crgrp);
if (is_mba_sc(NULL))
update_mba_bw(prgrp, d);
@@ -844,38 +926,819 @@ out_unlock:
mutex_unlock(&rdtgroup_mutex);
}
-static struct mon_evt llc_occupancy_event = {
- .name = "llc_occupancy",
- .evtid = QOS_L3_OCCUP_EVENT_ID,
+/*
+ * All available events. Architecture code marks the ones that
+ * are supported by a system using resctrl_enable_mon_event()
+ * to set .enabled.
+ */
+struct mon_evt mon_event_all[QOS_NUM_EVENTS] = {
+ [QOS_L3_OCCUP_EVENT_ID] = {
+ .name = "llc_occupancy",
+ .evtid = QOS_L3_OCCUP_EVENT_ID,
+ .rid = RDT_RESOURCE_L3,
+ },
+ [QOS_L3_MBM_TOTAL_EVENT_ID] = {
+ .name = "mbm_total_bytes",
+ .evtid = QOS_L3_MBM_TOTAL_EVENT_ID,
+ .rid = RDT_RESOURCE_L3,
+ },
+ [QOS_L3_MBM_LOCAL_EVENT_ID] = {
+ .name = "mbm_local_bytes",
+ .evtid = QOS_L3_MBM_LOCAL_EVENT_ID,
+ .rid = RDT_RESOURCE_L3,
+ },
};
-static struct mon_evt mbm_total_event = {
- .name = "mbm_total_bytes",
- .evtid = QOS_L3_MBM_TOTAL_EVENT_ID,
+void resctrl_enable_mon_event(enum resctrl_event_id eventid)
+{
+ if (WARN_ON_ONCE(eventid < QOS_FIRST_EVENT || eventid >= QOS_NUM_EVENTS))
+ return;
+ if (mon_event_all[eventid].enabled) {
+ pr_warn("Duplicate enable for event %d\n", eventid);
+ return;
+ }
+
+ mon_event_all[eventid].enabled = true;
+}
+
+bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid)
+{
+ return eventid >= QOS_FIRST_EVENT && eventid < QOS_NUM_EVENTS &&
+ mon_event_all[eventid].enabled;
+}
+
+u32 resctrl_get_mon_evt_cfg(enum resctrl_event_id evtid)
+{
+ return mon_event_all[evtid].evt_cfg;
+}
+
+/**
+ * struct mbm_transaction - Memory transaction an MBM event can be configured with.
+ * @name: Name of memory transaction (read, write ...).
+ * @val:	The bit (e.g. READS_TO_LOCAL_MEM or READS_TO_REMOTE_MEM) used to
+ * represent the memory transaction within an event's configuration.
+ */
+struct mbm_transaction {
+ char name[32];
+ u32 val;
};
-static struct mon_evt mbm_local_event = {
- .name = "mbm_local_bytes",
- .evtid = QOS_L3_MBM_LOCAL_EVENT_ID,
+/* Decoded values for each type of memory transaction. */
+static struct mbm_transaction mbm_transactions[NUM_MBM_TRANSACTIONS] = {
+ {"local_reads", READS_TO_LOCAL_MEM},
+ {"remote_reads", READS_TO_REMOTE_MEM},
+ {"local_non_temporal_writes", NON_TEMP_WRITE_TO_LOCAL_MEM},
+ {"remote_non_temporal_writes", NON_TEMP_WRITE_TO_REMOTE_MEM},
+ {"local_reads_slow_memory", READS_TO_LOCAL_S_MEM},
+ {"remote_reads_slow_memory", READS_TO_REMOTE_S_MEM},
+ {"dirty_victim_writes_all", DIRTY_VICTIMS_TO_ALL_MEM},
};
+int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v)
+{
+ struct mon_evt *mevt = rdt_kn_parent_priv(of->kn);
+ struct rdt_resource *r;
+ bool sep = false;
+ int ret = 0, i;
+
+ mutex_lock(&rdtgroup_mutex);
+ rdt_last_cmd_clear();
+
+ r = resctrl_arch_get_resource(mevt->rid);
+ if (!resctrl_arch_mbm_cntr_assign_enabled(r)) {
+ rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n");
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ for (i = 0; i < NUM_MBM_TRANSACTIONS; i++) {
+ if (mevt->evt_cfg & mbm_transactions[i].val) {
+ if (sep)
+ seq_putc(seq, ',');
+ seq_printf(seq, "%s", mbm_transactions[i].name);
+ sep = true;
+ }
+ }
+ seq_putc(seq, '\n');
+
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
+
+ return ret;
+}
+
+int resctrl_mbm_assign_on_mkdir_show(struct kernfs_open_file *of, struct seq_file *s,
+ void *v)
+{
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
+ int ret = 0;
+
+ mutex_lock(&rdtgroup_mutex);
+ rdt_last_cmd_clear();
+
+ if (!resctrl_arch_mbm_cntr_assign_enabled(r)) {
+ rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n");
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ seq_printf(s, "%u\n", r->mon.mbm_assign_on_mkdir);
+
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
+
+ return ret;
+}
+
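+/*
+ * Illustrative usage, assuming a default /sys/fs/resctrl mount and the L3
+ * monitoring resource:
+ *   echo 0 > /sys/fs/resctrl/info/L3_MON/mbm_assign_on_mkdir
+ * disables automatic counter assignment for newly created groups.
+ */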
+ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
+ bool value;
+ int ret;
+
+ ret = kstrtobool(buf, &value);
+ if (ret)
+ return ret;
+
+ mutex_lock(&rdtgroup_mutex);
+ rdt_last_cmd_clear();
+
+ if (!resctrl_arch_mbm_cntr_assign_enabled(r)) {
+ rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n");
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ r->mon.mbm_assign_on_mkdir = value;
+
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
+
+ return ret ?: nbytes;
+}
+
+/*
+ * mbm_cntr_free_all() - Clear all the counter ID configuration details in the
+ * domain @d. Called when mbm_assign_mode is changed.
+ */
+static void mbm_cntr_free_all(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+ memset(d->cntr_cfg, 0, sizeof(*d->cntr_cfg) * r->mon.num_mbm_cntrs);
+}
+
+/*
+ * resctrl_reset_rmid_all() - Reset all non-architectural state for all the
+ * supported RMIDs.
+ */
+static void resctrl_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+ u32 idx_limit = resctrl_arch_system_num_rmid_idx();
+ enum resctrl_event_id evt;
+ int idx;
+
+ for_each_mbm_event_id(evt) {
+ if (!resctrl_is_mon_event_enabled(evt))
+ continue;
+ idx = MBM_STATE_IDX(evt);
+ memset(d->mbm_states[idx], 0, sizeof(*d->mbm_states[0]) * idx_limit);
+ }
+}
+
+/*
+ * rdtgroup_assign_cntr() - Assign/unassign the counter ID for the event, RMID
+ * pair in the domain.
+ *
+ * Assign the counter if @assign is true else unassign the counter. Reset the
+ * associated non-architectural state.
+ */
+static void rdtgroup_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
+ enum resctrl_event_id evtid, u32 rmid, u32 closid,
+ u32 cntr_id, bool assign)
+{
+ struct mbm_state *m;
+
+ resctrl_arch_config_cntr(r, d, evtid, rmid, closid, cntr_id, assign);
+
+ m = get_mbm_state(d, closid, rmid, evtid);
+ if (m)
+ memset(m, 0, sizeof(*m));
+}
+
+/*
+ * rdtgroup_alloc_assign_cntr() - Allocate a counter ID and assign it to the event
+ * pointed to by @mevt and the resctrl group @rdtgrp within the domain @d.
+ *
+ * Return:
+ * 0 on success, < 0 on failure.
+ */
+static int rdtgroup_alloc_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
+ struct rdtgroup *rdtgrp, struct mon_evt *mevt)
+{
+ int cntr_id;
+
+ /* No action required if the counter is assigned already. */
+ cntr_id = mbm_cntr_get(r, d, rdtgrp, mevt->evtid);
+ if (cntr_id >= 0)
+ return 0;
+
+ cntr_id = mbm_cntr_alloc(r, d, rdtgrp, mevt->evtid);
+ if (cntr_id < 0) {
+ rdt_last_cmd_printf("Failed to allocate counter for %s in domain %d\n",
+ mevt->name, d->hdr.id);
+ return cntr_id;
+ }
+
+ rdtgroup_assign_cntr(r, d, mevt->evtid, rdtgrp->mon.rmid, rdtgrp->closid, cntr_id, true);
+
+ return 0;
+}
+
/*
- * Initialize the event list for the resource.
+ * rdtgroup_assign_cntr_event() - Assign a hardware counter for the event in
+ * @mevt to the resctrl group @rdtgrp. Assign counters to all domains if @d is
+ * NULL; otherwise, assign the counter to the specified domain @d.
+ *
+ * If all counters in a domain are already in use, rdtgroup_alloc_assign_cntr()
+ * will fail. The assignment process will abort at the first failure encountered
+ * during domain traversal, which may result in the event being only partially
+ * assigned.
*
- * Note that MBM events are also part of RDT_RESOURCE_L3 resource
- * because as per the SDM the total and local memory bandwidth
- * are enumerated as part of L3 monitoring.
+ * Return:
+ * 0 on success, < 0 on failure.
+ */
+static int rdtgroup_assign_cntr_event(struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
+ struct mon_evt *mevt)
+{
+ struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid);
+ int ret = 0;
+
+ if (!d) {
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
+ ret = rdtgroup_alloc_assign_cntr(r, d, rdtgrp, mevt);
+ if (ret)
+ return ret;
+ }
+ } else {
+ ret = rdtgroup_alloc_assign_cntr(r, d, rdtgrp, mevt);
+ }
+
+ return ret;
+}
+
+/*
+ * rdtgroup_assign_cntrs() - Assign counters to MBM events. Called when
+ * a new group is created.
+ *
+ * Each group can accommodate two counters per domain: one for the total
+ * event and one for the local event. Assignments may fail due to the limited
+ * number of counters. However, it is not necessary to fail the group creation
+ * and thus no failure is returned. Users have the option to modify the
+ * counter assignments after the group has been created.
+ */
+void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp)
+{
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+
+ if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r) ||
+ !r->mon.mbm_assign_on_mkdir)
+ return;
+
+ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
+ rdtgroup_assign_cntr_event(NULL, rdtgrp,
+ &mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID]);
+
+ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID))
+ rdtgroup_assign_cntr_event(NULL, rdtgrp,
+ &mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID]);
+}
+
+/*
+ * rdtgroup_free_unassign_cntr() - Unassign and reset the counter ID configuration
+ * for the event pointed to by @mevt within the domain @d and resctrl group @rdtgrp.
+ */
+static void rdtgroup_free_unassign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
+ struct rdtgroup *rdtgrp, struct mon_evt *mevt)
+{
+ int cntr_id;
+
+ cntr_id = mbm_cntr_get(r, d, rdtgrp, mevt->evtid);
+
+ /* If there is no cntr_id assigned, nothing to do */
+ if (cntr_id < 0)
+ return;
+
+ rdtgroup_assign_cntr(r, d, mevt->evtid, rdtgrp->mon.rmid, rdtgrp->closid, cntr_id, false);
+
+ mbm_cntr_free(d, cntr_id);
+}
+
+/*
+ * rdtgroup_unassign_cntr_event() - Unassign a hardware counter associated with
+ * the event structure @mevt from the domain @d and the group @rdtgrp. Unassign
+ * the counters from all the domains if @d is NULL else unassign from @d.
+ */
+static void rdtgroup_unassign_cntr_event(struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
+ struct mon_evt *mevt)
+{
+ struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid);
+
+ if (!d) {
+ list_for_each_entry(d, &r->mon_domains, hdr.list)
+ rdtgroup_free_unassign_cntr(r, d, rdtgrp, mevt);
+ } else {
+ rdtgroup_free_unassign_cntr(r, d, rdtgrp, mevt);
+ }
+}
+
+/*
+ * rdtgroup_unassign_cntrs() - Unassign the counters associated with MBM events.
+ * Called when a group is deleted.
*/
-static void l3_mon_evt_init(struct rdt_resource *r)
+void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp)
{
- INIT_LIST_HEAD(&r->evt_list);
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
- if (resctrl_arch_is_llc_occupancy_enabled())
- list_add_tail(&llc_occupancy_event.list, &r->evt_list);
- if (resctrl_arch_is_mbm_total_enabled())
- list_add_tail(&mbm_total_event.list, &r->evt_list);
- if (resctrl_arch_is_mbm_local_enabled())
- list_add_tail(&mbm_local_event.list, &r->evt_list);
+ if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r))
+ return;
+
+ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
+ rdtgroup_unassign_cntr_event(NULL, rdtgrp,
+ &mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID]);
+
+ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID))
+ rdtgroup_unassign_cntr_event(NULL, rdtgrp,
+ &mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID]);
+}
+
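+/*
+ * Parse a comma separated list of memory transaction names, e.g.
+ * "local_reads,remote_reads" (illustrative), into the combined event
+ * configuration returned in @val. Names must match the mbm_transactions[]
+ * table above.
+ */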
+static int resctrl_parse_mem_transactions(char *tok, u32 *val)
+{
+ u32 temp_val = 0;
+ char *evt_str;
+ bool found;
+ int i;
+
+next_config:
+ if (!tok || tok[0] == '\0') {
+ *val = temp_val;
+ return 0;
+ }
+
+ /* Start processing the strings for each memory transaction type */
+ evt_str = strim(strsep(&tok, ","));
+ found = false;
+ for (i = 0; i < NUM_MBM_TRANSACTIONS; i++) {
+ if (!strcmp(mbm_transactions[i].name, evt_str)) {
+ temp_val |= mbm_transactions[i].val;
+ found = true;
+ break;
+ }
+ }
+
+ if (!found) {
+ rdt_last_cmd_printf("Invalid memory transaction type %s\n", evt_str);
+ return -EINVAL;
+ }
+
+ goto next_config;
+}
+
+/*
+ * rdtgroup_update_cntr_event - Update the counter assignments for the event
+ * in a group.
+ * @r: Resource whose counter assignments are updated.
+ * @rdtgrp: Resctrl group.
+ * @evtid: MBM monitor event.
+ */
+static void rdtgroup_update_cntr_event(struct rdt_resource *r, struct rdtgroup *rdtgrp,
+ enum resctrl_event_id evtid)
+{
+ struct rdt_mon_domain *d;
+ int cntr_id;
+
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
+ cntr_id = mbm_cntr_get(r, d, rdtgrp, evtid);
+ if (cntr_id >= 0)
+ rdtgroup_assign_cntr(r, d, evtid, rdtgrp->mon.rmid,
+ rdtgrp->closid, cntr_id, true);
+ }
+}
+
+/*
+ * resctrl_update_cntr_allrdtgrp - Update the counter assignments for the event
+ * for all the groups.
+ * @mevt: MBM monitor event.
+ */
+static void resctrl_update_cntr_allrdtgrp(struct mon_evt *mevt)
+{
+ struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid);
+ struct rdtgroup *prgrp, *crgrp;
+
+ /*
+ * Find all the groups where the event is assigned and update the
+ * configuration of existing assignments.
+ */
+ list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
+ rdtgroup_update_cntr_event(r, prgrp, mevt->evtid);
+
+ list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
+ rdtgroup_update_cntr_event(r, crgrp, mevt->evtid);
+ }
+}
+
+ssize_t event_filter_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
+ loff_t off)
+{
+ struct mon_evt *mevt = rdt_kn_parent_priv(of->kn);
+ struct rdt_resource *r;
+ u32 evt_cfg = 0;
+ int ret = 0;
+
+ /* Valid input requires a trailing newline */
+ if (nbytes == 0 || buf[nbytes - 1] != '\n')
+ return -EINVAL;
+
+ buf[nbytes - 1] = '\0';
+
+ cpus_read_lock();
+ mutex_lock(&rdtgroup_mutex);
+
+ rdt_last_cmd_clear();
+
+ r = resctrl_arch_get_resource(mevt->rid);
+ if (!resctrl_arch_mbm_cntr_assign_enabled(r)) {
+ rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n");
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ ret = resctrl_parse_mem_transactions(buf, &evt_cfg);
+ if (!ret && mevt->evt_cfg != evt_cfg) {
+ mevt->evt_cfg = evt_cfg;
+ resctrl_update_cntr_allrdtgrp(mevt);
+ }
+
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
+
+ return ret ?: nbytes;
+}
+
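+/*
+ * Show the supported assignment modes with the active one in brackets, e.g.
+ * "[mbm_event]" followed by "default" when counter assignment is enabled
+ * (illustrative output).
+ */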
+int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
+ bool enabled;
+
+ mutex_lock(&rdtgroup_mutex);
+ enabled = resctrl_arch_mbm_cntr_assign_enabled(r);
+
+ if (r->mon.mbm_cntr_assignable) {
+ if (enabled)
+ seq_puts(s, "[mbm_event]\n");
+ else
+ seq_puts(s, "[default]\n");
+
+ if (!IS_ENABLED(CONFIG_RESCTRL_ASSIGN_FIXED)) {
+ if (enabled)
+ seq_puts(s, "default\n");
+ else
+ seq_puts(s, "mbm_event\n");
+ }
+ } else {
+ seq_puts(s, "[default]\n");
+ }
+
+ mutex_unlock(&rdtgroup_mutex);
+
+ return 0;
+}
+
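+/*
+ * Illustrative usage, assuming a default /sys/fs/resctrl mount:
+ *   echo mbm_event > /sys/fs/resctrl/info/L3_MON/mbm_assign_mode
+ * enables counter assignment mode; writing "default" switches back.
+ */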
+ssize_t resctrl_mbm_assign_mode_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
+ struct rdt_mon_domain *d;
+ int ret = 0;
+ bool enable;
+
+ /* Valid input requires a trailing newline */
+ if (nbytes == 0 || buf[nbytes - 1] != '\n')
+ return -EINVAL;
+
+ buf[nbytes - 1] = '\0';
+
+ cpus_read_lock();
+ mutex_lock(&rdtgroup_mutex);
+
+ rdt_last_cmd_clear();
+
+ if (!strcmp(buf, "default")) {
+ enable = 0;
+ } else if (!strcmp(buf, "mbm_event")) {
+ if (r->mon.mbm_cntr_assignable) {
+ enable = 1;
+ } else {
+ ret = -EINVAL;
+ rdt_last_cmd_puts("mbm_event mode is not supported\n");
+ goto out_unlock;
+ }
+ } else {
+ ret = -EINVAL;
+ rdt_last_cmd_puts("Unsupported assign mode\n");
+ goto out_unlock;
+ }
+
+ if (enable != resctrl_arch_mbm_cntr_assign_enabled(r)) {
+ ret = resctrl_arch_mbm_cntr_assign_set(r, enable);
+ if (ret)
+ goto out_unlock;
+
+ /* Update the visibility of BMEC related files */
+ resctrl_bmec_files_show(r, NULL, !enable);
+
+ /*
+ * Initialize the default memory transaction values for
+ * total and local events.
+ */
+ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
+ mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask;
+ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID))
+ mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask &
+ (READS_TO_LOCAL_MEM |
+ READS_TO_LOCAL_S_MEM |
+ NON_TEMP_WRITE_TO_LOCAL_MEM);
+ /* Enable auto assignment when switching to "mbm_event" mode */
+ if (enable)
+ r->mon.mbm_assign_on_mkdir = true;
+ /*
+		 * Reset all the non-architectural RMID state and assignable counters.
+ */
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
+ mbm_cntr_free_all(r, d);
+ resctrl_reset_rmid_all(r, d);
+ }
+ }
+
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
+
+ return ret ?: nbytes;
+}
+
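+/*
+ * Show the number of assignable counters in each monitoring domain as
+ * "<domain id>=<count>" pairs separated by ';', e.g. "0=32;1=32"
+ * (illustrative values).
+ */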
+int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
+ struct rdt_mon_domain *dom;
+ bool sep = false;
+
+ cpus_read_lock();
+ mutex_lock(&rdtgroup_mutex);
+
+ list_for_each_entry(dom, &r->mon_domains, hdr.list) {
+ if (sep)
+ seq_putc(s, ';');
+
+ seq_printf(s, "%d=%d", dom->hdr.id, r->mon.num_mbm_cntrs);
+ sep = true;
+ }
+ seq_putc(s, '\n');
+
+ mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
+ return 0;
+}
+
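+/*
+ * Show the number of currently unassigned counters per domain in the same
+ * "<domain id>=<count>" format as num_mbm_cntrs, e.g. "0=30;1=32"
+ * (illustrative values).
+ */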
+int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
+ struct rdt_mon_domain *dom;
+ bool sep = false;
+ u32 cntrs, i;
+ int ret = 0;
+
+ cpus_read_lock();
+ mutex_lock(&rdtgroup_mutex);
+
+ rdt_last_cmd_clear();
+
+ if (!resctrl_arch_mbm_cntr_assign_enabled(r)) {
+ rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n");
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ list_for_each_entry(dom, &r->mon_domains, hdr.list) {
+ if (sep)
+ seq_putc(s, ';');
+
+ cntrs = 0;
+ for (i = 0; i < r->mon.num_mbm_cntrs; i++) {
+ if (!dom->cntr_cfg[i].rdtgrp)
+ cntrs++;
+ }
+
+ seq_printf(s, "%d=%u", dom->hdr.id, cntrs);
+ sep = true;
+ }
+ seq_putc(s, '\n');
+
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
+
+ return ret;
+}
+
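+/*
+ * Show the assignment state per event and domain, e.g. (illustrative)
+ * "mbm_total_bytes:0=e;1=_", where 'e' means a counter is assigned in that
+ * domain and '_' means none is.
+ */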
+int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v)
+{
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+ struct rdt_mon_domain *d;
+ struct rdtgroup *rdtgrp;
+ struct mon_evt *mevt;
+ int ret = 0;
+ bool sep;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+
+ rdt_last_cmd_clear();
+ if (!resctrl_arch_mbm_cntr_assign_enabled(r)) {
+ rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n");
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ for_each_mon_event(mevt) {
+ if (mevt->rid != r->rid || !mevt->enabled || !resctrl_is_mbm_event(mevt->evtid))
+ continue;
+
+ sep = false;
+ seq_printf(s, "%s:", mevt->name);
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
+ if (sep)
+ seq_putc(s, ';');
+
+ if (mbm_cntr_get(r, d, rdtgrp, mevt->evtid) < 0)
+ seq_printf(s, "%d=_", d->hdr.id);
+ else
+ seq_printf(s, "%d=e", d->hdr.id);
+
+ sep = true;
+ }
+ seq_putc(s, '\n');
+ }
+
+out_unlock:
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret;
+}
+
+/*
+ * mbm_get_mon_event_by_name() - Return the mon_evt entry for the matching
+ * event name.
+ */
+static struct mon_evt *mbm_get_mon_event_by_name(struct rdt_resource *r, char *name)
+{
+ struct mon_evt *mevt;
+
+ for_each_mon_event(mevt) {
+ if (mevt->rid == r->rid && mevt->enabled &&
+ resctrl_is_mbm_event(mevt->evtid) &&
+ !strcmp(mevt->name, name))
+ return mevt;
+ }
+
+ return NULL;
+}
+
+static int rdtgroup_modify_assign_state(char *assign, struct rdt_mon_domain *d,
+ struct rdtgroup *rdtgrp, struct mon_evt *mevt)
+{
+ int ret = 0;
+
+ if (!assign || strlen(assign) != 1)
+ return -EINVAL;
+
+ switch (*assign) {
+ case 'e':
+ ret = rdtgroup_assign_cntr_event(d, rdtgrp, mevt);
+ break;
+ case '_':
+ rdtgroup_unassign_cntr_event(d, rdtgrp, mevt);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+static int resctrl_parse_mbm_assignment(struct rdt_resource *r, struct rdtgroup *rdtgrp,
+ char *event, char *tok)
+{
+ struct rdt_mon_domain *d;
+ unsigned long dom_id = 0;
+ char *dom_str, *id_str;
+ struct mon_evt *mevt;
+ int ret;
+
+ mevt = mbm_get_mon_event_by_name(r, event);
+ if (!mevt) {
+ rdt_last_cmd_printf("Invalid event %s\n", event);
+ return -ENOENT;
+ }
+
+next:
+ if (!tok || tok[0] == '\0')
+ return 0;
+
+ /* Start processing the strings for each domain */
+ dom_str = strim(strsep(&tok, ";"));
+
+ id_str = strsep(&dom_str, "=");
+
+ /* Check for domain id '*' which means all domains */
+ if (id_str && *id_str == '*') {
+ ret = rdtgroup_modify_assign_state(dom_str, NULL, rdtgrp, mevt);
+ if (ret)
+ rdt_last_cmd_printf("Assign operation '%s:*=%s' failed\n",
+ event, dom_str);
+ return ret;
+ } else if (!id_str || kstrtoul(id_str, 10, &dom_id)) {
+ rdt_last_cmd_puts("Missing domain id\n");
+ return -EINVAL;
+ }
+
+ /* Verify if the dom_id is valid */
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
+ if (d->hdr.id == dom_id) {
+ ret = rdtgroup_modify_assign_state(dom_str, d, rdtgrp, mevt);
+ if (ret) {
+ rdt_last_cmd_printf("Assign operation '%s:%ld=%s' failed\n",
+ event, dom_id, dom_str);
+ return ret;
+ }
+ goto next;
+ }
+ }
+
+ rdt_last_cmd_printf("Invalid domain id %ld\n", dom_id);
+ return -EINVAL;
+}
+
+ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+ struct rdtgroup *rdtgrp;
+ char *token, *event;
+ int ret = 0;
+
+ /* Valid input requires a trailing newline */
+ if (nbytes == 0 || buf[nbytes - 1] != '\n')
+ return -EINVAL;
+
+ buf[nbytes - 1] = '\0';
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ rdtgroup_kn_unlock(of->kn);
+ return -ENOENT;
+ }
+ rdt_last_cmd_clear();
+
+ if (!resctrl_arch_mbm_cntr_assign_enabled(r)) {
+ rdt_last_cmd_puts("mbm_event mode is not enabled\n");
+ rdtgroup_kn_unlock(of->kn);
+ return -EINVAL;
+ }
+
+ while ((token = strsep(&buf, "\n")) != NULL) {
+ /*
+		 * The write command uses the following format:
+ * "<Event>:<Domain ID>=<Assignment state>"
+ * Extract the event name first.
+ */
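+		/*
+		 * For example (illustrative), "mbm_total_bytes:0=e" assigns a
+		 * counter for mbm_total_bytes in domain 0, and
+		 * "mbm_local_bytes:*=_" unassigns the local event in all
+		 * domains.
+		 */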
+ event = strsep(&token, ":");
+
+ ret = resctrl_parse_mbm_assignment(r, rdtgrp, event, token);
+ if (ret)
+ break;
+ }
+
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
}
/**
@@ -902,24 +1765,43 @@ int resctrl_mon_resource_init(void)
if (ret)
return ret;
- l3_mon_evt_init(r);
-
if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) {
- mbm_total_event.configurable = true;
+ mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].configurable = true;
resctrl_file_fflags_init("mbm_total_bytes_config",
RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
}
if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) {
- mbm_local_event.configurable = true;
+ mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].configurable = true;
resctrl_file_fflags_init("mbm_local_bytes_config",
RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
}
- if (resctrl_arch_is_mbm_local_enabled())
+ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID))
mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID;
- else if (resctrl_arch_is_mbm_total_enabled())
+ else if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID;
+ if (r->mon.mbm_cntr_assignable) {
+ if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
+ resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID);
+ if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID))
+ resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID);
+ mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask;
+ mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask &
+ (READS_TO_LOCAL_MEM |
+ READS_TO_LOCAL_S_MEM |
+ NON_TEMP_WRITE_TO_LOCAL_MEM);
+ r->mon.mbm_assign_on_mkdir = true;
+ resctrl_file_fflags_init("num_mbm_cntrs",
+ RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
+ resctrl_file_fflags_init("available_mbm_cntrs",
+ RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
+ resctrl_file_fflags_init("event_filter", RFTYPE_ASSIGN_CONFIG);
+ resctrl_file_fflags_init("mbm_assign_on_mkdir", RFTYPE_MON_INFO |
+ RFTYPE_RES_CACHE);
+ resctrl_file_fflags_init("mbm_L3_assignments", RFTYPE_MON_BASE);
+ }
+
return 0;
}
diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c
index 77d08229d855..0320360cd7a6 100644
--- a/fs/resctrl/rdtgroup.c
+++ b/fs/resctrl/rdtgroup.c
@@ -123,14 +123,8 @@ void rdt_staged_configs_clear(void)
static bool resctrl_is_mbm_enabled(void)
{
- return (resctrl_arch_is_mbm_total_enabled() ||
- resctrl_arch_is_mbm_local_enabled());
-}
-
-static bool resctrl_is_mbm_event(int e)
-{
- return (e >= QOS_L3_MBM_TOTAL_EVENT_ID &&
- e <= QOS_L3_MBM_LOCAL_EVENT_ID);
+ return (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID) ||
+ resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID));
}
/*
@@ -196,7 +190,7 @@ static int closid_alloc(void)
lockdep_assert_held(&rdtgroup_mutex);
if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) &&
- resctrl_arch_is_llc_occupancy_enabled()) {
+ resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) {
cleanest_closid = resctrl_find_cleanest_closid();
if (cleanest_closid < 0)
return cleanest_closid;
@@ -981,7 +975,7 @@ static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
return 0;
}
-static void *rdt_kn_parent_priv(struct kernfs_node *kn)
+void *rdt_kn_parent_priv(struct kernfs_node *kn)
{
/*
* The parent pointer is only valid within RCU section since it can be
@@ -1141,7 +1135,7 @@ static int rdt_num_rmids_show(struct kernfs_open_file *of,
{
struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
- seq_printf(seq, "%d\n", r->num_rmid);
+ seq_printf(seq, "%d\n", r->mon.num_rmid);
return 0;
}
@@ -1152,9 +1146,12 @@ static int rdt_mon_features_show(struct kernfs_open_file *of,
struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
struct mon_evt *mevt;
- list_for_each_entry(mevt, &r->evt_list, list) {
+ for_each_mon_event(mevt) {
+ if (mevt->rid != r->rid || !mevt->enabled)
+ continue;
seq_printf(seq, "%s\n", mevt->name);
- if (mevt->configurable)
+ if (mevt->configurable &&
+ !resctrl_arch_mbm_cntr_assign_enabled(r))
seq_printf(seq, "%s_config\n", mevt->name);
}
@@ -1735,9 +1732,9 @@ next:
}
/* Value from user cannot be more than the supported set of events */
- if ((val & r->mbm_cfg_mask) != val) {
+ if ((val & r->mon.mbm_cfg_mask) != val) {
rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n",
- r->mbm_cfg_mask);
+ r->mon.mbm_cfg_mask);
return -EINVAL;
}
@@ -1803,6 +1800,44 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of,
return ret ?: nbytes;
}
+/*
+ * resctrl_bmec_files_show() — Controls the visibility of BMEC-related resctrl
+ * files. When @show is true, the files are displayed; when false, the files
+ * are hidden.
+ * Don't treat kernfs_find_and_get failure as an error, since this function may
+ * be called regardless of whether BMEC is supported or the event is enabled.
+ */
+void resctrl_bmec_files_show(struct rdt_resource *r, struct kernfs_node *l3_mon_kn,
+ bool show)
+{
+ struct kernfs_node *kn_config, *mon_kn = NULL;
+ char name[32];
+
+ if (!l3_mon_kn) {
+ sprintf(name, "%s_MON", r->name);
+ mon_kn = kernfs_find_and_get(kn_info, name);
+ if (!mon_kn)
+ return;
+ l3_mon_kn = mon_kn;
+ }
+
+ kn_config = kernfs_find_and_get(l3_mon_kn, "mbm_total_bytes_config");
+ if (kn_config) {
+ kernfs_show(kn_config, show);
+ kernfs_put(kn_config);
+ }
+
+ kn_config = kernfs_find_and_get(l3_mon_kn, "mbm_local_bytes_config");
+ if (kn_config) {
+ kernfs_show(kn_config, show);
+ kernfs_put(kn_config);
+ }
+
+ /* Release the reference only if it was acquired */
+ if (mon_kn)
+ kernfs_put(mon_kn);
+}
+
/* rdtgroup information files for one cache resource. */
static struct rftype res_common_files[] = {
{
@@ -1813,6 +1848,13 @@ static struct rftype res_common_files[] = {
.fflags = RFTYPE_TOP_INFO,
},
{
+ .name = "mbm_assign_on_mkdir",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = resctrl_mbm_assign_on_mkdir_show,
+ .write = resctrl_mbm_assign_on_mkdir_write,
+ },
+ {
.name = "num_closids",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
@@ -1827,6 +1869,12 @@ static struct rftype res_common_files[] = {
.fflags = RFTYPE_MON_INFO,
},
{
+ .name = "available_mbm_cntrs",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = resctrl_available_mbm_cntrs_show,
+ },
+ {
.name = "num_rmids",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
@@ -1841,6 +1889,12 @@ static struct rftype res_common_files[] = {
.fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
},
{
+ .name = "num_mbm_cntrs",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = resctrl_num_mbm_cntrs_show,
+ },
+ {
.name = "min_cbm_bits",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
@@ -1916,6 +1970,28 @@ static struct rftype res_common_files[] = {
.write = mbm_local_bytes_config_write,
},
{
+ .name = "event_filter",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = event_filter_show,
+ .write = event_filter_write,
+ },
+ {
+ .name = "mbm_L3_assignments",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = mbm_L3_assignments_show,
+ .write = mbm_L3_assignments_write,
+ },
+ {
+ .name = "mbm_assign_mode",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = resctrl_mbm_assign_mode_show,
+ .write = resctrl_mbm_assign_mode_write,
+ .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE,
+ },
+ {
.name = "cpus",
.mode = 0644,
.kf_ops = &rdtgroup_kf_single_ops,
@@ -2168,10 +2244,48 @@ int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
return ret;
}
+static int resctrl_mkdir_event_configs(struct rdt_resource *r, struct kernfs_node *l3_mon_kn)
+{
+ struct kernfs_node *kn_subdir, *kn_subdir2;
+ struct mon_evt *mevt;
+ int ret;
+
+ kn_subdir = kernfs_create_dir(l3_mon_kn, "event_configs", l3_mon_kn->mode, NULL);
+ if (IS_ERR(kn_subdir))
+ return PTR_ERR(kn_subdir);
+
+ ret = rdtgroup_kn_set_ugid(kn_subdir);
+ if (ret)
+ return ret;
+
+ for_each_mon_event(mevt) {
+ if (mevt->rid != r->rid || !mevt->enabled || !resctrl_is_mbm_event(mevt->evtid))
+ continue;
+
+ kn_subdir2 = kernfs_create_dir(kn_subdir, mevt->name, kn_subdir->mode, mevt);
+ if (IS_ERR(kn_subdir2)) {
+ ret = PTR_ERR(kn_subdir2);
+ goto out;
+ }
+
+ ret = rdtgroup_kn_set_ugid(kn_subdir2);
+ if (ret)
+ goto out;
+
+ ret = rdtgroup_add_files(kn_subdir2, RFTYPE_ASSIGN_CONFIG);
+ if (ret)
+ break;
+ }
+
+out:
+ return ret;
+}
+
static int rdtgroup_mkdir_info_resdir(void *priv, char *name,
unsigned long fflags)
{
struct kernfs_node *kn_subdir;
+ struct rdt_resource *r;
int ret;
kn_subdir = kernfs_create_dir(kn_info, name,
@@ -2184,8 +2298,25 @@ static int rdtgroup_mkdir_info_resdir(void *priv, char *name,
return ret;
ret = rdtgroup_add_files(kn_subdir, fflags);
- if (!ret)
- kernfs_activate(kn_subdir);
+ if (ret)
+ return ret;
+
+ if ((fflags & RFTYPE_MON_INFO) == RFTYPE_MON_INFO) {
+ r = priv;
+ if (r->mon.mbm_cntr_assignable) {
+ ret = resctrl_mkdir_event_configs(r, kn_subdir);
+ if (ret)
+ return ret;
+ /*
+ * Hide BMEC-related files if mbm_event mode
+ * is enabled.
+ */
+ if (resctrl_arch_mbm_cntr_assign_enabled(r))
+ resctrl_bmec_files_show(r, kn_subdir, false);
+ }
+ }
+
+ kernfs_activate(kn_subdir);
return ret;
}
@@ -2608,10 +2739,8 @@ static int rdt_get_tree(struct fs_context *fc)
goto out_root;
ret = schemata_list_create();
- if (ret) {
- schemata_list_destroy();
- goto out_ctx;
- }
+ if (ret)
+ goto out_schemata_free;
ret = closid_init();
if (ret)
@@ -2637,6 +2766,8 @@ static int rdt_get_tree(struct fs_context *fc)
if (ret < 0)
goto out_info;
+ rdtgroup_assign_cntrs(&rdtgroup_default);
+
ret = mkdir_mondata_all(rdtgroup_default.kn,
&rdtgroup_default, &kn_mondata);
if (ret < 0)
@@ -2675,15 +2806,16 @@ out_mondata:
if (resctrl_arch_mon_capable())
kernfs_remove(kn_mondata);
out_mongrp:
- if (resctrl_arch_mon_capable())
+ if (resctrl_arch_mon_capable()) {
+ rdtgroup_unassign_cntrs(&rdtgroup_default);
kernfs_remove(kn_mongrp);
+ }
out_info:
kernfs_remove(kn_info);
out_closid_exit:
closid_exit();
out_schemata_free:
schemata_list_destroy();
-out_ctx:
rdt_disable_ctx();
out_root:
rdtgroup_destroy_root();
@@ -2822,6 +2954,7 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
head = &rdtgrp->mon.crdtgrp_list;
list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
+ rdtgroup_unassign_cntrs(sentry);
free_rmid(sentry->closid, sentry->mon.rmid);
list_del(&sentry->mon.crdtgrp_list);
@@ -2862,6 +2995,8 @@ static void rmdir_all_sub(void)
cpumask_or(&rdtgroup_default.cpu_mask,
&rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
+ rdtgroup_unassign_cntrs(rdtgrp);
+
free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
kernfs_remove(rdtgrp->kn);
@@ -2946,6 +3081,7 @@ static void resctrl_fs_teardown(void)
return;
rmdir_all_sub();
+ rdtgroup_unassign_cntrs(&rdtgroup_default);
mon_put_kn_priv();
rdt_pseudo_lock_release();
rdtgroup_default.mode = RDT_MODE_SHAREABLE;
@@ -3057,10 +3193,9 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d,
struct mon_evt *mevt;
int ret, domid;
- if (WARN_ON(list_empty(&r->evt_list)))
- return -EPERM;
-
- list_for_each_entry(mevt, &r->evt_list, list) {
+ for_each_mon_event(mevt) {
+ if (mevt->rid != r->rid || !mevt->enabled)
+ continue;
domid = do_sum ? d->ci_id : d->hdr.id;
priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum);
if (WARN_ON_ONCE(!priv))
@@ -3427,9 +3562,12 @@ static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp)
}
rdtgrp->mon.rmid = ret;
+ rdtgroup_assign_cntrs(rdtgrp);
+
ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
if (ret) {
rdt_last_cmd_puts("kernfs subdir error\n");
+ rdtgroup_unassign_cntrs(rdtgrp);
free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
return ret;
}
@@ -3439,8 +3577,10 @@ static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp)
static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp)
{
- if (resctrl_arch_mon_capable())
+ if (resctrl_arch_mon_capable()) {
+ rdtgroup_unassign_cntrs(rgrp);
free_rmid(rgrp->closid, rgrp->mon.rmid);
+ }
}
/*
@@ -3716,6 +3856,9 @@ static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
update_closid_rmid(tmpmask, NULL);
rdtgrp->flags = RDT_DELETED;
+
+ rdtgroup_unassign_cntrs(rdtgrp);
+
free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
/*
@@ -3763,6 +3906,8 @@ static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
update_closid_rmid(tmpmask, NULL);
+ rdtgroup_unassign_cntrs(rdtgrp);
+
free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
closid_free(rdtgrp->closid);
@@ -4022,9 +4167,14 @@ static void rdtgroup_setup_default(void)
static void domain_destroy_mon_state(struct rdt_mon_domain *d)
{
+ int idx;
+
+ kfree(d->cntr_cfg);
bitmap_free(d->rmid_busy_llc);
- kfree(d->mbm_total);
- kfree(d->mbm_local);
+ for_each_mbm_idx(idx) {
+ kfree(d->mbm_states[idx]);
+ d->mbm_states[idx] = NULL;
+ }
}
void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
@@ -4050,7 +4200,7 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d
if (resctrl_is_mbm_enabled())
cancel_delayed_work(&d->mbm_over);
- if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) {
+ if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) && has_busy_rmid(d)) {
/*
* When a package is going down, forcefully
* decrement rmid->ebusy. There is no way to know
@@ -4084,32 +4234,41 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d
static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d)
{
u32 idx_limit = resctrl_arch_system_num_rmid_idx();
- size_t tsize;
+ size_t tsize = sizeof(*d->mbm_states[0]);
+ enum resctrl_event_id eventid;
+ int idx;
- if (resctrl_arch_is_llc_occupancy_enabled()) {
+ if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) {
d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL);
if (!d->rmid_busy_llc)
return -ENOMEM;
}
- if (resctrl_arch_is_mbm_total_enabled()) {
- tsize = sizeof(*d->mbm_total);
- d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL);
- if (!d->mbm_total) {
- bitmap_free(d->rmid_busy_llc);
- return -ENOMEM;
- }
+
+ for_each_mbm_event_id(eventid) {
+ if (!resctrl_is_mon_event_enabled(eventid))
+ continue;
+ idx = MBM_STATE_IDX(eventid);
+ d->mbm_states[idx] = kcalloc(idx_limit, tsize, GFP_KERNEL);
+ if (!d->mbm_states[idx])
+ goto cleanup;
}
- if (resctrl_arch_is_mbm_local_enabled()) {
- tsize = sizeof(*d->mbm_local);
- d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL);
- if (!d->mbm_local) {
- bitmap_free(d->rmid_busy_llc);
- kfree(d->mbm_total);
- return -ENOMEM;
- }
+
+ if (resctrl_is_mbm_enabled() && r->mon.mbm_cntr_assignable) {
+ tsize = sizeof(*d->cntr_cfg);
+ d->cntr_cfg = kcalloc(r->mon.num_mbm_cntrs, tsize, GFP_KERNEL);
+ if (!d->cntr_cfg)
+ goto cleanup;
}
return 0;
+cleanup:
+ bitmap_free(d->rmid_busy_llc);
+ for_each_mbm_idx(idx) {
+ kfree(d->mbm_states[idx]);
+ d->mbm_states[idx] = NULL;
+ }
+
+ return -ENOMEM;
}
int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
@@ -4144,7 +4303,7 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
RESCTRL_PICK_ANY_CPU);
}
- if (resctrl_arch_is_llc_occupancy_enabled())
+ if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID))
INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
/*
@@ -4219,7 +4378,7 @@ void resctrl_offline_cpu(unsigned int cpu)
cancel_delayed_work(&d->mbm_over);
mbm_setup_overflow_handler(d, 0, cpu);
}
- if (resctrl_arch_is_llc_occupancy_enabled() &&
+ if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) &&
cpu == d->cqm_work_cpu && has_busy_rmid(d)) {
cancel_delayed_work(&d->cqm_limbo);
cqm_setup_limbo_handler(d, 0, cpu);
diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c
index beb4f18f05ef..35c4d27d2cc0 100644
--- a/fs/smb/client/cifs_debug.c
+++ b/fs/smb/client/cifs_debug.c
@@ -24,6 +24,7 @@
#endif
#ifdef CONFIG_CIFS_SMB_DIRECT
#include "smbdirect.h"
+#include "../common/smbdirect/smbdirect_pdu.h"
#endif
#include "cifs_swn.h"
#include "cached_dir.h"
@@ -304,6 +305,8 @@ static int cifs_debug_dirs_proc_show(struct seq_file *m, void *v)
list_for_each(tmp1, &ses->tcon_list) {
tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
cfids = tcon->cfids;
+ if (!cfids)
+ continue;
spin_lock(&cfids->cfid_list_lock); /* check lock ordering */
seq_printf(m, "Num entries: %d\n", cfids->num_entries);
list_for_each_entry(cfid, &cfids->entries, entry) {
@@ -319,8 +322,6 @@ static int cifs_debug_dirs_proc_show(struct seq_file *m, void *v)
seq_printf(m, "\n");
}
spin_unlock(&cfids->cfid_list_lock);
-
-
}
}
}
@@ -347,6 +348,22 @@ static __always_inline const char *compression_alg_str(__le16 alg)
}
}
+static __always_inline const char *cipher_alg_str(__le16 cipher)
+{
+ switch (cipher) {
+ case SMB2_ENCRYPTION_AES128_CCM:
+ return "AES128-CCM";
+ case SMB2_ENCRYPTION_AES128_GCM:
+ return "AES128-GCM";
+ case SMB2_ENCRYPTION_AES256_CCM:
+ return "AES256-CCM";
+ case SMB2_ENCRYPTION_AES256_GCM:
+ return "AES256-GCM";
+ default:
+ return "UNKNOWN";
+ }
+}
+
static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
{
struct mid_q_entry *mid_entry;
@@ -440,57 +457,55 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
sc = &server->smbd_conn->socket;
sp = &sc->parameters;
- seq_printf(m, "\nSMBDirect (in hex) protocol version: %x "
- "transport status: %x",
- server->smbd_conn->protocol,
- server->smbd_conn->socket.status);
- seq_printf(m, "\nConn receive_credit_max: %x "
- "send_credit_target: %x max_send_size: %x",
+ seq_printf(m, "\nSMBDirect protocol version: 0x%x "
+ "transport status: %s (%u)",
+ SMBDIRECT_V1,
+ smbdirect_socket_status_string(sc->status),
+ sc->status);
+ seq_printf(m, "\nConn receive_credit_max: %u "
+ "send_credit_target: %u max_send_size: %u",
sp->recv_credit_max,
sp->send_credit_target,
sp->max_send_size);
- seq_printf(m, "\nConn max_fragmented_recv_size: %x "
- "max_fragmented_send_size: %x max_receive_size:%x",
+ seq_printf(m, "\nConn max_fragmented_recv_size: %u "
+ "max_fragmented_send_size: %u max_receive_size:%u",
sp->max_fragmented_recv_size,
sp->max_fragmented_send_size,
sp->max_recv_size);
- seq_printf(m, "\nConn keep_alive_interval: %x "
- "max_readwrite_size: %x rdma_readwrite_threshold: %x",
+ seq_printf(m, "\nConn keep_alive_interval: %u "
+ "max_readwrite_size: %u rdma_readwrite_threshold: %u",
sp->keepalive_interval_msec * 1000,
sp->max_read_write_size,
- server->smbd_conn->rdma_readwrite_threshold);
- seq_printf(m, "\nDebug count_get_receive_buffer: %x "
- "count_put_receive_buffer: %x count_send_empty: %x",
- server->smbd_conn->count_get_receive_buffer,
- server->smbd_conn->count_put_receive_buffer,
- server->smbd_conn->count_send_empty);
- seq_printf(m, "\nRead Queue count_reassembly_queue: %x "
- "count_enqueue_reassembly_queue: %x "
- "count_dequeue_reassembly_queue: %x "
- "reassembly_data_length: %x "
- "reassembly_queue_length: %x",
- server->smbd_conn->count_reassembly_queue,
- server->smbd_conn->count_enqueue_reassembly_queue,
- server->smbd_conn->count_dequeue_reassembly_queue,
+ server->rdma_readwrite_threshold);
+ seq_printf(m, "\nDebug count_get_receive_buffer: %llu "
+ "count_put_receive_buffer: %llu count_send_empty: %llu",
+ sc->statistics.get_receive_buffer,
+ sc->statistics.put_receive_buffer,
+ sc->statistics.send_empty);
+ seq_printf(m, "\nRead Queue "
+ "count_enqueue_reassembly_queue: %llu "
+ "count_dequeue_reassembly_queue: %llu "
+ "reassembly_data_length: %u "
+ "reassembly_queue_length: %u",
+ sc->statistics.enqueue_reassembly_queue,
+ sc->statistics.dequeue_reassembly_queue,
sc->recv_io.reassembly.data_length,
sc->recv_io.reassembly.queue_length);
- seq_printf(m, "\nCurrent Credits send_credits: %x "
- "receive_credits: %x receive_credit_target: %x",
- atomic_read(&server->smbd_conn->send_credits),
- atomic_read(&server->smbd_conn->receive_credits),
- server->smbd_conn->receive_credit_target);
- seq_printf(m, "\nPending send_pending: %x ",
- atomic_read(&server->smbd_conn->send_pending));
- seq_printf(m, "\nReceive buffers count_receive_queue: %x ",
- server->smbd_conn->count_receive_queue);
- seq_printf(m, "\nMR responder_resources: %x "
- "max_frmr_depth: %x mr_type: %x",
- server->smbd_conn->responder_resources,
- server->smbd_conn->max_frmr_depth,
- server->smbd_conn->mr_type);
- seq_printf(m, "\nMR mr_ready_count: %x mr_used_count: %x",
- atomic_read(&server->smbd_conn->mr_ready_count),
- atomic_read(&server->smbd_conn->mr_used_count));
+ seq_printf(m, "\nCurrent Credits send_credits: %u "
+ "receive_credits: %u receive_credit_target: %u",
+ atomic_read(&sc->send_io.credits.count),
+ atomic_read(&sc->recv_io.credits.count),
+ sc->recv_io.credits.target);
+ seq_printf(m, "\nPending send_pending: %u ",
+ atomic_read(&sc->send_io.pending.count));
+ seq_printf(m, "\nMR responder_resources: %u "
+ "max_frmr_depth: %u mr_type: 0x%x",
+ sp->responder_resources,
+ sp->max_frmr_depth,
+ sc->mr_io.type);
+ seq_printf(m, "\nMR mr_ready_count: %u mr_used_count: %u",
+ atomic_read(&sc->mr_io.ready.count),
+ atomic_read(&sc->mr_io.used.count));
skip_rdma:
#endif
seq_printf(m, "\nNumber of credits: %d,%d,%d Dialect 0x%x",
@@ -539,6 +554,11 @@ skip_rdma:
else
seq_puts(m, "disabled (not supported by this server)");
+ /* Show negotiated encryption cipher, even if not required */
+ seq_puts(m, "\nEncryption: ");
+ if (server->cipher_type)
+ seq_printf(m, "Negotiated cipher (%s)", cipher_alg_str(server->cipher_type));
+
seq_printf(m, "\n\n\tSessions: ");
i = 0;
list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
@@ -576,12 +596,8 @@ skip_rdma:
/* dump session id helpful for use with network trace */
seq_printf(m, " SessionId: 0x%llx", ses->Suid);
- if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) {
+ if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
seq_puts(m, " encrypted");
- /* can help in debugging to show encryption type */
- if (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)
- seq_puts(m, "(gcm256)");
- }
if (ses->sign)
seq_puts(m, " signed");
diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c
index bc1c1e9b288a..43b86fa4d695 100644
--- a/fs/smb/client/cifs_spnego.c
+++ b/fs/smb/client/cifs_spnego.c
@@ -124,55 +124,44 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo,
dp = description;
/* start with version and hostname portion of UNC string */
spnego_key = ERR_PTR(-EINVAL);
- sprintf(dp, "ver=0x%x;host=%s;", CIFS_SPNEGO_UPCALL_VERSION,
- hostname);
- dp = description + strlen(description);
+ dp += sprintf(dp, "ver=0x%x;host=%s;", CIFS_SPNEGO_UPCALL_VERSION,
+ hostname);
/* add the server address */
if (server->dstaddr.ss_family == AF_INET)
- sprintf(dp, "ip4=%pI4", &sa->sin_addr);
+ dp += sprintf(dp, "ip4=%pI4", &sa->sin_addr);
else if (server->dstaddr.ss_family == AF_INET6)
- sprintf(dp, "ip6=%pI6", &sa6->sin6_addr);
+ dp += sprintf(dp, "ip6=%pI6", &sa6->sin6_addr);
else
goto out;
- dp = description + strlen(description);
-
/* for now, only sec=krb5 and sec=mskrb5 and iakerb are valid */
if (server->sec_kerberos)
- sprintf(dp, ";sec=krb5");
+ dp += sprintf(dp, ";sec=krb5");
else if (server->sec_mskerberos)
- sprintf(dp, ";sec=mskrb5");
+ dp += sprintf(dp, ";sec=mskrb5");
else if (server->sec_iakerb)
- sprintf(dp, ";sec=iakerb");
+ dp += sprintf(dp, ";sec=iakerb");
else {
cifs_dbg(VFS, "unknown or missing server auth type, use krb5\n");
- sprintf(dp, ";sec=krb5");
+ dp += sprintf(dp, ";sec=krb5");
}
- dp = description + strlen(description);
- sprintf(dp, ";uid=0x%x",
- from_kuid_munged(&init_user_ns, sesInfo->linux_uid));
+ dp += sprintf(dp, ";uid=0x%x",
+ from_kuid_munged(&init_user_ns, sesInfo->linux_uid));
- dp = description + strlen(description);
- sprintf(dp, ";creduid=0x%x",
+ dp += sprintf(dp, ";creduid=0x%x",
from_kuid_munged(&init_user_ns, sesInfo->cred_uid));
- if (sesInfo->user_name) {
- dp = description + strlen(description);
- sprintf(dp, ";user=%s", sesInfo->user_name);
- }
+ if (sesInfo->user_name)
+ dp += sprintf(dp, ";user=%s", sesInfo->user_name);
- dp = description + strlen(description);
- sprintf(dp, ";pid=0x%x", current->pid);
+ dp += sprintf(dp, ";pid=0x%x", current->pid);
- if (sesInfo->upcall_target == UPTARGET_MOUNT) {
- dp = description + strlen(description);
- sprintf(dp, ";upcall_target=mount");
- } else {
- dp = description + strlen(description);
- sprintf(dp, ";upcall_target=app");
- }
+ if (sesInfo->upcall_target == UPTARGET_MOUNT)
+ dp += sprintf(dp, ";upcall_target=mount");
+ else
+ dp += sprintf(dp, ";upcall_target=app");
cifs_dbg(FYI, "key description = %s\n", description);
saved_cred = override_creds(spnego_cred);
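
The rewrite above relies on sprintf() returning the number of characters written (excluding the terminating NUL), so the write pointer can be advanced directly instead of recomputing strlen() after every field. A minimal userspace sketch of the same idiom (the buffer name and field values here are made up for illustration):

#include <stdio.h>

int main(void)
{
	char desc[64];
	char *dp = desc;

	/* each call appends one field and advances dp past what it wrote */
	dp += sprintf(dp, "ver=0x%x;", 2);
	dp += sprintf(dp, "host=%s;", "example");
	dp += sprintf(dp, "uid=0x%x", 1000);

	printf("%s (length %zu)\n", desc, (size_t)(dp - desc));
	return 0;
}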
diff --git a/fs/smb/client/cifs_unicode.c b/fs/smb/client/cifs_unicode.c
index 4cc6e0896fad..f8659d36793f 100644
--- a/fs/smb/client/cifs_unicode.c
+++ b/fs/smb/client/cifs_unicode.c
@@ -629,6 +629,9 @@ cifs_strndup_to_utf16(const char *src, const int maxlen, int *utf16_len,
int len;
__le16 *dst;
+ if (!src)
+ return NULL;
+
len = cifs_local_to_utf16_bytes(src, maxlen, cp);
len += 2; /* NULL */
dst = kmalloc(len, GFP_KERNEL);
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 3bd85ab2deb1..dcb39d1b5958 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -857,7 +857,7 @@ static int cifs_drop_inode(struct inode *inode)
/* no serverino => unconditional eviction */
return !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) ||
- generic_drop_inode(inode);
+ inode_generic_drop(inode);
}
static const struct super_operations cifs_super_ops = {
@@ -1358,6 +1358,20 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
truncate_setsize(target_inode, new_size);
fscache_resize_cookie(cifs_inode_cookie(target_inode),
new_size);
+ } else if (rc == -EOPNOTSUPP) {
+ /*
+ * copy_file_range syscall man page indicates EINVAL
+ * is returned e.g. when "fd_in and fd_out refer to the
+ * same file and the source and target ranges overlap."
+ * Test generic/157 showed the cases where we need to
+ * remap EOPNOTSUPP to EINVAL.
+ */
+ if (off >= src_inode->i_size) {
+ rc = -EINVAL;
+ } else if (src_inode == target_inode) {
+ if (off + len > destoff)
+ rc = -EINVAL;
+ }
}
if (rc == 0 && new_size > target_cifsi->netfs.zero_point)
target_cifsi->netfs.zero_point = new_size;
@@ -1881,7 +1895,9 @@ init_cifs(void)
cifs_dbg(VFS, "dir_cache_timeout set to max of 65000 seconds\n");
}
- cifsiod_wq = alloc_workqueue("cifsiod", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ cifsiod_wq = alloc_workqueue("cifsiod",
+ WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU,
+ 0);
if (!cifsiod_wq) {
rc = -ENOMEM;
goto out_clean_proc;
@@ -1909,28 +1925,32 @@ init_cifs(void)
}
cifsoplockd_wq = alloc_workqueue("cifsoplockd",
- WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU,
+ 0);
if (!cifsoplockd_wq) {
rc = -ENOMEM;
goto out_destroy_fileinfo_put_wq;
}
deferredclose_wq = alloc_workqueue("deferredclose",
- WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU,
+ 0);
if (!deferredclose_wq) {
rc = -ENOMEM;
goto out_destroy_cifsoplockd_wq;
}
serverclose_wq = alloc_workqueue("serverclose",
- WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU,
+ 0);
if (!serverclose_wq) {
rc = -ENOMEM;
goto out_destroy_deferredclose_wq;
}
cfid_put_wq = alloc_workqueue("cfid_put_wq",
- WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU,
+ 0);
if (!cfid_put_wq) {
rc = -ENOMEM;
goto out_destroy_serverclose_wq;
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index 487f39cff77e..3ce7c614ccc0 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -145,6 +145,6 @@ extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
/* when changing internal version - update following two lines at same time */
-#define SMB3_PRODUCT_BUILD 55
-#define CIFS_VERSION "2.55"
+#define SMB3_PRODUCT_BUILD 56
+#define CIFS_VERSION "2.56"
#endif /* _CIFSFS_H */
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index e6830ab3a546..3ac254e123dc 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -87,7 +87,7 @@
#define SMB_INTERFACE_POLL_INTERVAL 600
/* maximum number of PDUs in one compound */
-#define MAX_COMPOUND 7
+#define MAX_COMPOUND 10
/*
* Default number of credits to keep available for SMB3.
@@ -814,6 +814,13 @@ struct TCP_Server_Info {
unsigned int max_read;
unsigned int max_write;
unsigned int min_offload;
+ /*
+ * If payload is less than or equal to the threshold,
+ * use RDMA send/recv to send upper layer I/O.
+ * If payload is more than the threshold,
+ * use RDMA read/write through memory registration for I/O.
+ */
+ unsigned int rdma_readwrite_threshold;
unsigned int retrans;
struct {
bool requested; /* "compress" mount option set*/
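
The new rdma_readwrite_threshold field encodes a simple size cut-off between the two SMB Direct data paths described in the comment above. A minimal sketch of that decision, using a stand-in struct rather than the real TCP_Server_Info (the helper and type names here are illustrative, not part of the patch):

#include <stdbool.h>
#include <stddef.h>

struct xfer_params {
	size_t rdma_readwrite_threshold;
};

/* payloads at or below the threshold go inline over RDMA send/recv;
 * larger payloads are transferred via memory registration and
 * RDMA read/write */
static bool use_rdma_read_write(const struct xfer_params *p, size_t payload_len)
{
	return payload_len > p->rdma_readwrite_threshold;
}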
@@ -1540,7 +1547,7 @@ struct cifs_io_subrequest {
struct kvec iov[2];
struct TCP_Server_Info *server;
#ifdef CONFIG_CIFS_SMB_DIRECT
- struct smbd_mr *mr;
+ struct smbdirect_mr_io *mr;
#endif
struct cifs_credits credits;
};
@@ -1732,6 +1739,7 @@ struct mid_q_entry {
int mid_rc; /* rc for MID_RC */
__le16 command; /* smb command code */
unsigned int optype; /* operation type */
+ spinlock_t mid_lock;
bool wait_cancelled:1; /* Cancelled while waiting for response */
bool deleted_from_q:1; /* Whether Mid has been dequeued from pending_mid_q */
bool large_buf:1; /* if valid response, is pointer to large buf */
@@ -1881,9 +1889,12 @@ static inline bool is_replayable_error(int error)
/* cifs_get_writable_file() flags */
-#define FIND_WR_ANY 0
-#define FIND_WR_FSUID_ONLY 1
-#define FIND_WR_WITH_DELETE 2
+enum cifs_writable_file_flags {
+ FIND_WR_ANY = 0U,
+ FIND_WR_FSUID_ONLY = (1U << 0),
+ FIND_WR_WITH_DELETE = (1U << 1),
+ FIND_WR_NO_PENDING_DELETE = (1U << 2),
+};
#define MID_FREE 0
#define MID_REQUEST_ALLOCATED 1
@@ -2036,6 +2047,9 @@ require use of the stronger protocol */
* cifsFileInfo->file_info_lock cifsFileInfo->count cifs_new_fileinfo
* ->invalidHandle initiate_cifs_search
* ->oplock_break_cancelled
+ * mid_q_entry->mid_lock mid_q_entry->callback alloc_mid
+ * smb2_mid_entry_alloc
+ * (Any fields of mid_q_entry that will need protection)
****************************************************************************/
#ifdef DECLARE_GLOBALS_HERE
@@ -2339,6 +2353,8 @@ struct smb2_compound_vars {
struct kvec qi_iov;
struct kvec io_iov[SMB2_IOCTL_IOV_SIZE];
struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE];
+ struct kvec unlink_iov[SMB2_SET_INFO_IOV_SIZE];
+ struct kvec rename_iov[SMB2_SET_INFO_IOV_SIZE];
struct kvec close_iov;
struct smb2_file_rename_info_hdr rename_info;
struct smb2_file_link_info_hdr link_info;
@@ -2375,6 +2391,23 @@ static inline bool cifs_netbios_name(const char *name, size_t namelen)
return ret;
}
+/*
+ * Execute mid callback atomically - ensures callback runs exactly once
+ * and prevents sleeping in atomic context.
+ */
+static inline void mid_execute_callback(struct mid_q_entry *mid)
+{
+ void (*callback)(struct mid_q_entry *mid);
+
+ spin_lock(&mid->mid_lock);
+ callback = mid->callback;
+ mid->callback = NULL; /* Mark as executed */
+ spin_unlock(&mid->mid_lock);
+
+ if (callback)
+ callback(mid);
+}
+
#define CIFS_REPARSE_SUPPORT(tcon) \
((tcon)->posix_extensions || \
(le32_to_cpu((tcon)->fsAttrInfo.Attributes) & \
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index c34c533b2efa..e8fba98690ce 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -312,8 +312,8 @@ extern void cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode);
extern void cifs_close_all_deferred_files(struct cifs_tcon *cifs_tcon);
-extern void cifs_close_deferred_file_under_dentry(struct cifs_tcon *cifs_tcon,
- const char *path);
+void cifs_close_deferred_file_under_dentry(struct cifs_tcon *cifs_tcon,
+ struct dentry *dentry);
extern void cifs_mark_open_handles_for_deleted_file(struct inode *inode,
const char *path);
diff --git a/fs/smb/client/cifstransport.c b/fs/smb/client/cifstransport.c
index 352dafb888dd..e98b95eff8c9 100644
--- a/fs/smb/client/cifstransport.c
+++ b/fs/smb/client/cifstransport.c
@@ -46,6 +46,7 @@ alloc_mid(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS);
memset(temp, 0, sizeof(struct mid_q_entry));
kref_init(&temp->refcount);
+ spin_lock_init(&temp->mid_lock);
temp->mid = get_mid(smb_buffer);
temp->pid = current->pid;
temp->command = cpu_to_le16(smb_buffer->Command);
@@ -345,16 +346,15 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
rc = wait_for_response(server, midQ);
if (rc != 0) {
send_cancel(server, &rqst, midQ);
- spin_lock(&server->mid_queue_lock);
- if (midQ->mid_state == MID_REQUEST_SUBMITTED ||
- midQ->mid_state == MID_RESPONSE_RECEIVED) {
+ spin_lock(&midQ->mid_lock);
+ if (midQ->callback) {
/* no longer considered to be "in-flight" */
midQ->callback = release_mid;
- spin_unlock(&server->mid_queue_lock);
+ spin_unlock(&midQ->mid_lock);
add_credits(server, &credits, 0);
return rc;
}
- spin_unlock(&server->mid_queue_lock);
+ spin_unlock(&midQ->mid_lock);
}
rc = cifs_sync_mid_result(midQ, server);
@@ -527,15 +527,14 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
rc = wait_for_response(server, midQ);
if (rc) {
send_cancel(server, &rqst, midQ);
- spin_lock(&server->mid_queue_lock);
- if (midQ->mid_state == MID_REQUEST_SUBMITTED ||
- midQ->mid_state == MID_RESPONSE_RECEIVED) {
+ spin_lock(&midQ->mid_lock);
+ if (midQ->callback) {
/* no longer considered to be "in-flight" */
midQ->callback = release_mid;
- spin_unlock(&server->mid_queue_lock);
+ spin_unlock(&midQ->mid_lock);
return rc;
}
- spin_unlock(&server->mid_queue_lock);
+ spin_unlock(&midQ->mid_lock);
}
/* We got the response - restart system call. */
diff --git a/fs/smb/client/compress.c b/fs/smb/client/compress.c
index 766b4de13da7..db709f5cd2e1 100644
--- a/fs/smb/client/compress.c
+++ b/fs/smb/client/compress.c
@@ -155,58 +155,29 @@ static int cmp_bkt(const void *_a, const void *_b)
}
/*
- * TODO:
- * Support other iter types, if required.
- * Only ITER_XARRAY is supported for now.
+ * Collect some 2K samples with 2K gaps between.
*/
-static int collect_sample(const struct iov_iter *iter, ssize_t max, u8 *sample)
+static int collect_sample(const struct iov_iter *source, ssize_t max, u8 *sample)
{
- struct folio *folios[16], *folio;
- unsigned int nr, i, j, npages;
- loff_t start = iter->xarray_start + iter->iov_offset;
- pgoff_t last, index = start / PAGE_SIZE;
- size_t len, off, foff;
- void *p;
- int s = 0;
-
- last = (start + max - 1) / PAGE_SIZE;
- do {
- nr = xa_extract(iter->xarray, (void **)folios, index, last, ARRAY_SIZE(folios),
- XA_PRESENT);
- if (nr == 0)
- return -EIO;
-
- for (i = 0; i < nr; i++) {
- folio = folios[i];
- npages = folio_nr_pages(folio);
- foff = start - folio_pos(folio);
- off = foff % PAGE_SIZE;
-
- for (j = foff / PAGE_SIZE; j < npages; j++) {
- size_t len2;
-
- len = min_t(size_t, max, PAGE_SIZE - off);
- len2 = min_t(size_t, len, SZ_2K);
-
- p = kmap_local_page(folio_page(folio, j));
- memcpy(&sample[s], p, len2);
- kunmap_local(p);
-
- s += len2;
-
- if (len2 < SZ_2K || s >= max - SZ_2K)
- return s;
-
- max -= len;
- if (max <= 0)
- return s;
-
- start += len;
- off = 0;
- index++;
- }
- }
- } while (nr == ARRAY_SIZE(folios));
+ struct iov_iter iter = *source;
+ size_t s = 0;
+
+ while (iov_iter_count(&iter) >= SZ_2K) {
+ size_t part = umin(umin(iov_iter_count(&iter), SZ_2K), max);
+ size_t n;
+
+ n = copy_from_iter(sample + s, part, &iter);
+ if (n != part)
+ return -EFAULT;
+
+ s += n;
+ max -= n;
+
+ if (iov_iter_count(&iter) < PAGE_SIZE - SZ_2K)
+ break;
+
+ iov_iter_advance(&iter, SZ_2K);
+ }
return s;
}
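
The iov_iter based rewrite keeps the original sampling pattern: each pass copies up to 2K of data and then skips ahead another 2K, so for a large source the sampled bytes come from offsets 0, 4K, 8K, and so on. A minimal sketch of the same pattern over a flat buffer (not the real iov_iter interface; names are illustrative):

#include <string.h>

#define SZ_2K 2048

static size_t sample_2k_with_gaps(const unsigned char *src, size_t srclen,
				  unsigned char *sample, size_t max)
{
	size_t off = 0, s = 0;

	while (off + SZ_2K <= srclen && max > 0) {
		size_t part = max < SZ_2K ? max : SZ_2K;

		memcpy(sample + s, src + off, part);
		s += part;
		max -= part;
		off += part + SZ_2K;	/* the chunk just copied plus a 2K gap */
	}
	return s;
}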
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index 587845a2452d..dd12f3eb61dc 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -335,7 +335,7 @@ cifs_abort_connection(struct TCP_Server_Info *server)
cifs_dbg(FYI, "%s: issuing mid callbacks\n", __func__);
list_for_each_entry_safe(mid, nmid, &retry_list, qhead) {
list_del_init(&mid->qhead);
- mid->callback(mid);
+ mid_execute_callback(mid);
release_mid(mid);
}
@@ -919,7 +919,7 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type)
list_del_init(&mid->qhead);
mid->mid_rc = mid_rc;
mid->mid_state = MID_RC;
- mid->callback(mid);
+ mid_execute_callback(mid);
release_mid(mid);
}
@@ -1117,7 +1117,7 @@ clean_demultiplex_info(struct TCP_Server_Info *server)
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
cifs_dbg(FYI, "Callback mid %llu\n", mid_entry->mid);
list_del_init(&mid_entry->qhead);
- mid_entry->callback(mid_entry);
+ mid_execute_callback(mid_entry);
release_mid(mid_entry);
}
/* 1/8th of sec is more than enough time for them to exit */
@@ -1394,7 +1394,7 @@ next_pdu:
}
if (!mids[i]->multiRsp || mids[i]->multiEnd)
- mids[i]->callback(mids[i]);
+ mid_execute_callback(mids[i]);
release_mid(mids[i]);
} else if (server->ops->is_oplock_break &&
@@ -4205,7 +4205,6 @@ retry:
return 0;
}
- server->lstrp = jiffies;
server->tcpStatus = CifsInNegotiate;
server->neg_start = jiffies;
spin_unlock(&server->srv_lock);
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 186e061068be..a5ed742afa00 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -97,8 +97,12 @@ retry:
cifs_trace_rw_credits_write_prepare);
#ifdef CONFIG_CIFS_SMB_DIRECT
- if (server->smbd_conn)
- stream->sreq_max_segs = server->smbd_conn->max_frmr_depth;
+ if (server->smbd_conn) {
+ const struct smbdirect_socket_parameters *sp =
+ smbd_get_parameters(server->smbd_conn);
+
+ stream->sreq_max_segs = sp->max_frmr_depth;
+ }
#endif
}
@@ -187,8 +191,12 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq)
cifs_trace_rw_credits_read_submit);
#ifdef CONFIG_CIFS_SMB_DIRECT
- if (server->smbd_conn)
- rreq->io_streams[0].sreq_max_segs = server->smbd_conn->max_frmr_depth;
+ if (server->smbd_conn) {
+ const struct smbdirect_socket_parameters *sp =
+ smbd_get_parameters(server->smbd_conn);
+
+ rreq->io_streams[0].sreq_max_segs = sp->max_frmr_depth;
+ }
#endif
return 0;
}
@@ -998,7 +1006,10 @@ int cifs_open(struct inode *inode, struct file *file)
/* Get the cached handle as SMB2 close is deferred */
if (OPEN_FMODE(file->f_flags) & FMODE_WRITE) {
- rc = cifs_get_writable_path(tcon, full_path, FIND_WR_FSUID_ONLY, &cfile);
+ rc = cifs_get_writable_path(tcon, full_path,
+ FIND_WR_FSUID_ONLY |
+ FIND_WR_NO_PENDING_DELETE,
+ &cfile);
} else {
rc = cifs_get_readable_path(tcon, full_path, &cfile);
}
@@ -2530,6 +2541,9 @@ refind_writable:
continue;
if (with_delete && !(open_file->fid.access & DELETE))
continue;
+ if ((flags & FIND_WR_NO_PENDING_DELETE) &&
+ open_file->status_file_deleted)
+ continue;
if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
if (!open_file->invalidHandle) {
/* found a good writable file */
@@ -2647,6 +2661,16 @@ cifs_get_readable_path(struct cifs_tcon *tcon, const char *name,
spin_unlock(&tcon->open_file_lock);
free_dentry_path(page);
*ret_file = find_readable_file(cinode, 0);
+ if (*ret_file) {
+ spin_lock(&cinode->open_file_lock);
+ if ((*ret_file)->status_file_deleted) {
+ spin_unlock(&cinode->open_file_lock);
+ cifsFileInfo_put(*ret_file);
+ *ret_file = NULL;
+ } else {
+ spin_unlock(&cinode->open_file_lock);
+ }
+ }
return *ret_file ? 0 : -ENOENT;
}
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index 75be4b46bc6f..7e9784080501 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -1931,7 +1931,7 @@ cifs_drop_nlink(struct inode *inode)
* but will return the EACCES to the caller. Note that the VFS does not call
* unlink on negative dentries currently.
*/
-int cifs_unlink(struct inode *dir, struct dentry *dentry)
+static int __cifs_unlink(struct inode *dir, struct dentry *dentry, bool sillyrename)
{
int rc = 0;
unsigned int xid;
@@ -1943,15 +1943,24 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct tcon_link *tlink;
struct cifs_tcon *tcon;
+ __u32 dosattr = 0, origattr = 0;
struct TCP_Server_Info *server;
struct iattr *attrs = NULL;
- __u32 dosattr = 0, origattr = 0;
+ bool rehash = false;
cifs_dbg(FYI, "cifs_unlink, dir=0x%p, dentry=0x%p\n", dir, dentry);
if (unlikely(cifs_forced_shutdown(cifs_sb)))
return -EIO;
+ /* Unhash dentry in advance to prevent any concurrent opens */
+ spin_lock(&dentry->d_lock);
+ if (!d_unhashed(dentry)) {
+ __d_drop(dentry);
+ rehash = true;
+ }
+ spin_unlock(&dentry->d_lock);
+
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
return PTR_ERR(tlink);
@@ -1975,7 +1984,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
}
netfs_wait_for_outstanding_io(inode);
- cifs_close_deferred_file_under_dentry(tcon, full_path);
+ cifs_close_deferred_file_under_dentry(tcon, dentry);
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP &
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
@@ -1994,7 +2003,24 @@ retry_std_delete:
goto psx_del_no_retry;
}
- rc = server->ops->unlink(xid, tcon, full_path, cifs_sb, dentry);
+ /* For SMB2+, if the file is open, we always perform a silly rename.
+ *
+ * We check for d_count() right after calling
+ * cifs_close_deferred_file_under_dentry() to make sure that the
+ * dentry's refcount gets dropped in case the file had any deferred
+ * close.
+ */
+ if (!sillyrename && server->vals->protocol_id > SMB10_PROT_ID) {
+ spin_lock(&dentry->d_lock);
+ if (d_count(dentry) > 1)
+ sillyrename = true;
+ spin_unlock(&dentry->d_lock);
+ }
+
+ if (sillyrename)
+ rc = -EBUSY;
+ else
+ rc = server->ops->unlink(xid, tcon, full_path, cifs_sb, dentry);
psx_del_no_retry:
if (!rc) {
@@ -2003,7 +2029,8 @@ psx_del_no_retry:
cifs_drop_nlink(inode);
}
} else if (rc == -ENOENT) {
- d_drop(dentry);
+ if (simple_positive(dentry))
+ d_delete(dentry);
} else if (rc == -EBUSY) {
if (server->ops->rename_pending_delete) {
rc = server->ops->rename_pending_delete(full_path,
@@ -2056,9 +2083,16 @@ unlink_out:
kfree(attrs);
free_xid(xid);
cifs_put_tlink(tlink);
+ if (rehash)
+ d_rehash(dentry);
return rc;
}
+int cifs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ return __cifs_unlink(dir, dentry, false);
+}
+
static int
cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode,
const char *full_path, struct cifs_sb_info *cifs_sb,
@@ -2346,14 +2380,16 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
rc = server->ops->rmdir(xid, tcon, full_path, cifs_sb);
cifs_put_tlink(tlink);
+ cifsInode = CIFS_I(d_inode(direntry));
+
if (!rc) {
+ set_bit(CIFS_INO_DELETE_PENDING, &cifsInode->flags);
spin_lock(&d_inode(direntry)->i_lock);
i_size_write(d_inode(direntry), 0);
clear_nlink(d_inode(direntry));
spin_unlock(&d_inode(direntry)->i_lock);
}
- cifsInode = CIFS_I(d_inode(direntry));
/* force revalidate to go get info when needed */
cifsInode->time = 0;
@@ -2446,8 +2482,11 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
}
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
do_rename_exit:
- if (rc == 0)
+ if (rc == 0) {
d_move(from_dentry, to_dentry);
+ /* Force a new lookup */
+ d_drop(from_dentry);
+ }
cifs_put_tlink(tlink);
return rc;
}
@@ -2458,10 +2497,12 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir,
struct dentry *target_dentry, unsigned int flags)
{
const char *from_name, *to_name;
+ struct TCP_Server_Info *server;
void *page1, *page2;
struct cifs_sb_info *cifs_sb;
struct tcon_link *tlink;
struct cifs_tcon *tcon;
+ bool rehash = false;
unsigned int xid;
int rc, tmprc;
int retry_count = 0;
@@ -2477,10 +2518,22 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir,
if (unlikely(cifs_forced_shutdown(cifs_sb)))
return -EIO;
+ /*
+ * Prevent any concurrent opens on the target by unhashing the dentry.
+ * VFS already unhashes the target when renaming directories.
+ */
+ if (d_is_positive(target_dentry) && !d_is_dir(target_dentry)) {
+ if (!d_unhashed(target_dentry)) {
+ d_drop(target_dentry);
+ rehash = true;
+ }
+ }
+
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
return PTR_ERR(tlink);
tcon = tlink_tcon(tlink);
+ server = tcon->ses->server;
page1 = alloc_dentry_path();
page2 = alloc_dentry_path();
@@ -2498,10 +2551,10 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir,
goto cifs_rename_exit;
}
- cifs_close_deferred_file_under_dentry(tcon, from_name);
+ cifs_close_deferred_file_under_dentry(tcon, source_dentry);
if (d_inode(target_dentry) != NULL) {
netfs_wait_for_outstanding_io(d_inode(target_dentry));
- cifs_close_deferred_file_under_dentry(tcon, to_name);
+ cifs_close_deferred_file_under_dentry(tcon, target_dentry);
}
rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry,
@@ -2518,6 +2571,8 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir,
}
}
+ if (!rc)
+ rehash = false;
/*
* No-replace is the natural behavior for CIFS, so skip unlink hacks.
*/
@@ -2565,23 +2620,61 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir,
unlink_target:
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
-
- /* Try unlinking the target dentry if it's not negative */
- if (d_really_is_positive(target_dentry) && (rc == -EACCES || rc == -EEXIST)) {
- if (d_is_dir(target_dentry))
- tmprc = cifs_rmdir(target_dir, target_dentry);
- else
- tmprc = cifs_unlink(target_dir, target_dentry);
- if (tmprc)
- goto cifs_rename_exit;
- rc = cifs_do_rename(xid, source_dentry, from_name,
- target_dentry, to_name);
+ if (d_really_is_positive(target_dentry)) {
+ if (!rc) {
+ struct inode *inode = d_inode(target_dentry);
+ /*
+ * Samba and ksmbd servers allow renaming a target
+ * directory that is open, so make sure to update
+ * ->i_nlink and then mark it as delete pending.
+ */
+ if (S_ISDIR(inode->i_mode)) {
+ drop_cached_dir_by_name(xid, tcon, to_name, cifs_sb);
+ spin_lock(&inode->i_lock);
+ i_size_write(inode, 0);
+ clear_nlink(inode);
+ spin_unlock(&inode->i_lock);
+ set_bit(CIFS_INO_DELETE_PENDING, &CIFS_I(inode)->flags);
+ CIFS_I(inode)->time = 0; /* force reval */
+ inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ }
+ } else if (rc == -EACCES || rc == -EEXIST) {
+ /*
+ * Rename failed, possibly due to a busy target.
+ * Retry it by unlinking the target first.
+ */
+ if (d_is_dir(target_dentry)) {
+ tmprc = cifs_rmdir(target_dir, target_dentry);
+ } else {
+ tmprc = __cifs_unlink(target_dir, target_dentry,
+ server->vals->protocol_id > SMB10_PROT_ID);
+ }
+ if (tmprc) {
+ /*
+ * Some servers will return STATUS_ACCESS_DENIED
+ * or STATUS_DIRECTORY_NOT_EMPTY when failing to
+ * rename a non-empty directory. Make sure to
+ * propagate the appropriate error back to
+ * userspace.
+ */
+ if (tmprc == -EEXIST || tmprc == -ENOTEMPTY)
+ rc = tmprc;
+ goto cifs_rename_exit;
+ }
+ rc = cifs_do_rename(xid, source_dentry, from_name,
+ target_dentry, to_name);
+ if (!rc)
+ rehash = false;
+ }
}
/* force revalidate to go get info when needed */
CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0;
cifs_rename_exit:
+ if (rehash)
+ d_rehash(target_dentry);
kfree(info_buf_source);
free_dentry_path(page2);
free_dentry_path(page1);
@@ -2599,6 +2692,8 @@ cifs_dentry_needs_reval(struct dentry *dentry)
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
struct cached_fid *cfid = NULL;
+ if (test_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags))
+ return false;
if (cifs_i->time == 0)
return true;
@@ -2749,7 +2844,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry)
}
cifs_dbg(FYI, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time %ld jiffies %ld\n",
- full_path, inode, inode->i_count.counter,
+ full_path, inode, icount_read(inode),
dentry, cifs_get_time(dentry), jiffies);
again:
diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c
index da23cc12a52c..dda6dece802a 100644
--- a/fs/smb/client/misc.c
+++ b/fs/smb/client/misc.c
@@ -832,33 +832,28 @@ cifs_close_all_deferred_files(struct cifs_tcon *tcon)
kfree(tmp_list);
}
}
-void
-cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path)
+
+void cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon,
+ struct dentry *dentry)
{
- struct cifsFileInfo *cfile;
struct file_list *tmp_list, *tmp_next_list;
- void *page;
- const char *full_path;
+ struct cifsFileInfo *cfile;
LIST_HEAD(file_head);
- page = alloc_dentry_path();
spin_lock(&tcon->open_file_lock);
list_for_each_entry(cfile, &tcon->openFileList, tlist) {
- full_path = build_path_from_dentry(cfile->dentry, page);
- if (strstr(full_path, path)) {
- if (delayed_work_pending(&cfile->deferred)) {
- if (cancel_delayed_work(&cfile->deferred)) {
- spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
- cifs_del_deferred_close(cfile);
- spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
-
- tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
- if (tmp_list == NULL)
- break;
- tmp_list->cfile = cfile;
- list_add_tail(&tmp_list->list, &file_head);
- }
- }
+ if ((cfile->dentry == dentry) &&
+ delayed_work_pending(&cfile->deferred) &&
+ cancel_delayed_work(&cfile->deferred)) {
+ spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
+ cifs_del_deferred_close(cfile);
+ spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
+
+ tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
+ if (tmp_list == NULL)
+ break;
+ tmp_list->cfile = cfile;
+ list_add_tail(&tmp_list->list, &file_head);
}
}
spin_unlock(&tcon->open_file_lock);
@@ -868,7 +863,6 @@ cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path)
list_del(&tmp_list->list);
kfree(tmp_list);
}
- free_dentry_path(page);
}
/*
diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c
index 7869cec58f52..10c84c095fe7 100644
--- a/fs/smb/client/reparse.c
+++ b/fs/smb/client/reparse.c
@@ -278,7 +278,7 @@ static int detect_directory_symlink_target(struct cifs_sb_info *cifs_sb,
}
/*
- * For absolute symlinks it is not possible to determinate
+ * For absolute symlinks it is not possible to determine
* if it should point to directory or file.
*/
if (symname[0] == '/') {
diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c
index 893a1ea8c000..a02d41d1ce4a 100644
--- a/fs/smb/client/smb1ops.c
+++ b/fs/smb/client/smb1ops.c
@@ -1005,7 +1005,7 @@ smb_set_file_info(struct inode *inode, const char *full_path,
rc = -EOPNOTSUPP;
}
- /* Fallback to SMB_COM_SETATTR command when absolutelty needed. */
+ /* Fallback to SMB_COM_SETATTR command when absolutely needed. */
if (rc == -EOPNOTSUPP) {
cifs_dbg(FYI, "calling SetInformation since SetPathInfo for attrs/times not supported by this server\n");
rc = SMBSetInformation(xid, tcon, full_path,
@@ -1039,7 +1039,7 @@ set_via_filehandle:
cifsFileInfo_put(open_file);
/*
- * Setting the read-only bit is not honered on non-NT servers when done
+ * Setting the read-only bit is not honored on non-NT servers when done
* via open-semantics. So for setting it, use SMB_COM_SETATTR command.
* This command works only after the file is closed, so use it only when
* operation was called without the filehandle.
diff --git a/fs/smb/client/smb2glob.h b/fs/smb/client/smb2glob.h
index 224495322a05..e56e4d402f13 100644
--- a/fs/smb/client/smb2glob.h
+++ b/fs/smb/client/smb2glob.h
@@ -30,10 +30,9 @@ enum smb2_compound_ops {
SMB2_OP_QUERY_DIR,
SMB2_OP_MKDIR,
SMB2_OP_RENAME,
- SMB2_OP_DELETE,
SMB2_OP_HARDLINK,
SMB2_OP_SET_EOF,
- SMB2_OP_RMDIR,
+ SMB2_OP_UNLINK,
SMB2_OP_POSIX_QUERY_INFO,
SMB2_OP_SET_REPARSE,
SMB2_OP_GET_REPARSE,
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index 2a0316c514e4..0985db9f86e5 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -207,8 +207,10 @@ replay_again:
server = cifs_pick_channel(ses);
vars = kzalloc(sizeof(*vars), GFP_ATOMIC);
- if (vars == NULL)
- return -ENOMEM;
+ if (vars == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
rqst = &vars->rqst[0];
rsp_iov = &vars->rsp_iov[0];
@@ -344,9 +346,6 @@ replay_again:
trace_smb3_posix_query_info_compound_enter(xid, tcon->tid,
ses->Suid, full_path);
break;
- case SMB2_OP_DELETE:
- trace_smb3_delete_enter(xid, tcon->tid, ses->Suid, full_path);
- break;
case SMB2_OP_MKDIR:
/*
* Directories are created through parameters in the
@@ -354,23 +353,40 @@ replay_again:
*/
trace_smb3_mkdir_enter(xid, tcon->tid, ses->Suid, full_path);
break;
- case SMB2_OP_RMDIR:
- rqst[num_rqst].rq_iov = &vars->si_iov[0];
+ case SMB2_OP_UNLINK:
+ rqst[num_rqst].rq_iov = vars->unlink_iov;
rqst[num_rqst].rq_nvec = 1;
size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */
data[0] = &delete_pending[0];
- rc = SMB2_set_info_init(tcon, server,
- &rqst[num_rqst], COMPOUND_FID,
- COMPOUND_FID, current->tgid,
- FILE_DISPOSITION_INFORMATION,
- SMB2_O_INFO_FILE, 0, data, size);
- if (rc)
+ if (cfile) {
+ rc = SMB2_set_info_init(tcon, server,
+ &rqst[num_rqst],
+ cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid,
+ current->tgid,
+ FILE_DISPOSITION_INFORMATION,
+ SMB2_O_INFO_FILE, 0,
+ data, size);
+ } else {
+ rc = SMB2_set_info_init(tcon, server,
+ &rqst[num_rqst],
+ COMPOUND_FID,
+ COMPOUND_FID,
+ current->tgid,
+ FILE_DISPOSITION_INFORMATION,
+ SMB2_O_INFO_FILE, 0,
+ data, size);
+ }
+ if (!rc && (!cfile || num_rqst > 1)) {
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst]);
+ } else if (rc) {
goto finished;
- smb2_set_next_command(tcon, &rqst[num_rqst]);
- smb2_set_related(&rqst[num_rqst++]);
- trace_smb3_rmdir_enter(xid, tcon->tid, ses->Suid, full_path);
+ }
+ num_rqst++;
+ trace_smb3_unlink_enter(xid, tcon->tid, ses->Suid, full_path);
break;
case SMB2_OP_SET_EOF:
rqst[num_rqst].rq_iov = &vars->si_iov[0];
@@ -440,7 +456,7 @@ replay_again:
ses->Suid, full_path);
break;
case SMB2_OP_RENAME:
- rqst[num_rqst].rq_iov = &vars->si_iov[0];
+ rqst[num_rqst].rq_iov = vars->rename_iov;
rqst[num_rqst].rq_nvec = 2;
len = in_iov[i].iov_len;
@@ -671,7 +687,7 @@ finished:
}
for (i = 0; i < num_cmds; i++) {
- char *buf = rsp_iov[i + i].iov_base;
+ char *buf = rsp_iov[i + 1].iov_base;
if (buf && resp_buftype[i + 1] != CIFS_NO_BUFFER)
rc = server->ops->map_error(buf, false);
@@ -730,19 +746,6 @@ finished:
trace_smb3_posix_query_info_compound_done(xid, tcon->tid,
ses->Suid);
break;
- case SMB2_OP_DELETE:
- if (rc)
- trace_smb3_delete_err(xid, tcon->tid, ses->Suid, rc);
- else {
- /*
- * If dentry (hence, inode) is NULL, lease break is going to
- * take care of degrading leases on handles for deleted files.
- */
- if (inode)
- cifs_mark_open_handles_for_deleted_file(inode, full_path);
- trace_smb3_delete_done(xid, tcon->tid, ses->Suid);
- }
- break;
case SMB2_OP_MKDIR:
if (rc)
trace_smb3_mkdir_err(xid, tcon->tid, ses->Suid, rc);
@@ -763,11 +766,11 @@ finished:
trace_smb3_rename_done(xid, tcon->tid, ses->Suid);
SMB2_set_info_free(&rqst[num_rqst++]);
break;
- case SMB2_OP_RMDIR:
- if (rc)
- trace_smb3_rmdir_err(xid, tcon->tid, ses->Suid, rc);
+ case SMB2_OP_UNLINK:
+ if (!rc)
+ trace_smb3_unlink_done(xid, tcon->tid, ses->Suid);
else
- trace_smb3_rmdir_done(xid, tcon->tid, ses->Suid);
+ trace_smb3_unlink_err(xid, tcon->tid, ses->Suid, rc);
SMB2_set_info_free(&rqst[num_rqst++]);
break;
case SMB2_OP_SET_EOF:
@@ -864,6 +867,7 @@ finished:
smb2_should_replay(tcon, &retries, &cur_sleep))
goto replay_again;
+out:
if (cfile)
cifsFileInfo_put(cfile);
@@ -1163,7 +1167,7 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
FILE_OPEN, CREATE_NOT_FILE, ACL_NO_MODE);
return smb2_compound_op(xid, tcon, cifs_sb,
name, &oparms, NULL,
- &(int){SMB2_OP_RMDIR}, 1,
+ &(int){SMB2_OP_UNLINK}, 1,
NULL, NULL, NULL, NULL);
}
@@ -1171,21 +1175,107 @@ int
smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
struct cifs_sb_info *cifs_sb, struct dentry *dentry)
{
+ struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
+ __le16 *utf16_path __free(kfree) = NULL;
+ int retries = 0, cur_sleep = 1;
+ struct TCP_Server_Info *server;
struct cifs_open_parms oparms;
+ struct smb2_create_req *creq;
+ struct inode *inode = NULL;
+ struct smb_rqst rqst[2];
+ struct kvec rsp_iov[2];
+ struct kvec close_iov;
+ int resp_buftype[2];
+ struct cifs_fid fid;
+ int flags = 0;
+ __u8 oplock;
+ int rc;
- oparms = CIFS_OPARMS(cifs_sb, tcon, name,
- DELETE, FILE_OPEN,
- CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
- ACL_NO_MODE);
- int rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms,
- NULL, &(int){SMB2_OP_DELETE}, 1,
- NULL, NULL, NULL, dentry);
- if (rc == -EINVAL) {
- cifs_dbg(FYI, "invalid lease key, resending request without lease");
- rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms,
- NULL, &(int){SMB2_OP_DELETE}, 1,
- NULL, NULL, NULL, NULL);
+ utf16_path = cifs_convert_path_to_utf16(name, cifs_sb);
+ if (!utf16_path)
+ return -ENOMEM;
+
+ if (smb3_encryption_required(tcon))
+ flags |= CIFS_TRANSFORM_REQ;
+again:
+ oplock = SMB2_OPLOCK_LEVEL_NONE;
+ server = cifs_pick_channel(tcon->ses);
+
+ memset(rqst, 0, sizeof(rqst));
+ memset(resp_buftype, 0, sizeof(resp_buftype));
+ memset(rsp_iov, 0, sizeof(rsp_iov));
+
+ rqst[0].rq_iov = open_iov;
+ rqst[0].rq_nvec = ARRAY_SIZE(open_iov);
+
+ oparms = CIFS_OPARMS(cifs_sb, tcon, name, DELETE | FILE_READ_ATTRIBUTES,
+ FILE_OPEN, CREATE_DELETE_ON_CLOSE |
+ OPEN_REPARSE_POINT, ACL_NO_MODE);
+ oparms.fid = &fid;
+
+ if (dentry) {
+ inode = d_inode(dentry);
+ if (CIFS_I(inode)->lease_granted && server->ops->get_lease_key) {
+ oplock = SMB2_OPLOCK_LEVEL_LEASE;
+ server->ops->get_lease_key(inode, &fid);
+ }
+ }
+
+ rc = SMB2_open_init(tcon, server,
+ &rqst[0], &oplock, &oparms, utf16_path);
+ if (rc)
+ goto err_free;
+ smb2_set_next_command(tcon, &rqst[0]);
+ creq = rqst[0].rq_iov[0].iov_base;
+ creq->ShareAccess = FILE_SHARE_DELETE_LE;
+
+ rqst[1].rq_iov = &close_iov;
+ rqst[1].rq_nvec = 1;
+
+ rc = SMB2_close_init(tcon, server, &rqst[1],
+ COMPOUND_FID, COMPOUND_FID, false);
+ smb2_set_related(&rqst[1]);
+ if (rc)
+ goto err_free;
+
+ if (retries) {
+ for (int i = 0; i < ARRAY_SIZE(rqst); i++)
+ smb2_set_replay(server, &rqst[i]);
}
+
+ rc = compound_send_recv(xid, tcon->ses, server, flags,
+ ARRAY_SIZE(rqst), rqst,
+ resp_buftype, rsp_iov);
+ SMB2_open_free(&rqst[0]);
+ SMB2_close_free(&rqst[1]);
+ free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto again;
+
+ /* Retry compound request without lease */
+ if (rc == -EINVAL && dentry) {
+ dentry = NULL;
+ retries = 0;
+ cur_sleep = 1;
+ goto again;
+ }
+ /*
+ * If dentry (hence, inode) is NULL, lease break is going to
+ * take care of degrading leases on handles for deleted files.
+ */
+ if (!rc && inode)
+ cifs_mark_open_handles_for_deleted_file(inode, name);
+
+ return rc;
+
+err_free:
+ SMB2_open_free(&rqst[0]);
+ SMB2_close_free(&rqst[1]);
+ free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
return rc;
}
@@ -1438,3 +1528,113 @@ out:
cifs_free_open_info(&data);
return rc;
}
+
+static inline __le16 *utf16_smb2_path(struct cifs_sb_info *cifs_sb,
+ const char *name, size_t namelen)
+{
+ int len;
+
+ if (*name == '\\' ||
+ (cifs_sb_master_tlink(cifs_sb) &&
+ cifs_sb_master_tcon(cifs_sb)->posix_extensions && *name == '/'))
+ name++;
+ return cifs_strndup_to_utf16(name, namelen, &len,
+ cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+}
+
+int smb2_rename_pending_delete(const char *full_path,
+ struct dentry *dentry,
+ const unsigned int xid)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(d_inode(dentry)->i_sb);
+ struct cifsInodeInfo *cinode = CIFS_I(d_inode(dentry));
+ __le16 *utf16_path __free(kfree) = NULL;
+ __u32 co = file_create_options(dentry);
+ int cmds[] = {
+ SMB2_OP_SET_INFO,
+ SMB2_OP_RENAME,
+ SMB2_OP_UNLINK,
+ };
+ const int num_cmds = ARRAY_SIZE(cmds);
+ char *to_name __free(kfree) = NULL;
+ __u32 attrs = cinode->cifsAttrs;
+ struct cifs_open_parms oparms;
+ static atomic_t sillycounter;
+ struct cifsFileInfo *cfile;
+ struct tcon_link *tlink;
+ struct cifs_tcon *tcon;
+ struct kvec iov[2];
+ const char *ppath;
+ void *page;
+ size_t len;
+ int rc;
+
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink))
+ return PTR_ERR(tlink);
+ tcon = tlink_tcon(tlink);
+
+ page = alloc_dentry_path();
+
+ ppath = build_path_from_dentry(dentry->d_parent, page);
+ if (IS_ERR(ppath)) {
+ rc = PTR_ERR(ppath);
+ goto out;
+ }
+
+ len = strlen(ppath) + strlen("/.__smb1234") + 1;
+ to_name = kmalloc(len, GFP_KERNEL);
+ if (!to_name) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ scnprintf(to_name, len, "%s%c.__smb%04X", ppath, CIFS_DIR_SEP(cifs_sb),
+ atomic_inc_return(&sillycounter) & 0xffff);
+
+ utf16_path = utf16_smb2_path(cifs_sb, to_name, len);
+ if (!utf16_path) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ drop_cached_dir_by_name(xid, tcon, full_path, cifs_sb);
+ oparms = CIFS_OPARMS(cifs_sb, tcon, full_path,
+ DELETE | FILE_WRITE_ATTRIBUTES,
+ FILE_OPEN, co, ACL_NO_MODE);
+
+ attrs &= ~ATTR_READONLY;
+ if (!attrs)
+ attrs = ATTR_NORMAL;
+ if (d_inode(dentry)->i_nlink <= 1)
+ attrs |= ATTR_HIDDEN;
+ iov[0].iov_base = &(FILE_BASIC_INFO) {
+ .Attributes = cpu_to_le32(attrs),
+ };
+ iov[0].iov_len = sizeof(FILE_BASIC_INFO);
+ iov[1].iov_base = utf16_path;
+ iov[1].iov_len = sizeof(*utf16_path) * UniStrlen((wchar_t *)utf16_path);
+
+ cifs_get_writable_path(tcon, full_path, FIND_WR_WITH_DELETE, &cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, iov,
+ cmds, num_cmds, cfile, NULL, NULL, dentry);
+ if (rc == -EINVAL) {
+ cifs_dbg(FYI, "invalid lease key, resending request without lease\n");
+ cifs_get_writable_path(tcon, full_path,
+ FIND_WR_WITH_DELETE, &cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, iov,
+ cmds, num_cmds, cfile, NULL, NULL, NULL);
+ }
+ if (!rc) {
+ set_bit(CIFS_INO_DELETE_PENDING, &cinode->flags);
+ } else {
+ cifs_tcon_dbg(FYI, "%s: failed to rename '%s' to '%s': %d\n",
+ __func__, full_path, to_name, rc);
+ rc = -EIO;
+ }
+out:
+ cifs_put_tlink(tlink);
+ free_dentry_path(page);
+ return rc;
+}
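For illustration, the silly-rename target above is just the parent path plus a 16-bit hex counter. A standalone userspace sketch of that naming scheme (hypothetical build_silly_name() helper, not the kernel code) could look like this:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical stand-in for the kernel's sillycounter scheme. */
static atomic_uint sillycounter;

static char *build_silly_name(const char *parent, char sep)
{
	/* room for "<parent><sep>.__smb" plus four hex digits */
	size_t len = strlen(parent) + strlen("/.__smb1234") + 1;
	char *name = malloc(len);

	if (!name)
		return NULL;
	snprintf(name, len, "%s%c.__smb%04X", parent, sep,
		 (atomic_fetch_add(&sillycounter, 1) + 1) & 0xffffu);
	return name;
}

int main(void)
{
	char *n = build_silly_name("/share/dir", '/');

	if (n) {
		puts(n);		/* e.g. /share/dir/.__smb0001 */
		free(n);
	}
	return 0;
}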
diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c
index cddf273c14ae..89d933b4a8bc 100644
--- a/fs/smb/client/smb2misc.c
+++ b/fs/smb/client/smb2misc.c
@@ -614,6 +614,15 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server)
struct cifs_tcon *tcon;
struct cifs_pending_open *open;
+ /* Trace receipt of lease break request from server */
+ trace_smb3_lease_break_enter(le32_to_cpu(rsp->CurrentLeaseState),
+ le32_to_cpu(rsp->Flags),
+ le16_to_cpu(rsp->Epoch),
+ le32_to_cpu(rsp->hdr.Id.SyncId.TreeId),
+ le64_to_cpu(rsp->hdr.SessionId),
+ *((u64 *)rsp->LeaseKey),
+ *((u64 *)&rsp->LeaseKey[8]));
+
cifs_dbg(FYI, "Checking for lease break\n");
/* If server is a channel, select the primary channel */
@@ -660,10 +669,12 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server)
spin_unlock(&cifs_tcp_ses_lock);
cifs_dbg(FYI, "Can not process lease break - no lease matched\n");
trace_smb3_lease_not_found(le32_to_cpu(rsp->CurrentLeaseState),
- le32_to_cpu(rsp->hdr.Id.SyncId.TreeId),
- le64_to_cpu(rsp->hdr.SessionId),
- *((u64 *)rsp->LeaseKey),
- *((u64 *)&rsp->LeaseKey[8]));
+ le32_to_cpu(rsp->Flags),
+ le16_to_cpu(rsp->Epoch),
+ le32_to_cpu(rsp->hdr.Id.SyncId.TreeId),
+ le64_to_cpu(rsp->hdr.SessionId),
+ *((u64 *)rsp->LeaseKey),
+ *((u64 *)&rsp->LeaseKey[8]));
return false;
}
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index ad8947434b71..4711a23c5b38 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -504,8 +504,8 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
wsize = min_t(unsigned int, wsize, server->max_write);
#ifdef CONFIG_CIFS_SMB_DIRECT
if (server->rdma) {
- struct smbdirect_socket_parameters *sp =
- &server->smbd_conn->socket.parameters;
+ const struct smbdirect_socket_parameters *sp =
+ smbd_get_parameters(server->smbd_conn);
if (server->sign)
/*
@@ -555,8 +555,8 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
rsize = min_t(unsigned int, rsize, server->max_read);
#ifdef CONFIG_CIFS_SMB_DIRECT
if (server->rdma) {
- struct smbdirect_socket_parameters *sp =
- &server->smbd_conn->socket.parameters;
+ const struct smbdirect_socket_parameters *sp =
+ smbd_get_parameters(server->smbd_conn);
if (server->sign)
/*
@@ -772,6 +772,13 @@ next_iface:
bytes_left -= sizeof(*p);
break;
}
+ /* Validate that Next doesn't point beyond the buffer */
+ if (next > bytes_left) {
+ cifs_dbg(VFS, "%s: invalid Next pointer %zu > %zd\n",
+ __func__, next, bytes_left);
+ rc = -EINVAL;
+ goto out;
+ }
p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next);
bytes_left -= next;
}
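The Next-pointer check added above is the usual guard when walking variable-length records. A self-contained sketch of the same pattern, using a made-up struct rec rather than the SMB2 network_interface_info layout:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical record: a Next offset to the following entry (0 = last). */
struct rec {
	uint32_t next;
	uint32_t value;
};

static int walk_records(const uint8_t *buf, size_t buf_len)
{
	size_t bytes_left = buf_len;
	const uint8_t *p = buf;

	while (bytes_left >= sizeof(struct rec)) {
		struct rec r;

		memcpy(&r, p, sizeof(r));
		printf("value=%u\n", r.value);
		if (r.next == 0)
			return 0;
		/* reject a Next offset pointing beyond the remaining buffer */
		if (r.next > bytes_left)
			return -1;
		p += r.next;
		bytes_left -= r.next;
	}
	return 0;
}

int main(void)
{
	struct rec recs[2] = {
		{ .next = sizeof(struct rec), .value = 1 },
		{ .next = 0, .value = 2 },
	};

	return walk_records((const uint8_t *)recs, sizeof(recs));
}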
@@ -783,7 +790,9 @@ next_iface:
}
/* Azure rounds the buffer size up by 8, to a 16 byte boundary */
- if ((bytes_left > 8) || p->Next)
+ if ((bytes_left > 8) ||
+ (bytes_left >= offsetof(struct network_interface_info_ioctl_rsp, Next)
+ + sizeof(p->Next) && p->Next))
cifs_dbg(VFS, "%s: incomplete interface info\n", __func__);
ses->iface_last_update = jiffies;
@@ -2631,13 +2640,35 @@ smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst)
}
/* SMB headers in a compound are 8 byte aligned. */
- if (!IS_ALIGNED(len, 8)) {
- num_padding = 8 - (len & 7);
+ if (IS_ALIGNED(len, 8))
+ goto out;
+
+ num_padding = 8 - (len & 7);
+ if (smb3_encryption_required(tcon)) {
+ int i;
+
+ /*
+ * Flatten request into a single buffer with required padding as
+ * the encryption layer can't handle the padding iovs.
+ */
+ for (i = 1; i < rqst->rq_nvec; i++) {
+ memcpy(rqst->rq_iov[0].iov_base +
+ rqst->rq_iov[0].iov_len,
+ rqst->rq_iov[i].iov_base,
+ rqst->rq_iov[i].iov_len);
+ rqst->rq_iov[0].iov_len += rqst->rq_iov[i].iov_len;
+ }
+ memset(rqst->rq_iov[0].iov_base + rqst->rq_iov[0].iov_len,
+ 0, num_padding);
+ rqst->rq_iov[0].iov_len += num_padding;
+ rqst->rq_nvec = 1;
+ } else {
rqst->rq_iov[rqst->rq_nvec].iov_base = smb2_padding;
rqst->rq_iov[rqst->rq_nvec].iov_len = num_padding;
rqst->rq_nvec++;
- len += num_padding;
}
+ len += num_padding;
+out:
shdr->NextCommand = cpu_to_le32(len);
}
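For reference, the padding logic above only has to round each compound frame up to the next multiple of 8. A minimal standalone sketch of that arithmetic (not the kernel helper):

#include <stdio.h>

/* Round len up to the next 8-byte boundary, as compound SMB2 framing requires. */
static unsigned int pad_to_8(unsigned int len)
{
	return (len & 7) ? 8 - (len & 7) : 0;
}

int main(void)
{
	for (unsigned int len = 80; len < 90; len++)
		printf("len=%u padding=%u next=%u\n",
		       len, pad_to_8(len), len + pad_to_8(len));
	return 0;
}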
@@ -4487,7 +4518,7 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst,
for (int i = 1; i < num_rqst; i++) {
struct smb_rqst *old = &old_rq[i - 1];
struct smb_rqst *new = &new_rq[i];
- struct folio_queue *buffer;
+ struct folio_queue *buffer = NULL;
size_t size = iov_iter_count(&old->rq_iter);
orig_len += smb_rqst_len(server, old);
@@ -4805,7 +4836,7 @@ static void smb2_decrypt_offload(struct work_struct *work)
dw->server->ops->is_network_name_deleted(dw->buf,
dw->server);
- mid->callback(mid);
+ mid_execute_callback(mid);
} else {
spin_lock(&dw->server->srv_lock);
if (dw->server->tcpStatus == CifsNeedReconnect) {
@@ -4813,7 +4844,7 @@ static void smb2_decrypt_offload(struct work_struct *work)
mid->mid_state = MID_RETRY_NEEDED;
spin_unlock(&dw->server->mid_queue_lock);
spin_unlock(&dw->server->srv_lock);
- mid->callback(mid);
+ mid_execute_callback(mid);
} else {
spin_lock(&dw->server->mid_queue_lock);
mid->mid_state = MID_REQUEST_SUBMITTED;
@@ -5367,6 +5398,7 @@ struct smb_version_operations smb20_operations = {
.llseek = smb3_llseek,
.is_status_io_timeout = smb2_is_status_io_timeout,
.is_network_name_deleted = smb2_is_network_name_deleted,
+ .rename_pending_delete = smb2_rename_pending_delete,
};
#endif /* CIFS_ALLOW_INSECURE_LEGACY */
@@ -5472,6 +5504,7 @@ struct smb_version_operations smb21_operations = {
.llseek = smb3_llseek,
.is_status_io_timeout = smb2_is_status_io_timeout,
.is_network_name_deleted = smb2_is_network_name_deleted,
+ .rename_pending_delete = smb2_rename_pending_delete,
};
struct smb_version_operations smb30_operations = {
@@ -5588,6 +5621,7 @@ struct smb_version_operations smb30_operations = {
.llseek = smb3_llseek,
.is_status_io_timeout = smb2_is_status_io_timeout,
.is_network_name_deleted = smb2_is_network_name_deleted,
+ .rename_pending_delete = smb2_rename_pending_delete,
};
struct smb_version_operations smb311_operations = {
@@ -5704,6 +5738,7 @@ struct smb_version_operations smb311_operations = {
.llseek = smb3_llseek,
.is_status_io_timeout = smb2_is_status_io_timeout,
.is_network_name_deleted = smb2_is_network_name_deleted,
+ .rename_pending_delete = smb2_rename_pending_delete,
};
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 2df93a75e3b8..1c63d2c9cc9c 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -4411,7 +4411,7 @@ static inline bool smb3_use_rdma_offload(struct cifs_io_parms *io_parms)
return false;
/* offload also has its overhead, so only do it if desired */
- if (io_parms->length < server->smbd_conn->rdma_readwrite_threshold)
+ if (io_parms->length < server->rdma_readwrite_threshold)
return false;
return true;
@@ -6192,11 +6192,11 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
please_key_high = (__u64 *)(lease_key+8);
if (rc) {
cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE);
- trace_smb3_lease_err(le32_to_cpu(lease_state), tcon->tid,
+ trace_smb3_lease_ack_err(le32_to_cpu(lease_state), tcon->tid,
ses->Suid, *please_key_low, *please_key_high, rc);
cifs_dbg(FYI, "Send error in Lease Break = %d\n", rc);
} else
- trace_smb3_lease_done(le32_to_cpu(lease_state), tcon->tid,
+ trace_smb3_lease_ack_done(le32_to_cpu(lease_state), tcon->tid,
ses->Suid, *please_key_low, *please_key_high);
return rc;
diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h
index 6e805ece6a7b..b3f1398c9f79 100644
--- a/fs/smb/client/smb2proto.h
+++ b/fs/smb/client/smb2proto.h
@@ -317,5 +317,8 @@ int posix_info_sid_size(const void *beg, const void *end);
int smb2_make_nfs_node(unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
const char *full_path, umode_t mode, dev_t dev);
+int smb2_rename_pending_delete(const char *full_path,
+ struct dentry *dentry,
+ const unsigned int xid);
#endif /* _SMB2PROTO_H */
diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c
index ff9ef7fcd010..bc0e92eb2b64 100644
--- a/fs/smb/client/smb2transport.c
+++ b/fs/smb/client/smb2transport.c
@@ -771,6 +771,7 @@ smb2_mid_entry_alloc(const struct smb2_hdr *shdr,
temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS);
memset(temp, 0, sizeof(struct mid_q_entry));
kref_init(&temp->refcount);
+ spin_lock_init(&temp->mid_lock);
temp->mid = le64_to_cpu(shdr->MessageId);
temp->credits = credits > 0 ? credits : 1;
temp->pid = current->pid;
diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c
index c628e91c328b..316f398c70f4 100644
--- a/fs/smb/client/smbdirect.c
+++ b/fs/smb/client/smbdirect.c
@@ -13,28 +13,35 @@
#include "cifsproto.h"
#include "smb2proto.h"
+const struct smbdirect_socket_parameters *smbd_get_parameters(struct smbd_connection *conn)
+{
+ struct smbdirect_socket *sc = &conn->socket;
+
+ return &sc->parameters;
+}
+
static struct smbdirect_recv_io *get_receive_buffer(
- struct smbd_connection *info);
+ struct smbdirect_socket *sc);
static void put_receive_buffer(
- struct smbd_connection *info,
+ struct smbdirect_socket *sc,
struct smbdirect_recv_io *response);
-static int allocate_receive_buffers(struct smbd_connection *info, int num_buf);
-static void destroy_receive_buffers(struct smbd_connection *info);
+static int allocate_receive_buffers(struct smbdirect_socket *sc, int num_buf);
+static void destroy_receive_buffers(struct smbdirect_socket *sc);
static void enqueue_reassembly(
- struct smbd_connection *info,
+ struct smbdirect_socket *sc,
struct smbdirect_recv_io *response, int data_length);
static struct smbdirect_recv_io *_get_first_reassembly(
- struct smbd_connection *info);
+ struct smbdirect_socket *sc);
static int smbd_post_recv(
- struct smbd_connection *info,
+ struct smbdirect_socket *sc,
struct smbdirect_recv_io *response);
-static int smbd_post_send_empty(struct smbd_connection *info);
+static int smbd_post_send_empty(struct smbdirect_socket *sc);
-static void destroy_mr_list(struct smbd_connection *info);
-static int allocate_mr_list(struct smbd_connection *info);
+static void destroy_mr_list(struct smbdirect_socket *sc);
+static int allocate_mr_list(struct smbdirect_socket *sc);
struct smb_extract_to_rdma {
struct ib_sge *sge;
@@ -57,6 +64,9 @@ static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len,
/* SMBD negotiation timeout in seconds */
#define SMBD_NEGOTIATE_TIMEOUT 120
+/* The timeout to wait for a keepalive message from peer in seconds */
+#define KEEPALIVE_RECV_TIMEOUT 5
+
/* SMBD minimum receive size and fragmented sized defined in [MS-SMBD] */
#define SMBD_MIN_RECEIVE_SIZE 128
#define SMBD_MIN_FRAGMENTED_SIZE 131072
@@ -155,65 +165,277 @@ do { \
#define log_rdma_mr(level, fmt, args...) \
log_rdma(level, LOG_RDMA_MR, fmt, ##args)
+static void smbd_disconnect_wake_up_all(struct smbdirect_socket *sc)
+{
+ /*
+ * Wake up all waiters in all wait queues
+ * in order to notice the broken connection.
+ */
+ wake_up_all(&sc->status_wait);
+ wake_up_all(&sc->send_io.credits.wait_queue);
+ wake_up_all(&sc->send_io.pending.dec_wait_queue);
+ wake_up_all(&sc->send_io.pending.zero_wait_queue);
+ wake_up_all(&sc->recv_io.reassembly.wait_queue);
+ wake_up_all(&sc->mr_io.ready.wait_queue);
+ wake_up_all(&sc->mr_io.cleanup.wait_queue);
+}
+
static void smbd_disconnect_rdma_work(struct work_struct *work)
{
- struct smbd_connection *info =
- container_of(work, struct smbd_connection, disconnect_work);
- struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket *sc =
+ container_of(work, struct smbdirect_socket, disconnect_work);
- if (sc->status == SMBDIRECT_SOCKET_CONNECTED) {
+ /*
+ * make sure this and other work is not queued again,
+ * but we must not block here, so avoid
+ * disable[_delayed]_work_sync()
+ */
+ disable_work(&sc->disconnect_work);
+ disable_work(&sc->recv_io.posted.refill_work);
+ disable_work(&sc->mr_io.recovery_work);
+ disable_work(&sc->idle.immediate_work);
+ disable_delayed_work(&sc->idle.timer_work);
+
+ if (sc->first_error == 0)
+ sc->first_error = -ECONNABORTED;
+
+ switch (sc->status) {
+ case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
+ case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
+ case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
+ case SMBDIRECT_SOCKET_CONNECTED:
+ case SMBDIRECT_SOCKET_ERROR:
sc->status = SMBDIRECT_SOCKET_DISCONNECTING;
rdma_disconnect(sc->rdma.cm_id);
+ break;
+
+ case SMBDIRECT_SOCKET_CREATED:
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
+ /*
+ * rdma_connect() never reached
+ * RDMA_CM_EVENT_ESTABLISHED
+ */
+ sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
+ break;
+
+ case SMBDIRECT_SOCKET_DISCONNECTING:
+ case SMBDIRECT_SOCKET_DISCONNECTED:
+ case SMBDIRECT_SOCKET_DESTROYED:
+ break;
}
+
+ /*
+ * Wake up all waiters in all wait queues
+ * in order to notice the broken connection.
+ */
+ smbd_disconnect_wake_up_all(sc);
}
-static void smbd_disconnect_rdma_connection(struct smbd_connection *info)
+static void smbd_disconnect_rdma_connection(struct smbdirect_socket *sc)
{
- queue_work(info->workqueue, &info->disconnect_work);
+ /*
+ * make sure work other than disconnect_work is
+ * not queued again, but we must not block here,
+ * so avoid disable[_delayed]_work_sync()
+ */
+ disable_work(&sc->recv_io.posted.refill_work);
+ disable_work(&sc->mr_io.recovery_work);
+ disable_work(&sc->idle.immediate_work);
+ disable_delayed_work(&sc->idle.timer_work);
+
+ if (sc->first_error == 0)
+ sc->first_error = -ECONNABORTED;
+
+ switch (sc->status) {
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
+ case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
+ case SMBDIRECT_SOCKET_ERROR:
+ case SMBDIRECT_SOCKET_DISCONNECTING:
+ case SMBDIRECT_SOCKET_DISCONNECTED:
+ case SMBDIRECT_SOCKET_DESTROYED:
+ /*
+ * Keep the current error status
+ */
+ break;
+
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
+ sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED;
+ break;
+
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
+ sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED;
+ break;
+
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
+ sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED;
+ break;
+
+ case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
+ case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
+ sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
+ break;
+
+ case SMBDIRECT_SOCKET_CREATED:
+ case SMBDIRECT_SOCKET_CONNECTED:
+ sc->status = SMBDIRECT_SOCKET_ERROR;
+ break;
+ }
+
+ /*
+ * Wake up all waiters in all wait queues
+ * in order to notice the broken connection.
+ */
+ smbd_disconnect_wake_up_all(sc);
+
+ queue_work(sc->workqueue, &sc->disconnect_work);
}
/* Upcall from RDMA CM */
static int smbd_conn_upcall(
struct rdma_cm_id *id, struct rdma_cm_event *event)
{
- struct smbd_connection *info = id->context;
- struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket *sc = id->context;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
const char *event_name = rdma_event_msg(event->event);
+ u8 peer_initiator_depth;
+ u8 peer_responder_resources;
log_rdma_event(INFO, "event=%s status=%d\n",
event_name, event->status);
switch (event->event) {
case RDMA_CM_EVENT_ADDR_RESOLVED:
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING);
+ sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED;
+ wake_up(&sc->status_wait);
+ break;
+
case RDMA_CM_EVENT_ROUTE_RESOLVED:
- info->ri_rc = 0;
- complete(&info->ri_done);
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING);
+ sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED;
+ wake_up(&sc->status_wait);
break;
case RDMA_CM_EVENT_ADDR_ERROR:
log_rdma_event(ERR, "connecting failed event=%s\n", event_name);
- info->ri_rc = -EHOSTUNREACH;
- complete(&info->ri_done);
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING);
+ sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED;
+ smbd_disconnect_rdma_work(&sc->disconnect_work);
break;
case RDMA_CM_EVENT_ROUTE_ERROR:
log_rdma_event(ERR, "connecting failed event=%s\n", event_name);
- info->ri_rc = -ENETUNREACH;
- complete(&info->ri_done);
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING);
+ sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED;
+ smbd_disconnect_rdma_work(&sc->disconnect_work);
break;
case RDMA_CM_EVENT_ESTABLISHED:
log_rdma_event(INFO, "connected event=%s\n", event_name);
- sc->status = SMBDIRECT_SOCKET_CONNECTED;
- wake_up_interruptible(&info->status_wait);
+
+ /*
+ * Here we work around an inconsistency between
+ * iWarp and other devices (at least rxe and irdma using RoCEv2)
+ */
+ if (rdma_protocol_iwarp(id->device, id->port_num)) {
+ /*
+ * iWarp devices report the peer's values
+ * with the perspective of the peer here.
+ * Tested with siw and irdma (in iwarp mode)
+ * We need to convert them to our perspective here,
+ * so we swap the values.
+ */
+ peer_initiator_depth = event->param.conn.responder_resources;
+ peer_responder_resources = event->param.conn.initiator_depth;
+ } else {
+ /*
+ * Non iWarp devices report the peer's values
+ * already changed to our perspective here.
+ * Tested with rxe and irdma (in roce mode).
+ */
+ peer_initiator_depth = event->param.conn.initiator_depth;
+ peer_responder_resources = event->param.conn.responder_resources;
+ }
+ if (rdma_protocol_iwarp(id->device, id->port_num) &&
+ event->param.conn.private_data_len == 8) {
+ /*
+ * Legacy clients with only iWarp MPA v1 support
+ * need a private blob in order to negotiate
+ * the IRD/ORD values.
+ */
+ const __be32 *ird_ord_hdr = event->param.conn.private_data;
+ u32 ird32 = be32_to_cpu(ird_ord_hdr[0]);
+ u32 ord32 = be32_to_cpu(ird_ord_hdr[1]);
+
+ /*
+ * cifs.ko sends the legacy IRD/ORD negotiation
+ * even if iWarp MPA v2 was used.
+ *
+ * Here we check that the values match and only
+ * mark the client as legacy if they don't match.
+ */
+ if ((u32)event->param.conn.initiator_depth != ird32 ||
+ (u32)event->param.conn.responder_resources != ord32) {
+ /*
+ * There are broken clients (old cifs.ko)
+ * that use little endian, and
+ * struct rdma_conn_param only uses u8
+ * for initiator_depth and responder_resources,
+ * so we cap the values at U8_MAX.
+ *
+ * smb_direct_accept_client() will then
+ * do the real negotiation in order to
+ * select the minimum between client and
+ * server.
+ */
+ ird32 = min_t(u32, ird32, U8_MAX);
+ ord32 = min_t(u32, ord32, U8_MAX);
+
+ sc->rdma.legacy_iwarp = true;
+ peer_initiator_depth = (u8)ird32;
+ peer_responder_resources = (u8)ord32;
+ }
+ }
+
+ /*
+ * negotiate the values by using the minimum
+ * between client and server if the client provided
+ * non-zero values.
+ */
+ if (peer_initiator_depth != 0)
+ sp->initiator_depth =
+ min_t(u8, sp->initiator_depth,
+ peer_initiator_depth);
+ if (peer_responder_resources != 0)
+ sp->responder_resources =
+ min_t(u8, sp->responder_resources,
+ peer_responder_resources);
+
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING);
+ sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED;
+ wake_up(&sc->status_wait);
break;
case RDMA_CM_EVENT_CONNECT_ERROR:
case RDMA_CM_EVENT_UNREACHABLE:
case RDMA_CM_EVENT_REJECTED:
log_rdma_event(ERR, "connecting failed event=%s\n", event_name);
- sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
- wake_up_interruptible(&info->status_wait);
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING);
+ sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED;
+ smbd_disconnect_rdma_work(&sc->disconnect_work);
break;
case RDMA_CM_EVENT_DEVICE_REMOVAL:
@@ -221,15 +443,10 @@ static int smbd_conn_upcall(
/* This happens when we fail the negotiation */
if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_FAILED) {
log_rdma_event(ERR, "event=%s during negotiation\n", event_name);
- sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
- wake_up(&info->status_wait);
- break;
}
sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
- wake_up_interruptible(&info->status_wait);
- wake_up_interruptible(&sc->recv_io.reassembly.wait_queue);
- wake_up_interruptible_all(&info->wait_send_queue);
+ smbd_disconnect_rdma_work(&sc->disconnect_work);
break;
default:
@@ -245,15 +462,15 @@ static int smbd_conn_upcall(
static void
smbd_qp_async_error_upcall(struct ib_event *event, void *context)
{
- struct smbd_connection *info = context;
+ struct smbdirect_socket *sc = context;
- log_rdma_event(ERR, "%s on device %s info %p\n",
- ib_event_msg(event->event), event->device->name, info);
+ log_rdma_event(ERR, "%s on device %s socket %p\n",
+ ib_event_msg(event->event), event->device->name, sc);
switch (event->event) {
case IB_EVENT_CQ_ERR:
case IB_EVENT_QP_FATAL:
- smbd_disconnect_rdma_connection(info);
+ smbd_disconnect_rdma_connection(sc);
break;
default:
@@ -278,11 +495,9 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)
struct smbdirect_send_io *request =
container_of(wc->wr_cqe, struct smbdirect_send_io, cqe);
struct smbdirect_socket *sc = request->socket;
- struct smbd_connection *info =
- container_of(sc, struct smbd_connection, socket);
- log_rdma_send(INFO, "smbdirect_send_io 0x%p completed wc->status=%d\n",
- request, wc->status);
+ log_rdma_send(INFO, "smbdirect_send_io 0x%p completed wc->status=%s\n",
+ request, ib_wc_status_msg(wc->status));
for (i = 0; i < request->num_sge; i++)
ib_dma_unmap_single(sc->ib.dev,
@@ -291,17 +506,18 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)
DMA_TO_DEVICE);
if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {
- log_rdma_send(ERR, "wc->status=%d wc->opcode=%d\n",
- wc->status, wc->opcode);
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ log_rdma_send(ERR, "wc->status=%s wc->opcode=%d\n",
+ ib_wc_status_msg(wc->status), wc->opcode);
mempool_free(request, sc->send_io.mem.pool);
- smbd_disconnect_rdma_connection(info);
+ smbd_disconnect_rdma_connection(sc);
return;
}
- if (atomic_dec_and_test(&info->send_pending))
- wake_up(&info->wait_send_pending);
+ if (atomic_dec_and_test(&sc->send_io.pending.count))
+ wake_up(&sc->send_io.pending.zero_wait_queue);
- wake_up(&info->wait_post_send);
+ wake_up(&sc->send_io.pending.dec_wait_queue);
mempool_free(request, sc->send_io.mem.pool);
}
@@ -325,8 +541,6 @@ static bool process_negotiation_response(
struct smbdirect_recv_io *response, int packet_length)
{
struct smbdirect_socket *sc = response->socket;
- struct smbd_connection *info =
- container_of(sc, struct smbd_connection, socket);
struct smbdirect_socket_parameters *sp = &sc->parameters;
struct smbdirect_negotiate_resp *packet = smbdirect_recv_io_payload(response);
@@ -341,21 +555,19 @@ static bool process_negotiation_response(
le16_to_cpu(packet->negotiated_version));
return false;
}
- info->protocol = le16_to_cpu(packet->negotiated_version);
if (packet->credits_requested == 0) {
log_rdma_event(ERR, "error: credits_requested==0\n");
return false;
}
- info->receive_credit_target = le16_to_cpu(packet->credits_requested);
+ sc->recv_io.credits.target = le16_to_cpu(packet->credits_requested);
+ sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max);
if (packet->credits_granted == 0) {
log_rdma_event(ERR, "error: credits_granted==0\n");
return false;
}
- atomic_set(&info->send_credits, le16_to_cpu(packet->credits_granted));
-
- atomic_set(&info->receive_credits, 0);
+ atomic_set(&sc->send_io.credits.count, le16_to_cpu(packet->credits_granted));
if (le32_to_cpu(packet->preferred_send_size) > sp->max_recv_size) {
log_rdma_event(ERR, "error: preferred_send_size=%d\n",
@@ -380,16 +592,12 @@ static bool process_negotiation_response(
}
sp->max_fragmented_send_size =
le32_to_cpu(packet->max_fragmented_size);
- info->rdma_readwrite_threshold =
- rdma_readwrite_threshold > sp->max_fragmented_send_size ?
- sp->max_fragmented_send_size :
- rdma_readwrite_threshold;
sp->max_read_write_size = min_t(u32,
le32_to_cpu(packet->max_readwrite_size),
- info->max_frmr_depth * PAGE_SIZE);
- info->max_frmr_depth = sp->max_read_write_size / PAGE_SIZE;
+ sp->max_frmr_depth * PAGE_SIZE);
+ sp->max_frmr_depth = sp->max_read_write_size / PAGE_SIZE;
sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER;
return true;
@@ -397,52 +605,40 @@ static bool process_negotiation_response(
static void smbd_post_send_credits(struct work_struct *work)
{
- int ret = 0;
int rc;
struct smbdirect_recv_io *response;
- struct smbd_connection *info =
- container_of(work, struct smbd_connection,
- post_send_credits_work);
- struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket *sc =
+ container_of(work, struct smbdirect_socket, recv_io.posted.refill_work);
if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
- wake_up(&info->wait_receive_queues);
return;
}
- if (info->receive_credit_target >
- atomic_read(&info->receive_credits)) {
+ if (sc->recv_io.credits.target >
+ atomic_read(&sc->recv_io.credits.count)) {
while (true) {
- response = get_receive_buffer(info);
+ response = get_receive_buffer(sc);
if (!response)
break;
response->first_segment = false;
- rc = smbd_post_recv(info, response);
+ rc = smbd_post_recv(sc, response);
if (rc) {
log_rdma_recv(ERR,
"post_recv failed rc=%d\n", rc);
- put_receive_buffer(info, response);
+ put_receive_buffer(sc, response);
break;
}
- ret++;
+ atomic_inc(&sc->recv_io.posted.count);
}
}
- spin_lock(&info->lock_new_credits_offered);
- info->new_credits_offered += ret;
- spin_unlock(&info->lock_new_credits_offered);
-
/* Promptly send an immediate packet as defined in [MS-SMBD] 3.1.1.1 */
- info->send_immediate = true;
- if (atomic_read(&info->receive_credits) <
- info->receive_credit_target - 1) {
- if (info->keep_alive_requested == KEEP_ALIVE_PENDING ||
- info->send_immediate) {
- log_keep_alive(INFO, "send an empty message\n");
- smbd_post_send_empty(info);
- }
+ if (atomic_read(&sc->recv_io.credits.count) <
+ sc->recv_io.credits.target - 1) {
+ log_keep_alive(INFO, "schedule send of an empty message\n");
+ queue_work(sc->workqueue, &sc->idle.immediate_work);
}
}
@@ -453,17 +649,23 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
struct smbdirect_recv_io *response =
container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe);
struct smbdirect_socket *sc = response->socket;
- struct smbd_connection *info =
- container_of(sc, struct smbd_connection, socket);
- int data_length = 0;
-
- log_rdma_recv(INFO, "response=0x%p type=%d wc status=%d wc opcode %d byte_len=%d pkey_index=%u\n",
- response, sc->recv_io.expected, wc->status, wc->opcode,
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+ u16 old_recv_credit_target;
+ u32 data_offset = 0;
+ u32 data_length = 0;
+ u32 remaining_data_length = 0;
+ bool negotiate_done = false;
+
+ log_rdma_recv(INFO,
+ "response=0x%p type=%d wc status=%s wc opcode %d byte_len=%d pkey_index=%u\n",
+ response, sc->recv_io.expected,
+ ib_wc_status_msg(wc->status), wc->opcode,
wc->byte_len, wc->pkey_index);
if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) {
- log_rdma_recv(INFO, "wc->status=%d opcode=%d\n",
- wc->status, wc->opcode);
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ log_rdma_recv(ERR, "wc->status=%s opcode=%d\n",
+ ib_wc_status_msg(wc->status), wc->opcode);
goto error;
}
@@ -473,21 +675,52 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
response->sge.length,
DMA_FROM_DEVICE);
+ /*
+ * Reset timer to the keepalive interval in
+ * order to trigger our next keepalive message.
+ */
+ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
+ mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
+ msecs_to_jiffies(sp->keepalive_interval_msec));
+
switch (sc->recv_io.expected) {
/* SMBD negotiation response */
case SMBDIRECT_EXPECT_NEGOTIATE_REP:
dump_smbdirect_negotiate_resp(smbdirect_recv_io_payload(response));
sc->recv_io.reassembly.full_packet_received = true;
- info->negotiate_done =
+ negotiate_done =
process_negotiation_response(response, wc->byte_len);
- put_receive_buffer(info, response);
- complete(&info->negotiate_completion);
+ put_receive_buffer(sc, response);
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING);
+ if (!negotiate_done) {
+ sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
+ smbd_disconnect_rdma_connection(sc);
+ } else {
+ sc->status = SMBDIRECT_SOCKET_CONNECTED;
+ wake_up(&sc->status_wait);
+ }
+
return;
/* SMBD data transfer packet */
case SMBDIRECT_EXPECT_DATA_TRANSFER:
data_transfer = smbdirect_recv_io_payload(response);
+
+ if (wc->byte_len <
+ offsetof(struct smbdirect_data_transfer, padding))
+ goto error;
+
+ remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length);
+ data_offset = le32_to_cpu(data_transfer->data_offset);
data_length = le32_to_cpu(data_transfer->data_length);
+ if (wc->byte_len < data_offset ||
+ (u64)wc->byte_len < (u64)data_offset + data_length)
+ goto error;
+
+ if (remaining_data_length > sp->max_fragmented_recv_size ||
+ data_length > sp->max_fragmented_recv_size ||
+ (u64)remaining_data_length + (u64)data_length > (u64)sp->max_fragmented_recv_size)
+ goto error;
if (data_length) {
if (sc->recv_io.reassembly.full_packet_received)
@@ -499,17 +732,23 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
sc->recv_io.reassembly.full_packet_received = true;
}
- atomic_dec(&info->receive_credits);
- info->receive_credit_target =
+ atomic_dec(&sc->recv_io.posted.count);
+ atomic_dec(&sc->recv_io.credits.count);
+ old_recv_credit_target = sc->recv_io.credits.target;
+ sc->recv_io.credits.target =
le16_to_cpu(data_transfer->credits_requested);
+ sc->recv_io.credits.target =
+ min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max);
+ sc->recv_io.credits.target =
+ max_t(u16, sc->recv_io.credits.target, 1);
if (le16_to_cpu(data_transfer->credits_granted)) {
atomic_add(le16_to_cpu(data_transfer->credits_granted),
- &info->send_credits);
+ &sc->send_io.credits.count);
/*
* We have new send credits granted from remote peer
* If any sender is waiting for credits, unblock it
*/
- wake_up_interruptible(&info->wait_send_queue);
+ wake_up(&sc->send_io.credits.wait_queue);
}
log_incoming(INFO, "data flags %d data_offset %d data_length %d remaining_data_length %d\n",
@@ -518,11 +757,11 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
le32_to_cpu(data_transfer->data_length),
le32_to_cpu(data_transfer->remaining_data_length));
- /* Send a KEEP_ALIVE response right away if requested */
- info->keep_alive_requested = KEEP_ALIVE_NONE;
+ /* Send an immediate response right away if requested */
if (le16_to_cpu(data_transfer->flags) &
SMBDIRECT_FLAG_RESPONSE_REQUESTED) {
- info->keep_alive_requested = KEEP_ALIVE_PENDING;
+ log_keep_alive(INFO, "schedule send of immediate response\n");
+ queue_work(sc->workqueue, &sc->idle.immediate_work);
}
/*
@@ -530,10 +769,13 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
* reassembly queue and wake up the reading thread
*/
if (data_length) {
- enqueue_reassembly(info, response, data_length);
- wake_up_interruptible(&sc->recv_io.reassembly.wait_queue);
+ if (sc->recv_io.credits.target > old_recv_credit_target)
+ queue_work(sc->workqueue, &sc->recv_io.posted.refill_work);
+
+ enqueue_reassembly(sc, response, data_length);
+ wake_up(&sc->recv_io.reassembly.wait_queue);
} else
- put_receive_buffer(info, response);
+ put_receive_buffer(sc, response);
return;
@@ -548,19 +790,20 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
log_rdma_recv(ERR, "unexpected response type=%d\n", sc->recv_io.expected);
WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_DATA_TRANSFER);
error:
- put_receive_buffer(info, response);
- smbd_disconnect_rdma_connection(info);
+ put_receive_buffer(sc, response);
+ smbd_disconnect_rdma_connection(sc);
}
static struct rdma_cm_id *smbd_create_id(
- struct smbd_connection *info,
+ struct smbdirect_socket *sc,
struct sockaddr *dstaddr, int port)
{
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
struct rdma_cm_id *id;
int rc;
__be16 *sport;
- id = rdma_create_id(&init_net, smbd_conn_upcall, info,
+ id = rdma_create_id(&init_net, smbd_conn_upcall, sc,
RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(id)) {
rc = PTR_ERR(id);
@@ -575,43 +818,57 @@ static struct rdma_cm_id *smbd_create_id(
*sport = htons(port);
- init_completion(&info->ri_done);
- info->ri_rc = -ETIMEDOUT;
-
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED);
+ sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING;
rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)dstaddr,
- RDMA_RESOLVE_TIMEOUT);
+ sp->resolve_addr_timeout_msec);
if (rc) {
log_rdma_event(ERR, "rdma_resolve_addr() failed %i\n", rc);
goto out;
}
- rc = wait_for_completion_interruptible_timeout(
- &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT));
+ rc = wait_event_interruptible_timeout(
+ sc->status_wait,
+ sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING,
+ msecs_to_jiffies(sp->resolve_addr_timeout_msec));
/* e.g. if interrupted returns -ERESTARTSYS */
if (rc < 0) {
log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc);
goto out;
}
- rc = info->ri_rc;
- if (rc) {
+ if (sc->status == SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING) {
+ rc = -ETIMEDOUT;
+ log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc);
+ goto out;
+ }
+ if (sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED) {
+ rc = -EHOSTUNREACH;
log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc);
goto out;
}
- info->ri_rc = -ETIMEDOUT;
- rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED);
+ sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING;
+ rc = rdma_resolve_route(id, sp->resolve_route_timeout_msec);
if (rc) {
log_rdma_event(ERR, "rdma_resolve_route() failed %i\n", rc);
goto out;
}
- rc = wait_for_completion_interruptible_timeout(
- &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT));
+ rc = wait_event_interruptible_timeout(
+ sc->status_wait,
+ sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING,
+ msecs_to_jiffies(sp->resolve_route_timeout_msec));
/* e.g. if interrupted returns -ERESTARTSYS */
if (rc < 0) {
log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc);
goto out;
}
- rc = info->ri_rc;
- if (rc) {
+ if (sc->status == SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING) {
+ rc = -ETIMEDOUT;
+ log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc);
+ goto out;
+ }
+ if (sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED) {
+ rc = -ENETUNREACH;
log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc);
goto out;
}
@@ -638,13 +895,16 @@ static bool frwr_is_supported(struct ib_device_attr *attrs)
}
static int smbd_ia_open(
- struct smbd_connection *info,
+ struct smbdirect_socket *sc,
struct sockaddr *dstaddr, int port)
{
- struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
int rc;
- sc->rdma.cm_id = smbd_create_id(info, dstaddr, port);
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED);
+ sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED;
+
+ sc->rdma.cm_id = smbd_create_id(sc, dstaddr, port);
if (IS_ERR(sc->rdma.cm_id)) {
rc = PTR_ERR(sc->rdma.cm_id);
goto out1;
@@ -659,19 +919,12 @@ static int smbd_ia_open(
rc = -EPROTONOSUPPORT;
goto out2;
}
- info->max_frmr_depth = min_t(int,
- smbd_max_frmr_depth,
+ sp->max_frmr_depth = min_t(u32,
+ sp->max_frmr_depth,
sc->ib.dev->attrs.max_fast_reg_page_list_len);
- info->mr_type = IB_MR_TYPE_MEM_REG;
+ sc->mr_io.type = IB_MR_TYPE_MEM_REG;
if (sc->ib.dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
- info->mr_type = IB_MR_TYPE_SG_GAPS;
-
- sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0);
- if (IS_ERR(sc->ib.pd)) {
- rc = PTR_ERR(sc->ib.pd);
- log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc);
- goto out2;
- }
+ sc->mr_io.type = IB_MR_TYPE_SG_GAPS;
return 0;
@@ -689,9 +942,8 @@ out1:
* After negotiation, the transport is connected and ready for
* carrying upper layer SMB payload
*/
-static int smbd_post_send_negotiate_req(struct smbd_connection *info)
+static int smbd_post_send_negotiate_req(struct smbdirect_socket *sc)
{
- struct smbdirect_socket *sc = &info->socket;
struct smbdirect_socket_parameters *sp = &sc->parameters;
struct ib_send_wr send_wr;
int rc = -ENOMEM;
@@ -743,18 +995,18 @@ static int smbd_post_send_negotiate_req(struct smbd_connection *info)
request->sge[0].addr,
request->sge[0].length, request->sge[0].lkey);
- atomic_inc(&info->send_pending);
+ atomic_inc(&sc->send_io.pending.count);
rc = ib_post_send(sc->ib.qp, &send_wr, NULL);
if (!rc)
return 0;
/* if we reach here, post send failed */
log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
- atomic_dec(&info->send_pending);
+ atomic_dec(&sc->send_io.pending.count);
ib_dma_unmap_single(sc->ib.dev, request->sge[0].addr,
request->sge[0].length, DMA_TO_DEVICE);
- smbd_disconnect_rdma_connection(info);
+ smbd_disconnect_rdma_connection(sc);
dma_mapping_failed:
mempool_free(request, sc->send_io.mem.pool);
@@ -769,14 +1021,20 @@ dma_mapping_failed:
* buffer as possible, and extend the receive credits to remote peer
* return value: the new credits being granted.
*/
-static int manage_credits_prior_sending(struct smbd_connection *info)
+static int manage_credits_prior_sending(struct smbdirect_socket *sc)
{
int new_credits;
- spin_lock(&info->lock_new_credits_offered);
- new_credits = info->new_credits_offered;
- info->new_credits_offered = 0;
- spin_unlock(&info->lock_new_credits_offered);
+ if (atomic_read(&sc->recv_io.credits.count) >= sc->recv_io.credits.target)
+ return 0;
+
+ new_credits = atomic_read(&sc->recv_io.posted.count);
+ if (new_credits == 0)
+ return 0;
+
+ new_credits -= atomic_read(&sc->recv_io.credits.count);
+ if (new_credits <= 0)
+ return 0;
return new_credits;
}
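A plain-integer sketch of the credit math implemented by manage_credits_prior_sending() above (atomics and socket state replaced by hypothetical function parameters):

#include <stdio.h>

/*
 * Grant the receives we have posted but not yet announced, and only
 * while the peer is still below its requested credit target.
 */
static int credits_to_grant(int posted, int granted, int target)
{
	int new_credits;

	if (granted >= target)
		return 0;
	new_credits = posted;
	if (new_credits == 0)
		return 0;
	new_credits -= granted;
	if (new_credits <= 0)
		return 0;
	return new_credits;
}

int main(void)
{
	printf("%d\n", credits_to_grant(10, 4, 16));	/* 6 */
	printf("%d\n", credits_to_grant(10, 16, 16));	/* 0: already at target */
	return 0;
}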
@@ -790,21 +1048,27 @@ static int manage_credits_prior_sending(struct smbd_connection *info)
* 1 if SMBDIRECT_FLAG_RESPONSE_REQUESTED needs to be set
* 0: otherwise
*/
-static int manage_keep_alive_before_sending(struct smbd_connection *info)
+static int manage_keep_alive_before_sending(struct smbdirect_socket *sc)
{
- if (info->keep_alive_requested == KEEP_ALIVE_PENDING) {
- info->keep_alive_requested = KEEP_ALIVE_SENT;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+
+ if (sc->idle.keepalive == SMBDIRECT_KEEPALIVE_PENDING) {
+ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_SENT;
+ /*
+ * Now use the keepalive timeout (instead of keepalive interval)
+ * in order to wait for a response
+ */
+ mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
+ msecs_to_jiffies(sp->keepalive_timeout_msec));
return 1;
}
return 0;
}
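The keepalive transition above can be read as a tiny state machine: PENDING becomes SENT and the timer is re-armed with the shorter response timeout. A hedged standalone sketch with invented names:

#include <stdbool.h>
#include <stdio.h>

enum keepalive_state { KA_NONE, KA_PENDING, KA_SENT };

struct idle_state {
	enum keepalive_state keepalive;
	unsigned int next_timer_msec;	/* when the idle timer fires next */
};

/*
 * If a keepalive is pending, mark it sent, re-arm the timer with the
 * (shorter) response timeout and tell the caller to request a response.
 */
static bool keepalive_before_send(struct idle_state *s, unsigned int timeout_msec)
{
	if (s->keepalive == KA_PENDING) {
		s->keepalive = KA_SENT;
		s->next_timer_msec = timeout_msec;
		return true;
	}
	return false;
}

int main(void)
{
	struct idle_state s = { .keepalive = KA_PENDING, .next_timer_msec = 120000 };

	printf("request response: %d, next timer in %u msec\n",
	       keepalive_before_send(&s, 5000), s.next_timer_msec);
	return 0;
}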
/* Post the send request */
-static int smbd_post_send(struct smbd_connection *info,
+static int smbd_post_send(struct smbdirect_socket *sc,
struct smbdirect_send_io *request)
{
- struct smbdirect_socket *sc = &info->socket;
- struct smbdirect_socket_parameters *sp = &sc->parameters;
struct ib_send_wr send_wr;
int rc, i;
@@ -831,21 +1095,17 @@ static int smbd_post_send(struct smbd_connection *info,
rc = ib_post_send(sc->ib.qp, &send_wr, NULL);
if (rc) {
log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
- smbd_disconnect_rdma_connection(info);
+ smbd_disconnect_rdma_connection(sc);
rc = -EAGAIN;
- } else
- /* Reset timer for idle connection after packet is sent */
- mod_delayed_work(info->workqueue, &info->idle_timer_work,
- msecs_to_jiffies(sp->keepalive_interval_msec));
+ }
return rc;
}
-static int smbd_post_send_iter(struct smbd_connection *info,
+static int smbd_post_send_iter(struct smbdirect_socket *sc,
struct iov_iter *iter,
int *_remaining_data_length)
{
- struct smbdirect_socket *sc = &info->socket;
struct smbdirect_socket_parameters *sp = &sc->parameters;
int i, rc;
int header_length;
@@ -856,8 +1116,8 @@ static int smbd_post_send_iter(struct smbd_connection *info,
wait_credit:
/* Wait for send credits. A SMBD packet needs one credit */
- rc = wait_event_interruptible(info->wait_send_queue,
- atomic_read(&info->send_credits) > 0 ||
+ rc = wait_event_interruptible(sc->send_io.credits.wait_queue,
+ atomic_read(&sc->send_io.credits.count) > 0 ||
sc->status != SMBDIRECT_SOCKET_CONNECTED);
if (rc)
goto err_wait_credit;
@@ -867,14 +1127,14 @@ wait_credit:
rc = -EAGAIN;
goto err_wait_credit;
}
- if (unlikely(atomic_dec_return(&info->send_credits) < 0)) {
- atomic_inc(&info->send_credits);
+ if (unlikely(atomic_dec_return(&sc->send_io.credits.count) < 0)) {
+ atomic_inc(&sc->send_io.credits.count);
goto wait_credit;
}
wait_send_queue:
- wait_event(info->wait_post_send,
- atomic_read(&info->send_pending) < sp->send_credit_target ||
+ wait_event(sc->send_io.pending.dec_wait_queue,
+ atomic_read(&sc->send_io.pending.count) < sp->send_credit_target ||
sc->status != SMBDIRECT_SOCKET_CONNECTED);
if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
@@ -883,9 +1143,9 @@ wait_send_queue:
goto err_wait_send_queue;
}
- if (unlikely(atomic_inc_return(&info->send_pending) >
+ if (unlikely(atomic_inc_return(&sc->send_io.pending.count) >
sp->send_credit_target)) {
- atomic_dec(&info->send_pending);
+ atomic_dec(&sc->send_io.pending.count);
goto wait_send_queue;
}
@@ -898,10 +1158,30 @@ wait_send_queue:
request->socket = sc;
memset(request->sge, 0, sizeof(request->sge));
+ /* Map the packet to DMA */
+ header_length = sizeof(struct smbdirect_data_transfer);
+ /* If this is a packet without payload, don't send padding */
+ if (!iter)
+ header_length = offsetof(struct smbdirect_data_transfer, padding);
+
+ packet = smbdirect_send_io_payload(request);
+ request->sge[0].addr = ib_dma_map_single(sc->ib.dev,
+ (void *)packet,
+ header_length,
+ DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) {
+ rc = -EIO;
+ goto err_dma;
+ }
+
+ request->sge[0].length = header_length;
+ request->sge[0].lkey = sc->ib.pd->local_dma_lkey;
+ request->num_sge = 1;
+
/* Fill in the data payload to find out how much data we can add */
if (iter) {
struct smb_extract_to_rdma extract = {
- .nr_sge = 1,
+ .nr_sge = request->num_sge,
.max_sge = SMBDIRECT_SEND_IO_MAX_SGE,
.sge = request->sge,
.device = sc->ib.dev,
@@ -920,21 +1200,17 @@ wait_send_queue:
*_remaining_data_length -= data_length;
} else {
data_length = 0;
- request->num_sge = 1;
}
/* Fill in the packet header */
- packet = smbdirect_send_io_payload(request);
packet->credits_requested = cpu_to_le16(sp->send_credit_target);
- new_credits = manage_credits_prior_sending(info);
- atomic_add(new_credits, &info->receive_credits);
+ new_credits = manage_credits_prior_sending(sc);
+ atomic_add(new_credits, &sc->recv_io.credits.count);
packet->credits_granted = cpu_to_le16(new_credits);
- info->send_immediate = false;
-
packet->flags = 0;
- if (manage_keep_alive_before_sending(info))
+ if (manage_keep_alive_before_sending(sc))
packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED);
packet->reserved = 0;
@@ -953,26 +1229,7 @@ wait_send_queue:
le32_to_cpu(packet->data_length),
le32_to_cpu(packet->remaining_data_length));
- /* Map the packet to DMA */
- header_length = sizeof(struct smbdirect_data_transfer);
- /* If this is a packet without payload, don't send padding */
- if (!data_length)
- header_length = offsetof(struct smbdirect_data_transfer, padding);
-
- request->sge[0].addr = ib_dma_map_single(sc->ib.dev,
- (void *)packet,
- header_length,
- DMA_TO_DEVICE);
- if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) {
- rc = -EIO;
- request->sge[0].addr = 0;
- goto err_dma;
- }
-
- request->sge[0].length = header_length;
- request->sge[0].lkey = sc->ib.pd->local_dma_lkey;
-
- rc = smbd_post_send(info, request);
+ rc = smbd_post_send(sc, request);
if (!rc)
return 0;
@@ -985,19 +1242,16 @@ err_dma:
DMA_TO_DEVICE);
mempool_free(request, sc->send_io.mem.pool);
- /* roll back receive credits and credits to be offered */
- spin_lock(&info->lock_new_credits_offered);
- info->new_credits_offered += new_credits;
- spin_unlock(&info->lock_new_credits_offered);
- atomic_sub(new_credits, &info->receive_credits);
+ /* roll back the granted receive credits */
+ atomic_sub(new_credits, &sc->recv_io.credits.count);
err_alloc:
- if (atomic_dec_and_test(&info->send_pending))
- wake_up(&info->wait_send_pending);
+ if (atomic_dec_and_test(&sc->send_io.pending.count))
+ wake_up(&sc->send_io.pending.zero_wait_queue);
err_wait_send_queue:
/* roll back send credits and pending */
- atomic_inc(&info->send_credits);
+ atomic_inc(&sc->send_io.credits.count);
err_wait_credit:
return rc;
@@ -1008,15 +1262,15 @@ err_wait_credit:
* Empty message is used to extend credits to the peer for keep alive
* while there is no upper layer payload to send at the time
*/
-static int smbd_post_send_empty(struct smbd_connection *info)
+static int smbd_post_send_empty(struct smbdirect_socket *sc)
{
int remaining_data_length = 0;
- info->count_send_empty++;
- return smbd_post_send_iter(info, NULL, &remaining_data_length);
+ sc->statistics.send_empty++;
+ return smbd_post_send_iter(sc, NULL, &remaining_data_length);
}
-static int smbd_post_send_full_iter(struct smbd_connection *info,
+static int smbd_post_send_full_iter(struct smbdirect_socket *sc,
struct iov_iter *iter,
int *_remaining_data_length)
{
@@ -1029,7 +1283,7 @@ static int smbd_post_send_full_iter(struct smbd_connection *info,
*/
while (iov_iter_count(iter) > 0) {
- rc = smbd_post_send_iter(info, iter, _remaining_data_length);
+ rc = smbd_post_send_iter(sc, iter, _remaining_data_length);
if (rc < 0)
break;
}
@@ -1043,9 +1297,8 @@ static int smbd_post_send_full_iter(struct smbd_connection *info,
* The interaction is controlled by send/receive credit system
*/
static int smbd_post_recv(
- struct smbd_connection *info, struct smbdirect_recv_io *response)
+ struct smbdirect_socket *sc, struct smbdirect_recv_io *response)
{
- struct smbdirect_socket *sc = &info->socket;
struct smbdirect_socket_parameters *sp = &sc->parameters;
struct ib_recv_wr recv_wr;
int rc = -EIO;
@@ -1071,7 +1324,7 @@ static int smbd_post_recv(
ib_dma_unmap_single(sc->ib.dev, response->sge.addr,
response->sge.length, DMA_FROM_DEVICE);
response->sge.length = 0;
- smbd_disconnect_rdma_connection(info);
+ smbd_disconnect_rdma_connection(sc);
log_rdma_recv(ERR, "ib_post_recv failed rc=%d\n", rc);
}
@@ -1079,31 +1332,36 @@ static int smbd_post_recv(
}
/* Perform SMBD negotiate according to [MS-SMBD] 3.1.5.2 */
-static int smbd_negotiate(struct smbd_connection *info)
+static int smbd_negotiate(struct smbdirect_socket *sc)
{
- struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
int rc;
- struct smbdirect_recv_io *response = get_receive_buffer(info);
+ struct smbdirect_recv_io *response = get_receive_buffer(sc);
+
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED);
+ sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING;
sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REP;
- rc = smbd_post_recv(info, response);
+ rc = smbd_post_recv(sc, response);
log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=0x%llx iov.length=%u iov.lkey=0x%x\n",
rc, response->sge.addr,
response->sge.length, response->sge.lkey);
- if (rc)
+ if (rc) {
+ put_receive_buffer(sc, response);
return rc;
+ }
- init_completion(&info->negotiate_completion);
- info->negotiate_done = false;
- rc = smbd_post_send_negotiate_req(info);
+ rc = smbd_post_send_negotiate_req(sc);
if (rc)
return rc;
- rc = wait_for_completion_interruptible_timeout(
- &info->negotiate_completion, SMBD_NEGOTIATE_TIMEOUT * HZ);
- log_rdma_event(INFO, "wait_for_completion_timeout rc=%d\n", rc);
+ rc = wait_event_interruptible_timeout(
+ sc->status_wait,
+ sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING,
+ msecs_to_jiffies(sp->negotiate_timeout_msec));
+ log_rdma_event(INFO, "wait_event_interruptible_timeout rc=%d\n", rc);
- if (info->negotiate_done)
+ if (sc->status == SMBDIRECT_SOCKET_CONNECTED)
return 0;
if (rc == 0)
@@ -1127,13 +1385,13 @@ static int smbd_negotiate(struct smbd_connection *info)
* data_length: the size of payload in this packet
*/
static void enqueue_reassembly(
- struct smbd_connection *info,
+ struct smbdirect_socket *sc,
struct smbdirect_recv_io *response,
int data_length)
{
- struct smbdirect_socket *sc = &info->socket;
+ unsigned long flags;
- spin_lock(&sc->recv_io.reassembly.lock);
+ spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
list_add_tail(&response->list, &sc->recv_io.reassembly.list);
sc->recv_io.reassembly.queue_length++;
/*
@@ -1144,9 +1402,8 @@ static void enqueue_reassembly(
*/
virt_wmb();
sc->recv_io.reassembly.data_length += data_length;
- spin_unlock(&sc->recv_io.reassembly.lock);
- info->count_reassembly_queue++;
- info->count_enqueue_reassembly_queue++;
+ spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
+ sc->statistics.enqueue_reassembly_queue++;
}
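enqueue_reassembly() relies on publish ordering: the queue is updated first, then data_length, so a lock-free reader that observes the new length also observes the queued entry. A loose userspace analogue using C11 release/acquire in place of virt_wmb() (not the kernel's exact locking):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct reassembly {
	pthread_mutex_t lock;
	int queue_length;		/* protected by lock */
	_Atomic int data_length;	/* read locklessly by the consumer */
};

static void enqueue(struct reassembly *r, int len)
{
	pthread_mutex_lock(&r->lock);
	r->queue_length++;		/* stands in for list_add_tail() */
	pthread_mutex_unlock(&r->lock);
	/* the release store plays the role of virt_wmb() in the kernel code */
	atomic_fetch_add_explicit(&r->data_length, len, memory_order_release);
}

int main(void)
{
	struct reassembly r = { 0 };

	pthread_mutex_init(&r.lock, NULL);
	enqueue(&r, 4096);
	printf("data_length=%d queue_length=%d\n",
	       atomic_load_explicit(&r.data_length, memory_order_acquire),
	       r.queue_length);
	pthread_mutex_destroy(&r.lock);
	return 0;
}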
/*
@@ -1154,9 +1411,8 @@ static void enqueue_reassembly(
* Caller is responsible for locking
* return value: the first entry if any, NULL if queue is empty
*/
-static struct smbdirect_recv_io *_get_first_reassembly(struct smbd_connection *info)
+static struct smbdirect_recv_io *_get_first_reassembly(struct smbdirect_socket *sc)
{
- struct smbdirect_socket *sc = &info->socket;
struct smbdirect_recv_io *ret = NULL;
if (!list_empty(&sc->recv_io.reassembly.list)) {
@@ -1173,9 +1429,8 @@ static struct smbdirect_recv_io *_get_first_reassembly(struct smbd_connection *i
* pre-allocated in advance.
* return value: the receive buffer, NULL if none is available
*/
-static struct smbdirect_recv_io *get_receive_buffer(struct smbd_connection *info)
+static struct smbdirect_recv_io *get_receive_buffer(struct smbdirect_socket *sc)
{
- struct smbdirect_socket *sc = &info->socket;
struct smbdirect_recv_io *ret = NULL;
unsigned long flags;
@@ -1185,8 +1440,7 @@ static struct smbdirect_recv_io *get_receive_buffer(struct smbd_connection *info
&sc->recv_io.free.list,
struct smbdirect_recv_io, list);
list_del(&ret->list);
- info->count_receive_queue--;
- info->count_get_receive_buffer++;
+ sc->statistics.get_receive_buffer++;
}
spin_unlock_irqrestore(&sc->recv_io.free.lock, flags);
@@ -1200,9 +1454,8 @@ static struct smbdirect_recv_io *get_receive_buffer(struct smbd_connection *info
* receive buffer is returned.
*/
static void put_receive_buffer(
- struct smbd_connection *info, struct smbdirect_recv_io *response)
+ struct smbdirect_socket *sc, struct smbdirect_recv_io *response)
{
- struct smbdirect_socket *sc = &info->socket;
unsigned long flags;
if (likely(response->sge.length != 0)) {
@@ -1215,31 +1468,18 @@ static void put_receive_buffer(
spin_lock_irqsave(&sc->recv_io.free.lock, flags);
list_add_tail(&response->list, &sc->recv_io.free.list);
- info->count_receive_queue++;
- info->count_put_receive_buffer++;
+ sc->statistics.put_receive_buffer++;
spin_unlock_irqrestore(&sc->recv_io.free.lock, flags);
- queue_work(info->workqueue, &info->post_send_credits_work);
+ queue_work(sc->workqueue, &sc->recv_io.posted.refill_work);
}
/* Preallocate all receive buffer on transport establishment */
-static int allocate_receive_buffers(struct smbd_connection *info, int num_buf)
+static int allocate_receive_buffers(struct smbdirect_socket *sc, int num_buf)
{
- struct smbdirect_socket *sc = &info->socket;
struct smbdirect_recv_io *response;
int i;
- INIT_LIST_HEAD(&sc->recv_io.reassembly.list);
- spin_lock_init(&sc->recv_io.reassembly.lock);
- sc->recv_io.reassembly.data_length = 0;
- sc->recv_io.reassembly.queue_length = 0;
-
- INIT_LIST_HEAD(&sc->recv_io.free.list);
- spin_lock_init(&sc->recv_io.free.lock);
- info->count_receive_queue = 0;
-
- init_waitqueue_head(&info->wait_receive_queues);
-
for (i = 0; i < num_buf; i++) {
response = mempool_alloc(sc->recv_io.mem.pool, GFP_KERNEL);
if (!response)
@@ -1248,7 +1488,6 @@ static int allocate_receive_buffers(struct smbd_connection *info, int num_buf)
response->socket = sc;
response->sge.length = 0;
list_add_tail(&response->list, &sc->recv_io.free.list);
- info->count_receive_queue++;
}
return 0;
@@ -1259,45 +1498,59 @@ allocate_failed:
&sc->recv_io.free.list,
struct smbdirect_recv_io, list);
list_del(&response->list);
- info->count_receive_queue--;
mempool_free(response, sc->recv_io.mem.pool);
}
return -ENOMEM;
}
-static void destroy_receive_buffers(struct smbd_connection *info)
+static void destroy_receive_buffers(struct smbdirect_socket *sc)
{
- struct smbdirect_socket *sc = &info->socket;
struct smbdirect_recv_io *response;
- while ((response = get_receive_buffer(info)))
+ while ((response = get_receive_buffer(sc)))
mempool_free(response, sc->recv_io.mem.pool);
}
+static void send_immediate_empty_message(struct work_struct *work)
+{
+ struct smbdirect_socket *sc =
+ container_of(work, struct smbdirect_socket, idle.immediate_work);
+
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
+ return;
+
+ log_keep_alive(INFO, "send an empty message\n");
+ smbd_post_send_empty(sc);
+}
+
/* Implement idle connection timer [MS-SMBD] 3.1.6.2 */
static void idle_connection_timer(struct work_struct *work)
{
- struct smbd_connection *info = container_of(
- work, struct smbd_connection,
- idle_timer_work.work);
- struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket *sc =
+ container_of(work, struct smbdirect_socket, idle.timer_work.work);
struct smbdirect_socket_parameters *sp = &sc->parameters;
- if (info->keep_alive_requested != KEEP_ALIVE_NONE) {
+ if (sc->idle.keepalive != SMBDIRECT_KEEPALIVE_NONE) {
log_keep_alive(ERR,
- "error status info->keep_alive_requested=%d\n",
- info->keep_alive_requested);
- smbd_disconnect_rdma_connection(info);
+ "error status sc->idle.keepalive=%d\n",
+ sc->idle.keepalive);
+ smbd_disconnect_rdma_connection(sc);
return;
}
- log_keep_alive(INFO, "about to send an empty idle message\n");
- smbd_post_send_empty(info);
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
+ return;
- /* Setup the next idle timeout work */
- queue_delayed_work(info->workqueue, &info->idle_timer_work,
- msecs_to_jiffies(sp->keepalive_interval_msec));
+ /*
+ * Now use the keepalive timeout (instead of keepalive interval)
+ * in order to wait for a response
+ */
+ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
+ mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
+ msecs_to_jiffies(sp->keepalive_timeout_msec));
+ log_keep_alive(INFO, "schedule send of empty idle message\n");
+ queue_work(sc->workqueue, &sc->idle.immediate_work);
}
/*
@@ -1309,7 +1562,6 @@ void smbd_destroy(struct TCP_Server_Info *server)
{
struct smbd_connection *info = server->smbd_conn;
struct smbdirect_socket *sc;
- struct smbdirect_socket_parameters *sp;
struct smbdirect_recv_io *response;
unsigned long flags;
@@ -1318,39 +1570,51 @@ void smbd_destroy(struct TCP_Server_Info *server)
return;
}
sc = &info->socket;
- sp = &sc->parameters;
+
+ log_rdma_event(INFO, "cancelling and disable disconnect_work\n");
+ disable_work_sync(&sc->disconnect_work);
log_rdma_event(INFO, "destroying rdma session\n");
- if (sc->status != SMBDIRECT_SOCKET_DISCONNECTED) {
- rdma_disconnect(sc->rdma.cm_id);
+ if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING) {
+ smbd_disconnect_rdma_work(&sc->disconnect_work);
log_rdma_event(INFO, "wait for transport being disconnected\n");
wait_event_interruptible(
- info->status_wait,
+ sc->status_wait,
sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
}
+ /*
+ * Wake up all waiters in all wait queues
+ * in order to notice the broken connection.
+ *
+ * Most likely this was already called via
+ * smbd_disconnect_rdma_work(), but call it again...
+ */
+ smbd_disconnect_wake_up_all(sc);
+
+ log_rdma_event(INFO, "cancelling recv_io.posted.refill_work\n");
+ disable_work_sync(&sc->recv_io.posted.refill_work);
+
log_rdma_event(INFO, "destroying qp\n");
ib_drain_qp(sc->ib.qp);
rdma_destroy_qp(sc->rdma.cm_id);
sc->ib.qp = NULL;
log_rdma_event(INFO, "cancelling idle timer\n");
- cancel_delayed_work_sync(&info->idle_timer_work);
-
- log_rdma_event(INFO, "wait for all send posted to IB to finish\n");
- wait_event(info->wait_send_pending,
- atomic_read(&info->send_pending) == 0);
+ disable_delayed_work_sync(&sc->idle.timer_work);
+ log_rdma_event(INFO, "cancelling send immediate work\n");
+ disable_work_sync(&sc->idle.immediate_work);
/* It's not possible for upper layer to get to reassembly */
log_rdma_event(INFO, "drain the reassembly queue\n");
do {
spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
- response = _get_first_reassembly(info);
+ response = _get_first_reassembly(sc);
if (response) {
list_del(&response->list);
spin_unlock_irqrestore(
&sc->recv_io.reassembly.lock, flags);
- put_receive_buffer(info, response);
+ put_receive_buffer(sc, response);
} else
spin_unlock_irqrestore(
&sc->recv_io.reassembly.lock, flags);
@@ -1358,9 +1622,7 @@ void smbd_destroy(struct TCP_Server_Info *server)
sc->recv_io.reassembly.data_length = 0;
log_rdma_event(INFO, "free receive buffers\n");
- wait_event(info->wait_receive_queues,
- info->count_receive_queue == sp->recv_credit_max);
- destroy_receive_buffers(info);
+ destroy_receive_buffers(sc);
/*
* For performance reasons, memory registration and deregistration
@@ -1370,13 +1632,12 @@ void smbd_destroy(struct TCP_Server_Info *server)
* path when sending data, and then release memory registrations.
*/
log_rdma_event(INFO, "freeing mr list\n");
- wake_up_interruptible_all(&info->wait_mr);
- while (atomic_read(&info->mr_used_count)) {
+ while (atomic_read(&sc->mr_io.used.count)) {
cifs_server_unlock(server);
msleep(1000);
cifs_server_lock(server);
}
- destroy_mr_list(info);
+ destroy_mr_list(sc);
ib_free_cq(sc->ib.send_cq);
ib_free_cq(sc->ib.recv_cq);
@@ -1392,7 +1653,7 @@ void smbd_destroy(struct TCP_Server_Info *server)
sc->status = SMBDIRECT_SOCKET_DESTROYED;
- destroy_workqueue(info->workqueue);
+ destroy_workqueue(sc->workqueue);
log_rdma_event(INFO, "rdma session destroyed\n");
kfree(info);
server->smbd_conn = NULL;
@@ -1434,12 +1695,9 @@ create_conn:
return -ENOENT;
}
-static void destroy_caches_and_workqueue(struct smbd_connection *info)
+static void destroy_caches(struct smbdirect_socket *sc)
{
- struct smbdirect_socket *sc = &info->socket;
-
- destroy_receive_buffers(info);
- destroy_workqueue(info->workqueue);
+ destroy_receive_buffers(sc);
mempool_destroy(sc->recv_io.mem.pool);
kmem_cache_destroy(sc->recv_io.mem.cache);
mempool_destroy(sc->send_io.mem.pool);
@@ -1447,9 +1705,8 @@ static void destroy_caches_and_workqueue(struct smbd_connection *info)
}
#define MAX_NAME_LEN 80
-static int allocate_caches_and_workqueue(struct smbd_connection *info)
+static int allocate_caches(struct smbdirect_socket *sc)
{
- struct smbdirect_socket *sc = &info->socket;
struct smbdirect_socket_parameters *sp = &sc->parameters;
char name[MAX_NAME_LEN];
int rc;
@@ -1457,7 +1714,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info)
if (WARN_ON_ONCE(sp->max_recv_size < sizeof(struct smbdirect_data_transfer)))
return -ENOMEM;
- scnprintf(name, MAX_NAME_LEN, "smbdirect_send_io_%p", info);
+ scnprintf(name, MAX_NAME_LEN, "smbdirect_send_io_%p", sc);
sc->send_io.mem.cache =
kmem_cache_create(
name,
@@ -1473,7 +1730,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info)
if (!sc->send_io.mem.pool)
goto out1;
- scnprintf(name, MAX_NAME_LEN, "smbdirect_recv_io_%p", info);
+ scnprintf(name, MAX_NAME_LEN, "smbdirect_recv_io_%p", sc);
struct kmem_cache_args response_args = {
.align = __alignof__(struct smbdirect_recv_io),
@@ -1494,21 +1751,14 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info)
if (!sc->recv_io.mem.pool)
goto out3;
- scnprintf(name, MAX_NAME_LEN, "smbd_%p", info);
- info->workqueue = create_workqueue(name);
- if (!info->workqueue)
- goto out4;
-
- rc = allocate_receive_buffers(info, sp->recv_credit_max);
+ rc = allocate_receive_buffers(sc, sp->recv_credit_max);
if (rc) {
log_rdma_event(ERR, "failed to allocate receive buffers\n");
- goto out5;
+ goto out4;
}
return 0;
-out5:
- destroy_workqueue(info->workqueue);
out4:
mempool_destroy(sc->recv_io.mem.pool);
out3:
@@ -1532,46 +1782,63 @@ static struct smbd_connection *_smbd_get_connection(
struct ib_qp_init_attr qp_attr;
struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr;
struct ib_port_immutable port_immutable;
- u32 ird_ord_hdr[2];
+ __be32 ird_ord_hdr[2];
+ char wq_name[80];
+ struct workqueue_struct *workqueue;
info = kzalloc(sizeof(struct smbd_connection), GFP_KERNEL);
if (!info)
return NULL;
sc = &info->socket;
+ scnprintf(wq_name, ARRAY_SIZE(wq_name), "smbd_%p", sc);
+ workqueue = create_workqueue(wq_name);
+ if (!workqueue)
+ goto create_wq_failed;
+ smbdirect_socket_init(sc);
+ sc->workqueue = workqueue;
sp = &sc->parameters;
- sc->status = SMBDIRECT_SOCKET_CONNECTING;
- rc = smbd_ia_open(info, dstaddr, port);
+ INIT_WORK(&sc->disconnect_work, smbd_disconnect_rdma_work);
+
+ sp->resolve_addr_timeout_msec = RDMA_RESOLVE_TIMEOUT;
+ sp->resolve_route_timeout_msec = RDMA_RESOLVE_TIMEOUT;
+ sp->rdma_connect_timeout_msec = RDMA_RESOLVE_TIMEOUT;
+ sp->negotiate_timeout_msec = SMBD_NEGOTIATE_TIMEOUT * 1000;
+ sp->initiator_depth = 1;
+ sp->responder_resources = SMBD_CM_RESPONDER_RESOURCES;
+ sp->recv_credit_max = smbd_receive_credit_max;
+ sp->send_credit_target = smbd_send_credit_target;
+ sp->max_send_size = smbd_max_send_size;
+ sp->max_fragmented_recv_size = smbd_max_fragmented_recv_size;
+ sp->max_recv_size = smbd_max_receive_size;
+ sp->max_frmr_depth = smbd_max_frmr_depth;
+ sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000;
+ sp->keepalive_timeout_msec = KEEPALIVE_RECV_TIMEOUT * 1000;
+
+ rc = smbd_ia_open(sc, dstaddr, port);
if (rc) {
log_rdma_event(INFO, "smbd_ia_open rc=%d\n", rc);
goto create_id_failed;
}
- if (smbd_send_credit_target > sc->ib.dev->attrs.max_cqe ||
- smbd_send_credit_target > sc->ib.dev->attrs.max_qp_wr) {
+ if (sp->send_credit_target > sc->ib.dev->attrs.max_cqe ||
+ sp->send_credit_target > sc->ib.dev->attrs.max_qp_wr) {
log_rdma_event(ERR, "consider lowering send_credit_target = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
- smbd_send_credit_target,
+ sp->send_credit_target,
sc->ib.dev->attrs.max_cqe,
sc->ib.dev->attrs.max_qp_wr);
goto config_failed;
}
- if (smbd_receive_credit_max > sc->ib.dev->attrs.max_cqe ||
- smbd_receive_credit_max > sc->ib.dev->attrs.max_qp_wr) {
+ if (sp->recv_credit_max > sc->ib.dev->attrs.max_cqe ||
+ sp->recv_credit_max > sc->ib.dev->attrs.max_qp_wr) {
log_rdma_event(ERR, "consider lowering receive_credit_max = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
- smbd_receive_credit_max,
+ sp->recv_credit_max,
sc->ib.dev->attrs.max_cqe,
sc->ib.dev->attrs.max_qp_wr);
goto config_failed;
}
- sp->recv_credit_max = smbd_receive_credit_max;
- sp->send_credit_target = smbd_send_credit_target;
- sp->max_send_size = smbd_max_send_size;
- sp->max_fragmented_recv_size = smbd_max_fragmented_recv_size;
- sp->max_recv_size = smbd_max_receive_size;
- sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000;
-
if (sc->ib.dev->attrs.max_send_sge < SMBDIRECT_SEND_IO_MAX_SGE ||
sc->ib.dev->attrs.max_recv_sge < SMBDIRECT_RECV_IO_MAX_SGE) {
log_rdma_event(ERR,
@@ -1583,8 +1850,16 @@ static struct smbd_connection *_smbd_get_connection(
goto config_failed;
}
+ sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0);
+ if (IS_ERR(sc->ib.pd)) {
+ rc = PTR_ERR(sc->ib.pd);
+ sc->ib.pd = NULL;
+ log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc);
+ goto alloc_pd_failed;
+ }
+
sc->ib.send_cq =
- ib_alloc_cq_any(sc->ib.dev, info,
+ ib_alloc_cq_any(sc->ib.dev, sc,
sp->send_credit_target, IB_POLL_SOFTIRQ);
if (IS_ERR(sc->ib.send_cq)) {
sc->ib.send_cq = NULL;
@@ -1592,7 +1867,7 @@ static struct smbd_connection *_smbd_get_connection(
}
sc->ib.recv_cq =
- ib_alloc_cq_any(sc->ib.dev, info,
+ ib_alloc_cq_any(sc->ib.dev, sc,
sp->recv_credit_max, IB_POLL_SOFTIRQ);
if (IS_ERR(sc->ib.recv_cq)) {
sc->ib.recv_cq = NULL;
@@ -1601,7 +1876,7 @@ static struct smbd_connection *_smbd_get_connection(
memset(&qp_attr, 0, sizeof(qp_attr));
qp_attr.event_handler = smbd_qp_async_error_upcall;
- qp_attr.qp_context = info;
+ qp_attr.qp_context = sc;
qp_attr.cap.max_send_wr = sp->send_credit_target;
qp_attr.cap.max_recv_wr = sp->recv_credit_max;
qp_attr.cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE;
@@ -1620,22 +1895,22 @@ static struct smbd_connection *_smbd_get_connection(
}
sc->ib.qp = sc->rdma.cm_id->qp;
- memset(&conn_param, 0, sizeof(conn_param));
- conn_param.initiator_depth = 0;
-
- conn_param.responder_resources =
- min(sc->ib.dev->attrs.max_qp_rd_atom,
- SMBD_CM_RESPONDER_RESOURCES);
- info->responder_resources = conn_param.responder_resources;
+ sp->responder_resources =
+ min_t(u8, sp->responder_resources,
+ sc->ib.dev->attrs.max_qp_rd_atom);
log_rdma_mr(INFO, "responder_resources=%d\n",
- info->responder_resources);
+ sp->responder_resources);
+
+ memset(&conn_param, 0, sizeof(conn_param));
+ conn_param.initiator_depth = sp->initiator_depth;
+ conn_param.responder_resources = sp->responder_resources;
/* Need to send IRD/ORD in private data for iWARP */
sc->ib.dev->ops.get_port_immutable(
sc->ib.dev, sc->rdma.cm_id->port_num, &port_immutable);
if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) {
- ird_ord_hdr[0] = info->responder_resources;
- ird_ord_hdr[1] = 1;
+ ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources);
+ ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth);
conn_param.private_data = ird_ord_hdr;
conn_param.private_data_len = sizeof(ird_ord_hdr);
} else {
@@ -1650,8 +1925,8 @@ static struct smbd_connection *_smbd_get_connection(
log_rdma_event(INFO, "connecting to IP %pI4 port %d\n",
&addr_in->sin_addr, port);
- init_waitqueue_head(&info->status_wait);
- init_waitqueue_head(&sc->recv_io.reassembly.wait_queue);
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED);
+ sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING;
rc = rdma_connect(sc->rdma.cm_id, &conn_param);
if (rc) {
log_rdma_event(ERR, "rdma_connect() failed with %i\n", rc);
@@ -1659,45 +1934,42 @@ static struct smbd_connection *_smbd_get_connection(
}
wait_event_interruptible_timeout(
- info->status_wait,
- sc->status != SMBDIRECT_SOCKET_CONNECTING,
- msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT));
+ sc->status_wait,
+ sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING,
+ msecs_to_jiffies(sp->rdma_connect_timeout_msec));
- if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
+ if (sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED) {
log_rdma_event(ERR, "rdma_connect failed port=%d\n", port);
goto rdma_connect_failed;
}
log_rdma_event(INFO, "rdma_connect connected\n");
- rc = allocate_caches_and_workqueue(info);
+ rc = allocate_caches(sc);
if (rc) {
log_rdma_event(ERR, "cache allocation failed\n");
goto allocate_cache_failed;
}
- init_waitqueue_head(&info->wait_send_queue);
- INIT_DELAYED_WORK(&info->idle_timer_work, idle_connection_timer);
- queue_delayed_work(info->workqueue, &info->idle_timer_work,
- msecs_to_jiffies(sp->keepalive_interval_msec));
-
- init_waitqueue_head(&info->wait_send_pending);
- atomic_set(&info->send_pending, 0);
-
- init_waitqueue_head(&info->wait_post_send);
+ INIT_WORK(&sc->idle.immediate_work, send_immediate_empty_message);
+ INIT_DELAYED_WORK(&sc->idle.timer_work, idle_connection_timer);
+ /*
+ * Start with the negotiate timeout and SMBDIRECT_KEEPALIVE_PENDING
+ * so that the timer causes a disconnect if negotiation does not complete in time.
+ */
+ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
+ mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
+ msecs_to_jiffies(sp->negotiate_timeout_msec));
- INIT_WORK(&info->disconnect_work, smbd_disconnect_rdma_work);
- INIT_WORK(&info->post_send_credits_work, smbd_post_send_credits);
- info->new_credits_offered = 0;
- spin_lock_init(&info->lock_new_credits_offered);
+ INIT_WORK(&sc->recv_io.posted.refill_work, smbd_post_send_credits);
- rc = smbd_negotiate(info);
+ rc = smbd_negotiate(sc);
if (rc) {
log_rdma_event(ERR, "smbd_negotiate rc=%d\n", rc);
goto negotiation_failed;
}
- rc = allocate_mr_list(info);
+ rc = allocate_mr_list(sc);
if (rc) {
log_rdma_mr(ERR, "memory registration allocation failed\n");
goto allocate_mr_failed;
@@ -1712,11 +1984,11 @@ allocate_mr_failed:
return NULL;
negotiation_failed:
- cancel_delayed_work_sync(&info->idle_timer_work);
- destroy_caches_and_workqueue(info);
+ disable_delayed_work_sync(&sc->idle.timer_work);
+ destroy_caches(sc);
sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
rdma_disconnect(sc->rdma.cm_id);
- wait_event(info->status_wait,
+ wait_event(sc->status_wait,
sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
allocate_cache_failed:
@@ -1730,11 +2002,15 @@ alloc_cq_failed:
if (sc->ib.recv_cq)
ib_free_cq(sc->ib.recv_cq);
-config_failed:
ib_dealloc_pd(sc->ib.pd);
+
+alloc_pd_failed:
+config_failed:
rdma_destroy_id(sc->rdma.cm_id);
create_id_failed:
+ destroy_workqueue(sc->workqueue);
+create_wq_failed:
kfree(info);
return NULL;
}
@@ -1743,6 +2019,7 @@ struct smbd_connection *smbd_get_connection(
struct TCP_Server_Info *server, struct sockaddr *dstaddr)
{
struct smbd_connection *ret;
+ const struct smbdirect_socket_parameters *sp;
int port = SMBD_PORT;
try_again:
@@ -1753,6 +2030,16 @@ try_again:
port = SMB_PORT;
goto try_again;
}
+ if (!ret)
+ return NULL;
+
+ sp = &ret->socket.parameters;
+
+ server->rdma_readwrite_threshold =
+ rdma_readwrite_threshold > sp->max_fragmented_send_size ?
+ sp->max_fragmented_send_size :
+ rdma_readwrite_threshold;
+
return ret;
}
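
The ternary assigned to server->rdma_readwrite_threshold above is a plain clamp against the negotiated max_fragmented_send_size. A trivial standalone sketch of the same clamp (hypothetical helper name, shown only to make the intent of the ternary explicit):

#include <stdio.h>

static unsigned int clamp_threshold(unsigned int threshold,
				    unsigned int max_fragmented_send_size)
{
	return threshold > max_fragmented_send_size ?
	       max_fragmented_send_size : threshold;
}

int main(void)
{
	printf("%u\n", clamp_threshold(4096, 1048576));	/* unchanged */
	printf("%u\n", clamp_threshold(8u << 20, 1048576));	/* clamped */
	return 0;
}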
@@ -1794,6 +2081,7 @@ again:
if (sc->recv_io.reassembly.data_length >= size) {
int queue_length;
int queue_removed = 0;
+ unsigned long flags;
/*
* Need to make sure reassembly_data_length is read before
@@ -1808,7 +2096,7 @@ again:
to_read = size;
offset = sc->recv_io.reassembly.first_entry_offset;
while (data_read < size) {
- response = _get_first_reassembly(info);
+ response = _get_first_reassembly(sc);
data_transfer = smbdirect_recv_io_payload(response);
data_length = le32_to_cpu(data_transfer->data_length);
remaining_data_length =
@@ -1853,16 +2141,15 @@ again:
if (queue_length)
list_del(&response->list);
else {
- spin_lock_irq(
- &sc->recv_io.reassembly.lock);
+ spin_lock_irqsave(
+ &sc->recv_io.reassembly.lock, flags);
list_del(&response->list);
- spin_unlock_irq(
- &sc->recv_io.reassembly.lock);
+ spin_unlock_irqrestore(
+ &sc->recv_io.reassembly.lock, flags);
}
queue_removed++;
- info->count_reassembly_queue--;
- info->count_dequeue_reassembly_queue++;
- put_receive_buffer(info, response);
+ sc->statistics.dequeue_reassembly_queue++;
+ put_receive_buffer(sc, response);
offset = 0;
log_read(INFO, "put_receive_buffer offset=0\n");
} else
@@ -1876,10 +2163,10 @@ again:
to_read, data_read, offset);
}
- spin_lock_irq(&sc->recv_io.reassembly.lock);
+ spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
sc->recv_io.reassembly.data_length -= data_read;
sc->recv_io.reassembly.queue_length -= queue_removed;
- spin_unlock_irq(&sc->recv_io.reassembly.lock);
+ spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
sc->recv_io.reassembly.first_entry_offset = offset;
log_read(INFO, "returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n",
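
The smbd_recv() hunks above copy "size" bytes out of a queue of received segments, remembering how far into the first remaining segment the previous call stopped (first_entry_offset). A simplified standalone model of that bookkeeping (dequeuing of fully consumed segments is omitted; names are illustrative, not the driver's):

#include <stdio.h>

/* Consume 'size' bytes from an array of segment lengths, carrying the
 * offset into the first segment across calls.
 */
static int consume(const int *seg, int nseg, int *first_off, int size)
{
	int data_read = 0, offset = *first_off;

	for (int i = 0; i < nseg && data_read < size; i++) {
		int avail = seg[i] - offset;
		int to_read = avail < size - data_read ? avail : size - data_read;

		data_read += to_read;
		offset = (to_read == avail) ? 0 : offset + to_read;
	}
	*first_off = offset;
	return data_read;
}

int main(void)
{
	int segs[3] = { 4096, 4096, 4096 };
	int first_entry_offset = 0;
	int n = consume(segs, 3, &first_entry_offset, 6000);

	printf("data_read=%d first_entry_offset=%d\n", n, first_entry_offset);
	return 0;
}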
@@ -1964,13 +2251,13 @@ int smbd_send(struct TCP_Server_Info *server,
klen += rqst->rq_iov[i].iov_len;
iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen);
- rc = smbd_post_send_full_iter(info, &iter, &remaining_data_length);
+ rc = smbd_post_send_full_iter(sc, &iter, &remaining_data_length);
if (rc < 0)
break;
if (iov_iter_count(&rqst->rq_iter) > 0) {
/* And then the data pages if there are any */
- rc = smbd_post_send_full_iter(info, &rqst->rq_iter,
+ rc = smbd_post_send_full_iter(sc, &rqst->rq_iter,
&remaining_data_length);
if (rc < 0)
break;
@@ -1985,22 +2272,25 @@ int smbd_send(struct TCP_Server_Info *server,
* that means all the I/Os have been out and we are good to return
*/
- wait_event(info->wait_send_pending,
- atomic_read(&info->send_pending) == 0);
+ wait_event(sc->send_io.pending.zero_wait_queue,
+ atomic_read(&sc->send_io.pending.count) == 0 ||
+ sc->status != SMBDIRECT_SOCKET_CONNECTED);
+
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED && rc == 0)
+ rc = -EAGAIN;
return rc;
}
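
The new wait condition above lets smbd_send() return as soon as either all pending sends completed or the socket left the CONNECTED state, and maps a disconnect during the flush to -EAGAIN so the caller retries on a reconnected session. A small standalone sketch of that predicate and result mapping (hypothetical helper names):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static bool send_flush_done(int pending, bool connected)
{
	/* the wait_event() predicate: all sends done, or the socket dropped */
	return pending == 0 || !connected;
}

static int send_result(bool connected, int rc)
{
	/* a disconnect while flushing turns success into -EAGAIN */
	if (!connected && rc == 0)
		return -EAGAIN;
	return rc;
}

int main(void)
{
	printf("done=%d rc=%d\n", send_flush_done(0, true), send_result(true, 0));
	printf("done=%d rc=%d\n", send_flush_done(3, false), send_result(false, 0));
	return 0;
}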
static void register_mr_done(struct ib_cq *cq, struct ib_wc *wc)
{
- struct smbd_mr *mr;
- struct ib_cqe *cqe;
+ struct smbdirect_mr_io *mr =
+ container_of(wc->wr_cqe, struct smbdirect_mr_io, cqe);
+ struct smbdirect_socket *sc = mr->socket;
if (wc->status) {
log_rdma_mr(ERR, "status=%d\n", wc->status);
- cqe = wc->wr_cqe;
- mr = container_of(cqe, struct smbd_mr, cqe);
- smbd_disconnect_rdma_connection(mr->conn);
+ smbd_disconnect_rdma_connection(sc);
}
}
@@ -2015,14 +2305,14 @@ static void register_mr_done(struct ib_cq *cq, struct ib_wc *wc)
*/
static void smbd_mr_recovery_work(struct work_struct *work)
{
- struct smbd_connection *info =
- container_of(work, struct smbd_connection, mr_recovery_work);
- struct smbdirect_socket *sc = &info->socket;
- struct smbd_mr *smbdirect_mr;
+ struct smbdirect_socket *sc =
+ container_of(work, struct smbdirect_socket, mr_io.recovery_work);
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct smbdirect_mr_io *smbdirect_mr;
int rc;
- list_for_each_entry(smbdirect_mr, &info->mr_list, list) {
- if (smbdirect_mr->state == MR_ERROR) {
+ list_for_each_entry(smbdirect_mr, &sc->mr_io.all.list, list) {
+ if (smbdirect_mr->state == SMBDIRECT_MR_ERROR) {
/* recover this MR entry */
rc = ib_dereg_mr(smbdirect_mr->mr);
@@ -2030,25 +2320,25 @@ static void smbd_mr_recovery_work(struct work_struct *work)
log_rdma_mr(ERR,
"ib_dereg_mr failed rc=%x\n",
rc);
- smbd_disconnect_rdma_connection(info);
+ smbd_disconnect_rdma_connection(sc);
continue;
}
smbdirect_mr->mr = ib_alloc_mr(
- sc->ib.pd, info->mr_type,
- info->max_frmr_depth);
+ sc->ib.pd, sc->mr_io.type,
+ sp->max_frmr_depth);
if (IS_ERR(smbdirect_mr->mr)) {
log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n",
- info->mr_type,
- info->max_frmr_depth);
- smbd_disconnect_rdma_connection(info);
+ sc->mr_io.type,
+ sp->max_frmr_depth);
+ smbd_disconnect_rdma_connection(sc);
continue;
}
} else
/* This MR is being used, don't recover it */
continue;
- smbdirect_mr->state = MR_READY;
+ smbdirect_mr->state = SMBDIRECT_MR_READY;
/* smbdirect_mr->state is updated by this function
* and is read and updated by I/O issuing CPUs trying
@@ -2057,19 +2347,18 @@ static void smbd_mr_recovery_work(struct work_struct *work)
* value is updated before waking up any calls to
* get_mr() from the I/O issuing CPUs
*/
- if (atomic_inc_return(&info->mr_ready_count) == 1)
- wake_up_interruptible(&info->wait_mr);
+ if (atomic_inc_return(&sc->mr_io.ready.count) == 1)
+ wake_up(&sc->mr_io.ready.wait_queue);
}
}
-static void destroy_mr_list(struct smbd_connection *info)
+static void destroy_mr_list(struct smbdirect_socket *sc)
{
- struct smbdirect_socket *sc = &info->socket;
- struct smbd_mr *mr, *tmp;
+ struct smbdirect_mr_io *mr, *tmp;
- cancel_work_sync(&info->mr_recovery_work);
- list_for_each_entry_safe(mr, tmp, &info->mr_list, list) {
- if (mr->state == MR_INVALIDATED)
+ disable_work_sync(&sc->mr_io.recovery_work);
+ list_for_each_entry_safe(mr, tmp, &sc->mr_io.all.list, list) {
+ if (mr->state == SMBDIRECT_MR_INVALIDATED)
ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl,
mr->sgt.nents, mr->dir);
ib_dereg_mr(mr->mr);
@@ -2085,32 +2374,32 @@ static void destroy_mr_list(struct smbd_connection *info)
* Recovery is done in smbd_mr_recovery_work. The content of list entry changes
* as MRs are used and recovered for I/O, but the list links will not change
*/
-static int allocate_mr_list(struct smbd_connection *info)
+static int allocate_mr_list(struct smbdirect_socket *sc)
{
- struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
int i;
- struct smbd_mr *smbdirect_mr, *tmp;
-
- INIT_LIST_HEAD(&info->mr_list);
- init_waitqueue_head(&info->wait_mr);
- spin_lock_init(&info->mr_list_lock);
- atomic_set(&info->mr_ready_count, 0);
- atomic_set(&info->mr_used_count, 0);
- init_waitqueue_head(&info->wait_for_mr_cleanup);
- INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work);
+ struct smbdirect_mr_io *smbdirect_mr, *tmp;
+
+ INIT_WORK(&sc->mr_io.recovery_work, smbd_mr_recovery_work);
+
+ if (sp->responder_resources == 0) {
+ log_rdma_mr(ERR, "responder_resources negotiated as 0\n");
+ return -EINVAL;
+ }
+
/* Allocate more MRs (2x) than hardware responder_resources */
- for (i = 0; i < info->responder_resources * 2; i++) {
+ for (i = 0; i < sp->responder_resources * 2; i++) {
smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL);
if (!smbdirect_mr)
goto cleanup_entries;
- smbdirect_mr->mr = ib_alloc_mr(sc->ib.pd, info->mr_type,
- info->max_frmr_depth);
+ smbdirect_mr->mr = ib_alloc_mr(sc->ib.pd, sc->mr_io.type,
+ sp->max_frmr_depth);
if (IS_ERR(smbdirect_mr->mr)) {
log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n",
- info->mr_type, info->max_frmr_depth);
+ sc->mr_io.type, sp->max_frmr_depth);
goto out;
}
- smbdirect_mr->sgt.sgl = kcalloc(info->max_frmr_depth,
+ smbdirect_mr->sgt.sgl = kcalloc(sp->max_frmr_depth,
sizeof(struct scatterlist),
GFP_KERNEL);
if (!smbdirect_mr->sgt.sgl) {
@@ -2118,18 +2407,18 @@ static int allocate_mr_list(struct smbd_connection *info)
ib_dereg_mr(smbdirect_mr->mr);
goto out;
}
- smbdirect_mr->state = MR_READY;
- smbdirect_mr->conn = info;
+ smbdirect_mr->state = SMBDIRECT_MR_READY;
+ smbdirect_mr->socket = sc;
- list_add_tail(&smbdirect_mr->list, &info->mr_list);
- atomic_inc(&info->mr_ready_count);
+ list_add_tail(&smbdirect_mr->list, &sc->mr_io.all.list);
+ atomic_inc(&sc->mr_io.ready.count);
}
return 0;
out:
kfree(smbdirect_mr);
cleanup_entries:
- list_for_each_entry_safe(smbdirect_mr, tmp, &info->mr_list, list) {
+ list_for_each_entry_safe(smbdirect_mr, tmp, &sc->mr_io.all.list, list) {
list_del(&smbdirect_mr->list);
ib_dereg_mr(smbdirect_mr->mr);
kfree(smbdirect_mr->sgt.sgl);
@@ -2146,14 +2435,14 @@ cleanup_entries:
* issuing I/O trying to get MR at the same time, mr_list_lock is used to
* protect this situation.
*/
-static struct smbd_mr *get_mr(struct smbd_connection *info)
+static struct smbdirect_mr_io *get_mr(struct smbdirect_socket *sc)
{
- struct smbdirect_socket *sc = &info->socket;
- struct smbd_mr *ret;
+ struct smbdirect_mr_io *ret;
+ unsigned long flags;
int rc;
again:
- rc = wait_event_interruptible(info->wait_mr,
- atomic_read(&info->mr_ready_count) ||
+ rc = wait_event_interruptible(sc->mr_io.ready.wait_queue,
+ atomic_read(&sc->mr_io.ready.count) ||
sc->status != SMBDIRECT_SOCKET_CONNECTED);
if (rc) {
log_rdma_mr(ERR, "wait_event_interruptible rc=%x\n", rc);
@@ -2165,18 +2454,18 @@ again:
return NULL;
}
- spin_lock(&info->mr_list_lock);
- list_for_each_entry(ret, &info->mr_list, list) {
- if (ret->state == MR_READY) {
- ret->state = MR_REGISTERED;
- spin_unlock(&info->mr_list_lock);
- atomic_dec(&info->mr_ready_count);
- atomic_inc(&info->mr_used_count);
+ spin_lock_irqsave(&sc->mr_io.all.lock, flags);
+ list_for_each_entry(ret, &sc->mr_io.all.list, list) {
+ if (ret->state == SMBDIRECT_MR_READY) {
+ ret->state = SMBDIRECT_MR_REGISTERED;
+ spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);
+ atomic_dec(&sc->mr_io.ready.count);
+ atomic_inc(&sc->mr_io.used.count);
return ret;
}
}
- spin_unlock(&info->mr_list_lock);
+ spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);
/*
* It is possible that we could fail to get MR because other processes may
* try to acquire a MR at the same time. If this is the case, retry it.
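
get_mr() above sleeps until at least one MR is ready (or the socket disconnects), then scans mr_io.all.list under the lock for a SMBDIRECT_MR_READY entry, marks it REGISTERED and moves the ready/used counters. A simplified userspace model of that selection step, without the locking and wait queue (illustrative names only):

#include <stdio.h>

enum mr_state { MR_READY, MR_REGISTERED, MR_INVALIDATED, MR_ERROR };

struct mr { enum mr_state state; };

static struct mr *get_ready_mr(struct mr *pool, int n, int *ready, int *used)
{
	for (int i = 0; i < n; i++) {
		if (pool[i].state == MR_READY) {
			/* claim it and update the counters the waiters look at */
			pool[i].state = MR_REGISTERED;
			(*ready)--;
			(*used)++;
			return &pool[i];
		}
	}
	return NULL;	/* all busy: caller sleeps on the ready wait queue */
}

int main(void)
{
	struct mr pool[2] = { { MR_READY }, { MR_REGISTERED } };
	int ready = 1, used = 1;
	struct mr *mr = get_ready_mr(pool, 2, &ready, &used);

	printf("got=%d ready=%d used=%d\n", mr != NULL, ready, used);
	return 0;
}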
@@ -2187,8 +2476,7 @@ again:
/*
* Transcribe the pages from an iterator into an MR scatterlist.
*/
-static int smbd_iter_to_mr(struct smbd_connection *info,
- struct iov_iter *iter,
+static int smbd_iter_to_mr(struct iov_iter *iter,
struct sg_table *sgt,
unsigned int max_sg)
{
@@ -2210,25 +2498,26 @@ static int smbd_iter_to_mr(struct smbd_connection *info,
* need_invalidate: true if this MR needs to be locally invalidated after I/O
* return value: the MR registered, NULL if failed.
*/
-struct smbd_mr *smbd_register_mr(struct smbd_connection *info,
+struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info,
struct iov_iter *iter,
bool writing, bool need_invalidate)
{
struct smbdirect_socket *sc = &info->socket;
- struct smbd_mr *smbdirect_mr;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct smbdirect_mr_io *smbdirect_mr;
int rc, num_pages;
enum dma_data_direction dir;
struct ib_reg_wr *reg_wr;
- num_pages = iov_iter_npages(iter, info->max_frmr_depth + 1);
- if (num_pages > info->max_frmr_depth) {
+ num_pages = iov_iter_npages(iter, sp->max_frmr_depth + 1);
+ if (num_pages > sp->max_frmr_depth) {
log_rdma_mr(ERR, "num_pages=%d max_frmr_depth=%d\n",
- num_pages, info->max_frmr_depth);
+ num_pages, sp->max_frmr_depth);
WARN_ON_ONCE(1);
return NULL;
}
- smbdirect_mr = get_mr(info);
+ smbdirect_mr = get_mr(sc);
if (!smbdirect_mr) {
log_rdma_mr(ERR, "get_mr returning NULL\n");
return NULL;
@@ -2241,8 +2530,8 @@ struct smbd_mr *smbd_register_mr(struct smbd_connection *info,
smbdirect_mr->sgt.orig_nents = 0;
log_rdma_mr(INFO, "num_pages=0x%x count=0x%zx depth=%u\n",
- num_pages, iov_iter_count(iter), info->max_frmr_depth);
- smbd_iter_to_mr(info, iter, &smbdirect_mr->sgt, info->max_frmr_depth);
+ num_pages, iov_iter_count(iter), sp->max_frmr_depth);
+ smbd_iter_to_mr(iter, &smbdirect_mr->sgt, sp->max_frmr_depth);
rc = ib_dma_map_sg(sc->ib.dev, smbdirect_mr->sgt.sgl,
smbdirect_mr->sgt.nents, dir);
@@ -2287,32 +2576,32 @@ struct smbd_mr *smbd_register_mr(struct smbd_connection *info,
log_rdma_mr(ERR, "ib_post_send failed rc=%x reg_wr->key=%x\n",
rc, reg_wr->key);
- /* If all failed, attempt to recover this MR by setting it MR_ERROR*/
+ /* If all failed, attempt to recover this MR by setting it to SMBDIRECT_MR_ERROR */
map_mr_error:
ib_dma_unmap_sg(sc->ib.dev, smbdirect_mr->sgt.sgl,
smbdirect_mr->sgt.nents, smbdirect_mr->dir);
dma_map_error:
- smbdirect_mr->state = MR_ERROR;
- if (atomic_dec_and_test(&info->mr_used_count))
- wake_up(&info->wait_for_mr_cleanup);
+ smbdirect_mr->state = SMBDIRECT_MR_ERROR;
+ if (atomic_dec_and_test(&sc->mr_io.used.count))
+ wake_up(&sc->mr_io.cleanup.wait_queue);
- smbd_disconnect_rdma_connection(info);
+ smbd_disconnect_rdma_connection(sc);
return NULL;
}
static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc)
{
- struct smbd_mr *smbdirect_mr;
+ struct smbdirect_mr_io *smbdirect_mr;
struct ib_cqe *cqe;
cqe = wc->wr_cqe;
- smbdirect_mr = container_of(cqe, struct smbd_mr, cqe);
- smbdirect_mr->state = MR_INVALIDATED;
+ smbdirect_mr = container_of(cqe, struct smbdirect_mr_io, cqe);
+ smbdirect_mr->state = SMBDIRECT_MR_INVALIDATED;
if (wc->status != IB_WC_SUCCESS) {
log_rdma_mr(ERR, "invalidate failed status=%x\n", wc->status);
- smbdirect_mr->state = MR_ERROR;
+ smbdirect_mr->state = SMBDIRECT_MR_ERROR;
}
complete(&smbdirect_mr->invalidate_done);
}
@@ -2323,11 +2612,10 @@ static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc)
* and we have to locally invalidate the buffer to prevent data is being
* modified by remote peer after upper layer consumes it
*/
-int smbd_deregister_mr(struct smbd_mr *smbdirect_mr)
+int smbd_deregister_mr(struct smbdirect_mr_io *smbdirect_mr)
{
struct ib_send_wr *wr;
- struct smbd_connection *info = smbdirect_mr->conn;
- struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket *sc = smbdirect_mr->socket;
int rc = 0;
if (smbdirect_mr->need_invalidate) {
@@ -2344,36 +2632,36 @@ int smbd_deregister_mr(struct smbd_mr *smbdirect_mr)
rc = ib_post_send(sc->ib.qp, wr, NULL);
if (rc) {
log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc);
- smbd_disconnect_rdma_connection(info);
+ smbd_disconnect_rdma_connection(sc);
goto done;
}
wait_for_completion(&smbdirect_mr->invalidate_done);
smbdirect_mr->need_invalidate = false;
} else
/*
- * For remote invalidation, just set it to MR_INVALIDATED
+ * For remote invalidation, just set it to SMBDIRECT_MR_INVALIDATED
* and defer to mr_recovery_work to recover the MR for next use
*/
- smbdirect_mr->state = MR_INVALIDATED;
+ smbdirect_mr->state = SMBDIRECT_MR_INVALIDATED;
- if (smbdirect_mr->state == MR_INVALIDATED) {
+ if (smbdirect_mr->state == SMBDIRECT_MR_INVALIDATED) {
ib_dma_unmap_sg(
sc->ib.dev, smbdirect_mr->sgt.sgl,
smbdirect_mr->sgt.nents,
smbdirect_mr->dir);
- smbdirect_mr->state = MR_READY;
- if (atomic_inc_return(&info->mr_ready_count) == 1)
- wake_up_interruptible(&info->wait_mr);
+ smbdirect_mr->state = SMBDIRECT_MR_READY;
+ if (atomic_inc_return(&sc->mr_io.ready.count) == 1)
+ wake_up(&sc->mr_io.ready.wait_queue);
} else
/*
* Schedule the work to do MR recovery for future I/Os MR
* recovery is slow and don't want it to block current I/O
*/
- queue_work(info->workqueue, &info->mr_recovery_work);
+ queue_work(sc->workqueue, &sc->mr_io.recovery_work);
done:
- if (atomic_dec_and_test(&info->mr_used_count))
- wake_up(&info->wait_for_mr_cleanup);
+ if (atomic_dec_and_test(&sc->mr_io.used.count))
+ wake_up(&sc->mr_io.cleanup.wait_queue);
return rc;
}
diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h
index e45aa9ddd71d..d67ac5ddaff4 100644
--- a/fs/smb/client/smbdirect.h
+++ b/fs/smb/client/smbdirect.h
@@ -27,12 +27,6 @@ extern int smbd_max_send_size;
extern int smbd_send_credit_target;
extern int smbd_receive_credit_max;
-enum keep_alive_status {
- KEEP_ALIVE_NONE,
- KEEP_ALIVE_PENDING,
- KEEP_ALIVE_SENT,
-};
-
/*
* The context for the SMBDirect transport
* Everything related to the transport is here. It has several logical parts
@@ -44,79 +38,14 @@ enum keep_alive_status {
*/
struct smbd_connection {
struct smbdirect_socket socket;
-
- int ri_rc;
- struct completion ri_done;
- wait_queue_head_t status_wait;
-
- struct completion negotiate_completion;
- bool negotiate_done;
-
- struct work_struct disconnect_work;
- struct work_struct post_send_credits_work;
-
- spinlock_t lock_new_credits_offered;
- int new_credits_offered;
-
- /* dynamic connection parameters defined in [MS-SMBD] 3.1.1.1 */
- enum keep_alive_status keep_alive_requested;
- int protocol;
- atomic_t send_credits;
- atomic_t receive_credits;
- int receive_credit_target;
-
- /* Memory registrations */
- /* Maximum number of RDMA read/write outstanding on this connection */
- int responder_resources;
- /* Maximum number of pages in a single RDMA write/read on this connection */
- int max_frmr_depth;
- /*
- * If payload is less than or equal to the threshold,
- * use RDMA send/recv to send upper layer I/O.
- * If payload is more than the threshold,
- * use RDMA read/write through memory registration for I/O.
- */
- int rdma_readwrite_threshold;
- enum ib_mr_type mr_type;
- struct list_head mr_list;
- spinlock_t mr_list_lock;
- /* The number of available MRs ready for memory registration */
- atomic_t mr_ready_count;
- atomic_t mr_used_count;
- wait_queue_head_t wait_mr;
- struct work_struct mr_recovery_work;
- /* Used by transport to wait until all MRs are returned */
- wait_queue_head_t wait_for_mr_cleanup;
-
- /* Activity accounting */
- atomic_t send_pending;
- wait_queue_head_t wait_send_pending;
- wait_queue_head_t wait_post_send;
-
- /* Receive queue */
- int count_receive_queue;
- wait_queue_head_t wait_receive_queues;
-
- bool send_immediate;
-
- wait_queue_head_t wait_send_queue;
-
- struct workqueue_struct *workqueue;
- struct delayed_work idle_timer_work;
-
- /* for debug purposes */
- unsigned int count_get_receive_buffer;
- unsigned int count_put_receive_buffer;
- unsigned int count_reassembly_queue;
- unsigned int count_enqueue_reassembly_queue;
- unsigned int count_dequeue_reassembly_queue;
- unsigned int count_send_empty;
};
/* Create a SMBDirect session */
struct smbd_connection *smbd_get_connection(
struct TCP_Server_Info *server, struct sockaddr *dstaddr);
+const struct smbdirect_socket_parameters *smbd_get_parameters(struct smbd_connection *conn);
+
/* Reconnect SMBDirect session */
int smbd_reconnect(struct TCP_Server_Info *server);
/* Destroy SMBDirect session */
@@ -127,34 +56,11 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg);
int smbd_send(struct TCP_Server_Info *server,
int num_rqst, struct smb_rqst *rqst);
-enum mr_state {
- MR_READY,
- MR_REGISTERED,
- MR_INVALIDATED,
- MR_ERROR
-};
-
-struct smbd_mr {
- struct smbd_connection *conn;
- struct list_head list;
- enum mr_state state;
- struct ib_mr *mr;
- struct sg_table sgt;
- enum dma_data_direction dir;
- union {
- struct ib_reg_wr wr;
- struct ib_send_wr inv_wr;
- };
- struct ib_cqe cqe;
- bool need_invalidate;
- struct completion invalidate_done;
-};
-
/* Interfaces to register and deregister MR for RDMA read/write */
-struct smbd_mr *smbd_register_mr(
+struct smbdirect_mr_io *smbd_register_mr(
struct smbd_connection *info, struct iov_iter *iter,
bool writing, bool need_invalidate);
-int smbd_deregister_mr(struct smbd_mr *mr);
+int smbd_deregister_mr(struct smbdirect_mr_io *mr);
#else
#define cifs_rdma_enabled(server) 0
diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h
index 93e5b2bb9f28..fd650e2afc76 100644
--- a/fs/smb/client/trace.h
+++ b/fs/smb/client/trace.h
@@ -669,13 +669,12 @@ DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_info_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(posix_query_info_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(hardlink_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rename_enter);
-DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rmdir_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(unlink_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_reparse_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(get_reparse_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_wsl_ea_compound_enter);
-DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(tdis_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mknod_enter);
@@ -710,13 +709,12 @@ DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_info_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(posix_query_info_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(hardlink_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rename_done);
-DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rmdir_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(unlink_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_reparse_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(get_reparse_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_wsl_ea_compound_done);
-DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(tdis_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mknod_done);
@@ -756,14 +754,13 @@ DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_info_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(posix_query_info_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(hardlink_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rename_err);
-DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rmdir_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(unlink_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_reparse_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(get_reparse_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_wsl_ea_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err);
-DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(tdis_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mknod_err);
@@ -1171,8 +1168,54 @@ DEFINE_EVENT(smb3_lease_done_class, smb3_##name, \
__u64 lease_key_high), \
TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high))
-DEFINE_SMB3_LEASE_DONE_EVENT(lease_done);
-DEFINE_SMB3_LEASE_DONE_EVENT(lease_not_found);
+DEFINE_SMB3_LEASE_DONE_EVENT(lease_ack_done);
+/* Tracepoint when a lease break request is received/entered (includes epoch and flags) */
+DECLARE_EVENT_CLASS(smb3_lease_enter_class,
+ TP_PROTO(__u32 lease_state,
+ __u32 flags,
+ __u16 epoch,
+ __u32 tid,
+ __u64 sesid,
+ __u64 lease_key_low,
+ __u64 lease_key_high),
+ TP_ARGS(lease_state, flags, epoch, tid, sesid, lease_key_low, lease_key_high),
+ TP_STRUCT__entry(
+ __field(__u32, lease_state)
+ __field(__u32, flags)
+ __field(__u16, epoch)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u64, lease_key_low)
+ __field(__u64, lease_key_high)
+ ),
+ TP_fast_assign(
+ __entry->lease_state = lease_state;
+ __entry->flags = flags;
+ __entry->epoch = epoch;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->lease_key_low = lease_key_low;
+ __entry->lease_key_high = lease_key_high;
+ ),
+ TP_printk("sid=0x%llx tid=0x%x lease_key=0x%llx%llx lease_state=0x%x flags=0x%x epoch=%u",
+ __entry->sesid, __entry->tid, __entry->lease_key_high,
+ __entry->lease_key_low, __entry->lease_state, __entry->flags, __entry->epoch)
+)
+
+#define DEFINE_SMB3_LEASE_ENTER_EVENT(name) \
+DEFINE_EVENT(smb3_lease_enter_class, smb3_##name, \
+ TP_PROTO(__u32 lease_state, \
+ __u32 flags, \
+ __u16 epoch, \
+ __u32 tid, \
+ __u64 sesid, \
+ __u64 lease_key_low, \
+ __u64 lease_key_high), \
+ TP_ARGS(lease_state, flags, epoch, tid, sesid, lease_key_low, lease_key_high))
+
+DEFINE_SMB3_LEASE_ENTER_EVENT(lease_break_enter);
+/* Lease not found: reuse lease_enter payload (includes epoch and flags) */
+DEFINE_SMB3_LEASE_ENTER_EVENT(lease_not_found);
DECLARE_EVENT_CLASS(smb3_lease_err_class,
TP_PROTO(__u32 lease_state,
@@ -1213,7 +1256,7 @@ DEFINE_EVENT(smb3_lease_err_class, smb3_##name, \
int rc), \
TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high, rc))
-DEFINE_SMB3_LEASE_ERR_EVENT(lease_err);
+DEFINE_SMB3_LEASE_ERR_EVENT(lease_ack_err);
DECLARE_EVENT_CLASS(smb3_connect_class,
TP_PROTO(char *hostname,
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index 32d528b4dd83..a61ba7f3fb86 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -1005,15 +1005,14 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
cifs_server_dbg(FYI, "Cancelling wait for mid %llu cmd: %d\n",
midQ[i]->mid, le16_to_cpu(midQ[i]->command));
send_cancel(server, &rqst[i], midQ[i]);
- spin_lock(&server->mid_queue_lock);
+ spin_lock(&midQ[i]->mid_lock);
midQ[i]->wait_cancelled = true;
- if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED ||
- midQ[i]->mid_state == MID_RESPONSE_RECEIVED) {
+ if (midQ[i]->callback) {
midQ[i]->callback = cifs_cancelled_callback;
cancelled_mid[i] = true;
credits[i].value = 0;
}
- spin_unlock(&server->mid_queue_lock);
+ spin_unlock(&midQ[i]->mid_lock);
}
}
diff --git a/fs/smb/common/smbdirect/smbdirect.h b/fs/smb/common/smbdirect/smbdirect.h
index b9a385344ff3..05cc6a9d0ccd 100644
--- a/fs/smb/common/smbdirect/smbdirect.h
+++ b/fs/smb/common/smbdirect/smbdirect.h
@@ -23,6 +23,12 @@ struct smbdirect_buffer_descriptor_v1 {
* Some values are important for the upper layer.
*/
struct smbdirect_socket_parameters {
+ __u32 resolve_addr_timeout_msec;
+ __u32 resolve_route_timeout_msec;
+ __u32 rdma_connect_timeout_msec;
+ __u32 negotiate_timeout_msec;
+ __u8 initiator_depth;
+ __u8 responder_resources;
__u16 recv_credit_max;
__u16 send_credit_target;
__u32 max_send_size;
@@ -30,6 +36,7 @@ struct smbdirect_socket_parameters {
__u32 max_recv_size;
__u32 max_fragmented_recv_size;
__u32 max_read_write_size;
+ __u32 max_frmr_depth;
__u32 keepalive_interval_msec;
__u32 keepalive_timeout_msec;
} __packed;
diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h
index 3c4a8d627aa3..db22a1d0546b 100644
--- a/fs/smb/common/smbdirect/smbdirect_socket.h
+++ b/fs/smb/common/smbdirect/smbdirect_socket.h
@@ -6,22 +6,102 @@
#ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__
#define __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__
+#include <rdma/rw.h>
+
enum smbdirect_socket_status {
SMBDIRECT_SOCKET_CREATED,
- SMBDIRECT_SOCKET_CONNECTING,
- SMBDIRECT_SOCKET_CONNECTED,
+ SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED,
+ SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING,
+ SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED,
+ SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED,
+ SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING,
+ SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED,
+ SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED,
+ SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING,
+ SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED,
+ SMBDIRECT_SOCKET_NEGOTIATE_NEEDED,
+ SMBDIRECT_SOCKET_NEGOTIATE_RUNNING,
SMBDIRECT_SOCKET_NEGOTIATE_FAILED,
+ SMBDIRECT_SOCKET_CONNECTED,
+ SMBDIRECT_SOCKET_ERROR,
SMBDIRECT_SOCKET_DISCONNECTING,
SMBDIRECT_SOCKET_DISCONNECTED,
SMBDIRECT_SOCKET_DESTROYED
};
+static __always_inline
+const char *smbdirect_socket_status_string(enum smbdirect_socket_status status)
+{
+ switch (status) {
+ case SMBDIRECT_SOCKET_CREATED:
+ return "CREATED";
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
+ return "RESOLVE_ADDR_NEEDED";
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
+ return "RESOLVE_ADDR_RUNNING";
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
+ return "RESOLVE_ADDR_FAILED";
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
+ return "RESOLVE_ROUTE_NEEDED";
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
+ return "RESOLVE_ROUTE_RUNNING";
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
+ return "RESOLVE_ROUTE_FAILED";
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
+ return "RDMA_CONNECT_NEEDED";
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
+ return "RDMA_CONNECT_RUNNING";
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
+ return "RDMA_CONNECT_FAILED";
+ case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
+ return "NEGOTIATE_NEEDED";
+ case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
+ return "NEGOTIATE_RUNNING";
+ case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
+ return "NEGOTIATE_FAILED";
+ case SMBDIRECT_SOCKET_CONNECTED:
+ return "CONNECTED";
+ case SMBDIRECT_SOCKET_ERROR:
+ return "ERROR";
+ case SMBDIRECT_SOCKET_DISCONNECTING:
+ return "DISCONNECTING";
+ case SMBDIRECT_SOCKET_DISCONNECTED:
+ return "DISCONNECTED";
+ case SMBDIRECT_SOCKET_DESTROYED:
+ return "DESTROYED";
+ }
+
+ return "<unknown>";
+}
+
+enum smbdirect_keepalive_status {
+ SMBDIRECT_KEEPALIVE_NONE,
+ SMBDIRECT_KEEPALIVE_PENDING,
+ SMBDIRECT_KEEPALIVE_SENT
+};
+
struct smbdirect_socket {
enum smbdirect_socket_status status;
+ wait_queue_head_t status_wait;
+ int first_error;
+
+ /*
+ * This points to the workqueue to
+ * be used for this socket.
+ * It can be per socket (on the client)
+ * or point to a global workqueue (on the server)
+ */
+ struct workqueue_struct *workqueue;
+
+ struct work_struct disconnect_work;
/* RDMA related */
struct {
struct rdma_cm_id *cm_id;
+ /*
+ * This is for iWARP MPA v1
+ */
+ bool legacy_iwarp;
} rdma;
/* IB verbs related */
@@ -40,6 +120,15 @@ struct smbdirect_socket {
struct smbdirect_socket_parameters parameters;
/*
+ * The state for keepalive and timeout handling
+ */
+ struct {
+ enum smbdirect_keepalive_status keepalive;
+ struct work_struct immediate_work;
+ struct delayed_work timer_work;
+ } idle;
+
+ /*
* The state for posted send buffers
*/
struct {
@@ -51,6 +140,29 @@ struct smbdirect_socket {
struct kmem_cache *cache;
mempool_t *pool;
} mem;
+
+ /*
+ * The credit state for the send side
+ */
+ struct {
+ atomic_t count;
+ wait_queue_head_t wait_queue;
+ } credits;
+
+ /*
+ * The state about posted/pending sends
+ */
+ struct {
+ atomic_t count;
+ /*
+ * woken when count is decremented
+ */
+ wait_queue_head_t dec_wait_queue;
+ /*
+ * woken when count reached zero
+ */
+ wait_queue_head_t zero_wait_queue;
+ } pending;
} send_io;
/*
@@ -85,6 +197,23 @@ struct smbdirect_socket {
} free;
/*
+ * The state for posted recv_io messages
+ * and the refill work struct.
+ */
+ struct {
+ atomic_t count;
+ struct work_struct refill_work;
+ } posted;
+
+ /*
+ * The credit state for the recv side
+ */
+ struct {
+ u16 target;
+ atomic_t count;
+ } credits;
+
+ /*
* The list of arrived non-empty smbdirect_recv_io
* structures
*
@@ -110,8 +239,137 @@ struct smbdirect_socket {
bool full_packet_received;
} reassembly;
} recv_io;
+
+ /*
+ * The state for Memory registrations on the client
+ */
+ struct {
+ enum ib_mr_type type;
+
+ /*
+ * The list of all smbdirect_mr_io
+ * structures
+ */
+ struct {
+ struct list_head list;
+ spinlock_t lock;
+ } all;
+
+ /*
+ * The number of available MRs ready for memory registration
+ */
+ struct {
+ atomic_t count;
+ wait_queue_head_t wait_queue;
+ } ready;
+
+ /*
+ * The number of used MRs
+ */
+ struct {
+ atomic_t count;
+ } used;
+
+ struct work_struct recovery_work;
+
+ /* Used by transport to wait until all MRs are returned */
+ struct {
+ wait_queue_head_t wait_queue;
+ } cleanup;
+ } mr_io;
+
+ /*
+ * The state for RDMA read/write requests on the server
+ */
+ struct {
+ /*
+ * The credit state for RDMA read/write operations
+ */
+ struct {
+ /*
+ * The maximum number of rw credits
+ */
+ size_t max;
+ /*
+ * The number of pages per credit
+ */
+ size_t num_pages;
+ atomic_t count;
+ wait_queue_head_t wait_queue;
+ } credits;
+ } rw_io;
+
+ /*
+ * For debug purposes
+ */
+ struct {
+ u64 get_receive_buffer;
+ u64 put_receive_buffer;
+ u64 enqueue_reassembly_queue;
+ u64 dequeue_reassembly_queue;
+ u64 send_empty;
+ } statistics;
};
+static void __smbdirect_socket_disabled_work(struct work_struct *work)
+{
+ /*
+ * Should never be called as disable_[delayed_]work_sync() was used.
+ */
+ WARN_ON_ONCE(1);
+}
+
+static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc)
+{
+ /*
+ * This also sets status = SMBDIRECT_SOCKET_CREATED
+ */
+ BUILD_BUG_ON(SMBDIRECT_SOCKET_CREATED != 0);
+ memset(sc, 0, sizeof(*sc));
+
+ init_waitqueue_head(&sc->status_wait);
+
+ INIT_WORK(&sc->disconnect_work, __smbdirect_socket_disabled_work);
+ disable_work_sync(&sc->disconnect_work);
+
+ INIT_WORK(&sc->idle.immediate_work, __smbdirect_socket_disabled_work);
+ disable_work_sync(&sc->idle.immediate_work);
+ INIT_DELAYED_WORK(&sc->idle.timer_work, __smbdirect_socket_disabled_work);
+ disable_delayed_work_sync(&sc->idle.timer_work);
+
+ atomic_set(&sc->send_io.credits.count, 0);
+ init_waitqueue_head(&sc->send_io.credits.wait_queue);
+
+ atomic_set(&sc->send_io.pending.count, 0);
+ init_waitqueue_head(&sc->send_io.pending.dec_wait_queue);
+ init_waitqueue_head(&sc->send_io.pending.zero_wait_queue);
+
+ INIT_LIST_HEAD(&sc->recv_io.free.list);
+ spin_lock_init(&sc->recv_io.free.lock);
+
+ atomic_set(&sc->recv_io.posted.count, 0);
+ INIT_WORK(&sc->recv_io.posted.refill_work, __smbdirect_socket_disabled_work);
+ disable_work_sync(&sc->recv_io.posted.refill_work);
+
+ atomic_set(&sc->recv_io.credits.count, 0);
+
+ INIT_LIST_HEAD(&sc->recv_io.reassembly.list);
+ spin_lock_init(&sc->recv_io.reassembly.lock);
+ init_waitqueue_head(&sc->recv_io.reassembly.wait_queue);
+
+ atomic_set(&sc->rw_io.credits.count, 0);
+ init_waitqueue_head(&sc->rw_io.credits.wait_queue);
+
+ spin_lock_init(&sc->mr_io.all.lock);
+ INIT_LIST_HEAD(&sc->mr_io.all.list);
+ atomic_set(&sc->mr_io.ready.count, 0);
+ init_waitqueue_head(&sc->mr_io.ready.wait_queue);
+ atomic_set(&sc->mr_io.used.count, 0);
+ INIT_WORK(&sc->mr_io.recovery_work, __smbdirect_socket_disabled_work);
+ disable_work_sync(&sc->mr_io.recovery_work);
+ init_waitqueue_head(&sc->mr_io.cleanup.wait_queue);
+}
+
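
smbdirect_socket_init() above points every work item at __smbdirect_socket_disabled_work() and immediately disables it, so a stray queue/flush before the real handler is installed with INIT_WORK() is harmless and warns loudly if it ever runs. A hypothetical userspace model of that "poisoned, disabled by default" pattern (plain function pointers, not the kernel workqueue API):

#include <stdio.h>

struct work { void (*fn)(void); int disabled; };

static void poisoned(void)
{
	fprintf(stderr, "BUG: disabled work ran\n");	/* must never happen */
}

static void init_disabled(struct work *w)
{
	w->fn = poisoned;
	w->disabled = 1;
}

static void install_handler(struct work *w, void (*fn)(void))
{
	w->fn = fn;		/* the later INIT_WORK() + queue step */
	w->disabled = 0;
}

static void real_handler(void)
{
	printf("real handler ran\n");
}

int main(void)
{
	struct work w;

	init_disabled(&w);
	install_handler(&w, real_handler);
	if (!w.disabled)
		w.fn();
	return 0;
}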
struct smbdirect_send_io {
struct smbdirect_socket *socket;
struct ib_cqe cqe;
@@ -136,6 +394,23 @@ struct smbdirect_send_io {
u8 packet[];
};
+struct smbdirect_send_batch {
+ /*
+ * List of smbdirect_send_io messages
+ */
+ struct list_head msg_list;
+ /*
+ * Number of list entries
+ */
+ size_t wr_cnt;
+
+ /*
+ * Possible remote key invalidation state
+ */
+ bool need_invalidate_rkey;
+ u32 remote_key;
+};
+
struct smbdirect_recv_io {
struct smbdirect_socket *socket;
struct ib_cqe cqe;
@@ -158,4 +433,44 @@ struct smbdirect_recv_io {
u8 packet[];
};
+enum smbdirect_mr_state {
+ SMBDIRECT_MR_READY,
+ SMBDIRECT_MR_REGISTERED,
+ SMBDIRECT_MR_INVALIDATED,
+ SMBDIRECT_MR_ERROR
+};
+
+struct smbdirect_mr_io {
+ struct smbdirect_socket *socket;
+ struct ib_cqe cqe;
+
+ struct list_head list;
+
+ enum smbdirect_mr_state state;
+ struct ib_mr *mr;
+ struct sg_table sgt;
+ enum dma_data_direction dir;
+ union {
+ struct ib_reg_wr wr;
+ struct ib_send_wr inv_wr;
+ };
+
+ bool need_invalidate;
+ struct completion invalidate_done;
+};
+
+struct smbdirect_rw_io {
+ struct smbdirect_socket *socket;
+ struct ib_cqe cqe;
+
+ struct list_head list;
+
+ int error;
+ struct completion *completion;
+
+ struct rdma_rw_ctx rdma_ctx;
+ struct sg_table sgt;
+ struct scatterlist sg_list[];
+};
+
#endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__ */
diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c
index 3f04a2977ba8..91a934411134 100644
--- a/fs/smb/server/connection.c
+++ b/fs/smb/server/connection.c
@@ -243,7 +243,7 @@ int ksmbd_conn_write(struct ksmbd_work *work)
int ksmbd_conn_rdma_read(struct ksmbd_conn *conn,
void *buf, unsigned int buflen,
- struct smb2_buffer_desc_v1 *desc,
+ struct smbdirect_buffer_descriptor_v1 *desc,
unsigned int desc_len)
{
int ret = -EINVAL;
@@ -257,7 +257,7 @@ int ksmbd_conn_rdma_read(struct ksmbd_conn *conn,
int ksmbd_conn_rdma_write(struct ksmbd_conn *conn,
void *buf, unsigned int buflen,
- struct smb2_buffer_desc_v1 *desc,
+ struct smbdirect_buffer_descriptor_v1 *desc,
unsigned int desc_len)
{
int ret = -EINVAL;
@@ -504,7 +504,8 @@ void ksmbd_conn_transport_destroy(void)
{
mutex_lock(&init_lock);
ksmbd_tcp_destroy();
- ksmbd_rdma_destroy();
+ ksmbd_rdma_stop_listening();
stop_sessions();
+ ksmbd_rdma_destroy();
mutex_unlock(&init_lock);
}
diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h
index 31dd1caac1e8..07b43634262a 100644
--- a/fs/smb/server/connection.h
+++ b/fs/smb/server/connection.h
@@ -19,6 +19,8 @@
#include "smb_common.h"
#include "ksmbd_work.h"
+struct smbdirect_buffer_descriptor_v1;
+
#define KSMBD_SOCKET_BACKLOG 16
enum {
@@ -46,7 +48,12 @@ struct ksmbd_conn {
struct mutex srv_mutex;
int status;
unsigned int cli_cap;
- __be32 inet_addr;
+ union {
+ __be32 inet_addr;
+#if IS_ENABLED(CONFIG_IPV6)
+ u8 inet6_addr[16];
+#endif
+ };
char *request_buf;
struct ksmbd_transport *transport;
struct nls_table *local_nls;
@@ -128,11 +135,11 @@ struct ksmbd_transport_ops {
unsigned int remote_key);
int (*rdma_read)(struct ksmbd_transport *t,
void *buf, unsigned int len,
- struct smb2_buffer_desc_v1 *desc,
+ struct smbdirect_buffer_descriptor_v1 *desc,
unsigned int desc_len);
int (*rdma_write)(struct ksmbd_transport *t,
void *buf, unsigned int len,
- struct smb2_buffer_desc_v1 *desc,
+ struct smbdirect_buffer_descriptor_v1 *desc,
unsigned int desc_len);
void (*free_transport)(struct ksmbd_transport *kt);
};
@@ -158,11 +165,11 @@ bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c);
int ksmbd_conn_write(struct ksmbd_work *work);
int ksmbd_conn_rdma_read(struct ksmbd_conn *conn,
void *buf, unsigned int buflen,
- struct smb2_buffer_desc_v1 *desc,
+ struct smbdirect_buffer_descriptor_v1 *desc,
unsigned int desc_len);
int ksmbd_conn_rdma_write(struct ksmbd_conn *conn,
void *buf, unsigned int buflen,
- struct smb2_buffer_desc_v1 *desc,
+ struct smbdirect_buffer_descriptor_v1 *desc,
unsigned int desc_len);
void ksmbd_conn_enqueue_request(struct ksmbd_work *work);
void ksmbd_conn_try_dequeue_request(struct ksmbd_work *work);
diff --git a/fs/smb/server/ksmbd_work.c b/fs/smb/server/ksmbd_work.c
index 72b00ca6e455..4a71f46d7020 100644
--- a/fs/smb/server/ksmbd_work.c
+++ b/fs/smb/server/ksmbd_work.c
@@ -78,7 +78,7 @@ int ksmbd_work_pool_init(void)
int ksmbd_workqueue_init(void)
{
- ksmbd_wq = alloc_workqueue("ksmbd-io", 0, 0);
+ ksmbd_wq = alloc_workqueue("ksmbd-io", WQ_PERCPU, 0);
if (!ksmbd_wq)
return -ENOMEM;
return 0;
diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c
index d7a8a580d013..a04d5702820d 100644
--- a/fs/smb/server/oplock.c
+++ b/fs/smb/server/oplock.c
@@ -1102,8 +1102,10 @@ void smb_send_parent_lease_break_noti(struct ksmbd_file *fp,
if (!atomic_inc_not_zero(&opinfo->refcount))
continue;
- if (ksmbd_conn_releasing(opinfo->conn))
+ if (ksmbd_conn_releasing(opinfo->conn)) {
+ opinfo_put(opinfo);
continue;
+ }
oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE, NULL);
opinfo_put(opinfo);
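
The oplock hunks here all add the same fix: once atomic_inc_not_zero() has taken a reference, the early "continue" for a releasing connection must drop it again via opinfo_put(), otherwise the reference leaks. A minimal standalone model of the leak and the fix (illustrative types, not the ksmbd structures):

#include <stdio.h>

struct obj { int refcount; int releasing; };

static int get(struct obj *o)
{
	if (!o->refcount)
		return 0;
	o->refcount++;
	return 1;
}

static void put(struct obj *o)
{
	o->refcount--;
}

static void walk(struct obj *list, int n)
{
	for (int i = 0; i < n; i++) {
		if (!get(&list[i]))
			continue;
		if (list[i].releasing) {
			put(&list[i]);	/* the put this patch adds */
			continue;
		}
		/* ... send the oplock/lease break ... */
		put(&list[i]);
	}
}

int main(void)
{
	struct obj objs[2] = { { 1, 0 }, { 1, 1 } };

	walk(objs, 2);
	printf("refcounts after walk: %d %d\n", objs[0].refcount, objs[1].refcount);
	return 0;
}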
@@ -1139,8 +1141,11 @@ void smb_lazy_parent_lease_break_close(struct ksmbd_file *fp)
if (!atomic_inc_not_zero(&opinfo->refcount))
continue;
- if (ksmbd_conn_releasing(opinfo->conn))
+ if (ksmbd_conn_releasing(opinfo->conn)) {
+ opinfo_put(opinfo);
continue;
+ }
+
oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE, NULL);
opinfo_put(opinfo);
}
@@ -1343,8 +1348,10 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp,
if (!atomic_inc_not_zero(&brk_op->refcount))
continue;
- if (ksmbd_conn_releasing(brk_op->conn))
+ if (ksmbd_conn_releasing(brk_op->conn)) {
+ opinfo_put(brk_op);
continue;
+ }
if (brk_op->is_lease && (brk_op->o_lease->state &
(~(SMB2_LEASE_READ_CACHING_LE |
diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c
index 8c9c49c3a0a4..40420544cc25 100644
--- a/fs/smb/server/server.c
+++ b/fs/smb/server/server.c
@@ -365,6 +365,7 @@ static void server_ctrl_handle_init(struct server_ctrl_struct *ctrl)
return;
}
+ pr_info("running\n");
WRITE_ONCE(server_conf.state, SERVER_STATE_RUNNING);
}
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 0d92ce49aed7..0c069eff80b7 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -23,6 +23,7 @@
#include "asn1.h"
#include "connection.h"
#include "transport_ipc.h"
+#include "../common/smbdirect/smbdirect.h"
#include "transport_rdma.h"
#include "vfs.h"
#include "vfs_cache.h"
@@ -2951,18 +2952,19 @@ int smb2_open(struct ksmbd_work *work)
}
ksmbd_debug(SMB, "converted name = %s\n", name);
- if (strchr(name, ':')) {
- if (!test_share_config_flag(work->tcon->share_conf,
- KSMBD_SHARE_FLAG_STREAMS)) {
- rc = -EBADF;
- goto err_out2;
- }
- rc = parse_stream_name(name, &stream_name, &s_type);
- if (rc < 0)
- goto err_out2;
- }
if (posix_ctxt == false) {
+ if (strchr(name, ':')) {
+ if (!test_share_config_flag(work->tcon->share_conf,
+ KSMBD_SHARE_FLAG_STREAMS)) {
+ rc = -EBADF;
+ goto err_out2;
+ }
+ rc = parse_stream_name(name, &stream_name, &s_type);
+ if (rc < 0)
+ goto err_out2;
+ }
+
rc = ksmbd_validate_filename(name);
if (rc < 0)
goto err_out2;
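
The smb2_open() change above only treats ':' as an alternate-data-stream separator when the open did not carry a POSIX create context, since POSIX clients may use ':' as an ordinary filename character. A tiny standalone sketch of that gating (hypothetical helper, not ksmbd code):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool has_stream_component(const char *name, bool posix_ctxt)
{
	/* POSIX opens never interpret ':' as a stream separator */
	return !posix_ctxt && strchr(name, ':') != NULL;
}

int main(void)
{
	printf("%d\n", has_stream_component("file.txt:stream", false));	/* 1 */
	printf("%d\n", has_stream_component("file:with:colons", true));	/* 0 */
	return 0;
}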
@@ -3443,6 +3445,8 @@ int smb2_open(struct ksmbd_work *work)
fp->attrib_only = !(req->DesiredAccess & ~(FILE_READ_ATTRIBUTES_LE |
FILE_WRITE_ATTRIBUTES_LE | FILE_SYNCHRONIZE_LE));
+ fp->is_posix_ctxt = posix_ctxt;
+
/* fp should be searchable through ksmbd_inode.m_fp_list
* after daccess, saccess, attrib_only, and stream are
* initialized.
@@ -5988,7 +5992,7 @@ static int smb2_rename(struct ksmbd_work *work,
if (IS_ERR(new_name))
return PTR_ERR(new_name);
- if (strchr(new_name, ':')) {
+ if (fp->is_posix_ctxt == false && strchr(new_name, ':')) {
int s_type;
char *xattr_stream_name, *stream_name = NULL;
size_t xattr_stream_size;
@@ -6662,7 +6666,7 @@ out:
}
static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work,
- struct smb2_buffer_desc_v1 *desc,
+ struct smbdirect_buffer_descriptor_v1 *desc,
__le32 Channel,
__le16 ChannelInfoLength)
{
@@ -6698,7 +6702,7 @@ static ssize_t smb2_read_rdma_channel(struct ksmbd_work *work,
int err;
err = ksmbd_conn_rdma_write(work->conn, data_buf, length,
- (struct smb2_buffer_desc_v1 *)
+ (struct smbdirect_buffer_descriptor_v1 *)
((char *)req + le16_to_cpu(req->ReadChannelInfoOffset)),
le16_to_cpu(req->ReadChannelInfoLength));
if (err)
@@ -6758,7 +6762,11 @@ int smb2_read(struct ksmbd_work *work)
if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE ||
req->Channel == SMB2_CHANNEL_RDMA_V1) {
is_rdma_channel = true;
- max_read_size = get_smbd_max_read_write_size();
+ max_read_size = get_smbd_max_read_write_size(work->conn->transport);
+ if (max_read_size == 0) {
+ err = -EINVAL;
+ goto out;
+ }
}
if (is_rdma_channel == true) {
@@ -6769,7 +6777,7 @@ int smb2_read(struct ksmbd_work *work)
goto out;
}
err = smb2_set_remote_key_for_rdma(work,
- (struct smb2_buffer_desc_v1 *)
+ (struct smbdirect_buffer_descriptor_v1 *)
((char *)req + ch_offset),
req->Channel,
req->ReadChannelInfoLength);
@@ -6964,7 +6972,7 @@ static ssize_t smb2_write_rdma_channel(struct ksmbd_work *work,
return -ENOMEM;
ret = ksmbd_conn_rdma_read(work->conn, data_buf, length,
- (struct smb2_buffer_desc_v1 *)
+ (struct smbdirect_buffer_descriptor_v1 *)
((char *)req + le16_to_cpu(req->WriteChannelInfoOffset)),
le16_to_cpu(req->WriteChannelInfoLength));
if (ret < 0) {
@@ -7016,7 +7024,11 @@ int smb2_write(struct ksmbd_work *work)
if (req->Channel == SMB2_CHANNEL_RDMA_V1 ||
req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE) {
is_rdma_channel = true;
- max_write_size = get_smbd_max_read_write_size();
+ max_write_size = get_smbd_max_read_write_size(work->conn->transport);
+ if (max_write_size == 0) {
+ err = -EINVAL;
+ goto out;
+ }
length = le32_to_cpu(req->RemainingBytes);
}
@@ -7029,7 +7041,7 @@ int smb2_write(struct ksmbd_work *work)
goto out;
}
err = smb2_set_remote_key_for_rdma(work,
- (struct smb2_buffer_desc_v1 *)
+ (struct smbdirect_buffer_descriptor_v1 *)
((char *)req + ch_offset),
req->Channel,
req->WriteChannelInfoLength);
diff --git a/fs/smb/server/smb2pdu.h b/fs/smb/server/smb2pdu.h
index 16ae8a10490b..5163d5241b90 100644
--- a/fs/smb/server/smb2pdu.h
+++ b/fs/smb/server/smb2pdu.h
@@ -136,12 +136,6 @@ struct create_posix_rsp {
u8 SidBuffer[44];
} __packed;
-struct smb2_buffer_desc_v1 {
- __le64 offset;
- __le32 token;
- __le32 length;
-} __packed;
-
#define SMB2_0_IOCTL_IS_FSCTL 0x00000001
struct smb_sockaddr_in {
diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
index 8d366db5f605..9e644a0daf1c 100644
--- a/fs/smb/server/transport_rdma.c
+++ b/fs/smb/server/transport_rdma.c
@@ -23,18 +23,24 @@
#include "connection.h"
#include "smb_common.h"
#include "../common/smb2status.h"
+#include "../common/smbdirect/smbdirect.h"
+#include "../common/smbdirect/smbdirect_pdu.h"
+#include "../common/smbdirect/smbdirect_socket.h"
#include "transport_rdma.h"
#define SMB_DIRECT_PORT_IWARP 5445
#define SMB_DIRECT_PORT_INFINIBAND 445
-#define SMB_DIRECT_VERSION_LE cpu_to_le16(0x0100)
+#define SMB_DIRECT_VERSION_LE cpu_to_le16(SMBDIRECT_V1)
-/* SMB_DIRECT negotiation timeout in seconds */
-#define SMB_DIRECT_NEGOTIATE_TIMEOUT 120
+/* SMB_DIRECT negotiation timeout (for the server) in seconds */
+#define SMB_DIRECT_NEGOTIATE_TIMEOUT 5
-#define SMB_DIRECT_MAX_SEND_SGES 6
-#define SMB_DIRECT_MAX_RECV_SGES 1
+/* The interval for sending a keepalive message to the peer in seconds */
+#define SMB_DIRECT_KEEPALIVE_SEND_INTERVAL 120
+
+/* The timeout to wait for a keepalive message from peer in seconds */
+#define SMB_DIRECT_KEEPALIVE_RECV_TIMEOUT 5
/*
* Default maximum number of RDMA read/write outstanding on this connection
@@ -87,131 +93,38 @@ static struct smb_direct_listener {
static struct workqueue_struct *smb_direct_wq;
-enum smb_direct_status {
- SMB_DIRECT_CS_NEW = 0,
- SMB_DIRECT_CS_CONNECTED,
- SMB_DIRECT_CS_DISCONNECTING,
- SMB_DIRECT_CS_DISCONNECTED,
-};
-
struct smb_direct_transport {
struct ksmbd_transport transport;
- enum smb_direct_status status;
- bool full_packet_received;
- wait_queue_head_t wait_status;
-
- struct rdma_cm_id *cm_id;
- struct ib_cq *send_cq;
- struct ib_cq *recv_cq;
- struct ib_pd *pd;
- struct ib_qp *qp;
-
- int max_send_size;
- int max_recv_size;
- int max_fragmented_send_size;
- int max_fragmented_recv_size;
- int max_rdma_rw_size;
-
- spinlock_t reassembly_queue_lock;
- struct list_head reassembly_queue;
- int reassembly_data_length;
- int reassembly_queue_length;
- int first_entry_offset;
- wait_queue_head_t wait_reassembly_queue;
-
- spinlock_t receive_credit_lock;
- int recv_credits;
- int count_avail_recvmsg;
- int recv_credit_max;
- int recv_credit_target;
-
- spinlock_t recvmsg_queue_lock;
- struct list_head recvmsg_queue;
-
- int send_credit_target;
- atomic_t send_credits;
- spinlock_t lock_new_recv_credits;
- int new_recv_credits;
- int max_rw_credits;
- int pages_per_rw_credit;
- atomic_t rw_credits;
-
- wait_queue_head_t wait_send_credits;
- wait_queue_head_t wait_rw_credits;
-
- mempool_t *sendmsg_mempool;
- struct kmem_cache *sendmsg_cache;
- mempool_t *recvmsg_mempool;
- struct kmem_cache *recvmsg_cache;
-
- wait_queue_head_t wait_send_pending;
- atomic_t send_pending;
-
- struct delayed_work post_recv_credits_work;
- struct work_struct send_immediate_work;
- struct work_struct disconnect_work;
-
- bool negotiation_requested;
+ struct smbdirect_socket socket;
};
-#define KSMBD_TRANS(t) ((struct ksmbd_transport *)&((t)->transport))
-#define SMBD_TRANS(t) ((struct smb_direct_transport *)container_of(t, \
+#define KSMBD_TRANS(t) (&(t)->transport)
+#define SMBD_TRANS(t) (container_of(t, \
struct smb_direct_transport, transport))
-enum {
- SMB_DIRECT_MSG_NEGOTIATE_REQ = 0,
- SMB_DIRECT_MSG_DATA_TRANSFER
-};
static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops;
-struct smb_direct_send_ctx {
- struct list_head msg_list;
- int wr_cnt;
- bool need_invalidate_rkey;
- unsigned int remote_key;
-};
-
-struct smb_direct_sendmsg {
- struct smb_direct_transport *transport;
- struct ib_send_wr wr;
- struct list_head list;
- int num_sge;
- struct ib_sge sge[SMB_DIRECT_MAX_SEND_SGES];
- struct ib_cqe cqe;
- u8 packet[];
-};
-
-struct smb_direct_recvmsg {
- struct smb_direct_transport *transport;
- struct list_head list;
- int type;
- struct ib_sge sge;
- struct ib_cqe cqe;
- bool first_segment;
- u8 packet[];
-};
-
-struct smb_direct_rdma_rw_msg {
- struct smb_direct_transport *t;
- struct ib_cqe cqe;
- int status;
- struct completion *completion;
- struct list_head list;
- struct rdma_rw_ctx rw_ctx;
- struct sg_table sgt;
- struct scatterlist sg_list[];
-};
-
void init_smbd_max_io_size(unsigned int sz)
{
sz = clamp_val(sz, SMBD_MIN_IOSIZE, SMBD_MAX_IOSIZE);
smb_direct_max_read_write_size = sz;
}
-unsigned int get_smbd_max_read_write_size(void)
+unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt)
{
- return smb_direct_max_read_write_size;
+ struct smb_direct_transport *t;
+ struct smbdirect_socket *sc;
+ struct smbdirect_socket_parameters *sp;
+
+ if (kt->ops != &ksmbd_smb_direct_transport_ops)
+ return 0;
+
+ t = SMBD_TRANS(kt);
+ sc = &t->socket;
+ sp = &sc->parameters;
+
+ return sp->max_read_write_size;
}
static inline int get_buf_page_count(void *buf, int size)
@@ -220,71 +133,65 @@ static inline int get_buf_page_count(void *buf, int size)
(uintptr_t)buf / PAGE_SIZE;
}
-static void smb_direct_destroy_pools(struct smb_direct_transport *transport);
+static void smb_direct_destroy_pools(struct smbdirect_socket *sc);
static void smb_direct_post_recv_credits(struct work_struct *work);
-static int smb_direct_post_send_data(struct smb_direct_transport *t,
- struct smb_direct_send_ctx *send_ctx,
+static int smb_direct_post_send_data(struct smbdirect_socket *sc,
+ struct smbdirect_send_batch *send_ctx,
struct kvec *iov, int niov,
int remaining_data_length);
-static inline struct smb_direct_transport *
-smb_trans_direct_transfort(struct ksmbd_transport *t)
-{
- return container_of(t, struct smb_direct_transport, transport);
-}
-
static inline void
-*smb_direct_recvmsg_payload(struct smb_direct_recvmsg *recvmsg)
+*smbdirect_recv_io_payload(struct smbdirect_recv_io *recvmsg)
{
return (void *)recvmsg->packet;
}
-static inline bool is_receive_credit_post_required(int receive_credits,
- int avail_recvmsg_count)
-{
- return receive_credits <= (smb_direct_receive_credit_max >> 3) &&
- avail_recvmsg_count >= (receive_credits >> 2);
-}
-
static struct
-smb_direct_recvmsg *get_free_recvmsg(struct smb_direct_transport *t)
+smbdirect_recv_io *get_free_recvmsg(struct smbdirect_socket *sc)
{
- struct smb_direct_recvmsg *recvmsg = NULL;
+ struct smbdirect_recv_io *recvmsg = NULL;
+ unsigned long flags;
- spin_lock(&t->recvmsg_queue_lock);
- if (!list_empty(&t->recvmsg_queue)) {
- recvmsg = list_first_entry(&t->recvmsg_queue,
- struct smb_direct_recvmsg,
+ spin_lock_irqsave(&sc->recv_io.free.lock, flags);
+ if (!list_empty(&sc->recv_io.free.list)) {
+ recvmsg = list_first_entry(&sc->recv_io.free.list,
+ struct smbdirect_recv_io,
list);
list_del(&recvmsg->list);
}
- spin_unlock(&t->recvmsg_queue_lock);
+ spin_unlock_irqrestore(&sc->recv_io.free.lock, flags);
return recvmsg;
}
-static void put_recvmsg(struct smb_direct_transport *t,
- struct smb_direct_recvmsg *recvmsg)
+static void put_recvmsg(struct smbdirect_socket *sc,
+ struct smbdirect_recv_io *recvmsg)
{
+ unsigned long flags;
+
if (likely(recvmsg->sge.length != 0)) {
- ib_dma_unmap_single(t->cm_id->device,
+ ib_dma_unmap_single(sc->ib.dev,
recvmsg->sge.addr,
recvmsg->sge.length,
DMA_FROM_DEVICE);
recvmsg->sge.length = 0;
}
- spin_lock(&t->recvmsg_queue_lock);
- list_add(&recvmsg->list, &t->recvmsg_queue);
- spin_unlock(&t->recvmsg_queue_lock);
+ spin_lock_irqsave(&sc->recv_io.free.lock, flags);
+ list_add(&recvmsg->list, &sc->recv_io.free.list);
+ spin_unlock_irqrestore(&sc->recv_io.free.lock, flags);
+
+ queue_work(sc->workqueue, &sc->recv_io.posted.refill_work);
}
-static void enqueue_reassembly(struct smb_direct_transport *t,
- struct smb_direct_recvmsg *recvmsg,
+static void enqueue_reassembly(struct smbdirect_socket *sc,
+ struct smbdirect_recv_io *recvmsg,
int data_length)
{
- spin_lock(&t->reassembly_queue_lock);
- list_add_tail(&recvmsg->list, &t->reassembly_queue);
- t->reassembly_queue_length++;
+ unsigned long flags;
+
+ spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
+ list_add_tail(&recvmsg->list, &sc->recv_io.reassembly.list);
+ sc->recv_io.reassembly.queue_length++;
/*
* Make sure reassembly_data_length is updated after list and
* reassembly_queue_length are updated. On the dequeue side
@@ -292,85 +199,228 @@ static void enqueue_reassembly(struct smb_direct_transport *t,
* if reassembly_queue_length and list is up to date
*/
virt_wmb();
- t->reassembly_data_length += data_length;
- spin_unlock(&t->reassembly_queue_lock);
+ sc->recv_io.reassembly.data_length += data_length;
+ spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
}
-static struct smb_direct_recvmsg *get_first_reassembly(struct smb_direct_transport *t)
+static struct smbdirect_recv_io *get_first_reassembly(struct smbdirect_socket *sc)
{
- if (!list_empty(&t->reassembly_queue))
- return list_first_entry(&t->reassembly_queue,
- struct smb_direct_recvmsg, list);
+ if (!list_empty(&sc->recv_io.reassembly.list))
+ return list_first_entry(&sc->recv_io.reassembly.list,
+ struct smbdirect_recv_io, list);
else
return NULL;
}
+static void smb_direct_disconnect_wake_up_all(struct smbdirect_socket *sc)
+{
+ /*
+ * Wake up all waiters in all wait queues
+ * in order to notice the broken connection.
+ */
+ wake_up_all(&sc->status_wait);
+ wake_up_all(&sc->send_io.credits.wait_queue);
+ wake_up_all(&sc->send_io.pending.zero_wait_queue);
+ wake_up_all(&sc->recv_io.reassembly.wait_queue);
+ wake_up_all(&sc->rw_io.credits.wait_queue);
+}
+
static void smb_direct_disconnect_rdma_work(struct work_struct *work)
{
- struct smb_direct_transport *t =
- container_of(work, struct smb_direct_transport,
- disconnect_work);
+ struct smbdirect_socket *sc =
+ container_of(work, struct smbdirect_socket, disconnect_work);
+
+ /*
+	 * Make sure this and the other work items are not queued again,
+	 * but here we must not block, so we avoid
+ * disable[_delayed]_work_sync()
+ */
+ disable_work(&sc->disconnect_work);
+ disable_work(&sc->recv_io.posted.refill_work);
+ disable_delayed_work(&sc->idle.timer_work);
+ disable_work(&sc->idle.immediate_work);
+
+ if (sc->first_error == 0)
+ sc->first_error = -ECONNABORTED;
+
+ switch (sc->status) {
+ case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
+ case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
+ case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
+ case SMBDIRECT_SOCKET_CONNECTED:
+ case SMBDIRECT_SOCKET_ERROR:
+ sc->status = SMBDIRECT_SOCKET_DISCONNECTING;
+ rdma_disconnect(sc->rdma.cm_id);
+ break;
+
+ case SMBDIRECT_SOCKET_CREATED:
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
+ /*
+ * rdma_accept() never reached
+ * RDMA_CM_EVENT_ESTABLISHED
+ */
+ sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
+ break;
- if (t->status == SMB_DIRECT_CS_CONNECTED) {
- t->status = SMB_DIRECT_CS_DISCONNECTING;
- rdma_disconnect(t->cm_id);
+ case SMBDIRECT_SOCKET_DISCONNECTING:
+ case SMBDIRECT_SOCKET_DISCONNECTED:
+ case SMBDIRECT_SOCKET_DESTROYED:
+ break;
}
+
+ /*
+ * Wake up all waiters in all wait queues
+ * in order to notice the broken connection.
+ */
+ smb_direct_disconnect_wake_up_all(sc);
}
static void
-smb_direct_disconnect_rdma_connection(struct smb_direct_transport *t)
+smb_direct_disconnect_rdma_connection(struct smbdirect_socket *sc)
{
- if (t->status == SMB_DIRECT_CS_CONNECTED)
- queue_work(smb_direct_wq, &t->disconnect_work);
+ /*
+	 * Make sure work items other than disconnect_work are
+	 * not queued again, but here we must not block, so we avoid
+ * disable[_delayed]_work_sync()
+ */
+ disable_work(&sc->recv_io.posted.refill_work);
+ disable_work(&sc->idle.immediate_work);
+ disable_delayed_work(&sc->idle.timer_work);
+
+ if (sc->first_error == 0)
+ sc->first_error = -ECONNABORTED;
+
+ switch (sc->status) {
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
+ case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
+ case SMBDIRECT_SOCKET_ERROR:
+ case SMBDIRECT_SOCKET_DISCONNECTING:
+ case SMBDIRECT_SOCKET_DISCONNECTED:
+ case SMBDIRECT_SOCKET_DESTROYED:
+ /*
+ * Keep the current error status
+ */
+ break;
+
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
+ sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED;
+ break;
+
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
+ sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED;
+ break;
+
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
+ sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED;
+ break;
+
+ case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
+ case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
+ sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
+ break;
+
+ case SMBDIRECT_SOCKET_CREATED:
+ case SMBDIRECT_SOCKET_CONNECTED:
+ sc->status = SMBDIRECT_SOCKET_ERROR;
+ break;
+ }
+
+ /*
+ * Wake up all waiters in all wait queues
+ * in order to notice the broken connection.
+ */
+ smb_direct_disconnect_wake_up_all(sc);
+
+ queue_work(sc->workqueue, &sc->disconnect_work);
}
static void smb_direct_send_immediate_work(struct work_struct *work)
{
- struct smb_direct_transport *t = container_of(work,
- struct smb_direct_transport, send_immediate_work);
+ struct smbdirect_socket *sc =
+ container_of(work, struct smbdirect_socket, idle.immediate_work);
- if (t->status != SMB_DIRECT_CS_CONNECTED)
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
return;
- smb_direct_post_send_data(t, NULL, NULL, 0, 0);
+ smb_direct_post_send_data(sc, NULL, NULL, 0, 0);
+}
+
+static void smb_direct_idle_connection_timer(struct work_struct *work)
+{
+ struct smbdirect_socket *sc =
+ container_of(work, struct smbdirect_socket, idle.timer_work.work);
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+
+ if (sc->idle.keepalive != SMBDIRECT_KEEPALIVE_NONE) {
+ smb_direct_disconnect_rdma_connection(sc);
+ return;
+ }
+
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
+ return;
+
+ /*
+ * Now use the keepalive timeout (instead of keepalive interval)
+ * in order to wait for a response
+ */
+ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
+ mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
+ msecs_to_jiffies(sp->keepalive_timeout_msec));
+ queue_work(sc->workqueue, &sc->idle.immediate_work);
}
static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id)
{
struct smb_direct_transport *t;
+ struct smbdirect_socket *sc;
+ struct smbdirect_socket_parameters *sp;
struct ksmbd_conn *conn;
t = kzalloc(sizeof(*t), KSMBD_DEFAULT_GFP);
if (!t)
return NULL;
+ sc = &t->socket;
+ smbdirect_socket_init(sc);
+ sp = &sc->parameters;
- t->cm_id = cm_id;
- cm_id->context = t;
-
- t->status = SMB_DIRECT_CS_NEW;
- init_waitqueue_head(&t->wait_status);
+ sc->workqueue = smb_direct_wq;
- spin_lock_init(&t->reassembly_queue_lock);
- INIT_LIST_HEAD(&t->reassembly_queue);
- t->reassembly_data_length = 0;
- t->reassembly_queue_length = 0;
- init_waitqueue_head(&t->wait_reassembly_queue);
- init_waitqueue_head(&t->wait_send_credits);
- init_waitqueue_head(&t->wait_rw_credits);
+ INIT_WORK(&sc->disconnect_work, smb_direct_disconnect_rdma_work);
- spin_lock_init(&t->receive_credit_lock);
- spin_lock_init(&t->recvmsg_queue_lock);
- INIT_LIST_HEAD(&t->recvmsg_queue);
+ sp->negotiate_timeout_msec = SMB_DIRECT_NEGOTIATE_TIMEOUT * 1000;
+ sp->initiator_depth = SMB_DIRECT_CM_INITIATOR_DEPTH;
+ sp->responder_resources = 1;
+ sp->recv_credit_max = smb_direct_receive_credit_max;
+ sp->send_credit_target = smb_direct_send_credit_target;
+ sp->max_send_size = smb_direct_max_send_size;
+ sp->max_fragmented_recv_size = smb_direct_max_fragmented_recv_size;
+ sp->max_recv_size = smb_direct_max_receive_size;
+ sp->max_read_write_size = smb_direct_max_read_write_size;
+ sp->keepalive_interval_msec = SMB_DIRECT_KEEPALIVE_SEND_INTERVAL * 1000;
+ sp->keepalive_timeout_msec = SMB_DIRECT_KEEPALIVE_RECV_TIMEOUT * 1000;
- init_waitqueue_head(&t->wait_send_pending);
- atomic_set(&t->send_pending, 0);
+ sc->rdma.cm_id = cm_id;
+ cm_id->context = sc;
- spin_lock_init(&t->lock_new_recv_credits);
+ sc->ib.dev = sc->rdma.cm_id->device;
- INIT_DELAYED_WORK(&t->post_recv_credits_work,
- smb_direct_post_recv_credits);
- INIT_WORK(&t->send_immediate_work, smb_direct_send_immediate_work);
- INIT_WORK(&t->disconnect_work, smb_direct_disconnect_rdma_work);
+ INIT_WORK(&sc->recv_io.posted.refill_work,
+ smb_direct_post_recv_credits);
+ INIT_WORK(&sc->idle.immediate_work, smb_direct_send_immediate_work);
+ INIT_DELAYED_WORK(&sc->idle.timer_work, smb_direct_idle_connection_timer);
conn = ksmbd_conn_alloc();
if (!conn)
@@ -391,89 +441,104 @@ static void smb_direct_free_transport(struct ksmbd_transport *kt)
static void free_transport(struct smb_direct_transport *t)
{
- struct smb_direct_recvmsg *recvmsg;
+ struct smbdirect_socket *sc = &t->socket;
+ struct smbdirect_recv_io *recvmsg;
- wake_up_interruptible(&t->wait_send_credits);
+ disable_work_sync(&sc->disconnect_work);
+ if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING) {
+ smb_direct_disconnect_rdma_work(&sc->disconnect_work);
+ wait_event_interruptible(sc->status_wait,
+ sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
+ }
- ksmbd_debug(RDMA, "wait for all send posted to IB to finish\n");
- wait_event(t->wait_send_pending,
- atomic_read(&t->send_pending) == 0);
+ /*
+ * Wake up all waiters in all wait queues
+ * in order to notice the broken connection.
+ *
+ * Most likely this was already called via
+ * smb_direct_disconnect_rdma_work(), but call it again...
+ */
+ smb_direct_disconnect_wake_up_all(sc);
- cancel_work_sync(&t->disconnect_work);
- cancel_delayed_work_sync(&t->post_recv_credits_work);
- cancel_work_sync(&t->send_immediate_work);
+ disable_work_sync(&sc->recv_io.posted.refill_work);
+ disable_delayed_work_sync(&sc->idle.timer_work);
+ disable_work_sync(&sc->idle.immediate_work);
- if (t->qp) {
- ib_drain_qp(t->qp);
- ib_mr_pool_destroy(t->qp, &t->qp->rdma_mrs);
- t->qp = NULL;
- rdma_destroy_qp(t->cm_id);
+ if (sc->ib.qp) {
+ ib_drain_qp(sc->ib.qp);
+ ib_mr_pool_destroy(sc->ib.qp, &sc->ib.qp->rdma_mrs);
+ sc->ib.qp = NULL;
+ rdma_destroy_qp(sc->rdma.cm_id);
}
ksmbd_debug(RDMA, "drain the reassembly queue\n");
do {
- spin_lock(&t->reassembly_queue_lock);
- recvmsg = get_first_reassembly(t);
+ unsigned long flags;
+
+ spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
+ recvmsg = get_first_reassembly(sc);
if (recvmsg) {
list_del(&recvmsg->list);
- spin_unlock(&t->reassembly_queue_lock);
- put_recvmsg(t, recvmsg);
+ spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
+ put_recvmsg(sc, recvmsg);
} else {
- spin_unlock(&t->reassembly_queue_lock);
+ spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
}
} while (recvmsg);
- t->reassembly_data_length = 0;
-
- if (t->send_cq)
- ib_free_cq(t->send_cq);
- if (t->recv_cq)
- ib_free_cq(t->recv_cq);
- if (t->pd)
- ib_dealloc_pd(t->pd);
- if (t->cm_id)
- rdma_destroy_id(t->cm_id);
-
- smb_direct_destroy_pools(t);
+ sc->recv_io.reassembly.data_length = 0;
+
+ if (sc->ib.send_cq)
+ ib_free_cq(sc->ib.send_cq);
+ if (sc->ib.recv_cq)
+ ib_free_cq(sc->ib.recv_cq);
+ if (sc->ib.pd)
+ ib_dealloc_pd(sc->ib.pd);
+ if (sc->rdma.cm_id)
+ rdma_destroy_id(sc->rdma.cm_id);
+
+ smb_direct_destroy_pools(sc);
ksmbd_conn_free(KSMBD_TRANS(t)->conn);
}
-static struct smb_direct_sendmsg
-*smb_direct_alloc_sendmsg(struct smb_direct_transport *t)
+static struct smbdirect_send_io
+*smb_direct_alloc_sendmsg(struct smbdirect_socket *sc)
{
- struct smb_direct_sendmsg *msg;
+ struct smbdirect_send_io *msg;
- msg = mempool_alloc(t->sendmsg_mempool, KSMBD_DEFAULT_GFP);
+ msg = mempool_alloc(sc->send_io.mem.pool, KSMBD_DEFAULT_GFP);
if (!msg)
return ERR_PTR(-ENOMEM);
- msg->transport = t;
- INIT_LIST_HEAD(&msg->list);
+ msg->socket = sc;
+ INIT_LIST_HEAD(&msg->sibling_list);
msg->num_sge = 0;
return msg;
}
-static void smb_direct_free_sendmsg(struct smb_direct_transport *t,
- struct smb_direct_sendmsg *msg)
+static void smb_direct_free_sendmsg(struct smbdirect_socket *sc,
+ struct smbdirect_send_io *msg)
{
int i;
if (msg->num_sge > 0) {
- ib_dma_unmap_single(t->cm_id->device,
+ ib_dma_unmap_single(sc->ib.dev,
msg->sge[0].addr, msg->sge[0].length,
DMA_TO_DEVICE);
for (i = 1; i < msg->num_sge; i++)
- ib_dma_unmap_page(t->cm_id->device,
+ ib_dma_unmap_page(sc->ib.dev,
msg->sge[i].addr, msg->sge[i].length,
DMA_TO_DEVICE);
}
- mempool_free(msg, t->sendmsg_mempool);
+ mempool_free(msg, sc->send_io.mem.pool);
}
-static int smb_direct_check_recvmsg(struct smb_direct_recvmsg *recvmsg)
+static int smb_direct_check_recvmsg(struct smbdirect_recv_io *recvmsg)
{
- switch (recvmsg->type) {
- case SMB_DIRECT_MSG_DATA_TRANSFER: {
- struct smb_direct_data_transfer *req =
- (struct smb_direct_data_transfer *)recvmsg->packet;
+ struct smbdirect_socket *sc = recvmsg->socket;
+
+ switch (sc->recv_io.expected) {
+ case SMBDIRECT_EXPECT_DATA_TRANSFER: {
+ struct smbdirect_data_transfer *req =
+ (struct smbdirect_data_transfer *)recvmsg->packet;
struct smb2_hdr *hdr = (struct smb2_hdr *)(recvmsg->packet
+ le32_to_cpu(req->data_offset));
ksmbd_debug(RDMA,
@@ -482,11 +547,11 @@ static int smb_direct_check_recvmsg(struct smb_direct_recvmsg *recvmsg)
le16_to_cpu(req->credits_requested),
req->data_length, req->remaining_data_length,
hdr->ProtocolId, hdr->Command);
- break;
+ return 0;
}
- case SMB_DIRECT_MSG_NEGOTIATE_REQ: {
- struct smb_direct_negotiate_req *req =
- (struct smb_direct_negotiate_req *)recvmsg->packet;
+ case SMBDIRECT_EXPECT_NEGOTIATE_REQ: {
+ struct smbdirect_negotiate_req *req =
+ (struct smbdirect_negotiate_req *)recvmsg->packet;
ksmbd_debug(RDMA,
"MinVersion: %u, MaxVersion: %u, CreditRequested: %u, MaxSendSize: %u, MaxRecvSize: %u, MaxFragmentedSize: %u\n",
le16_to_cpu(req->min_version),
@@ -504,29 +569,34 @@ static int smb_direct_check_recvmsg(struct smb_direct_recvmsg *recvmsg)
128 * 1024)
return -ECONNABORTED;
- break;
+ return 0;
}
- default:
- return -EINVAL;
+ case SMBDIRECT_EXPECT_NEGOTIATE_REP:
+ /* client only */
+ break;
}
- return 0;
+
+ /* This is an internal error */
+ return -EINVAL;
}
static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
- struct smb_direct_recvmsg *recvmsg;
- struct smb_direct_transport *t;
+ struct smbdirect_recv_io *recvmsg;
+ struct smbdirect_socket *sc;
+ struct smbdirect_socket_parameters *sp;
- recvmsg = container_of(wc->wr_cqe, struct smb_direct_recvmsg, cqe);
- t = recvmsg->transport;
+ recvmsg = container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe);
+ sc = recvmsg->socket;
+ sp = &sc->parameters;
if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) {
- put_recvmsg(t, recvmsg);
+ put_recvmsg(sc, recvmsg);
if (wc->status != IB_WC_WR_FLUSH_ERR) {
pr_err("Recv error. status='%s (%d)' opcode=%d\n",
ib_wc_status_msg(wc->status), wc->status,
wc->opcode);
- smb_direct_disconnect_rdma_connection(t);
+ smb_direct_disconnect_rdma_connection(sc);
}
return;
}
@@ -538,108 +608,128 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
ib_dma_sync_single_for_cpu(wc->qp->device, recvmsg->sge.addr,
recvmsg->sge.length, DMA_FROM_DEVICE);
- switch (recvmsg->type) {
- case SMB_DIRECT_MSG_NEGOTIATE_REQ:
- if (wc->byte_len < sizeof(struct smb_direct_negotiate_req)) {
- put_recvmsg(t, recvmsg);
- smb_direct_disconnect_rdma_connection(t);
+ /*
+ * Reset timer to the keepalive interval in
+ * order to trigger our next keepalive message.
+ */
+ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
+ mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
+ msecs_to_jiffies(sp->keepalive_interval_msec));
+
+ switch (sc->recv_io.expected) {
+ case SMBDIRECT_EXPECT_NEGOTIATE_REQ:
+ if (wc->byte_len < sizeof(struct smbdirect_negotiate_req)) {
+ put_recvmsg(sc, recvmsg);
+ smb_direct_disconnect_rdma_connection(sc);
return;
}
- t->negotiation_requested = true;
- t->full_packet_received = true;
- t->status = SMB_DIRECT_CS_CONNECTED;
- enqueue_reassembly(t, recvmsg, 0);
- wake_up_interruptible(&t->wait_status);
+ sc->recv_io.reassembly.full_packet_received = true;
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED);
+ sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING;
+ enqueue_reassembly(sc, recvmsg, 0);
+ wake_up(&sc->status_wait);
return;
- case SMB_DIRECT_MSG_DATA_TRANSFER: {
- struct smb_direct_data_transfer *data_transfer =
- (struct smb_direct_data_transfer *)recvmsg->packet;
- unsigned int data_length;
- int avail_recvmsg_count, receive_credits;
+ case SMBDIRECT_EXPECT_DATA_TRANSFER: {
+ struct smbdirect_data_transfer *data_transfer =
+ (struct smbdirect_data_transfer *)recvmsg->packet;
+ u32 remaining_data_length, data_offset, data_length;
+ u16 old_recv_credit_target;
if (wc->byte_len <
- offsetof(struct smb_direct_data_transfer, padding)) {
- put_recvmsg(t, recvmsg);
- smb_direct_disconnect_rdma_connection(t);
+ offsetof(struct smbdirect_data_transfer, padding)) {
+ put_recvmsg(sc, recvmsg);
+ smb_direct_disconnect_rdma_connection(sc);
return;
}
+ remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length);
data_length = le32_to_cpu(data_transfer->data_length);
- if (data_length) {
- if (wc->byte_len < sizeof(struct smb_direct_data_transfer) +
- (u64)data_length) {
- put_recvmsg(t, recvmsg);
- smb_direct_disconnect_rdma_connection(t);
- return;
- }
+ data_offset = le32_to_cpu(data_transfer->data_offset);
+ if (wc->byte_len < data_offset ||
+ wc->byte_len < (u64)data_offset + data_length) {
+ put_recvmsg(sc, recvmsg);
+ smb_direct_disconnect_rdma_connection(sc);
+ return;
+ }
+ if (remaining_data_length > sp->max_fragmented_recv_size ||
+ data_length > sp->max_fragmented_recv_size ||
+ (u64)remaining_data_length + (u64)data_length >
+ (u64)sp->max_fragmented_recv_size) {
+ put_recvmsg(sc, recvmsg);
+ smb_direct_disconnect_rdma_connection(sc);
+ return;
+ }
- if (t->full_packet_received)
+ if (data_length) {
+ if (sc->recv_io.reassembly.full_packet_received)
recvmsg->first_segment = true;
if (le32_to_cpu(data_transfer->remaining_data_length))
- t->full_packet_received = false;
+ sc->recv_io.reassembly.full_packet_received = false;
else
- t->full_packet_received = true;
-
- spin_lock(&t->receive_credit_lock);
- receive_credits = --(t->recv_credits);
- avail_recvmsg_count = t->count_avail_recvmsg;
- spin_unlock(&t->receive_credit_lock);
- } else {
- spin_lock(&t->receive_credit_lock);
- receive_credits = --(t->recv_credits);
- avail_recvmsg_count = ++(t->count_avail_recvmsg);
- spin_unlock(&t->receive_credit_lock);
+ sc->recv_io.reassembly.full_packet_received = true;
}
- t->recv_credit_target =
+ atomic_dec(&sc->recv_io.posted.count);
+ atomic_dec(&sc->recv_io.credits.count);
+
+ old_recv_credit_target = sc->recv_io.credits.target;
+ sc->recv_io.credits.target =
le16_to_cpu(data_transfer->credits_requested);
+ sc->recv_io.credits.target =
+ min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max);
+ sc->recv_io.credits.target =
+ max_t(u16, sc->recv_io.credits.target, 1);
atomic_add(le16_to_cpu(data_transfer->credits_granted),
- &t->send_credits);
+ &sc->send_io.credits.count);
if (le16_to_cpu(data_transfer->flags) &
- SMB_DIRECT_RESPONSE_REQUESTED)
- queue_work(smb_direct_wq, &t->send_immediate_work);
-
- if (atomic_read(&t->send_credits) > 0)
- wake_up_interruptible(&t->wait_send_credits);
+ SMBDIRECT_FLAG_RESPONSE_REQUESTED)
+ queue_work(sc->workqueue, &sc->idle.immediate_work);
- if (is_receive_credit_post_required(receive_credits, avail_recvmsg_count))
- mod_delayed_work(smb_direct_wq,
- &t->post_recv_credits_work, 0);
+ if (atomic_read(&sc->send_io.credits.count) > 0)
+ wake_up(&sc->send_io.credits.wait_queue);
if (data_length) {
- enqueue_reassembly(t, recvmsg, (int)data_length);
- wake_up_interruptible(&t->wait_reassembly_queue);
+ if (sc->recv_io.credits.target > old_recv_credit_target)
+ queue_work(sc->workqueue, &sc->recv_io.posted.refill_work);
+
+ enqueue_reassembly(sc, recvmsg, (int)data_length);
+ wake_up(&sc->recv_io.reassembly.wait_queue);
} else
- put_recvmsg(t, recvmsg);
+ put_recvmsg(sc, recvmsg);
return;
}
+ case SMBDIRECT_EXPECT_NEGOTIATE_REP:
+ /* client only */
+ break;
}
/*
* This is an internal error!
*/
- WARN_ON_ONCE(recvmsg->type != SMB_DIRECT_MSG_DATA_TRANSFER);
- put_recvmsg(t, recvmsg);
- smb_direct_disconnect_rdma_connection(t);
+ WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_DATA_TRANSFER);
+ put_recvmsg(sc, recvmsg);
+ smb_direct_disconnect_rdma_connection(sc);
}
-static int smb_direct_post_recv(struct smb_direct_transport *t,
- struct smb_direct_recvmsg *recvmsg)
+static int smb_direct_post_recv(struct smbdirect_socket *sc,
+ struct smbdirect_recv_io *recvmsg)
{
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
struct ib_recv_wr wr;
int ret;
- recvmsg->sge.addr = ib_dma_map_single(t->cm_id->device,
- recvmsg->packet, t->max_recv_size,
+ recvmsg->sge.addr = ib_dma_map_single(sc->ib.dev,
+ recvmsg->packet,
+ sp->max_recv_size,
DMA_FROM_DEVICE);
- ret = ib_dma_mapping_error(t->cm_id->device, recvmsg->sge.addr);
+ ret = ib_dma_mapping_error(sc->ib.dev, recvmsg->sge.addr);
if (ret)
return ret;
- recvmsg->sge.length = t->max_recv_size;
- recvmsg->sge.lkey = t->pd->local_dma_lkey;
+ recvmsg->sge.length = sp->max_recv_size;
+ recvmsg->sge.lkey = sc->ib.pd->local_dma_lkey;
recvmsg->cqe.done = recv_done;
wr.wr_cqe = &recvmsg->cqe;
@@ -647,14 +737,14 @@ static int smb_direct_post_recv(struct smb_direct_transport *t,
wr.sg_list = &recvmsg->sge;
wr.num_sge = 1;
- ret = ib_post_recv(t->qp, &wr, NULL);
+ ret = ib_post_recv(sc->ib.qp, &wr, NULL);
if (ret) {
pr_err("Can't post recv: %d\n", ret);
- ib_dma_unmap_single(t->cm_id->device,
+ ib_dma_unmap_single(sc->ib.dev,
recvmsg->sge.addr, recvmsg->sge.length,
DMA_FROM_DEVICE);
recvmsg->sge.length = 0;
- smb_direct_disconnect_rdma_connection(t);
+ smb_direct_disconnect_rdma_connection(sc);
return ret;
}
return ret;
@@ -663,15 +753,16 @@ static int smb_direct_post_recv(struct smb_direct_transport *t,
static int smb_direct_read(struct ksmbd_transport *t, char *buf,
unsigned int size, int unused)
{
- struct smb_direct_recvmsg *recvmsg;
- struct smb_direct_data_transfer *data_transfer;
+ struct smbdirect_recv_io *recvmsg;
+ struct smbdirect_data_transfer *data_transfer;
int to_copy, to_read, data_read, offset;
u32 data_length, remaining_data_length, data_offset;
int rc;
- struct smb_direct_transport *st = smb_trans_direct_transfort(t);
+ struct smb_direct_transport *st = SMBD_TRANS(t);
+ struct smbdirect_socket *sc = &st->socket;
again:
- if (st->status != SMB_DIRECT_CS_CONNECTED) {
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
pr_err("disconnected\n");
return -ENOTCONN;
}
@@ -681,9 +772,10 @@ again:
* the only one reading from the front of the queue. The transport
* may add more entries to the back of the queue at the same time
*/
- if (st->reassembly_data_length >= size) {
+ if (sc->recv_io.reassembly.data_length >= size) {
int queue_length;
int queue_removed = 0;
+ unsigned long flags;
/*
* Need to make sure reassembly_data_length is read before
@@ -693,13 +785,13 @@ again:
* updated in SOFTIRQ as more data is received
*/
virt_rmb();
- queue_length = st->reassembly_queue_length;
+ queue_length = sc->recv_io.reassembly.queue_length;
data_read = 0;
to_read = size;
- offset = st->first_entry_offset;
+ offset = sc->recv_io.reassembly.first_entry_offset;
while (data_read < size) {
- recvmsg = get_first_reassembly(st);
- data_transfer = smb_direct_recvmsg_payload(recvmsg);
+ recvmsg = get_first_reassembly(sc);
+ data_transfer = smbdirect_recv_io_payload(recvmsg);
data_length = le32_to_cpu(data_transfer->data_length);
remaining_data_length =
le32_to_cpu(data_transfer->remaining_data_length);
@@ -739,12 +831,12 @@ again:
if (queue_length) {
list_del(&recvmsg->list);
} else {
- spin_lock_irq(&st->reassembly_queue_lock);
+ spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
list_del(&recvmsg->list);
- spin_unlock_irq(&st->reassembly_queue_lock);
+ spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
}
queue_removed++;
- put_recvmsg(st, recvmsg);
+ put_recvmsg(sc, recvmsg);
offset = 0;
} else {
offset += to_copy;
@@ -754,34 +846,24 @@ again:
data_read += to_copy;
}
- spin_lock_irq(&st->reassembly_queue_lock);
- st->reassembly_data_length -= data_read;
- st->reassembly_queue_length -= queue_removed;
- spin_unlock_irq(&st->reassembly_queue_lock);
-
- spin_lock(&st->receive_credit_lock);
- st->count_avail_recvmsg += queue_removed;
- if (is_receive_credit_post_required(st->recv_credits, st->count_avail_recvmsg)) {
- spin_unlock(&st->receive_credit_lock);
- mod_delayed_work(smb_direct_wq,
- &st->post_recv_credits_work, 0);
- } else {
- spin_unlock(&st->receive_credit_lock);
- }
+ spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
+ sc->recv_io.reassembly.data_length -= data_read;
+ sc->recv_io.reassembly.queue_length -= queue_removed;
+ spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
- st->first_entry_offset = offset;
+ sc->recv_io.reassembly.first_entry_offset = offset;
ksmbd_debug(RDMA,
"returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n",
- data_read, st->reassembly_data_length,
- st->first_entry_offset);
+ data_read, sc->recv_io.reassembly.data_length,
+ sc->recv_io.reassembly.first_entry_offset);
read_rfc1002_done:
return data_read;
}
ksmbd_debug(RDMA, "wait_event on more data\n");
- rc = wait_event_interruptible(st->wait_reassembly_queue,
- st->reassembly_data_length >= size ||
- st->status != SMB_DIRECT_CS_CONNECTED);
+ rc = wait_event_interruptible(sc->recv_io.reassembly.wait_queue,
+ sc->recv_io.reassembly.data_length >= size ||
+ sc->status != SMBDIRECT_SOCKET_CONNECTED);
if (rc)
return -EINTR;
@@ -790,56 +872,44 @@ read_rfc1002_done:
static void smb_direct_post_recv_credits(struct work_struct *work)
{
- struct smb_direct_transport *t = container_of(work,
- struct smb_direct_transport, post_recv_credits_work.work);
- struct smb_direct_recvmsg *recvmsg;
- int receive_credits, credits = 0;
+ struct smbdirect_socket *sc =
+ container_of(work, struct smbdirect_socket, recv_io.posted.refill_work);
+ struct smbdirect_recv_io *recvmsg;
+ int credits = 0;
int ret;
- spin_lock(&t->receive_credit_lock);
- receive_credits = t->recv_credits;
- spin_unlock(&t->receive_credit_lock);
-
- if (receive_credits < t->recv_credit_target) {
+ if (atomic_read(&sc->recv_io.credits.count) < sc->recv_io.credits.target) {
while (true) {
- recvmsg = get_free_recvmsg(t);
+ recvmsg = get_free_recvmsg(sc);
if (!recvmsg)
break;
- recvmsg->type = SMB_DIRECT_MSG_DATA_TRANSFER;
recvmsg->first_segment = false;
- ret = smb_direct_post_recv(t, recvmsg);
+ ret = smb_direct_post_recv(sc, recvmsg);
if (ret) {
pr_err("Can't post recv: %d\n", ret);
- put_recvmsg(t, recvmsg);
+ put_recvmsg(sc, recvmsg);
break;
}
credits++;
+
+ atomic_inc(&sc->recv_io.posted.count);
}
}
- spin_lock(&t->receive_credit_lock);
- t->recv_credits += credits;
- t->count_avail_recvmsg -= credits;
- spin_unlock(&t->receive_credit_lock);
-
- spin_lock(&t->lock_new_recv_credits);
- t->new_recv_credits += credits;
- spin_unlock(&t->lock_new_recv_credits);
-
if (credits)
- queue_work(smb_direct_wq, &t->send_immediate_work);
+ queue_work(sc->workqueue, &sc->idle.immediate_work);
}
static void send_done(struct ib_cq *cq, struct ib_wc *wc)
{
- struct smb_direct_sendmsg *sendmsg, *sibling;
- struct smb_direct_transport *t;
+ struct smbdirect_send_io *sendmsg, *sibling;
+ struct smbdirect_socket *sc;
struct list_head *pos, *prev, *end;
- sendmsg = container_of(wc->wr_cqe, struct smb_direct_sendmsg, cqe);
- t = sendmsg->transport;
+ sendmsg = container_of(wc->wr_cqe, struct smbdirect_send_io, cqe);
+ sc = sendmsg->socket;
ksmbd_debug(RDMA, "Send completed. status='%s (%d)', opcode=%d\n",
ib_wc_status_msg(wc->status), wc->status,
@@ -849,55 +919,78 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)
pr_err("Send error. status='%s (%d)', opcode=%d\n",
ib_wc_status_msg(wc->status), wc->status,
wc->opcode);
- smb_direct_disconnect_rdma_connection(t);
+ smb_direct_disconnect_rdma_connection(sc);
}
- if (atomic_dec_and_test(&t->send_pending))
- wake_up(&t->wait_send_pending);
+ if (atomic_dec_and_test(&sc->send_io.pending.count))
+ wake_up(&sc->send_io.pending.zero_wait_queue);
/* iterate and free the list of messages in reverse. the list's head
* is invalid.
*/
- for (pos = &sendmsg->list, prev = pos->prev, end = sendmsg->list.next;
+ for (pos = &sendmsg->sibling_list, prev = pos->prev, end = sendmsg->sibling_list.next;
prev != end; pos = prev, prev = prev->prev) {
- sibling = container_of(pos, struct smb_direct_sendmsg, list);
- smb_direct_free_sendmsg(t, sibling);
+ sibling = container_of(pos, struct smbdirect_send_io, sibling_list);
+ smb_direct_free_sendmsg(sc, sibling);
}
- sibling = container_of(pos, struct smb_direct_sendmsg, list);
- smb_direct_free_sendmsg(t, sibling);
+ sibling = container_of(pos, struct smbdirect_send_io, sibling_list);
+ smb_direct_free_sendmsg(sc, sibling);
}
-static int manage_credits_prior_sending(struct smb_direct_transport *t)
+static int manage_credits_prior_sending(struct smbdirect_socket *sc)
{
int new_credits;
- spin_lock(&t->lock_new_recv_credits);
- new_credits = t->new_recv_credits;
- t->new_recv_credits = 0;
- spin_unlock(&t->lock_new_recv_credits);
+ if (atomic_read(&sc->recv_io.credits.count) >= sc->recv_io.credits.target)
+ return 0;
+
+ new_credits = atomic_read(&sc->recv_io.posted.count);
+ if (new_credits == 0)
+ return 0;
+ new_credits -= atomic_read(&sc->recv_io.credits.count);
+ if (new_credits <= 0)
+ return 0;
+
+ atomic_add(new_credits, &sc->recv_io.credits.count);
return new_credits;
}
-static int smb_direct_post_send(struct smb_direct_transport *t,
+static int manage_keep_alive_before_sending(struct smbdirect_socket *sc)
+{
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+
+ if (sc->idle.keepalive == SMBDIRECT_KEEPALIVE_PENDING) {
+ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_SENT;
+ /*
+ * Now use the keepalive timeout (instead of keepalive interval)
+ * in order to wait for a response
+ */
+ mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
+ msecs_to_jiffies(sp->keepalive_timeout_msec));
+ return 1;
+ }
+ return 0;
+}
+
+static int smb_direct_post_send(struct smbdirect_socket *sc,
struct ib_send_wr *wr)
{
int ret;
- atomic_inc(&t->send_pending);
- ret = ib_post_send(t->qp, wr, NULL);
+ atomic_inc(&sc->send_io.pending.count);
+ ret = ib_post_send(sc->ib.qp, wr, NULL);
if (ret) {
pr_err("failed to post send: %d\n", ret);
- if (atomic_dec_and_test(&t->send_pending))
- wake_up(&t->wait_send_pending);
- smb_direct_disconnect_rdma_connection(t);
+ if (atomic_dec_and_test(&sc->send_io.pending.count))
+ wake_up(&sc->send_io.pending.zero_wait_queue);
+ smb_direct_disconnect_rdma_connection(sc);
}
return ret;
}
-static void smb_direct_send_ctx_init(struct smb_direct_transport *t,
- struct smb_direct_send_ctx *send_ctx,
+static void smb_direct_send_ctx_init(struct smbdirect_send_batch *send_ctx,
bool need_invalidate_rkey,
unsigned int remote_key)
{
@@ -907,47 +1000,50 @@ static void smb_direct_send_ctx_init(struct smb_direct_transport *t,
send_ctx->remote_key = remote_key;
}
-static int smb_direct_flush_send_list(struct smb_direct_transport *t,
- struct smb_direct_send_ctx *send_ctx,
+static int smb_direct_flush_send_list(struct smbdirect_socket *sc,
+ struct smbdirect_send_batch *send_ctx,
bool is_last)
{
- struct smb_direct_sendmsg *first, *last;
+ struct smbdirect_send_io *first, *last;
int ret;
if (list_empty(&send_ctx->msg_list))
return 0;
first = list_first_entry(&send_ctx->msg_list,
- struct smb_direct_sendmsg,
- list);
+ struct smbdirect_send_io,
+ sibling_list);
last = list_last_entry(&send_ctx->msg_list,
- struct smb_direct_sendmsg,
- list);
+ struct smbdirect_send_io,
+ sibling_list);
+
+ if (send_ctx->need_invalidate_rkey) {
+ first->wr.opcode = IB_WR_SEND_WITH_INV;
+ first->wr.ex.invalidate_rkey = send_ctx->remote_key;
+ send_ctx->need_invalidate_rkey = false;
+ send_ctx->remote_key = 0;
+ }
last->wr.send_flags = IB_SEND_SIGNALED;
last->wr.wr_cqe = &last->cqe;
- if (is_last && send_ctx->need_invalidate_rkey) {
- last->wr.opcode = IB_WR_SEND_WITH_INV;
- last->wr.ex.invalidate_rkey = send_ctx->remote_key;
- }
- ret = smb_direct_post_send(t, &first->wr);
+ ret = smb_direct_post_send(sc, &first->wr);
if (!ret) {
- smb_direct_send_ctx_init(t, send_ctx,
+ smb_direct_send_ctx_init(send_ctx,
send_ctx->need_invalidate_rkey,
send_ctx->remote_key);
} else {
- atomic_add(send_ctx->wr_cnt, &t->send_credits);
- wake_up(&t->wait_send_credits);
+ atomic_add(send_ctx->wr_cnt, &sc->send_io.credits.count);
+ wake_up(&sc->send_io.credits.wait_queue);
list_for_each_entry_safe(first, last, &send_ctx->msg_list,
- list) {
- smb_direct_free_sendmsg(t, first);
+ sibling_list) {
+ smb_direct_free_sendmsg(sc, first);
}
}
return ret;
}
-static int wait_for_credits(struct smb_direct_transport *t,
+static int wait_for_credits(struct smbdirect_socket *sc,
wait_queue_head_t *waitq, atomic_t *total_credits,
int needed)
{
@@ -960,61 +1056,68 @@ static int wait_for_credits(struct smb_direct_transport *t,
atomic_add(needed, total_credits);
ret = wait_event_interruptible(*waitq,
atomic_read(total_credits) >= needed ||
- t->status != SMB_DIRECT_CS_CONNECTED);
+ sc->status != SMBDIRECT_SOCKET_CONNECTED);
- if (t->status != SMB_DIRECT_CS_CONNECTED)
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
return -ENOTCONN;
else if (ret < 0)
return ret;
} while (true);
}
-static int wait_for_send_credits(struct smb_direct_transport *t,
- struct smb_direct_send_ctx *send_ctx)
+static int wait_for_send_credits(struct smbdirect_socket *sc,
+ struct smbdirect_send_batch *send_ctx)
{
int ret;
if (send_ctx &&
- (send_ctx->wr_cnt >= 16 || atomic_read(&t->send_credits) <= 1)) {
- ret = smb_direct_flush_send_list(t, send_ctx, false);
+ (send_ctx->wr_cnt >= 16 || atomic_read(&sc->send_io.credits.count) <= 1)) {
+ ret = smb_direct_flush_send_list(sc, send_ctx, false);
if (ret)
return ret;
}
- return wait_for_credits(t, &t->wait_send_credits, &t->send_credits, 1);
+ return wait_for_credits(sc, &sc->send_io.credits.wait_queue, &sc->send_io.credits.count, 1);
}
-static int wait_for_rw_credits(struct smb_direct_transport *t, int credits)
+static int wait_for_rw_credits(struct smbdirect_socket *sc, int credits)
{
- return wait_for_credits(t, &t->wait_rw_credits, &t->rw_credits, credits);
+ return wait_for_credits(sc,
+ &sc->rw_io.credits.wait_queue,
+ &sc->rw_io.credits.count,
+ credits);
}
-static int calc_rw_credits(struct smb_direct_transport *t,
+static int calc_rw_credits(struct smbdirect_socket *sc,
char *buf, unsigned int len)
{
return DIV_ROUND_UP(get_buf_page_count(buf, len),
- t->pages_per_rw_credit);
+ sc->rw_io.credits.num_pages);
}
-static int smb_direct_create_header(struct smb_direct_transport *t,
+static int smb_direct_create_header(struct smbdirect_socket *sc,
int size, int remaining_data_length,
- struct smb_direct_sendmsg **sendmsg_out)
+ struct smbdirect_send_io **sendmsg_out)
{
- struct smb_direct_sendmsg *sendmsg;
- struct smb_direct_data_transfer *packet;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct smbdirect_send_io *sendmsg;
+ struct smbdirect_data_transfer *packet;
int header_length;
int ret;
- sendmsg = smb_direct_alloc_sendmsg(t);
+ sendmsg = smb_direct_alloc_sendmsg(sc);
if (IS_ERR(sendmsg))
return PTR_ERR(sendmsg);
/* Fill in the packet header */
- packet = (struct smb_direct_data_transfer *)sendmsg->packet;
- packet->credits_requested = cpu_to_le16(t->send_credit_target);
- packet->credits_granted = cpu_to_le16(manage_credits_prior_sending(t));
+ packet = (struct smbdirect_data_transfer *)sendmsg->packet;
+ packet->credits_requested = cpu_to_le16(sp->send_credit_target);
+ packet->credits_granted = cpu_to_le16(manage_credits_prior_sending(sc));
packet->flags = 0;
+ if (manage_keep_alive_before_sending(sc))
+ packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED);
+
packet->reserved = 0;
if (!size)
packet->data_offset = 0;
@@ -1033,25 +1136,25 @@ static int smb_direct_create_header(struct smb_direct_transport *t,
le32_to_cpu(packet->remaining_data_length));
/* Map the packet to DMA */
- header_length = sizeof(struct smb_direct_data_transfer);
+ header_length = sizeof(struct smbdirect_data_transfer);
/* If this is a packet without payload, don't send padding */
if (!size)
header_length =
- offsetof(struct smb_direct_data_transfer, padding);
+ offsetof(struct smbdirect_data_transfer, padding);
- sendmsg->sge[0].addr = ib_dma_map_single(t->cm_id->device,
+ sendmsg->sge[0].addr = ib_dma_map_single(sc->ib.dev,
(void *)packet,
header_length,
DMA_TO_DEVICE);
- ret = ib_dma_mapping_error(t->cm_id->device, sendmsg->sge[0].addr);
+ ret = ib_dma_mapping_error(sc->ib.dev, sendmsg->sge[0].addr);
if (ret) {
- smb_direct_free_sendmsg(t, sendmsg);
+ smb_direct_free_sendmsg(sc, sendmsg);
return ret;
}
sendmsg->num_sge = 1;
sendmsg->sge[0].length = header_length;
- sendmsg->sge[0].lkey = t->pd->local_dma_lkey;
+ sendmsg->sge[0].lkey = sc->ib.pd->local_dma_lkey;
*sendmsg_out = sendmsg;
return 0;
@@ -1101,14 +1204,14 @@ static int get_mapped_sg_list(struct ib_device *device, void *buf, int size,
return ib_dma_map_sg(device, sg_list, npages, dir);
}
-static int post_sendmsg(struct smb_direct_transport *t,
- struct smb_direct_send_ctx *send_ctx,
- struct smb_direct_sendmsg *msg)
+static int post_sendmsg(struct smbdirect_socket *sc,
+ struct smbdirect_send_batch *send_ctx,
+ struct smbdirect_send_io *msg)
{
int i;
for (i = 0; i < msg->num_sge; i++)
- ib_dma_sync_single_for_device(t->cm_id->device,
+ ib_dma_sync_single_for_device(sc->ib.dev,
msg->sge[i].addr, msg->sge[i].length,
DMA_TO_DEVICE);
@@ -1122,34 +1225,34 @@ static int post_sendmsg(struct smb_direct_transport *t,
msg->wr.wr_cqe = NULL;
msg->wr.send_flags = 0;
if (!list_empty(&send_ctx->msg_list)) {
- struct smb_direct_sendmsg *last;
+ struct smbdirect_send_io *last;
last = list_last_entry(&send_ctx->msg_list,
- struct smb_direct_sendmsg,
- list);
+ struct smbdirect_send_io,
+ sibling_list);
last->wr.next = &msg->wr;
}
- list_add_tail(&msg->list, &send_ctx->msg_list);
+ list_add_tail(&msg->sibling_list, &send_ctx->msg_list);
send_ctx->wr_cnt++;
return 0;
}
msg->wr.wr_cqe = &msg->cqe;
msg->wr.send_flags = IB_SEND_SIGNALED;
- return smb_direct_post_send(t, &msg->wr);
+ return smb_direct_post_send(sc, &msg->wr);
}
-static int smb_direct_post_send_data(struct smb_direct_transport *t,
- struct smb_direct_send_ctx *send_ctx,
+static int smb_direct_post_send_data(struct smbdirect_socket *sc,
+ struct smbdirect_send_batch *send_ctx,
struct kvec *iov, int niov,
int remaining_data_length)
{
int i, j, ret;
- struct smb_direct_sendmsg *msg;
+ struct smbdirect_send_io *msg;
int data_length;
- struct scatterlist sg[SMB_DIRECT_MAX_SEND_SGES - 1];
+ struct scatterlist sg[SMBDIRECT_SEND_IO_MAX_SGE - 1];
- ret = wait_for_send_credits(t, send_ctx);
+ ret = wait_for_send_credits(sc, send_ctx);
if (ret)
return ret;
@@ -1157,10 +1260,10 @@ static int smb_direct_post_send_data(struct smb_direct_transport *t,
for (i = 0; i < niov; i++)
data_length += iov[i].iov_len;
- ret = smb_direct_create_header(t, data_length, remaining_data_length,
+ ret = smb_direct_create_header(sc, data_length, remaining_data_length,
&msg);
if (ret) {
- atomic_inc(&t->send_credits);
+ atomic_inc(&sc->send_io.credits.count);
return ret;
}
@@ -1168,19 +1271,19 @@ static int smb_direct_post_send_data(struct smb_direct_transport *t,
struct ib_sge *sge;
int sg_cnt;
- sg_init_table(sg, SMB_DIRECT_MAX_SEND_SGES - 1);
- sg_cnt = get_mapped_sg_list(t->cm_id->device,
+ sg_init_table(sg, SMBDIRECT_SEND_IO_MAX_SGE - 1);
+ sg_cnt = get_mapped_sg_list(sc->ib.dev,
iov[i].iov_base, iov[i].iov_len,
- sg, SMB_DIRECT_MAX_SEND_SGES - 1,
+ sg, SMBDIRECT_SEND_IO_MAX_SGE - 1,
DMA_TO_DEVICE);
if (sg_cnt <= 0) {
pr_err("failed to map buffer\n");
ret = -ENOMEM;
goto err;
- } else if (sg_cnt + msg->num_sge > SMB_DIRECT_MAX_SEND_SGES) {
+ } else if (sg_cnt + msg->num_sge > SMBDIRECT_SEND_IO_MAX_SGE) {
pr_err("buffer not fitted into sges\n");
ret = -E2BIG;
- ib_dma_unmap_sg(t->cm_id->device, sg, sg_cnt,
+ ib_dma_unmap_sg(sc->ib.dev, sg, sg_cnt,
DMA_TO_DEVICE);
goto err;
}
@@ -1189,18 +1292,18 @@ static int smb_direct_post_send_data(struct smb_direct_transport *t,
sge = &msg->sge[msg->num_sge];
sge->addr = sg_dma_address(&sg[j]);
sge->length = sg_dma_len(&sg[j]);
- sge->lkey = t->pd->local_dma_lkey;
+ sge->lkey = sc->ib.pd->local_dma_lkey;
msg->num_sge++;
}
}
- ret = post_sendmsg(t, send_ctx, msg);
+ ret = post_sendmsg(sc, send_ctx, msg);
if (ret)
goto err;
return 0;
err:
- smb_direct_free_sendmsg(t, msg);
- atomic_inc(&t->send_credits);
+ smb_direct_free_sendmsg(sc, msg);
+ atomic_inc(&sc->send_io.credits.count);
return ret;
}
@@ -1208,79 +1311,133 @@ static int smb_direct_writev(struct ksmbd_transport *t,
struct kvec *iov, int niovs, int buflen,
bool need_invalidate, unsigned int remote_key)
{
- struct smb_direct_transport *st = smb_trans_direct_transfort(t);
- int remaining_data_length;
- int start, i, j;
- int max_iov_size = st->max_send_size -
- sizeof(struct smb_direct_data_transfer);
+ struct smb_direct_transport *st = SMBD_TRANS(t);
+ struct smbdirect_socket *sc = &st->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+ size_t remaining_data_length;
+ size_t iov_idx;
+ size_t iov_ofs;
+ size_t max_iov_size = sp->max_send_size -
+ sizeof(struct smbdirect_data_transfer);
int ret;
- struct kvec vec;
- struct smb_direct_send_ctx send_ctx;
+ struct smbdirect_send_batch send_ctx;
+ int error = 0;
- if (st->status != SMB_DIRECT_CS_CONNECTED)
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
return -ENOTCONN;
//FIXME: skip RFC1002 header..
+ if (WARN_ON_ONCE(niovs <= 1 || iov[0].iov_len != 4))
+ return -EINVAL;
buflen -= 4;
+ iov_idx = 1;
+ iov_ofs = 0;
remaining_data_length = buflen;
ksmbd_debug(RDMA, "Sending smb (RDMA): smb_len=%u\n", buflen);
- smb_direct_send_ctx_init(st, &send_ctx, need_invalidate, remote_key);
- start = i = 1;
- buflen = 0;
- while (true) {
- buflen += iov[i].iov_len;
- if (buflen > max_iov_size) {
- if (i > start) {
- remaining_data_length -=
- (buflen - iov[i].iov_len);
- ret = smb_direct_post_send_data(st, &send_ctx,
- &iov[start], i - start,
- remaining_data_length);
- if (ret)
+ smb_direct_send_ctx_init(&send_ctx, need_invalidate, remote_key);
+ while (remaining_data_length) {
+ struct kvec vecs[SMBDIRECT_SEND_IO_MAX_SGE - 1]; /* minus smbdirect hdr */
+ size_t possible_bytes = max_iov_size;
+ size_t possible_vecs;
+ size_t bytes = 0;
+ size_t nvecs = 0;
+
+ /*
+ * For the last message remaining_data_length should be
+		 * For the last message remaining_data_length should
+		 * have been 0 already!
+ if (WARN_ON_ONCE(iov_idx >= niovs)) {
+ error = -EINVAL;
+ goto done;
+ }
+
+ /*
+ * We have 2 factors which limit the arguments we pass
+ * to smb_direct_post_send_data():
+ *
+ * 1. The number of supported sges for the send,
+ * while one is reserved for the smbdirect header.
+ * And we currently need one SGE per page.
+ * 2. The number of negotiated payload bytes per send.
+ */
+ possible_vecs = min_t(size_t, ARRAY_SIZE(vecs), niovs - iov_idx);
+
+ while (iov_idx < niovs && possible_vecs && possible_bytes) {
+ struct kvec *v = &vecs[nvecs];
+ int page_count;
+
+ v->iov_base = ((u8 *)iov[iov_idx].iov_base) + iov_ofs;
+ v->iov_len = min_t(size_t,
+ iov[iov_idx].iov_len - iov_ofs,
+ possible_bytes);
+ page_count = get_buf_page_count(v->iov_base, v->iov_len);
+ if (page_count > possible_vecs) {
+ /*
+ * If the number of pages in the buffer
+				 * is too much (because we currently require
+ * one SGE per page), we need to limit the
+ * length.
+ *
+ * We know possible_vecs is at least 1,
+ * so we always keep the first page.
+ *
+				 * We need to calculate the number of extra
+ * pages (epages) we can also keep.
+ *
+ * We calculate the number of bytes in the
+				 * first page (fplen); this should never be
+ * larger than v->iov_len because page_count is
+ * at least 2, but adding a limitation feels
+ * better.
+ *
+ * Then we calculate the number of bytes (elen)
+ * we can keep for the extra pages.
+ */
+ size_t epages = possible_vecs - 1;
+ size_t fpofs = offset_in_page(v->iov_base);
+ size_t fplen = min_t(size_t, PAGE_SIZE - fpofs, v->iov_len);
+ size_t elen = min_t(size_t, v->iov_len - fplen, epages*PAGE_SIZE);
+
+ v->iov_len = fplen + elen;
+ page_count = get_buf_page_count(v->iov_base, v->iov_len);
+ if (WARN_ON_ONCE(page_count > possible_vecs)) {
+ /*
+ * Something went wrong in the above
+ * logic...
+ */
+ error = -EINVAL;
goto done;
- } else {
- /* iov[start] is too big, break it */
- int nvec = (buflen + max_iov_size - 1) /
- max_iov_size;
-
- for (j = 0; j < nvec; j++) {
- vec.iov_base =
- (char *)iov[start].iov_base +
- j * max_iov_size;
- vec.iov_len =
- min_t(int, max_iov_size,
- buflen - max_iov_size * j);
- remaining_data_length -= vec.iov_len;
- ret = smb_direct_post_send_data(st, &send_ctx, &vec, 1,
- remaining_data_length);
- if (ret)
- goto done;
}
- i++;
- if (i == niovs)
- break;
}
- start = i;
- buflen = 0;
- } else {
- i++;
- if (i == niovs) {
- /* send out all remaining vecs */
- remaining_data_length -= buflen;
- ret = smb_direct_post_send_data(st, &send_ctx,
- &iov[start], i - start,
- remaining_data_length);
- if (ret)
- goto done;
- break;
+ possible_vecs -= page_count;
+ nvecs += 1;
+ possible_bytes -= v->iov_len;
+ bytes += v->iov_len;
+
+ iov_ofs += v->iov_len;
+ if (iov_ofs >= iov[iov_idx].iov_len) {
+ iov_idx += 1;
+ iov_ofs = 0;
}
}
+
+ remaining_data_length -= bytes;
+
+ ret = smb_direct_post_send_data(sc, &send_ctx,
+ vecs, nvecs,
+ remaining_data_length);
+ if (unlikely(ret)) {
+ error = ret;
+ goto done;
+ }
}
done:
- ret = smb_direct_flush_send_list(st, &send_ctx, true);
+ ret = smb_direct_flush_send_list(sc, &send_ctx, true);
+ if (unlikely(!ret && error))
+ ret = error;
/*
* As an optimization, we don't wait for individual I/O to finish
@@ -1289,16 +1446,22 @@ done:
* that means all the I/Os have been out and we are good to return
*/
- wait_event(st->wait_send_pending,
- atomic_read(&st->send_pending) == 0);
+ wait_event(sc->send_io.pending.zero_wait_queue,
+ atomic_read(&sc->send_io.pending.count) == 0 ||
+ sc->status != SMBDIRECT_SOCKET_CONNECTED);
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED && ret == 0)
+ ret = -ENOTCONN;
+
return ret;
}
static void smb_direct_free_rdma_rw_msg(struct smb_direct_transport *t,
- struct smb_direct_rdma_rw_msg *msg,
+ struct smbdirect_rw_io *msg,
enum dma_data_direction dir)
{
- rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port,
+ struct smbdirect_socket *sc = &t->socket;
+
+ rdma_rw_ctx_destroy(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port,
msg->sgt.sgl, msg->sgt.nents, dir);
sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
kfree(msg);
@@ -1307,16 +1470,16 @@ static void smb_direct_free_rdma_rw_msg(struct smb_direct_transport *t,
static void read_write_done(struct ib_cq *cq, struct ib_wc *wc,
enum dma_data_direction dir)
{
- struct smb_direct_rdma_rw_msg *msg = container_of(wc->wr_cqe,
- struct smb_direct_rdma_rw_msg, cqe);
- struct smb_direct_transport *t = msg->t;
+ struct smbdirect_rw_io *msg =
+ container_of(wc->wr_cqe, struct smbdirect_rw_io, cqe);
+ struct smbdirect_socket *sc = msg->socket;
if (wc->status != IB_WC_SUCCESS) {
- msg->status = -EIO;
+ msg->error = -EIO;
pr_err("read/write error. opcode = %d, status = %s(%d)\n",
wc->opcode, ib_wc_status_msg(wc->status), wc->status);
if (wc->status != IB_WC_WR_FLUSH_ERR)
- smb_direct_disconnect_rdma_connection(t);
+ smb_direct_disconnect_rdma_connection(sc);
}
complete(msg->completion);
@@ -1334,11 +1497,13 @@ static void write_done(struct ib_cq *cq, struct ib_wc *wc)
static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
void *buf, int buf_len,
- struct smb2_buffer_desc_v1 *desc,
+ struct smbdirect_buffer_descriptor_v1 *desc,
unsigned int desc_len,
bool is_read)
{
- struct smb_direct_rdma_rw_msg *msg, *next_msg;
+ struct smbdirect_socket *sc = &t->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct smbdirect_rw_io *msg, *next_msg;
int i, ret;
DECLARE_COMPLETION_ONSTACK(completion);
struct ib_send_wr *first_wr;
@@ -1347,10 +1512,10 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
int credits_needed;
unsigned int desc_buf_len, desc_num = 0;
- if (t->status != SMB_DIRECT_CS_CONNECTED)
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
return -ENOTCONN;
- if (buf_len > t->max_rdma_rw_size)
+ if (buf_len > sp->max_read_write_size)
return -EINVAL;
/* calculate needed credits */
@@ -1370,7 +1535,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
buf_len = 0;
}
- credits_needed += calc_rw_credits(t, desc_buf, desc_buf_len);
+ credits_needed += calc_rw_credits(sc, desc_buf, desc_buf_len);
desc_buf += desc_buf_len;
buf_len -= desc_buf_len;
desc_num++;
@@ -1379,7 +1544,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
ksmbd_debug(RDMA, "RDMA %s, len %#x, needed credits %#x\n",
str_read_write(is_read), buf_len, credits_needed);
- ret = wait_for_rw_credits(t, credits_needed);
+ ret = wait_for_rw_credits(sc, credits_needed);
if (ret < 0)
return ret;
@@ -1395,7 +1560,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
desc_buf_len = le32_to_cpu(desc[i].length);
- msg->t = t;
+ msg->socket = sc;
msg->cqe.done = is_read ? read_done : write_done;
msg->completion = &completion;
@@ -1417,7 +1582,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
goto out;
}
- ret = rdma_rw_ctx_init(&msg->rw_ctx, t->qp, t->qp->port,
+ ret = rdma_rw_ctx_init(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port,
msg->sgt.sgl,
get_buf_page_count(desc_buf, desc_buf_len),
0,
@@ -1438,96 +1603,94 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
/* concatenate work requests of rdma_rw_ctxs */
first_wr = NULL;
list_for_each_entry_reverse(msg, &msg_list, list) {
- first_wr = rdma_rw_ctx_wrs(&msg->rw_ctx, t->qp, t->qp->port,
+ first_wr = rdma_rw_ctx_wrs(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port,
&msg->cqe, first_wr);
}
- ret = ib_post_send(t->qp, first_wr, NULL);
+ ret = ib_post_send(sc->ib.qp, first_wr, NULL);
if (ret) {
pr_err("failed to post send wr for RDMA R/W: %d\n", ret);
goto out;
}
- msg = list_last_entry(&msg_list, struct smb_direct_rdma_rw_msg, list);
+ msg = list_last_entry(&msg_list, struct smbdirect_rw_io, list);
wait_for_completion(&completion);
- ret = msg->status;
+ ret = msg->error;
out:
list_for_each_entry_safe(msg, next_msg, &msg_list, list) {
list_del(&msg->list);
smb_direct_free_rdma_rw_msg(t, msg,
is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
}
- atomic_add(credits_needed, &t->rw_credits);
- wake_up(&t->wait_rw_credits);
+ atomic_add(credits_needed, &sc->rw_io.credits.count);
+ wake_up(&sc->rw_io.credits.wait_queue);
return ret;
}
static int smb_direct_rdma_write(struct ksmbd_transport *t,
void *buf, unsigned int buflen,
- struct smb2_buffer_desc_v1 *desc,
+ struct smbdirect_buffer_descriptor_v1 *desc,
unsigned int desc_len)
{
- return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen,
+ return smb_direct_rdma_xmit(SMBD_TRANS(t), buf, buflen,
desc, desc_len, false);
}
static int smb_direct_rdma_read(struct ksmbd_transport *t,
void *buf, unsigned int buflen,
- struct smb2_buffer_desc_v1 *desc,
+ struct smbdirect_buffer_descriptor_v1 *desc,
unsigned int desc_len)
{
- return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen,
+ return smb_direct_rdma_xmit(SMBD_TRANS(t), buf, buflen,
desc, desc_len, true);
}
static void smb_direct_disconnect(struct ksmbd_transport *t)
{
- struct smb_direct_transport *st = smb_trans_direct_transfort(t);
+ struct smb_direct_transport *st = SMBD_TRANS(t);
+ struct smbdirect_socket *sc = &st->socket;
- ksmbd_debug(RDMA, "Disconnecting cm_id=%p\n", st->cm_id);
+ ksmbd_debug(RDMA, "Disconnecting cm_id=%p\n", sc->rdma.cm_id);
- smb_direct_disconnect_rdma_work(&st->disconnect_work);
- wait_event_interruptible(st->wait_status,
- st->status == SMB_DIRECT_CS_DISCONNECTED);
free_transport(st);
}
static void smb_direct_shutdown(struct ksmbd_transport *t)
{
- struct smb_direct_transport *st = smb_trans_direct_transfort(t);
+ struct smb_direct_transport *st = SMBD_TRANS(t);
+ struct smbdirect_socket *sc = &st->socket;
- ksmbd_debug(RDMA, "smb-direct shutdown cm_id=%p\n", st->cm_id);
+ ksmbd_debug(RDMA, "smb-direct shutdown cm_id=%p\n", sc->rdma.cm_id);
- smb_direct_disconnect_rdma_work(&st->disconnect_work);
+ smb_direct_disconnect_rdma_work(&sc->disconnect_work);
}
static int smb_direct_cm_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event)
{
- struct smb_direct_transport *t = cm_id->context;
+ struct smbdirect_socket *sc = cm_id->context;
ksmbd_debug(RDMA, "RDMA CM event. cm_id=%p event=%s (%d)\n",
cm_id, rdma_event_msg(event->event), event->event);
switch (event->event) {
case RDMA_CM_EVENT_ESTABLISHED: {
- t->status = SMB_DIRECT_CS_CONNECTED;
- wake_up_interruptible(&t->wait_status);
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING);
+ sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED;
+ wake_up(&sc->status_wait);
break;
}
case RDMA_CM_EVENT_DEVICE_REMOVAL:
case RDMA_CM_EVENT_DISCONNECTED: {
- ib_drain_qp(t->qp);
+ ib_drain_qp(sc->ib.qp);
- t->status = SMB_DIRECT_CS_DISCONNECTED;
- wake_up_interruptible(&t->wait_status);
- wake_up_interruptible(&t->wait_reassembly_queue);
- wake_up(&t->wait_send_credits);
+ sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
+ smb_direct_disconnect_rdma_work(&sc->disconnect_work);
break;
}
case RDMA_CM_EVENT_CONNECT_ERROR: {
- t->status = SMB_DIRECT_CS_DISCONNECTED;
- wake_up_interruptible(&t->wait_status);
+ sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
+ smb_direct_disconnect_rdma_work(&sc->disconnect_work);
break;
}
default:
@@ -1541,38 +1704,41 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id,
static void smb_direct_qpair_handler(struct ib_event *event, void *context)
{
- struct smb_direct_transport *t = context;
+ struct smbdirect_socket *sc = context;
ksmbd_debug(RDMA, "Received QP event. cm_id=%p, event=%s (%d)\n",
- t->cm_id, ib_event_msg(event->event), event->event);
+ sc->rdma.cm_id, ib_event_msg(event->event), event->event);
switch (event->event) {
case IB_EVENT_CQ_ERR:
case IB_EVENT_QP_FATAL:
- smb_direct_disconnect_rdma_connection(t);
+ smb_direct_disconnect_rdma_connection(sc);
break;
default:
break;
}
}
-static int smb_direct_send_negotiate_response(struct smb_direct_transport *t,
+static int smb_direct_send_negotiate_response(struct smbdirect_socket *sc,
int failed)
{
- struct smb_direct_sendmsg *sendmsg;
- struct smb_direct_negotiate_resp *resp;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct smbdirect_send_io *sendmsg;
+ struct smbdirect_negotiate_resp *resp;
int ret;
- sendmsg = smb_direct_alloc_sendmsg(t);
+ sendmsg = smb_direct_alloc_sendmsg(sc);
if (IS_ERR(sendmsg))
return -ENOMEM;
- resp = (struct smb_direct_negotiate_resp *)sendmsg->packet;
+ resp = (struct smbdirect_negotiate_resp *)sendmsg->packet;
if (failed) {
memset(resp, 0, sizeof(*resp));
- resp->min_version = cpu_to_le16(0x0100);
- resp->max_version = cpu_to_le16(0x0100);
+ resp->min_version = SMB_DIRECT_VERSION_LE;
+ resp->max_version = SMB_DIRECT_VERSION_LE;
resp->status = STATUS_NOT_SUPPORTED;
+
+ sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
} else {
resp->status = STATUS_SUCCESS;
resp->min_version = SMB_DIRECT_VERSION_LE;
@@ -1580,57 +1746,65 @@ static int smb_direct_send_negotiate_response(struct smb_direct_transport *t,
resp->negotiated_version = SMB_DIRECT_VERSION_LE;
resp->reserved = 0;
resp->credits_requested =
- cpu_to_le16(t->send_credit_target);
- resp->credits_granted = cpu_to_le16(manage_credits_prior_sending(t));
- resp->max_readwrite_size = cpu_to_le32(t->max_rdma_rw_size);
- resp->preferred_send_size = cpu_to_le32(t->max_send_size);
- resp->max_receive_size = cpu_to_le32(t->max_recv_size);
+ cpu_to_le16(sp->send_credit_target);
+ resp->credits_granted = cpu_to_le16(manage_credits_prior_sending(sc));
+ resp->max_readwrite_size = cpu_to_le32(sp->max_read_write_size);
+ resp->preferred_send_size = cpu_to_le32(sp->max_send_size);
+ resp->max_receive_size = cpu_to_le32(sp->max_recv_size);
resp->max_fragmented_size =
- cpu_to_le32(t->max_fragmented_recv_size);
+ cpu_to_le32(sp->max_fragmented_recv_size);
+
+ sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER;
+ sc->status = SMBDIRECT_SOCKET_CONNECTED;
}
- sendmsg->sge[0].addr = ib_dma_map_single(t->cm_id->device,
+ sendmsg->sge[0].addr = ib_dma_map_single(sc->ib.dev,
(void *)resp, sizeof(*resp),
DMA_TO_DEVICE);
- ret = ib_dma_mapping_error(t->cm_id->device, sendmsg->sge[0].addr);
+ ret = ib_dma_mapping_error(sc->ib.dev, sendmsg->sge[0].addr);
if (ret) {
- smb_direct_free_sendmsg(t, sendmsg);
+ smb_direct_free_sendmsg(sc, sendmsg);
return ret;
}
sendmsg->num_sge = 1;
sendmsg->sge[0].length = sizeof(*resp);
- sendmsg->sge[0].lkey = t->pd->local_dma_lkey;
+ sendmsg->sge[0].lkey = sc->ib.pd->local_dma_lkey;
- ret = post_sendmsg(t, NULL, sendmsg);
+ ret = post_sendmsg(sc, NULL, sendmsg);
if (ret) {
- smb_direct_free_sendmsg(t, sendmsg);
+ smb_direct_free_sendmsg(sc, sendmsg);
return ret;
}
- wait_event(t->wait_send_pending,
- atomic_read(&t->send_pending) == 0);
+ wait_event(sc->send_io.pending.zero_wait_queue,
+ atomic_read(&sc->send_io.pending.count) == 0 ||
+ sc->status != SMBDIRECT_SOCKET_CONNECTED);
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
+ return -ENOTCONN;
+
return 0;
}
-static int smb_direct_accept_client(struct smb_direct_transport *t)
+static int smb_direct_accept_client(struct smbdirect_socket *sc)
{
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
struct rdma_conn_param conn_param;
- struct ib_port_immutable port_immutable;
- u32 ird_ord_hdr[2];
+ __be32 ird_ord_hdr[2];
int ret;
+ /*
+ * smb_direct_handle_connect_request()
+ * already negotiated sp->initiator_depth
+ * and sp->responder_resources
+ */
memset(&conn_param, 0, sizeof(conn_param));
- conn_param.initiator_depth = min_t(u8, t->cm_id->device->attrs.max_qp_rd_atom,
- SMB_DIRECT_CM_INITIATOR_DEPTH);
- conn_param.responder_resources = 0;
-
- t->cm_id->device->ops.get_port_immutable(t->cm_id->device,
- t->cm_id->port_num,
- &port_immutable);
- if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) {
- ird_ord_hdr[0] = conn_param.responder_resources;
- ird_ord_hdr[1] = 1;
+ conn_param.initiator_depth = sp->initiator_depth;
+ conn_param.responder_resources = sp->responder_resources;
+
+ if (sc->rdma.legacy_iwarp) {
+ ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources);
+ ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth);
conn_param.private_data = ird_ord_hdr;
conn_param.private_data_len = sizeof(ird_ord_hdr);
} else {
@@ -1641,7 +1815,17 @@ static int smb_direct_accept_client(struct smb_direct_transport *t)
conn_param.rnr_retry_count = SMB_DIRECT_CM_RNR_RETRY;
conn_param.flow_control = 0;
- ret = rdma_accept(t->cm_id, &conn_param);
+ /*
+ * start with the negotiate timeout and SMBDIRECT_KEEPALIVE_PENDING
+ * so that the timer will cause a disconnect.
+ */
+ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
+ mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
+ msecs_to_jiffies(sp->negotiate_timeout_msec));
+
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED);
+ sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING;
+ ret = rdma_accept(sc->rdma.cm_id, &conn_param);
if (ret) {
pr_err("error at rdma_accept: %d\n", ret);
return ret;
@@ -1649,57 +1833,60 @@ static int smb_direct_accept_client(struct smb_direct_transport *t)
return 0;
}
-static int smb_direct_prepare_negotiation(struct smb_direct_transport *t)
+static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc)
{
+ struct smbdirect_recv_io *recvmsg;
int ret;
- struct smb_direct_recvmsg *recvmsg;
- recvmsg = get_free_recvmsg(t);
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED);
+ sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED;
+
+ sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REQ;
+
+ recvmsg = get_free_recvmsg(sc);
if (!recvmsg)
return -ENOMEM;
- recvmsg->type = SMB_DIRECT_MSG_NEGOTIATE_REQ;
- ret = smb_direct_post_recv(t, recvmsg);
+ ret = smb_direct_post_recv(sc, recvmsg);
if (ret) {
pr_err("Can't post recv: %d\n", ret);
goto out_err;
}
- t->negotiation_requested = false;
- ret = smb_direct_accept_client(t);
+ ret = smb_direct_accept_client(sc);
if (ret) {
pr_err("Can't accept client\n");
goto out_err;
}
- smb_direct_post_recv_credits(&t->post_recv_credits_work.work);
+ smb_direct_post_recv_credits(&sc->recv_io.posted.refill_work);
return 0;
out_err:
- put_recvmsg(t, recvmsg);
+ put_recvmsg(sc, recvmsg);
return ret;
}
-static unsigned int smb_direct_get_max_fr_pages(struct smb_direct_transport *t)
+static unsigned int smb_direct_get_max_fr_pages(struct smbdirect_socket *sc)
{
return min_t(unsigned int,
- t->cm_id->device->attrs.max_fast_reg_page_list_len,
+ sc->ib.dev->attrs.max_fast_reg_page_list_len,
256);
}
-static int smb_direct_init_params(struct smb_direct_transport *t,
+static int smb_direct_init_params(struct smbdirect_socket *sc,
struct ib_qp_cap *cap)
{
- struct ib_device *device = t->cm_id->device;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct ib_device *device = sc->ib.dev;
int max_send_sges, max_rw_wrs, max_send_wrs;
unsigned int max_sge_per_wr, wrs_per_credit;
/* need 3 more sge. because a SMB_DIRECT header, SMB2 header,
* SMB2 response could be mapped.
*/
- t->max_send_size = smb_direct_max_send_size;
- max_send_sges = DIV_ROUND_UP(t->max_send_size, PAGE_SIZE) + 3;
- if (max_send_sges > SMB_DIRECT_MAX_SEND_SGES) {
- pr_err("max_send_size %d is too large\n", t->max_send_size);
+ max_send_sges = DIV_ROUND_UP(sp->max_send_size, PAGE_SIZE) + 3;
+ if (max_send_sges > SMBDIRECT_SEND_IO_MAX_SGE) {
+ pr_err("max_send_size %d is too large\n", sp->max_send_size);
return -EINVAL;
}
@@ -1710,10 +1897,9 @@ static int smb_direct_init_params(struct smb_direct_transport *t,
* are needed for MR registration, RDMA R/W, local & remote
* MR invalidation.
*/
- t->max_rdma_rw_size = smb_direct_max_read_write_size;
- t->pages_per_rw_credit = smb_direct_get_max_fr_pages(t);
- t->max_rw_credits = DIV_ROUND_UP(t->max_rdma_rw_size,
- (t->pages_per_rw_credit - 1) *
+ sc->rw_io.credits.num_pages = smb_direct_get_max_fr_pages(sc);
+ sc->rw_io.credits.max = DIV_ROUND_UP(sp->max_read_write_size,
+ (sc->rw_io.credits.num_pages - 1) *
PAGE_SIZE);
max_sge_per_wr = min_t(unsigned int, device->attrs.max_send_sge,
@@ -1721,233 +1907,244 @@ static int smb_direct_init_params(struct smb_direct_transport *t,
max_sge_per_wr = max_t(unsigned int, max_sge_per_wr,
max_send_sges);
wrs_per_credit = max_t(unsigned int, 4,
- DIV_ROUND_UP(t->pages_per_rw_credit,
+ DIV_ROUND_UP(sc->rw_io.credits.num_pages,
max_sge_per_wr) + 1);
- max_rw_wrs = t->max_rw_credits * wrs_per_credit;
+ max_rw_wrs = sc->rw_io.credits.max * wrs_per_credit;
- max_send_wrs = smb_direct_send_credit_target + max_rw_wrs;
+ max_send_wrs = sp->send_credit_target + max_rw_wrs;
if (max_send_wrs > device->attrs.max_cqe ||
max_send_wrs > device->attrs.max_qp_wr) {
pr_err("consider lowering send_credit_target = %d\n",
- smb_direct_send_credit_target);
+ sp->send_credit_target);
pr_err("Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
device->attrs.max_cqe, device->attrs.max_qp_wr);
return -EINVAL;
}
- if (smb_direct_receive_credit_max > device->attrs.max_cqe ||
- smb_direct_receive_credit_max > device->attrs.max_qp_wr) {
+ if (sp->recv_credit_max > device->attrs.max_cqe ||
+ sp->recv_credit_max > device->attrs.max_qp_wr) {
pr_err("consider lowering receive_credit_max = %d\n",
- smb_direct_receive_credit_max);
+ sp->recv_credit_max);
pr_err("Possible CQE overrun, device reporting max_cpe %d max_qp_wr %d\n",
device->attrs.max_cqe, device->attrs.max_qp_wr);
return -EINVAL;
}
- if (device->attrs.max_recv_sge < SMB_DIRECT_MAX_RECV_SGES) {
+ if (device->attrs.max_send_sge < SMBDIRECT_SEND_IO_MAX_SGE) {
+ pr_err("warning: device max_send_sge = %d too small\n",
+ device->attrs.max_send_sge);
+ return -EINVAL;
+ }
+ if (device->attrs.max_recv_sge < SMBDIRECT_RECV_IO_MAX_SGE) {
pr_err("warning: device max_recv_sge = %d too small\n",
device->attrs.max_recv_sge);
return -EINVAL;
}
- t->recv_credits = 0;
- t->count_avail_recvmsg = 0;
+ sc->recv_io.credits.target = 1;
- t->recv_credit_max = smb_direct_receive_credit_max;
- t->recv_credit_target = 10;
- t->new_recv_credits = 0;
-
- t->send_credit_target = smb_direct_send_credit_target;
- atomic_set(&t->send_credits, 0);
- atomic_set(&t->rw_credits, t->max_rw_credits);
-
- t->max_send_size = smb_direct_max_send_size;
- t->max_recv_size = smb_direct_max_receive_size;
- t->max_fragmented_recv_size = smb_direct_max_fragmented_recv_size;
+ atomic_set(&sc->rw_io.credits.count, sc->rw_io.credits.max);
cap->max_send_wr = max_send_wrs;
- cap->max_recv_wr = t->recv_credit_max;
- cap->max_send_sge = max_sge_per_wr;
- cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES;
+ cap->max_recv_wr = sp->recv_credit_max;
+ cap->max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE;
+ cap->max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE;
cap->max_inline_data = 0;
- cap->max_rdma_ctxs = t->max_rw_credits;
+ cap->max_rdma_ctxs = sc->rw_io.credits.max;
return 0;
}
-static void smb_direct_destroy_pools(struct smb_direct_transport *t)
+static void smb_direct_destroy_pools(struct smbdirect_socket *sc)
{
- struct smb_direct_recvmsg *recvmsg;
+ struct smbdirect_recv_io *recvmsg;
- while ((recvmsg = get_free_recvmsg(t)))
- mempool_free(recvmsg, t->recvmsg_mempool);
+ while ((recvmsg = get_free_recvmsg(sc)))
+ mempool_free(recvmsg, sc->recv_io.mem.pool);
- mempool_destroy(t->recvmsg_mempool);
- t->recvmsg_mempool = NULL;
+ mempool_destroy(sc->recv_io.mem.pool);
+ sc->recv_io.mem.pool = NULL;
- kmem_cache_destroy(t->recvmsg_cache);
- t->recvmsg_cache = NULL;
+ kmem_cache_destroy(sc->recv_io.mem.cache);
+ sc->recv_io.mem.cache = NULL;
- mempool_destroy(t->sendmsg_mempool);
- t->sendmsg_mempool = NULL;
+ mempool_destroy(sc->send_io.mem.pool);
+ sc->send_io.mem.pool = NULL;
- kmem_cache_destroy(t->sendmsg_cache);
- t->sendmsg_cache = NULL;
+ kmem_cache_destroy(sc->send_io.mem.cache);
+ sc->send_io.mem.cache = NULL;
}
-static int smb_direct_create_pools(struct smb_direct_transport *t)
+static int smb_direct_create_pools(struct smbdirect_socket *sc)
{
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
char name[80];
int i;
- struct smb_direct_recvmsg *recvmsg;
+ struct smbdirect_recv_io *recvmsg;
- snprintf(name, sizeof(name), "smb_direct_rqst_pool_%p", t);
- t->sendmsg_cache = kmem_cache_create(name,
- sizeof(struct smb_direct_sendmsg) +
- sizeof(struct smb_direct_negotiate_resp),
+ snprintf(name, sizeof(name), "smbdirect_send_io_pool_%p", sc);
+ sc->send_io.mem.cache = kmem_cache_create(name,
+ sizeof(struct smbdirect_send_io) +
+ sizeof(struct smbdirect_negotiate_resp),
0, SLAB_HWCACHE_ALIGN, NULL);
- if (!t->sendmsg_cache)
+ if (!sc->send_io.mem.cache)
return -ENOMEM;
- t->sendmsg_mempool = mempool_create(t->send_credit_target,
+ sc->send_io.mem.pool = mempool_create(sp->send_credit_target,
mempool_alloc_slab, mempool_free_slab,
- t->sendmsg_cache);
- if (!t->sendmsg_mempool)
+ sc->send_io.mem.cache);
+ if (!sc->send_io.mem.pool)
goto err;
- snprintf(name, sizeof(name), "smb_direct_resp_%p", t);
- t->recvmsg_cache = kmem_cache_create(name,
- sizeof(struct smb_direct_recvmsg) +
- t->max_recv_size,
+ snprintf(name, sizeof(name), "smbdirect_recv_io_pool_%p", sc);
+ sc->recv_io.mem.cache = kmem_cache_create(name,
+ sizeof(struct smbdirect_recv_io) +
+ sp->max_recv_size,
0, SLAB_HWCACHE_ALIGN, NULL);
- if (!t->recvmsg_cache)
+ if (!sc->recv_io.mem.cache)
goto err;
- t->recvmsg_mempool =
- mempool_create(t->recv_credit_max, mempool_alloc_slab,
- mempool_free_slab, t->recvmsg_cache);
- if (!t->recvmsg_mempool)
+ sc->recv_io.mem.pool =
+ mempool_create(sp->recv_credit_max, mempool_alloc_slab,
+ mempool_free_slab, sc->recv_io.mem.cache);
+ if (!sc->recv_io.mem.pool)
goto err;
- INIT_LIST_HEAD(&t->recvmsg_queue);
-
- for (i = 0; i < t->recv_credit_max; i++) {
- recvmsg = mempool_alloc(t->recvmsg_mempool, KSMBD_DEFAULT_GFP);
+ for (i = 0; i < sp->recv_credit_max; i++) {
+ recvmsg = mempool_alloc(sc->recv_io.mem.pool, KSMBD_DEFAULT_GFP);
if (!recvmsg)
goto err;
- recvmsg->transport = t;
+ recvmsg->socket = sc;
recvmsg->sge.length = 0;
- list_add(&recvmsg->list, &t->recvmsg_queue);
+ list_add(&recvmsg->list, &sc->recv_io.free.list);
}
- t->count_avail_recvmsg = t->recv_credit_max;
return 0;
err:
- smb_direct_destroy_pools(t);
+ smb_direct_destroy_pools(sc);
return -ENOMEM;
}
-static int smb_direct_create_qpair(struct smb_direct_transport *t,
+static int smb_direct_create_qpair(struct smbdirect_socket *sc,
struct ib_qp_cap *cap)
{
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
int ret;
struct ib_qp_init_attr qp_attr;
int pages_per_rw;
- t->pd = ib_alloc_pd(t->cm_id->device, 0);
- if (IS_ERR(t->pd)) {
+ sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0);
+ if (IS_ERR(sc->ib.pd)) {
pr_err("Can't create RDMA PD\n");
- ret = PTR_ERR(t->pd);
- t->pd = NULL;
+ ret = PTR_ERR(sc->ib.pd);
+ sc->ib.pd = NULL;
return ret;
}
- t->send_cq = ib_alloc_cq(t->cm_id->device, t,
- smb_direct_send_credit_target + cap->max_rdma_ctxs,
- 0, IB_POLL_WORKQUEUE);
- if (IS_ERR(t->send_cq)) {
+ sc->ib.send_cq = ib_alloc_cq_any(sc->ib.dev, sc,
+ sp->send_credit_target +
+ cap->max_rdma_ctxs,
+ IB_POLL_WORKQUEUE);
+ if (IS_ERR(sc->ib.send_cq)) {
pr_err("Can't create RDMA send CQ\n");
- ret = PTR_ERR(t->send_cq);
- t->send_cq = NULL;
+ ret = PTR_ERR(sc->ib.send_cq);
+ sc->ib.send_cq = NULL;
goto err;
}
- t->recv_cq = ib_alloc_cq(t->cm_id->device, t,
- t->recv_credit_max, 0, IB_POLL_WORKQUEUE);
- if (IS_ERR(t->recv_cq)) {
+ sc->ib.recv_cq = ib_alloc_cq_any(sc->ib.dev, sc,
+ sp->recv_credit_max,
+ IB_POLL_WORKQUEUE);
+ if (IS_ERR(sc->ib.recv_cq)) {
pr_err("Can't create RDMA recv CQ\n");
- ret = PTR_ERR(t->recv_cq);
- t->recv_cq = NULL;
+ ret = PTR_ERR(sc->ib.recv_cq);
+ sc->ib.recv_cq = NULL;
goto err;
}
memset(&qp_attr, 0, sizeof(qp_attr));
qp_attr.event_handler = smb_direct_qpair_handler;
- qp_attr.qp_context = t;
+ qp_attr.qp_context = sc;
qp_attr.cap = *cap;
qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
qp_attr.qp_type = IB_QPT_RC;
- qp_attr.send_cq = t->send_cq;
- qp_attr.recv_cq = t->recv_cq;
+ qp_attr.send_cq = sc->ib.send_cq;
+ qp_attr.recv_cq = sc->ib.recv_cq;
qp_attr.port_num = ~0;
- ret = rdma_create_qp(t->cm_id, t->pd, &qp_attr);
+ ret = rdma_create_qp(sc->rdma.cm_id, sc->ib.pd, &qp_attr);
if (ret) {
pr_err("Can't create RDMA QP: %d\n", ret);
goto err;
}
- t->qp = t->cm_id->qp;
- t->cm_id->event_handler = smb_direct_cm_handler;
+ sc->ib.qp = sc->rdma.cm_id->qp;
+ sc->rdma.cm_id->event_handler = smb_direct_cm_handler;
- pages_per_rw = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1;
- if (pages_per_rw > t->cm_id->device->attrs.max_sgl_rd) {
- ret = ib_mr_pool_init(t->qp, &t->qp->rdma_mrs,
- t->max_rw_credits, IB_MR_TYPE_MEM_REG,
- t->pages_per_rw_credit, 0);
+ pages_per_rw = DIV_ROUND_UP(sp->max_read_write_size, PAGE_SIZE) + 1;
+ if (pages_per_rw > sc->ib.dev->attrs.max_sgl_rd) {
+ ret = ib_mr_pool_init(sc->ib.qp, &sc->ib.qp->rdma_mrs,
+ sc->rw_io.credits.max, IB_MR_TYPE_MEM_REG,
+ sc->rw_io.credits.num_pages, 0);
if (ret) {
- pr_err("failed to init mr pool count %d pages %d\n",
- t->max_rw_credits, t->pages_per_rw_credit);
+ pr_err("failed to init mr pool count %zu pages %zu\n",
+ sc->rw_io.credits.max, sc->rw_io.credits.num_pages);
goto err;
}
}
return 0;
err:
- if (t->qp) {
- t->qp = NULL;
- rdma_destroy_qp(t->cm_id);
+ if (sc->ib.qp) {
+ sc->ib.qp = NULL;
+ rdma_destroy_qp(sc->rdma.cm_id);
}
- if (t->recv_cq) {
- ib_destroy_cq(t->recv_cq);
- t->recv_cq = NULL;
+ if (sc->ib.recv_cq) {
+ ib_destroy_cq(sc->ib.recv_cq);
+ sc->ib.recv_cq = NULL;
}
- if (t->send_cq) {
- ib_destroy_cq(t->send_cq);
- t->send_cq = NULL;
+ if (sc->ib.send_cq) {
+ ib_destroy_cq(sc->ib.send_cq);
+ sc->ib.send_cq = NULL;
}
- if (t->pd) {
- ib_dealloc_pd(t->pd);
- t->pd = NULL;
+ if (sc->ib.pd) {
+ ib_dealloc_pd(sc->ib.pd);
+ sc->ib.pd = NULL;
}
return ret;
}
static int smb_direct_prepare(struct ksmbd_transport *t)
{
- struct smb_direct_transport *st = smb_trans_direct_transfort(t);
- struct smb_direct_recvmsg *recvmsg;
- struct smb_direct_negotiate_req *req;
+ struct smb_direct_transport *st = SMBD_TRANS(t);
+ struct smbdirect_socket *sc = &st->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct smbdirect_recv_io *recvmsg;
+ struct smbdirect_negotiate_req *req;
+ unsigned long flags;
int ret;
+ /*
+ * We are waiting to pass the following states:
+ *
+ * SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED
+ * SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING
+ * SMBDIRECT_SOCKET_NEGOTIATE_NEEDED
+ *
+ * To finally get to SMBDIRECT_SOCKET_NEGOTIATE_RUNNING
+ * in order to continue below.
+ *
+ * Everything else is unexpected and an error.
+ */
ksmbd_debug(RDMA, "Waiting for SMB_DIRECT negotiate request\n");
- ret = wait_event_interruptible_timeout(st->wait_status,
- st->negotiation_requested ||
- st->status == SMB_DIRECT_CS_DISCONNECTED,
- SMB_DIRECT_NEGOTIATE_TIMEOUT * HZ);
- if (ret <= 0 || st->status == SMB_DIRECT_CS_DISCONNECTED)
+ ret = wait_event_interruptible_timeout(sc->status_wait,
+ sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED &&
+ sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING &&
+ sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED,
+ msecs_to_jiffies(sp->negotiate_timeout_msec));
+ if (ret <= 0 || sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING)
return ret < 0 ? ret : -ETIMEDOUT;
- recvmsg = get_first_reassembly(st);
+ recvmsg = get_first_reassembly(sc);
if (!recvmsg)
return -ECONNABORTED;
@@ -1955,51 +2152,54 @@ static int smb_direct_prepare(struct ksmbd_transport *t)
if (ret == -ECONNABORTED)
goto out;
- req = (struct smb_direct_negotiate_req *)recvmsg->packet;
- st->max_recv_size = min_t(int, st->max_recv_size,
+ req = (struct smbdirect_negotiate_req *)recvmsg->packet;
+ sp->max_recv_size = min_t(int, sp->max_recv_size,
le32_to_cpu(req->preferred_send_size));
- st->max_send_size = min_t(int, st->max_send_size,
+ sp->max_send_size = min_t(int, sp->max_send_size,
le32_to_cpu(req->max_receive_size));
- st->max_fragmented_send_size =
+ sp->max_fragmented_send_size =
le32_to_cpu(req->max_fragmented_size);
- st->max_fragmented_recv_size =
- (st->recv_credit_max * st->max_recv_size) / 2;
+ sp->max_fragmented_recv_size =
+ (sp->recv_credit_max * sp->max_recv_size) / 2;
+ sc->recv_io.credits.target = le16_to_cpu(req->credits_requested);
+ sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max);
+ sc->recv_io.credits.target = max_t(u16, sc->recv_io.credits.target, 1);
- ret = smb_direct_send_negotiate_response(st, ret);
+ ret = smb_direct_send_negotiate_response(sc, ret);
out:
- spin_lock_irq(&st->reassembly_queue_lock);
- st->reassembly_queue_length--;
+ spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
+ sc->recv_io.reassembly.queue_length--;
list_del(&recvmsg->list);
- spin_unlock_irq(&st->reassembly_queue_lock);
- put_recvmsg(st, recvmsg);
+ spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
+ put_recvmsg(sc, recvmsg);
return ret;
}
-static int smb_direct_connect(struct smb_direct_transport *st)
+static int smb_direct_connect(struct smbdirect_socket *sc)
{
- int ret;
struct ib_qp_cap qp_cap;
+ int ret;
- ret = smb_direct_init_params(st, &qp_cap);
+ ret = smb_direct_init_params(sc, &qp_cap);
if (ret) {
pr_err("Can't configure RDMA parameters\n");
return ret;
}
- ret = smb_direct_create_pools(st);
+ ret = smb_direct_create_pools(sc);
if (ret) {
pr_err("Can't init RDMA pool: %d\n", ret);
return ret;
}
- ret = smb_direct_create_qpair(st, &qp_cap);
+ ret = smb_direct_create_qpair(sc, &qp_cap);
if (ret) {
pr_err("Can't accept RDMA client: %d\n", ret);
return ret;
}
- ret = smb_direct_prepare_negotiation(st);
+ ret = smb_direct_prepare_negotiation(sc);
if (ret) {
pr_err("Can't negotiate: %d\n", ret);
return ret;
@@ -2016,10 +2216,15 @@ static bool rdma_frwr_is_supported(struct ib_device_attr *attrs)
return true;
}
-static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id)
+static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id,
+ struct rdma_cm_event *event)
{
struct smb_direct_transport *t;
+ struct smbdirect_socket *sc;
+ struct smbdirect_socket_parameters *sp;
struct task_struct *handler;
+ u8 peer_initiator_depth;
+ u8 peer_responder_resources;
int ret;
if (!rdma_frwr_is_supported(&new_cm_id->device->attrs)) {
@@ -2032,8 +2237,71 @@ static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id)
t = alloc_transport(new_cm_id);
if (!t)
return -ENOMEM;
+ sc = &t->socket;
+ sp = &sc->parameters;
+
+ peer_initiator_depth = event->param.conn.initiator_depth;
+ peer_responder_resources = event->param.conn.responder_resources;
+ if (rdma_protocol_iwarp(new_cm_id->device, new_cm_id->port_num) &&
+ event->param.conn.private_data_len == 8) {
+ /*
+ * Legacy clients with only iWarp MPA v1 support
+ * need a private blob in order to negotiate
+ * the IRD/ORD values.
+ */
+ const __be32 *ird_ord_hdr = event->param.conn.private_data;
+ u32 ird32 = be32_to_cpu(ird_ord_hdr[0]);
+ u32 ord32 = be32_to_cpu(ird_ord_hdr[1]);
+
+ /*
+ * cifs.ko sends the legacy IRD/ORD negotiation
+		 * even if iWarp MPA v2 was used.
+ *
+ * Here we check that the values match and only
+ * mark the client as legacy if they don't match.
+ */
+ if ((u32)event->param.conn.initiator_depth != ird32 ||
+ (u32)event->param.conn.responder_resources != ord32) {
+ /*
+ * There are broken clients (old cifs.ko)
+ * using little endian and also
+ * struct rdma_conn_param only uses u8
+ * for initiator_depth and responder_resources,
+ * so we truncate the value to U8_MAX.
+ *
+ * smb_direct_accept_client() will then
+ * do the real negotiation in order to
+ * select the minimum between client and
+ * server.
+ */
+ ird32 = min_t(u32, ird32, U8_MAX);
+ ord32 = min_t(u32, ord32, U8_MAX);
- ret = smb_direct_connect(t);
+ sc->rdma.legacy_iwarp = true;
+ peer_initiator_depth = (u8)ird32;
+ peer_responder_resources = (u8)ord32;
+ }
+ }
+
+ /*
+	 * First set what we as the server are able to support
+ */
+ sp->initiator_depth = min_t(u8, sp->initiator_depth,
+ new_cm_id->device->attrs.max_qp_rd_atom);
+
+ /*
+	 * negotiate the values by using the minimum
+ * between client and server if the client provided
+	 * non-zero values.
+ */
+ if (peer_initiator_depth != 0)
+ sp->initiator_depth = min_t(u8, sp->initiator_depth,
+ peer_initiator_depth);
+ if (peer_responder_resources != 0)
+ sp->responder_resources = min_t(u8, sp->responder_resources,
+ peer_responder_resources);
+
+ ret = smb_direct_connect(sc);
if (ret)
goto out_err;
@@ -2057,7 +2325,7 @@ static int smb_direct_listen_handler(struct rdma_cm_id *cm_id,
{
switch (event->event) {
case RDMA_CM_EVENT_CONNECT_REQUEST: {
- int ret = smb_direct_handle_connect_request(cm_id);
+ int ret = smb_direct_handle_connect_request(cm_id, event);
if (ret) {
pr_err("Can't create transport: %d\n", ret);
@@ -2177,7 +2445,8 @@ int ksmbd_rdma_init(void)
* for lack of credits
*/
smb_direct_wq = alloc_workqueue("ksmbd-smb_direct-wq",
- WQ_HIGHPRI | WQ_MEM_RECLAIM, 0);
+ WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_PERCPU,
+ 0);
if (!smb_direct_wq)
return -ENOMEM;
@@ -2194,7 +2463,7 @@ int ksmbd_rdma_init(void)
return 0;
}
-void ksmbd_rdma_destroy(void)
+void ksmbd_rdma_stop_listening(void)
{
if (!smb_direct_listener.cm_id)
return;
@@ -2203,7 +2472,10 @@ void ksmbd_rdma_destroy(void)
rdma_destroy_id(smb_direct_listener.cm_id);
smb_direct_listener.cm_id = NULL;
+}
+void ksmbd_rdma_destroy(void)
+{
if (smb_direct_wq) {
destroy_workqueue(smb_direct_wq);
smb_direct_wq = NULL;
diff --git a/fs/smb/server/transport_rdma.h b/fs/smb/server/transport_rdma.h
index 77aee4e5c9dc..3f93c6a9f7e4 100644
--- a/fs/smb/server/transport_rdma.h
+++ b/fs/smb/server/transport_rdma.h
@@ -11,59 +11,20 @@
#define SMBD_MIN_IOSIZE (512 * 1024)
#define SMBD_MAX_IOSIZE (16 * 1024 * 1024)
-/* SMB DIRECT negotiation request packet [MS-SMBD] 2.2.1 */
-struct smb_direct_negotiate_req {
- __le16 min_version;
- __le16 max_version;
- __le16 reserved;
- __le16 credits_requested;
- __le32 preferred_send_size;
- __le32 max_receive_size;
- __le32 max_fragmented_size;
-} __packed;
-
-/* SMB DIRECT negotiation response packet [MS-SMBD] 2.2.2 */
-struct smb_direct_negotiate_resp {
- __le16 min_version;
- __le16 max_version;
- __le16 negotiated_version;
- __le16 reserved;
- __le16 credits_requested;
- __le16 credits_granted;
- __le32 status;
- __le32 max_readwrite_size;
- __le32 preferred_send_size;
- __le32 max_receive_size;
- __le32 max_fragmented_size;
-} __packed;
-
-#define SMB_DIRECT_RESPONSE_REQUESTED 0x0001
-
-/* SMB DIRECT data transfer packet with payload [MS-SMBD] 2.2.3 */
-struct smb_direct_data_transfer {
- __le16 credits_requested;
- __le16 credits_granted;
- __le16 flags;
- __le16 reserved;
- __le32 remaining_data_length;
- __le32 data_offset;
- __le32 data_length;
- __le32 padding;
- __u8 buffer[];
-} __packed;
-
#ifdef CONFIG_SMB_SERVER_SMBDIRECT
int ksmbd_rdma_init(void);
+void ksmbd_rdma_stop_listening(void);
void ksmbd_rdma_destroy(void);
bool ksmbd_rdma_capable_netdev(struct net_device *netdev);
void init_smbd_max_io_size(unsigned int sz);
-unsigned int get_smbd_max_read_write_size(void);
+unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt);
#else
static inline int ksmbd_rdma_init(void) { return 0; }
-static inline int ksmbd_rdma_destroy(void) { return 0; }
+static inline void ksmbd_rdma_stop_listening(void) { }
+static inline void ksmbd_rdma_destroy(void) { }
static inline bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { return false; }
static inline void init_smbd_max_io_size(unsigned int sz) { }
-static inline unsigned int get_smbd_max_read_write_size(void) { return 0; }
+static inline unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt) { return 0; }
#endif
#endif /* __KSMBD_TRANSPORT_RDMA_H__ */
diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c
index b1df02e321b0..4337df97987d 100644
--- a/fs/smb/server/transport_tcp.c
+++ b/fs/smb/server/transport_tcp.c
@@ -85,7 +85,14 @@ static struct tcp_transport *alloc_transport(struct socket *client_sk)
return NULL;
}
+#if IS_ENABLED(CONFIG_IPV6)
+ if (client_sk->sk->sk_family == AF_INET6)
+ memcpy(&conn->inet6_addr, &client_sk->sk->sk_v6_daddr, 16);
+ else
+ conn->inet_addr = inet_sk(client_sk->sk)->inet_daddr;
+#else
conn->inet_addr = inet_sk(client_sk->sk)->inet_daddr;
+#endif
conn->transport = KSMBD_TRANS(t);
KSMBD_TRANS(t)->conn = conn;
KSMBD_TRANS(t)->ops = &ksmbd_tcp_transport_ops;
@@ -229,7 +236,6 @@ static int ksmbd_kthread_fn(void *p)
{
struct socket *client_sk = NULL;
struct interface *iface = (struct interface *)p;
- struct inet_sock *csk_inet;
struct ksmbd_conn *conn;
int ret;
@@ -252,13 +258,27 @@ static int ksmbd_kthread_fn(void *p)
/*
* Limits repeated connections from clients with the same IP.
*/
- csk_inet = inet_sk(client_sk->sk);
down_read(&conn_list_lock);
list_for_each_entry(conn, &conn_list, conns_list)
- if (csk_inet->inet_daddr == conn->inet_addr) {
+#if IS_ENABLED(CONFIG_IPV6)
+ if (client_sk->sk->sk_family == AF_INET6) {
+ if (memcmp(&client_sk->sk->sk_v6_daddr,
+ &conn->inet6_addr, 16) == 0) {
+ ret = -EAGAIN;
+ break;
+ }
+ } else if (inet_sk(client_sk->sk)->inet_daddr ==
+ conn->inet_addr) {
+ ret = -EAGAIN;
+ break;
+ }
+#else
+ if (inet_sk(client_sk->sk)->inet_daddr ==
+ conn->inet_addr) {
ret = -EAGAIN;
break;
}
+#endif
up_read(&conn_list_lock);
if (ret == -EAGAIN)
continue;
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 04539037108c..1cfa688904b2 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -196,7 +196,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode)
pr_err("File(%s): creation failed (err:%d)\n", name, err);
}
- done_path_create(&path, dentry);
+ end_creating_path(&path, dentry);
return err;
}
@@ -237,7 +237,7 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode)
if (!err && dentry != d)
ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(dentry));
- done_path_create(&path, dentry);
+ end_creating_path(&path, dentry);
if (err)
pr_err("mkdir(%s): creation failed (err:%d)\n", name, err);
return err;
@@ -669,7 +669,7 @@ int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname,
ksmbd_debug(VFS, "vfs_link failed err %d\n", err);
out3:
- done_path_create(&newpath, dentry);
+ end_creating_path(&newpath, dentry);
out2:
path_put(&oldpath);
out1:
@@ -770,10 +770,9 @@ retry:
goto out4;
}
- rd.old_mnt_idmap = mnt_idmap(old_path->mnt),
+ rd.mnt_idmap = mnt_idmap(old_path->mnt),
rd.old_parent = old_parent,
rd.old_dentry = old_child,
- rd.new_mnt_idmap = mnt_idmap(new_path.mnt),
rd.new_parent = new_path.dentry,
rd.new_dentry = new_dentry,
rd.flags = flags,
@@ -1326,7 +1325,7 @@ struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work,
if (!abs_name)
return ERR_PTR(-ENOMEM);
- dent = kern_path_create(AT_FDCWD, abs_name, path, flags);
+ dent = start_creating_path(AT_FDCWD, abs_name, path, flags);
kfree(abs_name);
return dent;
}
diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h
index 0708155b5caf..78b506c5ef03 100644
--- a/fs/smb/server/vfs_cache.h
+++ b/fs/smb/server/vfs_cache.h
@@ -112,6 +112,8 @@ struct ksmbd_file {
bool is_durable;
bool is_persistent;
bool is_resilient;
+
+ bool is_posix_ctxt;
};
static inline void set_ctx_actor(struct dir_context *ctx,
diff --git a/fs/splice.c b/fs/splice.c
index 4d6df083e0c0..f5094b6d00a0 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -739,6 +739,9 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
sd.pos = kiocb.ki_pos;
if (ret <= 0)
break;
+ WARN_ONCE(ret > sd.total_len - left,
+ "Splice Exceeded! ret=%zd tot=%zu left=%zu\n",
+ ret, sd.total_len, left);
sd.num_spliced += ret;
sd.total_len -= ret;
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 992ea0e37257..4465cf05603a 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -187,10 +187,15 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
unsigned short flags;
unsigned int fragments;
u64 lookup_table_start, xattr_id_table_start, next_table;
- int err;
+ int err, devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE);
TRACE("Entered squashfs_fill_superblock\n");
+ if (!devblksize) {
+ errorf(fc, "squashfs: unable to set blocksize\n");
+ return -EINVAL;
+ }
+
sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL);
if (sb->s_fs_info == NULL) {
ERROR("Failed to allocate squashfs_sb_info\n");
@@ -201,12 +206,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
msblk->panic_on_errors = (opts->errors == Opt_errors_panic);
- msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE);
- if (!msblk->devblksize) {
- errorf(fc, "squashfs: unable to set blocksize\n");
- return -EINVAL;
- }
-
+ msblk->devblksize = devblksize;
msblk->devblksize_log2 = ffz(~msblk->devblksize);
mutex_init(&msblk->meta_index_mutex);
diff --git a/fs/super.c b/fs/super.c
index 7f876f32343a..f4fa0e93c463 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1716,49 +1716,6 @@ int get_tree_bdev(struct fs_context *fc,
}
EXPORT_SYMBOL(get_tree_bdev);
-static int test_bdev_super(struct super_block *s, void *data)
-{
- return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data;
-}
-
-struct dentry *mount_bdev(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data,
- int (*fill_super)(struct super_block *, void *, int))
-{
- struct super_block *s;
- int error;
- dev_t dev;
-
- error = lookup_bdev(dev_name, &dev);
- if (error)
- return ERR_PTR(error);
-
- flags |= SB_NOSEC;
- s = sget(fs_type, test_bdev_super, set_bdev_super, flags, &dev);
- if (IS_ERR(s))
- return ERR_CAST(s);
-
- if (s->s_root) {
- if ((flags ^ s->s_flags) & SB_RDONLY) {
- deactivate_locked_super(s);
- return ERR_PTR(-EBUSY);
- }
- } else {
- error = setup_bdev_super(s, flags, NULL);
- if (!error)
- error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
- if (error) {
- deactivate_locked_super(s);
- return ERR_PTR(error);
- }
-
- s->s_flags |= SB_ACTIVE;
- }
-
- return dget(s->s_root);
-}
-EXPORT_SYMBOL(mount_bdev);
-
void kill_block_super(struct super_block *sb)
{
struct block_device *bdev = sb->s_bdev;
@@ -1773,26 +1730,6 @@ void kill_block_super(struct super_block *sb)
EXPORT_SYMBOL(kill_block_super);
#endif
-struct dentry *mount_nodev(struct file_system_type *fs_type,
- int flags, void *data,
- int (*fill_super)(struct super_block *, void *, int))
-{
- int error;
- struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL);
-
- if (IS_ERR(s))
- return ERR_CAST(s);
-
- error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
- if (error) {
- deactivate_locked_super(s);
- return ERR_PTR(error);
- }
- s->s_flags |= SB_ACTIVE;
- return dget(s->s_root);
-}
-EXPORT_SYMBOL(mount_nodev);
-
/**
* vfs_get_tree - Get the mountable root
* @fc: The superblock configuration context.
@@ -2314,17 +2251,20 @@ int sb_init_dio_done_wq(struct super_block *sb)
{
struct workqueue_struct *old;
struct workqueue_struct *wq = alloc_workqueue("dio/%s",
- WQ_MEM_RECLAIM, 0,
+ WQ_MEM_RECLAIM | WQ_PERCPU,
+ 0,
sb->s_id);
if (!wq)
return -ENOMEM;
+
+ old = NULL;
/*
* This has to be atomic as more DIOs can race to create the workqueue
*/
- old = cmpxchg(&sb->s_dio_done_wq, NULL, wq);
- /* Someone created workqueue before us? Free ours... */
- if (old)
+ if (!try_cmpxchg(&sb->s_dio_done_wq, &old, wq)) {
+ /* Someone created workqueue before us? Free ours... */
destroy_workqueue(wq);
+ }
return 0;
}
EXPORT_SYMBOL_GPL(sb_init_dio_done_wq);
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
index fb5ac358077b..0b14d004a095 100644
--- a/fs/ubifs/crypto.c
+++ b/fs/ubifs/crypto.c
@@ -88,6 +88,8 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn,
}
const struct fscrypt_operations ubifs_crypt_operations = {
+ .inode_info_offs = (int)offsetof(struct ubifs_inode, i_crypt_info) -
+ (int)offsetof(struct ubifs_inode, vfs_inode),
.legacy_key_prefix = "ubifs:",
.get_context = ubifs_crypt_get_context,
.set_context = ubifs_crypt_set_context,
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index f3e3b2068608..46952a33c4e6 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -335,7 +335,7 @@ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc)
static int ubifs_drop_inode(struct inode *inode)
{
- int drop = generic_drop_inode(inode);
+ int drop = inode_generic_drop(inode);
if (!drop)
drop = fscrypt_drop_inode(inode);
@@ -358,7 +358,7 @@ static void ubifs_evict_inode(struct inode *inode)
goto out;
dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode);
- ubifs_assert(c, !atomic_read(&inode->i_count));
+ ubifs_assert(c, !icount_read(inode));
truncate_inode_pages_final(&inode->i_data);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 5db45c9e26ee..49e50431741c 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -365,6 +365,7 @@ struct ubifs_gced_idx_leb {
* @read_in_a_row: number of consecutive pages read in a row (for bulk read)
* @data_len: length of the data attached to the inode
* @data: inode's data
+ * @i_crypt_info: inode's fscrypt information
*
* @ui_mutex exists for two main reasons. At first it prevents inodes from
* being written back while UBIFS changing them, being in the middle of an VFS
@@ -416,6 +417,9 @@ struct ubifs_inode {
pgoff_t read_in_a_row;
int data_len;
void *data;
+#ifdef CONFIG_FS_ENCRYPTION
+ struct fscrypt_inode_info *i_crypt_info;
+#endif
};
/**
diff --git a/fs/verity/enable.c b/fs/verity/enable.c
index 503268cf4296..95ec42b84797 100644
--- a/fs/verity/enable.c
+++ b/fs/verity/enable.c
@@ -19,8 +19,7 @@ struct block_buffer {
};
/* Hash a block, writing the result to the next level's pending block buffer. */
-static int hash_one_block(struct inode *inode,
- const struct merkle_tree_params *params,
+static int hash_one_block(const struct merkle_tree_params *params,
struct block_buffer *cur)
{
struct block_buffer *next = cur + 1;
@@ -36,8 +35,7 @@ static int hash_one_block(struct inode *inode,
/* Zero-pad the block if it's shorter than the block size. */
memset(&cur->data[cur->filled], 0, params->block_size - cur->filled);
- fsverity_hash_block(params, inode, cur->data,
- &next->data[next->filled]);
+ fsverity_hash_block(params, cur->data, &next->data[next->filled]);
next->filled += params->digest_size;
cur->filled = 0;
return 0;
@@ -123,7 +121,7 @@ static int build_merkle_tree(struct file *filp,
fsverity_err(inode, "Short read of file data");
goto out;
}
- err = hash_one_block(inode, params, &buffers[-1]);
+ err = hash_one_block(params, &buffers[-1]);
if (err)
goto out;
for (level = 0; level < num_levels; level++) {
@@ -134,7 +132,7 @@ static int build_merkle_tree(struct file *filp,
}
/* Next block at @level is full */
- err = hash_one_block(inode, params, &buffers[level]);
+ err = hash_one_block(params, &buffers[level]);
if (err)
goto out;
err = write_merkle_tree_block(inode,
@@ -154,7 +152,7 @@ static int build_merkle_tree(struct file *filp,
/* Finish all nonempty pending tree blocks. */
for (level = 0; level < num_levels; level++) {
if (buffers[level].filled != 0) {
- err = hash_one_block(inode, params, &buffers[level]);
+ err = hash_one_block(params, &buffers[level]);
if (err)
goto out;
err = write_merkle_tree_block(inode,
@@ -284,9 +282,9 @@ static int enable_verity(struct file *filp,
/* Successfully enabled verity */
/*
- * Readers can start using ->i_verity_info immediately, so it
- * can't be rolled back once set. So don't set it until just
- * after the filesystem has successfully enabled verity.
+ * Readers can start using the inode's verity info immediately,
+ * so it can't be rolled back once set. So don't set it until
+ * just after the filesystem has successfully enabled verity.
*/
fsverity_set_info(inode, vi);
}
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index 5fe854a5b9ad..dd20b138d452 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -63,10 +63,11 @@ struct merkle_tree_params {
* fsverity_info - cached verity metadata for an inode
*
* When a verity file is first opened, an instance of this struct is allocated
- * and stored in ->i_verity_info; it remains until the inode is evicted. It
- * caches information about the Merkle tree that's needed to efficiently verify
- * data read from the file. It also caches the file digest. The Merkle tree
- * pages themselves are not cached here, but the filesystem may cache them.
+ * and a pointer to it is stored in the file's in-memory inode. It remains
+ * until the inode is evicted. It caches information about the Merkle tree
+ * that's needed to efficiently verify data read from the file. It also caches
+ * the file digest. The Merkle tree pages themselves are not cached here, but
+ * the filesystem may cache them.
*/
struct fsverity_info {
struct merkle_tree_params tree_params;
@@ -89,7 +90,7 @@ union fsverity_hash_ctx *
fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
const u8 *salt, size_t salt_size);
void fsverity_hash_block(const struct merkle_tree_params *params,
- const struct inode *inode, const void *data, u8 *out);
+ const void *data, u8 *out);
void fsverity_hash_buffer(const struct fsverity_hash_alg *alg,
const void *data, size_t size, u8 *out);
void __init fsverity_check_hash_algs(void);
diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c
index 9bb3c6344907..de53e14c8aa7 100644
--- a/fs/verity/hash_algs.c
+++ b/fs/verity/hash_algs.c
@@ -94,7 +94,6 @@ fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
/**
* fsverity_hash_block() - hash a single data or hash block
* @params: the Merkle tree's parameters
- * @inode: inode for which the hashing is being done
* @data: virtual address of a buffer containing the block to hash
* @out: output digest, size 'params->digest_size' bytes
*
@@ -102,7 +101,7 @@ fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
* in the Merkle tree parameters.
*/
void fsverity_hash_block(const struct merkle_tree_params *params,
- const struct inode *inode, const void *data, u8 *out)
+ const void *data, u8 *out)
{
union fsverity_hash_ctx ctx;
diff --git a/fs/verity/open.c b/fs/verity/open.c
index c561e130cd0c..77b1c977af02 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -244,17 +244,17 @@ fail:
void fsverity_set_info(struct inode *inode, struct fsverity_info *vi)
{
/*
- * Multiple tasks may race to set ->i_verity_info, so use
- * cmpxchg_release(). This pairs with the smp_load_acquire() in
- * fsverity_get_info(). I.e., here we publish ->i_verity_info with a
- * RELEASE barrier so that other tasks can ACQUIRE it.
+ * Multiple tasks may race to set the inode's verity info pointer, so
+ * use cmpxchg_release(). This pairs with the smp_load_acquire() in
+ * fsverity_get_info(). I.e., publish the pointer with a RELEASE
+ * barrier so that other tasks can ACQUIRE it.
*/
- if (cmpxchg_release(&inode->i_verity_info, NULL, vi) != NULL) {
- /* Lost the race, so free the fsverity_info we allocated. */
+ if (cmpxchg_release(fsverity_info_addr(inode), NULL, vi) != NULL) {
+ /* Lost the race, so free the verity info we allocated. */
fsverity_free_info(vi);
/*
- * Afterwards, the caller may access ->i_verity_info directly,
- * so make sure to ACQUIRE the winning fsverity_info.
+ * Afterwards, the caller may access the inode's verity info
+ * directly, so make sure to ACQUIRE the winning verity info.
*/
(void)fsverity_get_info(inode);
}
@@ -350,7 +350,6 @@ int fsverity_get_descriptor(struct inode *inode,
return 0;
}
-/* Ensure the inode has an ->i_verity_info */
static int ensure_verity_info(struct inode *inode)
{
struct fsverity_info *vi = fsverity_get_info(inode);
@@ -395,8 +394,10 @@ EXPORT_SYMBOL_GPL(__fsverity_prepare_setattr);
void __fsverity_cleanup_inode(struct inode *inode)
{
- fsverity_free_info(inode->i_verity_info);
- inode->i_verity_info = NULL;
+ struct fsverity_info **vi_addr = fsverity_info_addr(inode);
+
+ fsverity_free_info(*vi_addr);
+ *vi_addr = NULL;
}
EXPORT_SYMBOL_GPL(__fsverity_cleanup_inode);
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index a1f00c3fd3b2..86067c8b40cf 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -10,6 +10,31 @@
#include <linux/bio.h>
#include <linux/export.h>
+#define FS_VERITY_MAX_PENDING_BLOCKS 2
+
+struct fsverity_pending_block {
+ const void *data;
+ u64 pos;
+ u8 real_hash[FS_VERITY_MAX_DIGEST_SIZE];
+};
+
+struct fsverity_verification_context {
+ struct inode *inode;
+ struct fsverity_info *vi;
+ unsigned long max_ra_pages;
+
+ /*
+ * This is the queue of data blocks that are pending verification. When
+ * the crypto layer supports interleaved hashing, we allow multiple
+ * blocks to be queued up in order to utilize it. This can improve
+ * performance significantly vs. sequential hashing of each block.
+ */
+ int num_pending;
+ int max_pending;
+ struct fsverity_pending_block
+ pending_blocks[FS_VERITY_MAX_PENDING_BLOCKS];
+};
+
static struct workqueue_struct *fsverity_read_workqueue;
/*
@@ -79,7 +104,7 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage,
}
/*
- * Verify a single data block against the file's Merkle tree.
+ * Verify the hash of a single data block against the file's Merkle tree.
*
* In principle, we need to verify the entire path to the root node. However,
* for efficiency the filesystem may cache the hash blocks. Therefore we need
@@ -88,10 +113,11 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage,
*
* Return: %true if the data block is valid, else %false.
*/
-static bool
-verify_data_block(struct inode *inode, struct fsverity_info *vi,
- const void *data, u64 data_pos, unsigned long max_ra_pages)
+static bool verify_data_block(struct inode *inode, struct fsverity_info *vi,
+ const struct fsverity_pending_block *dblock,
+ unsigned long max_ra_pages)
{
+ const u64 data_pos = dblock->pos;
const struct merkle_tree_params *params = &vi->tree_params;
const unsigned int hsize = params->digest_size;
int level;
@@ -115,8 +141,12 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
*/
u64 hidx = data_pos >> params->log_blocksize;
- /* Up to 1 + FS_VERITY_MAX_LEVELS pages may be mapped at once */
- BUILD_BUG_ON(1 + FS_VERITY_MAX_LEVELS > KM_MAX_IDX);
+ /*
+ * Up to FS_VERITY_MAX_PENDING_BLOCKS + FS_VERITY_MAX_LEVELS pages may
+ * be mapped at once.
+ */
+ static_assert(FS_VERITY_MAX_PENDING_BLOCKS + FS_VERITY_MAX_LEVELS <=
+ KM_MAX_IDX);
if (unlikely(data_pos >= inode->i_size)) {
/*
@@ -127,7 +157,7 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
* any part past EOF should be all zeroes. Therefore, we need
* to verify that any data blocks fully past EOF are all zeroes.
*/
- if (memchr_inv(data, 0, params->block_size)) {
+ if (memchr_inv(dblock->data, 0, params->block_size)) {
fsverity_err(inode,
"FILE CORRUPTED! Data past EOF is not zeroed");
return false;
@@ -202,7 +232,7 @@ descend:
unsigned long hblock_idx = hblocks[level - 1].index;
unsigned int hoffset = hblocks[level - 1].hoffset;
- fsverity_hash_block(params, inode, haddr, real_hash);
+ fsverity_hash_block(params, haddr, real_hash);
if (memcmp(want_hash, real_hash, hsize) != 0)
goto corrupted;
/*
@@ -220,18 +250,18 @@ descend:
put_page(hpage);
}
- /* Finally, verify the data block. */
- fsverity_hash_block(params, inode, data, real_hash);
- if (memcmp(want_hash, real_hash, hsize) != 0)
+ /* Finally, verify the hash of the data block. */
+ if (memcmp(want_hash, dblock->real_hash, hsize) != 0)
goto corrupted;
return true;
corrupted:
- fsverity_err(inode,
- "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN",
- data_pos, level - 1,
- params->hash_alg->name, hsize, want_hash,
- params->hash_alg->name, hsize, real_hash);
+ fsverity_err(
+ inode,
+ "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN",
+ data_pos, level - 1, params->hash_alg->name, hsize, want_hash,
+ params->hash_alg->name, hsize,
+ level == 0 ? dblock->real_hash : real_hash);
error:
for (; level > 0; level--) {
kunmap_local(hblocks[level - 1].addr);
@@ -240,13 +270,73 @@ error:
return false;
}
+static void
+fsverity_init_verification_context(struct fsverity_verification_context *ctx,
+ struct inode *inode,
+ unsigned long max_ra_pages)
+{
+ struct fsverity_info *vi = *fsverity_info_addr(inode);
+
+ ctx->inode = inode;
+ ctx->vi = vi;
+ ctx->max_ra_pages = max_ra_pages;
+ ctx->num_pending = 0;
+ if (vi->tree_params.hash_alg->algo_id == HASH_ALGO_SHA256 &&
+ sha256_finup_2x_is_optimized())
+ ctx->max_pending = 2;
+ else
+ ctx->max_pending = 1;
+}
+
+static void
+fsverity_clear_pending_blocks(struct fsverity_verification_context *ctx)
+{
+ int i;
+
+ for (i = ctx->num_pending - 1; i >= 0; i--) {
+ kunmap_local(ctx->pending_blocks[i].data);
+ ctx->pending_blocks[i].data = NULL;
+ }
+ ctx->num_pending = 0;
+}
+
static bool
-verify_data_blocks(struct folio *data_folio, size_t len, size_t offset,
- unsigned long max_ra_pages)
+fsverity_verify_pending_blocks(struct fsverity_verification_context *ctx)
{
- struct inode *inode = data_folio->mapping->host;
- struct fsverity_info *vi = inode->i_verity_info;
- const unsigned int block_size = vi->tree_params.block_size;
+ struct fsverity_info *vi = ctx->vi;
+ const struct merkle_tree_params *params = &vi->tree_params;
+ int i;
+
+ if (ctx->num_pending == 2) {
+ /* num_pending == 2 implies that the algorithm is SHA-256 */
+ sha256_finup_2x(params->hashstate ? &params->hashstate->sha256 :
+ NULL,
+ ctx->pending_blocks[0].data,
+ ctx->pending_blocks[1].data, params->block_size,
+ ctx->pending_blocks[0].real_hash,
+ ctx->pending_blocks[1].real_hash);
+ } else {
+ for (i = 0; i < ctx->num_pending; i++)
+ fsverity_hash_block(params, ctx->pending_blocks[i].data,
+ ctx->pending_blocks[i].real_hash);
+ }
+
+ for (i = 0; i < ctx->num_pending; i++) {
+ if (!verify_data_block(ctx->inode, vi, &ctx->pending_blocks[i],
+ ctx->max_ra_pages))
+ return false;
+ }
+ fsverity_clear_pending_blocks(ctx);
+ return true;
+}
+
+static bool fsverity_add_data_blocks(struct fsverity_verification_context *ctx,
+ struct folio *data_folio, size_t len,
+ size_t offset)
+{
+ struct fsverity_info *vi = ctx->vi;
+ const struct merkle_tree_params *params = &vi->tree_params;
+ const unsigned int block_size = params->block_size;
u64 pos = (u64)data_folio->index << PAGE_SHIFT;
if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offset, block_size)))
@@ -255,14 +345,11 @@ verify_data_blocks(struct folio *data_folio, size_t len, size_t offset,
folio_test_uptodate(data_folio)))
return false;
do {
- void *data;
- bool valid;
-
- data = kmap_local_folio(data_folio, offset);
- valid = verify_data_block(inode, vi, data, pos + offset,
- max_ra_pages);
- kunmap_local(data);
- if (!valid)
+ ctx->pending_blocks[ctx->num_pending].data =
+ kmap_local_folio(data_folio, offset);
+ ctx->pending_blocks[ctx->num_pending].pos = pos + offset;
+ if (++ctx->num_pending == ctx->max_pending &&
+ !fsverity_verify_pending_blocks(ctx))
return false;
offset += block_size;
len -= block_size;
@@ -284,7 +371,15 @@ verify_data_blocks(struct folio *data_folio, size_t len, size_t offset,
*/
bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset)
{
- return verify_data_blocks(folio, len, offset, 0);
+ struct fsverity_verification_context ctx;
+
+ fsverity_init_verification_context(&ctx, folio->mapping->host, 0);
+
+ if (fsverity_add_data_blocks(&ctx, folio, len, offset) &&
+ fsverity_verify_pending_blocks(&ctx))
+ return true;
+ fsverity_clear_pending_blocks(&ctx);
+ return false;
}
EXPORT_SYMBOL_GPL(fsverity_verify_blocks);
@@ -305,6 +400,8 @@ EXPORT_SYMBOL_GPL(fsverity_verify_blocks);
*/
void fsverity_verify_bio(struct bio *bio)
{
+ struct inode *inode = bio_first_folio_all(bio)->mapping->host;
+ struct fsverity_verification_context ctx;
struct folio_iter fi;
unsigned long max_ra_pages = 0;
@@ -321,13 +418,21 @@ void fsverity_verify_bio(struct bio *bio)
max_ra_pages = bio->bi_iter.bi_size >> (PAGE_SHIFT + 2);
}
+ fsverity_init_verification_context(&ctx, inode, max_ra_pages);
+
bio_for_each_folio_all(fi, bio) {
- if (!verify_data_blocks(fi.folio, fi.length, fi.offset,
- max_ra_pages)) {
- bio->bi_status = BLK_STS_IOERR;
- break;
- }
+ if (!fsverity_add_data_blocks(&ctx, fi.folio, fi.length,
+ fi.offset))
+ goto ioerr;
}
+
+ if (!fsverity_verify_pending_blocks(&ctx))
+ goto ioerr;
+ return;
+
+ioerr:
+ fsverity_clear_pending_blocks(&ctx);
+ bio->bi_status = BLK_STS_IOERR;
}
EXPORT_SYMBOL_GPL(fsverity_verify_bio);
#endif /* CONFIG_BLOCK */
@@ -355,7 +460,7 @@ void __init fsverity_init_workqueue(void)
* latency on ARM64.
*/
fsverity_read_workqueue = alloc_workqueue("fsverity_read_queue",
- WQ_HIGHPRI,
+ WQ_HIGHPRI | WQ_PERCPU,
num_online_cpus());
if (!fsverity_read_workqueue)
panic("failed to allocate fsverity_read_queue");
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index ae0ca6858496..8930d5254e1d 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -25,7 +25,7 @@ config XFS_FS
config XFS_SUPPORT_V4
bool "Support deprecated V4 (crc=0) format"
depends on XFS_FS
- default y
+ default n
help
The V4 filesystem format lacks certain features that are supported
by the V5 format, such as metadata checksumming, strengthened
@@ -40,7 +40,7 @@ config XFS_SUPPORT_V4
filesystem is a V4 filesystem. If no such string is found, please
upgrade xfsprogs to the latest version and try again.
- This option will become default N in September 2025. Support for the
+ This option became default N in September 2025. Support for the
V4 format will be removed entirely in September 2030. Distributors
can say N here to withdraw support earlier.
@@ -50,7 +50,7 @@ config XFS_SUPPORT_V4
config XFS_SUPPORT_ASCII_CI
bool "Support deprecated case-insensitive ascii (ascii-ci=1) format"
depends on XFS_FS
- default y
+ default n
help
The ASCII case insensitivity filesystem feature only works correctly
on systems that have been coerced into using ISO 8859-1, and it does
@@ -67,7 +67,7 @@ config XFS_SUPPORT_ASCII_CI
filesystem is a case-insensitive filesystem. If no such string is
found, please upgrade xfsprogs to the latest version and try again.
- This option will become default N in September 2025. Support for the
+ This option became default N in September 2025. Support for the
feature will be removed entirely in September 2030. Distributors
can say N here to withdraw support earlier.
@@ -105,6 +105,7 @@ config XFS_POSIX_ACL
config XFS_RT
bool "XFS Realtime subvolume support"
depends on XFS_FS
+ default BLK_DEV_ZONED
help
If you say Y here you will be able to mount and use XFS filesystems
which contain a realtime subvolume. The realtime subvolume is a
@@ -136,7 +137,7 @@ config XFS_BTREE_IN_MEM
config XFS_ONLINE_SCRUB
bool "XFS online metadata check support"
- default n
+ default y
depends on XFS_FS
depends on TMPFS && SHMEM
select XFS_LIVE_HOOKS
@@ -149,12 +150,8 @@ config XFS_ONLINE_SCRUB
advantage here is to look for problems proactively so that
they can be dealt with in a controlled manner.
- This feature is considered EXPERIMENTAL. Use with caution!
-
See the xfs_scrub man page in section 8 for additional information.
- If unsure, say N.
-
config XFS_ONLINE_SCRUB_STATS
bool "XFS online metadata check usage data collection"
default y
@@ -170,11 +167,9 @@ config XFS_ONLINE_SCRUB_STATS
Usage data are collected in /sys/kernel/debug/xfs/scrub.
- If unsure, say N.
-
config XFS_ONLINE_REPAIR
bool "XFS online metadata repair support"
- default n
+ default y
depends on XFS_FS && XFS_ONLINE_SCRUB
select XFS_BTREE_IN_MEM
help
@@ -185,12 +180,8 @@ config XFS_ONLINE_REPAIR
formatted with secondary metadata, such as reverse mappings and inode
parent pointers.
- This feature is considered EXPERIMENTAL. Use with caution!
-
See the xfs_scrub man page in section 8 for additional information.
- If unsure, say N.
-
config XFS_WARN
bool "XFS Verbose Warnings"
depends on XFS_FS && !XFS_DEBUG
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index fb79215a509d..8ac8230c3d3c 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -92,9 +92,8 @@ xfs_ag_resv_critical(
trace_xfs_ag_resv_critical(pag, type, avail);
/* Critically low if less than 10% or max btree height remains. */
- return XFS_TEST_ERROR(avail < orig / 10 ||
- avail < mp->m_agbtree_maxlevels,
- mp, XFS_ERRTAG_AG_RESV_CRITICAL);
+ return avail < orig / 10 || avail < mp->m_agbtree_maxlevels ||
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_AG_RESV_CRITICAL);
}
/*
@@ -203,7 +202,7 @@ __xfs_ag_resv_init(
return -EINVAL;
}
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL))
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_AG_RESV_FAIL))
error = -ENOSPC;
else
error = xfs_dec_fdblocks(mp, hidden_space, true);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 000cc7f4a3ce..ad381c73abc4 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -3321,7 +3321,7 @@ xfs_agf_read_verify(
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
fa = xfs_agf_verify(bp);
- if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
+ if (fa || XFS_TEST_ERROR(mp, XFS_ERRTAG_ALLOC_READ_AGF))
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
}
}
@@ -4019,8 +4019,7 @@ __xfs_free_extent(
ASSERT(len != 0);
ASSERT(type != XFS_AG_RESV_AGFL);
- if (XFS_TEST_ERROR(false, mp,
- XFS_ERRTAG_FREE_EXTENT))
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_FREE_EXTENT))
return -EIO;
error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index fddb55605e0c..91c1b30ebaab 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -667,12 +667,8 @@ xfs_attr_shortform_bytesfit(
/*
* For attr2 we can try to move the forkoff if there is space in the
- * literal area, but for the old format we are done if there is no
- * space in the fixed attribute fork.
+	 * literal area.
*/
- if (!xfs_has_attr2(mp))
- return 0;
-
dsize = dp->i_df.if_bytes;
switch (dp->i_df.if_format) {
@@ -723,22 +719,16 @@ xfs_attr_shortform_bytesfit(
}
/*
- * Switch on the ATTR2 superblock bit (implies also FEATURES2) unless:
- * - noattr2 mount option is set,
- * - on-disk version bit says it is already set, or
- * - the attr2 mount option is not set to enable automatic upgrade from attr1.
+ * Switch on the ATTR2 superblock bit (implies also FEATURES2) unless
+ * the on-disk version bit says it is already set.
*/
STATIC void
xfs_sbversion_add_attr2(
struct xfs_mount *mp,
struct xfs_trans *tp)
{
- if (xfs_has_noattr2(mp))
- return;
if (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)
return;
- if (!xfs_has_attr2(mp))
- return;
spin_lock(&mp->m_sb_lock);
xfs_add_attr2(mp);
@@ -889,7 +879,7 @@ xfs_attr_sf_removename(
/*
* Fix up the start offset of the attribute fork
*/
- if (totsize == sizeof(struct xfs_attr_sf_hdr) && xfs_has_attr2(mp) &&
+ if (totsize == sizeof(struct xfs_attr_sf_hdr) &&
(dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
!(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE)) &&
!xfs_has_parent(mp)) {
@@ -900,7 +890,6 @@ xfs_attr_sf_removename(
ASSERT(dp->i_forkoff);
ASSERT(totsize > sizeof(struct xfs_attr_sf_hdr) ||
(args->op_flags & XFS_DA_OP_ADDNAME) ||
- !xfs_has_attr2(mp) ||
dp->i_df.if_format == XFS_DINODE_FMT_BTREE ||
xfs_has_parent(mp));
xfs_trans_log_inode(args->trans, dp,
@@ -1040,8 +1029,7 @@ xfs_attr_shortform_allfit(
bytes += xfs_attr_sf_entsize_byname(name_loc->namelen,
be16_to_cpu(name_loc->valuelen));
}
- if (xfs_has_attr2(dp->i_mount) &&
- (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
+ if ((dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
(bytes == sizeof(struct xfs_attr_sf_hdr)))
return -1;
return xfs_attr_shortform_bytesfit(dp, bytes);
@@ -1161,7 +1149,6 @@ xfs_attr3_leaf_to_shortform(
* this case.
*/
if (!(args->op_flags & XFS_DA_OP_REPLACE)) {
- ASSERT(xfs_has_attr2(dp->i_mount));
ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE);
xfs_attr_fork_remove(dp, args->trans);
}
@@ -1225,7 +1212,7 @@ xfs_attr3_leaf_to_node(
trace_xfs_attr_leaf_to_node(args);
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) {
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) {
error = -EIO;
goto out;
}
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 4c44ce1c8a64..bff3dc226f81 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -435,6 +435,13 @@ xfs_attr_rmtval_get(
0, &bp, &xfs_attr3_rmt_buf_ops);
if (xfs_metadata_is_sick(error))
xfs_dirattr_mark_sick(args->dp, XFS_ATTR_FORK);
+ /*
+ * ENODATA from disk implies a disk medium failure;
+ * ENODATA for xattrs means attribute not found, so
+ * disambiguate that here.
+ */
+ if (error == -ENODATA)
+ error = -EIO;
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index d954f9b8071f..53ef4b7e504d 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -997,8 +997,7 @@ xfs_bmap_add_attrfork_local(
static int
xfs_bmap_set_attrforkoff(
struct xfs_inode *ip,
- int size,
- int *version)
+ int size)
{
int default_size = xfs_default_attroffset(ip) >> 3;
@@ -1012,8 +1011,6 @@ xfs_bmap_set_attrforkoff(
ip->i_forkoff = xfs_attr_shortform_bytesfit(ip, size);
if (!ip->i_forkoff)
ip->i_forkoff = default_size;
- else if (xfs_has_attr2(ip->i_mount) && version)
- *version = 2;
break;
default:
ASSERT(0);
@@ -1035,7 +1032,6 @@ xfs_bmap_add_attrfork(
int rsvd) /* xact may use reserved blks */
{
struct xfs_mount *mp = tp->t_mountp;
- int version = 1; /* superblock attr version */
int logflags; /* logging flags */
int error; /* error return value */
@@ -1045,7 +1041,7 @@ xfs_bmap_add_attrfork(
ASSERT(!xfs_inode_has_attr_fork(ip));
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- error = xfs_bmap_set_attrforkoff(ip, size, &version);
+ error = xfs_bmap_set_attrforkoff(ip, size);
if (error)
return error;
@@ -1069,16 +1065,12 @@ xfs_bmap_add_attrfork(
xfs_trans_log_inode(tp, ip, logflags);
if (error)
return error;
- if (!xfs_has_attr(mp) ||
- (!xfs_has_attr2(mp) && version == 2)) {
+ if (!xfs_has_attr(mp)) {
bool log_sb = false;
spin_lock(&mp->m_sb_lock);
if (!xfs_has_attr(mp)) {
xfs_add_attr(mp);
- log_sb = true;
- }
- if (!xfs_has_attr2(mp) && version == 2) {
xfs_add_attr2(mp);
log_sb = true;
}
@@ -3662,8 +3654,7 @@ xfs_bmap_btalloc(
/* Trim the allocation back to the maximum an AG can fit. */
args.maxlen = min(ap->length, mp->m_ag_max_usable);
- if (unlikely(XFS_TEST_ERROR(false, mp,
- XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT)))
+ if (unlikely(XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT)))
error = xfs_bmap_exact_minlen_extent_alloc(ap, &args);
else if ((ap->datatype & XFS_ALLOC_USERDATA) &&
xfs_inode_is_filestream(ap->ip))
@@ -3849,7 +3840,7 @@ xfs_bmapi_read(
}
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -4200,7 +4191,7 @@ xfs_bmapi_write(
(XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO));
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -4545,7 +4536,7 @@ xfs_bmapi_remap(
(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC));
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -5679,7 +5670,7 @@ xfs_bmap_collapse_extents(
int logflags = 0;
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -5795,7 +5786,7 @@ xfs_bmap_insert_extents(
int logflags = 0;
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -5900,7 +5891,7 @@ xfs_bmap_split_extent(
int i = 0;
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -6065,7 +6056,7 @@ xfs_bmap_finish_one(
trace_xfs_bmap_deferred(bi);
- if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE))
+ if (XFS_TEST_ERROR(tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE))
return -EIO;
switch (bi->bi_type) {
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index a61211d253f1..dbe9df8c3300 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -306,7 +306,7 @@ xfs_btree_check_block(
fa = __xfs_btree_check_block(cur, block, level, bp);
if (XFS_IS_CORRUPT(mp, fa != NULL) ||
- XFS_TEST_ERROR(false, mp, xfs_btree_block_errtag(cur))) {
+ XFS_TEST_ERROR(mp, xfs_btree_block_errtag(cur))) {
if (bp)
trace_xfs_btree_corrupt(bp, _RET_IP_);
xfs_btree_mark_sick(cur);
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 17d9e6154f19..90f7fc219fcc 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -565,7 +565,7 @@ xfs_da3_split(
trace_xfs_da_split(state->args);
- if (XFS_TEST_ERROR(false, state->mp, XFS_ERRTAG_DA_LEAF_SPLIT))
+ if (XFS_TEST_ERROR(state->mp, XFS_ERRTAG_DA_LEAF_SPLIT))
return -EIO;
/*
@@ -2833,6 +2833,12 @@ xfs_da_read_buf(
&bp, ops);
if (xfs_metadata_is_sick(error))
xfs_dirattr_mark_sick(dp, whichfork);
+ /*
+ * ENODATA from disk implies a disk medium failure; ENODATA for
+ * xattrs means attribute not found, so disambiguate that here.
+ */
+ if (error == -ENODATA && whichfork == XFS_ATTR_FORK)
+ error = -EIO;
if (error)
goto out_free;
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 1775abcfa04d..82a338458a51 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -223,7 +223,7 @@ xfs_dir_ino_validate(
bool ino_ok = xfs_verify_dir_ino(mp, ino);
if (XFS_IS_CORRUPT(mp, !ino_ok) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DIR_INO_VALIDATE)) {
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_DIR_INO_VALIDATE)) {
xfs_warn(mp, "Invalid inode number 0x%Lx",
(unsigned long long) ino);
return -EFSCORRUPTED;
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index a53c5d40e084..de840abc0bcd 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -4,14 +4,22 @@
* Copyright (C) 2017 Oracle.
* All Rights Reserved.
*/
-#ifndef __XFS_ERRORTAG_H_
+#if !defined(__XFS_ERRORTAG_H_) || defined(XFS_ERRTAG)
#define __XFS_ERRORTAG_H_
/*
- * error injection tags - the labels can be anything you want
- * but each tag should have its own unique number
+ * There are two ways to use this header file. The first way is to #include it
+ * bare, which will define all the XFS_ERRTAG_* error injection knobs for use
+ * with the XFS_TEST_ERROR macro. The second way is to enclose the #include
+ * with a #define for an XFS_ERRTAG macro, in which case the header will define
+ " an XFS_ERRTAGS macro that expands to invoke that XFS_ERRTAG macro for each
+ * defined error injection knob.
*/
+/*
+ * These are the actual error injection tags. The numbers should be consecutive
+ * because arrays are sized based on the maximum.
+ */
#define XFS_ERRTAG_NOERROR 0
#define XFS_ERRTAG_IFLUSH_1 1
#define XFS_ERRTAG_IFLUSH_2 2
@@ -71,49 +79,61 @@
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
*/
#define XFS_RANDOM_DEFAULT 100
-#define XFS_RANDOM_IFLUSH_1 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IFLUSH_2 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IFLUSH_3 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IFLUSH_4 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IFLUSH_5 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IFLUSH_6 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_DA_READ_BUF XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_BTREE_CHECK_LBLOCK (XFS_RANDOM_DEFAULT/4)
-#define XFS_RANDOM_BTREE_CHECK_SBLOCK XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_ALLOC_READ_AGF XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IALLOC_READ_AGI XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_ITOBP_INOTOBP XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IUNLINK XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IUNLINK_REMOVE XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_DIR_INO_VALIDATE XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_BULKSTAT_READ_CHUNK XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IODONE_IOERR (XFS_RANDOM_DEFAULT/10)
-#define XFS_RANDOM_STRATREAD_IOERR (XFS_RANDOM_DEFAULT/10)
-#define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10)
-#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10)
-#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_FREE_EXTENT 1
-#define XFS_RANDOM_RMAP_FINISH_ONE 1
-#define XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE 1
-#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1
-#define XFS_RANDOM_BMAP_FINISH_ONE 1
-#define XFS_RANDOM_AG_RESV_CRITICAL 4
-#define XFS_RANDOM_LOG_BAD_CRC 1
-#define XFS_RANDOM_LOG_ITEM_PIN 1
-#define XFS_RANDOM_BUF_LRU_REF 2
-#define XFS_RANDOM_FORCE_SCRUB_REPAIR 1
-#define XFS_RANDOM_FORCE_SUMMARY_RECALC 1
-#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10)
-#define XFS_RANDOM_BUF_IOERROR XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1
-#define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1
-#define XFS_RANDOM_AG_RESV_FAIL 1
-#define XFS_RANDOM_LARP 1
-#define XFS_RANDOM_DA_LEAF_SPLIT 1
-#define XFS_RANDOM_ATTR_LEAF_TO_NODE 1
-#define XFS_RANDOM_WB_DELAY_MS 3000
-#define XFS_RANDOM_WRITE_DELAY_MS 3000
-#define XFS_RANDOM_EXCHMAPS_FINISH_ONE 1
-#define XFS_RANDOM_METAFILE_RESV_CRITICAL 4
+
+/*
+ * Table of error injection knobs. The parameters to the XFS_ERRTAG macro are:
+ * 1. The XFS_ERRTAG_ flag but without the prefix;
+ * 2. The name of the sysfs knob; and
+ * 3. The default value for the knob.
+ */
+#ifdef XFS_ERRTAG
+# undef XFS_ERRTAGS
+# define XFS_ERRTAGS \
+XFS_ERRTAG(NOERROR, noerror, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IFLUSH_1, iflush1, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IFLUSH_2, iflush2, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IFLUSH_3, iflush3, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IFLUSH_4, iflush4, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IFLUSH_5, iflush5, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IFLUSH_6, iflush6, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(DA_READ_BUF, dareadbuf, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(BTREE_CHECK_LBLOCK, btree_chk_lblk, XFS_RANDOM_DEFAULT/4) \
+XFS_ERRTAG(BTREE_CHECK_SBLOCK, btree_chk_sblk, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(ALLOC_READ_AGF, readagf, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IALLOC_READ_AGI, readagi, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(ITOBP_INOTOBP, itobp, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IUNLINK, iunlink, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IUNLINK_REMOVE, iunlinkrm, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(DIR_INO_VALIDATE, dirinovalid, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(BULKSTAT_READ_CHUNK, bulkstat, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IODONE_IOERR, logiodone, XFS_RANDOM_DEFAULT/10) \
+XFS_ERRTAG(STRATREAD_IOERR, stratread, XFS_RANDOM_DEFAULT/10) \
+XFS_ERRTAG(STRATCMPL_IOERR, stratcmpl, XFS_RANDOM_DEFAULT/10) \
+XFS_ERRTAG(DIOWRITE_IOERR, diowrite, XFS_RANDOM_DEFAULT/10) \
+XFS_ERRTAG(BMAPIFORMAT, bmapifmt, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(FREE_EXTENT, free_extent, 1) \
+XFS_ERRTAG(RMAP_FINISH_ONE, rmap_finish_one, 1) \
+XFS_ERRTAG(REFCOUNT_CONTINUE_UPDATE, refcount_continue_update, 1) \
+XFS_ERRTAG(REFCOUNT_FINISH_ONE, refcount_finish_one, 1) \
+XFS_ERRTAG(BMAP_FINISH_ONE, bmap_finish_one, 1) \
+XFS_ERRTAG(AG_RESV_CRITICAL, ag_resv_critical, 4) \
+XFS_ERRTAG(LOG_BAD_CRC, log_bad_crc, 1) \
+XFS_ERRTAG(LOG_ITEM_PIN, log_item_pin, 1) \
+XFS_ERRTAG(BUF_LRU_REF, buf_lru_ref, 2) \
+XFS_ERRTAG(FORCE_SCRUB_REPAIR, force_repair, 1) \
+XFS_ERRTAG(FORCE_SUMMARY_RECALC, bad_summary, 1) \
+XFS_ERRTAG(IUNLINK_FALLBACK, iunlink_fallback, XFS_RANDOM_DEFAULT/10) \
+XFS_ERRTAG(BUF_IOERROR, buf_ioerror, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(REDUCE_MAX_IEXTENTS, reduce_max_iextents, 1) \
+XFS_ERRTAG(BMAP_ALLOC_MINLEN_EXTENT, bmap_alloc_minlen_extent, 1) \
+XFS_ERRTAG(AG_RESV_FAIL, ag_resv_fail, 1) \
+XFS_ERRTAG(LARP, larp, 1) \
+XFS_ERRTAG(DA_LEAF_SPLIT, da_leaf_split, 1) \
+XFS_ERRTAG(ATTR_LEAF_TO_NODE, attr_leaf_to_node, 1) \
+XFS_ERRTAG(WB_DELAY_MS, wb_delay_ms, 3000) \
+XFS_ERRTAG(WRITE_DELAY_MS, write_delay_ms, 3000) \
+XFS_ERRTAG(EXCHMAPS_FINISH_ONE, exchmaps_finish_one, 1) \
+XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4)
+#endif /* XFS_ERRTAG */
#endif /* __XFS_ERRORTAG_H_ */
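The usage described in the header comment above follows the classic X-macro pattern. As a minimal, hypothetical consumer sketch (not part of this patch): define XFS_ERRTAG before including the header, then let XFS_ERRTAGS expand it once per knob, for example to build a table of default injection frequencies indexed by tag number.

	/* Hypothetical consumer sketch; xfs_errortag_random_default is an assumed name. */
	#define XFS_ERRTAG(tag, name, dflt)	[XFS_ERRTAG_##tag] = (dflt),
	#include "xfs_errortag.h"
	static const unsigned int xfs_errortag_random_default[] = {
		XFS_ERRTAGS
	};
	#undef XFS_ERRTAG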
diff --git a/fs/xfs/libxfs/xfs_exchmaps.c b/fs/xfs/libxfs/xfs_exchmaps.c
index 3f1d6a98c118..932ee4619e9e 100644
--- a/fs/xfs/libxfs/xfs_exchmaps.c
+++ b/fs/xfs/libxfs/xfs_exchmaps.c
@@ -616,7 +616,7 @@ xfs_exchmaps_finish_one(
return error;
}
- if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE))
+ if (XFS_TEST_ERROR(tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE))
return -EIO;
/* If we still have work to do, ask for a new transaction. */
@@ -882,7 +882,7 @@ xmi_ensure_delta_nextents(
&new_nextents))
return -EFBIG;
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) &&
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) &&
new_nextents > 10)
return -EFBIG;
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 750111634d9f..d97295eaebe6 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2140,7 +2140,7 @@ xfs_difree_inobt(
* remove the chunk if the block size is large enough for multiple inode
* chunks (that might not be free).
*/
- if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
+ if (rec.ir_free == XFS_INOBT_ALL_FREE &&
mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
xic->deleted = true;
xic->first_ino = xfs_agino_to_ino(pag, rec.ir_startino);
@@ -2286,7 +2286,7 @@ xfs_difree_finobt(
* enough for multiple chunks. Leave the finobt record to remain in sync
* with the inobt.
*/
- if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
+ if (rec.ir_free == XFS_INOBT_ALL_FREE &&
mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
error = xfs_btree_delete(cur, &i);
if (error)
@@ -2706,7 +2706,7 @@ xfs_agi_read_verify(
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
fa = xfs_agi_verify(bp);
- if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_IALLOC_READ_AGI))
+ if (fa || XFS_TEST_ERROR(mp, XFS_ERRTAG_IALLOC_READ_AGI))
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
}
}
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index aa13fc00afd7..b1812b2c3cce 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -61,8 +61,8 @@ xfs_inode_buf_verify(
di_ok = xfs_verify_magic16(bp, dip->di_magic) &&
xfs_dinode_good_version(mp, dip->di_version) &&
xfs_verify_agino_or_null(bp->b_pag, unlinked_ino);
- if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
- XFS_ERRTAG_ITOBP_INOTOBP))) {
+ if (unlikely(!di_ok ||
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_ITOBP_INOTOBP))) {
if (readahead) {
bp->b_flags &= ~XBF_DONE;
xfs_buf_ioerror(bp, -EIO);
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 4f99b90add55..1772d82f2d68 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -756,8 +756,7 @@ xfs_iext_count_extend(
if (nr_exts < ifp->if_nextents)
return -EFBIG;
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) &&
- nr_exts > 10)
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && nr_exts > 10)
return -EFBIG;
if (nr_exts > xfs_iext_max_nextents(has_large, whichfork)) {
diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c
index 48fe49a5f050..309ce6dd5553 100644
--- a/fs/xfs/libxfs/xfs_inode_util.c
+++ b/fs/xfs/libxfs/xfs_inode_util.c
@@ -299,17 +299,6 @@ xfs_inode_init(
} else {
inode_init_owner(args->idmap, inode, dir, args->mode);
}
-
- /*
- * If the group ID of the new file does not match the effective
- * group ID or one of the supplementary group IDs, the S_ISGID
- * bit is cleared (and only if the irix_sgid_inherit
- * compatibility variable is set).
- */
- if (irix_sgid_inherit && (inode->i_mode & S_ISGID) &&
- !vfsgid_in_group_p(i_gid_into_vfsgid(args->idmap, inode)))
- inode->i_mode &= ~S_ISGID;
-
ip->i_projid = xfs_get_initial_prid(pip);
}
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 0d637c276db0..6c50cb2ece19 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -86,43 +86,6 @@ struct xfs_unmount_log_format {
uint32_t pad2; /* may as well make it 64 bits */
};
-/* Region types for iovec's i_type */
-#define XLOG_REG_TYPE_BFORMAT 1
-#define XLOG_REG_TYPE_BCHUNK 2
-#define XLOG_REG_TYPE_EFI_FORMAT 3
-#define XLOG_REG_TYPE_EFD_FORMAT 4
-#define XLOG_REG_TYPE_IFORMAT 5
-#define XLOG_REG_TYPE_ICORE 6
-#define XLOG_REG_TYPE_IEXT 7
-#define XLOG_REG_TYPE_IBROOT 8
-#define XLOG_REG_TYPE_ILOCAL 9
-#define XLOG_REG_TYPE_IATTR_EXT 10
-#define XLOG_REG_TYPE_IATTR_BROOT 11
-#define XLOG_REG_TYPE_IATTR_LOCAL 12
-#define XLOG_REG_TYPE_QFORMAT 13
-#define XLOG_REG_TYPE_DQUOT 14
-#define XLOG_REG_TYPE_QUOTAOFF 15
-#define XLOG_REG_TYPE_LRHEADER 16
-#define XLOG_REG_TYPE_UNMOUNT 17
-#define XLOG_REG_TYPE_COMMIT 18
-#define XLOG_REG_TYPE_TRANSHDR 19
-#define XLOG_REG_TYPE_ICREATE 20
-#define XLOG_REG_TYPE_RUI_FORMAT 21
-#define XLOG_REG_TYPE_RUD_FORMAT 22
-#define XLOG_REG_TYPE_CUI_FORMAT 23
-#define XLOG_REG_TYPE_CUD_FORMAT 24
-#define XLOG_REG_TYPE_BUI_FORMAT 25
-#define XLOG_REG_TYPE_BUD_FORMAT 26
-#define XLOG_REG_TYPE_ATTRI_FORMAT 27
-#define XLOG_REG_TYPE_ATTRD_FORMAT 28
-#define XLOG_REG_TYPE_ATTR_NAME 29
-#define XLOG_REG_TYPE_ATTR_VALUE 30
-#define XLOG_REG_TYPE_XMI_FORMAT 31
-#define XLOG_REG_TYPE_XMD_FORMAT 32
-#define XLOG_REG_TYPE_ATTR_NEWNAME 33
-#define XLOG_REG_TYPE_ATTR_NEWVALUE 34
-#define XLOG_REG_TYPE_MAX 34
-
/*
* Flags to log operation header
*
@@ -141,14 +104,13 @@ struct xfs_unmount_log_format {
#define XLOG_END_TRANS 0x10 /* End a continued transaction */
#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */
-
-typedef struct xlog_op_header {
+struct xlog_op_header {
__be32 oh_tid; /* transaction id of operation : 4 b */
__be32 oh_len; /* bytes in data region : 4 b */
__u8 oh_clientid; /* who sent me this : 1 b */
__u8 oh_flags; /* : 1 b */
__u16 oh_res2; /* 32 bit align : 2 b */
-} xlog_op_header_t;
+};
/* valid values for h_fmt */
#define XLOG_FMT_UNKNOWN 0
@@ -174,12 +136,40 @@ typedef struct xlog_rec_header {
__be32 h_prev_block; /* block number to previous LR : 4 */
__be32 h_num_logops; /* number of log operations in this LR : 4 */
__be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
- /* new fields */
+
+ /* fields added by the Linux port: */
__be32 h_fmt; /* format of log record : 4 */
uuid_t h_fs_uuid; /* uuid of FS : 16 */
+
+ /* fields added for log v2: */
__be32 h_size; /* iclog size : 4 */
+
+ /*
+	 * When h_size was added for log v2 support, it caused the structure
+	 * to have a different size on i386 vs all other architectures,
+	 * because the sum of the member sizes is not padded out to the
+	 * alignment of the largest __be64-sized member, and i386 has really
+	 * odd struct alignment rules.
+	 *
+	 * Due to the way the log headers are laid out on disk, that alone is
+	 * not a problem, because the xlog_rec_header always sits alone in a
+	 * BBSIZE-sized area and the rest of that area is padded with zeroes.
+	 * But xlog_cksum used to calculate the checksum based on the
+	 * structure size, and thus gives different checksums for i386 vs the
+	 * rest.  We now do two checksum validation passes for both sizes to
+	 * allow moving v5 file systems with unclean logs between i386 and
+	 * other (little-endian) architectures.
+ */
+ __u32 h_pad0;
} xlog_rec_header_t;
+#ifdef __i386__
+#define XLOG_REC_SIZE offsetofend(struct xlog_rec_header, h_size)
+#define XLOG_REC_SIZE_OTHER sizeof(struct xlog_rec_header)
+#else
+#define XLOG_REC_SIZE sizeof(struct xlog_rec_header)
+#define XLOG_REC_SIZE_OTHER offsetofend(struct xlog_rec_header, h_size)
+#endif /* __i386__ */
+
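The double-checksum strategy described in the comment above can be sketched as follows; this is a hedged illustration with a hypothetical compute_crc() stand-in, not the patch's actual recovery code.

	/* Sketch only: compute_crc() is an assumed helper, not a real kernel API. */
	static bool xlog_rec_crc_matches(const struct xlog_rec_header *rhead,
					 __le32 want)
	{
		/* First pass: the header size this architecture writes. */
		if (compute_crc(rhead, XLOG_REC_SIZE) == want)
			return true;
		/* Second pass: the size the other architecture family writes. */
		return compute_crc(rhead, XLOG_REC_SIZE_OTHER) == want;
	}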
typedef struct xlog_rec_ext_header {
__be32 xh_cycle; /* write cycle of log : 4 */
__be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */
@@ -195,12 +185,11 @@ typedef union xlog_in_core2 {
} xlog_in_core_2_t;
/* not an on-disk structure, but needed by log recovery in userspace */
-typedef struct xfs_log_iovec {
+struct xfs_log_iovec {
void *i_addr; /* beginning address of region */
int i_len; /* length in bytes of region */
uint i_type; /* type of region */
-} xfs_log_iovec_t;
-
+};
/*
* Transaction Header definitions.
@@ -213,12 +202,12 @@ typedef struct xfs_log_iovec {
* Do not change the below structure without redoing the code in
* xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans().
*/
-typedef struct xfs_trans_header {
+struct xfs_trans_header {
uint th_magic; /* magic number */
uint th_type; /* transaction type */
int32_t th_tid; /* transaction id (unused) */
uint th_num_items; /* num items logged by trans */
-} xfs_trans_header_t;
+};
#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */
@@ -542,7 +531,7 @@ struct xfs_log_dinode {
#define __XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD)
#define XFS_BLF_DATAMAP_SIZE (__XFS_BLF_DATAMAP_SIZE + 1)
-typedef struct xfs_buf_log_format {
+struct xfs_buf_log_format {
unsigned short blf_type; /* buf log item type indicator */
unsigned short blf_size; /* size of this item */
unsigned short blf_flags; /* misc state */
@@ -550,7 +539,7 @@ typedef struct xfs_buf_log_format {
int64_t blf_blkno; /* starting blkno of this buf */
unsigned int blf_map_size; /* used size of data bitmap in words */
unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */
-} xfs_buf_log_format_t;
+};
/*
* All buffers now need to tell recovery where the magic number
@@ -606,40 +595,41 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf)
/*
* EFI/EFD log format definitions
*/
-typedef struct xfs_extent {
+struct xfs_extent {
xfs_fsblock_t ext_start;
xfs_extlen_t ext_len;
-} xfs_extent_t;
+};
/*
- * Since an xfs_extent_t has types (start:64, len: 32)
- * there are different alignments on 32 bit and 64 bit kernels.
- * So we provide the different variants for use by a
- * conversion routine.
+ * Since the members of struct xfs_extent add up to 96 bits, the structure
+ * has different alignments on i386 vs all other architectures, because
+ * i386 does not pad structures to their natural alignment.
+ *
+ * Provide the different variants for use by a conversion routine.
*/
-typedef struct xfs_extent_32 {
+struct xfs_extent_32 {
uint64_t ext_start;
uint32_t ext_len;
-} __attribute__((packed)) xfs_extent_32_t;
+} __attribute__((packed));
-typedef struct xfs_extent_64 {
+struct xfs_extent_64 {
uint64_t ext_start;
uint32_t ext_len;
uint32_t ext_pad;
-} xfs_extent_64_t;
+};
/*
* This is the structure used to lay out an efi log item in the
* log. The efi_extents field is a variable size array whose
* size is given by efi_nextents.
*/
-typedef struct xfs_efi_log_format {
+struct xfs_efi_log_format {
uint16_t efi_type; /* efi log item type */
uint16_t efi_size; /* size of this item */
uint32_t efi_nextents; /* # extents to free */
uint64_t efi_id; /* efi identifier */
- xfs_extent_t efi_extents[]; /* array of extents to free */
-} xfs_efi_log_format_t;
+ struct xfs_extent efi_extents[]; /* array of extents to free */
+};
static inline size_t
xfs_efi_log_format_sizeof(
@@ -649,13 +639,13 @@ xfs_efi_log_format_sizeof(
nr * sizeof(struct xfs_extent);
}
-typedef struct xfs_efi_log_format_32 {
+struct xfs_efi_log_format_32 {
uint16_t efi_type; /* efi log item type */
uint16_t efi_size; /* size of this item */
uint32_t efi_nextents; /* # extents to free */
uint64_t efi_id; /* efi identifier */
- xfs_extent_32_t efi_extents[]; /* array of extents to free */
-} __attribute__((packed)) xfs_efi_log_format_32_t;
+ struct xfs_extent_32 efi_extents[]; /* array of extents to free */
+} __attribute__((packed));
static inline size_t
xfs_efi_log_format32_sizeof(
@@ -665,13 +655,13 @@ xfs_efi_log_format32_sizeof(
nr * sizeof(struct xfs_extent_32);
}
-typedef struct xfs_efi_log_format_64 {
+struct xfs_efi_log_format_64 {
uint16_t efi_type; /* efi log item type */
uint16_t efi_size; /* size of this item */
uint32_t efi_nextents; /* # extents to free */
uint64_t efi_id; /* efi identifier */
- xfs_extent_64_t efi_extents[]; /* array of extents to free */
-} xfs_efi_log_format_64_t;
+ struct xfs_extent_64 efi_extents[]; /* array of extents to free */
+};
static inline size_t
xfs_efi_log_format64_sizeof(
@@ -686,13 +676,13 @@ xfs_efi_log_format64_sizeof(
* log. The efd_extents array is a variable size array whose
* size is given by efd_nextents;
*/
-typedef struct xfs_efd_log_format {
+struct xfs_efd_log_format {
uint16_t efd_type; /* efd log item type */
uint16_t efd_size; /* size of this item */
uint32_t efd_nextents; /* # of extents freed */
uint64_t efd_efi_id; /* id of corresponding efi */
- xfs_extent_t efd_extents[]; /* array of extents freed */
-} xfs_efd_log_format_t;
+ struct xfs_extent efd_extents[]; /* array of extents freed */
+};
static inline size_t
xfs_efd_log_format_sizeof(
@@ -702,13 +692,13 @@ xfs_efd_log_format_sizeof(
nr * sizeof(struct xfs_extent);
}
-typedef struct xfs_efd_log_format_32 {
+struct xfs_efd_log_format_32 {
uint16_t efd_type; /* efd log item type */
uint16_t efd_size; /* size of this item */
uint32_t efd_nextents; /* # of extents freed */
uint64_t efd_efi_id; /* id of corresponding efi */
- xfs_extent_32_t efd_extents[]; /* array of extents freed */
-} __attribute__((packed)) xfs_efd_log_format_32_t;
+ struct xfs_extent_32 efd_extents[]; /* array of extents freed */
+} __attribute__((packed));
static inline size_t
xfs_efd_log_format32_sizeof(
@@ -718,13 +708,13 @@ xfs_efd_log_format32_sizeof(
nr * sizeof(struct xfs_extent_32);
}
-typedef struct xfs_efd_log_format_64 {
+struct xfs_efd_log_format_64 {
uint16_t efd_type; /* efd log item type */
uint16_t efd_size; /* size of this item */
uint32_t efd_nextents; /* # of extents freed */
uint64_t efd_efi_id; /* id of corresponding efi */
- xfs_extent_64_t efd_extents[]; /* array of extents freed */
-} xfs_efd_log_format_64_t;
+ struct xfs_extent_64 efd_extents[]; /* array of extents freed */
+};
static inline size_t
xfs_efd_log_format64_sizeof(
@@ -957,14 +947,14 @@ struct xfs_xmd_log_format {
* The first two fields must be the type and size fitting into
* 32 bits : log_recovery code assumes that.
*/
-typedef struct xfs_dq_logformat {
+struct xfs_dq_logformat {
uint16_t qlf_type; /* dquot log item type */
uint16_t qlf_size; /* size of this item */
xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */
int64_t qlf_blkno; /* blkno of dquot buffer */
int32_t qlf_len; /* len of dquot buffer */
uint32_t qlf_boffset; /* off of dquot in buffer */
-} xfs_dq_logformat_t;
+};
/*
* log format struct for QUOTAOFF records.
@@ -974,12 +964,12 @@ typedef struct xfs_dq_logformat {
* to the first and ensures that the first logitem is taken out of the AIL
* only when the last one is securely committed.
*/
-typedef struct xfs_qoff_logformat {
+struct xfs_qoff_logformat {
unsigned short qf_type; /* quotaoff log item type */
unsigned short qf_size; /* size of this item */
unsigned int qf_flags; /* USR and/or GRP */
char qf_pad[12]; /* padding for future */
-} xfs_qoff_logformat_t;
+};
/*
* Disk quotas status in m_qflags, and also sb_qflags. 16 bits.
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 95de23095030..9e712e62369c 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -111,7 +111,7 @@ struct xlog_recover_item {
struct xlog_recover {
struct hlist_node r_list;
xlog_tid_t r_log_tid; /* log's transaction id */
- xfs_trans_header_t r_theader; /* trans header for partial */
+ struct xfs_trans_header r_theader; /* trans header for partial */
int r_state; /* not needed */
xfs_lsn_t r_lsn; /* xact lsn */
struct list_head r_itemq; /* q for items */
diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c
index 225923e463c4..b02e3d6c0868 100644
--- a/fs/xfs/libxfs/xfs_metafile.c
+++ b/fs/xfs/libxfs/xfs_metafile.c
@@ -121,7 +121,7 @@ xfs_metafile_resv_critical(
div_u64(mp->m_metafile_resv_target, 10)))
return true;
- return XFS_TEST_ERROR(false, mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL);
+ return XFS_TEST_ERROR(mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL);
}
/* Allocate a block from the metadata file's reservation. */
diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h
index 5ed44fdf7491..7bfa3242e2c5 100644
--- a/fs/xfs/libxfs/xfs_ondisk.h
+++ b/fs/xfs/libxfs/xfs_ondisk.h
@@ -174,6 +174,8 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_rud_log_format, 16);
XFS_CHECK_STRUCT_SIZE(struct xfs_map_extent, 32);
XFS_CHECK_STRUCT_SIZE(struct xfs_phys_extent, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xlog_rec_header, 328);
+ XFS_CHECK_STRUCT_SIZE(struct xlog_rec_ext_header, 260);
XFS_CHECK_OFFSET(struct xfs_bui_log_format, bui_extents, 16);
XFS_CHECK_OFFSET(struct xfs_cui_log_format, cui_extents, 16);
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 897784037483..2484dc9f6d7e 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -1113,8 +1113,7 @@ xfs_refcount_still_have_space(
* refcount continue update "error" has been injected.
*/
if (cur->bc_refc.nr_ops > 2 &&
- XFS_TEST_ERROR(false, cur->bc_mp,
- XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE))
+ XFS_TEST_ERROR(cur->bc_mp, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE))
return false;
if (cur->bc_refc.nr_ops == 0)
@@ -1398,7 +1397,7 @@ xfs_refcount_finish_one(
trace_xfs_refcount_deferred(mp, ri);
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE))
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE))
return -EIO;
/*
@@ -1511,7 +1510,7 @@ xfs_rtrefcount_finish_one(
trace_xfs_refcount_deferred(mp, ri);
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE))
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE))
return -EIO;
/*
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 3cdf50563fec..83e0488ff773 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -2690,7 +2690,7 @@ xfs_rmap_finish_one(
trace_xfs_rmap_deferred(mp, ri);
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE))
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_RMAP_FINISH_ONE))
return -EIO;
/*
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 5057536e586c..618061d898d4 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1067,7 +1067,7 @@ xfs_rtfree_extent(
ASSERT(rbmip->i_itemp != NULL);
xfs_assert_ilocked(rbmip, XFS_ILOCK_EXCL);
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FREE_EXTENT))
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_FREE_EXTENT))
return -EIO;
error = xfs_rtcheck_alloc_range(&args, start, len);
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 711e180f9ebb..cdd16dd805d7 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -142,8 +142,6 @@ xfs_sb_version_to_features(
if (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) {
if (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)
features |= XFS_FEAT_LAZYSBCOUNT;
- if (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT)
- features |= XFS_FEAT_ATTR2;
if (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT)
features |= XFS_FEAT_PROJID32;
if (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE)
@@ -155,7 +153,7 @@ xfs_sb_version_to_features(
/* Always on V5 features */
features |= XFS_FEAT_ALIGN | XFS_FEAT_LOGV2 | XFS_FEAT_EXTFLG |
- XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_ATTR2 | XFS_FEAT_PROJID32 |
+ XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_PROJID32 |
XFS_FEAT_V3INODES | XFS_FEAT_CRC | XFS_FEAT_PQUOTINO;
/* Optional V5 features */
@@ -1524,7 +1522,8 @@ xfs_fs_geometry(
geo->version = XFS_FSOP_GEOM_VERSION;
geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK |
XFS_FSOP_GEOM_FLAGS_DIRV2 |
- XFS_FSOP_GEOM_FLAGS_EXTFLG;
+ XFS_FSOP_GEOM_FLAGS_EXTFLG |
+ XFS_FSOP_GEOM_FLAGS_ATTR2;
if (xfs_has_attr(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR;
if (xfs_has_quota(mp))
@@ -1537,8 +1536,6 @@ xfs_fs_geometry(
geo->flags |= XFS_FSOP_GEOM_FLAGS_DIRV2CI;
if (xfs_has_lazysbcount(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_LAZYSB;
- if (xfs_has_attr2(mp))
- geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR2;
if (xfs_has_projid32(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_PROJID32;
if (xfs_has_crc(mp))
diff --git a/fs/xfs/libxfs/xfs_zones.h b/fs/xfs/libxfs/xfs_zones.h
index c4f1367b2cca..5fefd132e002 100644
--- a/fs/xfs/libxfs/xfs_zones.h
+++ b/fs/xfs/libxfs/xfs_zones.h
@@ -29,6 +29,13 @@ struct xfs_rtgroup;
#define XFS_OPEN_GC_ZONES 1U
#define XFS_MIN_OPEN_ZONES (XFS_OPEN_GC_ZONES + 1U)
+/*
+ * For zoned devices that do not have a limit on the number of open zones, and
+ * for regular devices using the zoned allocator, use the most common SMR disks
+ * limit (128) as the default limit on the number of open zones.
+ */
+#define XFS_DEFAULT_MAX_OPEN_ZONES 128
+
bool xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg,
xfs_rgblock_t *write_pointer);
diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c
index 38a246b8bf11..b2a83801412e 100644
--- a/fs/xfs/scrub/cow_repair.c
+++ b/fs/xfs/scrub/cow_repair.c
@@ -300,7 +300,7 @@ xrep_cow_find_bad(
* on the debugging knob, replace everything in the CoW fork.
*/
if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) ||
- XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
+ XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock,
xc->irec.br_blockcount);
if (error)
@@ -385,7 +385,7 @@ xrep_cow_find_bad_rt(
* CoW fork and then scan for staging extents in the refcountbt.
*/
if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) ||
- XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
+ XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock,
xc->irec.br_blockcount);
if (error)
diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c
index 14939d7de349..378ec7c8d38e 100644
--- a/fs/xfs/scrub/metapath.c
+++ b/fs/xfs/scrub/metapath.c
@@ -79,7 +79,7 @@ xchk_metapath_cleanup(
if (mpath->dp_ilock_flags)
xfs_iunlock(mpath->dp, mpath->dp_ilock_flags);
- kfree(mpath->path);
+ kfree_const(mpath->path);
}
/* Set up a metadir path scan. @path must be dynamically allocated. */
@@ -98,13 +98,13 @@ xchk_setup_metapath_scan(
error = xchk_install_live_inode(sc, ip);
if (error) {
- kfree(path);
+ kfree_const(path);
return error;
}
mpath = kzalloc(sizeof(struct xchk_metapath), XCHK_GFP_FLAGS);
if (!mpath) {
- kfree(path);
+ kfree_const(path);
return -ENOMEM;
}
@@ -132,7 +132,7 @@ xchk_setup_metapath_rtdir(
return -ENOENT;
return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip,
- kasprintf(GFP_KERNEL, "rtgroups"), sc->mp->m_rtdirip);
+ kstrdup_const("rtgroups", GFP_KERNEL), sc->mp->m_rtdirip);
}
/* Scan a rtgroup inode under the /rtgroups directory. */
@@ -179,7 +179,7 @@ xchk_setup_metapath_quotadir(
return -ENOENT;
return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip,
- kstrdup("quota", GFP_KERNEL), qi->qi_dirip);
+ kstrdup_const("quota", GFP_KERNEL), qi->qi_dirip);
}
/* Scan a quota inode under the /quota directory. */
@@ -212,7 +212,7 @@ xchk_setup_metapath_dqinode(
return -ENOENT;
return xchk_setup_metapath_scan(sc, qi->qi_dirip,
- kstrdup(xfs_dqinode_path(type), GFP_KERNEL), ip);
+ kstrdup_const(xfs_dqinode_path(type), GFP_KERNEL), ip);
}
#else
# define xchk_setup_metapath_quotadir(...) (-ENOENT)
diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c
index 1588ce971cb8..951ae8b71566 100644
--- a/fs/xfs/scrub/newbt.c
+++ b/fs/xfs/scrub/newbt.c
@@ -28,6 +28,15 @@
#include "scrub/newbt.h"
/*
+ * This is the maximum number of deferred extent freeing item extents (EFIs)
+ * that we'll attach to a transaction without rolling the transaction to avoid
+ * overrunning a tr_itruncate reservation. The newbt code should reserve
+ * exactly the correct number of blocks to rebuild the btree, so there should
+ * not be any excess blocks to free when committing a new btree.
+ */
+#define XREP_MAX_ITRUNCATE_EFIS (128)
+
+/*
* Estimate proper slack values for a btree that's being reloaded.
*
* Under most circumstances, we'll take whatever default loading value the
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index 8703897c0a9c..07f5bb8a6421 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -36,6 +36,12 @@
#include "xfs_metafile.h"
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"
+#include "xfs_extfree_item.h"
+#include "xfs_rmap_item.h"
+#include "xfs_refcount_item.h"
+#include "xfs_buf_item.h"
+#include "xfs_bmap_item.h"
+#include "xfs_bmap_btree.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -91,21 +97,33 @@
struct xreap_state {
struct xfs_scrub *sc;
- /* Reverse mapping owner and metadata reservation type. */
- const struct xfs_owner_info *oinfo;
- enum xfs_ag_resv_type resv;
+ union {
+ struct {
+ /*
+ * For AG blocks, this is reverse mapping owner and
+ * metadata reservation type.
+ */
+ const struct xfs_owner_info *oinfo;
+ enum xfs_ag_resv_type resv;
+ };
+ struct {
+ /* For file blocks, this is the inode and fork. */
+ struct xfs_inode *ip;
+ int whichfork;
+ };
+ };
- /* If true, roll the transaction before reaping the next extent. */
- bool force_roll;
+ /* Number of invalidated buffers logged to the current transaction. */
+ unsigned int nr_binval;
- /* Number of deferred reaps attached to the current transaction. */
- unsigned int deferred;
+ /* Maximum number of buffers we can invalidate in a single tx. */
+ unsigned int max_binval;
- /* Number of invalidated buffers logged to the current transaction. */
- unsigned int invalidated;
+ /* Number of deferred reaps attached to the current transaction. */
+ unsigned int nr_deferred;
- /* Number of deferred reaps queued during the whole reap sequence. */
- unsigned long long total_deferred;
+ /* Maximum number of intents we can reap in a single transaction. */
+ unsigned int max_deferred;
};
/* Put a block back on the AGFL. */
@@ -148,71 +166,79 @@ xreap_put_freelist(
}
/* Are there any uncommitted reap operations? */
-static inline bool xreap_dirty(const struct xreap_state *rs)
+static inline bool xreap_is_dirty(const struct xreap_state *rs)
{
- if (rs->force_roll)
- return true;
- if (rs->deferred)
- return true;
- if (rs->invalidated)
- return true;
- if (rs->total_deferred)
- return true;
- return false;
+ return rs->nr_binval > 0 || rs->nr_deferred > 0;
}
-#define XREAP_MAX_BINVAL (2048)
-
/*
- * Decide if we want to roll the transaction after reaping an extent. We don't
- * want to overrun the transaction reservation, so we prohibit more than
- * 128 EFIs per transaction. For the same reason, we limit the number
- * of buffer invalidations to 2048.
+ * Decide if we need to roll the transaction to clear out the log
+ * reservation that we allocated to buffer invalidations.
*/
-static inline bool xreap_want_roll(const struct xreap_state *rs)
+static inline bool xreap_want_binval_roll(const struct xreap_state *rs)
{
- if (rs->force_roll)
- return true;
- if (rs->deferred > XREP_MAX_ITRUNCATE_EFIS)
- return true;
- if (rs->invalidated > XREAP_MAX_BINVAL)
- return true;
- return false;
+ return rs->nr_binval >= rs->max_binval;
}
-static inline void xreap_reset(struct xreap_state *rs)
+/* Reset the buffer invalidation count after rolling. */
+static inline void xreap_binval_reset(struct xreap_state *rs)
{
- rs->total_deferred += rs->deferred;
- rs->deferred = 0;
- rs->invalidated = 0;
- rs->force_roll = false;
+ rs->nr_binval = 0;
}
-#define XREAP_MAX_DEFER_CHAIN (2048)
+/*
+ * Bump the number of invalidated buffers, and return true if we can continue,
+ * or false if we need to roll the transaction.
+ */
+static inline bool xreap_inc_binval(struct xreap_state *rs)
+{
+ rs->nr_binval++;
+ return rs->nr_binval < rs->max_binval;
+}
/*
* Decide if we want to finish the deferred ops that are attached to the scrub
* transaction. We don't want to queue huge chains of deferred ops because
* that can consume a lot of log space and kernel memory. Hence we trigger a
- * xfs_defer_finish if there are more than 2048 deferred reap operations or the
- * caller did some real work.
+ * xfs_defer_finish if there are too many deferred reap operations or we've run
+ * out of space for invalidations.
*/
-static inline bool
-xreap_want_defer_finish(const struct xreap_state *rs)
+static inline bool xreap_want_defer_finish(const struct xreap_state *rs)
{
- if (rs->force_roll)
- return true;
- if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN)
- return true;
- return false;
+ return rs->nr_deferred >= rs->max_deferred;
}
+/*
+ * Reset the defer chain length and buffer invalidation count after finishing
+ * items.
+ */
static inline void xreap_defer_finish_reset(struct xreap_state *rs)
{
- rs->total_deferred = 0;
- rs->deferred = 0;
- rs->invalidated = 0;
- rs->force_roll = false;
+ rs->nr_deferred = 0;
+ rs->nr_binval = 0;
+}
+
+/*
+ * Bump the number of deferred extent reaps.
+ */
+static inline void xreap_inc_defer(struct xreap_state *rs)
+{
+ rs->nr_deferred++;
+}
+
+/* Force the caller to finish a deferred item chain. */
+static inline void xreap_force_defer_finish(struct xreap_state *rs)
+{
+ rs->nr_deferred = rs->max_deferred;
+}
+
+/* Maximum number of fsblocks that we might find in a buffer to invalidate. */
+static inline unsigned int
+xrep_binval_max_fsblocks(
+ struct xfs_mount *mp)
+{
+ /* Remote xattr values are the largest buffers that we support. */
+ return xfs_attr3_max_rmt_blocks(mp);
}
/*
@@ -224,12 +250,8 @@ xrep_bufscan_max_sectors(
struct xfs_mount *mp,
xfs_extlen_t fsblocks)
{
- int max_fsbs;
-
- /* Remote xattr values are the largest buffers that we support. */
- max_fsbs = xfs_attr3_max_rmt_blocks(mp);
-
- return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, max_fsbs));
+ return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks,
+ xrep_binval_max_fsblocks(mp)));
}
/*
@@ -297,14 +319,13 @@ xreap_agextent_binval(
while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
xfs_trans_bjoin(sc->tp, bp);
xfs_trans_binval(sc->tp, bp);
- rs->invalidated++;
/*
* Stop invalidating if we've hit the limit; we should
* still have enough reservation left to free however
* far we've gotten.
*/
- if (rs->invalidated > XREAP_MAX_BINVAL) {
+ if (!xreap_inc_binval(rs)) {
*aglenp -= agbno_next - bno;
goto out;
}
@@ -416,21 +437,23 @@ xreap_agextent_iter(
trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag), agbno,
*aglenp);
- rs->force_roll = true;
-
if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
/*
- * If we're unmapping CoW staging extents, remove the
+ * t0: Unmapping CoW staging extents, remove the
* records from the refcountbt, which will remove the
* rmap record as well.
*/
xfs_refcount_free_cow_extent(sc->tp, false, fsbno,
*aglenp);
+ xreap_inc_defer(rs);
return 0;
}
- return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno,
- *aglenp, rs->oinfo);
+ /* t1: unmap crosslinked metadata blocks */
+ xfs_rmap_free_extent(sc->tp, false, fsbno, *aglenp,
+ rs->oinfo->oi_owner);
+ xreap_inc_defer(rs);
+ return 0;
}
trace_xreap_dispose_free_extent(pag_group(sc->sa.pag), agbno, *aglenp);
@@ -443,12 +466,12 @@ xreap_agextent_iter(
*/
xreap_agextent_binval(rs, agbno, aglenp);
if (*aglenp == 0) {
- ASSERT(xreap_want_roll(rs));
+ ASSERT(xreap_want_binval_roll(rs));
return 0;
}
/*
- * If we're getting rid of CoW staging extents, use deferred work items
+ * t2: To get rid of CoW staging extents, use deferred work items
* to remove the refcountbt records (which removes the rmap records)
* and free the extent. We're not worried about the system going down
* here because log recovery walks the refcount btree to clean out the
@@ -463,23 +486,23 @@ xreap_agextent_iter(
if (error)
return error;
- rs->force_roll = true;
+ xreap_inc_defer(rs);
return 0;
}
- /* Put blocks back on the AGFL one at a time. */
+ /* t3: Put blocks back on the AGFL one at a time. */
if (rs->resv == XFS_AG_RESV_AGFL) {
ASSERT(*aglenp == 1);
error = xreap_put_freelist(sc, agbno);
if (error)
return error;
- rs->force_roll = true;
+ xreap_force_defer_finish(rs);
return 0;
}
/*
- * Use deferred frees to get rid of the old btree blocks to try to
+ * t4: Use deferred frees to get rid of the old btree blocks to try to
* minimize the window in which we could crash and lose the old blocks.
* Add a defer ops barrier every other extent to avoid stressing the
* system with large EFIs.
@@ -489,12 +512,194 @@ xreap_agextent_iter(
if (error)
return error;
- rs->deferred++;
- if (rs->deferred % 2 == 0)
+ xreap_inc_defer(rs);
+ if (rs->nr_deferred % 2 == 0)
xfs_defer_add_barrier(sc->tp);
return 0;
}
+/* Configure the deferral and invalidation limits */
+static inline void
+xreap_configure_limits(
+ struct xreap_state *rs,
+ unsigned int fixed_overhead,
+ unsigned int variable_overhead,
+ unsigned int per_intent,
+ unsigned int per_binval)
+{
+ struct xfs_scrub *sc = rs->sc;
+ unsigned int res = sc->tp->t_log_res - fixed_overhead;
+
+ /* Don't underflow the reservation */
+ if (sc->tp->t_log_res < (fixed_overhead + variable_overhead)) {
+ ASSERT(sc->tp->t_log_res >=
+ (fixed_overhead + variable_overhead));
+ xfs_force_shutdown(sc->mp, SHUTDOWN_CORRUPT_INCORE);
+ return;
+ }
+
+ rs->max_deferred = per_intent ? res / variable_overhead : 0;
+ res -= rs->max_deferred * per_intent;
+ rs->max_binval = per_binval ? res / per_binval : 0;
+}
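To illustrate how xreap_configure_limits() splits the budget (mirroring the agextent configuration below, where the variable overhead is per_intent + per_binval), here is a worked example with assumed numbers; none of these values are real reservation sizes.

	/* Illustrative arithmetic only; every constant here is an assumption. */
	unsigned int t_log_res  = 200000;	/* assumed transaction log reservation */
	unsigned int step_size  = 20000;	/* assumed fixed overhead */
	unsigned int per_intent = 300;		/* assumed intent + done item log size */
	unsigned int per_binval = 600;		/* assumed buffer invalidation log size */

	unsigned int res = t_log_res - step_size;			/* 180000 */
	unsigned int max_deferred = res / (per_intent + per_binval);	/* 200 */
	res -= max_deferred * per_intent;				/* 120000 */
	unsigned int max_binval = res / per_binval;			/* 200 */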
+
+/*
+ * Compute the maximum number of intent items that reaping can attach to the
+ * scrub transaction given the worst case log overhead of the intent items
+ * needed to reap a single per-AG space extent. This is not for freeing CoW
+ * staging extents.
+ */
+STATIC void
+xreap_configure_agextent_limits(
+ struct xreap_state *rs)
+{
+ struct xfs_scrub *sc = rs->sc;
+ struct xfs_mount *mp = sc->mp;
+
+ /*
+ * In the worst case, relogging an intent item causes both an intent
+ * item and a done item to be attached to a transaction for each extent
+ * that we'd like to process.
+ */
+ const unsigned int efi = xfs_efi_log_space(1) +
+ xfs_efd_log_space(1);
+ const unsigned int rui = xfs_rui_log_space(1) +
+ xfs_rud_log_space();
+
+ /*
+ * Various things can happen when reaping non-CoW metadata blocks:
+ *
+ * t1: Unmapping crosslinked metadata blocks: deferred removal of rmap
+ * record.
+ *
+ * t3: Freeing to AGFL: roll and finish deferred items for every block.
+ * Limits here do not matter.
+ *
+ * t4: Freeing metadata blocks: deferred freeing of the space, which
+ * also removes the rmap record.
+ *
+ * For simplicity, we'll use the worst-case intents size to determine
+ * the maximum number of deferred extents before we have to finish the
+ * whole chain. If we're trying to reap a btree larger than this size,
+ * a crash midway through reaping can result in leaked blocks.
+ */
+ const unsigned int t1 = rui;
+ const unsigned int t4 = rui + efi;
+ const unsigned int per_intent = max(t1, t4);
+
+ /*
+ * For each transaction in a reap chain, we must be able to take one
+ * step in the defer item chain, which should only consist of EFI or
+ * RUI items.
+ */
+ const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1);
+ const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1);
+ const unsigned int step_size = max(f1, f2);
+
+ /* Largest buffer size (in fsblocks) that can be invalidated. */
+ const unsigned int max_binval = xrep_binval_max_fsblocks(mp);
+
+ /* Maximum overhead of invalidating one buffer. */
+ const unsigned int per_binval =
+ xfs_buf_inval_log_space(1, XFS_B_TO_FSBT(mp, max_binval));
+
+ /*
+ * For each transaction in a reap chain, we can delete some number of
+ * extents and invalidate some number of blocks. We assume that btree
+ * blocks aren't usually contiguous; and that scrub likely pulled all
+ * the buffers into memory. From these assumptions, set the maximum
+ * number of deferrals we can queue before flushing the defer chain,
+ * and the number of invalidations we can queue before rolling to a
+ * clean transaction (and possibly relogging some of the deferrals) to
+ * the same quantity.
+ */
+ const unsigned int variable_overhead = per_intent + per_binval;
+
+ xreap_configure_limits(rs, step_size, variable_overhead, per_intent,
+ per_binval);
+
+ trace_xreap_agextent_limits(sc->tp, per_binval, rs->max_binval,
+ step_size, per_intent, rs->max_deferred);
+}
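+
+/*
+ * Since t4 = t1 + efi in the derivation above, the max() always resolves
+ * per_intent to rui + efi for per-AG metadata extents.
+ */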
+
+/*
+ * Compute the maximum number of intent items that reaping can attach to the
+ * scrub transaction given the worst case log overhead of the intent items
+ * needed to reap a single CoW staging extent. This is not for freeing
+ * metadata blocks.
+ */
+STATIC void
+xreap_configure_agcow_limits(
+ struct xreap_state *rs)
+{
+ struct xfs_scrub *sc = rs->sc;
+ struct xfs_mount *mp = sc->mp;
+
+ /*
+ * In the worst case, relogging an intent item causes both an intent
+ * item and a done item to be attached to a transaction for each extent
+ * that we'd like to process.
+ */
+ const unsigned int efi = xfs_efi_log_space(1) +
+ xfs_efd_log_space(1);
+ const unsigned int rui = xfs_rui_log_space(1) +
+ xfs_rud_log_space();
+ const unsigned int cui = xfs_cui_log_space(1) +
+ xfs_cud_log_space();
+
+ /*
+	 * Various things can happen when reaping CoW staging extents:
+ *
+ * t0: Unmapping crosslinked CoW blocks: deferred removal of refcount
+ * record, which defers removal of rmap record
+ *
+ * t2: Freeing CoW blocks: deferred removal of refcount record, which
+ * defers removal of rmap record; and deferred removal of the space
+ *
+ * For simplicity, we'll use the worst-case intents size to determine
+ * the maximum number of deferred extents before we have to finish the
+ * whole chain. If we're trying to reap a btree larger than this size,
+ * a crash midway through reaping can result in leaked blocks.
+ */
+ const unsigned int t0 = cui + rui;
+ const unsigned int t2 = cui + rui + efi;
+ const unsigned int per_intent = max(t0, t2);
+
+ /*
+ * For each transaction in a reap chain, we must be able to take one
+ * step in the defer item chain, which should only consist of CUI, EFI,
+ * or RUI items.
+ */
+ const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1);
+ const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1);
+ const unsigned int f3 = xfs_calc_finish_cui_reservation(mp, 1);
+ const unsigned int step_size = max3(f1, f2, f3);
+
+ /* Largest buffer size (in fsblocks) that can be invalidated. */
+ const unsigned int max_binval = xrep_binval_max_fsblocks(mp);
+
+ /* Overhead of invalidating one buffer */
+ const unsigned int per_binval =
+ xfs_buf_inval_log_space(1, XFS_B_TO_FSBT(mp, max_binval));
+
+ /*
+ * For each transaction in a reap chain, we can delete some number of
+ * extents and invalidate some number of blocks. We assume that CoW
+ * staging extents are usually more than 1 fsblock, and that there
+ * shouldn't be any buffers for those blocks. From the assumptions,
+	 * shouldn't be any buffers for those blocks. From these assumptions,
+ * it can, but leave space to invalidate 1/8th that number of buffers.
+ */
+ const unsigned int variable_overhead = per_intent +
+ (per_binval / 8);
+
+ xreap_configure_limits(rs, step_size, variable_overhead, per_intent,
+ per_binval);
+
+ trace_xreap_agcow_limits(sc->tp, per_binval, rs->max_binval, step_size,
+ per_intent, rs->max_deferred);
+}
+
/*
* Break an AG metadata extent into sub-extents by fate (crosslinked, not
* crosslinked), and dispose of each sub-extent separately.
@@ -531,11 +736,11 @@ xreap_agmeta_extent(
if (error)
return error;
xreap_defer_finish_reset(rs);
- } else if (xreap_want_roll(rs)) {
+ } else if (xreap_want_binval_roll(rs)) {
error = xrep_roll_ag_trans(sc);
if (error)
return error;
- xreap_reset(rs);
+ xreap_binval_reset(rs);
}
agbno += aglen;
@@ -562,11 +767,12 @@ xrep_reap_agblocks(
ASSERT(xfs_has_rmapbt(sc->mp));
ASSERT(sc->ip == NULL);
+ xreap_configure_agextent_limits(&rs);
error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs);
if (error)
return error;
- if (xreap_dirty(&rs))
+ if (xreap_is_dirty(&rs))
return xrep_defer_finish(sc);
return 0;
@@ -628,7 +834,7 @@ xreap_fsmeta_extent(
if (error)
goto out_agf;
xreap_defer_finish_reset(rs);
- } else if (xreap_want_roll(rs)) {
+ } else if (xreap_want_binval_roll(rs)) {
/*
* Hold the AGF buffer across the transaction roll so
* that we don't have to reattach it to the scrub
@@ -639,7 +845,7 @@ xreap_fsmeta_extent(
xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
if (error)
goto out_agf;
- xreap_reset(rs);
+ xreap_binval_reset(rs);
}
agbno += aglen;
@@ -674,11 +880,15 @@ xrep_reap_fsblocks(
ASSERT(xfs_has_rmapbt(sc->mp));
ASSERT(sc->ip != NULL);
+ if (oinfo == &XFS_RMAP_OINFO_COW)
+ xreap_configure_agcow_limits(&rs);
+ else
+ xreap_configure_agextent_limits(&rs);
error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
if (error)
return error;
- if (xreap_dirty(&rs))
+ if (xreap_is_dirty(&rs))
return xrep_defer_finish(sc);
return 0;
@@ -770,7 +980,7 @@ xreap_rgextent_iter(
rtbno = xfs_rgbno_to_rtb(sc->sr.rtg, rgbno);
/*
- * If there are other rmappings, this block is cross linked and must
+ * t1: There are other rmappings; this block is cross linked and must
* not be freed. Remove the forward and reverse mapping and move on.
*/
if (crosslinked) {
@@ -778,14 +988,14 @@ xreap_rgextent_iter(
*rglenp);
xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp);
- rs->deferred++;
+ xreap_inc_defer(rs);
return 0;
}
trace_xreap_dispose_free_extent(rtg_group(sc->sr.rtg), rgbno, *rglenp);
/*
- * The CoW staging extent is not crosslinked. Use deferred work items
+ * t2: The CoW staging extent is not crosslinked. Use deferred work
* to remove the refcountbt records (which removes the rmap records)
* and free the extent. We're not worried about the system going down
* here because log recovery walks the refcount btree to clean out the
@@ -799,10 +1009,73 @@ xreap_rgextent_iter(
if (error)
return error;
- rs->deferred++;
+ xreap_inc_defer(rs);
return 0;
}
+/*
+ * Compute the maximum number of intent items that reaping can attach to the
+ * scrub transaction given the worst case log overhead of the intent items
+ * needed to reap a single CoW staging extent. This is not for freeing
+ * metadata blocks.
+ */
+STATIC void
+xreap_configure_rgcow_limits(
+ struct xreap_state *rs)
+{
+ struct xfs_scrub *sc = rs->sc;
+ struct xfs_mount *mp = sc->mp;
+
+ /*
+ * In the worst case, relogging an intent item causes both an intent
+ * item and a done item to be attached to a transaction for each extent
+ * that we'd like to process.
+ */
+ const unsigned int efi = xfs_efi_log_space(1) +
+ xfs_efd_log_space(1);
+ const unsigned int rui = xfs_rui_log_space(1) +
+ xfs_rud_log_space();
+ const unsigned int cui = xfs_cui_log_space(1) +
+ xfs_cud_log_space();
+
+ /*
+	 * Various things can happen when reaping CoW staging extents:
+ *
+ * t1: Unmapping crosslinked CoW blocks: deferred removal of refcount
+ * record, which defers removal of rmap record
+ *
+ * t2: Freeing CoW blocks: deferred removal of refcount record, which
+ * defers removal of rmap record; and deferred removal of the space
+ *
+ * For simplicity, we'll use the worst-case intents size to determine
+ * the maximum number of deferred extents before we have to finish the
+ * whole chain. If we're trying to reap a btree larger than this size,
+ * a crash midway through reaping can result in leaked blocks.
+ */
+ const unsigned int t1 = cui + rui;
+ const unsigned int t2 = cui + rui + efi;
+ const unsigned int per_intent = max(t1, t2);
+
+ /*
+ * For each transaction in a reap chain, we must be able to take one
+ * step in the defer item chain, which should only consist of CUI, EFI,
+ * or RUI items.
+ */
+ const unsigned int f1 = xfs_calc_finish_rt_efi_reservation(mp, 1);
+ const unsigned int f2 = xfs_calc_finish_rt_rui_reservation(mp, 1);
+ const unsigned int f3 = xfs_calc_finish_rt_cui_reservation(mp, 1);
+ const unsigned int step_size = max3(f1, f2, f3);
+
+ /*
+ * The only buffer for the rt device is the rtgroup super, so we don't
+ * need to save space for buffer invalidations.
+ */
+ xreap_configure_limits(rs, step_size, per_intent, per_intent, 0);
+
+ trace_xreap_rgcow_limits(sc->tp, 0, 0, step_size, per_intent,
+ rs->max_deferred);
+}
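+
+/*
+ * Sketch of what the call above works out to: with per_binval = 0,
+ * xreap_configure_limits() leaves max_binval at 0 and devotes the entire
+ * post-step_size budget to intent items, i.e. max_deferred =
+ * (t_log_res - step_size) / per_intent.
+ */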
+
#define XREAP_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP | \
XFS_RTGLOCK_RMAP | \
XFS_RTGLOCK_REFCOUNT)
@@ -855,11 +1128,11 @@ xreap_rtmeta_extent(
if (error)
goto out_unlock;
xreap_defer_finish_reset(rs);
- } else if (xreap_want_roll(rs)) {
+ } else if (xreap_want_binval_roll(rs)) {
error = xfs_trans_roll_inode(&sc->tp, sc->ip);
if (error)
goto out_unlock;
- xreap_reset(rs);
+ xreap_binval_reset(rs);
}
rgbno += rglen;
@@ -891,12 +1164,14 @@ xrep_reap_rtblocks(
ASSERT(xfs_has_rmapbt(sc->mp));
ASSERT(sc->ip != NULL);
+ ASSERT(oinfo == &XFS_RMAP_OINFO_COW);
+ xreap_configure_rgcow_limits(&rs);
error = xrtb_bitmap_walk(bitmap, xreap_rtmeta_extent, &rs);
if (error)
return error;
- if (xreap_dirty(&rs))
+ if (xreap_is_dirty(&rs))
return xrep_defer_finish(sc);
return 0;
@@ -929,13 +1204,13 @@ xrep_reap_metadir_fsblocks(
ASSERT(sc->ip != NULL);
ASSERT(xfs_is_metadir_inode(sc->ip));
+ xreap_configure_agextent_limits(&rs);
xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK);
-
error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
if (error)
return error;
- if (xreap_dirty(&rs)) {
+ if (xreap_is_dirty(&rs)) {
error = xrep_defer_finish(sc);
if (error)
return error;
@@ -955,13 +1230,12 @@ xrep_reap_metadir_fsblocks(
*/
STATIC int
xreap_bmapi_select(
- struct xfs_scrub *sc,
- struct xfs_inode *ip,
- int whichfork,
+ struct xreap_state *rs,
struct xfs_bmbt_irec *imap,
bool *crosslinked)
{
struct xfs_owner_info oinfo;
+ struct xfs_scrub *sc = rs->sc;
struct xfs_btree_cur *cur;
xfs_filblks_t len = 1;
xfs_agblock_t bno;
@@ -975,7 +1249,8 @@ xreap_bmapi_select(
cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
sc->sa.pag);
- xfs_rmap_ino_owner(&oinfo, ip->i_ino, whichfork, imap->br_startoff);
+ xfs_rmap_ino_owner(&oinfo, rs->ip->i_ino, rs->whichfork,
+ imap->br_startoff);
error = xfs_rmap_has_other_keys(cur, agbno, 1, &oinfo, crosslinked);
if (error)
goto out_cur;
@@ -1038,21 +1313,19 @@ xreap_buf_loggable(
*/
STATIC int
xreap_bmapi_binval(
- struct xfs_scrub *sc,
- struct xfs_inode *ip,
- int whichfork,
+ struct xreap_state *rs,
struct xfs_bmbt_irec *imap)
{
+ struct xfs_scrub *sc = rs->sc;
struct xfs_mount *mp = sc->mp;
struct xfs_perag *pag = sc->sa.pag;
- int bmap_flags = xfs_bmapi_aflag(whichfork);
+ int bmap_flags = xfs_bmapi_aflag(rs->whichfork);
xfs_fileoff_t off;
xfs_fileoff_t max_off;
xfs_extlen_t scan_blocks;
xfs_agblock_t bno;
xfs_agblock_t agbno;
xfs_agblock_t agbno_next;
- unsigned int invalidated = 0;
int error;
/*
@@ -1079,7 +1352,7 @@ xreap_bmapi_binval(
struct xfs_bmbt_irec hmap;
int nhmaps = 1;
- error = xfs_bmapi_read(ip, off, max_off - off, &hmap,
+ error = xfs_bmapi_read(rs->ip, off, max_off - off, &hmap,
&nhmaps, bmap_flags);
if (error)
return error;
@@ -1120,14 +1393,13 @@ xreap_bmapi_binval(
xfs_buf_stale(bp);
xfs_buf_relse(bp);
}
- invalidated++;
/*
* Stop invalidating if we've hit the limit; we should
* still have enough reservation left to free however
- * much of the mapping we've seen so far.
+ * far we've gotten.
*/
- if (invalidated > XREAP_MAX_BINVAL) {
+ if (!xreap_inc_binval(rs)) {
imap->br_blockcount = agbno_next - bno;
goto out;
}
@@ -1149,12 +1421,11 @@ out:
*/
STATIC int
xrep_reap_bmapi_iter(
- struct xfs_scrub *sc,
- struct xfs_inode *ip,
- int whichfork,
+ struct xreap_state *rs,
struct xfs_bmbt_irec *imap,
bool crosslinked)
{
+ struct xfs_scrub *sc = rs->sc;
int error;
if (crosslinked) {
@@ -1171,14 +1442,14 @@ xrep_reap_bmapi_iter(
imap->br_blockcount);
/*
- * Schedule removal of the mapping from the fork. We use
+ * t0: Schedule removal of the mapping from the fork. We use
* deferred log intents in this function to control the exact
* sequence of metadata updates.
*/
- xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap);
- xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT,
+ xfs_bmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap);
+ xfs_trans_mod_dquot_byino(sc->tp, rs->ip, XFS_TRANS_DQ_BCOUNT,
-(int64_t)imap->br_blockcount);
- xfs_rmap_unmap_extent(sc->tp, ip, whichfork, imap);
+ xfs_rmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap);
return 0;
}
@@ -1199,41 +1470,139 @@ xrep_reap_bmapi_iter(
* transaction is full of logged buffer invalidations, so we need to
* return early so that we can roll and retry.
*/
- error = xreap_bmapi_binval(sc, ip, whichfork, imap);
+ error = xreap_bmapi_binval(rs, imap);
if (error || imap->br_blockcount == 0)
return error;
/*
- * Schedule removal of the mapping from the fork. We use deferred log
- * intents in this function to control the exact sequence of metadata
+ * t1: Schedule removal of the mapping from the fork. We use deferred
+ * work in this function to control the exact sequence of metadata
* updates.
*/
- xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap);
- xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT,
+ xfs_bmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap);
+	 * If we are using project inheritance, we only allow hard link and
+	 * rename creation in our tree when the project IDs are the same; else
return xfs_free_extent_later(sc->tp, imap->br_startblock,
imap->br_blockcount, NULL, XFS_AG_RESV_NONE,
XFS_FREE_EXTENT_SKIP_DISCARD);
}
+/* Compute the maximum mapcount of a file buffer. */
+static unsigned int
+xreap_bmapi_binval_mapcount(
+ struct xfs_scrub *sc)
+{
+ /* directory blocks can span multiple fsblocks and be discontiguous */
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_DIR)
+ return sc->mp->m_dir_geo->fsbcount;
+
+ /* all other file xattr/symlink blocks must be contiguous */
+ return 1;
+}
+
+/* Compute the maximum block size of a file buffer. */
+static unsigned int
+xreap_bmapi_binval_blocksize(
+ struct xfs_scrub *sc)
+{
+ switch (sc->sm->sm_type) {
+ case XFS_SCRUB_TYPE_DIR:
+ return sc->mp->m_dir_geo->blksize;
+ case XFS_SCRUB_TYPE_XATTR:
+ case XFS_SCRUB_TYPE_PARENT:
+ /*
+ * The xattr structure itself consists of single fsblocks, but
+ * there could be remote xattr blocks to invalidate.
+ */
+ return XFS_XATTR_SIZE_MAX;
+ }
+
+ /* everything else is a single block */
+ return sc->mp->m_sb.sb_blocksize;
+}
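+
+/*
+ * Illustration only, since geometry varies with mkfs options: a filesystem
+ * with 4096-byte blocks formatted with 8192-byte directory blocks would
+ * report fsbcount = 2 and blksize = 8192 from the helpers above, while the
+ * xattr cases size the invalidation for the largest possible remote value
+ * (XFS_XATTR_SIZE_MAX) because remote attribute blocks may also need to be
+ * staled.
+ */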
+
+/*
+ * Compute the maximum number of buffer invalidations that we can do while
+ * reaping a single extent from a file fork.
+ */
+STATIC void
+xreap_configure_bmapi_limits(
+ struct xreap_state *rs)
+{
+ struct xfs_scrub *sc = rs->sc;
+ struct xfs_mount *mp = sc->mp;
+
+ /* overhead of invalidating a buffer */
+ const unsigned int per_binval =
+ xfs_buf_inval_log_space(xreap_bmapi_binval_mapcount(sc),
+ xreap_bmapi_binval_blocksize(sc));
+
+ /*
+ * In the worst case, relogging an intent item causes both an intent
+ * item and a done item to be attached to a transaction for each extent
+ * that we'd like to process.
+ */
+ const unsigned int efi = xfs_efi_log_space(1) +
+ xfs_efd_log_space(1);
+ const unsigned int rui = xfs_rui_log_space(1) +
+ xfs_rud_log_space();
+ const unsigned int bui = xfs_bui_log_space(1) +
+ xfs_bud_log_space();
+
+ /*
+ * t1: Unmapping crosslinked file data blocks: one bmap deletion,
+ * possibly an EFI for underfilled bmbt blocks, and an rmap deletion.
+ *
+	 * t2: Freeing file data blocks: one bmap deletion, possibly an
+ * EFI for underfilled bmbt blocks, and another EFI for the space
+ * itself.
+ */
+ const unsigned int t1 = (bui + efi) + rui;
+ const unsigned int t2 = (bui + efi) + efi;
+ const unsigned int per_intent = max(t1, t2);
+
+ /*
+ * For each transaction in a reap chain, we must be able to take one
+	 * step in the defer item chain, which should only consist of BUI, EFI,
+ * or RUI items.
+ */
+ const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1);
+ const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1);
+ const unsigned int f3 = xfs_calc_finish_bui_reservation(mp, 1);
+ const unsigned int step_size = max3(f1, f2, f3);
+
+ /*
+ * Each call to xreap_ifork_extent starts with a clean transaction and
+ * operates on a single mapping by creating a chain of log intent items
+ * for that mapping. We need to leave enough reservation in the
+ * transaction to log btree buffer and inode updates for each step in
+ * the chain, and to relog the log intents.
+ */
+ const unsigned int per_extent_res = per_intent + step_size;
+
+ xreap_configure_limits(rs, per_extent_res, per_binval, 0, per_binval);
+
+ trace_xreap_bmapi_limits(sc->tp, per_binval, rs->max_binval,
+ step_size, per_intent, 1);
+}
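+
+/*
+ * Net effect of the configuration above: per_intent is passed as 0, so
+ * max_deferred ends up 0 and everything left after per_extent_res goes to
+ * buffer invalidations, i.e. max_binval =
+ * (t_log_res - per_extent_res) / per_binval. The trace reports a
+ * max_deferred of 1 because each mapping is reaped from its own clean
+ * transaction.
+ */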
+
/*
* Dispose of as much of this file extent as we can. Upon successful return,
* the imap will reflect the mapping that was removed from the fork.
*/
STATIC int
xreap_ifork_extent(
- struct xfs_scrub *sc,
- struct xfs_inode *ip,
- int whichfork,
+ struct xreap_state *rs,
struct xfs_bmbt_irec *imap)
{
+ struct xfs_scrub *sc = rs->sc;
xfs_agnumber_t agno;
bool crosslinked;
int error;
ASSERT(sc->sa.pag == NULL);
- trace_xreap_ifork_extent(sc, ip, whichfork, imap);
+ trace_xreap_ifork_extent(sc, rs->ip, rs->whichfork, imap);
agno = XFS_FSB_TO_AGNO(sc->mp, imap->br_startblock);
sc->sa.pag = xfs_perag_get(sc->mp, agno);
@@ -1248,11 +1617,11 @@ xreap_ifork_extent(
* Decide the fate of the blocks at the beginning of the mapping, then
* update the mapping to use it with the unmap calls.
*/
- error = xreap_bmapi_select(sc, ip, whichfork, imap, &crosslinked);
+ error = xreap_bmapi_select(rs, imap, &crosslinked);
if (error)
goto out_agf;
- error = xrep_reap_bmapi_iter(sc, ip, whichfork, imap, crosslinked);
+ error = xrep_reap_bmapi_iter(rs, imap, crosslinked);
if (error)
goto out_agf;
@@ -1276,6 +1645,11 @@ xrep_reap_ifork(
struct xfs_inode *ip,
int whichfork)
{
+ struct xreap_state rs = {
+ .sc = sc,
+ .ip = ip,
+ .whichfork = whichfork,
+ };
xfs_fileoff_t off = 0;
int bmap_flags = xfs_bmapi_aflag(whichfork);
int error;
@@ -1284,6 +1658,7 @@ xrep_reap_ifork(
ASSERT(ip == sc->ip || ip == sc->tempip);
ASSERT(whichfork == XFS_ATTR_FORK || !XFS_IS_REALTIME_INODE(ip));
+ xreap_configure_bmapi_limits(&rs);
while (off < XFS_MAX_FILEOFF) {
struct xfs_bmbt_irec imap;
int nimaps = 1;
@@ -1303,13 +1678,14 @@ xrep_reap_ifork(
* can in a single transaction.
*/
if (xfs_bmap_is_real_extent(&imap)) {
- error = xreap_ifork_extent(sc, ip, whichfork, &imap);
+ error = xreap_ifork_extent(&rs, &imap);
if (error)
return error;
error = xfs_defer_finish(&sc->tp);
if (error)
return error;
+ xreap_defer_finish_reset(&rs);
}
off = imap.br_startoff + imap.br_blockcount;
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index d00c18954a26..efd5a7ccdf62 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -1110,7 +1110,7 @@ xrep_will_attempt(
return true;
/* Let debug users force us into the repair routines. */
- if (XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
+ if (XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
return true;
/* Metadata is corrupt or failed cross-referencing. */
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 9c04295742c8..2bb125c4f9bf 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -18,14 +18,6 @@ static inline int xrep_notsupported(struct xfs_scrub *sc)
#ifdef CONFIG_XFS_ONLINE_REPAIR
-/*
- * This is the maximum number of deferred extent freeing item extents (EFIs)
- * that we'll attach to a transaction without rolling the transaction to avoid
- * overrunning a tr_itruncate reservation.
- */
-#define XREP_MAX_ITRUNCATE_EFIS (128)
-
-
/* Repair helpers */
int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run);
diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c
index 953ce7be78dc..5902398185a8 100644
--- a/fs/xfs/scrub/symlink_repair.c
+++ b/fs/xfs/scrub/symlink_repair.c
@@ -185,7 +185,7 @@ xrep_symlink_salvage_inline(
return 0;
nr = min(XFS_SYMLINK_MAXLEN, xfs_inode_data_fork_size(ip));
- strncpy(target_buf, ifp->if_data, nr);
+ memcpy(target_buf, ifp->if_data, nr);
return nr;
}
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 2450e214103f..987313a52e64 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -22,6 +22,7 @@
#include "xfs_parent.h"
#include "xfs_metafile.h"
#include "xfs_rtgroup.h"
+#include "xfs_trans.h"
#include "scrub/scrub.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 1e6e9c10cea2..39ea651cbb75 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -479,7 +479,7 @@ DECLARE_EVENT_CLASS(xchk_dqiter_class,
__field(xfs_exntst_t, state)
),
TP_fast_assign(
- __entry->dev = cursor->sc->ip->i_mount->m_super->s_dev;
+ __entry->dev = cursor->sc->mp->m_super->s_dev;
__entry->dqtype = cursor->dqtype;
__entry->ino = cursor->quota_ip->i_ino;
__entry->cur_id = cursor->id;
@@ -2000,6 +2000,51 @@ DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval);
DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval);
DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert);
+DECLARE_EVENT_CLASS(xrep_reap_limits_class,
+ TP_PROTO(const struct xfs_trans *tp, unsigned int per_binval,
+ unsigned int max_binval, unsigned int step_size,
+ unsigned int per_intent,
+ unsigned int max_deferred),
+ TP_ARGS(tp, per_binval, max_binval, step_size, per_intent, max_deferred),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, log_res)
+ __field(unsigned int, per_binval)
+ __field(unsigned int, max_binval)
+ __field(unsigned int, step_size)
+ __field(unsigned int, per_intent)
+ __field(unsigned int, max_deferred)
+ ),
+ TP_fast_assign(
+ __entry->dev = tp->t_mountp->m_super->s_dev;
+ __entry->log_res = tp->t_log_res;
+ __entry->per_binval = per_binval;
+ __entry->max_binval = max_binval;
+ __entry->step_size = step_size;
+ __entry->per_intent = per_intent;
+ __entry->max_deferred = max_deferred;
+ ),
+ TP_printk("dev %d:%d logres %u per_binval %u max_binval %u step_size %u per_intent %u max_deferred %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->log_res,
+ __entry->per_binval,
+ __entry->max_binval,
+ __entry->step_size,
+ __entry->per_intent,
+ __entry->max_deferred)
+);
+#define DEFINE_REPAIR_REAP_LIMITS_EVENT(name) \
+DEFINE_EVENT(xrep_reap_limits_class, name, \
+ TP_PROTO(const struct xfs_trans *tp, unsigned int per_binval, \
+ unsigned int max_binval, unsigned int step_size, \
+ unsigned int per_intent, \
+ unsigned int max_deferred), \
+ TP_ARGS(tp, per_binval, max_binval, step_size, per_intent, max_deferred))
+DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agextent_limits);
+DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agcow_limits);
+DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_rgcow_limits);
+DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_bmapi_limits);
+
DECLARE_EVENT_CLASS(xrep_reap_find_class,
TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno,
xfs_extlen_t len, bool crosslinked),
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 1ee4f835ac3c..a26f79815533 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -760,6 +760,9 @@ xfs_vm_swap_activate(
{
struct xfs_inode *ip = XFS_I(file_inode(swap_file));
+ if (xfs_is_zoned_inode(ip))
+ return -EINVAL;
+
/*
* Swap file activation can race against concurrent shared extent
* removal in files that have been cloned. If this happens,
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
index 5eef3bc30bda..c3a593319bee 100644
--- a/fs/xfs/xfs_attr_item.c
+++ b/fs/xfs/xfs_attr_item.c
@@ -491,7 +491,7 @@ xfs_attr_finish_item(
/* Reset trans after EAGAIN cycle since the transaction is new */
args->trans = tp;
- if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) {
+ if (XFS_TEST_ERROR(args->dp->i_mount, XFS_ERRTAG_LARP)) {
error = -EIO;
goto out;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index f9ef3b2a332a..773d959965dc 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -387,8 +387,6 @@ xfs_buf_map_verify(
struct xfs_buftarg *btp,
struct xfs_buf_map *map)
{
- xfs_daddr_t eofs;
-
/* Check for IOs smaller than the sector size / not sector aligned */
ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize));
ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
@@ -397,11 +395,10 @@ xfs_buf_map_verify(
* Corrupted block numbers can get through to here, unfortunately, so we
* have to check that the buffer falls within the filesystem bounds.
*/
- eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
- if (map->bm_bn < 0 || map->bm_bn >= eofs) {
+ if (map->bm_bn < 0 || map->bm_bn >= btp->bt_nr_sectors) {
xfs_alert(btp->bt_mount,
"%s: daddr 0x%llx out of range, EOFS 0x%llx",
- __func__, map->bm_bn, eofs);
+ __func__, map->bm_bn, btp->bt_nr_sectors);
WARN_ON(1);
return -EFSCORRUPTED;
}
@@ -1299,7 +1296,7 @@ xfs_buf_bio_end_io(
if (bio->bi_status)
xfs_buf_ioerror(bp, blk_status_to_errno(bio->bi_status));
else if ((bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
- XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
+ XFS_TEST_ERROR(bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
xfs_buf_ioerror(bp, -EIO);
if (bp->b_flags & XBF_ASYNC) {
@@ -1720,26 +1717,30 @@ xfs_configure_buftarg_atomic_writes(
int
xfs_configure_buftarg(
struct xfs_buftarg *btp,
- unsigned int sectorsize)
+ unsigned int sectorsize,
+ xfs_rfsblock_t nr_blocks)
{
- int error;
+ struct xfs_mount *mp = btp->bt_mount;
- ASSERT(btp->bt_bdev != NULL);
+ if (btp->bt_bdev) {
+ int error;
- /* Set up metadata sector size info */
- btp->bt_meta_sectorsize = sectorsize;
- btp->bt_meta_sectormask = sectorsize - 1;
+ error = bdev_validate_blocksize(btp->bt_bdev, sectorsize);
+ if (error) {
+ xfs_warn(mp,
+ "Cannot use blocksize %u on device %pg, err %d",
+ sectorsize, btp->bt_bdev, error);
+ return -EINVAL;
+ }
- error = bdev_validate_blocksize(btp->bt_bdev, sectorsize);
- if (error) {
- xfs_warn(btp->bt_mount,
- "Cannot use blocksize %u on device %pg, err %d",
- sectorsize, btp->bt_bdev, error);
- return -EINVAL;
+ if (bdev_can_atomic_write(btp->bt_bdev))
+ xfs_configure_buftarg_atomic_writes(btp);
}
- if (bdev_can_atomic_write(btp->bt_bdev))
- xfs_configure_buftarg_atomic_writes(btp);
+ btp->bt_meta_sectorsize = sectorsize;
+ btp->bt_meta_sectormask = sectorsize - 1;
+ /* m_blkbb_log is not set up yet */
+ btp->bt_nr_sectors = nr_blocks << (mp->m_sb.sb_blocklog - BBSHIFT);
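+	/*
+	 * For example, with 4096-byte filesystem blocks sb_blocklog is 12 and
+	 * BBSHIFT is 9, so each block spans 1 << (12 - 9) = 8 basic 512-byte
+	 * sectors and the shift above yields the device size in sectors.
+	 */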
return 0;
}
@@ -1749,6 +1750,9 @@ xfs_init_buftarg(
size_t logical_sectorsize,
const char *descr)
{
+ /* The maximum size of the buftarg is only known once the sb is read. */
+ btp->bt_nr_sectors = (xfs_daddr_t)-1;
+
/* Set up device logical sector size mask */
btp->bt_logical_sectorsize = logical_sectorsize;
btp->bt_logical_sectormask = logical_sectorsize - 1;
@@ -2084,7 +2088,7 @@ void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
* This allows userspace to disrupt buffer caching for debug/testing
* purposes.
*/
- if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF))
+ if (XFS_TEST_ERROR(bp->b_mount, XFS_ERRTAG_BUF_LRU_REF))
lru_ref = 0;
atomic_set(&bp->b_lru_ref, lru_ref);
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index b269e115d9ac..8fa7bdf59c91 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -103,6 +103,7 @@ struct xfs_buftarg {
size_t bt_meta_sectormask;
size_t bt_logical_sectorsize;
size_t bt_logical_sectormask;
+ xfs_daddr_t bt_nr_sectors;
/* LRU control structures */
struct shrinker *bt_shrinker;
@@ -372,7 +373,8 @@ struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp,
extern void xfs_free_buftarg(struct xfs_buftarg *);
extern void xfs_buftarg_wait(struct xfs_buftarg *);
extern void xfs_buftarg_drain(struct xfs_buftarg *);
-int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize);
+int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize,
+ xfs_fsblock_t nr_blocks);
#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index 5d58e2ae4972..e4c8af873632 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -736,6 +736,16 @@ xlog_recover_do_primary_sb_buffer(
*/
xfs_sb_from_disk(&mp->m_sb, dsb);
+ /*
+ * Grow can change the device size. Mirror that into the buftarg.
+ */
+ mp->m_ddev_targp->bt_nr_sectors =
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
+ if (mp->m_rtdev_targp && mp->m_rtdev_targp != mp->m_ddev_targp) {
+ mp->m_rtdev_targp->bt_nr_sectors =
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
+ }
+
if (mp->m_sb.sb_agcount < orig_agcount) {
xfs_alert(mp, "Shrinking AG count in log recovery not supported");
return -EFSCORRUPTED;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index dbd87e137694..39830b252ac8 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -10,61 +10,17 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_sysfs.h"
#include "xfs_inode.h"
#ifdef DEBUG
-static unsigned int xfs_errortag_random_default[] = {
- XFS_RANDOM_DEFAULT,
- XFS_RANDOM_IFLUSH_1,
- XFS_RANDOM_IFLUSH_2,
- XFS_RANDOM_IFLUSH_3,
- XFS_RANDOM_IFLUSH_4,
- XFS_RANDOM_IFLUSH_5,
- XFS_RANDOM_IFLUSH_6,
- XFS_RANDOM_DA_READ_BUF,
- XFS_RANDOM_BTREE_CHECK_LBLOCK,
- XFS_RANDOM_BTREE_CHECK_SBLOCK,
- XFS_RANDOM_ALLOC_READ_AGF,
- XFS_RANDOM_IALLOC_READ_AGI,
- XFS_RANDOM_ITOBP_INOTOBP,
- XFS_RANDOM_IUNLINK,
- XFS_RANDOM_IUNLINK_REMOVE,
- XFS_RANDOM_DIR_INO_VALIDATE,
- XFS_RANDOM_BULKSTAT_READ_CHUNK,
- XFS_RANDOM_IODONE_IOERR,
- XFS_RANDOM_STRATREAD_IOERR,
- XFS_RANDOM_STRATCMPL_IOERR,
- XFS_RANDOM_DIOWRITE_IOERR,
- XFS_RANDOM_BMAPIFORMAT,
- XFS_RANDOM_FREE_EXTENT,
- XFS_RANDOM_RMAP_FINISH_ONE,
- XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE,
- XFS_RANDOM_REFCOUNT_FINISH_ONE,
- XFS_RANDOM_BMAP_FINISH_ONE,
- XFS_RANDOM_AG_RESV_CRITICAL,
- 0, /* XFS_RANDOM_DROP_WRITES has been removed */
- XFS_RANDOM_LOG_BAD_CRC,
- XFS_RANDOM_LOG_ITEM_PIN,
- XFS_RANDOM_BUF_LRU_REF,
- XFS_RANDOM_FORCE_SCRUB_REPAIR,
- XFS_RANDOM_FORCE_SUMMARY_RECALC,
- XFS_RANDOM_IUNLINK_FALLBACK,
- XFS_RANDOM_BUF_IOERROR,
- XFS_RANDOM_REDUCE_MAX_IEXTENTS,
- XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT,
- XFS_RANDOM_AG_RESV_FAIL,
- XFS_RANDOM_LARP,
- XFS_RANDOM_DA_LEAF_SPLIT,
- XFS_RANDOM_ATTR_LEAF_TO_NODE,
- XFS_RANDOM_WB_DELAY_MS,
- XFS_RANDOM_WRITE_DELAY_MS,
- XFS_RANDOM_EXCHMAPS_FINISH_ONE,
- XFS_RANDOM_METAFILE_RESV_CRITICAL,
-};
+#define XFS_ERRTAG(_tag, _name, _default) \
+ [XFS_ERRTAG_##_tag] = (_default),
+#include "xfs_errortag.h"
+static const unsigned int xfs_errortag_random_default[] = { XFS_ERRTAGS };
+#undef XFS_ERRTAG
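+
+/*
+ * Expansion sketch, assuming xfs_errortag.h lists entries of the form
+ * XFS_ERRTAG(IFLUSH_1, iflush1, XFS_RANDOM_IFLUSH_1) inside XFS_ERRTAGS:
+ * with the #define above, each entry becomes a designated initializer like
+ *	[XFS_ERRTAG_IFLUSH_1] = (XFS_RANDOM_IFLUSH_1),
+ * so the defaults table stays in step with the tag list automatically.
+ */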
struct xfs_errortag_attr {
struct attribute attr;
@@ -93,21 +49,18 @@ xfs_errortag_attr_store(
size_t count)
{
struct xfs_mount *mp = to_mp(kobject);
- struct xfs_errortag_attr *xfs_attr = to_attr(attr);
+ unsigned int error_tag = to_attr(attr)->tag;
int ret;
- unsigned int val;
if (strcmp(buf, "default") == 0) {
- val = xfs_errortag_random_default[xfs_attr->tag];
+ mp->m_errortag[error_tag] =
+ xfs_errortag_random_default[error_tag];
} else {
- ret = kstrtouint(buf, 0, &val);
+ ret = kstrtouint(buf, 0, &mp->m_errortag[error_tag]);
if (ret)
return ret;
}
- ret = xfs_errortag_set(mp, xfs_attr->tag, val);
- if (ret)
- return ret;
return count;
}
@@ -118,10 +71,9 @@ xfs_errortag_attr_show(
char *buf)
{
struct xfs_mount *mp = to_mp(kobject);
- struct xfs_errortag_attr *xfs_attr = to_attr(attr);
+ unsigned int error_tag = to_attr(attr)->tag;
- return snprintf(buf, PAGE_SIZE, "%u\n",
- xfs_errortag_get(mp, xfs_attr->tag));
+ return snprintf(buf, PAGE_SIZE, "%u\n", mp->m_errortag[error_tag]);
}
static const struct sysfs_ops xfs_errortag_sysfs_ops = {
@@ -129,110 +81,28 @@ static const struct sysfs_ops xfs_errortag_sysfs_ops = {
.store = xfs_errortag_attr_store,
};
-#define XFS_ERRORTAG_ATTR_RW(_name, _tag) \
+#define XFS_ERRTAG(_tag, _name, _default) \
static struct xfs_errortag_attr xfs_errortag_attr_##_name = { \
.attr = {.name = __stringify(_name), \
.mode = VERIFY_OCTAL_PERMISSIONS(S_IWUSR | S_IRUGO) }, \
- .tag = (_tag), \
-}
-
-#define XFS_ERRORTAG_ATTR_LIST(_name) &xfs_errortag_attr_##_name.attr
-
-XFS_ERRORTAG_ATTR_RW(noerror, XFS_ERRTAG_NOERROR);
-XFS_ERRORTAG_ATTR_RW(iflush1, XFS_ERRTAG_IFLUSH_1);
-XFS_ERRORTAG_ATTR_RW(iflush2, XFS_ERRTAG_IFLUSH_2);
-XFS_ERRORTAG_ATTR_RW(iflush3, XFS_ERRTAG_IFLUSH_3);
-XFS_ERRORTAG_ATTR_RW(iflush4, XFS_ERRTAG_IFLUSH_4);
-XFS_ERRORTAG_ATTR_RW(iflush5, XFS_ERRTAG_IFLUSH_5);
-XFS_ERRORTAG_ATTR_RW(iflush6, XFS_ERRTAG_IFLUSH_6);
-XFS_ERRORTAG_ATTR_RW(dareadbuf, XFS_ERRTAG_DA_READ_BUF);
-XFS_ERRORTAG_ATTR_RW(btree_chk_lblk, XFS_ERRTAG_BTREE_CHECK_LBLOCK);
-XFS_ERRORTAG_ATTR_RW(btree_chk_sblk, XFS_ERRTAG_BTREE_CHECK_SBLOCK);
-XFS_ERRORTAG_ATTR_RW(readagf, XFS_ERRTAG_ALLOC_READ_AGF);
-XFS_ERRORTAG_ATTR_RW(readagi, XFS_ERRTAG_IALLOC_READ_AGI);
-XFS_ERRORTAG_ATTR_RW(itobp, XFS_ERRTAG_ITOBP_INOTOBP);
-XFS_ERRORTAG_ATTR_RW(iunlink, XFS_ERRTAG_IUNLINK);
-XFS_ERRORTAG_ATTR_RW(iunlinkrm, XFS_ERRTAG_IUNLINK_REMOVE);
-XFS_ERRORTAG_ATTR_RW(dirinovalid, XFS_ERRTAG_DIR_INO_VALIDATE);
-XFS_ERRORTAG_ATTR_RW(bulkstat, XFS_ERRTAG_BULKSTAT_READ_CHUNK);
-XFS_ERRORTAG_ATTR_RW(logiodone, XFS_ERRTAG_IODONE_IOERR);
-XFS_ERRORTAG_ATTR_RW(stratread, XFS_ERRTAG_STRATREAD_IOERR);
-XFS_ERRORTAG_ATTR_RW(stratcmpl, XFS_ERRTAG_STRATCMPL_IOERR);
-XFS_ERRORTAG_ATTR_RW(diowrite, XFS_ERRTAG_DIOWRITE_IOERR);
-XFS_ERRORTAG_ATTR_RW(bmapifmt, XFS_ERRTAG_BMAPIFORMAT);
-XFS_ERRORTAG_ATTR_RW(free_extent, XFS_ERRTAG_FREE_EXTENT);
-XFS_ERRORTAG_ATTR_RW(rmap_finish_one, XFS_ERRTAG_RMAP_FINISH_ONE);
-XFS_ERRORTAG_ATTR_RW(refcount_continue_update, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE);
-XFS_ERRORTAG_ATTR_RW(refcount_finish_one, XFS_ERRTAG_REFCOUNT_FINISH_ONE);
-XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE);
-XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL);
-XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC);
-XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN);
-XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF);
-XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR);
-XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC);
-XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK);
-XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR);
-XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS);
-XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT);
-XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL);
-XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP);
-XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT);
-XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE);
-XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS);
-XFS_ERRORTAG_ATTR_RW(write_delay_ms, XFS_ERRTAG_WRITE_DELAY_MS);
-XFS_ERRORTAG_ATTR_RW(exchmaps_finish_one, XFS_ERRTAG_EXCHMAPS_FINISH_ONE);
-XFS_ERRORTAG_ATTR_RW(metafile_resv_crit, XFS_ERRTAG_METAFILE_RESV_CRITICAL);
+ .tag = XFS_ERRTAG_##_tag, \
+};
+#include "xfs_errortag.h"
+XFS_ERRTAGS
+#undef XFS_ERRTAG
+#define XFS_ERRTAG(_tag, _name, _default) \
+ &xfs_errortag_attr_##_name.attr,
+#include "xfs_errortag.h"
static struct attribute *xfs_errortag_attrs[] = {
- XFS_ERRORTAG_ATTR_LIST(noerror),
- XFS_ERRORTAG_ATTR_LIST(iflush1),
- XFS_ERRORTAG_ATTR_LIST(iflush2),
- XFS_ERRORTAG_ATTR_LIST(iflush3),
- XFS_ERRORTAG_ATTR_LIST(iflush4),
- XFS_ERRORTAG_ATTR_LIST(iflush5),
- XFS_ERRORTAG_ATTR_LIST(iflush6),
- XFS_ERRORTAG_ATTR_LIST(dareadbuf),
- XFS_ERRORTAG_ATTR_LIST(btree_chk_lblk),
- XFS_ERRORTAG_ATTR_LIST(btree_chk_sblk),
- XFS_ERRORTAG_ATTR_LIST(readagf),
- XFS_ERRORTAG_ATTR_LIST(readagi),
- XFS_ERRORTAG_ATTR_LIST(itobp),
- XFS_ERRORTAG_ATTR_LIST(iunlink),
- XFS_ERRORTAG_ATTR_LIST(iunlinkrm),
- XFS_ERRORTAG_ATTR_LIST(dirinovalid),
- XFS_ERRORTAG_ATTR_LIST(bulkstat),
- XFS_ERRORTAG_ATTR_LIST(logiodone),
- XFS_ERRORTAG_ATTR_LIST(stratread),
- XFS_ERRORTAG_ATTR_LIST(stratcmpl),
- XFS_ERRORTAG_ATTR_LIST(diowrite),
- XFS_ERRORTAG_ATTR_LIST(bmapifmt),
- XFS_ERRORTAG_ATTR_LIST(free_extent),
- XFS_ERRORTAG_ATTR_LIST(rmap_finish_one),
- XFS_ERRORTAG_ATTR_LIST(refcount_continue_update),
- XFS_ERRORTAG_ATTR_LIST(refcount_finish_one),
- XFS_ERRORTAG_ATTR_LIST(bmap_finish_one),
- XFS_ERRORTAG_ATTR_LIST(ag_resv_critical),
- XFS_ERRORTAG_ATTR_LIST(log_bad_crc),
- XFS_ERRORTAG_ATTR_LIST(log_item_pin),
- XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
- XFS_ERRORTAG_ATTR_LIST(force_repair),
- XFS_ERRORTAG_ATTR_LIST(bad_summary),
- XFS_ERRORTAG_ATTR_LIST(iunlink_fallback),
- XFS_ERRORTAG_ATTR_LIST(buf_ioerror),
- XFS_ERRORTAG_ATTR_LIST(reduce_max_iextents),
- XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent),
- XFS_ERRORTAG_ATTR_LIST(ag_resv_fail),
- XFS_ERRORTAG_ATTR_LIST(larp),
- XFS_ERRORTAG_ATTR_LIST(da_leaf_split),
- XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node),
- XFS_ERRORTAG_ATTR_LIST(wb_delay_ms),
- XFS_ERRORTAG_ATTR_LIST(write_delay_ms),
- XFS_ERRORTAG_ATTR_LIST(exchmaps_finish_one),
- XFS_ERRORTAG_ATTR_LIST(metafile_resv_crit),
- NULL,
+ XFS_ERRTAGS
+ NULL
};
ATTRIBUTE_GROUPS(xfs_errortag);
+#undef XFS_ERRTAG
+
+/* -1 because XFS_ERRTAG_DROP_WRITES got removed, + 1 for NULL termination */
+static_assert(ARRAY_SIZE(xfs_errortag_attrs) == XFS_ERRTAG_MAX);
static const struct kobj_type xfs_errortag_ktype = {
.release = xfs_sysfs_release,
@@ -295,7 +165,6 @@ xfs_errortag_enabled(
bool
xfs_errortag_test(
struct xfs_mount *mp,
- const char *expression,
const char *file,
int line,
unsigned int error_tag)
@@ -321,36 +190,12 @@ xfs_errortag_test(
return false;
xfs_warn_ratelimited(mp,
-"Injecting error (%s) at file %s, line %d, on filesystem \"%s\"",
- expression, file, line, mp->m_super->s_id);
+"Injecting error at file %s, line %d, on filesystem \"%s\"",
+ file, line, mp->m_super->s_id);
return true;
}
int
-xfs_errortag_get(
- struct xfs_mount *mp,
- unsigned int error_tag)
-{
- if (!xfs_errortag_valid(error_tag))
- return -EINVAL;
-
- return mp->m_errortag[error_tag];
-}
-
-int
-xfs_errortag_set(
- struct xfs_mount *mp,
- unsigned int error_tag,
- unsigned int tag_value)
-{
- if (!xfs_errortag_valid(error_tag))
- return -EINVAL;
-
- mp->m_errortag[error_tag] = tag_value;
- return 0;
-}
-
-int
xfs_errortag_add(
struct xfs_mount *mp,
unsigned int error_tag)
@@ -359,9 +204,8 @@ xfs_errortag_add(
if (!xfs_errortag_valid(error_tag))
return -EINVAL;
-
- return xfs_errortag_set(mp, error_tag,
- xfs_errortag_random_default[error_tag]);
+ mp->m_errortag[error_tag] = xfs_errortag_random_default[error_tag];
+ return 0;
}
int
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 0b9c5ba8a598..fe6a71bbe9cd 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -8,22 +8,17 @@
struct xfs_mount;
-extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
- const char *filename, int linenum,
- xfs_failaddr_t failaddr);
-extern void xfs_corruption_error(const char *tag, int level,
- struct xfs_mount *mp, const void *buf, size_t bufsize,
- const char *filename, int linenum,
- xfs_failaddr_t failaddr);
+void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
+ const char *filename, int linenum, xfs_failaddr_t failaddr);
+void xfs_corruption_error(const char *tag, int level, struct xfs_mount *mp,
+ const void *buf, size_t bufsize, const char *filename,
+ int linenum, xfs_failaddr_t failaddr);
void xfs_buf_corruption_error(struct xfs_buf *bp, xfs_failaddr_t fa);
-extern void xfs_buf_verifier_error(struct xfs_buf *bp, int error,
- const char *name, const void *buf, size_t bufsz,
- xfs_failaddr_t failaddr);
-extern void xfs_verifier_error(struct xfs_buf *bp, int error,
- xfs_failaddr_t failaddr);
-extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error,
- const char *name, const void *buf, size_t bufsz,
- xfs_failaddr_t failaddr);
+void xfs_buf_verifier_error(struct xfs_buf *bp, int error, const char *name,
+ const void *buf, size_t bufsz, xfs_failaddr_t failaddr);
+void xfs_verifier_error(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr);
+void xfs_inode_verifier_error(struct xfs_inode *ip, int error, const char *name,
+ const void *buf, size_t bufsz, xfs_failaddr_t failaddr);
#define XFS_ERROR_REPORT(e, lvl, mp) \
xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address)
@@ -39,12 +34,12 @@ extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error,
#define XFS_CORRUPTION_DUMP_LEN (128)
#ifdef DEBUG
-extern int xfs_errortag_init(struct xfs_mount *mp);
-extern void xfs_errortag_del(struct xfs_mount *mp);
-extern bool xfs_errortag_test(struct xfs_mount *mp, const char *expression,
- const char *file, int line, unsigned int error_tag);
-#define XFS_TEST_ERROR(expr, mp, tag) \
- ((expr) || xfs_errortag_test((mp), #expr, __FILE__, __LINE__, (tag)))
+int xfs_errortag_init(struct xfs_mount *mp);
+void xfs_errortag_del(struct xfs_mount *mp);
+bool xfs_errortag_test(struct xfs_mount *mp, const char *file, int line,
+ unsigned int error_tag);
+#define XFS_TEST_ERROR(mp, tag) \
+ xfs_errortag_test((mp), __FILE__, __LINE__, (tag))
bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag);
#define XFS_ERRORTAG_DELAY(mp, tag) \
do { \
@@ -58,17 +53,13 @@ bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag);
mdelay((mp)->m_errortag[(tag)]); \
} while (0)
-extern int xfs_errortag_get(struct xfs_mount *mp, unsigned int error_tag);
-extern int xfs_errortag_set(struct xfs_mount *mp, unsigned int error_tag,
- unsigned int tag_value);
-extern int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag);
-extern int xfs_errortag_clearall(struct xfs_mount *mp);
+int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag);
+int xfs_errortag_clearall(struct xfs_mount *mp);
#else
#define xfs_errortag_init(mp) (0)
#define xfs_errortag_del(mp)
-#define XFS_TEST_ERROR(expr, mp, tag) (expr)
+#define XFS_TEST_ERROR(mp, tag) (false)
#define XFS_ERRORTAG_DELAY(mp, tag) ((void)0)
-#define xfs_errortag_set(mp, tag, val) (ENOSYS)
#define xfs_errortag_add(mp, tag) (ENOSYS)
#define xfs_errortag_clearall(mp) (ENOSYS)
#endif /* DEBUG */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 47ee598a9827..418ddab590e0 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -202,7 +202,7 @@ xfs_efi_copy_format(
sizeof(struct xfs_extent));
return 0;
} else if (buf->iov_len == len32) {
- xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->iov_base;
+ struct xfs_efi_log_format_32 *src_efi_fmt_32 = buf->iov_base;
dst_efi_fmt->efi_type = src_efi_fmt_32->efi_type;
dst_efi_fmt->efi_size = src_efi_fmt_32->efi_size;
@@ -216,7 +216,7 @@ xfs_efi_copy_format(
}
return 0;
} else if (buf->iov_len == len64) {
- xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->iov_base;
+ struct xfs_efi_log_format_64 *src_efi_fmt_64 = buf->iov_base;
dst_efi_fmt->efi_type = src_efi_fmt_64->efi_type;
dst_efi_fmt->efi_size = src_efi_fmt_64->efi_size;
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index c8402040410b..af1b0331f7af 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -49,7 +49,7 @@ struct xfs_efi_log_item {
struct xfs_log_item efi_item;
atomic_t efi_refcount;
atomic_t efi_next_extent;
- xfs_efi_log_format_t efi_format;
+ struct xfs_efi_log_format efi_format;
};
static inline size_t
@@ -69,7 +69,7 @@ struct xfs_efd_log_item {
struct xfs_log_item efd_item;
struct xfs_efi_log_item *efd_efip;
uint efd_next_extent;
- xfs_efd_log_format_t efd_format;
+ struct xfs_efd_log_format efd_format;
};
static inline size_t
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 55a304cb3aef..2702fef2c90c 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -75,52 +75,47 @@ xfs_dir_fsync(
return xfs_log_force_inode(ip);
}
-static xfs_csn_t
-xfs_fsync_seq(
- struct xfs_inode *ip,
- bool datasync)
-{
- if (!xfs_ipincount(ip))
- return 0;
- if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
- return 0;
- return ip->i_itemp->ili_commit_seq;
-}
-
/*
- * All metadata updates are logged, which means that we just have to flush the
- * log up to the latest LSN that touched the inode.
+ * All metadata updates are logged, which means that we just have to push the
+ * journal to the required sequence number that holds the updates. We track
+ * datasync commits separately to full sync commits, and hence only need to
+ * select the correct sequence number for the log force here.
*
- * If we have concurrent fsync/fdatasync() calls, we need them to all block on
- * the log force before we clear the ili_fsync_fields field. This ensures that
- * we don't get a racing sync operation that does not wait for the metadata to
- * hit the journal before returning. If we race with clearing ili_fsync_fields,
- * then all that will happen is the log force will do nothing as the lsn will
- * already be on disk. We can't race with setting ili_fsync_fields because that
- * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
- * shared until after the ili_fsync_fields is cleared.
+ * We don't have to serialise against concurrent modifications, as we do not
+ * have to wait for modifications that have not yet completed. We define a
+ * transaction commit as completing when the commit sequence number is updated,
+ * hence if the sequence number has not been updated, the sync operation has
+ * been run before the commit completed and we don't have to wait for it.
+ *
+ * If we have concurrent fsync/fdatasync() calls, the sequence numbers remain
+ * set on the log item until - at least - the journal flush completes. In
+ * reality, they are only cleared when the inode is fully unpinned (i.e.
+ * persistent in the journal and not dirty in the CIL), and so we rely on
+ * xfs_log_force_seq() either skipping sequences that have been persisted or
+ * waiting on sequences that are still in flight to correctly order concurrent
+ * sync operations.
*/
-static int
+static int
xfs_fsync_flush_log(
struct xfs_inode *ip,
bool datasync,
int *log_flushed)
{
- int error = 0;
- xfs_csn_t seq;
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+ xfs_csn_t seq = 0;
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- seq = xfs_fsync_seq(ip, datasync);
- if (seq) {
- error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
- log_flushed);
+ spin_lock(&iip->ili_lock);
+ if (datasync)
+ seq = iip->ili_datasync_seq;
+ else
+ seq = iip->ili_commit_seq;
+ spin_unlock(&iip->ili_lock);
- spin_lock(&ip->i_itemp->ili_lock);
- ip->i_itemp->ili_fsync_fields = 0;
- spin_unlock(&ip->i_itemp->ili_lock);
- }
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- return error;
+ if (!seq)
+ return 0;
+
+ return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
+ log_flushed);
}
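+
+/*
+ * For instance (assuming commits that only touch timestamps do not update
+ * ili_datasync_seq, which is the point of tracking it separately), an
+ * fdatasync() issued after such a commit on an otherwise-clean inode finds
+ * ili_datasync_seq still zero in xfs_fsync_flush_log() and skips the log
+ * force, while a full fsync() reads ili_commit_seq and waits for that
+ * commit to reach stable storage.
+ */
+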
STATIC int
@@ -158,12 +153,10 @@ xfs_file_fsync(
error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
/*
- * Any inode that has dirty modifications in the log is pinned. The
- * racy check here for a pinned inode will not catch modifications
- * that happen concurrently to the fsync call, but fsync semantics
- * only require to sync previously completed I/O.
+	 * If the inode has an inode log item attached, it may need the journal
+ * flushed to persist any changes the log item might be tracking.
*/
- if (xfs_ipincount(ip)) {
+ if (ip->i_itemp) {
err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
if (err2 && !error)
error = err2;
@@ -1101,9 +1094,6 @@ xfs_file_write_iter(
if (xfs_is_shutdown(ip->i_mount))
return -EIO;
- if (IS_DAX(inode))
- return xfs_file_dax_write(iocb, from);
-
if (iocb->ki_flags & IOCB_ATOMIC) {
if (ocount < xfs_get_atomic_write_min(ip))
return -EINVAL;
@@ -1116,6 +1106,9 @@ xfs_file_write_iter(
return ret;
}
+ if (IS_DAX(inode))
+ return xfs_file_dax_write(iocb, from);
+
if (iocb->ki_flags & IOCB_DIRECT) {
/*
* Allow a directio write to fall back to a buffered
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index f6f628c01feb..566fd663c95b 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -14,8 +14,6 @@
*/
xfs_param_t xfs_params = {
/* MIN DFLT MAX */
- .sgid_inherit = { 0, 0, 1 },
- .symlink_mode = { 0, 0, 1 },
.panic_mask = { 0, 0, XFS_PTAG_MASK},
.error_level = { 0, 3, 11 },
.syncd_timer = { 1*100, 30*100, 7200*100},
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 4cf7abe50143..e44040206851 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -646,8 +646,7 @@ xfs_iget_cache_miss(
goto out_destroy;
/*
- * For version 5 superblocks, if we are initialising a new inode and we
- * are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can
+ * For version 5 superblocks, if we are initialising a new inode, we
* simply build the new inode core with a random generation number.
*
* For version 4 (and older) superblocks, log recovery is dependent on
@@ -655,8 +654,7 @@ xfs_iget_cache_miss(
* value and hence we must also read the inode off disk even when
* initializing new inodes.
*/
- if (xfs_has_v3inodes(mp) &&
- (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) {
+ if (xfs_has_v3inodes(mp) && (flags & XFS_IGET_CREATE)) {
VFS_I(ip)->i_generation = get_random_u32();
} else {
struct xfs_buf *bp;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 9c39251961a3..36b39539e561 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -877,6 +877,35 @@ xfs_create_tmpfile(
return error;
}
+static inline int
+xfs_projid_differ(
+ struct xfs_inode *tdp,
+ struct xfs_inode *sip)
+{
+ /*
+ * If we are using project inheritance, we only allow hard link/renames
+ * creation in our tree when the project IDs are the same; else
+ * the tree quota mechanism could be circumvented.
+ */
+ if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
+ tdp->i_projid != sip->i_projid)) {
+ /*
+ * Project quota setup skips special files which can
+ * leave inodes in a PROJINHERIT directory without a
+ * project ID set. We need to allow links to be made
+ * to these "project-less" inodes because userspace
+ * expects them to succeed after project ID setup,
+ * but everything else should be rejected.
+ */
+ if (!special_file(VFS_I(sip)->i_mode) ||
+ sip->i_projid != 0) {
+ return -EXDEV;
+ }
+ }
+
+ return 0;
+}
+
int
xfs_link(
struct xfs_inode *tdp,
@@ -930,27 +959,9 @@ xfs_link(
goto error_return;
}
- /*
- * If we are using project inheritance, we only allow hard link
- * creation in our tree when the project IDs are the same; else
- * the tree quota mechanism could be circumvented.
- */
- if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
- tdp->i_projid != sip->i_projid)) {
- /*
- * Project quota setup skips special files which can
- * leave inodes in a PROJINHERIT directory without a
- * project ID set. We need to allow links to be made
- * to these "project-less" inodes because userspace
- * expects them to succeed after project ID setup,
- * but everything else should be rejected.
- */
- if (!special_file(VFS_I(sip)->i_mode) ||
- sip->i_projid != 0) {
- error = -EXDEV;
- goto error_return;
- }
- }
+ error = xfs_projid_differ(tdp, sip);
+ if (error)
+ goto error_return;
error = xfs_dir_add_child(tp, resblks, &du);
if (error)
@@ -1035,7 +1046,7 @@ xfs_itruncate_extents_flags(
int error = 0;
xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
- if (atomic_read(&VFS_I(ip)->i_count))
+ if (icount_read(VFS_I(ip)))
xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
ASSERT(new_size <= XFS_ISIZE(ip));
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
@@ -1656,7 +1667,6 @@ retry:
spin_lock(&iip->ili_lock);
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
- iip->ili_fsync_fields = 0;
spin_unlock(&iip->ili_lock);
ASSERT(iip->ili_last_fields);
@@ -1821,12 +1831,20 @@ static void
xfs_iunpin(
struct xfs_inode *ip)
{
- xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED);
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+ xfs_csn_t seq = 0;
trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED);
+
+ spin_lock(&iip->ili_lock);
+ seq = iip->ili_commit_seq;
+ spin_unlock(&iip->ili_lock);
+ if (!seq)
+ return;
/* Give the log a push to start the unpinning I/O */
- xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL);
+ xfs_log_force_seq(ip->i_mount, seq, 0, NULL);
}
@@ -2227,16 +2245,9 @@ retry:
if (du_wip.ip)
xfs_trans_ijoin(tp, du_wip.ip, 0);
- /*
- * If we are using project inheritance, we only allow renames
- * into our tree when the project IDs are the same; else the
- * tree quota mechanism would be circumvented.
- */
- if (unlikely((target_dp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
- target_dp->i_projid != src_ip->i_projid)) {
- error = -EXDEV;
+ error = xfs_projid_differ(target_dp, src_ip);
+ if (error)
goto out_trans_cancel;
- }
/* RENAME_EXCHANGE is unique from here on. */
if (flags & RENAME_EXCHANGE) {
@@ -2377,8 +2388,8 @@ xfs_iflush(
* error handling as the caller will shutdown and fail the buffer.
*/
error = -EFSCORRUPTED;
- if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
- mp, XFS_ERRTAG_IFLUSH_1)) {
+ if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC) ||
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_1)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT,
__func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
@@ -2394,29 +2405,27 @@ xfs_iflush(
goto flush_out;
}
} else if (S_ISREG(VFS_I(ip)->i_mode)) {
- if (XFS_TEST_ERROR(
- ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
- ip->i_df.if_format != XFS_DINODE_FMT_BTREE,
- mp, XFS_ERRTAG_IFLUSH_3)) {
+ if ((ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
+ ip->i_df.if_format != XFS_DINODE_FMT_BTREE) ||
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_3)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: Bad regular inode %llu, ptr "PTR_FMT,
__func__, ip->i_ino, ip);
goto flush_out;
}
} else if (S_ISDIR(VFS_I(ip)->i_mode)) {
- if (XFS_TEST_ERROR(
- ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
- ip->i_df.if_format != XFS_DINODE_FMT_BTREE &&
- ip->i_df.if_format != XFS_DINODE_FMT_LOCAL,
- mp, XFS_ERRTAG_IFLUSH_4)) {
+ if ((ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
+ ip->i_df.if_format != XFS_DINODE_FMT_BTREE &&
+ ip->i_df.if_format != XFS_DINODE_FMT_LOCAL) ||
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_4)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: Bad directory inode %llu, ptr "PTR_FMT,
__func__, ip->i_ino, ip);
goto flush_out;
}
}
- if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) >
- ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
+ if (ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) >
+ ip->i_nblocks || XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_5)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: detected corrupt incore inode %llu, "
"total extents = %llu nblocks = %lld, ptr "PTR_FMT,
@@ -2425,8 +2434,8 @@ xfs_iflush(
ip->i_nblocks, ip);
goto flush_out;
}
- if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize,
- mp, XFS_ERRTAG_IFLUSH_6)) {
+ if (ip->i_forkoff > mp->m_sb.sb_inodesize ||
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_6)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT,
__func__, ip->i_ino, ip->i_forkoff, ip);
@@ -2502,7 +2511,6 @@ flush_out:
spin_lock(&iip->ili_lock);
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
- iip->ili_fsync_fields = 0;
set_bit(XFS_LI_FLUSHING, &iip->ili_item.li_flags);
spin_unlock(&iip->ili_lock);
@@ -2661,12 +2669,15 @@ int
xfs_log_force_inode(
struct xfs_inode *ip)
{
+ struct xfs_inode_log_item *iip = ip->i_itemp;
xfs_csn_t seq = 0;
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (xfs_ipincount(ip))
- seq = ip->i_itemp->ili_commit_seq;
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ if (!iip)
+ return 0;
+
+ spin_lock(&iip->ili_lock);
+ seq = iip->ili_commit_seq;
+ spin_unlock(&iip->ili_lock);
if (!seq)
return 0;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 07fbdcc4cbf5..bd6d33557194 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -358,9 +358,20 @@ static inline bool xfs_inode_has_bigrtalloc(const struct xfs_inode *ip)
static inline bool xfs_inode_can_hw_atomic_write(const struct xfs_inode *ip)
{
+ if (IS_DAX(VFS_IC(ip)))
+ return false;
+
return xfs_inode_buftarg(ip)->bt_awu_max > 0;
}
+static inline bool xfs_inode_can_sw_atomic_write(const struct xfs_inode *ip)
+{
+ if (IS_DAX(VFS_IC(ip)))
+ return false;
+
+ return xfs_can_sw_atomic_write(ip->i_mount);
+}
+
/*
* In-core inode flags.
*/
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 829675700fcd..1bd411a1114c 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -131,46 +131,28 @@ xfs_inode_item_precommit(
}
/*
- * Inode verifiers do not check that the extent size hint is an integer
- * multiple of the rt extent size on a directory with both rtinherit
- * and extszinherit flags set. If we're logging a directory that is
- * misconfigured in this way, clear the hint.
+ * Inode verifiers do not check that the extent size hints are an
+ * integer multiple of the rt extent size on a directory with the
+ * rtinherit flag set. If we're logging a directory that is
+ * misconfigured in this way, clear the bad hints.
*/
- if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
- (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
- xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) {
- ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
- XFS_DIFLAG_EXTSZINHERIT);
- ip->i_extsize = 0;
- flags |= XFS_ILOG_CORE;
+ if (ip->i_diflags & XFS_DIFLAG_RTINHERIT) {
+ if ((ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
+ xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) {
+ ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
+ XFS_DIFLAG_EXTSZINHERIT);
+ ip->i_extsize = 0;
+ flags |= XFS_ILOG_CORE;
+ }
+ if ((ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+ xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) {
+ ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
+ ip->i_cowextsize = 0;
+ flags |= XFS_ILOG_CORE;
+ }
}
- /*
- * Record the specific change for fdatasync optimisation. This allows
- * fdatasync to skip log forces for inodes that are only timestamp
- * dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it
- * to XFS_ILOG_CORE so that the actual on-disk dirty tracking
- * (ili_fields) correctly tracks that the version has changed.
- */
spin_lock(&iip->ili_lock);
- iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION);
- if (flags & XFS_ILOG_IVERSION)
- flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE);
-
- /*
- * Inode verifiers do not check that the CoW extent size hint is an
- * integer multiple of the rt extent size on a directory with both
- * rtinherit and cowextsize flags set. If we're logging a directory
- * that is misconfigured in this way, clear the hint.
- */
- if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
- (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
- xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) {
- ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
- ip->i_cowextsize = 0;
- flags |= XFS_ILOG_CORE;
- }
-
if (!iip->ili_item.li_buf) {
struct xfs_buf *bp;
int error;
@@ -205,6 +187,20 @@ xfs_inode_item_precommit(
}
/*
+ * Store the dirty flags back into the inode item as this state is used
+ * later on in xfs_inode_item_committing() to determine whether the
+ * transaction is relevant to fsync state or not.
+ */
+ iip->ili_dirty_flags = flags;
+
+ /*
+ * Convert the flags for the on-disk fields that have been modified in
+ * the transaction so that ili_fields tracks the changes correctly.
+ */
+ if (flags & XFS_ILOG_IVERSION)
+ flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE);
+
+ /*
* Always OR in the bits from the ili_last_fields field. This is to
* coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines
* in the eventual clearing of the ili_fields bits. See the big comment
@@ -214,12 +210,6 @@ xfs_inode_item_precommit(
spin_unlock(&iip->ili_lock);
xfs_inode_item_precommit_check(ip);
-
- /*
- * We are done with the log item transaction dirty state, so clear it so
- * that it doesn't pollute future transactions.
- */
- iip->ili_dirty_flags = 0;
return 0;
}
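Returning to the hint clearing at the top of xfs_inode_item_precommit(): the check relies on xfs_extlen_to_rtxmod() returning the remainder of the hint length modulo the realtime extent size. A small worked example with illustrative numbers (not values from the patch):

	/*
	 * With sb_rextsize = 16 blocks, an inherited extsize hint of 24
	 * blocks is not an integer multiple of the rt extent size:
	 *
	 *	xfs_extlen_to_rtxmod(mp, 24) == 24 % 16 == 8	(non-zero)
	 *
	 * so a directory with rtinherit set and such a hint has the hint
	 * and its inherit flags cleared before the inode core is logged.
	 */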
@@ -729,13 +719,24 @@ xfs_inode_item_unpin(
struct xfs_log_item *lip,
int remove)
{
- struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode;
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
trace_xfs_inode_unpin(ip, _RET_IP_);
ASSERT(lip->li_buf || xfs_iflags_test(ip, XFS_ISTALE));
ASSERT(atomic_read(&ip->i_pincount) > 0);
- if (atomic_dec_and_test(&ip->i_pincount))
+
+ /*
+ * If this is the last unpin, then the inode no longer needs a journal
+ * flush to persist it. Hence we can clear the commit sequence numbers
+ * as a fsync/fdatasync operation on the inode at this point is a no-op.
+ */
+ if (atomic_dec_and_lock(&ip->i_pincount, &iip->ili_lock)) {
+ iip->ili_commit_seq = 0;
+ iip->ili_datasync_seq = 0;
+ spin_unlock(&iip->ili_lock);
wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
+ }
}
STATIC uint
@@ -858,12 +859,45 @@ xfs_inode_item_committed(
return lsn;
}
+/*
+ * The modification is now complete, so before we unlock the inode we need to
+ * update the commit sequence numbers for data integrity journal flushes. We
+ * always record the commit sequence number (ili_commit_seq) so that anything
+ * that needs a full journal sync will capture all of this modification.
+ *
+ * We then check if the changes will impact a datasync (O_DSYNC) journal
+ * flush. If the changes will require a datasync flush, then we also record
+ * the sequence in ili_datasync_seq.
+ *
+ * These commit sequence numbers are cleared atomically with the inode being
+ * unpinned (i.e. the pin count goes to zero), and so they are only set while
+ * the inode is dirty in the journal. This removes the need for checking if
+ * the inode is pinned to determine if a journal flush is necessary, and hence
+ * removes the need for holding the ILOCK_SHARED in xfs_file_fsync() to
+ * serialise pin counts against commit sequence number updates.
+ */
STATIC void
xfs_inode_item_committing(
struct xfs_log_item *lip,
xfs_csn_t seq)
{
- INODE_ITEM(lip)->ili_commit_seq = seq;
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+
+ spin_lock(&iip->ili_lock);
+ iip->ili_commit_seq = seq;
+ if (iip->ili_dirty_flags & ~(XFS_ILOG_IVERSION | XFS_ILOG_TIMESTAMP))
+ iip->ili_datasync_seq = seq;
+ spin_unlock(&iip->ili_lock);
+
+ /*
+ * Clear the per-transaction dirty flags now that we have finished
+ * recording the transaction's inode modifications in the CIL and are
+ * about to release and (maybe) unlock the inode.
+ */
+ iip->ili_dirty_flags = 0;
+
return xfs_inode_item_release(lip);
}
@@ -1055,7 +1089,6 @@ xfs_iflush_abort_clean(
{
iip->ili_last_fields = 0;
iip->ili_fields = 0;
- iip->ili_fsync_fields = 0;
iip->ili_flush_lsn = 0;
iip->ili_item.li_buf = NULL;
list_del_init(&iip->ili_item.li_bio_list);
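The net effect of carrying both sequence numbers is that fsync and fdatasync can decide whether a journal force is needed without taking ILOCK_SHARED or inspecting the pin count. A rough sketch of how a caller might pick the sequence to force to; the helper name is hypothetical and only the two ili_*_seq fields come from this patch:

	/*
	 * Hypothetical helper: ili_commit_seq covers any logged change,
	 * while ili_datasync_seq is only set when something beyond
	 * timestamps/i_version was logged, so fdatasync can skip the log
	 * force for timestamp-only updates.
	 */
	static xfs_csn_t
	xfs_fsync_seq_sketch(
		struct xfs_inode	*ip,
		bool			datasync)
	{
		struct xfs_inode_log_item *iip = ip->i_itemp;
		xfs_csn_t		seq = 0;

		if (!iip)
			return 0;

		spin_lock(&iip->ili_lock);
		seq = datasync ? iip->ili_datasync_seq : iip->ili_commit_seq;
		spin_unlock(&iip->ili_lock);
		return seq;
	}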
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index ba92ce11a011..2ddcca41714f 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -32,9 +32,17 @@ struct xfs_inode_log_item {
spinlock_t ili_lock; /* flush state lock */
unsigned int ili_last_fields; /* fields when flushed */
unsigned int ili_fields; /* fields to be logged */
- unsigned int ili_fsync_fields; /* logged since last fsync */
xfs_lsn_t ili_flush_lsn; /* lsn at last flush */
+
+ /*
+ * We record the sequence number for every inode modification, as
+ * well as those that only require fdatasync operations for data
+ * integrity. This allows optimisation of the O_DSYNC/fdatasync path
+ * without needing to track what modifications the journal is currently
+ * carrying for the inode. These are protected by the above ili_lock.
+ */
xfs_csn_t ili_commit_seq; /* last transaction commit */
+ xfs_csn_t ili_datasync_seq; /* for datasync optimisation */
};
static inline int xfs_inode_clean(struct xfs_inode *ip)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index fe1f74a3b6a3..a6bb7ee7a27a 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -219,7 +219,7 @@ xfs_bulk_ireq_setup(
else if (XFS_INO_TO_AGNO(mp, breq->startino) < hdr->agno)
return -EINVAL;
- breq->flags |= XFS_IBULK_SAME_AG;
+ breq->iwalk_flags |= XFS_IWALK_SAME_AG;
/* Asking for an inode past the end of the AG? We're done! */
if (XFS_INO_TO_AGNO(mp, breq->startino) > hdr->agno)
@@ -512,9 +512,6 @@ xfs_fileattr_get(
{
struct xfs_inode *ip = XFS_I(d_inode(dentry));
- if (d_is_special(dentry))
- return -ENOTTY;
-
xfs_ilock(ip, XFS_ILOCK_SHARED);
xfs_fill_fsxattr(ip, XFS_DATA_FORK, fa);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -736,9 +733,6 @@ xfs_fileattr_set(
trace_xfs_ioctl_setattr(ip);
- if (d_is_special(dentry))
- return -ENOTTY;
-
if (!fa->fsx_valid) {
if (fa->flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL |
FS_NOATIME_FL | FS_NODUMP_FL |
@@ -1209,21 +1203,21 @@ xfs_file_ioctl(
current->comm);
return -ENOTTY;
case XFS_IOC_DIOINFO: {
- struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ struct kstat st;
struct dioattr da;
- da.d_mem = target->bt_logical_sectorsize;
+ error = vfs_getattr(&filp->f_path, &st, STATX_DIOALIGN, 0);
+ if (error)
+ return error;
/*
- * See xfs_report_dioalign() for an explanation about why this
- * reports a value larger than the sector size for COW inodes.
+ * Some userspace directly feeds the return value to
+ * posix_memalign, which fails for values that are smaller than
+ * the pointer size. Round up the value to not break userspace.
*/
- if (xfs_is_cow_inode(ip))
- da.d_miniosz = xfs_inode_alloc_unitsize(ip);
- else
- da.d_miniosz = target->bt_logical_sectorsize;
+ da.d_mem = roundup(st.dio_mem_align, sizeof(void *));
+ da.d_miniosz = st.dio_offset_align;
da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
-
if (copy_to_user(arg, &da, sizeof(da)))
return -EFAULT;
return 0;
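The roundup of d_mem exists because of how the ioctl result is commonly consumed in userspace; a hedged sketch of that pattern (assuming the xfsprogs-installed <xfs/xfs_fs.h> header for struct dioattr, not code from this patch):

	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <xfs/xfs_fs.h>		/* struct dioattr, XFS_IOC_DIOINFO */

	/* Allocate a buffer suitable for O_DIRECT I/O on an XFS file. */
	static void *alloc_dio_buffer(int fd, size_t len)
	{
		struct dioattr	da;
		void		*buf = NULL;

		if (ioctl(fd, XFS_IOC_DIOINFO, &da) < 0)
			return NULL;

		/*
		 * posix_memalign() requires the alignment to be a multiple
		 * of sizeof(void *), which is why the kernel now rounds
		 * d_mem up before reporting it.
		 */
		if (posix_memalign(&buf, da.d_mem, len))
			return NULL;
		return buf;
	}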
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 2a74f2957341..d3f6e3e42a11 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -149,9 +149,18 @@ xfs_bmbt_to_iomap(
iomap->bdev = target->bt_bdev;
iomap->flags = iomap_flags;
- if (xfs_ipincount(ip) &&
- (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
- iomap->flags |= IOMAP_F_DIRTY;
+ /*
+ * If the inode is dirty for datasync purposes, let iomap know so it
+ * doesn't elide the IO completion journal flushes on O_DSYNC IO.
+ */
+ if (ip->i_itemp) {
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+
+ spin_lock(&iip->ili_lock);
+ if (iip->ili_datasync_seq)
+ iomap->flags |= IOMAP_F_DIRTY;
+ spin_unlock(&iip->ili_lock);
+ }
iomap->validity_cookie = sequence_cookie;
return 0;
@@ -1554,7 +1563,7 @@ xfs_zoned_buffered_write_iomap_begin(
return error;
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
error = -EFSCORRUPTED;
goto out_unlock;
@@ -1728,7 +1737,7 @@ xfs_buffered_write_iomap_begin(
return error;
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
error = -EFSCORRUPTED;
goto out_unlock;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 149b5460fbfd..caff0125faea 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -431,14 +431,12 @@ xfs_vn_symlink(
struct dentry *dentry,
const char *symname)
{
- struct inode *inode;
- struct xfs_inode *cip = NULL;
- struct xfs_name name;
- int error;
- umode_t mode;
+ struct inode *inode;
+ struct xfs_inode *cip = NULL;
+ struct xfs_name name;
+ int error;
+ umode_t mode = S_IFLNK | S_IRWXUGO;
- mode = S_IFLNK |
- (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
error = xfs_dentry_mode_to_name(&name, dentry, mode);
if (unlikely(error))
goto out;
@@ -616,7 +614,8 @@ xfs_get_atomic_write_min(
* write of exactly one single fsblock if the bdev will make that
* guarantee for us.
*/
- if (xfs_inode_can_hw_atomic_write(ip) || xfs_can_sw_atomic_write(mp))
+ if (xfs_inode_can_hw_atomic_write(ip) ||
+ xfs_inode_can_sw_atomic_write(ip))
return mp->m_sb.sb_blocksize;
return 0;
@@ -633,7 +632,7 @@ xfs_get_atomic_write_max(
* write of exactly one single fsblock if the bdev will make that
* guarantee for us.
*/
- if (!xfs_can_sw_atomic_write(mp)) {
+ if (!xfs_inode_can_sw_atomic_write(ip)) {
if (xfs_inode_can_hw_atomic_write(ip))
return mp->m_sb.sb_blocksize;
return 0;
@@ -1334,6 +1333,8 @@ static const struct inode_operations xfs_symlink_inode_operations = {
.setattr = xfs_vn_setattr,
.listxattr = xfs_vn_listxattr,
.update_time = xfs_vn_update_time,
+ .fileattr_get = xfs_fileattr_get,
+ .fileattr_set = xfs_fileattr_set,
};
/* Figure out if this file actually supports DAX. */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index c8c9b8d8309f..2aa37a4d2706 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -307,7 +307,6 @@ xfs_bulkstat(
.breq = breq,
};
struct xfs_trans *tp;
- unsigned int iwalk_flags = 0;
int error;
if (breq->idmap != &nop_mnt_idmap) {
@@ -328,10 +327,7 @@ xfs_bulkstat(
* locking abilities to detect cycles in the inobt without deadlocking.
*/
tp = xfs_trans_alloc_empty(breq->mp);
- if (breq->flags & XFS_IBULK_SAME_AG)
- iwalk_flags |= XFS_IWALK_SAME_AG;
-
- error = xfs_iwalk(breq->mp, tp, breq->startino, iwalk_flags,
+ error = xfs_iwalk(breq->mp, tp, breq->startino, breq->iwalk_flags,
xfs_bulkstat_iwalk, breq->icount, &bc);
xfs_trans_cancel(tp);
kfree(bc.buf);
@@ -457,7 +453,7 @@ xfs_inumbers(
* locking abilities to detect cycles in the inobt without deadlocking.
*/
tp = xfs_trans_alloc_empty(breq->mp);
- error = xfs_inobt_walk(breq->mp, tp, breq->startino, breq->flags,
+ error = xfs_inobt_walk(breq->mp, tp, breq->startino, breq->iwalk_flags,
xfs_inumbers_walk, breq->icount, &ic);
xfs_trans_cancel(tp);
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index f10e8f8f2335..2d0612f14d6e 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -13,17 +13,15 @@ struct xfs_ibulk {
xfs_ino_t startino; /* start with this inode */
unsigned int icount; /* number of elements in ubuffer */
unsigned int ocount; /* number of records returned */
- unsigned int flags; /* see XFS_IBULK_FLAG_* */
+ unsigned int flags; /* XFS_IBULK_FLAG_* */
+ unsigned int iwalk_flags; /* XFS_IWALK_FLAG_* */
};
-/* Only iterate within the same AG as startino */
-#define XFS_IBULK_SAME_AG (1U << 0)
-
/* Fill out the bs_extents64 field if set. */
-#define XFS_IBULK_NREXT64 (1U << 1)
+#define XFS_IBULK_NREXT64 (1U << 0)
/* Signal that we can return metadata directories. */
-#define XFS_IBULK_METADIR (1U << 2)
+#define XFS_IBULK_METADIR (1U << 1)
/*
* Advance the user buffer pointer by one record of the given size. If the
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 9a2221b4aa21..4dd747bdbcca 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -89,8 +89,6 @@ typedef __u32 xfs_nlink_t;
#undef XFS_NATIVE_HOST
#endif
-#define irix_sgid_inherit xfs_params.sgid_inherit.val
-#define irix_symlink_mode xfs_params.symlink_mode.val
#define xfs_panic_mask xfs_params.panic_mask.val
#define xfs_error_level xfs_params.error_level.val
#define xfs_syncd_centisecs xfs_params.syncd_timer.val
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index c8a57e21a1d3..603e85c1ab4c 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -969,8 +969,8 @@ xfs_log_unmount_write(
* counters will be recalculated. Refer to xlog_check_unmount_rec for
* more details.
*/
- if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp,
- XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
+ if (xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS) ||
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
xfs_alert(mp, "%s: will fix summary counters at next mount",
__func__);
return;
@@ -1240,7 +1240,7 @@ xlog_ioend_work(
/*
* Race to shutdown the filesystem if we see an error.
*/
- if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) {
+ if (error || XFS_TEST_ERROR(log->l_mp, XFS_ERRTAG_IODONE_IOERR)) {
xfs_alert(log->l_mp, "log I/O error %d", error);
xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
}
@@ -1489,8 +1489,7 @@ xlog_alloc_log(
log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */
log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s",
- XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM |
- WQ_HIGHPRI),
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_PERCPU),
0, mp->m_super->s_id);
if (!log->l_ioend_workqueue)
goto out_free_iclog;
@@ -1568,13 +1567,13 @@ xlog_cksum(
struct xlog *log,
struct xlog_rec_header *rhead,
char *dp,
- int size)
+ unsigned int hdrsize,
+ unsigned int size)
{
uint32_t crc;
/* first generate the crc for the record header ... */
- crc = xfs_start_cksum_update((char *)rhead,
- sizeof(struct xlog_rec_header),
+ crc = xfs_start_cksum_update((char *)rhead, hdrsize,
offsetof(struct xlog_rec_header, h_crc));
/* ... then for additional cycle data for v2 logs ... */
@@ -1818,7 +1817,7 @@ xlog_sync(
 /* calculate the checksum */
iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
- iclog->ic_datap, size);
+ iclog->ic_datap, XLOG_REC_SIZE, size);
/*
* Intentionally corrupt the log record CRC based on the error injection
* frequency, if defined. This facilitates testing log recovery in the
@@ -1827,7 +1826,7 @@ xlog_sync(
* detects the bad CRC and attempts to recover.
*/
#ifdef DEBUG
- if (XFS_TEST_ERROR(false, log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) {
+ if (XFS_TEST_ERROR(log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) {
iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA);
iclog->ic_fail_crc = true;
xfs_warn(log->l_mp,
@@ -2656,10 +2655,11 @@ restart:
* until you know exactly how many bytes get copied. Therefore, wait
* until later to update ic_offset.
*
- * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's
+ * xlog_write() algorithm assumes that at least 2 xlog_op_header's
* can fit into remaining data section.
*/
- if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) {
+ if (iclog->ic_size - iclog->ic_offset <
+ 2 * sizeof(struct xlog_op_header)) {
int error = 0;
xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
@@ -3153,11 +3153,11 @@ xlog_calc_unit_res(
*/
/* for trans header */
- unit_bytes += sizeof(xlog_op_header_t);
- unit_bytes += sizeof(xfs_trans_header_t);
+ unit_bytes += sizeof(struct xlog_op_header);
+ unit_bytes += sizeof(struct xfs_trans_header);
/* for start-rec */
- unit_bytes += sizeof(xlog_op_header_t);
+ unit_bytes += sizeof(struct xlog_op_header);
/*
* for LR headers - the space for data in an iclog is the size minus
@@ -3180,12 +3180,12 @@ xlog_calc_unit_res(
num_headers = howmany(unit_bytes, iclog_space);
/* for split-recs - ophdrs added when data split over LRs */
- unit_bytes += sizeof(xlog_op_header_t) * num_headers;
+ unit_bytes += sizeof(struct xlog_op_header) * num_headers;
/* add extra header reservations if we overrun */
while (!num_headers ||
howmany(unit_bytes, iclog_space) > num_headers) {
- unit_bytes += sizeof(xlog_op_header_t);
+ unit_bytes += sizeof(struct xlog_op_header);
num_headers++;
}
unit_bytes += log->l_iclog_hsize * num_headers;
@@ -3322,7 +3322,7 @@ xlog_verify_iclog(
struct xlog_in_core *iclog,
int count)
{
- xlog_op_header_t *ophead;
+ struct xlog_op_header *ophead;
xlog_in_core_t *icptr;
xlog_in_core_2_t *xhdr;
void *base_ptr, *ptr, *p;
@@ -3400,7 +3400,7 @@ xlog_verify_iclog(
op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]);
}
}
- ptr += sizeof(xlog_op_header_t) + op_len;
+ ptr += sizeof(struct xlog_op_header) + op_len;
}
}
#endif
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index af6daf4f6792..dcc1f44ed68f 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -20,6 +20,43 @@ struct xfs_log_vec {
int lv_alloc_size; /* size of allocated lv */
};
+/* Region types for iovec's i_type */
+#define XLOG_REG_TYPE_BFORMAT 1
+#define XLOG_REG_TYPE_BCHUNK 2
+#define XLOG_REG_TYPE_EFI_FORMAT 3
+#define XLOG_REG_TYPE_EFD_FORMAT 4
+#define XLOG_REG_TYPE_IFORMAT 5
+#define XLOG_REG_TYPE_ICORE 6
+#define XLOG_REG_TYPE_IEXT 7
+#define XLOG_REG_TYPE_IBROOT 8
+#define XLOG_REG_TYPE_ILOCAL 9
+#define XLOG_REG_TYPE_IATTR_EXT 10
+#define XLOG_REG_TYPE_IATTR_BROOT 11
+#define XLOG_REG_TYPE_IATTR_LOCAL 12
+#define XLOG_REG_TYPE_QFORMAT 13
+#define XLOG_REG_TYPE_DQUOT 14
+#define XLOG_REG_TYPE_QUOTAOFF 15
+#define XLOG_REG_TYPE_LRHEADER 16
+#define XLOG_REG_TYPE_UNMOUNT 17
+#define XLOG_REG_TYPE_COMMIT 18
+#define XLOG_REG_TYPE_TRANSHDR 19
+#define XLOG_REG_TYPE_ICREATE 20
+#define XLOG_REG_TYPE_RUI_FORMAT 21
+#define XLOG_REG_TYPE_RUD_FORMAT 22
+#define XLOG_REG_TYPE_CUI_FORMAT 23
+#define XLOG_REG_TYPE_CUD_FORMAT 24
+#define XLOG_REG_TYPE_BUI_FORMAT 25
+#define XLOG_REG_TYPE_BUD_FORMAT 26
+#define XLOG_REG_TYPE_ATTRI_FORMAT 27
+#define XLOG_REG_TYPE_ATTRD_FORMAT 28
+#define XLOG_REG_TYPE_ATTR_NAME 29
+#define XLOG_REG_TYPE_ATTR_VALUE 30
+#define XLOG_REG_TYPE_XMI_FORMAT 31
+#define XLOG_REG_TYPE_XMD_FORMAT 32
+#define XLOG_REG_TYPE_ATTR_NEWNAME 33
+#define XLOG_REG_TYPE_ATTR_NEWVALUE 34
+#define XLOG_REG_TYPE_MAX 34
+
#define XFS_LOG_VEC_ORDERED (-1)
/*
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index a9a7a271c15b..0cfc654d8e87 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -499,8 +499,8 @@ xlog_recover_finish(
extern void
xlog_recover_cancel(struct xlog *);
-extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
- char *dp, int size);
+__le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
+ char *dp, unsigned int hdrsize, unsigned int size);
extern struct kmem_cache *xfs_log_ticket_cache;
struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes,
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index e6ed9e09c027..549d60959aee 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2894,20 +2894,34 @@ xlog_recover_process(
int pass,
struct list_head *buffer_list)
{
- __le32 old_crc = rhead->h_crc;
- __le32 crc;
+ __le32 expected_crc = rhead->h_crc, crc, other_crc;
- crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
+ crc = xlog_cksum(log, rhead, dp, XLOG_REC_SIZE,
+ be32_to_cpu(rhead->h_len));
+
+ /*
+ * Look at the end of the struct xlog_rec_header definition in
+ * xfs_log_format.h for the gory details.
+ */
+ if (expected_crc && crc != expected_crc) {
+ other_crc = xlog_cksum(log, rhead, dp, XLOG_REC_SIZE_OTHER,
+ be32_to_cpu(rhead->h_len));
+ if (other_crc == expected_crc) {
+ xfs_notice_once(log->l_mp,
+ "Fixing up incorrect CRC due to padding.");
+ crc = other_crc;
+ }
+ }
/*
* Nothing else to do if this is a CRC verification pass. Just return
* if this a record with a non-zero crc. Unfortunately, mkfs always
- * sets old_crc to 0 so we must consider this valid even on v5 supers.
- * Otherwise, return EFSBADCRC on failure so the callers up the stack
- * know precisely what failed.
+ * sets expected_crc to 0 so we must consider this valid even on v5
+ * supers. Otherwise, return EFSBADCRC on failure so the callers up the
+ * stack know precisely what failed.
*/
if (pass == XLOG_RECOVER_CRCPASS) {
- if (old_crc && crc != old_crc)
+ if (expected_crc && crc != expected_crc)
return -EFSBADCRC;
return 0;
}
@@ -2918,11 +2932,11 @@ xlog_recover_process(
* zero CRC check prevents warnings from being emitted when upgrading
* the kernel from one that does not add CRCs by default.
*/
- if (crc != old_crc) {
- if (old_crc || xfs_has_crc(log->l_mp)) {
+ if (crc != expected_crc) {
+ if (expected_crc || xfs_has_crc(log->l_mp)) {
xfs_alert(log->l_mp,
"log record CRC mismatch: found 0x%x, expected 0x%x.",
- le32_to_cpu(old_crc),
+ le32_to_cpu(expected_crc),
le32_to_cpu(crc));
xfs_hex_dump(dp, 32);
}
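To summarise the CRC handling across the two files touched above: the writer now always checksums over the canonical record header size, while recovery retries with the alternate padded size before declaring a mismatch. A condensed sketch of both call sites (local declarations elided):

	/* write side, xlog_sync() */
	iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
			iclog->ic_datap, XLOG_REC_SIZE, size);

	/* recovery side, xlog_recover_process() */
	crc = xlog_cksum(log, rhead, dp, XLOG_REC_SIZE,
			be32_to_cpu(rhead->h_len));
	if (expected_crc && crc != expected_crc) {
		other_crc = xlog_cksum(log, rhead, dp, XLOG_REC_SIZE_OTHER,
				be32_to_cpu(rhead->h_len));
		if (other_crc == expected_crc)
			crc = other_crc;	/* benign padding difference */
	}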
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 2133fbaf1766..0953f6ae94ab 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -779,6 +779,25 @@ xfs_set_max_atomic_write_opt(
return -EINVAL;
}
+ if (xfs_has_reflink(mp))
+ goto set_limit;
+
+ if (new_max_fsbs == 1) {
+ if (mp->m_ddev_targp->bt_awu_max ||
+ (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_awu_max)) {
+ } else {
+ xfs_warn(mp,
+ "cannot support atomic writes of size %lluk with no reflink or HW support",
+ new_max_bytes >> 10);
+ return -EINVAL;
+ }
+ } else {
+ xfs_warn(mp,
+ "cannot support atomic writes of size %lluk with no reflink support",
+ new_max_bytes >> 10);
+ return -EINVAL;
+ }
+
set_limit:
error = xfs_calc_atomic_write_reservation(mp, new_max_fsbs);
if (error) {
@@ -1038,19 +1057,6 @@ xfs_mountfs(
xfs_inodegc_start(mp);
xfs_blockgc_start(mp);
- /*
- * Now that we've recovered any pending superblock feature bit
- * additions, we can finish setting up the attr2 behaviour for the
- * mount. The noattr2 option overrides the superblock flag, so only
- * check the superblock feature flag if the mount option is not set.
- */
- if (xfs_has_noattr2(mp)) {
- mp->m_features &= ~XFS_FEAT_ATTR2;
- } else if (!xfs_has_attr2(mp) &&
- (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)) {
- mp->m_features |= XFS_FEAT_ATTR2;
- }
-
if (xfs_has_metadir(mp)) {
error = xfs_mount_setup_metadir(mp);
if (error)
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 97de44c32272..f046d1215b04 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -363,7 +363,6 @@ typedef struct xfs_mount {
#define XFS_FEAT_EXTFLG (1ULL << 7) /* unwritten extents */
#define XFS_FEAT_ASCIICI (1ULL << 8) /* ASCII only case-insens. */
#define XFS_FEAT_LAZYSBCOUNT (1ULL << 9) /* Superblk counters */
-#define XFS_FEAT_ATTR2 (1ULL << 10) /* dynamic attr fork */
#define XFS_FEAT_PARENT (1ULL << 11) /* parent pointers */
#define XFS_FEAT_PROJID32 (1ULL << 12) /* 32 bit project id */
#define XFS_FEAT_CRC (1ULL << 13) /* metadata CRCs */
@@ -386,7 +385,6 @@ typedef struct xfs_mount {
/* Mount features */
#define XFS_FEAT_NOLIFETIME (1ULL << 47) /* disable lifetime hints */
-#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
#define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */
#define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */
#define XFS_FEAT_LARGE_IOSIZE (1ULL << 51) /* report large preferred
@@ -396,7 +394,6 @@ typedef struct xfs_mount {
#define XFS_FEAT_DISCARD (1ULL << 54) /* discard unused blocks */
#define XFS_FEAT_GRPID (1ULL << 55) /* group-ID assigned from directory */
#define XFS_FEAT_SMALL_INUMS (1ULL << 56) /* user wants 32bit inodes */
-#define XFS_FEAT_IKEEP (1ULL << 57) /* keep empty inode clusters*/
#define XFS_FEAT_SWALLOC (1ULL << 58) /* stripe width allocation */
#define XFS_FEAT_FILESTREAMS (1ULL << 59) /* use filestreams allocator */
#define XFS_FEAT_DAX_ALWAYS (1ULL << 60) /* DAX always enabled */
@@ -504,12 +501,17 @@ __XFS_HAS_V4_FEAT(align, ALIGN)
__XFS_HAS_V4_FEAT(logv2, LOGV2)
__XFS_HAS_V4_FEAT(extflg, EXTFLG)
__XFS_HAS_V4_FEAT(lazysbcount, LAZYSBCOUNT)
-__XFS_ADD_V4_FEAT(attr2, ATTR2)
__XFS_ADD_V4_FEAT(projid32, PROJID32)
__XFS_HAS_V4_FEAT(v3inodes, V3INODES)
__XFS_HAS_V4_FEAT(crc, CRC)
__XFS_HAS_V4_FEAT(pquotino, PQUOTINO)
+static inline void xfs_add_attr2(struct xfs_mount *mp)
+{
+ if (IS_ENABLED(CONFIG_XFS_SUPPORT_V4))
+ xfs_sb_version_addattr2(&mp->m_sb);
+}
+
/*
* Mount features
*
@@ -517,7 +519,6 @@ __XFS_HAS_V4_FEAT(pquotino, PQUOTINO)
* bit inodes and read-only state, are kept as operational state rather than
* features.
*/
-__XFS_HAS_FEAT(noattr2, NOATTR2)
__XFS_HAS_FEAT(noalign, NOALIGN)
__XFS_HAS_FEAT(allocsize, ALLOCSIZE)
__XFS_HAS_FEAT(large_iosize, LARGE_IOSIZE)
@@ -526,7 +527,6 @@ __XFS_HAS_FEAT(dirsync, DIRSYNC)
__XFS_HAS_FEAT(discard, DISCARD)
__XFS_HAS_FEAT(grpid, GRPID)
__XFS_HAS_FEAT(small_inums, SMALL_INUMS)
-__XFS_HAS_FEAT(ikeep, IKEEP)
__XFS_HAS_FEAT(swalloc, SWALLOC)
__XFS_HAS_FEAT(filestreams, FILESTREAMS)
__XFS_HAS_FEAT(dax_always, DAX_ALWAYS)
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 866c71d9fbae..73b7e72944e4 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -293,7 +293,8 @@ int
xfs_mru_cache_init(void)
{
xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache",
- XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 1);
+ XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU),
+ 1);
if (!xfs_mru_reap_wq)
return -ENOMEM;
return 0;
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
index fbeddcac4792..b17672889942 100644
--- a/fs/xfs/xfs_notify_failure.c
+++ b/fs/xfs/xfs_notify_failure.c
@@ -165,7 +165,7 @@ xfs_dax_translate_range(
uint64_t *bblen)
{
u64 dev_start = btp->bt_dax_part_off;
- u64 dev_len = bdev_nr_bytes(btp->bt_bdev);
+ u64 dev_len = BBTOB(btp->bt_nr_sectors);
u64 dev_end = dev_start + dev_len - 1;
/* Notify failure on the whole device. */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index bb0a82635a77..e85a156dc17d 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -105,8 +105,8 @@ enum {
Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev,
Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid,
Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups,
- Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, Opt_ikeep,
- Opt_noikeep, Opt_largeio, Opt_nolargeio, Opt_attr2, Opt_noattr2,
+ Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32,
+ Opt_largeio, Opt_nolargeio,
Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota,
Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
@@ -133,12 +133,8 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = {
fsparam_flag("norecovery", Opt_norecovery),
fsparam_flag("inode64", Opt_inode64),
fsparam_flag("inode32", Opt_inode32),
- fsparam_flag("ikeep", Opt_ikeep),
- fsparam_flag("noikeep", Opt_noikeep),
fsparam_flag("largeio", Opt_largeio),
fsparam_flag("nolargeio", Opt_nolargeio),
- fsparam_flag("attr2", Opt_attr2),
- fsparam_flag("noattr2", Opt_noattr2),
fsparam_flag("filestreams", Opt_filestreams),
fsparam_flag("quota", Opt_quota),
fsparam_flag("noquota", Opt_noquota),
@@ -175,13 +171,11 @@ xfs_fs_show_options(
{
static struct proc_xfs_info xfs_info_set[] = {
/* the few simple ones we can get from the mount struct */
- { XFS_FEAT_IKEEP, ",ikeep" },
{ XFS_FEAT_WSYNC, ",wsync" },
{ XFS_FEAT_NOALIGN, ",noalign" },
{ XFS_FEAT_SWALLOC, ",swalloc" },
{ XFS_FEAT_NOUUID, ",nouuid" },
{ XFS_FEAT_NORECOVERY, ",norecovery" },
- { XFS_FEAT_ATTR2, ",attr2" },
{ XFS_FEAT_FILESTREAMS, ",filestreams" },
{ XFS_FEAT_GRPID, ",grpid" },
{ XFS_FEAT_DISCARD, ",discard" },
@@ -541,7 +535,8 @@ xfs_setup_devices(
{
int error;
- error = xfs_configure_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize);
+ error = xfs_configure_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize,
+ mp->m_sb.sb_dblocks);
if (error)
return error;
@@ -551,7 +546,7 @@ xfs_setup_devices(
if (xfs_has_sector(mp))
log_sector_size = mp->m_sb.sb_logsectsize;
error = xfs_configure_buftarg(mp->m_logdev_targp,
- log_sector_size);
+ log_sector_size, mp->m_sb.sb_logblocks);
if (error)
return error;
}
@@ -565,7 +560,7 @@ xfs_setup_devices(
mp->m_rtdev_targp = mp->m_ddev_targp;
} else if (mp->m_rtname) {
error = xfs_configure_buftarg(mp->m_rtdev_targp,
- mp->m_sb.sb_sectsize);
+ mp->m_sb.sb_sectsize, mp->m_sb.sb_rblocks);
if (error)
return error;
}
@@ -578,19 +573,19 @@ xfs_init_mount_workqueues(
struct xfs_mount *mp)
{
mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s",
- XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU),
1, mp->m_super->s_id);
if (!mp->m_buf_workqueue)
goto out;
mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
- XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU),
0, mp->m_super->s_id);
if (!mp->m_unwritten_workqueue)
goto out_destroy_buf;
mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
- XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU),
0, mp->m_super->s_id);
if (!mp->m_reclaim_workqueue)
goto out_destroy_unwritten;
@@ -602,13 +597,14 @@ xfs_init_mount_workqueues(
goto out_destroy_reclaim;
mp->m_inodegc_wq = alloc_workqueue("xfs-inodegc/%s",
- XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU),
1, mp->m_super->s_id);
if (!mp->m_inodegc_wq)
goto out_destroy_blockgc;
mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s",
- XFS_WQFLAGS(WQ_FREEZABLE), 0, mp->m_super->s_id);
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_PERCPU), 0,
+ mp->m_super->s_id);
if (!mp->m_sync_workqueue)
goto out_destroy_inodegc;
@@ -778,7 +774,7 @@ xfs_fs_drop_inode(
return 0;
}
- return generic_drop_inode(inode);
+ return inode_generic_drop(inode);
}
STATIC void
@@ -1088,15 +1084,6 @@ xfs_finish_flags(
}
/*
- * V5 filesystems always use attr2 format for attributes.
- */
- if (xfs_has_crc(mp) && xfs_has_noattr2(mp)) {
- xfs_warn(mp, "Cannot mount a V5 filesystem as noattr2. "
- "attr2 is always enabled for V5 filesystems.");
- return -EINVAL;
- }
-
- /*
* prohibit r/w mounts of read-only filesystems
*/
if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !xfs_is_readonly(mp)) {
@@ -1542,22 +1529,6 @@ xfs_fs_parse_param(
return 0;
#endif
/* Following mount options will be removed in September 2025 */
- case Opt_ikeep:
- xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, true);
- parsing_mp->m_features |= XFS_FEAT_IKEEP;
- return 0;
- case Opt_noikeep:
- xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, false);
- parsing_mp->m_features &= ~XFS_FEAT_IKEEP;
- return 0;
- case Opt_attr2:
- xfs_fs_warn_deprecated(fc, param, XFS_FEAT_ATTR2, true);
- parsing_mp->m_features |= XFS_FEAT_ATTR2;
- return 0;
- case Opt_noattr2:
- xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true);
- parsing_mp->m_features |= XFS_FEAT_NOATTR2;
- return 0;
case Opt_max_open_zones:
parsing_mp->m_max_open_zones = result.uint_32;
return 0;
@@ -1593,16 +1564,6 @@ xfs_fs_validate_params(
return -EINVAL;
}
- /*
- * We have not read the superblock at this point, so only the attr2
- * mount option can set the attr2 feature by this stage.
- */
- if (xfs_has_attr2(mp) && xfs_has_noattr2(mp)) {
- xfs_warn(mp, "attr2 and noattr2 cannot both be specified.");
- return -EINVAL;
- }
-
-
if (xfs_has_noalign(mp) && (mp->m_dalign || mp->m_swidth)) {
xfs_warn(mp,
"sunit and swidth options incompatible with the noalign option");
@@ -2177,21 +2138,6 @@ xfs_fs_reconfigure(
if (error)
return error;
- /* attr2 -> noattr2 */
- if (xfs_has_noattr2(new_mp)) {
- if (xfs_has_crc(mp)) {
- xfs_warn(mp,
- "attr2 is always enabled for a V5 filesystem - can't be changed.");
- return -EINVAL;
- }
- mp->m_features &= ~XFS_FEAT_ATTR2;
- mp->m_features |= XFS_FEAT_NOATTR2;
- } else if (xfs_has_attr2(new_mp)) {
- /* noattr2 -> attr2 */
- mp->m_features &= ~XFS_FEAT_NOATTR2;
- mp->m_features |= XFS_FEAT_ATTR2;
- }
-
/* Validate new max_atomic_write option before making other changes */
if (mp->m_awu_max_bytes != new_mp->m_awu_max_bytes) {
error = xfs_set_max_atomic_write_opt(mp,
@@ -2596,8 +2542,8 @@ xfs_init_workqueues(void)
* AGs in all the filesystems mounted. Hence use the default large
* max_active value for this workqueue.
*/
- xfs_alloc_wq = alloc_workqueue("xfsalloc",
- XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 0);
+ xfs_alloc_wq = alloc_workqueue("xfsalloc", XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU),
+ 0);
if (!xfs_alloc_wq)
return -ENOMEM;
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 751dc74a3067..9918f14b4874 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -50,7 +50,7 @@ xfs_panic_mask_proc_handler(
}
#endif /* CONFIG_PROC_FS */
-STATIC int
+static inline int
xfs_deprecated_dointvec_minmax(
const struct ctl_table *ctl,
int write,
@@ -68,24 +68,6 @@ xfs_deprecated_dointvec_minmax(
static const struct ctl_table xfs_table[] = {
{
- .procname = "irix_sgid_inherit",
- .data = &xfs_params.sgid_inherit.val,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = xfs_deprecated_dointvec_minmax,
- .extra1 = &xfs_params.sgid_inherit.min,
- .extra2 = &xfs_params.sgid_inherit.max
- },
- {
- .procname = "irix_symlink_mode",
- .data = &xfs_params.symlink_mode.val,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = xfs_deprecated_dointvec_minmax,
- .extra1 = &xfs_params.symlink_mode.min,
- .extra2 = &xfs_params.symlink_mode.max
- },
- {
.procname = "panic_mask",
.data = &xfs_params.panic_mask.val,
.maxlen = sizeof(int),
@@ -185,15 +167,6 @@ static const struct ctl_table xfs_table[] = {
.extra1 = &xfs_params.blockgc_timer.min,
.extra2 = &xfs_params.blockgc_timer.max,
},
- {
- .procname = "speculative_cow_prealloc_lifetime",
- .data = &xfs_params.blockgc_timer.val,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = xfs_deprecated_dointvec_minmax,
- .extra1 = &xfs_params.blockgc_timer.min,
- .extra2 = &xfs_params.blockgc_timer.max,
- },
/* please keep this the last entry */
#ifdef CONFIG_PROC_FS
{
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index 51646f066c4f..ed9d896079c1 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -19,9 +19,6 @@ typedef struct xfs_sysctl_val {
} xfs_sysctl_val_t;
typedef struct xfs_param {
- xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is
- * not a member of parent dir GID. */
- xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */
xfs_sysctl_val_t panic_mask; /* bitmask to cause panic on errors. */
xfs_sysctl_val_t error_level; /* Degree of reporting for problems */
xfs_sysctl_val_t syncd_timer; /* Interval between xfssyncd wakeups */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index e1794e3e3156..79b8641880ab 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -455,6 +455,7 @@ DEFINE_EVENT(xfs_zone_alloc_class, name, \
xfs_extlen_t len), \
TP_ARGS(oz, rgbno, len))
DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks);
+DEFINE_ZONE_ALLOC_EVENT(xfs_zone_skip_blocks);
DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks);
TRACE_EVENT(xfs_zone_gc_select_victim,
@@ -1151,7 +1152,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class,
TP_fast_assign(
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
- __entry->count = atomic_read(&VFS_I(ip)->i_count);
+ __entry->count = icount_read(VFS_I(ip));
__entry->pincount = atomic_read(&ip->i_pincount);
__entry->iflags = ip->i_flags;
__entry->caller_ip = caller_ip;
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index ece374d622b3..474f5a04ec63 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -253,8 +253,8 @@ xfs_trans_alloc(
* by doing GFP_KERNEL allocations inside sb_start_intwrite().
*/
retry:
- WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
tp = __xfs_trans_alloc(mp, flags);
+ WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
error = xfs_trans_reserve(tp, resp, blocks, rtextents);
if (error == -ENOSPC && want_retry) {
xfs_trans_cancel(tp);
@@ -452,19 +452,17 @@ xfs_trans_mod_sb(
*/
STATIC void
xfs_trans_apply_sb_deltas(
- xfs_trans_t *tp)
+ struct xfs_trans *tp)
{
- struct xfs_dsb *sbp;
- struct xfs_buf *bp;
- int whole = 0;
-
- bp = xfs_trans_getsb(tp);
- sbp = bp->b_addr;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_buf *bp = xfs_trans_getsb(tp);
+ struct xfs_dsb *sbp = bp->b_addr;
+ int whole = 0;
/*
* Only update the superblock counters if we are logging them
*/
- if (!xfs_has_lazysbcount((tp->t_mountp))) {
+ if (!xfs_has_lazysbcount(mp)) {
if (tp->t_icount_delta)
be64_add_cpu(&sbp->sb_icount, tp->t_icount_delta);
if (tp->t_ifree_delta)
@@ -491,8 +489,7 @@ xfs_trans_apply_sb_deltas(
* write the correct value ondisk.
*/
if ((tp->t_frextents_delta || tp->t_res_frextents_delta) &&
- !xfs_has_rtgroups(tp->t_mountp)) {
- struct xfs_mount *mp = tp->t_mountp;
+ !xfs_has_rtgroups(mp)) {
int64_t rtxdelta;
rtxdelta = tp->t_frextents_delta + tp->t_res_frextents_delta;
@@ -505,6 +502,8 @@ xfs_trans_apply_sb_deltas(
if (tp->t_dblocks_delta) {
be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta);
+ mp->m_ddev_targp->bt_nr_sectors +=
+ XFS_FSB_TO_BB(mp, tp->t_dblocks_delta);
whole = 1;
}
if (tp->t_agcount_delta) {
@@ -524,7 +523,7 @@ xfs_trans_apply_sb_deltas(
* recompute the ondisk rtgroup block log. The incore values
* will be recomputed in xfs_trans_unreserve_and_mod_sb.
*/
- if (xfs_has_rtgroups(tp->t_mountp)) {
+ if (xfs_has_rtgroups(mp)) {
sbp->sb_rgblklog = xfs_compute_rgblklog(
be32_to_cpu(sbp->sb_rgextents),
be32_to_cpu(sbp->sb_rextsize));
@@ -537,6 +536,8 @@ xfs_trans_apply_sb_deltas(
}
if (tp->t_rblocks_delta) {
be64_add_cpu(&sbp->sb_rblocks, tp->t_rblocks_delta);
+ mp->m_rtdev_targp->bt_nr_sectors +=
+ XFS_FSB_TO_BB(mp, tp->t_rblocks_delta);
whole = 1;
}
if (tp->t_rextents_delta) {
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 67c328d23e4a..38983c6777df 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -374,7 +374,7 @@ xfsaild_push_item(
* If log item pinning is enabled, skip the push and track the item as
* pinned. This can help induce head-behind-tail conditions.
*/
- if (XFS_TEST_ERROR(false, ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN))
+ if (XFS_TEST_ERROR(ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN))
return XFS_ITEM_PINNED;
/*
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index 33f7eee521a8..1147bacb2da8 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -166,10 +166,9 @@ xfs_open_zone_mark_full(
static void
xfs_zone_record_blocks(
struct xfs_trans *tp,
- xfs_fsblock_t fsbno,
- xfs_filblks_t len,
struct xfs_open_zone *oz,
- bool used)
+ xfs_fsblock_t fsbno,
+ xfs_filblks_t len)
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_rtgroup *rtg = oz->oz_rtg;
@@ -179,18 +178,37 @@ xfs_zone_record_blocks(
xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
- if (used) {
- rmapip->i_used_blocks += len;
- ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg));
- } else {
- xfs_add_frextents(mp, len);
- }
+ rmapip->i_used_blocks += len;
+ ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg));
oz->oz_written += len;
if (oz->oz_written == rtg_blocks(rtg))
xfs_open_zone_mark_full(oz);
xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
}
+/*
+ * Called for blocks that have been written to disk, but not actually linked to
+ * an inode, which can happen when garbage collection races with user data
+ * writes to a file.
+ */
+static void
+xfs_zone_skip_blocks(
+ struct xfs_open_zone *oz,
+ xfs_filblks_t len)
+{
+ struct xfs_rtgroup *rtg = oz->oz_rtg;
+
+ trace_xfs_zone_skip_blocks(oz, 0, len);
+
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+ oz->oz_written += len;
+ if (oz->oz_written == rtg_blocks(rtg))
+ xfs_open_zone_mark_full(oz);
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
+
+ xfs_add_frextents(rtg_mount(rtg), len);
+}
+
static int
xfs_zoned_map_extent(
struct xfs_trans *tp,
@@ -250,8 +268,7 @@ xfs_zoned_map_extent(
}
}
- xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz,
- true);
+ xfs_zone_record_blocks(tp, oz, new->br_startblock, new->br_blockcount);
/* Map the new blocks into the data fork. */
xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new);
@@ -259,8 +276,7 @@ xfs_zoned_map_extent(
skip:
trace_xfs_reflink_cow_remap_skip(ip, new);
- xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz,
- false);
+ xfs_zone_skip_blocks(oz, new->br_blockcount);
return 0;
}
@@ -358,44 +374,6 @@ xfs_zone_free_blocks(
return 0;
}
-/*
- * Check if the zone containing the data just before the offset we are
- * writing to is still open and has space.
- */
-static struct xfs_open_zone *
-xfs_last_used_zone(
- struct iomap_ioend *ioend)
-{
- struct xfs_inode *ip = XFS_I(ioend->io_inode);
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t offset_fsb = XFS_B_TO_FSB(mp, ioend->io_offset);
- struct xfs_rtgroup *rtg = NULL;
- struct xfs_open_zone *oz = NULL;
- struct xfs_iext_cursor icur;
- struct xfs_bmbt_irec got;
-
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (!xfs_iext_lookup_extent_before(ip, &ip->i_df, &offset_fsb,
- &icur, &got)) {
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- return NULL;
- }
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
- rtg = xfs_rtgroup_grab(mp, xfs_rtb_to_rgno(mp, got.br_startblock));
- if (!rtg)
- return NULL;
-
- xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_SHARED);
- oz = READ_ONCE(rtg->rtg_open_zone);
- if (oz && (oz->oz_is_gc || !atomic_inc_not_zero(&oz->oz_ref)))
- oz = NULL;
- xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_SHARED);
-
- xfs_rtgroup_rele(rtg);
- return oz;
-}
-
static struct xfs_group *
xfs_find_free_zone(
struct xfs_mount *mp,
@@ -515,64 +493,58 @@ xfs_try_open_zone(
return oz;
}
+enum xfs_zone_alloc_score {
+ /* Any open zone will do it, we're desperate */
+ XFS_ZONE_ALLOC_ANY = 0,
+
+ /* It better fit somehow */
+ XFS_ZONE_ALLOC_OK = 1,
+
+ /* Only reuse a zone if it fits really well. */
+ XFS_ZONE_ALLOC_GOOD = 2,
+};
+
/*
- * For data with short or medium lifetime, try to colocated it into an
- * already open zone with a matching temperature.
+ * Lifetime hint co-location matrix. Fields not set default to 0
+ * aka XFS_ZONE_ALLOC_ANY.
*/
-static bool
-xfs_colocate_eagerly(
- enum rw_hint file_hint)
-{
- switch (file_hint) {
- case WRITE_LIFE_MEDIUM:
- case WRITE_LIFE_SHORT:
- case WRITE_LIFE_NONE:
- return true;
- default:
- return false;
- }
-}
-
-static bool
-xfs_good_hint_match(
- struct xfs_open_zone *oz,
- enum rw_hint file_hint)
-{
- switch (oz->oz_write_hint) {
- case WRITE_LIFE_LONG:
- case WRITE_LIFE_EXTREME:
- /* colocate long and extreme */
- if (file_hint == WRITE_LIFE_LONG ||
- file_hint == WRITE_LIFE_EXTREME)
- return true;
- break;
- case WRITE_LIFE_MEDIUM:
- /* colocate medium with medium */
- if (file_hint == WRITE_LIFE_MEDIUM)
- return true;
- break;
- case WRITE_LIFE_SHORT:
- case WRITE_LIFE_NONE:
- case WRITE_LIFE_NOT_SET:
- /* colocate short and none */
- if (file_hint <= WRITE_LIFE_SHORT)
- return true;
- break;
- }
- return false;
-}
+static const unsigned int
+xfs_zoned_hint_score[WRITE_LIFE_HINT_NR][WRITE_LIFE_HINT_NR] = {
+ [WRITE_LIFE_NOT_SET] = {
+ [WRITE_LIFE_NOT_SET] = XFS_ZONE_ALLOC_OK,
+ },
+ [WRITE_LIFE_NONE] = {
+ [WRITE_LIFE_NONE] = XFS_ZONE_ALLOC_OK,
+ },
+ [WRITE_LIFE_SHORT] = {
+ [WRITE_LIFE_SHORT] = XFS_ZONE_ALLOC_GOOD,
+ },
+ [WRITE_LIFE_MEDIUM] = {
+ [WRITE_LIFE_MEDIUM] = XFS_ZONE_ALLOC_GOOD,
+ },
+ [WRITE_LIFE_LONG] = {
+ [WRITE_LIFE_LONG] = XFS_ZONE_ALLOC_OK,
+ [WRITE_LIFE_EXTREME] = XFS_ZONE_ALLOC_OK,
+ },
+ [WRITE_LIFE_EXTREME] = {
+ [WRITE_LIFE_LONG] = XFS_ZONE_ALLOC_OK,
+ [WRITE_LIFE_EXTREME] = XFS_ZONE_ALLOC_OK,
+ },
+};
static bool
xfs_try_use_zone(
struct xfs_zone_info *zi,
enum rw_hint file_hint,
struct xfs_open_zone *oz,
- bool lowspace)
+ unsigned int goodness)
{
if (oz->oz_allocated == rtg_blocks(oz->oz_rtg))
return false;
- if (!lowspace && !xfs_good_hint_match(oz, file_hint))
+
+ if (xfs_zoned_hint_score[oz->oz_write_hint][file_hint] < goodness)
return false;
+
if (!atomic_inc_not_zero(&oz->oz_ref))
return false;
@@ -603,14 +575,14 @@ static struct xfs_open_zone *
xfs_select_open_zone_lru(
struct xfs_zone_info *zi,
enum rw_hint file_hint,
- bool lowspace)
+ unsigned int goodness)
{
struct xfs_open_zone *oz;
lockdep_assert_held(&zi->zi_open_zones_lock);
list_for_each_entry(oz, &zi->zi_open_zones, oz_entry)
- if (xfs_try_use_zone(zi, file_hint, oz, lowspace))
+ if (xfs_try_use_zone(zi, file_hint, oz, goodness))
return oz;
cond_resched_lock(&zi->zi_open_zones_lock);
@@ -673,9 +645,11 @@ xfs_select_zone_nowait(
* data.
*/
spin_lock(&zi->zi_open_zones_lock);
- if (xfs_colocate_eagerly(write_hint))
- oz = xfs_select_open_zone_lru(zi, write_hint, false);
- else if (pack_tight)
+ oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_GOOD);
+ if (oz)
+ goto out_unlock;
+
+ if (pack_tight)
oz = xfs_select_open_zone_mru(zi, write_hint);
if (oz)
goto out_unlock;
@@ -689,16 +663,16 @@ xfs_select_zone_nowait(
goto out_unlock;
/*
- * Try to colocate cold data with other cold data if we failed to open a
- * new zone for it.
+ * Try to find a zone that is an ok match to colocate data with.
+ */
+ oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_OK);
+ if (oz)
+ goto out_unlock;
+
+ /*
+ * Pick the least recently used zone, regardless of hint match.
*/
- if (write_hint != WRITE_LIFE_NOT_SET &&
- !xfs_colocate_eagerly(write_hint))
- oz = xfs_select_open_zone_lru(zi, write_hint, false);
- if (!oz)
- oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, false);
- if (!oz)
- oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, true);
+ oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_ANY);
out_unlock:
spin_unlock(&zi->zi_open_zones_lock);
return oz;
@@ -902,12 +876,9 @@ xfs_zone_alloc_and_submit(
goto out_error;
/*
- * If we don't have a cached zone in this write context, see if the
- * last extent before the one we are writing to points to an active
- * zone. If so, just continue writing to it.
+ * If we don't have a locally cached zone in this write context, see if
+ * the inode is still associated with a zone and use that if so.
*/
- if (!*oz && ioend->io_offset)
- *oz = xfs_last_used_zone(ioend);
if (!*oz)
*oz = xfs_cached_zone(mp, ip);
@@ -1160,7 +1131,7 @@ xfs_calc_open_zones(
if (bdev_open_zones)
mp->m_max_open_zones = bdev_open_zones;
else
- mp->m_max_open_zones = xfs_max_open_zones(mp);
+ mp->m_max_open_zones = XFS_DEFAULT_MAX_OPEN_ZONES;
}
if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) {
@@ -1273,7 +1244,7 @@ xfs_mount_zones(
if (!mp->m_zone_info)
return -ENOMEM;
- xfs_info(mp, "%u zones of %u blocks size (%u max open)",
+ xfs_info(mp, "%u zones of %u blocks (%u max open zones)",
mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
mp->m_max_open_zones);
trace_xfs_zones_mount(mp);
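The score matrix plus the goodness threshold replace the old eager/lowspace special cases with a single ranked walk: first look for a GOOD hint match, then an OK match, then any open zone at all. A few illustrative lookups, read as xfs_zoned_hint_score[zone hint][file hint] per the table above:

	/*
	 * SHORT  data into a SHORT   zone -> XFS_ZONE_ALLOC_GOOD (2)
	 * LONG   data into an EXTREME zone -> XFS_ZONE_ALLOC_OK  (1)
	 * MEDIUM data into a SHORT   zone -> 0, i.e. XFS_ZONE_ALLOC_ANY
	 *
	 * xfs_try_use_zone() only accepts a zone whose score is at least
	 * the requested goodness, so the three passes in
	 * xfs_select_zone_nowait() degrade gracefully as zones fill up.
	 */
	if (xfs_zoned_hint_score[oz->oz_write_hint][file_hint] < goodness)
		return false;	/* not a good enough fit for this pass */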
diff --git a/fs/xfs/xfs_zone_space_resv.c b/fs/xfs/xfs_zone_space_resv.c
index 1313c55b8cbe..9cd38716fd25 100644
--- a/fs/xfs/xfs_zone_space_resv.c
+++ b/fs/xfs/xfs_zone_space_resv.c
@@ -10,6 +10,7 @@
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_rtbitmap.h"
+#include "xfs_icache.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
@@ -230,6 +231,11 @@ xfs_zoned_space_reserve(
error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
flags & XFS_ZR_RESERVED);
+ if (error == -ENOSPC && !(flags & XFS_ZR_NOWAIT)) {
+ xfs_inodegc_flush(mp);
+ error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
+ flags & XFS_ZR_RESERVED);
+ }
if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1)
error = xfs_zoned_reserve_extents_greedy(mp, &count_fsb, flags);
if (error)
diff --git a/include/asm-generic/bitops/__ffs.h b/include/asm-generic/bitops/__ffs.h
index 2d08c750c8a7..3a899c626fdc 100644
--- a/include/asm-generic/bitops/__ffs.h
+++ b/include/asm-generic/bitops/__ffs.h
@@ -10,7 +10,7 @@
*
* Undefined if no bit exists, so code should check against 0 first.
*/
-static __always_inline unsigned int generic___ffs(unsigned long word)
+static __always_inline __attribute_const__ unsigned int generic___ffs(unsigned long word)
{
unsigned int num = 0;
diff --git a/include/asm-generic/bitops/__fls.h b/include/asm-generic/bitops/__fls.h
index e974ec932ec1..35f33780ca6c 100644
--- a/include/asm-generic/bitops/__fls.h
+++ b/include/asm-generic/bitops/__fls.h
@@ -10,7 +10,7 @@
*
* Undefined if no set bit exists, so code should check against 0 first.
*/
-static __always_inline unsigned int generic___fls(unsigned long word)
+static __always_inline __attribute_const__ unsigned int generic___fls(unsigned long word)
{
unsigned int num = BITS_PER_LONG - 1;
diff --git a/include/asm-generic/bitops/builtin-__ffs.h b/include/asm-generic/bitops/builtin-__ffs.h
index cf4b3d33bf96..d3c3f567045d 100644
--- a/include/asm-generic/bitops/builtin-__ffs.h
+++ b/include/asm-generic/bitops/builtin-__ffs.h
@@ -8,7 +8,7 @@
*
* Undefined if no bit exists, so code should check against 0 first.
*/
-static __always_inline unsigned int __ffs(unsigned long word)
+static __always_inline __attribute_const__ unsigned int __ffs(unsigned long word)
{
return __builtin_ctzl(word);
}
diff --git a/include/asm-generic/bitops/builtin-__fls.h b/include/asm-generic/bitops/builtin-__fls.h
index 6d72fc8a5259..7770c4f1bfcd 100644
--- a/include/asm-generic/bitops/builtin-__fls.h
+++ b/include/asm-generic/bitops/builtin-__fls.h
@@ -8,7 +8,7 @@
*
* Undefined if no set bit exists, so code should check against 0 first.
*/
-static __always_inline unsigned int __fls(unsigned long word)
+static __always_inline __attribute_const__ unsigned int __fls(unsigned long word)
{
return (sizeof(word) * 8) - 1 - __builtin_clzl(word);
}
diff --git a/include/asm-generic/bitops/builtin-fls.h b/include/asm-generic/bitops/builtin-fls.h
index c8455cc28841..be707da8c7cd 100644
--- a/include/asm-generic/bitops/builtin-fls.h
+++ b/include/asm-generic/bitops/builtin-fls.h
@@ -9,7 +9,7 @@
* This is defined the same way as ffs.
* Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32.
*/
-static __always_inline int fls(unsigned int x)
+static __always_inline __attribute_const__ int fls(unsigned int x)
{
return x ? sizeof(x) * 8 - __builtin_clz(x) : 0;
}
diff --git a/include/asm-generic/bitops/ffs.h b/include/asm-generic/bitops/ffs.h
index 4c43f242daeb..5ff2b7fbda6d 100644
--- a/include/asm-generic/bitops/ffs.h
+++ b/include/asm-generic/bitops/ffs.h
@@ -10,7 +10,7 @@
* the libc and compiler builtin ffs routines, therefore
* differs in spirit from ffz (man ffs).
*/
-static inline int generic_ffs(int x)
+static inline __attribute_const__ int generic_ffs(int x)
{
int r = 1;
diff --git a/include/asm-generic/bitops/fls.h b/include/asm-generic/bitops/fls.h
index 26f3ce1dd6e4..8eed3437edb9 100644
--- a/include/asm-generic/bitops/fls.h
+++ b/include/asm-generic/bitops/fls.h
@@ -10,7 +10,7 @@
* Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32.
*/
-static __always_inline int generic_fls(unsigned int x)
+static __always_inline __attribute_const__ int generic_fls(unsigned int x)
{
int r = 32;
diff --git a/include/asm-generic/bitops/fls64.h b/include/asm-generic/bitops/fls64.h
index 866f2b2304ff..b5f58dd261a3 100644
--- a/include/asm-generic/bitops/fls64.h
+++ b/include/asm-generic/bitops/fls64.h
@@ -16,7 +16,7 @@
* at position 64.
*/
#if BITS_PER_LONG == 32
-static __always_inline int fls64(__u64 x)
+static __always_inline __attribute_const__ int fls64(__u64 x)
{
__u32 h = x >> 32;
if (h)
@@ -24,7 +24,7 @@ static __always_inline int fls64(__u64 x)
return fls(x);
}
#elif BITS_PER_LONG == 64
-static __always_inline int fls64(__u64 x)
+static __always_inline __attribute_const__ int fls64(__u64 x)
{
if (x == 0)
return 0;
diff --git a/include/asm-generic/thread_info_tif.h b/include/asm-generic/thread_info_tif.h
new file mode 100644
index 000000000000..ee3793e9b1a4
--- /dev/null
+++ b/include/asm-generic/thread_info_tif.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_GENERIC_THREAD_INFO_TIF_H_
+#define _ASM_GENERIC_THREAD_INFO_TIF_H_
+
+#include <vdso/bits.h>
+
+/* Bits 16-31 are reserved for architecture specific purposes */
+
+#define TIF_NOTIFY_RESUME 0 // callback before returning to user
+#define _TIF_NOTIFY_RESUME BIT(TIF_NOTIFY_RESUME)
+
+#define TIF_SIGPENDING 1 // signal pending
+#define _TIF_SIGPENDING BIT(TIF_SIGPENDING)
+
+#define TIF_NOTIFY_SIGNAL 2 // signal notifications exist
+#define _TIF_NOTIFY_SIGNAL BIT(TIF_NOTIFY_SIGNAL)
+
+#define TIF_MEMDIE 3 // is terminating due to OOM killer
+#define _TIF_MEMDIE BIT(TIF_MEMDIE)
+
+#define TIF_NEED_RESCHED 4 // rescheduling necessary
+#define _TIF_NEED_RESCHED BIT(TIF_NEED_RESCHED)
+
+#ifdef HAVE_TIF_NEED_RESCHED_LAZY
+# define TIF_NEED_RESCHED_LAZY 5 // Lazy rescheduling needed
+# define _TIF_NEED_RESCHED_LAZY BIT(TIF_NEED_RESCHED_LAZY)
+#endif
+
+#ifdef HAVE_TIF_POLLING_NRFLAG
+# define TIF_POLLING_NRFLAG 6 // idle is polling for TIF_NEED_RESCHED
+# define _TIF_POLLING_NRFLAG BIT(TIF_POLLING_NRFLAG)
+#endif
+
+#define TIF_USER_RETURN_NOTIFY 7 // notify kernel of userspace return
+#define _TIF_USER_RETURN_NOTIFY BIT(TIF_USER_RETURN_NOTIFY)
+
+#define TIF_UPROBE 8 // breakpointed or singlestepping
+#define _TIF_UPROBE BIT(TIF_UPROBE)
+
+#define TIF_PATCH_PENDING 9 // pending live patching update
+#define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING)
+
+#ifdef HAVE_TIF_RESTORE_SIGMASK
+# define TIF_RESTORE_SIGMASK 10 // Restore signal mask in do_signal()
+# define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK)
+#endif
+
+#endif /* _ASM_GENERIC_THREAD_INFO_TIF_H_ */
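For context on how the new generic header is meant to be consumed: an architecture's thread_info.h would include it for the common flag numbering and then layer its private bits into the reserved 16-31 range. A hypothetical excerpt (the arch-specific names below are made up for illustration):

	/* hypothetical arch/<arch>/include/asm/thread_info.h excerpt */
	#include <asm-generic/thread_info_tif.h>

	/* architecture-private flags live in the reserved range 16-31 */
	#define TIF_FOREIGN_FPSTATE	16	/* hypothetical arch bit */
	#define _TIF_FOREIGN_FPSTATE	BIT(TIF_FOREIGN_FPSTATE)

	#define _TIF_WORK_MASK		(_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | \
					 _TIF_NOTIFY_SIGNAL | _TIF_NEED_RESCHED)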
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index ae2d2359b79e..a65a87366c48 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -157,7 +157,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG)
#define PATCHABLE_DISCARDS *(__patchable_function_entries)
#endif
-#ifndef CONFIG_ARCH_SUPPORTS_CFI_CLANG
+#ifndef CONFIG_ARCH_SUPPORTS_CFI
/*
* Simply points to ftrace_stub, but with the proper protocol.
* Defined by the linker script in linux/vmlinux.lds.h
diff --git a/include/crypto/chacha.h b/include/crypto/chacha.h
index 91f6b4cf561c..38e26dff27b0 100644
--- a/include/crypto/chacha.h
+++ b/include/crypto/chacha.h
@@ -45,19 +45,11 @@ static inline void chacha20_block(struct chacha_state *state,
chacha_block_generic(state, out, 20);
}
-void hchacha_block_arch(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds);
void hchacha_block_generic(const struct chacha_state *state,
u32 out[HCHACHA_OUT_WORDS], int nrounds);
-static inline void hchacha_block(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds)
-{
- if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA))
- hchacha_block_arch(state, out, nrounds);
- else
- hchacha_block_generic(state, out, nrounds);
-}
+void hchacha_block(const struct chacha_state *state,
+ u32 out[HCHACHA_OUT_WORDS], int nrounds);
enum chacha_constants { /* expand 32-byte k */
CHACHA_CONSTANT_EXPA = 0x61707865U,
@@ -93,20 +85,8 @@ static inline void chacha_init(struct chacha_state *state,
state->x[15] = get_unaligned_le32(iv + 12);
}
-void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds);
-void chacha_crypt_generic(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds);
-
-static inline void chacha_crypt(struct chacha_state *state,
- u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA))
- chacha_crypt_arch(state, dst, src, bytes, nrounds);
- else
- chacha_crypt_generic(state, dst, src, bytes, nrounds);
-}
+void chacha_crypt(struct chacha_state *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds);
static inline void chacha20_crypt(struct chacha_state *state,
u8 *dst, const u8 *src, unsigned int bytes)
@@ -119,13 +99,4 @@ static inline void chacha_zeroize_state(struct chacha_state *state)
memzero_explicit(state, sizeof(*state));
}
-#if IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA)
-bool chacha_is_arch_optimized(void);
-#else
-static inline bool chacha_is_arch_optimized(void)
-{
- return false;
-}
-#endif
-
#endif /* _CRYPTO_CHACHA_H */
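A hedged sketch of the now-unconditional chacha_crypt() library entry point. It assumes chacha_init(state, key_words, iv) and the CHACHA_KEY_SIZE/CHACHA_IV_SIZE constants from the rest of this header; the key and IV handling is illustrative only, not a recommendation.

/* Hedged sketch: one-shot, in-place ChaCha20 encryption via the library API. */
static void chacha20_encrypt_buf(u8 *buf, unsigned int len,
				 const u32 key[CHACHA_KEY_SIZE / sizeof(u32)],
				 const u8 iv[CHACHA_IV_SIZE])
{
	struct chacha_state state;

	chacha_init(&state, key, iv);
	chacha_crypt(&state, buf, buf, len, 20);	/* 20 rounds, src == dst */
	chacha_zeroize_state(&state);			/* wipe key material */
}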
diff --git a/include/crypto/curve25519.h b/include/crypto/curve25519.h
index ece6a9b5fafc..db63a5577c00 100644
--- a/include/crypto/curve25519.h
+++ b/include/crypto/curve25519.h
@@ -6,7 +6,6 @@
#ifndef CURVE25519_H
#define CURVE25519_H
-#include <crypto/algapi.h> // For crypto_memneq.
#include <linux/types.h>
#include <linux/random.h>
@@ -14,49 +13,16 @@ enum curve25519_lengths {
CURVE25519_KEY_SIZE = 32
};
-extern const u8 curve25519_null_point[];
-extern const u8 curve25519_base_point[];
-
void curve25519_generic(u8 out[CURVE25519_KEY_SIZE],
const u8 scalar[CURVE25519_KEY_SIZE],
const u8 point[CURVE25519_KEY_SIZE]);
-void curve25519_arch(u8 out[CURVE25519_KEY_SIZE],
- const u8 scalar[CURVE25519_KEY_SIZE],
- const u8 point[CURVE25519_KEY_SIZE]);
-
-void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
- const u8 secret[CURVE25519_KEY_SIZE]);
-
-bool curve25519_selftest(void);
-
-static inline
bool __must_check curve25519(u8 mypublic[CURVE25519_KEY_SIZE],
const u8 secret[CURVE25519_KEY_SIZE],
- const u8 basepoint[CURVE25519_KEY_SIZE])
-{
- if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519))
- curve25519_arch(mypublic, secret, basepoint);
- else
- curve25519_generic(mypublic, secret, basepoint);
- return crypto_memneq(mypublic, curve25519_null_point,
- CURVE25519_KEY_SIZE);
-}
-
-static inline bool
-__must_check curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE],
- const u8 secret[CURVE25519_KEY_SIZE])
-{
- if (unlikely(!crypto_memneq(secret, curve25519_null_point,
- CURVE25519_KEY_SIZE)))
- return false;
+ const u8 basepoint[CURVE25519_KEY_SIZE]);
- if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519))
- curve25519_base_arch(pub, secret);
- else
- curve25519_generic(pub, secret, curve25519_base_point);
- return crypto_memneq(pub, curve25519_null_point, CURVE25519_KEY_SIZE);
-}
+bool __must_check curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE]);
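A hedged sketch of an X25519 exchange built on the two declarations above; the function name and flow are illustrative, and the failure returns follow the boolean semantics documented for the out-of-line helpers.

/* Hedged sketch: generate a keypair and derive a shared secret. */
static bool x25519_example(u8 shared[CURVE25519_KEY_SIZE],
			   const u8 peer_pub[CURVE25519_KEY_SIZE])
{
	u8 secret[CURVE25519_KEY_SIZE], pub[CURVE25519_KEY_SIZE];

	get_random_bytes(secret, sizeof(secret));
	curve25519_clamp_secret(secret);

	if (!curve25519_generate_public(pub, secret))
		return false;			/* degenerate secret */
	/* ... send pub to the peer, receive peer_pub ... */
	return curve25519(shared, secret, peer_pub);	/* false on bad point */
}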
static inline void curve25519_clamp_secret(u8 secret[CURVE25519_KEY_SIZE])
{
diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h
index f7b3b93f3a49..107b797c33ec 100644
--- a/include/crypto/if_alg.h
+++ b/include/crypto/if_alg.h
@@ -135,6 +135,7 @@ struct af_alg_async_req {
* SG?
* @enc: Cryptographic operation to be performed when
* recvmsg is invoked.
+ * @write: True if we are in the middle of a write.
* @init: True if metadata has been sent.
* @len: Length of memory allocated for this data structure.
* @inflight: Non-zero when AIO requests are in flight.
@@ -151,10 +152,11 @@ struct af_alg_ctx {
size_t used;
atomic_t rcvused;
- bool more;
- bool merge;
- bool enc;
- bool init;
+ bool more:1,
+ merge:1,
+ enc:1,
+ write:1,
+ init:1;
unsigned int len;
diff --git a/include/crypto/internal/blake2s.h b/include/crypto/internal/blake2s.h
deleted file mode 100644
index 506d56530ca9..000000000000
--- a/include/crypto/internal/blake2s.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR MIT */
-/*
- * Helper functions for BLAKE2s implementations.
- * Keep this in sync with the corresponding BLAKE2b header.
- */
-
-#ifndef _CRYPTO_INTERNAL_BLAKE2S_H
-#define _CRYPTO_INTERNAL_BLAKE2S_H
-
-#include <crypto/blake2s.h>
-#include <linux/string.h>
-
-void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
- size_t nblocks, const u32 inc);
-
-void blake2s_compress(struct blake2s_state *state, const u8 *block,
- size_t nblocks, const u32 inc);
-
-bool blake2s_selftest(void);
-
-#endif /* _CRYPTO_INTERNAL_BLAKE2S_H */
diff --git a/include/crypto/internal/poly1305.h b/include/crypto/internal/poly1305.h
index c60315f47562..a72fff409ab8 100644
--- a/include/crypto/internal/poly1305.h
+++ b/include/crypto/internal/poly1305.h
@@ -30,12 +30,13 @@ void poly1305_core_blocks(struct poly1305_state *state,
void poly1305_core_emit(const struct poly1305_state *state, const u32 nonce[4],
void *dst);
-void poly1305_block_init_arch(struct poly1305_block_state *state,
- const u8 raw_key[POLY1305_BLOCK_SIZE]);
-void poly1305_block_init_generic(struct poly1305_block_state *state,
- const u8 raw_key[POLY1305_BLOCK_SIZE]);
-void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src,
- unsigned int len, u32 padbit);
+static inline void
+poly1305_block_init_generic(struct poly1305_block_state *desc,
+ const u8 raw_key[POLY1305_BLOCK_SIZE])
+{
+ poly1305_core_init(&desc->h);
+ poly1305_core_setkey(&desc->core_r, raw_key);
+}
static inline void poly1305_blocks_generic(struct poly1305_block_state *state,
const u8 *src, unsigned int len,
@@ -45,9 +46,6 @@ static inline void poly1305_blocks_generic(struct poly1305_block_state *state,
len / POLY1305_BLOCK_SIZE, padbit);
}
-void poly1305_emit_arch(const struct poly1305_state *state,
- u8 digest[POLY1305_DIGEST_SIZE], const u32 nonce[4]);
-
static inline void poly1305_emit_generic(const struct poly1305_state *state,
u8 digest[POLY1305_DIGEST_SIZE],
const u32 nonce[4])
diff --git a/include/crypto/md5.h b/include/crypto/md5.h
index 28ee533a0507..c9aa5c3abc53 100644
--- a/include/crypto/md5.h
+++ b/include/crypto/md5.h
@@ -7,6 +7,7 @@
#define MD5_DIGEST_SIZE 16
#define MD5_HMAC_BLOCK_SIZE 64
+#define MD5_BLOCK_SIZE 64
#define MD5_BLOCK_WORDS 16
#define MD5_HASH_WORDS 4
#define MD5_STATE_SIZE 24
@@ -27,4 +28,182 @@ struct md5_state {
u32 block[MD5_BLOCK_WORDS];
};
-#endif
+/* State for the MD5 compression function */
+struct md5_block_state {
+ u32 h[MD5_HASH_WORDS];
+};
+
+/**
+ * struct md5_ctx - Context for hashing a message with MD5
+ * @state: the compression function state
+ * @bytecount: number of bytes processed so far
+ * @buf: partial block buffer; bytecount % MD5_BLOCK_SIZE bytes are valid
+ */
+struct md5_ctx {
+ struct md5_block_state state;
+ u64 bytecount;
+ u8 buf[MD5_BLOCK_SIZE] __aligned(__alignof__(__le64));
+};
+
+/**
+ * md5_init() - Initialize an MD5 context for a new message
+ * @ctx: the context to initialize
+ *
+ * If you don't need incremental computation, consider md5() instead.
+ *
+ * Context: Any context.
+ */
+void md5_init(struct md5_ctx *ctx);
+
+/**
+ * md5_update() - Update an MD5 context with message data
+ * @ctx: the context to update; must have been initialized
+ * @data: the message data
+ * @len: the data length in bytes
+ *
+ * This can be called any number of times.
+ *
+ * Context: Any context.
+ */
+void md5_update(struct md5_ctx *ctx, const u8 *data, size_t len);
+
+/**
+ * md5_final() - Finish computing an MD5 message digest
+ * @ctx: the context to finalize; must have been initialized
+ * @out: (output) the resulting MD5 message digest
+ *
+ * After finishing, this zeroizes @ctx. So the caller does not need to do it.
+ *
+ * Context: Any context.
+ */
+void md5_final(struct md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]);
+
+/**
+ * md5() - Compute MD5 message digest in one shot
+ * @data: the message data
+ * @len: the data length in bytes
+ * @out: (output) the resulting MD5 message digest
+ *
+ * Context: Any context.
+ */
+void md5(const u8 *data, size_t len, u8 out[MD5_DIGEST_SIZE]);
+
+/**
+ * struct hmac_md5_key - Prepared key for HMAC-MD5
+ * @istate: private
+ * @ostate: private
+ */
+struct hmac_md5_key {
+ struct md5_block_state istate;
+ struct md5_block_state ostate;
+};
+
+/**
+ * struct hmac_md5_ctx - Context for computing HMAC-MD5 of a message
+ * @hash_ctx: private
+ * @ostate: private
+ */
+struct hmac_md5_ctx {
+ struct md5_ctx hash_ctx;
+ struct md5_block_state ostate;
+};
+
+/**
+ * hmac_md5_preparekey() - Prepare a key for HMAC-MD5
+ * @key: (output) the key structure to initialize
+ * @raw_key: the raw HMAC-MD5 key
+ * @raw_key_len: the key length in bytes. All key lengths are supported.
+ *
+ * Note: the caller is responsible for zeroizing both the struct hmac_md5_key
+ * and the raw key once they are no longer needed.
+ *
+ * Context: Any context.
+ */
+void hmac_md5_preparekey(struct hmac_md5_key *key,
+ const u8 *raw_key, size_t raw_key_len);
+
+/**
+ * hmac_md5_init() - Initialize an HMAC-MD5 context for a new message
+ * @ctx: (output) the HMAC context to initialize
+ * @key: the prepared HMAC key
+ *
+ * If you don't need incremental computation, consider hmac_md5() instead.
+ *
+ * Context: Any context.
+ */
+void hmac_md5_init(struct hmac_md5_ctx *ctx, const struct hmac_md5_key *key);
+
+/**
+ * hmac_md5_init_usingrawkey() - Initialize an HMAC-MD5 context for a new
+ * message, using a raw key
+ * @ctx: (output) the HMAC context to initialize
+ * @raw_key: the raw HMAC-MD5 key
+ * @raw_key_len: the key length in bytes. All key lengths are supported.
+ *
+ * If you don't need incremental computation, consider hmac_md5_usingrawkey()
+ * instead.
+ *
+ * Context: Any context.
+ */
+void hmac_md5_init_usingrawkey(struct hmac_md5_ctx *ctx,
+ const u8 *raw_key, size_t raw_key_len);
+
+/**
+ * hmac_md5_update() - Update an HMAC-MD5 context with message data
+ * @ctx: the HMAC context to update; must have been initialized
+ * @data: the message data
+ * @data_len: the data length in bytes
+ *
+ * This can be called any number of times.
+ *
+ * Context: Any context.
+ */
+static inline void hmac_md5_update(struct hmac_md5_ctx *ctx,
+ const u8 *data, size_t data_len)
+{
+ md5_update(&ctx->hash_ctx, data, data_len);
+}
+
+/**
+ * hmac_md5_final() - Finish computing an HMAC-MD5 value
+ * @ctx: the HMAC context to finalize; must have been initialized
+ * @out: (output) the resulting HMAC-MD5 value
+ *
+ * After finishing, this zeroizes @ctx. So the caller does not need to do it.
+ *
+ * Context: Any context.
+ */
+void hmac_md5_final(struct hmac_md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]);
+
+/**
+ * hmac_md5() - Compute HMAC-MD5 in one shot, using a prepared key
+ * @key: the prepared HMAC key
+ * @data: the message data
+ * @data_len: the data length in bytes
+ * @out: (output) the resulting HMAC-MD5 value
+ *
+ * If you're using the key only once, consider using hmac_md5_usingrawkey().
+ *
+ * Context: Any context.
+ */
+void hmac_md5(const struct hmac_md5_key *key,
+ const u8 *data, size_t data_len, u8 out[MD5_DIGEST_SIZE]);
+
+/**
+ * hmac_md5_usingrawkey() - Compute HMAC-MD5 in one shot, using a raw key
+ * @raw_key: the raw HMAC-MD5 key
+ * @raw_key_len: the key length in bytes. All key lengths are supported.
+ * @data: the message data
+ * @data_len: the data length in bytes
+ * @out: (output) the resulting HMAC-MD5 value
+ *
+ * If you're using the key multiple times, prefer to use hmac_md5_preparekey()
+ * followed by multiple calls to hmac_md5() instead.
+ *
+ * Context: Any context.
+ */
+void hmac_md5_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+ const u8 *data, size_t data_len,
+ u8 out[MD5_DIGEST_SIZE]);
+
+#endif /* _CRYPTO_MD5_H */
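A small hedged sketch combining the one-shot and incremental interfaces documented in the kernel-doc above; the wrapper function and its arguments are illustrative only.

/* Hedged sketch using the MD5 and HMAC-MD5 library interfaces above. */
static void md5_examples(const u8 *msg, size_t len,
			 const u8 *key, size_t key_len)
{
	struct md5_ctx ctx;
	u8 digest[MD5_DIGEST_SIZE];
	u8 mac[MD5_DIGEST_SIZE];

	/* One-shot digest. */
	md5(msg, len, digest);

	/* Equivalent incremental computation. */
	md5_init(&ctx);
	md5_update(&ctx, msg, len);
	md5_final(&ctx, digest);	/* also zeroizes ctx */

	/* One-shot HMAC-MD5 with a raw key. */
	hmac_md5_usingrawkey(key, key_len, msg, len, mac);
}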
diff --git a/include/crypto/poly1305.h b/include/crypto/poly1305.h
index e54abda8cfe9..d4daeec8da19 100644
--- a/include/crypto/poly1305.h
+++ b/include/crypto/poly1305.h
@@ -64,13 +64,4 @@ void poly1305_update(struct poly1305_desc_ctx *desc,
const u8 *src, unsigned int nbytes);
void poly1305_final(struct poly1305_desc_ctx *desc, u8 *digest);
-#if IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305)
-bool poly1305_is_arch_optimized(void);
-#else
-static inline bool poly1305_is_arch_optimized(void)
-{
- return false;
-}
-#endif
-
#endif
diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h
index 15e461e568cc..e5dafb935cc8 100644
--- a/include/crypto/sha2.h
+++ b/include/crypto/sha2.h
@@ -376,6 +376,34 @@ void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]);
void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]);
/**
+ * sha256_finup_2x() - Compute two SHA-256 digests from a common initial
+ * context. On some CPUs, this is faster than sequentially
+ * computing each digest.
+ * @ctx: an optional initial context, which may have already processed data. If
+ * NULL, a default initial context is used (equivalent to sha256_init()).
+ * @data1: data for the first message
+ * @data2: data for the second message
+ * @len: the length of each of @data1 and @data2, in bytes
+ * @out1: (output) the first SHA-256 message digest
+ * @out2: (output) the second SHA-256 message digest
+ *
+ * Context: Any context.
+ */
+void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1,
+ const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE]);
+
+/**
+ * sha256_finup_2x_is_optimized() - Check if sha256_finup_2x() is using a real
+ * interleaved implementation, as opposed to a
+ * sequential fallback
+ * Return: true if optimized
+ *
+ * Context: Any context.
+ */
+bool sha256_finup_2x_is_optimized(void);
+
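A hedged sketch of the interleaved two-message finish documented above. It assumes sha256_init() and sha256_update() from elsewhere in this header; the shared-prefix scenario is illustrative.

/* Hedged sketch: hash a common prefix once, then finish two suffixes. */
static void sha256_hash_pair(const u8 *prefix, size_t prefix_len,
			     const u8 *a, const u8 *b, size_t len,
			     u8 out_a[SHA256_DIGEST_SIZE],
			     u8 out_b[SHA256_DIGEST_SIZE])
{
	struct sha256_ctx ctx;

	sha256_init(&ctx);
	sha256_update(&ctx, prefix, prefix_len);
	/* ctx is only read; both messages continue from the same state. */
	sha256_finup_2x(&ctx, a, b, len, out_a, out_b);
}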
+/**
* struct hmac_sha256_key - Prepared key for HMAC-SHA256
* @key: private
*/
diff --git a/include/drm/drm_bridge.h b/include/drm/drm_bridge.h
index 8ed80cad77ec..b0e6653ee42e 100644
--- a/include/drm/drm_bridge.h
+++ b/include/drm/drm_bridge.h
@@ -866,13 +866,61 @@ struct drm_bridge_funcs {
struct drm_connector *connector,
bool enable, int direction);
+ /**
+ * @hdmi_cec_init:
+ *
+ * Initialize CEC part of the bridge.
+ *
+ * This callback is optional, it can be implemented by bridges that
+ * set the @DRM_BRIDGE_OP_HDMI_CEC_ADAPTER flag in their
+ * &drm_bridge->ops.
+ *
+ * Returns:
+ * 0 on success, a negative error code otherwise
+ */
int (*hdmi_cec_init)(struct drm_bridge *bridge,
struct drm_connector *connector);
+ /**
+ * @hdmi_cec_enable:
+ *
+ * Enable or disable the CEC adapter inside the bridge.
+ *
+ * This callback is optional, it can be implemented by bridges that
+ * set the @DRM_BRIDGE_OP_HDMI_CEC_ADAPTER flag in their
+ * &drm_bridge->ops.
+ *
+ * Returns:
+ * 0 on success, a negative error code otherwise
+ */
int (*hdmi_cec_enable)(struct drm_bridge *bridge, bool enable);
+ /**
+ * @hdmi_cec_log_addr:
+ *
+ * Set the logical address of the CEC adapter inside the bridge.
+ *
+ * This callback is optional, it can be implemented by bridges that
+ * set the @DRM_BRIDGE_OP_HDMI_CEC_ADAPTER flag in their
+ * &drm_bridge->ops.
+ *
+ * Returns:
+ * 0 on success, a negative error code otherwise
+ */
int (*hdmi_cec_log_addr)(struct drm_bridge *bridge, u8 logical_addr);
+ /**
+ * @hdmi_cec_transmit:
+ *
+ * Transmit the message using the CEC adapter inside the bridge.
+ *
+ * This callback is optional, it can be implemented by bridges that
+ * set the @DRM_BRIDGE_OP_HDMI_CEC_ADAPTER flag in their
+ * &drm_bridge->ops.
+ *
+ * Returns:
+ * 0 on success, a negative error code otherwise
+ */
int (*hdmi_cec_transmit)(struct drm_bridge *bridge, u8 attempts,
u32 signal_free_time, struct cec_msg *msg);
diff --git a/include/drm/drm_gpuvm.h b/include/drm/drm_gpuvm.h
index 274532facfd6..2e7088264355 100644
--- a/include/drm/drm_gpuvm.h
+++ b/include/drm/drm_gpuvm.h
@@ -103,7 +103,7 @@ struct drm_gpuva {
} va;
/**
- * @gem: structure containing the &drm_gem_object and it's offset
+ * @gem: structure containing the &drm_gem_object and its offset
*/
struct {
/**
@@ -843,7 +843,7 @@ struct drm_gpuva_op_map {
} va;
/**
- * @gem: structure containing the &drm_gem_object and it's offset
+ * @gem: structure containing the &drm_gem_object and its offset
*/
struct {
/**
@@ -1189,11 +1189,11 @@ struct drm_gpuvm_ops {
/**
* @sm_step_unmap: called from &drm_gpuvm_sm_map and
- * &drm_gpuvm_sm_unmap to unmap an existent mapping
+ * &drm_gpuvm_sm_unmap to unmap an existing mapping
*
- * This callback is called when existent mapping needs to be unmapped.
+	 * This callback is called when an existing mapping needs to be unmapped.
* This is the case when either a newly requested mapping encloses an
- * existent mapping or an unmap of an existent mapping is requested.
+ * existing mapping or an unmap of an existing mapping is requested.
*
* The &priv pointer matches the one the driver passed to
* &drm_gpuvm_sm_map or &drm_gpuvm_sm_unmap, respectively.
diff --git a/include/kunit/run-in-irq-context.h b/include/kunit/run-in-irq-context.h
new file mode 100644
index 000000000000..108e96433ea4
--- /dev/null
+++ b/include/kunit/run-in-irq-context.h
@@ -0,0 +1,129 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Helper function for testing code in interrupt contexts
+ *
+ * Copyright 2025 Google LLC
+ */
+#ifndef _KUNIT_RUN_IN_IRQ_CONTEXT_H
+#define _KUNIT_RUN_IN_IRQ_CONTEXT_H
+
+#include <kunit/test.h>
+#include <linux/timekeeping.h>
+#include <linux/hrtimer.h>
+#include <linux/workqueue.h>
+
+#define KUNIT_IRQ_TEST_HRTIMER_INTERVAL us_to_ktime(5)
+
+struct kunit_irq_test_state {
+ bool (*func)(void *test_specific_state);
+ void *test_specific_state;
+ bool task_func_reported_failure;
+ bool hardirq_func_reported_failure;
+ bool softirq_func_reported_failure;
+ unsigned long hardirq_func_calls;
+ unsigned long softirq_func_calls;
+ struct hrtimer timer;
+ struct work_struct bh_work;
+};
+
+static enum hrtimer_restart kunit_irq_test_timer_func(struct hrtimer *timer)
+{
+ struct kunit_irq_test_state *state =
+ container_of(timer, typeof(*state), timer);
+
+ WARN_ON_ONCE(!in_hardirq());
+ state->hardirq_func_calls++;
+
+ if (!state->func(state->test_specific_state))
+ state->hardirq_func_reported_failure = true;
+
+ hrtimer_forward_now(&state->timer, KUNIT_IRQ_TEST_HRTIMER_INTERVAL);
+ queue_work(system_bh_wq, &state->bh_work);
+ return HRTIMER_RESTART;
+}
+
+static void kunit_irq_test_bh_work_func(struct work_struct *work)
+{
+ struct kunit_irq_test_state *state =
+ container_of(work, typeof(*state), bh_work);
+
+ WARN_ON_ONCE(!in_serving_softirq());
+ state->softirq_func_calls++;
+
+ if (!state->func(state->test_specific_state))
+ state->softirq_func_reported_failure = true;
+}
+
+/*
+ * Helper function which repeatedly runs the given @func in task, softirq, and
+ * hardirq context concurrently, and reports a failure to KUnit if any
+ * invocation of @func in any context returns false. @func is passed
+ * @test_specific_state as its argument. At most 3 invocations of @func will
+ * run concurrently: one in each of task, softirq, and hardirq context.
+ *
+ * The main purpose of this interrupt context testing is to validate fallback
+ * code paths that run in contexts where the normal code path cannot be used,
+ * typically due to the FPU or vector registers already being in-use in kernel
+ * mode. These code paths aren't covered when the test code is executed only by
+ * the KUnit test runner thread in task context. The reason for the concurrency
+ * is that merely using hardirq context is not sufficient to reach a fallback
+ * code path on some architectures; the hardirq actually has to occur while the
+ * FPU or vector unit was already in-use in kernel mode.
+ *
+ * Another purpose of this testing is to detect issues with the architecture's
+ * irq_fpu_usable() and kernel_fpu_begin/end() or equivalent functions,
+ * especially in softirq context when the softirq may have interrupted a task
+ * already using kernel-mode FPU or vector (if the arch didn't prevent that).
+ * Crypto functions are often executed in softirqs, so this is important.
+ */
+static inline void kunit_run_irq_test(struct kunit *test, bool (*func)(void *),
+ int max_iterations,
+ void *test_specific_state)
+{
+ struct kunit_irq_test_state state = {
+ .func = func,
+ .test_specific_state = test_specific_state,
+ };
+ unsigned long end_jiffies;
+
+ /*
+ * Set up a hrtimer (the way we access hardirq context) and a work
+ * struct for the BH workqueue (the way we access softirq context).
+ */
+ hrtimer_setup_on_stack(&state.timer, kunit_irq_test_timer_func,
+ CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
+ INIT_WORK_ONSTACK(&state.bh_work, kunit_irq_test_bh_work_func);
+
+ /* Run for up to max_iterations or 1 second, whichever comes first. */
+ end_jiffies = jiffies + HZ;
+ hrtimer_start(&state.timer, KUNIT_IRQ_TEST_HRTIMER_INTERVAL,
+ HRTIMER_MODE_REL_HARD);
+ for (int i = 0; i < max_iterations && !time_after(jiffies, end_jiffies);
+ i++) {
+ if (!func(test_specific_state))
+ state.task_func_reported_failure = true;
+ }
+
+ /* Cancel the timer and work. */
+ hrtimer_cancel(&state.timer);
+ flush_work(&state.bh_work);
+
+ /* Sanity check: the timer and BH functions should have been run. */
+ KUNIT_EXPECT_GT_MSG(test, state.hardirq_func_calls, 0,
+ "Timer function was not called");
+ KUNIT_EXPECT_GT_MSG(test, state.softirq_func_calls, 0,
+ "BH work function was not called");
+
+ /* Check for incorrect hash values reported from any context. */
+ KUNIT_EXPECT_FALSE_MSG(
+ test, state.task_func_reported_failure,
+ "Incorrect hash values reported from task context");
+ KUNIT_EXPECT_FALSE_MSG(
+ test, state.hardirq_func_reported_failure,
+ "Incorrect hash values reported from hardirq context");
+ KUNIT_EXPECT_FALSE_MSG(
+ test, state.softirq_func_reported_failure,
+ "Incorrect hash values reported from softirq context");
+}
+
+#endif /* _KUNIT_RUN_IN_IRQ_CONTEXT_H */
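A hedged sketch of a KUnit case driving kunit_run_irq_test() as declared above; the worker function, per-test state, and iteration count are hypothetical.

/* Hedged sketch: exercise an operation from task, softirq, and hardirq context. */
static bool my_op_works(void *data)
{
	/* Run one iteration of the operation under test; false on mismatch. */
	return do_one_iteration(data);		/* hypothetical helper */
}

static void my_irq_context_test(struct kunit *test)
{
	struct my_test_state state = { };	/* hypothetical per-test state */

	kunit_run_irq_test(test, my_op_works, 10000, &state);
}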
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 404883c7af6e..4000ff16f295 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -8,8 +8,8 @@
#include <linux/bits.h>
#include <linux/kvm.h>
#include <linux/irqreturn.h>
-#include <linux/kref.h>
#include <linux/mutex.h>
+#include <linux/refcount.h>
#include <linux/spinlock.h>
#include <linux/static_key.h>
#include <linux/types.h>
@@ -139,10 +139,13 @@ struct vgic_irq {
bool pending_latch; /* The pending latch state used to calculate
* the pending state for both level
* and edge triggered IRQs. */
- bool active; /* not used for LPIs */
+ bool active;
+	bool pending_release;		/* Used for LPIs only: unreferenced IRQ
+					 * pending release */
+
bool enabled;
bool hw; /* Tied to HW IRQ */
- struct kref refcount; /* Used for LPIs */
+ refcount_t refcount; /* Used for LPIs */
u32 hwintid; /* HW INTID number */
unsigned int host_irq; /* linux irq corresponding to hwintid */
union {
diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h
index 45f2f278b50a..70807c679f1a 100644
--- a/include/linux/atmdev.h
+++ b/include/linux/atmdev.h
@@ -185,6 +185,7 @@ struct atmdev_ops { /* only send is required */
int (*compat_ioctl)(struct atm_dev *dev,unsigned int cmd,
void __user *arg);
#endif
+ int (*pre_send)(struct atm_vcc *vcc, struct sk_buff *skb);
int (*send)(struct atm_vcc *vcc,struct sk_buff *skb);
int (*send_bh)(struct atm_vcc *vcc, struct sk_buff *skb);
int (*send_oam)(struct atm_vcc *vcc,void *cell,int flags);
diff --git a/include/linux/audit.h b/include/linux/audit.h
index a394614ccd0b..536f8ee8da81 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -37,6 +37,8 @@ struct audit_watch;
struct audit_tree;
struct sk_buff;
struct kern_ipc_perm;
+struct lsm_id;
+struct lsm_prop;
struct audit_krule {
u32 pflags;
@@ -147,6 +149,10 @@ extern unsigned compat_signal_class[];
#define AUDIT_TTY_ENABLE BIT(0)
#define AUDIT_TTY_LOG_PASSWD BIT(1)
+/* bit values for audit_cfg_lsm */
+#define AUDIT_CFG_LSM_SECCTX_SUBJECT BIT(0)
+#define AUDIT_CFG_LSM_SECCTX_OBJECT BIT(1)
+
struct filename;
#define AUDIT_OFF 0
@@ -185,6 +191,8 @@ extern void audit_log_path_denied(int type,
const char *operation);
extern void audit_log_lost(const char *message);
+extern int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop);
+extern int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop);
extern int audit_log_task_context(struct audit_buffer *ab);
extern void audit_log_task_info(struct audit_buffer *ab);
@@ -210,6 +218,8 @@ extern u32 audit_enabled;
extern int audit_signal_info(int sig, struct task_struct *t);
+extern void audit_cfg_lsm(const struct lsm_id *lsmid, int flags);
+
#else /* CONFIG_AUDIT */
static inline __printf(4, 5)
void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
@@ -245,6 +255,16 @@ static inline void audit_log_key(struct audit_buffer *ab, char *key)
{ }
static inline void audit_log_path_denied(int type, const char *operation)
{ }
+static inline int audit_log_subj_ctx(struct audit_buffer *ab,
+ struct lsm_prop *prop)
+{
+ return 0;
+}
+static inline int audit_log_obj_ctx(struct audit_buffer *ab,
+ struct lsm_prop *prop)
+{
+ return 0;
+}
static inline int audit_log_task_context(struct audit_buffer *ab)
{
return 0;
@@ -269,6 +289,9 @@ static inline int audit_signal_info(int sig, struct task_struct *t)
return 0;
}
+static inline void audit_cfg_lsm(const struct lsm_id *lsmid, int flags)
+{ }
+
#endif /* CONFIG_AUDIT */
#ifdef CONFIG_AUDIT_COMPAT_GENERIC
@@ -527,7 +550,7 @@ static inline void audit_log_kern_module(const char *name)
static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar)
{
- if (!audit_dummy_context())
+ if (audit_enabled)
__audit_fanotify(response, friar);
}
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 2ad261082bba..c5c9d89c73ed 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -152,6 +152,10 @@ struct bdi_writeback {
struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */
struct list_head b_attached; /* attached inodes, protected by list_lock */
struct list_head offline_node; /* anchored at offline_cgwbs */
+ struct work_struct switch_work; /* work used to perform inode switching
+ * to this wb */
+ struct llist_head switch_wbs_ctxs; /* queued contexts for
+ * writeback switching */
union {
struct work_struct release_work;
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 9be2d50da09a..ea7898cc5903 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -267,7 +267,7 @@ static inline int parity8(u8 val)
* The result is not defined if no bits are set, so check that @word
* is non-zero before calling this.
*/
-static inline unsigned int __ffs64(u64 word)
+static inline __attribute_const__ unsigned int __ffs64(u64 word)
{
#if BITS_PER_LONG == 32
if (((u32)word) == 0UL)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 95886b404b16..fe1797bbec42 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -656,6 +656,7 @@ enum {
QUEUE_FLAG_SQ_SCHED, /* single queue style io dispatch */
QUEUE_FLAG_DISABLE_WBT_DEF, /* for sched to disable/enable wbt */
QUEUE_FLAG_NO_ELV_SWITCH, /* can't switch elevator any more */
+ QUEUE_FLAG_QOS_ENABLED, /* qos is enabled */
QUEUE_FLAG_MAX
};
diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h
index 0bf7d33a1048..7fcec025c5e0 100644
--- a/include/linux/cc_platform.h
+++ b/include/linux/cc_platform.h
@@ -96,6 +96,14 @@ enum cc_attr {
* enabled to run SEV-SNP guests.
*/
CC_ATTR_HOST_SEV_SNP,
+
+ /**
+ * @CC_ATTR_SNP_SECURE_AVIC: Secure AVIC mode is active.
+ *
+ * The host kernel is running with the necessary features enabled
+ * to run SEV-SNP guests with full Secure AVIC capabilities.
+ */
+ CC_ATTR_SNP_SECURE_AVIC,
};
#ifdef CONFIG_ARCH_HAS_CC_PLATFORM
diff --git a/drivers/cdx/controller/bitfield.h b/include/linux/cdx/bitfield.h
index 567f8ec47582..567f8ec47582 100644
--- a/drivers/cdx/controller/bitfield.h
+++ b/include/linux/cdx/bitfield.h
diff --git a/include/linux/cdx/edac_cdx_pcol.h b/include/linux/cdx/edac_cdx_pcol.h
new file mode 100644
index 000000000000..749db33bb482
--- /dev/null
+++ b/include/linux/cdx/edac_cdx_pcol.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Driver for AMD network controllers and boards
+ *
+ * Copyright (C) 2021, Xilinx, Inc.
+ * Copyright (C) 2022-2023, Advanced Micro Devices, Inc.
+ */
+
+#ifndef MC_CDX_PCOL_H
+#define MC_CDX_PCOL_H
+#include <linux/cdx/mcdi.h>
+
+#define MC_CMD_EDAC_GET_DDR_CONFIG_OUT_WORD_LENGTH_LEN 4
+/* Number of registers for the DDR controller */
+#define MC_CMD_GET_DDR_CONFIG_OFST 4
+#define MC_CMD_GET_DDR_CONFIG_LEN 4
+
+/***********************************/
+/* MC_CMD_EDAC_GET_DDR_CONFIG
+ * Provides detailed configuration for the DDR controller of the given index.
+ */
+#define MC_CMD_EDAC_GET_DDR_CONFIG 0x3
+
+/* MC_CMD_EDAC_GET_DDR_CONFIG_IN msgrequest */
+#define MC_CMD_EDAC_GET_DDR_CONFIG_IN_CONTROLLER_INDEX_OFST 0
+#define MC_CMD_EDAC_GET_DDR_CONFIG_IN_CONTROLLER_INDEX_LEN 4
+
+#endif /* MC_CDX_PCOL_H */
diff --git a/drivers/cdx/controller/mcdi.h b/include/linux/cdx/mcdi.h
index 54a65e9760ae..74075305cba4 100644
--- a/drivers/cdx/controller/mcdi.h
+++ b/include/linux/cdx/mcdi.h
@@ -11,16 +11,7 @@
#include <linux/kref.h>
#include <linux/rpmsg.h>
-#include "bitfield.h"
-#include "mc_cdx_pcol.h"
-
-#ifdef DEBUG
-#define CDX_WARN_ON_ONCE_PARANOID(x) WARN_ON_ONCE(x)
-#define CDX_WARN_ON_PARANOID(x) WARN_ON(x)
-#else
-#define CDX_WARN_ON_ONCE_PARANOID(x) do {} while (0)
-#define CDX_WARN_ON_PARANOID(x) do {} while (0)
-#endif
+#include "linux/cdx/bitfield.h"
/**
* enum cdx_mcdi_mode - MCDI transaction mode
@@ -36,8 +27,6 @@ enum cdx_mcdi_mode {
#define MCDI_RPC_LONG_TIMEOU (60 * HZ)
#define MCDI_RPC_POST_RST_TIME (10 * HZ)
-#define MCDI_BUF_LEN (8 + MCDI_CTL_SDU_LEN_MAX)
-
/**
* enum cdx_mcdi_cmd_state - State for an individual MCDI command
* @MCDI_STATE_QUEUED: Command not started and is waiting to run.
@@ -180,24 +169,12 @@ struct cdx_mcdi_data {
u32 fn_flags;
};
-static inline struct cdx_mcdi_iface *cdx_mcdi_if(struct cdx_mcdi *cdx)
-{
- return cdx->mcdi ? &cdx->mcdi->iface : NULL;
-}
-
-int cdx_mcdi_init(struct cdx_mcdi *cdx);
void cdx_mcdi_finish(struct cdx_mcdi *cdx);
-
+int cdx_mcdi_init(struct cdx_mcdi *cdx);
void cdx_mcdi_process_cmd(struct cdx_mcdi *cdx, struct cdx_dword *outbuf, int len);
int cdx_mcdi_rpc(struct cdx_mcdi *cdx, unsigned int cmd,
const struct cdx_dword *inbuf, size_t inlen,
struct cdx_dword *outbuf, size_t outlen, size_t *outlen_actual);
-int cdx_mcdi_rpc_async(struct cdx_mcdi *cdx, unsigned int cmd,
- const struct cdx_dword *inbuf, size_t inlen,
- cdx_mcdi_async_completer *complete,
- unsigned long cookie);
-int cdx_mcdi_wait_for_quiescence(struct cdx_mcdi *cdx,
- unsigned int timeout_jiffies);
/*
* We expect that 16- and 32-bit fields in MCDI requests and responses
@@ -215,28 +192,8 @@ int cdx_mcdi_wait_for_quiescence(struct cdx_mcdi *cdx,
#define _MCDI_DWORD(_buf, _field) \
((_buf) + (_MCDI_CHECK_ALIGN(MC_CMD_ ## _field ## _OFST, 4) >> 2))
-#define MCDI_BYTE(_buf, _field) \
- ((void)BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 1), \
- *MCDI_PTR(_buf, _field))
-#define MCDI_WORD(_buf, _field) \
- ((void)BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 2), \
- le16_to_cpu(*(__force const __le16 *)MCDI_PTR(_buf, _field)))
#define MCDI_SET_DWORD(_buf, _field, _value) \
CDX_POPULATE_DWORD_1(*_MCDI_DWORD(_buf, _field), CDX_DWORD, _value)
#define MCDI_DWORD(_buf, _field) \
CDX_DWORD_FIELD(*_MCDI_DWORD(_buf, _field), CDX_DWORD)
-#define MCDI_POPULATE_DWORD_1(_buf, _field, _name1, _value1) \
- CDX_POPULATE_DWORD_1(*_MCDI_DWORD(_buf, _field), \
- MC_CMD_ ## _name1, _value1)
-#define MCDI_SET_QWORD(_buf, _field, _value) \
- do { \
- CDX_POPULATE_DWORD_1(_MCDI_DWORD(_buf, _field)[0], \
- CDX_DWORD, (u32)(_value)); \
- CDX_POPULATE_DWORD_1(_MCDI_DWORD(_buf, _field)[1], \
- CDX_DWORD, (u64)(_value) >> 32); \
- } while (0)
-#define MCDI_QWORD(_buf, _field) \
- (CDX_DWORD_FIELD(_MCDI_DWORD(_buf, _field)[0], CDX_DWORD) | \
- (u64)CDX_DWORD_FIELD(_MCDI_DWORD(_buf, _field)[1], CDX_DWORD) << 32)
-
#endif /* CDX_MCDI_H */
diff --git a/include/linux/cfi.h b/include/linux/cfi.h
index 52a98886a455..1fd22ea6eba4 100644
--- a/include/linux/cfi.h
+++ b/include/linux/cfi.h
@@ -11,7 +11,7 @@
#include <linux/module.h>
#include <asm/cfi.h>
-#ifdef CONFIG_CFI_CLANG
+#ifdef CONFIG_CFI
extern bool cfi_warn;
enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr,
@@ -52,7 +52,7 @@ static inline u32 cfi_get_func_hash(void *func)
extern u32 cfi_bpf_hash;
extern u32 cfi_bpf_subprog_hash;
-#else /* CONFIG_CFI_CLANG */
+#else /* CONFIG_CFI */
static inline int cfi_get_offset(void) { return 0; }
static inline u32 cfi_get_func_hash(void *func) { return 0; }
@@ -60,7 +60,7 @@ static inline u32 cfi_get_func_hash(void *func) { return 0; }
#define cfi_bpf_hash 0U
#define cfi_bpf_subprog_hash 0U
-#endif /* CONFIG_CFI_CLANG */
+#endif /* CONFIG_CFI */
#ifdef CONFIG_ARCH_USES_CFI_TRAPS
bool is_cfi_trap(unsigned long addr);
diff --git a/include/linux/cfi_types.h b/include/linux/cfi_types.h
index 685f7181780f..a86af9bc8bdc 100644
--- a/include/linux/cfi_types.h
+++ b/include/linux/cfi_types.h
@@ -8,7 +8,7 @@
#ifdef __ASSEMBLY__
#include <linux/linkage.h>
-#ifdef CONFIG_CFI_CLANG
+#ifdef CONFIG_CFI
/*
* Use the __kcfi_typeid_<function> type identifier symbol to
* annotate indirectly called assembly functions. The compiler emits
@@ -29,12 +29,12 @@
#define SYM_TYPED_START(name, linkage, align...) \
SYM_TYPED_ENTRY(name, linkage, align)
-#else /* CONFIG_CFI_CLANG */
+#else /* CONFIG_CFI */
#define SYM_TYPED_START(name, linkage, align...) \
SYM_START(name, linkage, align)
-#endif /* CONFIG_CFI_CLANG */
+#endif /* CONFIG_CFI */
#ifndef SYM_TYPED_FUNC_START
#define SYM_TYPED_FUNC_START(name) \
@@ -43,7 +43,7 @@
#else /* __ASSEMBLY__ */
-#ifdef CONFIG_CFI_CLANG
+#ifdef CONFIG_CFI
#define DEFINE_CFI_TYPE(name, func) \
/* \
* Force a reference to the function so the compiler generates \
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 6b93a64115fe..93318fce31f3 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -91,6 +91,12 @@ enum {
* cgroup_threadgroup_rwsem. This makes hot path operations such as
* forks and exits into the slow path and more expensive.
*
+ * Alleviate the contention between fork, exec, exit operations and
+ * writing to cgroup.procs by taking a per threadgroup rwsem instead of
+ * the global cgroup_threadgroup_rwsem. Fork and other operations
+ * from threads in different thread groups no longer contend with
+ * writing to cgroup.procs.
+ *
* The static usage pattern of creating a cgroup, enabling controllers,
* and then seeding it with CLONE_INTO_CGROUP doesn't require write
* locking cgroup_threadgroup_rwsem and thus doesn't benefit from
@@ -140,6 +146,17 @@ enum {
__CFTYPE_ADDED = (1 << 18),
};
+enum cgroup_attach_lock_mode {
+ /* Default */
+ CGRP_ATTACH_LOCK_GLOBAL,
+
+ /* When pid=0 && threadgroup=false, see comments in cgroup_procs_write_start */
+ CGRP_ATTACH_LOCK_NONE,
+
+ /* When favordynmods is on, see comments above CGRP_ROOT_FAVOR_DYNMODS */
+ CGRP_ATTACH_LOCK_PER_THREADGROUP,
+};
+
/*
* cgroup_file is the handle for a file instance created in a cgroup which
* is used, for example, to generate file changed notifications. This can
@@ -433,6 +450,23 @@ struct cgroup_freezer_state {
* frozen, SIGSTOPped, and PTRACEd.
*/
int nr_frozen_tasks;
+
+ /* Freeze time data consistency protection */
+ seqcount_t freeze_seq;
+
+ /*
+ * Most recent time the cgroup was requested to freeze.
+ * Accesses guarded by freeze_seq counter. Writes serialized
+ * by css_set_lock.
+ */
+ u64 freeze_start_nsec;
+
+ /*
+ * Total duration the cgroup has spent freezing.
+ * Accesses guarded by freeze_seq counter. Writes serialized
+ * by css_set_lock.
+ */
+ u64 frozen_nsec;
};
struct cgroup {
@@ -746,7 +780,6 @@ struct cgroup_subsys {
int (*can_attach)(struct cgroup_taskset *tset);
void (*cancel_attach)(struct cgroup_taskset *tset);
void (*attach)(struct cgroup_taskset *tset);
- void (*post_attach)(void);
int (*can_fork)(struct task_struct *task,
struct css_set *cset);
void (*cancel_fork)(struct task_struct *task, struct css_set *cset);
@@ -822,6 +855,7 @@ struct cgroup_subsys {
};
extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+extern bool cgroup_enable_per_threadgroup_rwsem;
struct cgroup_of_peak {
unsigned long value;
@@ -833,11 +867,14 @@ struct cgroup_of_peak {
* @tsk: target task
*
* Allows cgroup operations to synchronize against threadgroup changes
- * using a percpu_rw_semaphore.
+ * using a global percpu_rw_semaphore and a per threadgroup rw_semaphore when
+ * favordynmods is on. See the comment above CGRP_ROOT_FAVOR_DYNMODS definition.
*/
static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
{
percpu_down_read(&cgroup_threadgroup_rwsem);
+ if (cgroup_enable_per_threadgroup_rwsem)
+ down_read(&tsk->signal->cgroup_threadgroup_rwsem);
}
/**
@@ -848,6 +885,8 @@ static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
*/
static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
{
+ if (cgroup_enable_per_threadgroup_rwsem)
+ up_read(&tsk->signal->cgroup_threadgroup_rwsem);
percpu_up_read(&cgroup_threadgroup_rwsem);
}
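A hedged sketch of the caller pattern around the begin/end pair shown above; the surrounding work is illustrative.

/* Hedged sketch: bracketing a threadgroup-changing operation. */
static void threadgroup_change_example(void)
{
	cgroup_threadgroup_change_begin(current);
	/* fork/exec/exit work that must not race with cgroup.procs writers */
	cgroup_threadgroup_change_end(current);
}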
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b18fb5fcb38e..7e292bdff2c1 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -27,6 +27,7 @@
#include <linux/kernel_stat.h>
#include <linux/cgroup-defs.h>
+#include <linux/cgroup_namespace.h>
struct kernel_clone_args;
@@ -354,6 +355,11 @@ static inline bool css_is_dying(struct cgroup_subsys_state *css)
return css->flags & CSS_DYING;
}
+static inline bool css_is_online(struct cgroup_subsys_state *css)
+{
+ return css->flags & CSS_ONLINE;
+}
+
static inline bool css_is_self(struct cgroup_subsys_state *css)
{
if (css == &css->cgroup->self) {
@@ -783,52 +789,6 @@ static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {}
#endif /* CONFIG_CGROUP_DATA */
-struct cgroup_namespace {
- struct ns_common ns;
- struct user_namespace *user_ns;
- struct ucounts *ucounts;
- struct css_set *root_cset;
-};
-
-extern struct cgroup_namespace init_cgroup_ns;
-
-#ifdef CONFIG_CGROUPS
-
-void free_cgroup_ns(struct cgroup_namespace *ns);
-
-struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
- struct user_namespace *user_ns,
- struct cgroup_namespace *old_ns);
-
-int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
- struct cgroup_namespace *ns);
-
-static inline void get_cgroup_ns(struct cgroup_namespace *ns)
-{
- refcount_inc(&ns->ns.count);
-}
-
-static inline void put_cgroup_ns(struct cgroup_namespace *ns)
-{
- if (refcount_dec_and_test(&ns->ns.count))
- free_cgroup_ns(ns);
-}
-
-#else /* !CONFIG_CGROUPS */
-
-static inline void free_cgroup_ns(struct cgroup_namespace *ns) { }
-static inline struct cgroup_namespace *
-copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns,
- struct cgroup_namespace *old_ns)
-{
- return old_ns;
-}
-
-static inline void get_cgroup_ns(struct cgroup_namespace *ns) { }
-static inline void put_cgroup_ns(struct cgroup_namespace *ns) { }
-
-#endif /* !CONFIG_CGROUPS */
-
#ifdef CONFIG_CGROUPS
void cgroup_enter_frozen(void);
diff --git a/include/linux/cgroup_namespace.h b/include/linux/cgroup_namespace.h
new file mode 100644
index 000000000000..78a8418558a4
--- /dev/null
+++ b/include/linux/cgroup_namespace.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_CGROUP_NAMESPACE_H
+#define _LINUX_CGROUP_NAMESPACE_H
+
+#include <linux/ns_common.h>
+
+struct cgroup_namespace {
+ struct ns_common ns;
+ struct user_namespace *user_ns;
+ struct ucounts *ucounts;
+ struct css_set *root_cset;
+};
+
+extern struct cgroup_namespace init_cgroup_ns;
+
+#ifdef CONFIG_CGROUPS
+
+static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct cgroup_namespace, ns);
+}
+
+void free_cgroup_ns(struct cgroup_namespace *ns);
+
+struct cgroup_namespace *copy_cgroup_ns(u64 flags,
+ struct user_namespace *user_ns,
+ struct cgroup_namespace *old_ns);
+
+int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
+ struct cgroup_namespace *ns);
+
+static inline void get_cgroup_ns(struct cgroup_namespace *ns)
+{
+ ns_ref_inc(ns);
+}
+
+static inline void put_cgroup_ns(struct cgroup_namespace *ns)
+{
+ if (ns_ref_put(ns))
+ free_cgroup_ns(ns);
+}
+
+#else /* !CONFIG_CGROUPS */
+
+static inline void free_cgroup_ns(struct cgroup_namespace *ns) { }
+static inline struct cgroup_namespace *
+copy_cgroup_ns(u64 flags, struct user_namespace *user_ns,
+ struct cgroup_namespace *old_ns)
+{
+ return old_ns;
+}
+
+static inline void get_cgroup_ns(struct cgroup_namespace *ns) { }
+static inline void put_cgroup_ns(struct cgroup_namespace *ns) { }
+
+#endif /* !CONFIG_CGROUPS */
+
+#endif /* _LINUX_CGROUP_NAMESPACE_H */
diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index fa4ffe037bc7..8720a0705900 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -18,23 +18,42 @@
#define KASAN_ABI_VERSION 5
/*
+ * Clang 22 added preprocessor macros to match GCC, in hopes of eventually
+ * dropping __has_feature support for sanitizers:
+ * https://github.com/llvm/llvm-project/commit/568c23bbd3303518c5056d7f03444dae4fdc8a9c
+ * Create these macros for older versions of clang so that it is easy to clean
+ * up once the minimum supported version of LLVM for building the kernel always
+ * creates these macros.
+ *
* Note: Checking __has_feature(*_sanitizer) is only true if the feature is
* enabled. Therefore it is not required to additionally check defined(CONFIG_*)
* to avoid adding redundant attributes in other configurations.
*/
+#if __has_feature(address_sanitizer) && !defined(__SANITIZE_ADDRESS__)
+#define __SANITIZE_ADDRESS__
+#endif
+#if __has_feature(hwaddress_sanitizer) && !defined(__SANITIZE_HWADDRESS__)
+#define __SANITIZE_HWADDRESS__
+#endif
+#if __has_feature(thread_sanitizer) && !defined(__SANITIZE_THREAD__)
+#define __SANITIZE_THREAD__
+#endif
-#if __has_feature(address_sanitizer) || __has_feature(hwaddress_sanitizer)
-/* Emulate GCC's __SANITIZE_ADDRESS__ flag */
+/*
+ * Treat __SANITIZE_HWADDRESS__ the same as __SANITIZE_ADDRESS__ in the kernel.
+ */
+#ifdef __SANITIZE_HWADDRESS__
#define __SANITIZE_ADDRESS__
+#endif
+
+#ifdef __SANITIZE_ADDRESS__
#define __no_sanitize_address \
__attribute__((no_sanitize("address", "hwaddress")))
#else
#define __no_sanitize_address
#endif
-#if __has_feature(thread_sanitizer)
-/* emulate gcc's __SANITIZE_THREAD__ flag */
-#define __SANITIZE_THREAD__
+#ifdef __SANITIZE_THREAD__
#define __no_sanitize_thread \
__attribute__((no_sanitize("thread")))
#else
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 6f04a1d8c720..5b45ea7dff3e 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -248,7 +248,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
#endif /* __KERNEL__ */
-#if defined(CONFIG_CFI_CLANG) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO)
+#if defined(CONFIG_CFI) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO)
/*
* Force a reference to the external symbol so the compiler generates
* __kcfi_typid.
@@ -288,14 +288,6 @@ static inline void *offset_to_ptr(const int *off)
#define __ADDRESSABLE(sym) \
___ADDRESSABLE(sym, __section(".discard.addressable"))
-#define __ADDRESSABLE_ASM(sym) \
- .pushsection .discard.addressable,"aw"; \
- .align ARCH_SEL(8,4); \
- ARCH_SEL(.quad, .long) __stringify(sym); \
- .popsection;
-
-#define __ADDRESSABLE_ASM_STR(sym) __stringify(__ADDRESSABLE_ASM(sym))
-
/*
* This returns a constant expression while determining if an argument is
* a constant expression, most importantly without evaluating the argument.
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 16755431fc11..2f3e80bf9f35 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -330,6 +330,29 @@ struct ftrace_likely_data {
#endif
/*
+ * The assume attribute is used to indicate that a certain condition is
+ * assumed to be true. If this condition is violated at runtime, the behavior
+ * is undefined. Compilers may or may not use this indication to generate
+ * optimized code.
+ *
+ * Note that the clang documentation states that optimizers may react
+ * differently to this attribute, and this may even have a negative
+ * performance impact. Therefore this attribute should be used with care.
+ *
+ * Optional: only supported since gcc >= 13
+ * Optional: only supported since clang >= 19
+ *
+ * gcc: https://gcc.gnu.org/onlinedocs/gcc/Statement-Attributes.html#index-assume-statement-attribute
+ * clang: https://clang.llvm.org/docs/AttributeReference.html#id13
+ *
+ */
+#ifdef CONFIG_CC_HAS_ASSUME
+# define __assume(expr) __attribute__((__assume__(expr)))
+#else
+# define __assume(expr)
+#endif
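A hedged example of the __assume() statement attribute described above; the function and the bound are illustrative. As the comment warns, the condition must genuinely hold at runtime.

/* Hedged sketch: telling the optimizer about an invariant established earlier.
 * Behavior is undefined if the condition is ever false at runtime.
 */
static inline u32 small_index(u32 x)
{
	__assume(x < 16);
	return x;	/* the compiler may drop range checks derived from this */
}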
+
+/*
* Optional: only supported since gcc >= 15
* Optional: only supported since clang >= 18
*
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index b91b993f58ee..487b3bf2e1ea 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -83,6 +83,7 @@ extern ssize_t cpu_show_old_microcode(struct device *dev,
extern ssize_t cpu_show_indirect_target_selection(struct device *dev,
struct device_attribute *attr, char *buf);
extern ssize_t cpu_show_tsa(struct device *dev, struct device_attribute *attr, char *buf);
+extern ssize_t cpu_show_vmscape(struct device *dev, struct device_attribute *attr, char *buf);
extern __printf(4, 5)
struct device *cpu_device_create(struct device *parent, void *drvdata,
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index edfa61d80702..62cd7b35a29c 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -168,6 +168,7 @@ enum cpuhp_state {
CPUHP_AP_QCOM_TIMER_STARTING,
CPUHP_AP_TEGRA_TIMER_STARTING,
CPUHP_AP_ARMADA_TIMER_STARTING,
+ CPUHP_AP_LOONGARCH_ARCH_TIMER_STARTING,
CPUHP_AP_MIPS_GIC_TIMER_STARTING,
CPUHP_AP_ARC_TIMER_STARTING,
CPUHP_AP_REALTEK_TIMER_STARTING,
diff --git a/include/linux/cred.h b/include/linux/cred.h
index a102a10f833f..89ae50ad2ace 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -148,7 +148,7 @@ struct cred {
extern void __put_cred(struct cred *);
extern void exit_creds(struct task_struct *);
-extern int copy_creds(struct task_struct *, unsigned long);
+extern int copy_creds(struct task_struct *, u64);
extern const struct cred *get_task_cred(struct task_struct *);
extern struct cred *cred_alloc_blank(void);
extern struct cred *prepare_creds(void);
diff --git a/include/linux/damon.h b/include/linux/damon.h
index f13664c62ddd..9e62b2a85538 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -636,6 +636,7 @@ struct damon_operations {
* @data: Data that will be passed to @fn.
* @repeat: Repeat invocations.
* @return_code: Return code from @fn invocation.
+ * @dealloc_on_cancel: De-allocate when canceled.
*
* Control damon_call(), which requests specific kdamond to invoke a given
* function. Refer to damon_call() for more details.
@@ -645,6 +646,7 @@ struct damon_call_control {
void *data;
bool repeat;
int return_code;
+ bool dealloc_on_cancel;
/* private: internal use only */
/* informs if the kdamond finished handling of the request */
struct completion completion;
diff --git a/include/linux/dlm.h b/include/linux/dlm.h
index bacda9898f2b..7e7b45b0d097 100644
--- a/include/linux/dlm.h
+++ b/include/linux/dlm.h
@@ -88,12 +88,43 @@ int dlm_new_lockspace(const char *name, const char *cluster,
int *ops_result, dlm_lockspace_t **lockspace);
/*
+ * dlm_release_lockspace() release_option values:
+ *
+ * DLM_RELEASE_NO_LOCKS returns -EBUSY if any locks (lkb's)
+ * exist in the local lockspace.
+ *
+ * DLM_RELEASE_UNUSED previous value that is no longer used.
+ *
+ * DLM_RELEASE_NORMAL releases the lockspace regardless of any
+ * locks managed in the local lockspace.
+ *
+ * DLM_RELEASE_NO_EVENT releases the lockspace regardless of any
+ * locks managed in the local lockspace, and does not submit
+ * a leave event to the cluster manager, so other nodes will
+ * not be notified that the node should be removed from the
+ * list of lockspace members.
+ *
+ * DLM_RELEASE_RECOVER like DLM_RELEASE_NORMAL, but the remaining
+ * nodes will handle the removal of the node as if the node
+ * had failed, e.g. the recover_slot() callback would be used.
+ */
+#define DLM_RELEASE_NO_LOCKS 0
+#define DLM_RELEASE_UNUSED 1
+#define DLM_RELEASE_NORMAL 2
+#define DLM_RELEASE_NO_EVENT 3
+#define DLM_RELEASE_RECOVER 4
+#define __DLM_RELEASE_MAX DLM_RELEASE_RECOVER
+
+/*
* dlm_release_lockspace
*
* Stop a lockspace.
+ *
+ * release_option: see DLM_RELEASE values above.
*/
-int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force);
+int dlm_release_lockspace(dlm_lockspace_t *lockspace,
+ unsigned int release_option);
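A hedged sketch of a caller choosing between the release options documented above: try the conservative option first, then fall back to an unconditional release.

/* Hedged sketch: prefer a clean release, then force it if locks remain. */
static int release_ls(dlm_lockspace_t *ls)
{
	int error;

	error = dlm_release_lockspace(ls, DLM_RELEASE_NO_LOCKS);
	if (error == -EBUSY)		/* local locks (lkb's) still exist */
		error = dlm_release_lockspace(ls, DLM_RELEASE_NORMAL);
	return error;
}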
/*
* dlm_lock
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index f48e5fb88bd5..332b80c42b6f 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -153,6 +153,9 @@ static inline void dma_free_contiguous(struct device *dev, struct page *page,
{
__free_pages(page, get_order(size));
}
+static inline void dma_contiguous_early_fixup(phys_addr_t base, unsigned long size)
+{
+}
#endif /* CONFIG_DMA_CMA*/
#ifdef CONFIG_DMA_DECLARE_COHERENT
diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 7fa1eb3cc823..61d50571ad88 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -171,6 +171,9 @@ int em_dev_update_perf_domain(struct device *dev,
int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
const struct em_data_callback *cb,
const cpumask_t *cpus, bool microwatts);
+int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states,
+ const struct em_data_callback *cb,
+ const cpumask_t *cpus, bool microwatts);
void em_dev_unregister_perf_domain(struct device *dev);
struct em_perf_table *em_table_alloc(struct em_perf_domain *pd);
void em_table_free(struct em_perf_table *table);
@@ -350,6 +353,13 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
{
return -EINVAL;
}
+static inline
+int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states,
+ const struct em_data_callback *cb,
+ const cpumask_t *cpus, bool microwatts)
+{
+ return -EINVAL;
+}
static inline void em_dev_unregister_perf_domain(struct device *dev)
{
}
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index de5bd76a400c..d7d757e72554 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -856,8 +856,8 @@ struct kernel_ethtool_ts_info {
enum hwtstamp_provider_qualifier phc_qualifier;
enum hwtstamp_source phc_source;
int phc_phyindex;
- enum hwtstamp_tx_types tx_types;
- enum hwtstamp_rx_filters rx_filters;
+ u32 tx_types;
+ u32 rx_filters;
};
/**
diff --git a/include/linux/export.h b/include/linux/export.h
index f35d03b4113b..a686fd0ba406 100644
--- a/include/linux/export.h
+++ b/include/linux/export.h
@@ -91,6 +91,6 @@
#define EXPORT_SYMBOL_NS(sym, ns) __EXPORT_SYMBOL(sym, "", ns)
#define EXPORT_SYMBOL_NS_GPL(sym, ns) __EXPORT_SYMBOL(sym, "GPL", ns)
-#define EXPORT_SYMBOL_GPL_FOR_MODULES(sym, mods) __EXPORT_SYMBOL(sym, "GPL", "module:" mods)
+#define EXPORT_SYMBOL_FOR_MODULES(sym, mods) __EXPORT_SYMBOL(sym, "GPL", "module:" mods)
#endif /* _LINUX_EXPORT_H */
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index cfb0dd1ea49c..3aac58a520c7 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -123,6 +123,12 @@ enum fid_type {
FILEID_BCACHEFS_WITH_PARENT = 0xb2,
/*
+ *
+ * 64 bit namespace identifier, 32 bit namespace type, 32 bit inode number.
+ */
+ FILEID_NSFS = 0xf1,
+
+ /*
* 64 bit unique kernfs id
*/
FILEID_KERNFS = 0xfe,
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index cceb70415ed2..d38c6e538e5c 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -341,7 +341,11 @@ struct fw_address_handler {
u64 length;
fw_address_callback_t address_callback;
void *callback_data;
+
+ // Only for core functions.
struct list_head link;
+ struct kref kref;
+ struct completion done;
};
struct fw_address_region {
diff --git a/include/linux/firmware/imx/sm.h b/include/linux/firmware/imx/sm.h
index d4212bc42b2c..a33b45027356 100644
--- a/include/linux/firmware/imx/sm.h
+++ b/include/linux/firmware/imx/sm.h
@@ -26,13 +26,43 @@
#define SCMI_IMX94_CTRL_SAI3_MCLK 5U /*!< WAKE SAI3 MCLK */
#define SCMI_IMX94_CTRL_SAI4_MCLK 6U /*!< WAKE SAI4 MCLK */
+#if IS_ENABLED(CONFIG_IMX_SCMI_MISC_DRV)
int scmi_imx_misc_ctrl_get(u32 id, u32 *num, u32 *val);
int scmi_imx_misc_ctrl_set(u32 id, u32 val);
+#else
+static inline int scmi_imx_misc_ctrl_get(u32 id, u32 *num, u32 *val)
+{
+ return -EOPNOTSUPP;
+}
+static inline int scmi_imx_misc_ctrl_set(u32 id, u32 val)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_IMX_SCMI_CPU_DRV)
int scmi_imx_cpu_start(u32 cpuid, bool start);
int scmi_imx_cpu_started(u32 cpuid, bool *started);
int scmi_imx_cpu_reset_vector_set(u32 cpuid, u64 vector, bool start, bool boot,
bool resume);
+#else
+static inline int scmi_imx_cpu_start(u32 cpuid, bool start)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int scmi_imx_cpu_started(u32 cpuid, bool *started)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int scmi_imx_cpu_reset_vector_set(u32 cpuid, u64 vector, bool start,
+ bool boot, bool resume)
+{
+ return -EOPNOTSUPP;
+}
+#endif
enum scmi_imx_lmm_op {
SCMI_IMX_LMM_BOOT,
@@ -44,7 +74,24 @@ enum scmi_imx_lmm_op {
#define SCMI_IMX_LMM_OP_FORCEFUL 0
#define SCMI_IMX_LMM_OP_GRACEFUL BIT(0)
+#if IS_ENABLED(CONFIG_IMX_SCMI_LMM_DRV)
int scmi_imx_lmm_operation(u32 lmid, enum scmi_imx_lmm_op op, u32 flags);
int scmi_imx_lmm_info(u32 lmid, struct scmi_imx_lmm_info *info);
int scmi_imx_lmm_reset_vector_set(u32 lmid, u32 cpuid, u32 flags, u64 vector);
+#else
+static inline int scmi_imx_lmm_operation(u32 lmid, enum scmi_imx_lmm_op op, u32 flags)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int scmi_imx_lmm_info(u32 lmid, struct scmi_imx_lmm_info *info)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int scmi_imx_lmm_reset_vector_set(u32 lmid, u32 cpuid, u32 flags, u64 vector)
+{
+ return -EOPNOTSUPP;
+}
+#endif
#endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d7ab4f96d705..9e9d7c757efe 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -72,9 +72,7 @@ struct swap_info_struct;
struct seq_file;
struct workqueue_struct;
struct iov_iter;
-struct fscrypt_inode_info;
struct fscrypt_operations;
-struct fsverity_info;
struct fsverity_operations;
struct fsnotify_mark_connector;
struct fsnotify_sb_info;
@@ -149,7 +147,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
/* Expect random access pattern */
#define FMODE_RANDOM ((__force fmode_t)(1 << 12))
-/* FMODE_* bit 13 */
+/* Supports IOCB_HAS_METADATA */
+#define FMODE_HAS_METADATA ((__force fmode_t)(1 << 13))
/* File is opened with O_PATH; almost nothing can be done with it */
#define FMODE_PATH ((__force fmode_t)(1 << 14))
@@ -356,6 +355,7 @@ struct readahead_control;
#define IOCB_APPEND (__force int) RWF_APPEND
#define IOCB_ATOMIC (__force int) RWF_ATOMIC
#define IOCB_DONTCACHE (__force int) RWF_DONTCACHE
+#define IOCB_NOSIGNAL (__force int) RWF_NOSIGNAL
/* non-RWF related bits - start at 16 */
#define IOCB_EVENTFD (1 << 16)
@@ -667,6 +667,124 @@ is_uncached_acl(struct posix_acl *acl)
#define IOP_CACHED_LINK 0x0040
/*
+ * Inode state bits. Protected by inode->i_lock
+ *
+ * Four bits determine the dirty state of the inode: I_DIRTY_SYNC,
+ * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME.
+ *
+ * Four bits define the lifetime of an inode. Initially, inodes are I_NEW,
+ * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at
+ * various stages of removing an inode.
+ *
+ * Two bits are used for locking and completion notification, I_NEW and I_SYNC.
+ *
+ * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on
+ * fdatasync() (unless I_DIRTY_DATASYNC is also set).
+ * Timestamp updates are the usual cause.
+ * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of
+ * these changes separately from I_DIRTY_SYNC so that we
+ * don't have to write inode on fdatasync() when only
+ * e.g. the timestamps have changed.
+ * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean.
+ * I_DIRTY_TIME The inode itself has dirty timestamps, and the
+ * lazytime mount option is enabled. We keep track of this
+ * separately from I_DIRTY_SYNC in order to implement
+ * lazytime. This gets cleared if I_DIRTY_INODE
+ * (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. But
+ * I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already
+ * in place because writeback might already be in progress
+ * and we don't want to lose the time update
+ * I_NEW Serves as both a mutex and completion notification.
+ * New inodes set I_NEW. If two processes both create
+ * the same inode, one of them will release its inode and
+ * wait for I_NEW to be released before returning.
+ * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can
+ * also cause waiting on I_NEW, without I_NEW actually
+ * being set. find_inode() uses this to prevent returning
+ * nearly-dead inodes.
+ * I_WILL_FREE Must be set when calling write_inode_now() if i_count
+ * is zero. I_FREEING must be set when I_WILL_FREE is
+ * cleared.
+ * I_FREEING Set when inode is about to be freed but still has dirty
+ * pages or buffers attached or the inode itself is still
+ * dirty.
+ * I_CLEAR Added by clear_inode(). In this state the inode is
+ * clean and can be destroyed. Inode keeps I_FREEING.
+ *
+ * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are
+ * prohibited for many purposes. iget() must wait for
+ * the inode to be completely released, then create it
+ * anew. Other functions will just ignore such inodes,
+ * if appropriate. I_NEW is used for waiting.
+ *
+ * I_SYNC Writeback of inode is running. The bit is set during
+ * data writeback, and cleared with a wakeup on the bit
+ * address once it is done. The bit is also used to pin
+ * the inode in memory for flusher thread.
+ *
+ * I_REFERENCED		Marks the inode as recently referenced on the LRU list.
+ *
+ * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to
+ * synchronize competing switching instances and to tell
+ * wb stat updates to grab the i_pages lock. See
+ * inode_switch_wbs_work_fn() for details.
+ *
+ * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper
+ * and work dirs among overlayfs mounts.
+ *
+ * I_CREATING New object's inode in the middle of setting up.
+ *
+ * I_DONTCACHE Evict inode as soon as it is not used anymore.
+ *
+ * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists.
+ * Used to detect that mark_inode_dirty() should not move
+ * inode between dirty lists.
+ *
+ * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback.
+ *
+ * I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding
+ * i_count.
+ *
+ * Q: What is the difference between I_WILL_FREE and I_FREEING?
+ *
+ * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait
+ * upon. There's one free address left.
+ */
+
+enum inode_state_bits {
+ __I_NEW = 0U,
+ __I_SYNC = 1U,
+ __I_LRU_ISOLATING = 2U
+ /* reserved wait address bit 3 */
+};
+
+enum inode_state_flags_t {
+ I_NEW = (1U << __I_NEW),
+ I_SYNC = (1U << __I_SYNC),
+ I_LRU_ISOLATING = (1U << __I_LRU_ISOLATING),
+ /* reserved flag bit 3 */
+ I_DIRTY_SYNC = (1U << 4),
+ I_DIRTY_DATASYNC = (1U << 5),
+ I_DIRTY_PAGES = (1U << 6),
+ I_WILL_FREE = (1U << 7),
+ I_FREEING = (1U << 8),
+ I_CLEAR = (1U << 9),
+ I_REFERENCED = (1U << 10),
+ I_LINKABLE = (1U << 11),
+ I_DIRTY_TIME = (1U << 12),
+ I_WB_SWITCH = (1U << 13),
+ I_OVL_INUSE = (1U << 14),
+ I_CREATING = (1U << 15),
+ I_DONTCACHE = (1U << 16),
+ I_SYNC_QUEUED = (1U << 17),
+ I_PINNING_NETFS_WB = (1U << 18)
+};
+
+#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
+#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)
+#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
+
+/*
* Keep mostly read-only and often accessed (especially for
* the RCU path lookup and 'stat' data) fields at the beginning
* of the 'struct inode'
@@ -724,7 +842,7 @@ struct inode {
#endif
/* Misc */
- u32 i_state;
+ enum inode_state_flags_t i_state;
/* 32-bit hole */
struct rw_semaphore i_rwsem;
@@ -780,14 +898,6 @@ struct inode {
struct fsnotify_mark_connector __rcu *i_fsnotify_marks;
#endif
-#ifdef CONFIG_FS_ENCRYPTION
- struct fscrypt_inode_info *i_crypt_info;
-#endif
-
-#ifdef CONFIG_FS_VERITY
- struct fsverity_info *i_verity_info;
-#endif
-
void *i_private; /* fs or device private pointer */
} __randomize_layout;
@@ -2008,20 +2118,18 @@ int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *,
/**
* struct renamedata - contains all information required for renaming
- * @old_mnt_idmap: idmap of the old mount the inode was found from
+ * @mnt_idmap: idmap of the mount in which the rename is happening.
* @old_parent: parent of source
* @old_dentry: source
- * @new_mnt_idmap: idmap of the new mount the inode was found from
* @new_parent: parent of destination
* @new_dentry: destination
* @delegated_inode: returns an inode needing a delegation break
* @flags: rename flags
*/
struct renamedata {
- struct mnt_idmap *old_mnt_idmap;
+ struct mnt_idmap *mnt_idmap;
struct dentry *old_parent;
struct dentry *old_dentry;
- struct mnt_idmap *new_mnt_idmap;
struct dentry *new_parent;
struct dentry *new_dentry;
struct inode **delegated_inode;
@@ -2052,8 +2160,6 @@ int vfs_fchown(struct file *file, uid_t user, gid_t group);
int vfs_fchmod(struct file *file, umode_t mode);
int vfs_utimes(const struct path *path, struct timespec64 *times);
-int vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
-
#ifdef CONFIG_COMPAT
extern long compat_ptr_ioctl(struct file *file, unsigned int cmd,
unsigned long arg);
@@ -2492,117 +2598,6 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
};
}
-/*
- * Inode state bits. Protected by inode->i_lock
- *
- * Four bits determine the dirty state of the inode: I_DIRTY_SYNC,
- * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME.
- *
- * Four bits define the lifetime of an inode. Initially, inodes are I_NEW,
- * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at
- * various stages of removing an inode.
- *
- * Two bits are used for locking and completion notification, I_NEW and I_SYNC.
- *
- * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on
- * fdatasync() (unless I_DIRTY_DATASYNC is also set).
- * Timestamp updates are the usual cause.
- * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of
- * these changes separately from I_DIRTY_SYNC so that we
- * don't have to write inode on fdatasync() when only
- * e.g. the timestamps have changed.
- * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean.
- * I_DIRTY_TIME The inode itself has dirty timestamps, and the
- * lazytime mount option is enabled. We keep track of this
- * separately from I_DIRTY_SYNC in order to implement
- * lazytime. This gets cleared if I_DIRTY_INODE
- * (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. But
- * I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already
- * in place because writeback might already be in progress
- * and we don't want to lose the time update
- * I_NEW Serves as both a mutex and completion notification.
- * New inodes set I_NEW. If two processes both create
- * the same inode, one of them will release its inode and
- * wait for I_NEW to be released before returning.
- * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can
- * also cause waiting on I_NEW, without I_NEW actually
- * being set. find_inode() uses this to prevent returning
- * nearly-dead inodes.
- * I_WILL_FREE Must be set when calling write_inode_now() if i_count
- * is zero. I_FREEING must be set when I_WILL_FREE is
- * cleared.
- * I_FREEING Set when inode is about to be freed but still has dirty
- * pages or buffers attached or the inode itself is still
- * dirty.
- * I_CLEAR Added by clear_inode(). In this state the inode is
- * clean and can be destroyed. Inode keeps I_FREEING.
- *
- * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are
- * prohibited for many purposes. iget() must wait for
- * the inode to be completely released, then create it
- * anew. Other functions will just ignore such inodes,
- * if appropriate. I_NEW is used for waiting.
- *
- * I_SYNC Writeback of inode is running. The bit is set during
- * data writeback, and cleared with a wakeup on the bit
- * address once it is done. The bit is also used to pin
- * the inode in memory for flusher thread.
- *
- * I_REFERENCED Marks the inode as recently references on the LRU list.
- *
- * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to
- * synchronize competing switching instances and to tell
- * wb stat updates to grab the i_pages lock. See
- * inode_switch_wbs_work_fn() for details.
- *
- * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper
- * and work dirs among overlayfs mounts.
- *
- * I_CREATING New object's inode in the middle of setting up.
- *
- * I_DONTCACHE Evict inode as soon as it is not used anymore.
- *
- * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists.
- * Used to detect that mark_inode_dirty() should not move
- * inode between dirty lists.
- *
- * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback.
- *
- * I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding
- * i_count.
- *
- * Q: What is the difference between I_WILL_FREE and I_FREEING?
- *
- * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait
- * upon. There's one free address left.
- */
-#define __I_NEW 0
-#define I_NEW (1 << __I_NEW)
-#define __I_SYNC 1
-#define I_SYNC (1 << __I_SYNC)
-#define __I_LRU_ISOLATING 2
-#define I_LRU_ISOLATING (1 << __I_LRU_ISOLATING)
-
-#define I_DIRTY_SYNC (1 << 3)
-#define I_DIRTY_DATASYNC (1 << 4)
-#define I_DIRTY_PAGES (1 << 5)
-#define I_WILL_FREE (1 << 6)
-#define I_FREEING (1 << 7)
-#define I_CLEAR (1 << 8)
-#define I_REFERENCED (1 << 9)
-#define I_LINKABLE (1 << 10)
-#define I_DIRTY_TIME (1 << 11)
-#define I_WB_SWITCH (1 << 12)
-#define I_OVL_INUSE (1 << 13)
-#define I_CREATING (1 << 14)
-#define I_DONTCACHE (1 << 15)
-#define I_SYNC_QUEUED (1 << 16)
-#define I_PINNING_NETFS_WB (1 << 17)
-
-#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
-#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)
-#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
-
extern void __mark_inode_dirty(struct inode *, int);
static inline void mark_inode_dirty(struct inode *inode)
{
@@ -2614,6 +2609,11 @@ static inline void mark_inode_dirty_sync(struct inode *inode)
__mark_inode_dirty(inode, I_DIRTY_SYNC);
}
+static inline int icount_read(const struct inode *inode)
+{
+ return atomic_read(&inode->i_count);
+}
+
/*
* Returns true if the given inode itself only has dirty timestamps (its pages
* may still be dirty) and isn't currently being allocated or freed.
@@ -2713,12 +2713,6 @@ static inline bool is_mgtime(const struct inode *inode)
return inode->i_opflags & IOP_MGTIME;
}
-extern struct dentry *mount_bdev(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data,
- int (*fill_super)(struct super_block *, void *, int));
-extern struct dentry *mount_nodev(struct file_system_type *fs_type,
- int flags, void *data,
- int (*fill_super)(struct super_block *, void *, int));
extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path);
void retire_super(struct super_block *sb);
void generic_shutdown_super(struct super_block *sb);
@@ -3281,7 +3275,7 @@ static inline bool is_dot_dotdot(const char *name, size_t len)
/**
* name_contains_dotdot - check if a file name contains ".." path components
- *
+ * @name: File path string to check
* Search for ".." surrounded by either '/' or start/end of string.
*/
static inline bool name_contains_dotdot(const char *name)
@@ -3313,8 +3307,8 @@ extern void address_space_init_once(struct address_space *mapping);
extern struct inode * igrab(struct inode *);
extern ino_t iunique(struct super_block *, ino_t);
extern int inode_needs_sync(struct inode *inode);
-extern int generic_delete_inode(struct inode *inode);
-static inline int generic_drop_inode(struct inode *inode)
+extern int inode_just_drop(struct inode *inode);
+static inline int inode_generic_drop(struct inode *inode)
{
return !inode->i_nlink || inode_unhashed(inode);
}
@@ -3393,7 +3387,6 @@ static inline struct inode *new_inode_pseudo(struct super_block *sb)
extern struct inode *new_inode(struct super_block *sb);
extern void free_inode_nonrcu(struct inode *inode);
extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *);
-extern int file_remove_privs_flags(struct file *file, unsigned int flags);
extern int file_remove_privs(struct file *);
int setattr_should_drop_sgid(struct mnt_idmap *idmap,
const struct inode *inode);
@@ -4023,4 +4016,18 @@ static inline bool vfs_empty_path(int dfd, const char __user *path)
int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter);
+static inline bool extensible_ioctl_valid(unsigned int cmd_a,
+ unsigned int cmd_b, size_t min_size)
+{
+ if (_IOC_DIR(cmd_a) != _IOC_DIR(cmd_b))
+ return false;
+ if (_IOC_TYPE(cmd_a) != _IOC_TYPE(cmd_b))
+ return false;
+ if (_IOC_NR(cmd_a) != _IOC_NR(cmd_b))
+ return false;
+ if (_IOC_SIZE(cmd_a) < min_size)
+ return false;
+ return true;
+}
+
#endif /* _LINUX_FS_H */
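
The new extensible_ioctl_valid() helper accepts a user-supplied command whose direction, type and number match a reference command and whose argument size is at least the size the kernel currently knows about, which lets the argument struct grow over time. An illustrative sketch, not taken from the patch: struct foo_arg, FOO_IOC_GET and the handler are hypothetical.

struct foo_arg {
	__u32 flags;
	__u32 value;
};
#define FOO_IOC_GET	_IOR('f', 1, struct foo_arg)

static long foo_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct foo_arg a;

	/* Newer userspace may pass a larger struct; only require our size. */
	if (!extensible_ioctl_valid(cmd, FOO_IOC_GET, sizeof(a)))
		return -ENOTTY;
	if (copy_from_user(&a, (void __user *)arg, sizeof(a)))
		return -EFAULT;
	/* ... act on the fields this kernel understands ... */
	return 0;
}
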
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index 7773eb870039..671f031be173 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -186,10 +186,12 @@ struct fc_log {
extern __attribute__((format(printf, 4, 5)))
void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, ...);
-#define __logfc(fc, l, fmt, ...) logfc((fc)->log.log, NULL, \
- l, fmt, ## __VA_ARGS__)
-#define __plog(p, l, fmt, ...) logfc((p)->log, (p)->prefix, \
- l, fmt, ## __VA_ARGS__)
+#define __logfc(fc, l, fmt, ...) \
+ logfc((fc)->log.log, NULL, (l), (fmt), ## __VA_ARGS__)
+#define __plogp(p, prefix, l, fmt, ...) \
+ logfc((p)->log, (prefix), (l), (fmt), ## __VA_ARGS__)
+#define __plog(p, l, fmt, ...) __plogp(p, (p)->prefix, l, fmt, ## __VA_ARGS__)
+
/**
* infof - Store supplementary informational message
* @fc: The context in which to log the informational message
@@ -201,6 +203,8 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt,
#define infof(fc, fmt, ...) __logfc(fc, 'i', fmt, ## __VA_ARGS__)
#define info_plog(p, fmt, ...) __plog(p, 'i', fmt, ## __VA_ARGS__)
#define infofc(fc, fmt, ...) __plog((&(fc)->log), 'i', fmt, ## __VA_ARGS__)
+#define infofcp(fc, prefix, fmt, ...) \
+ __plogp((&(fc)->log), prefix, 'i', fmt, ## __VA_ARGS__)
/**
* warnf - Store supplementary warning message
@@ -213,6 +217,8 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt,
#define warnf(fc, fmt, ...) __logfc(fc, 'w', fmt, ## __VA_ARGS__)
#define warn_plog(p, fmt, ...) __plog(p, 'w', fmt, ## __VA_ARGS__)
#define warnfc(fc, fmt, ...) __plog((&(fc)->log), 'w', fmt, ## __VA_ARGS__)
+#define warnfcp(fc, prefix, fmt, ...) \
+ __plogp((&(fc)->log), prefix, 'w', fmt, ## __VA_ARGS__)
/**
* errorf - Store supplementary error message
@@ -225,6 +231,8 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt,
#define errorf(fc, fmt, ...) __logfc(fc, 'e', fmt, ## __VA_ARGS__)
#define error_plog(p, fmt, ...) __plog(p, 'e', fmt, ## __VA_ARGS__)
#define errorfc(fc, fmt, ...) __plog((&(fc)->log), 'e', fmt, ## __VA_ARGS__)
+#define errorfcp(fc, prefix, fmt, ...) \
+ __plogp((&(fc)->log), prefix, 'e', fmt, ## __VA_ARGS__)
/**
* invalf - Store supplementary invalid argument error message
@@ -237,5 +245,7 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt,
#define invalf(fc, fmt, ...) (errorf(fc, fmt, ## __VA_ARGS__), -EINVAL)
#define inval_plog(p, fmt, ...) (error_plog(p, fmt, ## __VA_ARGS__), -EINVAL)
#define invalfc(fc, fmt, ...) (errorfc(fc, fmt, ## __VA_ARGS__), -EINVAL)
+#define invalfcp(fc, prefix, fmt, ...) \
+ (errorfcp(fc, prefix, fmt, ## __VA_ARGS__), -EINVAL)
#endif /* _LINUX_FS_CONTEXT_H */
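
The new *fcp variants take the log prefix from the caller instead of fc->log.prefix, which is useful when one fs_context logs on behalf of several sub-devices or legacy names. A rough sketch of a mount option parser; "myfs" and the option handling are hypothetical:

static int myfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
	if (!strcmp(param->key, "mode") && !param->string)
		return invalfcp(fc, "myfs", "mode requires an argument");

	infofcp(fc, "myfs", "accepted option %s", param->key);
	return 0;
}
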
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index 10dd161690a2..516aba5b858b 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -61,6 +61,12 @@ struct fscrypt_name {
/* Crypto operations for filesystems */
struct fscrypt_operations {
+ /*
+ * The offset of the pointer to struct fscrypt_inode_info in the
+ * filesystem-specific part of the inode, relative to the beginning of
+ * the common part of the inode (the 'struct inode').
+ */
+ ptrdiff_t inode_info_offs;
/*
* If set, then fs/crypto/ will allocate a global bounce page pool the
@@ -195,16 +201,44 @@ struct fscrypt_operations {
int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name,
struct dentry *dentry, unsigned int flags);
+/*
+ * Returns the address of the fscrypt info pointer within the
+ * filesystem-specific part of the inode. (To save memory on filesystems that
+ * don't support fscrypt, a field in 'struct inode' itself is no longer used.)
+ */
+static inline struct fscrypt_inode_info **
+fscrypt_inode_info_addr(const struct inode *inode)
+{
+ VFS_WARN_ON_ONCE(inode->i_sb->s_cop->inode_info_offs == 0);
+ return (void *)inode + inode->i_sb->s_cop->inode_info_offs;
+}
+
+/*
+ * Load the inode's fscrypt info pointer, using a raw dereference. Since this
+ * uses a raw dereference with no memory barrier, it is appropriate to use only
+ * when the caller knows the inode's key setup already happened, resulting in
+ * non-NULL fscrypt info. E.g., the file contents en/decryption functions use
+ * this, since fscrypt_file_open() set up the key.
+ */
+static inline struct fscrypt_inode_info *
+fscrypt_get_inode_info_raw(const struct inode *inode)
+{
+ struct fscrypt_inode_info *ci = *fscrypt_inode_info_addr(inode);
+
+ VFS_WARN_ON_ONCE(ci == NULL);
+ return ci;
+}
+
static inline struct fscrypt_inode_info *
fscrypt_get_inode_info(const struct inode *inode)
{
/*
* Pairs with the cmpxchg_release() in fscrypt_setup_encryption_info().
- * I.e., another task may publish ->i_crypt_info concurrently, executing
- * a RELEASE barrier. We need to use smp_load_acquire() here to safely
+ * I.e., another task may publish the fscrypt info concurrently,
+ * executing a RELEASE barrier. Use smp_load_acquire() here to safely
* ACQUIRE the memory the other task published.
*/
- return smp_load_acquire(&inode->i_crypt_info);
+ return smp_load_acquire(fscrypt_inode_info_addr(inode));
}
/**
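
With ->i_crypt_info gone from struct inode, a filesystem that supports fscrypt stores the pointer in its own inode structure and publishes its location through inode_info_offs. A sketch under assumptions: struct myfs_inode_info and its layout are hypothetical, only the offset arithmetic matches fscrypt_inode_info_addr() above.

struct myfs_inode_info {
	struct fscrypt_inode_info *i_crypt_info;
	/* ... other fs-private fields ... */
	struct inode vfs_inode;	/* the common part handed to the VFS */
};

static const struct fscrypt_operations myfs_cryptops = {
	/* Offset of the pointer relative to the embedded struct inode
	 * (negative here because the pointer precedes vfs_inode).
	 */
	.inode_info_offs =
		(ptrdiff_t)offsetof(struct myfs_inode_info, i_crypt_info) -
		(ptrdiff_t)offsetof(struct myfs_inode_info, vfs_inode),
	/* ... key and context callbacks ... */
};
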
diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h
index 1eb7eae580be..5bc7280425a7 100644
--- a/include/linux/fsverity.h
+++ b/include/linux/fsverity.h
@@ -26,8 +26,16 @@
/* Arbitrary limit to bound the kmalloc() size. Can be changed. */
#define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384
+struct fsverity_info;
+
/* Verity operations for filesystems */
struct fsverity_operations {
+ /**
+ * The offset of the pointer to struct fsverity_info in the
+ * filesystem-specific part of the inode, relative to the beginning of
+ * the common part of the inode (the 'struct inode').
+ */
+ ptrdiff_t inode_info_offs;
/**
* Begin enabling verity on the given file.
@@ -124,15 +132,37 @@ struct fsverity_operations {
#ifdef CONFIG_FS_VERITY
+/*
+ * Returns the address of the verity info pointer within the filesystem-specific
+ * part of the inode. (To save memory on filesystems that don't support
+ * fsverity, a field in 'struct inode' itself is no longer used.)
+ */
+static inline struct fsverity_info **
+fsverity_info_addr(const struct inode *inode)
+{
+ VFS_WARN_ON_ONCE(inode->i_sb->s_vop->inode_info_offs == 0);
+ return (void *)inode + inode->i_sb->s_vop->inode_info_offs;
+}
+
static inline struct fsverity_info *fsverity_get_info(const struct inode *inode)
{
/*
- * Pairs with the cmpxchg_release() in fsverity_set_info().
- * I.e., another task may publish ->i_verity_info concurrently,
- * executing a RELEASE barrier. We need to use smp_load_acquire() here
- * to safely ACQUIRE the memory the other task published.
+ * Since this function can be called on inodes belonging to filesystems
+ * that don't support fsverity at all, and fsverity_info_addr() doesn't
+ * work on such filesystems, we have to start with an IS_VERITY() check.
+ * Checking IS_VERITY() here is also useful to minimize the overhead of
+ * fsverity_active() on non-verity files.
+ */
+ if (!IS_VERITY(inode))
+ return NULL;
+
+ /*
+ * Pairs with the cmpxchg_release() in fsverity_set_info(). I.e.,
+ * another task may publish the inode's verity info concurrently,
+ * executing a RELEASE barrier. Use smp_load_acquire() here to safely
+ * ACQUIRE the memory the other task published.
*/
- return smp_load_acquire(&inode->i_verity_info);
+ return smp_load_acquire(fsverity_info_addr(inode));
}
/* enable.c */
@@ -156,12 +186,19 @@ void __fsverity_cleanup_inode(struct inode *inode);
* fsverity_cleanup_inode() - free the inode's verity info, if present
* @inode: an inode being evicted
*
- * Filesystems must call this on inode eviction to free ->i_verity_info.
+ * Filesystems must call this on inode eviction to free the inode's verity info.
*/
static inline void fsverity_cleanup_inode(struct inode *inode)
{
- if (inode->i_verity_info)
+ /*
+ * Only IS_VERITY() inodes can have verity info, so start by checking
+ * for IS_VERITY() (which is faster than retrieving the pointer to the
+ * verity info). This minimizes overhead for non-verity inodes.
+ */
+ if (IS_VERITY(inode))
__fsverity_cleanup_inode(inode);
+ else
+ VFS_WARN_ON_ONCE(*fsverity_info_addr(inode) != NULL);
}
/* read_metadata.c */
@@ -267,12 +304,12 @@ static inline bool fsverity_verify_page(struct page *page)
* fsverity_active() - do reads from the inode need to go through fs-verity?
* @inode: inode to check
*
- * This checks whether ->i_verity_info has been set.
+ * This checks whether the inode's verity info has been set.
*
* Filesystems call this from ->readahead() to check whether the pages need to
* be verified or not. Don't use IS_VERITY() for this purpose; it's subject to
* a race condition where the file is being read concurrently with
- * FS_IOC_ENABLE_VERITY completing. (S_VERITY is set before ->i_verity_info.)
+ * FS_IOC_ENABLE_VERITY completing. (S_VERITY is set before the verity info.)
*
* Return: true if reads need to go through fs-verity, otherwise false
*/
@@ -287,7 +324,7 @@ static inline bool fsverity_active(const struct inode *inode)
* @filp: the struct file being set up
*
* When opening a verity file, deny the open if it is for writing. Otherwise,
- * set up the inode's ->i_verity_info if not already done.
+ * set up the inode's verity info if not already done.
*
* When combined with fscrypt, this must be called after fscrypt_file_open().
* Otherwise, we won't have the key set up to decrypt the verity metadata.
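
fsverity uses the same "pointer in the fs-specific inode" layout, and fsverity_cleanup_inode() frees the per-inode verity info on eviction. A sketch of an eviction path; the myfs_ name is hypothetical and the ordering simply follows the common pattern of tearing down fscrypt and fsverity state last:

static void myfs_evict_inode(struct inode *inode)
{
	truncate_inode_pages_final(&inode->i_data);
	clear_inode(inode);
	fscrypt_put_encryption_info(inode);
	fsverity_cleanup_inode(inode);	/* no-op for non-verity inodes */
}
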
diff --git a/include/linux/hfs_common.h b/include/linux/hfs_common.h
new file mode 100644
index 000000000000..8838ca2f3d08
--- /dev/null
+++ b/include/linux/hfs_common.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * HFS/HFS+ common definitions, inline functions,
+ * and shared functionality.
+ */
+
+#ifndef _HFS_COMMON_H_
+#define _HFS_COMMON_H_
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#define hfs_dbg(fmt, ...) \
+ pr_debug("pid %d:%s:%d %s(): " fmt, \
+ current->pid, __FILE__, __LINE__, __func__, ##__VA_ARGS__)
+
+#endif /* _HFS_COMMON_H_ */
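
hfs_dbg() is a thin wrapper around pr_debug() that prepends the pid, file, line and function to every message. A usage sketch; the surrounding function is hypothetical:

static int example_get_block(struct inode *inode, sector_t block)
{
	hfs_dbg("mapping block %llu of inode %lu\n",
		(unsigned long long)block, inode->i_ino);
	return 0;
}
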
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 80a178f3d896..12f5ee43850e 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -420,9 +420,6 @@ struct io_ring_ctx {
struct list_head defer_list;
unsigned nr_drained;
- struct io_alloc_cache msg_cache;
- spinlock_t msg_lock;
-
#ifdef CONFIG_NET_RX_BUSY_POLL
struct list_head napi_list; /* track busy poll napi_id */
spinlock_t napi_lock; /* napi_list lock */
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 14f7eaf1b443..079d8773790c 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -118,8 +118,8 @@ struct task_struct;
#ifdef CONFIG_BLOCK
void put_io_context(struct io_context *ioc);
void exit_io_context(struct task_struct *task);
-int __copy_io(unsigned long clone_flags, struct task_struct *tsk);
-static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk)
+int __copy_io(u64 clone_flags, struct task_struct *tsk);
+static inline int copy_io(u64 clone_flags, struct task_struct *tsk)
{
if (!current->io_context)
return 0;
@@ -129,7 +129,7 @@ static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk)
struct io_context;
static inline void put_io_context(struct io_context *ioc) { }
static inline void exit_io_context(struct task_struct *task) { }
-static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk)
+static inline int copy_io(u64 clone_flags, struct task_struct *tsk)
{
return 0;
}
diff --git a/include/linux/iosys-map.h b/include/linux/iosys-map.h
index 4696abfd311c..3e85afe794c0 100644
--- a/include/linux/iosys-map.h
+++ b/include/linux/iosys-map.h
@@ -264,12 +264,7 @@ static inline bool iosys_map_is_set(const struct iosys_map *map)
*/
static inline void iosys_map_clear(struct iosys_map *map)
{
- if (map->is_iomem) {
- map->vaddr_iomem = NULL;
- map->is_iomem = false;
- } else {
- map->vaddr = NULL;
- }
+ memset(map, 0, sizeof(*map));
}
/**
diff --git a/include/linux/iov_iter.h b/include/linux/iov_iter.h
index c4aa58032faf..f9a17fbbd398 100644
--- a/include/linux/iov_iter.h
+++ b/include/linux/iov_iter.h
@@ -160,7 +160,7 @@ size_t iterate_folioq(struct iov_iter *iter, size_t len, void *priv, void *priv2
do {
struct folio *folio = folioq_folio(folioq, slot);
- size_t part, remain, consumed;
+ size_t part, remain = 0, consumed;
size_t fsize;
void *base;
@@ -168,14 +168,16 @@ size_t iterate_folioq(struct iov_iter *iter, size_t len, void *priv, void *priv2
break;
fsize = folioq_folio_size(folioq, slot);
- base = kmap_local_folio(folio, skip);
- part = umin(len, PAGE_SIZE - skip % PAGE_SIZE);
- remain = step(base, progress, part, priv, priv2);
- kunmap_local(base);
- consumed = part - remain;
- len -= consumed;
- progress += consumed;
- skip += consumed;
+ if (skip < fsize) {
+ base = kmap_local_folio(folio, skip);
+ part = umin(len, PAGE_SIZE - skip % PAGE_SIZE);
+ remain = step(base, progress, part, priv, priv2);
+ kunmap_local(base);
+ consumed = part - remain;
+ len -= consumed;
+ progress += consumed;
+ skip += consumed;
+ }
if (skip >= fsize) {
skip = 0;
slot++;
diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index e8240cf2611a..12faca29bbb9 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -129,20 +129,25 @@ static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; }
#endif
#if defined(CONFIG_IPC_NS)
-extern struct ipc_namespace *copy_ipcs(unsigned long flags,
+static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct ipc_namespace, ns);
+}
+
+extern struct ipc_namespace *copy_ipcs(u64 flags,
struct user_namespace *user_ns, struct ipc_namespace *ns);
static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
{
if (ns)
- refcount_inc(&ns->ns.count);
+ ns_ref_inc(ns);
return ns;
}
static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
{
if (ns) {
- if (refcount_inc_not_zero(&ns->ns.count))
+ if (ns_ref_get(ns))
return ns;
}
@@ -151,7 +156,7 @@ static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns
extern void put_ipc_ns(struct ipc_namespace *ns);
#else
-static inline struct ipc_namespace *copy_ipcs(unsigned long flags,
+static inline struct ipc_namespace *copy_ipcs(u64 flags,
struct user_namespace *user_ns, struct ipc_namespace *ns)
{
if (flags & CLONE_NEWIPC)
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 890011071f2b..fe5ce9215821 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -562,7 +562,7 @@ static inline void kasan_init_hw_tags(void) { }
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
void kasan_populate_early_vm_area_shadow(void *start, unsigned long size);
-int kasan_populate_vmalloc(unsigned long addr, unsigned long size);
+int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask);
void kasan_release_vmalloc(unsigned long start, unsigned long end,
unsigned long free_region_start,
unsigned long free_region_end,
@@ -574,7 +574,7 @@ static inline void kasan_populate_early_vm_area_shadow(void *start,
unsigned long size)
{ }
static inline int kasan_populate_vmalloc(unsigned long start,
- unsigned long size)
+ unsigned long size, gfp_t gfp_mask)
{
return 0;
}
@@ -610,7 +610,7 @@ static __always_inline void kasan_poison_vmalloc(const void *start,
static inline void kasan_populate_early_vm_area_shadow(void *start,
unsigned long size) { }
static inline int kasan_populate_vmalloc(unsigned long start,
- unsigned long size)
+ unsigned long size, gfp_t gfp_mask)
{
return 0;
}
diff --git a/include/linux/kcov.h b/include/linux/kcov.h
index 75a2fb8b16c3..0143358874b0 100644
--- a/include/linux/kcov.h
+++ b/include/linux/kcov.h
@@ -57,47 +57,21 @@ static inline void kcov_remote_start_usb(u64 id)
/*
* The softirq flavor of kcov_remote_*() functions is introduced as a temporary
- * workaround for KCOV's lack of nested remote coverage sections support.
- *
- * Adding support is tracked in https://bugzilla.kernel.org/show_bug.cgi?id=210337.
- *
- * kcov_remote_start_usb_softirq():
- *
- * 1. Only collects coverage when called in the softirq context. This allows
- * avoiding nested remote coverage collection sections in the task context.
- * For example, USB/IP calls usb_hcd_giveback_urb() in the task context
- * within an existing remote coverage collection section. Thus, KCOV should
- * not attempt to start collecting coverage within the coverage collection
- * section in __usb_hcd_giveback_urb() in this case.
- *
- * 2. Disables interrupts for the duration of the coverage collection section.
- * This allows avoiding nested remote coverage collection sections in the
- * softirq context (a softirq might occur during the execution of a work in
- * the BH workqueue, which runs with in_serving_softirq() > 0).
- * For example, usb_giveback_urb_bh() runs in the BH workqueue with
- * interrupts enabled, so __usb_hcd_giveback_urb() might be interrupted in
- * the middle of its remote coverage collection section, and the interrupt
- * handler might invoke __usb_hcd_giveback_urb() again.
+ * workaround for kcov's lack of nested remote coverage sections support in
+ * task context. Adding support for nested sections is tracked in:
+ * https://bugzilla.kernel.org/show_bug.cgi?id=210337
*/
-static inline unsigned long kcov_remote_start_usb_softirq(u64 id)
+static inline void kcov_remote_start_usb_softirq(u64 id)
{
- unsigned long flags = 0;
-
- if (in_serving_softirq()) {
- local_irq_save(flags);
+ if (in_serving_softirq() && !in_hardirq())
kcov_remote_start_usb(id);
- }
-
- return flags;
}
-static inline void kcov_remote_stop_softirq(unsigned long flags)
+static inline void kcov_remote_stop_softirq(void)
{
- if (in_serving_softirq()) {
+ if (in_serving_softirq() && !in_hardirq())
kcov_remote_stop();
- local_irq_restore(flags);
- }
}
#ifdef CONFIG_64BIT
@@ -131,11 +105,8 @@ static inline u64 kcov_common_handle(void)
}
static inline void kcov_remote_start_common(u64 id) {}
static inline void kcov_remote_start_usb(u64 id) {}
-static inline unsigned long kcov_remote_start_usb_softirq(u64 id)
-{
- return 0;
-}
-static inline void kcov_remote_stop_softirq(unsigned long flags) {}
+static inline void kcov_remote_start_usb_softirq(u64 id) {}
+static inline void kcov_remote_stop_softirq(void) {}
#endif /* CONFIG_KCOV */
#endif /* _LINUX_KCOV_H */
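
Since the softirq helpers no longer disable interrupts or return saved flags, callers simply bracket the completion code with the start/stop pair. A sketch under assumptions: the function and the way the kcov handle is obtained are hypothetical.

static void example_complete_in_softirq(u64 kcov_handle)
{
	/* Coverage is collected only when actually running in softirq
	 * context and not from a hardirq that interrupted it.
	 */
	kcov_remote_start_usb_softirq(kcov_handle);
	/* ... run the completion handler ... */
	kcov_remote_stop_softirq();
}
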
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 1b10a5d84b68..39fe3e6cd282 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -460,7 +460,8 @@ bool kexec_load_permitted(int kexec_image_type);
/* List of defined/legal kexec file flags */
#define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \
- KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_DEBUG)
+ KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_DEBUG | \
+ KEXEC_FILE_NO_CMA)
/* flag to track if kexec reboot is in progress */
extern bool kexec_in_progress;
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index fd11fffdd3c3..adbe234a6f6c 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -211,7 +211,7 @@ LSM_HOOK(int, 0, file_open, struct file *file)
LSM_HOOK(int, 0, file_post_open, struct file *file, int mask)
LSM_HOOK(int, 0, file_truncate, struct file *file)
LSM_HOOK(int, 0, task_alloc, struct task_struct *task,
- unsigned long clone_flags)
+ u64 clone_flags)
LSM_HOOK(void, LSM_RET_VOID, task_free, struct task_struct *task)
LSM_HOOK(int, 0, cred_alloc_blank, struct cred *cred, gfp_t gfp)
LSM_HOOK(void, LSM_RET_VOID, cred_free, struct cred *cred)
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 090d1d3e19fe..79ec5a2bdcca 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -116,6 +116,9 @@ struct lsm_blob_sizes {
int lbs_xattr_count; /* number of xattr slots in new_xattrs array */
int lbs_tun_dev;
int lbs_bdev;
+ int lbs_bpf_map;
+ int lbs_bpf_prog;
+ int lbs_bpf_token;
};
/*
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index b96746376e17..fcda8481de9a 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -40,8 +40,9 @@ extern unsigned long long max_possible_pfn;
* via a driver, and never indicated in the firmware-provided memory map as
* system RAM. This corresponds to IORESOURCE_SYSRAM_DRIVER_MANAGED in the
* kernel resource tree.
- * @MEMBLOCK_RSRV_NOINIT: memory region for which struct pages are
- * not initialized (only for reserved regions).
+ * @MEMBLOCK_RSRV_NOINIT: reserved memory region for which struct pages are not
+ * fully initialized. Users of this flag are responsible for properly
+ * initializing the struct pages of this region.
* @MEMBLOCK_RSRV_KERN: memory region that is reserved for kernel use,
* either explictitly with memblock_reserve_kern() or via memblock
* allocation APIs. All memblock allocations set this flag.
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index acadd41e0b5c..9009e27b5f44 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -79,6 +79,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
void folio_migrate_flags(struct folio *newfolio, struct folio *folio);
int folio_migrate_mapping(struct address_space *mapping,
struct folio *newfolio, struct folio *folio, int extra_count);
+int set_movable_ops(const struct movable_operations *ops, enum pagetype type);
#else
@@ -100,6 +101,10 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
{
return -ENOSYS;
}
+static inline int set_movable_ops(const struct movable_operations *ops, enum pagetype type)
+{
+ return -ENOSYS;
+}
#endif /* CONFIG_MIGRATION */
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 8c5fbfb85749..10fe492e1fed 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -663,6 +663,7 @@ struct mlx5e_resources {
bool tisn_valid;
} hw_objs;
struct net_device *uplink_netdev;
+ netdevice_tracker tracker;
struct mutex uplink_netdev_lock;
struct mlx5_crypto_dek_priv *dek_priv;
};
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 86055d55836d..6ac76a0c3827 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -308,6 +308,8 @@ struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging);
void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter);
struct mlx5_fc *mlx5_fc_local_create(u32 counter_id, u32 offset, u32 bulk_size);
void mlx5_fc_local_destroy(struct mlx5_fc *counter);
+void mlx5_fc_local_get(struct mlx5_fc *counter);
+void mlx5_fc_local_put(struct mlx5_fc *counter);
u64 mlx5_fc_query_lastuse(struct mlx5_fc *counter);
void mlx5_fc_query_cached(struct mlx5_fc *counter,
u64 *bytes, u64 *packets, u64 *lastuse);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 08bc2442db93..7f625c35128b 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -631,6 +631,11 @@ static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc)
{
return atomic_read(&ptdesc->pt_share_count);
}
+
+static inline bool ptdesc_pmd_is_shared(struct ptdesc *ptdesc)
+{
+ return !!ptdesc_pmd_pts_count(ptdesc);
+}
#else
static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc)
{
@@ -1102,6 +1107,11 @@ struct mm_struct {
unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
+#ifdef CONFIG_ARCH_HAS_ELF_CORE_EFLAGS
+ /* the ABI-related flags from the ELF header. Used for core dump */
+ unsigned long saved_e_flags;
+#endif
+
struct percpu_counter rss_stat[NR_MM_COUNTERS];
struct linux_binfmt *binfmt;
diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
index 70b366b64816..0acd1089d149 100644
--- a/include/linux/mnt_namespace.h
+++ b/include/linux/mnt_namespace.h
@@ -11,7 +11,9 @@ struct fs_struct;
struct user_namespace;
struct ns_common;
-extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *,
+extern struct mnt_namespace init_mnt_ns;
+
+extern struct mnt_namespace *copy_mnt_ns(u64, struct mnt_namespace *,
struct user_namespace *, struct fs_struct *);
extern void put_mnt_ns(struct mnt_namespace *ns);
DEFINE_FREE(put_mnt_ns, struct mnt_namespace *, if (!IS_ERR_OR_NULL(_T)) put_mnt_ns(_T))
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 3111ba95fbde..d415dd15a0a9 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -431,8 +431,6 @@ struct msi_domain_info;
* function.
* @domain_free_irqs: Optional function to override the default free
* function.
- * @msi_post_free: Optional function which is invoked after freeing
- * all interrupts.
* @msi_translate: Optional translate callback to support the odd wire to
* MSI bridges, e.g. MBIGEN
*
@@ -473,8 +471,6 @@ struct msi_domain_ops {
struct device *dev, int nvec);
void (*domain_free_irqs)(struct irq_domain *domain,
struct device *dev);
- void (*msi_post_free)(struct irq_domain *domain,
- struct device *dev);
int (*msi_translate)(struct irq_domain *domain, struct irq_fwspec *fwspec,
irq_hw_number_t *hwirq, unsigned int *type);
};
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 5d085428e471..a7800ef04e76 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -57,13 +57,17 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name,
struct dentry *base,
unsigned int flags);
extern int kern_path(const char *, unsigned, struct path *);
-
-extern struct dentry *kern_path_create(int, const char *, struct path *, unsigned int);
-extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int);
-extern void done_path_create(struct path *, struct dentry *);
-extern struct dentry *kern_path_locked(const char *, struct path *);
-extern struct dentry *kern_path_locked_negative(const char *, struct path *);
-extern struct dentry *user_path_locked_at(int , const char __user *, struct path *);
+struct dentry *kern_path_parent(const char *name, struct path *parent);
+
+extern struct dentry *start_creating_path(int, const char *, struct path *, unsigned int);
+extern struct dentry *start_creating_user_path(int, const char __user *, struct path *, unsigned int);
+extern void end_creating_path(struct path *, struct dentry *);
+extern struct dentry *start_removing_path(const char *, struct path *);
+extern struct dentry *start_removing_user_path_at(int, const char __user *, struct path *);
+static inline void end_removing_path(struct path *path, struct dentry *dentry)
+{
+ end_creating_path(path, dentry);
+}
int vfs_path_parent_lookup(struct filename *filename, unsigned int flags,
struct path *parent, struct qstr *last, int *type,
const struct path *root);
@@ -80,6 +84,9 @@ struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap,
struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap,
struct qstr *name,
struct dentry *base);
+struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap,
+ struct qstr *name,
+ struct dentry *base);
extern int follow_down_one(struct path *);
extern int follow_down(struct path *path, unsigned int flags);
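
The start_creating_path()/end_creating_path() pair keeps the semantics of the old kern_path_create()/done_path_create() helpers: look up and lock the parent, hand back a new dentry, then unlock and drop everything when done. A sketch; example_create_node() is hypothetical, the VFS calls are not:

static int example_create_node(const char *pathname, umode_t mode, dev_t dev)
{
	struct path path;
	struct dentry *dentry;
	int err;

	dentry = start_creating_path(AT_FDCWD, pathname, &path, 0);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	err = vfs_mknod(mnt_idmap(path.mnt), d_inode(path.dentry), dentry,
			mode, dev);
	end_creating_path(&path, dentry);
	return err;
}
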
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5e5de4b0a433..f3a3b761abfb 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2071,6 +2071,8 @@ enum netdev_reg_state {
* @max_pacing_offload_horizon: max EDT offload horizon in nsec.
* @napi_config: An array of napi_config structures containing per-NAPI
* settings.
+ * @num_napi_configs: number of allocated NAPI config structs,
+ * always >= max(num_rx_queues, num_tx_queues).
* @gro_flush_timeout: timeout for GRO layer in NAPI
* @napi_defer_hard_irqs: If not zero, provides a counter that would
* allow to avoid NIC hard IRQ, on busy queues.
@@ -2482,8 +2484,9 @@ struct net_device {
u64 max_pacing_offload_horizon;
struct napi_config *napi_config;
- unsigned long gro_flush_timeout;
+ u32 num_napi_configs;
u32 napi_defer_hard_irqs;
+ unsigned long gro_flush_timeout;
/**
* @up: copy of @state's IFF_UP, but safe to read with just @lock.
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 185bd8196503..98c96d649bf9 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -150,6 +150,7 @@ struct netfs_io_stream {
bool active; /* T if stream is active */
bool need_retry; /* T if this stream needs retrying */
bool failed; /* T if this stream failed */
+ bool transferred_valid; /* T if ->transferred is valid */
};
/*
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 169b4ae30ff4..9aed39abc94b 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -160,6 +160,7 @@ extern void nfs_join_page_group(struct nfs_page *head,
extern int nfs_page_group_lock(struct nfs_page *);
extern void nfs_page_group_unlock(struct nfs_page *);
extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int);
+extern bool nfs_page_group_sync_on_bit_locked(struct nfs_page *, unsigned int);
extern int nfs_page_set_headlock(struct nfs_page *req);
extern void nfs_page_clear_headlock(struct nfs_page *req);
extern bool nfs_async_iocounter_wait(struct rpc_task *, struct nfs_lock_context *);
diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index 7d22ea50b098..f5b68b8abb54 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -3,14 +3,151 @@
#define _LINUX_NS_COMMON_H
#include <linux/refcount.h>
+#include <linux/rbtree.h>
+#include <uapi/linux/sched.h>
struct proc_ns_operations;
+struct cgroup_namespace;
+struct ipc_namespace;
+struct mnt_namespace;
+struct net;
+struct pid_namespace;
+struct time_namespace;
+struct user_namespace;
+struct uts_namespace;
+
+extern struct cgroup_namespace init_cgroup_ns;
+extern struct ipc_namespace init_ipc_ns;
+extern struct mnt_namespace init_mnt_ns;
+extern struct net init_net;
+extern struct pid_namespace init_pid_ns;
+extern struct time_namespace init_time_ns;
+extern struct user_namespace init_user_ns;
+extern struct uts_namespace init_uts_ns;
+
+extern const struct proc_ns_operations netns_operations;
+extern const struct proc_ns_operations utsns_operations;
+extern const struct proc_ns_operations ipcns_operations;
+extern const struct proc_ns_operations pidns_operations;
+extern const struct proc_ns_operations pidns_for_children_operations;
+extern const struct proc_ns_operations userns_operations;
+extern const struct proc_ns_operations mntns_operations;
+extern const struct proc_ns_operations cgroupns_operations;
+extern const struct proc_ns_operations timens_operations;
+extern const struct proc_ns_operations timens_for_children_operations;
+
struct ns_common {
+ u32 ns_type;
struct dentry *stashed;
const struct proc_ns_operations *ops;
unsigned int inum;
- refcount_t count;
+ refcount_t __ns_ref; /* do not use directly */
+ union {
+ struct {
+ u64 ns_id;
+ struct rb_node ns_tree_node;
+ struct list_head ns_list_node;
+ };
+ struct rcu_head ns_rcu;
+ };
};
+int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum);
+void __ns_common_free(struct ns_common *ns);
+
+#define to_ns_common(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: &(__ns)->ns, \
+ const struct cgroup_namespace *: &(__ns)->ns, \
+ struct ipc_namespace *: &(__ns)->ns, \
+ const struct ipc_namespace *: &(__ns)->ns, \
+ struct mnt_namespace *: &(__ns)->ns, \
+ const struct mnt_namespace *: &(__ns)->ns, \
+ struct net *: &(__ns)->ns, \
+ const struct net *: &(__ns)->ns, \
+ struct pid_namespace *: &(__ns)->ns, \
+ const struct pid_namespace *: &(__ns)->ns, \
+ struct time_namespace *: &(__ns)->ns, \
+ const struct time_namespace *: &(__ns)->ns, \
+ struct user_namespace *: &(__ns)->ns, \
+ const struct user_namespace *: &(__ns)->ns, \
+ struct uts_namespace *: &(__ns)->ns, \
+ const struct uts_namespace *: &(__ns)->ns)
+
+#define ns_init_inum(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: CGROUP_NS_INIT_INO, \
+ struct ipc_namespace *: IPC_NS_INIT_INO, \
+ struct mnt_namespace *: MNT_NS_INIT_INO, \
+ struct net *: NET_NS_INIT_INO, \
+ struct pid_namespace *: PID_NS_INIT_INO, \
+ struct time_namespace *: TIME_NS_INIT_INO, \
+ struct user_namespace *: USER_NS_INIT_INO, \
+ struct uts_namespace *: UTS_NS_INIT_INO)
+
+#define ns_init_ns(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: &init_cgroup_ns, \
+ struct ipc_namespace *: &init_ipc_ns, \
+ struct mnt_namespace *: &init_mnt_ns, \
+ struct net *: &init_net, \
+ struct pid_namespace *: &init_pid_ns, \
+ struct time_namespace *: &init_time_ns, \
+ struct user_namespace *: &init_user_ns, \
+ struct uts_namespace *: &init_uts_ns)
+
+#define to_ns_operations(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \
+ struct ipc_namespace *: (IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \
+ struct mnt_namespace *: &mntns_operations, \
+ struct net *: (IS_ENABLED(CONFIG_NET_NS) ? &netns_operations : NULL), \
+ struct pid_namespace *: (IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \
+ struct time_namespace *: (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \
+ struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \
+ struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL))
+
+#define ns_common_type(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: CLONE_NEWCGROUP, \
+ struct ipc_namespace *: CLONE_NEWIPC, \
+ struct mnt_namespace *: CLONE_NEWNS, \
+ struct net *: CLONE_NEWNET, \
+ struct pid_namespace *: CLONE_NEWPID, \
+ struct time_namespace *: CLONE_NEWTIME, \
+ struct user_namespace *: CLONE_NEWUSER, \
+ struct uts_namespace *: CLONE_NEWUTS)
+
+#define ns_common_init(__ns) \
+ __ns_common_init(to_ns_common(__ns), \
+ ns_common_type(__ns), \
+ to_ns_operations(__ns), \
+ (((__ns) == ns_init_ns(__ns)) ? ns_init_inum(__ns) : 0))
+
+#define ns_common_init_inum(__ns, __inum) \
+ __ns_common_init(to_ns_common(__ns), \
+ ns_common_type(__ns), \
+ to_ns_operations(__ns), \
+ __inum)
+
+#define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns)))
+
+static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns)
+{
+ return refcount_dec_and_test(&ns->__ns_ref);
+}
+
+static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns)
+{
+ return refcount_inc_not_zero(&ns->__ns_ref);
+}
+
+#define ns_ref_read(__ns) refcount_read(&to_ns_common((__ns))->__ns_ref)
+#define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->__ns_ref)
+#define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns)))
+#define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns)))
+#define ns_ref_put_and_lock(__ns, __lock) \
+ refcount_dec_and_lock(&to_ns_common((__ns))->__ns_ref, (__lock))
+
#endif
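
A namespace implementation now initializes its embedded ns_common through the _Generic helpers and never touches __ns_ref directly. A simplified sketch using struct uts_namespace (one of the types the helpers cover); the allocation and teardown shown here are hypothetical:

static struct uts_namespace *example_new_utsns(void)
{
	struct uts_namespace *ns;
	int err;

	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
	if (!ns)
		return ERR_PTR(-ENOMEM);

	/* Fills in ns->ns: type (CLONE_NEWUTS), proc ops, inum, refcount. */
	err = ns_common_init(ns);
	if (err) {
		kfree(ns);
		return ERR_PTR(err);
	}
	return ns;
}

static void example_put_utsns(struct uts_namespace *ns)
{
	if (ns_ref_put(ns)) {
		ns_common_free(ns);	/* plus type-specific teardown */
		kfree(ns);
	}
}
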
diff --git a/include/linux/nsfs.h b/include/linux/nsfs.h
new file mode 100644
index 000000000000..e5a5fa83d36b
--- /dev/null
+++ b/include/linux/nsfs.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
+
+#ifndef _LINUX_NSFS_H
+#define _LINUX_NSFS_H
+
+#include <linux/ns_common.h>
+#include <linux/cred.h>
+#include <linux/pid_namespace.h>
+
+struct path;
+struct task_struct;
+struct proc_ns_operations;
+
+int ns_get_path(struct path *path, struct task_struct *task,
+ const struct proc_ns_operations *ns_ops);
+typedef struct ns_common *ns_get_path_helper_t(void *);
+int ns_get_path_cb(struct path *path, ns_get_path_helper_t ns_get_cb,
+ void *private_data);
+
+bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino);
+
+int ns_get_name(char *buf, size_t size, struct task_struct *task,
+ const struct proc_ns_operations *ns_ops);
+void nsfs_init(void);
+
+#define __current_namespace_from_type(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: current->nsproxy->cgroup_ns, \
+ struct ipc_namespace *: current->nsproxy->ipc_ns, \
+ struct net *: current->nsproxy->net_ns, \
+ struct pid_namespace *: task_active_pid_ns(current), \
+ struct mnt_namespace *: current->nsproxy->mnt_ns, \
+ struct time_namespace *: current->nsproxy->time_ns, \
+ struct user_namespace *: current_user_ns(), \
+ struct uts_namespace *: current->nsproxy->uts_ns)
+
+#define current_in_namespace(__ns) (__current_namespace_from_type(__ns) == __ns)
+
+#endif /* _LINUX_NSFS_H */
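
current_in_namespace() picks the calling task's namespace of the matching type at compile time and compares it with the argument. A small sketch of an ownership-style check; the wrapper is hypothetical:

/* Allow the operation only from tasks inside the given network namespace. */
static bool example_allowed(struct net *net)
{
	return current_in_namespace(net);
}
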
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index dab6a1734a22..bd118a187dec 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -42,17 +42,6 @@ struct nsproxy {
};
extern struct nsproxy init_nsproxy;
-#define to_ns_common(__ns) \
- _Generic((__ns), \
- struct cgroup_namespace *: &(__ns->ns), \
- struct ipc_namespace *: &(__ns->ns), \
- struct net *: &(__ns->ns), \
- struct pid_namespace *: &(__ns->ns), \
- struct mnt_namespace *: &(__ns->ns), \
- struct time_namespace *: &(__ns->ns), \
- struct user_namespace *: &(__ns->ns), \
- struct uts_namespace *: &(__ns->ns))
-
/*
* A structure to encompass all bits needed to install
* a partial or complete new set of namespaces.
@@ -103,7 +92,7 @@ static inline struct cred *nsset_cred(struct nsset *set)
*
*/
-int copy_namespaces(unsigned long flags, struct task_struct *tsk);
+int copy_namespaces(u64 flags, struct task_struct *tsk);
void exit_task_namespaces(struct task_struct *tsk);
void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
int exec_task_namespaces(void);
diff --git a/include/linux/nstree.h b/include/linux/nstree.h
new file mode 100644
index 000000000000..8b8636690473
--- /dev/null
+++ b/include/linux/nstree.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_NSTREE_H
+#define _LINUX_NSTREE_H
+
+#include <linux/ns_common.h>
+#include <linux/nsproxy.h>
+#include <linux/rbtree.h>
+#include <linux/seqlock.h>
+#include <linux/rculist.h>
+#include <linux/cookie.h>
+
+extern struct ns_tree cgroup_ns_tree;
+extern struct ns_tree ipc_ns_tree;
+extern struct ns_tree mnt_ns_tree;
+extern struct ns_tree net_ns_tree;
+extern struct ns_tree pid_ns_tree;
+extern struct ns_tree time_ns_tree;
+extern struct ns_tree user_ns_tree;
+extern struct ns_tree uts_ns_tree;
+
+#define to_ns_tree(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: &(cgroup_ns_tree), \
+ struct ipc_namespace *: &(ipc_ns_tree), \
+ struct net *: &(net_ns_tree), \
+ struct pid_namespace *: &(pid_ns_tree), \
+ struct mnt_namespace *: &(mnt_ns_tree), \
+ struct time_namespace *: &(time_ns_tree), \
+ struct user_namespace *: &(user_ns_tree), \
+ struct uts_namespace *: &(uts_ns_tree))
+
+u64 ns_tree_gen_id(struct ns_common *ns);
+void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree);
+void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree);
+struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type);
+struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns,
+ struct ns_tree *ns_tree,
+ bool previous);
+
+static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree)
+{
+ ns_tree_gen_id(ns);
+ __ns_tree_add_raw(ns, ns_tree);
+}
+
+/**
+ * ns_tree_add_raw - Add a namespace to a namespace tree
+ * @ns: Namespace to add
+ *
+ * This function adds a namespace to the appropriate namespace tree
+ * without assigning an id.
+ */
+#define ns_tree_add_raw(__ns) __ns_tree_add_raw(to_ns_common(__ns), to_ns_tree(__ns))
+
+/**
+ * ns_tree_add - Add a namespace to a namespace tree
+ * @ns: Namespace to add
+ *
+ * This function assigns a new id to the namespace and adds it to the
+ * appropriate namespace tree and list.
+ */
+#define ns_tree_add(__ns) __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns))
+
+/**
+ * ns_tree_remove - Remove a namespace from a namespace tree
+ * @ns: Namespace to remove
+ *
+ * This function removes a namespace from the appropriate namespace
+ * tree and list.
+ */
+#define ns_tree_remove(__ns) __ns_tree_remove(to_ns_common(__ns), to_ns_tree(__ns))
+
+#define ns_tree_adjoined_rcu(__ns, __previous) \
+ __ns_tree_adjoined_rcu(to_ns_common(__ns), to_ns_tree(__ns), __previous)
+
+#define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node))
+
+#endif /* _LINUX_NSTREE_H */
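
A namespace joins its type's tree once it is fully set up and leaves it on teardown; ns_tree_add() also assigns the namespace id. A sketch continuing the hypothetical uts example above; only the ns_tree_* helpers come from this header:

static void example_register_utsns(struct uts_namespace *ns)
{
	ns_tree_add(ns);	/* assigns ns->ns.ns_id and links the nodes */
}

static void example_unregister_utsns(struct uts_namespace *ns)
{
	if (ns_tree_active(ns))
		ns_tree_remove(ns);
}
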
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index 682472c15495..88e18615dd72 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -134,6 +134,9 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
int walk_kernel_page_table_range(unsigned long start,
unsigned long end, const struct mm_walk_ops *ops,
pgd_t *pgd, void *private);
+int walk_kernel_page_table_range_lockless(unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ pgd_t *pgd, void *private);
int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
unsigned long end, const struct mm_walk_ops *ops,
void *private);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index ec9d96025683..fd1d91017b99 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -859,7 +859,7 @@ struct perf_event {
/* mmap bits */
struct mutex mmap_mutex;
- atomic_t mmap_count;
+ refcount_t mmap_count;
struct perf_buffer *rb;
struct list_head rb_entry;
@@ -1719,7 +1719,7 @@ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
extern struct perf_callchain_entry *
-get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
u32 max_stack, bool crosstask, bool add_mark);
extern int get_callchain_buffers(int max_stack);
extern void put_callchain_buffers(void);
diff --git a/include/linux/pgalloc.h b/include/linux/pgalloc.h
new file mode 100644
index 000000000000..9174fa59bbc5
--- /dev/null
+++ b/include/linux/pgalloc.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PGALLOC_H
+#define _LINUX_PGALLOC_H
+
+#include <linux/pgtable.h>
+#include <asm/pgalloc.h>
+
+/*
+ * {pgd,p4d}_populate_kernel() are defined as macros to allow
+ * compile-time optimization based on the configured page table levels.
+ * Without this, linking may fail because callers (e.g., KASAN) may rely
+ * on calls to these functions being optimized away when passing symbols
+ * that exist only for certain page table levels.
+ */
+#define pgd_populate_kernel(addr, pgd, p4d) \
+ do { \
+ pgd_populate(&init_mm, pgd, p4d); \
+ if (ARCH_PAGE_TABLE_SYNC_MASK & PGTBL_PGD_MODIFIED) \
+ arch_sync_kernel_mappings(addr, addr); \
+ } while (0)
+
+#define p4d_populate_kernel(addr, p4d, pud) \
+ do { \
+ p4d_populate(&init_mm, p4d, pud); \
+ if (ARCH_PAGE_TABLE_SYNC_MASK & PGTBL_P4D_MODIFIED) \
+ arch_sync_kernel_mappings(addr, addr); \
+ } while (0)
+
+#endif /* _LINUX_PGALLOC_H */
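
The *_populate_kernel() wrappers are meant for kernel-address page table population, so that arch_sync_kernel_mappings() runs whenever the architecture asked for it via ARCH_PAGE_TABLE_SYNC_MASK. A fragment sketched under assumptions (a configuration where p4d_alloc_one() is available; error handling and locking omitted):

static int example_populate_pgd(pgd_t *pgd, unsigned long addr)
{
	if (pgd_none(*pgd)) {
		p4d_t *new = p4d_alloc_one(&init_mm, addr);

		if (!new)
			return -ENOMEM;
		/* Publishes the new level and triggers the arch sync hook. */
		pgd_populate_kernel(addr, pgd, new);
	}
	return 0;
}
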
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 4c035637eeb7..25a7257052ff 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -232,9 +232,9 @@ static inline int pmd_dirty(pmd_t pmd)
* and the mode cannot be used in interrupt context.
*/
#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE
-#define arch_enter_lazy_mmu_mode() do {} while (0)
-#define arch_leave_lazy_mmu_mode() do {} while (0)
-#define arch_flush_lazy_mmu_mode() do {} while (0)
+static inline void arch_enter_lazy_mmu_mode(void) {}
+static inline void arch_leave_lazy_mmu_mode(void) {}
+static inline void arch_flush_lazy_mmu_mode(void) {}
#endif
#ifndef pte_batch_hint
@@ -1467,6 +1467,22 @@ static inline void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned
}
#endif
+/*
+ * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values
+ * and let generic vmalloc, ioremap and page table update code know when
+ * arch_sync_kernel_mappings() needs to be called.
+ */
+#ifndef ARCH_PAGE_TABLE_SYNC_MASK
+#define ARCH_PAGE_TABLE_SYNC_MASK 0
+#endif
+
+/*
+ * There is no default implementation for arch_sync_kernel_mappings(). The
+ * compiler is relied upon to optimize the calls out if
+ * ARCH_PAGE_TABLE_SYNC_MASK is 0.
+ */
+void arch_sync_kernel_mappings(unsigned long start, unsigned long end);
+
#endif /* CONFIG_MMU */
/*
@@ -1938,10 +1954,11 @@ static inline bool arch_has_pfn_modify_check(void)
/*
* Page Table Modification bits for pgtbl_mod_mask.
*
- * These are used by the p?d_alloc_track*() set of functions an in the generic
- * vmalloc/ioremap code to track at which page-table levels entries have been
- * modified. Based on that the code can better decide when vmalloc and ioremap
- * mapping changes need to be synchronized to other page-tables in the system.
+ * These are used by the p?d_alloc_track*() and p*d_populate_kernel()
+ * functions in the generic vmalloc, ioremap and page table update code
+ * to track at which page-table levels entries have been modified.
+ * Based on that the code can better decide when page table changes need
+ * to be synchronized to other page-tables in the system.
*/
#define __PGTBL_PGD_MODIFIED 0
#define __PGTBL_P4D_MODIFIED 1
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 4c2b8b6e7187..bb45787d8684 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -169,6 +169,11 @@ static inline bool phy_interface_empty(const unsigned long *intf)
return bitmap_empty(intf, PHY_INTERFACE_MODE_MAX);
}
+static inline unsigned int phy_interface_weight(const unsigned long *intf)
+{
+ return bitmap_weight(intf, PHY_INTERFACE_MODE_MAX);
+}
+
static inline void phy_interface_and(unsigned long *dst, const unsigned long *a,
const unsigned long *b)
{
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 7c67a5811199..445517a72ad0 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -54,10 +54,15 @@ extern struct pid_namespace init_pid_ns;
#define PIDNS_ADDING (1U << 31)
#ifdef CONFIG_PID_NS
+static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct pid_namespace, ns);
+}
+
static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
{
if (ns != &init_pid_ns)
- refcount_inc(&ns->ns.count);
+ ns_ref_inc(ns);
return ns;
}
@@ -78,12 +83,15 @@ static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns)
}
#endif
-extern struct pid_namespace *copy_pid_ns(unsigned long flags,
+extern struct pid_namespace *copy_pid_ns(u64 flags,
struct user_namespace *user_ns, struct pid_namespace *ns);
extern void zap_pid_ns_processes(struct pid_namespace *pid_ns);
extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd);
extern void put_pid_ns(struct pid_namespace *ns);
+extern bool pidns_is_ancestor(struct pid_namespace *child,
+ struct pid_namespace *ancestor);
+
#else /* !CONFIG_PID_NS */
#include <linux/err.h>
@@ -97,7 +105,7 @@ static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns)
return 0;
}
-static inline struct pid_namespace *copy_pid_ns(unsigned long flags,
+static inline struct pid_namespace *copy_pid_ns(u64 flags,
struct user_namespace *user_ns, struct pid_namespace *ns)
{
if (flags & CLONE_NEWPID)
@@ -118,6 +126,12 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
{
return 0;
}
+
+static inline bool pidns_is_ancestor(struct pid_namespace *child,
+ struct pid_namespace *ancestor)
+{
+ return false;
+}
#endif /* CONFIG_PID_NS */
extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
diff --git a/include/linux/platform_data/x86/int3472.h b/include/linux/platform_data/x86/int3472.h
index 78276a11c48d..1571e9157fa5 100644
--- a/include/linux/platform_data/x86/int3472.h
+++ b/include/linux/platform_data/x86/int3472.h
@@ -27,6 +27,7 @@
#define INT3472_GPIO_TYPE_CLK_ENABLE 0x0c
#define INT3472_GPIO_TYPE_PRIVACY_LED 0x0d
#define INT3472_GPIO_TYPE_HANDSHAKE 0x12
+#define INT3472_GPIO_TYPE_HOTPLUG_DETECT 0x13
#define INT3472_PDEV_MAX_NAME_LEN 23
#define INT3472_MAX_SENSOR_GPIOS 3
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index c84edf217819..f67a2cb7d781 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -115,6 +115,12 @@ struct dev_pm_domain_list {
* genpd provider specific way, likely through a
* parent device node. This flag makes genpd to
* skip its internal support for this.
+ *
+ * GENPD_FLAG_NO_STAY_ON: For genpd OF providers, a powered-on PM domain at
+ * initialization is prevented from being
+ * powered-off until the ->sync_state() callback is
+ * invoked. This flag informs genpd to allow a
+ * power-off without waiting for ->sync_state().
*/
#define GENPD_FLAG_PM_CLK (1U << 0)
#define GENPD_FLAG_IRQ_SAFE (1U << 1)
@@ -126,6 +132,7 @@ struct dev_pm_domain_list {
#define GENPD_FLAG_OPP_TABLE_FW (1U << 7)
#define GENPD_FLAG_DEV_NAME_FW (1U << 8)
#define GENPD_FLAG_NO_SYNC_STATE (1U << 9)
+#define GENPD_FLAG_NO_STAY_ON (1U << 10)
enum gpd_status {
GENPD_STATE_ON = 0, /* PM domain is on */
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 1fad1c8a4c76..102202185d7a 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -372,7 +372,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
/*
* Migrate-Disable and why it is undesired.
*
- * When a preempted task becomes elegible to run under the ideal model (IOW it
+ * When a preempted task becomes eligible to run under the ideal model (IOW it
* becomes one of the M highest priority tasks), it might still have to wait
* for the preemptee's migrate_disable() section to complete. Thereby suffering
* a reduction in bandwidth in the exact duration of the migrate_disable()
@@ -387,7 +387,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
* - a lower priority tasks; which under preempt_disable() could've instantly
* migrated away when another CPU becomes available, is now constrained
* by the ability to push the higher priority task away, which might itself be
- * in a migrate_disable() section, reducing it's available bandwidth.
+ * in a migrate_disable() section, reducing its available bandwidth.
*
* IOW it trades latency / moves the interference term, but it stays in the
* system, and as long as it remains unbounded, the system is not fully
@@ -399,7 +399,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
* PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
* number of primitives into becoming preemptible, they would also allow
* migration. This turns out to break a bunch of per-cpu usage. To this end,
- * all these primitives employ migirate_disable() to restore this implicit
+ * all these primitives employ migrate_disable() to restore this implicit
* assumption.
*
* This is a 'temporary' work-around at best. The correct solution is getting
@@ -407,7 +407,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
* per-cpu locking or short preempt-disable regions.
*
* The end goal must be to get rid of migrate_disable(), alternatively we need
- * a schedulability theory that does not depend on abritrary migration.
+ * a schedulability theory that does not depend on arbitrary migration.
*
*
* Notes on the implementation.
@@ -424,8 +424,6 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
* work-conserving schedulers.
*
*/
-extern void migrate_disable(void);
-extern void migrate_enable(void);
/**
* preempt_disable_nested - Disable preemption inside a normally preempt disabled section
@@ -471,7 +469,6 @@ static __always_inline void preempt_enable_nested(void)
DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable())
DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(), preempt_enable_notrace())
-DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
#ifdef CONFIG_PREEMPT_DYNAMIC
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 4b20375f3783..e81b8e596e4f 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -5,7 +5,7 @@
#ifndef _LINUX_PROC_NS_H
#define _LINUX_PROC_NS_H
-#include <linux/ns_common.h>
+#include <linux/nsfs.h>
#include <uapi/linux/nsfs.h>
struct pid_namespace;
@@ -17,7 +17,6 @@ struct inode;
struct proc_ns_operations {
const char *name;
const char *real_ns_name;
- int type;
struct ns_common *(*get)(struct task_struct *task);
void (*put)(struct ns_common *ns);
int (*install)(struct nsset *nsset, struct ns_common *ns);
@@ -66,25 +65,6 @@ static inline void proc_free_inum(unsigned int inum) {}
#endif /* CONFIG_PROC_FS */
-static inline int ns_alloc_inum(struct ns_common *ns)
-{
- WRITE_ONCE(ns->stashed, NULL);
- return proc_alloc_inum(&ns->inum);
-}
-
-#define ns_free_inum(ns) proc_free_inum((ns)->inum)
-
#define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private)
-extern int ns_get_path(struct path *path, struct task_struct *task,
- const struct proc_ns_operations *ns_ops);
-typedef struct ns_common *ns_get_path_helper_t(void *);
-extern int ns_get_path_cb(struct path *path, ns_get_path_helper_t ns_get_cb,
- void *private_data);
-
-extern bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino);
-
-extern int ns_get_name(char *buf, size_t size, struct task_struct *task,
- const struct proc_ns_operations *ns_ops);
-extern void nsfs_init(void);
#endif /* _LINUX_PROC_NS_H */
diff --git a/include/linux/psp-platform-access.h b/include/linux/psp-platform-access.h
index 1504fb012c05..540abf7de048 100644
--- a/include/linux/psp-platform-access.h
+++ b/include/linux/psp-platform-access.h
@@ -7,6 +7,8 @@
enum psp_platform_access_msg {
PSP_CMD_NONE = 0x0,
+ PSP_SFS_GET_FW_VERSIONS,
+ PSP_SFS_UPDATE,
PSP_CMD_HSTI_QUERY = 0x14,
PSP_I2C_REQ_BUS_CMD = 0x64,
PSP_DYNAMIC_BOOST_GET_NONCE,
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 120536f4c6eb..8f346c847ee5 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -24,7 +24,7 @@
#include <linux/compiler.h>
#include <linux/atomic.h>
#include <linux/irqflags.h>
-#include <linux/preempt.h>
+#include <linux/sched.h>
#include <linux/bottom_half.h>
#include <linux/lockdep.h>
#include <linux/cleanup.h>
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index 6fb4894b8cfd..a7d92718b653 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -157,27 +157,42 @@ struct rdt_ctrl_domain {
};
/**
+ * struct mbm_cntr_cfg - Assignable counter configuration.
+ * @evtid: MBM event to which the counter is assigned. Only valid
+ * if @rdtgrp is not NULL.
+ * @rdtgrp: resctrl group assigned to the counter. NULL if the
+ * counter is free.
+ */
+struct mbm_cntr_cfg {
+ enum resctrl_event_id evtid;
+ struct rdtgroup *rdtgrp;
+};
+
+/**
* struct rdt_mon_domain - group of CPUs sharing a resctrl monitor resource
* @hdr: common header for different domain types
* @ci_id: cache info id for this domain
* @rmid_busy_llc: bitmap of which limbo RMIDs are above threshold
- * @mbm_total: saved state for MBM total bandwidth
- * @mbm_local: saved state for MBM local bandwidth
+ * @mbm_states: Per-event pointer to the MBM event's saved state.
+ * An MBM event's state is an array of struct mbm_state
+ * indexed by RMID on x86 or combined CLOSID, RMID on Arm.
* @mbm_over: worker to periodically read MBM h/w counters
* @cqm_limbo: worker to periodically read CQM h/w counters
* @mbm_work_cpu: worker CPU for MBM h/w counters
* @cqm_work_cpu: worker CPU for CQM h/w counters
+ * @cntr_cfg: array of assignable counters' configuration (indexed
+ * by counter ID)
*/
struct rdt_mon_domain {
struct rdt_domain_hdr hdr;
unsigned int ci_id;
unsigned long *rmid_busy_llc;
- struct mbm_state *mbm_total;
- struct mbm_state *mbm_local;
+ struct mbm_state *mbm_states[QOS_NUM_L3_MBM_EVENTS];
struct delayed_work mbm_over;
struct delayed_work cqm_limbo;
int mbm_work_cpu;
int cqm_work_cpu;
+ struct mbm_cntr_cfg *cntr_cfg;
};
/**
@@ -256,39 +271,52 @@ enum resctrl_schema_fmt {
};
/**
+ * struct resctrl_mon - Monitoring related data of a resctrl resource.
+ * @num_rmid: Number of RMIDs available.
+ * @mbm_cfg_mask: Memory transactions that can be tracked when bandwidth
+ * monitoring events can be configured.
+ * @num_mbm_cntrs: Number of assignable counters.
+ * @mbm_cntr_assignable: Is system capable of supporting counter assignment?
+ * @mbm_assign_on_mkdir: True if counters should automatically be assigned to MBM
+ * events of monitor groups created via mkdir.
+ */
+struct resctrl_mon {
+ int num_rmid;
+ unsigned int mbm_cfg_mask;
+ int num_mbm_cntrs;
+ bool mbm_cntr_assignable;
+ bool mbm_assign_on_mkdir;
+};
+
+/**
* struct rdt_resource - attributes of a resctrl resource
* @rid: The index of the resource
* @alloc_capable: Is allocation available on this machine
* @mon_capable: Is monitor feature available on this machine
- * @num_rmid: Number of RMIDs available
* @ctrl_scope: Scope of this resource for control functions
* @mon_scope: Scope of this resource for monitor functions
* @cache: Cache allocation related data
* @membw: If the component has bandwidth controls, their properties.
+ * @mon: Monitoring related data.
* @ctrl_domains: RCU list of all control domains for this resource
* @mon_domains: RCU list of all monitor domains for this resource
* @name: Name to use in "schemata" file.
* @schema_fmt: Which format string and parser is used for this schema.
- * @evt_list: List of monitoring events
- * @mbm_cfg_mask: Bandwidth sources that can be tracked when bandwidth
- * monitoring events can be configured.
* @cdp_capable: Is the CDP feature available on this resource
*/
struct rdt_resource {
int rid;
bool alloc_capable;
bool mon_capable;
- int num_rmid;
enum resctrl_scope ctrl_scope;
enum resctrl_scope mon_scope;
struct resctrl_cache cache;
struct resctrl_membw membw;
+ struct resctrl_mon mon;
struct list_head ctrl_domains;
struct list_head mon_domains;
char *name;
enum resctrl_schema_fmt schema_fmt;
- struct list_head evt_list;
- unsigned int mbm_cfg_mask;
bool cdp_capable;
};
@@ -372,8 +400,29 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r);
u32 resctrl_arch_system_num_rmid_idx(void);
int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid);
+void resctrl_enable_mon_event(enum resctrl_event_id eventid);
+
+bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid);
+
bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt);
+static inline bool resctrl_is_mbm_event(enum resctrl_event_id eventid)
+{
+ return (eventid >= QOS_L3_MBM_TOTAL_EVENT_ID &&
+ eventid <= QOS_L3_MBM_LOCAL_EVENT_ID);
+}
+
+u32 resctrl_get_mon_evt_cfg(enum resctrl_event_id eventid);
+
+/* Iterate over all memory bandwidth events */
+#define for_each_mbm_event_id(eventid) \
+ for (eventid = QOS_L3_MBM_TOTAL_EVENT_ID; \
+ eventid <= QOS_L3_MBM_LOCAL_EVENT_ID; eventid++)
+
+/* Iterate over memory bandwidth arrays in domain structures */
+#define for_each_mbm_idx(idx) \
+ for (idx = 0; idx < QOS_NUM_L3_MBM_EVENTS; idx++)
+
/**
* resctrl_arch_mon_event_config_write() - Write the config for an event.
* @config_info: struct resctrl_mon_config_info describing the resource, domain
@@ -416,6 +465,26 @@ static inline u32 resctrl_get_config_index(u32 closid,
bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l);
int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable);
+/**
+ * resctrl_arch_mbm_cntr_assign_enabled() - Check if MBM counter assignment
+ * mode is enabled.
+ * @r: Pointer to the resource structure.
+ *
+ * Return:
+ * true if the assignment mode is enabled, false otherwise.
+ */
+bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r);
+
+/**
+ * resctrl_arch_mbm_cntr_assign_set() - Configure the MBM counter assignment mode.
+ * @r: Pointer to the resource structure.
+ * @enable: Set to true to enable, false to disable the assignment mode.
+ *
+ * Return:
+ * 0 on success, < 0 on error.
+ */
+int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable);
+
/*
* Update the ctrl_val and apply this config right now.
* Must be called on one of the domain's CPUs.
@@ -528,6 +597,63 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *
*/
void resctrl_arch_reset_all_ctrls(struct rdt_resource *r);
+/**
+ * resctrl_arch_config_cntr() - Configure the counter with its new RMID
+ * and event details.
+ * @r: Resource structure.
+ * @d: The domain in which counter with ID @cntr_id should be configured.
+ * @evtid: Monitoring event type (e.g., QOS_L3_MBM_TOTAL_EVENT_ID
+ * or QOS_L3_MBM_LOCAL_EVENT_ID).
+ * @rmid: RMID.
+ * @closid: CLOSID.
+ * @cntr_id: Counter ID to configure.
+ * @assign: True to assign the counter or update an existing assignment,
+ * false to unassign the counter.
+ *
+ * This can be called from any CPU.
+ */
+void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
+ enum resctrl_event_id evtid, u32 rmid, u32 closid,
+ u32 cntr_id, bool assign);
+
+/**
+ * resctrl_arch_cntr_read() - Read the event data corresponding to the counter ID
+ * assigned to the RMID, event pair for this resource
+ * and domain.
+ * @r: Resource that the counter should be read from.
+ * @d: Domain that the counter should be read from.
+ * @closid: CLOSID that matches the RMID.
+ * @rmid: The RMID to which @cntr_id is assigned.
+ * @cntr_id: The counter to read.
+ * @eventid: The MBM event to which @cntr_id is assigned.
+ * @val: Result of the counter read in bytes.
+ *
+ * Called on a CPU that belongs to domain @d when "mbm_event" mode is enabled.
+ * Called from a non-migratable process context via smp_call_on_cpu() unless all
+ * CPUs are nohz_full, in which case it is called via IPI (smp_call_function_any()).
+ *
+ * Return:
+ * 0 on success, or -EIO, -EINVAL etc on error.
+ */
+int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 closid, u32 rmid, int cntr_id,
+ enum resctrl_event_id eventid, u64 *val);
+
+/**
+ * resctrl_arch_reset_cntr() - Reset any private state associated with counter ID.
+ * @r: The domain's resource.
+ * @d: The counter ID's domain.
+ * @closid: CLOSID that matches the RMID.
+ * @rmid: The RMID to which @cntr_id is assigned.
+ * @cntr_id: The counter to reset.
+ * @eventid: The MBM event to which @cntr_id is assigned.
+ *
+ * This can be called from any CPU.
+ */
+void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 closid, u32 rmid, int cntr_id,
+ enum resctrl_event_id eventid);
+
extern unsigned int resctrl_rmid_realloc_threshold;
extern unsigned int resctrl_rmid_realloc_limit;
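A rough usage sketch of the new counter-assignment hooks; the helper below and its call sequencing are hypothetical (in the filesystem code assignment and reads happen at different times, and the read must execute on a CPU of domain @d, e.g. via smp_call_on_cpu()):

static int example_read_mbm_total(struct rdt_resource *r,
				  struct rdt_mon_domain *d,
				  u32 closid, u32 rmid, u32 cntr_id,
				  u64 *bytes)
{
	/* Bind the hardware counter to this (CLOSID, RMID) and event. */
	resctrl_arch_config_cntr(r, d, QOS_L3_MBM_TOTAL_EVENT_ID,
				 rmid, closid, cntr_id, true);

	/* Runs on a CPU in @d when "mbm_event" mode is enabled. */
	return resctrl_arch_cntr_read(r, d, closid, rmid, cntr_id,
				      QOS_L3_MBM_TOTAL_EVENT_ID, bytes);
}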
diff --git a/include/linux/resctrl_types.h b/include/linux/resctrl_types.h
index a25fb9c4070d..acfe07860b34 100644
--- a/include/linux/resctrl_types.h
+++ b/include/linux/resctrl_types.h
@@ -34,11 +34,18 @@
/* Max event bits supported */
#define MAX_EVT_CONFIG_BITS GENMASK(6, 0)
-/*
- * Event IDs, the values match those used to program IA32_QM_EVTSEL before
- * reading IA32_QM_CTR on RDT systems.
- */
+/* Number of memory transactions that an MBM event can be configured with */
+#define NUM_MBM_TRANSACTIONS 7
+
+/* Event IDs */
enum resctrl_event_id {
+ /* Must match value of first event below */
+ QOS_FIRST_EVENT = 0x01,
+
+ /*
+ * These values match those used to program IA32_QM_EVTSEL before
+ * reading IA32_QM_CTR on RDT systems.
+ */
QOS_L3_OCCUP_EVENT_ID = 0x01,
QOS_L3_MBM_TOTAL_EVENT_ID = 0x02,
QOS_L3_MBM_LOCAL_EVENT_ID = 0x03,
@@ -47,4 +54,7 @@ enum resctrl_event_id {
QOS_NUM_EVENTS,
};
+#define QOS_NUM_L3_MBM_EVENTS (QOS_L3_MBM_LOCAL_EVENT_ID - QOS_L3_MBM_TOTAL_EVENT_ID + 1)
+#define MBM_STATE_IDX(evt) ((evt) - QOS_L3_MBM_TOTAL_EVENT_ID)
+
#endif /* __LINUX_RESCTRL_TYPES_H */
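With the event values above, MBM_STATE_IDX(QOS_L3_MBM_TOTAL_EVENT_ID) is 0, MBM_STATE_IDX(QOS_L3_MBM_LOCAL_EVENT_ID) is 1, and QOS_NUM_L3_MBM_EVENTS is 2, so per-domain arrays sized by QOS_NUM_L3_MBM_EVENTS can be indexed directly. A minimal sketch, assuming struct rdt_mon_domain from <linux/resctrl.h> and an RMID-based index as described there (example_mbm_state() is illustrative only):

static struct mbm_state *example_mbm_state(struct rdt_mon_domain *d,
					   enum resctrl_event_id evtid,
					   u32 idx)
{
	return &d->mbm_states[MBM_STATE_IDX(evtid)][idx];
}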
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index bc8af3eb5598..69553e7c14c1 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -7,6 +7,12 @@
#include <linux/preempt.h>
#include <linux/sched.h>
+#ifdef CONFIG_MEMBARRIER
+# define RSEQ_EVENT_GUARD irq
+#else
+# define RSEQ_EVENT_GUARD preempt
+#endif
+
/*
* Map the event mask on the user-space ABI enum rseq_cs_flags
* for direct mask checks.
@@ -41,9 +47,8 @@ static inline void rseq_handle_notify_resume(struct ksignal *ksig,
static inline void rseq_signal_deliver(struct ksignal *ksig,
struct pt_regs *regs)
{
- preempt_disable();
- __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
- preempt_enable();
+ scoped_guard(RSEQ_EVENT_GUARD)
+ __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
rseq_handle_notify_resume(ksig, regs);
}
@@ -65,7 +70,7 @@ static inline void rseq_migrate(struct task_struct *t)
* If parent process has a registered restartable sequences area, the
* child inherits. Unregister rseq for a clone with CLONE_VM set.
*/
-static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
+static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
{
if (clone_flags & CLONE_VM) {
t->rseq = NULL;
@@ -107,7 +112,7 @@ static inline void rseq_preempt(struct task_struct *t)
static inline void rseq_migrate(struct task_struct *t)
{
}
-static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
+static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
{
}
static inline void rseq_execve(struct task_struct *t)
diff --git a/include/linux/rv.h b/include/linux/rv.h
index 14410a42faef..9520aab34bcb 100644
--- a/include/linux/rv.h
+++ b/include/linux/rv.h
@@ -7,16 +7,14 @@
#ifndef _LINUX_RV_H
#define _LINUX_RV_H
-#include <linux/types.h>
-#include <linux/list.h>
-
#define MAX_DA_NAME_LEN 32
#define MAX_DA_RETRY_RACING_EVENTS 3
#ifdef CONFIG_RV
+#include <linux/array_size.h>
#include <linux/bitops.h>
+#include <linux/list.h>
#include <linux/types.h>
-#include <linux/array_size.h>
/*
* Deterministic automaton per-object variables.
diff --git a/include/linux/rw_hint.h b/include/linux/rw_hint.h
index 309ca72f2dfb..adcc43042c90 100644
--- a/include/linux/rw_hint.h
+++ b/include/linux/rw_hint.h
@@ -14,6 +14,7 @@ enum rw_hint {
WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM,
WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG,
WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME,
+ WRITE_LIFE_HINT_NR,
} __packed;
/* Sparse ignores __packed annotations on enums, hence the #ifndef below. */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2b272382673d..cbb7340c5866 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -49,6 +49,9 @@
#include <linux/tracepoint-defs.h>
#include <linux/unwind_deferred_types.h>
#include <asm/kmap_size.h>
+#ifndef COMPILE_OFFSETS
+#include <generated/rq-offsets.h>
+#endif
/* task_struct member predeclarations (sorted alphabetically): */
struct audit_context;
@@ -706,7 +709,6 @@ struct sched_dl_entity {
unsigned int dl_defer : 1;
unsigned int dl_defer_armed : 1;
unsigned int dl_defer_running : 1;
- unsigned int dl_server_idle : 1;
/*
* Bandwidth enforcement timer. Each -deadline task has its
@@ -733,7 +735,6 @@ struct sched_dl_entity {
* runnable task.
*/
struct rq *rq;
- dl_server_has_tasks_f server_has_tasks;
dl_server_pick_f server_pick_task;
#ifdef CONFIG_RT_MUTEXES
@@ -883,6 +884,11 @@ struct task_struct {
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
+#ifdef CONFIG_CFS_BANDWIDTH
+ struct callback_head sched_throttle_work;
+ struct list_head throttle_node;
+ bool throttled;
+#endif
#endif
@@ -2152,6 +2158,8 @@ static inline struct mutex *__get_task_blocked_on(struct task_struct *p)
static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m)
{
+ struct mutex *blocked_on = READ_ONCE(p->blocked_on);
+
WARN_ON_ONCE(!m);
/* The task should only be setting itself as blocked */
WARN_ON_ONCE(p != current);
@@ -2162,8 +2170,8 @@ static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m)
* with a different mutex. Note, setting it to the same
* lock repeatedly is ok.
*/
- WARN_ON_ONCE(p->blocked_on && p->blocked_on != m);
- p->blocked_on = m;
+ WARN_ON_ONCE(blocked_on && blocked_on != m);
+ WRITE_ONCE(p->blocked_on, m);
}
static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m)
@@ -2174,16 +2182,19 @@ static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m)
static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m)
{
- WARN_ON_ONCE(!m);
- /* Currently we serialize blocked_on under the mutex::wait_lock */
- lockdep_assert_held_once(&m->wait_lock);
- /*
- * There may be cases where we re-clear already cleared
- * blocked_on relationships, but make sure we are not
- * clearing the relationship with a different lock.
- */
- WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m);
- p->blocked_on = NULL;
+ if (m) {
+ struct mutex *blocked_on = READ_ONCE(p->blocked_on);
+
+ /* Currently we serialize blocked_on under the mutex::wait_lock */
+ lockdep_assert_held_once(&m->wait_lock);
+ /*
+ * There may be cases where we re-clear already cleared
+ * blocked_on relationships, but make sure we are not
+ * clearing the relationship with a different lock.
+ */
+ WARN_ON_ONCE(blocked_on && blocked_on != m);
+ }
+ WRITE_ONCE(p->blocked_on, NULL);
}
static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m)
@@ -2307,4 +2318,114 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo
#define alloc_tag_restore(_tag, _old) do {} while (0)
#endif
+#ifndef MODULE
+#ifndef COMPILE_OFFSETS
+
+extern void ___migrate_enable(void);
+
+struct rq;
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+
+/*
+ * The "struct rq" is not available here, so we can't access the
+ * "runqueues" with this_cpu_ptr(), as the compilation will fail in
+ * this_cpu_ptr() -> raw_cpu_ptr() -> __verify_pcpu_ptr():
+ * typeof((ptr) + 0)
+ *
+ * So use arch_raw_cpu_ptr()/PERCPU_PTR() directly here.
+ */
+#ifdef CONFIG_SMP
+#define this_rq_raw() arch_raw_cpu_ptr(&runqueues)
+#else
+#define this_rq_raw() PERCPU_PTR(&runqueues)
+#endif
+#define this_rq_pinned() (*(unsigned int *)((void *)this_rq_raw() + RQ_nr_pinned))
+
+static inline void __migrate_enable(void)
+{
+ struct task_struct *p = current;
+
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Check both overflow from migrate_disable() and superfluous
+ * migrate_enable().
+ */
+ if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
+ return;
+#endif
+
+ if (p->migration_disabled > 1) {
+ p->migration_disabled--;
+ return;
+ }
+
+ /*
+ * Ensure stop_task runs either before or after this, and that
+ * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
+ */
+ guard(preempt)();
+ if (unlikely(p->cpus_ptr != &p->cpus_mask))
+ ___migrate_enable();
+ /*
+ * Mustn't clear migration_disabled() until cpus_ptr points back at the
+ * regular cpus_mask, otherwise things that race (eg.
+ * select_fallback_rq) get confused.
+ */
+ barrier();
+ p->migration_disabled = 0;
+ this_rq_pinned()--;
+}
+
+static inline void __migrate_disable(void)
+{
+ struct task_struct *p = current;
+
+ if (p->migration_disabled) {
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Warn about overflow half-way through the range.
+ */
+ WARN_ON_ONCE((s16)p->migration_disabled < 0);
+#endif
+ p->migration_disabled++;
+ return;
+ }
+
+ guard(preempt)();
+ this_rq_pinned()++;
+ p->migration_disabled = 1;
+}
+#else /* !COMPILE_OFFSETS */
+static inline void __migrate_disable(void) { }
+static inline void __migrate_enable(void) { }
+#endif /* !COMPILE_OFFSETS */
+
+/*
+ * So that it is possible to not export the runqueues variable, define and
+ * export migrate_enable/migrate_disable in kernel/sched/core.c too, and use
+ * them for the modules. The macro "INSTANTIATE_EXPORTED_MIGRATE_DISABLE" will
+ * be defined in kernel/sched/core.c.
+ */
+#ifndef INSTANTIATE_EXPORTED_MIGRATE_DISABLE
+static inline void migrate_disable(void)
+{
+ __migrate_disable();
+}
+
+static inline void migrate_enable(void)
+{
+ __migrate_enable();
+}
+#else /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */
+extern void migrate_disable(void);
+extern void migrate_enable(void);
+#endif /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */
+
+#else /* MODULE */
+extern void migrate_disable(void);
+extern void migrate_enable(void);
+#endif /* MODULE */
+
+DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
+
#endif
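A minimal usage sketch of the relocated migrate guard; the per-CPU variable and helper below are hypothetical. The guard pins the task to its current CPU for the scope, so a per-CPU pointer obtained once stays valid across preemption, though protection against other tasks running on the same CPU still has to come from elsewhere (e.g. a local_lock):

struct example_pcpu_buf { char data[64]; };
DEFINE_PER_CPU(struct example_pcpu_buf, example_pcpu_buf);

static void example_use_pcpu_buf(void)
{
	struct example_pcpu_buf *buf;

	guard(migrate)();
	buf = this_cpu_ptr(&example_pcpu_buf);
	/* ... operate on buf; the task cannot migrate away here ... */
}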
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 7047101dbf58..d82b7a9b0658 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -108,7 +108,11 @@ enum scx_kf_mask {
SCX_KF_UNLOCKED = 0, /* sleepable and not rq locked */
/* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */
SCX_KF_CPU_RELEASE = 1 << 0, /* ops.cpu_release() */
- /* ops.dequeue (in REST) may be nested inside DISPATCH */
+ /*
+ * ops.dispatch() may release rq lock temporarily and thus ENQUEUE and
+ * SELECT_CPU may be nested inside. ops.dequeue (in REST) may also be
+ * nested inside DISPATCH.
+ */
SCX_KF_DISPATCH = 1 << 1, /* ops.dispatch() */
SCX_KF_ENQUEUE = 1 << 2, /* ops.enqueue() and ops.select_cpu() */
SCX_KF_SELECT_CPU = 1 << 3, /* ops.select_cpu() */
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 1ef1edbaaf79..7d6449982822 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -226,6 +226,10 @@ struct signal_struct {
struct tty_audit_buf *tty_audit_buf;
#endif
+#ifdef CONFIG_CGROUPS
+ struct rw_semaphore cgroup_threadgroup_rwsem;
+#endif
+
/*
* Thread is the potential origin of an oom condition; kill first on
* oom
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index ea41795a352b..34d6a0e108c3 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -63,7 +63,7 @@ extern int lockdep_tasklist_lock_is_held(void);
extern asmlinkage void schedule_tail(struct task_struct *prev);
extern void init_idle(struct task_struct *idle, int cpu);
-extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
+extern int sched_fork(u64 clone_flags, struct task_struct *p);
extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs);
extern void sched_cancel_fork(struct task_struct *p);
extern void sched_post_fork(struct task_struct *p);
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 5263746b63e8..bbcfdf12aa6e 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -30,33 +30,24 @@ struct sd_flag_debug {
};
extern const struct sd_flag_debug sd_flag_debug[];
+struct sched_domain_topology_level;
+
#ifdef CONFIG_SCHED_SMT
-static inline int cpu_smt_flags(void)
-{
- return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC;
-}
+extern int cpu_smt_flags(void);
+extern const struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu);
#endif
#ifdef CONFIG_SCHED_CLUSTER
-static inline int cpu_cluster_flags(void)
-{
- return SD_CLUSTER | SD_SHARE_LLC;
-}
+extern int cpu_cluster_flags(void);
+extern const struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu);
#endif
#ifdef CONFIG_SCHED_MC
-static inline int cpu_core_flags(void)
-{
- return SD_SHARE_LLC;
-}
+extern int cpu_core_flags(void);
+extern const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu);
#endif
-#ifdef CONFIG_NUMA
-static inline int cpu_numa_flags(void)
-{
- return SD_NUMA;
-}
-#endif
+extern const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu);
extern int arch_asym_cpu_priority(int cpu);
@@ -172,7 +163,7 @@ bool cpus_equal_capacity(int this_cpu, int that_cpu);
bool cpus_share_cache(int this_cpu, int that_cpu);
bool cpus_share_resources(int this_cpu, int that_cpu);
-typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
+typedef const struct cpumask *(*sched_domain_mask_f)(struct sched_domain_topology_level *tl, int cpu);
typedef int (*sched_domain_flags_f)(void);
struct sd_data {
diff --git a/include/linux/security.h b/include/linux/security.h
index 521bcb5b9717..bd33f194c94a 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -489,7 +489,7 @@ int security_file_receive(struct file *file);
int security_file_open(struct file *file);
int security_file_post_open(struct file *file, int mask);
int security_file_truncate(struct file *file);
-int security_task_alloc(struct task_struct *task, unsigned long clone_flags);
+int security_task_alloc(struct task_struct *task, u64 clone_flags);
void security_task_free(struct task_struct *task);
int security_cred_alloc_blank(struct cred *cred, gfp_t gfp);
void security_cred_free(struct cred *cred);
@@ -567,7 +567,8 @@ int security_getprocattr(struct task_struct *p, int lsmid, const char *name,
int security_setprocattr(int lsmid, const char *name, void *value, size_t size);
int security_ismaclabel(const char *name);
int security_secid_to_secctx(u32 secid, struct lsm_context *cp);
-int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp);
+int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp,
+ int lsmid);
int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid);
void security_release_secctx(struct lsm_context *cp);
void security_inode_invalidate_secctx(struct inode *inode);
@@ -1215,7 +1216,7 @@ static inline int security_file_truncate(struct file *file)
}
static inline int security_task_alloc(struct task_struct *task,
- unsigned long clone_flags)
+ u64 clone_flags)
{
return 0;
}
@@ -1551,7 +1552,8 @@ static inline int security_secid_to_secctx(u32 secid, struct lsm_context *cp)
}
static inline int security_lsmprop_to_secctx(struct lsm_prop *prop,
- struct lsm_context *cp)
+ struct lsm_context *cp,
+ int lsmid)
{
return -EOPNOTSUPP;
}
diff --git a/include/linux/sem.h b/include/linux/sem.h
index c4deefe42aeb..275269ce2ec8 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -9,12 +9,12 @@ struct task_struct;
#ifdef CONFIG_SYSVIPC
-extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk);
+extern int copy_semundo(u64 clone_flags, struct task_struct *tsk);
extern void exit_sem(struct task_struct *tsk);
#else
-static inline int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
+static inline int copy_semundo(u64 clone_flags, struct task_struct *tsk)
{
return 0;
}
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 14b923ddb6df..fa633657e4c0 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4172,6 +4172,8 @@ int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset,
struct iov_iter *to, int len, u32 *crcp);
int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
struct iov_iter *from, int len);
+int skb_copy_datagram_from_iter_full(struct sk_buff *skb, int offset,
+ struct iov_iter *from, int len);
int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm);
void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags);
diff --git a/include/linux/stddef.h b/include/linux/stddef.h
index dab49e2ec8c0..80b6bfb944f0 100644
--- a/include/linux/stddef.h
+++ b/include/linux/stddef.h
@@ -94,7 +94,8 @@ enum {
__DECLARE_FLEX_ARRAY(TYPE, NAME)
/**
- * TRAILING_OVERLAP() - Overlap a flexible-array member with trailing members.
+ * __TRAILING_OVERLAP() - Overlap a flexible-array member with trailing
+ * members.
*
* Creates a union between a flexible-array member (FAM) in a struct and a set
* of additional members that would otherwise follow it.
@@ -102,15 +103,30 @@ enum {
* @TYPE: Flexible structure type name, including "struct" keyword.
* @NAME: Name for a variable to define.
* @FAM: The flexible-array member within @TYPE
+ * @ATTRS: Any struct attributes (usually empty)
* @MEMBERS: Trailing overlapping members.
*/
-#define TRAILING_OVERLAP(TYPE, NAME, FAM, MEMBERS) \
+#define __TRAILING_OVERLAP(TYPE, NAME, FAM, ATTRS, MEMBERS) \
union { \
TYPE NAME; \
struct { \
- unsigned char __offset_to_##FAM[offsetof(TYPE, FAM)]; \
+ unsigned char __offset_to_FAM[offsetof(TYPE, FAM)]; \
MEMBERS \
- }; \
+ } ATTRS; \
}
+/**
+ * TRAILING_OVERLAP() - Overlap a flexible-array member with trailing members.
+ *
+ * Creates a union between a flexible-array member (FAM) in a struct and a set
+ * of additional members that would otherwise follow it.
+ *
+ * @TYPE: Flexible structure type name, including "struct" keyword.
+ * @NAME: Name for a variable to define.
+ * @FAM: The flexible-array member within @TYPE
+ * @MEMBERS: Trailing overlapping members.
+ */
+#define TRAILING_OVERLAP(TYPE, NAME, FAM, MEMBERS) \
+ __TRAILING_OVERLAP(TYPE, NAME, FAM, /* no attrs */, MEMBERS)
+
#endif
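A short usage sketch (the struct names are illustrative): TRAILING_OVERLAP() places additional named members at the offset of a header's flexible array, so both views describe the same trailing storage:

struct ex_hdr {
	u32 len;
	u8 payload[];		/* flexible-array member */
};

struct ex_frame {
	TRAILING_OVERLAP(struct ex_hdr, hdr, payload,
		u8 cmd;
		u8 status;
	);
};

Here ex_frame.cmd and ex_frame.status alias hdr.payload[0] and hdr.payload[1].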
diff --git a/include/linux/string_choices.h b/include/linux/string_choices.h
index f3ba4f52ff26..6c4077be7742 100644
--- a/include/linux/string_choices.h
+++ b/include/linux/string_choices.h
@@ -17,6 +17,12 @@
#include <linux/types.h>
+static inline const char *str_assert_deassert(bool v)
+{
+ return v ? "assert" : "deassert";
+}
+#define str_deassert_assert(v) str_assert_deassert(!(v))
+
static inline const char *str_enable_disable(bool v)
{
return v ? "enable" : "disable";
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 2fe6ed2cc3fd..7012a0f758d8 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -385,6 +385,16 @@ void folio_add_lru_vma(struct folio *, struct vm_area_struct *);
void mark_page_accessed(struct page *);
void folio_mark_accessed(struct folio *);
+static inline bool folio_may_be_lru_cached(struct folio *folio)
+{
+ /*
+ * Holding PMD-sized folios in per-CPU LRU cache unbalances accounting.
+ * Holding small numbers of low-order mTHP folios in per-CPU LRU cache
+ * will be sensible, but nobody has implemented and tested that yet.
+ */
+ return !folio_test_large(folio);
+}
+
extern atomic_t lru_disable_count;
static inline bool lru_cache_disabled(void)
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 77f45e5d4413..66c06fcdfe19 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1005,6 +1005,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on);
asmlinkage long sys_uretprobe(void);
+asmlinkage long sys_uprobe(void);
+
/* pciconfig: alpha, arm, arm64, ia64, sparc */
asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn,
unsigned long off, unsigned long len,
diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h
index bb2c52f4fc94..c514d0e5a45c 100644
--- a/include/linux/time_namespace.h
+++ b/include/linux/time_namespace.h
@@ -33,17 +33,22 @@ struct time_namespace {
extern struct time_namespace init_time_ns;
#ifdef CONFIG_TIME_NS
+static inline struct time_namespace *to_time_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct time_namespace, ns);
+}
+void __init time_ns_init(void);
extern int vdso_join_timens(struct task_struct *task,
struct time_namespace *ns);
extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns);
static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
{
- refcount_inc(&ns->ns.count);
+ ns_ref_inc(ns);
return ns;
}
-struct time_namespace *copy_time_ns(unsigned long flags,
+struct time_namespace *copy_time_ns(u64 flags,
struct user_namespace *user_ns,
struct time_namespace *old_ns);
void free_time_ns(struct time_namespace *ns);
@@ -52,7 +57,7 @@ struct page *find_timens_vvar_page(struct vm_area_struct *vma);
static inline void put_time_ns(struct time_namespace *ns)
{
- if (refcount_dec_and_test(&ns->ns.count))
+ if (ns_ref_put(ns))
free_time_ns(ns);
}
@@ -108,6 +113,10 @@ static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim)
}
#else
+static inline void __init time_ns_init(void)
+{
+}
+
static inline int vdso_join_timens(struct task_struct *task,
struct time_namespace *ns)
{
@@ -129,7 +138,7 @@ static inline void put_time_ns(struct time_namespace *ns)
}
static inline
-struct time_namespace *copy_time_ns(unsigned long flags,
+struct time_namespace *copy_time_ns(u64 flags,
struct user_namespace *user_ns,
struct time_namespace *old_ns)
{
diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index c27aac67cb3f..b8ae89ea28ab 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -76,6 +76,7 @@ struct tk_read_base {
* @cs_was_changed_seq: The sequence number of clocksource change events
* @clock_valid: Indicator for valid clock
* @monotonic_to_boot: CLOCK_MONOTONIC to CLOCK_BOOTTIME offset
+ * @monotonic_to_aux: CLOCK_MONOTONIC to CLOCK_AUX offset
* @cycle_interval: Number of clock cycles in one NTP interval
* @xtime_interval: Number of clock shifted nano seconds in one NTP
* interval.
@@ -117,6 +118,9 @@ struct tk_read_base {
* @offs_aux is used by the auxiliary timekeepers which do not utilize any
* of the regular timekeeper offset fields.
*
+ * @monotonic_to_aux is a timespec64 representation of @offs_aux to
+ * accelerate the VDSO update for CLOCK_AUX.
+ *
* The cacheline ordering of the structure is optimized for in kernel usage of
* the ktime_get() and ktime_get_ts64() family of time accessors. Struct
* timekeeper is prepended in the core timekeeping code with a sequence count,
@@ -159,7 +163,10 @@ struct timekeeper {
u8 cs_was_changed_seq;
u8 clock_valid;
- struct timespec64 monotonic_to_boot;
+ union {
+ struct timespec64 monotonic_to_boot;
+ struct timespec64 monotonic_to_aux;
+ };
u64 cycle_interval;
u64 xtime_interval;
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 33b7fda97d39..6575af39fd10 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -260,7 +260,7 @@ static inline bool topology_is_primary_thread(unsigned int cpu)
#endif
-static inline const struct cpumask *cpu_cpu_mask(int cpu)
+static inline const struct cpumask *cpu_node_mask(int cpu)
{
return cpumask_of_node(cpu_to_node(cpu));
}
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 516217c39094..ee3d36eda45d 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -17,6 +17,7 @@
#include <linux/wait.h>
#include <linux/timer.h>
#include <linux/seqlock.h>
+#include <linux/mutex.h>
struct uprobe;
struct vm_area_struct;
@@ -185,8 +186,14 @@ struct xol_area;
struct uprobes_state {
struct xol_area *xol_area;
+#ifdef CONFIG_X86_64
+ struct hlist_head head_tramps;
+#endif
};
+typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr,
+ uprobe_opcode_t *insn, int nbytes, void *data);
+
extern void __init uprobes_init(void);
extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr);
extern int set_orig_insn(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr);
@@ -194,7 +201,11 @@ extern bool is_swbp_insn(uprobe_opcode_t *insn);
extern bool is_trap_insn(uprobe_opcode_t *insn);
extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs);
extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);
-extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t);
+extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t,
+ bool is_register);
+extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr,
+ uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr,
+ void *data);
extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool);
extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc);
@@ -205,7 +216,7 @@ extern void uprobe_start_dup_mmap(void);
extern void uprobe_end_dup_mmap(void);
extern void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm);
extern void uprobe_free_utask(struct task_struct *t);
-extern void uprobe_copy_process(struct task_struct *t, unsigned long flags);
+extern void uprobe_copy_process(struct task_struct *t, u64 flags);
extern int uprobe_post_sstep_notifier(struct pt_regs *regs);
extern int uprobe_pre_sstep_notifier(struct pt_regs *regs);
extern void uprobe_notify_resume(struct pt_regs *regs);
@@ -224,8 +235,13 @@ extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs);
extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
void *src, unsigned long len);
extern void uprobe_handle_trampoline(struct pt_regs *regs);
-extern void *arch_uprobe_trampoline(unsigned long *psize);
+extern void *arch_uretprobe_trampoline(unsigned long *psize);
extern unsigned long uprobe_get_trampoline_vaddr(void);
+extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len);
+extern void arch_uprobe_clear_state(struct mm_struct *mm);
+extern void arch_uprobe_init_state(struct mm_struct *mm);
+extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr);
+extern void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr);
#else /* !CONFIG_UPROBES */
struct uprobes_state {
};
@@ -281,7 +297,7 @@ static inline bool uprobe_deny_signal(void)
static inline void uprobe_free_utask(struct task_struct *t)
{
}
-static inline void uprobe_copy_process(struct task_struct *t, unsigned long flags)
+static inline void uprobe_copy_process(struct task_struct *t, u64 flags)
{
}
static inline void uprobe_clear_state(struct mm_struct *mm)
diff --git a/include/linux/user_events.h b/include/linux/user_events.h
index 8afa8c3a0973..57d1ff006090 100644
--- a/include/linux/user_events.h
+++ b/include/linux/user_events.h
@@ -33,7 +33,7 @@ extern void user_event_mm_dup(struct task_struct *t,
extern void user_event_mm_remove(struct task_struct *t);
static inline void user_events_fork(struct task_struct *t,
- unsigned long clone_flags)
+ u64 clone_flags)
{
struct user_event_mm *old_mm;
@@ -68,7 +68,7 @@ static inline void user_events_exit(struct task_struct *t)
}
#else
static inline void user_events_fork(struct task_struct *t,
- unsigned long clone_flags)
+ u64 clone_flags)
{
}
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index a0bb6d012137..9a9aebbf96b9 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -168,10 +168,15 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns,
#ifdef CONFIG_USER_NS
+static inline struct user_namespace *to_user_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct user_namespace, ns);
+}
+
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
{
if (ns)
- refcount_inc(&ns->ns.count);
+ ns_ref_inc(ns);
return ns;
}
@@ -181,7 +186,7 @@ extern void __put_user_ns(struct user_namespace *ns);
static inline void put_user_ns(struct user_namespace *ns)
{
- if (ns && refcount_dec_and_test(&ns->ns.count))
+ if (ns && ns_ref_put(ns))
__put_user_ns(ns);
}
diff --git a/include/linux/uts_namespace.h b/include/linux/uts_namespace.h
new file mode 100644
index 000000000000..60f37fec0f4b
--- /dev/null
+++ b/include/linux/uts_namespace.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_UTS_NAMESPACE_H
+#define _LINUX_UTS_NAMESPACE_H
+
+#include <linux/ns_common.h>
+#include <uapi/linux/utsname.h>
+
+struct user_namespace;
+extern struct user_namespace init_user_ns;
+
+struct uts_namespace {
+ struct new_utsname name;
+ struct user_namespace *user_ns;
+ struct ucounts *ucounts;
+ struct ns_common ns;
+} __randomize_layout;
+
+extern struct uts_namespace init_uts_ns;
+
+#ifdef CONFIG_UTS_NS
+static inline struct uts_namespace *to_uts_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct uts_namespace, ns);
+}
+
+static inline void get_uts_ns(struct uts_namespace *ns)
+{
+ ns_ref_inc(ns);
+}
+
+extern struct uts_namespace *copy_utsname(u64 flags,
+ struct user_namespace *user_ns, struct uts_namespace *old_ns);
+extern void free_uts_ns(struct uts_namespace *ns);
+
+static inline void put_uts_ns(struct uts_namespace *ns)
+{
+ if (ns_ref_put(ns))
+ free_uts_ns(ns);
+}
+
+void uts_ns_init(void);
+#else
+static inline void get_uts_ns(struct uts_namespace *ns)
+{
+}
+
+static inline void put_uts_ns(struct uts_namespace *ns)
+{
+}
+
+static inline struct uts_namespace *copy_utsname(u64 flags,
+ struct user_namespace *user_ns, struct uts_namespace *old_ns)
+{
+ if (flags & CLONE_NEWUTS)
+ return ERR_PTR(-EINVAL);
+
+ return old_ns;
+}
+
+static inline void uts_ns_init(void)
+{
+}
+#endif
+
+#endif /* _LINUX_UTS_NAMESPACE_H */
diff --git a/include/linux/utsname.h b/include/linux/utsname.h
index bf7613ba412b..547bd4439706 100644
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@@ -7,7 +7,7 @@
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
#include <linux/err.h>
-#include <uapi/linux/utsname.h>
+#include <linux/uts_namespace.h>
enum uts_proc {
UTS_PROC_ARCH,
@@ -18,57 +18,6 @@ enum uts_proc {
UTS_PROC_DOMAINNAME,
};
-struct user_namespace;
-extern struct user_namespace init_user_ns;
-
-struct uts_namespace {
- struct new_utsname name;
- struct user_namespace *user_ns;
- struct ucounts *ucounts;
- struct ns_common ns;
-} __randomize_layout;
-extern struct uts_namespace init_uts_ns;
-
-#ifdef CONFIG_UTS_NS
-static inline void get_uts_ns(struct uts_namespace *ns)
-{
- refcount_inc(&ns->ns.count);
-}
-
-extern struct uts_namespace *copy_utsname(unsigned long flags,
- struct user_namespace *user_ns, struct uts_namespace *old_ns);
-extern void free_uts_ns(struct uts_namespace *ns);
-
-static inline void put_uts_ns(struct uts_namespace *ns)
-{
- if (refcount_dec_and_test(&ns->ns.count))
- free_uts_ns(ns);
-}
-
-void uts_ns_init(void);
-#else
-static inline void get_uts_ns(struct uts_namespace *ns)
-{
-}
-
-static inline void put_uts_ns(struct uts_namespace *ns)
-{
-}
-
-static inline struct uts_namespace *copy_utsname(unsigned long flags,
- struct user_namespace *user_ns, struct uts_namespace *old_ns)
-{
- if (flags & CLONE_NEWUTS)
- return ERR_PTR(-EINVAL);
-
- return old_ns;
-}
-
-static inline void uts_ns_init(void)
-{
-}
-#endif
-
#ifdef CONFIG_PROC_SYSCTL
extern void uts_proc_notify(enum uts_proc proc);
#else
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 918cf25cd3c6..7427b79d6f3d 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -193,14 +193,15 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
}
static inline void virtio_get_features(struct virtio_device *vdev,
- u64 *features)
+ u64 *features_out)
{
if (vdev->config->get_extended_features) {
- vdev->config->get_extended_features(vdev, features);
+ vdev->config->get_extended_features(vdev, features_out);
return;
}
- virtio_features_from_u64(features, vdev->config->get_features(vdev));
+ virtio_features_from_u64(features_out,
+ vdev->config->get_features(vdev));
}
/**
@@ -326,13 +327,11 @@ int virtqueue_set_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask)
static inline
bool virtio_get_shm_region(struct virtio_device *vdev,
- struct virtio_shm_region *region, u8 id)
+ struct virtio_shm_region *region_out, u8 id)
{
- if (!region->len)
- return false;
if (!vdev->config->get_shm_region)
return false;
- return vdev->config->get_shm_region(vdev, region, id);
+ return vdev->config->get_shm_region(vdev, region_out, id);
}
static inline bool virtio_is_little_endian(struct virtio_device *vdev)
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index fdc9aeb74a44..2759dac6be44 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -220,22 +220,6 @@ int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot,
struct page **pages, unsigned int page_shift);
/*
- * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values
- * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings()
- * needs to be called.
- */
-#ifndef ARCH_PAGE_TABLE_SYNC_MASK
-#define ARCH_PAGE_TABLE_SYNC_MASK 0
-#endif
-
-/*
- * There is no default implementation for arch_sync_kernel_mappings(). It is
- * relied upon the compiler to optimize calls out if ARCH_PAGE_TABLE_SYNC_MASK
- * is 0.
- */
-void arch_sync_kernel_mappings(unsigned long start, unsigned long end);
-
-/*
* Lowlevel-APIs (not for driver use!)
*/
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 45d5dd470ff6..dabc351cc127 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -410,7 +410,7 @@ enum wq_flags {
__WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */
/* BH wq only allows the following flags */
- __WQ_BH_ALLOWS = WQ_BH | WQ_HIGHPRI,
+ __WQ_BH_ALLOWS = WQ_BH | WQ_HIGHPRI | WQ_PERCPU,
};
enum wq_consts {
@@ -434,10 +434,10 @@ enum wq_consts {
* short queue flush time. Don't queue works which can run for too
* long.
*
- * system_highpri_wq is similar to system_wq but for work items which
+ * system_highpri_wq is similar to system_percpu_wq but for work items which
* require WQ_HIGHPRI.
*
- * system_long_wq is similar to system_wq but may host long running
+ * system_long_wq is similar to system_percpu_wq but may host long running
* works. Queue flushing might take relatively long.
*
* system_dfl_wq is unbound workqueue. Workers are not bound to
@@ -445,13 +445,13 @@ enum wq_consts {
* executed immediately as long as max_active limit is not reached and
* resources are available.
*
- * system_freezable_wq is equivalent to system_wq except that it's
+ * system_freezable_wq is equivalent to system_percpu_wq except that it's
* freezable.
*
* *_power_efficient_wq are inclined towards saving power and converted
* into WQ_UNBOUND variants if 'wq_power_efficient' is enabled; otherwise,
* they are same as their non-power-efficient counterparts - e.g.
- * system_power_efficient_wq is identical to system_wq if
+ * system_power_efficient_wq is identical to system_percpu_wq if
* 'wq_power_efficient' is disabled. See WQ_POWER_EFFICIENT for more info.
*
* system_bh[_highpri]_wq are convenience interface to softirq. BH work items
@@ -502,7 +502,7 @@ void workqueue_softirq_dead(unsigned int cpu);
* min_active which is set to min(@max_active, %WQ_DFL_MIN_ACTIVE). This means
* that the sum of per-node max_active's may be larger than @max_active.
*
- * For detailed information on %WQ_* flags, please refer to
+ * For detailed information on %WQ_\* flags, please refer to
* Documentation/core-api/workqueue.rst.
*
* RETURNS:
@@ -570,7 +570,7 @@ alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active,
alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args)
#define create_workqueue(name) \
- alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name))
+ alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM | WQ_PERCPU, 1, (name))
#define create_freezable_workqueue(name) \
alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND | \
WQ_MEM_RECLAIM, 1, (name))
@@ -708,7 +708,7 @@ static inline bool mod_delayed_work(struct workqueue_struct *wq,
*/
static inline bool schedule_work_on(int cpu, struct work_struct *work)
{
- return queue_work_on(cpu, system_wq, work);
+ return queue_work_on(cpu, system_percpu_wq, work);
}
/**
@@ -727,7 +727,7 @@ static inline bool schedule_work_on(int cpu, struct work_struct *work)
*/
static inline bool schedule_work(struct work_struct *work)
{
- return queue_work(system_wq, work);
+ return queue_work(system_percpu_wq, work);
}
/**
@@ -770,21 +770,21 @@ extern void __warn_flushing_systemwide_wq(void)
#define flush_scheduled_work() \
({ \
__warn_flushing_systemwide_wq(); \
- __flush_workqueue(system_wq); \
+ __flush_workqueue(system_percpu_wq); \
})
#define flush_workqueue(wq) \
({ \
struct workqueue_struct *_wq = (wq); \
\
- if ((__builtin_constant_p(_wq == system_wq) && \
- _wq == system_wq) || \
+ if ((__builtin_constant_p(_wq == system_percpu_wq) && \
+ _wq == system_percpu_wq) || \
(__builtin_constant_p(_wq == system_highpri_wq) && \
_wq == system_highpri_wq) || \
(__builtin_constant_p(_wq == system_long_wq) && \
_wq == system_long_wq) || \
- (__builtin_constant_p(_wq == system_unbound_wq) && \
- _wq == system_unbound_wq) || \
+ (__builtin_constant_p(_wq == system_dfl_wq) && \
+ _wq == system_dfl_wq) || \
(__builtin_constant_p(_wq == system_freezable_wq) && \
_wq == system_freezable_wq) || \
(__builtin_constant_p(_wq == system_power_efficient_wq) && \
@@ -807,7 +807,7 @@ extern void __warn_flushing_systemwide_wq(void)
static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
unsigned long delay)
{
- return queue_delayed_work_on(cpu, system_wq, dwork, delay);
+ return queue_delayed_work_on(cpu, system_percpu_wq, dwork, delay);
}
/**
@@ -821,7 +821,7 @@ static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
static inline bool schedule_delayed_work(struct delayed_work *dwork,
unsigned long delay)
{
- return queue_delayed_work(system_wq, dwork, delay);
+ return queue_delayed_work(system_percpu_wq, dwork, delay);
}
#ifndef CONFIG_SMP
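A minimal usage sketch (not part of this diff; the handler and symbol names are made up) showing that the schedule_work()/schedule_delayed_work() wrappers now land on system_percpu_wq, so legacy callers keep per-CPU queueing semantics:

#include <linux/workqueue.h>
#include <linux/printk.h>
#include <linux/jiffies.h>

static void demo_work_fn(struct work_struct *work)
{
	pr_info("demo work executed\n");
}

static DECLARE_WORK(demo_work, demo_work_fn);
static DECLARE_DELAYED_WORK(demo_dwork, demo_work_fn);

static void demo_kick(void)
{
	schedule_work(&demo_work);              /* queue_work(system_percpu_wq, ...) */
	schedule_delayed_work(&demo_dwork, HZ); /* queue_delayed_work(system_percpu_wq, ...) */
}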
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index a2848d731a46..15a4bc4ab819 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -265,6 +265,8 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css);
}
+void inode_switch_wbs_work_fn(struct work_struct *work);
+
#else /* CONFIG_CGROUP_WRITEBACK */
static inline void inode_attach_wb(struct inode *inode, struct folio *folio)
diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index ada5b56a4413..e5751f3070b8 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -647,7 +647,7 @@ static inline void sco_exit(void)
#if IS_ENABLED(CONFIG_BT_LE)
int iso_init(void);
int iso_exit(void);
-bool iso_enabled(void);
+bool iso_inited(void);
#else
static inline int iso_init(void)
{
@@ -659,7 +659,7 @@ static inline int iso_exit(void)
return 0;
}
-static inline bool iso_enabled(void)
+static inline bool iso_inited(void)
{
return false;
}
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 4dc11c66f7b8..6560b32f3125 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -129,7 +129,9 @@ struct hci_conn_hash {
struct list_head list;
unsigned int acl_num;
unsigned int sco_num;
- unsigned int iso_num;
+ unsigned int cis_num;
+ unsigned int bis_num;
+ unsigned int pa_num;
unsigned int le_num;
unsigned int le_num_peripheral;
};
@@ -1014,9 +1016,13 @@ static inline void hci_conn_hash_add(struct hci_dev *hdev, struct hci_conn *c)
h->sco_num++;
break;
case CIS_LINK:
+ h->cis_num++;
+ break;
case BIS_LINK:
+ h->bis_num++;
+ break;
case PA_LINK:
- h->iso_num++;
+ h->pa_num++;
break;
}
}
@@ -1042,9 +1048,13 @@ static inline void hci_conn_hash_del(struct hci_dev *hdev, struct hci_conn *c)
h->sco_num--;
break;
case CIS_LINK:
+ h->cis_num--;
+ break;
case BIS_LINK:
+ h->bis_num--;
+ break;
case PA_LINK:
- h->iso_num--;
+ h->pa_num--;
break;
}
}
@@ -1061,9 +1071,11 @@ static inline unsigned int hci_conn_num(struct hci_dev *hdev, __u8 type)
case ESCO_LINK:
return h->sco_num;
case CIS_LINK:
+ return h->cis_num;
case BIS_LINK:
+ return h->bis_num;
case PA_LINK:
- return h->iso_num;
+ return h->pa_num;
default:
return 0;
}
@@ -1073,7 +1085,15 @@ static inline unsigned int hci_conn_count(struct hci_dev *hdev)
{
struct hci_conn_hash *c = &hdev->conn_hash;
- return c->acl_num + c->sco_num + c->le_num + c->iso_num;
+ return c->acl_num + c->sco_num + c->le_num + c->cis_num + c->bis_num +
+ c->pa_num;
+}
+
+static inline unsigned int hci_iso_count(struct hci_dev *hdev)
+{
+ struct hci_conn_hash *c = &hdev->conn_hash;
+
+ return c->cis_num + c->bis_num;
}
static inline bool hci_conn_valid(struct hci_dev *hdev, struct hci_conn *conn)
@@ -1225,6 +1245,27 @@ static inline struct hci_conn *hci_conn_hash_lookup_ba(struct hci_dev *hdev,
return NULL;
}
+static inline struct hci_conn *hci_conn_hash_lookup_role(struct hci_dev *hdev,
+ __u8 type, __u8 role,
+ bdaddr_t *ba)
+{
+ struct hci_conn_hash *h = &hdev->conn_hash;
+ struct hci_conn *c;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(c, &h->list, list) {
+ if (c->type == type && c->role == role && !bacmp(&c->dst, ba)) {
+ rcu_read_unlock();
+ return c;
+ }
+ }
+
+ rcu_read_unlock();
+
+ return NULL;
+}
+
static inline struct hci_conn *hci_conn_hash_lookup_le(struct hci_dev *hdev,
bdaddr_t *ba,
__u8 ba_type)
@@ -1915,6 +1956,8 @@ void hci_conn_del_sysfs(struct hci_conn *conn);
!hci_dev_test_flag(dev, HCI_RPA_EXPIRED))
#define adv_rpa_valid(adv) (bacmp(&adv->random_addr, BDADDR_ANY) && \
!adv->rpa_expired)
+#define le_enabled(dev) (lmp_le_capable(dev) && \
+ hci_dev_test_flag(dev, HCI_LE_ENABLED))
#define scan_1m(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_1M) || \
((dev)->le_rx_def_phys & HCI_LE_SET_PHY_1M))
@@ -1932,6 +1975,7 @@ void hci_conn_del_sysfs(struct hci_conn *conn);
((dev)->le_rx_def_phys & HCI_LE_SET_PHY_CODED))
#define ll_privacy_capable(dev) ((dev)->le_features[0] & HCI_LE_LL_PRIVACY)
+#define ll_privacy_enabled(dev) (le_enabled(dev) && ll_privacy_capable(dev))
#define privacy_mode_capable(dev) (ll_privacy_capable(dev) && \
((dev)->commands[39] & 0x04))
@@ -1981,14 +2025,23 @@ void hci_conn_del_sysfs(struct hci_conn *conn);
/* CIS Master/Slave and BIS support */
#define iso_capable(dev) (cis_capable(dev) || bis_capable(dev))
+#define iso_enabled(dev) (le_enabled(dev) && iso_capable(dev))
#define cis_capable(dev) \
(cis_central_capable(dev) || cis_peripheral_capable(dev))
+#define cis_enabled(dev) (le_enabled(dev) && cis_capable(dev))
#define cis_central_capable(dev) \
((dev)->le_features[3] & HCI_LE_CIS_CENTRAL)
+#define cis_central_enabled(dev) \
+ (le_enabled(dev) && cis_central_capable(dev))
#define cis_peripheral_capable(dev) \
((dev)->le_features[3] & HCI_LE_CIS_PERIPHERAL)
+#define cis_peripheral_enabled(dev) \
+ (le_enabled(dev) && cis_peripheral_capable(dev))
#define bis_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_BROADCASTER)
-#define sync_recv_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_SYNC_RECEIVER)
+#define bis_enabled(dev) (le_enabled(dev) && bis_capable(dev))
+#define sync_recv_capable(dev) \
+ ((dev)->le_features[3] & HCI_LE_ISO_SYNC_RECEIVER)
+#define sync_recv_enabled(dev) (le_enabled(dev) && sync_recv_capable(dev))
#define mws_transport_config_capable(dev) (((dev)->commands[30] & 0x08) && \
(!hci_test_quirk((dev), HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG)))
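An illustrative sketch (not from this diff; the function name is hypothetical) of how callers can use the new *_enabled() helpers, which fold the HCI_LE_ENABLED check into the existing capability macros:

#include <net/bluetooth/hci_core.h>

static bool demo_can_create_cis(struct hci_dev *hdev)
{
	/* replaces open-coded lmp_le_capable() + HCI_LE_ENABLED + capability checks */
	return cis_central_enabled(hdev);
}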
diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h
index 5224f57f6af2..e352a4e0ef8d 100644
--- a/include/net/bluetooth/hci_sync.h
+++ b/include/net/bluetooth/hci_sync.h
@@ -93,7 +93,7 @@ int hci_update_class_sync(struct hci_dev *hdev);
int hci_update_eir_sync(struct hci_dev *hdev);
int hci_update_class_sync(struct hci_dev *hdev);
-int hci_update_name_sync(struct hci_dev *hdev);
+int hci_update_name_sync(struct hci_dev *hdev, const u8 *name);
int hci_write_ssp_mode_sync(struct hci_dev *hdev, u8 mode);
int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h
index 2053cd8e788a..dba369a2cf27 100644
--- a/include/net/bond_3ad.h
+++ b/include/net/bond_3ad.h
@@ -307,6 +307,7 @@ int bond_3ad_lacpdu_recv(const struct sk_buff *skb, struct bonding *bond,
struct slave *slave);
int bond_3ad_set_carrier(struct bonding *bond);
void bond_3ad_update_lacp_rate(struct bonding *bond);
+void bond_3ad_update_lacp_active(struct bonding *bond);
void bond_3ad_update_ad_actor_settings(struct bonding *bond);
int bond_3ad_stats_fill(struct sk_buff *skb, struct bond_3ad_stats *stats);
size_t bond_3ad_stats_size(void);
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 93640a29427c..b32c9ceeb81d 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -78,6 +78,9 @@ struct devlink_port_pci_sf_attrs {
* @flavour: flavour of the port
* @split: indicates if this is split port
* @splittable: indicates if the port can be split.
+ * @no_phys_port_name: skip automatic phys_port_name generation; for
+ * compatibility only, newly added driver/port instances
+ * should never set this.
* @lanes: maximum number of lanes the port supports. 0 value is not passed to netlink.
* @switch_id: if the port is part of switch, this is buffer with ID, otherwise this is NULL
* @phys: physical port attributes
@@ -87,7 +90,8 @@ struct devlink_port_pci_sf_attrs {
*/
struct devlink_port_attrs {
u8 split:1,
- splittable:1;
+ splittable:1,
+ no_phys_port_name:1;
u32 lanes;
enum devlink_port_flavour flavour;
struct netdev_phys_item_id switch_id;
diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 4160731dcb6e..1fc2fb03ce3f 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -3,6 +3,7 @@
#define __NET_DST_METADATA_H 1
#include <linux/skbuff.h>
+#include <net/ip.h>
#include <net/ip_tunnels.h>
#include <net/macsec.h>
#include <net/dst.h>
@@ -220,9 +221,15 @@ static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb,
int md_size)
{
const struct iphdr *iph = ip_hdr(skb);
+ struct metadata_dst *tun_dst;
+
+ tun_dst = __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl,
+ 0, flags, tunnel_id, md_size);
- return __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl,
- 0, flags, tunnel_id, md_size);
+ if (tun_dst && (iph->frag_off & htons(IP_DF)))
+ __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT,
+ tun_dst->u.tun_info.key.tun_flags);
+ return tun_dst;
}
static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *saddr,
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index ff406ef4fd4a..29a36709e7f3 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1163,6 +1163,14 @@ static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs)
return housekeeping_cpumask(HK_TYPE_KTHREAD);
}
+static inline const struct cpumask *sysctl_est_preferred_cpulist(struct netns_ipvs *ipvs)
+{
+ if (ipvs->est_cpulist_valid)
+ return ipvs->sysctl_est_cpulist;
+ else
+ return NULL;
+}
+
static inline int sysctl_est_nice(struct netns_ipvs *ipvs)
{
return ipvs->sysctl_est_nice;
@@ -1270,6 +1278,11 @@ static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs)
return housekeeping_cpumask(HK_TYPE_KTHREAD);
}
+static inline const struct cpumask *sysctl_est_preferred_cpulist(struct netns_ipvs *ipvs)
+{
+ return NULL;
+}
+
static inline int sysctl_est_nice(struct netns_ipvs *ipvs)
{
return IPVS_EST_NICE;
diff --git a/include/net/kcm.h b/include/net/kcm.h
index 441e993be634..d9c35e71ecea 100644
--- a/include/net/kcm.h
+++ b/include/net/kcm.h
@@ -71,7 +71,6 @@ struct kcm_sock {
struct list_head wait_psock_list;
struct sk_buff *seq_skb;
struct mutex tx_mutex;
- u32 tx_stopped : 1;
/* Don't use bit fields here, these are set under different locks */
bool tx_wait;
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 025a7574b275..cb664f6e3558 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -204,7 +204,7 @@ struct net {
extern struct net init_net;
#ifdef CONFIG_NET_NS
-struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns,
+struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns,
struct net *old_net);
void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid);
@@ -218,7 +218,7 @@ extern struct task_struct *cleanup_net_task;
#else /* CONFIG_NET_NS */
#include <linux/sched.h>
#include <linux/nsproxy.h>
-static inline struct net *copy_net_ns(unsigned long flags,
+static inline struct net *copy_net_ns(u64 flags,
struct user_namespace *user_ns, struct net *old_net)
{
if (flags & CLONE_NEWNET)
@@ -262,10 +262,15 @@ void ipx_unregister_sysctl(void);
#ifdef CONFIG_NET_NS
void __put_net(struct net *net);
+static inline struct net *to_net_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct net, ns);
+}
+
/* Try using get_net_track() instead */
static inline struct net *get_net(struct net *net)
{
- refcount_inc(&net->ns.count);
+ ns_ref_inc(net);
return net;
}
@@ -276,7 +281,7 @@ static inline struct net *maybe_get_net(struct net *net)
* exists. If the reference count is zero this
* function fails and returns NULL.
*/
- if (!refcount_inc_not_zero(&net->ns.count))
+ if (!ns_ref_get(net))
net = NULL;
return net;
}
@@ -284,7 +289,7 @@ static inline struct net *maybe_get_net(struct net *net)
/* Try using put_net_track() instead */
static inline void put_net(struct net *net)
{
- if (refcount_dec_and_test(&net->ns.count))
+ if (ns_ref_put(net))
__put_net(net);
}
@@ -296,7 +301,7 @@ int net_eq(const struct net *net1, const struct net *net2)
static inline int check_net(const struct net *net)
{
- return refcount_read(&net->ns.count) != 0;
+ return ns_ref_read(net) != 0;
}
void net_drop_ns(void *);
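A usage sketch (assumption only, not part of this diff): the caller-facing get/put API is unchanged, only the backing refcount moves to the generic ns_ref_*() helpers on the common ns. The function name below is hypothetical:

#include <net/net_namespace.h>

static void demo_use_netns(struct net *net)
{
	struct net *held = maybe_get_net(net);	/* ns_ref_get(): fails once the count is zero */

	if (!held)
		return;

	/* ... use the namespace ... */

	put_net(held);				/* ns_ref_put(): __put_net() frees at zero */
}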
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 891e43a01bdc..3faa80f5d801 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1912,7 +1912,6 @@ struct nftables_pernet {
struct mutex commit_mutex;
u64 table_handle;
u64 tstamp;
- unsigned int base_seq;
unsigned int gc_seq;
u8 validate_state;
struct work_struct destroy_work;
diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index 6c2f483d9828..656e784714f3 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -109,17 +109,11 @@ nft_hash_lookup_fast(const struct net *net, const struct nft_set *set,
const struct nft_set_ext *
nft_hash_lookup(const struct net *net, const struct nft_set *set,
const u32 *key);
+#endif
+
const struct nft_set_ext *
nft_set_do_lookup(const struct net *net, const struct nft_set *set,
const u32 *key);
-#else
-static inline const struct nft_set_ext *
-nft_set_do_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key)
-{
- return set->ops->lookup(net, set, key);
-}
-#endif
/* called from nft_pipapo_avx2.c */
const struct nft_set_ext *
diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h
index cc8060c017d5..99dd166c5d07 100644
--- a/include/net/netns/nftables.h
+++ b/include/net/netns/nftables.h
@@ -3,6 +3,7 @@
#define _NETNS_NFTABLES_H_
struct netns_nftables {
+ unsigned int base_seq;
u8 gencursor;
};
diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index 431b593de709..1509a536cb85 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -265,6 +265,8 @@ struct page_pool *page_pool_create_percpu(const struct page_pool_params *params,
struct xdp_mem_info;
#ifdef CONFIG_PAGE_POOL
+void page_pool_enable_direct_recycling(struct page_pool *pool,
+ struct napi_struct *napi);
void page_pool_disable_direct_recycling(struct page_pool *pool);
void page_pool_destroy(struct page_pool *pool);
void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
diff --git a/include/net/rose.h b/include/net/rose.h
index 23267b4efcfa..2b5491bbf39a 100644
--- a/include/net/rose.h
+++ b/include/net/rose.h
@@ -8,6 +8,7 @@
#ifndef _ROSE_H
#define _ROSE_H
+#include <linux/refcount.h>
#include <linux/rose.h>
#include <net/ax25.h>
#include <net/sock.h>
@@ -96,7 +97,7 @@ struct rose_neigh {
ax25_cb *ax25;
struct net_device *dev;
unsigned short count;
- unsigned short use;
+ refcount_t use;
unsigned int number;
char restarted;
char dce_mode;
@@ -151,6 +152,21 @@ struct rose_sock {
#define rose_sk(sk) ((struct rose_sock *)(sk))
+static inline void rose_neigh_hold(struct rose_neigh *rose_neigh)
+{
+ refcount_inc(&rose_neigh->use);
+}
+
+static inline void rose_neigh_put(struct rose_neigh *rose_neigh)
+{
+ if (refcount_dec_and_test(&rose_neigh->use)) {
+ if (rose_neigh->ax25)
+ ax25_cb_put(rose_neigh->ax25);
+ kfree(rose_neigh->digipeat);
+ kfree(rose_neigh);
+ }
+}
+
/* af_rose.c */
extern ax25_address rose_callsign;
extern int sysctl_rose_restart_request_timeout;
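A sketch of the intended reference pattern (caller names are hypothetical; not part of this diff): hold a reference while something points at the neighbour, and let rose_neigh_put() free it on the last drop:

#include <net/rose.h>

static struct rose_neigh *demo_cached_neigh;

static void demo_cache_neigh(struct rose_neigh *neigh)
{
	rose_neigh_hold(neigh);			/* refcount_inc(&neigh->use) */
	demo_cached_neigh = neigh;
}

static void demo_drop_neigh(void)
{
	if (demo_cached_neigh) {
		rose_neigh_put(demo_cached_neigh);	/* kfree()s on the final reference */
		demo_cached_neigh = NULL;
	}
}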
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 638948be4c50..738cd5b13c62 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -1038,12 +1038,17 @@ static inline struct sk_buff *qdisc_dequeue_internal(struct Qdisc *sch, bool dir
skb = __skb_dequeue(&sch->gso_skb);
if (skb) {
sch->q.qlen--;
+ qdisc_qstats_backlog_dec(sch, skb);
return skb;
}
- if (direct)
- return __qdisc_dequeue_head(&sch->q);
- else
+ if (direct) {
+ skb = __qdisc_dequeue_head(&sch->q);
+ if (skb)
+ qdisc_qstats_backlog_dec(sch, skb);
+ return skb;
+ } else {
return sch->dequeue(sch);
+ }
}
static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch)
diff --git a/include/net/sock.h b/include/net/sock.h
index c8a4b283df6f..2e14283c5be1 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -285,6 +285,7 @@ struct sk_filter;
* @sk_ack_backlog: current listen backlog
* @sk_max_ack_backlog: listen backlog set in listen()
* @sk_uid: user id of owner
+ * @sk_ino: inode number (zero if orphaned)
* @sk_prefer_busy_poll: prefer busypolling over softirq processing
* @sk_busy_poll_budget: napi processing budget when busypolling
* @sk_priority: %SO_PRIORITY setting
@@ -518,6 +519,7 @@ struct sock {
u32 sk_ack_backlog;
u32 sk_max_ack_backlog;
kuid_t sk_uid;
+ unsigned long sk_ino;
spinlock_t sk_peer_lock;
int sk_bind_phc;
struct pid *sk_peer_pid;
@@ -2056,6 +2058,13 @@ static inline int sk_rx_queue_get(const struct sock *sk)
static inline void sk_set_socket(struct sock *sk, struct socket *sock)
{
sk->sk_socket = sock;
+ if (sock) {
+ WRITE_ONCE(sk->sk_uid, SOCK_INODE(sock)->i_uid);
+ WRITE_ONCE(sk->sk_ino, SOCK_INODE(sock)->i_ino);
+ } else {
+ /* Note: sk_uid is unchanged. */
+ WRITE_ONCE(sk->sk_ino, 0);
+ }
}
static inline wait_queue_head_t *sk_sleep(struct sock *sk)
@@ -2076,7 +2085,6 @@ static inline void sock_orphan(struct sock *sk)
sock_set_flag(sk, SOCK_DEAD);
sk_set_socket(sk, NULL);
sk->sk_wq = NULL;
- /* Note: sk_uid is unchanged. */
write_unlock_bh(&sk->sk_callback_lock);
}
@@ -2087,20 +2095,22 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
rcu_assign_pointer(sk->sk_wq, &parent->wq);
parent->sk = sk;
sk_set_socket(sk, parent);
- WRITE_ONCE(sk->sk_uid, SOCK_INODE(parent)->i_uid);
security_sock_graft(sk, parent);
write_unlock_bh(&sk->sk_callback_lock);
}
+static inline unsigned long sock_i_ino(const struct sock *sk)
+{
+ /* Paired with WRITE_ONCE() in sock_graft() and sock_orphan() */
+ return READ_ONCE(sk->sk_ino);
+}
+
static inline kuid_t sk_uid(const struct sock *sk)
{
/* Paired with WRITE_ONCE() in sockfs_setattr() */
return READ_ONCE(sk->sk_uid);
}
-unsigned long __sock_i_ino(struct sock *sk);
-unsigned long sock_i_ino(struct sock *sk);
-
static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
{
return sk ? sk_uid(sk) : make_kuid(net->user_ns, 0);
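A sketch (not part of this diff; the dump helper is hypothetical) of the lockless readers this enables: with sk_ino cached in struct sock, procfs/diag-style dumps can read the inode number without taking the socket lock, and it reads as 0 once the socket is orphaned:

#include <linux/seq_file.h>
#include <net/sock.h>

static void demo_dump_sock(struct seq_file *seq, const struct sock *sk)
{
	seq_printf(seq, "uid=%u ino=%lu\n",
		   from_kuid_munged(seq_user_ns(seq), sk_uid(sk)),
		   sock_i_ino(sk));	/* READ_ONCE(sk->sk_ino), no locking */
}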
diff --git a/include/pcmcia/ss.h b/include/pcmcia/ss.h
index 7cf7dbbfa131..89aed99bfeae 100644
--- a/include/pcmcia/ss.h
+++ b/include/pcmcia/ss.h
@@ -227,12 +227,8 @@ struct pcmcia_socket {
/* socket drivers must define the resource operations type they use. There
- * are three options:
+ * are two options:
* - pccard_static_ops iomem and ioport areas are assigned statically
- * - pccard_iodyn_ops iomem areas is assigned statically, ioport
- * areas dynamically
- * If this option is selected, use
- * "select PCCARD_IODYN" in Kconfig.
* - pccard_nonstatic_ops iomem and ioport areas are assigned dynamically.
* If this option is selected, use
* "select PCCARD_NONSTATIC" in Kconfig.
@@ -240,13 +236,11 @@ struct pcmcia_socket {
*/
extern struct pccard_resource_ops pccard_static_ops;
#if defined(CONFIG_PCMCIA) || defined(CONFIG_PCMCIA_MODULE)
-extern struct pccard_resource_ops pccard_iodyn_ops;
extern struct pccard_resource_ops pccard_nonstatic_ops;
#else
/* If PCMCIA is not used, but only CARDBUS, these functions are not used
* at all. Therefore, do not use the large (240K!) rsrc_nonstatic module
*/
-#define pccard_iodyn_ops pccard_static_ops
#define pccard_nonstatic_ops pccard_static_ops
#endif
diff --git a/include/rv/ltl_monitor.h b/include/rv/ltl_monitor.h
index 67031a774e3d..5368cf5fd623 100644
--- a/include/rv/ltl_monitor.h
+++ b/include/rv/ltl_monitor.h
@@ -56,7 +56,7 @@ static void ltl_task_init(struct task_struct *task, bool task_creation)
ltl_atoms_fetch(task, mon);
}
-static void handle_task_newtask(void *data, struct task_struct *task, unsigned long flags)
+static void handle_task_newtask(void *data, struct task_struct *task, u64 flags)
{
ltl_task_init(task, true);
}
diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h
index e17c4cadd04d..7c8bbe8ad1e2 100644
--- a/include/sound/cs35l56.h
+++ b/include/sound/cs35l56.h
@@ -107,8 +107,8 @@
#define CS35L56_DSP1_PMEM_5114 0x3804FE8
#define CS35L63_DSP1_FW_VER CS35L56_DSP1_FW_VER
-#define CS35L63_DSP1_HALO_STATE 0x280396C
-#define CS35L63_DSP1_PM_CUR_STATE 0x28042C8
+#define CS35L63_DSP1_HALO_STATE 0x2803C04
+#define CS35L63_DSP1_PM_CUR_STATE 0x2804518
#define CS35L63_PROTECTION_STATUS 0x340009C
#define CS35L63_TRANSDUCER_ACTUAL_PS 0x34000F4
#define CS35L63_MAIN_RENDER_USER_MUTE 0x3400020
@@ -306,6 +306,7 @@ struct cs35l56_base {
struct gpio_desc *reset_gpio;
struct cs35l56_spi_payload *spi_payload_buf;
const struct cs35l56_fw_reg *fw_reg;
+ const struct cirrus_amp_cal_controls *calibration_controls;
};
static inline bool cs35l56_is_otp_register(unsigned int reg)
diff --git a/include/sound/sdca.h b/include/sound/sdca.h
index 5a5d6de78d72..9c6a351c9d47 100644
--- a/include/sound/sdca.h
+++ b/include/sound/sdca.h
@@ -46,6 +46,7 @@ struct sdca_device_data {
enum sdca_quirk {
SDCA_QUIRKS_RT712_VB,
+ SDCA_QUIRKS_SKIP_FUNC_TYPE_PATCHING,
};
#if IS_ENABLED(CONFIG_ACPI) && IS_ENABLED(CONFIG_SND_SOC_SDCA)
diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h
index 06ec126cdcc3..ea68856e4c8c 100644
--- a/include/sound/sdca_function.h
+++ b/include/sound/sdca_function.h
@@ -1063,27 +1063,30 @@ struct sdca_entity_ge {
/**
* struct sdca_entity_hide - information specific to HIDE Entities
* @hid: HID device structure
- * @hidtx_ids: HIDTx Report ID
* @num_hidtx_ids: number of HIDTx Report IDs
- * @hidrx_ids: HIDRx Report ID
* @num_hidrx_ids: number of HIDRx Report IDs
- * @hide_reside_function_num: indicating which Audio Function Numbers within this Device
- * @max_delay: the maximum time in microseconds allowed for the Device to change the ownership from Device to Host
- * @af_number_list: which Audio Function Numbers within this Device are sending/receiving the messages in this HIDE
- * @hid_desc: HID descriptor for the HIDE Entity
+ * @hidtx_ids: HIDTx Report ID
+ * @hidrx_ids: HIDRx Report ID
+ * @af_number_list: which Audio Function Numbers within this Device are
+ * sending/receiving the messages in this HIDE
+ * @hide_reside_function_num: indicating which Audio Function Number
+ * within this Device the HIDE resides in
+ * @max_delay: the maximum time in microseconds allowed for the Device
+ * to change the ownership from Device to Host
* @hid_report_desc: HID Report Descriptor for the HIDE Entity
+ * @hid_desc: HID descriptor for the HIDE Entity
*/
struct sdca_entity_hide {
struct hid_device *hid;
unsigned int *hidtx_ids;
- int num_hidtx_ids;
unsigned int *hidrx_ids;
+ int num_hidtx_ids;
int num_hidrx_ids;
+ unsigned int af_number_list[SDCA_MAX_FUNCTION_COUNT];
unsigned int hide_reside_function_num;
unsigned int max_delay;
- unsigned int af_number_list[SDCA_MAX_FUNCTION_COUNT];
- struct hid_descriptor hid_desc;
unsigned char *hid_report_desc;
+ struct hid_descriptor hid_desc;
};
/**
diff --git a/include/sound/tas2781-tlv.h b/include/sound/tas2781-tlv.h
index ef9b9f19d212..273224df9282 100644
--- a/include/sound/tas2781-tlv.h
+++ b/include/sound/tas2781-tlv.h
@@ -2,7 +2,7 @@
//
// ALSA SoC Texas Instruments TAS2781 Audio Smart Amplifier
//
-// Copyright (C) 2022 - 2024 Texas Instruments Incorporated
+// Copyright (C) 2022 - 2025 Texas Instruments Incorporated
// https://www.ti.com
//
// The TAS2781 driver implements a flexible and configurable
@@ -15,7 +15,7 @@
#ifndef __TAS2781_TLV_H__
#define __TAS2781_TLV_H__
-static const __maybe_unused DECLARE_TLV_DB_SCALE(dvc_tlv, -10000, 50, 0);
-static const __maybe_unused DECLARE_TLV_DB_SCALE(amp_vol_tlv, 1100, 50, 0);
+static const __maybe_unused DECLARE_TLV_DB_SCALE(tas2781_dvc_tlv, -10000, 50, 0);
+static const __maybe_unused DECLARE_TLV_DB_SCALE(tas2781_amp_tlv, 1100, 50, 0);
#endif
diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h
index 7f83d242c8e9..1b3c48b5591d 100644
--- a/include/trace/events/afs.h
+++ b/include/trace/events/afs.h
@@ -69,6 +69,9 @@ enum afs_fs_operation {
yfs_FS_RemoveACL = 64171,
yfs_FS_RemoveFile2 = 64173,
yfs_FS_StoreOpaqueACL2 = 64174,
+ yfs_FS_Rename_Replace = 64176,
+ yfs_FS_Rename_NoReplace = 64177,
+ yfs_FS_Rename_Exchange = 64187,
yfs_FS_InlineBulkStatus = 64536, /* YFS Fetch multiple file statuses with errors */
yfs_FS_FetchData64 = 64537, /* YFS Fetch file data */
yfs_FS_StoreData64 = 64538, /* YFS Store file data */
@@ -300,6 +303,9 @@ enum yfs_cm_operation {
EM(yfs_FS_RemoveACL, "YFS.RemoveACL") \
EM(yfs_FS_RemoveFile2, "YFS.RemoveFile2") \
EM(yfs_FS_StoreOpaqueACL2, "YFS.StoreOpaqueACL2") \
+ EM(yfs_FS_Rename_Replace, "YFS.Rename_Replace") \
+ EM(yfs_FS_Rename_NoReplace, "YFS.Rename_NoReplace") \
+ EM(yfs_FS_Rename_Exchange, "YFS.Rename_Exchange") \
EM(yfs_FS_InlineBulkStatus, "YFS.InlineBulkStatus") \
EM(yfs_FS_FetchData64, "YFS.FetchData64") \
EM(yfs_FS_StoreData64, "YFS.StoreData64") \
diff --git a/include/trace/events/filelock.h b/include/trace/events/filelock.h
index b8d1e00a7982..370016c38a5b 100644
--- a/include/trace/events/filelock.h
+++ b/include/trace/events/filelock.h
@@ -27,7 +27,8 @@
{ FL_SLEEP, "FL_SLEEP" }, \
{ FL_DOWNGRADE_PENDING, "FL_DOWNGRADE_PENDING" }, \
{ FL_UNLOCK_PENDING, "FL_UNLOCK_PENDING" }, \
- { FL_OFDLCK, "FL_OFDLCK" })
+ { FL_OFDLCK, "FL_OFDLCK" }, \
+ { FL_RECLAIM, "FL_RECLAIM" })
#define show_fl_type(val) \
__print_symbolic(val, \
@@ -189,7 +190,7 @@ TRACE_EVENT(generic_add_lease,
__entry->i_ino = inode->i_ino;
__entry->wcount = atomic_read(&inode->i_writecount);
__entry->rcount = atomic_read(&inode->i_readcount);
- __entry->icount = atomic_read(&inode->i_count);
+ __entry->icount = icount_read(inode);
__entry->owner = fl->c.flc_owner;
__entry->flags = fl->c.flc_flags;
__entry->type = fl->c.flc_type;
diff --git a/include/trace/events/task.h b/include/trace/events/task.h
index af535b053033..4f0759634306 100644
--- a/include/trace/events/task.h
+++ b/include/trace/events/task.h
@@ -8,14 +8,14 @@
TRACE_EVENT(task_newtask,
- TP_PROTO(struct task_struct *task, unsigned long clone_flags),
+ TP_PROTO(struct task_struct *task, u64 clone_flags),
TP_ARGS(task, clone_flags),
TP_STRUCT__entry(
__field( pid_t, pid)
__array( char, comm, TASK_COMM_LEN)
- __field( unsigned long, clone_flags)
+ __field( u64, clone_flags)
__field( short, oom_score_adj)
),
@@ -26,7 +26,7 @@ TRACE_EVENT(task_newtask,
__entry->oom_score_adj = task->signal->oom_score_adj;
),
- TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%hd",
+ TP_printk("pid=%d comm=%s clone_flags=%llx oom_score_adj=%hd",
__entry->pid, __entry->comm,
__entry->clone_flags, __entry->oom_score_adj)
);
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 1e23919c0da9..c08aff044e80 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -213,6 +213,35 @@ TRACE_EVENT(inode_foreign_history,
)
);
+TRACE_EVENT(inode_switch_wbs_queue,
+
+ TP_PROTO(struct bdi_writeback *old_wb, struct bdi_writeback *new_wb,
+ unsigned int count),
+
+ TP_ARGS(old_wb, new_wb, count),
+
+ TP_STRUCT__entry(
+ __array(char, name, 32)
+ __field(ino_t, old_cgroup_ino)
+ __field(ino_t, new_cgroup_ino)
+ __field(unsigned int, count)
+ ),
+
+ TP_fast_assign(
+ strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32);
+ __entry->old_cgroup_ino = __trace_wb_assign_cgroup(old_wb);
+ __entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb);
+ __entry->count = count;
+ ),
+
+ TP_printk("bdi %s: old_cgroup_ino=%lu new_cgroup_ino=%lu count=%u",
+ __entry->name,
+ (unsigned long)__entry->old_cgroup_ino,
+ (unsigned long)__entry->new_cgroup_ino,
+ __entry->count
+ )
+);
+
TRACE_EVENT(inode_switch_wbs,
TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb,
diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index 9a4ecc9f6dc5..14a1c1fe013a 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -148,6 +148,8 @@
#define AUDIT_IPE_POLICY_LOAD 1422 /* IPE policy load */
#define AUDIT_LANDLOCK_ACCESS 1423 /* Landlock denial */
#define AUDIT_LANDLOCK_DOMAIN 1424 /* Landlock domain status */
+#define AUDIT_MAC_TASK_CONTEXTS 1425 /* Multiple LSM task contexts */
+#define AUDIT_MAC_OBJ_CONTEXTS 1426 /* Multiple LSM object contexts */
#define AUDIT_FIRST_KERN_ANOM_MSG 1700
#define AUDIT_LAST_KERN_ANOM_MSG 1799
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index f291ab4f94eb..3741ea1b73d8 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -111,6 +111,7 @@
#define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */
#define FD_PIDFS_ROOT -10002 /* Root of the pidfs filesystem */
+#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */
#define FD_INVALID -10009 /* Invalid file descriptor: -10000 - EBADF = -10009 */
/* Generic flags for the *at(2) family of syscalls. */
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 0bd678a4a10e..beb4c2d1e41c 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -430,10 +430,13 @@ typedef int __bitwise __kernel_rwf_t;
/* buffered IO that drops the cache after reading or writing data */
#define RWF_DONTCACHE ((__force __kernel_rwf_t)0x00000080)
+/* prevent pipe and socket writes from raising SIGPIPE */
+#define RWF_NOSIGNAL ((__force __kernel_rwf_t)0x00000100)
+
/* mask of flags supported by the kernel */
#define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC |\
- RWF_DONTCACHE)
+ RWF_DONTCACHE | RWF_NOSIGNAL)
#define PROCFS_IOCTL_MAGIC 'f'
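A userspace sketch (assumes a kernel with RWF_NOSIGNAL support; the fallback define below supplies the value added here if the installed headers lack it) showing the intended use: suppress SIGPIPE for one write to a pipe or socket and get -EPIPE instead:

#define _GNU_SOURCE
#include <sys/uio.h>
#include <errno.h>
#include <stdio.h>

#ifndef RWF_NOSIGNAL
#define RWF_NOSIGNAL 0x00000100
#endif

static ssize_t write_nosignal(int fd, const void *buf, size_t len)
{
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
	ssize_t ret = pwritev2(fd, &iov, 1, -1, RWF_NOSIGNAL);

	if (ret < 0 && errno == EPIPE)
		fprintf(stderr, "peer gone, no SIGPIPE delivered\n");
	return ret;
}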
diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h
index 67d015df8893..5fd5b4cf75ca 100644
--- a/include/uapi/linux/mptcp.h
+++ b/include/uapi/linux/mptcp.h
@@ -31,6 +31,8 @@
#define MPTCP_INFO_FLAG_FALLBACK _BITUL(0)
#define MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED _BITUL(1)
+#define MPTCP_PM_EV_FLAG_DENY_JOIN_ID0 _BITUL(0)
+
#define MPTCP_PM_ADDR_FLAG_SIGNAL (1 << 0)
#define MPTCP_PM_ADDR_FLAG_SUBFLOW (1 << 1)
#define MPTCP_PM_ADDR_FLAG_BACKUP (1 << 2)
diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h
index 6ac84b2f636c..7359d34da446 100644
--- a/include/uapi/linux/mptcp_pm.h
+++ b/include/uapi/linux/mptcp_pm.h
@@ -16,10 +16,10 @@
* good time to allocate memory and send ADD_ADDR if needed. Depending on the
* traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED
* is sent. Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6,
- * sport, dport, server-side.
+ * sport, dport, server-side, [flags].
* @MPTCP_EVENT_ESTABLISHED: A MPTCP connection is established (can start new
* subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6,
- * sport, dport, server-side.
+ * sport, dport, server-side, [flags].
* @MPTCP_EVENT_CLOSED: A MPTCP connection has stopped. Attribute: token.
* @MPTCP_EVENT_ANNOUNCED: A new address has been announced by the peer.
* Attributes: token, rem_id, family, daddr4 | daddr6 [, dport].
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 2beb30be2c5f..8e0eb832bc01 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1784,10 +1784,12 @@ enum nft_synproxy_attributes {
* enum nft_device_attributes - nf_tables device netlink attributes
*
* @NFTA_DEVICE_NAME: name of this device (NLA_STRING)
+ * @NFTA_DEVICE_PREFIX: device name prefix, a simple wildcard (NLA_STRING)
*/
enum nft_devices_attributes {
NFTA_DEVICE_UNSPEC,
NFTA_DEVICE_NAME,
+ NFTA_DEVICE_PREFIX,
__NFTA_DEVICE_MAX
};
#define NFTA_DEVICE_MAX (__NFTA_DEVICE_MAX - 1)
diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
index 97d8d80d139f..e098759ec917 100644
--- a/include/uapi/linux/nsfs.h
+++ b/include/uapi/linux/nsfs.h
@@ -16,8 +16,6 @@
#define NS_GET_NSTYPE _IO(NSIO, 0x3)
/* Get owner UID (in the caller's user namespace) for a user namespace */
#define NS_GET_OWNER_UID _IO(NSIO, 0x4)
-/* Get the id for a mount namespace */
-#define NS_GET_MNTNS_ID _IOR(NSIO, 0x5, __u64)
/* Translate pid from target pid namespace into the caller's pid namespace. */
#define NS_GET_PID_FROM_PIDNS _IOR(NSIO, 0x6, int)
/* Return thread-group leader id of pid in the callers pid namespace. */
@@ -42,6 +40,10 @@ struct mnt_ns_info {
/* Get previous namespace. */
#define NS_MNT_GET_PREV _IOR(NSIO, 12, struct mnt_ns_info)
+/* Retrieve namespace identifiers. */
+#define NS_GET_MNTNS_ID _IOR(NSIO, 5, __u64)
+#define NS_GET_ID _IOR(NSIO, 13, __u64)
+
enum init_ns_ino {
IPC_NS_INIT_INO = 0xEFFFFFFFU,
UTS_NS_INIT_INO = 0xEFFFFFFEU,
@@ -51,6 +53,18 @@ enum init_ns_ino {
TIME_NS_INIT_INO = 0xEFFFFFFAU,
NET_NS_INIT_INO = 0xEFFFFFF9U,
MNT_NS_INIT_INO = 0xEFFFFFF8U,
+#ifdef __KERNEL__
+ MNT_NS_ANON_INO = 0xEFFFFFF7U,
+#endif
+};
+
+struct nsfs_file_handle {
+ __u64 ns_id;
+ __u32 ns_type;
+ __u32 ns_inum;
};
+#define NSFS_FILE_HANDLE_SIZE_VER0 16 /* sizeof first published struct */
+#define NSFS_FILE_HANDLE_SIZE_LATEST sizeof(struct nsfs_file_handle) /* sizeof latest published struct */
+
#endif /* __LINUX_NSFS_H */
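A userspace sketch (assumes uapi headers that already carry the new NS_GET_ID ioctl) reading the 64-bit identifier of the caller's network namespace:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include <linux/nsfs.h>

int main(void)
{
	__u64 id;
	int fd = open("/proc/self/ns/net", O_RDONLY);

	if (fd < 0)
		return 1;
	if (ioctl(fd, NS_GET_ID, &id) == 0)
		printf("netns id: %llu\n", (unsigned long long)id);
	close(fd);
	return 0;
}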
diff --git a/include/uapi/linux/pfrut.h b/include/uapi/linux/pfrut.h
index 42fa15f8310d..b77d5c210c26 100644
--- a/include/uapi/linux/pfrut.h
+++ b/include/uapi/linux/pfrut.h
@@ -89,6 +89,7 @@ struct pfru_payload_hdr {
__u32 hw_ver;
__u32 rt_ver;
__u8 platform_id[16];
+ __u32 svn_ver;
};
enum pfru_dsm_status {
diff --git a/include/uapi/linux/psp-sfs.h b/include/uapi/linux/psp-sfs.h
new file mode 100644
index 000000000000..94e51670383c
--- /dev/null
+++ b/include/uapi/linux/psp-sfs.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */
+/*
+ * Userspace interface for AMD Seamless Firmware Servicing (SFS)
+ *
+ * Copyright (C) 2025 Advanced Micro Devices, Inc.
+ *
+ * Author: Ashish Kalra <ashish.kalra@amd.com>
+ */
+
+#ifndef __PSP_SFS_USER_H__
+#define __PSP_SFS_USER_H__
+
+#include <linux/types.h>
+
+/**
+ * SFS: AMD Seamless Firmware Servicing (SFS) interface
+ */
+
+#define PAYLOAD_NAME_SIZE 64
+#define TEE_EXT_CMD_BUFFER_SIZE 4096
+
+/**
+ * struct sfs_user_get_fw_versions - get current level of base firmware (output).
+ * @blob: current level of base firmware for ASP and patch levels (input/output).
+ * @sfs_status: 32-bit SFS status value (output).
+ * @sfs_extended_status: 32-bit SFS extended status value (output).
+ */
+struct sfs_user_get_fw_versions {
+ __u8 blob[TEE_EXT_CMD_BUFFER_SIZE];
+ __u32 sfs_status;
+ __u32 sfs_extended_status;
+} __packed;
+
+/**
+ * struct sfs_user_update_package - update SFS package (input).
+ * @payload_name: name of SFS package to load, verify and execute (input).
+ * @sfs_status: 32-bit SFS status value (output).
+ * @sfs_extended_status: 32-bit SFS extended status value (output).
+ */
+struct sfs_user_update_package {
+ char payload_name[PAYLOAD_NAME_SIZE];
+ __u32 sfs_status;
+ __u32 sfs_extended_status;
+} __packed;
+
+/**
+ * Seamless Firmware Support (SFS) IOC
+ *
+ * possible return codes for all SFS IOCTLs:
+ * 0: success
+ * -EINVAL: invalid input
+ * -E2BIG: excess data passed
+ * -EFAULT: failed to copy to/from userspace
+ * -EBUSY: mailbox in recovery or in use
+ * -ENODEV: driver not bound with PSP device
+ * -EACCES: request isn't authorized
+ * -EINVAL: invalid parameter
+ * -ETIMEDOUT: request timed out
+ * -EAGAIN: invalid request for state machine
+ * -ENOENT: not implemented
+ * -ENFILE: overflow
+ * -EPERM: invalid signature
+ * -EIO: PSP I/O error
+ */
+#define SFS_IOC_TYPE 'S'
+
+/**
+ * SFSIOCFWVERS - returns blob containing FW versions
+ * ASP provides the current level of Base Firmware for the ASP
+ * and the other microprocessors as well as current patch
+ * level(s).
+ */
+#define SFSIOCFWVERS _IOWR(SFS_IOC_TYPE, 0x1, struct sfs_user_get_fw_versions)
+
+/**
+ * SFSIOCUPDATEPKG - updates package/payload
+ * ASP loads, verifies and executes the SFS package.
+ * By default, the SFS package/payload is loaded from
+ * /lib/firmware/amd, but alternative firmware loading
+ * path can be specified using kernel parameter
+ * firmware_class.path or the firmware loading path
+ * can be customized using sysfs file:
+ * /sys/module/firmware_class/parameters/path.
+ */
+#define SFSIOCUPDATEPKG _IOWR(SFS_IOC_TYPE, 0x2, struct sfs_user_update_package)
+
+#endif /* __PSP_SFS_USER_H__ */
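A userspace sketch querying the current firmware versions via SFSIOCFWVERS. The "/dev/sfs" node name is an assumption for illustration; this header does not define the device path:

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/psp-sfs.h>

int main(void)
{
	struct sfs_user_get_fw_versions vers;
	int fd = open("/dev/sfs", O_RDWR);	/* device node name is hypothetical */

	if (fd < 0)
		return 1;
	memset(&vers, 0, sizeof(vers));
	if (ioctl(fd, SFSIOCFWVERS, &vers) == 0)
		printf("sfs_status=0x%x extended=0x%x\n",
		       vers.sfs_status, vers.sfs_extended_status);
	close(fd);
	return 0;
}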
diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h
index 18eefa6d93d6..2c3346e91dbe 100644
--- a/include/uapi/linux/ptp_clock.h
+++ b/include/uapi/linux/ptp_clock.h
@@ -37,6 +37,9 @@
/*
* flag fields valid for the new PTP_EXTTS_REQUEST2 ioctl.
+ *
+ * Note: PTP_STRICT_FLAGS is always enabled by the kernel for
+ * PTP_EXTTS_REQUEST2 regardless of whether it is set by userspace.
*/
#define PTP_EXTTS_VALID_FLAGS (PTP_ENABLE_FEATURE | \
PTP_RISING_EDGE | \
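A userspace sketch (standard PTP character device; /dev/ptp0 assumed) arming external-timestamp channel 0 on rising edges with PTP_EXTTS_REQUEST2; per the note above, the kernel applies strict flag validation for this ioctl whether or not userspace sets PTP_STRICT_FLAGS:

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ptp_clock.h>

int main(void)
{
	struct ptp_extts_request req;
	int fd = open("/dev/ptp0", O_RDWR);

	if (fd < 0)
		return 1;
	memset(&req, 0, sizeof(req));
	req.index = 0;
	req.flags = PTP_ENABLE_FEATURE | PTP_RISING_EDGE;
	if (ioctl(fd, PTP_EXTTS_REQUEST2, &req))
		perror("PTP_EXTTS_REQUEST2");
	close(fd);
	return 0;
}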
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index b13946287277..ac74133a4768 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -173,7 +173,7 @@ typedef struct mdp_superblock_s {
#else
#error unspecified endianness
#endif
- __u32 resync_offset; /* 11 resync checkpoint sector count */
+ __u32 recovery_cp; /* 11 resync checkpoint sector count */
/* These are only valid for minor_version > 90 */
__u64 reshape_position; /* 12,13 next address in array-space for reshape */
__u32 new_level; /* 14 new level we are reshaping to */
diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h
index 68a627d04afa..10ad71aa00d6 100644
--- a/include/uapi/linux/vduse.h
+++ b/include/uapi/linux/vduse.h
@@ -237,7 +237,7 @@ struct vduse_iova_umem {
* struct vduse_iova_info - information of one IOVA region
* @start: start of the IOVA region
* @last: last of the IOVA region
- * @capability: capability of the IOVA regsion
+ * @capability: capability of the IOVA region
* @reserved: for future use, needs to be initialized to zero
*
* Structure used by VDUSE_IOTLB_GET_INFO ioctl to get information of
diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index 283348b64af9..c57674a6aa0d 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -260,7 +260,7 @@
* When fork_owner is set to VHOST_FORK_OWNER_KTHREAD:
* - Vhost will create vhost workers as kernel threads.
*/
-#define VHOST_SET_FORK_FROM_OWNER _IOW(VHOST_VIRTIO, 0x83, __u8)
+#define VHOST_SET_FORK_FROM_OWNER _IOW(VHOST_VIRTIO, 0x84, __u8)
/**
* VHOST_GET_FORK_OWNER - Get the current fork_owner flag for the vhost device.
@@ -268,6 +268,6 @@
*
* @return: An 8-bit value indicating the current thread mode.
*/
-#define VHOST_GET_FORK_FROM_OWNER _IOR(VHOST_VIRTIO, 0x84, __u8)
+#define VHOST_GET_FORK_FROM_OWNER _IOR(VHOST_VIRTIO, 0x85, __u8)
#endif
diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h
index e279be353e3f..69ac6d80a006 100644
--- a/include/xen/grant_table.h
+++ b/include/xen/grant_table.h
@@ -164,7 +164,7 @@ gnttab_set_map_op(struct gnttab_map_grant_ref *map, phys_addr_t addr,
{
if (flags & GNTMAP_contains_pte)
map->host_addr = addr;
- else if (xen_feature(XENFEAT_auto_translated_physmap))
+ else if (!xen_pv_domain())
map->host_addr = __pa(addr);
else
map->host_addr = addr;
@@ -181,7 +181,7 @@ gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, phys_addr_t addr,
{
if (flags & GNTMAP_contains_pte)
unmap->host_addr = addr;
- else if (xen_feature(XENFEAT_auto_translated_physmap))
+ else if (!xen_pv_domain())
unmap->host_addr = __pa(addr);
else
unmap->host_addr = addr;
diff --git a/include/xen/mem-reservation.h b/include/xen/mem-reservation.h
index a2ab516fcd2c..3cbe3df0dfd4 100644
--- a/include/xen/mem-reservation.h
+++ b/include/xen/mem-reservation.h
@@ -39,7 +39,7 @@ static inline void xenmem_reservation_va_mapping_update(unsigned long count,
xen_pfn_t *frames)
{
#ifdef CONFIG_XEN_HAVE_PVMMU
- if (!xen_feature(XENFEAT_auto_translated_physmap))
+ if (xen_pv_domain())
__xenmem_reservation_va_mapping_update(count, pages, frames);
#endif
}
@@ -48,7 +48,7 @@ static inline void xenmem_reservation_va_mapping_reset(unsigned long count,
struct page **pages)
{
#ifdef CONFIG_XEN_HAVE_PVMMU
- if (!xen_feature(XENFEAT_auto_translated_physmap))
+ if (xen_pv_domain())
__xenmem_reservation_va_mapping_reset(count, pages);
#endif
}
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 9e2a769b0d96..496e6013c689 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -6,6 +6,7 @@
#include <linux/notifier.h>
#include <linux/efi.h>
#include <linux/virtio_anchor.h>
+#include <xen/xen.h>
#include <xen/features.h>
#include <asm/xen/interface.h>
#include <xen/interface/vcpu.h>
@@ -116,7 +117,7 @@ static inline int xen_remap_domain_gfn_array(struct vm_area_struct *vma,
unsigned int domid,
struct page **pages)
{
- if (xen_feature(XENFEAT_auto_translated_physmap))
+ if (!xen_pv_domain())
return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr,
prot, domid, pages);
@@ -150,7 +151,7 @@ static inline int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
int nr, int *err_ptr,
pgprot_t prot, unsigned int domid)
{
- if (xen_feature(XENFEAT_auto_translated_physmap))
+ if (!xen_pv_domain())
return -EOPNOTSUPP;
return xen_remap_pfn(vma, addr, mfn, nr, err_ptr, prot, domid,
@@ -175,7 +176,7 @@ static inline int xen_remap_domain_gfn_range(struct vm_area_struct *vma,
pgprot_t prot, unsigned int domid,
struct page **pages)
{
- if (xen_feature(XENFEAT_auto_translated_physmap))
+ if (!xen_pv_domain())
return -EOPNOTSUPP;
return xen_remap_pfn(vma, addr, &gfn, nr, NULL, prot, domid, false);
diff --git a/include/xen/xen.h b/include/xen/xen.h
index a1e5b3f18d69..61854e3f2837 100644
--- a/include/xen/xen.h
+++ b/include/xen/xen.h
@@ -22,8 +22,15 @@ extern bool xen_pvh;
#define xen_pvh 0
#endif
+#ifdef CONFIG_X86
+#include <asm/cpufeature.h>
+
+#define xen_pv_domain() (cpu_feature_enabled(X86_FEATURE_XENPV))
+#else
+#define xen_pv_domain() 0
+#endif
+
#define xen_domain() (xen_domain_type != XEN_NATIVE)
-#define xen_pv_domain() (xen_domain_type == XEN_PV_DOMAIN)
#define xen_hvm_domain() (xen_domain_type == XEN_HVM_DOMAIN)
#define xen_pvh_domain() (xen_pvh)
diff --git a/init/Kconfig b/init/Kconfig
index 836320251219..c2b0c9df679a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -99,7 +99,10 @@ config GCC_ASM_GOTO_OUTPUT_BROKEN
config CC_HAS_ASM_GOTO_OUTPUT
def_bool y
depends on !GCC_ASM_GOTO_OUTPUT_BROKEN
+ # Detect basic support
depends on $(success,echo 'int foo(int x) { asm goto ("": "=r"(x) ::: bar); return x; bar: return 0; }' | $(CC) -x c - -c -o /dev/null)
+ # Detect clang (< v17) scoped label issues
+ depends on $(success,echo 'void b(void **);void* c(void);int f(void){{asm goto(""::::l0);return 0;l0:return 1;}void *x __attribute__((cleanup(b)))=c();{asm goto(""::::l1);return 2;l1:return 3;}}' | $(CC) -x c - -c -o /dev/null)
config CC_HAS_ASM_GOTO_TIED_OUTPUT
depends on CC_HAS_ASM_GOTO_OUTPUT
@@ -112,15 +115,26 @@ config TOOLS_SUPPORT_RELR
config CC_HAS_ASM_INLINE
def_bool $(success,echo 'void foo(void) { asm inline (""); }' | $(CC) -x c - -c -o /dev/null)
+config CC_HAS_ASSUME
+ bool
+ # clang needs to be at least 19.1.0 since the meaning of the assume
+ # attribute changed:
+ # https://github.com/llvm/llvm-project/commit/c44fa3e8a9a44c2e9a575768a3c185354b9f6c17
+ default y if CC_IS_CLANG && CLANG_VERSION >= 190100
+ # supported since gcc 13.1.0
+ # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106654
+ default y if CC_IS_GCC && GCC_VERSION >= 130100
+
config CC_HAS_NO_PROFILE_FN_ATTR
def_bool $(success,echo '__attribute__((no_profile_instrument_function)) int x();' | $(CC) -x c - -c -o /dev/null -Werror)
config CC_HAS_COUNTED_BY
bool
- # clang needs to be at least 19.1.3 to avoid __bdos miscalculations
- # https://github.com/llvm/llvm-project/pull/110497
- # https://github.com/llvm/llvm-project/pull/112636
- default y if CC_IS_CLANG && CLANG_VERSION >= 190103
+ # clang needs to be at least 20.1.0 to avoid potential crashes
+ # when building structures that contain __counted_by
+ # https://github.com/ClangBuiltLinux/linux/issues/2114
+ # https://github.com/llvm/llvm-project/commit/160fb1121cdf703c3ef5e61fb26c5659eb581489
+ default y if CC_IS_CLANG && CLANG_VERSION >= 200100
# supported since gcc 15.1.0
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108896
default y if CC_IS_GCC && GCC_VERSION >= 150100
@@ -145,6 +159,9 @@ config RUSTC_HAS_UNNECESSARY_TRANSMUTES
config RUSTC_HAS_FILE_WITH_NUL
def_bool RUSTC_VERSION >= 108900
+config RUSTC_HAS_FILE_AS_C_STR
+ def_bool RUSTC_VERSION >= 109100
+
config PAHOLE_VERSION
int
default $(shell,$(srctree)/scripts/pahole-version.sh $(PAHOLE))
@@ -1497,6 +1514,7 @@ config BOOT_CONFIG_EMBED_FILE
config INITRAMFS_PRESERVE_MTIME
bool "Preserve cpio archive mtimes in initramfs"
+ depends on BLK_DEV_INITRD
default y
help
Each entry in an initramfs cpio archive carries an mtime value. When
@@ -2063,8 +2081,8 @@ config RUST
depends on !GCC_PLUGIN_RANDSTRUCT
depends on !RANDSTRUCT
depends on !DEBUG_INFO_BTF || (PAHOLE_HAS_LANG_EXCLUDE && !LTO)
- depends on !CFI_CLANG || HAVE_CFI_ICALL_NORMALIZE_INTEGERS_RUSTC
- select CFI_ICALL_NORMALIZE_INTEGERS if CFI_CLANG
+ depends on !CFI || HAVE_CFI_ICALL_NORMALIZE_INTEGERS_RUSTC
+ select CFI_ICALL_NORMALIZE_INTEGERS if CFI
depends on !CALL_PADDING || RUSTC_VERSION >= 108100
depends on !KASAN_SW_TAGS
depends on !(MITIGATION_RETHUNK && KASAN) || RUSTC_VERSION >= 108300
diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c
index ac021ae6e6fa..19d9f33dcacf 100644
--- a/init/do_mounts_rd.c
+++ b/init/do_mounts_rd.c
@@ -7,6 +7,7 @@
#include <uapi/linux/cramfs_fs.h>
#include <linux/initrd.h>
#include <linux/string.h>
+#include <linux/string_choices.h>
#include <linux/slab.h>
#include "do_mounts.h"
@@ -186,14 +187,12 @@ static unsigned long nr_blocks(struct file *file)
int __init rd_load_image(char *from)
{
int res = 0;
- unsigned long rd_blocks, devblocks;
+ unsigned long rd_blocks, devblocks, nr_disks;
int nblocks, i;
char *buf = NULL;
unsigned short rotate = 0;
decompress_fn decompressor = NULL;
-#if !defined(CONFIG_S390)
char rotator[4] = { '|' , '/' , '-' , '\\' };
-#endif
out_file = filp_open("/dev/ram", O_RDWR, 0);
if (IS_ERR(out_file))
@@ -244,8 +243,9 @@ int __init rd_load_image(char *from)
goto done;
}
- printk(KERN_NOTICE "RAMDISK: Loading %dKiB [%ld disk%s] into ram disk... ",
- nblocks, ((nblocks-1)/devblocks)+1, nblocks>devblocks ? "s" : "");
+ nr_disks = (nblocks - 1) / devblocks + 1;
+ pr_notice("RAMDISK: Loading %dKiB [%ld disk%s] into ram disk... ",
+ nblocks, nr_disks, str_plural(nr_disks));
for (i = 0; i < nblocks; i++) {
if (i && (i % devblocks == 0)) {
pr_cont("done disk #1.\n");
@@ -255,12 +255,10 @@ int __init rd_load_image(char *from)
}
kernel_read(in_file, buf, BLOCK_SIZE, &in_pos);
kernel_write(out_file, buf, BLOCK_SIZE, &out_pos);
-#if !defined(CONFIG_S390)
- if (!(i % 16)) {
+ if (!IS_ENABLED(CONFIG_S390) && !(i % 16)) {
pr_cont("%c\b", rotator[rotate & 0x3]);
rotate++;
}
-#endif
}
pr_cont("done.\n");
diff --git a/init/init_task.c b/init/init_task.c
index e557f622bd90..a55e2189206f 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -27,6 +27,9 @@ static struct signal_struct init_signals = {
},
.multiprocess = HLIST_HEAD_INIT,
.rlim = INIT_RLIMITS,
+#ifdef CONFIG_CGROUPS
+ .cgroup_threadgroup_rwsem = __RWSEM_INITIALIZER(init_signals.cgroup_threadgroup_rwsem),
+#endif
.cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex),
.exec_update_lock = __RWSEM_INITIALIZER(init_signals.exec_update_lock),
#ifdef CONFIG_POSIX_TIMERS
diff --git a/init/initramfs.c b/init/initramfs.c
index 097673b97784..6ddbfb17fb8f 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -19,6 +19,7 @@
#include <linux/init_syscalls.h>
#include <linux/umh.h>
#include <linux/security.h>
+#include <linux/overflow.h>
#include "do_mounts.h"
#include "initramfs_internal.h"
@@ -108,7 +109,7 @@ static char __init *find_link(int major, int minor, int ino,
q->minor = minor;
q->ino = ino;
q->mode = mode;
- strcpy(q->name, name);
+ strscpy(q->name, name);
q->next = NULL;
*p = q;
hardlink_seen = true;
@@ -152,7 +153,7 @@ static void __init dir_add(const char *name, size_t nlen, time64_t mtime)
{
struct dir_entry *de;
- de = kmalloc(sizeof(struct dir_entry) + nlen, GFP_KERNEL);
+ de = kmalloc(struct_size(de, name, nlen), GFP_KERNEL);
if (!de)
panic_show_mem("can't allocate dir_entry buffer");
INIT_LIST_HEAD(&de->list);
diff --git a/init/main.c b/init/main.c
index 0ee0ee7b7c2c..fab4f599c035 100644
--- a/init/main.c
+++ b/init/main.c
@@ -103,6 +103,7 @@
#include <linux/randomize_kstack.h>
#include <linux/pidfs.h>
#include <linux/ptdump.h>
+#include <linux/time_namespace.h>
#include <net/net_namespace.h>
#include <asm/io.h>
@@ -956,6 +957,7 @@ void start_kernel(void)
sort_main_extable();
trap_init();
mm_core_init();
+ maple_tree_init();
poking_init();
ftrace_init();
@@ -973,7 +975,6 @@ void start_kernel(void)
"Interrupts were enabled *very* early, fixing it\n"))
local_irq_disable();
radix_tree_init();
- maple_tree_init();
/*
* Set up housekeeping before setting up workqueues to allow the unbound
@@ -1072,6 +1073,7 @@ void start_kernel(void)
fork_init();
proc_caches_init();
uts_ns_init();
+ time_ns_init();
key_init();
security_init();
dbg_late_init();
diff --git a/init/version-timestamp.c b/init/version-timestamp.c
index 043cbf80a766..d071835121c2 100644
--- a/init/version-timestamp.c
+++ b/init/version-timestamp.c
@@ -8,7 +8,8 @@
#include <linux/utsname.h>
struct uts_namespace init_uts_ns = {
- .ns.count = REFCOUNT_INIT(2),
+ .ns.ns_type = ns_common_type(&init_uts_ns),
+ .ns.__ns_ref = REFCOUNT_INIT(2),
.name = {
.sysname = UTS_SYSNAME,
.nodename = UTS_NODENAME,
@@ -18,7 +19,7 @@ struct uts_namespace init_uts_ns = {
.domainname = UTS_DOMAINNAME,
},
.user_ns = &init_user_ns,
- .ns.inum = PROC_UTS_INIT_INO,
+ .ns.inum = ns_init_inum(&init_uts_ns),
#ifdef CONFIG_UTS_NS
.ns.ops = &utsns_operations,
#endif
diff --git a/io_uring/futex.c b/io_uring/futex.c
index 692462d50c8c..9113a44984f3 100644
--- a/io_uring/futex.c
+++ b/io_uring/futex.c
@@ -288,6 +288,7 @@ int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags)
goto done_unlock;
}
+ req->flags |= REQ_F_ASYNC_DATA;
req->async_data = ifd;
ifd->q = futex_q_init;
ifd->q.bitset = iof->futex_mask;
@@ -309,6 +310,8 @@ done:
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
+ req->async_data = NULL;
+ req->flags &= ~REQ_F_ASYNC_DATA;
kfree(ifd);
return IOU_COMPLETE;
}
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index be91edf34f01..1d03b2fc4b25 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -352,11 +352,18 @@ static void create_worker_cb(struct callback_head *cb)
struct io_wq *wq;
struct io_wq_acct *acct;
- bool do_create = false;
+ bool activated_free_worker, do_create = false;
worker = container_of(cb, struct io_worker, create_work);
wq = worker->wq;
acct = worker->acct;
+
+ rcu_read_lock();
+ activated_free_worker = io_acct_activate_free_worker(acct);
+ rcu_read_unlock();
+ if (activated_free_worker)
+ goto no_need_create;
+
raw_spin_lock(&acct->workers_lock);
if (acct->nr_workers < acct->max_workers) {
@@ -367,6 +374,7 @@ static void create_worker_cb(struct callback_head *cb)
if (do_create) {
create_io_worker(wq, acct);
} else {
+no_need_create:
atomic_dec(&acct->nr_running);
io_worker_ref_put(wq);
}
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 4ef69dd58734..93665cebe9bd 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -290,7 +290,6 @@ static void io_free_alloc_caches(struct io_ring_ctx *ctx)
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
io_alloc_cache_free(&ctx->cmd_cache, io_cmd_cache_free);
- io_alloc_cache_free(&ctx->msg_cache, kfree);
io_futex_cache_free(ctx);
io_rsrc_cache_free(ctx);
}
@@ -337,9 +336,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
ret |= io_alloc_cache_init(&ctx->cmd_cache, IO_ALLOC_CACHE_MAX,
sizeof(struct io_async_cmd),
sizeof(struct io_async_cmd));
- spin_lock_init(&ctx->msg_lock);
- ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX,
- sizeof(struct io_kiocb), 0);
ret |= io_futex_cache_init(ctx);
ret |= io_rsrc_cache_init(ctx);
if (ret)
@@ -1406,8 +1402,10 @@ static void io_req_task_cancel(struct io_kiocb *req, io_tw_token_t tw)
void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw)
{
- io_tw_lock(req->ctx, tw);
- if (unlikely(io_should_terminate_tw()))
+ struct io_ring_ctx *ctx = req->ctx;
+
+ io_tw_lock(ctx, tw);
+ if (unlikely(io_should_terminate_tw(ctx)))
io_req_defer_failed(req, -EFAULT);
else if (req->flags & REQ_F_FORCE_ASYNC)
io_queue_iowq(req);
@@ -2119,6 +2117,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->file = NULL;
req->tctx = current->io_uring;
req->cancel_seq_set = false;
+ req->async_data = NULL;
if (unlikely(opcode >= IORING_OP_LAST)) {
req->opcode = 0;
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index abc6de227f74..1880902be6fd 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -476,9 +476,9 @@ static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx)
* 2) PF_KTHREAD is set, in which case the invoker of the task_work is
* our fallback task_work.
*/
-static inline bool io_should_terminate_tw(void)
+static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx)
{
- return current->flags & (PF_KTHREAD | PF_EXITING);
+ return (current->flags & (PF_KTHREAD | PF_EXITING)) || percpu_ref_is_dying(&ctx->refs);
}
static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res)
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index f2d2cc319faa..19a8bde5e1e1 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -36,15 +36,19 @@ static bool io_kbuf_inc_commit(struct io_buffer_list *bl, int len)
{
while (len) {
struct io_uring_buf *buf;
- u32 this_len;
+ u32 buf_len, this_len;
buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask);
- this_len = min_t(int, len, buf->len);
- buf->len -= this_len;
- if (buf->len) {
+ buf_len = READ_ONCE(buf->len);
+ this_len = min_t(u32, len, buf_len);
+ buf_len -= this_len;
+ /* Stop looping for invalid buffer length of 0 */
+ if (buf_len || !this_len) {
buf->addr += this_len;
+ buf->len = buf_len;
return false;
}
+ buf->len = 0;
bl->head++;
len -= this_len;
}
@@ -159,6 +163,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
__u16 tail, head = bl->head;
struct io_uring_buf *buf;
void __user *ret;
+ u32 buf_len;
tail = smp_load_acquire(&br->tail);
if (unlikely(tail == head))
@@ -168,8 +173,9 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
req->flags |= REQ_F_BL_EMPTY;
buf = io_ring_head_to_buf(br, head, bl->mask);
- if (*len == 0 || *len > buf->len)
- *len = buf->len;
+ buf_len = READ_ONCE(buf->len);
+ if (*len == 0 || *len > buf_len)
+ *len = buf_len;
req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT;
req->buf_list = bl;
req->buf_index = buf->bid;
@@ -265,7 +271,7 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
req->buf_index = buf->bid;
do {
- u32 len = buf->len;
+ u32 len = READ_ONCE(buf->len);
/* truncate end piece, if needed, for non partial buffers */
if (len > arg->max_len) {
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index 4c2578f2efcb..5e5b94236d72 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -11,7 +11,6 @@
#include "io_uring.h"
#include "rsrc.h"
#include "filetable.h"
-#include "alloc_cache.h"
#include "msg_ring.h"
/* All valid masks for MSG_RING */
@@ -76,13 +75,7 @@ static void io_msg_tw_complete(struct io_kiocb *req, io_tw_token_t tw)
struct io_ring_ctx *ctx = req->ctx;
io_add_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags);
- if (spin_trylock(&ctx->msg_lock)) {
- if (io_alloc_cache_put(&ctx->msg_cache, req))
- req = NULL;
- spin_unlock(&ctx->msg_lock);
- }
- if (req)
- kfree_rcu(req, rcu_head);
+ kfree_rcu(req, rcu_head);
percpu_ref_put(&ctx->refs);
}
@@ -104,26 +97,13 @@ static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req,
return 0;
}
-static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx)
-{
- struct io_kiocb *req = NULL;
-
- if (spin_trylock(&ctx->msg_lock)) {
- req = io_alloc_cache_get(&ctx->msg_cache);
- spin_unlock(&ctx->msg_lock);
- if (req)
- return req;
- }
- return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
-}
-
static int io_msg_data_remote(struct io_ring_ctx *target_ctx,
struct io_msg *msg)
{
struct io_kiocb *target;
u32 flags = 0;
- target = io_msg_get_kiocb(target_ctx);
+ target = kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
if (unlikely(!target))
return -ENOMEM;
diff --git a/io_uring/net.c b/io_uring/net.c
index dd96e355982f..d69f2afa4f7a 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -494,6 +494,15 @@ static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret)
return nbufs;
}
+static int io_net_kbuf_recyle(struct io_kiocb *req,
+ struct io_async_msghdr *kmsg, int len)
+{
+ req->flags |= REQ_F_BL_NO_RECYCLE;
+ if (req->flags & REQ_F_BUFFERS_COMMIT)
+ io_kbuf_commit(req, req->buf_list, len, io_bundle_nbufs(kmsg, len));
+ return IOU_RETRY;
+}
+
static inline bool io_send_finish(struct io_kiocb *req, int *ret,
struct io_async_msghdr *kmsg,
unsigned issue_flags)
@@ -562,8 +571,7 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
kmsg->msg.msg_controllen = 0;
kmsg->msg.msg_control = NULL;
sr->done_io += ret;
- req->flags |= REQ_F_BL_NO_RECYCLE;
- return -EAGAIN;
+ return io_net_kbuf_recyle(req, kmsg, ret);
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
@@ -674,8 +682,7 @@ retry_bundle:
sr->len -= ret;
sr->buf += ret;
sr->done_io += ret;
- req->flags |= REQ_F_BL_NO_RECYCLE;
- return -EAGAIN;
+ return io_net_kbuf_recyle(req, kmsg, ret);
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
@@ -1071,8 +1078,7 @@ retry_multishot:
}
if (ret > 0 && io_net_retry(sock, flags)) {
sr->done_io += ret;
- req->flags |= REQ_F_BL_NO_RECYCLE;
- return IOU_RETRY;
+ return io_net_kbuf_recyle(req, kmsg, ret);
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
@@ -1218,8 +1224,7 @@ retry_multishot:
sr->len -= ret;
sr->buf += ret;
sr->done_io += ret;
- req->flags |= REQ_F_BL_NO_RECYCLE;
- return -EAGAIN;
+ return io_net_kbuf_recyle(req, kmsg, ret);
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
@@ -1500,8 +1505,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
zc->len -= ret;
zc->buf += ret;
zc->done_io += ret;
- req->flags |= REQ_F_BL_NO_RECYCLE;
- return -EAGAIN;
+ return io_net_kbuf_recyle(req, kmsg, ret);
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
@@ -1571,8 +1575,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
if (ret > 0 && io_net_retry(sock, flags)) {
sr->done_io += ret;
- req->flags |= REQ_F_BL_NO_RECYCLE;
- return -EAGAIN;
+ return io_net_kbuf_recyle(req, kmsg, ret);
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
diff --git a/io_uring/notif.c b/io_uring/notif.c
index 9a6f6e92d742..ea9c0116cec2 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -85,7 +85,7 @@ static int io_link_skb(struct sk_buff *skb, struct ubuf_info *uarg)
return -EEXIST;
prev_nd = container_of(prev_uarg, struct io_notif_data, uarg);
- prev_notif = cmd_to_io_kiocb(nd);
+ prev_notif = cmd_to_io_kiocb(prev_nd);
/* make sure all notifications can be finished in the same task_work */
if (unlikely(notif->ctx != prev_notif->ctx ||
diff --git a/io_uring/poll.c b/io_uring/poll.c
index c786e587563b..6090a26975d4 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -224,7 +224,7 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw)
{
int v;
- if (unlikely(io_should_terminate_tw()))
+ if (unlikely(io_should_terminate_tw(req->ctx)))
return -ECANCELED;
do {
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 52a5b950b2e5..af5a54b5db12 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -886,6 +886,9 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
if (req->flags & REQ_F_HAS_METADATA) {
struct io_async_rw *io = req->async_data;
+ if (!(file->f_mode & FMODE_HAS_METADATA))
+ return -EINVAL;
+
/*
* We have a union of meta fields with wpq used for buffered-io
* in io_async_rw, so fail it here.
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index 7f13bfa9f2b6..17e3aab0af36 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -324,7 +324,7 @@ static void io_req_task_link_timeout(struct io_kiocb *req, io_tw_token_t tw)
int ret;
if (prev) {
- if (!io_should_terminate_tw()) {
+ if (!io_should_terminate_tw(req->ctx)) {
struct io_cancel_data cd = {
.ctx = req->ctx,
.data = prev->cqe.user_data,
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 053bac89b6c0..213716e10d70 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -118,7 +118,7 @@ static void io_uring_cmd_work(struct io_kiocb *req, io_tw_token_t tw)
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
unsigned int flags = IO_URING_F_COMPLETE_DEFER;
- if (io_should_terminate_tw())
+ if (io_should_terminate_tw(req->ctx))
flags |= IO_URING_F_TASK_DEAD;
/* task_work executor checks the deffered list completion */
diff --git a/ipc/msgutil.c b/ipc/msgutil.c
index c7be0c792647..7a03f6d03de3 100644
--- a/ipc/msgutil.c
+++ b/ipc/msgutil.c
@@ -15,6 +15,7 @@
#include <linux/proc_ns.h>
#include <linux/uaccess.h>
#include <linux/sched.h>
+#include <linux/nstree.h>
#include "util.h"
@@ -26,12 +27,13 @@ DEFINE_SPINLOCK(mq_lock);
* and not CONFIG_IPC_NS.
*/
struct ipc_namespace init_ipc_ns = {
- .ns.count = REFCOUNT_INIT(1),
+ .ns.__ns_ref = REFCOUNT_INIT(1),
.user_ns = &init_user_ns,
- .ns.inum = PROC_IPC_INIT_INO,
+ .ns.inum = ns_init_inum(&init_ipc_ns),
#ifdef CONFIG_IPC_NS
.ns.ops = &ipcns_operations,
#endif
+ .ns.ns_type = ns_common_type(&init_ipc_ns),
};
struct msg_msgseg {
diff --git a/ipc/namespace.c b/ipc/namespace.c
index 4df91ceeeafe..59b12fcb40bd 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -15,6 +15,7 @@
#include <linux/mount.h>
#include <linux/user_namespace.h>
#include <linux/proc_ns.h>
+#include <linux/nstree.h>
#include <linux/sched/task.h>
#include "util.h"
@@ -61,12 +62,10 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
if (ns == NULL)
goto fail_dec;
- err = ns_alloc_inum(&ns->ns);
+ err = ns_common_init(ns);
if (err)
goto fail_free;
- ns->ns.ops = &ipcns_operations;
- refcount_set(&ns->ns.count, 1);
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
@@ -87,6 +86,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
sem_init_ns(ns);
shm_init_ns(ns);
+ ns_tree_add(ns);
return ns;
@@ -97,7 +97,7 @@ fail_mq:
fail_put:
put_user_ns(ns->user_ns);
- ns_free_inum(&ns->ns);
+ ns_common_free(ns);
fail_free:
kfree(ns);
fail_dec:
@@ -106,7 +106,7 @@ fail:
return ERR_PTR(err);
}
-struct ipc_namespace *copy_ipcs(unsigned long flags,
+struct ipc_namespace *copy_ipcs(u64 flags,
struct user_namespace *user_ns, struct ipc_namespace *ns)
{
if (!(flags & CLONE_NEWIPC))
@@ -161,7 +161,7 @@ static void free_ipc_ns(struct ipc_namespace *ns)
dec_ipc_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
- ns_free_inum(&ns->ns);
+ ns_common_free(ns);
kfree(ns);
}
@@ -199,20 +199,16 @@ static void free_ipc(struct work_struct *unused)
*/
void put_ipc_ns(struct ipc_namespace *ns)
{
- if (refcount_dec_and_lock(&ns->ns.count, &mq_lock)) {
+ if (ns_ref_put_and_lock(ns, &mq_lock)) {
mq_clear_sbinfo(ns);
spin_unlock(&mq_lock);
+ ns_tree_remove(ns);
if (llist_add(&ns->mnt_llist, &free_ipc_list))
schedule_work(&free_ipc_work);
}
}
-static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns)
-{
- return container_of(ns, struct ipc_namespace, ns);
-}
-
static struct ns_common *ipcns_get(struct task_struct *task)
{
struct ipc_namespace *ns = NULL;
@@ -252,7 +248,6 @@ static struct user_namespace *ipcns_owner(struct ns_common *ns)
const struct proc_ns_operations ipcns_operations = {
.name = "ipc",
- .type = CLONE_NEWIPC,
.get = ipcns_get,
.put = ipcns_put,
.install = ipcns_install,
diff --git a/ipc/sem.c b/ipc/sem.c
index a39cdc7bf88f..0f06e4bd4673 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -2303,7 +2303,7 @@ SYSCALL_DEFINE3(semop, int, semid, struct sembuf __user *, tsops,
* parent and child tasks.
*/
-int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
+int copy_semundo(u64 clone_flags, struct task_struct *tsk)
{
struct sem_undo_list *undo_list;
int error;
diff --git a/ipc/shm.c b/ipc/shm.c
index a9310b6dbbc3..3db36773dd10 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -45,6 +45,7 @@
#include <linux/mount.h>
#include <linux/ipc_namespace.h>
#include <linux/rhashtable.h>
+#include <linux/nstree.h>
#include <linux/uaccess.h>
@@ -148,6 +149,7 @@ void shm_exit_ns(struct ipc_namespace *ns)
static int __init ipc_ns_init(void)
{
shm_init_ns(&init_ipc_ns);
+ ns_tree_add(&init_ipc_ns);
return 0;
}
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 2ee603a98813..1224dd937df0 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -97,6 +97,7 @@ config KEXEC_JUMP
config KEXEC_HANDOVER
bool "kexec handover"
depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE
+ depends on !DEFERRED_STRUCT_PAGE_INIT
select MEMBLOCK_KHO_SCRATCH
select KEXEC_FILE
select DEBUG_FS
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 54ea59ff8fbe..da326800c1c9 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -103,6 +103,19 @@ config PREEMPT_RT
Select this if you are building a kernel for systems which
require real-time guarantees.
+config PREEMPT_RT_NEEDS_BH_LOCK
+ bool "Enforce softirq synchronisation on PREEMPT_RT"
+ depends on PREEMPT_RT
+ help
+ Enforce synchronisation across the softirq context. On PREEMPT_RT
+ the softirq is preemptible. This enforces the same per-CPU BKL
+ semantic that non-PREEMPT_RT builds have. This should not be needed
+ because per-CPU locks were added to avoid the per-CPU BKL.
+
+ This switch provides the old behaviour for testing reasons. Select
+ this if you suspect an error with preemptible softirq and want to
+ test the old synchronized behaviour.
+
config PREEMPT_COUNT
bool
diff --git a/kernel/Makefile b/kernel/Makefile
index c60623448235..41751834e764 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y = fork.o exec_domain.o panic.o \
sysctl.o capability.o ptrace.o user.o \
signal.o sys.o umh.o workqueue.o pid.o task_work.o \
extable.o params.o \
- kthread.o sys_ni.o nsproxy.o \
+ kthread.o sys_ni.o nsproxy.o nstree.o nscommon.o \
notifier.o ksysfs.o cred.o reboot.o \
async.o range.o smpboot.o ucount.o regset.o ksyms_common.o
@@ -122,7 +122,7 @@ obj-$(CONFIG_KCSAN) += kcsan/
obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o
obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o
-obj-$(CONFIG_CFI_CLANG) += cfi.o
+obj-$(CONFIG_CFI) += cfi.o
obj-$(CONFIG_PERF_EVENTS) += events/
diff --git a/kernel/audit.c b/kernel/audit.c
index 61b5744d0bb6..26a332ffb1b8 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -54,6 +54,7 @@
#include <net/netlink.h>
#include <linux/skbuff.h>
#include <linux/security.h>
+#include <linux/lsm_hooks.h>
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
#include <net/netns/generic.h>
@@ -81,6 +82,13 @@ static u32 audit_failure = AUDIT_FAIL_PRINTK;
/* private audit network namespace index */
static unsigned int audit_net_id;
+/* Number of security modules that provide a security context,
+ * and the lists of LSMs that provide a security context */
+static u32 audit_subj_secctx_cnt;
+static u32 audit_obj_secctx_cnt;
+static const struct lsm_id *audit_subj_lsms[MAX_LSM_COUNT];
+static const struct lsm_id *audit_obj_lsms[MAX_LSM_COUNT];
+
/**
* struct audit_net - audit private network namespace data
* @sk: communication socket
@@ -195,8 +203,10 @@ static struct audit_ctl_mutex {
* to place it on a transmit queue. Multiple audit_buffers can be in
* use simultaneously. */
struct audit_buffer {
- struct sk_buff *skb; /* formatted skb ready to send */
+ struct sk_buff *skb; /* the skb for audit_log functions */
+ struct sk_buff_head skb_list; /* formatted skbs, ready to send */
struct audit_context *ctx; /* NULL or associated context */
+ struct audit_stamp stamp; /* audit stamp for these records */
gfp_t gfp_mask;
};
@@ -279,6 +289,33 @@ static pid_t auditd_pid_vnr(void)
}
/**
+ * audit_cfg_lsm - Identify a security module as providing a secctx.
+ * @lsmid: LSM identity
+ * @flags: which contexts are provided
+ *
+ * Description:
+ * Increments the count of the security modules providing a secctx.
+ * If the LSM id is already in the list leave it alone.
+ */
+void audit_cfg_lsm(const struct lsm_id *lsmid, int flags)
+{
+ int i;
+
+ if (flags & AUDIT_CFG_LSM_SECCTX_SUBJECT) {
+ for (i = 0 ; i < audit_subj_secctx_cnt; i++)
+ if (audit_subj_lsms[i] == lsmid)
+ return;
+ audit_subj_lsms[audit_subj_secctx_cnt++] = lsmid;
+ }
+ if (flags & AUDIT_CFG_LSM_SECCTX_OBJECT) {
+ for (i = 0 ; i < audit_obj_secctx_cnt; i++)
+ if (audit_obj_lsms[i] == lsmid)
+ return;
+ audit_obj_lsms[audit_obj_secctx_cnt++] = lsmid;
+ }
+}
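A hypothetical registration call, shown only to illustrate how the flags combine (the actual call sites would live in the LSMs themselves, outside this hunk): an LSM that can render both subject and object contexts registers once at initialisation.

/* hypothetical call site, not part of this hunk */
audit_cfg_lsm(&selinux_lsmid,
	      AUDIT_CFG_LSM_SECCTX_SUBJECT | AUDIT_CFG_LSM_SECCTX_OBJECT);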
+
+/**
* audit_get_sk - Return the audit socket for the given network namespace
* @net: the destination network namespace
*
@@ -1113,7 +1150,6 @@ static int is_audit_feature_set(int i)
return af.features & AUDIT_FEATURE_TO_MASK(i);
}
-
static int audit_get_feature(struct sk_buff *skb)
{
u32 seq;
@@ -1473,7 +1509,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
case AUDIT_SIGNAL_INFO:
if (lsmprop_is_set(&audit_sig_lsm)) {
err = security_lsmprop_to_secctx(&audit_sig_lsm,
- &lsmctx);
+ &lsmctx, LSM_ID_UNDEF);
if (err < 0)
return err;
}
@@ -1776,10 +1812,13 @@ __setup("audit_backlog_limit=", audit_backlog_limit_set);
static void audit_buffer_free(struct audit_buffer *ab)
{
+ struct sk_buff *skb;
+
if (!ab)
return;
- kfree_skb(ab->skb);
+ while ((skb = skb_dequeue(&ab->skb_list)))
+ kfree_skb(skb);
kmem_cache_free(audit_buffer_cache, ab);
}
@@ -1792,9 +1831,14 @@ static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx,
if (!ab)
return NULL;
+ skb_queue_head_init(&ab->skb_list);
+
ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
if (!ab->skb)
goto err;
+
+ skb_queue_tail(&ab->skb_list, ab->skb);
+
if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0))
goto err;
@@ -1833,11 +1877,11 @@ unsigned int audit_serial(void)
}
static inline void audit_get_stamp(struct audit_context *ctx,
- struct timespec64 *t, unsigned int *serial)
+ struct audit_stamp *stamp)
{
- if (!ctx || !auditsc_get_stamp(ctx, t, serial)) {
- ktime_get_coarse_real_ts64(t);
- *serial = audit_serial();
+ if (!ctx || !auditsc_get_stamp(ctx, stamp)) {
+ ktime_get_coarse_real_ts64(&stamp->ctime);
+ stamp->serial = audit_serial();
}
}
@@ -1860,8 +1904,6 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
int type)
{
struct audit_buffer *ab;
- struct timespec64 t;
- unsigned int serial;
if (audit_initialized != AUDIT_INITIALIZED)
return NULL;
@@ -1916,12 +1958,14 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
return NULL;
}
- audit_get_stamp(ab->ctx, &t, &serial);
+ audit_get_stamp(ab->ctx, &ab->stamp);
/* cancel dummy context to enable supporting records */
if (ctx)
ctx->dummy = 0;
audit_log_format(ab, "audit(%llu.%03lu:%u): ",
- (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial);
+ (unsigned long long)ab->stamp.ctime.tv_sec,
+ ab->stamp.ctime.tv_nsec/1000000,
+ ab->stamp.serial);
return ab;
}
@@ -2177,33 +2221,179 @@ void audit_log_key(struct audit_buffer *ab, char *key)
audit_log_format(ab, "(null)");
}
-int audit_log_task_context(struct audit_buffer *ab)
+/**
+ * audit_buffer_aux_new - Add an aux record buffer to the skb list
+ * @ab: audit_buffer
+ * @type: message type
+ *
+ * Aux records are allocated and added to the skb list of
+ * the "main" record. The ab->skb is reset to point to the
+ * aux record on its creation. When the aux record is complete,
+ * ab->skb has to be reset to point to the "main" record.
+ * This allows the audit_log_ functions to be ignorant of
+ * which kind of record they are logging to. It also avoids adding
+ * special data for aux records.
+ *
+ * On success ab->skb will point to the new aux record.
+ * Returns 0 on success, -ENOMEM should allocation fail.
+ */
+static int audit_buffer_aux_new(struct audit_buffer *ab, int type)
+{
+ WARN_ON(ab->skb != skb_peek(&ab->skb_list));
+
+ ab->skb = nlmsg_new(AUDIT_BUFSIZ, ab->gfp_mask);
+ if (!ab->skb)
+ goto err;
+ if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0))
+ goto err;
+ skb_queue_tail(&ab->skb_list, ab->skb);
+
+ audit_log_format(ab, "audit(%llu.%03lu:%u): ",
+ (unsigned long long)ab->stamp.ctime.tv_sec,
+ ab->stamp.ctime.tv_nsec/1000000,
+ ab->stamp.serial);
+
+ return 0;
+
+err:
+ kfree_skb(ab->skb);
+ ab->skb = skb_peek(&ab->skb_list);
+ return -ENOMEM;
+}
+
+/**
+ * audit_buffer_aux_end - Switch back to the "main" record from an aux record
+ * @ab: audit_buffer
+ *
+ * Restores the "main" audit record to ab->skb.
+ */
+static void audit_buffer_aux_end(struct audit_buffer *ab)
+{
+ ab->skb = skb_peek(&ab->skb_list);
+}
+
+/**
+ * audit_log_subj_ctx - Add LSM subject information
+ * @ab: audit_buffer
+ * @prop: LSM subject properties.
+ *
+ * Add a subj= field and, if necessary, an AUDIT_MAC_TASK_CONTEXTS record.
+ */
+int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop)
{
- struct lsm_prop prop;
struct lsm_context ctx;
+ char *space = "";
int error;
+ int i;
- security_current_getlsmprop_subj(&prop);
- if (!lsmprop_is_set(&prop))
+ security_current_getlsmprop_subj(prop);
+ if (!lsmprop_is_set(prop))
return 0;
- error = security_lsmprop_to_secctx(&prop, &ctx);
- if (error < 0) {
- if (error != -EINVAL)
- goto error_path;
+ if (audit_subj_secctx_cnt < 2) {
+ error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF);
+ if (error < 0) {
+ if (error != -EINVAL)
+ goto error_path;
+ return 0;
+ }
+ audit_log_format(ab, " subj=%s", ctx.context);
+ security_release_secctx(&ctx);
return 0;
}
-
- audit_log_format(ab, " subj=%s", ctx.context);
- security_release_secctx(&ctx);
+ /* Multiple LSMs provide contexts. Include an aux record. */
+ audit_log_format(ab, " subj=?");
+ error = audit_buffer_aux_new(ab, AUDIT_MAC_TASK_CONTEXTS);
+ if (error)
+ goto error_path;
+
+ for (i = 0; i < audit_subj_secctx_cnt; i++) {
+ error = security_lsmprop_to_secctx(prop, &ctx,
+ audit_subj_lsms[i]->id);
+ if (error < 0) {
+ /*
+ * Don't print anything. An LSM like BPF could
+ * claim to support contexts, but only do so under
+ * certain conditions.
+ */
+ if (error == -EOPNOTSUPP)
+ continue;
+ if (error != -EINVAL)
+ audit_panic("error in audit_log_subj_ctx");
+ } else {
+ audit_log_format(ab, "%ssubj_%s=%s", space,
+ audit_subj_lsms[i]->name, ctx.context);
+ space = " ";
+ security_release_secctx(&ctx);
+ }
+ }
+ audit_buffer_aux_end(ab);
return 0;
error_path:
- audit_panic("error in audit_log_task_context");
+ audit_panic("error in audit_log_subj_ctx");
return error;
}
+EXPORT_SYMBOL(audit_log_subj_ctx);
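Judging from the format strings above, the emitted records look roughly like this (illustrative, made-up values): with a single context-providing LSM the subject stays inline in the main record, while with two or more the main record carries subj=? and the per-LSM values land in the auxiliary record.

type=SYSCALL ... subj=unconfined_u:unconfined_r:unconfined_t:s0 ...
type=MAC_TASK_CONTEXTS ... subj_selinux=unconfined_u:unconfined_r:unconfined_t:s0 subj_apparmor=unconfined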
+
+int audit_log_task_context(struct audit_buffer *ab)
+{
+ struct lsm_prop prop;
+
+ security_current_getlsmprop_subj(&prop);
+ return audit_log_subj_ctx(ab, &prop);
+}
EXPORT_SYMBOL(audit_log_task_context);
+int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop)
+{
+ int i;
+ int rc;
+ int error = 0;
+ char *space = "";
+ struct lsm_context ctx;
+
+ if (audit_obj_secctx_cnt < 2) {
+ error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF);
+ if (error < 0) {
+ if (error != -EINVAL)
+ goto error_path;
+ return error;
+ }
+ audit_log_format(ab, " obj=%s", ctx.context);
+ security_release_secctx(&ctx);
+ return 0;
+ }
+ audit_log_format(ab, " obj=?");
+ error = audit_buffer_aux_new(ab, AUDIT_MAC_OBJ_CONTEXTS);
+ if (error)
+ goto error_path;
+
+ for (i = 0; i < audit_obj_secctx_cnt; i++) {
+ rc = security_lsmprop_to_secctx(prop, &ctx,
+ audit_obj_lsms[i]->id);
+ if (rc < 0) {
+ audit_log_format(ab, "%sobj_%s=?", space,
+ audit_obj_lsms[i]->name);
+ if (rc != -EINVAL)
+ audit_panic("error in audit_log_obj_ctx");
+ error = rc;
+ } else {
+ audit_log_format(ab, "%sobj_%s=%s", space,
+ audit_obj_lsms[i]->name, ctx.context);
+ security_release_secctx(&ctx);
+ }
+ space = " ";
+ }
+
+ audit_buffer_aux_end(ab);
+ return error;
+
+error_path:
+ audit_panic("error in audit_log_obj_ctx");
+ return error;
+}
+
void audit_log_d_path_exe(struct audit_buffer *ab,
struct mm_struct *mm)
{
@@ -2411,6 +2601,28 @@ int audit_signal_info(int sig, struct task_struct *t)
}
/**
+ * __audit_log_end - enqueue one audit record
+ * @skb: the buffer to send
+ */
+static void __audit_log_end(struct sk_buff *skb)
+{
+ struct nlmsghdr *nlh;
+
+ if (audit_rate_check()) {
+ /* setup the netlink header, see the comments in
+ * kauditd_send_multicast_skb() for length quirks */
+ nlh = nlmsg_hdr(skb);
+ nlh->nlmsg_len = skb->len - NLMSG_HDRLEN;
+
+ /* queue the netlink packet */
+ skb_queue_tail(&audit_queue, skb);
+ } else {
+ audit_log_lost("rate limit exceeded");
+ kfree_skb(skb);
+ }
+}
+
+/**
* audit_log_end - end one audit record
* @ab: the audit_buffer
*
@@ -2422,25 +2634,15 @@ int audit_signal_info(int sig, struct task_struct *t)
void audit_log_end(struct audit_buffer *ab)
{
struct sk_buff *skb;
- struct nlmsghdr *nlh;
if (!ab)
return;
- if (audit_rate_check()) {
- skb = ab->skb;
- ab->skb = NULL;
-
- /* setup the netlink header, see the comments in
- * kauditd_send_multicast_skb() for length quirks */
- nlh = nlmsg_hdr(skb);
- nlh->nlmsg_len = skb->len - NLMSG_HDRLEN;
+ while ((skb = skb_dequeue(&ab->skb_list)))
+ __audit_log_end(skb);
- /* queue the netlink packet and poke the kauditd thread */
- skb_queue_tail(&audit_queue, skb);
- wake_up_interruptible(&kauditd_wait);
- } else
- audit_log_lost("rate limit exceeded");
+ /* poke the kauditd thread */
+ wake_up_interruptible(&kauditd_wait);
audit_buffer_free(ab);
}
diff --git a/kernel/audit.h b/kernel/audit.h
index 2a24d01c5fb0..0f05933a173b 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -99,6 +99,12 @@ struct audit_proctitle {
char *value; /* the cmdline field */
};
+/* A timestamp/serial pair to identify an event */
+struct audit_stamp {
+ struct timespec64 ctime; /* time of syscall entry */
+ unsigned int serial; /* serial number for record */
+};
+
/* The per-task audit context. */
struct audit_context {
int dummy; /* must be the first element */
@@ -108,10 +114,9 @@ struct audit_context {
AUDIT_CTX_URING, /* in use by io_uring */
} context;
enum audit_state state, current_state;
- unsigned int serial; /* serial number for record */
+ struct audit_stamp stamp; /* event identifier */
int major; /* syscall number */
int uring_op; /* uring operation */
- struct timespec64 ctime; /* time of syscall entry */
unsigned long argv[4]; /* syscall arguments */
long return_code;/* syscall return code */
u64 prio;
@@ -263,7 +268,7 @@ extern void audit_put_tty(struct tty_struct *tty);
extern unsigned int audit_serial(void);
#ifdef CONFIG_AUDITSYSCALL
extern int auditsc_get_stamp(struct audit_context *ctx,
- struct timespec64 *t, unsigned int *serial);
+ struct audit_stamp *stamp);
extern void audit_put_watch(struct audit_watch *watch);
extern void audit_get_watch(struct audit_watch *watch);
@@ -304,7 +309,7 @@ extern void audit_filter_inodes(struct task_struct *tsk,
struct audit_context *ctx);
extern struct list_head *audit_killed_trees(void);
#else /* CONFIG_AUDITSYSCALL */
-#define auditsc_get_stamp(c, t, s) 0
+#define auditsc_get_stamp(c, s) 0
#define audit_put_watch(w) do { } while (0)
#define audit_get_watch(w) do { } while (0)
#define audit_to_watch(k, p, l, o) (-EINVAL)
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index c565fbf66ac8..b92805b317a2 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -76,17 +76,18 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
struct audit_fsnotify_mark *audit_mark;
struct path path;
struct dentry *dentry;
- struct inode *inode;
int ret;
if (pathname[0] != '/' || pathname[len-1] == '/')
return ERR_PTR(-EINVAL);
- dentry = kern_path_locked(pathname, &path);
+ dentry = kern_path_parent(pathname, &path);
if (IS_ERR(dentry))
return ERR_CAST(dentry); /* returning an error */
- inode = path.dentry->d_inode;
- inode_unlock(inode);
+ if (d_really_is_negative(dentry)) {
+ audit_mark = ERR_PTR(-ENOENT);
+ goto out;
+ }
audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL);
if (unlikely(!audit_mark)) {
@@ -100,7 +101,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
audit_update_mark(audit_mark, dentry->d_inode);
audit_mark->rule = krule;
- ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, 0);
+ ret = fsnotify_add_inode_mark(&audit_mark->mark, path.dentry->d_inode, 0);
if (ret < 0) {
audit_mark->path = NULL;
fsnotify_put_mark(&audit_mark->mark);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index b0eae2a3c895..1605df0a171e 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -93,8 +93,10 @@ static struct kmem_cache *audit_tree_mark_cachep __ro_after_init;
static struct audit_tree *alloc_tree(const char *s)
{
struct audit_tree *tree;
+ size_t sz;
- tree = kmalloc(struct_size(tree, pathname, strlen(s) + 1), GFP_KERNEL);
+ sz = strlen(s) + 1;
+ tree = kmalloc(struct_size(tree, pathname, sz), GFP_KERNEL);
if (tree) {
refcount_set(&tree->count, 1);
tree->goner = 0;
@@ -103,7 +105,7 @@ static struct audit_tree *alloc_tree(const char *s)
INIT_LIST_HEAD(&tree->list);
INIT_LIST_HEAD(&tree->same_root);
tree->root = NULL;
- strcpy(tree->pathname, s);
+ strscpy(tree->pathname, s, sz);
}
return tree;
}
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 0ebbbe37a60f..a700e3c8925f 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -349,7 +349,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent)
{
struct dentry *d;
- d = kern_path_locked_negative(watch->path, parent);
+ d = kern_path_parent(watch->path, parent);
if (IS_ERR(d))
return PTR_ERR(d);
@@ -359,7 +359,6 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent)
watch->ino = d_backing_inode(d)->i_ino;
}
- inode_unlock(d_backing_inode(parent->dentry));
dput(d);
return 0;
}
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index e3f42018ed46..c401082d9b25 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1326,7 +1326,7 @@ int audit_compare_dname_path(const struct qstr *dname, const char *path, int par
/* handle trailing slashes */
pathlen -= parentlen;
- while (p[pathlen - 1] == '/')
+ while (pathlen > 0 && p[pathlen - 1] == '/')
pathlen--;
if (pathlen != dlen)
@@ -1440,7 +1440,7 @@ static int update_lsm_rule(struct audit_krule *r)
}
/* This function will re-initialize the lsm_rule field of all applicable rules.
- * It will traverse the filter lists serarching for rules that contain LSM
+ * It will traverse the filter lists searching for rules that contain LSM
* specific filter fields. When such a rule is found, it is copied, the
* LSM field is re-initialized, and the old rule is replaced with the
* updated rule. */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index eb98cd6fe91f..d1966144bdfe 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -994,10 +994,10 @@ static void audit_reset_context(struct audit_context *ctx)
*/
ctx->current_state = ctx->state;
- ctx->serial = 0;
+ ctx->stamp.serial = 0;
+ ctx->stamp.ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 };
ctx->major = 0;
ctx->uring_op = 0;
- ctx->ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 };
memset(ctx->argv, 0, sizeof(ctx->argv));
ctx->return_code = 0;
ctx->prio = (ctx->state == AUDIT_STATE_RECORD ? ~0ULL : 0);
@@ -1098,7 +1098,6 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
char *comm)
{
struct audit_buffer *ab;
- struct lsm_context ctx;
int rc = 0;
ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID);
@@ -1108,15 +1107,9 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid,
from_kuid(&init_user_ns, auid),
from_kuid(&init_user_ns, uid), sessionid);
- if (lsmprop_is_set(prop)) {
- if (security_lsmprop_to_secctx(prop, &ctx) < 0) {
- audit_log_format(ab, " obj=(none)");
- rc = 1;
- } else {
- audit_log_format(ab, " obj=%s", ctx.context);
- security_release_secctx(&ctx);
- }
- }
+ if (lsmprop_is_set(prop) && audit_log_obj_ctx(ab, prop))
+ rc = 1;
+
audit_log_format(ab, " ocomm=");
audit_log_untrustedstring(ab, comm);
audit_log_end(ab);
@@ -1392,15 +1385,8 @@ static void show_special(struct audit_context *context, int *call_panic)
from_kgid(&init_user_ns, context->ipc.gid),
context->ipc.mode);
if (lsmprop_is_set(&context->ipc.oprop)) {
- struct lsm_context lsmctx;
-
- if (security_lsmprop_to_secctx(&context->ipc.oprop,
- &lsmctx) < 0) {
+ if (audit_log_obj_ctx(ab, &context->ipc.oprop))
*call_panic = 1;
- } else {
- audit_log_format(ab, " obj=%s", lsmctx.context);
- security_release_secctx(&lsmctx);
- }
}
if (context->ipc.has_perm) {
audit_log_end(ab);
@@ -1557,17 +1543,9 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
from_kgid(&init_user_ns, n->gid),
MAJOR(n->rdev),
MINOR(n->rdev));
- if (lsmprop_is_set(&n->oprop)) {
- struct lsm_context ctx;
-
- if (security_lsmprop_to_secctx(&n->oprop, &ctx) < 0) {
- if (call_panic)
- *call_panic = 2;
- } else {
- audit_log_format(ab, " obj=%s", ctx.context);
- security_release_secctx(&ctx);
- }
- }
+ if (lsmprop_is_set(&n->oprop) &&
+ audit_log_obj_ctx(ab, &n->oprop))
+ *call_panic = 2;
/* log the audit_names record type */
switch (n->type) {
@@ -1785,8 +1763,9 @@ static void audit_log_exit(void)
audit_log_pid_context(context, context->target_pid,
context->target_auid, context->target_uid,
context->target_sessionid,
- &context->target_ref, context->target_comm))
- call_panic = 1;
+ &context->target_ref,
+ context->target_comm))
+ call_panic = 1;
if (context->pwd.dentry && context->pwd.mnt) {
ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
@@ -1917,7 +1896,7 @@ void __audit_uring_entry(u8 op)
ctx->context = AUDIT_CTX_URING;
ctx->current_state = ctx->state;
- ktime_get_coarse_real_ts64(&ctx->ctime);
+ ktime_get_coarse_real_ts64(&ctx->stamp.ctime);
}
/**
@@ -2039,7 +2018,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
context->argv[3] = a4;
context->context = AUDIT_CTX_SYSCALL;
context->current_state = state;
- ktime_get_coarse_real_ts64(&context->ctime);
+ ktime_get_coarse_real_ts64(&context->stamp.ctime);
}
/**
@@ -2508,21 +2487,17 @@ EXPORT_SYMBOL_GPL(__audit_inode_child);
/**
* auditsc_get_stamp - get local copies of audit_context values
* @ctx: audit_context for the task
- * @t: timespec64 to store time recorded in the audit_context
- * @serial: serial value that is recorded in the audit_context
+ * @stamp: timestamp to record
*
* Also sets the context as auditable.
*/
-int auditsc_get_stamp(struct audit_context *ctx,
- struct timespec64 *t, unsigned int *serial)
+int auditsc_get_stamp(struct audit_context *ctx, struct audit_stamp *stamp)
{
if (ctx->context == AUDIT_CTX_UNUSED)
return 0;
- if (!ctx->serial)
- ctx->serial = audit_serial();
- t->tv_sec = ctx->ctime.tv_sec;
- t->tv_nsec = ctx->ctime.tv_nsec;
- *serial = ctx->serial;
+ if (!ctx->stamp.serial)
+ ctx->stamp.serial = audit_serial();
+ *stamp = ctx->stamp;
if (!ctx->prio) {
ctx->prio = 1;
ctx->current_state = AUDIT_STATE_RECORD;
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 269c04a24664..f6cf8c2af5f7 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -62,3 +62,4 @@ CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_queue_stack_maps.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_lpm_trie.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_ringbuf.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_rqspinlock.o = $(CC_FLAGS_FTRACE)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 5d1650af899d..e4568d44e827 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2366,8 +2366,7 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx,
const struct bpf_insn *insn)
{
/* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON
- * is not working properly, or interpreter is being used when
- * prog->jit_requested is not 0, so warn about it!
+ * is not working properly, so warn about it!
*/
WARN_ON_ONCE(1);
return 0;
@@ -2468,8 +2467,9 @@ out:
return ret;
}
-static void bpf_prog_select_func(struct bpf_prog *fp)
+static bool bpf_prog_select_interpreter(struct bpf_prog *fp)
{
+ bool select_interpreter = false;
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
u32 idx = (round_up(stack_depth, 32) / 32) - 1;
@@ -2478,15 +2478,16 @@ static void bpf_prog_select_func(struct bpf_prog *fp)
* But for non-JITed programs, we don't need bpf_func, so no bounds
* check needed.
*/
- if (!fp->jit_requested &&
- !WARN_ON_ONCE(idx >= ARRAY_SIZE(interpreters))) {
+ if (idx < ARRAY_SIZE(interpreters)) {
fp->bpf_func = interpreters[idx];
+ select_interpreter = true;
} else {
fp->bpf_func = __bpf_prog_ret0_warn;
}
#else
fp->bpf_func = __bpf_prog_ret0_warn;
#endif
+ return select_interpreter;
}
/**
@@ -2505,7 +2506,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
/* In case of BPF to BPF calls, verifier did all the prep
* work with regards to JITing, etc.
*/
- bool jit_needed = fp->jit_requested;
+ bool jit_needed = false;
if (fp->bpf_func)
goto finalize;
@@ -2514,7 +2515,8 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
bpf_prog_has_kfunc_call(fp))
jit_needed = true;
- bpf_prog_select_func(fp);
+ if (!bpf_prog_select_interpreter(fp))
+ jit_needed = true;
/* eBPF JITs can rewrite the program in case constant
* blinding is active. However, in case of error during
@@ -3024,7 +3026,10 @@ EXPORT_SYMBOL_GPL(bpf_event_output);
/* Always built-in helper functions. */
const struct bpf_func_proto bpf_tail_call_proto = {
- .func = NULL,
+ /* func is unused for tail_call, we set it to pass the
+ * get_helper_proto check
+ */
+ .func = BPF_PTR_POISON,
.gpl_only = false,
.ret_type = RET_VOID,
.arg1_type = ARG_PTR_TO_CTX,
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index b2b7b8ec2c2a..c46360b27871 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -186,7 +186,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
struct xdp_buff xdp;
int i, nframes = 0;
- xdp_set_return_frame_no_direct();
xdp.rxq = &rxq;
for (i = 0; i < n; i++) {
@@ -231,7 +230,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
}
}
- xdp_clear_return_frame_no_direct();
stats->pass += nframes;
return nframes;
@@ -255,6 +253,7 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
rcu_read_lock();
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+ xdp_set_return_frame_no_direct();
ret->xdp_n = cpu_map_bpf_prog_run_xdp(rcpu, frames, ret->xdp_n, stats);
if (unlikely(ret->skb_n))
@@ -264,6 +263,7 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
if (stats->redirect)
xdp_do_flush();
+ xdp_clear_return_frame_no_direct();
bpf_net_ctx_clear(bpf_net_ctx);
rcu_read_unlock();
diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c
index 94854cd9c4cc..83c4d9943084 100644
--- a/kernel/bpf/crypto.c
+++ b/kernel/bpf/crypto.c
@@ -278,7 +278,7 @@ static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx,
siv_len = siv ? __bpf_dynptr_size(siv) : 0;
src_len = __bpf_dynptr_size(src);
dst_len = __bpf_dynptr_size(dst);
- if (!src_len || !dst_len)
+ if (!src_len || !dst_len || src_len > dst_len)
return -EINVAL;
if (siv_len != ctx->siv_len)
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 6b4877e85a68..8af62cb243d9 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1274,8 +1274,11 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
goto out;
}
- /* allocate hrtimer via map_kmalloc to use memcg accounting */
- cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
+ /* Allocate via bpf_map_kmalloc_node() for memcg accounting. Until
+ * kmalloc_nolock() is available, avoid locking issues by using
+ * __GFP_HIGH (GFP_ATOMIC & ~__GFP_RECLAIM).
+ */
+ cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node);
if (!cb) {
ret = -ENOMEM;
goto out;
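For reference, with the gfp definitions in recent mainline (GFP_ATOMIC == __GFP_HIGH | __GFP_KSWAPD_RECLAIM and __GFP_RECLAIM == __GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM), the expression the comment above refers to works out as:

GFP_ATOMIC & ~__GFP_RECLAIM
	== (__GFP_HIGH | __GFP_KSWAPD_RECLAIM) & ~(__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM)
	== __GFP_HIGH

i.e. the allocation keeps access to atomic reserves but never enters reclaim.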
@@ -3664,10 +3667,17 @@ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len
guard(pagefault)();
for (i = 0; i < XATTR_SIZE_MAX; i++) {
- for (j = 0; i + j < len && j < XATTR_SIZE_MAX; j++) {
+ for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) {
__get_kernel_nofault(&c2, s2__ign + j, char, err_out);
if (c2 == '\0')
return i;
+ /*
+ * We allow reading an extra byte from s2 (note the
+ * `i + j <= len` above) to cover the case when s2 is
+ * a suffix of the first len chars of s1.
+ */
+ if (i + j == len)
+ break;
__get_kernel_nofault(&c1, s1__ign + j, char, err_out);
if (c1 == '\0')
return -ENOENT;
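A worked example of the suffix case the widened bound covers (values illustrative): bpf_strnstr("sched_switch", "ed", 5) searches for "ed" within the first 5 characters, "sched". At i = 3 the inner loop matches j = 0 ('e') and j = 1 ('d'); the hit is only reported on the j = 2 iteration, where s2[2] == '\0' is seen before the i + j == len break and the function returns 3. With the previous "i + j < len" bound that iteration never ran, so a needle ending exactly at offset len was reported as -ENOENT.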
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 5c2e96b19392..f90bdcc0a047 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -442,7 +442,7 @@ static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw,
umode_t mode;
int ret;
- dentry = user_path_create(path_fd, pathname, &path, 0);
+ dentry = start_creating_user_path(path_fd, pathname, &path, 0);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
@@ -471,7 +471,7 @@ static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw,
ret = -EPERM;
}
out:
- done_path_create(&path, dentry);
+ end_creating_path(&path, dentry);
return ret;
}
@@ -788,7 +788,7 @@ static void bpf_free_inode(struct inode *inode)
const struct super_operations bpf_super_ops = {
.statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.show_options = bpf_show_options,
.free_inode = bpf_free_inode,
};
diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
index 5ab354d55d82..a00561b1d3e5 100644
--- a/kernel/bpf/rqspinlock.c
+++ b/kernel/bpf/rqspinlock.c
@@ -471,7 +471,7 @@ queue:
* any MCS node. This is not the most elegant solution, but is
* simple enough.
*/
- if (unlikely(idx >= _Q_MAX_NODES)) {
+ if (unlikely(idx >= _Q_MAX_NODES || in_nmi())) {
lockevent_inc(lock_no_node);
RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
while (!queued_spin_trylock(lock)) {
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 3615c06b7dfa..ec3a57a5fba1 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -314,7 +314,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
if (max_depth > sysctl_perf_event_max_stack)
max_depth = sysctl_perf_event_max_stack;
- trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
+ trace = get_perf_callchain(regs, kernel, user, max_depth,
false, false);
if (unlikely(!trace))
@@ -451,7 +451,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
else if (kernel && task)
trace = get_callchain_entry_for_task(task, max_depth);
else
- trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
+ trace = get_perf_callchain(regs, kernel, user, max_depth,
crosstask, false);
if (unlikely(!trace) || trace->nr < skip) {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c4f69a9e9af6..8340cecd1b35 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -8547,6 +8547,10 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
verifier_bug(env, "Two map pointers in a timer helper");
return -EFAULT;
}
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n");
+ return -EOPNOTSUPP;
+ }
meta->map_uid = reg->map_uid;
meta->map_ptr = map;
return 0;
@@ -11354,7 +11358,7 @@ static int get_helper_proto(struct bpf_verifier_env *env, int func_id,
return -EINVAL;
*ptr = env->ops->get_func_proto(func_id, env->prog);
- return *ptr ? 0 : -EINVAL;
+ return *ptr && (*ptr)->func ? 0 : -EINVAL;
}
static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
@@ -23855,6 +23859,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
BTF_SET_START(btf_id_deny)
BTF_ID_UNUSED
#ifdef CONFIG_SMP
+BTF_ID(func, ___migrate_enable)
BTF_ID(func, migrate_disable)
BTF_ID(func, migrate_enable)
#endif
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index b14e61c64a34..22051b4f1ccb 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -249,12 +249,15 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
bool threadgroup);
-void cgroup_attach_lock(bool lock_threadgroup);
-void cgroup_attach_unlock(bool lock_threadgroup);
+void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode,
+ struct task_struct *tsk);
+void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode,
+ struct task_struct *tsk);
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
- bool *locked)
+ enum cgroup_attach_lock_mode *lock_mode)
__acquires(&cgroup_threadgroup_rwsem);
-void cgroup_procs_write_finish(struct task_struct *task, bool locked)
+void cgroup_procs_write_finish(struct task_struct *task,
+ enum cgroup_attach_lock_mode lock_mode)
__releases(&cgroup_threadgroup_rwsem);
void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 2a4a387f867a..a9e029b570c8 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -10,6 +10,7 @@
#include <linux/sched/task.h>
#include <linux/magic.h>
#include <linux/slab.h>
+#include <linux/string.h>
#include <linux/vmalloc.h>
#include <linux/delayacct.h>
#include <linux/pid_namespace.h>
@@ -68,7 +69,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
int retval = 0;
cgroup_lock();
- cgroup_attach_lock(true);
+ cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
for_each_root(root) {
struct cgroup *from_cgrp;
@@ -80,7 +81,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
if (retval)
break;
}
- cgroup_attach_unlock(true);
+ cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
cgroup_unlock();
return retval;
@@ -117,7 +118,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
cgroup_lock();
- cgroup_attach_lock(true);
+ cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
/* all tasks in @from are being moved, all csets are source */
spin_lock_irq(&css_set_lock);
@@ -153,7 +154,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
} while (task && !ret);
out_err:
cgroup_migrate_finish(&mgctx);
- cgroup_attach_unlock(true);
+ cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
cgroup_unlock();
return ret;
}
@@ -502,13 +503,13 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
struct task_struct *task;
const struct cred *cred, *tcred;
ssize_t ret;
- bool locked;
+ enum cgroup_attach_lock_mode lock_mode;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
- task = cgroup_procs_write_start(buf, threadgroup, &locked);
+ task = cgroup_procs_write_start(buf, threadgroup, &lock_mode);
ret = PTR_ERR_OR_ZERO(task);
if (ret)
goto out_unlock;
@@ -531,7 +532,7 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
ret = cgroup_attach_task(cgrp, task, threadgroup);
out_finish:
- cgroup_procs_write_finish(task, locked);
+ cgroup_procs_write_finish(task, lock_mode);
out_unlock:
cgroup_kn_unlock(of->kn);
@@ -1133,7 +1134,7 @@ int cgroup1_reconfigure(struct fs_context *fc)
if (ctx->release_agent) {
spin_lock(&release_agent_path_lock);
- strcpy(root->release_agent_path, ctx->release_agent);
+ strscpy(root->release_agent_path, ctx->release_agent);
spin_unlock(&release_agent_path_lock);
}
@@ -1325,7 +1326,7 @@ static int __init cgroup1_wq_init(void)
* Cap @max_active to 1 too.
*/
cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
- 0, 1);
+ WQ_PERCPU, 1);
BUG_ON(!cgroup_pidlist_destroy_wq);
return 0;
}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 312c6a8b55bb..da4eaee52178 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -59,6 +59,7 @@
#include <linux/sched/cputime.h>
#include <linux/sched/deadline.h>
#include <linux/psi.h>
+#include <linux/nstree.h>
#include <net/sock.h>
#define CREATE_TRACE_POINTS
@@ -124,10 +125,33 @@ DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
/*
* cgroup destruction makes heavy use of work items and there can be a lot
* of concurrent destructions. Use a separate workqueue so that cgroup
- * destruction work items don't end up filling up max_active of system_wq
+ * destruction work items don't end up filling up max_active of system_percpu_wq
* which may lead to deadlock.
+ *
+ * A cgroup destruction should enqueue work sequentially to:
+ * cgroup_offline_wq: use for css offline work
+ * cgroup_release_wq: use for css release work
+ * cgroup_free_wq: use for free work
+ *
+ * Rationale for using separate workqueues:
+ * The cgroup root free work may depend on completion of other css offline
+ * operations. If all tasks were enqueued to a single workqueue, this could
+ * create a deadlock scenario where:
+ * - Free work waits for other css offline work to complete.
+ * - But other css offline work is queued after free work in the same queue.
+ *
+ * Example deadlock scenario with single workqueue (cgroup_destroy_wq):
+ * 1. umount net_prio
+ * 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx)
+ * 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx)
+ * 4. net_prio cgroup_destroy_root->cgroup_lock_and_drain_offline.
+ * 5. net_prio root destruction blocks waiting for perf_event CSS A offline,
+ * which can never complete as it's behind in the same queue and
+ * workqueue's max_active is 1.
*/
-static struct workqueue_struct *cgroup_destroy_wq;
+static struct workqueue_struct *cgroup_offline_wq;
+static struct workqueue_struct *cgroup_release_wq;
+static struct workqueue_struct *cgroup_free_wq;
/* generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
@@ -216,13 +240,22 @@ static u16 have_canfork_callback __read_mostly;
static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS);
+/*
+ * Write protected by cgroup_mutex and write-lock of cgroup_threadgroup_rwsem,
+ * read protected by either.
+ *
+ * Can only be turned on, but not turned off.
+ */
+bool cgroup_enable_per_threadgroup_rwsem __read_mostly;
+
/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
- .ns.count = REFCOUNT_INIT(2),
+ .ns.__ns_ref = REFCOUNT_INIT(2),
.user_ns = &init_user_ns,
.ns.ops = &cgroupns_operations,
- .ns.inum = PROC_CGROUP_INIT_INO,
+ .ns.inum = ns_init_inum(&init_cgroup_ns),
.root_cset = &init_css_set,
+ .ns.ns_type = ns_common_type(&init_cgroup_ns),
};
static struct file_system_type cgroup2_fs_type;
@@ -1302,14 +1335,30 @@ void cgroup_favor_dynmods(struct cgroup_root *root, bool favor)
{
bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS;
- /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */
+ /*
+ * See the comment above the CGRP_ROOT_FAVOR_DYNMODS definition.
+ * favordynmods can flip while a task is between
+ * cgroup_threadgroup_change_begin() and end(), so down_write the global
+ * cgroup_threadgroup_rwsem to synchronize them.
+ *
+ * Once cgroup_enable_per_threadgroup_rwsem is enabled, holding
+ * cgroup_threadgroup_rwsem doesn't exclude tasks between
+ * cgroup_threadgroup_change_begin() and end() and thus it's unsafe to
+ * turn it off. As the scenario is unlikely, simply disallow disabling
+ * once enabled and print a warning.
+ */
+ percpu_down_write(&cgroup_threadgroup_rwsem);
if (favor && !favoring) {
+ cgroup_enable_per_threadgroup_rwsem = true;
rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);
root->flags |= CGRP_ROOT_FAVOR_DYNMODS;
} else if (!favor && favoring) {
+ if (cgroup_enable_per_threadgroup_rwsem)
+ pr_warn_once("cgroup favordynmods: per threadgroup rwsem mechanism can't be disabled\n");
rcu_sync_exit(&cgroup_threadgroup_rwsem.rss);
root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
}
+ percpu_up_write(&cgroup_threadgroup_rwsem);
}
static int cgroup_init_root_id(struct cgroup_root *root)
@@ -2459,7 +2508,8 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns);
/**
* cgroup_attach_lock - Lock for ->attach()
- * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
+ * @lock_mode: whether to take a rwsem, and which one to take
+ * @tsk: thread group to lock
*
* cgroup migration sometimes needs to stabilize threadgroups against forks and
* exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
@@ -2479,22 +2529,55 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns);
* Resolve the situation by always acquiring cpus_read_lock() before optionally
* write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
* CPU hotplug is disabled on entry.
+ *
+ * When favordynmods is enabled, take the per-threadgroup rwsem to reduce
+ * overhead on dynamic cgroup modifications. See the comment above the
+ * CGRP_ROOT_FAVOR_DYNMODS definition.
+ *
+ * tsk is not NULL only when writing to cgroup.procs.
*/
-void cgroup_attach_lock(bool lock_threadgroup)
+void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode,
+ struct task_struct *tsk)
{
cpus_read_lock();
- if (lock_threadgroup)
+
+ switch (lock_mode) {
+ case CGRP_ATTACH_LOCK_NONE:
+ break;
+ case CGRP_ATTACH_LOCK_GLOBAL:
percpu_down_write(&cgroup_threadgroup_rwsem);
+ break;
+ case CGRP_ATTACH_LOCK_PER_THREADGROUP:
+ down_write(&tsk->signal->cgroup_threadgroup_rwsem);
+ break;
+ default:
+ pr_warn("cgroup: Unexpected attach lock mode.");
+ break;
+ }
}
/**
* cgroup_attach_unlock - Undo cgroup_attach_lock()
- * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
+ * @lock_mode: whether to release a rwsem, and which one to release
+ * @tsk: thread group to unlock
*/
-void cgroup_attach_unlock(bool lock_threadgroup)
+void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode,
+ struct task_struct *tsk)
{
- if (lock_threadgroup)
+ switch (lock_mode) {
+ case CGRP_ATTACH_LOCK_NONE:
+ break;
+ case CGRP_ATTACH_LOCK_GLOBAL:
percpu_up_write(&cgroup_threadgroup_rwsem);
+ break;
+ case CGRP_ATTACH_LOCK_PER_THREADGROUP:
+ up_write(&tsk->signal->cgroup_threadgroup_rwsem);
+ break;
+ default:
+ pr_warn("cgroup: Unexpected attach lock mode.");
+ break;
+ }
+
cpus_read_unlock();
}
@@ -2944,14 +3027,12 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
/* look up all src csets */
spin_lock_irq(&css_set_lock);
- rcu_read_lock();
task = leader;
do {
cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
if (!threadgroup)
break;
} while_each_thread(leader, task);
- rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
/* prepare dst csets and commit */
@@ -2968,7 +3049,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
}
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
- bool *threadgroup_locked)
+ enum cgroup_attach_lock_mode *lock_mode)
{
struct task_struct *tsk;
pid_t pid;
@@ -2976,24 +3057,13 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
return ERR_PTR(-EINVAL);
- /*
- * If we migrate a single thread, we don't care about threadgroup
- * stability. If the thread is `current`, it won't exit(2) under our
- * hands or change PID through exec(2). We exclude
- * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
- * callers by cgroup_mutex.
- * Therefore, we can skip the global lock.
- */
- lockdep_assert_held(&cgroup_mutex);
- *threadgroup_locked = pid || threadgroup;
- cgroup_attach_lock(*threadgroup_locked);
-
+retry_find_task:
rcu_read_lock();
if (pid) {
tsk = find_task_by_vpid(pid);
if (!tsk) {
tsk = ERR_PTR(-ESRCH);
- goto out_unlock_threadgroup;
+ goto out_unlock_rcu;
}
} else {
tsk = current;
@@ -3010,33 +3080,58 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
*/
if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
tsk = ERR_PTR(-EINVAL);
- goto out_unlock_threadgroup;
+ goto out_unlock_rcu;
}
-
get_task_struct(tsk);
- goto out_unlock_rcu;
+ rcu_read_unlock();
+
+ /*
+ * If we migrate a single thread, we don't care about threadgroup
+ * stability. If the thread is `current`, it won't exit(2) under our
+ * hands or change PID through exec(2). We exclude
+ * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write callers
+ * by cgroup_mutex. Therefore, we can skip the global lock.
+ */
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (pid || threadgroup) {
+ if (cgroup_enable_per_threadgroup_rwsem)
+ *lock_mode = CGRP_ATTACH_LOCK_PER_THREADGROUP;
+ else
+ *lock_mode = CGRP_ATTACH_LOCK_GLOBAL;
+ } else {
+ *lock_mode = CGRP_ATTACH_LOCK_NONE;
+ }
+
+ cgroup_attach_lock(*lock_mode, tsk);
+
+ if (threadgroup) {
+ if (!thread_group_leader(tsk)) {
+ /*
+ * A race with de_thread from another thread's exec()
+ * may strip us of our leadership. If this happens,
+ * throw this task away and try again.
+ */
+ cgroup_attach_unlock(*lock_mode, tsk);
+ put_task_struct(tsk);
+ goto retry_find_task;
+ }
+ }
+
+ return tsk;
-out_unlock_threadgroup:
- cgroup_attach_unlock(*threadgroup_locked);
- *threadgroup_locked = false;
out_unlock_rcu:
rcu_read_unlock();
return tsk;
}
-void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
+void cgroup_procs_write_finish(struct task_struct *task,
+ enum cgroup_attach_lock_mode lock_mode)
{
- struct cgroup_subsys *ss;
- int ssid;
+ cgroup_attach_unlock(lock_mode, task);
/* release reference from cgroup_procs_write_start() */
put_task_struct(task);
-
- cgroup_attach_unlock(threadgroup_locked);
-
- for_each_subsys(ss, ssid)
- if (ss->post_attach)
- ss->post_attach();
}
static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
@@ -3088,6 +3183,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
struct cgroup_subsys_state *d_css;
struct cgroup *dsct;
struct css_set *src_cset;
+ enum cgroup_attach_lock_mode lock_mode;
bool has_tasks;
int ret;
@@ -3119,7 +3215,13 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
* write-locking can be skipped safely.
*/
has_tasks = !list_empty(&mgctx.preloaded_src_csets);
- cgroup_attach_lock(has_tasks);
+
+ if (has_tasks)
+ lock_mode = CGRP_ATTACH_LOCK_GLOBAL;
+ else
+ lock_mode = CGRP_ATTACH_LOCK_NONE;
+
+ cgroup_attach_lock(lock_mode, NULL);
/* NULL dst indicates self on default hierarchy */
ret = cgroup_migrate_prepare_dst(&mgctx);
@@ -3140,7 +3242,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
ret = cgroup_migrate_execute(&mgctx);
out_finish:
cgroup_migrate_finish(&mgctx);
- cgroup_attach_unlock(has_tasks);
+ cgroup_attach_unlock(lock_mode, NULL);
return ret;
}
@@ -3763,6 +3865,27 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
return 0;
}
+static int cgroup_core_local_stat_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ unsigned int sequence;
+ u64 freeze_time;
+
+ do {
+ sequence = read_seqcount_begin(&cgrp->freezer.freeze_seq);
+ freeze_time = cgrp->freezer.frozen_nsec;
+ /* Add in current freezer interval if the cgroup is freezing. */
+ if (test_bit(CGRP_FREEZE, &cgrp->flags))
+ freeze_time += (ktime_get_ns() -
+ cgrp->freezer.freeze_start_nsec);
+ } while (read_seqcount_retry(&cgrp->freezer.freeze_seq, sequence));
+
+ do_div(freeze_time, NSEC_PER_USEC);
+ seq_printf(seq, "frozen_usec %llu\n", freeze_time);
+
+ return 0;
+}
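Reading the new cgroup.stat.local file therefore yields one key/value line with the cumulative frozen time in microseconds, e.g. (hypothetical cgroup path and value):

$ cat /sys/fs/cgroup/test/cgroup.stat.local
frozen_usec 2500000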
+
#ifdef CONFIG_CGROUP_SCHED
/**
* cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
@@ -4159,6 +4282,7 @@ static void cgroup_file_release(struct kernfs_open_file *of)
cft->release(of);
put_cgroup_ns(ctx->ns);
kfree(ctx);
+ of->priv = NULL;
}
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
@@ -5241,13 +5365,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
struct task_struct *task;
const struct cred *saved_cred;
ssize_t ret;
- bool threadgroup_locked;
+ enum cgroup_attach_lock_mode lock_mode;
dst_cgrp = cgroup_kn_lock_live(of->kn, false);
if (!dst_cgrp)
return -ENODEV;
- task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked);
+ task = cgroup_procs_write_start(buf, threadgroup, &lock_mode);
ret = PTR_ERR_OR_ZERO(task);
if (ret)
goto out_unlock;
@@ -5273,7 +5397,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
out_finish:
- cgroup_procs_write_finish(task, threadgroup_locked);
+ cgroup_procs_write_finish(task, lock_mode);
out_unlock:
cgroup_kn_unlock(of->kn);
@@ -5355,6 +5479,11 @@ static struct cftype cgroup_base_files[] = {
.seq_show = cgroup_stat_show,
},
{
+ .name = "cgroup.stat.local",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_core_local_stat_show,
+ },
+ {
.name = "cgroup.freeze",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_freeze_show,
@@ -5558,7 +5687,7 @@ static void css_release_work_fn(struct work_struct *work)
cgroup_unlock();
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
- queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
+ queue_rcu_work(cgroup_free_wq, &css->destroy_rwork);
}
static void css_release(struct percpu_ref *ref)
@@ -5567,7 +5696,7 @@ static void css_release(struct percpu_ref *ref)
container_of(ref, struct cgroup_subsys_state, refcnt);
INIT_WORK(&css->destroy_work, css_release_work_fn);
- queue_work(cgroup_destroy_wq, &css->destroy_work);
+ queue_work(cgroup_release_wq, &css->destroy_work);
}
static void init_and_link_css(struct cgroup_subsys_state *css,
@@ -5701,7 +5830,7 @@ err_list_del:
list_del_rcu(&css->sibling);
err_free_css:
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
- queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
+ queue_rcu_work(cgroup_free_wq, &css->destroy_rwork);
return ERR_PTR(err);
}
@@ -5763,6 +5892,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
* if the parent has to be frozen, the child has too.
*/
cgrp->freezer.e_freeze = parent->freezer.e_freeze;
+ seqcount_init(&cgrp->freezer.freeze_seq);
if (cgrp->freezer.e_freeze) {
/*
* Set the CGRP_FREEZE flag, so when a process will be
@@ -5771,6 +5901,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
* consider it frozen immediately.
*/
set_bit(CGRP_FREEZE, &cgrp->flags);
+ cgrp->freezer.freeze_start_nsec = ktime_get_ns();
set_bit(CGRP_FROZEN, &cgrp->flags);
}
@@ -5939,7 +6070,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
if (atomic_dec_and_test(&css->online_cnt)) {
INIT_WORK(&css->destroy_work, css_killed_work_fn);
- queue_work(cgroup_destroy_wq, &css->destroy_work);
+ queue_work(cgroup_offline_wq, &css->destroy_work);
}
}
@@ -6312,6 +6443,7 @@ int __init cgroup_init(void)
WARN_ON(register_filesystem(&cpuset_fs_type));
#endif
+ ns_tree_add(&init_cgroup_ns);
return 0;
}
@@ -6325,8 +6457,14 @@ static int __init cgroup_wq_init(void)
* We would prefer to do this in cgroup_init() above, but that
* is called before init_workqueues(): so leave this until after.
*/
- cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
- BUG_ON(!cgroup_destroy_wq);
+ cgroup_offline_wq = alloc_workqueue("cgroup_offline", WQ_PERCPU, 1);
+ BUG_ON(!cgroup_offline_wq);
+
+ cgroup_release_wq = alloc_workqueue("cgroup_release", WQ_PERCPU, 1);
+ BUG_ON(!cgroup_release_wq);
+
+ cgroup_free_wq = alloc_workqueue("cgroup_free", WQ_PERCPU, 1);
+ BUG_ON(!cgroup_free_wq);
return 0;
}
core_initcall(cgroup_wq_init);
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index 383963e28ac6..337608f408ce 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -38,7 +38,6 @@ enum prs_errcode {
/* bits in struct cpuset flags field */
typedef enum {
- CS_ONLINE,
CS_CPU_EXCLUSIVE,
CS_MEM_EXCLUSIVE,
CS_MEM_HARDWALL,
@@ -202,7 +201,7 @@ static inline struct cpuset *parent_cs(struct cpuset *cs)
/* convenient tests for these bits */
static inline bool is_cpuset_online(struct cpuset *cs)
{
- return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
+ return css_is_online(&cs->css) && !css_is_dying(&cs->css);
}
static inline int is_cpu_exclusive(const struct cpuset *cs)
@@ -277,6 +276,8 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on)
ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off);
int cpuset_common_seq_show(struct seq_file *sf, void *v);
+void cpuset_full_lock(void);
+void cpuset_full_unlock(void);
/*
* cpuset-v1.c
diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
index b69a7db67090..12e76774c75b 100644
--- a/kernel/cgroup/cpuset-v1.c
+++ b/kernel/cgroup/cpuset-v1.c
@@ -169,8 +169,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = -ENODEV;
- cpus_read_lock();
- cpuset_lock();
+ cpuset_full_lock();
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -184,8 +183,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
break;
}
out_unlock:
- cpuset_unlock();
- cpus_read_unlock();
+ cpuset_full_unlock();
return retval;
}
@@ -454,8 +452,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = 0;
- cpus_read_lock();
- cpuset_lock();
+ cpuset_full_lock();
if (!is_cpuset_online(cs)) {
retval = -ENODEV;
goto out_unlock;
@@ -498,8 +495,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
break;
}
out_unlock:
- cpuset_unlock();
- cpus_read_unlock();
+ cpuset_full_unlock();
return retval;
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index f74d04429a29..52468d2c178a 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -40,6 +40,7 @@
#include <linux/sched/isolation.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
+#include <linux/task_work.h>
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
@@ -131,11 +132,6 @@ static bool force_sd_rebuild;
#define PRS_INVALID_ROOT -1
#define PRS_INVALID_ISOLATED -2
-static inline bool is_prs_invalid(int prs_state)
-{
- return prs_state < 0;
-}
-
/*
* Temporary cpumasks for working with partitions that are passed among
* functions to avoid memory allocation in inner functions.
@@ -159,16 +155,21 @@ void dec_dl_tasks_cs(struct task_struct *p)
cs->nr_deadline_tasks--;
}
-static inline int is_partition_valid(const struct cpuset *cs)
+static inline bool is_partition_valid(const struct cpuset *cs)
{
return cs->partition_root_state > 0;
}
-static inline int is_partition_invalid(const struct cpuset *cs)
+static inline bool is_partition_invalid(const struct cpuset *cs)
{
return cs->partition_root_state < 0;
}
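+
+/* A member cpuset is neither a valid nor an invalid partition root. */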
+static inline bool cs_is_member(const struct cpuset *cs)
+{
+ return cs->partition_root_state == PRS_MEMBER;
+}
+
/*
* Callers should hold callback_lock to modify partition_root_state.
*/
@@ -207,7 +208,7 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs)
* parallel, we may leave an offline CPU in cpu_allowed or some other masks.
*/
static struct cpuset top_cpuset = {
- .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) |
+ .flags = BIT(CS_CPU_EXCLUSIVE) |
BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
.partition_root_state = PRS_ROOT,
.relax_domain_level = -1,
@@ -250,6 +251,12 @@ static struct cpuset top_cpuset = {
static DEFINE_MUTEX(cpuset_mutex);
+/**
+ * cpuset_lock - Acquire the global cpuset mutex
+ *
+ * This locks the global cpuset mutex to prevent modifications to cpuset
+ * hierarchy and configurations. On its own it is not sufficient for making
+ * modifications; see cpuset_full_lock() for full protection.
+ */
void cpuset_lock(void)
{
mutex_lock(&cpuset_mutex);
@@ -260,6 +267,24 @@ void cpuset_unlock(void)
mutex_unlock(&cpuset_mutex);
}
+/**
+ * cpuset_full_lock - Acquire full protection for cpuset modification
+ *
+ * Takes both CPU hotplug read lock (cpus_read_lock()) and cpuset mutex
+ * to safely modify cpuset data.
+ */
+void cpuset_full_lock(void)
+{
+ cpus_read_lock();
+ mutex_lock(&cpuset_mutex);
+}
+
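+/**
+ * cpuset_full_unlock - Release cpuset modification protection
+ *
+ * Drops the cpuset mutex and the CPU hotplug read lock taken by
+ * cpuset_full_lock(), in the reverse order of acquisition.
+ */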
+void cpuset_full_unlock(void)
+{
+ mutex_unlock(&cpuset_mutex);
+ cpus_read_unlock();
+}
+
static DEFINE_SPINLOCK(callback_lock);
void cpuset_callback_lock_irq(void)
@@ -280,7 +305,7 @@ static inline void check_insane_mems_config(nodemask_t *nodes)
{
if (!cpusets_insane_config() &&
movable_only_nodes(nodes)) {
- static_branch_enable(&cpusets_insane_config_key);
+ static_branch_enable_cpuslocked(&cpusets_insane_config_key);
pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
"Cpuset allocations might fail even with a lot of memory available.\n",
nodemask_pr_args(nodes));
@@ -411,94 +436,104 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
}
/**
- * alloc_cpumasks - allocate three cpumasks for cpuset
- * @cs: the cpuset that have cpumasks to be allocated.
- * @tmp: the tmpmasks structure pointer
+ * alloc_cpumasks - Allocate an array of cpumask variables
+ * @pmasks: Pointer to array of cpumask_var_t pointers
+ * @size: Number of cpumasks to allocate
* Return: 0 if successful, -ENOMEM otherwise.
*
- * Only one of the two input arguments should be non-NULL.
+ * Allocates @size cpumasks and initializes them to empty. Returns 0 on
+ * success, -ENOMEM on allocation failure. On failure, any previously
+ * allocated cpumasks are freed.
*/
-static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
+static inline int alloc_cpumasks(cpumask_var_t *pmasks[], u32 size)
{
- cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4;
+ int i;
- if (cs) {
- pmask1 = &cs->cpus_allowed;
- pmask2 = &cs->effective_cpus;
- pmask3 = &cs->effective_xcpus;
- pmask4 = &cs->exclusive_cpus;
- } else {
- pmask1 = &tmp->new_cpus;
- pmask2 = &tmp->addmask;
- pmask3 = &tmp->delmask;
- pmask4 = NULL;
+ for (i = 0; i < size; i++) {
+ if (!zalloc_cpumask_var(pmasks[i], GFP_KERNEL)) {
+ while (--i >= 0)
+ free_cpumask_var(*pmasks[i]);
+ return -ENOMEM;
+ }
}
-
- if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
- return -ENOMEM;
-
- if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
- goto free_one;
-
- if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
- goto free_two;
-
- if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL))
- goto free_three;
-
-
return 0;
+}
+
+/**
+ * alloc_tmpmasks - Allocate temporary cpumasks for cpuset operations.
+ * @tmp: Pointer to tmpmasks structure to populate
+ * Return: 0 on success, -ENOMEM on allocation failure
+ */
+static inline int alloc_tmpmasks(struct tmpmasks *tmp)
+{
+ /*
+ * Array of pointers to the three cpumask_var_t fields in tmpmasks.
+ * Note: the array size must match the actual number of masks (3).
+ */
+ cpumask_var_t *pmask[3] = {
+ &tmp->new_cpus,
+ &tmp->addmask,
+ &tmp->delmask
+ };
-free_three:
- free_cpumask_var(*pmask3);
-free_two:
- free_cpumask_var(*pmask2);
-free_one:
- free_cpumask_var(*pmask1);
- return -ENOMEM;
+ return alloc_cpumasks(pmask, ARRAY_SIZE(pmask));
}
/**
- * free_cpumasks - free cpumasks in a tmpmasks structure
- * @cs: the cpuset that have cpumasks to be free.
+ * free_tmpmasks - free cpumasks in a tmpmasks structure
* @tmp: the tmpmasks structure pointer
*/
-static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
+static inline void free_tmpmasks(struct tmpmasks *tmp)
{
- if (cs) {
- free_cpumask_var(cs->cpus_allowed);
- free_cpumask_var(cs->effective_cpus);
- free_cpumask_var(cs->effective_xcpus);
- free_cpumask_var(cs->exclusive_cpus);
- }
- if (tmp) {
- free_cpumask_var(tmp->new_cpus);
- free_cpumask_var(tmp->addmask);
- free_cpumask_var(tmp->delmask);
- }
+ if (!tmp)
+ return;
+
+ free_cpumask_var(tmp->new_cpus);
+ free_cpumask_var(tmp->addmask);
+ free_cpumask_var(tmp->delmask);
}
/**
- * alloc_trial_cpuset - allocate a trial cpuset
- * @cs: the cpuset that the trial cpuset duplicates
+ * dup_or_alloc_cpuset - Duplicate or allocate a new cpuset
+ * @cs: Source cpuset to duplicate (NULL for a fresh allocation)
+ *
+ * Creates a new cpuset by either:
+ * 1. Duplicating an existing cpuset (if @cs is non-NULL), or
+ * 2. Allocating a fresh cpuset with zero-initialized masks (if @cs is NULL)
+ *
+ * Return: Pointer to newly allocated cpuset on success, NULL on failure
*/
-static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
+static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs)
{
struct cpuset *trial;
- trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
+ /* Allocate base structure */
+ trial = cs ? kmemdup(cs, sizeof(*cs), GFP_KERNEL) :
+ kzalloc(sizeof(*cs), GFP_KERNEL);
if (!trial)
return NULL;
- if (alloc_cpumasks(trial, NULL)) {
+ /* Setup cpumask pointer array */
+ cpumask_var_t *pmask[4] = {
+ &trial->cpus_allowed,
+ &trial->effective_cpus,
+ &trial->effective_xcpus,
+ &trial->exclusive_cpus
+ };
+
+ if (alloc_cpumasks(pmask, ARRAY_SIZE(pmask))) {
kfree(trial);
return NULL;
}
- cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
- cpumask_copy(trial->effective_cpus, cs->effective_cpus);
- cpumask_copy(trial->effective_xcpus, cs->effective_xcpus);
- cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus);
+ /* Copy masks if duplicating */
+ if (cs) {
+ cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+ cpumask_copy(trial->effective_cpus, cs->effective_cpus);
+ cpumask_copy(trial->effective_xcpus, cs->effective_xcpus);
+ cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus);
+ }
+
return trial;
}
@@ -508,7 +543,10 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
*/
static inline void free_cpuset(struct cpuset *cs)
{
- free_cpumasks(cs, NULL);
+ free_cpumask_var(cs->cpus_allowed);
+ free_cpumask_var(cs->effective_cpus);
+ free_cpumask_var(cs->effective_xcpus);
+ free_cpumask_var(cs->exclusive_cpus);
kfree(cs);
}
@@ -540,6 +578,47 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
return true;
}
+/**
+ * cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts
+ * @cs1: first cpuset to check
+ * @cs2: second cpuset to check
+ *
+ * Returns: true if CPU exclusivity conflict exists, false otherwise
+ *
+ * Conflict detection rules:
+ * 1. If either cpuset is CPU exclusive, they must be mutually exclusive
+ * 2. exclusive_cpus masks cannot intersect between cpusets
+ * 3. The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs
+ */
+static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
+{
+ /* If either cpuset is exclusive, check if they are mutually exclusive */
+ if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2))
+ return !cpusets_are_exclusive(cs1, cs2);
+
+ /* Exclusive_cpus cannot intersect */
+ if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus))
+ return true;
+
+ /* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */
+ if (!cpumask_empty(cs1->cpus_allowed) &&
+ cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus))
+ return true;
+
+ if (!cpumask_empty(cs2->cpus_allowed) &&
+ cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus))
+ return true;
+
+ return false;
+}
+
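+/*
+ * mems_excl_conflict - Check if two cpusets have exclusive memory node conflicts
+ *
+ * Returns true when either cpuset is memory-exclusive and their mems_allowed
+ * masks intersect.
+ */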
+static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
+{
+ if ((is_mem_exclusive(cs1) || is_mem_exclusive(cs2)))
+ return nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
+ return false;
+}
+
/*
* validate_change() - Used to validate that any proposed cpuset change
* follows the structural rules for cpusets.
@@ -621,38 +700,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
*/
ret = -EINVAL;
cpuset_for_each_child(c, css, par) {
- bool txset, cxset; /* Are exclusive_cpus set? */
-
if (c == cur)
continue;
-
- txset = !cpumask_empty(trial->exclusive_cpus);
- cxset = !cpumask_empty(c->exclusive_cpus);
- if (is_cpu_exclusive(trial) || is_cpu_exclusive(c) ||
- (txset && cxset)) {
- if (!cpusets_are_exclusive(trial, c))
- goto out;
- } else if (txset || cxset) {
- struct cpumask *xcpus, *acpus;
-
- /*
- * When just one of the exclusive_cpus's is set,
- * cpus_allowed of the other cpuset, if set, cannot be
- * a subset of it or none of those CPUs will be
- * available if these exclusive CPUs are activated.
- */
- if (txset) {
- xcpus = trial->exclusive_cpus;
- acpus = c->cpus_allowed;
- } else {
- xcpus = c->exclusive_cpus;
- acpus = trial->cpus_allowed;
- }
- if (!cpumask_empty(acpus) && cpumask_subset(acpus, xcpus))
- goto out;
- }
- if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
- nodes_intersects(trial->mems_allowed, c->mems_allowed))
+ if (cpus_excl_conflict(trial, c))
+ goto out;
+ if (mems_excl_conflict(trial, c))
goto out;
}
@@ -1363,38 +1415,25 @@ bool cpuset_cpu_is_isolated(int cpu)
}
EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
-/*
- * compute_effective_exclusive_cpumask - compute effective exclusive CPUs
- * @cs: cpuset
- * @xcpus: effective exclusive CPUs value to be set
- * @real_cs: the real cpuset (can be NULL)
- * Return: 0 if there is no sibling conflict, > 0 otherwise
+/**
+ * rm_siblings_excl_cpus - Remove exclusive CPUs that are used by sibling cpusets
+ * @parent: Parent cpuset containing all siblings
+ * @cs: Current cpuset (will be skipped)
+ * @excpus: exclusive effective CPU mask to modify
*
- * If exclusive_cpus isn't explicitly set or a real_cs is provided, we have to
- * scan the sibling cpusets and exclude their exclusive_cpus or effective_xcpus
- * as well. The provision of real_cs means that a cpumask is being changed and
- * the given cs is a trial one.
+ * This function ensures the given @excpus mask doesn't include any CPUs that
+ * are exclusively allocated to sibling cpusets. It walks through all siblings
+ * of @cs under @parent and removes their exclusive CPUs from @excpus.
*/
-static int compute_effective_exclusive_cpumask(struct cpuset *cs,
- struct cpumask *xcpus,
- struct cpuset *real_cs)
+static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs,
+ struct cpumask *excpus)
{
struct cgroup_subsys_state *css;
- struct cpuset *parent = parent_cs(cs);
struct cpuset *sibling;
int retval = 0;
- if (!xcpus)
- xcpus = cs->effective_xcpus;
-
- cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus);
-
- if (!real_cs) {
- if (!cpumask_empty(cs->exclusive_cpus))
- return 0;
- } else {
- cs = real_cs;
- }
+ if (cpumask_empty(excpus))
+ return retval;
/*
* Exclude exclusive CPUs from siblings
@@ -1404,20 +1443,66 @@ static int compute_effective_exclusive_cpumask(struct cpuset *cs,
if (sibling == cs)
continue;
- if (cpumask_intersects(xcpus, sibling->exclusive_cpus)) {
- cpumask_andnot(xcpus, xcpus, sibling->exclusive_cpus);
+ if (cpumask_intersects(excpus, sibling->exclusive_cpus)) {
+ cpumask_andnot(excpus, excpus, sibling->exclusive_cpus);
retval++;
continue;
}
- if (cpumask_intersects(xcpus, sibling->effective_xcpus)) {
- cpumask_andnot(xcpus, xcpus, sibling->effective_xcpus);
+ if (cpumask_intersects(excpus, sibling->effective_xcpus)) {
+ cpumask_andnot(excpus, excpus, sibling->effective_xcpus);
retval++;
}
}
rcu_read_unlock();
+
return retval;
}
+/*
+ * compute_excpus - compute effective exclusive CPUs
+ * @cs: cpuset
+ * @xcpus: effective exclusive CPUs value to be set
+ * Return: 0 if there is no sibling conflict, > 0 otherwise
+ *
+ * If exclusive_cpus isn't explicitly set, we have to scan the sibling cpusets
+ * and exclude their exclusive_cpus or effective_xcpus as well.
+ */
+static int compute_excpus(struct cpuset *cs, struct cpumask *excpus)
+{
+ struct cpuset *parent = parent_cs(cs);
+
+ cpumask_and(excpus, user_xcpus(cs), parent->effective_xcpus);
+
+ if (!cpumask_empty(cs->exclusive_cpus))
+ return 0;
+
+ return rm_siblings_excl_cpus(parent, cs, excpus);
+}
+
+/*
+ * compute_trialcs_excpus - Compute effective exclusive CPUs for a trial cpuset
+ * @trialcs: The trial cpuset containing the proposed new configuration
+ * @cs: The original cpuset that the trial configuration is based on
+ * Return: 0 if successful with no sibling conflict, >0 if a conflict is found
+ *
+ * Computes the effective_xcpus for a trial configuration. @cs identifies the
+ * real cpuset whose siblings are checked for exclusive CPU conflicts.
+ */
+static int compute_trialcs_excpus(struct cpuset *trialcs, struct cpuset *cs)
+{
+ struct cpuset *parent = parent_cs(trialcs);
+ struct cpumask *excpus = trialcs->effective_xcpus;
+
+ /* When @cs is a member, cpuset.cpus has no impact on excpus */
+ if (cs_is_member(cs))
+ cpumask_and(excpus, trialcs->exclusive_cpus,
+ parent->effective_xcpus);
+ else
+ cpumask_and(excpus, user_xcpus(trialcs), parent->effective_xcpus);
+
+ return rm_siblings_excl_cpus(parent, cs, excpus);
+}
+
static inline bool is_remote_partition(struct cpuset *cs)
{
return !list_empty(&cs->remote_sibling);
@@ -1459,7 +1544,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
* Note that creating a remote partition with any local partition root
* above it or remote partition root underneath it is not allowed.
*/
- compute_effective_exclusive_cpumask(cs, tmp->new_cpus, NULL);
+ compute_excpus(cs, tmp->new_cpus);
WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus));
if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) ||
cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
@@ -1508,7 +1593,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
cs->partition_root_state = PRS_MEMBER;
/* effective_xcpus may need to be changed */
- compute_effective_exclusive_cpumask(cs, NULL, NULL);
+ compute_excpus(cs, cs->effective_xcpus);
reset_partition_data(cs);
spin_unlock_irq(&callback_lock);
update_unbound_workqueue_cpumask(isolcpus_updated);
@@ -1677,7 +1762,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
old_prs = new_prs = cs->partition_root_state;
if (cmd == partcmd_invalidate) {
- if (is_prs_invalid(old_prs))
+ if (is_partition_invalid(cs))
return 0;
/*
@@ -1709,13 +1794,14 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) {
/*
- * Need to call compute_effective_exclusive_cpumask() in case
+ * Need to call compute_excpus() in case
* exclusive_cpus not set. Sibling conflict should only happen
* if exclusive_cpus isn't set.
*/
xcpus = tmp->delmask;
- if (compute_effective_exclusive_cpumask(cs, xcpus, NULL))
+ if (compute_excpus(cs, xcpus))
WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus));
+ new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
/*
* Enabling partition root is not allowed if its
@@ -1727,11 +1813,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
if (prstate_housekeeping_conflict(new_prs, xcpus))
return PERR_HKEEPING;
- /*
- * A parent can be left with no CPU as long as there is no
- * task directly associated with the parent partition.
- */
- if (nocpu)
+ if (tasks_nocpu_error(parent, cs, xcpus))
return PERR_NOCPUS;
/*
@@ -1748,7 +1830,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
deleting = true;
subparts_delta++;
- new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
} else if (cmd == partcmd_disable) {
/*
* May need to add cpus back to parent's effective_cpus
@@ -1788,7 +1869,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
* For invalid partition:
* delmask = newmask & parent->effective_xcpus
*/
- if (is_prs_invalid(old_prs)) {
+ if (is_partition_invalid(cs)) {
adding = false;
deleting = cpumask_and(tmp->delmask,
newmask, parent->effective_xcpus);
@@ -1837,13 +1918,12 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
* A partition error happens when parent has tasks and all
* its effective CPUs will have to be distributed out.
*/
- WARN_ON_ONCE(!is_partition_valid(parent));
if (nocpu) {
part_error = PERR_NOCPUS;
if (is_partition_valid(cs))
adding = cpumask_and(tmp->addmask,
xcpus, parent->effective_xcpus);
- } else if (is_partition_invalid(cs) &&
+ } else if (is_partition_invalid(cs) && !cpumask_empty(xcpus) &&
cpumask_subset(xcpus, parent->effective_xcpus)) {
struct cgroup_subsys_state *css;
struct cpuset *child;
@@ -1996,7 +2076,7 @@ static void compute_partition_effective_cpumask(struct cpuset *cs,
* 2) All the effective_cpus will be used up and cp
* has tasks
*/
- compute_effective_exclusive_cpumask(cs, new_ecpus, NULL);
+ compute_excpus(cs, new_ecpus);
cpumask_and(new_ecpus, new_ecpus, cpu_active_mask);
rcu_read_lock();
@@ -2075,7 +2155,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
* its value is being processed.
*/
if (remote && (cp != cs)) {
- compute_effective_exclusive_cpumask(cp, tmp->new_cpus, NULL);
+ compute_excpus(cp, tmp->new_cpus);
if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) {
pos_css = css_rightmost_descendant(pos_css);
continue;
@@ -2177,7 +2257,7 @@ get_css:
cpumask_copy(cp->effective_cpus, tmp->new_cpus);
cp->partition_root_state = new_prs;
if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs))
- compute_effective_exclusive_cpumask(cp, NULL, NULL);
+ compute_excpus(cp, cp->effective_xcpus);
/*
* Make sure effective_xcpus is properly set for a valid
@@ -2284,82 +2364,54 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
rcu_read_unlock();
}
-/**
- * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
- * @cs: the cpuset to consider
- * @trialcs: trial cpuset
- * @buf: buffer of cpu numbers written to this cpuset
- */
-static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
- const char *buf)
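+/*
+ * Parse a cpulist from @buf into @out_mask and reject any mask that is not a
+ * subset of top_cpuset.cpus_allowed.
+ */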
+static int parse_cpuset_cpulist(const char *buf, struct cpumask *out_mask)
{
int retval;
- struct tmpmasks tmp;
- struct cpuset *parent = parent_cs(cs);
- bool invalidate = false;
- bool force = false;
- int old_prs = cs->partition_root_state;
- /* top_cpuset.cpus_allowed tracks cpu_active_mask; it's read-only */
- if (cs == &top_cpuset)
- return -EACCES;
+ retval = cpulist_parse(buf, out_mask);
+ if (retval < 0)
+ return retval;
+ if (!cpumask_subset(out_mask, top_cpuset.cpus_allowed))
+ return -EINVAL;
- /*
- * An empty cpus_allowed is ok only if the cpuset has no tasks.
- * Since cpulist_parse() fails on an empty mask, we special case
- * that parsing. The validate_change() call ensures that cpusets
- * with tasks have cpus.
- */
- if (!*buf) {
- cpumask_clear(trialcs->cpus_allowed);
- if (cpumask_empty(trialcs->exclusive_cpus))
- cpumask_clear(trialcs->effective_xcpus);
- } else {
- retval = cpulist_parse(buf, trialcs->cpus_allowed);
- if (retval < 0)
- return retval;
+ return 0;
+}
- if (!cpumask_subset(trialcs->cpus_allowed,
- top_cpuset.cpus_allowed))
- return -EINVAL;
+/**
+ * validate_partition - Validate a cpuset partition configuration
+ * @cs: The cpuset to validate
+ * @trialcs: The trial cpuset containing proposed configuration changes
+ *
+ * If any validation check fails, the corresponding PRS error code is
+ * returned and the caller records it in the cpuset's prs_err field.
+ *
+ * Return: PRS error code (0 if valid, non-zero error code if invalid)
+ */
+static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *trialcs)
+{
+ struct cpuset *parent = parent_cs(cs);
- /*
- * When exclusive_cpus isn't explicitly set, it is constrained
- * by cpus_allowed and parent's effective_xcpus. Otherwise,
- * trialcs->effective_xcpus is used as a temporary cpumask
- * for checking validity of the partition root.
- */
- trialcs->partition_root_state = PRS_MEMBER;
- if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs))
- compute_effective_exclusive_cpumask(trialcs, NULL, cs);
- }
+ if (cs_is_member(trialcs))
+ return PERR_NONE;
- /* Nothing to do if the cpus didn't change */
- if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
- return 0;
+ if (cpumask_empty(trialcs->effective_xcpus))
+ return PERR_INVCPUS;
- if (alloc_cpumasks(NULL, &tmp))
- return -ENOMEM;
+ if (prstate_housekeeping_conflict(trialcs->partition_root_state,
+ trialcs->effective_xcpus))
+ return PERR_HKEEPING;
- if (old_prs) {
- if (is_partition_valid(cs) &&
- cpumask_empty(trialcs->effective_xcpus)) {
- invalidate = true;
- cs->prs_err = PERR_INVCPUS;
- } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
- invalidate = true;
- cs->prs_err = PERR_HKEEPING;
- } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
- invalidate = true;
- cs->prs_err = PERR_NOCPUS;
- }
- }
+ if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus))
+ return PERR_NOCPUS;
- /*
- * Check all the descendants in update_cpumasks_hier() if
- * effective_xcpus is to be changed.
- */
- force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
+ return PERR_NONE;
+}
+
+static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs,
+ struct tmpmasks *tmp)
+{
+ int retval;
+ struct cpuset *parent = parent_cs(cs);
retval = validate_change(cs, trialcs);
@@ -2374,7 +2426,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* partition. However, any conflicting sibling partitions
* have to be marked as invalid too.
*/
- invalidate = true;
+ trialcs->prs_err = PERR_NOTEXCL;
rcu_read_lock();
cpuset_for_each_child(cp, css, parent) {
struct cpumask *xcpus = user_xcpus(trialcs);
@@ -2382,36 +2434,92 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (is_partition_valid(cp) &&
cpumask_intersects(xcpus, cp->effective_xcpus)) {
rcu_read_unlock();
- update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp);
+ update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp);
rcu_read_lock();
}
}
rcu_read_unlock();
retval = 0;
}
+ return retval;
+}
- if (retval < 0)
- goto out_free;
+/**
+ * partition_cpus_change - Handle partition state changes due to CPU mask updates
+ * @cs: The target cpuset being modified
+ * @trialcs: The trial cpuset containing proposed configuration changes
+ * @tmp: Temporary masks for intermediate calculations
+ *
+ * This function handles partition state transitions triggered by CPU mask changes.
+ * CPU modifications may cause a partition to be disabled or require state updates.
+ */
+static void partition_cpus_change(struct cpuset *cs, struct cpuset *trialcs,
+ struct tmpmasks *tmp)
+{
+ enum prs_errcode prs_err;
- if (is_partition_valid(cs) ||
- (is_partition_invalid(cs) && !invalidate)) {
- struct cpumask *xcpus = trialcs->effective_xcpus;
+ if (cs_is_member(cs))
+ return;
- if (cpumask_empty(xcpus) && is_partition_invalid(cs))
- xcpus = trialcs->cpus_allowed;
+ prs_err = validate_partition(cs, trialcs);
+ if (prs_err)
+ trialcs->prs_err = cs->prs_err = prs_err;
- /*
- * Call remote_cpus_update() to handle valid remote partition
- */
- if (is_remote_partition(cs))
- remote_cpus_update(cs, NULL, xcpus, &tmp);
- else if (invalidate)
+ if (is_remote_partition(cs)) {
+ if (trialcs->prs_err)
+ remote_partition_disable(cs, tmp);
+ else
+ remote_cpus_update(cs, trialcs->exclusive_cpus,
+ trialcs->effective_xcpus, tmp);
+ } else {
+ if (trialcs->prs_err)
update_parent_effective_cpumask(cs, partcmd_invalidate,
- NULL, &tmp);
+ NULL, tmp);
else
update_parent_effective_cpumask(cs, partcmd_update,
- xcpus, &tmp);
+ trialcs->effective_xcpus, tmp);
}
+}
+
+/**
+ * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
+ * @cs: the cpuset to consider
+ * @trialcs: trial cpuset
+ * @buf: buffer of cpu numbers written to this cpuset
+ */
+static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
+ const char *buf)
+{
+ int retval;
+ struct tmpmasks tmp;
+ bool force = false;
+ int old_prs = cs->partition_root_state;
+
+ retval = parse_cpuset_cpulist(buf, trialcs->cpus_allowed);
+ if (retval < 0)
+ return retval;
+
+ /* Nothing to do if the cpus didn't change */
+ if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
+ return 0;
+
+ if (alloc_tmpmasks(&tmp))
+ return -ENOMEM;
+
+ compute_trialcs_excpus(trialcs, cs);
+ trialcs->prs_err = PERR_NONE;
+
+ retval = cpus_allowed_validate_change(cs, trialcs, &tmp);
+ if (retval < 0)
+ goto out_free;
+
+ /*
+ * Check all the descendants in update_cpumasks_hier() if
+ * effective_xcpus is to be changed.
+ */
+ force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
+
+ partition_cpus_change(cs, trialcs, &tmp);
spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
@@ -2427,7 +2535,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (cs->partition_root_state)
update_partition_sd_lb(cs, old_prs);
out_free:
- free_cpumasks(NULL, &tmp);
+ free_tmpmasks(&tmp);
return retval;
}
@@ -2444,33 +2552,23 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
{
int retval;
struct tmpmasks tmp;
- struct cpuset *parent = parent_cs(cs);
- bool invalidate = false;
bool force = false;
int old_prs = cs->partition_root_state;
- if (!*buf) {
- cpumask_clear(trialcs->exclusive_cpus);
- cpumask_clear(trialcs->effective_xcpus);
- } else {
- retval = cpulist_parse(buf, trialcs->exclusive_cpus);
- if (retval < 0)
- return retval;
- }
+ retval = parse_cpuset_cpulist(buf, trialcs->exclusive_cpus);
+ if (retval < 0)
+ return retval;
/* Nothing to do if the CPUs didn't change */
if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
return 0;
- if (*buf) {
- trialcs->partition_root_state = PRS_MEMBER;
- /*
- * Reject the change if there is exclusive CPUs conflict with
- * the siblings.
- */
- if (compute_effective_exclusive_cpumask(trialcs, NULL, cs))
- return -EINVAL;
- }
+ /*
+ * Reject the change if there is exclusive CPUs conflict with
+ * the siblings.
+ */
+ if (compute_trialcs_excpus(trialcs, cs))
+ return -EINVAL;
/*
* Check all the descendants in update_cpumasks_hier() if
@@ -2482,35 +2580,12 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval)
return retval;
- if (alloc_cpumasks(NULL, &tmp))
+ if (alloc_tmpmasks(&tmp))
return -ENOMEM;
- if (old_prs) {
- if (cpumask_empty(trialcs->effective_xcpus)) {
- invalidate = true;
- cs->prs_err = PERR_INVCPUS;
- } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
- invalidate = true;
- cs->prs_err = PERR_HKEEPING;
- } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
- invalidate = true;
- cs->prs_err = PERR_NOCPUS;
- }
+ trialcs->prs_err = PERR_NONE;
+ partition_cpus_change(cs, trialcs, &tmp);
- if (is_remote_partition(cs)) {
- if (invalidate)
- remote_partition_disable(cs, &tmp);
- else
- remote_cpus_update(cs, trialcs->exclusive_cpus,
- trialcs->effective_xcpus, &tmp);
- } else if (invalidate) {
- update_parent_effective_cpumask(cs, partcmd_invalidate,
- NULL, &tmp);
- } else {
- update_parent_effective_cpumask(cs, partcmd_update,
- trialcs->effective_xcpus, &tmp);
- }
- }
spin_lock_irq(&callback_lock);
cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus);
cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
@@ -2530,7 +2605,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (cs->partition_root_state)
update_partition_sd_lb(cs, old_prs);
- free_cpumasks(NULL, &tmp);
+ free_tmpmasks(&tmp);
return 0;
}
@@ -2582,9 +2657,24 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
}
}
-static void cpuset_post_attach(void)
+static void flush_migrate_mm_task_workfn(struct callback_head *head)
{
flush_workqueue(cpuset_migrate_mm_wq);
+ kfree(head);
+}
+
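+/*
+ * Queue a task_work item so that cpuset_migrate_mm_wq is flushed when the
+ * current task returns to user space. If the callback allocation fails, the
+ * flush is skipped and the queued migration work completes asynchronously.
+ */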
+static void schedule_flush_migrate_mm(void)
+{
+ struct callback_head *flush_cb;
+
+ flush_cb = kzalloc(sizeof(struct callback_head), GFP_KERNEL);
+ if (!flush_cb)
+ return;
+
+ init_task_work(flush_cb, flush_migrate_mm_task_workfn);
+
+ if (task_work_add(current, flush_cb, TWA_RESUME))
+ kfree(flush_cb);
}
/*
@@ -2750,32 +2840,17 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
int retval;
/*
- * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
- * it's read-only
- */
- if (cs == &top_cpuset) {
- retval = -EACCES;
- goto done;
- }
-
- /*
* An empty mems_allowed is ok iff there are no tasks in the cpuset.
- * Since nodelist_parse() fails on an empty mask, we special case
- * that parsing. The validate_change() call ensures that cpusets
- * with tasks have memory.
+ * The validate_change() call ensures that cpusets with tasks have memory.
*/
- if (!*buf) {
- nodes_clear(trialcs->mems_allowed);
- } else {
- retval = nodelist_parse(buf, trialcs->mems_allowed);
- if (retval < 0)
- goto done;
+ retval = nodelist_parse(buf, trialcs->mems_allowed);
+ if (retval < 0)
+ goto done;
- if (!nodes_subset(trialcs->mems_allowed,
- top_cpuset.mems_allowed)) {
- retval = -EINVAL;
- goto done;
- }
+ if (!nodes_subset(trialcs->mems_allowed,
+ top_cpuset.mems_allowed)) {
+ retval = -EINVAL;
+ goto done;
}
if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
@@ -2826,7 +2901,7 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
int spread_flag_changed;
int err;
- trialcs = alloc_trial_cpuset(cs);
+ trialcs = dup_or_alloc_cpuset(cs);
if (!trialcs)
return -ENOMEM;
@@ -2884,10 +2959,10 @@ static int update_prstate(struct cpuset *cs, int new_prs)
/*
* Treat a previously invalid partition root as if it is a "member".
*/
- if (new_prs && is_prs_invalid(old_prs))
+ if (new_prs && is_partition_invalid(cs))
old_prs = PRS_MEMBER;
- if (alloc_cpumasks(NULL, &tmpmask))
+ if (alloc_tmpmasks(&tmpmask))
return -ENOMEM;
err = update_partition_exclusive_flag(cs, new_prs);
@@ -2983,7 +3058,7 @@ out:
notify_partition_change(cs, old_prs);
if (force_sd_rebuild)
rebuild_sched_domains_locked();
- free_cpumasks(NULL, &tmpmask);
+ free_tmpmasks(&tmpmask);
return 0;
}
@@ -3141,6 +3216,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
struct cpuset *cs;
struct cpuset *oldcs = cpuset_attach_old_cs;
bool cpus_updated, mems_updated;
+ bool queue_task_work = false;
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
@@ -3191,15 +3267,18 @@ static void cpuset_attach(struct cgroup_taskset *tset)
* @old_mems_allowed is the right nodesets that we
* migrate mm from.
*/
- if (is_memory_migrate(cs))
+ if (is_memory_migrate(cs)) {
cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
&cpuset_attach_nodemask_to);
- else
+ queue_task_work = true;
+ } else
mmput(mm);
}
}
out:
+ if (queue_task_work)
+ schedule_flush_migrate_mm();
cs->old_mems_allowed = cpuset_attach_nodemask_to;
if (cs->nr_migrate_dl_tasks) {
@@ -3223,13 +3302,16 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
struct cpuset *trialcs;
int retval = -ENODEV;
+ /* root is read-only */
+ if (cs == &top_cpuset)
+ return -EACCES;
+
buf = strstrip(buf);
- cpus_read_lock();
- mutex_lock(&cpuset_mutex);
+ cpuset_full_lock();
if (!is_cpuset_online(cs))
goto out_unlock;
- trialcs = alloc_trial_cpuset(cs);
+ trialcs = dup_or_alloc_cpuset(cs);
if (!trialcs) {
retval = -ENOMEM;
goto out_unlock;
@@ -3254,9 +3336,9 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
if (force_sd_rebuild)
rebuild_sched_domains_locked();
out_unlock:
- mutex_unlock(&cpuset_mutex);
- cpus_read_unlock();
- flush_workqueue(cpuset_migrate_mm_wq);
+ cpuset_full_unlock();
+ if (of_cft(of)->private == FILE_MEMLIST)
+ schedule_flush_migrate_mm();
return retval ?: nbytes;
}
@@ -3358,14 +3440,10 @@ static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf,
else
return -EINVAL;
- css_get(&cs->css);
- cpus_read_lock();
- mutex_lock(&cpuset_mutex);
+ cpuset_full_lock();
if (is_cpuset_online(cs))
retval = update_prstate(cs, val);
- mutex_unlock(&cpuset_mutex);
- cpus_read_unlock();
- css_put(&cs->css);
+ cpuset_full_unlock();
return retval ?: nbytes;
}
@@ -3464,15 +3542,10 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
if (!parent_css)
return &top_cpuset.css;
- cs = kzalloc(sizeof(*cs), GFP_KERNEL);
+ cs = dup_or_alloc_cpuset(NULL);
if (!cs)
return ERR_PTR(-ENOMEM);
- if (alloc_cpumasks(cs, NULL)) {
- kfree(cs);
- return ERR_PTR(-ENOMEM);
- }
-
__set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
@@ -3495,10 +3568,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
if (!parent)
return 0;
- cpus_read_lock();
- mutex_lock(&cpuset_mutex);
-
- set_bit(CS_ONLINE, &cs->flags);
+ cpuset_full_lock();
if (is_spread_page(parent))
set_bit(CS_SPREAD_PAGE, &cs->flags);
if (is_spread_slab(parent))
@@ -3550,8 +3620,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
spin_unlock_irq(&callback_lock);
out_unlock:
- mutex_unlock(&cpuset_mutex);
- cpus_read_unlock();
+ cpuset_full_unlock();
return 0;
}
@@ -3566,17 +3635,12 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
- cpus_read_lock();
- mutex_lock(&cpuset_mutex);
-
+ cpuset_full_lock();
if (!cpuset_v2() && is_sched_load_balance(cs))
cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
cpuset_dec();
- clear_bit(CS_ONLINE, &cs->flags);
-
- mutex_unlock(&cpuset_mutex);
- cpus_read_unlock();
+ cpuset_full_unlock();
}
/*
@@ -3588,16 +3652,11 @@ static void cpuset_css_killed(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
- cpus_read_lock();
- mutex_lock(&cpuset_mutex);
-
+ cpuset_full_lock();
/* Reset valid partition back to member */
if (is_partition_valid(cs))
update_prstate(cs, PRS_MEMBER);
-
- mutex_unlock(&cpuset_mutex);
- cpus_read_unlock();
-
+ cpuset_full_unlock();
}
static void cpuset_css_free(struct cgroup_subsys_state *css)
@@ -3726,7 +3785,6 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
.can_attach = cpuset_can_attach,
.cancel_attach = cpuset_cancel_attach,
.attach = cpuset_attach,
- .post_attach = cpuset_post_attach,
.bind = cpuset_bind,
.can_fork = cpuset_can_fork,
.cancel_fork = cpuset_cancel_fork,
@@ -3870,9 +3928,10 @@ retry:
partcmd = partcmd_invalidate;
/*
* On the other hand, an invalid partition root may be transitioned
- * back to a regular one.
+ * back to a regular one with a non-empty effective xcpus.
*/
- else if (is_partition_valid(parent) && is_partition_invalid(cs))
+ else if (is_partition_valid(parent) && is_partition_invalid(cs) &&
+ !cpumask_empty(cs->effective_xcpus))
partcmd = partcmd_update;
if (partcmd >= 0) {
@@ -3929,7 +3988,7 @@ static void cpuset_handle_hotplug(void)
bool on_dfl = is_in_v2_mode();
struct tmpmasks tmp, *ptmp = NULL;
- if (on_dfl && !alloc_cpumasks(NULL, &tmp))
+ if (on_dfl && !alloc_tmpmasks(&tmp))
ptmp = &tmp;
lockdep_assert_cpus_held();
@@ -4009,7 +4068,7 @@ static void cpuset_handle_hotplug(void)
if (force_sd_rebuild)
rebuild_sched_domains_cpuslocked();
- free_cpumasks(NULL, ptmp);
+ free_tmpmasks(ptmp);
}
void cpuset_update_active_cpus(void)
@@ -4074,7 +4133,6 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
struct cpuset *cs;
spin_lock_irqsave(&callback_lock, flags);
- rcu_read_lock();
cs = task_cs(tsk);
if (cs != &top_cpuset)
@@ -4096,7 +4154,6 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
cpumask_copy(pmask, possible_mask);
}
- rcu_read_unlock();
spin_unlock_irqrestore(&callback_lock, flags);
}
@@ -4169,9 +4226,7 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
unsigned long flags;
spin_lock_irqsave(&callback_lock, flags);
- rcu_read_lock();
guarantee_online_mems(task_cs(tsk), &mask);
- rcu_read_unlock();
spin_unlock_irqrestore(&callback_lock, flags);
return mask;
@@ -4266,10 +4321,8 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
/* Not hardwall and node outside mems_allowed: scan up cpusets */
spin_lock_irqsave(&callback_lock, flags);
- rcu_read_lock();
cs = nearest_hardwall_ancestor(task_cs(current));
allowed = node_isset(node, cs->mems_allowed);
- rcu_read_unlock();
spin_unlock_irqrestore(&callback_lock, flags);
return allowed;
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 80aa3f027ac3..81ea38dd6f9d 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -49,7 +49,6 @@ static int current_css_set_read(struct seq_file *seq, void *v)
return -ENODEV;
spin_lock_irq(&css_set_lock);
- rcu_read_lock();
cset = task_css_set(current);
refcnt = refcount_read(&cset->refcount);
seq_printf(seq, "css_set %pK %d", cset, refcnt);
@@ -67,7 +66,6 @@ static int current_css_set_read(struct seq_file *seq, void *v)
seq_printf(seq, "%2d: %-4s\t- %p[%d]\n", ss->id, ss->name,
css, css->id);
}
- rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
cgroup_kn_unlock(of->kn);
return 0;
@@ -95,7 +93,6 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
return -ENOMEM;
spin_lock_irq(&css_set_lock);
- rcu_read_lock();
cset = task_css_set(current);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
@@ -104,7 +101,6 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
seq_printf(seq, "Root %d group %s\n",
c->root->hierarchy_id, name_buf);
}
- rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
kfree(name_buf);
return 0;
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
index bf1690a167dd..6c18854bff34 100644
--- a/kernel/cgroup/freezer.c
+++ b/kernel/cgroup/freezer.c
@@ -171,7 +171,7 @@ static void cgroup_freeze_task(struct task_struct *task, bool freeze)
/*
* Freeze or unfreeze all tasks in the given cgroup.
*/
-static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze)
+static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze, u64 ts_nsec)
{
struct css_task_iter it;
struct task_struct *task;
@@ -179,10 +179,16 @@ static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze)
lockdep_assert_held(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
- if (freeze)
+ write_seqcount_begin(&cgrp->freezer.freeze_seq);
+ if (freeze) {
set_bit(CGRP_FREEZE, &cgrp->flags);
- else
+ cgrp->freezer.freeze_start_nsec = ts_nsec;
+ } else {
clear_bit(CGRP_FREEZE, &cgrp->flags);
+ cgrp->freezer.frozen_nsec += (ts_nsec -
+ cgrp->freezer.freeze_start_nsec);
+ }
+ write_seqcount_end(&cgrp->freezer.freeze_seq);
spin_unlock_irq(&css_set_lock);
if (freeze)
@@ -260,6 +266,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze)
struct cgroup *parent;
struct cgroup *dsct;
bool applied = false;
+ u64 ts_nsec;
bool old_e;
lockdep_assert_held(&cgroup_mutex);
@@ -271,6 +278,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze)
return;
cgrp->freezer.freeze = freeze;
+ ts_nsec = ktime_get_ns();
/*
* Propagate changes downwards the cgroup tree.
@@ -298,7 +306,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze)
/*
* Do change actual state: freeze or unfreeze.
*/
- cgroup_do_freeze(dsct, freeze);
+ cgroup_do_freeze(dsct, freeze, ts_nsec);
applied = true;
}
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index 144a464e45c6..fdbe57578e68 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -5,7 +5,7 @@
#include <linux/slab.h>
#include <linux/nsproxy.h>
#include <linux/proc_ns.h>
-
+#include <linux/nstree.h>
/* cgroup namespaces */
@@ -21,33 +21,32 @@ static void dec_cgroup_namespaces(struct ucounts *ucounts)
static struct cgroup_namespace *alloc_cgroup_ns(void)
{
- struct cgroup_namespace *new_ns;
+ struct cgroup_namespace *new_ns __free(kfree) = NULL;
int ret;
new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT);
if (!new_ns)
return ERR_PTR(-ENOMEM);
- ret = ns_alloc_inum(&new_ns->ns);
- if (ret) {
- kfree(new_ns);
+ ret = ns_common_init(new_ns);
+ if (ret)
return ERR_PTR(ret);
- }
- refcount_set(&new_ns->ns.count, 1);
- new_ns->ns.ops = &cgroupns_operations;
- return new_ns;
+ ns_tree_add(new_ns);
+ return no_free_ptr(new_ns);
}
void free_cgroup_ns(struct cgroup_namespace *ns)
{
+ ns_tree_remove(ns);
put_css_set(ns->root_cset);
dec_cgroup_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
- ns_free_inum(&ns->ns);
- kfree(ns);
+ ns_common_free(ns);
+ /* Concurrent nstree traversal depends on a grace period. */
+ kfree_rcu(ns, ns.ns_rcu);
}
EXPORT_SYMBOL(free_cgroup_ns);
-struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
+struct cgroup_namespace *copy_cgroup_ns(u64 flags,
struct user_namespace *user_ns,
struct cgroup_namespace *old_ns)
{
@@ -90,11 +89,6 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
return new_ns;
}
-static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
-{
- return container_of(ns, struct cgroup_namespace, ns);
-}
-
static int cgroupns_install(struct nsset *nsset, struct ns_common *ns)
{
struct nsproxy *nsproxy = nsset->nsproxy;
@@ -143,7 +137,6 @@ static struct user_namespace *cgroupns_owner(struct ns_common *ns)
const struct proc_ns_operations cgroupns_operations = {
.name = "cgroup",
- .type = CLONE_NEWCGROUP,
.get = cgroupns_get,
.put = cgroupns_put,
.install = cgroupns_install,
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 981e2f77ad4e..a198e40c799b 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -479,6 +479,9 @@ void css_rstat_exit(struct cgroup_subsys_state *css)
if (!css_uses_rstat(css))
return;
+ if (!css->rstat_cpu)
+ return;
+
css_rstat_flush(css);
/* sanity check */
diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config
index 64caaf997fc0..7c3924614e01 100644
--- a/kernel/configs/hardening.config
+++ b/kernel/configs/hardening.config
@@ -93,8 +93,8 @@ CONFIG_SECCOMP_FILTER=y
# Provides some protections against SYN flooding.
CONFIG_SYN_COOKIES=y
-# Enable Kernel Control Flow Integrity (currently Clang only).
-CONFIG_CFI_CLANG=y
+# Enable Kernel Control Flow Integrity.
+CONFIG_CFI=y
# CONFIG_CFI_PERMISSIVE is not set
# Attack surface reduction: do not autoload TTY line disciplines.
diff --git a/kernel/cred.c b/kernel/cred.c
index 9676965c0981..dbf6b687dc5c 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -287,7 +287,7 @@ struct cred *prepare_exec_creds(void)
* The new process gets the current process's subjective credentials as its
* objective and subjective credentials
*/
-int copy_creds(struct task_struct *p, unsigned long clone_flags)
+int copy_creds(struct task_struct *p, u64 clone_flags)
{
struct cred *new;
int ret;
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 67af8a55185d..d9b9dcba6ff7 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -483,8 +483,6 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem)
pr_err("Reserved memory: unable to setup CMA region\n");
return err;
}
- /* Architecture specific contiguous memory fixup. */
- dma_contiguous_early_fixup(rmem->base, rmem->size);
if (default_cma)
dma_contiguous_default_area = cma;
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index e43c6de2bce4..b82399437db0 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -39,6 +39,7 @@ enum {
dma_debug_sg,
dma_debug_coherent,
dma_debug_resource,
+ dma_debug_noncoherent,
};
enum map_err_types {
@@ -141,6 +142,7 @@ static const char *type2name[] = {
[dma_debug_sg] = "scatter-gather",
[dma_debug_coherent] = "coherent",
[dma_debug_resource] = "resource",
+ [dma_debug_noncoherent] = "noncoherent",
};
static const char *dir2name[] = {
@@ -993,7 +995,8 @@ static void check_unmap(struct dma_debug_entry *ref)
"[mapped as %s] [unmapped as %s]\n",
ref->dev_addr, ref->size,
type2name[entry->type], type2name[ref->type]);
- } else if (entry->type == dma_debug_coherent &&
+ } else if ((entry->type == dma_debug_coherent ||
+ entry->type == dma_debug_noncoherent) &&
ref->paddr != entry->paddr) {
err_printk(ref->dev, entry, "device driver frees "
"DMA memory with different CPU address "
@@ -1581,6 +1584,49 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
}
}
+void debug_dma_alloc_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr,
+ unsigned long attrs)
+{
+ struct dma_debug_entry *entry;
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ entry = dma_entry_alloc();
+ if (!entry)
+ return;
+
+ entry->type = dma_debug_noncoherent;
+ entry->dev = dev;
+ entry->paddr = page_to_phys(page);
+ entry->size = size;
+ entry->dev_addr = dma_addr;
+ entry->direction = direction;
+
+ add_dma_entry(entry, attrs);
+}
+
+void debug_dma_free_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr)
+{
+ struct dma_debug_entry ref = {
+ .type = dma_debug_noncoherent,
+ .dev = dev,
+ .paddr = page_to_phys(page),
+ .dev_addr = dma_addr,
+ .size = size,
+ .direction = direction,
+ };
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ check_unmap(&ref);
+}
+
static int __init dma_debug_driver_setup(char *str)
{
int i;
diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h
index f525197d3cae..48757ca13f31 100644
--- a/kernel/dma/debug.h
+++ b/kernel/dma/debug.h
@@ -54,6 +54,13 @@ extern void debug_dma_sync_sg_for_cpu(struct device *dev,
extern void debug_dma_sync_sg_for_device(struct device *dev,
struct scatterlist *sg,
int nelems, int direction);
+extern void debug_dma_alloc_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr,
+ unsigned long attrs);
+extern void debug_dma_free_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr);
#else /* CONFIG_DMA_API_DEBUG */
static inline void debug_dma_map_page(struct device *dev, struct page *page,
size_t offset, size_t size,
@@ -126,5 +133,18 @@ static inline void debug_dma_sync_sg_for_device(struct device *dev,
int nelems, int direction)
{
}
+
+static inline void debug_dma_alloc_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr,
+ unsigned long attrs)
+{
+}
+
+static inline void debug_dma_free_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr)
+{
+}
#endif /* CONFIG_DMA_API_DEBUG */
#endif /* _KERNEL_DMA_DEBUG_H */
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 107e4a4d251d..56de28a3b179 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -712,7 +712,7 @@ struct page *dma_alloc_pages(struct device *dev, size_t size,
if (page) {
trace_dma_alloc_pages(dev, page_to_virt(page), *dma_handle,
size, dir, gfp, 0);
- debug_dma_map_page(dev, page, 0, size, dir, *dma_handle, 0);
+ debug_dma_alloc_pages(dev, page, size, dir, *dma_handle, 0);
} else {
trace_dma_alloc_pages(dev, NULL, 0, size, dir, gfp, 0);
}
@@ -738,7 +738,7 @@ void dma_free_pages(struct device *dev, size_t size, struct page *page,
dma_addr_t dma_handle, enum dma_data_direction dir)
{
trace_dma_free_pages(dev, page_to_virt(page), dma_handle, size, dir, 0);
- debug_dma_unmap_page(dev, dma_handle, size, dir);
+ debug_dma_free_pages(dev, page, size, dir, dma_handle);
__dma_free_pages(dev, size, page, dma_handle, dir);
}
EXPORT_SYMBOL_GPL(dma_free_pages);
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index 7b04f7575796..ee45dee33d49 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -102,8 +102,8 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
#ifdef CONFIG_DMA_DIRECT_REMAP
addr = dma_common_contiguous_remap(page, pool_size,
- pgprot_dmacoherent(PAGE_KERNEL),
- __builtin_return_address(0));
+ pgprot_decrypted(pgprot_dmacoherent(PAGE_KERNEL)),
+ __builtin_return_address(0));
if (!addr)
goto free_page;
#else
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 408d28b5179d..f62e1d1b2063 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -143,6 +143,20 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
return ret;
}
+/**
+ * arch_irqentry_exit_need_resched - Architecture specific need resched function
+ *
+ * Invoked from raw_irqentry_exit_cond_resched() to check if resched is needed.
+ * The default implementation returns true.
+ *
+ * The main purpose is to permit an architecture to prevent preemption of a
+ * task from an IRQ.
+ */
+static inline bool arch_irqentry_exit_need_resched(void);
+
+#ifndef arch_irqentry_exit_need_resched
+static inline bool arch_irqentry_exit_need_resched(void) { return true; }
+#endif
+
void raw_irqentry_exit_cond_resched(void)
{
if (!preempt_count()) {
@@ -150,7 +164,7 @@ void raw_irqentry_exit_cond_resched(void)
rcu_irq_exit_check_preempt();
if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
WARN_ON_ONCE(!on_thread_stack());
- if (need_resched())
+ if (need_resched() && arch_irqentry_exit_need_resched())
preempt_schedule_irq();
}
}
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 6c83ad674d01..808c0d7a31fa 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -217,22 +217,26 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr
}
struct perf_callchain_entry *
-get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
u32 max_stack, bool crosstask, bool add_mark)
{
struct perf_callchain_entry *entry;
struct perf_callchain_entry_ctx ctx;
int rctx, start_entry_idx;
+ /* crosstask is not supported for user stacks */
+ if (crosstask && user && !kernel)
+ return NULL;
+
entry = get_callchain_entry(&rctx);
if (!entry)
return NULL;
- ctx.entry = entry;
- ctx.max_stack = max_stack;
- ctx.nr = entry->nr = init_nr;
- ctx.contexts = 0;
- ctx.contexts_maxed = false;
+ ctx.entry = entry;
+ ctx.max_stack = max_stack;
+ ctx.nr = entry->nr = 0;
+ ctx.contexts = 0;
+ ctx.contexts_maxed = false;
if (kernel && !user_mode(regs)) {
if (add_mark)
@@ -240,25 +244,19 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
perf_callchain_kernel(&ctx, regs);
}
- if (user) {
+ if (user && !crosstask) {
if (!user_mode(regs)) {
- if (current->mm)
- regs = task_pt_regs(current);
- else
- regs = NULL;
- }
-
- if (regs) {
- if (crosstask)
+ if (current->flags & (PF_KTHREAD | PF_USER_WORKER))
goto exit_put;
+ regs = task_pt_regs(current);
+ }
- if (add_mark)
- perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
+ if (add_mark)
+ perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
- start_entry_idx = entry->nr;
- perf_callchain_user(&ctx, regs);
- fixup_uretprobe_trampoline_entries(entry, start_entry_idx);
- }
+ start_entry_idx = entry->nr;
+ perf_callchain_user(&ctx, regs);
+ fixup_uretprobe_trampoline_entries(entry, start_entry_idx);
}
exit_put:
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 8060c2857bb2..fb1eae762044 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2665,6 +2665,9 @@ static void perf_log_itrace_start(struct perf_event *event);
static void perf_event_unthrottle(struct perf_event *event, bool start)
{
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return;
+
event->hw.interrupts = 0;
if (start)
event->pmu->start(event, 0);
@@ -2674,6 +2677,9 @@ static void perf_event_unthrottle(struct perf_event *event, bool start)
static void perf_event_throttle(struct perf_event *event)
{
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return;
+
event->hw.interrupts = MAX_INTERRUPTS;
event->pmu->stop(event, 0);
if (event == event->group_leader)
@@ -3968,7 +3974,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx,
*/
static inline bool event_update_userpage(struct perf_event *event)
{
- if (likely(!atomic_read(&event->mmap_count)))
+ if (likely(!refcount_read(&event->mmap_count)))
return false;
perf_event_update_time(event);
@@ -6704,11 +6710,11 @@ static void perf_mmap_open(struct vm_area_struct *vma)
struct perf_event *event = vma->vm_file->private_data;
mapped_f mapped = get_mapped(event, event_mapped);
- atomic_inc(&event->mmap_count);
- atomic_inc(&event->rb->mmap_count);
+ refcount_inc(&event->mmap_count);
+ refcount_inc(&event->rb->mmap_count);
if (vma->vm_pgoff)
- atomic_inc(&event->rb->aux_mmap_count);
+ refcount_inc(&event->rb->aux_mmap_count);
if (mapped)
mapped(event, vma->vm_mm);
@@ -6743,7 +6749,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
* to avoid complications.
*/
if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
- atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) {
+ refcount_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) {
/*
* Stop all AUX events that are writing to this buffer,
* so that we can free its AUX pages and corresponding PMU
@@ -6763,10 +6769,10 @@ static void perf_mmap_close(struct vm_area_struct *vma)
mutex_unlock(&rb->aux_mutex);
}
- if (atomic_dec_and_test(&rb->mmap_count))
+ if (refcount_dec_and_test(&rb->mmap_count))
detach_rest = true;
- if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+ if (!refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
goto out_put;
ring_buffer_attach(event, NULL);
@@ -6927,230 +6933,242 @@ static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma)
return err;
}
-static int perf_mmap(struct file *file, struct vm_area_struct *vma)
+static bool perf_mmap_calc_limits(struct vm_area_struct *vma, long *user_extra, long *extra)
{
- struct perf_event *event = file->private_data;
- unsigned long user_locked, user_lock_limit;
+ unsigned long user_locked, user_lock_limit, locked, lock_limit;
struct user_struct *user = current_user();
- struct mutex *aux_mutex = NULL;
- struct perf_buffer *rb = NULL;
- unsigned long locked, lock_limit;
- unsigned long vma_size;
- unsigned long nr_pages;
- long user_extra = 0, extra = 0;
- int ret, flags = 0;
- mapped_f mapped;
+
+ user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
+ /* Increase the limit linearly with more CPUs */
+ user_lock_limit *= num_online_cpus();
+
+ user_locked = atomic_long_read(&user->locked_vm);
/*
- * Don't allow mmap() of inherited per-task counters. This would
- * create a performance issue due to all children writing to the
- * same rb.
+ * sysctl_perf_event_mlock may have changed, so that
+ * user->locked_vm > user_lock_limit
*/
- if (event->cpu == -1 && event->attr.inherit)
- return -EINVAL;
+ if (user_locked > user_lock_limit)
+ user_locked = user_lock_limit;
+ user_locked += *user_extra;
- if (!(vma->vm_flags & VM_SHARED))
- return -EINVAL;
+ if (user_locked > user_lock_limit) {
+ /*
+ * charge locked_vm until it hits user_lock_limit;
+ * charge the rest from pinned_vm
+ */
+ *extra = user_locked - user_lock_limit;
+ *user_extra -= *extra;
+ }
- ret = security_perf_event_read(event);
- if (ret)
- return ret;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
+ lock_limit >>= PAGE_SHIFT;
+ locked = atomic64_read(&vma->vm_mm->pinned_vm) + *extra;
- vma_size = vma->vm_end - vma->vm_start;
- nr_pages = vma_size / PAGE_SIZE;
+ return locked <= lock_limit || !perf_is_paranoid() || capable(CAP_IPC_LOCK);
+}
- if (nr_pages > INT_MAX)
- return -ENOMEM;
+static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long extra)
+{
+ struct user_struct *user = current_user();
- if (vma_size != PAGE_SIZE * nr_pages)
- return -EINVAL;
+ atomic_long_add(user_extra, &user->locked_vm);
+ atomic64_add(extra, &vma->vm_mm->pinned_vm);
+}
- user_extra = nr_pages;
+static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
+ unsigned long nr_pages)
+{
+ long extra = 0, user_extra = nr_pages;
+ struct perf_buffer *rb;
+ int rb_flags = 0;
- mutex_lock(&event->mmap_mutex);
- ret = -EINVAL;
+ nr_pages -= 1;
/*
- * This relies on __pmu_detach_event() taking mmap_mutex after marking
- * the event REVOKED. Either we observe the state, or __pmu_detach_event()
- * will detach the rb created here.
+ * If we have rb pages ensure they're a power-of-two number, so we
+ * can do bitmasks instead of modulo.
*/
- if (event->state <= PERF_EVENT_STATE_REVOKED) {
- ret = -ENODEV;
- goto unlock;
- }
-
- if (vma->vm_pgoff == 0) {
- nr_pages -= 1;
-
- /*
- * If we have rb pages ensure they're a power-of-two number, so we
- * can do bitmasks instead of modulo.
- */
- if (nr_pages != 0 && !is_power_of_2(nr_pages))
- goto unlock;
-
- WARN_ON_ONCE(event->ctx->parent_ctx);
+ if (nr_pages != 0 && !is_power_of_2(nr_pages))
+ return -EINVAL;
- if (event->rb) {
- if (data_page_nr(event->rb) != nr_pages)
- goto unlock;
+ WARN_ON_ONCE(event->ctx->parent_ctx);
- if (atomic_inc_not_zero(&event->rb->mmap_count)) {
- /*
- * Success -- managed to mmap() the same buffer
- * multiple times.
- */
- ret = 0;
- /* We need the rb to map pages. */
- rb = event->rb;
- goto unlock;
- }
+ if (event->rb) {
+ if (data_page_nr(event->rb) != nr_pages)
+ return -EINVAL;
+ if (refcount_inc_not_zero(&event->rb->mmap_count)) {
/*
- * Raced against perf_mmap_close()'s
- * atomic_dec_and_mutex_lock() remove the
- * event and continue as if !event->rb
+ * Success -- managed to mmap() the same buffer
+ * multiple times.
*/
- ring_buffer_attach(event, NULL);
+ perf_mmap_account(vma, user_extra, extra);
+ refcount_inc(&event->mmap_count);
+ return 0;
}
- } else {
/*
- * AUX area mapping: if rb->aux_nr_pages != 0, it's already
- * mapped, all subsequent mappings should have the same size
- * and offset. Must be above the normal perf buffer.
+ * Raced against perf_mmap_close()'s
+ * refcount_dec_and_mutex_lock() remove the
+ * event and continue as if !event->rb
*/
- u64 aux_offset, aux_size;
+ ring_buffer_attach(event, NULL);
+ }
- rb = event->rb;
- if (!rb)
- goto aux_unlock;
+ if (!perf_mmap_calc_limits(vma, &user_extra, &extra))
+ return -EPERM;
- aux_mutex = &rb->aux_mutex;
- mutex_lock(aux_mutex);
+ if (vma->vm_flags & VM_WRITE)
+ rb_flags |= RING_BUFFER_WRITABLE;
- aux_offset = READ_ONCE(rb->user_page->aux_offset);
- aux_size = READ_ONCE(rb->user_page->aux_size);
+ rb = rb_alloc(nr_pages,
+ event->attr.watermark ? event->attr.wakeup_watermark : 0,
+ event->cpu, rb_flags);
- if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
- goto aux_unlock;
+ if (!rb)
+ return -ENOMEM;
- if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
- goto aux_unlock;
+ refcount_set(&rb->mmap_count, 1);
+ rb->mmap_user = get_current_user();
+ rb->mmap_locked = extra;
- /* already mapped with a different offset */
- if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
- goto aux_unlock;
+ ring_buffer_attach(event, rb);
- if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
- goto aux_unlock;
+ perf_event_update_time(event);
+ perf_event_init_userpage(event);
+ perf_event_update_userpage(event);
- /* already mapped with a different size */
- if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
- goto aux_unlock;
+ perf_mmap_account(vma, user_extra, extra);
+ refcount_set(&event->mmap_count, 1);
- if (!is_power_of_2(nr_pages))
- goto aux_unlock;
+ return 0;
+}
- if (!atomic_inc_not_zero(&rb->mmap_count))
- goto aux_unlock;
+static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event,
+ unsigned long nr_pages)
+{
+ long extra = 0, user_extra = nr_pages;
+ u64 aux_offset, aux_size;
+ struct perf_buffer *rb;
+ int ret, rb_flags = 0;
- if (rb_has_aux(rb)) {
- atomic_inc(&rb->aux_mmap_count);
- ret = 0;
- goto unlock;
- }
- }
+ rb = event->rb;
+ if (!rb)
+ return -EINVAL;
- user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
+ guard(mutex)(&rb->aux_mutex);
/*
- * Increase the limit linearly with more CPUs:
+ * AUX area mapping: if rb->aux_nr_pages != 0, it's already
+ * mapped, all subsequent mappings should have the same size
+ * and offset. Must be above the normal perf buffer.
*/
- user_lock_limit *= num_online_cpus();
+ aux_offset = READ_ONCE(rb->user_page->aux_offset);
+ aux_size = READ_ONCE(rb->user_page->aux_size);
- user_locked = atomic_long_read(&user->locked_vm);
+ if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
+ return -EINVAL;
- /*
- * sysctl_perf_event_mlock may have changed, so that
- * user->locked_vm > user_lock_limit
- */
- if (user_locked > user_lock_limit)
- user_locked = user_lock_limit;
- user_locked += user_extra;
+ if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
+ return -EINVAL;
- if (user_locked > user_lock_limit) {
- /*
- * charge locked_vm until it hits user_lock_limit;
- * charge the rest from pinned_vm
- */
- extra = user_locked - user_lock_limit;
- user_extra -= extra;
- }
+ /* already mapped with a different offset */
+ if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
+ return -EINVAL;
- lock_limit = rlimit(RLIMIT_MEMLOCK);
- lock_limit >>= PAGE_SHIFT;
- locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
+ if (aux_size != nr_pages * PAGE_SIZE)
+ return -EINVAL;
- if ((locked > lock_limit) && perf_is_paranoid() &&
- !capable(CAP_IPC_LOCK)) {
- ret = -EPERM;
- goto unlock;
- }
+ /* already mapped with a different size */
+ if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
+ return -EINVAL;
- WARN_ON(!rb && event->rb);
+ if (!is_power_of_2(nr_pages))
+ return -EINVAL;
- if (vma->vm_flags & VM_WRITE)
- flags |= RING_BUFFER_WRITABLE;
+ if (!refcount_inc_not_zero(&rb->mmap_count))
+ return -EINVAL;
- if (!rb) {
- rb = rb_alloc(nr_pages,
- event->attr.watermark ? event->attr.wakeup_watermark : 0,
- event->cpu, flags);
+ if (rb_has_aux(rb)) {
+ refcount_inc(&rb->aux_mmap_count);
- if (!rb) {
- ret = -ENOMEM;
- goto unlock;
+ } else {
+ if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) {
+ refcount_dec(&rb->mmap_count);
+ return -EPERM;
}
- atomic_set(&rb->mmap_count, 1);
- rb->mmap_user = get_current_user();
- rb->mmap_locked = extra;
+ WARN_ON(!rb && event->rb);
- ring_buffer_attach(event, rb);
+ if (vma->vm_flags & VM_WRITE)
+ rb_flags |= RING_BUFFER_WRITABLE;
- perf_event_update_time(event);
- perf_event_init_userpage(event);
- perf_event_update_userpage(event);
- ret = 0;
- } else {
ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
- event->attr.aux_watermark, flags);
- if (!ret) {
- atomic_set(&rb->aux_mmap_count, 1);
- rb->aux_mmap_locked = extra;
+ event->attr.aux_watermark, rb_flags);
+ if (ret) {
+ refcount_dec(&rb->mmap_count);
+ return ret;
}
+
+ refcount_set(&rb->aux_mmap_count, 1);
+ rb->aux_mmap_locked = extra;
}
-unlock:
- if (!ret) {
- atomic_long_add(user_extra, &user->locked_vm);
- atomic64_add(extra, &vma->vm_mm->pinned_vm);
-
- atomic_inc(&event->mmap_count);
- } else if (rb) {
- /* AUX allocation failed */
- atomic_dec(&rb->mmap_count);
- }
-aux_unlock:
- if (aux_mutex)
- mutex_unlock(aux_mutex);
- mutex_unlock(&event->mmap_mutex);
+ perf_mmap_account(vma, user_extra, extra);
+ refcount_inc(&event->mmap_count);
+
+ return 0;
+}
+
+static int perf_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct perf_event *event = file->private_data;
+ unsigned long vma_size, nr_pages;
+ mapped_f mapped;
+ int ret;
+ /*
+ * Don't allow mmap() of inherited per-task counters. This would
+ * create a performance issue due to all children writing to the
+ * same rb.
+ */
+ if (event->cpu == -1 && event->attr.inherit)
+ return -EINVAL;
+
+ if (!(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+
+ ret = security_perf_event_read(event);
if (ret)
return ret;
+ vma_size = vma->vm_end - vma->vm_start;
+ nr_pages = vma_size / PAGE_SIZE;
+
+ if (nr_pages > INT_MAX)
+ return -ENOMEM;
+
+ if (vma_size != PAGE_SIZE * nr_pages)
+ return -EINVAL;
+
+ scoped_guard (mutex, &event->mmap_mutex) {
+ /*
+ * This relies on __pmu_detach_event() taking mmap_mutex after marking
+ * the event REVOKED. Either we observe the state, or __pmu_detach_event()
+ * will detach the rb created here.
+ */
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
+ if (vma->vm_pgoff == 0)
+ ret = perf_mmap_rb(vma, event, nr_pages);
+ else
+ ret = perf_mmap_aux(vma, event, nr_pages);
+ if (ret)
+ return ret;
+ }
+
/*
* Since pinned accounting is per vm we cannot allow fork() to copy our
* vma.
@@ -7168,7 +7186,7 @@ aux_unlock:
* full cleanup in this case and therefore does not invoke
* vmops::close().
*/
- ret = map_range(rb, vma);
+ ret = map_range(event->rb, vma);
if (ret)
perf_mmap_close(vma);
@@ -7434,7 +7452,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user,
if (user_mode(regs)) {
regs_user->abi = perf_reg_abi(current);
regs_user->regs = regs;
- } else if (!(current->flags & PF_KTHREAD)) {
+ } else if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
perf_get_regs_user(regs_user, regs);
} else {
regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
@@ -8074,7 +8092,7 @@ static u64 perf_virt_to_phys(u64 virt)
* Try IRQ-safe get_user_page_fast_only first.
* If failed, leave phys_addr as 0.
*/
- if (current->mm != NULL) {
+ if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
struct page *p;
pagefault_disable();
@@ -8186,7 +8204,8 @@ struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs)
{
bool kernel = !event->attr.exclude_callchain_kernel;
- bool user = !event->attr.exclude_callchain_user;
+ bool user = !event->attr.exclude_callchain_user &&
+ !(current->flags & (PF_KTHREAD | PF_USER_WORKER));
/* Disallow cross-task user callchains. */
bool crosstask = event->ctx->task && event->ctx->task != current;
const u32 max_stack = event->attr.sample_max_stack;
@@ -8198,7 +8217,7 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
if (!kernel && !user)
return &__empty_callchain;
- callchain = get_perf_callchain(regs, 0, kernel, user,
+ callchain = get_perf_callchain(regs, kernel, user,
max_stack, crosstask, true);
return callchain ?: &__empty_callchain;
}
@@ -10324,6 +10343,7 @@ static int __perf_event_overflow(struct perf_event *event,
ret = 1;
event->pending_kill = POLL_HUP;
perf_event_disable_inatomic(event);
+ event->pmu->stop(event, 0);
}
if (event->attr.sigtrap) {
@@ -13242,7 +13262,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
set:
/* Can't redirect output if we've got an active mmap() */
- if (atomic_read(&event->mmap_count))
+ if (refcount_read(&event->mmap_count))
goto unlock;
if (output_event) {
@@ -13255,7 +13275,7 @@ set:
goto unlock;
/* did we race against perf_mmap_close() */
- if (!atomic_read(&rb->mmap_count)) {
+ if (!refcount_read(&rb->mmap_count)) {
ring_buffer_put(rb);
goto unlock;
}
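Note on the counter conversions above: refcount_t differs from atomic_t in that it saturates and warns on increment-from-zero and on underflow rather than silently wrapping. refcount_inc_not_zero() and refcount_dec_and_mutex_lock() are direct counterparts of the atomic_* calls they replace, so the locking structure around mmap_count and aux_mmap_count is unchanged.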
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 249288d82b8d..d9cc57083091 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -35,7 +35,7 @@ struct perf_buffer {
spinlock_t event_lock;
struct list_head event_list;
- atomic_t mmap_count;
+ refcount_t mmap_count;
unsigned long mmap_locked;
struct user_struct *mmap_user;
@@ -47,7 +47,7 @@ struct perf_buffer {
unsigned long aux_pgoff;
int aux_nr_pages;
int aux_overwrite;
- atomic_t aux_mmap_count;
+ refcount_t aux_mmap_count;
unsigned long aux_mmap_locked;
void (*free_aux)(void *);
refcount_t aux_refcount;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index aa9a759e824f..20a905023736 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -400,7 +400,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
* the same order, see perf_mmap_close. Otherwise we end up freeing
* aux pages in this path, which is a bug, because in_atomic().
*/
- if (!atomic_read(&rb->aux_mmap_count))
+ if (!refcount_read(&rb->aux_mmap_count))
goto err;
if (!refcount_inc_not_zero(&rb->aux_refcount))
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 7ca1940607bd..c8bfff023c6e 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -121,7 +121,7 @@ struct xol_area {
static void uprobe_warn(struct task_struct *t, const char *msg)
{
- pr_warn("uprobe: %s:%d failed to %s\n", current->comm, current->pid, msg);
+ pr_warn("uprobe: %s:%d failed to %s\n", t->comm, t->pid, msg);
}
/*
@@ -177,7 +177,7 @@ bool __weak is_trap_insn(uprobe_opcode_t *insn)
return is_swbp_insn(insn);
}
-static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
+void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
{
void *kaddr = kmap_atomic(page);
memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
@@ -191,7 +191,8 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src
kunmap_atomic(kaddr);
}
-static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
+static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn,
+ int nbytes, void *data)
{
uprobe_opcode_t old_opcode;
bool is_swbp;
@@ -205,10 +206,10 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
* is a trap variant; uprobes always wins over any other (gdb)
* breakpoint.
*/
- copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
+ uprobe_copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
is_swbp = is_swbp_insn(&old_opcode);
- if (is_swbp_insn(new_opcode)) {
+ if (is_swbp_insn(insn)) {
if (is_swbp) /* register: already installed? */
return 0;
} else {
@@ -399,12 +400,12 @@ static bool orig_page_is_identical(struct vm_area_struct *vma,
return identical;
}
-static int __uprobe_write_opcode(struct vm_area_struct *vma,
+static int __uprobe_write(struct vm_area_struct *vma,
struct folio_walk *fw, struct folio *folio,
- unsigned long opcode_vaddr, uprobe_opcode_t opcode)
+ unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes,
+ bool is_register)
{
- const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
- const bool is_register = !!is_swbp_insn(&opcode);
+ const unsigned long vaddr = insn_vaddr & PAGE_MASK;
bool pmd_mappable;
/* For now, we'll only handle PTE-mapped folios. */
@@ -429,7 +430,7 @@ static int __uprobe_write_opcode(struct vm_area_struct *vma,
*/
flush_cache_page(vma, vaddr, pte_pfn(fw->pte));
fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep);
- copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ copy_to_page(fw->page, insn_vaddr, insn, nbytes);
/*
* When unregistering, we may only zap a PTE if uffd is disabled and
@@ -482,23 +483,32 @@ remap:
* @opcode_vaddr: the virtual address to store the opcode.
* @opcode: opcode to be written at @opcode_vaddr.
*
- * Called with mm->mmap_lock held for read or write.
+ * Called with mm->mmap_lock held for write.
* Return 0 (success) or a negative errno.
*/
int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
- const unsigned long opcode_vaddr, uprobe_opcode_t opcode)
+ const unsigned long opcode_vaddr, uprobe_opcode_t opcode,
+ bool is_register)
{
- const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
+ return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE,
+ verify_opcode, is_register, true /* do_update_ref_ctr */, NULL);
+}
+
+int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes,
+ uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr,
+ void *data)
+{
+ const unsigned long vaddr = insn_vaddr & PAGE_MASK;
struct mm_struct *mm = vma->vm_mm;
struct uprobe *uprobe;
- int ret, is_register, ref_ctr_updated = 0;
+ int ret, ref_ctr_updated = 0;
unsigned int gup_flags = FOLL_FORCE;
struct mmu_notifier_range range;
struct folio_walk fw;
struct folio *folio;
struct page *page;
- is_register = is_swbp_insn(&opcode);
uprobe = container_of(auprobe, struct uprobe, arch);
if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
@@ -509,7 +519,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
* page that we can safely modify. Use FOLL_WRITE to trigger a write
* fault if required. When unregistering, we might be lucky and the
* anon page is already gone. So defer write faults until really
- * required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode()
+ * required. Use FOLL_SPLIT_PMD, because __uprobe_write()
* cannot deal with PMDs yet.
*/
if (is_register)
@@ -521,14 +531,14 @@ retry:
goto out;
folio = page_folio(page);
- ret = verify_opcode(page, opcode_vaddr, &opcode);
+ ret = verify(page, insn_vaddr, insn, nbytes, data);
if (ret <= 0) {
folio_put(folio);
goto out;
}
/* We are going to replace instruction, update ref_ctr. */
- if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
+ if (do_update_ref_ctr && !ref_ctr_updated && uprobe->ref_ctr_offset) {
ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
if (ret) {
folio_put(folio);
@@ -560,7 +570,7 @@ retry:
/* Walk the page tables again, to perform the actual update. */
if (folio_walk_start(&fw, vma, vaddr, 0)) {
if (fw.page == page)
- ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode);
+ ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes, is_register);
folio_walk_end(&fw, vma);
}
@@ -580,7 +590,7 @@ retry:
out:
/* Revert back reference counter if instruction update failed. */
- if (ret < 0 && ref_ctr_updated)
+ if (do_update_ref_ctr && ret < 0 && ref_ctr_updated)
update_ref_ctr(uprobe, mm, is_register ? -1 : 1);
/* try collapse pmd for compound page */
@@ -602,7 +612,7 @@ out:
int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
unsigned long vaddr)
{
- return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN);
+ return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, true);
}
/**
@@ -618,7 +628,7 @@ int __weak set_orig_insn(struct arch_uprobe *auprobe,
struct vm_area_struct *vma, unsigned long vaddr)
{
return uprobe_write_opcode(auprobe, vma, vaddr,
- *(uprobe_opcode_t *)&auprobe->insn);
+ *(uprobe_opcode_t *)&auprobe->insn, false);
}
/* uprobe should have guaranteed positive refcount */
@@ -1051,7 +1061,7 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
if (IS_ERR(page))
return PTR_ERR(page);
- copy_from_page(page, offset, insn, nbytes);
+ uprobe_copy_from_page(page, offset, insn, nbytes);
put_page(page);
return 0;
@@ -1210,7 +1220,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
* reclaim. This is optimistic, no harm done if it fails.
*/
prev = kmalloc(sizeof(struct map_info),
- GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ GFP_NOWAIT | __GFP_NOMEMALLOC);
if (prev)
prev->next = NULL;
}
@@ -1397,7 +1407,7 @@ struct uprobe *uprobe_register(struct inode *inode,
return ERR_PTR(-EINVAL);
/*
- * This ensures that copy_from_page(), copy_to_page() and
+ * This ensures that uprobe_copy_from_page(), copy_to_page() and
* __update_ref_ctr() can't cross page boundary.
*/
if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
@@ -1463,7 +1473,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
struct vm_area_struct *vma;
int err = 0;
- mmap_read_lock(mm);
+ mmap_write_lock(mm);
for_each_vma(vmi, vma) {
unsigned long vaddr;
loff_t offset;
@@ -1480,7 +1490,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
vaddr = offset_to_vaddr(vma, uprobe->offset);
err |= remove_breakpoint(uprobe, vma, vaddr);
}
- mmap_read_unlock(mm);
+ mmap_write_unlock(mm);
return err;
}
@@ -1726,7 +1736,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
return ret;
}
-void * __weak arch_uprobe_trampoline(unsigned long *psize)
+void * __weak arch_uretprobe_trampoline(unsigned long *psize)
{
static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
@@ -1758,7 +1768,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
init_waitqueue_head(&area->wq);
/* Reserve the 1st slot for get_trampoline_vaddr() */
set_bit(0, area->bitmap);
- insns = arch_uprobe_trampoline(&insns_size);
+ insns = arch_uretprobe_trampoline(&insns_size);
arch_uprobe_copy_ixol(area->page, 0, insns, insns_size);
if (!xol_add_vma(mm, area))
@@ -1792,6 +1802,14 @@ static struct xol_area *get_xol_area(void)
return area;
}
+void __weak arch_uprobe_clear_state(struct mm_struct *mm)
+{
+}
+
+void __weak arch_uprobe_init_state(struct mm_struct *mm)
+{
+}
+
/*
* uprobe_clear_state - Free the area allocated for slots.
*/
@@ -1803,6 +1821,8 @@ void uprobe_clear_state(struct mm_struct *mm)
delayed_uprobe_remove(NULL, mm);
mutex_unlock(&delayed_uprobe_lock);
+ arch_uprobe_clear_state(mm);
+
if (!area)
return;
@@ -2160,7 +2180,7 @@ static void dup_xol_work(struct callback_head *work)
/*
* Called in context of a new clone/fork from copy_process.
*/
-void uprobe_copy_process(struct task_struct *t, unsigned long flags)
+void uprobe_copy_process(struct task_struct *t, u64 flags)
{
struct uprobe_task *utask = current->utask;
struct mm_struct *mm = current->mm;
@@ -2393,7 +2413,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
if (result < 0)
return result;
- copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ uprobe_copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
put_page(page);
out:
/* This needs to return true for any variant of the trap insn */
@@ -2677,6 +2697,10 @@ bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c
return true;
}
+void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+}
+
/*
* Run handler and ask thread to singlestep.
* Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -2741,6 +2765,9 @@ static void handle_swbp(struct pt_regs *regs)
handler_chain(uprobe, regs);
+ /* Try to optimize after first hit. */
+ arch_uprobe_optimize(&uprobe->arch, bp_vaddr);
+
if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
goto out;
@@ -2752,6 +2779,23 @@ out:
rcu_read_unlock_trace();
}
+void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr)
+{
+ struct uprobe *uprobe;
+ int is_swbp;
+
+ guard(rcu_tasks_trace)();
+
+ uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
+ if (!uprobe)
+ return;
+ if (!get_utask())
+ return;
+ if (arch_uprobe_ignore(&uprobe->arch, regs))
+ return;
+ handler_chain(uprobe, regs);
+}
+
/*
* Perform required fix-ups and disable singlestep.
* Allow pending signals to take effect.
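For reference, a verify callback written against the reworked uprobe_write() contract would follow the shape of verify_opcode() above; a skeletal sketch (illustrative only, the function name is hypothetical):

static int example_verify(struct page *page, unsigned long vaddr,
			  uprobe_opcode_t *insn, int nbytes, void *data)
{
	/* As with verify_opcode(): return <= 0 to skip the write, > 0 to proceed. */
	return 1;
}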
diff --git a/kernel/fork.c b/kernel/fork.c
index af673856499d..cffa6157a55a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -689,7 +689,6 @@ void __mmdrop(struct mm_struct *mm)
mm_pasid_drop(mm);
mm_destroy_cid(mm);
percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
- futex_hash_free(mm);
free_mm(mm);
}
@@ -1015,6 +1014,7 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
mm->uprobes_state.xol_area = NULL;
+ arch_uprobe_init_state(mm);
#endif
}
@@ -1138,6 +1138,7 @@ static inline void __mmput(struct mm_struct *mm)
if (mm->binfmt)
module_put(mm->binfmt->module);
lru_gen_del_mm(mm);
+ futex_hash_free(mm);
mmdrop(mm);
}
@@ -1507,7 +1508,7 @@ fail_nomem:
return NULL;
}
-static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_mm(u64 clone_flags, struct task_struct *tsk)
{
struct mm_struct *mm, *oldmm;
@@ -1545,7 +1546,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
return 0;
}
-static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_fs(u64 clone_flags, struct task_struct *tsk)
{
struct fs_struct *fs = current->fs;
if (clone_flags & CLONE_FS) {
@@ -1566,7 +1567,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
return 0;
}
-static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
+static int copy_files(u64 clone_flags, struct task_struct *tsk,
int no_files)
{
struct files_struct *oldf, *newf;
@@ -1596,7 +1597,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
return 0;
}
-static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_sighand(u64 clone_flags, struct task_struct *tsk)
{
struct sighand_struct *sig;
@@ -1645,7 +1646,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
posix_cputimers_group_init(pct, cpu_limit);
}
-static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_signal(u64 clone_flags, struct task_struct *tsk)
{
struct signal_struct *sig;
@@ -1688,6 +1689,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
tty_audit_fork(sig);
sched_autogroup_fork(sig);
+#ifdef CONFIG_CGROUPS
+ init_rwsem(&sig->cgroup_threadgroup_rwsem);
+#endif
+
sig->oom_score_adj = current->signal->oom_score_adj;
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
@@ -2295,7 +2300,7 @@ __latent_entropy struct task_struct *copy_process(
if (need_futex_hash_allocate_default(clone_flags)) {
retval = futex_hash_allocate_default();
if (retval)
- goto bad_fork_core_free;
+ goto bad_fork_cancel_cgroup;
/*
* If we fail beyond this point we don't free the allocated
* futex hash map. We assume that another thread will be created
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index d9bb5567af0c..125804fbb5cb 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -1722,12 +1722,9 @@ int futex_mm_init(struct mm_struct *mm)
RCU_INIT_POINTER(mm->futex_phash, NULL);
mm->futex_phash_new = NULL;
/* futex-ref */
+ mm->futex_ref = NULL;
atomic_long_set(&mm->futex_atomic, 0);
mm->futex_batches = get_state_synchronize_rcu();
- mm->futex_ref = alloc_percpu(unsigned int);
- if (!mm->futex_ref)
- return -ENOMEM;
- this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */
return 0;
}
@@ -1801,6 +1798,17 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
}
}
+ if (!mm->futex_ref) {
+ /*
+ * This will always be allocated by the first thread and
+ * therefore requires no locking.
+ */
+ mm->futex_ref = alloc_percpu(unsigned int);
+ if (!mm->futex_ref)
+ return -ENOMEM;
+ this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */
+ }
+
fph = kvzalloc(struct_size(fph, queues, hash_slots),
GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (!fph)
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index c74eac572acd..2cd57096c38e 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -319,13 +319,13 @@ static __always_inline int futex_put_value(u32 val, u32 __user *to)
{
if (can_do_masked_user_access())
to = masked_user_access_begin(to);
- else if (!user_read_access_begin(to, sizeof(*to)))
+ else if (!user_write_access_begin(to, sizeof(*to)))
return -EFAULT;
unsafe_put_user(val, to, Efault);
- user_read_access_end();
+ user_write_access_end();
return 0;
Efault:
- user_read_access_end();
+ user_write_access_end();
return -EFAULT;
}
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index c716a66f8692..d818b4d47f1b 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -230,8 +230,9 @@ static inline
void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
struct futex_hash_bucket *hb)
{
- q->key = *key;
+ struct task_struct *task;
+ q->key = *key;
__futex_unqueue(q);
WARN_ON(!q->rt_waiter);
@@ -243,10 +244,11 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
futex_hash_get(hb);
q->drop_hb_ref = true;
q->lock_ptr = &hb->lock;
+ task = READ_ONCE(q->task);
/* Signal locked state to the waiter */
futex_requeue_pi_complete(q, 1);
- wake_up_state(q->task, TASK_NORMAL);
+ wake_up_state(task, TASK_NORMAL);
}
/**
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 36673640c4fc..1b4254d19a73 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -143,7 +143,9 @@ config GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD
config IRQ_KUNIT_TEST
bool "KUnit tests for IRQ management APIs" if !KUNIT_ALL_TESTS
depends on KUNIT=y
+ depends on SPARSE_IRQ
default KUNIT_ALL_TESTS
+ select IRQ_DOMAIN
imply SMP
help
This option enables KUnit tests for the IRQ subsystem API. These are
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index eb16a58e0322..b41188698622 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -30,29 +30,22 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
return this->irq == match->irq && this->dev_id == match->dev_id;
}
-/**
- * devm_request_threaded_irq - allocate an interrupt line for a managed device
- * @dev: device to request interrupt for
- * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs
- * @thread_fn: function to be called in a threaded interrupt context. NULL
- * for devices which handle everything in @handler
- * @irqflags: Interrupt type flags
- * @devname: An ascii name for the claiming device, dev_name(dev) if NULL
- * @dev_id: A cookie passed back to the handler function
- *
- * Except for the extra @dev argument, this function takes the
- * same arguments and performs the same function as
- * request_threaded_irq(). IRQs requested with this function will be
- * automatically freed on driver detach.
- *
- * If an IRQ allocated with this function needs to be freed
- * separately, devm_free_irq() must be used.
- */
-int devm_request_threaded_irq(struct device *dev, unsigned int irq,
- irq_handler_t handler, irq_handler_t thread_fn,
- unsigned long irqflags, const char *devname,
- void *dev_id)
+static int devm_request_result(struct device *dev, int rc, unsigned int irq,
+ irq_handler_t handler, irq_handler_t thread_fn,
+ const char *devname)
+{
+ if (rc >= 0)
+ return rc;
+
+ return dev_err_probe(dev, rc, "request_irq(%u) %ps %ps %s\n",
+ irq, handler, thread_fn, devname ? : "");
+}
+
+static int __devm_request_threaded_irq(struct device *dev, unsigned int irq,
+ irq_handler_t handler,
+ irq_handler_t thread_fn,
+ unsigned long irqflags,
+ const char *devname, void *dev_id)
{
struct irq_devres *dr;
int rc;
@@ -78,28 +71,48 @@ int devm_request_threaded_irq(struct device *dev, unsigned int irq,
return 0;
}
-EXPORT_SYMBOL(devm_request_threaded_irq);
/**
- * devm_request_any_context_irq - allocate an interrupt line for a managed device
- * @dev: device to request interrupt for
- * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs
- * @irqflags: Interrupt type flags
- * @devname: An ascii name for the claiming device, dev_name(dev) if NULL
- * @dev_id: A cookie passed back to the handler function
+ * devm_request_threaded_irq - allocate an interrupt line for a managed device with error logging
+ * @dev: Device to request interrupt for
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the interrupt occurs
+ * @thread_fn: Function to be called in a threaded interrupt context. NULL
+ * for devices which handle everything in @handler
+ * @irqflags: Interrupt type flags
+ * @devname: An ascii name for the claiming device, dev_name(dev) if NULL
+ * @dev_id: A cookie passed back to the handler function
*
- * Except for the extra @dev argument, this function takes the
- * same arguments and performs the same function as
- * request_any_context_irq(). IRQs requested with this function will be
- * automatically freed on driver detach.
+ * Except for the extra @dev argument, this function takes the same
+ * arguments and performs the same function as request_threaded_irq().
+ * Interrupts requested with this function will be automatically freed on
+ * driver detach.
+ *
+ * If an interrupt allocated with this function needs to be freed
+ * separately, devm_free_irq() must be used.
+ *
+ * When the request fails, an error message is printed with contextual
+ * information (device name, interrupt number, handler functions and
+ * error code). Don't add extra error messages at the call sites.
*
- * If an IRQ allocated with this function needs to be freed
- * separately, devm_free_irq() must be used.
+ * Return: 0 on success or a negative error number.
*/
-int devm_request_any_context_irq(struct device *dev, unsigned int irq,
- irq_handler_t handler, unsigned long irqflags,
- const char *devname, void *dev_id)
+int devm_request_threaded_irq(struct device *dev, unsigned int irq,
+ irq_handler_t handler, irq_handler_t thread_fn,
+ unsigned long irqflags, const char *devname,
+ void *dev_id)
+{
+ int rc = __devm_request_threaded_irq(dev, irq, handler, thread_fn,
+ irqflags, devname, dev_id);
+
+ return devm_request_result(dev, rc, irq, handler, thread_fn, devname);
+}
+EXPORT_SYMBOL(devm_request_threaded_irq);
+
+static int __devm_request_any_context_irq(struct device *dev, unsigned int irq,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname, void *dev_id)
{
struct irq_devres *dr;
int rc;
@@ -124,6 +137,40 @@ int devm_request_any_context_irq(struct device *dev, unsigned int irq,
return rc;
}
+
+/**
+ * devm_request_any_context_irq - allocate an interrupt line for a managed device with error logging
+ * @dev: Device to request interrupt for
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the interrupt occurs
+ * @irqflags: Interrupt type flags
+ * @devname: An ascii name for the claiming device, dev_name(dev) if NULL
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * Except for the extra @dev argument, this function takes the same
+ * arguments and performs the same function as request_any_context_irq().
+ * Interrupts requested with this function will be automatically freed on
+ * driver detach.
+ *
+ * If an interrupt allocated with this function needs to be freed
+ * separately, devm_free_irq() must be used.
+ *
+ * When the request fails, an error message is printed with contextual
+ * information (device name, interrupt number, handler functions and
+ * error code). Don't add extra error messages at the call sites.
+ *
+ * Return: IRQC_IS_HARDIRQ or IRQC_IS_NESTED on success, or a negative error
+ * number.
+ */
+int devm_request_any_context_irq(struct device *dev, unsigned int irq,
+ irq_handler_t handler, unsigned long irqflags,
+ const char *devname, void *dev_id)
+{
+ int rc = __devm_request_any_context_irq(dev, irq, handler, irqflags,
+ devname, dev_id);
+
+ return devm_request_result(dev, rc, irq, handler, NULL, devname);
+}
EXPORT_SYMBOL(devm_request_any_context_irq);
/**
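Call sites are expected to lean on the built-in error reporting. An illustrative probe using the logging variant (all driver names here are hypothetical, not from this patch):

static irqreturn_t foo_thread_fn(int irq, void *dev_id)
{
	return IRQ_HANDLED;
}

static int foo_probe(struct platform_device *pdev)
{
	int irq, ret;

	irq = platform_get_irq(pdev, 0);
	if (irq < 0)
		return irq;

	/* A failure already logs via dev_err_probe(); no extra dev_err() at the call site. */
	ret = devm_request_threaded_irq(&pdev->dev, irq, NULL, foo_thread_fn,
					IRQF_ONESHOT, NULL, pdev);
	if (ret)
		return ret;

	return 0;
}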
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 9489f93b3db3..e103451243a0 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -136,6 +136,44 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
wake_up_process(action->thread);
}
+static DEFINE_STATIC_KEY_FALSE(irqhandler_duration_check_enabled);
+static u64 irqhandler_duration_threshold_ns __ro_after_init;
+
+static int __init irqhandler_duration_check_setup(char *arg)
+{
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul(arg, 0, &val);
+ if (ret) {
+ pr_err("Unable to parse irqhandler.duration_warn_us setting: ret=%d\n", ret);
+ return 0;
+ }
+
+ if (!val) {
+ pr_err("Invalid irqhandler.duration_warn_us setting, must be > 0\n");
+ return 0;
+ }
+
+ irqhandler_duration_threshold_ns = val * 1000;
+ static_branch_enable(&irqhandler_duration_check_enabled);
+
+ return 1;
+}
+__setup("irqhandler.duration_warn_us=", irqhandler_duration_check_setup);
+
+static inline void irqhandler_duration_check(u64 ts_start, unsigned int irq,
+ const struct irqaction *action)
+{
+ u64 delta_ns = local_clock() - ts_start;
+
+ if (unlikely(delta_ns > irqhandler_duration_threshold_ns)) {
+ pr_warn_ratelimited("[CPU%u] long duration of IRQ[%u:%ps], took: %llu us\n",
+ smp_processor_id(), irq, action->handler,
+ div_u64(delta_ns, NSEC_PER_USEC));
+ }
+}
+
irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc)
{
irqreturn_t retval = IRQ_NONE;
@@ -155,7 +193,16 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc)
lockdep_hardirq_threaded();
trace_irq_handler_entry(irq, action);
- res = action->handler(irq, action->dev_id);
+
+ if (static_branch_unlikely(&irqhandler_duration_check_enabled)) {
+ u64 ts_start = local_clock();
+
+ res = action->handler(irq, action->dev_id);
+ irqhandler_duration_check(ts_start, irq, action);
+ } else {
+ res = action->handler(irq, action->dev_id);
+ }
+
trace_irq_handler_exit(irq, action, res);
if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pS enabled interrupts\n",
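As a usage note, the check above is armed from the kernel command line: booting with, for example, irqhandler.duration_warn_us=500 enables the static branch and emits a rate-limited warning for any hard-IRQ handler that runs longer than 500 microseconds (the value is converted to nanoseconds internally, and 0 is rejected).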
diff --git a/kernel/irq/irq_test.c b/kernel/irq/irq_test.c
index a75abebed7f2..e2d31914b3c4 100644
--- a/kernel/irq/irq_test.c
+++ b/kernel/irq/irq_test.c
@@ -41,21 +41,37 @@ static struct irq_chip fake_irq_chip = {
.flags = IRQCHIP_SKIP_SET_WAKE,
};
-static void irq_disable_depth_test(struct kunit *test)
+static int irq_test_setup_fake_irq(struct kunit *test, struct irq_affinity_desc *affd)
{
struct irq_desc *desc;
- int virq, ret;
+ int virq;
- virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, NULL);
+ virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, affd);
KUNIT_ASSERT_GE(test, virq, 0);
- irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
+ irq_set_chip_and_handler(virq, &fake_irq_chip, handle_simple_irq);
+
+ desc = irq_to_desc(virq);
+ KUNIT_ASSERT_PTR_NE(test, desc, NULL);
+
+ /* On some architectures, IRQs are NOREQUEST | NOPROBE by default. */
+ irq_settings_clr_norequest(desc);
+
+ return virq;
+}
+
+static void irq_disable_depth_test(struct kunit *test)
+{
+ struct irq_desc *desc;
+ int virq, ret;
+
+ virq = irq_test_setup_fake_irq(test, NULL);
desc = irq_to_desc(virq);
KUNIT_ASSERT_PTR_NE(test, desc, NULL);
ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
- KUNIT_EXPECT_EQ(test, ret, 0);
+ KUNIT_ASSERT_EQ(test, ret, 0);
KUNIT_EXPECT_EQ(test, desc->depth, 0);
@@ -73,16 +89,13 @@ static void irq_free_disabled_test(struct kunit *test)
struct irq_desc *desc;
int virq, ret;
- virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, NULL);
- KUNIT_ASSERT_GE(test, virq, 0);
-
- irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
+ virq = irq_test_setup_fake_irq(test, NULL);
desc = irq_to_desc(virq);
KUNIT_ASSERT_PTR_NE(test, desc, NULL);
ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
- KUNIT_EXPECT_EQ(test, ret, 0);
+ KUNIT_ASSERT_EQ(test, ret, 0);
KUNIT_EXPECT_EQ(test, desc->depth, 0);
@@ -93,7 +106,7 @@ static void irq_free_disabled_test(struct kunit *test)
KUNIT_EXPECT_GE(test, desc->depth, 1);
ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
- KUNIT_EXPECT_EQ(test, ret, 0);
+ KUNIT_ASSERT_EQ(test, ret, 0);
KUNIT_EXPECT_EQ(test, desc->depth, 0);
free_irq(virq, NULL);
@@ -112,10 +125,7 @@ static void irq_shutdown_depth_test(struct kunit *test)
if (!IS_ENABLED(CONFIG_SMP))
kunit_skip(test, "requires CONFIG_SMP for managed shutdown");
- virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, &affinity);
- KUNIT_ASSERT_GE(test, virq, 0);
-
- irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
+ virq = irq_test_setup_fake_irq(test, &affinity);
desc = irq_to_desc(virq);
KUNIT_ASSERT_PTR_NE(test, desc, NULL);
@@ -124,7 +134,7 @@ static void irq_shutdown_depth_test(struct kunit *test)
KUNIT_ASSERT_PTR_NE(test, data, NULL);
ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
- KUNIT_EXPECT_EQ(test, ret, 0);
+ KUNIT_ASSERT_EQ(test, ret, 0);
KUNIT_EXPECT_TRUE(test, irqd_is_activated(data));
KUNIT_EXPECT_TRUE(test, irqd_is_started(data));
@@ -169,13 +179,12 @@ static void irq_cpuhotplug_test(struct kunit *test)
kunit_skip(test, "requires more than 1 CPU for CPU hotplug");
if (!cpu_is_hotpluggable(1))
kunit_skip(test, "CPU 1 must be hotpluggable");
+ if (!cpu_online(1))
+ kunit_skip(test, "CPU 1 must be online");
cpumask_copy(&affinity.mask, cpumask_of(1));
- virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, &affinity);
- KUNIT_ASSERT_GE(test, virq, 0);
-
- irq_set_chip_and_handler(virq, &fake_irq_chip, handle_simple_irq);
+ virq = irq_test_setup_fake_irq(test, &affinity);
desc = irq_to_desc(virq);
KUNIT_ASSERT_PTR_NE(test, desc, NULL);
@@ -184,7 +193,7 @@ static void irq_cpuhotplug_test(struct kunit *test)
KUNIT_ASSERT_PTR_NE(test, data, NULL);
ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
- KUNIT_EXPECT_EQ(test, ret, 0);
+ KUNIT_ASSERT_EQ(test, ret, 0);
KUNIT_EXPECT_TRUE(test, irqd_is_activated(data));
KUNIT_EXPECT_TRUE(test, irqd_is_started(data));
@@ -196,13 +205,9 @@ static void irq_cpuhotplug_test(struct kunit *test)
KUNIT_EXPECT_EQ(test, desc->depth, 1);
KUNIT_EXPECT_EQ(test, remove_cpu(1), 0);
- KUNIT_EXPECT_FALSE(test, irqd_is_activated(data));
- KUNIT_EXPECT_FALSE(test, irqd_is_started(data));
KUNIT_EXPECT_GE(test, desc->depth, 1);
KUNIT_EXPECT_EQ(test, add_cpu(1), 0);
- KUNIT_EXPECT_FALSE(test, irqd_is_activated(data));
- KUNIT_EXPECT_FALSE(test, irqd_is_started(data));
KUNIT_EXPECT_EQ(test, desc->depth, 1);
enable_irq(virq);
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 9b09ad3f9914..e7ad99254841 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -1644,9 +1644,6 @@ static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl)
else
__msi_domain_free_irqs(dev, domain, ctrl);
- if (ops->msi_post_free)
- ops->msi_post_free(domain, dev);
-
if (info->flags & MSI_FLAG_FREE_MSI_DESCS)
msi_domain_free_descs(dev, ctrl);
}
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
index e49743ae52c5..ecd1ac210dbd 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/kexec_handover.c
@@ -144,14 +144,34 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
unsigned int order)
{
struct kho_mem_phys_bits *bits;
- struct kho_mem_phys *physxa;
+ struct kho_mem_phys *physxa, *new_physxa;
const unsigned long pfn_high = pfn >> order;
might_sleep();
- physxa = xa_load_or_alloc(&track->orders, order, sizeof(*physxa));
- if (IS_ERR(physxa))
- return PTR_ERR(physxa);
+ physxa = xa_load(&track->orders, order);
+ if (!physxa) {
+ int err;
+
+ new_physxa = kzalloc(sizeof(*physxa), GFP_KERNEL);
+ if (!new_physxa)
+ return -ENOMEM;
+
+ xa_init(&new_physxa->phys_bits);
+ physxa = xa_cmpxchg(&track->orders, order, NULL, new_physxa,
+ GFP_KERNEL);
+
+ err = xa_err(physxa);
+ if (err || physxa) {
+ xa_destroy(&new_physxa->phys_bits);
+ kfree(new_physxa);
+
+ if (err)
+ return err;
+ } else {
+ physxa = new_physxa;
+ }
+ }
bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS,
sizeof(*bits));
@@ -544,6 +564,7 @@ err_free_scratch_areas:
err_free_scratch_desc:
memblock_free(kho_scratch, kho_scratch_cnt * sizeof(*kho_scratch));
err_disable_kho:
+ pr_warn("Failed to reserve scratch area, disabling kexec handover\n");
kho_enable = false;
}
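The __kho_preserve_order() hunk above open-codes the usual optimistic-insert idiom for an xarray: allocate outside any lock, publish with xa_cmpxchg(), and free the local allocation if another thread raced in first. A generic sketch of the same idiom (struct foo and the function name are hypothetical):

static struct foo *foo_get_or_alloc(struct xarray *xa, unsigned long index)
{
	struct foo *cur, *new;

	cur = xa_load(xa, index);
	if (cur)
		return cur;

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return ERR_PTR(-ENOMEM);

	/* Publish only if the slot is still empty. */
	cur = xa_cmpxchg(xa, index, NULL, new, GFP_KERNEL);
	if (xa_is_err(cur)) {
		kfree(new);
		return ERR_PTR(xa_err(cur));
	}
	if (cur) {
		/* Lost the race: reuse the entry the winner installed. */
		kfree(new);
		return cur;
	}
	return new;
}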
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 0e98b228a8ef..31b072e8d427 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -893,6 +893,7 @@ out:
return ret;
}
+EXPORT_SYMBOL_GPL(kthread_affine_preferred);
/*
* Re-affine kthreads according to their preferences
diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
index 086fd5487ca7..31a785afee6c 100644
--- a/kernel/locking/ww_mutex.h
+++ b/kernel/locking/ww_mutex.h
@@ -342,8 +342,12 @@ static bool __ww_mutex_wound(struct MUTEX *lock,
* When waking up the task to wound, be sure to clear the
* blocked_on pointer. Otherwise we can see circular
* blocked_on relationships that can't resolve.
+ *
+ * NOTE: We pass NULL here instead of lock, because we
+ * are waking the mutex owner, who may be currently
+ * blocked on a different mutex.
*/
- __clear_task_blocked_on(owner, lock);
+ __clear_task_blocked_on(owner, NULL);
wake_q_add(wake_q, owner);
}
return true;
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 39278737bb68..2a1beebf1d37 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -460,6 +460,6 @@ config UNUSED_KSYMS_WHITELIST
config MODULES_TREE_LOOKUP
def_bool y
- depends on PERF_EVENTS || TRACING || CFI_CLANG
+ depends on PERF_EVENTS || TRACING || CFI
endif # MODULES
diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c
index d3204c5c74eb..f8e8c126705c 100644
--- a/kernel/module/tree_lookup.c
+++ b/kernel/module/tree_lookup.c
@@ -14,7 +14,7 @@
* Use a latched RB-tree for __module_address(); this allows us to use
* RCU lookups of the address from any context.
*
- * This is conditional on PERF_EVENTS || TRACING || CFI_CLANG because those can
+ * This is conditional on PERF_EVENTS || TRACING || CFI because those can
* really hit __module_address() hard by doing a lot of stack unwinding;
* potentially from NMI context.
*/
diff --git a/kernel/nscommon.c b/kernel/nscommon.c
new file mode 100644
index 000000000000..c1fb2bad6d72
--- /dev/null
+++ b/kernel/nscommon.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/ns_common.h>
+#include <linux/proc_ns.h>
+#include <linux/vfsdebug.h>
+
+#ifdef CONFIG_DEBUG_VFS
+static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops)
+{
+ switch (ns->ns_type) {
+#ifdef CONFIG_CGROUPS
+ case CLONE_NEWCGROUP:
+ VFS_WARN_ON_ONCE(ops != &cgroupns_operations);
+ break;
+#endif
+#ifdef CONFIG_IPC_NS
+ case CLONE_NEWIPC:
+ VFS_WARN_ON_ONCE(ops != &ipcns_operations);
+ break;
+#endif
+ case CLONE_NEWNS:
+ VFS_WARN_ON_ONCE(ops != &mntns_operations);
+ break;
+#ifdef CONFIG_NET_NS
+ case CLONE_NEWNET:
+ VFS_WARN_ON_ONCE(ops != &netns_operations);
+ break;
+#endif
+#ifdef CONFIG_PID_NS
+ case CLONE_NEWPID:
+ VFS_WARN_ON_ONCE(ops != &pidns_operations);
+ break;
+#endif
+#ifdef CONFIG_TIME_NS
+ case CLONE_NEWTIME:
+ VFS_WARN_ON_ONCE(ops != &timens_operations);
+ break;
+#endif
+#ifdef CONFIG_USER_NS
+ case CLONE_NEWUSER:
+ VFS_WARN_ON_ONCE(ops != &userns_operations);
+ break;
+#endif
+#ifdef CONFIG_UTS_NS
+ case CLONE_NEWUTS:
+ VFS_WARN_ON_ONCE(ops != &utsns_operations);
+ break;
+#endif
+ }
+}
+#endif
+
+int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum)
+{
+ refcount_set(&ns->__ns_ref, 1);
+ ns->stashed = NULL;
+ ns->ops = ops;
+ ns->ns_id = 0;
+ ns->ns_type = ns_type;
+ RB_CLEAR_NODE(&ns->ns_tree_node);
+ INIT_LIST_HEAD(&ns->ns_list_node);
+
+#ifdef CONFIG_DEBUG_VFS
+ ns_debug(ns, ops);
+#endif
+
+ if (inum) {
+ ns->inum = inum;
+ return 0;
+ }
+ return proc_alloc_inum(&ns->inum);
+}
+
+void __ns_common_free(struct ns_common *ns)
+{
+ proc_free_inum(ns->inum);
+}
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 5f31fdff8a38..19aa64ab08c8 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -64,7 +64,7 @@ static inline struct nsproxy *create_nsproxy(void)
* Return the newly created nsproxy. Do not attach this to the task,
* leave it to the caller to do proper locking and attach it to task.
*/
-static struct nsproxy *create_new_namespaces(unsigned long flags,
+static struct nsproxy *create_new_namespaces(u64 flags,
struct task_struct *tsk, struct user_namespace *user_ns,
struct fs_struct *new_fs)
{
@@ -144,7 +144,7 @@ out_ns:
* called from clone. This now handles copy for nsproxy and all
* namespaces therein.
*/
-int copy_namespaces(unsigned long flags, struct task_struct *tsk)
+int copy_namespaces(u64 flags, struct task_struct *tsk)
{
struct nsproxy *old_ns = tsk->nsproxy;
struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
@@ -545,9 +545,9 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags)
if (proc_ns_file(fd_file(f))) {
ns = get_proc_ns(file_inode(fd_file(f)));
- if (flags && (ns->ops->type != flags))
+ if (flags && (ns->ns_type != flags))
err = -EINVAL;
- flags = ns->ops->type;
+ flags = ns->ns_type;
} else if (!IS_ERR(pidfd_pid(fd_file(f)))) {
err = check_setns_flags(flags);
} else {
diff --git a/kernel/nstree.c b/kernel/nstree.c
new file mode 100644
index 000000000000..b24a320a11a6
--- /dev/null
+++ b/kernel/nstree.c
@@ -0,0 +1,247 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/nstree.h>
+#include <linux/proc_ns.h>
+#include <linux/vfsdebug.h>
+
+/**
+ * struct ns_tree - Namespace tree
+ * @ns_tree: Rbtree of namespaces of a particular type
+ * @ns_list: Sequentially walkable list of all namespaces of this type
+ * @ns_tree_lock: Seqlock to protect the tree and list
+ * @type: type of namespaces in this tree
+ */
+struct ns_tree {
+ struct rb_root ns_tree;
+ struct list_head ns_list;
+ seqlock_t ns_tree_lock;
+ int type;
+};
+
+struct ns_tree mnt_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(mnt_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWNS,
+};
+
+struct ns_tree net_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(net_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(net_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWNET,
+};
+EXPORT_SYMBOL_GPL(net_ns_tree);
+
+struct ns_tree uts_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(uts_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(uts_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWUTS,
+};
+
+struct ns_tree user_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(user_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(user_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWUSER,
+};
+
+struct ns_tree ipc_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(ipc_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(ipc_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWIPC,
+};
+
+struct ns_tree pid_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(pid_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(pid_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWPID,
+};
+
+struct ns_tree cgroup_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(cgroup_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(cgroup_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWCGROUP,
+};
+
+struct ns_tree time_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(time_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(time_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWTIME,
+};
+
+DEFINE_COOKIE(namespace_cookie);
+
+static inline struct ns_common *node_to_ns(const struct rb_node *node)
+{
+ if (!node)
+ return NULL;
+ return rb_entry(node, struct ns_common, ns_tree_node);
+}
+
+static inline int ns_cmp(struct rb_node *a, const struct rb_node *b)
+{
+ struct ns_common *ns_a = node_to_ns(a);
+ struct ns_common *ns_b = node_to_ns(b);
+ u64 ns_id_a = ns_a->ns_id;
+ u64 ns_id_b = ns_b->ns_id;
+
+ if (ns_id_a < ns_id_b)
+ return -1;
+ if (ns_id_a > ns_id_b)
+ return 1;
+ return 0;
+}
+
+void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree)
+{
+ struct rb_node *node, *prev;
+
+ VFS_WARN_ON_ONCE(!ns->ns_id);
+
+ write_seqlock(&ns_tree->ns_tree_lock);
+
+ VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type);
+
+ node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp);
+ /*
+	 * If there's no previous entry, simply add the namespace after
+	 * the list head; otherwise add it after the previous entry.
+ */
+ prev = rb_prev(&ns->ns_tree_node);
+ if (!prev)
+ list_add_rcu(&ns->ns_list_node, &ns_tree->ns_list);
+ else
+ list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node);
+
+ write_sequnlock(&ns_tree->ns_tree_lock);
+
+ VFS_WARN_ON_ONCE(node);
+}
+
+void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree)
+{
+ VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node));
+ VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node));
+ VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type);
+
+ write_seqlock(&ns_tree->ns_tree_lock);
+ rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree);
+ list_bidir_del_rcu(&ns->ns_list_node);
+ RB_CLEAR_NODE(&ns->ns_tree_node);
+ write_sequnlock(&ns_tree->ns_tree_lock);
+}
+EXPORT_SYMBOL_GPL(__ns_tree_remove);
+
+static int ns_find(const void *key, const struct rb_node *node)
+{
+ const u64 ns_id = *(u64 *)key;
+ const struct ns_common *ns = node_to_ns(node);
+
+ if (ns_id < ns->ns_id)
+ return -1;
+ if (ns_id > ns->ns_id)
+ return 1;
+ return 0;
+}
+
+
+static struct ns_tree *ns_tree_from_type(int ns_type)
+{
+ switch (ns_type) {
+ case CLONE_NEWCGROUP:
+ return &cgroup_ns_tree;
+ case CLONE_NEWIPC:
+ return &ipc_ns_tree;
+ case CLONE_NEWNS:
+ return &mnt_ns_tree;
+ case CLONE_NEWNET:
+ return &net_ns_tree;
+ case CLONE_NEWPID:
+ return &pid_ns_tree;
+ case CLONE_NEWUSER:
+ return &user_ns_tree;
+ case CLONE_NEWUTS:
+ return &uts_ns_tree;
+ case CLONE_NEWTIME:
+ return &time_ns_tree;
+ }
+
+ return NULL;
+}
+
+struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type)
+{
+ struct ns_tree *ns_tree;
+ struct rb_node *node;
+ unsigned int seq;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage");
+
+ ns_tree = ns_tree_from_type(ns_type);
+ if (!ns_tree)
+ return NULL;
+
+ do {
+ seq = read_seqbegin(&ns_tree->ns_tree_lock);
+ node = rb_find_rcu(&ns_id, &ns_tree->ns_tree, ns_find);
+ if (node)
+ break;
+ } while (read_seqretry(&ns_tree->ns_tree_lock, seq));
+
+ if (!node)
+ return NULL;
+
+ VFS_WARN_ON_ONCE(node_to_ns(node)->ns_type != ns_type);
+
+ return node_to_ns(node);
+}
+
+/**
+ * __ns_tree_adjoined_rcu - find the next/previous namespace in the same tree
+ * @ns: namespace to start from
+ * @ns_tree: namespace tree @ns is part of
+ * @previous: if true find the previous namespace, otherwise the next
+ *
+ * Find the next or previous namespace in the same tree as @ns. If
+ * there is no next/previous namespace, -ENOENT is returned.
+ */
+struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns,
+ struct ns_tree *ns_tree, bool previous)
+{
+ struct list_head *list;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_adjoined_rcu() usage");
+
+ if (previous)
+ list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_list_node));
+ else
+ list = rcu_dereference(list_next_rcu(&ns->ns_list_node));
+ if (list_is_head(list, &ns_tree->ns_list))
+ return ERR_PTR(-ENOENT);
+
+ VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ns_type != ns_tree->type);
+
+ return list_entry_rcu(list, struct ns_common, ns_list_node);
+}
+
+/**
+ * ns_tree_gen_id - generate a new namespace id
+ * @ns: namespace to generate id for
+ *
+ * Generates a new namespace id and assigns it to the namespace. All
+ * namespace types share the same id space and thus ids can be compared
+ * directly. IOW, when the ids of two namespaces are equal, the
+ * namespaces are identical.
+ */
+u64 ns_tree_gen_id(struct ns_common *ns)
+{
+ guard(preempt)();
+ ns->ns_id = gen_cookie_next(&namespace_cookie);
+ return ns->ns_id;
+}
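
Editor's illustrative sketch, not part of the patch: a caller looking up a namespace by id goes through ns_tree_lookup_rcu() and must hold the RCU read lock, as the RCU_LOCKDEP_WARN() above enforces; the returned pointer is only stable inside the read-side critical section (or once a reference has been taken on it).

	/* Hypothetical helper built only on the interfaces added above. */
	static bool example_pidns_id_exists(u64 ns_id)
	{
		struct ns_common *ns;

		rcu_read_lock();
		ns = ns_tree_lookup_rcu(ns_id, CLONE_NEWPID);
		rcu_read_unlock();

		/* Only the pointer value is used after unlock, no dereference. */
		return ns != NULL;
	}
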
diff --git a/kernel/params.c b/kernel/params.c
index b92d64161b75..b96cfd693c99 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -513,13 +513,14 @@ EXPORT_SYMBOL(param_array_ops);
int param_set_copystring(const char *val, const struct kernel_param *kp)
{
const struct kparam_string *kps = kp->str;
+ const size_t len = strnlen(val, kps->maxlen);
- if (strnlen(val, kps->maxlen) == kps->maxlen) {
+ if (len == kps->maxlen) {
pr_err("%s: string doesn't fit in %u chars.\n",
kp->name, kps->maxlen-1);
return -ENOSPC;
}
- strcpy(kps->string, val);
+ memcpy(kps->string, val, len + 1);
return 0;
}
EXPORT_SYMBOL(param_set_copystring);
@@ -841,7 +842,7 @@ static void __init param_sysfs_builtin(void)
dot = strchr(kp->name, '.');
if (!dot) {
/* This happens for core_param() */
- strcpy(modname, "kernel");
+ strscpy(modname, "kernel");
name_len = 0;
} else {
name_len = dot - kp->name + 1;
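
Editor's aside, a minimal sketch of the idiom the param_set_copystring() change adopts (hypothetical helper, not a kernel API): measure the string once with strnlen(), reject anything that would not leave room for the terminating NUL, then copy the measured length plus the NUL with memcpy() instead of re-walking the string with strcpy().

	static int copy_bounded(char *dst, size_t dst_size, const char *src)
	{
		size_t len = strnlen(src, dst_size);

		if (len == dst_size)
			return -ENOSPC;		/* no room for the trailing NUL */

		memcpy(dst, src, len + 1);	/* len + 1 includes the NUL */
		return 0;
	}
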
diff --git a/kernel/pid.c b/kernel/pid.c
index c45a28c16cd2..4fffec767a63 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -71,13 +71,13 @@ static int pid_max_max = PID_MAX_LIMIT;
* the scheme scales to up to 4 million PIDs, runtime.
*/
struct pid_namespace init_pid_ns = {
- .ns.count = REFCOUNT_INIT(2),
+ .ns.__ns_ref = REFCOUNT_INIT(2),
.idr = IDR_INIT(init_pid_ns.idr),
.pid_allocated = PIDNS_ADDING,
.level = 0,
.child_reaper = &init_task,
.user_ns = &init_user_ns,
- .ns.inum = PROC_PID_INIT_INO,
+ .ns.inum = ns_init_inum(&init_pid_ns),
#ifdef CONFIG_PID_NS
.ns.ops = &pidns_operations,
#endif
@@ -85,6 +85,7 @@ struct pid_namespace init_pid_ns = {
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
.memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
#endif
+ .ns.ns_type = ns_common_type(&init_pid_ns),
};
EXPORT_SYMBOL_GPL(init_pid_ns);
@@ -491,7 +492,7 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
struct upid *upid;
pid_t nr = 0;
- if (pid && ns->level <= pid->level) {
+ if (pid && ns && ns->level <= pid->level) {
upid = &pid->numbers[ns->level];
if (upid->ns == ns)
nr = upid->nr;
@@ -514,7 +515,8 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
rcu_read_lock();
if (!ns)
ns = task_active_pid_ns(current);
- nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
+ if (ns)
+ nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
rcu_read_unlock();
return nr;
@@ -680,7 +682,7 @@ static int pid_table_root_permissions(struct ctl_table_header *head,
container_of(head->set, struct pid_namespace, set);
int mode = table->mode;
- if (ns_capable(pidns->user_ns, CAP_SYS_ADMIN) ||
+ if (ns_capable_noaudit(pidns->user_ns, CAP_SYS_ADMIN) ||
uid_eq(current_euid(), make_kuid(pidns->user_ns, 0)))
mode = (mode & S_IRWXU) >> 6;
else if (in_egroup_p(make_kgid(pidns->user_ns, 0)))
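
Editor's hedged sketch (hypothetical caller): after the NULL checks added above, both pid_nr_ns() and __task_pid_nr_ns() report 0 ("no mapping") when the namespace cannot be resolved, instead of dereferencing a NULL pointer.

	static pid_t example_tgid_in_ns(struct task_struct *task,
					struct pid_namespace *ns)
	{
		/* @ns may be NULL here; the result is then simply 0. */
		return __task_pid_nr_ns(task, PIDTYPE_TGID, ns);
	}
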
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 7098ed44e717..650be58d8d18 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -23,6 +23,7 @@
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
#include <linux/idr.h>
+#include <linux/nstree.h>
#include <uapi/linux/wait.h>
#include "pid_sysctl.h"
@@ -102,17 +103,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
if (ns->pid_cachep == NULL)
goto out_free_idr;
- err = ns_alloc_inum(&ns->ns);
+ err = ns_common_init(ns);
if (err)
goto out_free_idr;
- ns->ns.ops = &pidns_operations;
ns->pid_max = PID_MAX_LIMIT;
err = register_pidns_sysctls(ns);
if (err)
goto out_free_inum;
- refcount_set(&ns->ns.count, 1);
ns->level = level;
ns->parent = get_pid_ns(parent_pid_ns);
ns->user_ns = get_user_ns(user_ns);
@@ -124,10 +123,11 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
#endif
+ ns_tree_add(ns);
return ns;
out_free_inum:
- ns_free_inum(&ns->ns);
+ ns_common_free(ns);
out_free_idr:
idr_destroy(&ns->idr);
kmem_cache_free(pid_ns_cachep, ns);
@@ -149,9 +149,10 @@ static void delayed_free_pidns(struct rcu_head *p)
static void destroy_pid_namespace(struct pid_namespace *ns)
{
+ ns_tree_remove(ns);
unregister_pidns_sysctls(ns);
- ns_free_inum(&ns->ns);
+ ns_common_free(ns);
idr_destroy(&ns->idr);
call_rcu(&ns->rcu, delayed_free_pidns);
@@ -168,10 +169,10 @@ static void destroy_pid_namespace_work(struct work_struct *work)
parent = ns->parent;
destroy_pid_namespace(ns);
ns = parent;
- } while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count));
+ } while (ns != &init_pid_ns && ns_ref_put(ns));
}
-struct pid_namespace *copy_pid_ns(unsigned long flags,
+struct pid_namespace *copy_pid_ns(u64 flags,
struct user_namespace *user_ns, struct pid_namespace *old_ns)
{
if (!(flags & CLONE_NEWPID))
@@ -183,7 +184,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags,
void put_pid_ns(struct pid_namespace *ns)
{
- if (ns && ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count))
+ if (ns && ns != &init_pid_ns && ns_ref_put(ns))
schedule_work(&ns->work);
}
EXPORT_SYMBOL_GPL(put_pid_ns);
@@ -344,11 +345,6 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
return 0;
}
-static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
-{
- return container_of(ns, struct pid_namespace, ns);
-}
-
static struct ns_common *pidns_get(struct task_struct *task)
{
struct pid_namespace *ns;
@@ -390,11 +386,23 @@ static void pidns_put(struct ns_common *ns)
put_pid_ns(to_pid_ns(ns));
}
+bool pidns_is_ancestor(struct pid_namespace *child,
+ struct pid_namespace *ancestor)
+{
+ struct pid_namespace *ns;
+
+ if (child->level < ancestor->level)
+ return false;
+ for (ns = child; ns->level > ancestor->level; ns = ns->parent)
+ ;
+ return ns == ancestor;
+}
+
static int pidns_install(struct nsset *nsset, struct ns_common *ns)
{
struct nsproxy *nsproxy = nsset->nsproxy;
struct pid_namespace *active = task_active_pid_ns(current);
- struct pid_namespace *ancestor, *new = to_pid_ns(ns);
+ struct pid_namespace *new = to_pid_ns(ns);
if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
@@ -408,13 +416,7 @@ static int pidns_install(struct nsset *nsset, struct ns_common *ns)
* this maintains the property that processes and their
* children can not escape their current pid namespace.
*/
- if (new->level < active->level)
- return -EINVAL;
-
- ancestor = new;
- while (ancestor->level > active->level)
- ancestor = ancestor->parent;
- if (ancestor != active)
+ if (!pidns_is_ancestor(new, active))
return -EINVAL;
put_pid_ns(nsproxy->pid_ns_for_children);
@@ -447,7 +449,6 @@ static struct user_namespace *pidns_owner(struct ns_common *ns)
const struct proc_ns_operations pidns_operations = {
.name = "pid",
- .type = CLONE_NEWPID,
.get = pidns_get,
.put = pidns_put,
.install = pidns_install,
@@ -458,7 +459,6 @@ const struct proc_ns_operations pidns_operations = {
const struct proc_ns_operations pidns_for_children_operations = {
.name = "pid_for_children",
.real_ns_name = "pid",
- .type = CLONE_NEWPID,
.get = pidns_for_children_get,
.put = pidns_put,
.install = pidns_install,
@@ -475,6 +475,7 @@ static __init int pid_namespaces_init(void)
#endif
register_pid_ns_sysctl_table_vm();
+ ns_tree_add(&init_pid_ns);
return 0;
}
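
Editor's sketch of the new helper's semantics (hypothetical caller; the prototype is assumed to live in a header elsewhere in the series): pidns_is_ancestor(child, ancestor) walks child->parent until the levels match and reports whether it arrived at @ancestor, so @ancestor being @child itself also counts.

	static bool example_within_init_hierarchy(struct pid_namespace *ns)
	{
		/* Always true: every pid namespace descends from init_pid_ns. */
		return pidns_is_ancestor(ns, &init_pid_ns);
	}
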
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index ea7995a25780..8df55397414a 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -553,6 +553,30 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
const struct em_data_callback *cb,
const cpumask_t *cpus, bool microwatts)
{
+ int ret = em_dev_register_pd_no_update(dev, nr_states, cb, cpus, microwatts);
+
+ if (_is_cpu_device(dev))
+ em_check_capacity_update();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);
+
+/**
+ * em_dev_register_pd_no_update() - Register a perf domain for a device
+ * @dev : Device to register the PD for
+ * @nr_states : Number of performance states in the new PD
+ * @cb : Callback functions for populating the energy model
+ * @cpus : CPUs to include in the new PD (mandatory if @dev is a CPU device)
+ * @microwatts : Whether or not the power values in the EM will be in uW
+ *
+ * Like em_dev_register_perf_domain(), but does not trigger a CPU capacity
+ * update after registering the PD, even if @dev is a CPU device.
+ */
+int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states,
+ const struct em_data_callback *cb,
+ const cpumask_t *cpus, bool microwatts)
+{
struct em_perf_table *em_table;
unsigned long cap, prev_cap = 0;
unsigned long flags = 0;
@@ -636,12 +660,9 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
unlock:
mutex_unlock(&em_pd_mutex);
- if (_is_cpu_device(dev))
- em_check_capacity_update();
-
return ret;
}
-EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);
+EXPORT_SYMBOL_GPL(em_dev_register_pd_no_update);
/**
* em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device
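
Editor's hedged usage sketch (hypothetical driver code): em_dev_register_pd_no_update() takes the same arguments as em_dev_register_perf_domain() but skips the CPU capacity re-evaluation, leaving that to the caller's own, possibly batched, follow-up path.

	static int example_register_cpu_em(struct device *cpu_dev,
					   unsigned int nr_states,
					   const struct em_data_callback *cb,
					   const struct cpumask *cpus)
	{
		/* true: power values are expressed in microwatts. */
		return em_dev_register_pd_no_update(cpu_dev, nr_states, cb,
						    cpus, true);
	}
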
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 1f1f30cca573..2f66ab453823 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -449,6 +449,7 @@ int hibernation_snapshot(int platform_mode)
shrink_shmem_memory();
console_suspend_all();
+ pm_restrict_gfp_mask();
error = dpm_suspend(PMSG_FREEZE);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 174ee243b349..8eff357b0436 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4262,6 +4262,8 @@ int rcutree_prepare_cpu(unsigned int cpu)
rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+ rcu_preempt_deferred_qs_init(rdp);
rcu_spawn_rnp_kthreads(rnp);
rcu_spawn_cpu_nocb_kthread(cpu);
ASSERT_EXCLUSIVE_WRITER(rcu_state.n_online_cpus);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index de6ca13a7b5f..b8bbe7960cda 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -488,6 +488,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp);
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
static void rcu_flavor_sched_clock_irq(int user);
static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);
+static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
static bool rcu_is_callbacks_kthread(struct rcu_data *rdp);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index fc14adf15cbb..4cd170b2d655 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -763,8 +763,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
cpu_online(rdp->cpu)) {
// Get scheduler to re-evaluate and call hooks.
// If !IRQ_WORK, FQS scan will eventually IPI.
- rdp->defer_qs_iw =
- IRQ_WORK_INIT_HARD(rcu_preempt_deferred_qs_handler);
rdp->defer_qs_iw_pending = DEFER_QS_PENDING;
irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
}
@@ -904,6 +902,10 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
}
}
+static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp)
+{
+ rdp->defer_qs_iw = IRQ_WORK_INIT_HARD(rcu_preempt_deferred_qs_handler);
+}
#else /* #ifdef CONFIG_PREEMPT_RCU */
/*
@@ -1103,6 +1105,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks));
}
+static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp) { }
+
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
/*
diff --git a/kernel/rseq.c b/kernel/rseq.c
index b7a1ec327e81..2452b7366b00 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -342,12 +342,12 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
/*
* Load and clear event mask atomically with respect to
- * scheduler preemption.
+ * scheduler preemption and membarrier IPIs.
*/
- preempt_disable();
- event_mask = t->rseq_event_mask;
- t->rseq_event_mask = 0;
- preempt_enable();
+ scoped_guard(RSEQ_EVENT_GUARD) {
+ event_mask = t->rseq_event_mask;
+ t->rseq_event_mask = 0;
+ }
return !!event_mask;
}
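
Editor's aside: RSEQ_EVENT_GUARD is defined elsewhere in the series; the pattern itself is the generic scoped_guard() from <linux/cleanup.h>, sketched here with the existing preempt guard class (illustrative only, hypothetical data):

	static u32 example_consume_flags(u32 *flags)
	{
		u32 val;

		/* The guard is dropped automatically when the block is left. */
		scoped_guard(preempt) {
			val = *flags;
			*flags = 0;
		}
		return val;
	}
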
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index c4a488e67aa7..755883faf751 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -58,6 +58,7 @@
#include "deadline.c"
#ifdef CONFIG_SCHED_CLASS_EXT
+# include "ext_internal.h"
# include "ext.c"
# include "ext_idle.c"
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index be00629f0ba4..ec126f85ff40 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7,6 +7,8 @@
* Copyright (C) 1991-2002 Linus Torvalds
* Copyright (C) 1998-2024 Ingo Molnar, Red Hat
*/
+#define INSTANTIATE_EXPORTED_MIGRATE_DISABLE
+#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/hrtimer_api.h>
#include <linux/ktime_api.h>
@@ -2381,28 +2383,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
__do_set_cpus_allowed(p, &ac);
}
-void migrate_disable(void)
-{
- struct task_struct *p = current;
-
- if (p->migration_disabled) {
-#ifdef CONFIG_DEBUG_PREEMPT
- /*
- *Warn about overflow half-way through the range.
- */
- WARN_ON_ONCE((s16)p->migration_disabled < 0);
-#endif
- p->migration_disabled++;
- return;
- }
-
- guard(preempt)();
- this_rq()->nr_pinned++;
- p->migration_disabled = 1;
-}
-EXPORT_SYMBOL_GPL(migrate_disable);
-
-void migrate_enable(void)
+void ___migrate_enable(void)
{
struct task_struct *p = current;
struct affinity_context ac = {
@@ -2410,35 +2391,19 @@ void migrate_enable(void)
.flags = SCA_MIGRATE_ENABLE,
};
-#ifdef CONFIG_DEBUG_PREEMPT
- /*
- * Check both overflow from migrate_disable() and superfluous
- * migrate_enable().
- */
- if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
- return;
-#endif
+ __set_cpus_allowed_ptr(p, &ac);
+}
+EXPORT_SYMBOL_GPL(___migrate_enable);
- if (p->migration_disabled > 1) {
- p->migration_disabled--;
- return;
- }
+void migrate_disable(void)
+{
+ __migrate_disable();
+}
+EXPORT_SYMBOL_GPL(migrate_disable);
- /*
- * Ensure stop_task runs either before or after this, and that
- * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
- */
- guard(preempt)();
- if (p->cpus_ptr != &p->cpus_mask)
- __set_cpus_allowed_ptr(p, &ac);
- /*
- * Mustn't clear migration_disabled() until cpus_ptr points back at the
- * regular cpus_mask, otherwise things that race (eg.
- * select_fallback_rq) get confused.
- */
- barrier();
- p->migration_disabled = 0;
- this_rq()->nr_pinned--;
+void migrate_enable(void)
+{
+ __migrate_enable();
}
EXPORT_SYMBOL_GPL(migrate_enable);
@@ -4472,7 +4437,7 @@ int wake_up_state(struct task_struct *p, unsigned int state)
* __sched_fork() is basic setup which is also used by sched_init() to
* initialize the boot CPU's idle task.
*/
-static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
+static void __sched_fork(u64 clone_flags, struct task_struct *p)
{
p->on_rq = 0;
@@ -4490,6 +4455,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
+#ifdef CONFIG_CFS_BANDWIDTH
+ init_cfs_throttle_work(p);
+#endif
#endif
#ifdef CONFIG_SCHEDSTATS
@@ -4707,7 +4675,7 @@ late_initcall(sched_core_sysctl_init);
/*
* fork()/clone()-time setup:
*/
-int sched_fork(unsigned long clone_flags, struct task_struct *p)
+int sched_fork(u64 clone_flags, struct task_struct *p)
{
__sched_fork(clone_flags, p);
/*
@@ -9362,8 +9330,6 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
cgroup_taskset_for_each(task, css, tset)
sched_move_task(task, false);
-
- scx_cgroup_finish_attach();
}
static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset)
@@ -9551,7 +9517,7 @@ static unsigned long tg_weight(struct task_group *tg)
#ifdef CONFIG_FAIR_GROUP_SCHED
return scale_load_down(tg->shares);
#else
- return sched_weight_from_cgroup(tg->scx_weight);
+ return sched_weight_from_cgroup(tg->scx.weight);
#endif
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index e2d51f4306b3..615411a0a881 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -875,7 +875,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
*/
if (dl_se->dl_defer && !dl_se->dl_defer_running &&
dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) {
- if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) {
+ if (!is_dl_boosted(dl_se)) {
/*
* Set dl_se->dl_defer_armed and dl_throttled variables to
@@ -1152,8 +1152,6 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf)
/* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */
static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC;
-static bool dl_server_stopped(struct sched_dl_entity *dl_se);
-
static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se)
{
struct rq *rq = rq_of_dl_se(dl_se);
@@ -1171,12 +1169,6 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_
if (!dl_se->dl_runtime)
return HRTIMER_NORESTART;
- if (!dl_se->server_has_tasks(dl_se)) {
- replenish_dl_entity(dl_se);
- dl_server_stopped(dl_se);
- return HRTIMER_NORESTART;
- }
-
if (dl_se->dl_defer_armed) {
/*
* First check if the server could consume runtime in background.
@@ -1496,10 +1488,12 @@ throttle:
}
if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) {
- if (dl_server(dl_se))
- enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH);
- else
+ if (dl_server(dl_se)) {
+ replenish_dl_new_period(dl_se, rq);
+ start_dl_timer(dl_se);
+ } else {
enqueue_task_dl(rq, dl_task_of(dl_se), ENQUEUE_REPLENISH);
+ }
}
if (!is_leftmost(dl_se, &rq->dl))
@@ -1577,10 +1571,8 @@ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p)
void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
{
/* 0 runtime = fair server disabled */
- if (dl_se->dl_runtime) {
- dl_se->dl_server_idle = 0;
+ if (dl_se->dl_runtime)
update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
- }
}
void dl_server_start(struct sched_dl_entity *dl_se)
@@ -1608,26 +1600,10 @@ void dl_server_stop(struct sched_dl_entity *dl_se)
dl_se->dl_server_active = 0;
}
-static bool dl_server_stopped(struct sched_dl_entity *dl_se)
-{
- if (!dl_se->dl_server_active)
- return false;
-
- if (dl_se->dl_server_idle) {
- dl_server_stop(dl_se);
- return true;
- }
-
- dl_se->dl_server_idle = 1;
- return false;
-}
-
void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
- dl_server_has_tasks_f has_tasks,
dl_server_pick_f pick_task)
{
dl_se->rq = rq;
- dl_se->server_has_tasks = has_tasks;
dl_se->server_pick_task = pick_task;
}
@@ -1849,7 +1825,9 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
u64 deadline = dl_se->deadline;
dl_rq->dl_nr_running++;
- add_nr_running(rq_of_dl_rq(dl_rq), 1);
+
+ if (!dl_server(dl_se))
+ add_nr_running(rq_of_dl_rq(dl_rq), 1);
inc_dl_deadline(dl_rq, deadline);
}
@@ -1859,7 +1837,9 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
WARN_ON(!dl_rq->dl_nr_running);
dl_rq->dl_nr_running--;
- sub_nr_running(rq_of_dl_rq(dl_rq), 1);
+
+ if (!dl_server(dl_se))
+ sub_nr_running(rq_of_dl_rq(dl_rq), 1);
dec_dl_deadline(dl_rq, dl_se->deadline);
}
@@ -2388,10 +2368,7 @@ again:
if (dl_server(dl_se)) {
p = dl_se->server_pick_task(dl_se);
if (!p) {
- if (!dl_server_stopped(dl_se)) {
- dl_se->dl_yielded = 1;
- update_curr_dl_se(rq, dl_se, 0);
- }
+ dl_server_stop(dl_se);
goto again;
}
rq->dl_server = dl_se;
@@ -2574,6 +2551,25 @@ static int find_later_rq(struct task_struct *task)
return -1;
}
+static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
+{
+ struct task_struct *p;
+
+ if (!has_pushable_dl_tasks(rq))
+ return NULL;
+
+ p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
+
+ WARN_ON_ONCE(rq->cpu != task_cpu(p));
+ WARN_ON_ONCE(task_current(rq, p));
+ WARN_ON_ONCE(p->nr_cpus_allowed <= 1);
+
+ WARN_ON_ONCE(!task_on_rq_queued(p));
+ WARN_ON_ONCE(!dl_task(p));
+
+ return p;
+}
+
/* Locks the rq it finds */
static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
{
@@ -2601,12 +2597,37 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
/* Retry if something changed. */
if (double_lock_balance(rq, later_rq)) {
- if (unlikely(task_rq(task) != rq ||
+ /*
+ * double_lock_balance() had to release rq->lock; in the
+ * meantime the task may no longer be fit to be migrated.
+ * Check the following to ensure that the task is
+ * still suitable for migration:
+ * 1. It is possible the task was scheduled,
+ * migrate_disable() was called and it then got
+ * preempted, so the migration-disabled flag must
+ * be checked.
+ * 2. The picked CPU is still in the task's affinity mask.
+ * 3. For throttled task (dl_task_offline_migration),
+ * check the following:
+ * - the task is not on the rq anymore (it was
+ * migrated)
+ * - the task is not on CPU anymore
+ * - the task is still a dl task
+ * - the task is not queued on the rq anymore
+ * 4. For the non-throttled task (push_dl_task), the
+ * check to ensure that this task is still at the
+ * head of the pushable tasks list is enough.
+ */
+ if (unlikely(is_migration_disabled(task) ||
!cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) ||
- task_on_cpu(rq, task) ||
- !dl_task(task) ||
- is_migration_disabled(task) ||
- !task_on_rq_queued(task))) {
+ (task->dl.dl_throttled &&
+ (task_rq(task) != rq ||
+ task_on_cpu(rq, task) ||
+ !dl_task(task) ||
+ !task_on_rq_queued(task))) ||
+ (!task->dl.dl_throttled &&
+ task != pick_next_pushable_dl_task(rq)))) {
+
double_unlock_balance(rq, later_rq);
later_rq = NULL;
break;
@@ -2629,25 +2650,6 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
return later_rq;
}
-static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
-{
- struct task_struct *p;
-
- if (!has_pushable_dl_tasks(rq))
- return NULL;
-
- p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
-
- WARN_ON_ONCE(rq->cpu != task_cpu(p));
- WARN_ON_ONCE(task_current(rq, p));
- WARN_ON_ONCE(p->nr_cpus_allowed <= 1);
-
- WARN_ON_ONCE(!task_on_rq_queued(p));
- WARN_ON_ONCE(!dl_task(p));
-
- return p;
-}
-
/*
* See if the non running -deadline tasks on this rq
* can be sent to some other CPU where they can preempt
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 3f06ab84d53f..02e16b70a790 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -376,10 +376,8 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
return -EINVAL;
}
- if (rq->cfs.h_nr_queued) {
- update_rq_clock(rq);
- dl_server_stop(&rq->fair_server);
- }
+ update_rq_clock(rq);
+ dl_server_stop(&rq->fair_server);
retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0);
if (retval)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 7dedc9a16281..2b0e88206d07 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -9,1040 +9,6 @@
#include <linux/btf_ids.h>
#include "ext_idle.h"
-#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
-
-enum scx_consts {
- SCX_DSP_DFL_MAX_BATCH = 32,
- SCX_DSP_MAX_LOOPS = 32,
- SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ,
-
- SCX_EXIT_BT_LEN = 64,
- SCX_EXIT_MSG_LEN = 1024,
- SCX_EXIT_DUMP_DFL_LEN = 32768,
-
- SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE,
-
- /*
- * Iterating all tasks may take a while. Periodically drop
- * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
- */
- SCX_TASK_ITER_BATCH = 32,
-};
-
-enum scx_exit_kind {
- SCX_EXIT_NONE,
- SCX_EXIT_DONE,
-
- SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */
- SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */
- SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */
- SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */
-
- SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */
- SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */
- SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */
-};
-
-/*
- * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(),
- * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes
- * are 64bit of the format:
- *
- * Bits: [63 .. 48 47 .. 32 31 .. 0]
- * [ SYS ACT ] [ SYS RSN ] [ USR ]
- *
- * SYS ACT: System-defined exit actions
- * SYS RSN: System-defined exit reasons
- * USR : User-defined exit codes and reasons
- *
- * Using the above, users may communicate intention and context by ORing system
- * actions and/or system reasons with a user-defined exit code.
- */
-enum scx_exit_code {
- /* Reasons */
- SCX_ECODE_RSN_HOTPLUG = 1LLU << 32,
-
- /* Actions */
- SCX_ECODE_ACT_RESTART = 1LLU << 48,
-};
-
-/*
- * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is
- * being disabled.
- */
-struct scx_exit_info {
- /* %SCX_EXIT_* - broad category of the exit reason */
- enum scx_exit_kind kind;
-
- /* exit code if gracefully exiting */
- s64 exit_code;
-
- /* textual representation of the above */
- const char *reason;
-
- /* backtrace if exiting due to an error */
- unsigned long *bt;
- u32 bt_len;
-
- /* informational message */
- char *msg;
-
- /* debug dump */
- char *dump;
-};
-
-/* sched_ext_ops.flags */
-enum scx_ops_flags {
- /*
- * Keep built-in idle tracking even if ops.update_idle() is implemented.
- */
- SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
-
- /*
- * By default, if there are no other task to run on the CPU, ext core
- * keeps running the current task even after its slice expires. If this
- * flag is specified, such tasks are passed to ops.enqueue() with
- * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
- */
- SCX_OPS_ENQ_LAST = 1LLU << 1,
-
- /*
- * An exiting task may schedule after PF_EXITING is set. In such cases,
- * bpf_task_from_pid() may not be able to find the task and if the BPF
- * scheduler depends on pid lookup for dispatching, the task will be
- * lost leading to various issues including RCU grace period stalls.
- *
- * To mask this problem, by default, unhashed tasks are automatically
- * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't
- * depend on pid lookups and wants to handle these tasks directly, the
- * following flag can be used.
- */
- SCX_OPS_ENQ_EXITING = 1LLU << 2,
-
- /*
- * If set, only tasks with policy set to SCHED_EXT are attached to
- * sched_ext. If clear, SCHED_NORMAL tasks are also included.
- */
- SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
-
- /*
- * A migration disabled task can only execute on its current CPU. By
- * default, such tasks are automatically put on the CPU's local DSQ with
- * the default slice on enqueue. If this ops flag is set, they also go
- * through ops.enqueue().
- *
- * A migration disabled task never invokes ops.select_cpu() as it can
- * only select the current CPU. Also, p->cpus_ptr will only contain its
- * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
- * and thus may disagree with cpumask_weight(p->cpus_ptr).
- */
- SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
-
- /*
- * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes
- * ops.enqueue() on the ops.select_cpu() selected or the wakee's
- * previous CPU via IPI (inter-processor interrupt) to reduce cacheline
- * transfers. When this optimization is enabled, ops.select_cpu() is
- * skipped in some cases (when racing against the wakee switching out).
- * As the BPF scheduler may depend on ops.select_cpu() being invoked
- * during wakeups, queued wakeup is disabled by default.
- *
- * If this ops flag is set, queued wakeup optimization is enabled and
- * the BPF scheduler must be able to handle ops.enqueue() invoked on the
- * wakee's CPU without preceding ops.select_cpu() even for tasks which
- * may be executed on multiple CPUs.
- */
- SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5,
-
- /*
- * If set, enable per-node idle cpumasks. If clear, use a single global
- * flat idle cpumask.
- */
- SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6,
-
- /*
- * CPU cgroup support flags
- */
- SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */
-
- SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
- SCX_OPS_ENQ_LAST |
- SCX_OPS_ENQ_EXITING |
- SCX_OPS_ENQ_MIGRATION_DISABLED |
- SCX_OPS_ALLOW_QUEUED_WAKEUP |
- SCX_OPS_SWITCH_PARTIAL |
- SCX_OPS_BUILTIN_IDLE_PER_NODE |
- SCX_OPS_HAS_CGROUP_WEIGHT,
-
- /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */
- __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56,
-
- SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56,
-};
-
-/* argument container for ops.init_task() */
-struct scx_init_task_args {
- /*
- * Set if ops.init_task() is being invoked on the fork path, as opposed
- * to the scheduler transition path.
- */
- bool fork;
-#ifdef CONFIG_EXT_GROUP_SCHED
- /* the cgroup the task is joining */
- struct cgroup *cgroup;
-#endif
-};
-
-/* argument container for ops.exit_task() */
-struct scx_exit_task_args {
- /* Whether the task exited before running on sched_ext. */
- bool cancelled;
-};
-
-/* argument container for ops->cgroup_init() */
-struct scx_cgroup_init_args {
- /* the weight of the cgroup [1..10000] */
- u32 weight;
-
- /* bandwidth control parameters from cpu.max and cpu.max.burst */
- u64 bw_period_us;
- u64 bw_quota_us;
- u64 bw_burst_us;
-};
-
-enum scx_cpu_preempt_reason {
- /* next task is being scheduled by &sched_class_rt */
- SCX_CPU_PREEMPT_RT,
- /* next task is being scheduled by &sched_class_dl */
- SCX_CPU_PREEMPT_DL,
- /* next task is being scheduled by &sched_class_stop */
- SCX_CPU_PREEMPT_STOP,
- /* unknown reason for SCX being preempted */
- SCX_CPU_PREEMPT_UNKNOWN,
-};
-
-/*
- * Argument container for ops->cpu_acquire(). Currently empty, but may be
- * expanded in the future.
- */
-struct scx_cpu_acquire_args {};
-
-/* argument container for ops->cpu_release() */
-struct scx_cpu_release_args {
- /* the reason the CPU was preempted */
- enum scx_cpu_preempt_reason reason;
-
- /* the task that's going to be scheduled on the CPU */
- struct task_struct *task;
-};
-
-/*
- * Informational context provided to dump operations.
- */
-struct scx_dump_ctx {
- enum scx_exit_kind kind;
- s64 exit_code;
- const char *reason;
- u64 at_ns;
- u64 at_jiffies;
-};
-
-/**
- * struct sched_ext_ops - Operation table for BPF scheduler implementation
- *
- * A BPF scheduler can implement an arbitrary scheduling policy by
- * implementing and loading operations in this table. Note that a userland
- * scheduling policy can also be implemented using the BPF scheduler
- * as a shim layer.
- */
-struct sched_ext_ops {
- /**
- * @select_cpu: Pick the target CPU for a task which is being woken up
- * @p: task being woken up
- * @prev_cpu: the cpu @p was on before sleeping
- * @wake_flags: SCX_WAKE_*
- *
- * Decision made here isn't final. @p may be moved to any CPU while it
- * is getting dispatched for execution later. However, as @p is not on
- * the rq at this point, getting the eventual execution CPU right here
- * saves a small bit of overhead down the line.
- *
- * If an idle CPU is returned, the CPU is kicked and will try to
- * dispatch. While an explicit custom mechanism can be added,
- * select_cpu() serves as the default way to wake up idle CPUs.
- *
- * @p may be inserted into a DSQ directly by calling
- * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped.
- * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ
- * of the CPU returned by this operation.
- *
- * Note that select_cpu() is never called for tasks that can only run
- * on a single CPU or tasks with migration disabled, as they don't have
- * the option to select a different CPU. See select_task_rq() for
- * details.
- */
- s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);
-
- /**
- * @enqueue: Enqueue a task on the BPF scheduler
- * @p: task being enqueued
- * @enq_flags: %SCX_ENQ_*
- *
- * @p is ready to run. Insert directly into a DSQ by calling
- * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly
- * inserted, the bpf scheduler owns @p and if it fails to dispatch @p,
- * the task will stall.
- *
- * If @p was inserted into a DSQ from ops.select_cpu(), this callback is
- * skipped.
- */
- void (*enqueue)(struct task_struct *p, u64 enq_flags);
-
- /**
- * @dequeue: Remove a task from the BPF scheduler
- * @p: task being dequeued
- * @deq_flags: %SCX_DEQ_*
- *
- * Remove @p from the BPF scheduler. This is usually called to isolate
- * the task while updating its scheduling properties (e.g. priority).
- *
- * The ext core keeps track of whether the BPF side owns a given task or
- * not and can gracefully ignore spurious dispatches from BPF side,
- * which makes it safe to not implement this method. However, depending
- * on the scheduling logic, this can lead to confusing behaviors - e.g.
- * scheduling position not being updated across a priority change.
- */
- void (*dequeue)(struct task_struct *p, u64 deq_flags);
-
- /**
- * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs
- * @cpu: CPU to dispatch tasks for
- * @prev: previous task being switched out
- *
- * Called when a CPU's local dsq is empty. The operation should dispatch
- * one or more tasks from the BPF scheduler into the DSQs using
- * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ
- * using scx_bpf_dsq_move_to_local().
- *
- * The maximum number of times scx_bpf_dsq_insert() can be called
- * without an intervening scx_bpf_dsq_move_to_local() is specified by
- * ops.dispatch_max_batch. See the comments on top of the two functions
- * for more details.
- *
- * When not %NULL, @prev is an SCX task with its slice depleted. If
- * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in
- * @prev->scx.flags, it is not enqueued yet and will be enqueued after
- * ops.dispatch() returns. To keep executing @prev, return without
- * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST.
- */
- void (*dispatch)(s32 cpu, struct task_struct *prev);
-
- /**
- * @tick: Periodic tick
- * @p: task running currently
- *
- * This operation is called every 1/HZ seconds on CPUs which are
- * executing an SCX task. Setting @p->scx.slice to 0 will trigger an
- * immediate dispatch cycle on the CPU.
- */
- void (*tick)(struct task_struct *p);
-
- /**
- * @runnable: A task is becoming runnable on its associated CPU
- * @p: task becoming runnable
- * @enq_flags: %SCX_ENQ_*
- *
- * This and the following three functions can be used to track a task's
- * execution state transitions. A task becomes ->runnable() on a CPU,
- * and then goes through one or more ->running() and ->stopping() pairs
- * as it runs on the CPU, and eventually becomes ->quiescent() when it's
- * done running on the CPU.
- *
- * @p is becoming runnable on the CPU because it's
- *
- * - waking up (%SCX_ENQ_WAKEUP)
- * - being moved from another CPU
- * - being restored after temporarily taken off the queue for an
- * attribute change.
- *
- * This and ->enqueue() are related but not coupled. This operation
- * notifies @p's state transition and may not be followed by ->enqueue()
- * e.g. when @p is being dispatched to a remote CPU, or when @p is
- * being enqueued on a CPU experiencing a hotplug event. Likewise, a
- * task may be ->enqueue()'d without being preceded by this operation
- * e.g. after exhausting its slice.
- */
- void (*runnable)(struct task_struct *p, u64 enq_flags);
-
- /**
- * @running: A task is starting to run on its associated CPU
- * @p: task starting to run
- *
- * Note that this callback may be called from a CPU other than the
- * one the task is going to run on. This can happen when a task
- * property is changed (i.e., affinity), since scx_next_task_scx(),
- * which triggers this callback, may run on a CPU different from
- * the task's assigned CPU.
- *
- * Therefore, always use scx_bpf_task_cpu(@p) to determine the
- * target CPU the task is going to use.
- *
- * See ->runnable() for explanation on the task state notifiers.
- */
- void (*running)(struct task_struct *p);
-
- /**
- * @stopping: A task is stopping execution
- * @p: task stopping to run
- * @runnable: is task @p still runnable?
- *
- * Note that this callback may be called from a CPU other than the
- * one the task was running on. This can happen when a task
- * property is changed (i.e., affinity), since dequeue_task_scx(),
- * which triggers this callback, may run on a CPU different from
- * the task's assigned CPU.
- *
- * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU
- * the task was running on.
- *
- * See ->runnable() for explanation on the task state notifiers. If
- * !@runnable, ->quiescent() will be invoked after this operation
- * returns.
- */
- void (*stopping)(struct task_struct *p, bool runnable);
-
- /**
- * @quiescent: A task is becoming not runnable on its associated CPU
- * @p: task becoming not runnable
- * @deq_flags: %SCX_DEQ_*
- *
- * See ->runnable() for explanation on the task state notifiers.
- *
- * @p is becoming quiescent on the CPU because it's
- *
- * - sleeping (%SCX_DEQ_SLEEP)
- * - being moved to another CPU
- * - being temporarily taken off the queue for an attribute change
- * (%SCX_DEQ_SAVE)
- *
- * This and ->dequeue() are related but not coupled. This operation
- * notifies @p's state transition and may not be preceded by ->dequeue()
- * e.g. when @p is being dispatched to a remote CPU.
- */
- void (*quiescent)(struct task_struct *p, u64 deq_flags);
-
- /**
- * @yield: Yield CPU
- * @from: yielding task
- * @to: optional yield target task
- *
- * If @to is NULL, @from is yielding the CPU to other runnable tasks.
- * The BPF scheduler should ensure that other available tasks are
- * dispatched before the yielding task. Return value is ignored in this
- * case.
- *
- * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf
- * scheduler can implement the request, return %true; otherwise, %false.
- */
- bool (*yield)(struct task_struct *from, struct task_struct *to);
-
- /**
- * @core_sched_before: Task ordering for core-sched
- * @a: task A
- * @b: task B
- *
- * Used by core-sched to determine the ordering between two tasks. See
- * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on
- * core-sched.
- *
- * Both @a and @b are runnable and may or may not currently be queued on
- * the BPF scheduler. Should return %true if @a should run before @b.
- * %false if there's no required ordering or @b should run before @a.
- *
- * If not specified, the default is ordering them according to when they
- * became runnable.
- */
- bool (*core_sched_before)(struct task_struct *a, struct task_struct *b);
-
- /**
- * @set_weight: Set task weight
- * @p: task to set weight for
- * @weight: new weight [1..10000]
- *
- * Update @p's weight to @weight.
- */
- void (*set_weight)(struct task_struct *p, u32 weight);
-
- /**
- * @set_cpumask: Set CPU affinity
- * @p: task to set CPU affinity for
- * @cpumask: cpumask of cpus that @p can run on
- *
- * Update @p's CPU affinity to @cpumask.
- */
- void (*set_cpumask)(struct task_struct *p,
- const struct cpumask *cpumask);
-
- /**
- * @update_idle: Update the idle state of a CPU
- * @cpu: CPU to update the idle state for
- * @idle: whether entering or exiting the idle state
- *
- * This operation is called when @rq's CPU goes or leaves the idle
- * state. By default, implementing this operation disables the built-in
- * idle CPU tracking and the following helpers become unavailable:
- *
- * - scx_bpf_select_cpu_dfl()
- * - scx_bpf_select_cpu_and()
- * - scx_bpf_test_and_clear_cpu_idle()
- * - scx_bpf_pick_idle_cpu()
- *
- * The user also must implement ops.select_cpu() as the default
- * implementation relies on scx_bpf_select_cpu_dfl().
- *
- * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
- * tracking.
- */
- void (*update_idle)(s32 cpu, bool idle);
-
- /**
- * @cpu_acquire: A CPU is becoming available to the BPF scheduler
- * @cpu: The CPU being acquired by the BPF scheduler.
- * @args: Acquire arguments, see the struct definition.
- *
- * A CPU that was previously released from the BPF scheduler is now once
- * again under its control.
- */
- void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
-
- /**
- * @cpu_release: A CPU is taken away from the BPF scheduler
- * @cpu: The CPU being released by the BPF scheduler.
- * @args: Release arguments, see the struct definition.
- *
- * The specified CPU is no longer under the control of the BPF
- * scheduler. This could be because it was preempted by a higher
- * priority sched_class, though there may be other reasons as well. The
- * caller should consult @args->reason to determine the cause.
- */
- void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
-
- /**
- * @init_task: Initialize a task to run in a BPF scheduler
- * @p: task to initialize for BPF scheduling
- * @args: init arguments, see the struct definition
- *
- * Either we're loading a BPF scheduler or a new task is being forked.
- * Initialize @p for BPF scheduling. This operation may block and can
- * be used for allocations, and is called exactly once for a task.
- *
- * Return 0 for success, -errno for failure. An error return while
- * loading will abort loading of the BPF scheduler. During a fork, it
- * will abort that specific fork.
- */
- s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args);
-
- /**
- * @exit_task: Exit a previously-running task from the system
- * @p: task to exit
- * @args: exit arguments, see the struct definition
- *
- * @p is exiting or the BPF scheduler is being unloaded. Perform any
- * necessary cleanup for @p.
- */
- void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args);
-
- /**
- * @enable: Enable BPF scheduling for a task
- * @p: task to enable BPF scheduling for
- *
- * Enable @p for BPF scheduling. enable() is called on @p any time it
- * enters SCX, and is always paired with a matching disable().
- */
- void (*enable)(struct task_struct *p);
-
- /**
- * @disable: Disable BPF scheduling for a task
- * @p: task to disable BPF scheduling for
- *
- * @p is exiting, leaving SCX or the BPF scheduler is being unloaded.
- * Disable BPF scheduling for @p. A disable() call is always matched
- * with a prior enable() call.
- */
- void (*disable)(struct task_struct *p);
-
- /**
- * @dump: Dump BPF scheduler state on error
- * @ctx: debug dump context
- *
- * Use scx_bpf_dump() to generate BPF scheduler specific debug dump.
- */
- void (*dump)(struct scx_dump_ctx *ctx);
-
- /**
- * @dump_cpu: Dump BPF scheduler state for a CPU on error
- * @ctx: debug dump context
- * @cpu: CPU to generate debug dump for
- * @idle: @cpu is currently idle without any runnable tasks
- *
- * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
- * @cpu. If @idle is %true and this operation doesn't produce any
- * output, @cpu is skipped for dump.
- */
- void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle);
-
- /**
- * @dump_task: Dump BPF scheduler state for a runnable task on error
- * @ctx: debug dump context
- * @p: runnable task to generate debug dump for
- *
- * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
- * @p.
- */
- void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p);
-
-#ifdef CONFIG_EXT_GROUP_SCHED
- /**
- * @cgroup_init: Initialize a cgroup
- * @cgrp: cgroup being initialized
- * @args: init arguments, see the struct definition
- *
- * Either the BPF scheduler is being loaded or @cgrp created, initialize
- * @cgrp for sched_ext. This operation may block.
- *
- * Return 0 for success, -errno for failure. An error return while
- * loading will abort loading of the BPF scheduler. During cgroup
- * creation, it will abort the specific cgroup creation.
- */
- s32 (*cgroup_init)(struct cgroup *cgrp,
- struct scx_cgroup_init_args *args);
-
- /**
- * @cgroup_exit: Exit a cgroup
- * @cgrp: cgroup being exited
- *
- * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit
- * @cgrp for sched_ext. This operation my block.
- */
- void (*cgroup_exit)(struct cgroup *cgrp);
-
- /**
- * @cgroup_prep_move: Prepare a task to be moved to a different cgroup
- * @p: task being moved
- * @from: cgroup @p is being moved from
- * @to: cgroup @p is being moved to
- *
- * Prepare @p for move from cgroup @from to @to. This operation may
- * block and can be used for allocations.
- *
- * Return 0 for success, -errno for failure. An error return aborts the
- * migration.
- */
- s32 (*cgroup_prep_move)(struct task_struct *p,
- struct cgroup *from, struct cgroup *to);
-
- /**
- * @cgroup_move: Commit cgroup move
- * @p: task being moved
- * @from: cgroup @p is being moved from
- * @to: cgroup @p is being moved to
- *
- * Commit the move. @p is dequeued during this operation.
- */
- void (*cgroup_move)(struct task_struct *p,
- struct cgroup *from, struct cgroup *to);
-
- /**
- * @cgroup_cancel_move: Cancel cgroup move
- * @p: task whose cgroup move is being canceled
- * @from: cgroup @p was being moved from
- * @to: cgroup @p was being moved to
- *
- * @p was cgroup_prep_move()'d but failed before reaching cgroup_move().
- * Undo the preparation.
- */
- void (*cgroup_cancel_move)(struct task_struct *p,
- struct cgroup *from, struct cgroup *to);
-
- /**
- * @cgroup_set_weight: A cgroup's weight is being changed
- * @cgrp: cgroup whose weight is being updated
- * @weight: new weight [1..10000]
- *
- * Update @cgrp's weight to @weight.
- */
- void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
-
- /**
- * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed
- * @cgrp: cgroup whose bandwidth is being updated
- * @period_us: bandwidth control period
- * @quota_us: bandwidth control quota
- * @burst_us: bandwidth control burst
- *
- * Update @cgrp's bandwidth control parameters. This is from the cpu.max
- * cgroup interface.
- *
- * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled
- * to. For example, if @period_us is 1_000_000 and @quota_us is
- * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be
- * interpreted in the same fashion and specifies how much @cgrp can
- * burst temporarily. The specific control mechanism and thus the
- * interpretation of @period_us and burstiness is upto to the BPF
- * scheduler.
- */
- void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
- u64 period_us, u64 quota_us, u64 burst_us);
-
-#endif /* CONFIG_EXT_GROUP_SCHED */
-
- /*
- * All online ops must come before ops.cpu_online().
- */
-
- /**
- * @cpu_online: A CPU became online
- * @cpu: CPU which just came up
- *
- * @cpu just came online. @cpu will not call ops.enqueue() or
- * ops.dispatch(), nor run tasks associated with other CPUs beforehand.
- */
- void (*cpu_online)(s32 cpu);
-
- /**
- * @cpu_offline: A CPU is going offline
- * @cpu: CPU which is going offline
- *
- * @cpu is going offline. @cpu will not call ops.enqueue() or
- * ops.dispatch(), nor run tasks associated with other CPUs afterwards.
- */
- void (*cpu_offline)(s32 cpu);
-
- /*
- * All CPU hotplug ops must come before ops.init().
- */
-
- /**
- * @init: Initialize the BPF scheduler
- */
- s32 (*init)(void);
-
- /**
- * @exit: Clean up after the BPF scheduler
- * @info: Exit info
- *
- * ops.exit() is also called on ops.init() failure, which is a bit
- * unusual. This is to allow rich reporting through @info on how
- * ops.init() failed.
- */
- void (*exit)(struct scx_exit_info *info);
-
- /**
- * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch
- */
- u32 dispatch_max_batch;
-
- /**
- * @flags: %SCX_OPS_* flags
- */
- u64 flags;
-
- /**
- * @timeout_ms: The maximum amount of time, in milliseconds, that a
- * runnable task should be able to wait before being scheduled. The
- * maximum timeout may not exceed the default timeout of 30 seconds.
- *
- * Defaults to the maximum allowed timeout value of 30 seconds.
- */
- u32 timeout_ms;
-
- /**
- * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default
- * value of 32768 is used.
- */
- u32 exit_dump_len;
-
- /**
- * @hotplug_seq: A sequence number that may be set by the scheduler to
- * detect when a hotplug event has occurred during the loading process.
- * If 0, no detection occurs. Otherwise, the scheduler will fail to
- * load if the sequence number does not match @scx_hotplug_seq on the
- * enable path.
- */
- u64 hotplug_seq;
-
- /**
- * @name: BPF scheduler's name
- *
- * Must be a non-zero valid BPF object name including only isalnum(),
- * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the
- * BPF scheduler is enabled.
- */
- char name[SCX_OPS_NAME_LEN];
-
- /* internal use only, must be NULL */
- void *priv;
-};
-
-enum scx_opi {
- SCX_OPI_BEGIN = 0,
- SCX_OPI_NORMAL_BEGIN = 0,
- SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online),
- SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online),
- SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init),
- SCX_OPI_END = SCX_OP_IDX(init),
-};
-
-/*
- * Collection of event counters. Event types are placed in descending order.
- */
-struct scx_event_stats {
- /*
- * If ops.select_cpu() returns a CPU which can't be used by the task,
- * the core scheduler code silently picks a fallback CPU.
- */
- s64 SCX_EV_SELECT_CPU_FALLBACK;
-
- /*
- * When dispatching to a local DSQ, the CPU may have gone offline in
- * the meantime. In this case, the task is bounced to the global DSQ.
- */
- s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
-
- /*
- * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
- * continued to run because there were no other tasks on the CPU.
- */
- s64 SCX_EV_DISPATCH_KEEP_LAST;
-
- /*
- * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
- * is dispatched to a local DSQ when exiting.
- */
- s64 SCX_EV_ENQ_SKIP_EXITING;
-
- /*
- * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
- * migration disabled task skips ops.enqueue() and is dispatched to its
- * local DSQ.
- */
- s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
-
- /*
- * Total number of times a task's time slice was refilled with the
- * default value (SCX_SLICE_DFL).
- */
- s64 SCX_EV_REFILL_SLICE_DFL;
-
- /*
- * The total duration of bypass modes in nanoseconds.
- */
- s64 SCX_EV_BYPASS_DURATION;
-
- /*
- * The number of tasks dispatched in the bypassing mode.
- */
- s64 SCX_EV_BYPASS_DISPATCH;
-
- /*
- * The number of times the bypassing mode has been activated.
- */
- s64 SCX_EV_BYPASS_ACTIVATE;
-};
-
-struct scx_sched {
- struct sched_ext_ops ops;
- DECLARE_BITMAP(has_op, SCX_OPI_END);
-
- /*
- * Dispatch queues.
- *
- * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability.
- * This is to avoid live-locking in bypass mode where all tasks are
- * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If
- * per-node split isn't sufficient, it can be further split.
- */
- struct rhashtable dsq_hash;
- struct scx_dispatch_q **global_dsqs;
-
- /*
- * The event counters are in a per-CPU variable to minimize the
- * accounting overhead. A system-wide view on the event counter is
- * constructed when requested by scx_bpf_events().
- */
- struct scx_event_stats __percpu *event_stats_cpu;
-
- bool warned_zero_slice;
-
- atomic_t exit_kind;
- struct scx_exit_info *exit_info;
-
- struct kobject kobj;
-
- struct kthread_worker *helper;
- struct irq_work error_irq_work;
- struct kthread_work disable_work;
- struct rcu_work rcu_work;
-};
-
-enum scx_wake_flags {
- /* expose select WF_* flags as enums */
- SCX_WAKE_FORK = WF_FORK,
- SCX_WAKE_TTWU = WF_TTWU,
- SCX_WAKE_SYNC = WF_SYNC,
-};
-
-enum scx_enq_flags {
- /* expose select ENQUEUE_* flags as enums */
- SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP,
- SCX_ENQ_HEAD = ENQUEUE_HEAD,
- SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED,
-
- /* high 32bits are SCX specific */
-
- /*
- * Set the following to trigger preemption when calling
- * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the
- * current task is cleared to zero and the CPU is kicked into the
- * scheduling path. Implies %SCX_ENQ_HEAD.
- */
- SCX_ENQ_PREEMPT = 1LLU << 32,
-
- /*
- * The task being enqueued was previously enqueued on the current CPU's
- * %SCX_DSQ_LOCAL, but was removed from it in a call to the
- * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was
- * invoked in a ->cpu_release() callback, and the task is again
- * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the
- * task will not be scheduled on the CPU until at least the next invocation
- * of the ->cpu_acquire() callback.
- */
- SCX_ENQ_REENQ = 1LLU << 40,
-
- /*
- * The task being enqueued is the only task available for the cpu. By
- * default, ext core keeps executing such tasks but when
- * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the
- * %SCX_ENQ_LAST flag set.
- *
- * The BPF scheduler is responsible for triggering a follow-up
- * scheduling event. Otherwise, Execution may stall.
- */
- SCX_ENQ_LAST = 1LLU << 41,
-
- /* high 8 bits are internal */
- __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56,
-
- SCX_ENQ_CLEAR_OPSS = 1LLU << 56,
- SCX_ENQ_DSQ_PRIQ = 1LLU << 57,
-};
-
-enum scx_deq_flags {
- /* expose select DEQUEUE_* flags as enums */
- SCX_DEQ_SLEEP = DEQUEUE_SLEEP,
-
- /* high 32bits are SCX specific */
-
- /*
- * The generic core-sched layer decided to execute the task even though
- * it hasn't been dispatched yet. Dequeue from the BPF side.
- */
- SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32,
-};
-
-enum scx_pick_idle_cpu_flags {
- SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */
- SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */
-};
-
-enum scx_kick_flags {
- /*
- * Kick the target CPU if idle. Guarantees that the target CPU goes
- * through at least one full scheduling cycle before going idle. If the
- * target CPU can be determined to be currently not idle and going to go
- * through a scheduling cycle before going idle, noop.
- */
- SCX_KICK_IDLE = 1LLU << 0,
-
- /*
- * Preempt the current task and execute the dispatch path. If the
- * current task of the target CPU is an SCX task, its ->scx.slice is
- * cleared to zero before the scheduling path is invoked so that the
- * task expires and the dispatch path is invoked.
- */
- SCX_KICK_PREEMPT = 1LLU << 1,
-
- /*
- * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will
- * return after the target CPU finishes picking the next task.
- */
- SCX_KICK_WAIT = 1LLU << 2,
-};
-
-enum scx_tg_flags {
- SCX_TG_ONLINE = 1U << 0,
- SCX_TG_INITED = 1U << 1,
-};
-
-enum scx_enable_state {
- SCX_ENABLING,
- SCX_ENABLED,
- SCX_DISABLING,
- SCX_DISABLED,
-};
-
-static const char *scx_enable_state_str[] = {
- [SCX_ENABLING] = "enabling",
- [SCX_ENABLED] = "enabled",
- [SCX_DISABLING] = "disabling",
- [SCX_DISABLED] = "disabled",
-};
-
-/*
- * sched_ext_entity->ops_state
- *
- * Used to track the task ownership between the SCX core and the BPF scheduler.
- * State transitions look as follows:
- *
- * NONE -> QUEUEING -> QUEUED -> DISPATCHING
- * ^ | |
- * | v v
- * \-------------------------------/
- *
- * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
- * sites for explanations on the conditions being waited upon and why they are
- * safe. Transitions out of them into NONE or QUEUED must store_release and the
- * waiters should load_acquire.
- *
- * Tracking scx_ops_state enables sched_ext core to reliably determine whether
- * any given task can be dispatched by the BPF scheduler at all times and thus
- * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
- * to try to dispatch any task anytime regardless of its state as the SCX core
- * can safely reject invalid dispatches.
- */
-enum scx_ops_state {
- SCX_OPSS_NONE, /* owned by the SCX core */
- SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */
- SCX_OPSS_QUEUED, /* owned by the BPF scheduler */
- SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */
-
- /*
- * QSEQ brands each QUEUED instance so that, when dispatch races
- * dequeue/requeue, the dispatcher can tell whether it still has a claim
- * on the task being dispatched.
- *
- * As some 32bit archs can't do 64bit store_release/load_acquire,
- * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on
- * 32bit machines. The dispatch race window QSEQ protects is very narrow
- * and runs with IRQ disabled. 30 bits should be sufficient.
- */
- SCX_OPSS_QSEQ_SHIFT = 2,
-};
-
-/* Use macros to ensure that the type is unsigned long for the masks */
-#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
-#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK)
-
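The masks above pack a 2-bit ownership state and a monotonically increasing QSEQ into one word so that both can be published with a single store_release. A minimal user-space sketch of the same packing (names and the main() driver are illustrative, not kernel code):

#include <assert.h>
#include <stdio.h>

#define OPSS_QSEQ_SHIFT  2UL
#define OPSS_STATE_MASK  ((1UL << OPSS_QSEQ_SHIFT) - 1)
#define OPSS_QSEQ_MASK   (~OPSS_STATE_MASK)

enum opss_state { OPSS_NONE, OPSS_QUEUEING, OPSS_QUEUED, OPSS_DISPATCHING };

/* pack the state into the low bits and the queue sequence into the rest */
static unsigned long opss_pack(enum opss_state state, unsigned long qseq)
{
        return (qseq << OPSS_QSEQ_SHIFT) | state;
}

static enum opss_state opss_get_state(unsigned long opss)
{
        return opss & OPSS_STATE_MASK;
}

static unsigned long opss_get_qseq(unsigned long opss)
{
        return opss >> OPSS_QSEQ_SHIFT;
}

int main(void)
{
        unsigned long opss = opss_pack(OPSS_QUEUED, 41);

        /* a dispatcher that cached qseq 40 would notice it lost its claim */
        assert(opss_get_state(opss) == OPSS_QUEUED);
        assert(opss_get_qseq(opss) != 40);
        printf("state=%d qseq=%lu\n", opss_get_state(opss), opss_get_qseq(opss));
        return 0;
}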
/*
* NOTE: sched_ext is in the process of growing multiple scheduler support and
* scx_root usage is in a transitional state. Naked dereferences are safe if the
@@ -1170,7 +136,7 @@ static struct kset *scx_kset;
#include <trace/events/sched_ext.h>
static void process_ddsp_deferred_locals(struct rq *rq);
-static void scx_bpf_kick_cpu(s32 cpu, u64 flags);
+static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
s64 exit_code, const char *fmt, va_list args);
@@ -1185,24 +151,7 @@ static __printf(4, 5) void scx_exit(struct scx_sched *sch,
va_end(args);
}
-static __printf(3, 4) void scx_kf_exit(enum scx_exit_kind kind, s64 exit_code,
- const char *fmt, ...)
-{
- struct scx_sched *sch;
- va_list args;
-
- rcu_read_lock();
- sch = rcu_dereference(scx_root);
- if (sch) {
- va_start(args, fmt);
- scx_vexit(sch, kind, exit_code, fmt, args);
- va_end(args);
- }
- rcu_read_unlock();
-}
-
#define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args)
-#define scx_kf_error(fmt, args...) scx_kf_exit(SCX_EXIT_ERROR, 0, fmt, ##args)
#define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op)
@@ -1232,10 +181,9 @@ static bool u32_before(u32 a, u32 b)
return (s32)(a - b) < 0;
}
-static struct scx_dispatch_q *find_global_dsq(struct task_struct *p)
+static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch,
+ struct task_struct *p)
{
- struct scx_sched *sch = scx_root;
-
return sch->global_dsqs[cpu_to_node(task_cpu(p))];
}
@@ -1363,11 +311,11 @@ do { \
})
/* @mask is constant, always inline to cull unnecessary branches */
-static __always_inline bool scx_kf_allowed(u32 mask)
+static __always_inline bool scx_kf_allowed(struct scx_sched *sch, u32 mask)
{
if (unlikely(!(current->scx.kf_mask & mask))) {
- scx_kf_error("kfunc with mask 0x%x called from an operation only allowing 0x%x",
- mask, current->scx.kf_mask);
+ scx_error(sch, "kfunc with mask 0x%x called from an operation only allowing 0x%x",
+ mask, current->scx.kf_mask);
return false;
}
@@ -1380,13 +328,13 @@ static __always_inline bool scx_kf_allowed(u32 mask)
*/
if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE &&
(current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) {
- scx_kf_error("cpu_release kfunc called from a nested operation");
+ scx_error(sch, "cpu_release kfunc called from a nested operation");
return false;
}
if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH &&
(current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) {
- scx_kf_error("dispatch kfunc called from a nested operation");
+ scx_error(sch, "dispatch kfunc called from a nested operation");
return false;
}
@@ -1394,15 +342,16 @@ static __always_inline bool scx_kf_allowed(u32 mask)
}
/* see SCX_CALL_OP_TASK() */
-static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask,
+static __always_inline bool scx_kf_allowed_on_arg_tasks(struct scx_sched *sch,
+ u32 mask,
struct task_struct *p)
{
- if (!scx_kf_allowed(mask))
+ if (!scx_kf_allowed(sch, mask))
return false;
if (unlikely((p != current->scx.kf_tasks[0] &&
p != current->scx.kf_tasks[1]))) {
- scx_kf_error("called on a task not being operated on");
+ scx_error(sch, "called on a task not being operated on");
return false;
}
@@ -1488,10 +437,11 @@ struct bpf_iter_scx_dsq {
*/
struct scx_task_iter {
struct sched_ext_entity cursor;
- struct task_struct *locked;
+ struct task_struct *locked_task;
struct rq *rq;
struct rq_flags rf;
u32 cnt;
+ bool list_locked;
};
/**
@@ -1519,15 +469,16 @@ static void scx_task_iter_start(struct scx_task_iter *iter)
iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
list_add(&iter->cursor.tasks_node, &scx_tasks);
- iter->locked = NULL;
+ iter->locked_task = NULL;
iter->cnt = 0;
+ iter->list_locked = true;
}
static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
{
- if (iter->locked) {
- task_rq_unlock(iter->rq, iter->locked, &iter->rf);
- iter->locked = NULL;
+ if (iter->locked_task) {
+ task_rq_unlock(iter->rq, iter->locked_task, &iter->rf);
+ iter->locked_task = NULL;
}
}
@@ -1537,24 +488,24 @@ static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
*
* If @iter is in the middle of a locked iteration, it may be locking the rq of
* the task currently being visited in addition to scx_tasks_lock. Unlock both.
- * This function can be safely called anytime during an iteration.
+ * This function can be safely called anytime during an iteration. The next
+ * iterator operation will automatically restore the necessary locking.
*/
static void scx_task_iter_unlock(struct scx_task_iter *iter)
{
__scx_task_iter_rq_unlock(iter);
- spin_unlock_irq(&scx_tasks_lock);
+ if (iter->list_locked) {
+ iter->list_locked = false;
+ spin_unlock_irq(&scx_tasks_lock);
+ }
}
-/**
- * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock()
- * @iter: iterator to re-lock
- *
- * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it
- * doesn't re-lock the rq lock. Must be called before other iterator operations.
- */
-static void scx_task_iter_relock(struct scx_task_iter *iter)
+static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter)
{
- spin_lock_irq(&scx_tasks_lock);
+ if (!iter->list_locked) {
+ spin_lock_irq(&scx_tasks_lock);
+ iter->list_locked = true;
+ }
}
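The reworked iterator tracks whether scx_tasks_lock is held in list_locked and re-acquires it lazily, so a caller can drop the lock mid-walk and simply continue; the batching in scx_task_iter_next() relies on this. A rough user-space analogue of the drop-every-N-items pattern, with a pthread mutex standing in for the spinlock and sched_yield() for cond_resched() (all names are illustrative):

#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>

#define ITER_BATCH 32

struct iter {
        pthread_mutex_t *lock;
        bool list_locked;
        unsigned int cnt;
};

static void iter_maybe_relock(struct iter *it)
{
        if (!it->list_locked) {
                pthread_mutex_lock(it->lock);
                it->list_locked = true;
        }
}

static void iter_unlock(struct iter *it)
{
        if (it->list_locked) {
                it->list_locked = false;
                pthread_mutex_unlock(it->lock);
        }
}

/* visit @nr items, dropping the lock every ITER_BATCH items to let others in */
static void iterate(struct iter *it, unsigned int nr)
{
        iter_maybe_relock(it);
        for (unsigned int i = 0; i < nr; i++) {
                if (!(++it->cnt % ITER_BATCH)) {
                        iter_unlock(it);
                        sched_yield();          /* stand-in for cond_resched() */
                        iter_maybe_relock(it);
                }
                /* ... visit item i under the lock ... */
        }
        iter_unlock(it);
}

int main(void)
{
        pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
        struct iter it = { .lock = &lock };

        iterate(&it, 100);
        printf("visited 100 items in batches of %d\n", ITER_BATCH);
        return 0;
}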
/**
@@ -1567,6 +518,7 @@ static void scx_task_iter_relock(struct scx_task_iter *iter)
*/
static void scx_task_iter_stop(struct scx_task_iter *iter)
{
+ __scx_task_iter_maybe_relock(iter);
list_del_init(&iter->cursor.tasks_node);
scx_task_iter_unlock(iter);
}
@@ -1584,10 +536,12 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
struct list_head *cursor = &iter->cursor.tasks_node;
struct sched_ext_entity *pos;
+ __scx_task_iter_maybe_relock(iter);
+
if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) {
scx_task_iter_unlock(iter);
cond_resched();
- scx_task_iter_relock(iter);
+ __scx_task_iter_maybe_relock(iter);
}
list_for_each_entry(pos, cursor, tasks_node) {
@@ -1650,7 +604,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
return NULL;
iter->rq = task_rq_lock(p, &iter->rf);
- iter->locked = p;
+ iter->locked_task = p;
return p;
}
@@ -1664,7 +618,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
* This can be used when preemption is not disabled.
*/
#define scx_add_event(sch, name, cnt) do { \
- this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \
+ this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \
trace_sched_ext_event(#name, (cnt)); \
} while(0)
@@ -1677,7 +631,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
* This should be used only when preemption is disabled.
*/
#define __scx_add_event(sch, name, cnt) do { \
- __this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \
+ __this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \
trace_sched_ext_event(#name, cnt); \
} while(0)
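Both macros bump a counter in the scheduler's per-CPU area (this_cpu_add() vs __this_cpu_add() depending on whether preemption is already disabled); readers sum the per-CPU slots later, as scx_read_events() does. A simplified stand-alone sketch of that add-locally/aggregate-on-read scheme, using plain arrays in place of percpu allocations (names are invented for the example):

#include <stdio.h>

#define NR_CPUS 4

struct event_stats {
        unsigned long refill_slice_dfl;
        unsigned long dispatch_keep_last;
};

/* one stats struct per CPU; each writer only touches its own slot */
static struct event_stats pcpu_stats[NR_CPUS];

static void add_refill_event(int cpu, unsigned long cnt)
{
        /* the kernel uses this_cpu_add() on the local CPU's slot instead */
        pcpu_stats[cpu].refill_slice_dfl += cnt;
}

/* aggregate all per-CPU counters into @out, like scx_read_events() */
static void read_events(struct event_stats *out)
{
        *out = (struct event_stats){};
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                out->refill_slice_dfl += pcpu_stats[cpu].refill_slice_dfl;
                out->dispatch_keep_last += pcpu_stats[cpu].dispatch_keep_last;
        }
}

int main(void)
{
        struct event_stats total;

        add_refill_event(0, 1);
        add_refill_event(2, 3);
        read_events(&total);
        printf("refill_slice_dfl=%lu\n", total.refill_slice_dfl);
        return 0;
}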
@@ -1766,23 +720,6 @@ static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where)
}
/**
- * kf_cpu_valid - Verify a CPU number, to be used on kfunc input args
- * @cpu: cpu number which came from a BPF ops
- * @where: extra information reported on error
- *
- * The same as ops_cpu_valid() but @sch is implicit.
- */
-static bool kf_cpu_valid(u32 cpu, const char *where)
-{
- if (__cpu_valid(cpu)) {
- return true;
- } else {
- scx_kf_error("invalid CPU %d%s%s", cpu, where ? " " : "", where ?: "");
- return false;
- }
-}
-
-/**
* ops_sanitize_err - Sanitize a -errno value
* @sch: scx_sched to error out on error
* @ops_name: operation to blame on failure
@@ -1942,10 +879,10 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
WRITE_ONCE(dsq->nr, dsq->nr + delta);
}
-static void refill_task_slice_dfl(struct task_struct *p)
+static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
{
p->scx.slice = SCX_SLICE_DFL;
- __scx_add_event(scx_root, SCX_EV_REFILL_SLICE_DFL, 1);
+ __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1);
}
static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
@@ -1963,7 +900,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
scx_error(sch, "attempting to dispatch to a destroyed dsq");
/* fall back to the global dsq */
raw_spin_unlock(&dsq->lock);
- dsq = find_global_dsq(p);
+ dsq = find_global_dsq(sch, p);
raw_spin_lock(&dsq->lock);
}
}
@@ -2142,26 +1079,27 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch,
s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
- return find_global_dsq(p);
+ return find_global_dsq(sch, p);
return &cpu_rq(cpu)->scx.local_dsq;
}
if (dsq_id == SCX_DSQ_GLOBAL)
- dsq = find_global_dsq(p);
+ dsq = find_global_dsq(sch, p);
else
dsq = find_user_dsq(sch, dsq_id);
if (unlikely(!dsq)) {
scx_error(sch, "non-existent DSQ 0x%llx for %s[%d]",
dsq_id, p->comm, p->pid);
- return find_global_dsq(p);
+ return find_global_dsq(sch, p);
}
return dsq;
}
-static void mark_direct_dispatch(struct task_struct *ddsp_task,
+static void mark_direct_dispatch(struct scx_sched *sch,
+ struct task_struct *ddsp_task,
struct task_struct *p, u64 dsq_id,
u64 enq_flags)
{
@@ -2175,10 +1113,10 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task,
/* @p must match the task on the enqueue path */
if (unlikely(p != ddsp_task)) {
if (IS_ERR(ddsp_task))
- scx_kf_error("%s[%d] already direct-dispatched",
+ scx_error(sch, "%s[%d] already direct-dispatched",
p->comm, p->pid);
else
- scx_kf_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
+ scx_error(sch, "scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
ddsp_task->comm, ddsp_task->pid,
p->comm, p->pid);
return;
@@ -2333,15 +1271,15 @@ local:
* higher priority it becomes from scx_prio_less()'s POV.
*/
touch_core_sched(rq, p);
- refill_task_slice_dfl(p);
+ refill_task_slice_dfl(sch, p);
local_norefill:
dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags);
return;
global:
touch_core_sched(rq, p); /* see the comment in local: */
- refill_task_slice_dfl(p);
- dispatch_enqueue(sch, find_global_dsq(p), p, enq_flags);
+ refill_task_slice_dfl(sch, p);
+ dispatch_enqueue(sch, find_global_dsq(sch, p), p, enq_flags);
}
static bool task_runnable(const struct task_struct *p)
@@ -2651,8 +1589,7 @@ static bool task_can_run_on_remote_rq(struct scx_sched *sch,
if (!scx_rq_online(rq)) {
if (enforce)
- __scx_add_event(scx_root,
- SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
+ __scx_add_event(sch, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
return false;
}
@@ -2754,7 +1691,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch,
dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
if (src_rq != dst_rq &&
unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
- dst_dsq = find_global_dsq(p);
+ dst_dsq = find_global_dsq(sch, p);
dst_rq = src_rq;
}
} else {
@@ -2910,7 +1847,7 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
if (src_rq != dst_rq &&
unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
- dispatch_enqueue(sch, find_global_dsq(p), p,
+ dispatch_enqueue(sch, find_global_dsq(sch, p), p,
enq_flags | SCX_ENQ_CLEAR_OPSS);
return;
}
@@ -3155,10 +2092,10 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* balance(), we want to complete this scheduling cycle and then
* start a new one. IOW, we want to call resched_curr() on the
* next, most likely idle, task, not the current one. Use
- * scx_bpf_kick_cpu() for deferred kicking.
+ * scx_kick_cpu() for deferred kicking.
*/
if (unlikely(!--nr_loops)) {
- scx_bpf_kick_cpu(cpu_of(rq), 0);
+ scx_kick_cpu(sch, cpu_of(rq), 0);
break;
}
} while (dspc->nr_tasks);
@@ -3442,24 +2379,25 @@ static struct task_struct *pick_task_scx(struct rq *rq)
if (keep_prev) {
p = prev;
if (!p->scx.slice)
- refill_task_slice_dfl(p);
+ refill_task_slice_dfl(rcu_dereference_sched(scx_root), p);
} else {
p = first_local_task(rq);
if (!p) {
if (kick_idle)
- scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE);
+ scx_kick_cpu(rcu_dereference_sched(scx_root),
+ cpu_of(rq), SCX_KICK_IDLE);
return NULL;
}
if (unlikely(!p->scx.slice)) {
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch = rcu_dereference_sched(scx_root);
if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) {
printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n",
p->comm, p->pid, __func__);
sch->warned_zero_slice = true;
}
- refill_task_slice_dfl(p);
+ refill_task_slice_dfl(sch, p);
}
}
@@ -3548,7 +2486,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0);
if (cpu >= 0) {
- refill_task_slice_dfl(p);
+ refill_task_slice_dfl(sch, p);
p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL;
} else {
cpu = prev_cpu;
@@ -4084,7 +3022,7 @@ bool scx_can_stop_tick(struct rq *rq)
#ifdef CONFIG_EXT_GROUP_SCHED
-DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem);
+DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_ops_rwsem);
static bool scx_cgroup_enabled;
void scx_tg_init(struct task_group *tg)
@@ -4101,8 +3039,6 @@ int scx_tg_online(struct task_group *tg)
WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED));
- percpu_down_read(&scx_cgroup_rwsem);
-
if (scx_cgroup_enabled) {
if (SCX_HAS_OP(sch, cgroup_init)) {
struct scx_cgroup_init_args args =
@@ -4122,7 +3058,6 @@ int scx_tg_online(struct task_group *tg)
tg->scx.flags |= SCX_TG_ONLINE;
}
- percpu_up_read(&scx_cgroup_rwsem);
return ret;
}
@@ -4132,15 +3067,11 @@ void scx_tg_offline(struct task_group *tg)
WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE));
- percpu_down_read(&scx_cgroup_rwsem);
-
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) &&
(tg->scx.flags & SCX_TG_INITED))
SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
tg->css.cgroup);
tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
-
- percpu_up_read(&scx_cgroup_rwsem);
}
int scx_cgroup_can_attach(struct cgroup_taskset *tset)
@@ -4150,9 +3081,6 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
struct task_struct *p;
int ret;
- /* released in scx_finish/cancel_attach() */
- percpu_down_read(&scx_cgroup_rwsem);
-
if (!scx_cgroup_enabled)
return 0;
@@ -4192,7 +3120,6 @@ err:
p->scx.cgrp_moving_from = NULL;
}
- percpu_up_read(&scx_cgroup_rwsem);
return ops_sanitize_err(sch, "cgroup_prep_move", ret);
}
@@ -4215,11 +3142,6 @@ void scx_cgroup_move_task(struct task_struct *p)
p->scx.cgrp_moving_from = NULL;
}
-void scx_cgroup_finish_attach(void)
-{
- percpu_up_read(&scx_cgroup_rwsem);
-}
-
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
struct scx_sched *sch = scx_root;
@@ -4227,7 +3149,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
struct task_struct *p;
if (!scx_cgroup_enabled)
- goto out_unlock;
+ return;
cgroup_taskset_for_each(p, css, tset) {
if (SCX_HAS_OP(sch, cgroup_cancel_move) &&
@@ -4236,15 +3158,13 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
p, p->scx.cgrp_moving_from, css->cgroup);
p->scx.cgrp_moving_from = NULL;
}
-out_unlock:
- percpu_up_read(&scx_cgroup_rwsem);
}
void scx_group_set_weight(struct task_group *tg, unsigned long weight)
{
struct scx_sched *sch = scx_root;
- percpu_down_read(&scx_cgroup_rwsem);
+ percpu_down_read(&scx_cgroup_ops_rwsem);
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) &&
tg->scx.weight != weight)
@@ -4253,7 +3173,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
tg->scx.weight = weight;
- percpu_up_read(&scx_cgroup_rwsem);
+ percpu_up_read(&scx_cgroup_ops_rwsem);
}
void scx_group_set_idle(struct task_group *tg, bool idle)
@@ -4266,7 +3186,7 @@ void scx_group_set_bandwidth(struct task_group *tg,
{
struct scx_sched *sch = scx_root;
- percpu_down_read(&scx_cgroup_rwsem);
+ percpu_down_read(&scx_cgroup_ops_rwsem);
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) &&
(tg->scx.bw_period_us != period_us ||
@@ -4279,23 +3199,25 @@ void scx_group_set_bandwidth(struct task_group *tg,
tg->scx.bw_quota_us = quota_us;
tg->scx.bw_burst_us = burst_us;
- percpu_up_read(&scx_cgroup_rwsem);
+ percpu_up_read(&scx_cgroup_ops_rwsem);
}
static void scx_cgroup_lock(void)
{
- percpu_down_write(&scx_cgroup_rwsem);
+ percpu_down_write(&scx_cgroup_ops_rwsem);
+ cgroup_lock();
}
static void scx_cgroup_unlock(void)
{
- percpu_up_write(&scx_cgroup_rwsem);
+ cgroup_unlock();
+ percpu_up_write(&scx_cgroup_ops_rwsem);
}
#else /* CONFIG_EXT_GROUP_SCHED */
-static inline void scx_cgroup_lock(void) {}
-static inline void scx_cgroup_unlock(void) {}
+static void scx_cgroup_lock(void) {}
+static void scx_cgroup_unlock(void) {}
#endif /* CONFIG_EXT_GROUP_SCHED */
@@ -4411,15 +3333,12 @@ static void scx_cgroup_exit(struct scx_sched *sch)
{
struct cgroup_subsys_state *css;
- percpu_rwsem_assert_held(&scx_cgroup_rwsem);
-
scx_cgroup_enabled = false;
/*
- * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
+ * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk
* cgroups and exit all the inited ones, all online cgroups are exited.
*/
- rcu_read_lock();
css_for_each_descendant_post(css, &root_task_group.css) {
struct task_group *tg = css_tg(css);
@@ -4430,17 +3349,9 @@ static void scx_cgroup_exit(struct scx_sched *sch)
if (!sch->ops.cgroup_exit)
continue;
- if (WARN_ON_ONCE(!css_tryget(css)))
- continue;
- rcu_read_unlock();
-
SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
css->cgroup);
-
- rcu_read_lock();
- css_put(css);
}
- rcu_read_unlock();
}
static int scx_cgroup_init(struct scx_sched *sch)
@@ -4448,13 +3359,10 @@ static int scx_cgroup_init(struct scx_sched *sch)
struct cgroup_subsys_state *css;
int ret;
- percpu_rwsem_assert_held(&scx_cgroup_rwsem);
-
/*
- * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
+ * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk
* cgroups and init, all online cgroups are initialized.
*/
- rcu_read_lock();
css_for_each_descendant_pre(css, &root_task_group.css) {
struct task_group *tg = css_tg(css);
struct scx_cgroup_init_args args = {
@@ -4473,10 +3381,6 @@ static int scx_cgroup_init(struct scx_sched *sch)
continue;
}
- if (WARN_ON_ONCE(!css_tryget(css)))
- continue;
- rcu_read_unlock();
-
ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL,
css->cgroup, &args);
if (ret) {
@@ -4485,11 +3389,7 @@ static int scx_cgroup_init(struct scx_sched *sch)
return ret;
}
tg->scx.flags |= SCX_TG_INITED;
-
- rcu_read_lock();
- css_put(css);
}
- rcu_read_unlock();
WARN_ON_ONCE(scx_cgroup_enabled);
scx_cgroup_enabled = true;
@@ -4572,7 +3472,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
int node;
kthread_stop(sch->helper->task);
- free_percpu(sch->event_stats_cpu);
+ free_percpu(sch->pcpu);
for_each_node_state(node, N_POSSIBLE)
kfree(sch->global_dsqs[node]);
@@ -4671,9 +3571,22 @@ bool task_should_scx(int policy)
bool scx_allow_ttwu_queue(const struct task_struct *p)
{
- return !scx_enabled() ||
- (scx_root->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) ||
- p->sched_class != &ext_sched_class;
+ struct scx_sched *sch;
+
+ if (!scx_enabled())
+ return true;
+
+ sch = rcu_dereference_sched(scx_root);
+ if (unlikely(!sch))
+ return true;
+
+ if (sch->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP)
+ return true;
+
+ if (unlikely(p->sched_class != &ext_sched_class))
+ return true;
+
+ return false;
}
/**
@@ -4789,7 +3702,7 @@ static void scx_clear_softlockup(void)
*
* - pick_next_task() suppresses zero slice warning.
*
- * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM
+ * - scx_kick_cpu() is disabled to avoid irq_work malfunction during PM
* operations.
*
* - scx_prio_less() reverts to the default core_sched_at order.
@@ -5234,7 +4147,8 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf);
dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u",
p->scx.dsq_vtime, p->scx.slice, p->scx.weight);
- dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr));
+ dump_line(s, " cpus=%*pb no_mig=%u", cpumask_pr_args(p->cpus_ptr),
+ p->migration_disabled);
if (SCX_HAS_OP(sch, dump_task)) {
ops_dump_init(s, " ");
@@ -5473,13 +4387,13 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
sch->global_dsqs[node] = dsq;
}
- sch->event_stats_cpu = alloc_percpu(struct scx_event_stats);
- if (!sch->event_stats_cpu)
+ sch->pcpu = alloc_percpu(struct scx_sched_pcpu);
+ if (!sch->pcpu)
goto err_free_gdsqs;
sch->helper = kthread_run_worker(0, "sched_ext_helper");
if (!sch->helper)
- goto err_free_event_stats;
+ goto err_free_pcpu;
sched_set_fifo(sch->helper->task);
atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
@@ -5497,8 +4411,8 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
err_stop_helper:
kthread_stop(sch->helper->task);
-err_free_event_stats:
- free_percpu(sch->event_stats_cpu);
+err_free_pcpu:
+ free_percpu(sch->pcpu);
err_free_gdsqs:
for_each_node_state(node, N_POSSIBLE)
kfree(sch->global_dsqs[node]);
@@ -5621,6 +4535,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_error(sch, "ops.init() failed (%d)", ret);
goto err_disable;
}
+ sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
}
for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
@@ -5713,7 +4628,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
ret = scx_init_task(p, task_group(p), false);
if (ret) {
put_task_struct(p);
- scx_task_iter_relock(&sti);
scx_task_iter_stop(&sti);
scx_error(sch, "ops.init_task() failed (%d) for %s[%d]",
ret, p->comm, p->pid);
@@ -5723,7 +4637,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_set_task_state(p, SCX_TASK_READY);
put_task_struct(p);
- scx_task_iter_relock(&sti);
}
scx_task_iter_stop(&sti);
scx_cgroup_unlock();
@@ -5749,6 +4662,9 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
__setscheduler_class(p->policy, p->prio);
struct sched_enq_and_set_ctx ctx;
+ if (!tryget_task_struct(p))
+ continue;
+
if (old_class != new_class && p->se.sched_delayed)
dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
@@ -5761,6 +4677,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
sched_enq_and_set_task(&ctx);
check_class_changed(task_rq(p), p, old_class, p->prio);
+ put_task_struct(p);
}
scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
@@ -5791,7 +4708,7 @@ err_unlock:
err_disable_unlock_all:
scx_cgroup_unlock();
percpu_up_write(&scx_fork_rwsem);
- scx_bypass(false);
+ /* we'll soon enter disable path, keep bypass on */
err_disable:
mutex_unlock(&scx_enable_mutex);
/*
@@ -6324,40 +5241,41 @@ void __init init_sched_ext_class(void)
/********************************************************************************
* Helpers that can be called from the BPF scheduler.
*/
-static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags)
+static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p,
+ u64 enq_flags)
{
- if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
+ if (!scx_kf_allowed(sch, SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
return false;
lockdep_assert_irqs_disabled();
if (unlikely(!p)) {
- scx_kf_error("called with NULL task");
+ scx_error(sch, "called with NULL task");
return false;
}
if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) {
- scx_kf_error("invalid enq_flags 0x%llx", enq_flags);
+ scx_error(sch, "invalid enq_flags 0x%llx", enq_flags);
return false;
}
return true;
}
-static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id,
- u64 enq_flags)
+static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p,
+ u64 dsq_id, u64 enq_flags)
{
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
struct task_struct *ddsp_task;
ddsp_task = __this_cpu_read(direct_dispatch_task);
if (ddsp_task) {
- mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags);
+ mark_direct_dispatch(sch, ddsp_task, p, dsq_id, enq_flags);
return;
}
if (unlikely(dspc->cursor >= scx_dsp_max_batch)) {
- scx_kf_error("dispatch buffer overflow");
+ scx_error(sch, "dispatch buffer overflow");
return;
}
@@ -6409,7 +5327,14 @@ __bpf_kfunc_start_defs();
__bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice,
u64 enq_flags)
{
- if (!scx_dsq_insert_preamble(p, enq_flags))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
+ if (!scx_dsq_insert_preamble(sch, p, enq_flags))
return;
if (slice)
@@ -6417,7 +5342,7 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice
else
p->scx.slice = p->scx.slice ?: 1;
- scx_dsq_insert_commit(p, dsq_id, enq_flags);
+ scx_dsq_insert_commit(sch, p, dsq_id, enq_flags);
}
/**
@@ -6444,7 +5369,14 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice
__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
u64 slice, u64 vtime, u64 enq_flags)
{
- if (!scx_dsq_insert_preamble(p, enq_flags))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
+ if (!scx_dsq_insert_preamble(sch, p, enq_flags))
return;
if (slice)
@@ -6454,7 +5386,7 @@ __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
p->scx.dsq_vtime = vtime;
- scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+ scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
}
__bpf_kfunc_end_defs();
@@ -6479,7 +5411,8 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
bool in_balance;
unsigned long flags;
- if (!scx_kf_allowed_if_unlocked() && !scx_kf_allowed(SCX_KF_DISPATCH))
+ if (!scx_kf_allowed_if_unlocked() &&
+ !scx_kf_allowed(sch, SCX_KF_DISPATCH))
return false;
/*
@@ -6564,7 +5497,15 @@ __bpf_kfunc_start_defs();
*/
__bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void)
{
- if (!scx_kf_allowed(SCX_KF_DISPATCH))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return 0;
+
+ if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
return 0;
return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor);
@@ -6579,14 +5520,21 @@ __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void)
__bpf_kfunc void scx_bpf_dispatch_cancel(void)
{
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
+ struct scx_sched *sch;
+
+ guard(rcu)();
- if (!scx_kf_allowed(SCX_KF_DISPATCH))
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
+ if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
return;
if (dspc->cursor > 0)
dspc->cursor--;
else
- scx_kf_error("dispatch buffer underflow");
+ scx_error(sch, "dispatch buffer underflow");
}
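The dispatch buffer is a small per-CPU array indexed by a cursor: inserts fail once the cursor hits the batch limit, and scx_bpf_dispatch_cancel() just winds the cursor back one slot. A stand-alone sketch of that cursor-managed buffer (the struct name, field layout and batch size constant are illustrative):

#include <stdio.h>

#define DSP_MAX_BATCH 32

struct dsp_buf {
        unsigned int cursor;
        unsigned long long dsq_id[DSP_MAX_BATCH];
};

/* queue one dispatch; fails when the batch buffer is full */
static int dsp_insert(struct dsp_buf *buf, unsigned long long dsq_id)
{
        if (buf->cursor >= DSP_MAX_BATCH) {
                fprintf(stderr, "dispatch buffer overflow\n");
                return -1;
        }
        buf->dsq_id[buf->cursor++] = dsq_id;
        return 0;
}

/* cancel the most recent dispatch, mirroring scx_bpf_dispatch_cancel() */
static void dsp_cancel(struct dsp_buf *buf)
{
        if (buf->cursor > 0)
                buf->cursor--;
        else
                fprintf(stderr, "dispatch buffer underflow\n");
}

static unsigned int dsp_nr_slots(const struct dsp_buf *buf)
{
        return DSP_MAX_BATCH - buf->cursor;
}

int main(void)
{
        struct dsp_buf buf = {};

        dsp_insert(&buf, 0x10);
        dsp_insert(&buf, 0x11);
        dsp_cancel(&buf);
        printf("%u slots free\n", dsp_nr_slots(&buf));
        return 0;
}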
/**
@@ -6605,11 +5553,17 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void)
*/
__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
{
- struct scx_sched *sch = scx_root;
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
struct scx_dispatch_q *dsq;
+ struct scx_sched *sch;
- if (!scx_kf_allowed(SCX_KF_DISPATCH))
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return false;
+
+ if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
return false;
flush_dispatch_buf(sch, dspc->rq);
@@ -6756,12 +5710,18 @@ __bpf_kfunc_start_defs();
*/
__bpf_kfunc u32 scx_bpf_reenqueue_local(void)
{
+ struct scx_sched *sch;
LIST_HEAD(tasks);
u32 nr_enqueued = 0;
struct rq *rq;
struct task_struct *p, *n;
- if (!scx_kf_allowed(SCX_KF_CPU_RELEASE))
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return 0;
+
+ if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE))
return 0;
rq = cpu_rq(smp_processor_id());
@@ -6784,12 +5744,8 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void)
* CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to
* the current local DSQ for running tasks and thus are not
* visible to the BPF scheduler.
- *
- * Also skip re-enqueueing tasks that can only run on this
- * CPU, as they would just be re-added to the same local
- * DSQ without any benefit.
*/
- if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1)
+ if (p->migration_pending)
continue;
dispatch_dequeue(rq, p);
@@ -6877,22 +5833,12 @@ static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = {
__bpf_kfunc_start_defs();
-/**
- * scx_bpf_kick_cpu - Trigger reschedule on a CPU
- * @cpu: cpu to kick
- * @flags: %SCX_KICK_* flags
- *
- * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or
- * trigger rescheduling on a busy CPU. This can be called from any online
- * scx_ops operation and the actual kicking is performed asynchronously through
- * an irq work.
- */
-__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
+static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags)
{
struct rq *this_rq;
unsigned long irq_flags;
- if (!kf_cpu_valid(cpu, NULL))
+ if (!ops_cpu_valid(sch, cpu, NULL))
return;
local_irq_save(irq_flags);
@@ -6916,7 +5862,7 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
struct rq *target_rq = cpu_rq(cpu);
if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT)))
- scx_kf_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE");
+ scx_error(sch, "PREEMPT/WAIT cannot be used with SCX_KICK_IDLE");
if (raw_spin_rq_trylock(target_rq)) {
if (can_skip_idle_kick(target_rq)) {
@@ -6941,6 +5887,26 @@ out:
}
/**
+ * scx_bpf_kick_cpu - Trigger reschedule on a CPU
+ * @cpu: cpu to kick
+ * @flags: %SCX_KICK_* flags
+ *
+ * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or
+ * trigger rescheduling on a busy CPU. This can be called from any online
+ * scx_ops operation and the actual kicking is performed asynchronously through
+ * an irq work.
+ */
+__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (likely(sch))
+ scx_kick_cpu(sch, cpu, flags);
+}
+
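With this split, the kfunc is a thin wrapper that resolves scx_root under RCU and hands off to scx_kick_cpu(), so the BPF-side call is unchanged. As a hedged usage sketch, a scheduler might kick an idle CPU right after queueing work on a shared DSQ, roughly as below (assumes the usual scx BPF headers plus a SHARED_DSQ created by that scheduler; the op name is made up):

void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
{
        s32 cpu;

        scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);

        /* wake an idle CPU so the task doesn't sit in the DSQ */
        cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
        if (cpu >= 0)
                scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
}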
+/**
* scx_bpf_dsq_nr_queued - Return the number of queued tasks
* @dsq_id: id of the DSQ
*
@@ -7120,28 +6086,29 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it)
__bpf_kfunc_end_defs();
-static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
- char *fmt, unsigned long long *data, u32 data__sz)
+static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf,
+ size_t line_size, char *fmt, unsigned long long *data,
+ u32 data__sz)
{
struct bpf_bprintf_data bprintf_data = { .get_bin_args = true };
s32 ret;
if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 ||
(data__sz && !data)) {
- scx_kf_error("invalid data=%p and data__sz=%u", (void *)data, data__sz);
+ scx_error(sch, "invalid data=%p and data__sz=%u", (void *)data, data__sz);
return -EINVAL;
}
ret = copy_from_kernel_nofault(data_buf, data, data__sz);
if (ret < 0) {
- scx_kf_error("failed to read data fields (%d)", ret);
+ scx_error(sch, "failed to read data fields (%d)", ret);
return ret;
}
ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8,
&bprintf_data);
if (ret < 0) {
- scx_kf_error("format preparation failed (%d)", ret);
+ scx_error(sch, "format preparation failed (%d)", ret);
return ret;
}
@@ -7149,17 +6116,17 @@ static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
bprintf_data.bin_args);
bpf_bprintf_cleanup(&bprintf_data);
if (ret < 0) {
- scx_kf_error("(\"%s\", %p, %u) failed to format", fmt, data, data__sz);
+ scx_error(sch, "(\"%s\", %p, %u) failed to format", fmt, data, data__sz);
return ret;
}
return ret;
}
-static s32 bstr_format(struct scx_bstr_buf *buf,
+static s32 bstr_format(struct scx_sched *sch, struct scx_bstr_buf *buf,
char *fmt, unsigned long long *data, u32 data__sz)
{
- return __bstr_format(buf->data, buf->line, sizeof(buf->line),
+ return __bstr_format(sch, buf->data, buf->line, sizeof(buf->line),
fmt, data, data__sz);
}
@@ -7178,11 +6145,14 @@ __bpf_kfunc_start_defs();
__bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
unsigned long long *data, u32 data__sz)
{
+ struct scx_sched *sch;
unsigned long flags;
raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
- if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
- scx_kf_exit(SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line);
+ sch = rcu_dereference_bh(scx_root);
+ if (likely(sch) &&
+ bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
+ scx_exit(sch, SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line);
raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
}
@@ -7198,11 +6168,14 @@ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
__bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
u32 data__sz)
{
+ struct scx_sched *sch;
unsigned long flags;
raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
- if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
- scx_kf_exit(SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line);
+ sch = rcu_dereference_bh(scx_root);
+ if (likely(sch) &&
+ bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
+ scx_exit(sch, SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line);
raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
}
@@ -7221,17 +6194,24 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
__bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
u32 data__sz)
{
+ struct scx_sched *sch;
struct scx_dump_data *dd = &scx_dump_data;
struct scx_bstr_buf *buf = &dd->buf;
s32 ret;
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
if (raw_smp_processor_id() != dd->cpu) {
- scx_kf_error("scx_bpf_dump() must only be called from ops.dump() and friends");
+ scx_error(sch, "scx_bpf_dump() must only be called from ops.dump() and friends");
return;
}
/* append the formatted string to the line buf */
- ret = __bstr_format(buf->data, buf->line + dd->cursor,
+ ret = __bstr_format(sch, buf->data, buf->line + dd->cursor,
sizeof(buf->line) - dd->cursor, fmt, data, data__sz);
if (ret < 0) {
dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)",
@@ -7267,7 +6247,12 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
*/
__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
{
- if (kf_cpu_valid(cpu, NULL))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (likely(sch) && ops_cpu_valid(sch, cpu, NULL))
return arch_scale_cpu_capacity(cpu);
else
return SCX_CPUPERF_ONE;
@@ -7289,7 +6274,12 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
*/
__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
{
- if (kf_cpu_valid(cpu, NULL))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (likely(sch) && ops_cpu_valid(sch, cpu, NULL))
return arch_scale_freq_capacity(cpu);
else
return SCX_CPUPERF_ONE;
@@ -7311,12 +6301,20 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
*/
__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
if (unlikely(perf > SCX_CPUPERF_ONE)) {
- scx_kf_error("Invalid cpuperf target %u for CPU %d", perf, cpu);
+ scx_error(sch, "Invalid cpuperf target %u for CPU %d", perf, cpu);
return;
}
- if (kf_cpu_valid(cpu, NULL)) {
+ if (ops_cpu_valid(sch, cpu, NULL)) {
struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq();
struct rq_flags rf;
@@ -7325,7 +6323,7 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
* to the corresponding CPU to prevent ABBA deadlocks.
*/
if (locked_rq && rq != locked_rq) {
- scx_kf_error("Invalid target CPU %d", cpu);
+ scx_error(sch, "Invalid target CPU %d", cpu);
return;
}
@@ -7420,13 +6418,76 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p)
*/
__bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu)
{
- if (!kf_cpu_valid(cpu, NULL))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
return NULL;
+ if (!ops_cpu_valid(sch, cpu, NULL))
+ return NULL;
+
+ if (!sch->warned_deprecated_rq) {
+ printk_deferred(KERN_WARNING "sched_ext: %s() is deprecated; "
+ "use scx_bpf_locked_rq() when holding rq lock "
+ "or scx_bpf_cpu_curr() to read remote curr safely.\n", __func__);
+ sch->warned_deprecated_rq = true;
+ }
+
return cpu_rq(cpu);
}
/**
+ * scx_bpf_locked_rq - Return the rq currently locked by SCX
+ *
+ * Returns the rq if a rq lock is currently held by SCX.
+ * Otherwise emits an error and returns NULL.
+ */
+__bpf_kfunc struct rq *scx_bpf_locked_rq(void)
+{
+ struct scx_sched *sch;
+ struct rq *rq;
+
+ guard(preempt)();
+
+ sch = rcu_dereference_sched(scx_root);
+ if (unlikely(!sch))
+ return NULL;
+
+ rq = scx_locked_rq();
+ if (!rq) {
+ scx_error(sch, "accessing rq without holding rq lock");
+ return NULL;
+ }
+
+ return rq;
+}
+
+/**
+ * scx_bpf_cpu_curr - Return remote CPU's curr task
+ * @cpu: CPU of interest
+ *
+ * Callers must hold RCU read lock (KF_RCU).
+ */
+__bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return NULL;
+
+ if (!ops_cpu_valid(sch, cpu, NULL))
+ return NULL;
+
+ return rcu_dereference(cpu_rq(cpu)->curr);
+}
+
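scx_bpf_cpu_curr() replaces peeking at cpu_rq(cpu)->curr through the now-deprecated scx_bpf_cpu_rq(). Because the return value is only RCU-protected, a BPF caller brackets the access with an explicit RCU read section; a hedged sketch (the helper name and the idle-task check are illustrative):

/* return true if @cpu is currently running its per-CPU idle task */
static bool cpu_looks_idle(s32 cpu)
{
        struct task_struct *curr;
        bool idle = false;

        bpf_rcu_read_lock();
        curr = scx_bpf_cpu_curr(cpu);
        if (curr)
                idle = curr->pid == 0;  /* the per-CPU idle task has PID 0 */
        bpf_rcu_read_unlock();

        return idle;
}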
+/**
* scx_bpf_task_cgroup - Return the sched cgroup of a task
* @p: task of interest
*
@@ -7442,8 +6503,15 @@ __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p)
{
struct task_group *tg = p->sched_task_group;
struct cgroup *cgrp = &cgrp_dfl_root.cgrp;
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ goto out;
- if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p))
+ if (!scx_kf_allowed_on_arg_tasks(sch, __SCX_KF_RQ_LOCKED, p))
goto out;
cgrp = tg_cgrp(tg);
@@ -7524,7 +6592,7 @@ static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *event
/* Aggregate per-CPU event counters into @events. */
memset(events, 0, sizeof(*events));
for_each_possible_cpu(cpu) {
- e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu);
+ e_cpu = &per_cpu_ptr(sch->pcpu, cpu)->event_stats;
scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
@@ -7590,6 +6658,8 @@ BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
+BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_RET_NULL)
+BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU_PROTECTED)
#ifdef CONFIG_CGROUP_SCHED
BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
#endif
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 292bb41a242e..43429b33e52c 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -8,29 +8,6 @@
*/
#ifdef CONFIG_SCHED_CLASS_EXT
-static inline bool scx_kf_allowed_if_unlocked(void)
-{
- return !current->scx.kf_mask;
-}
-
-static inline bool scx_rq_bypassing(struct rq *rq)
-{
- return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
-}
-
-DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
-
-DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
-
-/*
- * Return the rq currently locked from an scx callback, or NULL if no rq is
- * locked.
- */
-static inline struct rq *scx_locked_rq(void)
-{
- return __this_cpu_read(scx_locked_rq_state);
-}
-
void scx_tick(struct rq *rq);
void init_scx_entity(struct sched_ext_entity *scx);
void scx_pre_fork(struct task_struct *p);
@@ -100,7 +77,6 @@ int scx_tg_online(struct task_group *tg);
void scx_tg_offline(struct task_group *tg);
int scx_cgroup_can_attach(struct cgroup_taskset *tset);
void scx_cgroup_move_task(struct task_struct *p);
-void scx_cgroup_finish_attach(void);
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
void scx_group_set_idle(struct task_group *tg, bool idle);
@@ -111,7 +87,6 @@ static inline int scx_tg_online(struct task_group *tg) { return 0; }
static inline void scx_tg_offline(struct task_group *tg) {}
static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; }
static inline void scx_cgroup_move_task(struct task_struct *p) {}
-static inline void scx_cgroup_finish_attach(void) {}
static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {}
static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {}
static inline void scx_group_set_idle(struct task_group *tg, bool idle) {}
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 7174e1c1a392..d2434c954848 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -819,10 +819,10 @@ void scx_idle_disable(void)
* Helpers that can be called from the BPF scheduler.
*/
-static int validate_node(int node)
+static int validate_node(struct scx_sched *sch, int node)
{
if (!static_branch_likely(&scx_builtin_idle_per_node)) {
- scx_kf_error("per-node idle tracking is disabled");
+ scx_error(sch, "per-node idle tracking is disabled");
return -EOPNOTSUPP;
}
@@ -832,13 +832,13 @@ static int validate_node(int node)
/* Make sure node is in a valid range */
if (node < 0 || node >= nr_node_ids) {
- scx_kf_error("invalid node %d", node);
+ scx_error(sch, "invalid node %d", node);
return -EINVAL;
}
/* Make sure the node is part of the set of possible nodes */
if (!node_possible(node)) {
- scx_kf_error("unavailable node %d", node);
+ scx_error(sch, "unavailable node %d", node);
return -EINVAL;
}
@@ -847,26 +847,53 @@ static int validate_node(int node)
__bpf_kfunc_start_defs();
-static bool check_builtin_idle_enabled(void)
+static bool check_builtin_idle_enabled(struct scx_sched *sch)
{
if (static_branch_likely(&scx_builtin_idle_enabled))
return true;
- scx_kf_error("built-in idle tracking is disabled");
+ scx_error(sch, "built-in idle tracking is disabled");
return false;
}
-static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+/*
+ * Determine whether @p is a migration-disabled task in the context of BPF
+ * code.
+ *
+ * We can't simply check whether @p->migration_disabled is set in a
+ * sched_ext callback, because migration is always disabled for the current
+ * task while running BPF code.
+ *
+ * The prolog (__bpf_prog_enter) and epilog (__bpf_prog_exit) respectively
+ * disable and re-enable migration. For this reason, the current task
+ * inside a sched_ext callback is always a migration-disabled task.
+ *
+ * Therefore, when @p->migration_disabled == 1, check whether @p is the
+ * current task or not: if it is, then migration was not disabled before
+ * entering the callback, otherwise migration was disabled.
+ *
+ * Returns true if @p is migration-disabled, false otherwise.
+ */
+static bool is_bpf_migration_disabled(const struct task_struct *p)
+{
+ if (p->migration_disabled == 1)
+ return p != current;
+ else
+ return p->migration_disabled;
+}
+
+static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p,
+ s32 prev_cpu, u64 wake_flags,
const struct cpumask *allowed, u64 flags)
{
struct rq *rq;
struct rq_flags rf;
s32 cpu;
- if (!kf_cpu_valid(prev_cpu, NULL))
+ if (!ops_cpu_valid(sch, prev_cpu, NULL))
return -EINVAL;
- if (!check_builtin_idle_enabled())
+ if (!check_builtin_idle_enabled(sch))
return -EBUSY;
/*
@@ -879,7 +906,7 @@ static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_f
if (scx_kf_allowed_if_unlocked()) {
rq = task_rq_lock(p, &rf);
} else {
- if (!scx_kf_allowed(SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE))
+ if (!scx_kf_allowed(sch, SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE))
return -EPERM;
rq = scx_locked_rq();
}
@@ -898,7 +925,7 @@ static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_f
* selection optimizations and simply check whether the previously
* used CPU is idle and within the allowed cpumask.
*/
- if (p->nr_cpus_allowed == 1 || is_migration_disabled(p)) {
+ if (p->nr_cpus_allowed == 1 || is_bpf_migration_disabled(p)) {
if (cpumask_test_cpu(prev_cpu, allowed ?: p->cpus_ptr) &&
scx_idle_test_and_clear_cpu(prev_cpu))
cpu = prev_cpu;
@@ -922,9 +949,13 @@ static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_f
*/
__bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
{
- if (!kf_cpu_valid(cpu, NULL))
- return NUMA_NO_NODE;
+ struct scx_sched *sch;
+
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch) || !ops_cpu_valid(sch, cpu, NULL))
+ return NUMA_NO_NODE;
return cpu_to_node(cpu);
}
@@ -946,15 +977,21 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
u64 wake_flags, bool *is_idle)
{
+ struct scx_sched *sch;
s32 cpu;
- cpu = select_cpu_from_kfunc(p, prev_cpu, wake_flags, NULL, 0);
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ cpu = select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags, NULL, 0);
if (cpu >= 0) {
*is_idle = true;
return cpu;
}
*is_idle = false;
-
return prev_cpu;
}
@@ -981,7 +1018,16 @@ __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
__bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
const struct cpumask *cpus_allowed, u64 flags)
{
- return select_cpu_from_kfunc(p, prev_cpu, wake_flags, cpus_allowed, flags);
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ return select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags,
+ cpus_allowed, flags);
}
/**
@@ -995,7 +1041,15 @@ __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
{
- node = validate_node(node);
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return cpu_none_mask;
+
+ node = validate_node(sch, node);
if (node < 0)
return cpu_none_mask;
@@ -1011,12 +1065,20 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return cpu_none_mask;
+
if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
- scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+ scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
return cpu_none_mask;
}
- if (!check_builtin_idle_enabled())
+ if (!check_builtin_idle_enabled(sch))
return cpu_none_mask;
return idle_cpumask(NUMA_NO_NODE)->cpu;
@@ -1034,7 +1096,15 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
{
- node = validate_node(node);
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return cpu_none_mask;
+
+ node = validate_node(sch, node);
if (node < 0)
return cpu_none_mask;
@@ -1054,12 +1124,20 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return cpu_none_mask;
+
if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
- scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+ scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
return cpu_none_mask;
}
- if (!check_builtin_idle_enabled())
+ if (!check_builtin_idle_enabled(sch))
return cpu_none_mask;
if (sched_smt_active())
@@ -1095,10 +1173,18 @@ __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
*/
__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
{
- if (!check_builtin_idle_enabled())
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
return false;
- if (!kf_cpu_valid(cpu, NULL))
+ if (!check_builtin_idle_enabled(sch))
+ return false;
+
+ if (!ops_cpu_valid(sch, cpu, NULL))
return false;
return scx_idle_test_and_clear_cpu(cpu);
@@ -1126,7 +1212,15 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
__bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed,
int node, u64 flags)
{
- node = validate_node(node);
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ node = validate_node(sch, node);
if (node < 0)
return node;
@@ -1158,12 +1252,20 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed,
__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
u64 flags)
{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
- scx_kf_error("per-node idle tracking is enabled");
+ scx_error(sch, "per-node idle tracking is enabled");
return -EBUSY;
}
- if (!check_builtin_idle_enabled())
+ if (!check_builtin_idle_enabled(sch))
return -EBUSY;
return scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags);
@@ -1193,9 +1295,16 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
__bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed,
int node, u64 flags)
{
+ struct scx_sched *sch;
s32 cpu;
- node = validate_node(node);
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ node = validate_node(sch, node);
if (node < 0)
return node;
@@ -1233,10 +1342,17 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed,
__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
u64 flags)
{
+ struct scx_sched *sch;
s32 cpu;
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
- scx_kf_error("per-node idle tracking is enabled");
+ scx_error(sch, "per-node idle tracking is enabled");
return -EBUSY;
}
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
new file mode 100644
index 000000000000..b3617abed510
--- /dev/null
+++ b/kernel/sched/ext_internal.h
@@ -0,0 +1,1078 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Tejun Heo <tj@kernel.org>
+ */
+#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
+
+enum scx_consts {
+ SCX_DSP_DFL_MAX_BATCH = 32,
+ SCX_DSP_MAX_LOOPS = 32,
+ SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ,
+
+ SCX_EXIT_BT_LEN = 64,
+ SCX_EXIT_MSG_LEN = 1024,
+ SCX_EXIT_DUMP_DFL_LEN = 32768,
+
+ SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE,
+
+ /*
+ * Iterating all tasks may take a while. Periodically drop
+ * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
+ */
+ SCX_TASK_ITER_BATCH = 32,
+};
+
+enum scx_exit_kind {
+ SCX_EXIT_NONE,
+ SCX_EXIT_DONE,
+
+ SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */
+ SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */
+ SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */
+ SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */
+
+ SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */
+ SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */
+ SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */
+};
+
+/*
+ * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(),
+ * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes
+ * are 64-bit values with the following format:
+ *
+ *   Bits: [63  ..  48 47   ..  32 31 .. 0]
+ *         [ SYS ACT ] [ SYS RSN ] [ USR  ]
+ *
+ * SYS ACT: System-defined exit actions
+ * SYS RSN: System-defined exit reasons
+ * USR : User-defined exit codes and reasons
+ *
+ * Using the above, users may communicate intention and context by ORing system
+ * actions and/or system reasons with a user-defined exit code.
+ */
+enum scx_exit_code {
+ /* Reasons */
+ SCX_ECODE_RSN_HOTPLUG = 1LLU << 32,
+
+ /* Actions */
+ SCX_ECODE_ACT_RESTART = 1LLU << 48,
+};
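A user-defined exit code is OR-ed with the system-defined action and reason bits, and the consumer masks the ranges apart again. A stand-alone sketch of composing and decoding such a code (the mask names and the user value 7 are illustrative):

#include <stdint.h>
#include <stdio.h>

#define ECODE_RSN_HOTPLUG       (1ULL << 32)    /* system-defined reason */
#define ECODE_ACT_RESTART       (1ULL << 48)    /* system-defined action */

#define ECODE_USR_MASK          ((1ULL << 32) - 1)
#define ECODE_RSN_MASK          (((1ULL << 16) - 1) << 32)
#define ECODE_ACT_MASK          (((1ULL << 16) - 1) << 48)

int main(void)
{
        /* "restart me, I exited because of a hotplug event", user code 7 */
        uint64_t ecode = ECODE_ACT_RESTART | ECODE_RSN_HOTPLUG | 7;

        printf("act=%#llx rsn=%#llx usr=%llu\n",
               (unsigned long long)(ecode & ECODE_ACT_MASK),
               (unsigned long long)(ecode & ECODE_RSN_MASK),
               (unsigned long long)(ecode & ECODE_USR_MASK));
        return 0;
}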
+
+enum scx_exit_flags {
+ /*
+ * ops.exit() may be called even if the loading failed before ops.init()
+ * finishes successfully. This is because ops.exit() allows rich exit
+ * info communication. The following flag indicates whether ops.init()
+ * finished successfully.
+ */
+ SCX_EFLAG_INITIALIZED = 1LLU << 0,
+};
+
+/*
+ * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is
+ * being disabled.
+ */
+struct scx_exit_info {
+ /* %SCX_EXIT_* - broad category of the exit reason */
+ enum scx_exit_kind kind;
+
+ /* exit code if gracefully exiting */
+ s64 exit_code;
+
+ /* %SCX_EFLAG_* */
+ u64 flags;
+
+ /* textual representation of the above */
+ const char *reason;
+
+ /* backtrace if exiting due to an error */
+ unsigned long *bt;
+ u32 bt_len;
+
+ /* informational message */
+ char *msg;
+
+ /* debug dump */
+ char *dump;
+};
+
+/* sched_ext_ops.flags */
+enum scx_ops_flags {
+ /*
+ * Keep built-in idle tracking even if ops.update_idle() is implemented.
+ */
+ SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
+
+ /*
+ * By default, if there are no other tasks to run on the CPU, ext core
+ * keeps running the current task even after its slice expires. If this
+ * flag is specified, such tasks are passed to ops.enqueue() with
+ * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
+ */
+ SCX_OPS_ENQ_LAST = 1LLU << 1,
+
+ /*
+ * An exiting task may schedule after PF_EXITING is set. In such cases,
+ * bpf_task_from_pid() may not be able to find the task and if the BPF
+ * scheduler depends on pid lookup for dispatching, the task will be
+ * lost leading to various issues including RCU grace period stalls.
+ *
+ * To mask this problem, by default, unhashed tasks are automatically
+ * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't
+ * depend on pid lookups and wants to handle these tasks directly, the
+ * following flag can be used.
+ */
+ SCX_OPS_ENQ_EXITING = 1LLU << 2,
+
+ /*
+ * If set, only tasks with policy set to SCHED_EXT are attached to
+ * sched_ext. If clear, SCHED_NORMAL tasks are also included.
+ */
+ SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
+
+ /*
+ * A migration disabled task can only execute on its current CPU. By
+ * default, such tasks are automatically put on the CPU's local DSQ with
+ * the default slice on enqueue. If this ops flag is set, they also go
+ * through ops.enqueue().
+ *
+ * A migration disabled task never invokes ops.select_cpu() as it can
+ * only select the current CPU. Also, p->cpus_ptr will only contain its
+ * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
+ * and thus may disagree with cpumask_weight(p->cpus_ptr).
+ */
+ SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
+
+ /*
+ * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes
+ * ops.enqueue() on the CPU selected by ops.select_cpu() or on the wakee's
+ * previous CPU via IPI (inter-processor interrupt) to reduce cacheline
+ * transfers. When this optimization is enabled, ops.select_cpu() is
+ * skipped in some cases (when racing against the wakee switching out).
+ * As the BPF scheduler may depend on ops.select_cpu() being invoked
+ * during wakeups, queued wakeup is disabled by default.
+ *
+ * If this ops flag is set, queued wakeup optimization is enabled and
+ * the BPF scheduler must be able to handle ops.enqueue() invoked on the
+ * wakee's CPU without preceding ops.select_cpu() even for tasks which
+ * may be executed on multiple CPUs.
+ */
+ SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5,
+
+ /*
+ * If set, enable per-node idle cpumasks. If clear, use a single global
+ * flat idle cpumask.
+ */
+ SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6,
+
+ /*
+ * CPU cgroup support flags
+ */
+	SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed in 6.18 */
+
+ SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
+ SCX_OPS_ENQ_LAST |
+ SCX_OPS_ENQ_EXITING |
+ SCX_OPS_ENQ_MIGRATION_DISABLED |
+ SCX_OPS_ALLOW_QUEUED_WAKEUP |
+ SCX_OPS_SWITCH_PARTIAL |
+ SCX_OPS_BUILTIN_IDLE_PER_NODE |
+ SCX_OPS_HAS_CGROUP_WEIGHT,
+
+ /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */
+ __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56,
+
+ SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56,
+};
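
As an illustration (hypothetical names, assuming the common BPF struct_ops loader convention), these flags are typically OR'd into the flags member of the ops table:

SEC(".struct_ops.link")
struct sched_ext_ops example_ops = {
	/* keep built-in idle tracking and receive slice-expired last tasks */
	.flags = SCX_OPS_KEEP_BUILTIN_IDLE | SCX_OPS_ENQ_LAST,
	.name  = "example",
};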
+
+/* argument container for ops.init_task() */
+struct scx_init_task_args {
+ /*
+ * Set if ops.init_task() is being invoked on the fork path, as opposed
+ * to the scheduler transition path.
+ */
+ bool fork;
+#ifdef CONFIG_EXT_GROUP_SCHED
+ /* the cgroup the task is joining */
+ struct cgroup *cgroup;
+#endif
+};
+
+/* argument container for ops.exit_task() */
+struct scx_exit_task_args {
+ /* Whether the task exited before running on sched_ext. */
+ bool cancelled;
+};
+
+/* argument container for ops->cgroup_init() */
+struct scx_cgroup_init_args {
+ /* the weight of the cgroup [1..10000] */
+ u32 weight;
+
+ /* bandwidth control parameters from cpu.max and cpu.max.burst */
+ u64 bw_period_us;
+ u64 bw_quota_us;
+ u64 bw_burst_us;
+};
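
For illustration only (hypothetical helper, plain C arithmetic), the bandwidth fields translate into a CPU entitlement as quota divided by period:

/* e.g. quota 250000us over a 100000us period -> 250, i.e. 2.5 CPUs worth */
static inline unsigned int example_cgroup_cpus_x100(u64 quota_us, u64 period_us)
{
	return period_us ? (unsigned int)(quota_us * 100 / period_us) : 0;
}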
+
+enum scx_cpu_preempt_reason {
+ /* next task is being scheduled by &sched_class_rt */
+ SCX_CPU_PREEMPT_RT,
+ /* next task is being scheduled by &sched_class_dl */
+ SCX_CPU_PREEMPT_DL,
+ /* next task is being scheduled by &sched_class_stop */
+ SCX_CPU_PREEMPT_STOP,
+ /* unknown reason for SCX being preempted */
+ SCX_CPU_PREEMPT_UNKNOWN,
+};
+
+/*
+ * Argument container for ops->cpu_acquire(). Currently empty, but may be
+ * expanded in the future.
+ */
+struct scx_cpu_acquire_args {};
+
+/* argument container for ops->cpu_release() */
+struct scx_cpu_release_args {
+ /* the reason the CPU was preempted */
+ enum scx_cpu_preempt_reason reason;
+
+ /* the task that's going to be scheduled on the CPU */
+ struct task_struct *task;
+};
+
+/*
+ * Informational context provided to dump operations.
+ */
+struct scx_dump_ctx {
+ enum scx_exit_kind kind;
+ s64 exit_code;
+ const char *reason;
+ u64 at_ns;
+ u64 at_jiffies;
+};
+
+/**
+ * struct sched_ext_ops - Operation table for BPF scheduler implementation
+ *
+ * A BPF scheduler can implement an arbitrary scheduling policy by
+ * implementing and loading operations in this table. Note that a userland
+ * scheduling policy can also be implemented using the BPF scheduler
+ * as a shim layer.
+ */
+struct sched_ext_ops {
+ /**
+ * @select_cpu: Pick the target CPU for a task which is being woken up
+ * @p: task being woken up
+ * @prev_cpu: the cpu @p was on before sleeping
+ * @wake_flags: SCX_WAKE_*
+ *
+ * Decision made here isn't final. @p may be moved to any CPU while it
+ * is getting dispatched for execution later. However, as @p is not on
+ * the rq at this point, getting the eventual execution CPU right here
+ * saves a small bit of overhead down the line.
+ *
+ * If an idle CPU is returned, the CPU is kicked and will try to
+ * dispatch. While an explicit custom mechanism can be added,
+ * select_cpu() serves as the default way to wake up idle CPUs.
+ *
+ * @p may be inserted into a DSQ directly by calling
+ * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped.
+ * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ
+ * of the CPU returned by this operation.
+ *
+ * Note that select_cpu() is never called for tasks that can only run
+ * on a single CPU or tasks with migration disabled, as they don't have
+ * the option to select a different CPU. See select_task_rq() for
+ * details.
+ */
+ s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);
+
+ /**
+ * @enqueue: Enqueue a task on the BPF scheduler
+ * @p: task being enqueued
+ * @enq_flags: %SCX_ENQ_*
+ *
+ * @p is ready to run. Insert directly into a DSQ by calling
+ * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly
+ * inserted, the bpf scheduler owns @p and if it fails to dispatch @p,
+ * the task will stall.
+ *
+ * If @p was inserted into a DSQ from ops.select_cpu(), this callback is
+ * skipped.
+ */
+ void (*enqueue)(struct task_struct *p, u64 enq_flags);
+
+ /**
+ * @dequeue: Remove a task from the BPF scheduler
+ * @p: task being dequeued
+ * @deq_flags: %SCX_DEQ_*
+ *
+ * Remove @p from the BPF scheduler. This is usually called to isolate
+ * the task while updating its scheduling properties (e.g. priority).
+ *
+ * The ext core keeps track of whether the BPF side owns a given task or
+ * not and can gracefully ignore spurious dispatches from BPF side,
+ * which makes it safe to not implement this method. However, depending
+ * on the scheduling logic, this can lead to confusing behaviors - e.g.
+ * scheduling position not being updated across a priority change.
+ */
+ void (*dequeue)(struct task_struct *p, u64 deq_flags);
+
+ /**
+ * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs
+ * @cpu: CPU to dispatch tasks for
+ * @prev: previous task being switched out
+ *
+ * Called when a CPU's local dsq is empty. The operation should dispatch
+ * one or more tasks from the BPF scheduler into the DSQs using
+ * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ
+ * using scx_bpf_dsq_move_to_local().
+ *
+ * The maximum number of times scx_bpf_dsq_insert() can be called
+ * without an intervening scx_bpf_dsq_move_to_local() is specified by
+ * ops.dispatch_max_batch. See the comments on top of the two functions
+ * for more details.
+ *
+ * When not %NULL, @prev is an SCX task with its slice depleted. If
+ * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in
+ * @prev->scx.flags, it is not enqueued yet and will be enqueued after
+ * ops.dispatch() returns. To keep executing @prev, return without
+ * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST.
+ */
+ void (*dispatch)(s32 cpu, struct task_struct *prev);
+
+ /**
+ * @tick: Periodic tick
+ * @p: task running currently
+ *
+ * This operation is called every 1/HZ seconds on CPUs which are
+ * executing an SCX task. Setting @p->scx.slice to 0 will trigger an
+ * immediate dispatch cycle on the CPU.
+ */
+ void (*tick)(struct task_struct *p);
+
+ /**
+ * @runnable: A task is becoming runnable on its associated CPU
+ * @p: task becoming runnable
+ * @enq_flags: %SCX_ENQ_*
+ *
+ * This and the following three functions can be used to track a task's
+ * execution state transitions. A task becomes ->runnable() on a CPU,
+ * and then goes through one or more ->running() and ->stopping() pairs
+ * as it runs on the CPU, and eventually becomes ->quiescent() when it's
+ * done running on the CPU.
+ *
+ * @p is becoming runnable on the CPU because it's
+ *
+ * - waking up (%SCX_ENQ_WAKEUP)
+ * - being moved from another CPU
+	 * - being restored after being temporarily taken off the queue for an
+	 *   attribute change.
+ *
+ * This and ->enqueue() are related but not coupled. This operation
+ * notifies @p's state transition and may not be followed by ->enqueue()
+ * e.g. when @p is being dispatched to a remote CPU, or when @p is
+ * being enqueued on a CPU experiencing a hotplug event. Likewise, a
+ * task may be ->enqueue()'d without being preceded by this operation
+ * e.g. after exhausting its slice.
+ */
+ void (*runnable)(struct task_struct *p, u64 enq_flags);
+
+ /**
+ * @running: A task is starting to run on its associated CPU
+ * @p: task starting to run
+ *
+ * Note that this callback may be called from a CPU other than the
+ * one the task is going to run on. This can happen when a task
+ * property is changed (i.e., affinity), since scx_next_task_scx(),
+ * which triggers this callback, may run on a CPU different from
+ * the task's assigned CPU.
+ *
+ * Therefore, always use scx_bpf_task_cpu(@p) to determine the
+ * target CPU the task is going to use.
+ *
+ * See ->runnable() for explanation on the task state notifiers.
+ */
+ void (*running)(struct task_struct *p);
+
+ /**
+ * @stopping: A task is stopping execution
+ * @p: task stopping to run
+ * @runnable: is task @p still runnable?
+ *
+ * Note that this callback may be called from a CPU other than the
+ * one the task was running on. This can happen when a task
+ * property is changed (i.e., affinity), since dequeue_task_scx(),
+ * which triggers this callback, may run on a CPU different from
+ * the task's assigned CPU.
+ *
+ * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU
+ * the task was running on.
+ *
+ * See ->runnable() for explanation on the task state notifiers. If
+ * !@runnable, ->quiescent() will be invoked after this operation
+ * returns.
+ */
+ void (*stopping)(struct task_struct *p, bool runnable);
+
+ /**
+ * @quiescent: A task is becoming not runnable on its associated CPU
+ * @p: task becoming not runnable
+ * @deq_flags: %SCX_DEQ_*
+ *
+ * See ->runnable() for explanation on the task state notifiers.
+ *
+ * @p is becoming quiescent on the CPU because it's
+ *
+ * - sleeping (%SCX_DEQ_SLEEP)
+ * - being moved to another CPU
+ * - being temporarily taken off the queue for an attribute change
+ * (%SCX_DEQ_SAVE)
+ *
+ * This and ->dequeue() are related but not coupled. This operation
+ * notifies @p's state transition and may not be preceded by ->dequeue()
+ * e.g. when @p is being dispatched to a remote CPU.
+ */
+ void (*quiescent)(struct task_struct *p, u64 deq_flags);
+
+ /**
+ * @yield: Yield CPU
+ * @from: yielding task
+ * @to: optional yield target task
+ *
+ * If @to is NULL, @from is yielding the CPU to other runnable tasks.
+ * The BPF scheduler should ensure that other available tasks are
+ * dispatched before the yielding task. Return value is ignored in this
+ * case.
+ *
+ * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf
+ * scheduler can implement the request, return %true; otherwise, %false.
+ */
+ bool (*yield)(struct task_struct *from, struct task_struct *to);
+
+ /**
+ * @core_sched_before: Task ordering for core-sched
+ * @a: task A
+ * @b: task B
+ *
+ * Used by core-sched to determine the ordering between two tasks. See
+ * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on
+ * core-sched.
+ *
+ * Both @a and @b are runnable and may or may not currently be queued on
+ * the BPF scheduler. Should return %true if @a should run before @b.
+ * %false if there's no required ordering or @b should run before @a.
+ *
+ * If not specified, the default is ordering them according to when they
+ * became runnable.
+ */
+ bool (*core_sched_before)(struct task_struct *a, struct task_struct *b);
+
+ /**
+ * @set_weight: Set task weight
+ * @p: task to set weight for
+ * @weight: new weight [1..10000]
+ *
+ * Update @p's weight to @weight.
+ */
+ void (*set_weight)(struct task_struct *p, u32 weight);
+
+ /**
+ * @set_cpumask: Set CPU affinity
+ * @p: task to set CPU affinity for
+ * @cpumask: cpumask of cpus that @p can run on
+ *
+ * Update @p's CPU affinity to @cpumask.
+ */
+ void (*set_cpumask)(struct task_struct *p,
+ const struct cpumask *cpumask);
+
+ /**
+ * @update_idle: Update the idle state of a CPU
+ * @cpu: CPU to update the idle state for
+ * @idle: whether entering or exiting the idle state
+ *
+	 * This operation is called when @rq's CPU enters or leaves the idle
+ * state. By default, implementing this operation disables the built-in
+ * idle CPU tracking and the following helpers become unavailable:
+ *
+ * - scx_bpf_select_cpu_dfl()
+ * - scx_bpf_select_cpu_and()
+ * - scx_bpf_test_and_clear_cpu_idle()
+ * - scx_bpf_pick_idle_cpu()
+ *
+ * The user also must implement ops.select_cpu() as the default
+ * implementation relies on scx_bpf_select_cpu_dfl().
+ *
+ * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
+ * tracking.
+ */
+ void (*update_idle)(s32 cpu, bool idle);
+
+ /**
+ * @cpu_acquire: A CPU is becoming available to the BPF scheduler
+ * @cpu: The CPU being acquired by the BPF scheduler.
+ * @args: Acquire arguments, see the struct definition.
+ *
+ * A CPU that was previously released from the BPF scheduler is now once
+ * again under its control.
+ */
+ void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
+
+ /**
+ * @cpu_release: A CPU is taken away from the BPF scheduler
+ * @cpu: The CPU being released by the BPF scheduler.
+ * @args: Release arguments, see the struct definition.
+ *
+ * The specified CPU is no longer under the control of the BPF
+ * scheduler. This could be because it was preempted by a higher
+ * priority sched_class, though there may be other reasons as well. The
+ * caller should consult @args->reason to determine the cause.
+ */
+ void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
+
+ /**
+ * @init_task: Initialize a task to run in a BPF scheduler
+ * @p: task to initialize for BPF scheduling
+ * @args: init arguments, see the struct definition
+ *
+ * Either we're loading a BPF scheduler or a new task is being forked.
+ * Initialize @p for BPF scheduling. This operation may block and can
+ * be used for allocations, and is called exactly once for a task.
+ *
+ * Return 0 for success, -errno for failure. An error return while
+ * loading will abort loading of the BPF scheduler. During a fork, it
+ * will abort that specific fork.
+ */
+ s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args);
+
+ /**
+ * @exit_task: Exit a previously-running task from the system
+ * @p: task to exit
+ * @args: exit arguments, see the struct definition
+ *
+ * @p is exiting or the BPF scheduler is being unloaded. Perform any
+ * necessary cleanup for @p.
+ */
+ void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args);
+
+ /**
+ * @enable: Enable BPF scheduling for a task
+ * @p: task to enable BPF scheduling for
+ *
+ * Enable @p for BPF scheduling. enable() is called on @p any time it
+ * enters SCX, and is always paired with a matching disable().
+ */
+ void (*enable)(struct task_struct *p);
+
+ /**
+ * @disable: Disable BPF scheduling for a task
+ * @p: task to disable BPF scheduling for
+ *
+ * @p is exiting, leaving SCX or the BPF scheduler is being unloaded.
+ * Disable BPF scheduling for @p. A disable() call is always matched
+ * with a prior enable() call.
+ */
+ void (*disable)(struct task_struct *p);
+
+ /**
+ * @dump: Dump BPF scheduler state on error
+ * @ctx: debug dump context
+ *
+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump.
+ */
+ void (*dump)(struct scx_dump_ctx *ctx);
+
+ /**
+ * @dump_cpu: Dump BPF scheduler state for a CPU on error
+ * @ctx: debug dump context
+ * @cpu: CPU to generate debug dump for
+ * @idle: @cpu is currently idle without any runnable tasks
+ *
+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
+ * @cpu. If @idle is %true and this operation doesn't produce any
+ * output, @cpu is skipped for dump.
+ */
+ void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle);
+
+ /**
+ * @dump_task: Dump BPF scheduler state for a runnable task on error
+ * @ctx: debug dump context
+ * @p: runnable task to generate debug dump for
+ *
+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
+ * @p.
+ */
+ void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p);
+
+#ifdef CONFIG_EXT_GROUP_SCHED
+ /**
+ * @cgroup_init: Initialize a cgroup
+ * @cgrp: cgroup being initialized
+ * @args: init arguments, see the struct definition
+ *
+ * Either the BPF scheduler is being loaded or @cgrp created, initialize
+ * @cgrp for sched_ext. This operation may block.
+ *
+ * Return 0 for success, -errno for failure. An error return while
+ * loading will abort loading of the BPF scheduler. During cgroup
+ * creation, it will abort the specific cgroup creation.
+ */
+ s32 (*cgroup_init)(struct cgroup *cgrp,
+ struct scx_cgroup_init_args *args);
+
+ /**
+ * @cgroup_exit: Exit a cgroup
+ * @cgrp: cgroup being exited
+ *
+ * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit
+	 * @cgrp for sched_ext. This operation may block.
+ */
+ void (*cgroup_exit)(struct cgroup *cgrp);
+
+ /**
+ * @cgroup_prep_move: Prepare a task to be moved to a different cgroup
+ * @p: task being moved
+ * @from: cgroup @p is being moved from
+ * @to: cgroup @p is being moved to
+ *
+ * Prepare @p for move from cgroup @from to @to. This operation may
+ * block and can be used for allocations.
+ *
+ * Return 0 for success, -errno for failure. An error return aborts the
+ * migration.
+ */
+ s32 (*cgroup_prep_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+
+ /**
+ * @cgroup_move: Commit cgroup move
+ * @p: task being moved
+ * @from: cgroup @p is being moved from
+ * @to: cgroup @p is being moved to
+ *
+ * Commit the move. @p is dequeued during this operation.
+ */
+ void (*cgroup_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+
+ /**
+ * @cgroup_cancel_move: Cancel cgroup move
+ * @p: task whose cgroup move is being canceled
+ * @from: cgroup @p was being moved from
+ * @to: cgroup @p was being moved to
+ *
+ * @p was cgroup_prep_move()'d but failed before reaching cgroup_move().
+ * Undo the preparation.
+ */
+ void (*cgroup_cancel_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+
+ /**
+ * @cgroup_set_weight: A cgroup's weight is being changed
+ * @cgrp: cgroup whose weight is being updated
+ * @weight: new weight [1..10000]
+ *
+ * Update @cgrp's weight to @weight.
+ */
+ void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
+
+ /**
+ * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed
+ * @cgrp: cgroup whose bandwidth is being updated
+ * @period_us: bandwidth control period
+ * @quota_us: bandwidth control quota
+ * @burst_us: bandwidth control burst
+ *
+ * Update @cgrp's bandwidth control parameters. This is from the cpu.max
+ * cgroup interface.
+ *
+	 * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled
+	 * to. For example, if @period_us is 1_000_000 and @quota_us is
+	 * 2_500_000, @cgrp is entitled to 2.5 CPUs. @burst_us can be
+	 * interpreted in the same fashion and specifies how much @cgrp can
+	 * burst temporarily. The specific control mechanism and thus the
+	 * interpretation of @period_us and burstiness is up to the BPF
+	 * scheduler.
+ */
+ void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
+ u64 period_us, u64 quota_us, u64 burst_us);
+
+#endif /* CONFIG_EXT_GROUP_SCHED */
+
+ /*
+ * All online ops must come before ops.cpu_online().
+ */
+
+ /**
+ * @cpu_online: A CPU became online
+ * @cpu: CPU which just came up
+ *
+ * @cpu just came online. @cpu will not call ops.enqueue() or
+ * ops.dispatch(), nor run tasks associated with other CPUs beforehand.
+ */
+ void (*cpu_online)(s32 cpu);
+
+ /**
+ * @cpu_offline: A CPU is going offline
+ * @cpu: CPU which is going offline
+ *
+ * @cpu is going offline. @cpu will not call ops.enqueue() or
+ * ops.dispatch(), nor run tasks associated with other CPUs afterwards.
+ */
+ void (*cpu_offline)(s32 cpu);
+
+ /*
+ * All CPU hotplug ops must come before ops.init().
+ */
+
+ /**
+ * @init: Initialize the BPF scheduler
+ */
+ s32 (*init)(void);
+
+ /**
+ * @exit: Clean up after the BPF scheduler
+ * @info: Exit info
+ *
+ * ops.exit() is also called on ops.init() failure, which is a bit
+ * unusual. This is to allow rich reporting through @info on how
+ * ops.init() failed.
+ */
+ void (*exit)(struct scx_exit_info *info);
+
+ /**
+ * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch
+ */
+ u32 dispatch_max_batch;
+
+ /**
+ * @flags: %SCX_OPS_* flags
+ */
+ u64 flags;
+
+ /**
+ * @timeout_ms: The maximum amount of time, in milliseconds, that a
+ * runnable task should be able to wait before being scheduled. The
+ * maximum timeout may not exceed the default timeout of 30 seconds.
+ *
+ * Defaults to the maximum allowed timeout value of 30 seconds.
+ */
+ u32 timeout_ms;
+
+ /**
+ * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default
+ * value of 32768 is used.
+ */
+ u32 exit_dump_len;
+
+ /**
+ * @hotplug_seq: A sequence number that may be set by the scheduler to
+ * detect when a hotplug event has occurred during the loading process.
+ * If 0, no detection occurs. Otherwise, the scheduler will fail to
+ * load if the sequence number does not match @scx_hotplug_seq on the
+ * enable path.
+ */
+ u64 hotplug_seq;
+
+ /**
+ * @name: BPF scheduler's name
+ *
+ * Must be a non-zero valid BPF object name including only isalnum(),
+ * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the
+ * BPF scheduler is enabled.
+ */
+ char name[SCX_OPS_NAME_LEN];
+
+ /* internal use only, must be NULL */
+ void *priv;
+};
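
A minimal global-FIFO sketch (illustrative, not part of this patch) shows how the core callbacks above fit together; it assumes the usual sched_ext BPF conventions (BPF_STRUCT_OPS macros, a shared DSQ created in ops.init()) and uses only the kfuncs named in the comments above.

#define SHARED_DSQ 0

s32 BPF_STRUCT_OPS_SLEEPABLE(fifo_init)
{
	return scx_bpf_create_dsq(SHARED_DSQ, -1);
}

void BPF_STRUCT_OPS(fifo_enqueue, struct task_struct *p, u64 enq_flags)
{
	/* hand every runnable task to one shared FIFO DSQ */
	scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
}

void BPF_STRUCT_OPS(fifo_dispatch, s32 cpu, struct task_struct *prev)
{
	/* refill the local DSQ from the shared FIFO when it runs dry */
	scx_bpf_dsq_move_to_local(SHARED_DSQ);
}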
+
+enum scx_opi {
+ SCX_OPI_BEGIN = 0,
+ SCX_OPI_NORMAL_BEGIN = 0,
+ SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online),
+ SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online),
+ SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init),
+ SCX_OPI_END = SCX_OP_IDX(init),
+};
+
+/*
+ * Collection of event counters. Event types are placed in descending order.
+ */
+struct scx_event_stats {
+ /*
+ * If ops.select_cpu() returns a CPU which can't be used by the task,
+ * the core scheduler code silently picks a fallback CPU.
+ */
+ s64 SCX_EV_SELECT_CPU_FALLBACK;
+
+ /*
+ * When dispatching to a local DSQ, the CPU may have gone offline in
+ * the meantime. In this case, the task is bounced to the global DSQ.
+ */
+ s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
+
+ /*
+ * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
+ * continued to run because there were no other tasks on the CPU.
+ */
+ s64 SCX_EV_DISPATCH_KEEP_LAST;
+
+ /*
+ * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
+ * is dispatched to a local DSQ when exiting.
+ */
+ s64 SCX_EV_ENQ_SKIP_EXITING;
+
+ /*
+ * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
+ * migration disabled task skips ops.enqueue() and is dispatched to its
+ * local DSQ.
+ */
+ s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
+
+ /*
+ * Total number of times a task's time slice was refilled with the
+ * default value (SCX_SLICE_DFL).
+ */
+ s64 SCX_EV_REFILL_SLICE_DFL;
+
+ /*
+ * The total duration of bypass modes in nanoseconds.
+ */
+ s64 SCX_EV_BYPASS_DURATION;
+
+ /*
+ * The number of tasks dispatched in the bypassing mode.
+ */
+ s64 SCX_EV_BYPASS_DISPATCH;
+
+ /*
+ * The number of times the bypassing mode has been activated.
+ */
+ s64 SCX_EV_BYPASS_ACTIVATE;
+};
+
+struct scx_sched_pcpu {
+ /*
+ * The event counters are in a per-CPU variable to minimize the
+ * accounting overhead. A system-wide view on the event counter is
+ * constructed when requested by scx_bpf_events().
+ */
+ struct scx_event_stats event_stats;
+};
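
A rough sketch (hypothetical kernel-side helper) of how a system-wide view could be folded from the per-CPU counters, along the lines of what scx_bpf_events() is described to do:

static void example_sum_events(struct scx_sched *sch, struct scx_event_stats *sum)
{
	int cpu;

	memset(sum, 0, sizeof(*sum));
	for_each_possible_cpu(cpu) {
		struct scx_event_stats *e = &per_cpu_ptr(sch->pcpu, cpu)->event_stats;

		/* fold a couple of representative counters */
		sum->SCX_EV_SELECT_CPU_FALLBACK += e->SCX_EV_SELECT_CPU_FALLBACK;
		sum->SCX_EV_BYPASS_DISPATCH += e->SCX_EV_BYPASS_DISPATCH;
	}
}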
+
+struct scx_sched {
+ struct sched_ext_ops ops;
+ DECLARE_BITMAP(has_op, SCX_OPI_END);
+
+ /*
+ * Dispatch queues.
+ *
+ * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability.
+ * This is to avoid live-locking in bypass mode where all tasks are
+ * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If
+ * per-node split isn't sufficient, it can be further split.
+ */
+ struct rhashtable dsq_hash;
+ struct scx_dispatch_q **global_dsqs;
+ struct scx_sched_pcpu __percpu *pcpu;
+
+ bool warned_zero_slice:1;
+ bool warned_deprecated_rq:1;
+
+ atomic_t exit_kind;
+ struct scx_exit_info *exit_info;
+
+ struct kobject kobj;
+
+ struct kthread_worker *helper;
+ struct irq_work error_irq_work;
+ struct kthread_work disable_work;
+ struct rcu_work rcu_work;
+};
+
+enum scx_wake_flags {
+ /* expose select WF_* flags as enums */
+ SCX_WAKE_FORK = WF_FORK,
+ SCX_WAKE_TTWU = WF_TTWU,
+ SCX_WAKE_SYNC = WF_SYNC,
+};
+
+enum scx_enq_flags {
+ /* expose select ENQUEUE_* flags as enums */
+ SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP,
+ SCX_ENQ_HEAD = ENQUEUE_HEAD,
+ SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED,
+
+ /* high 32bits are SCX specific */
+
+ /*
+ * Set the following to trigger preemption when calling
+ * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the
+ * current task is cleared to zero and the CPU is kicked into the
+ * scheduling path. Implies %SCX_ENQ_HEAD.
+ */
+ SCX_ENQ_PREEMPT = 1LLU << 32,
+
+ /*
+ * The task being enqueued was previously enqueued on the current CPU's
+ * %SCX_DSQ_LOCAL, but was removed from it in a call to the
+ * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was
+ * invoked in a ->cpu_release() callback, and the task is again
+	 * dispatched back to %SCX_DSQ_LOCAL by this current ->enqueue(), the
+ * task will not be scheduled on the CPU until at least the next invocation
+ * of the ->cpu_acquire() callback.
+ */
+ SCX_ENQ_REENQ = 1LLU << 40,
+
+ /*
+ * The task being enqueued is the only task available for the cpu. By
+ * default, ext core keeps executing such tasks but when
+ * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the
+ * %SCX_ENQ_LAST flag set.
+ *
+ * The BPF scheduler is responsible for triggering a follow-up
+	 * scheduling event. Otherwise, execution may stall.
+ */
+ SCX_ENQ_LAST = 1LLU << 41,
+
+ /* high 8 bits are internal */
+ __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56,
+
+ SCX_ENQ_CLEAR_OPSS = 1LLU << 56,
+ SCX_ENQ_DSQ_PRIQ = 1LLU << 57,
+};
+
+enum scx_deq_flags {
+ /* expose select DEQUEUE_* flags as enums */
+ SCX_DEQ_SLEEP = DEQUEUE_SLEEP,
+
+ /* high 32bits are SCX specific */
+
+ /*
+ * The generic core-sched layer decided to execute the task even though
+ * it hasn't been dispatched yet. Dequeue from the BPF side.
+ */
+ SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32,
+};
+
+enum scx_pick_idle_cpu_flags {
+ SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */
+ SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */
+};
+
+enum scx_kick_flags {
+ /*
+ * Kick the target CPU if idle. Guarantees that the target CPU goes
+ * through at least one full scheduling cycle before going idle. If the
+	 * target CPU can be determined to be currently not idle and about to go
+	 * through a scheduling cycle before going idle, this is a noop.
+ */
+ SCX_KICK_IDLE = 1LLU << 0,
+
+ /*
+ * Preempt the current task and execute the dispatch path. If the
+ * current task of the target CPU is an SCX task, its ->scx.slice is
+ * cleared to zero before the scheduling path is invoked so that the
+ * task expires and the dispatch path is invoked.
+ */
+ SCX_KICK_PREEMPT = 1LLU << 1,
+
+ /*
+ * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will
+ * return after the target CPU finishes picking the next task.
+ */
+ SCX_KICK_WAIT = 1LLU << 2,
+};
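
For example (BPF-side, illustrative), a scheduler that inserts into a remote CPU's local DSQ would typically follow up with an idle kick so the target goes through a scheduling cycle:

/* illustrative: place @p on @cpu's local DSQ and wake the CPU if it is idle */
scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0);
scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);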
+
+enum scx_tg_flags {
+ SCX_TG_ONLINE = 1U << 0,
+ SCX_TG_INITED = 1U << 1,
+};
+
+enum scx_enable_state {
+ SCX_ENABLING,
+ SCX_ENABLED,
+ SCX_DISABLING,
+ SCX_DISABLED,
+};
+
+static const char *scx_enable_state_str[] = {
+ [SCX_ENABLING] = "enabling",
+ [SCX_ENABLED] = "enabled",
+ [SCX_DISABLING] = "disabling",
+ [SCX_DISABLED] = "disabled",
+};
+
+/*
+ * sched_ext_entity->ops_state
+ *
+ * Used to track the task ownership between the SCX core and the BPF scheduler.
+ * State transitions look as follows:
+ *
+ * NONE -> QUEUEING -> QUEUED -> DISPATCHING
+ * ^ | |
+ * | v v
+ * \-------------------------------/
+ *
+ * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
+ * sites for explanations on the conditions being waited upon and why they are
+ * safe. Transitions out of them into NONE or QUEUED must store_release and the
+ * waiters should load_acquire.
+ *
+ * Tracking scx_ops_state enables sched_ext core to reliably determine whether
+ * any given task can be dispatched by the BPF scheduler at all times and thus
+ * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
+ * to try to dispatch any task anytime regardless of its state as the SCX core
+ * can safely reject invalid dispatches.
+ */
+enum scx_ops_state {
+ SCX_OPSS_NONE, /* owned by the SCX core */
+ SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */
+ SCX_OPSS_QUEUED, /* owned by the BPF scheduler */
+ SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */
+
+ /*
+ * QSEQ brands each QUEUED instance so that, when dispatch races
+ * dequeue/requeue, the dispatcher can tell whether it still has a claim
+ * on the task being dispatched.
+ *
+ * As some 32bit archs can't do 64bit store_release/load_acquire,
+ * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on
+ * 32bit machines. The dispatch race window QSEQ protects is very narrow
+ * and runs with IRQ disabled. 30 bits should be sufficient.
+ */
+ SCX_OPSS_QSEQ_SHIFT = 2,
+};
+
+/* Use macros to ensure that the type is unsigned long for the masks */
+#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
+#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK)
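
As a small illustration (hypothetical helpers), the state and QSEQ components can be split out with the masks above:

static inline unsigned long example_opss_state(unsigned long opss)
{
	return opss & SCX_OPSS_STATE_MASK;	/* SCX_OPSS_NONE..DISPATCHING */
}

static inline unsigned long example_opss_qseq(unsigned long opss)
{
	return opss >> SCX_OPSS_QSEQ_SHIFT;	/* brand of the QUEUED instance */
}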
+
+DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
+
+/*
+ * Return the rq currently locked from an scx callback, or NULL if no rq is
+ * locked.
+ */
+static inline struct rq *scx_locked_rq(void)
+{
+ return __this_cpu_read(scx_locked_rq_state);
+}
+
+static inline bool scx_kf_allowed_if_unlocked(void)
+{
+ return !current->scx.kf_mask;
+}
+
+static inline bool scx_rq_bypassing(struct rq *rq)
+{
+ return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
+}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b173a059315c..3a89f949e307 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3542,7 +3542,7 @@ out:
}
}
-void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+void init_numa_balancing(u64 clone_flags, struct task_struct *p)
{
int mm_users = 0;
struct mm_struct *mm = p->mm;
@@ -3957,9 +3957,6 @@ static void update_cfs_group(struct sched_entity *se)
if (!gcfs_rq || !gcfs_rq->load.weight)
return;
- if (throttled_hierarchy(gcfs_rq))
- return;
-
shares = calc_group_shares(gcfs_rq);
if (unlikely(se->load.weight != shares))
reweight_entity(cfs_rq_of(se), se, shares);
@@ -5291,18 +5288,16 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (cfs_rq->nr_queued == 1) {
check_enqueue_throttle(cfs_rq);
- if (!throttled_hierarchy(cfs_rq)) {
- list_add_leaf_cfs_rq(cfs_rq);
- } else {
+ list_add_leaf_cfs_rq(cfs_rq);
#ifdef CONFIG_CFS_BANDWIDTH
+ if (cfs_rq->pelt_clock_throttled) {
struct rq *rq = rq_of(cfs_rq);
- if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
- cfs_rq->throttled_clock = rq_clock(rq);
- if (!cfs_rq->throttled_clock_self)
- cfs_rq->throttled_clock_self = rq_clock(rq);
-#endif
+ cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+ cfs_rq->throttled_clock_pelt;
+ cfs_rq->pelt_clock_throttled = 0;
}
+#endif
}
}
@@ -5341,8 +5336,6 @@ static void set_delayed(struct sched_entity *se)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_runnable--;
- if (cfs_rq_throttled(cfs_rq))
- break;
}
}
@@ -5363,8 +5356,6 @@ static void clear_delayed(struct sched_entity *se)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_runnable++;
- if (cfs_rq_throttled(cfs_rq))
- break;
}
}
@@ -5392,7 +5383,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* DELAY_DEQUEUE relies on spurious wakeups, special task
* states must not suffer spurious wakeups, excempt them.
*/
- if (flags & DEQUEUE_SPECIAL)
+ if (flags & (DEQUEUE_SPECIAL | DEQUEUE_THROTTLE))
delay = false;
WARN_ON_ONCE(delay && se->sched_delayed);
@@ -5450,8 +5441,18 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (flags & DEQUEUE_DELAYED)
finish_delayed_dequeue_entity(se);
- if (cfs_rq->nr_queued == 0)
+ if (cfs_rq->nr_queued == 0) {
update_idle_cfs_rq_clock_pelt(cfs_rq);
+#ifdef CONFIG_CFS_BANDWIDTH
+ if (throttled_hierarchy(cfs_rq)) {
+ struct rq *rq = rq_of(cfs_rq);
+
+ list_del_leaf_cfs_rq(cfs_rq);
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+ cfs_rq->pelt_clock_throttled = 1;
+ }
+#endif
+ }
return true;
}
@@ -5725,74 +5726,253 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
return cfs_bandwidth_used() && cfs_rq->throttled;
}
+static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
+{
+ return cfs_bandwidth_used() && cfs_rq->pelt_clock_throttled;
+}
+
/* check whether cfs_rq, or any parent, is throttled */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
return cfs_bandwidth_used() && cfs_rq->throttle_count;
}
+static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu)
+{
+ return throttled_hierarchy(task_group(p)->cfs_rq[dst_cpu]);
+}
+
+static inline bool task_is_throttled(struct task_struct *p)
+{
+ return cfs_bandwidth_used() && p->throttled;
+}
+
+static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
+static void throttle_cfs_rq_work(struct callback_head *work)
+{
+ struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work);
+ struct sched_entity *se;
+ struct cfs_rq *cfs_rq;
+ struct rq *rq;
+
+ WARN_ON_ONCE(p != current);
+ p->sched_throttle_work.next = &p->sched_throttle_work;
+
+ /*
+ * If task is exiting, then there won't be a return to userspace, so we
+ * don't have to bother with any of this.
+ */
+ if ((p->flags & PF_EXITING))
+ return;
+
+ scoped_guard(task_rq_lock, p) {
+ se = &p->se;
+ cfs_rq = cfs_rq_of(se);
+
+ /* Raced, forget */
+ if (p->sched_class != &fair_sched_class)
+ return;
+
+ /*
+ * If not in limbo, then either replenish has happened or this
+ * task got migrated out of the throttled cfs_rq, move along.
+ */
+ if (!cfs_rq->throttle_count)
+ return;
+ rq = scope.rq;
+ update_rq_clock(rq);
+ WARN_ON_ONCE(p->throttled || !list_empty(&p->throttle_node));
+ dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_THROTTLE);
+ list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+ /*
+ * Must not set throttled before dequeue or dequeue will
+ * mistakenly regard this task as an already throttled one.
+ */
+ p->throttled = true;
+ resched_curr(rq);
+ }
+}
+
+void init_cfs_throttle_work(struct task_struct *p)
+{
+ init_task_work(&p->sched_throttle_work, throttle_cfs_rq_work);
+ /* Protect against double add, see throttle_cfs_rq() and throttle_cfs_rq_work() */
+ p->sched_throttle_work.next = &p->sched_throttle_work;
+ INIT_LIST_HEAD(&p->throttle_node);
+}
+
/*
- * Ensure that neither of the group entities corresponding to src_cpu or
- * dest_cpu are members of a throttled hierarchy when performing group
- * load-balance operations.
+ * Task is throttled and someone wants to dequeue it again:
+ * it could be sched/core when core needs to do things like
+ * task affinity change, task group change, task sched class
+ * change etc. and in these cases, DEQUEUE_SLEEP is not set;
+ * or the task is blocked after throttled due to freezer etc.
+ * and in these cases, DEQUEUE_SLEEP is set.
*/
-static inline int throttled_lb_pair(struct task_group *tg,
- int src_cpu, int dest_cpu)
+static void detach_task_cfs_rq(struct task_struct *p);
+static void dequeue_throttled_task(struct task_struct *p, int flags)
+{
+ WARN_ON_ONCE(p->se.on_rq);
+ list_del_init(&p->throttle_node);
+
+ /* task blocked after throttled */
+ if (flags & DEQUEUE_SLEEP) {
+ p->throttled = false;
+ return;
+ }
+
+ /*
+ * task is migrating off its old cfs_rq, detach
+ * the task's load from its old cfs_rq.
+ */
+ if (task_on_rq_migrating(p))
+ detach_task_cfs_rq(p);
+}
+
+static bool enqueue_throttled_task(struct task_struct *p)
{
- struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
+ struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
- src_cfs_rq = tg->cfs_rq[src_cpu];
- dest_cfs_rq = tg->cfs_rq[dest_cpu];
+ /* @p should have gone through dequeue_throttled_task() first */
+ WARN_ON_ONCE(!list_empty(&p->throttle_node));
+
+ /*
+ * If the throttled task @p is enqueued to a throttled cfs_rq,
+ * take the fast path by directly putting the task on the
+ * target cfs_rq's limbo list.
+ *
+ * Do not do that when @p is current because the following race can
+	 * cause @p's group_node to be incorrectly re-inserted in its rq's
+ * cfs_tasks list, despite being throttled:
+ *
+ * cpuX cpuY
+ * p ret2user
+ * throttle_cfs_rq_work() sched_move_task(p)
+ * LOCK task_rq_lock
+ * dequeue_task_fair(p)
+ * UNLOCK task_rq_lock
+ * LOCK task_rq_lock
+ * task_current_donor(p) == true
+ * task_on_rq_queued(p) == true
+ * dequeue_task(p)
+ * put_prev_task(p)
+ * sched_change_group()
+ * enqueue_task(p) -> p's new cfs_rq
+ * is throttled, go
+ * fast path and skip
+ * actual enqueue
+ * set_next_task(p)
+ * list_move(&se->group_node, &rq->cfs_tasks); // bug
+ * schedule()
+ *
+	 * In the above race, @p's current cfs_rq is in the same rq as
+	 * its previous cfs_rq because sched_move_task() only moves a task
+	 * to a different group within the same rq, so we can use its current
+	 * cfs_rq to derive the rq and test whether the task is current.
+ */
+ if (throttled_hierarchy(cfs_rq) &&
+ !task_current_donor(rq_of(cfs_rq), p)) {
+ list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+ return true;
+ }
- return throttled_hierarchy(src_cfs_rq) ||
- throttled_hierarchy(dest_cfs_rq);
+	/* we can't take the fast path, do an actual enqueue */
+ p->throttled = false;
+ return false;
}
+static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
static int tg_unthrottle_up(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+ struct task_struct *p, *tmp;
+
+ if (--cfs_rq->throttle_count)
+ return 0;
- cfs_rq->throttle_count--;
- if (!cfs_rq->throttle_count) {
+ if (cfs_rq->pelt_clock_throttled) {
cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
cfs_rq->throttled_clock_pelt;
+ cfs_rq->pelt_clock_throttled = 0;
+ }
- /* Add cfs_rq with load or one or more already running entities to the list */
- if (!cfs_rq_is_decayed(cfs_rq))
- list_add_leaf_cfs_rq(cfs_rq);
+ if (cfs_rq->throttled_clock_self) {
+ u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
- if (cfs_rq->throttled_clock_self) {
- u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
+ cfs_rq->throttled_clock_self = 0;
- cfs_rq->throttled_clock_self = 0;
+ if (WARN_ON_ONCE((s64)delta < 0))
+ delta = 0;
- if (WARN_ON_ONCE((s64)delta < 0))
- delta = 0;
+ cfs_rq->throttled_clock_self_time += delta;
+ }
- cfs_rq->throttled_clock_self_time += delta;
- }
+ /* Re-enqueue the tasks that have been throttled at this level. */
+ list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
+ list_del_init(&p->throttle_node);
+ p->throttled = false;
+ enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
}
+ /* Add cfs_rq with load or one or more already running entities to the list */
+ if (!cfs_rq_is_decayed(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
+
return 0;
}
+static inline bool task_has_throttle_work(struct task_struct *p)
+{
+ return p->sched_throttle_work.next != &p->sched_throttle_work;
+}
+
+static inline void task_throttle_setup_work(struct task_struct *p)
+{
+ if (task_has_throttle_work(p))
+ return;
+
+ /*
+ * Kthreads and exiting tasks don't return to userspace, so adding the
+ * work is pointless
+ */
+ if ((p->flags & (PF_EXITING | PF_KTHREAD)))
+ return;
+
+ task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
+}
+
+static void record_throttle_clock(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+
+ if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
+ cfs_rq->throttled_clock = rq_clock(rq);
+
+ if (!cfs_rq->throttled_clock_self)
+ cfs_rq->throttled_clock_self = rq_clock(rq);
+}
+
static int tg_throttle_down(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
- /* group is entering throttled state, stop time */
- if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
- list_del_leaf_cfs_rq(cfs_rq);
+ if (cfs_rq->throttle_count++)
+ return 0;
- WARN_ON_ONCE(cfs_rq->throttled_clock_self);
- if (cfs_rq->nr_queued)
- cfs_rq->throttled_clock_self = rq_clock(rq);
+ /*
+ * For cfs_rqs that still have entities enqueued, PELT clock
+ * stop happens at dequeue time when all entities are dequeued.
+ */
+ if (!cfs_rq->nr_queued) {
+ list_del_leaf_cfs_rq(cfs_rq);
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+ cfs_rq->pelt_clock_throttled = 1;
}
- cfs_rq->throttle_count++;
+ WARN_ON_ONCE(cfs_rq->throttled_clock_self);
+ WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
return 0;
}
@@ -5800,8 +5980,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct sched_entity *se;
- long queued_delta, runnable_delta, idle_delta, dequeue = 1;
+ int dequeue = 1;
raw_spin_lock(&cfs_b->lock);
/* This will start the period timer if necessary */
@@ -5824,76 +6003,17 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
if (!dequeue)
return false; /* Throttle no longer required. */
- se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
-
/* freeze hierarchy runnable averages while throttled */
rcu_read_lock();
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
- queued_delta = cfs_rq->h_nr_queued;
- runnable_delta = cfs_rq->h_nr_runnable;
- idle_delta = cfs_rq->h_nr_idle;
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- int flags;
-
- /* throttled entity or throttle-on-deactivate */
- if (!se->on_rq)
- goto done;
-
- /*
- * Abuse SPECIAL to avoid delayed dequeue in this instance.
- * This avoids teaching dequeue_entities() about throttled
- * entities and keeps things relatively simple.
- */
- flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
- if (se->sched_delayed)
- flags |= DEQUEUE_DELAYED;
- dequeue_entity(qcfs_rq, se, flags);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued -= queued_delta;
- qcfs_rq->h_nr_runnable -= runnable_delta;
- qcfs_rq->h_nr_idle -= idle_delta;
-
- if (qcfs_rq->load.weight) {
- /* Avoid re-evaluating load for this entity: */
- se = parent_entity(se);
- break;
- }
- }
-
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- /* throttled entity or throttle-on-deactivate */
- if (!se->on_rq)
- goto done;
-
- update_load_avg(qcfs_rq, se, 0);
- se_update_runnable(se);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued -= queued_delta;
- qcfs_rq->h_nr_runnable -= runnable_delta;
- qcfs_rq->h_nr_idle -= idle_delta;
- }
-
- /* At this point se is NULL and we are at root level*/
- sub_nr_running(rq, queued_delta);
-done:
/*
* Note: distribution will already see us throttled via the
* throttled-list. rq->lock protects completion.
*/
cfs_rq->throttled = 1;
WARN_ON_ONCE(cfs_rq->throttled_clock);
- if (cfs_rq->nr_queued)
- cfs_rq->throttled_clock = rq_clock(rq);
return true;
}
@@ -5901,9 +6021,20 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct sched_entity *se;
- long queued_delta, runnable_delta, idle_delta;
- long rq_h_nr_queued = rq->cfs.h_nr_queued;
+ struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
+
+ /*
+	 * It's possible we are called with !runtime_remaining due to things
+	 * like a user changing the quota setting (see tg_set_cfs_bandwidth()),
+	 * or an async unthrottle that left us a positive runtime_remaining
+	 * which other still-running entities consumed before we reached here.
+ *
+ * Anyway, we can't unthrottle this cfs_rq without any runtime remaining
+ * because any enqueue in tg_unthrottle_up() will immediately trigger a
+ * throttle, which is not supposed to happen on unthrottle path.
+ */
+ if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0)
+ return;
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -5933,62 +6064,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
break;
}
- goto unthrottle_throttle;
- }
-
- queued_delta = cfs_rq->h_nr_queued;
- runnable_delta = cfs_rq->h_nr_runnable;
- idle_delta = cfs_rq->h_nr_idle;
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- /* Handle any unfinished DELAY_DEQUEUE business first. */
- if (se->sched_delayed) {
- int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED;
-
- dequeue_entity(qcfs_rq, se, flags);
- } else if (se->on_rq)
- break;
- enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued += queued_delta;
- qcfs_rq->h_nr_runnable += runnable_delta;
- qcfs_rq->h_nr_idle += idle_delta;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(qcfs_rq))
- goto unthrottle_throttle;
- }
-
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- update_load_avg(qcfs_rq, se, UPDATE_TG);
- se_update_runnable(se);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued += queued_delta;
- qcfs_rq->h_nr_runnable += runnable_delta;
- qcfs_rq->h_nr_idle += idle_delta;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(qcfs_rq))
- goto unthrottle_throttle;
}
- /* Start the fair server if un-throttling resulted in new runnable tasks */
- if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
- dl_server_start(&rq->fair_server);
-
- /* At this point se is NULL and we are at root level*/
- add_nr_running(rq, queued_delta);
-
-unthrottle_throttle:
assert_list_leaf_cfs_rq(rq);
/* Determine whether we need to wake up potentially idle CPU: */
@@ -6472,6 +6549,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
+ INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list);
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -6639,19 +6717,28 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static inline void sync_throttle(struct task_group *tg, int cpu) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static void task_throttle_setup_work(struct task_struct *p) {}
+static bool task_is_throttled(struct task_struct *p) { return false; }
+static void dequeue_throttled_task(struct task_struct *p, int flags) {}
+static bool enqueue_throttled_task(struct task_struct *p) { return false; }
+static void record_throttle_clock(struct cfs_rq *cfs_rq) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
return 0;
}
+static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
+{
+ return false;
+}
+
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
return 0;
}
-static inline int throttled_lb_pair(struct task_group *tg,
- int src_cpu, int dest_cpu)
+static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu)
{
return 0;
}
@@ -6831,6 +6918,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
int rq_h_nr_queued = rq->cfs.h_nr_queued;
u64 slice = 0;
+ if (task_is_throttled(p) && enqueue_throttled_task(p))
+ return;
+
/*
* The code below (indirectly) updates schedutil which looks at
* the cfs_rq utilization to select a frequency.
@@ -6883,10 +6973,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = 1;
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto enqueue_throttle;
-
flags = ENQUEUE_WAKEUP;
}
@@ -6908,10 +6994,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = 1;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto enqueue_throttle;
}
if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
@@ -6941,7 +7023,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!task_new)
check_update_overutilized_status(rq);
-enqueue_throttle:
assert_list_leaf_cfs_rq(rq);
hrtick_update(rq);
@@ -6963,6 +7044,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
bool was_sched_idle = sched_idle_rq(rq);
bool task_sleep = flags & DEQUEUE_SLEEP;
bool task_delayed = flags & DEQUEUE_DELAYED;
+ bool task_throttled = flags & DEQUEUE_THROTTLE;
struct task_struct *p = NULL;
int h_nr_idle = 0;
int h_nr_queued = 0;
@@ -6996,9 +7078,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = h_nr_queued;
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- return 0;
+ if (throttled_hierarchy(cfs_rq) && task_throttled)
+ record_throttle_clock(cfs_rq);
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
@@ -7010,7 +7091,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
* Bias pick_next to pick a task from this cfs_rq, as
* p is sleeping when it is within its sched_slice.
*/
- if (task_sleep && se && !throttled_hierarchy(cfs_rq))
+ if (task_sleep && se)
set_next_buddy(se);
break;
}
@@ -7037,9 +7118,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = h_nr_queued;
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- return 0;
+ if (throttled_hierarchy(cfs_rq) && task_throttled)
+ record_throttle_clock(cfs_rq);
}
sub_nr_running(rq, h_nr_queued);
@@ -7073,6 +7153,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
*/
static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
+ if (task_is_throttled(p)) {
+ dequeue_throttled_task(p, flags);
+ return true;
+ }
+
if (!p->se.sched_delayed)
util_est_dequeue(&rq->cfs, p);
@@ -8660,7 +8745,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
* lead to a throttle). This both saves work and prevents false
* next-buddy nomination below.
*/
- if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
+ if (task_is_throttled(p))
return;
if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) {
@@ -8741,19 +8826,22 @@ static struct task_struct *pick_task_fair(struct rq *rq)
{
struct sched_entity *se;
struct cfs_rq *cfs_rq;
+ struct task_struct *p;
+ bool throttled;
again:
cfs_rq = &rq->cfs;
if (!cfs_rq->nr_queued)
return NULL;
+ throttled = false;
+
do {
/* Might not have done put_prev_entity() */
if (cfs_rq->curr && cfs_rq->curr->on_rq)
update_curr(cfs_rq);
- if (unlikely(check_cfs_rq_runtime(cfs_rq)))
- goto again;
+ throttled |= check_cfs_rq_runtime(cfs_rq);
se = pick_next_entity(rq, cfs_rq);
if (!se)
@@ -8761,7 +8849,10 @@ again:
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
- return task_of(se);
+ p = task_of(se);
+ if (unlikely(throttled))
+ task_throttle_setup_work(p);
+ return p;
}
static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
@@ -8859,11 +8950,6 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_stru
return pick_next_task_fair(rq, prev, NULL);
}
-static bool fair_server_has_tasks(struct sched_dl_entity *dl_se)
-{
- return !!dl_se->rq->cfs.nr_queued;
-}
-
static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se)
{
return pick_task_fair(dl_se->rq);
@@ -8875,7 +8961,7 @@ void fair_server_init(struct rq *rq)
init_dl_entity(dl_se);
- dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task);
+ dl_server_init(dl_se, rq, fair_server_pick_task);
}
/*
@@ -8928,8 +9014,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
- /* throttled hierarchies are not runnable */
- if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
+ /* !se->on_rq also covers throttled task */
+ if (!se->on_rq)
return false;
/* Tell the scheduler that we'd really like se to run next. */
@@ -9288,7 +9374,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/*
* We do not migrate tasks that are:
* 1) delayed dequeued unless we migrate load, or
- * 2) throttled_lb_pair, or
+ * 2) target cfs_rq is in throttled hierarchy, or
* 3) cannot be migrated to this CPU due to cpus_ptr, or
* 4) running (obviously), or
* 5) are cache-hot on their current CPU, or
@@ -9297,7 +9383,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
return 0;
- if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
+ if (lb_throttled_hierarchy(p, env->dst_cpu))
return 0;
/*
@@ -13081,10 +13167,13 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq_throttled(cfs_rq))
- return;
-
- if (!throttled_hierarchy(cfs_rq))
+ /*
+	 * If a task gets attached to this cfs_rq and, before being queued,
+	 * gets migrated to another CPU due to reasons like an affinity
+	 * change, make sure this cfs_rq stays on the leaf cfs_rq list so the
+	 * removed load gets decayed; otherwise it can cause fairness problems.
+ */
+ if (!cfs_rq_pelt_clock_throttled(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);
/* Start to propagate at parent */
@@ -13095,10 +13184,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
update_load_avg(cfs_rq, se, UPDATE_TG);
- if (cfs_rq_throttled(cfs_rq))
- break;
-
- if (!throttled_hierarchy(cfs_rq))
+ if (!cfs_rq_pelt_clock_throttled(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);
}
}
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 62c3fa543c0f..f921302dc40f 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -162,7 +162,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
u64 throttled;
- if (unlikely(cfs_rq->throttle_count))
+ if (unlikely(cfs_rq->pelt_clock_throttled))
throttled = U64_MAX;
else
throttled = cfs_rq->throttled_clock_pelt_time;
@@ -173,7 +173,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
- if (unlikely(cfs_rq->throttle_count))
+ if (unlikely(cfs_rq->pelt_clock_throttled))
return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time;
return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
diff --git a/kernel/sched/rq-offsets.c b/kernel/sched/rq-offsets.c
new file mode 100644
index 000000000000..a23747bbe25b
--- /dev/null
+++ b/kernel/sched/rq-offsets.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+#define COMPILE_OFFSETS
+#include <linux/kbuild.h>
+#include <linux/types.h>
+#include "sched.h"
+
+int main(void)
+{
+ DEFINE(RQ_nr_pinned, offsetof(struct rq, nr_pinned));
+
+ return 0;
+}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index be9745d104f7..1f5d07067f60 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -365,25 +365,50 @@ extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s6
*
* dl_se::rq -- runqueue we belong to.
*
- * dl_se::server_has_tasks() -- used on bandwidth enforcement; we 'stop' the
- * server when it runs out of tasks to run.
- *
* dl_se::server_pick() -- nested pick_next_task(); we yield the period if this
* returns NULL.
*
* dl_server_update() -- called from update_curr_common(), propagates runtime
* to the server.
*
- * dl_server_start()
- * dl_server_stop() -- start/stop the server when it has (no) tasks.
+ * dl_server_start() -- start the server when it has tasks; it will stop
+ * automatically when there are no more tasks, per
+ * dl_se::server_pick() returning NULL.
+ *
+ * dl_server_stop() -- (force) stop the server; use when updating
+ * parameters.
*
* dl_server_init() -- initializes the server.
+ *
+ * When started the dl_server will (per dl_defer) schedule a timer for its
+ * zero-laxity point -- that is, unlike regular EDF tasks which run ASAP, a
+ * server will run at the very end of its period.
+ *
+ * This is done such that any runtime from the target class can be accounted
+ * against the server -- through dl_server_update() above -- such that when it
+ * becomes time to run, it might already be out of runtime and get deferred
+ * until the next period. In this case dl_server_timer() will alternate
+ * between defer and replenish but never actually enqueue the server.
+ *
+ * Only when the target class does not manage to exhaust the server's runtime
+ * (there's actually starvation in the given period), will the dl_server get on
+ * the runqueue. Once queued it will pick tasks from the target class and run
+ * them until either its runtime is exhausted, at which point it's back to
+ * dl_server_timer, or until there are no more tasks to run, at which point
+ * the dl_server stops itself.
+ *
+ * By stopping at this point the dl_server retains bandwidth, which, if a new
+ * task wakes up imminently (starting the server again), can be used --
+ * subject to CBS wakeup rules -- without having to wait for the next period.
+ *
+ * Additionally, because of the dl_defer behaviour the start/stop behaviour is
+ * naturally throttled to once per period, which keeps high context switch
+ * workloads from spamming the hrtimer program/cancel paths.
*/
extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec);
extern void dl_server_start(struct sched_dl_entity *dl_se);
extern void dl_server_stop(struct sched_dl_entity *dl_se);
extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
- dl_server_has_tasks_f has_tasks,
dl_server_pick_f pick_task);
extern void sched_init_dl_servers(void);
@@ -735,10 +760,12 @@ struct cfs_rq {
u64 throttled_clock_pelt_time;
u64 throttled_clock_self;
u64 throttled_clock_self_time;
- int throttled;
+ bool throttled:1;
+ bool pelt_clock_throttled:1;
int throttle_count;
struct list_head throttled_list;
struct list_head throttled_csd_list;
+ struct list_head throttled_limbo_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
@@ -1935,12 +1962,12 @@ extern void sched_setnuma(struct task_struct *p, int node);
extern int migrate_task_to(struct task_struct *p, int cpu);
extern int migrate_swap(struct task_struct *p, struct task_struct *t,
int cpu, int scpu);
-extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
+extern void init_numa_balancing(u64 clone_flags, struct task_struct *p);
#else /* !CONFIG_NUMA_BALANCING: */
static inline void
-init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+init_numa_balancing(u64 clone_flags, struct task_struct *p)
{
}
@@ -2342,6 +2369,7 @@ extern const u32 sched_prio_to_wmult[40];
#define DEQUEUE_SPECIAL 0x10
#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */
#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */
+#define DEQUEUE_THROTTLE 0x800
#define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_RESTORE 0x02
@@ -2658,6 +2686,8 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
extern void init_dl_entity(struct sched_dl_entity *dl_se);
+extern void init_cfs_throttle_work(struct task_struct *p);
+
#define BW_SHIFT 20
#define BW_UNIT (1 << BW_SHIFT)
#define RATIO_SHIFT 8
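The updated dl_server comment above implies a simple usage contract for the target class now that dl_server_init() no longer takes a has_tasks callback: start the server when the first task becomes runnable, and let it stop itself once its pick callback returns NULL. A rough sketch of that contract follows; the hook names, the h_nr_queued field and the pick_task_fair() call are placeholders for illustration, not the kernel's actual fair-class wiring:

/* Illustrative only: target-class wiring for the simplified dl_server API. */
static struct task_struct *fair_server_pick(struct sched_dl_entity *dl_se)
{
	struct rq *rq = dl_se->rq;

	/* Returning NULL yields the period and stops the server itself. */
	if (!rq->cfs.h_nr_queued)
		return NULL;

	return pick_task_fair(rq);
}

static void fair_server_setup(struct rq *rq)
{
	dl_server_init(&rq->fair_server, rq, fair_server_pick);
}

static void fair_server_on_enqueue(struct rq *rq)
{
	/* Start on the first runnable task; stopping happens automatically. */
	if (rq->cfs.h_nr_queued == 1)
		dl_server_start(&rq->fair_server);
}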
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 977e133bb8a4..444bdfdab731 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1591,7 +1591,6 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
enum numa_topology_type sched_numa_topology_type;
static int sched_domains_numa_levels;
-static int sched_domains_curr_level;
int sched_max_numa_distance;
static int *sched_domains_numa_distance;
@@ -1632,14 +1631,7 @@ sd_init(struct sched_domain_topology_level *tl,
int sd_id, sd_weight, sd_flags = 0;
struct cpumask *sd_span;
-#ifdef CONFIG_NUMA
- /*
- * Ugly hack to pass state to sd_numa_mask()...
- */
- sched_domains_curr_level = tl->numa_level;
-#endif
-
- sd_weight = cpumask_weight(tl->mask(cpu));
+ sd_weight = cpumask_weight(tl->mask(tl, cpu));
if (tl->sd_flags)
sd_flags = (*tl->sd_flags)();
@@ -1677,7 +1669,7 @@ sd_init(struct sched_domain_topology_level *tl,
};
sd_span = sched_domain_span(sd);
- cpumask_and(sd_span, cpu_map, tl->mask(cpu));
+ cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu));
sd_id = cpumask_first(sd_span);
sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map);
@@ -1732,22 +1724,63 @@ sd_init(struct sched_domain_topology_level *tl,
return sd;
}
+#ifdef CONFIG_SCHED_SMT
+int cpu_smt_flags(void)
+{
+ return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC;
+}
+
+const struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return cpu_smt_mask(cpu);
+}
+#endif
+
+#ifdef CONFIG_SCHED_CLUSTER
+int cpu_cluster_flags(void)
+{
+ return SD_CLUSTER | SD_SHARE_LLC;
+}
+
+const struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return cpu_clustergroup_mask(cpu);
+}
+#endif
+
+#ifdef CONFIG_SCHED_MC
+int cpu_core_flags(void)
+{
+ return SD_SHARE_LLC;
+}
+
+const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return cpu_coregroup_mask(cpu);
+}
+#endif
+
+const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return cpu_node_mask(cpu);
+}
+
/*
* Topology list, bottom-up.
*/
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
- SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT),
+ SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT),
#endif
#ifdef CONFIG_SCHED_CLUSTER
- SDTL_INIT(cpu_clustergroup_mask, cpu_cluster_flags, CLS),
+ SDTL_INIT(tl_cls_mask, cpu_cluster_flags, CLS),
#endif
#ifdef CONFIG_SCHED_MC
- SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC),
+ SDTL_INIT(tl_mc_mask, cpu_core_flags, MC),
#endif
- SDTL_INIT(cpu_cpu_mask, NULL, PKG),
+ SDTL_INIT(tl_pkg_mask, NULL, PKG),
{ NULL, },
};
@@ -1768,10 +1801,14 @@ void __init set_sched_topology(struct sched_domain_topology_level *tl)
}
#ifdef CONFIG_NUMA
+static int cpu_numa_flags(void)
+{
+ return SD_NUMA;
+}
-static const struct cpumask *sd_numa_mask(int cpu)
+static const struct cpumask *sd_numa_mask(struct sched_domain_topology_level *tl, int cpu)
{
- return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+ return sched_domains_numa_masks[tl->numa_level][cpu_to_node(cpu)];
}
static void sched_numa_warn(const char *str)
@@ -2201,6 +2238,8 @@ int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
goto unlock;
hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp);
+ if (!hop_masks)
+ goto unlock;
hop = hop_masks - k.masks;
ret = hop ?
@@ -2411,7 +2450,7 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
* breaks the linking done for an earlier span.
*/
for_each_cpu(cpu, cpu_map) {
- const struct cpumask *tl_cpu_mask = tl->mask(cpu);
+ const struct cpumask *tl_cpu_mask = tl->mask(tl, cpu);
int id;
/* lowest bit set in this mask is used as a unique id */
@@ -2419,7 +2458,7 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
if (cpumask_test_cpu(id, id_seen)) {
/* First CPU has already been seen, ensure identical spans */
- if (!cpumask_equal(tl->mask(id), tl_cpu_mask))
+ if (!cpumask_equal(tl->mask(tl, id), tl_cpu_mask))
return false;
} else {
/* First CPU hasn't been seen before, ensure it's a completely new span */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 41aa761c7738..25f62867a16d 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -741,6 +741,26 @@ out:
}
#ifdef SECCOMP_ARCH_NATIVE
+static bool seccomp_uprobe_exception(struct seccomp_data *sd)
+{
+#if defined __NR_uretprobe || defined __NR_uprobe
+#ifdef SECCOMP_ARCH_COMPAT
+ if (sd->arch == SECCOMP_ARCH_NATIVE)
+#endif
+ {
+#ifdef __NR_uretprobe
+ if (sd->nr == __NR_uretprobe)
+ return true;
+#endif
+#ifdef __NR_uprobe
+ if (sd->nr == __NR_uprobe)
+ return true;
+#endif
+ }
+#endif
+ return false;
+}
+
/**
* seccomp_is_const_allow - check if filter is constant allow with given data
* @fprog: The BPF programs
@@ -758,13 +778,8 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
return false;
/* Our single exception to filtering. */
-#ifdef __NR_uretprobe
-#ifdef SECCOMP_ARCH_COMPAT
- if (sd->arch == SECCOMP_ARCH_NATIVE)
-#endif
- if (sd->nr == __NR_uretprobe)
- return true;
-#endif
+ if (seccomp_uprobe_exception(sd))
+ return true;
for (pc = 0; pc < fprog->len; pc++) {
struct sock_filter *insn = &fprog->filter[pc];
@@ -1043,6 +1058,9 @@ static const int mode1_syscalls[] = {
#ifdef __NR_uretprobe
__NR_uretprobe,
#endif
+#ifdef __NR_uprobe
+ __NR_uprobe,
+#endif
-1, /* negative terminated */
};
@@ -1139,7 +1157,7 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn
static bool should_sleep_killable(struct seccomp_filter *match,
struct seccomp_knotif *n)
{
- return match->wait_killable_recv && n->state == SECCOMP_NOTIFY_SENT;
+ return match->wait_killable_recv && n->state >= SECCOMP_NOTIFY_SENT;
}
static int seccomp_do_user_notification(int this_syscall,
@@ -1186,13 +1204,11 @@ static int seccomp_do_user_notification(int this_syscall,
if (err != 0) {
/*
- * Check to see if the notifcation got picked up and
- * whether we should switch to wait killable.
+ * Check to see whether we should switch to wait
+ * killable. Only return the interrupted error if not.
*/
- if (!wait_killable && should_sleep_killable(match, &n))
- continue;
-
- goto interrupted;
+ if (!(!wait_killable && should_sleep_killable(match, &n)))
+ goto interrupted;
}
addfd = list_first_entry_or_null(&n.addfd,
diff --git a/kernel/signal.c b/kernel/signal.c
index e2c928de7d2c..fe9190d84f28 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -4067,6 +4067,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
{
struct pid *pid;
enum pid_type type;
+ int ret;
/* Enforce flags be set to 0 until we add an extension. */
if (flags & ~PIDFD_SEND_SIGNAL_FLAGS)
@@ -4108,7 +4109,10 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
}
}
- return do_pidfd_send_signal(pid, sig, type, info, flags);
+ ret = do_pidfd_send_signal(pid, sig, type, info, flags);
+ put_pid(pid);
+
+ return ret;
}
static int
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 513b1945987c..77198911b8dd 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -165,7 +165,11 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
/* First entry of a task into a BH disabled section? */
if (!current->softirq_disable_cnt) {
if (preemptible()) {
- local_lock(&softirq_ctrl.lock);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK))
+ local_lock(&softirq_ctrl.lock);
+ else
+ migrate_disable();
+
/* Required to meet the RCU bottomhalf requirements. */
rcu_read_lock();
} else {
@@ -177,17 +181,34 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
* Track the per CPU softirq disabled state. On RT this is per CPU
* state to allow preemption of bottom half disabled sections.
*/
- newcnt = __this_cpu_add_return(softirq_ctrl.cnt, cnt);
- /*
- * Reflect the result in the task state to prevent recursion on the
- * local lock and to make softirq_count() & al work.
- */
- current->softirq_disable_cnt = newcnt;
+ if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK)) {
+ newcnt = this_cpu_add_return(softirq_ctrl.cnt, cnt);
+ /*
+ * Reflect the result in the task state to prevent recursion on the
+ * local lock and to make softirq_count() & al work.
+ */
+ current->softirq_disable_cnt = newcnt;
- if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && newcnt == cnt) {
- raw_local_irq_save(flags);
- lockdep_softirqs_off(ip);
- raw_local_irq_restore(flags);
+ if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && newcnt == cnt) {
+ raw_local_irq_save(flags);
+ lockdep_softirqs_off(ip);
+ raw_local_irq_restore(flags);
+ }
+ } else {
+ bool sirq_dis = false;
+
+ if (!current->softirq_disable_cnt)
+ sirq_dis = true;
+
+ this_cpu_add(softirq_ctrl.cnt, cnt);
+ current->softirq_disable_cnt += cnt;
+ WARN_ON_ONCE(current->softirq_disable_cnt < 0);
+
+ if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && sirq_dis) {
+ raw_local_irq_save(flags);
+ lockdep_softirqs_off(ip);
+ raw_local_irq_restore(flags);
+ }
}
}
EXPORT_SYMBOL(__local_bh_disable_ip);
@@ -195,23 +216,42 @@ EXPORT_SYMBOL(__local_bh_disable_ip);
static void __local_bh_enable(unsigned int cnt, bool unlock)
{
unsigned long flags;
+ bool sirq_en = false;
int newcnt;
- DEBUG_LOCKS_WARN_ON(current->softirq_disable_cnt !=
- this_cpu_read(softirq_ctrl.cnt));
+ if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK)) {
+ DEBUG_LOCKS_WARN_ON(current->softirq_disable_cnt !=
+ this_cpu_read(softirq_ctrl.cnt));
+ if (softirq_count() == cnt)
+ sirq_en = true;
+ } else {
+ if (current->softirq_disable_cnt == cnt)
+ sirq_en = true;
+ }
- if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && softirq_count() == cnt) {
+ if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && sirq_en) {
raw_local_irq_save(flags);
lockdep_softirqs_on(_RET_IP_);
raw_local_irq_restore(flags);
}
- newcnt = __this_cpu_sub_return(softirq_ctrl.cnt, cnt);
- current->softirq_disable_cnt = newcnt;
+ if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK)) {
+ newcnt = this_cpu_sub_return(softirq_ctrl.cnt, cnt);
+ current->softirq_disable_cnt = newcnt;
- if (!newcnt && unlock) {
- rcu_read_unlock();
- local_unlock(&softirq_ctrl.lock);
+ if (!newcnt && unlock) {
+ rcu_read_unlock();
+ local_unlock(&softirq_ctrl.lock);
+ }
+ } else {
+ current->softirq_disable_cnt -= cnt;
+ this_cpu_sub(softirq_ctrl.cnt, cnt);
+ if (unlock && !current->softirq_disable_cnt) {
+ migrate_enable();
+ rcu_read_unlock();
+ } else {
+ WARN_ON_ONCE(current->softirq_disable_cnt < 0);
+ }
}
}
@@ -228,7 +268,10 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
lock_map_release(&bh_lock_map);
local_irq_save(flags);
- curcnt = __this_cpu_read(softirq_ctrl.cnt);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT_NEEDS_BH_LOCK))
+ curcnt = this_cpu_read(softirq_ctrl.cnt);
+ else
+ curcnt = current->softirq_disable_cnt;
/*
* If this is not reenabling soft interrupts, no point in trying to
@@ -805,6 +848,58 @@ static bool tasklet_clear_sched(struct tasklet_struct *t)
return false;
}
+#ifdef CONFIG_PREEMPT_RT
+struct tasklet_sync_callback {
+ spinlock_t cb_lock;
+ atomic_t cb_waiters;
+};
+
+static DEFINE_PER_CPU(struct tasklet_sync_callback, tasklet_sync_callback) = {
+ .cb_lock = __SPIN_LOCK_UNLOCKED(tasklet_sync_callback.cb_lock),
+ .cb_waiters = ATOMIC_INIT(0),
+};
+
+static void tasklet_lock_callback(void)
+{
+ spin_lock(this_cpu_ptr(&tasklet_sync_callback.cb_lock));
+}
+
+static void tasklet_unlock_callback(void)
+{
+ spin_unlock(this_cpu_ptr(&tasklet_sync_callback.cb_lock));
+}
+
+static void tasklet_callback_cancel_wait_running(void)
+{
+ struct tasklet_sync_callback *sync_cb = this_cpu_ptr(&tasklet_sync_callback);
+
+ atomic_inc(&sync_cb->cb_waiters);
+ spin_lock(&sync_cb->cb_lock);
+ atomic_dec(&sync_cb->cb_waiters);
+ spin_unlock(&sync_cb->cb_lock);
+}
+
+static void tasklet_callback_sync_wait_running(void)
+{
+ struct tasklet_sync_callback *sync_cb = this_cpu_ptr(&tasklet_sync_callback);
+
+ if (atomic_read(&sync_cb->cb_waiters)) {
+ spin_unlock(&sync_cb->cb_lock);
+ spin_lock(&sync_cb->cb_lock);
+ }
+}
+
+#else /* !CONFIG_PREEMPT_RT: */
+
+static void tasklet_lock_callback(void) { }
+static void tasklet_unlock_callback(void) { }
+static void tasklet_callback_sync_wait_running(void) { }
+
+#ifdef CONFIG_SMP
+static void tasklet_callback_cancel_wait_running(void) { }
+#endif
+#endif /* !CONFIG_PREEMPT_RT */
+
static void tasklet_action_common(struct tasklet_head *tl_head,
unsigned int softirq_nr)
{
@@ -816,6 +911,7 @@ static void tasklet_action_common(struct tasklet_head *tl_head,
tl_head->tail = &tl_head->head;
local_irq_enable();
+ tasklet_lock_callback();
while (list) {
struct tasklet_struct *t = list;
@@ -835,6 +931,7 @@ static void tasklet_action_common(struct tasklet_head *tl_head,
}
}
tasklet_unlock(t);
+ tasklet_callback_sync_wait_running();
continue;
}
tasklet_unlock(t);
@@ -847,6 +944,7 @@ static void tasklet_action_common(struct tasklet_head *tl_head,
__raise_softirq_irqoff(softirq_nr);
local_irq_enable();
}
+ tasklet_unlock_callback();
}
static __latent_entropy void tasklet_action(void)
@@ -897,12 +995,9 @@ void tasklet_unlock_spin_wait(struct tasklet_struct *t)
/*
* Prevent a live lock when current preempted soft
* interrupt processing or prevents ksoftirqd from
- * running. If the tasklet runs on a different CPU
- * then this has no effect other than doing the BH
- * disable/enable dance for nothing.
+ * running.
*/
- local_bh_disable();
- local_bh_enable();
+ tasklet_callback_cancel_wait_running();
} else {
cpu_relax();
}
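The tasklet_sync_callback scheme above replaces the BH disable/enable dance with a plain lock handoff: the softirq side holds a per-CPU lock across callback execution, and a canceller that must wait for a running callback simply acquires and releases that same lock (gaining priority inheritance on RT). The pattern in isolation, as a self-contained user-space sketch with a pthread mutex standing in for the spinlock:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t cb_lock = PTHREAD_MUTEX_INITIALIZER;

static void run_callback(void (*cb)(void))
{
	pthread_mutex_lock(&cb_lock);   /* tasklet_lock_callback()   */
	cb();                           /* callback runs while locked */
	pthread_mutex_unlock(&cb_lock); /* tasklet_unlock_callback() */
}

static void cancel_wait_running(void)
{
	/* Blocks until any in-flight callback has released the lock. */
	pthread_mutex_lock(&cb_lock);
	pthread_mutex_unlock(&cb_lock);
}

static void demo_cb(void) { puts("callback done"); }

int main(void)
{
	run_callback(demo_cb);
	cancel_wait_running();
	return 0;
}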
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index c00a86931f8c..bf5d05c635ff 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -392,3 +392,4 @@ COND_SYSCALL(setuid16);
COND_SYSCALL(rseq);
COND_SYSCALL(uretprobe);
+COND_SYSCALL(uprobe);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 30899a8cc52c..e8c479329282 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -787,10 +787,10 @@ static void retrigger_next_event(void *arg)
* of the next expiring timer is enough. The return from the SMP
* function call will take care of the reprogramming in case the
* CPU was in a NOHZ idle sleep.
+ *
+ * In periodic low resolution mode, the next softirq expiration
+ * must also be updated.
*/
- if (!hrtimer_hres_active(base) && !tick_nohz_active)
- return;
-
raw_spin_lock(&base->lock);
hrtimer_update_base(base);
if (hrtimer_hres_active(base))
@@ -2295,11 +2295,6 @@ int hrtimers_cpu_dying(unsigned int dying_cpu)
&new_base->clock_base[i]);
}
- /*
- * The migration might have changed the first expiring softirq
- * timer on this CPU. Update it.
- */
- __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT);
/* Tell the other CPU to retrigger the next event */
smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 667452768ed3..5b6997f4dc3d 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -12,6 +12,7 @@
#include <linux/seq_file.h>
#include <linux/proc_ns.h>
#include <linux/export.h>
+#include <linux/nstree.h>
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/cred.h>
@@ -88,25 +89,23 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
goto fail;
err = -ENOMEM;
- ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
+ ns = kzalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
if (!ns)
goto fail_dec;
- refcount_set(&ns->ns.count, 1);
-
ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!ns->vvar_page)
goto fail_free;
- err = ns_alloc_inum(&ns->ns);
+ err = ns_common_init(ns);
if (err)
goto fail_free_page;
ns->ucounts = ucounts;
- ns->ns.ops = &timens_operations;
ns->user_ns = get_user_ns(user_ns);
ns->offsets = old_ns->offsets;
ns->frozen_offsets = false;
+ ns_tree_add(ns);
return ns;
fail_free_page:
@@ -130,7 +129,7 @@ fail:
*
* Return: timens_for_children namespace or ERR_PTR.
*/
-struct time_namespace *copy_time_ns(unsigned long flags,
+struct time_namespace *copy_time_ns(u64 flags,
struct user_namespace *user_ns, struct time_namespace *old_ns)
{
if (!(flags & CLONE_NEWTIME))
@@ -253,16 +252,13 @@ out:
void free_time_ns(struct time_namespace *ns)
{
+ ns_tree_remove(ns);
dec_time_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
- ns_free_inum(&ns->ns);
+ ns_common_free(ns);
__free_page(ns->vvar_page);
- kfree(ns);
-}
-
-static struct time_namespace *to_time_ns(struct ns_common *ns)
-{
- return container_of(ns, struct time_namespace, ns);
+ /* Concurrent nstree traversal depends on a grace period. */
+ kfree_rcu(ns, ns.ns_rcu);
}
static struct ns_common *timens_get(struct task_struct *task)
@@ -466,7 +462,6 @@ out:
const struct proc_ns_operations timens_operations = {
.name = "time",
- .type = CLONE_NEWTIME,
.get = timens_get,
.put = timens_put,
.install = timens_install,
@@ -476,7 +471,6 @@ const struct proc_ns_operations timens_operations = {
const struct proc_ns_operations timens_for_children_operations = {
.name = "time_for_children",
.real_ns_name = "time",
- .type = CLONE_NEWTIME,
.get = timens_for_children_get,
.put = timens_put,
.install = timens_install,
@@ -484,9 +478,15 @@ const struct proc_ns_operations timens_for_children_operations = {
};
struct time_namespace init_time_ns = {
- .ns.count = REFCOUNT_INIT(3),
+ .ns.ns_type = ns_common_type(&init_time_ns),
+ .ns.__ns_ref = REFCOUNT_INIT(3),
.user_ns = &init_user_ns,
- .ns.inum = PROC_TIME_INIT_INO,
+ .ns.inum = ns_init_inum(&init_time_ns),
.ns.ops = &timens_operations,
.frozen_offsets = true,
};
+
+void __init time_ns_init(void)
+{
+ ns_tree_add(&init_time_ns);
+}
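The time, uts and user namespace hunks in this series all follow the same conversion: ns_common_init() replaces the open-coded refcount/inum/ops setup, ns_tree_add()/ns_tree_remove() make the namespace visible to tree traversal, and freeing goes through kfree_rcu() because a traversal may still hold an RCU reference. The shared skeleton is sketched below with a placeholder foo_namespace type standing in for the concrete ones:

/* Illustrative skeleton of the namespace allocation/teardown pattern. */
struct foo_namespace {
	struct ns_common ns;
	/* ... per-namespace state ... */
};

static struct foo_namespace *clone_foo_ns(struct foo_namespace *old)
{
	struct foo_namespace *ns;
	int err;

	ns = kzalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
	if (!ns)
		return ERR_PTR(-ENOMEM);

	err = ns_common_init(ns);	/* common refcount/inode setup */
	if (err) {
		kfree(ns);
		return ERR_PTR(err);
	}

	/* ... copy per-namespace state from @old ... */

	ns_tree_add(ns);		/* make it visible for traversal */
	return ns;
}

static void free_foo_ns(struct foo_namespace *ns)
{
	ns_tree_remove(ns);
	ns_common_free(ns);
	/* Concurrent nstree traversal depends on a grace period. */
	kfree_rcu(ns, ns.ns_rcu);
}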
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 059fa8b79be6..b6974fce800c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -83,6 +83,12 @@ static inline bool tk_is_aux(const struct timekeeper *tk)
}
#endif
+static inline void tk_update_aux_offs(struct timekeeper *tk, ktime_t offs)
+{
+ tk->offs_aux = offs;
+ tk->monotonic_to_aux = ktime_to_timespec64(offs);
+}
+
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
@@ -1506,7 +1512,7 @@ static int __timekeeping_inject_offset(struct tk_data *tkd, const struct timespe
timekeeping_restore_shadow(tkd);
return -EINVAL;
}
- tks->offs_aux = offs;
+ tk_update_aux_offs(tks, offs);
}
timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL);
@@ -2937,7 +2943,7 @@ static int aux_clock_set(const clockid_t id, const struct timespec64 *tnew)
* xtime ("realtime") is not applicable for auxiliary clocks and
* kept in sync with "monotonic".
*/
- aux_tks->offs_aux = ktime_sub(timespec64_to_ktime(*tnew), tnow);
+ tk_update_aux_offs(aux_tks, ktime_sub(timespec64_to_ktime(*tnew), tnow));
timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL);
return 0;
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index 8ba8b0d8a387..aa59919b8f2c 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -159,10 +159,10 @@ void vdso_time_update_aux(struct timekeeper *tk)
if (clock_mode != VDSO_CLOCKMODE_NONE) {
fill_clock_configuration(vc, &tk->tkr_mono);
- vdso_ts->sec = tk->xtime_sec;
+ vdso_ts->sec = tk->xtime_sec + tk->monotonic_to_aux.tv_sec;
nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
- nsec += tk->offs_aux;
+ nsec += tk->monotonic_to_aux.tv_nsec;
vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &nsec);
nsec = nsec << tk->tkr_mono.shift;
vdso_ts->nsec = nsec;
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index f4d200f0c610..484ad7a18463 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -815,6 +815,7 @@ __ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointe
unsigned long bitmap;
unsigned long ret;
int offset;
+ int bit;
int i;
ret_stack = ftrace_pop_return_trace(&trace, &ret, frame_pointer, &offset);
@@ -829,6 +830,15 @@ __ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointe
if (fregs)
ftrace_regs_set_instruction_pointer(fregs, ret);
+ bit = ftrace_test_recursion_trylock(trace.func, ret);
+ /*
+ * This can fail because ftrace_test_recursion_trylock() allows only one
+ * level of nesting. If we are already in a nested call, we don't probe this
+ * and just return the original return address.
+ */
+ if (unlikely(bit < 0))
+ goto out;
+
#ifdef CONFIG_FUNCTION_GRAPH_RETVAL
trace.retval = ftrace_regs_get_return_value(fregs);
#endif
@@ -852,6 +862,8 @@ __ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointe
}
}
+ ftrace_test_recursion_unlock(bit);
+out:
/*
* The ftrace_graph_return() may still access the current
* ret_stack structure, we need to make sure the update of
@@ -1397,6 +1409,8 @@ error:
ftrace_graph_active--;
gops->saved_func = NULL;
fgraph_lru_release_index(i);
+ if (!ftrace_graph_active)
+ unregister_pm_notifier(&ftrace_suspend_notifier);
}
return ret;
}
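The recursion guard added to __ftrace_return_to_handler() is the standard trylock/unlock pairing used elsewhere in ftrace: skip the instrumentation entirely when the handler is re-entered beyond the permitted nesting depth. In isolation the pattern looks like the following sketch; traced_work() is a stand-in for the real handler body:

void traced_work(void);	/* stand-in, not a kernel function */

static void example_handler(unsigned long func, unsigned long ret_ip)
{
	int bit;

	bit = ftrace_test_recursion_trylock(func, ret_ip);
	if (bit < 0)
		return;		/* already nested: skip rather than recurse */

	traced_work();

	ftrace_test_recursion_unlock(bit);
}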
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index c8034dfc1070..5a807d62e76d 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -428,8 +428,9 @@ static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long ad
{
unsigned long *addrs;
- if (alist->index >= alist->size)
- return -ENOMEM;
+ /* Previously we failed to expand the list. */
+ if (alist->index == alist->size)
+ return -ENOSPC;
alist->addrs[alist->index++] = addr;
if (alist->index < alist->size)
@@ -489,7 +490,7 @@ static int fprobe_module_callback(struct notifier_block *nb,
for (i = 0; i < FPROBE_IP_TABLE_SIZE; i++)
fprobe_remove_node_in_module(mod, &fprobe_ip_table[i], &alist);
- if (alist.index < alist.size && alist.index > 0)
+ if (alist.index > 0)
ftrace_set_filter_ips(&fprobe_graph_ops.ops,
alist.addrs, alist.index, 1, 0);
mutex_unlock(&fprobe_mutex);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 00b76d450a89..a69067367c29 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -4661,13 +4661,17 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
} else {
iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash);
}
+ } else {
+ if (hash)
+ iter->hash = alloc_and_copy_ftrace_hash(hash->size_bits, hash);
+ else
+ iter->hash = EMPTY_HASH;
+ }
- if (!iter->hash) {
- trace_parser_put(&iter->parser);
- goto out_unlock;
- }
- } else
- iter->hash = hash;
+ if (!iter->hash) {
+ trace_parser_put(&iter->parser);
+ goto out_unlock;
+ }
ret = 0;
@@ -6543,9 +6547,6 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
ftrace_hash_move_and_update_ops(iter->ops, orig_hash,
iter->hash, filter_hash);
mutex_unlock(&ftrace_lock);
- } else {
- /* For read only, the hash is the ops hash */
- iter->hash = NULL;
}
mutex_unlock(&iter->ops->func_hash->regex_lock);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bb71a0dc9d69..43460949ad3f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -7666,7 +7666,7 @@ static __init int test_ringbuffer(void)
rb_test_started = true;
set_current_state(TASK_INTERRUPTIBLE);
- /* Just run for 10 seconds */;
+ /* Just run for 10 seconds */
schedule_timeout(10 * HZ);
kthread_stop(rb_hammer);
diff --git a/kernel/trace/rv/monitors/sleep/sleep.c b/kernel/trace/rv/monitors/sleep/sleep.c
index eea447b06907..c1347da69e9d 100644
--- a/kernel/trace/rv/monitors/sleep/sleep.c
+++ b/kernel/trace/rv/monitors/sleep/sleep.c
@@ -127,7 +127,9 @@ static void handle_sys_enter(void *data, struct pt_regs *regs, long id)
mon = ltl_get_monitor(current);
switch (id) {
+#ifdef __NR_clock_nanosleep
case __NR_clock_nanosleep:
+#endif
#ifdef __NR_clock_nanosleep_time64
case __NR_clock_nanosleep_time64:
#endif
@@ -138,7 +140,9 @@ static void handle_sys_enter(void *data, struct pt_regs *regs, long id)
ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, true);
break;
+#ifdef __NR_futex
case __NR_futex:
+#endif
#ifdef __NR_futex_time64
case __NR_futex_time64:
#endif
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index 1482e91c39f4..48338520376f 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -495,7 +495,7 @@ static void *available_monitors_next(struct seq_file *m, void *p, loff_t *pos)
*/
static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos)
{
- struct rv_monitor *mon = p;
+ struct rv_monitor *mon = container_of(p, struct rv_monitor, list);
(*pos)++;
@@ -805,7 +805,7 @@ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent)
retval = create_monitor_dir(monitor, parent);
if (retval)
- return retval;
+ goto out_unlock;
/* keep children close to the parent for easier visualisation */
if (parent)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4283ed4e8f59..b3c94fbaf002 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -834,7 +834,10 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
/* copy the current bits to the new max */
ret = trace_pid_list_first(filtered_pids, &pid);
while (!ret) {
- trace_pid_list_set(pid_list, pid);
+ ret = trace_pid_list_set(pid_list, pid);
+ if (ret < 0)
+ goto out;
+
ret = trace_pid_list_next(filtered_pids, pid + 1, &pid);
nr_pids++;
}
@@ -871,6 +874,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
trace_parser_clear(&parser);
ret = 0;
}
+ out:
trace_parser_put(&parser);
if (ret < 0) {
@@ -1816,7 +1820,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
ret = get_user(ch, ubuf++);
if (ret)
- return ret;
+ goto fail;
read++;
cnt--;
@@ -1830,7 +1834,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
while (cnt && isspace(ch)) {
ret = get_user(ch, ubuf++);
if (ret)
- return ret;
+ goto fail;
read++;
cnt--;
}
@@ -1848,12 +1852,14 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
while (cnt && !isspace(ch) && ch) {
if (parser->idx < parser->size - 1)
parser->buffer[parser->idx++] = ch;
- else
- return -EINVAL;
+ else {
+ ret = -EINVAL;
+ goto fail;
+ }
ret = get_user(ch, ubuf++);
if (ret)
- return ret;
+ goto fail;
read++;
cnt--;
}
@@ -1868,11 +1874,15 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
/* Make sure the parsed string always terminates with '\0'. */
parser->buffer[parser->idx] = 0;
} else {
- return -EINVAL;
+ ret = -EINVAL;
+ goto fail;
}
*ppos += read;
return read;
+fail:
+ trace_parser_fail(parser);
+ return ret;
}
/* TODO add a seq_buf_to_buffer() */
@@ -7203,7 +7213,7 @@ static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user
entry = ring_buffer_event_data(event);
entry->ip = ip;
- len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt);
+ len = copy_from_user_nofault(&entry->buf, ubuf, cnt);
if (len) {
memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE);
cnt = FAULTED_SIZE;
@@ -7300,7 +7310,7 @@ static ssize_t write_raw_marker_to_buffer(struct trace_array *tr,
entry = ring_buffer_event_data(event);
- len = __copy_from_user_inatomic(&entry->id, ubuf, cnt);
+ len = copy_from_user_nofault(&entry->id, ubuf, cnt);
if (len) {
entry->id = -1;
memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE);
@@ -10632,10 +10642,10 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m
ret = print_trace_line(&iter);
if (ret != TRACE_TYPE_NO_CONSUME)
trace_consume(&iter);
+
+ trace_printk_seq(&iter.seq);
}
touch_nmi_watchdog();
-
- trace_printk_seq(&iter.seq);
}
if (!cnt)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 1dbf1d3cf2f1..5f4bed5842f9 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1292,6 +1292,7 @@ bool ftrace_event_is_function(struct trace_event_call *call);
*/
struct trace_parser {
bool cont;
+ bool fail;
char *buffer;
unsigned idx;
unsigned size;
@@ -1299,7 +1300,7 @@ struct trace_parser {
static inline bool trace_parser_loaded(struct trace_parser *parser)
{
- return (parser->idx != 0);
+ return !parser->fail && parser->idx != 0;
}
static inline bool trace_parser_cont(struct trace_parser *parser)
@@ -1313,6 +1314,11 @@ static inline void trace_parser_clear(struct trace_parser *parser)
parser->idx = 0;
}
+static inline void trace_parser_fail(struct trace_parser *parser)
+{
+ parser->fail = true;
+}
+
extern int trace_parser_get_init(struct trace_parser *parser, int size);
extern void trace_parser_put(struct trace_parser *parser);
extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
@@ -2204,7 +2210,7 @@ static inline bool is_good_system_name(const char *name)
static inline void sanitize_event_name(char *name)
{
while (*name++ != '\0')
- if (*name == ':' || *name == '.')
+ if (*name == ':' || *name == '.' || *name == '*')
*name = '_';
}
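With the new fail flag, a fault in the middle of trace_get_user() marks the parser as failed so that trace_parser_loaded() reports false and callers do not act on a half-filled buffer. A condensed sketch of the intended caller shape follows; example_write() and do_something_with() are illustrative names, not kernel functions:

/* Illustrative caller: only consume the token when parsing did not fail. */
static int example_write(struct trace_parser *parser,
			 const char __user *ubuf, size_t cnt, loff_t *ppos)
{
	int read;

	read = trace_get_user(parser, ubuf, cnt, ppos);
	if (read >= 0 && trace_parser_loaded(parser)) {
		/* parser->buffer is a complete, NUL-terminated token here */
		do_something_with(parser->buffer);
		trace_parser_clear(parser);
	}
	return read;
}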
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index 5d64a18cacac..d06854bd32b3 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -230,6 +230,10 @@ static int dyn_event_open(struct inode *inode, struct file *file)
{
int ret;
+ ret = security_locked_down(LOCKDOWN_TRACEFS);
+ if (ret)
+ return ret;
+
ret = tracing_check_open_get_tr(NULL);
if (ret)
return ret;
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index af42aaa3d172..2ab283fd3032 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -496,7 +496,7 @@ static bool user_event_enabler_queue_fault(struct user_event_mm *mm,
{
struct user_event_enabler_fault *fault;
- fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT | __GFP_NOWARN);
+ fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT);
if (!fault)
return false;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 66e1a527cf1a..a7f4b9a47a71 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -27,14 +27,21 @@ struct fgraph_cpu_data {
unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH];
};
+struct fgraph_ent_args {
+ struct ftrace_graph_ent_entry ent;
+ /* Force the struct's size to cover FTRACE_REGS_MAX_ARGS args[] entries */
+ unsigned long args[FTRACE_REGS_MAX_ARGS];
+};
+
struct fgraph_data {
struct fgraph_cpu_data __percpu *cpu_data;
/* Place to preserve last processed entry. */
union {
- struct ftrace_graph_ent_entry ent;
+ struct fgraph_ent_args ent;
+ /* TODO allow retaddr to have args */
struct fgraph_retaddr_ent_entry rent;
- } ent;
+ };
struct ftrace_graph_ret_entry ret;
int failed;
int cpu;
@@ -627,10 +634,13 @@ get_return_for_leaf(struct trace_iterator *iter,
* Save current and next entries for later reference
* if the output fails.
*/
- if (unlikely(curr->ent.type == TRACE_GRAPH_RETADDR_ENT))
- data->ent.rent = *(struct fgraph_retaddr_ent_entry *)curr;
- else
- data->ent.ent = *curr;
+ if (unlikely(curr->ent.type == TRACE_GRAPH_RETADDR_ENT)) {
+ data->rent = *(struct fgraph_retaddr_ent_entry *)curr;
+ } else {
+ int size = min((int)sizeof(data->ent), (int)iter->ent_size);
+
+ memcpy(&data->ent, curr, size);
+ }
/*
* If the next event is not a return type, then
* we only care about what type it is. Otherwise we can
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index ccae62d4fb91..fa60362a3f31 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -908,6 +908,8 @@ static int trace_kprobe_create_internal(int argc, const char *argv[],
return -EINVAL;
}
buf = kmemdup(&argv[0][1], len + 1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
buf[len] = '\0';
ret = kstrtouint(buf, 0, &maxactive);
if (ret || !maxactive) {
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index fd259da0aa64..dc734867f0fc 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -2322,12 +2322,16 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
int running, err;
char *buf __free(kfree) = NULL;
- buf = kmalloc(count, GFP_KERNEL);
+ if (count < 1)
+ return 0;
+
+ buf = kmalloc(count + 1, GFP_KERNEL);
if (!buf)
return -ENOMEM;
if (copy_from_user(buf, ubuf, count))
return -EFAULT;
+ buf[count] = '\0';
if (!zalloc_cpumask_var(&osnoise_cpumask_new, GFP_KERNEL))
return -ENOMEM;
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 16b283f9d831..6ea2f6363b90 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -57,12 +57,11 @@ void bacct_add_tsk(struct user_namespace *user_ns,
stats->ac_sched = tsk->policy;
stats->ac_pid = task_pid_nr_ns(tsk, pid_ns);
stats->ac_tgid = task_tgid_nr_ns(tsk, pid_ns);
+ stats->ac_ppid = task_ppid_nr_ns(tsk, pid_ns);
rcu_read_lock();
tcred = __task_cred(tsk);
stats->ac_uid = from_kuid_munged(user_ns, tcred->uid);
stats->ac_gid = from_kgid_munged(user_ns, tcred->gid);
- stats->ac_ppid = pid_alive(tsk) ?
- task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0;
rcu_read_unlock();
task_cputime(tsk, &utime, &stime);
diff --git a/kernel/user.c b/kernel/user.c
index f46b1d41163b..0163665914c9 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -65,10 +65,11 @@ struct user_namespace init_user_ns = {
.nr_extents = 1,
},
},
- .ns.count = REFCOUNT_INIT(3),
+ .ns.ns_type = ns_common_type(&init_user_ns),
+ .ns.__ns_ref = REFCOUNT_INIT(3),
.owner = GLOBAL_ROOT_UID,
.group = GLOBAL_ROOT_GID,
- .ns.inum = PROC_USER_INIT_INO,
+ .ns.inum = ns_init_inum(&init_user_ns),
#ifdef CONFIG_USER_NS
.ns.ops = &userns_operations,
#endif
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 682f40d5632d..03cb63883d04 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -21,6 +21,7 @@
#include <linux/fs_struct.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
+#include <linux/nstree.h>
static struct kmem_cache *user_ns_cachep __ro_after_init;
static DEFINE_MUTEX(userns_state_mutex);
@@ -124,12 +125,11 @@ int create_user_ns(struct cred *new)
goto fail_dec;
ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP);
- ret = ns_alloc_inum(&ns->ns);
+
+ ret = ns_common_init(ns);
if (ret)
goto fail_free;
- ns->ns.ops = &userns_operations;
- refcount_set(&ns->ns.count, 1);
/* Leave the new->user_ns reference with the new user namespace. */
ns->parent = parent_ns;
ns->level = parent_ns->level + 1;
@@ -159,12 +159,13 @@ int create_user_ns(struct cred *new)
goto fail_keyring;
set_cred_user_ns(new, ns);
+ ns_tree_add(ns);
return 0;
fail_keyring:
#ifdef CONFIG_PERSISTENT_KEYRINGS
key_put(ns->persistent_keyring_register);
#endif
- ns_free_inum(&ns->ns);
+ ns_common_free(ns);
fail_free:
kmem_cache_free(user_ns_cachep, ns);
fail_dec:
@@ -201,6 +202,7 @@ static void free_user_ns(struct work_struct *work)
do {
struct ucounts *ucounts = ns->ucounts;
parent = ns->parent;
+ ns_tree_remove(ns);
if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
kfree(ns->gid_map.forward);
kfree(ns->gid_map.reverse);
@@ -218,11 +220,12 @@ static void free_user_ns(struct work_struct *work)
#endif
retire_userns_sysctls(ns);
key_free_user_ns(ns);
- ns_free_inum(&ns->ns);
- kmem_cache_free(user_ns_cachep, ns);
+ ns_common_free(ns);
+ /* Concurrent nstree traversal depends on a grace period. */
+ kfree_rcu(ns, ns.ns_rcu);
dec_user_namespaces(ucounts);
ns = parent;
- } while (refcount_dec_and_test(&parent->ns.count));
+ } while (ns_ref_put(parent));
}
void __put_user_ns(struct user_namespace *ns)
@@ -1322,11 +1325,6 @@ bool current_in_userns(const struct user_namespace *target_ns)
}
EXPORT_SYMBOL(current_in_userns);
-static inline struct user_namespace *to_user_ns(struct ns_common *ns)
-{
- return container_of(ns, struct user_namespace, ns);
-}
-
static struct ns_common *userns_get(struct task_struct *task)
{
struct user_namespace *user_ns;
@@ -1402,7 +1400,6 @@ static struct user_namespace *userns_owner(struct ns_common *ns)
const struct proc_ns_operations userns_operations = {
.name = "user",
- .type = CLONE_NEWUSER,
.get = userns_get,
.put = userns_put,
.install = userns_install,
@@ -1413,6 +1410,7 @@ const struct proc_ns_operations userns_operations = {
static __init int user_namespaces_init(void)
{
user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT);
+ ns_tree_add(&init_user_ns);
return 0;
}
subsys_initcall(user_namespaces_init);
diff --git a/kernel/utsname.c b/kernel/utsname.c
index b1ac3ca870f2..ebbfc578a9d3 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -13,6 +13,7 @@
#include <linux/cred.h>
#include <linux/user_namespace.h>
#include <linux/proc_ns.h>
+#include <linux/nstree.h>
#include <linux/sched/task.h>
static struct kmem_cache *uts_ns_cache __ro_after_init;
@@ -27,16 +28,6 @@ static void dec_uts_namespaces(struct ucounts *ucounts)
dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES);
}
-static struct uts_namespace *create_uts_ns(void)
-{
- struct uts_namespace *uts_ns;
-
- uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL);
- if (uts_ns)
- refcount_set(&uts_ns->ns.count, 1);
- return uts_ns;
-}
-
/*
* Clone a new ns copying an original utsname, setting refcount to 1
* @old_ns: namespace to clone
@@ -55,21 +46,20 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
goto fail;
err = -ENOMEM;
- ns = create_uts_ns();
+ ns = kmem_cache_zalloc(uts_ns_cache, GFP_KERNEL);
if (!ns)
goto fail_dec;
- err = ns_alloc_inum(&ns->ns);
+ err = ns_common_init(ns);
if (err)
goto fail_free;
ns->ucounts = ucounts;
- ns->ns.ops = &utsns_operations;
-
down_read(&uts_sem);
memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
ns->user_ns = get_user_ns(user_ns);
up_read(&uts_sem);
+ ns_tree_add(ns);
return ns;
fail_free:
@@ -86,7 +76,7 @@ fail:
* utsname of this process won't be seen by parent, and vice
* versa.
*/
-struct uts_namespace *copy_utsname(unsigned long flags,
+struct uts_namespace *copy_utsname(u64 flags,
struct user_namespace *user_ns, struct uts_namespace *old_ns)
{
struct uts_namespace *new_ns;
@@ -105,15 +95,12 @@ struct uts_namespace *copy_utsname(unsigned long flags,
void free_uts_ns(struct uts_namespace *ns)
{
+ ns_tree_remove(ns);
dec_uts_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
- ns_free_inum(&ns->ns);
- kmem_cache_free(uts_ns_cache, ns);
-}
-
-static inline struct uts_namespace *to_uts_ns(struct ns_common *ns)
-{
- return container_of(ns, struct uts_namespace, ns);
+ ns_common_free(ns);
+ /* Concurrent nstree traversal depends on a grace period. */
+ kfree_rcu(ns, ns.ns_rcu);
}
static struct ns_common *utsns_get(struct task_struct *task)
@@ -159,7 +146,6 @@ static struct user_namespace *utsns_owner(struct ns_common *ns)
const struct proc_ns_operations utsns_operations = {
.name = "uts",
- .type = CLONE_NEWUTS,
.get = utsns_get,
.put = utsns_put,
.install = utsns_install,
@@ -174,4 +160,5 @@ void __init uts_ns_init(void)
offsetof(struct uts_namespace, name),
sizeof_field(struct uts_namespace, name),
NULL);
+ ns_tree_add(&init_uts_ns);
}
diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
index bc738fa90c1d..27107dcc1cbf 100644
--- a/kernel/vhost_task.c
+++ b/kernel/vhost_task.c
@@ -100,6 +100,7 @@ void vhost_task_stop(struct vhost_task *vtsk)
* freeing it below.
*/
wait_for_completion(&vtsk->exited);
+ put_task_struct(vtsk->task);
kfree(vtsk);
}
EXPORT_SYMBOL_GPL(vhost_task_stop);
@@ -148,7 +149,7 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *),
return ERR_CAST(tsk);
}
- vtsk->task = tsk;
+ vtsk->task = get_task_struct(tsk);
return vtsk;
}
EXPORT_SYMBOL_GPL(vhost_task_create);
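The vhost_task fix is the usual pin-before-wait rule: the creator takes its own reference on the task_struct so the stop path can still dereference and release it after the task has exited and the scheduler has dropped its reference. Schematically, with a simplified placeholder structure:

/* Schematic only: hold a private reference across a wait that can outlive
 * the object's original owner. */
struct vhost_task_like {
	struct task_struct *task;	/* pinned with get_task_struct() */
	struct completion exited;
};

static void stop_and_free(struct vhost_task_like *vtsk)
{
	wait_for_completion(&vtsk->exited);	/* task may already have exited */
	put_task_struct(vtsk->task);		/* safe: we hold our own reference */
	kfree(vtsk);
}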
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c6b79b3675c3..45320e27a16c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -222,7 +222,9 @@ struct worker_pool {
struct workqueue_attrs *attrs; /* I: worker attributes */
struct hlist_node hash_node; /* PL: unbound_pool_hash node */
int refcnt; /* PL: refcnt for unbound pools */
-
+#ifdef CONFIG_PREEMPT_RT
+ spinlock_t cb_lock; /* BH worker cancel lock */
+#endif
/*
* Destruction of pool is RCU protected to allow dereferences
* from get_work_pool().
@@ -2930,7 +2932,7 @@ static void idle_worker_timeout(struct timer_list *t)
raw_spin_unlock_irq(&pool->lock);
if (do_cull)
- queue_work(system_unbound_wq, &pool->idle_cull_work);
+ queue_work(system_dfl_wq, &pool->idle_cull_work);
}
/**
@@ -3078,6 +3080,31 @@ restart:
goto restart;
}
+#ifdef CONFIG_PREEMPT_RT
+static void worker_lock_callback(struct worker_pool *pool)
+{
+ spin_lock(&pool->cb_lock);
+}
+
+static void worker_unlock_callback(struct worker_pool *pool)
+{
+ spin_unlock(&pool->cb_lock);
+}
+
+static void workqueue_callback_cancel_wait_running(struct worker_pool *pool)
+{
+ spin_lock(&pool->cb_lock);
+ spin_unlock(&pool->cb_lock);
+}
+
+#else
+
+static void worker_lock_callback(struct worker_pool *pool) { }
+static void worker_unlock_callback(struct worker_pool *pool) { }
+static void workqueue_callback_cancel_wait_running(struct worker_pool *pool) { }
+
+#endif
+
/**
* manage_workers - manage worker pool
* @worker: self
@@ -3557,6 +3584,7 @@ static void bh_worker(struct worker *worker)
int nr_restarts = BH_WORKER_RESTARTS;
unsigned long end = jiffies + BH_WORKER_JIFFIES;
+ worker_lock_callback(pool);
raw_spin_lock_irq(&pool->lock);
worker_leave_idle(worker);
@@ -3585,6 +3613,7 @@ done:
worker_enter_idle(worker);
kick_pool(pool);
raw_spin_unlock_irq(&pool->lock);
+ worker_unlock_callback(pool);
}
/*
@@ -4222,17 +4251,17 @@ static bool __flush_work(struct work_struct *work, bool from_cancel)
(data & WORK_OFFQ_BH)) {
/*
* On RT, prevent a live lock when %current preempted
- * soft interrupt processing or prevents ksoftirqd from
- * running by keeping flipping BH. If the BH work item
- * runs on a different CPU then this has no effect other
- * than doing the BH disable/enable dance for nothing.
- * This is copied from
- * kernel/softirq.c::tasklet_unlock_spin_wait().
+ * soft interrupt processing by blocking on the lock that
+ * is owned by the thread invoking the callback.
*/
while (!try_wait_for_completion(&barr.done)) {
if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
- local_bh_disable();
- local_bh_enable();
+ struct worker_pool *pool;
+
+ guard(rcu)();
+ pool = get_work_pool(work);
+ if (pool)
+ workqueue_callback_cancel_wait_running(pool);
} else {
cpu_relax();
}
@@ -4782,6 +4811,9 @@ static int init_worker_pool(struct worker_pool *pool)
ida_init(&pool->worker_ida);
INIT_HLIST_NODE(&pool->hash_node);
pool->refcnt = 1;
+#ifdef CONFIG_PREEMPT_RT
+ spin_lock_init(&pool->cb_lock);
+#endif
/* shouldn't fail above this point */
pool->attrs = alloc_workqueue_attrs();
@@ -6046,7 +6078,6 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
struct pool_workqueue *pwq;
bool ret;
- rcu_read_lock();
preempt_disable();
if (cpu == WORK_CPU_UNBOUND)
@@ -6056,7 +6087,6 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
ret = !list_empty(&pwq->inactive_works);
preempt_enable();
- rcu_read_unlock();
return ret;
}
@@ -7546,8 +7576,6 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
if (!thresh)
return;
- rcu_read_lock();
-
for_each_pool(pool, pi) {
unsigned long pool_ts, touched, ts;
@@ -7589,8 +7617,6 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
}
- rcu_read_unlock();
-
if (lockup_detected)
show_all_workqueues();
@@ -7642,7 +7668,7 @@ static int wq_watchdog_param_set_thresh(const char *val,
if (ret)
return ret;
- if (system_wq)
+ if (system_percpu_wq)
wq_watchdog_set_thresh(thresh);
else
wq_watchdog_thresh = thresh;
@@ -7802,22 +7828,22 @@ void __init workqueue_init_early(void)
ordered_wq_attrs[i] = attrs;
}
- system_wq = alloc_workqueue("events", 0, 0);
- system_percpu_wq = alloc_workqueue("events", 0, 0);
- system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
- system_long_wq = alloc_workqueue("events_long", 0, 0);
+ system_wq = alloc_workqueue("events", WQ_PERCPU, 0);
+ system_percpu_wq = alloc_workqueue("events", WQ_PERCPU, 0);
+ system_highpri_wq = alloc_workqueue("events_highpri",
+ WQ_HIGHPRI | WQ_PERCPU, 0);
+ system_long_wq = alloc_workqueue("events_long", WQ_PERCPU, 0);
system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
system_freezable_wq = alloc_workqueue("events_freezable",
- WQ_FREEZABLE, 0);
+ WQ_FREEZABLE | WQ_PERCPU, 0);
system_power_efficient_wq = alloc_workqueue("events_power_efficient",
- WQ_POWER_EFFICIENT, 0);
+ WQ_POWER_EFFICIENT | WQ_PERCPU, 0);
system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient",
- WQ_FREEZABLE | WQ_POWER_EFFICIENT,
- 0);
- system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0);
+ WQ_FREEZABLE | WQ_POWER_EFFICIENT | WQ_PERCPU, 0);
+ system_bh_wq = alloc_workqueue("events_bh", WQ_BH | WQ_PERCPU, 0);
system_bh_highpri_wq = alloc_workqueue("events_bh_highpri",
- WQ_BH | WQ_HIGHPRI, 0);
+ WQ_BH | WQ_HIGHPRI | WQ_PERCPU, 0);
BUG_ON(!system_wq || !system_percpu_wq|| !system_highpri_wq || !system_long_wq ||
!system_unbound_wq || !system_freezable_wq || !system_dfl_wq ||
!system_power_efficient_wq ||
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index dc0e0c6ed075..24939b8553e6 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2479,6 +2479,20 @@ config STRING_HELPERS_KUNIT_TEST
depends on KUNIT
default KUNIT_ALL_TESTS
+config FFS_KUNIT_TEST
+ tristate "KUnit test ffs-family functions at runtime" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS
+ help
+ This builds KUnit tests for ffs-family bit manipulation functions
+ including ffs(), __ffs(), fls(), __fls(), fls64(), and __ffs64().
+
+ These tests validate mathematical correctness, edge case handling,
+ and cross-architecture consistency of bit scanning functions.
+
+ For more information on KUnit and unit tests in general,
+ please refer to Documentation/dev-tools/kunit/.
+
config TEST_KSTRTOX
tristate "Test kstrto*() family of functions at runtime"
@@ -2894,7 +2908,7 @@ config FORTIFY_KUNIT_TEST
config LONGEST_SYM_KUNIT_TEST
tristate "Test the longest symbol possible" if !KUNIT_ALL_TESTS
depends on KUNIT && KPROBES
- depends on !PREFIX_SYMBOLS && !CFI_CLANG && !GCOV_KERNEL
+ depends on !PREFIX_SYMBOLS && !CFI && !GCOV_KERNEL
default KUNIT_ALL_TESTS
help
Tests the longest symbol possible
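The identities such an ffs-family test exercises are also exactly what the weak __clzsi2/__ctzsi2 fallbacks below rely on: ctz is the zero-based form of ffs, and clz is the bit width minus fls. An illustrative user-space check (not the KUnit test itself), using compiler builtins as the reference:

#include <assert.h>

/* ffs(x): 1-based index of the least significant set bit, 0 if x == 0.
 * fls(x): 1-based index of the most significant set bit, 0 if x == 0. */
static int ffs_ref(unsigned int x) { return x ? __builtin_ctz(x) + 1 : 0; }
static int fls_ref(unsigned int x) { return x ? 32 - __builtin_clz(x) : 0; }

int main(void)
{
	for (unsigned int i = 0; i < 32; i++) {
		unsigned int x = 1u << i;

		assert(ffs_ref(x) == (int)i + 1);
		assert(fls_ref(x) == (int)i + 1);
		/* ctz == __ffs (0-based), clz == 32 - fls, as in clz_ctz.c */
		assert(__builtin_ctz(x) == (int)i);
		assert(__builtin_clz(x) == 31 - (int)i);
	}
	return 0;
}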
diff --git a/lib/clz_ctz.c b/lib/clz_ctz.c
index fb8c0c5c2bd2..8778ec44bf63 100644
--- a/lib/clz_ctz.c
+++ b/lib/clz_ctz.c
@@ -15,28 +15,28 @@
#include <linux/kernel.h>
int __weak __ctzsi2(int val);
-int __weak __ctzsi2(int val)
+int __weak __attribute_const__ __ctzsi2(int val)
{
return __ffs(val);
}
EXPORT_SYMBOL(__ctzsi2);
int __weak __clzsi2(int val);
-int __weak __clzsi2(int val)
+int __weak __attribute_const__ __clzsi2(int val)
{
return 32 - fls(val);
}
EXPORT_SYMBOL(__clzsi2);
int __weak __clzdi2(u64 val);
-int __weak __clzdi2(u64 val)
+int __weak __attribute_const__ __clzdi2(u64 val)
{
return 64 - fls64(val);
}
EXPORT_SYMBOL(__clzdi2);
int __weak __ctzdi2(u64 val);
-int __weak __ctzdi2(u64 val)
+int __weak __attribute_const__ __ctzdi2(u64 val)
{
return __ffs64(val);
}
diff --git a/lib/crc/arm/crc-t10dif.h b/lib/crc/arm/crc-t10dif.h
index 2edf7e9681d0..63441de5e3f1 100644
--- a/lib/crc/arm/crc-t10dif.h
+++ b/lib/crc/arm/crc-t10dif.h
@@ -5,8 +5,6 @@
* Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
*/
-#include <crypto/internal/simd.h>
-
#include <asm/neon.h>
#include <asm/simd.h>
@@ -23,7 +21,7 @@ static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length)
{
if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) {
if (static_branch_likely(&have_pmull)) {
- if (crypto_simd_usable()) {
+ if (likely(may_use_simd())) {
kernel_neon_begin();
crc = crc_t10dif_pmull64(crc, data, length);
kernel_neon_end();
@@ -31,7 +29,7 @@ static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length)
}
} else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE &&
static_branch_likely(&have_neon) &&
- crypto_simd_usable()) {
+ likely(may_use_simd())) {
u8 buf[16] __aligned(16);
kernel_neon_begin();
@@ -45,7 +43,7 @@ static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length)
}
#define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch
-static inline void crc_t10dif_mod_init_arch(void)
+static void crc_t10dif_mod_init_arch(void)
{
if (elf_hwcap & HWCAP_NEON) {
static_branch_enable(&have_neon);
diff --git a/lib/crc/arm/crc32.h b/lib/crc/arm/crc32.h
index 018007e162a2..7b76f52f6907 100644
--- a/lib/crc/arm/crc32.h
+++ b/lib/crc/arm/crc32.h
@@ -7,8 +7,6 @@
#include <linux/cpufeature.h>
-#include <crypto/internal/simd.h>
-
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
@@ -34,7 +32,7 @@ static inline u32 crc32_le_scalar(u32 crc, const u8 *p, size_t len)
static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
{
if (len >= PMULL_MIN_LEN + 15 &&
- static_branch_likely(&have_pmull) && crypto_simd_usable()) {
+ static_branch_likely(&have_pmull) && likely(may_use_simd())) {
size_t n = -(uintptr_t)p & 15;
/* align p to 16-byte boundary */
@@ -63,7 +61,7 @@ static inline u32 crc32c_scalar(u32 crc, const u8 *p, size_t len)
static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
{
if (len >= PMULL_MIN_LEN + 15 &&
- static_branch_likely(&have_pmull) && crypto_simd_usable()) {
+ static_branch_likely(&have_pmull) && likely(may_use_simd())) {
size_t n = -(uintptr_t)p & 15;
/* align p to 16-byte boundary */
@@ -85,7 +83,7 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
#define crc32_be_arch crc32_be_base /* not implemented on this arch */
#define crc32_mod_init_arch crc32_mod_init_arch
-static inline void crc32_mod_init_arch(void)
+static void crc32_mod_init_arch(void)
{
if (elf_hwcap2 & HWCAP2_CRC32)
static_branch_enable(&have_crc32);
diff --git a/lib/crc/arm64/crc-t10dif.h b/lib/crc/arm64/crc-t10dif.h
index c4521a7f1ee9..f88db2971805 100644
--- a/lib/crc/arm64/crc-t10dif.h
+++ b/lib/crc/arm64/crc-t10dif.h
@@ -7,8 +7,6 @@
#include <linux/cpufeature.h>
-#include <crypto/internal/simd.h>
-
#include <asm/neon.h>
#include <asm/simd.h>
@@ -25,7 +23,7 @@ static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length)
{
if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) {
if (static_branch_likely(&have_pmull)) {
- if (crypto_simd_usable()) {
+ if (likely(may_use_simd())) {
kernel_neon_begin();
crc = crc_t10dif_pmull_p64(crc, data, length);
kernel_neon_end();
@@ -33,7 +31,7 @@ static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length)
}
} else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE &&
static_branch_likely(&have_asimd) &&
- crypto_simd_usable()) {
+ likely(may_use_simd())) {
u8 buf[16];
kernel_neon_begin();
@@ -47,7 +45,7 @@ static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length)
}
#define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch
-static inline void crc_t10dif_mod_init_arch(void)
+static void crc_t10dif_mod_init_arch(void)
{
if (cpu_have_named_feature(ASIMD)) {
static_branch_enable(&have_asimd);
diff --git a/lib/crc/arm64/crc32.h b/lib/crc/arm64/crc32.h
index 6e5dec45f05d..31e649cd40a2 100644
--- a/lib/crc/arm64/crc32.h
+++ b/lib/crc/arm64/crc32.h
@@ -5,8 +5,6 @@
#include <asm/neon.h>
#include <asm/simd.h>
-#include <crypto/internal/simd.h>
-
// The minimum input length to consider the 4-way interleaved code path
static const size_t min_len = 1024;
@@ -23,7 +21,8 @@ static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
return crc32_le_base(crc, p, len);
- if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) {
+ if (len >= min_len && cpu_have_named_feature(PMULL) &&
+ likely(may_use_simd())) {
kernel_neon_begin();
crc = crc32_le_arm64_4way(crc, p, len);
kernel_neon_end();
@@ -43,7 +42,8 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
return crc32c_base(crc, p, len);
- if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) {
+ if (len >= min_len && cpu_have_named_feature(PMULL) &&
+ likely(may_use_simd())) {
kernel_neon_begin();
crc = crc32c_le_arm64_4way(crc, p, len);
kernel_neon_end();
@@ -63,7 +63,8 @@ static inline u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
return crc32_be_base(crc, p, len);
- if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) {
+ if (len >= min_len && cpu_have_named_feature(PMULL) &&
+ likely(may_use_simd())) {
kernel_neon_begin();
crc = crc32_be_arm64_4way(crc, p, len);
kernel_neon_end();
diff --git a/lib/crc/loongarch/crc32.h b/lib/crc/loongarch/crc32.h
index 6de5c96594af..d34fa4c68632 100644
--- a/lib/crc/loongarch/crc32.h
+++ b/lib/crc/loongarch/crc32.h
@@ -101,7 +101,7 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
#define crc32_be_arch crc32_be_base /* not implemented on this arch */
#define crc32_mod_init_arch crc32_mod_init_arch
-static inline void crc32_mod_init_arch(void)
+static void crc32_mod_init_arch(void)
{
if (cpu_has_crc32)
static_branch_enable(&have_crc32);
diff --git a/lib/crc/mips/crc32.h b/lib/crc/mips/crc32.h
index 11cb272c63a6..3100354a049e 100644
--- a/lib/crc/mips/crc32.h
+++ b/lib/crc/mips/crc32.h
@@ -148,7 +148,7 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
#define crc32_be_arch crc32_be_base /* not implemented on this arch */
#define crc32_mod_init_arch crc32_mod_init_arch
-static inline void crc32_mod_init_arch(void)
+static void crc32_mod_init_arch(void)
{
if (cpu_have_feature(cpu_feature(MIPS_CRC32)))
static_branch_enable(&have_crc32);
diff --git a/lib/crc/powerpc/crc-t10dif.h b/lib/crc/powerpc/crc-t10dif.h
index 59e16804a6ea..8f4592a5323d 100644
--- a/lib/crc/powerpc/crc-t10dif.h
+++ b/lib/crc/powerpc/crc-t10dif.h
@@ -6,8 +6,8 @@
* [based on crc32c-vpmsum_glue.c]
*/
+#include <asm/simd.h>
#include <asm/switch_to.h>
-#include <crypto/internal/simd.h>
#include <linux/cpufeature.h>
#include <linux/jump_label.h>
#include <linux/preempt.h>
@@ -29,7 +29,8 @@ static inline u16 crc_t10dif_arch(u16 crci, const u8 *p, size_t len)
u32 crc = crci;
if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) ||
- !static_branch_likely(&have_vec_crypto) || !crypto_simd_usable())
+ !static_branch_likely(&have_vec_crypto) ||
+ unlikely(!may_use_simd()))
return crc_t10dif_generic(crc, p, len);
if ((unsigned long)p & VMX_ALIGN_MASK) {
@@ -61,7 +62,7 @@ static inline u16 crc_t10dif_arch(u16 crci, const u8 *p, size_t len)
}
#define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch
-static inline void crc_t10dif_mod_init_arch(void)
+static void crc_t10dif_mod_init_arch(void)
{
if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
(cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_VEC_CRYPTO))
diff --git a/lib/crc/powerpc/crc32.h b/lib/crc/powerpc/crc32.h
index 811cc2e6ed24..0c852272a382 100644
--- a/lib/crc/powerpc/crc32.h
+++ b/lib/crc/powerpc/crc32.h
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
+#include <asm/simd.h>
#include <asm/switch_to.h>
-#include <crypto/internal/simd.h>
#include <linux/cpufeature.h>
#include <linux/jump_label.h>
#include <linux/preempt.h>
@@ -24,7 +24,8 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
unsigned int tail;
if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) ||
- !static_branch_likely(&have_vec_crypto) || !crypto_simd_usable())
+ !static_branch_likely(&have_vec_crypto) ||
+ unlikely(!may_use_simd()))
return crc32c_base(crc, p, len);
if ((unsigned long)p & VMX_ALIGN_MASK) {
@@ -54,7 +55,7 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
}
#define crc32_mod_init_arch crc32_mod_init_arch
-static inline void crc32_mod_init_arch(void)
+static void crc32_mod_init_arch(void)
{
if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
(cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_VEC_CRYPTO))
diff --git a/lib/crc/sparc/crc32.h b/lib/crc/sparc/crc32.h
index 60f2765ac015..df7c350acd7b 100644
--- a/lib/crc/sparc/crc32.h
+++ b/lib/crc/sparc/crc32.h
@@ -44,7 +44,7 @@ static inline u32 crc32c_arch(u32 crc, const u8 *data, size_t len)
}
#define crc32_mod_init_arch crc32_mod_init_arch
-static inline void crc32_mod_init_arch(void)
+static void crc32_mod_init_arch(void)
{
unsigned long cfr;
diff --git a/lib/crc/tests/crc_kunit.c b/lib/crc/tests/crc_kunit.c
index f08d985d8860..9a450e25ac81 100644
--- a/lib/crc/tests/crc_kunit.c
+++ b/lib/crc/tests/crc_kunit.c
@@ -6,6 +6,7 @@
*
* Author: Eric Biggers <ebiggers@google.com>
*/
+#include <kunit/run-in-irq-context.h>
#include <kunit/test.h>
#include <linux/crc7.h>
#include <linux/crc16.h>
@@ -141,6 +142,54 @@ static size_t generate_random_length(size_t max_length)
return len % (max_length + 1);
}
+#define IRQ_TEST_DATA_LEN 512
+#define IRQ_TEST_NUM_BUFFERS 3 /* matches max concurrency level */
+
+struct crc_irq_test_state {
+ const struct crc_variant *v;
+ u64 initial_crc;
+ u64 expected_crcs[IRQ_TEST_NUM_BUFFERS];
+ atomic_t seqno;
+};
+
+/*
+ * Compute the CRC of one of the test messages and verify that it matches the
+ * expected CRC from @state->expected_crcs. To increase the chance of detecting
+ * problems, cycle through multiple messages.
+ */
+static bool crc_irq_test_func(void *state_)
+{
+ struct crc_irq_test_state *state = state_;
+ const struct crc_variant *v = state->v;
+ u32 i = (u32)atomic_inc_return(&state->seqno) % IRQ_TEST_NUM_BUFFERS;
+ u64 actual_crc = v->func(state->initial_crc,
+ &test_buffer[i * IRQ_TEST_DATA_LEN],
+ IRQ_TEST_DATA_LEN);
+
+ return actual_crc == state->expected_crcs[i];
+}
+
+/*
+ * Test that if CRCs are computed in task, softirq, and hardirq context
+ * concurrently, then all results are as expected.
+ */
+static void crc_interrupt_context_test(struct kunit *test,
+ const struct crc_variant *v)
+{
+ struct crc_irq_test_state state = {
+ .v = v,
+ .initial_crc = generate_random_initial_crc(v),
+ };
+
+ for (int i = 0; i < IRQ_TEST_NUM_BUFFERS; i++) {
+ state.expected_crcs[i] = crc_ref(
+ v, state.initial_crc,
+ &test_buffer[i * IRQ_TEST_DATA_LEN], IRQ_TEST_DATA_LEN);
+ }
+
+ kunit_run_irq_test(test, crc_irq_test_func, 100000, &state);
+}
+
/* Test that v->func gives the same CRCs as a reference implementation. */
static void crc_test(struct kunit *test, const struct crc_variant *v)
{
@@ -149,7 +198,6 @@ static void crc_test(struct kunit *test, const struct crc_variant *v)
for (i = 0; i < CRC_KUNIT_NUM_TEST_ITERS; i++) {
u64 init_crc, expected_crc, actual_crc;
size_t len, offset;
- bool nosimd;
init_crc = generate_random_initial_crc(v);
len = generate_random_length(CRC_KUNIT_MAX_LEN);
@@ -168,22 +216,18 @@ static void crc_test(struct kunit *test, const struct crc_variant *v)
/* Refresh the data occasionally. */
prandom_bytes_state(&rng, &test_buffer[offset], len);
- nosimd = rand32() % 8 == 0;
-
/*
* Compute the CRC, and verify that it equals the CRC computed
* by a simple bit-at-a-time reference implementation.
*/
expected_crc = crc_ref(v, init_crc, &test_buffer[offset], len);
- if (nosimd)
- local_irq_disable();
actual_crc = v->func(init_crc, &test_buffer[offset], len);
- if (nosimd)
- local_irq_enable();
KUNIT_EXPECT_EQ_MSG(test, expected_crc, actual_crc,
- "Wrong result with len=%zu offset=%zu nosimd=%d",
- len, offset, nosimd);
+ "Wrong result with len=%zu offset=%zu",
+ len, offset);
}
+
+ crc_interrupt_context_test(test, v);
}
static __always_inline void
diff --git a/lib/crc/x86/crc-pclmul-template.h b/lib/crc/x86/crc-pclmul-template.h
index 35c950d7010c..02744831c6fa 100644
--- a/lib/crc/x86/crc-pclmul-template.h
+++ b/lib/crc/x86/crc-pclmul-template.h
@@ -12,7 +12,6 @@
#include <asm/cpufeatures.h>
#include <asm/simd.h>
-#include <crypto/internal/simd.h>
#include <linux/static_call.h>
#include "crc-pclmul-consts.h"
@@ -57,7 +56,7 @@ static inline bool have_avx512(void)
#define CRC_PCLMUL(crc, p, len, prefix, consts, have_pclmulqdq) \
do { \
if ((len) >= 16 && static_branch_likely(&(have_pclmulqdq)) && \
- crypto_simd_usable()) { \
+ likely(irq_fpu_usable())) { \
const void *consts_ptr; \
\
consts_ptr = (consts).fold_across_128_bits_consts; \
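
On x86 the gate has the same shape, but the context check is irq_fpu_usable() instead of may_use_simd(), and the cutoff for PCLMULQDQ folding in this macro is 16 bytes. A rough C paraphrase of what CRC_PCLMUL() guards (the inner call names are hypothetical; the real macro dispatches through a static call and consts table):

if (len >= 16 && static_branch_likely(&have_pclmulqdq) &&
    likely(irq_fpu_usable())) {
	kernel_fpu_begin();
	crc = crc_pclmul_impl(crc, p, len, consts_ptr);	/* hypothetical */
	kernel_fpu_end();
} else {
	crc = crc_generic(crc, p, len);			/* hypothetical */
}
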
diff --git a/lib/crc/x86/crc-t10dif.h b/lib/crc/x86/crc-t10dif.h
index 2a02a3026f3f..8ee8824da551 100644
--- a/lib/crc/x86/crc-t10dif.h
+++ b/lib/crc/x86/crc-t10dif.h
@@ -19,7 +19,7 @@ static inline u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len)
}
#define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch
-static inline void crc_t10dif_mod_init_arch(void)
+static void crc_t10dif_mod_init_arch(void)
{
if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
static_branch_enable(&have_pclmulqdq);
diff --git a/lib/crc/x86/crc32.h b/lib/crc/x86/crc32.h
index cea2c96d08d0..19a5e3c6c73b 100644
--- a/lib/crc/x86/crc32.h
+++ b/lib/crc/x86/crc32.h
@@ -44,7 +44,7 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
return crc32c_base(crc, p, len);
if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN &&
- static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) {
+ static_branch_likely(&have_pclmulqdq) && likely(irq_fpu_usable())) {
/*
* Long length, the vector registers are usable, and the CPU is
* 64-bit and supports both CRC32 and PCLMULQDQ instructions.
@@ -106,7 +106,7 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
#define crc32_be_arch crc32_be_base /* not implemented on this arch */
#define crc32_mod_init_arch crc32_mod_init_arch
-static inline void crc32_mod_init_arch(void)
+static void crc32_mod_init_arch(void)
{
if (boot_cpu_has(X86_FEATURE_XMM4_2))
static_branch_enable(&have_crc32);
diff --git a/lib/crc/x86/crc64.h b/lib/crc/x86/crc64.h
index fde1222c4c58..7d4599319343 100644
--- a/lib/crc/x86/crc64.h
+++ b/lib/crc/x86/crc64.h
@@ -27,7 +27,7 @@ static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len)
}
#define crc64_mod_init_arch crc64_mod_init_arch
-static inline void crc64_mod_init_arch(void)
+static void crc64_mod_init_arch(void)
{
if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
static_branch_enable(&have_pclmulqdq);
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index c2b65b6a9bb6..eea17e36a22b 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -28,109 +28,102 @@ config CRYPTO_LIB_ARC4
config CRYPTO_LIB_GF128MUL
tristate
-config CRYPTO_ARCH_HAVE_LIB_BLAKE2S
- bool
- help
- Declares whether the architecture provides an arch-specific
- accelerated implementation of the Blake2s library interface,
- either builtin or as a module.
-
-config CRYPTO_LIB_BLAKE2S_GENERIC
- def_bool !CRYPTO_ARCH_HAVE_LIB_BLAKE2S
- help
- This symbol can be depended upon by arch implementations of the
- Blake2s library interface that require the generic code as a
- fallback, e.g., for SIMD implementations. If no arch specific
- implementation is enabled, this implementation serves the users
- of CRYPTO_LIB_BLAKE2S.
+# BLAKE2s support is always built-in, so there's no CRYPTO_LIB_BLAKE2S option.
-config CRYPTO_ARCH_HAVE_LIB_CHACHA
+config CRYPTO_LIB_BLAKE2S_ARCH
bool
- help
- Declares whether the architecture provides an arch-specific
- accelerated implementation of the ChaCha library interface,
- either builtin or as a module.
+ depends on !UML
+ default y if ARM
+ default y if X86_64
-config CRYPTO_LIB_CHACHA_GENERIC
+config CRYPTO_LIB_CHACHA
tristate
- default CRYPTO_LIB_CHACHA if !CRYPTO_ARCH_HAVE_LIB_CHACHA
select CRYPTO_LIB_UTILS
help
- This symbol can be selected by arch implementations of the ChaCha
- library interface that require the generic code as a fallback, e.g.,
- for SIMD implementations. If no arch specific implementation is
- enabled, this implementation serves the users of CRYPTO_LIB_CHACHA.
+ Enable the ChaCha library interface. Select this if your module uses
+ chacha_crypt() or hchacha_block().
-config CRYPTO_LIB_CHACHA
+config CRYPTO_LIB_CHACHA_ARCH
+ bool
+ depends on CRYPTO_LIB_CHACHA && !UML && !KMSAN
+ default y if ARM
+ default y if ARM64 && KERNEL_MODE_NEON
+ default y if MIPS && CPU_MIPS32_R2
+ default y if PPC64 && CPU_LITTLE_ENDIAN && VSX
+ default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+ default y if S390
+ default y if X86_64
+
+config CRYPTO_LIB_CURVE25519
tristate
+ select CRYPTO_LIB_UTILS
help
- Enable the ChaCha library interface. This interface may be fulfilled
- by either the generic implementation or an arch-specific one, if one
- is available and enabled.
+ The Curve25519 library functions. Select this if your module uses any
+ of the functions from <crypto/curve25519.h>.
-config CRYPTO_ARCH_HAVE_LIB_CURVE25519
+config CRYPTO_LIB_CURVE25519_ARCH
bool
- help
- Declares whether the architecture provides an arch-specific
- accelerated implementation of the Curve25519 library interface,
- either builtin or as a module.
+ depends on CRYPTO_LIB_CURVE25519 && !UML && !KMSAN
+ default y if ARM && KERNEL_MODE_NEON
+ default y if PPC64 && CPU_LITTLE_ENDIAN
+ default y if X86_64
config CRYPTO_LIB_CURVE25519_GENERIC
- tristate
- select CRYPTO_LIB_UTILS
- help
- This symbol can be depended upon by arch implementations of the
- Curve25519 library interface that require the generic code as a
- fallback, e.g., for SIMD implementations. If no arch specific
- implementation is enabled, this implementation serves the users
- of CRYPTO_LIB_CURVE25519.
+ bool
+ depends on CRYPTO_LIB_CURVE25519
+ default y if !CRYPTO_LIB_CURVE25519_ARCH || ARM || X86_64
-config CRYPTO_LIB_CURVE25519_INTERNAL
+config CRYPTO_LIB_DES
tristate
- select CRYPTO_LIB_CURVE25519_GENERIC if CRYPTO_ARCH_HAVE_LIB_CURVE25519=n
-config CRYPTO_LIB_CURVE25519
+config CRYPTO_LIB_MD5
tristate
- select CRYPTO
- select CRYPTO_LIB_CURVE25519_INTERNAL
help
- Enable the Curve25519 library interface. This interface may be
- fulfilled by either the generic implementation or an arch-specific
- one, if one is available and enabled.
+ The MD5 and HMAC-MD5 library functions. Select this if your module
+ uses any of the functions from <crypto/md5.h>.
-config CRYPTO_LIB_DES
+config CRYPTO_LIB_MD5_ARCH
+ bool
+ depends on CRYPTO_LIB_MD5 && !UML
+ default y if MIPS && CPU_CAVIUM_OCTEON
+ default y if PPC
+ default y if SPARC64
+
+config CRYPTO_LIB_POLY1305
tristate
+ help
+ The Poly1305 library functions. Select this if your module uses any
+ of the functions from <crypto/poly1305.h>.
+
+config CRYPTO_LIB_POLY1305_ARCH
+ bool
+ depends on CRYPTO_LIB_POLY1305 && !UML
+ default y if ARM
+ default y if ARM64 && KERNEL_MODE_NEON
+ default y if MIPS
+ # The PPC64 code needs to be fixed to work in softirq context.
+ default y if PPC64 && CPU_LITTLE_ENDIAN && VSX && BROKEN
+ default y if RISCV
+ default y if X86_64
+
+# This symbol controls the inclusion of the Poly1305 generic code. This differs
+# from most of the other algorithms, which handle the generic code
+# "automatically" via __maybe_unused. This is needed so that the Adiantum code,
+# which calls the poly1305_core_*() functions directly, can enable them.
+config CRYPTO_LIB_POLY1305_GENERIC
+ bool
+ depends on CRYPTO_LIB_POLY1305
+ # Enable if there's no arch impl or the arch impl requires the generic
+ # impl as a fallback. (Or if selected explicitly.)
+ default y if !CRYPTO_LIB_POLY1305_ARCH || PPC64
config CRYPTO_LIB_POLY1305_RSIZE
int
- default 2 if MIPS
+ default 2 if MIPS || RISCV
default 11 if X86_64
default 9 if ARM || ARM64
default 1
-config CRYPTO_ARCH_HAVE_LIB_POLY1305
- bool
- help
- Declares whether the architecture provides an arch-specific
- accelerated implementation of the Poly1305 library interface,
- either builtin or as a module.
-
-config CRYPTO_LIB_POLY1305_GENERIC
- tristate
- default CRYPTO_LIB_POLY1305 if !CRYPTO_ARCH_HAVE_LIB_POLY1305
- help
- This symbol can be selected by arch implementations of the Poly1305
- library interface that require the generic code as a fallback, e.g.,
- for SIMD implementations. If no arch specific implementation is
- enabled, this implementation serves the users of CRYPTO_LIB_POLY1305.
-
-config CRYPTO_LIB_POLY1305
- tristate
- help
- Enable the Poly1305 library interface. This interface may be fulfilled
- by either the generic implementation or an arch-specific one, if one
- is available and enabled.
-
config CRYPTO_LIB_CHACHA20POLY1305
tristate
select CRYPTO_LIB_CHACHA
@@ -140,8 +133,8 @@ config CRYPTO_LIB_CHACHA20POLY1305
config CRYPTO_LIB_SHA1
tristate
help
- The SHA-1 library functions. Select this if your module uses any of
- the functions from <crypto/sha1.h>.
+ The SHA-1 and HMAC-SHA1 library functions. Select this if your module
+ uses any of the functions from <crypto/sha1.h>.
config CRYPTO_LIB_SHA1_ARCH
bool
@@ -157,9 +150,9 @@ config CRYPTO_LIB_SHA1_ARCH
config CRYPTO_LIB_SHA256
tristate
help
- Enable the SHA-256 library interface. This interface may be fulfilled
- by either the generic implementation or an arch-specific one, if one
- is available and enabled.
+ The SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions.
+ Select this if your module uses any of these functions from
+ <crypto/sha2.h>.
config CRYPTO_LIB_SHA256_ARCH
bool
@@ -196,28 +189,4 @@ config CRYPTO_LIB_SM3
source "lib/crypto/tests/Kconfig"
-if !KMSAN # avoid false positives from assembly
-if ARM
-source "lib/crypto/arm/Kconfig"
-endif
-if ARM64
-source "lib/crypto/arm64/Kconfig"
-endif
-if MIPS
-source "lib/crypto/mips/Kconfig"
-endif
-if PPC
-source "lib/crypto/powerpc/Kconfig"
-endif
-if RISCV
-source "lib/crypto/riscv/Kconfig"
-endif
-if S390
-source "lib/crypto/s390/Kconfig"
-endif
-if X86
-source "lib/crypto/x86/Kconfig"
-endif
-endif
-
endmenu
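
With this restructuring a user of a lib/crypto algorithm selects only the public tristate (CRYPTO_LIB_CHACHA, CRYPTO_LIB_POLY1305, and so on); the internal CRYPTO_LIB_*_ARCH bools then pull in the arch backend by their per-architecture defaults, and nothing selects CRYPTO_ARCH_HAVE_LIB_* any more. A hypothetical consumer sketch, assuming the chacha_crypt() library signature mirrors the chacha_crypt_arch() hooks shown elsewhere in this diff:

/*
 * Hypothetical module that consumes the library interface. Its Kconfig
 * entry would simply do:
 *     select CRYPTO_LIB_CHACHA
 * and the generic or arch implementation is chosen automatically.
 */
#include <crypto/chacha.h>

static void example_encrypt(struct chacha_state *state, u8 *dst,
			    const u8 *src, unsigned int len)
{
	/* Signature assumed to mirror chacha_crypt_arch() in this diff. */
	chacha_crypt(state, dst, src, len, 20);
}
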
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index e4151be2ebd4..bded351aeace 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -15,10 +15,6 @@ obj-$(CONFIG_CRYPTO_HASH_INFO) += hash_info.o
obj-$(CONFIG_CRYPTO_LIB_UTILS) += libcryptoutils.o
libcryptoutils-y := memneq.o utils.o
-# chacha is used by the /dev/random driver which is always builtin
-obj-y += chacha.o
-obj-$(CONFIG_CRYPTO_LIB_CHACHA_GENERIC) += libchacha.o
-
obj-$(CONFIG_CRYPTO_LIB_AES) += libaes.o
libaes-y := aes.o
@@ -33,39 +29,162 @@ libarc4-y := arc4.o
obj-$(CONFIG_CRYPTO_LIB_GF128MUL) += gf128mul.o
+################################################################################
+
# blake2s is used by the /dev/random driver which is always builtin
-obj-y += libblake2s.o
-libblake2s-y := blake2s.o
-libblake2s-$(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC) += blake2s-generic.o
-libblake2s-$(CONFIG_CRYPTO_SELFTESTS) += blake2s-selftest.o
+obj-y += blake2s.o
+ifeq ($(CONFIG_CRYPTO_LIB_BLAKE2S_ARCH),y)
+CFLAGS_blake2s.o += -I$(src)/$(SRCARCH)
+obj-$(CONFIG_ARM) += arm/blake2s-core.o
+obj-$(CONFIG_X86) += x86/blake2s-core.o
+endif
+
+################################################################################
+
+# chacha20_block() is used by the /dev/random driver which is always builtin
+obj-y += chacha-block-generic.o
+
+obj-$(CONFIG_CRYPTO_LIB_CHACHA) += libchacha.o
+libchacha-y := chacha.o
+
+ifeq ($(CONFIG_CRYPTO_LIB_CHACHA_ARCH),y)
+CFLAGS_chacha.o += -I$(src)/$(SRCARCH)
+
+ifeq ($(CONFIG_ARM),y)
+libchacha-y += arm/chacha-scalar-core.o
+libchacha-$(CONFIG_KERNEL_MODE_NEON) += arm/chacha-neon-core.o
+endif
+
+libchacha-$(CONFIG_ARM64) += arm64/chacha-neon-core.o
+
+ifeq ($(CONFIG_MIPS),y)
+libchacha-y += mips/chacha-core.o
+AFLAGS_mips/chacha-core.o += -O2 # needed to fill branch delay slots
+endif
+
+libchacha-$(CONFIG_PPC) += powerpc/chacha-p10le-8x.o
+libchacha-$(CONFIG_RISCV) += riscv/chacha-riscv64-zvkb.o
+libchacha-$(CONFIG_S390) += s390/chacha-s390.o
+libchacha-$(CONFIG_X86) += x86/chacha-ssse3-x86_64.o \
+ x86/chacha-avx2-x86_64.o \
+ x86/chacha-avx512vl-x86_64.o
+endif # CONFIG_CRYPTO_LIB_CHACHA_ARCH
+
+################################################################################
obj-$(CONFIG_CRYPTO_LIB_CHACHA20POLY1305) += libchacha20poly1305.o
libchacha20poly1305-y += chacha20poly1305.o
libchacha20poly1305-$(CONFIG_CRYPTO_SELFTESTS) += chacha20poly1305-selftest.o
-obj-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC) += libcurve25519-generic.o
-libcurve25519-generic-y := curve25519-fiat32.o
-libcurve25519-generic-$(CONFIG_ARCH_SUPPORTS_INT128) := curve25519-hacl64.o
-libcurve25519-generic-y += curve25519-generic.o
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_CURVE25519) += libcurve25519.o
+libcurve25519-y := curve25519.o
+
+# Disable GCOV in odd or sensitive code
+GCOV_PROFILE_curve25519.o := n
+
+ifeq ($(CONFIG_ARCH_SUPPORTS_INT128),y)
+libcurve25519-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC) += curve25519-hacl64.o
+else
+libcurve25519-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC) += curve25519-fiat32.o
+endif
# clang versions prior to 18 may blow out the stack with KASAN
ifeq ($(call clang-min-version, 180000),)
KASAN_SANITIZE_curve25519-hacl64.o := n
endif
-obj-$(CONFIG_CRYPTO_LIB_CURVE25519) += libcurve25519.o
-libcurve25519-y += curve25519.o
-libcurve25519-$(CONFIG_CRYPTO_SELFTESTS) += curve25519-selftest.o
+ifeq ($(CONFIG_CRYPTO_LIB_CURVE25519_ARCH),y)
+CFLAGS_curve25519.o += -I$(src)/$(SRCARCH)
+libcurve25519-$(CONFIG_ARM) += arm/curve25519-core.o
+libcurve25519-$(CONFIG_PPC) += powerpc/curve25519-ppc64le_asm.o
+endif
+
+################################################################################
obj-$(CONFIG_CRYPTO_LIB_DES) += libdes.o
libdes-y := des.o
-obj-$(CONFIG_CRYPTO_LIB_POLY1305) += libpoly1305.o
-libpoly1305-y += poly1305.o
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_MD5) += libmd5.o
+libmd5-y := md5.o
+ifeq ($(CONFIG_CRYPTO_LIB_MD5_ARCH),y)
+CFLAGS_md5.o += -I$(src)/$(SRCARCH)
+libmd5-$(CONFIG_PPC) += powerpc/md5-asm.o
+libmd5-$(CONFIG_SPARC) += sparc/md5_asm.o
+endif # CONFIG_CRYPTO_LIB_MD5_ARCH
+
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_POLY1305) += libpoly1305.o
+libpoly1305-y := poly1305.o
+ifeq ($(CONFIG_ARCH_SUPPORTS_INT128),y)
+libpoly1305-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC) += poly1305-donna64.o
+else
+libpoly1305-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC) += poly1305-donna32.o
+endif
+
+ifeq ($(CONFIG_CRYPTO_LIB_POLY1305_ARCH),y)
+CFLAGS_poly1305.o += -I$(src)/$(SRCARCH)
+
+ifeq ($(CONFIG_ARM),y)
+libpoly1305-y += arm/poly1305-core.o
+$(obj)/arm/poly1305-core.S: $(src)/arm/poly1305-armv4.pl
+ $(call cmd,perlasm)
+# massage the perlasm code a bit so we only get the NEON routine if we need it
+poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
+poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
+AFLAGS_arm/poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y)
+endif
+
+ifeq ($(CONFIG_ARM64),y)
+libpoly1305-y += arm64/poly1305-core.o
+$(obj)/arm64/poly1305-core.S: $(src)/arm64/poly1305-armv8.pl
+ $(call cmd,perlasm_with_args)
+endif
+
+ifeq ($(CONFIG_MIPS),y)
+libpoly1305-y += mips/poly1305-core.o
+poly1305-perlasm-flavour-$(CONFIG_32BIT) := o32
+poly1305-perlasm-flavour-$(CONFIG_64BIT) := 64
+quiet_cmd_perlasm_poly1305 = PERLASM $@
+ cmd_perlasm_poly1305 = $(PERL) $< $(poly1305-perlasm-flavour-y) $@
+# Use if_changed instead of cmd, in case the flavour changed.
+$(obj)/mips/poly1305-core.S: $(src)/mips/poly1305-mips.pl FORCE
+ $(call if_changed,perlasm_poly1305)
+targets += mips/poly1305-core.S
+endif
+
+libpoly1305-$(CONFIG_PPC) += powerpc/poly1305-p10le_64.o
+
+ifeq ($(CONFIG_RISCV),y)
+libpoly1305-y += riscv/poly1305-core.o
+poly1305-perlasm-flavour-$(CONFIG_32BIT) := 32
+poly1305-perlasm-flavour-$(CONFIG_64BIT) := 64
+quiet_cmd_perlasm_poly1305 = PERLASM $@
+ cmd_perlasm_poly1305 = $(PERL) $< $(poly1305-perlasm-flavour-y) $@
+# Use if_changed instead of cmd, in case the flavour changed.
+$(obj)/riscv/poly1305-core.S: $(src)/riscv/poly1305-riscv.pl FORCE
+ $(call if_changed,perlasm_poly1305)
+targets += riscv/poly1305-core.S
+AFLAGS_riscv/poly1305-core.o += -Dpoly1305_init=poly1305_block_init
+endif
+
+ifeq ($(CONFIG_X86),y)
+libpoly1305-y += x86/poly1305-x86_64-cryptogams.o
+$(obj)/x86/poly1305-x86_64-cryptogams.S: $(src)/x86/poly1305-x86_64-cryptogams.pl
+ $(call cmd,perlasm)
+endif
+
+endif # CONFIG_CRYPTO_LIB_POLY1305_ARCH
-obj-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC) += libpoly1305-generic.o
-libpoly1305-generic-y := poly1305-donna32.o
-libpoly1305-generic-$(CONFIG_ARCH_SUPPORTS_INT128) := poly1305-donna64.o
-libpoly1305-generic-y += poly1305-generic.o
+# clean-files must be defined unconditionally
+clean-files += arm/poly1305-core.S \
+ arm64/poly1305-core.S \
+ mips/poly1305-core.S \
+ riscv/poly1305-core.S \
+ x86/poly1305-x86_64-cryptogams.S
################################################################################
@@ -100,7 +219,6 @@ ifeq ($(CONFIG_ARM),y)
libsha256-y += arm/sha256-ce.o arm/sha256-core.o
$(obj)/arm/sha256-core.S: $(src)/arm/sha256-armv4.pl
$(call cmd,perlasm)
-clean-files += arm/sha256-core.S
AFLAGS_arm/sha256-core.o += $(aflags-thumb2-y)
endif
@@ -108,7 +226,6 @@ ifeq ($(CONFIG_ARM64),y)
libsha256-y += arm64/sha256-core.o
$(obj)/arm64/sha256-core.S: $(src)/arm64/sha2-armv8.pl
$(call cmd,perlasm_with_args)
-clean-files += arm64/sha256-core.S
libsha256-$(CONFIG_KERNEL_MODE_NEON) += arm64/sha256-ce.o
endif
@@ -132,7 +249,6 @@ ifeq ($(CONFIG_ARM),y)
libsha512-y += arm/sha512-core.o
$(obj)/arm/sha512-core.S: $(src)/arm/sha512-armv4.pl
$(call cmd,perlasm)
-clean-files += arm/sha512-core.S
AFLAGS_arm/sha512-core.o += $(aflags-thumb2-y)
endif
@@ -140,7 +256,6 @@ ifeq ($(CONFIG_ARM64),y)
libsha512-y += arm64/sha512-core.o
$(obj)/arm64/sha512-core.S: $(src)/arm64/sha2-armv8.pl
$(call cmd,perlasm_with_args)
-clean-files += arm64/sha512-core.S
libsha512-$(CONFIG_KERNEL_MODE_NEON) += arm64/sha512-ce-core.o
endif
@@ -160,10 +275,6 @@ obj-$(CONFIG_CRYPTO_SELFTESTS_FULL) += simd.o
obj-$(CONFIG_CRYPTO_LIB_SM3) += libsm3.o
libsm3-y := sm3.o
-obj-$(CONFIG_ARM) += arm/
-obj-$(CONFIG_ARM64) += arm64/
-obj-$(CONFIG_MIPS) += mips/
-obj-$(CONFIG_PPC) += powerpc/
-obj-$(CONFIG_RISCV) += riscv/
-obj-$(CONFIG_S390) += s390/
-obj-$(CONFIG_X86) += x86/
+# clean-files must be defined unconditionally
+clean-files += arm/sha256-core.S arm/sha512-core.S
+clean-files += arm64/sha256-core.S arm64/sha512-core.S
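
The per-arch Kconfig and Makefile files go away; the top-level Makefile now appends arch objects to the single library object and adds -I$(src)/$(SRCARCH), so the generic C file can include the arch glue header by a fixed name. A hedged sketch of how the generic side is presumably wired (the exact include scheme inside chacha.c is not shown in this hunk, so treat this as an assumption):

/*
 * Hypothetical shape of lib/crypto/chacha.c under this scheme: when an
 * arch backend is enabled, "-I$(src)/$(SRCARCH)" makes "chacha.h"
 * resolve to e.g. arm/chacha.h, which defines chacha_crypt_arch() and
 * optionally chacha_mod_init_arch() as static functions.
 */
#ifdef CONFIG_CRYPTO_LIB_CHACHA_ARCH
#include "chacha.h"	/* arch glue, e.g. lib/crypto/arm/chacha.h */
#endif
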
diff --git a/lib/crypto/arm/Kconfig b/lib/crypto/arm/Kconfig
deleted file mode 100644
index e8444fd0aae3..000000000000
--- a/lib/crypto/arm/Kconfig
+++ /dev/null
@@ -1,24 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config CRYPTO_BLAKE2S_ARM
- bool "Hash functions: BLAKE2s"
- select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
- help
- BLAKE2s cryptographic hash function (RFC 7693)
-
- Architecture: arm
-
- This is faster than the generic implementations of BLAKE2s and
- BLAKE2b, but slower than the NEON implementation of BLAKE2b.
- There is no NEON implementation of BLAKE2s, since NEON doesn't
- really help with it.
-
-config CRYPTO_CHACHA20_NEON
- tristate
- default CRYPTO_LIB_CHACHA
- select CRYPTO_ARCH_HAVE_LIB_CHACHA
-
-config CRYPTO_POLY1305_ARM
- tristate
- default CRYPTO_LIB_POLY1305
- select CRYPTO_ARCH_HAVE_LIB_POLY1305
diff --git a/lib/crypto/arm/Makefile b/lib/crypto/arm/Makefile
deleted file mode 100644
index 4c042a4c77ed..000000000000
--- a/lib/crypto/arm/Makefile
+++ /dev/null
@@ -1,26 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += libblake2s-arm.o
-libblake2s-arm-y := blake2s-core.o blake2s-glue.o
-
-obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
-chacha-neon-y := chacha-scalar-core.o chacha-glue.o
-chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
-
-obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
-poly1305-arm-y := poly1305-core.o poly1305-glue.o
-
-quiet_cmd_perl = PERL $@
- cmd_perl = $(PERL) $(<) > $(@)
-
-$(obj)/%-core.S: $(src)/%-armv4.pl
- $(call cmd,perl)
-
-clean-files += poly1305-core.S
-
-aflags-thumb2-$(CONFIG_THUMB2_KERNEL) := -U__thumb2__ -D__thumb2__=1
-
-# massage the perlasm code a bit so we only get the NEON routine if we need it
-poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
-poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
-AFLAGS_poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y)
diff --git a/lib/crypto/arm/blake2s-core.S b/lib/crypto/arm/blake2s-core.S
index df40e46601f1..293f44fa8f31 100644
--- a/lib/crypto/arm/blake2s-core.S
+++ b/lib/crypto/arm/blake2s-core.S
@@ -1,6 +1,9 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
- * BLAKE2s digest algorithm, ARM scalar implementation
+ * BLAKE2s digest algorithm, ARM scalar implementation. This is faster
+ * than the generic implementations of BLAKE2s and BLAKE2b, but slower
+ * than the NEON implementation of BLAKE2b. There is no NEON
+ * implementation of BLAKE2s, since NEON doesn't really help with it.
*
* Copyright 2020 Google LLC
*
diff --git a/lib/crypto/arm/blake2s-glue.c b/lib/crypto/arm/blake2s-glue.c
deleted file mode 100644
index 0238a70d9581..000000000000
--- a/lib/crypto/arm/blake2s-glue.c
+++ /dev/null
@@ -1,7 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#include <crypto/internal/blake2s.h>
-#include <linux/module.h>
-
-/* defined in blake2s-core.S */
-EXPORT_SYMBOL(blake2s_compress);
diff --git a/lib/crypto/arm/blake2s.h b/lib/crypto/arm/blake2s.h
new file mode 100644
index 000000000000..aa7a97139ea7
--- /dev/null
+++ b/lib/crypto/arm/blake2s.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+/* defined in blake2s-core.S */
+void blake2s_compress(struct blake2s_state *state, const u8 *block,
+ size_t nblocks, u32 inc);
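
The ARM BLAKE2s glue file shrinks to this prototype: blake2s_compress() is now resolved inside the always-builtin blake2s object rather than exported. Callers keep using the one-shot helper from <crypto/blake2s.h>; a usage sketch (the blake2s() argument order is from memory, so treat it as an assumption):

#include <crypto/blake2s.h>

/* Hypothetical caller: keyed 32-byte BLAKE2s over a buffer. */
static void example_blake2s(u8 out[BLAKE2S_HASH_SIZE],
			    const void *data, size_t data_len,
			    const u8 *key, size_t key_len)
{
	blake2s(out, data, key, BLAKE2S_HASH_SIZE, data_len, key_len);
}
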
diff --git a/lib/crypto/arm/chacha-glue.c b/lib/crypto/arm/chacha.h
index 88ec96415283..0cae30f8ee5d 100644
--- a/lib/crypto/arm/chacha-glue.c
+++ b/lib/crypto/arm/chacha.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* ChaCha and HChaCha functions (ARM optimized)
*
@@ -6,11 +6,9 @@
* Copyright (C) 2015 Martin Willi
*/
-#include <crypto/chacha.h>
#include <crypto/internal/simd.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
-#include <linux/module.h>
#include <asm/cputype.h>
#include <asm/hwcap.h>
@@ -64,8 +62,8 @@ static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src,
}
}
-void hchacha_block_arch(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds)
+static void hchacha_block_arch(const struct chacha_state *state,
+ u32 out[HCHACHA_OUT_WORDS], int nrounds)
{
if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) {
hchacha_block_arm(state, out, nrounds);
@@ -75,10 +73,9 @@ void hchacha_block_arch(const struct chacha_state *state,
kernel_neon_end();
}
}
-EXPORT_SYMBOL(hchacha_block_arch);
-void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
+static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
+ const u8 *src, unsigned int bytes, int nrounds)
{
if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() ||
bytes <= CHACHA_BLOCK_SIZE) {
@@ -99,16 +96,9 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
dst += todo;
} while (bytes);
}
-EXPORT_SYMBOL(chacha_crypt_arch);
-bool chacha_is_arch_optimized(void)
-{
- /* We always can use at least the ARM scalar implementation. */
- return true;
-}
-EXPORT_SYMBOL(chacha_is_arch_optimized);
-
-static int __init chacha_arm_mod_init(void)
+#define chacha_mod_init_arch chacha_mod_init_arch
+static void chacha_mod_init_arch(void)
{
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
switch (read_cpuid_part()) {
@@ -124,15 +114,4 @@ static int __init chacha_arm_mod_init(void)
static_branch_enable(&use_neon);
}
}
- return 0;
}
-subsys_initcall(chacha_arm_mod_init);
-
-static void __exit chacha_arm_mod_exit(void)
-{
-}
-module_exit(chacha_arm_mod_exit);
-
-MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM optimized)");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/arm/crypto/curve25519-core.S b/lib/crypto/arm/curve25519-core.S
index b697fa5d059a..b697fa5d059a 100644
--- a/arch/arm/crypto/curve25519-core.S
+++ b/lib/crypto/arm/curve25519-core.S
diff --git a/lib/crypto/arm/curve25519.h b/lib/crypto/arm/curve25519.h
new file mode 100644
index 000000000000..f6d66494eb8f
--- /dev/null
+++ b/lib/crypto/arm/curve25519.h
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
+ * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
+ * manually reworked for use in kernel space.
+ */
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <crypto/internal/simd.h>
+#include <linux/types.h>
+#include <linux/jump_label.h>
+
+asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE],
+ const u8 basepoint[CURVE25519_KEY_SIZE]);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+static void curve25519_arch(u8 out[CURVE25519_KEY_SIZE],
+ const u8 scalar[CURVE25519_KEY_SIZE],
+ const u8 point[CURVE25519_KEY_SIZE])
+{
+ if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
+ kernel_neon_begin();
+ curve25519_neon(out, scalar, point);
+ kernel_neon_end();
+ } else {
+ curve25519_generic(out, scalar, point);
+ }
+}
+
+static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE])
+{
+ curve25519_arch(pub, secret, curve25519_base_point);
+}
+
+#define curve25519_mod_init_arch curve25519_mod_init_arch
+static void curve25519_mod_init_arch(void)
+{
+ if (elf_hwcap & HWCAP_NEON)
+ static_branch_enable(&have_neon);
+}
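
curve25519_arch() and curve25519_base_arch() become static hooks consumed by the generic curve25519.c; kernel users keep going through <crypto/curve25519.h>. A hedged usage sketch (the curve25519() signature and its all-zero-result check are from memory, not from this diff):

#include <linux/errno.h>
#include <crypto/curve25519.h>

/* Hypothetical X25519 key agreement using the library interface. */
static int example_x25519(u8 shared[CURVE25519_KEY_SIZE],
			  const u8 my_secret[CURVE25519_KEY_SIZE],
			  const u8 their_public[CURVE25519_KEY_SIZE])
{
	/* curve25519() is assumed to return false for an all-zero result. */
	if (!curve25519(shared, my_secret, their_public))
		return -EKEYREJECTED;
	return 0;
}
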
diff --git a/lib/crypto/arm/poly1305-armv4.pl b/lib/crypto/arm/poly1305-armv4.pl
index dd7a996361a7..34c11b7b44bd 100644
--- a/lib/crypto/arm/poly1305-armv4.pl
+++ b/lib/crypto/arm/poly1305-armv4.pl
@@ -43,9 +43,8 @@ $code.=<<___;
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
-# define poly1305_init poly1305_block_init_arch
+# define poly1305_init poly1305_block_init
# define poly1305_blocks poly1305_blocks_arm
-# define poly1305_emit poly1305_emit_arch
#endif
#if defined(__thumb2__)
diff --git a/lib/crypto/arm/poly1305-glue.c b/lib/crypto/arm/poly1305-glue.c
deleted file mode 100644
index 2d86c78af883..000000000000
--- a/lib/crypto/arm/poly1305-glue.c
+++ /dev/null
@@ -1,76 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM
- *
- * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-#include <crypto/internal/poly1305.h>
-#include <linux/cpufeature.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/unaligned.h>
-
-asmlinkage void poly1305_block_init_arch(
- struct poly1305_block_state *state,
- const u8 raw_key[POLY1305_BLOCK_SIZE]);
-EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
-asmlinkage void poly1305_blocks_arm(struct poly1305_block_state *state,
- const u8 *src, u32 len, u32 hibit);
-asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state,
- const u8 *src, u32 len, u32 hibit);
-asmlinkage void poly1305_emit_arch(const struct poly1305_state *state,
- u8 digest[POLY1305_DIGEST_SIZE],
- const u32 nonce[4]);
-EXPORT_SYMBOL_GPL(poly1305_emit_arch);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-
-void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src,
- unsigned int len, u32 padbit)
-{
- len = round_down(len, POLY1305_BLOCK_SIZE);
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
- static_branch_likely(&have_neon) && likely(may_use_simd())) {
- do {
- unsigned int todo = min_t(unsigned int, len, SZ_4K);
-
- kernel_neon_begin();
- poly1305_blocks_neon(state, src, todo, padbit);
- kernel_neon_end();
-
- len -= todo;
- src += todo;
- } while (len);
- } else
- poly1305_blocks_arm(state, src, len, padbit);
-}
-EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
-
-bool poly1305_is_arch_optimized(void)
-{
- /* We always can use at least the ARM scalar implementation. */
- return true;
-}
-EXPORT_SYMBOL(poly1305_is_arch_optimized);
-
-static int __init arm_poly1305_mod_init(void)
-{
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
- (elf_hwcap & HWCAP_NEON))
- static_branch_enable(&have_neon);
- return 0;
-}
-subsys_initcall(arm_poly1305_mod_init);
-
-static void __exit arm_poly1305_mod_exit(void)
-{
-}
-module_exit(arm_poly1305_mod_exit);
-
-MODULE_DESCRIPTION("Accelerated Poly1305 transform for ARM");
-MODULE_LICENSE("GPL v2");
diff --git a/lib/crypto/arm/poly1305.h b/lib/crypto/arm/poly1305.h
new file mode 100644
index 000000000000..0021cf368307
--- /dev/null
+++ b/lib/crypto/arm/poly1305.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM
+ *
+ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+
+asmlinkage void poly1305_block_init(struct poly1305_block_state *state,
+ const u8 raw_key[POLY1305_BLOCK_SIZE]);
+asmlinkage void poly1305_blocks_arm(struct poly1305_block_state *state,
+ const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state,
+ const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_emit(const struct poly1305_state *state,
+ u8 digest[POLY1305_DIGEST_SIZE],
+ const u32 nonce[4]);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src,
+ unsigned int len, u32 padbit)
+{
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+ static_branch_likely(&have_neon) && likely(may_use_simd())) {
+ do {
+ unsigned int todo = min_t(unsigned int, len, SZ_4K);
+
+ kernel_neon_begin();
+ poly1305_blocks_neon(state, src, todo, padbit);
+ kernel_neon_end();
+
+ len -= todo;
+ src += todo;
+ } while (len);
+ } else
+ poly1305_blocks_arm(state, src, len, padbit);
+}
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#define poly1305_mod_init_arch poly1305_mod_init_arch
+static void poly1305_mod_init_arch(void)
+{
+ if (elf_hwcap & HWCAP_NEON)
+ static_branch_enable(&have_neon);
+}
+#endif /* CONFIG_KERNEL_MODE_NEON */
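
As in the other headers, poly1305_blocks() here runs the NEON routine in chunks of at most 4 KiB so that preemption is disabled only for a bounded time, and otherwise falls back to the scalar Cryptogams code. Consumers continue to use the incremental API from <crypto/poly1305.h>; a usage sketch (the poly1305_init/update/final names are as I recall the library interface, so an assumption):

#include <crypto/poly1305.h>

/* Hypothetical one-shot MAC using the Poly1305 library interface. */
static void example_poly1305(u8 mac[POLY1305_DIGEST_SIZE],
			     const u8 key[POLY1305_KEY_SIZE],
			     const u8 *msg, size_t len)
{
	struct poly1305_desc_ctx ctx;

	poly1305_init(&ctx, key);
	poly1305_update(&ctx, msg, len);
	poly1305_final(&ctx, mac);
}
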
diff --git a/lib/crypto/arm/sha1.h b/lib/crypto/arm/sha1.h
index fa1e92419000..29f8bcad0447 100644
--- a/lib/crypto/arm/sha1.h
+++ b/lib/crypto/arm/sha1.h
@@ -35,7 +35,7 @@ static void sha1_blocks(struct sha1_block_state *state,
#ifdef CONFIG_KERNEL_MODE_NEON
#define sha1_mod_init_arch sha1_mod_init_arch
-static inline void sha1_mod_init_arch(void)
+static void sha1_mod_init_arch(void)
{
if (elf_hwcap & HWCAP_NEON) {
static_branch_enable(&have_neon);
diff --git a/lib/crypto/arm/sha256.h b/lib/crypto/arm/sha256.h
index da75cbdc51d4..7556457b3094 100644
--- a/lib/crypto/arm/sha256.h
+++ b/lib/crypto/arm/sha256.h
@@ -5,7 +5,10 @@
* Copyright 2025 Google LLC
*/
#include <asm/neon.h>
-#include <crypto/internal/simd.h>
+#include <asm/simd.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
asmlinkage void sha256_block_data_order(struct sha256_block_state *state,
const u8 *data, size_t nblocks);
@@ -14,14 +17,11 @@ asmlinkage void sha256_block_data_order_neon(struct sha256_block_state *state,
asmlinkage void sha256_ce_transform(struct sha256_block_state *state,
const u8 *data, size_t nblocks);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
-
static void sha256_blocks(struct sha256_block_state *state,
const u8 *data, size_t nblocks)
{
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
- static_branch_likely(&have_neon) && crypto_simd_usable()) {
+ static_branch_likely(&have_neon) && likely(may_use_simd())) {
kernel_neon_begin();
if (static_branch_likely(&have_ce))
sha256_ce_transform(state, data, nblocks);
@@ -35,7 +35,7 @@ static void sha256_blocks(struct sha256_block_state *state,
#ifdef CONFIG_KERNEL_MODE_NEON
#define sha256_mod_init_arch sha256_mod_init_arch
-static inline void sha256_mod_init_arch(void)
+static void sha256_mod_init_arch(void)
{
if (elf_hwcap & HWCAP_NEON) {
static_branch_enable(&have_neon);
diff --git a/lib/crypto/arm/sha512.h b/lib/crypto/arm/sha512.h
index f147b6490d6c..d1b485dd275d 100644
--- a/lib/crypto/arm/sha512.h
+++ b/lib/crypto/arm/sha512.h
@@ -4,9 +4,8 @@
*
* Copyright 2025 Google LLC
*/
-
#include <asm/neon.h>
-#include <crypto/internal/simd.h>
+#include <asm/simd.h>
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
@@ -19,7 +18,7 @@ static void sha512_blocks(struct sha512_block_state *state,
const u8 *data, size_t nblocks)
{
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
- static_branch_likely(&have_neon) && likely(crypto_simd_usable())) {
+ static_branch_likely(&have_neon) && likely(may_use_simd())) {
kernel_neon_begin();
sha512_block_data_order_neon(state, data, nblocks);
kernel_neon_end();
@@ -30,7 +29,7 @@ static void sha512_blocks(struct sha512_block_state *state,
#ifdef CONFIG_KERNEL_MODE_NEON
#define sha512_mod_init_arch sha512_mod_init_arch
-static inline void sha512_mod_init_arch(void)
+static void sha512_mod_init_arch(void)
{
if (cpu_has_neon())
static_branch_enable(&have_neon);
diff --git a/lib/crypto/arm64/Kconfig b/lib/crypto/arm64/Kconfig
deleted file mode 100644
index 0b903ef524d8..000000000000
--- a/lib/crypto/arm64/Kconfig
+++ /dev/null
@@ -1,14 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config CRYPTO_CHACHA20_NEON
- tristate
- depends on KERNEL_MODE_NEON
- default CRYPTO_LIB_CHACHA
- select CRYPTO_LIB_CHACHA_GENERIC
- select CRYPTO_ARCH_HAVE_LIB_CHACHA
-
-config CRYPTO_POLY1305_NEON
- tristate
- depends on KERNEL_MODE_NEON
- default CRYPTO_LIB_POLY1305
- select CRYPTO_ARCH_HAVE_LIB_POLY1305
diff --git a/lib/crypto/arm64/Makefile b/lib/crypto/arm64/Makefile
deleted file mode 100644
index 6207088397a7..000000000000
--- a/lib/crypto/arm64/Makefile
+++ /dev/null
@@ -1,17 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
-chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
-
-obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o
-poly1305-neon-y := poly1305-core.o poly1305-glue.o
-AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_block_init_arch
-AFLAGS_poly1305-core.o += -Dpoly1305_emit=poly1305_emit_arch
-
-quiet_cmd_perlasm = PERLASM $@
- cmd_perlasm = $(PERL) $(<) void $(@)
-
-$(obj)/%-core.S: $(src)/%-armv8.pl
- $(call cmd,perlasm)
-
-clean-files += poly1305-core.S
diff --git a/lib/crypto/arm64/chacha-neon-glue.c b/lib/crypto/arm64/chacha.h
index d0188f974ca5..ba6c22d46086 100644
--- a/lib/crypto/arm64/chacha-neon-glue.c
+++ b/lib/crypto/arm64/chacha.h
@@ -18,11 +18,9 @@
* (at your option) any later version.
*/
-#include <crypto/chacha.h>
#include <crypto/internal/simd.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
-#include <linux/module.h>
#include <asm/hwcap.h>
#include <asm/neon.h>
@@ -61,8 +59,8 @@ static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src,
}
}
-void hchacha_block_arch(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds)
+static void hchacha_block_arch(const struct chacha_state *state,
+ u32 out[HCHACHA_OUT_WORDS], int nrounds)
{
if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) {
hchacha_block_generic(state, out, nrounds);
@@ -72,10 +70,9 @@ void hchacha_block_arch(const struct chacha_state *state,
kernel_neon_end();
}
}
-EXPORT_SYMBOL(hchacha_block_arch);
-void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
+static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
+ const u8 *src, unsigned int bytes, int nrounds)
{
if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE ||
!crypto_simd_usable())
@@ -93,27 +90,10 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
dst += todo;
} while (bytes);
}
-EXPORT_SYMBOL(chacha_crypt_arch);
-bool chacha_is_arch_optimized(void)
-{
- return static_key_enabled(&have_neon);
-}
-EXPORT_SYMBOL(chacha_is_arch_optimized);
-
-static int __init chacha_simd_mod_init(void)
+#define chacha_mod_init_arch chacha_mod_init_arch
+static void chacha_mod_init_arch(void)
{
if (cpu_have_named_feature(ASIMD))
static_branch_enable(&have_neon);
- return 0;
}
-subsys_initcall(chacha_simd_mod_init);
-
-static void __exit chacha_simd_mod_exit(void)
-{
-}
-module_exit(chacha_simd_mod_exit);
-
-MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM64 optimized)");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
diff --git a/lib/crypto/arm64/poly1305-armv8.pl b/lib/crypto/arm64/poly1305-armv8.pl
index 22c9069c0650..f1930c6b55ce 100644
--- a/lib/crypto/arm64/poly1305-armv8.pl
+++ b/lib/crypto/arm64/poly1305-armv8.pl
@@ -50,6 +50,9 @@ $code.=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
.extern OPENSSL_armcap_P
+#else
+# define poly1305_init poly1305_block_init
+# define poly1305_blocks poly1305_blocks_arm64
#endif
.text
diff --git a/lib/crypto/arm64/poly1305-glue.c b/lib/crypto/arm64/poly1305-glue.c
deleted file mode 100644
index 31aea21ce42f..000000000000
--- a/lib/crypto/arm64/poly1305-glue.c
+++ /dev/null
@@ -1,74 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64
- *
- * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-#include <crypto/internal/poly1305.h>
-#include <linux/cpufeature.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/unaligned.h>
-
-asmlinkage void poly1305_block_init_arch(
- struct poly1305_block_state *state,
- const u8 raw_key[POLY1305_BLOCK_SIZE]);
-EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
-asmlinkage void poly1305_blocks(struct poly1305_block_state *state,
- const u8 *src, u32 len, u32 hibit);
-asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state,
- const u8 *src, u32 len, u32 hibit);
-asmlinkage void poly1305_emit_arch(const struct poly1305_state *state,
- u8 digest[POLY1305_DIGEST_SIZE],
- const u32 nonce[4]);
-EXPORT_SYMBOL_GPL(poly1305_emit_arch);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-
-void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src,
- unsigned int len, u32 padbit)
-{
- len = round_down(len, POLY1305_BLOCK_SIZE);
- if (static_branch_likely(&have_neon) && likely(may_use_simd())) {
- do {
- unsigned int todo = min_t(unsigned int, len, SZ_4K);
-
- kernel_neon_begin();
- poly1305_blocks_neon(state, src, todo, padbit);
- kernel_neon_end();
-
- len -= todo;
- src += todo;
- } while (len);
- } else
- poly1305_blocks(state, src, len, padbit);
-}
-EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
-
-bool poly1305_is_arch_optimized(void)
-{
- /* We always can use at least the ARM64 scalar implementation. */
- return true;
-}
-EXPORT_SYMBOL(poly1305_is_arch_optimized);
-
-static int __init neon_poly1305_mod_init(void)
-{
- if (cpu_have_named_feature(ASIMD))
- static_branch_enable(&have_neon);
- return 0;
-}
-subsys_initcall(neon_poly1305_mod_init);
-
-static void __exit neon_poly1305_mod_exit(void)
-{
-}
-module_exit(neon_poly1305_mod_exit);
-
-MODULE_DESCRIPTION("Poly1305 authenticator (ARM64 optimized)");
-MODULE_LICENSE("GPL v2");
diff --git a/lib/crypto/arm64/poly1305.h b/lib/crypto/arm64/poly1305.h
new file mode 100644
index 000000000000..aed5921ccd9a
--- /dev/null
+++ b/lib/crypto/arm64/poly1305.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64
+ *
+ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+
+asmlinkage void poly1305_block_init(struct poly1305_block_state *state,
+ const u8 raw_key[POLY1305_BLOCK_SIZE]);
+asmlinkage void poly1305_blocks_arm64(struct poly1305_block_state *state,
+ const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state,
+ const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_emit(const struct poly1305_state *state,
+ u8 digest[POLY1305_DIGEST_SIZE],
+ const u32 nonce[4]);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src,
+ unsigned int len, u32 padbit)
+{
+ if (static_branch_likely(&have_neon) && likely(may_use_simd())) {
+ do {
+ unsigned int todo = min_t(unsigned int, len, SZ_4K);
+
+ kernel_neon_begin();
+ poly1305_blocks_neon(state, src, todo, padbit);
+ kernel_neon_end();
+
+ len -= todo;
+ src += todo;
+ } while (len);
+ } else
+ poly1305_blocks_arm64(state, src, len, padbit);
+}
+
+#define poly1305_mod_init_arch poly1305_mod_init_arch
+static void poly1305_mod_init_arch(void)
+{
+ if (cpu_have_named_feature(ASIMD))
+ static_branch_enable(&have_neon);
+}
diff --git a/lib/crypto/arm64/sha1.h b/lib/crypto/arm64/sha1.h
index f822563538cc..aaef4ebfc5e3 100644
--- a/lib/crypto/arm64/sha1.h
+++ b/lib/crypto/arm64/sha1.h
@@ -32,7 +32,7 @@ static void sha1_blocks(struct sha1_block_state *state,
}
#define sha1_mod_init_arch sha1_mod_init_arch
-static inline void sha1_mod_init_arch(void)
+static void sha1_mod_init_arch(void)
{
if (cpu_have_named_feature(SHA1))
static_branch_enable(&have_ce);
diff --git a/lib/crypto/arm64/sha256-ce.S b/lib/crypto/arm64/sha256-ce.S
index b99d9589c421..410174ba5237 100644
--- a/lib/crypto/arm64/sha256-ce.S
+++ b/lib/crypto/arm64/sha256-ce.S
@@ -70,18 +70,22 @@
.word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
.word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+ .macro load_round_constants tmp
+ adr_l \tmp, .Lsha2_rcon
+ ld1 { v0.4s- v3.4s}, [\tmp], #64
+ ld1 { v4.4s- v7.4s}, [\tmp], #64
+ ld1 { v8.4s-v11.4s}, [\tmp], #64
+ ld1 {v12.4s-v15.4s}, [\tmp]
+ .endm
+
/*
* size_t __sha256_ce_transform(struct sha256_block_state *state,
* const u8 *data, size_t nblocks);
*/
.text
SYM_FUNC_START(__sha256_ce_transform)
- /* load round constants */
- adr_l x8, .Lsha2_rcon
- ld1 { v0.4s- v3.4s}, [x8], #64
- ld1 { v4.4s- v7.4s}, [x8], #64
- ld1 { v8.4s-v11.4s}, [x8], #64
- ld1 {v12.4s-v15.4s}, [x8]
+
+ load_round_constants x8
/* load state */
ld1 {dgav.4s, dgbv.4s}, [x0]
@@ -134,3 +138,271 @@ CPU_LE( rev32 v19.16b, v19.16b )
mov x0, x2
ret
SYM_FUNC_END(__sha256_ce_transform)
+
+ .unreq dga
+ .unreq dgav
+ .unreq dgb
+ .unreq dgbv
+ .unreq t0
+ .unreq t1
+ .unreq dg0q
+ .unreq dg0v
+ .unreq dg1q
+ .unreq dg1v
+ .unreq dg2q
+ .unreq dg2v
+
+ // parameters for sha256_ce_finup2x()
+ ctx .req x0
+ data1 .req x1
+ data2 .req x2
+ len .req w3
+ out1 .req x4
+ out2 .req x5
+
+ // other scalar variables
+ count .req x6
+ final_step .req w7
+
+ // x8-x9 are used as temporaries.
+
+ // v0-v15 are used to cache the SHA-256 round constants.
+ // v16-v19 are used for the message schedule for the first message.
+ // v20-v23 are used for the message schedule for the second message.
+ // v24-v31 are used for the state and temporaries as given below.
+ // *_a are for the first message and *_b for the second.
+ state0_a_q .req q24
+ state0_a .req v24
+ state1_a_q .req q25
+ state1_a .req v25
+ state0_b_q .req q26
+ state0_b .req v26
+ state1_b_q .req q27
+ state1_b .req v27
+ t0_a .req v28
+ t0_b .req v29
+ t1_a_q .req q30
+ t1_a .req v30
+ t1_b_q .req q31
+ t1_b .req v31
+
+#define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount)
+#define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf)
+// offsetof(struct __sha256_ctx, state) is assumed to be 0.
+
+ // Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a
+ // and m0_b contain the current 4 message schedule words for the first
+ // and second message respectively.
+ //
+ // If not all the message schedule words have been computed yet, then
+ // this also computes 4 more message schedule words for each message.
+ // m1_a-m3_a contain the next 3 groups of 4 message schedule words for
+ // the first message, and likewise m1_b-m3_b for the second. After
+ // consuming the current value of m0_a, this macro computes the group
+ // after m3_a and writes it to m0_a, and likewise for *_b. This means
+ // that the next (m0_a, m1_a, m2_a, m3_a) is the current (m1_a, m2_a,
+ // m3_a, m0_a), and likewise for *_b, so the caller must cycle through
+ // the registers accordingly.
+ .macro do_4rounds_2x i, k, m0_a, m1_a, m2_a, m3_a, \
+ m0_b, m1_b, m2_b, m3_b
+ add t0_a\().4s, \m0_a\().4s, \k\().4s
+ add t0_b\().4s, \m0_b\().4s, \k\().4s
+ .if \i < 48
+ sha256su0 \m0_a\().4s, \m1_a\().4s
+ sha256su0 \m0_b\().4s, \m1_b\().4s
+ sha256su1 \m0_a\().4s, \m2_a\().4s, \m3_a\().4s
+ sha256su1 \m0_b\().4s, \m2_b\().4s, \m3_b\().4s
+ .endif
+ mov t1_a.16b, state0_a.16b
+ mov t1_b.16b, state0_b.16b
+ sha256h state0_a_q, state1_a_q, t0_a\().4s
+ sha256h state0_b_q, state1_b_q, t0_b\().4s
+ sha256h2 state1_a_q, t1_a_q, t0_a\().4s
+ sha256h2 state1_b_q, t1_b_q, t0_b\().4s
+ .endm
+
+ .macro do_16rounds_2x i, k0, k1, k2, k3
+ do_4rounds_2x \i + 0, \k0, v16, v17, v18, v19, v20, v21, v22, v23
+ do_4rounds_2x \i + 4, \k1, v17, v18, v19, v16, v21, v22, v23, v20
+ do_4rounds_2x \i + 8, \k2, v18, v19, v16, v17, v22, v23, v20, v21
+ do_4rounds_2x \i + 12, \k3, v19, v16, v17, v18, v23, v20, v21, v22
+ .endm
+
+//
+// void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
+// const u8 *data1, const u8 *data2, int len,
+// u8 out1[SHA256_DIGEST_SIZE],
+// u8 out2[SHA256_DIGEST_SIZE]);
+//
+// This function computes the SHA-256 digests of two messages |data1| and
+// |data2| that are both |len| bytes long, starting from the initial context
+// |ctx|. |len| must be at least SHA256_BLOCK_SIZE.
+//
+// The instructions for the two SHA-256 operations are interleaved. On many
+// CPUs, this is almost twice as fast as hashing each message individually due
+// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
+//
+SYM_FUNC_START(sha256_ce_finup2x)
+ sub sp, sp, #128
+ mov final_step, #0
+ load_round_constants x8
+
+ // Load the initial state from ctx->state.
+ ld1 {state0_a.4s-state1_a.4s}, [ctx]
+
+ // Load ctx->bytecount. Take the mod 64 of it to get the number of
+ // bytes that are buffered in ctx->buf. Also save it in a register with
+ // len added to it.
+ ldr x8, [ctx, #OFFSETOF_BYTECOUNT]
+ add count, x8, len, sxtw
+ and x8, x8, #63
+ cbz x8, .Lfinup2x_enter_loop // No bytes buffered?
+
+ // x8 bytes (1 to 63) are currently buffered in ctx->buf. Load them
+ // followed by the first 64 - x8 bytes of data. Since len >= 64, we
+ // just load 64 bytes from each of ctx->buf, data1, and data2
+ // unconditionally and rearrange the data as needed.
+ add x9, ctx, #OFFSETOF_BUF
+ ld1 {v16.16b-v19.16b}, [x9]
+ st1 {v16.16b-v19.16b}, [sp]
+
+ ld1 {v16.16b-v19.16b}, [data1], #64
+ add x9, sp, x8
+ st1 {v16.16b-v19.16b}, [x9]
+ ld1 {v16.4s-v19.4s}, [sp]
+
+ ld1 {v20.16b-v23.16b}, [data2], #64
+ st1 {v20.16b-v23.16b}, [x9]
+ ld1 {v20.4s-v23.4s}, [sp]
+
+ sub len, len, #64
+ sub data1, data1, x8
+ sub data2, data2, x8
+ add len, len, w8
+ mov state0_b.16b, state0_a.16b
+ mov state1_b.16b, state1_a.16b
+ b .Lfinup2x_loop_have_data
+
+.Lfinup2x_enter_loop:
+ sub len, len, #64
+ mov state0_b.16b, state0_a.16b
+ mov state1_b.16b, state1_a.16b
+.Lfinup2x_loop:
+ // Load the next two data blocks.
+ ld1 {v16.4s-v19.4s}, [data1], #64
+ ld1 {v20.4s-v23.4s}, [data2], #64
+.Lfinup2x_loop_have_data:
+ // Convert the words of the data blocks from big endian.
+CPU_LE( rev32 v16.16b, v16.16b )
+CPU_LE( rev32 v17.16b, v17.16b )
+CPU_LE( rev32 v18.16b, v18.16b )
+CPU_LE( rev32 v19.16b, v19.16b )
+CPU_LE( rev32 v20.16b, v20.16b )
+CPU_LE( rev32 v21.16b, v21.16b )
+CPU_LE( rev32 v22.16b, v22.16b )
+CPU_LE( rev32 v23.16b, v23.16b )
+.Lfinup2x_loop_have_bswapped_data:
+
+ // Save the original state for each block.
+ st1 {state0_a.4s-state1_b.4s}, [sp]
+
+ // Do the SHA-256 rounds on each block.
+ do_16rounds_2x 0, v0, v1, v2, v3
+ do_16rounds_2x 16, v4, v5, v6, v7
+ do_16rounds_2x 32, v8, v9, v10, v11
+ do_16rounds_2x 48, v12, v13, v14, v15
+
+ // Add the original state for each block.
+ ld1 {v16.4s-v19.4s}, [sp]
+ add state0_a.4s, state0_a.4s, v16.4s
+ add state1_a.4s, state1_a.4s, v17.4s
+ add state0_b.4s, state0_b.4s, v18.4s
+ add state1_b.4s, state1_b.4s, v19.4s
+
+ // Update len and loop back if more blocks remain.
+ sub len, len, #64
+ tbz len, #31, .Lfinup2x_loop // len >= 0?
+
+ // Check if any final blocks need to be handled.
+ // final_step = 2: all done
+ // final_step = 1: need to do count-only padding block
+ // final_step = 0: need to do the block with 0x80 padding byte
+ tbnz final_step, #1, .Lfinup2x_done
+ tbnz final_step, #0, .Lfinup2x_finalize_countonly
+ add len, len, #64
+ cbz len, .Lfinup2x_finalize_blockaligned
+
+ // Not block-aligned; 1 <= len <= 63 data bytes remain. Pad the block.
+ // To do this, write the padding starting with the 0x80 byte to
+ // &sp[64]. Then for each message, copy the last 64 data bytes to sp
+ // and load from &sp[64 - len] to get the needed padding block. This
+ // code relies on the data buffers being >= 64 bytes in length.
+ sub w8, len, #64 // w8 = len - 64
+ add data1, data1, w8, sxtw // data1 += len - 64
+ add data2, data2, w8, sxtw // data2 += len - 64
+CPU_LE( mov x9, #0x80 )
+CPU_LE( fmov d16, x9 )
+CPU_BE( movi v16.16b, #0 )
+CPU_BE( mov x9, #0x8000000000000000 )
+CPU_BE( mov v16.d[1], x9 )
+ movi v17.16b, #0
+ stp q16, q17, [sp, #64]
+ stp q17, q17, [sp, #96]
+ sub x9, sp, w8, sxtw // x9 = &sp[64 - len]
+ cmp len, #56
+ b.ge 1f // will count spill into its own block?
+ lsl count, count, #3
+CPU_LE( rev count, count )
+ str count, [x9, #56]
+ mov final_step, #2 // won't need count-only block
+ b 2f
+1:
+ mov final_step, #1 // will need count-only block
+2:
+ ld1 {v16.16b-v19.16b}, [data1]
+ st1 {v16.16b-v19.16b}, [sp]
+ ld1 {v16.4s-v19.4s}, [x9]
+ ld1 {v20.16b-v23.16b}, [data2]
+ st1 {v20.16b-v23.16b}, [sp]
+ ld1 {v20.4s-v23.4s}, [x9]
+ b .Lfinup2x_loop_have_data
+
+ // Prepare a padding block, either:
+ //
+ // {0x80, 0, 0, 0, ..., count (as __be64)}
+ // This is for a block aligned message.
+ //
+ // { 0, 0, 0, 0, ..., count (as __be64)}
+ // This is for a message whose length mod 64 is >= 56.
+ //
+ // Pre-swap the endianness of the words.
+.Lfinup2x_finalize_countonly:
+ movi v16.2d, #0
+ b 1f
+.Lfinup2x_finalize_blockaligned:
+ mov x8, #0x80000000
+ fmov d16, x8
+1:
+ movi v17.2d, #0
+ movi v18.2d, #0
+ ror count, count, #29 // ror(lsl(count, 3), 32)
+ mov v19.d[0], xzr
+ mov v19.d[1], count
+ mov v20.16b, v16.16b
+ movi v21.2d, #0
+ movi v22.2d, #0
+ mov v23.16b, v19.16b
+ mov final_step, #2
+ b .Lfinup2x_loop_have_bswapped_data
+
+.Lfinup2x_done:
+ // Write the two digests with all bytes in the correct order.
+CPU_LE( rev32 state0_a.16b, state0_a.16b )
+CPU_LE( rev32 state1_a.16b, state1_a.16b )
+CPU_LE( rev32 state0_b.16b, state0_b.16b )
+CPU_LE( rev32 state1_b.16b, state1_b.16b )
+ st1 {state0_a.4s-state1_a.4s}, [out1]
+ st1 {state0_b.4s-state1_b.4s}, [out2]
+ add sp, sp, #128
+ ret
+SYM_FUNC_END(sha256_ce_finup2x)
diff --git a/lib/crypto/arm64/sha256.h b/lib/crypto/arm64/sha256.h
index a211966c124a..80d06df27d3a 100644
--- a/lib/crypto/arm64/sha256.h
+++ b/lib/crypto/arm64/sha256.h
@@ -5,9 +5,12 @@
* Copyright 2025 Google LLC
*/
#include <asm/neon.h>
-#include <crypto/internal/simd.h>
+#include <asm/simd.h>
#include <linux/cpufeature.h>
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
+
asmlinkage void sha256_block_data_order(struct sha256_block_state *state,
const u8 *data, size_t nblocks);
asmlinkage void sha256_block_neon(struct sha256_block_state *state,
@@ -15,14 +18,11 @@ asmlinkage void sha256_block_neon(struct sha256_block_state *state,
asmlinkage size_t __sha256_ce_transform(struct sha256_block_state *state,
const u8 *data, size_t nblocks);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
-
static void sha256_blocks(struct sha256_block_state *state,
const u8 *data, size_t nblocks)
{
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
- static_branch_likely(&have_neon) && crypto_simd_usable()) {
+ static_branch_likely(&have_neon) && likely(may_use_simd())) {
if (static_branch_likely(&have_ce)) {
do {
size_t rem;
@@ -44,9 +44,46 @@ static void sha256_blocks(struct sha256_block_state *state,
}
}
+static_assert(offsetof(struct __sha256_ctx, state) == 0);
+static_assert(offsetof(struct __sha256_ctx, bytecount) == 32);
+static_assert(offsetof(struct __sha256_ctx, buf) == 40);
+asmlinkage void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
+ const u8 *data1, const u8 *data2, int len,
+ u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE]);
+
+#define sha256_finup_2x_arch sha256_finup_2x_arch
+static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
+ const u8 *data1, const u8 *data2, size_t len,
+ u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE])
+{
+ /*
+ * The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX.
+ * Further limit len to 65536 to avoid spending too long with preemption
+ * disabled. (Of course, in practice len is nearly always 4096 anyway.)
+ */
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+ static_branch_likely(&have_ce) && len >= SHA256_BLOCK_SIZE &&
+ len <= 65536 && likely(may_use_simd())) {
+ kernel_neon_begin();
+ sha256_ce_finup2x(ctx, data1, data2, len, out1, out2);
+ kernel_neon_end();
+ kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE);
+ kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE);
+ return true;
+ }
+ return false;
+}
+
+static bool sha256_finup_2x_is_optimized_arch(void)
+{
+ return static_key_enabled(&have_ce);
+}
+
#ifdef CONFIG_KERNEL_MODE_NEON
#define sha256_mod_init_arch sha256_mod_init_arch
-static inline void sha256_mod_init_arch(void)
+static void sha256_mod_init_arch(void)
{
if (cpu_have_named_feature(ASIMD)) {
static_branch_enable(&have_neon);
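The interleaved finalization is exposed through sha256_finup_2x_arch() above. A hedged usage sketch for a caller that hashes two equal-length buffers sharing an already-absorbed prefix; the wrapper name sha256_finup_2x() and the public struct sha256_ctx are assumptions about the generic lib/crypto layer, not confirmed by this hunk:

/* Sketch only; wrapper name and context type are assumed. */
static void hash_two_leaves(const struct sha256_ctx *prefix_ctx,
                            const u8 *leaf_a, const u8 *leaf_b, size_t len,
                            u8 out_a[SHA256_DIGEST_SIZE],
                            u8 out_b[SHA256_DIGEST_SIZE])
{
        /*
         * Both messages are the prefix already fed into prefix_ctx followed by
         * len bytes of per-leaf data; on CPUs with the SHA-2 extensions the
         * two digests are computed in one interleaved pass.
         */
        sha256_finup_2x(prefix_ctx, leaf_a, leaf_b, len, out_a, out_b);
}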
diff --git a/lib/crypto/arm64/sha512.h b/lib/crypto/arm64/sha512.h
index 6abb40b467f2..ddb0d256f73a 100644
--- a/lib/crypto/arm64/sha512.h
+++ b/lib/crypto/arm64/sha512.h
@@ -4,9 +4,8 @@
*
* Copyright 2025 Google LLC
*/
-
#include <asm/neon.h>
-#include <crypto/internal/simd.h>
+#include <asm/simd.h>
#include <linux/cpufeature.h>
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha512_insns);
@@ -21,7 +20,7 @@ static void sha512_blocks(struct sha512_block_state *state,
{
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
static_branch_likely(&have_sha512_insns) &&
- likely(crypto_simd_usable())) {
+ likely(may_use_simd())) {
do {
size_t rem;
@@ -38,7 +37,7 @@ static void sha512_blocks(struct sha512_block_state *state,
#ifdef CONFIG_KERNEL_MODE_NEON
#define sha512_mod_init_arch sha512_mod_init_arch
-static inline void sha512_mod_init_arch(void)
+static void sha512_mod_init_arch(void)
{
if (cpu_have_named_feature(SHA512))
static_branch_enable(&have_sha512_insns);
diff --git a/lib/crypto/blake2s-generic.c b/lib/crypto/blake2s-generic.c
deleted file mode 100644
index 9828176a2efe..000000000000
--- a/lib/crypto/blake2s-generic.c
+++ /dev/null
@@ -1,111 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- *
- * This is an implementation of the BLAKE2s hash and PRF functions.
- *
- * Information: https://blake2.net/
- *
- */
-
-#include <crypto/internal/blake2s.h>
-#include <linux/bug.h>
-#include <linux/export.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/unaligned.h>
-
-static const u8 blake2s_sigma[10][16] = {
- { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
- { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
- { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
- { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
- { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
- { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
- { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
- { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
- { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
- { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
-};
-
-static inline void blake2s_increment_counter(struct blake2s_state *state,
- const u32 inc)
-{
- state->t[0] += inc;
- state->t[1] += (state->t[0] < inc);
-}
-
-void blake2s_compress(struct blake2s_state *state, const u8 *block,
- size_t nblocks, const u32 inc)
- __weak __alias(blake2s_compress_generic);
-
-void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
- size_t nblocks, const u32 inc)
-{
- u32 m[16];
- u32 v[16];
- int i;
-
- WARN_ON(IS_ENABLED(DEBUG) &&
- (nblocks > 1 && inc != BLAKE2S_BLOCK_SIZE));
-
- while (nblocks > 0) {
- blake2s_increment_counter(state, inc);
- memcpy(m, block, BLAKE2S_BLOCK_SIZE);
- le32_to_cpu_array(m, ARRAY_SIZE(m));
- memcpy(v, state->h, 32);
- v[ 8] = BLAKE2S_IV0;
- v[ 9] = BLAKE2S_IV1;
- v[10] = BLAKE2S_IV2;
- v[11] = BLAKE2S_IV3;
- v[12] = BLAKE2S_IV4 ^ state->t[0];
- v[13] = BLAKE2S_IV5 ^ state->t[1];
- v[14] = BLAKE2S_IV6 ^ state->f[0];
- v[15] = BLAKE2S_IV7 ^ state->f[1];
-
-#define G(r, i, a, b, c, d) do { \
- a += b + m[blake2s_sigma[r][2 * i + 0]]; \
- d = ror32(d ^ a, 16); \
- c += d; \
- b = ror32(b ^ c, 12); \
- a += b + m[blake2s_sigma[r][2 * i + 1]]; \
- d = ror32(d ^ a, 8); \
- c += d; \
- b = ror32(b ^ c, 7); \
-} while (0)
-
-#define ROUND(r) do { \
- G(r, 0, v[0], v[ 4], v[ 8], v[12]); \
- G(r, 1, v[1], v[ 5], v[ 9], v[13]); \
- G(r, 2, v[2], v[ 6], v[10], v[14]); \
- G(r, 3, v[3], v[ 7], v[11], v[15]); \
- G(r, 4, v[0], v[ 5], v[10], v[15]); \
- G(r, 5, v[1], v[ 6], v[11], v[12]); \
- G(r, 6, v[2], v[ 7], v[ 8], v[13]); \
- G(r, 7, v[3], v[ 4], v[ 9], v[14]); \
-} while (0)
- ROUND(0);
- ROUND(1);
- ROUND(2);
- ROUND(3);
- ROUND(4);
- ROUND(5);
- ROUND(6);
- ROUND(7);
- ROUND(8);
- ROUND(9);
-
-#undef G
-#undef ROUND
-
- for (i = 0; i < 8; ++i)
- state->h[i] ^= v[i] ^ v[i + 8];
-
- block += BLAKE2S_BLOCK_SIZE;
- --nblocks;
- }
-}
-
-EXPORT_SYMBOL(blake2s_compress_generic);
diff --git a/lib/crypto/blake2s-selftest.c b/lib/crypto/blake2s-selftest.c
deleted file mode 100644
index d0634ed6a937..000000000000
--- a/lib/crypto/blake2s-selftest.c
+++ /dev/null
@@ -1,651 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#include <crypto/internal/blake2s.h>
-#include <linux/kernel.h>
-#include <linux/random.h>
-#include <linux/string.h>
-
-/*
- * blake2s_testvecs[] generated with the program below (using libb2-dev and
- * libssl-dev [OpenSSL])
- *
- * #include <blake2.h>
- * #include <stdint.h>
- * #include <stdio.h>
- *
- * #include <openssl/evp.h>
- *
- * #define BLAKE2S_TESTVEC_COUNT 256
- *
- * static void print_vec(const uint8_t vec[], int len)
- * {
- * int i;
- *
- * printf(" { ");
- * for (i = 0; i < len; i++) {
- * if (i && (i % 12) == 0)
- * printf("\n ");
- * printf("0x%02x, ", vec[i]);
- * }
- * printf("},\n");
- * }
- *
- * int main(void)
- * {
- * uint8_t key[BLAKE2S_KEYBYTES];
- * uint8_t buf[BLAKE2S_TESTVEC_COUNT];
- * uint8_t hash[BLAKE2S_OUTBYTES];
- * int i, j;
- *
- * key[0] = key[1] = 1;
- * for (i = 2; i < BLAKE2S_KEYBYTES; ++i)
- * key[i] = key[i - 2] + key[i - 1];
- *
- * for (i = 0; i < BLAKE2S_TESTVEC_COUNT; ++i)
- * buf[i] = (uint8_t)i;
- *
- * printf("static const u8 blake2s_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {\n");
- *
- * for (i = 0; i < BLAKE2S_TESTVEC_COUNT; ++i) {
- * int outlen = 1 + i % BLAKE2S_OUTBYTES;
- * int keylen = (13 * i) % (BLAKE2S_KEYBYTES + 1);
- *
- * blake2s(hash, buf, key + BLAKE2S_KEYBYTES - keylen, outlen, i,
- * keylen);
- * print_vec(hash, outlen);
- * }
- * printf("};\n\n");
- *
- * return 0;
- *}
- */
-static const u8 blake2s_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {
- { 0xa1, },
- { 0x7c, 0x89, },
- { 0x74, 0x0e, 0xd4, },
- { 0x47, 0x0c, 0x21, 0x15, },
- { 0x18, 0xd6, 0x9c, 0xa6, 0xc4, },
- { 0x13, 0x5d, 0x16, 0x63, 0x2e, 0xf9, },
- { 0x2c, 0xb5, 0x04, 0xb7, 0x99, 0xe2, 0x73, },
- { 0x9a, 0x0f, 0xd2, 0x39, 0xd6, 0x68, 0x1b, 0x92, },
- { 0xc8, 0xde, 0x7a, 0xea, 0x2f, 0xf4, 0xd2, 0xe3, 0x2b, },
- { 0x5b, 0xf9, 0x43, 0x52, 0x0c, 0x12, 0xba, 0xb5, 0x93, 0x9f, },
- { 0xc6, 0x2c, 0x4e, 0x80, 0xfc, 0x32, 0x5b, 0x33, 0xb8, 0xb8, 0x0a, },
- { 0xa7, 0x5c, 0xfd, 0x3a, 0xcc, 0xbf, 0x90, 0xca, 0xb7, 0x97, 0xde, 0xd8, },
- { 0x66, 0xca, 0x3c, 0xc4, 0x19, 0xef, 0x92, 0x66, 0x3f, 0x21, 0x8f, 0xda,
- 0xb7, },
- { 0xba, 0xe5, 0xbb, 0x30, 0x25, 0x94, 0x6d, 0xc3, 0x89, 0x09, 0xc4, 0x25,
- 0x52, 0x3e, },
- { 0xa2, 0xef, 0x0e, 0x52, 0x0b, 0x5f, 0xa2, 0x01, 0x6d, 0x0a, 0x25, 0xbc,
- 0x57, 0xe2, 0x27, },
- { 0x4f, 0xe0, 0xf9, 0x52, 0x12, 0xda, 0x84, 0xb7, 0xab, 0xae, 0xb0, 0xa6,
- 0x47, 0x2a, 0xc7, 0xf5, },
- { 0x56, 0xe7, 0xa8, 0x1c, 0x4c, 0xca, 0xed, 0x90, 0x31, 0xec, 0x87, 0x43,
- 0xe7, 0x72, 0x08, 0xec, 0xbe, },
- { 0x7e, 0xdf, 0x80, 0x1c, 0x93, 0x33, 0xfd, 0x53, 0x44, 0xba, 0xfd, 0x96,
- 0xe1, 0xbb, 0xb5, 0x65, 0xa5, 0x00, },
- { 0xec, 0x6b, 0xed, 0xf7, 0x7b, 0x62, 0x1d, 0x7d, 0xf4, 0x82, 0xf3, 0x1e,
- 0x18, 0xff, 0x2b, 0xc4, 0x06, 0x20, 0x2a, },
- { 0x74, 0x98, 0xd7, 0x68, 0x63, 0xed, 0x87, 0xe4, 0x5d, 0x8d, 0x9e, 0x1d,
- 0xfd, 0x2a, 0xbb, 0x86, 0xac, 0xe9, 0x2a, 0x89, },
- { 0x89, 0xc3, 0x88, 0xce, 0x2b, 0x33, 0x1e, 0x10, 0xd1, 0x37, 0x20, 0x86,
- 0x28, 0x43, 0x70, 0xd9, 0xfb, 0x96, 0xd9, 0xb5, 0xd3, },
- { 0xcb, 0x56, 0x74, 0x41, 0x8d, 0x80, 0x01, 0x9a, 0x6b, 0x38, 0xe1, 0x41,
- 0xad, 0x9c, 0x62, 0x74, 0xce, 0x35, 0xd5, 0x6c, 0x89, 0x6e, },
- { 0x79, 0xaf, 0x94, 0x59, 0x99, 0x26, 0xe1, 0xc9, 0x34, 0xfe, 0x7c, 0x22,
- 0xf7, 0x43, 0xd7, 0x65, 0xd4, 0x48, 0x18, 0xac, 0x3d, 0xfd, 0x93, },
- { 0x85, 0x0d, 0xff, 0xb8, 0x3e, 0x87, 0x41, 0xb0, 0x95, 0xd3, 0x3d, 0x00,
- 0x47, 0x55, 0x9e, 0xd2, 0x69, 0xea, 0xbf, 0xe9, 0x7a, 0x2d, 0x61, 0x45, },
- { 0x03, 0xe0, 0x85, 0xec, 0x54, 0xb5, 0x16, 0x53, 0xa8, 0xc4, 0x71, 0xe9,
- 0x6a, 0xe7, 0xcb, 0xc4, 0x15, 0x02, 0xfc, 0x34, 0xa4, 0xa4, 0x28, 0x13,
- 0xd1, },
- { 0xe3, 0x34, 0x4b, 0xe1, 0xd0, 0x4b, 0x55, 0x61, 0x8f, 0xc0, 0x24, 0x05,
- 0xe6, 0xe0, 0x3d, 0x70, 0x24, 0x4d, 0xda, 0xb8, 0x91, 0x05, 0x29, 0x07,
- 0x01, 0x3e, },
- { 0x61, 0xff, 0x01, 0x72, 0xb1, 0x4d, 0xf6, 0xfe, 0xd1, 0xd1, 0x08, 0x74,
- 0xe6, 0x91, 0x44, 0xeb, 0x61, 0xda, 0x40, 0xaf, 0xfc, 0x8c, 0x91, 0x6b,
- 0xec, 0x13, 0xed, },
- { 0xd4, 0x40, 0xd2, 0xa0, 0x7f, 0xc1, 0x58, 0x0c, 0x85, 0xa0, 0x86, 0xc7,
- 0x86, 0xb9, 0x61, 0xc9, 0xea, 0x19, 0x86, 0x1f, 0xab, 0x07, 0xce, 0x37,
- 0x72, 0x67, 0x09, 0xfc, },
- { 0x9e, 0xf8, 0x18, 0x67, 0x93, 0x10, 0x9b, 0x39, 0x75, 0xe8, 0x8b, 0x38,
- 0x82, 0x7d, 0xb8, 0xb7, 0xa5, 0xaf, 0xe6, 0x6a, 0x22, 0x5e, 0x1f, 0x9c,
- 0x95, 0x29, 0x19, 0xf2, 0x4b, },
- { 0xc8, 0x62, 0x25, 0xf5, 0x98, 0xc9, 0xea, 0xe5, 0x29, 0x3a, 0xd3, 0x22,
- 0xeb, 0xeb, 0x07, 0x7c, 0x15, 0x07, 0xee, 0x15, 0x61, 0xbb, 0x05, 0x30,
- 0x99, 0x7f, 0x11, 0xf6, 0x0a, 0x1d, },
- { 0x68, 0x70, 0xf7, 0x90, 0xa1, 0x8b, 0x1f, 0x0f, 0xbb, 0xce, 0xd2, 0x0e,
- 0x33, 0x1f, 0x7f, 0xa9, 0x78, 0xa8, 0xa6, 0x81, 0x66, 0xab, 0x8d, 0xcd,
- 0x58, 0x55, 0x3a, 0x0b, 0x7a, 0xdb, 0xb5, },
- { 0xdd, 0x35, 0xd2, 0xb4, 0xf6, 0xc7, 0xea, 0xab, 0x64, 0x24, 0x4e, 0xfe,
- 0xe5, 0x3d, 0x4e, 0x95, 0x8b, 0x6d, 0x6c, 0xbc, 0xb0, 0xf8, 0x88, 0x61,
- 0x09, 0xb7, 0x78, 0xa3, 0x31, 0xfe, 0xd9, 0x2f, },
- { 0x0a, },
- { 0x6e, 0xd4, },
- { 0x64, 0xe9, 0xd1, },
- { 0x30, 0xdd, 0x71, 0xef, },
- { 0x11, 0xb5, 0x0c, 0x87, 0xc9, },
- { 0x06, 0x1c, 0x6d, 0x04, 0x82, 0xd0, },
- { 0x5c, 0x42, 0x0b, 0xee, 0xc5, 0x9c, 0xb2, },
- { 0xe8, 0x29, 0xd6, 0xb4, 0x5d, 0xf7, 0x2b, 0x93, },
- { 0x18, 0xca, 0x27, 0x72, 0x43, 0x39, 0x16, 0xbc, 0x6a, },
- { 0x39, 0x8f, 0xfd, 0x64, 0xf5, 0x57, 0x23, 0xb0, 0x45, 0xf8, },
- { 0xbb, 0x3a, 0x78, 0x6b, 0x02, 0x1d, 0x0b, 0x16, 0xe3, 0xb2, 0x9a, },
- { 0xb8, 0xb4, 0x0b, 0xe5, 0xd4, 0x1d, 0x0d, 0x85, 0x49, 0x91, 0x35, 0xfa, },
- { 0x6d, 0x48, 0x2a, 0x0c, 0x42, 0x08, 0xbd, 0xa9, 0x78, 0x6f, 0x18, 0xaf,
- 0xe2, },
- { 0x10, 0x45, 0xd4, 0x58, 0x88, 0xec, 0x4e, 0x1e, 0xf6, 0x14, 0x92, 0x64,
- 0x7e, 0xb0, },
- { 0x8b, 0x0b, 0x95, 0xee, 0x92, 0xc6, 0x3b, 0x91, 0xf1, 0x1e, 0xeb, 0x51,
- 0x98, 0x0a, 0x8d, },
- { 0xa3, 0x50, 0x4d, 0xa5, 0x1d, 0x03, 0x68, 0xe9, 0x57, 0x78, 0xd6, 0x04,
- 0xf1, 0xc3, 0x94, 0xd8, },
- { 0xb8, 0x66, 0x6e, 0xdd, 0x46, 0x15, 0xae, 0x3d, 0x83, 0x7e, 0xcf, 0xe7,
- 0x2c, 0xe8, 0x8f, 0xc7, 0x34, },
- { 0x2e, 0xc0, 0x1f, 0x29, 0xea, 0xf6, 0xb9, 0xe2, 0xc2, 0x93, 0xeb, 0x41,
- 0x0d, 0xf0, 0x0a, 0x13, 0x0e, 0xa2, },
- { 0x71, 0xb8, 0x33, 0xa9, 0x1b, 0xac, 0xf1, 0xb5, 0x42, 0x8f, 0x5e, 0x81,
- 0x34, 0x43, 0xb7, 0xa4, 0x18, 0x5c, 0x47, },
- { 0xda, 0x45, 0xb8, 0x2e, 0x82, 0x1e, 0xc0, 0x59, 0x77, 0x9d, 0xfa, 0xb4,
- 0x1c, 0x5e, 0xa0, 0x2b, 0x33, 0x96, 0x5a, 0x58, },
- { 0xe3, 0x09, 0x05, 0xa9, 0xeb, 0x48, 0x13, 0xad, 0x71, 0x88, 0x81, 0x9a,
- 0x3e, 0x2c, 0xe1, 0x23, 0x99, 0x13, 0x35, 0x9f, 0xb5, },
- { 0xb7, 0x86, 0x2d, 0x16, 0xe1, 0x04, 0x00, 0x47, 0x47, 0x61, 0x31, 0xfb,
- 0x14, 0xac, 0xd8, 0xe9, 0xe3, 0x49, 0xbd, 0xf7, 0x9c, 0x3f, },
- { 0x7f, 0xd9, 0x95, 0xa8, 0xa7, 0xa0, 0xcc, 0xba, 0xef, 0xb1, 0x0a, 0xa9,
- 0x21, 0x62, 0x08, 0x0f, 0x1b, 0xff, 0x7b, 0x9d, 0xae, 0xb2, 0x95, },
- { 0x85, 0x99, 0xea, 0x33, 0xe0, 0x56, 0xff, 0x13, 0xc6, 0x61, 0x8c, 0xf9,
- 0x57, 0x05, 0x03, 0x11, 0xf9, 0xfb, 0x3a, 0xf7, 0xce, 0xbb, 0x52, 0x30, },
- { 0xb2, 0x72, 0x9c, 0xf8, 0x77, 0x4e, 0x8f, 0x6b, 0x01, 0x6c, 0xff, 0x4e,
- 0x4f, 0x02, 0xd2, 0xbc, 0xeb, 0x51, 0x28, 0x99, 0x50, 0xab, 0xc4, 0x42,
- 0xe3, },
- { 0x8b, 0x0a, 0xb5, 0x90, 0x8f, 0xf5, 0x7b, 0xdd, 0xba, 0x47, 0x37, 0xc9,
- 0x2a, 0xd5, 0x4b, 0x25, 0x08, 0x8b, 0x02, 0x17, 0xa7, 0x9e, 0x6b, 0x6e,
- 0xe3, 0x90, },
- { 0x90, 0xdd, 0xf7, 0x75, 0xa7, 0xa3, 0x99, 0x5e, 0x5b, 0x7d, 0x75, 0xc3,
- 0x39, 0x6b, 0xa0, 0xe2, 0x44, 0x53, 0xb1, 0x9e, 0xc8, 0xf1, 0x77, 0x10,
- 0x58, 0x06, 0x9a, },
- { 0x99, 0x52, 0xf0, 0x49, 0xa8, 0x8c, 0xec, 0xa6, 0x97, 0x32, 0x13, 0xb5,
- 0xf7, 0xa3, 0x8e, 0xfb, 0x4b, 0x59, 0x31, 0x3d, 0x01, 0x59, 0x98, 0x5d,
- 0x53, 0x03, 0x1a, 0x39, },
- { 0x9f, 0xe0, 0xc2, 0xe5, 0x5d, 0x93, 0xd6, 0x9b, 0x47, 0x8f, 0x9b, 0xe0,
- 0x26, 0x35, 0x84, 0x20, 0x1d, 0xc5, 0x53, 0x10, 0x0f, 0x22, 0xb9, 0xb5,
- 0xd4, 0x36, 0xb1, 0xac, 0x73, },
- { 0x30, 0x32, 0x20, 0x3b, 0x10, 0x28, 0xec, 0x1f, 0x4f, 0x9b, 0x47, 0x59,
- 0xeb, 0x7b, 0xee, 0x45, 0xfb, 0x0c, 0x49, 0xd8, 0x3d, 0x69, 0xbd, 0x90,
- 0x2c, 0xf0, 0x9e, 0x8d, 0xbf, 0xd5, },
- { 0x2a, 0x37, 0x73, 0x7f, 0xf9, 0x96, 0x19, 0xaa, 0x25, 0xd8, 0x13, 0x28,
- 0x01, 0x29, 0x89, 0xdf, 0x6e, 0x0c, 0x9b, 0x43, 0x44, 0x51, 0xe9, 0x75,
- 0x26, 0x0c, 0xb7, 0x87, 0x66, 0x0b, 0x5f, },
- { 0x23, 0xdf, 0x96, 0x68, 0x91, 0x86, 0xd0, 0x93, 0x55, 0x33, 0x24, 0xf6,
- 0xba, 0x08, 0x75, 0x5b, 0x59, 0x11, 0x69, 0xb8, 0xb9, 0xe5, 0x2c, 0x77,
- 0x02, 0xf6, 0x47, 0xee, 0x81, 0xdd, 0xb9, 0x06, },
- { 0x9d, },
- { 0x9d, 0x7d, },
- { 0xfd, 0xc3, 0xda, },
- { 0xe8, 0x82, 0xcd, 0x21, },
- { 0xc3, 0x1d, 0x42, 0x4c, 0x74, },
- { 0xe9, 0xda, 0xf1, 0xa2, 0xe5, 0x7c, },
- { 0x52, 0xb8, 0x6f, 0x81, 0x5c, 0x3a, 0x4c, },
- { 0x5b, 0x39, 0x26, 0xfc, 0x92, 0x5e, 0xe0, 0x49, },
- { 0x59, 0xe4, 0x7c, 0x93, 0x1c, 0xf9, 0x28, 0x93, 0xde, },
- { 0xde, 0xdf, 0xb2, 0x43, 0x61, 0x0b, 0x86, 0x16, 0x4c, 0x2e, },
- { 0x14, 0x8f, 0x75, 0x51, 0xaf, 0xb9, 0xee, 0x51, 0x5a, 0xae, 0x23, },
- { 0x43, 0x5f, 0x50, 0xd5, 0x70, 0xb0, 0x5b, 0x87, 0xf5, 0xd9, 0xb3, 0x6d, },
- { 0x66, 0x0a, 0x64, 0x93, 0x79, 0x71, 0x94, 0x40, 0xb7, 0x68, 0x2d, 0xd3,
- 0x63, },
- { 0x15, 0x00, 0xc4, 0x0c, 0x7d, 0x1b, 0x10, 0xa9, 0x73, 0x1b, 0x90, 0x6f,
- 0xe6, 0xa9, },
- { 0x34, 0x75, 0xf3, 0x86, 0x8f, 0x56, 0xcf, 0x2a, 0x0a, 0xf2, 0x62, 0x0a,
- 0xf6, 0x0e, 0x20, },
- { 0xb1, 0xde, 0xc9, 0xf5, 0xdb, 0xf3, 0x2f, 0x4c, 0xd6, 0x41, 0x7d, 0x39,
- 0x18, 0x3e, 0xc7, 0xc3, },
- { 0xc5, 0x89, 0xb2, 0xf8, 0xb8, 0xc0, 0xa3, 0xb9, 0x3b, 0x10, 0x6d, 0x7c,
- 0x92, 0xfc, 0x7f, 0x34, 0x41, },
- { 0xc4, 0xd8, 0xef, 0xba, 0xef, 0xd2, 0xaa, 0xc5, 0x6c, 0x8e, 0x3e, 0xbb,
- 0x12, 0xfc, 0x0f, 0x72, 0xbf, 0x0f, },
- { 0xdd, 0x91, 0xd1, 0x15, 0x9e, 0x7d, 0xf8, 0xc1, 0xb9, 0x14, 0x63, 0x96,
- 0xb5, 0xcb, 0x83, 0x1d, 0x35, 0x1c, 0xec, },
- { 0xa9, 0xf8, 0x52, 0xc9, 0x67, 0x76, 0x2b, 0xad, 0xfb, 0xd8, 0x3a, 0xa6,
- 0x74, 0x02, 0xae, 0xb8, 0x25, 0x2c, 0x63, 0x49, },
- { 0x77, 0x1f, 0x66, 0x70, 0xfd, 0x50, 0x29, 0xaa, 0xeb, 0xdc, 0xee, 0xba,
- 0x75, 0x98, 0xdc, 0x93, 0x12, 0x3f, 0xdc, 0x7c, 0x38, },
- { 0xe2, 0xe1, 0x89, 0x5c, 0x37, 0x38, 0x6a, 0xa3, 0x40, 0xac, 0x3f, 0xb0,
- 0xca, 0xfc, 0xa7, 0xf3, 0xea, 0xf9, 0x0f, 0x5d, 0x8e, 0x39, },
- { 0x0f, 0x67, 0xc8, 0x38, 0x01, 0xb1, 0xb7, 0xb8, 0xa2, 0xe7, 0x0a, 0x6d,
- 0xd2, 0x63, 0x69, 0x9e, 0xcc, 0xf0, 0xf2, 0xbe, 0x9b, 0x98, 0xdd, },
- { 0x13, 0xe1, 0x36, 0x30, 0xfe, 0xc6, 0x01, 0x8a, 0xa1, 0x63, 0x96, 0x59,
- 0xc2, 0xa9, 0x68, 0x3f, 0x58, 0xd4, 0x19, 0x0c, 0x40, 0xf3, 0xde, 0x02, },
- { 0xa3, 0x9e, 0xce, 0xda, 0x42, 0xee, 0x8c, 0x6c, 0x5a, 0x7d, 0xdc, 0x89,
- 0x02, 0x77, 0xdd, 0xe7, 0x95, 0xbb, 0xff, 0x0d, 0xa4, 0xb5, 0x38, 0x1e,
- 0xaf, },
- { 0x9a, 0xf6, 0xb5, 0x9a, 0x4f, 0xa9, 0x4f, 0x2c, 0x35, 0x3c, 0x24, 0xdc,
- 0x97, 0x6f, 0xd9, 0xa1, 0x7d, 0x1a, 0x85, 0x0b, 0xf5, 0xda, 0x2e, 0xe7,
- 0xb1, 0x1d, },
- { 0x84, 0x1e, 0x8e, 0x3d, 0x45, 0xa5, 0xf2, 0x27, 0xf3, 0x31, 0xfe, 0xb9,
- 0xfb, 0xc5, 0x45, 0x99, 0x99, 0xdd, 0x93, 0x43, 0x02, 0xee, 0x58, 0xaf,
- 0xee, 0x6a, 0xbe, },
- { 0x07, 0x2f, 0xc0, 0xa2, 0x04, 0xc4, 0xab, 0x7c, 0x26, 0xbb, 0xa8, 0xd8,
- 0xe3, 0x1c, 0x75, 0x15, 0x64, 0x5d, 0x02, 0x6a, 0xf0, 0x86, 0xe9, 0xcd,
- 0x5c, 0xef, 0xa3, 0x25, },
- { 0x2f, 0x3b, 0x1f, 0xb5, 0x91, 0x8f, 0x86, 0xe0, 0xdc, 0x31, 0x48, 0xb6,
- 0xa1, 0x8c, 0xfd, 0x75, 0xbb, 0x7d, 0x3d, 0xc1, 0xf0, 0x10, 0x9a, 0xd8,
- 0x4b, 0x0e, 0xe3, 0x94, 0x9f, },
- { 0x29, 0xbb, 0x8f, 0x6c, 0xd1, 0xf2, 0xb6, 0xaf, 0xe5, 0xe3, 0x2d, 0xdc,
- 0x6f, 0xa4, 0x53, 0x88, 0xd8, 0xcf, 0x4d, 0x45, 0x42, 0x62, 0xdb, 0xdf,
- 0xf8, 0x45, 0xc2, 0x13, 0xec, 0x35, },
- { 0x06, 0x3c, 0xe3, 0x2c, 0x15, 0xc6, 0x43, 0x03, 0x81, 0xfb, 0x08, 0x76,
- 0x33, 0xcb, 0x02, 0xc1, 0xba, 0x33, 0xe5, 0xe0, 0xd1, 0x92, 0xa8, 0x46,
- 0x28, 0x3f, 0x3e, 0x9d, 0x2c, 0x44, 0x54, },
- { 0xea, 0xbb, 0x96, 0xf8, 0xd1, 0x8b, 0x04, 0x11, 0x40, 0x78, 0x42, 0x02,
- 0x19, 0xd1, 0xbc, 0x65, 0x92, 0xd3, 0xc3, 0xd6, 0xd9, 0x19, 0xe7, 0xc3,
- 0x40, 0x97, 0xbd, 0xd4, 0xed, 0xfa, 0x5e, 0x28, },
- { 0x02, },
- { 0x52, 0xa8, },
- { 0x38, 0x25, 0x0d, },
- { 0xe3, 0x04, 0xd4, 0x92, },
- { 0x97, 0xdb, 0xf7, 0x81, 0xca, },
- { 0x8a, 0x56, 0x9d, 0x62, 0x56, 0xcc, },
- { 0xa1, 0x8e, 0x3c, 0x72, 0x8f, 0x63, 0x03, },
- { 0xf7, 0xf3, 0x39, 0x09, 0x0a, 0xa1, 0xbb, 0x23, },
- { 0x6b, 0x03, 0xc0, 0xe9, 0xd9, 0x83, 0x05, 0x22, 0x01, },
- { 0x1b, 0x4b, 0xf5, 0xd6, 0x4f, 0x05, 0x75, 0x91, 0x4c, 0x7f, },
- { 0x4c, 0x8c, 0x25, 0x20, 0x21, 0xcb, 0xc2, 0x4b, 0x3a, 0x5b, 0x8d, },
- { 0x56, 0xe2, 0x77, 0xa0, 0xb6, 0x9f, 0x81, 0xec, 0x83, 0x75, 0xc4, 0xf9, },
- { 0x71, 0x70, 0x0f, 0xad, 0x4d, 0x35, 0x81, 0x9d, 0x88, 0x69, 0xf9, 0xaa,
- 0xd3, },
- { 0x50, 0x6e, 0x86, 0x6e, 0x43, 0xc0, 0xc2, 0x44, 0xc2, 0xe2, 0xa0, 0x1c,
- 0xb7, 0x9a, },
- { 0xe4, 0x7e, 0x72, 0xc6, 0x12, 0x8e, 0x7c, 0xfc, 0xbd, 0xe2, 0x08, 0x31,
- 0x3d, 0x47, 0x3d, },
- { 0x08, 0x97, 0x5b, 0x80, 0xae, 0xc4, 0x1d, 0x50, 0x77, 0xdf, 0x1f, 0xd0,
- 0x24, 0xf0, 0x17, 0xc0, },
- { 0x01, 0xb6, 0x29, 0xf4, 0xaf, 0x78, 0x5f, 0xb6, 0x91, 0xdd, 0x76, 0x76,
- 0xd2, 0xfd, 0x0c, 0x47, 0x40, },
- { 0xa1, 0xd8, 0x09, 0x97, 0x7a, 0xa6, 0xc8, 0x94, 0xf6, 0x91, 0x7b, 0xae,
- 0x2b, 0x9f, 0x0d, 0x83, 0x48, 0xf7, },
- { 0x12, 0xd5, 0x53, 0x7d, 0x9a, 0xb0, 0xbe, 0xd9, 0xed, 0xe9, 0x9e, 0xee,
- 0x61, 0x5b, 0x42, 0xf2, 0xc0, 0x73, 0xc0, },
- { 0xd5, 0x77, 0xd6, 0x5c, 0x6e, 0xa5, 0x69, 0x2b, 0x3b, 0x8c, 0xd6, 0x7d,
- 0x1d, 0xbe, 0x2c, 0xa1, 0x02, 0x21, 0xcd, 0x29, },
- { 0xa4, 0x98, 0x80, 0xca, 0x22, 0xcf, 0x6a, 0xab, 0x5e, 0x40, 0x0d, 0x61,
- 0x08, 0x21, 0xef, 0xc0, 0x6c, 0x52, 0xb4, 0xb0, 0x53, },
- { 0xbf, 0xaf, 0x8f, 0x3b, 0x7a, 0x97, 0x33, 0xe5, 0xca, 0x07, 0x37, 0xfd,
- 0x15, 0xdf, 0xce, 0x26, 0x2a, 0xb1, 0xa7, 0x0b, 0xb3, 0xac, },
- { 0x16, 0x22, 0xe1, 0xbc, 0x99, 0x4e, 0x01, 0xf0, 0xfa, 0xff, 0x8f, 0xa5,
- 0x0c, 0x61, 0xb0, 0xad, 0xcc, 0xb1, 0xe1, 0x21, 0x46, 0xfa, 0x2e, },
- { 0x11, 0x5b, 0x0b, 0x2b, 0xe6, 0x14, 0xc1, 0xd5, 0x4d, 0x71, 0x5e, 0x17,
- 0xea, 0x23, 0xdd, 0x6c, 0xbd, 0x1d, 0xbe, 0x12, 0x1b, 0xee, 0x4c, 0x1a, },
- { 0x40, 0x88, 0x22, 0xf3, 0x20, 0x6c, 0xed, 0xe1, 0x36, 0x34, 0x62, 0x2c,
- 0x98, 0x83, 0x52, 0xe2, 0x25, 0xee, 0xe9, 0xf5, 0xe1, 0x17, 0xf0, 0x5c,
- 0xae, },
- { 0xc3, 0x76, 0x37, 0xde, 0x95, 0x8c, 0xca, 0x2b, 0x0c, 0x23, 0xe7, 0xb5,
- 0x38, 0x70, 0x61, 0xcc, 0xff, 0xd3, 0x95, 0x7b, 0xf3, 0xff, 0x1f, 0x9d,
- 0x59, 0x00, },
- { 0x0c, 0x19, 0x52, 0x05, 0x22, 0x53, 0xcb, 0x48, 0xd7, 0x10, 0x0e, 0x7e,
- 0x14, 0x69, 0xb5, 0xa2, 0x92, 0x43, 0xa3, 0x9e, 0x4b, 0x8f, 0x51, 0x2c,
- 0x5a, 0x2c, 0x3b, },
- { 0xe1, 0x9d, 0x70, 0x70, 0x28, 0xec, 0x86, 0x40, 0x55, 0x33, 0x56, 0xda,
- 0x88, 0xca, 0xee, 0xc8, 0x6a, 0x20, 0xb1, 0xe5, 0x3d, 0x57, 0xf8, 0x3c,
- 0x10, 0x07, 0x2a, 0xc4, },
- { 0x0b, 0xae, 0xf1, 0xc4, 0x79, 0xee, 0x1b, 0x3d, 0x27, 0x35, 0x8d, 0x14,
- 0xd6, 0xae, 0x4e, 0x3c, 0xe9, 0x53, 0x50, 0xb5, 0xcc, 0x0c, 0xf7, 0xdf,
- 0xee, 0xa1, 0x74, 0xd6, 0x71, },
- { 0xe6, 0xa4, 0xf4, 0x99, 0x98, 0xb9, 0x80, 0xea, 0x96, 0x7f, 0x4f, 0x33,
- 0xcf, 0x74, 0x25, 0x6f, 0x17, 0x6c, 0xbf, 0xf5, 0x5c, 0x38, 0xd0, 0xff,
- 0x96, 0xcb, 0x13, 0xf9, 0xdf, 0xfd, },
- { 0xbe, 0x92, 0xeb, 0xba, 0x44, 0x2c, 0x24, 0x74, 0xd4, 0x03, 0x27, 0x3c,
- 0x5d, 0x5b, 0x03, 0x30, 0x87, 0x63, 0x69, 0xe0, 0xb8, 0x94, 0xf4, 0x44,
- 0x7e, 0xad, 0xcd, 0x20, 0x12, 0x16, 0x79, },
- { 0x30, 0xf1, 0xc4, 0x8e, 0x05, 0x90, 0x2a, 0x97, 0x63, 0x94, 0x46, 0xff,
- 0xce, 0xd8, 0x67, 0xa7, 0xac, 0x33, 0x8c, 0x95, 0xb7, 0xcd, 0xa3, 0x23,
- 0x98, 0x9d, 0x76, 0x6c, 0x9d, 0xa8, 0xd6, 0x8a, },
- { 0xbe, },
- { 0x17, 0x6c, },
- { 0x1a, 0x42, 0x4f, },
- { 0xba, 0xaf, 0xb7, 0x65, },
- { 0xc2, 0x63, 0x43, 0x6a, 0xea, },
- { 0xe4, 0x4d, 0xad, 0xf2, 0x0b, 0x02, },
- { 0x04, 0xc7, 0xc4, 0x7f, 0xa9, 0x2b, 0xce, },
- { 0x66, 0xf6, 0x67, 0xcb, 0x03, 0x53, 0xc8, 0xf1, },
- { 0x56, 0xa3, 0x60, 0x78, 0xc9, 0x5f, 0x70, 0x1b, 0x5e, },
- { 0x99, 0xff, 0x81, 0x7c, 0x13, 0x3c, 0x29, 0x79, 0x4b, 0x65, },
- { 0x51, 0x10, 0x50, 0x93, 0x01, 0x93, 0xb7, 0x01, 0xc9, 0x18, 0xb7, },
- { 0x8e, 0x3c, 0x42, 0x1e, 0x5e, 0x7d, 0xc1, 0x50, 0x70, 0x1f, 0x00, 0x98, },
- { 0x5f, 0xd9, 0x9b, 0xc8, 0xd7, 0xb2, 0x72, 0x62, 0x1a, 0x1e, 0xba, 0x92,
- 0xe9, },
- { 0x70, 0x2b, 0xba, 0xfe, 0xad, 0x5d, 0x96, 0x3f, 0x27, 0xc2, 0x41, 0x6d,
- 0xc4, 0xb3, },
- { 0xae, 0xe0, 0xd5, 0xd4, 0xc7, 0xae, 0x15, 0x5e, 0xdc, 0xdd, 0x33, 0x60,
- 0xd7, 0xd3, 0x5e, },
- { 0x79, 0x8e, 0xbc, 0x9e, 0x20, 0xb9, 0x19, 0x4b, 0x63, 0x80, 0xf3, 0x16,
- 0xaf, 0x39, 0xbd, 0x92, },
- { 0xc2, 0x0e, 0x85, 0xa0, 0x0b, 0x9a, 0xb0, 0xec, 0xde, 0x38, 0xd3, 0x10,
- 0xd9, 0xa7, 0x66, 0x27, 0xcf, },
- { 0x0e, 0x3b, 0x75, 0x80, 0x67, 0x14, 0x0c, 0x02, 0x90, 0xd6, 0xb3, 0x02,
- 0x81, 0xf6, 0xa6, 0x87, 0xce, 0x58, },
- { 0x79, 0xb5, 0xe9, 0x5d, 0x52, 0x4d, 0xf7, 0x59, 0xf4, 0x2e, 0x27, 0xdd,
- 0xb3, 0xed, 0x57, 0x5b, 0x82, 0xea, 0x6f, },
- { 0xa2, 0x97, 0xf5, 0x80, 0x02, 0x3d, 0xde, 0xa3, 0xf9, 0xf6, 0xab, 0xe3,
- 0x57, 0x63, 0x7b, 0x9b, 0x10, 0x42, 0x6f, 0xf2, },
- { 0x12, 0x7a, 0xfc, 0xb7, 0x67, 0x06, 0x0c, 0x78, 0x1a, 0xfe, 0x88, 0x4f,
- 0xc6, 0xac, 0x52, 0x96, 0x64, 0x28, 0x97, 0x84, 0x06, },
- { 0xc5, 0x04, 0x44, 0x6b, 0xb2, 0xa5, 0xa4, 0x66, 0xe1, 0x76, 0xa2, 0x51,
- 0xf9, 0x59, 0x69, 0x97, 0x56, 0x0b, 0xbf, 0x50, 0xb3, 0x34, },
- { 0x21, 0x32, 0x6b, 0x42, 0xb5, 0xed, 0x71, 0x8d, 0xf7, 0x5a, 0x35, 0xe3,
- 0x90, 0xe2, 0xee, 0xaa, 0x89, 0xf6, 0xc9, 0x9c, 0x4d, 0x73, 0xf4, },
- { 0x4c, 0xa6, 0x09, 0xf4, 0x48, 0xe7, 0x46, 0xbc, 0x49, 0xfc, 0xe5, 0xda,
- 0xd1, 0x87, 0x13, 0x17, 0x4c, 0x59, 0x71, 0x26, 0x5b, 0x2c, 0x42, 0xb7, },
- { 0x13, 0x63, 0xf3, 0x40, 0x02, 0xe5, 0xa3, 0x3a, 0x5e, 0x8e, 0xf8, 0xb6,
- 0x8a, 0x49, 0x60, 0x76, 0x34, 0x72, 0x94, 0x73, 0xf6, 0xd9, 0x21, 0x6a,
- 0x26, },
- { 0xdf, 0x75, 0x16, 0x10, 0x1b, 0x5e, 0x81, 0xc3, 0xc8, 0xde, 0x34, 0x24,
- 0xb0, 0x98, 0xeb, 0x1b, 0x8f, 0xa1, 0x9b, 0x05, 0xee, 0xa5, 0xe9, 0x35,
- 0xf4, 0x1d, },
- { 0xcd, 0x21, 0x93, 0x6e, 0x5b, 0xa0, 0x26, 0x2b, 0x21, 0x0e, 0xa0, 0xb9,
- 0x1c, 0xb5, 0xbb, 0xb8, 0xf8, 0x1e, 0xff, 0x5c, 0xa8, 0xf9, 0x39, 0x46,
- 0x4e, 0x29, 0x26, },
- { 0x73, 0x7f, 0x0e, 0x3b, 0x0b, 0x5c, 0xf9, 0x60, 0xaa, 0x88, 0xa1, 0x09,
- 0xb1, 0x5d, 0x38, 0x7b, 0x86, 0x8f, 0x13, 0x7a, 0x8d, 0x72, 0x7a, 0x98,
- 0x1a, 0x5b, 0xff, 0xc9, },
- { 0xd3, 0x3c, 0x61, 0x71, 0x44, 0x7e, 0x31, 0x74, 0x98, 0x9d, 0x9a, 0xd2,
- 0x27, 0xf3, 0x46, 0x43, 0x42, 0x51, 0xd0, 0x5f, 0xe9, 0x1c, 0x5c, 0x69,
- 0xbf, 0xf6, 0xbe, 0x3c, 0x40, },
- { 0x31, 0x99, 0x31, 0x9f, 0xaa, 0x43, 0x2e, 0x77, 0x3e, 0x74, 0x26, 0x31,
- 0x5e, 0x61, 0xf1, 0x87, 0xe2, 0xeb, 0x9b, 0xcd, 0xd0, 0x3a, 0xee, 0x20,
- 0x7e, 0x10, 0x0a, 0x0b, 0x7e, 0xfa, },
- { 0xa4, 0x27, 0x80, 0x67, 0x81, 0x2a, 0xa7, 0x62, 0xf7, 0x6e, 0xda, 0xd4,
- 0x5c, 0x39, 0x74, 0xad, 0x7e, 0xbe, 0xad, 0xa5, 0x84, 0x7f, 0xa9, 0x30,
- 0x5d, 0xdb, 0xe2, 0x05, 0x43, 0xf7, 0x1b, },
- { 0x0b, 0x37, 0xd8, 0x02, 0xe1, 0x83, 0xd6, 0x80, 0xf2, 0x35, 0xc2, 0xb0,
- 0x37, 0xef, 0xef, 0x5e, 0x43, 0x93, 0xf0, 0x49, 0x45, 0x0a, 0xef, 0xb5,
- 0x76, 0x70, 0x12, 0x44, 0xc4, 0xdb, 0xf5, 0x7a, },
- { 0x1f, },
- { 0x82, 0x60, },
- { 0xcc, 0xe3, 0x08, },
- { 0x56, 0x17, 0xe4, 0x59, },
- { 0xe2, 0xd7, 0x9e, 0xc4, 0x4c, },
- { 0xb2, 0xad, 0xd3, 0x78, 0x58, 0x5a, },
- { 0xce, 0x43, 0xb4, 0x02, 0x96, 0xab, 0x3c, },
- { 0xe6, 0x05, 0x1a, 0x73, 0x22, 0x32, 0xbb, 0x77, },
- { 0x23, 0xe7, 0xda, 0xfe, 0x2c, 0xef, 0x8c, 0x22, 0xec, },
- { 0xe9, 0x8e, 0x55, 0x38, 0xd1, 0xd7, 0x35, 0x23, 0x98, 0xc7, },
- { 0xb5, 0x81, 0x1a, 0xe5, 0xb5, 0xa5, 0xd9, 0x4d, 0xca, 0x41, 0xe7, },
- { 0x41, 0x16, 0x16, 0x95, 0x8d, 0x9e, 0x0c, 0xea, 0x8c, 0x71, 0x9a, 0xc1, },
- { 0x7c, 0x33, 0xc0, 0xa4, 0x00, 0x62, 0xea, 0x60, 0x67, 0xe4, 0x20, 0xbc,
- 0x5b, },
- { 0xdb, 0xb1, 0xdc, 0xfd, 0x08, 0xc0, 0xde, 0x82, 0xd1, 0xde, 0x38, 0xc0,
- 0x90, 0x48, },
- { 0x37, 0x18, 0x2e, 0x0d, 0x61, 0xaa, 0x61, 0xd7, 0x86, 0x20, 0x16, 0x60,
- 0x04, 0xd9, 0xd5, },
- { 0xb0, 0xcf, 0x2c, 0x4c, 0x5e, 0x5b, 0x4f, 0x2a, 0x23, 0x25, 0x58, 0x47,
- 0xe5, 0x31, 0x06, 0x70, },
- { 0x91, 0xa0, 0xa3, 0x86, 0x4e, 0xe0, 0x72, 0x38, 0x06, 0x67, 0x59, 0x5c,
- 0x70, 0x25, 0xdb, 0x33, 0x27, },
- { 0x44, 0x58, 0x66, 0xb8, 0x58, 0xc7, 0x13, 0xed, 0x4c, 0xc0, 0xf4, 0x9a,
- 0x1e, 0x67, 0x75, 0x33, 0xb6, 0xb8, },
- { 0x7f, 0x98, 0x4a, 0x8e, 0x50, 0xa2, 0x5c, 0xcd, 0x59, 0xde, 0x72, 0xb3,
- 0x9d, 0xc3, 0x09, 0x8a, 0xab, 0x56, 0xf1, },
- { 0x80, 0x96, 0x49, 0x1a, 0x59, 0xa2, 0xc5, 0xd5, 0xa7, 0x20, 0x8a, 0xb7,
- 0x27, 0x62, 0x84, 0x43, 0xc6, 0xe1, 0x1b, 0x5d, },
- { 0x6b, 0xb7, 0x2b, 0x26, 0x62, 0x14, 0x70, 0x19, 0x3d, 0x4d, 0xac, 0xac,
- 0x63, 0x58, 0x5e, 0x94, 0xb5, 0xb7, 0xe8, 0xe8, 0xa2, },
- { 0x20, 0xa8, 0xc0, 0xfd, 0x63, 0x3d, 0x6e, 0x98, 0xcf, 0x0c, 0x49, 0x98,
- 0xe4, 0x5a, 0xfe, 0x8c, 0xaa, 0x70, 0x82, 0x1c, 0x7b, 0x74, },
- { 0xc8, 0xe8, 0xdd, 0xdf, 0x69, 0x30, 0x01, 0xc2, 0x0f, 0x7e, 0x2f, 0x11,
- 0xcc, 0x3e, 0x17, 0xa5, 0x69, 0x40, 0x3f, 0x0e, 0x79, 0x7f, 0xcf, },
- { 0xdb, 0x61, 0xc0, 0xe2, 0x2e, 0x49, 0x07, 0x31, 0x1d, 0x91, 0x42, 0x8a,
- 0xfc, 0x5e, 0xd3, 0xf8, 0x56, 0x1f, 0x2b, 0x73, 0xfd, 0x9f, 0xb2, 0x8e, },
- { 0x0c, 0x89, 0x55, 0x0c, 0x1f, 0x59, 0x2c, 0x9d, 0x1b, 0x29, 0x1d, 0x41,
- 0x1d, 0xe6, 0x47, 0x8f, 0x8c, 0x2b, 0xea, 0x8f, 0xf0, 0xff, 0x21, 0x70,
- 0x88, },
- { 0x12, 0x18, 0x95, 0xa6, 0x59, 0xb1, 0x31, 0x24, 0x45, 0x67, 0x55, 0xa4,
- 0x1a, 0x2d, 0x48, 0x67, 0x1b, 0x43, 0x88, 0x2d, 0x8e, 0xa0, 0x70, 0xb3,
- 0xc6, 0xbb, },
- { 0xe7, 0xb1, 0x1d, 0xb2, 0x76, 0x4d, 0x68, 0x68, 0x68, 0x23, 0x02, 0x55,
- 0x3a, 0xe2, 0xe5, 0xd5, 0x4b, 0x43, 0xf9, 0x34, 0x77, 0x5c, 0xa1, 0xf5,
- 0x55, 0xfd, 0x4f, },
- { 0x8c, 0x87, 0x5a, 0x08, 0x3a, 0x73, 0xad, 0x61, 0xe1, 0xe7, 0x99, 0x7e,
- 0xf0, 0x5d, 0xe9, 0x5d, 0x16, 0x43, 0x80, 0x2f, 0xd0, 0x66, 0x34, 0xe2,
- 0x42, 0x64, 0x3b, 0x1a, },
- { 0x39, 0xc1, 0x99, 0xcf, 0x22, 0xbf, 0x16, 0x8f, 0x9f, 0x80, 0x7f, 0x95,
- 0x0a, 0x05, 0x67, 0x27, 0xe7, 0x15, 0xdf, 0x9d, 0xb2, 0xfe, 0x1c, 0xb5,
- 0x1d, 0x60, 0x8f, 0x8a, 0x1d, },
- { 0x9b, 0x6e, 0x08, 0x09, 0x06, 0x73, 0xab, 0x68, 0x02, 0x62, 0x1a, 0xe4,
- 0xd4, 0xdf, 0xc7, 0x02, 0x4c, 0x6a, 0x5f, 0xfd, 0x23, 0xac, 0xae, 0x6d,
- 0x43, 0xa4, 0x7a, 0x50, 0x60, 0x3c, },
- { 0x1d, 0xb4, 0xc6, 0xe1, 0xb1, 0x4b, 0xe3, 0xf2, 0xe2, 0x1a, 0x73, 0x1b,
- 0xa0, 0x92, 0xa7, 0xf5, 0xff, 0x8f, 0x8b, 0x5d, 0xdf, 0xa8, 0x04, 0xb3,
- 0xb0, 0xf7, 0xcc, 0x12, 0xfa, 0x35, 0x46, },
- { 0x49, 0x45, 0x97, 0x11, 0x0f, 0x1c, 0x60, 0x8e, 0xe8, 0x47, 0x30, 0xcf,
- 0x60, 0xa8, 0x71, 0xc5, 0x1b, 0xe9, 0x39, 0x4d, 0x49, 0xb6, 0x12, 0x1f,
- 0x24, 0xab, 0x37, 0xff, 0x83, 0xc2, 0xe1, 0x3a, },
- { 0x60, },
- { 0x24, 0x26, },
- { 0x47, 0xeb, 0xc9, },
- { 0x4a, 0xd0, 0xbc, 0xf0, },
- { 0x8e, 0x2b, 0xc9, 0x85, 0x3c, },
- { 0xa2, 0x07, 0x15, 0xb8, 0x12, 0x74, },
- { 0x0f, 0xdb, 0x5b, 0x33, 0x69, 0xfe, 0x4b, },
- { 0xa2, 0x86, 0x54, 0xf4, 0xfd, 0xb2, 0xd4, 0xe6, },
- { 0xbb, 0x84, 0x78, 0x49, 0x27, 0x8e, 0x61, 0xda, 0x60, },
- { 0x04, 0xc3, 0xcd, 0xaa, 0x8f, 0xa7, 0x03, 0xc9, 0xf9, 0xb6, },
- { 0xf8, 0x27, 0x1d, 0x61, 0xdc, 0x21, 0x42, 0xdd, 0xad, 0x92, 0x40, },
- { 0x12, 0x87, 0xdf, 0xc2, 0x41, 0x45, 0x5a, 0x36, 0x48, 0x5b, 0x51, 0x2b, },
- { 0xbb, 0x37, 0x5d, 0x1f, 0xf1, 0x68, 0x7a, 0xc4, 0xa5, 0xd2, 0xa4, 0x91,
- 0x8d, },
- { 0x5b, 0x27, 0xd1, 0x04, 0x54, 0x52, 0x9f, 0xa3, 0x47, 0x86, 0x33, 0x33,
- 0xbf, 0xa0, },
- { 0xcf, 0x04, 0xea, 0xf8, 0x03, 0x2a, 0x43, 0xff, 0xa6, 0x68, 0x21, 0x4c,
- 0xd5, 0x4b, 0xed, },
- { 0xaf, 0xb8, 0xbc, 0x63, 0x0f, 0x18, 0x4d, 0xe2, 0x7a, 0xdd, 0x46, 0x44,
- 0xc8, 0x24, 0x0a, 0xb7, },
- { 0x3e, 0xdc, 0x36, 0xe4, 0x89, 0xb1, 0xfa, 0xc6, 0x40, 0x93, 0x2e, 0x75,
- 0xb2, 0x15, 0xd1, 0xb1, 0x10, },
- { 0x6c, 0xd8, 0x20, 0x3b, 0x82, 0x79, 0xf9, 0xc8, 0xbc, 0x9d, 0xe0, 0x35,
- 0xbe, 0x1b, 0x49, 0x1a, 0xbc, 0x3a, },
- { 0x78, 0x65, 0x2c, 0xbe, 0x35, 0x67, 0xdc, 0x78, 0xd4, 0x41, 0xf6, 0xc9,
- 0xde, 0xde, 0x1f, 0x18, 0x13, 0x31, 0x11, },
- { 0x8a, 0x7f, 0xb1, 0x33, 0x8f, 0x0c, 0x3c, 0x0a, 0x06, 0x61, 0xf0, 0x47,
- 0x29, 0x1b, 0x29, 0xbc, 0x1c, 0x47, 0xef, 0x7a, },
- { 0x65, 0x91, 0xf1, 0xe6, 0xb3, 0x96, 0xd3, 0x8c, 0xc2, 0x4a, 0x59, 0x35,
- 0x72, 0x8e, 0x0b, 0x9a, 0x87, 0xca, 0x34, 0x7b, 0x63, },
- { 0x5f, 0x08, 0x87, 0x80, 0x56, 0x25, 0x89, 0x77, 0x61, 0x8c, 0x64, 0xa1,
- 0x59, 0x6d, 0x59, 0x62, 0xe8, 0x4a, 0xc8, 0x58, 0x99, 0xd1, },
- { 0x23, 0x87, 0x1d, 0xed, 0x6f, 0xf2, 0x91, 0x90, 0xe2, 0xfe, 0x43, 0x21,
- 0xaf, 0x97, 0xc6, 0xbc, 0xd7, 0x15, 0xc7, 0x2d, 0x08, 0x77, 0x91, },
- { 0x90, 0x47, 0x9a, 0x9e, 0x3a, 0xdf, 0xf3, 0xc9, 0x4c, 0x1e, 0xa7, 0xd4,
- 0x6a, 0x32, 0x90, 0xfe, 0xb7, 0xb6, 0x7b, 0xfa, 0x96, 0x61, 0xfb, 0xa4, },
- { 0xb1, 0x67, 0x60, 0x45, 0xb0, 0x96, 0xc5, 0x15, 0x9f, 0x4d, 0x26, 0xd7,
- 0x9d, 0xf1, 0xf5, 0x6d, 0x21, 0x00, 0x94, 0x31, 0x64, 0x94, 0xd3, 0xa7,
- 0xd3, },
- { 0x02, 0x3e, 0xaf, 0xf3, 0x79, 0x73, 0xa5, 0xf5, 0xcc, 0x7a, 0x7f, 0xfb,
- 0x79, 0x2b, 0x85, 0x8c, 0x88, 0x72, 0x06, 0xbe, 0xfe, 0xaf, 0xc1, 0x16,
- 0xa6, 0xd6, },
- { 0x2a, 0xb0, 0x1a, 0xe5, 0xaa, 0x6e, 0xb3, 0xae, 0x53, 0x85, 0x33, 0x80,
- 0x75, 0xae, 0x30, 0xe6, 0xb8, 0x72, 0x42, 0xf6, 0x25, 0x4f, 0x38, 0x88,
- 0x55, 0xd1, 0xa9, },
- { 0x90, 0xd8, 0x0c, 0xc0, 0x93, 0x4b, 0x4f, 0x9e, 0x65, 0x6c, 0xa1, 0x54,
- 0xa6, 0xf6, 0x6e, 0xca, 0xd2, 0xbb, 0x7e, 0x6a, 0x1c, 0xd3, 0xce, 0x46,
- 0xef, 0xb0, 0x00, 0x8d, },
- { 0xed, 0x9c, 0x49, 0xcd, 0xc2, 0xde, 0x38, 0x0e, 0xe9, 0x98, 0x6c, 0xc8,
- 0x90, 0x9e, 0x3c, 0xd4, 0xd3, 0xeb, 0x88, 0x32, 0xc7, 0x28, 0xe3, 0x94,
- 0x1c, 0x9f, 0x8b, 0xf3, 0xcb, },
- { 0xac, 0xe7, 0x92, 0x16, 0xb4, 0x14, 0xa0, 0xe4, 0x04, 0x79, 0xa2, 0xf4,
- 0x31, 0xe6, 0x0c, 0x26, 0xdc, 0xbf, 0x2f, 0x69, 0x1b, 0x55, 0x94, 0x67,
- 0xda, 0x0c, 0xd7, 0x32, 0x1f, 0xef, },
- { 0x68, 0x63, 0x85, 0x57, 0x95, 0x9e, 0x42, 0x27, 0x41, 0x43, 0x42, 0x02,
- 0xa5, 0x78, 0xa7, 0xc6, 0x43, 0xc1, 0x6a, 0xba, 0x70, 0x80, 0xcd, 0x04,
- 0xb6, 0x78, 0x76, 0x29, 0xf3, 0xe8, 0xa0, },
- { 0xe6, 0xac, 0x8d, 0x9d, 0xf0, 0xc0, 0xf7, 0xf7, 0xe3, 0x3e, 0x4e, 0x28,
- 0x0f, 0x59, 0xb2, 0x67, 0x9e, 0x84, 0x34, 0x42, 0x96, 0x30, 0x2b, 0xca,
- 0x49, 0xb6, 0xc5, 0x9a, 0x84, 0x59, 0xa7, 0x81, },
- { 0x7e, },
- { 0x1e, 0x21, },
- { 0x26, 0xd3, 0xdd, },
- { 0x2c, 0xd4, 0xb3, 0x3d, },
- { 0x86, 0x7b, 0x76, 0x3c, 0xf0, },
- { 0x12, 0xc3, 0x70, 0x1d, 0x55, 0x18, },
- { 0x96, 0xc2, 0xbd, 0x61, 0x55, 0xf4, 0x24, },
- { 0x20, 0x51, 0xf7, 0x86, 0x58, 0x8f, 0x07, 0x2a, },
- { 0x93, 0x15, 0xa8, 0x1d, 0xda, 0x97, 0xee, 0x0e, 0x6c, },
- { 0x39, 0x93, 0xdf, 0xd5, 0x0e, 0xca, 0xdc, 0x7a, 0x92, 0xce, },
- { 0x60, 0xd5, 0xfd, 0xf5, 0x1b, 0x26, 0x82, 0x26, 0x73, 0x02, 0xbc, },
- { 0x98, 0xf2, 0x34, 0xe1, 0xf5, 0xfb, 0x00, 0xac, 0x10, 0x4a, 0x38, 0x9f, },
- { 0xda, 0x3a, 0x92, 0x8a, 0xd0, 0xcd, 0x12, 0xcd, 0x15, 0xbb, 0xab, 0x77,
- 0x66, },
- { 0xa2, 0x92, 0x1a, 0xe5, 0xca, 0x0c, 0x30, 0x75, 0xeb, 0xaf, 0x00, 0x31,
- 0x55, 0x66, },
- { 0x06, 0xea, 0xfd, 0x3e, 0x86, 0x38, 0x62, 0x4e, 0xa9, 0x12, 0xa4, 0x12,
- 0x43, 0xbf, 0xa1, },
- { 0xe4, 0x71, 0x7b, 0x94, 0xdb, 0xa0, 0xd2, 0xff, 0x9b, 0xeb, 0xad, 0x8e,
- 0x95, 0x8a, 0xc5, 0xed, },
- { 0x25, 0x5a, 0x77, 0x71, 0x41, 0x0e, 0x7a, 0xe9, 0xed, 0x0c, 0x10, 0xef,
- 0xf6, 0x2b, 0x3a, 0xba, 0x60, },
- { 0xee, 0xe2, 0xa3, 0x67, 0x64, 0x1d, 0xc6, 0x04, 0xc4, 0xe1, 0x68, 0xd2,
- 0x6e, 0xd2, 0x91, 0x75, 0x53, 0x07, },
- { 0xe0, 0xf6, 0x4d, 0x8f, 0x68, 0xfc, 0x06, 0x7e, 0x18, 0x79, 0x7f, 0x2b,
- 0x6d, 0xef, 0x46, 0x7f, 0xab, 0xb2, 0xad, },
- { 0x3d, 0x35, 0x88, 0x9f, 0x2e, 0xcf, 0x96, 0x45, 0x07, 0x60, 0x71, 0x94,
- 0x00, 0x8d, 0xbf, 0xf4, 0xef, 0x46, 0x2e, 0x3c, },
- { 0x43, 0xcf, 0x98, 0xf7, 0x2d, 0xf4, 0x17, 0xe7, 0x8c, 0x05, 0x2d, 0x9b,
- 0x24, 0xfb, 0x4d, 0xea, 0x4a, 0xec, 0x01, 0x25, 0x29, },
- { 0x8e, 0x73, 0x9a, 0x78, 0x11, 0xfe, 0x48, 0xa0, 0x3b, 0x1a, 0x26, 0xdf,
- 0x25, 0xe9, 0x59, 0x1c, 0x70, 0x07, 0x9f, 0xdc, 0xa0, 0xa6, },
- { 0xe8, 0x47, 0x71, 0xc7, 0x3e, 0xdf, 0xb5, 0x13, 0xb9, 0x85, 0x13, 0xa8,
- 0x54, 0x47, 0x6e, 0x59, 0x96, 0x09, 0x13, 0x5f, 0x82, 0x16, 0x0b, },
- { 0xfb, 0xc0, 0x8c, 0x03, 0x21, 0xb3, 0xc4, 0xb5, 0x43, 0x32, 0x6c, 0xea,
- 0x7f, 0xa8, 0x43, 0x91, 0xe8, 0x4e, 0x3f, 0xbf, 0x45, 0x58, 0x6a, 0xa3, },
- { 0x55, 0xf8, 0xf3, 0x00, 0x76, 0x09, 0xef, 0x69, 0x5d, 0xd2, 0x8a, 0xf2,
- 0x65, 0xc3, 0xcb, 0x9b, 0x43, 0xfd, 0xb1, 0x7e, 0x7f, 0xa1, 0x94, 0xb0,
- 0xd7, },
- { 0xaa, 0x13, 0xc1, 0x51, 0x40, 0x6d, 0x8d, 0x4c, 0x0a, 0x95, 0x64, 0x7b,
- 0xd1, 0x96, 0xb6, 0x56, 0xb4, 0x5b, 0xcf, 0xd6, 0xd9, 0x15, 0x97, 0xdd,
- 0xb6, 0xef, },
- { 0xaf, 0xb7, 0x36, 0xb0, 0x04, 0xdb, 0xd7, 0x9c, 0x9a, 0x44, 0xc4, 0xf6,
- 0x1f, 0x12, 0x21, 0x2d, 0x59, 0x30, 0x54, 0xab, 0x27, 0x61, 0xa3, 0x57,
- 0xef, 0xf8, 0x53, },
- { 0x97, 0x34, 0x45, 0x3e, 0xce, 0x7c, 0x35, 0xa2, 0xda, 0x9f, 0x4b, 0x46,
- 0x6c, 0x11, 0x67, 0xff, 0x2f, 0x76, 0x58, 0x15, 0x71, 0xfa, 0x44, 0x89,
- 0x89, 0xfd, 0xf7, 0x99, },
- { 0x1f, 0xb1, 0x62, 0xeb, 0x83, 0xc5, 0x9c, 0x89, 0xf9, 0x2c, 0xd2, 0x03,
- 0x61, 0xbc, 0xbb, 0xa5, 0x74, 0x0e, 0x9b, 0x7e, 0x82, 0x3e, 0x70, 0x0a,
- 0xa9, 0x8f, 0x2b, 0x59, 0xfb, },
- { 0xf8, 0xca, 0x5e, 0x3a, 0x4f, 0x9e, 0x10, 0x69, 0x10, 0xd5, 0x4c, 0xeb,
- 0x1a, 0x0f, 0x3c, 0x6a, 0x98, 0xf5, 0xb0, 0x97, 0x5b, 0x37, 0x2f, 0x0d,
- 0xbd, 0x42, 0x4b, 0x69, 0xa1, 0x82, },
- { 0x12, 0x8c, 0x6d, 0x52, 0x08, 0xef, 0x74, 0xb2, 0xe6, 0xaa, 0xd3, 0xb0,
- 0x26, 0xb0, 0xd9, 0x94, 0xb6, 0x11, 0x45, 0x0e, 0x36, 0x71, 0x14, 0x2d,
- 0x41, 0x8c, 0x21, 0x53, 0x31, 0xe9, 0x68, },
- { 0xee, 0xea, 0x0d, 0x89, 0x47, 0x7e, 0x72, 0xd1, 0xd8, 0xce, 0x58, 0x4c,
- 0x94, 0x1f, 0x0d, 0x51, 0x08, 0xa3, 0xb6, 0x3d, 0xe7, 0x82, 0x46, 0x92,
- 0xd6, 0x98, 0x6b, 0x07, 0x10, 0x65, 0x52, 0x65, },
-};
-
-static bool __init noinline_for_stack blake2s_digest_test(void)
-{
- u8 key[BLAKE2S_KEY_SIZE];
- u8 buf[ARRAY_SIZE(blake2s_testvecs)];
- u8 hash[BLAKE2S_HASH_SIZE];
- struct blake2s_state state;
- bool success = true;
- int i, l;
-
- key[0] = key[1] = 1;
- for (i = 2; i < sizeof(key); ++i)
- key[i] = key[i - 2] + key[i - 1];
-
- for (i = 0; i < sizeof(buf); ++i)
- buf[i] = (u8)i;
-
- for (i = l = 0; i < ARRAY_SIZE(blake2s_testvecs); l = (l + 37) % ++i) {
- int outlen = 1 + i % BLAKE2S_HASH_SIZE;
- int keylen = (13 * i) % (BLAKE2S_KEY_SIZE + 1);
-
- blake2s(hash, buf, key + BLAKE2S_KEY_SIZE - keylen, outlen, i,
- keylen);
- if (memcmp(hash, blake2s_testvecs[i], outlen)) {
- pr_err("blake2s self-test %d: FAIL\n", i + 1);
- success = false;
- }
-
- if (!keylen)
- blake2s_init(&state, outlen);
- else
- blake2s_init_key(&state, outlen,
- key + BLAKE2S_KEY_SIZE - keylen,
- keylen);
-
- blake2s_update(&state, buf, l);
- blake2s_update(&state, buf + l, i - l);
- blake2s_final(&state, hash);
- if (memcmp(hash, blake2s_testvecs[i], outlen)) {
- pr_err("blake2s init/update/final self-test %d: FAIL\n",
- i + 1);
- success = false;
- }
- }
-
- return success;
-}
-
-static bool __init noinline_for_stack blake2s_random_test(void)
-{
- struct blake2s_state state;
- bool success = true;
- int i, l;
-
- for (i = 0; i < 32; ++i) {
- enum { TEST_ALIGNMENT = 16 };
- u8 blocks[BLAKE2S_BLOCK_SIZE * 2 + TEST_ALIGNMENT - 1]
- __aligned(TEST_ALIGNMENT);
- u8 *unaligned_block = blocks + BLAKE2S_BLOCK_SIZE;
- struct blake2s_state state1, state2;
-
- get_random_bytes(blocks, sizeof(blocks));
- get_random_bytes(&state, sizeof(state));
-
-#if defined(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC) && \
- defined(CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S)
- memcpy(&state1, &state, sizeof(state1));
- memcpy(&state2, &state, sizeof(state2));
- blake2s_compress(&state1, blocks, 2, BLAKE2S_BLOCK_SIZE);
- blake2s_compress_generic(&state2, blocks, 2, BLAKE2S_BLOCK_SIZE);
- if (memcmp(&state1, &state2, sizeof(state1))) {
- pr_err("blake2s random compress self-test %d: FAIL\n",
- i + 1);
- success = false;
- }
-#endif
-
- memcpy(&state1, &state, sizeof(state1));
- blake2s_compress(&state1, blocks, 1, BLAKE2S_BLOCK_SIZE);
- for (l = 1; l < TEST_ALIGNMENT; ++l) {
- memcpy(unaligned_block + l, blocks,
- BLAKE2S_BLOCK_SIZE);
- memcpy(&state2, &state, sizeof(state2));
- blake2s_compress(&state2, unaligned_block + l, 1,
- BLAKE2S_BLOCK_SIZE);
- if (memcmp(&state1, &state2, sizeof(state1))) {
- pr_err("blake2s random compress align %d self-test %d: FAIL\n",
- l, i + 1);
- success = false;
- }
- }
- }
-
- return success;
-}
-
-bool __init blake2s_selftest(void)
-{
- bool success;
-
- success = blake2s_digest_test();
- success &= blake2s_random_test();
-
- return success;
-}
diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c
index f6ec68c3dcda..5638ed9d882d 100644
--- a/lib/crypto/blake2s.c
+++ b/lib/crypto/blake2s.c
@@ -8,15 +8,108 @@
*
*/
-#include <crypto/internal/blake2s.h>
+#include <crypto/blake2s.h>
#include <linux/bug.h>
#include <linux/export.h>
-#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/types.h>
+static const u8 blake2s_sigma[10][16] = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+ { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+ { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+ { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+ { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+ { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+ { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+ { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+ { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+};
+
+static inline void blake2s_increment_counter(struct blake2s_state *state,
+ const u32 inc)
+{
+ state->t[0] += inc;
+ state->t[1] += (state->t[0] < inc);
+}
+
+static void __maybe_unused
+blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
+ size_t nblocks, const u32 inc)
+{
+ u32 m[16];
+ u32 v[16];
+ int i;
+
+ WARN_ON(IS_ENABLED(DEBUG) &&
+ (nblocks > 1 && inc != BLAKE2S_BLOCK_SIZE));
+
+ while (nblocks > 0) {
+ blake2s_increment_counter(state, inc);
+ memcpy(m, block, BLAKE2S_BLOCK_SIZE);
+ le32_to_cpu_array(m, ARRAY_SIZE(m));
+ memcpy(v, state->h, 32);
+ v[ 8] = BLAKE2S_IV0;
+ v[ 9] = BLAKE2S_IV1;
+ v[10] = BLAKE2S_IV2;
+ v[11] = BLAKE2S_IV3;
+ v[12] = BLAKE2S_IV4 ^ state->t[0];
+ v[13] = BLAKE2S_IV5 ^ state->t[1];
+ v[14] = BLAKE2S_IV6 ^ state->f[0];
+ v[15] = BLAKE2S_IV7 ^ state->f[1];
+
+#define G(r, i, a, b, c, d) do { \
+ a += b + m[blake2s_sigma[r][2 * i + 0]]; \
+ d = ror32(d ^ a, 16); \
+ c += d; \
+ b = ror32(b ^ c, 12); \
+ a += b + m[blake2s_sigma[r][2 * i + 1]]; \
+ d = ror32(d ^ a, 8); \
+ c += d; \
+ b = ror32(b ^ c, 7); \
+} while (0)
+
+#define ROUND(r) do { \
+ G(r, 0, v[0], v[ 4], v[ 8], v[12]); \
+ G(r, 1, v[1], v[ 5], v[ 9], v[13]); \
+ G(r, 2, v[2], v[ 6], v[10], v[14]); \
+ G(r, 3, v[3], v[ 7], v[11], v[15]); \
+ G(r, 4, v[0], v[ 5], v[10], v[15]); \
+ G(r, 5, v[1], v[ 6], v[11], v[12]); \
+ G(r, 6, v[2], v[ 7], v[ 8], v[13]); \
+ G(r, 7, v[3], v[ 4], v[ 9], v[14]); \
+} while (0)
+ ROUND(0);
+ ROUND(1);
+ ROUND(2);
+ ROUND(3);
+ ROUND(4);
+ ROUND(5);
+ ROUND(6);
+ ROUND(7);
+ ROUND(8);
+ ROUND(9);
+
+#undef G
+#undef ROUND
+
+ for (i = 0; i < 8; ++i)
+ state->h[i] ^= v[i] ^ v[i + 8];
+
+ block += BLAKE2S_BLOCK_SIZE;
+ --nblocks;
+ }
+}
+
+#ifdef CONFIG_CRYPTO_LIB_BLAKE2S_ARCH
+#include "blake2s.h" /* $(SRCARCH)/blake2s.h */
+#else
+#define blake2s_compress blake2s_compress_generic
+#endif
+
static inline void blake2s_set_lastblock(struct blake2s_state *state)
{
state->f[0] = -1;
@@ -59,14 +152,14 @@ void blake2s_final(struct blake2s_state *state, u8 *out)
}
EXPORT_SYMBOL(blake2s_final);
+#ifdef blake2s_mod_init_arch
static int __init blake2s_mod_init(void)
{
- if (IS_ENABLED(CONFIG_CRYPTO_SELFTESTS) &&
- WARN_ON(!blake2s_selftest()))
- return -ENODEV;
+ blake2s_mod_init_arch();
return 0;
}
+subsys_initcall(blake2s_mod_init);
+#endif
-module_init(blake2s_mod_init);
MODULE_DESCRIPTION("BLAKE2s hash function");
MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
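The #ifdef CONFIG_CRYPTO_LIB_BLAKE2S_ARCH block above expects lib/crypto/$(SRCARCH)/blake2s.h to supply blake2s_compress(), and optionally a blake2s_mod_init_arch() that flips static keys at boot. A hedged sketch of such a header, modeled on the arm64 sha256.h earlier in this patch; the asm entry point blake2s_compress_simd() and the chosen CPU feature are hypothetical:

static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_blake2s_simd);

/* Hypothetical assembly routine; not part of this patch. */
asmlinkage void blake2s_compress_simd(struct blake2s_state *state,
                                      const u8 *block, size_t nblocks, u32 inc);

static void blake2s_compress(struct blake2s_state *state, const u8 *block,
                             size_t nblocks, const u32 inc)
{
        if (static_branch_likely(&have_blake2s_simd) && likely(may_use_simd())) {
                kernel_neon_begin();
                blake2s_compress_simd(state, block, nblocks, inc);
                kernel_neon_end();
        } else {
                blake2s_compress_generic(state, block, nblocks, inc);
        }
}

#define blake2s_mod_init_arch blake2s_mod_init_arch
static void blake2s_mod_init_arch(void)
{
        if (cpu_have_named_feature(ASIMD))
                static_branch_enable(&have_blake2s_simd);
}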
diff --git a/lib/crypto/chacha-block-generic.c b/lib/crypto/chacha-block-generic.c
new file mode 100644
index 000000000000..77f68de71066
--- /dev/null
+++ b/lib/crypto/chacha-block-generic.c
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * The "hash function" used as the core of the ChaCha stream cipher (RFC7539)
+ *
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <crypto/chacha.h>
+#include <linux/bitops.h>
+#include <linux/bug.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+
+static void chacha_permute(struct chacha_state *state, int nrounds)
+{
+ u32 *x = state->x;
+ int i;
+
+ /* whitelist the allowed round counts */
+ WARN_ON_ONCE(nrounds != 20 && nrounds != 12);
+
+ for (i = 0; i < nrounds; i += 2) {
+ x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 16);
+ x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 16);
+ x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 16);
+ x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 16);
+
+ x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 12);
+ x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 12);
+ x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 12);
+ x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 12);
+
+ x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 8);
+ x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 8);
+ x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 8);
+ x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 8);
+
+ x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 7);
+ x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 7);
+ x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 7);
+ x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 7);
+
+ x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 16);
+ x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 16);
+ x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 16);
+ x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 16);
+
+ x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 12);
+ x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 12);
+ x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 12);
+ x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 12);
+
+ x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 8);
+ x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 8);
+ x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 8);
+ x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 8);
+
+ x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 7);
+ x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 7);
+ x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 7);
+ x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 7);
+ }
+}
+
+/**
+ * chacha_block_generic - generate one keystream block and increment block counter
+ * @state: input state matrix
+ * @out: output keystream block
+ * @nrounds: number of rounds (20 or 12; 20 is recommended)
+ *
+ * This is the ChaCha core, a function from 64-byte strings to 64-byte strings.
+ * The caller has already converted the endianness of the input. This function
+ * also handles incrementing the block counter in the input matrix.
+ */
+void chacha_block_generic(struct chacha_state *state,
+ u8 out[CHACHA_BLOCK_SIZE], int nrounds)
+{
+ struct chacha_state permuted_state = *state;
+ int i;
+
+ chacha_permute(&permuted_state, nrounds);
+
+ for (i = 0; i < ARRAY_SIZE(state->x); i++)
+ put_unaligned_le32(permuted_state.x[i] + state->x[i],
+ &out[i * sizeof(u32)]);
+
+ state->x[12]++;
+}
+EXPORT_SYMBOL(chacha_block_generic);
+
+/**
+ * hchacha_block_generic - abbreviated ChaCha core, for XChaCha
+ * @state: input state matrix
+ * @out: the output words
+ * @nrounds: number of rounds (20 or 12; 20 is recommended)
+ *
+ * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step
+ * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf). HChaCha
+ * skips the final addition of the initial state, and outputs only certain words
+ * of the state. It should not be used for streaming directly.
+ */
+void hchacha_block_generic(const struct chacha_state *state,
+ u32 out[HCHACHA_OUT_WORDS], int nrounds)
+{
+ struct chacha_state permuted_state = *state;
+
+ chacha_permute(&permuted_state, nrounds);
+
+ memcpy(&out[0], &permuted_state.x[0], 16);
+ memcpy(&out[4], &permuted_state.x[12], 16);
+}
+EXPORT_SYMBOL(hchacha_block_generic);
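chacha_permute() above open-codes eight quarter-rounds per double round for speed. The same operation, factored out purely as an illustration of what each group of four lines computes:

/* One ChaCha quarter-round, equivalent to a column/diagonal step above. */
static inline void chacha_quarterround(u32 *a, u32 *b, u32 *c, u32 *d)
{
        *a += *b; *d = rol32(*d ^ *a, 16);
        *c += *d; *b = rol32(*b ^ *c, 12);
        *a += *b; *d = rol32(*d ^ *a, 8);
        *c += *d; *b = rol32(*b ^ *c, 7);
}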
diff --git a/lib/crypto/chacha.c b/lib/crypto/chacha.c
index 77f68de71066..e0c7cb4af318 100644
--- a/lib/crypto/chacha.c
+++ b/lib/crypto/chacha.c
@@ -1,114 +1,70 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * The "hash function" used as the core of the ChaCha stream cipher (RFC7539)
+ * The ChaCha stream cipher (RFC7539)
*
* Copyright (C) 2015 Martin Willi
*/
+#include <crypto/algapi.h> // for crypto_xor_cpy
#include <crypto/chacha.h>
-#include <linux/bitops.h>
-#include <linux/bug.h>
#include <linux/export.h>
#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/unaligned.h>
+#include <linux/module.h>
-static void chacha_permute(struct chacha_state *state, int nrounds)
+static void __maybe_unused
+chacha_crypt_generic(struct chacha_state *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
{
- u32 *x = state->x;
- int i;
-
- /* whitelist the allowed round counts */
- WARN_ON_ONCE(nrounds != 20 && nrounds != 12);
-
- for (i = 0; i < nrounds; i += 2) {
- x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 16);
- x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 16);
- x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 16);
- x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 16);
-
- x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 12);
- x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 12);
- x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 12);
- x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 12);
-
- x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 8);
- x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 8);
- x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 8);
- x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 8);
-
- x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 7);
- x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 7);
- x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 7);
- x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 7);
-
- x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 16);
- x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 16);
- x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 16);
- x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 16);
-
- x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 12);
- x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 12);
- x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 12);
- x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 12);
-
- x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 8);
- x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 8);
- x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 8);
- x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 8);
-
- x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 7);
- x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 7);
- x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 7);
- x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 7);
+ /* aligned to potentially speed up crypto_xor() */
+ u8 stream[CHACHA_BLOCK_SIZE] __aligned(sizeof(long));
+
+ while (bytes >= CHACHA_BLOCK_SIZE) {
+ chacha_block_generic(state, stream, nrounds);
+ crypto_xor_cpy(dst, src, stream, CHACHA_BLOCK_SIZE);
+ bytes -= CHACHA_BLOCK_SIZE;
+ dst += CHACHA_BLOCK_SIZE;
+ src += CHACHA_BLOCK_SIZE;
+ }
+ if (bytes) {
+ chacha_block_generic(state, stream, nrounds);
+ crypto_xor_cpy(dst, src, stream, bytes);
}
}
-/**
- * chacha_block_generic - generate one keystream block and increment block counter
- * @state: input state matrix
- * @out: output keystream block
- * @nrounds: number of rounds (20 or 12; 20 is recommended)
- *
- * This is the ChaCha core, a function from 64-byte strings to 64-byte strings.
- * The caller has already converted the endianness of the input. This function
- * also handles incrementing the block counter in the input matrix.
- */
-void chacha_block_generic(struct chacha_state *state,
- u8 out[CHACHA_BLOCK_SIZE], int nrounds)
-{
- struct chacha_state permuted_state = *state;
- int i;
-
- chacha_permute(&permuted_state, nrounds);
-
- for (i = 0; i < ARRAY_SIZE(state->x); i++)
- put_unaligned_le32(permuted_state.x[i] + state->x[i],
- &out[i * sizeof(u32)]);
+#ifdef CONFIG_CRYPTO_LIB_CHACHA_ARCH
+#include "chacha.h" /* $(SRCARCH)/chacha.h */
+#else
+#define chacha_crypt_arch chacha_crypt_generic
+#define hchacha_block_arch hchacha_block_generic
+#endif
- state->x[12]++;
+void chacha_crypt(struct chacha_state *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
+{
+ chacha_crypt_arch(state, dst, src, bytes, nrounds);
}
-EXPORT_SYMBOL(chacha_block_generic);
+EXPORT_SYMBOL_GPL(chacha_crypt);
-/**
- * hchacha_block_generic - abbreviated ChaCha core, for XChaCha
- * @state: input state matrix
- * @out: the output words
- * @nrounds: number of rounds (20 or 12; 20 is recommended)
- *
- * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step
- * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf). HChaCha
- * skips the final addition of the initial state, and outputs only certain words
- * of the state. It should not be used for streaming directly.
- */
-void hchacha_block_generic(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds)
+void hchacha_block(const struct chacha_state *state,
+ u32 out[HCHACHA_OUT_WORDS], int nrounds)
{
- struct chacha_state permuted_state = *state;
+ hchacha_block_arch(state, out, nrounds);
+}
+EXPORT_SYMBOL_GPL(hchacha_block);
- chacha_permute(&permuted_state, nrounds);
+#ifdef chacha_mod_init_arch
+static int __init chacha_mod_init(void)
+{
+ chacha_mod_init_arch();
+ return 0;
+}
+subsys_initcall(chacha_mod_init);
- memcpy(&out[0], &permuted_state.x[0], 16);
- memcpy(&out[4], &permuted_state.x[12], 16);
+static void __exit chacha_mod_exit(void)
+{
}
-EXPORT_SYMBOL(hchacha_block_generic);
+module_exit(chacha_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("ChaCha stream cipher (RFC7539)");
+MODULE_LICENSE("GPL");
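For reference, a hedged sketch of driving the exported chacha_crypt() above; chacha_init() and its layout (state, key words, 16-byte IV with the block counter in the first four bytes) are assumed from <crypto/chacha.h>:

/* Sketch only: encrypt or decrypt a buffer in place with ChaCha20. */
static void chacha20_crypt_buffer(const u32 key[8], const u8 iv[16],
                                  u8 *buf, unsigned int len)
{
        struct chacha_state state;

        chacha_init(&state, key, iv);           /* assumed helper */
        chacha_crypt(&state, buf, buf, len, 20);
        memzero_explicit(&state, sizeof(state));
}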
diff --git a/lib/crypto/curve25519-generic.c b/lib/crypto/curve25519-generic.c
deleted file mode 100644
index f8aa70c9f559..000000000000
--- a/lib/crypto/curve25519-generic.c
+++ /dev/null
@@ -1,25 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- *
- * This is an implementation of the Curve25519 ECDH algorithm, using either
- * a 32-bit implementation or a 64-bit implementation with 128-bit integers,
- * depending on what is supported by the target compiler.
- *
- * Information: https://cr.yp.to/ecdh.html
- */
-
-#include <crypto/curve25519.h>
-#include <linux/export.h>
-#include <linux/module.h>
-
-const u8 curve25519_null_point[CURVE25519_KEY_SIZE] __aligned(32) = { 0 };
-const u8 curve25519_base_point[CURVE25519_KEY_SIZE] __aligned(32) = { 9 };
-
-EXPORT_SYMBOL(curve25519_null_point);
-EXPORT_SYMBOL(curve25519_base_point);
-EXPORT_SYMBOL(curve25519_generic);
-
-MODULE_LICENSE("GPL v2");
-MODULE_DESCRIPTION("Curve25519 scalar multiplication");
-MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
diff --git a/lib/crypto/curve25519.c b/lib/crypto/curve25519.c
index 6850b76a80c9..01e265dfbcd9 100644
--- a/lib/crypto/curve25519.c
+++ b/lib/crypto/curve25519.c
@@ -2,32 +2,77 @@
/*
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*
- * This is an implementation of the Curve25519 ECDH algorithm, using either
- * a 32-bit implementation or a 64-bit implementation with 128-bit integers,
+ * This is an implementation of the Curve25519 ECDH algorithm, using either an
+ * architecture-optimized implementation or a generic implementation. The
+ * generic implementation is either 32-bit, or 64-bit with 128-bit integers,
* depending on what is supported by the target compiler.
*
* Information: https://cr.yp.to/ecdh.html
*/
#include <crypto/curve25519.h>
-#include <linux/module.h>
+#include <crypto/utils.h>
+#include <linux/export.h>
#include <linux/init.h>
+#include <linux/module.h>
-static int __init curve25519_init(void)
+static const u8 curve25519_null_point[CURVE25519_KEY_SIZE] __aligned(32) = { 0 };
+static const u8 curve25519_base_point[CURVE25519_KEY_SIZE] __aligned(32) = { 9 };
+
+#ifdef CONFIG_CRYPTO_LIB_CURVE25519_ARCH
+#include "curve25519.h" /* $(SRCARCH)/curve25519.h */
+#else
+static void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE],
+ const u8 basepoint[CURVE25519_KEY_SIZE])
{
- if (IS_ENABLED(CONFIG_CRYPTO_SELFTESTS) &&
- WARN_ON(!curve25519_selftest()))
- return -ENODEV;
- return 0;
+ curve25519_generic(mypublic, secret, basepoint);
+}
+
+static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE])
+{
+ curve25519_generic(pub, secret, curve25519_base_point);
+}
+#endif
+
+bool __must_check
+curve25519(u8 mypublic[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE],
+ const u8 basepoint[CURVE25519_KEY_SIZE])
+{
+ curve25519_arch(mypublic, secret, basepoint);
+ return crypto_memneq(mypublic, curve25519_null_point,
+ CURVE25519_KEY_SIZE);
+}
+EXPORT_SYMBOL(curve25519);
+
+bool __must_check
+curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE])
+{
+ if (unlikely(!crypto_memneq(secret, curve25519_null_point,
+ CURVE25519_KEY_SIZE)))
+ return false;
+ curve25519_base_arch(pub, secret);
+ return crypto_memneq(pub, curve25519_null_point, CURVE25519_KEY_SIZE);
}
+EXPORT_SYMBOL(curve25519_generate_public);
-static void __exit curve25519_exit(void)
+#ifdef curve25519_mod_init_arch
+static int __init curve25519_mod_init(void)
{
+ curve25519_mod_init_arch();
+ return 0;
}
+subsys_initcall(curve25519_mod_init);
-module_init(curve25519_init);
-module_exit(curve25519_exit);
+static void __exit curve25519_mod_exit(void)
+{
+}
+module_exit(curve25519_mod_exit);
+#endif
MODULE_LICENSE("GPL v2");
-MODULE_DESCRIPTION("Curve25519 scalar multiplication");
+MODULE_DESCRIPTION("Curve25519 algorithm");
MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
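A hedged usage sketch of the two entry points defined above, from one side of an ECDH exchange; curve25519_generate_secret() is assumed from <crypto/curve25519.h>, and both boolean returns must be checked because an all-zero result signals a degenerate point:

static int ecdh_shared_secret(u8 shared[CURVE25519_KEY_SIZE],
                              u8 my_pub[CURVE25519_KEY_SIZE],
                              const u8 peer_pub[CURVE25519_KEY_SIZE])
{
        u8 secret[CURVE25519_KEY_SIZE];
        int err = -EKEYREJECTED;

        curve25519_generate_secret(secret);     /* random bytes, clamped */
        if (curve25519_generate_public(my_pub, secret) &&
            curve25519(shared, secret, peer_pub))
                err = 0;
        memzero_explicit(secret, sizeof(secret));
        return err;
}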
diff --git a/lib/crypto/libchacha.c b/lib/crypto/libchacha.c
deleted file mode 100644
index 26862ad90a96..000000000000
--- a/lib/crypto/libchacha.c
+++ /dev/null
@@ -1,35 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * The ChaCha stream cipher (RFC7539)
- *
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <crypto/algapi.h> // for crypto_xor_cpy
-#include <crypto/chacha.h>
-#include <linux/export.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-void chacha_crypt_generic(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- /* aligned to potentially speed up crypto_xor() */
- u8 stream[CHACHA_BLOCK_SIZE] __aligned(sizeof(long));
-
- while (bytes >= CHACHA_BLOCK_SIZE) {
- chacha_block_generic(state, stream, nrounds);
- crypto_xor_cpy(dst, src, stream, CHACHA_BLOCK_SIZE);
- bytes -= CHACHA_BLOCK_SIZE;
- dst += CHACHA_BLOCK_SIZE;
- src += CHACHA_BLOCK_SIZE;
- }
- if (bytes) {
- chacha_block_generic(state, stream, nrounds);
- crypto_xor_cpy(dst, src, stream, bytes);
- }
-}
-EXPORT_SYMBOL(chacha_crypt_generic);
-
-MODULE_DESCRIPTION("ChaCha stream cipher (RFC7539)");
-MODULE_LICENSE("GPL");
diff --git a/lib/crypto/md5.c b/lib/crypto/md5.c
new file mode 100644
index 000000000000..c0610ea1370e
--- /dev/null
+++ b/lib/crypto/md5.c
@@ -0,0 +1,322 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * MD5 and HMAC-MD5 library functions
+ *
+ * md5_block_generic() is derived from cryptoapi implementation, originally
+ * based on the public domain implementation written by Colin Plumb in 1993.
+ *
+ * Copyright (c) Cryptoapi developers.
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ * Copyright 2025 Google LLC
+ */
+
+#include <crypto/hmac.h>
+#include <crypto/md5.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+#include <linux/wordpart.h>
+
+static const struct md5_block_state md5_iv = {
+ .h = { MD5_H0, MD5_H1, MD5_H2, MD5_H3 },
+};
+
+#define F1(x, y, z) (z ^ (x & (y ^ z)))
+#define F2(x, y, z) F1(z, x, y)
+#define F3(x, y, z) (x ^ y ^ z)
+#define F4(x, y, z) (y ^ (x | ~z))
+
+#define MD5STEP(f, w, x, y, z, in, s) \
+ (w += f(x, y, z) + in, w = (w << s | w >> (32 - s)) + x)
+
+static void md5_block_generic(struct md5_block_state *state,
+ const u8 data[MD5_BLOCK_SIZE])
+{
+ u32 in[MD5_BLOCK_WORDS];
+ u32 a, b, c, d;
+
+ memcpy(in, data, MD5_BLOCK_SIZE);
+ le32_to_cpu_array(in, ARRAY_SIZE(in));
+
+ a = state->h[0];
+ b = state->h[1];
+ c = state->h[2];
+ d = state->h[3];
+
+ MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
+ MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
+ MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
+ MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
+ MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
+ MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
+ MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
+ MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
+ MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
+ MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
+ MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
+ MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
+ MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
+ MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
+ MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
+ MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
+
+ MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
+ MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
+ MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
+ MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
+ MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
+ MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
+ MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
+ MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
+ MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
+ MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
+ MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
+ MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
+ MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
+ MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
+ MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
+ MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
+
+ MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
+ MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
+ MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
+ MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
+ MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
+ MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
+ MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
+ MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
+ MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
+ MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
+ MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
+ MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
+ MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
+ MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
+ MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
+ MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
+
+ MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
+ MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
+ MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
+ MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
+ MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
+ MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
+ MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
+ MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
+ MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
+ MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
+ MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
+ MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
+ MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
+ MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
+ MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
+ MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
+
+ state->h[0] += a;
+ state->h[1] += b;
+ state->h[2] += c;
+ state->h[3] += d;
+}
+
+static void __maybe_unused md5_blocks_generic(struct md5_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ do {
+ md5_block_generic(state, data);
+ data += MD5_BLOCK_SIZE;
+ } while (--nblocks);
+}
+
+#ifdef CONFIG_CRYPTO_LIB_MD5_ARCH
+#include "md5.h" /* $(SRCARCH)/md5.h */
+#else
+#define md5_blocks md5_blocks_generic
+#endif
+
+void md5_init(struct md5_ctx *ctx)
+{
+ ctx->state = md5_iv;
+ ctx->bytecount = 0;
+}
+EXPORT_SYMBOL_GPL(md5_init);
+
+void md5_update(struct md5_ctx *ctx, const u8 *data, size_t len)
+{
+ size_t partial = ctx->bytecount % MD5_BLOCK_SIZE;
+
+ ctx->bytecount += len;
+
+ if (partial + len >= MD5_BLOCK_SIZE) {
+ size_t nblocks;
+
+ if (partial) {
+ size_t l = MD5_BLOCK_SIZE - partial;
+
+ memcpy(&ctx->buf[partial], data, l);
+ data += l;
+ len -= l;
+
+ md5_blocks(&ctx->state, ctx->buf, 1);
+ }
+
+ nblocks = len / MD5_BLOCK_SIZE;
+ len %= MD5_BLOCK_SIZE;
+
+ if (nblocks) {
+ md5_blocks(&ctx->state, data, nblocks);
+ data += nblocks * MD5_BLOCK_SIZE;
+ }
+ partial = 0;
+ }
+ if (len)
+ memcpy(&ctx->buf[partial], data, len);
+}
+EXPORT_SYMBOL_GPL(md5_update);
+
+static void __md5_final(struct md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE])
+{
+ u64 bitcount = ctx->bytecount << 3;
+ size_t partial = ctx->bytecount % MD5_BLOCK_SIZE;
+
+ ctx->buf[partial++] = 0x80;
+ if (partial > MD5_BLOCK_SIZE - 8) {
+ memset(&ctx->buf[partial], 0, MD5_BLOCK_SIZE - partial);
+ md5_blocks(&ctx->state, ctx->buf, 1);
+ partial = 0;
+ }
+ memset(&ctx->buf[partial], 0, MD5_BLOCK_SIZE - 8 - partial);
+ *(__le64 *)&ctx->buf[MD5_BLOCK_SIZE - 8] = cpu_to_le64(bitcount);
+ md5_blocks(&ctx->state, ctx->buf, 1);
+
+ cpu_to_le32_array(ctx->state.h, ARRAY_SIZE(ctx->state.h));
+ memcpy(out, ctx->state.h, MD5_DIGEST_SIZE);
+}
+
+void md5_final(struct md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE])
+{
+ __md5_final(ctx, out);
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL_GPL(md5_final);
+
+void md5(const u8 *data, size_t len, u8 out[MD5_DIGEST_SIZE])
+{
+ struct md5_ctx ctx;
+
+ md5_init(&ctx);
+ md5_update(&ctx, data, len);
+ md5_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(md5);
+
+static void __hmac_md5_preparekey(struct md5_block_state *istate,
+ struct md5_block_state *ostate,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ union {
+ u8 b[MD5_BLOCK_SIZE];
+ unsigned long w[MD5_BLOCK_SIZE / sizeof(unsigned long)];
+ } derived_key = { 0 };
+
+ if (unlikely(raw_key_len > MD5_BLOCK_SIZE))
+ md5(raw_key, raw_key_len, derived_key.b);
+ else
+ memcpy(derived_key.b, raw_key, raw_key_len);
+
+ for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+ derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE);
+ *istate = md5_iv;
+ md5_blocks(istate, derived_key.b, 1);
+
+ for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+ derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^
+ HMAC_IPAD_VALUE);
+ *ostate = md5_iv;
+ md5_blocks(ostate, derived_key.b, 1);
+
+ memzero_explicit(&derived_key, sizeof(derived_key));
+}
+
+void hmac_md5_preparekey(struct hmac_md5_key *key,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ __hmac_md5_preparekey(&key->istate, &key->ostate, raw_key, raw_key_len);
+}
+EXPORT_SYMBOL_GPL(hmac_md5_preparekey);
+
+void hmac_md5_init(struct hmac_md5_ctx *ctx, const struct hmac_md5_key *key)
+{
+ ctx->hash_ctx.state = key->istate;
+ ctx->hash_ctx.bytecount = MD5_BLOCK_SIZE;
+ ctx->ostate = key->ostate;
+}
+EXPORT_SYMBOL_GPL(hmac_md5_init);
+
+void hmac_md5_init_usingrawkey(struct hmac_md5_ctx *ctx,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ __hmac_md5_preparekey(&ctx->hash_ctx.state, &ctx->ostate,
+ raw_key, raw_key_len);
+ ctx->hash_ctx.bytecount = MD5_BLOCK_SIZE;
+}
+EXPORT_SYMBOL_GPL(hmac_md5_init_usingrawkey);
+
+void hmac_md5_final(struct hmac_md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE])
+{
+ /* Generate the padded input for the outer hash in ctx->hash_ctx.buf. */
+ __md5_final(&ctx->hash_ctx, ctx->hash_ctx.buf);
+ memset(&ctx->hash_ctx.buf[MD5_DIGEST_SIZE], 0,
+ MD5_BLOCK_SIZE - MD5_DIGEST_SIZE);
+ ctx->hash_ctx.buf[MD5_DIGEST_SIZE] = 0x80;
+ *(__le64 *)&ctx->hash_ctx.buf[MD5_BLOCK_SIZE - 8] =
+ cpu_to_le64(8 * (MD5_BLOCK_SIZE + MD5_DIGEST_SIZE));
+
+ /* Compute the outer hash, which gives the HMAC value. */
+ md5_blocks(&ctx->ostate, ctx->hash_ctx.buf, 1);
+ cpu_to_le32_array(ctx->ostate.h, ARRAY_SIZE(ctx->ostate.h));
+ memcpy(out, ctx->ostate.h, MD5_DIGEST_SIZE);
+
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL_GPL(hmac_md5_final);
+
+void hmac_md5(const struct hmac_md5_key *key,
+ const u8 *data, size_t data_len, u8 out[MD5_DIGEST_SIZE])
+{
+ struct hmac_md5_ctx ctx;
+
+ hmac_md5_init(&ctx, key);
+ hmac_md5_update(&ctx, data, data_len);
+ hmac_md5_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_md5);
+
+void hmac_md5_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+ const u8 *data, size_t data_len,
+ u8 out[MD5_DIGEST_SIZE])
+{
+ struct hmac_md5_ctx ctx;
+
+ hmac_md5_init_usingrawkey(&ctx, raw_key, raw_key_len);
+ hmac_md5_update(&ctx, data, data_len);
+ hmac_md5_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_md5_usingrawkey);
+
+#ifdef md5_mod_init_arch
+static int __init md5_mod_init(void)
+{
+ md5_mod_init_arch();
+ return 0;
+}
+subsys_initcall(md5_mod_init);
+
+static void __exit md5_mod_exit(void)
+{
+}
+module_exit(md5_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("MD5 and HMAC-MD5 library functions");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/mips/Kconfig b/lib/crypto/mips/Kconfig
deleted file mode 100644
index 0670a170c1be..000000000000
--- a/lib/crypto/mips/Kconfig
+++ /dev/null
@@ -1,12 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config CRYPTO_CHACHA_MIPS
- tristate
- depends on CPU_MIPS32_R2
- default CRYPTO_LIB_CHACHA
- select CRYPTO_ARCH_HAVE_LIB_CHACHA
-
-config CRYPTO_POLY1305_MIPS
- tristate
- default CRYPTO_LIB_POLY1305
- select CRYPTO_ARCH_HAVE_LIB_POLY1305
diff --git a/lib/crypto/mips/Makefile b/lib/crypto/mips/Makefile
deleted file mode 100644
index 804488c7aded..000000000000
--- a/lib/crypto/mips/Makefile
+++ /dev/null
@@ -1,19 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o
-chacha-mips-y := chacha-core.o chacha-glue.o
-AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots
-
-obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o
-poly1305-mips-y := poly1305-core.o poly1305-glue.o
-
-perlasm-flavour-$(CONFIG_32BIT) := o32
-perlasm-flavour-$(CONFIG_64BIT) := 64
-
-quiet_cmd_perlasm = PERLASM $@
- cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@)
-
-$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE
- $(call if_changed,perlasm)
-
-targets += poly1305-core.S
diff --git a/lib/crypto/mips/chacha-glue.c b/lib/crypto/mips/chacha-glue.c
deleted file mode 100644
index 88c097594eb0..000000000000
--- a/lib/crypto/mips/chacha-glue.c
+++ /dev/null
@@ -1,29 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * ChaCha and HChaCha functions (MIPS optimized)
- *
- * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- */
-
-#include <crypto/chacha.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-asmlinkage void chacha_crypt_arch(struct chacha_state *state,
- u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds);
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-asmlinkage void hchacha_block_arch(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds);
-EXPORT_SYMBOL(hchacha_block_arch);
-
-bool chacha_is_arch_optimized(void)
-{
- return true;
-}
-EXPORT_SYMBOL(chacha_is_arch_optimized);
-
-MODULE_DESCRIPTION("ChaCha and HChaCha functions (MIPS optimized)");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
diff --git a/lib/crypto/mips/chacha.h b/lib/crypto/mips/chacha.h
new file mode 100644
index 000000000000..0c18c0dc2a40
--- /dev/null
+++ b/lib/crypto/mips/chacha.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * ChaCha and HChaCha functions (MIPS optimized)
+ *
+ * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/kernel.h>
+
+asmlinkage void chacha_crypt_arch(struct chacha_state *state,
+ u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds);
+asmlinkage void hchacha_block_arch(const struct chacha_state *state,
+ u32 out[HCHACHA_OUT_WORDS], int nrounds);
diff --git a/lib/crypto/mips/md5.h b/lib/crypto/mips/md5.h
new file mode 100644
index 000000000000..e08e28aeffa4
--- /dev/null
+++ b/lib/crypto/mips/md5.h
@@ -0,0 +1,65 @@
+/*
+ * Cryptographic API.
+ *
+ * MD5 Message Digest Algorithm (RFC1321).
+ *
+ * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>.
+ *
+ * Based on crypto/md5.c, which is:
+ *
+ * Derived from cryptoapi implementation, originally based on the
+ * public domain implementation written by Colin Plumb in 1993.
+ *
+ * Copyright (c) Cryptoapi developers.
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <asm/octeon/crypto.h>
+#include <asm/octeon/octeon.h>
+
+/*
+ * We pass everything as 64-bit. OCTEON can handle misaligned data.
+ */
+
+static void md5_blocks(struct md5_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ struct octeon_cop2_state cop2_state;
+ u64 *state64 = (u64 *)state;
+ unsigned long flags;
+
+ if (!octeon_has_crypto())
+ return md5_blocks_generic(state, data, nblocks);
+
+ cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
+
+ flags = octeon_crypto_enable(&cop2_state);
+ write_octeon_64bit_hash_dword(state64[0], 0);
+ write_octeon_64bit_hash_dword(state64[1], 1);
+
+ do {
+ const u64 *block = (const u64 *)data;
+
+ write_octeon_64bit_block_dword(block[0], 0);
+ write_octeon_64bit_block_dword(block[1], 1);
+ write_octeon_64bit_block_dword(block[2], 2);
+ write_octeon_64bit_block_dword(block[3], 3);
+ write_octeon_64bit_block_dword(block[4], 4);
+ write_octeon_64bit_block_dword(block[5], 5);
+ write_octeon_64bit_block_dword(block[6], 6);
+ octeon_md5_start(block[7]);
+
+ data += MD5_BLOCK_SIZE;
+ } while (--nblocks);
+
+ state64[0] = read_octeon_64bit_hash_dword(0);
+ state64[1] = read_octeon_64bit_hash_dword(1);
+ octeon_crypto_disable(&cop2_state, flags);
+
+ le32_to_cpu_array(state->h, ARRAY_SIZE(state->h));
+}
diff --git a/lib/crypto/mips/poly1305-glue.c b/lib/crypto/mips/poly1305-glue.c
deleted file mode 100644
index 764a38a65200..000000000000
--- a/lib/crypto/mips/poly1305-glue.c
+++ /dev/null
@@ -1,33 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS
- *
- * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
- */
-
-#include <crypto/internal/poly1305.h>
-#include <linux/cpufeature.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/unaligned.h>
-
-asmlinkage void poly1305_block_init_arch(
- struct poly1305_block_state *state,
- const u8 raw_key[POLY1305_BLOCK_SIZE]);
-EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
-asmlinkage void poly1305_blocks_arch(struct poly1305_block_state *state,
- const u8 *src, u32 len, u32 hibit);
-EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
-asmlinkage void poly1305_emit_arch(const struct poly1305_state *state,
- u8 digest[POLY1305_DIGEST_SIZE],
- const u32 nonce[4]);
-EXPORT_SYMBOL_GPL(poly1305_emit_arch);
-
-bool poly1305_is_arch_optimized(void)
-{
- return true;
-}
-EXPORT_SYMBOL(poly1305_is_arch_optimized);
-
-MODULE_DESCRIPTION("Poly1305 transform (MIPS accelerated");
-MODULE_LICENSE("GPL v2");
diff --git a/lib/crypto/mips/poly1305-mips.pl b/lib/crypto/mips/poly1305-mips.pl
index 399f10c3e385..71347f34f4f9 100644
--- a/lib/crypto/mips/poly1305-mips.pl
+++ b/lib/crypto/mips/poly1305-mips.pl
@@ -93,9 +93,7 @@ $code.=<<___;
#endif
#ifdef __KERNEL__
-# define poly1305_init poly1305_block_init_arch
-# define poly1305_blocks poly1305_blocks_arch
-# define poly1305_emit poly1305_emit_arch
+# define poly1305_init poly1305_block_init
#endif
#if defined(__MIPSEB__) && !defined(MIPSEB)
@@ -565,9 +563,7 @@ $code.=<<___;
#endif
#ifdef __KERNEL__
-# define poly1305_init poly1305_block_init_arch
-# define poly1305_blocks poly1305_blocks_arch
-# define poly1305_emit poly1305_emit_arch
+# define poly1305_init poly1305_block_init
#endif
#if defined(__MIPSEB__) && !defined(MIPSEB)
diff --git a/lib/crypto/mips/poly1305.h b/lib/crypto/mips/poly1305.h
new file mode 100644
index 000000000000..85de450f1a93
--- /dev/null
+++ b/lib/crypto/mips/poly1305.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS
+ *
+ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+asmlinkage void poly1305_block_init(struct poly1305_block_state *state,
+ const u8 raw_key[POLY1305_BLOCK_SIZE]);
+asmlinkage void poly1305_blocks(struct poly1305_block_state *state,
+ const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_emit(const struct poly1305_state *state,
+ u8 digest[POLY1305_DIGEST_SIZE],
+ const u32 nonce[4]);
diff --git a/lib/crypto/poly1305-generic.c b/lib/crypto/poly1305-generic.c
deleted file mode 100644
index 71a16c5c538b..000000000000
--- a/lib/crypto/poly1305-generic.c
+++ /dev/null
@@ -1,25 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Poly1305 authenticator algorithm, RFC7539
- *
- * Copyright (C) 2015 Martin Willi
- *
- * Based on public domain code by Andrew Moon and Daniel J. Bernstein.
- */
-
-#include <crypto/internal/poly1305.h>
-#include <linux/export.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-void poly1305_block_init_generic(struct poly1305_block_state *desc,
- const u8 raw_key[POLY1305_BLOCK_SIZE])
-{
- poly1305_core_init(&desc->h);
- poly1305_core_setkey(&desc->core_r, raw_key);
-}
-EXPORT_SYMBOL_GPL(poly1305_block_init_generic);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
-MODULE_DESCRIPTION("Poly1305 algorithm (generic implementation)");
diff --git a/lib/crypto/poly1305.c b/lib/crypto/poly1305.c
index a6dc182b6c22..f313ccc4b4dd 100644
--- a/lib/crypto/poly1305.c
+++ b/lib/crypto/poly1305.c
@@ -7,7 +7,6 @@
* Based on public domain code by Andrew Moon and Daniel J. Bernstein.
*/
-#include <crypto/internal/blockhash.h>
#include <crypto/internal/poly1305.h>
#include <linux/export.h>
#include <linux/kernel.h>
@@ -15,6 +14,14 @@
#include <linux/string.h>
#include <linux/unaligned.h>
+#ifdef CONFIG_CRYPTO_LIB_POLY1305_ARCH
+#include "poly1305.h" /* $(SRCARCH)/poly1305.h */
+#else
+#define poly1305_block_init poly1305_block_init_generic
+#define poly1305_blocks poly1305_blocks_generic
+#define poly1305_emit poly1305_emit_generic
+#endif
+
void poly1305_init(struct poly1305_desc_ctx *desc,
const u8 key[POLY1305_KEY_SIZE])
{
@@ -23,28 +30,40 @@ void poly1305_init(struct poly1305_desc_ctx *desc,
desc->s[2] = get_unaligned_le32(key + 24);
desc->s[3] = get_unaligned_le32(key + 28);
desc->buflen = 0;
- if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305))
- poly1305_block_init_arch(&desc->state, key);
- else
- poly1305_block_init_generic(&desc->state, key);
+ poly1305_block_init(&desc->state, key);
}
EXPORT_SYMBOL(poly1305_init);
-static inline void poly1305_blocks(struct poly1305_block_state *state,
- const u8 *src, unsigned int len)
-{
- if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305))
- poly1305_blocks_arch(state, src, len, 1);
- else
- poly1305_blocks_generic(state, src, len, 1);
-}
-
void poly1305_update(struct poly1305_desc_ctx *desc,
const u8 *src, unsigned int nbytes)
{
- desc->buflen = BLOCK_HASH_UPDATE(poly1305_blocks, &desc->state,
- src, nbytes, POLY1305_BLOCK_SIZE,
- desc->buf, desc->buflen);
+ if (desc->buflen + nbytes >= POLY1305_BLOCK_SIZE) {
+ unsigned int bulk_len;
+
+ if (desc->buflen) {
+ unsigned int l = POLY1305_BLOCK_SIZE - desc->buflen;
+
+ memcpy(&desc->buf[desc->buflen], src, l);
+ src += l;
+ nbytes -= l;
+
+ poly1305_blocks(&desc->state, desc->buf,
+ POLY1305_BLOCK_SIZE, 1);
+ desc->buflen = 0;
+ }
+
+ bulk_len = round_down(nbytes, POLY1305_BLOCK_SIZE);
+ nbytes %= POLY1305_BLOCK_SIZE;
+
+ if (bulk_len) {
+ poly1305_blocks(&desc->state, src, bulk_len, 1);
+ src += bulk_len;
+ }
+ }
+ if (nbytes) {
+ memcpy(&desc->buf[desc->buflen], src, nbytes);
+ desc->buflen += nbytes;
+ }
}
EXPORT_SYMBOL(poly1305_update);
@@ -54,22 +73,28 @@ void poly1305_final(struct poly1305_desc_ctx *desc, u8 *dst)
desc->buf[desc->buflen++] = 1;
memset(desc->buf + desc->buflen, 0,
POLY1305_BLOCK_SIZE - desc->buflen);
- if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305))
- poly1305_blocks_arch(&desc->state, desc->buf,
- POLY1305_BLOCK_SIZE, 0);
- else
- poly1305_blocks_generic(&desc->state, desc->buf,
- POLY1305_BLOCK_SIZE, 0);
+ poly1305_blocks(&desc->state, desc->buf, POLY1305_BLOCK_SIZE,
+ 0);
}
- if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305))
- poly1305_emit_arch(&desc->state.h, dst, desc->s);
- else
- poly1305_emit_generic(&desc->state.h, dst, desc->s);
+ poly1305_emit(&desc->state.h, dst, desc->s);
*desc = (struct poly1305_desc_ctx){};
}
EXPORT_SYMBOL(poly1305_final);
+#ifdef poly1305_mod_init_arch
+static int __init poly1305_mod_init(void)
+{
+ poly1305_mod_init_arch();
+ return 0;
+}
+subsys_initcall(poly1305_mod_init);
+
+static void __exit poly1305_mod_exit(void)
+{
+}
+module_exit(poly1305_mod_exit);
+#endif
+
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
MODULE_DESCRIPTION("Poly1305 authenticator algorithm, RFC7539");
diff --git a/lib/crypto/powerpc/Kconfig b/lib/crypto/powerpc/Kconfig
deleted file mode 100644
index 2eaeb7665a6a..000000000000
--- a/lib/crypto/powerpc/Kconfig
+++ /dev/null
@@ -1,16 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config CRYPTO_CHACHA20_P10
- tristate
- depends on PPC64 && CPU_LITTLE_ENDIAN && VSX
- default CRYPTO_LIB_CHACHA
- select CRYPTO_LIB_CHACHA_GENERIC
- select CRYPTO_ARCH_HAVE_LIB_CHACHA
-
-config CRYPTO_POLY1305_P10
- tristate
- depends on PPC64 && CPU_LITTLE_ENDIAN && VSX
- depends on BROKEN # Needs to be fixed to work in softirq context
- default CRYPTO_LIB_POLY1305
- select CRYPTO_ARCH_HAVE_LIB_POLY1305
- select CRYPTO_LIB_POLY1305_GENERIC
diff --git a/lib/crypto/powerpc/Makefile b/lib/crypto/powerpc/Makefile
deleted file mode 100644
index 5709ae14258a..000000000000
--- a/lib/crypto/powerpc/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o
-chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o
-
-obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o
-poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o
diff --git a/lib/crypto/powerpc/chacha-p10-glue.c b/lib/crypto/powerpc/chacha.h
index fcd23c6f1590..1df6e1ce31c4 100644
--- a/lib/crypto/powerpc/chacha-p10-glue.c
+++ b/lib/crypto/powerpc/chacha.h
@@ -1,14 +1,12 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* ChaCha stream cipher (P10 accelerated)
*
* Copyright 2023- IBM Corp. All rights reserved.
*/
-#include <crypto/chacha.h>
#include <crypto/internal/simd.h>
#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/cpufeature.h>
#include <linux/sizes.h>
#include <asm/simd.h>
@@ -48,15 +46,10 @@ static void chacha_p10_do_8x(struct chacha_state *state, u8 *dst, const u8 *src,
chacha_crypt_generic(state, dst, src, bytes, nrounds);
}
-void hchacha_block_arch(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds)
-{
- hchacha_block_generic(state, out, nrounds);
-}
-EXPORT_SYMBOL(hchacha_block_arch);
+#define hchacha_block_arch hchacha_block_generic /* not implemented yet */
-void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
+static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
+ const u8 *src, unsigned int bytes, int nrounds)
{
if (!static_branch_likely(&have_p10) || bytes <= CHACHA_BLOCK_SIZE ||
!crypto_simd_usable())
@@ -74,27 +67,10 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
dst += todo;
} while (bytes);
}
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-bool chacha_is_arch_optimized(void)
-{
- return static_key_enabled(&have_p10);
-}
-EXPORT_SYMBOL(chacha_is_arch_optimized);
-static int __init chacha_p10_init(void)
+#define chacha_mod_init_arch chacha_mod_init_arch
+static void chacha_mod_init_arch(void)
{
if (cpu_has_feature(CPU_FTR_ARCH_31))
static_branch_enable(&have_p10);
- return 0;
}
-subsys_initcall(chacha_p10_init);
-
-static void __exit chacha_p10_exit(void)
-{
-}
-module_exit(chacha_p10_exit);
-
-MODULE_DESCRIPTION("ChaCha stream cipher (P10 accelerated)");
-MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com>");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/powerpc/crypto/curve25519-ppc64le_asm.S b/lib/crypto/powerpc/curve25519-ppc64le_asm.S
index 06c1febe24b9..06c1febe24b9 100644
--- a/arch/powerpc/crypto/curve25519-ppc64le_asm.S
+++ b/lib/crypto/powerpc/curve25519-ppc64le_asm.S
diff --git a/arch/powerpc/crypto/curve25519-ppc64le-core.c b/lib/crypto/powerpc/curve25519.h
index f7810be0b292..dee6234c48e9 100644
--- a/arch/powerpc/crypto/curve25519-ppc64le-core.c
+++ b/lib/crypto/powerpc/curve25519.h
@@ -7,14 +7,9 @@
* - Algorithm 1 Scalar multiplication of a variable point
*/
-#include <crypto/curve25519.h>
-#include <crypto/internal/kpp.h>
-
#include <linux/types.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/scatterlist.h>
#include <linux/cpufeature.h>
#include <linux/processor.h>
@@ -177,124 +172,15 @@ static void curve25519_fe51(uint8_t out[32], const uint8_t scalar[32],
fe51_tobytes(out, x2);
}
-void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
- const u8 secret[CURVE25519_KEY_SIZE],
- const u8 basepoint[CURVE25519_KEY_SIZE])
+static void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE],
+ const u8 basepoint[CURVE25519_KEY_SIZE])
{
curve25519_fe51(mypublic, secret, basepoint);
}
-EXPORT_SYMBOL(curve25519_arch);
-void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
- const u8 secret[CURVE25519_KEY_SIZE])
+static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE])
{
curve25519_fe51(pub, secret, curve25519_base_point);
}
-EXPORT_SYMBOL(curve25519_base_arch);
-
-static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
- unsigned int len)
-{
- u8 *secret = kpp_tfm_ctx(tfm);
-
- if (!len)
- curve25519_generate_secret(secret);
- else if (len == CURVE25519_KEY_SIZE &&
- crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
- memcpy(secret, buf, CURVE25519_KEY_SIZE);
- else
- return -EINVAL;
- return 0;
-}
-
-static int curve25519_generate_public_key(struct kpp_request *req)
-{
- struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
- const u8 *secret = kpp_tfm_ctx(tfm);
- u8 buf[CURVE25519_KEY_SIZE];
- int copied, nbytes;
-
- if (req->src)
- return -EINVAL;
-
- curve25519_base_arch(buf, secret);
-
- /* might want less than we've got */
- nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
- copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
- nbytes),
- buf, nbytes);
- if (copied != nbytes)
- return -EINVAL;
- return 0;
-}
-
-static int curve25519_compute_shared_secret(struct kpp_request *req)
-{
- struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
- const u8 *secret = kpp_tfm_ctx(tfm);
- u8 public_key[CURVE25519_KEY_SIZE];
- u8 buf[CURVE25519_KEY_SIZE];
- int copied, nbytes;
-
- if (!req->src)
- return -EINVAL;
-
- copied = sg_copy_to_buffer(req->src,
- sg_nents_for_len(req->src,
- CURVE25519_KEY_SIZE),
- public_key, CURVE25519_KEY_SIZE);
- if (copied != CURVE25519_KEY_SIZE)
- return -EINVAL;
-
- curve25519_arch(buf, secret, public_key);
-
- /* might want less than we've got */
- nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
- copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
- nbytes),
- buf, nbytes);
- if (copied != nbytes)
- return -EINVAL;
- return 0;
-}
-
-static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
-{
- return CURVE25519_KEY_SIZE;
-}
-
-static struct kpp_alg curve25519_alg = {
- .base.cra_name = "curve25519",
- .base.cra_driver_name = "curve25519-ppc64le",
- .base.cra_priority = 200,
- .base.cra_module = THIS_MODULE,
- .base.cra_ctxsize = CURVE25519_KEY_SIZE,
-
- .set_secret = curve25519_set_secret,
- .generate_public_key = curve25519_generate_public_key,
- .compute_shared_secret = curve25519_compute_shared_secret,
- .max_size = curve25519_max_size,
-};
-
-
-static int __init curve25519_mod_init(void)
-{
- return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
- crypto_register_kpp(&curve25519_alg) : 0;
-}
-
-static void __exit curve25519_mod_exit(void)
-{
- if (IS_REACHABLE(CONFIG_CRYPTO_KPP))
- crypto_unregister_kpp(&curve25519_alg);
-}
-
-module_init(curve25519_mod_init);
-module_exit(curve25519_mod_exit);
-
-MODULE_ALIAS_CRYPTO("curve25519");
-MODULE_ALIAS_CRYPTO("curve25519-ppc64le");
-MODULE_DESCRIPTION("PPC64le Curve25519 scalar multiplication with 51 bits limbs");
-MODULE_LICENSE("GPL v2");
-MODULE_AUTHOR("Danny Tsen <dtsen@us.ibm.com>");
diff --git a/arch/powerpc/crypto/md5-asm.S b/lib/crypto/powerpc/md5-asm.S
index fa6bc440cf4a..fa6bc440cf4a 100644
--- a/arch/powerpc/crypto/md5-asm.S
+++ b/lib/crypto/powerpc/md5-asm.S
diff --git a/lib/crypto/powerpc/md5.h b/lib/crypto/powerpc/md5.h
new file mode 100644
index 000000000000..540b08e34d1d
--- /dev/null
+++ b/lib/crypto/powerpc/md5.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * MD5 optimized for PowerPC
+ */
+
+void ppc_md5_transform(u32 *state, const u8 *data, size_t nblocks);
+
+static void md5_blocks(struct md5_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ ppc_md5_transform(state->h, data, nblocks);
+}
diff --git a/lib/crypto/powerpc/poly1305-p10-glue.c b/lib/crypto/powerpc/poly1305.h
index 3f1664a724b6..b8ed098a0e95 100644
--- a/lib/crypto/powerpc/poly1305-p10-glue.c
+++ b/lib/crypto/powerpc/poly1305.h
@@ -1,15 +1,13 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Poly1305 authenticator algorithm, RFC7539.
*
* Copyright 2023- IBM Corp. All rights reserved.
*/
#include <asm/switch_to.h>
-#include <crypto/internal/poly1305.h>
#include <linux/cpufeature.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/unaligned.h>
asmlinkage void poly1305_p10le_4blocks(struct poly1305_block_state *state, const u8 *m, u32 mlen);
@@ -30,8 +28,8 @@ static void vsx_end(void)
preempt_enable();
}
-void poly1305_block_init_arch(struct poly1305_block_state *dctx,
- const u8 raw_key[POLY1305_BLOCK_SIZE])
+static void poly1305_block_init(struct poly1305_block_state *dctx,
+ const u8 raw_key[POLY1305_BLOCK_SIZE])
{
if (!static_key_enabled(&have_p10))
return poly1305_block_init_generic(dctx, raw_key);
@@ -40,10 +38,9 @@ void poly1305_block_init_arch(struct poly1305_block_state *dctx,
dctx->core_r.key.r64[0] = get_unaligned_le64(raw_key + 0);
dctx->core_r.key.r64[1] = get_unaligned_le64(raw_key + 8);
}
-EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
-void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src,
- unsigned int len, u32 padbit)
+static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src,
+ unsigned int len, u32 padbit)
{
if (!static_key_enabled(&have_p10))
return poly1305_blocks_generic(state, src, len, padbit);
@@ -60,37 +57,18 @@ void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src,
}
vsx_end();
}
-EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
-void poly1305_emit_arch(const struct poly1305_state *state,
- u8 digest[POLY1305_DIGEST_SIZE],
- const u32 nonce[4])
+static void poly1305_emit(const struct poly1305_state *state,
+ u8 digest[POLY1305_DIGEST_SIZE], const u32 nonce[4])
{
if (!static_key_enabled(&have_p10))
return poly1305_emit_generic(state, digest, nonce);
poly1305_emit_64(state, nonce, digest);
}
-EXPORT_SYMBOL_GPL(poly1305_emit_arch);
-bool poly1305_is_arch_optimized(void)
-{
- return static_key_enabled(&have_p10);
-}
-EXPORT_SYMBOL(poly1305_is_arch_optimized);
-
-static int __init poly1305_p10_init(void)
+#define poly1305_mod_init_arch poly1305_mod_init_arch
+static void poly1305_mod_init_arch(void)
{
if (cpu_has_feature(CPU_FTR_ARCH_31))
static_branch_enable(&have_p10);
- return 0;
}
-subsys_initcall(poly1305_p10_init);
-
-static void __exit poly1305_p10_exit(void)
-{
-}
-module_exit(poly1305_p10_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com>");
-MODULE_DESCRIPTION("Optimized Poly1305 for P10");
diff --git a/lib/crypto/riscv/Kconfig b/lib/crypto/riscv/Kconfig
deleted file mode 100644
index bc7a43f33eb3..000000000000
--- a/lib/crypto/riscv/Kconfig
+++ /dev/null
@@ -1,8 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config CRYPTO_CHACHA_RISCV64
- tristate
- depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
- default CRYPTO_LIB_CHACHA
- select CRYPTO_ARCH_HAVE_LIB_CHACHA
- select CRYPTO_LIB_CHACHA_GENERIC
diff --git a/lib/crypto/riscv/Makefile b/lib/crypto/riscv/Makefile
deleted file mode 100644
index e27b78f317fc..000000000000
--- a/lib/crypto/riscv/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-obj-$(CONFIG_CRYPTO_CHACHA_RISCV64) += chacha-riscv64.o
-chacha-riscv64-y := chacha-riscv64-glue.o chacha-riscv64-zvkb.o
diff --git a/lib/crypto/riscv/chacha-riscv64-glue.c b/lib/crypto/riscv/chacha.h
index 8c3f11d79be3..5c000c6aef4b 100644
--- a/lib/crypto/riscv/chacha-riscv64-glue.c
+++ b/lib/crypto/riscv/chacha.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0-only
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* ChaCha stream cipher (RISC-V optimized)
*
@@ -8,25 +8,18 @@
#include <asm/simd.h>
#include <asm/vector.h>
-#include <crypto/chacha.h>
#include <crypto/internal/simd.h>
#include <linux/linkage.h>
-#include <linux/module.h>
static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_zvkb);
asmlinkage void chacha_zvkb(struct chacha_state *state, const u8 *in, u8 *out,
size_t nblocks, int nrounds);
-void hchacha_block_arch(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds)
-{
- hchacha_block_generic(state, out, nrounds);
-}
-EXPORT_SYMBOL(hchacha_block_arch);
+#define hchacha_block_arch hchacha_block_generic /* not implemented yet */
-void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
+static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
+ const u8 *src, unsigned int bytes, int nrounds)
{
u8 block_buffer[CHACHA_BLOCK_SIZE];
unsigned int full_blocks = bytes / CHACHA_BLOCK_SIZE;
@@ -48,28 +41,11 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
}
kernel_vector_end();
}
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-bool chacha_is_arch_optimized(void)
-{
- return static_key_enabled(&use_zvkb);
-}
-EXPORT_SYMBOL(chacha_is_arch_optimized);
-static int __init riscv64_chacha_mod_init(void)
+#define chacha_mod_init_arch chacha_mod_init_arch
+static void chacha_mod_init_arch(void)
{
if (riscv_isa_extension_available(NULL, ZVKB) &&
riscv_vector_vlen() >= 128)
static_branch_enable(&use_zvkb);
- return 0;
}
-subsys_initcall(riscv64_chacha_mod_init);
-
-static void __exit riscv64_chacha_mod_exit(void)
-{
-}
-module_exit(riscv64_chacha_mod_exit);
-
-MODULE_DESCRIPTION("ChaCha stream cipher (RISC-V optimized)");
-MODULE_AUTHOR("Jerry Shih <jerry.shih@sifive.com>");
-MODULE_LICENSE("GPL");
diff --git a/lib/crypto/riscv/poly1305-riscv.pl b/lib/crypto/riscv/poly1305-riscv.pl
new file mode 100644
index 000000000000..e25e6338a9ac
--- /dev/null
+++ b/lib/crypto/riscv/poly1305-riscv.pl
@@ -0,0 +1,847 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
+#
+# ====================================================================
+# Written by Andy Polyakov, @dot-asm, initially for use with OpenSSL.
+# ====================================================================
+#
+# Poly1305 hash for RISC-V.
+#
+# February 2019
+#
+# In essence it's a pretty straightforward transliteration of the MIPS
+# module [without the big-endian option].
+#
+# 1.8 cycles per byte on U74, >100% faster than compiler-generated
+# code. 1.9 cpb on C910, ~75% improvement. 3.3 on Spacemit X60, ~69%
+# improvement.
+#
+# June 2024.
+#
+# Add CHERI support.
+#
+######################################################################
+#
+($zero,$ra,$sp,$gp,$tp)=map("x$_",(0..4));
+($t0,$t1,$t2,$t3,$t4,$t5,$t6)=map("x$_",(5..7,28..31));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(10..17));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("x$_",(8,9,18..27));
+#
+######################################################################
+
+$flavour = shift || "64";
+
+for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); }
+open STDOUT,">$output";
+
+$code.=<<___;
+#ifdef __KERNEL__
+# ifdef __riscv_zicfilp
+# undef __riscv_zicfilp // calls are expected to be direct
+# endif
+#endif
+
+#if defined(__CHERI_PURE_CAPABILITY__) && !defined(__riscv_misaligned_fast)
+# define __riscv_misaligned_fast 1
+#endif
+___
+
+if ($flavour =~ /64/) {{{
+######################################################################
+# 64-bit code path...
+#
+my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
+my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$t0,$t1,$t2);
+
+$code.=<<___;
+#if __riscv_xlen == 64
+# if __SIZEOF_POINTER__ == 16
+# define PUSH csc
+# define POP clc
+# else
+# define PUSH sd
+# define POP ld
+# endif
+#else
+# error "unsupported __riscv_xlen"
+#endif
+
+.option pic
+.text
+
+.globl poly1305_init
+.type poly1305_init,\@function
+poly1305_init:
+#ifdef __riscv_zicfilp
+ lpad 0
+#endif
+ sd $zero,0($ctx)
+ sd $zero,8($ctx)
+ sd $zero,16($ctx)
+
+ beqz $inp,.Lno_key
+
+#ifndef __riscv_misaligned_fast
+ andi $tmp0,$inp,7 # $inp % 8
+ andi $inp,$inp,-8 # align $inp
+ slli $tmp0,$tmp0,3 # byte to bit offset
+#endif
+ ld $in0,0($inp)
+ ld $in1,8($inp)
+#ifndef __riscv_misaligned_fast
+ beqz $tmp0,.Laligned_key
+
+ ld $tmp2,16($inp)
+ neg $tmp1,$tmp0 # implicit &63 in sll
+ srl $in0,$in0,$tmp0
+ sll $tmp3,$in1,$tmp1
+ srl $in1,$in1,$tmp0
+ sll $tmp2,$tmp2,$tmp1
+ or $in0,$in0,$tmp3
+ or $in1,$in1,$tmp2
+
+.Laligned_key:
+#endif
+ li $tmp0,1
+ slli $tmp0,$tmp0,32 # 0x0000000100000000
+ addi $tmp0,$tmp0,-63 # 0x00000000ffffffc1
+ slli $tmp0,$tmp0,28 # 0x0ffffffc10000000
+ addi $tmp0,$tmp0,-1 # 0x0ffffffc0fffffff
+
+ and $in0,$in0,$tmp0
+ addi $tmp0,$tmp0,-3 # 0x0ffffffc0ffffffc
+ and $in1,$in1,$tmp0
+
+ sd $in0,24($ctx)
+ srli $tmp0,$in1,2
+ sd $in1,32($ctx)
+ add $tmp0,$tmp0,$in1 # s1 = r1 + (r1 >> 2)
+ sd $tmp0,40($ctx)
+
+.Lno_key:
+ li $a0,0 # return 0
+ ret
+.size poly1305_init,.-poly1305_init
+___
+{
+my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
+ ($s0,$s1,$s2,$s3,$t3,$t4,$in0,$in1,$t2);
+my ($shr,$shl) = ($t5,$t6); # used on R6
+
+$code.=<<___;
+.globl poly1305_blocks
+.type poly1305_blocks,\@function
+poly1305_blocks:
+#ifdef __riscv_zicfilp
+ lpad 0
+#endif
+ andi $len,$len,-16 # complete blocks only
+ beqz $len,.Lno_data
+
+ caddi $sp,$sp,-4*__SIZEOF_POINTER__
+ PUSH $s0,3*__SIZEOF_POINTER__($sp)
+ PUSH $s1,2*__SIZEOF_POINTER__($sp)
+ PUSH $s2,1*__SIZEOF_POINTER__($sp)
+ PUSH $s3,0*__SIZEOF_POINTER__($sp)
+
+#ifndef __riscv_misaligned_fast
+ andi $shr,$inp,7
+ andi $inp,$inp,-8 # align $inp
+ slli $shr,$shr,3 # byte to bit offset
+ neg $shl,$shr # implicit &63 in sll
+#endif
+
+ ld $h0,0($ctx) # load hash value
+ ld $h1,8($ctx)
+ ld $h2,16($ctx)
+
+ ld $r0,24($ctx) # load key
+ ld $r1,32($ctx)
+ ld $rs1,40($ctx)
+
+ add $len,$len,$inp # end of buffer
+
+.Loop:
+ ld $in0,0($inp) # load input
+ ld $in1,8($inp)
+#ifndef __riscv_misaligned_fast
+ beqz $shr,.Laligned_inp
+
+ ld $tmp2,16($inp)
+ srl $in0,$in0,$shr
+ sll $tmp3,$in1,$shl
+ srl $in1,$in1,$shr
+ sll $tmp2,$tmp2,$shl
+ or $in0,$in0,$tmp3
+ or $in1,$in1,$tmp2
+
+.Laligned_inp:
+#endif
+ caddi $inp,$inp,16
+
+ andi $tmp0,$h2,-4 # modulo-scheduled reduction
+ srli $tmp1,$h2,2
+ andi $h2,$h2,3
+
+ add $d0,$h0,$in0 # accumulate input
+ add $tmp1,$tmp1,$tmp0
+ sltu $tmp0,$d0,$h0
+ add $d0,$d0,$tmp1 # ... and residue
+ sltu $tmp1,$d0,$tmp1
+ add $d1,$h1,$in1
+ add $tmp0,$tmp0,$tmp1
+ sltu $tmp1,$d1,$h1
+ add $d1,$d1,$tmp0
+
+ add $d2,$h2,$padbit
+ sltu $tmp0,$d1,$tmp0
+ mulhu $h1,$r0,$d0 # h0*r0
+ mul $h0,$r0,$d0
+
+ add $d2,$d2,$tmp1
+ add $d2,$d2,$tmp0
+ mulhu $tmp1,$rs1,$d1 # h1*5*r1
+ mul $tmp0,$rs1,$d1
+
+ mulhu $h2,$r1,$d0 # h0*r1
+ mul $tmp2,$r1,$d0
+ add $h0,$h0,$tmp0
+ add $h1,$h1,$tmp1
+ sltu $tmp0,$h0,$tmp0
+
+ add $h1,$h1,$tmp0
+ add $h1,$h1,$tmp2
+ mulhu $tmp1,$r0,$d1 # h1*r0
+ mul $tmp0,$r0,$d1
+
+ sltu $tmp2,$h1,$tmp2
+ add $h2,$h2,$tmp2
+ mul $tmp2,$rs1,$d2 # h2*5*r1
+
+ add $h1,$h1,$tmp0
+ add $h2,$h2,$tmp1
+ mul $tmp3,$r0,$d2 # h2*r0
+ sltu $tmp0,$h1,$tmp0
+ add $h2,$h2,$tmp0
+
+ add $h1,$h1,$tmp2
+ sltu $tmp2,$h1,$tmp2
+ add $h2,$h2,$tmp2
+ add $h2,$h2,$tmp3
+
+ bne $inp,$len,.Loop
+
+ sd $h0,0($ctx) # store hash value
+ sd $h1,8($ctx)
+ sd $h2,16($ctx)
+
+ POP $s0,3*__SIZEOF_POINTER__($sp) # epilogue
+ POP $s1,2*__SIZEOF_POINTER__($sp)
+ POP $s2,1*__SIZEOF_POINTER__($sp)
+ POP $s3,0*__SIZEOF_POINTER__($sp)
+ caddi $sp,$sp,4*__SIZEOF_POINTER__
+
+.Lno_data:
+ ret
+.size poly1305_blocks,.-poly1305_blocks
+___
+}
+{
+my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
+
+$code.=<<___;
+.globl poly1305_emit
+.type poly1305_emit,\@function
+poly1305_emit:
+#ifdef __riscv_zicfilp
+ lpad 0
+#endif
+ ld $tmp2,16($ctx)
+ ld $tmp0,0($ctx)
+ ld $tmp1,8($ctx)
+
+ andi $in0,$tmp2,-4 # final reduction
+ srl $in1,$tmp2,2
+ andi $tmp2,$tmp2,3
+ add $in0,$in0,$in1
+
+ add $tmp0,$tmp0,$in0
+ sltu $in1,$tmp0,$in0
+ addi $in0,$tmp0,5 # compare to modulus
+ add $tmp1,$tmp1,$in1
+ sltiu $tmp3,$in0,5
+ sltu $tmp4,$tmp1,$in1
+ add $in1,$tmp1,$tmp3
+ add $tmp2,$tmp2,$tmp4
+ sltu $tmp3,$in1,$tmp3
+ add $tmp2,$tmp2,$tmp3
+
+ srli $tmp2,$tmp2,2 # see if it carried/borrowed
+ neg $tmp2,$tmp2
+
+ xor $in0,$in0,$tmp0
+ xor $in1,$in1,$tmp1
+ and $in0,$in0,$tmp2
+ and $in1,$in1,$tmp2
+ xor $in0,$in0,$tmp0
+ xor $in1,$in1,$tmp1
+
+ lwu $tmp0,0($nonce) # load nonce
+ lwu $tmp1,4($nonce)
+ lwu $tmp2,8($nonce)
+ lwu $tmp3,12($nonce)
+ slli $tmp1,$tmp1,32
+ slli $tmp3,$tmp3,32
+ or $tmp0,$tmp0,$tmp1
+ or $tmp2,$tmp2,$tmp3
+
+ add $in0,$in0,$tmp0 # accumulate nonce
+ add $in1,$in1,$tmp2
+ sltu $tmp0,$in0,$tmp0
+ add $in1,$in1,$tmp0
+
+#ifdef __riscv_misaligned_fast
+ sd $in0,0($mac) # write mac value
+ sd $in1,8($mac)
+#else
+ srli $tmp0,$in0,8 # write mac value
+ srli $tmp1,$in0,16
+ srli $tmp2,$in0,24
+ sb $in0,0($mac)
+ srli $tmp3,$in0,32
+ sb $tmp0,1($mac)
+ srli $tmp0,$in0,40
+ sb $tmp1,2($mac)
+ srli $tmp1,$in0,48
+ sb $tmp2,3($mac)
+ srli $tmp2,$in0,56
+ sb $tmp3,4($mac)
+ srli $tmp3,$in1,8
+ sb $tmp0,5($mac)
+ srli $tmp0,$in1,16
+ sb $tmp1,6($mac)
+ srli $tmp1,$in1,24
+ sb $tmp2,7($mac)
+
+ sb $in1,8($mac)
+ srli $tmp2,$in1,32
+ sb $tmp3,9($mac)
+ srli $tmp3,$in1,40
+ sb $tmp0,10($mac)
+ srli $tmp0,$in1,48
+ sb $tmp1,11($mac)
+ srli $tmp1,$in1,56
+ sb $tmp2,12($mac)
+ sb $tmp3,13($mac)
+ sb $tmp0,14($mac)
+ sb $tmp1,15($mac)
+#endif
+
+ ret
+.size poly1305_emit,.-poly1305_emit
+.string "Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm"
+___
+}
+}}} else {{{
+######################################################################
+# 32-bit code path
+#
+
+my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
+my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
+ ($a4,$a5,$a6,$a7,$t0,$t1,$t2,$t3);
+
+$code.=<<___;
+#if __riscv_xlen == 32
+# if __SIZEOF_POINTER__ == 8
+# define PUSH csc
+# define POP clc
+# else
+# define PUSH sw
+# define POP lw
+# endif
+# define MULX(hi,lo,a,b) mulhu hi,a,b; mul lo,a,b
+# define srliw srli
+# define srlw srl
+# define sllw sll
+# define addw add
+# define addiw addi
+# define mulw mul
+#elif __riscv_xlen == 64
+# if __SIZEOF_POINTER__ == 16
+# define PUSH csc
+# define POP clc
+# else
+# define PUSH sd
+# define POP ld
+# endif
+# define MULX(hi,lo,a,b) slli b,b,32; srli b,b,32; mul hi,a,b; addiw lo,hi,0; srai hi,hi,32
+#else
+# error "unsupported __riscv_xlen"
+#endif
+
+.option pic
+.text
+
+.globl poly1305_init
+.type poly1305_init,\@function
+poly1305_init:
+#ifdef __riscv_zicfilp
+ lpad 0
+#endif
+ sw $zero,0($ctx)
+ sw $zero,4($ctx)
+ sw $zero,8($ctx)
+ sw $zero,12($ctx)
+ sw $zero,16($ctx)
+
+ beqz $inp,.Lno_key
+
+#ifndef __riscv_misaligned_fast
+ andi $tmp0,$inp,3 # $inp % 4
+ sub $inp,$inp,$tmp0 # align $inp
+ sll $tmp0,$tmp0,3 # byte to bit offset
+#endif
+ lw $in0,0($inp)
+ lw $in1,4($inp)
+ lw $in2,8($inp)
+ lw $in3,12($inp)
+#ifndef __riscv_misaligned_fast
+ beqz $tmp0,.Laligned_key
+
+ lw $tmp2,16($inp)
+ sub $tmp1,$zero,$tmp0
+ srlw $in0,$in0,$tmp0
+ sllw $tmp3,$in1,$tmp1
+ srlw $in1,$in1,$tmp0
+ or $in0,$in0,$tmp3
+ sllw $tmp3,$in2,$tmp1
+ srlw $in2,$in2,$tmp0
+ or $in1,$in1,$tmp3
+ sllw $tmp3,$in3,$tmp1
+ srlw $in3,$in3,$tmp0
+ or $in2,$in2,$tmp3
+ sllw $tmp2,$tmp2,$tmp1
+ or $in3,$in3,$tmp2
+.Laligned_key:
+#endif
+
+ lui $tmp0,0x10000
+ addi $tmp0,$tmp0,-1 # 0x0fffffff
+ and $in0,$in0,$tmp0
+ addi $tmp0,$tmp0,-3 # 0x0ffffffc
+ and $in1,$in1,$tmp0
+ and $in2,$in2,$tmp0
+ and $in3,$in3,$tmp0
+
+ sw $in0,20($ctx)
+ sw $in1,24($ctx)
+ sw $in2,28($ctx)
+ sw $in3,32($ctx)
+
+ srlw $tmp1,$in1,2
+ srlw $tmp2,$in2,2
+ srlw $tmp3,$in3,2
+ addw $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2)
+ addw $in2,$in2,$tmp2
+ addw $in3,$in3,$tmp3
+ sw $in1,36($ctx)
+ sw $in2,40($ctx)
+ sw $in3,44($ctx)
+.Lno_key:
+ li $a0,0
+ ret
+.size poly1305_init,.-poly1305_init
+___
+{
+my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
+ ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $t0,$t1,$t2);
+my ($d0,$d1,$d2,$d3) =
+ ($a4,$a5,$a6,$a7);
+my $shr = $ra; # used on R6
+
+$code.=<<___;
+.globl poly1305_blocks
+.type poly1305_blocks,\@function
+poly1305_blocks:
+#ifdef __riscv_zicfilp
+ lpad 0
+#endif
+ andi $len,$len,-16 # complete blocks only
+ beqz $len,.Labort
+
+#ifdef __riscv_zcmp
+ cm.push {ra,s0-s8}, -48
+#else
+ caddi $sp,$sp,-__SIZEOF_POINTER__*12
+ PUSH $ra, __SIZEOF_POINTER__*11($sp)
+ PUSH $s0, __SIZEOF_POINTER__*10($sp)
+ PUSH $s1, __SIZEOF_POINTER__*9($sp)
+ PUSH $s2, __SIZEOF_POINTER__*8($sp)
+ PUSH $s3, __SIZEOF_POINTER__*7($sp)
+ PUSH $s4, __SIZEOF_POINTER__*6($sp)
+ PUSH $s5, __SIZEOF_POINTER__*5($sp)
+ PUSH $s6, __SIZEOF_POINTER__*4($sp)
+ PUSH $s7, __SIZEOF_POINTER__*3($sp)
+ PUSH $s8, __SIZEOF_POINTER__*2($sp)
+#endif
+
+#ifndef __riscv_misaligned_fast
+ andi $shr,$inp,3
+ andi $inp,$inp,-4 # align $inp
+ slli $shr,$shr,3 # byte to bit offset
+#endif
+
+ lw $h0,0($ctx) # load hash value
+ lw $h1,4($ctx)
+ lw $h2,8($ctx)
+ lw $h3,12($ctx)
+ lw $h4,16($ctx)
+
+ lw $r0,20($ctx) # load key
+ lw $r1,24($ctx)
+ lw $r2,28($ctx)
+ lw $r3,32($ctx)
+ lw $rs1,36($ctx)
+ lw $rs2,40($ctx)
+ lw $rs3,44($ctx)
+
+ add $len,$len,$inp # end of buffer
+
+.Loop:
+ lw $d0,0($inp) # load input
+ lw $d1,4($inp)
+ lw $d2,8($inp)
+ lw $d3,12($inp)
+#ifndef __riscv_misaligned_fast
+ beqz $shr,.Laligned_inp
+
+ lw $t4,16($inp)
+ sub $t5,$zero,$shr
+ srlw $d0,$d0,$shr
+ sllw $t3,$d1,$t5
+ srlw $d1,$d1,$shr
+ or $d0,$d0,$t3
+ sllw $t3,$d2,$t5
+ srlw $d2,$d2,$shr
+ or $d1,$d1,$t3
+ sllw $t3,$d3,$t5
+ srlw $d3,$d3,$shr
+ or $d2,$d2,$t3
+ sllw $t4,$t4,$t5
+ or $d3,$d3,$t4
+
+.Laligned_inp:
+#endif
+ srliw $t3,$h4,2 # modulo-scheduled reduction
+ andi $t4,$h4,-4
+ andi $h4,$h4,3
+
+ addw $d0,$d0,$h0 # accumulate input
+ addw $t4,$t4,$t3
+ sltu $h0,$d0,$h0
+ addw $d0,$d0,$t4 # ... and residue
+ sltu $t4,$d0,$t4
+
+ addw $d1,$d1,$h1
+ addw $h0,$h0,$t4 # carry
+ sltu $h1,$d1,$h1
+ addw $d1,$d1,$h0
+ sltu $h0,$d1,$h0
+
+ addw $d2,$d2,$h2
+ addw $h1,$h1,$h0 # carry
+ sltu $h2,$d2,$h2
+ addw $d2,$d2,$h1
+ sltu $h1,$d2,$h1
+
+ addw $d3,$d3,$h3
+ addw $h2,$h2,$h1 # carry
+ sltu $h3,$d3,$h3
+ addw $d3,$d3,$h2
+
+ MULX ($h1,$h0,$r0,$d0) # d0*r0
+
+ sltu $h2,$d3,$h2
+ addw $h3,$h3,$h2 # carry
+
+ MULX ($t4,$t3,$rs3,$d1) # d1*s3
+
+ addw $h4,$h4,$padbit
+ caddi $inp,$inp,16
+ addw $h4,$h4,$h3
+
+ MULX ($t6,$a3,$rs2,$d2) # d2*s2
+ addw $h0,$h0,$t3
+ addw $h1,$h1,$t4
+ sltu $t3,$h0,$t3
+ addw $h1,$h1,$t3
+
+ MULX ($t4,$t3,$rs1,$d3) # d3*s1
+ addw $h0,$h0,$a3
+ addw $h1,$h1,$t6
+ sltu $a3,$h0,$a3
+ addw $h1,$h1,$a3
+
+
+ MULX ($h2,$a3,$r1,$d0) # d0*r1
+ addw $h0,$h0,$t3
+ addw $h1,$h1,$t4
+ sltu $t3,$h0,$t3
+ addw $h1,$h1,$t3
+
+ MULX ($t4,$t3,$r0,$d1) # d1*r0
+ addw $h1,$h1,$a3
+ sltu $a3,$h1,$a3
+ addw $h2,$h2,$a3
+
+ MULX ($t6,$a3,$rs3,$d2) # d2*s3
+ addw $h1,$h1,$t3
+ addw $h2,$h2,$t4
+ sltu $t3,$h1,$t3
+ addw $h2,$h2,$t3
+
+ MULX ($t4,$t3,$rs2,$d3) # d3*s2
+ addw $h1,$h1,$a3
+ addw $h2,$h2,$t6
+ sltu $a3,$h1,$a3
+ addw $h2,$h2,$a3
+
+ mulw $a3,$rs1,$h4 # h4*s1
+ addw $h1,$h1,$t3
+ addw $h2,$h2,$t4
+ sltu $t3,$h1,$t3
+ addw $h2,$h2,$t3
+
+
+ MULX ($h3,$t3,$r2,$d0) # d0*r2
+ addw $h1,$h1,$a3
+ sltu $a3,$h1,$a3
+ addw $h2,$h2,$a3
+
+ MULX ($t6,$a3,$r1,$d1) # d1*r1
+ addw $h2,$h2,$t3
+ sltu $t3,$h2,$t3
+ addw $h3,$h3,$t3
+
+ MULX ($t4,$t3,$r0,$d2) # d2*r0
+ addw $h2,$h2,$a3
+ addw $h3,$h3,$t6
+ sltu $a3,$h2,$a3
+ addw $h3,$h3,$a3
+
+ MULX ($t6,$a3,$rs3,$d3) # d3*s3
+ addw $h2,$h2,$t3
+ addw $h3,$h3,$t4
+ sltu $t3,$h2,$t3
+ addw $h3,$h3,$t3
+
+ mulw $t3,$rs2,$h4 # h4*s2
+ addw $h2,$h2,$a3
+ addw $h3,$h3,$t6
+ sltu $a3,$h2,$a3
+ addw $h3,$h3,$a3
+
+
+ MULX ($t6,$a3,$r3,$d0) # d0*r3
+ addw $h2,$h2,$t3
+ sltu $t3,$h2,$t3
+ addw $h3,$h3,$t3
+
+ MULX ($t4,$t3,$r2,$d1) # d1*r2
+ addw $h3,$h3,$a3
+ sltu $a3,$h3,$a3
+ addw $t6,$t6,$a3
+
+ MULX ($a3,$d3,$r0,$d3) # d3*r0
+ addw $h3,$h3,$t3
+ addw $t6,$t6,$t4
+ sltu $t3,$h3,$t3
+ addw $t6,$t6,$t3
+
+ MULX ($t4,$t3,$r1,$d2) # d2*r1
+ addw $h3,$h3,$d3
+ addw $t6,$t6,$a3
+ sltu $d3,$h3,$d3
+ addw $t6,$t6,$d3
+
+ mulw $a3,$rs3,$h4 # h4*s3
+ addw $h3,$h3,$t3
+ addw $t6,$t6,$t4
+ sltu $t3,$h3,$t3
+ addw $t6,$t6,$t3
+
+
+ mulw $h4,$r0,$h4 # h4*r0
+ addw $h3,$h3,$a3
+ sltu $a3,$h3,$a3
+ addw $t6,$t6,$a3
+ addw $h4,$t6,$h4
+
+ li $padbit,1 # if we loop, padbit is 1
+
+ bne $inp,$len,.Loop
+
+ sw $h0,0($ctx) # store hash value
+ sw $h1,4($ctx)
+ sw $h2,8($ctx)
+ sw $h3,12($ctx)
+ sw $h4,16($ctx)
+
+#ifdef __riscv_zcmp
+ cm.popret {ra,s0-s8}, 48
+#else
+ POP $ra, __SIZEOF_POINTER__*11($sp)
+ POP $s0, __SIZEOF_POINTER__*10($sp)
+ POP $s1, __SIZEOF_POINTER__*9($sp)
+ POP $s2, __SIZEOF_POINTER__*8($sp)
+ POP $s3, __SIZEOF_POINTER__*7($sp)
+ POP $s4, __SIZEOF_POINTER__*6($sp)
+ POP $s5, __SIZEOF_POINTER__*5($sp)
+ POP $s6, __SIZEOF_POINTER__*4($sp)
+ POP $s7, __SIZEOF_POINTER__*3($sp)
+ POP $s8, __SIZEOF_POINTER__*2($sp)
+ caddi $sp,$sp,__SIZEOF_POINTER__*12
+#endif
+.Labort:
+ ret
+.size poly1305_blocks,.-poly1305_blocks
+___
+}
+{
+my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);
+
+$code.=<<___;
+.globl poly1305_emit
+.type poly1305_emit,\@function
+poly1305_emit:
+#ifdef __riscv_zicfilp
+ lpad 0
+#endif
+ lw $tmp4,16($ctx)
+ lw $tmp0,0($ctx)
+ lw $tmp1,4($ctx)
+ lw $tmp2,8($ctx)
+ lw $tmp3,12($ctx)
+
+ srliw $ctx,$tmp4,2 # final reduction
+ andi $in0,$tmp4,-4
+ andi $tmp4,$tmp4,3
+ addw $ctx,$ctx,$in0
+
+ addw $tmp0,$tmp0,$ctx
+ sltu $ctx,$tmp0,$ctx
+ addiw $in0,$tmp0,5 # compare to modulus
+ addw $tmp1,$tmp1,$ctx
+ sltiu $in1,$in0,5
+ sltu $ctx,$tmp1,$ctx
+ addw $in1,$in1,$tmp1
+ addw $tmp2,$tmp2,$ctx
+ sltu $in2,$in1,$tmp1
+ sltu $ctx,$tmp2,$ctx
+ addw $in2,$in2,$tmp2
+ addw $tmp3,$tmp3,$ctx
+ sltu $in3,$in2,$tmp2
+ sltu $ctx,$tmp3,$ctx
+ addw $in3,$in3,$tmp3
+ addw $tmp4,$tmp4,$ctx
+ sltu $ctx,$in3,$tmp3
+ addw $ctx,$ctx,$tmp4
+
+ srl $ctx,$ctx,2 # see if it carried/borrowed
+ sub $ctx,$zero,$ctx
+
+ xor $in0,$in0,$tmp0
+ xor $in1,$in1,$tmp1
+ xor $in2,$in2,$tmp2
+ xor $in3,$in3,$tmp3
+ and $in0,$in0,$ctx
+ and $in1,$in1,$ctx
+ and $in2,$in2,$ctx
+ and $in3,$in3,$ctx
+ xor $in0,$in0,$tmp0
+ xor $in1,$in1,$tmp1
+ xor $in2,$in2,$tmp2
+ xor $in3,$in3,$tmp3
+
+ lw $tmp0,0($nonce) # load nonce
+ lw $tmp1,4($nonce)
+ lw $tmp2,8($nonce)
+ lw $tmp3,12($nonce)
+
+ addw $in0,$in0,$tmp0 # accumulate nonce
+ sltu $ctx,$in0,$tmp0
+
+ addw $in1,$in1,$tmp1
+ sltu $tmp1,$in1,$tmp1
+ addw $in1,$in1,$ctx
+ sltu $ctx,$in1,$ctx
+ addw $ctx,$ctx,$tmp1
+
+ addw $in2,$in2,$tmp2
+ sltu $tmp2,$in2,$tmp2
+ addw $in2,$in2,$ctx
+ sltu $ctx,$in2,$ctx
+ addw $ctx,$ctx,$tmp2
+
+ addw $in3,$in3,$tmp3
+ addw $in3,$in3,$ctx
+
+#ifdef __riscv_misaligned_fast
+ sw $in0,0($mac) # write mac value
+ sw $in1,4($mac)
+ sw $in2,8($mac)
+ sw $in3,12($mac)
+#else
+ srl $tmp0,$in0,8 # write mac value
+ srl $tmp1,$in0,16
+ srl $tmp2,$in0,24
+ sb $in0, 0($mac)
+ sb $tmp0,1($mac)
+ srl $tmp0,$in1,8
+ sb $tmp1,2($mac)
+ srl $tmp1,$in1,16
+ sb $tmp2,3($mac)
+ srl $tmp2,$in1,24
+ sb $in1, 4($mac)
+ sb $tmp0,5($mac)
+ srl $tmp0,$in2,8
+ sb $tmp1,6($mac)
+ srl $tmp1,$in2,16
+ sb $tmp2,7($mac)
+ srl $tmp2,$in2,24
+ sb $in2, 8($mac)
+ sb $tmp0,9($mac)
+ srl $tmp0,$in3,8
+ sb $tmp1,10($mac)
+ srl $tmp1,$in3,16
+ sb $tmp2,11($mac)
+ srl $tmp2,$in3,24
+ sb $in3, 12($mac)
+ sb $tmp0,13($mac)
+ sb $tmp1,14($mac)
+ sb $tmp2,15($mac)
+#endif
+
+ ret
+.size poly1305_emit,.-poly1305_emit
+.string "Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm"
+___
+}
+}}}
+
+foreach (split("\n", $code)) {
+ if ($flavour =~ /^cheri/) {
+ s/\(x([0-9]+)\)/(c$1)/ and s/\b([ls][bhwd]u?)\b/c$1/;
+ s/\b(PUSH|POP)(\s+)x([0-9]+)/$1$2c$3/ or
+ s/\b(ret|jal)\b/c$1/;
+ s/\bcaddi?\b/cincoffset/ and s/\bx([0-9]+,)/c$1/g or
+ m/\bcmove\b/ and s/\bx([0-9]+)/c$1/g;
+ } else {
+ s/\bcaddi?\b/add/ or
+ s/\bcmove\b/mv/;
+ }
+ print $_, "\n";
+}
+
+close STDOUT;
diff --git a/lib/crypto/riscv/poly1305.h b/lib/crypto/riscv/poly1305.h
new file mode 100644
index 000000000000..88f3df44e355
--- /dev/null
+++ b/lib/crypto/riscv/poly1305.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * OpenSSL/Cryptogams accelerated Poly1305 transform for RISC-V
+ *
+ * Copyright (C) 2025 Institute of Software, CAS.
+ */
+
+asmlinkage void poly1305_block_init(struct poly1305_block_state *state,
+ const u8 raw_key[POLY1305_BLOCK_SIZE]);
+asmlinkage void poly1305_blocks(struct poly1305_block_state *state,
+ const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_emit(const struct poly1305_state *state,
+ u8 digest[POLY1305_DIGEST_SIZE],
+ const u32 nonce[4]);
diff --git a/lib/crypto/riscv/sha256.h b/lib/crypto/riscv/sha256.h
index c0f79c18f119..1def18b0a4fb 100644
--- a/lib/crypto/riscv/sha256.h
+++ b/lib/crypto/riscv/sha256.h
@@ -9,19 +9,19 @@
* Author: Jerry Shih <jerry.shih@sifive.com>
*/
+#include <asm/simd.h>
#include <asm/vector.h>
-#include <crypto/internal/simd.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions);
asmlinkage void
sha256_transform_zvknha_or_zvknhb_zvkb(struct sha256_block_state *state,
const u8 *data, size_t nblocks);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions);
-
static void sha256_blocks(struct sha256_block_state *state,
const u8 *data, size_t nblocks)
{
- if (static_branch_likely(&have_extensions) && crypto_simd_usable()) {
+ if (static_branch_likely(&have_extensions) && likely(may_use_simd())) {
kernel_vector_begin();
sha256_transform_zvknha_or_zvknhb_zvkb(state, data, nblocks);
kernel_vector_end();
@@ -31,7 +31,7 @@ static void sha256_blocks(struct sha256_block_state *state,
}
#define sha256_mod_init_arch sha256_mod_init_arch
-static inline void sha256_mod_init_arch(void)
+static void sha256_mod_init_arch(void)
{
/* Both zvknha and zvknhb provide the SHA-256 instructions. */
if ((riscv_isa_extension_available(NULL, ZVKNHA) ||
diff --git a/lib/crypto/riscv/sha512.h b/lib/crypto/riscv/sha512.h
index 9d0abede322f..145bdab1214e 100644
--- a/lib/crypto/riscv/sha512.h
+++ b/lib/crypto/riscv/sha512.h
@@ -11,7 +11,6 @@
#include <asm/simd.h>
#include <asm/vector.h>
-#include <crypto/internal/simd.h>
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions);
@@ -21,8 +20,7 @@ asmlinkage void sha512_transform_zvknhb_zvkb(struct sha512_block_state *state,
static void sha512_blocks(struct sha512_block_state *state,
const u8 *data, size_t nblocks)
{
- if (static_branch_likely(&have_extensions) &&
- likely(crypto_simd_usable())) {
+ if (static_branch_likely(&have_extensions) && likely(may_use_simd())) {
kernel_vector_begin();
sha512_transform_zvknhb_zvkb(state, data, nblocks);
kernel_vector_end();
@@ -32,7 +30,7 @@ static void sha512_blocks(struct sha512_block_state *state,
}
#define sha512_mod_init_arch sha512_mod_init_arch
-static inline void sha512_mod_init_arch(void)
+static void sha512_mod_init_arch(void)
{
if (riscv_isa_extension_available(NULL, ZVKNHB) &&
riscv_isa_extension_available(NULL, ZVKB) &&
diff --git a/lib/crypto/s390/Kconfig b/lib/crypto/s390/Kconfig
deleted file mode 100644
index 069b355fe51a..000000000000
--- a/lib/crypto/s390/Kconfig
+++ /dev/null
@@ -1,7 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config CRYPTO_CHACHA_S390
- tristate
- default CRYPTO_LIB_CHACHA
- select CRYPTO_LIB_CHACHA_GENERIC
- select CRYPTO_ARCH_HAVE_LIB_CHACHA
diff --git a/lib/crypto/s390/Makefile b/lib/crypto/s390/Makefile
deleted file mode 100644
index 06c2cf77178e..000000000000
--- a/lib/crypto/s390/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o
-chacha_s390-y := chacha-glue.o chacha-s390.o
diff --git a/lib/crypto/s390/chacha-glue.c b/lib/crypto/s390/chacha.h
index c57dc851214f..fd9c4a422365 100644
--- a/lib/crypto/s390/chacha-glue.c
+++ b/lib/crypto/s390/chacha.h
@@ -1,32 +1,21 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* ChaCha stream cipher (s390 optimized)
*
* Copyright IBM Corp. 2021
*/
-#define KMSG_COMPONENT "chacha_s390"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
-
-#include <crypto/chacha.h>
#include <linux/cpufeature.h>
#include <linux/export.h>
#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/sizes.h>
#include <asm/fpu.h>
#include "chacha-s390.h"
-void hchacha_block_arch(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds)
-{
- /* TODO: implement hchacha_block_arch() in assembly */
- hchacha_block_generic(state, out, nrounds);
-}
-EXPORT_SYMBOL(hchacha_block_arch);
+#define hchacha_block_arch hchacha_block_generic /* not implemented yet */
-void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
+static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
+ const u8 *src, unsigned int bytes, int nrounds)
{
/* s390 chacha20 implementation has 20 rounds hard-coded,
* it cannot handle a block of data or less, but otherwise
@@ -45,13 +34,3 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
CHACHA_BLOCK_SIZE;
}
}
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-bool chacha_is_arch_optimized(void)
-{
- return cpu_has_vx();
-}
-EXPORT_SYMBOL(chacha_is_arch_optimized);
-
-MODULE_DESCRIPTION("ChaCha stream cipher (s390 optimized)");
-MODULE_LICENSE("GPL v2");
diff --git a/lib/crypto/s390/sha1.h b/lib/crypto/s390/sha1.h
index 08bd138e881c..73d94476a157 100644
--- a/lib/crypto/s390/sha1.h
+++ b/lib/crypto/s390/sha1.h
@@ -20,7 +20,7 @@ static void sha1_blocks(struct sha1_block_state *state,
}
#define sha1_mod_init_arch sha1_mod_init_arch
-static inline void sha1_mod_init_arch(void)
+static void sha1_mod_init_arch(void)
{
if (cpu_have_feature(S390_CPU_FEATURE_MSA) &&
cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_1))
diff --git a/lib/crypto/s390/sha256.h b/lib/crypto/s390/sha256.h
index 70a81cbc06b2..acd483508789 100644
--- a/lib/crypto/s390/sha256.h
+++ b/lib/crypto/s390/sha256.h
@@ -20,7 +20,7 @@ static void sha256_blocks(struct sha256_block_state *state,
}
#define sha256_mod_init_arch sha256_mod_init_arch
-static inline void sha256_mod_init_arch(void)
+static void sha256_mod_init_arch(void)
{
if (cpu_have_feature(S390_CPU_FEATURE_MSA) &&
cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_256))
diff --git a/lib/crypto/s390/sha512.h b/lib/crypto/s390/sha512.h
index 24744651550c..46699d43df7e 100644
--- a/lib/crypto/s390/sha512.h
+++ b/lib/crypto/s390/sha512.h
@@ -20,7 +20,7 @@ static void sha512_blocks(struct sha512_block_state *state,
}
#define sha512_mod_init_arch sha512_mod_init_arch
-static inline void sha512_mod_init_arch(void)
+static void sha512_mod_init_arch(void)
{
if (cpu_have_feature(S390_CPU_FEATURE_MSA) &&
cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_512))
diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c
index 8fa15165d23e..881b935418ce 100644
--- a/lib/crypto/sha256.c
+++ b/lib/crypto/sha256.c
@@ -25,13 +25,20 @@ static const struct sha256_block_state sha224_iv = {
},
};
-static const struct sha256_block_state sha256_iv = {
- .h = {
- SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3,
- SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7,
+static const struct sha256_ctx initial_sha256_ctx = {
+ .ctx = {
+ .state = {
+ .h = {
+ SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3,
+ SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7,
+ },
+ },
+ .bytecount = 0,
},
};
+#define sha256_iv (initial_sha256_ctx.ctx.state)
+
static const u32 sha256_K[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
@@ -261,8 +268,62 @@ void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE])
}
EXPORT_SYMBOL(sha256);
-/* pre-boot environment (as indicated by __DISABLE_EXPORTS) doesn't need HMAC */
+/*
+ * Pre-boot environment (as indicated by __DISABLE_EXPORTS being defined)
+ * doesn't need either HMAC support or interleaved hashing support
+ */
#ifndef __DISABLE_EXPORTS
+
+#ifndef sha256_finup_2x_arch
+static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
+ const u8 *data1, const u8 *data2, size_t len,
+ u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE])
+{
+ return false;
+}
+static bool sha256_finup_2x_is_optimized_arch(void)
+{
+ return false;
+}
+#endif
+
+/* Sequential fallback implementation of sha256_finup_2x() */
+static noinline_for_stack void sha256_finup_2x_sequential(
+ const struct __sha256_ctx *ctx, const u8 *data1, const u8 *data2,
+ size_t len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE])
+{
+ struct __sha256_ctx mut_ctx;
+
+ mut_ctx = *ctx;
+ __sha256_update(&mut_ctx, data1, len);
+ __sha256_final(&mut_ctx, out1, SHA256_DIGEST_SIZE);
+
+ mut_ctx = *ctx;
+ __sha256_update(&mut_ctx, data2, len);
+ __sha256_final(&mut_ctx, out2, SHA256_DIGEST_SIZE);
+}
+
+void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1,
+ const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE])
+{
+ if (ctx == NULL)
+ ctx = &initial_sha256_ctx;
+
+ if (likely(sha256_finup_2x_arch(&ctx->ctx, data1, data2, len, out1,
+ out2)))
+ return;
+ sha256_finup_2x_sequential(&ctx->ctx, data1, data2, len, out1, out2);
+}
+EXPORT_SYMBOL_GPL(sha256_finup_2x);
+
+bool sha256_finup_2x_is_optimized(void)
+{
+ return sha256_finup_2x_is_optimized_arch();
+}
+EXPORT_SYMBOL_GPL(sha256_finup_2x_is_optimized);
+
static void __hmac_sha256_preparekey(struct sha256_block_state *istate,
struct sha256_block_state *ostate,
const u8 *raw_key, size_t raw_key_len,
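[Editorial sketch, not part of the patch] A brief usage example for the interleaved-finish helper introduced above; every call here (sha256_init/sha256_update/sha256_finup_2x) appears elsewhere in this patch, and only the example_* wrapper name is invented.

/*
 * Illustrative only: hash two equal-length messages that share a salted
 * prefix.  sha256_finup_2x() leaves the passed-in context untouched and
 * falls back to the sequential path when no optimized two-way
 * implementation is available; a NULL ctx means an empty prefix.
 */
static void example_hash_pair(const u8 *salt, size_t salt_len,
			      const u8 *msg1, const u8 *msg2, size_t len,
			      u8 out1[SHA256_DIGEST_SIZE],
			      u8 out2[SHA256_DIGEST_SIZE])
{
	struct sha256_ctx ctx;

	sha256_init(&ctx);
	sha256_update(&ctx, salt, salt_len);
	sha256_finup_2x(&ctx, msg1, msg2, len, out1, out2);
}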
diff --git a/lib/crypto/sparc/md5.h b/lib/crypto/sparc/md5.h
new file mode 100644
index 000000000000..3995f3e075eb
--- /dev/null
+++ b/lib/crypto/sparc/md5.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * MD5 accelerated using the sparc64 crypto opcodes
+ *
+ * Copyright (c) Alan Smithee.
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
+ * Copyright (c) Mathias Krause <minipli@googlemail.com>
+ * Copyright (c) Cryptoapi developers.
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ */
+
+#include <asm/elf.h>
+#include <asm/opcodes.h>
+#include <asm/pstate.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_md5_opcodes);
+
+asmlinkage void md5_sparc64_transform(struct md5_block_state *state,
+ const u8 *data, size_t nblocks);
+
+static void md5_blocks(struct md5_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_md5_opcodes)) {
+ cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
+ md5_sparc64_transform(state, data, nblocks);
+ le32_to_cpu_array(state->h, ARRAY_SIZE(state->h));
+ } else {
+ md5_blocks_generic(state, data, nblocks);
+ }
+}
+
+#define md5_mod_init_arch md5_mod_init_arch
+static void md5_mod_init_arch(void)
+{
+ unsigned long cfr;
+
+ if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+ return;
+
+ __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+ if (!(cfr & CFR_MD5))
+ return;
+
+ static_branch_enable(&have_md5_opcodes);
+ pr_info("Using sparc64 md5 opcode optimized MD5 implementation\n");
+}
diff --git a/arch/sparc/crypto/md5_asm.S b/lib/crypto/sparc/md5_asm.S
index 60b544e4d205..60b544e4d205 100644
--- a/arch/sparc/crypto/md5_asm.S
+++ b/lib/crypto/sparc/md5_asm.S
diff --git a/lib/crypto/sparc/sha1.h b/lib/crypto/sparc/sha1.h
index 5015f93584b7..bdf771fcc1f7 100644
--- a/lib/crypto/sparc/sha1.h
+++ b/lib/crypto/sparc/sha1.h
@@ -27,7 +27,7 @@ static void sha1_blocks(struct sha1_block_state *state,
}
#define sha1_mod_init_arch sha1_mod_init_arch
-static inline void sha1_mod_init_arch(void)
+static void sha1_mod_init_arch(void)
{
unsigned long cfr;
diff --git a/lib/crypto/sparc/sha256.h b/lib/crypto/sparc/sha256.h
index 1d10108eb195..b2f4419ec778 100644
--- a/lib/crypto/sparc/sha256.h
+++ b/lib/crypto/sparc/sha256.h
@@ -27,7 +27,7 @@ static void sha256_blocks(struct sha256_block_state *state,
}
#define sha256_mod_init_arch sha256_mod_init_arch
-static inline void sha256_mod_init_arch(void)
+static void sha256_mod_init_arch(void)
{
unsigned long cfr;
diff --git a/lib/crypto/sparc/sha512.h b/lib/crypto/sparc/sha512.h
index 55303ab6b15f..a8c37a7d4c39 100644
--- a/lib/crypto/sparc/sha512.h
+++ b/lib/crypto/sparc/sha512.h
@@ -26,7 +26,7 @@ static void sha512_blocks(struct sha512_block_state *state,
}
#define sha512_mod_init_arch sha512_mod_init_arch
-static inline void sha512_mod_init_arch(void)
+static void sha512_mod_init_arch(void)
{
unsigned long cfr;
diff --git a/lib/crypto/tests/Kconfig b/lib/crypto/tests/Kconfig
index de7e8babb6af..578af717e13a 100644
--- a/lib/crypto/tests/Kconfig
+++ b/lib/crypto/tests/Kconfig
@@ -1,5 +1,34 @@
# SPDX-License-Identifier: GPL-2.0-or-later
+config CRYPTO_LIB_BLAKE2S_KUNIT_TEST
+ tristate "KUnit tests for BLAKE2s" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+ select CRYPTO_LIB_BENCHMARK_VISIBLE
+ # No need to select CRYPTO_LIB_BLAKE2S here, as that option doesn't
+ # exist; the BLAKE2s code is always built-in for the /dev/random driver.
+ help
+ KUnit tests for the BLAKE2s cryptographic hash function.
+
+config CRYPTO_LIB_CURVE25519_KUNIT_TEST
+ tristate "KUnit tests for Curve25519" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+ select CRYPTO_LIB_BENCHMARK_VISIBLE
+ select CRYPTO_LIB_CURVE25519
+ help
+ KUnit tests for the Curve25519 Diffie-Hellman function.
+
+config CRYPTO_LIB_MD5_KUNIT_TEST
+ tristate "KUnit tests for MD5" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+ select CRYPTO_LIB_BENCHMARK_VISIBLE
+ select CRYPTO_LIB_MD5
+ help
+ KUnit tests for the MD5 cryptographic hash function and its
+ corresponding HMAC.
+
config CRYPTO_LIB_POLY1305_KUNIT_TEST
tristate "KUnit tests for Poly1305" if !KUNIT_ALL_TESTS
depends on KUNIT
diff --git a/lib/crypto/tests/Makefile b/lib/crypto/tests/Makefile
index 8601dccd6fdd..a71fad19922b 100644
--- a/lib/crypto/tests/Makefile
+++ b/lib/crypto/tests/Makefile
@@ -1,5 +1,8 @@
# SPDX-License-Identifier: GPL-2.0-or-later
+obj-$(CONFIG_CRYPTO_LIB_BLAKE2S_KUNIT_TEST) += blake2s_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_CURVE25519_KUNIT_TEST) += curve25519_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_MD5_KUNIT_TEST) += md5_kunit.o
obj-$(CONFIG_CRYPTO_LIB_POLY1305_KUNIT_TEST) += poly1305_kunit.o
obj-$(CONFIG_CRYPTO_LIB_SHA1_KUNIT_TEST) += sha1_kunit.o
obj-$(CONFIG_CRYPTO_LIB_SHA256_KUNIT_TEST) += sha224_kunit.o sha256_kunit.o
diff --git a/lib/crypto/tests/blake2s-testvecs.h b/lib/crypto/tests/blake2s-testvecs.h
new file mode 100644
index 000000000000..6f978b79a59b
--- /dev/null
+++ b/lib/crypto/tests/blake2s-testvecs.h
@@ -0,0 +1,238 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py blake2s */
+
+static const struct {
+ size_t data_len;
+ u8 digest[BLAKE2S_HASH_SIZE];
+} hash_testvecs[] = {
+ {
+ .data_len = 0,
+ .digest = {
+ 0x69, 0x21, 0x7a, 0x30, 0x79, 0x90, 0x80, 0x94,
+ 0xe1, 0x11, 0x21, 0xd0, 0x42, 0x35, 0x4a, 0x7c,
+ 0x1f, 0x55, 0xb6, 0x48, 0x2c, 0xa1, 0xa5, 0x1e,
+ 0x1b, 0x25, 0x0d, 0xfd, 0x1e, 0xd0, 0xee, 0xf9,
+ },
+ },
+ {
+ .data_len = 1,
+ .digest = {
+ 0x7c, 0xab, 0x53, 0xe2, 0x48, 0x87, 0xdf, 0x64,
+ 0x98, 0x6a, 0xc1, 0x7e, 0xf0, 0x01, 0x4d, 0xc9,
+ 0x07, 0x4f, 0xb8, 0x2f, 0x46, 0xd7, 0xee, 0xa9,
+ 0xad, 0xe5, 0xf8, 0x21, 0xac, 0xfe, 0x17, 0x58,
+ },
+ },
+ {
+ .data_len = 2,
+ .digest = {
+ 0x5e, 0x63, 0x2c, 0xd0, 0xf8, 0x7b, 0xf5, 0xae,
+ 0x61, 0x97, 0x94, 0x57, 0xc8, 0x76, 0x22, 0xd9,
+ 0x8b, 0x04, 0x5e, 0xf1, 0x5d, 0xd0, 0xfc, 0xd9,
+ 0x0c, 0x19, 0x2e, 0xe2, 0xc5, 0xd9, 0x73, 0x51,
+ },
+ },
+ {
+ .data_len = 3,
+ .digest = {
+ 0x33, 0x65, 0xa6, 0x37, 0xbf, 0xf8, 0x4f, 0x15,
+ 0x4c, 0xac, 0x9e, 0xa4, 0x3b, 0x02, 0x07, 0x0c,
+ 0x80, 0x86, 0x0d, 0x6c, 0xe4, 0xaf, 0x1c, 0xbc,
+ 0x0b, 0x9c, 0x0a, 0x98, 0xc2, 0x99, 0x71, 0xcd,
+ },
+ },
+ {
+ .data_len = 16,
+ .digest = {
+ 0x59, 0xd2, 0x10, 0xd3, 0x75, 0xac, 0x48, 0x32,
+ 0xb1, 0xea, 0xee, 0xcf, 0x0a, 0xd2, 0x8b, 0x15,
+ 0x5d, 0x72, 0x71, 0x4c, 0xa7, 0x29, 0xb0, 0x7a,
+ 0x44, 0x48, 0x8a, 0x54, 0x54, 0x54, 0x41, 0xf5,
+ },
+ },
+ {
+ .data_len = 32,
+ .digest = {
+ 0xdc, 0xfc, 0x46, 0x81, 0xc6, 0x1b, 0x2b, 0x47,
+ 0x8b, 0xed, 0xe0, 0x73, 0x34, 0x38, 0x53, 0x92,
+ 0x97, 0x2f, 0xfb, 0x51, 0xab, 0x4f, 0x2d, 0x9d,
+ 0x69, 0x04, 0xa9, 0x5d, 0x33, 0xef, 0xcb, 0x1c,
+ },
+ },
+ {
+ .data_len = 48,
+ .digest = {
+ 0xd6, 0x2a, 0x7f, 0x96, 0x04, 0x4d, 0x16, 0xc8,
+ 0x49, 0xe0, 0x37, 0x33, 0xe3, 0x7b, 0x34, 0x56,
+ 0x99, 0xc5, 0x78, 0x57, 0x06, 0x02, 0xb4, 0xea,
+ 0x80, 0xc4, 0xf8, 0x8f, 0x8d, 0x2b, 0xe4, 0x05,
+ },
+ },
+ {
+ .data_len = 49,
+ .digest = {
+ 0x8b, 0x58, 0x62, 0xb5, 0x85, 0xf6, 0x83, 0x36,
+ 0xf5, 0x34, 0xb8, 0xd4, 0xbc, 0x5c, 0x8b, 0x38,
+ 0xfd, 0x15, 0xcd, 0x44, 0x83, 0x25, 0x71, 0xe1,
+ 0xd5, 0xe8, 0xa1, 0xa4, 0x36, 0x98, 0x7e, 0x68,
+ },
+ },
+ {
+ .data_len = 63,
+ .digest = {
+ 0x7e, 0xeb, 0x06, 0x87, 0xdf, 0x1a, 0xdc, 0xe5,
+ 0xfb, 0x64, 0xd4, 0xd1, 0x5d, 0x9e, 0x75, 0xc0,
+ 0xb9, 0xad, 0x55, 0x6c, 0xe6, 0xba, 0x4d, 0x98,
+ 0x2f, 0xbf, 0x72, 0xad, 0x61, 0x37, 0xf6, 0x11,
+ },
+ },
+ {
+ .data_len = 64,
+ .digest = {
+ 0x72, 0xdb, 0x43, 0x16, 0x57, 0x8e, 0x3a, 0x96,
+ 0xf3, 0x98, 0x19, 0x24, 0x17, 0x3b, 0xe8, 0xad,
+ 0xa1, 0x9b, 0xa4, 0x1b, 0x74, 0x85, 0x2e, 0x24,
+ 0x70, 0xea, 0x31, 0x5a, 0x1c, 0xbe, 0x43, 0xb5,
+ },
+ },
+ {
+ .data_len = 65,
+ .digest = {
+ 0x32, 0x48, 0xb0, 0xf0, 0x3f, 0xbb, 0xd2, 0xa3,
+ 0xfd, 0xf6, 0x28, 0x4a, 0x2a, 0xc5, 0xbe, 0x4b,
+ 0x73, 0x50, 0x63, 0xd6, 0x16, 0x00, 0xef, 0xed,
+ 0xfe, 0x97, 0x41, 0x29, 0xb2, 0x84, 0xc4, 0xa3,
+ },
+ },
+ {
+ .data_len = 127,
+ .digest = {
+ 0x17, 0xda, 0x6b, 0x96, 0x6a, 0xa6, 0xa4, 0xa6,
+ 0xa6, 0xf3, 0x9d, 0x18, 0x19, 0x8d, 0x98, 0x7c,
+ 0x66, 0x38, 0xe8, 0x99, 0xe7, 0x0a, 0x50, 0x92,
+ 0xaf, 0x11, 0x80, 0x05, 0x66, 0xed, 0xab, 0x74,
+ },
+ },
+ {
+ .data_len = 128,
+ .digest = {
+ 0x13, 0xd5, 0x8b, 0x22, 0xae, 0x90, 0x7b, 0x67,
+ 0x87, 0x4e, 0x3c, 0x35, 0x4e, 0x01, 0xf0, 0xb1,
+ 0xd3, 0xd1, 0x67, 0xbb, 0x43, 0xdb, 0x7c, 0x75,
+ 0xa4, 0xc7, 0x64, 0x83, 0x1e, 0x9b, 0x98, 0xad,
+ },
+ },
+ {
+ .data_len = 129,
+ .digest = {
+ 0x6f, 0xe0, 0x5d, 0x9d, 0xd5, 0x78, 0x29, 0xfb,
+ 0xd0, 0x77, 0xd1, 0x8a, 0xf0, 0x80, 0xcb, 0x81,
+ 0x71, 0x9e, 0x4d, 0x49, 0xde, 0x74, 0x2a, 0x37,
+ 0xc0, 0xd5, 0xf0, 0xfa, 0x50, 0xe6, 0x23, 0xfe,
+ },
+ },
+ {
+ .data_len = 256,
+ .digest = {
+ 0x89, 0xac, 0xf6, 0xe7, 0x5e, 0xba, 0x53, 0xf4,
+ 0x92, 0x32, 0xd5, 0x64, 0xfb, 0xc4, 0x08, 0xac,
+ 0x2c, 0x19, 0x6e, 0x63, 0x13, 0x75, 0xd0, 0x60,
+ 0x54, 0x35, 0x82, 0xc4, 0x6d, 0x03, 0x1a, 0x05,
+ },
+ },
+ {
+ .data_len = 511,
+ .digest = {
+ 0x1c, 0xaf, 0x94, 0x7d, 0x9c, 0xce, 0x57, 0x64,
+ 0xf8, 0xa8, 0x25, 0x45, 0x32, 0x86, 0x2b, 0x04,
+ 0xb3, 0x2e, 0x67, 0xca, 0x73, 0x04, 0x2f, 0xab,
+ 0xcc, 0xda, 0x9e, 0x42, 0xa1, 0xaf, 0x83, 0x5a,
+ },
+ },
+ {
+ .data_len = 513,
+ .digest = {
+ 0x21, 0xdf, 0xdc, 0x29, 0xd9, 0xfc, 0x7b, 0xe7,
+ 0x3a, 0xc4, 0xe1, 0x61, 0xc5, 0xb5, 0xe1, 0xee,
+ 0x7a, 0x9d, 0x0c, 0x66, 0x36, 0x63, 0xe4, 0x12,
+ 0x62, 0xe2, 0xf5, 0x68, 0x72, 0xfc, 0x1e, 0x18,
+ },
+ },
+ {
+ .data_len = 1000,
+ .digest = {
+ 0x6e, 0xc7, 0x2e, 0xac, 0xd0, 0xbb, 0x22, 0xe0,
+ 0xc2, 0x40, 0xb2, 0xfe, 0x8c, 0xaf, 0x9e, 0xcf,
+ 0x32, 0x06, 0xc6, 0x45, 0x29, 0xbd, 0xe0, 0x7f,
+ 0x53, 0x32, 0xc3, 0x2b, 0x2f, 0x68, 0x12, 0xcd,
+ },
+ },
+ {
+ .data_len = 3333,
+ .digest = {
+ 0x76, 0xba, 0x52, 0xb5, 0x09, 0xf5, 0x19, 0x09,
+ 0x70, 0x1c, 0x09, 0x28, 0xb4, 0xaa, 0x98, 0x6a,
+ 0x79, 0xe7, 0x5e, 0xcd, 0xe8, 0xa4, 0x73, 0x69,
+ 0x1f, 0xf8, 0x05, 0x0a, 0xb4, 0xfe, 0xf9, 0x63,
+ },
+ },
+ {
+ .data_len = 4096,
+ .digest = {
+ 0xf7, 0xad, 0xf9, 0xc8, 0x0e, 0x04, 0x2f, 0xdf,
+ 0xbe, 0x39, 0x79, 0x07, 0x0d, 0xd8, 0x1b, 0x06,
+ 0x42, 0x3a, 0x43, 0x93, 0xf6, 0x7c, 0xc4, 0xe5,
+ 0xc2, 0xd5, 0xd0, 0xa6, 0x35, 0x6c, 0xbd, 0x17,
+ },
+ },
+ {
+ .data_len = 4128,
+ .digest = {
+ 0x38, 0xd7, 0xab, 0x7e, 0x08, 0xdc, 0x1e, 0xab,
+ 0x55, 0xbb, 0x3b, 0x7b, 0x6a, 0x17, 0xcc, 0x79,
+ 0xa7, 0x02, 0x62, 0x66, 0x9b, 0xca, 0xee, 0xc0,
+ 0x3d, 0x75, 0x34, 0x2e, 0x55, 0x82, 0x26, 0x3c,
+ },
+ },
+ {
+ .data_len = 4160,
+ .digest = {
+ 0xf7, 0xeb, 0x2f, 0x24, 0x98, 0x54, 0x04, 0x5a,
+ 0x19, 0xe4, 0x12, 0x9d, 0x97, 0xbc, 0x87, 0xa5,
+ 0x0b, 0x85, 0x29, 0xa1, 0x36, 0x89, 0xc9, 0xba,
+ 0xa0, 0xe0, 0xac, 0x99, 0x7d, 0xa4, 0x51, 0x9f,
+ },
+ },
+ {
+ .data_len = 4224,
+ .digest = {
+ 0x8f, 0xe8, 0xa7, 0x79, 0x02, 0xbb, 0x4a, 0x56,
+ 0x66, 0x91, 0xef, 0x22, 0xd1, 0x09, 0x26, 0x6c,
+ 0xa9, 0x13, 0xd7, 0x44, 0xc7, 0x19, 0x9c, 0x0b,
+ 0xfb, 0x4f, 0xca, 0x72, 0x8f, 0x34, 0xf7, 0x82,
+ },
+ },
+ {
+ .data_len = 16384,
+ .digest = {
+ 0xaa, 0x21, 0xbb, 0x25, 0x4b, 0x66, 0x6e, 0x29,
+ 0x71, 0xc1, 0x44, 0x67, 0x19, 0xed, 0xe6, 0xe6,
+ 0x61, 0x13, 0xf4, 0xb7, 0x02, 0x94, 0x81, 0x0f,
+ 0xa7, 0x4d, 0xbb, 0x2c, 0xb8, 0xeb, 0x41, 0x0e,
+ },
+ },
+};
+
+static const u8 hash_testvec_consolidated[BLAKE2S_HASH_SIZE] = {
+ 0x84, 0x21, 0xbb, 0x73, 0x64, 0x47, 0x45, 0xe0,
+ 0xc1, 0x83, 0x78, 0xf1, 0xea, 0xe5, 0xfd, 0xdb,
+ 0x01, 0xda, 0xb7, 0x86, 0x70, 0x3b, 0x83, 0xb3,
+ 0xbc, 0xd9, 0xfd, 0x96, 0xbd, 0x50, 0x06, 0x67,
+};
+
+static const u8 blake2s_keyed_testvec_consolidated[BLAKE2S_HASH_SIZE] = {
+ 0xa6, 0xad, 0xcd, 0xb8, 0xd9, 0xdd, 0xc7, 0x70,
+ 0x07, 0x09, 0x7f, 0x9f, 0x41, 0xa9, 0x70, 0xa4,
+ 0x1c, 0xca, 0x61, 0xbb, 0x58, 0xb5, 0xb2, 0x1d,
+ 0xd1, 0x71, 0x16, 0xb0, 0x49, 0x4f, 0x9e, 0x1b,
+};
diff --git a/lib/crypto/tests/blake2s_kunit.c b/lib/crypto/tests/blake2s_kunit.c
new file mode 100644
index 000000000000..057c40132246
--- /dev/null
+++ b/lib/crypto/tests/blake2s_kunit.c
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/blake2s.h>
+#include "blake2s-testvecs.h"
+
+/*
+ * The following are compatibility functions that present BLAKE2s as an unkeyed
+ * hash function that produces hashes of fixed length BLAKE2S_HASH_SIZE, so that
+ * hash-test-template.h can be reused to test it.
+ */
+
+static void blake2s_default(const u8 *data, size_t len,
+ u8 out[BLAKE2S_HASH_SIZE])
+{
+ blake2s(out, data, NULL, BLAKE2S_HASH_SIZE, len, 0);
+}
+
+static void blake2s_init_default(struct blake2s_state *state)
+{
+ blake2s_init(state, BLAKE2S_HASH_SIZE);
+}
+
+/*
+ * Generate the HASH_KUNIT_CASES using hash-test-template.h. These test BLAKE2s
+ * with a key length of 0 and a hash length of BLAKE2S_HASH_SIZE.
+ */
+#define HASH blake2s_default
+#define HASH_CTX blake2s_state
+#define HASH_SIZE BLAKE2S_HASH_SIZE
+#define HASH_INIT blake2s_init_default
+#define HASH_UPDATE blake2s_update
+#define HASH_FINAL blake2s_final
+#include "hash-test-template.h"
+
+/*
+ * BLAKE2s specific test case which tests all possible combinations of key
+ * length and hash length.
+ */
+static void test_blake2s_all_key_and_hash_lens(struct kunit *test)
+{
+ const size_t data_len = 100;
+ u8 *data = &test_buf[0];
+ u8 *key = data + data_len;
+ u8 *hash = key + BLAKE2S_KEY_SIZE;
+ struct blake2s_state main_state;
+ u8 main_hash[BLAKE2S_HASH_SIZE];
+
+ rand_bytes_seeded_from_len(data, data_len);
+ blake2s_init(&main_state, BLAKE2S_HASH_SIZE);
+ for (int key_len = 0; key_len <= BLAKE2S_KEY_SIZE; key_len++) {
+ rand_bytes_seeded_from_len(key, key_len);
+ for (int out_len = 1; out_len <= BLAKE2S_HASH_SIZE; out_len++) {
+ blake2s(hash, data, key, out_len, data_len, key_len);
+ blake2s_update(&main_state, hash, out_len);
+ }
+ }
+ blake2s_final(&main_state, main_hash);
+ KUNIT_ASSERT_MEMEQ(test, main_hash, blake2s_keyed_testvec_consolidated,
+ BLAKE2S_HASH_SIZE);
+}
+
+/*
+ * BLAKE2s specific test case which tests using a guarded buffer for all allowed
+ * key lengths. Also tests both blake2s() and blake2s_init_key().
+ */
+static void test_blake2s_with_guarded_key_buf(struct kunit *test)
+{
+ const size_t data_len = 100;
+
+ rand_bytes(test_buf, data_len);
+ for (int key_len = 0; key_len <= BLAKE2S_KEY_SIZE; key_len++) {
+ u8 key[BLAKE2S_KEY_SIZE];
+ u8 *guarded_key = &test_buf[TEST_BUF_LEN - key_len];
+ u8 hash1[BLAKE2S_HASH_SIZE];
+ u8 hash2[BLAKE2S_HASH_SIZE];
+ struct blake2s_state state;
+
+ rand_bytes(key, key_len);
+ memcpy(guarded_key, key, key_len);
+
+ blake2s(hash1, test_buf, key,
+ BLAKE2S_HASH_SIZE, data_len, key_len);
+ blake2s(hash2, test_buf, guarded_key,
+ BLAKE2S_HASH_SIZE, data_len, key_len);
+ KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2S_HASH_SIZE);
+
+ blake2s_init_key(&state, BLAKE2S_HASH_SIZE,
+ guarded_key, key_len);
+ blake2s_update(&state, test_buf, data_len);
+ blake2s_final(&state, hash2);
+ KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2S_HASH_SIZE);
+ }
+}
+
+/*
+ * BLAKE2s specific test case which tests using a guarded output buffer for all
+ * allowed output lengths.
+ */
+static void test_blake2s_with_guarded_out_buf(struct kunit *test)
+{
+ const size_t data_len = 100;
+
+ rand_bytes(test_buf, data_len);
+ for (int out_len = 1; out_len <= BLAKE2S_HASH_SIZE; out_len++) {
+ u8 hash[BLAKE2S_HASH_SIZE];
+ u8 *guarded_hash = &test_buf[TEST_BUF_LEN - out_len];
+
+ blake2s(hash, test_buf, NULL, out_len, data_len, 0);
+ blake2s(guarded_hash, test_buf, NULL, out_len, data_len, 0);
+ KUNIT_ASSERT_MEMEQ(test, hash, guarded_hash, out_len);
+ }
+}
+
+static struct kunit_case blake2s_test_cases[] = {
+ HASH_KUNIT_CASES,
+ KUNIT_CASE(test_blake2s_all_key_and_hash_lens),
+ KUNIT_CASE(test_blake2s_with_guarded_key_buf),
+ KUNIT_CASE(test_blake2s_with_guarded_out_buf),
+ KUNIT_CASE(benchmark_hash),
+ {},
+};
+
+static struct kunit_suite blake2s_test_suite = {
+ .name = "blake2s",
+ .test_cases = blake2s_test_cases,
+ .suite_init = hash_suite_init,
+ .suite_exit = hash_suite_exit,
+};
+kunit_test_suite(blake2s_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for BLAKE2s");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/curve25519-selftest.c b/lib/crypto/tests/curve25519_kunit.c
index c85e85381e78..248d05f66b35 100644
--- a/lib/crypto/curve25519-selftest.c
+++ b/lib/crypto/tests/curve25519_kunit.c
@@ -4,6 +4,8 @@
*/
#include <crypto/curve25519.h>
+#include <kunit/test.h>
+#include <linux/timekeeping.h>
struct curve25519_test_vector {
u8 private[CURVE25519_KEY_SIZE];
@@ -11,7 +13,7 @@ struct curve25519_test_vector {
u8 result[CURVE25519_KEY_SIZE];
bool valid;
};
-static const struct curve25519_test_vector curve25519_test_vectors[] __initconst = {
+static const struct curve25519_test_vector curve25519_test_vectors[] = {
{
.private = { 0x77, 0x07, 0x6d, 0x0a, 0x73, 0x18, 0xa5, 0x7d,
0x3c, 0x16, 0xc1, 0x72, 0x51, 0xb2, 0x66, 0x45,
@@ -1280,42 +1282,82 @@ static const struct curve25519_test_vector curve25519_test_vectors[] __initconst
}
};
-bool __init curve25519_selftest(void)
+static void test_curve25519(struct kunit *test)
{
- bool success = true, ret, ret2;
- size_t i = 0, j;
- u8 in[CURVE25519_KEY_SIZE];
- u8 out[CURVE25519_KEY_SIZE], out2[CURVE25519_KEY_SIZE],
- out3[CURVE25519_KEY_SIZE];
+ for (size_t i = 0; i < ARRAY_SIZE(curve25519_test_vectors); ++i) {
+ const struct curve25519_test_vector *vec =
+ &curve25519_test_vectors[i];
+ u8 out[CURVE25519_KEY_SIZE] = {};
+ bool ret;
- for (i = 0; i < ARRAY_SIZE(curve25519_test_vectors); ++i) {
- memset(out, 0, CURVE25519_KEY_SIZE);
- ret = curve25519(out, curve25519_test_vectors[i].private,
- curve25519_test_vectors[i].public);
- if (ret != curve25519_test_vectors[i].valid ||
- memcmp(out, curve25519_test_vectors[i].result,
- CURVE25519_KEY_SIZE)) {
- pr_err("curve25519 self-test %zu: FAIL\n", i + 1);
- success = false;
- }
+ ret = curve25519(out, vec->private, vec->public);
+ KUNIT_EXPECT_EQ_MSG(test, ret, vec->valid,
+ "Wrong return value with test vector %zu",
+ i);
+ KUNIT_EXPECT_MEMEQ_MSG(test, out, vec->result, sizeof(out),
+ "Wrong output with test vector %zu", i);
}
+}
+
+static void test_curve25519_basepoint(struct kunit *test)
+{
+ for (size_t i = 0; i < 5; ++i) {
+ u8 in[CURVE25519_KEY_SIZE];
+ u8 out[CURVE25519_KEY_SIZE];
+ u8 out2[CURVE25519_KEY_SIZE];
+ bool ret, ret2;
- for (i = 0; i < 5; ++i) {
get_random_bytes(in, sizeof(in));
ret = curve25519_generate_public(out, in);
ret2 = curve25519(out2, in, (u8[CURVE25519_KEY_SIZE]){ 9 });
- curve25519_generic(out3, in, (u8[CURVE25519_KEY_SIZE]){ 9 });
- if (ret != ret2 ||
- memcmp(out, out2, CURVE25519_KEY_SIZE) ||
- memcmp(out, out3, CURVE25519_KEY_SIZE)) {
- pr_err("curve25519 basepoint self-test %zu: FAIL: input - 0x",
- i + 1);
- for (j = CURVE25519_KEY_SIZE; j-- > 0;)
- printk(KERN_CONT "%02x", in[j]);
- printk(KERN_CONT "\n");
- success = false;
- }
+ KUNIT_EXPECT_EQ_MSG(test, ret, ret2,
+ "in=%*phN", CURVE25519_KEY_SIZE, in);
+ KUNIT_EXPECT_MEMEQ_MSG(test, out, out2, CURVE25519_KEY_SIZE,
+ "in=%*phN", CURVE25519_KEY_SIZE, in);
}
+}
+
+static void benchmark_curve25519(struct kunit *test)
+{
+ const u8 *private = curve25519_test_vectors[0].private;
+ const u8 *public = curve25519_test_vectors[0].public;
+ const size_t warmup_niter = 5000;
+ const size_t benchmark_niter = 1024;
+ u8 out[CURVE25519_KEY_SIZE];
+ bool ok = true;
+ u64 t;
+
+ if (!IS_ENABLED(CONFIG_CRYPTO_LIB_BENCHMARK))
+ kunit_skip(test, "not enabled");
- return success;
+ /* Warm-up */
+ for (size_t i = 0; i < warmup_niter; i++)
+ ok &= curve25519(out, private, public);
+
+ /* Benchmark */
+ preempt_disable();
+ t = ktime_get_ns();
+ for (size_t i = 0; i < benchmark_niter; i++)
+ ok &= curve25519(out, private, public);
+ t = ktime_get_ns() - t;
+ preempt_enable();
+ KUNIT_EXPECT_TRUE(test, ok);
+ kunit_info(test, "%llu ops/s",
+ div64_u64((u64)benchmark_niter * NSEC_PER_SEC, t ?: 1));
}
+
+static struct kunit_case curve25519_test_cases[] = {
+ KUNIT_CASE(test_curve25519),
+ KUNIT_CASE(test_curve25519_basepoint),
+ KUNIT_CASE(benchmark_curve25519),
+ {},
+};
+
+static struct kunit_suite curve25519_test_suite = {
+ .name = "curve25519",
+ .test_cases = curve25519_test_cases,
+};
+kunit_test_suite(curve25519_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for Curve25519");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/hash-test-template.h b/lib/crypto/tests/hash-test-template.h
index f437a0a9ac6c..61b43e62779f 100644
--- a/lib/crypto/tests/hash-test-template.h
+++ b/lib/crypto/tests/hash-test-template.h
@@ -5,11 +5,9 @@
*
* Copyright 2025 Google LLC
*/
+#include <kunit/run-in-irq-context.h>
#include <kunit/test.h>
-#include <linux/hrtimer.h>
-#include <linux/timekeeping.h>
#include <linux/vmalloc.h>
-#include <linux/workqueue.h>
/* test_buf is a guarded buffer, i.e. &test_buf[TEST_BUF_LEN] is not mapped. */
#define TEST_BUF_LEN 16384
@@ -319,119 +317,6 @@ static void test_hash_ctx_zeroization(struct kunit *test)
"Hash context was not zeroized by finalization");
}
-#define IRQ_TEST_HRTIMER_INTERVAL us_to_ktime(5)
-
-struct hash_irq_test_state {
- bool (*func)(void *test_specific_state);
- void *test_specific_state;
- bool task_func_reported_failure;
- bool hardirq_func_reported_failure;
- bool softirq_func_reported_failure;
- unsigned long hardirq_func_calls;
- unsigned long softirq_func_calls;
- struct hrtimer timer;
- struct work_struct bh_work;
-};
-
-static enum hrtimer_restart hash_irq_test_timer_func(struct hrtimer *timer)
-{
- struct hash_irq_test_state *state =
- container_of(timer, typeof(*state), timer);
-
- WARN_ON_ONCE(!in_hardirq());
- state->hardirq_func_calls++;
-
- if (!state->func(state->test_specific_state))
- state->hardirq_func_reported_failure = true;
-
- hrtimer_forward_now(&state->timer, IRQ_TEST_HRTIMER_INTERVAL);
- queue_work(system_bh_wq, &state->bh_work);
- return HRTIMER_RESTART;
-}
-
-static void hash_irq_test_bh_work_func(struct work_struct *work)
-{
- struct hash_irq_test_state *state =
- container_of(work, typeof(*state), bh_work);
-
- WARN_ON_ONCE(!in_serving_softirq());
- state->softirq_func_calls++;
-
- if (!state->func(state->test_specific_state))
- state->softirq_func_reported_failure = true;
-}
-
-/*
- * Helper function which repeatedly runs the given @func in task, softirq, and
- * hardirq context concurrently, and reports a failure to KUnit if any
- * invocation of @func in any context returns false. @func is passed
- * @test_specific_state as its argument. At most 3 invocations of @func will
- * run concurrently: one in each of task, softirq, and hardirq context.
- *
- * The main purpose of this interrupt context testing is to validate fallback
- * code paths that run in contexts where the normal code path cannot be used,
- * typically due to the FPU or vector registers already being in-use in kernel
- * mode. These code paths aren't covered when the test code is executed only by
- * the KUnit test runner thread in task context. The reason for the concurrency
- * is because merely using hardirq context is not sufficient to reach a fallback
- * code path on some architectures; the hardirq actually has to occur while the
- * FPU or vector unit was already in-use in kernel mode.
- *
- * Another purpose of this testing is to detect issues with the architecture's
- * irq_fpu_usable() and kernel_fpu_begin/end() or equivalent functions,
- * especially in softirq context when the softirq may have interrupted a task
- * already using kernel-mode FPU or vector (if the arch didn't prevent that).
- * Crypto functions are often executed in softirqs, so this is important.
- */
-static void run_irq_test(struct kunit *test, bool (*func)(void *),
- int max_iterations, void *test_specific_state)
-{
- struct hash_irq_test_state state = {
- .func = func,
- .test_specific_state = test_specific_state,
- };
- unsigned long end_jiffies;
-
- /*
- * Set up a hrtimer (the way we access hardirq context) and a work
- * struct for the BH workqueue (the way we access softirq context).
- */
- hrtimer_setup_on_stack(&state.timer, hash_irq_test_timer_func,
- CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
- INIT_WORK_ONSTACK(&state.bh_work, hash_irq_test_bh_work_func);
-
- /* Run for up to max_iterations or 1 second, whichever comes first. */
- end_jiffies = jiffies + HZ;
- hrtimer_start(&state.timer, IRQ_TEST_HRTIMER_INTERVAL,
- HRTIMER_MODE_REL_HARD);
- for (int i = 0; i < max_iterations && !time_after(jiffies, end_jiffies);
- i++) {
- if (!func(test_specific_state))
- state.task_func_reported_failure = true;
- }
-
- /* Cancel the timer and work. */
- hrtimer_cancel(&state.timer);
- flush_work(&state.bh_work);
-
- /* Sanity check: the timer and BH functions should have been run. */
- KUNIT_EXPECT_GT_MSG(test, state.hardirq_func_calls, 0,
- "Timer function was not called");
- KUNIT_EXPECT_GT_MSG(test, state.softirq_func_calls, 0,
- "BH work function was not called");
-
- /* Check for incorrect hash values reported from any context. */
- KUNIT_EXPECT_FALSE_MSG(
- test, state.task_func_reported_failure,
- "Incorrect hash values reported from task context");
- KUNIT_EXPECT_FALSE_MSG(
- test, state.hardirq_func_reported_failure,
- "Incorrect hash values reported from hardirq context");
- KUNIT_EXPECT_FALSE_MSG(
- test, state.softirq_func_reported_failure,
- "Incorrect hash values reported from softirq context");
-}
-
#define IRQ_TEST_DATA_LEN 256
#define IRQ_TEST_NUM_BUFFERS 3 /* matches max concurrency level */
@@ -469,7 +354,7 @@ static void test_hash_interrupt_context_1(struct kunit *test)
HASH(&test_buf[i * IRQ_TEST_DATA_LEN], IRQ_TEST_DATA_LEN,
state.expected_hashes[i]);
- run_irq_test(test, hash_irq_test1_func, 100000, &state);
+ kunit_run_irq_test(test, hash_irq_test1_func, 100000, &state);
}
struct hash_irq_test2_hash_ctx {
@@ -500,7 +385,7 @@ static bool hash_irq_test2_func(void *state_)
if (WARN_ON_ONCE(ctx == &state->ctxs[ARRAY_SIZE(state->ctxs)])) {
/*
* This should never happen, as the number of contexts is equal
- * to the maximum concurrency level of run_irq_test().
+ * to the maximum concurrency level of kunit_run_irq_test().
*/
return false;
}
@@ -566,7 +451,7 @@ static void test_hash_interrupt_context_2(struct kunit *test)
state->update_lens[state->num_steps++] = remaining;
state->num_steps += 2; /* for init and final */
- run_irq_test(test, hash_irq_test2_func, 250000, state);
+ kunit_run_irq_test(test, hash_irq_test2_func, 250000, state);
}
#define UNKEYED_HASH_KUNIT_CASES \
diff --git a/lib/crypto/tests/md5-testvecs.h b/lib/crypto/tests/md5-testvecs.h
new file mode 100644
index 000000000000..be6727feb296
--- /dev/null
+++ b/lib/crypto/tests/md5-testvecs.h
@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py md5 */
+
+static const struct {
+ size_t data_len;
+ u8 digest[MD5_DIGEST_SIZE];
+} hash_testvecs[] = {
+ {
+ .data_len = 0,
+ .digest = {
+ 0xd4, 0x1d, 0x8c, 0xd9, 0x8f, 0x00, 0xb2, 0x04,
+ 0xe9, 0x80, 0x09, 0x98, 0xec, 0xf8, 0x42, 0x7e,
+ },
+ },
+ {
+ .data_len = 1,
+ .digest = {
+ 0x16, 0x7b, 0x86, 0xf2, 0x1d, 0xf3, 0x76, 0xc9,
+ 0x6f, 0x10, 0xa0, 0x61, 0x5b, 0x14, 0x20, 0x0b,
+ },
+ },
+ {
+ .data_len = 2,
+ .digest = {
+ 0x2d, 0x30, 0x96, 0xc7, 0x43, 0x40, 0xed, 0xb2,
+ 0xfb, 0x84, 0x63, 0x9a, 0xec, 0xc7, 0x3c, 0x3c,
+ },
+ },
+ {
+ .data_len = 3,
+ .digest = {
+ 0xe5, 0x0f, 0xce, 0xe0, 0xc8, 0xff, 0x4e, 0x08,
+ 0x5e, 0x19, 0xe5, 0xf2, 0x08, 0x11, 0x19, 0x16,
+ },
+ },
+ {
+ .data_len = 16,
+ .digest = {
+ 0xe8, 0xca, 0x29, 0x05, 0x2f, 0xd1, 0xf3, 0x99,
+ 0x40, 0x71, 0xf5, 0xc2, 0xf7, 0xf8, 0x17, 0x3e,
+ },
+ },
+ {
+ .data_len = 32,
+ .digest = {
+ 0xe3, 0x20, 0xc1, 0xd8, 0x21, 0x14, 0x44, 0x59,
+ 0x1a, 0xf5, 0x91, 0xaf, 0x69, 0xbe, 0x93, 0x9d,
+ },
+ },
+ {
+ .data_len = 48,
+ .digest = {
+ 0xfb, 0x06, 0xb0, 0xf0, 0x00, 0x10, 0x4b, 0x68,
+ 0x3d, 0x75, 0xf9, 0x70, 0xde, 0xbb, 0x32, 0x16,
+ },
+ },
+ {
+ .data_len = 49,
+ .digest = {
+ 0x52, 0x86, 0x48, 0x8b, 0xae, 0x91, 0x7c, 0x4e,
+ 0xc2, 0x2a, 0x69, 0x07, 0x35, 0xcc, 0xb2, 0x88,
+ },
+ },
+ {
+ .data_len = 63,
+ .digest = {
+ 0xfa, 0xd3, 0xf6, 0xe6, 0x7b, 0x1a, 0xc6, 0x05,
+ 0x73, 0x35, 0x02, 0xab, 0xc7, 0xb3, 0x47, 0xcb,
+ },
+ },
+ {
+ .data_len = 64,
+ .digest = {
+ 0xc5, 0x59, 0x29, 0xe9, 0x0a, 0x4a, 0x86, 0x43,
+ 0x7c, 0xaf, 0xdf, 0x83, 0xd3, 0xb8, 0x33, 0x5f,
+ },
+ },
+ {
+ .data_len = 65,
+ .digest = {
+ 0x80, 0x05, 0x75, 0x39, 0xec, 0x44, 0x8a, 0x81,
+ 0xe7, 0x6e, 0x8d, 0xd1, 0xc6, 0xeb, 0xc2, 0xf0,
+ },
+ },
+ {
+ .data_len = 127,
+ .digest = {
+ 0x3f, 0x02, 0xe8, 0xc6, 0xb8, 0x6a, 0x39, 0xc3,
+ 0xa4, 0x1c, 0xd9, 0x8f, 0x4a, 0x71, 0x40, 0x30,
+ },
+ },
+ {
+ .data_len = 128,
+ .digest = {
+ 0x89, 0x4f, 0x79, 0x3e, 0xff, 0x0c, 0x22, 0x60,
+ 0xa2, 0xdc, 0x10, 0x5f, 0x23, 0x0a, 0xe7, 0xc6,
+ },
+ },
+ {
+ .data_len = 129,
+ .digest = {
+ 0x06, 0x56, 0x61, 0xb8, 0x8a, 0x82, 0x77, 0x1b,
+ 0x2c, 0x35, 0xb8, 0x9f, 0xd6, 0xf7, 0xbd, 0x5a,
+ },
+ },
+ {
+ .data_len = 256,
+ .digest = {
+ 0x5d, 0xdf, 0x7d, 0xc8, 0x43, 0x96, 0x3b, 0xdb,
+ 0xc7, 0x0e, 0x44, 0x42, 0x23, 0xf7, 0xed, 0xdf,
+ },
+ },
+ {
+ .data_len = 511,
+ .digest = {
+ 0xf6, 0x5f, 0x26, 0x51, 0x8a, 0x5a, 0x46, 0x8f,
+ 0x48, 0x72, 0x90, 0x74, 0x9d, 0x87, 0xbd, 0xdf,
+ },
+ },
+ {
+ .data_len = 513,
+ .digest = {
+ 0xd8, 0x2c, 0xc9, 0x76, 0xfa, 0x67, 0x2e, 0xa6,
+ 0xc8, 0x12, 0x4a, 0x64, 0xaa, 0x0b, 0x3d, 0xbd,
+ },
+ },
+ {
+ .data_len = 1000,
+ .digest = {
+ 0xe2, 0x7e, 0xb4, 0x5f, 0xe1, 0x74, 0x51, 0xfc,
+ 0xe0, 0xc8, 0xd5, 0xe6, 0x8b, 0x40, 0xd2, 0x0e,
+ },
+ },
+ {
+ .data_len = 3333,
+ .digest = {
+ 0xcd, 0x7d, 0x56, 0xa9, 0x4c, 0x47, 0xea, 0xc2,
+ 0x34, 0x0b, 0x84, 0x05, 0xf9, 0xad, 0xbb, 0x46,
+ },
+ },
+ {
+ .data_len = 4096,
+ .digest = {
+ 0x63, 0x6e, 0x58, 0xb3, 0x94, 0x6b, 0x83, 0x5f,
+ 0x1f, 0x0e, 0xd3, 0x66, 0x78, 0x71, 0x98, 0x42,
+ },
+ },
+ {
+ .data_len = 4128,
+ .digest = {
+ 0x9d, 0x68, 0xfc, 0x26, 0x8b, 0x4c, 0xa8, 0xe7,
+ 0x30, 0x0b, 0x19, 0x52, 0x6e, 0xa5, 0x65, 0x1c,
+ },
+ },
+ {
+ .data_len = 4160,
+ .digest = {
+ 0x1c, 0xaa, 0x7d, 0xee, 0x91, 0x01, 0xe2, 0x5a,
+ 0xec, 0xe9, 0xde, 0x57, 0x0a, 0xb6, 0x4c, 0x2f,
+ },
+ },
+ {
+ .data_len = 4224,
+ .digest = {
+ 0x1b, 0x31, 0xe3, 0x14, 0x07, 0x16, 0x17, 0xc6,
+ 0x98, 0x79, 0x88, 0x23, 0xb6, 0x3b, 0x25, 0xc4,
+ },
+ },
+ {
+ .data_len = 16384,
+ .digest = {
+ 0xc6, 0x3d, 0x56, 0x90, 0xf0, 0xf6, 0xe6, 0x50,
+ 0xf4, 0x76, 0x78, 0x67, 0xa3, 0xdd, 0x62, 0x7b,
+ },
+ },
+};
+
+static const u8 hash_testvec_consolidated[MD5_DIGEST_SIZE] = {
+ 0x70, 0x86, 0x9e, 0x6c, 0xa4, 0xc6, 0x71, 0x43,
+ 0x26, 0x02, 0x1b, 0x3f, 0xfd, 0x56, 0x9f, 0xa6,
+};
+
+static const u8 hmac_testvec_consolidated[MD5_DIGEST_SIZE] = {
+ 0x10, 0x02, 0x74, 0xf6, 0x4d, 0xb3, 0x3c, 0xc7,
+ 0xa1, 0xf7, 0xe6, 0xd4, 0x32, 0x64, 0xfa, 0x6d,
+};
diff --git a/lib/crypto/tests/md5_kunit.c b/lib/crypto/tests/md5_kunit.c
new file mode 100644
index 000000000000..38bd52c25ae3
--- /dev/null
+++ b/lib/crypto/tests/md5_kunit.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/md5.h>
+#include "md5-testvecs.h"
+
+#define HASH md5
+#define HASH_CTX md5_ctx
+#define HASH_SIZE MD5_DIGEST_SIZE
+#define HASH_INIT md5_init
+#define HASH_UPDATE md5_update
+#define HASH_FINAL md5_final
+#define HMAC_KEY hmac_md5_key
+#define HMAC_CTX hmac_md5_ctx
+#define HMAC_PREPAREKEY hmac_md5_preparekey
+#define HMAC_INIT hmac_md5_init
+#define HMAC_UPDATE hmac_md5_update
+#define HMAC_FINAL hmac_md5_final
+#define HMAC hmac_md5
+#define HMAC_USINGRAWKEY hmac_md5_usingrawkey
+#include "hash-test-template.h"
+
+static struct kunit_case hash_test_cases[] = {
+ HASH_KUNIT_CASES,
+ KUNIT_CASE(benchmark_hash),
+ {},
+};
+
+static struct kunit_suite hash_test_suite = {
+ .name = "md5",
+ .test_cases = hash_test_cases,
+ .suite_init = hash_suite_init,
+ .suite_exit = hash_suite_exit,
+};
+kunit_test_suite(hash_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for MD5 and HMAC-MD5");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/sha256_kunit.c b/lib/crypto/tests/sha256_kunit.c
index 1cd4caee6010..dcedfca06df6 100644
--- a/lib/crypto/tests/sha256_kunit.c
+++ b/lib/crypto/tests/sha256_kunit.c
@@ -5,6 +5,7 @@
#include <crypto/sha2.h>
#include "sha256-testvecs.h"
+/* Generate the HASH_KUNIT_CASES using hash-test-template.h. */
#define HASH sha256
#define HASH_CTX sha256_ctx
#define HASH_SIZE SHA256_DIGEST_SIZE
@@ -21,9 +22,192 @@
#define HMAC_USINGRAWKEY hmac_sha256_usingrawkey
#include "hash-test-template.h"
+static void free_guarded_buf(void *buf)
+{
+ vfree(buf);
+}
+
+/*
+ * Allocate a KUnit-managed buffer that has length @len bytes immediately
+ * followed by an unmapped page, and assert that the allocation succeeds.
+ */
+static void *alloc_guarded_buf(struct kunit *test, size_t len)
+{
+ size_t full_len = round_up(len, PAGE_SIZE);
+ void *buf = vmalloc(full_len);
+
+ KUNIT_ASSERT_NOT_NULL(test, buf);
+ KUNIT_ASSERT_EQ(test, 0,
+ kunit_add_action_or_reset(test, free_guarded_buf, buf));
+ return buf + full_len - len;
+}
+
+/*
+ * Test for sha256_finup_2x(). Specifically, choose various data lengths and
+ * salt lengths, and for each one, verify that sha256_finup_2x() produces the
+ * same results as sha256_update() and sha256_final().
+ *
+ * Use guarded buffers for all inputs and outputs to reliably detect any
+ * out-of-bounds reads or writes, even if they occur in assembly code.
+ */
+static void test_sha256_finup_2x(struct kunit *test)
+{
+ const size_t max_data_len = 16384;
+ u8 *data1_buf, *data2_buf, *hash1, *hash2;
+ u8 expected_hash1[SHA256_DIGEST_SIZE];
+ u8 expected_hash2[SHA256_DIGEST_SIZE];
+ u8 salt[SHA256_BLOCK_SIZE];
+ struct sha256_ctx *ctx;
+
+ data1_buf = alloc_guarded_buf(test, max_data_len);
+ data2_buf = alloc_guarded_buf(test, max_data_len);
+ hash1 = alloc_guarded_buf(test, SHA256_DIGEST_SIZE);
+ hash2 = alloc_guarded_buf(test, SHA256_DIGEST_SIZE);
+ ctx = alloc_guarded_buf(test, sizeof(*ctx));
+
+ rand_bytes(data1_buf, max_data_len);
+ rand_bytes(data2_buf, max_data_len);
+ rand_bytes(salt, sizeof(salt));
+
+ for (size_t i = 0; i < 500; i++) {
+ size_t salt_len = rand_length(sizeof(salt));
+ size_t data_len = rand_length(max_data_len);
+ const u8 *data1 = data1_buf + max_data_len - data_len;
+ const u8 *data2 = data2_buf + max_data_len - data_len;
+ struct sha256_ctx orig_ctx;
+
+ sha256_init(ctx);
+ sha256_update(ctx, salt, salt_len);
+ orig_ctx = *ctx;
+
+ sha256_finup_2x(ctx, data1, data2, data_len, hash1, hash2);
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, ctx, &orig_ctx, sizeof(*ctx),
+ "sha256_finup_2x() modified its ctx argument");
+
+ sha256_update(ctx, data1, data_len);
+ sha256_final(ctx, expected_hash1);
+ sha256_update(&orig_ctx, data2, data_len);
+ sha256_final(&orig_ctx, expected_hash2);
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, hash1, expected_hash1, SHA256_DIGEST_SIZE,
+ "Wrong hash1 with salt_len=%zu data_len=%zu", salt_len,
+ data_len);
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, hash2, expected_hash2, SHA256_DIGEST_SIZE,
+ "Wrong hash2 with salt_len=%zu data_len=%zu", salt_len,
+ data_len);
+ }
+}
+
+/* Test sha256_finup_2x() with ctx == NULL */
+static void test_sha256_finup_2x_defaultctx(struct kunit *test)
+{
+ const size_t data_len = 128;
+ struct sha256_ctx ctx;
+ u8 hash1_a[SHA256_DIGEST_SIZE];
+ u8 hash2_a[SHA256_DIGEST_SIZE];
+ u8 hash1_b[SHA256_DIGEST_SIZE];
+ u8 hash2_b[SHA256_DIGEST_SIZE];
+
+ rand_bytes(test_buf, 2 * data_len);
+
+ sha256_init(&ctx);
+ sha256_finup_2x(&ctx, test_buf, &test_buf[data_len], data_len, hash1_a,
+ hash2_a);
+
+ sha256_finup_2x(NULL, test_buf, &test_buf[data_len], data_len, hash1_b,
+ hash2_b);
+
+ KUNIT_ASSERT_MEMEQ(test, hash1_a, hash1_b, SHA256_DIGEST_SIZE);
+ KUNIT_ASSERT_MEMEQ(test, hash2_a, hash2_b, SHA256_DIGEST_SIZE);
+}
+
+/*
+ * Test that sha256_finup_2x() and sha256_update/final() produce consistent
+ * results with total message lengths that require more than 32 bits.
+ */
+static void test_sha256_finup_2x_hugelen(struct kunit *test)
+{
+ const size_t data_len = 4 * SHA256_BLOCK_SIZE;
+ struct sha256_ctx ctx = {};
+ u8 expected_hash[SHA256_DIGEST_SIZE];
+ u8 hash[SHA256_DIGEST_SIZE];
+
+ rand_bytes(test_buf, data_len);
+ for (size_t align = 0; align < SHA256_BLOCK_SIZE; align++) {
+ sha256_init(&ctx);
+ ctx.ctx.bytecount = 0x123456789abcd00 + align;
+
+ sha256_finup_2x(&ctx, test_buf, test_buf, data_len, hash, hash);
+
+ sha256_update(&ctx, test_buf, data_len);
+ sha256_final(&ctx, expected_hash);
+
+ KUNIT_ASSERT_MEMEQ(test, hash, expected_hash,
+ SHA256_DIGEST_SIZE);
+ }
+}
+
+/* Benchmark for sha256_finup_2x() */
+static void benchmark_sha256_finup_2x(struct kunit *test)
+{
+ /*
+ * Try a few different salt lengths, since sha256_finup_2x() performance
+ * may vary slightly for the same data_len depending on how many bytes
+ * were already processed in the initial context.
+ */
+ static const size_t salt_lens_to_test[] = { 0, 32, 64 };
+ const size_t data_len = 4096;
+ const size_t num_iters = 4096;
+ struct sha256_ctx ctx;
+ u8 hash1[SHA256_DIGEST_SIZE];
+ u8 hash2[SHA256_DIGEST_SIZE];
+
+ if (!IS_ENABLED(CONFIG_CRYPTO_LIB_BENCHMARK))
+ kunit_skip(test, "not enabled");
+ if (!sha256_finup_2x_is_optimized())
+ kunit_skip(test, "not relevant");
+
+ rand_bytes(test_buf, data_len * 2);
+
+ /* Warm-up */
+ for (size_t i = 0; i < num_iters; i++)
+ sha256_finup_2x(NULL, &test_buf[0], &test_buf[data_len],
+ data_len, hash1, hash2);
+
+ for (size_t i = 0; i < ARRAY_SIZE(salt_lens_to_test); i++) {
+ size_t salt_len = salt_lens_to_test[i];
+ u64 t0, t1;
+
+ /*
+ * Prepare the initial context. The time to process the salt is
+ * not measured; we're just interested in sha256_finup_2x().
+ */
+ sha256_init(&ctx);
+ sha256_update(&ctx, test_buf, salt_len);
+
+ preempt_disable();
+ t0 = ktime_get_ns();
+ for (size_t j = 0; j < num_iters; j++)
+ sha256_finup_2x(&ctx, &test_buf[0], &test_buf[data_len],
+ data_len, hash1, hash2);
+ t1 = ktime_get_ns();
+ preempt_enable();
+ kunit_info(test, "data_len=%zu salt_len=%zu: %llu MB/s",
+ data_len, salt_len,
+ div64_u64((u64)data_len * 2 * num_iters * 1000,
+ t1 - t0 ?: 1));
+ }
+}
+
static struct kunit_case hash_test_cases[] = {
HASH_KUNIT_CASES,
+ KUNIT_CASE(test_sha256_finup_2x),
+ KUNIT_CASE(test_sha256_finup_2x_defaultctx),
+ KUNIT_CASE(test_sha256_finup_2x_hugelen),
KUNIT_CASE(benchmark_hash),
+ KUNIT_CASE(benchmark_sha256_finup_2x),
{},
};
diff --git a/lib/crypto/x86/Kconfig b/lib/crypto/x86/Kconfig
deleted file mode 100644
index 546fe2afe0b5..000000000000
--- a/lib/crypto/x86/Kconfig
+++ /dev/null
@@ -1,26 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config CRYPTO_BLAKE2S_X86
- bool "Hash functions: BLAKE2s (SSSE3/AVX-512)"
- depends on 64BIT
- select CRYPTO_LIB_BLAKE2S_GENERIC
- select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
- help
- BLAKE2s cryptographic hash function (RFC 7693)
-
- Architecture: x86_64 using:
- - SSSE3 (Supplemental SSE3)
- - AVX-512 (Advanced Vector Extensions-512)
-
-config CRYPTO_CHACHA20_X86_64
- tristate
- depends on 64BIT
- default CRYPTO_LIB_CHACHA
- select CRYPTO_LIB_CHACHA_GENERIC
- select CRYPTO_ARCH_HAVE_LIB_CHACHA
-
-config CRYPTO_POLY1305_X86_64
- tristate
- depends on 64BIT
- default CRYPTO_LIB_POLY1305
- select CRYPTO_ARCH_HAVE_LIB_POLY1305
diff --git a/lib/crypto/x86/Makefile b/lib/crypto/x86/Makefile
deleted file mode 100644
index c2ff8c5f1046..000000000000
--- a/lib/crypto/x86/Makefile
+++ /dev/null
@@ -1,17 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o
-libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o
-
-obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o
-chacha-x86_64-y := chacha-avx2-x86_64.o chacha-ssse3-x86_64.o chacha-avx512vl-x86_64.o chacha_glue.o
-
-obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
-poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
-targets += poly1305-x86_64-cryptogams.S
-
-quiet_cmd_perlasm = PERLASM $@
- cmd_perlasm = $(PERL) $< > $@
-
-$(obj)/%.S: $(src)/%.pl FORCE
- $(call if_changed,perlasm)
diff --git a/lib/crypto/x86/blake2s-core.S b/lib/crypto/x86/blake2s-core.S
index ac1c845445a4..ef8e9f427aab 100644
--- a/lib/crypto/x86/blake2s-core.S
+++ b/lib/crypto/x86/blake2s-core.S
@@ -29,19 +29,19 @@ SIGMA:
.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
-.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
+.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 160
.align 64
SIGMA2:
-.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
-.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
-.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
-.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
-.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
-.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
-.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
-.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
-.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
-.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
+.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
+.byte 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
+.byte 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
+.byte 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
+.byte 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
+.byte 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
+.byte 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
+.byte 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
+.byte 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
+.byte 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
.text
SYM_FUNC_START(blake2s_compress_ssse3)
@@ -193,9 +193,9 @@ SYM_FUNC_START(blake2s_compress_avx512)
leaq SIGMA2(%rip),%rax
movb $0xa,%cl
.Lblake2s_compress_avx512_roundloop:
- addq $0x40,%rax
- vmovdqa -0x40(%rax),%ymm8
- vmovdqa -0x20(%rax),%ymm9
+ vpmovzxbd (%rax),%ymm8
+ vpmovzxbd 0x8(%rax),%ymm9
+ addq $0x10,%rax
vpermi2d %ymm7,%ymm6,%ymm8
vpermi2d %ymm7,%ymm6,%ymm9
vmovdqa %ymm8,%ymm6
diff --git a/lib/crypto/x86/blake2s-glue.c b/lib/crypto/x86/blake2s.h
index adc296cd17c9..b6d30d2fa045 100644
--- a/lib/crypto/x86/blake2s-glue.c
+++ b/lib/crypto/x86/blake2s.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
@@ -7,8 +7,6 @@
#include <asm/fpu/api.h>
#include <asm/processor.h>
#include <asm/simd.h>
-#include <crypto/internal/blake2s.h>
-#include <linux/init.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/sizes.h>
@@ -23,8 +21,8 @@ asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
-void blake2s_compress(struct blake2s_state *state, const u8 *block,
- size_t nblocks, const u32 inc)
+static void blake2s_compress(struct blake2s_state *state, const u8 *block,
+ size_t nblocks, const u32 inc)
{
/* SIMD disables preemption, so relax after processing each page. */
BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
@@ -49,9 +47,9 @@ void blake2s_compress(struct blake2s_state *state, const u8 *block,
block += blocks * BLAKE2S_BLOCK_SIZE;
} while (nblocks);
}
-EXPORT_SYMBOL(blake2s_compress);
-static int __init blake2s_mod_init(void)
+#define blake2s_mod_init_arch blake2s_mod_init_arch
+static void blake2s_mod_init_arch(void)
{
if (boot_cpu_has(X86_FEATURE_SSSE3))
static_branch_enable(&blake2s_use_ssse3);
@@ -63,8 +61,4 @@ static int __init blake2s_mod_init(void)
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
XFEATURE_MASK_AVX512, NULL))
static_branch_enable(&blake2s_use_avx512);
-
- return 0;
}
-
-subsys_initcall(blake2s_mod_init);
diff --git a/lib/crypto/x86/chacha_glue.c b/lib/crypto/x86/chacha.h
index 10b2c945f541..10cf8f1c569d 100644
--- a/lib/crypto/x86/chacha_glue.c
+++ b/lib/crypto/x86/chacha.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* ChaCha and HChaCha functions (x86_64 optimized)
*
@@ -6,10 +6,8 @@
*/
#include <asm/simd.h>
-#include <crypto/chacha.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/sizes.h>
asmlinkage void chacha_block_xor_ssse3(const struct chacha_state *state,
@@ -126,8 +124,8 @@ static void chacha_dosimd(struct chacha_state *state, u8 *dst, const u8 *src,
}
}
-void hchacha_block_arch(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds)
+static void hchacha_block_arch(const struct chacha_state *state,
+ u32 out[HCHACHA_OUT_WORDS], int nrounds)
{
if (!static_branch_likely(&chacha_use_simd)) {
hchacha_block_generic(state, out, nrounds);
@@ -137,10 +135,9 @@ void hchacha_block_arch(const struct chacha_state *state,
kernel_fpu_end();
}
}
-EXPORT_SYMBOL(hchacha_block_arch);
-void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
+static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
+ const u8 *src, unsigned int bytes, int nrounds)
{
if (!static_branch_likely(&chacha_use_simd) ||
bytes <= CHACHA_BLOCK_SIZE)
@@ -158,18 +155,12 @@ void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
dst += todo;
} while (bytes);
}
-EXPORT_SYMBOL(chacha_crypt_arch);
-bool chacha_is_arch_optimized(void)
-{
- return static_key_enabled(&chacha_use_simd);
-}
-EXPORT_SYMBOL(chacha_is_arch_optimized);
-
-static int __init chacha_simd_mod_init(void)
+#define chacha_mod_init_arch chacha_mod_init_arch
+static void chacha_mod_init_arch(void)
{
if (!boot_cpu_has(X86_FEATURE_SSSE3))
- return 0;
+ return;
static_branch_enable(&chacha_use_simd);
@@ -182,15 +173,4 @@ static int __init chacha_simd_mod_init(void)
boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
static_branch_enable(&chacha_use_avx512vl);
}
- return 0;
}
-subsys_initcall(chacha_simd_mod_init);
-
-static void __exit chacha_simd_mod_exit(void)
-{
-}
-module_exit(chacha_simd_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
-MODULE_DESCRIPTION("ChaCha and HChaCha functions (x86_64 optimized)");
diff --git a/arch/x86/crypto/curve25519-x86_64.c b/lib/crypto/x86/curve25519.h
index d587f05c3c8c..5c0b8408852d 100644
--- a/arch/x86/crypto/curve25519-x86_64.c
+++ b/lib/crypto/x86/curve25519.h
@@ -4,15 +4,9 @@
* Copyright (c) 2016-2020 INRIA, CMU and Microsoft Corporation
*/
-#include <crypto/curve25519.h>
-#include <crypto/internal/kpp.h>
-
-#include <linux/export.h>
#include <linux/types.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/scatterlist.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>
@@ -1592,135 +1586,28 @@ static void curve25519_ever64_base(u8 *out, const u8 *priv)
static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2_adx);
-void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
- const u8 secret[CURVE25519_KEY_SIZE],
- const u8 basepoint[CURVE25519_KEY_SIZE])
+static void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE],
+ const u8 basepoint[CURVE25519_KEY_SIZE])
{
if (static_branch_likely(&curve25519_use_bmi2_adx))
curve25519_ever64(mypublic, secret, basepoint);
else
curve25519_generic(mypublic, secret, basepoint);
}
-EXPORT_SYMBOL(curve25519_arch);
-void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
- const u8 secret[CURVE25519_KEY_SIZE])
+static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE])
{
if (static_branch_likely(&curve25519_use_bmi2_adx))
curve25519_ever64_base(pub, secret);
else
curve25519_generic(pub, secret, curve25519_base_point);
}
-EXPORT_SYMBOL(curve25519_base_arch);
-
-static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
- unsigned int len)
-{
- u8 *secret = kpp_tfm_ctx(tfm);
-
- if (!len)
- curve25519_generate_secret(secret);
- else if (len == CURVE25519_KEY_SIZE &&
- crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
- memcpy(secret, buf, CURVE25519_KEY_SIZE);
- else
- return -EINVAL;
- return 0;
-}
-
-static int curve25519_generate_public_key(struct kpp_request *req)
-{
- struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
- const u8 *secret = kpp_tfm_ctx(tfm);
- u8 buf[CURVE25519_KEY_SIZE];
- int copied, nbytes;
-
- if (req->src)
- return -EINVAL;
-
- curve25519_base_arch(buf, secret);
-
- /* might want less than we've got */
- nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
- copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
- nbytes),
- buf, nbytes);
- if (copied != nbytes)
- return -EINVAL;
- return 0;
-}
-
-static int curve25519_compute_shared_secret(struct kpp_request *req)
-{
- struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
- const u8 *secret = kpp_tfm_ctx(tfm);
- u8 public_key[CURVE25519_KEY_SIZE];
- u8 buf[CURVE25519_KEY_SIZE];
- int copied, nbytes;
-
- if (!req->src)
- return -EINVAL;
-
- copied = sg_copy_to_buffer(req->src,
- sg_nents_for_len(req->src,
- CURVE25519_KEY_SIZE),
- public_key, CURVE25519_KEY_SIZE);
- if (copied != CURVE25519_KEY_SIZE)
- return -EINVAL;
-
- curve25519_arch(buf, secret, public_key);
-
- /* might want less than we've got */
- nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
- copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
- nbytes),
- buf, nbytes);
- if (copied != nbytes)
- return -EINVAL;
- return 0;
-}
-static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
-{
- return CURVE25519_KEY_SIZE;
-}
-
-static struct kpp_alg curve25519_alg = {
- .base.cra_name = "curve25519",
- .base.cra_driver_name = "curve25519-x86",
- .base.cra_priority = 200,
- .base.cra_module = THIS_MODULE,
- .base.cra_ctxsize = CURVE25519_KEY_SIZE,
-
- .set_secret = curve25519_set_secret,
- .generate_public_key = curve25519_generate_public_key,
- .compute_shared_secret = curve25519_compute_shared_secret,
- .max_size = curve25519_max_size,
-};
-
-
-static int __init curve25519_mod_init(void)
+#define curve25519_mod_init_arch curve25519_mod_init_arch
+static void curve25519_mod_init_arch(void)
{
if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
static_branch_enable(&curve25519_use_bmi2_adx);
- else
- return 0;
- return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
- crypto_register_kpp(&curve25519_alg) : 0;
-}
-
-static void __exit curve25519_mod_exit(void)
-{
- if (IS_REACHABLE(CONFIG_CRYPTO_KPP) &&
- static_branch_likely(&curve25519_use_bmi2_adx))
- crypto_unregister_kpp(&curve25519_alg);
}
-
-module_init(curve25519_mod_init);
-module_exit(curve25519_mod_exit);
-
-MODULE_ALIAS_CRYPTO("curve25519");
-MODULE_ALIAS_CRYPTO("curve25519-x86");
-MODULE_DESCRIPTION("Curve25519 algorithm, ADX optimized");
-MODULE_LICENSE("GPL v2");
-MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
diff --git a/lib/crypto/x86/poly1305-x86_64-cryptogams.pl b/lib/crypto/x86/poly1305-x86_64-cryptogams.pl
index 501827254fed..409ec6955733 100644
--- a/lib/crypto/x86/poly1305-x86_64-cryptogams.pl
+++ b/lib/crypto/x86/poly1305-x86_64-cryptogams.pl
@@ -118,19 +118,6 @@ sub declare_function() {
}
}
-sub declare_typed_function() {
- my ($name, $align, $nargs) = @_;
- if($kernel) {
- $code .= "SYM_TYPED_FUNC_START($name)\n";
- $code .= ".L$name:\n";
- } else {
- $code .= ".globl $name\n";
- $code .= ".type $name,\@function,$nargs\n";
- $code .= ".align $align\n";
- $code .= "$name:\n";
- }
-}
-
sub end_function() {
my ($name) = @_;
if($kernel) {
@@ -141,7 +128,7 @@ sub end_function() {
}
$code.=<<___ if $kernel;
-#include <linux/cfi_types.h>
+#include <linux/linkage.h>
___
if ($avx) {
@@ -249,14 +236,14 @@ ___
$code.=<<___ if (!$kernel);
.extern OPENSSL_ia32cap_P
-.globl poly1305_block_init_arch
-.hidden poly1305_block_init_arch
+.globl poly1305_init_x86_64
+.hidden poly1305_init_x86_64
.globl poly1305_blocks_x86_64
.hidden poly1305_blocks_x86_64
.globl poly1305_emit_x86_64
.hidden poly1305_emit_x86_64
___
-&declare_typed_function("poly1305_block_init_arch", 32, 3);
+&declare_function("poly1305_init_x86_64", 32, 3);
$code.=<<___;
xor %eax,%eax
mov %rax,0($ctx) # initialize hash value
@@ -311,7 +298,7 @@ $code.=<<___;
.Lno_key:
RET
___
-&end_function("poly1305_block_init_arch");
+&end_function("poly1305_init_x86_64");
&declare_function("poly1305_blocks_x86_64", 32, 4);
$code.=<<___;
@@ -4118,9 +4105,9 @@ avx_handler:
.section .pdata
.align 4
- .rva .LSEH_begin_poly1305_block_init_arch
- .rva .LSEH_end_poly1305_block_init_arch
- .rva .LSEH_info_poly1305_block_init_arch
+ .rva .LSEH_begin_poly1305_init_x86_64
+ .rva .LSEH_end_poly1305_init_x86_64
+ .rva .LSEH_info_poly1305_init_x86_64
.rva .LSEH_begin_poly1305_blocks_x86_64
.rva .LSEH_end_poly1305_blocks_x86_64
@@ -4168,10 +4155,10 @@ ___
$code.=<<___;
.section .xdata
.align 8
-.LSEH_info_poly1305_block_init_arch:
+.LSEH_info_poly1305_init_x86_64:
.byte 9,0,0,0
.rva se_handler
- .rva .LSEH_begin_poly1305_block_init_arch,.LSEH_begin_poly1305_block_init_arch
+ .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64
.LSEH_info_poly1305_blocks_x86_64:
.byte 9,0,0,0
diff --git a/lib/crypto/x86/poly1305_glue.c b/lib/crypto/x86/poly1305.h
index 856d48fd422b..ee92e3740a78 100644
--- a/lib/crypto/x86/poly1305_glue.c
+++ b/lib/crypto/x86/poly1305.h
@@ -1,16 +1,13 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
#include <asm/cpu_device_id.h>
#include <asm/fpu/api.h>
-#include <crypto/internal/poly1305.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/sizes.h>
-#include <linux/unaligned.h>
struct poly1305_arch_internal {
union {
@@ -61,10 +58,8 @@ static void convert_to_base2_64(void *ctx)
state->is_base2_26 = 0;
}
-asmlinkage void poly1305_block_init_arch(
- struct poly1305_block_state *state,
- const u8 raw_key[POLY1305_BLOCK_SIZE]);
-EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
+asmlinkage void poly1305_init_x86_64(struct poly1305_block_state *state,
+ const u8 raw_key[POLY1305_BLOCK_SIZE]);
asmlinkage void poly1305_blocks_x86_64(struct poly1305_arch_internal *ctx,
const u8 *inp,
const size_t len, const u32 padbit);
@@ -88,8 +83,14 @@ static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512);
-void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *inp,
- unsigned int len, u32 padbit)
+static void poly1305_block_init(struct poly1305_block_state *state,
+ const u8 raw_key[POLY1305_BLOCK_SIZE])
+{
+ poly1305_init_x86_64(state, raw_key);
+}
+
+static void poly1305_blocks(struct poly1305_block_state *state, const u8 *inp,
+ unsigned int len, u32 padbit)
{
struct poly1305_arch_internal *ctx =
container_of(&state->h.h, struct poly1305_arch_internal, h);
@@ -129,25 +130,18 @@ void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *inp,
inp += bytes;
} while (len);
}
-EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
-void poly1305_emit_arch(const struct poly1305_state *ctx,
- u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4])
+static void poly1305_emit(const struct poly1305_state *ctx,
+ u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4])
{
if (!static_branch_likely(&poly1305_use_avx))
poly1305_emit_x86_64(ctx, mac, nonce);
else
poly1305_emit_avx(ctx, mac, nonce);
}
-EXPORT_SYMBOL_GPL(poly1305_emit_arch);
-
-bool poly1305_is_arch_optimized(void)
-{
- return static_key_enabled(&poly1305_use_avx);
-}
-EXPORT_SYMBOL(poly1305_is_arch_optimized);
-static int __init poly1305_simd_mod_init(void)
+#define poly1305_mod_init_arch poly1305_mod_init_arch
+static void poly1305_mod_init_arch(void)
{
if (boot_cpu_has(X86_FEATURE_AVX) &&
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
@@ -161,15 +155,4 @@ static int __init poly1305_simd_mod_init(void)
/* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */
boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X)
static_branch_enable(&poly1305_use_avx512);
- return 0;
}
-subsys_initcall(poly1305_simd_mod_init);
-
-static void __exit poly1305_simd_mod_exit(void)
-{
-}
-module_exit(poly1305_simd_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
-MODULE_DESCRIPTION("Poly1305 authenticator");
diff --git a/lib/crypto/x86/sha1.h b/lib/crypto/x86/sha1.h
index e308379d89bc..c48a0131fd12 100644
--- a/lib/crypto/x86/sha1.h
+++ b/lib/crypto/x86/sha1.h
@@ -55,7 +55,7 @@ static void sha1_blocks(struct sha1_block_state *state,
}
#define sha1_mod_init_arch sha1_mod_init_arch
-static inline void sha1_mod_init_arch(void)
+static void sha1_mod_init_arch(void)
{
if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
static_call_update(sha1_blocks_x86, sha1_blocks_ni);
diff --git a/lib/crypto/x86/sha256-ni-asm.S b/lib/crypto/x86/sha256-ni-asm.S
index 4bd9490ffc66..de5f707e7ef7 100644
--- a/lib/crypto/x86/sha256-ni-asm.S
+++ b/lib/crypto/x86/sha256-ni-asm.S
@@ -165,6 +165,374 @@ SYM_FUNC_START(sha256_ni_transform)
RET
SYM_FUNC_END(sha256_ni_transform)
+#undef DIGEST_PTR
+#undef DATA_PTR
+#undef NUM_BLKS
+#undef SHA256CONSTANTS
+#undef MSG
+#undef STATE0
+#undef STATE1
+#undef MSG0
+#undef MSG1
+#undef MSG2
+#undef MSG3
+#undef TMP
+#undef SHUF_MASK
+#undef ABEF_SAVE
+#undef CDGH_SAVE
+
+// parameters for sha256_ni_finup2x()
+#define CTX %rdi
+#define DATA1 %rsi
+#define DATA2 %rdx
+#define LEN %ecx
+#define LEN8 %cl
+#define LEN64 %rcx
+#define OUT1 %r8
+#define OUT2 %r9
+
+// other scalar variables
+#define SHA256CONSTANTS %rax
+#define COUNT %r10
+#define COUNT32 %r10d
+#define FINAL_STEP %r11d
+
+// rbx is used as a temporary.
+
+#define MSG %xmm0 // sha256rnds2 implicit operand
+#define STATE0_A %xmm1
+#define STATE1_A %xmm2
+#define STATE0_B %xmm3
+#define STATE1_B %xmm4
+#define TMP_A %xmm5
+#define TMP_B %xmm6
+#define MSG0_A %xmm7
+#define MSG1_A %xmm8
+#define MSG2_A %xmm9
+#define MSG3_A %xmm10
+#define MSG0_B %xmm11
+#define MSG1_B %xmm12
+#define MSG2_B %xmm13
+#define MSG3_B %xmm14
+#define SHUF_MASK %xmm15
+
+#define OFFSETOF_STATE 0 // offsetof(struct __sha256_ctx, state)
+#define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount)
+#define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf)
+
+// Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a and m0_b
+// contain the current 4 message schedule words for the first and second message
+// respectively.
+//
+// If not all the message schedule words have been computed yet, then this also
+// computes 4 more message schedule words for each message. m1_a-m3_a contain
+// the next 3 groups of 4 message schedule words for the first message, and
+// likewise m1_b-m3_b for the second. After consuming the current value of
+// m0_a, this macro computes the group after m3_a and writes it to m0_a, and
+// likewise for *_b. This means that the next (m0_a, m1_a, m2_a, m3_a) is the
+// current (m1_a, m2_a, m3_a, m0_a), and likewise for *_b, so the caller must
+// cycle through the registers accordingly.
+.macro do_4rounds_2x i, m0_a, m1_a, m2_a, m3_a, m0_b, m1_b, m2_b, m3_b
+ movdqa (\i-32)*4(SHA256CONSTANTS), TMP_A
+ movdqa TMP_A, TMP_B
+ paddd \m0_a, TMP_A
+ paddd \m0_b, TMP_B
+.if \i < 48
+ sha256msg1 \m1_a, \m0_a
+ sha256msg1 \m1_b, \m0_b
+.endif
+ movdqa TMP_A, MSG
+ sha256rnds2 STATE0_A, STATE1_A
+ movdqa TMP_B, MSG
+ sha256rnds2 STATE0_B, STATE1_B
+ pshufd $0x0E, TMP_A, MSG
+ sha256rnds2 STATE1_A, STATE0_A
+ pshufd $0x0E, TMP_B, MSG
+ sha256rnds2 STATE1_B, STATE0_B
+.if \i < 48
+ movdqa \m3_a, TMP_A
+ movdqa \m3_b, TMP_B
+ palignr $4, \m2_a, TMP_A
+ palignr $4, \m2_b, TMP_B
+ paddd TMP_A, \m0_a
+ paddd TMP_B, \m0_b
+ sha256msg2 \m3_a, \m0_a
+ sha256msg2 \m3_b, \m0_b
+.endif
+.endm
+
+//
+// void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
+// const u8 *data1, const u8 *data2, int len,
+// u8 out1[SHA256_DIGEST_SIZE],
+// u8 out2[SHA256_DIGEST_SIZE]);
+//
+// This function computes the SHA-256 digests of two messages |data1| and
+// |data2| that are both |len| bytes long, starting from the initial context
+// |ctx|. |len| must be at least SHA256_BLOCK_SIZE.
+//
+// The instructions for the two SHA-256 operations are interleaved. On many
+// CPUs, this is almost twice as fast as hashing each message individually due
+// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
+//
+SYM_FUNC_START(sha256_ni_finup2x)
+ // Allocate 128 bytes of stack space, 16-byte aligned.
+ push %rbx
+ push %rbp
+ mov %rsp, %rbp
+ sub $128, %rsp
+ and $~15, %rsp
+
+ // Load the shuffle mask for swapping the endianness of 32-bit words.
+ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+
+ // Set up pointer to the round constants.
+ lea K256+32*4(%rip), SHA256CONSTANTS
+
+ // Initially we're not processing the final blocks.
+ xor FINAL_STEP, FINAL_STEP
+
+ // Load the initial state from ctx->state.
+ movdqu OFFSETOF_STATE+0*16(CTX), STATE0_A // DCBA
+ movdqu OFFSETOF_STATE+1*16(CTX), STATE1_A // HGFE
+ movdqa STATE0_A, TMP_A
+ punpcklqdq STATE1_A, STATE0_A // FEBA
+ punpckhqdq TMP_A, STATE1_A // DCHG
+ pshufd $0x1B, STATE0_A, STATE0_A // ABEF
+ pshufd $0xB1, STATE1_A, STATE1_A // CDGH
+
+ // Load ctx->bytecount. Take the mod 64 of it to get the number of
+ // bytes that are buffered in ctx->buf. Also save it in a register with
+ // LEN added to it.
+ mov LEN, LEN
+ mov OFFSETOF_BYTECOUNT(CTX), %rbx
+ lea (%rbx, LEN64, 1), COUNT
+ and $63, %ebx
+ jz .Lfinup2x_enter_loop // No bytes buffered?
+
+ // %ebx bytes (1 to 63) are currently buffered in ctx->buf. Load them
+ // followed by the first 64 - %ebx bytes of data. Since LEN >= 64, we
+ // just load 64 bytes from each of ctx->buf, DATA1, and DATA2
+ // unconditionally and rearrange the data as needed.
+
+ movdqu OFFSETOF_BUF+0*16(CTX), MSG0_A
+ movdqu OFFSETOF_BUF+1*16(CTX), MSG1_A
+ movdqu OFFSETOF_BUF+2*16(CTX), MSG2_A
+ movdqu OFFSETOF_BUF+3*16(CTX), MSG3_A
+ movdqa MSG0_A, 0*16(%rsp)
+ movdqa MSG1_A, 1*16(%rsp)
+ movdqa MSG2_A, 2*16(%rsp)
+ movdqa MSG3_A, 3*16(%rsp)
+
+ movdqu 0*16(DATA1), MSG0_A
+ movdqu 1*16(DATA1), MSG1_A
+ movdqu 2*16(DATA1), MSG2_A
+ movdqu 3*16(DATA1), MSG3_A
+ movdqu MSG0_A, 0*16(%rsp,%rbx)
+ movdqu MSG1_A, 1*16(%rsp,%rbx)
+ movdqu MSG2_A, 2*16(%rsp,%rbx)
+ movdqu MSG3_A, 3*16(%rsp,%rbx)
+ movdqa 0*16(%rsp), MSG0_A
+ movdqa 1*16(%rsp), MSG1_A
+ movdqa 2*16(%rsp), MSG2_A
+ movdqa 3*16(%rsp), MSG3_A
+
+ movdqu 0*16(DATA2), MSG0_B
+ movdqu 1*16(DATA2), MSG1_B
+ movdqu 2*16(DATA2), MSG2_B
+ movdqu 3*16(DATA2), MSG3_B
+ movdqu MSG0_B, 0*16(%rsp,%rbx)
+ movdqu MSG1_B, 1*16(%rsp,%rbx)
+ movdqu MSG2_B, 2*16(%rsp,%rbx)
+ movdqu MSG3_B, 3*16(%rsp,%rbx)
+ movdqa 0*16(%rsp), MSG0_B
+ movdqa 1*16(%rsp), MSG1_B
+ movdqa 2*16(%rsp), MSG2_B
+ movdqa 3*16(%rsp), MSG3_B
+
+ sub $64, %rbx // rbx = buffered - 64
+ sub %rbx, DATA1 // DATA1 += 64 - buffered
+ sub %rbx, DATA2 // DATA2 += 64 - buffered
+ add %ebx, LEN // LEN += buffered - 64
+ movdqa STATE0_A, STATE0_B
+ movdqa STATE1_A, STATE1_B
+ jmp .Lfinup2x_loop_have_data
+
+.Lfinup2x_enter_loop:
+ sub $64, LEN
+ movdqa STATE0_A, STATE0_B
+ movdqa STATE1_A, STATE1_B
+.Lfinup2x_loop:
+ // Load the next two data blocks.
+ movdqu 0*16(DATA1), MSG0_A
+ movdqu 0*16(DATA2), MSG0_B
+ movdqu 1*16(DATA1), MSG1_A
+ movdqu 1*16(DATA2), MSG1_B
+ movdqu 2*16(DATA1), MSG2_A
+ movdqu 2*16(DATA2), MSG2_B
+ movdqu 3*16(DATA1), MSG3_A
+ movdqu 3*16(DATA2), MSG3_B
+ add $64, DATA1
+ add $64, DATA2
+.Lfinup2x_loop_have_data:
+ // Convert the words of the data blocks from big endian.
+ pshufb SHUF_MASK, MSG0_A
+ pshufb SHUF_MASK, MSG0_B
+ pshufb SHUF_MASK, MSG1_A
+ pshufb SHUF_MASK, MSG1_B
+ pshufb SHUF_MASK, MSG2_A
+ pshufb SHUF_MASK, MSG2_B
+ pshufb SHUF_MASK, MSG3_A
+ pshufb SHUF_MASK, MSG3_B
+.Lfinup2x_loop_have_bswapped_data:
+
+ // Save the original state for each block.
+ movdqa STATE0_A, 0*16(%rsp)
+ movdqa STATE0_B, 1*16(%rsp)
+ movdqa STATE1_A, 2*16(%rsp)
+ movdqa STATE1_B, 3*16(%rsp)
+
+ // Do the SHA-256 rounds on each block.
+.irp i, 0, 16, 32, 48
+ do_4rounds_2x (\i + 0), MSG0_A, MSG1_A, MSG2_A, MSG3_A, \
+ MSG0_B, MSG1_B, MSG2_B, MSG3_B
+ do_4rounds_2x (\i + 4), MSG1_A, MSG2_A, MSG3_A, MSG0_A, \
+ MSG1_B, MSG2_B, MSG3_B, MSG0_B
+ do_4rounds_2x (\i + 8), MSG2_A, MSG3_A, MSG0_A, MSG1_A, \
+ MSG2_B, MSG3_B, MSG0_B, MSG1_B
+ do_4rounds_2x (\i + 12), MSG3_A, MSG0_A, MSG1_A, MSG2_A, \
+ MSG3_B, MSG0_B, MSG1_B, MSG2_B
+.endr
+
+ // Add the original state for each block.
+ paddd 0*16(%rsp), STATE0_A
+ paddd 1*16(%rsp), STATE0_B
+ paddd 2*16(%rsp), STATE1_A
+ paddd 3*16(%rsp), STATE1_B
+
+ // Update LEN and loop back if more blocks remain.
+ sub $64, LEN
+ jge .Lfinup2x_loop
+
+ // Check if any final blocks need to be handled.
+ // FINAL_STEP = 2: all done
+ // FINAL_STEP = 1: need to do count-only padding block
+ // FINAL_STEP = 0: need to do the block with 0x80 padding byte
+ cmp $1, FINAL_STEP
+ jg .Lfinup2x_done
+ je .Lfinup2x_finalize_countonly
+ add $64, LEN
+ jz .Lfinup2x_finalize_blockaligned
+
+ // Not block-aligned; 1 <= LEN <= 63 data bytes remain. Pad the block.
+ // To do this, write the padding starting with the 0x80 byte to
+ // &sp[64]. Then for each message, copy the last 64 data bytes to sp
+ // and load from &sp[64 - LEN] to get the needed padding block. This
+ // code relies on the data buffers being >= 64 bytes in length.
+ mov $64, %ebx
+ sub LEN, %ebx // ebx = 64 - LEN
+ sub %rbx, DATA1 // DATA1 -= 64 - LEN
+ sub %rbx, DATA2 // DATA2 -= 64 - LEN
+ mov $0x80, FINAL_STEP // using FINAL_STEP as a temporary
+ movd FINAL_STEP, MSG0_A
+ pxor MSG1_A, MSG1_A
+ movdqa MSG0_A, 4*16(%rsp)
+ movdqa MSG1_A, 5*16(%rsp)
+ movdqa MSG1_A, 6*16(%rsp)
+ movdqa MSG1_A, 7*16(%rsp)
+ cmp $56, LEN
+ jge 1f // will COUNT spill into its own block?
+ shl $3, COUNT
+ bswap COUNT
+ mov COUNT, 56(%rsp,%rbx)
+ mov $2, FINAL_STEP // won't need count-only block
+ jmp 2f
+1:
+ mov $1, FINAL_STEP // will need count-only block
+2:
+ movdqu 0*16(DATA1), MSG0_A
+ movdqu 1*16(DATA1), MSG1_A
+ movdqu 2*16(DATA1), MSG2_A
+ movdqu 3*16(DATA1), MSG3_A
+ movdqa MSG0_A, 0*16(%rsp)
+ movdqa MSG1_A, 1*16(%rsp)
+ movdqa MSG2_A, 2*16(%rsp)
+ movdqa MSG3_A, 3*16(%rsp)
+ movdqu 0*16(%rsp,%rbx), MSG0_A
+ movdqu 1*16(%rsp,%rbx), MSG1_A
+ movdqu 2*16(%rsp,%rbx), MSG2_A
+ movdqu 3*16(%rsp,%rbx), MSG3_A
+
+ movdqu 0*16(DATA2), MSG0_B
+ movdqu 1*16(DATA2), MSG1_B
+ movdqu 2*16(DATA2), MSG2_B
+ movdqu 3*16(DATA2), MSG3_B
+ movdqa MSG0_B, 0*16(%rsp)
+ movdqa MSG1_B, 1*16(%rsp)
+ movdqa MSG2_B, 2*16(%rsp)
+ movdqa MSG3_B, 3*16(%rsp)
+ movdqu 0*16(%rsp,%rbx), MSG0_B
+ movdqu 1*16(%rsp,%rbx), MSG1_B
+ movdqu 2*16(%rsp,%rbx), MSG2_B
+ movdqu 3*16(%rsp,%rbx), MSG3_B
+ jmp .Lfinup2x_loop_have_data
+
+ // Prepare a padding block, either:
+ //
+ // {0x80, 0, 0, 0, ..., count (as __be64)}
+	// This is for a block-aligned message.
+ //
+ // { 0, 0, 0, 0, ..., count (as __be64)}
+ // This is for a message whose length mod 64 is >= 56.
+ //
+ // Pre-swap the endianness of the words.
+.Lfinup2x_finalize_countonly:
+ pxor MSG0_A, MSG0_A
+ jmp 1f
+
+.Lfinup2x_finalize_blockaligned:
+ mov $0x80000000, %ebx
+ movd %ebx, MSG0_A
+1:
+ pxor MSG1_A, MSG1_A
+ pxor MSG2_A, MSG2_A
+ ror $29, COUNT
+ movq COUNT, MSG3_A
+ pslldq $8, MSG3_A
+ movdqa MSG0_A, MSG0_B
+ pxor MSG1_B, MSG1_B
+ pxor MSG2_B, MSG2_B
+ movdqa MSG3_A, MSG3_B
+ mov $2, FINAL_STEP
+ jmp .Lfinup2x_loop_have_bswapped_data
+
+.Lfinup2x_done:
+ // Write the two digests with all bytes in the correct order.
+ movdqa STATE0_A, TMP_A
+ movdqa STATE0_B, TMP_B
+ punpcklqdq STATE1_A, STATE0_A // GHEF
+ punpcklqdq STATE1_B, STATE0_B
+ punpckhqdq TMP_A, STATE1_A // ABCD
+ punpckhqdq TMP_B, STATE1_B
+ pshufd $0xB1, STATE0_A, STATE0_A // HGFE
+ pshufd $0xB1, STATE0_B, STATE0_B
+ pshufd $0x1B, STATE1_A, STATE1_A // DCBA
+ pshufd $0x1B, STATE1_B, STATE1_B
+ pshufb SHUF_MASK, STATE0_A
+ pshufb SHUF_MASK, STATE0_B
+ pshufb SHUF_MASK, STATE1_A
+ pshufb SHUF_MASK, STATE1_B
+ movdqu STATE0_A, 1*16(OUT1)
+ movdqu STATE0_B, 1*16(OUT2)
+ movdqu STATE1_A, 0*16(OUT1)
+ movdqu STATE1_B, 0*16(OUT2)
+
+ mov %rbp, %rsp
+ pop %rbp
+ pop %rbx
+ RET
+SYM_FUNC_END(sha256_ni_finup2x)
+
.section .rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
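
The FINAL_STEP handling in sha256_ni_finup2x above implements the standard SHA-256 padding rule for both messages at once. As an illustrative aside (not part of the patch), the rule it encodes can be sketched in C:

/*
 * Illustrative sketch of the padding rule behind FINAL_STEP above: SHA-256
 * appends 0x80, zero bytes, then the bit count as a 64-bit big-endian value.
 * If fewer than 8 bytes remain after the 0x80 byte (len % 64 >= 56), the
 * count spills into an extra, count-only block.
 */
static inline int sha256_trailing_blocks(unsigned long long len)
{
	return (len % 64 >= 56) ? 2 : 1;	/* blocks needed to finish padding */
}
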
diff --git a/lib/crypto/x86/sha256.h b/lib/crypto/x86/sha256.h
index 669bc06538b6..38e33b22a092 100644
--- a/lib/crypto/x86/sha256.h
+++ b/lib/crypto/x86/sha256.h
@@ -5,9 +5,10 @@
* Copyright 2025 Google LLC
*/
#include <asm/fpu/api.h>
-#include <crypto/internal/simd.h>
#include <linux/static_call.h>
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha_ni);
+
DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic);
#define DEFINE_X86_SHA256_FN(c_fn, asm_fn) \
@@ -16,7 +17,7 @@ DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic);
static void c_fn(struct sha256_block_state *state, const u8 *data, \
size_t nblocks) \
{ \
- if (likely(crypto_simd_usable())) { \
+ if (likely(irq_fpu_usable())) { \
kernel_fpu_begin(); \
asm_fn(state, data, nblocks); \
kernel_fpu_end(); \
@@ -36,11 +37,48 @@ static void sha256_blocks(struct sha256_block_state *state,
static_call(sha256_blocks_x86)(state, data, nblocks);
}
+static_assert(offsetof(struct __sha256_ctx, state) == 0);
+static_assert(offsetof(struct __sha256_ctx, bytecount) == 32);
+static_assert(offsetof(struct __sha256_ctx, buf) == 40);
+asmlinkage void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
+ const u8 *data1, const u8 *data2, int len,
+ u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE]);
+
+#define sha256_finup_2x_arch sha256_finup_2x_arch
+static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
+ const u8 *data1, const u8 *data2, size_t len,
+ u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE])
+{
+ /*
+ * The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX.
+ * Further limit len to 65536 to avoid spending too long with preemption
+ * disabled. (Of course, in practice len is nearly always 4096 anyway.)
+ */
+ if (static_branch_likely(&have_sha_ni) && len >= SHA256_BLOCK_SIZE &&
+ len <= 65536 && likely(irq_fpu_usable())) {
+ kernel_fpu_begin();
+ sha256_ni_finup2x(ctx, data1, data2, len, out1, out2);
+ kernel_fpu_end();
+ kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE);
+ kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE);
+ return true;
+ }
+ return false;
+}
+
+static bool sha256_finup_2x_is_optimized_arch(void)
+{
+ return static_key_enabled(&have_sha_ni);
+}
+
#define sha256_mod_init_arch sha256_mod_init_arch
-static inline void sha256_mod_init_arch(void)
+static void sha256_mod_init_arch(void)
{
if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
static_call_update(sha256_blocks_x86, sha256_blocks_ni);
+ static_branch_enable(&have_sha_ni);
} else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
NULL) &&
boot_cpu_has(X86_FEATURE_AVX)) {
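
sha256_finup_2x_arch() above is opportunistic: it returns false when SHA-NI is unavailable, len is out of range, or the FPU cannot be used, and the caller then falls back to finishing each message serially. A minimal sketch of that contract, assuming a hypothetical finish_one() helper standing in for the caller's existing single-message path:

/* Hypothetical serial fallback assumed to exist in the caller. */
void finish_one(const struct __sha256_ctx *ctx, const u8 *data, size_t len,
		u8 out[SHA256_DIGEST_SIZE]);

/* Illustrative only, not part of the patch. */
static void finup_2x_or_serial(const struct __sha256_ctx *ctx,
			       const u8 *data1, const u8 *data2, size_t len,
			       u8 out1[SHA256_DIGEST_SIZE],
			       u8 out2[SHA256_DIGEST_SIZE])
{
	if (sha256_finup_2x_arch(ctx, data1, data2, len, out1, out2))
		return;			/* both digests done in one interleaved pass */

	finish_one(ctx, data1, len, out1);
	finish_one(ctx, data2, len, out2);
}
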
diff --git a/lib/crypto/x86/sha512.h b/lib/crypto/x86/sha512.h
index c13503d9d57d..0213c70cedd0 100644
--- a/lib/crypto/x86/sha512.h
+++ b/lib/crypto/x86/sha512.h
@@ -4,9 +4,7 @@
*
* Copyright 2025 Google LLC
*/
-
#include <asm/fpu/api.h>
-#include <crypto/internal/simd.h>
#include <linux/static_call.h>
DEFINE_STATIC_CALL(sha512_blocks_x86, sha512_blocks_generic);
@@ -17,7 +15,7 @@ DEFINE_STATIC_CALL(sha512_blocks_x86, sha512_blocks_generic);
static void c_fn(struct sha512_block_state *state, const u8 *data, \
size_t nblocks) \
{ \
- if (likely(crypto_simd_usable())) { \
+ if (likely(irq_fpu_usable())) { \
kernel_fpu_begin(); \
asm_fn(state, data, nblocks); \
kernel_fpu_end(); \
@@ -37,7 +35,7 @@ static void sha512_blocks(struct sha512_block_state *state,
}
#define sha512_mod_init_arch sha512_mod_init_arch
-static inline void sha512_mod_init_arch(void)
+static void sha512_mod_init_arch(void)
{
if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) &&
boot_cpu_has(X86_FEATURE_AVX)) {
diff --git a/lib/raid6/recov_rvv.c b/lib/raid6/recov_rvv.c
index 5d54c4b437df..5f779719c3d3 100644
--- a/lib/raid6/recov_rvv.c
+++ b/lib/raid6/recov_rvv.c
@@ -4,9 +4,7 @@
* Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
*/
-#include <asm/simd.h>
#include <asm/vector.h>
-#include <crypto/internal/simd.h>
#include <linux/raid/pq.h>
static int rvv_has_vector(void)
diff --git a/lib/raid6/rvv.c b/lib/raid6/rvv.c
index 7d82efa5b14f..89da5fc247aa 100644
--- a/lib/raid6/rvv.c
+++ b/lib/raid6/rvv.c
@@ -9,11 +9,8 @@
* Copyright 2002-2004 H. Peter Anvin
*/
-#include <asm/simd.h>
#include <asm/vector.h>
-#include <crypto/internal/simd.h>
#include <linux/raid/pq.h>
-#include <linux/types.h>
#include "rvv.h"
#define NSIZE (riscv_v_vsize / 32) /* NSIZE = vlenb */
@@ -47,7 +44,7 @@ static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE])
@@ -120,7 +117,7 @@ static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE])
@@ -221,9 +218,9 @@ static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
"vle8.v v4, (%[wp1])\n"
- "vle8.v v5, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
@@ -313,9 +310,9 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
"vle8.v v4, (%[wp1])\n"
- "vle8.v v5, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
@@ -443,13 +440,13 @@ static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
"vle8.v v4, (%[wp1])\n"
- "vle8.v v5, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
"vle8.v v8, (%[wp2])\n"
- "vle8.v v9, (%[wp2])\n"
+ "vmv.v.v v9, v8\n"
"vle8.v v12, (%[wp3])\n"
- "vle8.v v13, (%[wp3])\n"
+ "vmv.v.v v13, v12\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
@@ -569,13 +566,13 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
"vle8.v v4, (%[wp1])\n"
- "vle8.v v5, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
"vle8.v v8, (%[wp2])\n"
- "vle8.v v9, (%[wp2])\n"
+ "vmv.v.v v9, v8\n"
"vle8.v v12, (%[wp3])\n"
- "vle8.v v13, (%[wp3])\n"
+ "vmv.v.v v13, v12\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
@@ -757,21 +754,21 @@ static void raid6_rvv8_gen_syndrome_real(int disks, unsigned long bytes, void **
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
"vle8.v v4, (%[wp1])\n"
- "vle8.v v5, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
"vle8.v v8, (%[wp2])\n"
- "vle8.v v9, (%[wp2])\n"
+ "vmv.v.v v9, v8\n"
"vle8.v v12, (%[wp3])\n"
- "vle8.v v13, (%[wp3])\n"
+ "vmv.v.v v13, v12\n"
"vle8.v v16, (%[wp4])\n"
- "vle8.v v17, (%[wp4])\n"
+ "vmv.v.v v17, v16\n"
"vle8.v v20, (%[wp5])\n"
- "vle8.v v21, (%[wp5])\n"
+ "vmv.v.v v21, v20\n"
"vle8.v v24, (%[wp6])\n"
- "vle8.v v25, (%[wp6])\n"
+ "vmv.v.v v25, v24\n"
"vle8.v v28, (%[wp7])\n"
- "vle8.v v29, (%[wp7])\n"
+ "vmv.v.v v29, v28\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
@@ -951,21 +948,21 @@ static void raid6_rvv8_xor_syndrome_real(int disks, int start, int stop,
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
"vle8.v v4, (%[wp1])\n"
- "vle8.v v5, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
"vle8.v v8, (%[wp2])\n"
- "vle8.v v9, (%[wp2])\n"
+ "vmv.v.v v9, v8\n"
"vle8.v v12, (%[wp3])\n"
- "vle8.v v13, (%[wp3])\n"
+ "vmv.v.v v13, v12\n"
"vle8.v v16, (%[wp4])\n"
- "vle8.v v17, (%[wp4])\n"
+ "vmv.v.v v17, v16\n"
"vle8.v v20, (%[wp5])\n"
- "vle8.v v21, (%[wp5])\n"
+ "vmv.v.v v21, v20\n"
"vle8.v v24, (%[wp6])\n"
- "vle8.v v25, (%[wp6])\n"
+ "vmv.v.v v25, v24\n"
"vle8.v v28, (%[wp7])\n"
- "vle8.v v29, (%[wp7])\n"
+ "vmv.v.v v29, v28\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
diff --git a/lib/ref_tracker.c b/lib/ref_tracker.c
index a9e6ffcff04b..cce12287708e 100644
--- a/lib/ref_tracker.c
+++ b/lib/ref_tracker.c
@@ -434,7 +434,7 @@ void ref_tracker_dir_debugfs(struct ref_tracker_dir *dir)
if (dentry && !xa_is_err(dentry))
return;
- ret = snprintf(name, sizeof(name), "%s@%px", dir->class, dir);
+ ret = snprintf(name, sizeof(name), "%s@%p", dir->class, dir);
name[sizeof(name) - 1] = '\0';
if (ret < sizeof(name)) {
diff --git a/lib/tests/Makefile b/lib/tests/Makefile
index fa6d728a8b5b..f7460831cfdd 100644
--- a/lib/tests/Makefile
+++ b/lib/tests/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_BLACKHOLE_DEV_KUNIT_TEST) += blackhole_dev_kunit.o
obj-$(CONFIG_CHECKSUM_KUNIT) += checksum_kunit.o
obj-$(CONFIG_CMDLINE_KUNIT_TEST) += cmdline_kunit.o
obj-$(CONFIG_CPUMASK_KUNIT_TEST) += cpumask_kunit.o
+obj-$(CONFIG_FFS_KUNIT_TEST) += ffs_kunit.o
CFLAGS_fortify_kunit.o += $(call cc-disable-warning, unsequenced)
CFLAGS_fortify_kunit.o += $(call cc-disable-warning, stringop-overread)
CFLAGS_fortify_kunit.o += $(call cc-disable-warning, stringop-truncation)
diff --git a/lib/tests/ffs_kunit.c b/lib/tests/ffs_kunit.c
new file mode 100644
index 000000000000..9a329cdc09c2
--- /dev/null
+++ b/lib/tests/ffs_kunit.c
@@ -0,0 +1,566 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KUnit tests for ffs()-family functions
+ */
+#include <kunit/test.h>
+#include <linux/bitops.h>
+
+/*
+ * Test data structures
+ */
+struct ffs_test_case {
+ unsigned long input;
+ int expected_ffs; /* ffs() result (1-based) */
+ int expected_fls; /* fls() result (1-based) */
+ const char *description;
+};
+
+struct ffs64_test_case {
+ u64 input;
+ int expected_fls64; /* fls64() result (1-based) */
+ unsigned int expected_ffs64_0based; /* __ffs64() result (0-based) */
+ const char *description;
+};
+
+/*
+ * Basic edge cases - core functionality validation
+ */
+static const struct ffs_test_case basic_test_cases[] = {
+ /* Zero case - special handling */
+ {0x00000000, 0, 0, "zero value"},
+
+ /* Single bit patterns - powers of 2 */
+ {0x00000001, 1, 1, "bit 0 set"},
+ {0x00000002, 2, 2, "bit 1 set"},
+ {0x00000004, 3, 3, "bit 2 set"},
+ {0x00000008, 4, 4, "bit 3 set"},
+ {0x00000010, 5, 5, "bit 4 set"},
+ {0x00000020, 6, 6, "bit 5 set"},
+ {0x00000040, 7, 7, "bit 6 set"},
+ {0x00000080, 8, 8, "bit 7 set"},
+ {0x00000100, 9, 9, "bit 8 set"},
+ {0x00008000, 16, 16, "bit 15 set"},
+ {0x00010000, 17, 17, "bit 16 set"},
+ {0x40000000, 31, 31, "bit 30 set"},
+ {0x80000000, 32, 32, "bit 31 set (sign bit)"},
+
+ /* Maximum values */
+ {0xFFFFFFFF, 1, 32, "all bits set"},
+
+ /* Multiple bit patterns */
+ {0x00000003, 1, 2, "bits 0-1 set"},
+ {0x00000007, 1, 3, "bits 0-2 set"},
+ {0x0000000F, 1, 4, "bits 0-3 set"},
+ {0x000000FF, 1, 8, "bits 0-7 set"},
+ {0x0000FFFF, 1, 16, "bits 0-15 set"},
+ {0x7FFFFFFF, 1, 31, "bits 0-30 set"},
+
+ /* Sparse patterns */
+ {0x00000101, 1, 9, "bits 0,8 set"},
+ {0x00001001, 1, 13, "bits 0,12 set"},
+ {0x80000001, 1, 32, "bits 0,31 set"},
+ {0x40000002, 2, 31, "bits 1,30 set"},
+};
+
+/*
+ * 64-bit test cases
+ */
+static const struct ffs64_test_case ffs64_test_cases[] = {
+ /* Zero case */
+ {0x0000000000000000ULL, 0, 0, "zero value"},
+
+ /* Single bit patterns */
+ {0x0000000000000001ULL, 1, 0, "bit 0 set"},
+ {0x0000000000000002ULL, 2, 1, "bit 1 set"},
+ {0x0000000000000004ULL, 3, 2, "bit 2 set"},
+ {0x0000000000000008ULL, 4, 3, "bit 3 set"},
+ {0x0000000000008000ULL, 16, 15, "bit 15 set"},
+ {0x0000000000010000ULL, 17, 16, "bit 16 set"},
+ {0x0000000080000000ULL, 32, 31, "bit 31 set"},
+ {0x0000000100000000ULL, 33, 32, "bit 32 set"},
+ {0x0000000200000000ULL, 34, 33, "bit 33 set"},
+ {0x4000000000000000ULL, 63, 62, "bit 62 set"},
+ {0x8000000000000000ULL, 64, 63, "bit 63 set (sign bit)"},
+
+ /* Maximum values */
+ {0xFFFFFFFFFFFFFFFFULL, 64, 0, "all bits set"},
+
+ /* Cross 32-bit boundary patterns */
+ {0x00000000FFFFFFFFULL, 32, 0, "lower 32 bits set"},
+ {0xFFFFFFFF00000000ULL, 64, 32, "upper 32 bits set"},
+ {0x8000000000000001ULL, 64, 0, "bits 0,63 set"},
+ {0x4000000000000002ULL, 63, 1, "bits 1,62 set"},
+
+ /* Mixed patterns */
+ {0x00000001FFFFFFFFULL, 33, 0, "bit 32 + lower 32 bits"},
+ {0xFFFFFFFF80000000ULL, 64, 31, "upper 32 bits + bit 31"},
+};
+
+/*
+ * Helper function to validate ffs results with detailed error messages
+ */
+static void validate_ffs_result(struct kunit *test, unsigned long input,
+ int actual, int expected, const char *func_name,
+ const char *description)
+{
+ KUNIT_EXPECT_EQ_MSG(test, actual, expected,
+ "%s(0x%08lx) [%s]: expected %d, got %d",
+ func_name, input, description, expected, actual);
+}
+
+/*
+ * Helper function to validate 64-bit ffs results
+ */
+static void validate_ffs64_result(struct kunit *test, u64 input,
+ int actual, int expected, const char *func_name,
+ const char *description)
+{
+ KUNIT_EXPECT_EQ_MSG(test, actual, expected,
+ "%s(0x%016llx) [%s]: expected %d, got %d",
+ func_name, input, description, expected, actual);
+}
+
+/*
+ * Helper function to validate mathematical relationships between functions
+ */
+static void validate_ffs_relationships(struct kunit *test, unsigned long input)
+{
+ int ffs_result;
+ int fls_result;
+ unsigned int ffs_0based;
+ unsigned int fls_0based;
+
+ if (input == 0) {
+ /* Special case: zero input */
+ KUNIT_EXPECT_EQ(test, ffs(input), 0);
+ KUNIT_EXPECT_EQ(test, fls(input), 0);
+ /* __ffs and __fls are undefined for 0, but often return specific values */
+ return;
+ }
+
+ ffs_result = ffs(input);
+ fls_result = fls(input);
+ ffs_0based = __ffs(input);
+ fls_0based = __fls(input);
+
+ /* Relationship: ffs(x) == __ffs(x) + 1 for x != 0 */
+ KUNIT_EXPECT_EQ_MSG(test, ffs_result, ffs_0based + 1,
+ "ffs(0x%08lx) != __ffs(0x%08lx) + 1: %d != %u + 1",
+ input, input, ffs_result, ffs_0based);
+
+ /* Relationship: fls(x) == __fls(x) + 1 for x != 0 */
+ KUNIT_EXPECT_EQ_MSG(test, fls_result, fls_0based + 1,
+ "fls(0x%08lx) != __fls(0x%08lx) + 1: %d != %u + 1",
+ input, input, fls_result, fls_0based);
+
+ /* Range validation */
+ KUNIT_EXPECT_GE(test, ffs_result, 1);
+ KUNIT_EXPECT_LE(test, ffs_result, BITS_PER_LONG);
+ KUNIT_EXPECT_GE(test, fls_result, 1);
+ KUNIT_EXPECT_LE(test, fls_result, BITS_PER_LONG);
+}
+
+/*
+ * Helper function to validate 64-bit relationships
+ */
+static void validate_ffs64_relationships(struct kunit *test, u64 input)
+{
+ int fls64_result;
+ unsigned int ffs64_0based;
+
+ if (input == 0) {
+ KUNIT_EXPECT_EQ(test, fls64(input), 0);
+ return;
+ }
+
+ fls64_result = fls64(input);
+ ffs64_0based = __ffs64(input);
+
+ /* Range validation */
+ KUNIT_EXPECT_GE(test, fls64_result, 1);
+ KUNIT_EXPECT_LE(test, fls64_result, 64);
+ KUNIT_EXPECT_LT(test, ffs64_0based, 64);
+
+ /*
+ * Relationships with 32-bit functions should hold for small values
+ * on all architectures.
+ */
+ if (input <= 0xFFFFFFFFULL) {
+ unsigned long input_32 = (unsigned long)input;
+ KUNIT_EXPECT_EQ_MSG(test, fls64(input), fls(input_32),
+ "fls64(0x%llx) != fls(0x%lx): %d != %d",
+ input, input_32, fls64(input), fls(input_32));
+
+ if (input != 0) {
+ KUNIT_EXPECT_EQ_MSG(test, __ffs64(input), __ffs(input_32),
+ "__ffs64(0x%llx) != __ffs(0x%lx): %lu != %lu",
+ input, input_32,
+ (unsigned long)__ffs64(input),
+ (unsigned long)__ffs(input_32));
+ }
+ }
+}
+
+/*
+ * Test basic correctness of all ffs-family functions
+ */
+static void ffs_basic_correctness_test(struct kunit *test)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(basic_test_cases); i++) {
+ const struct ffs_test_case *tc = &basic_test_cases[i];
+
+ /* Test ffs() */
+ validate_ffs_result(test, tc->input, ffs(tc->input),
+ tc->expected_ffs, "ffs", tc->description);
+
+ /* Test fls() */
+ validate_ffs_result(test, tc->input, fls(tc->input),
+ tc->expected_fls, "fls", tc->description);
+
+ /* Test __ffs() - skip zero case as it's undefined */
+ if (tc->input != 0) {
+ /* Calculate expected __ffs() result: __ffs(x) == ffs(x) - 1 */
+ unsigned int expected_ffs_0based = tc->expected_ffs - 1;
+ validate_ffs_result(test, tc->input, __ffs(tc->input),
+ expected_ffs_0based, "__ffs", tc->description);
+ }
+
+ /* Test __fls() - skip zero case as it's undefined */
+ if (tc->input != 0) {
+ /* Calculate expected __fls() result: __fls(x) == fls(x) - 1 */
+ unsigned int expected_fls_0based = tc->expected_fls - 1;
+ validate_ffs_result(test, tc->input, __fls(tc->input),
+ expected_fls_0based, "__fls", tc->description);
+ }
+ }
+}
+
+/*
+ * Test 64-bit function correctness
+ */
+static void ffs64_correctness_test(struct kunit *test)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(ffs64_test_cases); i++) {
+ const struct ffs64_test_case *tc = &ffs64_test_cases[i];
+
+ /* Test fls64() */
+ validate_ffs64_result(test, tc->input, fls64(tc->input),
+ tc->expected_fls64, "fls64", tc->description);
+
+ /* Test __ffs64() - skip zero case as it's undefined */
+ if (tc->input != 0) {
+ validate_ffs64_result(test, tc->input, __ffs64(tc->input),
+ tc->expected_ffs64_0based, "__ffs64",
+ tc->description);
+ }
+ }
+}
+
+/*
+ * Test mathematical relationships between functions
+ */
+static void ffs_mathematical_relationships_test(struct kunit *test)
+{
+ int i;
+
+ /* Test basic cases */
+ for (i = 0; i < ARRAY_SIZE(basic_test_cases); i++) {
+ validate_ffs_relationships(test, basic_test_cases[i].input);
+ }
+
+ /* Test 64-bit cases */
+ for (i = 0; i < ARRAY_SIZE(ffs64_test_cases); i++) {
+ validate_ffs64_relationships(test, ffs64_test_cases[i].input);
+ }
+}
+
+/*
+ * Test edge cases and boundary conditions
+ */
+static void ffs_edge_cases_test(struct kunit *test)
+{
+ unsigned long test_patterns[] = {
+ /* Powers of 2 */
+ 1UL, 2UL, 4UL, 8UL, 16UL, 32UL, 64UL, 128UL,
+ 256UL, 512UL, 1024UL, 2048UL, 4096UL, 8192UL,
+
+ /* Powers of 2 minus 1 */
+ 1UL, 3UL, 7UL, 15UL, 31UL, 63UL, 127UL, 255UL,
+ 511UL, 1023UL, 2047UL, 4095UL, 8191UL,
+
+ /* Boundary values */
+ 0x7FFFFFFFUL, /* Maximum positive 32-bit */
+ 0x80000000UL, /* Minimum negative 32-bit */
+ 0xFFFFFFFFUL, /* Maximum 32-bit unsigned */
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(test_patterns); i++) {
+ validate_ffs_relationships(test, test_patterns[i]);
+ }
+}
+
+/*
+ * Test 64-bit edge cases
+ */
+static void ffs64_edge_cases_test(struct kunit *test)
+{
+ u64 test_patterns_64[] = {
+ /* 64-bit powers of 2 */
+ 0x0000000100000000ULL, /* 2^32 */
+ 0x0000000200000000ULL, /* 2^33 */
+ 0x0000000400000000ULL, /* 2^34 */
+ 0x0000001000000000ULL, /* 2^36 */
+ 0x0000010000000000ULL, /* 2^40 */
+ 0x0001000000000000ULL, /* 2^48 */
+ 0x0100000000000000ULL, /* 2^56 */
+ 0x4000000000000000ULL, /* 2^62 */
+ 0x8000000000000000ULL, /* 2^63 */
+
+ /* Cross-boundary patterns */
+ 0x00000000FFFFFFFFULL, /* Lower 32 bits */
+ 0xFFFFFFFF00000000ULL, /* Upper 32 bits */
+ 0x7FFFFFFFFFFFFFFFULL, /* Maximum positive 64-bit */
+ 0xFFFFFFFFFFFFFFFFULL, /* Maximum 64-bit unsigned */
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(test_patterns_64); i++) {
+ validate_ffs64_relationships(test, test_patterns_64[i]);
+ }
+}
+
+/*
+ * ffz() test data - Find First Zero bit test cases
+ */
+struct ffz_test_case {
+ unsigned long input;
+ unsigned long expected_ffz;
+ const char *description;
+};
+
+static const struct ffz_test_case ffz_test_cases[] = {
+ /* Zero bits in specific positions */
+ {0xFFFFFFFE, 0, "bit 0 is zero"}, /* ...11111110 */
+ {0xFFFFFFFD, 1, "bit 1 is zero"}, /* ...11111101 */
+ {0xFFFFFFFB, 2, "bit 2 is zero"}, /* ...11111011 */
+ {0xFFFFFFF7, 3, "bit 3 is zero"}, /* ...11110111 */
+ {0xFFFFFFEF, 4, "bit 4 is zero"}, /* ...11101111 */
+ {0xFFFFFFDF, 5, "bit 5 is zero"}, /* ...11011111 */
+ {0xFFFFFFBF, 6, "bit 6 is zero"}, /* ...10111111 */
+ {0xFFFFFF7F, 7, "bit 7 is zero"}, /* ...01111111 */
+ {0xFFFFFEFF, 8, "bit 8 is zero"}, /* Gap in bit 8 */
+ {0xFFFF7FFF, 15, "bit 15 is zero"}, /* Gap in bit 15 */
+ {0xFFFEFFFF, 16, "bit 16 is zero"}, /* Gap in bit 16 */
+ {0xBFFFFFFF, 30, "bit 30 is zero"}, /* Gap in bit 30 */
+ {0x7FFFFFFF, 31, "bit 31 is zero"}, /* 01111111... */
+
+ /* Multiple zero patterns */
+ {0xFFFFFFFC, 0, "bits 0-1 are zero"}, /* ...11111100 */
+ {0xFFFFFFF8, 0, "bits 0-2 are zero"}, /* ...11111000 */
+ {0xFFFFFFF0, 0, "bits 0-3 are zero"}, /* ...11110000 */
+ {0xFFFFFF00, 0, "bits 0-7 are zero"}, /* ...00000000 */
+ {0xFFFF0000, 0, "bits 0-15 are zero"}, /* Lower 16 bits zero */
+
+ /* All zeros (special case) */
+ {0x00000000, 0, "all bits zero"},
+
+ /* Complex patterns */
+ {0xFFFDFFFF, 17, "bit 17 is zero"}, /* Gap in bit 17 */
+ {0xFFF7FFFF, 19, "bit 19 is zero"}, /* Gap in bit 19 */
+ {0xF7FFFFFF, 27, "bit 27 is zero"}, /* Gap in bit 27 */
+ {0xDFFFFFFF, 29, "bit 29 is zero"}, /* Gap in bit 29 */
+};
+
+/*
+ * Test basic correctness of ffz() function
+ */
+static void ffz_basic_correctness_test(struct kunit *test)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(ffz_test_cases); i++) {
+ const struct ffz_test_case *tc = &ffz_test_cases[i];
+ unsigned long result = ffz(tc->input);
+
+ KUNIT_EXPECT_EQ_MSG(test, result, tc->expected_ffz,
+ "ffz(0x%08lx) [%s]: expected %lu, got %lu",
+ tc->input, tc->description, tc->expected_ffz, result);
+ }
+}
+
+/*
+ * Test mathematical relationships between ffz() and other functions
+ */
+static void validate_ffz_relationships(struct kunit *test, unsigned long input)
+{
+ unsigned long ffz_result;
+
+ if (input == 0) {
+ /* ffz(0) should return 0 (first zero bit is at position 0) */
+ KUNIT_EXPECT_EQ(test, ffz(input), 0);
+ return;
+ }
+
+ if (input == ~0UL) {
+ /* ffz(~0) is undefined (no zero bits) - just verify it doesn't crash */
+ ffz_result = ffz(input);
+ /* Implementation-defined behavior, just ensure it completes */
+ return;
+ }
+
+ ffz_result = ffz(input);
+
+ /* Range validation - result should be within valid bit range */
+ KUNIT_EXPECT_LT(test, ffz_result, BITS_PER_LONG);
+
+ /* Verify the bit at ffz_result position is actually zero */
+ KUNIT_EXPECT_EQ_MSG(test, (input >> ffz_result) & 1, 0,
+ "ffz(0x%08lx) = %lu, but bit %lu is not zero",
+ input, ffz_result, ffz_result);
+
+ /* Core relationship: if we set the ffz bit, ffz should find a different bit */
+ if (ffz_result < BITS_PER_LONG - 1) {
+ unsigned long modified = input | (1UL << ffz_result);
+ if (modified != ~0UL) { /* Skip if all bits would be set */
+ unsigned long new_ffz = ffz(modified);
+ KUNIT_EXPECT_NE_MSG(test, new_ffz, ffz_result,
+ "ffz(0x%08lx) = %lu, but setting that bit doesn't change ffz result",
+ input, ffz_result);
+ }
+ }
+}
+
+static void ffz_mathematical_relationships_test(struct kunit *test)
+{
+ unsigned long test_patterns[] = {
+ /* Powers of 2 with one bit clear */
+ 0xFFFFFFFE, 0xFFFFFFFD, 0xFFFFFFFB, 0xFFFFFFF7,
+ 0xFFFFFFEF, 0xFFFFFFDF, 0xFFFFFFBF, 0xFFFFFF7F,
+
+ /* Multiple patterns */
+ 0xFFFFFF00, 0xFFFFF000, 0xFFFF0000, 0xFFF00000,
+ 0x7FFFFFFF, 0x3FFFFFFF, 0x1FFFFFFF, 0x0FFFFFFF,
+
+ /* Complex bit patterns */
+ 0xAAAAAAAA, 0x55555555, 0xCCCCCCCC, 0x33333333,
+ 0xF0F0F0F0, 0x0F0F0F0F, 0xFF00FF00, 0x00FF00FF,
+ };
+ int i;
+
+ /* Test basic test cases */
+ for (i = 0; i < ARRAY_SIZE(ffz_test_cases); i++) {
+ validate_ffz_relationships(test, ffz_test_cases[i].input);
+ }
+
+ /* Test additional patterns */
+ for (i = 0; i < ARRAY_SIZE(test_patterns); i++) {
+ validate_ffz_relationships(test, test_patterns[i]);
+ }
+}
+
+/*
+ * Test edge cases and boundary conditions for ffz()
+ */
+static void ffz_edge_cases_test(struct kunit *test)
+{
+ unsigned long edge_patterns[] = {
+ /* Boundary values */
+ 0x00000000, /* All zeros */
+ 0x80000000, /* Only MSB set */
+ 0x00000001, /* Only LSB set */
+ 0x7FFFFFFF, /* MSB clear */
+ 0xFFFFFFFE, /* LSB clear */
+
+ /* Powers of 2 complement patterns (one zero bit each) */
+ ~(1UL << 0), ~(1UL << 1), ~(1UL << 2), ~(1UL << 3),
+ ~(1UL << 4), ~(1UL << 8), ~(1UL << 16), ~(1UL << 31),
+
+ /* Walking zero patterns */
+ 0xFFFFFFFE, 0xFFFFFFFD, 0xFFFFFFFB, 0xFFFFFFF7,
+ 0xFFFFFFEF, 0xFFFFFFDF, 0xFFFFFFBF, 0xFFFFFF7F,
+ 0xFFFFFEFF, 0xFFFFFDFF, 0xFFFFFBFF, 0xFFFFF7FF,
+
+ /* Multiple zeros */
+ 0xFFFFFF00, 0xFFFFF000, 0xFFFF0000, 0xFFF00000,
+ 0xFF000000, 0xF0000000, 0x00000000,
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(edge_patterns); i++) {
+ validate_ffz_relationships(test, edge_patterns[i]);
+ }
+}
+
+/*
+ * To have useful build error output, split the tests into separate
+ * functions so it's clear which are missing __attribute_const__.
+ */
+#define CREATE_WRAPPER(func) \
+static noinline bool build_test_##func(void) \
+{ \
+ int init_##func = 32; \
+ int result_##func = func(6); \
+ \
+ /* Does the static initializer vanish after calling func? */ \
+ BUILD_BUG_ON(init_##func < 32); \
+ \
+ /* "Consume" the results so optimizer doesn't drop them. */ \
+ barrier_data(&init_##func); \
+ barrier_data(&result_##func); \
+ \
+ return true; \
+}
+CREATE_WRAPPER(ffs)
+CREATE_WRAPPER(fls)
+CREATE_WRAPPER(__ffs)
+CREATE_WRAPPER(__fls)
+CREATE_WRAPPER(ffz)
+#undef CREATE_WRAPPER
+
+/*
+ * Make sure that __attribute_const__ has been applied to all the
+ * functions. This is a regression test for:
+ * https://github.com/KSPP/linux/issues/364
+ */
+static void ffs_attribute_const_test(struct kunit *test)
+{
+ KUNIT_EXPECT_TRUE(test, build_test_ffs());
+ KUNIT_EXPECT_TRUE(test, build_test_fls());
+ KUNIT_EXPECT_TRUE(test, build_test___ffs());
+ KUNIT_EXPECT_TRUE(test, build_test___fls());
+ KUNIT_EXPECT_TRUE(test, build_test_ffz());
+}
+
+/*
+ * KUnit test case definitions
+ */
+static struct kunit_case ffs_test_cases[] = {
+ KUNIT_CASE(ffs_basic_correctness_test),
+ KUNIT_CASE(ffs64_correctness_test),
+ KUNIT_CASE(ffs_mathematical_relationships_test),
+ KUNIT_CASE(ffs_edge_cases_test),
+ KUNIT_CASE(ffs64_edge_cases_test),
+ KUNIT_CASE(ffz_basic_correctness_test),
+ KUNIT_CASE(ffz_mathematical_relationships_test),
+ KUNIT_CASE(ffz_edge_cases_test),
+ KUNIT_CASE(ffs_attribute_const_test),
+ {}
+};
+
+/*
+ * KUnit test suite definition
+ */
+static struct kunit_suite ffs_test_suite = {
+ .name = "ffs",
+ .test_cases = ffs_test_cases,
+};
+
+kunit_test_suites(&ffs_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests for ffs()-family functions");
+MODULE_LICENSE("GPL");
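
The identities this suite exercises can be restated compactly. The sketch below is illustrative only; it uses GCC/Clang builtins in place of the kernel helpers, with the assumed equivalences noted in comments:

#include <assert.h>
#include <stdint.h>

/* Illustrative only: the identities checked above, for 32-bit x. */
static void check_identities(uint32_t x)
{
	if (x == 0 || x == UINT32_MAX)
		return;				/* zero / all-ones are special-cased */

	int ffs_1 = __builtin_ffs(x);		/* like ffs(): 1-based lowest set bit    */
	int ffs_0 = __builtin_ctz(x);		/* like __ffs(): 0-based lowest set bit  */
	int fls_1 = 32 - __builtin_clz(x);	/* like fls(): 1-based highest set bit   */
	int fls_0 = 31 - __builtin_clz(x);	/* like __fls(): 0-based highest set bit */
	int ffz_0 = __builtin_ctz(~x);		/* like ffz(): 0-based lowest clear bit  */

	assert(ffs_1 == ffs_0 + 1);		/* ffs(x) == __ffs(x) + 1 */
	assert(fls_1 == fls_0 + 1);		/* fls(x) == __fls(x) + 1 */
	assert(((x >> ffz_0) & 1) == 0);	/* the bit ffz() finds really is clear */
}
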
diff --git a/lib/ubsan.c b/lib/ubsan.c
index a6ca235dd714..456e3dd8f4ea 100644
--- a/lib/ubsan.c
+++ b/lib/ubsan.c
@@ -333,18 +333,18 @@ EXPORT_SYMBOL(__ubsan_handle_implicit_conversion);
void __ubsan_handle_divrem_overflow(void *_data, void *lhs, void *rhs)
{
struct overflow_data *data = _data;
- char rhs_val_str[VALUE_LENGTH];
+ char lhs_val_str[VALUE_LENGTH];
if (suppress_report(&data->location))
return;
ubsan_prologue(&data->location, "division-overflow");
- val_to_string(rhs_val_str, sizeof(rhs_val_str), data->type, rhs);
+ val_to_string(lhs_val_str, sizeof(lhs_val_str), data->type, lhs);
if (type_is_signed(data->type) && get_signed_val(data->type, rhs) == -1)
pr_err("division of %s by -1 cannot be represented in type %s\n",
- rhs_val_str, data->type->type_name);
+ lhs_val_str, data->type->type_name);
else
pr_err("division by zero\n");
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 783904d8c5ef..0beaca6bacf7 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -633,6 +633,7 @@ static void cgwb_release_workfn(struct work_struct *work)
wb_exit(wb);
bdi_put(bdi);
WARN_ON_ONCE(!list_empty(&wb->b_attached));
+ WARN_ON_ONCE(work_pending(&wb->switch_work));
call_rcu(&wb->rcu, cgwb_free_rcu);
}
@@ -709,6 +710,8 @@ static int cgwb_create(struct backing_dev_info *bdi,
wb->memcg_css = memcg_css;
wb->blkcg_css = blkcg_css;
INIT_LIST_HEAD(&wb->b_attached);
+ INIT_WORK(&wb->switch_work, inode_switch_wbs_work_fn);
+ init_llist_head(&wb->switch_wbs_ctxs);
INIT_WORK(&wb->release_work, cgwb_release_workfn);
set_bit(WB_registered, &wb->state);
bdi_get(bdi);
@@ -839,6 +842,8 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
if (!ret) {
bdi->wb.memcg_css = &root_mem_cgroup->css;
bdi->wb.blkcg_css = blkcg_root_css;
+ INIT_WORK(&bdi->wb.switch_work, inode_switch_wbs_work_fn);
+ init_llist_head(&bdi->wb.switch_wbs_ctxs);
}
return ret;
}
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 2a4a649805c1..03c5dbabb156 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -254,4 +254,10 @@ const struct movable_operations balloon_mops = {
.putback_page = balloon_page_putback,
};
+static int __init balloon_init(void)
+{
+ return set_movable_ops(&balloon_mops, PGTY_offline);
+}
+core_initcall(balloon_init);
+
#endif /* CONFIG_BALLOON_COMPACTION */
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 52a48c9316bc..08065b363972 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -845,6 +845,18 @@ static struct damos_filter *damos_nth_filter(int n, struct damos *s)
return NULL;
}
+static struct damos_filter *damos_nth_ops_filter(int n, struct damos *s)
+{
+ struct damos_filter *filter;
+ int i = 0;
+
+ damos_for_each_ops_filter(filter, s) {
+ if (i++ == n)
+ return filter;
+ }
+ return NULL;
+}
+
static void damos_commit_filter_arg(
struct damos_filter *dst, struct damos_filter *src)
{
@@ -871,6 +883,7 @@ static void damos_commit_filter(
{
dst->type = src->type;
dst->matching = src->matching;
+ dst->allow = src->allow;
damos_commit_filter_arg(dst, src);
}
@@ -908,7 +921,7 @@ static int damos_commit_ops_filters(struct damos *dst, struct damos *src)
int i = 0, j = 0;
damos_for_each_ops_filter_safe(dst_filter, next, dst) {
- src_filter = damos_nth_filter(i++, src);
+ src_filter = damos_nth_ops_filter(i++, src);
if (src_filter)
damos_commit_filter(dst_filter, src_filter);
else
@@ -2060,8 +2073,8 @@ static void damos_set_effective_quota(struct damos_quota *quota)
if (quota->ms) {
if (quota->total_charged_ns)
- throughput = quota->total_charged_sz * 1000000 /
- quota->total_charged_ns;
+ throughput = mult_frac(quota->total_charged_sz, 1000000,
+ quota->total_charged_ns);
else
throughput = PAGE_SIZE * 1024;
esz = min(throughput * quota->ms, esz);
@@ -2098,6 +2111,10 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
if (!quota->ms && !quota->sz && list_empty(&quota->goals))
return;
+ /* First charge window */
+ if (!quota->total_charged_sz && !quota->charged_from)
+ quota->charged_from = jiffies;
+
/* New charge window starts */
if (time_after_eq(jiffies, quota->charged_from +
msecs_to_jiffies(quota->reset_interval))) {
@@ -2462,10 +2479,14 @@ static void kdamond_call(struct damon_ctx *ctx, bool cancel)
mutex_lock(&ctx->call_controls_lock);
list_del(&control->list);
mutex_unlock(&ctx->call_controls_lock);
- if (!control->repeat)
+ if (!control->repeat) {
complete(&control->completion);
- else
+ } else if (control->canceled && control->dealloc_on_cancel) {
+ kfree(control);
+ continue;
+ } else {
list_add(&control->list, &repeat_controls);
+ }
}
control = list_first_entry_or_null(&repeat_controls,
struct damon_call_control, list);
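
The quota-throughput change in damos_set_effective_quota() above replaces a plain multiply-then-divide with mult_frac() so the intermediate product cannot overflow. A hedged sketch of the idea (roughly what the kernel macro does):

/*
 * Illustrative sketch, not part of the patch: split the dividend into
 * quotient and remainder so no intermediate product is as large as x * n.
 */
static inline unsigned long mult_frac_sketch(unsigned long x, unsigned long n,
					     unsigned long d)
{
	unsigned long q = x / d;
	unsigned long r = x % d;

	return q * n + (r * n) / d;	/* equals x * n / d, without computing x * n */
}
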
diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c
index 151a9de5ad8b..b5a5ed16a7a5 100644
--- a/mm/damon/lru_sort.c
+++ b/mm/damon/lru_sort.c
@@ -198,6 +198,11 @@ static int damon_lru_sort_apply_parameters(void)
if (err)
return err;
+ if (!damon_lru_sort_mon_attrs.sample_interval) {
+ err = -EINVAL;
+ goto out;
+ }
+
err = damon_set_attrs(ctx, &damon_lru_sort_mon_attrs);
if (err)
goto out;
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index 3c71b4596676..fb7c982a0018 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -194,6 +194,11 @@ static int damon_reclaim_apply_parameters(void)
if (err)
return err;
+ if (!damon_reclaim_mon_attrs.aggr_interval) {
+ err = -EINVAL;
+ goto out;
+ }
+
err = damon_set_attrs(param_ctx, &damon_reclaim_mon_attrs);
if (err)
goto out;
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 74056bcd6a2c..6536f16006c9 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -2158,8 +2158,8 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme)
{
damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern);
kobject_put(&scheme->access_pattern->kobj);
- kobject_put(&scheme->dests->kobj);
damos_sysfs_dests_rm_dirs(scheme->dests);
+ kobject_put(&scheme->dests->kobj);
damon_sysfs_quotas_rm_dirs(scheme->quotas);
kobject_put(&scheme->quotas->kobj);
kobject_put(&scheme->watermarks->kobj);
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 6d2b0dab50cb..7308dee97b21 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -1260,14 +1260,18 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
{
struct damon_sysfs_kdamond *kdamond = container_of(kobj,
struct damon_sysfs_kdamond, kobj);
- struct damon_ctx *ctx = kdamond->damon_ctx;
- bool running;
+ struct damon_ctx *ctx;
+ bool running = false;
- if (!ctx)
- running = false;
- else
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+
+ ctx = kdamond->damon_ctx;
+ if (ctx)
running = damon_is_running(ctx);
+ mutex_unlock(&damon_sysfs_lock);
+
return sysfs_emit(buf, "%s\n", running ?
damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_ON] :
damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_OFF]);
@@ -1530,14 +1534,10 @@ static int damon_sysfs_repeat_call_fn(void *data)
return 0;
}
-static struct damon_call_control damon_sysfs_repeat_call_control = {
- .fn = damon_sysfs_repeat_call_fn,
- .repeat = true,
-};
-
static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond)
{
struct damon_ctx *ctx;
+ struct damon_call_control *repeat_call_control;
int err;
if (damon_sysfs_kdamond_running(kdamond))
@@ -1550,18 +1550,29 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond)
damon_destroy_ctx(kdamond->damon_ctx);
kdamond->damon_ctx = NULL;
+ repeat_call_control = kmalloc(sizeof(*repeat_call_control),
+ GFP_KERNEL);
+ if (!repeat_call_control)
+ return -ENOMEM;
+
ctx = damon_sysfs_build_ctx(kdamond->contexts->contexts_arr[0]);
- if (IS_ERR(ctx))
+ if (IS_ERR(ctx)) {
+ kfree(repeat_call_control);
return PTR_ERR(ctx);
+ }
err = damon_start(&ctx, 1, false);
if (err) {
+ kfree(repeat_call_control);
damon_destroy_ctx(ctx);
return err;
}
kdamond->damon_ctx = ctx;
- damon_sysfs_repeat_call_control.data = kdamond;
- damon_call(ctx, &damon_sysfs_repeat_call_control);
+ repeat_call_control->fn = damon_sysfs_repeat_call_fn;
+ repeat_call_control->data = kdamond;
+ repeat_call_control->repeat = true;
+ repeat_call_control->dealloc_on_cancel = true;
+ damon_call(ctx, repeat_call_control);
return err;
}
@@ -1581,12 +1592,14 @@ static int damon_sysfs_damon_call(int (*fn)(void *data),
struct damon_sysfs_kdamond *kdamond)
{
struct damon_call_control call_control = {};
+ int err;
if (!kdamond->damon_ctx)
return -EINVAL;
call_control.fn = fn;
call_control.data = kdamond;
- return damon_call(kdamond->damon_ctx, &call_control);
+ err = damon_call(kdamond->damon_ctx, &call_control);
+ return err ? err : call_control.return_code;
}
struct damon_sysfs_schemes_walk_data {
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index d19031f275a3..830107b6dd08 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -990,29 +990,34 @@ static void __init destroy_args(struct pgtable_debug_args *args)
/* Free page table entries */
if (args->start_ptep) {
+ pmd_clear(args->pmdp);
pte_free(args->mm, args->start_ptep);
mm_dec_nr_ptes(args->mm);
}
if (args->start_pmdp) {
+ pud_clear(args->pudp);
pmd_free(args->mm, args->start_pmdp);
mm_dec_nr_pmds(args->mm);
}
if (args->start_pudp) {
+ p4d_clear(args->p4dp);
pud_free(args->mm, args->start_pudp);
mm_dec_nr_puds(args->mm);
}
- if (args->start_p4dp)
+ if (args->start_p4dp) {
+ pgd_clear(args->pgdp);
p4d_free(args->mm, args->start_p4dp);
+ }
/* Free vma and mm struct */
if (args->vma)
vm_area_free(args->vma);
if (args->mm)
- mmdrop(args->mm);
+ mmput(args->mm);
}
static struct page * __init
diff --git a/mm/gup.c b/mm/gup.c
index adffe663594d..0bc4d140fc07 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2287,8 +2287,8 @@ static unsigned long collect_longterm_unpinnable_folios(
struct pages_or_folios *pofs)
{
unsigned long collected = 0;
- bool drain_allow = true;
struct folio *folio;
+ int drained = 0;
long i = 0;
for (folio = pofs_get_folio(pofs, i); folio;
@@ -2307,9 +2307,17 @@ static unsigned long collect_longterm_unpinnable_folios(
continue;
}
- if (!folio_test_lru(folio) && drain_allow) {
+ if (drained == 0 && folio_may_be_lru_cached(folio) &&
+ folio_ref_count(folio) !=
+ folio_expected_ref_count(folio) + 1) {
+ lru_add_drain();
+ drained = 1;
+ }
+ if (drained == 1 && folio_may_be_lru_cached(folio) &&
+ folio_ref_count(folio) !=
+ folio_expected_ref_count(folio) + 1) {
lru_add_drain_all();
- drain_allow = false;
+ drained = 2;
}
if (!folio_isolate_lru(folio))
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 753f99b4c718..6cfe0b43ab8f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5594,18 +5594,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
break;
}
- /*
- * If the pagetables are shared don't copy or take references.
- *
- * dst_pte == src_pte is the common case of src/dest sharing.
- * However, src could have 'unshared' and dst shares with
- * another vma. So page_count of ptep page is checked instead
- * to reliably determine whether pte is shared.
- */
- if (page_count(virt_to_page(dst_pte)) > 1) {
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
+ /* If the pagetables are shared, there is nothing to do */
+ if (ptdesc_pmd_is_shared(virt_to_ptdesc(dst_pte))) {
addr |= last_addr_mask;
continue;
}
+#endif
dst_ptl = huge_pte_lock(h, dst, dst_pte);
src_ptl = huge_pte_lockptr(h, src, src_pte);
@@ -5851,7 +5846,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
spinlock_t *ptl;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
- bool adjust_reservation = false;
+ bool adjust_reservation;
unsigned long last_addr_mask;
bool force_flush = false;
@@ -5944,6 +5939,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
sz);
hugetlb_count_sub(pages_per_huge_page(h), mm);
hugetlb_remove_rmap(folio);
+ spin_unlock(ptl);
/*
* Restore the reservation for anonymous page, otherwise the
@@ -5951,14 +5947,16 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
* If there we are freeing a surplus, do not set the restore
* reservation bit.
*/
+ adjust_reservation = false;
+
+ spin_lock_irq(&hugetlb_lock);
if (!h->surplus_huge_pages && __vma_private_lock(vma) &&
folio_test_anon(folio)) {
folio_set_hugetlb_restore_reserve(folio);
/* Reservation to be adjusted after the spin lock */
adjust_reservation = true;
}
-
- spin_unlock(ptl);
+ spin_unlock_irq(&hugetlb_lock);
/*
* Adjust the reservation for the region that will have the
@@ -7599,7 +7597,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
hugetlb_vma_assert_locked(vma);
if (sz != PMD_SIZE)
return 0;
- if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep)))
+ if (!ptdesc_pmd_is_shared(virt_to_ptdesc(ptep)))
return 0;
pud_clear(pud);
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index ced6b29fcf76..8fce3370c84e 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -13,9 +13,9 @@
#include <linux/mm.h>
#include <linux/pfn.h>
#include <linux/slab.h>
+#include <linux/pgalloc.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
#include "kasan.h"
@@ -191,7 +191,7 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
pud_t *pud;
pmd_t *pmd;
- p4d_populate(&init_mm, p4d,
+ p4d_populate_kernel(addr, p4d,
lm_alias(kasan_early_shadow_pud));
pud = pud_offset(p4d, addr);
pud_populate(&init_mm, pud,
@@ -212,7 +212,7 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
} else {
p = early_alloc(PAGE_SIZE, NUMA_NO_NODE);
pud_init(p);
- p4d_populate(&init_mm, p4d, p);
+ p4d_populate_kernel(addr, p4d, p);
}
}
zero_pud_populate(p4d, addr, next);
@@ -251,10 +251,10 @@ int __ref kasan_populate_early_shadow(const void *shadow_start,
* puds,pmds, so pgd_populate(), pud_populate()
* is noops.
*/
- pgd_populate(&init_mm, pgd,
+ pgd_populate_kernel(addr, pgd,
lm_alias(kasan_early_shadow_p4d));
p4d = p4d_offset(pgd, addr);
- p4d_populate(&init_mm, p4d,
+ p4d_populate_kernel(addr, p4d,
lm_alias(kasan_early_shadow_pud));
pud = pud_offset(p4d, addr);
pud_populate(&init_mm, pud,
@@ -273,7 +273,7 @@ int __ref kasan_populate_early_shadow(const void *shadow_start,
if (!p)
return -ENOMEM;
} else {
- pgd_populate(&init_mm, pgd,
+ pgd_populate_kernel(addr, pgd,
early_alloc(PAGE_SIZE, NUMA_NO_NODE));
}
}
diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c
index 2aa12dfa427a..f4b17984b627 100644
--- a/mm/kasan/kasan_test_c.c
+++ b/mm/kasan/kasan_test_c.c
@@ -47,7 +47,7 @@ static struct {
* Some tests use these global variables to store return values from function
* calls that could otherwise be eliminated by the compiler as dead code.
*/
-static volatile void *kasan_ptr_result;
+static void *volatile kasan_ptr_result;
static volatile int kasan_int_result;
/* Probe for console output: obtains test_status lines of interest. */
@@ -1578,9 +1578,11 @@ static void kasan_strings(struct kunit *test)
ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ OPTIMIZER_HIDE_VAR(ptr);
src = kmalloc(KASAN_GRANULE_SIZE, GFP_KERNEL | __GFP_ZERO);
strscpy(src, "f0cacc1a0000000", KASAN_GRANULE_SIZE);
+ OPTIMIZER_HIDE_VAR(src);
/*
* Make sure that strscpy() does not trigger KASAN if it overreads into
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index d2c70cd2afb1..11d472a5c4e8 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -305,8 +305,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
pte_t pte;
int index;
- if (likely(!pte_none(ptep_get(ptep))))
- return 0;
+ arch_leave_lazy_mmu_mode();
index = PFN_DOWN(addr - data->start);
page = data->pages[index];
@@ -320,6 +319,8 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
}
spin_unlock(&init_mm.page_table_lock);
+ arch_enter_lazy_mmu_mode();
+
return 0;
}
@@ -335,13 +336,13 @@ static void ___free_pages_bulk(struct page **pages, int nr_pages)
}
}
-static int ___alloc_pages_bulk(struct page **pages, int nr_pages)
+static int ___alloc_pages_bulk(struct page **pages, int nr_pages, gfp_t gfp_mask)
{
unsigned long nr_populated, nr_total = nr_pages;
struct page **page_array = pages;
while (nr_pages) {
- nr_populated = alloc_pages_bulk(GFP_KERNEL, nr_pages, pages);
+ nr_populated = alloc_pages_bulk(gfp_mask, nr_pages, pages);
if (!nr_populated) {
___free_pages_bulk(page_array, nr_total - nr_pages);
return -ENOMEM;
@@ -353,25 +354,42 @@ static int ___alloc_pages_bulk(struct page **pages, int nr_pages)
return 0;
}
-static int __kasan_populate_vmalloc(unsigned long start, unsigned long end)
+static int __kasan_populate_vmalloc(unsigned long start, unsigned long end, gfp_t gfp_mask)
{
unsigned long nr_pages, nr_total = PFN_UP(end - start);
struct vmalloc_populate_data data;
+ unsigned int flags;
int ret = 0;
- data.pages = (struct page **)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ data.pages = (struct page **)__get_free_page(gfp_mask | __GFP_ZERO);
if (!data.pages)
return -ENOMEM;
while (nr_total) {
nr_pages = min(nr_total, PAGE_SIZE / sizeof(data.pages[0]));
- ret = ___alloc_pages_bulk(data.pages, nr_pages);
+ ret = ___alloc_pages_bulk(data.pages, nr_pages, gfp_mask);
if (ret)
break;
data.start = start;
+
+ /*
+ * page table allocations ignore the external gfp mask, so enforce it
+ * via the scope API
+ */
+ if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
+ flags = memalloc_nofs_save();
+ else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
+ flags = memalloc_noio_save();
+
ret = apply_to_page_range(&init_mm, start, nr_pages * PAGE_SIZE,
kasan_populate_vmalloc_pte, &data);
+
+ if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
+ memalloc_nofs_restore(flags);
+ else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
+ memalloc_noio_restore(flags);
+
___free_pages_bulk(data.pages, nr_pages);
if (ret)
break;
@@ -385,7 +403,7 @@ static int __kasan_populate_vmalloc(unsigned long start, unsigned long end)
return ret;
}
-int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
+int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask)
{
unsigned long shadow_start, shadow_end;
int ret;
@@ -414,7 +432,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
shadow_start = PAGE_ALIGN_DOWN(shadow_start);
shadow_end = PAGE_ALIGN(shadow_end);
- ret = __kasan_populate_vmalloc(shadow_start, shadow_end);
+ ret = __kasan_populate_vmalloc(shadow_start, shadow_end, gfp_mask);
if (ret)
return ret;
@@ -461,18 +479,23 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
void *unused)
{
- unsigned long page;
+ pte_t pte;
+ int none;
- page = (unsigned long)__va(pte_pfn(ptep_get(ptep)) << PAGE_SHIFT);
+ arch_leave_lazy_mmu_mode();
spin_lock(&init_mm.page_table_lock);
-
- if (likely(!pte_none(ptep_get(ptep)))) {
+ pte = ptep_get(ptep);
+ none = pte_none(pte);
+ if (likely(!none))
pte_clear(&init_mm, addr, ptep);
- free_page(page);
- }
spin_unlock(&init_mm.page_table_lock);
+ if (likely(!none))
+ __free_page(pfn_to_page(pte_pfn(pte)));
+
+ arch_enter_lazy_mmu_mode();
+
return 0;
}
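The kasan/shadow.c changes rely on the memalloc scope API so that page-table allocations done deep inside apply_to_page_range() honour the caller's GFP constraints. A minimal sketch of the general pattern, assuming a GFP_NOFS-style caller; the nested helper is hypothetical and this is not part of the patch:

#include <linux/sched/mm.h>

extern int some_helper_that_allocates(void);	/* hypothetical nested allocator */

static int do_work_nofs(gfp_t gfp_mask)
{
	bool nofs = (gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO;
	unsigned int flags = 0;
	int ret;

	/*
	 * Nested (e.g. page-table) allocations ignore gfp_mask, so pin
	 * the NOFS constraint for the whole scope instead.
	 */
	if (nofs)
		flags = memalloc_nofs_save();

	ret = some_helper_that_allocates();

	if (nofs)
		memalloc_nofs_restore(flags);

	return ret;
}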
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 374a6a5193a7..b486c1d19b2d 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1172,11 +1172,11 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
if (result != SCAN_SUCCEED)
goto out_up_write;
/* check if the pmd is still valid */
+ vma_start_write(vma);
result = check_pmd_still_valid(mm, address, pmd);
if (result != SCAN_SUCCEED)
goto out_up_write;
- vma_start_write(vma);
anon_vma_lock_write(vma->anon_vma);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
@@ -1417,8 +1417,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
*/
if (cc->is_khugepaged &&
(pte_young(pteval) || folio_test_young(folio) ||
- folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
- address)))
+ folio_test_referenced(folio) ||
+ mmu_notifier_test_young(vma->vm_mm, _address)))
referenced++;
}
if (!writable) {
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 8d588e685311..1ac56ceb29b6 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -437,9 +437,15 @@ static struct kmemleak_object *__lookup_object(unsigned long ptr, int alias,
else if (untagged_objp == untagged_ptr || alias)
return object;
else {
+ /*
+ * Printk deferring due to the kmemleak_lock held.
+ * This is done to avoid deadlock.
+ */
+ printk_deferred_enter();
kmemleak_warn("Found object by alias at 0x%08lx\n",
ptr);
dump_object_info(object);
+ printk_deferred_exit();
break;
}
}
@@ -470,6 +476,7 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp)
{
unsigned long flags;
struct kmemleak_object *object;
+ bool warn = false;
/* try the slab allocator first */
if (object_cache) {
@@ -488,8 +495,10 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp)
else if (mem_pool_free_count)
object = &mem_pool[--mem_pool_free_count];
else
- pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n");
+ warn = true;
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
+ if (warn)
+ pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n");
return object;
}
@@ -733,6 +742,11 @@ static int __link_object(struct kmemleak_object *object, unsigned long ptr,
else if (untagged_objp + parent->size <= untagged_ptr)
link = &parent->rb_node.rb_right;
else {
+ /*
+ * Printk deferring due to the kmemleak_lock held.
+ * This is done to avoid deadlock.
+ */
+ printk_deferred_enter();
kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n",
ptr);
/*
@@ -740,6 +754,7 @@ static int __link_object(struct kmemleak_object *object, unsigned long ptr,
* be freed while the kmemleak_lock is held.
*/
dump_object_info(parent);
+ printk_deferred_exit();
return -EEXIST;
}
}
@@ -853,13 +868,8 @@ static void delete_object_part(unsigned long ptr, size_t size,
raw_spin_lock_irqsave(&kmemleak_lock, flags);
object = __find_and_remove_object(ptr, 1, objflags);
- if (!object) {
-#ifdef DEBUG
- kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n",
- ptr, size);
-#endif
+ if (!object)
goto unlock;
- }
/*
* Create one or two objects that may result from the memory block
@@ -879,8 +889,14 @@ static void delete_object_part(unsigned long ptr, size_t size,
unlock:
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
- if (object)
+ if (object) {
__delete_object(object);
+ } else {
+#ifdef DEBUG
+ kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n",
+ ptr, size);
+#endif
+ }
out:
if (object_l)
@@ -2181,6 +2197,7 @@ static const struct file_operations kmemleak_fops = {
static void __kmemleak_do_cleanup(void)
{
struct kmemleak_object *object, *tmp;
+ unsigned int cnt = 0;
/*
* Kmemleak has already been disabled, no need for RCU list traversal
@@ -2189,6 +2206,10 @@ static void __kmemleak_do_cleanup(void)
list_for_each_entry_safe(object, tmp, &object_list, object_list) {
__remove_object(object);
__delete_object(object);
+
+ /* Call cond_resched() once per 64 iterations to avoid soft lockup */
+ if (!(++cnt & 0x3f))
+ cond_resched();
}
}
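Both kmemleak hunks wrap their diagnostics in printk_deferred_enter()/printk_deferred_exit() because printing can itself allocate or trace and re-enter kmemleak while kmemleak_lock is held. A generic sketch of the pattern with a made-up lock and message, not specific to kmemleak:

#include <linux/printk.h>
#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(my_lock);	/* hypothetical lock */

static void report_locked_state(void)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&my_lock, flags);
	/* Defer console output so printk cannot recurse into this lock. */
	printk_deferred_enter();
	pr_warn("state dump while my_lock is held\n");
	printk_deferred_exit();
	raw_spin_unlock_irqrestore(&my_lock, flags);
}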
diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c
index 1ea711786c52..8bca7fece47f 100644
--- a/mm/kmsan/core.c
+++ b/mm/kmsan/core.c
@@ -195,7 +195,8 @@ void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b,
u32 origin, bool checked)
{
u64 address = (u64)addr;
- u32 *shadow_start, *origin_start;
+ void *shadow_start;
+ u32 *aligned_shadow, *origin_start;
size_t pad = 0;
KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(addr, size));
@@ -214,9 +215,12 @@ void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b,
}
__memset(shadow_start, b, size);
- if (!IS_ALIGNED(address, KMSAN_ORIGIN_SIZE)) {
+ if (IS_ALIGNED(address, KMSAN_ORIGIN_SIZE)) {
+ aligned_shadow = shadow_start;
+ } else {
pad = address % KMSAN_ORIGIN_SIZE;
address -= pad;
+ aligned_shadow = shadow_start - pad;
size += pad;
}
size = ALIGN(size, KMSAN_ORIGIN_SIZE);
@@ -230,7 +234,7 @@ void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b,
* corresponding shadow slot is zero.
*/
for (int i = 0; i < size / KMSAN_ORIGIN_SIZE; i++) {
- if (origin || !shadow_start[i])
+ if (origin || !aligned_shadow[i])
origin_start[i] = origin;
}
}
diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c
index c6c5b2bbede0..902ec48b1e3e 100644
--- a/mm/kmsan/kmsan_test.c
+++ b/mm/kmsan/kmsan_test.c
@@ -556,6 +556,21 @@ DEFINE_TEST_MEMSETXX(16)
DEFINE_TEST_MEMSETXX(32)
DEFINE_TEST_MEMSETXX(64)
+/* Test case: ensure that KMSAN does not access shadow memory out of bounds. */
+static void test_memset_on_guarded_buffer(struct kunit *test)
+{
+ void *buf = vmalloc(PAGE_SIZE);
+
+ kunit_info(test,
+ "memset() on ends of guarded buffer should not crash\n");
+
+ for (size_t size = 0; size <= 128; size++) {
+ memset(buf, 0xff, size);
+ memset(buf + PAGE_SIZE - size, 0xff, size);
+ }
+ vfree(buf);
+}
+
static noinline void fibonacci(int *array, int size, int start)
{
if (start < 2 || (start == size))
@@ -677,6 +692,7 @@ static struct kunit_case kmsan_test_cases[] = {
KUNIT_CASE(test_memset16),
KUNIT_CASE(test_memset32),
KUNIT_CASE(test_memset64),
+ KUNIT_CASE(test_memset_on_guarded_buffer),
KUNIT_CASE(test_long_origin_chain),
KUNIT_CASE(test_stackdepot_roundtrip),
KUNIT_CASE(test_unpoison_memory),
diff --git a/mm/memblock.c b/mm/memblock.c
index 154f1d73b61f..117d963e677c 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -780,9 +780,9 @@ bool __init_memblock memblock_validate_numa_coverage(unsigned long threshold_byt
}
if ((nr_pages << PAGE_SHIFT) > threshold_bytes) {
- mem_size_mb = memblock_phys_mem_size() >> 20;
+ mem_size_mb = memblock_phys_mem_size() / SZ_1M;
pr_err("NUMA: no nodes coverage for %luMB of %luMB RAM\n",
- (nr_pages << PAGE_SHIFT) >> 20, mem_size_mb);
+ (nr_pages << PAGE_SHIFT) / SZ_1M, mem_size_mb);
return false;
}
@@ -1091,13 +1091,20 @@ int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size)
/**
* memblock_reserved_mark_noinit - Mark a reserved memory region with flag
- * MEMBLOCK_RSRV_NOINIT which results in the struct pages not being initialized
- * for this region.
+ * MEMBLOCK_RSRV_NOINIT
+ *
* @base: the base phys addr of the region
* @size: the size of the region
*
- * struct pages will not be initialized for reserved memory regions marked with
- * %MEMBLOCK_RSRV_NOINIT.
+ * The struct pages for the reserved regions marked %MEMBLOCK_RSRV_NOINIT will
+ * not be fully initialized to allow the caller optimize their initialization.
+ *
+ * When %CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled, setting this flag
+ * completely bypasses the initialization of struct pages for such region.
+ *
+ * When %CONFIG_DEFERRED_STRUCT_PAGE_INIT is disabled, struct pages in this
+ * region will be initialized with default values but won't be marked as
+ * reserved.
*
* Return: 0 on success, -errno on failure.
*/
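The reworded kernel-doc above distinguishes the deferred and non-deferred struct-page init cases. A hedged usage sketch of the flag from an early-boot caller (the base address is a placeholder, not taken from this patch):

#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/sizes.h>

static int __init reserve_selfmanaged_region(void)
{
	phys_addr_t base = 0x100000000ULL;	/* placeholder address */
	phys_addr_t size = SZ_1G;

	if (memblock_reserve(base, size))
		return -ENOMEM;
	/* The caller will initialize the struct pages of this range itself. */
	return memblock_reserved_mark_noinit(base, size);
}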
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index e2e685b971bb..df6ee59527dd 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -853,9 +853,17 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
#define hwpoison_hugetlb_range NULL
#endif
+static int hwpoison_test_walk(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ /* We also want to consider pages mapped into VM_PFNMAP. */
+ return 0;
+}
+
static const struct mm_walk_ops hwpoison_walk_ops = {
.pmd_entry = hwpoison_pte_range,
.hugetlb_entry = hwpoison_hugetlb_range,
+ .test_walk = hwpoison_test_walk,
.walk_lock = PGWALK_RDLOCK,
};
@@ -948,7 +956,7 @@ static const char * const action_page_types[] = {
[MF_MSG_BUDDY] = "free buddy page",
[MF_MSG_DAX] = "dax page",
[MF_MSG_UNSPLIT_THP] = "unsplit thp",
- [MF_MSG_ALREADY_POISONED] = "already poisoned",
+ [MF_MSG_ALREADY_POISONED] = "already poisoned page",
[MF_MSG_UNKNOWN] = "unknown page",
};
@@ -1341,9 +1349,10 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type,
{
trace_memory_failure_event(pfn, type, result);
- num_poisoned_pages_inc(pfn);
-
- update_per_node_mf_stats(pfn, result);
+ if (type != MF_MSG_ALREADY_POISONED) {
+ num_poisoned_pages_inc(pfn);
+ update_per_node_mf_stats(pfn, result);
+ }
pr_err("%#lx: recovery action for %s: %s\n",
pfn, action_page_types[type], action_name[result]);
@@ -2086,12 +2095,11 @@ retry:
*hugetlb = 0;
return 0;
} else if (res == -EHWPOISON) {
- pr_err("%#lx: already hardware poisoned\n", pfn);
if (flags & MF_ACTION_REQUIRED) {
folio = page_folio(p);
res = kill_accessing_process(current, folio_pfn(folio), flags);
- action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
}
+ action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
return res;
} else if (res == -EBUSY) {
if (!(flags & MF_NO_RETRY)) {
@@ -2277,7 +2285,6 @@ try_again:
goto unlock_mutex;
if (TestSetPageHWPoison(p)) {
- pr_err("%#lx: already hardware poisoned\n", pfn);
res = -EHWPOISON;
if (flags & MF_ACTION_REQUIRED)
res = kill_accessing_process(current, pfn, flags);
@@ -2561,10 +2568,9 @@ int unpoison_memory(unsigned long pfn)
static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
- if (!pfn_valid(pfn))
- return -ENXIO;
-
- p = pfn_to_page(pfn);
+ p = pfn_to_online_page(pfn);
+ if (!p)
+ return -EIO;
folio = page_folio(p);
mutex_lock(&mf_mutex);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 1f15af712bc3..74318c787715 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1815,8 +1815,14 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1;
if (folio_contain_hwpoisoned_page(folio)) {
- if (WARN_ON(folio_test_lru(folio)))
- folio_isolate_lru(folio);
+ /*
+ * unmap_poisoned_folio() cannot handle large folios
+ * in all cases yet.
+ */
+ if (folio_test_large(folio) && !folio_test_hugetlb(folio))
+ goto put_folio;
+ if (folio_test_lru(folio) && !folio_isolate_lru(folio))
+ goto put_folio;
if (folio_mapped(folio)) {
folio_lock(folio);
unmap_poisoned_folio(folio, pfn, false);
diff --git a/mm/migrate.c b/mm/migrate.c
index 425401b2d4e1..9e5ef39ce73a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -43,8 +43,6 @@
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>
#include <linux/pagewalk.h>
-#include <linux/balloon_compaction.h>
-#include <linux/zsmalloc.h>
#include <asm/tlbflush.h>
@@ -53,6 +51,33 @@
#include "internal.h"
#include "swap.h"
+static const struct movable_operations *offline_movable_ops;
+static const struct movable_operations *zsmalloc_movable_ops;
+
+int set_movable_ops(const struct movable_operations *ops, enum pagetype type)
+{
+ /*
+ * We only allow for selected types and don't handle concurrent
+ * registration attempts yet.
+ */
+ switch (type) {
+ case PGTY_offline:
+ if (offline_movable_ops && ops)
+ return -EBUSY;
+ offline_movable_ops = ops;
+ break;
+ case PGTY_zsmalloc:
+ if (zsmalloc_movable_ops && ops)
+ return -EBUSY;
+ zsmalloc_movable_ops = ops;
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(set_movable_ops);
+
static const struct movable_operations *page_movable_ops(struct page *page)
{
VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page);
@@ -62,15 +87,12 @@ static const struct movable_operations *page_movable_ops(struct page *page)
* it as movable, the page type must be sticky until the page gets freed
* back to the buddy.
*/
-#ifdef CONFIG_BALLOON_COMPACTION
if (PageOffline(page))
/* Only balloon compaction sets PageOffline pages movable. */
- return &balloon_mops;
-#endif /* CONFIG_BALLOON_COMPACTION */
-#if defined(CONFIG_ZSMALLOC) && defined(CONFIG_COMPACTION)
+ return offline_movable_ops;
if (PageZsmalloc(page))
- return &zsmalloc_mops;
-#endif /* defined(CONFIG_ZSMALLOC) && defined(CONFIG_COMPACTION) */
+ return zsmalloc_movable_ops;
+
return NULL;
}
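set_movable_ops() replaces the compile-time #ifdef dispatch with runtime registration. The zsmalloc hunk later in this diff registers its ops at init time; a driver-style sketch of the same idea for PageOffline pages (names and callbacks are hypothetical, contents omitted):

static const struct movable_operations my_mops = {
	/* .isolate_page / .migrate_page / .putback_page callbacks here */
};

static int __init my_driver_init(void)
{
	/* Claim migration handling for PageOffline pages. */
	return set_movable_ops(&my_mops, PGTY_offline);
}

static void __exit my_driver_exit(void)
{
	/* Unregister by passing NULL. */
	set_movable_ops(NULL, PGTY_offline);
}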
diff --git a/mm/mlock.c b/mm/mlock.c
index a1d93ad33c6d..bb0776f5ef7c 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -255,7 +255,7 @@ void mlock_folio(struct folio *folio)
folio_get(folio);
if (!folio_batch_add(fbatch, mlock_lru(folio)) ||
- folio_test_large(folio) || lru_cache_disabled())
+ !folio_may_be_lru_cached(folio) || lru_cache_disabled())
mlock_folio_batch(fbatch);
local_unlock(&mlock_fbatch.lock);
}
@@ -278,7 +278,7 @@ void mlock_new_folio(struct folio *folio)
folio_get(folio);
if (!folio_batch_add(fbatch, mlock_new(folio)) ||
- folio_test_large(folio) || lru_cache_disabled())
+ !folio_may_be_lru_cached(folio) || lru_cache_disabled())
mlock_folio_batch(fbatch);
local_unlock(&mlock_fbatch.lock);
}
@@ -299,7 +299,7 @@ void munlock_folio(struct folio *folio)
*/
folio_get(folio);
if (!folio_batch_add(fbatch, folio) ||
- folio_test_large(folio) || lru_cache_disabled())
+ !folio_may_be_lru_cached(folio) || lru_cache_disabled())
mlock_folio_batch(fbatch);
local_unlock(&mlock_fbatch.lock);
}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 78bded7acf79..113b48985834 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -120,9 +120,8 @@ static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
static bool prot_numa_skip(struct vm_area_struct *vma, unsigned long addr,
pte_t oldpte, pte_t *pte, int target_node,
- struct folio **foliop)
+ struct folio *folio)
{
- struct folio *folio = NULL;
bool ret = true;
bool toptier;
int nid;
@@ -131,7 +130,6 @@ static bool prot_numa_skip(struct vm_area_struct *vma, unsigned long addr,
if (pte_protnone(oldpte))
goto skip;
- folio = vm_normal_folio(vma, addr, oldpte);
if (!folio)
goto skip;
@@ -173,7 +171,6 @@ static bool prot_numa_skip(struct vm_area_struct *vma, unsigned long addr,
folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));
skip:
- *foliop = folio;
return ret;
}
@@ -231,10 +228,9 @@ static int page_anon_exclusive_sub_batch(int start_idx, int max_len,
* retrieve sub-batches.
*/
static void commit_anon_folio_batch(struct vm_area_struct *vma,
- struct folio *folio, unsigned long addr, pte_t *ptep,
+ struct folio *folio, struct page *first_page, unsigned long addr, pte_t *ptep,
pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb)
{
- struct page *first_page = folio_page(folio, 0);
bool expected_anon_exclusive;
int sub_batch_idx = 0;
int len;
@@ -251,7 +247,7 @@ static void commit_anon_folio_batch(struct vm_area_struct *vma,
}
static void set_write_prot_commit_flush_ptes(struct vm_area_struct *vma,
- struct folio *folio, unsigned long addr, pte_t *ptep,
+ struct folio *folio, struct page *page, unsigned long addr, pte_t *ptep,
pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb)
{
bool set_write;
@@ -270,7 +266,7 @@ static void set_write_prot_commit_flush_ptes(struct vm_area_struct *vma,
/* idx = */ 0, set_write, tlb);
return;
}
- commit_anon_folio_batch(vma, folio, addr, ptep, oldpte, ptent, nr_ptes, tlb);
+ commit_anon_folio_batch(vma, folio, page, addr, ptep, oldpte, ptent, nr_ptes, tlb);
}
static long change_pte_range(struct mmu_gather *tlb,
@@ -305,15 +301,19 @@ static long change_pte_range(struct mmu_gather *tlb,
const fpb_t flags = FPB_RESPECT_SOFT_DIRTY | FPB_RESPECT_WRITE;
int max_nr_ptes = (end - addr) >> PAGE_SHIFT;
struct folio *folio = NULL;
+ struct page *page;
pte_t ptent;
+ page = vm_normal_page(vma, addr, oldpte);
+ if (page)
+ folio = page_folio(page);
/*
* Avoid trapping faults against the zero or KSM
* pages. See similar comment in change_huge_pmd.
*/
if (prot_numa) {
int ret = prot_numa_skip(vma, addr, oldpte, pte,
- target_node, &folio);
+ target_node, folio);
if (ret) {
/* determine batch to skip */
@@ -323,9 +323,6 @@ static long change_pte_range(struct mmu_gather *tlb,
}
}
- if (!folio)
- folio = vm_normal_folio(vma, addr, oldpte);
-
nr_ptes = mprotect_folio_pte_batch(folio, pte, oldpte, max_nr_ptes, flags);
oldpte = modify_prot_start_ptes(vma, addr, pte, nr_ptes);
@@ -351,7 +348,7 @@ static long change_pte_range(struct mmu_gather *tlb,
*/
if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
!pte_write(ptent))
- set_write_prot_commit_flush_ptes(vma, folio,
+ set_write_prot_commit_flush_ptes(vma, folio, page,
addr, pte, oldpte, ptent, nr_ptes, tlb);
else
prot_commit_flush_ptes(vma, addr, pte, oldpte, ptent,
diff --git a/mm/mremap.c b/mm/mremap.c
index 677a4d744df9..35de0a7b910e 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -179,6 +179,10 @@ static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr
if (max_nr == 1)
return 1;
+ /* Avoid expensive folio lookup if we stand no chance of benefit. */
+ if (pte_batch_hint(ptep, pte) == 1)
+ return 1;
+
folio = vm_normal_folio(vma, addr, pte);
if (!folio || !folio_test_large(folio))
return 1;
@@ -319,6 +323,25 @@ static inline bool arch_supports_page_table_move(void)
}
#endif
+static inline bool uffd_supports_page_table_move(struct pagetable_move_control *pmc)
+{
+ /*
+ * If we are moving a VMA that has uffd-wp registered but with
+ * remap events disabled (new VMA will not be registered with uffd), we
+ * need to ensure that the uffd-wp state is cleared from all pgtables.
+ * This means recursing into lower page tables in move_page_tables().
+ *
+ * We might get called with VMAs reversed when recovering from a
+ * failed page table move. In that case, the
+ * "old"-but-actually-"originally new" VMA during recovery will not have
+ * a uffd context. Recursing into lower page tables during the original
+ * move but not during the recovery move will cause trouble, because we
+ * run into already-existing page tables. So check both VMAs.
+ */
+ return !vma_has_uffd_without_event_remap(pmc->old) &&
+ !vma_has_uffd_without_event_remap(pmc->new);
+}
+
#ifdef CONFIG_HAVE_MOVE_PMD
static bool move_normal_pmd(struct pagetable_move_control *pmc,
pmd_t *old_pmd, pmd_t *new_pmd)
@@ -331,6 +354,8 @@ static bool move_normal_pmd(struct pagetable_move_control *pmc,
if (!arch_supports_page_table_move())
return false;
+ if (!uffd_supports_page_table_move(pmc))
+ return false;
/*
* The destination pmd shouldn't be established, free_pgtables()
* should have released it.
@@ -357,15 +382,6 @@ static bool move_normal_pmd(struct pagetable_move_control *pmc,
if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
return false;
- /* If this pmd belongs to a uffd vma with remap events disabled, we need
- * to ensure that the uffd-wp state is cleared from all pgtables. This
- * means recursing into lower page tables in move_page_tables(), and we
- * can reuse the existing code if we simply treat the entry as "not
- * moved".
- */
- if (vma_has_uffd_without_event_remap(vma))
- return false;
-
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
@@ -414,6 +430,8 @@ static bool move_normal_pud(struct pagetable_move_control *pmc,
if (!arch_supports_page_table_move())
return false;
+ if (!uffd_supports_page_table_move(pmc))
+ return false;
/*
* The destination pud shouldn't be established, free_pgtables()
* should have released it.
@@ -421,15 +439,6 @@ static bool move_normal_pud(struct pagetable_move_control *pmc,
if (WARN_ON_ONCE(!pud_none(*new_pud)))
return false;
- /* If this pud belongs to a uffd vma with remap events disabled, we need
- * to ensure that the uffd-wp state is cleared from all pgtables. This
- * means recursing into lower page tables in move_page_tables(), and we
- * can reuse the existing code if we simply treat the entry as "not
- * moved".
- */
- if (vma_has_uffd_without_event_remap(vma))
- return false;
-
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
@@ -1616,7 +1625,7 @@ static void notify_uffd(struct vma_remap_struct *vrm, bool failed)
static bool vma_multi_allowed(struct vm_area_struct *vma)
{
- struct file *file;
+ struct file *file = vma->vm_file;
/*
* We can't support moving multiple uffd VMAs as notify requires
@@ -1629,15 +1638,17 @@ static bool vma_multi_allowed(struct vm_area_struct *vma)
* Custom get unmapped area might result in MREMAP_FIXED not
* being obeyed.
*/
- file = vma->vm_file;
- if (file && !vma_is_shmem(vma) && !is_vm_hugetlb_page(vma)) {
- const struct file_operations *fop = file->f_op;
-
- if (fop->get_unmapped_area)
- return false;
- }
+ if (!file || !file->f_op->get_unmapped_area)
+ return true;
+ /* Known good. */
+ if (vma_is_shmem(vma))
+ return true;
+ if (is_vm_hugetlb_page(vma))
+ return true;
+ if (file->f_op->get_unmapped_area == thp_get_unmapped_area)
+ return true;
- return true;
+ return false;
}
static int check_prep_vma(struct vma_remap_struct *vrm)
@@ -1763,15 +1774,18 @@ static unsigned long check_mremap_params(struct vma_remap_struct *vrm)
if (!vrm->new_len)
return -EINVAL;
- /* Is the new length or address silly? */
- if (vrm->new_len > TASK_SIZE ||
- vrm->new_addr > TASK_SIZE - vrm->new_len)
+ /* Is the new length silly? */
+ if (vrm->new_len > TASK_SIZE)
return -EINVAL;
/* Remainder of checks are for cases with specific new_addr. */
if (!vrm_implies_new_addr(vrm))
return 0;
+ /* Is the new address silly? */
+ if (vrm->new_addr > TASK_SIZE - vrm->new_len)
+ return -EINVAL;
+
/* The new address must be page-aligned. */
if (offset_in_page(vrm->new_addr))
return -EINVAL;
@@ -1814,10 +1828,11 @@ static unsigned long remap_move(struct vma_remap_struct *vrm)
unsigned long start = vrm->addr;
unsigned long end = vrm->addr + vrm->old_len;
unsigned long new_addr = vrm->new_addr;
- bool allowed = true, seen_vma = false;
unsigned long target_addr = new_addr;
unsigned long res = -EFAULT;
unsigned long last_end;
+ bool seen_vma = false;
+
VMA_ITERATOR(vmi, current->mm, start);
/*
@@ -1830,9 +1845,7 @@ static unsigned long remap_move(struct vma_remap_struct *vrm)
unsigned long addr = max(vma->vm_start, start);
unsigned long len = min(end, vma->vm_end) - addr;
unsigned long offset, res_vma;
-
- if (!allowed)
- return -EFAULT;
+ bool multi_allowed;
/* No gap permitted at the start of the range. */
if (!seen_vma && start < vma->vm_start)
@@ -1861,9 +1874,15 @@ static unsigned long remap_move(struct vma_remap_struct *vrm)
vrm->new_addr = target_addr + offset;
vrm->old_len = vrm->new_len = len;
- allowed = vma_multi_allowed(vma);
- if (seen_vma && !allowed)
- return -EFAULT;
+ multi_allowed = vma_multi_allowed(vma);
+ if (!multi_allowed) {
+ /* This is not the first VMA, abort immediately. */
+ if (seen_vma)
+ return -EFAULT;
+ /* This is the first, but there are more, abort. */
+ if (vma->vm_end < end)
+ return -EFAULT;
+ }
res_vma = check_prep_vma(vrm);
if (!res_vma)
@@ -1872,7 +1891,7 @@ static unsigned long remap_move(struct vma_remap_struct *vrm)
return res_vma;
if (!seen_vma) {
- VM_WARN_ON_ONCE(allowed && res_vma != new_addr);
+ VM_WARN_ON_ONCE(multi_allowed && res_vma != new_addr);
res = res_vma;
}
diff --git a/mm/numa_emulation.c b/mm/numa_emulation.c
index 9d55679d99ce..703c8fa05048 100644
--- a/mm/numa_emulation.c
+++ b/mm/numa_emulation.c
@@ -73,7 +73,7 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei,
}
printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
- nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
+ nid, eb->start, eb->end - 1, (eb->end - eb->start) / SZ_1M);
return 0;
}
@@ -264,7 +264,7 @@ static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei,
min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE);
if (size < min_size) {
pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
- size >> 20, min_size >> 20);
+ size / SZ_1M, min_size / SZ_1M);
size = min_size;
}
size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE);
diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c
index 541a99c4071a..5b009a9cd8b4 100644
--- a/mm/numa_memblks.c
+++ b/mm/numa_memblks.c
@@ -76,7 +76,7 @@ static int __init numa_alloc_distance(void)
for (j = 0; j < cnt; j++)
numa_distance[i * cnt + j] = i == j ?
LOCAL_DISTANCE : REMOTE_DISTANCE;
- printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
+ pr_debug("NUMA: Initialized distance table, cnt=%d\n", cnt);
return 0;
}
@@ -427,9 +427,9 @@ static int __init numa_register_meminfo(struct numa_meminfo *mi)
unsigned long pfn_align = node_map_pfn_alignment();
if (pfn_align && pfn_align < PAGES_PER_SECTION) {
- unsigned long node_align_mb = PFN_PHYS(pfn_align) >> 20;
+ unsigned long node_align_mb = PFN_PHYS(pfn_align) / SZ_1M;
- unsigned long sect_align_mb = PFN_PHYS(PAGES_PER_SECTION) >> 20;
+ unsigned long sect_align_mb = PFN_PHYS(PAGES_PER_SECTION) / SZ_1M;
pr_warn("Node alignment %luMB < min %luMB, rejecting NUMA config\n",
node_align_mb, sect_align_mb);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 648038247a8d..936689d8bcac 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -606,10 +606,32 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
int walk_kernel_page_table_range(unsigned long start, unsigned long end,
const struct mm_walk_ops *ops, pgd_t *pgd, void *private)
{
- struct mm_struct *mm = &init_mm;
+ /*
+ * Kernel intermediate page tables are usually not freed, so the mmap
+ * read lock is sufficient. But there are some exceptions.
+ * E.g. memory hot-remove, in which case the mmap lock is insufficient
+ * to prevent the intermediate kernel page tables belonging to the
+ * specified address range from being freed. The caller should take
+ * other actions to prevent this race.
+ */
+ mmap_assert_locked(&init_mm);
+
+ return walk_kernel_page_table_range_lockless(start, end, ops, pgd,
+ private);
+}
+
+/*
+ * Use this function to walk the kernel page tables locklessly. It should be
+ * guaranteed that the caller has exclusive access over the range they are
+ * operating on - that there should be no concurrent access, for example,
+ * changing permissions for vmalloc objects.
+ */
+int walk_kernel_page_table_range_lockless(unsigned long start, unsigned long end,
+ const struct mm_walk_ops *ops, pgd_t *pgd, void *private)
+{
struct mm_walk walk = {
.ops = ops,
- .mm = mm,
+ .mm = &init_mm,
.pgd = pgd,
.private = private,
.no_vma = true
@@ -620,16 +642,6 @@ int walk_kernel_page_table_range(unsigned long start, unsigned long end,
if (!check_ops_valid(ops))
return -EINVAL;
- /*
- * Kernel intermediate page tables are usually not freed, so the mmap
- * read lock is sufficient. But there are some exceptions.
- * E.g. memory hot-remove. In which case, the mmap lock is insufficient
- * to prevent the intermediate kernel pages tables belonging to the
- * specified address range from being freed. The caller should take
- * other actions to prevent this race.
- */
- mmap_assert_locked(mm);
-
return walk_pgd_range(start, end, &walk);
}
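walk_kernel_page_table_range_lockless() drops the mmap_assert_locked(&init_mm) check, so the caller must guarantee exclusive access to the range, for example a kernel mapping it has just created. A minimal sketch of a PTE-counting walk over such a range; only .pte_entry is wired up and the rest of the ops defaults are assumed to be acceptable:

static int count_pte(pte_t *ptep, unsigned long addr, unsigned long next,
		     struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (!pte_none(ptep_get(ptep)))
		(*count)++;
	return 0;
}

static unsigned long count_mapped_ptes(unsigned long start, unsigned long end)
{
	const struct mm_walk_ops ops = { .pte_entry = count_pte };
	unsigned long count = 0;

	/* Caller guarantees nothing else touches [start, end). */
	walk_kernel_page_table_range_lockless(start, end, &ops, NULL, &count);
	return count;
}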
diff --git a/mm/percpu.c b/mm/percpu.c
index d9cbaee92b60..81462ce5866e 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1734,7 +1734,7 @@ void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved,
bool is_atomic;
bool do_warn;
struct obj_cgroup *objcg = NULL;
- static int warn_limit = 10;
+ static atomic_t warn_limit = ATOMIC_INIT(10);
struct pcpu_chunk *chunk, *next;
const char *err;
int slot, off, cpu, ret;
@@ -1904,13 +1904,17 @@ fail_unlock:
fail:
trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
- if (do_warn && warn_limit) {
- pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
- size, align, is_atomic, err);
- if (!is_atomic)
- dump_stack();
- if (!--warn_limit)
- pr_info("limit reached, disable warning\n");
+ if (do_warn) {
+ int remaining = atomic_dec_if_positive(&warn_limit);
+
+ if (remaining >= 0) {
+ pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
+ size, align, is_atomic, err);
+ if (!is_atomic)
+ dump_stack();
+ if (remaining == 0)
+ pr_info("limit reached, disable warning\n");
+ }
}
if (is_atomic) {
@@ -3108,7 +3112,7 @@ out_free:
#endif /* BUILD_EMBED_FIRST_CHUNK */
#ifdef BUILD_PAGE_FIRST_CHUNK
-#include <asm/pgalloc.h>
+#include <linux/pgalloc.h>
#ifndef P4D_TABLE_SIZE
#define P4D_TABLE_SIZE PAGE_SIZE
@@ -3134,13 +3138,13 @@ void __init __weak pcpu_populate_pte(unsigned long addr)
if (pgd_none(*pgd)) {
p4d = memblock_alloc_or_panic(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
- pgd_populate(&init_mm, pgd, p4d);
+ pgd_populate_kernel(addr, pgd, p4d);
}
p4d = p4d_offset(pgd, addr);
if (p4d_none(*p4d)) {
pud = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
- p4d_populate(&init_mm, p4d, pud);
+ p4d_populate_kernel(addr, p4d, pud);
}
pud = pud_offset(p4d, addr);
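The percpu hunk converts a racy static counter into an atomic budget driven by atomic_dec_if_positive(), so at most ten warnings are ever printed even when failures race on several CPUs. The same pattern in isolation (threshold and message are arbitrary):

#include <linux/atomic.h>
#include <linux/printk.h>

static void warn_limited(const char *what)
{
	static atomic_t budget = ATOMIC_INIT(10);
	int remaining = atomic_dec_if_positive(&budget);

	if (remaining < 0)
		return;		/* budget exhausted, stay silent */

	pr_warn("%s failed\n", what);
	if (remaining == 0)
		pr_info("warning limit reached, disabling further warnings\n");
}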
diff --git a/mm/shmem.c b/mm/shmem.c
index e2c76a30802b..932727247c64 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -5341,7 +5341,7 @@ static const struct super_operations shmem_ops = {
.get_dquots = shmem_get_dquots,
#endif
.evict_inode = shmem_evict_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.put_super = shmem_put_super,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
.nr_cached_objects = shmem_unused_huge_count,
diff --git a/mm/slub.c b/mm/slub.c
index 30003763d224..d257141896c9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -962,19 +962,19 @@ static struct track *get_track(struct kmem_cache *s, void *object,
}
#ifdef CONFIG_STACKDEPOT
-static noinline depot_stack_handle_t set_track_prepare(void)
+static noinline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
{
depot_stack_handle_t handle;
unsigned long entries[TRACK_ADDRS_COUNT];
unsigned int nr_entries;
nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
- handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT);
+ handle = stack_depot_save(entries, nr_entries, gfp_flags);
return handle;
}
#else
-static inline depot_stack_handle_t set_track_prepare(void)
+static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
{
return 0;
}
@@ -996,9 +996,9 @@ static void set_track_update(struct kmem_cache *s, void *object,
}
static __always_inline void set_track(struct kmem_cache *s, void *object,
- enum track_item alloc, unsigned long addr)
+ enum track_item alloc, unsigned long addr, gfp_t gfp_flags)
{
- depot_stack_handle_t handle = set_track_prepare();
+ depot_stack_handle_t handle = set_track_prepare(gfp_flags);
set_track_update(s, object, alloc, addr, handle);
}
@@ -1140,7 +1140,12 @@ static void object_err(struct kmem_cache *s, struct slab *slab,
return;
slab_bug(s, reason);
- print_trailer(s, slab, object);
+ if (!object || !check_valid_pointer(s, slab, object)) {
+ print_slab_info(slab);
+ pr_err("Invalid pointer 0x%p\n", object);
+ } else {
+ print_trailer(s, slab, object);
+ }
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
WARN_ON(1);
@@ -1921,9 +1926,9 @@ static inline bool free_debug_processing(struct kmem_cache *s,
static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {}
static inline int check_object(struct kmem_cache *s, struct slab *slab,
void *object, u8 val) { return 1; }
-static inline depot_stack_handle_t set_track_prepare(void) { return 0; }
+static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) { return 0; }
static inline void set_track(struct kmem_cache *s, void *object,
- enum track_item alloc, unsigned long addr) {}
+ enum track_item alloc, unsigned long addr, gfp_t gfp_flags) {}
static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
struct slab *slab) {}
static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
@@ -3876,9 +3881,14 @@ new_objects:
* For debug caches here we had to go through
* alloc_single_from_partial() so just store the
* tracking info and return the object.
+ *
+ * Due to disabled preemption we need to disallow
+ * blocking. The flags are further adjusted by
+ * gfp_nested_mask() in stack_depot itself.
*/
if (s->flags & SLAB_STORE_USER)
- set_track(s, freelist, TRACK_ALLOC, addr);
+ set_track(s, freelist, TRACK_ALLOC, addr,
+ gfpflags & ~(__GFP_DIRECT_RECLAIM));
return freelist;
}
@@ -3910,7 +3920,8 @@ new_objects:
goto new_objects;
if (s->flags & SLAB_STORE_USER)
- set_track(s, freelist, TRACK_ALLOC, addr);
+ set_track(s, freelist, TRACK_ALLOC, addr,
+ gfpflags & ~(__GFP_DIRECT_RECLAIM));
return freelist;
}
@@ -4421,8 +4432,12 @@ static noinline void free_to_partial_list(
unsigned long flags;
depot_stack_handle_t handle = 0;
+ /*
+ * We cannot use GFP_NOWAIT as there are callsites where waking up
+ * kswapd could deadlock
+ */
if (s->flags & SLAB_STORE_USER)
- handle = set_track_prepare();
+ handle = set_track_prepare(__GFP_NOWARN);
spin_lock_irqsave(&n->list_lock, flags);
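Both slub call sites now pass the allocation's own gfp mask to set_track_prepare() but strip __GFP_DIRECT_RECLAIM, because the tracking runs with preemption disabled and must not sleep. A sketch of the masking idiom around stack depot (a standalone helper, not the patch's exact code):

#include <linux/gfp.h>
#include <linux/stackdepot.h>
#include <linux/stacktrace.h>

/* Record an allocation stack without ever sleeping. */
static depot_stack_handle_t record_stack(gfp_t alloc_gfp)
{
	unsigned long entries[16];
	unsigned int nr;

	nr = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
	/* Blocking is not allowed here, so mask out direct reclaim. */
	return stack_depot_save(entries, nr, alloc_gfp & ~__GFP_DIRECT_RECLAIM);
}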
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index fd2ab5118e13..dbd8daccade2 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -27,9 +27,9 @@
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
+#include <linux/pgalloc.h>
#include <asm/dma.h>
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"
@@ -229,7 +229,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
if (!p)
return NULL;
pud_init(p);
- p4d_populate(&init_mm, p4d, p);
+ p4d_populate_kernel(addr, p4d, p);
}
return p4d;
}
@@ -241,7 +241,7 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
if (!p)
return NULL;
- pgd_populate(&init_mm, pgd, p);
+ pgd_populate_kernel(addr, pgd, p);
}
return pgd;
}
@@ -578,11 +578,6 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn,
if (r < 0)
return NULL;
- if (system_state == SYSTEM_BOOTING)
- memmap_boot_pages_add(DIV_ROUND_UP(end - start, PAGE_SIZE));
- else
- memmap_pages_add(DIV_ROUND_UP(end - start, PAGE_SIZE));
-
return pfn_to_page(pfn);
}
diff --git a/mm/sparse.c b/mm/sparse.c
index 3c012cf83cc2..e6075b622407 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -454,9 +454,6 @@ static void __init sparse_buffer_init(unsigned long size, int nid)
*/
sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true);
sparsemap_buf_end = sparsemap_buf + size;
-#ifndef CONFIG_SPARSEMEM_VMEMMAP
- memmap_boot_pages_add(DIV_ROUND_UP(size, PAGE_SIZE));
-#endif
}
static void __init sparse_buffer_fini(void)
@@ -567,6 +564,8 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
sparse_buffer_fini();
goto failed;
}
+ memmap_boot_pages_add(DIV_ROUND_UP(PAGES_PER_SECTION * sizeof(struct page),
+ PAGE_SIZE));
sparse_init_early_section(nid, map, pnum, 0);
}
}
@@ -680,7 +679,6 @@ static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
unsigned long start = (unsigned long) pfn_to_page(pfn);
unsigned long end = start + nr_pages * sizeof(struct page);
- memmap_pages_add(-1L * (DIV_ROUND_UP(end - start, PAGE_SIZE)));
vmemmap_free(start, end, altmap);
}
static void free_map_bootmem(struct page *memmap)
@@ -856,10 +854,14 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
* The memmap of early sections is always fully populated. See
* section_activate() and pfn_valid() .
*/
- if (!section_is_early)
+ if (!section_is_early) {
+ memmap_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE)));
depopulate_section_memmap(pfn, nr_pages, altmap);
- else if (memmap)
+ } else if (memmap) {
+ memmap_boot_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page),
+ PAGE_SIZE)));
free_map_bootmem(memmap);
+ }
if (empty)
ms->section_mem_map = (unsigned long)NULL;
@@ -904,6 +906,7 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn,
section_deactivate(pfn, nr_pages, altmap);
return ERR_PTR(-ENOMEM);
}
+ memmap_pages_add(DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE));
return memmap;
}
diff --git a/mm/swap.c b/mm/swap.c
index 3632dd061beb..b74ebe865dd9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -164,6 +164,10 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
for (i = 0; i < folio_batch_count(fbatch); i++) {
struct folio *folio = fbatch->folios[i];
+ /* block memcg migration while the folio moves between lru */
+ if (move_fn != lru_add && !folio_test_clear_lru(folio))
+ continue;
+
folio_lruvec_relock_irqsave(folio, &lruvec, &flags);
move_fn(lruvec, folio);
@@ -176,14 +180,10 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
}
static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch,
- struct folio *folio, move_fn_t move_fn,
- bool on_lru, bool disable_irq)
+ struct folio *folio, move_fn_t move_fn, bool disable_irq)
{
unsigned long flags;
- if (on_lru && !folio_test_clear_lru(folio))
- return;
-
folio_get(folio);
if (disable_irq)
@@ -191,8 +191,8 @@ static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch,
else
local_lock(&cpu_fbatches.lock);
- if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || folio_test_large(folio) ||
- lru_cache_disabled())
+ if (!folio_batch_add(this_cpu_ptr(fbatch), folio) ||
+ !folio_may_be_lru_cached(folio) || lru_cache_disabled())
folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn);
if (disable_irq)
@@ -201,13 +201,13 @@ static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch,
local_unlock(&cpu_fbatches.lock);
}
-#define folio_batch_add_and_move(folio, op, on_lru) \
- __folio_batch_add_and_move( \
- &cpu_fbatches.op, \
- folio, \
- op, \
- on_lru, \
- offsetof(struct cpu_fbatches, op) >= offsetof(struct cpu_fbatches, lock_irq) \
+#define folio_batch_add_and_move(folio, op) \
+ __folio_batch_add_and_move( \
+ &cpu_fbatches.op, \
+ folio, \
+ op, \
+ offsetof(struct cpu_fbatches, op) >= \
+ offsetof(struct cpu_fbatches, lock_irq) \
)
static void lru_move_tail(struct lruvec *lruvec, struct folio *folio)
@@ -231,10 +231,10 @@ static void lru_move_tail(struct lruvec *lruvec, struct folio *folio)
void folio_rotate_reclaimable(struct folio *folio)
{
if (folio_test_locked(folio) || folio_test_dirty(folio) ||
- folio_test_unevictable(folio))
+ folio_test_unevictable(folio) || !folio_test_lru(folio))
return;
- folio_batch_add_and_move(folio, lru_move_tail, true);
+ folio_batch_add_and_move(folio, lru_move_tail);
}
void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file,
@@ -328,10 +328,11 @@ static void folio_activate_drain(int cpu)
void folio_activate(struct folio *folio)
{
- if (folio_test_active(folio) || folio_test_unevictable(folio))
+ if (folio_test_active(folio) || folio_test_unevictable(folio) ||
+ !folio_test_lru(folio))
return;
- folio_batch_add_and_move(folio, lru_activate, true);
+ folio_batch_add_and_move(folio, lru_activate);
}
#else
@@ -507,7 +508,7 @@ void folio_add_lru(struct folio *folio)
lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
folio_set_active(folio);
- folio_batch_add_and_move(folio, lru_add, false);
+ folio_batch_add_and_move(folio, lru_add);
}
EXPORT_SYMBOL(folio_add_lru);
@@ -685,13 +686,13 @@ void lru_add_drain_cpu(int cpu)
void deactivate_file_folio(struct folio *folio)
{
/* Deactivating an unevictable folio will not accelerate reclaim */
- if (folio_test_unevictable(folio))
+ if (folio_test_unevictable(folio) || !folio_test_lru(folio))
return;
if (lru_gen_enabled() && lru_gen_clear_refs(folio))
return;
- folio_batch_add_and_move(folio, lru_deactivate_file, true);
+ folio_batch_add_and_move(folio, lru_deactivate_file);
}
/*
@@ -704,13 +705,13 @@ void deactivate_file_folio(struct folio *folio)
*/
void folio_deactivate(struct folio *folio)
{
- if (folio_test_unevictable(folio))
+ if (folio_test_unevictable(folio) || !folio_test_lru(folio))
return;
if (lru_gen_enabled() ? lru_gen_clear_refs(folio) : !folio_test_active(folio))
return;
- folio_batch_add_and_move(folio, lru_deactivate, true);
+ folio_batch_add_and_move(folio, lru_deactivate);
}
/**
@@ -723,10 +724,11 @@ void folio_deactivate(struct folio *folio)
void folio_mark_lazyfree(struct folio *folio)
{
if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) ||
+ !folio_test_lru(folio) ||
folio_test_swapcache(folio) || folio_test_unevictable(folio))
return;
- folio_batch_add_and_move(folio, lru_lazyfree, true);
+ folio_batch_add_and_move(folio, lru_lazyfree);
}
void lru_add_drain(void)
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index cbed91b09640..aefdf3a812a1 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1453,10 +1453,15 @@ out:
folio_unlock(src_folio);
folio_put(src_folio);
}
- if (dst_pte)
- pte_unmap(dst_pte);
+ /*
+ * Unmap in reverse order (LIFO) to maintain proper kmap_local
+ * index ordering when CONFIG_HIGHPTE is enabled. We mapped dst_pte
+ * first, then src_pte, so we must unmap src_pte first, then dst_pte.
+ */
if (src_pte)
pte_unmap(src_pte);
+ if (dst_pte)
+ pte_unmap(dst_pte);
mmu_notifier_invalidate_range_end(&range);
if (si)
put_swap_device(si);
@@ -1821,13 +1826,16 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
/* Check if we can move the pmd without splitting it. */
if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
!pmd_none(dst_pmdval)) {
- struct folio *folio = pmd_folio(*src_pmd);
-
- if (!folio || (!is_huge_zero_folio(folio) &&
- !PageAnonExclusive(&folio->page))) {
- spin_unlock(ptl);
- err = -EBUSY;
- break;
+ /* Can be a migration entry */
+ if (pmd_present(*src_pmd)) {
+ struct folio *folio = pmd_folio(*src_pmd);
+
+ if (!is_huge_zero_folio(folio) &&
+ !PageAnonExclusive(&folio->page)) {
+ spin_unlock(ptl);
+ err = -EBUSY;
+ break;
+ }
}
spin_unlock(ptl);
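The userfaultfd comment above is about kmap_local's stack discipline: with CONFIG_HIGHPTE, pte_offset_map() mappings are kmap_local slots and must be released in reverse order of acquisition. A standalone sketch of the same rule using kmap_local_page():

#include <linux/highmem.h>
#include <linux/string.h>

static void copy_between_pages(struct page *dst, struct page *src)
{
	void *d = kmap_local_page(dst);	/* mapped first */
	void *s = kmap_local_page(src);	/* mapped second */

	memcpy(d, s, PAGE_SIZE);

	/* LIFO: unmap the most recently mapped page first. */
	kunmap_local(s);
	kunmap_local(d);
}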
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6dbcdceecae1..5edd536ba9d2 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2026,6 +2026,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
if (unlikely(!vmap_initialized))
return ERR_PTR(-EBUSY);
+ /* Only reclaim behaviour flags are relevant. */
+ gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
might_sleep();
/*
@@ -2038,8 +2040,6 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
*/
va = node_alloc(size, align, vstart, vend, &addr, &vn_id);
if (!va) {
- gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
-
va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
if (unlikely(!va))
return ERR_PTR(-ENOMEM);
@@ -2089,7 +2089,7 @@ retry:
BUG_ON(va->va_start < vstart);
BUG_ON(va->va_end > vend);
- ret = kasan_populate_vmalloc(addr, size);
+ ret = kasan_populate_vmalloc(addr, size, gfp_mask);
if (ret) {
free_vmap_area(va);
return ERR_PTR(ret);
@@ -4826,7 +4826,7 @@ retry:
/* populate the kasan shadow space */
for (area = 0; area < nr_vms; area++) {
- if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
+ if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area], GFP_KERNEL))
goto err_free_shadow;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7de11524a936..674999999cd0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4507,7 +4507,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
}
/* ineligible */
- if (!folio_test_lru(folio) || zone > sc->reclaim_idx) {
+ if (zone > sc->reclaim_idx) {
gen = folio_inc_gen(lruvec, folio, false);
list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;
@@ -5772,9 +5772,9 @@ static int __init init_lru_gen(void)
if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
pr_err("lru_gen: failed to create sysfs group\n");
- debugfs_create_file_aux_num("lru_gen", 0644, NULL, NULL, 1,
+ debugfs_create_file_aux_num("lru_gen", 0644, NULL, NULL, false,
&lru_gen_rw_fops);
- debugfs_create_file_aux_num("lru_gen_full", 0444, NULL, NULL, 0,
+ debugfs_create_file_aux_num("lru_gen_full", 0444, NULL, NULL, true,
&lru_gen_ro_fops);
return 0;
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 2c5e56a65354..805a10b41266 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -2246,9 +2246,16 @@ EXPORT_SYMBOL_GPL(zs_destroy_pool);
static int __init zs_init(void)
{
+ int rc __maybe_unused;
+
#ifdef CONFIG_ZPOOL
zpool_register_driver(&zs_zpool_driver);
#endif
+#ifdef CONFIG_COMPACTION
+ rc = set_movable_ops(&zsmalloc_mops, PGTY_zsmalloc);
+ if (rc)
+ return rc;
+#endif
zs_stat_init();
return 0;
}
@@ -2258,6 +2265,9 @@ static void __exit zs_exit(void)
#ifdef CONFIG_ZPOOL
zpool_unregister_driver(&zs_zpool_driver);
#endif
+#ifdef CONFIG_COMPACTION
+ set_movable_ops(NULL, PGTY_zsmalloc);
+#endif
zs_stat_exit();
}
diff --git a/net/atm/common.c b/net/atm/common.c
index d7f7976ea13a..881c7f259dbd 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -635,18 +635,27 @@ int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t size)
skb->dev = NULL; /* for paths shared with net_device interfaces */
if (!copy_from_iter_full(skb_put(skb, size), size, &m->msg_iter)) {
- atm_return_tx(vcc, skb);
- kfree_skb(skb);
error = -EFAULT;
- goto out;
+ goto free_skb;
}
if (eff != size)
memset(skb->data + size, 0, eff-size);
+
+ if (vcc->dev->ops->pre_send) {
+ error = vcc->dev->ops->pre_send(vcc, skb);
+ if (error)
+ goto free_skb;
+ }
+
error = vcc->dev->ops->send(vcc, skb);
error = error ? error : size;
out:
release_sock(sk);
return error;
+free_skb:
+ atm_return_tx(vcc, skb);
+ kfree_skb(skb);
+ goto out;
}
__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait)
diff --git a/net/atm/resources.c b/net/atm/resources.c
index b19d851e1f44..7c6fdedbcf4e 100644
--- a/net/atm/resources.c
+++ b/net/atm/resources.c
@@ -112,7 +112,9 @@ struct atm_dev *atm_dev_register(const char *type, struct device *parent,
if (atm_proc_dev_register(dev) < 0) {
pr_err("atm_proc_dev_register failed for dev %s\n", type);
- goto out_fail;
+ mutex_unlock(&atm_dev_mutex);
+ kfree(dev);
+ return NULL;
}
if (atm_register_sysfs(dev, parent) < 0) {
@@ -128,7 +130,7 @@ out:
return dev;
out_fail:
- kfree(dev);
+ put_device(&dev->class_dev);
dev = NULL;
goto out;
}
diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c
index 1cac25aca637..f2d66af86359 100644
--- a/net/ax25/ax25_in.c
+++ b/net/ax25/ax25_in.c
@@ -433,6 +433,10 @@ free:
int ax25_kiss_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *ptype, struct net_device *orig_dev)
{
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ return NET_RX_DROP;
+
skb_orphan(skb);
if (!net_eq(dev_net(dev), &init_net)) {
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index 9f56308779cc..af97d077369f 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -1687,7 +1687,12 @@ batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb,
coding_len = ntohs(coded_packet_tmp.coded_len);
- if (coding_len > skb->len)
+ /* ensure dst buffer is large enough (payload only) */
+ if (coding_len + h_size > skb->len)
+ return NULL;
+
+ /* ensure src buffer is large enough (payload only) */
+ if (coding_len + h_size > nc_packet->skb->len)
return NULL;
/* Here the magic is reversed:
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 7d1e79f69cd1..e524bb59bff2 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -149,8 +149,6 @@ static void hci_conn_cleanup(struct hci_conn *conn)
hci_chan_list_flush(conn);
- hci_conn_hash_del(hdev, conn);
-
if (HCI_CONN_HANDLE_UNSET(conn->handle))
ida_free(&hdev->unset_handle_ida, conn->handle);
@@ -339,7 +337,8 @@ static int hci_enhanced_setup_sync(struct hci_dev *hdev, void *data)
case BT_CODEC_TRANSPARENT:
if (!find_next_esco_param(conn, esco_param_msbc,
ARRAY_SIZE(esco_param_msbc)))
- return false;
+ return -EINVAL;
+
param = &esco_param_msbc[conn->attempt - 1];
cp.tx_coding_format.id = 0x03;
cp.rx_coding_format.id = 0x03;
@@ -830,7 +829,17 @@ static void bis_cleanup(struct hci_conn *conn)
/* Check if ISO connection is a BIS and terminate advertising
* set and BIG if there are no other connections using it.
*/
- bis = hci_conn_hash_lookup_big(hdev, conn->iso_qos.bcast.big);
+ bis = hci_conn_hash_lookup_big_state(hdev,
+ conn->iso_qos.bcast.big,
+ BT_CONNECTED,
+ HCI_ROLE_MASTER);
+ if (bis)
+ return;
+
+ bis = hci_conn_hash_lookup_big_state(hdev,
+ conn->iso_qos.bcast.big,
+ BT_CONNECT,
+ HCI_ROLE_MASTER);
if (bis)
return;
@@ -1141,28 +1150,54 @@ void hci_conn_del(struct hci_conn *conn)
disable_delayed_work_sync(&conn->auto_accept_work);
disable_delayed_work_sync(&conn->idle_work);
- if (conn->type == ACL_LINK) {
- /* Unacked frames */
- hdev->acl_cnt += conn->sent;
- } else if (conn->type == LE_LINK) {
- cancel_delayed_work(&conn->le_conn_timeout);
+ /* Remove the connection from the list so unacked logic can detect when
+ * a certain pool is not being utilized.
+ */
+ hci_conn_hash_del(hdev, conn);
- if (hdev->le_pkts)
- hdev->le_cnt += conn->sent;
+ /* Handle unacked frames:
+ *
+ * - In case there are no connection, or if restoring the buffers
+ * considered in transist would overflow, restore all buffers to the
+ * pool.
+ * - Otherwise restore just the buffers considered in transit for the
+ * hci_conn
+ */
+ switch (conn->type) {
+ case ACL_LINK:
+ if (!hci_conn_num(hdev, ACL_LINK) ||
+ hdev->acl_cnt + conn->sent > hdev->acl_pkts)
+ hdev->acl_cnt = hdev->acl_pkts;
else
hdev->acl_cnt += conn->sent;
- } else {
- /* Unacked ISO frames */
- if (conn->type == CIS_LINK ||
- conn->type == BIS_LINK ||
- conn->type == PA_LINK) {
- if (hdev->iso_pkts)
- hdev->iso_cnt += conn->sent;
- else if (hdev->le_pkts)
+ break;
+ case LE_LINK:
+ cancel_delayed_work(&conn->le_conn_timeout);
+
+ if (hdev->le_pkts) {
+ if (!hci_conn_num(hdev, LE_LINK) ||
+ hdev->le_cnt + conn->sent > hdev->le_pkts)
+ hdev->le_cnt = hdev->le_pkts;
+ else
hdev->le_cnt += conn->sent;
+ } else {
+ if ((!hci_conn_num(hdev, LE_LINK) &&
+ !hci_conn_num(hdev, ACL_LINK)) ||
+ hdev->acl_cnt + conn->sent > hdev->acl_pkts)
+ hdev->acl_cnt = hdev->acl_pkts;
else
hdev->acl_cnt += conn->sent;
}
+ break;
+ case CIS_LINK:
+ case BIS_LINK:
+ case PA_LINK:
+ if (!hci_iso_count(hdev) ||
+ hdev->iso_cnt + conn->sent > hdev->iso_pkts)
+ hdev->iso_cnt = hdev->iso_pkts;
+ else
+ hdev->iso_cnt += conn->sent;
+ break;
}
skb_queue_purge(&conn->data_q);
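The unacked-frame handling above returns the buffer credits still attributed to the deleted connection to the matching ACL/LE/ISO pool, clamping at the pool size so that a stale or over-counted sent value cannot push the free-buffer counter past the number of buffers the controller actually has. A small, self-contained user-space sketch of that clamping rule (restore_credits() and the other_users flag are illustrative stand-ins, not the kernel's API):

#include <stdio.h>

/* Restore "sent" in-flight credits to a pool of "pkts" buffers.
 * If the pool has no other users, or restoring would overflow,
 * reset the counter to the full pool size instead. */
static unsigned int restore_credits(unsigned int cnt, unsigned int sent,
                                    unsigned int pkts, int other_users)
{
    if (!other_users || cnt + sent > pkts)
        return pkts;        /* clamp: hand the whole pool back */

    return cnt + sent;      /* normal case: return only our credits */
}

int main(void)
{
    /* 2 credits free, 2 in flight, pool of 5, other connections exist */
    printf("%u\n", restore_credits(2, 2, 5, 1));   /* 4 */
    /* over-counted "sent" would overflow the pool -> clamp to 5 */
    printf("%u\n", restore_credits(3, 7, 5, 1));   /* 5 */
    /* last user of the pool -> reset to the full pool size */
    printf("%u\n", restore_credits(0, 1, 5, 0));   /* 5 */
    return 0;
}

Compiled standalone, the three calls show the normal restore, the overflow clamp, and the reset taken when no other connection still uses the pool.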
@@ -2249,7 +2284,7 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst,
* the start periodic advertising and create BIG commands have
* been queued
*/
- hci_conn_hash_list_state(hdev, bis_mark_per_adv, PA_LINK,
+ hci_conn_hash_list_state(hdev, bis_mark_per_adv, BIS_LINK,
BT_BOUND, &data);
/* Queue start periodic advertising and create BIG */
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 8aa5039b975a..fe49e8a7969f 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -2703,7 +2703,7 @@ static void hci_cs_disconnect(struct hci_dev *hdev, u8 status)
if (!conn)
goto unlock;
- if (status) {
+ if (status && status != HCI_ERROR_UNKNOWN_CONN_ID) {
mgmt_disconnect_failed(hdev, &conn->dst, conn->type,
conn->dst_type, status);
@@ -2718,6 +2718,12 @@ static void hci_cs_disconnect(struct hci_dev *hdev, u8 status)
goto done;
}
+ /* During suspend, mark connection as closed immediately
+ * since we might not receive HCI_EV_DISCONN_COMPLETE
+ */
+ if (hdev->suspended)
+ conn->state = BT_CLOSED;
+
mgmt_conn = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags);
if (conn->type == ACL_LINK) {
@@ -3081,8 +3087,18 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data,
hci_dev_lock(hdev);
+ /* Check for existing connection:
+ *
+ * 1. If it doesn't exist then it must be the receiver/slave role.
+ * 2. If it does exist, confirm that it is connecting/BT_CONNECT in case
+ * of the initiator/master role, since there could be a collision where
+ * either side is attempting to connect, or something like fuzz
+ * testing is trying to play tricks to destroy the hcon object before
+ * it even attempts to connect (e.g. hcon->state == BT_OPEN).
+ */
conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr);
- if (!conn) {
+ if (!conn ||
+ (conn->role == HCI_ROLE_MASTER && conn->state != BT_CONNECT)) {
/* In case of error status and there is no connection pending
* just unlock as there is nothing to cleanup.
*/
@@ -4385,6 +4401,8 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data,
bt_dev_dbg(hdev, "num %d", ev->num);
+ hci_dev_lock(hdev);
+
for (i = 0; i < ev->num; i++) {
struct hci_comp_pkts_info *info = &ev->handles[i];
struct hci_conn *conn;
@@ -4398,7 +4416,17 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data,
if (!conn)
continue;
- conn->sent -= count;
+ /* Check if there really are enough packets outstanding before
+ * attempting to decrease the sent counter, otherwise it could
+ * underflow.
+ */
+ if (conn->sent >= count) {
+ conn->sent -= count;
+ } else {
+ bt_dev_warn(hdev, "hcon %p sent %u < count %u",
+ conn, conn->sent, count);
+ conn->sent = 0;
+ }
for (i = 0; i < count; ++i)
hci_conn_tx_dequeue(conn);
@@ -4456,6 +4484,8 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data,
}
queue_work(hdev->workqueue, &hdev->tx_work);
+
+ hci_dev_unlock(hdev);
}
static void hci_mode_change_evt(struct hci_dev *hdev, void *data,
@@ -5618,8 +5648,18 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
*/
hci_dev_clear_flag(hdev, HCI_LE_ADV);
- conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, bdaddr);
- if (!conn) {
+ /* Check for existing connection:
+ *
+ * 1. If it doesn't exist then use the role to create a new object.
+ * 2. If it does exist, confirm that it is connecting/BT_CONNECT in case
+ * of the initiator/master role, since there could be a collision where
+ * either side is attempting to connect, or something like fuzz
+ * testing is trying to play tricks to destroy the hcon object before
+ * it even attempts to connect (e.g. hcon->state == BT_OPEN).
+ */
+ conn = hci_conn_hash_lookup_role(hdev, LE_LINK, role, bdaddr);
+ if (!conn ||
+ (conn->role == HCI_ROLE_MASTER && conn->state != BT_CONNECT)) {
/* In case of error status and there is no connection pending
* just unlock as there is nothing to cleanup.
*/
@@ -6745,8 +6785,8 @@ static void hci_le_cis_established_evt(struct hci_dev *hdev, void *data,
qos->ucast.out.latency =
DIV_ROUND_CLOSEST(get_unaligned_le24(ev->p_latency),
1000);
- qos->ucast.in.sdu = le16_to_cpu(ev->c_mtu);
- qos->ucast.out.sdu = le16_to_cpu(ev->p_mtu);
+ qos->ucast.in.sdu = ev->c_bn ? le16_to_cpu(ev->c_mtu) : 0;
+ qos->ucast.out.sdu = ev->p_bn ? le16_to_cpu(ev->p_mtu) : 0;
qos->ucast.in.phy = ev->c_phy;
qos->ucast.out.phy = ev->p_phy;
break;
@@ -6760,8 +6800,8 @@ static void hci_le_cis_established_evt(struct hci_dev *hdev, void *data,
qos->ucast.in.latency =
DIV_ROUND_CLOSEST(get_unaligned_le24(ev->p_latency),
1000);
- qos->ucast.out.sdu = le16_to_cpu(ev->c_mtu);
- qos->ucast.in.sdu = le16_to_cpu(ev->p_mtu);
+ qos->ucast.out.sdu = ev->c_bn ? le16_to_cpu(ev->c_mtu) : 0;
+ qos->ucast.in.sdu = ev->p_bn ? le16_to_cpu(ev->p_mtu) : 0;
qos->ucast.out.phy = ev->c_phy;
qos->ucast.in.phy = ev->p_phy;
break;
@@ -6957,9 +6997,14 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data,
continue;
}
- if (ev->status != 0x42)
+ if (ev->status != 0x42) {
/* Mark PA sync as established */
set_bit(HCI_CONN_PA_SYNC, &bis->flags);
+ /* Reset cleanup callback of PA Sync so it doesn't
+ * terminate the sync when deleting the connection.
+ */
+ conn->cleanup = NULL;
+ }
bis->sync_handle = conn->sync_handle;
bis->iso_qos.bcast.big = ev->handle;
@@ -7003,6 +7048,7 @@ static void hci_le_big_sync_lost_evt(struct hci_dev *hdev, void *data,
{
struct hci_evt_le_big_sync_lost *ev = data;
struct hci_conn *bis, *conn;
+ bool mgmt_conn;
bt_dev_dbg(hdev, "big handle 0x%2.2x", ev->handle);
@@ -7021,6 +7067,10 @@ static void hci_le_big_sync_lost_evt(struct hci_dev *hdev, void *data,
while ((bis = hci_conn_hash_lookup_big_state(hdev, ev->handle,
BT_CONNECTED,
HCI_ROLE_SLAVE))) {
+ mgmt_conn = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &bis->flags);
+ mgmt_device_disconnected(hdev, &bis->dst, bis->type, bis->dst_type,
+ ev->reason, mgmt_conn);
+
clear_bit(HCI_CONN_BIG_SYNC, &bis->flags);
hci_disconn_cfm(bis, ev->reason);
hci_conn_del(bis);
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 2b4f21fbf9c1..7a7d49890858 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -2594,6 +2594,13 @@ static int hci_resume_advertising_sync(struct hci_dev *hdev)
hci_remove_ext_adv_instance_sync(hdev, adv->instance,
NULL);
}
+
+ /* If the current advertising instance is set to instance 0x00
+ * then we need to re-enable it.
+ */
+ if (!hdev->cur_adv_instance)
+ err = hci_enable_ext_advertising_sync(hdev,
+ hdev->cur_adv_instance);
} else {
/* Schedule for most recent instance to be restarted and begin
* the software rotation loop
@@ -3344,7 +3351,7 @@ static int hci_powered_update_adv_sync(struct hci_dev *hdev)
* advertising data. This also applies to the case
* where BR/EDR was toggled during the AUTO_OFF phase.
*/
- if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) &&
list_empty(&hdev->adv_instances)) {
if (ext_adv_capable(hdev)) {
err = hci_setup_ext_adv_instance_sync(hdev, 0x00);
@@ -3481,13 +3488,13 @@ int hci_update_scan_sync(struct hci_dev *hdev)
return hci_write_scan_enable_sync(hdev, scan);
}
-int hci_update_name_sync(struct hci_dev *hdev)
+int hci_update_name_sync(struct hci_dev *hdev, const u8 *name)
{
struct hci_cp_write_local_name cp;
memset(&cp, 0, sizeof(cp));
- memcpy(cp.name, hdev->dev_name, sizeof(cp.name));
+ memcpy(cp.name, name, sizeof(cp.name));
return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_LOCAL_NAME,
sizeof(cp), &cp,
@@ -3540,7 +3547,7 @@ int hci_powered_update_sync(struct hci_dev *hdev)
hci_write_fast_connectable_sync(hdev, false);
hci_update_scan_sync(hdev);
hci_update_class_sync(hdev);
- hci_update_name_sync(hdev);
+ hci_update_name_sync(hdev, hdev->dev_name);
hci_update_eir_sync(hdev);
}
@@ -4531,14 +4538,14 @@ static int hci_le_set_host_feature_sync(struct hci_dev *hdev)
{
struct hci_cp_le_set_host_feature cp;
- if (!cis_capable(hdev))
+ if (!iso_capable(hdev))
return 0;
memset(&cp, 0, sizeof(cp));
/* Connected Isochronous Channels (Host Support) */
cp.bit_number = 32;
- cp.bit_value = 1;
+ cp.bit_value = iso_enabled(hdev) ? 0x01 : 0x00;
return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_HOST_FEATURE,
sizeof(cp), &cp, HCI_CMD_TIMEOUT);
@@ -6985,8 +6992,6 @@ static void create_pa_complete(struct hci_dev *hdev, void *data, int err)
hci_dev_lock(hdev);
- hci_dev_clear_flag(hdev, HCI_PA_SYNC);
-
if (!hci_conn_valid(hdev, conn))
clear_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags);
@@ -7047,10 +7052,13 @@ static int hci_le_pa_create_sync(struct hci_dev *hdev, void *data)
/* SID has not been set listen for HCI_EV_LE_EXT_ADV_REPORT to update
* it.
*/
- if (conn->sid == HCI_SID_INVALID)
- __hci_cmd_sync_status_sk(hdev, HCI_OP_NOP, 0, NULL,
- HCI_EV_LE_EXT_ADV_REPORT,
- conn->conn_timeout, NULL);
+ if (conn->sid == HCI_SID_INVALID) {
+ err = __hci_cmd_sync_status_sk(hdev, HCI_OP_NOP, 0, NULL,
+ HCI_EV_LE_EXT_ADV_REPORT,
+ conn->conn_timeout, NULL);
+ if (err == -ETIMEDOUT)
+ goto done;
+ }
memset(&cp, 0, sizeof(cp));
cp.options = qos->bcast.options;
@@ -7080,6 +7088,12 @@ static int hci_le_pa_create_sync(struct hci_dev *hdev, void *data)
__hci_cmd_sync_status(hdev, HCI_OP_LE_PA_CREATE_SYNC_CANCEL,
0, NULL, HCI_CMD_TIMEOUT);
+done:
+ hci_dev_clear_flag(hdev, HCI_PA_SYNC);
+
+ /* Update passive scan since HCI_PA_SYNC flag has been cleared */
+ hci_update_passive_scan_sync(hdev);
+
return err;
}
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index 7bd3aa0a6db9..5ce823ca3aaf 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -1347,7 +1347,7 @@ static int iso_sock_getname(struct socket *sock, struct sockaddr *addr,
bacpy(&sa->iso_bdaddr, &iso_pi(sk)->dst);
sa->iso_bdaddr_type = iso_pi(sk)->dst_type;
- if (hcon && hcon->type == BIS_LINK) {
+ if (hcon && (hcon->type == BIS_LINK || hcon->type == PA_LINK)) {
sa->iso_bc->bc_sid = iso_pi(sk)->bc_sid;
sa->iso_bc->bc_num_bis = iso_pi(sk)->bc_num_bis;
memcpy(sa->iso_bc->bc_bis, iso_pi(sk)->bc_bis,
@@ -2483,11 +2483,11 @@ static const struct net_proto_family iso_sock_family_ops = {
.create = iso_sock_create,
};
-static bool iso_inited;
+static bool inited;
-bool iso_enabled(void)
+bool iso_inited(void)
{
- return iso_inited;
+ return inited;
}
int iso_init(void)
@@ -2496,7 +2496,7 @@ int iso_init(void)
BUILD_BUG_ON(sizeof(struct sockaddr_iso) > sizeof(struct sockaddr));
- if (iso_inited)
+ if (inited)
return -EALREADY;
err = proto_register(&iso_proto, 0);
@@ -2524,7 +2524,7 @@ int iso_init(void)
iso_debugfs = debugfs_create_file("iso", 0444, bt_debugfs,
NULL, &iso_debugfs_fops);
- iso_inited = true;
+ inited = true;
return 0;
@@ -2535,7 +2535,7 @@ error:
int iso_exit(void)
{
- if (!iso_inited)
+ if (!inited)
return -EALREADY;
bt_procfs_cleanup(&init_net, "iso");
@@ -2549,7 +2549,7 @@ int iso_exit(void)
proto_unregister(&iso_proto);
- iso_inited = false;
+ inited = false;
return 0;
}
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index f4257c4d3052..814fb8610ac4 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -1422,7 +1422,10 @@ static int l2cap_sock_release(struct socket *sock)
if (!sk)
return 0;
+ lock_sock_nested(sk, L2CAP_NESTING_PARENT);
l2cap_sock_cleanup_listen(sk);
+ release_sock(sk);
+
bt_sock_unlink(&l2cap_sk_list, sk);
err = l2cap_sock_shutdown(sock, SHUT_RDWR);
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 1ce682038b51..225140fcb3d6 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -922,19 +922,19 @@ static u32 get_current_settings(struct hci_dev *hdev)
if (hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED))
settings |= MGMT_SETTING_WIDEBAND_SPEECH;
- if (cis_central_capable(hdev))
+ if (cis_central_enabled(hdev))
settings |= MGMT_SETTING_CIS_CENTRAL;
- if (cis_peripheral_capable(hdev))
+ if (cis_peripheral_enabled(hdev))
settings |= MGMT_SETTING_CIS_PERIPHERAL;
- if (bis_capable(hdev))
+ if (bis_enabled(hdev))
settings |= MGMT_SETTING_ISO_BROADCASTER;
- if (sync_recv_capable(hdev))
+ if (sync_recv_enabled(hdev))
settings |= MGMT_SETTING_ISO_SYNC_RECEIVER;
- if (ll_privacy_capable(hdev))
+ if (ll_privacy_enabled(hdev))
settings |= MGMT_SETTING_LL_PRIVACY;
return settings;
@@ -1323,8 +1323,7 @@ static void mgmt_set_powered_complete(struct hci_dev *hdev, void *data, int err)
struct mgmt_mode *cp;
/* Make sure cmd still outstanding. */
- if (err == -ECANCELED ||
- cmd != pending_find(MGMT_OP_SET_POWERED, hdev))
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd))
return;
cp = cmd->param;
@@ -1351,23 +1350,29 @@ static void mgmt_set_powered_complete(struct hci_dev *hdev, void *data, int err)
mgmt_status(err));
}
- mgmt_pending_remove(cmd);
+ mgmt_pending_free(cmd);
}
static int set_powered_sync(struct hci_dev *hdev, void *data)
{
struct mgmt_pending_cmd *cmd = data;
- struct mgmt_mode *cp;
+ struct mgmt_mode cp;
+
+ mutex_lock(&hdev->mgmt_pending_lock);
/* Make sure cmd still outstanding. */
- if (cmd != pending_find(MGMT_OP_SET_POWERED, hdev))
+ if (!__mgmt_pending_listed(hdev, cmd)) {
+ mutex_unlock(&hdev->mgmt_pending_lock);
return -ECANCELED;
+ }
- cp = cmd->param;
+ memcpy(&cp, cmd->param, sizeof(cp));
+
+ mutex_unlock(&hdev->mgmt_pending_lock);
BT_DBG("%s", hdev->name);
- return hci_set_powered_sync(hdev, cp->val);
+ return hci_set_powered_sync(hdev, cp.val);
}
static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data,
@@ -1516,8 +1521,7 @@ static void mgmt_set_discoverable_complete(struct hci_dev *hdev, void *data,
bt_dev_dbg(hdev, "err %d", err);
/* Make sure cmd still outstanding. */
- if (err == -ECANCELED ||
- cmd != pending_find(MGMT_OP_SET_DISCOVERABLE, hdev))
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd))
return;
hci_dev_lock(hdev);
@@ -1539,12 +1543,15 @@ static void mgmt_set_discoverable_complete(struct hci_dev *hdev, void *data,
new_settings(hdev, cmd->sk);
done:
- mgmt_pending_remove(cmd);
+ mgmt_pending_free(cmd);
hci_dev_unlock(hdev);
}
static int set_discoverable_sync(struct hci_dev *hdev, void *data)
{
+ if (!mgmt_pending_listed(hdev, data))
+ return -ECANCELED;
+
BT_DBG("%s", hdev->name);
return hci_update_discoverable_sync(hdev);
@@ -1691,8 +1698,7 @@ static void mgmt_set_connectable_complete(struct hci_dev *hdev, void *data,
bt_dev_dbg(hdev, "err %d", err);
/* Make sure cmd still outstanding. */
- if (err == -ECANCELED ||
- cmd != pending_find(MGMT_OP_SET_CONNECTABLE, hdev))
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd))
return;
hci_dev_lock(hdev);
@@ -1707,7 +1713,7 @@ static void mgmt_set_connectable_complete(struct hci_dev *hdev, void *data,
new_settings(hdev, cmd->sk);
done:
- mgmt_pending_remove(cmd);
+ mgmt_pending_free(cmd);
hci_dev_unlock(hdev);
}
@@ -1743,6 +1749,9 @@ static int set_connectable_update_settings(struct hci_dev *hdev,
static int set_connectable_sync(struct hci_dev *hdev, void *data)
{
+ if (!mgmt_pending_listed(hdev, data))
+ return -ECANCELED;
+
BT_DBG("%s", hdev->name);
return hci_update_connectable_sync(hdev);
@@ -1919,14 +1928,17 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err)
{
struct cmd_lookup match = { NULL, hdev };
struct mgmt_pending_cmd *cmd = data;
- struct mgmt_mode *cp = cmd->param;
- u8 enable = cp->val;
+ struct mgmt_mode *cp;
+ u8 enable;
bool changed;
/* Make sure cmd still outstanding. */
- if (err == -ECANCELED || cmd != pending_find(MGMT_OP_SET_SSP, hdev))
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd))
return;
+ cp = cmd->param;
+ enable = cp->val;
+
if (err) {
u8 mgmt_err = mgmt_status(err);
@@ -1935,8 +1947,7 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err)
new_settings(hdev, NULL);
}
- mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, true,
- cmd_status_rsp, &mgmt_err);
+ mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_err);
return;
}
@@ -1946,7 +1957,7 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err)
changed = hci_dev_test_and_clear_flag(hdev, HCI_SSP_ENABLED);
}
- mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, true, settings_rsp, &match);
+ settings_rsp(cmd, &match);
if (changed)
new_settings(hdev, match.sk);
@@ -1960,14 +1971,25 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err)
static int set_ssp_sync(struct hci_dev *hdev, void *data)
{
struct mgmt_pending_cmd *cmd = data;
- struct mgmt_mode *cp = cmd->param;
+ struct mgmt_mode cp;
bool changed = false;
int err;
- if (cp->val)
+ mutex_lock(&hdev->mgmt_pending_lock);
+
+ if (!__mgmt_pending_listed(hdev, cmd)) {
+ mutex_unlock(&hdev->mgmt_pending_lock);
+ return -ECANCELED;
+ }
+
+ memcpy(&cp, cmd->param, sizeof(cp));
+
+ mutex_unlock(&hdev->mgmt_pending_lock);
+
+ if (cp.val)
changed = !hci_dev_test_and_set_flag(hdev, HCI_SSP_ENABLED);
- err = hci_write_ssp_mode_sync(hdev, cp->val);
+ err = hci_write_ssp_mode_sync(hdev, cp.val);
if (!err && changed)
hci_dev_clear_flag(hdev, HCI_SSP_ENABLED);
@@ -2060,32 +2082,50 @@ static int set_hs(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
static void set_le_complete(struct hci_dev *hdev, void *data, int err)
{
+ struct mgmt_pending_cmd *cmd = data;
struct cmd_lookup match = { NULL, hdev };
u8 status = mgmt_status(err);
bt_dev_dbg(hdev, "err %d", err);
- if (status) {
- mgmt_pending_foreach(MGMT_OP_SET_LE, hdev, true, cmd_status_rsp,
- &status);
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, data))
return;
+
+ if (status) {
+ mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, status);
+ goto done;
}
- mgmt_pending_foreach(MGMT_OP_SET_LE, hdev, true, settings_rsp, &match);
+ settings_rsp(cmd, &match);
new_settings(hdev, match.sk);
if (match.sk)
sock_put(match.sk);
+
+done:
+ mgmt_pending_free(cmd);
}
static int set_le_sync(struct hci_dev *hdev, void *data)
{
struct mgmt_pending_cmd *cmd = data;
- struct mgmt_mode *cp = cmd->param;
- u8 val = !!cp->val;
+ struct mgmt_mode cp;
+ u8 val;
int err;
+ mutex_lock(&hdev->mgmt_pending_lock);
+
+ if (!__mgmt_pending_listed(hdev, cmd)) {
+ mutex_unlock(&hdev->mgmt_pending_lock);
+ return -ECANCELED;
+ }
+
+ memcpy(&cp, cmd->param, sizeof(cp));
+ val = !!cp.val;
+
+ mutex_unlock(&hdev->mgmt_pending_lock);
+
if (!val) {
hci_clear_adv_instance_sync(hdev, NULL, 0x00, true);
@@ -2127,7 +2167,12 @@ static void set_mesh_complete(struct hci_dev *hdev, void *data, int err)
{
struct mgmt_pending_cmd *cmd = data;
u8 status = mgmt_status(err);
- struct sock *sk = cmd->sk;
+ struct sock *sk;
+
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd))
+ return;
+
+ sk = cmd->sk;
if (status) {
mgmt_pending_foreach(MGMT_OP_SET_MESH_RECEIVER, hdev, true,
@@ -2142,24 +2187,37 @@ static void set_mesh_complete(struct hci_dev *hdev, void *data, int err)
static int set_mesh_sync(struct hci_dev *hdev, void *data)
{
struct mgmt_pending_cmd *cmd = data;
- struct mgmt_cp_set_mesh *cp = cmd->param;
- size_t len = cmd->param_len;
+ struct mgmt_cp_set_mesh cp;
+ size_t len;
+
+ mutex_lock(&hdev->mgmt_pending_lock);
+
+ if (!__mgmt_pending_listed(hdev, cmd)) {
+ mutex_unlock(&hdev->mgmt_pending_lock);
+ return -ECANCELED;
+ }
+
+ memcpy(&cp, cmd->param, sizeof(cp));
+
+ mutex_unlock(&hdev->mgmt_pending_lock);
+
+ len = cmd->param_len;
memset(hdev->mesh_ad_types, 0, sizeof(hdev->mesh_ad_types));
- if (cp->enable)
+ if (cp.enable)
hci_dev_set_flag(hdev, HCI_MESH);
else
hci_dev_clear_flag(hdev, HCI_MESH);
- hdev->le_scan_interval = __le16_to_cpu(cp->period);
- hdev->le_scan_window = __le16_to_cpu(cp->window);
+ hdev->le_scan_interval = __le16_to_cpu(cp.period);
+ hdev->le_scan_window = __le16_to_cpu(cp.window);
- len -= sizeof(*cp);
+ len -= sizeof(cp);
/* If filters don't fit, forward all adv pkts */
if (len <= sizeof(hdev->mesh_ad_types))
- memcpy(hdev->mesh_ad_types, cp->ad_types, len);
+ memcpy(hdev->mesh_ad_types, cp.ad_types, len);
hci_update_passive_scan_sync(hdev);
return 0;
@@ -3867,15 +3925,16 @@ static int name_changed_sync(struct hci_dev *hdev, void *data)
static void set_name_complete(struct hci_dev *hdev, void *data, int err)
{
struct mgmt_pending_cmd *cmd = data;
- struct mgmt_cp_set_local_name *cp = cmd->param;
+ struct mgmt_cp_set_local_name *cp;
u8 status = mgmt_status(err);
bt_dev_dbg(hdev, "err %d", err);
- if (err == -ECANCELED ||
- cmd != pending_find(MGMT_OP_SET_LOCAL_NAME, hdev))
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd))
return;
+ cp = cmd->param;
+
if (status) {
mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME,
status);
@@ -3887,13 +3946,27 @@ static void set_name_complete(struct hci_dev *hdev, void *data, int err)
hci_cmd_sync_queue(hdev, name_changed_sync, NULL, NULL);
}
- mgmt_pending_remove(cmd);
+ mgmt_pending_free(cmd);
}
static int set_name_sync(struct hci_dev *hdev, void *data)
{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_set_local_name cp;
+
+ mutex_lock(&hdev->mgmt_pending_lock);
+
+ if (!__mgmt_pending_listed(hdev, cmd)) {
+ mutex_unlock(&hdev->mgmt_pending_lock);
+ return -ECANCELED;
+ }
+
+ memcpy(&cp, cmd->param, sizeof(cp));
+
+ mutex_unlock(&hdev->mgmt_pending_lock);
+
if (lmp_bredr_capable(hdev)) {
- hci_update_name_sync(hdev);
+ hci_update_name_sync(hdev, cp.name);
hci_update_eir_sync(hdev);
}
@@ -4045,12 +4118,10 @@ int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip)
static void set_default_phy_complete(struct hci_dev *hdev, void *data, int err)
{
struct mgmt_pending_cmd *cmd = data;
- struct sk_buff *skb = cmd->skb;
+ struct sk_buff *skb;
u8 status = mgmt_status(err);
- if (err == -ECANCELED ||
- cmd != pending_find(MGMT_OP_SET_PHY_CONFIGURATION, hdev))
- return;
+ skb = cmd->skb;
if (!status) {
if (!skb)
@@ -4077,7 +4148,7 @@ static void set_default_phy_complete(struct hci_dev *hdev, void *data, int err)
if (skb && !IS_ERR(skb))
kfree_skb(skb);
- mgmt_pending_remove(cmd);
+ mgmt_pending_free(cmd);
}
static int set_default_phy_sync(struct hci_dev *hdev, void *data)
@@ -4085,7 +4156,9 @@ static int set_default_phy_sync(struct hci_dev *hdev, void *data)
struct mgmt_pending_cmd *cmd = data;
struct mgmt_cp_set_phy_configuration *cp = cmd->param;
struct hci_cp_le_set_default_phy cp_phy;
- u32 selected_phys = __le32_to_cpu(cp->selected_phys);
+ u32 selected_phys;
+
+ selected_phys = __le32_to_cpu(cp->selected_phys);
memset(&cp_phy, 0, sizeof(cp_phy));
@@ -4225,7 +4298,7 @@ static int set_phy_configuration(struct sock *sk, struct hci_dev *hdev,
goto unlock;
}
- cmd = mgmt_pending_add(sk, MGMT_OP_SET_PHY_CONFIGURATION, hdev, data,
+ cmd = mgmt_pending_new(sk, MGMT_OP_SET_PHY_CONFIGURATION, hdev, data,
len);
if (!cmd)
err = -ENOMEM;
@@ -4513,7 +4586,7 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev,
}
if (IS_ENABLED(CONFIG_BT_LE)) {
- flags = iso_enabled() ? BIT(0) : 0;
+ flags = iso_inited() ? BIT(0) : 0;
memcpy(rp->features[idx].uuid, iso_socket_uuid, 16);
rp->features[idx].flags = cpu_to_le32(flags);
idx++;
@@ -5186,7 +5259,17 @@ static void mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev,
{
struct mgmt_rp_add_adv_patterns_monitor rp;
struct mgmt_pending_cmd *cmd = data;
- struct adv_monitor *monitor = cmd->user_data;
+ struct adv_monitor *monitor;
+
+ /* This is likely the result of hdev being closed and mgmt_index_removed
+ * attempting to clean up any pending command, so hci_adv_monitors_clear
+ * is about to be called and will take care of freeing the adv_monitor
+ * instances.
+ */
+ if (status == -ECANCELED && !mgmt_pending_valid(hdev, cmd))
+ return;
+
+ monitor = cmd->user_data;
hci_dev_lock(hdev);
@@ -5212,9 +5295,20 @@ static void mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev,
static int mgmt_add_adv_patterns_monitor_sync(struct hci_dev *hdev, void *data)
{
struct mgmt_pending_cmd *cmd = data;
- struct adv_monitor *monitor = cmd->user_data;
+ struct adv_monitor *mon;
+
+ mutex_lock(&hdev->mgmt_pending_lock);
- return hci_add_adv_monitor(hdev, monitor);
+ if (!__mgmt_pending_listed(hdev, cmd)) {
+ mutex_unlock(&hdev->mgmt_pending_lock);
+ return -ECANCELED;
+ }
+
+ mon = cmd->user_data;
+
+ mutex_unlock(&hdev->mgmt_pending_lock);
+
+ return hci_add_adv_monitor(hdev, mon);
}
static int __add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev,
@@ -5481,7 +5575,8 @@ unlock:
status);
}
-static void read_local_oob_data_complete(struct hci_dev *hdev, void *data, int err)
+static void read_local_oob_data_complete(struct hci_dev *hdev, void *data,
+ int err)
{
struct mgmt_rp_read_local_oob_data mgmt_rp;
size_t rp_size = sizeof(mgmt_rp);
@@ -5501,7 +5596,8 @@ static void read_local_oob_data_complete(struct hci_dev *hdev, void *data, int e
bt_dev_dbg(hdev, "status %d", status);
if (status) {
- mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA, status);
+ mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA,
+ status);
goto remove;
}
@@ -5783,17 +5879,12 @@ static void start_discovery_complete(struct hci_dev *hdev, void *data, int err)
bt_dev_dbg(hdev, "err %d", err);
- if (err == -ECANCELED)
- return;
-
- if (cmd != pending_find(MGMT_OP_START_DISCOVERY, hdev) &&
- cmd != pending_find(MGMT_OP_START_LIMITED_DISCOVERY, hdev) &&
- cmd != pending_find(MGMT_OP_START_SERVICE_DISCOVERY, hdev))
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd))
return;
mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err),
cmd->param, 1);
- mgmt_pending_remove(cmd);
+ mgmt_pending_free(cmd);
hci_discovery_set_state(hdev, err ? DISCOVERY_STOPPED:
DISCOVERY_FINDING);
@@ -5801,6 +5892,9 @@ static void start_discovery_complete(struct hci_dev *hdev, void *data, int err)
static int start_discovery_sync(struct hci_dev *hdev, void *data)
{
+ if (!mgmt_pending_listed(hdev, data))
+ return -ECANCELED;
+
return hci_start_discovery_sync(hdev);
}
@@ -6006,15 +6100,14 @@ static void stop_discovery_complete(struct hci_dev *hdev, void *data, int err)
{
struct mgmt_pending_cmd *cmd = data;
- if (err == -ECANCELED ||
- cmd != pending_find(MGMT_OP_STOP_DISCOVERY, hdev))
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd))
return;
bt_dev_dbg(hdev, "err %d", err);
mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err),
cmd->param, 1);
- mgmt_pending_remove(cmd);
+ mgmt_pending_free(cmd);
if (!err)
hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
@@ -6022,6 +6115,9 @@ static void stop_discovery_complete(struct hci_dev *hdev, void *data, int err)
static int stop_discovery_sync(struct hci_dev *hdev, void *data)
{
+ if (!mgmt_pending_listed(hdev, data))
+ return -ECANCELED;
+
return hci_stop_discovery_sync(hdev);
}
@@ -6231,14 +6327,18 @@ static void enable_advertising_instance(struct hci_dev *hdev, int err)
static void set_advertising_complete(struct hci_dev *hdev, void *data, int err)
{
+ struct mgmt_pending_cmd *cmd = data;
struct cmd_lookup match = { NULL, hdev };
u8 instance;
struct adv_info *adv_instance;
u8 status = mgmt_status(err);
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, data))
+ return;
+
if (status) {
- mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev, true,
- cmd_status_rsp, &status);
+ mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, status);
+ mgmt_pending_free(cmd);
return;
}
@@ -6247,8 +6347,7 @@ static void set_advertising_complete(struct hci_dev *hdev, void *data, int err)
else
hci_dev_clear_flag(hdev, HCI_ADVERTISING);
- mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev, true, settings_rsp,
- &match);
+ settings_rsp(cmd, &match);
new_settings(hdev, match.sk);
@@ -6280,10 +6379,23 @@ static void set_advertising_complete(struct hci_dev *hdev, void *data, int err)
static int set_adv_sync(struct hci_dev *hdev, void *data)
{
struct mgmt_pending_cmd *cmd = data;
- struct mgmt_mode *cp = cmd->param;
- u8 val = !!cp->val;
+ struct mgmt_mode cp;
+ u8 val;
- if (cp->val == 0x02)
+ mutex_lock(&hdev->mgmt_pending_lock);
+
+ if (!__mgmt_pending_listed(hdev, cmd)) {
+ mutex_unlock(&hdev->mgmt_pending_lock);
+ return -ECANCELED;
+ }
+
+ memcpy(&cp, cmd->param, sizeof(cp));
+
+ mutex_unlock(&hdev->mgmt_pending_lock);
+
+ val = !!cp.val;
+
+ if (cp.val == 0x02)
hci_dev_set_flag(hdev, HCI_ADVERTISING_CONNECTABLE);
else
hci_dev_clear_flag(hdev, HCI_ADVERTISING_CONNECTABLE);
@@ -8036,10 +8148,6 @@ static void read_local_oob_ext_data_complete(struct hci_dev *hdev, void *data,
u8 status = mgmt_status(err);
u16 eir_len;
- if (err == -ECANCELED ||
- cmd != pending_find(MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev))
- return;
-
if (!status) {
if (!skb)
status = MGMT_STATUS_FAILED;
@@ -8146,7 +8254,7 @@ done:
kfree_skb(skb);
kfree(mgmt_rp);
- mgmt_pending_remove(cmd);
+ mgmt_pending_free(cmd);
}
static int read_local_ssp_oob_req(struct hci_dev *hdev, struct sock *sk,
@@ -8155,7 +8263,7 @@ static int read_local_ssp_oob_req(struct hci_dev *hdev, struct sock *sk,
struct mgmt_pending_cmd *cmd;
int err;
- cmd = mgmt_pending_add(sk, MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev,
+ cmd = mgmt_pending_new(sk, MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev,
cp, sizeof(*cp));
if (!cmd)
return -ENOMEM;
@@ -9705,7 +9813,9 @@ void mgmt_device_disconnected(struct hci_dev *hdev, bdaddr_t *bdaddr,
if (!mgmt_connected)
return;
- if (link_type != ACL_LINK && link_type != LE_LINK)
+ if (link_type != ACL_LINK &&
+ link_type != LE_LINK &&
+ link_type != BIS_LINK)
return;
bacpy(&ev.addr.bdaddr, bdaddr);
diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c
index a88a07da3947..aa7b5585cb26 100644
--- a/net/bluetooth/mgmt_util.c
+++ b/net/bluetooth/mgmt_util.c
@@ -320,6 +320,52 @@ void mgmt_pending_remove(struct mgmt_pending_cmd *cmd)
mgmt_pending_free(cmd);
}
+bool __mgmt_pending_listed(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd)
+{
+ struct mgmt_pending_cmd *tmp;
+
+ lockdep_assert_held(&hdev->mgmt_pending_lock);
+
+ if (!cmd)
+ return false;
+
+ list_for_each_entry(tmp, &hdev->mgmt_pending, list) {
+ if (cmd == tmp)
+ return true;
+ }
+
+ return false;
+}
+
+bool mgmt_pending_listed(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd)
+{
+ bool listed;
+
+ mutex_lock(&hdev->mgmt_pending_lock);
+ listed = __mgmt_pending_listed(hdev, cmd);
+ mutex_unlock(&hdev->mgmt_pending_lock);
+
+ return listed;
+}
+
+bool mgmt_pending_valid(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd)
+{
+ bool listed;
+
+ if (!cmd)
+ return false;
+
+ mutex_lock(&hdev->mgmt_pending_lock);
+
+ listed = __mgmt_pending_listed(hdev, cmd);
+ if (listed)
+ list_del(&cmd->list);
+
+ mutex_unlock(&hdev->mgmt_pending_lock);
+
+ return listed;
+}
+
void mgmt_mesh_foreach(struct hci_dev *hdev,
void (*cb)(struct mgmt_mesh_tx *mesh_tx, void *data),
void *data, struct sock *sk)
diff --git a/net/bluetooth/mgmt_util.h b/net/bluetooth/mgmt_util.h
index 024e51dd6937..bcba8c9d8952 100644
--- a/net/bluetooth/mgmt_util.h
+++ b/net/bluetooth/mgmt_util.h
@@ -65,6 +65,9 @@ struct mgmt_pending_cmd *mgmt_pending_new(struct sock *sk, u16 opcode,
void *data, u16 len);
void mgmt_pending_free(struct mgmt_pending_cmd *cmd);
void mgmt_pending_remove(struct mgmt_pending_cmd *cmd);
+bool __mgmt_pending_listed(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd);
+bool mgmt_pending_listed(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd);
+bool mgmt_pending_valid(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd);
void mgmt_mesh_foreach(struct hci_dev *hdev,
void (*cb)(struct mgmt_mesh_tx *mesh_tx, void *data),
void *data, struct sock *sk);
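The mgmt_pending_valid() helper declared here checks, under hdev->mgmt_pending_lock, that a pending command is still on the list and unlinks it within the same critical section, so exactly one completion path wins the right to finish and free the command. A minimal user-space sketch of the same claim-under-lock idiom, with a pthread mutex and a singly linked list standing in for the kernel primitives (all names are illustrative):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct cmd {
    struct cmd *next;
    int opcode;
};

static pthread_mutex_t pending_lock = PTHREAD_MUTEX_INITIALIZER;
static struct cmd *pending_head;

/* Return true and unlink cmd if it is still listed; only one caller
 * can win, so only the winner may complete and free it. */
static bool pending_claim(struct cmd *cmd)
{
    struct cmd **pp;
    bool listed = false;

    pthread_mutex_lock(&pending_lock);
    for (pp = &pending_head; *pp; pp = &(*pp)->next) {
        if (*pp == cmd) {
            *pp = cmd->next;    /* unlink while we hold the lock */
            listed = true;
            break;
        }
    }
    pthread_mutex_unlock(&pending_lock);
    return listed;
}

int main(void)
{
    struct cmd *c = calloc(1, sizeof(*c));

    c->next = pending_head;
    pending_head = c;

    printf("first claim: %d\n", pending_claim(c));   /* 1: winner frees */
    printf("second claim: %d\n", pending_claim(c));  /* 0: already gone */
    free(c);
    return 0;
}

Run standalone it prints 1 then 0: the first caller claims the command and may free it, the second sees it already gone and backs off.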
diff --git a/net/bridge/br.c b/net/bridge/br.c
index 1885d0c315f0..c683baa3847f 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -324,6 +324,13 @@ int br_boolopt_multi_toggle(struct net_bridge *br,
int err = 0;
int opt_id;
+ opt_id = find_next_bit(&bitmap, BITS_PER_LONG, BR_BOOLOPT_MAX);
+ if (opt_id != BITS_PER_LONG) {
+ NL_SET_ERR_MSG_FMT_MOD(extack, "Unknown boolean option %d",
+ opt_id);
+ return -EINVAL;
+ }
+
for_each_set_bit(opt_id, &bitmap, BR_BOOLOPT_MAX) {
bool on = !!(bm->optval & BIT(opt_id));
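The added check rejects a request whose bitmap sets any bit at or above BR_BOOLOPT_MAX, i.e. an option this kernel does not know about, instead of silently ignoring it. A short user-space sketch of the same idea, using a plain mask test rather than find_next_bit() (BOOLOPT_MAX and validate_boolopts() are illustrative only):

#include <stdio.h>

#define BOOLOPT_MAX 3   /* illustrative: only bits 0..2 are defined */

/* Reject a request whose mask sets any bit at or above BOOLOPT_MAX. */
static int validate_boolopts(unsigned long mask)
{
    unsigned long unknown = mask & ~((1UL << BOOLOPT_MAX) - 1);

    if (unknown) {
        fprintf(stderr, "unknown option bit %d\n",
                __builtin_ctzl(unknown));
        return -1;
    }
    return 0;
}

int main(void)
{
    printf("%d\n", validate_boolopts(0x5UL));   /* bits 0,2: known -> 0 */
    printf("%d\n", validate_boolopts(0x9UL));   /* bit 3 unknown -> -1 */
    return 0;
}

A mask of 0x5 (bits 0 and 2) passes, while 0x9 is rejected because bit 3 lies above the highest known option.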
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 1377f31b719c..8ce145938b02 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -4818,6 +4818,14 @@ void br_multicast_set_query_intvl(struct net_bridge_mcast *brmctx,
intvl_jiffies = BR_MULTICAST_QUERY_INTVL_MIN;
}
+ if (intvl_jiffies > BR_MULTICAST_QUERY_INTVL_MAX) {
+ br_info(brmctx->br,
+ "trying to set multicast query interval above maximum, setting to %lu (%ums)\n",
+ jiffies_to_clock_t(BR_MULTICAST_QUERY_INTVL_MAX),
+ jiffies_to_msecs(BR_MULTICAST_QUERY_INTVL_MAX));
+ intvl_jiffies = BR_MULTICAST_QUERY_INTVL_MAX;
+ }
+
brmctx->multicast_query_interval = intvl_jiffies;
}
@@ -4834,6 +4842,14 @@ void br_multicast_set_startup_query_intvl(struct net_bridge_mcast *brmctx,
intvl_jiffies = BR_MULTICAST_STARTUP_QUERY_INTVL_MIN;
}
+ if (intvl_jiffies > BR_MULTICAST_STARTUP_QUERY_INTVL_MAX) {
+ br_info(brmctx->br,
+ "trying to set multicast startup query interval above maximum, setting to %lu (%ums)\n",
+ jiffies_to_clock_t(BR_MULTICAST_STARTUP_QUERY_INTVL_MAX),
+ jiffies_to_msecs(BR_MULTICAST_STARTUP_QUERY_INTVL_MAX));
+ intvl_jiffies = BR_MULTICAST_STARTUP_QUERY_INTVL_MAX;
+ }
+
brmctx->multicast_startup_query_interval = intvl_jiffies;
}
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 94cbe967d1c1..083e2fe96441 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -626,9 +626,6 @@ static unsigned int br_nf_local_in(void *priv,
break;
}
- ct = container_of(nfct, struct nf_conn, ct_general);
- WARN_ON_ONCE(!nf_ct_is_confirmed(ct));
-
return ret;
}
#endif
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index b159aae594c0..8de0904b9627 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -31,6 +31,8 @@
#define BR_MULTICAST_DEFAULT_HASH_MAX 4096
#define BR_MULTICAST_QUERY_INTVL_MIN msecs_to_jiffies(1000)
#define BR_MULTICAST_STARTUP_QUERY_INTVL_MIN BR_MULTICAST_QUERY_INTVL_MIN
+#define BR_MULTICAST_QUERY_INTVL_MAX msecs_to_jiffies(86400000) /* 24 hours */
+#define BR_MULTICAST_STARTUP_QUERY_INTVL_MAX BR_MULTICAST_QUERY_INTVL_MAX
#define BR_HWDOM_MAX BITS_PER_LONG
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index 60f28e4fb5c0..4fd5a6ea26b4 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -43,6 +43,7 @@ config NF_CONNTRACK_BRIDGE
config BRIDGE_NF_EBTABLES_LEGACY
tristate "Legacy EBTABLES support"
depends on BRIDGE && NETFILTER_XTABLES_LEGACY
+ depends on NETFILTER_XTABLES
default n
help
Legacy ebtables packet/frame classifier.
diff --git a/net/can/j1939/bus.c b/net/can/j1939/bus.c
index 39844f14eed8..797719cb227e 100644
--- a/net/can/j1939/bus.c
+++ b/net/can/j1939/bus.c
@@ -290,8 +290,11 @@ int j1939_local_ecu_get(struct j1939_priv *priv, name_t name, u8 sa)
if (!ecu)
ecu = j1939_ecu_create_locked(priv, name);
err = PTR_ERR_OR_ZERO(ecu);
- if (err)
+ if (err) {
+ if (j1939_address_is_unicast(sa))
+ priv->ents[sa].nusers--;
goto done;
+ }
ecu->nusers++;
/* TODO: do we care if ecu->addr != sa? */
diff --git a/net/can/j1939/j1939-priv.h b/net/can/j1939/j1939-priv.h
index 31a93cae5111..81f58924b4ac 100644
--- a/net/can/j1939/j1939-priv.h
+++ b/net/can/j1939/j1939-priv.h
@@ -212,6 +212,7 @@ void j1939_priv_get(struct j1939_priv *priv);
/* notify/alert all j1939 sockets bound to ifindex */
void j1939_sk_netdev_event_netdown(struct j1939_priv *priv);
+void j1939_sk_netdev_event_unregister(struct j1939_priv *priv);
int j1939_cancel_active_session(struct j1939_priv *priv, struct sock *sk);
void j1939_tp_init(struct j1939_priv *priv);
diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c
index 7e8a20f2fc42..3706a872ecaf 100644
--- a/net/can/j1939/main.c
+++ b/net/can/j1939/main.c
@@ -377,6 +377,9 @@ static int j1939_netdev_notify(struct notifier_block *nb,
j1939_sk_netdev_event_netdown(priv);
j1939_ecu_unmap_all(priv);
break;
+ case NETDEV_UNREGISTER:
+ j1939_sk_netdev_event_unregister(priv);
+ break;
}
j1939_priv_put(priv);
diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c
index 3d8b588822f9..88e7160d4248 100644
--- a/net/can/j1939/socket.c
+++ b/net/can/j1939/socket.c
@@ -521,6 +521,9 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len)
ret = j1939_local_ecu_get(priv, jsk->addr.src_name, jsk->addr.sa);
if (ret) {
j1939_netdev_stop(priv);
+ jsk->priv = NULL;
+ synchronize_rcu();
+ j1939_priv_put(priv);
goto out_release_sock;
}
@@ -1300,6 +1303,55 @@ void j1939_sk_netdev_event_netdown(struct j1939_priv *priv)
read_unlock_bh(&priv->j1939_socks_lock);
}
+void j1939_sk_netdev_event_unregister(struct j1939_priv *priv)
+{
+ struct sock *sk;
+ struct j1939_sock *jsk;
+ bool wait_rcu = false;
+
+rescan: /* The caller is holding a ref on this "priv" via j1939_priv_get_by_ndev(). */
+ read_lock_bh(&priv->j1939_socks_lock);
+ list_for_each_entry(jsk, &priv->j1939_socks, list) {
+ /* Skip if j1939_jsk_add() has not been called on this socket. */
+ if (!(jsk->state & J1939_SOCK_BOUND))
+ continue;
+ sk = &jsk->sk;
+ sock_hold(sk);
+ read_unlock_bh(&priv->j1939_socks_lock);
+ /* Check whether j1939_jsk_del() has not yet been called on this socket
+ * after taking the socket's lock, for both j1939_sk_bind() and
+ * j1939_sk_release() call j1939_jsk_del() with the socket's lock held.
+ */
+ lock_sock(sk);
+ if (jsk->state & J1939_SOCK_BOUND) {
+ /* Neither j1939_sk_bind() nor j1939_sk_release() called j1939_jsk_del().
+ * Make this socket no longer bound by pretending that j1939_sk_bind()
+ * dropped the old references but did not take new ones.
+ */
+ j1939_jsk_del(priv, jsk);
+ j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa);
+ j1939_netdev_stop(priv);
+ /* Call j1939_priv_put() now and prevent j1939_sk_sock_destruct() from
+ * calling the corresponding j1939_priv_put().
+ *
+ * j1939_sk_sock_destruct() is supposed to call j1939_priv_put() after
+ * an RCU grace period. But since the caller is holding a ref on this
+ * "priv", we can defer synchronize_rcu() until immediately before
+ * the caller calls j1939_priv_put().
+ */
+ j1939_priv_put(priv);
+ jsk->priv = NULL;
+ wait_rcu = true;
+ }
+ release_sock(sk);
+ sock_put(sk);
+ goto rescan;
+ }
+ read_unlock_bh(&priv->j1939_socks_lock);
+ if (wait_rcu)
+ synchronize_rcu();
+}
+
static int j1939_sk_no_ioctlcmd(struct socket *sock, unsigned int cmd,
unsigned long arg)
{
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index d1b5705dc0c6..9f6d860411cb 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1524,7 +1524,7 @@ static void con_fault_finish(struct ceph_connection *con)
* in case we faulted due to authentication, invalidate our
* current tickets so that we can get new ones.
*/
- if (con->v1.auth_retry) {
+ if (!ceph_msgr2(from_msgr(con->msgr)) && con->v1.auth_retry) {
dout("auth_retry %d, invalidating\n", con->v1.auth_retry);
if (con->ops->invalidate_authorizer)
con->ops->invalidate_authorizer(con);
@@ -1714,9 +1714,10 @@ static void clear_standby(struct ceph_connection *con)
{
/* come back from STANDBY? */
if (con->state == CEPH_CON_S_STANDBY) {
- dout("clear_standby %p and ++connect_seq\n", con);
+ dout("clear_standby %p\n", con);
con->state = CEPH_CON_S_PREOPEN;
- con->v1.connect_seq++;
+ if (!ceph_msgr2(from_msgr(con->msgr)))
+ con->v1.connect_seq++;
WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING));
WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING));
}
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 94cc4705e91d..f474b9b120f9 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -618,6 +618,20 @@ fault:
}
EXPORT_SYMBOL(skb_copy_datagram_from_iter);
+int skb_copy_datagram_from_iter_full(struct sk_buff *skb, int offset,
+ struct iov_iter *from, int len)
+{
+ struct iov_iter_state state;
+ int ret;
+
+ iov_iter_save_state(from, &state);
+ ret = skb_copy_datagram_from_iter(skb, offset, from, len);
+ if (ret)
+ iov_iter_restore(from, &state);
+ return ret;
+}
+EXPORT_SYMBOL(skb_copy_datagram_from_iter_full);
+
int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
struct iov_iter *from, size_t length)
{
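skb_copy_datagram_from_iter_full() added above gives the copy all-or-nothing semantics: it snapshots the iterator with iov_iter_save_state() and restores it on failure, so a short copy never leaves the caller with a partially consumed iov_iter. A compact user-space sketch of that save/restore pattern with a toy cursor instead of an iov_iter (struct iter, copy_some() and copy_full() are illustrative names):

#include <stdio.h>
#include <string.h>

/* A toy iterator: a buffer plus a cursor that partial copies advance. */
struct iter {
    const char *buf;
    size_t len;
    size_t pos;
};

/* Pretend lower-level copy: may fail midway, leaving pos advanced. */
static int copy_some(char *dst, struct iter *it, size_t want)
{
    size_t avail = it->len - it->pos;
    size_t n = want < avail ? want : avail;

    memcpy(dst, it->buf + it->pos, n);
    it->pos += n;
    return n == want ? 0 : -1;  /* short copy reported as failure */
}

/* "Full" variant: snapshot the cursor and roll it back on failure,
 * so callers never see a partially consumed iterator. */
static int copy_full(char *dst, struct iter *it, size_t want)
{
    size_t saved = it->pos;
    int ret = copy_some(dst, it, want);

    if (ret)
        it->pos = saved;
    return ret;
}

int main(void)
{
    char out[8];
    struct iter it = { .buf = "abcdef", .len = 6, .pos = 0 };
    int ret;

    ret = copy_full(out, &it, 8);               /* too much: fails */
    printf("ret=%d pos=%zu\n", ret, it.pos);    /* ret=-1 pos=0 */

    ret = copy_full(out, &it, 4);               /* fits: succeeds */
    printf("ret=%d pos=%zu\n", ret, it.pos);    /* ret=0 pos=4 */
    return 0;
}

The first call asks for more than the iterator holds, fails, and leaves the cursor where it started; the second succeeds and advances it.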
diff --git a/net/core/dev.c b/net/core/dev.c
index 68dc47d7e700..8d49b2198d07 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3779,6 +3779,18 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb,
features &= ~NETIF_F_TSO_MANGLEID;
}
+ /* NETIF_F_IPV6_CSUM does not support IPv6 extension headers,
+ * so neither does TSO, which depends on it.
+ */
+ if (features & NETIF_F_IPV6_CSUM &&
+ (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6 ||
+ (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 &&
+ vlan_get_protocol(skb) == htons(ETH_P_IPV6))) &&
+ skb_transport_header_was_set(skb) &&
+ skb_network_header_len(skb) != sizeof(struct ipv6hdr) &&
+ !ipv6_has_hopopt_jumbo(skb))
+ features &= ~(NETIF_F_IPV6_CSUM | NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4);
+
return features;
}
@@ -6953,7 +6965,7 @@ static void napi_stop_kthread(struct napi_struct *napi)
* the kthread.
*/
while (true) {
- if (!test_bit(NAPIF_STATE_SCHED_THREADED, &napi->state))
+ if (!test_bit(NAPI_STATE_SCHED_THREADED, &napi->state))
break;
msleep(20);
@@ -6999,7 +7011,7 @@ int netif_set_threaded(struct net_device *dev,
enum netdev_napi_threaded threaded)
{
struct napi_struct *napi;
- int err = 0;
+ int i, err = 0;
netdev_assert_locked_or_invisible(dev);
@@ -7021,6 +7033,10 @@ int netif_set_threaded(struct net_device *dev,
list_for_each_entry(napi, &dev->napi_list, dev_list)
WARN_ON_ONCE(napi_set_threaded(napi, threaded));
+ /* Override the config for all NAPIs, even those not currently listed */
+ for (i = 0; i < dev->num_napi_configs; i++)
+ dev->napi_config[i].threaded = threaded;
+
return err;
}
@@ -7353,8 +7369,9 @@ void netif_napi_add_weight_locked(struct net_device *dev,
* Clear dev->threaded if kthread creation failed so that
* threaded mode will not be enabled in napi_enable().
*/
- if (dev->threaded && napi_kthread_create(napi))
- dev->threaded = NETDEV_NAPI_THREADED_DISABLED;
+ if (napi_get_threaded_config(dev, napi))
+ if (napi_kthread_create(napi))
+ dev->threaded = NETDEV_NAPI_THREADED_DISABLED;
netif_napi_set_irq_locked(napi, -1);
}
EXPORT_SYMBOL(netif_napi_add_weight_locked);
@@ -11873,6 +11890,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
goto free_all;
dev->cfg_pending = dev->cfg;
+ dev->num_napi_configs = maxqs;
napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config));
dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT);
if (!dev->napi_config)
diff --git a/net/core/dev.h b/net/core/dev.h
index ab69edc0c3e3..d6b08d435479 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -323,6 +323,14 @@ static inline enum netdev_napi_threaded napi_get_threaded(struct napi_struct *n)
return NETDEV_NAPI_THREADED_DISABLED;
}
+static inline enum netdev_napi_threaded
+napi_get_threaded_config(struct net_device *dev, struct napi_struct *n)
+{
+ if (n->config)
+ return n->config->threaded;
+ return dev->threaded;
+}
+
int napi_set_threaded(struct napi_struct *n,
enum netdev_napi_threaded threaded);
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 9c0ad7f4b5d8..ad54b12d4b4c 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -464,8 +464,15 @@ int generic_hwtstamp_get_lower(struct net_device *dev,
if (!netif_device_present(dev))
return -ENODEV;
- if (ops->ndo_hwtstamp_get)
- return dev_get_hwtstamp_phylib(dev, kernel_cfg);
+ if (ops->ndo_hwtstamp_get) {
+ int err;
+
+ netdev_lock_ops(dev);
+ err = dev_get_hwtstamp_phylib(dev, kernel_cfg);
+ netdev_unlock_ops(dev);
+
+ return err;
+ }
/* Legacy path: unconverted lower driver */
return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg);
@@ -481,8 +488,15 @@ int generic_hwtstamp_set_lower(struct net_device *dev,
if (!netif_device_present(dev))
return -ENODEV;
- if (ops->ndo_hwtstamp_set)
- return dev_set_hwtstamp_phylib(dev, kernel_cfg, extack);
+ if (ops->ndo_hwtstamp_set) {
+ int err;
+
+ netdev_lock_ops(dev);
+ err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack);
+ netdev_unlock_ops(dev);
+
+ return err;
+ }
/* Legacy path: unconverted lower driver */
return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg);
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 7d426a8e29f3..f112156db587 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -90,10 +90,12 @@ static void est_timer(struct timer_list *t)
rate = (b_packets - est->last_packets) << (10 - est->intvl_log);
rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log);
+ preempt_disable_nested();
write_seqcount_begin(&est->seq);
est->avbps += brate;
est->avpps += rate;
write_seqcount_end(&est->seq);
+ preempt_enable_nested();
est->last_bytes = b_bytes;
est->last_packets = b_packets;
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index c28cd6665444..3c2dc4c5e683 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1328,7 +1328,7 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
struct netdev_rx_queue *queue = &dev->_rx[i];
struct kobject *kobj = &queue->kobj;
- if (!refcount_read(&dev_net(dev)->ns.count))
+ if (!check_net(dev_net(dev)))
kobj->uevent_suppress = 1;
if (dev->sysfs_rx_queue_group)
sysfs_remove_group(kobj, dev->sysfs_rx_queue_group);
@@ -2061,7 +2061,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
while (--i >= new_num) {
struct netdev_queue *queue = dev->_tx + i;
- if (!refcount_read(&dev_net(dev)->ns.count))
+ if (!check_net(dev_net(dev)))
queue->kobj.uevent_suppress = 1;
if (netdev_uses_bql(dev))
@@ -2315,7 +2315,7 @@ void netdev_unregister_kobject(struct net_device *ndev)
{
struct device *dev = &ndev->dev;
- if (!refcount_read(&dev_net(ndev)->ns.count))
+ if (!check_net(dev_net(ndev)))
dev_set_uevent_suppress(dev, 1);
kobject_get(&dev->kobj);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 1b6f3826dd0e..b0e0f22d7b21 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -20,6 +20,7 @@
#include <linux/sched/task.h>
#include <linux/uidgid.h>
#include <linux/proc_fs.h>
+#include <linux/nstree.h>
#include <net/aligned_data.h>
#include <net/sock.h>
@@ -314,7 +315,7 @@ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp)
{
int id;
- if (refcount_read(&net->ns.count) == 0)
+ if (!check_net(net))
return NETNSA_NSID_NOT_ASSIGNED;
spin_lock(&net->nsid_lock);
@@ -397,10 +398,15 @@ static __net_init void preinit_net_sysctl(struct net *net)
}
/* init code that must occur even if setup_net() is not called. */
-static __net_init void preinit_net(struct net *net, struct user_namespace *user_ns)
+static __net_init int preinit_net(struct net *net, struct user_namespace *user_ns)
{
+ int ret;
+
+ ret = ns_common_init(net);
+ if (ret)
+ return ret;
+
refcount_set(&net->passive, 1);
- refcount_set(&net->ns.count, 1);
ref_tracker_dir_init(&net->refcnt_tracker, 128, "net_refcnt");
ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net_notrefcnt");
@@ -420,6 +426,7 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_
INIT_LIST_HEAD(&net->ptype_all);
INIT_LIST_HEAD(&net->ptype_specific);
preinit_net_sysctl(net);
+ return 0;
}
/*
@@ -432,7 +439,7 @@ static __net_init int setup_net(struct net *net)
LIST_HEAD(net_exit_list);
int error = 0;
- net->net_cookie = atomic64_inc_return(&net_aligned_data.net_cookie);
+ net->net_cookie = ns_tree_gen_id(&net->ns);
list_for_each_entry(ops, &pernet_list, list) {
error = ops_init(ops, net);
@@ -442,6 +449,7 @@ static __net_init int setup_net(struct net *net)
down_write(&net_rwsem);
list_add_tail_rcu(&net->list, &net_namespace_list);
up_write(&net_rwsem);
+ ns_tree_add_raw(net);
out:
return error;
@@ -539,7 +547,7 @@ void net_drop_ns(void *p)
net_passive_dec(net);
}
-struct net *copy_net_ns(unsigned long flags,
+struct net *copy_net_ns(u64 flags,
struct user_namespace *user_ns, struct net *old_net)
{
struct ucounts *ucounts;
@@ -559,7 +567,9 @@ struct net *copy_net_ns(unsigned long flags,
goto dec_ucounts;
}
- preinit_net(net, user_ns);
+ rv = preinit_net(net, user_ns);
+ if (rv < 0)
+ goto dec_ucounts;
net->ucounts = ucounts;
get_user_ns(user_ns);
@@ -573,6 +583,7 @@ struct net *copy_net_ns(unsigned long flags,
if (rv < 0) {
put_userns:
+ ns_common_free(net);
#ifdef CONFIG_KEYS
key_remove_domain(net->key_domain);
#endif
@@ -659,8 +670,10 @@ static void cleanup_net(struct work_struct *work)
/* Don't let anyone else find us. */
down_write(&net_rwsem);
- llist_for_each_entry(net, net_kill_list, cleanup_list)
+ llist_for_each_entry(net, net_kill_list, cleanup_list) {
+ ns_tree_remove(net);
list_del_rcu(&net->list);
+ }
/* Cache last net. After we unlock rtnl, no one new net
* added to net_namespace_list can assign nsid pointer
* to a net from net_kill_list (see peernet2id_alloc()).
@@ -693,6 +706,7 @@ static void cleanup_net(struct work_struct *work)
/* Finally it is safe to free my network namespace structure */
list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
list_del_init(&net->exit_list);
+ ns_common_free(net);
dec_net_namespaces(net->ucounts);
#ifdef CONFIG_KEYS
key_remove_domain(net->key_domain);
@@ -812,31 +826,12 @@ static void net_ns_net_debugfs(struct net *net)
static __net_init int net_ns_net_init(struct net *net)
{
-#ifdef CONFIG_NET_NS
- net->ns.ops = &netns_operations;
-#endif
- net->ns.inum = PROC_NET_INIT_INO;
- if (net != &init_net) {
- int ret = ns_alloc_inum(&net->ns);
- if (ret)
- return ret;
- }
net_ns_net_debugfs(net);
return 0;
}
-static __net_exit void net_ns_net_exit(struct net *net)
-{
- /*
- * Initial network namespace doesn't exit so we don't need any
- * special checks here.
- */
- ns_free_inum(&net->ns);
-}
-
static struct pernet_operations __net_initdata net_ns_ops = {
.init = net_ns_net_init,
- .exit = net_ns_net_exit,
};
static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
@@ -1282,7 +1277,12 @@ void __init net_ns_init(void)
#ifdef CONFIG_KEYS
init_net.key_domain = &init_net_key_domain;
#endif
- preinit_net(&init_net, &init_user_ns);
+ /*
+ * This currently cannot fail as the initial network namespace
+ * has a static inode number.
+ */
+ if (preinit_net(&init_net, &init_user_ns))
+ panic("Could not preinitialize the initial network namespace");
down_write(&pernet_ops_rwsem);
if (setup_net(&init_net))
@@ -1517,11 +1517,6 @@ static struct ns_common *netns_get(struct task_struct *task)
return net ? &net->ns : NULL;
}
-static inline struct net *to_net_ns(struct ns_common *ns)
-{
- return container_of(ns, struct net, ns);
-}
-
static void netns_put(struct ns_common *ns)
{
put_net(to_net_ns(ns));
@@ -1548,7 +1543,6 @@ static struct user_namespace *netns_owner(struct ns_common *ns)
const struct proc_ns_operations netns_operations = {
.name = "net",
- .type = CLONE_NEWNET,
.get = netns_get,
.put = netns_put,
.install = netns_install,
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 05e2e22a8f7c..ba70569bd4b0 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -287,8 +287,10 @@ static int page_pool_init(struct page_pool *pool,
}
if (pool->mp_ops) {
- if (!pool->dma_map || !pool->dma_sync)
- return -EOPNOTSUPP;
+ if (!pool->dma_map || !pool->dma_sync) {
+ err = -EOPNOTSUPP;
+ goto free_ptr_ring;
+ }
if (WARN_ON(!is_kernel_rodata((unsigned long)pool->mp_ops))) {
err = -EFAULT;
@@ -1201,6 +1203,35 @@ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
pool->xdp_mem_id = mem->id;
}
+/**
+ * page_pool_enable_direct_recycling() - mark page pool as owned by NAPI
+ * @pool: page pool to modify
+ * @napi: NAPI instance to associate the page pool with
+ *
+ * Associate a page pool with a NAPI instance for lockless page recycling.
+ * This is useful when a new page pool has to be added to a NAPI instance
+ * without disabling that NAPI instance, to mark the point at which the
+ * control path "hands over" the page pool to the NAPI instance. In most
+ * cases the driver can simply set the @napi field in struct
+ * page_pool_params and does not have to call this helper.
+ *
+ * The function is idempotent, but does not implement any refcounting.
+ * A single page_pool_disable_direct_recycling() call will disable recycling,
+ * no matter how many times enable was called.
+ */
+void page_pool_enable_direct_recycling(struct page_pool *pool,
+ struct napi_struct *napi)
+{
+ if (READ_ONCE(pool->p.napi) == napi)
+ return;
+ WARN_ON(!napi || pool->p.napi);
+
+ mutex_lock(&page_pools_lock);
+ WRITE_ONCE(pool->p.napi, napi);
+ mutex_unlock(&page_pools_lock);
+}
+EXPORT_SYMBOL(page_pool_enable_direct_recycling);
+
void page_pool_disable_direct_recycling(struct page_pool *pool)
{
/* Disable direct recycling based on pool->cpuid.
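The new page_pool_enable_direct_recycling() kernel-doc above spells out two properties: enabling is idempotent, and there is no refcounting, so a single disable clears the association regardless of how many enables preceded it. A tiny user-space model of exactly those two properties (struct pool, struct napi and the helpers here are illustrative stand-ins, not the kernel structures):

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct napi { int id; };
struct pool { struct napi *napi; };

/* Idempotent: re-associating with the same napi is a no-op.
 * No refcounting: the association is a single pointer. */
static void pool_enable_direct(struct pool *p, struct napi *napi)
{
    if (p->napi == napi)
        return;
    assert(napi && !p->napi);   /* mirrors the WARN_ON in the patch */
    p->napi = napi;
}

/* One disable call clears the association no matter how many
 * times enable was called beforehand. */
static void pool_disable_direct(struct pool *p)
{
    p->napi = NULL;
}

int main(void)
{
    struct napi n = { .id = 1 };
    struct pool p = { .napi = NULL };

    pool_enable_direct(&p, &n);
    pool_enable_direct(&p, &n);                 /* idempotent, still fine */
    printf("associated: %d\n", p.napi != NULL); /* 1 */

    pool_disable_direct(&p);
    printf("associated: %d\n", p.napi != NULL); /* 0 */
    return 0;
}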
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ee0274417948..1c0279b9cb9f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -6667,7 +6667,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
return NULL;
while (data_len) {
- if (nr_frags == MAX_SKB_FRAGS - 1)
+ if (nr_frags == MAX_SKB_FRAGS)
goto failure;
while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order))
order--;
diff --git a/net/core/sock.c b/net/core/sock.c
index 7c26ec8dce63..158bddd23134 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2780,28 +2780,6 @@ void sock_pfree(struct sk_buff *skb)
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
-unsigned long __sock_i_ino(struct sock *sk)
-{
- unsigned long ino;
-
- read_lock(&sk->sk_callback_lock);
- ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
- read_unlock(&sk->sk_callback_lock);
- return ino;
-}
-EXPORT_SYMBOL(__sock_i_ino);
-
-unsigned long sock_i_ino(struct sock *sk)
-{
- unsigned long ino;
-
- local_bh_disable();
- ino = __sock_i_ino(sk);
- local_bh_enable();
- return ino;
-}
-EXPORT_SYMBOL(sock_i_ino);
-
/*
* Allocate a skb from the socket's send buffer.
*/
diff --git a/net/devlink/port.c b/net/devlink/port.c
index 939081a0e615..cb8d4df61619 100644
--- a/net/devlink/port.c
+++ b/net/devlink/port.c
@@ -1519,7 +1519,7 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
struct devlink_port_attrs *attrs = &devlink_port->attrs;
int n = 0;
- if (!devlink_port->attrs_set)
+ if (!devlink_port->attrs_set || devlink_port->attrs.no_phys_port_name)
return -EOPNOTSUPP;
switch (attrs->flavour) {
diff --git a/net/devlink/rate.c b/net/devlink/rate.c
index 110b3fa8a0b1..264fb82cba19 100644
--- a/net/devlink/rate.c
+++ b/net/devlink/rate.c
@@ -34,7 +34,7 @@ devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info)
static struct devlink_rate *
devlink_rate_node_get_by_name(struct devlink *devlink, const char *node_name)
{
- static struct devlink_rate *devlink_rate;
+ struct devlink_rate *devlink_rate;
list_for_each_entry(devlink_rate, &devlink->rate_list, list) {
if (devlink_rate_is_node(devlink_rate) &&
@@ -819,8 +819,8 @@ EXPORT_SYMBOL_GPL(devl_rate_leaf_destroy);
*/
void devl_rate_nodes_destroy(struct devlink *devlink)
{
- static struct devlink_rate *devlink_rate, *tmp;
const struct devlink_ops *ops = devlink->ops;
+ struct devlink_rate *devlink_rate, *tmp;
devl_assert_locked(devlink);
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index 4f58648a27ad..92e6a681c797 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -905,7 +905,7 @@ int ethtool_net_get_ts_info_by_phc(struct net_device *dev,
int err;
if (!ops->get_ts_info)
- return -ENODEV;
+ return -EOPNOTSUPP;
/* Does ptp comes from netdev */
ethtool_init_tsinfo(info);
@@ -973,7 +973,7 @@ int ethtool_get_ts_info_by_phc(struct net_device *dev,
int err;
err = ethtool_net_get_ts_info_by_phc(dev, info, hwprov_desc);
- if (err == -ENODEV) {
+ if (err == -ENODEV || err == -EOPNOTSUPP) {
struct phy_device *phy;
phy = ethtool_phy_get_ts_info_by_phc(dev, info, hwprov_desc);
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index 88657255fec1..fbbc3ccf9df6 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -49,7 +49,7 @@ static bool hsr_check_carrier(struct hsr_port *master)
ASSERT_RTNL();
- hsr_for_each_port(master->hsr, port) {
+ hsr_for_each_port_rtnl(master->hsr, port) {
if (port->type != HSR_PT_MASTER && is_slave_up(port->dev)) {
netif_carrier_on(master->dev);
return true;
@@ -105,7 +105,7 @@ int hsr_get_max_mtu(struct hsr_priv *hsr)
struct hsr_port *port;
mtu_max = ETH_DATA_LEN;
- hsr_for_each_port(hsr, port)
+ hsr_for_each_port_rtnl(hsr, port)
if (port->type != HSR_PT_MASTER)
mtu_max = min(port->dev->mtu, mtu_max);
@@ -139,7 +139,7 @@ static int hsr_dev_open(struct net_device *dev)
hsr = netdev_priv(dev);
- hsr_for_each_port(hsr, port) {
+ hsr_for_each_port_rtnl(hsr, port) {
if (port->type == HSR_PT_MASTER)
continue;
switch (port->type) {
@@ -172,7 +172,7 @@ static int hsr_dev_close(struct net_device *dev)
struct hsr_priv *hsr;
hsr = netdev_priv(dev);
- hsr_for_each_port(hsr, port) {
+ hsr_for_each_port_rtnl(hsr, port) {
if (port->type == HSR_PT_MASTER)
continue;
switch (port->type) {
@@ -205,7 +205,7 @@ static netdev_features_t hsr_features_recompute(struct hsr_priv *hsr,
* may become enabled.
*/
features &= ~NETIF_F_ONE_FOR_ALL;
- hsr_for_each_port(hsr, port)
+ hsr_for_each_port_rtnl(hsr, port)
features = netdev_increment_features(features,
port->dev->features,
mask);
@@ -226,6 +226,7 @@ static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev)
struct hsr_priv *hsr = netdev_priv(dev);
struct hsr_port *master;
+ rcu_read_lock();
master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
if (master) {
skb->dev = master->dev;
@@ -238,6 +239,8 @@ static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev)
dev_core_stats_tx_dropped_inc(dev);
dev_kfree_skb_any(skb);
}
+ rcu_read_unlock();
+
return NETDEV_TX_OK;
}
@@ -484,7 +487,7 @@ static void hsr_set_rx_mode(struct net_device *dev)
hsr = netdev_priv(dev);
- hsr_for_each_port(hsr, port) {
+ hsr_for_each_port_rtnl(hsr, port) {
if (port->type == HSR_PT_MASTER)
continue;
switch (port->type) {
@@ -506,7 +509,7 @@ static void hsr_change_rx_flags(struct net_device *dev, int change)
hsr = netdev_priv(dev);
- hsr_for_each_port(hsr, port) {
+ hsr_for_each_port_rtnl(hsr, port) {
if (port->type == HSR_PT_MASTER)
continue;
switch (port->type) {
@@ -534,7 +537,7 @@ static int hsr_ndo_vlan_rx_add_vid(struct net_device *dev,
hsr = netdev_priv(dev);
- hsr_for_each_port(hsr, port) {
+ hsr_for_each_port_rtnl(hsr, port) {
if (port->type == HSR_PT_MASTER ||
port->type == HSR_PT_INTERLINK)
continue;
@@ -580,7 +583,7 @@ static int hsr_ndo_vlan_rx_kill_vid(struct net_device *dev,
hsr = netdev_priv(dev);
- hsr_for_each_port(hsr, port) {
+ hsr_for_each_port_rtnl(hsr, port) {
switch (port->type) {
case HSR_PT_SLAVE_A:
case HSR_PT_SLAVE_B:
@@ -672,9 +675,14 @@ struct net_device *hsr_get_port_ndev(struct net_device *ndev,
struct hsr_priv *hsr = netdev_priv(ndev);
struct hsr_port *port;
+ rcu_read_lock();
hsr_for_each_port(hsr, port)
- if (port->type == pt)
+ if (port->type == pt) {
+ dev_hold(port->dev);
+ rcu_read_unlock();
return port->dev;
+ }
+ rcu_read_unlock();
return NULL;
}
EXPORT_SYMBOL(hsr_get_port_ndev);
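
hsr_get_port_ndev() above now takes dev_hold() on the matching slave while still inside rcu_read_lock(), so the device cannot disappear between the lookup and the caller's use of it; callers are expected to drop the reference with dev_put(). The general shape of the pattern, with hypothetical names:

    struct example_port {
            struct list_head list;
            struct net_device *dev;
            bool active;
    };

    struct example_priv {
            struct list_head ports;         /* RCU-protected list */
    };

    struct net_device *example_get_active_dev(struct example_priv *priv)
    {
            struct example_port *port;
            struct net_device *dev = NULL;

            rcu_read_lock();
            list_for_each_entry_rcu(port, &priv->ports, list) {
                    if (port->active) {
                            dev = port->dev;
                            dev_hold(dev);  /* pin before leaving the RCU section */
                            break;
                    }
            }
            rcu_read_unlock();

            return dev;     /* caller balances with dev_put() */
    }
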
diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c
index 192893c3f2ec..bc94b07101d8 100644
--- a/net/hsr/hsr_main.c
+++ b/net/hsr/hsr_main.c
@@ -22,7 +22,7 @@ static bool hsr_slave_empty(struct hsr_priv *hsr)
{
struct hsr_port *port;
- hsr_for_each_port(hsr, port)
+ hsr_for_each_port_rtnl(hsr, port)
if (port->type != HSR_PT_MASTER)
return false;
return true;
@@ -134,7 +134,7 @@ struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt)
{
struct hsr_port *port;
- hsr_for_each_port(hsr, port)
+ hsr_for_each_port_rtnl(hsr, port)
if (port->type == pt)
return port;
return NULL;
diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h
index 135ec5fce019..33b0d2460c9b 100644
--- a/net/hsr/hsr_main.h
+++ b/net/hsr/hsr_main.h
@@ -224,6 +224,9 @@ struct hsr_priv {
#define hsr_for_each_port(hsr, port) \
list_for_each_entry_rcu((port), &(hsr)->ports, port_list)
+#define hsr_for_each_port_rtnl(hsr, port) \
+ list_for_each_entry_rcu((port), &(hsr)->ports, port_list, lockdep_rtnl_is_held())
+
struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt);
/* Caller must ensure skb is a valid HSR frame */
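
The new hsr_for_each_port_rtnl() above uses the optional fourth argument of list_for_each_entry_rcu() to tell RCU-lockdep that holding the RTNL lock is also a valid way to walk the list, so rtnl-protected control paths no longer need a spurious rcu_read_lock(). The same idiom for a hypothetical list whose writers hold a private mutex:

    struct bar_entry {
            struct list_head list;
            int value;
    };

    struct bar_table {
            struct mutex lock;              /* writer-side lock */
            struct list_head entries;       /* RCU-protected list */
    };

    /* legal under rcu_read_lock() or with tbl->lock held */
    #define bar_for_each_entry(tbl, e) \
            list_for_each_entry_rcu((e), &(tbl)->entries, list, \
                                    lockdep_is_held(&(tbl)->lock))
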
diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c
index b87b6a6fe070..102eccf5ead7 100644
--- a/net/hsr/hsr_slave.c
+++ b/net/hsr/hsr_slave.c
@@ -63,8 +63,14 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb)
skb_push(skb, ETH_HLEN);
skb_reset_mac_header(skb);
if ((!hsr->prot_version && protocol == htons(ETH_P_PRP)) ||
- protocol == htons(ETH_P_HSR))
+ protocol == htons(ETH_P_HSR)) {
+ if (!pskb_may_pull(skb, ETH_HLEN + HSR_HLEN)) {
+ kfree_skb(skb);
+ goto finish_consume;
+ }
+
skb_set_network_header(skb, ETH_HLEN + HSR_HLEN);
+ }
skb_reset_mac_len(skb);
/* Only the frames received over the interlink port will assign a
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index c47d3828d4f6..942a887bf089 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -340,14 +340,13 @@ static void inetdev_destroy(struct in_device *in_dev)
static int __init inet_blackhole_dev_init(void)
{
- int err = 0;
+ struct in_device *in_dev;
rtnl_lock();
- if (!inetdev_init(blackhole_netdev))
- err = -ENOMEM;
+ in_dev = inetdev_init(blackhole_netdev);
rtnl_unlock();
- return err;
+ return PTR_ERR_OR_ZERO(in_dev);
}
late_initcall(inet_blackhole_dev_init);
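
inet_blackhole_dev_init() above is corrected to propagate the real error: inetdev_init() reports failure with an ERR_PTR-encoded pointer rather than NULL, and PTR_ERR_OR_ZERO() turns that into 0 or a negative errno in one step. Equivalent open-coded form, with a hypothetical constructor:

    static int example_init(void)
    {
            struct example_dev *d = example_create();   /* ptr or ERR_PTR(-E...) */

            if (IS_ERR(d))
                    return PTR_ERR(d);

            return 0;       /* i.e. PTR_ERR_OR_ZERO(d) */
    }
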
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 2ffe73ea644f..c48c572f024d 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -799,11 +799,12 @@ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
struct sk_buff *cloned_skb = NULL;
struct ip_options opts = { 0 };
enum ip_conntrack_info ctinfo;
+ enum ip_conntrack_dir dir;
struct nf_conn *ct;
__be32 orig_ip;
ct = nf_ct_get(skb_in, &ctinfo);
- if (!ct || !(ct->status & IPS_SRC_NAT)) {
+ if (!ct || !(READ_ONCE(ct->status) & IPS_NAT_MASK)) {
__icmp_send(skb_in, type, code, info, &opts);
return;
}
@@ -818,7 +819,8 @@ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
goto out;
orig_ip = ip_hdr(skb_in)->saddr;
- ip_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.ip;
+ dir = CTINFO2DIR(ctinfo);
+ ip_hdr(skb_in)->saddr = ct->tuplehash[dir].tuple.src.u3.ip;
__icmp_send(skb_in, type, code, info, &opts);
ip_hdr(skb_in)->saddr = orig_ip;
out:
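
The icmp_ndo_send() hunks above widen the check to any NAT (IPS_NAT_MASK) and pick the conntrack tuple that belongs to the packet's own direction instead of hard-coding the ORIGINAL one. Condensed, the address selection the patch performs looks roughly like this (sketch, not the full function):

    static __be32 example_pre_nat_saddr(const struct sk_buff *skb)
    {
            enum ip_conntrack_info ctinfo;
            const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
            enum ip_conntrack_dir dir;

            if (!ct || !(READ_ONCE(ct->status) & IPS_NAT_MASK))
                    return 0;

            dir = CTINFO2DIR(ctinfo);
            return ct->tuplehash[dir].tuple.src.u3.ip;
    }
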
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 875ff923a8ed..56a117560c0c 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -329,13 +329,13 @@ restart:
TCPF_NEW_SYN_RECV))
continue;
- if (refcount_read(&sock_net(sk)->ns.count))
+ if (check_net(sock_net(sk)))
continue;
if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
continue;
- if (refcount_read(&sock_net(sk)->ns.count)) {
+ if (check_net(sock_net(sk))) {
sock_gen_put(sk);
goto restart;
}
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index cc9915543637..2e61ac137128 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -206,6 +206,9 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu)
if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr)))
return -EINVAL;
+ if (skb_is_gso(skb))
+ skb_gso_reset(skb);
+
skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
pskb_pull(skb, ETH_HLEN);
skb_reset_network_header(skb);
@@ -300,6 +303,9 @@ static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu)
if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr)))
return -EINVAL;
+ if (skb_is_gso(skb))
+ skb_gso_reset(skb);
+
skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
pskb_pull(skb, ETH_HLEN);
skb_reset_network_header(skb);
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 2c438b140e88..7dc9772fe2d8 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -14,6 +14,7 @@ config NF_DEFRAG_IPV4
config IP_NF_IPTABLES_LEGACY
tristate "Legacy IP tables support"
depends on NETFILTER_XTABLES_LEGACY
+ depends on NETFILTER_XTABLES
default m if NETFILTER_XTABLES_LEGACY
help
iptables is a legacy packet classifier.
@@ -326,6 +327,7 @@ endif # IP_NF_IPTABLES
config IP_NF_ARPTABLES
tristate "Legacy ARPTABLES support"
depends on NETFILTER_XTABLES_LEGACY
+ depends on NETFILTER_XTABLES
default n
help
arptables is a legacy packet classifier.
@@ -343,6 +345,7 @@ config IP_NF_ARPFILTER
select IP_NF_ARPTABLES
select NETFILTER_FAMILY_ARP
depends on NETFILTER_XTABLES_LEGACY
+ depends on NETFILTER_XTABLES
help
ARP packet filtering defines a table `filter', which has a series of
rules for simple ARP packet filtering at local input and
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index 87fd945a0d27..0d3cb2ba6fc8 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -247,8 +247,7 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb,
if (!oth)
return;
- if ((hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) &&
- nf_reject_fill_skb_dst(oldskb) < 0)
+ if (!skb_dst(oldskb) && nf_reject_fill_skb_dst(oldskb) < 0)
return;
if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
@@ -321,8 +320,7 @@ void nf_send_unreach(struct sk_buff *skb_in, int code, int hook)
if (iph->frag_off & htons(IP_OFFSET))
return;
- if ((hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) &&
- nf_reject_fill_skb_dst(skb_in) < 0)
+ if (!skb_dst(skb_in) && nf_reject_fill_skb_dst(skb_in) < 0)
return;
if (skb_csum_unnecessary(skb_in) ||
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 29118c43ebf5..34137768e7f9 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -2399,6 +2399,13 @@ static int replace_nexthop_single(struct net *net, struct nexthop *old,
return -EINVAL;
}
+ if (!list_empty(&old->grp_list) &&
+ rtnl_dereference(new->nh_info)->fdb_nh !=
+ rtnl_dereference(old->nh_info)->fdb_nh) {
+ NL_SET_ERR_MSG(extack, "Cannot change nexthop FDB status while in a group");
+ return -EINVAL;
+ }
+
err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack);
if (err)
return err;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f639a2ae881a..baa43e5966b1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2575,12 +2575,16 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
!netif_is_l3_master(dev_out))
return ERR_PTR(-EINVAL);
- if (ipv4_is_lbcast(fl4->daddr))
+ if (ipv4_is_lbcast(fl4->daddr)) {
type = RTN_BROADCAST;
- else if (ipv4_is_multicast(fl4->daddr))
+
+ /* reset fi to prevent gateway resolution */
+ fi = NULL;
+ } else if (ipv4_is_multicast(fl4->daddr)) {
type = RTN_MULTICAST;
- else if (ipv4_is_zeronet(fl4->daddr))
+ } else if (ipv4_is_zeronet(fl4->daddr)) {
return ERR_PTR(-EINVAL);
+ }
if (dev_out->flags & IFF_LOOPBACK)
flags |= RTCF_LOCAL;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 71a956fbfc55..ad76556800f2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3327,6 +3327,7 @@ int tcp_disconnect(struct sock *sk, int flags)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int old_state = sk->sk_state;
+ struct request_sock *req;
u32 seq;
if (old_state != TCP_CLOSE)
@@ -3442,6 +3443,10 @@ int tcp_disconnect(struct sock *sk, int flags)
/* Clean up fastopen related fields */
+ req = rcu_dereference_protected(tp->fastopen_rsk,
+ lockdep_sock_is_held(sk));
+ if (req)
+ reqsk_fastopen_remove(sk, req, false);
tcp_free_fastopen_req(tp);
inet_clear_bit(DEFER_CONNECT, sk);
tp->fastopen_client_fail = 0;
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
index bbb8d5f0eae7..3338b6cc85c4 100644
--- a/net/ipv4/tcp_ao.c
+++ b/net/ipv4/tcp_ao.c
@@ -1178,7 +1178,9 @@ void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb)
if (!ao)
return;
- WRITE_ONCE(ao->risn, tcp_hdr(skb)->seq);
+ /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */
+ if (skb)
+ WRITE_ONCE(ao->risn, tcp_hdr(skb)->seq);
ao->rcv_sne = 0;
hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk))
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index ba581785adb4..a268e1595b22 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -408,8 +408,11 @@ more_data:
if (!psock->cork) {
psock->cork = kzalloc(sizeof(*psock->cork),
GFP_ATOMIC | __GFP_NOWARN);
- if (!psock->cork)
+ if (!psock->cork) {
+ sk_msg_free(sk, msg);
+ *copied = 0;
return -ENOMEM;
+ }
}
memcpy(psock->cork, msg, sizeof(*msg));
return 0;
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 03c068ea27b6..b67f94c60f9f 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -912,7 +912,7 @@ static void tcp_metrics_flush_all(struct net *net)
spin_lock_bh(&tcp_metrics_lock);
for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) {
match = net ? net_eq(tm_net(tm), net) :
- !refcount_read(&tm_net(tm)->ns.count);
+ !check_net(tm_net(tm));
if (match) {
rcu_assign_pointer(*pp, tm->tcpm_next);
kfree_rcu(tm, rcu_head);
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 5128e2a5b00a..b1f3fd302e9d 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -217,7 +217,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
skb->remcsum_offload = remcsum;
- need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
+ need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb);
/* Try to offload checksum if possible */
offload_csum = !!(need_csum &&
!need_ipsec &&
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index d1ef9644f826..a23eb8734e15 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -494,10 +494,8 @@ static int ipv6_rpl_srh_rcv(struct sk_buff *skb)
idev = __in6_dev_get(skb->dev);
- accept_rpl_seg = net->ipv6.devconf_all->rpl_seg_enabled;
- if (accept_rpl_seg > idev->cnf.rpl_seg_enabled)
- accept_rpl_seg = idev->cnf.rpl_seg_enabled;
-
+ accept_rpl_seg = min(READ_ONCE(net->ipv6.devconf_all->rpl_seg_enabled),
+ READ_ONCE(idev->cnf.rpl_seg_enabled));
if (!accept_rpl_seg) {
kfree_skb(skb);
return -1;
diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c
index 9e3574880cb0..233914b63bdb 100644
--- a/net/ipv6/ip6_icmp.c
+++ b/net/ipv6/ip6_icmp.c
@@ -54,11 +54,12 @@ void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info)
struct inet6_skb_parm parm = { 0 };
struct sk_buff *cloned_skb = NULL;
enum ip_conntrack_info ctinfo;
+ enum ip_conntrack_dir dir;
struct in6_addr orig_ip;
struct nf_conn *ct;
ct = nf_ct_get(skb_in, &ctinfo);
- if (!ct || !(ct->status & IPS_SRC_NAT)) {
+ if (!ct || !(READ_ONCE(ct->status) & IPS_NAT_MASK)) {
__icmpv6_send(skb_in, type, code, info, &parm);
return;
}
@@ -73,7 +74,8 @@ void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info)
goto out;
orig_ip = ipv6_hdr(skb_in)->saddr;
- ipv6_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.in6;
+ dir = CTINFO2DIR(ctinfo);
+ ipv6_hdr(skb_in)->saddr = ct->tuplehash[dir].tuple.src.u3.in6;
__icmpv6_send(skb_in, type, code, info, &parm);
ipv6_hdr(skb_in)->saddr = orig_ip;
out:
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 276860f65baa..81daf82ddc2d 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -10,6 +10,7 @@ menu "IPv6: Netfilter Configuration"
config IP6_NF_IPTABLES_LEGACY
tristate "Legacy IP6 tables support"
depends on INET && IPV6 && NETFILTER_XTABLES_LEGACY
+ depends on NETFILTER_XTABLES
default m if NETFILTER_XTABLES_LEGACY
help
ip6tables is a legacy packet classifier.
diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
index 838295fa32e3..cb2d38e80de9 100644
--- a/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -293,7 +293,7 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb,
fl6.fl6_sport = otcph->dest;
fl6.fl6_dport = otcph->source;
- if (hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) {
+ if (!skb_dst(oldskb)) {
nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false);
if (!dst)
return;
@@ -397,8 +397,7 @@ void nf_send_unreach6(struct net *net, struct sk_buff *skb_in,
if (hooknum == NF_INET_LOCAL_OUT && skb_in->dev == NULL)
skb_in->dev = net->loopback_dev;
- if ((hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_INGRESS) &&
- nf_reject6_fill_skb_dst(skb_in) < 0)
+ if (!skb_dst(skb_in) && nf_reject6_fill_skb_dst(skb_in) < 0)
return;
icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0);
diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
index f78ecb6ad838..fd58426f222b 100644
--- a/net/ipv6/seg6_hmac.c
+++ b/net/ipv6/seg6_hmac.c
@@ -35,6 +35,7 @@
#include <net/xfrm.h>
#include <crypto/hash.h>
+#include <crypto/utils.h>
#include <net/seg6.h>
#include <net/genetlink.h>
#include <net/seg6_hmac.h>
@@ -280,7 +281,7 @@ bool seg6_hmac_validate_skb(struct sk_buff *skb)
if (seg6_hmac_compute(hinfo, srh, &ipv6_hdr(skb)->saddr, hmac_output))
return false;
- if (memcmp(hmac_output, tlv->hmac, SEG6_HMAC_FIELD_LEN) != 0)
+ if (crypto_memneq(hmac_output, tlv->hmac, SEG6_HMAC_FIELD_LEN))
return false;
return true;
@@ -304,6 +305,9 @@ int seg6_hmac_info_add(struct net *net, u32 key, struct seg6_hmac_info *hinfo)
struct seg6_pernet_data *sdata = seg6_pernet(net);
int err;
+ if (!__hmac_get_algo(hinfo->alg_id))
+ return -EINVAL;
+
err = rhashtable_lookup_insert_fast(&sdata->hmac_infos, &hinfo->node,
rht_params);
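
seg6_hmac_validate_skb() above switches the HMAC comparison from memcmp() to crypto_memneq(), which runs in constant time and so does not leak how many leading digest bytes matched. The usual idiom for comparing any computed and received MAC (sketch, digest length left to the caller):

    #include <linux/types.h>
    #include <crypto/utils.h>

    static bool example_digest_ok(const u8 *computed, const u8 *received,
                                  size_t len)
    {
            /* non-zero on mismatch, evaluated in constant time */
            return !crypto_memneq(computed, received, len);
    }
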
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 7577e7eb2c97..e885629312a4 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1431,17 +1431,17 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
ireq = inet_rsk(req);
if (sk_acceptq_is_full(sk))
- goto out_overflow;
+ goto exit_overflow;
if (!dst) {
dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_TCP);
if (!dst)
- goto out;
+ goto exit;
}
newsk = tcp_create_openreq_child(sk, req, skb);
if (!newsk)
- goto out_nonewsk;
+ goto exit_nonewsk;
/*
* No need to charge this sock to the relevant IPv6 refcnt debug socks
@@ -1525,25 +1525,19 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
const union tcp_md5_addr *addr;
addr = (union tcp_md5_addr *)&newsk->sk_v6_daddr;
- if (tcp_md5_key_copy(newsk, addr, AF_INET6, 128, l3index, key)) {
- inet_csk_prepare_forced_close(newsk);
- tcp_done(newsk);
- goto out;
- }
+ if (tcp_md5_key_copy(newsk, addr, AF_INET6, 128, l3index, key))
+ goto put_and_exit;
}
}
#endif
#ifdef CONFIG_TCP_AO
/* Copy over tcp_ao_info if any */
if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET6))
- goto out; /* OOM */
+ goto put_and_exit; /* OOM */
#endif
- if (__inet_inherit_port(sk, newsk) < 0) {
- inet_csk_prepare_forced_close(newsk);
- tcp_done(newsk);
- goto out;
- }
+ if (__inet_inherit_port(sk, newsk) < 0)
+ goto put_and_exit;
*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
&found_dup_sk);
if (*own_req) {
@@ -1570,13 +1564,17 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
return newsk;
-out_overflow:
+exit_overflow:
__NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
-out_nonewsk:
+exit_nonewsk:
dst_release(dst);
-out:
+exit:
tcp_listendrop(sk);
return NULL;
+put_and_exit:
+ inet_csk_prepare_forced_close(newsk);
+ tcp_done(newsk);
+ goto exit;
}
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index 5120a763da0d..0a0eeaed0591 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -334,7 +334,7 @@ static void __net_exit xfrm6_tunnel_net_exit(struct net *net)
struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net);
unsigned int i;
- xfrm_state_flush(net, IPSEC_PROTO_ANY, false);
+ xfrm_state_flush(net, 0, false);
xfrm_flush_gc();
for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++)
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index a4971e6fa943..b4f01cb07561 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -430,7 +430,7 @@ static void psock_write_space(struct sock *sk)
/* Check if the socket is reserved so someone is waiting for sending. */
kcm = psock->tx_kcm;
- if (kcm && !unlikely(kcm->tx_stopped))
+ if (kcm)
queue_work(kcm_wq, &kcm->tx_work);
spin_unlock_bh(&mux->lock);
@@ -1693,12 +1693,6 @@ static int kcm_release(struct socket *sock)
*/
__skb_queue_purge(&sk->sk_write_queue);
- /* Set tx_stopped. This is checked when psock is bound to a kcm and we
- * get a writespace callback. This prevents further work being queued
- * from the callback (unbinding the psock occurs after canceling work.
- */
- kcm->tx_stopped = 1;
-
release_sock(sk);
spin_lock_bh(&mux->lock);
@@ -1714,7 +1708,7 @@ static int kcm_release(struct socket *sock)
/* Cancel work. After this point there should be no outside references
* to the kcm socket.
*/
- cancel_work_sync(&kcm->tx_work);
+ disable_work_sync(&kcm->tx_work);
lock_sock(sk);
psock = kcm->tx_psock;
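
The kcm_release() hunks above replace cancel_work_sync() with disable_work_sync(), which also leaves the work item disabled afterwards: later queue_work() calls from the write-space callback are simply ignored, which is why the separate tx_stopped flag could be dropped. Teardown sketch with a hypothetical private struct:

    static void example_teardown(struct example_priv *priv)
    {
            /* cancels pending work, waits for a running instance, and keeps
             * the item disabled so concurrent queue_work() calls do nothing
             */
            disable_work_sync(&priv->tx_work);
    }
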
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index fc5c2fd8f34c..5e12e7ce17d8 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -129,22 +129,12 @@ static const struct ppp_channel_ops pppol2tp_chan_ops = {
static const struct proto_ops pppol2tp_ops;
-/* Retrieves the pppol2tp socket associated to a session.
- * A reference is held on the returned socket, so this function must be paired
- * with sock_put().
- */
+/* Retrieves the pppol2tp socket associated to a session. */
static struct sock *pppol2tp_session_get_sock(struct l2tp_session *session)
{
struct pppol2tp_session *ps = l2tp_session_priv(session);
- struct sock *sk;
-
- rcu_read_lock();
- sk = rcu_dereference(ps->sk);
- if (sk)
- sock_hold(sk);
- rcu_read_unlock();
- return sk;
+ return rcu_dereference(ps->sk);
}
/* Helpers to obtain tunnel/session contexts from sockets.
@@ -206,14 +196,13 @@ end:
static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int data_len)
{
- struct pppol2tp_session *ps = l2tp_session_priv(session);
- struct sock *sk = NULL;
+ struct sock *sk;
/* If the socket is bound, send it in to PPP's input queue. Otherwise
* queue it on the session socket.
*/
rcu_read_lock();
- sk = rcu_dereference(ps->sk);
+ sk = pppol2tp_session_get_sock(session);
if (!sk)
goto no_sock;
@@ -510,13 +499,14 @@ static void pppol2tp_show(struct seq_file *m, void *arg)
struct l2tp_session *session = arg;
struct sock *sk;
+ rcu_read_lock();
sk = pppol2tp_session_get_sock(session);
if (sk) {
struct pppox_sock *po = pppox_sk(sk);
seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan));
- sock_put(sk);
}
+ rcu_read_unlock();
}
static void pppol2tp_session_init(struct l2tp_session *session)
@@ -1530,6 +1520,7 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v)
port = ntohs(inet->inet_sport);
}
+ rcu_read_lock();
sk = pppol2tp_session_get_sock(session);
if (sk) {
state = sk->sk_state;
@@ -1565,8 +1556,8 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v)
struct pppox_sock *po = pppox_sk(sk);
seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan));
- sock_put(sk);
}
+ rcu_read_unlock();
}
static int pppol2tp_seq_show(struct seq_file *m, void *v)
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 181bcb34b795..55105d238d6b 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -1416,7 +1416,7 @@ drv_get_ftm_responder_stats(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata,
struct cfg80211_ftm_responder_stats *ftm_stats)
{
- u32 ret = -EOPNOTSUPP;
+ int ret = -EOPNOTSUPP;
might_sleep();
lockdep_assert_wiphy(local->hw.wiphy);
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 9c8f18b258a6..3ae6104e5cb2 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -1111,7 +1111,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
int result, i;
enum nl80211_band band;
int channels, max_bitrates;
- bool supp_ht, supp_vht, supp_he, supp_eht;
+ bool supp_ht, supp_vht, supp_he, supp_eht, supp_s1g;
struct cfg80211_chan_def dflt_chandef = {};
if (ieee80211_hw_check(hw, QUEUE_CONTROL) &&
@@ -1227,6 +1227,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
supp_vht = false;
supp_he = false;
supp_eht = false;
+ supp_s1g = false;
for (band = 0; band < NUM_NL80211_BANDS; band++) {
const struct ieee80211_sband_iftype_data *iftd;
struct ieee80211_supported_band *sband;
@@ -1274,6 +1275,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
max_bitrates = sband->n_bitrates;
supp_ht = supp_ht || sband->ht_cap.ht_supported;
supp_vht = supp_vht || sband->vht_cap.vht_supported;
+ supp_s1g = supp_s1g || sband->s1g_cap.s1g;
for_each_sband_iftype_data(sband, i, iftd) {
u8 he_40_mhz_cap;
@@ -1406,6 +1408,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
local->scan_ies_len +=
2 + sizeof(struct ieee80211_vht_cap);
+ if (supp_s1g)
+ local->scan_ies_len += 2 + sizeof(struct ieee80211_s1g_cap);
+
/*
* HE cap element is variable in size - set len to allow max size */
if (supp_he) {
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 1008eb8e9b13..dd650a127a31 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1189,6 +1189,14 @@ again:
"required MCSes not supported, disabling EHT\n");
}
+ if (conn->mode >= IEEE80211_CONN_MODE_EHT &&
+ channel->band != NL80211_BAND_2GHZ &&
+ conn->bw_limit == IEEE80211_CONN_BW_LIMIT_40) {
+ conn->mode = IEEE80211_CONN_MODE_HE;
+ link_id_info(sdata, link_id,
+ "required bandwidth not supported, disabling EHT\n");
+ }
+
/* the mode can only decrease, so this must terminate */
if (ap_mode != conn->mode) {
kfree(elems);
diff --git a/net/mac80211/tests/chan-mode.c b/net/mac80211/tests/chan-mode.c
index 96c7b3ab2744..adc069065e73 100644
--- a/net/mac80211/tests/chan-mode.c
+++ b/net/mac80211/tests/chan-mode.c
@@ -2,7 +2,7 @@
/*
* KUnit tests for channel mode functions
*
- * Copyright (C) 2024 Intel Corporation
+ * Copyright (C) 2024-2025 Intel Corporation
*/
#include <net/cfg80211.h>
#include <kunit/test.h>
@@ -28,6 +28,10 @@ static const struct determine_chan_mode_case {
u8 vht_basic_mcs_1_4, vht_basic_mcs_5_8;
u8 he_basic_mcs_1_4, he_basic_mcs_5_8;
u8 eht_mcs7_min_nss;
+ u16 eht_disabled_subchannels;
+ u8 eht_bw;
+ enum ieee80211_conn_bw_limit conn_bw_limit;
+ enum ieee80211_conn_bw_limit expected_bw_limit;
int error;
} determine_chan_mode_cases[] = {
{
@@ -128,6 +132,14 @@ static const struct determine_chan_mode_case {
.conn_mode = IEEE80211_CONN_MODE_EHT,
.eht_mcs7_min_nss = 0x15,
.error = EINVAL,
+ }, {
+ .desc = "80 MHz EHT is downgraded to 40 MHz HE due to puncturing",
+ .conn_mode = IEEE80211_CONN_MODE_EHT,
+ .expected_mode = IEEE80211_CONN_MODE_HE,
+ .conn_bw_limit = IEEE80211_CONN_BW_LIMIT_80,
+ .expected_bw_limit = IEEE80211_CONN_BW_LIMIT_40,
+ .eht_disabled_subchannels = 0x08,
+ .eht_bw = IEEE80211_EHT_OPER_CHAN_WIDTH_80MHZ,
}
};
KUNIT_ARRAY_PARAM_DESC(determine_chan_mode, determine_chan_mode_cases, desc)
@@ -138,7 +150,7 @@ static void test_determine_chan_mode(struct kunit *test)
struct t_sdata *t_sdata = T_SDATA(test);
struct ieee80211_conn_settings conn = {
.mode = params->conn_mode,
- .bw_limit = IEEE80211_CONN_BW_LIMIT_20,
+ .bw_limit = params->conn_bw_limit,
};
struct cfg80211_bss cbss = {
.channel = &t_sdata->band_5ghz.channels[0],
@@ -191,14 +203,21 @@ static void test_determine_chan_mode(struct kunit *test)
0x7f, 0x01, 0x00, 0x88, 0x88, 0x88, 0x00, 0x00,
0x00,
/* EHT Operation */
- WLAN_EID_EXTENSION, 0x09, WLAN_EID_EXT_EHT_OPERATION,
- 0x01, params->eht_mcs7_min_nss ? params->eht_mcs7_min_nss : 0x11,
- 0x00, 0x00, 0x00, 0x00, 0x24, 0x00,
+ WLAN_EID_EXTENSION, 0x0b, WLAN_EID_EXT_EHT_OPERATION,
+ 0x03, params->eht_mcs7_min_nss ? params->eht_mcs7_min_nss : 0x11,
+ 0x00, 0x00, 0x00, params->eht_bw,
+ params->eht_bw == IEEE80211_EHT_OPER_CHAN_WIDTH_80MHZ ? 42 : 36,
+ 0x00,
+ u16_get_bits(params->eht_disabled_subchannels, 0xff),
+ u16_get_bits(params->eht_disabled_subchannels, 0xff00),
};
struct ieee80211_chan_req chanreq = {};
struct cfg80211_chan_def ap_chandef = {};
struct ieee802_11_elems *elems;
+ /* To force EHT downgrade to HE on punctured 80 MHz downgraded to 40 MHz */
+ set_bit(IEEE80211_HW_DISALLOW_PUNCTURING, t_sdata->local.hw.flags);
+
if (params->strict)
set_bit(IEEE80211_HW_STRICT, t_sdata->local.hw.flags);
else
@@ -237,6 +256,7 @@ static void test_determine_chan_mode(struct kunit *test)
} else {
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, elems);
KUNIT_ASSERT_EQ(test, conn.mode, params->expected_mode);
+ KUNIT_ASSERT_EQ(test, conn.bw_limit, params->expected_bw_limit);
}
}
diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c
index df4e8cf33899..685524800d70 100644
--- a/net/mctp/af_mctp.c
+++ b/net/mctp/af_mctp.c
@@ -425,7 +425,7 @@ static int mctp_getsockopt(struct socket *sock, int level, int optname,
return 0;
}
- return -EINVAL;
+ return -ENOPROTOOPT;
}
/* helpers for reading/writing the tag ioc, handling compatibility across the
diff --git a/net/mctp/route.c b/net/mctp/route.c
index 2b2b958ef6a3..4d314e062ba9 100644
--- a/net/mctp/route.c
+++ b/net/mctp/route.c
@@ -378,6 +378,7 @@ static void mctp_skb_set_flow(struct sk_buff *skb, struct mctp_sk_key *key) {}
static void mctp_flow_prepare_output(struct sk_buff *skb, struct mctp_dev *dev) {}
#endif
+/* takes ownership of skb, both in success and failure cases */
static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb)
{
struct mctp_hdr *hdr = mctp_hdr(skb);
@@ -387,8 +388,10 @@ static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb)
& MCTP_HDR_SEQ_MASK;
if (!key->reasm_head) {
- /* Since we're manipulating the shared frag_list, ensure it isn't
- * shared with any other SKBs.
+ /* Since we're manipulating the shared frag_list, ensure it
+ * isn't shared with any other SKBs. In the cloned case,
+ * this will free the skb; callers can no longer access it
+ * safely.
*/
key->reasm_head = skb_unshare(skb, GFP_ATOMIC);
if (!key->reasm_head)
@@ -402,10 +405,10 @@ static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb)
exp_seq = (key->last_seq + 1) & MCTP_HDR_SEQ_MASK;
if (this_seq != exp_seq)
- return -EINVAL;
+ goto err_free;
if (key->reasm_head->len + skb->len > mctp_message_maxlen)
- return -EINVAL;
+ goto err_free;
skb->next = NULL;
skb->sk = NULL;
@@ -419,6 +422,10 @@ static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb)
key->reasm_head->truesize += skb->truesize;
return 0;
+
+err_free:
+ kfree_skb(skb);
+ return -EINVAL;
}
static int mctp_dst_input(struct mctp_dst *dst, struct sk_buff *skb)
@@ -532,18 +539,16 @@ static int mctp_dst_input(struct mctp_dst *dst, struct sk_buff *skb)
* key isn't observable yet
*/
mctp_frag_queue(key, skb);
+ skb = NULL;
/* if the key_add fails, we've raced with another
* SOM packet with the same src, dest and tag. There's
* no way to distinguish future packets, so all we
- * can do is drop; we'll free the skb on exit from
- * this function.
+ * can do is drop.
*/
rc = mctp_key_add(key, msk);
- if (!rc) {
+ if (!rc)
trace_mctp_key_acquire(key);
- skb = NULL;
- }
/* we don't need to release key->lock on exit, so
* clean up here and suppress the unlock via
@@ -561,8 +566,7 @@ static int mctp_dst_input(struct mctp_dst *dst, struct sk_buff *skb)
key = NULL;
} else {
rc = mctp_frag_queue(key, skb);
- if (!rc)
- skb = NULL;
+ skb = NULL;
}
}
@@ -572,17 +576,16 @@ static int mctp_dst_input(struct mctp_dst *dst, struct sk_buff *skb)
*/
/* we need to be continuing an existing reassembly... */
- if (!key->reasm_head)
+ if (!key->reasm_head) {
rc = -EINVAL;
- else
+ } else {
rc = mctp_frag_queue(key, skb);
+ skb = NULL;
+ }
if (rc)
goto out_unlock;
- /* we've queued; the queue owns the skb now */
- skb = NULL;
-
/* end of message? deliver to socket, and we're done with
* the reassembly/response key
*/
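
mctp_frag_queue() above now consumes the skb on every outcome, whether it becomes the reassembly head, is appended to it, or is freed on a sequence/length error, and the call sites clear their local pointer unconditionally instead of only on success. The shape of such an ownership-transferring helper, with hypothetical names:

    struct example_reasm {
            struct sk_buff_head frags;
            unsigned int max_len;
    };

    /* Consumes @skb on success and on failure. */
    static int example_frag_queue(struct example_reasm *r, struct sk_buff *skb)
    {
            if (skb->len > r->max_len) {
                    kfree_skb(skb);
                    return -EINVAL;
            }

            __skb_queue_tail(&r->frags, skb);
            return 0;
    }

    static void example_rx(struct example_reasm *r, struct sk_buff *skb)
    {
            int rc = example_frag_queue(r, skb);

            /* skb must not be touched past this point, whatever rc says */
            if (rc)
                    pr_debug("fragment dropped: %d\n", rc);
    }
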
diff --git a/net/mctp/test/route-test.c b/net/mctp/test/route-test.c
index fb6b46a952cb..69a3ccfc6310 100644
--- a/net/mctp/test/route-test.c
+++ b/net/mctp/test/route-test.c
@@ -1586,7 +1586,6 @@ static void mctp_test_bind_lookup(struct kunit *test)
cleanup:
kfree_skb(skb_sock);
- kfree_skb(skb_pkt);
/* Drop all binds */
for (size_t i = 0; i < ARRAY_SIZE(lookup_binds); i++)
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 70c0ab0ecf90..1103b3341a70 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -985,13 +985,13 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk,
return false;
}
- if (mp_opt->deny_join_id0)
- WRITE_ONCE(msk->pm.remote_deny_join_id0, true);
-
if (unlikely(!READ_ONCE(msk->pm.server_side)))
pr_warn_once("bogus mpc option on established client sk");
set_fully_established:
+ if (mp_opt->deny_join_id0)
+ WRITE_ONCE(msk->pm.remote_deny_join_id0, true);
+
mptcp_data_lock((struct sock *)msk);
__mptcp_subflow_fully_established(msk, subflow, mp_opt);
mptcp_data_unlock((struct sock *)msk);
@@ -1118,7 +1118,9 @@ static bool add_addr_hmac_valid(struct mptcp_sock *msk,
return hmac == mp_opt->ahmac;
}
-/* Return false if a subflow has been reset, else return true */
+/* Return false in case of error (or subflow has been reset),
+ * else return true.
+ */
bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
@@ -1222,7 +1224,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
if (!mpext)
- return true;
+ return false;
memset(mpext, 0, sizeof(*mpext));
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 420d416e2603..136a380602ca 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -274,6 +274,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer)
add_timer);
struct mptcp_sock *msk = entry->sock;
struct sock *sk = (struct sock *)msk;
+ unsigned int timeout;
pr_debug("msk=%p\n", msk);
@@ -291,6 +292,10 @@ static void mptcp_pm_add_timer(struct timer_list *timer)
goto out;
}
+ timeout = mptcp_get_add_addr_timeout(sock_net(sk));
+ if (!timeout)
+ goto out;
+
spin_lock_bh(&msk->pm.lock);
if (!mptcp_pm_should_add_signal_addr(msk)) {
@@ -302,7 +307,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer)
if (entry->retrans_times < ADD_ADDR_RETRANS_MAX)
sk_reset_timer(sk, timer,
- jiffies + mptcp_get_add_addr_timeout(sock_net(sk)));
+ jiffies + timeout);
spin_unlock_bh(&msk->pm.lock);
@@ -344,6 +349,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
struct mptcp_pm_add_entry *add_entry = NULL;
struct sock *sk = (struct sock *)msk;
struct net *net = sock_net(sk);
+ unsigned int timeout;
lockdep_assert_held(&msk->pm.lock);
@@ -353,9 +359,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
if (WARN_ON_ONCE(mptcp_pm_is_kernel(msk)))
return false;
- sk_reset_timer(sk, &add_entry->add_timer,
- jiffies + mptcp_get_add_addr_timeout(net));
- return true;
+ goto reset_timer;
}
add_entry = kmalloc(sizeof(*add_entry), GFP_ATOMIC);
@@ -369,8 +373,10 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
add_entry->retrans_times = 0;
timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0);
- sk_reset_timer(sk, &add_entry->add_timer,
- jiffies + mptcp_get_add_addr_timeout(net));
+reset_timer:
+ timeout = mptcp_get_add_addr_timeout(net);
+ if (timeout)
+ sk_reset_timer(sk, &add_entry->add_timer, jiffies + timeout);
return true;
}
diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c
index d39e7c178460..667803d72b64 100644
--- a/net/mptcp/pm_kernel.c
+++ b/net/mptcp/pm_kernel.c
@@ -1085,7 +1085,6 @@ static void __flush_addrs(struct list_head *list)
static void __reset_counters(struct pm_nl_pernet *pernet)
{
WRITE_ONCE(pernet->add_addr_signal_max, 0);
- WRITE_ONCE(pernet->add_addr_accept_max, 0);
WRITE_ONCE(pernet->local_addr_max, 0);
pernet->addrs = 0;
}
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 50aaf259959a..ce7d42d3bd00 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -408,6 +408,7 @@ static int mptcp_event_created(struct sk_buff *skb,
const struct sock *ssk)
{
int err = nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token));
+ u16 flags = 0;
if (err)
return err;
@@ -415,6 +416,12 @@ static int mptcp_event_created(struct sk_buff *skb,
if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, READ_ONCE(msk->pm.server_side)))
return -EMSGSIZE;
+ if (READ_ONCE(msk->pm.remote_deny_join_id0))
+ flags |= MPTCP_PM_EV_FLAG_DENY_JOIN_ID0;
+
+ if (flags && nla_put_u16(skb, MPTCP_ATTR_FLAGS, flags))
+ return -EMSGSIZE;
+
return mptcp_event_add_subflow(skb, ssk);
}
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 9a287b75c1b3..5e497a83e967 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -371,6 +371,20 @@ static void mptcp_close_wake_up(struct sock *sk)
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
}
+static void mptcp_shutdown_subflows(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ bool slow;
+
+ slow = lock_sock_fast(ssk);
+ tcp_shutdown(ssk, SEND_SHUTDOWN);
+ unlock_sock_fast(ssk, slow);
+ }
+}
+
/* called under the msk socket lock */
static bool mptcp_pending_data_fin_ack(struct sock *sk)
{
@@ -395,6 +409,7 @@ static void mptcp_check_data_fin_ack(struct sock *sk)
break;
case TCP_CLOSING:
case TCP_LAST_ACK:
+ mptcp_shutdown_subflows(msk);
mptcp_set_state(sk, TCP_CLOSE);
break;
}
@@ -563,6 +578,7 @@ static bool mptcp_check_data_fin(struct sock *sk)
mptcp_set_state(sk, TCP_CLOSING);
break;
case TCP_FIN_WAIT2:
+ mptcp_shutdown_subflows(msk);
mptcp_set_state(sk, TCP_CLOSE);
break;
default:
@@ -3554,7 +3570,6 @@ void mptcp_sock_graft(struct sock *sk, struct socket *parent)
write_lock_bh(&sk->sk_callback_lock);
rcu_assign_pointer(sk->sk_wq, &parent->wq);
sk_set_socket(sk, parent);
- WRITE_ONCE(sk->sk_uid, SOCK_INODE(parent)->i_uid);
write_unlock_bh(&sk->sk_callback_lock);
}
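
The new mptcp_shutdown_subflows() above shuts down the TX side of every TCP subflow under the per-socket fast lock. lock_sock_fast() skips the full lock_sock() slow path when the socket is uncontended and reports which path it took, and that value has to be handed back to unlock_sock_fast(); minimal usage sketch:

    static void example_shutdown_tx(struct sock *ssk)
    {
            bool slow = lock_sock_fast(ssk);

            tcp_shutdown(ssk, SEND_SHUTDOWN);
            unlock_sock_fast(ssk, slow);
    }
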
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index 2c267aff95be..2abe6f1e9940 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -1532,13 +1532,12 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
{
static const unsigned int tx_rx_locks = SOCK_RCVBUF_LOCK | SOCK_SNDBUF_LOCK;
struct sock *sk = (struct sock *)msk;
+ bool keep_open;
- if (ssk->sk_prot->keepalive) {
- if (sock_flag(sk, SOCK_KEEPOPEN))
- ssk->sk_prot->keepalive(ssk, 1);
- else
- ssk->sk_prot->keepalive(ssk, 0);
- }
+ keep_open = sock_flag(sk, SOCK_KEEPOPEN);
+ if (ssk->sk_prot->keepalive)
+ ssk->sk_prot->keepalive(ssk, keep_open);
+ sock_valbool_flag(ssk, SOCK_KEEPOPEN, keep_open);
ssk->sk_priority = sk->sk_priority;
ssk->sk_bound_dev_if = sk->sk_bound_dev_if;
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 3f1b62a9fe88..f31a3a79531a 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -883,6 +883,10 @@ create_child:
ctx->subflow_id = 1;
owner = mptcp_sk(ctx->conn);
+
+ if (mp_opt.deny_join_id0)
+ WRITE_ONCE(owner->pm.remote_deny_join_id0, true);
+
mptcp_pm_new_connection(owner, child, 1);
/* with OoO packets we can reach here without ingress
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index f821ad2e19b3..15049b826732 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -265,7 +265,8 @@ int ip_vs_est_kthread_start(struct netns_ipvs *ipvs,
}
set_user_nice(kd->task, sysctl_est_nice(ipvs));
- set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs));
+ if (sysctl_est_preferred_cpulist(ipvs))
+ kthread_affine_preferred(kd->task, sysctl_est_preferred_cpulist(ipvs));
pr_info("starting estimator thread %d...\n", kd->id);
wake_up_process(kd->task);
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 4ed5878cb25b..ceb48c3ca0a4 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -368,7 +368,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
(cur->tuple.src.l3num == NFPROTO_UNSPEC ||
cur->tuple.src.l3num == me->tuple.src.l3num) &&
cur->tuple.dst.protonum == me->tuple.dst.protonum) {
- ret = -EEXIST;
+ ret = -EBUSY;
goto out;
}
}
@@ -379,7 +379,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) {
if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple,
&mask)) {
- ret = -EEXIST;
+ ret = -EBUSY;
goto out;
}
}
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 486d52b45fe5..50fd6809380f 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -884,8 +884,6 @@ errout:
static int ctnetlink_done(struct netlink_callback *cb)
{
- if (cb->args[1])
- nf_ct_put((struct nf_conn *)cb->args[1]);
kfree(cb->data);
return 0;
}
@@ -1208,19 +1206,26 @@ ignore_entry:
return 0;
}
+static unsigned long ctnetlink_get_id(const struct nf_conn *ct)
+{
+ unsigned long id = nf_ct_get_id(ct);
+
+ return id ? id : 1;
+}
+
static int
ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
{
unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0;
struct net *net = sock_net(skb->sk);
- struct nf_conn *ct, *last;
+ unsigned long last_id = cb->args[1];
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
struct nf_conn *nf_ct_evict[8];
+ struct nf_conn *ct;
int res, i;
spinlock_t *lockp;
- last = (struct nf_conn *)cb->args[1];
i = 0;
local_bh_disable();
@@ -1257,7 +1262,7 @@ restart:
continue;
if (cb->args[1]) {
- if (ct != last)
+ if (ctnetlink_get_id(ct) != last_id)
continue;
cb->args[1] = 0;
}
@@ -1270,8 +1275,7 @@ restart:
NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
ct, true, flags);
if (res < 0) {
- nf_conntrack_get(&ct->ct_general);
- cb->args[1] = (unsigned long)ct;
+ cb->args[1] = ctnetlink_get_id(ct);
spin_unlock(lockp);
goto out;
}
@@ -1284,12 +1288,10 @@ restart:
}
out:
local_bh_enable();
- if (last) {
+ if (last_id) {
/* nf ct hash resize happened, now clear the leftover. */
- if ((struct nf_conn *)cb->args[1] == last)
+ if (cb->args[1] == last_id)
cb->args[1] = 0;
-
- nf_ct_put(last);
}
while (i) {
@@ -3168,23 +3170,27 @@ errout:
return 0;
}
#endif
-static int ctnetlink_exp_done(struct netlink_callback *cb)
+
+static unsigned long ctnetlink_exp_id(const struct nf_conntrack_expect *exp)
{
- if (cb->args[1])
- nf_ct_expect_put((struct nf_conntrack_expect *)cb->args[1]);
- return 0;
+ unsigned long id = (unsigned long)exp;
+
+ id += nf_ct_get_id(exp->master);
+ id += exp->class;
+
+ return id ? id : 1;
}
static int
ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
- struct nf_conntrack_expect *exp, *last;
struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
u_int8_t l3proto = nfmsg->nfgen_family;
+ unsigned long last_id = cb->args[1];
+ struct nf_conntrack_expect *exp;
rcu_read_lock();
- last = (struct nf_conntrack_expect *)cb->args[1];
for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) {
restart:
hlist_for_each_entry_rcu(exp, &nf_ct_expect_hash[cb->args[0]],
@@ -3196,7 +3202,7 @@ restart:
continue;
if (cb->args[1]) {
- if (exp != last)
+ if (ctnetlink_exp_id(exp) != last_id)
continue;
cb->args[1] = 0;
}
@@ -3205,9 +3211,7 @@ restart:
cb->nlh->nlmsg_seq,
IPCTNL_MSG_EXP_NEW,
exp) < 0) {
- if (!refcount_inc_not_zero(&exp->use))
- continue;
- cb->args[1] = (unsigned long)exp;
+ cb->args[1] = ctnetlink_exp_id(exp);
goto out;
}
}
@@ -3218,32 +3222,30 @@ restart:
}
out:
rcu_read_unlock();
- if (last)
- nf_ct_expect_put(last);
-
return skb->len;
}
static int
ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
{
- struct nf_conntrack_expect *exp, *last;
struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
struct nf_conn *ct = cb->data;
struct nf_conn_help *help = nfct_help(ct);
u_int8_t l3proto = nfmsg->nfgen_family;
+ unsigned long last_id = cb->args[1];
+ struct nf_conntrack_expect *exp;
if (cb->args[0])
return 0;
rcu_read_lock();
- last = (struct nf_conntrack_expect *)cb->args[1];
+
restart:
hlist_for_each_entry_rcu(exp, &help->expectations, lnode) {
if (l3proto && exp->tuple.src.l3num != l3proto)
continue;
if (cb->args[1]) {
- if (exp != last)
+ if (ctnetlink_exp_id(exp) != last_id)
continue;
cb->args[1] = 0;
}
@@ -3251,9 +3253,7 @@ restart:
cb->nlh->nlmsg_seq,
IPCTNL_MSG_EXP_NEW,
exp) < 0) {
- if (!refcount_inc_not_zero(&exp->use))
- continue;
- cb->args[1] = (unsigned long)exp;
+ cb->args[1] = ctnetlink_exp_id(exp);
goto out;
}
}
@@ -3264,9 +3264,6 @@ restart:
cb->args[0] = 1;
out:
rcu_read_unlock();
- if (last)
- nf_ct_expect_put(last);
-
return skb->len;
}
@@ -3285,7 +3282,6 @@ static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl,
struct nf_conntrack_zone zone;
struct netlink_dump_control c = {
.dump = ctnetlink_exp_ct_dump_table,
- .done = ctnetlink_exp_done,
};
err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER,
@@ -3335,7 +3331,6 @@ static int ctnetlink_get_expect(struct sk_buff *skb,
else {
struct netlink_dump_control c = {
.dump = ctnetlink_exp_dump_table,
- .done = ctnetlink_exp_done,
};
return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
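
The ctnetlink hunks above stop holding a reference on the last dumped entry between netlink dump rounds; instead they store a stable, non-zero cookie (derived from nf_ct_get_id()) in cb->args[1] and skip forward to it when the dump resumes, so a conntrack or expectation may be freed while the dump is paused. A generic sketch of the resume-by-cookie pattern (the item list and fill helper are hypothetical):

    struct example_item {
            struct list_head list;
            unsigned long id;       /* stable, non-zero identifier */
    };

    static int example_dump(struct sk_buff *skb, struct netlink_callback *cb)
    {
            unsigned long last_id = cb->args[1];
            struct example_item *item;

            list_for_each_entry(item, &example_items, list) {
                    if (last_id) {
                            if (item->id != last_id)
                                    continue;       /* still skipping */
                            last_id = 0;            /* resume point reached */
                    }

                    if (example_fill_info(skb, cb, item) < 0) {
                            /* remember a cookie, not a pointer or reference */
                            cb->args[1] = item->id;
                            break;
                    }
            }

            return skb->len;
    }
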
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 9b8b10a85233..1f14ef0436c6 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -567,16 +567,16 @@ nf_conntrack_log_invalid_sysctl(const struct ctl_table *table, int write,
return ret;
if (*(u8 *)table->data == 0)
- return ret;
+ return 0;
/* Load nf_log_syslog only if no logger is currently registered */
for (i = 0; i < NFPROTO_NUMPROTO; i++) {
if (nf_log_is_registered(i))
- return ret;
+ return 0;
}
request_module("%s", "nf_log_syslog");
- return ret;
+ return 0;
}
static struct ctl_table_header *nf_ct_netfilter_header;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 13d0ed9d1895..c3c73411c40c 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1131,11 +1131,14 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla,
return ERR_PTR(-ENOENT);
}
-static __be16 nft_base_seq(const struct net *net)
+static unsigned int nft_base_seq(const struct net *net)
{
- struct nftables_pernet *nft_net = nft_pernet(net);
+ return READ_ONCE(net->nft.base_seq);
+}
- return htons(nft_net->base_seq & 0xffff);
+static __be16 nft_base_seq_be16(const struct net *net)
+{
+ return htons(nft_base_seq(net) & 0xffff);
}
static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = {
@@ -1155,7 +1158,7 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,
nlh = nfnl_msg_put(skb, portid, seq,
nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
- flags, family, NFNETLINK_V0, nft_base_seq(net));
+ flags, family, NFNETLINK_V0, nft_base_seq_be16(net));
if (!nlh)
goto nla_put_failure;
@@ -1248,7 +1251,7 @@ static int nf_tables_dump_tables(struct sk_buff *skb,
rcu_read_lock();
nft_net = nft_pernet(net);
- cb->seq = READ_ONCE(nft_net->base_seq);
+ cb->seq = nft_base_seq(net);
list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
@@ -1959,6 +1962,18 @@ nla_put_failure:
return -ENOSPC;
}
+static bool hook_is_prefix(struct nft_hook *hook)
+{
+ return strlen(hook->ifname) >= hook->ifnamelen;
+}
+
+static int nft_nla_put_hook_dev(struct sk_buff *skb, struct nft_hook *hook)
+{
+ int attr = hook_is_prefix(hook) ? NFTA_DEVICE_PREFIX : NFTA_DEVICE_NAME;
+
+ return nla_put_string(skb, attr, hook->ifname);
+}
+
static int nft_dump_basechain_hook(struct sk_buff *skb,
const struct net *net, int family,
const struct nft_base_chain *basechain,
@@ -1990,16 +2005,15 @@ static int nft_dump_basechain_hook(struct sk_buff *skb,
if (!first)
first = hook;
- if (nla_put(skb, NFTA_DEVICE_NAME,
- hook->ifnamelen, hook->ifname))
+ if (nft_nla_put_hook_dev(skb, hook))
goto nla_put_failure;
n++;
}
nla_nest_end(skb, nest_devs);
if (n == 1 &&
- nla_put(skb, NFTA_HOOK_DEV,
- first->ifnamelen, first->ifname))
+ !hook_is_prefix(first) &&
+ nla_put_string(skb, NFTA_HOOK_DEV, first->ifname))
goto nla_put_failure;
}
nla_nest_end(skb, nest);
@@ -2019,7 +2033,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
nlh = nfnl_msg_put(skb, portid, seq,
nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
- flags, family, NFNETLINK_V0, nft_base_seq(net));
+ flags, family, NFNETLINK_V0, nft_base_seq_be16(net));
if (!nlh)
goto nla_put_failure;
@@ -2122,7 +2136,7 @@ static int nf_tables_dump_chains(struct sk_buff *skb,
rcu_read_lock();
nft_net = nft_pernet(net);
- cb->seq = READ_ONCE(nft_net->base_seq);
+ cb->seq = nft_base_seq(net);
list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
@@ -2310,7 +2324,8 @@ void nf_tables_chain_destroy(struct nft_chain *chain)
}
static struct nft_hook *nft_netdev_hook_alloc(struct net *net,
- const struct nlattr *attr)
+ const struct nlattr *attr,
+ bool prefix)
{
struct nf_hook_ops *ops;
struct net_device *dev;
@@ -2327,7 +2342,8 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net,
if (err < 0)
goto err_hook_free;
- hook->ifnamelen = nla_len(attr);
+ /* include the terminating NUL-char when comparing non-prefixes */
+ hook->ifnamelen = strlen(hook->ifname) + !prefix;
/* nf_tables_netdev_event() is called under rtnl_mutex, this is
* indirectly serializing all the other holders of the commit_mutex with
@@ -2374,14 +2390,22 @@ static int nf_tables_parse_netdev_hooks(struct net *net,
struct nft_hook *hook, *next;
const struct nlattr *tmp;
int rem, n = 0, err;
+ bool prefix;
nla_for_each_nested(tmp, attr, rem) {
- if (nla_type(tmp) != NFTA_DEVICE_NAME) {
+ switch (nla_type(tmp)) {
+ case NFTA_DEVICE_NAME:
+ prefix = false;
+ break;
+ case NFTA_DEVICE_PREFIX:
+ prefix = true;
+ break;
+ default:
err = -EINVAL;
goto err_hook;
}
- hook = nft_netdev_hook_alloc(net, tmp);
+ hook = nft_netdev_hook_alloc(net, tmp, prefix);
if (IS_ERR(hook)) {
NL_SET_BAD_ATTR(extack, tmp);
err = PTR_ERR(hook);
@@ -2427,7 +2451,7 @@ static int nft_chain_parse_netdev(struct net *net, struct nlattr *tb[],
int err;
if (tb[NFTA_HOOK_DEV]) {
- hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV]);
+ hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV], false);
if (IS_ERR(hook)) {
NL_SET_BAD_ATTR(extack, tb[NFTA_HOOK_DEV]);
return PTR_ERR(hook);
@@ -2803,6 +2827,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
struct nft_chain *chain = ctx->chain;
struct nft_chain_hook hook = {};
struct nft_stats __percpu *stats = NULL;
+ struct nftables_pernet *nft_net;
struct nft_hook *h, *next;
struct nf_hook_ops *ops;
struct nft_trans *trans;
@@ -2845,6 +2870,20 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
if (nft_hook_list_find(&basechain->hook_list, h)) {
list_del(&h->list);
nft_netdev_hook_free(h);
+ continue;
+ }
+
+ nft_net = nft_pernet(ctx->net);
+ list_for_each_entry(trans, &nft_net->commit_list, list) {
+ if (trans->msg_type != NFT_MSG_NEWCHAIN ||
+ trans->table != ctx->table ||
+ !nft_trans_chain_update(trans))
+ continue;
+
+ if (nft_hook_list_find(&nft_trans_chain_hooks(trans), h)) {
+ nft_chain_release_hook(&hook);
+ return -EEXIST;
+ }
}
}
} else {
@@ -3635,7 +3674,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net,
u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
nlh = nfnl_msg_put(skb, portid, seq, type, flags, family, NFNETLINK_V0,
- nft_base_seq(net));
+ nft_base_seq_be16(net));
if (!nlh)
goto nla_put_failure;
@@ -3803,7 +3842,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
rcu_read_lock();
nft_net = nft_pernet(net);
- cb->seq = READ_ONCE(nft_net->base_seq);
+ cb->seq = nft_base_seq(net);
list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
@@ -4014,7 +4053,7 @@ static int nf_tables_getrule_reset(struct sk_buff *skb,
buf = kasprintf(GFP_ATOMIC, "%.*s:%u",
nla_len(nla[NFTA_RULE_TABLE]),
(char *)nla_data(nla[NFTA_RULE_TABLE]),
- nft_net->base_seq);
+ nft_base_seq(net));
audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1,
AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC);
kfree(buf);
@@ -4851,7 +4890,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
nlh = nfnl_msg_put(skb, portid, seq,
nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
flags, ctx->family, NFNETLINK_V0,
- nft_base_seq(ctx->net));
+ nft_base_seq_be16(ctx->net));
if (!nlh)
goto nla_put_failure;
@@ -4996,7 +5035,7 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb)
rcu_read_lock();
nft_net = nft_pernet(net);
- cb->seq = READ_ONCE(nft_net->base_seq);
+ cb->seq = nft_base_seq(net);
list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (ctx->family != NFPROTO_UNSPEC &&
@@ -6173,7 +6212,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
rcu_read_lock();
nft_net = nft_pernet(net);
- cb->seq = READ_ONCE(nft_net->base_seq);
+ cb->seq = nft_base_seq(net);
list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (dump_ctx->ctx.family != NFPROTO_UNSPEC &&
@@ -6202,7 +6241,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
seq = cb->nlh->nlmsg_seq;
nlh = nfnl_msg_put(skb, portid, seq, event, NLM_F_MULTI,
- table->family, NFNETLINK_V0, nft_base_seq(net));
+ table->family, NFNETLINK_V0, nft_base_seq_be16(net));
if (!nlh)
goto nla_put_failure;
@@ -6295,7 +6334,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb,
event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family,
- NFNETLINK_V0, nft_base_seq(ctx->net));
+ NFNETLINK_V0, nft_base_seq_be16(ctx->net));
if (!nlh)
goto nla_put_failure;
@@ -6594,7 +6633,7 @@ static int nf_tables_getsetelem_reset(struct sk_buff *skb,
}
nelems++;
}
- audit_log_nft_set_reset(dump_ctx.ctx.table, nft_net->base_seq, nelems);
+ audit_log_nft_set_reset(dump_ctx.ctx.table, nft_base_seq(info->net), nelems);
out_unlock:
rcu_read_unlock();
@@ -8345,7 +8384,7 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net,
nlh = nfnl_msg_put(skb, portid, seq,
nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
- flags, family, NFNETLINK_V0, nft_base_seq(net));
+ flags, family, NFNETLINK_V0, nft_base_seq_be16(net));
if (!nlh)
goto nla_put_failure;
@@ -8410,7 +8449,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
rcu_read_lock();
nft_net = nft_pernet(net);
- cb->seq = READ_ONCE(nft_net->base_seq);
+ cb->seq = nft_base_seq(net);
list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
@@ -8444,7 +8483,7 @@ cont:
idx++;
}
if (ctx->reset && entries)
- audit_log_obj_reset(table, nft_net->base_seq, entries);
+ audit_log_obj_reset(table, nft_base_seq(net), entries);
if (rc < 0)
break;
}
@@ -8613,7 +8652,7 @@ static int nf_tables_getobj_reset(struct sk_buff *skb,
buf = kasprintf(GFP_ATOMIC, "%.*s:%u",
nla_len(nla[NFTA_OBJ_TABLE]),
(char *)nla_data(nla[NFTA_OBJ_TABLE]),
- nft_net->base_seq);
+ nft_base_seq(net));
audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1,
AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC);
kfree(buf);
@@ -8718,9 +8757,8 @@ void nft_obj_notify(struct net *net, const struct nft_table *table,
struct nft_object *obj, u32 portid, u32 seq, int event,
u16 flags, int family, int report, gfp_t gfp)
{
- struct nftables_pernet *nft_net = nft_pernet(net);
char *buf = kasprintf(gfp, "%s:%u",
- table->name, nft_net->base_seq);
+ table->name, nft_base_seq(net));
audit_log_nfcfg(buf,
family,
@@ -9060,6 +9098,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
{
const struct nlattr * const *nla = ctx->nla;
struct nft_flowtable_hook flowtable_hook;
+ struct nftables_pernet *nft_net;
struct nft_hook *hook, *next;
struct nf_hook_ops *ops;
struct nft_trans *trans;
@@ -9076,6 +9115,20 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
if (nft_hook_list_find(&flowtable->hook_list, hook)) {
list_del(&hook->list);
nft_netdev_hook_free(hook);
+ continue;
+ }
+
+ nft_net = nft_pernet(ctx->net);
+ list_for_each_entry(trans, &nft_net->commit_list, list) {
+ if (trans->msg_type != NFT_MSG_NEWFLOWTABLE ||
+ trans->table != ctx->table ||
+ !nft_trans_flowtable_update(trans))
+ continue;
+
+ if (nft_hook_list_find(&nft_trans_flowtable_hooks(trans), hook)) {
+ err = -EEXIST;
+ goto err_flowtable_update_hook;
+ }
}
}
@@ -9391,7 +9444,7 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
nlh = nfnl_msg_put(skb, portid, seq,
nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
- flags, family, NFNETLINK_V0, nft_base_seq(net));
+ flags, family, NFNETLINK_V0, nft_base_seq_be16(net));
if (!nlh)
goto nla_put_failure;
@@ -9428,8 +9481,7 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
list_for_each_entry_rcu(hook, hook_list, list,
lockdep_commit_lock_is_held(net)) {
- if (nla_put(skb, NFTA_DEVICE_NAME,
- hook->ifnamelen, hook->ifname))
+ if (nft_nla_put_hook_dev(skb, hook))
goto nla_put_failure;
}
nla_nest_end(skb, nest_devs);
@@ -9461,7 +9513,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb,
rcu_read_lock();
nft_net = nft_pernet(net);
- cb->seq = READ_ONCE(nft_net->base_seq);
+ cb->seq = nft_base_seq(net);
list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
@@ -9646,17 +9698,16 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
u32 portid, u32 seq)
{
- struct nftables_pernet *nft_net = nft_pernet(net);
struct nlmsghdr *nlh;
char buf[TASK_COMM_LEN];
int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN);
nlh = nfnl_msg_put(skb, portid, seq, event, 0, AF_UNSPEC,
- NFNETLINK_V0, nft_base_seq(net));
+ NFNETLINK_V0, nft_base_seq_be16(net));
if (!nlh)
goto nla_put_failure;
- if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_net->base_seq)) ||
+ if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_base_seq(net))) ||
nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))) ||
nla_put_string(skb, NFTA_GEN_PROC_NAME, get_task_comm(buf, current)))
goto nla_put_failure;
@@ -10918,11 +10969,12 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
* Bump generation counter, invalidate any dump in progress.
* Cannot fail after this point.
*/
- base_seq = READ_ONCE(nft_net->base_seq);
+ base_seq = nft_base_seq(net);
while (++base_seq == 0)
;
- WRITE_ONCE(nft_net->base_seq, base_seq);
+ /* pairs with smp_load_acquire in nft_lookup_eval */
+ smp_store_release(&net->nft.base_seq, base_seq);
gc_seq = nft_gc_seq_begin(nft_net);
@@ -11131,7 +11183,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
nft_commit_notify(net, NETLINK_CB(skb).portid);
nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN);
- nf_tables_commit_audit_log(&adl, nft_net->base_seq);
+ nf_tables_commit_audit_log(&adl, nft_base_seq(net));
nft_gc_seq_end(nft_net, gc_seq);
nft_net->validate_state = NFT_VALIDATE_SKIP;
@@ -11456,7 +11508,7 @@ static bool nf_tables_valid_genid(struct net *net, u32 genid)
mutex_lock(&nft_net->commit_mutex);
nft_net->tstamp = get_jiffies_64();
- genid_ok = genid == 0 || nft_net->base_seq == genid;
+ genid_ok = genid == 0 || nft_base_seq(net) == genid;
if (!genid_ok)
mutex_unlock(&nft_net->commit_mutex);
@@ -12093,7 +12145,7 @@ static int __net_init nf_tables_init_net(struct net *net)
INIT_LIST_HEAD(&nft_net->module_list);
INIT_LIST_HEAD(&nft_net->notify_list);
mutex_init(&nft_net->commit_mutex);
- nft_net->base_seq = 1;
+ net->nft.base_seq = 1;
nft_net->gc_seq = 0;
nft_net->validate_state = NFT_VALIDATE_SKIP;
INIT_WORK(&nft_net->destroy_work, nf_tables_trans_destroy_work);
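The hunks above replace direct nft_net->base_seq reads with nft_base_seq() and, in the netlink message headers, nft_base_seq_be16(); neither helper is defined within this excerpt. Judging from the call sites and the smp_store_release() on net->nft.base_seq in nf_tables_commit(), they presumably look roughly like the following sketch (names and exact placement assumed, not taken from this diff):

	static unsigned int nft_base_seq(const struct net *net)
	{
		/* pairs with smp_store_release() in nf_tables_commit() */
		return smp_load_acquire(&net->nft.base_seq);
	}

	static __be16 nft_base_seq_be16(const struct net *net)
	{
		/* nfnetlink res_id is only 16 bits wide */
		return htons(nft_base_seq(net) & 0xffff);
	}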
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index 40c602ffbcba..58c5b14889c4 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -24,11 +24,11 @@ struct nft_lookup {
struct nft_set_binding binding;
};
-#ifdef CONFIG_MITIGATION_RETPOLINE
-const struct nft_set_ext *
-nft_set_do_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key)
+static const struct nft_set_ext *
+__nft_set_do_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
{
+#ifdef CONFIG_MITIGATION_RETPOLINE
if (set->ops == &nft_set_hash_fast_type.ops)
return nft_hash_lookup_fast(net, set, key);
if (set->ops == &nft_set_hash_type.ops)
@@ -51,10 +51,46 @@ nft_set_do_lookup(const struct net *net, const struct nft_set *set,
return nft_rbtree_lookup(net, set, key);
WARN_ON_ONCE(1);
+#endif
return set->ops->lookup(net, set, key);
}
+
+static unsigned int nft_base_seq(const struct net *net)
+{
+ /* pairs with smp_store_release() in nf_tables_commit() */
+ return smp_load_acquire(&net->nft.base_seq);
+}
+
+static bool nft_lookup_should_retry(const struct net *net, unsigned int seq)
+{
+ return unlikely(seq != nft_base_seq(net));
+}
+
+const struct nft_set_ext *
+nft_set_do_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
+{
+ const struct nft_set_ext *ext;
+ unsigned int base_seq;
+
+ do {
+ base_seq = nft_base_seq(net);
+
+ ext = __nft_set_do_lookup(net, set, key);
+ if (ext)
+ break;
+ /* No match? There is a small chance that lookup was
+ * performed in the old generation, but nf_tables_commit()
+ * already unlinked a (matching) element.
+ *
+ * We need to repeat the lookup to make sure that we didn't
+ * miss a matching element in the new generation.
+ */
+ } while (nft_lookup_should_retry(net, base_seq));
+
+ return ext;
+}
EXPORT_SYMBOL_GPL(nft_set_do_lookup);
-#endif
void nft_lookup_eval(const struct nft_expr *expr,
struct nft_regs *regs,
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index c24c922f895d..8d3f040a904a 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -226,7 +226,8 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx,
const struct nft_bitmap *priv = nft_set_priv(set);
struct nft_bitmap_elem *be;
- list_for_each_entry_rcu(be, &priv->list, head) {
+ list_for_each_entry_rcu(be, &priv->list, head,
+ lockdep_is_held(&nft_pernet(ctx->net)->commit_mutex)) {
if (iter->count < iter->skip)
goto cont;
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 1a19649c2851..793790d79d13 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -426,10 +426,9 @@ static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m,
local_bh_disable();
- if (unlikely(!raw_cpu_ptr(m->scratch)))
- goto out;
-
scratch = *raw_cpu_ptr(m->scratch);
+ if (unlikely(!scratch))
+ goto out;
map_index = scratch->map_index;
@@ -511,6 +510,23 @@ out:
*
* This function is called from the data path. It will search for
* an element matching the given key in the current active copy.
+ * Unlike other set types, this uses NFT_GENMASK_ANY instead of
+ * nft_genmask_cur().
+ *
+ * This is because new (future) elements are not reachable from
+ * priv->match, they get added to priv->clone instead.
+ * When the commit phase flips the generation bitmask, the
+ * 'now old' entries are skipped but without the 'now current'
+ * elements becoming visible. Using nft_genmask_cur() thus creates
+ * inconsistent state: matching old entries get skipped but the
+ * newly matching entries are unreachable.
+ *
+ * GENMASK will still find the 'now old' entries which ensures consistent
+ * priv->match view.
+ *
+ * nft_pipapo_commit swaps ->clone and ->match shortly after the
+ * genbit flip. As ->clone doesn't contain the old entries in the first
+ * place, lookup will only find the now-current ones.
*
 * Return: nftables API extension pointer or NULL if no match.
*/
@@ -519,12 +535,11 @@ nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
const u32 *key)
{
struct nft_pipapo *priv = nft_set_priv(set);
- u8 genmask = nft_genmask_cur(net);
const struct nft_pipapo_match *m;
const struct nft_pipapo_elem *e;
m = rcu_dereference(priv->match);
- e = pipapo_get(m, (const u8 *)key, genmask, get_jiffies_64());
+ e = pipapo_get(m, (const u8 *)key, NFT_GENMASK_ANY, get_jiffies_64());
return e ? &e->ext : NULL;
}
diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c
index db5d367e43c4..c0884fa68c79 100644
--- a/net/netfilter/nft_set_pipapo_avx2.c
+++ b/net/netfilter/nft_set_pipapo_avx2.c
@@ -1150,12 +1150,11 @@ nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
const u32 *key)
{
struct nft_pipapo *priv = nft_set_priv(set);
+ const struct nft_set_ext *ext = NULL;
struct nft_pipapo_scratch *scratch;
- u8 genmask = nft_genmask_cur(net);
const struct nft_pipapo_match *m;
const struct nft_pipapo_field *f;
const u8 *rp = (const u8 *)key;
- const struct nft_set_ext *ext;
unsigned long *res, *fill;
bool map_index;
int i;
@@ -1246,13 +1245,12 @@ next_match:
goto out;
if (last) {
- ext = &f->mt[ret].e->ext;
- if (unlikely(nft_set_elem_expired(ext) ||
- !nft_set_elem_active(ext, genmask))) {
- ext = NULL;
+ const struct nft_set_ext *e = &f->mt[ret].e->ext;
+
+ if (unlikely(nft_set_elem_expired(e)))
goto next_match;
- }
+ ext = e;
goto out;
}
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 938a257c069e..b1f04168ec93 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -77,7 +77,9 @@ __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
nft_rbtree_interval_end(rbe) &&
nft_rbtree_interval_start(interval))
continue;
- interval = rbe;
+ if (nft_set_elem_active(&rbe->ext, genmask) &&
+ !nft_rbtree_elem_expired(rbe))
+ interval = rbe;
} else if (d > 0)
parent = rcu_dereference_raw(parent->rb_right);
else {
@@ -102,8 +104,6 @@ __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
}
if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
- nft_set_elem_active(&interval->ext, genmask) &&
- !nft_rbtree_elem_expired(interval) &&
nft_rbtree_interval_start(interval))
return &interval->ext;
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
index 35d0409b0095..36affbb697c2 100644
--- a/net/netfilter/nft_socket.c
+++ b/net/netfilter/nft_socket.c
@@ -217,7 +217,7 @@ static int nft_socket_init(const struct nft_ctx *ctx,
level += err;
/* Implies a giant cgroup tree */
- if (WARN_ON_ONCE(level > 255))
+ if (level > 255)
return -EOPNOTSUPP;
priv->level = level;
diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c
index 0d04d23aafe7..0da652844dd6 100644
--- a/net/netlabel/netlabel_user.c
+++ b/net/netlabel/netlabel_user.c
@@ -84,7 +84,6 @@ struct audit_buffer *netlbl_audit_start_common(int type,
struct netlbl_audit *audit_info)
{
struct audit_buffer *audit_buf;
- struct lsm_context ctx;
if (audit_enabled == AUDIT_OFF)
return NULL;
@@ -96,12 +95,7 @@ struct audit_buffer *netlbl_audit_start_common(int type,
audit_log_format(audit_buf, "netlabel: auid=%u ses=%u",
from_kuid(&init_user_ns, audit_info->loginuid),
audit_info->sessionid);
-
- if (lsmprop_is_set(&audit_info->prop) &&
- security_lsmprop_to_secctx(&audit_info->prop, &ctx) > 0) {
- audit_log_format(audit_buf, " subj=%s", ctx.context);
- security_release_secctx(&ctx);
- }
+ audit_log_subj_ctx(audit_buf, &audit_info->prop);
return audit_buf;
}
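audit_log_subj_ctx() replaces the open-coded LSM subject-context logging removed above; its definition is not part of this diff. Going only by the removed lines, the helper presumably amounts to something like this sketch (signature assumed):

	void audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop)
	{
		struct lsm_context ctx;

		if (lsmprop_is_set(prop) &&
		    security_lsmprop_to_secctx(prop, &ctx) > 0) {
			audit_log_format(ab, " subj=%s", ctx.context);
			security_release_secctx(&ctx);
		}
	}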
diff --git a/net/netlink/diag.c b/net/netlink/diag.c
index 61981e01fd6f..b8e58132e8af 100644
--- a/net/netlink/diag.c
+++ b/net/netlink/diag.c
@@ -168,7 +168,7 @@ mc_list:
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NLM_F_MULTI,
- __sock_i_ino(sk)) < 0) {
+ sock_i_ino(sk)) < 0) {
ret = 1;
break;
}
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 104732d34543..978c129c6095 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -1836,6 +1836,9 @@ static int genl_bind(struct net *net, int group)
!ns_capable(net->user_ns, CAP_SYS_ADMIN))
ret = -EPERM;
+ if (ret)
+ break;
+
if (family->bind)
family->bind(i);
diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index 28c1b0022178..bd861191157b 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -133,12 +133,15 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_dma_len,
&off, PAGE_SIZE);
- if (unlikely(ret != ibmr->sg_dma_len))
- return ret < 0 ? ret : -EINVAL;
+ if (unlikely(ret != ibmr->sg_dma_len)) {
+ ret = ret < 0 ? ret : -EINVAL;
+ goto out_inc;
+ }
- if (cmpxchg(&frmr->fr_state,
- FRMR_IS_FREE, FRMR_IS_INUSE) != FRMR_IS_FREE)
- return -EBUSY;
+ if (cmpxchg(&frmr->fr_state, FRMR_IS_FREE, FRMR_IS_INUSE) != FRMR_IS_FREE) {
+ ret = -EBUSY;
+ goto out_inc;
+ }
atomic_inc(&ibmr->ic->i_fastreg_inuse_count);
@@ -166,11 +169,10 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
/* Failure here can be because of -ENOMEM as well */
rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_STALE);
- atomic_inc(&ibmr->ic->i_fastreg_wrs);
if (printk_ratelimit())
pr_warn("RDS/IB: %s returned error(%d)\n",
__func__, ret);
- goto out;
+ goto out_inc;
}
/* Wait for the registration to complete in order to prevent an invalid
@@ -179,8 +181,10 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
*/
wait_event(frmr->fr_reg_done, !frmr->fr_reg);
-out:
+ return ret;
+out_inc:
+ atomic_inc(&ibmr->ic->i_fastreg_wrs);
return ret;
}
diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c
index 41e657e97761..cf2dcec6ce5a 100644
--- a/net/rfkill/rfkill-gpio.c
+++ b/net/rfkill/rfkill-gpio.c
@@ -94,10 +94,10 @@ static const struct dmi_system_id rfkill_gpio_deny_table[] = {
static int rfkill_gpio_probe(struct platform_device *pdev)
{
struct rfkill_gpio_data *rfkill;
- struct gpio_desc *gpio;
+ const char *type_name = NULL;
const char *name_property;
const char *type_property;
- const char *type_name;
+ struct gpio_desc *gpio;
int ret;
if (dmi_check_system(rfkill_gpio_deny_table))
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 4e72b636a46a..543f9e8ebb69 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -170,7 +170,7 @@ void rose_kill_by_neigh(struct rose_neigh *neigh)
if (rose->neighbour == neigh) {
rose_disconnect(s, ENETUNREACH, ROSE_OUT_OF_ORDER, 0);
- rose->neighbour->use--;
+ rose_neigh_put(rose->neighbour);
rose->neighbour = NULL;
}
}
@@ -212,7 +212,7 @@ start:
if (rose->device == dev) {
rose_disconnect(sk, ENETUNREACH, ROSE_OUT_OF_ORDER, 0);
if (rose->neighbour)
- rose->neighbour->use--;
+ rose_neigh_put(rose->neighbour);
netdev_put(rose->device, &rose->dev_tracker);
rose->device = NULL;
}
@@ -655,7 +655,7 @@ static int rose_release(struct socket *sock)
break;
case ROSE_STATE_2:
- rose->neighbour->use--;
+ rose_neigh_put(rose->neighbour);
release_sock(sk);
rose_disconnect(sk, 0, -1, -1);
lock_sock(sk);
@@ -823,6 +823,7 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le
rose->lci = rose_new_lci(rose->neighbour);
if (!rose->lci) {
err = -ENETUNREACH;
+ rose_neigh_put(rose->neighbour);
goto out_release;
}
@@ -834,12 +835,14 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le
dev = rose_dev_first();
if (!dev) {
err = -ENETUNREACH;
+ rose_neigh_put(rose->neighbour);
goto out_release;
}
user = ax25_findbyuid(current_euid());
if (!user) {
err = -EINVAL;
+ rose_neigh_put(rose->neighbour);
dev_put(dev);
goto out_release;
}
@@ -874,8 +877,6 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le
rose->state = ROSE_STATE_1;
- rose->neighbour->use++;
-
rose_write_internal(sk, ROSE_CALL_REQUEST);
rose_start_heartbeat(sk);
rose_start_t1timer(sk);
@@ -1077,7 +1078,7 @@ int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct ros
GFP_ATOMIC);
make_rose->facilities = facilities;
- make_rose->neighbour->use++;
+ rose_neigh_hold(make_rose->neighbour);
if (rose_sk(sk)->defer) {
make_rose->state = ROSE_STATE_5;
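The rose hunks in this and the following files convert the hand-rolled rose_neigh->use counter into a refcount_t manipulated through rose_neigh_hold() and rose_neigh_put(); the helpers themselves live in the rose header and are not shown here. Given that rose_remove_neigh() below no longer frees the structure and that refcount_set(&rose_neigh->use, 1) seeds the count, they presumably look roughly like this sketch (assumed, mirroring the freeing code removed from rose_remove_neigh()):

	static inline void rose_neigh_hold(struct rose_neigh *rose_neigh)
	{
		refcount_inc(&rose_neigh->use);
	}

	static inline void rose_neigh_put(struct rose_neigh *rose_neigh)
	{
		if (refcount_dec_and_test(&rose_neigh->use)) {
			if (rose_neigh->ax25)
				ax25_cb_put(rose_neigh->ax25);
			kfree(rose_neigh->digipeat);
			kfree(rose_neigh);
		}
	}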
diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c
index 3e99181e759f..0276b393f0e5 100644
--- a/net/rose/rose_in.c
+++ b/net/rose/rose_in.c
@@ -56,7 +56,7 @@ static int rose_state1_machine(struct sock *sk, struct sk_buff *skb, int framety
case ROSE_CLEAR_REQUEST:
rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION);
rose_disconnect(sk, ECONNREFUSED, skb->data[3], skb->data[4]);
- rose->neighbour->use--;
+ rose_neigh_put(rose->neighbour);
break;
default:
@@ -79,12 +79,12 @@ static int rose_state2_machine(struct sock *sk, struct sk_buff *skb, int framety
case ROSE_CLEAR_REQUEST:
rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION);
rose_disconnect(sk, 0, skb->data[3], skb->data[4]);
- rose->neighbour->use--;
+ rose_neigh_put(rose->neighbour);
break;
case ROSE_CLEAR_CONFIRMATION:
rose_disconnect(sk, 0, -1, -1);
- rose->neighbour->use--;
+ rose_neigh_put(rose->neighbour);
break;
default:
@@ -121,7 +121,7 @@ static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int framety
case ROSE_CLEAR_REQUEST:
rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION);
rose_disconnect(sk, 0, skb->data[3], skb->data[4]);
- rose->neighbour->use--;
+ rose_neigh_put(rose->neighbour);
break;
case ROSE_RR:
@@ -234,7 +234,7 @@ static int rose_state4_machine(struct sock *sk, struct sk_buff *skb, int framety
case ROSE_CLEAR_REQUEST:
rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION);
rose_disconnect(sk, 0, skb->data[3], skb->data[4]);
- rose->neighbour->use--;
+ rose_neigh_put(rose->neighbour);
break;
default:
@@ -254,7 +254,7 @@ static int rose_state5_machine(struct sock *sk, struct sk_buff *skb, int framety
if (frametype == ROSE_CLEAR_REQUEST) {
rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION);
rose_disconnect(sk, 0, skb->data[3], skb->data[4]);
- rose_sk(sk)->neighbour->use--;
+ rose_neigh_put(rose_sk(sk)->neighbour);
}
return 0;
diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c
index b72bf8a08d48..a1e9b05ef6f5 100644
--- a/net/rose/rose_route.c
+++ b/net/rose/rose_route.c
@@ -93,11 +93,11 @@ static int __must_check rose_add_node(struct rose_route_struct *rose_route,
rose_neigh->ax25 = NULL;
rose_neigh->dev = dev;
rose_neigh->count = 0;
- rose_neigh->use = 0;
rose_neigh->dce_mode = 0;
rose_neigh->loopback = 0;
rose_neigh->number = rose_neigh_no++;
rose_neigh->restarted = 0;
+ refcount_set(&rose_neigh->use, 1);
skb_queue_head_init(&rose_neigh->queue);
@@ -178,6 +178,7 @@ static int __must_check rose_add_node(struct rose_route_struct *rose_route,
}
}
rose_neigh->count++;
+ rose_neigh_hold(rose_neigh);
goto out;
}
@@ -187,6 +188,7 @@ static int __must_check rose_add_node(struct rose_route_struct *rose_route,
rose_node->neighbour[rose_node->count] = rose_neigh;
rose_node->count++;
rose_neigh->count++;
+ rose_neigh_hold(rose_neigh);
}
out:
@@ -234,20 +236,12 @@ static void rose_remove_neigh(struct rose_neigh *rose_neigh)
if ((s = rose_neigh_list) == rose_neigh) {
rose_neigh_list = rose_neigh->next;
- if (rose_neigh->ax25)
- ax25_cb_put(rose_neigh->ax25);
- kfree(rose_neigh->digipeat);
- kfree(rose_neigh);
return;
}
while (s != NULL && s->next != NULL) {
if (s->next == rose_neigh) {
s->next = rose_neigh->next;
- if (rose_neigh->ax25)
- ax25_cb_put(rose_neigh->ax25);
- kfree(rose_neigh->digipeat);
- kfree(rose_neigh);
return;
}
@@ -263,10 +257,10 @@ static void rose_remove_route(struct rose_route *rose_route)
struct rose_route *s;
if (rose_route->neigh1 != NULL)
- rose_route->neigh1->use--;
+ rose_neigh_put(rose_route->neigh1);
if (rose_route->neigh2 != NULL)
- rose_route->neigh2->use--;
+ rose_neigh_put(rose_route->neigh2);
if ((s = rose_route_list) == rose_route) {
rose_route_list = rose_route->next;
@@ -330,9 +324,12 @@ static int rose_del_node(struct rose_route_struct *rose_route,
for (i = 0; i < rose_node->count; i++) {
if (rose_node->neighbour[i] == rose_neigh) {
rose_neigh->count--;
+ rose_neigh_put(rose_neigh);
- if (rose_neigh->count == 0 && rose_neigh->use == 0)
+ if (rose_neigh->count == 0) {
rose_remove_neigh(rose_neigh);
+ rose_neigh_put(rose_neigh);
+ }
rose_node->count--;
@@ -381,11 +378,11 @@ void rose_add_loopback_neigh(void)
sn->ax25 = NULL;
sn->dev = NULL;
sn->count = 0;
- sn->use = 0;
sn->dce_mode = 1;
sn->loopback = 1;
sn->number = rose_neigh_no++;
sn->restarted = 1;
+ refcount_set(&sn->use, 1);
skb_queue_head_init(&sn->queue);
@@ -436,6 +433,7 @@ int rose_add_loopback_node(const rose_address *address)
rose_node_list = rose_node;
rose_loopback_neigh->count++;
+ rose_neigh_hold(rose_loopback_neigh);
out:
spin_unlock_bh(&rose_node_list_lock);
@@ -467,6 +465,7 @@ void rose_del_loopback_node(const rose_address *address)
rose_remove_node(rose_node);
rose_loopback_neigh->count--;
+ rose_neigh_put(rose_loopback_neigh);
out:
spin_unlock_bh(&rose_node_list_lock);
@@ -506,6 +505,7 @@ void rose_rt_device_down(struct net_device *dev)
memmove(&t->neighbour[i], &t->neighbour[i + 1],
sizeof(t->neighbour[0]) *
(t->count - i));
+ rose_neigh_put(s);
}
if (t->count <= 0)
@@ -513,6 +513,7 @@ void rose_rt_device_down(struct net_device *dev)
}
rose_remove_neigh(s);
+ rose_neigh_put(s);
}
spin_unlock_bh(&rose_neigh_list_lock);
spin_unlock_bh(&rose_node_list_lock);
@@ -548,6 +549,7 @@ static int rose_clear_routes(void)
{
struct rose_neigh *s, *rose_neigh;
struct rose_node *t, *rose_node;
+ int i;
spin_lock_bh(&rose_node_list_lock);
spin_lock_bh(&rose_neigh_list_lock);
@@ -558,17 +560,21 @@ static int rose_clear_routes(void)
while (rose_node != NULL) {
t = rose_node;
rose_node = rose_node->next;
- if (!t->loopback)
+
+ if (!t->loopback) {
+ for (i = 0; i < t->count; i++)
+ rose_neigh_put(t->neighbour[i]);
rose_remove_node(t);
+ }
}
while (rose_neigh != NULL) {
s = rose_neigh;
rose_neigh = rose_neigh->next;
- if (s->use == 0 && !s->loopback) {
- s->count = 0;
+ if (!s->loopback) {
rose_remove_neigh(s);
+ rose_neigh_put(s);
}
}
@@ -684,6 +690,7 @@ struct rose_neigh *rose_get_neigh(rose_address *addr, unsigned char *cause,
for (i = 0; i < node->count; i++) {
if (node->neighbour[i]->restarted) {
res = node->neighbour[i];
+ rose_neigh_hold(node->neighbour[i]);
goto out;
}
}
@@ -695,6 +702,7 @@ struct rose_neigh *rose_get_neigh(rose_address *addr, unsigned char *cause,
for (i = 0; i < node->count; i++) {
if (!rose_ftimer_running(node->neighbour[i])) {
res = node->neighbour[i];
+ rose_neigh_hold(node->neighbour[i]);
goto out;
}
failed = 1;
@@ -784,13 +792,13 @@ static void rose_del_route_by_neigh(struct rose_neigh *rose_neigh)
}
if (rose_route->neigh1 == rose_neigh) {
- rose_route->neigh1->use--;
+ rose_neigh_put(rose_route->neigh1);
rose_route->neigh1 = NULL;
rose_transmit_clear_request(rose_route->neigh2, rose_route->lci2, ROSE_OUT_OF_ORDER, 0);
}
if (rose_route->neigh2 == rose_neigh) {
- rose_route->neigh2->use--;
+ rose_neigh_put(rose_route->neigh2);
rose_route->neigh2 = NULL;
rose_transmit_clear_request(rose_route->neigh1, rose_route->lci1, ROSE_OUT_OF_ORDER, 0);
}
@@ -919,7 +927,7 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25)
rose_clear_queues(sk);
rose->cause = ROSE_NETWORK_CONGESTION;
rose->diagnostic = 0;
- rose->neighbour->use--;
+ rose_neigh_put(rose->neighbour);
rose->neighbour = NULL;
rose->lci = 0;
rose->state = ROSE_STATE_0;
@@ -1044,12 +1052,12 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25)
if ((new_lci = rose_new_lci(new_neigh)) == 0) {
rose_transmit_clear_request(rose_neigh, lci, ROSE_NETWORK_CONGESTION, 71);
- goto out;
+ goto put_neigh;
}
if ((rose_route = kmalloc(sizeof(*rose_route), GFP_ATOMIC)) == NULL) {
rose_transmit_clear_request(rose_neigh, lci, ROSE_NETWORK_CONGESTION, 120);
- goto out;
+ goto put_neigh;
}
rose_route->lci1 = lci;
@@ -1062,8 +1070,8 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25)
rose_route->lci2 = new_lci;
rose_route->neigh2 = new_neigh;
- rose_route->neigh1->use++;
- rose_route->neigh2->use++;
+ rose_neigh_hold(rose_route->neigh1);
+ rose_neigh_hold(rose_route->neigh2);
rose_route->next = rose_route_list;
rose_route_list = rose_route;
@@ -1075,6 +1083,8 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25)
rose_transmit_link(skb, rose_route->neigh2);
res = 1;
+put_neigh:
+ rose_neigh_put(new_neigh);
out:
spin_unlock_bh(&rose_route_list_lock);
spin_unlock_bh(&rose_neigh_list_lock);
@@ -1190,7 +1200,7 @@ static int rose_neigh_show(struct seq_file *seq, void *v)
(rose_neigh->loopback) ? "RSLOOP-0" : ax2asc(buf, &rose_neigh->callsign),
rose_neigh->dev ? rose_neigh->dev->name : "???",
rose_neigh->count,
- rose_neigh->use,
+ refcount_read(&rose_neigh->use) - rose_neigh->count - 1,
(rose_neigh->dce_mode) ? "DCE" : "DTE",
(rose_neigh->restarted) ? "yes" : "no",
ax25_display_timer(&rose_neigh->t0timer) / HZ,
@@ -1295,18 +1305,22 @@ void __exit rose_rt_free(void)
struct rose_neigh *s, *rose_neigh = rose_neigh_list;
struct rose_node *t, *rose_node = rose_node_list;
struct rose_route *u, *rose_route = rose_route_list;
+ int i;
while (rose_neigh != NULL) {
s = rose_neigh;
rose_neigh = rose_neigh->next;
rose_remove_neigh(s);
+ rose_neigh_put(s);
}
while (rose_node != NULL) {
t = rose_node;
rose_node = rose_node->next;
+ for (i = 0; i < t->count; i++)
+ rose_neigh_put(t->neighbour[i]);
rose_remove_node(t);
}
diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c
index 020369c49587..bb60a1654d61 100644
--- a/net/rose/rose_timer.c
+++ b/net/rose/rose_timer.c
@@ -180,7 +180,7 @@ static void rose_timer_expiry(struct timer_list *t)
break;
case ROSE_STATE_2: /* T3 */
- rose->neighbour->use--;
+ rose_neigh_put(rose->neighbour);
rose_disconnect(sk, ETIMEDOUT, -1, -1);
break;
diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c
index 1e19c605bcc8..dce5a3d8a964 100644
--- a/net/rxrpc/rxgk.c
+++ b/net/rxrpc/rxgk.c
@@ -475,7 +475,7 @@ static int rxgk_verify_packet_integrity(struct rxrpc_call *call,
struct krb5_buffer metadata;
unsigned int offset = sp->offset, len = sp->len;
size_t data_offset = 0, data_len = len;
- u32 ac;
+ u32 ac = 0;
int ret = -ENOMEM;
_enter("");
@@ -499,9 +499,10 @@ static int rxgk_verify_packet_integrity(struct rxrpc_call *call,
ret = rxgk_verify_mic_skb(gk->krb5, gk->rx_Kc, &metadata,
skb, &offset, &len, &ac);
kfree(hdr);
- if (ret == -EPROTO) {
- rxrpc_abort_eproto(call, skb, ac,
- rxgk_abort_1_verify_mic_eproto);
+ if (ret < 0) {
+ if (ret != -ENOMEM)
+ rxrpc_abort_eproto(call, skb, ac,
+ rxgk_abort_1_verify_mic_eproto);
} else {
sp->offset = offset;
sp->len = len;
@@ -524,15 +525,16 @@ static int rxgk_verify_packet_encrypted(struct rxrpc_call *call,
struct rxgk_header hdr;
unsigned int offset = sp->offset, len = sp->len;
int ret;
- u32 ac;
+ u32 ac = 0;
_enter("");
ret = rxgk_decrypt_skb(gk->krb5, gk->rx_enc, skb, &offset, &len, &ac);
- if (ret == -EPROTO)
- rxrpc_abort_eproto(call, skb, ac, rxgk_abort_2_decrypt_eproto);
- if (ret < 0)
+ if (ret < 0) {
+ if (ret != -ENOMEM)
+ rxrpc_abort_eproto(call, skb, ac, rxgk_abort_2_decrypt_eproto);
goto error;
+ }
if (len < sizeof(hdr)) {
ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT,
diff --git a/net/rxrpc/rxgk_app.c b/net/rxrpc/rxgk_app.c
index b94b77a1c317..30275cb5ba3e 100644
--- a/net/rxrpc/rxgk_app.c
+++ b/net/rxrpc/rxgk_app.c
@@ -54,6 +54,10 @@ int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb,
_enter("");
+ if (ticket_len < 10 * sizeof(__be32))
+ return rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO,
+ rxgk_abort_resp_short_yfs_tkt);
+
/* Get the session key length */
ret = skb_copy_bits(skb, ticket_offset, tmp, sizeof(tmp));
if (ret < 0)
@@ -187,7 +191,7 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb,
struct key *server_key;
unsigned int ticket_offset, ticket_len;
u32 kvno, enctype;
- int ret, ec;
+ int ret, ec = 0;
struct {
__be32 kvno;
@@ -195,22 +199,23 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb,
__be32 token_len;
} container;
+ if (token_len < sizeof(container))
+ goto short_packet;
+
/* Decode the RXGK_TokenContainer object. This tells us which server
* key we should be using. We can then fetch the key, get the secret
* and set up the crypto to extract the token.
*/
if (skb_copy_bits(skb, token_offset, &container, sizeof(container)) < 0)
- return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO,
- rxgk_abort_resp_tok_short);
+ goto short_packet;
kvno = ntohl(container.kvno);
enctype = ntohl(container.enctype);
ticket_len = ntohl(container.token_len);
ticket_offset = token_offset + sizeof(container);
- if (xdr_round_up(ticket_len) > token_len - 3 * 4)
- return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO,
- rxgk_abort_resp_tok_short);
+ if (xdr_round_up(ticket_len) > token_len - sizeof(container))
+ goto short_packet;
_debug("KVNO %u", kvno);
_debug("ENC %u", enctype);
@@ -236,9 +241,11 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb,
&ticket_offset, &ticket_len, &ec);
crypto_free_aead(token_enc);
token_enc = NULL;
- if (ret < 0)
- return rxrpc_abort_conn(conn, skb, ec, ret,
- rxgk_abort_resp_tok_dec);
+ if (ret < 0) {
+ if (ret != -ENOMEM)
+ return rxrpc_abort_conn(conn, skb, ec, ret,
+ rxgk_abort_resp_tok_dec);
+ }
ret = conn->security->default_decode_ticket(conn, skb, ticket_offset,
ticket_len, _key);
@@ -283,4 +290,8 @@ temporary_error:
* also come out this way if the ticket decryption fails.
*/
return ret;
+
+short_packet:
+ return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO,
+ rxgk_abort_resp_tok_short);
}
diff --git a/net/rxrpc/rxgk_common.h b/net/rxrpc/rxgk_common.h
index 7370a5655985..80164d89e19c 100644
--- a/net/rxrpc/rxgk_common.h
+++ b/net/rxrpc/rxgk_common.h
@@ -88,11 +88,16 @@ int rxgk_decrypt_skb(const struct krb5_enctype *krb5,
*_offset += offset;
*_len = len;
break;
+ case -EBADMSG: /* Checksum mismatch. */
case -EPROTO:
- case -EBADMSG:
*_error_code = RXGK_SEALEDINCON;
break;
+ case -EMSGSIZE:
+ *_error_code = RXGK_PACKETSHORT;
+ break;
+ case -ENOPKG: /* Would prefer RXGK_BADETYPE, but not available for YFS. */
default:
+ *_error_code = RXGK_INCONSISTENCY;
break;
}
@@ -127,11 +132,16 @@ int rxgk_verify_mic_skb(const struct krb5_enctype *krb5,
*_offset += offset;
*_len = len;
break;
+ case -EBADMSG: /* Checksum mismatch */
case -EPROTO:
- case -EBADMSG:
*_error_code = RXGK_SEALEDINCON;
break;
+ case -EMSGSIZE:
+ *_error_code = RXGK_PACKETSHORT;
+ break;
+ case -ENOPKG: /* Would prefer RXGK_BADETYPE, but not available for YFS. */
default:
+ *_error_code = RXGK_INCONSISTENCY;
break;
}
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index dbcfb948c867..32bacfc314c2 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -1750,7 +1750,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
ktime_t now = ktime_get();
struct cake_tin_data *b;
struct cake_flow *flow;
- u32 idx;
+ u32 idx, tin;
/* choose flow to insert into */
idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
@@ -1760,6 +1760,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
__qdisc_drop(skb, to_free);
return ret;
}
+ tin = (u32)(b - q->tins);
idx--;
flow = &b->flows[idx];
@@ -1927,13 +1928,22 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
q->buffer_max_used = q->buffer_used;
if (q->buffer_used > q->buffer_limit) {
+ bool same_flow = false;
u32 dropped = 0;
+ u32 drop_id;
while (q->buffer_used > q->buffer_limit) {
dropped++;
- cake_drop(sch, to_free);
+ drop_id = cake_drop(sch, to_free);
+
+ if ((drop_id >> 16) == tin &&
+ (drop_id & 0xFFFF) == idx)
+ same_flow = true;
}
b->drop_overlimit += dropped;
+
+ if (same_flow)
+ return NET_XMIT_CN;
}
return NET_XMIT_SUCCESS;
}
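cake_drop() now reports which queue the overlimit loop evicted from, so cake_enqueue() can detect when it dropped the very flow it just enqueued and return NET_XMIT_CN instead of NET_XMIT_SUCCESS. The encoding side of drop_id is outside this excerpt; from the decode above ((drop_id >> 16) == tin, (drop_id & 0xFFFF) == idx) it is presumably packed along these lines (sketch only, local names assumed):

	/* inside cake_drop(), once the victim tin and flow index are known */
	return ((u32)tin << 16) | (idx & 0xFFFF);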
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index c93761040c6e..fa0314679e43 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -101,9 +101,9 @@ static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = {
static int codel_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
+ unsigned int dropped_pkts = 0, dropped_bytes = 0;
struct codel_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_CODEL_MAX + 1];
- unsigned int qlen, dropped = 0;
int err;
err = nla_parse_nested_deprecated(tb, TCA_CODEL_MAX, opt,
@@ -142,15 +142,17 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt,
WRITE_ONCE(q->params.ecn,
!!nla_get_u32(tb[TCA_CODEL_ECN]));
- qlen = sch->q.qlen;
while (sch->q.qlen > sch->limit) {
struct sk_buff *skb = qdisc_dequeue_internal(sch, true);
- dropped += qdisc_pkt_len(skb);
- qdisc_qstats_backlog_dec(sch, skb);
+ if (!skb)
+ break;
+
+ dropped_pkts++;
+ dropped_bytes += qdisc_pkt_len(skb);
rtnl_qdisc_drop(skb, sch);
}
- qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
+ qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes);
sch_tree_unlock(sch);
return 0;
diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c
index 845375ebd4ea..4b975feb52b1 100644
--- a/net/sched/sch_dualpi2.c
+++ b/net/sched/sch_dualpi2.c
@@ -927,7 +927,8 @@ static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt,
q->sch = sch;
dualpi2_reset_default(sch);
- hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+ hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_PINNED_SOFT);
if (opt && nla_len(opt)) {
err = dualpi2_change(sch, opt, extack);
@@ -937,7 +938,7 @@ static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt,
}
hrtimer_start(&q->pi2_timer, next_pi2_timeout(q),
- HRTIMER_MODE_ABS_PINNED);
+ HRTIMER_MODE_ABS_PINNED_SOFT);
return 0;
}
diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c
index 037f764822b9..82635dd2cfa5 100644
--- a/net/sched/sch_ets.c
+++ b/net/sched/sch_ets.c
@@ -651,6 +651,12 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt,
sch_tree_lock(sch);
+ for (i = nbands; i < oldbands; i++) {
+ if (i >= q->nstrict && q->classes[i].qdisc->q.qlen)
+ list_del_init(&q->classes[i].alist);
+ qdisc_purge_queue(q->classes[i].qdisc);
+ }
+
WRITE_ONCE(q->nbands, nbands);
for (i = nstrict; i < q->nstrict; i++) {
if (q->classes[i].qdisc->q.qlen) {
@@ -658,11 +664,6 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt,
q->classes[i].deficit = quanta[i];
}
}
- for (i = q->nbands; i < oldbands; i++) {
- if (i >= q->nstrict && q->classes[i].qdisc->q.qlen)
- list_del_init(&q->classes[i].alist);
- qdisc_purge_queue(q->classes[i].qdisc);
- }
WRITE_ONCE(q->nstrict, nstrict);
memcpy(q->prio2band, priomap, sizeof(priomap));
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 902ff5470607..fee922da2f99 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -1013,11 +1013,11 @@ static int fq_load_priomap(struct fq_sched_data *q,
static int fq_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
+ unsigned int dropped_pkts = 0, dropped_bytes = 0;
struct fq_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_FQ_MAX + 1];
- int err, drop_count = 0;
- unsigned drop_len = 0;
u32 fq_log;
+ int err;
err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy,
NULL);
@@ -1135,16 +1135,18 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
err = fq_resize(sch, fq_log);
sch_tree_lock(sch);
}
+
while (sch->q.qlen > sch->limit) {
struct sk_buff *skb = qdisc_dequeue_internal(sch, false);
if (!skb)
break;
- drop_len += qdisc_pkt_len(skb);
+
+ dropped_pkts++;
+ dropped_bytes += qdisc_pkt_len(skb);
rtnl_kfree_skbs(skb, skb);
- drop_count++;
}
- qdisc_tree_reduce_backlog(sch, drop_count, drop_len);
+ qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes);
sch_tree_unlock(sch);
return err;
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 2a0f3a513bfa..a14142392939 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -366,6 +366,7 @@ static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = {
static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
+ unsigned int dropped_pkts = 0, dropped_bytes = 0;
struct fq_codel_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_FQ_CODEL_MAX + 1];
u32 quantum = 0;
@@ -443,13 +444,14 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt,
q->memory_usage > q->memory_limit) {
struct sk_buff *skb = qdisc_dequeue_internal(sch, false);
- q->cstats.drop_len += qdisc_pkt_len(skb);
+ if (!skb)
+ break;
+
+ dropped_pkts++;
+ dropped_bytes += qdisc_pkt_len(skb);
rtnl_kfree_skbs(skb, skb);
- q->cstats.drop_count++;
}
- qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, q->cstats.drop_len);
- q->cstats.drop_count = 0;
- q->cstats.drop_len = 0;
+ qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes);
sch_tree_unlock(sch);
return 0;
diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c
index b0e34daf1f75..7b96bc3ff891 100644
--- a/net/sched/sch_fq_pie.c
+++ b/net/sched/sch_fq_pie.c
@@ -287,10 +287,9 @@ begin:
static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
+ unsigned int dropped_pkts = 0, dropped_bytes = 0;
struct fq_pie_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_FQ_PIE_MAX + 1];
- unsigned int len_dropped = 0;
- unsigned int num_dropped = 0;
int err;
err = nla_parse_nested(tb, TCA_FQ_PIE_MAX, opt, fq_pie_policy, extack);
@@ -368,11 +367,14 @@ static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt,
while (sch->q.qlen > sch->limit) {
struct sk_buff *skb = qdisc_dequeue_internal(sch, false);
- len_dropped += qdisc_pkt_len(skb);
- num_dropped += 1;
+ if (!skb)
+ break;
+
+ dropped_pkts++;
+ dropped_bytes += qdisc_pkt_len(skb);
rtnl_kfree_skbs(skb, skb);
}
- qdisc_tree_reduce_backlog(sch, num_dropped, len_dropped);
+ qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes);
sch_tree_unlock(sch);
return 0;
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index 5aa434b46707..2d4855e28a28 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -508,9 +508,9 @@ static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = {
static int hhf_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
+ unsigned int dropped_pkts = 0, dropped_bytes = 0;
struct hhf_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_HHF_MAX + 1];
- unsigned int qlen, prev_backlog;
int err;
u64 non_hh_quantum;
u32 new_quantum = q->quantum;
@@ -561,15 +561,17 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt,
usecs_to_jiffies(us));
}
- qlen = sch->q.qlen;
- prev_backlog = sch->qstats.backlog;
while (sch->q.qlen > sch->limit) {
struct sk_buff *skb = qdisc_dequeue_internal(sch, false);
+ if (!skb)
+ break;
+
+ dropped_pkts++;
+ dropped_bytes += qdisc_pkt_len(skb);
rtnl_kfree_skbs(skb, skb);
}
- qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen,
- prev_backlog - sch->qstats.backlog);
+ qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes);
sch_tree_unlock(sch);
return 0;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index c968ea763774..b5e40c51655a 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -592,7 +592,7 @@ htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
*/
static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
{
- WARN_ON(cl->level || !cl->leaf.q || !cl->leaf.q->q.qlen);
+ WARN_ON(cl->level || !cl->leaf.q);
if (!cl->prio_activity) {
cl->prio_activity = 1 << cl->prio;
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index ad46ee3ed5a9..0a377313b6a9 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -141,9 +141,9 @@ static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = {
static int pie_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
+ unsigned int dropped_pkts = 0, dropped_bytes = 0;
struct pie_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_PIE_MAX + 1];
- unsigned int qlen, dropped = 0;
int err;
err = nla_parse_nested_deprecated(tb, TCA_PIE_MAX, opt, pie_policy,
@@ -193,15 +193,17 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt,
nla_get_u32(tb[TCA_PIE_DQ_RATE_ESTIMATOR]));
/* Drop excess packets if new limit is lower */
- qlen = sch->q.qlen;
while (sch->q.qlen > sch->limit) {
struct sk_buff *skb = qdisc_dequeue_internal(sch, true);
- dropped += qdisc_pkt_len(skb);
- qdisc_qstats_backlog_dec(sch, skb);
+ if (!skb)
+ break;
+
+ dropped_pkts++;
+ dropped_bytes += qdisc_pkt_len(skb);
rtnl_qdisc_drop(skb, sch);
}
- qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
+ qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes);
sch_tree_unlock(sch);
return 0;
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 2dc2666988fb..7e99894778d4 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -117,7 +117,7 @@ int sctp_rcv(struct sk_buff *skb)
* it's better to just linearize it otherwise crc computing
* takes longer.
*/
- if ((!is_gso && skb_linearize(skb)) ||
+ if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) ||
!pskb_may_pull(skb, sizeof(struct sctphdr)))
goto discard_it;
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 3336dcfb4515..568ff8797c39 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -547,7 +547,9 @@ static void sctp_v6_from_sk(union sctp_addr *addr, struct sock *sk)
{
addr->v6.sin6_family = AF_INET6;
addr->v6.sin6_port = 0;
+ addr->v6.sin6_flowinfo = 0;
addr->v6.sin6_addr = sk->sk_v6_rcv_saddr;
+ addr->v6.sin6_scope_id = 0;
}
/* Initialize sk->sk_rcv_saddr from sctp_addr. */
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 9311c38f7abe..e0e48f24cd61 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -2568,8 +2568,9 @@ static void smc_listen_work(struct work_struct *work)
goto out_decl;
}
- smc_listen_out_connected(new_smc);
SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini);
+ /* smc_listen_out() will release smcsk */
+ smc_listen_out_connected(new_smc);
goto out_free;
out_unlock:
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 5a4db151fe95..08be56dfb3f2 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -426,8 +426,6 @@ smc_clc_msg_decl_valid(struct smc_clc_msg_decline *dclc)
{
struct smc_clc_msg_hdr *hdr = &dclc->hdr;
- if (hdr->typev1 != SMC_TYPE_R && hdr->typev1 != SMC_TYPE_D)
- return false;
if (hdr->version == SMC_V1) {
if (ntohs(hdr->length) != sizeof(struct smc_clc_msg_decline))
return false;
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 53828833a3f7..a42ef3f77b96 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -742,6 +742,9 @@ bool smc_ib_is_sg_need_sync(struct smc_link *lnk,
unsigned int i;
bool ret = false;
+ if (!lnk->smcibdev->ibdev->dma_device)
+ return ret;
+
/* for now there is just one DMA address */
for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
buf_slot->sgt[lnk->link_idx].nents, i) {
diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c
index 0eb00bbefd17..77cc1c6dc3e9 100644
--- a/net/smc/smc_loopback.c
+++ b/net/smc/smc_loopback.c
@@ -56,6 +56,7 @@ static int smc_lo_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb,
{
struct smc_lo_dmb_node *dmb_node, *tmp_node;
struct smc_lo_dev *ldev = smcd->priv;
+ struct folio *folio;
int sba_idx, rc;
/* check space for new dmb */
@@ -74,13 +75,16 @@ static int smc_lo_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb,
dmb_node->sba_idx = sba_idx;
dmb_node->len = dmb->dmb_len;
- dmb_node->cpu_addr = kzalloc(dmb_node->len, GFP_KERNEL |
- __GFP_NOWARN | __GFP_NORETRY |
- __GFP_NOMEMALLOC);
- if (!dmb_node->cpu_addr) {
+
+ /* not critical; fail under memory pressure and fall back to TCP */
+ folio = folio_alloc(GFP_KERNEL | __GFP_NOWARN | __GFP_NOMEMALLOC |
+ __GFP_NORETRY | __GFP_ZERO,
+ get_order(dmb_node->len));
+ if (!folio) {
rc = -ENOMEM;
goto err_node;
}
+ dmb_node->cpu_addr = folio_address(folio);
dmb_node->dma_addr = SMC_DMA_ADDR_INVALID;
refcount_set(&dmb_node->refcnt, 1);
@@ -122,7 +126,7 @@ static void __smc_lo_unregister_dmb(struct smc_lo_dev *ldev,
write_unlock_bh(&ldev->dmb_ht_lock);
clear_bit(dmb_node->sba_idx, ldev->sba_idx_mask);
- kvfree(dmb_node->cpu_addr);
+ folio_put(virt_to_folio(dmb_node->cpu_addr));
kfree(dmb_node);
if (atomic_dec_and_test(&ldev->dmb_cnt))
diff --git a/net/socket.c b/net/socket.c
index 682969deaed3..bac335ecee4c 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1176,6 +1176,9 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (sock->type == SOCK_SEQPACKET)
msg.msg_flags |= MSG_EOR;
+ if (iocb->ki_flags & IOCB_NOSIGNAL)
+ msg.msg_flags |= MSG_NOSIGNAL;
+
res = __sock_sendmsg(sock, &msg);
*from = msg.msg_iter;
return res;
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 73bc39281ef5..9b45fbdc90ca 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -276,8 +276,6 @@ EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue);
static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode)
{
- if (unlikely(current->flags & PF_EXITING))
- return -EINTR;
schedule();
if (signal_pending_state(mode, current))
return -ERESTARTSYS;
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 46c156b121db..e2c5e0e626f9 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -257,20 +257,47 @@ svc_tcp_sock_process_cmsg(struct socket *sock, struct msghdr *msg,
}
static int
-svc_tcp_sock_recv_cmsg(struct svc_sock *svsk, struct msghdr *msg)
+svc_tcp_sock_recv_cmsg(struct socket *sock, unsigned int *msg_flags)
{
union {
struct cmsghdr cmsg;
u8 buf[CMSG_SPACE(sizeof(u8))];
} u;
- struct socket *sock = svsk->sk_sock;
+ u8 alert[2];
+ struct kvec alert_kvec = {
+ .iov_base = alert,
+ .iov_len = sizeof(alert),
+ };
+ struct msghdr msg = {
+ .msg_flags = *msg_flags,
+ .msg_control = &u,
+ .msg_controllen = sizeof(u),
+ };
+ int ret;
+
+ iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, 1,
+ alert_kvec.iov_len);
+ ret = sock_recvmsg(sock, &msg, MSG_DONTWAIT);
+ if (ret > 0 &&
+ tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) {
+ iov_iter_revert(&msg.msg_iter, ret);
+ ret = svc_tcp_sock_process_cmsg(sock, &msg, &u.cmsg, -EAGAIN);
+ }
+ return ret;
+}
+
+static int
+svc_tcp_sock_recvmsg(struct svc_sock *svsk, struct msghdr *msg)
+{
int ret;
+ struct socket *sock = svsk->sk_sock;
- msg->msg_control = &u;
- msg->msg_controllen = sizeof(u);
ret = sock_recvmsg(sock, msg, MSG_DONTWAIT);
- if (unlikely(msg->msg_controllen != sizeof(u)))
- ret = svc_tcp_sock_process_cmsg(sock, msg, &u.cmsg, ret);
+ if (msg->msg_flags & MSG_CTRUNC) {
+ msg->msg_flags &= ~(MSG_CTRUNC | MSG_EOR);
+ if (ret == 0 || ret == -EIO)
+ ret = svc_tcp_sock_recv_cmsg(sock, &msg->msg_flags);
+ }
return ret;
}
@@ -321,7 +348,7 @@ static ssize_t svc_tcp_read_msg(struct svc_rqst *rqstp, size_t buflen,
iov_iter_advance(&msg.msg_iter, seek);
buflen -= seek;
}
- len = svc_tcp_sock_recv_cmsg(svsk, &msg);
+ len = svc_tcp_sock_recvmsg(svsk, &msg);
if (len > 0)
svc_flush_bvec(bvec, len, seek);
@@ -1018,7 +1045,7 @@ static ssize_t svc_tcp_read_marker(struct svc_sock *svsk,
iov.iov_base = ((char *)&svsk->sk_marker) + svsk->sk_tcplen;
iov.iov_len = want;
iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, want);
- len = svc_tcp_sock_recv_cmsg(svsk, &msg);
+ len = svc_tcp_sock_recvmsg(svsk, &msg);
if (len < 0)
return len;
svsk->sk_tcplen += len;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index c5f7bbf5775f..3aa987e7f072 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -407,9 +407,9 @@ xs_sock_recv_cmsg(struct socket *sock, unsigned int *msg_flags, int flags)
iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, 1,
alert_kvec.iov_len);
ret = sock_recvmsg(sock, &msg, flags);
- if (ret > 0 &&
- tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) {
- iov_iter_revert(&msg.msg_iter, ret);
+ if (ret > 0) {
+ if (tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT)
+ iov_iter_revert(&msg.msg_iter, ret);
ret = xs_sock_process_cmsg(sock, &msg, msg_flags, &u.cmsg,
-EAGAIN);
}
diff --git a/net/tls/tls.h b/net/tls/tls.h
index 774859b63f0d..e4c42731ce39 100644
--- a/net/tls/tls.h
+++ b/net/tls/tls.h
@@ -141,6 +141,7 @@ void update_sk_prot(struct sock *sk, struct tls_context *ctx);
int wait_on_pending_writer(struct sock *sk, long *timeo);
void tls_err_abort(struct sock *sk, int err);
+void tls_strp_abort_strp(struct tls_strparser *strp, int err);
int init_prot_info(struct tls_prot_info *prot,
const struct tls_crypto_info *crypto_info,
@@ -196,7 +197,7 @@ void tls_strp_msg_done(struct tls_strparser *strp);
int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb);
void tls_rx_msg_ready(struct tls_strparser *strp);
-void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh);
+bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh);
int tls_strp_msg_cow(struct tls_sw_context_rx *ctx);
struct sk_buff *tls_strp_msg_detach(struct tls_sw_context_rx *ctx);
int tls_strp_msg_hold(struct tls_strparser *strp, struct sk_buff_head *dst);
diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c
index 095cf31bae0b..98e12f0ff57e 100644
--- a/net/tls/tls_strp.c
+++ b/net/tls/tls_strp.c
@@ -13,7 +13,7 @@
static struct workqueue_struct *tls_strp_wq;
-static void tls_strp_abort_strp(struct tls_strparser *strp, int err)
+void tls_strp_abort_strp(struct tls_strparser *strp, int err)
{
if (strp->stopped)
return;
@@ -211,11 +211,17 @@ static int tls_strp_copyin_frag(struct tls_strparser *strp, struct sk_buff *skb,
struct sk_buff *in_skb, unsigned int offset,
size_t in_len)
{
+ unsigned int nfrag = skb->len / PAGE_SIZE;
size_t len, chunk;
skb_frag_t *frag;
int sz;
- frag = &skb_shinfo(skb)->frags[skb->len / PAGE_SIZE];
+ if (unlikely(nfrag >= skb_shinfo(skb)->nr_frags)) {
+ DEBUG_NET_WARN_ON_ONCE(1);
+ return -EMSGSIZE;
+ }
+
+ frag = &skb_shinfo(skb)->frags[nfrag];
len = in_len;
/* First make sure we got the header */
@@ -475,7 +481,7 @@ static void tls_strp_load_anchor_with_queue(struct tls_strparser *strp, int len)
strp->stm.offset = offset;
}
-void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
+bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
{
struct strp_msg *rxm;
struct tls_msg *tlm;
@@ -484,8 +490,11 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
DEBUG_NET_WARN_ON_ONCE(!strp->stm.full_len);
if (!strp->copy_mode && force_refresh) {
- if (WARN_ON(tcp_inq(strp->sk) < strp->stm.full_len))
- return;
+ if (unlikely(tcp_inq(strp->sk) < strp->stm.full_len)) {
+ WRITE_ONCE(strp->msg_ready, 0);
+ memset(&strp->stm, 0, sizeof(strp->stm));
+ return false;
+ }
tls_strp_load_anchor_with_queue(strp, strp->stm.full_len);
}
@@ -495,6 +504,8 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh)
rxm->offset = strp->stm.offset;
tlm = tls_msg(strp->anchor);
tlm->control = strp->mark;
+
+ return true;
}
/* Called with lock held on lower socket */
@@ -515,10 +526,8 @@ static int tls_strp_read_sock(struct tls_strparser *strp)
tls_strp_load_anchor_with_queue(strp, inq);
if (!strp->stm.full_len) {
sz = tls_rx_msg_size(strp, strp->anchor);
- if (sz < 0) {
- tls_strp_abort_strp(strp, sz);
+ if (sz < 0)
return sz;
- }
strp->stm.full_len = sz;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 549d1ea01a72..daac9fd4be7e 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1384,7 +1384,8 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock,
return sock_intr_errno(timeo);
}
- tls_strp_msg_load(&ctx->strp, released);
+ if (unlikely(!tls_strp_msg_load(&ctx->strp, released)))
+ return tls_rx_rec_wait(sk, psock, nonblock, false);
return 1;
}
@@ -1807,6 +1808,9 @@ int decrypt_skb(struct sock *sk, struct scatterlist *sgout)
return tls_decrypt_sg(sk, NULL, sgout, &darg);
}
+/* All records returned from a recvmsg() call must have the same type.
+ * 0 is not a valid content type. Use it as "no type reported, yet".
+ */
static int tls_record_content_type(struct msghdr *msg, struct tls_msg *tlm,
u8 *control)
{
@@ -2050,8 +2054,10 @@ int tls_sw_recvmsg(struct sock *sk,
if (err < 0)
goto end;
+ /* process_rx_list() will set @control if it processed any records */
copied = err;
- if (len <= copied || (copied && control != TLS_RECORD_TYPE_DATA) || rx_more)
+ if (len <= copied || rx_more ||
+ (control && control != TLS_RECORD_TYPE_DATA))
goto end;
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
@@ -2468,8 +2474,7 @@ int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb)
return data_len + TLS_HEADER_SIZE;
read_failure:
- tls_err_abort(strp->sk, ret);
-
+ tls_strp_abort_strp(strp, ret);
return ret;
}
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 6d7c110814ff..768098dec231 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1387,7 +1387,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
* Get the parent directory, calculate the hash for last
* component.
*/
- dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
+ dentry = start_creating_path(AT_FDCWD, addr->name->sun_path, &parent, 0);
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
goto out;
@@ -1417,7 +1417,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
unix_table_double_unlock(net, old_hash, new_hash);
unix_insert_bsd_socket(sk);
mutex_unlock(&u->bindlock);
- done_path_create(&parent, dentry);
+ end_creating_path(&parent, dentry);
return 0;
out_unlock:
@@ -1427,7 +1427,7 @@ out_unlink:
/* failed after successful mknod? unlink what we'd created... */
vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
out_path:
- done_path_create(&parent, dentry);
+ end_creating_path(&parent, dentry);
out:
unix_release_addr(addr);
return err == -EEXIST ? -EADDRINUSE : err;
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index ead6a3c14b87..bebb355f3ffe 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -689,7 +689,8 @@ static int __vsock_bind_connectible(struct vsock_sock *vsk,
unsigned int i;
for (i = 0; i < MAX_PORT_RETRIES; i++) {
- if (port <= LAST_RESERVED_PORT)
+ if (port == VMADDR_PORT_ANY ||
+ port <= LAST_RESERVED_PORT)
port = LAST_RESERVED_PORT + 1;
new_addr.svm_port = port++;
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index fe92e5fa95b4..dcc8a1d5851e 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -105,12 +105,14 @@ static int virtio_transport_fill_skb(struct sk_buff *skb,
size_t len,
bool zcopy)
{
+ struct msghdr *msg = info->msg;
+
if (zcopy)
- return __zerocopy_sg_from_iter(info->msg, NULL, skb,
- &info->msg->msg_iter, len, NULL);
+ return __zerocopy_sg_from_iter(msg, NULL, skb,
+ &msg->msg_iter, len, NULL);
virtio_vsock_skb_put(skb, len);
- return skb_copy_datagram_from_iter(skb, 0, &info->msg->msg_iter, len);
+ return skb_copy_datagram_from_iter_full(skb, 0, &msg->msg_iter, len);
}
static void virtio_transport_init_hdr(struct sk_buff *skb,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 89519aa52893..852573423e52 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -7062,7 +7062,8 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
u32 seq, int flags,
struct cfg80211_registered_device *rdev,
struct net_device *dev,
- const u8 *mac_addr, struct station_info *sinfo)
+ const u8 *mac_addr, struct station_info *sinfo,
+ bool link_stats)
{
void *hdr;
struct nlattr *sinfoattr, *bss_param;
@@ -7283,7 +7284,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
goto nla_put_failure;
}
- if (sinfo->valid_links) {
+ if (link_stats && sinfo->valid_links) {
links = nla_nest_start(msg, NL80211_ATTR_MLO_LINKS);
if (!links)
goto nla_put_failure;
@@ -7574,7 +7575,7 @@ static int nl80211_dump_station(struct sk_buff *skb,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
rdev, wdev->netdev, mac_addr,
- &sinfo) < 0)
+ &sinfo, false) < 0)
goto out;
sta_idx++;
@@ -7635,7 +7636,7 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info)
if (nl80211_send_station(msg, NL80211_CMD_NEW_STATION,
info->snd_portid, info->snd_seq, 0,
- rdev, dev, mac_addr, &sinfo) < 0) {
+ rdev, dev, mac_addr, &sinfo, false) < 0) {
nlmsg_free(msg);
return -ENOBUFS;
}
@@ -19680,7 +19681,7 @@ void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr,
return;
if (nl80211_send_station(msg, NL80211_CMD_NEW_STATION, 0, 0, 0,
- rdev, dev, mac_addr, sinfo) < 0) {
+ rdev, dev, mac_addr, sinfo, false) < 0) {
nlmsg_free(msg);
return;
}
@@ -19710,7 +19711,7 @@ void cfg80211_del_sta_sinfo(struct net_device *dev, const u8 *mac_addr,
}
if (nl80211_send_station(msg, NL80211_CMD_DEL_STATION, 0, 0, 0,
- rdev, dev, mac_addr, sinfo) < 0) {
+ rdev, dev, mac_addr, sinfo, false) < 0) {
nlmsg_free(msg);
return;
}
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index a8339ed52404..6c7b7c3828a4 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -1916,7 +1916,8 @@ cfg80211_update_known_bss(struct cfg80211_registered_device *rdev,
*/
f = rcu_access_pointer(new->pub.beacon_ies);
- kfree_rcu((struct cfg80211_bss_ies *)f, rcu_head);
+ if (!new->pub.hidden_beacon_bss)
+ kfree_rcu((struct cfg80211_bss_ies *)f, rcu_head);
return false;
}
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 826ec0a6355f..3a028ff287fb 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -900,13 +900,16 @@ void __cfg80211_connect_result(struct net_device *dev,
if (!wdev->u.client.ssid_len) {
rcu_read_lock();
for_each_valid_link(cr, link) {
+ u32 ssid_len;
+
ssid = ieee80211_bss_get_elem(cr->links[link].bss,
WLAN_EID_SSID);
if (!ssid || !ssid->datalen)
continue;
- memcpy(wdev->u.client.ssid, ssid->data, ssid->datalen);
+ ssid_len = min(ssid->datalen, IEEE80211_MAX_SSID_LEN);
+ memcpy(wdev->u.client.ssid, ssid->data, ssid_len);
wdev->u.client.ssid_len = ssid->datalen;
break;
}
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 9c3acecc14b1..72e34bd2d925 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -36,6 +36,20 @@
#define TX_BATCH_SIZE 32
#define MAX_PER_SOCKET_BUDGET 32
+struct xsk_addr_node {
+ u64 addr;
+ struct list_head addr_node;
+};
+
+struct xsk_addr_head {
+ u32 num_descs;
+ struct list_head addrs_list;
+};
+
+static struct kmem_cache *xsk_tx_generic_cache;
+
+#define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb))
+
void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
{
if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
@@ -532,24 +546,43 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
}
-static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr)
+static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
{
unsigned long flags;
int ret;
spin_lock_irqsave(&pool->cq_lock, flags);
- ret = xskq_prod_reserve_addr(pool->cq, addr);
+ ret = xskq_prod_reserve(pool->cq);
spin_unlock_irqrestore(&pool->cq_lock, flags);
return ret;
}
-static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n)
+static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
+ struct sk_buff *skb)
{
+ struct xsk_addr_node *pos, *tmp;
+ u32 descs_processed = 0;
unsigned long flags;
+ u32 idx;
spin_lock_irqsave(&pool->cq_lock, flags);
- xskq_prod_submit_n(pool->cq, n);
+ idx = xskq_get_prod(pool->cq);
+
+ xskq_prod_write_addr(pool->cq, idx,
+ (u64)(uintptr_t)skb_shinfo(skb)->destructor_arg);
+ descs_processed++;
+
+ if (unlikely(XSKCB(skb)->num_descs > 1)) {
+ list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
+ xskq_prod_write_addr(pool->cq, idx + descs_processed,
+ pos->addr);
+ descs_processed++;
+ list_del(&pos->addr_node);
+ kmem_cache_free(xsk_tx_generic_cache, pos);
+ }
+ }
+ xskq_prod_submit_n(pool->cq, descs_processed);
spin_unlock_irqrestore(&pool->cq_lock, flags);
}
@@ -562,9 +595,14 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
spin_unlock_irqrestore(&pool->cq_lock, flags);
}
+static void xsk_inc_num_desc(struct sk_buff *skb)
+{
+ XSKCB(skb)->num_descs++;
+}
+
static u32 xsk_get_num_desc(struct sk_buff *skb)
{
- return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
+ return XSKCB(skb)->num_descs;
}
static void xsk_destruct_skb(struct sk_buff *skb)
@@ -576,23 +614,33 @@ static void xsk_destruct_skb(struct sk_buff *skb)
*compl->tx_timestamp = ktime_get_tai_fast_ns();
}
- xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb));
+ xsk_cq_submit_addr_locked(xdp_sk(skb->sk)->pool, skb);
sock_wfree(skb);
}
-static void xsk_set_destructor_arg(struct sk_buff *skb)
+static void xsk_set_destructor_arg(struct sk_buff *skb, u64 addr)
{
- long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
-
- skb_shinfo(skb)->destructor_arg = (void *)num;
+ BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb));
+ INIT_LIST_HEAD(&XSKCB(skb)->addrs_list);
+ XSKCB(skb)->num_descs = 0;
+ skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr;
}
static void xsk_consume_skb(struct sk_buff *skb)
{
struct xdp_sock *xs = xdp_sk(skb->sk);
+ u32 num_descs = xsk_get_num_desc(skb);
+ struct xsk_addr_node *pos, *tmp;
+
+ if (unlikely(num_descs > 1)) {
+ list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
+ list_del(&pos->addr_node);
+ kmem_cache_free(xsk_tx_generic_cache, pos);
+ }
+ }
skb->destructor = sock_wfree;
- xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb));
+ xsk_cq_cancel_locked(xs->pool, num_descs);
/* Free skb without triggering the perf drop trace */
consume_skb(skb);
xs->skb = NULL;
@@ -609,6 +657,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
{
struct xsk_buff_pool *pool = xs->pool;
u32 hr, len, ts, offset, copy, copied;
+ struct xsk_addr_node *xsk_addr;
struct sk_buff *skb = xs->skb;
struct page *page;
void *buffer;
@@ -623,6 +672,19 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
return ERR_PTR(err);
skb_reserve(skb, hr);
+
+ xsk_set_destructor_arg(skb, desc->addr);
+ } else {
+ xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
+ if (!xsk_addr)
+ return ERR_PTR(-ENOMEM);
+
+ /* in case of the -EOVERFLOW that could happen below,
+ * xsk_consume_skb() will release this node, as the whole skb
+ * would be dropped, which implies freeing all list elements
+ */
+ xsk_addr->addr = desc->addr;
+ list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
}
addr = desc->addr;
@@ -694,8 +756,11 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
err = skb_store_bits(skb, 0, buffer, len);
if (unlikely(err))
goto free_err;
+
+ xsk_set_destructor_arg(skb, desc->addr);
} else {
int nr_frags = skb_shinfo(skb)->nr_frags;
+ struct xsk_addr_node *xsk_addr;
struct page *page;
u8 *vaddr;
@@ -710,12 +775,22 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
goto free_err;
}
+ xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
+ if (!xsk_addr) {
+ __free_page(page);
+ err = -ENOMEM;
+ goto free_err;
+ }
+
vaddr = kmap_local_page(page);
memcpy(vaddr, buffer, len);
kunmap_local(vaddr);
skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
+
+ xsk_addr->addr = desc->addr;
+ list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
}
if (first_frag && desc->options & XDP_TX_METADATA) {
@@ -759,7 +834,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
skb->mark = READ_ONCE(xs->sk.sk_mark);
skb->destructor = xsk_destruct_skb;
xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
- xsk_set_destructor_arg(skb);
+ xsk_inc_num_desc(skb);
return skb;
@@ -769,7 +844,7 @@ free_err:
if (err == -EOVERFLOW) {
/* Drop the packet */
- xsk_set_destructor_arg(xs->skb);
+ xsk_inc_num_desc(xs->skb);
xsk_drop_skb(xs->skb);
xskq_cons_release(xs->tx);
} else {
@@ -812,7 +887,7 @@ static int __xsk_generic_xmit(struct sock *sk)
* if there is space in it. This avoids having to implement
* any buffering in the Tx path.
*/
- err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr);
+ err = xsk_cq_reserve_locked(xs->pool);
if (err) {
err = -EAGAIN;
goto out;
@@ -1815,8 +1890,18 @@ static int __init xsk_init(void)
if (err)
goto out_pernet;
+ xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache",
+ sizeof(struct xsk_addr_node),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (!xsk_tx_generic_cache) {
+ err = -ENOMEM;
+ goto out_unreg_notif;
+ }
+
return 0;
+out_unreg_notif:
+ unregister_netdevice_notifier(&xsk_netdev_notifier);
out_pernet:
unregister_pernet_subsys(&xsk_net_ops);
out_sk:
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 46d87e961ad6..f16f390370dc 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -344,6 +344,11 @@ static inline u32 xskq_cons_present_entries(struct xsk_queue *q)
/* Functions for producers */
+static inline u32 xskq_get_prod(struct xsk_queue *q)
+{
+ return READ_ONCE(q->ring->producer);
+}
+
static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max)
{
u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons);
@@ -390,6 +395,13 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr)
return 0;
}
+static inline void xskq_prod_write_addr(struct xsk_queue *q, u32 idx, u64 addr)
+{
+ struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+
+ ring->desc[idx & q->ring_mask] = addr;
+}
+
static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs,
u32 nb_entries)
{
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index d2819baea414..44b9de6e4e77 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -155,7 +155,8 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur
return skb;
}
- if (skb_is_gso(skb) && unlikely(xmit_xfrm_check_overflow(skb))) {
+ if (skb_is_gso(skb) && (unlikely(x->xso.dev != dev) ||
+ unlikely(xmit_xfrm_check_overflow(skb)))) {
struct sk_buff *segs;
/* Packet got rerouted, fixup features and segment it. */
@@ -415,10 +416,12 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
struct net_device *dev = x->xso.dev;
bool check_tunnel_size;
- if (x->xso.type == XFRM_DEV_OFFLOAD_UNSPECIFIED)
+ if (!x->type_offload ||
+ (x->xso.type == XFRM_DEV_OFFLOAD_UNSPECIFIED && x->encap))
return false;
- if ((dev == xfrm_dst_path(dst)->dev) && !xdst->child->xfrm) {
+ if ((!dev || dev == xfrm_dst_path(dst)->dev) &&
+ !xdst->child->xfrm) {
mtu = xfrm_state_mtu(x, xdst->child_mtu_cached);
if (skb->len <= mtu)
goto ok;
@@ -430,9 +433,12 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
return false;
ok:
+ if (!dev)
+ return true;
+
check_tunnel_size = x->xso.type == XFRM_DEV_OFFLOAD_PACKET &&
x->props.mode == XFRM_MODE_TUNNEL;
- switch (x->props.family) {
+ switch (x->inner_mode.family) {
case AF_INET:
/* Check for IPv4 options */
if (ip_hdr(skb)->ihl != 5)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 77db3b5fe4ac..d213ca3653a8 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -2583,6 +2583,8 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high,
for (h = 0; h < range; h++) {
u32 spi = (low == high) ? low : get_random_u32_inclusive(low, high);
+ if (spi == 0)
+ goto next;
newspi = htonl(spi);
spin_lock_bh(&net->xfrm.xfrm_state_lock);
@@ -2598,6 +2600,7 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high,
xfrm_state_put(x0);
spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+next:
if (signal_pending(current)) {
err = -ERESTARTSYS;
goto unlock;
@@ -3297,7 +3300,7 @@ void xfrm_state_fini(struct net *net)
unsigned int sz;
flush_work(&net->xfrm.state_hash_work);
- xfrm_state_flush(net, IPSEC_PROTO_ANY, false);
+ xfrm_state_flush(net, 0, false);
flush_work(&xfrm_state_gc_work);
WARN_ON(!list_empty(&net->xfrm.state_all));
diff --git a/rust/Makefile b/rust/Makefile
index 4263462b8470..bfa915b0e588 100644
--- a/rust/Makefile
+++ b/rust/Makefile
@@ -65,6 +65,10 @@ core-cfgs = \
core-edition := $(if $(call rustc-min-version,108700),2024,2021)
+# `rustdoc` did not save the target modifiers, thus work around it for
+# the time being (https://github.com/rust-lang/rust/issues/144521).
+rustdoc_modifiers_workaround := $(if $(call rustc-min-version,108800),-Cunsafe-allow-abi-mismatch=fixed-x18)
+
# `rustc` recognizes `--remap-path-prefix` since 1.26.0, but `rustdoc` only
# since Rust 1.81.0. Moreover, `rustdoc` ICEs on out-of-tree builds since Rust
# 1.82.0 (https://github.com/rust-lang/rust/issues/138520). Thus workaround both
@@ -77,6 +81,7 @@ quiet_cmd_rustdoc = RUSTDOC $(if $(rustdoc_host),H, ) $<
-Zunstable-options --generate-link-to-definition \
--output $(rustdoc_output) \
--crate-name $(subst rustdoc-,,$@) \
+ $(rustdoc_modifiers_workaround) \
$(if $(rustdoc_host),,--sysroot=/dev/null) \
@$(objtree)/include/generated/rustc_cfg $<
@@ -106,14 +111,14 @@ rustdoc: rustdoc-core rustdoc-macros rustdoc-compiler_builtins \
rustdoc-macros: private rustdoc_host = yes
rustdoc-macros: private rustc_target_flags = --crate-type proc-macro \
--extern proc_macro
-rustdoc-macros: $(src)/macros/lib.rs FORCE
+rustdoc-macros: $(src)/macros/lib.rs rustdoc-clean FORCE
+$(call if_changed,rustdoc)
# Starting with Rust 1.82.0, skipping `-Wrustdoc::unescaped_backticks` should
# not be needed -- see https://github.com/rust-lang/rust/pull/128307.
rustdoc-core: private skip_flags = --edition=2021 -Wrustdoc::unescaped_backticks
rustdoc-core: private rustc_target_flags = --edition=$(core-edition) $(core-cfgs)
-rustdoc-core: $(RUST_LIB_SRC)/core/src/lib.rs FORCE
+rustdoc-core: $(RUST_LIB_SRC)/core/src/lib.rs rustdoc-clean FORCE
+$(call if_changed,rustdoc)
rustdoc-compiler_builtins: $(src)/compiler_builtins.rs rustdoc-core FORCE
@@ -125,7 +130,8 @@ rustdoc-ffi: $(src)/ffi.rs rustdoc-core FORCE
rustdoc-pin_init_internal: private rustdoc_host = yes
rustdoc-pin_init_internal: private rustc_target_flags = --cfg kernel \
--extern proc_macro --crate-type proc-macro
-rustdoc-pin_init_internal: $(src)/pin-init/internal/src/lib.rs FORCE
+rustdoc-pin_init_internal: $(src)/pin-init/internal/src/lib.rs \
+ rustdoc-clean FORCE
+$(call if_changed,rustdoc)
rustdoc-pin_init: private rustdoc_host = yes
@@ -143,6 +149,9 @@ rustdoc-kernel: $(src)/kernel/lib.rs rustdoc-core rustdoc-ffi rustdoc-macros \
$(obj)/bindings.o FORCE
+$(call if_changed,rustdoc)
+rustdoc-clean: FORCE
+ $(Q)rm -rf $(rustdoc_output)
+
quiet_cmd_rustc_test_library = $(RUSTC_OR_CLIPPY_QUIET) TL $<
cmd_rustc_test_library = \
OBJTREE=$(abspath $(objtree)) \
@@ -215,6 +224,7 @@ quiet_cmd_rustdoc_test_kernel = RUSTDOC TK $<
--extern bindings --extern uapi \
--no-run --crate-name kernel -Zunstable-options \
--sysroot=/dev/null \
+ $(rustdoc_modifiers_workaround) \
--test-builder $(objtree)/scripts/rustdoc_test_builder \
$< $(rustdoc_test_kernel_quiet); \
$(objtree)/scripts/rustdoc_test_gen
diff --git a/rust/helpers/atomic.c b/rust/helpers/atomic.c
new file mode 100644
index 000000000000..cf06b7ef9a1c
--- /dev/null
+++ b/rust/helpers/atomic.c
@@ -0,0 +1,1040 @@
+// SPDX-License-Identifier: GPL-2.0
+
+// Generated by scripts/atomic/gen-rust-atomic-helpers.sh
+// DO NOT MODIFY THIS FILE DIRECTLY
+
+/*
+ * This file provides helpers for the various atomic functions for Rust.
+ */
+#ifndef _RUST_ATOMIC_API_H
+#define _RUST_ATOMIC_API_H
+
+#include <linux/atomic.h>
+
+// TODO: Remove this after INLINE_HELPERS support is added.
+#ifndef __rust_helper
+#define __rust_helper
+#endif
+
+__rust_helper int
+rust_helper_atomic_read(const atomic_t *v)
+{
+ return atomic_read(v);
+}
+
+__rust_helper int
+rust_helper_atomic_read_acquire(const atomic_t *v)
+{
+ return atomic_read_acquire(v);
+}
+
+__rust_helper void
+rust_helper_atomic_set(atomic_t *v, int i)
+{
+ atomic_set(v, i);
+}
+
+__rust_helper void
+rust_helper_atomic_set_release(atomic_t *v, int i)
+{
+ atomic_set_release(v, i);
+}
+
+__rust_helper void
+rust_helper_atomic_add(int i, atomic_t *v)
+{
+ atomic_add(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_add_return(int i, atomic_t *v)
+{
+ return atomic_add_return(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_add_return_acquire(int i, atomic_t *v)
+{
+ return atomic_add_return_acquire(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_add_return_release(int i, atomic_t *v)
+{
+ return atomic_add_return_release(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_add_return_relaxed(int i, atomic_t *v)
+{
+ return atomic_add_return_relaxed(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_add(int i, atomic_t *v)
+{
+ return atomic_fetch_add(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_add_acquire(int i, atomic_t *v)
+{
+ return atomic_fetch_add_acquire(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_add_release(int i, atomic_t *v)
+{
+ return atomic_fetch_add_release(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_add_relaxed(int i, atomic_t *v)
+{
+ return atomic_fetch_add_relaxed(i, v);
+}
+
+__rust_helper void
+rust_helper_atomic_sub(int i, atomic_t *v)
+{
+ atomic_sub(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_sub_return(int i, atomic_t *v)
+{
+ return atomic_sub_return(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_sub_return_acquire(int i, atomic_t *v)
+{
+ return atomic_sub_return_acquire(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_sub_return_release(int i, atomic_t *v)
+{
+ return atomic_sub_return_release(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_sub_return_relaxed(int i, atomic_t *v)
+{
+ return atomic_sub_return_relaxed(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_sub(int i, atomic_t *v)
+{
+ return atomic_fetch_sub(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_sub_acquire(int i, atomic_t *v)
+{
+ return atomic_fetch_sub_acquire(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_sub_release(int i, atomic_t *v)
+{
+ return atomic_fetch_sub_release(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_sub_relaxed(int i, atomic_t *v)
+{
+ return atomic_fetch_sub_relaxed(i, v);
+}
+
+__rust_helper void
+rust_helper_atomic_inc(atomic_t *v)
+{
+ atomic_inc(v);
+}
+
+__rust_helper int
+rust_helper_atomic_inc_return(atomic_t *v)
+{
+ return atomic_inc_return(v);
+}
+
+__rust_helper int
+rust_helper_atomic_inc_return_acquire(atomic_t *v)
+{
+ return atomic_inc_return_acquire(v);
+}
+
+__rust_helper int
+rust_helper_atomic_inc_return_release(atomic_t *v)
+{
+ return atomic_inc_return_release(v);
+}
+
+__rust_helper int
+rust_helper_atomic_inc_return_relaxed(atomic_t *v)
+{
+ return atomic_inc_return_relaxed(v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_inc(atomic_t *v)
+{
+ return atomic_fetch_inc(v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_inc_acquire(atomic_t *v)
+{
+ return atomic_fetch_inc_acquire(v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_inc_release(atomic_t *v)
+{
+ return atomic_fetch_inc_release(v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_inc_relaxed(atomic_t *v)
+{
+ return atomic_fetch_inc_relaxed(v);
+}
+
+__rust_helper void
+rust_helper_atomic_dec(atomic_t *v)
+{
+ atomic_dec(v);
+}
+
+__rust_helper int
+rust_helper_atomic_dec_return(atomic_t *v)
+{
+ return atomic_dec_return(v);
+}
+
+__rust_helper int
+rust_helper_atomic_dec_return_acquire(atomic_t *v)
+{
+ return atomic_dec_return_acquire(v);
+}
+
+__rust_helper int
+rust_helper_atomic_dec_return_release(atomic_t *v)
+{
+ return atomic_dec_return_release(v);
+}
+
+__rust_helper int
+rust_helper_atomic_dec_return_relaxed(atomic_t *v)
+{
+ return atomic_dec_return_relaxed(v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_dec(atomic_t *v)
+{
+ return atomic_fetch_dec(v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_dec_acquire(atomic_t *v)
+{
+ return atomic_fetch_dec_acquire(v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_dec_release(atomic_t *v)
+{
+ return atomic_fetch_dec_release(v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_dec_relaxed(atomic_t *v)
+{
+ return atomic_fetch_dec_relaxed(v);
+}
+
+__rust_helper void
+rust_helper_atomic_and(int i, atomic_t *v)
+{
+ atomic_and(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_and(int i, atomic_t *v)
+{
+ return atomic_fetch_and(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_and_acquire(int i, atomic_t *v)
+{
+ return atomic_fetch_and_acquire(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_and_release(int i, atomic_t *v)
+{
+ return atomic_fetch_and_release(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_and_relaxed(int i, atomic_t *v)
+{
+ return atomic_fetch_and_relaxed(i, v);
+}
+
+__rust_helper void
+rust_helper_atomic_andnot(int i, atomic_t *v)
+{
+ atomic_andnot(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_andnot(int i, atomic_t *v)
+{
+ return atomic_fetch_andnot(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_andnot_acquire(int i, atomic_t *v)
+{
+ return atomic_fetch_andnot_acquire(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_andnot_release(int i, atomic_t *v)
+{
+ return atomic_fetch_andnot_release(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_andnot_relaxed(int i, atomic_t *v)
+{
+ return atomic_fetch_andnot_relaxed(i, v);
+}
+
+__rust_helper void
+rust_helper_atomic_or(int i, atomic_t *v)
+{
+ atomic_or(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_or(int i, atomic_t *v)
+{
+ return atomic_fetch_or(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_or_acquire(int i, atomic_t *v)
+{
+ return atomic_fetch_or_acquire(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_or_release(int i, atomic_t *v)
+{
+ return atomic_fetch_or_release(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_or_relaxed(int i, atomic_t *v)
+{
+ return atomic_fetch_or_relaxed(i, v);
+}
+
+__rust_helper void
+rust_helper_atomic_xor(int i, atomic_t *v)
+{
+ atomic_xor(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_xor(int i, atomic_t *v)
+{
+ return atomic_fetch_xor(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_xor_acquire(int i, atomic_t *v)
+{
+ return atomic_fetch_xor_acquire(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_xor_release(int i, atomic_t *v)
+{
+ return atomic_fetch_xor_release(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_xor_relaxed(int i, atomic_t *v)
+{
+ return atomic_fetch_xor_relaxed(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_xchg(atomic_t *v, int new)
+{
+ return atomic_xchg(v, new);
+}
+
+__rust_helper int
+rust_helper_atomic_xchg_acquire(atomic_t *v, int new)
+{
+ return atomic_xchg_acquire(v, new);
+}
+
+__rust_helper int
+rust_helper_atomic_xchg_release(atomic_t *v, int new)
+{
+ return atomic_xchg_release(v, new);
+}
+
+__rust_helper int
+rust_helper_atomic_xchg_relaxed(atomic_t *v, int new)
+{
+ return atomic_xchg_relaxed(v, new);
+}
+
+__rust_helper int
+rust_helper_atomic_cmpxchg(atomic_t *v, int old, int new)
+{
+ return atomic_cmpxchg(v, old, new);
+}
+
+__rust_helper int
+rust_helper_atomic_cmpxchg_acquire(atomic_t *v, int old, int new)
+{
+ return atomic_cmpxchg_acquire(v, old, new);
+}
+
+__rust_helper int
+rust_helper_atomic_cmpxchg_release(atomic_t *v, int old, int new)
+{
+ return atomic_cmpxchg_release(v, old, new);
+}
+
+__rust_helper int
+rust_helper_atomic_cmpxchg_relaxed(atomic_t *v, int old, int new)
+{
+ return atomic_cmpxchg_relaxed(v, old, new);
+}
+
+__rust_helper bool
+rust_helper_atomic_try_cmpxchg(atomic_t *v, int *old, int new)
+{
+ return atomic_try_cmpxchg(v, old, new);
+}
+
+__rust_helper bool
+rust_helper_atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new)
+{
+ return atomic_try_cmpxchg_acquire(v, old, new);
+}
+
+__rust_helper bool
+rust_helper_atomic_try_cmpxchg_release(atomic_t *v, int *old, int new)
+{
+ return atomic_try_cmpxchg_release(v, old, new);
+}
+
+__rust_helper bool
+rust_helper_atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new)
+{
+ return atomic_try_cmpxchg_relaxed(v, old, new);
+}
+
+__rust_helper bool
+rust_helper_atomic_sub_and_test(int i, atomic_t *v)
+{
+ return atomic_sub_and_test(i, v);
+}
+
+__rust_helper bool
+rust_helper_atomic_dec_and_test(atomic_t *v)
+{
+ return atomic_dec_and_test(v);
+}
+
+__rust_helper bool
+rust_helper_atomic_inc_and_test(atomic_t *v)
+{
+ return atomic_inc_and_test(v);
+}
+
+__rust_helper bool
+rust_helper_atomic_add_negative(int i, atomic_t *v)
+{
+ return atomic_add_negative(i, v);
+}
+
+__rust_helper bool
+rust_helper_atomic_add_negative_acquire(int i, atomic_t *v)
+{
+ return atomic_add_negative_acquire(i, v);
+}
+
+__rust_helper bool
+rust_helper_atomic_add_negative_release(int i, atomic_t *v)
+{
+ return atomic_add_negative_release(i, v);
+}
+
+__rust_helper bool
+rust_helper_atomic_add_negative_relaxed(int i, atomic_t *v)
+{
+ return atomic_add_negative_relaxed(i, v);
+}
+
+__rust_helper int
+rust_helper_atomic_fetch_add_unless(atomic_t *v, int a, int u)
+{
+ return atomic_fetch_add_unless(v, a, u);
+}
+
+__rust_helper bool
+rust_helper_atomic_add_unless(atomic_t *v, int a, int u)
+{
+ return atomic_add_unless(v, a, u);
+}
+
+__rust_helper bool
+rust_helper_atomic_inc_not_zero(atomic_t *v)
+{
+ return atomic_inc_not_zero(v);
+}
+
+__rust_helper bool
+rust_helper_atomic_inc_unless_negative(atomic_t *v)
+{
+ return atomic_inc_unless_negative(v);
+}
+
+__rust_helper bool
+rust_helper_atomic_dec_unless_positive(atomic_t *v)
+{
+ return atomic_dec_unless_positive(v);
+}
+
+__rust_helper int
+rust_helper_atomic_dec_if_positive(atomic_t *v)
+{
+ return atomic_dec_if_positive(v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_read(const atomic64_t *v)
+{
+ return atomic64_read(v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_read_acquire(const atomic64_t *v)
+{
+ return atomic64_read_acquire(v);
+}
+
+__rust_helper void
+rust_helper_atomic64_set(atomic64_t *v, s64 i)
+{
+ atomic64_set(v, i);
+}
+
+__rust_helper void
+rust_helper_atomic64_set_release(atomic64_t *v, s64 i)
+{
+ atomic64_set_release(v, i);
+}
+
+__rust_helper void
+rust_helper_atomic64_add(s64 i, atomic64_t *v)
+{
+ atomic64_add(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_add_return(s64 i, atomic64_t *v)
+{
+ return atomic64_add_return(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_add_return_acquire(s64 i, atomic64_t *v)
+{
+ return atomic64_add_return_acquire(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_add_return_release(s64 i, atomic64_t *v)
+{
+ return atomic64_add_return_release(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_add_return_relaxed(s64 i, atomic64_t *v)
+{
+ return atomic64_add_return_relaxed(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_add(s64 i, atomic64_t *v)
+{
+ return atomic64_fetch_add(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_add_acquire(s64 i, atomic64_t *v)
+{
+ return atomic64_fetch_add_acquire(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_add_release(s64 i, atomic64_t *v)
+{
+ return atomic64_fetch_add_release(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_add_relaxed(s64 i, atomic64_t *v)
+{
+ return atomic64_fetch_add_relaxed(i, v);
+}
+
+__rust_helper void
+rust_helper_atomic64_sub(s64 i, atomic64_t *v)
+{
+ atomic64_sub(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_sub_return(s64 i, atomic64_t *v)
+{
+ return atomic64_sub_return(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_sub_return_acquire(s64 i, atomic64_t *v)
+{
+ return atomic64_sub_return_acquire(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_sub_return_release(s64 i, atomic64_t *v)
+{
+ return atomic64_sub_return_release(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_sub_return_relaxed(s64 i, atomic64_t *v)
+{
+ return atomic64_sub_return_relaxed(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_sub(s64 i, atomic64_t *v)
+{
+ return atomic64_fetch_sub(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_sub_acquire(s64 i, atomic64_t *v)
+{
+ return atomic64_fetch_sub_acquire(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_sub_release(s64 i, atomic64_t *v)
+{
+ return atomic64_fetch_sub_release(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_sub_relaxed(s64 i, atomic64_t *v)
+{
+ return atomic64_fetch_sub_relaxed(i, v);
+}
+
+__rust_helper void
+rust_helper_atomic64_inc(atomic64_t *v)
+{
+ atomic64_inc(v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_inc_return(atomic64_t *v)
+{
+ return atomic64_inc_return(v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_inc_return_acquire(atomic64_t *v)
+{
+ return atomic64_inc_return_acquire(v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_inc_return_release(atomic64_t *v)
+{
+ return atomic64_inc_return_release(v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_inc_return_relaxed(atomic64_t *v)
+{
+ return atomic64_inc_return_relaxed(v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_inc(atomic64_t *v)
+{
+ return atomic64_fetch_inc(v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_inc_acquire(atomic64_t *v)
+{
+ return atomic64_fetch_inc_acquire(v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_inc_release(atomic64_t *v)
+{
+ return atomic64_fetch_inc_release(v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_inc_relaxed(atomic64_t *v)
+{
+ return atomic64_fetch_inc_relaxed(v);
+}
+
+__rust_helper void
+rust_helper_atomic64_dec(atomic64_t *v)
+{
+ atomic64_dec(v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_dec_return(atomic64_t *v)
+{
+ return atomic64_dec_return(v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_dec_return_acquire(atomic64_t *v)
+{
+ return atomic64_dec_return_acquire(v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_dec_return_release(atomic64_t *v)
+{
+ return atomic64_dec_return_release(v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_dec_return_relaxed(atomic64_t *v)
+{
+ return atomic64_dec_return_relaxed(v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_dec(atomic64_t *v)
+{
+ return atomic64_fetch_dec(v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_dec_acquire(atomic64_t *v)
+{
+ return atomic64_fetch_dec_acquire(v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_dec_release(atomic64_t *v)
+{
+ return atomic64_fetch_dec_release(v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_dec_relaxed(atomic64_t *v)
+{
+ return atomic64_fetch_dec_relaxed(v);
+}
+
+__rust_helper void
+rust_helper_atomic64_and(s64 i, atomic64_t *v)
+{
+ atomic64_and(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_and(s64 i, atomic64_t *v)
+{
+ return atomic64_fetch_and(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_and_acquire(s64 i, atomic64_t *v)
+{
+ return atomic64_fetch_and_acquire(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_and_release(s64 i, atomic64_t *v)
+{
+ return atomic64_fetch_and_release(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_and_relaxed(s64 i, atomic64_t *v)
+{
+ return atomic64_fetch_and_relaxed(i, v);
+}
+
+__rust_helper void
+rust_helper_atomic64_andnot(s64 i, atomic64_t *v)
+{
+ atomic64_andnot(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_andnot(s64 i, atomic64_t *v)
+{
+ return atomic64_fetch_andnot(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v)
+{
+ return atomic64_fetch_andnot_acquire(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_andnot_release(s64 i, atomic64_t *v)
+{
+ return atomic64_fetch_andnot_release(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v)
+{
+ return atomic64_fetch_andnot_relaxed(i, v);
+}
+
+__rust_helper void
+rust_helper_atomic64_or(s64 i, atomic64_t *v)
+{
+ atomic64_or(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_or(s64 i, atomic64_t *v)
+{
+ return atomic64_fetch_or(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_or_acquire(s64 i, atomic64_t *v)
+{
+ return atomic64_fetch_or_acquire(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_or_release(s64 i, atomic64_t *v)
+{
+ return atomic64_fetch_or_release(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_or_relaxed(s64 i, atomic64_t *v)
+{
+ return atomic64_fetch_or_relaxed(i, v);
+}
+
+__rust_helper void
+rust_helper_atomic64_xor(s64 i, atomic64_t *v)
+{
+ atomic64_xor(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_xor(s64 i, atomic64_t *v)
+{
+ return atomic64_fetch_xor(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_xor_acquire(s64 i, atomic64_t *v)
+{
+ return atomic64_fetch_xor_acquire(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_xor_release(s64 i, atomic64_t *v)
+{
+ return atomic64_fetch_xor_release(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_xor_relaxed(s64 i, atomic64_t *v)
+{
+ return atomic64_fetch_xor_relaxed(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_xchg(atomic64_t *v, s64 new)
+{
+ return atomic64_xchg(v, new);
+}
+
+__rust_helper s64
+rust_helper_atomic64_xchg_acquire(atomic64_t *v, s64 new)
+{
+ return atomic64_xchg_acquire(v, new);
+}
+
+__rust_helper s64
+rust_helper_atomic64_xchg_release(atomic64_t *v, s64 new)
+{
+ return atomic64_xchg_release(v, new);
+}
+
+__rust_helper s64
+rust_helper_atomic64_xchg_relaxed(atomic64_t *v, s64 new)
+{
+ return atomic64_xchg_relaxed(v, new);
+}
+
+__rust_helper s64
+rust_helper_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
+{
+ return atomic64_cmpxchg(v, old, new);
+}
+
+__rust_helper s64
+rust_helper_atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new)
+{
+ return atomic64_cmpxchg_acquire(v, old, new);
+}
+
+__rust_helper s64
+rust_helper_atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new)
+{
+ return atomic64_cmpxchg_release(v, old, new);
+}
+
+__rust_helper s64
+rust_helper_atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new)
+{
+ return atomic64_cmpxchg_relaxed(v, old, new);
+}
+
+__rust_helper bool
+rust_helper_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
+{
+ return atomic64_try_cmpxchg(v, old, new);
+}
+
+__rust_helper bool
+rust_helper_atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new)
+{
+ return atomic64_try_cmpxchg_acquire(v, old, new);
+}
+
+__rust_helper bool
+rust_helper_atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new)
+{
+ return atomic64_try_cmpxchg_release(v, old, new);
+}
+
+__rust_helper bool
+rust_helper_atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new)
+{
+ return atomic64_try_cmpxchg_relaxed(v, old, new);
+}
+
+__rust_helper bool
+rust_helper_atomic64_sub_and_test(s64 i, atomic64_t *v)
+{
+ return atomic64_sub_and_test(i, v);
+}
+
+__rust_helper bool
+rust_helper_atomic64_dec_and_test(atomic64_t *v)
+{
+ return atomic64_dec_and_test(v);
+}
+
+__rust_helper bool
+rust_helper_atomic64_inc_and_test(atomic64_t *v)
+{
+ return atomic64_inc_and_test(v);
+}
+
+__rust_helper bool
+rust_helper_atomic64_add_negative(s64 i, atomic64_t *v)
+{
+ return atomic64_add_negative(i, v);
+}
+
+__rust_helper bool
+rust_helper_atomic64_add_negative_acquire(s64 i, atomic64_t *v)
+{
+ return atomic64_add_negative_acquire(i, v);
+}
+
+__rust_helper bool
+rust_helper_atomic64_add_negative_release(s64 i, atomic64_t *v)
+{
+ return atomic64_add_negative_release(i, v);
+}
+
+__rust_helper bool
+rust_helper_atomic64_add_negative_relaxed(s64 i, atomic64_t *v)
+{
+ return atomic64_add_negative_relaxed(i, v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
+{
+ return atomic64_fetch_add_unless(v, a, u);
+}
+
+__rust_helper bool
+rust_helper_atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
+{
+ return atomic64_add_unless(v, a, u);
+}
+
+__rust_helper bool
+rust_helper_atomic64_inc_not_zero(atomic64_t *v)
+{
+ return atomic64_inc_not_zero(v);
+}
+
+__rust_helper bool
+rust_helper_atomic64_inc_unless_negative(atomic64_t *v)
+{
+ return atomic64_inc_unless_negative(v);
+}
+
+__rust_helper bool
+rust_helper_atomic64_dec_unless_positive(atomic64_t *v)
+{
+ return atomic64_dec_unless_positive(v);
+}
+
+__rust_helper s64
+rust_helper_atomic64_dec_if_positive(atomic64_t *v)
+{
+ return atomic64_dec_if_positive(v);
+}
+
+#endif /* _RUST_ATOMIC_API_H */
+// 615a0e0c98b5973a47fe4fa65e92935051ca00ed
diff --git a/rust/helpers/barrier.c b/rust/helpers/barrier.c
new file mode 100644
index 000000000000..cdf28ce8e511
--- /dev/null
+++ b/rust/helpers/barrier.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <asm/barrier.h>
+
+void rust_helper_smp_mb(void)
+{
+ smp_mb();
+}
+
+void rust_helper_smp_wmb(void)
+{
+ smp_wmb();
+}
+
+void rust_helper_smp_rmb(void)
+{
+ smp_rmb();
+}
diff --git a/rust/helpers/helpers.c b/rust/helpers/helpers.c
index 7cf7fe95e41d..85ad14b81925 100644
--- a/rust/helpers/helpers.c
+++ b/rust/helpers/helpers.c
@@ -7,7 +7,9 @@
* Sorted alphabetically.
*/
+#include "atomic.c"
#include "auxiliary.c"
+#include "barrier.c"
#include "blk.c"
#include "bug.c"
#include "build_assert.c"
diff --git a/rust/helpers/refcount.c b/rust/helpers/refcount.c
index d6adbd2e45a1..d175898ad7b8 100644
--- a/rust/helpers/refcount.c
+++ b/rust/helpers/refcount.c
@@ -7,11 +7,21 @@ refcount_t rust_helper_REFCOUNT_INIT(int n)
return (refcount_t)REFCOUNT_INIT(n);
}
+void rust_helper_refcount_set(refcount_t *r, int n)
+{
+ refcount_set(r, n);
+}
+
void rust_helper_refcount_inc(refcount_t *r)
{
refcount_inc(r);
}
+void rust_helper_refcount_dec(refcount_t *r)
+{
+ refcount_dec(r);
+}
+
bool rust_helper_refcount_dec_and_test(refcount_t *r)
{
return refcount_dec_and_test(r);
diff --git a/rust/kernel/alloc/allocator.rs b/rust/kernel/alloc/allocator.rs
index aa2dfa9dca4c..2692cf90c948 100644
--- a/rust/kernel/alloc/allocator.rs
+++ b/rust/kernel/alloc/allocator.rs
@@ -43,17 +43,6 @@ pub struct Vmalloc;
/// For more details see [self].
pub struct KVmalloc;
-/// Returns a proper size to alloc a new object aligned to `new_layout`'s alignment.
-fn aligned_size(new_layout: Layout) -> usize {
- // Customized layouts from `Layout::from_size_align()` can have size < align, so pad first.
- let layout = new_layout.pad_to_align();
-
- // Note that `layout.size()` (after padding) is guaranteed to be a multiple of `layout.align()`
- // which together with the slab guarantees means the `krealloc` will return a properly aligned
- // object (see comments in `kmalloc()` for more information).
- layout.size()
-}
-
/// # Invariants
///
/// One of the following: `krealloc`, `vrealloc`, `kvrealloc`.
@@ -88,7 +77,7 @@ impl ReallocFunc {
old_layout: Layout,
flags: Flags,
) -> Result<NonNull<[u8]>, AllocError> {
- let size = aligned_size(layout);
+ let size = layout.size();
let ptr = match ptr {
Some(ptr) => {
if old_layout.size() == 0 {
@@ -123,6 +112,17 @@ impl ReallocFunc {
}
}
+impl Kmalloc {
+ /// Returns a [`Layout`] that makes [`Kmalloc`] fulfill the requested size and alignment of
+ /// `layout`.
+ pub fn aligned_layout(layout: Layout) -> Layout {
+ // Note that `layout.size()` (after padding) is guaranteed to be a multiple of
+ // `layout.align()` which together with the slab guarantees means that `Kmalloc` will return
+ // a properly aligned object (see comments in `kmalloc()` for more information).
+ layout.pad_to_align()
+ }
+}
+
// SAFETY: `realloc` delegates to `ReallocFunc::call`, which guarantees that
// - memory remains valid until it is explicitly freed,
// - passing a pointer to a valid memory allocation is OK,
@@ -135,6 +135,8 @@ unsafe impl Allocator for Kmalloc {
old_layout: Layout,
flags: Flags,
) -> Result<NonNull<[u8]>, AllocError> {
+ let layout = Kmalloc::aligned_layout(layout);
+
// SAFETY: `ReallocFunc::call` has the same safety requirements as `Allocator::realloc`.
unsafe { ReallocFunc::KREALLOC.call(ptr, layout, old_layout, flags) }
}
@@ -176,6 +178,10 @@ unsafe impl Allocator for KVmalloc {
old_layout: Layout,
flags: Flags,
) -> Result<NonNull<[u8]>, AllocError> {
+ // `KVmalloc` may use the `Kmalloc` backend, hence we have to enforce a `Kmalloc`
+ // compatible layout.
+ let layout = Kmalloc::aligned_layout(layout);
+
// TODO: Support alignments larger than PAGE_SIZE.
if layout.align() > bindings::PAGE_SIZE {
pr_warn!("KVmalloc does not support alignments larger than PAGE_SIZE yet.\n");
diff --git a/rust/kernel/alloc/allocator_test.rs b/rust/kernel/alloc/allocator_test.rs
index a3074480bd8d..90dd987d40e4 100644
--- a/rust/kernel/alloc/allocator_test.rs
+++ b/rust/kernel/alloc/allocator_test.rs
@@ -22,6 +22,17 @@ pub type Kmalloc = Cmalloc;
pub type Vmalloc = Kmalloc;
pub type KVmalloc = Kmalloc;
+impl Cmalloc {
+ /// Returns a [`Layout`] that makes [`Kmalloc`] fulfill the requested size and alignment of
+ /// `layout`.
+ pub fn aligned_layout(layout: Layout) -> Layout {
+ // Note that `layout.size()` (after padding) is guaranteed to be a multiple of
+ // `layout.align()` which together with the slab guarantees means that `Kmalloc` will return
+ // a properly aligned object (see comments in `kmalloc()` for more information).
+ layout.pad_to_align()
+ }
+}
+
extern "C" {
#[link_name = "aligned_alloc"]
fn libc_aligned_alloc(align: usize, size: usize) -> *mut crate::ffi::c_void;
diff --git a/rust/kernel/block/mq/operations.rs b/rust/kernel/block/mq/operations.rs
index c2b98f507bcb..c0f95a9419c4 100644
--- a/rust/kernel/block/mq/operations.rs
+++ b/rust/kernel/block/mq/operations.rs
@@ -10,9 +10,10 @@ use crate::{
block::mq::Request,
error::{from_result, Result},
prelude::*,
+ sync::Refcount,
types::ARef,
};
-use core::{marker::PhantomData, sync::atomic::AtomicU64, sync::atomic::Ordering};
+use core::marker::PhantomData;
/// Implement this trait to interface blk-mq as block devices.
///
@@ -78,7 +79,7 @@ impl<T: Operations> OperationsVTable<T> {
let request = unsafe { &*(*bd).rq.cast::<Request<T>>() };
// One refcount for the ARef, one for being in flight
- request.wrapper_ref().refcount().store(2, Ordering::Relaxed);
+ request.wrapper_ref().refcount().set(2);
// SAFETY:
// - We own a refcount that we took above. We pass that to `ARef`.
@@ -187,7 +188,7 @@ impl<T: Operations> OperationsVTable<T> {
// SAFETY: The refcount field is allocated but not initialized, so
// it is valid for writes.
- unsafe { RequestDataWrapper::refcount_ptr(pdu.as_ptr()).write(AtomicU64::new(0)) };
+ unsafe { RequestDataWrapper::refcount_ptr(pdu.as_ptr()).write(Refcount::new(0)) };
Ok(0)
})
diff --git a/rust/kernel/block/mq/request.rs b/rust/kernel/block/mq/request.rs
index fefd394f064a..f62a376dc313 100644
--- a/rust/kernel/block/mq/request.rs
+++ b/rust/kernel/block/mq/request.rs
@@ -8,13 +8,10 @@ use crate::{
bindings,
block::mq::Operations,
error::Result,
+ sync::{atomic::Relaxed, Refcount},
types::{ARef, AlwaysRefCounted, Opaque},
};
-use core::{
- marker::PhantomData,
- ptr::NonNull,
- sync::atomic::{AtomicU64, Ordering},
-};
+use core::{marker::PhantomData, ptr::NonNull};
/// A wrapper around a blk-mq [`struct request`]. This represents an IO request.
///
@@ -37,6 +34,9 @@ use core::{
/// We need to track 3 and 4 to ensure that it is safe to end the request and hand
/// back ownership to the block layer.
///
+/// Note that the driver can still obtain a new `ARef` via `tag_to_rq` even if there are no
+/// `ARef`s currently in existence, hence the need to distinguish B and C.
+///
/// The states are tracked through the private `refcount` field of
/// `RequestDataWrapper`. This structure lives in the private data area of the C
/// [`struct request`].
@@ -98,13 +98,16 @@ impl<T: Operations> Request<T> {
///
/// [`struct request`]: srctree/include/linux/blk-mq.h
fn try_set_end(this: ARef<Self>) -> Result<*mut bindings::request, ARef<Self>> {
- // We can race with `TagSet::tag_to_rq`
- if let Err(_old) = this.wrapper_ref().refcount().compare_exchange(
- 2,
- 0,
- Ordering::Relaxed,
- Ordering::Relaxed,
- ) {
+ // To hand back the ownership, we need the current refcount to be 2.
+ // Since we can race with `TagSet::tag_to_rq`, this needs to atomically reduce
+ // refcount to 0. `Refcount` does not provide a way to do this, so use the underlying
+ // atomics directly.
+ if let Err(_old) = this
+ .wrapper_ref()
+ .refcount()
+ .as_atomic()
+ .cmpxchg(2, 0, Relaxed)
+ {
return Err(this);
}
@@ -173,13 +176,13 @@ pub(crate) struct RequestDataWrapper {
/// - 0: The request is owned by C block layer.
/// - 1: The request is owned by Rust abstractions but there are no [`ARef`] references to it.
/// - 2+: There are [`ARef`] references to the request.
- refcount: AtomicU64,
+ refcount: Refcount,
}
impl RequestDataWrapper {
/// Return a reference to the refcount of the request that is embedding
/// `self`.
- pub(crate) fn refcount(&self) -> &AtomicU64 {
+ pub(crate) fn refcount(&self) -> &Refcount {
&self.refcount
}
@@ -189,7 +192,7 @@ impl RequestDataWrapper {
/// # Safety
///
/// - `this` must point to a live allocation of at least the size of `Self`.
- pub(crate) unsafe fn refcount_ptr(this: *mut Self) -> *mut AtomicU64 {
+ pub(crate) unsafe fn refcount_ptr(this: *mut Self) -> *mut Refcount {
// SAFETY: Because of the safety requirements of this function, the
// field projection is safe.
unsafe { &raw mut (*this).refcount }
@@ -205,47 +208,13 @@ unsafe impl<T: Operations> Send for Request<T> {}
// mutate `self` are internally synchronized`
unsafe impl<T: Operations> Sync for Request<T> {}
-/// Store the result of `op(target.load())` in target, returning new value of
-/// target.
-fn atomic_relaxed_op_return(target: &AtomicU64, op: impl Fn(u64) -> u64) -> u64 {
- let old = target.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |x| Some(op(x)));
-
- // SAFETY: Because the operation passed to `fetch_update` above always
- // return `Some`, `old` will always be `Ok`.
- let old = unsafe { old.unwrap_unchecked() };
-
- op(old)
-}
-
-/// Store the result of `op(target.load)` in `target` if `target.load() !=
-/// pred`, returning [`true`] if the target was updated.
-fn atomic_relaxed_op_unless(target: &AtomicU64, op: impl Fn(u64) -> u64, pred: u64) -> bool {
- target
- .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |x| {
- if x == pred {
- None
- } else {
- Some(op(x))
- }
- })
- .is_ok()
-}
-
// SAFETY: All instances of `Request<T>` are reference counted. This
// implementation of `AlwaysRefCounted` ensure that increments to the ref count
// keeps the object alive in memory at least until a matching reference count
// decrement is executed.
unsafe impl<T: Operations> AlwaysRefCounted for Request<T> {
fn inc_ref(&self) {
- let refcount = &self.wrapper_ref().refcount();
-
- #[cfg_attr(not(CONFIG_DEBUG_MISC), allow(unused_variables))]
- let updated = atomic_relaxed_op_unless(refcount, |x| x + 1, 0);
-
- #[cfg(CONFIG_DEBUG_MISC)]
- if !updated {
- panic!("Request refcount zero on clone")
- }
+ self.wrapper_ref().refcount().inc();
}
unsafe fn dec_ref(obj: core::ptr::NonNull<Self>) {
@@ -257,10 +226,10 @@ unsafe impl<T: Operations> AlwaysRefCounted for Request<T> {
let refcount = unsafe { &*RequestDataWrapper::refcount_ptr(wrapper_ptr) };
#[cfg_attr(not(CONFIG_DEBUG_MISC), allow(unused_variables))]
- let new_refcount = atomic_relaxed_op_return(refcount, |x| x - 1);
+ let is_zero = refcount.dec_and_test();
#[cfg(CONFIG_DEBUG_MISC)]
- if new_refcount == 0 {
+ if is_zero {
panic!("Request reached refcount zero in Rust abstractions");
}
}
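As an aside on the refcount scheme above (0 = owned by the C block layer, 1 = owned by the Rust abstraction with no `ARef`s, 2+ = `ARef`s exist): the hand-back in `try_set_end` only succeeds when the count is exactly 2. A self-contained sketch of that step using a plain `AtomicU64` instead of the kernel crate's `Refcount` (illustration only):

```rust
use std::sync::atomic::{AtomicU64, Ordering};

// Illustration only: hand ownership back iff the refcount is exactly 2
// (one ARef plus the in-flight reference), racing against concurrent increments.
fn try_set_end(refcount: &AtomicU64) -> bool {
    refcount
        .compare_exchange(2, 0, Ordering::Relaxed, Ordering::Relaxed)
        .is_ok()
}

fn main() {
    let rc = AtomicU64::new(2);
    assert!(try_set_end(&rc));  // count was exactly 2: ownership goes back, count is now 0
    assert!(!try_set_end(&rc)); // already ended: a second attempt fails
}
```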
diff --git a/rust/kernel/cred.rs b/rust/kernel/cred.rs
index 2599f01e8b28..4a2229542fb7 100644
--- a/rust/kernel/cred.rs
+++ b/rust/kernel/cred.rs
@@ -8,11 +8,7 @@
//!
//! Reference: <https://www.kernel.org/doc/html/latest/security/credentials.html>
-use crate::{
- bindings,
- task::Kuid,
- types::{AlwaysRefCounted, Opaque},
-};
+use crate::{bindings, sync::aref::AlwaysRefCounted, task::Kuid, types::Opaque};
/// Wraps the kernel's `struct cred`.
///
diff --git a/rust/kernel/device.rs b/rust/kernel/device.rs
index b8613289de8e..a1db49eb159a 100644
--- a/rust/kernel/device.rs
+++ b/rust/kernel/device.rs
@@ -15,23 +15,132 @@ use crate::c_str;
pub mod property;
-/// A reference-counted device.
+/// The core representation of a device in the kernel's driver model.
///
-/// This structure represents the Rust abstraction for a C `struct device`. This implementation
-/// abstracts the usage of an already existing C `struct device` within Rust code that we get
-/// passed from the C side.
+/// This structure represents the Rust abstraction for a C `struct device`. A [`Device`] can either
+/// exist as a temporary reference (see also [`Device::from_raw`]), which is only valid within a
+/// certain scope, or as an [`ARef<Device>`], which owns a dedicated reference count.
///
-/// An instance of this abstraction can be obtained temporarily or permanent.
+/// # Device Types
///
-/// A temporary one is bound to the lifetime of the C `struct device` pointer used for creation.
-/// A permanent instance is always reference-counted and hence not restricted by any lifetime
-/// boundaries.
+/// A [`Device`] can represent either a bus device or a class device.
///
-/// For subsystems it is recommended to create a permanent instance to wrap into a subsystem
-/// specific device structure (e.g. `pci::Device`). This is useful for passing it to drivers in
-/// `T::probe()`, such that a driver can store the `ARef<Device>` (equivalent to storing a
-/// `struct device` pointer in a C driver) for arbitrary purposes, e.g. allocating DMA coherent
-/// memory.
+/// ## Bus Devices
+///
+/// A bus device is a [`Device`] that is associated with a physical or virtual bus. Examples of
+/// buses include PCI, USB, I2C, and SPI. Devices attached to a bus are registered with a specific
+/// bus type, which facilitates matching devices with appropriate drivers based on IDs or other
+/// identifying information. Bus devices are visible in sysfs under `/sys/bus/<bus-name>/devices/`.
+///
+/// ## Class Devices
+///
+/// A class device is a [`Device`] that is associated with a logical category of functionality
+/// rather than a physical bus. Examples of classes include block devices, network interfaces, sound
+/// cards, and input devices. Class devices are grouped under a common class and exposed to
+/// userspace via entries in `/sys/class/<class-name>/`.
+///
+/// # Device Context
+///
+/// [`Device`] references are generic over a [`DeviceContext`], which represents the type state of
+/// a [`Device`].
+///
+/// As the name indicates, this type state represents the context of the scope the [`Device`]
+/// reference is valid in. For instance, the [`Bound`] context guarantees that the [`Device`] is
+/// bound to a driver for the entire duration of the existence of a [`Device<Bound>`] reference.
+///
+/// Other [`DeviceContext`] types besides [`Bound`] are [`Normal`], [`Core`] and [`CoreInternal`].
+///
+/// Unless selected otherwise, [`Device`] defaults to the [`Normal`] [`DeviceContext`], which by
+/// itself has no additional requirements.
+///
+/// It is always up to the caller of [`Device::from_raw`] to select the correct [`DeviceContext`]
+/// type for the corresponding scope the [`Device`] reference is created in.
+///
+/// All [`DeviceContext`] types other than [`Normal`] are intended to be used with
+/// [bus devices](#bus-devices) only.
+///
+/// # Implementing Bus Devices
+///
+/// This section provides a guideline to implement bus specific devices, such as [`pci::Device`] or
+/// [`platform::Device`].
+///
+/// A bus specific device should be defined as follows.
+///
+/// ```ignore
+/// #[repr(transparent)]
+/// pub struct Device<Ctx: device::DeviceContext = device::Normal>(
+/// Opaque<bindings::bus_device_type>,
+/// PhantomData<Ctx>,
+/// );
+/// ```
+///
+/// Since devices are reference counted, [`AlwaysRefCounted`] should be implemented for `Device`
+/// (i.e. `Device<Normal>`). Note that [`AlwaysRefCounted`] must not be implemented for any other
+/// [`DeviceContext`], since all other device context types are only valid within a certain scope.
+///
+/// In order to be able to implement the [`DeviceContext`] dereference hierarchy, bus device
+/// implementations should call the [`impl_device_context_deref`] macro as shown below.
+///
+/// ```ignore
+/// // SAFETY: `Device` is a transparent wrapper of a type that doesn't depend on `Device`'s
+/// // generic argument.
+/// kernel::impl_device_context_deref!(unsafe { Device });
+/// ```
+///
+/// In order to convert from any [`Device<Ctx>`] to an [`ARef<Device>`], bus devices can use the
+/// following macro call.
+///
+/// ```ignore
+/// kernel::impl_device_context_into_aref!(Device);
+/// ```
+///
+/// Bus devices should also implement the following [`AsRef`] implementation, such that users can
+/// easily derive a generic [`Device`] reference.
+///
+/// ```ignore
+/// impl<Ctx: device::DeviceContext> AsRef<device::Device<Ctx>> for Device<Ctx> {
+/// fn as_ref(&self) -> &device::Device<Ctx> {
+/// ...
+/// }
+/// }
+/// ```
+///
+/// # Implementing Class Devices
+///
+/// Class device implementations require less infrastructure and depend slightly more on the
+/// specific subsystem.
+///
+/// An example implementation for a class device could look like this.
+///
+/// ```ignore
+/// #[repr(C)]
+/// pub struct Device<T: class::Driver> {
+/// dev: Opaque<bindings::class_device_type>,
+/// data: T::Data,
+/// }
+/// ```
+///
+/// This class device uses the sub-classing pattern to embed the driver's private data within the
+/// allocation of the class device. For this to be possible, the class device is generic over the
+/// class specific `Driver` trait implementation.
+///
+/// Just like any device, class devices are reference counted and should hence implement
+/// [`AlwaysRefCounted`] for `Device`.
+///
+/// Class devices should also implement the following [`AsRef`] implementation, such that users can
+/// easily derive a generic [`Device`] reference.
+///
+/// ```ignore
+/// impl<T: class::Driver> AsRef<device::Device> for Device<T> {
+/// fn as_ref(&self) -> &device::Device {
+/// ...
+/// }
+/// }
+/// ```
+///
+/// An example for a class device implementation is
+#[cfg_attr(CONFIG_DRM = "y", doc = "[`drm::Device`](kernel::drm::Device).")]
+#[cfg_attr(not(CONFIG_DRM = "y"), doc = "`drm::Device`.")]
///
/// # Invariants
///
@@ -42,6 +151,11 @@ pub mod property;
///
/// `bindings::device::release` is valid to be called from any thread, hence `ARef<Device>` can be
/// dropped from any thread.
+///
+/// [`AlwaysRefCounted`]: kernel::types::AlwaysRefCounted
+/// [`impl_device_context_deref`]: kernel::impl_device_context_deref
+/// [`pci::Device`]: kernel::pci::Device
+/// [`platform::Device`]: kernel::platform::Device
#[repr(transparent)]
pub struct Device<Ctx: DeviceContext = Normal>(Opaque<bindings::device>, PhantomData<Ctx>);
@@ -311,28 +425,75 @@ unsafe impl Send for Device {}
// synchronization in `struct device`.
unsafe impl Sync for Device {}
-/// Marker trait for the context of a bus specific device.
+/// Marker trait for the context or scope of a bus specific device.
+///
+/// [`DeviceContext`] is a marker trait for types representing the context of a bus specific
+/// [`Device`].
+///
+/// The specific device context types are: [`CoreInternal`], [`Core`], [`Bound`] and [`Normal`].
///
-/// Some functions of a bus specific device should only be called from a certain context, i.e. bus
-/// callbacks, such as `probe()`.
+/// [`DeviceContext`] types are hierarchical, which means that there is a strict hierarchy that
+/// defines which [`DeviceContext`] type can be derived from another. For instance, any
+/// [`Device<Core>`] can dereference to a [`Device<Bound>`].
///
-/// This is the marker trait for structures representing the context of a bus specific device.
+/// The following enumeration illustrates the dereference hierarchy of [`DeviceContext`] types.
+///
+/// - [`CoreInternal`] => [`Core`] => [`Bound`] => [`Normal`]
+///
+/// Bus devices can automatically implement the dereference hierarchy by using
+/// [`impl_device_context_deref`].
+///
+/// Note that the guarantee for a [`Device`] reference to have a certain [`DeviceContext`] comes
+/// from the specific scope the [`Device`] reference is valid in.
+///
+/// [`impl_device_context_deref`]: kernel::impl_device_context_deref
pub trait DeviceContext: private::Sealed {}
-/// The [`Normal`] context is the context of a bus specific device when it is not an argument of
-/// any bus callback.
+/// The [`Normal`] context is the default [`DeviceContext`] of any [`Device`].
+///
+/// The normal context does not indicate any specific context. Any `Device<Ctx>` is also a valid
+/// [`Device<Normal>`]. It is the only [`DeviceContext`] for which it is valid to implement
+/// [`AlwaysRefCounted`] for.
+///
+/// [`AlwaysRefCounted`]: kernel::types::AlwaysRefCounted
pub struct Normal;
-/// The [`Core`] context is the context of a bus specific device when it is supplied as argument of
-/// any of the bus callbacks, such as `probe()`.
+/// The [`Core`] context is the context of a bus specific device when it appears as argument of
+/// any bus specific callback, such as `probe()`.
+///
+/// The core context indicates that the [`Device<Core>`] reference's scope is limited to the bus
+/// callback it appears in. It is intended to be used for synchronization purposes. Bus device
+/// implementations can implement methods for [`Device<Core>`], such that they can only be called
+/// from bus callbacks.
pub struct Core;
-/// Semantically the same as [`Core`] but reserved for internal usage of the corresponding bus
+/// Semantically the same as [`Core`], but reserved for internal usage of the corresponding bus
/// abstraction.
+///
+/// The internal core context is intended to be used in exactly the same way as the [`Core`]
+/// context, with the difference that this [`DeviceContext`] is internal to the corresponding bus
+/// abstraction.
+///
+/// This context mainly exists to share with bus abstractions generic [`Device`] infrastructure
+/// that should only be called from bus callbacks, without making it accessible to drivers.
pub struct CoreInternal;
-/// The [`Bound`] context is the context of a bus specific device reference when it is guaranteed to
-/// be bound for the duration of its lifetime.
+/// The [`Bound`] context is the [`DeviceContext`] of a bus specific device when it is guaranteed to
+/// be bound to a driver.
+///
+/// The bound context indicates that for the entire duration of the lifetime of a [`Device<Bound>`]
+/// reference, the [`Device`] is guaranteed to be bound to a driver.
+///
+/// Some APIs, such as [`dma::CoherentAllocation`] or [`Devres`] rely on the [`Device`] to be bound,
+/// which can be proven with the [`Bound`] device context.
+///
+/// Any abstraction that can guarantee a scope where the corresponding bus device is bound should
+/// provide a [`Device<Bound>`] reference to its users for this scope. This allows users to benefit
+/// from optimizations for accessing device resources, see also [`Devres::access`].
+///
+/// [`Devres`]: kernel::devres::Devres
+/// [`Devres::access`]: kernel::devres::Devres::access
+/// [`dma::CoherentAllocation`]: kernel::dma::CoherentAllocation
pub struct Bound;
mod private {
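As an aside, the `DeviceContext` dereference hierarchy described above (`CoreInternal` => `Core` => `Bound` => `Normal`) is a typestate pattern: marker types plus `Deref` impls let a more restricted context be used wherever a broader one is expected. A self-contained sketch of the idea in plain Rust, with made-up stand-ins for the kernel crate's `Device` and context markers (illustration only, not the actual `impl_device_context_deref` expansion):

```rust
use std::marker::PhantomData;
use std::ops::Deref;

// Made-up context markers mirroring part of the hierarchy: Core => Bound => Normal.
struct Normal;
struct Bound;
struct Core;

#[repr(transparent)]
struct Device<Ctx = Normal>(u32 /* stand-in for the wrapped C struct */, PhantomData<Ctx>);

impl Device<Bound> {
    // Only callable while the device is guaranteed to be bound to a driver.
    fn while_bound(&self) -> u32 {
        self.0
    }
}

// A core-context reference may be used wherever a bound-context reference is expected.
// The cast is sound here only because both types are `repr(transparent)` over the same field.
impl Deref for Device<Core> {
    type Target = Device<Bound>;
    fn deref(&self) -> &Device<Bound> {
        unsafe { &*(self as *const Device<Core>).cast::<Device<Bound>>() }
    }
}

fn probe(dev: &Device<Core>) -> u32 {
    // Deref coercion walks down the hierarchy automatically.
    dev.while_bound()
}

fn main() {
    let dev = Device::<Core>(42, PhantomData);
    assert_eq!(probe(&dev), 42);
}
```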
diff --git a/rust/kernel/devres.rs b/rust/kernel/devres.rs
index da18091143a6..d04e3fcebafb 100644
--- a/rust/kernel/devres.rs
+++ b/rust/kernel/devres.rs
@@ -115,10 +115,11 @@ pub struct Devres<T: Send> {
/// Contains all the fields shared with [`Self::callback`].
// TODO: Replace with `UnsafePinned`, once available.
//
- // Subsequently, the `drop_in_place()` in `Devres::drop` and the explicit `Send` and `Sync'
- // impls can be removed.
+ // Subsequently, the `drop_in_place()` in `Devres::drop` and `Devres::new` as well as the
+ // explicit `Send` and `Sync' impls can be removed.
#[pin]
inner: Opaque<Inner<T>>,
+ _add_action: (),
}
impl<T: Send> Devres<T> {
@@ -140,7 +141,15 @@ impl<T: Send> Devres<T> {
dev: dev.into(),
callback,
// INVARIANT: `inner` is properly initialized.
- inner <- {
+ inner <- Opaque::pin_init(try_pin_init!(Inner {
+ devm <- Completion::new(),
+ revoke <- Completion::new(),
+ data <- Revocable::new(data),
+ })),
+ // TODO: Replace with "initializer code blocks" [1] once available.
+ //
+ // [1] https://github.com/Rust-for-Linux/pin-init/pull/69
+ _add_action: {
// SAFETY: `this` is a valid pointer to uninitialized memory.
let inner = unsafe { &raw mut (*this.as_ptr()).inner };
@@ -152,13 +161,13 @@ impl<T: Send> Devres<T> {
// live at least as long as the returned `impl PinInit<Self, Error>`.
to_result(unsafe {
bindings::devm_add_action(dev.as_raw(), Some(callback), inner.cast())
- })?;
+ }).inspect_err(|_| {
+ let inner = Opaque::cast_into(inner);
- Opaque::pin_init(try_pin_init!(Inner {
- devm <- Completion::new(),
- revoke <- Completion::new(),
- data <- Revocable::new(data),
- }))
+ // SAFETY: `inner` is a valid pointer to an `Inner<T>` and valid for both reads
+ // and writes.
+ unsafe { core::ptr::drop_in_place(inner) };
+ })?;
},
})
}
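The reordering above initializes `Inner` before registering the devres action and, if `devm_add_action()` fails, drops the already-initialized value in place. A minimal stand-alone sketch of that pattern follows; `register_cleanup()` and the local `Inner` are illustrative stand-ins, not the kernel API.

```rust
use std::ptr;

struct Inner(String);

// Illustrative stand-in for a fallible registration such as `devm_add_action()`.
fn register_cleanup(_inner: *mut Inner) -> Result<(), i32> {
    Err(-12) // pretend the registration failed, e.g. -ENOMEM
}

/// Initializes `slot` and registers a cleanup action for it.
///
/// # Safety
///
/// `slot` must point to writable, properly aligned storage for an `Inner`.
unsafe fn init_in_place(slot: *mut Inner) -> Result<(), i32> {
    // 1) Initialize the value first, as the reworked `Devres::new` does.
    // SAFETY: `slot` is valid for writes per the safety requirements.
    unsafe { slot.write(Inner(String::from("data"))) };

    // 2) Register the cleanup action; on failure, drop what was initialized
    //    so the partially built object does not leak.
    register_cleanup(slot).inspect_err(|_| {
        // SAFETY: `slot` was fully initialized above and is not used again.
        unsafe { ptr::drop_in_place(slot) };
    })
}
```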
diff --git a/rust/kernel/driver.rs b/rust/kernel/driver.rs
index a8f2675ba7a7..279e3af20682 100644
--- a/rust/kernel/driver.rs
+++ b/rust/kernel/driver.rs
@@ -2,8 +2,93 @@
//! Generic support for drivers of different buses (e.g., PCI, Platform, Amba, etc.).
//!
-//! Each bus / subsystem is expected to implement [`RegistrationOps`], which allows drivers to
-//! register using the [`Registration`] class.
+//! This documentation describes how to implement a bus specific driver API and how to align it with
+//! the design of (bus specific) devices.
+//!
+//! Note: Readers are expected to know the content of the documentation of [`Device`] and
+//! [`DeviceContext`].
+//!
+//! # Driver Trait
+//!
+//! The main driver interface is defined by a bus specific driver trait. For instance:
+//!
+//! ```ignore
+//! pub trait Driver: Send {
+//! /// The type holding information about each device ID supported by the driver.
+//! type IdInfo: 'static;
+//!
+//! /// The table of OF device ids supported by the driver.
+//! const OF_ID_TABLE: Option<of::IdTable<Self::IdInfo>> = None;
+//!
+//! /// The table of ACPI device ids supported by the driver.
+//! const ACPI_ID_TABLE: Option<acpi::IdTable<Self::IdInfo>> = None;
+//!
+//! /// Driver probe.
+//! fn probe(dev: &Device<device::Core>, id_info: &Self::IdInfo) -> Result<Pin<KBox<Self>>>;
+//!
+//! /// Driver unbind (optional).
+//! fn unbind(dev: &Device<device::Core>, this: Pin<&Self>) {
+//! let _ = (dev, this);
+//! }
+//! }
+//! ```
+//!
+//! For specific examples see [`auxiliary::Driver`], [`pci::Driver`] and [`platform::Driver`].
+//!
+//! The `probe()` callback should return a `Result<Pin<KBox<Self>>>`, i.e. the driver's private
+//! data. The bus abstraction should store the pointer in the corresponding bus device. The generic
+//! [`Device`] infrastructure provides common helpers for this purpose on its
+//! [`Device<CoreInternal>`] implementation.
+//!
+//! All driver callbacks should provide a reference to the driver's private data. Once the driver
+//! is unbound from the device, the bus abstraction should take back ownership of the driver's
+//! private data from the corresponding [`Device`] and [`drop`] it.
+//!
+//! All driver callbacks should provide a [`Device<Core>`] reference (see also [`device::Core`]).
+//!
+//! # Adapter
+//!
+//! The adapter implementation of a bus represents the abstraction layer between the C bus
+//! callbacks and the Rust bus callbacks. It therefore has to be generic over an implementation of
+//! the [driver trait](#driver-trait).
+//!
+//! ```ignore
+//! pub struct Adapter<T: Driver>;
+//! ```
+//!
+//! There's a common [`Adapter`] trait that can be implemented to inherit common driver
+//! infrastructure, such as finding the ID info from an [`of::IdTable`] or [`acpi::IdTable`].
+//!
+//! # Driver Registration
+//!
+//! In order to register C driver types (such as `struct platform_driver`) the [adapter](#adapter)
+//! should implement the [`RegistrationOps`] trait.
+//!
+//! This trait implementation can be used to create the actual registration with the common
+//! [`Registration`] type.
+//!
+//! Typically, bus abstractions want to provide a bus specific `module_bus_driver!` macro, which
+//! creates a kernel module with exactly one [`Registration`] for the bus specific adapter.
+//!
+//! The generic driver infrastructure provides a helper for this with the [`module_driver`] macro.
+//!
+//! # Device IDs
+//!
+//! Besides the common device ID types, such as [`of::DeviceId`] and [`acpi::DeviceId`], buses may
+//! need to implement their own device ID types.
+//!
+//! For this purpose the generic infrastructure in [`device_id`] should be used.
+//!
+//! [`auxiliary::Driver`]: kernel::auxiliary::Driver
+//! [`Core`]: device::Core
+//! [`Device`]: device::Device
+//! [`Device<Core>`]: device::Device<device::Core>
+//! [`Device<CoreInternal>`]: device::Device<device::CoreInternal>
+//! [`DeviceContext`]: device::DeviceContext
+//! [`device_id`]: kernel::device_id
+//! [`module_driver`]: kernel::module_driver
+//! [`pci::Driver`]: kernel::pci::Driver
+//! [`platform::Driver`]: kernel::platform::Driver
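To make the flow concrete, here is a self-contained sketch of a driver implementing a trait shaped like the one above; `Driver`, `Device`, `Core` and `MyDriver` are local stand-ins mirroring the `ignore` example, not the kernel's actual types or signatures.

```rust
use core::marker::PhantomData;

struct Core;
struct Device<Ctx>(PhantomData<Ctx>);

// Shaped like the bus specific driver trait sketched above.
trait Driver: Send + Sized {
    type IdInfo: 'static;

    fn probe(dev: &Device<Core>, id_info: &Self::IdInfo) -> Result<Box<Self>, i32>;

    fn unbind(dev: &Device<Core>, this: &Self) {
        let _ = (dev, this);
    }
}

struct MyDriver {
    variant: u32,
}

impl Driver for MyDriver {
    type IdInfo = u32;

    fn probe(_dev: &Device<Core>, id_info: &Self::IdInfo) -> Result<Box<Self>, i32> {
        // The returned allocation is the driver's private data; the bus
        // abstraction stores the pointer in the underlying device and takes
        // it back (and finally drops it) once the device is unbound.
        Ok(Box::new(MyDriver { variant: *id_info }))
    }
}
```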
use crate::error::{Error, Result};
use crate::{acpi, device, of, str::CStr, try_pin_init, types::Opaque, ThisModule};
diff --git a/rust/kernel/drm/device.rs b/rust/kernel/drm/device.rs
index 3bb7c83966cf..d29c477e89a8 100644
--- a/rust/kernel/drm/device.rs
+++ b/rust/kernel/drm/device.rs
@@ -5,6 +5,7 @@
//! C header: [`include/linux/drm/drm_device.h`](srctree/include/linux/drm/drm_device.h)
use crate::{
+ alloc::allocator::Kmalloc,
bindings, device, drm,
drm::driver::AllocImpl,
error::from_err_ptr,
@@ -12,7 +13,7 @@ use crate::{
prelude::*,
types::{ARef, AlwaysRefCounted, Opaque},
};
-use core::{mem, ops::Deref, ptr, ptr::NonNull};
+use core::{alloc::Layout, mem, ops::Deref, ptr, ptr::NonNull};
#[cfg(CONFIG_DRM_LEGACY)]
macro_rules! drm_legacy_fields {
@@ -53,10 +54,8 @@ macro_rules! drm_legacy_fields {
///
/// `self.dev` is a valid instance of a `struct device`.
#[repr(C)]
-#[pin_data]
pub struct Device<T: drm::Driver> {
dev: Opaque<bindings::drm_device>,
- #[pin]
data: T::Data,
}
@@ -96,6 +95,10 @@ impl<T: drm::Driver> Device<T> {
/// Create a new `drm::Device` for a `drm::Driver`.
pub fn new(dev: &device::Device, data: impl PinInit<T::Data, Error>) -> Result<ARef<Self>> {
+ // `__drm_dev_alloc` uses `kmalloc()` to allocate memory, hence ensure a `kmalloc()`
+ // compatible `Layout`.
+ let layout = Kmalloc::aligned_layout(Layout::new::<Self>());
+
// SAFETY:
// - `VTABLE`, as a `const` is pinned to the read-only section of the compilation,
// - `dev` is valid by its type invarants,
@@ -103,7 +106,7 @@ impl<T: drm::Driver> Device<T> {
bindings::__drm_dev_alloc(
dev.as_raw(),
&Self::VTABLE,
- mem::size_of::<Self>(),
+ layout.size(),
mem::offset_of!(Self, dev),
)
}
@@ -117,9 +120,13 @@ impl<T: drm::Driver> Device<T> {
// - `raw_data` is a valid pointer to uninitialized memory.
// - `raw_data` will not move until it is dropped.
unsafe { data.__pinned_init(raw_data) }.inspect_err(|_| {
- // SAFETY: `__drm_dev_alloc()` was successful, hence `raw_drm` must be valid and the
+ // SAFETY: `raw_drm` is a valid pointer to `Self`, given that `__drm_dev_alloc` was
+ // successful.
+ let drm_dev = unsafe { Self::into_drm_device(raw_drm) };
+
+ // SAFETY: `__drm_dev_alloc()` was successful, hence `drm_dev` must be valid and the
// refcount must be non-zero.
- unsafe { bindings::drm_dev_put(ptr::addr_of_mut!((*raw_drm.as_ptr()).dev).cast()) };
+ unsafe { bindings::drm_dev_put(drm_dev) };
})?;
// SAFETY: The reference count is one, and now we take ownership of that reference as a
@@ -140,6 +147,14 @@ impl<T: drm::Driver> Device<T> {
unsafe { crate::container_of!(Opaque::cast_from(ptr), Self, dev) }.cast_mut()
}
+ /// # Safety
+ ///
+ /// `ptr` must be a valid pointer to `Self`.
+ unsafe fn into_drm_device(ptr: NonNull<Self>) -> *mut bindings::drm_device {
+ // SAFETY: By the safety requirements of this function, `ptr` is a valid pointer to `Self`.
+ unsafe { &raw mut (*ptr.as_ptr()).dev }.cast()
+ }
+
/// Not intended to be called externally, except via declare_drm_ioctls!()
///
/// # Safety
@@ -189,8 +204,11 @@ unsafe impl<T: drm::Driver> AlwaysRefCounted for Device<T> {
}
unsafe fn dec_ref(obj: NonNull<Self>) {
+ // SAFETY: `obj` is a valid pointer to `Self`.
+ let drm_dev = unsafe { Self::into_drm_device(obj) };
+
// SAFETY: The safety requirements guarantee that the refcount is non-zero.
- unsafe { bindings::drm_dev_put(obj.cast().as_ptr()) };
+ unsafe { bindings::drm_dev_put(drm_dev) };
}
}
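The new `into_drm_device()` helper replaces the open-coded `addr_of_mut!` cast: it projects from a pointer to the wrapper to its embedded C struct without creating an intermediate reference. Below is a small stand-alone sketch of the same projection pattern, using local stand-in types rather than the real bindings.

```rust
use core::ptr::NonNull;

// Local stand-ins for the C struct and the Rust wrapper embedding it.
#[repr(C)]
struct RawDev {
    refcount: u32,
}

#[repr(C)]
struct Device {
    dev: RawDev,
    data: u64,
}

impl Device {
    /// # Safety
    ///
    /// `ptr` must be a valid pointer to `Self`.
    unsafe fn into_raw_dev(ptr: NonNull<Self>) -> *mut RawDev {
        // Project to the embedded field with `&raw mut`, so no reference (and
        // hence no aliasing or validity assumption beyond the raw pointer) is
        // created along the way.
        // SAFETY: per the safety requirements, `ptr` points to a live `Self`.
        unsafe { &raw mut (*ptr.as_ptr()).dev }
    }
}
```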
diff --git a/rust/kernel/faux.rs b/rust/kernel/faux.rs
index 7a906099993f..7fe2dd197e37 100644
--- a/rust/kernel/faux.rs
+++ b/rust/kernel/faux.rs
@@ -4,7 +4,7 @@
//!
//! This module provides bindings for working with faux devices in kernel modules.
//!
-//! C header: [`include/linux/device/faux.h`]
+//! C header: [`include/linux/device/faux.h`](srctree/include/linux/device/faux.h)
use crate::{bindings, device, error::code::*, prelude::*};
use core::ptr::{addr_of_mut, null, null_mut, NonNull};
diff --git a/rust/kernel/fs/file.rs b/rust/kernel/fs/file.rs
index 35fd5db35c46..f1a3fa698745 100644
--- a/rust/kernel/fs/file.rs
+++ b/rust/kernel/fs/file.rs
@@ -10,8 +10,9 @@
use crate::{
bindings,
cred::Credential,
- error::{code::*, Error, Result},
- types::{ARef, AlwaysRefCounted, NotThreadSafe, Opaque},
+ error::{code::*, to_result, Error, Result},
+ sync::aref::{ARef, AlwaysRefCounted},
+ types::{NotThreadSafe, Opaque},
};
use core::ptr;
@@ -398,9 +399,8 @@ impl FileDescriptorReservation {
pub fn get_unused_fd_flags(flags: u32) -> Result<Self> {
// SAFETY: FFI call, there are no safety requirements on `flags`.
let fd: i32 = unsafe { bindings::get_unused_fd_flags(flags) };
- if fd < 0 {
- return Err(Error::from_errno(fd));
- }
+ to_result(fd)?;
+
Ok(Self {
fd: fd as u32,
_not_send: NotThreadSafe,
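The switch to `to_result()` folds the open-coded negative-errno check into the common helper. A small stand-alone sketch of that convention follows; the local `to_result()` is an illustrative stand-in for the kernel helper.

```rust
// Illustrative stand-in: negative return values are errnos, everything else
// is success.
fn to_result(ret: i32) -> Result<(), i32> {
    if ret < 0 {
        Err(ret)
    } else {
        Ok(())
    }
}

fn get_unused_fd(raw: i32) -> Result<u32, i32> {
    // Any negative value short-circuits via `?`, mirroring the new code path.
    to_result(raw)?;
    Ok(raw as u32)
}

fn main() {
    assert_eq!(get_unused_fd(3), Ok(3));
    assert_eq!(get_unused_fd(-9), Err(-9)); // e.g. -EBADF
}
```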
diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs
index ed53169e795c..fef97f2a5098 100644
--- a/rust/kernel/lib.rs
+++ b/rust/kernel/lib.rs
@@ -296,7 +296,7 @@ macro_rules! asm {
/// Gets the C string file name of a [`Location`].
///
-/// If `file_with_nul()` is not available, returns a string that warns about it.
+/// If `Location::file_as_c_str()` is not available, returns a string that warns about it.
///
/// [`Location`]: core::panic::Location
///
@@ -310,8 +310,8 @@ macro_rules! asm {
/// let caller = core::panic::Location::caller();
///
/// // Output:
-/// // - A path like "rust/kernel/example.rs" if file_with_nul() is available.
-/// // - "<Location::file_with_nul() not supported>" otherwise.
+/// // - A path like "rust/kernel/example.rs" if `file_as_c_str()` is available.
+/// // - "<Location::file_as_c_str() not supported>" otherwise.
/// let caller_file = file_from_location(caller);
///
/// // Prints out the message with caller's file name.
@@ -326,7 +326,12 @@ macro_rules! asm {
/// ```
#[inline]
pub fn file_from_location<'a>(loc: &'a core::panic::Location<'a>) -> &'a core::ffi::CStr {
- #[cfg(CONFIG_RUSTC_HAS_FILE_WITH_NUL)]
+ #[cfg(CONFIG_RUSTC_HAS_FILE_AS_C_STR)]
+ {
+ loc.file_as_c_str()
+ }
+
+ #[cfg(all(CONFIG_RUSTC_HAS_FILE_WITH_NUL, not(CONFIG_RUSTC_HAS_FILE_AS_C_STR)))]
{
loc.file_with_nul()
}
@@ -334,6 +339,6 @@ pub fn file_from_location<'a>(loc: &'a core::panic::Location<'a>) -> &'a core::f
#[cfg(not(CONFIG_RUSTC_HAS_FILE_WITH_NUL))]
{
let _ = loc;
- c"<Location::file_with_nul() not supported>"
+ c"<Location::file_as_c_str() not supported>"
}
}
diff --git a/rust/kernel/mm/virt.rs b/rust/kernel/mm/virt.rs
index 6086ca981b06..a1bfa4e19293 100644
--- a/rust/kernel/mm/virt.rs
+++ b/rust/kernel/mm/virt.rs
@@ -209,6 +209,7 @@ impl VmaMixedMap {
///
/// For the duration of 'a, the referenced vma must be undergoing initialization in an
/// `f_ops->mmap()` hook.
+#[repr(transparent)]
pub struct VmaNew {
vma: VmaRef,
}
diff --git a/rust/kernel/pid_namespace.rs b/rust/kernel/pid_namespace.rs
index 0e93808e4639..979a9718f153 100644
--- a/rust/kernel/pid_namespace.rs
+++ b/rust/kernel/pid_namespace.rs
@@ -7,10 +7,7 @@
//! C header: [`include/linux/pid_namespace.h`](srctree/include/linux/pid_namespace.h) and
//! [`include/linux/pid.h`](srctree/include/linux/pid.h)
-use crate::{
- bindings,
- types::{AlwaysRefCounted, Opaque},
-};
+use crate::{bindings, sync::aref::AlwaysRefCounted, types::Opaque};
use core::ptr;
/// Wraps the kernel's `struct pid_namespace`. Thread safe.
diff --git a/rust/kernel/sync.rs b/rust/kernel/sync.rs
index 00f9b558a3ad..cf5b638a097d 100644
--- a/rust/kernel/sync.rs
+++ b/rust/kernel/sync.rs
@@ -11,12 +11,15 @@ use pin_init;
mod arc;
pub mod aref;
+pub mod atomic;
+pub mod barrier;
pub mod completion;
mod condvar;
pub mod lock;
mod locked_by;
pub mod poll;
pub mod rcu;
+mod refcount;
pub use arc::{Arc, ArcBorrow, UniqueArc};
pub use completion::Completion;
@@ -25,6 +28,7 @@ pub use lock::global::{global_lock, GlobalGuard, GlobalLock, GlobalLockBackend,
pub use lock::mutex::{new_mutex, Mutex, MutexGuard};
pub use lock::spinlock::{new_spinlock, SpinLock, SpinLockGuard};
pub use locked_by::LockedBy;
+pub use refcount::Refcount;
/// Represents a lockdep class. It's a wrapper around C's `lock_class_key`.
#[repr(transparent)]
diff --git a/rust/kernel/sync/arc.rs b/rust/kernel/sync/arc.rs
index 63a66761d0c7..9298993ea7d8 100644
--- a/rust/kernel/sync/arc.rs
+++ b/rust/kernel/sync/arc.rs
@@ -8,7 +8,7 @@
//! threads.
//!
//! It is different from the standard library's [`Arc`] in a few ways:
-//! 1. It is backed by the kernel's `refcount_t` type.
+//! 1. It is backed by the kernel's [`Refcount`] type.
//! 2. It does not support weak references, which allows it to be half the size.
//! 3. It saturates the reference count instead of aborting when it goes over a threshold.
//! 4. It does not provide a `get_mut` method, so the ref counted object is pinned.
@@ -18,11 +18,11 @@
use crate::{
alloc::{AllocError, Flags, KBox},
- bindings,
ffi::c_void,
init::InPlaceInit,
+ sync::Refcount,
try_init,
- types::{ForeignOwnable, Opaque},
+ types::ForeignOwnable,
};
use core::{
alloc::Layout,
@@ -145,7 +145,7 @@ pub struct Arc<T: ?Sized> {
#[pin_data]
#[repr(C)]
struct ArcInner<T: ?Sized> {
- refcount: Opaque<bindings::refcount_t>,
+ refcount: Refcount,
data: T,
}
@@ -157,7 +157,7 @@ impl<T: ?Sized> ArcInner<T> {
/// `ptr` must have been returned by a previous call to [`Arc::into_raw`], and the `Arc` must
/// not yet have been destroyed.
unsafe fn container_of(ptr: *const T) -> NonNull<ArcInner<T>> {
- let refcount_layout = Layout::new::<bindings::refcount_t>();
+ let refcount_layout = Layout::new::<Refcount>();
// SAFETY: The caller guarantees that the pointer is valid.
let val_layout = Layout::for_value(unsafe { &*ptr });
// SAFETY: We're computing the layout of a real struct that existed when compiling this
@@ -229,8 +229,7 @@ impl<T> Arc<T> {
pub fn new(contents: T, flags: Flags) -> Result<Self, AllocError> {
// INVARIANT: The refcount is initialised to a non-zero value.
let value = ArcInner {
- // SAFETY: There are no safety requirements for this FFI call.
- refcount: Opaque::new(unsafe { bindings::REFCOUNT_INIT(1) }),
+ refcount: Refcount::new(1),
data: contents,
};
@@ -321,7 +320,7 @@ impl<T: ?Sized> Arc<T> {
/// use kernel::sync::{Arc, UniqueArc};
///
/// let arc = Arc::new(42, GFP_KERNEL)?;
- /// let unique_arc = arc.into_unique_or_drop();
+ /// let unique_arc = Arc::into_unique_or_drop(arc);
///
/// // The above conversion should succeed since refcount of `arc` is 1.
/// assert!(unique_arc.is_some());
@@ -337,35 +336,30 @@ impl<T: ?Sized> Arc<T> {
/// let arc = Arc::new(42, GFP_KERNEL)?;
/// let another = arc.clone();
///
- /// let unique_arc = arc.into_unique_or_drop();
+ /// let unique_arc = Arc::into_unique_or_drop(arc);
///
/// // The above conversion should fail since refcount of `arc` is >1.
/// assert!(unique_arc.is_none());
///
/// # Ok::<(), Error>(())
/// ```
- pub fn into_unique_or_drop(self) -> Option<Pin<UniqueArc<T>>> {
+ pub fn into_unique_or_drop(this: Self) -> Option<Pin<UniqueArc<T>>> {
// We will manually manage the refcount in this method, so we disable the destructor.
- let me = ManuallyDrop::new(self);
+ let this = ManuallyDrop::new(this);
// SAFETY: We own a refcount, so the pointer is still valid.
- let refcount = unsafe { me.ptr.as_ref() }.refcount.get();
+ let refcount = unsafe { &this.ptr.as_ref().refcount };
// If the refcount reaches a non-zero value, then we have destroyed this `Arc` and will
// return without further touching the `Arc`. If the refcount reaches zero, then there are
// no other arcs, and we can create a `UniqueArc`.
- //
- // SAFETY: We own a refcount, so the pointer is not dangling.
- let is_zero = unsafe { bindings::refcount_dec_and_test(refcount) };
- if is_zero {
- // SAFETY: We have exclusive access to the arc, so we can perform unsynchronized
- // accesses to the refcount.
- unsafe { core::ptr::write(refcount, bindings::REFCOUNT_INIT(1)) };
+ if refcount.dec_and_test() {
+ refcount.set(1);
// INVARIANT: We own the only refcount to this arc, so we may create a `UniqueArc`. We
// must pin the `UniqueArc` because the values was previously in an `Arc`, and they pin
// their values.
Some(Pin::from(UniqueArc {
- inner: ManuallyDrop::into_inner(me),
+ inner: ManuallyDrop::into_inner(this),
}))
} else {
None
@@ -456,14 +450,10 @@ impl<T: ?Sized> Borrow<T> for Arc<T> {
impl<T: ?Sized> Clone for Arc<T> {
fn clone(&self) -> Self {
- // SAFETY: By the type invariant, there is necessarily a reference to the object, so it is
- // safe to dereference it.
- let refcount = unsafe { self.ptr.as_ref() }.refcount.get();
-
- // INVARIANT: C `refcount_inc` saturates the refcount, so it cannot overflow to zero.
+ // INVARIANT: `Refcount` saturates the refcount, so it cannot overflow to zero.
// SAFETY: By the type invariant, there is necessarily a reference to the object, so it is
// safe to increment the refcount.
- unsafe { bindings::refcount_inc(refcount) };
+ unsafe { self.ptr.as_ref() }.refcount.inc();
// SAFETY: We just incremented the refcount. This increment is now owned by the new `Arc`.
unsafe { Self::from_inner(self.ptr) }
@@ -472,16 +462,10 @@ impl<T: ?Sized> Clone for Arc<T> {
impl<T: ?Sized> Drop for Arc<T> {
fn drop(&mut self) {
- // SAFETY: By the type invariant, there is necessarily a reference to the object. We cannot
- // touch `refcount` after it's decremented to a non-zero value because another thread/CPU
- // may concurrently decrement it to zero and free it. It is ok to have a raw pointer to
- // freed/invalid memory as long as it is never dereferenced.
- let refcount = unsafe { self.ptr.as_ref() }.refcount.get();
-
// INVARIANT: If the refcount reaches zero, there are no other instances of `Arc`, and
// this instance is being dropped, so the broken invariant is not observable.
- // SAFETY: Also by the type invariant, we are allowed to decrement the refcount.
- let is_zero = unsafe { bindings::refcount_dec_and_test(refcount) };
+ // SAFETY: By the type invariant, there is necessarily a reference to the object.
+ let is_zero = unsafe { self.ptr.as_ref() }.refcount.dec_and_test();
if is_zero {
// The count reached zero, we must free the memory.
//
@@ -775,8 +759,7 @@ impl<T> UniqueArc<T> {
// INVARIANT: The refcount is initialised to a non-zero value.
let inner = KBox::try_init::<AllocError>(
try_init!(ArcInner {
- // SAFETY: There are no safety requirements for this FFI call.
- refcount: Opaque::new(unsafe { bindings::REFCOUNT_INIT(1) }),
+ refcount: Refcount::new(1),
data <- pin_init::uninit::<T, AllocError>(),
}? AllocError),
flags,
diff --git a/rust/kernel/sync/atomic.rs b/rust/kernel/sync/atomic.rs
new file mode 100644
index 000000000000..016a6bcaf080
--- /dev/null
+++ b/rust/kernel/sync/atomic.rs
@@ -0,0 +1,551 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Atomic primitives.
+//!
+//! These primitives have the same semantics as their C counterparts; the precise definitions of
+//! the semantics can be found at [`LKMM`]. Note that the Linux Kernel Memory (Consistency) Model
+//! is the only memory model for Rust code in the kernel, and Rust's own atomics should be avoided.
+//!
+//! # Data races
+//!
+//! [`LKMM`] atomics have different rules regarding data races:
+//!
+//! - A normal write from C side is treated as an atomic write if
+//! CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=y.
+//! - Mixed-size atomic accesses don't cause data races.
+//!
+//! [`LKMM`]: srctree/tools/memory-model/
+
+mod internal;
+pub mod ordering;
+mod predefine;
+
+pub use internal::AtomicImpl;
+pub use ordering::{Acquire, Full, Relaxed, Release};
+
+use crate::build_error;
+use internal::{AtomicArithmeticOps, AtomicBasicOps, AtomicExchangeOps, AtomicRepr};
+use ordering::OrderingType;
+
+/// A memory location which can be safely modified from multiple execution contexts.
+///
+/// This has the same size, alignment and bit validity as the underlying type `T`, and it disables
+/// niche optimization for the same reason as [`UnsafeCell`].
+///
+/// The atomic operations are implemented in a way that is fully compatible with the [Linux Kernel
+/// Memory (Consistency) Model][LKMM], hence they should be modeled as the corresponding
+/// [`LKMM`][LKMM] atomic primitives. With the help of [`Atomic::from_ptr()`] and
+/// [`Atomic::as_ptr()`], this provides a way to interact with [C-side atomic operations]
+/// (including those without the `atomic` prefix, e.g. `READ_ONCE()`, `WRITE_ONCE()`,
+/// `smp_load_acquire()` and `smp_store_release()`).
+///
+/// # Invariants
+///
+/// `self.0` is a valid `T`.
+///
+/// [`UnsafeCell`]: core::cell::UnsafeCell
+/// [LKMM]: srctree/tools/memory-model/
+/// [C-side atomic operations]: srctree/Documentation/atomic_t.txt
+#[repr(transparent)]
+pub struct Atomic<T: AtomicType>(AtomicRepr<T::Repr>);
+
+// SAFETY: `Atomic<T>` is safe to share among execution contexts because all accesses are atomic.
+unsafe impl<T: AtomicType> Sync for Atomic<T> {}
+
+/// Types that support basic atomic operations.
+///
+/// # Round-trip transmutability
+///
+/// `T` is round-trip transmutable to `U` if and only if both of these properties hold:
+///
+/// - Any valid bit pattern for `T` is also a valid bit pattern for `U`.
+/// - Transmuting (e.g. using [`transmute()`]) a value of type `T` to `U` and then to `T` again
+/// yields a value that is in all aspects equivalent to the original value.
+///
+/// # Safety
+///
+/// - [`Self`] must have the same size and alignment as [`Self::Repr`].
+/// - [`Self`] must be [round-trip transmutable] to [`Self::Repr`].
+///
+/// Note that this is more relaxed than requiring bi-directional transmutability (i.e.
+/// [`transmute()`] is always sound between `U` and `T`) because of the support for atomic
+/// variables over unit-only enums; see [Examples].
+///
+/// # Limitations
+///
+/// Because C primitives are used to implement the atomic operations, and a C function requires a
+/// valid object of a type to operate on (i.e. no `MaybeUninit<_>`), only types with all their bits
+/// initialized can be passed at the Rust <-> C surface. As a result, types like `(u8, u16)`
+/// (padding bytes are uninitialized) are currently not supported.
+///
+/// # Examples
+///
+/// A unit-only enum that implements [`AtomicType`]:
+///
+/// ```
+/// use kernel::sync::atomic::{AtomicType, Atomic, Relaxed};
+///
+/// #[derive(Clone, Copy, PartialEq, Eq)]
+/// #[repr(i32)]
+/// enum State {
+/// Uninit = 0,
+/// Working = 1,
+/// Done = 2,
+/// };
+///
+/// // SAFETY: `State` and `i32` have the same size and alignment, and `State` is round-trip
+/// // transmutable to `i32`.
+/// unsafe impl AtomicType for State {
+/// type Repr = i32;
+/// }
+///
+/// let s = Atomic::new(State::Uninit);
+///
+/// assert_eq!(State::Uninit, s.load(Relaxed));
+/// ```
+/// [`transmute()`]: core::mem::transmute
+/// [round-trip transmutable]: AtomicType#round-trip-transmutability
+/// [Examples]: AtomicType#examples
+pub unsafe trait AtomicType: Sized + Send + Copy {
+ /// The backing atomic implementation type.
+ type Repr: AtomicImpl;
+}
+
+/// Types that support atomic add operations.
+///
+/// # Safety
+///
+// TODO: Properly define `wrapping_add` in the following comment.
+/// `wrapping_add` of any value of type `Self::Repr::Delta` obtained by [`Self::rhs_into_delta()`]
+/// to any value of type `Self::Repr` obtained through transmuting a value of type `Self` must
+/// yield a value with a bit pattern also valid for `Self`.
+pub unsafe trait AtomicAdd<Rhs = Self>: AtomicType {
+ /// Converts `Rhs` into the `Delta` type of the atomic implementation.
+ fn rhs_into_delta(rhs: Rhs) -> <Self::Repr as AtomicImpl>::Delta;
+}
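As a sketch of what an `AtomicAdd` implementation looks like beyond the plain integers, the example below wraps a `u32` in a newtype; `Jiffies` is a made-up name, and the pattern simply mirrors the `predefine.rs` implementations in this series.

```rust
use kernel::sync::atomic::{Atomic, AtomicAdd, AtomicType, Relaxed};

// A transparent counter newtype; every `i32` bit pattern is a valid `Jiffies`.
#[derive(Clone, Copy)]
#[repr(transparent)]
struct Jiffies(u32);

// SAFETY: `Jiffies` has the same size and alignment as `i32`, and is round-trip
// transmutable to `i32`.
unsafe impl AtomicType for Jiffies {
    type Repr = i32;
}

// SAFETY: Wrapping-adding any delta to an `i32` yields an `i32`, and every
// `i32` bit pattern is a valid `Jiffies`.
unsafe impl AtomicAdd<u32> for Jiffies {
    fn rhs_into_delta(rhs: u32) -> i32 {
        rhs as i32
    }
}

let t = Atomic::new(Jiffies(0));
t.add(10, Relaxed);
assert_eq!(10, t.load(Relaxed).0);
```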
+
+#[inline(always)]
+const fn into_repr<T: AtomicType>(v: T) -> T::Repr {
+ // SAFETY: Per the safety requirement of `AtomicType`, `T` is round-trip transmutable to
+ // `T::Repr`, therefore the transmute operation is sound.
+ unsafe { core::mem::transmute_copy(&v) }
+}
+
+/// # Safety
+///
+/// `r` must be a valid bit pattern of `T`.
+#[inline(always)]
+const unsafe fn from_repr<T: AtomicType>(r: T::Repr) -> T {
+ // SAFETY: Per the safety requirement of the function, the transmute operation is sound.
+ unsafe { core::mem::transmute_copy(&r) }
+}
+
+impl<T: AtomicType> Atomic<T> {
+ /// Creates a new atomic `T`.
+ pub const fn new(v: T) -> Self {
+ // INVARIANT: Per the safety requirement of `AtomicType`, `into_repr(v)` is a valid `T`.
+ Self(AtomicRepr::new(into_repr(v)))
+ }
+
+ /// Creates a reference to an atomic `T` from a pointer of `T`.
+ ///
+ /// This is usually used when communicating with the C side or manipulating a C struct; see the
+ /// examples below.
+ ///
+ /// # Safety
+ ///
+ /// - `ptr` is aligned to `align_of::<T>()`.
+ /// - `ptr` is valid for reads and writes for `'a`.
+ /// - For the duration of `'a`, other accesses to `*ptr` must not cause data races (defined
+ /// by [`LKMM`]) against atomic operations on the returned reference. Note that if all other
+ /// accesses are atomic, then this safety requirement is trivially fulfilled.
+ ///
+ /// [`LKMM`]: srctree/tools/memory-model
+ ///
+ /// # Examples
+ ///
+ /// Using [`Atomic::from_ptr()`] combined with [`Atomic::load()`] or [`Atomic::store()`] can
+ /// achieve the same functionality as `READ_ONCE()`/`smp_load_acquire()` or
+ /// `WRITE_ONCE()`/`smp_store_release()` in C side:
+ ///
+ /// ```
+ /// # use kernel::types::Opaque;
+ /// use kernel::sync::atomic::{Atomic, Relaxed, Release};
+ ///
+ /// // Assume there is a C struct `foo`.
+ /// mod cbindings {
+ /// #[repr(C)]
+ /// pub(crate) struct foo {
+ /// pub(crate) a: i32,
+ /// pub(crate) b: i32
+ /// }
+ /// }
+ ///
+ /// let tmp = Opaque::new(cbindings::foo { a: 1, b: 2 });
+ ///
+ /// // struct foo *foo_ptr = ..;
+ /// let foo_ptr = tmp.get();
+ ///
+ /// // SAFETY: `foo_ptr` is valid, and `.a` is in bounds.
+ /// let foo_a_ptr = unsafe { &raw mut (*foo_ptr).a };
+ ///
+ /// // a = READ_ONCE(foo_ptr->a);
+ /// //
+ /// // SAFETY: `foo_a_ptr` is valid for reads, and all other accesses to it are atomic, so
+ /// // there is no data race.
+ /// let a = unsafe { Atomic::from_ptr(foo_a_ptr) }.load(Relaxed);
+ /// # assert_eq!(a, 1);
+ ///
+ /// // smp_store_release(&foo_ptr->a, 2);
+ /// //
+ /// // SAFETY: `foo_a_ptr` is valid for writes, and all other accesses to it are atomic, so
+ /// // there is no data race.
+ /// unsafe { Atomic::from_ptr(foo_a_ptr) }.store(2, Release);
+ /// ```
+ pub unsafe fn from_ptr<'a>(ptr: *mut T) -> &'a Self
+ where
+ T: Sync,
+ {
+ // CAST: `T` and `Atomic<T>` have the same size, alignment and bit validity.
+ // SAFETY: Per function safety requirement, `ptr` is a valid pointer and the object will
+ // live long enough. It's safe to return a `&Atomic<T>` because function safety requirement
+ // guarantees other accesses won't cause data races.
+ unsafe { &*ptr.cast::<Self>() }
+ }
+
+ /// Returns a pointer to the underlying atomic `T`.
+ ///
+ /// Note that use of the returned pointer must not cause data races defined by [`LKMM`].
+ ///
+ /// # Guarantees
+ ///
+ /// The returned pointer is valid and properly aligned (i.e. aligned to [`align_of::<T>()`]).
+ ///
+ /// [`LKMM`]: srctree/tools/memory-model
+ /// [`align_of::<T>()`]: core::mem::align_of
+ pub const fn as_ptr(&self) -> *mut T {
+ // GUARANTEE: Per the function guarantee of `AtomicRepr::as_ptr()`, the `self.0.as_ptr()`
+ // must be a valid and properly aligned pointer for `T::Repr`, and per the safety guarantee
+ // of `AtomicType`, it's a valid and properly aligned pointer of `T`.
+ self.0.as_ptr().cast()
+ }
+
+ /// Returns a mutable reference to the underlying atomic `T`.
+ ///
+ /// This is safe because the mutable reference of the atomic `T` guarantees exclusive access.
+ pub fn get_mut(&mut self) -> &mut T {
+ // CAST: `T` and `T::Repr` have the same size and alignment per the safety requirement of
+ // `AtomicType`, and per the type invariants `self.0` is a valid `T`, therefore the casting
+ // result is a valid pointer of `T`.
+ // SAFETY: The pointer is valid per the CAST comment above, and the mutable reference
+ // guarantees exclusive access.
+ unsafe { &mut *self.0.as_ptr().cast() }
+ }
+}
+
+impl<T: AtomicType> Atomic<T>
+where
+ T::Repr: AtomicBasicOps,
+{
+ /// Loads the value from the atomic `T`.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use kernel::sync::atomic::{Atomic, Relaxed};
+ ///
+ /// let x = Atomic::new(42i32);
+ ///
+ /// assert_eq!(42, x.load(Relaxed));
+ ///
+ /// let x = Atomic::new(42i64);
+ ///
+ /// assert_eq!(42, x.load(Relaxed));
+ /// ```
+ #[doc(alias("atomic_read", "atomic64_read"))]
+ #[inline(always)]
+ pub fn load<Ordering: ordering::AcquireOrRelaxed>(&self, _: Ordering) -> T {
+ let v = {
+ match Ordering::TYPE {
+ OrderingType::Relaxed => T::Repr::atomic_read(&self.0),
+ OrderingType::Acquire => T::Repr::atomic_read_acquire(&self.0),
+ _ => build_error!("Wrong ordering"),
+ }
+ };
+
+ // SAFETY: `v` comes from reading `self.0`, which is a valid `T` per the type invariants.
+ unsafe { from_repr(v) }
+ }
+
+ /// Stores a value to the atomic `T`.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use kernel::sync::atomic::{Atomic, Relaxed};
+ ///
+ /// let x = Atomic::new(42i32);
+ ///
+ /// assert_eq!(42, x.load(Relaxed));
+ ///
+ /// x.store(43, Relaxed);
+ ///
+ /// assert_eq!(43, x.load(Relaxed));
+ /// ```
+ #[doc(alias("atomic_set", "atomic64_set"))]
+ #[inline(always)]
+ pub fn store<Ordering: ordering::ReleaseOrRelaxed>(&self, v: T, _: Ordering) {
+ let v = into_repr(v);
+
+ // INVARIANT: `v` is a valid `T`, and is stored to `self.0` by `atomic_set*()`.
+ match Ordering::TYPE {
+ OrderingType::Relaxed => T::Repr::atomic_set(&self.0, v),
+ OrderingType::Release => T::Repr::atomic_set_release(&self.0, v),
+ _ => build_error!("Wrong ordering"),
+ }
+ }
+}
+
+impl<T: AtomicType> Atomic<T>
+where
+ T::Repr: AtomicExchangeOps,
+{
+ /// Atomic exchange.
+ ///
+ /// Atomically updates `*self` to `v` and returns the old value of `*self`.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use kernel::sync::atomic::{Atomic, Acquire, Relaxed};
+ ///
+ /// let x = Atomic::new(42);
+ ///
+ /// assert_eq!(42, x.xchg(52, Acquire));
+ /// assert_eq!(52, x.load(Relaxed));
+ /// ```
+ #[doc(alias("atomic_xchg", "atomic64_xchg", "swap"))]
+ #[inline(always)]
+ pub fn xchg<Ordering: ordering::Ordering>(&self, v: T, _: Ordering) -> T {
+ let v = into_repr(v);
+
+ // INVARIANT: `self.0` is a valid `T` after `atomic_xchg*()` because `v` is transmutable to
+ // `T`.
+ let ret = {
+ match Ordering::TYPE {
+ OrderingType::Full => T::Repr::atomic_xchg(&self.0, v),
+ OrderingType::Acquire => T::Repr::atomic_xchg_acquire(&self.0, v),
+ OrderingType::Release => T::Repr::atomic_xchg_release(&self.0, v),
+ OrderingType::Relaxed => T::Repr::atomic_xchg_relaxed(&self.0, v),
+ }
+ };
+
+ // SAFETY: `ret` comes from reading `*self`, which is a valid `T` per type invariants.
+ unsafe { from_repr(ret) }
+ }
+
+ /// Atomic compare and exchange.
+ ///
+ /// If `*self` == `old`, atomically updates `*self` to `new`. Otherwise, `*self` is not
+ /// modified.
+ ///
+ /// Compare: The comparison is done via the byte level comparison between `*self` and `old`.
+ ///
+ /// Ordering: When the operation succeeds, it provides the ordering indicated by the `Ordering`
+ /// type parameter; a failed operation provides no ordering, and the load part of a failed
+ /// cmpxchg is a [`Relaxed`] load.
+ ///
+ /// Returns `Ok(value)` if the cmpxchg succeeds, in which case `value` is guaranteed to be
+ /// equal to `old`; otherwise returns `Err(value)`, where `value` is the current value of `*self`.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use kernel::sync::atomic::{Atomic, Full, Relaxed};
+ ///
+ /// let x = Atomic::new(42);
+ ///
+ /// // Checks whether cmpxchg succeeded.
+ /// let success = x.cmpxchg(52, 64, Relaxed).is_ok();
+ /// # assert!(!success);
+ ///
+ /// // Checks whether cmpxchg failed.
+ /// let failure = x.cmpxchg(52, 64, Relaxed).is_err();
+ /// # assert!(failure);
+ ///
+ /// // Uses the old value if it failed, e.g. to retry the cmpxchg.
+ /// match x.cmpxchg(52, 64, Relaxed) {
+ /// Ok(_) => { },
+ /// Err(old) => {
+ /// // do something with `old`.
+ /// # assert_eq!(old, 42);
+ /// }
+ /// }
+ ///
+ /// // Uses the latest value regardless of success, same as atomic_cmpxchg() in C.
+ /// let latest = x.cmpxchg(42, 64, Full).unwrap_or_else(|old| old);
+ /// # assert_eq!(42, latest);
+ /// assert_eq!(64, x.load(Relaxed));
+ /// ```
+ ///
+ /// [`Relaxed`]: ordering::Relaxed
+ #[doc(alias(
+ "atomic_cmpxchg",
+ "atomic64_cmpxchg",
+ "atomic_try_cmpxchg",
+ "atomic64_try_cmpxchg",
+ "compare_exchange"
+ ))]
+ #[inline(always)]
+ pub fn cmpxchg<Ordering: ordering::Ordering>(
+ &self,
+ mut old: T,
+ new: T,
+ o: Ordering,
+ ) -> Result<T, T> {
+ // Note on code generation:
+ //
+ // try_cmpxchg() is used to implement cmpxchg(), and if the helper functions are inlined,
+ // the compiler is able to figure out that the branch is not needed if the users don't care
+ // about whether the operation succeeds or not. One exception is on x86, due to commit
+ // 44fe84459faf ("locking/atomic: Fix atomic_try_cmpxchg() semantics"), the
+ // atomic_try_cmpxchg() on x86 has a branch even if the caller doesn't care about the
+ // success of cmpxchg and only wants to use the old value. For example, for code like:
+ //
+ // let latest = x.cmpxchg(42, 64, Full).unwrap_or_else(|old| old);
+ //
+ // It will still generate code:
+ //
+ // movl $0x40, %ecx
+ // movl $0x34, %eax
+ // lock
+ // cmpxchgl %ecx, 0x4(%rsp)
+ // jne 1f
+ // 2:
+ // ...
+ // 1: movl %eax, %ecx
+ // jmp 2b
+ //
+ // This might be "fixed" by introducing a try_cmpxchg_exclusive() that knows the "*old"
+ // location in the C function is always safe to write.
+ if self.try_cmpxchg(&mut old, new, o) {
+ Ok(old)
+ } else {
+ Err(old)
+ }
+ }
+
+ /// Atomic compare and exchange and returns whether the operation succeeds.
+ ///
+ /// If `*self` == `old`, atomically updates `*self` to `new`. Otherwise, `*self` is not
+ /// modified and `*old` is updated to the current value of `*self`.
+ ///
+ /// The "Compare" and "Ordering" parts are the same as [`Atomic::cmpxchg()`].
+ ///
+ /// Returns `true` if the cmpxchg succeeds, `false` otherwise.
+ #[inline(always)]
+ fn try_cmpxchg<Ordering: ordering::Ordering>(&self, old: &mut T, new: T, _: Ordering) -> bool {
+ let mut tmp = into_repr(*old);
+ let new = into_repr(new);
+
+ // INVARIANT: `self.0` is a valid `T` after `atomic_try_cmpxchg*()` because `new` is
+ // transmutable to `T`.
+ let ret = {
+ match Ordering::TYPE {
+ OrderingType::Full => T::Repr::atomic_try_cmpxchg(&self.0, &mut tmp, new),
+ OrderingType::Acquire => {
+ T::Repr::atomic_try_cmpxchg_acquire(&self.0, &mut tmp, new)
+ }
+ OrderingType::Release => {
+ T::Repr::atomic_try_cmpxchg_release(&self.0, &mut tmp, new)
+ }
+ OrderingType::Relaxed => {
+ T::Repr::atomic_try_cmpxchg_relaxed(&self.0, &mut tmp, new)
+ }
+ }
+ };
+
+ // SAFETY: `tmp` comes from reading `*self`, which is a valid `T` per type invariants.
+ *old = unsafe { from_repr(tmp) };
+
+ ret
+ }
+}
+
+impl<T: AtomicType> Atomic<T>
+where
+ T::Repr: AtomicArithmeticOps,
+{
+ /// Atomic add.
+ ///
+ /// Atomically updates `*self` to `(*self).wrapping_add(v)`.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use kernel::sync::atomic::{Atomic, Relaxed};
+ ///
+ /// let x = Atomic::new(42);
+ ///
+ /// assert_eq!(42, x.load(Relaxed));
+ ///
+ /// x.add(12, Relaxed);
+ ///
+ /// assert_eq!(54, x.load(Relaxed));
+ /// ```
+ #[inline(always)]
+ pub fn add<Rhs>(&self, v: Rhs, _: ordering::Relaxed)
+ where
+ T: AtomicAdd<Rhs>,
+ {
+ let v = T::rhs_into_delta(v);
+
+ // INVARIANT: `self.0` is a valid `T` after `atomic_add()` due to safety requirement of
+ // `AtomicAdd`.
+ T::Repr::atomic_add(&self.0, v);
+ }
+
+ /// Atomic fetch and add.
+ ///
+ /// Atomically updates `*self` to `(*self).wrapping_add(v)`, and returns the value of `*self`
+ /// before the update.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use kernel::sync::atomic::{Atomic, Acquire, Full, Relaxed};
+ ///
+ /// let x = Atomic::new(42);
+ ///
+ /// assert_eq!(42, x.load(Relaxed));
+ ///
+ /// assert_eq!(54, { x.fetch_add(12, Acquire); x.load(Relaxed) });
+ ///
+ /// let x = Atomic::new(42);
+ ///
+ /// assert_eq!(42, x.load(Relaxed));
+ ///
+ /// assert_eq!(54, { x.fetch_add(12, Full); x.load(Relaxed) } );
+ /// ```
+ #[inline(always)]
+ pub fn fetch_add<Rhs, Ordering: ordering::Ordering>(&self, v: Rhs, _: Ordering) -> T
+ where
+ T: AtomicAdd<Rhs>,
+ {
+ let v = T::rhs_into_delta(v);
+
+ // INVARIANT: `self.0` is a valid `T` after `atomic_fetch_add*()` due to safety requirement
+ // of `AtomicAdd`.
+ let ret = {
+ match Ordering::TYPE {
+ OrderingType::Full => T::Repr::atomic_fetch_add(&self.0, v),
+ OrderingType::Acquire => T::Repr::atomic_fetch_add_acquire(&self.0, v),
+ OrderingType::Release => T::Repr::atomic_fetch_add_release(&self.0, v),
+ OrderingType::Relaxed => T::Repr::atomic_fetch_add_relaxed(&self.0, v),
+ }
+ };
+
+ // SAFETY: `ret` comes from reading `self.0`, which is a valid `T` per type invariants.
+ unsafe { from_repr(ret) }
+ }
+}
diff --git a/rust/kernel/sync/atomic/internal.rs b/rust/kernel/sync/atomic/internal.rs
new file mode 100644
index 000000000000..6fdd8e59f45b
--- /dev/null
+++ b/rust/kernel/sync/atomic/internal.rs
@@ -0,0 +1,265 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Atomic internal implementations.
+//!
+//! Provides 1:1 mapping to the C atomic operations.
+
+use crate::bindings;
+use crate::macros::paste;
+use core::cell::UnsafeCell;
+
+mod private {
+ /// Sealed trait marker to disable customized impls on atomic implementation traits.
+ pub trait Sealed {}
+}
+
+// `i32` and `i64` are the only supported atomic implementations.
+impl private::Sealed for i32 {}
+impl private::Sealed for i64 {}
+
+/// A marker trait for types that implement atomic operations with C side primitives.
+///
+/// This trait is sealed, and only types that map directly to the C side atomics should implement
+/// this:
+///
+/// - `i32` maps to `atomic_t`.
+/// - `i64` maps to `atomic64_t`.
+pub trait AtomicImpl: Sized + Send + Copy + private::Sealed {
+ /// The type of the delta in arithmetic or logical operations.
+ ///
+ /// For example, in `atomic_add(ptr, v)`, it's the type of `v`. Usually it's the same type as
+ /// [`Self`], but it may be different for the atomic pointer type.
+ type Delta;
+}
+
+// `atomic_t` implements atomic operations on `i32`.
+impl AtomicImpl for i32 {
+ type Delta = Self;
+}
+
+// `atomic64_t` implements atomic operations on `i64`.
+impl AtomicImpl for i64 {
+ type Delta = Self;
+}
+
+/// Atomic representation.
+#[repr(transparent)]
+pub struct AtomicRepr<T: AtomicImpl>(UnsafeCell<T>);
+
+impl<T: AtomicImpl> AtomicRepr<T> {
+ /// Creates a new atomic representation `T`.
+ pub const fn new(v: T) -> Self {
+ Self(UnsafeCell::new(v))
+ }
+
+ /// Returns a pointer to the underlying `T`.
+ ///
+ /// # Guarantees
+ ///
+ /// The returned pointer is valid and properly aligned (i.e. aligned to [`align_of::<T>()`]).
+ pub const fn as_ptr(&self) -> *mut T {
+ // GUARANTEE: `self.0` is an `UnsafeCell<T>`, therefore the pointer returned by `.get()`
+ // must be valid and properly aligned.
+ self.0.get()
+ }
+}
+
+// This macro generates the function signature with the given argument list and return type.
+macro_rules! declare_atomic_method {
+ (
+ $(#[doc=$doc:expr])*
+ $func:ident($($arg:ident : $arg_type:ty),*) $(-> $ret:ty)?
+ ) => {
+ paste!(
+ $(#[doc = $doc])*
+ fn [< atomic_ $func >]($($arg: $arg_type,)*) $(-> $ret)?;
+ );
+ };
+ (
+ $(#[doc=$doc:expr])*
+ $func:ident [$variant:ident $($rest:ident)*]($($arg_sig:tt)*) $(-> $ret:ty)?
+ ) => {
+ paste!(
+ declare_atomic_method!(
+ $(#[doc = $doc])*
+ [< $func _ $variant >]($($arg_sig)*) $(-> $ret)?
+ );
+ );
+
+ declare_atomic_method!(
+ $(#[doc = $doc])*
+ $func [$($rest)*]($($arg_sig)*) $(-> $ret)?
+ );
+ };
+ (
+ $(#[doc=$doc:expr])*
+ $func:ident []($($arg_sig:tt)*) $(-> $ret:ty)?
+ ) => {
+ declare_atomic_method!(
+ $(#[doc = $doc])*
+ $func($($arg_sig)*) $(-> $ret)?
+ );
+ }
+}
+
+// This macro generates the function implementation with the given argument list and return type,
+// and it replaces the "call(...)" expression with "$ctype _ $func" to call the real C function.
+macro_rules! impl_atomic_method {
+ (
+ ($ctype:ident) $func:ident($($arg:ident: $arg_type:ty),*) $(-> $ret:ty)? {
+ $unsafe:tt { call($($c_arg:expr),*) }
+ }
+ ) => {
+ paste!(
+ #[inline(always)]
+ fn [< atomic_ $func >]($($arg: $arg_type,)*) $(-> $ret)? {
+ // TODO: Ideally we want to use the SAFETY comments written at the macro invocation
+ // (e.g. in `declare_and_impl_atomic_methods!()`); however, since SAFETY comments are
+ // just comments and are not passed to macros as tokens, we cannot use them here. One
+ // potential improvement is that if we support using attributes as an alternative to
+ // SAFETY comments, then we can use them for macro-generated code.
+ //
+ // SAFETY: specified on macro invocation.
+ $unsafe { bindings::[< $ctype _ $func >]($($c_arg,)*) }
+ }
+ );
+ };
+ (
+ ($ctype:ident) $func:ident[$variant:ident $($rest:ident)*]($($arg_sig:tt)*) $(-> $ret:ty)? {
+ $unsafe:tt { call($($arg:tt)*) }
+ }
+ ) => {
+ paste!(
+ impl_atomic_method!(
+ ($ctype) [< $func _ $variant >]($($arg_sig)*) $( -> $ret)? {
+ $unsafe { call($($arg)*) }
+ }
+ );
+ );
+ impl_atomic_method!(
+ ($ctype) $func [$($rest)*]($($arg_sig)*) $( -> $ret)? {
+ $unsafe { call($($arg)*) }
+ }
+ );
+ };
+ (
+ ($ctype:ident) $func:ident[]($($arg_sig:tt)*) $( -> $ret:ty)? {
+ $unsafe:tt { call($($arg:tt)*) }
+ }
+ ) => {
+ impl_atomic_method!(
+ ($ctype) $func($($arg_sig)*) $(-> $ret)? {
+ $unsafe { call($($arg)*) }
+ }
+ );
+ }
+}
+
+// Declares the `$ops` trait with methods and implements the trait for `i32` and `i64`.
+macro_rules! declare_and_impl_atomic_methods {
+ ($(#[$attr:meta])* $pub:vis trait $ops:ident {
+ $(
+ $(#[doc=$doc:expr])*
+ fn $func:ident [$($variant:ident),*]($($arg_sig:tt)*) $( -> $ret:ty)? {
+ $unsafe:tt { bindings::#call($($arg:tt)*) }
+ }
+ )*
+ }) => {
+ $(#[$attr])*
+ $pub trait $ops: AtomicImpl {
+ $(
+ declare_atomic_method!(
+ $(#[doc=$doc])*
+ $func[$($variant)*]($($arg_sig)*) $(-> $ret)?
+ );
+ )*
+ }
+
+ impl $ops for i32 {
+ $(
+ impl_atomic_method!(
+ (atomic) $func[$($variant)*]($($arg_sig)*) $(-> $ret)? {
+ $unsafe { call($($arg)*) }
+ }
+ );
+ )*
+ }
+
+ impl $ops for i64 {
+ $(
+ impl_atomic_method!(
+ (atomic64) $func[$($variant)*]($($arg_sig)*) $(-> $ret)? {
+ $unsafe { call($($arg)*) }
+ }
+ );
+ )*
+ }
+ }
+}
+
+declare_and_impl_atomic_methods!(
+ /// Basic atomic operations
+ pub trait AtomicBasicOps {
+ /// Atomic read (load).
+ fn read[acquire](a: &AtomicRepr<Self>) -> Self {
+ // SAFETY: `a.as_ptr()` is valid and properly aligned.
+ unsafe { bindings::#call(a.as_ptr().cast()) }
+ }
+
+ /// Atomic set (store).
+ fn set[release](a: &AtomicRepr<Self>, v: Self) {
+ // SAFETY: `a.as_ptr()` is valid and properly aligned.
+ unsafe { bindings::#call(a.as_ptr().cast(), v) }
+ }
+ }
+);
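For readers unfamiliar with the macro, this is roughly (not exactly) what the `AtomicBasicOps` invocation above expands to for `i32`; the `i64` impl is analogous but calls the `atomic64_*` C functions.

```rust
pub trait AtomicBasicOps: AtomicImpl {
    fn atomic_read(a: &AtomicRepr<Self>) -> Self;
    fn atomic_read_acquire(a: &AtomicRepr<Self>) -> Self;
    fn atomic_set(a: &AtomicRepr<Self>, v: Self);
    fn atomic_set_release(a: &AtomicRepr<Self>, v: Self);
}

impl AtomicBasicOps for i32 {
    #[inline(always)]
    fn atomic_read(a: &AtomicRepr<Self>) -> Self {
        // SAFETY: `a.as_ptr()` is valid and properly aligned.
        unsafe { bindings::atomic_read(a.as_ptr().cast()) }
    }

    #[inline(always)]
    fn atomic_read_acquire(a: &AtomicRepr<Self>) -> Self {
        // SAFETY: `a.as_ptr()` is valid and properly aligned.
        unsafe { bindings::atomic_read_acquire(a.as_ptr().cast()) }
    }

    #[inline(always)]
    fn atomic_set(a: &AtomicRepr<Self>, v: Self) {
        // SAFETY: `a.as_ptr()` is valid and properly aligned.
        unsafe { bindings::atomic_set(a.as_ptr().cast(), v) }
    }

    #[inline(always)]
    fn atomic_set_release(a: &AtomicRepr<Self>, v: Self) {
        // SAFETY: `a.as_ptr()` is valid and properly aligned.
        unsafe { bindings::atomic_set_release(a.as_ptr().cast(), v) }
    }
}
```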
+
+declare_and_impl_atomic_methods!(
+ /// Exchange and compare-and-exchange atomic operations
+ pub trait AtomicExchangeOps {
+ /// Atomic exchange.
+ ///
+ /// Atomically updates `*a` to `v` and returns the old value.
+ fn xchg[acquire, release, relaxed](a: &AtomicRepr<Self>, v: Self) -> Self {
+ // SAFETY: `a.as_ptr()` is valid and properly aligned.
+ unsafe { bindings::#call(a.as_ptr().cast(), v) }
+ }
+
+ /// Atomic compare and exchange.
+ ///
+ /// If `*a` == `*old`, atomically updates `*a` to `new`. Otherwise, `*a` is not
+ /// modified, `*old` is updated to the current value of `*a`.
+ ///
+ /// Return `true` if the update of `*a` occurred, `false` otherwise.
+ fn try_cmpxchg[acquire, release, relaxed](
+ a: &AtomicRepr<Self>, old: &mut Self, new: Self
+ ) -> bool {
+ // SAFETY: `a.as_ptr()` is valid and properly aligned. `core::ptr::from_mut(old)`
+ // is valid and properly aligned.
+ unsafe { bindings::#call(a.as_ptr().cast(), core::ptr::from_mut(old), new) }
+ }
+ }
+);
+
+declare_and_impl_atomic_methods!(
+ /// Atomic arithmetic operations
+ pub trait AtomicArithmeticOps {
+ /// Atomic add (wrapping).
+ ///
+ /// Atomically updates `*a` to `(*a).wrapping_add(v)`.
+ fn add[](a: &AtomicRepr<Self>, v: Self::Delta) {
+ // SAFETY: `a.as_ptr()` is valid and properly aligned.
+ unsafe { bindings::#call(v, a.as_ptr().cast()) }
+ }
+
+ /// Atomic fetch and add (wrapping).
+ ///
+ /// Atomically updates `*a` to `(*a).wrapping_add(v)`, and returns the value of `*a`
+ /// before the update.
+ fn fetch_add[acquire, release, relaxed](a: &AtomicRepr<Self>, v: Self::Delta) -> Self {
+ // SAFETY: `a.as_ptr()` is valid and properly aligned.
+ unsafe { bindings::#call(v, a.as_ptr().cast()) }
+ }
+ }
+);
diff --git a/rust/kernel/sync/atomic/ordering.rs b/rust/kernel/sync/atomic/ordering.rs
new file mode 100644
index 000000000000..3f103aa8db99
--- /dev/null
+++ b/rust/kernel/sync/atomic/ordering.rs
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Memory orderings.
+//!
+//! The semantics of these orderings follows the [`LKMM`] definitions and rules.
+//!
+//! - [`Acquire`] provides ordering between the load part of the annotated operation and all the
+//! following memory accesses, and if there is a store part, the store part has the [`Relaxed`]
+//! ordering.
+//! - [`Release`] provides ordering between all the preceding memory accesses and the store part of
+//! the annotated operation, and if there is a load part, the load part has the [`Relaxed`]
+//! ordering.
+//! - [`Full`] means "fully-ordered", that is:
+//! - It provides ordering between all the preceding memory accesses and the annotated operation.
+//! - It provides ordering between the annotated operation and all the following memory accesses.
+//! - It provides ordering between all the preceding memory accesses and all the following memory
+//! accesses.
+//! - All the orderings are the same strength as a full memory barrier (i.e. `smp_mb()`).
+//! - [`Relaxed`] provides no ordering except the dependency orderings. Dependency orderings are
+//! described in "DEPENDENCY RELATIONS" in [`LKMM`]'s [`explanation`].
+//!
+//! [`LKMM`]: srctree/tools/memory-model/
+//! [`explanation`]: srctree/tools/memory-model/Documentation/explanation.txt
+
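As a concrete illustration of the acquire/release pairing described above, here is a small message-passing sketch using the `Atomic` type from this series; `writer()` and `reader()` would run in different threads, and the names are illustrative.

```rust
use kernel::sync::atomic::{Atomic, Acquire, Relaxed, Release};

static DATA: Atomic<i32> = Atomic::new(0);
static READY: Atomic<i32> = Atomic::new(0);

fn writer() {
    DATA.store(42, Relaxed);
    // Release: orders the `DATA` store before the `READY` store.
    READY.store(1, Release);
}

fn reader() {
    // Acquire: if this observes 1, the `DATA` store is also visible.
    if READY.load(Acquire) == 1 {
        assert_eq!(42, DATA.load(Relaxed));
    }
}
```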
+/// The annotation type for relaxed memory ordering. For the description of relaxed memory
+/// ordering, see [module-level documentation].
+///
+/// [module-level documentation]: crate::sync::atomic::ordering
+pub struct Relaxed;
+
+/// The annotation type for acquire memory ordering. For the description of acquire memory
+/// ordering, see [module-level documentation].
+///
+/// [module-level documentation]: crate::sync::atomic::ordering
+pub struct Acquire;
+
+/// The annotation type for release memory ordering. For the description of release memory
+/// ordering, see [module-level documentation].
+///
+/// [module-level documentation]: crate::sync::atomic::ordering
+pub struct Release;
+
+/// The annotation type for fully-ordered memory ordering. For the description of fully-ordered
+/// memory ordering, see [module-level documentation].
+///
+/// [module-level documentation]: crate::sync::atomic::ordering
+pub struct Full;
+
+/// Describes the exact memory ordering.
+#[doc(hidden)]
+pub enum OrderingType {
+ /// Relaxed ordering.
+ Relaxed,
+ /// Acquire ordering.
+ Acquire,
+ /// Release ordering.
+ Release,
+ /// Fully-ordered.
+ Full,
+}
+
+mod internal {
+ /// Sealed trait, can be only implemented inside atomic mod.
+ pub trait Sealed {}
+
+ impl Sealed for super::Relaxed {}
+ impl Sealed for super::Acquire {}
+ impl Sealed for super::Release {}
+ impl Sealed for super::Full {}
+}
+
+/// The trait bound for annotating operations that support any ordering.
+pub trait Ordering: internal::Sealed {
+ /// Describes the exact memory ordering.
+ const TYPE: OrderingType;
+}
+
+impl Ordering for Relaxed {
+ const TYPE: OrderingType = OrderingType::Relaxed;
+}
+
+impl Ordering for Acquire {
+ const TYPE: OrderingType = OrderingType::Acquire;
+}
+
+impl Ordering for Release {
+ const TYPE: OrderingType = OrderingType::Release;
+}
+
+impl Ordering for Full {
+ const TYPE: OrderingType = OrderingType::Full;
+}
+
+/// The trait bound for operations that only support acquire or relaxed ordering.
+pub trait AcquireOrRelaxed: Ordering {}
+
+impl AcquireOrRelaxed for Acquire {}
+impl AcquireOrRelaxed for Relaxed {}
+
+/// The trait bound for operations that only support release or relaxed ordering.
+pub trait ReleaseOrRelaxed: Ordering {}
+
+impl ReleaseOrRelaxed for Release {}
+impl ReleaseOrRelaxed for Relaxed {}
diff --git a/rust/kernel/sync/atomic/predefine.rs b/rust/kernel/sync/atomic/predefine.rs
new file mode 100644
index 000000000000..45a17985cda4
--- /dev/null
+++ b/rust/kernel/sync/atomic/predefine.rs
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Pre-defined atomic types
+
+use crate::static_assert;
+use core::mem::{align_of, size_of};
+
+// SAFETY: `i32` has the same size and alignment as itself, and is round-trip transmutable to
+// itself.
+unsafe impl super::AtomicType for i32 {
+ type Repr = i32;
+}
+
+// SAFETY: The wrapping add result of two `i32`s is a valid `i32`.
+unsafe impl super::AtomicAdd<i32> for i32 {
+ fn rhs_into_delta(rhs: i32) -> i32 {
+ rhs
+ }
+}
+
+// SAFETY: `i64` has the same size and alignment as itself, and is round-trip transmutable to
+// itself.
+unsafe impl super::AtomicType for i64 {
+ type Repr = i64;
+}
+
+// SAFETY: The wrapping add result of two `i64`s is a valid `i64`.
+unsafe impl super::AtomicAdd<i64> for i64 {
+ fn rhs_into_delta(rhs: i64) -> i64 {
+ rhs
+ }
+}
+
+// Defines an internal type that always maps to the integer type which has the same size and
+// alignment as `isize` and `usize`; `isize` and `usize` are always bi-directionally transmutable
+// to `isize_atomic_repr`, which also always implements `AtomicImpl`.
+#[allow(non_camel_case_types)]
+#[cfg(not(CONFIG_64BIT))]
+type isize_atomic_repr = i32;
+#[allow(non_camel_case_types)]
+#[cfg(CONFIG_64BIT)]
+type isize_atomic_repr = i64;
+
+// Ensure size and alignment requirements are checked.
+static_assert!(size_of::<isize>() == size_of::<isize_atomic_repr>());
+static_assert!(align_of::<isize>() == align_of::<isize_atomic_repr>());
+static_assert!(size_of::<usize>() == size_of::<isize_atomic_repr>());
+static_assert!(align_of::<usize>() == align_of::<isize_atomic_repr>());
+
+// SAFETY: `isize` has the same size and alignment as `isize_atomic_repr`, and is round-trip
+// transmutable to `isize_atomic_repr`.
+unsafe impl super::AtomicType for isize {
+ type Repr = isize_atomic_repr;
+}
+
+// SAFETY: The wrapping add result of two `isize_atomic_repr`s is a valid `isize`.
+unsafe impl super::AtomicAdd<isize> for isize {
+ fn rhs_into_delta(rhs: isize) -> isize_atomic_repr {
+ rhs as isize_atomic_repr
+ }
+}
+
+// SAFETY: `u32` and `i32` have the same size and alignment, and `u32` is round-trip transmutable
+// to `i32`.
+unsafe impl super::AtomicType for u32 {
+ type Repr = i32;
+}
+
+// SAFETY: The wrapping add result of two `i32`s is a valid `u32`.
+unsafe impl super::AtomicAdd<u32> for u32 {
+ fn rhs_into_delta(rhs: u32) -> i32 {
+ rhs as i32
+ }
+}
+
+// SAFETY: `u64` and `i64` have the same size and alignment, and `u64` is round-trip transmutable
+// to `i64`.
+unsafe impl super::AtomicType for u64 {
+ type Repr = i64;
+}
+
+// SAFETY: The wrapping add result of two `i64`s is a valid `u64`.
+unsafe impl super::AtomicAdd<u64> for u64 {
+ fn rhs_into_delta(rhs: u64) -> i64 {
+ rhs as i64
+ }
+}
+
+// SAFETY: `usize` has the same size and alignment as `isize_atomic_repr`, and is round-trip
+// transmutable to `isize_atomic_repr`.
+unsafe impl super::AtomicType for usize {
+ type Repr = isize_atomic_repr;
+}
+
+// SAFETY: The wrapping add result of two `isize_atomic_repr`s is a valid `usize`.
+unsafe impl super::AtomicAdd<usize> for usize {
+ fn rhs_into_delta(rhs: usize) -> isize_atomic_repr {
+ rhs as isize_atomic_repr
+ }
+}
+
+use crate::macros::kunit_tests;
+
+#[kunit_tests(rust_atomics)]
+mod tests {
+ use super::super::*;
+
+ // Call $fn($val) with each $type of $val.
+ macro_rules! for_each_type {
+ ($val:literal in [$($type:ty),*] $fn:expr) => {
+ $({
+ let v: $type = $val;
+
+ $fn(v);
+ })*
+ }
+ }
+
+ #[test]
+ fn atomic_basic_tests() {
+ for_each_type!(42 in [i32, i64, u32, u64, isize, usize] |v| {
+ let x = Atomic::new(v);
+
+ assert_eq!(v, x.load(Relaxed));
+ });
+ }
+
+ #[test]
+ fn atomic_xchg_tests() {
+ for_each_type!(42 in [i32, i64, u32, u64, isize, usize] |v| {
+ let x = Atomic::new(v);
+
+ let old = v;
+ let new = v + 1;
+
+ assert_eq!(old, x.xchg(new, Full));
+ assert_eq!(new, x.load(Relaxed));
+ });
+ }
+
+ #[test]
+ fn atomic_cmpxchg_tests() {
+ for_each_type!(42 in [i32, i64, u32, u64, isize, usize] |v| {
+ let x = Atomic::new(v);
+
+ let old = v;
+ let new = v + 1;
+
+ assert_eq!(Err(old), x.cmpxchg(new, new, Full));
+ assert_eq!(old, x.load(Relaxed));
+ assert_eq!(Ok(old), x.cmpxchg(old, new, Relaxed));
+ assert_eq!(new, x.load(Relaxed));
+ });
+ }
+
+ #[test]
+ fn atomic_arithmetic_tests() {
+ for_each_type!(42 in [i32, i64, u32, u64, isize, usize] |v| {
+ let x = Atomic::new(v);
+
+ assert_eq!(v, x.fetch_add(12, Full));
+ assert_eq!(v + 12, x.load(Relaxed));
+
+ x.add(13, Relaxed);
+
+ assert_eq!(v + 25, x.load(Relaxed));
+ });
+ }
+}
diff --git a/rust/kernel/sync/barrier.rs b/rust/kernel/sync/barrier.rs
new file mode 100644
index 000000000000..8f2d435fcd94
--- /dev/null
+++ b/rust/kernel/sync/barrier.rs
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Memory barriers.
+//!
+//! These primitives have the same semantics as their C counterparts; the precise definitions
+//! of the semantics can be found at [`LKMM`].
+//!
+//! [`LKMM`]: srctree/tools/memory-model/
+
+/// A compiler barrier.
+///
+/// A barrier that prevents the compiler from reordering memory accesses across the barrier.
+#[inline(always)]
+pub(crate) fn barrier() {
+ // By default, Rust inline asms are treated as being able to access any memory or flags, hence
+ // it suffices as a compiler barrier.
+ //
+ // SAFETY: An empty asm block.
+ unsafe { core::arch::asm!("") };
+}
+
+/// A full memory barrier.
+///
+/// A barrier that prevents compiler and CPU from reordering memory accesses across the barrier.
+#[inline(always)]
+pub fn smp_mb() {
+ if cfg!(CONFIG_SMP) {
+ // SAFETY: `smp_mb()` is safe to call.
+ unsafe { bindings::smp_mb() };
+ } else {
+ barrier();
+ }
+}
+
+/// A write-write memory barrier.
+///
+/// A barrier that prevents compiler and CPU from reordering memory write accesses across the
+/// barrier.
+#[inline(always)]
+pub fn smp_wmb() {
+ if cfg!(CONFIG_SMP) {
+ // SAFETY: `smp_wmb()` is safe to call.
+ unsafe { bindings::smp_wmb() };
+ } else {
+ barrier();
+ }
+}
+
+/// A read-read memory barrier.
+///
+/// A barrier that prevents compiler and CPU from reordering memory read accesses across the
+/// barrier.
+#[inline(always)]
+pub fn smp_rmb() {
+ if cfg!(CONFIG_SMP) {
+ // SAFETY: `smp_rmb()` is safe to call.
+ unsafe { bindings::smp_rmb() };
+ } else {
+ barrier();
+ }
+}
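A small sketch of the classic pairing these barriers are used for, with `Relaxed` atomics from this series standing in for the plain C accesses; `publish()` and `consume()` would run on different CPUs, and the names are illustrative.

```rust
use kernel::sync::atomic::{Atomic, Relaxed};
use kernel::sync::barrier::{smp_rmb, smp_wmb};

static DATA: Atomic<i32> = Atomic::new(0);
static FLAG: Atomic<i32> = Atomic::new(0);

fn publish() {
    DATA.store(42, Relaxed);
    // Write-write barrier: the `DATA` store cannot be reordered after the
    // `FLAG` store.
    smp_wmb();
    FLAG.store(1, Relaxed);
}

fn consume() {
    if FLAG.load(Relaxed) == 1 {
        // Read-read barrier: the `FLAG` load cannot be reordered after the
        // `DATA` load below.
        smp_rmb();
        assert_eq!(42, DATA.load(Relaxed));
    }
}
```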
diff --git a/rust/kernel/sync/refcount.rs b/rust/kernel/sync/refcount.rs
new file mode 100644
index 000000000000..19236a5bccde
--- /dev/null
+++ b/rust/kernel/sync/refcount.rs
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Atomic reference counting.
+//!
+//! C header: [`include/linux/refcount.h`](srctree/include/linux/refcount.h)
+
+use crate::build_assert;
+use crate::sync::atomic::Atomic;
+use crate::types::Opaque;
+
+/// Atomic reference counter.
+///
+/// This type is conceptually an atomic integer, but provides saturation semantics compared to
+/// normal atomic integers. Values in the negative range when viewed as a signed integer are
+/// saturation (bad) values. For details about the saturation semantics, please refer to top of
+/// [`include/linux/refcount.h`](srctree/include/linux/refcount.h).
+///
+/// Wraps the kernel's C `refcount_t`.
+#[repr(transparent)]
+pub struct Refcount(Opaque<bindings::refcount_t>);
+
+impl Refcount {
+ /// Construct a new [`Refcount`] from an initial value.
+ ///
+ /// The initial value should be non-saturated.
+ #[inline]
+ pub fn new(value: i32) -> Self {
+ build_assert!(value >= 0, "initial value saturated");
+ // SAFETY: There are no safety requirements for this FFI call.
+ Self(Opaque::new(unsafe { bindings::REFCOUNT_INIT(value) }))
+ }
+
+ #[inline]
+ fn as_ptr(&self) -> *mut bindings::refcount_t {
+ self.0.get()
+ }
+
+ /// Get the underlying atomic counter that backs the refcount.
+ ///
+ /// NOTE: Usage of this function is discouraged as it can circumvent the protections offered by
+ /// `refcount.h`. If there is no way to achieve the result using APIs in `refcount.h`, then
+ /// this function can be used. Otherwise consider adding a binding for the required API.
+ #[inline]
+ pub fn as_atomic(&self) -> &Atomic<i32> {
+ let ptr = self.0.get().cast();
+ // SAFETY: `refcount_t` is a transparent wrapper of `atomic_t`, which is an atomic 32-bit
+ // integer that is layout-wise compatible with `Atomic<i32>`. All values are valid for
+ // `refcount_t`, despite some of the values being considered saturated and "bad".
+ unsafe { &*ptr }
+ }
+
+ /// Set a refcount's value.
+ #[inline]
+ pub fn set(&self, value: i32) {
+ // SAFETY: `self.as_ptr()` is valid.
+ unsafe { bindings::refcount_set(self.as_ptr(), value) }
+ }
+
+ /// Increment a refcount.
+ ///
+ /// It will saturate and `WARN` on overflow. It will also `WARN` if the refcount is 0, as this
+ /// represents a possible use-after-free condition.
+ ///
+ /// Provides no memory ordering, it is assumed that caller already has a reference on the
+ /// object.
+ #[inline]
+ pub fn inc(&self) {
+ // SAFETY: `self.as_ptr()` is valid.
+ unsafe { bindings::refcount_inc(self.as_ptr()) }
+ }
+
+ /// Decrement a refcount.
+ ///
+ /// It will `WARN` on underflow and fail to decrement when saturated.
+ ///
+ /// Provides release memory ordering, such that prior loads and stores are done
+ /// before.
+ #[inline]
+ pub fn dec(&self) {
+ // SAFETY: `self.as_ptr()` is valid.
+ unsafe { bindings::refcount_dec(self.as_ptr()) }
+ }
+
+ /// Decrement a refcount and test if it is 0.
+ ///
+ /// It will `WARN` on underflow and fail to decrement when saturated.
+ ///
+ /// Provides release memory ordering, such that prior loads and stores are done
+ /// before, and provides an acquire ordering on success such that memory deallocation
+ /// must come after.
+ ///
+ /// Returns true if the resulting refcount is 0, false otherwise.
+ ///
+ /// # Notes
+ ///
+ /// A common pattern of using `Refcount` is to free memory when the reference count reaches
+ /// zero. This means that the reference to `Refcount` could become invalid after calling this
+ /// function. This is fine as long as the reference to `Refcount` is no longer used when this
+ /// function returns `false`. It is not necessary to use raw pointers in this scenario, see
+ /// <https://github.com/rust-lang/rust/issues/55005>.
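+ ///
+ /// A hedged sketch of that free-on-zero pattern (`MyObject` and `release_my_object` are
+ /// illustrative names, not part of this API):
+ ///
+ /// ```ignore
+ /// fn put(obj: &MyObject) {
+ ///     if obj.refcount.dec_and_test() {
+ ///         // Last reference gone: this caller is responsible for freeing the object;
+ ///         // once it is released, `obj` (and `obj.refcount`) must not be used again.
+ ///         release_my_object(obj);
+ ///     }
+ /// }
+ /// ```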
+ #[inline]
+ #[must_use = "use `dec` instead if you do not need to test if it is 0"]
+ pub fn dec_and_test(&self) -> bool {
+ // SAFETY: `self.as_ptr()` is valid.
+ unsafe { bindings::refcount_dec_and_test(self.as_ptr()) }
+ }
+}
+
+// SAFETY: `refcount_t` is thread-safe.
+unsafe impl Send for Refcount {}
+
+// SAFETY: `refcount_t` is thread-safe.
+unsafe impl Sync for Refcount {}
diff --git a/samples/damon/mtier.c b/samples/damon/mtier.c
index 7ebd352138e4..beaf36657dea 100644
--- a/samples/damon/mtier.c
+++ b/samples/damon/mtier.c
@@ -208,6 +208,9 @@ static int damon_sample_mtier_enable_store(
if (enabled == is_enabled)
return 0;
+ if (!init_called)
+ return 0;
+
if (enabled) {
err = damon_sample_mtier_start();
if (err)
diff --git a/samples/damon/prcl.c b/samples/damon/prcl.c
index 1b839c06a612..0226652f94d5 100644
--- a/samples/damon/prcl.c
+++ b/samples/damon/prcl.c
@@ -137,6 +137,9 @@ static int damon_sample_prcl_enable_store(
if (enabled == is_enabled)
return 0;
+ if (!init_called)
+ return 0;
+
if (enabled) {
err = damon_sample_prcl_start();
if (err)
diff --git a/samples/damon/wsse.c b/samples/damon/wsse.c
index da052023b099..21eaf15f987d 100644
--- a/samples/damon/wsse.c
+++ b/samples/damon/wsse.c
@@ -118,6 +118,9 @@ static int damon_sample_wsse_enable_store(
return 0;
if (enabled) {
+ if (!init_called)
+ return 0;
+
err = damon_sample_wsse_start();
if (err)
enabled = false;
diff --git a/samples/ftrace/ftrace-direct-modify.c b/samples/ftrace/ftrace-direct-modify.c
index cfea7a38befb..da3a9f2091f5 100644
--- a/samples/ftrace/ftrace-direct-modify.c
+++ b/samples/ftrace/ftrace-direct-modify.c
@@ -75,8 +75,8 @@ asm (
CALL_DEPTH_ACCOUNT
" call my_direct_func1\n"
" leave\n"
-" .size my_tramp1, .-my_tramp1\n"
ASM_RET
+" .size my_tramp1, .-my_tramp1\n"
" .type my_tramp2, @function\n"
" .globl my_tramp2\n"
diff --git a/scripts/Makefile.kasan b/scripts/Makefile.kasan
index 693dbbebebba..0ba2aac3b8dc 100644
--- a/scripts/Makefile.kasan
+++ b/scripts/Makefile.kasan
@@ -86,10 +86,14 @@ kasan_params += hwasan-instrument-stack=$(stack_enable) \
hwasan-use-short-granules=0 \
hwasan-inline-all-checks=0
-# Instrument memcpy/memset/memmove calls by using instrumented __hwasan_mem*().
-ifeq ($(call clang-min-version, 150000)$(call gcc-min-version, 130000),y)
- kasan_params += hwasan-kernel-mem-intrinsic-prefix=1
-endif
+# Instrument memcpy/memset/memmove calls by using instrumented __(hw)asan_mem*().
+ifdef CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX
+ ifdef CONFIG_CC_IS_GCC
+ kasan_params += asan-kernel-mem-intrinsic-prefix=1
+ else
+ kasan_params += hwasan-kernel-mem-intrinsic-prefix=1
+ endif
+endif # CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX
endif # CONFIG_KASAN_SW_TAGS
diff --git a/scripts/atomic/gen-atomics.sh b/scripts/atomic/gen-atomics.sh
index 5b98a8307693..02508d0d6fe4 100755
--- a/scripts/atomic/gen-atomics.sh
+++ b/scripts/atomic/gen-atomics.sh
@@ -11,6 +11,7 @@ cat <<EOF |
gen-atomic-instrumented.sh linux/atomic/atomic-instrumented.h
gen-atomic-long.sh linux/atomic/atomic-long.h
gen-atomic-fallback.sh linux/atomic/atomic-arch-fallback.h
+gen-rust-atomic-helpers.sh ../rust/helpers/atomic.c
EOF
while read script header args; do
/bin/sh ${ATOMICDIR}/${script} ${ATOMICTBL} ${args} > ${LINUXDIR}/include/${header}
diff --git a/scripts/atomic/gen-rust-atomic-helpers.sh b/scripts/atomic/gen-rust-atomic-helpers.sh
new file mode 100755
index 000000000000..45b1e100ed7c
--- /dev/null
+++ b/scripts/atomic/gen-rust-atomic-helpers.sh
@@ -0,0 +1,67 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+ATOMICDIR=$(dirname $0)
+
+. ${ATOMICDIR}/atomic-tbl.sh
+
+#gen_proto_order_variant(meta, pfx, name, sfx, order, atomic, int, arg...)
+gen_proto_order_variant()
+{
+ local meta="$1"; shift
+ local pfx="$1"; shift
+ local name="$1"; shift
+ local sfx="$1"; shift
+ local order="$1"; shift
+ local atomic="$1"; shift
+ local int="$1"; shift
+
+ local atomicname="${atomic}_${pfx}${name}${sfx}${order}"
+
+ local ret="$(gen_ret_type "${meta}" "${int}")"
+ local params="$(gen_params "${int}" "${atomic}" "$@")"
+ local args="$(gen_args "$@")"
+ local retstmt="$(gen_ret_stmt "${meta}")"
+
+cat <<EOF
+__rust_helper ${ret}
+rust_helper_${atomicname}(${params})
+{
+ ${retstmt}${atomicname}(${args});
+}
+
+EOF
+}
+
+cat << EOF
+// SPDX-License-Identifier: GPL-2.0
+
+// Generated by $0
+// DO NOT MODIFY THIS FILE DIRECTLY
+
+/*
+ * This file provides helpers for the various atomic functions for Rust.
+ */
+#ifndef _RUST_ATOMIC_API_H
+#define _RUST_ATOMIC_API_H
+
+#include <linux/atomic.h>
+
+// TODO: Remove this after INLINE_HELPERS support is added.
+#ifndef __rust_helper
+#define __rust_helper
+#endif
+
+EOF
+
+grep '^[a-z]' "$1" | while read name meta args; do
+ gen_proto "${meta}" "${name}" "atomic" "int" ${args}
+done
+
+grep '^[a-z]' "$1" | while read name meta args; do
+ gen_proto "${meta}" "${name}" "atomic64" "s64" ${args}
+done
+
+cat <<EOF
+#endif /* _RUST_ATOMIC_API_H */
+EOF
diff --git a/scripts/crypto/gen-hash-testvecs.py b/scripts/crypto/gen-hash-testvecs.py
index 4ac927d40cf5..fc063f2ee95f 100755
--- a/scripts/crypto/gen-hash-testvecs.py
+++ b/scripts/crypto/gen-hash-testvecs.py
@@ -84,11 +84,16 @@ def print_c_struct_u8_array_field(name, value):
print_bytes('\t\t\t', value, 8)
print('\t\t},')
+def alg_digest_size_const(alg):
+ if alg == 'blake2s':
+ return 'BLAKE2S_HASH_SIZE'
+ return f'{alg.upper()}_DIGEST_SIZE'
+
def gen_unkeyed_testvecs(alg):
print('')
print('static const struct {')
print('\tsize_t data_len;')
- print(f'\tu8 digest[{alg.upper()}_DIGEST_SIZE];')
+ print(f'\tu8 digest[{alg_digest_size_const(alg)}];')
print('} hash_testvecs[] = {')
for data_len in DATA_LENS:
data = rand_bytes(data_len)
@@ -103,7 +108,7 @@ def gen_unkeyed_testvecs(alg):
for data_len in range(len(data) + 1):
hash_update(ctx, compute_hash(alg, data[:data_len]))
print_static_u8_array_definition(
- f'hash_testvec_consolidated[{alg.upper()}_DIGEST_SIZE]',
+ f'hash_testvec_consolidated[{alg_digest_size_const(alg)}]',
hash_final(ctx))
def gen_hmac_testvecs(alg):
@@ -119,6 +124,20 @@ def gen_hmac_testvecs(alg):
f'hmac_testvec_consolidated[{alg.upper()}_DIGEST_SIZE]',
ctx.digest())
+BLAKE2S_KEY_SIZE = 32
+BLAKE2S_HASH_SIZE = 32
+
+def gen_additional_blake2s_testvecs():
+ hashes = b''
+ for key_len in range(BLAKE2S_KEY_SIZE + 1):
+ for out_len in range(1, BLAKE2S_HASH_SIZE + 1):
+ h = hashlib.blake2s(digest_size=out_len, key=rand_bytes(key_len))
+ h.update(rand_bytes(100))
+ hashes += h.digest()
+ print_static_u8_array_definition(
+ 'blake2s_keyed_testvec_consolidated[BLAKE2S_HASH_SIZE]',
+ compute_hash('blake2s', hashes))
+
def gen_additional_poly1305_testvecs():
key = b'\xff' * POLY1305_KEY_SIZE
data = b''
@@ -141,7 +160,9 @@ alg = sys.argv[1]
print('/* SPDX-License-Identifier: GPL-2.0-or-later */')
print(f'/* This file was generated by: {sys.argv[0]} {" ".join(sys.argv[1:])} */')
gen_unkeyed_testvecs(alg)
-if alg == 'poly1305':
+if alg == 'blake2s':
+ gen_additional_blake2s_testvecs()
+elif alg == 'poly1305':
gen_additional_poly1305_testvecs()
else:
gen_hmac_testvecs(alg)
diff --git a/scripts/gcc-plugins/gcc-common.h b/scripts/gcc-plugins/gcc-common.h
index 6cb6d1051815..8f1b3500f8e2 100644
--- a/scripts/gcc-plugins/gcc-common.h
+++ b/scripts/gcc-plugins/gcc-common.h
@@ -173,10 +173,17 @@ static inline opt_pass *get_pass_for_id(int id)
return g->get_passes()->get_pass_for_id(id);
}
+#if BUILDING_GCC_VERSION < 16000
#define TODO_verify_ssa TODO_verify_il
#define TODO_verify_flow TODO_verify_il
#define TODO_verify_stmts TODO_verify_il
#define TODO_verify_rtl_sharing TODO_verify_il
+#else
+#define TODO_verify_ssa 0
+#define TODO_verify_flow 0
+#define TODO_verify_stmts 0
+#define TODO_verify_rtl_sharing 0
+#endif
#define INSN_DELETED_P(insn) (insn)->deleted()
diff --git a/scripts/generate_rust_target.rs b/scripts/generate_rust_target.rs
index 39c82908ff3a..38b3416bb979 100644
--- a/scripts/generate_rust_target.rs
+++ b/scripts/generate_rust_target.rs
@@ -225,7 +225,11 @@ fn main() {
ts.push("features", features);
ts.push("llvm-target", "x86_64-linux-gnu");
ts.push("supported-sanitizers", ["kcfi", "kernel-address"]);
- ts.push("target-pointer-width", "64");
+ if cfg.rustc_version_atleast(1, 91, 0) {
+ ts.push("target-pointer-width", 64);
+ } else {
+ ts.push("target-pointer-width", "64");
+ }
} else if cfg.has("X86_32") {
// This only works on UML, as i386 otherwise needs regparm support in rustc
if !cfg.has("UML") {
@@ -245,7 +249,11 @@ fn main() {
}
ts.push("features", features);
ts.push("llvm-target", "i386-unknown-linux-gnu");
- ts.push("target-pointer-width", "32");
+ if cfg.rustc_version_atleast(1, 91, 0) {
+ ts.push("target-pointer-width", 32);
+ } else {
+ ts.push("target-pointer-width", "32");
+ }
} else if cfg.has("LOONGARCH") {
panic!("loongarch uses the builtin rustc loongarch64-unknown-none-softfloat target");
} else {
diff --git a/scripts/kconfig/expr.h b/scripts/kconfig/expr.h
index fe2231e0e6a4..5f900d18dae0 100644
--- a/scripts/kconfig/expr.h
+++ b/scripts/kconfig/expr.h
@@ -145,6 +145,7 @@ struct symbol {
#define SYMBOL_CONST 0x0001 /* symbol is const */
#define SYMBOL_CHECK 0x0008 /* used during dependency checking */
#define SYMBOL_VALID 0x0080 /* set when symbol.curr is calculated */
+#define SYMBOL_TRANS 0x0100 /* symbol is transitional only (not visible) */
#define SYMBOL_WRITE 0x0200 /* write symbol to file (KCONFIG_CONFIG) */
#define SYMBOL_WRITTEN 0x0800 /* track info to avoid double-write to .config */
#define SYMBOL_CHECKED 0x2000 /* used during dependency checking */
diff --git a/scripts/kconfig/lexer.l b/scripts/kconfig/lexer.l
index 9c2cdfc33c6f..6d2c92c6095d 100644
--- a/scripts/kconfig/lexer.l
+++ b/scripts/kconfig/lexer.l
@@ -126,6 +126,7 @@ n [A-Za-z0-9_-]
"select" return T_SELECT;
"source" return T_SOURCE;
"string" return T_STRING;
+"transitional" return T_TRANSITIONAL;
"tristate" return T_TRISTATE;
"visible" return T_VISIBLE;
"||" return T_OR;
diff --git a/scripts/kconfig/parser.y b/scripts/kconfig/parser.y
index e9c3c664e925..49b79dde1725 100644
--- a/scripts/kconfig/parser.y
+++ b/scripts/kconfig/parser.y
@@ -75,6 +75,7 @@ struct menu *current_menu, *current_entry, *current_choice;
%token T_SELECT
%token T_SOURCE
%token T_STRING
+%token T_TRANSITIONAL
%token T_TRISTATE
%token T_VISIBLE
%token T_EOL
@@ -205,6 +206,12 @@ config_option: T_PROMPT T_WORD_QUOTE if_expr T_EOL
printd(DEBUG_PARSE, "%s:%d:prompt\n", cur_filename, cur_lineno);
};
+config_option: T_TRANSITIONAL T_EOL
+{
+ current_entry->sym->flags |= SYMBOL_TRANS;
+ printd(DEBUG_PARSE, "%s:%d:transitional\n", cur_filename, cur_lineno);
+};
+
config_option: default expr if_expr T_EOL
{
menu_add_expr(P_DEFAULT, $2, $3);
@@ -483,6 +490,43 @@ assign_val:
%%
/**
+ * transitional_check_sanity - check transitional symbols have no other
+ * properties
+ *
+ * @menu: menu of the potentially transitional symbol
+ *
+ * Return: -1 if an error is found, 0 otherwise.
+ */
+static int transitional_check_sanity(const struct menu *menu)
+{
+ struct property *prop;
+
+ if (!menu->sym || !(menu->sym->flags & SYMBOL_TRANS))
+ return 0;
+
+ /* Check for depends and visible conditions. */
+ if ((menu->dep && !expr_is_yes(menu->dep)) ||
+ (menu->visibility && !expr_is_yes(menu->visibility))) {
+ fprintf(stderr, "%s:%d: error: %s",
+ menu->filename, menu->lineno,
+ "transitional symbols can only have help sections\n");
+ return -1;
+ }
+
+ /* Check for any property other than "help". */
+ for (prop = menu->sym->prop; prop; prop = prop->next) {
+ if (prop->type != P_COMMENT) {
+ fprintf(stderr, "%s:%d: error: %s",
+ prop->filename, prop->lineno,
+ "transitional symbols can only have help sections\n");
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+/**
* choice_check_sanity - check sanity of a choice member
*
* @menu: menu of the choice member
@@ -558,6 +602,9 @@ void conf_parse(const char *name)
if (menu->sym && sym_check_deps(menu->sym))
yynerrs++;
+ if (transitional_check_sanity(menu))
+ yynerrs++;
+
if (menu->sym && sym_is_choice(menu->sym)) {
menu_for_each_sub_entry(child, menu)
if (child->sym && choice_check_sanity(child))
diff --git a/scripts/kconfig/symbol.c b/scripts/kconfig/symbol.c
index 26ab10c0fd76..760cac998381 100644
--- a/scripts/kconfig/symbol.c
+++ b/scripts/kconfig/symbol.c
@@ -214,6 +214,11 @@ static void sym_calc_visibility(struct symbol *sym)
struct property *prop;
tristate tri;
+ if (sym->flags & SYMBOL_TRANS) {
+ sym->visible = yes;
+ return;
+ }
+
/* any prompt visible? */
tri = no;
for_all_prompts(sym, prop) {
@@ -526,7 +531,7 @@ void sym_calc_value(struct symbol *sym)
}
}
- if (sym_is_choice(sym))
+ if (sym_is_choice(sym) || sym->flags & SYMBOL_TRANS)
sym->flags &= ~SYMBOL_WRITE;
}
diff --git a/scripts/kconfig/tests/conftest.py b/scripts/kconfig/tests/conftest.py
index 2a2a7e2da060..d94b79e012c0 100644
--- a/scripts/kconfig/tests/conftest.py
+++ b/scripts/kconfig/tests/conftest.py
@@ -81,7 +81,22 @@ class Conf:
# For interactive modes such as oldaskconfig, oldconfig,
# send 'Enter' key until the program finishes.
if interactive:
- ps.stdin.write(b'\n')
+ try:
+ ps.stdin.write(b'\n')
+ ps.stdin.flush()
+ except (BrokenPipeError, OSError):
+ # Process has exited, stop sending input
+ break
+
+ # Close stdin gracefully
+ try:
+ ps.stdin.close()
+ except (BrokenPipeError, OSError):
+ # Ignore broken pipe on close
+ pass
+
+ # Wait for process to complete
+ ps.wait()
self.retcode = ps.returncode
self.stdout = ps.stdout.read().decode()
diff --git a/scripts/kconfig/tests/err_transitional/Kconfig b/scripts/kconfig/tests/err_transitional/Kconfig
new file mode 100644
index 000000000000..a75ed3b2fe5e
--- /dev/null
+++ b/scripts/kconfig/tests/err_transitional/Kconfig
@@ -0,0 +1,52 @@
+# SPDX-License-Identifier: GPL-2.0
+# Test that transitional symbols cannot have properties other than help
+
+config BAD_DEFAULT
+ bool
+ transitional
+ default y
+ help
+ This transitional symbol illegally has a default property.
+
+config BAD_PROMPT
+ bool
+ transitional
+ prompt "Bad prompt"
+ help
+ This transitional symbol illegally has a prompt.
+
+config BAD_SELECT
+ bool
+ transitional
+ select OTHER_SYMBOL
+ help
+ This transitional symbol illegally has a select.
+
+config BAD_IMPLY
+ bool
+ transitional
+ imply OTHER_SYMBOL
+ help
+ This transitional symbol illegally has an imply.
+
+config BAD_DEPENDS
+ bool
+ transitional
+ depends on OTHER_SYMBOL
+ help
+ This transitional symbol illegally has a depends.
+
+config BAD_RANGE
+ int
+ transitional
+ range 1 10
+ help
+ This transitional symbol illegally has a range.
+
+config BAD_NO_TYPE
+ transitional
+ help
+ This transitional symbol illegally has no type specified.
+
+config OTHER_SYMBOL
+ bool
diff --git a/scripts/kconfig/tests/err_transitional/__init__.py b/scripts/kconfig/tests/err_transitional/__init__.py
new file mode 100644
index 000000000000..7dffb5b0833f
--- /dev/null
+++ b/scripts/kconfig/tests/err_transitional/__init__.py
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0
+"""
+Test that transitional symbols with invalid properties are rejected.
+
+Transitional symbols can only have help sections. Any other properties
+(default, select, depends, etc.) should cause a parser error.
+"""
+
+def test(conf):
+ # This should fail with exit code 1 due to invalid transitional symbol
+ assert conf.olddefconfig() == 1
+
+ # Check that the error message is about transitional symbols
+ assert conf.stderr_contains('expected_stderr')
diff --git a/scripts/kconfig/tests/err_transitional/expected_stderr b/scripts/kconfig/tests/err_transitional/expected_stderr
new file mode 100644
index 000000000000..b52db4f680f4
--- /dev/null
+++ b/scripts/kconfig/tests/err_transitional/expected_stderr
@@ -0,0 +1,7 @@
+Kconfig:46:warning: config symbol defined without type
+Kconfig:7: error: transitional symbols can only have help sections
+Kconfig:14: error: transitional symbols can only have help sections
+Kconfig:21: error: transitional symbols can only have help sections
+Kconfig:28: error: transitional symbols can only have help sections
+Kconfig:32: error: transitional symbols can only have help sections
+Kconfig:42: error: transitional symbols can only have help sections
diff --git a/scripts/kconfig/tests/transitional/Kconfig b/scripts/kconfig/tests/transitional/Kconfig
new file mode 100644
index 000000000000..62c3b24665b9
--- /dev/null
+++ b/scripts/kconfig/tests/transitional/Kconfig
@@ -0,0 +1,100 @@
+# SPDX-License-Identifier: GPL-2.0
+# Test transitional symbols for config migration with all Kconfig types
+
+# Enable module support for tristate testing
+config MODULES
+ bool "Enable loadable module support"
+ modules
+ default y
+
+# Basic migration tests for all types
+config NEW_BOOL
+ bool "New bool option"
+ default OLD_BOOL
+
+config OLD_BOOL
+ bool
+ transitional
+
+config NEW_TRISTATE
+ tristate "New tristate option"
+ default OLD_TRISTATE
+
+config OLD_TRISTATE
+ tristate
+ transitional
+
+config NEW_STRING
+ string "New string option"
+ default OLD_STRING
+
+config OLD_STRING
+ string
+ transitional
+
+config NEW_HEX
+ hex "New hex option"
+ default OLD_HEX
+
+config OLD_HEX
+ hex
+ transitional
+
+config NEW_INT
+ int "New int option"
+ default OLD_INT
+
+config OLD_INT
+ int
+ transitional
+
+# Precedence tests for all types
+config NEW_BOOL_PRECEDENCE
+ bool "New bool option with precedence"
+ default OLD_BOOL_PRECEDENCE
+
+config OLD_BOOL_PRECEDENCE
+ bool
+ transitional
+
+config NEW_STRING_PRECEDENCE
+ string "New string option with precedence"
+ default OLD_STRING_PRECEDENCE
+
+config OLD_STRING_PRECEDENCE
+ string
+ transitional
+
+config NEW_TRISTATE_PRECEDENCE
+ tristate "New tristate option with precedence"
+ default OLD_TRISTATE_PRECEDENCE
+
+config OLD_TRISTATE_PRECEDENCE
+ tristate
+ transitional
+
+config NEW_HEX_PRECEDENCE
+ hex "New hex option with precedence"
+ default OLD_HEX_PRECEDENCE
+
+config OLD_HEX_PRECEDENCE
+ hex
+ transitional
+
+config NEW_INT_PRECEDENCE
+ int "New int option with precedence"
+ default OLD_INT_PRECEDENCE
+
+config OLD_INT_PRECEDENCE
+ int
+ transitional
+
+# Test that help sections are allowed for transitional symbols
+config OLD_WITH_HELP
+ bool
+ transitional
+ help
+ This transitional symbol has a help section to validate that help is allowed.
+
+config REGULAR_OPTION
+ bool "Regular option"
diff --git a/scripts/kconfig/tests/transitional/__init__.py b/scripts/kconfig/tests/transitional/__init__.py
new file mode 100644
index 000000000000..61937d10edf1
--- /dev/null
+++ b/scripts/kconfig/tests/transitional/__init__.py
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: GPL-2.0
+"""
+Test transitional symbol migration functionality for all Kconfig types.
+
+This tests that:
+- OLD_* options in existing .config cause NEW_* options to be set
+- OLD_* options are not written to the new .config file
+- NEW_* options appear in the new .config file with correct values
+- All Kconfig types work correctly: bool, tristate, string, hex, int
+- User-set NEW values take precedence over conflicting OLD transitional values
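+
+A hedged sketch of the migration idiom these tests exercise (symbol names are
+illustrative):
+
+    config NEW_FOO
+            bool "New foo option"
+            default OLD_FOO
+
+    config OLD_FOO
+            bool
+            transitional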
+"""
+
+def test(conf):
+ # Run olddefconfig to process the migration with the initial config
+ assert conf.olddefconfig(dot_config='initial_config') == 0
+
+ # Check that the configuration matches expected output
+ assert conf.config_contains('expected_config')
diff --git a/scripts/kconfig/tests/transitional/expected_config b/scripts/kconfig/tests/transitional/expected_config
new file mode 100644
index 000000000000..846e9ddcab91
--- /dev/null
+++ b/scripts/kconfig/tests/transitional/expected_config
@@ -0,0 +1,12 @@
+CONFIG_MODULES=y
+CONFIG_NEW_BOOL=y
+CONFIG_NEW_TRISTATE=m
+CONFIG_NEW_STRING="test string"
+CONFIG_NEW_HEX=0x1234
+CONFIG_NEW_INT=42
+# CONFIG_NEW_BOOL_PRECEDENCE is not set
+CONFIG_NEW_STRING_PRECEDENCE="user value"
+CONFIG_NEW_TRISTATE_PRECEDENCE=y
+CONFIG_NEW_HEX_PRECEDENCE=0xABCD
+CONFIG_NEW_INT_PRECEDENCE=100
+# CONFIG_REGULAR_OPTION is not set
diff --git a/scripts/kconfig/tests/transitional/initial_config b/scripts/kconfig/tests/transitional/initial_config
new file mode 100644
index 000000000000..e648a65e504c
--- /dev/null
+++ b/scripts/kconfig/tests/transitional/initial_config
@@ -0,0 +1,16 @@
+CONFIG_MODULES=y
+CONFIG_OLD_BOOL=y
+CONFIG_OLD_TRISTATE=m
+CONFIG_OLD_STRING="test string"
+CONFIG_OLD_HEX=0x1234
+CONFIG_OLD_INT=42
+# CONFIG_NEW_BOOL_PRECEDENCE is not set
+CONFIG_OLD_BOOL_PRECEDENCE=y
+CONFIG_NEW_STRING_PRECEDENCE="user value"
+CONFIG_OLD_STRING_PRECEDENCE="old value"
+CONFIG_NEW_TRISTATE_PRECEDENCE=y
+CONFIG_OLD_TRISTATE_PRECEDENCE=m
+CONFIG_NEW_HEX_PRECEDENCE=0xABCD
+CONFIG_OLD_HEX_PRECEDENCE=0x5678
+CONFIG_NEW_INT_PRECEDENCE=100
+CONFIG_OLD_INT_PRECEDENCE=200
diff --git a/security/Kconfig b/security/Kconfig
index 4816fc74f81e..285f284dfcac 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -269,6 +269,7 @@ endchoice
config LSM
string "Ordered list of enabled LSMs"
+ depends on SECURITY
default "landlock,lockdown,yama,loadpin,safesetid,smack,selinux,tomoyo,apparmor,ipe,bpf" if DEFAULT_SECURITY_SMACK
default "landlock,lockdown,yama,loadpin,safesetid,apparmor,selinux,smack,tomoyo,ipe,bpf" if DEFAULT_SECURITY_APPARMOR
default "landlock,lockdown,yama,loadpin,safesetid,tomoyo,ipe,bpf" if DEFAULT_SECURITY_TOMOYO
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index 8e1cc229b41b..b3f7a3258a2c 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -112,7 +112,7 @@ static void apparmor_task_free(struct task_struct *task)
}
static int apparmor_task_alloc(struct task_struct *task,
- unsigned long clone_flags)
+ u64 clone_flags)
{
struct aa_task_ctx *new = task_ctx(task);
@@ -2530,6 +2530,9 @@ static int __init apparmor_init(void)
security_add_hooks(apparmor_hooks, ARRAY_SIZE(apparmor_hooks),
&apparmor_lsmid);
+ /* Inform the audit system that secctx is used */
+ audit_cfg_lsm(&apparmor_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT);
+
/* Report that AppArmor successfully initialized */
apparmor_initialized = 1;
if (aa_g_profile_mode == APPARMOR_COMPLAIN)
diff --git a/security/landlock/fs.c b/security/landlock/fs.c
index c04f8879ad03..0bade2c5aa1d 100644
--- a/security/landlock/fs.c
+++ b/security/landlock/fs.c
@@ -1281,7 +1281,7 @@ static void hook_sb_delete(struct super_block *const sb)
struct landlock_object *object;
/* Only handles referenced inodes. */
- if (!atomic_read(&inode->i_count))
+ if (!icount_read(inode))
continue;
/*
diff --git a/security/min_addr.c b/security/min_addr.c
index df1bc643d886..c55bb84b8632 100644
--- a/security/min_addr.c
+++ b/security/min_addr.c
@@ -3,6 +3,7 @@
#include <linux/mm.h>
#include <linux/security.h>
#include <linux/sysctl.h>
+#include <linux/minmax.h>
/* amount of vm to protect from userspace access by both DAC and the LSM*/
unsigned long mmap_min_addr;
@@ -16,10 +17,7 @@ unsigned long dac_mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
static void update_mmap_min_addr(void)
{
#ifdef CONFIG_LSM_MMAP_MIN_ADDR
- if (dac_mmap_min_addr > CONFIG_LSM_MMAP_MIN_ADDR)
- mmap_min_addr = dac_mmap_min_addr;
- else
- mmap_min_addr = CONFIG_LSM_MMAP_MIN_ADDR;
+ mmap_min_addr = umax(dac_mmap_min_addr, CONFIG_LSM_MMAP_MIN_ADDR);
#else
mmap_min_addr = dac_mmap_min_addr;
#endif
diff --git a/security/security.c b/security/security.c
index ad163f06bf7a..301104d63fde 100644
--- a/security/security.c
+++ b/security/security.c
@@ -283,6 +283,9 @@ static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed)
lsm_set_blob_size(&needed->lbs_xattr_count,
&blob_sizes.lbs_xattr_count);
lsm_set_blob_size(&needed->lbs_bdev, &blob_sizes.lbs_bdev);
+ lsm_set_blob_size(&needed->lbs_bpf_map, &blob_sizes.lbs_bpf_map);
+ lsm_set_blob_size(&needed->lbs_bpf_prog, &blob_sizes.lbs_bpf_prog);
+ lsm_set_blob_size(&needed->lbs_bpf_token, &blob_sizes.lbs_bpf_token);
}
/* Prepare LSM for initialization. */
@@ -480,6 +483,9 @@ static void __init ordered_lsm_init(void)
init_debug("tun device blob size = %d\n", blob_sizes.lbs_tun_dev);
init_debug("xattr slots = %d\n", blob_sizes.lbs_xattr_count);
init_debug("bdev blob size = %d\n", blob_sizes.lbs_bdev);
+ init_debug("bpf map blob size = %d\n", blob_sizes.lbs_bpf_map);
+ init_debug("bpf prog blob size = %d\n", blob_sizes.lbs_bpf_prog);
+ init_debug("bpf token blob size = %d\n", blob_sizes.lbs_bpf_token);
/*
* Create any kmem_caches needed for blobs
@@ -823,17 +829,50 @@ static int lsm_msg_msg_alloc(struct msg_msg *mp)
*/
static int lsm_bdev_alloc(struct block_device *bdev)
{
- if (blob_sizes.lbs_bdev == 0) {
- bdev->bd_security = NULL;
- return 0;
- }
+ return lsm_blob_alloc(&bdev->bd_security, blob_sizes.lbs_bdev,
+ GFP_KERNEL);
+}
- bdev->bd_security = kzalloc(blob_sizes.lbs_bdev, GFP_KERNEL);
- if (!bdev->bd_security)
- return -ENOMEM;
+#ifdef CONFIG_BPF_SYSCALL
+/**
+ * lsm_bpf_map_alloc - allocate a composite bpf_map blob
+ * @map: the bpf_map that needs a blob
+ *
+ * Allocate the bpf_map blob for all the modules
+ *
+ * Returns 0, or -ENOMEM if memory can't be allocated.
+ */
+static int lsm_bpf_map_alloc(struct bpf_map *map)
+{
+ return lsm_blob_alloc(&map->security, blob_sizes.lbs_bpf_map, GFP_KERNEL);
+}
- return 0;
+/**
+ * lsm_bpf_prog_alloc - allocate a composite bpf_prog blob
+ * @prog: the bpf_prog that needs a blob
+ *
+ * Allocate the bpf_prog blob for all the modules
+ *
+ * Returns 0, or -ENOMEM if memory can't be allocated.
+ */
+static int lsm_bpf_prog_alloc(struct bpf_prog *prog)
+{
+ return lsm_blob_alloc(&prog->aux->security, blob_sizes.lbs_bpf_prog, GFP_KERNEL);
+}
+
+/**
+ * lsm_bpf_token_alloc - allocate a composite bpf_token blob
+ * @token: the bpf_token that needs a blob
+ *
+ * Allocate the bpf_token blob for all the modules
+ *
+ * Returns 0, or -ENOMEM if memory can't be allocated.
+ */
+static int lsm_bpf_token_alloc(struct bpf_token *token)
+{
+ return lsm_blob_alloc(&token->security, blob_sizes.lbs_bpf_token, GFP_KERNEL);
}
+#endif /* CONFIG_BPF_SYSCALL */
/**
* lsm_early_task - during initialization allocate a composite task blob
@@ -3185,7 +3224,7 @@ int security_file_truncate(struct file *file)
*
* Return: Returns a zero on success, negative values on failure.
*/
-int security_task_alloc(struct task_struct *task, unsigned long clone_flags)
+int security_task_alloc(struct task_struct *task, u64 clone_flags)
{
int rc = lsm_task_alloc(task);
@@ -4342,17 +4381,31 @@ EXPORT_SYMBOL(security_secid_to_secctx);
* security_lsmprop_to_secctx() - Convert a lsm_prop to a secctx
* @prop: lsm specific information
* @cp: the LSM context
+ * @lsmid: which security module to report
*
* Convert a @prop entry to security context. If @cp is NULL the
* length of the result will be returned. This does mean that the
* length could change between calls to check the length and the
* next call which actually allocates and returns the @cp.
*
+ * @lsmid identifies which LSM should supply the context.
+ * A value of LSM_ID_UNDEF indicates that the first LSM supplying
+ * the hook should be used. This is used in cases where the
+ * ID of the supplying LSM is unambiguous.
+ *
* Return: Return length of data on success, error on failure.
*/
-int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp)
+int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp,
+ int lsmid)
{
- return call_int_hook(lsmprop_to_secctx, prop, cp);
+ struct lsm_static_call *scall;
+
+ lsm_for_each_hook(scall, lsmprop_to_secctx) {
+ if (lsmid != LSM_ID_UNDEF && lsmid != scall->hl->lsmid->id)
+ continue;
+ return scall->hl->hook.lsmprop_to_secctx(prop, cp);
+ }
+ return LSM_RET_DEFAULT(lsmprop_to_secctx);
}
EXPORT_SYMBOL(security_lsmprop_to_secctx);
@@ -5714,7 +5767,16 @@ int security_bpf_prog(struct bpf_prog *prog)
int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
struct bpf_token *token, bool kernel)
{
- return call_int_hook(bpf_map_create, map, attr, token, kernel);
+ int rc;
+
+ rc = lsm_bpf_map_alloc(map);
+ if (unlikely(rc))
+ return rc;
+
+ rc = call_int_hook(bpf_map_create, map, attr, token, kernel);
+ if (unlikely(rc))
+ security_bpf_map_free(map);
+ return rc;
}
/**
@@ -5733,7 +5795,16 @@ int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
struct bpf_token *token, bool kernel)
{
- return call_int_hook(bpf_prog_load, prog, attr, token, kernel);
+ int rc;
+
+ rc = lsm_bpf_prog_alloc(prog);
+ if (unlikely(rc))
+ return rc;
+
+ rc = call_int_hook(bpf_prog_load, prog, attr, token, kernel);
+ if (unlikely(rc))
+ security_bpf_prog_free(prog);
+ return rc;
}
/**
@@ -5750,7 +5821,16 @@ int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
const struct path *path)
{
- return call_int_hook(bpf_token_create, token, attr, path);
+ int rc;
+
+ rc = lsm_bpf_token_alloc(token);
+ if (unlikely(rc))
+ return rc;
+
+ rc = call_int_hook(bpf_token_create, token, attr, path);
+ if (unlikely(rc))
+ security_bpf_token_free(token);
+ return rc;
}
/**
@@ -5794,6 +5874,8 @@ int security_bpf_token_capable(const struct bpf_token *token, int cap)
void security_bpf_map_free(struct bpf_map *map)
{
call_void_hook(bpf_map_free, map);
+ kfree(map->security);
+ map->security = NULL;
}
/**
@@ -5805,6 +5887,8 @@ void security_bpf_map_free(struct bpf_map *map)
void security_bpf_prog_free(struct bpf_prog *prog)
{
call_void_hook(bpf_prog_free, prog);
+ kfree(prog->aux->security);
+ prog->aux->security = NULL;
}
/**
@@ -5816,6 +5900,8 @@ void security_bpf_prog_free(struct bpf_prog *prog)
void security_bpf_token_free(struct bpf_token *token)
{
call_void_hook(bpf_token_free, token);
+ kfree(token->security);
+ token->security = NULL;
}
#endif /* CONFIG_BPF_SYSCALL */
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index 4b4837a20225..430b0e23ee00 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -292,27 +292,26 @@ static struct avc_xperms_decision_node
struct avc_xperms_decision_node *xpd_node;
struct extended_perms_decision *xpd;
- xpd_node = kmem_cache_zalloc(avc_xperms_decision_cachep,
- GFP_NOWAIT | __GFP_NOWARN);
+ xpd_node = kmem_cache_zalloc(avc_xperms_decision_cachep, GFP_NOWAIT);
if (!xpd_node)
return NULL;
xpd = &xpd_node->xpd;
if (which & XPERMS_ALLOWED) {
xpd->allowed = kmem_cache_zalloc(avc_xperms_data_cachep,
- GFP_NOWAIT | __GFP_NOWARN);
+ GFP_NOWAIT);
if (!xpd->allowed)
goto error;
}
if (which & XPERMS_AUDITALLOW) {
xpd->auditallow = kmem_cache_zalloc(avc_xperms_data_cachep,
- GFP_NOWAIT | __GFP_NOWARN);
+ GFP_NOWAIT);
if (!xpd->auditallow)
goto error;
}
if (which & XPERMS_DONTAUDIT) {
xpd->dontaudit = kmem_cache_zalloc(avc_xperms_data_cachep,
- GFP_NOWAIT | __GFP_NOWARN);
+ GFP_NOWAIT);
if (!xpd->dontaudit)
goto error;
}
@@ -340,7 +339,7 @@ static struct avc_xperms_node *avc_xperms_alloc(void)
{
struct avc_xperms_node *xp_node;
- xp_node = kmem_cache_zalloc(avc_xperms_cachep, GFP_NOWAIT | __GFP_NOWARN);
+ xp_node = kmem_cache_zalloc(avc_xperms_cachep, GFP_NOWAIT);
if (!xp_node)
return xp_node;
INIT_LIST_HEAD(&xp_node->xpd_head);
@@ -495,7 +494,7 @@ static struct avc_node *avc_alloc_node(void)
{
struct avc_node *node;
- node = kmem_cache_zalloc(avc_node_cachep, GFP_NOWAIT | __GFP_NOWARN);
+ node = kmem_cache_zalloc(avc_node_cachep, GFP_NOWAIT);
if (!node)
goto out;
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index c95a5874bf7d..76b66845a1c3 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -476,7 +476,9 @@ static int selinux_is_genfs_special_handling(struct super_block *sb)
!strcmp(sb->s_type->name, "rootfs") ||
(selinux_policycap_cgroupseclabel() &&
(!strcmp(sb->s_type->name, "cgroup") ||
- !strcmp(sb->s_type->name, "cgroup2")));
+ !strcmp(sb->s_type->name, "cgroup2"))) ||
+ (selinux_policycap_functionfs_seclabel() &&
+ !strcmp(sb->s_type->name, "functionfs"));
}
static int selinux_is_sblabel_mnt(struct super_block *sb)
@@ -741,7 +743,9 @@ static int selinux_set_mnt_opts(struct super_block *sb,
!strcmp(sb->s_type->name, "binder") ||
!strcmp(sb->s_type->name, "bpf") ||
!strcmp(sb->s_type->name, "pstore") ||
- !strcmp(sb->s_type->name, "securityfs"))
+ !strcmp(sb->s_type->name, "securityfs") ||
+ (selinux_policycap_functionfs_seclabel() &&
+ !strcmp(sb->s_type->name, "functionfs")))
sbsec->flags |= SE_SBGENFS;
if (!strcmp(sb->s_type->name, "sysfs") ||
@@ -4144,7 +4148,7 @@ static int selinux_file_open(struct file *file)
/* task security operations */
static int selinux_task_alloc(struct task_struct *task,
- unsigned long clone_flags)
+ u64 clone_flags)
{
u32 sid = current_sid();
@@ -5885,7 +5889,7 @@ static unsigned int selinux_ip_output(void *priv, struct sk_buff *skb,
/* we do this in the LOCAL_OUT path and not the POST_ROUTING path
* because we want to make sure we apply the necessary labeling
* before IPsec is applied so we can leverage AH protection */
- sk = sk_to_full_sk(skb->sk);
+ sk = skb_to_full_sk(skb);
if (sk) {
struct sk_security_struct *sksec;
@@ -7062,14 +7066,14 @@ static int bpf_fd_pass(const struct file *file, u32 sid)
if (file->f_op == &bpf_map_fops) {
map = file->private_data;
- bpfsec = map->security;
+ bpfsec = selinux_bpf_map_security(map);
ret = avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF,
bpf_map_fmode_to_av(file->f_mode), NULL);
if (ret)
return ret;
} else if (file->f_op == &bpf_prog_fops) {
prog = file->private_data;
- bpfsec = prog->aux->security;
+ bpfsec = selinux_bpf_prog_security(prog);
ret = avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF,
BPF__PROG_RUN, NULL);
if (ret)
@@ -7083,7 +7087,7 @@ static int selinux_bpf_map(struct bpf_map *map, fmode_t fmode)
u32 sid = current_sid();
struct bpf_security_struct *bpfsec;
- bpfsec = map->security;
+ bpfsec = selinux_bpf_map_security(map);
return avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF,
bpf_map_fmode_to_av(fmode), NULL);
}
@@ -7093,7 +7097,7 @@ static int selinux_bpf_prog(struct bpf_prog *prog)
u32 sid = current_sid();
struct bpf_security_struct *bpfsec;
- bpfsec = prog->aux->security;
+ bpfsec = selinux_bpf_prog_security(prog);
return avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF,
BPF__PROG_RUN, NULL);
}
@@ -7103,69 +7107,33 @@ static int selinux_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
{
struct bpf_security_struct *bpfsec;
- bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL);
- if (!bpfsec)
- return -ENOMEM;
-
+ bpfsec = selinux_bpf_map_security(map);
bpfsec->sid = current_sid();
- map->security = bpfsec;
return 0;
}
-static void selinux_bpf_map_free(struct bpf_map *map)
-{
- struct bpf_security_struct *bpfsec = map->security;
-
- map->security = NULL;
- kfree(bpfsec);
-}
-
static int selinux_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
struct bpf_token *token, bool kernel)
{
struct bpf_security_struct *bpfsec;
- bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL);
- if (!bpfsec)
- return -ENOMEM;
-
+ bpfsec = selinux_bpf_prog_security(prog);
bpfsec->sid = current_sid();
- prog->aux->security = bpfsec;
return 0;
}
-static void selinux_bpf_prog_free(struct bpf_prog *prog)
-{
- struct bpf_security_struct *bpfsec = prog->aux->security;
-
- prog->aux->security = NULL;
- kfree(bpfsec);
-}
-
static int selinux_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
const struct path *path)
{
struct bpf_security_struct *bpfsec;
- bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL);
- if (!bpfsec)
- return -ENOMEM;
-
+ bpfsec = selinux_bpf_token_security(token);
bpfsec->sid = current_sid();
- token->security = bpfsec;
return 0;
}
-
-static void selinux_bpf_token_free(struct bpf_token *token)
-{
- struct bpf_security_struct *bpfsec = token->security;
-
- token->security = NULL;
- kfree(bpfsec);
-}
#endif
struct lsm_blob_sizes selinux_blob_sizes __ro_after_init = {
@@ -7183,6 +7151,9 @@ struct lsm_blob_sizes selinux_blob_sizes __ro_after_init = {
.lbs_xattr_count = SELINUX_INODE_INIT_XATTRS,
.lbs_tun_dev = sizeof(struct tun_security_struct),
.lbs_ib = sizeof(struct ib_security_struct),
+ .lbs_bpf_map = sizeof(struct bpf_security_struct),
+ .lbs_bpf_prog = sizeof(struct bpf_security_struct),
+ .lbs_bpf_token = sizeof(struct bpf_security_struct),
};
#ifdef CONFIG_PERF_EVENTS
@@ -7536,9 +7507,6 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
LSM_HOOK_INIT(bpf, selinux_bpf),
LSM_HOOK_INIT(bpf_map, selinux_bpf_map),
LSM_HOOK_INIT(bpf_prog, selinux_bpf_prog),
- LSM_HOOK_INIT(bpf_map_free, selinux_bpf_map_free),
- LSM_HOOK_INIT(bpf_prog_free, selinux_bpf_prog_free),
- LSM_HOOK_INIT(bpf_token_free, selinux_bpf_token_free),
#endif
#ifdef CONFIG_PERF_EVENTS
@@ -7618,6 +7586,11 @@ static __init int selinux_init(void)
/* Set the security state for the initial task. */
cred_init_security();
+ /* Inform the audit system that secctx is used */
+ audit_cfg_lsm(&selinux_lsmid,
+ AUDIT_CFG_LSM_SECCTX_SUBJECT |
+ AUDIT_CFG_LSM_SECCTX_OBJECT);
+
default_noexec = !(VM_DATA_DEFAULT_FLAGS & VM_EXEC);
if (!default_noexec)
pr_notice("SELinux: virtual memory is executable by default\n");
diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h
index 1d7ac59015a1..2d5139c6d45b 100644
--- a/security/selinux/include/objsec.h
+++ b/security/selinux/include/objsec.h
@@ -26,6 +26,7 @@
#include <linux/lsm_hooks.h>
#include <linux/msg.h>
#include <net/net_namespace.h>
+#include <linux/bpf.h>
#include "flask.h"
#include "avc.h"
@@ -245,4 +246,23 @@ selinux_perf_event(void *perf_event)
return perf_event + selinux_blob_sizes.lbs_perf_event;
}
+#ifdef CONFIG_BPF_SYSCALL
+static inline struct bpf_security_struct *
+selinux_bpf_map_security(struct bpf_map *map)
+{
+ return map->security + selinux_blob_sizes.lbs_bpf_map;
+}
+
+static inline struct bpf_security_struct *
+selinux_bpf_prog_security(struct bpf_prog *prog)
+{
+ return prog->aux->security + selinux_blob_sizes.lbs_bpf_prog;
+}
+
+static inline struct bpf_security_struct *
+selinux_bpf_token_security(struct bpf_token *token)
+{
+ return token->security + selinux_blob_sizes.lbs_bpf_token;
+}
+#endif /* CONFIG_BPF_SYSCALL */
#endif /* _SELINUX_OBJSEC_H_ */
diff --git a/security/selinux/include/policycap.h b/security/selinux/include/policycap.h
index 7405154e6c42..135a969f873c 100644
--- a/security/selinux/include/policycap.h
+++ b/security/selinux/include/policycap.h
@@ -17,6 +17,7 @@ enum {
POLICYDB_CAP_NETLINK_XPERM,
POLICYDB_CAP_NETIF_WILDCARD,
POLICYDB_CAP_GENFS_SECLABEL_WILDCARD,
+ POLICYDB_CAP_FUNCTIONFS_SECLABEL,
__POLICYDB_CAP_MAX
};
#define POLICYDB_CAP_MAX (__POLICYDB_CAP_MAX - 1)
diff --git a/security/selinux/include/policycap_names.h b/security/selinux/include/policycap_names.h
index d8962fcf2ff9..ff8882887651 100644
--- a/security/selinux/include/policycap_names.h
+++ b/security/selinux/include/policycap_names.h
@@ -20,6 +20,7 @@ const char *const selinux_policycap_names[__POLICYDB_CAP_MAX] = {
"netlink_xperm",
"netif_wildcard",
"genfs_seclabel_wildcard",
+ "functionfs_seclabel",
};
/* clang-format on */
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h
index 8201e6a3ac0f..0f954a40d3fc 100644
--- a/security/selinux/include/security.h
+++ b/security/selinux/include/security.h
@@ -203,10 +203,10 @@ static inline bool selinux_policycap_netlink_xperm(void)
selinux_state.policycap[POLICYDB_CAP_NETLINK_XPERM]);
}
-static inline bool selinux_policycap_netif_wildcard(void)
+static inline bool selinux_policycap_functionfs_seclabel(void)
{
return READ_ONCE(
- selinux_state.policycap[POLICYDB_CAP_NETIF_WILDCARD]);
+ selinux_state.policycap[POLICYDB_CAP_FUNCTIONFS_SECLABEL]);
}
struct selinux_policy_convert_data;
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 9aa1d03ab612..232e087bce3e 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -1203,7 +1203,7 @@ static ssize_t sel_read_bool(struct file *filep, char __user *buf,
size_t count, loff_t *ppos)
{
struct selinux_fs_info *fsi = file_inode(filep)->i_sb->s_fs_info;
- char *page = NULL;
+ char buffer[4];
ssize_t length;
ssize_t ret;
int cur_enforcing;
@@ -1217,27 +1217,19 @@ static ssize_t sel_read_bool(struct file *filep, char __user *buf,
fsi->bool_pending_names[index]))
goto out_unlock;
- ret = -ENOMEM;
- page = (char *)get_zeroed_page(GFP_KERNEL);
- if (!page)
- goto out_unlock;
-
cur_enforcing = security_get_bool_value(index);
if (cur_enforcing < 0) {
ret = cur_enforcing;
goto out_unlock;
}
- length = scnprintf(page, PAGE_SIZE, "%d %d", cur_enforcing,
- fsi->bool_pending_values[index]);
+ length = scnprintf(buffer, sizeof(buffer), "%d %d", !!cur_enforcing,
+ !!fsi->bool_pending_values[index]);
mutex_unlock(&selinux_state.policy_mutex);
- ret = simple_read_from_buffer(buf, count, ppos, page, length);
-out_free:
- free_page((unsigned long)page);
- return ret;
+ return simple_read_from_buffer(buf, count, ppos, buffer, length);
out_unlock:
mutex_unlock(&selinux_state.policy_mutex);
- goto out_free;
+ return ret;
}
static ssize_t sel_write_bool(struct file *filep, const char __user *buf,
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index fc340a6f0dde..fdf2f193a291 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -5267,6 +5267,11 @@ static __init int smack_init(void)
/* initialize the smack_known_list */
init_smack_known_list();
+ /* Inform the audit system that secctx is used */
+ audit_cfg_lsm(&smack_lsmid,
+ AUDIT_CFG_LSM_SECCTX_SUBJECT |
+ AUDIT_CFG_LSM_SECCTX_OBJECT);
+
return 0;
}
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index d6ebcd9db80a..48fc59d38ab2 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -514,7 +514,7 @@ struct lsm_blob_sizes tomoyo_blob_sizes __ro_after_init = {
* Returns 0.
*/
static int tomoyo_task_alloc(struct task_struct *task,
- unsigned long clone_flags)
+ u64 clone_flags)
{
struct tomoyo_task *old = tomoyo_task(current);
struct tomoyo_task *new = tomoyo_task(task);
diff --git a/sound/core/timer.c b/sound/core/timer.c
index 3ce12264eed8..d9fff5c87613 100644
--- a/sound/core/timer.c
+++ b/sound/core/timer.c
@@ -2139,14 +2139,14 @@ static int snd_utimer_create(struct snd_timer_uinfo *utimer_info,
goto err_take_id;
}
+ utimer->id = utimer_id;
+
utimer->name = kasprintf(GFP_KERNEL, "snd-utimer%d", utimer_id);
if (!utimer->name) {
err = -ENOMEM;
goto err_get_name;
}
- utimer->id = utimer_id;
-
tid.dev_sclass = SNDRV_TIMER_SCLASS_APPLICATION;
tid.dev_class = SNDRV_TIMER_CLASS_GLOBAL;
tid.card = -1;
diff --git a/sound/firewire/motu/motu-hwdep.c b/sound/firewire/motu/motu-hwdep.c
index 1ed60618220d..fa2685665db3 100644
--- a/sound/firewire/motu/motu-hwdep.c
+++ b/sound/firewire/motu/motu-hwdep.c
@@ -111,7 +111,7 @@ static __poll_t hwdep_poll(struct snd_hwdep *hwdep, struct file *file,
events = 0;
spin_unlock_irq(&motu->lock);
- return events | EPOLLOUT;
+ return events;
}
static int hwdep_get_info(struct snd_motu *motu, void __user *arg)
diff --git a/sound/hda/codecs/hdmi/hdmi.c b/sound/hda/codecs/hdmi/hdmi.c
index b5d840d9892b..44576b30f699 100644
--- a/sound/hda/codecs/hdmi/hdmi.c
+++ b/sound/hda/codecs/hdmi/hdmi.c
@@ -1582,6 +1582,7 @@ static int hdmi_add_cvt(struct hda_codec *codec, hda_nid_t cvt_nid)
static const struct snd_pci_quirk force_connect_list[] = {
SND_PCI_QUIRK(0x103c, 0x83e2, "HP EliteDesk 800 G4", 1),
SND_PCI_QUIRK(0x103c, 0x83ef, "HP MP9 G4 Retail System AMS", 1),
+ SND_PCI_QUIRK(0x103c, 0x845a, "HP EliteDesk 800 G4 DM 65W", 1),
SND_PCI_QUIRK(0x103c, 0x870f, "HP", 1),
SND_PCI_QUIRK(0x103c, 0x871a, "HP", 1),
SND_PCI_QUIRK(0x103c, 0x8711, "HP", 1),
diff --git a/sound/hda/codecs/hdmi/nvhdmi.c b/sound/hda/codecs/hdmi/nvhdmi.c
index b513253b1101..94671ad24b5e 100644
--- a/sound/hda/codecs/hdmi/nvhdmi.c
+++ b/sound/hda/codecs/hdmi/nvhdmi.c
@@ -198,15 +198,32 @@ static const struct hda_device_id snd_hda_id_nvhdmi[] = {
HDA_CODEC_ID_MODEL(0x10de0098, "GPU 98 HDMI/DP", MODEL_GENERIC),
HDA_CODEC_ID_MODEL(0x10de0099, "GPU 99 HDMI/DP", MODEL_GENERIC),
HDA_CODEC_ID_MODEL(0x10de009a, "GPU 9a HDMI/DP", MODEL_GENERIC),
+ HDA_CODEC_ID_MODEL(0x10de009b, "GPU 9b HDMI/DP", MODEL_GENERIC),
+ HDA_CODEC_ID_MODEL(0x10de009c, "GPU 9c HDMI/DP", MODEL_GENERIC),
HDA_CODEC_ID_MODEL(0x10de009d, "GPU 9d HDMI/DP", MODEL_GENERIC),
HDA_CODEC_ID_MODEL(0x10de009e, "GPU 9e HDMI/DP", MODEL_GENERIC),
HDA_CODEC_ID_MODEL(0x10de009f, "GPU 9f HDMI/DP", MODEL_GENERIC),
HDA_CODEC_ID_MODEL(0x10de00a0, "GPU a0 HDMI/DP", MODEL_GENERIC),
+ HDA_CODEC_ID_MODEL(0x10de00a1, "GPU a1 HDMI/DP", MODEL_GENERIC),
HDA_CODEC_ID_MODEL(0x10de00a3, "GPU a3 HDMI/DP", MODEL_GENERIC),
HDA_CODEC_ID_MODEL(0x10de00a4, "GPU a4 HDMI/DP", MODEL_GENERIC),
HDA_CODEC_ID_MODEL(0x10de00a5, "GPU a5 HDMI/DP", MODEL_GENERIC),
HDA_CODEC_ID_MODEL(0x10de00a6, "GPU a6 HDMI/DP", MODEL_GENERIC),
HDA_CODEC_ID_MODEL(0x10de00a7, "GPU a7 HDMI/DP", MODEL_GENERIC),
+ HDA_CODEC_ID_MODEL(0x10de00a8, "GPU a8 HDMI/DP", MODEL_GENERIC),
+ HDA_CODEC_ID_MODEL(0x10de00a9, "GPU a9 HDMI/DP", MODEL_GENERIC),
+ HDA_CODEC_ID_MODEL(0x10de00aa, "GPU aa HDMI/DP", MODEL_GENERIC),
+ HDA_CODEC_ID_MODEL(0x10de00ab, "GPU ab HDMI/DP", MODEL_GENERIC),
+ HDA_CODEC_ID_MODEL(0x10de00ad, "GPU ad HDMI/DP", MODEL_GENERIC),
+ HDA_CODEC_ID_MODEL(0x10de00ae, "GPU ae HDMI/DP", MODEL_GENERIC),
+ HDA_CODEC_ID_MODEL(0x10de00af, "GPU af HDMI/DP", MODEL_GENERIC),
+ HDA_CODEC_ID_MODEL(0x10de00b0, "GPU b0 HDMI/DP", MODEL_GENERIC),
+ HDA_CODEC_ID_MODEL(0x10de00b1, "GPU b1 HDMI/DP", MODEL_GENERIC),
+ HDA_CODEC_ID_MODEL(0x10de00c0, "GPU c0 HDMI/DP", MODEL_GENERIC),
+ HDA_CODEC_ID_MODEL(0x10de00c1, "GPU c1 HDMI/DP", MODEL_GENERIC),
+ HDA_CODEC_ID_MODEL(0x10de00c3, "GPU c3 HDMI/DP", MODEL_GENERIC),
+ HDA_CODEC_ID_MODEL(0x10de00c4, "GPU c4 HDMI/DP", MODEL_GENERIC),
+ HDA_CODEC_ID_MODEL(0x10de00c5, "GPU c5 HDMI/DP", MODEL_GENERIC),
{} /* terminator */
};
MODULE_DEVICE_TABLE(hdaudio, snd_hda_id_nvhdmi);
diff --git a/sound/hda/codecs/hdmi/tegrahdmi.c b/sound/hda/codecs/hdmi/tegrahdmi.c
index f1f745187f68..5f6fe31aa202 100644
--- a/sound/hda/codecs/hdmi/tegrahdmi.c
+++ b/sound/hda/codecs/hdmi/tegrahdmi.c
@@ -299,7 +299,9 @@ static const struct hda_device_id snd_hda_id_tegrahdmi[] = {
HDA_CODEC_ID_MODEL(0x10de002f, "Tegra194 HDMI/DP2", MODEL_TEGRA),
HDA_CODEC_ID_MODEL(0x10de0030, "Tegra194 HDMI/DP3", MODEL_TEGRA),
HDA_CODEC_ID_MODEL(0x10de0031, "Tegra234 HDMI/DP", MODEL_TEGRA234),
+ HDA_CODEC_ID_MODEL(0x10de0033, "SoC 33 HDMI/DP", MODEL_TEGRA234),
HDA_CODEC_ID_MODEL(0x10de0034, "Tegra264 HDMI/DP", MODEL_TEGRA234),
+ HDA_CODEC_ID_MODEL(0x10de0035, "SoC 35 HDMI/DP", MODEL_TEGRA234),
{} /* terminator */
};
MODULE_DEVICE_TABLE(hdaudio, snd_hda_id_tegrahdmi);
diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c
index e27a36e4e92a..f267437c9698 100644
--- a/sound/hda/codecs/realtek/alc269.c
+++ b/sound/hda/codecs/realtek/alc269.c
@@ -510,6 +510,15 @@ static void alc256_shutup(struct hda_codec *codec)
hp_pin = 0x21;
alc_update_coefex_idx(codec, 0x57, 0x04, 0x0007, 0x1); /* Low power */
+
+ /* 3k pull low control for Headset jack. */
+ /* NOTE: call this before clearing the pin, otherwise codec stalls */
+ /* If disable 3k pulldown control for alc257, the Mic detection will not work correctly
+ * when booting with headset plugged. So skip setting it for the codec alc257
+ */
+ if (spec->en_3kpull_low)
+ alc_update_coef_idx(codec, 0x46, 0, 3 << 12);
+
hp_pin_sense = snd_hda_jack_detect(codec, hp_pin);
if (hp_pin_sense) {
@@ -520,14 +529,6 @@ static void alc256_shutup(struct hda_codec *codec)
msleep(75);
- /* 3k pull low control for Headset jack. */
- /* NOTE: call this before clearing the pin, otherwise codec stalls */
- /* If disable 3k pulldown control for alc257, the Mic detection will not work correctly
- * when booting with headset plugged. So skip setting it for the codec alc257
- */
- if (spec->en_3kpull_low)
- alc_update_coef_idx(codec, 0x46, 0, 3 << 12);
-
if (!spec->no_shutup_pins)
snd_hda_codec_write(codec, hp_pin, 0,
AC_VERB_SET_PIN_WIDGET_CONTROL, 0x0);
@@ -3579,6 +3580,7 @@ enum {
ALC286_FIXUP_ACER_AIO_MIC_NO_PRESENCE,
ALC294_FIXUP_ASUS_MIC,
ALC294_FIXUP_ASUS_HEADSET_MIC,
+ ALC294_FIXUP_ASUS_I2C_HEADSET_MIC,
ALC294_FIXUP_ASUS_SPK,
ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE,
ALC285_FIXUP_LENOVO_PC_BEEP_IN_NOISE,
@@ -3700,6 +3702,7 @@ enum {
ALC236_FIXUP_DELL_DUAL_CODECS,
ALC287_FIXUP_CS35L41_I2C_2_THINKPAD_ACPI,
ALC287_FIXUP_TAS2781_I2C,
+ ALC295_FIXUP_DELL_TAS2781_I2C,
ALC245_FIXUP_TAS2781_SPI_2,
ALC287_FIXUP_TXNW2781_I2C,
ALC287_FIXUP_YOGA7_14ARB7_I2C,
@@ -4889,6 +4892,15 @@ static const struct hda_fixup alc269_fixups[] = {
.chained = true,
.chain_id = ALC269_FIXUP_HEADSET_MIC
},
+ [ALC294_FIXUP_ASUS_I2C_HEADSET_MIC] = {
+ .type = HDA_FIXUP_PINS,
+ .v.pins = (const struct hda_pintbl[]) {
+ { 0x19, 0x03a19020 }, /* use as headset mic */
+ { }
+ },
+ .chained = true,
+ .chain_id = ALC287_FIXUP_CS35L41_I2C_2
+ },
[ALC294_FIXUP_ASUS_SPK] = {
.type = HDA_FIXUP_VERBS,
.v.verbs = (const struct hda_verb[]) {
@@ -5156,6 +5168,12 @@ static const struct hda_fixup alc269_fixups[] = {
.type = HDA_FIXUP_FUNC,
.v.func = alc294_fixup_gx502_hp,
},
+ [ALC295_FIXUP_DELL_TAS2781_I2C] = {
+ .type = HDA_FIXUP_FUNC,
+ .v.func = tas2781_fixup_tias_i2c,
+ .chained = true,
+ .chain_id = ALC289_FIXUP_DUAL_SPK
+ },
[ALC294_FIXUP_ASUS_GU502_PINS] = {
.type = HDA_FIXUP_PINS,
.v.pins = (const struct hda_pintbl[]) {
@@ -6278,8 +6296,8 @@ static const struct hda_quirk alc269_fixup_tbl[] = {
SND_PCI_QUIRK(0x1028, 0x0c1e, "Dell Precision 3540", ALC236_FIXUP_DELL_DUAL_CODECS),
SND_PCI_QUIRK(0x1028, 0x0c28, "Dell Inspiron 16 Plus 7630", ALC295_FIXUP_DELL_INSPIRON_TOP_SPEAKERS),
SND_PCI_QUIRK(0x1028, 0x0c4d, "Dell", ALC287_FIXUP_CS35L41_I2C_4),
- SND_PCI_QUIRK(0x1028, 0x0c94, "Dell Polaris 3 metal", ALC287_FIXUP_TAS2781_I2C),
- SND_PCI_QUIRK(0x1028, 0x0c96, "Dell Polaris 2in1", ALC287_FIXUP_TAS2781_I2C),
+ SND_PCI_QUIRK(0x1028, 0x0c94, "Dell Polaris 3 metal", ALC295_FIXUP_DELL_TAS2781_I2C),
+ SND_PCI_QUIRK(0x1028, 0x0c96, "Dell Polaris 2in1", ALC295_FIXUP_DELL_TAS2781_I2C),
SND_PCI_QUIRK(0x1028, 0x0cbd, "Dell Oasis 13 CS MTL-U", ALC289_FIXUP_DELL_CS35L41_SPI_2),
SND_PCI_QUIRK(0x1028, 0x0cbe, "Dell Oasis 13 2-IN-1 MTL-U", ALC289_FIXUP_DELL_CS35L41_SPI_2),
SND_PCI_QUIRK(0x1028, 0x0cbf, "Dell Oasis 13 Low Weight MTU-L", ALC289_FIXUP_DELL_CS35L41_SPI_2),
@@ -6368,6 +6386,8 @@ static const struct hda_quirk alc269_fixup_tbl[] = {
SND_PCI_QUIRK(0x103c, 0x84e7, "HP Pavilion 15", ALC269_FIXUP_HP_MUTE_LED_MIC3),
SND_PCI_QUIRK(0x103c, 0x8519, "HP Spectre x360 15-df0xxx", ALC285_FIXUP_HP_SPECTRE_X360),
SND_PCI_QUIRK(0x103c, 0x8537, "HP ProBook 440 G6", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF),
+ SND_PCI_QUIRK(0x103c, 0x8548, "HP EliteBook x360 830 G6", ALC285_FIXUP_HP_GPIO_LED),
+ SND_PCI_QUIRK(0x103c, 0x854a, "HP EliteBook 830 G6", ALC285_FIXUP_HP_GPIO_LED),
SND_PCI_QUIRK(0x103c, 0x85c6, "HP Pavilion x360 Convertible 14-dy1xxx", ALC295_FIXUP_HP_MUTE_LED_COEFBIT11),
SND_PCI_QUIRK(0x103c, 0x85de, "HP Envy x360 13-ar0xxx", ALC285_FIXUP_HP_ENVY_X360),
SND_PCI_QUIRK(0x103c, 0x860f, "HP ZBook 15 G6", ALC285_FIXUP_HP_GPIO_AMP_INIT),
@@ -6456,6 +6476,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = {
SND_PCI_QUIRK(0x103c, 0x8992, "HP EliteBook 845 G9", ALC287_FIXUP_CS35L41_I2C_2),
SND_PCI_QUIRK(0x103c, 0x8994, "HP EliteBook 855 G9", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED),
SND_PCI_QUIRK(0x103c, 0x8995, "HP EliteBook 855 G9", ALC287_FIXUP_CS35L41_I2C_2),
+ SND_PCI_QUIRK(0x103c, 0x89a0, "HP Laptop 15-dw4xxx", ALC236_FIXUP_HP_MUTE_LED_COEFBIT2),
SND_PCI_QUIRK(0x103c, 0x89a4, "HP ProBook 440 G9", ALC236_FIXUP_HP_GPIO_LED),
SND_PCI_QUIRK(0x103c, 0x89a6, "HP ProBook 450 G9", ALC236_FIXUP_HP_GPIO_LED),
SND_PCI_QUIRK(0x103c, 0x89aa, "HP EliteBook 630 G9", ALC236_FIXUP_HP_GPIO_LED),
@@ -6728,7 +6749,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = {
SND_PCI_QUIRK(0x1043, 0x1b13, "ASUS U41SV/GA403U", ALC285_FIXUP_ASUS_GA403U_HEADSET_MIC),
SND_PCI_QUIRK(0x1043, 0x1b93, "ASUS G614JVR/JIR", ALC245_FIXUP_CS35L41_SPI_2),
SND_PCI_QUIRK(0x1043, 0x1bbd, "ASUS Z550MA", ALC255_FIXUP_ASUS_MIC_NO_PRESENCE),
- SND_PCI_QUIRK(0x1043, 0x1c03, "ASUS UM3406HA", ALC287_FIXUP_CS35L41_I2C_2),
+ SND_PCI_QUIRK(0x1043, 0x1c03, "ASUS UM3406HA", ALC294_FIXUP_ASUS_I2C_HEADSET_MIC),
SND_PCI_QUIRK(0x1043, 0x1c23, "Asus X55U", ALC269_FIXUP_LIMIT_INT_MIC_BOOST),
SND_PCI_QUIRK(0x1043, 0x1c33, "ASUS UX5304MA", ALC245_FIXUP_CS35L41_SPI_2),
SND_PCI_QUIRK(0x1043, 0x1c43, "ASUS UX8406MA", ALC245_FIXUP_CS35L41_SPI_2),
@@ -7058,8 +7079,8 @@ static const struct hda_quirk alc269_fixup_tbl[] = {
SND_PCI_QUIRK(0x17aa, 0x38be, "Yoga S980-14.5 proX YC Dual", ALC287_FIXUP_TAS2781_I2C),
SND_PCI_QUIRK(0x17aa, 0x38bf, "Yoga S980-14.5 proX LX Dual", ALC287_FIXUP_TAS2781_I2C),
SND_PCI_QUIRK(0x17aa, 0x38c3, "Y980 DUAL", ALC287_FIXUP_TAS2781_I2C),
- SND_PCI_QUIRK(0x17aa, 0x38c7, "Thinkbook 13x Gen 4", ALC287_FIXUP_CS35L41_I2C_4),
- SND_PCI_QUIRK(0x17aa, 0x38c8, "Thinkbook 13x Gen 4", ALC287_FIXUP_CS35L41_I2C_4),
+ SND_PCI_QUIRK(0x17aa, 0x38c7, "Thinkbook 13x Gen 4", ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD),
+ SND_PCI_QUIRK(0x17aa, 0x38c8, "Thinkbook 13x Gen 4", ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD),
SND_PCI_QUIRK(0x17aa, 0x38cb, "Y790 YG DUAL", ALC287_FIXUP_TAS2781_I2C),
SND_PCI_QUIRK(0x17aa, 0x38cd, "Y790 VECO DUAL", ALC287_FIXUP_TAS2781_I2C),
SND_PCI_QUIRK(0x17aa, 0x38d2, "Lenovo Yoga 9 14IMH9", ALC287_FIXUP_YOGA9_14IMH9_BASS_SPK_PIN),
@@ -7080,6 +7101,8 @@ static const struct hda_quirk alc269_fixup_tbl[] = {
SND_PCI_QUIRK(0x17aa, 0x3913, "Lenovo 145", ALC236_FIXUP_LENOVO_INV_DMIC),
SND_PCI_QUIRK(0x17aa, 0x391f, "Yoga S990-16 pro Quad YC Quad", ALC287_FIXUP_TXNW2781_I2C),
SND_PCI_QUIRK(0x17aa, 0x3920, "Yoga S990-16 pro Quad VECO Quad", ALC287_FIXUP_TXNW2781_I2C),
+ SND_PCI_QUIRK(0x17aa, 0x3929, "Thinkbook 13x Gen 5", ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD),
+ SND_PCI_QUIRK(0x17aa, 0x392b, "Thinkbook 13x Gen 5", ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD),
SND_PCI_QUIRK(0x17aa, 0x3977, "IdeaPad S210", ALC283_FIXUP_INT_MIC),
SND_PCI_QUIRK(0x17aa, 0x3978, "Lenovo B50-70", ALC269_FIXUP_DMIC_THINKPAD_ACPI),
SND_PCI_QUIRK(0x17aa, 0x3bf8, "Quanta FL1", ALC269_FIXUP_PCM_44K),
@@ -7134,12 +7157,15 @@ static const struct hda_quirk alc269_fixup_tbl[] = {
SND_PCI_QUIRK(0x1d05, 0x121b, "TongFang GMxAGxx", ALC269_FIXUP_NO_SHUTUP),
SND_PCI_QUIRK(0x1d05, 0x1387, "TongFang GMxIXxx", ALC2XX_FIXUP_HEADSET_MIC),
SND_PCI_QUIRK(0x1d05, 0x1409, "TongFang GMxIXxx", ALC2XX_FIXUP_HEADSET_MIC),
+ SND_PCI_QUIRK(0x1d05, 0x300f, "TongFang X6AR5xxY", ALC2XX_FIXUP_HEADSET_MIC),
+ SND_PCI_QUIRK(0x1d05, 0x3019, "TongFang X6FR5xxY", ALC2XX_FIXUP_HEADSET_MIC),
SND_PCI_QUIRK(0x1d17, 0x3288, "Haier Boyue G42", ALC269VC_FIXUP_ACER_VCOPPERBOX_PINS),
SND_PCI_QUIRK(0x1d72, 0x1602, "RedmiBook", ALC255_FIXUP_XIAOMI_HEADSET_MIC),
SND_PCI_QUIRK(0x1d72, 0x1701, "XiaomiNotebook Pro", ALC298_FIXUP_DELL1_MIC_NO_PRESENCE),
SND_PCI_QUIRK(0x1d72, 0x1901, "RedmiBook 14", ALC256_FIXUP_ASUS_HEADSET_MIC),
SND_PCI_QUIRK(0x1d72, 0x1945, "Redmi G", ALC256_FIXUP_ASUS_HEADSET_MIC),
SND_PCI_QUIRK(0x1d72, 0x1947, "RedmiBook Air", ALC255_FIXUP_XIAOMI_HEADSET_MIC),
+ SND_PCI_QUIRK(0x1ee7, 0x2078, "HONOR BRB-X M1010", ALC2XX_FIXUP_HEADSET_MIC),
SND_PCI_QUIRK(0x1f66, 0x0105, "Ayaneo Portable Game Player", ALC287_FIXUP_CS35L41_I2C_2),
SND_PCI_QUIRK(0x2014, 0x800a, "Positivo ARN50", ALC269_FIXUP_LIMIT_INT_MIC_BOOST),
SND_PCI_QUIRK(0x2782, 0x0214, "VAIO VJFE-CL", ALC269_FIXUP_LIMIT_INT_MIC_BOOST),
@@ -7158,6 +7184,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = {
SND_PCI_QUIRK(0xf111, 0x0001, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE),
SND_PCI_QUIRK(0xf111, 0x0006, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE),
SND_PCI_QUIRK(0xf111, 0x0009, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE),
+ SND_PCI_QUIRK(0xf111, 0x000b, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE),
SND_PCI_QUIRK(0xf111, 0x000c, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE),
#if 0
diff --git a/sound/hda/codecs/side-codecs/cs35l41_hda_property.c b/sound/hda/codecs/side-codecs/cs35l41_hda_property.c
index d8249d997c2a..16d5ea77192f 100644
--- a/sound/hda/codecs/side-codecs/cs35l41_hda_property.c
+++ b/sound/hda/codecs/side-codecs/cs35l41_hda_property.c
@@ -135,6 +135,8 @@ static const struct cs35l41_config cs35l41_config_table[] = {
{ "17AA38C8", 4, INTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, CS35L41_RIGHT, CS35L41_LEFT }, 0, 2, -1, 1000, 4500, 24 },
{ "17AA38F9", 2, EXTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, 0, 0 }, 0, 2, -1, 0, 0, 0 },
{ "17AA38FA", 2, EXTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, 0, 0 }, 0, 2, -1, 0, 0, 0 },
+ { "17AA3929", 4, INTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, CS35L41_RIGHT, CS35L41_LEFT }, 0, 2, -1, 1000, 4500, 24 },
+ { "17AA392B", 4, INTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, CS35L41_RIGHT, CS35L41_LEFT }, 0, 2, -1, 1000, 4500, 24 },
{}
};
@@ -558,6 +560,8 @@ static const struct cs35l41_prop_model cs35l41_prop_model_table[] = {
{ "CSC3551", "17AA38C8", generic_dsd_config },
{ "CSC3551", "17AA38F9", generic_dsd_config },
{ "CSC3551", "17AA38FA", generic_dsd_config },
+ { "CSC3551", "17AA3929", generic_dsd_config },
+ { "CSC3551", "17AA392B", generic_dsd_config },
{}
};
diff --git a/sound/hda/codecs/side-codecs/tas2781_hda.c b/sound/hda/codecs/side-codecs/tas2781_hda.c
index f46d2e06c64f..96e6d82dc69e 100644
--- a/sound/hda/codecs/side-codecs/tas2781_hda.c
+++ b/sound/hda/codecs/side-codecs/tas2781_hda.c
@@ -33,6 +33,23 @@ const efi_guid_t tasdev_fct_efi_guid[] = {
};
EXPORT_SYMBOL_NS_GPL(tasdev_fct_efi_guid, "SND_HDA_SCODEC_TAS2781");
+/*
+ * The calibrated data is stored in UEFI in a slightly different order than
+ * the calibrated-data writing function expects. Convert the data here to
+ * match that order.
+ */
+static void cali_cnv(unsigned char *data, unsigned int base, int offset)
+{
+ struct cali_reg reg_data;
+
+ memcpy(&reg_data, &data[base], sizeof(reg_data));
+ /* the data order has to be swapped between r0_low_reg and invr0_reg */
+ swap(reg_data.r0_low_reg, reg_data.invr0_reg);
+
+ cpu_to_be32_array((__force __be32 *)(data + offset + 1),
+ (u32 *)&reg_data, TASDEV_CALIB_N);
+}
+
static void tas2781_apply_calib(struct tasdevice_priv *p)
{
struct calidata *cali_data = &p->cali_data;
@@ -103,8 +120,7 @@ static void tas2781_apply_calib(struct tasdevice_priv *p)
data[l] = k;
oft++;
- for (i = 0; i < TASDEV_CALIB_N * 4; i++)
- data[l + i + 1] = data[4 * oft + i];
+ cali_cnv(data, 4 * oft, l);
k++;
}
}
@@ -130,9 +146,8 @@ static void tas2781_apply_calib(struct tasdevice_priv *p)
for (j = p->ndev - 1; j >= 0; j--) {
l = j * (cali_data->cali_dat_sz_per_dev + 1);
- for (i = TASDEV_CALIB_N * 4; i > 0 ; i--)
- data[l + i] = data[p->index * 5 + i];
- data[l+i] = j;
+ cali_cnv(data, cali_data->cali_dat_sz_per_dev * j, l);
+ data[l] = j;
}
}
@@ -178,6 +193,11 @@ int tas2781_save_calibration(struct tas2781_hda *hda)
efi_status_t status;
int i;
+ if (!efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE)) {
+ dev_err(p->dev, "%s: NO EFI FOUND!\n", __func__);
+ return -EINVAL;
+ }
+
if (hda->catlog_id < LENOVO)
efi_guid = tasdev_fct_efi_guid[hda->catlog_id];
diff --git a/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c b/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c
index 45ac5e41bd4f..b5b7a1e82b75 100644
--- a/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c
+++ b/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c
@@ -265,9 +265,9 @@ static const struct snd_kcontrol_new tas2770_snd_controls[] = {
};
static const struct snd_kcontrol_new tas2781_snd_controls[] = {
- ACARD_SINGLE_RANGE_EXT_TLV("Speaker Analog Gain", TAS2781_AMP_LEVEL,
+ ACARD_SINGLE_RANGE_EXT_TLV("Speaker Analog Volume", TAS2781_AMP_LEVEL,
1, 0, 20, 0, tas2781_amp_getvol,
- tas2781_amp_putvol, amp_vol_tlv),
+ tas2781_amp_putvol, tas2781_amp_tlv),
ACARD_SINGLE_BOOL_EXT("Speaker Force Firmware Load", 0,
tas2781_force_fwload_get, tas2781_force_fwload_put),
};
@@ -300,20 +300,26 @@ static int tas2563_save_calibration(struct tas2781_hda *h)
{
efi_guid_t efi_guid = tasdev_fct_efi_guid[LENOVO];
char *vars[TASDEV_CALIB_N] = {
- "R0_%d", "InvR0_%d", "R0_Low_%d", "Power_%d", "TLim_%d"
+ "R0_%d", "R0_Low_%d", "InvR0_%d", "Power_%d", "TLim_%d"
};
efi_char16_t efi_name[TAS2563_CAL_VAR_NAME_MAX];
unsigned long max_size = TAS2563_CAL_DATA_SIZE;
unsigned char var8[TAS2563_CAL_VAR_NAME_MAX];
- struct tasdevice_priv *p = h->hda_priv;
+ struct tasdevice_priv *p = h->priv;
struct calidata *cd = &p->cali_data;
struct cali_reg *r = &cd->cali_reg_array;
unsigned int offset = 0;
unsigned char *data;
+ __be32 bedata;
efi_status_t status;
unsigned int attr;
int ret, i, j, k;
+ if (!efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE)) {
+ dev_err(p->dev, "%s: NO EFI FOUND!\n", __func__);
+ return -EINVAL;
+ }
+
cd->cali_dat_sz_per_dev = TAS2563_CAL_DATA_SIZE * TASDEV_CALIB_N;
/* extra byte for each device is the device number */
@@ -327,8 +333,8 @@ static int tas2563_save_calibration(struct tas2781_hda *h)
data[offset] = i;
offset++;
for (j = 0; j < TASDEV_CALIB_N; ++j) {
- ret = snprintf(var8, sizeof(var8), vars[j], i);
-
+ /* EFI names for calibration data start from 1, not 0 */
+ ret = snprintf(var8, sizeof(var8), vars[j], i + 1);
if (ret < 0 || ret >= sizeof(var8) - 1) {
dev_err(p->dev, "%s: Read %s failed\n",
__func__, var8);
@@ -351,6 +357,8 @@ static int tas2563_save_calibration(struct tas2781_hda *h)
i, j, status);
return -EINVAL;
}
+ bedata = cpu_to_be32(*(uint32_t *)&data[offset]);
+ memcpy(&data[offset], &bedata, sizeof(bedata));
offset += TAS2563_CAL_DATA_SIZE;
}
}
diff --git a/sound/hda/codecs/side-codecs/tas2781_hda_spi.c b/sound/hda/codecs/side-codecs/tas2781_hda_spi.c
index 09a5d0f131b2..b9a55672bf15 100644
--- a/sound/hda/codecs/side-codecs/tas2781_hda_spi.c
+++ b/sound/hda/codecs/side-codecs/tas2781_hda_spi.c
@@ -494,9 +494,11 @@ static int tas2781_force_fwload_put(struct snd_kcontrol *kcontrol,
static struct snd_kcontrol_new tas2781_snd_ctls[] = {
ACARD_SINGLE_RANGE_EXT_TLV(NULL, TAS2781_AMP_LEVEL, 1, 0, 20, 0,
- tas2781_amp_getvol, tas2781_amp_putvol, amp_vol_tlv),
+ tas2781_amp_getvol, tas2781_amp_putvol,
+ tas2781_amp_tlv),
ACARD_SINGLE_RANGE_EXT_TLV(NULL, TAS2781_DVC_LVL, 0, 0, 200, 1,
- tas2781_digital_getvol, tas2781_digital_putvol, dvc_tlv),
+ tas2781_digital_getvol, tas2781_digital_putvol,
+ tas2781_dvc_tlv),
ACARD_SINGLE_BOOL_EXT(NULL, 0, tas2781_force_fwload_get,
tas2781_force_fwload_put),
};
diff --git a/sound/hda/controllers/intel.c b/sound/hda/controllers/intel.c
index fcf67e97a546..1bb3ff55b115 100644
--- a/sound/hda/controllers/intel.c
+++ b/sound/hda/controllers/intel.c
@@ -2077,7 +2077,6 @@ static const struct pci_device_id driver_denylist[] = {
{ PCI_DEVICE_SUB(0x1022, 0x1487, 0x1043, 0x874f) }, /* ASUS ROG Zenith II / Strix */
{ PCI_DEVICE_SUB(0x1022, 0x1487, 0x1462, 0xcb59) }, /* MSI TRX40 Creator */
{ PCI_DEVICE_SUB(0x1022, 0x1487, 0x1462, 0xcb60) }, /* MSI TRX40 */
- { PCI_DEVICE_SUB(0x1022, 0x15e3, 0x1022, 0xd601) }, /* ASRock X670E Taichi */
{}
};
diff --git a/sound/hda/core/intel-dsp-config.c b/sound/hda/core/intel-dsp-config.c
index 3cb1e7fc3b3b..2a9e35cddcf7 100644
--- a/sound/hda/core/intel-dsp-config.c
+++ b/sound/hda/core/intel-dsp-config.c
@@ -116,6 +116,13 @@ static const struct config_entry config_table[] = {
.flags = FLAG_SST,
.device = PCI_DEVICE_ID_INTEL_HDA_FCL,
},
+#else /* AVS disabled; force to legacy as SOF doesn't work for SKL or KBL */
+ {
+ .device = PCI_DEVICE_ID_INTEL_HDA_SKL_LP,
+ },
+ {
+ .device = PCI_DEVICE_ID_INTEL_HDA_KBL_LP,
+ },
#endif
#if IS_ENABLED(CONFIG_SND_SOC_SOF_APOLLOLAKE)
{
@@ -167,9 +174,9 @@ static const struct config_entry config_table[] = {
/*
* CoffeeLake, CannonLake, CometLake, IceLake, TigerLake, AlderLake,
- * RaptorLake use legacy HDAudio driver except for Google Chromebooks
- * and when DMICs are present. Two cases are required since Coreboot
- * does not expose NHLT tables.
+ * RaptorLake, MeteorLake use legacy HDAudio driver except for Google
+ * Chromebooks and when DMICs are present. Two cases are required since
+ * Coreboot does not expose NHLT tables.
*
* When the Chromebook quirk is not present, it's based on information
* that no such device exists. When the quirk is present, it could be
@@ -517,6 +524,19 @@ static const struct config_entry config_table[] = {
#if IS_ENABLED(CONFIG_SND_SOC_SOF_METEORLAKE)
/* Meteorlake-P */
{
+ .flags = FLAG_SOF,
+ .device = PCI_DEVICE_ID_INTEL_HDA_MTL,
+ .dmi_table = (const struct dmi_system_id []) {
+ {
+ .ident = "Google Chromebooks",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Google"),
+ }
+ },
+ {}
+ }
+ },
+ {
.flags = FLAG_SOF | FLAG_SOF_ONLY_IF_DMIC_OR_SOUNDWIRE,
.device = PCI_DEVICE_ID_INTEL_HDA_MTL,
},
@@ -630,6 +650,8 @@ static int snd_intel_dsp_check_soundwire(struct pci_dev *pci)
int ret;
handle = ACPI_HANDLE(&pci->dev);
+ if (!handle)
+ return -ENODEV;
ret = sdw_intel_acpi_scan(handle, &info);
if (ret < 0)
diff --git a/sound/pci/azt3328.c b/sound/pci/azt3328.c
index 4418b9ae33e6..b33344f65b8c 100644
--- a/sound/pci/azt3328.c
+++ b/sound/pci/azt3328.c
@@ -412,25 +412,25 @@ snd_azf3328_ctrl_outl(const struct snd_azf3328 *chip, unsigned reg, u32 value)
outl(value, chip->ctrl_io + reg);
}
-static inline void
+static inline void __maybe_unused
snd_azf3328_game_outb(const struct snd_azf3328 *chip, unsigned reg, u8 value)
{
outb(value, chip->game_io + reg);
}
-static inline void
+static inline void __maybe_unused
snd_azf3328_game_outw(const struct snd_azf3328 *chip, unsigned reg, u16 value)
{
outw(value, chip->game_io + reg);
}
-static inline u8
+static inline u8 __maybe_unused
snd_azf3328_game_inb(const struct snd_azf3328 *chip, unsigned reg)
{
return inb(chip->game_io + reg);
}
-static inline u16
+static inline u16 __maybe_unused
snd_azf3328_game_inw(const struct snd_azf3328 *chip, unsigned reg)
{
return inw(chip->game_io + reg);
diff --git a/sound/soc/Kconfig b/sound/soc/Kconfig
index bf362bfca456..ce74818bd715 100644
--- a/sound/soc/Kconfig
+++ b/sound/soc/Kconfig
@@ -111,7 +111,6 @@ source "sound/soc/bcm/Kconfig"
source "sound/soc/cirrus/Kconfig"
source "sound/soc/dwc/Kconfig"
source "sound/soc/fsl/Kconfig"
-source "sound/soc/generic/Kconfig"
source "sound/soc/google/Kconfig"
source "sound/soc/hisilicon/Kconfig"
source "sound/soc/jz4740/Kconfig"
@@ -149,5 +148,8 @@ source "sound/soc/codecs/Kconfig"
source "sound/soc/sdw_utils/Kconfig"
+# generic framework
+source "sound/soc/generic/Kconfig"
+
endif # SND_SOC
diff --git a/sound/soc/amd/acp/acp-i2s.c b/sound/soc/amd/acp/acp-i2s.c
index 617690362ad7..4ba0a66981ea 100644
--- a/sound/soc/amd/acp/acp-i2s.c
+++ b/sound/soc/amd/acp/acp-i2s.c
@@ -73,7 +73,7 @@ static int acp_i2s_set_fmt(struct snd_soc_dai *cpu_dai,
unsigned int fmt)
{
struct device *dev = cpu_dai->component->dev;
- struct acp_chip_info *chip = dev_get_platdata(dev);
+ struct acp_chip_info *chip = dev_get_drvdata(dev->parent);
int mode;
mode = fmt & SND_SOC_DAIFMT_FORMAT_MASK;
@@ -199,7 +199,7 @@ static int acp_i2s_hwparams(struct snd_pcm_substream *substream, struct snd_pcm_
u32 reg_val, fmt_reg, tdm_fmt;
u32 lrclk_div_val, bclk_div_val;
- chip = dev_get_platdata(dev);
+ chip = dev_get_drvdata(dev->parent);
rsrc = chip->rsrc;
/* These values are as per Hardware Spec */
@@ -386,7 +386,7 @@ static int acp_i2s_trigger(struct snd_pcm_substream *substream, int cmd, struct
{
struct acp_stream *stream = substream->runtime->private_data;
struct device *dev = dai->component->dev;
- struct acp_chip_info *chip = dev_get_platdata(dev);
+ struct acp_chip_info *chip = dev_get_drvdata(dev->parent);
struct acp_resource *rsrc = chip->rsrc;
u32 val, period_bytes, reg_val, ier_val, water_val, buf_size, buf_reg;
@@ -516,14 +516,13 @@ static int acp_i2s_trigger(struct snd_pcm_substream *substream, int cmd, struct
static int acp_i2s_prepare(struct snd_pcm_substream *substream, struct snd_soc_dai *dai)
{
struct device *dev = dai->component->dev;
- struct acp_chip_info *chip = dev_get_platdata(dev);
+ struct acp_chip_info *chip = dev_get_drvdata(dev->parent);
struct acp_resource *rsrc = chip->rsrc;
struct acp_stream *stream = substream->runtime->private_data;
u32 reg_dma_size = 0, reg_fifo_size = 0, reg_fifo_addr = 0;
u32 phy_addr = 0, acp_fifo_addr = 0, ext_int_ctrl;
unsigned int dir = substream->stream;
- chip = dev_get_platdata(dev);
switch (dai->driver->id) {
case I2S_SP_INSTANCE:
if (dir == SNDRV_PCM_STREAM_PLAYBACK) {
@@ -632,7 +631,7 @@ static int acp_i2s_startup(struct snd_pcm_substream *substream, struct snd_soc_d
{
struct acp_stream *stream = substream->runtime->private_data;
struct device *dev = dai->component->dev;
- struct acp_chip_info *chip = dev_get_platdata(dev);
+ struct acp_chip_info *chip = dev_get_drvdata(dev->parent);
struct acp_resource *rsrc = chip->rsrc;
unsigned int dir = substream->stream;
unsigned int irq_bit = 0;
diff --git a/sound/soc/amd/acp/acp-sdw-legacy-mach.c b/sound/soc/amd/acp/acp-sdw-legacy-mach.c
index c2197b75a7dd..5a3cfedacbaf 100644
--- a/sound/soc/amd/acp/acp-sdw-legacy-mach.c
+++ b/sound/soc/amd/acp/acp-sdw-legacy-mach.c
@@ -79,6 +79,22 @@ static const struct dmi_system_id soc_sdw_quirk_table[] = {
},
.driver_data = (void *)(ASOC_SDW_CODEC_SPKR),
},
+ {
+ .callback = soc_sdw_quirk_cb,
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc"),
+ DMI_EXACT_MATCH(DMI_PRODUCT_SKU, "0DD3"),
+ },
+ .driver_data = (void *)(ASOC_SDW_CODEC_SPKR),
+ },
+ {
+ .callback = soc_sdw_quirk_cb,
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc"),
+ DMI_EXACT_MATCH(DMI_PRODUCT_SKU, "0DD4"),
+ },
+ .driver_data = (void *)(ASOC_SDW_CODEC_SPKR),
+ },
{}
};
diff --git a/sound/soc/amd/acp/amd.h b/sound/soc/amd/acp/amd.h
index cb8d97122f95..73a028e67246 100644
--- a/sound/soc/amd/acp/amd.h
+++ b/sound/soc/amd/acp/amd.h
@@ -130,7 +130,7 @@
#define PDM_DMA_INTR_MASK 0x10000
#define PDM_DEC_64 0x2
#define PDM_CLK_FREQ_MASK 0x07
-#define PDM_MISC_CTRL_MASK 0x10
+#define PDM_MISC_CTRL_MASK 0x18
#define PDM_ENABLE 0x01
#define PDM_DISABLE 0x00
#define DMA_EN_MASK 0x02
diff --git a/sound/soc/codecs/aw87390.c b/sound/soc/codecs/aw87390.c
index 110009616966..ef6f64856988 100644
--- a/sound/soc/codecs/aw87390.c
+++ b/sound/soc/codecs/aw87390.c
@@ -177,7 +177,7 @@ static int aw87390_profile_info(struct snd_kcontrol *kcontrol,
{
struct snd_soc_component *codec = snd_soc_kcontrol_component(kcontrol);
struct aw87390 *aw87390 = snd_soc_component_get_drvdata(codec);
- char *prof_name, *name;
+ char *prof_name;
int count, ret;
uinfo->type = SNDRV_CTL_ELEM_TYPE_ENUMERATED;
@@ -194,17 +194,15 @@ static int aw87390_profile_info(struct snd_kcontrol *kcontrol,
if (uinfo->value.enumerated.item >= count)
uinfo->value.enumerated.item = count - 1;
- name = uinfo->value.enumerated.name;
count = uinfo->value.enumerated.item;
ret = aw87390_dev_get_prof_name(aw87390->aw_pa, count, &prof_name);
if (ret) {
- strscpy(uinfo->value.enumerated.name, "null",
- strlen("null") + 1);
+ strscpy(uinfo->value.enumerated.name, "null");
return 0;
}
- strscpy(name, prof_name, sizeof(uinfo->value.enumerated.name));
+ strscpy(uinfo->value.enumerated.name, prof_name);
return 0;
}
diff --git a/sound/soc/codecs/aw88081.c b/sound/soc/codecs/aw88081.c
index 3dd8428f08cc..d61a7b8c5470 100644
--- a/sound/soc/codecs/aw88081.c
+++ b/sound/soc/codecs/aw88081.c
@@ -914,12 +914,11 @@ static int aw88081_profile_info(struct snd_kcontrol *kcontrol,
ret = aw88081_dev_get_prof_name(aw88081->aw_pa, count, &prof_name);
if (ret) {
- strscpy(uinfo->value.enumerated.name, "null",
- sizeof(uinfo->value.enumerated.name));
+ strscpy(uinfo->value.enumerated.name, "null");
return 0;
}
- strscpy(uinfo->value.enumerated.name, prof_name, sizeof(uinfo->value.enumerated.name));
+ strscpy(uinfo->value.enumerated.name, prof_name);
return 0;
}
diff --git a/sound/soc/codecs/aw88166.c b/sound/soc/codecs/aw88166.c
index 4f76ebe11cc7..28f62b991ef2 100644
--- a/sound/soc/codecs/aw88166.c
+++ b/sound/soc/codecs/aw88166.c
@@ -1478,7 +1478,7 @@ static int aw88166_profile_info(struct snd_kcontrol *kcontrol,
{
struct snd_soc_component *codec = snd_soc_kcontrol_component(kcontrol);
struct aw88166 *aw88166 = snd_soc_component_get_drvdata(codec);
- char *prof_name, *name;
+ char *prof_name;
int count, ret;
uinfo->type = SNDRV_CTL_ELEM_TYPE_ENUMERATED;
@@ -1495,17 +1495,15 @@ static int aw88166_profile_info(struct snd_kcontrol *kcontrol,
if (uinfo->value.enumerated.item >= count)
uinfo->value.enumerated.item = count - 1;
- name = uinfo->value.enumerated.name;
count = uinfo->value.enumerated.item;
ret = aw88166_dev_get_prof_name(aw88166->aw_pa, count, &prof_name);
if (ret) {
- strscpy(uinfo->value.enumerated.name, "null",
- strlen("null") + 1);
+ strscpy(uinfo->value.enumerated.name, "null");
return 0;
}
- strscpy(name, prof_name, sizeof(uinfo->value.enumerated.name));
+ strscpy(uinfo->value.enumerated.name, prof_name);
return 0;
}
diff --git a/sound/soc/codecs/aw88261.c b/sound/soc/codecs/aw88261.c
index fb99871578c5..de11ae8dd9d9 100644
--- a/sound/soc/codecs/aw88261.c
+++ b/sound/soc/codecs/aw88261.c
@@ -819,7 +819,7 @@ static int aw88261_profile_info(struct snd_kcontrol *kcontrol,
{
struct snd_soc_component *codec = snd_soc_kcontrol_component(kcontrol);
struct aw88261 *aw88261 = snd_soc_component_get_drvdata(codec);
- char *prof_name, *name;
+ char *prof_name;
int count, ret;
uinfo->type = SNDRV_CTL_ELEM_TYPE_ENUMERATED;
@@ -836,17 +836,15 @@ static int aw88261_profile_info(struct snd_kcontrol *kcontrol,
if (uinfo->value.enumerated.item >= count)
uinfo->value.enumerated.item = count - 1;
- name = uinfo->value.enumerated.name;
count = uinfo->value.enumerated.item;
ret = aw88261_dev_get_prof_name(aw88261->aw_pa, count, &prof_name);
if (ret) {
- strscpy(uinfo->value.enumerated.name, "null",
- strlen("null") + 1);
+ strscpy(uinfo->value.enumerated.name, "null");
return 0;
}
- strscpy(name, prof_name, sizeof(uinfo->value.enumerated.name));
+ strscpy(uinfo->value.enumerated.name, prof_name);
return 0;
}
diff --git a/sound/soc/codecs/aw88395/aw88395.c b/sound/soc/codecs/aw88395/aw88395.c
index aea44a199b98..fb563b4c6971 100644
--- a/sound/soc/codecs/aw88395/aw88395.c
+++ b/sound/soc/codecs/aw88395/aw88395.c
@@ -175,7 +175,7 @@ static int aw88395_profile_info(struct snd_kcontrol *kcontrol,
{
struct snd_soc_component *codec = snd_soc_kcontrol_component(kcontrol);
struct aw88395 *aw88395 = snd_soc_component_get_drvdata(codec);
- char *prof_name, *name;
+ char *prof_name;
int count, ret;
uinfo->type = SNDRV_CTL_ELEM_TYPE_ENUMERATED;
@@ -192,17 +192,15 @@ static int aw88395_profile_info(struct snd_kcontrol *kcontrol,
if (uinfo->value.enumerated.item >= count)
uinfo->value.enumerated.item = count - 1;
- name = uinfo->value.enumerated.name;
count = uinfo->value.enumerated.item;
ret = aw88395_dev_get_prof_name(aw88395->aw_pa, count, &prof_name);
if (ret) {
- strscpy(uinfo->value.enumerated.name, "null",
- strlen("null") + 1);
+ strscpy(uinfo->value.enumerated.name, "null");
return 0;
}
- strscpy(name, prof_name, sizeof(uinfo->value.enumerated.name));
+ strscpy(uinfo->value.enumerated.name, prof_name);
return 0;
}
diff --git a/sound/soc/codecs/aw88399.c b/sound/soc/codecs/aw88399.c
index c23e70d64d0c..58846feb013d 100644
--- a/sound/soc/codecs/aw88399.c
+++ b/sound/soc/codecs/aw88399.c
@@ -1831,7 +1831,7 @@ static int aw88399_profile_info(struct snd_kcontrol *kcontrol,
{
struct snd_soc_component *codec = snd_soc_kcontrol_component(kcontrol);
struct aw88399 *aw88399 = snd_soc_component_get_drvdata(codec);
- char *prof_name, *name;
+ char *prof_name;
int count, ret;
uinfo->type = SNDRV_CTL_ELEM_TYPE_ENUMERATED;
@@ -1848,17 +1848,15 @@ static int aw88399_profile_info(struct snd_kcontrol *kcontrol,
if (uinfo->value.enumerated.item >= count)
uinfo->value.enumerated.item = count - 1;
- name = uinfo->value.enumerated.name;
count = uinfo->value.enumerated.item;
ret = aw88399_dev_get_prof_name(aw88399->aw_pa, count, &prof_name);
if (ret) {
- strscpy(uinfo->value.enumerated.name, "null",
- strlen("null") + 1);
+ strscpy(uinfo->value.enumerated.name, "null");
return 0;
}
- strscpy(name, prof_name, sizeof(uinfo->value.enumerated.name));
+ strscpy(uinfo->value.enumerated.name, prof_name);
return 0;
}
diff --git a/sound/soc/codecs/cs35l56-sdw.c b/sound/soc/codecs/cs35l56-sdw.c
index ee14031695a1..3905c9cb188a 100644
--- a/sound/soc/codecs/cs35l56-sdw.c
+++ b/sound/soc/codecs/cs35l56-sdw.c
@@ -393,74 +393,6 @@ static int cs35l56_sdw_update_status(struct sdw_slave *peripheral,
return 0;
}
-static int cs35l63_sdw_kick_divider(struct cs35l56_private *cs35l56,
- struct sdw_slave *peripheral)
-{
- unsigned int curr_scale_reg, next_scale_reg;
- int curr_scale, next_scale, ret;
-
- if (!cs35l56->base.init_done)
- return 0;
-
- if (peripheral->bus->params.curr_bank) {
- curr_scale_reg = SDW_SCP_BUSCLOCK_SCALE_B1;
- next_scale_reg = SDW_SCP_BUSCLOCK_SCALE_B0;
- } else {
- curr_scale_reg = SDW_SCP_BUSCLOCK_SCALE_B0;
- next_scale_reg = SDW_SCP_BUSCLOCK_SCALE_B1;
- }
-
- /*
- * Current clock scale value must be different to new value.
- * Modify current to guarantee this. If next still has the dummy
- * value we wrote when it was current, the core code has not set
- * a new scale so restore its original good value
- */
- curr_scale = sdw_read_no_pm(peripheral, curr_scale_reg);
- if (curr_scale < 0) {
- dev_err(cs35l56->base.dev, "Failed to read current clock scale: %d\n", curr_scale);
- return curr_scale;
- }
-
- next_scale = sdw_read_no_pm(peripheral, next_scale_reg);
- if (next_scale < 0) {
- dev_err(cs35l56->base.dev, "Failed to read next clock scale: %d\n", next_scale);
- return next_scale;
- }
-
- if (next_scale == CS35L56_SDW_INVALID_BUS_SCALE) {
- next_scale = cs35l56->old_sdw_clock_scale;
- ret = sdw_write_no_pm(peripheral, next_scale_reg, next_scale);
- if (ret < 0) {
- dev_err(cs35l56->base.dev, "Failed to modify current clock scale: %d\n",
- ret);
- return ret;
- }
- }
-
- cs35l56->old_sdw_clock_scale = curr_scale;
- ret = sdw_write_no_pm(peripheral, curr_scale_reg, CS35L56_SDW_INVALID_BUS_SCALE);
- if (ret < 0) {
- dev_err(cs35l56->base.dev, "Failed to modify current clock scale: %d\n", ret);
- return ret;
- }
-
- dev_dbg(cs35l56->base.dev, "Next bus scale: %#x\n", next_scale);
-
- return 0;
-}
-
-static int cs35l56_sdw_bus_config(struct sdw_slave *peripheral,
- struct sdw_bus_params *params)
-{
- struct cs35l56_private *cs35l56 = dev_get_drvdata(&peripheral->dev);
-
- if ((cs35l56->base.type == 0x63) && (cs35l56->base.rev < 0xa1))
- return cs35l63_sdw_kick_divider(cs35l56, peripheral);
-
- return 0;
-}
-
static int __maybe_unused cs35l56_sdw_clk_stop(struct sdw_slave *peripheral,
enum sdw_clk_stop_mode mode,
enum sdw_clk_stop_type type)
@@ -476,7 +408,6 @@ static const struct sdw_slave_ops cs35l56_sdw_ops = {
.read_prop = cs35l56_sdw_read_prop,
.interrupt_callback = cs35l56_sdw_interrupt,
.update_status = cs35l56_sdw_update_status,
- .bus_config = cs35l56_sdw_bus_config,
#ifdef DEBUG
.clk_stop = cs35l56_sdw_clk_stop,
#endif
diff --git a/sound/soc/codecs/cs35l56-shared.c b/sound/soc/codecs/cs35l56-shared.c
index ba653f6ccfae..850fcf385996 100644
--- a/sound/soc/codecs/cs35l56-shared.c
+++ b/sound/soc/codecs/cs35l56-shared.c
@@ -838,6 +838,15 @@ const struct cirrus_amp_cal_controls cs35l56_calibration_controls = {
};
EXPORT_SYMBOL_NS_GPL(cs35l56_calibration_controls, "SND_SOC_CS35L56_SHARED");
+static const struct cirrus_amp_cal_controls cs35l63_calibration_controls = {
+ .alg_id = 0xbf210,
+ .mem_region = WMFW_ADSP2_YM,
+ .ambient = "CAL_AMBIENT",
+ .calr = "CAL_R",
+ .status = "CAL_STATUS",
+ .checksum = "CAL_CHECKSUM",
+};
+
int cs35l56_get_calibration(struct cs35l56_base *cs35l56_base)
{
u64 silicon_uid = 0;
@@ -912,19 +921,31 @@ EXPORT_SYMBOL_NS_GPL(cs35l56_read_prot_status, "SND_SOC_CS35L56_SHARED");
void cs35l56_log_tuning(struct cs35l56_base *cs35l56_base, struct cs_dsp *cs_dsp)
{
__be32 pid, sid, tid;
+ unsigned int alg_id;
int ret;
+ switch (cs35l56_base->type) {
+ case 0x54:
+ case 0x56:
+ case 0x57:
+ alg_id = 0x9f212;
+ break;
+ default:
+ alg_id = 0xbf212;
+ break;
+ }
+
scoped_guard(mutex, &cs_dsp->pwr_lock) {
ret = cs_dsp_coeff_read_ctrl(cs_dsp_get_ctl(cs_dsp, "AS_PRJCT_ID",
- WMFW_ADSP2_XM, 0x9f212),
+ WMFW_ADSP2_XM, alg_id),
0, &pid, sizeof(pid));
if (!ret)
ret = cs_dsp_coeff_read_ctrl(cs_dsp_get_ctl(cs_dsp, "AS_CHNNL_ID",
- WMFW_ADSP2_XM, 0x9f212),
+ WMFW_ADSP2_XM, alg_id),
0, &sid, sizeof(sid));
if (!ret)
ret = cs_dsp_coeff_read_ctrl(cs_dsp_get_ctl(cs_dsp, "AS_SNPSHT_ID",
- WMFW_ADSP2_XM, 0x9f212),
+ WMFW_ADSP2_XM, alg_id),
0, &tid, sizeof(tid));
}
@@ -974,8 +995,10 @@ int cs35l56_hw_init(struct cs35l56_base *cs35l56_base)
case 0x35A54:
case 0x35A56:
case 0x35A57:
+ cs35l56_base->calibration_controls = &cs35l56_calibration_controls;
break;
case 0x35A630:
+ cs35l56_base->calibration_controls = &cs35l63_calibration_controls;
devid = devid >> 4;
break;
default:
diff --git a/sound/soc/codecs/cs35l56.c b/sound/soc/codecs/cs35l56.c
index b1c65d8331e7..2c1edbd636ef 100644
--- a/sound/soc/codecs/cs35l56.c
+++ b/sound/soc/codecs/cs35l56.c
@@ -695,7 +695,7 @@ static int cs35l56_write_cal(struct cs35l56_private *cs35l56)
return ret;
ret = cs_amp_write_cal_coeffs(&cs35l56->dsp.cs_dsp,
- &cs35l56_calibration_controls,
+ cs35l56->base.calibration_controls,
&cs35l56->base.cal_data);
wm_adsp_stop(&cs35l56->dsp);
diff --git a/sound/soc/codecs/cs35l56.h b/sound/soc/codecs/cs35l56.h
index bd77a57249d7..40a1800a4585 100644
--- a/sound/soc/codecs/cs35l56.h
+++ b/sound/soc/codecs/cs35l56.h
@@ -20,8 +20,6 @@
#define CS35L56_SDW_GEN_INT_MASK_1 0xc1
#define CS35L56_SDW_INT_MASK_CODEC_IRQ BIT(0)
-#define CS35L56_SDW_INVALID_BUS_SCALE 0xf
-
#define CS35L56_RX_FORMATS (SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE)
#define CS35L56_TX_FORMATS (SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE \
| SNDRV_PCM_FMTBIT_S32_LE)
@@ -52,7 +50,6 @@ struct cs35l56_private {
u8 asp_slot_count;
bool tdm_mode;
bool sysclk_set;
- u8 old_sdw_clock_scale;
u8 sdw_link_num;
u8 sdw_unique_id;
};
diff --git a/sound/soc/codecs/es8389.c b/sound/soc/codecs/es8389.c
index ba1763f36f17..6e4c75d288ef 100644
--- a/sound/soc/codecs/es8389.c
+++ b/sound/soc/codecs/es8389.c
@@ -636,7 +636,7 @@ static int es8389_set_bias_level(struct snd_soc_component *component,
regmap_write(es8389->regmap, ES8389_ANA_CTL1, 0x59);
regmap_write(es8389->regmap, ES8389_ADC_EN, 0x00);
regmap_write(es8389->regmap, ES8389_CLK_OFF1, 0x00);
- regmap_write(es8389->regmap, ES8389_RESET, 0x7E);
+ regmap_write(es8389->regmap, ES8389_RESET, 0x3E);
regmap_update_bits(es8389->regmap, ES8389_DAC_INV, 0x80, 0x80);
usleep_range(8000, 8500);
regmap_update_bits(es8389->regmap, ES8389_DAC_INV, 0x80, 0x00);
diff --git a/sound/soc/codecs/idt821034.c b/sound/soc/codecs/idt821034.c
index 6738cf21983b..a03d4e5e7d14 100644
--- a/sound/soc/codecs/idt821034.c
+++ b/sound/soc/codecs/idt821034.c
@@ -1067,7 +1067,7 @@ static int idt821034_chip_direction_output(struct gpio_chip *c, unsigned int off
ret = idt821034_set_slic_conf(idt821034, ch, slic_conf);
if (ret) {
- dev_err(&idt821034->spi->dev, "dir in gpio %d (%u, 0x%x) failed (%d)\n",
+ dev_err(&idt821034->spi->dev, "dir out gpio %d (%u, 0x%x) failed (%d)\n",
offset, ch, mask, ret);
}
diff --git a/sound/soc/codecs/lpass-rx-macro.c b/sound/soc/codecs/lpass-rx-macro.c
index 238dbdb46c18..a8fc842cc94e 100644
--- a/sound/soc/codecs/lpass-rx-macro.c
+++ b/sound/soc/codecs/lpass-rx-macro.c
@@ -618,6 +618,7 @@ static struct interp_sample_rate sr_val_tbl[] = {
{176400, 0xB}, {352800, 0xC},
};
+/* Also matches rx_macro_mux_text */
enum {
RX_MACRO_AIF1_PB,
RX_MACRO_AIF2_PB,
@@ -722,6 +723,7 @@ static const char * const rx_int2_2_interp_mux_text[] = {
"ZERO", "RX INT2_2 MUX",
};
+/* Order must match RX_MACRO_MAX_DAIS enum (offset by 1) */
static const char *const rx_macro_mux_text[] = {
"ZERO", "AIF1_PB", "AIF2_PB", "AIF3_PB", "AIF4_PB"
};
@@ -2474,6 +2476,7 @@ static int rx_macro_mux_put(struct snd_kcontrol *kcontrol,
struct soc_enum *e = (struct soc_enum *)kcontrol->private_value;
struct snd_soc_dapm_update *update = NULL;
u32 rx_port_value = ucontrol->value.enumerated.item[0];
+ unsigned int dai_id;
u32 aif_rst;
struct rx_macro *rx = snd_soc_component_get_drvdata(component);
@@ -2490,19 +2493,24 @@ static int rx_macro_mux_put(struct snd_kcontrol *kcontrol,
switch (rx_port_value) {
case 0:
- if (rx->active_ch_cnt[aif_rst]) {
- clear_bit(widget->shift,
- &rx->active_ch_mask[aif_rst]);
- rx->active_ch_cnt[aif_rst]--;
+ /*
+ * active_ch_cnt and active_ch_mask use DAI IDs (RX_MACRO_MAX_DAIS).
+ * active_ch_cnt == 0 was tested in if() above.
+ */
+ dai_id = aif_rst - 1;
+ if (rx->active_ch_cnt[dai_id]) {
+ clear_bit(widget->shift, &rx->active_ch_mask[dai_id]);
+ rx->active_ch_cnt[dai_id]--;
}
break;
case 1:
case 2:
case 3:
case 4:
- set_bit(widget->shift,
- &rx->active_ch_mask[rx_port_value]);
- rx->active_ch_cnt[rx_port_value]++;
+ /* active_ch_cnt and active_ch_mask use DAI IDs (RX_MACRO_MAX_DAIS). */
+ dai_id = rx_port_value - 1;
+ set_bit(widget->shift, &rx->active_ch_mask[dai_id]);
+ rx->active_ch_cnt[dai_id]++;
break;
default:
dev_err(component->dev,
diff --git a/sound/soc/codecs/lpass-tx-macro.c b/sound/soc/codecs/lpass-tx-macro.c
index 40d79bee4584..1da34cb3505f 100644
--- a/sound/soc/codecs/lpass-tx-macro.c
+++ b/sound/soc/codecs/lpass-tx-macro.c
@@ -2229,7 +2229,7 @@ static int tx_macro_register_mclk_output(struct tx_macro *tx)
}
static const struct snd_soc_component_driver tx_macro_component_drv = {
- .name = "RX-MACRO",
+ .name = "TX-MACRO",
.probe = tx_macro_component_probe,
.controls = tx_macro_snd_controls,
.num_controls = ARRAY_SIZE(tx_macro_snd_controls),
diff --git a/sound/soc/codecs/lpass-wsa-macro.c b/sound/soc/codecs/lpass-wsa-macro.c
index da6adb3de21d..d7eec9fdaf9c 100644
--- a/sound/soc/codecs/lpass-wsa-macro.c
+++ b/sound/soc/codecs/lpass-wsa-macro.c
@@ -368,6 +368,7 @@ static struct interp_sample_rate int_mix_sample_rate_val[] = {
{192000, 0x6}, /* 192K */
};
+/* Also matches rx_mux_text */
enum {
WSA_MACRO_AIF1_PB,
WSA_MACRO_AIF_MIX1_PB,
@@ -465,6 +466,7 @@ static const char *const rx_mix_ec_text[] = {
"ZERO", "RX_MIX_TX0", "RX_MIX_TX1"
};
+/* Order must match WSA_MACRO_MAX_DAIS enum (offset by 1) */
static const char *const rx_mux_text[] = {
"ZERO", "AIF1_PB", "AIF_MIX1_PB"
};
@@ -2207,6 +2209,7 @@ static int wsa_macro_rx_mux_put(struct snd_kcontrol *kcontrol,
u32 rx_port_value = ucontrol->value.integer.value[0];
u32 bit_input;
u32 aif_rst;
+ unsigned int dai_id;
struct wsa_macro *wsa = snd_soc_component_get_drvdata(component);
aif_rst = wsa->rx_port_value[widget->shift];
@@ -2224,17 +2227,22 @@ static int wsa_macro_rx_mux_put(struct snd_kcontrol *kcontrol,
switch (rx_port_value) {
case 0:
- if (wsa->active_ch_cnt[aif_rst]) {
- clear_bit(bit_input,
- &wsa->active_ch_mask[aif_rst]);
- wsa->active_ch_cnt[aif_rst]--;
+ /*
+ * active_ch_cnt and active_ch_mask use DAI IDs (WSA_MACRO_MAX_DAIS).
+ * active_ch_cnt == 0 was tested in if() above.
+ */
+ dai_id = aif_rst - 1;
+ if (wsa->active_ch_cnt[dai_id]) {
+ clear_bit(bit_input, &wsa->active_ch_mask[dai_id]);
+ wsa->active_ch_cnt[dai_id]--;
}
break;
case 1:
case 2:
- set_bit(bit_input,
- &wsa->active_ch_mask[rx_port_value]);
- wsa->active_ch_cnt[rx_port_value]++;
+ /* active_ch_cnt and active_ch_mask use DAI IDs (WSA_MACRO_MAX_DAIS). */
+ dai_id = rx_port_value - 1;
+ set_bit(bit_input, &wsa->active_ch_mask[dai_id]);
+ wsa->active_ch_cnt[dai_id]++;
break;
default:
dev_err(component->dev,
diff --git a/sound/soc/codecs/rt1320-sdw.c b/sound/soc/codecs/rt1320-sdw.c
index b13d7a99bf63..dcddc28e8856 100644
--- a/sound/soc/codecs/rt1320-sdw.c
+++ b/sound/soc/codecs/rt1320-sdw.c
@@ -109,6 +109,7 @@ static const struct reg_sequence rt1320_blind_write[] = {
{ 0x0000d540, 0x01 },
{ 0xd172, 0x2a },
{ 0xc5d6, 0x01 },
+ { 0xd478, 0xff },
};
static const struct reg_sequence rt1320_vc_blind_write[] = {
@@ -159,7 +160,7 @@ static const struct reg_sequence rt1320_vc_blind_write[] = {
{ 0xd471, 0x3a },
{ 0xd474, 0x11 },
{ 0xd475, 0x32 },
- { 0xd478, 0x64 },
+ { 0xd478, 0xff },
{ 0xd479, 0x20 },
{ 0xd47a, 0x10 },
{ 0xd47c, 0xff },
diff --git a/sound/soc/codecs/rt5682s.c b/sound/soc/codecs/rt5682s.c
index 80b921695e7d..1d80a4b862e2 100644
--- a/sound/soc/codecs/rt5682s.c
+++ b/sound/soc/codecs/rt5682s.c
@@ -653,14 +653,15 @@ static void rt5682s_sar_power_mode(struct snd_soc_component *component, int mode
switch (mode) {
case SAR_PWR_SAVING:
snd_soc_component_update_bits(component, RT5682S_CBJ_CTRL_3,
- RT5682S_CBJ_IN_BUF_MASK, RT5682S_CBJ_IN_BUF_DIS);
+ RT5682S_CBJ_IN_BUF_MASK, RT5682S_CBJ_IN_BUF_EN);
snd_soc_component_update_bits(component, RT5682S_CBJ_CTRL_1,
- RT5682S_MB1_PATH_MASK | RT5682S_MB2_PATH_MASK,
- RT5682S_CTRL_MB1_REG | RT5682S_CTRL_MB2_REG);
+ RT5682S_MB1_PATH_MASK | RT5682S_MB2_PATH_MASK |
+ RT5682S_VREF_POW_MASK, RT5682S_CTRL_MB1_FSM |
+ RT5682S_CTRL_MB2_FSM | RT5682S_VREF_POW_FSM);
snd_soc_component_update_bits(component, RT5682S_SAR_IL_CMD_1,
RT5682S_SAR_BUTDET_MASK | RT5682S_SAR_BUTDET_POW_MASK |
RT5682S_SAR_SEL_MB1_2_CTL_MASK, RT5682S_SAR_BUTDET_DIS |
- RT5682S_SAR_BUTDET_POW_SAV | RT5682S_SAR_SEL_MB1_2_MANU);
+ RT5682S_SAR_BUTDET_POW_NORM | RT5682S_SAR_SEL_MB1_2_MANU);
usleep_range(5000, 5500);
snd_soc_component_update_bits(component, RT5682S_SAR_IL_CMD_1,
RT5682S_SAR_BUTDET_MASK, RT5682S_SAR_BUTDET_EN);
@@ -688,7 +689,7 @@ static void rt5682s_sar_power_mode(struct snd_soc_component *component, int mode
snd_soc_component_update_bits(component, RT5682S_SAR_IL_CMD_1,
RT5682S_SAR_BUTDET_MASK | RT5682S_SAR_BUTDET_POW_MASK |
RT5682S_SAR_SEL_MB1_2_CTL_MASK, RT5682S_SAR_BUTDET_DIS |
- RT5682S_SAR_BUTDET_POW_SAV | RT5682S_SAR_SEL_MB1_2_MANU);
+ RT5682S_SAR_BUTDET_POW_NORM | RT5682S_SAR_SEL_MB1_2_MANU);
break;
default:
dev_err(component->dev, "Invalid SAR Power mode: %d\n", mode);
@@ -725,7 +726,7 @@ static void rt5682s_disable_push_button_irq(struct snd_soc_component *component)
snd_soc_component_update_bits(component, RT5682S_SAR_IL_CMD_1,
RT5682S_SAR_BUTDET_MASK | RT5682S_SAR_BUTDET_POW_MASK |
RT5682S_SAR_SEL_MB1_2_CTL_MASK, RT5682S_SAR_BUTDET_DIS |
- RT5682S_SAR_BUTDET_POW_SAV | RT5682S_SAR_SEL_MB1_2_MANU);
+ RT5682S_SAR_BUTDET_POW_NORM | RT5682S_SAR_SEL_MB1_2_MANU);
}
/**
@@ -786,7 +787,7 @@ static int rt5682s_headset_detect(struct snd_soc_component *component, int jack_
jack_type = SND_JACK_HEADSET;
snd_soc_component_write(component, RT5682S_SAR_IL_CMD_3, 0x024c);
snd_soc_component_update_bits(component, RT5682S_CBJ_CTRL_1,
- RT5682S_FAST_OFF_MASK, RT5682S_FAST_OFF_EN);
+ RT5682S_FAST_OFF_MASK, RT5682S_FAST_OFF_DIS);
snd_soc_component_update_bits(component, RT5682S_SAR_IL_CMD_1,
RT5682S_SAR_SEL_MB1_2_MASK, val << RT5682S_SAR_SEL_MB1_2_SFT);
rt5682s_enable_push_button_irq(component);
@@ -966,7 +967,7 @@ static int rt5682s_set_jack_detect(struct snd_soc_component *component,
RT5682S_EMB_JD_MASK | RT5682S_DET_TYPE |
RT5682S_POL_FAST_OFF_MASK | RT5682S_MIC_CAP_MASK,
RT5682S_EMB_JD_EN | RT5682S_DET_TYPE |
- RT5682S_POL_FAST_OFF_HIGH | RT5682S_MIC_CAP_HS);
+ RT5682S_POL_FAST_OFF_LOW | RT5682S_MIC_CAP_HS);
regmap_update_bits(rt5682s->regmap, RT5682S_SAR_IL_CMD_1,
RT5682S_SAR_POW_MASK, RT5682S_SAR_POW_EN);
regmap_update_bits(rt5682s->regmap, RT5682S_GPIO_CTRL_1,
diff --git a/sound/soc/codecs/rt712-sdca.c b/sound/soc/codecs/rt712-sdca.c
index 5b298db5f0f6..0ebaae426e73 100644
--- a/sound/soc/codecs/rt712-sdca.c
+++ b/sound/soc/codecs/rt712-sdca.c
@@ -1890,11 +1890,9 @@ int rt712_sdca_io_init(struct device *dev, struct sdw_slave *slave)
rt712_sdca_va_io_init(rt712);
} else {
- if (!rt712->dmic_function_found) {
- dev_err(&slave->dev, "%s RT712 VB detected but no SMART_MIC function exposed in ACPI\n",
+ if (!rt712->dmic_function_found)
+ dev_warn(&slave->dev, "%s RT712 VB detected but no SMART_MIC function exposed in ACPI\n",
__func__);
- goto suspend;
- }
/* multilanes and DMIC are supported by rt712vb */
prop->lane_control_support = true;
diff --git a/sound/soc/codecs/rt721-sdca.c b/sound/soc/codecs/rt721-sdca.c
index f6f7c2ffde1c..a4bd29d7220b 100644
--- a/sound/soc/codecs/rt721-sdca.c
+++ b/sound/soc/codecs/rt721-sdca.c
@@ -278,6 +278,8 @@ static void rt721_sdca_jack_preset(struct rt721_sdca_priv *rt721)
RT721_ENT_FLOAT_CTL1, 0x4040);
rt_sdca_index_write(rt721->mbq_regmap, RT721_HDA_SDCA_FLOAT,
RT721_ENT_FLOAT_CTL4, 0x1201);
+ rt_sdca_index_write(rt721->mbq_regmap, RT721_BOOST_CTRL,
+ RT721_BST_4CH_TOP_GATING_CTRL1, 0x002a);
regmap_write(rt721->regmap, 0x2f58, 0x07);
}
diff --git a/sound/soc/codecs/rt721-sdca.h b/sound/soc/codecs/rt721-sdca.h
index 0a82c107b19a..71fac9cd8739 100644
--- a/sound/soc/codecs/rt721-sdca.h
+++ b/sound/soc/codecs/rt721-sdca.h
@@ -56,6 +56,7 @@ struct rt721_sdca_dmic_kctrl_priv {
#define RT721_CBJ_CTRL 0x0a
#define RT721_CAP_PORT_CTRL 0x0c
#define RT721_CLASD_AMP_CTRL 0x0d
+#define RT721_BOOST_CTRL 0x0f
#define RT721_VENDOR_REG 0x20
#define RT721_RC_CALIB_CTRL 0x40
#define RT721_VENDOR_EQ_L 0x53
@@ -93,6 +94,9 @@ struct rt721_sdca_dmic_kctrl_priv {
/* Index (NID:0dh) */
#define RT721_CLASD_AMP_2CH_CAL 0x14
+/* Index (NID:0fh) */
+#define RT721_BST_4CH_TOP_GATING_CTRL1 0x05
+
/* Index (NID:20h) */
#define RT721_JD_PRODUCT_NUM 0x00
#define RT721_ANALOG_BIAS_CTL3 0x04
diff --git a/sound/soc/codecs/sma1307.c b/sound/soc/codecs/sma1307.c
index b3d401ada176..b683e676640d 100644
--- a/sound/soc/codecs/sma1307.c
+++ b/sound/soc/codecs/sma1307.c
@@ -1737,9 +1737,10 @@ static void sma1307_setting_loaded(struct sma1307_priv *sma1307, const char *fil
sma1307->set.checksum = data[sma1307->set.header_size - 2];
sma1307->set.num_mode = data[sma1307->set.header_size - 1];
num_mode = sma1307->set.num_mode;
- sma1307->set.header = devm_kzalloc(sma1307->dev,
- sma1307->set.header_size,
- GFP_KERNEL);
+ sma1307->set.header = devm_kmalloc_array(sma1307->dev,
+ sma1307->set.header_size,
+ sizeof(int),
+ GFP_KERNEL);
if (!sma1307->set.header) {
sma1307->set.status = false;
return;
@@ -1749,7 +1750,7 @@ static void sma1307_setting_loaded(struct sma1307_priv *sma1307, const char *fil
sma1307->set.header_size * sizeof(int));
if ((sma1307->set.checksum >> 8) != SMA1307_SETTING_CHECKSUM) {
- dev_err(sma1307->dev, "%s: failed by dismatch \"%s\"\n",
+ dev_err(sma1307->dev, "%s: checksum failed \"%s\"\n",
__func__, setting_file);
sma1307->set.status = false;
return;
diff --git a/sound/soc/codecs/tas2781-i2c.c b/sound/soc/codecs/tas2781-i2c.c
index 9f4d965a1335..0e09d794516f 100644
--- a/sound/soc/codecs/tas2781-i2c.c
+++ b/sound/soc/codecs/tas2781-i2c.c
@@ -908,12 +908,12 @@ static const struct snd_kcontrol_new tasdevice_cali_controls[] = {
};
static const struct snd_kcontrol_new tas2781_snd_controls[] = {
- SOC_SINGLE_RANGE_EXT_TLV("Speaker Analog Gain", TAS2781_AMP_LEVEL,
+ SOC_SINGLE_RANGE_EXT_TLV("Speaker Analog Volume", TAS2781_AMP_LEVEL,
1, 0, 20, 0, tas2781_amp_getvol,
- tas2781_amp_putvol, amp_vol_tlv),
- SOC_SINGLE_RANGE_EXT_TLV("Speaker Digital Gain", TAS2781_DVC_LVL,
+ tas2781_amp_putvol, tas2781_amp_tlv),
+ SOC_SINGLE_RANGE_EXT_TLV("Speaker Digital Volume", TAS2781_DVC_LVL,
0, 0, 200, 1, tas2781_digital_getvol,
- tas2781_digital_putvol, dvc_tlv),
+ tas2781_digital_putvol, tas2781_dvc_tlv),
};
static const struct snd_kcontrol_new tas2781_cali_controls[] = {
@@ -1480,7 +1480,7 @@ static ssize_t acoustic_ctl_write(struct file *file,
return PTR_ERR(src);
if (src[0] > max_pkg_len && src[0] != count) {
- dev_err(priv->dev, "pkg(%u), max(%u), count(%u) dismatch.\n",
+ dev_err(priv->dev, "pkg(%u), max(%u), count(%u) mismatch.\n",
src[0], max_pkg_len, (unsigned int)count);
ret = 0;
goto exit;
diff --git a/sound/soc/codecs/wm8940.c b/sound/soc/codecs/wm8940.c
index 401ee20897b1..94873ea63014 100644
--- a/sound/soc/codecs/wm8940.c
+++ b/sound/soc/codecs/wm8940.c
@@ -220,7 +220,7 @@ static const struct snd_kcontrol_new wm8940_snd_controls[] = {
SOC_SINGLE_TLV("Digital Capture Volume", WM8940_ADCVOL,
0, 255, 0, wm8940_adc_tlv),
SOC_ENUM("Mic Bias Level", wm8940_mic_bias_level_enum),
- SOC_SINGLE_TLV("Capture Boost Volue", WM8940_ADCBOOST,
+ SOC_SINGLE_TLV("Capture Boost Volume", WM8940_ADCBOOST,
8, 1, 0, wm8940_capture_boost_vol_tlv),
SOC_SINGLE_TLV("Speaker Playback Volume", WM8940_SPKVOL,
0, 63, 0, wm8940_spk_vol_tlv),
@@ -693,7 +693,12 @@ static int wm8940_update_clocks(struct snd_soc_dai *dai)
f = wm8940_get_mclkdiv(priv->mclk, fs256, &mclkdiv);
if (f != priv->mclk) {
/* The PLL performs best around 90MHz */
- fpll = wm8940_get_mclkdiv(22500000, fs256, &mclkdiv);
+ if (fs256 % 8000)
+ f = 22579200;
+ else
+ f = 24576000;
+
+ fpll = wm8940_get_mclkdiv(f, fs256, &mclkdiv);
}
wm8940_set_dai_pll(dai, 0, 0, priv->mclk, fpll);
diff --git a/sound/soc/codecs/wm8974.c b/sound/soc/codecs/wm8974.c
index bdf437a5403f..db16d893a235 100644
--- a/sound/soc/codecs/wm8974.c
+++ b/sound/soc/codecs/wm8974.c
@@ -419,10 +419,14 @@ static int wm8974_update_clocks(struct snd_soc_dai *dai)
fs256 = 256 * priv->fs;
f = wm8974_get_mclkdiv(priv->mclk, fs256, &mclkdiv);
-
if (f != priv->mclk) {
/* The PLL performs best around 90MHz */
- fpll = wm8974_get_mclkdiv(22500000, fs256, &mclkdiv);
+ if (fs256 % 8000)
+ f = 22579200;
+ else
+ f = 24576000;
+
+ fpll = wm8974_get_mclkdiv(f, fs256, &mclkdiv);
}
wm8974_set_dai_pll(dai, 0, 0, priv->mclk, fpll);
diff --git a/sound/soc/fsl/fsl_sai.c b/sound/soc/fsl/fsl_sai.c
index c313b654236c..d0367b21f775 100644
--- a/sound/soc/fsl/fsl_sai.c
+++ b/sound/soc/fsl/fsl_sai.c
@@ -809,9 +809,9 @@ static void fsl_sai_config_disable(struct fsl_sai *sai, int dir)
* are running concurrently.
*/
/* Software Reset */
- regmap_write(sai->regmap, FSL_SAI_xCSR(tx, ofs), FSL_SAI_CSR_SR);
+ regmap_update_bits(sai->regmap, FSL_SAI_xCSR(tx, ofs), FSL_SAI_CSR_SR, FSL_SAI_CSR_SR);
/* Clear SR bit to finish the reset */
- regmap_write(sai->regmap, FSL_SAI_xCSR(tx, ofs), 0);
+ regmap_update_bits(sai->regmap, FSL_SAI_xCSR(tx, ofs), FSL_SAI_CSR_SR, 0);
}
static int fsl_sai_trigger(struct snd_pcm_substream *substream, int cmd,
@@ -930,11 +930,11 @@ static int fsl_sai_dai_probe(struct snd_soc_dai *cpu_dai)
unsigned int ofs = sai->soc_data->reg_offset;
/* Software Reset for both Tx and Rx */
- regmap_write(sai->regmap, FSL_SAI_TCSR(ofs), FSL_SAI_CSR_SR);
- regmap_write(sai->regmap, FSL_SAI_RCSR(ofs), FSL_SAI_CSR_SR);
+ regmap_update_bits(sai->regmap, FSL_SAI_TCSR(ofs), FSL_SAI_CSR_SR, FSL_SAI_CSR_SR);
+ regmap_update_bits(sai->regmap, FSL_SAI_RCSR(ofs), FSL_SAI_CSR_SR, FSL_SAI_CSR_SR);
/* Clear SR bit to finish the reset */
- regmap_write(sai->regmap, FSL_SAI_TCSR(ofs), 0);
- regmap_write(sai->regmap, FSL_SAI_RCSR(ofs), 0);
+ regmap_update_bits(sai->regmap, FSL_SAI_TCSR(ofs), FSL_SAI_CSR_SR, 0);
+ regmap_update_bits(sai->regmap, FSL_SAI_RCSR(ofs), FSL_SAI_CSR_SR, 0);
regmap_update_bits(sai->regmap, FSL_SAI_TCR1(ofs),
FSL_SAI_CR1_RFW_MASK(sai->soc_data->fifo_depth),
@@ -1824,11 +1824,11 @@ static int fsl_sai_runtime_resume(struct device *dev)
regcache_cache_only(sai->regmap, false);
regcache_mark_dirty(sai->regmap);
- regmap_write(sai->regmap, FSL_SAI_TCSR(ofs), FSL_SAI_CSR_SR);
- regmap_write(sai->regmap, FSL_SAI_RCSR(ofs), FSL_SAI_CSR_SR);
+ regmap_update_bits(sai->regmap, FSL_SAI_TCSR(ofs), FSL_SAI_CSR_SR, FSL_SAI_CSR_SR);
+ regmap_update_bits(sai->regmap, FSL_SAI_RCSR(ofs), FSL_SAI_CSR_SR, FSL_SAI_CSR_SR);
usleep_range(1000, 2000);
- regmap_write(sai->regmap, FSL_SAI_TCSR(ofs), 0);
- regmap_write(sai->regmap, FSL_SAI_RCSR(ofs), 0);
+ regmap_update_bits(sai->regmap, FSL_SAI_TCSR(ofs), FSL_SAI_CSR_SR, 0);
+ regmap_update_bits(sai->regmap, FSL_SAI_RCSR(ofs), FSL_SAI_CSR_SR, 0);
ret = regcache_sync(sai->regmap);
if (ret)
diff --git a/sound/soc/intel/boards/sof_sdw.c b/sound/soc/intel/boards/sof_sdw.c
index f997b2dc221b..28f03a5f29f7 100644
--- a/sound/soc/intel/boards/sof_sdw.c
+++ b/sound/soc/intel/boards/sof_sdw.c
@@ -761,7 +761,7 @@ static const struct dmi_system_id sof_sdw_quirk_table[] = {
.callback = sof_sdw_quirk_cb,
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Google"),
- DMI_MATCH(DMI_PRODUCT_NAME, "Fatcat"),
+ DMI_MATCH(DMI_PRODUCT_FAMILY, "Google_Fatcat"),
},
.driver_data = (void *)(SOC_SDW_PCH_DMIC |
SOF_BT_OFFLOAD_SSP(2) |
diff --git a/sound/soc/intel/boards/sof_ssp_amp.c b/sound/soc/intel/boards/sof_ssp_amp.c
index 48ee5353bdf1..729c0cd7c19c 100644
--- a/sound/soc/intel/boards/sof_ssp_amp.c
+++ b/sound/soc/intel/boards/sof_ssp_amp.c
@@ -216,6 +216,12 @@ static const struct platform_device_id board_ids[] = {
/* SSP 0 and SSP 2 are used for HDMI IN */
SOF_HDMI_PLAYBACK_PRESENT),
},
+ {
+ .name = "ptl_lt6911_hdmi_ssp",
+ .driver_data = (kernel_ulong_t)(SOF_SSP_MASK_HDMI_CAPTURE(0x5) |
+ /* SSP 0 and SSP 2 are used for HDMI IN */
+ SOF_HDMI_PLAYBACK_PRESENT),
+ },
{ }
};
MODULE_DEVICE_TABLE(platform, board_ids);
diff --git a/sound/soc/intel/catpt/pcm.c b/sound/soc/intel/catpt/pcm.c
index 46acb7fdc547..bf734c69c4e0 100644
--- a/sound/soc/intel/catpt/pcm.c
+++ b/sound/soc/intel/catpt/pcm.c
@@ -568,8 +568,9 @@ static const struct snd_pcm_hardware catpt_pcm_hardware = {
SNDRV_PCM_INFO_RESUME |
SNDRV_PCM_INFO_NO_PERIOD_WAKEUP,
.formats = SNDRV_PCM_FMTBIT_S16_LE |
- SNDRV_PCM_FMTBIT_S24_LE |
SNDRV_PCM_FMTBIT_S32_LE,
+ .subformats = SNDRV_PCM_SUBFMTBIT_MSBITS_24 |
+ SNDRV_PCM_SUBFMTBIT_MSBITS_MAX,
.period_bytes_min = PAGE_SIZE,
.period_bytes_max = CATPT_BUFFER_MAX_SIZE / CATPT_PCM_PERIODS_MIN,
.periods_min = CATPT_PCM_PERIODS_MIN,
@@ -698,14 +699,18 @@ static struct snd_soc_dai_driver dai_drivers[] = {
.channels_min = 2,
.channels_max = 2,
.rates = SNDRV_PCM_RATE_48000,
- .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE,
+ .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S32_LE,
+ .subformats = SNDRV_PCM_SUBFMTBIT_MSBITS_24 |
+ SNDRV_PCM_SUBFMTBIT_MSBITS_MAX,
},
.capture = {
.stream_name = "Analog Capture",
.channels_min = 2,
.channels_max = 4,
.rates = SNDRV_PCM_RATE_48000,
- .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE,
+ .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S32_LE,
+ .subformats = SNDRV_PCM_SUBFMTBIT_MSBITS_24 |
+ SNDRV_PCM_SUBFMTBIT_MSBITS_MAX,
},
},
{
@@ -717,7 +722,9 @@ static struct snd_soc_dai_driver dai_drivers[] = {
.channels_min = 2,
.channels_max = 2,
.rates = SNDRV_PCM_RATE_8000_192000,
- .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE,
+ .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S32_LE,
+ .subformats = SNDRV_PCM_SUBFMTBIT_MSBITS_24 |
+ SNDRV_PCM_SUBFMTBIT_MSBITS_MAX,
},
},
{
@@ -729,7 +736,9 @@ static struct snd_soc_dai_driver dai_drivers[] = {
.channels_min = 2,
.channels_max = 2,
.rates = SNDRV_PCM_RATE_8000_192000,
- .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE,
+ .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S32_LE,
+ .subformats = SNDRV_PCM_SUBFMTBIT_MSBITS_24 |
+ SNDRV_PCM_SUBFMTBIT_MSBITS_MAX,
},
},
{
@@ -741,7 +750,9 @@ static struct snd_soc_dai_driver dai_drivers[] = {
.channels_min = 2,
.channels_max = 2,
.rates = SNDRV_PCM_RATE_48000,
- .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE,
+ .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S32_LE,
+ .subformats = SNDRV_PCM_SUBFMTBIT_MSBITS_24 |
+ SNDRV_PCM_SUBFMTBIT_MSBITS_MAX,
},
},
{
diff --git a/sound/soc/intel/common/soc-acpi-intel-ptl-match.c b/sound/soc/intel/common/soc-acpi-intel-ptl-match.c
index e292701dfcfe..3c8b10e21ceb 100644
--- a/sound/soc/intel/common/soc-acpi-intel-ptl-match.c
+++ b/sound/soc/intel/common/soc-acpi-intel-ptl-match.c
@@ -61,6 +61,12 @@ struct snd_soc_acpi_mach snd_soc_acpi_intel_ptl_machines[] = {
SND_SOC_ACPI_TPLG_INTEL_SSP_MSB |
SND_SOC_ACPI_TPLG_INTEL_DMIC_NUMBER,
},
+ /* place amp-only boards at the end of the table */
+ {
+ .id = "INTC10B0",
+ .drv_name = "ptl_lt6911_hdmi_ssp",
+ .sof_tplg_filename = "sof-ptl-hdmi-ssp02.tplg",
+ },
{},
};
EXPORT_SYMBOL_GPL(snd_soc_acpi_intel_ptl_machines);
diff --git a/sound/soc/qcom/qdsp6/audioreach.c b/sound/soc/qcom/qdsp6/audioreach.c
index 4ebaaf736fb9..3f5eed5afce5 100644
--- a/sound/soc/qcom/qdsp6/audioreach.c
+++ b/sound/soc/qcom/qdsp6/audioreach.c
@@ -971,6 +971,7 @@ static int audioreach_i2s_set_media_format(struct q6apm_graph *graph,
param_data->param_id = PARAM_ID_I2S_INTF_CFG;
param_data->param_size = ic_sz - APM_MODULE_PARAM_DATA_SIZE;
+ intf_cfg->cfg.lpaif_type = module->hw_interface_type;
intf_cfg->cfg.intf_idx = module->hw_interface_idx;
intf_cfg->cfg.sd_line_idx = module->sd_line_idx;
diff --git a/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c b/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c
index a0d90462fd6a..528756f1332b 100644
--- a/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c
+++ b/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c
@@ -213,8 +213,10 @@ static int q6apm_lpass_dai_prepare(struct snd_pcm_substream *substream, struct s
return 0;
err:
- q6apm_graph_close(dai_data->graph[dai->id]);
- dai_data->graph[dai->id] = NULL;
+ if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) {
+ q6apm_graph_close(dai_data->graph[dai->id]);
+ dai_data->graph[dai->id] = NULL;
+ }
return rc;
}
@@ -260,6 +262,7 @@ static const struct snd_soc_dai_ops q6i2s_ops = {
.shutdown = q6apm_lpass_dai_shutdown,
.set_channel_map = q6dma_set_channel_map,
.hw_params = q6dma_hw_params,
+ .set_fmt = q6i2s_set_fmt,
};
static const struct snd_soc_dai_ops q6hdmi_ops = {
diff --git a/sound/soc/qcom/sc8280xp.c b/sound/soc/qcom/sc8280xp.c
index 73f9f82c4e25..288ccd7f8866 100644
--- a/sound/soc/qcom/sc8280xp.c
+++ b/sound/soc/qcom/sc8280xp.c
@@ -32,6 +32,10 @@ static int sc8280xp_snd_init(struct snd_soc_pcm_runtime *rtd)
int dp_pcm_id = 0;
switch (cpu_dai->id) {
+ case PRIMARY_MI2S_RX...QUATERNARY_MI2S_TX:
+ case QUINARY_MI2S_RX...QUINARY_MI2S_TX:
+ snd_soc_dai_set_fmt(cpu_dai, SND_SOC_DAIFMT_BP_FP);
+ break;
case WSA_CODEC_DMA_RX_0:
case WSA_CODEC_DMA_RX_1:
/*
@@ -186,7 +190,7 @@ static int sc8280xp_platform_probe(struct platform_device *pdev)
static const struct of_device_id snd_sc8280xp_dt_match[] = {
{.compatible = "qcom,qcm6490-idp-sndcard", "qcm6490"},
{.compatible = "qcom,qcs6490-rb3gen2-sndcard", "qcs6490"},
- {.compatible = "qcom,qcs8275-sndcard", "qcs8275"},
+ {.compatible = "qcom,qcs8275-sndcard", "qcs8300"},
{.compatible = "qcom,qcs9075-sndcard", "qcs9075"},
{.compatible = "qcom,qcs9100-sndcard", "qcs9100"},
{.compatible = "qcom,sc8280xp-sndcard", "sc8280xp"},
diff --git a/sound/soc/renesas/rcar/core.c b/sound/soc/renesas/rcar/core.c
index 37d954495ea5..9f086906a2e5 100644
--- a/sound/soc/renesas/rcar/core.c
+++ b/sound/soc/renesas/rcar/core.c
@@ -597,7 +597,7 @@ int rsnd_dai_connect(struct rsnd_mod *mod,
dev_dbg(dev, "%s is connected to io (%s)\n",
rsnd_mod_name(mod),
- snd_pcm_direction_name(io->substream->stream));
+ rsnd_io_is_play(io) ? "Playback" : "Capture");
return 0;
}
diff --git a/sound/soc/sdca/sdca_device.c b/sound/soc/sdca/sdca_device.c
index 0244cdcdd109..4798ce2c8f0b 100644
--- a/sound/soc/sdca/sdca_device.c
+++ b/sound/soc/sdca/sdca_device.c
@@ -7,6 +7,7 @@
*/
#include <linux/acpi.h>
+#include <linux/dmi.h>
#include <linux/module.h>
#include <linux/property.h>
#include <linux/soundwire/sdw.h>
@@ -55,11 +56,30 @@ static bool sdca_device_quirk_rt712_vb(struct sdw_slave *slave)
return false;
}
+static bool sdca_device_quirk_skip_func_type_patching(struct sdw_slave *slave)
+{
+ const char *vendor, *sku;
+
+ vendor = dmi_get_system_info(DMI_SYS_VENDOR);
+ sku = dmi_get_system_info(DMI_PRODUCT_SKU);
+
+ if (vendor && sku &&
+ !strcmp(vendor, "Dell Inc.") &&
+ (!strcmp(sku, "0C62") || !strcmp(sku, "0C63") || !strcmp(sku, "0C6B")) &&
+ slave->sdca_data.interface_revision == 0x061c &&
+ slave->id.mfg_id == 0x01fa && slave->id.part_id == 0x4243)
+ return true;
+
+ return false;
+}
+
bool sdca_device_quirk_match(struct sdw_slave *slave, enum sdca_quirk quirk)
{
switch (quirk) {
case SDCA_QUIRKS_RT712_VB:
return sdca_device_quirk_rt712_vb(slave);
+ case SDCA_QUIRKS_SKIP_FUNC_TYPE_PATCHING:
+ return sdca_device_quirk_skip_func_type_patching(slave);
default:
break;
}
diff --git a/sound/soc/sdca/sdca_functions.c b/sound/soc/sdca/sdca_functions.c
index f26f597dca9e..13f68f7b6dd6 100644
--- a/sound/soc/sdca/sdca_functions.c
+++ b/sound/soc/sdca/sdca_functions.c
@@ -90,6 +90,7 @@ static int find_sdca_function(struct acpi_device *adev, void *data)
{
struct fwnode_handle *function_node = acpi_fwnode_handle(adev);
struct sdca_device_data *sdca_data = data;
+ struct sdw_slave *slave = container_of(sdca_data, struct sdw_slave, sdca_data);
struct device *dev = &adev->dev;
struct fwnode_handle *control5; /* used to identify function type */
const char *function_name;
@@ -137,11 +138,13 @@ static int find_sdca_function(struct acpi_device *adev, void *data)
return ret;
}
- ret = patch_sdca_function_type(sdca_data->interface_revision, &function_type);
- if (ret < 0) {
- dev_err(dev, "SDCA version %#x invalid function type %d\n",
- sdca_data->interface_revision, function_type);
- return ret;
+ if (!sdca_device_quirk_match(slave, SDCA_QUIRKS_SKIP_FUNC_TYPE_PATCHING)) {
+ ret = patch_sdca_function_type(sdca_data->interface_revision, &function_type);
+ if (ret < 0) {
+ dev_err(dev, "SDCA version %#x invalid function type %d\n",
+ sdca_data->interface_revision, function_type);
+ return ret;
+ }
}
function_name = get_sdca_function_name(function_type);
diff --git a/sound/soc/sdca/sdca_interrupts.c b/sound/soc/sdca/sdca_interrupts.c
index 8018773ee426..79bf3042f57d 100644
--- a/sound/soc/sdca/sdca_interrupts.c
+++ b/sound/soc/sdca/sdca_interrupts.c
@@ -155,7 +155,7 @@ static irqreturn_t detected_mode_handler(int irq, void *data)
SDCA_CTL_SELECTED_MODE_NAME);
if (!name)
- return -ENOMEM;
+ return IRQ_NONE;
kctl = snd_soc_component_get_kcontrol(component, name);
if (!kctl) {
diff --git a/sound/soc/sdca/sdca_regmap.c b/sound/soc/sdca/sdca_regmap.c
index 5cb3048ea8cf..72f893e00ff5 100644
--- a/sound/soc/sdca/sdca_regmap.c
+++ b/sound/soc/sdca/sdca_regmap.c
@@ -196,7 +196,7 @@ int sdca_regmap_mbq_size(struct sdca_function_data *function, unsigned int reg)
control = function_find_control(function, reg);
if (!control)
- return false;
+ return -EINVAL;
return clamp_val(control->nbits / BITS_PER_BYTE, sizeof(u8), sizeof(u32));
}
diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c
index 37bc5867f81d..cc9125ffe92a 100644
--- a/sound/soc/soc-core.c
+++ b/sound/soc/soc-core.c
@@ -369,20 +369,25 @@ struct snd_soc_component
*snd_soc_lookup_component_nolocked(struct device *dev, const char *driver_name)
{
struct snd_soc_component *component;
- struct snd_soc_component *found_component;
- found_component = NULL;
for_each_component(component) {
- if ((dev == component->dev) &&
- (!driver_name ||
- (driver_name == component->driver->name) ||
- (strcmp(component->driver->name, driver_name) == 0))) {
- found_component = component;
- break;
- }
+ if (dev != component->dev)
+ continue;
+
+ if (!driver_name)
+ return component;
+
+ if (!component->driver->name)
+ continue;
+
+ if (component->driver->name == driver_name)
+ return component;
+
+ if (strcmp(component->driver->name, driver_name) == 0)
+ return component;
}
- return found_component;
+ return NULL;
}
EXPORT_SYMBOL_GPL(snd_soc_lookup_component_nolocked);
diff --git a/sound/soc/sof/imx/imx-common.c b/sound/soc/sof/imx/imx-common.c
index f00b381cec3b..d66c198b861a 100644
--- a/sound/soc/sof/imx/imx-common.c
+++ b/sound/soc/sof/imx/imx-common.c
@@ -316,9 +316,9 @@ static int imx_parse_ioremap_memory(struct snd_sof_dev *sdev)
}
sdev->bar[blk_type] = devm_ioremap_resource(sdev->dev, res);
- if (!sdev->bar[blk_type])
+ if (IS_ERR(sdev->bar[blk_type]))
return dev_err_probe(sdev->dev,
- -ENOMEM,
+ PTR_ERR(sdev->bar[blk_type]),
"failed to ioremap %s region\n",
chip_info->memory[i].name);
}
diff --git a/sound/soc/sof/intel/hda-stream.c b/sound/soc/sof/intel/hda-stream.c
index aa6b0247d5c9..a34f472ef175 100644
--- a/sound/soc/sof/intel/hda-stream.c
+++ b/sound/soc/sof/intel/hda-stream.c
@@ -890,7 +890,7 @@ int hda_dsp_stream_init(struct snd_sof_dev *sdev)
if (num_capture >= SOF_HDA_CAPTURE_STREAMS) {
dev_err(sdev->dev, "error: too many capture streams %d\n",
- num_playback);
+ num_capture);
return -EINVAL;
}
diff --git a/sound/soc/sof/intel/ptl.c b/sound/soc/sof/intel/ptl.c
index 1bc1f54c470d..4633cd01e7dd 100644
--- a/sound/soc/sof/intel/ptl.c
+++ b/sound/soc/sof/intel/ptl.c
@@ -143,6 +143,7 @@ const struct sof_intel_dsp_desc wcl_chip_info = {
.read_sdw_lcount = hda_sdw_check_lcount_ext,
.check_sdw_irq = lnl_dsp_check_sdw_irq,
.check_sdw_wakeen_irq = lnl_sdw_check_wakeen_irq,
+ .sdw_process_wakeen = hda_sdw_process_wakeen_common,
.check_ipc_irq = mtl_dsp_check_ipc_irq,
.cl_init = mtl_dsp_cl_init,
.power_down_dsp = mtl_power_down_dsp,
diff --git a/sound/soc/stm/stm32_i2s.c b/sound/soc/stm/stm32_i2s.c
index 0e489097d9c1..6ca21780f21d 100644
--- a/sound/soc/stm/stm32_i2s.c
+++ b/sound/soc/stm/stm32_i2s.c
@@ -469,11 +469,8 @@ static int stm32_i2smclk_determine_rate(struct clk_hw *hw,
int ret;
ret = stm32_i2s_calc_clk_div(i2s, req->best_parent_rate, req->rate);
- if (ret) {
- req->rate = ret;
-
- return 0;
- }
+ if (ret)
+ return ret;
mclk->freq = req->best_parent_rate / i2s->divider;
diff --git a/sound/usb/format.c b/sound/usb/format.c
index 0ee532acbb60..ec95a063beb1 100644
--- a/sound/usb/format.c
+++ b/sound/usb/format.c
@@ -327,12 +327,16 @@ static bool focusrite_valid_sample_rate(struct snd_usb_audio *chip,
max_rate = combine_quad(&fmt[6]);
switch (max_rate) {
+ case 192000:
+ if (rate == 176400 || rate == 192000)
+ return true;
+ fallthrough;
+ case 96000:
+ if (rate == 88200 || rate == 96000)
+ return true;
+ fallthrough;
case 48000:
return (rate == 44100 || rate == 48000);
- case 96000:
- return (rate == 88200 || rate == 96000);
- case 192000:
- return (rate == 176400 || rate == 192000);
default:
usb_audio_info(chip,
"%u:%d : unexpected max rate: %u\n",
diff --git a/sound/usb/mixer_quirks.c b/sound/usb/mixer_quirks.c
index 7cc27ae5512f..3df537fdb9f1 100644
--- a/sound/usb/mixer_quirks.c
+++ b/sound/usb/mixer_quirks.c
@@ -4608,14 +4608,12 @@ void snd_usb_mixer_fu_apply_quirk(struct usb_mixer_interface *mixer,
if (unitid == 7 && cval->control == UAC_FU_VOLUME)
snd_dragonfly_quirk_db_scale(mixer, cval, kctl);
break;
+ }
+
/* lowest playback value is muted on some devices */
- case USB_ID(0x0d8c, 0x000c): /* C-Media */
- case USB_ID(0x0d8c, 0x0014): /* C-Media */
- case USB_ID(0x19f7, 0x0003): /* RODE NT-USB */
+ if (mixer->chip->quirk_flags & QUIRK_FLAG_MIXER_MIN_MUTE)
if (strstr(kctl->id.name, "Playback"))
cval->min_mute = 1;
- break;
- }
/* ALSA-ify some Plantronics headset control names */
if (USB_ID_VENDOR(mixer->chip->usb_id) == 0x047f &&
diff --git a/sound/usb/qcom/qc_audio_offload.c b/sound/usb/qcom/qc_audio_offload.c
index a25c5a531690..9ad76fff741b 100644
--- a/sound/usb/qcom/qc_audio_offload.c
+++ b/sound/usb/qcom/qc_audio_offload.c
@@ -538,38 +538,33 @@ static void uaudio_iommu_unmap(enum mem_type mtype, unsigned long iova,
umap_size, iova, mapped_iova_size);
}
+static int uaudio_iommu_map_prot(bool dma_coherent)
+{
+ int prot = IOMMU_READ | IOMMU_WRITE;
+
+ if (dma_coherent)
+ prot |= IOMMU_CACHE;
+ return prot;
+}
+
/**
- * uaudio_iommu_map() - maps iommu memory for adsp
+ * uaudio_iommu_map_pa() - maps iommu memory for adsp
* @mtype: ring type
* @dma_coherent: dma coherent
* @pa: physical address for ring/buffer
* @size: size of memory region
- * @sgt: sg table for memory region
*
* Maps the XHCI related resources to a memory region that is assigned to be
* used by the adsp. This will be mapped to the domain, which is created by
* the ASoC USB backend driver.
*
*/
-static unsigned long uaudio_iommu_map(enum mem_type mtype, bool dma_coherent,
- phys_addr_t pa, size_t size,
- struct sg_table *sgt)
+static unsigned long uaudio_iommu_map_pa(enum mem_type mtype, bool dma_coherent,
+ phys_addr_t pa, size_t size)
{
- struct scatterlist *sg;
unsigned long iova = 0;
- size_t total_len = 0;
- unsigned long iova_sg;
- phys_addr_t pa_sg;
bool map = true;
- size_t sg_len;
- int prot;
- int ret;
- int i;
-
- prot = IOMMU_READ | IOMMU_WRITE;
-
- if (dma_coherent)
- prot |= IOMMU_CACHE;
+ int prot = uaudio_iommu_map_prot(dma_coherent);
switch (mtype) {
case MEM_EVENT_RING:
@@ -583,20 +578,41 @@ static unsigned long uaudio_iommu_map(enum mem_type mtype, bool dma_coherent,
&uaudio_qdev->xfer_ring_iova_size,
&uaudio_qdev->xfer_ring_list, size);
break;
- case MEM_XFER_BUF:
- iova = uaudio_get_iova(&uaudio_qdev->curr_xfer_buf_iova,
- &uaudio_qdev->xfer_buf_iova_size,
- &uaudio_qdev->xfer_buf_list, size);
- break;
default:
dev_err(uaudio_qdev->data->dev, "unknown mem type %d\n", mtype);
}
if (!iova || !map)
- goto done;
+ return 0;
+
+ iommu_map(uaudio_qdev->data->domain, iova, pa, size, prot, GFP_KERNEL);
- if (!sgt)
- goto skip_sgt_map;
+ return iova;
+}
+
+static unsigned long uaudio_iommu_map_xfer_buf(bool dma_coherent, size_t size,
+ struct sg_table *sgt)
+{
+ struct scatterlist *sg;
+ unsigned long iova = 0;
+ size_t total_len = 0;
+ unsigned long iova_sg;
+ phys_addr_t pa_sg;
+ size_t sg_len;
+ int prot = uaudio_iommu_map_prot(dma_coherent);
+ int ret;
+ int i;
+
+ prot = IOMMU_READ | IOMMU_WRITE;
+
+ if (dma_coherent)
+ prot |= IOMMU_CACHE;
+
+ iova = uaudio_get_iova(&uaudio_qdev->curr_xfer_buf_iova,
+ &uaudio_qdev->xfer_buf_iova_size,
+ &uaudio_qdev->xfer_buf_list, size);
+ if (!iova)
+ goto done;
iova_sg = iova;
for_each_sg(sgt->sgl, sg, sgt->nents, i) {
@@ -618,11 +634,6 @@ static unsigned long uaudio_iommu_map(enum mem_type mtype, bool dma_coherent,
uaudio_iommu_unmap(MEM_XFER_BUF, iova, size, total_len);
iova = 0;
}
- return iova;
-
-skip_sgt_map:
- iommu_map(uaudio_qdev->data->domain, iova, pa, size, prot, GFP_KERNEL);
-
done:
return iova;
}
@@ -1020,7 +1031,6 @@ static int uaudio_transfer_buffer_setup(struct snd_usb_substream *subs,
struct sg_table xfer_buf_sgt;
dma_addr_t xfer_buf_dma;
void *xfer_buf;
- phys_addr_t xfer_buf_pa;
u32 len = xfer_buf_len;
bool dma_coherent;
dma_addr_t xfer_buf_dma_sysdev;
@@ -1051,18 +1061,12 @@ static int uaudio_transfer_buffer_setup(struct snd_usb_substream *subs,
if (!xfer_buf)
return -ENOMEM;
- /* Remapping is not possible if xfer_buf is outside of linear map */
- xfer_buf_pa = virt_to_phys(xfer_buf);
- if (WARN_ON(!page_is_ram(PFN_DOWN(xfer_buf_pa)))) {
- ret = -ENXIO;
- goto unmap_sync;
- }
dma_get_sgtable(subs->dev->bus->sysdev, &xfer_buf_sgt, xfer_buf,
xfer_buf_dma, len);
/* map the physical buffer into sysdev as well */
- xfer_buf_dma_sysdev = uaudio_iommu_map(MEM_XFER_BUF, dma_coherent,
- xfer_buf_pa, len, &xfer_buf_sgt);
+ xfer_buf_dma_sysdev = uaudio_iommu_map_xfer_buf(dma_coherent,
+ len, &xfer_buf_sgt);
if (!xfer_buf_dma_sysdev) {
ret = -ENOMEM;
goto unmap_sync;
@@ -1143,8 +1147,8 @@ uaudio_endpoint_setup(struct snd_usb_substream *subs,
sg_free_table(sgt);
/* data transfer ring */
- iova = uaudio_iommu_map(MEM_XFER_RING, dma_coherent, tr_pa,
- PAGE_SIZE, NULL);
+ iova = uaudio_iommu_map_pa(MEM_XFER_RING, dma_coherent, tr_pa,
+ PAGE_SIZE);
if (!iova) {
ret = -ENOMEM;
goto clear_pa;
@@ -1207,8 +1211,8 @@ static int uaudio_event_ring_setup(struct snd_usb_substream *subs,
mem_info->dma = sg_dma_address(sgt->sgl);
sg_free_table(sgt);
- iova = uaudio_iommu_map(MEM_EVENT_RING, dma_coherent, er_pa,
- PAGE_SIZE, NULL);
+ iova = uaudio_iommu_map_pa(MEM_EVENT_RING, dma_coherent, er_pa,
+ PAGE_SIZE);
if (!iova) {
ret = -ENOMEM;
goto clear_pa;
diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c
index e75b0b1df6eb..766db7d00cbc 100644
--- a/sound/usb/quirks.c
+++ b/sound/usb/quirks.c
@@ -2199,6 +2199,10 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = {
QUIRK_FLAG_SET_IFACE_FIRST),
DEVICE_FLG(0x0556, 0x0014, /* Phoenix Audio TMX320VC */
QUIRK_FLAG_GET_SAMPLE_RATE),
+ DEVICE_FLG(0x0572, 0x1b08, /* Conexant Systems (Rockwell), Inc. */
+ QUIRK_FLAG_MIXER_MIN_MUTE),
+ DEVICE_FLG(0x0572, 0x1b09, /* Conexant Systems (Rockwell), Inc. */
+ QUIRK_FLAG_MIXER_MIN_MUTE),
DEVICE_FLG(0x05a3, 0x9420, /* ELP HD USB Camera */
QUIRK_FLAG_GET_SAMPLE_RATE),
DEVICE_FLG(0x05a7, 0x1020, /* Bose Companion 5 */
@@ -2241,12 +2245,16 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = {
QUIRK_FLAG_CTL_MSG_DELAY_1M),
DEVICE_FLG(0x0b0e, 0x0349, /* Jabra 550a */
QUIRK_FLAG_CTL_MSG_DELAY_1M),
+ DEVICE_FLG(0x0bda, 0x498a, /* Realtek Semiconductor Corp. */
+ QUIRK_FLAG_MIXER_MIN_MUTE),
DEVICE_FLG(0x0c45, 0x6340, /* Sonix HD USB Camera */
QUIRK_FLAG_GET_SAMPLE_RATE),
DEVICE_FLG(0x0c45, 0x636b, /* Microdia JP001 USB Camera */
QUIRK_FLAG_GET_SAMPLE_RATE),
- DEVICE_FLG(0x0d8c, 0x0014, /* USB Audio Device */
- QUIRK_FLAG_CTL_MSG_DELAY_1M),
+ DEVICE_FLG(0x0d8c, 0x000c, /* C-Media */
+ QUIRK_FLAG_MIXER_MIN_MUTE),
+ DEVICE_FLG(0x0d8c, 0x0014, /* C-Media */
+ QUIRK_FLAG_CTL_MSG_DELAY_1M | QUIRK_FLAG_MIXER_MIN_MUTE),
DEVICE_FLG(0x0ecb, 0x205c, /* JBL Quantum610 Wireless */
QUIRK_FLAG_FIXED_RATE),
DEVICE_FLG(0x0ecb, 0x2069, /* JBL Quantum810 Wireless */
@@ -2255,6 +2263,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = {
QUIRK_FLAG_SHARE_MEDIA_DEVICE | QUIRK_FLAG_ALIGN_TRANSFER),
DEVICE_FLG(0x1101, 0x0003, /* Audioengine D1 */
QUIRK_FLAG_GET_SAMPLE_RATE),
+ DEVICE_FLG(0x12d1, 0x3a07, /* Huawei Technologies Co., Ltd. */
+ QUIRK_FLAG_MIXER_MIN_MUTE),
DEVICE_FLG(0x1224, 0x2a25, /* Jieli Technology USB PHY 2.0 */
QUIRK_FLAG_GET_SAMPLE_RATE | QUIRK_FLAG_MIC_RES_16),
DEVICE_FLG(0x1395, 0x740a, /* Sennheiser DECT */
@@ -2293,6 +2303,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = {
QUIRK_FLAG_ITF_USB_DSD_DAC | QUIRK_FLAG_CTL_MSG_DELAY),
DEVICE_FLG(0x1901, 0x0191, /* GE B850V3 CP2114 audio interface */
QUIRK_FLAG_GET_SAMPLE_RATE),
+ DEVICE_FLG(0x19f7, 0x0003, /* RODE NT-USB */
+ QUIRK_FLAG_MIXER_MIN_MUTE),
DEVICE_FLG(0x19f7, 0x0035, /* RODE NT-USB+ */
QUIRK_FLAG_GET_SAMPLE_RATE),
DEVICE_FLG(0x1bcf, 0x2281, /* HD Webcam */
@@ -2343,6 +2355,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = {
QUIRK_FLAG_IGNORE_CTL_ERROR),
DEVICE_FLG(0x2912, 0x30c8, /* Audioengine D1 */
QUIRK_FLAG_GET_SAMPLE_RATE),
+ DEVICE_FLG(0x2a70, 0x1881, /* OnePlus Technology (Shenzhen) Co., Ltd. BE02T */
+ QUIRK_FLAG_MIXER_MIN_MUTE),
DEVICE_FLG(0x2b53, 0x0023, /* Fiero SC-01 (firmware v1.0.0 @ 48 kHz) */
QUIRK_FLAG_GENERIC_IMPLICIT_FB),
DEVICE_FLG(0x2b53, 0x0024, /* Fiero SC-01 (firmware v1.0.0 @ 96 kHz) */
@@ -2353,10 +2367,14 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = {
QUIRK_FLAG_CTL_MSG_DELAY_1M),
DEVICE_FLG(0x2d95, 0x8021, /* VIVO USB-C-XE710 HEADSET */
QUIRK_FLAG_CTL_MSG_DELAY_1M),
+ DEVICE_FLG(0x2d99, 0x0026, /* HECATE G2 GAMING HEADSET */
+ QUIRK_FLAG_MIXER_MIN_MUTE),
DEVICE_FLG(0x2fc6, 0xf0b7, /* iBasso DC07 Pro */
QUIRK_FLAG_CTL_MSG_DELAY_1M),
DEVICE_FLG(0x30be, 0x0101, /* Schiit Hel */
QUIRK_FLAG_IGNORE_CTL_ERROR),
+ DEVICE_FLG(0x339b, 0x3a07, /* Synaptics HONOR USB-C HEADSET */
+ QUIRK_FLAG_MIXER_MIN_MUTE),
DEVICE_FLG(0x413c, 0xa506, /* Dell AE515 sound bar */
QUIRK_FLAG_GET_SAMPLE_RATE),
DEVICE_FLG(0x534d, 0x0021, /* MacroSilicon MS2100/MS2106 */
diff --git a/sound/usb/stream.c b/sound/usb/stream.c
index ad6ced780634..5c235a5ba7e1 100644
--- a/sound/usb/stream.c
+++ b/sound/usb/stream.c
@@ -341,20 +341,28 @@ snd_pcm_chmap_elem *convert_chmap_v3(struct uac3_cluster_header_descriptor
len = le16_to_cpu(cluster->wLength);
c = 0;
- p += sizeof(struct uac3_cluster_header_descriptor);
+ p += sizeof(*cluster);
+ len -= sizeof(*cluster);
- while (((p - (void *)cluster) < len) && (c < channels)) {
+ while (len > 0 && (c < channels)) {
struct uac3_cluster_segment_descriptor *cs_desc = p;
u16 cs_len;
u8 cs_type;
+ if (len < sizeof(*cs_desc))
+ break;
cs_len = le16_to_cpu(cs_desc->wLength);
+ if (len < cs_len)
+ break;
cs_type = cs_desc->bSegmentType;
if (cs_type == UAC3_CHANNEL_INFORMATION) {
struct uac3_cluster_information_segment_descriptor *is = p;
unsigned char map;
+ if (cs_len < sizeof(*is))
+ break;
+
/*
* TODO: this conversion is not complete, update it
* after adding UAC3 values to asound.h
@@ -456,6 +464,7 @@ snd_pcm_chmap_elem *convert_chmap_v3(struct uac3_cluster_header_descriptor
chmap->map[c++] = map;
}
p += cs_len;
+ len -= cs_len;
}
if (channels < c)
@@ -881,7 +890,7 @@ snd_usb_get_audioformat_uac3(struct snd_usb_audio *chip,
u64 badd_formats = 0;
unsigned int num_channels;
struct audioformat *fp;
- u16 cluster_id, wLength;
+ u16 cluster_id, wLength, cluster_wLength;
int clock = 0;
int err;
@@ -1011,6 +1020,16 @@ snd_usb_get_audioformat_uac3(struct snd_usb_audio *chip,
return ERR_PTR(-EIO);
}
+ cluster_wLength = le16_to_cpu(cluster->wLength);
+ if (cluster_wLength < sizeof(*cluster) ||
+ cluster_wLength > wLength) {
+ dev_err(&dev->dev,
+ "%u:%d : invalid Cluster Descriptor size\n",
+ iface_no, altno);
+ kfree(cluster);
+ return ERR_PTR(-EIO);
+ }
+
num_channels = cluster->bNrChannels;
chmap = convert_chmap_v3(cluster);
kfree(cluster);
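[Editorial sketch, not part of the patch: the convert_chmap_v3() changes above maintain a running length alongside the walking pointer so that no segment header or payload is read past the cluster descriptor. The loop invariant, with names mirroring the patch, looks roughly like this:]

	while (len > 0 && c < channels) {
		if (len < sizeof(*cs_desc))	/* segment header must fit */
			break;
		cs_len = le16_to_cpu(cs_desc->wLength);
		if (len < cs_len)		/* advertised segment must fit */
			break;
		/* ... parse one segment ... */
		p += cs_len;
		len -= cs_len;			/* never walk past the cluster */
	}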
diff --git a/sound/usb/usbaudio.h b/sound/usb/usbaudio.h
index 158ec053dc44..1ef4d39978df 100644
--- a/sound/usb/usbaudio.h
+++ b/sound/usb/usbaudio.h
@@ -196,6 +196,9 @@ extern bool snd_usb_skip_validation;
* for the given endpoint.
* QUIRK_FLAG_MIC_RES_16 and QUIRK_FLAG_MIC_RES_384
* Set the fixed resolution for Mic Capture Volume (mostly for webcams)
+ * QUIRK_FLAG_MIXER_MIN_MUTE
+ * Set minimum volume control value as mute for devices where the lowest
+ * playback value represents muted state instead of minimum audible volume
*/
#define QUIRK_FLAG_GET_SAMPLE_RATE (1U << 0)
@@ -222,5 +225,6 @@ extern bool snd_usb_skip_validation;
#define QUIRK_FLAG_FIXED_RATE (1U << 21)
#define QUIRK_FLAG_MIC_RES_16 (1U << 22)
#define QUIRK_FLAG_MIC_RES_384 (1U << 23)
+#define QUIRK_FLAG_MIXER_MIN_MUTE (1U << 24)
#endif /* __USBAUDIO_H */
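[Illustration only, with hypothetical vendor/product IDs that are not in the patch: with QUIRK_FLAG_MIXER_MIN_MUTE, any further device whose lowest playback level is really a mute needs just a quirk-table entry in quirks.c instead of another switch case in mixer_quirks.c:]

	DEVICE_FLG(0x1234, 0x5678,	/* hypothetical device with muted minimum */
		   QUIRK_FLAG_MIXER_MIN_MUTE),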
diff --git a/sound/usb/validate.c b/sound/usb/validate.c
index 6fe206f6e911..a0d55b77c994 100644
--- a/sound/usb/validate.c
+++ b/sound/usb/validate.c
@@ -221,6 +221,17 @@ static bool validate_uac3_feature_unit(const void *p,
return d->bLength >= sizeof(*d) + 4 + 2;
}
+static bool validate_uac3_power_domain_unit(const void *p,
+ const struct usb_desc_validator *v)
+{
+ const struct uac3_power_domain_descriptor *d = p;
+
+ if (d->bLength < sizeof(*d))
+ return false;
+ /* baEntities[] + wPDomainDescrStr */
+ return d->bLength >= sizeof(*d) + d->bNrEntities + 2;
+}
+
static bool validate_midi_out_jack(const void *p,
const struct usb_desc_validator *v)
{
@@ -274,7 +285,7 @@ static const struct usb_desc_validator audio_validators[] = {
/* UAC_VERSION_3, UAC3_EXTENDED_TERMINAL: not implemented yet */
FUNC(UAC_VERSION_3, UAC3_MIXER_UNIT, validate_mixer_unit),
FUNC(UAC_VERSION_3, UAC3_SELECTOR_UNIT, validate_selector_unit),
- FUNC(UAC_VERSION_3, UAC_FEATURE_UNIT, validate_uac3_feature_unit),
+ FUNC(UAC_VERSION_3, UAC3_FEATURE_UNIT, validate_uac3_feature_unit),
/* UAC_VERSION_3, UAC3_EFFECT_UNIT: not implemented yet */
FUNC(UAC_VERSION_3, UAC3_PROCESSING_UNIT, validate_processing_unit),
FUNC(UAC_VERSION_3, UAC3_EXTENSION_UNIT, validate_processing_unit),
@@ -285,6 +296,7 @@ static const struct usb_desc_validator audio_validators[] = {
struct uac3_clock_multiplier_descriptor),
/* UAC_VERSION_3, UAC3_SAMPLE_RATE_CONVERTER: not implemented yet */
/* UAC_VERSION_3, UAC3_CONNECTORS: not implemented yet */
+ FUNC(UAC_VERSION_3, UAC3_POWER_DOMAIN, validate_uac3_power_domain_unit),
{ } /* terminator */
};
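[Worked example for the new power-domain validator above, editorial and not part of the patch:]

/* With bNrEntities == 2, the minimum accepted bLength is
 * sizeof(struct uac3_power_domain_descriptor) + 2 (baEntities[]) + 2
 * (wPDomainDescrStr); shorter descriptors are rejected before the
 * variable-length tail is ever dereferenced.
 */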
diff --git a/tools/arch/arm64/include/asm/cputype.h b/tools/arch/arm64/include/asm/cputype.h
index 9a5d85cfd1fb..139d5e87dc95 100644
--- a/tools/arch/arm64/include/asm/cputype.h
+++ b/tools/arch/arm64/include/asm/cputype.h
@@ -75,11 +75,13 @@
#define ARM_CPU_PART_CORTEX_A76 0xD0B
#define ARM_CPU_PART_NEOVERSE_N1 0xD0C
#define ARM_CPU_PART_CORTEX_A77 0xD0D
+#define ARM_CPU_PART_CORTEX_A76AE 0xD0E
#define ARM_CPU_PART_NEOVERSE_V1 0xD40
#define ARM_CPU_PART_CORTEX_A78 0xD41
#define ARM_CPU_PART_CORTEX_A78AE 0xD42
#define ARM_CPU_PART_CORTEX_X1 0xD44
#define ARM_CPU_PART_CORTEX_A510 0xD46
+#define ARM_CPU_PART_CORTEX_X1C 0xD4C
#define ARM_CPU_PART_CORTEX_A520 0xD80
#define ARM_CPU_PART_CORTEX_A710 0xD47
#define ARM_CPU_PART_CORTEX_A715 0xD4D
@@ -119,9 +121,11 @@
#define QCOM_CPU_PART_KRYO 0x200
#define QCOM_CPU_PART_KRYO_2XX_GOLD 0x800
#define QCOM_CPU_PART_KRYO_2XX_SILVER 0x801
+#define QCOM_CPU_PART_KRYO_3XX_GOLD 0x802
#define QCOM_CPU_PART_KRYO_3XX_SILVER 0x803
#define QCOM_CPU_PART_KRYO_4XX_GOLD 0x804
#define QCOM_CPU_PART_KRYO_4XX_SILVER 0x805
+#define QCOM_CPU_PART_ORYON_X1 0x001
#define NVIDIA_CPU_PART_DENVER 0x003
#define NVIDIA_CPU_PART_CARMEL 0x004
@@ -129,6 +133,7 @@
#define FUJITSU_CPU_PART_A64FX 0x001
#define HISI_CPU_PART_TSV110 0xD01
+#define HISI_CPU_PART_HIP09 0xD02
#define HISI_CPU_PART_HIP12 0xD06
#define APPLE_CPU_PART_M1_ICESTORM 0x022
@@ -159,11 +164,13 @@
#define MIDR_CORTEX_A76 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A76)
#define MIDR_NEOVERSE_N1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N1)
#define MIDR_CORTEX_A77 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A77)
+#define MIDR_CORTEX_A76AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A76AE)
#define MIDR_NEOVERSE_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V1)
#define MIDR_CORTEX_A78 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78)
#define MIDR_CORTEX_A78AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78AE)
#define MIDR_CORTEX_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1)
#define MIDR_CORTEX_A510 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A510)
+#define MIDR_CORTEX_X1C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1C)
#define MIDR_CORTEX_A520 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A520)
#define MIDR_CORTEX_A710 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A710)
#define MIDR_CORTEX_A715 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A715)
@@ -196,13 +203,26 @@
#define MIDR_QCOM_KRYO MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO)
#define MIDR_QCOM_KRYO_2XX_GOLD MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_2XX_GOLD)
#define MIDR_QCOM_KRYO_2XX_SILVER MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_2XX_SILVER)
+#define MIDR_QCOM_KRYO_3XX_GOLD MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_3XX_GOLD)
#define MIDR_QCOM_KRYO_3XX_SILVER MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_3XX_SILVER)
#define MIDR_QCOM_KRYO_4XX_GOLD MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_4XX_GOLD)
#define MIDR_QCOM_KRYO_4XX_SILVER MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_4XX_SILVER)
+#define MIDR_QCOM_ORYON_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_ORYON_X1)
+
+/*
+ * NOTES:
+ * - Qualcomm Kryo 5XX Prime / Gold ID themselves as MIDR_CORTEX_A77
+ * - Qualcomm Kryo 5XX Silver IDs itself as MIDR_QCOM_KRYO_4XX_SILVER
+ * - Qualcomm Kryo 6XX Prime IDs itself as MIDR_CORTEX_X1
+ * - Qualcomm Kryo 6XX Gold IDs itself as ARM_CPU_PART_CORTEX_A78
+ * - Qualcomm Kryo 6XX Silver IDs itself as MIDR_CORTEX_A55
+ */
+
#define MIDR_NVIDIA_DENVER MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_DENVER)
#define MIDR_NVIDIA_CARMEL MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_CARMEL)
#define MIDR_FUJITSU_A64FX MIDR_CPU_MODEL(ARM_CPU_IMP_FUJITSU, FUJITSU_CPU_PART_A64FX)
#define MIDR_HISI_TSV110 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_TSV110)
+#define MIDR_HISI_HIP09 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_HIP09)
#define MIDR_HISI_HIP12 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_HIP12)
#define MIDR_APPLE_M1_ICESTORM MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_ICESTORM)
#define MIDR_APPLE_M1_FIRESTORM MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM)
@@ -291,6 +311,14 @@ static inline u32 __attribute_const__ read_cpuid_id(void)
return read_cpuid(MIDR_EL1);
}
+struct target_impl_cpu {
+ u64 midr;
+ u64 revidr;
+ u64 aidr;
+};
+
+bool cpu_errata_set_target_impl(u64 num, void *impl_cpus);
+
static inline u64 __attribute_const__ read_cpuid_mpidr(void)
{
return read_cpuid(MPIDR_EL1);
diff --git a/tools/arch/arm64/include/asm/sysreg.h b/tools/arch/arm64/include/asm/sysreg.h
index 690b6ebd118f..65f2759ea27a 100644
--- a/tools/arch/arm64/include/asm/sysreg.h
+++ b/tools/arch/arm64/include/asm/sysreg.h
@@ -1080,9 +1080,6 @@
#define ARM64_FEATURE_FIELD_BITS 4
-/* Defined for compatibility only, do not add new users. */
-#define ARM64_FEATURE_MASK(x) (x##_MASK)
-
#ifdef __ASSEMBLY__
.macro mrs_s, rt, sreg
diff --git a/tools/arch/loongarch/include/asm/inst.h b/tools/arch/loongarch/include/asm/inst.h
index c25b5853181d..d68fad63c8b7 100644
--- a/tools/arch/loongarch/include/asm/inst.h
+++ b/tools/arch/loongarch/include/asm/inst.h
@@ -51,6 +51,10 @@ enum reg2i16_op {
bgeu_op = 0x1b,
};
+enum reg3_op {
+ amswapw_op = 0x70c0,
+};
+
struct reg0i15_format {
unsigned int immediate : 15;
unsigned int opcode : 17;
@@ -96,6 +100,13 @@ struct reg2i16_format {
unsigned int opcode : 6;
};
+struct reg3_format {
+ unsigned int rd : 5;
+ unsigned int rj : 5;
+ unsigned int rk : 5;
+ unsigned int opcode : 17;
+};
+
union loongarch_instruction {
unsigned int word;
struct reg0i15_format reg0i15_format;
@@ -105,6 +116,7 @@ union loongarch_instruction {
struct reg2i12_format reg2i12_format;
struct reg2i14_format reg2i14_format;
struct reg2i16_format reg2i16_format;
+ struct reg3_format reg3_format;
};
#define LOONGARCH_INSN_SIZE sizeof(union loongarch_instruction)
diff --git a/tools/arch/powerpc/include/uapi/asm/kvm.h b/tools/arch/powerpc/include/uapi/asm/kvm.h
index eaeda001784e..077c5437f521 100644
--- a/tools/arch/powerpc/include/uapi/asm/kvm.h
+++ b/tools/arch/powerpc/include/uapi/asm/kvm.h
@@ -1,18 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
* Copyright IBM Corp. 2007
*
* Authors: Hollis Blanchard <hollisb@us.ibm.com>
diff --git a/tools/arch/riscv/include/asm/csr.h b/tools/arch/riscv/include/asm/csr.h
index 0dfc09254f99..56d7367ee344 100644
--- a/tools/arch/riscv/include/asm/csr.h
+++ b/tools/arch/riscv/include/asm/csr.h
@@ -468,13 +468,13 @@
#define IE_TIE (_AC(0x1, UL) << RV_IRQ_TIMER)
#define IE_EIE (_AC(0x1, UL) << RV_IRQ_EXT)
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#define __ASM_STR(x) x
#else
#define __ASM_STR(x) #x
#endif
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define csr_swap(csr, val) \
({ \
@@ -536,6 +536,6 @@
: "memory"); \
})
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_RISCV_CSR_H */
diff --git a/tools/arch/riscv/include/asm/vdso/processor.h b/tools/arch/riscv/include/asm/vdso/processor.h
index 662aca039848..0665b117f30f 100644
--- a/tools/arch/riscv/include/asm/vdso/processor.h
+++ b/tools/arch/riscv/include/asm/vdso/processor.h
@@ -2,7 +2,7 @@
#ifndef __ASM_VDSO_PROCESSOR_H
#define __ASM_VDSO_PROCESSOR_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm-generic/barrier.h>
@@ -27,6 +27,6 @@ static inline void cpu_relax(void)
barrier();
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __ASM_VDSO_PROCESSOR_H */
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index ee176236c2be..06fc0479a23f 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -218,6 +218,7 @@
#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 1) /* "flexpriority" Intel FlexPriority */
#define X86_FEATURE_EPT ( 8*32+ 2) /* "ept" Intel Extended Page Table */
#define X86_FEATURE_VPID ( 8*32+ 3) /* "vpid" Intel Virtual Processor ID */
+#define X86_FEATURE_COHERENCY_SFW_NO ( 8*32+ 4) /* SNP cache coherency software work around not needed */
#define X86_FEATURE_VMMCALL ( 8*32+15) /* "vmmcall" Prefer VMMCALL to VMCALL */
#define X86_FEATURE_XENPV ( 8*32+16) /* Xen paravirtual guest */
@@ -456,10 +457,14 @@
#define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* No Nested Data Breakpoints */
#define X86_FEATURE_WRMSR_XX_BASE_NS (20*32+ 1) /* WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */
#define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* LFENCE always serializing / synchronizes RDTSC */
+#define X86_FEATURE_VERW_CLEAR (20*32+ 5) /* The memory form of VERW mitigates TSA */
#define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* Null Selector Clears Base */
+
#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* Automatic IBRS */
#define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* SMM_CTL MSR is not present */
+#define X86_FEATURE_GP_ON_USER_CPUID (20*32+17) /* User CPUID faulting */
+
#define X86_FEATURE_PREFETCHI (20*32+20) /* Prefetch Data/Instruction to Cache Level */
#define X86_FEATURE_SBPB (20*32+27) /* Selective Branch Prediction Barrier */
#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */
@@ -487,6 +492,9 @@
#define X86_FEATURE_PREFER_YMM (21*32+ 8) /* Avoid ZMM registers due to downclocking */
#define X86_FEATURE_APX (21*32+ 9) /* Advanced Performance Extensions */
#define X86_FEATURE_INDIRECT_THUNK_ITS (21*32+10) /* Use thunk for indirect branches in lower half of cacheline */
+#define X86_FEATURE_TSA_SQ_NO (21*32+11) /* AMD CPU not vulnerable to TSA-SQ */
+#define X86_FEATURE_TSA_L1_NO (21*32+12) /* AMD CPU not vulnerable to TSA-L1 */
+#define X86_FEATURE_CLEAR_CPU_BUF_VM (21*32+13) /* Clear CPU buffers using VERW before VMRUN */
/*
* BUG word(s)
@@ -542,5 +550,5 @@
#define X86_BUG_OLD_MICROCODE X86_BUG( 1*32+ 6) /* "old_microcode" CPU has old microcode, it is surely vulnerable to something */
#define X86_BUG_ITS X86_BUG( 1*32+ 7) /* "its" CPU is affected by Indirect Target Selection */
#define X86_BUG_ITS_NATIVE_ONLY X86_BUG( 1*32+ 8) /* "its_native_only" CPU is affected by ITS, VMX is not affected */
-
+#define X86_BUG_TSA X86_BUG( 1*32+ 9) /* "tsa" CPU is affected by Transient Scheduler Attacks */
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/tools/arch/x86/include/asm/inat.h b/tools/arch/x86/include/asm/inat.h
index 183aa662b165..099e926595bd 100644
--- a/tools/arch/x86/include/asm/inat.h
+++ b/tools/arch/x86/include/asm/inat.h
@@ -37,6 +37,8 @@
#define INAT_PFX_EVEX 15 /* EVEX prefix */
/* x86-64 REX2 prefix */
#define INAT_PFX_REX2 16 /* 0xD5 */
+/* AMD XOP prefix */
+#define INAT_PFX_XOP 17 /* 0x8F */
#define INAT_LSTPFX_MAX 3
#define INAT_LGCPFX_MAX 11
@@ -77,6 +79,7 @@
#define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 3))
#define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 4))
#define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5))
+#define INAT_XOPOK INAT_VEXOK
#define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6))
#define INAT_EVEXONLY (1 << (INAT_FLAG_OFFS + 7))
#define INAT_NO_REX2 (1 << (INAT_FLAG_OFFS + 8))
@@ -111,6 +114,8 @@ extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm,
extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode,
insn_byte_t vex_m,
insn_byte_t vex_pp);
+extern insn_attr_t inat_get_xop_attribute(insn_byte_t opcode,
+ insn_byte_t map_select);
/* Attribute checking functions */
static inline int inat_is_legacy_prefix(insn_attr_t attr)
@@ -164,6 +169,11 @@ static inline int inat_is_vex3_prefix(insn_attr_t attr)
return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3;
}
+static inline int inat_is_xop_prefix(insn_attr_t attr)
+{
+ return (attr & INAT_PFX_MASK) == INAT_PFX_XOP;
+}
+
static inline int inat_is_escape(insn_attr_t attr)
{
return attr & INAT_ESC_MASK;
@@ -229,6 +239,11 @@ static inline int inat_accept_vex(insn_attr_t attr)
return attr & INAT_VEXOK;
}
+static inline int inat_accept_xop(insn_attr_t attr)
+{
+ return attr & INAT_XOPOK;
+}
+
static inline int inat_must_vex(insn_attr_t attr)
{
return attr & (INAT_VEXONLY | INAT_EVEXONLY);
diff --git a/tools/arch/x86/include/asm/insn.h b/tools/arch/x86/include/asm/insn.h
index 0e5abd896ad4..c683d609934b 100644
--- a/tools/arch/x86/include/asm/insn.h
+++ b/tools/arch/x86/include/asm/insn.h
@@ -71,7 +71,10 @@ struct insn {
* prefixes.bytes[3]: last prefix
*/
struct insn_field rex_prefix; /* REX prefix */
- struct insn_field vex_prefix; /* VEX prefix */
+ union {
+ struct insn_field vex_prefix; /* VEX prefix */
+ struct insn_field xop_prefix; /* XOP prefix */
+ };
struct insn_field opcode; /*
* opcode.bytes[0]: opcode1
* opcode.bytes[1]: opcode2
@@ -135,6 +138,17 @@ struct insn {
#define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */
#define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */
#define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */
+/* XOP bit fields */
+#define X86_XOP_R(xop) ((xop) & 0x80) /* XOP Byte2 */
+#define X86_XOP_X(xop) ((xop) & 0x40) /* XOP Byte2 */
+#define X86_XOP_B(xop) ((xop) & 0x20) /* XOP Byte2 */
+#define X86_XOP_M(xop) ((xop) & 0x1f) /* XOP Byte2 */
+#define X86_XOP_W(xop) ((xop) & 0x80) /* XOP Byte3 */
+#define X86_XOP_V(xop) ((xop) & 0x78) /* XOP Byte3 */
+#define X86_XOP_L(xop) ((xop) & 0x04) /* XOP Byte3 */
+#define X86_XOP_P(xop) ((xop) & 0x03) /* XOP Byte3 */
+#define X86_XOP_M_MIN 0x08 /* Min of XOP.M */
+#define X86_XOP_M_MAX 0x1f /* Max of XOP.M */
extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64);
extern int insn_get_prefixes(struct insn *insn);
@@ -178,7 +192,7 @@ static inline insn_byte_t insn_rex2_m_bit(struct insn *insn)
return X86_REX2_M(insn->rex_prefix.bytes[1]);
}
-static inline int insn_is_avx(struct insn *insn)
+static inline int insn_is_avx_or_xop(struct insn *insn)
{
if (!insn->prefixes.got)
insn_get_prefixes(insn);
@@ -192,6 +206,22 @@ static inline int insn_is_evex(struct insn *insn)
return (insn->vex_prefix.nbytes == 4);
}
+/* If we already know this is AVX/XOP encoded */
+static inline int avx_insn_is_xop(struct insn *insn)
+{
+ insn_attr_t attr = inat_get_opcode_attribute(insn->vex_prefix.bytes[0]);
+
+ return inat_is_xop_prefix(attr);
+}
+
+static inline int insn_is_xop(struct insn *insn)
+{
+ if (!insn_is_avx_or_xop(insn))
+ return 0;
+
+ return avx_insn_is_xop(insn);
+}
+
static inline int insn_has_emulate_prefix(struct insn *insn)
{
return !!insn->emulate_prefix_size;
@@ -222,11 +252,26 @@ static inline insn_byte_t insn_vex_w_bit(struct insn *insn)
return X86_VEX_W(insn->vex_prefix.bytes[2]);
}
+static inline insn_byte_t insn_xop_map_bits(struct insn *insn)
+{
+ if (insn->xop_prefix.nbytes < 3) /* XOP is 3 bytes */
+ return 0;
+ return X86_XOP_M(insn->xop_prefix.bytes[1]);
+}
+
+static inline insn_byte_t insn_xop_p_bits(struct insn *insn)
+{
+ return X86_XOP_P(insn->vex_prefix.bytes[2]);
+}
+
/* Get the last prefix id from last prefix or VEX prefix */
static inline int insn_last_prefix_id(struct insn *insn)
{
- if (insn_is_avx(insn))
+ if (insn_is_avx_or_xop(insn)) {
+ if (avx_insn_is_xop(insn))
+ return insn_xop_p_bits(insn);
return insn_vex_p_bits(insn); /* VEX_p is a SIMD prefix id */
+ }
if (insn->prefixes.bytes[3])
return inat_get_last_prefix_id(insn->prefixes.bytes[3]);
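[Editorial summary of the 3-byte XOP prefix layout the new X86_XOP_* macros assume, per AMD APM Vol. 3; not part of the patch:]

/* byte 0: 0x8f escape (shared with Grp1A "POP Ev"; the decoder treats it
 *         as XOP only when ModRM.reg of the following byte is non-zero)
 * byte 1: R X B map_select[4:0]  -> X86_XOP_M(), valid range 0x08-0x1f
 * byte 2: W vvvv[3:0] L pp[1:0]  -> X86_XOP_W()/X86_XOP_V()/X86_XOP_L()/X86_XOP_P()
 */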
diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h
index 5cfb5d74dd5f..f627196eb796 100644
--- a/tools/arch/x86/include/asm/msr-index.h
+++ b/tools/arch/x86/include/asm/msr-index.h
@@ -315,12 +315,14 @@
#define PERF_CAP_PT_IDX 16
#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6
-#define PERF_CAP_PEBS_TRAP BIT_ULL(6)
-#define PERF_CAP_ARCH_REG BIT_ULL(7)
-#define PERF_CAP_PEBS_FORMAT 0xf00
-#define PERF_CAP_PEBS_BASELINE BIT_ULL(14)
-#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \
- PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE)
+#define PERF_CAP_PEBS_TRAP BIT_ULL(6)
+#define PERF_CAP_ARCH_REG BIT_ULL(7)
+#define PERF_CAP_PEBS_FORMAT 0xf00
+#define PERF_CAP_PEBS_BASELINE BIT_ULL(14)
+#define PERF_CAP_PEBS_TIMING_INFO BIT_ULL(17)
+#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \
+ PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \
+ PERF_CAP_PEBS_TIMING_INFO)
#define MSR_IA32_RTIT_CTL 0x00000570
#define RTIT_CTL_TRACEEN BIT(0)
@@ -419,6 +421,7 @@
#define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI (1UL << 12)
#define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14
#define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT)
+#define DEBUGCTLMSR_RTM_DEBUG BIT(15)
#define MSR_PEBS_FRONTEND 0x000003f7
@@ -733,6 +736,11 @@
#define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301
#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302
+/* AMD Hardware Feedback Support MSRs */
+#define MSR_AMD_WORKLOAD_CLASS_CONFIG 0xc0000500
+#define MSR_AMD_WORKLOAD_CLASS_ID 0xc0000501
+#define MSR_AMD_WORKLOAD_HRST 0xc0000502
+
/* AMD Last Branch Record MSRs */
#define MSR_AMD64_LBR_SELECT 0xc000010e
@@ -831,6 +839,7 @@
#define MSR_K7_HWCR_SMMLOCK BIT_ULL(MSR_K7_HWCR_SMMLOCK_BIT)
#define MSR_K7_HWCR_IRPERF_EN_BIT 30
#define MSR_K7_HWCR_IRPERF_EN BIT_ULL(MSR_K7_HWCR_IRPERF_EN_BIT)
+#define MSR_K7_HWCR_CPUID_USER_DIS_BIT 35
#define MSR_K7_FID_VID_CTL 0xc0010041
#define MSR_K7_FID_VID_STATUS 0xc0010042
#define MSR_K7_HWCR_CPB_DIS_BIT 25
diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h
index 6f3499507c5e..0f15d683817d 100644
--- a/tools/arch/x86/include/uapi/asm/kvm.h
+++ b/tools/arch/x86/include/uapi/asm/kvm.h
@@ -965,7 +965,13 @@ struct kvm_tdx_cmd {
struct kvm_tdx_capabilities {
__u64 supported_attrs;
__u64 supported_xfam;
- __u64 reserved[254];
+
+ __u64 kernel_tdvmcallinfo_1_r11;
+ __u64 user_tdvmcallinfo_1_r11;
+ __u64 kernel_tdvmcallinfo_1_r12;
+ __u64 user_tdvmcallinfo_1_r12;
+
+ __u64 reserved[250];
/* Configurable CPUID bits for userspace */
struct kvm_cpuid2 cpuid;
diff --git a/tools/arch/x86/lib/inat.c b/tools/arch/x86/lib/inat.c
index dfbcc6405941..ffcb0e27453b 100644
--- a/tools/arch/x86/lib/inat.c
+++ b/tools/arch/x86/lib/inat.c
@@ -81,3 +81,16 @@ insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m,
return table[opcode];
}
+insn_attr_t inat_get_xop_attribute(insn_byte_t opcode, insn_byte_t map_select)
+{
+ const insn_attr_t *table;
+
+ if (map_select < X86_XOP_M_MIN || map_select > X86_XOP_M_MAX)
+ return 0;
+ map_select -= X86_XOP_M_MIN;
+ /* At first, this checks the master table */
+ table = inat_xop_tables[map_select];
+ if (!table)
+ return 0;
+ return table[opcode];
+}
diff --git a/tools/arch/x86/lib/insn.c b/tools/arch/x86/lib/insn.c
index bce69c6bfa69..1d1c57c74d1f 100644
--- a/tools/arch/x86/lib/insn.c
+++ b/tools/arch/x86/lib/insn.c
@@ -200,12 +200,15 @@ found:
}
insn->rex_prefix.got = 1;
- /* Decode VEX prefix */
+ /* Decode VEX/XOP prefix */
b = peek_next(insn_byte_t, insn);
- attr = inat_get_opcode_attribute(b);
- if (inat_is_vex_prefix(attr)) {
+ if (inat_is_vex_prefix(attr) || inat_is_xop_prefix(attr)) {
insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1);
- if (!insn->x86_64) {
+
+ if (inat_is_xop_prefix(attr) && X86_MODRM_REG(b2) == 0) {
+ /* Grp1A.0 is always POP Ev */
+ goto vex_end;
+ } else if (!insn->x86_64) {
/*
* In 32-bits mode, if the [7:6] bits (mod bits of
* ModRM) on the second byte are not 11b, it is
@@ -226,13 +229,13 @@ found:
if (insn->x86_64 && X86_VEX_W(b2))
/* VEX.W overrides opnd_size */
insn->opnd_bytes = 8;
- } else if (inat_is_vex3_prefix(attr)) {
+ } else if (inat_is_vex3_prefix(attr) || inat_is_xop_prefix(attr)) {
b2 = peek_nbyte_next(insn_byte_t, insn, 2);
insn_set_byte(&insn->vex_prefix, 2, b2);
insn->vex_prefix.nbytes = 3;
insn->next_byte += 3;
if (insn->x86_64 && X86_VEX_W(b2))
- /* VEX.W overrides opnd_size */
+ /* VEX.W/XOP.W overrides opnd_size */
insn->opnd_bytes = 8;
} else {
/*
@@ -288,9 +291,22 @@ int insn_get_opcode(struct insn *insn)
insn_set_byte(opcode, 0, op);
opcode->nbytes = 1;
- /* Check if there is VEX prefix or not */
- if (insn_is_avx(insn)) {
+ /* Check if there is VEX/XOP prefix or not */
+ if (insn_is_avx_or_xop(insn)) {
insn_byte_t m, p;
+
+ /* XOP prefix has different encoding */
+ if (unlikely(avx_insn_is_xop(insn))) {
+ m = insn_xop_map_bits(insn);
+ insn->attr = inat_get_xop_attribute(op, m);
+ if (!inat_accept_xop(insn->attr)) {
+ insn->attr = 0;
+ return -EINVAL;
+ }
+ /* XOP has only 1 byte for opcode */
+ goto end;
+ }
+
m = insn_vex_m_bits(insn);
p = insn_vex_p_bits(insn);
insn->attr = inat_get_avx_attribute(op, m, p);
@@ -383,7 +399,8 @@ int insn_get_modrm(struct insn *insn)
pfx_id = insn_last_prefix_id(insn);
insn->attr = inat_get_group_attribute(mod, pfx_id,
insn->attr);
- if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) {
+ if (insn_is_avx_or_xop(insn) && !inat_accept_vex(insn->attr) &&
+ !inat_accept_xop(insn->attr)) {
/* Bad insn */
insn->attr = 0;
return -EINVAL;
diff --git a/tools/arch/x86/lib/x86-opcode-map.txt b/tools/arch/x86/lib/x86-opcode-map.txt
index 262f7ca1fb95..2a4e69ecc2de 100644
--- a/tools/arch/x86/lib/x86-opcode-map.txt
+++ b/tools/arch/x86/lib/x86-opcode-map.txt
@@ -27,6 +27,11 @@
# (evo): this opcode is changed by EVEX prefix (EVEX opcode)
# (v): this opcode requires VEX prefix.
# (v1): this opcode only supports 128bit VEX.
+# (xop): this opcode accepts XOP prefix.
+#
+# XOP Superscripts
+# (W=0): this opcode requires XOP.W == 0
+# (W=1): this opcode requires XOP.W == 1
#
# Last Prefix Superscripts
# - (66): the last prefix is 0x66
@@ -194,7 +199,7 @@ AVXcode:
8c: MOV Ev,Sw
8d: LEA Gv,M
8e: MOV Sw,Ew
-8f: Grp1A (1A) | POP Ev (d64)
+8f: Grp1A (1A) | POP Ev (d64) | XOP (Prefix)
# 0x90 - 0x9f
90: NOP | PAUSE (F3) | XCHG r8,rAX
91: XCHG rCX/r9,rAX
@@ -1106,6 +1111,84 @@ AVXcode: 7
f8: URDMSR Rq,Id (F2),(v1),(11B) | UWRMSR Id,Rq (F3),(v1),(11B)
EndTable
+# From AMD64 Architecture Programmer's Manual Vol3, Appendix A.1.5
+Table: XOP map 8h
+Referrer:
+XOPcode: 0
+85: VPMACSSWW Vo,Ho,Wo,Lo
+86: VPMACSSWD Vo,Ho,Wo,Lo
+87: VPMACSSDQL Vo,Ho,Wo,Lo
+8e: VPMACSSDD Vo,Ho,Wo,Lo
+8f: VPMACSSDQH Vo,Ho,Wo,Lo
+95: VPMACSWW Vo,Ho,Wo,Lo
+96: VPMACSWD Vo,Ho,Wo,Lo
+97: VPMACSDQL Vo,Ho,Wo,Lo
+9e: VPMACSDD Vo,Ho,Wo,Lo
+9f: VPMACSDQH Vo,Ho,Wo,Lo
+a2: VPCMOV Vx,Hx,Wx,Lx (W=0) | VPCMOV Vx,Hx,Lx,Wx (W=1)
+a3: VPPERM Vo,Ho,Wo,Lo (W=0) | VPPERM Vo,Ho,Lo,Wo (W=1)
+a6: VPMADCSSWD Vo,Ho,Wo,Lo
+b6: VPMADCSWD Vo,Ho,Wo,Lo
+c0: VPROTB Vo,Wo,Ib
+c1: VPROTW Vo,Wo,Ib
+c2: VPROTD Vo,Wo,Ib
+c3: VPROTQ Vo,Wo,Ib
+cc: VPCOMccB Vo,Ho,Wo,Ib
+cd: VPCOMccW Vo,Ho,Wo,Ib
+ce: VPCOMccD Vo,Ho,Wo,Ib
+cf: VPCOMccQ Vo,Ho,Wo,Ib
+ec: VPCOMccUB Vo,Ho,Wo,Ib
+ed: VPCOMccUW Vo,Ho,Wo,Ib
+ee: VPCOMccUD Vo,Ho,Wo,Ib
+ef: VPCOMccUQ Vo,Ho,Wo,Ib
+EndTable
+
+Table: XOP map 9h
+Referrer:
+XOPcode: 1
+01: GrpXOP1
+02: GrpXOP2
+12: GrpXOP3
+80: VFRCZPS Vx,Wx
+81: VFRCZPD Vx,Wx
+82: VFRCZSS Vq,Wss
+83: VFRCZSD Vq,Wsd
+90: VPROTB Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1)
+91: VPROTW Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1)
+92: VPROTD Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1)
+93: VPROTQ Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1)
+94: VPSHLB Vo,Wo,Ho (W=0) | VPSHLB Vo,Ho,Wo (W=1)
+95: VPSHLW Vo,Wo,Ho (W=0) | VPSHLW Vo,Ho,Wo (W=1)
+96: VPSHLD Vo,Wo,Ho (W=0) | VPSHLD Vo,Ho,Wo (W=1)
+97: VPSHLQ Vo,Wo,Ho (W=0) | VPSHLQ Vo,Ho,Wo (W=1)
+98: VPSHAB Vo,Wo,Ho (W=0) | VPSHAB Vo,Ho,Wo (W=1)
+99: VPSHAW Vo,Wo,Ho (W=0) | VPSHAW Vo,Ho,Wo (W=1)
+9a: VPSHAD Vo,Wo,Ho (W=0) | VPSHAD Vo,Ho,Wo (W=1)
+9b: VPSHAQ Vo,Wo,Ho (W=0) | VPSHAQ Vo,Ho,Wo (W=1)
+c1: VPHADDBW Vo,Wo
+c2: VPHADDBD Vo,Wo
+c3: VPHADDBQ Vo,Wo
+c6: VPHADDWD Vo,Wo
+c7: VPHADDWQ Vo,Wo
+cb: VPHADDDQ Vo,Wo
+d1: VPHADDUBWD Vo,Wo
+d2: VPHADDUBD Vo,Wo
+d3: VPHADDUBQ Vo,Wo
+d6: VPHADDUWD Vo,Wo
+d7: VPHADDUWQ Vo,Wo
+db: VPHADDUDQ Vo,Wo
+e1: VPHSUBBW Vo,Wo
+e2: VPHSUBWD Vo,Wo
+e3: VPHSUBDQ Vo,Wo
+EndTable
+
+Table: XOP map Ah
+Referrer:
+XOPcode: 2
+10: BEXTR Gy,Ey,Id
+12: GrpXOP4
+EndTable
+
GrpTable: Grp1
0: ADD
1: OR
@@ -1320,3 +1403,29 @@ GrpTable: GrpRNG
4: xcrypt-cfb
5: xcrypt-ofb
EndTable
+
+# GrpXOP1-4 is shown in AMD APM Vol.3 Appendix A as XOP group #1-4
+GrpTable: GrpXOP1
+1: BLCFILL By,Ey (xop)
+2: BLSFILL By,Ey (xop)
+3: BLCS By,Ey (xop)
+4: TZMSK By,Ey (xop)
+5: BLCIC By,Ey (xop)
+6: BLSIC By,Ey (xop)
+7: T1MSKC By,Ey (xop)
+EndTable
+
+GrpTable: GrpXOP2
+1: BLCMSK By,Ey (xop)
+6: BLCI By,Ey (xop)
+EndTable
+
+GrpTable: GrpXOP3
+0: LLWPCB Ry (xop)
+1: SLWPCB Ry (xop)
+EndTable
+
+GrpTable: GrpXOP4
+0: LWPINS By,Ed,Id (xop)
+1: LWPVAL By,Ed,Id (xop)
+EndTable
diff --git a/tools/arch/x86/tools/gen-insn-attr-x86.awk b/tools/arch/x86/tools/gen-insn-attr-x86.awk
index 2c19d7fc8a85..7ea1b75e59b7 100644
--- a/tools/arch/x86/tools/gen-insn-attr-x86.awk
+++ b/tools/arch/x86/tools/gen-insn-attr-x86.awk
@@ -21,6 +21,7 @@ function clear_vars() {
eid = -1 # escape id
gid = -1 # group id
aid = -1 # AVX id
+ xopid = -1 # XOP id
tname = ""
}
@@ -39,9 +40,11 @@ BEGIN {
ggid = 1
geid = 1
gaid = 0
+ gxopid = 0
delete etable
delete gtable
delete atable
+ delete xoptable
opnd_expr = "^[A-Za-z/]"
ext_expr = "^\\("
@@ -61,6 +64,7 @@ BEGIN {
imm_flag["Ob"] = "INAT_MOFFSET"
imm_flag["Ov"] = "INAT_MOFFSET"
imm_flag["Lx"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
+ imm_flag["Lo"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])"
force64_expr = "\\([df]64\\)"
@@ -87,6 +91,8 @@ BEGIN {
evexonly_expr = "\\(ev\\)"
# (es) is the same as (ev) but also "SCALABLE" i.e. W and pp determine operand size
evex_scalable_expr = "\\(es\\)"
+ # All opcodes in XOP table or with (xop) superscript accept XOP prefix
+ xopok_expr = "\\(xop\\)"
prefix_expr = "\\(Prefix\\)"
prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ"
@@ -106,6 +112,7 @@ BEGIN {
prefix_num["VEX+2byte"] = "INAT_PFX_VEX3"
prefix_num["EVEX"] = "INAT_PFX_EVEX"
prefix_num["REX2"] = "INAT_PFX_REX2"
+ prefix_num["XOP"] = "INAT_PFX_XOP"
clear_vars()
}
@@ -147,6 +154,7 @@ function array_size(arr, i,c) {
if (NF != 1) {
# AVX/escape opcode table
aid = $2
+ xopid = -1
if (gaid <= aid)
gaid = aid + 1
if (tname == "") # AVX only opcode table
@@ -156,6 +164,20 @@ function array_size(arr, i,c) {
tname = "inat_primary_table"
}
+/^XOPcode:/ {
+ if (NF != 1) {
+ # XOP opcode table
+ xopid = $2
+ aid = -1
+ if (gxopid <= xopid)
+ gxopid = xopid + 1
+ if (tname == "") # XOP only opcode table
+ tname = sprintf("inat_xop_table_%d", $2)
+ }
+ if (xopid == -1 && eid == -1) # primary opcode table
+ tname = "inat_primary_table"
+}
+
/^GrpTable:/ {
print "/* " $0 " */"
if (!($2 in group))
@@ -206,6 +228,8 @@ function print_table(tbl,name,fmt,n)
etable[eid,0] = tname
if (aid >= 0)
atable[aid,0] = tname
+ else if (xopid >= 0)
+ xoptable[xopid] = tname
}
if (array_size(lptable1) != 0) {
print_table(lptable1,tname "_1[INAT_OPCODE_TABLE_SIZE]",
@@ -347,6 +371,8 @@ function convert_operands(count,opnd, i,j,imm,mod)
flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY")
else if (match(ext, vexok_expr) || match(opcode, vexok_opcode_expr))
flags = add_flags(flags, "INAT_VEXOK")
+ else if (match(ext, xopok_expr) || xopid >= 0)
+ flags = add_flags(flags, "INAT_XOPOK")
# check prefixes
if (match(ext, prefix_expr)) {
@@ -413,6 +439,14 @@ END {
print " ["i"]["j"] = "atable[i,j]","
print "};\n"
+ print "/* XOP opcode map array */"
+ print "const insn_attr_t * const inat_xop_tables[X86_XOP_M_MAX - X86_XOP_M_MIN + 1]" \
+ " = {"
+ for (i = 0; i < gxopid; i++)
+ if (xoptable[i])
+ print " ["i"] = "xoptable[i]","
+ print "};"
+
print "#else /* !__BOOT_COMPRESSED */\n"
print "/* Escape opcode map array */"
@@ -430,6 +464,10 @@ END {
"[INAT_LSTPFX_MAX + 1];"
print ""
+ print "/* XOP opcode map array */"
+ print "static const insn_attr_t *inat_xop_tables[X86_XOP_M_MAX - X86_XOP_M_MIN + 1];"
+ print ""
+
print "static void inat_init_tables(void)"
print "{"
@@ -455,6 +493,12 @@ END {
if (atable[i,j])
print "\tinat_avx_tables["i"]["j"] = "atable[i,j]";"
+ print ""
+ print "\t/* Print XOP opcode map array */"
+ for (i = 0; i < gxopid; i++)
+ if (xoptable[i])
+ print "\tinat_xop_tables["i"] = "xoptable[i]";"
+
print "}"
print "#endif"
}
diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c
index 57c669d2aa90..55d59ed507d5 100644
--- a/tools/bootconfig/main.c
+++ b/tools/bootconfig/main.c
@@ -193,7 +193,7 @@ static int load_xbc_from_initrd(int fd, char **buf)
if (stat.st_size < BOOTCONFIG_FOOTER_SIZE)
return 0;
- if (lseek(fd, -BOOTCONFIG_MAGIC_LEN, SEEK_END) < 0)
+ if (lseek(fd, -(off_t)BOOTCONFIG_MAGIC_LEN, SEEK_END) < 0)
return pr_errno("Failed to lseek for magic", -errno);
if (read(fd, magic, BOOTCONFIG_MAGIC_LEN) < 0)
@@ -203,7 +203,7 @@ static int load_xbc_from_initrd(int fd, char **buf)
if (memcmp(magic, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN) != 0)
return 0;
- if (lseek(fd, -BOOTCONFIG_FOOTER_SIZE, SEEK_END) < 0)
+ if (lseek(fd, -(off_t)BOOTCONFIG_FOOTER_SIZE, SEEK_END) < 0)
return pr_errno("Failed to lseek for size", -errno);
if (read(fd, &size, sizeof(uint32_t)) < 0)
diff --git a/tools/gpio/Makefile b/tools/gpio/Makefile
index ed565eb52275..342e056c8c66 100644
--- a/tools/gpio/Makefile
+++ b/tools/gpio/Makefile
@@ -77,7 +77,7 @@ $(OUTPUT)gpio-watch: $(GPIO_WATCH_IN)
clean:
rm -f $(ALL_PROGRAMS)
- rm -f $(OUTPUT)include/linux/gpio.h
+ rm -rf $(OUTPUT)include
find $(or $(OUTPUT),.) -name '*.o' -delete -o -name '\.*.d' -delete -o -name '\.*.cmd' -delete
install: $(ALL_PROGRAMS)
diff --git a/tools/include/linux/args.h b/tools/include/linux/args.h
new file mode 100644
index 000000000000..2e8e65d975c7
--- /dev/null
+++ b/tools/include/linux/args.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_ARGS_H
+#define _LINUX_ARGS_H
+
+/*
+ * How do these macros work?
+ *
+ * In __COUNT_ARGS() _0 to _12 are just placeholders from the start
+ * in order to make sure _n is positioned over the correct number
+ * from 12 to 0 (depending on X, which is a variadic argument list).
+ * They serve no purpose other than occupying a position. Since each
+ * macro parameter must have a distinct identifier, those identifiers
+ * are as good as any.
+ *
+ * In COUNT_ARGS() we use actual integers, so __COUNT_ARGS() returns
+ * that as _n.
+ */
+
+/* This counts to 15. Any more, it will return 16th argument. */
+#define __COUNT_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _n, X...) _n
+#define COUNT_ARGS(X...) __COUNT_ARGS(, ##X, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
+
+/* Concatenate two parameters, but allow them to be expanded beforehand. */
+#define __CONCAT(a, b) a ## b
+#define CONCATENATE(a, b) __CONCAT(a, b)
+
+#endif /* _LINUX_ARGS_H */
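[Usage sketch for the new header; the do_call_* and CALL_VARIANT names are hypothetical, not from the patch:]

#include <linux/args.h>

/* Dispatch to a helper named after the number of arguments supplied:
 * COUNT_ARGS(a, b, c) expands to 3, CONCATENATE() pastes the expanded
 * tokens, so CALL_VARIANT(a, b, c) becomes do_call_3(a, b, c).
 */
#define CALL_VARIANT(...) \
	CONCATENATE(do_call_, COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__)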
diff --git a/tools/include/linux/bits.h b/tools/include/linux/bits.h
index 7ad056219115..a40cc861b3a7 100644
--- a/tools/include/linux/bits.h
+++ b/tools/include/linux/bits.h
@@ -2,10 +2,8 @@
#ifndef __LINUX_BITS_H
#define __LINUX_BITS_H
-#include <linux/const.h>
#include <vdso/bits.h>
#include <uapi/linux/bits.h>
-#include <asm/bitsperlong.h>
#define BIT_MASK(nr) (UL(1) << ((nr) % BITS_PER_LONG))
#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
@@ -50,10 +48,14 @@
(type_max(t) << (l) & \
type_max(t) >> (BITS_PER_TYPE(t) - 1 - (h)))))
+#define GENMASK(h, l) GENMASK_TYPE(unsigned long, h, l)
+#define GENMASK_ULL(h, l) GENMASK_TYPE(unsigned long long, h, l)
+
#define GENMASK_U8(h, l) GENMASK_TYPE(u8, h, l)
#define GENMASK_U16(h, l) GENMASK_TYPE(u16, h, l)
#define GENMASK_U32(h, l) GENMASK_TYPE(u32, h, l)
#define GENMASK_U64(h, l) GENMASK_TYPE(u64, h, l)
+#define GENMASK_U128(h, l) GENMASK_TYPE(u128, h, l)
/*
* Fixed-type variants of BIT(), with additional checks like GENMASK_TYPE(). The
@@ -79,28 +81,9 @@
* BUILD_BUG_ON_ZERO is not available in h files included from asm files,
* disable the input check if that is the case.
*/
-#define GENMASK_INPUT_CHECK(h, l) 0
+#define GENMASK(h, l) __GENMASK(h, l)
+#define GENMASK_ULL(h, l) __GENMASK_ULL(h, l)
#endif /* !defined(__ASSEMBLY__) */
-#define GENMASK(h, l) \
- (GENMASK_INPUT_CHECK(h, l) + __GENMASK(h, l))
-#define GENMASK_ULL(h, l) \
- (GENMASK_INPUT_CHECK(h, l) + __GENMASK_ULL(h, l))
-
-#if !defined(__ASSEMBLY__)
-/*
- * Missing asm support
- *
- * __GENMASK_U128() depends on _BIT128() which would not work
- * in the asm code, as it shifts an 'unsigned __int128' data
- * type instead of direct representation of 128 bit constants
- * such as long and unsigned long. The fundamental problem is
- * that a 128 bit constant will get silently truncated by the
- * gcc compiler.
- */
-#define GENMASK_U128(h, l) \
- (GENMASK_INPUT_CHECK(h, l) + __GENMASK_U128(h, l))
-#endif
-
#endif /* __LINUX_BITS_H */
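[Worked examples for the consolidated GENMASK_TYPE()-based macros above; editorial, not part of the patch:]

/* GENMASK(h, l) sets bits l..h inclusive in the named type:
 *   GENMASK(7, 4)       == 0xf0UL
 *   GENMASK_ULL(39, 8)  == 0x000000ffffffff00ULL
 *   GENMASK_U8(3, 0)    == 0x0f
 */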
diff --git a/tools/include/linux/cfi_types.h b/tools/include/linux/cfi_types.h
index 6b8713675765..fb8d90bff92e 100644
--- a/tools/include/linux/cfi_types.h
+++ b/tools/include/linux/cfi_types.h
@@ -8,7 +8,7 @@
#ifdef __ASSEMBLY__
#include <linux/linkage.h>
-#ifdef CONFIG_CFI_CLANG
+#ifdef CONFIG_CFI
/*
* Use the __kcfi_typeid_<function> type identifier symbol to
* annotate indirectly called assembly functions. The compiler emits
@@ -29,17 +29,40 @@
#define SYM_TYPED_START(name, linkage, align...) \
SYM_TYPED_ENTRY(name, linkage, align)
-#else /* CONFIG_CFI_CLANG */
+#else /* CONFIG_CFI */
#define SYM_TYPED_START(name, linkage, align...) \
SYM_START(name, linkage, align)
-#endif /* CONFIG_CFI_CLANG */
+#endif /* CONFIG_CFI */
#ifndef SYM_TYPED_FUNC_START
#define SYM_TYPED_FUNC_START(name) \
SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)
#endif
+#else /* __ASSEMBLY__ */
+
+#ifdef CONFIG_CFI_CLANG
+#define DEFINE_CFI_TYPE(name, func) \
+ /* \
+ * Force a reference to the function so the compiler generates \
+ * __kcfi_typeid_<func>. \
+ */ \
+ __ADDRESSABLE(func); \
+ /* u32 name __ro_after_init = __kcfi_typeid_<func> */ \
+ extern u32 name; \
+ asm ( \
+ " .pushsection .data..ro_after_init,\"aw\",\%progbits \n" \
+ " .type " #name ",\%object \n" \
+ " .globl " #name " \n" \
+ " .p2align 2, 0x0 \n" \
+ #name ": \n" \
+ " .4byte __kcfi_typeid_" #func " \n" \
+ " .size " #name ", 4 \n" \
+ " .popsection \n" \
+ );
+#endif
+
#endif /* __ASSEMBLY__ */
#endif /* _LINUX_CFI_TYPES_H */
diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h
index 2892a45023af..04e0077fb4c9 100644
--- a/tools/include/uapi/asm-generic/unistd.h
+++ b/tools/include/uapi/asm-generic/unistd.h
@@ -852,8 +852,14 @@ __SYSCALL(__NR_removexattrat, sys_removexattrat)
#define __NR_open_tree_attr 467
__SYSCALL(__NR_open_tree_attr, sys_open_tree_attr)
+/* fs/inode.c */
+#define __NR_file_getattr 468
+__SYSCALL(__NR_file_getattr, sys_file_getattr)
+#define __NR_file_setattr 469
+__SYSCALL(__NR_file_setattr, sys_file_setattr)
+
#undef __NR_syscalls
-#define __NR_syscalls 468
+#define __NR_syscalls 470
/*
* 32 bit systems traditionally used different
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index 7415a3863891..f0f0d49d2544 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -178,6 +178,7 @@ struct kvm_xen_exit {
#define KVM_EXIT_NOTIFY 37
#define KVM_EXIT_LOONGARCH_IOCSR 38
#define KVM_EXIT_MEMORY_FAULT 39
+#define KVM_EXIT_TDX 40
/* For KVM_EXIT_INTERNAL_ERROR */
/* Emulate instruction failed. */
@@ -447,6 +448,31 @@ struct kvm_run {
__u64 gpa;
__u64 size;
} memory_fault;
+ /* KVM_EXIT_TDX */
+ struct {
+ __u64 flags;
+ __u64 nr;
+ union {
+ struct {
+ __u64 ret;
+ __u64 data[5];
+ } unknown;
+ struct {
+ __u64 ret;
+ __u64 gpa;
+ __u64 size;
+ } get_quote;
+ struct {
+ __u64 ret;
+ __u64 leaf;
+ __u64 r11, r12, r13, r14;
+ } get_tdvmcall_info;
+ struct {
+ __u64 ret;
+ __u64 vector;
+ } setup_event_notify;
+ };
+ } tdx;
/* Fix the size of the union. */
char padding[256];
};
@@ -935,6 +961,7 @@ struct kvm_enable_cap {
#define KVM_CAP_ARM_EL2 240
#define KVM_CAP_ARM_EL2_E2H0 241
#define KVM_CAP_RISCV_MP_STATE_RESET 242
+#define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243
struct kvm_irq_routing_irqchip {
__u32 irqchip;
diff --git a/tools/include/uapi/linux/nsfs.h b/tools/include/uapi/linux/nsfs.h
index 34127653fd00..33c9b578b3b2 100644
--- a/tools/include/uapi/linux/nsfs.h
+++ b/tools/include/uapi/linux/nsfs.h
@@ -16,8 +16,6 @@
#define NS_GET_NSTYPE _IO(NSIO, 0x3)
/* Get owner UID (in the caller's user namespace) for a user namespace */
#define NS_GET_OWNER_UID _IO(NSIO, 0x4)
-/* Get the id for a mount namespace */
-#define NS_GET_MNTNS_ID _IOR(NSIO, 0x5, __u64)
/* Translate pid from target pid namespace into the caller's pid namespace. */
#define NS_GET_PID_FROM_PIDNS _IOR(NSIO, 0x6, int)
/* Return thread-group leader id of pid in the callers pid namespace. */
@@ -42,4 +40,19 @@ struct mnt_ns_info {
/* Get previous namespace. */
#define NS_MNT_GET_PREV _IOR(NSIO, 12, struct mnt_ns_info)
+/* Retrieve namespace identifiers. */
+#define NS_GET_MNTNS_ID _IOR(NSIO, 5, __u64)
+#define NS_GET_ID _IOR(NSIO, 13, __u64)
+
+enum init_ns_ino {
+ IPC_NS_INIT_INO = 0xEFFFFFFFU,
+ UTS_NS_INIT_INO = 0xEFFFFFFEU,
+ USER_NS_INIT_INO = 0xEFFFFFFDU,
+ PID_NS_INIT_INO = 0xEFFFFFFCU,
+ CGROUP_NS_INIT_INO = 0xEFFFFFFBU,
+ TIME_NS_INIT_INO = 0xEFFFFFFAU,
+ NET_NS_INIT_INO = 0xEFFFFFF9U,
+ MNT_NS_INIT_INO = 0xEFFFFFF8U,
+};
+
#endif /* __LINUX_NSFS_H */
diff --git a/tools/lib/subcmd/help.c b/tools/lib/subcmd/help.c
index 9ef569492560..ddaeb4eb3e24 100644
--- a/tools/lib/subcmd/help.c
+++ b/tools/lib/subcmd/help.c
@@ -75,6 +75,9 @@ void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes)
size_t ci, cj, ei;
int cmp;
+ if (!excludes->cnt)
+ return;
+
ci = cj = ei = 0;
while (ci < cmds->cnt && ei < excludes->cnt) {
cmp = strcmp(cmds->names[ci]->name, excludes->names[ei]->name);
diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py
index ef032e17fec4..eb295756c3bf 100755
--- a/tools/net/ynl/pyynl/ynl_gen_c.py
+++ b/tools/net/ynl/pyynl/ynl_gen_c.py
@@ -830,7 +830,7 @@ class TypeArrayNest(Type):
'ynl_attr_for_each_nested(attr2, attr) {',
'\tif (ynl_attr_validate(yarg, attr2))',
'\t\treturn YNL_PARSE_CB_ERROR;',
- f'\t{var}->_count.{self.c_name}++;',
+ f'\tn_{self.c_name}++;',
'}']
return get_lines, None, local_vars
diff --git a/tools/objtool/arch/loongarch/decode.c b/tools/objtool/arch/loongarch/decode.c
index b6fdc68053cc..2e555c4060c5 100644
--- a/tools/objtool/arch/loongarch/decode.c
+++ b/tools/objtool/arch/loongarch/decode.c
@@ -278,6 +278,25 @@ static bool decode_insn_reg2i16_fomat(union loongarch_instruction inst,
return true;
}
+static bool decode_insn_reg3_fomat(union loongarch_instruction inst,
+ struct instruction *insn)
+{
+ switch (inst.reg3_format.opcode) {
+ case amswapw_op:
+ if (inst.reg3_format.rd == LOONGARCH_GPR_ZERO &&
+ inst.reg3_format.rk == LOONGARCH_GPR_RA &&
+ inst.reg3_format.rj == LOONGARCH_GPR_ZERO) {
+ /* amswap.w $zero, $ra, $zero */
+ insn->type = INSN_BUG;
+ }
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
int arch_decode_instruction(struct objtool_file *file, const struct section *sec,
unsigned long offset, unsigned int maxlen,
struct instruction *insn)
@@ -309,11 +328,19 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
return 0;
if (decode_insn_reg2i16_fomat(inst, insn))
return 0;
+ if (decode_insn_reg3_fomat(inst, insn))
+ return 0;
- if (inst.word == 0)
+ if (inst.word == 0) {
+ /* andi $zero, $zero, 0x0 */
insn->type = INSN_NOP;
- else if (inst.reg0i15_format.opcode == break_op) {
- /* break */
+ } else if (inst.reg0i15_format.opcode == break_op &&
+ inst.reg0i15_format.immediate == 0x0) {
+ /* break 0x0 */
+ insn->type = INSN_TRAP;
+ } else if (inst.reg0i15_format.opcode == break_op &&
+ inst.reg0i15_format.immediate == 0x1) {
+ /* break 0x1 */
insn->type = INSN_BUG;
} else if (inst.reg2_format.opcode == ertn_op) {
/* ertn */
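[Editorial summary of the instruction classifications introduced in the hunk above; taken directly from the patch, no new behaviour:]

/*   word == 0 ("andi $zero, $zero, 0x0")  -> INSN_NOP
 *   "break 0x0"                           -> INSN_TRAP
 *   "break 0x1"                           -> INSN_BUG
 *   "amswap.w $zero, $ra, $zero"          -> INSN_BUG
 */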
diff --git a/tools/objtool/arch/loongarch/special.c b/tools/objtool/arch/loongarch/special.c
index e39f86d97002..a80b75f7b061 100644
--- a/tools/objtool/arch/loongarch/special.c
+++ b/tools/objtool/arch/loongarch/special.c
@@ -27,6 +27,7 @@ static void get_rodata_table_size_by_table_annotate(struct objtool_file *file,
struct table_info *next_table;
unsigned long tmp_insn_offset;
unsigned long tmp_rodata_offset;
+ bool is_valid_list = false;
rsec = find_section_by_name(file->elf, ".rela.discard.tablejump_annotate");
if (!rsec)
@@ -35,6 +36,12 @@ static void get_rodata_table_size_by_table_annotate(struct objtool_file *file,
INIT_LIST_HEAD(&table_list);
for_each_reloc(rsec, reloc) {
+ if (reloc->sym->sec->rodata)
+ continue;
+
+ if (strcmp(insn->sec->name, reloc->sym->sec->name))
+ continue;
+
orig_table = malloc(sizeof(struct table_info));
if (!orig_table) {
WARN("malloc failed");
@@ -49,6 +56,22 @@ static void get_rodata_table_size_by_table_annotate(struct objtool_file *file,
if (reloc_idx(reloc) + 1 == sec_num_entries(rsec))
break;
+
+ if (strcmp(insn->sec->name, (reloc + 1)->sym->sec->name)) {
+ list_for_each_entry(orig_table, &table_list, jump_info) {
+ if (orig_table->insn_offset == insn->offset) {
+ is_valid_list = true;
+ break;
+ }
+ }
+
+ if (!is_valid_list) {
+ list_del_init(&table_list);
+ continue;
+ }
+
+ break;
+ }
}
list_for_each_entry(orig_table, &table_list, jump_info) {
diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
index 98c4713c1b09..0ad5cc70ecbe 100644
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -880,3 +880,15 @@ unsigned int arch_reloc_size(struct reloc *reloc)
return 8;
}
}
+
+bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc)
+{
+ switch (reloc_type(reloc)) {
+ case R_X86_64_32:
+ case R_X86_64_32S:
+ case R_X86_64_64:
+ return true;
+ default:
+ return false;
+ }
+}
diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c
index 80239843e9f0..0f6b197cfcb0 100644
--- a/tools/objtool/builtin-check.c
+++ b/tools/objtool/builtin-check.c
@@ -87,6 +87,7 @@ static const struct option check_options[] = {
OPT_BOOLEAN('t', "static-call", &opts.static_call, "annotate static calls"),
OPT_BOOLEAN('u', "uaccess", &opts.uaccess, "validate uaccess rules for SMAP"),
OPT_BOOLEAN(0 , "cfi", &opts.cfi, "annotate kernel control flow integrity (kCFI) function preambles"),
+ OPT_BOOLEAN(0 , "noabs", &opts.noabs, "reject absolute references in allocatable sections"),
OPT_CALLBACK_OPTARG(0, "dump", NULL, NULL, "orc", "dump metadata", parse_dump),
OPT_GROUP("Options:"),
@@ -162,6 +163,7 @@ static bool opts_valid(void)
opts.hack_noinstr ||
opts.ibt ||
opts.mcount ||
+ opts.noabs ||
opts.noinstr ||
opts.orc ||
opts.retpoline ||
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index d14f20ef1db1..093fcd01dd6e 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -3564,7 +3564,9 @@ static int validate_branch(struct objtool_file *file, struct symbol *func,
if (func && insn_func(insn) && func != insn_func(insn)->pfunc) {
/* Ignore KCFI type preambles, which always fall through */
if (!strncmp(func->name, "__cfi_", 6) ||
- !strncmp(func->name, "__pfx_", 6))
+ !strncmp(func->name, "__pfx_", 6) ||
+ !strncmp(func->name, "__pi___cfi_", 11) ||
+ !strncmp(func->name, "__pi___pfx_", 11))
return 0;
if (file->ignore_unreachables)
@@ -4644,6 +4646,47 @@ static void disas_warned_funcs(struct objtool_file *file)
disas_funcs(funcs);
}
+__weak bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc)
+{
+ unsigned int type = reloc_type(reloc);
+ size_t sz = elf_addr_size(elf);
+
+ return (sz == 8) ? (type == R_ABS64) : (type == R_ABS32);
+}
+
+static int check_abs_references(struct objtool_file *file)
+{
+ struct section *sec;
+ struct reloc *reloc;
+ int ret = 0;
+
+ for_each_sec(file, sec) {
+ /* absolute references in non-loadable sections are fine */
+ if (!(sec->sh.sh_flags & SHF_ALLOC))
+ continue;
+
+ /* section must have an associated .rela section */
+ if (!sec->rsec)
+ continue;
+
+ /*
+ * Special case for compiler generated metadata that is not
+ * consumed until after boot.
+ */
+ if (!strcmp(sec->name, "__patchable_function_entries"))
+ continue;
+
+ for_each_reloc(sec->rsec, reloc) {
+ if (arch_absolute_reloc(file->elf, reloc)) {
+ WARN("section %s has absolute relocation at offset 0x%lx",
+ sec->name, reloc_offset(reloc));
+ ret++;
+ }
+ }
+ }
+ return ret;
+}
+
struct insn_chunk {
void *addr;
struct insn_chunk *next;
@@ -4777,6 +4820,9 @@ int check(struct objtool_file *file)
goto out;
}
+ if (opts.noabs)
+ warnings += check_abs_references(file);
+
if (opts.orc && nr_insns) {
ret = orc_create(file);
if (ret)
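
For reference, a minimal sketch (not from the patch) of the kind of construct the new --noabs check flags: initialized pointer data in an allocatable section needs an absolute relocation, which check_abs_references() reports. The file name, build command and exact relocation type are illustrative assumptions.

    /* abs_ref.c -- e.g. "gcc -c abs_ref.c", then run objtool with --noabs
     * on the resulting object (hypothetical invocation). */
    static void handler(void)
    {
    }

    /*
     * Storing the address of handler in initialized data emits an absolute
     * relocation (R_X86_64_64 on x86-64) into an SHF_ALLOC section, so the
     * --noabs pass is expected to warn about it.
     */
    void (* const handler_ptr)(void) = handler;
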
diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h
index 01ef6f415adf..be33c7b43180 100644
--- a/tools/objtool/include/objtool/arch.h
+++ b/tools/objtool/include/objtool/arch.h
@@ -97,6 +97,7 @@ bool arch_is_embedded_insn(struct symbol *sym);
int arch_rewrite_retpolines(struct objtool_file *file);
bool arch_pc_relative_reloc(struct reloc *reloc);
+bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc);
unsigned int arch_reloc_size(struct reloc *reloc);
unsigned long arch_jump_table_sym_offset(struct reloc *reloc, struct reloc *table);
diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h
index 6b08666fa69d..ab22673862e1 100644
--- a/tools/objtool/include/objtool/builtin.h
+++ b/tools/objtool/include/objtool/builtin.h
@@ -26,6 +26,7 @@ struct opts {
bool uaccess;
int prefix;
bool cfi;
+ bool noabs;
/* options: */
bool backtrace;
diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h
index 6a922d046b8e..802895fae3ca 100644
--- a/tools/objtool/noreturns.h
+++ b/tools/objtool/noreturns.h
@@ -45,7 +45,6 @@ NORETURN(rewind_stack_and_make_dead)
NORETURN(rust_begin_unwind)
NORETURN(rust_helper_BUG)
NORETURN(sev_es_terminate)
-NORETURN(snp_abort)
NORETURN(start_kernel)
NORETURN(stop_this_cpu)
NORETURN(usercopy_abort)
diff --git a/tools/perf/arch/arm/entry/syscalls/syscall.tbl b/tools/perf/arch/arm/entry/syscalls/syscall.tbl
index 27c1d5ebcd91..b07e699aaa3c 100644
--- a/tools/perf/arch/arm/entry/syscalls/syscall.tbl
+++ b/tools/perf/arch/arm/entry/syscalls/syscall.tbl
@@ -482,3 +482,5 @@
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
467 common open_tree_attr sys_open_tree_attr
+468 common file_getattr sys_file_getattr
+469 common file_setattr sys_file_setattr
diff --git a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl
index 1e8c44c7b614..7a7049c2c307 100644
--- a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl
+++ b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl
@@ -382,3 +382,5 @@
465 n64 listxattrat sys_listxattrat
466 n64 removexattrat sys_removexattrat
467 n64 open_tree_attr sys_open_tree_attr
+468 n64 file_getattr sys_file_getattr
+469 n64 file_setattr sys_file_setattr
diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
index 9a084bdb8926..b453e80dfc00 100644
--- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
+++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
@@ -558,3 +558,5 @@
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
467 common open_tree_attr sys_open_tree_attr
+468 common file_getattr sys_file_getattr
+469 common file_setattr sys_file_setattr
diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl
index a4569b96ef06..8a6744d658db 100644
--- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl
+++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl
@@ -470,3 +470,5 @@
465 common listxattrat sys_listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat sys_removexattrat
467 common open_tree_attr sys_open_tree_attr sys_open_tree_attr
+468 common file_getattr sys_file_getattr sys_file_getattr
+469 common file_setattr sys_file_setattr sys_file_setattr
diff --git a/tools/perf/arch/sh/entry/syscalls/syscall.tbl b/tools/perf/arch/sh/entry/syscalls/syscall.tbl
index 52a7652fcff6..5e9c9eff5539 100644
--- a/tools/perf/arch/sh/entry/syscalls/syscall.tbl
+++ b/tools/perf/arch/sh/entry/syscalls/syscall.tbl
@@ -471,3 +471,5 @@
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
467 common open_tree_attr sys_open_tree_attr
+468 common file_getattr sys_file_getattr
+469 common file_setattr sys_file_setattr
diff --git a/tools/perf/arch/sparc/entry/syscalls/syscall.tbl b/tools/perf/arch/sparc/entry/syscalls/syscall.tbl
index 83e45eb6c095..ebb7d06d1044 100644
--- a/tools/perf/arch/sparc/entry/syscalls/syscall.tbl
+++ b/tools/perf/arch/sparc/entry/syscalls/syscall.tbl
@@ -513,3 +513,5 @@
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
467 common open_tree_attr sys_open_tree_attr
+468 common file_getattr sys_file_getattr
+469 common file_setattr sys_file_setattr
diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_32.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_32.tbl
index ac007ea00979..4877e16da69a 100644
--- a/tools/perf/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/tools/perf/arch/x86/entry/syscalls/syscall_32.tbl
@@ -473,3 +473,5 @@
465 i386 listxattrat sys_listxattrat
466 i386 removexattrat sys_removexattrat
467 i386 open_tree_attr sys_open_tree_attr
+468 i386 file_getattr sys_file_getattr
+469 i386 file_setattr sys_file_setattr
diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
index cfb5ca41e30d..92cf0fe2291e 100644
--- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
@@ -391,6 +391,8 @@
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
467 common open_tree_attr sys_open_tree_attr
+468 common file_getattr sys_file_getattr
+469 common file_setattr sys_file_setattr
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/tools/perf/arch/x86/tests/topdown.c b/tools/perf/arch/x86/tests/topdown.c
index 8d0ea7a4bbc1..1eba3b4594ef 100644
--- a/tools/perf/arch/x86/tests/topdown.c
+++ b/tools/perf/arch/x86/tests/topdown.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "arch-tests.h"
#include "../util/topdown.h"
+#include "debug.h"
#include "evlist.h"
#include "parse-events.h"
#include "pmu.h"
diff --git a/tools/perf/arch/xtensa/entry/syscalls/syscall.tbl b/tools/perf/arch/xtensa/entry/syscalls/syscall.tbl
index f657a77314f8..374e4cb788d8 100644
--- a/tools/perf/arch/xtensa/entry/syscalls/syscall.tbl
+++ b/tools/perf/arch/xtensa/entry/syscalls/syscall.tbl
@@ -438,3 +438,5 @@
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
467 common open_tree_attr sys_open_tree_attr
+468 common file_getattr sys_file_getattr
+469 common file_setattr sys_file_setattr
diff --git a/tools/perf/bench/inject-buildid.c b/tools/perf/bench/inject-buildid.c
index aad572a78d7f..12387ea88b9a 100644
--- a/tools/perf/bench/inject-buildid.c
+++ b/tools/perf/bench/inject-buildid.c
@@ -85,7 +85,7 @@ static int add_dso(const char *fpath, const struct stat *sb __maybe_unused,
if (typeflag == FTW_D || typeflag == FTW_SL)
return 0;
- if (filename__read_build_id(fpath, &bid) < 0)
+ if (filename__read_build_id(fpath, &bid, /*block=*/true) < 0)
return 0;
dso->name = realpath(fpath, NULL);
diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c
index c98104481c8a..2e0f2004696a 100644
--- a/tools/perf/builtin-buildid-cache.c
+++ b/tools/perf/builtin-buildid-cache.c
@@ -180,7 +180,7 @@ static int build_id_cache__add_file(const char *filename, struct nsinfo *nsi)
struct nscookie nsc;
nsinfo__mountns_enter(nsi, &nsc);
- err = filename__read_build_id(filename, &bid);
+ err = filename__read_build_id(filename, &bid, /*block=*/true);
nsinfo__mountns_exit(&nsc);
if (err < 0) {
pr_debug("Couldn't read a build-id in %s\n", filename);
@@ -204,7 +204,7 @@ static int build_id_cache__remove_file(const char *filename, struct nsinfo *nsi)
int err;
nsinfo__mountns_enter(nsi, &nsc);
- err = filename__read_build_id(filename, &bid);
+ err = filename__read_build_id(filename, &bid, /*block=*/true);
nsinfo__mountns_exit(&nsc);
if (err < 0) {
pr_debug("Couldn't read a build-id in %s\n", filename);
@@ -280,7 +280,7 @@ static bool dso__missing_buildid_cache(struct dso *dso, int parm __maybe_unused)
if (!dso__build_id_filename(dso, filename, sizeof(filename), false))
return true;
- if (filename__read_build_id(filename, &bid) == -1) {
+ if (filename__read_build_id(filename, &bid, /*block=*/true) == -1) {
if (errno == ENOENT)
return false;
@@ -309,7 +309,7 @@ static int build_id_cache__update_file(const char *filename, struct nsinfo *nsi)
int err;
nsinfo__mountns_enter(nsi, &nsc);
- err = filename__read_build_id(filename, &bid);
+ err = filename__read_build_id(filename, &bid, /*block=*/true);
nsinfo__mountns_exit(&nsc);
if (err < 0) {
pr_debug("Couldn't read a build-id in %s\n", filename);
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index 40ba6a94f719..a114b3fa1bea 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -680,12 +680,12 @@ static int dso__read_build_id(struct dso *dso)
mutex_lock(dso__lock(dso));
nsinfo__mountns_enter(dso__nsinfo(dso), &nsc);
- if (filename__read_build_id(dso__long_name(dso), &bid) > 0)
+ if (filename__read_build_id(dso__long_name(dso), &bid, /*block=*/true) > 0)
dso__set_build_id(dso, &bid);
else if (dso__nsinfo(dso)) {
char *new_name = dso__filename_with_chroot(dso, dso__long_name(dso));
- if (new_name && filename__read_build_id(new_name, &bid) > 0)
+ if (new_name && filename__read_build_id(new_name, &bid, /*block=*/true) > 0)
dso__set_build_id(dso, &bid);
free(new_name);
}
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index fd49703021fd..078634461df2 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -2009,6 +2009,7 @@ static int __cmd_contention(int argc, const char **argv)
.owner = show_lock_owner,
.cgroups = RB_ROOT,
};
+ struct perf_env host_env;
lockhash_table = calloc(LOCKHASH_SIZE, sizeof(*lockhash_table));
if (!lockhash_table)
@@ -2024,7 +2025,10 @@ static int __cmd_contention(int argc, const char **argv)
eops.mmap = perf_event__process_mmap;
eops.tracing_data = perf_event__process_tracing_data;
- session = perf_session__new(use_bpf ? NULL : &data, &eops);
+ perf_env__init(&host_env);
+ session = __perf_session__new(use_bpf ? NULL : &data, &eops,
+ /*trace_event_repipe=*/false, &host_env);
+
if (IS_ERR(session)) {
pr_err("Initializing perf session failed\n");
err = PTR_ERR(session);
@@ -2142,6 +2146,7 @@ out_delete:
evlist__delete(con.evlist);
lock_contention_finish(&con);
perf_session__delete(session);
+ perf_env__exit(&host_env);
zfree(&lockhash_table);
return err;
}
diff --git a/tools/perf/tests/pe-file-parsing.c b/tools/perf/tests/pe-file-parsing.c
index 30c7da79e109..8b31d1d05f90 100644
--- a/tools/perf/tests/pe-file-parsing.c
+++ b/tools/perf/tests/pe-file-parsing.c
@@ -37,7 +37,7 @@ static int run_dir(const char *d)
size_t idx;
scnprintf(filename, PATH_MAX, "%s/pe-file.exe", d);
- ret = filename__read_build_id(filename, &bid);
+ ret = filename__read_build_id(filename, &bid, /*block=*/true);
TEST_ASSERT_VAL("Failed to read build_id",
ret == sizeof(expect_build_id));
TEST_ASSERT_VAL("Wrong build_id", !memcmp(bid.data, expect_build_id,
@@ -49,7 +49,7 @@ static int run_dir(const char *d)
!strcmp(debuglink, expect_debuglink));
scnprintf(debugfile, PATH_MAX, "%s/%s", d, debuglink);
- ret = filename__read_build_id(debugfile, &bid);
+ ret = filename__read_build_id(debugfile, &bid, /*block=*/true);
TEST_ASSERT_VAL("Failed to read debug file build_id",
ret == sizeof(expect_build_id));
TEST_ASSERT_VAL("Wrong build_id", !memcmp(bid.data, expect_build_id,
diff --git a/tools/perf/tests/sdt.c b/tools/perf/tests/sdt.c
index 93baee2eae42..6132f1af3e22 100644
--- a/tools/perf/tests/sdt.c
+++ b/tools/perf/tests/sdt.c
@@ -31,7 +31,7 @@ static int build_id_cache__add_file(const char *filename)
struct build_id bid = { .size = 0, };
int err;
- err = filename__read_build_id(filename, &bid);
+ err = filename__read_build_id(filename, &bid, /*block=*/true);
if (err < 0) {
pr_debug("Failed to read build id of %s\n", filename);
return err;
diff --git a/tools/perf/tests/shell/test_bpf_metadata.sh b/tools/perf/tests/shell/test_bpf_metadata.sh
index 69e3c2055134..be67d56e0f09 100755
--- a/tools/perf/tests/shell/test_bpf_metadata.sh
+++ b/tools/perf/tests/shell/test_bpf_metadata.sh
@@ -61,7 +61,7 @@ test_bpf_metadata() {
/perf_version/ {
if (entry) print $NF;
}
- ' | egrep "$VERS" > /dev/null
+ ' | grep -qF "$VERS"
then
echo "Basic BPF metadata test [Failed invalid output]"
err=1
diff --git a/tools/perf/trace/beauty/include/uapi/linux/fcntl.h b/tools/perf/trace/beauty/include/uapi/linux/fcntl.h
index a15ac2fa4b20..f291ab4f94eb 100644
--- a/tools/perf/trace/beauty/include/uapi/linux/fcntl.h
+++ b/tools/perf/trace/beauty/include/uapi/linux/fcntl.h
@@ -90,10 +90,28 @@
#define DN_ATTRIB 0x00000020 /* File changed attibutes */
#define DN_MULTISHOT 0x80000000 /* Don't remove notifier */
+/* Reserved kernel ranges [-100], [-10000, -40000]. */
#define AT_FDCWD -100 /* Special value for dirfd used to
indicate openat should use the
current working directory. */
+/*
+ * The concept of process and threads in userland and the kernel is a confusing
+ * one - within the kernel every thread is a 'task' with its own individual PID,
+ * however from userland's point of view threads are grouped by a single PID,
+ * which is that of the 'thread group leader', typically the first thread
+ * spawned.
+ *
+ * To cut the Gordian knot, for internal kernel usage, we use
+ * PIDFD_SELF_THREAD to refer to the current thread (or task from a kernel
+ * perspective), and PIDFD_SELF_THREAD_GROUP to refer to the current thread
+ * group leader...
+ */
+#define PIDFD_SELF_THREAD -10000 /* Current thread. */
+#define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */
+
+#define FD_PIDFS_ROOT -10002 /* Root of the pidfs filesystem */
+#define FD_INVALID -10009 /* Invalid file descriptor: -10000 - EBADF = -10009 */
/* Generic flags for the *at(2) family of syscalls. */
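
For reference, a hedged userspace sketch using the new self-referential sentinel. Whether a given syscall accepts PIDFD_SELF_THREAD depends on the running kernel, so treat this as an illustration of the constant rather than a guaranteed interface; signal 0 only performs the existence/permission check.

    #define _GNU_SOURCE
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/fcntl.h>    /* PIDFD_SELF_THREAD, per the header above */

    int main(void)
    {
        /* Refer to the calling thread via its own pidfd sentinel without
         * opening a pidfd first; sig 0 means only check, don't deliver. */
        return syscall(SYS_pidfd_send_signal, PIDFD_SELF_THREAD, 0, NULL, 0) ? 1 : 0;
    }
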
diff --git a/tools/perf/trace/beauty/include/uapi/linux/fs.h b/tools/perf/trace/beauty/include/uapi/linux/fs.h
index 0098b0ce8ccb..0bd678a4a10e 100644
--- a/tools/perf/trace/beauty/include/uapi/linux/fs.h
+++ b/tools/perf/trace/beauty/include/uapi/linux/fs.h
@@ -60,6 +60,17 @@
#define RENAME_EXCHANGE (1 << 1) /* Exchange source and dest */
#define RENAME_WHITEOUT (1 << 2) /* Whiteout source */
+/*
+ * The root inode of procfs is guaranteed to always have the same inode number.
+ * For programs that make heavy use of procfs, verifying that the root is a
+ * real procfs root and using openat2(RESOLVE_{NO_{XDEV,MAGICLINKS},BENEATH})
+ * will allow you to make sure you are never tricked into operating on the
+ * wrong procfs file.
+ */
+enum procfs_ino {
+ PROCFS_ROOT_INO = 1,
+};
+
struct file_clone_range {
__s64 src_fd;
__u64 src_offset;
@@ -91,6 +102,63 @@ struct fs_sysfs_path {
__u8 name[128];
};
+/* Protection info capability flags */
+#define LBMD_PI_CAP_INTEGRITY (1 << 0)
+#define LBMD_PI_CAP_REFTAG (1 << 1)
+
+/* Checksum types for Protection Information */
+#define LBMD_PI_CSUM_NONE 0
+#define LBMD_PI_CSUM_IP 1
+#define LBMD_PI_CSUM_CRC16_T10DIF 2
+#define LBMD_PI_CSUM_CRC64_NVME 4
+
+/* sizeof first published struct */
+#define LBMD_SIZE_VER0 16
+
+/*
+ * Logical block metadata capability descriptor
+ * If the device does not support metadata, all the fields will be zero.
+ * Applications must check lbmd_flags to determine whether metadata is
+ * supported or not.
+ */
+struct logical_block_metadata_cap {
+ /* Bitmask of logical block metadata capability flags */
+ __u32 lbmd_flags;
+ /*
+ * The amount of data described by each unit of logical block
+ * metadata
+ */
+ __u16 lbmd_interval;
+ /*
+ * Size in bytes of the logical block metadata associated with each
+ * interval
+ */
+ __u8 lbmd_size;
+ /*
+ * Size in bytes of the opaque block tag associated with each
+ * interval
+ */
+ __u8 lbmd_opaque_size;
+ /*
+ * Offset in bytes of the opaque block tag within the logical block
+ * metadata
+ */
+ __u8 lbmd_opaque_offset;
+ /* Size in bytes of the T10 PI tuple associated with each interval */
+ __u8 lbmd_pi_size;
+ /* Offset in bytes of T10 PI tuple within the logical block metadata */
+ __u8 lbmd_pi_offset;
+ /* T10 PI guard tag type */
+ __u8 lbmd_guard_tag_type;
+ /* Size in bytes of the T10 PI application tag */
+ __u8 lbmd_app_tag_size;
+ /* Size in bytes of the T10 PI reference tag */
+ __u8 lbmd_ref_tag_size;
+ /* Size in bytes of the T10 PI storage tag */
+ __u8 lbmd_storage_tag_size;
+ __u8 pad;
+};
+
/* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */
#define FILE_DEDUPE_RANGE_SAME 0
#define FILE_DEDUPE_RANGE_DIFFERS 1
@@ -149,6 +217,24 @@ struct fsxattr {
};
/*
+ * Variable size structure for file_[sg]et_attr().
+ *
+ * Note: this is an alternative to the structure 'struct file_kattr'/'struct fsxattr'.
+ * As this structure is passed to/from userspace with its size, this can
+ * be versioned based on the size.
+ */
+struct file_attr {
+ __u64 fa_xflags; /* xflags field value (get/set) */
+ __u32 fa_extsize; /* extsize field value (get/set)*/
+ __u32 fa_nextents; /* nextents field value (get) */
+ __u32 fa_projid; /* project identifier (get/set) */
+ __u32 fa_cowextsize; /* CoW extsize field value (get/set) */
+};
+
+#define FILE_ATTR_SIZE_VER0 24
+#define FILE_ATTR_SIZE_LATEST FILE_ATTR_SIZE_VER0
+
+/*
* Flags for the fsx_xflags field
*/
#define FS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */
@@ -247,6 +333,8 @@ struct fsxattr {
* also /sys/kernel/debug/ for filesystems with debugfs exports
*/
#define FS_IOC_GETFSSYSFSPATH _IOR(0x15, 1, struct fs_sysfs_path)
+/* Get logical block metadata capability details */
+#define FS_IOC_GETLBMD_CAP _IOWR(0x15, 2, struct logical_block_metadata_cap)
/*
* Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS)
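
For reference, a minimal userspace sketch (not from the patch) of querying the new logical block metadata capability ioctl; the device path is a placeholder and error handling is kept to the essentials.

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/fs.h>    /* FS_IOC_GETLBMD_CAP, struct logical_block_metadata_cap */

    int main(void)
    {
        struct logical_block_metadata_cap cap;
        int fd = open("/dev/nvme0n1", O_RDONLY);    /* placeholder device */

        if (fd < 0)
            return 1;
        memset(&cap, 0, sizeof(cap));
        /* All-zero fields mean the device exposes no block metadata. */
        if (ioctl(fd, FS_IOC_GETLBMD_CAP, &cap) == 0 && cap.lbmd_flags)
            printf("metadata interval %u, PI tuple size %u\n",
                   cap.lbmd_interval, cap.lbmd_pi_size);
        close(fd);
        return 0;
    }
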
diff --git a/tools/perf/trace/beauty/include/uapi/linux/prctl.h b/tools/perf/trace/beauty/include/uapi/linux/prctl.h
index 3b93fb906e3c..ed3aed264aeb 100644
--- a/tools/perf/trace/beauty/include/uapi/linux/prctl.h
+++ b/tools/perf/trace/beauty/include/uapi/linux/prctl.h
@@ -244,6 +244,8 @@ struct prctl_mm_map {
# define PR_MTE_TAG_MASK (0xffffUL << PR_MTE_TAG_SHIFT)
/* Unused; kept only for source compatibility */
# define PR_MTE_TCF_SHIFT 1
+/* MTE tag check store only */
+# define PR_MTE_STORE_ONLY (1UL << 19)
/* RISC-V pointer masking tag length */
# define PR_PMLEN_SHIFT 24
# define PR_PMLEN_MASK (0x7fUL << PR_PMLEN_SHIFT)
@@ -255,7 +257,12 @@ struct prctl_mm_map {
/* Dispatch syscalls to a userspace handler */
#define PR_SET_SYSCALL_USER_DISPATCH 59
# define PR_SYS_DISPATCH_OFF 0
-# define PR_SYS_DISPATCH_ON 1
+/* Enable dispatch except for the specified range */
+# define PR_SYS_DISPATCH_EXCLUSIVE_ON 1
+/* Enable dispatch for the specified range */
+# define PR_SYS_DISPATCH_INCLUSIVE_ON 2
+/* Legacy name for backwards compatibility */
+# define PR_SYS_DISPATCH_ON PR_SYS_DISPATCH_EXCLUSIVE_ON
/* The control values for the user space selector when dispatch is enabled */
# define SYSCALL_DISPATCH_FILTER_ALLOW 0
# define SYSCALL_DISPATCH_FILTER_BLOCK 1
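
For reference, a hedged sketch of the new inclusive mode: per the comments above, PR_SYS_DISPATCH_INCLUSIVE_ON dispatches syscalls issued from the given range instead of exempting it. The region symbols are placeholders, the selector handling is assumed to mirror the existing exclusive mode, and a SIGSYS handler (not shown) is assumed to be installed.

    #include <sys/prctl.h>
    #include <linux/prctl.h>    /* PR_SYS_DISPATCH_INCLUSIVE_ON, per the header above */

    /* Placeholder bounds of the code whose syscalls should be dispatched. */
    extern char sandbox_start[], sandbox_end[];

    static char dispatch_selector = SYSCALL_DISPATCH_FILTER_BLOCK;

    static int enable_inclusive_dispatch(void)
    {
        /* Syscalls made from [sandbox_start, sandbox_end) are redirected to
         * the SIGSYS handler; code outside the range runs unmodified. */
        return prctl(PR_SET_SYSCALL_USER_DISPATCH,
                     PR_SYS_DISPATCH_INCLUSIVE_ON,
                     (unsigned long)sandbox_start,
                     (unsigned long)(sandbox_end - sandbox_start),
                     &dispatch_selector);
    }
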
diff --git a/tools/perf/trace/beauty/include/uapi/linux/vhost.h b/tools/perf/trace/beauty/include/uapi/linux/vhost.h
index d4b3e2ae1314..c57674a6aa0d 100644
--- a/tools/perf/trace/beauty/include/uapi/linux/vhost.h
+++ b/tools/perf/trace/beauty/include/uapi/linux/vhost.h
@@ -235,4 +235,39 @@
*/
#define VHOST_VDPA_GET_VRING_SIZE _IOWR(VHOST_VIRTIO, 0x82, \
struct vhost_vring_state)
+
+/* Extended features manipulation */
+#define VHOST_GET_FEATURES_ARRAY _IOR(VHOST_VIRTIO, 0x83, \
+ struct vhost_features_array)
+#define VHOST_SET_FEATURES_ARRAY _IOW(VHOST_VIRTIO, 0x83, \
+ struct vhost_features_array)
+
+/* fork_owner values for vhost */
+#define VHOST_FORK_OWNER_KTHREAD 0
+#define VHOST_FORK_OWNER_TASK 1
+
+/**
+ * VHOST_SET_FORK_FROM_OWNER - Set the fork_owner flag for the vhost device.
+ * This ioctl must be called before VHOST_SET_OWNER.
+ * Only available when CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL=y
+ *
+ * @param fork_owner: An 8-bit value that determines the vhost thread mode
+ *
+ * When fork_owner is set to VHOST_FORK_OWNER_TASK (the default value):
+ * - Vhost will create vhost workers as tasks forked from the owner,
+ * inheriting all of the owner's attributes.
+ *
+ * When fork_owner is set to VHOST_FORK_OWNER_KTHREAD:
+ * - Vhost will create vhost workers as kernel threads.
+ */
+#define VHOST_SET_FORK_FROM_OWNER _IOW(VHOST_VIRTIO, 0x84, __u8)
+
+/**
+ * VHOST_GET_FORK_OWNER - Get the current fork_owner flag for the vhost device.
+ * Only available when CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL=y
+ *
+ * @return: An 8-bit value indicating the current thread mode.
+ */
+#define VHOST_GET_FORK_FROM_OWNER _IOR(VHOST_VIRTIO, 0x85, __u8)
+
#endif
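
For reference, a hedged userspace sketch of the new fork-owner control. The device node is a placeholder, and per the comment above the ioctl only exists when CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL is enabled and must precede VHOST_SET_OWNER.

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/types.h>
    #include <linux/vhost.h>

    int main(void)
    {
        __u8 mode = VHOST_FORK_OWNER_KTHREAD;    /* run workers as kthreads */
        int fd = open("/dev/vhost-net", O_RDWR); /* placeholder device node */

        if (fd < 0)
            return 1;
        if (ioctl(fd, VHOST_SET_FORK_FROM_OWNER, &mode))
            return 1;
        close(fd);
        return 0;
    }
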
diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c
index 5b6d3e899e11..2298cd396c42 100644
--- a/tools/perf/util/bpf-event.c
+++ b/tools/perf/util/bpf-event.c
@@ -657,9 +657,15 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_session *session,
info_node->info_linear = info_linear;
info_node->metadata = NULL;
if (!perf_env__insert_bpf_prog_info(env, info_node)) {
- free(info_linear);
+ /*
+ * Insert failed, likely because of a duplicate event
+ * made by the sideband thread. Skip synthesizing the
+ * metadata.
+ */
free(info_node);
+ goto out;
}
+ /* info_linear is now owned by info_node and shouldn't be freed below. */
info_linear = NULL;
/*
@@ -827,18 +833,18 @@ int perf_event__synthesize_bpf_events(struct perf_session *session,
return err;
}
-static void perf_env__add_bpf_info(struct perf_env *env, u32 id)
+static int perf_env__add_bpf_info(struct perf_env *env, u32 id)
{
struct bpf_prog_info_node *info_node;
struct perf_bpil *info_linear;
struct btf *btf = NULL;
u64 arrays;
u32 btf_id;
- int fd;
+ int fd, err = 0;
fd = bpf_prog_get_fd_by_id(id);
if (fd < 0)
- return;
+ return -EINVAL;
arrays = 1UL << PERF_BPIL_JITED_KSYMS;
arrays |= 1UL << PERF_BPIL_JITED_FUNC_LENS;
@@ -852,6 +858,7 @@ static void perf_env__add_bpf_info(struct perf_env *env, u32 id)
info_linear = get_bpf_prog_info_linear(fd, arrays);
if (IS_ERR_OR_NULL(info_linear)) {
pr_debug("%s: failed to get BPF program info. aborting\n", __func__);
+ err = PTR_ERR(info_linear);
goto out;
}
@@ -862,38 +869,46 @@ static void perf_env__add_bpf_info(struct perf_env *env, u32 id)
info_node->info_linear = info_linear;
info_node->metadata = bpf_metadata_create(&info_linear->info);
if (!perf_env__insert_bpf_prog_info(env, info_node)) {
+ pr_debug("%s: duplicate add bpf info request for id %u\n",
+ __func__, btf_id);
free(info_linear);
free(info_node);
+ goto out;
}
- } else
+ } else {
free(info_linear);
+ err = -ENOMEM;
+ goto out;
+ }
if (btf_id == 0)
goto out;
btf = btf__load_from_kernel_by_id(btf_id);
- if (libbpf_get_error(btf)) {
- pr_debug("%s: failed to get BTF of id %u, aborting\n",
- __func__, btf_id);
- goto out;
+ if (!btf) {
+ err = -errno;
+ pr_debug("%s: failed to get BTF of id %u %d\n", __func__, btf_id, err);
+ } else {
+ perf_env__fetch_btf(env, btf_id, btf);
}
- perf_env__fetch_btf(env, btf_id, btf);
out:
btf__free(btf);
close(fd);
+ return err;
}
static int bpf_event__sb_cb(union perf_event *event, void *data)
{
struct perf_env *env = data;
+ int ret = 0;
if (event->header.type != PERF_RECORD_BPF_EVENT)
return -1;
switch (event->bpf.type) {
case PERF_BPF_EVENT_PROG_LOAD:
- perf_env__add_bpf_info(env, event->bpf.id);
+ ret = perf_env__add_bpf_info(env, event->bpf.id);
case PERF_BPF_EVENT_PROG_UNLOAD:
/*
@@ -907,7 +922,7 @@ static int bpf_event__sb_cb(union perf_event *event, void *data)
break;
}
- return 0;
+ return ret;
}
int evlist__add_bpf_sb_event(struct evlist *evlist, struct perf_env *env)
diff --git a/tools/perf/util/bpf-utils.c b/tools/perf/util/bpf-utils.c
index 80b1d2b3729b..5a66dc8594aa 100644
--- a/tools/perf/util/bpf-utils.c
+++ b/tools/perf/util/bpf-utils.c
@@ -20,7 +20,7 @@ struct bpil_array_desc {
*/
};
-static struct bpil_array_desc bpil_array_desc[] = {
+static const struct bpil_array_desc bpil_array_desc[] = {
[PERF_BPIL_JITED_INSNS] = {
offsetof(struct bpf_prog_info, jited_prog_insns),
offsetof(struct bpf_prog_info, jited_prog_len),
@@ -115,7 +115,7 @@ get_bpf_prog_info_linear(int fd, __u64 arrays)
__u32 info_len = sizeof(info);
__u32 data_len = 0;
int i, err;
- void *ptr;
+ __u8 *ptr;
if (arrays >> PERF_BPIL_LAST_ARRAY)
return ERR_PTR(-EINVAL);
@@ -126,15 +126,15 @@ get_bpf_prog_info_linear(int fd, __u64 arrays)
pr_debug("can't get prog info: %s", strerror(errno));
return ERR_PTR(-EFAULT);
}
+ if (info.type >= __MAX_BPF_PROG_TYPE)
+ pr_debug("%s:%d: unexpected program type %u\n", __func__, __LINE__, info.type);
/* step 2: calculate total size of all arrays */
for (i = PERF_BPIL_FIRST_ARRAY; i < PERF_BPIL_LAST_ARRAY; ++i) {
+ const struct bpil_array_desc *desc = &bpil_array_desc[i];
bool include_array = (arrays & (1UL << i)) > 0;
- struct bpil_array_desc *desc;
__u32 count, size;
- desc = bpil_array_desc + i;
-
/* kernel is too old to support this field */
if (info_len < desc->array_offset + sizeof(__u32) ||
info_len < desc->count_offset + sizeof(__u32) ||
@@ -163,19 +163,20 @@ get_bpf_prog_info_linear(int fd, __u64 arrays)
ptr = info_linear->data;
for (i = PERF_BPIL_FIRST_ARRAY; i < PERF_BPIL_LAST_ARRAY; ++i) {
- struct bpil_array_desc *desc;
+ const struct bpil_array_desc *desc = &bpil_array_desc[i];
__u32 count, size;
if ((arrays & (1UL << i)) == 0)
continue;
- desc = bpil_array_desc + i;
count = bpf_prog_info_read_offset_u32(&info, desc->count_offset);
size = bpf_prog_info_read_offset_u32(&info, desc->size_offset);
bpf_prog_info_set_offset_u32(&info_linear->info,
desc->count_offset, count);
bpf_prog_info_set_offset_u32(&info_linear->info,
desc->size_offset, size);
+ assert(ptr >= info_linear->data);
+ assert(ptr < &info_linear->data[data_len]);
bpf_prog_info_set_offset_u64(&info_linear->info,
desc->array_offset,
ptr_to_u64(ptr));
@@ -189,27 +190,45 @@ get_bpf_prog_info_linear(int fd, __u64 arrays)
free(info_linear);
return ERR_PTR(-EFAULT);
}
+ if (info_linear->info.type >= __MAX_BPF_PROG_TYPE) {
+ pr_debug("%s:%d: unexpected program type %u\n",
+ __func__, __LINE__, info_linear->info.type);
+ }
/* step 6: verify the data */
+ ptr = info_linear->data;
for (i = PERF_BPIL_FIRST_ARRAY; i < PERF_BPIL_LAST_ARRAY; ++i) {
- struct bpil_array_desc *desc;
- __u32 v1, v2;
+ const struct bpil_array_desc *desc = &bpil_array_desc[i];
+ __u32 count1, count2, size1, size2;
+ __u64 ptr2;
if ((arrays & (1UL << i)) == 0)
continue;
- desc = bpil_array_desc + i;
- v1 = bpf_prog_info_read_offset_u32(&info, desc->count_offset);
- v2 = bpf_prog_info_read_offset_u32(&info_linear->info,
+ count1 = bpf_prog_info_read_offset_u32(&info, desc->count_offset);
+ count2 = bpf_prog_info_read_offset_u32(&info_linear->info,
desc->count_offset);
- if (v1 != v2)
- pr_warning("%s: mismatch in element count\n", __func__);
+ if (count1 != count2) {
+ pr_warning("%s: mismatch in element count %u vs %u\n", __func__, count1, count2);
+ free(info_linear);
+ return ERR_PTR(-ERANGE);
+ }
- v1 = bpf_prog_info_read_offset_u32(&info, desc->size_offset);
- v2 = bpf_prog_info_read_offset_u32(&info_linear->info,
+ size1 = bpf_prog_info_read_offset_u32(&info, desc->size_offset);
+ size2 = bpf_prog_info_read_offset_u32(&info_linear->info,
desc->size_offset);
- if (v1 != v2)
- pr_warning("%s: mismatch in rec size\n", __func__);
+ if (size1 != size2) {
+ pr_warning("%s: mismatch in rec size %u vs %u\n", __func__, size1, size2);
+ free(info_linear);
+ return ERR_PTR(-ERANGE);
+ }
+ ptr2 = bpf_prog_info_read_offset_u64(&info_linear->info, desc->array_offset);
+ if (ptr_to_u64(ptr) != ptr2) {
+ pr_warning("%s: mismatch in array %p vs %llx\n", __func__, ptr, ptr2);
+ free(info_linear);
+ return ERR_PTR(-ERANGE);
+ }
+ ptr += roundup(count1 * size1, sizeof(__u64));
}
/* step 7: update info_len and data_len */
@@ -224,13 +243,12 @@ void bpil_addr_to_offs(struct perf_bpil *info_linear)
int i;
for (i = PERF_BPIL_FIRST_ARRAY; i < PERF_BPIL_LAST_ARRAY; ++i) {
- struct bpil_array_desc *desc;
+ const struct bpil_array_desc *desc = &bpil_array_desc[i];
__u64 addr, offs;
if ((info_linear->arrays & (1UL << i)) == 0)
continue;
- desc = bpil_array_desc + i;
addr = bpf_prog_info_read_offset_u64(&info_linear->info,
desc->array_offset);
offs = addr - ptr_to_u64(info_linear->data);
@@ -244,13 +262,12 @@ void bpil_offs_to_addr(struct perf_bpil *info_linear)
int i;
for (i = PERF_BPIL_FIRST_ARRAY; i < PERF_BPIL_LAST_ARRAY; ++i) {
- struct bpil_array_desc *desc;
+ const struct bpil_array_desc *desc = &bpil_array_desc[i];
__u64 addr, offs;
if ((info_linear->arrays & (1UL << i)) == 0)
continue;
- desc = bpil_array_desc + i;
offs = bpf_prog_info_read_offset_u64(&info_linear->info,
desc->array_offset);
addr = offs + ptr_to_u64(info_linear->data);
diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c
index a7018a3b0437..bf7f3268b9a2 100644
--- a/tools/perf/util/build-id.c
+++ b/tools/perf/util/build-id.c
@@ -115,7 +115,7 @@ int filename__snprintf_build_id(const char *pathname, char *sbuild_id, size_t sb
struct build_id bid = { .size = 0, };
int ret;
- ret = filename__read_build_id(pathname, &bid);
+ ret = filename__read_build_id(pathname, &bid, /*block=*/true);
if (ret < 0)
return ret;
@@ -841,7 +841,7 @@ static int filename__read_build_id_ns(const char *filename,
int ret;
nsinfo__mountns_enter(nsi, &nsc);
- ret = filename__read_build_id(filename, bid);
+ ret = filename__read_build_id(filename, bid, /*block=*/true);
nsinfo__mountns_exit(&nsc);
return ret;
diff --git a/tools/perf/util/debuginfo.c b/tools/perf/util/debuginfo.c
index a44c70f93156..bb9ebd84ec2d 100644
--- a/tools/perf/util/debuginfo.c
+++ b/tools/perf/util/debuginfo.c
@@ -110,8 +110,12 @@ struct debuginfo *debuginfo__new(const char *path)
if (!dso)
goto out;
- /* Set the build id for DSO_BINARY_TYPE__BUILDID_DEBUGINFO */
- if (is_regular_file(path) && filename__read_build_id(path, &bid) > 0)
+ /*
+ * Set the build id for DSO_BINARY_TYPE__BUILDID_DEBUGINFO. Don't block
+ * in case the path isn't for a regular file.
+ */
+ assert(!dso__has_build_id(dso));
+ if (filename__read_build_id(path, &bid, /*block=*/false) > 0)
dso__set_build_id(dso, &bid);
for (type = distro_dwarf_types;
diff --git a/tools/perf/util/dsos.c b/tools/perf/util/dsos.c
index 0a7645c7fae7..64c1d65b0149 100644
--- a/tools/perf/util/dsos.c
+++ b/tools/perf/util/dsos.c
@@ -81,13 +81,13 @@ static int dsos__read_build_ids_cb(struct dso *dso, void *data)
return 0;
}
nsinfo__mountns_enter(dso__nsinfo(dso), &nsc);
- if (filename__read_build_id(dso__long_name(dso), &bid) > 0) {
+ if (filename__read_build_id(dso__long_name(dso), &bid, /*block=*/true) > 0) {
dso__set_build_id(dso, &bid);
args->have_build_id = true;
} else if (errno == ENOENT && dso__nsinfo(dso)) {
char *new_name = dso__filename_with_chroot(dso, dso__long_name(dso));
- if (new_name && filename__read_build_id(new_name, &bid) > 0) {
+ if (new_name && filename__read_build_id(new_name, &bid, /*block=*/true) > 0) {
dso__set_build_id(dso, &bid);
args->have_build_id = true;
}
diff --git a/tools/perf/util/include/linux/linkage.h b/tools/perf/util/include/linux/linkage.h
index 89979ca23c3f..34e2fdfe7300 100644
--- a/tools/perf/util/include/linux/linkage.h
+++ b/tools/perf/util/include/linux/linkage.h
@@ -120,7 +120,7 @@
#endif
// In the kernel sources (include/linux/cfi_types.h), this has a different
-// definition when CONFIG_CFI_CLANG is used, for tools/ just use the !clang
+// definition when CONFIG_CFI is used, for tools/ just use the !cfi
// definition:
#ifndef SYM_TYPED_START
#define SYM_TYPED_START(name, linkage, align...) \
diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c
index 8fabddc1c0da..72c7a4e15d61 100644
--- a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c
+++ b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c
@@ -32,7 +32,7 @@ static void intel_pt_insn_decoder(struct insn *insn,
intel_pt_insn->rel = 0;
intel_pt_insn->emulated_ptwrite = false;
- if (insn_is_avx(insn)) {
+ if (insn_is_avx_or_xop(insn)) {
intel_pt_insn->op = INTEL_PT_OP_OTHER;
intel_pt_insn->branch = INTEL_PT_BR_NO_BRANCH;
intel_pt_insn->length = insn->length;
diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c
index 85b2a93a59ac..779f6230130a 100644
--- a/tools/perf/util/maps.c
+++ b/tools/perf/util/maps.c
@@ -477,6 +477,7 @@ static int __maps__insert(struct maps *maps, struct map *new)
}
/* Insert the value at the end. */
maps_by_address[nr_maps] = map__get(new);
+ map__set_kmap_maps(new, maps);
if (maps_by_name)
maps_by_name[nr_maps] = map__get(new);
@@ -502,8 +503,6 @@ static int __maps__insert(struct maps *maps, struct map *new)
if (map__end(new) < map__start(new))
RC_CHK_ACCESS(maps)->ends_broken = true;
- map__set_kmap_maps(new, maps);
-
return 0;
}
@@ -891,6 +890,7 @@ static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new)
if (before) {
map__put(maps_by_address[i]);
maps_by_address[i] = before;
+ map__set_kmap_maps(before, maps);
if (maps_by_name) {
map__put(maps_by_name[ni]);
@@ -918,6 +918,7 @@ static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new)
*/
map__put(maps_by_address[i]);
maps_by_address[i] = map__get(new);
+ map__set_kmap_maps(new, maps);
if (maps_by_name) {
map__put(maps_by_name[ni]);
@@ -942,14 +943,13 @@ static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new)
*/
map__put(maps_by_address[i]);
maps_by_address[i] = map__get(new);
+ map__set_kmap_maps(new, maps);
if (maps_by_name) {
map__put(maps_by_name[ni]);
maps_by_name[ni] = map__get(new);
}
- map__set_kmap_maps(new, maps);
-
check_invariants(maps);
return err;
}
@@ -1019,6 +1019,7 @@ int maps__copy_from(struct maps *dest, struct maps *parent)
err = unwind__prepare_access(dest, new, NULL);
if (!err) {
dest_maps_by_address[i] = new;
+ map__set_kmap_maps(new, dest);
if (dest_maps_by_name)
dest_maps_by_name[i] = map__get(new);
RC_CHK_ACCESS(dest)->nr_maps = i + 1;
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index 6d2c280a1730..1346fd180653 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -873,13 +873,17 @@ out:
#ifdef HAVE_LIBBFD_BUILDID_SUPPORT
-static int read_build_id(const char *filename, struct build_id *bid)
+static int read_build_id(const char *filename, struct build_id *bid, bool block)
{
size_t size = sizeof(bid->data);
- int err = -1;
+ int err = -1, fd;
bfd *abfd;
- abfd = bfd_openr(filename, NULL);
+ fd = open(filename, block ? O_RDONLY : (O_RDONLY | O_NONBLOCK));
+ if (fd < 0)
+ return -1;
+
+ abfd = bfd_fdopenr(filename, /*target=*/NULL, fd);
if (!abfd)
return -1;
@@ -902,7 +906,7 @@ out_close:
#else // HAVE_LIBBFD_BUILDID_SUPPORT
-static int read_build_id(const char *filename, struct build_id *bid)
+static int read_build_id(const char *filename, struct build_id *bid, bool block)
{
size_t size = sizeof(bid->data);
int fd, err = -1;
@@ -911,7 +915,7 @@ static int read_build_id(const char *filename, struct build_id *bid)
if (size < BUILD_ID_SIZE)
goto out;
- fd = open(filename, O_RDONLY);
+ fd = open(filename, block ? O_RDONLY : (O_RDONLY | O_NONBLOCK));
if (fd < 0)
goto out;
@@ -934,7 +938,7 @@ out:
#endif // HAVE_LIBBFD_BUILDID_SUPPORT
-int filename__read_build_id(const char *filename, struct build_id *bid)
+int filename__read_build_id(const char *filename, struct build_id *bid, bool block)
{
struct kmod_path m = { .name = NULL, };
char path[PATH_MAX];
@@ -958,9 +962,10 @@ int filename__read_build_id(const char *filename, struct build_id *bid)
}
close(fd);
filename = path;
+ block = true;
}
- err = read_build_id(filename, bid);
+ err = read_build_id(filename, bid, block);
if (m.comp)
unlink(filename);
diff --git a/tools/perf/util/symbol-minimal.c b/tools/perf/util/symbol-minimal.c
index 7201494c5c20..41e4ebe5eac5 100644
--- a/tools/perf/util/symbol-minimal.c
+++ b/tools/perf/util/symbol-minimal.c
@@ -4,7 +4,6 @@
#include <errno.h>
#include <unistd.h>
-#include <stdio.h>
#include <fcntl.h>
#include <string.h>
#include <stdlib.h>
@@ -86,13 +85,10 @@ int filename__read_debuglink(const char *filename __maybe_unused,
/*
* Just try PT_NOTE header otherwise fails
*/
-int filename__read_build_id(const char *filename, struct build_id *bid)
+int filename__read_build_id(const char *filename, struct build_id *bid, bool block)
{
- FILE *fp;
- int ret = -1;
+ int fd, ret = -1;
bool need_swap = false, elf32;
- u8 e_ident[EI_NIDENT];
- int i;
union {
struct {
Elf32_Ehdr ehdr32;
@@ -103,28 +99,27 @@ int filename__read_build_id(const char *filename, struct build_id *bid)
Elf64_Phdr *phdr64;
};
} hdrs;
- void *phdr;
- size_t phdr_size;
- void *buf = NULL;
- size_t buf_size = 0;
+ void *phdr, *buf = NULL;
+ ssize_t phdr_size, ehdr_size, buf_size = 0;
- fp = fopen(filename, "r");
- if (fp == NULL)
+ fd = open(filename, block ? O_RDONLY : (O_RDONLY | O_NONBLOCK));
+ if (fd < 0)
return -1;
- if (fread(e_ident, sizeof(e_ident), 1, fp) != 1)
+ if (read(fd, hdrs.ehdr32.e_ident, EI_NIDENT) != EI_NIDENT)
goto out;
- if (memcmp(e_ident, ELFMAG, SELFMAG) ||
- e_ident[EI_VERSION] != EV_CURRENT)
+ if (memcmp(hdrs.ehdr32.e_ident, ELFMAG, SELFMAG) ||
+ hdrs.ehdr32.e_ident[EI_VERSION] != EV_CURRENT)
goto out;
- need_swap = check_need_swap(e_ident[EI_DATA]);
- elf32 = e_ident[EI_CLASS] == ELFCLASS32;
+ need_swap = check_need_swap(hdrs.ehdr32.e_ident[EI_DATA]);
+ elf32 = hdrs.ehdr32.e_ident[EI_CLASS] == ELFCLASS32;
+ ehdr_size = (elf32 ? sizeof(hdrs.ehdr32) : sizeof(hdrs.ehdr64)) - EI_NIDENT;
- if (fread(elf32 ? (void *)&hdrs.ehdr32 : (void *)&hdrs.ehdr64,
- elf32 ? sizeof(hdrs.ehdr32) : sizeof(hdrs.ehdr64),
- 1, fp) != 1)
+ if (read(fd,
+ (elf32 ? (void *)&hdrs.ehdr32 : (void *)&hdrs.ehdr64) + EI_NIDENT,
+ ehdr_size) != ehdr_size)
goto out;
if (need_swap) {
@@ -138,14 +133,18 @@ int filename__read_build_id(const char *filename, struct build_id *bid)
hdrs.ehdr64.e_phnum = bswap_16(hdrs.ehdr64.e_phnum);
}
}
- phdr_size = elf32 ? hdrs.ehdr32.e_phentsize * hdrs.ehdr32.e_phnum
- : hdrs.ehdr64.e_phentsize * hdrs.ehdr64.e_phnum;
+ if ((elf32 && hdrs.ehdr32.e_phentsize != sizeof(Elf32_Phdr)) ||
+ (!elf32 && hdrs.ehdr64.e_phentsize != sizeof(Elf64_Phdr)))
+ goto out;
+
+ phdr_size = elf32 ? sizeof(Elf32_Phdr) * hdrs.ehdr32.e_phnum
+ : sizeof(Elf64_Phdr) * hdrs.ehdr64.e_phnum;
phdr = malloc(phdr_size);
if (phdr == NULL)
goto out;
- fseek(fp, elf32 ? hdrs.ehdr32.e_phoff : hdrs.ehdr64.e_phoff, SEEK_SET);
- if (fread(phdr, phdr_size, 1, fp) != 1)
+ lseek(fd, elf32 ? hdrs.ehdr32.e_phoff : hdrs.ehdr64.e_phoff, SEEK_SET);
+ if (read(fd, phdr, phdr_size) != phdr_size)
goto out_free;
if (elf32)
@@ -153,8 +152,8 @@ int filename__read_build_id(const char *filename, struct build_id *bid)
else
hdrs.phdr64 = phdr;
- for (i = 0; i < elf32 ? hdrs.ehdr32.e_phnum : hdrs.ehdr64.e_phnum; i++) {
- size_t p_filesz;
+ for (int i = 0; i < (elf32 ? hdrs.ehdr32.e_phnum : hdrs.ehdr64.e_phnum); i++) {
+ ssize_t p_filesz;
if (need_swap) {
if (elf32) {
@@ -180,8 +179,8 @@ int filename__read_build_id(const char *filename, struct build_id *bid)
goto out_free;
buf = tmp;
}
- fseek(fp, elf32 ? hdrs.phdr32[i].p_offset : hdrs.phdr64[i].p_offset, SEEK_SET);
- if (fread(buf, p_filesz, 1, fp) != 1)
+ lseek(fd, elf32 ? hdrs.phdr32[i].p_offset : hdrs.phdr64[i].p_offset, SEEK_SET);
+ if (read(fd, buf, p_filesz) != p_filesz)
goto out_free;
ret = read_build_id(buf, p_filesz, bid, need_swap);
@@ -194,7 +193,7 @@ out_free:
free(buf);
free(phdr);
out:
- fclose(fp);
+ close(fd);
return ret;
}
@@ -324,7 +323,7 @@ int dso__load_sym(struct dso *dso, struct map *map __maybe_unused,
if (ret >= 0)
RC_CHK_ACCESS(dso)->is_64_bit = ret;
- if (filename__read_build_id(ss->name, &bid) > 0)
+ if (filename__read_build_id(ss->name, &bid, /*block=*/true) > 0)
dso__set_build_id(dso, &bid);
return 0;
}
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index e816e4220d33..3fed54de5401 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -1869,14 +1869,14 @@ int dso__load(struct dso *dso, struct map *map)
/*
* Read the build id if possible. This is required for
- * DSO_BINARY_TYPE__BUILDID_DEBUGINFO to work
+ * DSO_BINARY_TYPE__BUILDID_DEBUGINFO to work. Don't block in case the
+ * path isn't for a regular file.
*/
- if (!dso__has_build_id(dso) &&
- is_regular_file(dso__long_name(dso))) {
+ if (!dso__has_build_id(dso)) {
struct build_id bid = { .size = 0, };
__symbol__join_symfs(name, PATH_MAX, dso__long_name(dso));
- if (filename__read_build_id(name, &bid) > 0)
+ if (filename__read_build_id(name, &bid, /*block=*/false) > 0)
dso__set_build_id(dso, &bid);
}
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 3fb5d146d9b1..347106218799 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -140,7 +140,7 @@ struct symbol *dso__next_symbol(struct symbol *sym);
enum dso_type dso__type_fd(int fd);
-int filename__read_build_id(const char *filename, struct build_id *id);
+int filename__read_build_id(const char *filename, struct build_id *id, bool block);
int sysfs__read_build_id(const char *filename, struct build_id *bid);
int modules__parse(const char *filename, void *arg,
int (*process_module)(void *arg, const char *name,
diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c
index cb2c1ace304a..fcd1fd13c30e 100644
--- a/tools/perf/util/synthetic-events.c
+++ b/tools/perf/util/synthetic-events.c
@@ -401,7 +401,7 @@ static void perf_record_mmap2__read_build_id(struct perf_record_mmap2 *event,
nsi = nsinfo__new(event->pid);
nsinfo__mountns_enter(nsi, &nc);
- rc = filename__read_build_id(event->filename, &bid) > 0 ? 0 : -1;
+ rc = filename__read_build_id(event->filename, &bid, /*block=*/false) > 0 ? 0 : -1;
nsinfo__mountns_exit(&nc);
nsinfo__put(nsi);
diff --git a/tools/power/cpupower/man/cpupower-set.1 b/tools/power/cpupower/man/cpupower-set.1
index 500653ef98c7..8ac82b6f9189 100644
--- a/tools/power/cpupower/man/cpupower-set.1
+++ b/tools/power/cpupower/man/cpupower-set.1
@@ -81,10 +81,11 @@ Refer to the AMD P-State kernel documentation for further information.
.RE
.PP
-\-\-turbo\-boost, \-t
+\-\-turbo\-boost, \-\-boost, \-t
.RS 4
-This option is used to enable or disable the turbo boost feature on
-supported Intel and AMD processors.
+This option is used to enable or disable the boost feature on
+supported Intel and AMD processors and on other systems that support boost.
+(The --boost option is an alias for the --turbo-boost option.)
This option takes as parameter either \fB1\fP to enable, or \fB0\fP to disable the feature.
diff --git a/tools/power/cpupower/utils/cpufreq-info.c b/tools/power/cpupower/utils/cpufreq-info.c
index fc750e127404..7d3732f5f2f6 100644
--- a/tools/power/cpupower/utils/cpufreq-info.c
+++ b/tools/power/cpupower/utils/cpufreq-info.c
@@ -128,7 +128,7 @@ static int get_boost_mode_x86(unsigned int cpu)
/* ToDo: Make this more global */
unsigned long pstates[MAX_HW_PSTATES] = {0,};
- ret = cpufreq_has_boost_support(cpu, &support, &active, &b_states);
+ ret = cpufreq_has_x86_boost_support(cpu, &support, &active, &b_states);
if (ret) {
printf(_("Error while evaluating Boost Capabilities"
" on CPU %d -- are you root?\n"), cpu);
@@ -204,6 +204,18 @@ static int get_boost_mode_x86(unsigned int cpu)
return 0;
}
+static int get_boost_mode_generic(unsigned int cpu)
+{
+ bool active;
+
+ if (!cpufreq_has_generic_boost_support(&active)) {
+ printf(_(" boost state support:\n"));
+ printf(_(" Active: %s\n"), active ? _("yes") : _("no"));
+ }
+
+ return 0;
+}
+
/* --boost / -b */
static int get_boost_mode(unsigned int cpu)
@@ -214,6 +226,8 @@ static int get_boost_mode(unsigned int cpu)
cpupower_cpu_info.vendor == X86_VENDOR_HYGON ||
cpupower_cpu_info.vendor == X86_VENDOR_INTEL)
return get_boost_mode_x86(cpu);
+ else
+ get_boost_mode_generic(cpu);
freqs = cpufreq_get_boost_frequencies(cpu);
if (freqs) {
diff --git a/tools/power/cpupower/utils/cpupower-set.c b/tools/power/cpupower/utils/cpupower-set.c
index 0677b58374ab..c2117e5650dd 100644
--- a/tools/power/cpupower/utils/cpupower-set.c
+++ b/tools/power/cpupower/utils/cpupower-set.c
@@ -21,6 +21,7 @@ static struct option set_opts[] = {
{"epp", required_argument, NULL, 'e'},
{"amd-pstate-mode", required_argument, NULL, 'm'},
{"turbo-boost", required_argument, NULL, 't'},
+ {"boost", required_argument, NULL, 't'},
{ },
};
@@ -62,8 +63,8 @@ int cmd_set(int argc, char **argv)
params.params = 0;
/* parameter parsing */
- while ((ret = getopt_long(argc, argv, "b:e:m:",
- set_opts, NULL)) != -1) {
+ while ((ret = getopt_long(argc, argv, "b:e:m:t:",
+ set_opts, NULL)) != -1) {
switch (ret) {
case 'b':
if (params.perf_bias)
diff --git a/tools/power/cpupower/utils/helpers/helpers.h b/tools/power/cpupower/utils/helpers/helpers.h
index 95749b8ee475..82ea62bdf5a2 100644
--- a/tools/power/cpupower/utils/helpers/helpers.h
+++ b/tools/power/cpupower/utils/helpers/helpers.h
@@ -103,6 +103,9 @@ extern struct cpupower_cpu_info cpupower_cpu_info;
/* cpuid and cpuinfo helpers **************************/
+int cpufreq_has_generic_boost_support(bool *active);
+int cpupower_set_turbo_boost(int turbo_boost);
+
/* X86 ONLY ****************************************/
#if defined(__i386__) || defined(__x86_64__)
@@ -118,7 +121,6 @@ extern unsigned long long msr_intel_get_turbo_ratio(unsigned int cpu);
extern int cpupower_set_epp(unsigned int cpu, char *epp);
extern int cpupower_set_amd_pstate_mode(char *mode);
-extern int cpupower_set_turbo_boost(int turbo_boost);
/* Read/Write msr ****************************/
@@ -139,8 +141,8 @@ extern int decode_pstates(unsigned int cpu, int boost_states,
/* AMD HW pstate decoding **************************/
-extern int cpufreq_has_boost_support(unsigned int cpu, int *support,
- int *active, int * states);
+int cpufreq_has_x86_boost_support(unsigned int cpu, int *support,
+ int *active, int *states);
/* AMD P-State stuff **************************/
bool cpupower_amd_pstate_enabled(void);
@@ -181,13 +183,11 @@ static inline int cpupower_set_epp(unsigned int cpu, char *epp)
{ return -1; };
static inline int cpupower_set_amd_pstate_mode(char *mode)
{ return -1; };
-static inline int cpupower_set_turbo_boost(int turbo_boost)
-{ return -1; };
/* Read/Write msr ****************************/
-static inline int cpufreq_has_boost_support(unsigned int cpu, int *support,
- int *active, int * states)
+static inline int cpufreq_has_x86_boost_support(unsigned int cpu, int *support,
+ int *active, int *states)
{ return -1; }
static inline bool cpupower_amd_pstate_enabled(void)
diff --git a/tools/power/cpupower/utils/helpers/misc.c b/tools/power/cpupower/utils/helpers/misc.c
index 76e461ff4f74..166dc1e470ea 100644
--- a/tools/power/cpupower/utils/helpers/misc.c
+++ b/tools/power/cpupower/utils/helpers/misc.c
@@ -8,15 +8,14 @@
#include "helpers/helpers.h"
#include "helpers/sysfs.h"
#include "cpufreq.h"
+#include "cpupower_intern.h"
#if defined(__i386__) || defined(__x86_64__)
-#include "cpupower_intern.h"
-
#define MSR_AMD_HWCR 0xc0010015
-int cpufreq_has_boost_support(unsigned int cpu, int *support, int *active,
- int *states)
+int cpufreq_has_x86_boost_support(unsigned int cpu, int *support, int *active,
+ int *states)
{
int ret;
unsigned long long val;
@@ -124,24 +123,6 @@ int cpupower_set_amd_pstate_mode(char *mode)
return 0;
}
-int cpupower_set_turbo_boost(int turbo_boost)
-{
- char path[SYSFS_PATH_MAX];
- char linebuf[2] = {};
-
- snprintf(path, sizeof(path), PATH_TO_CPU "cpufreq/boost");
-
- if (!is_valid_path(path))
- return -1;
-
- snprintf(linebuf, sizeof(linebuf), "%d", turbo_boost);
-
- if (cpupower_write_sysfs(path, linebuf, 2) <= 0)
- return -1;
-
- return 0;
-}
-
bool cpupower_amd_pstate_enabled(void)
{
char *driver = cpufreq_get_driver(0);
@@ -160,6 +141,39 @@ bool cpupower_amd_pstate_enabled(void)
#endif /* #if defined(__i386__) || defined(__x86_64__) */
+int cpufreq_has_generic_boost_support(bool *active)
+{
+ char path[SYSFS_PATH_MAX];
+ char linebuf[2] = {};
+ unsigned long val;
+ char *endp;
+
+ snprintf(path, sizeof(path), PATH_TO_CPU "cpufreq/boost");
+
+ if (!is_valid_path(path))
+ return -EACCES;
+
+ if (cpupower_read_sysfs(path, linebuf, 2) <= 0)
+ return -EINVAL;
+
+ val = strtoul(linebuf, &endp, 0);
+ if (endp == linebuf || errno == ERANGE)
+ return -EINVAL;
+
+ switch (val) {
+ case 0:
+ *active = false;
+ break;
+ case 1:
+ *active = true;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
/* get_cpustate
*
* Gather the information of all online CPUs into bitmask struct
@@ -259,3 +273,21 @@ void print_speed(unsigned long speed, int no_rounding)
}
}
}
+
+int cpupower_set_turbo_boost(int turbo_boost)
+{
+ char path[SYSFS_PATH_MAX];
+ char linebuf[2] = {};
+
+ snprintf(path, sizeof(path), PATH_TO_CPU "cpufreq/boost");
+
+ if (!is_valid_path(path))
+ return -1;
+
+ snprintf(linebuf, sizeof(linebuf), "%d", turbo_boost);
+
+ if (cpupower_write_sysfs(path, linebuf, 2) <= 0)
+ return -1;
+
+ return 0;
+}
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 72a280e7a9d5..47eb2d4d13a5 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -1195,7 +1195,7 @@ static const struct platform_data turbostat_pdata[] = {
{ INTEL_EMERALDRAPIDS_X, &spr_features },
{ INTEL_GRANITERAPIDS_X, &spr_features },
{ INTEL_GRANITERAPIDS_D, &spr_features },
- { INTEL_PANTHERCOVE_X, &dmr_features },
+ { INTEL_DIAMONDRAPIDS_X, &dmr_features },
{ INTEL_LAKEFIELD, &cnl_features },
{ INTEL_ALDERLAKE, &adl_features },
{ INTEL_ALDERLAKE_L, &adl_features },
diff --git a/tools/sched_ext/include/scx/bpf_arena_common.bpf.h b/tools/sched_ext/include/scx/bpf_arena_common.bpf.h
new file mode 100644
index 000000000000..4366fb3c91ce
--- /dev/null
+++ b/tools/sched_ext/include/scx/bpf_arena_common.bpf.h
@@ -0,0 +1,175 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#pragma once
+
+#ifndef PAGE_SIZE
+#define PAGE_SIZE __PAGE_SIZE
+/*
+ * for older kernels try sizeof(struct genradix_node)
+ * or flexible:
+ * static inline long __bpf_page_size(void) {
+ * return bpf_core_enum_value(enum page_size_enum___l, __PAGE_SIZE___l) ?: sizeof(struct genradix_node);
+ * }
+ * but generated code is not great.
+ */
+#endif
+
+#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) && !defined(BPF_ARENA_FORCE_ASM)
+#define __arena __attribute__((address_space(1)))
+#define __arena_global __attribute__((address_space(1)))
+#define cast_kern(ptr) /* nop for bpf prog. emitted by LLVM */
+#define cast_user(ptr) /* nop for bpf prog. emitted by LLVM */
+#else
+
+/* emit instruction:
+ * rX = rX .off = BPF_ADDR_SPACE_CAST .imm32 = (dst_as << 16) | src_as
+ *
+ * This is a workaround for LLVM compiler versions without
+ * __BPF_FEATURE_ADDR_SPACE_CAST that do not automatically cast between arena
+ * pointers and native kernel/userspace ones. In this case we explicitly do so
+ * with cast_kern() and cast_user(). E.g., in the Linux kernel tree,
+ * tools/testing/selftests/bpf includes tests that use these macros to implement
+ * linked lists and hashtables backed by arena memory. In sched_ext, we use
+ * cast_kern() and cast_user() for compatibility with older LLVM toolchains.
+ */
+#ifndef bpf_addr_space_cast
+#define bpf_addr_space_cast(var, dst_as, src_as)\
+ asm volatile(".byte 0xBF; \
+ .ifc %[reg], r0; \
+ .byte 0x00; \
+ .endif; \
+ .ifc %[reg], r1; \
+ .byte 0x11; \
+ .endif; \
+ .ifc %[reg], r2; \
+ .byte 0x22; \
+ .endif; \
+ .ifc %[reg], r3; \
+ .byte 0x33; \
+ .endif; \
+ .ifc %[reg], r4; \
+ .byte 0x44; \
+ .endif; \
+ .ifc %[reg], r5; \
+ .byte 0x55; \
+ .endif; \
+ .ifc %[reg], r6; \
+ .byte 0x66; \
+ .endif; \
+ .ifc %[reg], r7; \
+ .byte 0x77; \
+ .endif; \
+ .ifc %[reg], r8; \
+ .byte 0x88; \
+ .endif; \
+ .ifc %[reg], r9; \
+ .byte 0x99; \
+ .endif; \
+ .short %[off]; \
+ .long %[as]" \
+ : [reg]"+r"(var) \
+ : [off]"i"(BPF_ADDR_SPACE_CAST) \
+ , [as]"i"((dst_as << 16) | src_as));
+#endif
+
+#define __arena
+#define __arena_global SEC(".addr_space.1")
+#define cast_kern(ptr) bpf_addr_space_cast(ptr, 0, 1)
+#define cast_user(ptr) bpf_addr_space_cast(ptr, 1, 0)
+#endif
+
+void __arena* bpf_arena_alloc_pages(void *map, void __arena *addr, __u32 page_cnt,
+ int node_id, __u64 flags) __ksym __weak;
+void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym __weak;
+
+/*
+ * Note that cond_break can only be portably used in the body of a breakable
+ * construct, whereas can_loop can be used anywhere.
+ */
+#ifdef TEST
+#define can_loop true
+#define __cond_break(expr) expr
+#else
+#ifdef __BPF_FEATURE_MAY_GOTO
+#define can_loop \
+ ({ __label__ l_break, l_continue; \
+ bool ret = true; \
+ asm volatile goto("may_goto %l[l_break]" \
+ :::: l_break); \
+ goto l_continue; \
+ l_break: ret = false; \
+ l_continue:; \
+ ret; \
+ })
+
+#define __cond_break(expr) \
+ ({ __label__ l_break, l_continue; \
+ asm volatile goto("may_goto %l[l_break]" \
+ :::: l_break); \
+ goto l_continue; \
+ l_break: expr; \
+ l_continue:; \
+ })
+#else
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define can_loop \
+ ({ __label__ l_break, l_continue; \
+ bool ret = true; \
+ asm volatile goto("1:.byte 0xe5; \
+ .byte 0; \
+ .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \
+ .short 0" \
+ :::: l_break); \
+ goto l_continue; \
+ l_break: ret = false; \
+ l_continue:; \
+ ret; \
+ })
+
+#define __cond_break(expr) \
+ ({ __label__ l_break, l_continue; \
+ asm volatile goto("1:.byte 0xe5; \
+ .byte 0; \
+ .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \
+ .short 0" \
+ :::: l_break); \
+ goto l_continue; \
+ l_break: expr; \
+ l_continue:; \
+ })
+#else
+#define can_loop \
+ ({ __label__ l_break, l_continue; \
+ bool ret = true; \
+ asm volatile goto("1:.byte 0xe5; \
+ .byte 0; \
+ .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \
+ .short 0" \
+ :::: l_break); \
+ goto l_continue; \
+ l_break: ret = false; \
+ l_continue:; \
+ ret; \
+ })
+
+#define __cond_break(expr) \
+ ({ __label__ l_break, l_continue; \
+ asm volatile goto("1:.byte 0xe5; \
+ .byte 0; \
+ .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \
+ .short 0" \
+ :::: l_break); \
+ goto l_continue; \
+ l_break: expr; \
+ l_continue:; \
+ })
+#endif /* __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ */
+#endif /* __BPF_FEATURE_MAY_GOTO */
+#endif /* TEST */
+
+#define cond_break __cond_break(break)
+#define cond_break_label(label) __cond_break(goto label)
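A minimal sketch of how the two differ in practice (the function and array are hypothetical): can_loop can guard any loop condition, while cond_break must appear inside a breakable construct such as a loop body.

static __always_inline u64 arena_sum(u64 __arena *arr, int n)
{
	u64 sum = 0;
	int i;

	/* can_loop can bound any loop condition */
	for (i = 0; i < n && can_loop; i++)
		sum += arr[i];

	/* cond_break must sit inside a breakable construct */
	while (sum < 1000) {
		sum += 1;
		cond_break;
	}
	return sum;
}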
+
+
+void bpf_preempt_disable(void) __weak __ksym;
+void bpf_preempt_enable(void) __weak __ksym;
diff --git a/tools/sched_ext/include/scx/bpf_arena_common.h b/tools/sched_ext/include/scx/bpf_arena_common.h
new file mode 100644
index 000000000000..10141db0b59d
--- /dev/null
+++ b/tools/sched_ext/include/scx/bpf_arena_common.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#pragma once
+
+#ifndef arena_container_of
+#define arena_container_of(ptr, type, member) \
+ ({ \
+ void __arena *__mptr = (void __arena *)(ptr); \
+ ((type *)(__mptr - offsetof(type, member))); \
+ })
+#endif
+
+/* Provide the definition of PAGE_SIZE. */
+#include <sys/user.h>
+
+#define __arena
+#define __arg_arena
+#define cast_kern(ptr) /* nop for user space */
+#define cast_user(ptr) /* nop for user space */
+char __attribute__((weak)) arena[1];
+
+#ifndef offsetof
+#define offsetof(type, member) ((unsigned long)&((type *)0)->member)
+#endif
+
+static inline void __arena* bpf_arena_alloc_pages(void *map, void *addr, __u32 page_cnt,
+ int node_id, __u64 flags)
+{
+ return NULL;
+}
+static inline void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt)
+{
+}
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index d4e21558e982..06e2551033cb 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -24,14 +24,26 @@
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <asm-generic/errno.h>
-#include "user_exit_info.h"
+#include "user_exit_info.bpf.h"
#include "enum_defs.autogen.h"
+#define PF_IDLE 0x00000002 /* I am an IDLE thread */
+#define PF_IO_WORKER 0x00000010 /* Task is an IO worker */
#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
+#define PF_KCOMPACTD 0x00010000 /* I am kcompactd */
+#define PF_KSWAPD 0x00020000 /* I am kswapd */
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
#define PF_EXITING 0x00000004
#define CLOCK_MONOTONIC 1
+#ifndef NR_CPUS
+#define NR_CPUS 1024
+#endif
+
+#ifndef NUMA_NO_NODE
+#define NUMA_NO_NODE (-1)
+#endif
+
extern int LINUX_KERNEL_VERSION __kconfig;
extern const char CONFIG_CC_VERSION_TEXT[64] __kconfig __weak;
extern const char CONFIG_LOCALVERSION[64] __kconfig __weak;
@@ -91,6 +103,8 @@ s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
bool scx_bpf_task_running(const struct task_struct *p) __ksym;
s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
+struct rq *scx_bpf_locked_rq(void) __ksym;
+struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak;
struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak;
u64 scx_bpf_now(void) __ksym __weak;
void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak;
@@ -107,6 +121,9 @@ void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __
static inline __attribute__((format(printf, 1, 2)))
void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {}
+#define SCX_STRINGIFY(x) #x
+#define SCX_TOSTRING(x) SCX_STRINGIFY(x)
+
/*
* Helper macro for initializing the fmt and variadic argument inputs to both
* bstr exit kfuncs. Callers to this function should use ___fmt and ___param to
@@ -141,13 +158,15 @@ void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {}
* scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments
* instead of an array of u64. Invoking this macro will cause the scheduler to
* exit in an erroneous state, with diagnostic information being passed to the
- * user.
+ * user. It prepends the file and line number to the message to aid debugging.
*/
#define scx_bpf_error(fmt, args...) \
({ \
- scx_bpf_bstr_preamble(fmt, args) \
+ scx_bpf_bstr_preamble( \
+ __FILE__ ":" SCX_TOSTRING(__LINE__) ": " fmt, ##args) \
scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \
- ___scx_bpf_bstr_format_checker(fmt, ##args); \
+ ___scx_bpf_bstr_format_checker( \
+ __FILE__ ":" SCX_TOSTRING(__LINE__) ": " fmt, ##args); \
})
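A minimal usage sketch, assuming a DSQ id and return value defined elsewhere in the scheduler:

	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
	if (ret)
		scx_bpf_error("failed to create shared DSQ (%d)", ret);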
/*
@@ -229,6 +248,7 @@ BPF_PROG(name, ##args)
* be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of
* `MEMBER_VPTR(ptr, ->member)`.
*/
+#ifndef MEMBER_VPTR
#define MEMBER_VPTR(base, member) (typeof((base) member) *) \
({ \
u64 __base = (u64)&(base); \
@@ -245,6 +265,7 @@ BPF_PROG(name, ##args)
[max]"i"(sizeof(base) - sizeof((base) member))); \
__addr; \
})
+#endif /* MEMBER_VPTR */
/**
* ARRAY_ELEM_PTR - Obtain the verified pointer to an array element
@@ -260,6 +281,7 @@ BPF_PROG(name, ##args)
* size of the array to compute the max, which will result in rejection by
* the verifier.
*/
+#ifndef ARRAY_ELEM_PTR
#define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *) \
({ \
u64 __base = (u64)arr; \
@@ -274,7 +296,7 @@ BPF_PROG(name, ##args)
[max]"r"(sizeof(arr[0]) * ((n) - 1))); \
__addr; \
})
-
+#endif /* ARRAY_ELEM_PTR */
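As a hedged illustration (the cpu_stats array and bump() helper are hypothetical), a verifier-friendly array access could look like:

/* global array, e.g. per-CPU counters */
u64 cpu_stats[NR_CPUS];

static void bump(s32 cpu)
{
	u64 *cnt = ARRAY_ELEM_PTR(cpu_stats, cpu, NR_CPUS);

	if (cnt)
		*cnt += 1;
}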
/*
* BPF declarations and helpers
@@ -438,8 +460,27 @@ static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask)
*/
static inline bool is_migration_disabled(const struct task_struct *p)
{
- if (bpf_core_field_exists(p->migration_disabled))
- return p->migration_disabled;
+	/*
+	 * Testing p->migration_disabled in BPF code is tricky because
+	 * migration is _always_ disabled while BPF code is running.
+	 * The prolog (__bpf_prog_enter) and epilog (__bpf_prog_exit) of BPF
+	 * program execution disable and re-enable migration of the current
+	 * task, respectively. So, the _current_ task of a sched_ext op is
+	 * always migration-disabled. Moreover, p->migration_disabled can be
+	 * two or greater when a sched_ext ops BPF program (e.g., ops.tick)
+	 * runs in the middle of another BPF program's execution.
+	 *
+	 * Therefore, the _current_ task should be considered
+	 * migration-disabled only when its migration_disabled count is
+	 * greater than one. In other words, p->migration_disabled == 1 is
+	 * ambiguous, so we check whether @p is the current task.
+	 */
+ if (bpf_core_field_exists(p->migration_disabled)) {
+ if (p->migration_disabled == 1)
+ return bpf_get_current_task_btf() != p;
+ else
+ return p->migration_disabled;
+ }
return false;
}
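A hedged sketch of how an enqueue path might act on this; SCX_DSQ_LOCAL_ON, slice_ns, and enq_flags are assumed to be available in the calling scheduler and are not part of this patch:

	if (is_migration_disabled(p)) {
		/* keep the task on the CPU it is currently bound to */
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | scx_bpf_task_cpu(p),
				   slice_ns, enq_flags);
		return;
	}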
@@ -476,7 +517,7 @@ static inline s64 time_delta(u64 after, u64 before)
*/
static inline bool time_after(u64 a, u64 b)
{
- return (s64)(b - a) < 0;
+ return (s64)(b - a) < 0;
}
/**
@@ -500,7 +541,7 @@ static inline bool time_before(u64 a, u64 b)
*/
static inline bool time_after_eq(u64 a, u64 b)
{
- return (s64)(a - b) >= 0;
+ return (s64)(a - b) >= 0;
}
/**
@@ -547,9 +588,15 @@ static inline bool time_in_range_open(u64 a, u64 b, u64 c)
*/
/* useful compiler attributes */
+#ifndef likely
#define likely(x) __builtin_expect(!!(x), 1)
+#endif
+#ifndef unlikely
#define unlikely(x) __builtin_expect(!!(x), 0)
+#endif
+#ifndef __maybe_unused
#define __maybe_unused __attribute__((__unused__))
+#endif
/*
* READ/WRITE_ONCE() are from kernel (include/asm-generic/rwonce.h). They
@@ -633,6 +680,26 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
})
/*
+ * __calc_avg - Calculate an exponentially weighted moving average (EWMA) of
+ * the @old and @new values. @decay controls how much of the @old value is
+ * retained: with a larger @decay, the moving average changes more slowly and
+ * exhibits fewer fluctuations.
+ */
+#define __calc_avg(old, new, decay) ({ \
+ typeof(decay) thr = 1 << (decay); \
+ typeof(old) ret; \
+ if (((old) < thr) || ((new) < thr)) { \
+ if (((old) == 1) && ((new) == 0)) \
+ ret = 0; \
+ else \
+ ret = ((old) - ((old) >> 1)) + ((new) >> 1); \
+ } else { \
+ ret = ((old) - ((old) >> (decay))) + ((new) >> (decay)); \
+ } \
+ ret; \
+})
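For example, with decay = 2 the macro keeps three quarters of the old value and folds in one quarter of the new sample:

	u64 avg = 100;

	avg = __calc_avg(avg, 200, 2);	/* 100 - 100/4 + 200/4 = 125 */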
+
+/*
* log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value.
* @v: The value for which we're computing the base 2 logarithm.
*/
@@ -663,6 +730,25 @@ static inline u32 log2_u64(u64 v)
}
/*
+ * __sqrt_u64 - Approximate the square root of @x using Newton's method
+ * (capped at eight iterations).
+ */
+static inline u64 __sqrt_u64(u64 x)
+{
+ if (x == 0 || x == 1)
+ return x;
+
+ u64 r = ((1ULL << 32) > x) ? x : (1ULL << 32);
+
+ for (int i = 0; i < 8; ++i) {
+ u64 q = x / r;
+ if (r <= q)
+ break;
+ r = (r + q) >> 1;
+ }
+ return r;
+}
+
+/*
* Return a value proportionally scaled to the task's weight.
*/
static inline u64 scale_by_task_weight(const struct task_struct *p, u64 value)
diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h
index 1dc76bd84296..b3c6372bcf81 100644
--- a/tools/sched_ext/include/scx/common.h
+++ b/tools/sched_ext/include/scx/common.h
@@ -75,8 +75,9 @@ typedef int64_t s64;
#include "enums.h"
/* not available when building kernel tools/sched_ext */
-#if __has_include(<lib/sdt_task.h>)
-#include <lib/sdt_task.h>
+#if __has_include(<lib/sdt_task_defs.h>)
+#include "bpf_arena_common.h"
+#include <lib/sdt_task_defs.h>
#endif
#endif /* __SCHED_EXT_COMMON_H */
diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h
index 9252e1a00556..dd9144624dc9 100644
--- a/tools/sched_ext/include/scx/compat.bpf.h
+++ b/tools/sched_ext/include/scx/compat.bpf.h
@@ -38,6 +38,7 @@ void scx_bpf_dispatch_from_dsq_set_slice___compat(struct bpf_iter_scx_dsq *it__i
void scx_bpf_dispatch_from_dsq_set_vtime___compat(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
bool scx_bpf_dispatch_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
+int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak;
#define scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags) \
(bpf_ksym_exists(scx_bpf_dsq_insert) ? \
@@ -82,6 +83,10 @@ bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter,
scx_bpf_dispatch_vtime_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \
false))
+#define __COMPAT_bpf_cpumask_populate(cpumask, src, size__sz) \
+ (bpf_ksym_exists(bpf_cpumask_populate) ? \
+ (bpf_cpumask_populate(cpumask, src, size__sz)) : -EOPNOTSUPP)
+
#define scx_bpf_dispatch(p, dsq_id, slice, enq_flags) \
_Static_assert(false, "scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()")
@@ -226,6 +231,23 @@ static inline bool __COMPAT_is_enq_cpu_selected(u64 enq_flags)
scx_bpf_pick_any_cpu(cpus_allowed, flags))
/*
+ * v6.18: Add a helper to retrieve the current task running on a CPU.
+ *
+ * Keep this helper available until v6.20 for compatibility.
+ */
+static inline struct task_struct *__COMPAT_scx_bpf_cpu_curr(int cpu)
+{
+ struct rq *rq;
+
+ if (bpf_ksym_exists(scx_bpf_cpu_curr))
+ return scx_bpf_cpu_curr(cpu);
+
+ rq = scx_bpf_cpu_rq(cpu);
+
+ return rq ? rq->curr : NULL;
+}
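A hedged sketch of scanning all CPUs with this helper; nr_cpu_ids is assumed to be a rodata variable set by userspace, and the RCU read lock around the access is a cautious assumption of this sketch:

	s32 cpu;

	bpf_for(cpu, 0, nr_cpu_ids) {
		struct task_struct *curr;

		bpf_rcu_read_lock();
		curr = __COMPAT_scx_bpf_cpu_curr(cpu);
		if (curr)
			bpf_printk("cpu%d running %s", cpu, curr->comm);
		bpf_rcu_read_unlock();
	}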
+
+/*
* Define sched_ext_ops. This may be expanded to define multiple variants for
* backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().
*/
diff --git a/tools/sched_ext/include/scx/user_exit_info.bpf.h b/tools/sched_ext/include/scx/user_exit_info.bpf.h
new file mode 100644
index 000000000000..e7ac6611a990
--- /dev/null
+++ b/tools/sched_ext/include/scx/user_exit_info.bpf.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Define struct user_exit_info which is shared between BPF and userspace parts
+ * to communicate exit status and other information.
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+
+#ifndef __USER_EXIT_INFO_BPF_H
+#define __USER_EXIT_INFO_BPF_H
+
+#ifndef LSP
+#include "vmlinux.h"
+#endif
+#include <bpf/bpf_core_read.h>
+
+#include "user_exit_info_common.h"
+
+#define UEI_DEFINE(__name) \
+ char RESIZABLE_ARRAY(data, __name##_dump); \
+ const volatile u32 __name##_dump_len; \
+ struct user_exit_info __name SEC(".data")
+
+#define UEI_RECORD(__uei_name, __ei) ({ \
+ bpf_probe_read_kernel_str(__uei_name.reason, \
+ sizeof(__uei_name.reason), (__ei)->reason); \
+ bpf_probe_read_kernel_str(__uei_name.msg, \
+ sizeof(__uei_name.msg), (__ei)->msg); \
+ bpf_probe_read_kernel_str(__uei_name##_dump, \
+ __uei_name##_dump_len, (__ei)->dump); \
+ if (bpf_core_field_exists((__ei)->exit_code)) \
+ __uei_name.exit_code = (__ei)->exit_code; \
+ /* use __sync to force memory barrier */ \
+ __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \
+ (__ei)->kind); \
+})
+
+#endif /* __USER_EXIT_INFO_BPF_H */
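The usual pattern on the BPF side looks roughly like this; the ops name is hypothetical:

UEI_DEFINE(uei);

void BPF_STRUCT_OPS(example_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}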
diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h
index 66f856640ee7..399697fa372f 100644
--- a/tools/sched_ext/include/scx/user_exit_info.h
+++ b/tools/sched_ext/include/scx/user_exit_info.h
@@ -10,55 +10,11 @@
#ifndef __USER_EXIT_INFO_H
#define __USER_EXIT_INFO_H
-#ifdef LSP
-#define __bpf__
-#include "../vmlinux.h"
-#endif
-
-enum uei_sizes {
- UEI_REASON_LEN = 128,
- UEI_MSG_LEN = 1024,
- UEI_DUMP_DFL_LEN = 32768,
-};
-
-struct user_exit_info {
- int kind;
- s64 exit_code;
- char reason[UEI_REASON_LEN];
- char msg[UEI_MSG_LEN];
-};
-
-#ifdef __bpf__
-
-#ifndef LSP
-#include "vmlinux.h"
-#endif
-#include <bpf/bpf_core_read.h>
-
-#define UEI_DEFINE(__name) \
- char RESIZABLE_ARRAY(data, __name##_dump); \
- const volatile u32 __name##_dump_len; \
- struct user_exit_info __name SEC(".data")
-
-#define UEI_RECORD(__uei_name, __ei) ({ \
- bpf_probe_read_kernel_str(__uei_name.reason, \
- sizeof(__uei_name.reason), (__ei)->reason); \
- bpf_probe_read_kernel_str(__uei_name.msg, \
- sizeof(__uei_name.msg), (__ei)->msg); \
- bpf_probe_read_kernel_str(__uei_name##_dump, \
- __uei_name##_dump_len, (__ei)->dump); \
- if (bpf_core_field_exists((__ei)->exit_code)) \
- __uei_name.exit_code = (__ei)->exit_code; \
- /* use __sync to force memory barrier */ \
- __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \
- (__ei)->kind); \
-})
-
-#else /* !__bpf__ */
-
#include <stdio.h>
#include <stdbool.h>
+#include "user_exit_info_common.h"
+
/* no need to call the following explicitly if SCX_OPS_LOAD() is used */
#define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({ \
u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN; \
@@ -114,5 +70,4 @@ enum uei_ecode_mask {
#define UEI_ECODE_RESTART(__ecode) (UEI_ECODE_SYS_ACT((__ecode)) == SCX_ECODE_ACT_RESTART)
-#endif /* __bpf__ */
#endif /* __USER_EXIT_INFO_H */
diff --git a/tools/sched_ext/include/scx/user_exit_info_common.h b/tools/sched_ext/include/scx/user_exit_info_common.h
new file mode 100644
index 000000000000..2d0981aedd89
--- /dev/null
+++ b/tools/sched_ext/include/scx/user_exit_info_common.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Define struct user_exit_info which is shared between BPF and userspace parts
+ * to communicate exit status and other information.
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+#ifndef __USER_EXIT_INFO_COMMON_H
+#define __USER_EXIT_INFO_COMMON_H
+
+#ifdef LSP
+#include "../vmlinux.h"
+#endif
+
+enum uei_sizes {
+ UEI_REASON_LEN = 128,
+ UEI_MSG_LEN = 1024,
+ UEI_DUMP_DFL_LEN = 32768,
+};
+
+struct user_exit_info {
+ int kind;
+ s64 exit_code;
+ char reason[UEI_REASON_LEN];
+ char msg[UEI_MSG_LEN];
+};
+
+#endif /* __USER_EXIT_INFO_COMMON_H */
diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c
index 50bc1737c167..55df8b798865 100644
--- a/tools/sched_ext/scx_central.bpf.c
+++ b/tools/sched_ext/scx_central.bpf.c
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
- * A central FIFO sched_ext scheduler which demonstrates the followings:
+ * A central FIFO sched_ext scheduler which demonstrates the following:
*
* a. Making all scheduling decisions from one CPU:
*
diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c
index 6ba6e610eeaa..55931a4cd71c 100644
--- a/tools/sched_ext/scx_central.c
+++ b/tools/sched_ext/scx_central.c
@@ -61,6 +61,7 @@ restart:
skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus();
skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
+ assert(skel->rodata->nr_cpu_ids > 0);
assert(skel->rodata->nr_cpu_ids <= INT32_MAX);
while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) {
diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c
index fdc7170639e6..2c720e3ecad5 100644
--- a/tools/sched_ext/scx_flatcg.bpf.c
+++ b/tools/sched_ext/scx_flatcg.bpf.c
@@ -950,5 +950,5 @@ SCX_OPS_DEFINE(flatcg_ops,
.cgroup_move = (void *)fcg_cgroup_move,
.init = (void *)fcg_init,
.exit = (void *)fcg_exit,
- .flags = SCX_OPS_ENQ_EXITING,
+ .flags = SCX_OPS_HAS_CGROUP_WEIGHT | SCX_OPS_ENQ_EXITING,
.name = "flatcg");
diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c
index 6dd423eeb4ff..cd85eb401179 100644
--- a/tools/sched_ext/scx_flatcg.c
+++ b/tools/sched_ext/scx_flatcg.c
@@ -6,6 +6,7 @@
*/
#include <stdio.h>
#include <signal.h>
+#include <assert.h>
#include <unistd.h>
#include <libgen.h>
#include <limits.h>
@@ -137,6 +138,7 @@ restart:
skel = SCX_OPS_OPEN(flatcg_ops, scx_flatcg);
skel->rodata->nr_cpus = libbpf_num_possible_cpus();
+ assert(skel->rodata->nr_cpus > 0);
skel->rodata->cgrp_slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) {
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 69d877501cb7..3072b593f898 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -39,7 +39,8 @@ const volatile u32 stall_kernel_nth;
const volatile u32 dsp_inf_loop_after;
const volatile u32 dsp_batch;
const volatile bool highpri_boosting;
-const volatile bool print_shared_dsq;
+const volatile bool print_dsqs_and_events;
+const volatile bool print_msgs;
const volatile s32 disallow_tgid;
const volatile bool suppress_dump;
@@ -56,7 +57,8 @@ struct qmap {
queue1 SEC(".maps"),
queue2 SEC(".maps"),
queue3 SEC(".maps"),
- queue4 SEC(".maps");
+ queue4 SEC(".maps"),
+ dump_store SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
@@ -578,11 +580,26 @@ void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
return;
scx_bpf_dump("QMAP FIFO[%d]:", i);
+
+	/*
+	 * Dump can be invoked at any time and there is no way to iterate
+	 * over the FIFO non-destructively. Pop each entry into dump_store
+	 * and restore it afterwards. If racing against new enqueues, the
+	 * ordering can get mixed up.
+	 */
bpf_repeat(4096) {
if (bpf_map_pop_elem(fifo, &pid))
break;
+ bpf_map_push_elem(&dump_store, &pid, 0);
scx_bpf_dump(" %d", pid);
}
+
+ bpf_repeat(4096) {
+ if (bpf_map_pop_elem(&dump_store, &pid))
+ break;
+ bpf_map_push_elem(fifo, &pid, 0);
+ }
+
scx_bpf_dump("\n");
}
}
@@ -617,22 +634,25 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc
s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args)
{
- bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu",
- cgrp->kn->id, args->weight, args->bw_period_us,
- args->bw_quota_us, args->bw_burst_us);
+ if (print_msgs)
+ bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu",
+ cgrp->kn->id, args->weight, args->bw_period_us,
+ args->bw_quota_us, args->bw_burst_us);
return 0;
}
void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
{
- bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight);
+ if (print_msgs)
+ bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight);
}
void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp,
u64 period_us, u64 quota_us, u64 burst_us)
{
- bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu", cgrp->kn->id,
- period_us, quota_us, burst_us);
+ if (print_msgs)
+ bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu",
+ cgrp->kn->id, period_us, quota_us, burst_us);
}
/*
@@ -676,16 +696,20 @@ static void print_cpus(void)
void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
{
- bpf_printk("CPU %d coming online", cpu);
- /* @cpu is already online at this point */
- print_cpus();
+ if (print_msgs) {
+ bpf_printk("CPU %d coming online", cpu);
+ /* @cpu is already online at this point */
+ print_cpus();
+ }
}
void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
{
- bpf_printk("CPU %d going offline", cpu);
- /* @cpu is still online at this point */
- print_cpus();
+ if (print_msgs) {
+ bpf_printk("CPU %d going offline", cpu);
+ /* @cpu is still online at this point */
+ print_cpus();
+ }
}
struct monitor_timer {
@@ -783,35 +807,36 @@ static void dump_shared_dsq(void)
static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
{
- struct scx_event_stats events;
-
bpf_rcu_read_lock();
dispatch_highpri(true);
bpf_rcu_read_unlock();
monitor_cpuperf();
- if (print_shared_dsq)
+ if (print_dsqs_and_events) {
+ struct scx_event_stats events;
+
dump_shared_dsq();
- __COMPAT_scx_bpf_events(&events, sizeof(events));
-
- bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK",
- scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK));
- bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE",
- scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE));
- bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST",
- scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST));
- bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING",
- scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING));
- bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL",
- scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL));
- bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION",
- scx_read_event(&events, SCX_EV_BYPASS_DURATION));
- bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH",
- scx_read_event(&events, SCX_EV_BYPASS_DISPATCH));
- bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE",
- scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE));
+ __COMPAT_scx_bpf_events(&events, sizeof(events));
+
+ bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK",
+ scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK));
+ bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE",
+ scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE));
+ bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST",
+ scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST));
+ bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING",
+ scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING));
+ bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL",
+ scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL));
+ bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION",
+ scx_read_event(&events, SCX_EV_BYPASS_DURATION));
+ bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH",
+ scx_read_event(&events, SCX_EV_BYPASS_DISPATCH));
+ bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE",
+ scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE));
+ }
bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
return 0;
@@ -823,7 +848,8 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
struct bpf_timer *timer;
s32 ret;
- print_cpus();
+ if (print_msgs)
+ print_cpus();
ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
if (ret)
diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
index c4912ab2e76f..ef701d45ba43 100644
--- a/tools/sched_ext/scx_qmap.c
+++ b/tools/sched_ext/scx_qmap.c
@@ -20,7 +20,7 @@ const char help_fmt[] =
"See the top-level comment in .bpf.c for more details.\n"
"\n"
"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n"
-" [-P] [-d PID] [-D LEN] [-p] [-v]\n"
+" [-P] [-M] [-d PID] [-D LEN] [-p] [-v]\n"
"\n"
" -s SLICE_US Override slice duration\n"
" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n"
@@ -28,7 +28,8 @@ const char help_fmt[] =
" -T COUNT Stall every COUNT'th kernel thread\n"
" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n"
" -b COUNT Dispatch upto COUNT tasks together\n"
-" -P Print out DSQ content to trace_pipe every second, use with -b\n"
+" -P Print out DSQ content and event counters to trace_pipe every second\n"
+" -M Print out debug messages to trace_pipe\n"
" -H Boost nice -20 tasks in SHARED_DSQ, use with -b\n"
" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n"
" -D LEN Set scx_exit_info.dump buffer length\n"
@@ -66,7 +67,7 @@ int main(int argc, char **argv)
skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
- while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PHd:D:Spvh")) != -1) {
+ while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PMHd:D:Spvh")) != -1) {
switch (opt) {
case 's':
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@@ -87,7 +88,10 @@ int main(int argc, char **argv)
skel->rodata->dsp_batch = strtoul(optarg, NULL, 0);
break;
case 'P':
- skel->rodata->print_shared_dsq = true;
+ skel->rodata->print_dsqs_and_events = true;
+ break;
+ case 'M':
+ skel->rodata->print_msgs = true;
break;
case 'H':
skel->rodata->highpri_boosting = true;
diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c
index 76d83199545c..06d4b13bf76b 100644
--- a/tools/sched_ext/scx_simple.c
+++ b/tools/sched_ext/scx_simple.c
@@ -7,6 +7,7 @@
#include <stdio.h>
#include <unistd.h>
#include <signal.h>
+#include <assert.h>
#include <libgen.h>
#include <bpf/bpf.h>
#include <scx/common.h>
@@ -41,6 +42,7 @@ static void sigint_handler(int simple)
static void read_stats(struct scx_simple *skel, __u64 *stats)
{
int nr_cpus = libbpf_num_possible_cpus();
+ assert(nr_cpus > 0);
__u64 cnts[2][nr_cpus];
__u32 idx;
diff --git a/tools/scripts/syscall.tbl b/tools/scripts/syscall.tbl
index 580b4e246aec..d1ae5e92c615 100644
--- a/tools/scripts/syscall.tbl
+++ b/tools/scripts/syscall.tbl
@@ -408,3 +408,5 @@
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
467 common open_tree_attr sys_open_tree_attr
+468 common file_getattr sys_file_getattr
+469 common file_setattr sys_file_setattr
diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c
index 002ec38a8bbb..3b96d090c5eb 100644
--- a/tools/testing/selftests/arm64/abi/hwcap.c
+++ b/tools/testing/selftests/arm64/abi/hwcap.c
@@ -17,6 +17,8 @@
#include <asm/sigcontext.h>
#include <asm/unistd.h>
+#include <linux/auxvec.h>
+
#include "../../kselftest.h"
#define TESTS_PER_HWCAP 3
@@ -55,7 +57,6 @@ static void cmpbr_sigill(void)
/* Not implemented, too complicated and unreliable anyway */
}
-
static void crc32_sigill(void)
{
/* CRC32W W0, W0, W1 */
@@ -169,6 +170,18 @@ static void lse128_sigill(void)
: "cc", "memory");
}
+static void lsfe_sigill(void)
+{
+ float __attribute__ ((aligned (16))) mem;
+ register float *memp asm ("x0") = &mem;
+
+ /* STFADD H0, [X0] */
+ asm volatile(".inst 0x7c20801f"
+ : "+r" (memp)
+ :
+ : "memory");
+}
+
static void lut_sigill(void)
{
/* LUTI2 V0.16B, { V0.16B }, V[0] */
@@ -763,6 +776,13 @@ static const struct hwcap_data {
.sigill_fn = lse128_sigill,
},
{
+ .name = "LSFE",
+ .at_hwcap = AT_HWCAP3,
+ .hwcap_bit = HWCAP3_LSFE,
+ .cpuinfo = "lsfe",
+ .sigill_fn = lsfe_sigill,
+ },
+ {
.name = "LUT",
.at_hwcap = AT_HWCAP2,
.hwcap_bit = HWCAP2_LUT,
diff --git a/tools/testing/selftests/arm64/abi/tpidr2.c b/tools/testing/selftests/arm64/abi/tpidr2.c
index f58a9f89b952..4c89ab0f1010 100644
--- a/tools/testing/selftests/arm64/abi/tpidr2.c
+++ b/tools/testing/selftests/arm64/abi/tpidr2.c
@@ -227,10 +227,10 @@ int main(int argc, char **argv)
ret = open("/proc/sys/abi/sme_default_vector_length", O_RDONLY, 0);
if (ret >= 0) {
ksft_test_result(default_value(), "default_value\n");
- ksft_test_result(write_read, "write_read\n");
- ksft_test_result(write_sleep_read, "write_sleep_read\n");
- ksft_test_result(write_fork_read, "write_fork_read\n");
- ksft_test_result(write_clone_read, "write_clone_read\n");
+ ksft_test_result(write_read(), "write_read\n");
+ ksft_test_result(write_sleep_read(), "write_sleep_read\n");
+ ksft_test_result(write_fork_read(), "write_fork_read\n");
+ ksft_test_result(write_clone_read(), "write_clone_read\n");
} else {
ksft_print_msg("SME support not present\n");
diff --git a/tools/testing/selftests/arm64/bti/assembler.h b/tools/testing/selftests/arm64/bti/assembler.h
index 04e7b72880ef..141cdcbf0b8f 100644
--- a/tools/testing/selftests/arm64/bti/assembler.h
+++ b/tools/testing/selftests/arm64/bti/assembler.h
@@ -14,7 +14,6 @@
#define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
#define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
-
.macro startfn name:req
.globl \name
\name:
diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.c b/tools/testing/selftests/arm64/fp/fp-ptrace.c
index 124bc883365e..a85c19e9524e 100644
--- a/tools/testing/selftests/arm64/fp/fp-ptrace.c
+++ b/tools/testing/selftests/arm64/fp/fp-ptrace.c
@@ -1187,7 +1187,7 @@ static void sve_write_sve(pid_t child, struct test_config *config)
if (!vl)
return;
- iov.iov_len = SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, SVE_PT_REGS_SVE);
+ iov.iov_len = SVE_PT_SIZE(vq, SVE_PT_REGS_SVE);
iov.iov_base = malloc(iov.iov_len);
if (!iov.iov_base) {
ksft_print_msg("Failed allocating %lu byte SVE write buffer\n",
@@ -1234,8 +1234,7 @@ static void sve_write_fpsimd(pid_t child, struct test_config *config)
if (!vl)
return;
- iov.iov_len = SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq,
- SVE_PT_REGS_FPSIMD);
+ iov.iov_len = SVE_PT_SIZE(vq, SVE_PT_REGS_FPSIMD);
iov.iov_base = malloc(iov.iov_len);
if (!iov.iov_base) {
ksft_print_msg("Failed allocating %lu byte SVE write buffer\n",
@@ -1569,7 +1568,6 @@ static void run_sve_tests(void)
&test_config);
}
}
-
}
static void run_sme_tests(void)
diff --git a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c
index 74e23208b94c..9349aa630c84 100644
--- a/tools/testing/selftests/arm64/fp/fp-stress.c
+++ b/tools/testing/selftests/arm64/fp/fp-stress.c
@@ -105,8 +105,8 @@ static void child_start(struct child_data *child, const char *program)
/*
* Read from the startup pipe, there should be no data
- * and we should block until it is closed. We just
- * carry on on error since this isn't super critical.
+ * and we should block until it is closed. We just
+	 * carry on on error since this isn't super critical.
*/
ret = read(3, &i, sizeof(i));
if (ret < 0)
@@ -549,7 +549,7 @@ int main(int argc, char **argv)
evs = calloc(tests, sizeof(*evs));
if (!evs)
- ksft_exit_fail_msg("Failed to allocated %d epoll events\n",
+ ksft_exit_fail_msg("Failed to allocate %d epoll events\n",
tests);
for (i = 0; i < cpus; i++) {
diff --git a/tools/testing/selftests/arm64/fp/kernel-test.c b/tools/testing/selftests/arm64/fp/kernel-test.c
index e3cec3723ffa..0c40007d1282 100644
--- a/tools/testing/selftests/arm64/fp/kernel-test.c
+++ b/tools/testing/selftests/arm64/fp/kernel-test.c
@@ -188,13 +188,13 @@ static bool create_socket(void)
ref = malloc(digest_len);
if (!ref) {
- printf("Failed to allocated %d byte reference\n", digest_len);
+ printf("Failed to allocate %d byte reference\n", digest_len);
return false;
}
digest = malloc(digest_len);
if (!digest) {
- printf("Failed to allocated %d byte digest\n", digest_len);
+ printf("Failed to allocate %d byte digest\n", digest_len);
return false;
}
diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c
index b22303778fb0..e0fc3a001e28 100644
--- a/tools/testing/selftests/arm64/fp/sve-ptrace.c
+++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c
@@ -66,7 +66,7 @@ static const struct vec_type vec_types[] = {
};
#define VL_TESTS (((TEST_VQ_MAX - SVE_VQ_MIN) + 1) * 4)
-#define FLAG_TESTS 2
+#define FLAG_TESTS 4
#define FPSIMD_TESTS 2
#define EXPECTED_TESTS ((VL_TESTS + FLAG_TESTS + FPSIMD_TESTS) * ARRAY_SIZE(vec_types))
@@ -95,19 +95,27 @@ static int do_child(void)
static int get_fpsimd(pid_t pid, struct user_fpsimd_state *fpsimd)
{
struct iovec iov;
+ int ret;
iov.iov_base = fpsimd;
iov.iov_len = sizeof(*fpsimd);
- return ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov);
+ ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov);
+ if (ret == -1)
+ ksft_perror("ptrace(PTRACE_GETREGSET)");
+ return ret;
}
static int set_fpsimd(pid_t pid, struct user_fpsimd_state *fpsimd)
{
struct iovec iov;
+ int ret;
iov.iov_base = fpsimd;
iov.iov_len = sizeof(*fpsimd);
- return ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov);
+ ret = ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov);
+ if (ret == -1)
+ ksft_perror("ptrace(PTRACE_SETREGSET)");
+ return ret;
}
static struct user_sve_header *get_sve(pid_t pid, const struct vec_type *type,
@@ -115,8 +123,9 @@ static struct user_sve_header *get_sve(pid_t pid, const struct vec_type *type,
{
struct user_sve_header *sve;
void *p;
- size_t sz = sizeof *sve;
+ size_t sz = sizeof(*sve);
struct iovec iov;
+ int ret;
while (1) {
if (*size < sz) {
@@ -132,8 +141,11 @@ static struct user_sve_header *get_sve(pid_t pid, const struct vec_type *type,
iov.iov_base = *buf;
iov.iov_len = sz;
- if (ptrace(PTRACE_GETREGSET, pid, type->regset, &iov))
+ ret = ptrace(PTRACE_GETREGSET, pid, type->regset, &iov);
+ if (ret) {
+ ksft_perror("ptrace(PTRACE_GETREGSET)");
goto error;
+ }
sve = *buf;
if (sve->size <= sz)
@@ -152,10 +164,46 @@ static int set_sve(pid_t pid, const struct vec_type *type,
const struct user_sve_header *sve)
{
struct iovec iov;
+ int ret;
iov.iov_base = (void *)sve;
iov.iov_len = sve->size;
- return ptrace(PTRACE_SETREGSET, pid, type->regset, &iov);
+ ret = ptrace(PTRACE_SETREGSET, pid, type->regset, &iov);
+ if (ret == -1)
+ ksft_perror("ptrace(PTRACE_SETREGSET)");
+ return ret;
+}
+
+/* A read operation fails */
+static void read_fails(pid_t child, const struct vec_type *type)
+{
+ struct user_sve_header *new_sve = NULL;
+ size_t new_sve_size = 0;
+ void *ret;
+
+ ret = get_sve(child, type, (void **)&new_sve, &new_sve_size);
+
+ ksft_test_result(ret == NULL, "%s unsupported read fails\n",
+ type->name);
+
+ free(new_sve);
+}
+
+/* A write operation fails */
+static void write_fails(pid_t child, const struct vec_type *type)
+{
+ struct user_sve_header sve;
+ int ret;
+
+ /* Just the header, no data */
+ memset(&sve, 0, sizeof(sve));
+ sve.size = sizeof(sve);
+ sve.flags = SVE_PT_REGS_SVE;
+ sve.vl = SVE_VL_MIN;
+ ret = set_sve(child, type, &sve);
+
+ ksft_test_result(ret != 0, "%s unsupported write fails\n",
+ type->name);
}
/* Validate setting and getting the inherit flag */
@@ -270,6 +318,25 @@ static void check_u32(unsigned int vl, const char *reg,
}
}
+/* Set out of range VLs */
+static void ptrace_set_vl_ranges(pid_t child, const struct vec_type *type)
+{
+ struct user_sve_header sve;
+ int ret;
+
+ memset(&sve, 0, sizeof(sve));
+ sve.flags = SVE_PT_REGS_SVE;
+ sve.size = sizeof(sve);
+
+ ret = set_sve(child, type, &sve);
+ ksft_test_result(ret != 0, "%s Set invalid VL 0\n", type->name);
+
+ sve.vl = SVE_VL_MAX + SVE_VQ_BYTES;
+ ret = set_sve(child, type, &sve);
+ ksft_test_result(ret != 0, "%s Set invalid VL %d\n", type->name,
+ SVE_VL_MAX + SVE_VQ_BYTES);
+}
+
/* Access the FPSIMD registers via the SVE regset */
static void ptrace_sve_fpsimd(pid_t child, const struct vec_type *type)
{
@@ -683,6 +750,20 @@ static int do_parent(pid_t child)
}
for (i = 0; i < ARRAY_SIZE(vec_types); i++) {
+ /*
+ * If the vector type isn't supported reads and writes
+ * should fail.
+ */
+ if (!(getauxval(vec_types[i].hwcap_type) & vec_types[i].hwcap)) {
+ read_fails(child, &vec_types[i]);
+ write_fails(child, &vec_types[i]);
+ } else {
+ ksft_test_result_skip("%s unsupported read fails\n",
+ vec_types[i].name);
+ ksft_test_result_skip("%s unsupported write fails\n",
+ vec_types[i].name);
+ }
+
/* FPSIMD via SVE regset */
if (getauxval(vec_types[i].hwcap_type) & vec_types[i].hwcap) {
ptrace_sve_fpsimd(child, &vec_types[i]);
@@ -703,6 +784,17 @@ static int do_parent(pid_t child)
vec_types[i].name);
}
+ /* Setting out of bounds VLs should fail */
+ if (getauxval(vec_types[i].hwcap_type) & vec_types[i].hwcap) {
+ ptrace_set_vl_ranges(child, &vec_types[i]);
+ } else {
+ ksft_test_result_skip("%s Set invalid VL 0\n",
+ vec_types[i].name);
+ ksft_test_result_skip("%s Set invalid VL %d\n",
+ vec_types[i].name,
+ SVE_VL_MAX + SVE_VQ_BYTES);
+ }
+
/* Step through every possible VQ */
for (vq = SVE_VQ_MIN; vq <= TEST_VQ_MAX; vq++) {
vl = sve_vl_from_vq(vq);
diff --git a/tools/testing/selftests/arm64/fp/vec-syscfg.c b/tools/testing/selftests/arm64/fp/vec-syscfg.c
index ea9c7d47790f..2d75d342eeb9 100644
--- a/tools/testing/selftests/arm64/fp/vec-syscfg.c
+++ b/tools/testing/selftests/arm64/fp/vec-syscfg.c
@@ -690,7 +690,6 @@ static inline void smstop(void)
asm volatile("msr S0_3_C4_C6_3, xzr");
}
-
/*
* Verify we can change the SVE vector length while SME is active and
* continue to use SME afterwards.
diff --git a/tools/testing/selftests/arm64/fp/zt-ptrace.c b/tools/testing/selftests/arm64/fp/zt-ptrace.c
index 584b8d59b7ea..a7f34040fbf1 100644
--- a/tools/testing/selftests/arm64/fp/zt-ptrace.c
+++ b/tools/testing/selftests/arm64/fp/zt-ptrace.c
@@ -108,7 +108,6 @@ static int get_zt(pid_t pid, char zt[ZT_SIG_REG_BYTES])
return ptrace(PTRACE_GETREGSET, pid, NT_ARM_ZT, &iov);
}
-
static int set_zt(pid_t pid, const char zt[ZT_SIG_REG_BYTES])
{
struct iovec iov;
diff --git a/tools/testing/selftests/arm64/gcs/Makefile b/tools/testing/selftests/arm64/gcs/Makefile
index d2f3497a9103..1fbbf0ca1f02 100644
--- a/tools/testing/selftests/arm64/gcs/Makefile
+++ b/tools/testing/selftests/arm64/gcs/Makefile
@@ -14,11 +14,11 @@ LDLIBS+=-lpthread
include ../../lib.mk
$(OUTPUT)/basic-gcs: basic-gcs.c
- $(CC) -g -fno-asynchronous-unwind-tables -fno-ident -s -Os -nostdlib \
- -static -include ../../../../include/nolibc/nolibc.h \
+ $(CC) $(CFLAGS) -fno-asynchronous-unwind-tables -fno-ident -s -nostdlib -nostdinc \
+ -static -I../../../../include/nolibc -include ../../../../include/nolibc/nolibc.h \
-I../../../../../usr/include \
-std=gnu99 -I../.. -g \
- -ffreestanding -Wall $^ -o $@ -lgcc
+ -ffreestanding $^ -o $@ -lgcc
$(OUTPUT)/gcs-stress-thread: gcs-stress-thread.S
$(CC) -nostdlib $^ -o $@
diff --git a/tools/testing/selftests/arm64/gcs/basic-gcs.c b/tools/testing/selftests/arm64/gcs/basic-gcs.c
index 54f9c888249d..250977abc398 100644
--- a/tools/testing/selftests/arm64/gcs/basic-gcs.c
+++ b/tools/testing/selftests/arm64/gcs/basic-gcs.c
@@ -10,6 +10,7 @@
#include <sys/mman.h>
#include <asm/mman.h>
+#include <asm/hwcap.h>
#include <linux/sched.h>
#include "kselftest.h"
@@ -386,14 +387,13 @@ int main(void)
ksft_print_header();
- /*
- * We don't have getauxval() with nolibc so treat a failure to
- * read GCS state as a lack of support and skip.
- */
+ if (!(getauxval(AT_HWCAP) & HWCAP_GCS))
+ ksft_exit_skip("SKIP GCS not supported\n");
+
ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS,
&gcs_mode, 0, 0, 0);
if (ret != 0)
- ksft_exit_skip("Failed to read GCS state: %d\n", ret);
+ ksft_exit_fail_msg("Failed to read GCS state: %d\n", ret);
if (!(gcs_mode & PR_SHADOW_STACK_ENABLE)) {
gcs_mode = PR_SHADOW_STACK_ENABLE;
@@ -410,7 +410,7 @@ int main(void)
}
/* One last test: disable GCS, we can do this one time */
- my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, 0, 0, 0, 0);
+ ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, 0, 0, 0, 0);
if (ret != 0)
ksft_print_msg("Failed to disable GCS: %d\n", ret);
diff --git a/tools/testing/selftests/arm64/gcs/gcs-locking.c b/tools/testing/selftests/arm64/gcs/gcs-locking.c
index 989f75a491b7..1e6abb136ffd 100644
--- a/tools/testing/selftests/arm64/gcs/gcs-locking.c
+++ b/tools/testing/selftests/arm64/gcs/gcs-locking.c
@@ -165,7 +165,6 @@ TEST_F(valid_modes, lock_enable_disable_others)
ASSERT_EQ(ret, 0);
ASSERT_EQ(mode, PR_SHADOW_STACK_ALL_MODES);
-
ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS,
variant->mode);
ASSERT_EQ(ret, 0);
diff --git a/tools/testing/selftests/arm64/gcs/gcs-stress.c b/tools/testing/selftests/arm64/gcs/gcs-stress.c
index bbc7f4950c13..cf316d78ea97 100644
--- a/tools/testing/selftests/arm64/gcs/gcs-stress.c
+++ b/tools/testing/selftests/arm64/gcs/gcs-stress.c
@@ -433,7 +433,7 @@ int main(int argc, char **argv)
evs = calloc(tests, sizeof(*evs));
if (!evs)
- ksft_exit_fail_msg("Failed to allocated %d epoll events\n",
+ ksft_exit_fail_msg("Failed to allocate %d epoll events\n",
tests);
for (i = 0; i < gcs_threads; i++)
diff --git a/tools/testing/selftests/arm64/pauth/exec_target.c b/tools/testing/selftests/arm64/pauth/exec_target.c
index 4435600ca400..e597861b26d6 100644
--- a/tools/testing/selftests/arm64/pauth/exec_target.c
+++ b/tools/testing/selftests/arm64/pauth/exec_target.c
@@ -13,7 +13,12 @@ int main(void)
unsigned long hwcaps;
size_t val;
- fread(&val, sizeof(size_t), 1, stdin);
+ size_t size = fread(&val, sizeof(size_t), 1, stdin);
+
+ if (size != 1) {
+ fprintf(stderr, "Could not read input from stdin\n");
+ return EXIT_FAILURE;
+ }
/* don't try to execute illegal (unimplemented) instructions) caller
* should have checked this and keep worker simple
diff --git a/tools/testing/selftests/bpf/prog_tests/free_timer.c b/tools/testing/selftests/bpf/prog_tests/free_timer.c
index b7b77a6b2979..0de8facca4c5 100644
--- a/tools/testing/selftests/bpf/prog_tests/free_timer.c
+++ b/tools/testing/selftests/bpf/prog_tests/free_timer.c
@@ -124,6 +124,10 @@ void test_free_timer(void)
int err;
skel = free_timer__open_and_load();
+ if (!skel && errno == EOPNOTSUPP) {
+ test__skip();
+ return;
+ }
if (!ASSERT_OK_PTR(skel, "open_load"))
return;
diff --git a/tools/testing/selftests/bpf/prog_tests/timer.c b/tools/testing/selftests/bpf/prog_tests/timer.c
index d66687f1ee6a..56f660ca567b 100644
--- a/tools/testing/selftests/bpf/prog_tests/timer.c
+++ b/tools/testing/selftests/bpf/prog_tests/timer.c
@@ -86,6 +86,10 @@ void serial_test_timer(void)
int err;
timer_skel = timer__open_and_load();
+ if (!timer_skel && errno == EOPNOTSUPP) {
+ test__skip();
+ return;
+ }
if (!ASSERT_OK_PTR(timer_skel, "timer_skel_load"))
return;
diff --git a/tools/testing/selftests/bpf/prog_tests/timer_crash.c b/tools/testing/selftests/bpf/prog_tests/timer_crash.c
index f74b82305da8..b841597c8a3a 100644
--- a/tools/testing/selftests/bpf/prog_tests/timer_crash.c
+++ b/tools/testing/selftests/bpf/prog_tests/timer_crash.c
@@ -12,6 +12,10 @@ static void test_timer_crash_mode(int mode)
struct timer_crash *skel;
skel = timer_crash__open_and_load();
+ if (!skel && errno == EOPNOTSUPP) {
+ test__skip();
+ return;
+ }
if (!ASSERT_OK_PTR(skel, "timer_crash__open_and_load"))
return;
skel->bss->pid = getpid();
diff --git a/tools/testing/selftests/bpf/prog_tests/timer_lockup.c b/tools/testing/selftests/bpf/prog_tests/timer_lockup.c
index 1a2f99596916..eb303fa1e09a 100644
--- a/tools/testing/selftests/bpf/prog_tests/timer_lockup.c
+++ b/tools/testing/selftests/bpf/prog_tests/timer_lockup.c
@@ -59,6 +59,10 @@ void test_timer_lockup(void)
}
skel = timer_lockup__open_and_load();
+ if (!skel && errno == EOPNOTSUPP) {
+ test__skip();
+ return;
+ }
if (!ASSERT_OK_PTR(skel, "timer_lockup__open_and_load"))
return;
diff --git a/tools/testing/selftests/bpf/prog_tests/timer_mim.c b/tools/testing/selftests/bpf/prog_tests/timer_mim.c
index 9ff7843909e7..c930c7d7105b 100644
--- a/tools/testing/selftests/bpf/prog_tests/timer_mim.c
+++ b/tools/testing/selftests/bpf/prog_tests/timer_mim.c
@@ -65,6 +65,10 @@ void serial_test_timer_mim(void)
goto cleanup;
timer_skel = timer_mim__open_and_load();
+ if (!timer_skel && errno == EOPNOTSUPP) {
+ test__skip();
+ return;
+ }
if (!ASSERT_OK_PTR(timer_skel, "timer_skel_load"))
goto cleanup;
diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
index b17dc39a23db..6d75ede16e7c 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
@@ -8,22 +8,31 @@
#include <asm/ptrace.h>
#include <linux/compiler.h>
#include <linux/stringify.h>
+#include <linux/kernel.h>
#include <sys/wait.h>
#include <sys/syscall.h>
#include <sys/prctl.h>
#include <asm/prctl.h>
#include "uprobe_syscall.skel.h"
#include "uprobe_syscall_executed.skel.h"
+#include "bpf/libbpf_internal.h"
-__naked unsigned long uretprobe_regs_trigger(void)
+#define USDT_NOP .byte 0x0f, 0x1f, 0x44, 0x00, 0x00
+#include "usdt.h"
+
+#pragma GCC diagnostic ignored "-Wattributes"
+
+__attribute__((aligned(16)))
+__nocf_check __weak __naked unsigned long uprobe_regs_trigger(void)
{
asm volatile (
+ ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00\n" /* nop5 */
"movq $0xdeadbeef, %rax\n"
"ret\n"
);
}
-__naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after)
+__naked void uprobe_regs(struct pt_regs *before, struct pt_regs *after)
{
asm volatile (
"movq %r15, 0(%rdi)\n"
@@ -44,15 +53,17 @@ __naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after)
"movq $0, 120(%rdi)\n" /* orig_rax */
"movq $0, 128(%rdi)\n" /* rip */
"movq $0, 136(%rdi)\n" /* cs */
+ "pushq %rax\n"
"pushf\n"
"pop %rax\n"
"movq %rax, 144(%rdi)\n" /* eflags */
+ "pop %rax\n"
"movq %rsp, 152(%rdi)\n" /* rsp */
"movq $0, 160(%rdi)\n" /* ss */
/* save 2nd argument */
"pushq %rsi\n"
- "call uretprobe_regs_trigger\n"
+ "call uprobe_regs_trigger\n"
/* save return value and load 2nd argument pointer to rax */
"pushq %rax\n"
@@ -92,25 +103,37 @@ __naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after)
);
}
-static void test_uretprobe_regs_equal(void)
+static void test_uprobe_regs_equal(bool retprobe)
{
+ LIBBPF_OPTS(bpf_uprobe_opts, opts,
+ .retprobe = retprobe,
+ );
struct uprobe_syscall *skel = NULL;
struct pt_regs before = {}, after = {};
unsigned long *pb = (unsigned long *) &before;
unsigned long *pa = (unsigned long *) &after;
unsigned long *pp;
+ unsigned long offset;
unsigned int i, cnt;
- int err;
+
+ offset = get_uprobe_offset(&uprobe_regs_trigger);
+ if (!ASSERT_GE(offset, 0, "get_uprobe_offset"))
+ return;
skel = uprobe_syscall__open_and_load();
if (!ASSERT_OK_PTR(skel, "uprobe_syscall__open_and_load"))
goto cleanup;
- err = uprobe_syscall__attach(skel);
- if (!ASSERT_OK(err, "uprobe_syscall__attach"))
+ skel->links.probe = bpf_program__attach_uprobe_opts(skel->progs.probe,
+ 0, "/proc/self/exe", offset, &opts);
+ if (!ASSERT_OK_PTR(skel->links.probe, "bpf_program__attach_uprobe_opts"))
goto cleanup;
- uretprobe_regs(&before, &after);
+ /* make sure uprobe gets optimized */
+ if (!retprobe)
+ uprobe_regs_trigger();
+
+ uprobe_regs(&before, &after);
pp = (unsigned long *) &skel->bss->regs;
cnt = sizeof(before)/sizeof(*pb);
@@ -119,7 +142,7 @@ static void test_uretprobe_regs_equal(void)
unsigned int offset = i * sizeof(unsigned long);
/*
- * Check register before and after uretprobe_regs_trigger call
+ * Check register before and after uprobe_regs_trigger call
* that triggers the uretprobe.
*/
switch (offset) {
@@ -133,7 +156,7 @@ static void test_uretprobe_regs_equal(void)
/*
* Check register seen from bpf program and register after
- * uretprobe_regs_trigger call
+ * uprobe_regs_trigger call (with rax exception, check below).
*/
switch (offset) {
/*
@@ -146,6 +169,15 @@ static void test_uretprobe_regs_equal(void)
case offsetof(struct pt_regs, rsp):
case offsetof(struct pt_regs, ss):
break;
+ /*
+	 * The uprobe does not see the return value in rax; it needs to see
+	 * the original (before) rax value.
+ */
+ case offsetof(struct pt_regs, rax):
+ if (!retprobe) {
+ ASSERT_EQ(pp[i], pb[i], "uprobe rax prog-before value check");
+ break;
+ }
default:
if (!ASSERT_EQ(pp[i], pa[i], "register prog-after value check"))
fprintf(stdout, "failed register offset %u\n", offset);
@@ -175,7 +207,7 @@ static int write_bpf_testmod_uprobe(unsigned long offset)
return ret != n ? (int) ret : 0;
}
-static void test_uretprobe_regs_change(void)
+static void test_regs_change(void)
{
struct pt_regs before = {}, after = {};
unsigned long *pb = (unsigned long *) &before;
@@ -183,13 +215,16 @@ static void test_uretprobe_regs_change(void)
unsigned long cnt = sizeof(before)/sizeof(*pb);
unsigned int i, err, offset;
- offset = get_uprobe_offset(uretprobe_regs_trigger);
+ offset = get_uprobe_offset(uprobe_regs_trigger);
err = write_bpf_testmod_uprobe(offset);
if (!ASSERT_OK(err, "register_uprobe"))
return;
- uretprobe_regs(&before, &after);
+ /* make sure uprobe gets optimized */
+ uprobe_regs_trigger();
+
+ uprobe_regs(&before, &after);
err = write_bpf_testmod_uprobe(0);
if (!ASSERT_OK(err, "unregister_uprobe"))
@@ -252,6 +287,7 @@ static void test_uretprobe_syscall_call(void)
);
struct uprobe_syscall_executed *skel;
int pid, status, err, go[2], c = 0;
+ struct bpf_link *link;
if (!ASSERT_OK(pipe(go), "pipe"))
return;
@@ -277,11 +313,14 @@ static void test_uretprobe_syscall_call(void)
_exit(0);
}
- skel->links.test = bpf_program__attach_uprobe_multi(skel->progs.test, pid,
- "/proc/self/exe",
- "uretprobe_syscall_call", &opts);
- if (!ASSERT_OK_PTR(skel->links.test, "bpf_program__attach_uprobe_multi"))
+ skel->bss->pid = pid;
+
+ link = bpf_program__attach_uprobe_multi(skel->progs.test_uretprobe_multi,
+ pid, "/proc/self/exe",
+ "uretprobe_syscall_call", &opts);
+ if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi"))
goto cleanup;
+ skel->links.test_uretprobe_multi = link;
/* kick the child */
write(go[1], &c, 1);
@@ -301,6 +340,256 @@ cleanup:
close(go[0]);
}
+#define TRAMP "[uprobes-trampoline]"
+
+__attribute__((aligned(16)))
+__nocf_check __weak __naked void uprobe_test(void)
+{
+ asm volatile (" \n"
+ ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00 \n"
+ "ret \n"
+ );
+}
+
+__attribute__((aligned(16)))
+__nocf_check __weak void usdt_test(void)
+{
+ USDT(optimized_uprobe, usdt);
+}
+
+static int find_uprobes_trampoline(void *tramp_addr)
+{
+ void *start, *end;
+ char line[128];
+ int ret = -1;
+ FILE *maps;
+
+ maps = fopen("/proc/self/maps", "r");
+ if (!maps) {
+ fprintf(stderr, "cannot open maps\n");
+ return -1;
+ }
+
+ while (fgets(line, sizeof(line), maps)) {
+ int m = -1;
+
+ /* We care only about private r-x mappings. */
+ if (sscanf(line, "%p-%p r-xp %*x %*x:%*x %*u %n", &start, &end, &m) != 2)
+ continue;
+ if (m < 0)
+ continue;
+ if (!strncmp(&line[m], TRAMP, sizeof(TRAMP)-1) && (start == tramp_addr)) {
+ ret = 0;
+ break;
+ }
+ }
+
+ fclose(maps);
+ return ret;
+}
+
+static unsigned char nop5[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 };
+
+static void *find_nop5(void *fn)
+{
+ int i;
+
+ for (i = 0; i < 10; i++) {
+ if (!memcmp(nop5, fn + i, 5))
+ return fn + i;
+ }
+ return NULL;
+}
+
+typedef void (__attribute__((nocf_check)) *trigger_t)(void);
+
+static void *check_attach(struct uprobe_syscall_executed *skel, trigger_t trigger,
+ void *addr, int executed)
+{
+ struct __arch_relative_insn {
+ __u8 op;
+ __s32 raddr;
+ } __packed *call;
+ void *tramp = NULL;
+
+	/* Uprobe gets optimized after the first hit, so trigger it twice. */
+ trigger();
+ trigger();
+
+ /* Make sure bpf program got executed.. */
+ ASSERT_EQ(skel->bss->executed, executed, "executed");
+
+ /* .. and check the trampoline is as expected. */
+ call = (struct __arch_relative_insn *) addr;
+ tramp = (void *) (call + 1) + call->raddr;
+ ASSERT_EQ(call->op, 0xe8, "call");
+ ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline");
+
+ return tramp;
+}
+
+static void check_detach(void *addr, void *tramp)
+{
+ /* [uprobes_trampoline] stays after detach */
+ ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline");
+ ASSERT_OK(memcmp(addr, nop5, 5), "nop5");
+}
+
+static void check(struct uprobe_syscall_executed *skel, struct bpf_link *link,
+ trigger_t trigger, void *addr, int executed)
+{
+ void *tramp;
+
+ tramp = check_attach(skel, trigger, addr, executed);
+ bpf_link__destroy(link);
+ check_detach(addr, tramp);
+}
+
+static void test_uprobe_legacy(void)
+{
+ struct uprobe_syscall_executed *skel = NULL;
+ LIBBPF_OPTS(bpf_uprobe_opts, opts,
+ .retprobe = true,
+ );
+ struct bpf_link *link;
+ unsigned long offset;
+
+ offset = get_uprobe_offset(&uprobe_test);
+ if (!ASSERT_GE(offset, 0, "get_uprobe_offset"))
+ goto cleanup;
+
+ /* uprobe */
+ skel = uprobe_syscall_executed__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load"))
+ return;
+
+ skel->bss->pid = getpid();
+
+ link = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe,
+ 0, "/proc/self/exe", offset, NULL);
+ if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts"))
+ goto cleanup;
+
+ check(skel, link, uprobe_test, uprobe_test, 2);
+
+ /* uretprobe */
+ skel->bss->executed = 0;
+
+ link = bpf_program__attach_uprobe_opts(skel->progs.test_uretprobe,
+ 0, "/proc/self/exe", offset, &opts);
+ if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts"))
+ goto cleanup;
+
+ check(skel, link, uprobe_test, uprobe_test, 2);
+
+cleanup:
+ uprobe_syscall_executed__destroy(skel);
+}
+
+static void test_uprobe_multi(void)
+{
+ struct uprobe_syscall_executed *skel = NULL;
+ LIBBPF_OPTS(bpf_uprobe_multi_opts, opts);
+ struct bpf_link *link;
+ unsigned long offset;
+
+ offset = get_uprobe_offset(&uprobe_test);
+ if (!ASSERT_GE(offset, 0, "get_uprobe_offset"))
+ goto cleanup;
+
+ opts.offsets = &offset;
+ opts.cnt = 1;
+
+ skel = uprobe_syscall_executed__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load"))
+ return;
+
+ skel->bss->pid = getpid();
+
+ /* uprobe.multi */
+ link = bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_multi,
+ 0, "/proc/self/exe", NULL, &opts);
+ if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi"))
+ goto cleanup;
+
+ check(skel, link, uprobe_test, uprobe_test, 2);
+
+ /* uretprobe.multi */
+ skel->bss->executed = 0;
+ opts.retprobe = true;
+ link = bpf_program__attach_uprobe_multi(skel->progs.test_uretprobe_multi,
+ 0, "/proc/self/exe", NULL, &opts);
+ if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi"))
+ goto cleanup;
+
+ check(skel, link, uprobe_test, uprobe_test, 2);
+
+cleanup:
+ uprobe_syscall_executed__destroy(skel);
+}
+
+static void test_uprobe_session(void)
+{
+ struct uprobe_syscall_executed *skel = NULL;
+ LIBBPF_OPTS(bpf_uprobe_multi_opts, opts,
+ .session = true,
+ );
+ struct bpf_link *link;
+ unsigned long offset;
+
+ offset = get_uprobe_offset(&uprobe_test);
+ if (!ASSERT_GE(offset, 0, "get_uprobe_offset"))
+ goto cleanup;
+
+ opts.offsets = &offset;
+ opts.cnt = 1;
+
+ skel = uprobe_syscall_executed__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load"))
+ return;
+
+ skel->bss->pid = getpid();
+
+ link = bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_session,
+ 0, "/proc/self/exe", NULL, &opts);
+ if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi"))
+ goto cleanup;
+
+ check(skel, link, uprobe_test, uprobe_test, 4);
+
+cleanup:
+ uprobe_syscall_executed__destroy(skel);
+}
+
+static void test_uprobe_usdt(void)
+{
+ struct uprobe_syscall_executed *skel;
+ struct bpf_link *link;
+ void *addr;
+
+ errno = 0;
+ addr = find_nop5(usdt_test);
+ if (!ASSERT_OK_PTR(addr, "find_nop5"))
+ return;
+
+ skel = uprobe_syscall_executed__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load"))
+ return;
+
+ skel->bss->pid = getpid();
+
+ link = bpf_program__attach_usdt(skel->progs.test_usdt,
+ -1 /* all PIDs */, "/proc/self/exe",
+ "optimized_uprobe", "usdt", NULL);
+ if (!ASSERT_OK_PTR(link, "bpf_program__attach_usdt"))
+ goto cleanup;
+
+ check(skel, link, usdt_test, addr, 2);
+
+cleanup:
+ uprobe_syscall_executed__destroy(skel);
+}
+
/*
* Borrowed from tools/testing/selftests/x86/test_shadow_stack.c.
*
@@ -343,43 +632,172 @@ static void test_uretprobe_shadow_stack(void)
return;
}
- /* Run all of the uretprobe tests. */
- test_uretprobe_regs_equal();
- test_uretprobe_regs_change();
+ /* Run all the tests with shadow stack in place. */
+
+ test_uprobe_regs_equal(false);
+ test_uprobe_regs_equal(true);
test_uretprobe_syscall_call();
+ test_uprobe_legacy();
+ test_uprobe_multi();
+ test_uprobe_session();
+ test_uprobe_usdt();
+
+ test_regs_change();
+
ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK);
}
-#else
-static void test_uretprobe_regs_equal(void)
+
+static volatile bool race_stop;
+
+static USDT_DEFINE_SEMA(race);
+
+static void *worker_trigger(void *arg)
{
- test__skip();
+ unsigned long rounds = 0;
+
+ while (!race_stop) {
+ uprobe_test();
+ rounds++;
+ }
+
+ printf("tid %d trigger rounds: %lu\n", gettid(), rounds);
+ return NULL;
}
-static void test_uretprobe_regs_change(void)
+static void *worker_attach(void *arg)
{
- test__skip();
+ LIBBPF_OPTS(bpf_uprobe_opts, opts);
+ struct uprobe_syscall_executed *skel;
+ unsigned long rounds = 0, offset;
+ const char *sema[2] = {
+ __stringify(USDT_SEMA(race)),
+ NULL,
+ };
+ unsigned long *ref;
+ int err;
+
+ offset = get_uprobe_offset(&uprobe_test);
+ if (!ASSERT_GE(offset, 0, "get_uprobe_offset"))
+ return NULL;
+
+ err = elf_resolve_syms_offsets("/proc/self/exe", 1, (const char **) &sema, &ref, STT_OBJECT);
+ if (!ASSERT_OK(err, "elf_resolve_syms_offsets_sema"))
+ return NULL;
+
+ opts.ref_ctr_offset = *ref;
+
+ skel = uprobe_syscall_executed__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load"))
+ return NULL;
+
+ skel->bss->pid = getpid();
+
+ while (!race_stop) {
+ skel->links.test_uprobe = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe,
+ 0, "/proc/self/exe", offset, &opts);
+ if (!ASSERT_OK_PTR(skel->links.test_uprobe, "bpf_program__attach_uprobe_opts"))
+ break;
+
+ bpf_link__destroy(skel->links.test_uprobe);
+ skel->links.test_uprobe = NULL;
+ rounds++;
+ }
+
+ printf("tid %d attach rounds: %lu hits: %d\n", gettid(), rounds, skel->bss->executed);
+ uprobe_syscall_executed__destroy(skel);
+ free(ref);
+ return NULL;
}
-static void test_uretprobe_syscall_call(void)
+static useconds_t race_msec(void)
{
- test__skip();
+ char *env;
+
+ env = getenv("BPF_SELFTESTS_UPROBE_SYSCALL_RACE_MSEC");
+ if (env)
+ return atoi(env);
+
+ /* default duration is 500ms */
+ return 500;
}
-static void test_uretprobe_shadow_stack(void)
+static void test_uprobe_race(void)
{
- test__skip();
+ int err, i, nr_threads;
+ pthread_t *threads;
+
+ nr_threads = libbpf_num_possible_cpus();
+ if (!ASSERT_GT(nr_threads, 0, "libbpf_num_possible_cpus"))
+ return;
+ nr_threads = max(2, nr_threads);
+
+ threads = alloca(sizeof(*threads) * nr_threads);
+	if (!ASSERT_OK_PTR(threads, "alloca"))
+ return;
+
+ for (i = 0; i < nr_threads; i++) {
+ err = pthread_create(&threads[i], NULL, i % 2 ? worker_trigger : worker_attach,
+ NULL);
+ if (!ASSERT_OK(err, "pthread_create"))
+ goto cleanup;
+ }
+
+ usleep(race_msec() * 1000);
+
+cleanup:
+ race_stop = true;
+ for (nr_threads = i, i = 0; i < nr_threads; i++)
+ pthread_join(threads[i], NULL);
+
+ ASSERT_FALSE(USDT_SEMA_IS_ACTIVE(race), "race_semaphore");
}
+
+#ifndef __NR_uprobe
+#define __NR_uprobe 336
#endif
-void test_uprobe_syscall(void)
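+/*
+ * Calling the uprobe syscall directly, outside of an uprobe trampoline,
+ * is expected to fail with ENXIO.
+ */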
+static void test_uprobe_error(void)
+{
+ long err = syscall(__NR_uprobe);
+
+ ASSERT_EQ(err, -1, "error");
+ ASSERT_EQ(errno, ENXIO, "errno");
+}
+
+static void __test_uprobe_syscall(void)
{
if (test__start_subtest("uretprobe_regs_equal"))
- test_uretprobe_regs_equal();
- if (test__start_subtest("uretprobe_regs_change"))
- test_uretprobe_regs_change();
+ test_uprobe_regs_equal(true);
if (test__start_subtest("uretprobe_syscall_call"))
test_uretprobe_syscall_call();
if (test__start_subtest("uretprobe_shadow_stack"))
test_uretprobe_shadow_stack();
+ if (test__start_subtest("uprobe_legacy"))
+ test_uprobe_legacy();
+ if (test__start_subtest("uprobe_multi"))
+ test_uprobe_multi();
+ if (test__start_subtest("uprobe_session"))
+ test_uprobe_session();
+ if (test__start_subtest("uprobe_usdt"))
+ test_uprobe_usdt();
+ if (test__start_subtest("uprobe_race"))
+ test_uprobe_race();
+ if (test__start_subtest("uprobe_error"))
+ test_uprobe_error();
+ if (test__start_subtest("uprobe_regs_equal"))
+ test_uprobe_regs_equal(false);
+ if (test__start_subtest("regs_change"))
+ test_regs_change();
+}
+#else
+static void __test_uprobe_syscall(void)
+{
+ test__skip();
+}
+#endif
+
+void test_uprobe_syscall(void)
+{
+ __test_uprobe_syscall();
}
diff --git a/tools/testing/selftests/bpf/prog_tests/usdt.c b/tools/testing/selftests/bpf/prog_tests/usdt.c
index 9057e983cc54..833eb87483a1 100644
--- a/tools/testing/selftests/bpf/prog_tests/usdt.c
+++ b/tools/testing/selftests/bpf/prog_tests/usdt.c
@@ -40,12 +40,19 @@ static void __always_inline trigger_func(int x) {
}
}
-static void subtest_basic_usdt(void)
+static void subtest_basic_usdt(bool optimized)
{
LIBBPF_OPTS(bpf_usdt_opts, opts);
struct test_usdt *skel;
struct test_usdt__bss *bss;
- int err, i;
+ int err, i, called;
+
+#define TRIGGER(x) ({ \
+ trigger_func(x); \
+ if (optimized) \
+ trigger_func(x); \
+ optimized ? 2 : 1; \
+ })
skel = test_usdt__open_and_load();
if (!ASSERT_OK_PTR(skel, "skel_open"))
@@ -66,11 +73,11 @@ static void subtest_basic_usdt(void)
if (!ASSERT_OK_PTR(skel->links.usdt0, "usdt0_link"))
goto cleanup;
- trigger_func(1);
+ called = TRIGGER(1);
- ASSERT_EQ(bss->usdt0_called, 1, "usdt0_called");
- ASSERT_EQ(bss->usdt3_called, 1, "usdt3_called");
- ASSERT_EQ(bss->usdt12_called, 1, "usdt12_called");
+ ASSERT_EQ(bss->usdt0_called, called, "usdt0_called");
+ ASSERT_EQ(bss->usdt3_called, called, "usdt3_called");
+ ASSERT_EQ(bss->usdt12_called, called, "usdt12_called");
ASSERT_EQ(bss->usdt0_cookie, 0xcafedeadbeeffeed, "usdt0_cookie");
ASSERT_EQ(bss->usdt0_arg_cnt, 0, "usdt0_arg_cnt");
@@ -119,11 +126,11 @@ static void subtest_basic_usdt(void)
* bpf_program__attach_usdt() handles this properly and attaches to
* all possible places of USDT invocation.
*/
- trigger_func(2);
+ called += TRIGGER(2);
- ASSERT_EQ(bss->usdt0_called, 2, "usdt0_called");
- ASSERT_EQ(bss->usdt3_called, 2, "usdt3_called");
- ASSERT_EQ(bss->usdt12_called, 2, "usdt12_called");
+ ASSERT_EQ(bss->usdt0_called, called, "usdt0_called");
+ ASSERT_EQ(bss->usdt3_called, called, "usdt3_called");
+ ASSERT_EQ(bss->usdt12_called, called, "usdt12_called");
/* only check values that depend on trigger_func()'s input value */
ASSERT_EQ(bss->usdt3_args[0], 2, "usdt3_arg1");
@@ -142,9 +149,9 @@ static void subtest_basic_usdt(void)
if (!ASSERT_OK_PTR(skel->links.usdt3, "usdt3_reattach"))
goto cleanup;
- trigger_func(3);
+ called += TRIGGER(3);
- ASSERT_EQ(bss->usdt3_called, 3, "usdt3_called");
+ ASSERT_EQ(bss->usdt3_called, called, "usdt3_called");
/* this time usdt3 has custom cookie */
ASSERT_EQ(bss->usdt3_cookie, 0xBADC00C51E, "usdt3_cookie");
ASSERT_EQ(bss->usdt3_arg_cnt, 3, "usdt3_arg_cnt");
@@ -158,6 +165,7 @@ static void subtest_basic_usdt(void)
cleanup:
test_usdt__destroy(skel);
+#undef TRIGGER
}
unsigned short test_usdt_100_semaphore SEC(".probes");
@@ -425,7 +433,11 @@ cleanup:
void test_usdt(void)
{
if (test__start_subtest("basic"))
- subtest_basic_usdt();
+ subtest_basic_usdt(false);
+#ifdef __x86_64__
+ if (test__start_subtest("basic_optimized"))
+ subtest_basic_usdt(true);
+#endif
if (test__start_subtest("multispec"))
subtest_multispec_usdt();
if (test__start_subtest("urand_auto_attach"))
diff --git a/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h b/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h
index d67466c1ff77..f90531cf3ee5 100644
--- a/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h
+++ b/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h
@@ -302,7 +302,7 @@ int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val
* barriers.
*/
if (val & _Q_LOCKED_MASK)
- smp_cond_load_acquire_label(&lock->locked, !VAL, release_err);
+ (void)smp_cond_load_acquire_label(&lock->locked, !VAL, release_err);
/*
* take ownership and clear the pending bit.
@@ -380,7 +380,7 @@ queue:
/* Link @node into the waitqueue. */
WRITE_ONCE(prev->next, node);
- arch_mcs_spin_lock_contended_label(&node->locked, release_node_err);
+ (void)arch_mcs_spin_lock_contended_label(&node->locked, release_node_err);
/*
* While waiting for the MCS lock, the next pointer may have
diff --git a/tools/testing/selftests/bpf/progs/crypto_sanity.c b/tools/testing/selftests/bpf/progs/crypto_sanity.c
index 645be6cddf36..dfd8a258f14a 100644
--- a/tools/testing/selftests/bpf/progs/crypto_sanity.c
+++ b/tools/testing/selftests/bpf/progs/crypto_sanity.c
@@ -14,7 +14,7 @@ unsigned char key[256] = {};
u16 udp_test_port = 7777;
u32 authsize, key_len;
char algo[128] = {};
-char dst[16] = {};
+char dst[16] = {}, dst_bad[8] = {};
int status;
static int skb_dynptr_validate(struct __sk_buff *skb, struct bpf_dynptr *psrc)
@@ -59,10 +59,9 @@ int skb_crypto_setup(void *ctx)
.authsize = authsize,
};
struct bpf_crypto_ctx *cctx;
- int err = 0;
+ int err;
status = 0;
-
if (key_len > 256) {
status = -EINVAL;
return 0;
@@ -70,8 +69,8 @@ int skb_crypto_setup(void *ctx)
__builtin_memcpy(&params.algo, algo, sizeof(algo));
__builtin_memcpy(&params.key, key, sizeof(key));
- cctx = bpf_crypto_ctx_create(&params, sizeof(params), &err);
+ cctx = bpf_crypto_ctx_create(&params, sizeof(params), &err);
if (!cctx) {
status = err;
return 0;
@@ -80,7 +79,6 @@ int skb_crypto_setup(void *ctx)
err = crypto_ctx_insert(cctx);
if (err && err != -EEXIST)
status = err;
-
return 0;
}
@@ -92,6 +90,7 @@ int decrypt_sanity(struct __sk_buff *skb)
struct bpf_dynptr psrc, pdst;
int err;
+ status = 0;
err = skb_dynptr_validate(skb, &psrc);
if (err < 0) {
status = err;
@@ -110,13 +109,23 @@ int decrypt_sanity(struct __sk_buff *skb)
return TC_ACT_SHOT;
}
- /* dst is a global variable to make testing part easier to check. In real
- * production code, a percpu map should be used to store the result.
+ /* Check also bad case where the dst buffer is smaller than the
+ * skb's linear section.
+ */
+ bpf_dynptr_from_mem(dst_bad, sizeof(dst_bad), 0, &pdst);
+ status = bpf_crypto_decrypt(ctx, &psrc, &pdst, NULL);
+ if (!status)
+ status = -EIO;
+ if (status != -EINVAL)
+ goto err;
+
+ /* dst is a global variable to make testing part easier to check.
+ * In real production code, a percpu map should be used to store
+ * the result.
*/
bpf_dynptr_from_mem(dst, sizeof(dst), 0, &pdst);
-
status = bpf_crypto_decrypt(ctx, &psrc, &pdst, NULL);
-
+err:
return TC_ACT_SHOT;
}
@@ -129,7 +138,6 @@ int encrypt_sanity(struct __sk_buff *skb)
int err;
status = 0;
-
err = skb_dynptr_validate(skb, &psrc);
if (err < 0) {
status = err;
@@ -148,13 +156,23 @@ int encrypt_sanity(struct __sk_buff *skb)
return TC_ACT_SHOT;
}
- /* dst is a global variable to make testing part easier to check. In real
- * production code, a percpu map should be used to store the result.
+ /* Check also bad case where the dst buffer is smaller than the
+ * skb's linear section.
+ */
+ bpf_dynptr_from_mem(dst_bad, sizeof(dst_bad), 0, &pdst);
+ status = bpf_crypto_encrypt(ctx, &psrc, &pdst, NULL);
+ if (!status)
+ status = -EIO;
+ if (status != -EINVAL)
+ goto err;
+
+ /* dst is a global variable to make testing part easier to check.
+ * In real production code, a percpu map should be used to store
+ * the result.
*/
bpf_dynptr_from_mem(dst, sizeof(dst), 0, &pdst);
-
status = bpf_crypto_encrypt(ctx, &psrc, &pdst, NULL);
-
+err:
return TC_ACT_SHOT;
}
diff --git a/tools/testing/selftests/bpf/progs/linked_list_fail.c b/tools/testing/selftests/bpf/progs/linked_list_fail.c
index 6438982b928b..ddd26d1a083f 100644
--- a/tools/testing/selftests/bpf/progs/linked_list_fail.c
+++ b/tools/testing/selftests/bpf/progs/linked_list_fail.c
@@ -226,8 +226,7 @@ int obj_new_no_composite(void *ctx)
SEC("?tc")
int obj_new_no_struct(void *ctx)
{
-
- bpf_obj_new(union { int data; unsigned udata; });
+ (void)bpf_obj_new(union { int data; unsigned udata; });
return 0;
}
@@ -252,7 +251,7 @@ int new_null_ret(void *ctx)
SEC("?tc")
int obj_new_acq(void *ctx)
{
- bpf_obj_new(struct foo);
+ (void)bpf_obj_new(struct foo);
return 0;
}
diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c
index 46697f381878..a47690174e0e 100644
--- a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c
+++ b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c
@@ -30,8 +30,12 @@ __test(2) int test_strcspn(void *ctx) { return bpf_strcspn(str, "lo"); }
__test(6) int test_strstr_found(void *ctx) { return bpf_strstr(str, "world"); }
__test(-ENOENT) int test_strstr_notfound(void *ctx) { return bpf_strstr(str, "hi"); }
__test(0) int test_strstr_empty(void *ctx) { return bpf_strstr(str, ""); }
-__test(0) int test_strnstr_found(void *ctx) { return bpf_strnstr(str, "hello", 6); }
-__test(-ENOENT) int test_strnstr_notfound(void *ctx) { return bpf_strnstr(str, "hi", 10); }
+__test(0) int test_strnstr_found1(void *ctx) { return bpf_strnstr("", "", 0); }
+__test(0) int test_strnstr_found2(void *ctx) { return bpf_strnstr(str, "hello", 5); }
+__test(0) int test_strnstr_found3(void *ctx) { return bpf_strnstr(str, "hello", 6); }
+__test(-ENOENT) int test_strnstr_notfound1(void *ctx) { return bpf_strnstr(str, "hi", 10); }
+__test(-ENOENT) int test_strnstr_notfound2(void *ctx) { return bpf_strnstr(str, "hello", 4); }
+__test(-ENOENT) int test_strnstr_notfound3(void *ctx) { return bpf_strnstr("", "a", 0); }
__test(0) int test_strnstr_empty(void *ctx) { return bpf_strnstr(str, "", 1); }
char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/uprobe_syscall.c b/tools/testing/selftests/bpf/progs/uprobe_syscall.c
index 8a4fa6c7ef59..e08c31669e5a 100644
--- a/tools/testing/selftests/bpf/progs/uprobe_syscall.c
+++ b/tools/testing/selftests/bpf/progs/uprobe_syscall.c
@@ -7,8 +7,8 @@ struct pt_regs regs;
char _license[] SEC("license") = "GPL";
-SEC("uretprobe//proc/self/exe:uretprobe_regs_trigger")
-int uretprobe(struct pt_regs *ctx)
+SEC("uprobe")
+int probe(struct pt_regs *ctx)
{
__builtin_memcpy(&regs, ctx, sizeof(regs));
return 0;
diff --git a/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c b/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c
index 0d7f1a7db2e2..915d38591bf6 100644
--- a/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c
+++ b/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c
@@ -1,6 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/usdt.bpf.h>
#include <string.h>
struct pt_regs regs;
@@ -8,10 +10,64 @@ struct pt_regs regs;
char _license[] SEC("license") = "GPL";
int executed = 0;
+int pid;
+
+SEC("uprobe")
+int BPF_UPROBE(test_uprobe)
+{
+ if (bpf_get_current_pid_tgid() >> 32 != pid)
+ return 0;
+
+ executed++;
+ return 0;
+}
+
+SEC("uretprobe")
+int BPF_URETPROBE(test_uretprobe)
+{
+ if (bpf_get_current_pid_tgid() >> 32 != pid)
+ return 0;
+
+ executed++;
+ return 0;
+}
+
+SEC("uprobe.multi")
+int test_uprobe_multi(struct pt_regs *ctx)
+{
+ if (bpf_get_current_pid_tgid() >> 32 != pid)
+ return 0;
+
+ executed++;
+ return 0;
+}
SEC("uretprobe.multi")
-int test(struct pt_regs *regs)
+int test_uretprobe_multi(struct pt_regs *ctx)
+{
+ if (bpf_get_current_pid_tgid() >> 32 != pid)
+ return 0;
+
+ executed++;
+ return 0;
+}
+
+SEC("uprobe.session")
+int test_uprobe_session(struct pt_regs *ctx)
{
- executed = 1;
+ if (bpf_get_current_pid_tgid() >> 32 != pid)
+ return 0;
+
+ executed++;
+ return 0;
+}
+
+SEC("usdt")
+int test_usdt(struct pt_regs *ctx)
+{
+ if (bpf_get_current_pid_tgid() >> 32 != pid)
+ return 0;
+
+ executed++;
return 0;
}
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
index e9e918cdf31f..511911053bdc 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
@@ -501,14 +501,20 @@ static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = {
#ifdef __x86_64__
static int
+uprobe_handler(struct uprobe_consumer *self, struct pt_regs *regs, __u64 *data)
+{
+ regs->cx = 0x87654321feebdaed;
+ return 0;
+}
+
+static int
uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func,
struct pt_regs *regs, __u64 *data)
{
regs->ax = 0x12345678deadbeef;
- regs->cx = 0x87654321feebdaed;
regs->r11 = (u64) -1;
- return true;
+ return 0;
}
struct testmod_uprobe {
@@ -520,6 +526,7 @@ struct testmod_uprobe {
static DEFINE_MUTEX(testmod_uprobe_mutex);
static struct testmod_uprobe uprobe = {
+ .consumer.handler = uprobe_handler,
.consumer.ret_handler = uprobe_ret_handler,
};
diff --git a/tools/testing/selftests/bpf/usdt.h b/tools/testing/selftests/bpf/usdt.h
new file mode 100644
index 000000000000..549d1f774810
--- /dev/null
+++ b/tools/testing/selftests/bpf/usdt.h
@@ -0,0 +1,545 @@
+// SPDX-License-Identifier: BSD-2-Clause
+/*
+ * This single-header library defines a collection of variadic macros for
+ * defining and triggering USDTs (User Statically-Defined Tracepoints):
+ *
+ * - For USDTs without associated semaphore:
+ * USDT(group, name, args...)
+ *
+ * - For USDTs with implicit (transparent to the user) semaphore:
+ * USDT_WITH_SEMA(group, name, args...)
+ * USDT_IS_ACTIVE(group, name)
+ *
+ * - For USDTs with explicit (user-defined and provided) semaphore:
+ * USDT_WITH_EXPLICIT_SEMA(sema, group, name, args...)
+ * USDT_SEMA_IS_ACTIVE(sema)
+ *
+ * all of which emit a NOP instruction into the instruction stream, and so
+ * have *zero* overhead for the surrounding code. USDTs are identified by
+ * a combination of `group` and `name` identifiers, which is used by external
+ * tracing tooling (tracers) for identifying exact USDTs of interest.
+ *
+ * USDTs can have an associated (2-byte) activity counter (USDT semaphore),
+ * automatically maintained by the Linux kernel whenever any correctly written
+ * BPF-based tracer is attached to the USDT. This USDT semaphore can be used
+ * to check whether there is a need to do any extra data collection and
+ * processing for a given USDT, and otherwise avoid extra work in the common
+ * case of the USDT not being traced (not "active").
+ *
+ * See documentation for USDT_WITH_SEMA()/USDT_IS_ACTIVE() or
+ * USDT_WITH_EXPLICIT_SEMA()/USDT_SEMA_IS_ACTIVE() APIs below for details on
+ * working with USDTs with implicitly or explicitly associated
+ * USDT semaphores, respectively.
+ *
+ * There is also some additional data recorded into an auxiliary note
+ * section. The data in the note section describes the operands, in terms of
+ * size and location, used by tracing tooling to know where to find USDT
+ * arguments. Each location is encoded as an assembler operand string.
+ * Tracing tools (bpftrace and BPF-based tracers, systemtap, etc.) insert
+ * breakpoints on top of the nop, and decode the location operand-strings,
+ * like an assembler, to find the values being passed.
+ *
+ * The operand strings are selected by the compiler for each operand.
+ * They are constrained by inline-assembler constraint codes. The default is:
+ *
+ * #define USDT_ARG_CONSTRAINT nor
+ *
+ * This is a good default if the operands tend to be integral and
+ * moderate in number (fewer than the number of registers). In other
+ * cases, the compiler may report "'asm' requires impossible reload" or
+ * similar. In this case, consider simplifying the macro call (fewer
+ * and simpler operands), reducing optimization, or overriding the
+ * default constraints string via:
+ *
+ * #define USDT_ARG_CONSTRAINT g
+ * #include <usdt.h>
+ *
+ * For some historical description of USDT v3 format (the one used by this
+ * library and generally recognized and assumed by BPF-based tracing tools)
+ * see [0]. The more formal specification can be found at [1]. Additional
+ * argument constraints information can be found at [2].
+ *
+ * Original SystemTap's sys/sdt.h implementation ([3]) was used as a base for
+ * this USDT library implementation. Current implementation differs *a lot* in
+ * terms of exposed user API and general usability, which was the main goal
+ * and focus of the reimplementation work. Nevertheless, underlying recorded
+ * USDT definitions are fully binary compatible and any USDT-based tooling
+ * should work equally well with USDTs defined by either SystemTap's or this
+ * library's USDT implementation.
+ *
+ * [0] https://ecos.sourceware.org/ml/systemtap/2010-q3/msg00145.html
+ * [1] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation
+ * [2] https://gcc.gnu.org/onlinedocs/gcc/Constraints.html
+ * [3] https://sourceware.org/git/?p=systemtap.git;a=blob;f=includes/sys/sdt.h
+ */
+#ifndef __USDT_H
+#define __USDT_H
+
+/*
+ * Changelog:
+ *
+ * 0.1.0
+ * -----
+ * - Initial release
+ */
+#define USDT_MAJOR_VERSION 0
+#define USDT_MINOR_VERSION 1
+#define USDT_PATCH_VERSION 0
+
+/* C++20 and C23 added __VA_OPT__ as a standard replacement for non-standard `##__VA_ARGS__` extension */
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L)
+#define __usdt_va_opt 1
+#define __usdt_va_args(...) __VA_OPT__(,) __VA_ARGS__
+#else
+#define __usdt_va_args(...) , ##__VA_ARGS__
+#endif
+
+/*
+ * Trigger USDT with `group`:`name` identifier and pass through `args` as its
+ * arguments. Zero arguments are acceptable as well. No USDT semaphore is
+ * associated with this USDT.
+ *
+ * Such "semaphoreless" USDTs are commonly used when there is no extra data
+ * collection or processing needed to collect and prepare USDT arguments and
+ * they are just available in the surrounding code. The USDT() macro will just
+ * record their locations in CPU registers or in memory for tracing tooling to
+ * be able to access them, if necessary.
+ */
+#ifdef __usdt_va_opt
+#define USDT(group, name, ...) \
+ __usdt_probe(group, name, __usdt_sema_none, 0 __VA_OPT__(,) __VA_ARGS__)
+#else
+#define USDT(group, name, ...) \
+ __usdt_probe(group, name, __usdt_sema_none, 0, ##__VA_ARGS__)
+#endif
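+
+/*
+ * Illustrative usage sketch for USDT() (the function and probe identifiers
+ * below are made up for the example and are not provided by this header):
+ *
+ *	static void handle_request(int req_id, const char *path)
+ *	{
+ *		USDT(myapp, request__start, req_id, path);
+ *		// ... actual request handling ...
+ *	}
+ *
+ * A USDT-aware tracer (e.g. bpftrace or a libbpf-based tool) can then attach
+ * to the myapp:request__start probe in this binary and read req_id and path
+ * from the recorded argument locations.
+ */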
+
+/*
+ * Trigger USDT with `group`:`name` identifier and pass through `args` as its
+ * arguments. Zero arguments are acceptable as well. The USDT also gets an
+ * implicitly-defined associated USDT semaphore, which will be "activated" by
+ * tracing tooling and can be used to check whether the USDT is being actively
+ * observed.
+ *
+ * USDTs with semaphore are commonly used when there is a need to perform
+ * additional data collection and processing to prepare USDT arguments, which
+ * otherwise might not be necessary for the rest of the application logic. In
+ * such a case, the USDT semaphore can be used to avoid unnecessary extra work.
+ * If the USDT is not traced (which is presumed to be the common situation),
+ * the associated USDT semaphore is "inactive", and so there is no need to
+ * waste resources preparing USDT arguments. Use USDT_IS_ACTIVE(group, name)
+ * to check whether the USDT is "active".
+ *
+ * N.B. There is an inherent (albeit short) gap between checking whether USDT
+ * is active and triggering the corresponding USDT, in which an external
+ * tracer can attach to the USDT and activate its semaphore after the activity
+ * check. If such a race occurs, tracers might miss one USDT execution.
+ * Tracers are expected to accommodate such a possibility, and this is not
+ * expected to be a problem for applications or tracers.
+ *
+ * N.B. The implicit USDT semaphore defined by USDT_WITH_SEMA() is contained
+ * within a single executable or shared library and is not shared outside of
+ * it. I.e., if you use USDT_WITH_SEMA() with the same USDT group and name
+ * identifier across executable and shared library, it will work and won't
+ * conflict, per se, but will define independent USDT semaphores, one for each
+ * shared library/executable in which USDT_WITH_SEMA(group, name) is used.
+ * That is, if you attach to this USDT in one shared library (or executable),
+ * then only the USDT semaphore within that shared library (or executable)
+ * will be updated by the kernel, while other libraries (or the executable)
+ * will not see an activated USDT semaphore. In short, it's best to use unique
+ * USDT group:name
+ * identifiers across different shared libraries (and, equivalently, between
+ * executable and shared library). This is an advanced consideration and is
+ * rarely (if ever) seen in practice, but just to avoid surprises this is
+ * called out here. (Static libraries become a part of final executable, once
+ * linked by linker, so the above considerations don't apply to them.)
+ */
+#ifdef __usdt_va_opt
+#define USDT_WITH_SEMA(group, name, ...) \
+ __usdt_probe(group, name, \
+ __usdt_sema_implicit, __usdt_sema_name(group, name) \
+ __VA_OPT__(,) __VA_ARGS__)
+#else
+#define USDT_WITH_SEMA(group, name, ...) \
+ __usdt_probe(group, name, \
+ __usdt_sema_implicit, __usdt_sema_name(group, name), \
+ ##__VA_ARGS__)
+#endif
+
+struct usdt_sema { volatile unsigned short active; };
+
+/*
+ * Check if USDT with `group`:`name` identifier is "active" (i.e., whether it
+ * is attached to by external tracing tooling and is actively observed).
+ *
+ * This macro can be used to decide whether any additional and potentially
+ * expensive data collection or processing should be done to pass extra
+ * information into the given USDT. It is assumed that USDT is triggered with
+ * USDT_WITH_SEMA() macro which will implicitly define associated USDT
+ * semaphore. (If one needs more control over USDT semaphore, see
+ * USDT_DEFINE_SEMA() and USDT_WITH_EXPLICIT_SEMA() macros below.)
+ *
+ * N.B. Such checks are necessarily racy and speculative. Between checking
+ * whether USDT is active and triggering the USDT itself, tracer can be
+ * detached with no notification. This race should be extremely rare and worst
+ * case should result in one-time wasted extra data collection and processing.
+ */
+#define USDT_IS_ACTIVE(group, name) ({ \
+ extern struct usdt_sema __usdt_sema_name(group, name) \
+ __usdt_asm_name(__usdt_sema_name(group, name)); \
+ __usdt_sema_implicit(__usdt_sema_name(group, name)); \
+ __usdt_sema_name(group, name).active > 0; \
+})
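+
+/*
+ * Illustrative usage sketch combining USDT_IS_ACTIVE() and USDT_WITH_SEMA()
+ * (the identifiers and the helper below are made up for the example):
+ *
+ *	if (USDT_IS_ACTIVE(myapp, request__done)) {
+ *		// pay for the expensive argument preparation only when traced
+ *		char *summary = build_request_summary(req); // hypothetical helper
+ *
+ *		USDT_WITH_SEMA(myapp, request__done, req->id, summary);
+ *		free(summary);
+ *	}
+ */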
+
+/*
+ * APIs for working with user-defined explicit USDT semaphores.
+ *
+ * This is a less commonly used advanced API for use cases in which user needs
+ * an explicit control over (potentially shared across multiple USDTs) USDT
+ * semaphore instance. This can be used when there is a group of logically
+ * related USDTs that all need extra data collection and processing whenever
+ * any of a family of related USDTs are "activated" (i.e., traced). In such
+ * a case, all such related USDTs will be associated with the same shared USDT
+ * semaphore defined with USDT_DEFINE_SEMA() and the USDTs themselves will be
+ * triggered with USDT_WITH_EXPLICIT_SEMA() macros, taking an explicit extra
+ * USDT semaphore identifier as an extra parameter.
+ */
+
+/*
+ * Underlying C global variable name for user-defined USDT semaphore with
+ * `sema` identifier. Could be useful for debugging, but normally shouldn't be
+ * used explicitly.
+ */
+#define USDT_SEMA(sema) __usdt_sema_##sema
+
+/*
+ * Define storage for user-defined USDT semaphore `sema`.
+ *
+ * Should be used only once in non-header source file to let compiler allocate
+ * space for the semaphore variable. Just like with any other global variable.
+ *
+ * This macro can be used anywhere where global variable declaration is
+ * allowed. Just like with global variable definitions, there should be only
+ * one definition of user-defined USDT semaphore with given `sema` identifier,
+ * otherwise compiler or linker will complain about duplicate variable
+ * definition.
+ *
+ * For C++, it is allowed to use USDT_DEFINE_SEMA() both in global namespace
+ * and inside namespaces (including nested namespaces). Just make sure that
+ * USDT_DECLARE_SEMA() is placed within the namespace where this semaphore is
+ * referenced, or any of its parent namespaces, so the C++ language-level
+ * identifier is visible to the code that needs to reference the semaphore.
+ * At the lowest layer, USDT semaphores have global naming and visibility
+ * (they have a corresponding `__usdt_sema_<name>` symbol, which can be linked
+ * against from C or C++ code, if necessary). To keep it simple, putting
+ * USDT_DECLARE_SEMA() declarations into global namespaces is the simplest
+ * no-brainer solution. All these aspects are irrelevant for plain C, because
+ * C doesn't have namespaces and everything is always in the global namespace.
+ *
+ * N.B. Due to USDT metadata being recorded in non-allocatable ELF note
+ * section, it has limitations when it comes to relocations, which, in
+ * practice, means that it's not possible to correctly share USDT semaphores
+ * between main executable and shared libraries, or even between multiple
+ * shared libraries. USDT semaphore has to be contained to individual shared
+ * library or executable to avoid unpleasant surprises with half-working USDT
+ * semaphores. We enforce this by marking semaphore ELF symbols as having
+ * a hidden visibility. This is quite an advanced use case and consideration
+ * and for most users this should have no consequences whatsoever.
+ */
+#define USDT_DEFINE_SEMA(sema) \
+ struct usdt_sema __usdt_sema_sec USDT_SEMA(sema) \
+ __usdt_asm_name(USDT_SEMA(sema)) \
+ __attribute__((visibility("hidden"))) = { 0 }
+
+/*
+ * Declare extern reference to user-defined USDT semaphore `sema`.
+ *
+ * Refers to a variable defined in another compilation unit by
+ * USDT_DEFINE_SEMA() and allows to use the same USDT semaphore across
+ * multiple compilation units (i.e., .c and .cpp files).
+ *
+ * See USDT_DEFINE_SEMA() notes above for C++ language usage peculiarities.
+ */
+#define USDT_DECLARE_SEMA(sema) \
+ extern struct usdt_sema USDT_SEMA(sema) __usdt_asm_name(USDT_SEMA(sema))
+
+/*
+ * Check if user-defined USDT semaphore `sema` is "active" (i.e., whether it
+ * is attached to by external tracing tooling and is actively observed).
+ *
+ * This macro can be used to decide whether any additional and potentially
+ * expensive data collection or processing should be done to pass extra
+ * information into USDT(s) associated with USDT semaphore `sema`.
+ *
+ * N.B. Such checks are necessarily racy. Between checking the state of the
+ * USDT semaphore and triggering the associated USDT(s), a tracer might attach
+ * or detach. This race should be extremely rare, and the worst case is a
+ * one-time missed USDT event or wasted extra data collection and processing.
+ * USDT-using tracers should be written with this in mind; it is not a concern
+ * of the application defining USDTs with an associated semaphore.
+ */
+#define USDT_SEMA_IS_ACTIVE(sema) (USDT_SEMA(sema).active > 0)
+
+/*
+ * Invoke USDT specified by `group` and `name` identifiers and associate
+ * explicitly user-defined semaphore `sema` with it. Pass through `args` as
+ * USDT arguments. `args` are optional and zero arguments are acceptable.
+ *
+ * Semaphore is defined with the help of USDT_DEFINE_SEMA() macro and can be
+ * checked whether active with USDT_SEMA_IS_ACTIVE().
+ */
+#ifdef __usdt_va_opt
+#define USDT_WITH_EXPLICIT_SEMA(sema, group, name, ...) \
+	__usdt_probe(group, name, __usdt_sema_explicit, USDT_SEMA(sema) __VA_OPT__(,) __VA_ARGS__)
+#else
+#define USDT_WITH_EXPLICIT_SEMA(sema, group, name, ...) \
+	__usdt_probe(group, name, __usdt_sema_explicit, USDT_SEMA(sema), ##__VA_ARGS__)
+#endif
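+
+/*
+ * Illustrative usage sketch for explicit, shared USDT semaphores (all
+ * identifiers below are made up for the example): one semaphore guards two
+ * related USDTs, so attaching a tracer to either probe activates the extra
+ * argument preparation for both call sites.
+ *
+ *	USDT_DEFINE_SEMA(io_sema);	// in exactly one .c file
+ *
+ *	void submit_io(struct io_req *io)
+ *	{
+ *		if (USDT_SEMA_IS_ACTIVE(io_sema))
+ *			USDT_WITH_EXPLICIT_SEMA(io_sema, myapp, io__submit, io->id);
+ *	}
+ *
+ *	void complete_io(struct io_req *io)
+ *	{
+ *		if (USDT_SEMA_IS_ACTIVE(io_sema))
+ *			USDT_WITH_EXPLICIT_SEMA(io_sema, myapp, io__complete, io->id, io->res);
+ *	}
+ */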
+
+/*
+ * Adjustable implementation aspects
+ */
+#ifndef USDT_ARG_CONSTRAINT
+#if defined __powerpc__
+#define USDT_ARG_CONSTRAINT nZr
+#elif defined __arm__
+#define USDT_ARG_CONSTRAINT g
+#elif defined __loongarch__
+#define USDT_ARG_CONSTRAINT nmr
+#else
+#define USDT_ARG_CONSTRAINT nor
+#endif
+#endif /* USDT_ARG_CONSTRAINT */
+
+#ifndef USDT_NOP
+#if defined(__ia64__) || defined(__s390__) || defined(__s390x__)
+#define USDT_NOP nop 0
+#else
+#define USDT_NOP nop
+#endif
+#endif /* USDT_NOP */
+
+/*
+ * Implementation details
+ */
+/* USDT name for implicitly-defined USDT semaphore, derived from group:name */
+#define __usdt_sema_name(group, name) __usdt_sema_##group##__##name
+/* ELF section into which USDT semaphores are put */
+#define __usdt_sema_sec __attribute__((section(".probes")))
+
+#define __usdt_concat(a, b) a ## b
+#define __usdt_apply(fn, n) __usdt_concat(fn, n)
+
+#ifndef __usdt_nth
+#define __usdt_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, N, ...) N
+#endif
+
+#ifndef __usdt_narg
+#ifdef __usdt_va_opt
+#define __usdt_narg(...) __usdt_nth(_ __VA_OPT__(,) __VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
+#else
+#define __usdt_narg(...) __usdt_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
+#endif
+#endif /* __usdt_narg */
+
+#define __usdt_hash #
+#define __usdt_str_(x) #x
+#define __usdt_str(x) __usdt_str_(x)
+
+#ifndef __usdt_asm_name
+#define __usdt_asm_name(name) __asm__(__usdt_str(name))
+#endif
+
+#define __usdt_asm0() "\n"
+#define __usdt_asm1(x) __usdt_str(x) "\n"
+#define __usdt_asm2(x, ...) __usdt_str(x) "," __usdt_asm1(__VA_ARGS__)
+#define __usdt_asm3(x, ...) __usdt_str(x) "," __usdt_asm2(__VA_ARGS__)
+#define __usdt_asm4(x, ...) __usdt_str(x) "," __usdt_asm3(__VA_ARGS__)
+#define __usdt_asm5(x, ...) __usdt_str(x) "," __usdt_asm4(__VA_ARGS__)
+#define __usdt_asm6(x, ...) __usdt_str(x) "," __usdt_asm5(__VA_ARGS__)
+#define __usdt_asm7(x, ...) __usdt_str(x) "," __usdt_asm6(__VA_ARGS__)
+#define __usdt_asm8(x, ...) __usdt_str(x) "," __usdt_asm7(__VA_ARGS__)
+#define __usdt_asm9(x, ...) __usdt_str(x) "," __usdt_asm8(__VA_ARGS__)
+#define __usdt_asm10(x, ...) __usdt_str(x) "," __usdt_asm9(__VA_ARGS__)
+#define __usdt_asm11(x, ...) __usdt_str(x) "," __usdt_asm10(__VA_ARGS__)
+#define __usdt_asm12(x, ...) __usdt_str(x) "," __usdt_asm11(__VA_ARGS__)
+#define __usdt_asm(...) __usdt_apply(__usdt_asm, __usdt_narg(__VA_ARGS__))(__VA_ARGS__)
+
+#ifdef __LP64__
+#define __usdt_asm_addr .8byte
+#else
+#define __usdt_asm_addr .4byte
+#endif
+
+#define __usdt_asm_strz_(x) __usdt_asm1(.asciz #x)
+#define __usdt_asm_strz(x) __usdt_asm_strz_(x)
+#define __usdt_asm_str_(x) __usdt_asm1(.ascii #x)
+#define __usdt_asm_str(x) __usdt_asm_str_(x)
+
+/* "semaphoreless" USDT case */
+#ifndef __usdt_sema_none
+#define __usdt_sema_none(sema)
+#endif
+
+/* implicitly defined __usdt_sema__group__name semaphore (using weak symbols) */
+#ifndef __usdt_sema_implicit
+#define __usdt_sema_implicit(sema) \
+ __asm__ __volatile__ ( \
+ __usdt_asm1(.ifndef sema) \
+ __usdt_asm3( .pushsection .probes, "aw", "progbits") \
+ __usdt_asm1( .weak sema) \
+ __usdt_asm1( .hidden sema) \
+ __usdt_asm1( .align 2) \
+ __usdt_asm1(sema:) \
+ __usdt_asm1( .zero 2) \
+ __usdt_asm2( .type sema, @object) \
+ __usdt_asm2( .size sema, 2) \
+ __usdt_asm1( .popsection) \
+ __usdt_asm1(.endif) \
+ );
+#endif
+
+/* externally defined semaphore using USDT_DEFINE_SEMA() and passed explicitly by user */
+#ifndef __usdt_sema_explicit
+#define __usdt_sema_explicit(sema) \
+ __asm__ __volatile__ ("" :: "m" (sema));
+#endif
+
+/* main USDT definition (nop and .note.stapsdt metadata) */
+#define __usdt_probe(group, name, sema_def, sema, ...) do { \
+ sema_def(sema) \
+ __asm__ __volatile__ ( \
+ __usdt_asm( 990: USDT_NOP) \
+ __usdt_asm3( .pushsection .note.stapsdt, "", "note") \
+ __usdt_asm1( .balign 4) \
+ __usdt_asm3( .4byte 992f-991f,994f-993f,3) \
+ __usdt_asm1(991: .asciz "stapsdt") \
+ __usdt_asm1(992: .balign 4) \
+ __usdt_asm1(993: __usdt_asm_addr 990b) \
+ __usdt_asm1( __usdt_asm_addr _.stapsdt.base) \
+ __usdt_asm1( __usdt_asm_addr sema) \
+ __usdt_asm_strz(group) \
+ __usdt_asm_strz(name) \
+ __usdt_asm_args(__VA_ARGS__) \
+ __usdt_asm1( .ascii "\0") \
+ __usdt_asm1(994: .balign 4) \
+ __usdt_asm1( .popsection) \
+ __usdt_asm1(.ifndef _.stapsdt.base) \
+ __usdt_asm5( .pushsection .stapsdt.base,"aG","progbits",.stapsdt.base,comdat)\
+ __usdt_asm1( .weak _.stapsdt.base) \
+ __usdt_asm1( .hidden _.stapsdt.base) \
+ __usdt_asm1(_.stapsdt.base:) \
+ __usdt_asm1( .space 1) \
+ __usdt_asm2( .size _.stapsdt.base, 1) \
+ __usdt_asm1( .popsection) \
+ __usdt_asm1(.endif) \
+ :: __usdt_asm_ops(__VA_ARGS__) \
+ ); \
+} while (0)
+
+/*
+ * NB: gdb PR24541 highlighted an unspecified corner of the sdt.h
+ * operand note format.
+ *
+ * The named register may be a longer or shorter (!) alias for the
+ * storage where the value in question is found. For example, on
+ * i386, 64-bit value may be put in register pairs, and a register
+ * name stored would identify just one of them. Previously, gcc was
+ * asked to emit the %w[id] (16-bit alias of some registers holding
+ * operands), even when a wider 32-bit value was used.
+ *
+ * Bottom line: the byte-width given before the @ sign governs. If
+ * there is a mismatch between that width and that of the named
+ * register, then a sys/sdt.h note consumer may need to employ
+ * architecture-specific heuristics to figure out where the compiler
+ * has actually put the complete value.
+ */
+#if defined(__powerpc__) || defined(__powerpc64__)
+#define __usdt_argref(id) %I[id]%[id]
+#elif defined(__i386__)
+#define __usdt_argref(id) %k[id] /* gcc.gnu.org/PR80115 sourceware.org/PR24541 */
+#else
+#define __usdt_argref(id) %[id]
+#endif
+
+#define __usdt_asm_arg(n) __usdt_asm_str(%c[__usdt_asz##n]) \
+ __usdt_asm1(.ascii "@") \
+ __usdt_asm_str(__usdt_argref(__usdt_aval##n))
+
+#define __usdt_asm_args0 /* no arguments */
+#define __usdt_asm_args1 __usdt_asm_arg(1)
+#define __usdt_asm_args2 __usdt_asm_args1 __usdt_asm1(.ascii " ") __usdt_asm_arg(2)
+#define __usdt_asm_args3 __usdt_asm_args2 __usdt_asm1(.ascii " ") __usdt_asm_arg(3)
+#define __usdt_asm_args4 __usdt_asm_args3 __usdt_asm1(.ascii " ") __usdt_asm_arg(4)
+#define __usdt_asm_args5 __usdt_asm_args4 __usdt_asm1(.ascii " ") __usdt_asm_arg(5)
+#define __usdt_asm_args6 __usdt_asm_args5 __usdt_asm1(.ascii " ") __usdt_asm_arg(6)
+#define __usdt_asm_args7 __usdt_asm_args6 __usdt_asm1(.ascii " ") __usdt_asm_arg(7)
+#define __usdt_asm_args8 __usdt_asm_args7 __usdt_asm1(.ascii " ") __usdt_asm_arg(8)
+#define __usdt_asm_args9 __usdt_asm_args8 __usdt_asm1(.ascii " ") __usdt_asm_arg(9)
+#define __usdt_asm_args10 __usdt_asm_args9 __usdt_asm1(.ascii " ") __usdt_asm_arg(10)
+#define __usdt_asm_args11 __usdt_asm_args10 __usdt_asm1(.ascii " ") __usdt_asm_arg(11)
+#define __usdt_asm_args12 __usdt_asm_args11 __usdt_asm1(.ascii " ") __usdt_asm_arg(12)
+#define __usdt_asm_args(...) __usdt_apply(__usdt_asm_args, __usdt_narg(__VA_ARGS__))
+
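+/*
+ * The __builtin_classify_type() values below should correspond to GCC's
+ * array_type_class (14) and pointer_type_class (5); array-like arguments
+ * are recorded as pointer-sized values.
+ */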
+#define __usdt_is_arr(x) (__builtin_classify_type(x) == 14 || __builtin_classify_type(x) == 5)
+#define __usdt_arg_size(x) (__usdt_is_arr(x) ? sizeof(void *) : sizeof(x))
+
+/*
+ * We can't use __builtin_choose_expr() in C++, so fall back to table-based
+ * signedness determination for known types, utilizing templates magic.
+ */
+#ifdef __cplusplus
+
+#define __usdt_is_signed(x) (!__usdt_is_arr(x) && __usdt_t<__typeof(x)>::is_signed)
+
+#include <cstddef>
+
+template<typename T> struct __usdt_t { static const bool is_signed = false; };
+template<typename A> struct __usdt_t<A[]> : public __usdt_t<A *> {};
+template<typename A, size_t N> struct __usdt_t<A[N]> : public __usdt_t<A *> {};
+
+#define __usdt_def_signed(T) \
+template<> struct __usdt_t<T> { static const bool is_signed = true; }; \
+template<> struct __usdt_t<const T> { static const bool is_signed = true; }; \
+template<> struct __usdt_t<volatile T> { static const bool is_signed = true; }; \
+template<> struct __usdt_t<const volatile T> { static const bool is_signed = true; }
+#define __usdt_maybe_signed(T) \
+template<> struct __usdt_t<T> { static const bool is_signed = (T)-1 < (T)1; }; \
+template<> struct __usdt_t<const T> { static const bool is_signed = (T)-1 < (T)1; }; \
+template<> struct __usdt_t<volatile T> { static const bool is_signed = (T)-1 < (T)1; }; \
+template<> struct __usdt_t<const volatile T> { static const bool is_signed = (T)-1 < (T)1; }
+
+__usdt_def_signed(signed char);
+__usdt_def_signed(short);
+__usdt_def_signed(int);
+__usdt_def_signed(long);
+__usdt_def_signed(long long);
+__usdt_maybe_signed(char);
+__usdt_maybe_signed(wchar_t);
+
+#else /* !__cplusplus */
+
+#define __usdt_is_inttype(x) (__builtin_classify_type(x) >= 1 && __builtin_classify_type(x) <= 4)
+#define __usdt_inttype(x) __typeof(__builtin_choose_expr(__usdt_is_inttype(x), (x), 0U))
+#define __usdt_is_signed(x) ((__usdt_inttype(x))-1 < (__usdt_inttype(x))1)
+
+#endif /* __cplusplus */
+
+#define __usdt_asm_op(n, x) \
+ [__usdt_asz##n] "n" ((__usdt_is_signed(x) ? (int)-1 : 1) * (int)__usdt_arg_size(x)), \
+ [__usdt_aval##n] __usdt_str(USDT_ARG_CONSTRAINT)(x)
+
+#define __usdt_asm_ops0() [__usdt_dummy] "g" (0)
+#define __usdt_asm_ops1(x) __usdt_asm_op(1, x)
+#define __usdt_asm_ops2(a,x) __usdt_asm_ops1(a), __usdt_asm_op(2, x)
+#define __usdt_asm_ops3(a,b,x) __usdt_asm_ops2(a,b), __usdt_asm_op(3, x)
+#define __usdt_asm_ops4(a,b,c,x) __usdt_asm_ops3(a,b,c), __usdt_asm_op(4, x)
+#define __usdt_asm_ops5(a,b,c,d,x) __usdt_asm_ops4(a,b,c,d), __usdt_asm_op(5, x)
+#define __usdt_asm_ops6(a,b,c,d,e,x) __usdt_asm_ops5(a,b,c,d,e), __usdt_asm_op(6, x)
+#define __usdt_asm_ops7(a,b,c,d,e,f,x) __usdt_asm_ops6(a,b,c,d,e,f), __usdt_asm_op(7, x)
+#define __usdt_asm_ops8(a,b,c,d,e,f,g,x) __usdt_asm_ops7(a,b,c,d,e,f,g), __usdt_asm_op(8, x)
+#define __usdt_asm_ops9(a,b,c,d,e,f,g,h,x) __usdt_asm_ops8(a,b,c,d,e,f,g,h), __usdt_asm_op(9, x)
+#define __usdt_asm_ops10(a,b,c,d,e,f,g,h,i,x) __usdt_asm_ops9(a,b,c,d,e,f,g,h,i), __usdt_asm_op(10, x)
+#define __usdt_asm_ops11(a,b,c,d,e,f,g,h,i,j,x) __usdt_asm_ops10(a,b,c,d,e,f,g,h,i,j), __usdt_asm_op(11, x)
+#define __usdt_asm_ops12(a,b,c,d,e,f,g,h,i,j,k,x) __usdt_asm_ops11(a,b,c,d,e,f,g,h,i,j,k), __usdt_asm_op(12, x)
+#define __usdt_asm_ops(...) __usdt_apply(__usdt_asm_ops, __usdt_narg(__VA_ARGS__))(__VA_ARGS__)
+
+#endif /* __USDT_H */
diff --git a/tools/testing/selftests/cgroup/lib/cgroup_util.c b/tools/testing/selftests/cgroup/lib/cgroup_util.c
index 0e89fcff4d05..44c52f620fda 100644
--- a/tools/testing/selftests/cgroup/lib/cgroup_util.c
+++ b/tools/testing/selftests/cgroup/lib/cgroup_util.c
@@ -522,6 +522,18 @@ int proc_mount_contains(const char *option)
return strstr(buf, option) != NULL;
}
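+
+/*
+ * Check whether @feature is listed in /sys/kernel/cgroup/features.
+ * Returns > 0 if the feature is present, 0 if it is absent, or a negative
+ * value if the file could not be read.
+ */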
+int cgroup_feature(const char *feature)
+{
+ char buf[PAGE_SIZE];
+ ssize_t read;
+
+ read = read_text("/sys/kernel/cgroup/features", buf, sizeof(buf));
+ if (read < 0)
+ return read;
+
+ return strstr(buf, feature) != NULL;
+}
+
ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size)
{
char path[PATH_MAX];
diff --git a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h
index c69cab66254b..9dc90a1b386d 100644
--- a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h
+++ b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h
@@ -60,6 +60,7 @@ extern int cg_run_nowait(const char *cgroup,
extern int cg_wait_for_proc_count(const char *cgroup, int count);
extern int cg_killall(const char *cgroup);
int proc_mount_contains(const char *option);
+int cgroup_feature(const char *feature);
extern ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size);
extern int proc_read_strstr(int pid, bool thread, const char *item, const char *needle);
extern pid_t clone_into_cgroup(int cgroup_fd);
diff --git a/tools/testing/selftests/cgroup/test_freezer.c b/tools/testing/selftests/cgroup/test_freezer.c
index 8730645d363a..dfb763819581 100644
--- a/tools/testing/selftests/cgroup/test_freezer.c
+++ b/tools/testing/selftests/cgroup/test_freezer.c
@@ -804,6 +804,662 @@ cleanup:
return ret;
}
+/*
+ * Get the current frozen_usec for the cgroup.
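+ * This is the cumulative time the cgroup has spent frozen, in microseconds,
+ * as reported by the "frozen_usec" key of cgroup.stat.local. A negative
+ * return value means the kernel does not expose this key and the test
+ * should be skipped.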
+ */
+static long cg_check_freezetime(const char *cgroup)
+{
+ return cg_read_key_long(cgroup, "cgroup.stat.local",
+ "frozen_usec ");
+}
+
+/*
+ * Test that the freeze time will behave as expected for an empty cgroup.
+ */
+static int test_cgfreezer_time_empty(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *cgroup = NULL;
+ long prev, curr;
+
+ cgroup = cg_name(root, "cg_time_test_empty");
+ if (!cgroup)
+ goto cleanup;
+
+ /*
+ * 1) Create an empty cgroup and check that its freeze time
+ * is 0.
+ */
+ if (cg_create(cgroup))
+ goto cleanup;
+
+ curr = cg_check_freezetime(cgroup);
+ if (curr < 0) {
+ ret = KSFT_SKIP;
+ goto cleanup;
+ }
+ if (curr > 0) {
+ debug("Expect time (%ld) to be 0\n", curr);
+ goto cleanup;
+ }
+
+ if (cg_freeze_nowait(cgroup, true))
+ goto cleanup;
+
+ /*
+ * 2) Sleep for 1000 us. Check that the freeze time is at
+ * least 1000 us.
+ */
+ usleep(1000);
+ curr = cg_check_freezetime(cgroup);
+ if (curr < 1000) {
+ debug("Expect time (%ld) to be at least 1000 us\n",
+ curr);
+ goto cleanup;
+ }
+
+ /*
+ * 3) Unfreeze the cgroup. Check that the freeze time is
+ * larger than at 2).
+ */
+ if (cg_freeze_nowait(cgroup, false))
+ goto cleanup;
+ prev = curr;
+ curr = cg_check_freezetime(cgroup);
+ if (curr <= prev) {
+ debug("Expect time (%ld) to be more than previous check (%ld)\n",
+ curr, prev);
+ goto cleanup;
+ }
+
+ /*
+ * 4) Check the freeze time again to ensure that it has not
+ * changed.
+ */
+ prev = curr;
+ curr = cg_check_freezetime(cgroup);
+ if (curr != prev) {
+ debug("Expect time (%ld) to be unchanged from previous check (%ld)\n",
+ curr, prev);
+ goto cleanup;
+ }
+
+ ret = KSFT_PASS;
+
+cleanup:
+ if (cgroup)
+ cg_destroy(cgroup);
+ free(cgroup);
+ return ret;
+}
+
+/*
+ * A simple test for cgroup freezer time accounting. This test follows
+ * the same flow as test_cgfreezer_time_empty, but with a single process
+ * in the cgroup.
+ */
+static int test_cgfreezer_time_simple(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *cgroup = NULL;
+ long prev, curr;
+
+ cgroup = cg_name(root, "cg_time_test_simple");
+ if (!cgroup)
+ goto cleanup;
+
+ /*
+ * 1) Create a cgroup and check that its freeze time is 0.
+ */
+ if (cg_create(cgroup))
+ goto cleanup;
+
+ curr = cg_check_freezetime(cgroup);
+ if (curr < 0) {
+ ret = KSFT_SKIP;
+ goto cleanup;
+ }
+ if (curr > 0) {
+ debug("Expect time (%ld) to be 0\n", curr);
+ goto cleanup;
+ }
+
+ /*
+ * 2) Populate the cgroup with one child and check that the
+ * freeze time is still 0.
+ */
+ cg_run_nowait(cgroup, child_fn, NULL);
+ prev = curr;
+ curr = cg_check_freezetime(cgroup);
+ if (curr > prev) {
+ debug("Expect time (%ld) to be 0\n", curr);
+ goto cleanup;
+ }
+
+ if (cg_freeze_nowait(cgroup, true))
+ goto cleanup;
+
+ /*
+ * 3) Sleep for 1000 us. Check that the freeze time is at
+ * least 1000 us.
+ */
+ usleep(1000);
+ prev = curr;
+ curr = cg_check_freezetime(cgroup);
+ if (curr < 1000) {
+ debug("Expect time (%ld) to be at least 1000 us\n",
+ curr);
+ goto cleanup;
+ }
+
+ /*
+ * 4) Unfreeze the cgroup. Check that the freeze time is
+ * larger than at 3).
+ */
+ if (cg_freeze_nowait(cgroup, false))
+ goto cleanup;
+ prev = curr;
+ curr = cg_check_freezetime(cgroup);
+ if (curr <= prev) {
+ debug("Expect time (%ld) to be more than previous check (%ld)\n",
+ curr, prev);
+ goto cleanup;
+ }
+
+ /*
+ * 5) Sleep for 1000 us. Check that the freeze time is the
+ * same as at 4).
+ */
+ usleep(1000);
+ prev = curr;
+ curr = cg_check_freezetime(cgroup);
+ if (curr != prev) {
+ debug("Expect time (%ld) to be unchanged from previous check (%ld)\n",
+ curr, prev);
+ goto cleanup;
+ }
+
+ ret = KSFT_PASS;
+
+cleanup:
+ if (cgroup)
+ cg_destroy(cgroup);
+ free(cgroup);
+ return ret;
+}
+
+/*
+ * Test that freezer time accounting works as expected, even while we're
+ * populating a cgroup with processes.
+ */
+static int test_cgfreezer_time_populate(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *cgroup = NULL;
+ long prev, curr;
+ int i;
+
+ cgroup = cg_name(root, "cg_time_test_populate");
+ if (!cgroup)
+ goto cleanup;
+
+ if (cg_create(cgroup))
+ goto cleanup;
+
+ curr = cg_check_freezetime(cgroup);
+ if (curr < 0) {
+ ret = KSFT_SKIP;
+ goto cleanup;
+ }
+ if (curr > 0) {
+ debug("Expect time (%ld) to be 0\n", curr);
+ goto cleanup;
+ }
+
+ /*
+ * 1) Populate the cgroup with 100 processes. Check that
+ * the freeze time is 0.
+ */
+ for (i = 0; i < 100; i++)
+ cg_run_nowait(cgroup, child_fn, NULL);
+ prev = curr;
+ curr = cg_check_freezetime(cgroup);
+ if (curr != prev) {
+ debug("Expect time (%ld) to be 0\n", curr);
+ goto cleanup;
+ }
+
+ /*
+ * 2) Wait for the group to become fully populated. Check
+ * that the freeze time is 0.
+ */
+ if (cg_wait_for_proc_count(cgroup, 100))
+ goto cleanup;
+ prev = curr;
+ curr = cg_check_freezetime(cgroup);
+ if (curr != prev) {
+ debug("Expect time (%ld) to be 0\n", curr);
+ goto cleanup;
+ }
+
+ /*
+ * 3) Freeze the cgroup and then populate it with 100 more
+ * processes. Check that the freeze time continues to grow.
+ */
+ if (cg_freeze_nowait(cgroup, true))
+ goto cleanup;
+ prev = curr;
+ curr = cg_check_freezetime(cgroup);
+ if (curr <= prev) {
+ debug("Expect time (%ld) to be more than previous check (%ld)\n",
+ curr, prev);
+ goto cleanup;
+ }
+
+ for (i = 0; i < 100; i++)
+ cg_run_nowait(cgroup, child_fn, NULL);
+ prev = curr;
+ curr = cg_check_freezetime(cgroup);
+ if (curr <= prev) {
+ debug("Expect time (%ld) to be more than previous check (%ld)\n",
+ curr, prev);
+ goto cleanup;
+ }
+
+ /*
+ * 4) Wait for the group to become fully populated. Check
+ * that the freeze time is larger than at 3).
+ */
+ if (cg_wait_for_proc_count(cgroup, 200))
+ goto cleanup;
+ prev = curr;
+ curr = cg_check_freezetime(cgroup);
+ if (curr <= prev) {
+ debug("Expect time (%ld) to be more than previous check (%ld)\n",
+ curr, prev);
+ goto cleanup;
+ }
+
+ /*
+ * 5) Unfreeze the cgroup. Check that the freeze time is
+ * larger than at 4).
+ */
+ if (cg_freeze_nowait(cgroup, false))
+ goto cleanup;
+ prev = curr;
+ curr = cg_check_freezetime(cgroup);
+ if (curr <= prev) {
+ debug("Expect time (%ld) to be more than previous check (%ld)\n",
+ curr, prev);
+ goto cleanup;
+ }
+
+ /*
+ * 6) Kill the processes. Check that the freeze time is the
+ * same as it was at 5).
+ */
+ if (cg_killall(cgroup))
+ goto cleanup;
+ prev = curr;
+ curr = cg_check_freezetime(cgroup);
+ if (curr != prev) {
+ debug("Expect time (%ld) to be unchanged from previous check (%ld)\n",
+ curr, prev);
+ goto cleanup;
+ }
+
+ /*
+ * 7) Freeze and unfreeze the cgroup. Check that the freeze
+ * time is larger than it was at 6).
+ */
+ if (cg_freeze_nowait(cgroup, true))
+ goto cleanup;
+ if (cg_freeze_nowait(cgroup, false))
+ goto cleanup;
+ prev = curr;
+ curr = cg_check_freezetime(cgroup);
+ if (curr <= prev) {
+ debug("Expect time (%ld) to be more than previous check (%ld)\n",
+ curr, prev);
+ goto cleanup;
+ }
+
+ ret = KSFT_PASS;
+
+cleanup:
+ if (cgroup)
+ cg_destroy(cgroup);
+ free(cgroup);
+ return ret;
+}
+
+/*
+ * Test that frozen time for a cgroup continues to work as expected,
+ * even as processes are migrated. Frozen cgroup A's freeze time should
+ * continue to increase and running cgroup B's should stay 0.
+ */
+static int test_cgfreezer_time_migrate(const char *root)
+{
+ long prev_A, curr_A, curr_B;
+ char *cgroup[2] = {0};
+ int ret = KSFT_FAIL;
+ int pid;
+
+ cgroup[0] = cg_name(root, "cg_time_test_migrate_A");
+ if (!cgroup[0])
+ goto cleanup;
+
+ cgroup[1] = cg_name(root, "cg_time_test_migrate_B");
+ if (!cgroup[1])
+ goto cleanup;
+
+ if (cg_create(cgroup[0]))
+ goto cleanup;
+
+ if (cg_check_freezetime(cgroup[0]) < 0) {
+ ret = KSFT_SKIP;
+ goto cleanup;
+ }
+
+ if (cg_create(cgroup[1]))
+ goto cleanup;
+
+ pid = cg_run_nowait(cgroup[0], child_fn, NULL);
+ if (pid < 0)
+ goto cleanup;
+
+ if (cg_wait_for_proc_count(cgroup[0], 1))
+ goto cleanup;
+
+ curr_A = cg_check_freezetime(cgroup[0]);
+ if (curr_A) {
+ debug("Expect time (%ld) to be 0\n", curr_A);
+ goto cleanup;
+ }
+ curr_B = cg_check_freezetime(cgroup[1]);
+ if (curr_B) {
+ debug("Expect time (%ld) to be 0\n", curr_B);
+ goto cleanup;
+ }
+
+ /*
+ * Freeze cgroup A.
+ */
+ if (cg_freeze_wait(cgroup[0], true))
+ goto cleanup;
+ prev_A = curr_A;
+ curr_A = cg_check_freezetime(cgroup[0]);
+ if (curr_A <= prev_A) {
+ debug("Expect time (%ld) to be > 0\n", curr_A);
+ goto cleanup;
+ }
+
+ /*
+ * Migrate from A (frozen) to B (running).
+ */
+ if (cg_enter(cgroup[1], pid))
+ goto cleanup;
+
+ usleep(1000);
+ curr_B = cg_check_freezetime(cgroup[1]);
+ if (curr_B) {
+ debug("Expect time (%ld) to be 0\n", curr_B);
+ goto cleanup;
+ }
+
+ prev_A = curr_A;
+ curr_A = cg_check_freezetime(cgroup[0]);
+ if (curr_A <= prev_A) {
+ debug("Expect time (%ld) to be more than previous check (%ld)\n",
+ curr_A, prev_A);
+ goto cleanup;
+ }
+
+ ret = KSFT_PASS;
+
+cleanup:
+ if (cgroup[0])
+ cg_destroy(cgroup[0]);
+ free(cgroup[0]);
+ if (cgroup[1])
+ cg_destroy(cgroup[1]);
+ free(cgroup[1]);
+ return ret;
+}
+
+/*
+ * The test creates a cgroup and freezes it. Then it creates a child cgroup.
+ * After that it checks that the child cgroup has a non-zero freeze time
+ * that is less than the parent's. Next, it freezes the child, unfreezes
+ * the parent, and sleeps. Finally, it checks that the child's freeze
+ * time has grown larger than the parent's.
+ */
+static int test_cgfreezer_time_parent(const char *root)
+{
+ char *parent, *child = NULL;
+ int ret = KSFT_FAIL;
+ long ptime, ctime;
+
+ parent = cg_name(root, "cg_test_parent_A");
+ if (!parent)
+ goto cleanup;
+
+ child = cg_name(parent, "cg_test_parent_B");
+ if (!child)
+ goto cleanup;
+
+ if (cg_create(parent))
+ goto cleanup;
+
+ if (cg_check_freezetime(parent) < 0) {
+ ret = KSFT_SKIP;
+ goto cleanup;
+ }
+
+ if (cg_freeze_wait(parent, true))
+ goto cleanup;
+
+ usleep(1000);
+ if (cg_create(child))
+ goto cleanup;
+
+ if (cg_check_frozen(child, true))
+ goto cleanup;
+
+ /*
+ * Since the parent was frozen the entire time the child cgroup
+ * was being created, we expect the parent's freeze time to be
+ * larger than the child's.
+ *
+ * Ideally, we would be able to check both times simultaneously,
+ * but here we get the child's after we get the parent's.
+ */
+ ptime = cg_check_freezetime(parent);
+ ctime = cg_check_freezetime(child);
+ if (ptime <= ctime) {
+ debug("Expect ptime (%ld) > ctime (%ld)\n", ptime, ctime);
+ goto cleanup;
+ }
+
+ if (cg_freeze_nowait(child, true))
+ goto cleanup;
+
+ if (cg_freeze_wait(parent, false))
+ goto cleanup;
+
+ if (cg_check_frozen(child, true))
+ goto cleanup;
+
+ usleep(100000);
+
+ ctime = cg_check_freezetime(child);
+ ptime = cg_check_freezetime(parent);
+
+ if (ctime <= ptime) {
+ debug("Expect ctime (%ld) > ptime (%ld)\n", ctime, ptime);
+ goto cleanup;
+ }
+
+ ret = KSFT_PASS;
+
+cleanup:
+ if (child)
+ cg_destroy(child);
+ free(child);
+ if (parent)
+ cg_destroy(parent);
+ free(parent);
+ return ret;
+}
+
+/*
+ * The test creates a parent cgroup and a child cgroup. Then, it freezes
+ * the child and checks that the child's freeze time is greater than the
+ * parent's, which should be zero.
+ */
+static int test_cgfreezer_time_child(const char *root)
+{
+ char *parent, *child = NULL;
+ int ret = KSFT_FAIL;
+ long ptime, ctime;
+
+ parent = cg_name(root, "cg_test_child_A");
+ if (!parent)
+ goto cleanup;
+
+ child = cg_name(parent, "cg_test_child_B");
+ if (!child)
+ goto cleanup;
+
+ if (cg_create(parent))
+ goto cleanup;
+
+ if (cg_check_freezetime(parent) < 0) {
+ ret = KSFT_SKIP;
+ goto cleanup;
+ }
+
+ if (cg_create(child))
+ goto cleanup;
+
+ if (cg_freeze_wait(child, true))
+ goto cleanup;
+
+ ctime = cg_check_freezetime(child);
+ ptime = cg_check_freezetime(parent);
+ if (ptime != 0) {
+ debug("Expect ptime (%ld) to be 0\n", ptime);
+ goto cleanup;
+ }
+
+ if (ctime <= ptime) {
+		debug("Expect ctime (%ld) > ptime (%ld)\n", ctime, ptime);
+ goto cleanup;
+ }
+
+ ret = KSFT_PASS;
+
+cleanup:
+ if (child)
+ cg_destroy(child);
+ free(child);
+ if (parent)
+ cg_destroy(parent);
+ free(parent);
+ return ret;
+}
+
+/*
+ * The test creates the following hierarchy:
+ * A
+ * |
+ * B
+ * |
+ * C
+ *
+ * Then it freezes the cgroups in the order C, B, A.
+ * Then it unfreezes the cgroups in the order A, B, C.
+ * Then it checks that C's freeze time is larger than B's and
+ * that B's is larger than A's.
+ */
+static int test_cgfreezer_time_nested(const char *root)
+{
+ char *cgroup[3] = {0};
+ int ret = KSFT_FAIL;
+ long time[3] = {0};
+ int i;
+
+ cgroup[0] = cg_name(root, "cg_test_time_A");
+ if (!cgroup[0])
+ goto cleanup;
+
+ cgroup[1] = cg_name(cgroup[0], "B");
+ if (!cgroup[1])
+ goto cleanup;
+
+ cgroup[2] = cg_name(cgroup[1], "C");
+ if (!cgroup[2])
+ goto cleanup;
+
+ if (cg_create(cgroup[0]))
+ goto cleanup;
+
+ if (cg_check_freezetime(cgroup[0]) < 0) {
+ ret = KSFT_SKIP;
+ goto cleanup;
+ }
+
+ if (cg_create(cgroup[1]))
+ goto cleanup;
+
+ if (cg_create(cgroup[2]))
+ goto cleanup;
+
+ if (cg_freeze_nowait(cgroup[2], true))
+ goto cleanup;
+
+ if (cg_freeze_nowait(cgroup[1], true))
+ goto cleanup;
+
+ if (cg_freeze_nowait(cgroup[0], true))
+ goto cleanup;
+
+ usleep(1000);
+
+ if (cg_freeze_nowait(cgroup[0], false))
+ goto cleanup;
+
+ if (cg_freeze_nowait(cgroup[1], false))
+ goto cleanup;
+
+ if (cg_freeze_nowait(cgroup[2], false))
+ goto cleanup;
+
+ time[2] = cg_check_freezetime(cgroup[2]);
+ time[1] = cg_check_freezetime(cgroup[1]);
+ time[0] = cg_check_freezetime(cgroup[0]);
+
+ if (time[2] <= time[1]) {
+ debug("Expect C's time (%ld) > B's time (%ld)", time[2], time[1]);
+ goto cleanup;
+ }
+
+ if (time[1] <= time[0]) {
+ debug("Expect B's time (%ld) > A's time (%ld)", time[1], time[0]);
+ goto cleanup;
+ }
+
+ ret = KSFT_PASS;
+
+cleanup:
+ for (i = 2; i >= 0 && cgroup[i]; i--) {
+ cg_destroy(cgroup[i]);
+ free(cgroup[i]);
+ }
+
+ return ret;
+}
+
#define T(x) { x, #x }
struct cgfreezer_test {
int (*fn)(const char *root);
@@ -819,6 +1475,13 @@ struct cgfreezer_test {
T(test_cgfreezer_stopped),
T(test_cgfreezer_ptraced),
T(test_cgfreezer_vfork),
+ T(test_cgfreezer_time_empty),
+ T(test_cgfreezer_time_simple),
+ T(test_cgfreezer_time_populate),
+ T(test_cgfreezer_time_migrate),
+ T(test_cgfreezer_time_parent),
+ T(test_cgfreezer_time_child),
+ T(test_cgfreezer_time_nested),
};
#undef T
diff --git a/tools/testing/selftests/cgroup/test_pids.c b/tools/testing/selftests/cgroup/test_pids.c
index 9ecb83c6cc5c..d8a1d1cd5007 100644
--- a/tools/testing/selftests/cgroup/test_pids.c
+++ b/tools/testing/selftests/cgroup/test_pids.c
@@ -77,6 +77,9 @@ static int test_pids_events(const char *root)
char *cg_parent = NULL, *cg_child = NULL;
int pid;
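+ /* Skip if the kernel does not support the pids_localevents feature. */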
+ if (cgroup_feature("pids_localevents") <= 0)
+ return KSFT_SKIP;
+
cg_parent = cg_name(root, "pids_parent");
cg_child = cg_name(cg_parent, "pids_child");
if (!cg_parent || !cg_child)
diff --git a/tools/testing/selftests/coredump/stackdump_test.c b/tools/testing/selftests/coredump/stackdump_test.c
index 5a5a7a5f7e1d..a4ac80bb1003 100644
--- a/tools/testing/selftests/coredump/stackdump_test.c
+++ b/tools/testing/selftests/coredump/stackdump_test.c
@@ -446,9 +446,6 @@ TEST_F(coredump, socket_detect_userspace_client)
if (info.coredump_mask & PIDFD_COREDUMPED)
goto out;
- if (read(fd_coredump, &c, 1) < 1)
- goto out;
-
exit_code = EXIT_SUCCESS;
out:
if (fd_peer_pidfd >= 0)
diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile
index 5b230deb19e8..9a3499827d4b 100644
--- a/tools/testing/selftests/damon/Makefile
+++ b/tools/testing/selftests/damon/Makefile
@@ -4,6 +4,7 @@
TEST_GEN_FILES += access_memory access_memory_even
TEST_FILES = _damon_sysfs.py
+TEST_FILES += drgn_dump_damon_status.py
# functionality tests
TEST_PROGS += sysfs.sh
diff --git a/tools/testing/selftests/drivers/net/bonding/Makefile b/tools/testing/selftests/drivers/net/bonding/Makefile
index 2b10854e4b1e..44b98f17f8ff 100644
--- a/tools/testing/selftests/drivers/net/bonding/Makefile
+++ b/tools/testing/selftests/drivers/net/bonding/Makefile
@@ -10,7 +10,8 @@ TEST_PROGS := \
mode-2-recovery-updelay.sh \
bond_options.sh \
bond-eth-type-change.sh \
- bond_macvlan_ipvlan.sh
+ bond_macvlan_ipvlan.sh \
+ bond_passive_lacp.sh
TEST_FILES := \
lag_lib.sh \
diff --git a/tools/testing/selftests/drivers/net/bonding/bond_options.sh b/tools/testing/selftests/drivers/net/bonding/bond_options.sh
index 7bc148889ca7..187b478d0ddf 100755
--- a/tools/testing/selftests/drivers/net/bonding/bond_options.sh
+++ b/tools/testing/selftests/drivers/net/bonding/bond_options.sh
@@ -7,6 +7,8 @@ ALL_TESTS="
prio
arp_validate
num_grat_arp
+ fail_over_mac
+ vlan_over_bond
"
lib_dir=$(dirname "$0")
@@ -352,8 +354,8 @@ garp_test()
exp_num=$(echo "${param}" | cut -f6 -d ' ')
active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave")
- slowwait_for_counter $((exp_num + 5)) $exp_num \
- tc_rule_handle_stats_get "dev s${active_slave#eth} ingress" 101 ".packets" "-n ${g_ns}"
+ slowwait_for_counter $((exp_num + 5)) $exp_num tc_rule_handle_stats_get \
+ "dev s${active_slave#eth} ingress" 101 ".packets" "-n ${g_ns}" &> /dev/null
# check result
real_num=$(tc_rule_handle_stats_get "dev s${active_slave#eth} ingress" 101 ".packets" "-n ${g_ns}")
@@ -376,6 +378,197 @@ num_grat_arp()
done
}
+check_all_mac_same()
+{
+ RET=0
+ # all slaves should have the same MAC address (the first port's MAC)
+ local bond_mac=$(ip -n "$s_ns" -j link show bond0 | jq -r '.[]["address"]')
+ local eth0_mac=$(ip -n "$s_ns" -j link show eth0 | jq -r '.[]["address"]')
+ local eth1_mac=$(ip -n "$s_ns" -j link show eth1 | jq -r '.[]["address"]')
+ local eth2_mac=$(ip -n "$s_ns" -j link show eth2 | jq -r '.[]["address"]')
+ if [ "$bond_mac" != "${mac[0]}" ] || [ "$eth0_mac" != "$bond_mac" ] || \
+ [ "$eth1_mac" != "$bond_mac" ] || [ "$eth2_mac" != "$bond_mac" ]; then
+ RET=1
+ fi
+}
+
+check_bond_mac_same_with_first()
+{
+ RET=0
+ # the bond MAC address should be the same as the first added slave's
+ local bond_mac=$(ip -n "$s_ns" -j link show bond0 | jq -r '.[]["address"]')
+ if [ "$bond_mac" != "${mac[0]}" ]; then
+ RET=1
+ fi
+}
+
+check_bond_mac_same_with_active()
+{
+ RET=0
+ # the bond MAC address should be the same as the active slave's
+ local bond_mac=$(ip -n "$s_ns" -j link show bond0 | jq -r '.[]["address"]')
+ local active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave")
+ local active_slave_mac=$(ip -n "$s_ns" -j link show "$active_slave" | jq -r '.[]["address"]')
+ if [ "$bond_mac" != "$active_slave_mac" ]; then
+ RET=1
+ fi
+}
+
+check_backup_slave_mac_not_change()
+{
+ RET=0
+ # backup slave's mac address is not changed
+ if ip -n "$s_ns" -d -j link show type bond_slave | jq -e '.[]
+ | select(.linkinfo.info_slave_data.state=="BACKUP")
+ | select(.address != .linkinfo.info_slave_data.perm_hwaddr)' &> /dev/null; then
+ RET=1
+ fi
+}
+
+check_backup_slave_mac_inherit()
+{
+ local backup_mac
+ RET=0
+
+ # backup slaves should use mac[1] or mac[2]
+ local backup_macs=$(ip -n "$s_ns" -d -j link show type bond_slave | \
+ jq -r '.[] | select(.linkinfo.info_slave_data.state=="BACKUP") | .address')
+ for backup_mac in $backup_macs; do
+ if [ "$backup_mac" != "${mac[1]}" ] && [ "$backup_mac" != "${mac[2]}" ]; then
+ RET=1
+ fi
+ done
+}
+
+check_first_slave_random_mac()
+{
+ RET=0
+ # remove the first added slave and add it back
+ ip -n "$s_ns" link set eth0 nomaster
+ ip -n "$s_ns" link set eth0 master bond0
+
+ # the first slave should use a random MAC address
+ eth0_mac=$(ip -n "$s_ns" -j link show eth0 | jq -r '.[]["address"]')
+ [ "$eth0_mac" = "${mac[0]}" ] && RET=1
+ log_test "bond fail_over_mac follow" "random first slave mac"
+
+ # remove the first slave; the permanent MAC address should be restored
+ ip -n "$s_ns" link set eth0 nomaster
+ eth0_mac=$(ip -n "$s_ns" -j link show eth0 | jq -r '.[]["address"]')
+ [ "$eth0_mac" != "${mac[0]}" ] && RET=1
+}
+
+do_active_backup_failover()
+{
+ local active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave")
+ ip -n ${s_ns} link set ${active_slave} down
+ slowwait 2 active_slave_changed $active_slave
+ ip -n ${s_ns} link set ${active_slave} up
+}
+
+fail_over_mac()
+{
+ # Bring down the first interface on the switch to force the bond to
+ # select another active interface instead of the first one that joined.
+ ip -n "$g_ns" link set s0 down
+
+ # fail_over_mac none
+ bond_reset "mode active-backup miimon 100 fail_over_mac 0"
+ check_all_mac_same
+ log_test "fail_over_mac 0" "all slaves have same mac"
+ do_active_backup_failover
+ check_all_mac_same
+ log_test "fail_over_mac 0" "failover: all slaves have same mac"
+
+ # fail_over_mac active
+ bond_reset "mode active-backup miimon 100 fail_over_mac 1"
+ check_bond_mac_same_with_active
+ log_test "fail_over_mac 1" "bond mac is same with active slave mac"
+ check_backup_slave_mac_not_change
+ log_test "fail_over_mac 1" "backup slave mac is not changed"
+ do_active_backup_failover
+ check_bond_mac_same_with_active
+ log_test "fail_over_mac 1" "failover: bond mac is same with active slave mac"
+ check_backup_slave_mac_not_change
+ log_test "fail_over_mac 1" "failover: backup slave mac is not changed"
+
+ # fail_over_mac follow
+ bond_reset "mode active-backup miimon 100 fail_over_mac 2"
+ check_bond_mac_same_with_first
+ log_test "fail_over_mac 2" "bond mac is same with first slave mac"
+ check_bond_mac_same_with_active
+ log_test "fail_over_mac 2" "bond mac is same with active slave mac"
+ check_backup_slave_mac_inherit
+ log_test "fail_over_mac 2" "backup slave mac inherit"
+ do_active_backup_failover
+ check_bond_mac_same_with_first
+ log_test "fail_over_mac 2" "failover: bond mac is same with first slave mac"
+ check_bond_mac_same_with_active
+ log_test "fail_over_mac 2" "failover: bond mac is same with active slave mac"
+ check_backup_slave_mac_inherit
+ log_test "fail_over_mac 2" "failover: backup slave mac inherit"
+ check_first_slave_random_mac
+ log_test "fail_over_mac 2" "first slave mac random"
+}
+
+vlan_over_bond_arp()
+{
+ local mode="$1"
+ RET=0
+
+ bond_reset "mode $mode arp_interval 100 arp_ip_target 192.0.3.10"
+ ip -n "${s_ns}" link add bond0.3 link bond0 type vlan id 3
+ ip -n "${s_ns}" link set bond0.3 up
+ ip -n "${s_ns}" addr add 192.0.3.1/24 dev bond0.3
+ ip -n "${s_ns}" addr add 2001:db8::3:1/64 dev bond0.3
+
+ slowwait_for_counter 5 5 tc_rule_handle_stats_get \
+ "dev eth0.3 ingress" 101 ".packets" "-n ${c_ns}" &> /dev/null || RET=1
+ log_test "vlan over bond arp" "$mode"
+}
+
+vlan_over_bond_ns()
+{
+ local mode="$1"
+ RET=0
+
+ if skip_ns; then
+ log_test_skip "vlan_over_bond ns" "$mode"
+ return 0
+ fi
+
+ bond_reset "mode $mode arp_interval 100 ns_ip6_target 2001:db8::3:10"
+ ip -n "${s_ns}" link add bond0.3 link bond0 type vlan id 3
+ ip -n "${s_ns}" link set bond0.3 up
+ ip -n "${s_ns}" addr add 192.0.3.1/24 dev bond0.3
+ ip -n "${s_ns}" addr add 2001:db8::3:1/64 dev bond0.3
+
+ slowwait_for_counter 5 5 tc_rule_handle_stats_get \
+ "dev eth0.3 ingress" 102 ".packets" "-n ${c_ns}" &> /dev/null || RET=1
+ log_test "vlan over bond ns" "$mode"
+}
+
+vlan_over_bond()
+{
+ # add vlan 3 for client
+ ip -n "${c_ns}" link add eth0.3 link eth0 type vlan id 3
+ ip -n "${c_ns}" link set eth0.3 up
+ ip -n "${c_ns}" addr add 192.0.3.10/24 dev eth0.3
+ ip -n "${c_ns}" addr add 2001:db8::3:10/64 dev eth0.3
+
+ # Add tc rules to check the VLAN packets
+ tc -n "${c_ns}" qdisc add dev eth0.3 clsact
+ tc -n "${c_ns}" filter add dev eth0.3 ingress protocol arp \
+ handle 101 flower skip_hw arp_op request \
+ arp_sip 192.0.3.1 arp_tip 192.0.3.10 action pass
+ tc -n "${c_ns}" filter add dev eth0.3 ingress protocol ipv6 \
+ handle 102 flower skip_hw ip_proto icmpv6 \
+ type 135 src_ip 2001:db8::3:1 action pass
+
+ vlan_over_bond_arp "active-backup"
+ vlan_over_bond_ns "active-backup"
+}
+
trap cleanup EXIT
setup_prepare
diff --git a/tools/testing/selftests/drivers/net/bonding/bond_passive_lacp.sh b/tools/testing/selftests/drivers/net/bonding/bond_passive_lacp.sh
new file mode 100755
index 000000000000..9c3b089813df
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/bonding/bond_passive_lacp.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test if a bond interface works with lacp_active=off.
+
+# shellcheck disable=SC2034
+REQUIRE_MZ=no
+NUM_NETIFS=0
+lib_dir=$(dirname "$0")
+# shellcheck disable=SC1091
+source "$lib_dir"/../../../net/forwarding/lib.sh
+
+# shellcheck disable=SC2317
+check_port_state()
+{
+ local netns=$1
+ local port=$2
+ local state=$3
+
+ ip -n "${netns}" -d -j link show "$port" | \
+ jq -e ".[].linkinfo.info_slave_data.ad_actor_oper_port_state_str | index(\"${state}\") != null" > /dev/null
+}
+
+check_pkt_count()
+{
+ RET=0
+ local ns="$1"
+ local iface="$2"
+
+ # wait up to 65s for 2 LACPDUs (one per 30s)
+ slowwait_for_counter 65 2 tc_rule_handle_stats_get \
+ "dev ${iface} egress" 101 ".packets" "-n ${ns}" &> /dev/null
+}
+
+setup() {
+ setup_ns c_ns s_ns
+
+ # shellcheck disable=SC2154
+ ip -n "${c_ns}" link add eth0 type veth peer name eth0 netns "${s_ns}"
+ ip -n "${c_ns}" link add eth1 type veth peer name eth1 netns "${s_ns}"
+
+ # Add tc filter to count the pkts
+ tc -n "${c_ns}" qdisc add dev eth0 clsact
+ tc -n "${c_ns}" filter add dev eth0 egress handle 101 protocol 0x8809 matchall action pass
+ tc -n "${s_ns}" qdisc add dev eth1 clsact
+ tc -n "${s_ns}" filter add dev eth1 egress handle 101 protocol 0x8809 matchall action pass
+
+ ip -n "${s_ns}" link add bond0 type bond mode 802.3ad lacp_active on lacp_rate fast
+ ip -n "${s_ns}" link set eth0 master bond0
+ ip -n "${s_ns}" link set eth1 master bond0
+
+ ip -n "${c_ns}" link add bond0 type bond mode 802.3ad lacp_active off lacp_rate fast
+ ip -n "${c_ns}" link set eth0 master bond0
+ ip -n "${c_ns}" link set eth1 master bond0
+
+}
+
+trap cleanup_all_ns EXIT
+setup
+
+# The bond will send 2 LACPDU packets during init time, so wait at least 2s
+# after bringing the interface up.
+ip -n "${c_ns}" link set bond0 up
+sleep 2
+
+# 1. The passive side shouldn't send LACPDU.
+check_pkt_count "${c_ns}" "eth0" && RET=1
+log_test "802.3ad lacp_active off" "init port"
+
+ip -n "${s_ns}" link set bond0 up
+# 2. The passive side should not have the 'active' flag.
+RET=0
+slowwait 2 check_port_state "${c_ns}" "eth0" "active" && RET=1
+log_test "802.3ad lacp_active off" "port state active"
+
+# 3. The active side should have the 'active' flag.
+RET=0
+slowwait 2 check_port_state "${s_ns}" "eth0" "active" || RET=1
+log_test "802.3ad lacp_active on" "port state active"
+
+# 4. Make sure the connection is not expired.
+RET=0
+slowwait 5 check_port_state "${s_ns}" "eth0" "distributing"
+slowwait 10 check_port_state "${s_ns}" "eth0" "expired" && RET=1
+log_test "bond 802.3ad lacp_active off" "port connection"
+
+# After testing, disconnect one port on each side to check the state.
+ip -n "${s_ns}" link set eth0 nomaster
+ip -n "${s_ns}" link set eth0 up
+ip -n "${c_ns}" link set eth1 nomaster
+ip -n "${c_ns}" link set eth1 up
+# Due to Periodic Machine and Rx Machine state changes, the bond will still
+# send LACPDU packets for a few seconds. Sleep at least 5s to make sure
+# negotiation has finished.
+sleep 5
+
+# 5. The active side should keep sending LACPDU.
+check_pkt_count "${s_ns}" "eth1" || RET=1
+log_test "bond 802.3ad lacp_active on" "port pkt after disconnect"
+
+# 6. The passive side shouldn't send LACPDU anymore.
+check_pkt_count "${c_ns}" "eth0" && RET=1
+log_test "bond 802.3ad lacp_active off" "port pkt after disconnect"
+
+exit "$EXIT_STATUS"
diff --git a/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh b/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh
index 195ef83cfbf1..167aa4a4a12a 100644
--- a/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh
+++ b/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh
@@ -39,6 +39,8 @@ g_ip4="192.0.2.254"
s_ip6="2001:db8::1"
c_ip6="2001:db8::10"
g_ip6="2001:db8::254"
+mac[0]="00:0a:0b:0c:0d:01"
+mac[1]="00:0a:0b:0c:0d:02"
gateway_create()
{
@@ -62,6 +64,7 @@ server_create()
for i in $(seq 0 1); do
ip -n ${s_ns} link add eth${i} type veth peer name s${i} netns ${g_ns}
+ ip -n "${s_ns}" link set "eth${i}" addr "${mac[$i]}"
ip -n ${g_ns} link set s${i} up
ip -n ${g_ns} link set s${i} master br0
diff --git a/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh b/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh
index 3a1333d9a85b..23a2932301cc 100644
--- a/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh
+++ b/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh
@@ -26,6 +26,7 @@
# +-------------------------------------+
source bond_topo_2d1c.sh
+mac[2]="00:0a:0b:0c:0d:03"
setup_prepare()
{
@@ -36,6 +37,7 @@ setup_prepare()
# Add the extra device as we use 3 down links for bond0
local i=2
ip -n ${s_ns} link add eth${i} type veth peer name s${i} netns ${g_ns}
+ ip -n "${s_ns}" link set "eth${i}" addr "${mac[$i]}"
ip -n ${g_ns} link set s${i} up
ip -n ${g_ns} link set s${i} master br0
ip -n ${s_ns} link set eth${i} master bond0
diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config
index dad4e5fda4db..832fa1caeb66 100644
--- a/tools/testing/selftests/drivers/net/bonding/config
+++ b/tools/testing/selftests/drivers/net/bonding/config
@@ -6,6 +6,8 @@ CONFIG_MACVLAN=y
CONFIG_IPVLAN=y
CONFIG_NET_ACT_GACT=y
CONFIG_NET_CLS_FLOWER=y
+CONFIG_NET_CLS_MATCHALL=m
CONFIG_NET_SCH_INGRESS=y
CONFIG_NLMON=y
CONFIG_VETH=y
+CONFIG_VLAN_8021Q=m
diff --git a/tools/testing/selftests/drivers/net/hw/csum.py b/tools/testing/selftests/drivers/net/hw/csum.py
index cd23af875317..3e3a89a34afe 100755
--- a/tools/testing/selftests/drivers/net/hw/csum.py
+++ b/tools/testing/selftests/drivers/net/hw/csum.py
@@ -17,7 +17,7 @@ def test_receive(cfg, ipver="6", extra_args=None):
ip_args = f"-{ipver} -S {cfg.remote_addr_v[ipver]} -D {cfg.addr_v[ipver]}"
rx_cmd = f"{cfg.bin_local} -i {cfg.ifname} -n 100 {ip_args} -r 1 -R {extra_args}"
- tx_cmd = f"{cfg.bin_remote} -i {cfg.ifname} -n 100 {ip_args} -r 1 -T {extra_args}"
+ tx_cmd = f"{cfg.bin_remote} -i {cfg.remote_ifname} -n 100 {ip_args} -r 1 -T {extra_args}"
with bkg(rx_cmd, exit_wait=True):
wait_port_listen(34000, proto="udp")
@@ -37,7 +37,7 @@ def test_transmit(cfg, ipver="6", extra_args=None):
if extra_args != "-U -Z":
extra_args += " -r 1"
- rx_cmd = f"{cfg.bin_remote} -i {cfg.ifname} -L 1 -n 100 {ip_args} -R {extra_args}"
+ rx_cmd = f"{cfg.bin_remote} -i {cfg.remote_ifname} -L 1 -n 100 {ip_args} -R {extra_args}"
tx_cmd = f"{cfg.bin_local} -i {cfg.ifname} -L 1 -n 100 {ip_args} -T {extra_args}"
with bkg(rx_cmd, host=cfg.remote, exit_wait=True):
diff --git a/tools/testing/selftests/drivers/net/napi_threaded.py b/tools/testing/selftests/drivers/net/napi_threaded.py
index b2698db39817..9699a100a87d 100755
--- a/tools/testing/selftests/drivers/net/napi_threaded.py
+++ b/tools/testing/selftests/drivers/net/napi_threaded.py
@@ -35,6 +35,8 @@ def _setup_deferred_cleanup(cfg) -> None:
threaded = cmd(f"cat /sys/class/net/{cfg.ifname}/threaded").stdout
defer(_set_threaded_state, cfg, threaded)
+ return combined
+
def enable_dev_threaded_disable_napi_threaded(cfg, nl) -> None:
"""
@@ -49,7 +51,7 @@ def enable_dev_threaded_disable_napi_threaded(cfg, nl) -> None:
napi0_id = napis[0]['id']
napi1_id = napis[1]['id']
- _setup_deferred_cleanup(cfg)
+ qcnt = _setup_deferred_cleanup(cfg)
# set threaded
_set_threaded_state(cfg, 1)
@@ -62,7 +64,7 @@ def enable_dev_threaded_disable_napi_threaded(cfg, nl) -> None:
nl.napi_set({'id': napi1_id, 'threaded': 'disabled'})
cmd(f"ethtool -L {cfg.ifname} combined 1")
- cmd(f"ethtool -L {cfg.ifname} combined 2")
+ cmd(f"ethtool -L {cfg.ifname} combined {qcnt}")
_assert_napi_threaded_enabled(nl, napi0_id)
_assert_napi_threaded_disabled(nl, napi1_id)
@@ -80,7 +82,7 @@ def change_num_queues(cfg, nl) -> None:
napi0_id = napis[0]['id']
napi1_id = napis[1]['id']
- _setup_deferred_cleanup(cfg)
+ qcnt = _setup_deferred_cleanup(cfg)
# set threaded
_set_threaded_state(cfg, 1)
@@ -90,7 +92,7 @@ def change_num_queues(cfg, nl) -> None:
_assert_napi_threaded_enabled(nl, napi1_id)
cmd(f"ethtool -L {cfg.ifname} combined 1")
- cmd(f"ethtool -L {cfg.ifname} combined 2")
+ cmd(f"ethtool -L {cfg.ifname} combined {qcnt}")
# check napi threaded is set for both napis
_assert_napi_threaded_enabled(nl, napi0_id)
diff --git a/tools/testing/selftests/filesystems/.gitignore b/tools/testing/selftests/filesystems/.gitignore
index fcbdb1297e24..64ac0dfa46b7 100644
--- a/tools/testing/selftests/filesystems/.gitignore
+++ b/tools/testing/selftests/filesystems/.gitignore
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
dnotify_test
devpts_pts
+fclog
file_stressor
anon_inode_test
kernfs_test
diff --git a/tools/testing/selftests/filesystems/Makefile b/tools/testing/selftests/filesystems/Makefile
index 73d4650af1a5..85427d7f19b9 100644
--- a/tools/testing/selftests/filesystems/Makefile
+++ b/tools/testing/selftests/filesystems/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
CFLAGS += $(KHDR_INCLUDES)
-TEST_GEN_PROGS := devpts_pts file_stressor anon_inode_test kernfs_test
+TEST_GEN_PROGS := devpts_pts file_stressor anon_inode_test kernfs_test fclog
TEST_GEN_PROGS_EXTENDED := dnotify_test
include ../lib.mk
diff --git a/tools/testing/selftests/filesystems/fclog.c b/tools/testing/selftests/filesystems/fclog.c
new file mode 100644
index 000000000000..912a8b755c3b
--- /dev/null
+++ b/tools/testing/selftests/filesystems/fclog.c
@@ -0,0 +1,130 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Author: Aleksa Sarai <cyphar@cyphar.com>
+ * Copyright (C) 2025 SUSE LLC.
+ */
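+
+/* Tests for reading the filesystem-context error log via read() on an fsopen() fd. */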
+
+#include <assert.h>
+#include <errno.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mount.h>
+
+#include "../kselftest_harness.h"
+
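+/*
+ * ASSERT_ERRNO_EQ(-ERRNO, expr) asserts that expr fails with the given errno;
+ * ASSERT_SUCCESS(expr) asserts that expr returns a non-negative value.
+ */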
+#define ASSERT_ERRNO(expected, _t, seen) \
+ __EXPECT(expected, #expected, \
+ ({__typeof__(seen) _tmp_seen = (seen); \
+ _tmp_seen >= 0 ? _tmp_seen : -errno; }), #seen, _t, 1)
+
+#define ASSERT_ERRNO_EQ(expected, seen) \
+ ASSERT_ERRNO(expected, ==, seen)
+
+#define ASSERT_SUCCESS(seen) \
+ ASSERT_ERRNO(0, <=, seen)
+
+FIXTURE(ns)
+{
+ int host_mntns;
+};
+
+FIXTURE_SETUP(ns)
+{
+ /* Stash the old mntns. */
+ self->host_mntns = open("/proc/self/ns/mnt", O_RDONLY|O_CLOEXEC);
+ ASSERT_SUCCESS(self->host_mntns);
+
+ /* Create a new mount namespace and make it private. */
+ ASSERT_SUCCESS(unshare(CLONE_NEWNS));
+ ASSERT_SUCCESS(mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL));
+}
+
+FIXTURE_TEARDOWN(ns)
+{
+ ASSERT_SUCCESS(setns(self->host_mntns, CLONE_NEWNS));
+ ASSERT_SUCCESS(close(self->host_mntns));
+}
+
+TEST_F(ns, fscontext_log_enodata)
+{
+ int fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC);
+ ASSERT_SUCCESS(fsfd);
+
+ /* A brand new fscontext has no log entries. */
+ char buf[128] = {};
+ for (int i = 0; i < 16; i++)
+ ASSERT_ERRNO_EQ(-ENODATA, read(fsfd, buf, sizeof(buf)));
+
+ ASSERT_SUCCESS(close(fsfd));
+}
+
+TEST_F(ns, fscontext_log_errorfc)
+{
+ int fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC);
+ ASSERT_SUCCESS(fsfd);
+
+ ASSERT_ERRNO_EQ(-EINVAL, fsconfig(fsfd, FSCONFIG_SET_STRING, "invalid-arg", "123", 0));
+
+ char buf[128] = {};
+ ASSERT_SUCCESS(read(fsfd, buf, sizeof(buf)));
+ EXPECT_STREQ("e tmpfs: Unknown parameter 'invalid-arg'\n", buf);
+
+ /* The message has been consumed. */
+ ASSERT_ERRNO_EQ(-ENODATA, read(fsfd, buf, sizeof(buf)));
+ ASSERT_SUCCESS(close(fsfd));
+}
+
+TEST_F(ns, fscontext_log_errorfc_after_fsmount)
+{
+ int fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC);
+ ASSERT_SUCCESS(fsfd);
+
+ ASSERT_ERRNO_EQ(-EINVAL, fsconfig(fsfd, FSCONFIG_SET_STRING, "invalid-arg", "123", 0));
+
+ ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0));
+ int mfd = fsmount(fsfd, FSMOUNT_CLOEXEC, MOUNT_ATTR_NOEXEC | MOUNT_ATTR_NOSUID);
+ ASSERT_SUCCESS(mfd);
+ ASSERT_SUCCESS(move_mount(mfd, "", AT_FDCWD, "/tmp", MOVE_MOUNT_F_EMPTY_PATH));
+
+ /*
+ * The fscontext log should still contain data even after
+ * FSCONFIG_CMD_CREATE and fsmount().
+ */
+ char buf[128] = {};
+ ASSERT_SUCCESS(read(fsfd, buf, sizeof(buf)));
+ EXPECT_STREQ("e tmpfs: Unknown parameter 'invalid-arg'\n", buf);
+
+ /* The message has been consumed. */
+ ASSERT_ERRNO_EQ(-ENODATA, read(fsfd, buf, sizeof(buf)));
+ ASSERT_SUCCESS(close(fsfd));
+}
+
+TEST_F(ns, fscontext_log_emsgsize)
+{
+ int fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC);
+ ASSERT_SUCCESS(fsfd);
+
+ ASSERT_ERRNO_EQ(-EINVAL, fsconfig(fsfd, FSCONFIG_SET_STRING, "invalid-arg", "123", 0));
+
+ char buf[128] = {};
+ /*
+ * Attempting to read a message with too small a buffer should not
+ * result in the message getting consumed.
+ */
+ ASSERT_ERRNO_EQ(-EMSGSIZE, read(fsfd, buf, 0));
+ ASSERT_ERRNO_EQ(-EMSGSIZE, read(fsfd, buf, 1));
+ for (int i = 0; i < 16; i++)
+ ASSERT_ERRNO_EQ(-EMSGSIZE, read(fsfd, buf, 16));
+
+ ASSERT_SUCCESS(read(fsfd, buf, sizeof(buf)));
+ EXPECT_STREQ("e tmpfs: Unknown parameter 'invalid-arg'\n", buf);
+
+ /* The message has been consumed. */
+ ASSERT_ERRNO_EQ(-ENODATA, read(fsfd, buf, sizeof(buf)));
+ ASSERT_SUCCESS(close(fsfd));
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c
index 63ce708d93ed..e4b7c2b457ee 100644
--- a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c
+++ b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c
@@ -2,6 +2,13 @@
// Copyright (c) 2025 Miklos Szeredi <miklos@szeredi.hu>
#define _GNU_SOURCE
+
+// Needed for linux/fanotify.h
+typedef struct {
+ int val[2];
+} __kernel_fsid_t;
+#define __kernel_fsid_t __kernel_fsid_t
+
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
@@ -10,20 +17,12 @@
#include <sys/mount.h>
#include <unistd.h>
#include <sys/syscall.h>
+#include <sys/fanotify.h>
#include "../../kselftest_harness.h"
#include "../statmount/statmount.h"
#include "../utils.h"
-// Needed for linux/fanotify.h
-#ifndef __kernel_fsid_t
-typedef struct {
- int val[2];
-} __kernel_fsid_t;
-#endif
-
-#include <sys/fanotify.h>
-
static const char root_mntpoint_templ[] = "/tmp/mount-notify_test_root.XXXXXX";
static const int mark_cmds[] = {
diff --git a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c
index 090a5ca65004..9f57ca46e3af 100644
--- a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c
+++ b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c
@@ -2,6 +2,13 @@
// Copyright (c) 2025 Miklos Szeredi <miklos@szeredi.hu>
#define _GNU_SOURCE
+
+// Needed for linux/fanotify.h
+typedef struct {
+ int val[2];
+} __kernel_fsid_t;
+#define __kernel_fsid_t __kernel_fsid_t
+
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
@@ -10,21 +17,12 @@
#include <sys/mount.h>
#include <unistd.h>
#include <sys/syscall.h>
+#include <sys/fanotify.h>
#include "../../kselftest_harness.h"
-#include "../../pidfd/pidfd.h"
#include "../statmount/statmount.h"
#include "../utils.h"
-// Needed for linux/fanotify.h
-#ifndef __kernel_fsid_t
-typedef struct {
- int val[2];
-} __kernel_fsid_t;
-#endif
-
-#include <sys/fanotify.h>
-
static const char root_mntpoint_templ[] = "/tmp/mount-notify_test_root.XXXXXX";
static const int mark_types[] = {
diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c
index 651fc9f13c08..45c14323a618 100644
--- a/tools/testing/selftests/iommu/iommufd_fail_nth.c
+++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c
@@ -113,7 +113,7 @@ static bool fail_nth_next(struct __test_metadata *_metadata,
* necessarily mean a test failure, just that the limit has to be made
* bigger.
*/
- ASSERT_GT(400, nth_state->iteration);
+ ASSERT_GT(1000, nth_state->iteration);
if (nth_state->iteration != 0) {
ssize_t res;
ssize_t res2;
diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h
index 2925e47db995..8516e8434bc4 100644
--- a/tools/testing/selftests/kselftest_harness.h
+++ b/tools/testing/selftests/kselftest_harness.h
@@ -751,7 +751,7 @@
for (; _metadata->trigger; _metadata->trigger = \
__bail(_assert, _metadata))
-#define is_signed_type(var) (!!(((__typeof__(var))(-1)) < (__typeof__(var))1))
+#define is_signed_var(var) (!!(((__typeof__(var))(-1)) < (__typeof__(var))1))
#define __EXPECT(_expected, _expected_str, _seen, _seen_str, _t, _assert) do { \
/* Avoid multiple evaluation of the cases */ \
@@ -759,7 +759,7 @@
__typeof__(_seen) __seen = (_seen); \
if (!(__exp _t __seen)) { \
/* Report with actual signedness to avoid weird output. */ \
- switch (is_signed_type(__exp) * 2 + is_signed_type(__seen)) { \
+ switch (is_signed_var(__exp) * 2 + is_signed_var(__seen)) { \
case 0: { \
uintmax_t __exp_print = (uintmax_t)__exp; \
uintmax_t __seen_print = (uintmax_t)__seen; \
diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index f6fe7a07a0a2..41b40c676d7f 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -169,6 +169,7 @@ TEST_GEN_PROGS_arm64 += arm64/vgic_irq
TEST_GEN_PROGS_arm64 += arm64/vgic_lpi_stress
TEST_GEN_PROGS_arm64 += arm64/vpmu_counter_access
TEST_GEN_PROGS_arm64 += arm64/no-vgic-v3
+TEST_GEN_PROGS_arm64 += arm64/kvm-uuid
TEST_GEN_PROGS_arm64 += access_tracking_perf_test
TEST_GEN_PROGS_arm64 += arch_timer
TEST_GEN_PROGS_arm64 += coalesced_io_test
diff --git a/tools/testing/selftests/kvm/arm64/aarch32_id_regs.c b/tools/testing/selftests/kvm/arm64/aarch32_id_regs.c
index cef8f7323ceb..713005b6f508 100644
--- a/tools/testing/selftests/kvm/arm64/aarch32_id_regs.c
+++ b/tools/testing/selftests/kvm/arm64/aarch32_id_regs.c
@@ -146,7 +146,7 @@ static bool vcpu_aarch64_only(struct kvm_vcpu *vcpu)
val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1));
- el0 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), val);
+ el0 = FIELD_GET(ID_AA64PFR0_EL1_EL0, val);
return el0 == ID_AA64PFR0_EL1_EL0_IMP;
}
diff --git a/tools/testing/selftests/kvm/arm64/debug-exceptions.c b/tools/testing/selftests/kvm/arm64/debug-exceptions.c
index e34963956fbc..1d431de8729c 100644
--- a/tools/testing/selftests/kvm/arm64/debug-exceptions.c
+++ b/tools/testing/selftests/kvm/arm64/debug-exceptions.c
@@ -116,12 +116,12 @@ static void reset_debug_state(void)
/* Reset all bcr/bvr/wcr/wvr registers */
dfr0 = read_sysreg(id_aa64dfr0_el1);
- brps = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_BRPs), dfr0);
+ brps = FIELD_GET(ID_AA64DFR0_EL1_BRPs, dfr0);
for (i = 0; i <= brps; i++) {
write_dbgbcr(i, 0);
write_dbgbvr(i, 0);
}
- wrps = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_WRPs), dfr0);
+ wrps = FIELD_GET(ID_AA64DFR0_EL1_WRPs, dfr0);
for (i = 0; i <= wrps; i++) {
write_dbgwcr(i, 0);
write_dbgwvr(i, 0);
@@ -418,7 +418,7 @@ static void guest_code_ss(int test_cnt)
static int debug_version(uint64_t id_aa64dfr0)
{
- return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), id_aa64dfr0);
+ return FIELD_GET(ID_AA64DFR0_EL1_DebugVer, id_aa64dfr0);
}
static void test_guest_debug_exceptions(uint8_t bpn, uint8_t wpn, uint8_t ctx_bpn)
@@ -539,14 +539,14 @@ void test_guest_debug_exceptions_all(uint64_t aa64dfr0)
int b, w, c;
/* Number of breakpoints */
- brp_num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_BRPs), aa64dfr0) + 1;
+ brp_num = FIELD_GET(ID_AA64DFR0_EL1_BRPs, aa64dfr0) + 1;
__TEST_REQUIRE(brp_num >= 2, "At least two breakpoints are required");
/* Number of watchpoints */
- wrp_num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_WRPs), aa64dfr0) + 1;
+ wrp_num = FIELD_GET(ID_AA64DFR0_EL1_WRPs, aa64dfr0) + 1;
/* Number of context aware breakpoints */
- ctx_brp_num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_CTX_CMPs), aa64dfr0) + 1;
+ ctx_brp_num = FIELD_GET(ID_AA64DFR0_EL1_CTX_CMPs, aa64dfr0) + 1;
pr_debug("%s brp_num:%d, wrp_num:%d, ctx_brp_num:%d\n", __func__,
brp_num, wrp_num, ctx_brp_num);
diff --git a/tools/testing/selftests/kvm/arm64/kvm-uuid.c b/tools/testing/selftests/kvm/arm64/kvm-uuid.c
new file mode 100644
index 000000000000..af9581b860f1
--- /dev/null
+++ b/tools/testing/selftests/kvm/arm64/kvm-uuid.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0
+
+// Check that nobody has tampered with KVM's UID
+
+#include <errno.h>
+#include <linux/arm-smccc.h>
+#include <asm/kvm.h>
+#include <kvm_util.h>
+
+#include "processor.h"
+
+/*
+ * Do NOT redefine these constants, or try to replace them with some
+ * "common" version. They are hardcoded here to detect any potential
+ * breakage happening in the rest of the kernel.
+ *
+ * KVM UID value: 28b46fb6-2ec5-11e9-a9ca-4b564d003a74
+ */
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0 0xb66fb428U
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1 0xe911c52eU
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2 0x564bcaa9U
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3 0x743a004dU
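+/* Each UID register holds 4 bytes of the UUID above in little-endian byte order. */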
+
+static void guest_code(void)
+{
+ struct arm_smccc_res res = {};
+
+ smccc_hvc(ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID, 0, 0, 0, 0, 0, 0, 0, &res);
+
+ __GUEST_ASSERT(res.a0 == ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0 &&
+ res.a1 == ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1 &&
+ res.a2 == ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2 &&
+ res.a3 == ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3,
+ "Unexpected KVM-specific UID %lx %lx %lx %lx\n", res.a0, res.a1, res.a2, res.a3);
+ GUEST_DONE();
+}
+
+int main (int argc, char *argv[])
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+ struct ucall uc;
+ bool guest_done = false;
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+
+ while (!guest_done) {
+ vcpu_run(vcpu);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_SYNC:
+ break;
+ case UCALL_DONE:
+ guest_done = true;
+ break;
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ break;
+ case UCALL_PRINTF:
+ printf("%s", uc.buffer);
+ break;
+ default:
+ TEST_FAIL("Unexpected guest exit");
+ }
+ }
+
+ kvm_vm_free(vm);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/arm64/no-vgic-v3.c b/tools/testing/selftests/kvm/arm64/no-vgic-v3.c
index ebd70430c89d..f222538e6084 100644
--- a/tools/testing/selftests/kvm/arm64/no-vgic-v3.c
+++ b/tools/testing/selftests/kvm/arm64/no-vgic-v3.c
@@ -54,7 +54,7 @@ static void guest_code(void)
* Check that we advertise that ID_AA64PFR0_EL1.GIC == 0, having
* hidden the feature at runtime without any other userspace action.
*/
- __GUEST_ASSERT(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC),
+ __GUEST_ASSERT(FIELD_GET(ID_AA64PFR0_EL1_GIC,
read_sysreg(id_aa64pfr0_el1)) == 0,
"GICv3 wrongly advertised");
@@ -165,7 +165,7 @@ int main(int argc, char *argv[])
vm = vm_create_with_one_vcpu(&vcpu, NULL);
pfr0 = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1));
- __TEST_REQUIRE(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), pfr0),
+ __TEST_REQUIRE(FIELD_GET(ID_AA64PFR0_EL1_GIC, pfr0),
"GICv3 not supported.");
kvm_vm_free(vm);
diff --git a/tools/testing/selftests/kvm/arm64/page_fault_test.c b/tools/testing/selftests/kvm/arm64/page_fault_test.c
index dc6559dad9d8..4ccbd389d133 100644
--- a/tools/testing/selftests/kvm/arm64/page_fault_test.c
+++ b/tools/testing/selftests/kvm/arm64/page_fault_test.c
@@ -95,14 +95,14 @@ static bool guest_check_lse(void)
uint64_t isar0 = read_sysreg(id_aa64isar0_el1);
uint64_t atomic;
- atomic = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_ATOMIC), isar0);
+ atomic = FIELD_GET(ID_AA64ISAR0_EL1_ATOMIC, isar0);
return atomic >= 2;
}
static bool guest_check_dc_zva(void)
{
uint64_t dczid = read_sysreg(dczid_el0);
- uint64_t dzp = FIELD_GET(ARM64_FEATURE_MASK(DCZID_EL0_DZP), dczid);
+ uint64_t dzp = FIELD_GET(DCZID_EL0_DZP, dczid);
return dzp == 0;
}
@@ -195,7 +195,7 @@ static bool guest_set_ha(void)
uint64_t hadbs, tcr;
/* Skip if HA is not supported. */
- hadbs = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HAFDBS), mmfr1);
+ hadbs = FIELD_GET(ID_AA64MMFR1_EL1_HAFDBS, mmfr1);
if (hadbs == 0)
return false;
diff --git a/tools/testing/selftests/kvm/arm64/set_id_regs.c b/tools/testing/selftests/kvm/arm64/set_id_regs.c
index d3bf9204409c..189321e96925 100644
--- a/tools/testing/selftests/kvm/arm64/set_id_regs.c
+++ b/tools/testing/selftests/kvm/arm64/set_id_regs.c
@@ -243,6 +243,7 @@ static void guest_code(void)
GUEST_REG_SYNC(SYS_ID_AA64MMFR0_EL1);
GUEST_REG_SYNC(SYS_ID_AA64MMFR1_EL1);
GUEST_REG_SYNC(SYS_ID_AA64MMFR2_EL1);
+ GUEST_REG_SYNC(SYS_ID_AA64MMFR3_EL1);
GUEST_REG_SYNC(SYS_ID_AA64ZFR0_EL1);
GUEST_REG_SYNC(SYS_CTR_EL0);
GUEST_REG_SYNC(SYS_MIDR_EL1);
@@ -594,8 +595,8 @@ static void test_user_set_mte_reg(struct kvm_vcpu *vcpu)
*/
val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1));
- mte = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE), val);
- mte_frac = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE_frac), val);
+ mte = FIELD_GET(ID_AA64PFR1_EL1_MTE, val);
+ mte_frac = FIELD_GET(ID_AA64PFR1_EL1_MTE_frac, val);
if (mte != ID_AA64PFR1_EL1_MTE_MTE2 ||
mte_frac != ID_AA64PFR1_EL1_MTE_frac_NI) {
ksft_test_result_skip("MTE_ASYNC or MTE_ASYMM are supported, nothing to test\n");
@@ -612,7 +613,7 @@ static void test_user_set_mte_reg(struct kvm_vcpu *vcpu)
}
val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1));
- mte_frac = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE_frac), val);
+ mte_frac = FIELD_GET(ID_AA64PFR1_EL1_MTE_frac, val);
if (mte_frac == ID_AA64PFR1_EL1_MTE_frac_NI)
ksft_test_result_pass("ID_AA64PFR1_EL1.MTE_frac=0 accepted and still 0xF\n");
else
@@ -774,7 +775,7 @@ int main(void)
/* Check for AARCH64 only system */
val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1));
- el0 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), val);
+ el0 = FIELD_GET(ID_AA64PFR0_EL1_EL0, val);
aarch64_only = (el0 == ID_AA64PFR0_EL1_EL0_IMP);
ksft_print_header();
diff --git a/tools/testing/selftests/kvm/arm64/vpmu_counter_access.c b/tools/testing/selftests/kvm/arm64/vpmu_counter_access.c
index f16b3b27e32e..a0c4ab839155 100644
--- a/tools/testing/selftests/kvm/arm64/vpmu_counter_access.c
+++ b/tools/testing/selftests/kvm/arm64/vpmu_counter_access.c
@@ -441,7 +441,7 @@ static void create_vpmu_vm(void *guest_code)
/* Make sure that PMUv3 support is indicated in the ID register */
dfr0 = vcpu_get_reg(vpmu_vm.vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64DFR0_EL1));
- pmuver = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), dfr0);
+ pmuver = FIELD_GET(ID_AA64DFR0_EL1_PMUVer, dfr0);
TEST_ASSERT(pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF &&
pmuver >= ID_AA64DFR0_EL1_PMUVer_IMP,
"Unexpected PMUVER (0x%x) on the vCPU with PMUv3", pmuver);
diff --git a/tools/testing/selftests/kvm/lib/arm64/processor.c b/tools/testing/selftests/kvm/lib/arm64/processor.c
index 9d69904cb608..eb115123d741 100644
--- a/tools/testing/selftests/kvm/lib/arm64/processor.c
+++ b/tools/testing/selftests/kvm/lib/arm64/processor.c
@@ -573,15 +573,15 @@ void aarch64_get_supported_page_sizes(uint32_t ipa, uint32_t *ipa4k,
err = ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
TEST_ASSERT(err == 0, KVM_IOCTL_ERROR(KVM_GET_ONE_REG, vcpu_fd));
- gran = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_TGRAN4), val);
+ gran = FIELD_GET(ID_AA64MMFR0_EL1_TGRAN4, val);
*ipa4k = max_ipa_for_page_size(ipa, gran, ID_AA64MMFR0_EL1_TGRAN4_NI,
ID_AA64MMFR0_EL1_TGRAN4_52_BIT);
- gran = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_TGRAN64), val);
+ gran = FIELD_GET(ID_AA64MMFR0_EL1_TGRAN64, val);
*ipa64k = max_ipa_for_page_size(ipa, gran, ID_AA64MMFR0_EL1_TGRAN64_NI,
ID_AA64MMFR0_EL1_TGRAN64_IMP);
- gran = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_TGRAN16), val);
+ gran = FIELD_GET(ID_AA64MMFR0_EL1_TGRAN16, val);
*ipa16k = max_ipa_for_page_size(ipa, gran, ID_AA64MMFR0_EL1_TGRAN16_NI,
ID_AA64MMFR0_EL1_TGRAN16_52_BIT);
diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c
index d30625c18259..c744c603d688 100644
--- a/tools/testing/selftests/mm/cow.c
+++ b/tools/testing/selftests/mm/cow.c
@@ -1554,8 +1554,8 @@ static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
}
/* Read from the page to populate the shared zeropage. */
- FORCE_READ(mem);
- FORCE_READ(smem);
+ FORCE_READ(*mem);
+ FORCE_READ(*smem);
fn(mem, smem, pagesize);
munmap:
diff --git a/tools/testing/selftests/mm/guard-regions.c b/tools/testing/selftests/mm/guard-regions.c
index b0d42eb04e3a..8dd81c0a4a5a 100644
--- a/tools/testing/selftests/mm/guard-regions.c
+++ b/tools/testing/selftests/mm/guard-regions.c
@@ -145,7 +145,7 @@ static bool try_access_buf(char *ptr, bool write)
if (write)
*ptr = 'x';
else
- FORCE_READ(ptr);
+ FORCE_READ(*ptr);
}
signal_jump_set = false;
diff --git a/tools/testing/selftests/mm/hugetlb-madvise.c b/tools/testing/selftests/mm/hugetlb-madvise.c
index 1afe14b9dc0c..c5940c0595be 100644
--- a/tools/testing/selftests/mm/hugetlb-madvise.c
+++ b/tools/testing/selftests/mm/hugetlb-madvise.c
@@ -50,8 +50,10 @@ void read_fault_pages(void *addr, unsigned long nr_pages)
unsigned long i;
for (i = 0; i < nr_pages; i++) {
+ unsigned long *addr2 =
+ ((unsigned long *)(addr + (i * huge_page_size)));
/* Prevent the compiler from optimizing out the entire loop: */
- FORCE_READ(((unsigned long *)(addr + (i * huge_page_size))));
+ FORCE_READ(*addr2);
}
}
diff --git a/tools/testing/selftests/mm/migration.c b/tools/testing/selftests/mm/migration.c
index c5a73617796a..ea945eebec2f 100644
--- a/tools/testing/selftests/mm/migration.c
+++ b/tools/testing/selftests/mm/migration.c
@@ -110,7 +110,7 @@ void *access_mem(void *ptr)
* the memory access actually happens and prevents the compiler
* from optimizing away this entire loop.
*/
- FORCE_READ((uint64_t *)ptr);
+ FORCE_READ(*(uint64_t *)ptr);
}
return NULL;
diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c
index fccf9e797a0c..5bd52a951cbd 100644
--- a/tools/testing/selftests/mm/mremap_test.c
+++ b/tools/testing/selftests/mm/mremap_test.c
@@ -5,10 +5,14 @@
#define _GNU_SOURCE
#include <errno.h>
+#include <fcntl.h>
+#include <linux/userfaultfd.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
+#include <sys/ioctl.h>
#include <sys/mman.h>
+#include <syscall.h>
#include <time.h>
#include <stdbool.h>
@@ -168,6 +172,7 @@ static bool is_range_mapped(FILE *maps_fp, unsigned long start,
if (first_val <= start && second_val >= end) {
success = true;
+ fflush(maps_fp);
break;
}
}
@@ -175,6 +180,15 @@ static bool is_range_mapped(FILE *maps_fp, unsigned long start,
return success;
}
+/* Check if [ptr, ptr + size) is mapped in /proc/self/maps. */
+static bool is_ptr_mapped(FILE *maps_fp, void *ptr, unsigned long size)
+{
+ unsigned long start = (unsigned long)ptr;
+ unsigned long end = start + size;
+
+ return is_range_mapped(maps_fp, start, end);
+}
+
/*
* Returns the start address of the mapping on success, else returns
* NULL on failure.
@@ -733,6 +747,249 @@ out:
dont_unmap ? " [dontunnmap]" : "");
}
+#ifdef __NR_userfaultfd
+static void mremap_move_multi_invalid_vmas(FILE *maps_fp,
+ unsigned long page_size)
+{
+ char *test_name = "mremap move multiple invalid vmas";
+ const size_t size = 10 * page_size;
+ bool success = true;
+ char *ptr, *tgt_ptr;
+ int uffd, err, i;
+ void *res;
+ struct uffdio_api api = {
+ .api = UFFD_API,
+ .features = UFFD_EVENT_PAGEFAULT,
+ };
+
+ uffd = syscall(__NR_userfaultfd, O_NONBLOCK);
+ if (uffd == -1) {
+ err = errno;
+ perror("userfaultfd");
+ if (err == EPERM) {
+ ksft_test_result_skip("%s - missing uffd", test_name);
+ return;
+ }
+ success = false;
+ goto out;
+ }
+ if (ioctl(uffd, UFFDIO_API, &api)) {
+ perror("ioctl UFFDIO_API");
+ success = false;
+ goto out_close_uffd;
+ }
+
+ ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (ptr == MAP_FAILED) {
+ perror("mmap");
+ success = false;
+ goto out_close_uffd;
+ }
+
+ tgt_ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (tgt_ptr == MAP_FAILED) {
+ perror("mmap");
+ success = false;
+ goto out_close_uffd;
+ }
+ if (munmap(tgt_ptr, size)) {
+ perror("munmap");
+ success = false;
+ goto out_unmap;
+ }
+
+ /*
+ * Unmap so we end up with:
+ *
+ * 0 2 4 6 8 10 offset in buffer
+ * |*| |*| |*| |*| |*|
+ * |*| |*| |*| |*| |*|
+ *
+ * Additionally, register each with UFFD.
+ */
+ for (i = 0; i < 10; i += 2) {
+ void *unmap_ptr = &ptr[(i + 1) * page_size];
+ unsigned long start = (unsigned long)&ptr[i * page_size];
+ struct uffdio_register reg = {
+ .range = {
+ .start = start,
+ .len = page_size,
+ },
+ .mode = UFFDIO_REGISTER_MODE_MISSING,
+ };
+
+ if (ioctl(uffd, UFFDIO_REGISTER, &reg) == -1) {
+ perror("ioctl UFFDIO_REGISTER");
+ success = false;
+ goto out_unmap;
+ }
+ if (munmap(unmap_ptr, page_size)) {
+ perror("munmap");
+ success = false;
+ goto out_unmap;
+ }
+ }
+
+ /*
+ * Now try to move the entire range which is invalid for multi VMA move.
+ *
+ * This will fail, and no VMA should be moved, as we check this ahead of
+ * time.
+ */
+ res = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, tgt_ptr);
+ err = errno;
+ if (res != MAP_FAILED) {
+ fprintf(stderr, "mremap() succeeded for multi VMA uffd armed\n");
+ success = false;
+ goto out_unmap;
+ }
+ if (err != EFAULT) {
+ errno = err;
+ perror("mrmeap() unexpected error");
+ success = false;
+ goto out_unmap;
+ }
+ if (is_ptr_mapped(maps_fp, tgt_ptr, page_size)) {
+ fprintf(stderr,
+ "Invalid uffd-armed VMA at start of multi range moved\n");
+ success = false;
+ goto out_unmap;
+ }
+
+ /*
+ * Now try to move a single VMA; this should succeed since it is not a
+ * multi VMA move.
+ */
+ res = mremap(ptr, page_size, page_size,
+ MREMAP_MAYMOVE | MREMAP_FIXED, tgt_ptr);
+ if (res == MAP_FAILED) {
+ perror("mremap single invalid-multi VMA");
+ success = false;
+ goto out_unmap;
+ }
+
+ /*
+ * Unmap the VMA, and remap a non-uffd-registered (and therefore valid
+ * for multi VMA move) VMA at the start of the ptr range.
+ */
+ if (munmap(tgt_ptr, page_size)) {
+ perror("munmap");
+ success = false;
+ goto out_unmap;
+ }
+ res = mmap(ptr, page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0);
+ if (res == MAP_FAILED) {
+ perror("mmap");
+ success = false;
+ goto out_unmap;
+ }
+
+ /*
+ * Now try to move the entire range; we should succeed in moving the
+ * first VMA, but no others, and the call should report a failure.
+ */
+ res = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, tgt_ptr);
+ err = errno;
+ if (res != MAP_FAILED) {
+ fprintf(stderr, "mremap() succeeded for multi VMA uffd armed\n");
+ success = false;
+ goto out_unmap;
+ }
+ if (err != EFAULT) {
+ errno = err;
+ perror("mrmeap() unexpected error");
+ success = false;
+ goto out_unmap;
+ }
+ if (!is_ptr_mapped(maps_fp, tgt_ptr, page_size)) {
+ fprintf(stderr, "Valid VMA not moved\n");
+ success = false;
+ goto out_unmap;
+ }
+
+ /*
+ * Unmap the VMA, map a valid VMA at the start of the ptr range, and
+ * replace all existing multi-move-invalid VMAs, except the last, with
+ * valid multi-move VMAs.
+ */
+ if (munmap(tgt_ptr, page_size)) {
+ perror("munmap");
+ success = false;
+ goto out_unmap;
+ }
+ if (munmap(ptr, size - 2 * page_size)) {
+ perror("munmap");
+ success = false;
+ goto out_unmap;
+ }
+ for (i = 0; i < 8; i += 2) {
+ res = mmap(&ptr[i * page_size], page_size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0);
+ if (res == MAP_FAILED) {
+ perror("mmap");
+ success = false;
+ goto out_unmap;
+ }
+ }
+
+ /*
+ * Now try to move the entire range; we should succeed in moving all but
+ * the last VMA, and the call should report a failure.
+ */
+ res = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, tgt_ptr);
+ err = errno;
+ if (res != MAP_FAILED) {
+ fprintf(stderr, "mremap() succeeded for multi VMA uffd armed\n");
+ success = false;
+ goto out_unmap;
+ }
+ if (err != EFAULT) {
+ errno = err;
+ perror("mrmeap() unexpected error");
+ success = false;
+ goto out_unmap;
+ }
+
+ for (i = 0; i < 10; i += 2) {
+ bool is_mapped = is_ptr_mapped(maps_fp,
+ &tgt_ptr[i * page_size], page_size);
+
+ if (i < 8 && !is_mapped) {
+ fprintf(stderr, "Valid VMA not moved at %d\n", i);
+ success = false;
+ goto out_unmap;
+ } else if (i == 8 && is_mapped) {
+ fprintf(stderr, "Invalid VMA moved at %d\n", i);
+ success = false;
+ goto out_unmap;
+ }
+ }
+
+out_unmap:
+ if (munmap(tgt_ptr, size))
+ perror("munmap tgt");
+ if (munmap(ptr, size))
+ perror("munmap src");
+out_close_uffd:
+ close(uffd);
+out:
+ if (success)
+ ksft_test_result_pass("%s\n", test_name);
+ else
+ ksft_test_result_fail("%s\n", test_name);
+}
+#else
+static void mremap_move_multi_invalid_vmas(FILE *maps_fp, unsigned long page_size)
+{
+ char *test_name = "mremap move multiple invalid vmas";
+
+ ksft_test_result_skip("%s - missing uffd", test_name);
+}
+#endif /* __NR_userfaultfd */
+
/* Returns the time taken for the remap on success else returns -1. */
static long long remap_region(struct config c, unsigned int threshold_mb,
char *rand_addr)
@@ -1074,7 +1331,7 @@ int main(int argc, char **argv)
char *rand_addr;
size_t rand_size;
int num_expand_tests = 2;
- int num_misc_tests = 8;
+ int num_misc_tests = 9;
struct test test_cases[MAX_TEST] = {};
struct test perf_test_cases[MAX_PERF_TEST];
int page_size;
@@ -1197,8 +1454,6 @@ int main(int argc, char **argv)
mremap_expand_merge(maps_fp, page_size);
mremap_expand_merge_offset(maps_fp, page_size);
- fclose(maps_fp);
-
mremap_move_within_range(pattern_seed, rand_addr);
mremap_move_1mb_from_start(pattern_seed, rand_addr);
mremap_shrink_multiple_vmas(page_size, /* inplace= */true);
@@ -1207,6 +1462,9 @@ int main(int argc, char **argv)
mremap_move_multiple_vmas(pattern_seed, page_size, /* dontunmap= */ true);
mremap_move_multiple_vmas_split(pattern_seed, page_size, /* dontunmap= */ false);
mremap_move_multiple_vmas_split(pattern_seed, page_size, /* dontunmap= */ true);
+ mremap_move_multi_invalid_vmas(maps_fp, page_size);
+
+ fclose(maps_fp);
if (run_perf_tests) {
ksft_print_msg("\n%s\n",
diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c
index 0d4209eef0c3..e6face7c0166 100644
--- a/tools/testing/selftests/mm/pagemap_ioctl.c
+++ b/tools/testing/selftests/mm/pagemap_ioctl.c
@@ -1525,7 +1525,7 @@ void zeropfn_tests(void)
ret = madvise(mem, hpage_size, MADV_HUGEPAGE);
if (!ret) {
- FORCE_READ(mem);
+ FORCE_READ(*mem);
ret = pagemap_ioctl(mem, hpage_size, &vec, 1, 0,
0, PAGE_IS_PFNZERO, 0, 0, PAGE_IS_PFNZERO);
diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c
index 05de1fc0005b..44a3f8a58806 100644
--- a/tools/testing/selftests/mm/split_huge_page_test.c
+++ b/tools/testing/selftests/mm/split_huge_page_test.c
@@ -439,8 +439,11 @@ int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size, int *fd,
}
madvise(*addr, fd_size, MADV_HUGEPAGE);
- for (size_t i = 0; i < fd_size; i++)
- FORCE_READ((*addr + i));
+ for (size_t i = 0; i < fd_size; i++) {
+ char *addr2 = *addr + i;
+
+ FORCE_READ(*addr2);
+ }
if (!check_huge_file(*addr, fd_size / pmd_pagesize, pmd_pagesize)) {
ksft_print_msg("No large pagecache folio generated, please provide a filesystem supporting large folio\n");
diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h
index c20298ae98ea..b55d1809debc 100644
--- a/tools/testing/selftests/mm/vm_util.h
+++ b/tools/testing/selftests/mm/vm_util.h
@@ -23,7 +23,7 @@
* anything with it in order to trigger a read page fault. We therefore must use
* volatile to stop the compiler from optimising this away.
*/
-#define FORCE_READ(x) (*(volatile typeof(x) *)x)
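+/* FORCE_READ() now takes the lvalue to read (e.g. FORCE_READ(*ptr)), not a pointer to it. */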
+#define FORCE_READ(x) (*(const volatile typeof(x) *)&(x))
extern unsigned int __page_size;
extern unsigned int __page_shift;
diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c
index b1e4618399be..a688871a98eb 100644
--- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c
+++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c
@@ -107,6 +107,26 @@
#endif
#endif
+#ifndef __NR_open_tree_attr
+ #if defined __alpha__
+ #define __NR_open_tree_attr 577
+ #elif defined _MIPS_SIM
+ #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
+ #define __NR_open_tree_attr (467 + 4000)
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
+ #define __NR_open_tree_attr (467 + 6000)
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
+ #define __NR_open_tree_attr (467 + 5000)
+ #endif
+ #elif defined __ia64__
+ #define __NR_open_tree_attr (467 + 1024)
+ #else
+ #define __NR_open_tree_attr 467
+ #endif
+#endif
+
#ifndef MOUNT_ATTR_IDMAP
#define MOUNT_ATTR_IDMAP 0x00100000
#endif
@@ -121,6 +141,12 @@ static inline int sys_mount_setattr(int dfd, const char *path, unsigned int flag
return syscall(__NR_mount_setattr, dfd, path, flags, attr, size);
}
+static inline int sys_open_tree_attr(int dfd, const char *path, unsigned int flags,
+ struct mount_attr *attr, size_t size)
+{
+ return syscall(__NR_open_tree_attr, dfd, path, flags, attr, size);
+}
+
static ssize_t write_nointr(int fd, const void *buf, size_t count)
{
ssize_t ret;
@@ -1222,6 +1248,12 @@ TEST_F(mount_setattr_idmapped, attached_mount_inside_current_mount_namespace)
attr.userns_fd = get_userns_fd(0, 10000, 10000);
ASSERT_GE(attr.userns_fd, 0);
ASSERT_NE(sys_mount_setattr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0);
+ /*
+ * Make sure that open_tree_attr() without OPEN_TREE_CLONE is not a way
+ * to bypass this mount_setattr() restriction.
+ */
+ ASSERT_LT(sys_open_tree_attr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0);
+
ASSERT_EQ(close(attr.userns_fd), 0);
ASSERT_EQ(close(open_tree_fd), 0);
}
@@ -1255,6 +1287,12 @@ TEST_F(mount_setattr_idmapped, attached_mount_outside_current_mount_namespace)
ASSERT_GE(attr.userns_fd, 0);
ASSERT_NE(sys_mount_setattr(open_tree_fd, "", AT_EMPTY_PATH, &attr,
sizeof(attr)), 0);
+ /*
+ * Make sure that open_tree_attr() without OPEN_TREE_CLONE is not a way
+ * to bypass this mount_setattr() restriction.
+ */
+ ASSERT_LT(sys_open_tree_attr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0);
+
ASSERT_EQ(close(attr.userns_fd), 0);
ASSERT_EQ(close(open_tree_fd), 0);
}
@@ -1321,6 +1359,19 @@ TEST_F(mount_setattr_idmapped, detached_mount_outside_current_mount_namespace)
ASSERT_EQ(close(open_tree_fd), 0);
}
+static bool expected_uid_gid(int dfd, const char *path, int flags,
+ uid_t expected_uid, gid_t expected_gid)
+{
+ int ret;
+ struct stat st;
+
+ ret = fstatat(dfd, path, &st, flags);
+ if (ret < 0)
+ return false;
+
+ return st.st_uid == expected_uid && st.st_gid == expected_gid;
+}
+
/**
* Validate that currently changing the idmapping of an idmapped mount fails.
*/
@@ -1331,6 +1382,8 @@ TEST_F(mount_setattr_idmapped, change_idmapping)
.attr_set = MOUNT_ATTR_IDMAP,
};
+ ASSERT_TRUE(expected_uid_gid(-EBADF, "/mnt/D", 0, 0, 0));
+
if (!mount_setattr_supported())
SKIP(return, "mount_setattr syscall not supported");
@@ -1348,27 +1401,25 @@ TEST_F(mount_setattr_idmapped, change_idmapping)
AT_EMPTY_PATH, &attr, sizeof(attr)), 0);
ASSERT_EQ(close(attr.userns_fd), 0);
+ EXPECT_FALSE(expected_uid_gid(open_tree_fd, ".", 0, 0, 0));
+ EXPECT_TRUE(expected_uid_gid(open_tree_fd, ".", 0, 10000, 10000));
+
/* Change idmapping on a detached mount that is already idmapped. */
attr.userns_fd = get_userns_fd(0, 20000, 10000);
ASSERT_GE(attr.userns_fd, 0);
ASSERT_NE(sys_mount_setattr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0);
+ /*
+ * Make sure that open_tree_attr() without OPEN_TREE_CLONE is not a way
+ * to bypass this mount_setattr() restriction.
+ */
+ EXPECT_LT(sys_open_tree_attr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0);
+ EXPECT_FALSE(expected_uid_gid(open_tree_fd, ".", 0, 20000, 20000));
+ EXPECT_TRUE(expected_uid_gid(open_tree_fd, ".", 0, 10000, 10000));
+
ASSERT_EQ(close(attr.userns_fd), 0);
ASSERT_EQ(close(open_tree_fd), 0);
}
-static bool expected_uid_gid(int dfd, const char *path, int flags,
- uid_t expected_uid, gid_t expected_gid)
-{
- int ret;
- struct stat st;
-
- ret = fstatat(dfd, path, &st, flags);
- if (ret < 0)
- return false;
-
- return st.st_uid == expected_uid && st.st_gid == expected_gid;
-}
-
TEST_F(mount_setattr_idmapped, idmap_mount_tree_invalid)
{
int open_tree_fd = -EBADF;
diff --git a/tools/testing/selftests/namespaces/.gitignore b/tools/testing/selftests/namespaces/.gitignore
new file mode 100644
index 000000000000..ccfb40837a73
--- /dev/null
+++ b/tools/testing/selftests/namespaces/.gitignore
@@ -0,0 +1,3 @@
+nsid_test
+file_handle_test
+init_ino_test
diff --git a/tools/testing/selftests/namespaces/Makefile b/tools/testing/selftests/namespaces/Makefile
new file mode 100644
index 000000000000..5fe4b3dc07d3
--- /dev/null
+++ b/tools/testing/selftests/namespaces/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
+CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
+
+TEST_GEN_PROGS := nsid_test file_handle_test init_ino_test
+
+include ../lib.mk
+
diff --git a/tools/testing/selftests/namespaces/config b/tools/testing/selftests/namespaces/config
new file mode 100644
index 000000000000..d09836260262
--- /dev/null
+++ b/tools/testing/selftests/namespaces/config
@@ -0,0 +1,7 @@
+CONFIG_UTS_NS=y
+CONFIG_TIME_NS=y
+CONFIG_IPC_NS=y
+CONFIG_USER_NS=y
+CONFIG_PID_NS=y
+CONFIG_NET_NS=y
+CONFIG_CGROUPS=y
diff --git a/tools/testing/selftests/namespaces/file_handle_test.c b/tools/testing/selftests/namespaces/file_handle_test.c
new file mode 100644
index 000000000000..f1bc5773f552
--- /dev/null
+++ b/tools/testing/selftests/namespaces/file_handle_test.c
@@ -0,0 +1,1429 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <grp.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <linux/unistd.h>
+#include "../kselftest_harness.h"
+
+#ifndef FD_NSFS_ROOT
+#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */
+#endif
+
+TEST(nsfs_net_handle)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ struct stat st1, st2;
+
+ /* Drop to unprivileged uid/gid */
+ ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */
+ ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Open a namespace file descriptor */
+ ns_fd = open("/proc/self/ns/net", O_RDONLY);
+ ASSERT_GE(ns_fd, 0);
+
+ /* Get handle for the namespace */
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ ASSERT_GT(handle->handle_bytes, 0);
+
+ /* Try to open using FD_NSFS_ROOT as unprivileged user */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) {
+ SKIP(free(handle); close(ns_fd);
+ return,
+ "open_by_handle_at with FD_NSFS_ROOT not supported");
+ }
+ if (fd < 0 && errno == EPERM) {
+ SKIP(free(handle); close(ns_fd);
+ return,
+ "Permission denied for unprivileged user (expected)");
+ }
+ ASSERT_GE(fd, 0);
+
+ /* Verify we opened the correct namespace */
+ ASSERT_EQ(fstat(ns_fd, &st1), 0);
+ ASSERT_EQ(fstat(fd, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+ ASSERT_EQ(st1.st_dev, st2.st_dev);
+
+ close(fd);
+ close(ns_fd);
+ free(handle);
+}
+
+TEST(nsfs_uts_handle)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ struct stat st1, st2;
+
+ /* Drop to unprivileged uid/gid */
+ ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */
+ ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Open UTS namespace file descriptor */
+ ns_fd = open("/proc/self/ns/uts", O_RDONLY);
+ ASSERT_GE(ns_fd, 0);
+
+ /* Get handle for the namespace */
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ ASSERT_GT(handle->handle_bytes, 0);
+
+ /* Try to open using FD_NSFS_ROOT */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) {
+ SKIP(free(handle); close(ns_fd);
+ return,
+ "open_by_handle_at with FD_NSFS_ROOT not supported");
+ }
+ ASSERT_GE(fd, 0);
+
+ /* Verify we opened the correct namespace */
+ ASSERT_EQ(fstat(ns_fd, &st1), 0);
+ ASSERT_EQ(fstat(fd, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+ ASSERT_EQ(st1.st_dev, st2.st_dev);
+
+ close(fd);
+ close(ns_fd);
+ free(handle);
+}
+
+TEST(nsfs_ipc_handle)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ struct stat st1, st2;
+
+ /* Drop to unprivileged uid/gid */
+ ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */
+ ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Open IPC namespace file descriptor */
+ ns_fd = open("/proc/self/ns/ipc", O_RDONLY);
+ ASSERT_GE(ns_fd, 0);
+
+ /* Get handle for the namespace */
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ ASSERT_GT(handle->handle_bytes, 0);
+
+ /* Try to open using FD_NSFS_ROOT */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) {
+ SKIP(free(handle); close(ns_fd);
+ return,
+ "open_by_handle_at with FD_NSFS_ROOT not supported");
+ }
+ ASSERT_GE(fd, 0);
+
+ /* Verify we opened the correct namespace */
+ ASSERT_EQ(fstat(ns_fd, &st1), 0);
+ ASSERT_EQ(fstat(fd, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+ ASSERT_EQ(st1.st_dev, st2.st_dev);
+
+ close(fd);
+ close(ns_fd);
+ free(handle);
+}
+
+TEST(nsfs_pid_handle)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ struct stat st1, st2;
+
+ /* Drop to unprivileged uid/gid */
+ ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */
+ ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Open PID namespace file descriptor */
+ ns_fd = open("/proc/self/ns/pid", O_RDONLY);
+ ASSERT_GE(ns_fd, 0);
+
+ /* Get handle for the namespace */
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ ASSERT_GT(handle->handle_bytes, 0);
+
+ /* Try to open using FD_NSFS_ROOT */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) {
+ SKIP(free(handle); close(ns_fd);
+ return,
+ "open_by_handle_at with FD_NSFS_ROOT not supported");
+ }
+ ASSERT_GE(fd, 0);
+
+ /* Verify we opened the correct namespace */
+ ASSERT_EQ(fstat(ns_fd, &st1), 0);
+ ASSERT_EQ(fstat(fd, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+ ASSERT_EQ(st1.st_dev, st2.st_dev);
+
+ close(fd);
+ close(ns_fd);
+ free(handle);
+}
+
+TEST(nsfs_mnt_handle)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ struct stat st1, st2;
+
+ /* Drop to unprivileged uid/gid */
+ ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */
+ ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Open mount namespace file descriptor */
+ ns_fd = open("/proc/self/ns/mnt", O_RDONLY);
+ ASSERT_GE(ns_fd, 0);
+
+ /* Get handle for the namespace */
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ ASSERT_GT(handle->handle_bytes, 0);
+
+ /* Try to open using FD_NSFS_ROOT */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) {
+ SKIP(free(handle); close(ns_fd);
+ return,
+ "open_by_handle_at with FD_NSFS_ROOT not supported");
+ }
+ ASSERT_GE(fd, 0);
+
+ /* Verify we opened the correct namespace */
+ ASSERT_EQ(fstat(ns_fd, &st1), 0);
+ ASSERT_EQ(fstat(fd, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+ ASSERT_EQ(st1.st_dev, st2.st_dev);
+
+ close(fd);
+ close(ns_fd);
+ free(handle);
+}
+
+TEST(nsfs_user_handle)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ struct stat st1, st2;
+
+ /* Drop to unprivileged uid/gid */
+ ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */
+ ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Open user namespace file descriptor */
+ ns_fd = open("/proc/self/ns/user", O_RDONLY);
+ ASSERT_GE(ns_fd, 0);
+
+ /* Get handle for the namespace */
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ ASSERT_GT(handle->handle_bytes, 0);
+
+ /* Try to open using FD_NSFS_ROOT */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) {
+ SKIP(free(handle); close(ns_fd);
+ return,
+ "open_by_handle_at with FD_NSFS_ROOT not supported");
+ }
+ ASSERT_GE(fd, 0);
+
+ /* Verify we opened the correct namespace */
+ ASSERT_EQ(fstat(ns_fd, &st1), 0);
+ ASSERT_EQ(fstat(fd, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+ ASSERT_EQ(st1.st_dev, st2.st_dev);
+
+ close(fd);
+ close(ns_fd);
+ free(handle);
+}
+
+TEST(nsfs_cgroup_handle)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ struct stat st1, st2;
+
+ /* Drop to unprivileged uid/gid */
+ ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */
+ ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Open cgroup namespace file descriptor */
+ ns_fd = open("/proc/self/ns/cgroup", O_RDONLY);
+ if (ns_fd < 0) {
+ SKIP(free(handle); return, "cgroup namespace not available");
+ }
+
+ /* Get handle for the namespace */
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ ASSERT_GT(handle->handle_bytes, 0);
+
+ /* Try to open using FD_NSFS_ROOT */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) {
+ SKIP(free(handle); close(ns_fd);
+ return,
+ "open_by_handle_at with FD_NSFS_ROOT not supported");
+ }
+ ASSERT_GE(fd, 0);
+
+ /* Verify we opened the correct namespace */
+ ASSERT_EQ(fstat(ns_fd, &st1), 0);
+ ASSERT_EQ(fstat(fd, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+ ASSERT_EQ(st1.st_dev, st2.st_dev);
+
+ close(fd);
+ close(ns_fd);
+ free(handle);
+}
+
+TEST(nsfs_time_handle)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ struct stat st1, st2;
+
+ /* Drop to unprivileged uid/gid */
+ ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */
+ ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Open time namespace file descriptor */
+ ns_fd = open("/proc/self/ns/time", O_RDONLY);
+ if (ns_fd < 0) {
+ SKIP(free(handle); return, "time namespace not available");
+ }
+
+ /* Get handle for the namespace */
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ ASSERT_GT(handle->handle_bytes, 0);
+
+ /* Try to open using FD_NSFS_ROOT */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+ if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) {
+ SKIP(free(handle); close(ns_fd);
+ return,
+ "open_by_handle_at with FD_NSFS_ROOT not supported");
+ }
+ ASSERT_GE(fd, 0);
+
+ /* Verify we opened the correct namespace */
+ ASSERT_EQ(fstat(ns_fd, &st1), 0);
+ ASSERT_EQ(fstat(fd, &st2), 0);
+ ASSERT_EQ(st1.st_ino, st2.st_ino);
+ ASSERT_EQ(st1.st_dev, st2.st_dev);
+
+ close(fd);
+ close(ns_fd);
+ free(handle);
+}
+
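+/*
+ * The *_namespace_isolation tests below all follow the same pattern: the
+ * parent records a file handle for one of its namespaces, then a child
+ * unshares a new user namespace (plus the namespace under test) and
+ * reports a single status byte over a pipe:
+ *   'U', 'M', 'N' - setup failed (user ns, uid/gid mappings, target ns),
+ *   'S'           - open_by_handle_at() unexpectedly succeeded,
+ *   'P'           - it failed with ESTALE, as expected,
+ *   'F'           - it failed with some other error.
+ */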
+TEST(nsfs_user_net_namespace_isolation)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ pid_t pid;
+ int status;
+ int pipefd[2];
+ char result;
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Create pipe for communication */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ /* Get handle for current network namespace */
+ ns_fd = open("/proc/self/ns/net", O_RDONLY);
+ ASSERT_GE(ns_fd, 0);
+
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd); close(pipefd[0]);
+ close(pipefd[1]);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ close(ns_fd);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* First create new user namespace to drop privileges */
+ ret = unshare(CLONE_NEWUSER);
+ if (ret < 0) {
+ write(pipefd[1], "U",
+ 1); /* Unable to create user namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Write uid/gid mappings to maintain some capabilities */
+ int uid_map_fd = open("/proc/self/uid_map", O_WRONLY);
+ int gid_map_fd = open("/proc/self/gid_map", O_WRONLY);
+ int setgroups_fd = open("/proc/self/setgroups", O_WRONLY);
+
+ if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) {
+ write(pipefd[1], "M", 1); /* Unable to set mappings */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Disable setgroups to allow gid mapping */
+ write(setgroups_fd, "deny", 4);
+ close(setgroups_fd);
+
+ /* Map current uid/gid to root in the new namespace */
+ char mapping[64];
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getuid());
+ write(uid_map_fd, mapping, strlen(mapping));
+ close(uid_map_fd);
+
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getgid());
+ write(gid_map_fd, mapping, strlen(mapping));
+ close(gid_map_fd);
+
+ /* Now create new network namespace */
+ ret = unshare(CLONE_NEWNET);
+ if (ret < 0) {
+ write(pipefd[1], "N",
+ 1); /* Unable to create network namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Try to open parent's network namespace handle from new user+net namespace */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+
+ if (fd >= 0) {
+ /* Should NOT succeed - we're in a different user namespace */
+ write(pipefd[1], "S", 1); /* Unexpected success */
+ close(fd);
+ } else if (errno == ESTALE) {
+ /* Expected: Stale file handle */
+ write(pipefd[1], "P", 1);
+ } else {
+ /* Other error */
+ write(pipefd[1], "F", 1);
+ }
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+ ASSERT_EQ(read(pipefd[0], &result, 1), 1);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ if (result == 'U') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot create new user namespace");
+ }
+ if (result == 'M') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot set uid/gid mappings");
+ }
+ if (result == 'N') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot create new network namespace");
+ }
+
+	/* Should fail with ESTALE since we're in a different user namespace */
+ ASSERT_EQ(result, 'P');
+
+ close(pipefd[0]);
+ free(handle);
+}
+
+TEST(nsfs_user_uts_namespace_isolation)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ pid_t pid;
+ int status;
+ int pipefd[2];
+ char result;
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Create pipe for communication */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ /* Get handle for current UTS namespace */
+ ns_fd = open("/proc/self/ns/uts", O_RDONLY);
+ ASSERT_GE(ns_fd, 0);
+
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd); close(pipefd[0]);
+ close(pipefd[1]);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ close(ns_fd);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* First create new user namespace to drop privileges */
+ ret = unshare(CLONE_NEWUSER);
+ if (ret < 0) {
+ write(pipefd[1], "U",
+ 1); /* Unable to create user namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Write uid/gid mappings to maintain some capabilities */
+ int uid_map_fd = open("/proc/self/uid_map", O_WRONLY);
+ int gid_map_fd = open("/proc/self/gid_map", O_WRONLY);
+ int setgroups_fd = open("/proc/self/setgroups", O_WRONLY);
+
+ if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) {
+ write(pipefd[1], "M", 1); /* Unable to set mappings */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Disable setgroups to allow gid mapping */
+ write(setgroups_fd, "deny", 4);
+ close(setgroups_fd);
+
+ /* Map current uid/gid to root in the new namespace */
+ char mapping[64];
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getuid());
+ write(uid_map_fd, mapping, strlen(mapping));
+ close(uid_map_fd);
+
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getgid());
+ write(gid_map_fd, mapping, strlen(mapping));
+ close(gid_map_fd);
+
+ /* Now create new UTS namespace */
+ ret = unshare(CLONE_NEWUTS);
+ if (ret < 0) {
+ write(pipefd[1], "N",
+ 1); /* Unable to create UTS namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Try to open parent's UTS namespace handle from new user+uts namespace */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+
+ if (fd >= 0) {
+ /* Should NOT succeed - we're in a different user namespace */
+ write(pipefd[1], "S", 1); /* Unexpected success */
+ close(fd);
+ } else if (errno == ESTALE) {
+ /* Expected: Stale file handle */
+ write(pipefd[1], "P", 1);
+ } else {
+ /* Other error */
+ write(pipefd[1], "F", 1);
+ }
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+ ASSERT_EQ(read(pipefd[0], &result, 1), 1);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ if (result == 'U') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot create new user namespace");
+ }
+ if (result == 'M') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot set uid/gid mappings");
+ }
+ if (result == 'N') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot create new UTS namespace");
+ }
+
+ /* Should fail with ESTALE since we're in a different user namespace */
+ ASSERT_EQ(result, 'P');
+
+ close(pipefd[0]);
+ free(handle);
+}
+
+TEST(nsfs_user_ipc_namespace_isolation)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ pid_t pid;
+ int status;
+ int pipefd[2];
+ char result;
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Create pipe for communication */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ /* Get handle for current IPC namespace */
+ ns_fd = open("/proc/self/ns/ipc", O_RDONLY);
+ ASSERT_GE(ns_fd, 0);
+
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd); close(pipefd[0]);
+ close(pipefd[1]);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ close(ns_fd);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* First create new user namespace to drop privileges */
+ ret = unshare(CLONE_NEWUSER);
+ if (ret < 0) {
+ write(pipefd[1], "U",
+ 1); /* Unable to create user namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Write uid/gid mappings to maintain some capabilities */
+ int uid_map_fd = open("/proc/self/uid_map", O_WRONLY);
+ int gid_map_fd = open("/proc/self/gid_map", O_WRONLY);
+ int setgroups_fd = open("/proc/self/setgroups", O_WRONLY);
+
+ if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) {
+ write(pipefd[1], "M", 1); /* Unable to set mappings */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Disable setgroups to allow gid mapping */
+ write(setgroups_fd, "deny", 4);
+ close(setgroups_fd);
+
+ /* Map current uid/gid to root in the new namespace */
+ char mapping[64];
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getuid());
+ write(uid_map_fd, mapping, strlen(mapping));
+ close(uid_map_fd);
+
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getgid());
+ write(gid_map_fd, mapping, strlen(mapping));
+ close(gid_map_fd);
+
+ /* Now create new IPC namespace */
+ ret = unshare(CLONE_NEWIPC);
+ if (ret < 0) {
+ write(pipefd[1], "N",
+ 1); /* Unable to create IPC namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Try to open parent's IPC namespace handle from new user+ipc namespace */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+
+ if (fd >= 0) {
+ /* Should NOT succeed - we're in a different user namespace */
+ write(pipefd[1], "S", 1); /* Unexpected success */
+ close(fd);
+ } else if (errno == ESTALE) {
+ /* Expected: Stale file handle */
+ write(pipefd[1], "P", 1);
+ } else {
+ /* Other error */
+ write(pipefd[1], "F", 1);
+ }
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+ ASSERT_EQ(read(pipefd[0], &result, 1), 1);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ if (result == 'U') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot create new user namespace");
+ }
+ if (result == 'M') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot set uid/gid mappings");
+ }
+ if (result == 'N') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot create new IPC namespace");
+ }
+
+ /* Should fail with ESTALE since we're in a different user namespace */
+ ASSERT_EQ(result, 'P');
+
+ close(pipefd[0]);
+ free(handle);
+}
+
+TEST(nsfs_user_mnt_namespace_isolation)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ pid_t pid;
+ int status;
+ int pipefd[2];
+ char result;
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Create pipe for communication */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ /* Get handle for current mount namespace */
+ ns_fd = open("/proc/self/ns/mnt", O_RDONLY);
+ ASSERT_GE(ns_fd, 0);
+
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd); close(pipefd[0]);
+ close(pipefd[1]);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ close(ns_fd);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* First create new user namespace to drop privileges */
+ ret = unshare(CLONE_NEWUSER);
+ if (ret < 0) {
+ write(pipefd[1], "U",
+ 1); /* Unable to create user namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Write uid/gid mappings to maintain some capabilities */
+ int uid_map_fd = open("/proc/self/uid_map", O_WRONLY);
+ int gid_map_fd = open("/proc/self/gid_map", O_WRONLY);
+ int setgroups_fd = open("/proc/self/setgroups", O_WRONLY);
+
+ if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) {
+ write(pipefd[1], "M", 1); /* Unable to set mappings */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Disable setgroups to allow gid mapping */
+ write(setgroups_fd, "deny", 4);
+ close(setgroups_fd);
+
+ /* Map current uid/gid to root in the new namespace */
+ char mapping[64];
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getuid());
+ write(uid_map_fd, mapping, strlen(mapping));
+ close(uid_map_fd);
+
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getgid());
+ write(gid_map_fd, mapping, strlen(mapping));
+ close(gid_map_fd);
+
+ /* Now create new mount namespace */
+ ret = unshare(CLONE_NEWNS);
+ if (ret < 0) {
+ write(pipefd[1], "N",
+ 1); /* Unable to create mount namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Try to open parent's mount namespace handle from new user+mnt namespace */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+
+ if (fd >= 0) {
+ /* Should NOT succeed - we're in a different user namespace */
+ write(pipefd[1], "S", 1); /* Unexpected success */
+ close(fd);
+ } else if (errno == ESTALE) {
+ /* Expected: Stale file handle */
+ write(pipefd[1], "P", 1);
+ } else {
+ /* Other error */
+ write(pipefd[1], "F", 1);
+ }
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+ ASSERT_EQ(read(pipefd[0], &result, 1), 1);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ if (result == 'U') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot create new user namespace");
+ }
+ if (result == 'M') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot set uid/gid mappings");
+ }
+ if (result == 'N') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot create new mount namespace");
+ }
+
+ /* Should fail with ESTALE since we're in a different user namespace */
+ ASSERT_EQ(result, 'P');
+
+ close(pipefd[0]);
+ free(handle);
+}
+
+TEST(nsfs_user_cgroup_namespace_isolation)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ pid_t pid;
+ int status;
+ int pipefd[2];
+ char result;
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Create pipe for communication */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ /* Get handle for current cgroup namespace */
+ ns_fd = open("/proc/self/ns/cgroup", O_RDONLY);
+ if (ns_fd < 0) {
+ SKIP(free(handle); close(pipefd[0]); close(pipefd[1]);
+ return, "cgroup namespace not available");
+ }
+
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd); close(pipefd[0]);
+ close(pipefd[1]);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ close(ns_fd);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* First create new user namespace to drop privileges */
+ ret = unshare(CLONE_NEWUSER);
+ if (ret < 0) {
+ write(pipefd[1], "U",
+ 1); /* Unable to create user namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Write uid/gid mappings to maintain some capabilities */
+ int uid_map_fd = open("/proc/self/uid_map", O_WRONLY);
+ int gid_map_fd = open("/proc/self/gid_map", O_WRONLY);
+ int setgroups_fd = open("/proc/self/setgroups", O_WRONLY);
+
+ if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) {
+ write(pipefd[1], "M", 1); /* Unable to set mappings */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Disable setgroups to allow gid mapping */
+ write(setgroups_fd, "deny", 4);
+ close(setgroups_fd);
+
+ /* Map current uid/gid to root in the new namespace */
+ char mapping[64];
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getuid());
+ write(uid_map_fd, mapping, strlen(mapping));
+ close(uid_map_fd);
+
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getgid());
+ write(gid_map_fd, mapping, strlen(mapping));
+ close(gid_map_fd);
+
+ /* Now create new cgroup namespace */
+ ret = unshare(CLONE_NEWCGROUP);
+ if (ret < 0) {
+ write(pipefd[1], "N",
+ 1); /* Unable to create cgroup namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Try to open parent's cgroup namespace handle from new user+cgroup namespace */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+
+ if (fd >= 0) {
+ /* Should NOT succeed - we're in a different user namespace */
+ write(pipefd[1], "S", 1); /* Unexpected success */
+ close(fd);
+ } else if (errno == ESTALE) {
+ /* Expected: Stale file handle */
+ write(pipefd[1], "P", 1);
+ } else {
+ /* Other error */
+ write(pipefd[1], "F", 1);
+ }
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+ ASSERT_EQ(read(pipefd[0], &result, 1), 1);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ if (result == 'U') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot create new user namespace");
+ }
+ if (result == 'M') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot set uid/gid mappings");
+ }
+ if (result == 'N') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot create new cgroup namespace");
+ }
+
+ /* Should fail with ESTALE since we're in a different user namespace */
+ ASSERT_EQ(result, 'P');
+
+ close(pipefd[0]);
+ free(handle);
+}
+
+TEST(nsfs_user_pid_namespace_isolation)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ pid_t pid;
+ int status;
+ int pipefd[2];
+ char result;
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Create pipe for communication */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ /* Get handle for current PID namespace */
+ ns_fd = open("/proc/self/ns/pid", O_RDONLY);
+ ASSERT_GE(ns_fd, 0);
+
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd); close(pipefd[0]);
+ close(pipefd[1]);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ close(ns_fd);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* First create new user namespace to drop privileges */
+ ret = unshare(CLONE_NEWUSER);
+ if (ret < 0) {
+ write(pipefd[1], "U",
+ 1); /* Unable to create user namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Write uid/gid mappings to maintain some capabilities */
+ int uid_map_fd = open("/proc/self/uid_map", O_WRONLY);
+ int gid_map_fd = open("/proc/self/gid_map", O_WRONLY);
+ int setgroups_fd = open("/proc/self/setgroups", O_WRONLY);
+
+ if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) {
+ write(pipefd[1], "M", 1); /* Unable to set mappings */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Disable setgroups to allow gid mapping */
+ write(setgroups_fd, "deny", 4);
+ close(setgroups_fd);
+
+ /* Map current uid/gid to root in the new namespace */
+ char mapping[64];
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getuid());
+ write(uid_map_fd, mapping, strlen(mapping));
+ close(uid_map_fd);
+
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getgid());
+ write(gid_map_fd, mapping, strlen(mapping));
+ close(gid_map_fd);
+
+ /* Now create new PID namespace - requires fork to take effect */
+ ret = unshare(CLONE_NEWPID);
+ if (ret < 0) {
+ write(pipefd[1], "N",
+ 1); /* Unable to create PID namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Fork again for PID namespace to take effect */
+ pid_t child_pid = fork();
+ if (child_pid < 0) {
+ write(pipefd[1], "N",
+ 1); /* Unable to fork in PID namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ if (child_pid == 0) {
+ /* Grandchild in new PID namespace */
+ /* Try to open parent's PID namespace handle from new user+pid namespace */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+
+ if (fd >= 0) {
+ /* Should NOT succeed - we're in a different user namespace */
+ write(pipefd[1], "S",
+ 1); /* Unexpected success */
+ close(fd);
+ } else if (errno == ESTALE) {
+ /* Expected: Stale file handle */
+ write(pipefd[1], "P", 1);
+ } else {
+ /* Other error */
+ write(pipefd[1], "F", 1);
+ }
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Wait for grandchild */
+ waitpid(child_pid, NULL, 0);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+ ASSERT_EQ(read(pipefd[0], &result, 1), 1);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ if (result == 'U') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot create new user namespace");
+ }
+ if (result == 'M') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot set uid/gid mappings");
+ }
+ if (result == 'N') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot create new PID namespace");
+ }
+
+ /* Should fail with ESTALE since we're in a different user namespace */
+ ASSERT_EQ(result, 'P');
+
+ close(pipefd[0]);
+ free(handle);
+}
+
+TEST(nsfs_user_time_namespace_isolation)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+ pid_t pid;
+ int status;
+ int pipefd[2];
+ char result;
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Create pipe for communication */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ /* Get handle for current time namespace */
+ ns_fd = open("/proc/self/ns/time", O_RDONLY);
+ if (ns_fd < 0) {
+ SKIP(free(handle); close(pipefd[0]); close(pipefd[1]);
+ return, "time namespace not available");
+ }
+
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd); close(pipefd[0]);
+ close(pipefd[1]);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ close(ns_fd);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* First create new user namespace to drop privileges */
+ ret = unshare(CLONE_NEWUSER);
+ if (ret < 0) {
+ write(pipefd[1], "U",
+ 1); /* Unable to create user namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Write uid/gid mappings to maintain some capabilities */
+ int uid_map_fd = open("/proc/self/uid_map", O_WRONLY);
+ int gid_map_fd = open("/proc/self/gid_map", O_WRONLY);
+ int setgroups_fd = open("/proc/self/setgroups", O_WRONLY);
+
+ if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) {
+ write(pipefd[1], "M", 1); /* Unable to set mappings */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Disable setgroups to allow gid mapping */
+ write(setgroups_fd, "deny", 4);
+ close(setgroups_fd);
+
+ /* Map current uid/gid to root in the new namespace */
+ char mapping[64];
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getuid());
+ write(uid_map_fd, mapping, strlen(mapping));
+ close(uid_map_fd);
+
+ snprintf(mapping, sizeof(mapping), "0 %d 1", getgid());
+ write(gid_map_fd, mapping, strlen(mapping));
+ close(gid_map_fd);
+
+ /* Now create new time namespace - requires fork to take effect */
+ ret = unshare(CLONE_NEWTIME);
+ if (ret < 0) {
+ write(pipefd[1], "N",
+ 1); /* Unable to create time namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Fork again for time namespace to take effect */
+ pid_t child_pid = fork();
+ if (child_pid < 0) {
+ write(pipefd[1], "N",
+ 1); /* Unable to fork in time namespace */
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ if (child_pid == 0) {
+ /* Grandchild in new time namespace */
+ /* Try to open parent's time namespace handle from new user+time namespace */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+
+ if (fd >= 0) {
+ /* Should NOT succeed - we're in a different user namespace */
+ write(pipefd[1], "S",
+ 1); /* Unexpected success */
+ close(fd);
+ } else if (errno == ESTALE) {
+ /* Expected: Stale file handle */
+ write(pipefd[1], "P", 1);
+ } else {
+ /* Other error */
+ write(pipefd[1], "F", 1);
+ }
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Wait for grandchild */
+ waitpid(child_pid, NULL, 0);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+ ASSERT_EQ(read(pipefd[0], &result, 1), 1);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ if (result == 'U') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot create new user namespace");
+ }
+ if (result == 'M') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot set uid/gid mappings");
+ }
+ if (result == 'N') {
+ SKIP(free(handle); close(pipefd[0]);
+ return, "Cannot create new time namespace");
+ }
+
+ /* Should fail with ESTALE since we're in a different user namespace */
+ ASSERT_EQ(result, 'P');
+
+ close(pipefd[0]);
+ free(handle);
+}
+
+TEST(nsfs_open_flags)
+{
+ struct file_handle *handle;
+ int mount_id;
+ int ret;
+ int fd;
+ int ns_fd;
+
+ handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(handle, NULL);
+
+ /* Open a namespace file descriptor */
+ ns_fd = open("/proc/self/ns/net", O_RDONLY);
+ ASSERT_GE(ns_fd, 0);
+
+ /* Get handle for the namespace */
+ handle->handle_bytes = MAX_HANDLE_SZ;
+ ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH);
+ if (ret < 0 && errno == EOPNOTSUPP) {
+ SKIP(free(handle); close(ns_fd);
+ return, "nsfs doesn't support file handles");
+ }
+ ASSERT_EQ(ret, 0);
+ ASSERT_GT(handle->handle_bytes, 0);
+
+ /* Test invalid flags that should fail */
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_WRONLY);
+ ASSERT_LT(fd, 0);
+ ASSERT_EQ(errno, EPERM);
+
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDWR);
+ ASSERT_LT(fd, 0);
+ ASSERT_EQ(errno, EPERM);
+
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_TRUNC);
+ ASSERT_LT(fd, 0);
+ ASSERT_EQ(errno, EPERM);
+
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_DIRECT);
+ ASSERT_LT(fd, 0);
+ ASSERT_EQ(errno, EINVAL);
+
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_TMPFILE);
+ ASSERT_LT(fd, 0);
+ ASSERT_EQ(errno, EINVAL);
+
+ fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_DIRECTORY);
+ ASSERT_LT(fd, 0);
+ ASSERT_EQ(errno, ENOTDIR);
+
+ close(ns_fd);
+ free(handle);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/init_ino_test.c b/tools/testing/selftests/namespaces/init_ino_test.c
new file mode 100644
index 000000000000..5b6993c3740b
--- /dev/null
+++ b/tools/testing/selftests/namespaces/init_ino_test.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright (c) 2025 Christian Brauner <brauner@kernel.org>
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <linux/nsfs.h>
+
+#include "../kselftest_harness.h"
+
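+/*
+ * The initial namespaces are expected to carry fixed, well-known inode
+ * numbers (the *_NS_INIT_INO constants used below, pulled in via
+ * <linux/nsfs.h>); the test checks that /proc/1/ns/* report exactly those
+ * values.
+ */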
+struct ns_info {
+ const char *name;
+ const char *proc_path;
+ unsigned int expected_ino;
+};
+
+static struct ns_info namespaces[] = {
+ { "ipc", "/proc/1/ns/ipc", IPC_NS_INIT_INO },
+ { "uts", "/proc/1/ns/uts", UTS_NS_INIT_INO },
+ { "user", "/proc/1/ns/user", USER_NS_INIT_INO },
+ { "pid", "/proc/1/ns/pid", PID_NS_INIT_INO },
+ { "cgroup", "/proc/1/ns/cgroup", CGROUP_NS_INIT_INO },
+ { "time", "/proc/1/ns/time", TIME_NS_INIT_INO },
+ { "net", "/proc/1/ns/net", NET_NS_INIT_INO },
+ { "mnt", "/proc/1/ns/mnt", MNT_NS_INIT_INO },
+};
+
+TEST(init_namespace_inodes)
+{
+ struct stat st;
+
+ for (int i = 0; i < sizeof(namespaces) / sizeof(namespaces[0]); i++) {
+ int ret = stat(namespaces[i].proc_path, &st);
+
+ /* Some namespaces might not be available (e.g., time namespace on older kernels) */
+ if (ret < 0) {
+ if (errno == ENOENT) {
+ ksft_test_result_skip("%s namespace not available\n",
+ namespaces[i].name);
+ continue;
+ }
+ ASSERT_GE(ret, 0)
+ TH_LOG("Failed to stat %s: %s",
+ namespaces[i].proc_path, strerror(errno));
+ }
+
+ ASSERT_EQ(st.st_ino, namespaces[i].expected_ino)
+ TH_LOG("Namespace %s has inode 0x%lx, expected 0x%x",
+ namespaces[i].name, st.st_ino, namespaces[i].expected_ino);
+
+ ksft_print_msg("Namespace %s: inode 0x%lx matches expected 0x%x\n",
+ namespaces[i].name, st.st_ino, namespaces[i].expected_ino);
+ }
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/nsid_test.c b/tools/testing/selftests/namespaces/nsid_test.c
new file mode 100644
index 000000000000..e28accd74a57
--- /dev/null
+++ b/tools/testing/selftests/namespaces/nsid_test.c
@@ -0,0 +1,986 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <assert.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <libgen.h>
+#include <limits.h>
+#include <pthread.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <poll.h>
+#include <sys/epoll.h>
+#include <sys/ioctl.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <unistd.h>
+#include <linux/fs.h>
+#include <linux/limits.h>
+#include <linux/nsfs.h>
+#include "../kselftest_harness.h"
+
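+/*
+ * Sanity checks for the NS_GET_ID ioctl (and NS_GET_MNTNS_ID for mount
+ * namespaces): the returned 64-bit ID should be non-zero, stable across
+ * repeated queries, and different between a parent namespace and a freshly
+ * unshared child namespace. For network namespaces the ID is additionally
+ * compared against the SO_NETNS_COOKIE socket option.
+ */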
+TEST(nsid_mntns_basic)
+{
+ __u64 mnt_ns_id = 0;
+ int fd_mntns;
+ int ret;
+
+ /* Open the current mount namespace */
+ fd_mntns = open("/proc/self/ns/mnt", O_RDONLY);
+ ASSERT_GE(fd_mntns, 0);
+
+ /* Get the mount namespace ID */
+ ret = ioctl(fd_mntns, NS_GET_MNTNS_ID, &mnt_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(mnt_ns_id, 0);
+
+	/* NS_GET_ID should return the same ID as NS_GET_MNTNS_ID */
+ __u64 mnt_ns_id2 = 0;
+ ret = ioctl(fd_mntns, NS_GET_ID, &mnt_ns_id2);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(mnt_ns_id, mnt_ns_id2);
+
+ close(fd_mntns);
+}
+
+TEST(nsid_mntns_separate)
+{
+ __u64 parent_mnt_ns_id = 0;
+ __u64 child_mnt_ns_id = 0;
+ int fd_parent_mntns, fd_child_mntns;
+ int ret;
+ pid_t pid;
+ int pipefd[2];
+
+ /* Get parent's mount namespace ID */
+ fd_parent_mntns = open("/proc/self/ns/mnt", O_RDONLY);
+ ASSERT_GE(fd_parent_mntns, 0);
+ ret = ioctl(fd_parent_mntns, NS_GET_ID, &parent_mnt_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(parent_mnt_ns_id, 0);
+
+ /* Create a pipe for synchronization */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* Create new mount namespace */
+ ret = unshare(CLONE_NEWNS);
+ if (ret != 0) {
+ /* Skip test if we don't have permission */
+ if (errno == EPERM || errno == EACCES) {
+ write(pipefd[1], "S", 1); /* Signal skip */
+ _exit(0);
+ }
+ _exit(1);
+ }
+
+ /* Signal success */
+ write(pipefd[1], "Y", 1);
+ close(pipefd[1]);
+
+ /* Keep namespace alive */
+ pause();
+ _exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ char buf;
+ ASSERT_EQ(read(pipefd[0], &buf, 1), 1);
+ close(pipefd[0]);
+
+ if (buf == 'S') {
+ /* Child couldn't create namespace, skip test */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+ close(fd_parent_mntns);
+ SKIP(return, "No permission to create mount namespace");
+ }
+
+ ASSERT_EQ(buf, 'Y');
+
+ /* Open child's mount namespace */
+ char path[256];
+ snprintf(path, sizeof(path), "/proc/%d/ns/mnt", pid);
+ fd_child_mntns = open(path, O_RDONLY);
+ ASSERT_GE(fd_child_mntns, 0);
+
+ /* Get child's mount namespace ID */
+ ret = ioctl(fd_child_mntns, NS_GET_ID, &child_mnt_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(child_mnt_ns_id, 0);
+
+ /* Parent and child should have different mount namespace IDs */
+ ASSERT_NE(parent_mnt_ns_id, child_mnt_ns_id);
+
+ close(fd_parent_mntns);
+ close(fd_child_mntns);
+
+ /* Clean up child process */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+}
+
+TEST(nsid_cgroupns_basic)
+{
+ __u64 cgroup_ns_id = 0;
+ int fd_cgroupns;
+ int ret;
+
+ /* Open the current cgroup namespace */
+ fd_cgroupns = open("/proc/self/ns/cgroup", O_RDONLY);
+ ASSERT_GE(fd_cgroupns, 0);
+
+ /* Get the cgroup namespace ID */
+ ret = ioctl(fd_cgroupns, NS_GET_ID, &cgroup_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(cgroup_ns_id, 0);
+
+ /* Verify we can get the same ID again */
+ __u64 cgroup_ns_id2 = 0;
+ ret = ioctl(fd_cgroupns, NS_GET_ID, &cgroup_ns_id2);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(cgroup_ns_id, cgroup_ns_id2);
+
+ close(fd_cgroupns);
+}
+
+TEST(nsid_cgroupns_separate)
+{
+ __u64 parent_cgroup_ns_id = 0;
+ __u64 child_cgroup_ns_id = 0;
+ int fd_parent_cgroupns, fd_child_cgroupns;
+ int ret;
+ pid_t pid;
+ int pipefd[2];
+
+ /* Get parent's cgroup namespace ID */
+ fd_parent_cgroupns = open("/proc/self/ns/cgroup", O_RDONLY);
+ ASSERT_GE(fd_parent_cgroupns, 0);
+ ret = ioctl(fd_parent_cgroupns, NS_GET_ID, &parent_cgroup_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(parent_cgroup_ns_id, 0);
+
+ /* Create a pipe for synchronization */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* Create new cgroup namespace */
+ ret = unshare(CLONE_NEWCGROUP);
+ if (ret != 0) {
+ /* Skip test if we don't have permission */
+ if (errno == EPERM || errno == EACCES) {
+ write(pipefd[1], "S", 1); /* Signal skip */
+ _exit(0);
+ }
+ _exit(1);
+ }
+
+ /* Signal success */
+ write(pipefd[1], "Y", 1);
+ close(pipefd[1]);
+
+ /* Keep namespace alive */
+ pause();
+ _exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ char buf;
+ ASSERT_EQ(read(pipefd[0], &buf, 1), 1);
+ close(pipefd[0]);
+
+ if (buf == 'S') {
+ /* Child couldn't create namespace, skip test */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+ close(fd_parent_cgroupns);
+ SKIP(return, "No permission to create cgroup namespace");
+ }
+
+ ASSERT_EQ(buf, 'Y');
+
+ /* Open child's cgroup namespace */
+ char path[256];
+ snprintf(path, sizeof(path), "/proc/%d/ns/cgroup", pid);
+ fd_child_cgroupns = open(path, O_RDONLY);
+ ASSERT_GE(fd_child_cgroupns, 0);
+
+ /* Get child's cgroup namespace ID */
+ ret = ioctl(fd_child_cgroupns, NS_GET_ID, &child_cgroup_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(child_cgroup_ns_id, 0);
+
+ /* Parent and child should have different cgroup namespace IDs */
+ ASSERT_NE(parent_cgroup_ns_id, child_cgroup_ns_id);
+
+ close(fd_parent_cgroupns);
+ close(fd_child_cgroupns);
+
+ /* Clean up child process */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+}
+
+TEST(nsid_ipcns_basic)
+{
+ __u64 ipc_ns_id = 0;
+ int fd_ipcns;
+ int ret;
+
+ /* Open the current IPC namespace */
+ fd_ipcns = open("/proc/self/ns/ipc", O_RDONLY);
+ ASSERT_GE(fd_ipcns, 0);
+
+ /* Get the IPC namespace ID */
+ ret = ioctl(fd_ipcns, NS_GET_ID, &ipc_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(ipc_ns_id, 0);
+
+ /* Verify we can get the same ID again */
+ __u64 ipc_ns_id2 = 0;
+ ret = ioctl(fd_ipcns, NS_GET_ID, &ipc_ns_id2);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(ipc_ns_id, ipc_ns_id2);
+
+ close(fd_ipcns);
+}
+
+TEST(nsid_ipcns_separate)
+{
+ __u64 parent_ipc_ns_id = 0;
+ __u64 child_ipc_ns_id = 0;
+ int fd_parent_ipcns, fd_child_ipcns;
+ int ret;
+ pid_t pid;
+ int pipefd[2];
+
+ /* Get parent's IPC namespace ID */
+ fd_parent_ipcns = open("/proc/self/ns/ipc", O_RDONLY);
+ ASSERT_GE(fd_parent_ipcns, 0);
+ ret = ioctl(fd_parent_ipcns, NS_GET_ID, &parent_ipc_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(parent_ipc_ns_id, 0);
+
+ /* Create a pipe for synchronization */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* Create new IPC namespace */
+ ret = unshare(CLONE_NEWIPC);
+ if (ret != 0) {
+ /* Skip test if we don't have permission */
+ if (errno == EPERM || errno == EACCES) {
+ write(pipefd[1], "S", 1); /* Signal skip */
+ _exit(0);
+ }
+ _exit(1);
+ }
+
+ /* Signal success */
+ write(pipefd[1], "Y", 1);
+ close(pipefd[1]);
+
+ /* Keep namespace alive */
+ pause();
+ _exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ char buf;
+ ASSERT_EQ(read(pipefd[0], &buf, 1), 1);
+ close(pipefd[0]);
+
+ if (buf == 'S') {
+ /* Child couldn't create namespace, skip test */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+ close(fd_parent_ipcns);
+ SKIP(return, "No permission to create IPC namespace");
+ }
+
+ ASSERT_EQ(buf, 'Y');
+
+ /* Open child's IPC namespace */
+ char path[256];
+ snprintf(path, sizeof(path), "/proc/%d/ns/ipc", pid);
+ fd_child_ipcns = open(path, O_RDONLY);
+ ASSERT_GE(fd_child_ipcns, 0);
+
+ /* Get child's IPC namespace ID */
+ ret = ioctl(fd_child_ipcns, NS_GET_ID, &child_ipc_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(child_ipc_ns_id, 0);
+
+ /* Parent and child should have different IPC namespace IDs */
+ ASSERT_NE(parent_ipc_ns_id, child_ipc_ns_id);
+
+ close(fd_parent_ipcns);
+ close(fd_child_ipcns);
+
+ /* Clean up child process */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+}
+
+TEST(nsid_utsns_basic)
+{
+ __u64 uts_ns_id = 0;
+ int fd_utsns;
+ int ret;
+
+ /* Open the current UTS namespace */
+ fd_utsns = open("/proc/self/ns/uts", O_RDONLY);
+ ASSERT_GE(fd_utsns, 0);
+
+ /* Get the UTS namespace ID */
+ ret = ioctl(fd_utsns, NS_GET_ID, &uts_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(uts_ns_id, 0);
+
+ /* Verify we can get the same ID again */
+ __u64 uts_ns_id2 = 0;
+ ret = ioctl(fd_utsns, NS_GET_ID, &uts_ns_id2);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(uts_ns_id, uts_ns_id2);
+
+ close(fd_utsns);
+}
+
+TEST(nsid_utsns_separate)
+{
+ __u64 parent_uts_ns_id = 0;
+ __u64 child_uts_ns_id = 0;
+ int fd_parent_utsns, fd_child_utsns;
+ int ret;
+ pid_t pid;
+ int pipefd[2];
+
+ /* Get parent's UTS namespace ID */
+ fd_parent_utsns = open("/proc/self/ns/uts", O_RDONLY);
+ ASSERT_GE(fd_parent_utsns, 0);
+ ret = ioctl(fd_parent_utsns, NS_GET_ID, &parent_uts_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(parent_uts_ns_id, 0);
+
+ /* Create a pipe for synchronization */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* Create new UTS namespace */
+ ret = unshare(CLONE_NEWUTS);
+ if (ret != 0) {
+ /* Skip test if we don't have permission */
+ if (errno == EPERM || errno == EACCES) {
+ write(pipefd[1], "S", 1); /* Signal skip */
+ _exit(0);
+ }
+ _exit(1);
+ }
+
+ /* Signal success */
+ write(pipefd[1], "Y", 1);
+ close(pipefd[1]);
+
+ /* Keep namespace alive */
+ pause();
+ _exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ char buf;
+ ASSERT_EQ(read(pipefd[0], &buf, 1), 1);
+ close(pipefd[0]);
+
+ if (buf == 'S') {
+ /* Child couldn't create namespace, skip test */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+ close(fd_parent_utsns);
+ SKIP(return, "No permission to create UTS namespace");
+ }
+
+ ASSERT_EQ(buf, 'Y');
+
+ /* Open child's UTS namespace */
+ char path[256];
+ snprintf(path, sizeof(path), "/proc/%d/ns/uts", pid);
+ fd_child_utsns = open(path, O_RDONLY);
+ ASSERT_GE(fd_child_utsns, 0);
+
+ /* Get child's UTS namespace ID */
+ ret = ioctl(fd_child_utsns, NS_GET_ID, &child_uts_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(child_uts_ns_id, 0);
+
+ /* Parent and child should have different UTS namespace IDs */
+ ASSERT_NE(parent_uts_ns_id, child_uts_ns_id);
+
+ close(fd_parent_utsns);
+ close(fd_child_utsns);
+
+ /* Clean up child process */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+}
+
+TEST(nsid_userns_basic)
+{
+ __u64 user_ns_id = 0;
+ int fd_userns;
+ int ret;
+
+ /* Open the current user namespace */
+ fd_userns = open("/proc/self/ns/user", O_RDONLY);
+ ASSERT_GE(fd_userns, 0);
+
+ /* Get the user namespace ID */
+ ret = ioctl(fd_userns, NS_GET_ID, &user_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(user_ns_id, 0);
+
+ /* Verify we can get the same ID again */
+ __u64 user_ns_id2 = 0;
+ ret = ioctl(fd_userns, NS_GET_ID, &user_ns_id2);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(user_ns_id, user_ns_id2);
+
+ close(fd_userns);
+}
+
+TEST(nsid_userns_separate)
+{
+ __u64 parent_user_ns_id = 0;
+ __u64 child_user_ns_id = 0;
+ int fd_parent_userns, fd_child_userns;
+ int ret;
+ pid_t pid;
+ int pipefd[2];
+
+ /* Get parent's user namespace ID */
+ fd_parent_userns = open("/proc/self/ns/user", O_RDONLY);
+ ASSERT_GE(fd_parent_userns, 0);
+ ret = ioctl(fd_parent_userns, NS_GET_ID, &parent_user_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(parent_user_ns_id, 0);
+
+ /* Create a pipe for synchronization */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* Create new user namespace */
+ ret = unshare(CLONE_NEWUSER);
+ if (ret != 0) {
+ /* Skip test if we don't have permission */
+ if (errno == EPERM || errno == EACCES) {
+ write(pipefd[1], "S", 1); /* Signal skip */
+ _exit(0);
+ }
+ _exit(1);
+ }
+
+ /* Signal success */
+ write(pipefd[1], "Y", 1);
+ close(pipefd[1]);
+
+ /* Keep namespace alive */
+ pause();
+ _exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ char buf;
+ ASSERT_EQ(read(pipefd[0], &buf, 1), 1);
+ close(pipefd[0]);
+
+ if (buf == 'S') {
+ /* Child couldn't create namespace, skip test */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+ close(fd_parent_userns);
+ SKIP(return, "No permission to create user namespace");
+ }
+
+ ASSERT_EQ(buf, 'Y');
+
+ /* Open child's user namespace */
+ char path[256];
+ snprintf(path, sizeof(path), "/proc/%d/ns/user", pid);
+ fd_child_userns = open(path, O_RDONLY);
+ ASSERT_GE(fd_child_userns, 0);
+
+ /* Get child's user namespace ID */
+ ret = ioctl(fd_child_userns, NS_GET_ID, &child_user_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(child_user_ns_id, 0);
+
+ /* Parent and child should have different user namespace IDs */
+ ASSERT_NE(parent_user_ns_id, child_user_ns_id);
+
+ close(fd_parent_userns);
+ close(fd_child_userns);
+
+ /* Clean up child process */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+}
+
+TEST(nsid_timens_basic)
+{
+ __u64 time_ns_id = 0;
+ int fd_timens;
+ int ret;
+
+ /* Open the current time namespace */
+ fd_timens = open("/proc/self/ns/time", O_RDONLY);
+ if (fd_timens < 0) {
+ SKIP(return, "Time namespaces not supported");
+ }
+
+ /* Get the time namespace ID */
+ ret = ioctl(fd_timens, NS_GET_ID, &time_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(time_ns_id, 0);
+
+ /* Verify we can get the same ID again */
+ __u64 time_ns_id2 = 0;
+ ret = ioctl(fd_timens, NS_GET_ID, &time_ns_id2);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(time_ns_id, time_ns_id2);
+
+ close(fd_timens);
+}
+
+TEST(nsid_timens_separate)
+{
+ __u64 parent_time_ns_id = 0;
+ __u64 child_time_ns_id = 0;
+ int fd_parent_timens, fd_child_timens;
+ int ret;
+ pid_t pid;
+ int pipefd[2];
+
+ /* Open the current time namespace */
+ fd_parent_timens = open("/proc/self/ns/time", O_RDONLY);
+ if (fd_parent_timens < 0) {
+ SKIP(return, "Time namespaces not supported");
+ }
+
+ /* Get parent's time namespace ID */
+ ret = ioctl(fd_parent_timens, NS_GET_ID, &parent_time_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(parent_time_ns_id, 0);
+
+ /* Create a pipe for synchronization */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* Create new time namespace */
+ ret = unshare(CLONE_NEWTIME);
+ if (ret != 0) {
+ /* Skip test if we don't have permission */
+ if (errno == EPERM || errno == EACCES || errno == EINVAL) {
+ write(pipefd[1], "S", 1); /* Signal skip */
+ _exit(0);
+ }
+ _exit(1);
+ }
+
+ /* Fork a grandchild to actually enter the new namespace */
+ pid_t grandchild = fork();
+ if (grandchild == 0) {
+			/*
+			 * Grandchild is in the new time namespace. Only the
+			 * child below writes to the pipe; a second writer
+			 * here would race with the "Y" + PID message and
+			 * corrupt the parent's reads.
+			 */
+ close(pipefd[1]);
+ pause();
+ _exit(0);
+ } else if (grandchild > 0) {
+ /* Child writes grandchild PID and waits */
+ write(pipefd[1], "Y", 1);
+ write(pipefd[1], &grandchild, sizeof(grandchild));
+ close(pipefd[1]);
+ pause(); /* Keep the parent alive to maintain the grandchild */
+ _exit(0);
+ } else {
+ _exit(1);
+ }
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ char buf;
+ ASSERT_EQ(read(pipefd[0], &buf, 1), 1);
+
+ if (buf == 'S') {
+ /* Child couldn't create namespace, skip test */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+ close(fd_parent_timens);
+ close(pipefd[0]);
+ SKIP(return, "Cannot create time namespace");
+ }
+
+ ASSERT_EQ(buf, 'Y');
+
+ pid_t grandchild_pid;
+ ASSERT_EQ(read(pipefd[0], &grandchild_pid, sizeof(grandchild_pid)), sizeof(grandchild_pid));
+ close(pipefd[0]);
+
+ /* Open grandchild's time namespace */
+ char path[256];
+ snprintf(path, sizeof(path), "/proc/%d/ns/time", grandchild_pid);
+ fd_child_timens = open(path, O_RDONLY);
+ ASSERT_GE(fd_child_timens, 0);
+
+ /* Get child's time namespace ID */
+ ret = ioctl(fd_child_timens, NS_GET_ID, &child_time_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(child_time_ns_id, 0);
+
+ /* Parent and child should have different time namespace IDs */
+ ASSERT_NE(parent_time_ns_id, child_time_ns_id);
+
+ close(fd_parent_timens);
+ close(fd_child_timens);
+
+	/* Clean up grandchild and child processes */
+	kill(grandchild_pid, SIGTERM);
+	kill(pid, SIGTERM);
+	waitpid(pid, NULL, 0);
+}
+
+TEST(nsid_pidns_basic)
+{
+ __u64 pid_ns_id = 0;
+ int fd_pidns;
+ int ret;
+
+ /* Open the current PID namespace */
+ fd_pidns = open("/proc/self/ns/pid", O_RDONLY);
+ ASSERT_GE(fd_pidns, 0);
+
+ /* Get the PID namespace ID */
+ ret = ioctl(fd_pidns, NS_GET_ID, &pid_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(pid_ns_id, 0);
+
+ /* Verify we can get the same ID again */
+ __u64 pid_ns_id2 = 0;
+ ret = ioctl(fd_pidns, NS_GET_ID, &pid_ns_id2);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(pid_ns_id, pid_ns_id2);
+
+ close(fd_pidns);
+}
+
+TEST(nsid_pidns_separate)
+{
+ __u64 parent_pid_ns_id = 0;
+ __u64 child_pid_ns_id = 0;
+ int fd_parent_pidns, fd_child_pidns;
+ int ret;
+ pid_t pid;
+ int pipefd[2];
+
+ /* Get parent's PID namespace ID */
+ fd_parent_pidns = open("/proc/self/ns/pid", O_RDONLY);
+ ASSERT_GE(fd_parent_pidns, 0);
+ ret = ioctl(fd_parent_pidns, NS_GET_ID, &parent_pid_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(parent_pid_ns_id, 0);
+
+ /* Create a pipe for synchronization */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* Create new PID namespace */
+ ret = unshare(CLONE_NEWPID);
+ if (ret != 0) {
+ /* Skip test if we don't have permission */
+ if (errno == EPERM || errno == EACCES) {
+ write(pipefd[1], "S", 1); /* Signal skip */
+ _exit(0);
+ }
+ _exit(1);
+ }
+
+ /* Fork a grandchild to actually enter the new namespace */
+ pid_t grandchild = fork();
+ if (grandchild == 0) {
+			/*
+			 * Grandchild is in the new PID namespace. Only the
+			 * child below writes to the pipe; a second writer
+			 * here would race with the "Y" + PID message and
+			 * corrupt the parent's reads.
+			 */
+ close(pipefd[1]);
+ pause();
+ _exit(0);
+ } else if (grandchild > 0) {
+ /* Child writes grandchild PID and waits */
+ write(pipefd[1], "Y", 1);
+ write(pipefd[1], &grandchild, sizeof(grandchild));
+ close(pipefd[1]);
+ pause(); /* Keep the parent alive to maintain the grandchild */
+ _exit(0);
+ } else {
+ _exit(1);
+ }
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ char buf;
+ ASSERT_EQ(read(pipefd[0], &buf, 1), 1);
+
+ if (buf == 'S') {
+ /* Child couldn't create namespace, skip test */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+ close(fd_parent_pidns);
+ close(pipefd[0]);
+ SKIP(return, "No permission to create PID namespace");
+ }
+
+ ASSERT_EQ(buf, 'Y');
+
+ pid_t grandchild_pid;
+ ASSERT_EQ(read(pipefd[0], &grandchild_pid, sizeof(grandchild_pid)), sizeof(grandchild_pid));
+ close(pipefd[0]);
+
+ /* Open grandchild's PID namespace */
+ char path[256];
+ snprintf(path, sizeof(path), "/proc/%d/ns/pid", grandchild_pid);
+ fd_child_pidns = open(path, O_RDONLY);
+ ASSERT_GE(fd_child_pidns, 0);
+
+ /* Get child's PID namespace ID */
+ ret = ioctl(fd_child_pidns, NS_GET_ID, &child_pid_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(child_pid_ns_id, 0);
+
+ /* Parent and child should have different PID namespace IDs */
+ ASSERT_NE(parent_pid_ns_id, child_pid_ns_id);
+
+ close(fd_parent_pidns);
+ close(fd_child_pidns);
+
+ /* Clean up child process */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+}
+
+TEST(nsid_netns_basic)
+{
+ __u64 net_ns_id = 0;
+ __u64 netns_cookie = 0;
+ int fd_netns;
+ int sock;
+ socklen_t optlen;
+ int ret;
+
+ /* Open the current network namespace */
+ fd_netns = open("/proc/self/ns/net", O_RDONLY);
+ ASSERT_GE(fd_netns, 0);
+
+ /* Get the network namespace ID via ioctl */
+ ret = ioctl(fd_netns, NS_GET_ID, &net_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(net_ns_id, 0);
+
+ /* Create a socket to get the SO_NETNS_COOKIE */
+ sock = socket(AF_UNIX, SOCK_STREAM, 0);
+ ASSERT_GE(sock, 0);
+
+ /* Get the network namespace cookie via socket option */
+ optlen = sizeof(netns_cookie);
+ ret = getsockopt(sock, SOL_SOCKET, SO_NETNS_COOKIE, &netns_cookie, &optlen);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(optlen, sizeof(netns_cookie));
+
+ /* The namespace ID and cookie should be identical */
+ ASSERT_EQ(net_ns_id, netns_cookie);
+
+ /* Verify we can get the same ID again */
+ __u64 net_ns_id2 = 0;
+ ret = ioctl(fd_netns, NS_GET_ID, &net_ns_id2);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(net_ns_id, net_ns_id2);
+
+ close(sock);
+ close(fd_netns);
+}
+
+TEST(nsid_netns_separate)
+{
+ __u64 parent_net_ns_id = 0;
+ __u64 parent_netns_cookie = 0;
+ __u64 child_net_ns_id = 0;
+ __u64 child_netns_cookie = 0;
+ int fd_parent_netns, fd_child_netns;
+ int parent_sock, child_sock;
+ socklen_t optlen;
+ int ret;
+ pid_t pid;
+ int pipefd[2];
+
+ /* Get parent's network namespace ID */
+ fd_parent_netns = open("/proc/self/ns/net", O_RDONLY);
+ ASSERT_GE(fd_parent_netns, 0);
+ ret = ioctl(fd_parent_netns, NS_GET_ID, &parent_net_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(parent_net_ns_id, 0);
+
+ /* Get parent's network namespace cookie */
+ parent_sock = socket(AF_UNIX, SOCK_STREAM, 0);
+ ASSERT_GE(parent_sock, 0);
+ optlen = sizeof(parent_netns_cookie);
+ ret = getsockopt(parent_sock, SOL_SOCKET, SO_NETNS_COOKIE, &parent_netns_cookie, &optlen);
+ ASSERT_EQ(ret, 0);
+
+ /* Verify parent's ID and cookie match */
+ ASSERT_EQ(parent_net_ns_id, parent_netns_cookie);
+
+ /* Create a pipe for synchronization */
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ close(pipefd[0]);
+
+ /* Create new network namespace */
+ ret = unshare(CLONE_NEWNET);
+ if (ret != 0) {
+ /* Skip test if we don't have permission */
+ if (errno == EPERM || errno == EACCES) {
+ write(pipefd[1], "S", 1); /* Signal skip */
+ _exit(0);
+ }
+ _exit(1);
+ }
+
+ /* Signal success */
+ write(pipefd[1], "Y", 1);
+ close(pipefd[1]);
+
+ /* Keep namespace alive */
+ pause();
+ _exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ char buf;
+ ASSERT_EQ(read(pipefd[0], &buf, 1), 1);
+ close(pipefd[0]);
+
+ if (buf == 'S') {
+ /* Child couldn't create namespace, skip test */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+ close(fd_parent_netns);
+ close(parent_sock);
+ SKIP(return, "No permission to create network namespace");
+ }
+
+ ASSERT_EQ(buf, 'Y');
+
+ /* Open child's network namespace */
+ char path[256];
+ snprintf(path, sizeof(path), "/proc/%d/ns/net", pid);
+ fd_child_netns = open(path, O_RDONLY);
+ ASSERT_GE(fd_child_netns, 0);
+
+ /* Get child's network namespace ID */
+ ret = ioctl(fd_child_netns, NS_GET_ID, &child_net_ns_id);
+ ASSERT_EQ(ret, 0);
+ ASSERT_NE(child_net_ns_id, 0);
+
+ /* Create socket in child's namespace to get cookie */
+ ret = setns(fd_child_netns, CLONE_NEWNET);
+ if (ret == 0) {
+ child_sock = socket(AF_UNIX, SOCK_STREAM, 0);
+ ASSERT_GE(child_sock, 0);
+
+ optlen = sizeof(child_netns_cookie);
+ ret = getsockopt(child_sock, SOL_SOCKET, SO_NETNS_COOKIE, &child_netns_cookie, &optlen);
+ ASSERT_EQ(ret, 0);
+
+ /* Verify child's ID and cookie match */
+ ASSERT_EQ(child_net_ns_id, child_netns_cookie);
+
+ close(child_sock);
+
+ /* Return to parent namespace */
+ setns(fd_parent_netns, CLONE_NEWNET);
+ }
+
+ /* Parent and child should have different network namespace IDs */
+ ASSERT_NE(parent_net_ns_id, child_net_ns_id);
+ if (child_netns_cookie != 0) {
+ ASSERT_NE(parent_netns_cookie, child_netns_cookie);
+ }
+
+ close(fd_parent_netns);
+ close(fd_child_netns);
+ close(parent_sock);
+
+ /* Clean up child process */
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+}
+
+TEST_HARNESS_MAIN
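
For context on what the assertions above rely on, here is a minimal standalone sketch (not part of the selftest) of the NS_GET_ID / SO_NETNS_COOKIE relationship: the ID reported for a network namespace file descriptor should equal the cookie reported for a socket created in that namespace. It assumes uapi headers new enough to provide NS_GET_ID in linux/nsfs.h; the SO_NETNS_COOKIE fallback mirrors the asm-generic definition.

/* Sketch only: compare NS_GET_ID with SO_NETNS_COOKIE for the current netns. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/nsfs.h>
#include <linux/types.h>

#ifndef SO_NETNS_COOKIE
#define SO_NETNS_COOKIE 71	/* asm-generic/socket.h value, fallback only */
#endif

int main(void)
{
	__u64 ns_id = 0, cookie = 0;
	socklen_t optlen = sizeof(cookie);
	int nsfd, sock;

	nsfd = open("/proc/self/ns/net", O_RDONLY);
	if (nsfd < 0 || ioctl(nsfd, NS_GET_ID, &ns_id) < 0) {
		perror("NS_GET_ID");
		return 1;
	}

	sock = socket(AF_UNIX, SOCK_STREAM, 0);
	if (sock < 0 || getsockopt(sock, SOL_SOCKET, SO_NETNS_COOKIE,
				   &cookie, &optlen) < 0) {
		perror("SO_NETNS_COOKIE");
		return 1;
	}

	printf("ns id %llu, cookie %llu, %s\n",
	       (unsigned long long)ns_id, (unsigned long long)cookie,
	       ns_id == cookie ? "match" : "MISMATCH");
	close(sock);
	close(nsfd);
	return 0;
}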
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index b31a71f2b372..2b31d4a93ad7 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -99,6 +99,7 @@ TEST_GEN_PROGS += bind_wildcard
TEST_GEN_PROGS += bind_timewait
TEST_PROGS += test_vxlan_mdb.sh
TEST_PROGS += test_bridge_neigh_suppress.sh
+TEST_PROGS += test_vxlan_nh.sh
TEST_PROGS += test_vxlan_nolocalbypass.sh
TEST_PROGS += test_bridge_backup_port.sh
TEST_PROGS += test_neigh.sh
@@ -115,6 +116,7 @@ TEST_PROGS += skf_net_off.sh
TEST_GEN_FILES += skf_net_off
TEST_GEN_FILES += tfo
TEST_PROGS += tfo_passive.sh
+TEST_PROGS += broadcast_ether_dst.sh
TEST_PROGS += broadcast_pmtu.sh
TEST_PROGS += ipv6_force_forwarding.sh
diff --git a/tools/testing/selftests/net/bind_bhash.c b/tools/testing/selftests/net/bind_bhash.c
index 57ff67a3751e..da04b0b19b73 100644
--- a/tools/testing/selftests/net/bind_bhash.c
+++ b/tools/testing/selftests/net/bind_bhash.c
@@ -75,7 +75,7 @@ static void *setup(void *arg)
int *array = (int *)arg;
for (i = 0; i < MAX_CONNECTIONS; i++) {
- sock_fd = bind_socket(SO_REUSEADDR | SO_REUSEPORT, setup_addr);
+ sock_fd = bind_socket(SO_REUSEPORT, setup_addr);
if (sock_fd < 0) {
ret = sock_fd;
pthread_exit(&ret);
@@ -103,7 +103,7 @@ int main(int argc, const char *argv[])
setup_addr = use_v6 ? setup_addr_v6 : setup_addr_v4;
- listener_fd = bind_socket(SO_REUSEADDR | SO_REUSEPORT, setup_addr);
+ listener_fd = bind_socket(SO_REUSEPORT, setup_addr);
if (listen(listener_fd, 100) < 0) {
perror("listen failed");
return -1;
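
The bind_bhash.c change above binds with SO_REUSEPORT alone rather than SO_REUSEADDR | SO_REUSEPORT. A minimal sketch of that bind pattern follows; the helper name and port are made up for illustration and this is not the selftest's bind_socket().

/* Every socket sets SO_REUSEPORT before bind(), so several sockets can
 * share the same address/port tuple.
 */
#include <netinet/in.h>
#include <stdint.h>
#include <sys/socket.h>
#include <unistd.h>

static int bind_reuseport(uint16_t port)
{
	int one = 1;
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
		.sin_port = htons(port),
	};
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;
	if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)) ||
	    bind(fd, (struct sockaddr *)&addr, sizeof(addr))) {
		close(fd);
		return -1;
	}
	return fd;
}

int main(void)
{
	int a = bind_reuseport(54321);
	int b = bind_reuseport(54321);	/* second bind succeeds thanks to SO_REUSEPORT */

	return (a < 0 || b < 0) ? 1 : 0;
}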
diff --git a/tools/testing/selftests/net/broadcast_ether_dst.sh b/tools/testing/selftests/net/broadcast_ether_dst.sh
new file mode 100755
index 000000000000..334a7eca8a80
--- /dev/null
+++ b/tools/testing/selftests/net/broadcast_ether_dst.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Author: Brett A C Sheffield <bacs@librecast.net>
+# Author: Oscar Maes <oscmaes92@gmail.com>
+#
+# Ensure destination ethernet field is correctly set for
+# broadcast packets
+
+source lib.sh
+
+CLIENT_IP4="192.168.0.1"
+GW_IP4="192.168.0.2"
+
+setup() {
+ setup_ns CLIENT_NS SERVER_NS
+
+ ip -net "${SERVER_NS}" link add link1 type veth \
+ peer name link0 netns "${CLIENT_NS}"
+
+ ip -net "${CLIENT_NS}" link set link0 up
+ ip -net "${CLIENT_NS}" addr add "${CLIENT_IP4}"/24 dev link0
+
+ ip -net "${SERVER_NS}" link set link1 up
+
+ ip -net "${CLIENT_NS}" route add default via "${GW_IP4}"
+ ip netns exec "${CLIENT_NS}" arp -s "${GW_IP4}" 00:11:22:33:44:55
+}
+
+cleanup() {
+ rm -f "${CAPFILE}" "${OUTPUT}"
+ ip -net "${SERVER_NS}" link del link1
+ cleanup_ns "${CLIENT_NS}" "${SERVER_NS}"
+}
+
+test_broadcast_ether_dst() {
+ local rc=0
+ CAPFILE=$(mktemp -u cap.XXXXXXXXXX)
+ OUTPUT=$(mktemp -u out.XXXXXXXXXX)
+
+ echo "Testing ethernet broadcast destination"
+
+ # start tcpdump listening for icmp
+ # tcpdump will exit after receiving a single packet
+ # timeout will kill tcpdump if it is still running after 2s
+ timeout 2s ip netns exec "${CLIENT_NS}" \
+ tcpdump -i link0 -c 1 -w "${CAPFILE}" icmp &> "${OUTPUT}" &
+ pid=$!
+ slowwait 1 grep -qs "listening" "${OUTPUT}"
+
+ # send broadcast ping
+ ip netns exec "${CLIENT_NS}" \
+ ping -W0.01 -c1 -b 255.255.255.255 &> /dev/null
+
+ # wait for tcpdump to exit after receiving the packet
+ wait "${pid}"
+
+ # compare ethernet destination field to ff:ff:ff:ff:ff:ff
+ ether_dst=$(tcpdump -r "${CAPFILE}" -tnne 2>/dev/null | \
+ awk '{sub(/,/,"",$3); print $3}')
+ if [[ "${ether_dst}" == "ff:ff:ff:ff:ff:ff" ]]; then
+ echo "[ OK ]"
+ rc="${ksft_pass}"
+ else
+ echo "[FAIL] expected dst ether addr to be ff:ff:ff:ff:ff:ff," \
+ "got ${ether_dst}"
+ rc="${ksft_fail}"
+ fi
+
+ return "${rc}"
+}
+
+if [ ! -x "$(command -v tcpdump)" ]; then
+ echo "SKIP: Could not run test without tcpdump tool"
+ exit "${ksft_skip}"
+fi
+
+trap cleanup EXIT
+
+setup
+test_broadcast_ether_dst
+
+exit $?
diff --git a/tools/testing/selftests/net/can/config b/tools/testing/selftests/net/can/config
new file mode 100644
index 000000000000..188f79796670
--- /dev/null
+++ b/tools/testing/selftests/net/can/config
@@ -0,0 +1,3 @@
+CONFIG_CAN=m
+CONFIG_CAN_DEV=m
+CONFIG_CAN_VCAN=m
diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh
index b39f748c2572..2b0a90581e2f 100755
--- a/tools/testing/selftests/net/fib_nexthops.sh
+++ b/tools/testing/selftests/net/fib_nexthops.sh
@@ -467,8 +467,8 @@ ipv6_fdb_grp_fcnal()
log_test $? 0 "Get Fdb nexthop group by id"
# fdb nexthop group can only contain fdb nexthops
- run_cmd "$IP nexthop add id 63 via 2001:db8:91::4"
- run_cmd "$IP nexthop add id 64 via 2001:db8:91::5"
+ run_cmd "$IP nexthop add id 63 via 2001:db8:91::4 dev veth1"
+ run_cmd "$IP nexthop add id 64 via 2001:db8:91::5 dev veth1"
run_cmd "$IP nexthop add id 103 group 63/64 fdb"
log_test $? 2 "Fdb Nexthop group with non-fdb nexthops"
@@ -494,6 +494,26 @@ ipv6_fdb_grp_fcnal()
run_cmd "$IP nexthop add id 69 encap mpls 101 via 2001:db8:91::8 dev veth1 fdb"
log_test $? 2 "Fdb Nexthop with encap"
+ # Replace FDB nexthop to non-FDB and vice versa
+ run_cmd "$IP nexthop add id 70 via 2001:db8:91::2 fdb"
+ run_cmd "$IP nexthop replace id 70 via 2001:db8:91::2 dev veth1"
+ log_test $? 0 "Replace FDB nexthop to non-FDB nexthop"
+ run_cmd "$IP nexthop replace id 70 via 2001:db8:91::2 fdb"
+ log_test $? 0 "Replace non-FDB nexthop to FDB nexthop"
+
+ # Replace FDB nexthop address while in a group
+ run_cmd "$IP nexthop add id 71 group 70 fdb"
+ run_cmd "$IP nexthop replace id 70 via 2001:db8:91::3 fdb"
+ log_test $? 0 "Replace FDB nexthop address while in a group"
+
+ # Cannot replace FDB nexthop to non-FDB and vice versa while in a group
+ run_cmd "$IP nexthop replace id 70 via 2001:db8:91::2 dev veth1"
+ log_test $? 2 "Replace FDB nexthop to non-FDB nexthop while in a group"
+ run_cmd "$IP nexthop add id 72 via 2001:db8:91::2 dev veth1"
+ run_cmd "$IP nexthop add id 73 group 72"
+ run_cmd "$IP nexthop replace id 72 via 2001:db8:91::2 fdb"
+ log_test $? 2 "Replace non-FDB nexthop to FDB nexthop while in a group"
+
run_cmd "$IP link add name vx10 type vxlan id 1010 local 2001:db8:91::9 remote 2001:db8:91::10 dstport 4789 nolearning noudpcsum tos inherit ttl 100"
run_cmd "$BRIDGE fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self"
log_test $? 0 "Fdb mac add with nexthop group"
@@ -547,15 +567,15 @@ ipv4_fdb_grp_fcnal()
log_test $? 0 "Get Fdb nexthop group by id"
# fdb nexthop group can only contain fdb nexthops
- run_cmd "$IP nexthop add id 14 via 172.16.1.2"
- run_cmd "$IP nexthop add id 15 via 172.16.1.3"
+ run_cmd "$IP nexthop add id 14 via 172.16.1.2 dev veth1"
+ run_cmd "$IP nexthop add id 15 via 172.16.1.3 dev veth1"
run_cmd "$IP nexthop add id 103 group 14/15 fdb"
log_test $? 2 "Fdb Nexthop group with non-fdb nexthops"
# Non fdb nexthop group can not contain fdb nexthops
run_cmd "$IP nexthop add id 16 via 172.16.1.2 fdb"
run_cmd "$IP nexthop add id 17 via 172.16.1.3 fdb"
- run_cmd "$IP nexthop add id 104 group 14/15"
+ run_cmd "$IP nexthop add id 104 group 16/17"
log_test $? 2 "Non-Fdb Nexthop group with fdb nexthops"
# fdb nexthop cannot have blackhole
@@ -574,6 +594,26 @@ ipv4_fdb_grp_fcnal()
run_cmd "$IP nexthop add id 17 encap mpls 101 via 172.16.1.2 dev veth1 fdb"
log_test $? 2 "Fdb Nexthop with encap"
+ # Replace FDB nexthop to non-FDB and vice versa
+ run_cmd "$IP nexthop add id 18 via 172.16.1.2 fdb"
+ run_cmd "$IP nexthop replace id 18 via 172.16.1.2 dev veth1"
+ log_test $? 0 "Replace FDB nexthop to non-FDB nexthop"
+ run_cmd "$IP nexthop replace id 18 via 172.16.1.2 fdb"
+ log_test $? 0 "Replace non-FDB nexthop to FDB nexthop"
+
+ # Replace FDB nexthop address while in a group
+ run_cmd "$IP nexthop add id 19 group 18 fdb"
+ run_cmd "$IP nexthop replace id 18 via 172.16.1.3 fdb"
+ log_test $? 0 "Replace FDB nexthop address while in a group"
+
+ # Cannot replace FDB nexthop to non-FDB and vice versa while in a group
+ run_cmd "$IP nexthop replace id 18 via 172.16.1.2 dev veth1"
+ log_test $? 2 "Replace FDB nexthop to non-FDB nexthop while in a group"
+ run_cmd "$IP nexthop add id 20 via 172.16.1.2 dev veth1"
+ run_cmd "$IP nexthop add id 21 group 20"
+ run_cmd "$IP nexthop replace id 20 via 172.16.1.2 fdb"
+ log_test $? 2 "Replace non-FDB nexthop to FDB nexthop while in a group"
+
run_cmd "$IP link add name vx10 type vxlan id 1010 local 10.0.0.1 remote 10.0.0.2 dstport 4789 nolearning noudpcsum tos inherit ttl 100"
run_cmd "$BRIDGE fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self"
log_test $? 0 "Fdb mac add with nexthop group"
@@ -582,7 +622,7 @@ ipv4_fdb_grp_fcnal()
run_cmd "$BRIDGE fdb add 02:02:00:00:00:14 dev vx10 nhid 12 self"
log_test $? 255 "Fdb mac add with nexthop"
- run_cmd "$IP ro add 172.16.0.0/22 nhid 15"
+ run_cmd "$IP ro add 172.16.0.0/22 nhid 16"
log_test $? 2 "Route add with fdb nexthop"
run_cmd "$IP ro add 172.16.0.0/22 nhid 103"
diff --git a/tools/testing/selftests/net/forwarding/router.sh b/tools/testing/selftests/net/forwarding/router.sh
index b98ea9449b8b..dfb6646cb97b 100755
--- a/tools/testing/selftests/net/forwarding/router.sh
+++ b/tools/testing/selftests/net/forwarding/router.sh
@@ -18,6 +18,8 @@
# | 2001:db8:1::1/64 2001:db8:2::1/64 |
# | |
# +-----------------------------------------------------------------+
+#
+#shellcheck disable=SC2034 # SC doesn't see our uses of global variables
ALL_TESTS="
ping_ipv4
@@ -27,6 +29,7 @@ ALL_TESTS="
ipv4_sip_equal_dip
ipv6_sip_equal_dip
ipv4_dip_link_local
+ ipv4_sip_link_local
"
NUM_NETIFS=4
@@ -330,6 +333,32 @@ ipv4_dip_link_local()
tc filter del dev $rp2 egress protocol ip pref 1 handle 101 flower
}
+ipv4_sip_link_local()
+{
+ local sip=169.254.1.1
+
+ RET=0
+
+ # Disable rpfilter to prevent packets from being dropped because of it.
+ sysctl_set net.ipv4.conf.all.rp_filter 0
+ sysctl_set net.ipv4.conf."$rp1".rp_filter 0
+
+ tc filter add dev "$rp2" egress protocol ip pref 1 handle 101 \
+ flower src_ip "$sip" action pass
+
+ $MZ "$h1" -t udp "sp=54321,dp=12345" -c 5 -d 1msec -b "$rp1mac" \
+ -A "$sip" -B 198.51.100.2 -q
+
+ tc_check_packets "dev $rp2 egress" 101 5
+ check_err $? "Packets were dropped"
+
+ log_test "IPv4 source IP is link-local"
+
+ tc filter del dev "$rp2" egress protocol ip pref 1 handle 101 flower
+ sysctl_restore net.ipv4.conf."$rp1".rp_filter
+ sysctl_restore net.ipv4.conf.all.rp_filter
+}
+
trap cleanup EXIT
setup_prepare
diff --git a/tools/testing/selftests/net/forwarding/sch_ets.sh b/tools/testing/selftests/net/forwarding/sch_ets.sh
index 1f6f53e284b5..6269d5e23487 100755
--- a/tools/testing/selftests/net/forwarding/sch_ets.sh
+++ b/tools/testing/selftests/net/forwarding/sch_ets.sh
@@ -11,6 +11,7 @@ ALL_TESTS="
ets_test_strict
ets_test_mixed
ets_test_dwrr
+ ets_test_plug
classifier_mode
ets_test_strict
ets_test_mixed
diff --git a/tools/testing/selftests/net/forwarding/sch_ets_tests.sh b/tools/testing/selftests/net/forwarding/sch_ets_tests.sh
index 08240d3e3c87..79d837a2868a 100644
--- a/tools/testing/selftests/net/forwarding/sch_ets_tests.sh
+++ b/tools/testing/selftests/net/forwarding/sch_ets_tests.sh
@@ -224,3 +224,11 @@ ets_test_dwrr()
ets_set_dwrr_two_bands
xfail_on_slow ets_dwrr_test_01
}
+
+ets_test_plug()
+{
+ ets_change_qdisc $put 2 "3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3" "1514 1514"
+ tc qdisc add dev $put handle 20: parent 10:4 plug
+ start_traffic_pktsize 100 $h1.10 192.0.2.1 192.0.2.2 00:c1:a0:c1:a0:00 "-c 1"
+ ets_qdisc_setup $put 2
+}
diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh
index 7a3cb4c09e45..d847ff1737c3 100755
--- a/tools/testing/selftests/net/mptcp/diag.sh
+++ b/tools/testing/selftests/net/mptcp/diag.sh
@@ -28,7 +28,7 @@ flush_pids()
}
# This function is used in the cleanup trap
-#shellcheck disable=SC2317
+#shellcheck disable=SC2317,SC2329
cleanup()
{
ip netns pids "${ns}" | xargs --no-run-if-empty kill -SIGKILL &>/dev/null
diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c
index ac1349c4b9e5..b148cadb96d0 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.c
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c
@@ -183,9 +183,10 @@ static void xgetaddrinfo(const char *node, const char *service,
struct addrinfo *hints,
struct addrinfo **res)
{
-again:
- int err = getaddrinfo(node, service, hints, res);
+ int err;
+again:
+ err = getaddrinfo(node, service, hints, res);
if (err) {
const char *errstr;
@@ -1092,6 +1093,7 @@ int main_loop_s(int listensock)
struct pollfd polls;
socklen_t salen;
int remotesock;
+ int err = 0;
int fd = 0;
again:
@@ -1124,7 +1126,7 @@ again:
SOCK_TEST_TCPULP(remotesock, 0);
memset(&winfo, 0, sizeof(winfo));
- copyfd_io(fd, remotesock, 1, true, &winfo);
+ err = copyfd_io(fd, remotesock, 1, true, &winfo);
} else {
perror("accept");
return 1;
@@ -1133,10 +1135,10 @@ again:
if (cfg_input)
close(fd);
- if (--cfg_repeat > 0)
+ if (!err && --cfg_repeat > 0)
goto again;
- return 0;
+ return err;
}
static void init_rng(void)
@@ -1246,7 +1248,7 @@ void xdisconnect(int fd)
else
xerror("bad family");
- strcpy(cmd, "ss -M | grep -q ");
+ strcpy(cmd, "ss -Mnt | grep -q ");
cmdlen = strlen(cmd);
if (!inet_ntop(addr.ss_family, raw_addr, &cmd[cmdlen],
sizeof(cmd) - cmdlen))
@@ -1256,7 +1258,7 @@ void xdisconnect(int fd)
/*
* wait until the pending data is completely flushed and all
- * the MPTCP sockets reached the closed status.
+ * the sockets reached the closed status.
* disconnect will bypass/ignore/drop any pending data.
*/
for (i = 0; ; i += msec_sleep) {
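
The xgetaddrinfo() hunks above (and the matching ones in mptcp_inq.c and mptcp_sockopt.c further down) hoist the declaration of err above the again: label because, prior to C23, a label must be followed by a statement rather than a declaration. A small sketch of the resulting pattern, with the retry handling simplified for illustration:

/* Declaration first, label before a statement: valid in all C standards. */
#include <netdb.h>
#include <stdio.h>
#include <sys/socket.h>
#include <sys/types.h>

static int resolve(const char *node, const char *service,
		   const struct addrinfo *hints, struct addrinfo **res)
{
	int err;	/* declared before the label ... */

again:
	err = getaddrinfo(node, service, hints, res);	/* ... which now precedes a statement */
	if (err == EAI_AGAIN)
		goto again;	/* illustrative; real code sleeps and bounds the retries */
	return err;
}

int main(void)
{
	struct addrinfo *res = NULL;
	int err = resolve("localhost", "80", NULL, &res);

	if (err)
		fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(err));
	else
		freeaddrinfo(res);
	return err != 0;
}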
diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
index 5e3c56253274..47ecb5b3836e 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
@@ -134,7 +134,7 @@ ns4=""
TEST_GROUP=""
# This function is used in the cleanup trap
-#shellcheck disable=SC2317
+#shellcheck disable=SC2317,SC2329
cleanup()
{
rm -f "$cin_disconnect"
@@ -211,6 +211,11 @@ if $checksum; then
done
fi
+if $capture; then
+ rndh="${ns1:4}"
+ mptcp_lib_pr_info "Packet capture files will have this prefix: ${rndh}-"
+fi
+
set_ethtool_flags() {
local ns="$1"
local dev="$2"
@@ -361,7 +366,6 @@ do_transfer()
if $capture; then
local capuser
- local rndh="${connector_ns:4}"
if [ -z $SUDO_USER ] ; then
capuser=""
else
diff --git a/tools/testing/selftests/net/mptcp/mptcp_inq.c b/tools/testing/selftests/net/mptcp/mptcp_inq.c
index 3cf1e2a612ce..f3bcaa48df8f 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_inq.c
+++ b/tools/testing/selftests/net/mptcp/mptcp_inq.c
@@ -75,9 +75,10 @@ static void xgetaddrinfo(const char *node, const char *service,
struct addrinfo *hints,
struct addrinfo **res)
{
-again:
- int err = getaddrinfo(node, service, hints, res);
+ int err;
+again:
+ err = getaddrinfo(node, service, hints, res);
if (err) {
const char *errstr;
diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index b8af65373b3a..7fd555b123b9 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -8,7 +8,7 @@
# ShellCheck incorrectly believes that most of the code here is unreachable
# because it's invoked by variable name, see how the "tests" array is used
-#shellcheck disable=SC2317
+#shellcheck disable=SC2317,SC2329
. "$(dirname "${0}")/mptcp_lib.sh"
@@ -3842,6 +3842,7 @@ endpoint_tests()
# remove and re-add
if reset_with_events "delete re-add signal" &&
mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then
+ ip netns exec $ns1 sysctl -q net.mptcp.add_addr_timeout=0
pm_nl_set_limits $ns1 0 3
pm_nl_set_limits $ns2 3 3
pm_nl_add_endpoint $ns1 10.0.2.1 id 1 flags signal
diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh
index 09cd24b2ae46..d62e653d48b0 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh
@@ -384,7 +384,7 @@ mptcp_lib_make_file() {
mptcp_lib_print_file_err() {
ls -l "${1}" 1>&2
echo "Trailing bytes are: "
- tail -c 27 "${1}"
+ tail -c 32 "${1}" | od -x | head -n2
}
# $1: input file ; $2: output file ; $3: what kind of file
diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c
index 9934a68df237..112c07c4c37a 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c
+++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c
@@ -162,9 +162,10 @@ static void xgetaddrinfo(const char *node, const char *service,
struct addrinfo *hints,
struct addrinfo **res)
{
-again:
- int err = getaddrinfo(node, service, hints, res);
+ int err;
+again:
+ err = getaddrinfo(node, service, hints, res);
if (err) {
const char *errstr;
@@ -666,22 +667,26 @@ static void process_one_client(int fd, int pipefd)
do_getsockopts(&s, fd, ret, ret2);
if (s.mptcpi_rcv_delta != (uint64_t)ret + 1)
- xerror("mptcpi_rcv_delta %" PRIu64 ", expect %" PRIu64, s.mptcpi_rcv_delta, ret + 1, s.mptcpi_rcv_delta - ret);
+ xerror("mptcpi_rcv_delta %" PRIu64 ", expect %" PRIu64 ", diff %" PRId64,
+ s.mptcpi_rcv_delta, ret + 1, s.mptcpi_rcv_delta - (ret + 1));
/* be nice when running on top of older kernel */
if (s.pkt_stats_avail) {
if (s.last_sample.mptcpi_bytes_sent != ret2)
- xerror("mptcpi_bytes_sent %" PRIu64 ", expect %" PRIu64,
+ xerror("mptcpi_bytes_sent %" PRIu64 ", expect %" PRIu64
+ ", diff %" PRId64,
s.last_sample.mptcpi_bytes_sent, ret2,
s.last_sample.mptcpi_bytes_sent - ret2);
if (s.last_sample.mptcpi_bytes_received != ret)
- xerror("mptcpi_bytes_received %" PRIu64 ", expect %" PRIu64,
+ xerror("mptcpi_bytes_received %" PRIu64 ", expect %" PRIu64
+ ", diff %" PRId64,
s.last_sample.mptcpi_bytes_received, ret,
s.last_sample.mptcpi_bytes_received - ret);
if (s.last_sample.mptcpi_bytes_acked != ret)
- xerror("mptcpi_bytes_acked %" PRIu64 ", expect %" PRIu64,
- s.last_sample.mptcpi_bytes_acked, ret2,
- s.last_sample.mptcpi_bytes_acked - ret2);
+ xerror("mptcpi_bytes_acked %" PRIu64 ", expect %" PRIu64
+ ", diff %" PRId64,
+ s.last_sample.mptcpi_bytes_acked, ret,
+ s.last_sample.mptcpi_bytes_acked - ret);
}
close(fd);
diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh
index 418a903c3a4d..f01989be6e9b 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh
@@ -95,7 +95,7 @@ init()
}
# This function is used in the cleanup trap
-#shellcheck disable=SC2317
+#shellcheck disable=SC2317,SC2329
cleanup()
{
mptcp_lib_ns_exit "${ns1}" "${ns2}" "${ns_sbox}"
diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh
index 2e6648a2b2c0..ec6a87588191 100755
--- a/tools/testing/selftests/net/mptcp/pm_netlink.sh
+++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh
@@ -32,7 +32,7 @@ ns1=""
err=$(mktemp)
# This function is used in the cleanup trap
-#shellcheck disable=SC2317
+#shellcheck disable=SC2317,SC2329
cleanup()
{
rm -f "${err}"
@@ -70,8 +70,9 @@ format_endpoints() {
mptcp_lib_pm_nl_format_endpoints "${@}"
}
+# This function is invoked indirectly
+#shellcheck disable=SC2317,SC2329
get_endpoint() {
- # shellcheck disable=SC2317 # invoked indirectly
mptcp_lib_pm_nl_get_endpoint "${ns1}" "${@}"
}
@@ -198,6 +199,7 @@ set_limits 1 9 2>/dev/null
check "get_limits" "${default_limits}" "subflows above hard limit"
set_limits 8 8
+flush_endpoint ## to make sure it doesn't affect the limits
check "get_limits" "$(format_limits 8 8)" "set limits"
flush_endpoint
diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c
index 994a556f46c1..93fea3442216 100644
--- a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c
+++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c
@@ -188,6 +188,13 @@ static int capture_events(int fd, int event_group)
fprintf(stderr, ",error:%u", *(__u8 *)RTA_DATA(attrs));
else if (attrs->rta_type == MPTCP_ATTR_SERVER_SIDE)
fprintf(stderr, ",server_side:%u", *(__u8 *)RTA_DATA(attrs));
+ else if (attrs->rta_type == MPTCP_ATTR_FLAGS) {
+ __u16 flags = *(__u16 *)RTA_DATA(attrs);
+
+ /* only print when present, easier */
+ if (flags & MPTCP_PM_EV_FLAG_DENY_JOIN_ID0)
+ fprintf(stderr, ",deny_join_id0:1");
+ }
attrs = RTA_NEXT(attrs, msg_len);
}
diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh
index 2329c2f8519b..1903e8e84a31 100755
--- a/tools/testing/selftests/net/mptcp/simult_flows.sh
+++ b/tools/testing/selftests/net/mptcp/simult_flows.sh
@@ -35,7 +35,7 @@ usage() {
}
# This function is used in the cleanup trap
-#shellcheck disable=SC2317
+#shellcheck disable=SC2317,SC2329
cleanup()
{
rm -f "$cout" "$sout"
diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh
index 333064b0b5ac..3d45991f24ed 100755
--- a/tools/testing/selftests/net/mptcp/userspace_pm.sh
+++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh
@@ -94,7 +94,7 @@ test_fail()
}
# This function is used in the cleanup trap
-#shellcheck disable=SC2317
+#shellcheck disable=SC2317,SC2329
cleanup()
{
print_title "Cleanup"
@@ -201,6 +201,9 @@ make_connection()
is_v6="v4"
fi
+ # set this on the client side only: will not affect the rest
+ ip netns exec "$ns2" sysctl -q net.mptcp.allow_join_initial_addr_port=0
+
:>"$client_evts"
:>"$server_evts"
@@ -223,23 +226,28 @@ make_connection()
local client_token
local client_port
local client_serverside
+ local client_nojoin
local server_token
local server_serverside
+ local server_nojoin
client_token=$(mptcp_lib_evts_get_info token "$client_evts")
client_port=$(mptcp_lib_evts_get_info sport "$client_evts")
client_serverside=$(mptcp_lib_evts_get_info server_side "$client_evts")
+ client_nojoin=$(mptcp_lib_evts_get_info deny_join_id0 "$client_evts")
server_token=$(mptcp_lib_evts_get_info token "$server_evts")
server_serverside=$(mptcp_lib_evts_get_info server_side "$server_evts")
+ server_nojoin=$(mptcp_lib_evts_get_info deny_join_id0 "$server_evts")
print_test "Established IP${is_v6} MPTCP Connection ns2 => ns1"
- if [ "$client_token" != "" ] && [ "$server_token" != "" ] && [ "$client_serverside" = 0 ] &&
- [ "$server_serverside" = 1 ]
+ if [ "${client_token}" != "" ] && [ "${server_token}" != "" ] &&
+ [ "${client_serverside}" = 0 ] && [ "${server_serverside}" = 1 ] &&
+ [ "${client_nojoin:-0}" = 0 ] && [ "${server_nojoin:-0}" = 1 ]
then
test_pass
print_title "Connection info: ${client_addr}:${client_port} -> ${connect_addr}:${app_port}"
else
- test_fail "Expected tokens (c:${client_token} - s:${server_token}) and server (c:${client_serverside} - s:${server_serverside})"
+ test_fail "Expected tokens (c:${client_token} - s:${server_token}), server (c:${client_serverside} - s:${server_serverside}), nojoin (c:${client_nojoin} - s:${server_nojoin})"
mptcp_lib_result_print_all_tap
exit ${KSFT_FAIL}
fi
diff --git a/tools/testing/selftests/net/netfilter/conntrack_clash.sh b/tools/testing/selftests/net/netfilter/conntrack_clash.sh
index 606a43a60f73..7fc6c5dbd551 100755
--- a/tools/testing/selftests/net/netfilter/conntrack_clash.sh
+++ b/tools/testing/selftests/net/netfilter/conntrack_clash.sh
@@ -99,7 +99,7 @@ run_one_clash_test()
local entries
local cre
- if ! ip netns exec "$ns" ./udpclash $daddr $dport;then
+ if ! ip netns exec "$ns" timeout 30 ./udpclash $daddr $dport;then
echo "INFO: did not receive expected number of replies for $daddr:$dport"
ip netns exec "$ctns" conntrack -S
# don't fail: check if clash resolution triggered after all.
diff --git a/tools/testing/selftests/net/netfilter/conntrack_resize.sh b/tools/testing/selftests/net/netfilter/conntrack_resize.sh
index 788cd56ea4a0..615fe3c6f405 100755
--- a/tools/testing/selftests/net/netfilter/conntrack_resize.sh
+++ b/tools/testing/selftests/net/netfilter/conntrack_resize.sh
@@ -187,7 +187,7 @@ ct_udpclash()
[ -x udpclash ] || return
while [ $now -lt $end ]; do
- ip netns exec "$ns" ./udpclash 127.0.0.1 $((RANDOM%65536)) > /dev/null 2>&1
+ ip netns exec "$ns" timeout 30 ./udpclash 127.0.0.1 $((RANDOM%65536)) > /dev/null 2>&1
now=$(date +%s)
done
@@ -277,6 +277,7 @@ check_taint()
insert_flood()
{
local n="$1"
+ local timeout="$2"
local r=0
r=$((RANDOM%$insert_count))
@@ -302,7 +303,7 @@ test_floodresize_all()
read tainted_then < /proc/sys/kernel/tainted
for n in "$nsclient1" "$nsclient2";do
- insert_flood "$n" &
+ insert_flood "$n" "$timeout" &
done
# resize table constantly while flood/insert/dump/flushs
diff --git a/tools/testing/selftests/net/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh
index a4ee5496f2a1..45832df98295 100755
--- a/tools/testing/selftests/net/netfilter/nft_flowtable.sh
+++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh
@@ -20,6 +20,7 @@ ret=0
SOCAT_TIMEOUT=60
nsin=""
+nsin_small=""
ns1out=""
ns2out=""
@@ -36,7 +37,7 @@ cleanup() {
cleanup_all_ns
- rm -f "$nsin" "$ns1out" "$ns2out"
+ rm -f "$nsin" "$nsin_small" "$ns1out" "$ns2out"
[ "$log_netns" -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns="$log_netns"
}
@@ -72,6 +73,7 @@ lmtu=1500
rmtu=2000
filesize=$((2 * 1024 * 1024))
+filesize_small=$((filesize / 16))
usage(){
echo "nft_flowtable.sh [OPTIONS]"
@@ -89,7 +91,10 @@ do
o) omtu=$OPTARG;;
l) lmtu=$OPTARG;;
r) rmtu=$OPTARG;;
- s) filesize=$OPTARG;;
+ s)
+ filesize=$OPTARG
+ filesize_small=$((OPTARG / 16))
+ ;;
*) usage;;
esac
done
@@ -215,6 +220,7 @@ if ! ip netns exec "$ns2" ping -c 1 -q 10.0.1.99 > /dev/null; then
fi
nsin=$(mktemp)
+nsin_small=$(mktemp)
ns1out=$(mktemp)
ns2out=$(mktemp)
@@ -265,6 +271,7 @@ check_counters()
check_dscp()
{
local what=$1
+ local pmtud="$2"
local ok=1
local counter
@@ -277,37 +284,39 @@ check_dscp()
local pc4z=${counter%*bytes*}
local pc4z=${pc4z#*packets}
+ local failmsg="FAIL: pmtu $pmtu: $what counters do not match, expected"
+
case "$what" in
"dscp_none")
if [ "$pc4" -gt 0 ] || [ "$pc4z" -eq 0 ]; then
- echo "FAIL: dscp counters do not match, expected dscp3 == 0, dscp0 > 0, but got $pc4,$pc4z" 1>&2
+ echo "$failmsg dscp3 == 0, dscp0 > 0, but got $pc4,$pc4z" 1>&2
ret=1
ok=0
fi
;;
"dscp_fwd")
if [ "$pc4" -eq 0 ] || [ "$pc4z" -eq 0 ]; then
- echo "FAIL: dscp counters do not match, expected dscp3 and dscp0 > 0 but got $pc4,$pc4z" 1>&2
+ echo "$failmsg dscp3 and dscp0 > 0 but got $pc4,$pc4z" 1>&2
ret=1
ok=0
fi
;;
"dscp_ingress")
if [ "$pc4" -eq 0 ] || [ "$pc4z" -gt 0 ]; then
- echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2
+ echo "$failmsg dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2
ret=1
ok=0
fi
;;
"dscp_egress")
if [ "$pc4" -eq 0 ] || [ "$pc4z" -gt 0 ]; then
- echo "FAIL: dscp counters do not match, expected dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2
+ echo "$failmsg dscp3 > 0, dscp0 == 0 but got $pc4,$pc4z" 1>&2
ret=1
ok=0
fi
;;
*)
- echo "FAIL: Unknown DSCP check" 1>&2
+ echo "$failmsg: Unknown DSCP check" 1>&2
ret=1
ok=0
esac
@@ -319,9 +328,9 @@ check_dscp()
check_transfer()
{
- in=$1
- out=$2
- what=$3
+ local in=$1
+ local out=$2
+ local what=$3
if ! cmp "$in" "$out" > /dev/null 2>&1; then
echo "FAIL: file mismatch for $what" 1>&2
@@ -342,25 +351,39 @@ test_tcp_forwarding_ip()
{
local nsa=$1
local nsb=$2
- local dstip=$3
- local dstport=$4
+ local pmtu=$3
+ local dstip=$4
+ local dstport=$5
local lret=0
+ local socatc
+ local socatl
+ local infile="$nsin"
+
+ if [ $pmtu -eq 0 ]; then
+ infile="$nsin_small"
+ fi
- timeout "$SOCAT_TIMEOUT" ip netns exec "$nsb" socat -4 TCP-LISTEN:12345,reuseaddr STDIO < "$nsin" > "$ns2out" &
+ timeout "$SOCAT_TIMEOUT" ip netns exec "$nsb" socat -4 TCP-LISTEN:12345,reuseaddr STDIO < "$infile" > "$ns2out" &
lpid=$!
busywait 1000 listener_ready
- timeout "$SOCAT_TIMEOUT" ip netns exec "$nsa" socat -4 TCP:"$dstip":"$dstport" STDIO < "$nsin" > "$ns1out"
+ timeout "$SOCAT_TIMEOUT" ip netns exec "$nsa" socat -4 TCP:"$dstip":"$dstport" STDIO < "$infile" > "$ns1out"
+ socatc=$?
wait $lpid
+ socatl=$?
- if ! check_transfer "$nsin" "$ns2out" "ns1 -> ns2"; then
+ if [ $socatl -ne 0 ] || [ $socatc -ne 0 ];then
+ rc=1
+ fi
+
+ if ! check_transfer "$infile" "$ns2out" "ns1 -> ns2"; then
lret=1
ret=1
fi
- if ! check_transfer "$nsin" "$ns1out" "ns1 <- ns2"; then
+ if ! check_transfer "$infile" "$ns1out" "ns1 <- ns2"; then
lret=1
ret=1
fi
@@ -370,14 +393,16 @@ test_tcp_forwarding_ip()
test_tcp_forwarding()
{
- test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345
+ local pmtu="$3"
+
+ test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.0.2.99 12345
return $?
}
test_tcp_forwarding_set_dscp()
{
- check_dscp "dscp_none"
+ local pmtu="$3"
ip netns exec "$nsr1" nft -f - <<EOF
table netdev dscpmangle {
@@ -388,8 +413,8 @@ table netdev dscpmangle {
}
EOF
if [ $? -eq 0 ]; then
- test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345
- check_dscp "dscp_ingress"
+ test_tcp_forwarding_ip "$1" "$2" "$3" 10.0.2.99 12345
+ check_dscp "dscp_ingress" "$pmtu"
ip netns exec "$nsr1" nft delete table netdev dscpmangle
else
@@ -405,10 +430,10 @@ table netdev dscpmangle {
}
EOF
if [ $? -eq 0 ]; then
- test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345
- check_dscp "dscp_egress"
+ test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.0.2.99 12345
+ check_dscp "dscp_egress" "$pmtu"
- ip netns exec "$nsr1" nft flush table netdev dscpmangle
+ ip netns exec "$nsr1" nft delete table netdev dscpmangle
else
echo "SKIP: Could not load netdev:egress for veth1"
fi
@@ -416,48 +441,53 @@ fi
# partial. If flowtable really works, then both dscp-is-0 and dscp-is-cs3
# counters should have seen packets (before and after ft offload kicks in).
ip netns exec "$nsr1" nft -a insert rule inet filter forward ip dscp set cs3
- test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345
- check_dscp "dscp_fwd"
+ test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.0.2.99 12345
+ check_dscp "dscp_fwd" "$pmtu"
}
test_tcp_forwarding_nat()
{
+ local nsa="$1"
+ local nsb="$2"
+ local pmtu="$3"
+ local what="$4"
local lret
- local pmtu
- test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345
- lret=$?
+ [ "$pmtu" -eq 0 ] && what="$what (pmtu disabled)"
- pmtu=$3
- what=$4
+ test_tcp_forwarding_ip "$nsa" "$nsb" "$pmtu" 10.0.2.99 12345
+ lret=$?
if [ "$lret" -eq 0 ] ; then
if [ "$pmtu" -eq 1 ] ;then
- check_counters "flow offload for ns1/ns2 with masquerade and pmtu discovery $what"
+ check_counters "flow offload for ns1/ns2 with masquerade $what"
else
echo "PASS: flow offload for ns1/ns2 with masquerade $what"
fi
- test_tcp_forwarding_ip "$1" "$2" 10.6.6.6 1666
+ test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.6.6.6 1666
lret=$?
if [ "$pmtu" -eq 1 ] ;then
- check_counters "flow offload for ns1/ns2 with dnat and pmtu discovery $what"
+ check_counters "flow offload for ns1/ns2 with dnat $what"
elif [ "$lret" -eq 0 ] ; then
echo "PASS: flow offload for ns1/ns2 with dnat $what"
fi
+ else
+ echo "FAIL: flow offload for ns1/ns2 with dnat $what"
fi
return $lret
}
make_file "$nsin" "$filesize"
+make_file "$nsin_small" "$filesize_small"
# First test:
# No PMTU discovery, nsr1 is expected to fragment packets from ns1 to ns2 as needed.
# Due to MTU mismatch in both directions, all packets (except small packets like pure
# acks) have to be handled by normal forwarding path. Therefore, packet counters
# are not checked.
-if test_tcp_forwarding "$ns1" "$ns2"; then
+if test_tcp_forwarding "$ns1" "$ns2" 0; then
echo "PASS: flow offloaded for ns1/ns2"
else
echo "FAIL: flow offload for ns1/ns2:" 1>&2
@@ -489,8 +519,9 @@ table ip nat {
}
EOF
+check_dscp "dscp_none" "0"
if ! test_tcp_forwarding_set_dscp "$ns1" "$ns2" 0 ""; then
- echo "FAIL: flow offload for ns1/ns2 with dscp update" 1>&2
+ echo "FAIL: flow offload for ns1/ns2 with dscp update and no pmtu discovery" 1>&2
exit 0
fi
@@ -513,6 +544,14 @@ ip netns exec "$ns2" sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null
# For earlier tests (large mtus), packets cannot be handled via flowtable
# (except pure acks and other small packets).
ip netns exec "$nsr1" nft reset counters table inet filter >/dev/null
+ip netns exec "$ns2" nft reset counters table inet filter >/dev/null
+
+if ! test_tcp_forwarding_set_dscp "$ns1" "$ns2" 1 ""; then
+ echo "FAIL: flow offload for ns1/ns2 with dscp update and pmtu discovery" 1>&2
+ exit 0
+fi
+
+ip netns exec "$nsr1" nft reset counters table inet filter >/dev/null
if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 ""; then
echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2
@@ -644,7 +683,7 @@ ip -net "$ns2" route del 192.168.10.1 via 10.0.2.1
ip -net "$ns2" route add default via 10.0.2.1
ip -net "$ns2" route add default via dead:2::1
-if test_tcp_forwarding "$ns1" "$ns2"; then
+if test_tcp_forwarding "$ns1" "$ns2" 1; then
check_counters "ipsec tunnel mode for ns1/ns2"
else
echo "FAIL: ipsec tunnel mode for ns1/ns2"
@@ -668,7 +707,7 @@ if [ "$1" = "" ]; then
fi
echo "re-run with random mtus and file size: -o $o -l $l -r $r -s $filesize"
- $0 -o "$o" -l "$l" -r "$r" -s "$filesize"
+ $0 -o "$o" -l "$l" -r "$r" -s "$filesize" || ret=1
fi
exit $ret
diff --git a/tools/testing/selftests/net/netfilter/udpclash.c b/tools/testing/selftests/net/netfilter/udpclash.c
index 85c7b906ad08..79de163d61ab 100644
--- a/tools/testing/selftests/net/netfilter/udpclash.c
+++ b/tools/testing/selftests/net/netfilter/udpclash.c
@@ -29,7 +29,7 @@ struct thread_args {
int sockfd;
};
-static int wait = 1;
+static volatile int wait = 1;
static void *thread_main(void *varg)
{
diff --git a/tools/testing/selftests/net/openvswitch/openvswitch.sh b/tools/testing/selftests/net/openvswitch/openvswitch.sh
index 3c8d3455d8e7..b327d3061ed5 100755
--- a/tools/testing/selftests/net/openvswitch/openvswitch.sh
+++ b/tools/testing/selftests/net/openvswitch/openvswitch.sh
@@ -25,6 +25,7 @@ tests="
nat_related_v4 ip4-nat-related: ICMP related matches work with SNAT
netlink_checks ovsnl: validate netlink attrs and settings
upcall_interfaces ovs: test the upcall interfaces
+ tunnel_metadata ovs: test extraction of tunnel metadata
drop_reason drop: test drop reasons are emitted
psample psample: Sampling packets with psample"
@@ -113,13 +114,13 @@ ovs_add_dp () {
}
ovs_add_if () {
- info "Adding IF to DP: br:$2 if:$3"
- if [ "$4" != "-u" ]; then
- ovs_sbx "$1" python3 $ovs_base/ovs-dpctl.py add-if "$2" "$3" \
- || return 1
+ info "Adding IF to DP: br:$3 if:$4 ($2)"
+ if [ "$5" != "-u" ]; then
+ ovs_sbx "$1" python3 $ovs_base/ovs-dpctl.py add-if \
+ -t "$2" "$3" "$4" || return 1
else
python3 $ovs_base/ovs-dpctl.py add-if \
- -u "$2" "$3" >$ovs_dir/$3.out 2>$ovs_dir/$3.err &
+ -u -t "$2" "$3" "$4" >$ovs_dir/$4.out 2>$ovs_dir/$4.err &
pid=$!
on_exit "ovs_sbx $1 kill -TERM $pid 2>/dev/null"
fi
@@ -166,9 +167,9 @@ ovs_add_netns_and_veths () {
fi
if [ "$7" != "-u" ]; then
- ovs_add_if "$1" "$2" "$4" || return 1
+ ovs_add_if "$1" "netdev" "$2" "$4" || return 1
else
- ovs_add_if "$1" "$2" "$4" -u || return 1
+ ovs_add_if "$1" "netdev" "$2" "$4" -u || return 1
fi
if [ $TRACING -eq 1 ]; then
@@ -756,6 +757,79 @@ test_upcall_interfaces() {
return 0
}
+ovs_add_kernel_tunnel() {
+ local sbxname=$1; shift
+ local ns=$1; shift
+ local tnl_type=$1; shift
+ local name=$1; shift
+ local addr=$1; shift
+
+ info "setting up kernel ${tnl_type} tunnel ${name}"
+ ovs_sbx "${sbxname}" ip -netns ${ns} link add dev ${name} type ${tnl_type} $* || return 1
+ on_exit "ovs_sbx ${sbxname} ip -netns ${ns} link del ${name} >/dev/null 2>&1"
+ ovs_sbx "${sbxname}" ip -netns ${ns} addr add dev ${name} ${addr} || return 1
+ ovs_sbx "${sbxname}" ip -netns ${ns} link set dev ${name} mtu 1450 up || return 1
+}
+
+test_tunnel_metadata() {
+ which arping >/dev/null 2>&1 || return $ksft_skip
+
+ sbxname="test_tunnel_metadata"
+ sbx_add "${sbxname}" || return 1
+
+ info "setting up new DP"
+ ovs_add_dp "${sbxname}" tdp0 -V 2:1 || return 1
+
+ ovs_add_netns_and_veths "${sbxname}" tdp0 tns left0 l0 \
+ 172.31.110.1/24 || return 1
+
+ info "removing veth interface from openvswitch and setting IP"
+ ovs_del_if "${sbxname}" tdp0 left0 || return 1
+ ovs_sbx "${sbxname}" ip addr add 172.31.110.2/24 dev left0 || return 1
+ ovs_sbx "${sbxname}" ip link set left0 up || return 1
+
+ info "setting up tunnel port in openvswitch"
+ ovs_add_if "${sbxname}" "vxlan" tdp0 ovs-vxlan0 -u || return 1
+ on_exit "ovs_sbx ${sbxname} ip link del ovs-vxlan0"
+ ovs_wait ip link show ovs-vxlan0 &>/dev/null || return 1
+ ovs_sbx "${sbxname}" ip link set ovs-vxlan0 up || return 1
+
+ configs=$(echo '
+ 1 172.31.221.1/24 1155332 32 set udpcsum flags\(df\|csum\)
+ 2 172.31.222.1/24 1234567 45 set noudpcsum flags\(df\)
+ 3 172.31.223.1/24 1020304 23 unset udpcsum flags\(csum\)
+ 4 172.31.224.1/24 1357986 15 unset noudpcsum' | sed '/^$/d')
+
+ while read -r i addr id ttl df csum flags; do
+ ovs_add_kernel_tunnel "${sbxname}" tns vxlan vxlan${i} ${addr} \
+ remote 172.31.110.2 id ${id} dstport 4789 \
+ ttl ${ttl} df ${df} ${csum} || return 1
+ done <<< "${configs}"
+
+ ovs_wait grep -q 'listening on upcall packet handler' \
+ ${ovs_dir}/ovs-vxlan0.out || return 1
+
+ info "sending arping"
+ for i in 1 2 3 4; do
+ ovs_sbx "${sbxname}" ip netns exec tns \
+ arping -I vxlan${i} 172.31.22${i}.2 -c 1 \
+ >${ovs_dir}/arping.stdout 2>${ovs_dir}/arping.stderr
+ done
+
+ info "checking that received decapsulated packets carry correct metadata"
+ while read -r i addr id ttl df csum flags; do
+ arp_hdr="arp\\(sip=172.31.22${i}.1,tip=172.31.22${i}.2,op=1,sha="
+ addrs="src=172.31.110.1,dst=172.31.110.2"
+ ports="tp_src=[0-9]*,tp_dst=4789"
+ tnl_md="tunnel\\(tun_id=${id},${addrs},ttl=${ttl},${ports},${flags}\\)"
+
+ ovs_sbx "${sbxname}" grep -qE "MISS upcall.*${tnl_md}.*${arp_hdr}" \
+ ${ovs_dir}/ovs-vxlan0.out || return 1
+ done <<< "${configs}"
+
+ return 0
+}
+
run_test() {
(
tname="$1"
diff --git a/tools/testing/selftests/net/packetdrill/tcp_fastopen_server_reset-after-disconnect.pkt b/tools/testing/selftests/net/packetdrill/tcp_fastopen_server_reset-after-disconnect.pkt
new file mode 100644
index 000000000000..26794e7ddfd5
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_fastopen_server_reset-after-disconnect.pkt
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
+`./defaults.sh
+ ./set_sysctls.py /proc/sys/net/ipv4/tcp_fastopen=0x602 /proc/sys/net/ipv4/tcp_timestamps=0`
+
+ 0 socket(..., SOCK_STREAM|SOCK_NONBLOCK, IPPROTO_TCP) = 3
+ +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+ +0 bind(3, ..., ...) = 0
+ +0 listen(3, 1) = 0
+
+ +0 < S 0:10(10) win 32792 <mss 1460,nop,nop,sackOK>
+ +0 > S. 0:0(0) ack 11 win 65535 <mss 1460,nop,nop,sackOK>
+
+// sk->sk_state is TCP_SYN_RECV
+ +.1 accept(3, ..., ...) = 4
+
+// tcp_disconnect() sets sk->sk_state to TCP_CLOSE
+ +0 connect(4, AF_UNSPEC, ...) = 0
+ +0 > R. 1:1(0) ack 11 win 65535
+
+// connect() sets sk->sk_state to TCP_SYN_SENT
+ +0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+ +0 connect(4, ..., ...) = -1 EINPROGRESS (Operation is now in progress)
+ +0 > S 0:0(0) win 65535 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+
+// tp->fastopen_rsk must be NULL
+ +1 > S 0:0(0) win 65535 <mss 1460,nop,nop,sackOK,nop,wscale 8>
diff --git a/tools/testing/selftests/net/test_vxlan_nh.sh b/tools/testing/selftests/net/test_vxlan_nh.sh
new file mode 100755
index 000000000000..20f3369f776b
--- /dev/null
+++ b/tools/testing/selftests/net/test_vxlan_nh.sh
@@ -0,0 +1,223 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source lib.sh
+TESTS="
+ basic_tx_ipv4
+ basic_tx_ipv6
+ learning
+ proxy_ipv4
+ proxy_ipv6
+"
+VERBOSE=0
+
+################################################################################
+# Utilities
+
+run_cmd()
+{
+ local cmd="$1"
+ local out
+ local stderr="2>/dev/null"
+
+ if [ "$VERBOSE" = "1" ]; then
+ echo "COMMAND: $cmd"
+ stderr=
+ fi
+
+ out=$(eval "$cmd" "$stderr")
+ rc=$?
+ if [ "$VERBOSE" -eq 1 ] && [ -n "$out" ]; then
+ echo " $out"
+ fi
+
+ return $rc
+}
+
+################################################################################
+# Cleanup
+
+exit_cleanup_all()
+{
+ cleanup_all_ns
+ exit "${EXIT_STATUS}"
+}
+
+################################################################################
+# Tests
+
+nh_stats_get()
+{
+ ip -n "$ns1" -s -j nexthop show id 10 | jq ".[][\"group_stats\"][][\"packets\"]"
+}
+
+tc_stats_get()
+{
+ tc_rule_handle_stats_get "dev dummy1 egress" 101 ".packets" "-n $ns1"
+}
+
+basic_tx_common()
+{
+ local af_str=$1; shift
+ local proto=$1; shift
+ local local_addr=$1; shift
+ local plen=$1; shift
+ local remote_addr=$1; shift
+
+ RET=0
+
+ # Test basic Tx functionality. Check that stats are incremented on
+ # both the FDB nexthop group and the egress device.
+
+ run_cmd "ip -n $ns1 link add name dummy1 up type dummy"
+ run_cmd "ip -n $ns1 route add $remote_addr/$plen dev dummy1"
+ run_cmd "tc -n $ns1 qdisc add dev dummy1 clsact"
+ run_cmd "tc -n $ns1 filter add dev dummy1 egress proto $proto pref 1 handle 101 flower ip_proto udp dst_ip $remote_addr dst_port 4789 action pass"
+
+ run_cmd "ip -n $ns1 address add $local_addr/$plen dev lo"
+
+ run_cmd "ip -n $ns1 nexthop add id 1 via $remote_addr fdb"
+ run_cmd "ip -n $ns1 nexthop add id 10 group 1 fdb"
+
+ run_cmd "ip -n $ns1 link add name vx0 up type vxlan id 10010 local $local_addr dstport 4789"
+ run_cmd "bridge -n $ns1 fdb add 00:11:22:33:44:55 dev vx0 self static nhid 10"
+
+ run_cmd "ip netns exec $ns1 mausezahn vx0 -a own -b 00:11:22:33:44:55 -c 1 -q"
+
+ busywait "$BUSYWAIT_TIMEOUT" until_counter_is "== 1" nh_stats_get > /dev/null
+ check_err $? "FDB nexthop group stats did not increase"
+
+ busywait "$BUSYWAIT_TIMEOUT" until_counter_is "== 1" tc_stats_get > /dev/null
+ check_err $? "tc filter stats did not increase"
+
+ log_test "VXLAN FDB nexthop: $af_str basic Tx"
+}
+
+basic_tx_ipv4()
+{
+ basic_tx_common "IPv4" ipv4 192.0.2.1 32 192.0.2.2
+}
+
+basic_tx_ipv6()
+{
+ basic_tx_common "IPv6" ipv6 2001:db8:1::1 128 2001:db8:1::2
+}
+
+learning()
+{
+ RET=0
+
+ # When learning is enabled on the VXLAN device, an incoming packet
+ # might try to refresh an FDB entry that points to an FDB nexthop group
+ # instead of an ordinary remote destination. Check that the kernel does
+ # not crash in this situation.
+
+ run_cmd "ip -n $ns1 address add 192.0.2.1/32 dev lo"
+ run_cmd "ip -n $ns1 address add 192.0.2.2/32 dev lo"
+
+ run_cmd "ip -n $ns1 nexthop add id 1 via 192.0.2.3 fdb"
+ run_cmd "ip -n $ns1 nexthop add id 10 group 1 fdb"
+
+ run_cmd "ip -n $ns1 link add name vx0 up type vxlan id 10010 local 192.0.2.1 dstport 12345 localbypass"
+ run_cmd "ip -n $ns1 link add name vx1 up type vxlan id 10020 local 192.0.2.2 dstport 54321 learning"
+
+ run_cmd "bridge -n $ns1 fdb add 00:11:22:33:44:55 dev vx0 self static dst 192.0.2.2 port 54321 vni 10020"
+ run_cmd "bridge -n $ns1 fdb add 00:aa:bb:cc:dd:ee dev vx1 self static nhid 10"
+
+ run_cmd "ip netns exec $ns1 mausezahn vx0 -a 00:aa:bb:cc:dd:ee -b 00:11:22:33:44:55 -c 1 -q"
+
+ log_test "VXLAN FDB nexthop: learning"
+}
+
+proxy_common()
+{
+ local af_str=$1; shift
+ local local_addr=$1; shift
+ local plen=$1; shift
+ local remote_addr=$1; shift
+ local neigh_addr=$1; shift
+ local ping_cmd=$1; shift
+
+ RET=0
+
+ # When the "proxy" option is enabled on the VXLAN device, the device
+ # will suppress ARP requests and IPv6 Neighbor Solicitation messages if
+ # it is able to reply on behalf of the remote host. That is, when a
+ # matching and valid neighbor entry is configured on the VXLAN device
+ # and its MAC address is not behind the "any" remote (0.0.0.0 / ::).
+ # The FDB entry for the neighbor's MAC address might point to an FDB
+ # nexthop group instead of an ordinary remote destination. Check that
+ # the kernel does not crash in this situation.
+
+ run_cmd "ip -n $ns1 address add $local_addr/$plen dev lo"
+
+ run_cmd "ip -n $ns1 nexthop add id 1 via $remote_addr fdb"
+ run_cmd "ip -n $ns1 nexthop add id 10 group 1 fdb"
+
+ run_cmd "ip -n $ns1 link add name vx0 up type vxlan id 10010 local $local_addr dstport 4789 proxy"
+
+ run_cmd "ip -n $ns1 neigh add $neigh_addr lladdr 00:11:22:33:44:55 nud perm dev vx0"
+
+ run_cmd "bridge -n $ns1 fdb add 00:11:22:33:44:55 dev vx0 self static nhid 10"
+
+ run_cmd "ip netns exec $ns1 $ping_cmd"
+
+ log_test "VXLAN FDB nexthop: $af_str proxy"
+}
+
+proxy_ipv4()
+{
+ proxy_common "IPv4" 192.0.2.1 32 192.0.2.2 192.0.2.3 \
+ "arping -b -c 1 -s 192.0.2.1 -I vx0 192.0.2.3"
+}
+
+proxy_ipv6()
+{
+ proxy_common "IPv6" 2001:db8:1::1 128 2001:db8:1::2 2001:db8:1::3 \
+ "ndisc6 -r 1 -s 2001:db8:1::1 -w 1 2001:db8:1::3 vx0"
+}
+
+################################################################################
+# Usage
+
+usage()
+{
+ cat <<EOF
+usage: ${0##*/} OPTS
+
+ -t <test> Test(s) to run (default: all)
+ (options: $TESTS)
+ -p Pause on fail
+ -v Verbose mode (show commands and output)
+EOF
+}
+
+################################################################################
+# Main
+
+while getopts ":t:pvh" opt; do
+ case $opt in
+ t) TESTS=$OPTARG;;
+ p) PAUSE_ON_FAIL=yes;;
+ v) VERBOSE=$((VERBOSE + 1));;
+ h) usage; exit 0;;
+ *) usage; exit 1;;
+ esac
+done
+
+require_command mausezahn
+require_command arping
+require_command ndisc6
+require_command jq
+
+if ! ip nexthop help 2>&1 | grep -q "stats"; then
+ echo "SKIP: iproute2 ip too old, missing nexthop stats support"
+ exit "$ksft_skip"
+fi
+
+trap exit_cleanup_all EXIT
+
+for t in $TESTS
+do
+ setup_ns ns1; $t; cleanup_all_ns;
+done
diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c
index 5ded3b3a7538..dd093f9df6f1 100644
--- a/tools/testing/selftests/net/tls.c
+++ b/tools/testing/selftests/net/tls.c
@@ -181,13 +181,12 @@ static int tls_send_cmsg(int fd, unsigned char record_type,
return sendmsg(fd, &msg, flags);
}
-static int tls_recv_cmsg(struct __test_metadata *_metadata,
- int fd, unsigned char record_type,
- void *data, size_t len, int flags)
+static int __tls_recv_cmsg(struct __test_metadata *_metadata,
+ int fd, unsigned char *ctype,
+ void *data, size_t len, int flags)
{
char cbuf[CMSG_SPACE(sizeof(char))];
struct cmsghdr *cmsg;
- unsigned char ctype;
struct msghdr msg;
struct iovec vec;
int n;
@@ -206,7 +205,20 @@ static int tls_recv_cmsg(struct __test_metadata *_metadata,
EXPECT_NE(cmsg, NULL);
EXPECT_EQ(cmsg->cmsg_level, SOL_TLS);
EXPECT_EQ(cmsg->cmsg_type, TLS_GET_RECORD_TYPE);
- ctype = *((unsigned char *)CMSG_DATA(cmsg));
+ if (ctype)
+ *ctype = *((unsigned char *)CMSG_DATA(cmsg));
+
+ return n;
+}
+
+static int tls_recv_cmsg(struct __test_metadata *_metadata,
+ int fd, unsigned char record_type,
+ void *data, size_t len, int flags)
+{
+ unsigned char ctype;
+ int n;
+
+ n = __tls_recv_cmsg(_metadata, fd, &ctype, data, len, flags);
EXPECT_EQ(ctype, record_type);
return n;
@@ -2164,6 +2176,284 @@ TEST_F(tls, rekey_poll_delay)
}
}
+struct raw_rec {
+ unsigned int plain_len;
+ unsigned char plain_data[100];
+ unsigned int cipher_len;
+ unsigned char cipher_data[128];
+};
+
+/* TLS 1.2, AES_CCM, data, seqno:0, plaintext: 'Hello world' */
+static const struct raw_rec id0_data_l11 = {
+ .plain_len = 11,
+ .plain_data = {
+ 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f,
+ 0x72, 0x6c, 0x64,
+ },
+ .cipher_len = 40,
+ .cipher_data = {
+ 0x17, 0x03, 0x03, 0x00, 0x23, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x26, 0xa2, 0x33,
+ 0xde, 0x8d, 0x94, 0xf0, 0x29, 0x6c, 0xb1, 0xaf,
+ 0x6a, 0x75, 0xb2, 0x93, 0xad, 0x45, 0xd5, 0xfd,
+ 0x03, 0x51, 0x57, 0x8f, 0xf9, 0xcc, 0x3b, 0x42,
+ },
+};
+
+/* TLS 1.2, AES_CCM, ctrl, seqno:0, plaintext: '' */
+static const struct raw_rec id0_ctrl_l0 = {
+ .plain_len = 0,
+ .plain_data = {
+ },
+ .cipher_len = 29,
+ .cipher_data = {
+ 0x16, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x13, 0x38, 0x7b,
+ 0xa6, 0x1c, 0xdd, 0xa7, 0x19, 0x33, 0xab, 0xae,
+ 0x88, 0xe1, 0xd2, 0x08, 0x4f,
+ },
+};
+
+/* TLS 1.2, AES_CCM, data, seqno:0, plaintext: '' */
+static const struct raw_rec id0_data_l0 = {
+ .plain_len = 0,
+ .plain_data = {
+ },
+ .cipher_len = 29,
+ .cipher_data = {
+ 0x17, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0xc5, 0x37, 0x90,
+ 0x70, 0x45, 0x89, 0xfb, 0x5c, 0xc7, 0x89, 0x03,
+ 0x68, 0x80, 0xd3, 0xd8, 0xcc,
+ },
+};
+
+/* TLS 1.2, AES_CCM, data, seqno:1, plaintext: 'Hello world' */
+static const struct raw_rec id1_data_l11 = {
+ .plain_len = 11,
+ .plain_data = {
+ 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f,
+ 0x72, 0x6c, 0x64,
+ },
+ .cipher_len = 40,
+ .cipher_data = {
+ 0x17, 0x03, 0x03, 0x00, 0x23, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x3a, 0x1a, 0x9c,
+ 0xd0, 0xa8, 0x9a, 0xd6, 0x69, 0xd6, 0x1a, 0xe3,
+ 0xb5, 0x1f, 0x0d, 0x2c, 0xe2, 0x97, 0x46, 0xff,
+ 0x2b, 0xcc, 0x5a, 0xc4, 0xa3, 0xb9, 0xef, 0xba,
+ },
+};
+
+/* TLS 1.2, AES_CCM, ctrl, seqno:1, plaintext: '' */
+static const struct raw_rec id1_ctrl_l0 = {
+ .plain_len = 0,
+ .plain_data = {
+ },
+ .cipher_len = 29,
+ .cipher_data = {
+ 0x16, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x3e, 0xf0, 0xfe,
+ 0xee, 0xd9, 0xe2, 0x5d, 0xc7, 0x11, 0x4c, 0xe6,
+ 0xb4, 0x7e, 0xef, 0x40, 0x2b,
+ },
+};
+
+/* TLS 1.2, AES_CCM, data, seqno:1, plaintext: '' */
+static const struct raw_rec id1_data_l0 = {
+ .plain_len = 0,
+ .plain_data = {
+ },
+ .cipher_len = 29,
+ .cipher_data = {
+ 0x17, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0xce, 0xfc, 0x86,
+ 0xc8, 0xf0, 0x55, 0xf9, 0x47, 0x3f, 0x74, 0xdc,
+ 0xc9, 0xbf, 0xfe, 0x5b, 0xb1,
+ },
+};
+
+/* TLS 1.2, AES_CCM, ctrl, seqno:2, plaintext: 'Hello world' */
+static const struct raw_rec id2_ctrl_l11 = {
+ .plain_len = 11,
+ .plain_data = {
+ 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f,
+ 0x72, 0x6c, 0x64,
+ },
+ .cipher_len = 40,
+ .cipher_data = {
+ 0x16, 0x03, 0x03, 0x00, 0x23, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x02, 0xe5, 0x3d, 0x19,
+ 0x3d, 0xca, 0xb8, 0x16, 0xb6, 0xff, 0x79, 0x87,
+ 0x2a, 0x04, 0x11, 0x3d, 0xf8, 0x64, 0x5f, 0x36,
+ 0x8b, 0xa8, 0xee, 0x4c, 0x6d, 0x62, 0xa5, 0x00,
+ },
+};
+
+/* TLS 1.2, AES_CCM, data, seqno:2, plaintext: 'Hello world' */
+static const struct raw_rec id2_data_l11 = {
+ .plain_len = 11,
+ .plain_data = {
+ 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f,
+ 0x72, 0x6c, 0x64,
+ },
+ .cipher_len = 40,
+ .cipher_data = {
+ 0x17, 0x03, 0x03, 0x00, 0x23, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x02, 0xe5, 0x3d, 0x19,
+ 0x3d, 0xca, 0xb8, 0x16, 0xb6, 0xff, 0x79, 0x87,
+ 0x8e, 0xa1, 0xd0, 0xcd, 0x33, 0xb5, 0x86, 0x2b,
+ 0x17, 0xf1, 0x52, 0x2a, 0x55, 0x62, 0x65, 0x11,
+ },
+};
+
+/* TLS 1.2, AES_CCM, ctrl, seqno:2, plaintext: '' */
+static const struct raw_rec id2_ctrl_l0 = {
+ .plain_len = 0,
+ .plain_data = {
+ },
+ .cipher_len = 29,
+ .cipher_data = {
+ 0x16, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x02, 0xdc, 0x5c, 0x0e,
+ 0x41, 0xdd, 0xba, 0xd3, 0xcc, 0xcf, 0x6d, 0xd9,
+ 0x06, 0xdb, 0x79, 0xe5, 0x5d,
+ },
+};
+
+/* TLS 1.2, AES_CCM, data, seqno:2, plaintext: '' */
+static const struct raw_rec id2_data_l0 = {
+ .plain_len = 0,
+ .plain_data = {
+ },
+ .cipher_len = 29,
+ .cipher_data = {
+ 0x17, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x02, 0xc3, 0xca, 0x26,
+ 0x22, 0xe4, 0x25, 0xfb, 0x5f, 0x6d, 0xbf, 0x83,
+ 0x30, 0x48, 0x69, 0x1a, 0x47,
+ },
+};
+
+FIXTURE(zero_len)
+{
+ int fd, cfd;
+ bool notls;
+};
+
+FIXTURE_VARIANT(zero_len)
+{
+ const struct raw_rec *recs[4];
+ ssize_t recv_ret[4];
+};
+
+FIXTURE_VARIANT_ADD(zero_len, data_data_data)
+{
+ .recs = { &id0_data_l11, &id1_data_l11, &id2_data_l11, },
+ .recv_ret = { 33, -EAGAIN, },
+};
+
+FIXTURE_VARIANT_ADD(zero_len, data_0ctrl_data)
+{
+ .recs = { &id0_data_l11, &id1_ctrl_l0, &id2_data_l11, },
+ .recv_ret = { 11, 0, 11, -EAGAIN, },
+};
+
+FIXTURE_VARIANT_ADD(zero_len, 0data_0data_0data)
+{
+ .recs = { &id0_data_l0, &id1_data_l0, &id2_data_l0, },
+ .recv_ret = { -EAGAIN, },
+};
+
+FIXTURE_VARIANT_ADD(zero_len, 0data_0data_ctrl)
+{
+ .recs = { &id0_data_l0, &id1_data_l0, &id2_ctrl_l11, },
+ .recv_ret = { 0, 11, -EAGAIN, },
+};
+
+FIXTURE_VARIANT_ADD(zero_len, 0data_0data_0ctrl)
+{
+ .recs = { &id0_data_l0, &id1_data_l0, &id2_ctrl_l0, },
+ .recv_ret = { 0, 0, -EAGAIN, },
+};
+
+FIXTURE_VARIANT_ADD(zero_len, 0ctrl_0ctrl_0ctrl)
+{
+ .recs = { &id0_ctrl_l0, &id1_ctrl_l0, &id2_ctrl_l0, },
+ .recv_ret = { 0, 0, 0, -EAGAIN, },
+};
+
+FIXTURE_VARIANT_ADD(zero_len, 0data_0data_data)
+{
+ .recs = { &id0_data_l0, &id1_data_l0, &id2_data_l11, },
+ .recv_ret = { 11, -EAGAIN, },
+};
+
+FIXTURE_VARIANT_ADD(zero_len, data_0data_0data)
+{
+ .recs = { &id0_data_l11, &id1_data_l0, &id2_data_l0, },
+ .recv_ret = { 11, -EAGAIN, },
+};
+
+FIXTURE_SETUP(zero_len)
+{
+ struct tls_crypto_info_keys tls12;
+ int ret;
+
+ tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_CCM_128,
+ &tls12, 0);
+
+ ulp_sock_pair(_metadata, &self->fd, &self->cfd, &self->notls);
+ if (self->notls)
+ return;
+
+ /* Don't install keys on fd, we'll send raw records */
+ ret = setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len);
+ ASSERT_EQ(ret, 0);
+}
+
+FIXTURE_TEARDOWN(zero_len)
+{
+ close(self->fd);
+ close(self->cfd);
+}
+
+TEST_F(zero_len, test)
+{
+ const struct raw_rec *const *rec;
+ unsigned char buf[128];
+ int rec_off;
+ int i;
+
+ for (i = 0; i < 4 && variant->recs[i]; i++)
+ EXPECT_EQ(send(self->fd, variant->recs[i]->cipher_data,
+ variant->recs[i]->cipher_len, 0),
+ variant->recs[i]->cipher_len);
+
+ rec = &variant->recs[0];
+ rec_off = 0;
+ for (i = 0; i < 4; i++) {
+ int j, ret;
+
+ ret = variant->recv_ret[i] >= 0 ? variant->recv_ret[i] : -1;
+ EXPECT_EQ(__tls_recv_cmsg(_metadata, self->cfd, NULL,
+ buf, sizeof(buf), MSG_DONTWAIT), ret);
+ if (ret == -1)
+ EXPECT_EQ(errno, -variant->recv_ret[i]);
+ if (variant->recv_ret[i] == -EAGAIN)
+ break;
+
+ for (j = 0; j < ret; j++) {
+ while (rec_off == (*rec)->plain_len) {
+ rec++;
+ rec_off = 0;
+ }
+ EXPECT_EQ(buf[j], (*rec)->plain_data[rec_off]);
+ rec_off++;
+ }
+ }
+};
+
FIXTURE(tls_err)
{
int fd, cfd;
@@ -2480,6 +2770,22 @@ TEST_F(tls_err, poll_partial_rec_async)
}
}
+/* Use OOB+large send to trigger copy mode due to memory pressure.
+ * OOB causes a short read.
+ */
+TEST_F(tls_err, oob_pressure)
+{
+ char buf[1<<16];
+ int i;
+
+ memrnd(buf, sizeof(buf));
+
+ EXPECT_EQ(send(self->fd2, buf, 5, MSG_OOB), 5);
+ EXPECT_EQ(send(self->fd2, buf, sizeof(buf), 0), sizeof(buf));
+ for (i = 0; i < 64; i++)
+ EXPECT_EQ(send(self->fd2, buf, 5, MSG_OOB), 5);
+}
+
TEST(non_established) {
struct tls12_crypto_info_aes_gcm_256 tls12;
struct sockaddr_in addr;
@@ -2708,6 +3014,67 @@ TEST(prequeue) {
close(cfd);
}
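+/* A reader already blocked on the underlying TCP socket when the TLS keys
+ * are installed steals bytes from the middle of the ciphertext stream, so
+ * the subsequent TLS recv() is expected to fail.
+ */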
+TEST(data_steal) {
+ struct tls_crypto_info_keys tls;
+ char buf[20000], buf2[20000];
+ struct sockaddr_in addr;
+ int sfd, cfd, ret, fd;
+ int pid, status;
+ socklen_t len;
+
+ len = sizeof(addr);
+ memrnd(buf, sizeof(buf));
+
+ tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_GCM_256, &tls, 0);
+
+ addr.sin_family = AF_INET;
+ addr.sin_addr.s_addr = htonl(INADDR_ANY);
+ addr.sin_port = 0;
+
+ fd = socket(AF_INET, SOCK_STREAM, 0);
+ sfd = socket(AF_INET, SOCK_STREAM, 0);
+
+ ASSERT_EQ(bind(sfd, &addr, sizeof(addr)), 0);
+ ASSERT_EQ(listen(sfd, 10), 0);
+ ASSERT_EQ(getsockname(sfd, &addr, &len), 0);
+ ASSERT_EQ(connect(fd, &addr, sizeof(addr)), 0);
+ ASSERT_GE(cfd = accept(sfd, &addr, &len), 0);
+ close(sfd);
+
+ ret = setsockopt(fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls"));
+ if (ret) {
+ ASSERT_EQ(errno, ENOENT);
+ SKIP(return, "no TLS support");
+ }
+ ASSERT_EQ(setsockopt(cfd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls")), 0);
+
+ /* Spawn a child and get it into the read wait path of the underlying
+ * TCP socket.
+ */
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (!pid) {
+ EXPECT_EQ(recv(cfd, buf, sizeof(buf) / 2, MSG_WAITALL),
+ sizeof(buf) / 2);
+ exit(!__test_passed(_metadata));
+ }
+
+ usleep(10000);
+ ASSERT_EQ(setsockopt(fd, SOL_TLS, TLS_TX, &tls, tls.len), 0);
+ ASSERT_EQ(setsockopt(cfd, SOL_TLS, TLS_RX, &tls, tls.len), 0);
+
+ EXPECT_EQ(send(fd, buf, sizeof(buf), 0), sizeof(buf));
+ EXPECT_EQ(wait(&status), pid);
+ EXPECT_EQ(status, 0);
+ EXPECT_EQ(recv(cfd, buf2, sizeof(buf2), MSG_DONTWAIT), -1);
+ /* Don't check errno, the error will be different depending
+ * on what random bytes TLS interpreted as the record length.
+ */
+
+ close(fd);
+ close(cfd);
+}
+
static void __attribute__((constructor)) fips_check(void) {
int res;
FILE *f;
diff --git a/tools/testing/selftests/powerpc/include/instructions.h b/tools/testing/selftests/powerpc/include/instructions.h
index 4efa6314bd96..864f0c9f1afc 100644
--- a/tools/testing/selftests/powerpc/include/instructions.h
+++ b/tools/testing/selftests/powerpc/include/instructions.h
@@ -67,7 +67,7 @@ static inline int paste_last(void *i)
#define PPC_INST_PASTE_LAST __PASTE(0, 0, 1, 1)
/* This defines the prefixed load/store instructions */
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
# define stringify_in_c(...) __VA_ARGS__
#else
# define __stringify_in_c(...) #__VA_ARGS__
diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore
index 19bb333e2485..6b78a8382d40 100644
--- a/tools/testing/selftests/proc/.gitignore
+++ b/tools/testing/selftests/proc/.gitignore
@@ -18,6 +18,7 @@
/proc-tid0
/proc-uptime-001
/proc-uptime-002
+/proc-pidns
/read
/self
/setns-dcache
diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile
index 50aba102201a..be3013515aae 100644
--- a/tools/testing/selftests/proc/Makefile
+++ b/tools/testing/selftests/proc/Makefile
@@ -28,5 +28,6 @@ TEST_GEN_PROGS += setns-sysvipc
TEST_GEN_PROGS += thread-self
TEST_GEN_PROGS += proc-multiple-procfs
TEST_GEN_PROGS += proc-fsconfig-hidepid
+TEST_GEN_PROGS += proc-pidns
include ../lib.mk
diff --git a/tools/testing/selftests/proc/proc-maps-race.c b/tools/testing/selftests/proc/proc-maps-race.c
index 66773685a047..94bba4553130 100644
--- a/tools/testing/selftests/proc/proc-maps-race.c
+++ b/tools/testing/selftests/proc/proc-maps-race.c
@@ -202,11 +202,11 @@ static void print_first_lines(char *text, int nr)
int offs = end - text;
text[offs] = '\0';
- printf(text);
+ printf("%s", text);
text[offs] = '\n';
printf("\n");
} else {
- printf(text);
+ printf("%s", text);
}
}
@@ -221,7 +221,7 @@ static void print_last_lines(char *text, int nr)
nr--;
start--;
}
- printf(start);
+ printf("%s", start);
}
static void print_boundaries(const char *title, FIXTURE_DATA(proc_maps_race) *self)
diff --git a/tools/testing/selftests/proc/proc-pidns.c b/tools/testing/selftests/proc/proc-pidns.c
new file mode 100644
index 000000000000..52500597f951
--- /dev/null
+++ b/tools/testing/selftests/proc/proc-pidns.c
@@ -0,0 +1,211 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Author: Aleksa Sarai <cyphar@cyphar.com>
+ * Copyright (C) 2025 SUSE LLC.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/prctl.h>
+
+#include "../kselftest_harness.h"
+
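+/*
+ * Thin wrappers around the harness __EXPECT() helper: a negative return is
+ * mapped to -errno, so tests can assert success or a specific error, e.g.
+ * ASSERT_ERRNO_EQ(-ENOENT, access(...)).
+ */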
+#define ASSERT_ERRNO(expected, _t, seen) \
+ __EXPECT(expected, #expected, \
+ ({__typeof__(seen) _tmp_seen = (seen); \
+ _tmp_seen >= 0 ? _tmp_seen : -errno; }), #seen, _t, 1)
+
+#define ASSERT_ERRNO_EQ(expected, seen) \
+ ASSERT_ERRNO(expected, ==, seen)
+
+#define ASSERT_SUCCESS(seen) \
+ ASSERT_ERRNO(0, <=, seen)
+
+static int touch(char *path)
+{
+ int fd = open(path, O_WRONLY|O_CREAT|O_CLOEXEC, 0644);
+ if (fd < 0)
+ return -1;
+ return close(fd);
+}
+
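+/*
+ * Each test runs in a private mount namespace with a fresh tmpfs on /tmp.
+ * dummy_pidns is an fd for a throwaway pid namespace whose nsfs file is
+ * bind-mounted at /tmp/dummy/pidns, used as the pidns= target below.
+ */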
+FIXTURE(ns)
+{
+ int host_mntns, host_pidns;
+ int dummy_pidns;
+};
+
+FIXTURE_SETUP(ns)
+{
+ /* Stash the old mntns. */
+ self->host_mntns = open("/proc/self/ns/mnt", O_RDONLY|O_CLOEXEC);
+ ASSERT_SUCCESS(self->host_mntns);
+
+ /* Create a new mount namespace and make it private. */
+ ASSERT_SUCCESS(unshare(CLONE_NEWNS));
+ ASSERT_SUCCESS(mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL));
+
+ /*
+ * Create a proper tmpfs that we can use and will disappear once we
+ * leave this mntns.
+ */
+ ASSERT_SUCCESS(mount("tmpfs", "/tmp", "tmpfs", 0, NULL));
+
+ /*
+ * Create a pidns we can use for later tests. We need to fork off a
+ * child so that we get a usable nsfd that we can bind-mount and open.
+ */
+ ASSERT_SUCCESS(mkdir("/tmp/dummy", 0755));
+ ASSERT_SUCCESS(touch("/tmp/dummy/pidns"));
+ ASSERT_SUCCESS(mkdir("/tmp/dummy/proc", 0755));
+
+ self->host_pidns = open("/proc/self/ns/pid", O_RDONLY|O_CLOEXEC);
+ ASSERT_SUCCESS(self->host_pidns);
+ ASSERT_SUCCESS(unshare(CLONE_NEWPID));
+
+ pid_t pid = fork();
+ ASSERT_SUCCESS(pid);
+ if (!pid) {
+ prctl(PR_SET_PDEATHSIG, SIGKILL);
+ ASSERT_SUCCESS(mount("/proc/self/ns/pid", "/tmp/dummy/pidns", NULL, MS_BIND, NULL));
+ ASSERT_SUCCESS(mount("proc", "/tmp/dummy/proc", "proc", 0, NULL));
+ exit(0);
+ }
+
+ int wstatus;
+ ASSERT_EQ(waitpid(pid, &wstatus, 0), pid);
+ ASSERT_TRUE(WIFEXITED(wstatus));
+ ASSERT_EQ(WEXITSTATUS(wstatus), 0);
+
+ ASSERT_SUCCESS(setns(self->host_pidns, CLONE_NEWPID));
+
+ self->dummy_pidns = open("/tmp/dummy/pidns", O_RDONLY|O_CLOEXEC);
+ ASSERT_SUCCESS(self->dummy_pidns);
+}
+
+FIXTURE_TEARDOWN(ns)
+{
+ ASSERT_SUCCESS(setns(self->host_mntns, CLONE_NEWNS));
+ ASSERT_SUCCESS(close(self->host_mntns));
+
+ ASSERT_SUCCESS(close(self->host_pidns));
+ ASSERT_SUCCESS(close(self->dummy_pidns));
+}
+
+TEST_F(ns, pidns_mount_string_path)
+{
+ ASSERT_SUCCESS(mkdir("/tmp/proc-host", 0755));
+ ASSERT_SUCCESS(mount("proc", "/tmp/proc-host", "proc", 0, "pidns=/proc/self/ns/pid"));
+ ASSERT_SUCCESS(access("/tmp/proc-host/self/", X_OK));
+
+ ASSERT_SUCCESS(mkdir("/tmp/proc-dummy", 0755));
+ ASSERT_SUCCESS(mount("proc", "/tmp/proc-dummy", "proc", 0, "pidns=/tmp/dummy/pidns"));
+ ASSERT_ERRNO_EQ(-ENOENT, access("/tmp/proc-dummy/1/", X_OK));
+ ASSERT_ERRNO_EQ(-ENOENT, access("/tmp/proc-dummy/self/", X_OK));
+}
+
+TEST_F(ns, pidns_fsconfig_string_path)
+{
+ int fsfd = fsopen("proc", FSOPEN_CLOEXEC);
+ ASSERT_SUCCESS(fsfd);
+
+ ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_SET_STRING, "pidns", "/tmp/dummy/pidns", 0));
+ ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0));
+
+ int mountfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0);
+ ASSERT_SUCCESS(mountfd);
+
+ ASSERT_ERRNO_EQ(-ENOENT, faccessat(mountfd, "1/", X_OK, 0));
+ ASSERT_ERRNO_EQ(-ENOENT, faccessat(mountfd, "self/", X_OK, 0));
+
+ ASSERT_SUCCESS(close(fsfd));
+ ASSERT_SUCCESS(close(mountfd));
+}
+
+TEST_F(ns, pidns_fsconfig_fd)
+{
+ int fsfd = fsopen("proc", FSOPEN_CLOEXEC);
+ ASSERT_SUCCESS(fsfd);
+
+ ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_SET_FD, "pidns", NULL, self->dummy_pidns));
+ ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0));
+
+ int mountfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0);
+ ASSERT_SUCCESS(mountfd);
+
+ ASSERT_ERRNO_EQ(-ENOENT, faccessat(mountfd, "1/", X_OK, 0));
+ ASSERT_ERRNO_EQ(-ENOENT, faccessat(mountfd, "self/", X_OK, 0));
+
+ ASSERT_SUCCESS(close(fsfd));
+ ASSERT_SUCCESS(close(mountfd));
+}
+
+TEST_F(ns, pidns_reconfigure_remount)
+{
+ ASSERT_SUCCESS(mkdir("/tmp/proc", 0755));
+ ASSERT_SUCCESS(mount("proc", "/tmp/proc", "proc", 0, ""));
+
+ ASSERT_SUCCESS(access("/tmp/proc/1/", X_OK));
+ ASSERT_SUCCESS(access("/tmp/proc/self/", X_OK));
+
+ ASSERT_ERRNO_EQ(-EBUSY, mount(NULL, "/tmp/proc", NULL, MS_REMOUNT, "pidns=/tmp/dummy/pidns"));
+
+ ASSERT_SUCCESS(access("/tmp/proc/1/", X_OK));
+ ASSERT_SUCCESS(access("/tmp/proc/self/", X_OK));
+}
+
+TEST_F(ns, pidns_reconfigure_fsconfig_string_path)
+{
+ int fsfd = fsopen("proc", FSOPEN_CLOEXEC);
+ ASSERT_SUCCESS(fsfd);
+
+ ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0));
+
+ int mountfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0);
+ ASSERT_SUCCESS(mountfd);
+
+ ASSERT_SUCCESS(faccessat(mountfd, "1/", X_OK, 0));
+ ASSERT_SUCCESS(faccessat(mountfd, "self/", X_OK, 0));
+
+ ASSERT_ERRNO_EQ(-EBUSY, fsconfig(fsfd, FSCONFIG_SET_STRING, "pidns", "/tmp/dummy/pidns", 0));
+ ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0)); /* noop */
+
+ ASSERT_SUCCESS(faccessat(mountfd, "1/", X_OK, 0));
+ ASSERT_SUCCESS(faccessat(mountfd, "self/", X_OK, 0));
+
+ ASSERT_SUCCESS(close(fsfd));
+ ASSERT_SUCCESS(close(mountfd));
+}
+
+TEST_F(ns, pidns_reconfigure_fsconfig_fd)
+{
+ int fsfd = fsopen("proc", FSOPEN_CLOEXEC);
+ ASSERT_SUCCESS(fsfd);
+
+ ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0));
+
+ int mountfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0);
+ ASSERT_SUCCESS(mountfd);
+
+ ASSERT_SUCCESS(faccessat(mountfd, "1/", X_OK, 0));
+ ASSERT_SUCCESS(faccessat(mountfd, "self/", X_OK, 0));
+
+ ASSERT_ERRNO_EQ(-EBUSY, fsconfig(fsfd, FSCONFIG_SET_FD, "pidns", NULL, self->dummy_pidns));
+ ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0)); /* noop */
+
+ ASSERT_SUCCESS(faccessat(mountfd, "1/", X_OK, 0));
+ ASSERT_SUCCESS(faccessat(mountfd, "self/", X_OK, 0));
+
+ ASSERT_SUCCESS(close(fsfd));
+ ASSERT_SUCCESS(close(mountfd));
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/riscv/README b/tools/testing/selftests/riscv/README
new file mode 100644
index 000000000000..443da395da68
--- /dev/null
+++ b/tools/testing/selftests/riscv/README
@@ -0,0 +1,24 @@
+KSelfTest RISC-V
+================
+
+- These tests are riscv-specific and so are not built or run, but skipped
+ completely, when the environment variable ARCH is anything other than 'riscv'.
+
+- With the above in mind, RISC-V KSFT tests can be run within the
+ KSelfTest framework using the standard Linux top-level Makefile targets:
+
+ $ make TARGETS=riscv kselftest-clean
+ $ make TARGETS=riscv kselftest
+
+ or
+
+ $ make -C tools/testing/selftests TARGETS=riscv \
+ INSTALL_PATH=<your-installation-path> install
+
+ or, alternatively, only specific riscv/ subtargets can be picked:
+
+ $ make -C tools/testing/selftests TARGETS=riscv RISCV_SUBTARGETS="mm vector" \
+ INSTALL_PATH=<your-installation-path> install
+
+ Further details on building and running KSFT can be found in:
+ Documentation/dev-tools/kselftest.rst
diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c
index 663a9cef1952..dcac5cbe7933 100644
--- a/tools/testing/selftests/rseq/rseq.c
+++ b/tools/testing/selftests/rseq/rseq.c
@@ -40,9 +40,9 @@
* Define weak versions to play nice with binaries that are statically linked
* against a libc that doesn't support registering its own rseq.
*/
-__weak ptrdiff_t __rseq_offset;
-__weak unsigned int __rseq_size;
-__weak unsigned int __rseq_flags;
+extern __weak ptrdiff_t __rseq_offset;
+extern __weak unsigned int __rseq_size;
+extern __weak unsigned int __rseq_flags;
static const ptrdiff_t *libc_rseq_offset_p = &__rseq_offset;
static const unsigned int *libc_rseq_size_p = &__rseq_size;
@@ -209,7 +209,7 @@ void rseq_init(void)
* libc not having registered a restartable sequence. Try to find the
* symbols if that's the case.
*/
- if (!*libc_rseq_size_p) {
+ if (!libc_rseq_size_p || !*libc_rseq_size_p) {
libc_rseq_offset_p = dlsym(RTLD_NEXT, "__rseq_offset");
libc_rseq_size_p = dlsym(RTLD_NEXT, "__rseq_size");
libc_rseq_flags_p = dlsym(RTLD_NEXT, "__rseq_flags");
diff --git a/tools/testing/selftests/sched_ext/hotplug.c b/tools/testing/selftests/sched_ext/hotplug.c
index 1c9ceb661c43..0cfbb111a2d0 100644
--- a/tools/testing/selftests/sched_ext/hotplug.c
+++ b/tools/testing/selftests/sched_ext/hotplug.c
@@ -6,7 +6,6 @@
#include <bpf/bpf.h>
#include <sched.h>
#include <scx/common.h>
-#include <sched.h>
#include <sys/wait.h>
#include <unistd.h>
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 61acbd45ffaa..874f17763536 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -24,6 +24,7 @@
#include <linux/filter.h>
#include <sys/prctl.h>
#include <sys/ptrace.h>
+#include <sys/time.h>
#include <sys/user.h>
#include <linux/prctl.h>
#include <linux/ptrace.h>
@@ -73,6 +74,14 @@
#define noinline __attribute__((noinline))
#endif
+#ifndef __nocf_check
+#define __nocf_check __attribute__((nocf_check))
+#endif
+
+#ifndef __naked
+#define __naked __attribute__((__naked__))
+#endif
+
#ifndef PR_SET_NO_NEW_PRIVS
#define PR_SET_NO_NEW_PRIVS 38
#define PR_GET_NO_NEW_PRIVS 39
@@ -3547,6 +3556,10 @@ static void signal_handler(int signal)
perror("write from signal");
}
+static void signal_handler_nop(int signal)
+{
+}
+
TEST(user_notification_signal)
{
pid_t pid;
@@ -4819,6 +4832,132 @@ TEST(user_notification_wait_killable_fatal)
EXPECT_EQ(SIGTERM, WTERMSIG(status));
}
+/* Ensure signals after the reply do not interrupt */
+TEST(user_notification_wait_killable_after_reply)
+{
+ int i, max_iter = 100000;
+ int listener, status;
+ int pipe_fds[2];
+ pid_t pid;
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret)
+ {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ listener = user_notif_syscall(
+ __NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER |
+ SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV);
+ ASSERT_GE(listener, 0);
+
+ /*
+ * Used to count invocations. One token is transferred from the child
+ * to the parent per syscall invocation; the parent takes one token per
+ * successful RECV. If the syscall is restarted after RECV, the parent
+ * will try to take two tokens while the child only provided one.
+ */
+ ASSERT_EQ(pipe(pipe_fds), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ struct sigaction new_action = {
+ .sa_handler = signal_handler_nop,
+ .sa_flags = SA_RESTART,
+ };
+ struct itimerval timer = {
+ .it_value = { .tv_usec = 1000 },
+ .it_interval = { .tv_usec = 1000 },
+ };
+ char c = 'a';
+
+ close(pipe_fds[0]);
+
+ /* Setup the sigaction with SA_RESTART */
+ if (sigaction(SIGALRM, &new_action, NULL)) {
+ perror("sigaction");
+ exit(1);
+ }
+
+ /*
+ * Kill with SIGALRM repeatedly, to try to hit the race when
+ * handling the syscall.
+ */
+ if (setitimer(ITIMER_REAL, &timer, NULL) < 0)
+ perror("setitimer");
+
+ for (i = 0; i < max_iter; ++i) {
+ int fd;
+
+ /* Send one token per iteration to catch repeats. */
+ if (write(pipe_fds[1], &c, sizeof(c)) != 1) {
+ perror("write");
+ exit(1);
+ }
+
+ fd = syscall(__NR_dup, 0);
+ if (fd < 0) {
+ perror("dup");
+ exit(1);
+ }
+ close(fd);
+ }
+
+ exit(0);
+ }
+
+ close(pipe_fds[1]);
+
+ for (i = 0; i < max_iter; ++i) {
+ struct seccomp_notif req = {};
+ struct seccomp_notif_addfd addfd = {};
+ struct pollfd pfd = {
+ .fd = pipe_fds[0],
+ .events = POLLIN,
+ };
+ char c;
+
+ /*
+ * Try to receive one token. If it failed, one child syscall
+ * was restarted after RECV and needed to be handled twice.
+ */
+ ASSERT_EQ(poll(&pfd, 1, 1000), 1)
+ kill(pid, SIGKILL);
+
+ ASSERT_EQ(read(pipe_fds[0], &c, sizeof(c)), 1)
+ kill(pid, SIGKILL);
+
+ /*
+ * Get the notification, reply to it as fast as possible to test
+ * whether the child wrongly skips going into the non-preemptible
+ * (TASK_KILLABLE) state.
+ */
+ do
+ ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req);
+ while (ret < 0 && errno == ENOENT); /* Accept interruptions before RECV */
+ ASSERT_EQ(ret, 0)
+ kill(pid, SIGKILL);
+
+ addfd.id = req.id;
+ addfd.flags = SECCOMP_ADDFD_FLAG_SEND;
+ addfd.srcfd = 0;
+ ASSERT_GE(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), 0)
+ kill(pid, SIGKILL);
+ }
+
+ /*
+ * Wait for the process to exit, and make sure the process terminated
+ * with a zero exit code.
+ */
+ EXPECT_EQ(waitpid(pid, &status, 0), pid);
+ EXPECT_EQ(true, WIFEXITED(status));
+ EXPECT_EQ(0, WEXITSTATUS(status));
+}
+
struct tsync_vs_thread_leader_args {
pthread_t leader;
};
@@ -4896,7 +5035,36 @@ TEST(tsync_vs_dead_thread_leader)
EXPECT_EQ(0, status);
}
-noinline int probed(void)
+#ifdef __x86_64__
+
+/*
+ * We need naked probed_uprobe function. Using __nocf_check
+ * check to skip possible endbr64 instruction and ignoring
+ * -Wattributes, otherwise the compilation might fail.
+ */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wattributes"
+
+__naked __nocf_check noinline int probed_uprobe(void)
+{
+ /*
+ * An optimized uprobe is possible only on top of a nop5 instruction.
+ */
+ asm volatile (" \n"
+ ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00 \n"
+ "ret \n"
+ );
+}
+#pragma GCC diagnostic pop
+
+#else
+noinline int probed_uprobe(void)
+{
+ return 1;
+}
+#endif
+
+noinline int probed_uretprobe(void)
{
return 1;
}
@@ -4949,35 +5117,46 @@ static ssize_t get_uprobe_offset(const void *addr)
return found ? (uintptr_t)addr - start + base : -1;
}
-FIXTURE(URETPROBE) {
+FIXTURE(UPROBE) {
int fd;
};
-FIXTURE_VARIANT(URETPROBE) {
+FIXTURE_VARIANT(UPROBE) {
/*
- * All of the URETPROBE behaviors can be tested with either
- * uretprobe attached or not
+ * All of the U(RET)PROBE behaviors can be tested with either
+ * u(ret)probe attached or not
*/
bool attach;
+ /*
+ * Test both uprobe and uretprobe.
+ */
+ bool uretprobe;
+};
+
+FIXTURE_VARIANT_ADD(UPROBE, not_attached) {
+ .attach = false,
+ .uretprobe = false,
};
-FIXTURE_VARIANT_ADD(URETPROBE, attached) {
+FIXTURE_VARIANT_ADD(UPROBE, uprobe_attached) {
.attach = true,
+ .uretprobe = false,
};
-FIXTURE_VARIANT_ADD(URETPROBE, not_attached) {
- .attach = false,
+FIXTURE_VARIANT_ADD(UPROBE, uretprobe_attached) {
+ .attach = true,
+ .uretprobe = true,
};
-FIXTURE_SETUP(URETPROBE)
+FIXTURE_SETUP(UPROBE)
{
const size_t attr_sz = sizeof(struct perf_event_attr);
struct perf_event_attr attr;
ssize_t offset;
int type, bit;
-#ifndef __NR_uretprobe
- SKIP(return, "__NR_uretprobe syscall not defined");
+#if !defined(__NR_uprobe) || !defined(__NR_uretprobe)
+ SKIP(return, "__NR_uprobe or __NR_uretprobe syscalls not defined");
#endif
if (!variant->attach)
@@ -4987,12 +5166,17 @@ FIXTURE_SETUP(URETPROBE)
type = determine_uprobe_perf_type();
ASSERT_GE(type, 0);
- bit = determine_uprobe_retprobe_bit();
- ASSERT_GE(bit, 0);
- offset = get_uprobe_offset(probed);
+
+ if (variant->uretprobe) {
+ bit = determine_uprobe_retprobe_bit();
+ ASSERT_GE(bit, 0);
+ }
+
+ offset = get_uprobe_offset(variant->uretprobe ? probed_uretprobe : probed_uprobe);
ASSERT_GE(offset, 0);
- attr.config |= 1 << bit;
+ if (variant->uretprobe)
+ attr.config |= 1 << bit;
attr.size = attr_sz;
attr.type = type;
attr.config1 = ptr_to_u64("/proc/self/exe");
@@ -5003,7 +5187,7 @@ FIXTURE_SETUP(URETPROBE)
PERF_FLAG_FD_CLOEXEC);
}
-FIXTURE_TEARDOWN(URETPROBE)
+FIXTURE_TEARDOWN(UPROBE)
{
/* we could call close(self->fd), but we'd need extra filter for
* that and since we are calling _exit right away..
@@ -5017,11 +5201,17 @@ static int run_probed_with_filter(struct sock_fprog *prog)
return -1;
}
- probed();
+ /*
+ * The uprobe is optimized after the first hit, so let's hit it twice.
+ */
+ probed_uprobe();
+ probed_uprobe();
+
+ probed_uretprobe();
return 0;
}
-TEST_F(URETPROBE, uretprobe_default_allow)
+TEST_F(UPROBE, uprobe_default_allow)
{
struct sock_filter filter[] = {
BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
@@ -5034,7 +5224,7 @@ TEST_F(URETPROBE, uretprobe_default_allow)
ASSERT_EQ(0, run_probed_with_filter(&prog));
}
-TEST_F(URETPROBE, uretprobe_default_block)
+TEST_F(UPROBE, uprobe_default_block)
{
struct sock_filter filter[] = {
BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
@@ -5051,11 +5241,14 @@ TEST_F(URETPROBE, uretprobe_default_block)
ASSERT_EQ(0, run_probed_with_filter(&prog));
}
-TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall)
+TEST_F(UPROBE, uprobe_block_syscall)
{
struct sock_filter filter[] = {
BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
offsetof(struct seccomp_data, nr)),
+#ifdef __NR_uprobe
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uprobe, 1, 2),
+#endif
#ifdef __NR_uretprobe
BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 0, 1),
#endif
@@ -5070,11 +5263,14 @@ TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall)
ASSERT_EQ(0, run_probed_with_filter(&prog));
}
-TEST_F(URETPROBE, uretprobe_default_block_with_uretprobe_syscall)
+TEST_F(UPROBE, uprobe_default_block_with_syscall)
{
struct sock_filter filter[] = {
BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
offsetof(struct seccomp_data, nr)),
+#ifdef __NR_uprobe
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uprobe, 3, 0),
+#endif
#ifdef __NR_uretprobe
BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 2, 0),
#endif
diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json
index 23a61e5b99d0..998e5a2f4579 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json
@@ -186,6 +186,204 @@
]
},
{
+ "id": "34c0",
+ "name": "Test TBF with HHF Backlog Accounting in gso_skb case against underflow",
+ "category": [
+ "qdisc",
+ "tbf",
+ "hhf"
+ ],
+ "plugins": {
+ "requires": [
+ "nsPlugin"
+ ]
+ },
+ "setup": [
+ "$IP link set dev $DUMMY up || true",
+ "$IP addr add 10.10.11.10/24 dev $DUMMY || true",
+ "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms",
+ "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 hhf limit 1000",
+ [
+ "ping -I $DUMMY -c2 10.10.11.11",
+ 1
+ ],
+ "$TC qdisc change dev $DUMMY handle 2: parent 1:1 hhf limit 1"
+ ],
+ "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC -s qdisc show dev $DUMMY",
+ "matchPattern": "backlog 0b 0p",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root"
+ ]
+ },
+ {
+ "id": "fd68",
+ "name": "Test TBF with CODEL Backlog Accounting in gso_skb case against underflow",
+ "category": [
+ "qdisc",
+ "tbf",
+ "codel"
+ ],
+ "plugins": {
+ "requires": [
+ "nsPlugin"
+ ]
+ },
+ "setup": [
+ "$IP link set dev $DUMMY up || true",
+ "$IP addr add 10.10.11.10/24 dev $DUMMY || true",
+ "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms",
+ "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 codel limit 1000",
+ [
+ "ping -I $DUMMY -c2 10.10.11.11",
+ 1
+ ],
+ "$TC qdisc change dev $DUMMY handle 2: parent 1:1 codel limit 1"
+ ],
+ "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC -s qdisc show dev $DUMMY",
+ "matchPattern": "backlog 0b 0p",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root"
+ ]
+ },
+ {
+ "id": "514e",
+ "name": "Test TBF with PIE Backlog Accounting in gso_skb case against underflow",
+ "category": [
+ "qdisc",
+ "tbf",
+ "pie"
+ ],
+ "plugins": {
+ "requires": [
+ "nsPlugin"
+ ]
+ },
+ "setup": [
+ "$IP link set dev $DUMMY up || true",
+ "$IP addr add 10.10.11.10/24 dev $DUMMY || true",
+ "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms",
+ "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 pie limit 1000",
+ [
+ "ping -I $DUMMY -c2 10.10.11.11",
+ 1
+ ],
+ "$TC qdisc change dev $DUMMY handle 2: parent 1:1 pie limit 1"
+ ],
+ "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC -s qdisc show dev $DUMMY",
+ "matchPattern": "backlog 0b 0p",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root"
+ ]
+ },
+ {
+ "id": "6c97",
+ "name": "Test TBF with FQ Backlog Accounting in gso_skb case against underflow",
+ "category": [
+ "qdisc",
+ "tbf",
+ "fq"
+ ],
+ "plugins": {
+ "requires": [
+ "nsPlugin"
+ ]
+ },
+ "setup": [
+ "$IP link set dev $DUMMY up || true",
+ "$IP addr add 10.10.11.10/24 dev $DUMMY || true",
+ "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms",
+ "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 fq limit 1000",
+ [
+ "ping -I $DUMMY -c2 10.10.11.11",
+ 1
+ ],
+ "$TC qdisc change dev $DUMMY handle 2: parent 1:1 fq limit 1"
+ ],
+ "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC -s qdisc show dev $DUMMY",
+ "matchPattern": "backlog 0b 0p",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root"
+ ]
+ },
+ {
+ "id": "5d0b",
+ "name": "Test TBF with FQ_CODEL Backlog Accounting in gso_skb case against underflow",
+ "category": [
+ "qdisc",
+ "tbf",
+ "fq_codel"
+ ],
+ "plugins": {
+ "requires": [
+ "nsPlugin"
+ ]
+ },
+ "setup": [
+ "$IP link set dev $DUMMY up || true",
+ "$IP addr add 10.10.11.10/24 dev $DUMMY || true",
+ "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms",
+ "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 fq_codel limit 1000",
+ [
+ "ping -I $DUMMY -c2 10.10.11.11",
+ 1
+ ],
+ "$TC qdisc change dev $DUMMY handle 2: parent 1:1 fq_codel limit 1"
+ ],
+ "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC -s qdisc show dev $DUMMY",
+ "matchPattern": "backlog 0b 0p",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root"
+ ]
+ },
+ {
+ "id": "21c3",
+ "name": "Test TBF with FQ_PIE Backlog Accounting in gso_skb case against underflow",
+ "category": [
+ "qdisc",
+ "tbf",
+ "fq_pie"
+ ],
+ "plugins": {
+ "requires": [
+ "nsPlugin"
+ ]
+ },
+ "setup": [
+ "$IP link set dev $DUMMY up || true",
+ "$IP addr add 10.10.11.10/24 dev $DUMMY || true",
+ "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms",
+ "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 fq_pie limit 1000",
+ [
+ "ping -I $DUMMY -c2 10.10.11.11",
+ 1
+ ],
+ "$TC qdisc change dev $DUMMY handle 2: parent 1:1 fq_pie limit 1"
+ ],
+ "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC -s qdisc show dev $DUMMY",
+ "matchPattern": "backlog 0b 0p",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root"
+ ]
+ },
+ {
"id": "a4bb",
"name": "Test FQ_CODEL with HTB parent - force packet drop with empty queue",
"category": [
diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c
index 2d93ac860bd5..cd9fe69ecce2 100644
--- a/tools/testing/selftests/ublk/file_backed.c
+++ b/tools/testing/selftests/ublk/file_backed.c
@@ -20,7 +20,7 @@ static int loop_queue_flush_io(struct ublk_thread *t, struct ublk_queue *q,
struct io_uring_sqe *sqe[1];
ublk_io_alloc_sqes(t, sqe, 1);
- io_uring_prep_fsync(sqe[0], 1 /*fds[1]*/, IORING_FSYNC_DATASYNC);
+ io_uring_prep_fsync(sqe[0], ublk_get_registered_fd(q, 1) /*fds[1]*/, IORING_FSYNC_DATASYNC);
io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE);
/* bit63 marks us as tgt io */
sqe[0]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1);
@@ -42,7 +42,7 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q,
if (!sqe[0])
return -ENOMEM;
- io_uring_prep_rw(op, sqe[0], 1 /*fds[1]*/,
+ io_uring_prep_rw(op, sqe[0], ublk_get_registered_fd(q, 1) /*fds[1]*/,
addr,
iod->nr_sectors << 9,
iod->start_sector << 9);
@@ -56,19 +56,19 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q,
ublk_io_alloc_sqes(t, sqe, 3);
- io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
+ io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
sqe[0]->user_data = build_user_data(tag,
ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1);
- io_uring_prep_rw(op, sqe[1], 1 /*fds[1]*/, 0,
+ io_uring_prep_rw(op, sqe[1], ublk_get_registered_fd(q, 1) /*fds[1]*/, 0,
iod->nr_sectors << 9,
iod->start_sector << 9);
sqe[1]->buf_index = tag;
sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK;
sqe[1]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1);
- io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
+ io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1);
return 2;
diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index 95188065b2e9..b71faba86c3b 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -432,7 +432,7 @@ static void ublk_thread_deinit(struct ublk_thread *t)
}
}
-static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags)
+static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags)
{
struct ublk_dev *dev = q->dev;
int depth = dev->dev_info.queue_depth;
@@ -446,6 +446,9 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags)
q->flags = dev->dev_info.flags;
q->flags |= extra_flags;
+ /* Cache fd in queue for fast path access */
+ q->ublk_fd = dev->fds[0];
+
cmd_buf_size = ublk_queue_cmd_buf_sz(q);
off = UBLKSRV_CMD_BUF_OFFSET + q->q_id * ublk_queue_max_cmd_buf_sz();
q->io_cmd_buf = mmap(0, cmd_buf_size, PROT_READ,
@@ -481,9 +484,10 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags)
return -ENOMEM;
}
-static int ublk_thread_init(struct ublk_thread *t)
+static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flags)
{
struct ublk_dev *dev = t->dev;
+ unsigned long long flags = dev->dev_info.flags | extra_flags;
int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth;
int ret;
@@ -512,7 +516,17 @@ static int ublk_thread_init(struct ublk_thread *t)
io_uring_register_ring_fd(&t->ring);
- ret = io_uring_register_files(&t->ring, dev->fds, dev->nr_fds);
+ if (flags & UBLKS_Q_NO_UBLK_FIXED_FD) {
+ /* Register only backing files starting from index 1, exclude ublk control device */
+ if (dev->nr_fds > 1) {
+ ret = io_uring_register_files(&t->ring, &dev->fds[1], dev->nr_fds - 1);
+ } else {
+ /* No backing files to register, skip file registration */
+ ret = 0;
+ }
+ } else {
+ ret = io_uring_register_files(&t->ring, dev->fds, dev->nr_fds);
+ }
if (ret) {
ublk_err("ublk dev %d thread %d register files failed %d\n",
t->dev->dev_info.dev_id, t->idx, ret);
@@ -626,9 +640,12 @@ int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io)
/* These fields should be written once, never change */
ublk_set_sqe_cmd_op(sqe[0], cmd_op);
- sqe[0]->fd = 0; /* dev->fds[0] */
+ sqe[0]->fd = ublk_get_registered_fd(q, 0); /* dev->fds[0] */
sqe[0]->opcode = IORING_OP_URING_CMD;
- sqe[0]->flags = IOSQE_FIXED_FILE;
+ if (q->flags & UBLKS_Q_NO_UBLK_FIXED_FD)
+ sqe[0]->flags = 0; /* Use raw FD, not fixed file */
+ else
+ sqe[0]->flags = IOSQE_FIXED_FILE;
sqe[0]->rw_flags = 0;
cmd->tag = io->tag;
cmd->q_id = q->q_id;
@@ -832,6 +849,7 @@ struct ublk_thread_info {
unsigned idx;
sem_t *ready;
cpu_set_t *affinity;
+ unsigned long long extra_flags;
};
static void *ublk_io_handler_fn(void *data)
@@ -844,7 +862,7 @@ static void *ublk_io_handler_fn(void *data)
t->dev = info->dev;
t->idx = info->idx;
- ret = ublk_thread_init(t);
+ ret = ublk_thread_init(t, info->extra_flags);
if (ret) {
ublk_err("ublk dev %d thread %u init failed\n",
dev_id, t->idx);
@@ -934,6 +952,8 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
if (ctx->auto_zc_fallback)
extra_flags = UBLKS_Q_AUTO_BUF_REG_FALLBACK;
+ if (ctx->no_ublk_fixed_fd)
+ extra_flags |= UBLKS_Q_NO_UBLK_FIXED_FD;
for (i = 0; i < dinfo->nr_hw_queues; i++) {
dev->q[i].dev = dev;
@@ -951,6 +971,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
tinfo[i].dev = dev;
tinfo[i].idx = i;
tinfo[i].ready = &ready;
+ tinfo[i].extra_flags = extra_flags;
/*
* If threads are not tied 1:1 to queues, setting thread
@@ -1400,7 +1421,7 @@ static int cmd_dev_get_features(void)
if (!((1ULL << i) & features))
continue;
- if (i < sizeof(feat_map) / sizeof(feat_map[0]))
+ if (i < ARRAY_SIZE(feat_map))
feat = feat_map[i];
else
feat = "unknown";
@@ -1471,13 +1492,13 @@ static void __cmd_create_help(char *exe, bool recovery)
printf("%s %s -t [null|loop|stripe|fault_inject] [-q nr_queues] [-d depth] [-n dev_id]\n",
exe, recovery ? "recover" : "add");
printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1 ] [-g]\n");
- printf("\t[-e 0|1 ] [-i 0|1]\n");
+ printf("\t[-e 0|1 ] [-i 0|1] [--no_ublk_fixed_fd]\n");
printf("\t[--nthreads threads] [--per_io_tasks]\n");
printf("\t[target options] [backfile1] [backfile2] ...\n");
printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n");
printf("\tdefault: nthreads=nr_queues");
- for (i = 0; i < sizeof(tgt_ops_list) / sizeof(tgt_ops_list[0]); i++) {
+ for (i = 0; i < ARRAY_SIZE(tgt_ops_list); i++) {
const struct ublk_tgt_ops *ops = tgt_ops_list[i];
if (ops->usage)
@@ -1534,6 +1555,7 @@ int main(int argc, char *argv[])
{ "size", 1, NULL, 's'},
{ "nthreads", 1, NULL, 0 },
{ "per_io_tasks", 0, NULL, 0 },
+ { "no_ublk_fixed_fd", 0, NULL, 0 },
{ 0, 0, 0, 0 }
};
const struct ublk_tgt_ops *ops = NULL;
@@ -1613,6 +1635,8 @@ int main(int argc, char *argv[])
ctx.nthreads = strtol(optarg, NULL, 10);
if (!strcmp(longopts[option_idx].name, "per_io_tasks"))
ctx.per_io_tasks = 1;
+ if (!strcmp(longopts[option_idx].name, "no_ublk_fixed_fd"))
+ ctx.no_ublk_fixed_fd = 1;
break;
case '?':
/*
diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index 219233f8a053..5e55484fb0aa 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -77,6 +77,7 @@ struct dev_ctx {
unsigned int recovery:1;
unsigned int auto_zc_fallback:1;
unsigned int per_io_tasks:1;
+ unsigned int no_ublk_fixed_fd:1;
int _evtfd;
int _shmid;
@@ -166,7 +167,9 @@ struct ublk_queue {
/* borrow one bit of ublk uapi flags, which may never be used */
#define UBLKS_Q_AUTO_BUF_REG_FALLBACK (1ULL << 63)
+#define UBLKS_Q_NO_UBLK_FIXED_FD (1ULL << 62)
__u64 flags;
+ int ublk_fd; /* cached ublk char device fd */
struct ublk_io ios[UBLK_QUEUE_DEPTH];
};
@@ -273,34 +276,48 @@ static inline int ublk_io_alloc_sqes(struct ublk_thread *t,
return nr_sqes;
}
-static inline void io_uring_prep_buf_register(struct io_uring_sqe *sqe,
- int dev_fd, int tag, int q_id, __u64 index)
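+/*
+ * Map a logical fd index onto what the SQE should use: with
+ * UBLKS_Q_NO_UBLK_FIXED_FD the ublk char device (index 0) is passed as a
+ * raw fd and the backing files shift down by one in the fixed-file table.
+ */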
+static inline int ublk_get_registered_fd(struct ublk_queue *q, int fd_index)
+{
+ if (q->flags & UBLKS_Q_NO_UBLK_FIXED_FD) {
+ if (fd_index == 0)
+ /* Return the raw ublk FD for index 0 */
+ return q->ublk_fd;
+ /* Adjust index for backing files (index 1 becomes 0, etc.) */
+ return fd_index - 1;
+ }
+ return fd_index;
+}
+
+static inline void __io_uring_prep_buf_reg_unreg(struct io_uring_sqe *sqe,
+ struct ublk_queue *q, int tag, int q_id, __u64 index)
{
struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd;
+ int dev_fd = ublk_get_registered_fd(q, 0);
io_uring_prep_read(sqe, dev_fd, 0, 0, 0);
sqe->opcode = IORING_OP_URING_CMD;
- sqe->flags |= IOSQE_FIXED_FILE;
- sqe->cmd_op = UBLK_U_IO_REGISTER_IO_BUF;
+ if (q->flags & UBLKS_Q_NO_UBLK_FIXED_FD)
+ sqe->flags &= ~IOSQE_FIXED_FILE;
+ else
+ sqe->flags |= IOSQE_FIXED_FILE;
cmd->tag = tag;
cmd->addr = index;
cmd->q_id = q_id;
}
-static inline void io_uring_prep_buf_unregister(struct io_uring_sqe *sqe,
- int dev_fd, int tag, int q_id, __u64 index)
+static inline void io_uring_prep_buf_register(struct io_uring_sqe *sqe,
+ struct ublk_queue *q, int tag, int q_id, __u64 index)
{
- struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd;
+ __io_uring_prep_buf_reg_unreg(sqe, q, tag, q_id, index);
+ sqe->cmd_op = UBLK_U_IO_REGISTER_IO_BUF;
+}
- io_uring_prep_read(sqe, dev_fd, 0, 0, 0);
- sqe->opcode = IORING_OP_URING_CMD;
- sqe->flags |= IOSQE_FIXED_FILE;
+static inline void io_uring_prep_buf_unregister(struct io_uring_sqe *sqe,
+ struct ublk_queue *q, int tag, int q_id, __u64 index)
+{
+ __io_uring_prep_buf_reg_unreg(sqe, q, tag, q_id, index);
sqe->cmd_op = UBLK_U_IO_UNREGISTER_IO_BUF;
-
- cmd->tag = tag;
- cmd->addr = index;
- cmd->q_id = q_id;
}
static inline void *ublk_get_sqe_cmd(const struct io_uring_sqe *sqe)
diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c
index f0e0003a4860..280043f6b689 100644
--- a/tools/testing/selftests/ublk/null.c
+++ b/tools/testing/selftests/ublk/null.c
@@ -63,7 +63,7 @@ static int null_queue_zc_io(struct ublk_thread *t, struct ublk_queue *q,
ublk_io_alloc_sqes(t, sqe, 3);
- io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
+ io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
sqe[0]->user_data = build_user_data(tag,
ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1);
sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
@@ -71,7 +71,7 @@ static int null_queue_zc_io(struct ublk_thread *t, struct ublk_queue *q,
__setup_nop_io(tag, iod, sqe[1], q->q_id);
sqe[1]->flags |= IOSQE_IO_HARDLINK;
- io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
+ io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1);
// buf register is marked as IOSQE_CQE_SKIP_SUCCESS
diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c
index 1fb9b7cc281b..791fa8dc1651 100644
--- a/tools/testing/selftests/ublk/stripe.c
+++ b/tools/testing/selftests/ublk/stripe.c
@@ -142,7 +142,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q,
ublk_io_alloc_sqes(t, sqe, s->nr + extra);
if (zc) {
- io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, io->buf_index);
+ io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, io->buf_index);
sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
sqe[0]->user_data = build_user_data(tag,
ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1);
@@ -168,7 +168,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q,
if (zc) {
struct io_uring_sqe *unreg = sqe[s->nr + 1];
- io_uring_prep_buf_unregister(unreg, 0, tag, q->q_id, io->buf_index);
+ io_uring_prep_buf_unregister(unreg, q, tag, q->q_id, io->buf_index);
unreg->user_data = build_user_data(
tag, ublk_cmd_op_nr(unreg->cmd_op), 0, q->q_id, 1);
}
diff --git a/tools/testing/selftests/ublk/test_stress_04.sh b/tools/testing/selftests/ublk/test_stress_04.sh
index 40d1437ca298..3f901db4d09d 100755
--- a/tools/testing/selftests/ublk/test_stress_04.sh
+++ b/tools/testing/selftests/ublk/test_stress_04.sh
@@ -28,14 +28,14 @@ _create_backfile 0 256M
_create_backfile 1 128M
_create_backfile 2 128M
-ublk_io_and_kill_daemon 8G -t null -q 4 -z &
-ublk_io_and_kill_daemon 256M -t loop -q 4 -z "${UBLK_BACKFILES[0]}" &
+ublk_io_and_kill_daemon 8G -t null -q 4 -z --no_ublk_fixed_fd &
+ublk_io_and_kill_daemon 256M -t loop -q 4 -z --no_ublk_fixed_fd "${UBLK_BACKFILES[0]}" &
ublk_io_and_kill_daemon 256M -t stripe -q 4 -z "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
if _have_feature "AUTO_BUF_REG"; then
ublk_io_and_kill_daemon 8G -t null -q 4 --auto_zc &
ublk_io_and_kill_daemon 256M -t loop -q 4 --auto_zc "${UBLK_BACKFILES[0]}" &
- ublk_io_and_kill_daemon 256M -t stripe -q 4 --auto_zc "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+ ublk_io_and_kill_daemon 256M -t stripe -q 4 --auto_zc --no_ublk_fixed_fd "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
ublk_io_and_kill_daemon 8G -t null -q 4 -z --auto_zc --auto_zc_fallback &
fi
diff --git a/tools/testing/shared/linux/idr.h b/tools/testing/shared/linux/idr.h
index 4e342f2e37cf..676c5564e33f 100644
--- a/tools/testing/shared/linux/idr.h
+++ b/tools/testing/shared/linux/idr.h
@@ -1 +1,5 @@
+/* Avoid duplicate definitions due to system headers. */
+#ifdef __CONCAT
+#undef __CONCAT
+#endif
#include "../../../../include/linux/idr.h"
diff --git a/tools/tracing/latency/Makefile.config b/tools/tracing/latency/Makefile.config
index 0fe6b50f029b..6efa13e3ca93 100644
--- a/tools/tracing/latency/Makefile.config
+++ b/tools/tracing/latency/Makefile.config
@@ -1,7 +1,15 @@
# SPDX-License-Identifier: GPL-2.0-only
+include $(srctree)/tools/scripts/utilities.mak
+
STOP_ERROR :=
+ifneq ($(NO_LIBTRACEEVENT),1)
+ ifeq ($(call get-executable,$(PKG_CONFIG)),)
+ $(error Error: $(PKG_CONFIG) needed by libtraceevent/libtracefs is missing on this system, please install it)
+ endif
+endif
+
define lib_setup
$(eval LIB_INCLUDES += $(shell sh -c "$(PKG_CONFIG) --cflags lib$(1)"))
$(eval LDFLAGS += $(shell sh -c "$(PKG_CONFIG) --libs-only-L lib$(1)"))
diff --git a/tools/tracing/rtla/Makefile.config b/tools/tracing/rtla/Makefile.config
index 5f2231d8d626..07ff5e8f3006 100644
--- a/tools/tracing/rtla/Makefile.config
+++ b/tools/tracing/rtla/Makefile.config
@@ -1,10 +1,18 @@
# SPDX-License-Identifier: GPL-2.0-only
+include $(srctree)/tools/scripts/utilities.mak
+
STOP_ERROR :=
LIBTRACEEVENT_MIN_VERSION = 1.5
LIBTRACEFS_MIN_VERSION = 1.6
+ifneq ($(NO_LIBTRACEEVENT),1)
+ ifeq ($(call get-executable,$(PKG_CONFIG)),)
+ $(error Error: $(PKG_CONFIG) needed by libtraceevent/libtracefs is missing on this system, please install it)
+ endif
+endif
+
define lib_setup
$(eval LIB_INCLUDES += $(shell sh -c "$(PKG_CONFIG) --cflags lib$(1)"))
$(eval LDFLAGS += $(shell sh -c "$(PKG_CONFIG) --libs-only-L lib$(1)"))
diff --git a/tools/tracing/rtla/src/actions.c b/tools/tracing/rtla/src/actions.c
index aaf0808125d7..13ff1934d47c 100644
--- a/tools/tracing/rtla/src/actions.c
+++ b/tools/tracing/rtla/src/actions.c
@@ -49,7 +49,7 @@ actions_destroy(struct actions *self)
static struct action *
actions_new(struct actions *self)
{
- if (self->size >= self->len) {
+ if (self->len >= self->size) {
self->size *= 2;
self->list = realloc(self->list, self->size * sizeof(struct action));
}
@@ -131,7 +131,7 @@ actions_parse(struct actions *self, const char *trigger)
{
enum action_type type = ACTION_NONE;
char *token;
- char trigger_c[strlen(trigger)];
+ char trigger_c[strlen(trigger) + 1];
/* For ACTION_SIGNAL */
int signal = 0, pid = 0;